mirror of
https://github.com/OPM/opm-simulators.git
synced 2025-02-25 18:55:30 -06:00
Do not reorder the matrix for GPU
This commit is contained in:
@@ -93,16 +93,16 @@ void reorderBlockedVectorByPattern(int Nb, double *vector, int *fromOrder, doubl
|
||||
bool canBeStarted(const int rowIndex, const int *rowPointers, const int *colIndices, const std::vector<bool>& doneRows);
|
||||
|
||||
/// Find a level scheduling reordering for an input matrix
|
||||
/// The toOrder and fromOrder arrays must be allocated already
|
||||
/// The toOrder and fromOrder arrays must be allocated already
|
||||
/// \param[in] CSRColIndices column indices array, obtained from storing the input matrix in the CSR format
|
||||
/// \param[in] CSRRowPointers row pointers array, obtained from storing the input matrix in the CSR format
|
||||
/// \param[in] CSCRowIndices row indices array, obtained from storing the input matrix in the CSC format
|
||||
/// \param[in] CSCColPointers column pointers array, obtained from storing the input matrix in the CSC format
|
||||
/// \param[in] Nb number of blockrows in the matrix
|
||||
/// \param[out] numColors a pointer to the number of colors needed for the level scheduling
|
||||
/// \param[inout] toOrder the reorder pattern that was found, which lists for each index in the original order, to which index in the new order it should be moved
|
||||
/// \param[inout] fromOrder the reorder pattern that was found, which lists for each index in the new order, from which index in the original order it was moved
|
||||
/// \param[inout] rowsPerColor for each used color, the number of rows assigned to that color, this function uses emplace_back() to fill
|
||||
/// \param[out] toOrder the reorder pattern that was found, which lists for each index in the original order, to which index in the new order it should be moved
|
||||
/// \param[out] fromOrder the reorder pattern that was found, which lists for each index in the new order, from which index in the original order it was moved
|
||||
/// \param[out] rowsPerColor for each color, an array of all rowIndices in that color, this function uses emplace_back() to fill
|
||||
void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowIndices, int *CSCColPointers, int Nb, int *numColors, int *toOrder, int* fromOrder, std::vector<int>& rowsPerColor);
|
||||
|
||||
/// Find a graph coloring reordering for an input matrix
|
||||
|
||||
@@ -78,14 +78,8 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat
|
||||
fromOrder.resize(Nb);
|
||||
CSCRowIndices.resize(matToDecompose->nnzbs);
|
||||
CSCColPointers.resize(Nb + 1);
|
||||
rmat = std::make_shared<BlockedMatrix>(mat->Nb, mat->nnzbs, block_size);
|
||||
|
||||
if (jacMat) {
|
||||
rJacMat = std::make_shared<BlockedMatrix>(jacMat->Nb, jacMat->nnzbs, block_size);
|
||||
LUmat = std::make_unique<BlockedMatrix>(*rJacMat);
|
||||
} else {
|
||||
LUmat = std::make_unique<BlockedMatrix>(*rmat);
|
||||
}
|
||||
LUmat = std::make_unique<BlockedMatrix>(*matToDecompose);
|
||||
|
||||
Timer t_convert;
|
||||
csrPatternToCsc(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb);
|
||||
@@ -101,17 +95,9 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat
|
||||
if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
|
||||
out << "BILU0 reordering strategy: " << "level_scheduling\n";
|
||||
findLevelScheduling(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
|
||||
reorderBlockedMatrixByPattern(mat, reordermappingNonzeroes, toOrder.data(), fromOrder.data(), rmat.get());
|
||||
if (jacMat) {
|
||||
reorderBlockedMatrixByPattern(jacMat, jacReordermappingNonzeroes, toOrder.data(), fromOrder.data(), rJacMat.get());
|
||||
}
|
||||
} else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) {
|
||||
out << "BILU0 reordering strategy: " << "graph_coloring\n";
|
||||
findGraphColoring<block_size>(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb, Nb, Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
|
||||
reorderBlockedMatrixByPattern(mat, reordermappingNonzeroes, toOrder.data(), fromOrder.data(), rmat.get());
|
||||
if (jacMat) {
|
||||
reorderBlockedMatrixByPattern(jacMat, jacReordermappingNonzeroes, toOrder.data(), fromOrder.data(), rJacMat.get());
|
||||
}
|
||||
} else if (opencl_ilu_reorder == ILUReorder::NONE) {
|
||||
out << "BILU0 reordering strategy: none\n";
|
||||
// numColors = 1;
|
||||
@@ -143,6 +129,7 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat
|
||||
s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb);
|
||||
s.rowsPerColor = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (numColors + 1));
|
||||
s.diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->Nb);
|
||||
s.rowIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned) * LUmat->Nb);
|
||||
#if CHOW_PATEL
|
||||
s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
|
||||
s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
|
||||
@@ -156,14 +143,16 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat
|
||||
s.LUrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (LUmat->Nb + 1));
|
||||
#endif
|
||||
|
||||
events.resize(2);
|
||||
events.resize(3);
|
||||
err = queue->enqueueWriteBuffer(s.invDiagVals, CL_FALSE, 0, mat->Nb * sizeof(double) * bs * bs, invDiagVals.data(), nullptr, &events[0]);
|
||||
|
||||
rowsPerColorPrefix.resize(numColors + 1); // resize initializes value 0.0
|
||||
for (int i = 0; i < numColors; ++i) {
|
||||
rowsPerColorPrefix[i+1] = rowsPerColorPrefix[i] + rowsPerColor[i];
|
||||
rowsPerColorPrefix[i + 1] = rowsPerColorPrefix[i] + rowsPerColor[i];
|
||||
}
|
||||
|
||||
err |= queue->enqueueWriteBuffer(s.rowsPerColor, CL_FALSE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix.data(), nullptr, &events[1]);
|
||||
err |= queue->enqueueWriteBuffer(s.rowIndices, CL_FALSE, 0, Nb * sizeof(unsigned), fromOrder.data(), nullptr, &events[2]);
|
||||
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
@@ -189,27 +178,7 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
|
||||
{
|
||||
const unsigned int bs = block_size;
|
||||
|
||||
auto *matToDecompose = jacMat;
|
||||
|
||||
if (opencl_ilu_reorder == ILUReorder::NONE) { // NONE should only be used in debug
|
||||
matToDecompose = jacMat ? jacMat : mat;
|
||||
} else {
|
||||
Timer t_reorder;
|
||||
if (jacMat) {
|
||||
matToDecompose = rJacMat.get();
|
||||
reorderNonzeroes(mat, reordermappingNonzeroes, rmat.get());
|
||||
reorderNonzeroes(jacMat, jacReordermappingNonzeroes, rJacMat.get());
|
||||
} else {
|
||||
matToDecompose = rmat.get();
|
||||
reorderNonzeroes(mat, reordermappingNonzeroes, rmat.get());
|
||||
}
|
||||
|
||||
if (verbosity >= 3){
|
||||
std::ostringstream out;
|
||||
out << "BILU0 reorder matrix: " << t_reorder.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
}
|
||||
auto *matToDecompose = jacMat ? jacMat : mat;
|
||||
|
||||
// TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
|
||||
// this copy can have mat or rmat ->nnzValues as origin, depending on the reorder strategy
|
||||
@@ -269,11 +238,11 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
|
||||
std::ostringstream out;
|
||||
for (int color = 0; color < numColors; ++color) {
|
||||
const unsigned int firstRow = rowsPerColorPrefix[color];
|
||||
const unsigned int lastRow = rowsPerColorPrefix[color+1];
|
||||
const unsigned int lastRow = rowsPerColorPrefix[color + 1];
|
||||
if (verbosity >= 5) {
|
||||
out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
|
||||
}
|
||||
OpenclKernels::ILU_decomp(firstRow, lastRow,
|
||||
OpenclKernels::ILU_decomp(firstRow, lastRow, s.rowIndices,
|
||||
s.LUvals, s.LUcols, s.LUrows, s.diagIndex,
|
||||
s.invDiagVals, rowsPerColor[color], block_size);
|
||||
}
|
||||
@@ -301,11 +270,11 @@ void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
|
||||
|
||||
for (int color = 0; color < numColors; ++color) {
|
||||
#if CHOW_PATEL
|
||||
OpenclKernels::ILU_apply1(s.Lvals, s.Lcols, s.Lrows,
|
||||
OpenclKernels::ILU_apply1(s.rowIndices, s.Lvals, s.Lcols, s.Lrows,
|
||||
s.diagIndex, y, x, s.rowsPerColor,
|
||||
color, rowsPerColor[color], block_size);
|
||||
#else
|
||||
OpenclKernels::ILU_apply1(s.LUvals, s.LUcols, s.LUrows,
|
||||
OpenclKernels::ILU_apply1(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
|
||||
s.diagIndex, y, x, s.rowsPerColor,
|
||||
color, rowsPerColor[color], block_size);
|
||||
#endif
|
||||
@@ -313,11 +282,11 @@ void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
|
||||
|
||||
for (int color = numColors - 1; color >= 0; --color) {
|
||||
#if CHOW_PATEL
|
||||
OpenclKernels::ILU_apply2(s.Uvals, s.Ucols, s.Urows,
|
||||
OpenclKernels::ILU_apply2(s.rowIndices, s.Uvals, s.Ucols, s.Urows,
|
||||
s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
|
||||
color, rowsPerColor[color], block_size);
|
||||
#else
|
||||
OpenclKernels::ILU_apply2(s.LUvals, s.LUcols, s.LUrows,
|
||||
OpenclKernels::ILU_apply2(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
|
||||
s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
|
||||
color, rowsPerColor[color], block_size);
|
||||
#endif
|
||||
|
||||
@@ -54,8 +54,6 @@ class BILU0 : public Preconditioner<block_size>
|
||||
|
||||
private:
|
||||
std::unique_ptr<BlockedMatrix> LUmat = nullptr;
|
||||
std::shared_ptr<BlockedMatrix> rmat = nullptr; // only used with PAR_SIM
|
||||
std::shared_ptr<BlockedMatrix> rJacMat = nullptr;
|
||||
#if CHOW_PATEL
|
||||
std::unique_ptr<BlockedMatrix> Lmat = nullptr, Umat = nullptr;
|
||||
#endif
|
||||
@@ -76,6 +74,7 @@ private:
|
||||
cl::Buffer invDiagVals;
|
||||
cl::Buffer diagIndex;
|
||||
cl::Buffer rowsPerColor;
|
||||
cl::Buffer rowIndices;
|
||||
#if CHOW_PATEL
|
||||
cl::Buffer Lvals, Lcols, Lrows;
|
||||
cl::Buffer Uvals, Ucols, Urows;
|
||||
@@ -117,12 +116,7 @@ public:
|
||||
|
||||
BlockedMatrix* getRMat() override
|
||||
{
|
||||
return rmat.get();
|
||||
}
|
||||
|
||||
BlockedMatrix* getRJacMat()
|
||||
{
|
||||
return rJacMat.get();
|
||||
return LUmat.get();
|
||||
}
|
||||
|
||||
std::tuple<std::vector<int>, std::vector<int>, std::vector<int>> get_preconditioner_structure()
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
/// Here, L is it's own BSR matrix.
|
||||
/// Only used with ChowPatelIlu.
|
||||
__kernel void ILU_apply1(
|
||||
__global const unsigned *rowIndices,
|
||||
__global const double *LUvals,
|
||||
__global const unsigned int *LUcols,
|
||||
__global const unsigned int *LUrows,
|
||||
@@ -30,14 +31,16 @@ __kernel void ILU_apply1(
|
||||
target_block_row += nodesPerColorPrefix[color];
|
||||
|
||||
while(target_block_row < nodesPerColorPrefix[color+1]){
|
||||
const unsigned int first_block = LUrows[target_block_row];
|
||||
const unsigned int last_block = LUrows[target_block_row+1];
|
||||
unsigned row_idx = rowIndices[target_block_row];
|
||||
|
||||
const unsigned int first_block = LUrows[row_idx];
|
||||
const unsigned int last_block = LUrows[row_idx+1];
|
||||
unsigned int block = first_block + lane / (bs*bs);
|
||||
double local_out = 0.0;
|
||||
|
||||
if(lane < num_active_threads){
|
||||
if(lane < bs){
|
||||
local_out = y[target_block_row*bs+lane];
|
||||
local_out = y[row_idx*bs+lane];
|
||||
}
|
||||
for(; block < last_block; block += num_blocks_per_warp){
|
||||
const double x_elem = x[LUcols[block]*bs + c];
|
||||
@@ -60,7 +63,7 @@ __kernel void ILU_apply1(
|
||||
}
|
||||
|
||||
if(lane < bs){
|
||||
const unsigned int row = target_block_row*bs + lane;
|
||||
const unsigned int row = row_idx*bs + lane;
|
||||
x[row] = tmp[lane];
|
||||
}
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
/// Here, L is inside a normal, square matrix.
|
||||
/// In this case, diagIndex indicates where the rows of L end.
|
||||
__kernel void ILU_apply1(
|
||||
__global const unsigned *rowIndices,
|
||||
__global const double *LUvals,
|
||||
__global const unsigned int *LUcols,
|
||||
__global const unsigned int *LUrows,
|
||||
@@ -29,14 +30,17 @@ __kernel void ILU_apply1(
|
||||
const unsigned int r = lane % bs;
|
||||
|
||||
while(target_block_row < nodesPerColorPrefix[color+1]){
|
||||
const unsigned int first_block = LUrows[target_block_row];
|
||||
const unsigned int last_block = diagIndex[target_block_row];
|
||||
unsigned row_idx = rowIndices[target_block_row];
|
||||
|
||||
const unsigned int first_block = LUrows[row_idx];
|
||||
const unsigned int last_block = diagIndex[row_idx];
|
||||
|
||||
unsigned int block = first_block + lane / (bs*bs);
|
||||
double local_out = 0.0;
|
||||
|
||||
if(lane < num_active_threads){
|
||||
if(lane < bs){
|
||||
local_out = y[target_block_row*bs+lane];
|
||||
local_out = y[row_idx*bs+lane];
|
||||
}
|
||||
for(; block < last_block; block += num_blocks_per_warp){
|
||||
const double x_elem = x[LUcols[block]*bs + c];
|
||||
@@ -59,7 +63,7 @@ __kernel void ILU_apply1(
|
||||
}
|
||||
|
||||
if(lane < bs){
|
||||
const unsigned int row = target_block_row*bs + lane;
|
||||
const unsigned int row = row_idx*bs + lane;
|
||||
x[row] = tmp[lane];
|
||||
}
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
/// Here, U is it's own BSR matrix.
|
||||
/// Only used with ChowPatelIlu.
|
||||
__kernel void ILU_apply2(
|
||||
__global const unsigned *rowIndices,
|
||||
__global const double *LUvals,
|
||||
__global const int *LUcols,
|
||||
__global const int *LUrows,
|
||||
@@ -29,14 +30,15 @@ __kernel void ILU_apply2(
|
||||
const unsigned int r = lane % bs;
|
||||
|
||||
while(target_block_row < nodesPerColorPrefix[color+1]){
|
||||
const unsigned int first_block = LUrows[target_block_row];
|
||||
const unsigned int last_block = LUrows[target_block_row+1];
|
||||
unsigned row_idx = rowIndices[target_block_row];
|
||||
const unsigned int first_block = LUrows[row_idx];
|
||||
const unsigned int last_block = LUrows[row_idx+1];
|
||||
unsigned int block = first_block + lane / (bs*bs);
|
||||
double local_out = 0.0;
|
||||
|
||||
if(lane < num_active_threads){
|
||||
if(lane < bs){
|
||||
const unsigned int row = target_block_row*bs+lane;
|
||||
const unsigned int row = row_idx*bs+lane;
|
||||
local_out = x[row];
|
||||
}
|
||||
for(; block < last_block; block += num_blocks_per_warp){
|
||||
@@ -64,10 +66,10 @@ __kernel void ILU_apply2(
|
||||
tmp[lane + bs*idx_t/warpsize] = local_out;
|
||||
double sum = 0.0;
|
||||
for(int i = 0; i < bs; ++i){
|
||||
sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
|
||||
sum += invDiagVals[row_idx*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
|
||||
}
|
||||
|
||||
const unsigned int row = target_block_row*bs + lane;
|
||||
const unsigned int row = row_idx*bs + lane;
|
||||
x[row] = sum;
|
||||
}
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
/// Here, U is inside a normal, square matrix.
|
||||
/// In this case diagIndex indicates where the rows of U start.
|
||||
__kernel void ILU_apply2(
|
||||
__global const unsigned *rowIndices,
|
||||
__global const double *LUvals,
|
||||
__global const int *LUcols,
|
||||
__global const int *LUrows,
|
||||
@@ -30,14 +31,15 @@ __kernel void ILU_apply2(
|
||||
target_block_row += nodesPerColorPrefix[color];
|
||||
|
||||
while(target_block_row < nodesPerColorPrefix[color+1]){
|
||||
const unsigned int first_block = diagIndex[target_block_row] + 1;
|
||||
const unsigned int last_block = LUrows[target_block_row+1];
|
||||
unsigned row_idx = rowIndices[target_block_row];
|
||||
const unsigned int first_block = diagIndex[row_idx] + 1;
|
||||
const unsigned int last_block = LUrows[row_idx+1];
|
||||
unsigned int block = first_block + lane / (bs*bs);
|
||||
double local_out = 0.0;
|
||||
|
||||
if(lane < num_active_threads){
|
||||
if(lane < bs){
|
||||
const unsigned int row = target_block_row*bs+lane;
|
||||
const unsigned int row = row_idx*bs+lane;
|
||||
local_out = x[row];
|
||||
}
|
||||
for(; block < last_block; block += num_blocks_per_warp){
|
||||
@@ -65,10 +67,10 @@ __kernel void ILU_apply2(
|
||||
tmp[lane + bs*idx_t/warpsize] = local_out;
|
||||
double sum = 0.0;
|
||||
for(int i = 0; i < bs; ++i){
|
||||
sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
|
||||
sum += invDiagVals[row_idx*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
|
||||
}
|
||||
|
||||
const unsigned int row = target_block_row*bs + lane;
|
||||
const unsigned int row = row_idx*bs + lane;
|
||||
x[row] = sum;
|
||||
}
|
||||
|
||||
|
||||
@@ -68,6 +68,7 @@ __kernel void inverter(__global double *matrix, __global double *inverse)
|
||||
/// The kernel takes a full BSR matrix and performs inplace ILU decomposition
|
||||
__kernel void ilu_decomp(const unsigned int firstRow,
|
||||
const unsigned int lastRow,
|
||||
__global const unsigned *rowIndices,
|
||||
__global double *LUvals,
|
||||
__global const int *LUcols,
|
||||
__global const int *LUrows,
|
||||
@@ -91,14 +92,15 @@ __kernel void ilu_decomp(const unsigned int firstRow,
|
||||
// go through all rows
|
||||
for (int i = firstRow + work_group_id * hwarps_per_group + hwarp_id_in_group; i < lastRow; i += num_groups * hwarps_per_group)
|
||||
{
|
||||
int iRowStart = LUrows[i];
|
||||
int iRowEnd = LUrows[i + 1];
|
||||
const unsigned row = rowIndices[i];
|
||||
int iRowStart = LUrows[row];
|
||||
int iRowEnd = LUrows[row + 1];
|
||||
|
||||
// go through all elements of the row
|
||||
for (int ij = iRowStart; ij < iRowEnd; ij++) {
|
||||
int j = LUcols[ij];
|
||||
|
||||
if (j < i) {
|
||||
if (j < row) {
|
||||
// calculate the pivot of this row
|
||||
block_mult(LUvals + ij * bs * bs, invDiagVals + j * bs * bs, pivot + lmem_offset);
|
||||
|
||||
@@ -127,11 +129,11 @@ __kernel void ilu_decomp(const unsigned int firstRow,
|
||||
}
|
||||
|
||||
// store the inverse in the diagonal
|
||||
inverter(LUvals + diagIndex[i] * bs * bs, invDiagVals + i * bs * bs);
|
||||
inverter(LUvals + diagIndex[row] * bs * bs, invDiagVals + row * bs * bs);
|
||||
|
||||
// copy inverse
|
||||
if (thread_id_in_hwarp < bs * bs) {
|
||||
LUvals[diagIndex[i] * bs * bs + thread_id_in_hwarp] = invDiagVals[i * bs * bs + thread_id_in_hwarp];
|
||||
LUvals[diagIndex[row] * bs * bs + thread_id_in_hwarp] = invDiagVals[row * bs * bs + thread_id_in_hwarp];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -390,7 +390,7 @@ void OpenclKernels::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& row
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols,
|
||||
void OpenclKernels::ILU_apply1(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols,
|
||||
cl::Buffer& rows, cl::Buffer& diagIndex,
|
||||
const cl::Buffer& y, cl::Buffer& x,
|
||||
cl::Buffer& rowsPerColor, int color,
|
||||
@@ -403,8 +403,8 @@ void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols,
|
||||
Timer t_ilu_apply1;
|
||||
|
||||
cl::Event event = (*ILU_apply1_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||
vals, cols, rows, diagIndex, y, x,
|
||||
rowsPerColor, color, block_size,
|
||||
rowIndices, vals, cols, rows, diagIndex,
|
||||
y, x, rowsPerColor, color, block_size,
|
||||
cl::Local(lmem_per_work_group));
|
||||
|
||||
if (verbosity >= 5) {
|
||||
@@ -415,7 +415,7 @@ void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols,
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols,
|
||||
void OpenclKernels::ILU_apply2(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols,
|
||||
cl::Buffer& rows, cl::Buffer& diagIndex,
|
||||
cl::Buffer& invDiagVals, cl::Buffer& x,
|
||||
cl::Buffer& rowsPerColor, int color,
|
||||
@@ -428,8 +428,8 @@ void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols,
|
||||
Timer t_ilu_apply2;
|
||||
|
||||
cl::Event event = (*ILU_apply2_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||
vals, cols, rows, diagIndex, invDiagVals,
|
||||
x, rowsPerColor, color, block_size,
|
||||
rowIndices, vals, cols, rows, diagIndex,
|
||||
invDiagVals, x, rowsPerColor, color, block_size,
|
||||
cl::Local(lmem_per_work_group));
|
||||
|
||||
if (verbosity >= 5) {
|
||||
@@ -440,7 +440,7 @@ void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols,
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::ILU_decomp(int firstRow, int lastRow,
|
||||
void OpenclKernels::ILU_decomp(int firstRow, int lastRow, cl::Buffer& rowIndices,
|
||||
cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
|
||||
cl::Buffer& diagIndex, cl::Buffer& invDiagVals,
|
||||
int rowsThisColor, unsigned int block_size)
|
||||
@@ -453,7 +453,8 @@ void OpenclKernels::ILU_decomp(int firstRow, int lastRow,
|
||||
Timer t_ilu_decomp;
|
||||
|
||||
cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||
firstRow, lastRow, vals, cols, rows,
|
||||
firstRow, lastRow, rowIndices,
|
||||
vals, cols, rows,
|
||||
invDiagVals, diagIndex, rowsThisColor,
|
||||
cl::Local(lmem_per_work_group));
|
||||
|
||||
|
||||
@@ -39,9 +39,9 @@ using residual_blocked_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&,
|
||||
cl::Buffer&, const cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>;
|
||||
using residual_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
|
||||
cl::Buffer&, const cl::Buffer&, cl::Buffer&, cl::LocalSpaceArg>;
|
||||
using ilu_apply1_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const cl::Buffer&,
|
||||
using ilu_apply1_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const cl::Buffer&,
|
||||
cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>;
|
||||
using ilu_apply2_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||
using ilu_apply2_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||
cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>;
|
||||
using stdwell_apply_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||
cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||
@@ -51,7 +51,7 @@ using stdwell_apply_no_reorder_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::
|
||||
cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||
const unsigned int, const unsigned int, cl::Buffer&,
|
||||
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>;
|
||||
using ilu_decomp_kernel_type = cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&,
|
||||
using ilu_decomp_kernel_type = cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||
cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg>;
|
||||
using isaiL_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||
cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>;
|
||||
@@ -135,13 +135,13 @@ public:
|
||||
static void spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, const cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size, bool reset = true, bool add = false);
|
||||
static void residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, const cl::Buffer& rhs, cl::Buffer& out, int Nb, unsigned int block_size);
|
||||
|
||||
static void ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex,
|
||||
static void ILU_apply1(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex,
|
||||
const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size);
|
||||
|
||||
static void ILU_apply2(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex,
|
||||
static void ILU_apply2(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex,
|
||||
cl::Buffer& invDiagVals, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size);
|
||||
|
||||
static void ILU_decomp(int firstRow, int lastRow, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
|
||||
static void ILU_decomp(int firstRow, int lastRow, cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
|
||||
cl::Buffer& diagIndex, cl::Buffer& invDiagVals, int Nb, unsigned int block_size);
|
||||
|
||||
static void apply_stdwells_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,
|
||||
|
||||
@@ -232,12 +232,6 @@ void openclSolverBackend<block_size>::setOpencl(std::shared_ptr<cl::Context>& co
|
||||
queue = queue_;
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
openclSolverBackend<block_size>::~openclSolverBackend() {
|
||||
finalize();
|
||||
}
|
||||
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
|
||||
@@ -464,7 +458,6 @@ void openclSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix>
|
||||
|
||||
bool reorder = (opencl_ilu_reorder != ILUReorder::NONE);
|
||||
if (reorder) {
|
||||
rb = new double[N];
|
||||
d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb);
|
||||
}
|
||||
|
||||
@@ -482,13 +475,6 @@ void openclSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix>
|
||||
initialized = true;
|
||||
} // end initialize()
|
||||
|
||||
template <unsigned int block_size>
|
||||
void openclSolverBackend<block_size>::finalize() {
|
||||
if (opencl_ilu_reorder != ILUReorder::NONE) {
|
||||
delete[] rb;
|
||||
}
|
||||
} // end finalize()
|
||||
|
||||
template <unsigned int block_size>
|
||||
void openclSolverBackend<block_size>::copy_system_to_gpu() {
|
||||
Timer t;
|
||||
@@ -508,7 +494,7 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
|
||||
|
||||
err |= queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, rmat->colIndices, nullptr, &events[1]);
|
||||
err |= queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), rmat->rowPointers, nullptr, &events[2]);
|
||||
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb, nullptr, &events[3]);
|
||||
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, h_b, nullptr, &events[3]);
|
||||
err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[4]);
|
||||
if (opencl_ilu_reorder != ILUReorder::NONE) {
|
||||
events.resize(6);
|
||||
@@ -546,7 +532,7 @@ void openclSolverBackend<block_size>::update_system_on_gpu() {
|
||||
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues, nullptr, &events[0]);
|
||||
#endif
|
||||
|
||||
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb, nullptr, &events[1]);
|
||||
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, h_b, nullptr, &events[1]);
|
||||
err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[2]);
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
@@ -602,13 +588,8 @@ void openclSolverBackend<block_size>::update_system(double *vals, double *b, Wel
|
||||
Timer t;
|
||||
|
||||
mat->nnzValues = vals;
|
||||
if (opencl_ilu_reorder != ILUReorder::NONE) {
|
||||
reorderBlockedVectorByPattern<block_size>(mat->Nb, b, fromOrder, rb);
|
||||
static_cast<WellContributionsOCL&>(wellContribs).setReordering(toOrder, true);
|
||||
} else {
|
||||
rb = b;
|
||||
static_cast<WellContributionsOCL&>(wellContribs).setReordering(nullptr, false);
|
||||
}
|
||||
h_b = b;
|
||||
static_cast<WellContributionsOCL&>(wellContribs).setReordering(nullptr, false);
|
||||
|
||||
if (verbosity > 2) {
|
||||
std::ostringstream out;
|
||||
@@ -670,12 +651,7 @@ template <unsigned int block_size>
|
||||
void openclSolverBackend<block_size>::get_result(double *x) {
|
||||
Timer t;
|
||||
|
||||
if (opencl_ilu_reorder != ILUReorder::NONE) {
|
||||
queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, rb);
|
||||
reorderBlockedVectorByPattern<block_size>(mat->Nb, rb, toOrder, x);
|
||||
} else {
|
||||
queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, x);
|
||||
}
|
||||
queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, x);
|
||||
|
||||
if (verbosity > 2) {
|
||||
std::ostringstream out;
|
||||
|
||||
@@ -51,7 +51,7 @@ class openclSolverBackend : public BdaSolver<block_size>
|
||||
using Base::initialized;
|
||||
|
||||
private:
|
||||
double *rb = nullptr; // reordered b vector, if the matrix is reordered, rb is newly allocated, otherwise it just points to b
|
||||
double *h_b = nullptr; // b vector, on host
|
||||
std::vector<double> vals_contiguous; // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp
|
||||
|
||||
// OpenCL variables must be reusable, they are initialized in initialize()
|
||||
@@ -184,9 +184,6 @@ public:
|
||||
/// For the CPR coarse solver
|
||||
openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, ILUReorder opencl_ilu_reorder);
|
||||
|
||||
/// Destroy a openclSolver, and free memory
|
||||
~openclSolverBackend();
|
||||
|
||||
/// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
|
||||
/// \param[in] matrix matrix A
|
||||
/// \param[in] b input vector, contains N values
|
||||
|
||||
Reference in New Issue
Block a user