Mirror of https://github.com/OPM/opm-simulators.git (synced 2025-02-25 18:55:30 -06:00)
Bugfix: use full matrix for spmv
Remove references to reordering

parent 24f8f7c857
commit d30073a885
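What "use full matrix for spmv" means in practice is easiest to see in the MultisegmentWellContribution::apply and stdwell_apply hunks below: the block-sparse products now read the x/y vectors with the column indices of the full matrix (Bcols/Ccols) directly, instead of translating them through a toOrder mapping. The following is a minimal standalone sketch of that access pattern; the function name and container types are illustrative, while Mb, dim, dim_wells, Brows, Bcols and Bnnzs follow the naming used in the hunks below.

#include <vector>

// z1 += B * x for a block-sparse B with Mb block rows; every block is
// dim_wells x dim and stored row-wise in Bnnzs. The column index of each
// block is taken directly from Bcols (no toOrder lookup anymore).
void block_spmv(unsigned int Mb, unsigned int dim, unsigned int dim_wells,
                const std::vector<unsigned int>& Brows,
                const std::vector<unsigned int>& Bcols,
                const std::vector<double>& Bnnzs,
                const std::vector<double>& x,
                std::vector<double>& z1)
{
    for (unsigned int row = 0; row < Mb; ++row) {
        // for every block in the row
        for (unsigned int blockID = Brows[row]; blockID < Brows[row + 1]; ++blockID) {
            unsigned int colIdx = Bcols[blockID];   // previously: getColIdx(Bcols[blockID])
            for (unsigned int j = 0; j < dim_wells; ++j) {
                double temp = 0.0;
                for (unsigned int k = 0; k < dim; ++k) {
                    temp += Bnnzs[blockID * dim * dim_wells + j * dim + k] * x[colIdx * dim + k];
                }
                z1[row * dim_wells + j] += temp;
            }
        }
    }
}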
@@ -57,7 +57,7 @@ public:
     /// \param[in] tolerance required relative tolerance for BdaSolver
     /// \param[in] platformID the OpenCL platform ID to be used
     /// \param[in] deviceID the device ID to be used by the cusparse- and openclSolvers, too high values could cause runtime errors
-    /// \param[in] opencl_ilu_parallel whether to parallelize the ILU decomposition and application in OpenCL
+    /// \param[in] opencl_ilu_parallel whether to parallelize the ILU decomposition and application in OpenCL with level_scheduling
     /// \param[in] linsolver indicating the preconditioner, equal to the --linear-solver cmdline argument
     BdaBridge(std::string accelerator_mode, std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance,
               unsigned int platformID, unsigned int deviceID, bool opencl_ilu_parallel, std::string linsolver);
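The opencl_ilu_parallel option documented in the hunk above refers to level scheduling: rows of the blocked matrix are grouped into levels such that every row depends only on rows in earlier levels, so all rows of one level can be processed in parallel during the ILU decomposition and triangular solves. As a generic illustration (not code from the repository; rowPointers/colIndices follow the naming used elsewhere in this diff, computeLevels is a hypothetical helper):

#include <algorithm>
#include <vector>

// Assign a level to every (block) row of a lower-triangular dependency
// pattern: level[row] = 1 + max(level of the rows it depends on).
std::vector<int> computeLevels(const std::vector<int>& rowPointers,
                               const std::vector<int>& colIndices)
{
    const int Nb = static_cast<int>(rowPointers.size()) - 1;
    std::vector<int> level(Nb, 0);
    for (int row = 0; row < Nb; ++row) {
        for (int p = rowPointers[row]; p < rowPointers[row + 1]; ++p) {
            const int col = colIndices[p];
            if (col < row) {   // strictly lower triangle: a dependency
                level[row] = std::max(level[row], level[col] + 1);
            }
        }
    }
    return level;              // rows with the same level can be handled together
}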
@@ -77,7 +77,7 @@ void MultisegmentWellContribution::apply(double *h_x, double *h_y)
     for (unsigned int row = 0; row < Mb; ++row) {
         // for every block in the row
         for (unsigned int blockID = Brows[row]; blockID < Brows[row + 1]; ++blockID) {
-            unsigned int colIdx = getColIdx(Bcols[blockID]);
+            unsigned int colIdx = Bcols[blockID];
             for (unsigned int j = 0; j < dim_wells; ++j) {
                 double temp = 0.0;
                 for (unsigned int k = 0; k < dim; ++k) {
@@ -97,7 +97,7 @@ void MultisegmentWellContribution::apply(double *h_x, double *h_y)
     for (unsigned int row = 0; row < Mb; ++row) {
         // for every block in the row
         for (unsigned int blockID = Brows[row]; blockID < Brows[row + 1]; ++blockID) {
-            unsigned int colIdx = getColIdx(Bcols[blockID]);
+            unsigned int colIdx = Bcols[blockID];
             for (unsigned int j = 0; j < dim; ++j) {
                 double temp = 0.0;
                 for (unsigned int k = 0; k < dim_wells; ++k) {
@@ -116,20 +116,5 @@ void MultisegmentWellContribution::setCudaStream(cudaStream_t stream_)
 }
 #endif
 
-unsigned int MultisegmentWellContribution::getColIdx(unsigned int idx)
-{
-    if (reorder) {
-        return toOrder[idx];
-    } else {
-        return idx;
-    }
-}
-
-void MultisegmentWellContribution::setReordering(int *toOrder_, bool reorder_)
-{
-    this->toOrder = toOrder_;
-    this->reorder = reorder_;
-}
-
 } //namespace Opm
 
@@ -68,9 +68,6 @@ private:
     std::vector<double> z2; // z2 = D^-1 * B * x
     void *UMFPACK_Symbolic, *UMFPACK_Numeric;
 
-    int *toOrder = nullptr;
-    bool reorder = false;
-
     /// Translate the columnIndex if needed
     /// Some preconditioners reorder the rows of the matrix, this means the columnIndices of the wellcontributions need to be reordered as well
     unsigned int getColIdx(unsigned int idx);
@@ -117,12 +114,6 @@ public:
     /// \param[in] h_x vector x, must be on CPU
     /// \param[inout] h_y vector y, must be on CPU
     void apply(double *h_x, double *h_y);
-
-    /// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect
-    /// Those indices need to be mapped via toOrder
-    /// \param[in] toOrder array with mappings
-    /// \param[in] reorder whether reordering is actually used or not
-    void setReordering(int *toOrder, bool reorder);
 };
 
 } //namespace Opm
@@ -184,7 +184,6 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
     auto *matToDecompose = jacMat ? jacMat : mat;
 
     // TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
-    // this copy can have mat or rmat ->nnzValues as origin, depending on the reorder strategy
     Timer t_copy;
     memcpy(LUmat->nnzValues, matToDecompose->nnzValues, sizeof(double) * bs * bs * matToDecompose->nnzbs);
 
@@ -209,7 +208,6 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
 
     std::call_once(pattern_uploaded, [&](){
         // find the positions of each diagonal block
-        // must be done after reordering
         for (int row = 0; row < Nb; ++row) {
             int rowStart = LUmat->rowPointers[row];
             int rowEnd = LUmat->rowPointers[row+1];
@@ -35,7 +35,8 @@ namespace Accelerator
 {
 
 /// This class implements a Blocked ILU0 preconditioner
-/// The decomposition is done on CPU, and reorders the rows of the matrix
+/// The decomposition is done on GPU, using exact decomposition, or ChowPatel decomposition
+/// The preconditioner is applied via two exact triangular solves
 template <unsigned int block_size>
 class BILU0 : public Preconditioner<block_size>
 {
@@ -66,9 +67,6 @@ private:
 
     bool opencl_ilu_parallel;
 
-    std::vector<int> reordermappingNonzeroes;    // maps nonzero blocks to new location in reordered matrix
-    std::vector<int> jacReordermappingNonzeroes; // same but for jacMatrix
-
     typedef struct {
         cl::Buffer invDiagVals;
         cl::Buffer diagIndex;
@@ -92,7 +90,7 @@ public:
 
     BILU0(bool opencl_ilu_parallel, int verbosity);
 
-    // analysis, find reordering if specified
+    // analysis, extract parallelism if specified
     bool analyze_matrix(BlockedMatrix *mat) override;
     bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
 
@@ -101,23 +99,10 @@ public:
     bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
 
     // apply preconditioner, x = prec(y)
+    // via Lz = y
+    // and Ux = z
     void apply(const cl::Buffer& y, cl::Buffer& x) override;
 
-    int* getToOrder() override
-    {
-        return toOrder.data();
-    }
-
-    int* getFromOrder() override
-    {
-        return fromOrder.data();
-    }
-
-    BlockedMatrix* getRMat() override
-    {
-        return LUmat.get();
-    }
-
     std::tuple<std::vector<int>, std::vector<int>, std::vector<int>> get_preconditioner_structure()
     {
         return {{LUmat->rowPointers, LUmat->rowPointers + (Nb + 1)}, {LUmat->colIndices, LUmat->colIndices + nnzb}, diagIndex};
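The comments added above (// via Lz = y, // and Ux = z) describe the standard way an ILU0 factorization is applied. For reference, here is a generic scalar CSR version of those two triangular solves; the repository's BILU0 performs the equivalent on blocked data with OpenCL kernels, and the names ilu0_apply and LUvals are illustrative (rowPointers, colIndices and diagIndex mirror the members used in get_preconditioner_structure above).

#include <vector>

// Apply x = (LU)^-1 y for an ILU0 factorization stored in one CSR matrix
// with an implicit unit diagonal in L; diagIndex[i] points at the diagonal
// entry of row i.
void ilu0_apply(const std::vector<int>& rowPointers,
                const std::vector<int>& colIndices,
                const std::vector<double>& LUvals,
                const std::vector<int>& diagIndex,
                const std::vector<double>& y,
                std::vector<double>& x)
{
    const int N = static_cast<int>(rowPointers.size()) - 1;
    std::vector<double> z(N);

    // forward solve: L z = y
    for (int i = 0; i < N; ++i) {
        double sum = y[i];
        for (int p = rowPointers[i]; p < diagIndex[i]; ++p) {
            sum -= LUvals[p] * z[colIndices[p]];
        }
        z[i] = sum;
    }

    // backward solve: U x = z
    for (int i = N - 1; i >= 0; --i) {
        double sum = z[i];
        for (int p = diagIndex[i] + 1; p < rowPointers[i + 1]; ++p) {
            sum -= LUvals[p] * x[colIndices[p]];
        }
        x[i] = sum / LUvals[diagIndex[i]];
    }
}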
@@ -115,7 +115,7 @@ public:
     // set own Opencl variables, but also that of the bilu0 preconditioner
     void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue) override;
 
-    // analysis, find reordering if specified
+    // analysis, extract parallelism
     bool analyze_matrix(BlockedMatrix *mat) override;
     bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
 
@@ -125,21 +125,6 @@ public:
 
     // apply preconditioner, x = prec(y)
     void apply(const cl::Buffer& y, cl::Buffer& x) override;
-
-    int* getToOrder() override
-    {
-        return bilu0->getToOrder();
-    }
-
-    int* getFromOrder() override
-    {
-        return bilu0->getFromOrder();
-    }
-
-    BlockedMatrix* getRMat() override
-    {
-        return bilu0->getRMat();
-    }
 };
 
 /// Similar function to csrPatternToCsc. It gives an offset map from CSR to CSC instead of the full CSR to CSC conversion.
@@ -97,7 +97,7 @@ private:
     unsigned num_post_smooth_steps; // number of Jacobi smooth steps after prolongation
 
     std::unique_ptr<openclSolverBackend<1> > coarse_solver; // coarse solver is scalar
-    bool opencl_ilu_parallel; // reordering strategy for ILU0 in coarse solver
+    bool opencl_ilu_parallel; // whether ILU0 operation should be parallelized
 
     // Analyze the AMG hierarchy build by Dune
     void analyzeHierarchy();
@@ -135,21 +135,6 @@ public:
 
     bool create_preconditioner(BlockedMatrix *mat) override;
     bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
-
-    int* getToOrder() override
-    {
-        return bilu0->getToOrder();
-    }
-
-    int* getFromOrder() override
-    {
-        return bilu0->getFromOrder();
-    }
-
-    BlockedMatrix* getRMat() override
-    {
-        return bilu0->getRMat();
-    }
 };
 
 // solve A^T * x = b
@@ -797,7 +797,6 @@ void ChowPatelIlu<block_size>::decomposition(
 
     std::call_once(pattern_uploaded, [&](){
         // find the positions of each diagonal block
-        // must be done after reordering
         for (int row = 0; row < Nb; ++row) {
             int rowStart = LUmat->rowPointers[row];
             int rowEnd = LUmat->rowPointers[row+1];
@@ -77,13 +77,6 @@ public:
     // the version with two params can be overloaded, if not, it will default to using the one param version
     virtual bool create_preconditioner(BlockedMatrix *mat) = 0;
     virtual bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat);
-
-    // get reordering mappings
-    virtual int* getToOrder() = 0;
-    virtual int* getFromOrder() = 0;
-
-    // get reordered matrix
-    virtual BlockedMatrix* getRMat() = 0;
 };
 
 } //namespace Accelerator
@@ -1,5 +1,4 @@
-/// In this kernel there is reordering: the B/Ccols do not correspond with the x/y vector
-/// the x/y vector is reordered, using toOrder to address that
+/// Applies sdtwells
 __kernel void stdwell_apply(
             __global const double *Cnnzs,
             __global const double *Dnnzs,
@@ -8,7 +7,6 @@ __kernel void stdwell_apply(
             __global const int *Bcols,
             __global const double *x,
             __global double *y,
-            __global const int *toOrder,
             const unsigned int dim,
             const unsigned int dim_wells,
             __global const unsigned int *val_pointers,
@@ -32,7 +30,7 @@ __kernel void stdwell_apply(
     if(wiId < numActiveWorkItems){
         int b = wiId/valsPerBlock + val_pointers[wgId];
         while(b < valSize + val_pointers[wgId]){
-            int colIdx = toOrder[Bcols[b]];
+            int colIdx = Bcols[b];
             localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
             b += numBlocksPerWarp;
         }
@@ -78,7 +76,7 @@ __kernel void stdwell_apply(
            temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
        }
 
-       int colIdx = toOrder[Ccols[bb]];
+       int colIdx = Ccols[bb];
        y[colIdx*dim + c] -= temp;
    }
 }
@@ -1,82 +0,0 @@
-/// Applies sdtwells without reordering
-__kernel void stdwell_apply_no_reorder(
-            __global const double *Cnnzs,
-            __global const double *Dnnzs,
-            __global const double *Bnnzs,
-            __global const int *Ccols,
-            __global const int *Bcols,
-            __global const double *x,
-            __global double *y,
-            const unsigned int dim,
-            const unsigned int dim_wells,
-            __global const unsigned int *val_pointers,
-            __local double *localSum,
-            __local double *z1,
-            __local double *z2)
-{
-    int wgId = get_group_id(0);
-    int wiId = get_local_id(0);
-    int valSize = val_pointers[wgId + 1] - val_pointers[wgId];
-    int valsPerBlock = dim*dim_wells;
-    int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock;
-    int numBlocksPerWarp = get_local_size(0)/valsPerBlock;
-    int c = wiId % dim;
-    int r = (wiId/dim) % dim_wells;
-    double temp;
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    localSum[wiId] = 0;
-    if(wiId < numActiveWorkItems){
-        int b = wiId/valsPerBlock + val_pointers[wgId];
-        while(b < valSize + val_pointers[wgId]){
-            int colIdx = Bcols[b];
-            localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
-            b += numBlocksPerWarp;
-        }
-
-        // merge all blocks in this workgroup into 1 block
-        // if numBlocksPerWarp >= 3, should use loop
-        // block 1:      block 2:
-        //  0  1  2      12 13 14
-        //  3  4  5      15 16 17
-        //  6  7  8      18 19 20
-        //  9 10 11      21 22 23
-        // workitem i will hold the sum of workitems i and i + valsPerBlock
-        if(wiId < valsPerBlock){
-            for (int i = 1; i < numBlocksPerWarp; ++i) {
-                localSum[wiId] += localSum[wiId + i*valsPerBlock];
-            }
-        }
-
-        if(c == 0 && wiId < valsPerBlock){
-            for(unsigned int i = dim - 1; i > 0; --i){
-                localSum[wiId] += localSum[wiId + i];
-            }
-            z1[r] = localSum[wiId];
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(wiId < dim_wells){
-        temp = 0.0;
-        for(unsigned int i = 0; i < dim_wells; ++i){
-            temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
-        }
-        z2[wiId] = temp;
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(wiId < dim*valSize){
-        temp = 0.0;
-        int bb = wiId/dim + val_pointers[wgId];
-        for (unsigned int j = 0; j < dim_wells; ++j){
-            temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
-        }
-
-        int colIdx = Ccols[bb];
-        y[colIdx*dim + c] -= temp;
-    }
-}
@@ -61,7 +61,6 @@ std::unique_ptr<residual_kernel_type> OpenclKernels::residual_k;
 std::unique_ptr<ilu_apply1_kernel_type> OpenclKernels::ILU_apply1_k;
 std::unique_ptr<ilu_apply2_kernel_type> OpenclKernels::ILU_apply2_k;
 std::unique_ptr<stdwell_apply_kernel_type> OpenclKernels::stdwell_apply_k;
-std::unique_ptr<stdwell_apply_no_reorder_kernel_type> OpenclKernels::stdwell_apply_no_reorder_k;
 std::unique_ptr<ilu_decomp_kernel_type> OpenclKernels::ilu_decomp_k;
 std::unique_ptr<isaiL_kernel_type> OpenclKernels::isaiL_k;
 std::unique_ptr<isaiU_kernel_type> OpenclKernels::isaiU_k;
@@ -106,7 +105,6 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
     sources.emplace_back(ILU_apply2_fm_str);
 #endif
     sources.emplace_back(stdwell_apply_str);
-    sources.emplace_back(stdwell_apply_no_reorder_str);
     sources.emplace_back(ILU_decomp_str);
     sources.emplace_back(isaiL_str);
     sources.emplace_back(isaiU_str);
@@ -136,7 +134,6 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
     ILU_apply1_k.reset(new ilu_apply1_kernel_type(cl::Kernel(program, "ILU_apply1")));
     ILU_apply2_k.reset(new ilu_apply2_kernel_type(cl::Kernel(program, "ILU_apply2")));
     stdwell_apply_k.reset(new stdwell_apply_kernel_type(cl::Kernel(program, "stdwell_apply")));
-    stdwell_apply_no_reorder_k.reset(new stdwell_apply_no_reorder_kernel_type(cl::Kernel(program, "stdwell_apply_no_reorder")));
     ilu_decomp_k.reset(new ilu_decomp_kernel_type(cl::Kernel(program, "ilu_decomp")));
     isaiL_k.reset(new isaiL_kernel_type(cl::Kernel(program, "isaiL")));
     isaiU_k.reset(new isaiU_kernel_type(cl::Kernel(program, "isaiU")));
@@ -466,29 +463,7 @@ void OpenclKernels::ILU_decomp(int firstRow, int lastRow, cl::Buffer& rowIndices
     }
 }
 
-void OpenclKernels::apply_stdwells_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,
-                                           cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y,
-                                           cl::Buffer &d_toOrder, int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells)
-{
-    const unsigned int work_group_size = 32;
-    const unsigned int total_work_items = num_std_wells * work_group_size;
-    const unsigned int lmem1 = sizeof(double) * work_group_size;
-    const unsigned int lmem2 = sizeof(double) * dim_wells;
-    Timer t_apply_stdwells;
-
-    cl::Event event = (*stdwell_apply_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
-                                         d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, d_toOrder, dim, dim_wells, d_val_pointers_ocl,
-                                         cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
-
-    if (verbosity >= 4) {
-        event.wait();
-        std::ostringstream oss;
-        oss << std::scientific << "OpenclKernels apply_stdwells() time: " << t_apply_stdwells.stop() << " s";
-        OpmLog::info(oss.str());
-    }
-}
-
-void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,
+void OpenclKernels::apply_stdwells(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,
                                    cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y,
                                    int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells)
 {
@@ -498,7 +473,7 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
     const unsigned int lmem2 = sizeof(double) * dim_wells;
     Timer t_apply_stdwells;
 
-    cl::Event event = (*stdwell_apply_no_reorder_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
+    cl::Event event = (*stdwell_apply_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
                                          d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells, d_val_pointers_ocl,
                                          cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
 
@@ -44,10 +44,6 @@ using ilu_apply1_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::B
 using ilu_apply2_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                                  cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>;
 using stdwell_apply_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
-                                                    cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
-                                                    const unsigned int, const unsigned int, cl::Buffer&,
-                                                    cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>;
-using stdwell_apply_no_reorder_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                                     cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                                     const unsigned int, const unsigned int, cl::Buffer&,
                                                     cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>;
@@ -85,7 +81,6 @@ private:
     static std::unique_ptr<ilu_apply1_kernel_type> ILU_apply1_k;
     static std::unique_ptr<ilu_apply2_kernel_type> ILU_apply2_k;
     static std::unique_ptr<stdwell_apply_kernel_type> stdwell_apply_k;
-    static std::unique_ptr<stdwell_apply_no_reorder_kernel_type> stdwell_apply_no_reorder_k;
     static std::unique_ptr<ilu_decomp_kernel_type> ilu_decomp_k;
     static std::unique_ptr<isaiL_kernel_type> isaiL_k;
     static std::unique_ptr<isaiU_kernel_type> isaiU_k;
@@ -116,7 +111,6 @@ public:
     static const std::string ILU_apply2_fm_str;
 #endif
     static const std::string stdwell_apply_str;
-    static const std::string stdwell_apply_no_reorder_str;
     static const std::string ILU_decomp_str;
     static const std::string isaiL_str;
     static const std::string isaiU_str;
@@ -144,11 +138,7 @@ public:
     static void ILU_decomp(int firstRow, int lastRow, cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
                            cl::Buffer& diagIndex, cl::Buffer& invDiagVals, int Nb, unsigned int block_size);
 
-    static void apply_stdwells_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,
-                                       cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y,
-                                       cl::Buffer &d_toOrder, int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells);
-
-    static void apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,
+    static void apply_stdwells(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,
                                cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y,
                                int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells);
 
@@ -31,7 +31,6 @@
 #include <opm/simulators/linalg/bda/opencl/openclWellContributions.hpp>
 
 #include <opm/simulators/linalg/bda/BdaResult.hpp>
-#include <opm/simulators/linalg/bda/Reorder.hpp>
 
 
 // iff true, the nonzeroes of the matrix are copied row-by-row into a contiguous, pinned memory array, then a single GPU memcpy is done
@@ -308,7 +307,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
 
         // apply wellContributions
         if(wellContribs.getNumWells() > 0){
-            static_cast<WellContributionsOCL&>(wellContribs).apply(d_pw, d_v, d_toOrder);
+            static_cast<WellContributionsOCL&>(wellContribs).apply(d_pw, d_v);
         }
         if(verbosity >= 3) {
             queue->finish();
@@ -353,7 +352,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
 
         // apply wellContributions
         if(wellContribs.getNumWells() > 0){
-            static_cast<WellContributionsOCL&>(wellContribs).apply(d_s, d_t, d_toOrder);
+            static_cast<WellContributionsOCL&>(wellContribs).apply(d_s, d_t);
         }
         if (verbosity >= 3) {
             queue->finish();
@@ -435,8 +434,6 @@ void openclSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix>
 #if COPY_ROW_BY_ROW
     vals_contiguous = new double[N];
 #endif
-    // mat.reset(new BlockedMatrix(Nb, nnzb, block_size, vals, cols, rows));
-    // jacMat.reset(new BlockedMatrix(Nb, jac_nnzb, block_size, vals2, cols2, rows2));
     mat = matrix;
     jacMat = jacMatrix;
 
@@ -456,10 +453,6 @@ void openclSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix>
         d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb);
         d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
 
-        if (opencl_ilu_parallel) {
-            d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb);
-        }
-
     } catch (const cl::Error& error) {
         std::ostringstream oss;
         oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
@@ -482,23 +475,20 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
 #if COPY_ROW_BY_ROW
     int sum = 0;
     for (int i = 0; i < Nb; ++i) {
-        int size_row = rmat->rowPointers[i + 1] - rmat->rowPointers[i];
-        memcpy(vals_contiguous.data() + sum, reinterpret_cast<double*>(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size);
+        int size_row = mat->rowPointers[i + 1] - mat->rowPointers[i];
+        memcpy(vals_contiguous.data() + sum, reinterpret_cast<double*>(mat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size);
         sum += size_row * block_size * block_size;
     }
     err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous.data(), nullptr, &events[0]);
 #else
-    err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues, nullptr, &events[0]);
+    err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, mat->nnzValues, nullptr, &events[0]);
 #endif
 
-    err |= queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, rmat->colIndices, nullptr, &events[1]);
-    err |= queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), rmat->rowPointers, nullptr, &events[2]);
+    err |= queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, mat->colIndices, nullptr, &events[1]);
+    err |= queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), mat->rowPointers, nullptr, &events[2]);
     err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, h_b, nullptr, &events[3]);
     err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[4]);
-    if (opencl_ilu_parallel) {
-        events.resize(6);
-        queue->enqueueWriteBuffer(d_toOrder, CL_TRUE, 0, sizeof(int) * Nb, toOrder, nullptr, &events[5]);
-    }
+
     cl::WaitForEvents(events);
     events.clear();
     if (err != CL_SUCCESS) {
@@ -522,17 +512,18 @@ void openclSolverBackend<block_size>::update_system_on_gpu() {
 #if COPY_ROW_BY_ROW
     int sum = 0;
     for (int i = 0; i < Nb; ++i) {
-        int size_row = rmat->rowPointers[i + 1] - rmat->rowPointers[i];
-        memcpy(vals_contiguous.data() + sum, reinterpret_cast<double*>(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size);
+        int size_row = mat->rowPointers[i + 1] - mat->rowPointers[i];
+        memcpy(vals_contiguous.data() + sum, reinterpret_cast<double*>(mat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size);
         sum += size_row * block_size * block_size;
     }
     err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous.data(), nullptr, &events[0]);
 #else
-    err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues, nullptr, &events[0]);
+    err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, mat->nnzValues, nullptr, &events[0]);
 #endif
 
     err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, h_b, nullptr, &events[1]);
     err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[2]);
+
     cl::WaitForEvents(events);
     events.clear();
     if (err != CL_SUCCESS) {
@@ -558,18 +549,6 @@ bool openclSolverBackend<block_size>::analyze_matrix() {
     else
         success = prec->analyze_matrix(mat.get());
 
-    if (opencl_ilu_parallel) {
-        // toOrder = bilu0->getToOrder();
-        // fromOrder = bilu0->getFromOrder();
-        // rmat = bilu0->getRMat();
-        toOrder = prec->getToOrder();
-        fromOrder = prec->getFromOrder();
-        rmat = prec->getRMat();
-    } else {
-        rmat = mat.get();
-    }
-
-
     if (verbosity > 2) {
         std::ostringstream out;
         out << "openclSolver::analyze_matrix(): " << t.stop() << " s";
@@ -583,12 +562,11 @@ bool openclSolverBackend<block_size>::analyze_matrix() {
 
 
 template <unsigned int block_size>
-void openclSolverBackend<block_size>::update_system(double *vals, double *b, WellContributions &wellContribs) {
+void openclSolverBackend<block_size>::update_system(double *vals, double *b) {
     Timer t;
 
     mat->nnzValues = vals;
     h_b = b;
-    static_cast<WellContributionsOCL&>(wellContribs).setReordering(nullptr, false);
 
     if (verbosity > 2) {
         std::ostringstream out;
@@ -674,13 +652,13 @@ SolverStatus openclSolverBackend<block_size>::solve_system(std::shared_ptr<Block
                 return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
             }
         }
-        update_system(matrix->nnzValues, b, wellContribs);
+        update_system(matrix->nnzValues, b);
         if (!create_preconditioner()) {
             return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
         }
         copy_system_to_gpu();
     } else {
-        update_system(matrix->nnzValues, b, wellContribs);
+        update_system(matrix->nnzValues, b);
         if (!create_preconditioner()) {
             return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
         }
@@ -54,11 +54,10 @@ private:
     std::vector<double> vals_contiguous; // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp
 
     // OpenCL variables must be reusable, they are initialized in initialize()
-    cl::Buffer d_Avals, d_Acols, d_Arows; // (reordered) matrix in BSR format on GPU
+    cl::Buffer d_Avals, d_Acols, d_Arows; // matrix in BSR format on GPU
     cl::Buffer d_x, d_b, d_rb, d_r, d_rw, d_p; // vectors, used during linear solve
     cl::Buffer d_pw, d_s, d_t, d_v; // vectors, used during linear solve
     cl::Buffer d_tmp; // used as tmp GPU buffer for dot() and norm()
-    cl::Buffer d_toOrder; // only used when reordering is used
 
     std::vector<cl::Device> devices;
 
@@ -67,12 +66,10 @@ private:
     std::unique_ptr<Preconditioner<block_size> > prec;
     // can perform blocked ILU0 and AMG on pressure component
     bool is_root; // allow for nested solvers, the root solver is called by BdaBridge
-    int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings
     bool analysis_done = false;
     std::shared_ptr<BlockedMatrix> mat = nullptr; // original matrix
     std::shared_ptr<BlockedMatrix> jacMat = nullptr; // matrix for preconditioner
-    BlockedMatrix *rmat = nullptr; // reordered matrix (or original if no reordering), used for spmv
-    bool opencl_ilu_parallel; // reordering strategy
+    bool opencl_ilu_parallel; // parallelize ILU operations (with level_scheduling)
     std::vector<cl::Event> events;
     cl_int err;
 
@@ -142,11 +139,10 @@ private:
     /// Copy linear system to GPU
     void copy_system_to_gpu();
 
-    /// Reorder the linear system so it corresponds with the coloring
+    /// Reassign pointers, in case the addresses of the Dune variables have changed
     /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
     /// \param[in] b input vectors, contains N values
-    /// \param[out] wellContribs WellContributions, to set reordering
-    void update_system(double *vals, double *b, WellContributions &wellContribs);
+    void update_system(double *vals, double *b);
 
     /// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
     void update_system_on_gpu();
@@ -155,12 +151,13 @@ private:
     /// \return true iff analysis was successful
     bool analyze_matrix();
 
-    /// Perform ilu0-decomposition
+    /// Create the preconditioner, only done once per linear solve
     /// \return true iff decomposition was successful
     bool create_preconditioner();
 
     /// Solve linear system
     /// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
+    ///            could be empty
     /// \param[inout] res summary of solver result
     void solve_system(WellContributions &wellContribs, BdaResult &res);
 
@@ -174,7 +171,7 @@ public:
     /// \param[in] tolerance required relative tolerance for openclSolver
     /// \param[in] platformID the OpenCL platform to be used
     /// \param[in] deviceID the device to be used
-    /// \param[in] opencl_ilu_parallel whether to parallelize the ILU decomposition and application in OpenCL
+    /// \param[in] opencl_ilu_parallel whether to parallelize the ILU decomposition and application in OpenCL with level_scheduling
     /// \param[in] linsolver indicating the preconditioner, equal to the --linear-solver cmdline argument
     ///            only ilu0, cpr_quasiimpes and isai are supported
     openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID,
@@ -37,20 +37,10 @@ void WellContributionsOCL::setOpenCLEnv(cl::Context* context_, cl::CommandQueue*
     this->queue = queue_;
 }
 
-void WellContributionsOCL::setReordering(int* h_toOrder_, bool reorder_)
-{
-    this->h_toOrder = h_toOrder_;
-    this->reorder = reorder_;
-}
 
-void WellContributionsOCL::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
-    if (reorder) {
-        OpenclKernels::apply_stdwells_reorder(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl,
-                                              d_x, d_y, d_toOrder, dim, dim_wells, *d_val_pointers_ocl, num_std_wells);
-    } else {
-        OpenclKernels::apply_stdwells_no_reorder(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl,
-                                                 d_x, d_y, dim, dim_wells, *d_val_pointers_ocl, num_std_wells);
-    }
+void WellContributionsOCL::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y){
+    OpenclKernels::apply_stdwells(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl,
+                                  d_x, d_y, dim, dim_wells, *d_val_pointers_ocl, num_std_wells);
 }
 
 void WellContributionsOCL::apply_mswells(cl::Buffer d_x, cl::Buffer d_y){
@@ -67,7 +57,6 @@ void WellContributionsOCL::apply_mswells(cl::Buffer d_x, cl::Buffer d_y){
 
     // actually apply MultisegmentWells
     for (auto& well : multisegments) {
-        well->setReordering(h_toOrder, reorder);
         well->apply(h_x.data(), h_y.data());
     }
 
@@ -78,9 +67,9 @@ void WellContributionsOCL::apply_mswells(cl::Buffer d_x, cl::Buffer d_y){
     events.clear();
 }
 
-void WellContributionsOCL::apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
+void WellContributionsOCL::apply(cl::Buffer d_x, cl::Buffer d_y){
     if(num_std_wells > 0){
-        apply_stdwells(d_x, d_y, d_toOrder);
+        apply_stdwells(d_x, d_y);
     }
 
     if(num_ms_wells > 0){
@@ -37,14 +37,9 @@ class WellContributionsOCL : public WellContributions
 public:
     void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_);
 
-    /// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect
-    /// Those indices need to be mapped via toOrder
-    /// \param[in] toOrder array with mappings
-    /// \param[in] reorder whether reordering is actually used or not
-    void setReordering(int* toOrder, bool reorder);
-    void apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
+    void apply_stdwells(cl::Buffer d_x, cl::Buffer d_y);
     void apply_mswells(cl::Buffer d_x, cl::Buffer d_y);
-    void apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
+    void apply(cl::Buffer d_x, cl::Buffer d_y);
 
 protected:
     /// Allocate memory for the StandardWells
@@ -60,8 +55,6 @@ protected:
     std::unique_ptr<cl::Buffer> d_Ccols_ocl, d_Bcols_ocl;
     std::unique_ptr<cl::Buffer> d_val_pointers_ocl;
 
-    bool reorder = false;
-    int *h_toOrder = nullptr;
     std::vector<double> h_x;
     std::vector<double> h_y;
 };