Allow verbosity of cusparseSolver to be set via command line

This commit is contained in:
T.D. (Tongdong) Qiu 2019-12-05 09:44:55 +01:00
parent 69033ca7f2
commit 5cb6ec510c
5 changed files with 66 additions and 57 deletions

View File

@ -251,11 +251,12 @@ protected:
const int maxit = EWOMS_GET_PARAM(TypeTag, int, LinearSolverMaxIter);
const double tolerance = EWOMS_GET_PARAM(TypeTag, double, LinearSolverReduction);
const bool matrix_add_well_contributions = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions);
const int linear_solver_verbosity = parameters_.linear_solver_verbosity_;
if(use_gpu && !matrix_add_well_contributions){
std::cerr << "Error cannot use GPU solver if command line parameter --matrix-add-well-contributions is false, because the GPU solver performs a standard bicgstab" << std::endl;
exit(1);
}
bdaBridge = new BdaBridge(use_gpu, maxit, tolerance);
bdaBridge = new BdaBridge(use_gpu, linear_solver_verbosity, maxit, tolerance);
#else
const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
if(use_gpu){

View File

@ -30,10 +30,10 @@ typedef Dune::InverseOperatorResult InverseOperatorResult;
namespace Opm
{
// Construct the bridge and remember whether the GPU path was requested.
// When compiled with HAVE_CUDA and use_gpu is true, a cusparseSolverBackend is
// allocated and handed the solver settings (verbosity, maxit, tolerance).
// When use_gpu is false, or HAVE_CUDA is not set, this constructor does not assign backend.
BdaBridge::BdaBridge(bool use_gpu_, int maxit, double tolerance) : use_gpu(use_gpu_){
BdaBridge::BdaBridge(bool use_gpu_, int linear_solver_verbosity, int maxit, double tolerance) : use_gpu(use_gpu_) {
#if HAVE_CUDA
if(use_gpu){
backend = new cusparseSolverBackend(maxit, tolerance);
backend = new cusparseSolverBackend(linear_solver_verbosity, maxit, tolerance);
}
#endif
}

View File

@ -45,7 +45,7 @@ private:
bool use_gpu;
public:
BdaBridge(bool use_gpu, int maxit, double tolerance);
BdaBridge(bool use_gpu, int linear_solver_verbosity, int maxit, double tolerance);
~BdaBridge();

View File

@ -35,12 +35,6 @@
#include "cusparse_v2.h"
// For more information about cusparse, check https://docs.nvidia.com/cuda/cusparse/index.html
// print initial, intermediate and final norms, and used iterations
#define VERBOSE_BACKEND 0
// print more detailed timers of various solve elements and backend functions
#define PRINT_TIMERS_BACKEND 0
namespace Opm
{
@ -54,7 +48,7 @@ namespace Opm
return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
}
// Store the solver settings; no other work is done in the constructor body.
// verbosity_ : output level checked by the member functions before printing
// maxit_     : stored in maxit
// tolerance_ : stored in tolerance
// minit is initialized to 0.
cusparseSolverBackend::cusparseSolverBackend(int maxit_, double tolerance_) : maxit(maxit_), tolerance(tolerance_), minit(0){
cusparseSolverBackend::cusparseSolverBackend(int verbosity_, int maxit_, double tolerance_) : verbosity(verbosity_), maxit(maxit_), tolerance(tolerance_), minit(0){
}
cusparseSolverBackend::~cusparseSolverBackend(){
@ -84,10 +78,9 @@ namespace Opm
cublasDcopy(cublasHandle, n, d_r, 1, d_p, 1);
cublasDnrm2(cublasHandle, n, d_r, 1, &norm_0);
#if VERBOSE_BACKEND
printf("Initial norm: %.5e\n", norm_0);
printf("Tolerance: %.0e, nnzb: %d\n", tolerance, nnzb);
#endif
if(verbosity > 1){
printf("Initial norm: %.5e\n", norm_0);
}
for(it = 0.5; it < maxit; it+=0.5){
rhop = rho;
@ -153,27 +146,23 @@ namespace Opm
if(norm < tolerance * norm_0 && it > minit){
break;
}
#if VERBOSE_BACKEND
if((int)it % 10 == 0){
if(verbosity > 1){
printf("it: %.1f, norm: %.5e\n", it, norm);
}
#endif
}
t_total2 = second();
#if PRINT_TIMERS_BACKEND
printf("Total solve time: %.6f s\n", t_total2-t_total1);
#endif
res.iterations = std::min(it, (float)maxit);
res.reduction = norm/norm_0;
res.conv_rate = static_cast<double>(pow(res.reduction,1.0/it));
res.elapsed = t_total2-t_total1;
res.elapsed = t_total2 - t_total1;
res.converged = (it != (maxit + 0.5));
#if VERBOSE_BACKEND
printf("Iterations: %.1f\n", it);
printf("Final norm: %.5e\n", norm);
printf("GPU converged: %d\n", res.converged);
#endif
if(verbosity > 0){
printf("=== converged: %d, conv_rate: %.2f, time: %f, time per iteration: %f, iterations: %.1f\n", res.converged, res.conv_rate, res.elapsed, res.elapsed/it, it);
}
return res.converged;
}
@ -265,10 +254,10 @@ namespace Opm
void cusparseSolverBackend::copy_system_to_gpu(double *vals, int *rows, int *cols, double *b){
#if PRINT_TIMERS_BACKEND
double t1, t2;
t1 = second();
#endif
if(verbosity > 2){
t1 = second();
}
// information cudaHostRegister: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1ge8d5c17670f16ac4fc8fcb4181cb490c
// possible flags for cudaHostRegister: cudaHostRegisterDefault, cudaHostRegisterPortable, cudaHostRegisterMapped, cudaHostRegisterIoMemory
@ -285,29 +274,29 @@ namespace Opm
this->cols = cols;
this->rows = rows;
#if PRINT_TIMERS_BACKEND
t2 = second();
printf("copy_system_to_gpu(): %f s\n", t2-t1);
#endif
if(verbosity > 2){
t2 = second();
printf("cusparseSolver::copy_system_to_gpu(): %f s\n", t2-t1);
}
} // end copy_system_to_gpu()
// don't copy rowpointers and colindices, they stay the same
// Refresh the linear system on the device for a new solve.
// vals : host array; nnz doubles are copied into d_bVals
// b    : host array; N doubles are copied into d_b
// The solution buffer d_x (N doubles) is reset to zero.
// All three operations are issued on `stream` through the *Async API variants.
void cusparseSolverBackend::update_system_on_gpu(double *vals, double *b){
#if PRINT_TIMERS_BACKEND
double t1, t2;
t1 = second();
#endif
// timing is only collected when verbosity is above 2
if(verbosity > 2){
t1 = second();
}
cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
#if PRINT_TIMERS_BACKEND
t2 = second();
printf("update_system_on_gpu(): %f s\n", t2-t1);
#endif
// NOTE(review): unlike create_preconditioner() and post_process(), there is no
// cudaStreamSynchronize(stream) before t2 is taken, so the printed time presumably
// covers only issuing the operations, not their completion -- confirm this is intended.
if(verbosity > 2){
t2 = second();
printf("cusparseSolver::update_system_on_gpu(): %f s\n", t2-t1);
}
} // end update_system_on_gpu()
@ -319,6 +308,11 @@ namespace Opm
void cusparseSolverBackend::analyse_matrix(){
int d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
double t1, t2;
if(verbosity > 2){
t1 = second();
}
cusparseCreateMatDescr(&descr_B);
cusparseCreateMatDescr(&descr_M);
@ -385,14 +379,20 @@ namespace Opm
BLOCK_SIZE, info_U, policy, d_buffer);
cudaCheckLastError("Could not analyse level information");
if(verbosity > 2){
t2 = second();
printf("cusparseSolver::analyse_matrix(): %f s\n", t2-t1);
}
} // end analyse_matrix()
bool cusparseSolverBackend::create_preconditioner(){
#if PRINT_TIMERS_BACKEND
double t1, t2;
t1 = second();
#endif
if(verbosity > 2){
t1 = second();
}
d_mCols = d_bCols;
d_mRows = d_bRows;
cusparseDbsrilu02(cusparseHandle, order, \
@ -407,11 +407,11 @@ namespace Opm
return false;
}
#if PRINT_TIMERS_BACKEND
cudaStreamSynchronize(stream);
t2 = second();
printf("Decomp time: %.6f s\n", t2-t1);
#endif
if(verbosity > 2){
cudaStreamSynchronize(stream);
t2 = second();
printf("cusparseSolver::create_preconditioner(): %f s\n", t2-t1);
}
return true;
} // end create_preconditioner()
@ -429,18 +429,18 @@ namespace Opm
// copy result to host memory
// Copy N doubles from the device buffer d_x into the host buffer x, wait for
// `stream` to finish, and return x.
// Returns: the member pointer x (not a fresh allocation made by this function).
double* cusparseSolverBackend::post_process(){
#if PRINT_TIMERS_BACKEND
double t1, t2;
t1 = second();
#endif
// timing is only collected when verbosity is above 2
if(verbosity > 2){
t1 = second();
}
cudaMemcpyAsync(x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost, stream);
// block until the work queued on `stream` (including the copy above) has completed,
// so x is safe to read by the caller
cudaStreamSynchronize(stream);
#if PRINT_TIMERS_BACKEND
t2 = second();
printf("Copy result back to CPU: %.6f s\n", t2-t1);
#endif
// NOTE(review): the measured interval includes the stream synchronization, so it
// presumably also contains any earlier work still pending on `stream` -- confirm.
if(verbosity > 2){
t2 = second();
printf("cusparseSolver::post_process(): %f s\n", t2-t1);
}
return x;
} // end post_process()

View File

@ -59,9 +59,17 @@ private:
bool initialized = false;
// verbosity
// 0: print nothing during solves, only when initializing
// 1: print a one-line summary after each solve (converged, convergence rate, time, iterations)
// 2: also print the initial norm and the norm of each iteration
// 3: also print timings of different backend functions
int verbosity = 0;
public:
cusparseSolverBackend(int maxit, double tolerance);
cusparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance);
~cusparseSolverBackend();