/* Copyright 2019 Equinor ASA This file is part of the Open Porous Media project (OPM). OPM is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. OPM is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OPM. If not, see . */ #ifndef __NVCC__ #error "Cannot compile for cusparse: NVIDIA compiler not found" #endif #include #include #include #include #include #include #include #include #include #include #include "cublas_v2.h" #include "cusparse_v2.h" // For more information about cusparse, check https://docs.nvidia.com/cuda/cusparse/index.html namespace Opm { const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; const cusparseOperation_t operation = CUSPARSE_OPERATION_NON_TRANSPOSE; const cusparseDirection_t order = CUSPARSE_DIRECTION_ROW; double second(void) { struct timeval tv; gettimeofday(&tv, nullptr); return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; } cusparseSolverBackend::cusparseSolverBackend(int verbosity_, int maxit_, double tolerance_) : verbosity(verbosity_), maxit(maxit_), tolerance(tolerance_), minit(0) { } cusparseSolverBackend::~cusparseSolverBackend() { finalize(); } void cusparseSolverBackend::gpu_pbicgstab(BdaResult& res) { double t_total1, t_total2; int n = N; double rho = 1.0, rhop; double alpha, nalpha, beta; double omega, nomega, tmp1, tmp2; double norm, norm_0; double zero = 0.0; double one = 1.0; double mone = -1.0; float it; t_total1 = second(); cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_x, &zero, d_r); cublasDscal(cublasHandle, n, &mone, d_r, 1); cublasDaxpy(cublasHandle, n, &one, d_b, 1, d_r, 1); cublasDcopy(cublasHandle, n, d_r, 1, d_rw, 1); cublasDcopy(cublasHandle, n, d_r, 1, d_p, 1); cublasDnrm2(cublasHandle, n, d_r, 1, &norm_0); if (verbosity > 1) { std::ostringstream out; out << std::scientific << "cusparseSolver initial norm: " << norm_0; OpmLog::info(out.str()); } for (it = 0.5; it < maxit; it+=0.5) { rhop = rho; cublasDdot(cublasHandle, n, d_rw, 1, d_r, 1, &rho); if (it > 1) { beta = (rho/rhop) * (alpha/omega); nomega = -omega; cublasDaxpy(cublasHandle, n, &nomega, d_v, 1, d_p, 1); cublasDscal(cublasHandle, n, &beta, d_p, 1); cublasDaxpy(cublasHandle, n, &one, d_r, 1, d_p, 1); } // apply ilu0 cusparseDbsrsv2_solve(cusparseHandle, order, \ operation, Nb, nnzb, &one, \ descr_L, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_L, d_p, d_t, policy, d_buffer); cusparseDbsrsv2_solve(cusparseHandle, order, \ operation, Nb, nnzb, &one, \ descr_U, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_U, d_t, d_pw, policy, d_buffer); // spmv cusparseDbsrmv(cusparseHandle, order, \ operation, Nb, Nb, nnzb, \ &one, descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_pw, &zero, d_v); cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1); alpha = rho / tmp1; nalpha = -alpha; cublasDaxpy(cublasHandle, n, &nalpha, d_v, 1, d_r, 1); cublasDaxpy(cublasHandle, n, &alpha, d_pw, 1, d_x, 1); cublasDnrm2(cublasHandle, n, d_r, 1, &norm); if (norm < tolerance * norm_0 && it > minit) { break; } it += 0.5; // apply ilu0 cusparseDbsrsv2_solve(cusparseHandle, order, \ operation, Nb, nnzb, &one, \ descr_L, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_L, d_r, d_t, policy, d_buffer); cusparseDbsrsv2_solve(cusparseHandle, order, \ operation, Nb, nnzb, &one, \ descr_U, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_U, d_t, d_s, policy, d_buffer); // spmv cusparseDbsrmv(cusparseHandle, order, \ operation, Nb, Nb, nnzb, &one, descr_M, \ d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_s, &zero, d_t); cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1); cublasDdot(cublasHandle, n, d_t, 1, d_t, 1, &tmp2); omega = tmp1 / tmp2; nomega = -omega; cublasDaxpy(cublasHandle, n, &omega, d_s, 1, d_x, 1); cublasDaxpy(cublasHandle, n, &nomega, d_t, 1, d_r, 1); cublasDnrm2(cublasHandle, n, d_r, 1, &norm); if (norm < tolerance * norm_0 && it > minit) { break; } if (verbosity > 1) { std::ostringstream out; out << "it: " << it << std::scientific << ", norm: " << norm; OpmLog::info(out.str()); } } t_total2 = second(); res.iterations = std::min(it, (float)maxit); res.reduction = norm/norm_0; res.conv_rate = static_cast(pow(res.reduction,1.0/it)); res.elapsed = t_total2 - t_total1; res.converged = (it != (maxit + 0.5)); if (verbosity > 0) { std::ostringstream out; out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \ ", time per iteration: " << res.elapsed/it << ", iterations: " << it; OpmLog::info(out.str()); } } void cusparseSolverBackend::initialize(int N, int nnz, int dim) { this->N = N; this->nnz = nnz; this->BLOCK_SIZE = dim; this->nnzb = nnz/BLOCK_SIZE/BLOCK_SIZE; Nb = (N + dim - 1) / dim; std::ostringstream out; out << "Initializing GPU, matrix size: " << N << " blocks, nnz: " << nnzb << " blocks"; OpmLog::info(out.str()); out.str(""); out.clear(); out << "Minit: " << minit << ", maxit: " << maxit << std::scientific << ", tolerance: " << tolerance; OpmLog::info(out.str()); int deviceID = 0; cudaSetDevice(deviceID); cudaCheckLastError("Could not get device"); struct cudaDeviceProp props; cudaGetDeviceProperties(&props, deviceID); cudaCheckLastError("Could not get device properties"); out.str(""); out.clear(); out << "Name GPU: " << props.name << ", Compute Capability: " << props.major << "." << props.minor; OpmLog::info(out.str()); cudaStreamCreate(&stream); cudaCheckLastError("Could not create stream"); cublasCreate(&cublasHandle); cudaCheckLastError("Could not create cublasHandle"); cusparseCreate(&cusparseHandle); cudaCheckLastError("Could not create cusparseHandle"); cudaMalloc((void**)&d_x, sizeof(double) * N); cudaMalloc((void**)&d_b, sizeof(double) * N); cudaMalloc((void**)&d_r, sizeof(double) * N); cudaMalloc((void**)&d_rw,sizeof(double) * N); cudaMalloc((void**)&d_p, sizeof(double) * N); cudaMalloc((void**)&d_pw,sizeof(double) * N); cudaMalloc((void**)&d_s, sizeof(double) * N); cudaMalloc((void**)&d_t, sizeof(double) * N); cudaMalloc((void**)&d_v, sizeof(double) * N); cudaMalloc((void**)&d_bVals, sizeof(double) * nnz); cudaMalloc((void**)&d_bCols, sizeof(double) * nnz); cudaMalloc((void**)&d_bRows, sizeof(double) * (Nb+1)); cudaMalloc((void**)&d_mVals, sizeof(double) * nnz); cudaCheckLastError("Could not allocate enough memory on GPU"); cublasSetStream(cublasHandle, stream); cudaCheckLastError("Could not set stream to cublas"); cusparseSetStream(cusparseHandle, stream); cudaCheckLastError("Could not set stream to cusparse"); cudaMallocHost((void**)&x, sizeof(double) * N); cudaCheckLastError("Could not allocate pinned host memory"); initialized = true; } // end initialize() void cusparseSolverBackend::finalize() { cudaFree(d_x); cudaFree(d_b); cudaFree(d_r); cudaFree(d_rw); cudaFree(d_p); cudaFree(d_pw); cudaFree(d_s); cudaFree(d_t); cudaFree(d_v); cudaFree(d_mVals); cudaFree(d_bVals); cudaFree(d_bCols); cudaFree(d_bRows); cudaFree(d_buffer); cusparseDestroyBsrilu02Info(info_M); cusparseDestroyBsrsv2Info(info_L); cusparseDestroyBsrsv2Info(info_U); cusparseDestroyMatDescr(descr_B); cusparseDestroyMatDescr(descr_M); cusparseDestroyMatDescr(descr_L); cusparseDestroyMatDescr(descr_U); cusparseDestroy(cusparseHandle); cublasDestroy(cublasHandle); cudaHostUnregister(vals); cudaHostUnregister(cols); cudaHostUnregister(rows); cudaStreamDestroy(stream); cudaFreeHost(x); } // end finalize() void cusparseSolverBackend::copy_system_to_gpu(double *vals, int *rows, int *cols, double *b) { double t1, t2; if (verbosity > 2) { t1 = second(); } // information cudaHostRegister: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1ge8d5c17670f16ac4fc8fcb4181cb490c // possible flags for cudaHostRegister: cudaHostRegisterDefault, cudaHostRegisterPortable, cudaHostRegisterMapped, cudaHostRegisterIoMemory cudaHostRegister(vals, nnz * sizeof(double), cudaHostRegisterDefault); cudaHostRegister(cols, nnz * sizeof(int), cudaHostRegisterDefault); cudaHostRegister(rows, (Nb+1) * sizeof(int), cudaHostRegisterDefault); cudaHostRegister(b, N * sizeof(double), cudaHostRegisterDefault); cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream); cudaMemcpyAsync(d_bCols, cols, nnz * sizeof(int), cudaMemcpyHostToDevice, stream); cudaMemcpyAsync(d_bRows, rows, (Nb+1) * sizeof(int), cudaMemcpyHostToDevice, stream); cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream); cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream); this->vals = vals; this->cols = cols; this->rows = rows; if (verbosity > 2) { cudaStreamSynchronize(stream); t2 = second(); std::ostringstream out; out << "cusparseSolver::copy_system_to_gpu(): " << t2-t1 << " s"; OpmLog::info(out.str()); } } // end copy_system_to_gpu() // don't copy rowpointers and colindices, they stay the same void cusparseSolverBackend::update_system_on_gpu(double *vals, double *b) { double t1, t2; if (verbosity > 2) { t1 = second(); } cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream); cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream); cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream); if (verbosity > 2) { cudaStreamSynchronize(stream); t2 = second(); std::ostringstream out; out << "cusparseSolver::update_system_on_gpu(): " << t2-t1 << " s"; OpmLog::info(out.str()); } } // end update_system_on_gpu() void cusparseSolverBackend::reset_prec_on_gpu() { cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream); } bool cusparseSolverBackend::analyse_matrix() { int d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize; double t1, t2; if (verbosity > 2) { t1 = second(); } cusparseCreateMatDescr(&descr_B); cusparseCreateMatDescr(&descr_M); cusparseSetMatType(descr_B, CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetMatType(descr_M, CUSPARSE_MATRIX_TYPE_GENERAL); const cusparseIndexBase_t base_type = CUSPARSE_INDEX_BASE_ZERO; // matrices from Flow are base0 cusparseSetMatIndexBase(descr_B, base_type); cusparseSetMatIndexBase(descr_M, base_type); cusparseCreateMatDescr(&descr_L); cusparseSetMatIndexBase(descr_L, base_type); cusparseSetMatType(descr_L, CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetMatFillMode(descr_L, CUSPARSE_FILL_MODE_LOWER); cusparseSetMatDiagType(descr_L, CUSPARSE_DIAG_TYPE_UNIT); cusparseCreateMatDescr(&descr_U); cusparseSetMatIndexBase(descr_U, base_type); cusparseSetMatType(descr_U, CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetMatFillMode(descr_U, CUSPARSE_FILL_MODE_UPPER); cusparseSetMatDiagType(descr_U, CUSPARSE_DIAG_TYPE_NON_UNIT); cudaCheckLastError("Could not initialize matrix descriptions"); cusparseCreateBsrilu02Info(&info_M); cusparseCreateBsrsv2Info(&info_L); cusparseCreateBsrsv2Info(&info_U); cudaCheckLastError("Could not create analysis info"); cusparseDbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzb, descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_M, &d_bufferSize_M); cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb, descr_L, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_L, &d_bufferSize_L); cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb, descr_U, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_U, &d_bufferSize_U); cudaCheckLastError(); d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U)); cudaMalloc((void**)&d_buffer, d_bufferSize); // analysis of ilu LU decomposition cusparseDbsrilu02_analysis(cusparseHandle, order, \ Nb, nnzb, descr_B, d_bVals, d_bRows, d_bCols, \ BLOCK_SIZE, info_M, policy, d_buffer); int structural_zero; cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero); if (CUSPARSE_STATUS_ZERO_PIVOT == status) { return false; } // analysis of ilu apply cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \ Nb, nnzb, descr_L, d_bVals, d_bRows, d_bCols, \ BLOCK_SIZE, info_L, policy, d_buffer); cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \ Nb, nnzb, descr_U, d_bVals, d_bRows, d_bCols, \ BLOCK_SIZE, info_U, policy, d_buffer); cudaCheckLastError("Could not analyse level information"); if (verbosity > 2) { cudaStreamSynchronize(stream); t2 = second(); std::ostringstream out; out << "cusparseSolver::analyse_matrix(): " << t2-t1 << " s"; OpmLog::info(out.str()); } return true; } // end analyse_matrix() bool cusparseSolverBackend::create_preconditioner() { double t1, t2; if (verbosity > 2) { t1 = second(); } d_mCols = d_bCols; d_mRows = d_bRows; cusparseDbsrilu02(cusparseHandle, order, \ Nb, nnzb, descr_M, d_mVals, d_mRows, d_mCols, \ BLOCK_SIZE, info_M, policy, d_buffer); int structural_zero; // cusparseXbsrilu02_zeroPivot() calls cudaDeviceSynchronize() cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero); if (CUSPARSE_STATUS_ZERO_PIVOT == status) { return false; } if (verbosity > 2) { cudaStreamSynchronize(stream); t2 = second(); std::ostringstream out; out << "cusparseSolver::create_preconditioner(): " << t2-t1 << " s"; OpmLog::info(out.str()); } return true; } // end create_preconditioner() void cusparseSolverBackend::solve_system(BdaResult &res) { // actually solve gpu_pbicgstab(res); cudaStreamSynchronize(stream); cudaCheckLastError("Something went wrong during the GPU solve"); } // end solve_system() // copy result to host memory // caller must be sure that x is a valid array void cusparseSolverBackend::post_process(double *x) { if (!initialized) { cudaHostRegister(x, N * sizeof(double), cudaHostRegisterDefault); } double t1, t2; if (verbosity > 2) { t1 = second(); } cudaMemcpyAsync(x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost, stream); cudaStreamSynchronize(stream); if (verbosity > 2) { t2 = second(); std::ostringstream out; out << "cusparseSolver::post_process(): " << t2-t1 << " s"; OpmLog::info(out.str()); } } // end post_process() typedef cusparseSolverBackend::cusparseSolverStatus cusparseSolverStatus; cusparseSolverStatus cusparseSolverBackend::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, BdaResult &res) { if (initialized == false) { initialize(N, nnz, dim); copy_system_to_gpu(vals, rows, cols, b); }else{ update_system_on_gpu(vals, b); } if (analysis_done == false) { if (!analyse_matrix()) { return cusparseSolverStatus::CUSPARSE_SOLVER_ANALYSIS_FAILED; } } reset_prec_on_gpu(); if (create_preconditioner()) { solve_system(res); }else{ return cusparseSolverStatus::CUSPARSE_SOLVER_CREATE_PRECONDITIONER_FAILED; } return cusparseSolverStatus::CUSPARSE_SOLVER_SUCCESS; } }