Replaced timing function with Dune::Timer

This commit is contained in:
T.D. (Tongdong) Qiu 2020-07-01 14:50:34 +02:00
parent 845563c37b
commit 7f8faa018b
6 changed files with 80 additions and 212 deletions

View File

@ -20,6 +20,7 @@
#include <config.h> #include <config.h>
#include <opm/common/OpmLog/OpmLog.hpp> #include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp> #include <opm/common/ErrorMacros.hpp>
#include <dune/common/timer.hh>
#include <opm/simulators/linalg/bda/BdaSolver.hpp> #include <opm/simulators/linalg/bda/BdaSolver.hpp>
#include <opm/simulators/linalg/bda/BILU0.hpp> #include <opm/simulators/linalg/bda/BILU0.hpp>
@ -30,10 +31,7 @@ namespace bda
{ {
using Opm::OpmLog; using Opm::OpmLog;
using Dune::Timer;
// define 'second' as 'BdaSolver<>::second', this allows usage of the second() function for timing
// typedefs cannot handle templates
const auto second = BdaSolver<>::second;
template <unsigned int block_size> template <unsigned int block_size>
BILU0<block_size>::BILU0(bool level_scheduling_, bool graph_coloring_, int verbosity_) : BILU0<block_size>::BILU0(bool level_scheduling_, bool graph_coloring_, int verbosity_) :
@ -62,7 +60,6 @@ namespace bda
bool BILU0<block_size>::init(BlockedMatrix *mat) bool BILU0<block_size>::init(BlockedMatrix *mat)
{ {
const unsigned int bs = block_size; const unsigned int bs = block_size;
double t1 = 0.0, t2 = 0.0;
BlockedMatrix *CSCmat = nullptr; BlockedMatrix *CSCmat = nullptr;
this->N = mat->Nb * block_size; this->N = mat->Nb * block_size;
@ -79,21 +76,16 @@ namespace bda
CSCmat->nnzValues = new double[nnzbs * bs * bs]; CSCmat->nnzValues = new double[nnzbs * bs * bs];
CSCmat->colIndices = new int[nnzbs]; CSCmat->colIndices = new int[nnzbs];
CSCmat->rowPointers = new int[Nb + 1]; CSCmat->rowPointers = new int[Nb + 1];
if(verbosity >= 3){ Timer t_convert;
t1 = second();
}
bcsr_to_bcsc<block_size>(mat->nnzValues, mat->colIndices, mat->rowPointers, CSCmat->nnzValues, CSCmat->colIndices, CSCmat->rowPointers, mat->Nb); bcsr_to_bcsc<block_size>(mat->nnzValues, mat->colIndices, mat->rowPointers, CSCmat->nnzValues, CSCmat->colIndices, CSCmat->rowPointers, mat->Nb);
if(verbosity >= 3){ if(verbosity >= 3){
t2 = second();
std::ostringstream out; std::ostringstream out;
out << "BILU0 convert CSR to CSC: " << t2 - t1 << " s"; out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s\n";
OpmLog::info(out.str()); OpmLog::info(out.str());
} }
} }
if(verbosity >= 3){ Timer t_analysis;
t1 = second();
}
rMat = allocateBlockedMatrix<block_size>(mat->Nb, mat->nnzbs); rMat = allocateBlockedMatrix<block_size>(mat->Nb, mat->nnzbs);
LUMat = soft_copyBlockedMatrix(rMat); LUMat = soft_copyBlockedMatrix(rMat);
if (level_scheduling) { if (level_scheduling) {
@ -105,9 +97,8 @@ namespace bda
return false; return false;
} }
if(verbosity >= 3){ if(verbosity >= 3){
t2 = second();
std::ostringstream out; std::ostringstream out;
out << "BILU0 analysis took: " << t2 - t1 << " s, " << numColors << " colors"; out << "BILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors";
OpmLog::info(out.str()); OpmLog::info(out.str());
} }
@ -161,27 +152,21 @@ namespace bda
bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat) bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
{ {
const unsigned int bs = block_size; const unsigned int bs = block_size;
double t1 = 0.0, t2 = 0.0;
if (verbosity >= 3){ Timer t_reorder;
t1 = second();
}
blocked_reorder_matrix_by_pattern<block_size>(mat, toOrder, fromOrder, rMat); blocked_reorder_matrix_by_pattern<block_size>(mat, toOrder, fromOrder, rMat);
if (verbosity >= 3){ if (verbosity >= 3){
t2 = second();
std::ostringstream out; std::ostringstream out;
out << "BILU0 reorder matrix: " << t2 - t1 << " s"; out << "BILU0 reorder matrix: " << t_reorder.stop() << " s";
OpmLog::info(out.str()); OpmLog::info(out.str());
} }
// TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp // TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
if (verbosity >= 3){ Timer t_copy;
t1 = second();
}
memcpy(LUMat->nnzValues, rMat->nnzValues, sizeof(double) * bs * bs * rMat->nnzbs); memcpy(LUMat->nnzValues, rMat->nnzValues, sizeof(double) * bs * bs * rMat->nnzbs);
if (verbosity >= 3){ if (verbosity >= 3){
t2 = second();
std::ostringstream out; std::ostringstream out;
out << "BILU0 memcpy: " << t2 - t1 << " s"; out << "BILU0 memcpy: " << t_copy.stop() << " s";
OpmLog::info(out.str()); OpmLog::info(out.str());
} }
@ -191,9 +176,8 @@ namespace bda
int LSize = 0; int LSize = 0;
if (verbosity >= 3){ Timer t_decomposition;
t1 = second();
}
// go through all rows // go through all rows
for (i = 0; i < LUMat->Nb; i++) { for (i = 0; i < LUMat->Nb; i++) {
iRowStart = LUMat->rowPointers[i]; iRowStart = LUMat->rowPointers[i];
@ -272,15 +256,12 @@ namespace bda
URowIndex++; URowIndex++;
} }
if (verbosity >= 3) { if (verbosity >= 3) {
t2 = second();
std::ostringstream out; std::ostringstream out;
out << "BILU0 decomposition: " << t2 - t1 << " s"; out << "BILU0 decomposition: " << t_decomposition.stop() << " s";
OpmLog::info(out.str()); OpmLog::info(out.str());
} }
if (verbosity >= 3) { Timer t_copyToGpu;
t1 = second();
}
if (pattern_uploaded == false) { if (pattern_uploaded == false) {
queue->enqueueWriteBuffer(s.Lcols, CL_TRUE, 0, LMat->nnzbs * sizeof(int), LMat->colIndices); queue->enqueueWriteBuffer(s.Lcols, CL_TRUE, 0, LMat->nnzbs * sizeof(int), LMat->colIndices);
queue->enqueueWriteBuffer(s.Ucols, CL_TRUE, 0, UMat->nnzbs * sizeof(int), UMat->colIndices); queue->enqueueWriteBuffer(s.Ucols, CL_TRUE, 0, UMat->nnzbs * sizeof(int), UMat->colIndices);
@ -292,9 +273,8 @@ namespace bda
queue->enqueueWriteBuffer(s.Uvals, CL_TRUE, 0, UMat->nnzbs * sizeof(double) * bs * bs, UMat->nnzValues); queue->enqueueWriteBuffer(s.Uvals, CL_TRUE, 0, UMat->nnzbs * sizeof(double) * bs * bs, UMat->nnzValues);
queue->enqueueWriteBuffer(s.invDiagVals, CL_TRUE, 0, Nb * sizeof(double) * bs * bs, invDiagVals); queue->enqueueWriteBuffer(s.invDiagVals, CL_TRUE, 0, Nb * sizeof(double) * bs * bs, invDiagVals);
if (verbosity >= 3) { if (verbosity >= 3) {
t2 = second();
std::ostringstream out; std::ostringstream out;
out << "BILU0 copy to GPU: " << t2 - t1 << " s"; out << "BILU0 copy to GPU: " << t_copyToGpu.stop() << " s";
OpmLog::info(out.str()); OpmLog::info(out.str());
} }
@ -307,11 +287,8 @@ namespace bda
template <unsigned int block_size> template <unsigned int block_size>
void BILU0<block_size>::apply(cl::Buffer& x, cl::Buffer& y) void BILU0<block_size>::apply(cl::Buffer& x, cl::Buffer& y)
{ {
double t1 = 0.0, t2 = 0.0;
if (verbosity >= 3) {
t1 = second();
}
cl::Event event; cl::Event event;
Timer t_apply;
for(int color = 0; color < numColors; ++color){ for(int color = 0; color < numColors; ++color){
event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Lvals, s.Lcols, s.Lrows, (unsigned int)Nb, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group)); event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Lvals, s.Lcols, s.Lrows, (unsigned int)Nb, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
@ -324,9 +301,8 @@ namespace bda
if (verbosity >= 3) { if (verbosity >= 3) {
event.wait(); event.wait();
t2 = second();
std::ostringstream out; std::ostringstream out;
out << "BILU0 apply: " << t2 - t1 << " s"; out << "BILU0 apply: " << t_apply.stop() << " s";
OpmLog::info(out.str()); OpmLog::info(out.str());
} }
} }

View File

@ -21,8 +21,6 @@
#define OPM_BDASOLVER_BACKEND_HEADER_INCLUDED #define OPM_BDASOLVER_BACKEND_HEADER_INCLUDED
#include <sys/time.h>
#include <opm/simulators/linalg/bda/BdaResult.hpp> #include <opm/simulators/linalg/bda/BdaResult.hpp>
#include <opm/simulators/linalg/bda/BdaSolverStatus.hpp> #include <opm/simulators/linalg/bda/BdaSolverStatus.hpp>
#include <opm/simulators/linalg/bda/WellContributions.hpp> #include <opm/simulators/linalg/bda/WellContributions.hpp>
@ -35,8 +33,7 @@ namespace bda
/// This class serves to simplify choosing between different backend solvers, such as cusparseSolver and openclSolver /// This class serves to simplify choosing between different backend solvers, such as cusparseSolver and openclSolver
/// This class is abstract, no instantiations of it can be made, only of its children /// This class is abstract, no instantiations of it can be made, only of its children
/// Without a default block_size value, the BILU0 class cannot use BdaSolver::second() template <unsigned int block_size>
template <unsigned int block_size = 3>
class BdaSolver class BdaSolver
{ {
@ -74,13 +71,6 @@ namespace bda
virtual void get_result(double *x) = 0; virtual void get_result(double *x) = 0;
/// Different implementations of BdaSolver can use this function for timing
static double second(void) {
struct timeval tv;
gettimeofday(&tv, nullptr);
return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
}
}; // end class BdaSolver }; // end class BdaSolver
} // end namespace bda } // end namespace bda

View File

@ -23,6 +23,7 @@
#include <sstream> #include <sstream>
#include <opm/common/OpmLog/OpmLog.hpp> #include <opm/common/OpmLog/OpmLog.hpp>
#include <dune/common/timer.hh>
#include <opm/simulators/linalg/bda/cusparseSolverBackend.hpp> #include <opm/simulators/linalg/bda/cusparseSolverBackend.hpp>
#include <opm/simulators/linalg/bda/BdaResult.hpp> #include <opm/simulators/linalg/bda/BdaResult.hpp>
@ -40,6 +41,7 @@ namespace bda
{ {
using Opm::OpmLog; using Opm::OpmLog;
using Dune::Timer;
const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
const cusparseOperation_t operation = CUSPARSE_OPERATION_NON_TRANSPOSE; const cusparseOperation_t operation = CUSPARSE_OPERATION_NON_TRANSPOSE;
@ -56,7 +58,7 @@ cusparseSolverBackend<block_size>::~cusparseSolverBackend() {
template <unsigned int block_size> template <unsigned int block_size>
void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) { void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
double t_total1, t_total2; Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
int n = N; int n = N;
double rho = 1.0, rhop; double rho = 1.0, rhop;
double alpha, nalpha, beta; double alpha, nalpha, beta;
@ -67,8 +69,6 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
double mone = -1.0; double mone = -1.0;
float it; float it;
t_total1 = second();
if (wellContribs.getNumWells() > 0) { if (wellContribs.getNumWells() > 0) {
wellContribs.setCudaStream(stream); wellContribs.setCudaStream(stream);
} }
@ -169,12 +169,10 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
} }
} }
t_total2 = second();
res.iterations = std::min(it, (float)maxit); res.iterations = std::min(it, (float)maxit);
res.reduction = norm / norm_0; res.reduction = norm / norm_0;
res.conv_rate = static_cast<double>(pow(res.reduction, 1.0 / it)); res.conv_rate = static_cast<double>(pow(res.reduction, 1.0 / it));
res.elapsed = t_total2 - t_total1; res.elapsed = t_total.stop();
res.converged = (it != (maxit + 0.5)); res.converged = (it != (maxit + 0.5));
if (verbosity > 0) { if (verbosity > 0) {
@ -284,11 +282,7 @@ void cusparseSolverBackend<block_size>::finalize() {
template <unsigned int block_size> template <unsigned int block_size>
void cusparseSolverBackend<block_size>::copy_system_to_gpu(double *vals, int *rows, int *cols, double *b) { void cusparseSolverBackend<block_size>::copy_system_to_gpu(double *vals, int *rows, int *cols, double *b) {
Timer t;
double t1, t2;
if (verbosity > 2) {
t1 = second();
}
#if COPY_ROW_BY_ROW #if COPY_ROW_BY_ROW
int sum = 0; int sum = 0;
@ -309,9 +303,8 @@ void cusparseSolverBackend<block_size>::copy_system_to_gpu(double *vals, int *ro
if (verbosity > 2) { if (verbosity > 2) {
cudaStreamSynchronize(stream); cudaStreamSynchronize(stream);
t2 = second();
std::ostringstream out; std::ostringstream out;
out << "cusparseSolver::copy_system_to_gpu(): " << t2 - t1 << " s"; out << "cusparseSolver::copy_system_to_gpu(): " << t.stop() << " s";
OpmLog::info(out.str()); OpmLog::info(out.str());
} }
} // end copy_system_to_gpu() } // end copy_system_to_gpu()
@ -320,11 +313,7 @@ void cusparseSolverBackend<block_size>::copy_system_to_gpu(double *vals, int *ro
// don't copy rowpointers and colindices, they stay the same // don't copy rowpointers and colindices, they stay the same
template <unsigned int block_size> template <unsigned int block_size>
void cusparseSolverBackend<block_size>::update_system_on_gpu(double *vals, int *rows, double *b) { void cusparseSolverBackend<block_size>::update_system_on_gpu(double *vals, int *rows, double *b) {
Timer t;
double t1, t2;
if (verbosity > 2) {
t1 = second();
}
#if COPY_ROW_BY_ROW #if COPY_ROW_BY_ROW
int sum = 0; int sum = 0;
@ -343,9 +332,8 @@ void cusparseSolverBackend<block_size>::update_system_on_gpu(double *vals, int *
if (verbosity > 2) { if (verbosity > 2) {
cudaStreamSynchronize(stream); cudaStreamSynchronize(stream);
t2 = second();
std::ostringstream out; std::ostringstream out;
out << "cusparseSolver::update_system_on_gpu(): " << t2 - t1 << " s"; out << "cusparseSolver::update_system_on_gpu(): " << t.stop() << " s";
OpmLog::info(out.str()); OpmLog::info(out.str());
} }
} // end update_system_on_gpu() } // end update_system_on_gpu()
@ -361,11 +349,7 @@ template <unsigned int block_size>
bool cusparseSolverBackend<block_size>::analyse_matrix() { bool cusparseSolverBackend<block_size>::analyse_matrix() {
int d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize; int d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
double t1, t2; Timer t;
if (verbosity > 2) {
t1 = second();
}
cusparseCreateMatDescr(&descr_B); cusparseCreateMatDescr(&descr_B);
cusparseCreateMatDescr(&descr_M); cusparseCreateMatDescr(&descr_M);
@ -428,9 +412,8 @@ bool cusparseSolverBackend<block_size>::analyse_matrix() {
if (verbosity > 2) { if (verbosity > 2) {
cudaStreamSynchronize(stream); cudaStreamSynchronize(stream);
t2 = second();
std::ostringstream out; std::ostringstream out;
out << "cusparseSolver::analyse_matrix(): " << t2 - t1 << " s"; out << "cusparseSolver::analyse_matrix(): " << t.stop() << " s";
OpmLog::info(out.str()); OpmLog::info(out.str());
} }
@ -441,11 +424,7 @@ bool cusparseSolverBackend<block_size>::analyse_matrix() {
template <unsigned int block_size> template <unsigned int block_size>
bool cusparseSolverBackend<block_size>::create_preconditioner() { bool cusparseSolverBackend<block_size>::create_preconditioner() {
Timer t;
double t1, t2;
if (verbosity > 2) {
t1 = second();
}
d_mCols = d_bCols; d_mCols = d_bCols;
d_mRows = d_bRows; d_mRows = d_bRows;
@ -463,9 +442,8 @@ bool cusparseSolverBackend<block_size>::create_preconditioner() {
if (verbosity > 2) { if (verbosity > 2) {
cudaStreamSynchronize(stream); cudaStreamSynchronize(stream);
t2 = second();
std::ostringstream out; std::ostringstream out;
out << "cusparseSolver::create_preconditioner(): " << t2 - t1 << " s"; out << "cusparseSolver::create_preconditioner(): " << t.stop() << " s";
OpmLog::info(out.str()); OpmLog::info(out.str());
} }
return true; return true;
@ -485,19 +463,14 @@ void cusparseSolverBackend<block_size>::solve_system(WellContributions& wellCont
// caller must be sure that x is a valid array // caller must be sure that x is a valid array
template <unsigned int block_size> template <unsigned int block_size>
void cusparseSolverBackend<block_size>::get_result(double *x) { void cusparseSolverBackend<block_size>::get_result(double *x) {
Timer t;
double t1, t2;
if (verbosity > 2) {
t1 = second();
}
cudaMemcpyAsync(x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost, stream); cudaMemcpyAsync(x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream); cudaStreamSynchronize(stream);
if (verbosity > 2) { if (verbosity > 2) {
t2 = second();
std::ostringstream out; std::ostringstream out;
out << "cusparseSolver::get_result(): " << t2 - t1 << " s"; out << "cusparseSolver::get_result(): " << t.stop() << " s";
OpmLog::info(out.str()); OpmLog::info(out.str());
} }
} // end get_result() } // end get_result()

View File

@ -44,7 +44,6 @@ class cusparseSolverBackend : public BdaSolver<block_size> {
using Base::verbosity; using Base::verbosity;
using Base::maxit; using Base::maxit;
using Base::tolerance; using Base::tolerance;
using Base::second;
using Base::initialized; using Base::initialized;
typedef BdaSolverStatus::Status Status; typedef BdaSolverStatus::Status Status;

View File

@ -23,6 +23,7 @@
#include <opm/common/OpmLog/OpmLog.hpp> #include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp> #include <opm/common/ErrorMacros.hpp>
#include <dune/common/timer.hh>
#define __CL_ENABLE_EXCEPTIONS #define __CL_ENABLE_EXCEPTIONS
@ -48,6 +49,7 @@ namespace bda
{ {
using Opm::OpmLog; using Opm::OpmLog;
using Dune::Timer;
template <unsigned int block_size> template <unsigned int block_size>
openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_) { openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_) {
@ -72,14 +74,11 @@ unsigned int openclSolverBackend<block_size>::ceilDivision(const unsigned int A,
template <unsigned int block_size> template <unsigned int block_size>
double openclSolverBackend<block_size>::dot_w(cl::Buffer in1, cl::Buffer in2, cl::Buffer out) double openclSolverBackend<block_size>::dot_w(cl::Buffer in1, cl::Buffer in2, cl::Buffer out)
{ {
double t1 = 0.0, t2 = 0.0;
const unsigned int work_group_size = 1024; const unsigned int work_group_size = 1024;
const unsigned int num_work_groups = ceilDivision(N, work_group_size); const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size; const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
if (verbosity >= 4) { Timer t_dot;
t1 = second();
}
cl::Event event = (*dot_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in1, in2, out, N, cl::Local(lmem_per_work_group)); cl::Event event = (*dot_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in1, in2, out, N, cl::Local(lmem_per_work_group));
@ -92,9 +91,8 @@ double openclSolverBackend<block_size>::dot_w(cl::Buffer in1, cl::Buffer in2, cl
if (verbosity >= 4) { if (verbosity >= 4) {
event.wait(); event.wait();
t2 = second();
std::ostringstream oss; std::ostringstream oss;
oss << "openclSolver dot_w time: " << t2 - t1; oss << std::scientific << "openclSolver dot_w time: " << t_dot.stop() << " s";
OpmLog::info(oss.str()); OpmLog::info(oss.str());
} }
@ -104,14 +102,11 @@ double openclSolverBackend<block_size>::dot_w(cl::Buffer in1, cl::Buffer in2, cl
template <unsigned int block_size> template <unsigned int block_size>
double openclSolverBackend<block_size>::norm_w(cl::Buffer in, cl::Buffer out) double openclSolverBackend<block_size>::norm_w(cl::Buffer in, cl::Buffer out)
{ {
double t1 = 0.0, t2 = 0.0;
const unsigned int work_group_size = 1024; const unsigned int work_group_size = 1024;
const unsigned int num_work_groups = ceilDivision(N, work_group_size); const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size; const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
if (verbosity >= 4) { Timer t_norm;
t1 = second();
}
cl::Event event = (*norm_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, out, N, cl::Local(lmem_per_work_group)); cl::Event event = (*norm_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, out, N, cl::Local(lmem_per_work_group));
@ -125,9 +120,8 @@ double openclSolverBackend<block_size>::norm_w(cl::Buffer in, cl::Buffer out)
if (verbosity >= 4) { if (verbosity >= 4) {
event.wait(); event.wait();
t2 = second();
std::ostringstream oss; std::ostringstream oss;
oss << "openclSolver norm_w time: " << t2 - t1; oss << std::scientific << "openclSolver norm_w time: " << t_norm.stop() << " s";
OpmLog::info(oss.str()); OpmLog::info(oss.str());
} }
@ -137,21 +131,17 @@ double openclSolverBackend<block_size>::norm_w(cl::Buffer in, cl::Buffer out)
template <unsigned int block_size> template <unsigned int block_size>
void openclSolverBackend<block_size>::axpy_w(cl::Buffer in, const double a, cl::Buffer out) void openclSolverBackend<block_size>::axpy_w(cl::Buffer in, const double a, cl::Buffer out)
{ {
double t1 = 0.0, t2 = 0.0;
const unsigned int work_group_size = 32; const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size); const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size; const unsigned int total_work_items = num_work_groups * work_group_size;
if (verbosity >= 4) { Timer t_axpy;
t1 = second();
}
cl::Event event = (*axpy_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, a, out, N); cl::Event event = (*axpy_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, a, out, N);
if (verbosity >= 4) { if (verbosity >= 4) {
event.wait(); event.wait();
t2 = second();
std::ostringstream oss; std::ostringstream oss;
oss << "openclSolver axpy_w time: " << t2 - t1; oss << std::scientific << "openclSolver axpy_w time: " << t_axpy.stop() << " s";
OpmLog::info(oss.str()); OpmLog::info(oss.str());
} }
} }
@ -159,21 +149,17 @@ void openclSolverBackend<block_size>::axpy_w(cl::Buffer in, const double a, cl::
template <unsigned int block_size> template <unsigned int block_size>
void openclSolverBackend<block_size>::custom_w(cl::Buffer p, cl::Buffer v, cl::Buffer r, const double omega, const double beta) void openclSolverBackend<block_size>::custom_w(cl::Buffer p, cl::Buffer v, cl::Buffer r, const double omega, const double beta)
{ {
double t1 = 0.0, t2 = 0.0;
const unsigned int work_group_size = 32; const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size); const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size; const unsigned int total_work_items = num_work_groups * work_group_size;
if (verbosity >= 4) { Timer t_custom;
t1 = second();
}
cl::Event event = (*custom_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), p, v, r, omega, beta, N); cl::Event event = (*custom_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), p, v, r, omega, beta, N);
if (verbosity >= 4) { if (verbosity >= 4) {
event.wait(); event.wait();
t2 = second();
std::ostringstream oss; std::ostringstream oss;
oss << "openclSolver custom_w time: " << t2 - t1; oss << std::scientific << "openclSolver custom_w time: " << t_custom.stop() << " s";
OpmLog::info(oss.str()); OpmLog::info(oss.str());
} }
} }
@ -181,22 +167,18 @@ void openclSolverBackend<block_size>::custom_w(cl::Buffer p, cl::Buffer v, cl::B
template <unsigned int block_size> template <unsigned int block_size>
void openclSolverBackend<block_size>::spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::Buffer rows, cl::Buffer x, cl::Buffer b) void openclSolverBackend<block_size>::spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::Buffer rows, cl::Buffer x, cl::Buffer b)
{ {
double t1 = 0.0, t2 = 0.0;
const unsigned int work_group_size = 32; const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size); const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size; const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
if (verbosity >= 4) { Timer t_spmv;
t1 = second();
}
cl::Event event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group)); cl::Event event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
if (verbosity >= 4) { if (verbosity >= 4) {
event.wait(); event.wait();
t2 = second();
std::ostringstream oss; std::ostringstream oss;
oss << "openclSolver spmv_blocked_w time: " << t2 - t1; oss << std::scientific << "openclSolver spmv_blocked_w time: " << t_spmv.stop() << " s";
OpmLog::info(oss.str()); OpmLog::info(oss.str());
} }
} }
@ -209,9 +191,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
double rho, rhop, beta, alpha, omega, tmp1, tmp2; double rho, rhop, beta, alpha, omega, tmp1, tmp2;
double norm, norm_0; double norm, norm_0;
double t_total1, t_total2, t1 = 0.0, t2 = 0.0; Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
double prec_time = 0.0, spmv_time = 0.0, well_time = 0.0, rest_time = 0.0;
t_total1 = second();
wellContribs.setOpenCLQueue(queue.get()); wellContribs.setOpenCLQueue(queue.get());
wellContribs.setReordering(toOrder, true); wellContribs.setReordering(toOrder, true);
@ -245,7 +225,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
OpmLog::info(out.str()); OpmLog::info(out.str());
} }
t1 = second(); t_rest.start();
for (it = 0.5; it < maxit; it += 0.5) { for (it = 0.5; it < maxit; it += 0.5) {
rhop = rho; rhop = rho;
rho = dot_w(d_rw, d_r, d_tmp); rho = dot_w(d_rw, d_r, d_tmp);
@ -254,37 +234,32 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
beta = (rho / rhop) * (alpha / omega); beta = (rho / rhop) * (alpha / omega);
custom_w(d_p, d_v, d_r, omega, beta); custom_w(d_p, d_v, d_r, omega, beta);
} }
t2 = second(); t_rest.stop();
rest_time += t2 - t1;
// pw = prec(p) // pw = prec(p)
t1 = second(); t_prec.start();
prec->apply(d_p, d_pw); prec->apply(d_p, d_pw);
t2 = second(); t_prec.stop();
prec_time += t2 - t1;
// v = A * pw // v = A * pw
t1 = second(); t_spmv.start();
spmv_blocked_w(d_Avals, d_Acols, d_Arows, d_pw, d_v); spmv_blocked_w(d_Avals, d_Acols, d_Arows, d_pw, d_v);
t2 = second(); t_spmv.stop();
spmv_time += t2 - t1;
// apply wellContributions // apply wellContributions
if (wellContribs.getNumWells() > 0) { if (wellContribs.getNumWells() > 0) {
t1 = second(); t_well.start();
wellContribs.apply(d_pw, d_v); wellContribs.apply(d_pw, d_v);
t2 = second(); t_well.stop();
well_time += t2 - t1;
} }
t1 = second(); t_rest.start();
tmp1 = dot_w(d_rw, d_v, d_tmp); tmp1 = dot_w(d_rw, d_v, d_tmp);
alpha = rho / tmp1; alpha = rho / tmp1;
axpy_w(d_v, -alpha, d_r); // r = r - alpha * v axpy_w(d_v, -alpha, d_r); // r = r - alpha * v
axpy_w(d_pw, alpha, d_x); // x = x + alpha * pw axpy_w(d_pw, alpha, d_x); // x = x + alpha * pw
norm = norm_w(d_r, d_tmp); norm = norm_w(d_r, d_tmp);
t2 = second(); t_rest.stop();
rest_time += t2 - t1;
if (norm < tolerance * norm_0) { if (norm < tolerance * norm_0) {
break; break;
@ -293,34 +268,30 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
it += 0.5; it += 0.5;
// s = prec(r) // s = prec(r)
t1 = second(); t_prec.start();
prec->apply(d_r, d_s); prec->apply(d_r, d_s);
t2 = second(); t_prec.stop();
prec_time += t2 - t1;
// t = A * s // t = A * s
t1 = second(); t_spmv.start();
spmv_blocked_w(d_Avals, d_Acols, d_Arows, d_s, d_t); spmv_blocked_w(d_Avals, d_Acols, d_Arows, d_s, d_t);
t2 = second(); t_spmv.stop();
spmv_time += t2 - t1;
// apply wellContributions // apply wellContributions
if (wellContribs.getNumWells() > 0) { if (wellContribs.getNumWells() > 0) {
t1 = second(); t_well.start();
wellContribs.apply(d_s, d_t); wellContribs.apply(d_s, d_t);
t2 = second(); t_well.stop();
well_time += t2 - t1;
} }
t1 = second(); t_rest.start();
tmp1 = dot_w(d_t, d_r, d_tmp); tmp1 = dot_w(d_t, d_r, d_tmp);
tmp2 = dot_w(d_t, d_t, d_tmp); tmp2 = dot_w(d_t, d_t, d_tmp);
omega = tmp1 / tmp2; omega = tmp1 / tmp2;
axpy_w(d_s, omega, d_x); // x = x + omega * s axpy_w(d_s, omega, d_x); // x = x + omega * s
axpy_w(d_t, -omega, d_r); // r = r - omega * t axpy_w(d_t, -omega, d_r); // r = r - omega * t
norm = norm_w(d_r, d_tmp); norm = norm_w(d_r, d_tmp);
t2 = second(); t_rest.stop();
rest_time += t2 - t1;
if (norm < tolerance * norm_0) { if (norm < tolerance * norm_0) {
break; break;
@ -333,14 +304,10 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
} }
} }
t2 = second();
t_total2 = second();
rest_time += t2 - t1;
res.iterations = std::min(it, (float)maxit); res.iterations = std::min(it, (float)maxit);
res.reduction = norm / norm_0; res.reduction = norm / norm_0;
res.conv_rate = static_cast<double>(pow(res.reduction, 1.0 / it)); res.conv_rate = static_cast<double>(pow(res.reduction, 1.0 / it));
res.elapsed = t_total2 - t_total1; res.elapsed = t_total.stop();
res.converged = (it != (maxit + 0.5)); res.converged = (it != (maxit + 0.5));
if (verbosity > 0) { if (verbosity > 0) {
@ -556,12 +523,7 @@ void openclSolverBackend<block_size>::finalize() {
template <unsigned int block_size> template <unsigned int block_size>
void openclSolverBackend<block_size>::copy_system_to_gpu() { void openclSolverBackend<block_size>::copy_system_to_gpu() {
Timer t;
double t1 = 0.0, t2 = 0.0;
if (verbosity > 2) {
t1 = second();
}
cl::Event event; cl::Event event;
#if COPY_ROW_BY_ROW #if COPY_ROW_BY_ROW
@ -583,9 +545,8 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
event.wait(); event.wait();
if (verbosity > 2) { if (verbosity > 2) {
t2 = second();
std::ostringstream out; std::ostringstream out;
out << "openclSolver::copy_system_to_gpu(): " << t2 - t1 << " s"; out << "openclSolver::copy_system_to_gpu(): " << t.stop() << " s";
OpmLog::info(out.str()); OpmLog::info(out.str());
} }
} // end copy_system_to_gpu() } // end copy_system_to_gpu()
@ -594,12 +555,7 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
// don't copy rowpointers and colindices, they stay the same // don't copy rowpointers and colindices, they stay the same
template <unsigned int block_size> template <unsigned int block_size>
void openclSolverBackend<block_size>::update_system_on_gpu() { void openclSolverBackend<block_size>::update_system_on_gpu() {
Timer t;
double t1 = 0.0, t2 = 0.0;
if (verbosity > 2) {
t1 = second();
}
cl::Event event; cl::Event event;
#if COPY_ROW_BY_ROW #if COPY_ROW_BY_ROW
@ -619,9 +575,8 @@ void openclSolverBackend<block_size>::update_system_on_gpu() {
event.wait(); event.wait();
if (verbosity > 2) { if (verbosity > 2) {
t2 = second();
std::ostringstream out; std::ostringstream out;
out << "openclSolver::update_system_on_gpu(): " << t2 - t1 << " s"; out << "openclSolver::update_system_on_gpu(): " << t.stop() << " s";
OpmLog::info(out.str()); OpmLog::info(out.str());
} }
} // end update_system_on_gpu() } // end update_system_on_gpu()
@ -629,12 +584,7 @@ void openclSolverBackend<block_size>::update_system_on_gpu() {
template <unsigned int block_size> template <unsigned int block_size>
bool openclSolverBackend<block_size>::analyse_matrix() { bool openclSolverBackend<block_size>::analyse_matrix() {
Timer t;
double t1 = 0.0, t2 = 0.0;
if (verbosity > 2) {
t1 = second();
}
bool success = prec->init(mat); bool success = prec->init(mat);
int work_group_size = 32; int work_group_size = 32;
@ -648,9 +598,8 @@ bool openclSolverBackend<block_size>::analyse_matrix() {
rmat = prec->getRMat(); rmat = prec->getRMat();
if (verbosity > 2) { if (verbosity > 2) {
t2 = second();
std::ostringstream out; std::ostringstream out;
out << "openclSolver::analyse_matrix(): " << t2 - t1 << " s"; out << "openclSolver::analyse_matrix(): " << t.stop() << " s";
OpmLog::info(out.str()); OpmLog::info(out.str());
} }
@ -662,18 +611,14 @@ bool openclSolverBackend<block_size>::analyse_matrix() {
template <unsigned int block_size> template <unsigned int block_size>
void openclSolverBackend<block_size>::update_system(double *vals, double *b) { void openclSolverBackend<block_size>::update_system(double *vals, double *b) {
double t1 = 0.0, t2 = 0.0; Timer t;
if (verbosity > 2) {
t1 = second();
}
mat->nnzValues = vals; mat->nnzValues = vals;
blocked_reorder_vector_by_pattern<block_size>(mat->Nb, b, fromOrder, rb); blocked_reorder_vector_by_pattern<block_size>(mat->Nb, b, fromOrder, rb);
if (verbosity > 2) { if (verbosity > 2) {
t2 = second();
std::ostringstream out; std::ostringstream out;
out << "openclSolver::update_system(): " << t2 - t1 << " s"; out << "openclSolver::update_system(): " << t.stop() << " s";
OpmLog::info(out.str()); OpmLog::info(out.str());
} }
} // end update_system() } // end update_system()
@ -681,18 +626,13 @@ void openclSolverBackend<block_size>::update_system(double *vals, double *b) {
template <unsigned int block_size> template <unsigned int block_size>
bool openclSolverBackend<block_size>::create_preconditioner() { bool openclSolverBackend<block_size>::create_preconditioner() {
Timer t;
double t1 = 0.0, t2 = 0.0;
if (verbosity > 2) {
t1 = second();
}
bool result = prec->create_preconditioner(mat); bool result = prec->create_preconditioner(mat);
if (verbosity > 2) { if (verbosity > 2) {
t2 = second();
std::ostringstream out; std::ostringstream out;
out << "openclSolver::create_preconditioner(): " << t2 - t1 << " s"; out << "openclSolver::create_preconditioner(): " << t.stop() << " s";
OpmLog::info(out.str()); OpmLog::info(out.str());
} }
return result; return result;
@ -701,18 +641,14 @@ bool openclSolverBackend<block_size>::create_preconditioner() {
template <unsigned int block_size> template <unsigned int block_size>
void openclSolverBackend<block_size>::solve_system(WellContributions& wellContribs, BdaResult &res) { void openclSolverBackend<block_size>::solve_system(WellContributions& wellContribs, BdaResult &res) {
// actually solve Timer t;
double t1 = 0.0, t2 = 0.0;
if (verbosity > 2) {
t1 = second();
}
// actually solve
gpu_pbicgstab(wellContribs, res); gpu_pbicgstab(wellContribs, res);
if (verbosity > 2) { if (verbosity > 2) {
t2 = second();
std::ostringstream out; std::ostringstream out;
out << "openclSolver::solve_system(): " << t2 - t1 << " s"; out << "openclSolver::solve_system(): " << t.stop() << " s";
OpmLog::info(out.str()); OpmLog::info(out.str());
} }
@ -723,19 +659,14 @@ void openclSolverBackend<block_size>::solve_system(WellContributions& wellContri
// caller must be sure that x is a valid array // caller must be sure that x is a valid array
template <unsigned int block_size> template <unsigned int block_size>
void openclSolverBackend<block_size>::get_result(double *x) { void openclSolverBackend<block_size>::get_result(double *x) {
Timer t;
double t1 = 0.0, t2 = 0.0;
if (verbosity > 2) {
t1 = second();
}
queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, rb); queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, rb);
blocked_reorder_vector_by_pattern<block_size>(mat->Nb, rb, toOrder, x); blocked_reorder_vector_by_pattern<block_size>(mat->Nb, rb, toOrder, x);
if (verbosity > 2) { if (verbosity > 2) {
t2 = second();
std::ostringstream out; std::ostringstream out;
out << "openclSolver::get_result(): " << t2 - t1 << " s"; out << "openclSolver::get_result(): " << t.stop() << " s";
OpmLog::info(out.str()); OpmLog::info(out.str());
} }
} // end get_result() } // end get_result()

View File

@ -50,7 +50,6 @@ class openclSolverBackend : public BdaSolver<block_size>
using Base::verbosity; using Base::verbosity;
using Base::maxit; using Base::maxit;
using Base::tolerance; using Base::tolerance;
using Base::second;
using Base::initialized; using Base::initialized;
typedef BdaSolverStatus::Status Status; typedef BdaSolverStatus::Status Status;