From 4384872f8ed0aae61bfb96c1d32094b3f97681bb Mon Sep 17 00:00:00 2001 From: Tong Dong Qiu Date: Mon, 13 Jun 2022 16:12:36 +0200 Subject: [PATCH] Only use timers when printing their results, and actually wait for kernels to finish when timing GPU --- opm/simulators/linalg/bda/opencl/BILU0.cpp | 2 +- .../linalg/bda/opencl/openclSolverBackend.cpp | 67 ++++++++++++++----- 2 files changed, 50 insertions(+), 19 deletions(-) diff --git a/opm/simulators/linalg/bda/opencl/BILU0.cpp b/opm/simulators/linalg/bda/opencl/BILU0.cpp index b63f5959b..49b718da5 100644 --- a/opm/simulators/linalg/bda/opencl/BILU0.cpp +++ b/opm/simulators/linalg/bda/opencl/BILU0.cpp @@ -267,7 +267,6 @@ bool BILU0::create_preconditioner(BlockedMatrix *mat, BlockedMatrix Timer t_decomposition; std::ostringstream out; - cl::Event event; for (int color = 0; color < numColors; ++color) { const unsigned int firstRow = rowsPerColorPrefix[color]; const unsigned int lastRow = rowsPerColorPrefix[color+1]; @@ -278,6 +277,7 @@ bool BILU0::create_preconditioner(BlockedMatrix *mat, BlockedMatrix } if (verbosity >= 3) { + queue->finish(); out << "BILU0 decomposition: " << t_decomposition.stop() << " s"; OpmLog::info(out.str()); } diff --git a/opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp b/opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp index dec83ce4c..4cc541633 100644 --- a/opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp +++ b/opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp @@ -279,7 +279,9 @@ void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContr OpmLog::info(out.str()); } - t_rest.start(); + if (verbosity >= 3) { + t_rest.start(); + } for (it = 0.5; it < maxit; it += 0.5) { rhop = rho; rho = OpenclKernels::dot(d_rw, d_r, d_tmp, N); @@ -288,32 +290,47 @@ void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContr beta = (rho / rhop) * (alpha / omega); OpenclKernels::custom(d_p, d_v, d_r, omega, beta, N); } - t_rest.stop(); + if (verbosity >= 3) { + queue->finish(); + t_rest.stop(); + t_prec.start(); + } // pw = prec(p) - t_prec.start(); prec->apply(d_p, d_pw); - t_prec.stop(); + if (verbosity >= 3) { + queue->finish(); + t_prec.stop(); + t_spmv.start(); + } // v = A * pw - t_spmv.start(); OpenclKernels::spmv(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size); - t_spmv.stop(); + if (verbosity >= 3) { + queue->finish(); + t_spmv.stop(); + t_well.start(); + } // apply wellContributions - t_well.start(); if(wellContribs.getNumWells() > 0){ static_cast(wellContribs).apply(d_pw, d_v, d_toOrder); } - t_well.stop(); + if(verbosity >= 3) { + queue->finish(); + t_well.stop(); + t_rest.start(); + } - t_rest.start(); tmp1 = OpenclKernels::dot(d_rw, d_v, d_tmp, N); alpha = rho / tmp1; OpenclKernels::axpy(d_v, -alpha, d_r, N); // r = r - alpha * v OpenclKernels::axpy(d_pw, alpha, d_x, N); // x = x + alpha * pw norm = OpenclKernels::norm(d_r, d_tmp, N); - t_rest.stop(); + if (verbosity >= 3) { + queue->finish(); + t_rest.stop(); + } if (norm < tolerance * norm_0) { break; @@ -322,30 +339,44 @@ void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContr it += 0.5; // s = prec(r) - t_prec.start(); + if (verbosity >= 3) { + t_prec.start(); + } prec->apply(d_r, d_s); - t_prec.stop(); + if (verbosity >= 3) { + queue->finish(); + t_prec.stop(); + t_spmv.start(); + } // t = A * s - t_spmv.start(); OpenclKernels::spmv(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size); - t_spmv.stop(); + if(verbosity >= 3){ + queue->finish(); + t_spmv.stop(); + t_well.start(); + } // apply wellContributions - t_well.start(); if(wellContribs.getNumWells() > 0){ static_cast(wellContribs).apply(d_s, d_t, d_toOrder); } - t_well.stop(); + if (verbosity >= 3) { + queue->finish(); + t_well.stop(); + t_rest.start(); + } - t_rest.start(); tmp1 = OpenclKernels::dot(d_t, d_r, d_tmp, N); tmp2 = OpenclKernels::dot(d_t, d_t, d_tmp, N); omega = tmp1 / tmp2; OpenclKernels::axpy(d_s, omega, d_x, N); // x = x + omega * s OpenclKernels::axpy(d_t, -omega, d_r, N); // r = r - omega * t norm = OpenclKernels::norm(d_r, d_tmp, N); - t_rest.stop(); + if (verbosity >= 3) { + queue->finish(); + t_rest.stop(); + } if (norm < tolerance * norm_0) { break;