/* Copyright 2020 Equinor ASA This file is part of the Open Porous Media project (OPM). OPM is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. OPM is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OPM. If not, see . */ #include #include #include #include #include #include #include #include #include // iff true, the nonzeroes of the matrix are copied row-by-row into a contiguous, pinned memory array, then a single GPU memcpy is done // otherwise, the nonzeroes of the matrix are assumed to be in a contiguous array, and a single GPU memcpy is enough #define COPY_ROW_BY_ROW 0 namespace bda { using Opm::OpmLog; using Dune::Timer; template openclSolverBackend::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, ILUReorder opencl_ilu_reorder_) : BdaSolver(verbosity_, maxit_, tolerance_, platformID_, deviceID_), opencl_ilu_reorder(opencl_ilu_reorder_) { prec = new Preconditioner(opencl_ilu_reorder, verbosity_); std::ostringstream out; try { std::vector platforms; cl::Platform::get(&platforms); if (platforms.size() == 0) { OPM_THROW(std::logic_error, "Error openclSolver is selected but no OpenCL platforms are found"); } out << "Found " << platforms.size() << " OpenCL platforms" << "\n"; if (verbosity >= 1) { std::string platform_info; for (unsigned int i = 0; i < platforms.size(); ++i) { platforms[i].getInfo(CL_PLATFORM_NAME, &platform_info); out << "Platform name : " << platform_info << "\n"; platforms[i].getInfo(CL_PLATFORM_VENDOR, &platform_info); out << "Platform vendor : " << platform_info << "\n"; platforms[i].getInfo(CL_PLATFORM_VERSION, &platform_info); out << "Platform version : " << platform_info << "\n"; platforms[i].getInfo(CL_PLATFORM_PROFILE, &platform_info); out << "Platform profile : " << platform_info << "\n"; platforms[i].getInfo(CL_PLATFORM_EXTENSIONS, &platform_info); out << "Platform extensions: " << platform_info << "\n\n"; } } OpmLog::info(out.str()); out.str(""); out.clear(); if (platforms.size() <= platformID) { OPM_THROW(std::logic_error, "Error chosen too high OpenCL platform ID"); } else { std::string platform_info; out << "Chosen:\n"; platforms[platformID].getInfo(CL_PLATFORM_NAME, &platform_info); out << "Platform name : " << platform_info << "\n"; platforms[platformID].getInfo(CL_PLATFORM_VERSION, &platform_info); out << "Platform version : " << platform_info << "\n"; OpmLog::info(out.str()); out.str(""); out.clear(); } platforms[platformID].getDevices(CL_DEVICE_TYPE_ALL, &devices); if (devices.size() == 0){ OPM_THROW(std::logic_error, "Error openclSolver is selected but no OpenCL devices are found"); } out << "Found " << devices.size() << " OpenCL devices" << "\n"; if (verbosity >= 1) { for (unsigned int i = 0; i < devices.size(); ++i) { std::string device_info; std::vector work_sizes; std::vector partitions; devices[i].getInfo(CL_DEVICE_NAME, &device_info); out << "CL_DEVICE_NAME : " << device_info << "\n"; devices[i].getInfo(CL_DEVICE_VENDOR, &device_info); out << "CL_DEVICE_VENDOR : " << device_info << "\n"; devices[i].getInfo(CL_DRIVER_VERSION, &device_info); out << "CL_DRIVER_VERSION : " << device_info << "\n"; devices[i].getInfo(CL_DEVICE_BUILT_IN_KERNELS, &device_info); out << "CL_DEVICE_BUILT_IN_KERNELS: " << device_info << "\n"; devices[i].getInfo(CL_DEVICE_PROFILE, &device_info); out << "CL_DEVICE_PROFILE : " << device_info << "\n"; devices[i].getInfo(CL_DEVICE_OPENCL_C_VERSION, &device_info); out << "CL_DEVICE_OPENCL_C_VERSION: " << device_info << "\n"; devices[i].getInfo(CL_DEVICE_EXTENSIONS, &device_info); out << "CL_DEVICE_EXTENSIONS : " << device_info << "\n"; devices[i].getInfo(CL_DEVICE_MAX_WORK_ITEM_SIZES, &work_sizes); for (unsigned int j = 0; j < work_sizes.size(); ++j) { out << "CL_DEVICE_MAX_WORK_ITEM_SIZES[" << j << "]: " << work_sizes[j] << "\n"; } devices[i].getInfo(CL_DEVICE_PARTITION_PROPERTIES, &partitions); for (unsigned int j = 0; j < partitions.size(); ++j) { out << "CL_DEVICE_PARTITION_PROPERTIES[" << j << "]: " << partitions[j] << "\n"; } partitions.clear(); devices[i].getInfo(CL_DEVICE_PARTITION_TYPE, &partitions); for (unsigned int j = 0; j < partitions.size(); ++j) { out << "CL_DEVICE_PARTITION_PROPERTIES[" << j << "]: " << partitions[j] << "\n"; } // C-style properties cl_device_id tmp_id = devices[i](); cl_ulong size; clGetDeviceInfo(tmp_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &size, 0); out << "CL_DEVICE_LOCAL_MEM_SIZE : " << size / 1024 << " KB\n"; clGetDeviceInfo(tmp_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &size, 0); out << "CL_DEVICE_GLOBAL_MEM_SIZE : " << size / 1024 / 1024 / 1024 << " GB\n"; clGetDeviceInfo(tmp_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_ulong), &size, 0); out << "CL_DEVICE_MAX_COMPUTE_UNITS : " << size << "\n"; clGetDeviceInfo(tmp_id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &size, 0); out << "CL_DEVICE_MAX_MEM_ALLOC_SIZE : " << size / 1024 / 1024 << " MB\n"; clGetDeviceInfo(tmp_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(cl_ulong), &size, 0); out << "CL_DEVICE_MAX_WORK_GROUP_SIZE : " << size << "\n"; clGetDeviceInfo(tmp_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &size, 0); out << "CL_DEVICE_GLOBAL_MEM_SIZE : " << size / 1024 / 1024 / 1024 << " GB\n\n"; } } OpmLog::info(out.str()); out.str(""); out.clear(); if (devices.size() <= deviceID){ OPM_THROW(std::logic_error, "Error chosen too high OpenCL device ID"); } else { std::string device_info; out << "Chosen:\n"; devices[deviceID].getInfo(CL_DEVICE_NAME, &device_info); out << "CL_DEVICE_NAME : " << device_info << "\n"; devices[deviceID].getInfo(CL_DEVICE_VERSION, &device_info); out << "CL_DEVICE_VERSION : " << device_info << "\n"; OpmLog::info(out.str()); out.str(""); out.clear(); } // removed all unused devices if (deviceID != 0) { devices[0] = devices[deviceID]; } devices.resize(1); context = std::make_shared(devices[0]); queue.reset(new cl::CommandQueue(*context, devices[0], 0, &err)); } catch (const cl::Error& error) { std::ostringstream oss; oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n"; oss << getErrorString(error.err()); // rethrow exception OPM_THROW(std::logic_error, oss.str()); } catch (const std::logic_error& error) { // rethrow exception by OPM_THROW in the try{}, without this, a segfault occurs throw error; } } template openclSolverBackend::~openclSolverBackend() { finalize(); } // divide A by B, and round up: return (int)ceil(A/B) template unsigned int openclSolverBackend::ceilDivision(const unsigned int A, const unsigned int B) { return A / B + (A % B > 0); } template double openclSolverBackend::dot_w(cl::Buffer in1, cl::Buffer in2, cl::Buffer out) { const unsigned int work_group_size = 256; const unsigned int num_work_groups = ceilDivision(N, work_group_size); const unsigned int total_work_items = num_work_groups * work_group_size; const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; Timer t_dot; cl::Event event = (*dot_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in1, in2, out, N, cl::Local(lmem_per_work_group)); queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp); double gpu_sum = 0.0; for (unsigned int i = 0; i < num_work_groups; ++i) { gpu_sum += tmp[i]; } if (verbosity >= 4) { event.wait(); std::ostringstream oss; oss << std::scientific << "openclSolver dot_w time: " << t_dot.stop() << " s"; OpmLog::info(oss.str()); } return gpu_sum; } template double openclSolverBackend::norm_w(cl::Buffer in, cl::Buffer out) { const unsigned int work_group_size = 256; const unsigned int num_work_groups = ceilDivision(N, work_group_size); const unsigned int total_work_items = num_work_groups * work_group_size; const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; Timer t_norm; cl::Event event = (*norm_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, out, N, cl::Local(lmem_per_work_group)); queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp); double gpu_norm = 0.0; for (unsigned int i = 0; i < num_work_groups; ++i) { gpu_norm += tmp[i]; } gpu_norm = sqrt(gpu_norm); if (verbosity >= 4) { event.wait(); std::ostringstream oss; oss << std::scientific << "openclSolver norm_w time: " << t_norm.stop() << " s"; OpmLog::info(oss.str()); } return gpu_norm; } template void openclSolverBackend::axpy_w(cl::Buffer in, const double a, cl::Buffer out) { const unsigned int work_group_size = 32; const unsigned int num_work_groups = ceilDivision(N, work_group_size); const unsigned int total_work_items = num_work_groups * work_group_size; Timer t_axpy; cl::Event event = (*axpy_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, a, out, N); if (verbosity >= 4) { event.wait(); std::ostringstream oss; oss << std::scientific << "openclSolver axpy_w time: " << t_axpy.stop() << " s"; OpmLog::info(oss.str()); } } template void openclSolverBackend::custom_w(cl::Buffer p, cl::Buffer v, cl::Buffer r, const double omega, const double beta) { const unsigned int work_group_size = 32; const unsigned int num_work_groups = ceilDivision(N, work_group_size); const unsigned int total_work_items = num_work_groups * work_group_size; Timer t_custom; cl::Event event = (*custom_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), p, v, r, omega, beta, N); if (verbosity >= 4) { event.wait(); std::ostringstream oss; oss << std::scientific << "openclSolver custom_w time: " << t_custom.stop() << " s"; OpmLog::info(oss.str()); } } template void openclSolverBackend::spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::Buffer rows, cl::Buffer x, cl::Buffer b) { const unsigned int work_group_size = 32; const unsigned int num_work_groups = ceilDivision(N, work_group_size); const unsigned int total_work_items = num_work_groups * work_group_size; const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; Timer t_spmv; cl::Event event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group)); if (verbosity >= 4) { event.wait(); std::ostringstream oss; oss << std::scientific << "openclSolver spmv_blocked_w time: " << t_spmv.stop() << " s"; OpmLog::info(oss.str()); } } template void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) { float it; double rho, rhop, beta, alpha, omega, tmp1, tmp2; double norm, norm_0; if(wellContribs.getNumWells() > 0){ wellContribs.setKernel(stdwell_apply_k.get(), stdwell_apply_no_reorder_k.get()); } Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false); // set r to the initial residual // if initial x guess is not 0, must call applyblockedscaleadd(), not implemented //applyblockedscaleadd(-1.0, mat, x, r); // set initial values events.resize(5); queue->enqueueFillBuffer(d_p, 0, 0, sizeof(double) * N, nullptr, &events[0]); queue->enqueueFillBuffer(d_v, 0, 0, sizeof(double) * N, nullptr, &events[1]); rho = 1.0; alpha = 1.0; omega = 1.0; queue->enqueueCopyBuffer(d_b, d_r, 0, 0, sizeof(double) * N, nullptr, &events[2]); queue->enqueueCopyBuffer(d_r, d_rw, 0, 0, sizeof(double) * N, nullptr, &events[3]); queue->enqueueCopyBuffer(d_r, d_p, 0, 0, sizeof(double) * N, nullptr, &events[4]); cl::WaitForEvents(events); events.clear(); if (err != CL_SUCCESS) { // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueue[Fill|Copy]Buffer error"); } norm = norm_w(d_r, d_tmp); norm_0 = norm; if (verbosity > 1) { std::ostringstream out; out << std::scientific << "openclSolver initial norm: " << norm_0; OpmLog::info(out.str()); } t_rest.start(); for (it = 0.5; it < maxit; it += 0.5) { rhop = rho; rho = dot_w(d_rw, d_r, d_tmp); if (it > 1) { beta = (rho / rhop) * (alpha / omega); custom_w(d_p, d_v, d_r, omega, beta); } t_rest.stop(); // pw = prec(p) t_prec.start(); prec->apply(d_p, d_pw); t_prec.stop(); // v = A * pw t_spmv.start(); spmv_blocked_w(d_Avals, d_Acols, d_Arows, d_pw, d_v); t_spmv.stop(); // apply wellContributions t_well.start(); if(wellContribs.getNumWells() > 0){ wellContribs.apply(d_pw, d_v, d_toOrder); } t_well.stop(); t_rest.start(); tmp1 = dot_w(d_rw, d_v, d_tmp); alpha = rho / tmp1; axpy_w(d_v, -alpha, d_r); // r = r - alpha * v axpy_w(d_pw, alpha, d_x); // x = x + alpha * pw norm = norm_w(d_r, d_tmp); t_rest.stop(); if (norm < tolerance * norm_0) { break; } it += 0.5; // s = prec(r) t_prec.start(); prec->apply(d_r, d_s); t_prec.stop(); // t = A * s t_spmv.start(); spmv_blocked_w(d_Avals, d_Acols, d_Arows, d_s, d_t); t_spmv.stop(); // apply wellContributions t_well.start(); if(wellContribs.getNumWells() > 0){ wellContribs.apply(d_s, d_t, d_toOrder); } t_well.stop(); t_rest.start(); tmp1 = dot_w(d_t, d_r, d_tmp); tmp2 = dot_w(d_t, d_t, d_tmp); omega = tmp1 / tmp2; axpy_w(d_s, omega, d_x); // x = x + omega * s axpy_w(d_t, -omega, d_r); // r = r - omega * t norm = norm_w(d_r, d_tmp); t_rest.stop(); if (norm < tolerance * norm_0) { break; } if (verbosity > 1) { std::ostringstream out; out << "it: " << it << std::scientific << ", norm: " << norm; OpmLog::info(out.str()); } } res.iterations = std::min(it, (float)maxit); res.reduction = norm / norm_0; res.conv_rate = static_cast(pow(res.reduction, 1.0 / it)); res.elapsed = t_total.stop(); res.converged = (it != (maxit + 0.5)); if (verbosity > 0) { std::ostringstream out; out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \ ", time per iteration: " << res.elapsed / it << ", iterations: " << it; OpmLog::info(out.str()); } if (verbosity >= 4) { std::ostringstream out; out << "openclSolver::ilu_apply: " << t_prec.elapsed() << " s\n"; out << "wellContributions::apply: " << t_well.elapsed() << " s\n"; out << "openclSolver::spmv: " << t_spmv.elapsed() << " s\n"; out << "openclSolver::rest: " << t_rest.elapsed() << " s\n"; out << "openclSolver::total_solve: " << res.elapsed << " s\n"; OpmLog::info(out.str()); } } template void openclSolverBackend::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols) { this->N = N_; this->nnz = nnz_; this->nnzb = nnz_ / block_size / block_size; Nb = (N + dim - 1) / dim; std::ostringstream out; out << "Initializing GPU, matrix size: " << N << " blocks, nnzb: " << nnzb << "\n"; out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n"; out << "PlatformID: " << platformID << ", deviceID: " << deviceID << "\n"; OpmLog::info(out.str()); out.str(""); out.clear(); try { prec->setOpenCLContext(context.get()); prec->setOpenCLQueue(queue.get()); tmp = new double[N]; #if COPY_ROW_BY_ROW vals_contiguous = new double[N]; #endif mat.reset(new BlockedMatrix(Nb, nnzb, vals, cols, rows)); d_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N); d_b = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N); d_rb = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N); d_r = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N); d_rw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N); d_p = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N); d_pw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N); d_s = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N); d_t = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N); d_v = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N); d_tmp = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N); d_Avals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * nnz); d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb); d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1)); bool reorder = (opencl_ilu_reorder != ILUReorder::NONE); if (reorder) { rb = new double[N]; d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb); } get_opencl_kernels(); prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get(), ilu_decomp_k.get()); } catch (const cl::Error& error) { std::ostringstream oss; oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n"; oss << getErrorString(error.err()); // rethrow exception OPM_THROW(std::logic_error, oss.str()); } catch (const std::logic_error& error) { // rethrow exception by OPM_THROW in the try{}, without this, a segfault occurs throw error; } initialized = true; } // end initialize() void add_kernel_string(cl::Program::Sources &sources, std::string &source) { sources.emplace_back(std::make_pair(source.c_str(), source.size())); } template void openclSolverBackend::get_opencl_kernels() { cl::Program::Sources sources; std::string axpy_s = get_axpy_string(); add_kernel_string(sources, axpy_s); std::string dot_1_s = get_dot_1_string(); add_kernel_string(sources, dot_1_s); std::string norm_s = get_norm_string(); add_kernel_string(sources, norm_s); std::string custom_s = get_custom_string(); add_kernel_string(sources, custom_s); std::string spmv_blocked_s = get_spmv_blocked_string(); add_kernel_string(sources, spmv_blocked_s); #if CHOW_PATEL bool ilu_operate_on_full_matrix = false; #else bool ilu_operate_on_full_matrix = true; #endif std::string ILU_apply1_s = get_ILU_apply1_string(ilu_operate_on_full_matrix); add_kernel_string(sources, ILU_apply1_s); std::string ILU_apply2_s = get_ILU_apply2_string(ilu_operate_on_full_matrix); add_kernel_string(sources, ILU_apply2_s); std::string stdwell_apply_s = get_stdwell_apply_string(true); add_kernel_string(sources, stdwell_apply_s); std::string stdwell_apply_no_reorder_s = get_stdwell_apply_string(false); add_kernel_string(sources, stdwell_apply_no_reorder_s); std::string ilu_decomp_s = get_ilu_decomp_string(); add_kernel_string(sources, ilu_decomp_s); cl::Program program = cl::Program(*context, sources); program.build(devices); // queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA // cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking // actually creating the kernels dot_k.reset(new cl::make_kernel(cl::Kernel(program, "dot_1"))); norm_k.reset(new cl::make_kernel(cl::Kernel(program, "norm"))); axpy_k.reset(new cl::make_kernel(cl::Kernel(program, "axpy"))); custom_k.reset(new cl::make_kernel(cl::Kernel(program, "custom"))); spmv_blocked_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv_blocked"))); ILU_apply1_k.reset(new ilu_apply1_kernel_type(cl::Kernel(program, "ILU_apply1"))); ILU_apply2_k.reset(new ilu_apply2_kernel_type(cl::Kernel(program, "ILU_apply2"))); stdwell_apply_k.reset(new stdwell_apply_kernel_type(cl::Kernel(program, "stdwell_apply"))); stdwell_apply_no_reorder_k.reset(new stdwell_apply_no_reorder_kernel_type(cl::Kernel(program, "stdwell_apply_no_reorder"))); ilu_decomp_k.reset(new ilu_decomp_kernel_type(cl::Kernel(program, "ilu_decomp"))); } // end get_opencl_kernels() template void openclSolverBackend::finalize() { if (opencl_ilu_reorder != ILUReorder::NONE) { delete[] rb; } delete[] tmp; #if COPY_ROW_BY_ROW delete[] vals_contiguous; #endif delete prec; } // end finalize() template void openclSolverBackend::copy_system_to_gpu() { Timer t; events.resize(5); #if COPY_ROW_BY_ROW int sum = 0; for (int i = 0; i < Nb; ++i) { int size_row = rmat->rowPointers[i + 1] - rmat->rowPointers[i]; memcpy(vals_contiguous + sum, reinterpret_cast(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size); sum += size_row * block_size * block_size; } err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous, nullptr, &events[0]); #else err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues, nullptr, &events[0]); #endif err |= queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, rmat->colIndices, nullptr, &events[1]); err |= queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), rmat->rowPointers, nullptr, &events[2]); err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb, nullptr, &events[3]); err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[4]); if (opencl_ilu_reorder != ILUReorder::NONE) { events.resize(6); queue->enqueueWriteBuffer(d_toOrder, CL_TRUE, 0, sizeof(int) * Nb, toOrder, nullptr, &events[5]); } cl::WaitForEvents(events); events.clear(); if (err != CL_SUCCESS) { // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueueWriteBuffer error"); } if (verbosity > 2) { std::ostringstream out; out << "openclSolver::copy_system_to_gpu(): " << t.stop() << " s"; OpmLog::info(out.str()); } } // end copy_system_to_gpu() // don't copy rowpointers and colindices, they stay the same template void openclSolverBackend::update_system_on_gpu() { Timer t; events.resize(3); #if COPY_ROW_BY_ROW int sum = 0; for (int i = 0; i < Nb; ++i) { int size_row = rmat->rowPointers[i + 1] - rmat->rowPointers[i]; memcpy(vals_contiguous + sum, reinterpret_cast(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size); sum += size_row * block_size * block_size; } err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous, nullptr, &events[0]); #else err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues, nullptr, &events[0]); #endif err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb, nullptr, &events[1]); err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[2]); cl::WaitForEvents(events); events.clear(); if (err != CL_SUCCESS) { // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueueWriteBuffer error"); } if (verbosity > 2) { std::ostringstream out; out << "openclSolver::update_system_on_gpu(): " << t.stop() << " s"; OpmLog::info(out.str()); } } // end update_system_on_gpu() template bool openclSolverBackend::analyse_matrix() { Timer t; bool success = prec->init(mat.get()); int work_group_size = 32; int num_work_groups = ceilDivision(N, work_group_size); int total_work_items = num_work_groups * work_group_size; int lmem_per_work_group = work_group_size * sizeof(double); prec->setKernelParameters(work_group_size, total_work_items, lmem_per_work_group); if (opencl_ilu_reorder == ILUReorder::NONE) { rmat = mat.get(); } else { toOrder = prec->getToOrder(); fromOrder = prec->getFromOrder(); rmat = prec->getRMat(); } if (verbosity > 2) { std::ostringstream out; out << "openclSolver::analyse_matrix(): " << t.stop() << " s"; OpmLog::info(out.str()); } analysis_done = true; return success; } // end analyse_matrix() template void openclSolverBackend::update_system(double *vals, double *b, WellContributions &wellContribs) { Timer t; mat->nnzValues = vals; if (opencl_ilu_reorder != ILUReorder::NONE) { reorderBlockedVectorByPattern(mat->Nb, b, fromOrder, rb); wellContribs.setReordering(toOrder, true); } else { rb = b; wellContribs.setReordering(nullptr, false); } if (verbosity > 2) { std::ostringstream out; out << "openclSolver::update_system(): " << t.stop() << " s"; OpmLog::info(out.str()); } } // end update_system() template bool openclSolverBackend::create_preconditioner() { Timer t; bool result = prec->create_preconditioner(mat.get()); if (verbosity > 2) { std::ostringstream out; out << "openclSolver::create_preconditioner(): " << t.stop() << " s"; OpmLog::info(out.str()); } return result; } // end create_preconditioner() template void openclSolverBackend::solve_system(WellContributions &wellContribs, BdaResult &res) { Timer t; // actually solve try { gpu_pbicgstab(wellContribs, res); } catch (const cl::Error& error) { std::ostringstream oss; oss << "openclSolverBackend::solve_system error: " << error.what() << "(" << error.err() << ")\n"; oss << getErrorString(error.err()); // rethrow exception OPM_THROW(std::logic_error, oss.str()); } catch (const std::logic_error& error) { // rethrow exception by OPM_THROW in the try{}, without this, a segfault occurs throw error; } if (verbosity > 2) { std::ostringstream out; out << "openclSolver::solve_system(): " << t.stop() << " s"; OpmLog::info(out.str()); } } // end solve_system() // copy result to host memory // caller must be sure that x is a valid array template void openclSolverBackend::get_result(double *x) { Timer t; if (opencl_ilu_reorder != ILUReorder::NONE) { queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, rb); reorderBlockedVectorByPattern(mat->Nb, rb, toOrder, x); } else { queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, x); } if (verbosity > 2) { std::ostringstream out; out << "openclSolver::get_result(): " << t.stop() << " s"; OpmLog::info(out.str()); } } // end get_result() template SolverStatus openclSolverBackend::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) { if (initialized == false) { initialize(N_, nnz_, dim, vals, rows, cols); if (analysis_done == false) { if (!analyse_matrix()) { return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED; } } update_system(vals, b, wellContribs); if (!create_preconditioner()) { return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED; } copy_system_to_gpu(); } else { update_system(vals, b, wellContribs); if (!create_preconditioner()) { return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED; } update_system_on_gpu(); } solve_system(wellContribs, res); return SolverStatus::BDA_SOLVER_SUCCESS; } #define INSTANTIATE_BDA_FUNCTIONS(n) \ template openclSolverBackend::openclSolverBackend(int, int, double, unsigned int, unsigned int, ILUReorder); \ INSTANTIATE_BDA_FUNCTIONS(1); INSTANTIATE_BDA_FUNCTIONS(2); INSTANTIATE_BDA_FUNCTIONS(3); INSTANTIATE_BDA_FUNCTIONS(4); #undef INSTANTIATE_BDA_FUNCTIONS } // namespace bda