/* Copyright 2020 Equinor ASA This file is part of the Open Porous Media project (OPM). OPM is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. OPM is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OPM. If not, see . */ #include #include #include #include #include #include #include #include // if defined, any FPGA kernel failure will terminate flow; otherwise, the FPGA // kernel will be disabled and execution will continue using DUNE #define FPGA_EXIT_WITH_HW_FAILURE //#undef FPGA_EXIT_WITH_HW_FAILURE // if defined, the function generate_statistics will create a CSV-formatted file // with detailed statistics about the FPGA backend performance //#define FPGA_STATISTICS_FILE_ENABLED #undef FPGA_STATISTICS_FILE_ENABLED namespace Opm { namespace Accelerator { using Opm::OpmLog; template FpgaSolverBackend::FpgaSolverBackend(std::string fpga_bitstream, int verbosity_, int maxit_, double tolerance_, ILUReorder opencl_ilu_reorder) : BdaSolver(fpga_bitstream, verbosity_, maxit_, tolerance_) { int err; std::ostringstream oss; double start = second(); // currently, only block size == 3 is supported by the FPGA backend assert(block_size == 3); if (verbosity < 1) { perf_call_enabled = false; } // setup bitstream name and other parameters if (fpga_bitstream.compare("") == 0) { OPM_THROW(std::logic_error, "fpgaSolver called but bitstream file has not been specified"); } if (!fileExists(fpga_bitstream.c_str())) { OPM_THROW(std::logic_error, "fpgaSolver called but bitstream file specified does not exists or is not readable"); } // ----------------------------- // FPGA: setup the OpenCL platform // ----------------------------- std::string main_kernel_name(KERNEL_NAME); // macro defined in bicgstab_solver_config.hpp // auto-select the proper FPGA device and create context and other CL objects err = setup_opencl(nullptr, &device_id, &context, &commands, &program, &kernel, main_kernel_name.c_str(), fpga_bitstream.c_str(), &platform_awsf1); if (err != 0) { oss << "Failed to setup the OpenCL device (" << err << ")"; OPM_THROW(std::logic_error, oss.str()); } oss << "Detected FPGA platform type is "; if (platform_awsf1) { oss << "AWS-F1."; } else { oss << "Xilinx Alveo."; } OpmLog::info(oss.str()); oss.str(""); oss.clear(); // ----------------------------- // FPGA: setup the debug buffer // ----------------------------- // set kernel debug lines depending on an environment variable const char *xem = getenv("XCL_EMULATION_MODE"); if ((xem != nullptr) && (strcmp(xem, "sw_emu") == 0 || strcmp(xem, "hw_emu") == 0)) { debug_outbuf_words = DEBUG_OUTBUF_WORDS_MAX_EMU; oss << "Detected co-simulation mode, debug_outbuf_words set to " << debug_outbuf_words << ".\n"; OpmLog::info(oss.str()); oss.str(""); oss.clear(); } else { // set to 2 to reduce overhead in reading back and interpreting the debug lines; // increase to get more debug info from the kernel // range is 2..DEBUG_OUTBUF_WORDS_MAX-1 debug_outbuf_words = 2; } // host debug buffer setup err = fpga_setup_host_debugbuf(debug_outbuf_words, &debugBuffer, &debugbufferSize); if (err != 0) { oss << "Failed to call fpga_setup_host_debug_buffer (" << err << ")"; OPM_THROW(std::logic_error, oss.str()); } // device debug buffer setup err = fpga_setup_device_debugbuf(context, debugBuffer, &cldebug, debugbufferSize); if (err != 0) { oss << "Failed to call fpga_setup_device_debug_buffer (" << err << ").\n"; OPM_THROW(std::logic_error, oss.str()); } // copy debug buffer to device err = fpga_copy_to_device_debugbuf(commands, cldebug, debugBuffer, debugbufferSize, debug_outbuf_words); if (err != 0) { oss << "Failed to call fpga_copy_to_device_debugbuf (" << err << ").\n"; OPM_THROW(std::logic_error, oss.str()); } // ------------------------------------------------ // FPGA: query the kernel for limits/configuration // ------------------------------------------------ err = fpga_kernel_query(context, commands, kernel, cldebug, debugBuffer, debug_outbuf_words, rst_assert_cycles, rst_settle_cycles, &hw_x_vector_elem, &hw_max_row_size, &hw_max_column_size, &hw_max_colors_size, &hw_max_nnzs_per_row, &hw_max_matrix_size, &hw_use_uram, &hw_write_ilu0_results, &hw_dma_data_width, &hw_mult_num, &hw_x_vector_latency, &hw_add_latency, &hw_mult_latency, &hw_num_read_ports, &hw_num_write_ports, &hw_reset_cycles, &hw_reset_settle); if (err != 0) { oss << "Failed to call fpga_kernel_query (" << err << ")"; OPM_THROW(std::logic_error, oss.str()); } if (verbosity >= 1) { oss << "FPGA kernel limits/configuration:\n"; oss << " x_vector_elem=" << hw_max_colors_size << ", max_row_size=" << hw_max_nnzs_per_row << ", max_column_size=" << hw_max_matrix_size << "\n"; oss << " max_colors_size=" << hw_x_vector_elem << ", max_nnzs_per_row=" << hw_max_row_size << ", max_matrix_size=" << hw_max_column_size << "\n"; oss << " use_uram=" << hw_use_uram << ", write_ilu0_results=" << hw_write_ilu0_results << "\n"; oss << " dma_data_width=" << hw_dma_data_width << ", mult_num=" << (unsigned int)hw_mult_num << "\n"; oss << " x_vector_latency=" << (unsigned int)hw_x_vector_latency << "\n"; oss << " add_latency=" << (unsigned int)hw_add_latency << ", mult_latency=" << (unsigned int)hw_mult_latency << "\n"; oss << " num_read_ports=" << (unsigned int)hw_num_read_ports << ", num_write_ports=" << (unsigned int)hw_num_write_ports << "\n"; oss << " reset_cycles=" << hw_reset_cycles << ", reset_settle=" << hw_reset_settle; OpmLog::info(oss.str()); oss.str(""); oss.clear(); } // check that LU results are generated by the kernel if (use_LU_res && !hw_write_ilu0_results) { OpmLog::warning("Kernel reports that LU results are not written to memory, but use_LU_res is set; disabling LU results usage"); oss.str(""); oss.clear(); use_LU_res = false; } // setup preconditioner double start_prec = second(); prec = std::make_unique(opencl_ilu_reorder, verbosity_, hw_max_row_size, hw_max_column_size, hw_max_nnzs_per_row, hw_max_colors_size); perf_total.s_preconditioner_setup = second() - start_prec; if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) { level_scheduling = true; } perf_total.s_initialization = second() - start; } // end fpgaSolverBackend template FpgaSolverBackend::~FpgaSolverBackend() { if (verbosity >= 1) { generate_statistics(); } delete[] rx; delete[] rb; if (nnzValArrays != nullptr) { free(nnzValArrays); } if (L_nnzValArrays != nullptr) { free(L_nnzValArrays); } if (U_nnzValArrays != nullptr) { free(U_nnzValArrays); } // FPGA: buffers free(debugBuffer); for (int b = 0; b < RW_BUF; b++) { free(dataBuffer[b]); } free(databufferSize); // FPGA: OpenCL objects if (cldebug != nullptr) { clReleaseMemObject(cldebug); } for (int b = 0; b < RW_BUF; b++) { if (cldata[b] != nullptr) { clReleaseMemObject(cldata[b]); } } clReleaseCommandQueue(commands); clReleaseContext(context); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseDevice(device_id); } // end ~fpgaSolverBackend() // copy result to host memory // caller must be sure that x is a valid array template void FpgaSolverBackend::get_result(double *x_) { double start = 0; if (perf_call_enabled) { start = second(); } // apply to results the reordering (stored in toOrder) reorderBlockedVectorByPattern(mat->Nb, rx, toOrder, x_); // TODO: check if it is more efficient to avoid copying resultsBuffer[0] to rx in solve_system (private) if (perf_call_enabled) { perf_call.back().s_postprocess = second() - start; } } // end get_result() template SolverStatus FpgaSolverBackend::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions&, BdaResult &res) { if (initialized == false) { initialize(N_, nnz_, dim, vals, rows, cols); if (!analyse_matrix()) { return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED; } } perf_call.emplace_back(); update_system(vals, b); if (!create_preconditioner()) { return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED; } solve_system(res); if (verbosity >= 1) { std::ostringstream oss; oss << "fpgaSolverBackend::" << __func__ << " - converged: " << res.converged << \ ", iterations: " << res.iterations << ", reduction: " << res.reduction << \ ", conv_rate: " << res.conv_rate << ", elapsed: " << res.elapsed; OpmLog::info(oss.str()); } return SolverStatus::BDA_SOLVER_SUCCESS; } template void FpgaSolverBackend::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols) { double start = second(); this->N = N_; this->nnz = nnz_; this->nnzb = nnz_ / block_size / block_size; Nb = (N + dim - 1) / dim; // allocate host memory for matrices and vectors // actual data for mat points to std::vector.data() in ISTLSolverEbos, so no alloc/free here mat.reset(new BlockedMatrix(N_ / block_size, nnz_ / block_size / block_size, vals, cols, rows)); std::ostringstream oss; oss << "Initializing FPGA data, matrix size: " << this->N << " blocks, nnz: " << this->nnzb << " blocks, " << \ "block size: " << dim << ", total nnz: " << this->nnz << "\n"; oss << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance; OpmLog::info(oss.str()); rx = new double[roundUpTo(N_, CACHELINE_BYTES / sizeof(double))]; rb = new double[roundUpTo(N_, CACHELINE_BYTES / sizeof(double))]; perf_total.s_initialization += second() - start; initialized = true; } // end initialize() template bool FpgaSolverBackend::analyse_matrix() { std::ostringstream oss; int err; double start = second(); bool success = prec->init(mat.get()); if (!success) { OpmLog::warning("Preconditioner for FPGA solver failed to initialize"); return success; } toOrder = prec->getToOrder(); fromOrder = prec->getFromOrder(); rMat = prec->getRMat(); processedPointers = prec->getResultPointers(); processedSizes = prec->getResultSizes(); processedPointers[19] = rb; processedPointers[20] = rx; nnzValArrays_size = static_cast(processedPointers[5])[0]; L_nnzValArrays_size = static_cast(processedPointers[11])[0]; U_nnzValArrays_size = static_cast(processedPointers[17])[0]; // ------------------------------------- // FPGA: setup host/device data buffers // ------------------------------------- // allocate memory and setup data layout err = fpga_setup_host_datamem(level_scheduling, fpga_config_bits, processedSizes, &setupArray, &nnzValArrays, &nnzValArrays_size, &columnIndexArray, &newRowOffsetArray, &PIndexArray, &colorSizesArray, &L_nnzValArrays, &L_nnzValArrays_size, &L_columnIndexArray, &L_newRowOffsetArray, &L_PIndexArray, &L_colorSizesArray, &U_nnzValArrays, &U_nnzValArrays_size, &U_columnIndexArray, &U_newRowOffsetArray, &U_PIndexArray, &U_colorSizesArray, &BLKDArray, &X1Array, &R1Array, &X2Array, &R2Array, &LresArray, &UresArray, &databufferSize, dataBuffer, result_offsets, 1 /*num_nnz_arrays*/, true /*reset_data_buffers*/, /* WARNING: leave reset_data_buffers always ENABLED to avoid data corruption! */ debugbufferSize); if (err) { oss << "Failed to call fpga_setup_host_datamem (" << err << ")"; OPM_THROW(std::logic_error, oss.str()); } // results buffers setup if (use_LU_res) { resultsBufferNum = 4; } else { resultsBufferNum = 2; } if (resultsBufferNum > RES_BUF_MAX) { oss << "Number of results buffer (" << resultsBufferNum << ") is out of range (max " << RES_BUF_MAX << ")"; OPM_THROW(std::logic_error, oss.str()); } resultsNum = processedSizes[0]; // rowSize, invariant between system solves for (int i = 0; i < resultsBufferNum; i++) { resultsBufferSize[i] = roundUpTo(resultsNum, CACHELINE_BYTES / sizeof(double)) * sizeof(double); } // device data memory setup err = fpga_setup_device_datamem(context, databufferSize, dataBuffer, cldata); if (err != 0) { oss << "Failed to call fpga_setup_device_datamem (" << err << ")"; OPM_THROW(std::logic_error, oss.str()); } // ------------------------------------ // FPGA: setup the kernel's parameters // ------------------------------------ err = fpga_set_kernel_parameters(kernel, abort_cycles, debug_outbuf_words - 1, maxit, debug_sample_rate, tolerance, cldata, cldebug); if (err != 0) { oss << "Failed to call fpga_set_kernel_parameters (" << err << ")"; OPM_THROW(std::logic_error, oss.str()); } perf_total.s_analysis = second() - start; analysis_done = true; return success; } // end analyse_matrix() template bool FpgaSolverBackend::create_preconditioner() { double start = 0; if (perf_call_enabled) { start = second(); } memset(rx, 0, sizeof(double) * N); bool result = prec->create_preconditioner(mat.get()); if (!result) { OpmLog::warning("fpgaSolverBackend: create_preconditioner failed"); } if (perf_call_enabled) { perf_call.back().s_preconditioner_create = second() - start; } return result; } // end create_preconditioner() template void FpgaSolverBackend::solve_system(BdaResult &res) { std::ostringstream oss; int err; double start = 0, start_total = 0; // ------------------------------------ // FPGA: return immediately if FPGA is disabled // ------------------------------------ if (fpga_disabled) { res.converged = false; OpmLog::warning("FPGA is disabled, fallback to SW execution"); return; } fpga_calls++; if (perf_call_enabled) { start = second(); start_total = start; } // check if any buffer is larger than the size set in preconditioner->init // TODO: add check for all other buffer sizes that may overflow? err = 0; if ( ((int *)processedPointers[5])[0] > nnzValArrays_size || ((int *)processedPointers[11])[0] > L_nnzValArrays_size || ((int *)processedPointers[17])[0] > U_nnzValArrays_size ) { err = 1; } if (err != 0) { OPM_THROW(std::logic_error, "A buffer size is larger than the initial allocation in solve_system (check preconditioner init)"); } // ------------------------------------ // FPGA: copy input data to host data buffers // ------------------------------------ if (perf_call_enabled) { start = second(); } err = fpga_copy_host_datamem( processedPointers, processedSizes, setupArray, nnzValArrays, &nnzValArrays_size, columnIndexArray, newRowOffsetArray, PIndexArray, colorSizesArray, L_nnzValArrays, &L_nnzValArrays_size, L_columnIndexArray, L_newRowOffsetArray, L_PIndexArray, L_colorSizesArray, U_nnzValArrays, &U_nnzValArrays_size, U_columnIndexArray, U_newRowOffsetArray, U_PIndexArray, U_colorSizesArray, BLKDArray, X1Array, R1Array, X2Array, R2Array, use_LU_res, LresArray, UresArray, databufferSize, dataBuffer, 1 /* nnzValArrays_num */, reset_data_buffers, fill_results_buffers, dump_data_buffers, fpga_calls); if (perf_call_enabled) { perf_call.back().s_mem_setup = second() - start; } if (err != 0) { oss << "Failed to call fpga_copy_to_device_debugbuf (" << err << ")"; OPM_THROW(std::logic_error, oss.str()); } // ------------------------------------ // FPGA: copy buffers to device // ------------------------------------ // copy debug buffer to device if (perf_call_enabled) { start = second(); } err = fpga_copy_to_device_debugbuf(commands, cldebug, debugBuffer, debugbufferSize, debug_outbuf_words); if (err != 0) { oss << "Failed to call fpga_copy_to_device_debugbuf (" << err << ")"; OPM_THROW(std::logic_error, oss.str()); } // copy data buffers to device err = fpga_copy_to_device_datamem(commands, RW_BUF, cldata); if (err != 0) { oss << "Failed to call fpga_copy_to_device_datamem (" << err << ")"; OPM_THROW(std::logic_error, oss.str()); } if (perf_call_enabled) { perf_call.back().s_mem_h2d = second() - start; } // ------------------------------------ // FPGA: execute the kernel // ------------------------------------ double time_elapsed_ms; if (perf_call_enabled) { start = second(); } err = fpga_kernel_run(commands, kernel, &time_elapsed_ms); if (perf_call_enabled) { perf_call.back().s_kernel_exec = second() - start; } if (err != 0) { oss << "Failed to call fpga_kernel_run (" << err << ")"; OPM_THROW(std::logic_error, oss.str()); } // ---------------------------------------- // FPGA: read back debug buffer from device // ---------------------------------------- if (perf_call_enabled) { start = second(); } err = fpga_copy_from_device_debugbuf((bool)(verbosity < 10), commands, debug_outbuf_words, debugbufferSize, cldebug, debugBuffer, abort_cycles, &kernel_cycles, &kernel_iter_run, norms, &last_norm_idx, &kernel_aborted, &kernel_signature, &kernel_overflow, &kernel_noresults, &kernel_wrafterend, &kernel_dbgfifofull); if (err != 0) { oss << "Failed to call fpga_copy_from_device_debugbuf (" << err << ")"; OPM_THROW(std::logic_error, oss.str()); } if (kernel_wrafterend) { OpmLog::warning("Detected recoverable FPGA error: kernel write after end"); } if (kernel_dbgfifofull) { OpmLog::warning("Detected recoverable FPGA error: debug FIFO full"); } if (kernel_aborted || kernel_signature || kernel_overflow) { #if defined(FPGA_EXIT_WITH_HW_FAILURE) oss << "Detected unrecoverable FPGA error (ABRT=" << kernel_aborted << \ ",SIG=" << kernel_signature << ",OVF=" << kernel_overflow << ")"; OPM_THROW(std::logic_error, oss.str()); #else oss << "Detected unrecoverable FPGA error (ABRT=" << kernel_aborted << \ ",SIG=" << kernel_signature << ",OVF=" << kernel_overflow << ")\n"; oss << "Disabling FPGA kernel: execution will continue with SW kernel"; OpmLog::warning(oss.str()); oss.str(""); oss.clear(); fpga_disabled = true; #endif } if (perf_call_enabled) { perf_call.back().n_kernel_exec_cycles = kernel_cycles; } // copy (back) results only if FPGA is not disabled if (!fpga_disabled) { if (kernel_noresults) { OpmLog::warning("FPGA kernel did not return results because the required precision is already reached"); // rx still contains zeros from initial guess } else { // ------------------------------------ // FPGA: read back results from device // ------------------------------------ err = fpga_map_results(even(kernel_iter_run), use_residuals, use_LU_res, commands, resultsNum, resultsBufferNum, resultsBufferSize, debugbufferSize, cldata, resultsBuffer, result_offsets, dump_results, data_dir, basename, sequence); if (err != 0) { oss << "Failed to call fpga_map_results (" << err << ")"; OPM_THROW(std::logic_error, oss.str()); } // TODO: copy results buffers to reordering output buffers memcpy(rx, resultsBuffer[0], resultsNum * sizeof(double)); err = fpga_unmap_results(even(kernel_iter_run), use_residuals, use_LU_res, commands, cldata, resultsBuffer); if (err != 0) { oss << "Failed to call fpga_unmap_results (" << err << ")"; OPM_THROW(std::logic_error, oss.str()); } } } // set results and update statistics (if enabled) if (perf_call_enabled) { perf_call.back().s_mem_d2h = second() - start; } float iter = ((float)kernel_iter_run / 2.0) + 0.5; // convert from half iteration int to actual iterationns res.iterations = (int)iter; res.reduction = norms[0] / norms[last_norm_idx]; // norms[0] is the initial norm res.conv_rate = pow(res.reduction, 1.0 / iter); res.elapsed = second() - start_total; if (perf_call_enabled) { perf_call.back().s_solve = res.elapsed; perf_call.back().n_kernel_exec_iters = iter; } // convergence depends on number of iterations reached and hw execution errors res.converged = true; if (fpga_disabled || kernel_aborted || kernel_signature || kernel_overflow || iter >= (float)maxit) { res.converged = false; if (verbosity >= 1) { oss << "FPGA kernel did not converge, reason: fpga_disabled=" << fpga_disabled << \ ", kernel_aborted=" << kernel_aborted << ", kernel_signature=" << kernel_signature << \ ", kernel_overflow=" << kernel_overflow << ", (iter>=" << maxit << ")=" << (iter >= (float)maxit); OpmLog::warning(oss.str()); oss.str(""); oss.clear(); } } if (perf_call_enabled) { perf_call.back().converged = res.converged; perf_call.back().converged_flags = ((unsigned int)fpga_disabled) + ((unsigned int)kernel_aborted << 1) + ((unsigned int)kernel_signature << 2) + ((unsigned int)kernel_overflow << 3) + ((unsigned int)(iter >= (float)maxit) << 4); } } // end solve_system() template void FpgaSolverBackend::update_system(double *vals, double *b) { double start = 0; mat->nnzValues = vals; // reorder inputs using previously found ordering (stored in fromOrder) if (perf_call_enabled) { start = second(); } reorderBlockedVectorByPattern(mat->Nb, b, fromOrder, rb); if (perf_call_enabled) { perf_call.back().s_reorder = second() - start; } } // end update_system() template void FpgaSolverBackend::generate_statistics() { std::ostringstream oss; unsigned int conv_iter = 0, conv_ovf = 0; if (!perf_call_enabled || fpga_calls == 0) { OpmLog::warning("FPGA statistics were not collected"); return; } std::printf("--- FPGA statistics ---\n"); std::printf("total solver calls..........: %u\n", fpga_calls); std::printf("time initialization.........: %8.6f s\n", perf_total.s_initialization); std::printf("time preconditioner setup...: %8.6f s\n", perf_total.s_preconditioner_setup); #if defined(FPGA_STATISTICS_FILE_ENABLED) // DEBUG: this can be enabled to gather all the statistics in a CSV-formatted file FILE *fout = fopen("fpga_statistics_details.csv", "w"); if (fout != nullptr) { std::fprintf(fout, "call,preconditioner_create,analysis,reorder,mem_setup,mem_h2d,kernel_exec,kernel_cycles,kernel_iters,mem_d2h,solve,postprocess,converged\n"); } #endif unsigned int num_data_points = perf_call.size(); for (unsigned int i = 0; i < num_data_points; i++) { perf_total.s_preconditioner_create += perf_call[i].s_preconditioner_create; if (perf_call[i].s_preconditioner_create > perf_total.s_preconditioner_create_max) { perf_total.s_preconditioner_create_max = perf_call[i].s_preconditioner_create; } if (perf_call[i].s_preconditioner_create < perf_total.s_preconditioner_create_min) { perf_total.s_preconditioner_create_min = perf_call[i].s_preconditioner_create; } perf_total.s_analysis += perf_call[i].s_analysis; if (perf_call[i].s_analysis > perf_total.s_analysis_max) { perf_total.s_analysis_max = perf_call[i].s_analysis; } if (perf_call[i].s_analysis < perf_total.s_analysis_min) { perf_total.s_analysis_min = perf_call[i].s_analysis; } perf_total.s_reorder += perf_call[i].s_reorder; if (perf_call[i].s_reorder > perf_total.s_reorder_max) { perf_total.s_reorder_max = perf_call[i].s_reorder; } if (perf_call[i].s_reorder < perf_total.s_reorder_min) { perf_total.s_reorder_min = perf_call[i].s_reorder; } perf_total.s_mem_setup += perf_call[i].s_mem_setup; if (perf_call[i].s_mem_setup > perf_total.s_mem_setup_max) { perf_total.s_mem_setup_max = perf_call[i].s_mem_setup; } if (perf_call[i].s_mem_setup < perf_total.s_mem_setup_min) { perf_total.s_mem_setup_min = perf_call[i].s_mem_setup; } perf_total.s_mem_h2d += perf_call[i].s_mem_h2d; if (perf_call[i].s_mem_h2d > perf_total.s_mem_h2d_max) { perf_total.s_mem_h2d_max = perf_call[i].s_mem_h2d; } if (perf_call[i].s_mem_h2d < perf_total.s_mem_h2d_min) { perf_total.s_mem_h2d_min = perf_call[i].s_mem_h2d; } perf_total.s_kernel_exec += perf_call[i].s_kernel_exec; if (perf_call[i].s_kernel_exec > perf_total.s_kernel_exec_max) { perf_total.s_kernel_exec_max = perf_call[i].s_kernel_exec; } if (perf_call[i].s_kernel_exec < perf_total.s_kernel_exec_min) { perf_total.s_kernel_exec_min = perf_call[i].s_kernel_exec; } perf_total.n_kernel_exec_cycles += (unsigned long)perf_call[i].n_kernel_exec_cycles; if (perf_call[i].n_kernel_exec_cycles > perf_total.n_kernel_exec_cycles_max) { perf_total.n_kernel_exec_cycles_max = perf_call[i].n_kernel_exec_cycles; } if (perf_call[i].n_kernel_exec_cycles < perf_total.n_kernel_exec_cycles_min) { perf_total.n_kernel_exec_cycles_min = perf_call[i].n_kernel_exec_cycles; } perf_total.n_kernel_exec_iters += perf_call[i].n_kernel_exec_iters; if (perf_call[i].n_kernel_exec_iters > perf_total.n_kernel_exec_iters_max) { perf_total.n_kernel_exec_iters_max = perf_call[i].n_kernel_exec_iters; } if (perf_call[i].n_kernel_exec_iters < perf_total.n_kernel_exec_iters_min) { perf_total.n_kernel_exec_iters_min = perf_call[i].n_kernel_exec_iters; } perf_total.s_mem_d2h += perf_call[i].s_mem_d2h; if (perf_call[i].s_mem_d2h > perf_total.s_mem_d2h_max) { perf_total.s_mem_d2h_max = perf_call[i].s_mem_d2h; } if (perf_call[i].s_mem_d2h < perf_total.s_mem_d2h_min) { perf_total.s_mem_d2h_min = perf_call[i].s_mem_d2h; } perf_total.s_solve += perf_call[i].s_solve; if (perf_call[i].s_solve > perf_total.s_solve_max) { perf_total.s_solve_max = perf_call[i].s_solve; } if (perf_call[i].s_solve < perf_total.s_solve_min) { perf_total.s_solve_min = perf_call[i].s_solve; } perf_total.s_postprocess += perf_call[i].s_postprocess; if (perf_call[i].s_postprocess > perf_total.s_postprocess_max) { perf_total.s_postprocess_max = perf_call[i].s_postprocess; } if (perf_call[i].s_postprocess < perf_total.s_postprocess_min) { perf_total.s_postprocess_min = perf_call[i].s_postprocess; } perf_total.n_converged += (unsigned int)perf_call[i].converged; if (perf_call[i].converged_flags & 1 << 4) { conv_iter += 1; } if (perf_call[i].converged_flags & 1 << 3) { conv_ovf += 1; } #if defined(FPGA_STATISTICS_FILE_ENABLED) if (fout != nullptr) { std::fprintf(fout, "%d,%8.6f,%8.6f,%8.6f,%8.6f,%8.6f,%8.6f,%u,%.1f,%8.6f,%8.6f,%8.6f,%u\n", i, perf_call[i].s_preconditioner_create, perf_call[i].s_analysis, perf_call[i].s_reorder, perf_call[i].s_mem_setup, perf_call[i].s_mem_h2d, perf_call[i].s_kernel_exec, perf_call[i].n_kernel_exec_cycles, perf_call[i].n_kernel_exec_iters, perf_call[i].s_mem_d2h, perf_call[i].s_solve, perf_call[i].s_postprocess, (unsigned int)perf_call[i].converged); } #endif } #if defined(FPGA_STATISTICS_FILE_ENABLED) if (fout != nullptr) { fclose(fout); } #endif perf_total.s_preconditioner_create_avg = perf_total.s_preconditioner_create / num_data_points; perf_total.s_analysis_avg = perf_total.s_analysis / num_data_points; perf_total.s_reorder_avg = perf_total.s_reorder / num_data_points; perf_total.s_mem_setup_avg = perf_total.s_mem_setup / num_data_points; perf_total.s_mem_h2d_avg = perf_total.s_mem_h2d / num_data_points; perf_total.s_kernel_exec_avg = perf_total.s_kernel_exec / num_data_points; perf_total.n_kernel_exec_cycles_avg = perf_total.n_kernel_exec_cycles / num_data_points; perf_total.n_kernel_exec_iters_avg = perf_total.n_kernel_exec_iters / num_data_points; perf_total.s_mem_d2h_avg = perf_total.s_mem_d2h / num_data_points; perf_total.s_solve_avg = perf_total.s_solve / num_data_points; perf_total.s_postprocess_avg = perf_total.s_postprocess / num_data_points; std::printf("time preconditioner creation: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n", perf_total.s_preconditioner_create, perf_total.s_preconditioner_create_avg, perf_total.s_preconditioner_create_min, perf_total.s_preconditioner_create_max); std::printf("time analysis...............: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n", perf_total.s_analysis, perf_total.s_analysis_avg, perf_total.s_analysis_min, perf_total.s_analysis_max); std::printf("time reorder................: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n", perf_total.s_reorder, perf_total.s_reorder_avg, perf_total.s_reorder_min, perf_total.s_reorder_max); std::printf("time memory setup...........: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n", perf_total.s_mem_setup, perf_total.s_mem_setup_avg, perf_total.s_mem_setup_min, perf_total.s_mem_setup_max); std::printf("time memory host2dev........: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n", perf_total.s_mem_h2d, perf_total.s_mem_h2d_avg, perf_total.s_mem_h2d_min, perf_total.s_mem_h2d_max); std::printf("time kernel execution.......: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n", perf_total.s_kernel_exec, perf_total.s_kernel_exec_avg, perf_total.s_kernel_exec_min, perf_total.s_kernel_exec_max); std::printf("cycles kernel execution.....: total %lu, avg %lu, min %lu, max %lu\n", perf_total.n_kernel_exec_cycles, perf_total.n_kernel_exec_cycles_avg, perf_total.n_kernel_exec_cycles_min, perf_total.n_kernel_exec_cycles_max); std::printf("iterations kernel execution.: total %.1f, avg %.1f, min %.1f, max %.1f\n", perf_total.n_kernel_exec_iters, perf_total.n_kernel_exec_iters_avg, perf_total.n_kernel_exec_iters_min, perf_total.n_kernel_exec_iters_max); std::printf("time memory dev2host........: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n", perf_total.s_mem_d2h, perf_total.s_mem_d2h_avg, perf_total.s_mem_d2h_min, perf_total.s_mem_d2h_max); std::printf("time solve..................: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n", perf_total.s_solve, perf_total.s_solve_avg, perf_total.s_solve_min, perf_total.s_solve_max); std::printf("time postprocess............: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n", perf_total.s_postprocess, perf_total.s_postprocess_avg, perf_total.s_postprocess_min, perf_total.s_postprocess_max); std::printf("converged...................: %u/%u, with iter>%d=%u, overflow=%u\n", perf_total.n_converged, num_data_points, maxit, conv_iter, conv_ovf); std::printf("-----------------------\n"); } //end generate_statistics() #define INSTANTIATE_BDA_FUNCTIONS(n) \ template FpgaSolverBackend::FpgaSolverBackend(std::string, int, int, double, ILUReorder); \ INSTANTIATE_BDA_FUNCTIONS(1); INSTANTIATE_BDA_FUNCTIONS(2); INSTANTIATE_BDA_FUNCTIONS(3); INSTANTIATE_BDA_FUNCTIONS(4); INSTANTIATE_BDA_FUNCTIONS(5); INSTANTIATE_BDA_FUNCTIONS(6); #undef INSTANTIATE_BDA_FUNCTIONS } // namespace Accelerator } // namespace Opm