mirror of
				https://github.com/OPM/opm-simulators.git
				synced 2025-02-25 18:55:30 -06:00 
			
		
		
		
	Do not reorder the matrix for GPU
This commit is contained in:
		| @@ -93,16 +93,16 @@ void reorderBlockedVectorByPattern(int Nb, double *vector, int *fromOrder, doubl | ||||
| bool canBeStarted(const int rowIndex, const  int *rowPointers, const  int *colIndices, const std::vector<bool>& doneRows); | ||||
|  | ||||
| /// Find a level scheduling reordering for an input matrix | ||||
| /// The toOrder and fromOrder   arrays must be allocated already | ||||
| /// The toOrder and fromOrder arrays must be allocated already | ||||
| /// \param[in] CSRColIndices    column indices array, obtained from storing the input matrix in the CSR format | ||||
| /// \param[in] CSRRowPointers   row pointers array, obtained from storing the input matrix in the CSR format | ||||
| /// \param[in] CSCRowIndices    row indices array, obtained from storing the input matrix in the CSC format | ||||
| /// \param[in] CSCColPointers   column pointers array, obtained from storing the input matrix in the CSC format | ||||
| /// \param[in] Nb               number of blockrows in the matrix | ||||
| /// \param[out] numColors       a pointer to the number of colors needed for the level scheduling | ||||
| /// \param[inout] toOrder       the reorder pattern that was found, which lists for each index in the original order, to which index in the new order it should be moved | ||||
| /// \param[inout] fromOrder     the reorder pattern that was found, which lists for each index in the new order, from which index in the original order it was moved | ||||
| /// \param[inout] rowsPerColor  for each used color, the number of rows assigned to that color, this function uses emplace_back() to fill | ||||
| /// \param[out] toOrder         the reorder pattern that was found, which lists for each index in the original order, to which index in the new order it should be moved | ||||
| /// \param[out] fromOrder       the reorder pattern that was found, which lists for each index in the new order, from which index in the original order it was moved | ||||
| /// \param[out] rowsPerColor    for each color, an array of all rowIndices in that color, this function uses emplace_back() to fill | ||||
| void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowIndices, int *CSCColPointers, int Nb, int *numColors, int *toOrder, int* fromOrder, std::vector<int>& rowsPerColor); | ||||
|  | ||||
| /// Find a graph coloring reordering for an input matrix | ||||
|   | ||||
| @@ -78,14 +78,8 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat | ||||
|         fromOrder.resize(Nb); | ||||
|         CSCRowIndices.resize(matToDecompose->nnzbs); | ||||
|         CSCColPointers.resize(Nb + 1); | ||||
|         rmat = std::make_shared<BlockedMatrix>(mat->Nb, mat->nnzbs, block_size); | ||||
|  | ||||
|         if (jacMat) { | ||||
|             rJacMat = std::make_shared<BlockedMatrix>(jacMat->Nb, jacMat->nnzbs, block_size); | ||||
|             LUmat = std::make_unique<BlockedMatrix>(*rJacMat); | ||||
|         } else { | ||||
|             LUmat = std::make_unique<BlockedMatrix>(*rmat); | ||||
|         } | ||||
|         LUmat = std::make_unique<BlockedMatrix>(*matToDecompose); | ||||
|  | ||||
|         Timer t_convert; | ||||
|         csrPatternToCsc(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb); | ||||
| @@ -101,17 +95,9 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat | ||||
|     if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) { | ||||
|         out << "BILU0 reordering strategy: " << "level_scheduling\n"; | ||||
|         findLevelScheduling(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor); | ||||
|         reorderBlockedMatrixByPattern(mat, reordermappingNonzeroes, toOrder.data(), fromOrder.data(), rmat.get()); | ||||
|         if (jacMat) { | ||||
|             reorderBlockedMatrixByPattern(jacMat, jacReordermappingNonzeroes, toOrder.data(), fromOrder.data(), rJacMat.get()); | ||||
|         } | ||||
|     } else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) { | ||||
|         out << "BILU0 reordering strategy: " << "graph_coloring\n"; | ||||
|         findGraphColoring<block_size>(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb, Nb, Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor); | ||||
|         reorderBlockedMatrixByPattern(mat, reordermappingNonzeroes, toOrder.data(), fromOrder.data(), rmat.get()); | ||||
|         if (jacMat) { | ||||
|             reorderBlockedMatrixByPattern(jacMat, jacReordermappingNonzeroes, toOrder.data(), fromOrder.data(), rJacMat.get()); | ||||
|         } | ||||
|     } else if (opencl_ilu_reorder == ILUReorder::NONE) { | ||||
|         out << "BILU0 reordering strategy: none\n"; | ||||
|         // numColors = 1; | ||||
| @@ -143,6 +129,7 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat | ||||
|     s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb); | ||||
|     s.rowsPerColor = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (numColors + 1)); | ||||
|     s.diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->Nb); | ||||
|     s.rowIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned) * LUmat->Nb); | ||||
| #if CHOW_PATEL | ||||
|     s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs); | ||||
|     s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs); | ||||
| @@ -156,14 +143,16 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat | ||||
|     s.LUrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (LUmat->Nb + 1)); | ||||
| #endif | ||||
|  | ||||
|     events.resize(2); | ||||
|     events.resize(3); | ||||
|     err = queue->enqueueWriteBuffer(s.invDiagVals, CL_FALSE, 0, mat->Nb * sizeof(double) * bs * bs, invDiagVals.data(), nullptr, &events[0]); | ||||
|  | ||||
|     rowsPerColorPrefix.resize(numColors + 1); // resize initializes value 0.0 | ||||
|     for (int i = 0; i < numColors; ++i) { | ||||
|         rowsPerColorPrefix[i+1] = rowsPerColorPrefix[i] + rowsPerColor[i]; | ||||
|         rowsPerColorPrefix[i + 1] = rowsPerColorPrefix[i] + rowsPerColor[i]; | ||||
|     } | ||||
|  | ||||
|     err |= queue->enqueueWriteBuffer(s.rowsPerColor, CL_FALSE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix.data(), nullptr, &events[1]); | ||||
|     err |= queue->enqueueWriteBuffer(s.rowIndices, CL_FALSE, 0, Nb * sizeof(unsigned), fromOrder.data(), nullptr, &events[2]); | ||||
|  | ||||
|     cl::WaitForEvents(events); | ||||
|     events.clear(); | ||||
| @@ -189,27 +178,7 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix | ||||
| { | ||||
|     const unsigned int bs = block_size; | ||||
|  | ||||
|     auto *matToDecompose = jacMat; | ||||
|  | ||||
|     if (opencl_ilu_reorder == ILUReorder::NONE) { // NONE should only be used in debug | ||||
|         matToDecompose = jacMat ? jacMat : mat; | ||||
|     } else { | ||||
|         Timer t_reorder; | ||||
|         if (jacMat) { | ||||
|             matToDecompose = rJacMat.get(); | ||||
|             reorderNonzeroes(mat, reordermappingNonzeroes, rmat.get()); | ||||
|             reorderNonzeroes(jacMat, jacReordermappingNonzeroes, rJacMat.get()); | ||||
|         } else { | ||||
|             matToDecompose = rmat.get(); | ||||
|             reorderNonzeroes(mat, reordermappingNonzeroes, rmat.get()); | ||||
|         } | ||||
|  | ||||
|         if (verbosity >= 3){ | ||||
|             std::ostringstream out; | ||||
|             out << "BILU0 reorder matrix: " << t_reorder.stop() << " s"; | ||||
|             OpmLog::info(out.str()); | ||||
|         } | ||||
|     } | ||||
|     auto *matToDecompose = jacMat ? jacMat : mat; | ||||
|  | ||||
|     // TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp | ||||
|     // this copy can have mat or rmat ->nnzValues as origin, depending on the reorder strategy | ||||
| @@ -269,11 +238,11 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix | ||||
|     std::ostringstream out; | ||||
|     for (int color = 0; color < numColors; ++color) { | ||||
|         const unsigned int firstRow = rowsPerColorPrefix[color]; | ||||
|         const unsigned int lastRow = rowsPerColorPrefix[color+1]; | ||||
|         const unsigned int lastRow = rowsPerColorPrefix[color + 1]; | ||||
|         if (verbosity >= 5) { | ||||
|             out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n"; | ||||
|         } | ||||
|         OpenclKernels::ILU_decomp(firstRow, lastRow, | ||||
|         OpenclKernels::ILU_decomp(firstRow, lastRow, s.rowIndices, | ||||
|                                   s.LUvals, s.LUcols, s.LUrows, s.diagIndex, | ||||
|                                   s.invDiagVals, rowsPerColor[color], block_size); | ||||
|     } | ||||
| @@ -301,11 +270,11 @@ void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x) | ||||
|  | ||||
|     for (int color = 0; color < numColors; ++color) { | ||||
| #if CHOW_PATEL | ||||
|         OpenclKernels::ILU_apply1(s.Lvals, s.Lcols, s.Lrows, | ||||
|         OpenclKernels::ILU_apply1(s.rowIndices, s.Lvals, s.Lcols, s.Lrows, | ||||
|                                   s.diagIndex, y, x, s.rowsPerColor, | ||||
|                                   color, rowsPerColor[color], block_size); | ||||
| #else | ||||
|         OpenclKernels::ILU_apply1(s.LUvals, s.LUcols, s.LUrows, | ||||
|         OpenclKernels::ILU_apply1(s.rowIndices, s.LUvals, s.LUcols, s.LUrows, | ||||
|                                   s.diagIndex, y, x, s.rowsPerColor, | ||||
|                                   color, rowsPerColor[color], block_size); | ||||
| #endif | ||||
| @@ -313,11 +282,11 @@ void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x) | ||||
|  | ||||
|     for (int color = numColors - 1; color >= 0; --color) { | ||||
| #if CHOW_PATEL | ||||
|         OpenclKernels::ILU_apply2(s.Uvals, s.Ucols, s.Urows, | ||||
|         OpenclKernels::ILU_apply2(s.rowIndices, s.Uvals, s.Ucols, s.Urows, | ||||
|                                   s.diagIndex, s.invDiagVals, x, s.rowsPerColor, | ||||
|                                   color, rowsPerColor[color], block_size); | ||||
| #else | ||||
|         OpenclKernels::ILU_apply2(s.LUvals, s.LUcols, s.LUrows, | ||||
|         OpenclKernels::ILU_apply2(s.rowIndices, s.LUvals, s.LUcols, s.LUrows, | ||||
|                                   s.diagIndex, s.invDiagVals, x, s.rowsPerColor, | ||||
|                                   color, rowsPerColor[color], block_size); | ||||
| #endif | ||||
|   | ||||
| @@ -54,8 +54,6 @@ class BILU0 : public Preconditioner<block_size> | ||||
|  | ||||
| private: | ||||
|     std::unique_ptr<BlockedMatrix> LUmat = nullptr; | ||||
|     std::shared_ptr<BlockedMatrix> rmat = nullptr; // only used with PAR_SIM | ||||
|     std::shared_ptr<BlockedMatrix> rJacMat = nullptr;  | ||||
| #if CHOW_PATEL | ||||
|     std::unique_ptr<BlockedMatrix> Lmat = nullptr, Umat = nullptr; | ||||
| #endif | ||||
| @@ -76,6 +74,7 @@ private: | ||||
|         cl::Buffer invDiagVals; | ||||
|         cl::Buffer diagIndex; | ||||
|         cl::Buffer rowsPerColor; | ||||
|         cl::Buffer rowIndices; | ||||
| #if CHOW_PATEL | ||||
|         cl::Buffer Lvals, Lcols, Lrows; | ||||
|         cl::Buffer Uvals, Ucols, Urows; | ||||
| @@ -117,12 +116,7 @@ public: | ||||
|  | ||||
|     BlockedMatrix* getRMat() override | ||||
|     { | ||||
|         return rmat.get(); | ||||
|     } | ||||
|  | ||||
|     BlockedMatrix* getRJacMat() | ||||
|     { | ||||
|         return rJacMat.get(); | ||||
|         return LUmat.get(); | ||||
|     } | ||||
|  | ||||
|     std::tuple<std::vector<int>, std::vector<int>, std::vector<int>> get_preconditioner_structure() | ||||
|   | ||||
| @@ -3,6 +3,7 @@ | ||||
| /// Here, L is it's own BSR matrix. | ||||
| /// Only used with ChowPatelIlu. | ||||
| __kernel void ILU_apply1( | ||||
|     __global const unsigned *rowIndices, | ||||
|     __global const double *LUvals, | ||||
|     __global const unsigned int *LUcols, | ||||
|     __global const unsigned int *LUrows, | ||||
| @@ -30,14 +31,16 @@ __kernel void ILU_apply1( | ||||
|     target_block_row += nodesPerColorPrefix[color]; | ||||
|  | ||||
|     while(target_block_row < nodesPerColorPrefix[color+1]){ | ||||
|         const unsigned int first_block = LUrows[target_block_row]; | ||||
|         const unsigned int last_block = LUrows[target_block_row+1]; | ||||
|         unsigned row_idx = rowIndices[target_block_row]; | ||||
|  | ||||
|         const unsigned int first_block = LUrows[row_idx]; | ||||
|         const unsigned int last_block = LUrows[row_idx+1]; | ||||
|         unsigned int block = first_block + lane / (bs*bs); | ||||
|         double local_out = 0.0; | ||||
|  | ||||
|         if(lane < num_active_threads){ | ||||
|             if(lane < bs){ | ||||
|                 local_out = y[target_block_row*bs+lane]; | ||||
|                 local_out = y[row_idx*bs+lane]; | ||||
|             } | ||||
|             for(; block < last_block; block += num_blocks_per_warp){ | ||||
|                 const double x_elem = x[LUcols[block]*bs + c]; | ||||
| @@ -60,7 +63,7 @@ __kernel void ILU_apply1( | ||||
|         } | ||||
|  | ||||
|         if(lane < bs){ | ||||
|             const unsigned int row = target_block_row*bs + lane; | ||||
|             const unsigned int row = row_idx*bs + lane; | ||||
|             x[row] = tmp[lane]; | ||||
|         } | ||||
|  | ||||
|   | ||||
| @@ -3,6 +3,7 @@ | ||||
| /// Here, L is inside a normal, square matrix. | ||||
| /// In this case, diagIndex indicates where the rows of L end. | ||||
| __kernel void ILU_apply1( | ||||
|     __global const unsigned *rowIndices, | ||||
|     __global const double *LUvals, | ||||
|     __global const unsigned int *LUcols, | ||||
|     __global const unsigned int *LUrows, | ||||
| @@ -29,14 +30,17 @@ __kernel void ILU_apply1( | ||||
|     const unsigned int r = lane % bs; | ||||
|  | ||||
|     while(target_block_row < nodesPerColorPrefix[color+1]){ | ||||
|         const unsigned int first_block = LUrows[target_block_row]; | ||||
|         const unsigned int last_block = diagIndex[target_block_row]; | ||||
|         unsigned row_idx = rowIndices[target_block_row]; | ||||
|  | ||||
|         const unsigned int first_block = LUrows[row_idx]; | ||||
|         const unsigned int last_block = diagIndex[row_idx]; | ||||
|  | ||||
|         unsigned int block = first_block + lane / (bs*bs); | ||||
|         double local_out = 0.0; | ||||
|  | ||||
|         if(lane < num_active_threads){ | ||||
|             if(lane < bs){ | ||||
|                 local_out = y[target_block_row*bs+lane]; | ||||
|                 local_out = y[row_idx*bs+lane]; | ||||
|             } | ||||
|             for(; block < last_block; block += num_blocks_per_warp){ | ||||
|                 const double x_elem = x[LUcols[block]*bs + c]; | ||||
| @@ -59,7 +63,7 @@ __kernel void ILU_apply1( | ||||
|         } | ||||
|  | ||||
|         if(lane < bs){ | ||||
|             const unsigned int row = target_block_row*bs + lane; | ||||
|             const unsigned int row = row_idx*bs + lane; | ||||
|             x[row] = tmp[lane]; | ||||
|         } | ||||
|  | ||||
|   | ||||
| @@ -3,6 +3,7 @@ | ||||
| /// Here, U is it's own BSR matrix. | ||||
| /// Only used with ChowPatelIlu. | ||||
| __kernel void ILU_apply2( | ||||
|     __global const unsigned *rowIndices, | ||||
|     __global const double *LUvals, | ||||
|     __global const int *LUcols, | ||||
|     __global const int *LUrows, | ||||
| @@ -29,14 +30,15 @@ __kernel void ILU_apply2( | ||||
|     const unsigned int r = lane % bs; | ||||
|  | ||||
|     while(target_block_row < nodesPerColorPrefix[color+1]){ | ||||
|         const unsigned int first_block = LUrows[target_block_row]; | ||||
|         const unsigned int last_block = LUrows[target_block_row+1]; | ||||
|         unsigned row_idx = rowIndices[target_block_row]; | ||||
|         const unsigned int first_block = LUrows[row_idx]; | ||||
|         const unsigned int last_block = LUrows[row_idx+1]; | ||||
|         unsigned int block = first_block + lane / (bs*bs); | ||||
|         double local_out = 0.0; | ||||
|  | ||||
|         if(lane < num_active_threads){ | ||||
|             if(lane < bs){ | ||||
|                 const unsigned int row = target_block_row*bs+lane; | ||||
|                 const unsigned int row = row_idx*bs+lane; | ||||
|                 local_out = x[row]; | ||||
|             } | ||||
|             for(; block < last_block; block += num_blocks_per_warp){ | ||||
| @@ -64,10 +66,10 @@ __kernel void ILU_apply2( | ||||
|             tmp[lane + bs*idx_t/warpsize] = local_out; | ||||
|             double sum = 0.0; | ||||
|             for(int i = 0; i < bs; ++i){ | ||||
|                 sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize]; | ||||
|                 sum += invDiagVals[row_idx*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize]; | ||||
|             } | ||||
|  | ||||
|             const unsigned int row = target_block_row*bs + lane; | ||||
|             const unsigned int row = row_idx*bs + lane; | ||||
|             x[row] = sum; | ||||
|         } | ||||
|  | ||||
|   | ||||
| @@ -3,6 +3,7 @@ | ||||
| /// Here, U is inside a normal, square matrix. | ||||
| /// In this case diagIndex indicates where the rows of U start. | ||||
| __kernel void ILU_apply2( | ||||
|     __global const unsigned *rowIndices, | ||||
|     __global const double *LUvals, | ||||
|     __global const int *LUcols, | ||||
|     __global const int *LUrows, | ||||
| @@ -30,14 +31,15 @@ __kernel void ILU_apply2( | ||||
|     target_block_row += nodesPerColorPrefix[color]; | ||||
|  | ||||
|     while(target_block_row < nodesPerColorPrefix[color+1]){ | ||||
|         const unsigned int first_block = diagIndex[target_block_row] + 1; | ||||
|         const unsigned int last_block = LUrows[target_block_row+1]; | ||||
|         unsigned row_idx = rowIndices[target_block_row]; | ||||
|         const unsigned int first_block = diagIndex[row_idx] + 1; | ||||
|         const unsigned int last_block = LUrows[row_idx+1]; | ||||
|         unsigned int block = first_block + lane / (bs*bs); | ||||
|         double local_out = 0.0; | ||||
|  | ||||
|         if(lane < num_active_threads){ | ||||
|             if(lane < bs){ | ||||
|                 const unsigned int row = target_block_row*bs+lane; | ||||
|                 const unsigned int row = row_idx*bs+lane; | ||||
|                 local_out = x[row]; | ||||
|             } | ||||
|             for(; block < last_block; block += num_blocks_per_warp){ | ||||
| @@ -65,10 +67,10 @@ __kernel void ILU_apply2( | ||||
|             tmp[lane + bs*idx_t/warpsize] = local_out; | ||||
|             double sum = 0.0; | ||||
|             for(int i = 0; i < bs; ++i){ | ||||
|                 sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize]; | ||||
|                 sum += invDiagVals[row_idx*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize]; | ||||
|             } | ||||
|  | ||||
|             const unsigned int row = target_block_row*bs + lane; | ||||
|             const unsigned int row = row_idx*bs + lane; | ||||
|             x[row] = sum; | ||||
|         } | ||||
|  | ||||
|   | ||||
| @@ -68,6 +68,7 @@ __kernel void inverter(__global double *matrix, __global double *inverse) | ||||
| /// The kernel takes a full BSR matrix and performs inplace ILU decomposition | ||||
| __kernel void ilu_decomp(const unsigned int firstRow, | ||||
|                          const unsigned int lastRow, | ||||
|                          __global const unsigned *rowIndices, | ||||
|                          __global double *LUvals, | ||||
|                          __global const int *LUcols, | ||||
|                          __global const int *LUrows, | ||||
| @@ -91,14 +92,15 @@ __kernel void ilu_decomp(const unsigned int firstRow, | ||||
|     // go through all rows | ||||
|     for (int i = firstRow + work_group_id * hwarps_per_group + hwarp_id_in_group; i < lastRow; i += num_groups * hwarps_per_group) | ||||
|     { | ||||
|         int iRowStart = LUrows[i]; | ||||
|         int iRowEnd = LUrows[i + 1]; | ||||
|         const unsigned row = rowIndices[i]; | ||||
|         int iRowStart = LUrows[row]; | ||||
|         int iRowEnd = LUrows[row + 1]; | ||||
|  | ||||
|         // go through all elements of the row | ||||
|         for (int ij = iRowStart; ij < iRowEnd; ij++) { | ||||
|             int j = LUcols[ij]; | ||||
|  | ||||
|             if (j < i) { | ||||
|             if (j < row) { | ||||
|                 // calculate the pivot of this row | ||||
|                 block_mult(LUvals + ij * bs * bs, invDiagVals + j * bs * bs, pivot + lmem_offset); | ||||
|  | ||||
| @@ -127,11 +129,11 @@ __kernel void ilu_decomp(const unsigned int firstRow, | ||||
|         } | ||||
|  | ||||
|         // store the inverse in the diagonal | ||||
|         inverter(LUvals + diagIndex[i] * bs * bs, invDiagVals + i * bs * bs); | ||||
|         inverter(LUvals + diagIndex[row] * bs * bs, invDiagVals + row * bs * bs); | ||||
|  | ||||
|         // copy inverse | ||||
|         if (thread_id_in_hwarp < bs * bs) { | ||||
|             LUvals[diagIndex[i] * bs * bs + thread_id_in_hwarp] = invDiagVals[i * bs * bs + thread_id_in_hwarp]; | ||||
|             LUvals[diagIndex[row] * bs * bs + thread_id_in_hwarp] = invDiagVals[row * bs * bs + thread_id_in_hwarp]; | ||||
|         } | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -390,7 +390,7 @@ void OpenclKernels::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& row | ||||
|     } | ||||
| } | ||||
|  | ||||
| void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, | ||||
| void OpenclKernels::ILU_apply1(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols, | ||||
|                                cl::Buffer& rows, cl::Buffer& diagIndex, | ||||
|                                const cl::Buffer& y, cl::Buffer& x, | ||||
|                                cl::Buffer& rowsPerColor, int color, | ||||
| @@ -403,8 +403,8 @@ void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, | ||||
|     Timer t_ilu_apply1; | ||||
|  | ||||
|     cl::Event event = (*ILU_apply1_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), | ||||
|                           vals, cols, rows, diagIndex, y, x, | ||||
|                           rowsPerColor, color, block_size, | ||||
|                           rowIndices, vals, cols, rows, diagIndex, | ||||
|                           y, x, rowsPerColor, color, block_size, | ||||
|                           cl::Local(lmem_per_work_group)); | ||||
|  | ||||
|     if (verbosity >= 5) { | ||||
| @@ -415,7 +415,7 @@ void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, | ||||
|     } | ||||
| } | ||||
|  | ||||
| void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols, | ||||
| void OpenclKernels::ILU_apply2(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols, | ||||
|                                cl::Buffer& rows, cl::Buffer& diagIndex, | ||||
|                                cl::Buffer& invDiagVals, cl::Buffer& x, | ||||
|                                cl::Buffer& rowsPerColor, int color, | ||||
| @@ -428,8 +428,8 @@ void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols, | ||||
|     Timer t_ilu_apply2; | ||||
|  | ||||
|     cl::Event event = (*ILU_apply2_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), | ||||
|                           vals, cols, rows, diagIndex, invDiagVals, | ||||
|                           x, rowsPerColor, color, block_size, | ||||
|                           rowIndices, vals, cols, rows, diagIndex, | ||||
|                           invDiagVals, x, rowsPerColor, color, block_size, | ||||
|                           cl::Local(lmem_per_work_group)); | ||||
|  | ||||
|     if (verbosity >= 5) { | ||||
| @@ -440,7 +440,7 @@ void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols, | ||||
|     } | ||||
| } | ||||
|  | ||||
| void OpenclKernels::ILU_decomp(int firstRow, int lastRow, | ||||
| void OpenclKernels::ILU_decomp(int firstRow, int lastRow, cl::Buffer& rowIndices, | ||||
|                                cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, | ||||
|                                cl::Buffer& diagIndex, cl::Buffer& invDiagVals, | ||||
|                                int rowsThisColor, unsigned int block_size) | ||||
| @@ -453,7 +453,8 @@ void OpenclKernels::ILU_decomp(int firstRow, int lastRow, | ||||
|     Timer t_ilu_decomp; | ||||
|  | ||||
|     cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), | ||||
|                           firstRow, lastRow, vals, cols, rows, | ||||
|                           firstRow, lastRow, rowIndices, | ||||
|                           vals, cols, rows, | ||||
|                           invDiagVals, diagIndex, rowsThisColor, | ||||
|                           cl::Local(lmem_per_work_group)); | ||||
|  | ||||
|   | ||||
| @@ -39,9 +39,9 @@ using residual_blocked_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, | ||||
|                                          cl::Buffer&, const cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>; | ||||
| using residual_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, | ||||
|                                          cl::Buffer&, const cl::Buffer&, cl::Buffer&, cl::LocalSpaceArg>; | ||||
| using ilu_apply1_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const cl::Buffer&, | ||||
| using ilu_apply1_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const cl::Buffer&, | ||||
|                                                cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>; | ||||
| using ilu_apply2_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, | ||||
| using ilu_apply2_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, | ||||
|                                                cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>; | ||||
| using stdwell_apply_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, | ||||
|                                                   cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, | ||||
| @@ -51,7 +51,7 @@ using stdwell_apply_no_reorder_kernel_type = cl::KernelFunctor<cl::Buffer&, cl:: | ||||
|                                                              cl::Buffer&, cl::Buffer&, cl::Buffer&, | ||||
|                                                              const unsigned int, const unsigned int, cl::Buffer&, | ||||
|                                                              cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>; | ||||
| using ilu_decomp_kernel_type = cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, | ||||
| using ilu_decomp_kernel_type = cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, | ||||
|                                                cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg>; | ||||
| using isaiL_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, | ||||
|                                   cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>; | ||||
| @@ -135,13 +135,13 @@ public: | ||||
|     static void spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, const cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size, bool reset = true, bool add = false); | ||||
|     static void residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, const cl::Buffer& rhs, cl::Buffer& out, int Nb, unsigned int block_size); | ||||
|  | ||||
|     static void ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, | ||||
|     static void ILU_apply1(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, | ||||
|         const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size); | ||||
|  | ||||
|     static void ILU_apply2(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, | ||||
|     static void ILU_apply2(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, | ||||
|         cl::Buffer& invDiagVals, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size); | ||||
|  | ||||
|     static void ILU_decomp(int firstRow, int lastRow, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, | ||||
|     static void ILU_decomp(int firstRow, int lastRow, cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, | ||||
|         cl::Buffer& diagIndex, cl::Buffer& invDiagVals, int Nb, unsigned int block_size); | ||||
|  | ||||
|     static void apply_stdwells_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl, | ||||
|   | ||||
| @@ -232,12 +232,6 @@ void openclSolverBackend<block_size>::setOpencl(std::shared_ptr<cl::Context>& co | ||||
|     queue = queue_; | ||||
| } | ||||
|  | ||||
| template <unsigned int block_size> | ||||
| openclSolverBackend<block_size>::~openclSolverBackend() { | ||||
|     finalize(); | ||||
| } | ||||
|  | ||||
|  | ||||
|  | ||||
| template <unsigned int block_size> | ||||
| void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) { | ||||
| @@ -464,7 +458,6 @@ void openclSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix> | ||||
|  | ||||
|         bool reorder = (opencl_ilu_reorder != ILUReorder::NONE); | ||||
|         if (reorder) { | ||||
|             rb = new double[N]; | ||||
|             d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb); | ||||
|         } | ||||
|  | ||||
| @@ -482,13 +475,6 @@ void openclSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix> | ||||
|     initialized = true; | ||||
| } // end initialize() | ||||
|  | ||||
| template <unsigned int block_size> | ||||
| void openclSolverBackend<block_size>::finalize() { | ||||
|     if (opencl_ilu_reorder != ILUReorder::NONE) { | ||||
|         delete[] rb; | ||||
|     } | ||||
| } // end finalize() | ||||
|  | ||||
| template <unsigned int block_size> | ||||
| void openclSolverBackend<block_size>::copy_system_to_gpu() { | ||||
|     Timer t; | ||||
| @@ -508,7 +494,7 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() { | ||||
|  | ||||
|     err |= queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, rmat->colIndices, nullptr, &events[1]); | ||||
|     err |= queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), rmat->rowPointers, nullptr, &events[2]); | ||||
|     err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb, nullptr, &events[3]); | ||||
|     err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, h_b, nullptr, &events[3]); | ||||
|     err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[4]); | ||||
|     if (opencl_ilu_reorder != ILUReorder::NONE) { | ||||
|         events.resize(6); | ||||
| @@ -546,7 +532,7 @@ void openclSolverBackend<block_size>::update_system_on_gpu() { | ||||
|     err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues, nullptr, &events[0]); | ||||
| #endif | ||||
|  | ||||
|     err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb, nullptr, &events[1]); | ||||
|     err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, h_b, nullptr, &events[1]); | ||||
|     err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[2]); | ||||
|     cl::WaitForEvents(events); | ||||
|     events.clear(); | ||||
| @@ -602,13 +588,8 @@ void openclSolverBackend<block_size>::update_system(double *vals, double *b, Wel | ||||
|     Timer t; | ||||
|  | ||||
|     mat->nnzValues = vals; | ||||
|     if (opencl_ilu_reorder != ILUReorder::NONE) { | ||||
|         reorderBlockedVectorByPattern<block_size>(mat->Nb, b, fromOrder, rb); | ||||
|         static_cast<WellContributionsOCL&>(wellContribs).setReordering(toOrder, true); | ||||
|     } else { | ||||
|         rb = b; | ||||
|         static_cast<WellContributionsOCL&>(wellContribs).setReordering(nullptr, false); | ||||
|     } | ||||
|     h_b = b; | ||||
|     static_cast<WellContributionsOCL&>(wellContribs).setReordering(nullptr, false); | ||||
|  | ||||
|     if (verbosity > 2) { | ||||
|         std::ostringstream out; | ||||
| @@ -670,12 +651,7 @@ template <unsigned int block_size> | ||||
| void openclSolverBackend<block_size>::get_result(double *x) { | ||||
|     Timer t; | ||||
|  | ||||
|     if (opencl_ilu_reorder != ILUReorder::NONE) { | ||||
|         queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, rb); | ||||
|         reorderBlockedVectorByPattern<block_size>(mat->Nb, rb, toOrder, x); | ||||
|     } else { | ||||
|         queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, x); | ||||
|     } | ||||
|     queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, x); | ||||
|  | ||||
|     if (verbosity > 2) { | ||||
|         std::ostringstream out; | ||||
|   | ||||
| @@ -51,7 +51,7 @@ class openclSolverBackend : public BdaSolver<block_size> | ||||
|     using Base::initialized; | ||||
|  | ||||
| private: | ||||
|     double *rb = nullptr;                 // reordered b vector, if the matrix is reordered, rb is newly allocated, otherwise it just points to b | ||||
|     double *h_b = nullptr;                // b vector, on host | ||||
|     std::vector<double> vals_contiguous;  // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp | ||||
|  | ||||
|     // OpenCL variables must be reusable, they are initialized in initialize() | ||||
| @@ -184,9 +184,6 @@ public: | ||||
|     /// For the CPR coarse solver | ||||
|     openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, ILUReorder opencl_ilu_reorder); | ||||
|  | ||||
|     /// Destroy a openclSolver, and free memory | ||||
|     ~openclSolverBackend(); | ||||
|  | ||||
|     /// Solve linear system, A*x = b, matrix A must be in blocked-CSR format | ||||
|     /// \param[in] matrix         matrix A | ||||
|     /// \param[in] b              input vector, contains N values | ||||
|   | ||||
		Reference in New Issue
	
	Block a user