diff --git a/src/Utility/Test/TestThreadGroups.C b/src/Utility/Test/TestThreadGroups.C index fad8cf4e..2bf0bfbe 100644 --- a/src/Utility/Test/TestThreadGroups.C +++ b/src/Utility/Test/TestThreadGroups.C @@ -62,6 +62,41 @@ TEST(TestThreadGroups, Groups2D) CHECK_INTMATRICES_EQUAL(groups[0], "src/Utility/Test/refdata/ThreadGroups_2D_1.ref"); #endif + std::vector b14(4, true); + std::vector b24(4, true); + + for (int i = 0; i < 2; ++i) { + ThreadGroups group((ThreadGroups::StripDirection)i); + group.calcGroups(b14, b24, 1, 1); +#ifdef USE_OPENMP + const int ref1[4][4] = {{0,4,8,12}, {2,6,10,14}, + {0,1,2, 3}, {8,9,10,11}}; + + const int ref2[4][4] = {{1,5,9,13}, { 3, 7,11,15}, + {4,5,6, 7}, {12,13,14,15}}; + + ASSERT_EQ(group.size(), 2U); + ASSERT_EQ(group[0].size(), 2U); + ASSERT_EQ(group[1].size(), 2U); + ASSERT_EQ(group[0][0].size(), 4U); + ASSERT_EQ(group[0][1].size(), 4U); + ASSERT_EQ(group[1][0].size(), 4U); + ASSERT_EQ(group[1][1].size(), 4U); + for (size_t j = 0; j < 4; ++j) { + ASSERT_EQ(group[0][0][j], ref1[i*2][j]); + ASSERT_EQ(group[0][1][j], ref1[i*2+1][j]); + ASSERT_EQ(group[1][0][j], ref2[i*2][j]); + ASSERT_EQ(group[1][1][j], ref2[i*2+1][j]); + } +#else + ASSERT_EQ(group.size(), 1U); + ASSERT_EQ(group[0].size(), 1U); + ASSERT_EQ(group[0][0].size(), 16U); + for (int j = 0; j < 16; ++j) + ASSERT_EQ(group[0][0][j], j); +#endif + } + #ifdef USE_OPENMP omp_set_num_threads(3); #endif diff --git a/src/Utility/ThreadGroups.C b/src/Utility/ThreadGroups.C index c5eb0c9c..3d8a0de2 100644 --- a/src/Utility/ThreadGroups.C +++ b/src/Utility/ThreadGroups.C @@ -13,9 +13,7 @@ #include "ThreadGroups.h" #include -#if SP_DEBUG > 1 #include -#endif #ifdef USE_OPENMP #include #endif @@ -33,33 +31,32 @@ void ThreadGroups::oneGroup (size_t nel) void ThreadGroups::calcGroups (const BoolVec& el1, const BoolVec& el2, int p1, int p2) { +#ifndef USE_OPENMP + this->oneGroup(el1.size()*el2.size()); +#else // Count the non-zero element in each direction, the zero-span elements // should not affect the partitioning as they don't involve any work - size_t i, nel1 = 0, nel2 = 0; - for (i = 0; i < el1.size(); i++) - if (el1[i]) nel1++; - for (i = 0; i < el2.size(); i++) - if (el2[i]) nel2++; + size_t nel1 = 0, nel2 = 0; + for (bool e : el1) if (e) nel1++; + for (bool e : el2) if (e) nel2++; - int threads=1; - int stripsize=0; - int remainder=0; - int dir=0, mul=1; -#ifdef USE_OPENMP - threads = omp_get_max_threads(); + int threads = omp_get_max_threads(); int parts = threads > 1 ? 2*threads : 1; - dir = getStripDirection(nel1,nel2,parts); - mul = dir == 0 ? 1 : el1.size(); - int els = dir == 0 ? nel1 : nel2; + if (stripDir == ANY) + stripDir = getStripDirection(nel1,nel2,parts); + int mul = stripDir == U ? 1 : el1.size(); + int els = stripDir == U ? nel1 : nel2; - // The minimum strip size (with) depends on the polynomial degree + // The minimum strip size (width) depends on the polynomial degree // due to the overlapping support of the splines basis functions - int minsize = dir == 0 ? p1 : p2; + int stripsize = 0; + int minsize = stripDir == U ? p1 : p2; while (threads > 1 && (stripsize = els/parts) < minsize) { threads --; parts -= 2; } + int remainder = 0; if (threads > 1) remainder = els - stripsize*parts; else @@ -71,7 +68,6 @@ void ThreadGroups::calcGroups (const BoolVec& el1, const BoolVec& el2, << "\nstripsize " << stripsize << "\n# of strips " << els/stripsize << "\nremainder " << remainder << std::endl; -#endif #endif nel1 = el1.size(); @@ -82,7 +78,7 @@ void ThreadGroups::calcGroups (const BoolVec& el1, const BoolVec& el2, { int i, j, t, zspan, offs = 0; IntVec stripsizes[2], startelms[2]; - const BoolVec& elz = dir == 0 ? el1 : el2; + const BoolVec& elz = stripDir == U ? el1 : el2; stripsizes[0].resize(threads,stripsize); stripsizes[1].resize(threads,stripsize); @@ -112,45 +108,41 @@ void ThreadGroups::calcGroups (const BoolVec& el1, const BoolVec& el2, for (i = 0; i < 2; ++i) { // loop over groups tg[i].resize(threads); for (int t = 0; t < threads; ++t) { // loop over threads - int maxx = dir == 0 ? stripsizes[i][t] : nel1; - int maxy = dir == 1 ? stripsizes[i][t] : nel2; + int maxx = stripDir == U ? stripsizes[i][t] : nel1; + int maxy = stripDir == V ? stripsizes[i][t] : nel2; tg[i][t].reserve(maxx*maxy); for (int i2 = 0; i2 < maxy; ++i2) for (int i1 = 0; i1 < maxx; ++i1) tg[i][t].push_back(startelms[i][t]+i1+i2*nel1); } -#if defined(USE_OPENMP) && SP_DEBUG > 1 - std::cout << "group " << i << std::endl; - for (size_t j = 0; j < tg[i].size(); ++j) { - std::cout << "\t thread " << j << ": "; - for (size_t k = 0; k < tg[i][j].size(); ++k) - std::cout << tg[i][j][k] << " "; - std::cout << std::endl; - } +#if SP_DEBUG > 1 + printGroup(tg[i],i); #endif } } +#endif } void ThreadGroups::calcGroups (int nel1, int nel2, int minsize) { - int threads=1; - int stripsize=0; - int remainder=0; - int dir=0, mul=1; -#ifdef USE_OPENMP - threads = omp_get_max_threads(); +#ifndef USE_OPENMP + this->oneGroup(nel1*nel2); +#else + int threads = omp_get_max_threads(); int parts = threads > 1 ? 2*threads : 1; - dir = getStripDirection(nel1,nel2,parts); - mul = dir == 0 ? 1 : nel1; - int els = dir == 0 ? nel1 : nel2; + if (stripDir == ANY) + stripDir = getStripDirection(nel1,nel2,parts); + int mul = stripDir == U ? 1 : nel1; + int els = stripDir == U ? nel1 : nel2; + int stripsize = 0; while (threads > 1 && (stripsize = els/parts) < minsize) { threads --; parts -= 2; } + int remainder = 0; if (threads > 1) remainder = els - stripsize*parts; else @@ -162,7 +154,6 @@ void ThreadGroups::calcGroups (int nel1, int nel2, int minsize) << "\nstripsize " << stripsize << "\n# of strips " << els/stripsize << "\nremainder " << remainder << std::endl; -#endif #endif if (threads == 1) @@ -187,77 +178,51 @@ void ThreadGroups::calcGroups (int nel1, int nel2, int minsize) for (i = 0; i < 2; ++i) { // loop over groups tg[i].resize(threads); for (int t = 0; t < threads; ++t) { // loop over threads - int maxx = dir == 0 ? stripsizes[i][t] : nel1; - int maxy = dir == 1 ? stripsizes[i][t] : nel2; + int maxx = stripDir == U ? stripsizes[i][t] : nel1; + int maxy = stripDir == V ? stripsizes[i][t] : nel2; for (int i2 = 0; i2 < maxy; ++i2) for (int i1 = 0; i1 < maxx; ++i1) tg[i][t].push_back(startelms[i][t]+i1+i2*nel1); } #if defined(USE_OPENMP) && SP_DEBUG > 1 - std::cout << "group " << i << std::endl; - for (size_t j = 0; j < tg[i].size(); ++j) { - std::cout << "\t thread " << j << ": "; - for (size_t k = 0; k < tg[i][j].size(); ++k) - std::cout << tg[i][j][k] << " "; - std::cout << std::endl; - } + printGroup(tg[i],i); #endif } } -} - - -int ThreadGroups::getStripDirection (int nel1, int nel2, int parts) -{ - int s1 = nel1 / parts; - int s2 = nel2 / parts; - int r1 = nel1 - s1*parts; - int r2 = nel2 - s2*parts; - - if (r1*nel2 < r2*nel1) - return 0; // strips in u-direction - else if (r1*nel2 > r2*nel1) - return 1; // strips in v-direction - - if (nel1 > nel2) - return 0; // strips in u-direction - else - return 1; // strips in v-direction +#endif } void ThreadGroups::calcGroups (const BoolVec& el1, const BoolVec& el2, const BoolVec& el3, int p1, int p2, int p3) { +#ifndef USE_OPENMP + this->oneGroup(el1.size()*el2.size()*el3.size()); +#else // Count the non-zero element in each direction, the zero-span elements // should not affect the partitioning as they don't involve any work - size_t i, nel1 = 0, nel2 = 0, nel3 = 0; - for (i = 0; i < el1.size(); i++) - if (el1[i]) nel1++; - for (i = 0; i < el2.size(); i++) - if (el2[i]) nel2++; - for (i = 0; i < el3.size(); i++) - if (el3[i]) nel3++; + size_t nel1 = 0, nel2 = 0, nel3 = 0; + for (bool e : el1) if (e) nel1++; + for (bool e : el2) if (e) nel2++; + for (bool e : el3) if (e) nel3++; - int threads=1; - int stripsize=0; - int remainder=0; - int dir=0, mul=1; -#ifdef USE_OPENMP - threads = omp_get_max_threads(); + int threads = omp_get_max_threads(); int parts = threads > 1 ? 2*threads : 1; - dir = getStripDirection(nel1,nel2,nel3,parts); - mul = dir == 0 ? 1 : el1.size()*(dir == 1 ? 1 : el2.size()); - int els = dir == 0 ? nel1 : (dir == 1 ? nel2 : nel3); + if (stripDir == ANY) + stripDir = getStripDirection(nel1,nel2,nel3,parts); + int mul = stripDir == U ? 1 : el1.size()*(stripDir == V ? 1 : el2.size()); + int els = stripDir == U ? nel1 : (stripDir == V ? nel2 : nel3); - // The minimum strip size (with) depends on the polynomial degree + // The minimum strip size (width) depends on the polynomial degree // due to the overlapping support of the splines basis functions - int minsize = dir == 0 ? p1 : (dir == 1 ? p2 : p3); + int stripsize = 0; + int minsize = stripDir == U ? p1 : (stripDir == V ? p2 : p3); while (threads > 1 && (stripsize = els/parts) < minsize) { threads --; parts -= 2; } + int remainder = 0; if (threads > 1) remainder = els - stripsize*parts; else @@ -269,7 +234,6 @@ void ThreadGroups::calcGroups (const BoolVec& el1, const BoolVec& el2, << "\nstripsize " << stripsize << "\n# of strips " << els/stripsize << "\nremainder " << remainder << std::endl; -#endif #endif nel1 = el1.size(); @@ -281,7 +245,7 @@ void ThreadGroups::calcGroups (const BoolVec& el1, const BoolVec& el2, { int i, j, t, zspan, offs = 0; IntVec stripsizes[2], startelms[2]; - const BoolVec& elz = dir == 0 ? el1 : (dir == 1 ? el2 : el3); + const BoolVec& elz = stripDir == U ? el1 : (stripDir == V ? el2 : el3); stripsizes[0].resize(threads,stripsize); stripsizes[1].resize(threads,stripsize); @@ -311,9 +275,9 @@ void ThreadGroups::calcGroups (const BoolVec& el1, const BoolVec& el2, for (i = 0; i < 2; ++i) { // loop over groups tg[i].resize(threads); for (int t = 0; t < threads; ++t) { // loop over threads - int maxx = dir == 0 ? stripsizes[i][t] : nel1; - int maxy = dir == 1 ? stripsizes[i][t] : nel2; - int maxz = dir == 2 ? stripsizes[i][t] : nel3; + int maxx = stripDir == U ? stripsizes[i][t] : nel1; + int maxy = stripDir == V ? stripsizes[i][t] : nel2; + int maxz = stripDir == W ? stripsizes[i][t] : nel3; tg[i][t].reserve(maxx*maxy*maxz); for (int i3 = 0; i3 < maxz; ++i3) for (int i2 = 0; i2 < maxy; ++i2) @@ -321,38 +285,34 @@ void ThreadGroups::calcGroups (const BoolVec& el1, const BoolVec& el2, tg[i][t].push_back(startelms[i][t]+i1+nel1*(i2+nel2*i3)); } -#if defined(USE_OPENMP) && SP_DEBUG > 1 - std::cout << "group " << i << std::endl; - for (size_t j = 0; j < tg[i].size(); ++j) { - std::cout << "\t thread " << j << " (" << tg[i][j].size() << "): "; - for (size_t k = 0; k < tg[i][j].size(); ++k) - std::cout << tg[i][j][k] << " "; - std::cout << std::endl; - } +#if SP_DEBUG > 1 + printGroup(tg[i],i); #endif } } +#endif } void ThreadGroups::calcGroups (int nel1, int nel2, int nel3, int minsize) { - int threads=1; - int stripsize=0; - int remainder=0; - int i, dir=0, mul=1; -#ifdef USE_OPENMP - threads = omp_get_max_threads(); +#ifndef USE_OPENMP + this->oneGroup(nel1*nel2*nel3); +#else + int threads = omp_get_max_threads(); int parts = threads > 1 ? 2*threads : 1; - dir = getStripDirection(nel1,nel2,nel3,parts); - mul = dir == 0 ? 1 : nel1*(dir == 1 ? 1 : nel2); - int els = dir == 0 ? nel1 : (dir == 1 ? nel2 : nel3); + if (stripDir == ANY) + stripDir = getStripDirection(nel1,nel2,nel3,parts); + int mul = stripDir == U ? 1 : nel1*(stripDir == V ? 1 : nel2); + int els = stripDir == U ? nel1 : (stripDir == V ? nel2 : nel3); + int stripsize = 0; while (threads > 1 && (stripsize = els/parts) < minsize) { threads --; parts -= 2; } + int remainder = 0; if (threads > 1) remainder = els - stripsize*parts; else @@ -364,13 +324,13 @@ void ThreadGroups::calcGroups (int nel1, int nel2, int nel3, int minsize) << "\nstripsize " << stripsize << "\n# of strips " << els/stripsize << "\nremainder " << remainder << std::endl; -#endif #endif if (threads == 1) this->oneGroup(nel1*nel2*nel3); else { + int offs, i; IntVec stripsizes[2]; stripsizes[0].resize(threads,stripsize); stripsizes[1].resize(threads,stripsize); @@ -378,7 +338,7 @@ void ThreadGroups::calcGroups (int nel1, int nel2, int nel3, int minsize) stripsizes[i%2][threads-(i+1)/2]++; IntVec startelms[2]; - for (int offs = i = 0; i < threads; ++i) { + for (offs = i = 0; i < threads; ++i) { startelms[0].push_back(offs*mul); offs += stripsizes[0][i]; startelms[1].push_back(offs*mul); @@ -388,9 +348,9 @@ void ThreadGroups::calcGroups (int nel1, int nel2, int nel3, int minsize) for (i = 0; i < 2; ++i) { // loop over groups tg[i].resize(threads); for (int t = 0; t < threads; ++t) { // loop over threads - int maxx = dir == 0 ? stripsizes[i][t] : nel1; - int maxy = dir == 1 ? stripsizes[i][t] : nel2; - int maxz = dir == 2 ? stripsizes[i][t] : nel3; + int maxx = stripDir == U ? stripsizes[i][t] : nel1; + int maxy = stripDir == V ? stripsizes[i][t] : nel2; + int maxz = stripDir == W ? stripsizes[i][t] : nel3; for (int i3 = 0; i3 < maxz; ++i3) for (int i2 = 0; i2 < maxy; ++i2) for (int i1 = 0; i1 < maxx; ++i1) @@ -398,20 +358,36 @@ void ThreadGroups::calcGroups (int nel1, int nel2, int nel3, int minsize) } #if defined(USE_OPENMP) && SP_DEBUG > 1 - std::cout << "group " << i << std::endl; - for (size_t j = 0; j < tg[i].size(); ++j) { - std::cout << "\t thread " << j << " (" << tg[i][j].size() << "): "; - for (size_t k = 0; k < tg[i][j].size(); ++k) - std::cout << tg[i][j][k] << " "; - std::cout << std::endl; - } + printGroup(tg[i],i); #endif } } +#endif } -int ThreadGroups::getStripDirection (int nel1, int nel2, int nel3, int parts) +ThreadGroups::StripDirection ThreadGroups::getStripDirection (int nel1, + int nel2, + int parts) +{ + int s1 = nel1 / parts; + int s2 = nel2 / parts; + int r1 = nel1 - s1*parts; + int r2 = nel2 - s2*parts; + + if (r1*nel2 < r2*nel1) + return U; + else if (r1*nel2 > r2*nel1) + return V; + + return nel1 > nel2 ? U : V; +} + + +ThreadGroups::StripDirection ThreadGroups::getStripDirection (int nel1, + int nel2, + int nel3, + int parts) { int s1 = nel1 / parts; int s2 = nel2 / parts; @@ -421,27 +397,27 @@ int ThreadGroups::getStripDirection (int nel1, int nel2, int nel3, int parts) int r3 = nel3 - s3*parts; if (r1*nel2*nel3 < nel1*r2*nel3 && r1*nel2*nel3 < nel1*nel2*r3) - return 0; // strips along x axis + return U; else if (nel1*r2*nel3 < r1*nel2*nel3 && nel1*r2*nel3 < nel1*nel2*r3) - return 1; // strips along y axis + return V; else if (nel1*nel2*r3 < r1*nel2*nel3 && nel1*nel2*r3 < nel1*r2*nel3) - return 2; // strips along z axis + return W; // The number of left-over elements is not smallest in one direction only if (r1*nel2*nel3 > nel1*r2*nel3) - return nel2 > nel3 ? 1 : 2; + return nel2 > nel3 ? V : W; else if (nel1*r2*nel3 > nel1*nel2*r3) - return nel1 > nel3 ? 0 : 2; + return nel1 > nel3 ? U : W; else if (nel1*nel2*r3 > r1*nel2*nel3) - return nel1 > nel2 ? 0 : 1; + return nel1 > nel2 ? U : V; // The number of left-over elements is the same in all three directions if (nel1 >= nel2 && nel1 >= nel3) - return 0; + return U; else if (nel2 >= nel1 && nel2 >= nel3) - return 1; + return V; else - return 2; + return W; } @@ -452,3 +428,15 @@ void ThreadGroups::applyMap (const IntVec& map) for (size_t j = 0; j < tg[l][k].size(); ++j) tg[l][k][j] = map[tg[l][k][j]]; } + + +void ThreadGroups::printGroup (const IntMat& group, int g) +{ + std::cout <<"group "<< g; + for (size_t t = 0; t < group.size(); t++) + { + std::cout <<"\n\t thread "<< t <<" ("<< group[t].size() <<"):"; + for (int e : group[t]) std::cout <<" "<< e; + } + std::cout << std::endl; +} diff --git a/src/Utility/ThreadGroups.h b/src/Utility/ThreadGroups.h index fcc37647..f6c00abd 100644 --- a/src/Utility/ThreadGroups.h +++ b/src/Utility/ThreadGroups.h @@ -29,6 +29,12 @@ class ThreadGroups typedef std::vector IntMat; //!< Element lists for all threads public: + //! Directions to consider for element stripes. + enum StripDirection { U, V, W, ANY }; + + //! \brief Default constructor. + ThreadGroups(StripDirection dir = ANY) : stripDir(dir) {} + //! \brief Calculates a 2D thread group partitioning based on strips. //! \param[in] el1 Flags non-zero knot spans in first parameter direction //! \param[in] el2 Flags non-zero knot spans in second parameter direction @@ -74,9 +80,17 @@ public: protected: //! \brief Calculates the parameter direction of the treading strips in 2D. - static int getStripDirection(int nel1, int nel2, int parts); + static StripDirection getStripDirection(int nel1, int nel2, + int parts); //! \brief Calculates the parameter direction of the treading strips in 3D. - static int getStripDirection(int nel1, int nel2, int nel3, int parts); + static StripDirection getStripDirection(int nel1, int nel2, int nel3, + int parts); + + //! \brief Prints out a threading group definition. + static void printGroup(const IntMat& group, int g); + +public: + StripDirection stripDir; //!< Actual direction to split elements private: IntMat tg[2]; //!< Threading groups (always two, but the second may be empty)