added: strip along the optimal direction + spread remainder over several threads when possible
git-svn-id: http://svn.sintef.no/trondheim/IFEM/trunk@1445 e10b68d5-8a6e-419e-a041-bce267b0401d
This commit is contained in:
parent
7789611606
commit
6d49692360
@ -295,7 +295,7 @@ size_t utl::find_closest (const std::vector<real>& a, real v)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void utl::calcThreadGroups(int nel1, int nel2, utl::ThreadGroups& result)
|
void utl::calcThreadGroups(int nel1, int nel2, ThreadGroups& result)
|
||||||
{
|
{
|
||||||
int threads=1;
|
int threads=1;
|
||||||
int groups=1;
|
int groups=1;
|
||||||
@ -306,17 +306,33 @@ void utl::calcThreadGroups(int nel1, int nel2, utl::ThreadGroups& result)
|
|||||||
if (threads > 1)
|
if (threads > 1)
|
||||||
groups = 2;
|
groups = 2;
|
||||||
|
|
||||||
stripsize = nel1/(groups*threads);
|
int dir, els, mul;
|
||||||
if (stripsize < 2) {
|
int s1 = nel1/(groups*threads);
|
||||||
|
int s2 = nel2/(groups*threads);
|
||||||
|
int r1 = nel1-(s1*groups*threads);
|
||||||
|
int r2 = nel2-(s2*groups*threads);
|
||||||
|
if (r1*nel2 < r2*nel1) {
|
||||||
|
stripsize = s1;
|
||||||
|
dir = 0;
|
||||||
|
els = nel1;
|
||||||
|
mul = 1;
|
||||||
|
} else {
|
||||||
|
stripsize = s2;
|
||||||
|
els = nel2;
|
||||||
|
dir = 1;
|
||||||
|
mul = nel1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (stripsize < 2 && groups > 1) {
|
||||||
std::cerr << __FUNCTION__ << ": Warning: too many threads available." << std::endl
|
std::cerr << __FUNCTION__ << ": Warning: too many threads available." << std::endl
|
||||||
<< "Reducing to a suitable amount" << std::endl;
|
<< "Reducing to a suitable amount" << std::endl;
|
||||||
while (((stripsize = nel1/(groups*threads)) < 2) && threads > 1)
|
while (((stripsize = els/(groups*threads)) < 2) && threads > 1)
|
||||||
threads--;
|
threads--;
|
||||||
if (threads == 1)
|
if (threads == 1)
|
||||||
groups=1;
|
groups=1;
|
||||||
stripsize = nel1/(groups*threads);
|
stripsize = els/(groups*threads);
|
||||||
}
|
}
|
||||||
remainder = nel1-(stripsize*groups*threads);
|
remainder = els-(stripsize*groups*threads);
|
||||||
#endif
|
#endif
|
||||||
result.resize(groups);
|
result.resize(groups);
|
||||||
|
|
||||||
@ -325,7 +341,7 @@ void utl::calcThreadGroups(int nel1, int nel2, utl::ThreadGroups& result)
|
|||||||
std::cout << "nel1 " << nel1 << std::endl;
|
std::cout << "nel1 " << nel1 << std::endl;
|
||||||
std::cout << "nel2 " << nel2 << std::endl;
|
std::cout << "nel2 " << nel2 << std::endl;
|
||||||
std::cout << "stripsize " << stripsize << std::endl;
|
std::cout << "stripsize " << stripsize << std::endl;
|
||||||
std::cout << "# of strips " << nel1/stripsize << std::endl;
|
std::cout << "# of strips " << els/stripsize << std::endl;
|
||||||
std::cout << "remainder " << remainder << std::endl;
|
std::cout << "remainder " << remainder << std::endl;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -334,16 +350,36 @@ void utl::calcThreadGroups(int nel1, int nel2, utl::ThreadGroups& result)
|
|||||||
for (int i=0;i<nel1*nel2;++i)
|
for (int i=0;i<nel1*nel2;++i)
|
||||||
result[0][0].push_back(i);
|
result[0][0].push_back(i);
|
||||||
} else {
|
} else {
|
||||||
|
std::vector< std::vector<int> > stripsizes;
|
||||||
|
stripsizes.resize(2);
|
||||||
|
stripsizes[0].resize(threads,stripsize);
|
||||||
|
stripsizes[1].resize(threads,stripsize);
|
||||||
|
int r=0;
|
||||||
|
for (int i=0;i<remainder && r < remainder;++i) {
|
||||||
|
stripsizes[1][threads-1-i]++;
|
||||||
|
r++;
|
||||||
|
if (r < remainder) {
|
||||||
|
stripsizes[0][threads-1-i]++;
|
||||||
|
r++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
std::vector< std::vector<int> > startelms;
|
||||||
|
startelms.resize(2);
|
||||||
|
int offs=0;
|
||||||
|
for (int i=0;i<threads;++i) {
|
||||||
|
startelms[0].push_back(offs*mul);
|
||||||
|
offs += stripsizes[0][i];
|
||||||
|
startelms[1].push_back(offs*mul);
|
||||||
|
offs += stripsizes[1][i];
|
||||||
|
}
|
||||||
for (size_t g=0;g<result.size();++g) { // loop over groups
|
for (size_t g=0;g<result.size();++g) { // loop over groups
|
||||||
result[g].resize(threads);
|
result[g].resize(threads);
|
||||||
for (int t=0;t<threads;++t) { // loop over threads
|
for (int t=0;t<threads;++t) { // loop over threads
|
||||||
size_t startel = g*stripsize+result.size()*t*stripsize;
|
int maxx = dir==0?stripsizes[g][t]:nel1;
|
||||||
int curstripsize = stripsize;
|
int maxy = dir==1?stripsizes[g][t]:nel2;
|
||||||
if (t == threads-1 && g == result.size()-1)
|
for (int i2=0; i2 < maxy; ++i2) { // loop in y direction
|
||||||
curstripsize += remainder;
|
for (int i1=0;i1<maxx; ++i1) {
|
||||||
for (int i2=0; i2 < nel2; ++i2) { // loop in y direction
|
int iEl = startelms[g][t]+i1+i2*nel1;
|
||||||
for (int i1=0;i1<curstripsize; ++i1) {
|
|
||||||
int iEl = startel+i1+i2*nel1;
|
|
||||||
result[g][t].push_back(iEl);
|
result[g][t].push_back(iEl);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -376,17 +412,43 @@ void utl::calcThreadGroups(int nel1, int nel2, int nel3, ThreadGroups& result)
|
|||||||
if (threads > 1)
|
if (threads > 1)
|
||||||
groups = 2;
|
groups = 2;
|
||||||
|
|
||||||
stripsize = nel1/(groups*threads);
|
int dir, els, mul;
|
||||||
if (stripsize < 2) {
|
int s1 = nel1/(groups*threads);
|
||||||
|
int s2 = nel2/(groups*threads);
|
||||||
|
int s3 = nel3/(groups*threads);
|
||||||
|
int r1 = nel1-(s1*groups*threads);
|
||||||
|
int r2 = nel2-(s2*groups*threads);
|
||||||
|
int r3 = nel3-(s3*groups*threads);
|
||||||
|
if (r1*nel2*nel3 < r2*nel1*nel2 && r1*nel2*nel3 < r3*nel1*nel2 ) {
|
||||||
|
// strips along x axis
|
||||||
|
stripsize = s1;
|
||||||
|
dir = 0;
|
||||||
|
els = nel1;
|
||||||
|
mul = 1;
|
||||||
|
} else if (r2*nel1*nel3 < r1*nel2*nel3 && r2*nel1*nel3 < r3*nel1*nel2 ) {
|
||||||
|
// strips along y axis
|
||||||
|
stripsize = s2;
|
||||||
|
els = nel2;
|
||||||
|
dir = 1;
|
||||||
|
mul = nel1;
|
||||||
|
} else {
|
||||||
|
// strips along z axis
|
||||||
|
stripsize = s3;
|
||||||
|
els = nel3;
|
||||||
|
dir = 2;
|
||||||
|
mul = nel1*nel2;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (stripsize < 2 && groups > 1) {
|
||||||
std::cerr << __FUNCTION__ << ": Warning: too many threads available." << std::endl
|
std::cerr << __FUNCTION__ << ": Warning: too many threads available." << std::endl
|
||||||
<< "Reducing to a suitable amount" << std::endl;
|
<< "Reducing to a suitable amount" << std::endl;
|
||||||
while ((stripsize = nel1/(groups*threads)) < 2 && threads > 1)
|
while (((stripsize = els/(groups*threads)) < 2) && threads > 1)
|
||||||
threads--;
|
threads--;
|
||||||
if (threads == 1)
|
if (threads == 1)
|
||||||
groups=1;
|
groups=1;
|
||||||
stripsize = nel1/(groups*threads);
|
stripsize = els/(groups*threads);
|
||||||
}
|
}
|
||||||
remainder = nel1-(stripsize*groups*threads);
|
remainder = els-(stripsize*groups*threads);
|
||||||
#endif
|
#endif
|
||||||
result.resize(groups);
|
result.resize(groups);
|
||||||
|
|
||||||
@ -396,22 +458,49 @@ void utl::calcThreadGroups(int nel1, int nel2, int nel3, ThreadGroups& result)
|
|||||||
std::cout << "nel2 " << nel2 << std::endl;
|
std::cout << "nel2 " << nel2 << std::endl;
|
||||||
std::cout << "nel3 " << nel3 << std::endl;
|
std::cout << "nel3 " << nel3 << std::endl;
|
||||||
std::cout << "stripsize " << stripsize << std::endl;
|
std::cout << "stripsize " << stripsize << std::endl;
|
||||||
std::cout << "# of strips " << (stripsize?nel1/stripsize:0) << std::endl;
|
std::cout << "# of strips " << els/stripsize << std::endl;
|
||||||
std::cout << "remainder " << remainder << std::endl;
|
std::cout << "remainder " << remainder << std::endl;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
for (size_t g=0;g<result.size();++g) { // loop over groups
|
if (groups == 1) {
|
||||||
result[g].resize(threads);
|
result[0].resize(1);
|
||||||
for (int t=0;t<threads;++t) { // loop over threads
|
for (int i=0;i<nel1*nel2*nel3;++i)
|
||||||
size_t startel = g*stripsize+result.size()*t*stripsize;
|
result[0][0].push_back(i);
|
||||||
int curstripsize = stripsize;
|
} else {
|
||||||
if (t == threads-1 && g == result.size()-1)
|
std::vector< std::vector<int> > stripsizes;
|
||||||
curstripsize += remainder;
|
stripsizes.resize(2);
|
||||||
for (int i2=0; i2 < nel2; ++i2) { // loop in y direction
|
stripsizes[0].resize(threads,stripsize);
|
||||||
for (int i3=0; i3 < nel3; ++i3) {
|
stripsizes[1].resize(threads,stripsize);
|
||||||
for (int i1=0;i1<curstripsize; ++i1) {
|
int r=0;
|
||||||
int iEl = startel+i1+i3*nel1*nel2+i2*nel1;
|
for (int i=0;i<remainder && r < remainder;++i) {
|
||||||
result[g][t].push_back(iEl);
|
stripsizes[1][threads-1-i]++;
|
||||||
|
r++;
|
||||||
|
if (r < remainder) {
|
||||||
|
stripsizes[0][threads-1-i]++;
|
||||||
|
r++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
std::vector< std::vector<int> > startelms;
|
||||||
|
startelms.resize(2);
|
||||||
|
int offs=0;
|
||||||
|
for (int i=0;i<threads;++i) {
|
||||||
|
startelms[0].push_back(offs*mul);
|
||||||
|
offs += stripsizes[0][i];
|
||||||
|
startelms[1].push_back(offs*mul);
|
||||||
|
offs += stripsizes[1][i];
|
||||||
|
}
|
||||||
|
for (size_t g=0;g<result.size();++g) { // loop over groups
|
||||||
|
result[g].resize(threads);
|
||||||
|
for (int t=0;t<threads;++t) { // loop over threads
|
||||||
|
int maxx = dir==0?stripsizes[g][t]:nel1;
|
||||||
|
int maxy = dir==1?stripsizes[g][t]:nel2;
|
||||||
|
int maxz = dir==2?stripsizes[g][t]:nel3;
|
||||||
|
for (int i3=0; i3 < maxz; ++i3) {
|
||||||
|
for (int i2=0; i2 < maxy; ++i2) { // loop in y direction
|
||||||
|
for (int i1=0; i1< maxx; ++i1) {
|
||||||
|
int iEl = startelms[g][t]+i1+i2*nel1+i3*nel1*nel2;
|
||||||
|
result[g][t].push_back(iEl);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user