added: strip along the optimal direction, and spread the remainder over several threads when possible

git-svn-id: http://svn.sintef.no/trondheim/IFEM/trunk@1445 e10b68d5-8a6e-419e-a041-bce267b0401d
This commit is contained in:
akva 2012-02-01 13:18:07 +00:00 committed by Knut Morten Okstad
parent 7789611606
commit 6d49692360

View File

@ -295,7 +295,7 @@ size_t utl::find_closest (const std::vector<real>& a, real v)
}
void utl::calcThreadGroups(int nel1, int nel2, utl::ThreadGroups& result)
void utl::calcThreadGroups(int nel1, int nel2, ThreadGroups& result)
{
int threads=1;
int groups=1;
@ -306,17 +306,33 @@ void utl::calcThreadGroups(int nel1, int nel2, utl::ThreadGroups& result)
if (threads > 1)
groups = 2;
stripsize = nel1/(groups*threads);
if (stripsize < 2) {
int dir, els, mul;
int s1 = nel1/(groups*threads);
int s2 = nel2/(groups*threads);
int r1 = nel1-(s1*groups*threads);
int r2 = nel2-(s2*groups*threads);
if (r1*nel2 < r2*nel1) {
stripsize = s1;
dir = 0;
els = nel1;
mul = 1;
} else {
stripsize = s2;
els = nel2;
dir = 1;
mul = nel1;
}
if (stripsize < 2 && groups > 1) {
std::cerr << __FUNCTION__ << ": Warning: too many threads available." << std::endl
<< "Reducing to a suitable amount" << std::endl;
while (((stripsize = nel1/(groups*threads)) < 2) && threads > 1)
while (((stripsize = els/(groups*threads)) < 2) && threads > 1)
threads--;
if (threads == 1)
groups=1;
stripsize = nel1/(groups*threads);
stripsize = els/(groups*threads);
}
remainder = nel1-(stripsize*groups*threads);
remainder = els-(stripsize*groups*threads);
#endif
result.resize(groups);
@ -325,7 +341,7 @@ void utl::calcThreadGroups(int nel1, int nel2, utl::ThreadGroups& result)
std::cout << "nel1 " << nel1 << std::endl;
std::cout << "nel2 " << nel2 << std::endl;
std::cout << "stripsize " << stripsize << std::endl;
std::cout << "# of strips " << nel1/stripsize << std::endl;
std::cout << "# of strips " << els/stripsize << std::endl;
std::cout << "remainder " << remainder << std::endl;
#endif
@ -334,16 +350,36 @@ void utl::calcThreadGroups(int nel1, int nel2, utl::ThreadGroups& result)
for (int i=0;i<nel1*nel2;++i)
result[0][0].push_back(i);
} else {
std::vector< std::vector<int> > stripsizes;
stripsizes.resize(2);
stripsizes[0].resize(threads,stripsize);
stripsizes[1].resize(threads,stripsize);
int r=0;
for (int i=0;i<remainder && r < remainder;++i) {
stripsizes[1][threads-1-i]++;
r++;
if (r < remainder) {
stripsizes[0][threads-1-i]++;
r++;
}
}
std::vector< std::vector<int> > startelms;
startelms.resize(2);
int offs=0;
for (int i=0;i<threads;++i) {
startelms[0].push_back(offs*mul);
offs += stripsizes[0][i];
startelms[1].push_back(offs*mul);
offs += stripsizes[1][i];
}
for (size_t g=0;g<result.size();++g) { // loop over groups
result[g].resize(threads);
for (int t=0;t<threads;++t) { // loop over threads
size_t startel = g*stripsize+result.size()*t*stripsize;
int curstripsize = stripsize;
if (t == threads-1 && g == result.size()-1)
curstripsize += remainder;
for (int i2=0; i2 < nel2; ++i2) { // loop in y direction
for (int i1=0;i1<curstripsize; ++i1) {
int iEl = startel+i1+i2*nel1;
int maxx = dir==0?stripsizes[g][t]:nel1;
int maxy = dir==1?stripsizes[g][t]:nel2;
for (int i2=0; i2 < maxy; ++i2) { // loop in y direction
for (int i1=0;i1<maxx; ++i1) {
int iEl = startelms[g][t]+i1+i2*nel1;
result[g][t].push_back(iEl);
}
}
@ -376,17 +412,43 @@ void utl::calcThreadGroups(int nel1, int nel2, int nel3, ThreadGroups& result)
if (threads > 1)
groups = 2;
stripsize = nel1/(groups*threads);
if (stripsize < 2) {
int dir, els, mul;
int s1 = nel1/(groups*threads);
int s2 = nel2/(groups*threads);
int s3 = nel3/(groups*threads);
int r1 = nel1-(s1*groups*threads);
int r2 = nel2-(s2*groups*threads);
int r3 = nel3-(s3*groups*threads);
if (r1*nel2*nel3 < r2*nel1*nel2 && r1*nel2*nel3 < r3*nel1*nel2 ) {
// strips along x axis
stripsize = s1;
dir = 0;
els = nel1;
mul = 1;
} else if (r2*nel1*nel3 < r1*nel2*nel3 && r2*nel1*nel3 < r3*nel1*nel2 ) {
// strips along y axis
stripsize = s2;
els = nel2;
dir = 1;
mul = nel1;
} else {
// strips along z axis
stripsize = s3;
els = nel3;
dir = 2;
mul = nel1*nel2;
}
if (stripsize < 2 && groups > 1) {
std::cerr << __FUNCTION__ << ": Warning: too many threads available." << std::endl
<< "Reducing to a suitable amount" << std::endl;
while ((stripsize = nel1/(groups*threads)) < 2 && threads > 1)
while (((stripsize = els/(groups*threads)) < 2) && threads > 1)
threads--;
if (threads == 1)
groups=1;
stripsize = nel1/(groups*threads);
stripsize = els/(groups*threads);
}
remainder = nel1-(stripsize*groups*threads);
remainder = els-(stripsize*groups*threads);
#endif
result.resize(groups);
@ -396,22 +458,49 @@ void utl::calcThreadGroups(int nel1, int nel2, int nel3, ThreadGroups& result)
std::cout << "nel2 " << nel2 << std::endl;
std::cout << "nel3 " << nel3 << std::endl;
std::cout << "stripsize " << stripsize << std::endl;
std::cout << "# of strips " << (stripsize?nel1/stripsize:0) << std::endl;
std::cout << "# of strips " << els/stripsize << std::endl;
std::cout << "remainder " << remainder << std::endl;
#endif
for (size_t g=0;g<result.size();++g) { // loop over groups
result[g].resize(threads);
for (int t=0;t<threads;++t) { // loop over threads
size_t startel = g*stripsize+result.size()*t*stripsize;
int curstripsize = stripsize;
if (t == threads-1 && g == result.size()-1)
curstripsize += remainder;
for (int i2=0; i2 < nel2; ++i2) { // loop in y direction
for (int i3=0; i3 < nel3; ++i3) {
for (int i1=0;i1<curstripsize; ++i1) {
int iEl = startel+i1+i3*nel1*nel2+i2*nel1;
result[g][t].push_back(iEl);
if (groups == 1) {
result[0].resize(1);
for (int i=0;i<nel1*nel2*nel3;++i)
result[0][0].push_back(i);
} else {
std::vector< std::vector<int> > stripsizes;
stripsizes.resize(2);
stripsizes[0].resize(threads,stripsize);
stripsizes[1].resize(threads,stripsize);
int r=0;
for (int i=0;i<remainder && r < remainder;++i) {
stripsizes[1][threads-1-i]++;
r++;
if (r < remainder) {
stripsizes[0][threads-1-i]++;
r++;
}
}
std::vector< std::vector<int> > startelms;
startelms.resize(2);
int offs=0;
for (int i=0;i<threads;++i) {
startelms[0].push_back(offs*mul);
offs += stripsizes[0][i];
startelms[1].push_back(offs*mul);
offs += stripsizes[1][i];
}
for (size_t g=0;g<result.size();++g) { // loop over groups
result[g].resize(threads);
for (int t=0;t<threads;++t) { // loop over threads
int maxx = dir==0?stripsizes[g][t]:nel1;
int maxy = dir==1?stripsizes[g][t]:nel2;
int maxz = dir==2?stripsizes[g][t]:nel3;
for (int i3=0; i3 < maxz; ++i3) {
for (int i2=0; i2 < maxy; ++i2) { // loop in y direction
for (int i1=0; i1< maxx; ++i1) {
int iEl = startelms[g][t]+i1+i2*nel1+i3*nel1*nel2;
result[g][t].push_back(iEl);
}
}
}
}