/****(c) 2020. A. Gerbessiotis. See ../KAagcopy.txt ****/
/*******************************************************/
#include   <stdlib.h>
#include   <string.h>
#include   "avgaii.h"
#include   "aimisc.h"
#include   "prdx.h"

/* ex2(X): 2 to the power X, formed by left-shifting an unsigned 1 by X bits
 * and converting the result to int. X must be non-negative and smaller than
 * the bit width of unsigned int, otherwise the shift is undefined. */
#define ex2(X)  ((int)((unsigned int)(1)<<(X)))

/* oper_add: element-wise addition of two arrays.
 * Stores left[k]+right[k] into result[k] for every k in [0,size), in
 * increasing index order. Passed to ai2scan as the combining operator. */
void oper_add(idata *result,idata *left,idata *right,int size)
{
        int k = 0;

        while (k < size) {
                result[k] = left[k] + right[k];
                ++k;
        }
}

/*
 * rdx4: four-pass radix sort on 8-bit digits (least significant digit first)
 * of the nn keys in src, with keys exchanged between processors after
 * every pass.
 *
 *   src : the local keys. It is registered as the communication target in
 *         every pass, so after a pass it holds whatever the puts delivered
 *         (presumably this processor's slice of the globally ordered keys;
 *         the put semantics are defined by the AIM* macros, not here).
 *   nn  : number of local keys. The destination processor of a key is
 *         computed as (global position) / nn, i.e. the code assumes that
 *         every processor holds the same number of keys.
 *
 * A scratch buffer of nn keys is allocated for the local counting sort and
 * released before returning; the process is aborted if it cannot be
 * obtained. With nn == 0 the collective calls are still made but no key is
 * moved.
 */
void rdx4 (idata *src,int nn)
{
 int count[256],tcount[256],ccount[256],total[256];
 int rounds,i,j,m;
 int dpf,dpl,first,last,size,offset;
 int r,nbytes,n,N;
 idata *out;
 unsigned int shift;

 r=256;                       /* buckets per pass (one 8-bit digit) */
 n=nn;
 N=n;                         /* keys per processor: maps position -> processor */
 nbytes=sizeof(unsigned int);

 out = malloc(N>0 ? (size_t)N*(size_t)nbytes : 0);
 if (out==NULL && N>0)
    abort();                  /* no scratch space: cannot sort */

 for(rounds=0,shift=0;rounds<4;rounds++,shift+=8) { /**********ROUND LOOP****/
  /* Local counting sort of src into out on the current digit. */
  memset(count,0,r*sizeof(int));
  for(i=0;i<n;++i)
     count[(src[i]>>shift) & 0x000000FF]++; /* count */
  memcpy(tcount,count,r*sizeof(int)); /* keep per-bucket sizes for later use */
  for(j=1;j<r;++j)
     count[j] += count[j-1]; /* determine position of keys in out */
  for(i=n-1;i>=0;--i) {      /* walking backwards keeps equal digits in order */
    m=--count[(src[i]>>shift) & 0x000000FF];
    out[m]=src[i];           /* complete count sort */
  }

  memcpy(count,tcount,r*sizeof(int));  /* recover count */
  /* Determine global position of each key. ai2scan is defined elsewhere;
     the code below relies on it leaving in ccount[j] the running count of
     digit j up to and including this processor and in total[j] the count
     of digit j over all processors. */
  ai2scan(oper_add,r,(char *)count,(char *)ccount,(char*) total,sizeof(int));
  AIBARRIER();
  for(j=1;j<r;++j)
    total[j]=total[j]+total[j-1];
  for(j=1;j<r;++j)
    ccount[j]=ccount[j]+ total[j-1]; /* position of 0..r-1 in global array */
  for(j=1;j<r;++j)
     count[j] += count[j-1]; /* count[j]-tcount[j] = start of bucket j in out */

  AIMREGISTER((char *)src,n*nbytes);/* prepare for communication */
  AIMCOMMIT();
  AIMCOMSTART((char *)src);
  for(j=0;j<r;j++) {
    if (tcount[j]<=0)
       continue;               /* empty bucket: nothing to send, and N may be 0 */
    first=ccount[j]-tcount[j]; /* first position */
    last =ccount[j]-1;         /* last  position  to store [j]'s */
    dpf = first / N;           /* first processor */
    dpl = last  / N;           /* last  processor to store [j]'s */
    offset=count[j]-tcount[j]; /* where bucket j starts in out */
    if (dpf==dpl) {            /* the whole bucket goes to one processor */
       AIMHPPUT(dpf,(char *)&out[offset],(char *)src,
               nbytes*(first-dpf*N), tcount[j]*nbytes);
    }
    else {                     /* the bucket straddles a processor boundary */
       size=dpl*N-first;       /* first processor receives some */
       if (size>0) {
         AIMHPPUT(dpf,(char *)&out[offset],(char *)src,
                nbytes*(N-size),size*nbytes);
       }
       offset+=size;           /* second processor receives rest */
       size=tcount[j]-size;
       if (size>0) {
         AIMHPPUT(dpl,(char *)&out[offset],(char *)src,0,(size)*nbytes);
       }
    }
  }
  AIMCOMEND((char *)src);
  AIBARRIER();
  AIMDEREGISTER((char *)src);
 }
 free(out);
}

/* Known issue: under MPI, rdx2 crashes for n=16777216 or higher (observed with p=4). */

/*
 * rdx2: two-pass radix sort on 16-bit digits (least significant digit first)
 * of the nn keys in src, with keys exchanged between processors after
 * every pass. Same scheme as rdx4 with 65536 buckets per pass.
 *
 *   src : the local keys. It is registered as the communication target in
 *         every pass, so after a pass it holds whatever the puts delivered
 *         (presumably this processor's slice of the globally ordered keys;
 *         the put semantics are defined by the AIM* macros, not here).
 *   nn  : number of local keys. The destination processor of a key is
 *         computed as (global position) / nn, i.e. the code assumes that
 *         every processor holds the same number of keys.
 *
 * The four 65536-entry counter tables and a scratch buffer of nn keys are
 * heap-allocated and released before returning; the process is aborted if
 * they cannot be obtained. With nn == 0 the collective calls are still made
 * but no key is moved.
 */
void rdx2 (idata *src,int nn)
{
 int *tables,*count,*tcount,*ccount,*total;
 int rounds,i,j,m,base,epoch;
 int dpf,dpl,first,last,size,offset;
 int r,nbytes,n,N;
 idata *out;
 unsigned int shift;

 r=65536;                     /* buckets per pass (one 16-bit digit) */
 n=nn;
 N=n;                         /* keys per processor: maps position -> processor */
 nbytes=sizeof(unsigned int);
#ifdef MPILIB
 epoch=256;                   /* MPI build: buckets handled per COMSTART/COMEND pair */
#else
 epoch=r;                     /* otherwise all buckets in a single pair */
#endif

 /* Four tables of r counters each (1 MiB in total) are kept off the stack. */
 tables = malloc(4*(size_t)r*sizeof *tables);
 out = malloc(N>0 ? (size_t)N*(size_t)nbytes : 0);
 if (tables==NULL || (out==NULL && N>0))
    abort();                  /* no working space: cannot sort */
 count =tables;
 tcount=count +r;
 ccount=tcount+r;
 total =ccount+r;

 for(rounds=0,shift=0;rounds<2;rounds++,shift+=16) { /**********ROUND LOOP****/
  /* Local counting sort of src into out on the current digit. */
  memset(count,0,r*sizeof(int));
  for(i=0;i<n;++i)
     count[(src[i]>>shift) & 0x0000FFFF]++; /* count */
  memcpy(tcount,count,r*sizeof(int)); /* keep per-bucket sizes for later use */
  for(j=1;j<r;++j)
     count[j] += count[j-1]; /* determine position of keys in out */
  for(i=n-1;i>=0;--i) {      /* walking backwards keeps equal digits in order */
    m=--count[(src[i]>>shift) & 0x0000FFFF];
    out[m]=src[i];           /* complete count sort */
  }

  memcpy(count,tcount,r*sizeof(int));  /* recover count */
  /* Determine global position of each key. ai2scan is defined elsewhere;
     the code below relies on it leaving in ccount[j] the running count of
     digit j up to and including this processor and in total[j] the count
     of digit j over all processors. */
  ai2scan(oper_add,r,(char *)count,(char *)ccount,(char*) total,sizeof(int));
  AIBARRIER();
  for(j=1;j<r;++j)
    total[j]=total[j]+total[j-1];
  for(j=1;j<r;++j)
    ccount[j]=ccount[j]+ total[j-1]; /* position of 0..r-1 in global array */
  for(j=1;j<r;++j)
     count[j] += count[j-1]; /* count[j]-tcount[j] = start of bucket j in out */

  AIMREGISTER((char *)src,n*nbytes);/* prepare for communication */
  AIMCOMMIT();
  /* Every AIMCOMEND is preceded by its AIMCOMSTART in both build variants,
     as in rdx4. */
  for(base=0;base<r;base+=epoch) {
   AIMCOMSTART((char *)src);
   for(j=base;j<base+epoch;j++) {
     if (tcount[j]<=0)
        continue;               /* empty bucket: nothing to send, and N may be 0 */
     first=ccount[j]-tcount[j]; /* first position */
     last =ccount[j]-1;         /* last  position  to store [j]'s */
     dpf = first / N;           /* first processor */
     dpl = last  / N;           /* last  processor to store [j]'s */
     offset=count[j]-tcount[j]; /* where bucket j starts in out */
     if (dpf==dpl) {            /* the whole bucket goes to one processor */
        AIMHPPUT(dpf,(char *)&out[offset],(char *)src,
                nbytes*(first-dpf*N), tcount[j]*nbytes);
     }
     else {                     /* the bucket straddles a processor boundary */
        size=dpl*N-first;       /* first processor receives some */
        if (size>0) {
          AIMHPPUT(dpf,(char *)&out[offset],(char *)src,
                 nbytes*(N-size),size*nbytes);
        }
        offset+=size;           /* second processor receives rest */
        size=tcount[j]-size;
        if (size>0) {
          AIMHPPUT(dpl,(char *)&out[offset],(char *)src,0,(size)*nbytes);
        }
     }
   }
   AIMCOMEND((char *)src);
  }
  AIBARRIER();
  AIMDEREGISTER((char *)src);
 }
 free(out);
 free(tables);
}

