/****(c) 2020. A. Gerbessiotis. See ../KAagcopy.txt ****/
/*******************************************************/
#include   <stdio.h>
#include   <stdlib.h>
#include   <string.h>
#include   "avgaii.h"
#include   "aimisc.h"
#include   "prdx.h"
#define    V256  256

#ifdef DEBUG
void debprint (int *g,char *s);
void debprint2(int *g,char *s);
#endif

/*
 * pprdx -- parallel least-significant-digit radix sort of the keys in src.
 *
 *   src : local key array of this processor; it is both the input and the
 *         buffer that receives the routed keys at the end of every round.
 *   nn  : number of keys held locally.  Destination processors are computed
 *         as (global position / nn), so the routing presumes every processor
 *         holds the same nn -- NOTE(review): confirm against the callers.
 *
 * Four rounds are executed, one per 8-bit digit (bits 0-7, 8-15, 16-23,
 * 24-31 of each key).  Each round:
 *   1. count-sorts the local keys on the current digit into `out`;
 *   2. hands the 256 local digit counts to processor 0 via bspgather;
 *      processor 0 turns them into starting global positions, ordered by
 *      digit first and by processor second, and bspscatter hands every
 *      processor its own 256 starting positions;
 *   3. puts every run of equal-digit keys from `out` into `src` of the
 *      destination processor(s) with AIMHPPUT.
 *
 * If nn <= 0 the function returns immediately (the original divided by nn
 * in that case).  If a work buffer cannot be allocated a message is written
 * to stderr and the process is aborted, instead of dereferencing NULL.
 */
void pprdx (idata *src,int nn)
{
 idata *out;
 int pid,nprocs,rounds,m,n,nbytes,total;
 int i,j,k,dpf,dpl,first,last,size,offset;
 unsigned int shift;
 int count[V256],tcount[V256],ccount[V256],*gatcount, *fircount;
 size_t tabbytes;

 pid = AIPID(); nprocs = AINPROCS();
 n=nn;
 nbytes=sizeof(idata);

 /* Nothing to sort; also keeps first/nn and last/nn below well defined. */
 if (nn <= 0)
   return;

 total    = V256*nprocs;                 /* entries in the gathered tables */
 tabbytes = (size_t)total*sizeof(int);

 /* Sizes are computed in size_t so that n*sizeof(idata) cannot overflow int. */
 out      = malloc((size_t)n*sizeof *out);
 gatcount = malloc(tabbytes);  /* contents only used by proc 0 */
 fircount = malloc(tabbytes);  /* contents only used by proc 0 */
 if (out == NULL || gatcount == NULL || fircount == NULL) {
   fprintf(stderr,"pprdx: processor %d: out of memory\n",pid);
   abort();
 }

 for(rounds=0;rounds<4;rounds++) {

   /* PHASE 0  initialization */

   shift=8u*(unsigned int)rounds;        /* 0, 8, 16, 24 */
   memset(gatcount,0,tabbytes);
   memset(fircount,0,tabbytes);          /* fircount[0] must start at 0 */

   /* PHASE 1 countsort local keys */

   memset(count,0,sizeof count);
   for(i=0;i<n;++i)
      count[(src[i]>>shift) & 0x000000FF]++; /* count */
   memcpy(tcount,count,sizeof count); /* tcount: copy of count BEFORE prefix */
   for(j=1;j<V256;++j)
      count[j] +=count[j-1]; /* determine position of keys in out */
   memcpy(ccount,count,sizeof count); /* ccount: copy of count AFTER prefix */
   for(i=n-1;i>=0;--i) {     /* backwards, so equal digits keep their order */
     m=--count[(src[i]>>shift) & 0x000000FF];
     out[m]=src[i];         /* complete count sort */
   }

   /* PHASE 2 retrieve local count array and gather all into processor 0 */

   memcpy(count,tcount,sizeof count);  /* count again holds the raw digit counts */
   bspgather(0,(char *) count,(char *) gatcount,V256*sizeof(int));
   if (pid == 0 ) {
     /* Lay the counts out digit-major (digit i, then processor) shifted by
        one slot: fircount[0] stays 0 and the very last count is dropped, so
        the running sum below yields the FIRST global position of each
        (digit, processor) run. */
     k=1;
     for(i=0;i<V256;i++) {
       for(j=0;j<total;j+=V256) {
          if (k < total)
            fircount[k++] = gatcount[i+j];
       }
     }
     for(j=1;j<total;++j)
        fircount[j] +=fircount[j-1]; /* first address to store */

     /* prepare gatcount for reverse ie scatter operation: back to the
        processor-major layout; k visits every slot exactly once */
     k=0;
     for(i=0;i<V256;i++) {
       for(j=0;j<total;j+=V256)
          gatcount[i+j] = fircount[k++];
     }
   }
   bspscatter(0,(char *) gatcount,(char *) count,V256*sizeof(int));

   /* PHASE 3  Global routing to complete the round  */

   AIMREGISTER((char *)src,n*nbytes);/* prepare for communication */
   AIMCOMMIT();
   AIMCOMSTART((char *)src);

   for(j=0;j<V256;j++) {
     if (tcount[j] <= 0)         /* no local keys with digit j */
       continue;
     first=count[j];             /* first GLOBAL position */
     last =count[j]+tcount[j]-1; /* last  GLOBAL position  to store pid's [j]'s */
     dpf = first /nn;            /* first  GLOBAL processor */
     dpl = last  /nn;            /* last   GLOBAL processor to store pid's [j]'s */
     if (dpf==dpl) {
       /* the whole run lands on one processor */
       AIMHPPUT(dpf,(char *)&out[ccount[j]-tcount[j]],(char *)src,nbytes*(first-dpf*nn), tcount[j]*nbytes);
     }
     else {
       /* the run straddles a processor boundary: tcount[j] <= nn, so it can
          touch at most the two processors dpf and dpl */
       size=dpl*nn-first; /* first processor receives some */
       if (size >0) {
         AIMHPPUT(dpf,(char *)&out[(ccount[j]-tcount[j])],(char *)src,
                nbytes*(nn-size),size*nbytes);
       }
       offset=ccount[j]-tcount[j]+size; /* second processor receives rest */
       size= tcount[j]-size;
       if (size >0) {
         AIMHPPUT(dpl,(char *)&out[offset],(char *)src,0,(size)*nbytes);
       }
     }
   }
   AIMCOMEND((char *)src);
   AIMDEREGISTER((char *)src);
 }

 free(fircount);
 free(gatcount);
 free(out);
}


#ifdef DEBUG
/*
 * debprint -- debugging dump of a table made of AINPROCS() consecutive
 * blocks of V256 ints each (entry i of block b is g[b*V256 + i]).
 *
 * Only processor 0 prints.  One line per i in 0..V256-1: the index, the
 * i-th entry of every block, then a running total and the tag string s.
 * The total is never reset, so it accumulates over all lines printed so far.
 * A dashed separator line ends the dump.
 */
void debprint(int *g, char *s)
{
 int me, np, digit, base, running;

 me = AIPID();
 np = AINPROCS();
 if (me != 0)
   return;

 running = 0;
 for (digit = 0; digit < V256; ++digit) {
   fprintf(stdout,"%5d : ",digit);
   for (base = 0; base < V256*np; base += V256) {
     int v = g[base + digit];

     fprintf(stdout,"%5d ",v);
     running += v;
   }
   fprintf(stdout,"%10d %s\n",running,s);
   fflush(stdout);
 }
 fprintf(stdout,"-----------\n");
 fflush(stdout);
}

/*
 * debprint2 -- debugging dump of a table of AINPROCS()*V256 ints, shown as
 * rows of AINPROCS() consecutive entries.
 *
 * Only processor 0 prints.  Every row starts with its row number and ends
 * with a running total followed by the tag string s.  The total is never
 * reset, so it accumulates over all rows printed so far.  A final newline
 * is written and stdout is flushed once at the end.
 */
void debprint2(int *t, char *s)
{
 int me, np, total, idx, row, col, running;

 me = AIPID();
 np = AINPROCS();
 if (me != 0)
   return;

 total   = np*V256;
 running = 0;
 row     = 0;   /* equals idx / np */
 col     = 0;   /* equals idx % np */
 for (idx = 0; idx < total; ++idx) {
   if (col == 0)
     fprintf(stdout,"%5d : ",row);
   fprintf(stdout,"%5d ",t[idx]);
   running += t[idx];
   if (col == np-1) {
     fprintf(stdout,"%5d %s\n",running,s);
     col = 0;
     ++row;
   } else {
     ++col;
   }
 }
 fprintf(stdout,"\n");
 fflush(stdout);
}
#endif
