/****(c) 2020. A. Gerbessiotis. See ../KAagcopy.txt ****/
/*******************************************************/
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include "avgaii.h"
#include "prdx.h"
#include "seqo.h"
#include "psrtai.h"

 /**** gsd       : Gerbessiotis-Siniolakis Deterministic sorting: a sample
  ****             sort that picks its splitters by regular oversampling.
  **** keys      : array of idata to sort. When nprocs > 1 it is sorted in
  ****             place by localsort as a side effect; when nprocs <= 1 it
  ****             is left untouched and a sorted copy is returned.
  **** n         : # of keys in local processor
  **** total     : total number of keys; may not be n*nprocs. If total==0
  ****             it is replaced by n*nprocs. The value only influences the
  ****             sample size when PARALLELSAMPLESORT is defined.
  **** result    : Output array: Algorithm does not sort in place.
  ****             *result receives a malloc'ed array with the keys that
  ****             ended up on this processor; this routine never frees it.
  **** nbytes    : byte size of idata nbytes=sizeof(idata)
  **** localsort : Sequential Sort function to sort local idata
  **** generic   : Generic Sort function; not used by this routine
  **** percent   : of actual sample size; allows control of sample
  ****             size to vary from theoretical value. At least one
  ****             sample per processor is always taken.
  **** Returns an integer that is the number of local keys after sort,
  **** or -1 if the single-processor path cannot allocate the output.
  ****/
int  gsd (idata  *keys, int n, int total, idata **result,
	  int nbytes, void (*localsort)(idata *,int), 
          void (*generic)(void*,int,int,int (*)(const void*,const void *)),
          int percent)
{
  register int  pid,nprocs;       /* own processor id, # of processors */
  register double u;              /* per-processor sample size (as double) */
  register int  ns;               /* total sample size (all processors) */
  register int  nsl;              /* per-processor sample size = ns/nprocs */
  idata          *sample;         /* key sample array */
  idata          *ssample;        /* sorted sample (PARALLELSAMPLESORT) */
  idata          *split;          /* splitter array */
  int           *bucket;          /* local bucket size */
  int           *obucket;         /* start index of each bucket in keys */
  int           *ppf_bucket;      /* ParallelPrefix'ed bucket size */
  int           *lbucket;         /* prefixed sizes received from all procs */
  register int  offset,index;     /* aux variables in regular sampling */
  register int  proc_size;        /* processor bucket size */
  register int  size,intsize;     /* size of idata and int */
#ifdef ROUTEONE
  register int  i,j;              /* index variables */
#else
  register int  i,j,k;            /* index variables */
#endif
  register int  left,right,middle;/* byte offsets (routing) / run bounds (merge) */
  idata         *temp_result;

  (void)generic;                  /* part of the interface, unused here */

  /**** PART 0: Some early checks*/
  pid=AIPID(); nprocs=AINPROCS();
  size=nbytes;
  if (nprocs <= 1) {
    /* Single processor: sort a private copy, no communication needed */
    temp_result = (idata  *) malloc(n*size);
    if (temp_result == NULL) {
      /* Either n==0 (nothing to sort) or a genuine allocation failure */
      *result=NULL;
      return((n > 0) ? -1 : 0);
    }
    memcpy(temp_result,keys,size*n);
    (*localsort) ((idata *)temp_result,n);
    *result=(idata *)temp_result;
    return(n);	
  }

  /**** PART 1: Initializations */

  /* NOTE(review): when total==0 the substitute uses the LOCAL n; if n
     differs between processors they may disagree on the sample size
     under PARALLELSAMPLESORT -- confirm callers pass equal n or total. */
  if (total==0) total=nprocs*n;
  intsize=sizeof(int);   /* sizeof(int)   */
#ifdef PARALLELSAMPLESORT
  u=(percent/100.0)*nprocs*log(log((double)total)); /* THEORY COMPLIANT */
  u=ceil(u+1); /* auxiliary results */
#else
  u=(percent/100.0)*nprocs;
#endif
  /* Guarantee at least one sample per processor. Written as !(u>=1) so
     that a NaN (log of a non-positive value for tiny totals) is also
     caught; without this, ns/nprocs could be 0 and sample[-1] would be
     written below. */
  if (!(u >= 1.0)) u=1.0;
  nsl= (int) u;          /* sample size per processor */
  ns = nsl*nprocs;       /* total sample size */

  /**** PART 2: Random number initialization */

  /* It's the deterministic algorithm; What did you expect ? */ 

  /**** PART 3: Space allocation and checking */

  /* NOTE(review): these allocations are not checked. Bailing out on one
     processor only would leave the others waiting in the communication
     steps below, so failure handling would have to be collective. */
  sample     = (idata*) malloc(ns*size);        /* Sample idata storage */
  ssample    = (idata*) malloc(ns*size);        /* Sample idata storage */
  split      = (idata*) malloc(nprocs*size);    /* splitter storage */
  bucket     = (int*)   malloc(nprocs*intsize); /* bucket size */
  obucket    = (int*)   malloc(nprocs*intsize); /* bucket size */
  ppf_bucket = (int*)   malloc(nprocs*intsize); /* ppf result temp */
  lbucket    = (int*)   malloc(nprocs*intsize); /* auxiliary variable */

  /**** PART 4: Registration area */
  /* Delayed registration; registration before communication*/

  /**** PART 5: Regular OVERsampling : Read PPL paper ****/

  /* Example: Say n= 11 localsamplesize=ns/nprocs=3
   o  o  o  o  o  o  o  o  o  o  o   
            ^           ^        ^
   sample   1           2        3=maximum
   CEIL(...) =4 FLOOR(...) =3 i.e. interspace is
              3             2  respectively
  */

  (*localsort)((idata *)keys,n); /****Sort first; Sample later ****/

  /* NOTE(review): if n < nsl the spacing presumably becomes 0 and the
     picks below read keys[-1]; callers appear to be expected to supply
     at least nsl keys per processor -- confirm. */
  index = LEFT(n,nsl);        /* detemine sample properties          */
  offset= CEIL(n,nsl);        /* equidistant sample keys are picked  */ 
  for(i=0;i<index;i++) {      /* ceiling(ns/nprocs)  up to index     */
    sample[i] = keys[(i+1)*offset-1];
  }
  /* NOTE(review): the floor-spaced picks count from keys[0] again rather
     than continuing after the `index` ceiling-spaced picks; if LEFT is
     n mod nsl this can pick keys out of order -- verify against the
     macro definitions before changing. */
  offset=FLOOR(n,nsl);        /* floor(ns/nprocs)    beyond index    */
  for(i=index;i<(nsl-1);i++) {
    sample[i] = keys[(i+1)*offset-1];
  }
  /* Append the maximum to the sample. keys was sorted by localsort above,
     so the local maximum is its last element; nsl*offset-1 only reaches
     it when n is an exact multiple of nsl. */
  sample[nsl-1] = keys[n-1];
  offset=nsl*pid*size; /* byte offset of this processor's slice of sample */
  bucket[0]=nsl;       /* Sample size per processor; ns/nprocs=u     */

  /**** PART 6: Sample sorting */

#ifndef PARALLELSAMPLESORT
  AIMREGISTER(sample,ns*size);
  AIMCOMMIT(); AIMCOMSTART(sample);
  /* collect sample at processor 0 */
  if (pid != 0 )
    AIMPUT(0,sample,sample,offset,bucket[0]*size);
  AIMCOMEND(sample);

  /* sort sample  and select p-1 splitters */
  if (pid == 0) {
    (*localsort)((idata*)sample,ns);
    for(i=1;i<nprocs;i++)
       split[i-1]=sample[(ns*i/nprocs)-1];
  }
  /* processor 0 hands the p-1 splitters to every other processor */
  AIMREGISTER(split,nprocs*size);
  AIMCOMMIT();AIMCOMSTART(split);
  if (pid==0){
    for(i=1;i<nprocs;i++){
      AIMHPPUT(i,split,split,0,(nprocs-1)*size);
    }
  }
  AIMCOMEND(split);
  AIMDEREGISTER(split);    /* DEL */
  AIMDEREGISTER(sample);   /* DEL */
#else
  bucket[0] =  bspbtndat(sample,ssample,bucket[0]);
    /* Done with sample sorting; Locate splitters*/
    /* Locate splitters in resulting merged sequence */
  AIMREGISTER(split,nprocs*size);
  AIMCOMMIT();AIMCOMSTART(split);
  for(i=0;i<nprocs;i++){/*Last local sample becomes a splitter*/
    AIMHPPUT(i,&ssample[(bucket[0]-1)],split,pid*size,size);
  }
  AIMCOMEND(split);
  AIMDEREGISTER(split);   /* DEL */
#endif

  /**** PART 7: Splitters and Buckets preparation */
  memset(bucket,0,nprocs*sizeof(int));

  /**** PART 8: Binary search of keys into the splitters but  */
  /**** It is more efficient to do it the other way around!!  */
  /**** # of splitters = nprocs-1; #of buckets = nprocs       */
  /* bucket[i] first holds the search result for splitter i; subtracting */
  /* it from its right neighbour turns the positions into bucket sizes.  */
  for(i=nprocs-2,bucket[nprocs-1]=n;i>=0;i--){
      bucket[i]=searchbdat((idata *)&split[i],(idata *)keys,n);
      bucket[i+1]-=bucket[i];
  }
     
  /**** PART 9: Independent prefix operations on bucket sizes */
  aiscan((int*) bucket, (int*) ppf_bucket, nprocs);
/*ai2prefix(operadd,nprocs,(char *)bucket,(char *)ppf_bucket,intsize); */

  /**** PART 10: Rearrange keys according to bucket destination */
  /* Send bucket size to corresponding processors */
  AIMREGISTER(lbucket,nprocs*intsize);
  AIMCOMMIT();AIMCOMSTART(lbucket);
  for(i=0;i<nprocs;i++) {
     AIMHPPUT(i,&ppf_bucket[i],lbucket,pid*intsize,intsize);
  }
  AIMCOMEND(lbucket);

  /* Allocate memory: the last prefixed entry is used as the number of */
  /* keys this processor is about to receive                           */
  proc_size = lbucket[nprocs-1];
  temp_result = (idata *) malloc(proc_size*size);
   AIMREGISTER(temp_result,proc_size*size);
   AIMCOMMIT();

   /**** PART 11: Route keys to destination processor */
   /* middle = byte length of bucket i, right = prefixed end offset at  */
   /* the destination, left = right-middle = destination start offset   */
#ifdef ROUTEONE
   AIMCOMSTART(temp_result);
   for(i=0,j=0;i<nprocs;i++) {
        middle=bucket[i]*size;
        right=ppf_bucket[i]*size;
        left=right-middle;
        AIMHPPUT(i,&keys[j],temp_result,left,middle);
        j+=bucket[i];
   }
   AIMCOMEND(temp_result); 
#else
  /* obucket[i] = index in keys where bucket i starts */
  obucket[0]=0;
  for(i=1;i<nprocs;i++) {
    obucket[i]=obucket[i-1]+bucket[i-1];
  }
  AIMCOMSTART(temp_result);
   /* destinations are visited in the rotated order 1,2,...,nprocs-1,0 */
   for(k=0;k<nprocs;k++) {
        i=(k+1)%nprocs;
        middle=bucket[i]*size;
        right=ppf_bucket[i]*size;
        left=right-middle;
        AIMHPPUT(i,&keys[obucket[i]],temp_result,left,middle);
   }
   AIMCOMEND(temp_result);
#endif

   /**** PART 11a: Start merging received sequences */
#ifdef KWAYMERGE
   /* Pairwise merge rounds: in round i, groups of i received runs are   */
   /* merged with the following group of i runs. lbucket[] supplies the  */
   /* end position of each received run inside temp_result.              */
   for(i=1;i<nprocs; i = (i<<1)) {
      for(left=0,j=0;left < nprocs;left += (i<<1)) {
         middle = MIN((left+(i<<1)),nprocs) -1;
         /* Clamp to the last run: when nprocs is not a power of two the */
         /* trailing group has no partner and left+i-1 would index past  */
         /* the nprocs entries of lbucket; the second run is then empty. */
         right=lbucket[MIN((left+i),nprocs)-1];
         mergedat2((idata *)&temp_result[j],
               (idata *) &temp_result[right],
               (right-j),
               (lbucket[middle]-right)
               );
         j=lbucket[middle];
      }
   }
#else
   (*localsort)((idata*)temp_result,proc_size);
#endif


   /**** PART 12: Ready to exit. Free space and Deregister*/
  AIMDEREGISTER(temp_result);
  AIMDEREGISTER(lbucket);
  free((void*)lbucket);
  free((void*)ppf_bucket); 
  free((void*)obucket);
  free((void*)bucket);
  free((void*)split);
  free((void*)ssample);
  free((void*)sample);

  /* temp_result is handed over to the caller through *result */
  *result=temp_result;
  return(proc_size);
}
