/****(c) 2020. A. Gerbessiotis. See ../KAagcopy.txt ****/
/*******************************************************/
#include "avgaii.h"
#include "prdx.h"
#include "seqo.h"
#include <stdlib.h>
#include <math.h>
#include "psrtai.h"


 /**** gvr       : Gerbessiotis-Valiant Randomized sorting
  **** keys      : array of idata to sort, idata is int or double.
  ****             When more than one processor is used the array is
  ****             overwritten with the same keys regrouped by bucket.
  ****    n      : # of keys in local processor.
  ****             NOTE(review): the sampling step computes random()%n,
  ****             so n is assumed to be > 0 on every processor.
  **** total     : total number of keys; 0 means "use nprocs*n"
  **** result    : Output: *result is set to a newly malloc'ed array that
  ****             holds the sorted local keys (the sort is not in place).
  ****             This routine never frees it; presumably the caller does.
  **** nbytes    : byte size of idata nbytes=sizeof(idata)
  **** localsort : Sequential Sort function; sorts the local data and,
  ****             in this routine, the sample as well
  **** generic   : accepted for interface compatibility; never called here
  **** percent   : of actual sample size; allows control of sample
  ****             size to vary from theoretical value.
  **** Returns an integer that is the number of local keys after sort,
  **** i.e. the number of keys stored in *result.
  **** NOTE(review): no malloc result is checked in this routine.
  ****/
int  gvr (idata  *keys, int n, int total, idata **result,
          int nbytes, void (*localsort)(idata*, int), 
          void (*generic)(void*,int,int,int (*)(const void*,const void *)),
          int percent)

{
  int           pid,nprocs;
#ifdef ROUTEONE
  int           i,j;              /* index variables */
#else
  int           i,j,k;            /* index variables */
#endif
  int           size,intsize;     /* size of data and int */
  double        u;                /* auxiliary variable */
  int           ns;               /* total sample size (multiple of nprocs) */
  idata         *sample;          /* key sample array */
  idata         *ssample;         /* second sample array (parallel sample sort) */
  idata         *split;           /* splitter array */
  int           *bucket_index;    /* bucket-index of each input key */
  int           *bucket;          /* local bucket size */
  int           *ppf_bucket;      /* ParallelPrefix'ed bucket size */
  int           *lbucket;         /* auxiliary local */
#ifndef PARALLELSAMPLESORT
  int           offset,index;     /* aux variables in random sampling*/
#else
  int           index;            /* aux variables in random sampling*/
#endif
  int           proc_size;        /* processor bucket size */
  int           left,right,middle;/* binary search variables */
  idata         *temp_result;

  (void)generic;                  /* unused parameter, see header comment */

  /**** PART 0: Some early checks*/
  pid=AIPID(); nprocs=AINPROCS();
  size=nbytes;
  /* Single processor: copy the keys and sort the copy sequentially */
  if (nprocs <= 1) {
    temp_result = (idata  *) malloc(n*size);
    memcpy(temp_result,keys,size*n);
    (*localsort) ((idata *)temp_result,n);
    *result=(idata *)temp_result;
    return(n);
  }

  /**** PART 1: Initializations */

  if (total==0) total=nprocs*n;
  /* Sample size: (percent/100) * nprocs * ln(total)^2, plus one, rounded
   * up to a multiple of nprocs so each processor draws ns/nprocs keys */
  u=(percent/100.0)*nprocs*pow(log((double)total),2.0);
  u=ceil(u+1);
  ns=  ((int)ceil(u/(double)nprocs)) * nprocs;
  intsize=sizeof(int);   /* sizeof(int)   */

  /**** PART 2: Random number initialization */

  /* Seed depends on pid only, so every call restarts the same sequence */
  srandom    (23+1001*pid); /* LCRG, Shaky and fails spectral test */

  /**** PART 3: Space allocation and checking */

  sample       = (idata*)  malloc(ns*size);        /* Sample data storage */
  ssample      = (idata*)  malloc(ns*size);        /* Sample data storage */
  split        = (idata*)  malloc(nprocs*size);    /* splitter storage */
  bucket       = (int*)    malloc(nprocs*intsize); /* bucket size */
  ppf_bucket   = (int*)    malloc(nprocs*intsize); /* ppf result temp */
  lbucket      = (int*)    malloc(nprocs*intsize); /* auxiliary variable */
  temp_result  = (idata *) malloc(n*size);         /* routing storage */
  bucket_index = (int*)    malloc(n*intsize);      /* key bucket index */

  /**** PART 4: Registration area */
  /* Delayed registration; intime registration before communication*/

  /**** PART 5: Sampling*/
  /* Each processor draws ns/nprocs local keys at random; indices may
   * repeat. The original author notes that this procedure is not the
   * one described in the paper. */
  for(i=0;i<(ns/nprocs);i++) {
    index= random()%n;
    sample[i]=keys[index];
  }
#ifndef PARALLELSAMPLESORT
  /* Byte offset of this processor's slice in processor 0's sample array */
  offset=(ns/nprocs)*pid*size;
#endif
  bucket[0]=ns/nprocs;   /* temporarily holds the local sample count */

  /**** PART 6: Sample sorting */

#ifndef PARALLELSAMPLESORT
  AIMREGISTER(sample,ns*size);
  AIMCOMMIT(); AIMCOMSTART(sample);
  /* collect sample at processor 0 */
  if (pid != 0 )
    AIMPUT(0,sample,sample,offset,bucket[0]*size);
  AIMCOMEND(sample);

  /* sort sample  and select p-1 splitters */
  if (pid == 0) {
    (*localsort)((idata*)sample,ns);
    for(i=1;i<nprocs;i++)
       split[i-1]=sample[(ns*i/nprocs)-1];
  }
  /* processor 0 sends the p-1 splitters to every other processor */
  AIMREGISTER(split,nprocs*size);
  AIMCOMMIT();AIMCOMSTART(split);
  if (pid==0){
    for(i=1;i<nprocs;i++){
      AIMHPPUT(i,split,split,0,(nprocs-1)*size);
    }
  }
  AIMCOMEND(split);
  AIMDEREGISTER(split);
  AIMDEREGISTER(sample);
#else
  /* Sort sample locally */
  (*localsort)((idata *)sample,bucket[0]);
  bucket[0]= bspbtndat(sample,ssample,bucket[0]);

    /* Done with sample sorting; Locate splitters*/
    /* Locate splitters in resulting merged sequence */
  AIMREGISTER(split,nprocs*size);
  AIMCOMMIT(); AIMCOMSTART(split);
  for(i=0;i<nprocs;i++){/*Last local sample becomes a splitter*/
    AIMHPPUT(i,&ssample[(bucket[0]-1)],split,pid*size,size);
  }
  AIMCOMEND(split);
  AIMDEREGISTER(split);
#endif


  /**** PART 7: Splitters and Buckets preparation */
  memset(bucket,0,nprocs*sizeof(int));

  /**** PART 8: Binary search keys into splitters */
  /* Only split[0..nprocs-2] are consulted. Key i goes to the first bucket
   * j whose splitter is >= the key, or to bucket nprocs-1 if none is.
   * Keys are compared directly: storing keys[i]-split[middle] in an int
   * truncates fractional differences when idata is double and is
   * undefined when the difference does not fit in an int. */
  for(i=0;i<n;i++) {
     left=0;
     right=nprocs-2;
     while (left <= right) {
        middle=left+((right-left)>>1);
        if (keys[i] > split[middle]) {
          left=middle+1;
        } else {
          right=middle-1;
        }
     }
     j=left;
     bucket_index[i]=j; /* i-th key goes to bucket j */
     bucket[j]++;       /* increment size of bucket j */
   }


  /**** PART 9: Independent prefix operations on bucket sizes */
  /* NOTE(review): the routing below uses ppf_bucket[i]-bucket[i] as this
   * processor's element offset inside processor i's receive buffer, and
   * the last processor's ppf_bucket[i] as processor i's total; presumably
   * aiscan is an inclusive prefix sum across processors -- confirm. */
  aiscan((int*)bucket,(int  *)ppf_bucket,nprocs);
/* ai2prefix(operadd,nprocs,(char *)bucket,(char *)ppf_bucket,intsize); */

  /**** PART 10: Rearrange keys according to bucket destination */
  /* lbucket[i] is the index in the re-ordered array  temp_result
   * of the first input-key destined to bucket i */
  /* Precomputation step */
   lbucket[0]=0;
   for(i=1;i<nprocs;i++) {
      lbucket[i] =lbucket[i-1]+bucket[i-1]; 
   }
   /* Rearrangement step :  keys -> temp_result */
   for(i=0;i<n;i++) {
      temp_result[lbucket[bucket_index[i]]]= keys[i];
      lbucket[bucket_index[i]]++;
   }
   /* From here on lbucket[i] is one past the last key of bucket i, so
    * bucket i starts at lbucket[i]-bucket[i] */
   free((void*)bucket_index);
   memcpy((char*)keys,(char*)temp_result,n*size);
   free((void*)temp_result);


   /**** PART 11: Route rearranged keys to destination processor */
   /* The last processor tells each processor i how many keys it receives */
   AIMREGISTER(&proc_size,sizeof(int));
   AIMCOMMIT(); AIMCOMSTART(&proc_size);
   if (pid == (nprocs-1)){
     for(i=0;i<nprocs;i++)
	AIMHPPUT(i,&ppf_bucket[i],&proc_size,0,intsize);
   }
   AIMCOMEND(&proc_size);
   temp_result = (idata *) malloc(proc_size*size);
   AIMREGISTER(temp_result,proc_size*size);
   AIMCOMMIT();
#ifdef ROUTEONE
   /* Send the non-empty buckets in order 0,1,...,nprocs-1 */
   AIMCOMSTART(temp_result);
   for(i=0;i<nprocs;i++) {
      if (bucket[i] !=0)
	AIMHPPUT(i,(char*)&keys[(lbucket[i]-bucket[i])],
		  (char*)temp_result,(ppf_bucket[i]-bucket[i])*size,
		  bucket[i]*size);  
   }
   AIMCOMEND(temp_result);
#else
   /* Send the non-empty buckets in order 1,2,...,nprocs-1,0.
    * NOTE(review): the order is the same on every processor; if a
    * per-processor staggering was intended, (pid+k+1)%nprocs would be
    * needed -- confirm. */
   AIMCOMSTART(temp_result);
   for(k=0;k<nprocs;k++) {
      i=(k+1)%nprocs;
      if (bucket[i] !=0)
	AIMHPPUT(i,(char*)&keys[(lbucket[i]-bucket[i])],
		  (char*)temp_result,(ppf_bucket[i]-bucket[i])*size,
		  bucket[i]*size);  
   }
   AIMCOMEND(temp_result);
#endif

   /**** PART 11a: Start merging received sequences */
   /* The received keys are simply re-sorted with localsort; KWAYMERGE
    * selected the very same call, so the conditional was dropped. */
   (*localsort)((idata *)temp_result,proc_size);

   /**** PART 12: Ready to exit. Free space */
  AIMDEREGISTER(temp_result);
  AIMDEREGISTER(&proc_size);

  free((void*)lbucket);
  free((void*)ppf_bucket); 
  free((void*)bucket);
  free((void*)split);
  free((void*)ssample);
  free((void*)sample);

  *result=(idata*) temp_result;
  return(proc_size);
}
