#! /users/jonh/bin/perl
# Installation locations of the BSP toolset (absolute paths fixed at
# install time).  $includedir and $bindir are appended to @INC further
# down so that "bsputil" can be required, and $bindir is also the root
# under which the helper programs are looked up ($bindir/bsparch,
# $bindir/<arch>/bsp_nowstartup, ...).
# $libdir and $mandir are not referenced in the visible part of this
# script -- presumably they are used by helpers in "bsputil" (e.g. the
# manual formatter); TODO confirm.
$bindir        = "//amd/tmp_mnt/garnet/fs25/jonh/BSP/bin/";
$includedir    = "//amd/tmp_mnt/garnet/fs25/jonh/BSP/bin/../include/";
$libdir        = "//amd/tmp_mnt/garnet/fs25/jonh/BSP/bin/../lib/";
$mandir        = "/amd/tmp_mnt/garnet/fs25/jonh/BSP/man";





















# Extend the module search path with the toolset directories and load the
# shared helper library.  Helpers used below but not defined in this file
# (backtick, grab_next_arg, ...) are presumably provided by it.
push(@INC, $includedir, $bindir);
require "bsputil";

# Turn on autoflush for STDERR so diagnostics appear immediately, then make
# STDOUT the default output handle again.
select(STDERR);
$| = 1;
select(STDOUT);

# Name used in messages: $0 with any leading directory part removed.
$Pgm = $0;
$Pgm =~ s|.*/||;

$Version        = "1.3, 25th Nov 1997";
$bug_reports_to = 'bsplib-bugs@comlab.ox.ac.uk';
$ShortUsage     = "\n$Pgm usage: for basic information, try the `-help' option\n";

# ---- Option defaults (overridden by the command line) ----
$Verbose  = 0;     # -v
$P        = 0;     # -npes / -p; 0 means "not given"
$PP       = 0;     # $P rounded up to a power of two (computed after parsing)
$HostList = "";    # -hostlist; empty means "use the default"

# Ask bsparch for the architecture and the communication device.  The copy
# under $bindir is tried first; if that yields an empty string the one found
# through the command search path is used instead.
$Arch = &backtick("$bindir/bsparch -arch");
if ($Arch eq "") {
  $Arch = &backtick("bsparch -arch");
}
$Device = &backtick("$bindir/bsparch -device");
if ($Device eq "") {
  $Device = &backtick("bsparch -device");
}

$Executable        = "";   # first non-option argument
$SplitOutput       = 0;    # -splitio
$ZeroLocal         = 0;    # -local => 1, -nolocal => -1
$DoLoad            = 1;    # cleared by -noload
$CheckPointRestart = 0;    # -restart, or set after an exit status of 123
$CheckPointListOk  = 0;    # set by runProgram when the checkpoint host list is fresh
$CheckPointExec    = "";




  # ---- Command-line parsing ----
  # Options are taken from the front of @ARGV.  The first argument that is
  # not a recognised option becomes the executable and stops the parsing, so
  # whatever is still in @ARGV afterwards is passed on to the program.
  $finished=0;
  ARG: while (($_ = $ARGV[0]) && !$finished) {
    shift(@ARGV);

    if (/^-(help|man)$/) {
      # Show the manual page and stop.
      &FormatManual("bsprun.1");
      exit(0);
    } elsif (/^-v$/) {
      $Verbose = 1;
    } elsif (/^-splitio$/) {
      $SplitOutput = 1;
    } elsif (/^-restart$/) {
      $CheckPointRestart = 1;
    } elsif (/^-local$/) {
      $ZeroLocal = 1;
    } elsif (/^-nolocal$/) {
      $ZeroLocal =-1;
    } elsif (/^-noload$/) {
      # -noload also implies -local.
      $DoLoad    = 0;
      $ZeroLocal = 1;
    } elsif (/^-hostlist$/) {
      $HostList = &grab_next_arg("-hostlist");
    } elsif (/^(-npes|-p)$/) {
      # The option's value must be a plain non-negative integer.
      $P = &grab_next_arg($1);
      unless ($P =~ /^\d+$/) {
        print STDERR "$Pgm: -npes argument must be ",
                     "an integer \"",$P,"\".\n";
        $Status++;
      }
      $P = int($P);
    } else {
      $Executable = $_;
      $finished   = 1;
    }
  }

  # ---- Validation ----
  unless (-e $Executable) {
    print STDERR "$Pgm: \"$Executable\" No such executable.\n";
    $Status++;
  }
  if ($Status>0) {
    print STDERR $ShortUsage;
    exit(1);
  }
  if ($Verbose) {
    print STDERR "$Pgm: ($Version)\n";
  }

  # ---- Defaults for anything not given on the command line ----
  if ($HostList eq "") {
    $HostList = $ENV{'HOME'} . "/.host.list";
  }
  if ($P==0) {
    # Fall back to the BSP_PROCS environment variable, then to one process.
    $P = $ENV{'BSP_PROCS'} ? $ENV{'BSP_PROCS'} : 1;
  }

  # $PP is the smallest power of two that is not below $P.
  $i = 1;
  while ($i < $P) {
    $i = $i*2;
  }
  $PP=$i;
  if ($Verbose) {
    print STDERR "$Pgm: upper bound of number of processes is $P\n";
  }

  # runProgram only comes back when the program was not exec'ed in place of
  # this script; the exit status is 1 in that case regardless of how the
  # program ended.
  &runProgram();
  exit(1);






# Comparator for sort(): orders values by their integer part, largest
# first (i.e. descending, despite what the name suggests).  Reads the
# sort variables $a and $b.
sub int_lt {
  my $first  = int($a);
  my $second = int($b);
  return($second <=> $first);
}






# Run the user's program via runCommand, and keep restarting it from its
# most recent consistent checkpoint for as long as it ends with exit
# status 123.
#
# On a restart ($CheckPointRestart set, either by -restart or by a previous
# exit status of 123):
#   * the directory of the executable is scanned for executable files named
#     <program>.cpr<n>_<P>.<seq>; a checkpoint <seq> is "consistent" when
#     exactly $P such files exist, and the highest consistent <seq> wins;
#   * $Executable is switched to <original>.cpr0_<P>.<seq>;
#   * if <dir>/.cprmasterlist is at most five minutes old, $CheckPointListOk
#     is set and two host lists are written from it: .cprzeroslist (for
#     bspnowstartup) and .cprrestlist (for the user executable).  The first
#     host of the master list is kept apart and the next $P-1 hosts are put
#     in random order.
#
# Exits with status 1 if no consistent checkpoint exists or a host list
# cannot be read or written.  Does not return at all when runCommand
# replaces this process with the program.
sub runProgram {
  local($exit_code);
  local($original_executable);
  local($exec_dir,$exec_base);
  local($i,$j,$zero);
  local(@hostlist);
  local(@activehosts);
  local(@checkpoints);
  local(@files);
  local(%recentCheckpoints);
  local($age);

  $original_executable = $Executable;
  # readdir() returns bare file names, so checkpoint files have to be
  # matched against the program's base name, not the path it was given as.
  ($exec_base = $original_executable) =~ s|.*/||;

  do {
    if ($CheckPointRestart) {
      @hostlist=();
      @activehosts=();
      @checkpoints=();
      %recentCheckpoints=();

      if ($Executable =~ /\/[^\/]*$/) {
        $exec_dir = $`;
      } else {
        $exec_dir = ".";
      }

      # -M gives the file's age in days; convert it to minutes.  The saved
      # host list is only trusted when it is no more than five minutes old.
      # An age of exactly 0 (which is also what a missing file yields) is
      # rejected.
      $age = (-M "$exec_dir/.cprmasterlist")*24.0*60.0;
      $CheckPointListOk = ($age<=5.0) && ($age!=0.0);

      if (!opendir(CWD,$exec_dir)) {
        print STDERR "$Pgm: unable to read directory \"$exec_dir\"\n";
        exit(1);
      }
      # \Q..\E keeps regex metacharacters in the program name literal; the
      # leading ^ stops e.g. "myprog.cpr..." being counted for "prog".
      @files = grep(/^\Q$exec_base\E\.cpr\d+_$P\.\d+$/, readdir(CWD));
      closedir(CWD);

      # Count, per checkpoint sequence number, how many executable
      # checkpoint files are present.
      foreach $i (@files) {
        if (-x "$exec_dir/$i") {
          if ($i =~ /cpr(\d+)_(\d+)\.(\d+)$/) {
            $recentCheckpoints{$3}++;
          }
        }
      }

      # Walk the sequence numbers from newest to oldest until one is found
      # for which all $P files exist.
      @checkpoints=sort int_lt (keys %recentCheckpoints);
      do {
        $i=shift(@checkpoints);
        $j=$recentCheckpoints{$i};
      } while (($#checkpoints>=0) && ($j!=$P));
      if ($j!=$P) {
        print STDERR "$Pgm: no consistent checkpoint available to restart\n";
        exit(1);
      }
      print STDERR "$Pgm: most recent checkpoint is $i\n" if $Verbose;
      $Executable = $original_executable . ".cpr0_$P.$i";

      if ($CheckPointListOk) {
        if (!open(HOSTLIST,"$exec_dir/.cprmasterlist")) {
          print STDERR "$Pgm: unable to read checkpointed host list\n";
          exit(1);
        }
        @hostlist = <HOSTLIST>;
        close(HOSTLIST);
        $zero=shift(@hostlist);

        if (!open(HOSTZERO,"> $exec_dir/.cprzeroslist")) {
          print STDERR "$Pgm: unable to write to checkpoint host list ",
                       "for bspnowstartup\n";
          exit(1);
        }
        if (!open(HOSTREST,"> $exec_dir/.cprrestlist")) {
          print STDERR "$Pgm: unable to write to checkpoint host list ",
                       "for user executable\n";
          exit(1);
        }

        # The next $P-1 hosts (fewer if the list is short) are the active
        # ones; whatever remains in @hostlist is spare.
        @activehosts = splice(@hostlist, 0, $P-1);

        # Fisher-Yates shuffle: every index stays inside the array and every
        # ordering is equally likely.
        for ($i=$#activehosts; $i>0; $i--) {
          $j = int(rand($i+1));
          @activehosts[$i,$j] = @activehosts[$j,$i];
        }

        # List for bspnowstartup: first host, then spare hosts, then the
        # shuffled active hosts.
        print HOSTZERO $zero;
        print HOSTZERO "# Not really interested in the rest\n";
        foreach $i (@hostlist, @activehosts) {
          print HOSTZERO $i;
        }
        close(HOSTZERO);

        # List for the user executable: shuffled active hosts, then the
        # first host and the spare hosts.
        foreach $i (@activehosts) {
          print HOSTREST $i;
        }
        print HOSTREST "# Not really interested in the rest\n";
        print HOSTREST $zero;
        foreach $i (@hostlist) {
          print HOSTREST $i;
        }
        close(HOSTREST);
        &run_something("sync","make files NFS visible");
      }
    }
    $exit_code=&runCommand();
    $CheckPointRestart = $exit_code==123;
  } while ($exit_code==123);
}

# Random comparator: returns -1, 0 or 1, each with equal probability.
#
# The earlier form "rand()%3-1" could only ever return -1: "%" works on
# integers, so the value of rand() (always below 1) was truncated to 0
# before the modulus was taken.
sub randomise {
  return(int(rand(3)) - 1);
}





# Build the launch command for the current $Device / $Arch and run
# "$Executable @ARGV" through it.
#
# For most targets the command replaces this process (exec), so the sub
# does not return.  For the TCP/IP, UDP/IP and 3C905B devices and for
# SHMEM_WIN32 the command is run as a child and its exit status is returned;
# the caller uses that to detect a checkpoint restart request.
#
# Returns: the child's exit status, or 1 (after a message on STDERR) if the
# command could not be started at all.
sub runCommand {
  local($cmd,$do_exec);

  $do_exec=1;
  if      (($Device eq "MPASS_MPI") && ($Arch =~ /^SGI/)) {
    $cmd = "env MPI_DEFAULT_NP=$P ";

  } elsif ($Device eq "MPASS_MPI") {
    $cmd = "mpirun -machinefile $HostList -np $P ";

  } elsif ($Arch eq "CRAYT3D") {
    # mppexec is given $PP, the process count rounded up to a power of two.
    $cmd = "env BSP_PROCS=$P mppexec -npes $PP ";

  } elsif ($Arch eq "CRAYT3E") {
    $cmd = "mpprun -n $P ";

  } elsif ($Arch eq "SP2") {
    $cmd = "env MP_PROCS=$P BSP_PROCS=$P BSP_EXEC=$Executable ";

  } elsif ($Device eq "MPASS_PARMACS") {
    $cmd = $bindir . "/" . $Arch . "/bsp_spmdhost $P ";

  } elsif ($Device eq "MPASS_TCPIP" ||
           $Device eq "MPASS_UDPIP" ||
           $Device eq "MPASS_3C905B") {
    $cmd=&tcpip_startup_cmd();
    $do_exec=0;

  } elsif ($Device eq "SHMEM_WIN32") {
    # No launcher prefix: the executable is run directly.
#    $cmd1 = "set BSP_PROCS=$P";
#    $cmd2 = "set BSP_EXEC=$Executable";
    $do_exec=0;

  } else {
    $cmd = "env BSP_PROCS=$P BSP_EXEC=$Executable ";
  }
  $cmd .= " " . $Executable . " " . join(' ',@ARGV);
  print STDERR "$Pgm: execing \"$cmd\"\n" if $Verbose;

  if (!$do_exec) {
    system($cmd);
    if ($? == -1) {
      # The child could not be started; shifting -1 would give a
      # meaningless exit code.
      print STDERR "$Pgm: unable to run \"$cmd\": $!\n";
      return(1);
    }
    return($?>>8);
  }

  # exec only returns when the command could not be started.
  exec($cmd) || print STDERR "$Pgm: unable to exec \"$cmd\": $!\n";
  return(1);
}




# Build the launcher prefix for the TCP/IP-style devices.  The caller appends
# the executable and its arguments to the returned string.
#
# Two shapes are produced:
#   * "run here":  env BSP_PROCS=$P ... BSP_EXEC=$Executable
#     used when -local is in effect ($ZeroLocal == 1), or when neither -local
#     nor -nolocal was given, no fresh checkpoint host list exists, and the
#     load daemon client reports this host at rank <= $P;
#   * "run via bsp_nowstartup":  a two-process bsp_nowstartup command line
#     with the "run here" prefix appended as its arguments.
#
# When neither -local nor -nolocal was given and bsp_nowstartup is not
# installed, the sub warns and switches the global $ZeroLocal to 1, so the
# "run here" shape is used.
#
# Exits with status 1 if the load daemon client cannot be started.
sub tcpip_startup_cmd {
  local($cmd,$rcmd,$home,$pwd,$exec_dir);
  local($i,$rank,$exec_bindir,$hostname);
  local($here_cmd,$load_cmd,$cpr_cmd) = ("","","");

  $home=$ENV{'HOME'};
  $home .= "/" if (!($home=~/\/$/));
  $pwd =$ENV{'PWD'};
  $pwd   .= "/" if (!($pwd=~/\/$/));
  print STDERR "$Pgm: home=$home Pwd=$pwd\n" if $Verbose;

  # BSP_PWD is the working directory with a leading $HOME removed when the
  # working directory lies below $HOME, and the full path otherwise.
  if ($home eq (substr($pwd,0,length($home)))) {
    $exec_dir = substr($pwd,length($home));
  } else {
    $exec_dir = $pwd;
  }
  $exec_dir .= "/" if $exec_dir ne "";

  # chomp, not chop: only a trailing newline is removed, so the name stays
  # intact whether or not the helper already stripped it.
  chomp($hostname=&backtick("hostname"));

  if ($ZeroLocal==0 && !(-e "$bindir/$Arch/bsp_nowstartup")) {
    print STDERR "$Pgm: **warning** defaulting to -local as bsp_nowstartup\n";
    print STDERR "$Pgm: has not been installed. \n";
    # Do what the warning announces; otherwise a command using the missing
    # bsp_nowstartup could still be built below.
    $ZeroLocal = 1;
  }
  $here_cmd=" BSP_LOCAL=$hostname" if ($ZeroLocal<0);
  $load_cmd=" BSP_DOLOAD=1"        if ($DoLoad && !$CheckPointListOk);
  $cpr_cmd =" BSPTCP_HOSTNAMES=$pwd/.cprrestlist" if $CheckPointListOk;

  $rcmd= "env BSP_PROCS=$P ".
         " BSP_SPLITOUTPUT=" . $SplitOutput .
         " BSP_PWD=$exec_dir" .
         $here_cmd.
         $load_cmd.
         $cpr_cmd.
         " BSP_EXEC=$Executable ";

  return $rcmd if ($ZeroLocal==1);

  if (($ZeroLocal==0) && (!$CheckPointListOk)) {
    # Ask the load daemon client for its host list and find the 1-based
    # position of this host in it.
    $cmd = $bindir . "/". $Arch . "/".
           "bsploadd -i 1 -f $HostList -s localhost -u 1 ";
    print STDERR "$Pgm: piped output from $cmd\n" if $Verbose;
    if (!open(BSPLOAD, "$cmd |")) {
      print STDERR "$Pgm: unable to execute load daemon client.\n";
      print STDERR "$Pgm: client is $cmd.\n";
      exit(1);
    }
    $i=1;
    while(<BSPLOAD>) {
      chomp($_);
      $rank=$i if ($hostname eq $_);
      $i++;
    }
    close(BSPLOAD);
    print STDERR "$Pgm: local host has rank $rank.\n" if $Verbose;
    # NOTE(review): when this host is not in the client's output $rank stays
    # undefined, which compares as 0 and therefore also selects the "run
    # here" shape -- confirm that this is intended.
    return $rcmd if ($rank<=$P);
  }

  # Same $HOME-relative treatment for the directory holding the binaries.
  if ($home eq (substr($bindir,0,length($home)))) {
    $exec_bindir = substr($bindir,length($home)) ."/";
  } else {
    $exec_bindir = $bindir . "/";
  }

  $cmd= "env BSP_PROCS=2 ".
        " BSP_SPLITOUTPUT=" . $SplitOutput .
        " BSP_PWD=$exec_bindir/$Arch/".
        $here_cmd.
        $load_cmd.
        " BSP_EXEC=bsp_nowstartup";

  $cmd.=" $bindir/$Arch/bsp_nowstartup $exec_dir ".
        $rcmd;
  return $cmd;
}

