#!/usr/bin/env perl

##**************************************************************
##
## Copyright (C) 1990-2007, Condor Team, Computer Sciences Department,
## University of Wisconsin-Madison, WI.
## 
## Licensed under the Apache License, Version 2.0 (the "License"); you
## may not use this file except in compliance with the License.  You may
## obtain a copy of the License at
## 
##    http://www.apache.org/licenses/LICENSE-2.0
## 
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
##
##**************************************************************


#
# Script to set/check whether Condor is running on compute cluster nodes.
#
# 14-Jan-2000 Derek Wright <wright@cs.wisc.edu>
# 21-Jan-2000 Peter Couvares <pfc@cs.wisc.edu> many fixes/improvements
# 25-Feb-2000 Peter Couvares <pfc@cs.wisc.edu> added logging
# 15-Mar-2000 Derek Wright <wright@cs.wisc.edu> fixed bugs reported by
#             wylie@cs.wisc.edu
# 19-Apr-2000 Peter Couvares <pfc@cs.wisc.edu> made 'off' persistent
#             across reboots
# 27-Apr-2000 Peter Couvares <pfc@cs.wisc.edu> added check to make
#             sure the host this is being run from is authorized to
#             turn condor on/off; also added logging of the hostname
# 17-Jul-2000 Erik Paulson <epaulson@cs.wisc.edu> updated 'check' so it
#             lists outstanding checkouts from the logfile.
# 25-Jul-2000 Peter Couvares <pfc@cs.wisc.edu> fixed status checking
#             to ask nodes directly so fast shutdown works reliably
#
# 17-Sep-2000 Erik Paulson <epaulson@cs.wisc.edu> added condor_findhost 
#             to the 'off' command; 
# 04-Oct-2000 Erik Paulson <epaulson@cs.wisc.edu> fix check_auth() to
#             know about the database nodes, and update the usage
#
# This is a very simple, first pass to solve this problem.
# Future work required:
#   reservations into the future
#   length of reservations
#   automatic checking of stale/idle reservations
#   ...
#
# Usage:
# condor_cluster_tool off N
#     turns off Condor on N nodes, and prints their hostnames to STDOUT
# condor_cluster_tool on [hosts]
#     turns on Condor on the given hosts
# condor_cluster_tool check [hosts]
#     prints the status of Condor on all nodes by default, or on only
#     the given hosts if specified

use English;
use FileHandle;
use Sys::Hostname;

### Config variables

       $RELEASE_DIR = "/unsup/condor";
          $NUMNODES = 64;  # (beware: we assume $NUMNODES<=99 in a few places)
	   $DBNODES = 36;
         $STARTNODE = 1;
       $CONDOR_POOL = "condor.cs.wisc.edu";
           $LOGFILE = "/p/condor/workspaces/cluster/condor_cluster_tool.log";
             $DEBUG = 0;


### Global variables (not user-servicable)

            $SBIN = "$RELEASE_DIR/sbin";
             $BIN = "$RELEASE_DIR/bin";
      $CONDOR_OFF = "$SBIN/condor_off -pool $CONDOR_POOL -fast";
       $CONDOR_ON = "$SBIN/condor_on -pool $CONDOR_POOL";
 $CONDOR_FINDHOST = "$BIN/condor_findhost -m -pool $CONDOR_POOL";  
      $CONFIG_OFF = "$BIN/condor_config_val -pool $CONDOR_POOL " .
                    "-set 'start_daemons = false'";
       $CONFIG_ON = "$BIN/condor_config_val -pool $CONDOR_POOL " .
                    "-unset start_daemons";
   $CONDOR_STATUS = "$BIN/condor_status -pool $CONDOR_POOL -direct";
 $CONDOR_RECONFIG = "$SBIN/condor_reconfig -pool $CONDOR_POOL";
$CONDOR_ALLOW = "$BIN/condor_config_val -pool $CONDOR_POOL "; 
  $CONDOR_DBALLOW = "-name db31 allow_administrator";
  $CONDOR_CCALLOW = "-name c33 allow_administrator";
  $CCLUSTER_CONST = " -c 'IsComputeCluster==TRUE'";
 $DBCLUSTER_CONST = " -c 'IsDBCluster==TRUE'";
   $GIGABIT_CONST = " -c 'HasGigabit==TRUE'";
       $USE_CONST = $CCLUSTER_CONST;


### The guts

# remember, $#ARGV is always 1 less than the # of args...
if( $#ARGV < 0 ) {
    usage();
    exit 1;
}

# determine username & hostname and then open logfile
$username = scalar getpwuid( $UID );
$hostname = hostname();
$user = "$username\@$hostname";
open( LOG, ">>$LOGFILE" ) || warn "$0: error opening $LOGFILE: $!\n";

# make sure initial logging occurs even if user kills us mid-way
LOG->autoflush();

# log command-line args
print LOG timestamp() . " $user";
foreach $arg ( @ARGV ) {
    print LOG " $arg";
}
print LOG "\n";

# parse args
$cmd = shift;    # "on, "off", "check", etc.
@args = @ARGV;   # args to prior command, if any

if( $cmd =~ /^ch/ )
{
    if( $#args < 0 )
    {
	foreach $n ($STARTNODE .. 9)
	{
	    push @args, "c0$n";
	}
	foreach $n (10 .. $NUMNODES)
	{
	    push @args, "c$n";
	}
	foreach $n ($STARTNODE .. 9)
	{
	    push @args, "db0$n";
	}
	foreach $n (10 .. $DBNODES)
	{
	    push @args, "db$n";
	}
	$print_summary = 1;
    }
    $on_total = 0;
    $off_total = 0;
    check_condor( @args );
    print "$on_total nodes running Condor, $off_total not running Condor\n"
	if $print_summary;
}
elsif( $cmd eq "on" )
{
    if( $#args < 0 )
    {
	usage();
	exit 1;
    }
    start_condor( @args );
}
elsif( $cmd =~ /^of/ )
{
    if( $#args < 0 )
    {
	usage();
	exit 1;
    }
    stop_condor( @args );
}
else
{
    usage();
    exit 1;
}

close LOG || warn "$0: error closing $LOGFILE: $!\n";
exit 0;


### Subroutines

sub usage()
{
    print STDERR
	"Usage: $0 off N [-cnode] [-dbnode] [-gigabit]\n",
	"       $0 on hosts [-a]\n",
	"       $0 check [hosts]\n",
	"\n",
	"       off   Stop Condor on any N nodes, and print their hostnames ",
	"to STDOUT\n",
	"            -cnode:   Pick from the Compute Cluster (default)\n ",
	"            -dbnode:  Pick from the Database Cluster\n ",
 	"            -gigabit: Pick from the nodes with gigabit\n ",     
    "\n",
	"       on    Start Condor on the given hosts\n",
	"             -a: Return all nodes that you currently hae checked out\n",
	"\n",
	"       check Print state of Condor on all nodes (or specific nodes, if given)\n",
	"\n",
	"       Examples:\n",
	"          $0 off 4\n",
	"          $0 off 2 -dbnode\n",
	"          $0 on c03 c05 c07 c09\n",
	"          $0 on -a\n",
	"          $0 check c03 c04\n",
	"\nFor help, please send mail to c-cluster-admin\@cs.wisc.edu\n";
}


sub check_auth()
{
    $DBCHECK = $CONDOR_ALLOW . $CONDOR_DBALLOW;
    $CCCHECK = $CONDOR_ALLOW . $CONDOR_CCALLOW;

    $db_auth_hosts = `$DBCHECK`;
	#print "$DBCHECK\n";
    $cc_auth_hosts = `$CCCHECK`;
    unless( ($cc_auth_hosts =~ /$hostname/ )||($db_auth_hosts =~ /$hostname/)) {
	print "Error: current host ($hostname) is not authorized ";
	print "to manage nodes\n\n";
	print "Please use one of the following authorized hosts for "; 
	print "compute cluster nodes:\n ";
	print "$cc_auth_hosts\n";
	print "Or use one of the following hosts for DB nodes:\n ";
	print "$db_auth_hosts\n";
	print "...or contact c-cluster-admin\@cs.wisc.edu with any ";
	print "questions.\n\n";
	exit 1;
    }
}

sub everyone() {

	foreach $n ($STARTNODE .. 9)
	{
	    push @allnodes, "c0$n";
	}
	foreach $n (10 .. $NUMNODES)
	{
	    push @allnodes, "c$n";
	}
	foreach $n ($STARTNODE .. 9)
	{
	    push @allnodes, "db0$n";
	}
	foreach $n (10 .. $DBNODES)
	{
	    push @allnodes, "db$n";
	}

	return(@allnodes);
}

sub start_condor
{
    my (@hosts);
    %nodes = ();
    foreach $arg (@_) {
        if( $arg eq "-a") {
            @allhosts = everyone();
            parse_logfile();
            foreach $host (@allhosts)
            {
                $reserver = ${$nodes{$host}}[1];
                $reserver =~ s/\@.*$//;
                if ( $reserver eq $username)
                {
                    #print " (reserved by $reserver at ";
                    #print "${$nodes{$host}}[0])";
                    #print "$user has $host checked out\n";
                    push(@hosts, $host);
                }
            }
         }
         else {
             push(@hosts, $arg);
        }
    }

    check_auth();

    foreach $host ( @hosts )
    {
	# if the user just specified a number, translate to a "cXX" node name
	if( $host =~ /^\d+$/ )
	{
	    if( $host < 10 )
	    {
		$host = "c0$host";
	    }
	    else
	    {
		$host = "c$host";
	    }
	}
	print LOG timestamp() . " $user    (returned $host)\n";
	#print timestamp() . " $user    (returned $host)\n";
	system( "$CONDOR_ON $host" );
	system( "$CONFIG_ON -name $host > /dev/null" );
	system( "$CONDOR_RECONFIG $host > /dev/null" );
    }
}


sub stop_condor {
    my ($num, $host, @hosts);

    foreach $arg (@_) {
         if( $arg eq "-gigabit") {
            $USE_CONST = $GIGABIT_CONST;
         }
         elsif ($arg eq "-dbnode") {
             $USE_CONST = $DBCLUSTER_CONST;
         }
         elsif ($arg eq "-cnode") {
			$USE_CONST = $CCLUSTER_CONST;
         }
         else {
			$num = $arg;
         }
    }

    if( $num =~ /\D/ ) {
        die "$0: error: $num is not a number, try again\n";
    }
    if( $num > $NUMNODES ) {
        die "$0: error: the cluster only has $NUMNODES machines, try again\n";
    }

    check_auth();

    @hosts = find_avail( $num );
    @hosts = sort(@hosts);

    foreach $host ( @hosts ) {
	kill_condor( $host );
    }

    foreach $host ( @hosts )
    {
	while( condor_up( $host ) )
	{
	    print "DEBUG: $host still up...\n" if $DEBUG;
	    sleep 1;
	}
	$host =~ s/\.cs\.wisc\.edu//;
	print $host, "\n";
	print LOG timestamp() . " $user    (given $host)\n";
    }
}


sub find_avail {
    my ($num) = @_;
    my ($found, $host, @avail);

    $found = 0;

    $CONDOR_FINDHOST_COMMAND = $CONDOR_FINDHOST . $USE_CONST . " -n $num";
    open ( GETMACHINES, "$CONDOR_FINDHOST_COMMAND 2>&1 |\n");
    while(<GETMACHINES>) {
        chomp;
        push @avail, $_;
    }

    close(GETMACHINES);
    $exit = ($? >> 8);
    $found = $exit - 1;
    die "$0: error: only $found nodes found running Condor, try again\n" if $exit > 0;
    return @avail;
}


sub kill_condor {
    my( $host ) = @_;
    `$CONDOR_OFF $host`;
    system( "$CONFIG_OFF -name $host > /dev/null" );
    system( "$CONDOR_RECONFIG $host > /dev/null" );
}

sub parse_logfile {
    my( $date, $user, $checkedout);

    $date = '';
    $user = '';
    $checkedOut = 'returned';

    open( CLUSTER_LOG, $LOGFILE ) || die "$0: error opening $LOGFILE: $!\n";
    while( <CLUSTER_LOG> )
    {
	if( /(given|returned)/ )
	{
	    chomp;
	    ($dow, $month, $day, $tod, $year, $who, $what, $which) = split;
	    $date = join (" ", $dow, $month, $day, $tod, $year);
	    $what =~ s/\(//;
	    $which =~ s/\)//;
	    #print "$which: $what at $date by $who\n";
	    if( $what eq "returned" )
	    {
		if( ${$nodes{$which}}[2] ne "given" )
	        {
		    #print "$who returning $which at $date",
		    #", which was never checked out! \n";
		}
	    
	        ${$nodes{$which}}[0] = "";
	        ${$nodes{$which}}[1] = "";
                ${$nodes{$which}}[2] = $what;
            }
            if( $what eq "given" )
            {
                if( ${$nodes{$which}}[2] ne "returned")
                {
                    #print "$who checked out $which at $date,",
                    #" which was already checked out by",
		    #" ${$nodes{$which}}[1] at",
		    #" ${$nodes{$which}}[0]\n";
                }
 		${$nodes{$which}}[0] = $date;
		${$nodes{$which}}[1] = $who;
		${$nodes{$which}}[2] = $what;
	    }

	}
    }
}

sub check_condor {
    my( @hosts ) = @_;
    
    %nodes = ();
    
    foreach $host ( @hosts )
    {
	#if the user just specifies a number, prepend the "cXX" node name
	if( $host =~ /^\d+$/ ) 
	{
	    if( $host < 10 ) 
	    {
		$host = "c0$host";
	    }
	    else
	    {
		$host = "c$host";
	    }
	}
	push( @{$nodes{"$host"}}, $date, $user, $checkedOut);  
    }

    parse_logfile();

    #foreach $node (sort keys %nodes) 

    foreach $host (@hosts)
    {
	$isUp = check_host($host);
	print "$host: $isUp";
	if ( ${$nodes{$host}}[2] eq "given")
        {
	    $reserver = ${$nodes{$host}}[1];
            $reserver =~ s/\@.*$//;
	    print " (reserved by $reserver at ";
	    print "${$nodes{$host}}[0])";
	}
        print"\n";
    }
}

sub check_host
{
    my $host = shift;
    my( $status );

    $up = condor_up( $host );

    if( $up == 1 )
    {
	$status = "Condor is on ";
	$on_total++;
    }
    elsif( $up == -1 )
    {
	$status = "unknown host";
    }
    else
    {
	$status = "Condor is off";
	$off_total++;
    }
    return $status;
}


sub timestamp {
    return scalar localtime();
}


sub condor_up()
{
    my $node = shift;

    open( CMD, "$CONDOR_STATUS $node 2>&1 |" )
	|| die "$0: error running \"$CONDOR_STATUS $node\": $!\n";

    while( <CMD> )
    {
	chomp;
	if ( /^(vm|slot)\d+\@$node/ || /^$node/ )
	{
	    return 1;
	}
	# will never happen with condor_status -direct ...
	elsif( /unknown host/ )
	{
	    return -1;
	}
    }
    return 0;
}
