#!/bin/bash
# (C) Copyright 2006-2016 Hewlett Packard Enterprise Development, L.P.
# @(#) Serviceguard Disk Monitor Script
# @(#) Product Name                :  HP Serviceguard
# @(#) Product Version             :  A.12.10.00
# @(#) Patch Name                  :  
#
#
# NOTE:    This script is not configurable!  Any changes made to this
#          script will be overwritten when you upgrade to the next
#          release of Serviceguard.
#

# Print the command-line synopsis on stdout and terminate the script.
# Arguments: none
# Exits:     always with status 1, including when reached via -h/--help
function usage
{
    echo "usage: cmresserviced [-h, --help] [-v, --version]"
    echo "                     [-f, --log-file <log_file>]"
    echo "                     [-l, --log-level <1-2>]"
    echo "                     [-p, --port <port_number>]"
    echo "                     [-t, --poll-interval <seconds>]"
    echo "                     [<disk_path> [<disk_path>...]] |"
    echo "                     [<vg> [<vg>...]] |"
    echo "                     [<multipath_device_name> [<multipath_device_name>...]]"
    echo ""
    echo "multipath_device_name = name of multipath device"
    echo "without partition information."
    echo "Ex: Default Multipath Device /dev/mapper/mpath0"
    echo "Ex: Alias Multipath Device /dev/mapper/yellow"
    echo ""
    echo "-f, --log-file, -p, --port are all ignored but exist for"
    echo "backward compatibility."
    exit 1
}

# Defaults used by the rest of the script.
MAX_RETRYS=5      # retries granted to a device after a failed check before the monitor exits
TIMEOUT=60        # seconds to sleep between polls; replaced by -t/--poll-interval
LOG_LEVEL=0       # highest message level that log prints; replaced by -l/--log-level
# Process name for log lines. Parameter expansion is used instead of
# an unquoted `basename $0`, which mis-parses a path containing
# whitespace.
MYPROC=${0##*/}

# Load the Serviceguard settings for this distribution. The file is
# expected to exist on every valid installation; the guard only
# catches a broken one.
# NOTE(review): presumably this is where SGRUN and SGSBIN (used by
# check_disk) come from - confirm against the installed file.
if [[ ! -f /etc/cmcluster.conf ]]
then
    printf "This system is missing the /etc/cmcluster.conf file.\n"
    printf "Must not have a valid version of Serviceguard installed!\n"
    exit 1
fi
. /etc/cmcluster.conf

# sed -r expressions that drop a trailing partition suffix (_partN,
# -partN or pN) from a device-mapper name. The value is expanded
# unquoted by its users so that every -e expression becomes a
# separate sed argument.
DEVICE_MAPPER_PATTERN='-e s/_part[0-9]+$// -e s/-part[0-9]+$// -e s/p[0-9]+$//'

# Mark systems whose /etc/redhat-release mentions "Maipo"; OS_Version
# is left untouched everywhere else.
if [[ -f /etc/redhat-release ]] && /bin/grep "Maipo" /etc/redhat-release >/dev/null 2>&1
then
    OS_Version="RH7"
fi

# Emit a timestamped message if its level is enabled.
# Globals:   LOG_LEVEL (read), MYPROC (read), LOG_FILE (read)
# Arguments: $1 - numeric level of this message
#            remaining arguments - message text, joined with spaces
# Outputs:   one line "<date> <MYPROC>[<pid>] <message>", appended to
#            $LOG_FILE when that is non-empty, otherwise on stdout;
#            nothing when $1 is greater than LOG_LEVEL
function log
{
    local level=$1
    shift
    if [[ $level -le $LOG_LEVEL ]]
    then
        local line
        line="$(date +'%b %d %k:%M:%S') $MYPROC[$$] $*"
        # The line is printed quoted: left unquoted, the "name[pid]"
        # prefix and any "*" or "?" in the message are treated as glob
        # patterns and may be replaced by file names from the current
        # directory. As a consequence runs of blanks (for example the
        # space-padded %k hour) are now kept verbatim.
        if [[ -n $LOG_FILE ]]
        then
            printf '%s\n' "$line" >> "$LOG_FILE"
        else
            printf '%s\n' "$line"
        fi
    fi
}

# Run a single cmcheckdisk probe against one device and exit with the
# outcome. The function always terminates the shell it runs in, so it
# is meant to be called inside a subshell.
# Globals:   SGRUN (read), SGSBIN (read)
# Arguments: $1 - device path handed to cmcheckdisk
# Exits:     0 when cmcheckdisk succeeds, 1 when it fails
function check_disk
{
    local device=$1
    log 1 "Checking $device"
    # A failed cd is not fatal here: the probe then simply runs from
    # the current directory, as it always has.
    cd "$SGRUN"
    # All expansions are quoted so that paths are passed through as a
    # single word and never glob-expanded.
    if ! "$SGSBIN/cmcheckdisk" "$device"
    then
        log 0 "Failed to read from $device"
        exit 1
    fi
    log 2 "Finished checking $device"
    exit 0
}

# parse arguments
# The informational options are only honoured as the very first
# argument; usage exits on its own.
case $1 in
    -h|--help)
        usage
        ;;
    -v|--version)
        log 0 "Version is 2.0"
        exit 0
        ;;
esac

# Consume the leading options. Each recognised option takes one value.
# The option and its value are shifted one at a time (not "shift 2")
# so that a trailing option without a value still ends the loop.
while [[ $1 == -* ]]
do
    case $1 in
        -f|--log-file)
            shift
            LOG_FILE="" # Ignore logfile so that it goes to package log
            shift
            ;;
        -l|--log-level)
            shift
            LOG_LEVEL=$1
            shift
            ;;
        -p|--port)
            shift
            PORT=$1
            shift
            ;;
        -t|--poll-interval)
            shift
            TIMEOUT=$1
            shift
            ;;
        *)
            # Unknown option: usage prints the synopsis and exits.
            usage
            ;;
    esac
done

# Translate a /dev/mapper/<name> path into the /dev/<node> path that
# multipath reports for that map (the name expected to appear in
# /proc/partitions) and print it on stdout.
# Globals:   OS_Version (read), DEVICE_MAPPER_PATTERN (read)
# Arguments: $1 - path starting with /dev/mapper/
# Outputs:   "/dev/<node>", or just "/dev/" when multipath reports
#            nothing for the map (rejected later by the -b check)
function resolve_mapper_device
{
    # Strip off the /dev/mapper/ part.
    local dm=${1##/dev/mapper/}
    local dm_uuid="" output="" real_unit=""

    if [ "$OS_Version" = "RH7" ]
    then
        # Look the map up through the last "-"-separated field of the
        # UUID line printed by dmsetup.
        dm_uuid=$(dmsetup info "$dm" | grep UUID | awk -F- '{print $NF}')
        # An empty key must not be used as a grep pattern: it would
        # match every line of the multipath listing.
        if [[ -n $dm_uuid ]]
        then
            output=$(multipath -ll "$dm_uuid" | grep -- "$dm_uuid")
        fi
    else
        # in case someone gives /dev/mapper/mpathXpY or
        # /dev/mapper/mpathX_partY versus just mpathX
        # (DEVICE_MAPPER_PATTERN is split into sed arguments on purpose)
        dm=$(printf '%s\n' "$dm" | sed -r $DEVICE_MAPPER_PATTERN)
        if [[ -n $dm ]]
        then
            output=$(multipath -ll "$dm" | grep -- "$dm")
        fi
    fi

    # When the matched text contains a parenthesis the node name is
    # taken from the third field, otherwise from the second. All
    # matched lines are flattened into one record first.
    real_unit=$(printf '%s' "$output" | tr '\n' ' ' |
        awk '{ if ($0 ~ /[()]/) print $3; else print $2 }')

    printf '%s\n' "/dev/$real_unit"
}

# Get the list of devices or vgs that were passed on the command
# line. They are resolved and collected, separated by spaces, in
# DEVICES.
UNITS=$*
DEVICES=""

if [[ "X$UNITS" = "X" ]]; then
    usage
fi
for UNIT in $UNITS
do
    # Ask LVM once; both the exit status and the message are needed.
    vg_report=$(vgs "$UNIT" 2>&1)
    vg_status=$?
    if (( vg_status != 0 )); then
        # If the unit passed in is not a vg then it might be a valid
        # device. It is added to DEVICES and checked later. An MPIO
        # device (/dev/mapper/mpathX) is first replaced by the real
        # device (/dev/dm-X).
        if [[ ${UNIT} == /dev/mapper/* ]]; then
            UNIT=$(resolve_mapper_device "$UNIT")
        fi
        DEVICES="${DEVICES} ${UNIT}"
    elif grep -qi "Invalid volume group" <<<"$vg_report"; then
        # vgs succeeded but still complained about the name: treat
        # the unit as a plain device and let the later check decide.
        DEVICES="${DEVICES} ${UNIT}"
    else
        # Ok we are a vg. Get its pv's and add those to DEVICES.
        PVS=$(vgdisplay -v "${UNIT}" 2>/dev/null | awk '/PV Name/ {print $3}')
        for pv in $PVS
        do
            if [[ ${pv} == /dev/mapper/* ]]; then
                pv=$(resolve_mapper_device "$pv")
            fi
            DEVICES="${DEVICES} ${pv}"
        done
    fi
done

# Nothing to monitor: print the synopsis (usage exits).
[[ -n $DEVICES ]] || usage

# Record every device that is a block special file in DEVICE_NAME,
# with a matching retry counter starting at zero. The first entry
# that is not a block device aborts the script.
typeset -i DEVICE_COUNT=0
for DEVICE in $DEVICES
do
    if [[ -b $DEVICE ]]
    then
        DEVICE_NAME[DEVICE_COUNT]=$DEVICE
        retry[DEVICE_COUNT]=0
        (( DEVICE_COUNT += 1 ))
    else
        log 0 "$DEVICE is not a valid disk device or VG"
        exit 1
    fi
done

log 0 "Starting disk monitoring"

# Monitor forever. Every pass starts one background check per device,
# sleeps for the poll interval, then inspects each check: a check that
# is still running is killed, and its exit status decides whether the
# device's retry counter is reset or incremented. The script exits
# with status 1 once a device fails with its counter already at
# MAX_RETRYS.
while true
do
    typeset -i INDEX=0
    # check all disks in the background
    while (( INDEX < DEVICE_COUNT ))
    do
        # check disk in background so we can time it out if needed
        ( check_disk "${DEVICE_NAME[$INDEX]}" ) &
        CHECK_PID[$INDEX]=$!
        (( INDEX = INDEX + 1 ))
    done

    # wait for a while
    log 2 "Sleeping $TIMEOUT"
    sleep "$TIMEOUT"

    typeset -i INDEX=0
    typeset -i retv=0
    typeset -i pid=0

    while (( INDEX < DEVICE_COUNT ))
    do
        if (( retry[$INDEX] > 0 ))
        then
            log 0 "Retrying Disk check for ${DEVICE_NAME[$INDEX]}... ${retry[$INDEX]} of $MAX_RETRYS times."
        fi

        # See if the check has finished: while its /proc status still
        # names this script as the parent, the subshell is alive.
        grep -q "PPid:[[:space:]]$$$" /proc/${CHECK_PID[$INDEX]}/status >/dev/null 2>&1
        if [[ $? -eq 0 ]]
        then
            # check process is still running, too bad
            # Find the child of the check subshell (0 when none).
            pid=$(UNIX95=y ps --ppid ${CHECK_PID[$INDEX]} -o pid=)
            if [[ $pid -ne 0 ]]
            then
                ps -p $pid | grep cmcheckdisk >/dev/null 2>&1
                if (( 0 == $? ))
                then
                    log 0 "Timed-out ${DEVICE_NAME[$INDEX]},terminating $pid"
                    # Get the process table before killing the process
                    if [[ "X$LOG_FILE" != "X" ]]
                    then
                        echo "$(ps -ef)" >> "$LOG_FILE"
                    else
                        echo "$(ps -ef)"
                    fi
                    # Kill the child
                    kill -s KILL $pid
                    if (( $? != 0 ))
                    then
                        # Couldn't kill cmcheckdisk
                        log 0 "Could not terminate cmcheckdisk $pid"
                    else
                        log 0 "terminated cmcheckdisk $pid."
                    fi
                fi
            else
                # Kept as a single string: split over two lines
                # without a continuation, the second half was run as
                # a command instead of being logged.
                log 0 "cmcheckdisk exited, but the parent at ${CHECK_PID[$INDEX]} is still around"
            fi

            # Kill the parent if it is still running, it should
            # exit when cmcheckdisk is killed.
            grep -q "PPid:[[:space:]]$$$" /proc/${CHECK_PID[$INDEX]}/status >/dev/null 2>&1
            if [[ $? -eq 0 ]]
            then
                ps -p ${CHECK_PID[$INDEX]} | grep cmresserviced >/dev/null 2>&1
                if (( 0 == $? ))
                then
                    kill -s KILL ${CHECK_PID[$INDEX]}
                    if (( $? != 0 ))
                    then
                        # Couldn't kill parent of cmcheckdisk
                        log 0 "Could not terminate cmcheckdisk's parent ${CHECK_PID[$INDEX]}"
                    else
                        log 0 "terminated cmcheckdisk's parent ${CHECK_PID[$INDEX]}"
                    fi
                fi
            fi
        fi

        # see if the check succeeded
        wait ${CHECK_PID[$INDEX]}
        (( retv = $? ))

        if (( retv != 0 ))
        then
            # check logs error message
            log 0 "Disk check failed for ${DEVICE_NAME[$INDEX]} with $retv."

            # If we get more than MAX_RETRYS in a row, exit. Otherwise
            # try to monitor the disk again - this may be a transient
            # issue.
            if (( retry[$INDEX] >= MAX_RETRYS )); then
                log 0 "Device ${DEVICE_NAME[$INDEX]} timed-out ${retry[$INDEX]} times. Aborting."
                exit 1
            fi
            (( retry[$INDEX] = retry[$INDEX] + 1 ))
        else
            if (( retry[$INDEX] > 0 ))
            then
                log 0 "Disk Check for ${DEVICE_NAME[$INDEX]} has completed"
            fi
            (( retry[$INDEX] = 0 ))
        fi
        (( INDEX = INDEX + 1 ))
    done

done
