#!/bin/bash

#    gfs2_lockgather - A script that gathers data for diagnosing GFS2 locking issues
#    Copyright 2012 Adam Drew <adrew@redhat.com>

#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.

#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.

#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.


#Set to true by -q/--quiet. The progress messages later in the script are
#only printed while this is false.
QUIET=false

#######################################
# Handle the command line arguments.
# Globals:   QUIET (written)
# Arguments: the arguments the script was started with
# Outputs:   help text, or one status message pair per node, on stdout
# Returns:   only when neither --allnodes/-a nor a help option was seen;
#            those options end the script with status 0 once handled.
#######################################
parse_args() {
	local arg node

	for arg in "$@"; do
		#case compares the argument as a single word, so empty arguments and
		#arguments containing spaces or glob characters are simply ignored
		case "$arg" in
			-a|--allnodes)
				#Handle running on all nodes
				#We gather via SSH on all nodes, even the local node.
				#We do this because determining which node name is the
				#node running the script is too much logic to be worth it.
				for node in $(ccs_tool lsnode | tail --lines=+5 | grep -v "Cluster name" | grep -v "Nodename" | awk '{print $1}'); do
					echo "Starting data gathering on $node..."
					#-f puts ssh in the background so the nodes gather in parallel
					ssh -q -f "root@$node" '/sbin/gfs2_lockgather -q'
					echo "gfs2_lockgather will log a message in /var/log/messages on $node when complete or if there is an error."
				done
				exit 0
				;;
			-q|--quiet)
				#Handle quiet mode
				QUIET=true
				;;
			-h|--help|--info)
				#Handle help request
				echo "gfs2_lockgather, version 1"
				echo "A script that gathers data for diagnosing GFS2 locking issues."
				echo "---------------------------------------------------------------"
				echo "To gather on a single node invoke the script with no arguments."
				echo "To see this message use --help, --info, or -h."
				echo "To run with messages supressed use --quiet or -q."
				echo "To gather on all nodes invoke the script with --allnodes or -a."
				echo "Only 1 instance of gfs2_lockgather may run on a node at a time."
				echo ""
				exit 0
				;;
		esac
	done
}

parse_args "$@"

#Take the lock file. We only want one instance running at a time.
#The file is created with noclobber set, which makes the redirection fail if
#the file already exists. Checking and creating are therefore one step, and
#two instances started together cannot both get past this point.
if ! ( set -o noclobber; : > /var/run/gfs2_lockgather.lock ) 2> /dev/null; then
	if [ -e /var/run/gfs2_lockgather.lock ]; then
		echo -ne 'Error: Lock file /var/run/gfs2_lockgather.lock found.\nAnother instance of gfs2_lockgather may be running.\nAnother node may be running a gather on this node.\n' >&2
		logger -t gfs2_lockgather 'Error: Lock file /var/run/gfs2_lockgather.lock found. Another instance may be running. Quitting.'
	else
		#The file is not there, so the create itself failed
		echo 'Error: Unable to create lock file /var/run/gfs2_lockgather.lock.' >&2
		logger -t gfs2_lockgather 'Error: Unable to create lock file /var/run/gfs2_lockgather.lock. Quitting.'
	fi
	exit 1
fi

#We own the lock from here on. Remove it whenever the shell exits so that a
#gather that stops early does not block every later run.
trap 'rm -f /var/run/gfs2_lockgather.lock' EXIT

logger -t gfs2_lockgather 'Gather started.'

if [ "$QUIET" == false ] ; then echo -ne '[       ]  Setting up for gather.\t\t\t\t\t\t\t\t\r' ; fi
#Get the current datetime for unique naming
DATETIME=$(date +%m%d%Y-%H%M%S)

#Set up the directory structure
#-p so that a /tmp/debugfs left behind by an interrupted run is not an error
mkdir -p /tmp/debugfs
#The lock dumps are read from this mount, so record it if it cannot be made.
#The gather carries on because the remaining data is still collected.
if ! mount -t debugfs none /tmp/debugfs ; then
	logger -t gfs2_lockgather 'Warning: Unable to mount debugfs on /tmp/debugfs. Lock dumps may be missing.'
fi
#run1 and run2 receive the two lock dump passes; -p creates the parent as well
mkdir -p "/tmp/$(hostname)-${DATETIME}-gfshangdata/run1" "/tmp/$(hostname)-${DATETIME}-gfshangdata/run2"

if [ "$QUIET" == false ] ; then echo -ne '[#      ]  Gathering environment data.\t\t\t\t\t\t\t\t\r'  ; fi

#######################################
# Gather some basics about the cluster and the node.
# Arguments: $1 - directory the output files are written to
# Outputs:   clustat.out, cman_tool-services.out, mount-l.out, ps-aux.out
#            and uname-a.out inside $1
#######################################
gather_environment() {
	local out_dir=$1

	clustat > "$out_dir/clustat.out"
	#Needs its own file: writing it to clustat.out replaces the clustat output
	cman_tool services > "$out_dir/cman_tool-services.out"
	mount -l > "$out_dir/mount-l.out"
	ps aux > "$out_dir/ps-aux.out"
	uname -a > "$out_dir/uname-a.out"
}

gather_environment "/tmp/$(hostname)-${DATETIME}-gfshangdata"

if [ "$QUIET" == false ] ; then echo -ne '[##     ]  Gathering GFS2 and DLM lock data: pass 1\t\t\t\t\t\t\t\t\r'  ; fi

#######################################
# Copy the DLM and GFS2 lock data out of debugfs.
# Arguments: $1 - directory debugfs is mounted on
#            $2 - directory the copies are written to
# Outputs:   $2/<name> for every entry in $1/dlm and
#            $2/<name>-glocks for every entry in $1/gfs2
#######################################
dump_lock_data() {
	local debugfs_dir=$1
	local dest_dir=$2
	local entry

	#The entries are found with globs rather than by parsing ls output, so the
	#result does not depend on how many columns ls prints
	for entry in "$debugfs_dir"/dlm/*; do
		#An unmatched glob is left as the literal pattern; skip it
		[ -e "$entry" ] || continue
		dd if="$entry" bs=1024M of="$dest_dir/${entry##*/}" &> /dev/null
	done
	for entry in "$debugfs_dir"/gfs2/*; do
		[ -e "$entry" ] || continue
		dd if="$entry/glocks" bs=1024M of="$dest_dir/${entry##*/}-glocks" &> /dev/null
	done
}

#Glock and DLM lock dump 1
dump_lock_data /tmp/debugfs "/tmp/$(hostname)-${DATETIME}-gfshangdata/run1"
    
#Enable sysrq
echo 1 > /proc/sys/kernel/sysrq

#Thread Dump
#Reading the kernel log directly is much faster than waiting for syslog to
#write the thread dumps to the messages log
if [ "$QUIET" == false ] ; then echo -ne '[###    ]  Gathering thread dumps.\t\t\t\t\t\t\t\t\r'  ; fi

#A plain subshell keeps the background reader out of the main script. Its
#output is not run as a command, which is what $( ) would do with it.
#The dump is written into the gather directory so that it ends up in the
#archive and is cleaned up with everything else.
(
	cat /proc/kmsg > "/tmp/$(hostname)-${DATETIME}-gfshangdata/thread-dumps" &
	reader_pid=$!
	#Trigger the sysrq 't' action; its output is what the reader captures
	echo 't' > /proc/sysrq-trigger
	#Give the reader time to collect the output before stopping it
	sleep 10
	kill "$reader_pid"
	#Reap the reader; stderr is dropped to hide the shell's "Terminated" notice
	wait "$reader_pid" 2> /dev/null
)

if [ "$QUIET" == false ] ; then echo -ne '[####   ]  Gathering GFS2 and DLM lock data: pass 2.\t\t\t\t\t\t\t\t\r' ; fi

#######################################
# Copy the DLM and GFS2 lock data out of debugfs.
# Arguments: $1 - directory debugfs is mounted on
#            $2 - directory the copies are written to
# Outputs:   $2/<name> for every entry in $1/dlm and
#            $2/<name>-glocks for every entry in $1/gfs2
#######################################
dump_lock_data() {
	local debugfs_dir=$1
	local dest_dir=$2
	local entry

	#The entries are found with globs rather than by parsing ls output, so the
	#result does not depend on how many columns ls prints
	for entry in "$debugfs_dir"/dlm/*; do
		#An unmatched glob is left as the literal pattern; skip it
		[ -e "$entry" ] || continue
		dd if="$entry" bs=1024M of="$dest_dir/${entry##*/}" &> /dev/null
	done
	for entry in "$debugfs_dir"/gfs2/*; do
		[ -e "$entry" ] || continue
		dd if="$entry/glocks" bs=1024M of="$dest_dir/${entry##*/}-glocks" &> /dev/null
	done
}

#Glock and DLM dump 2
dump_lock_data /tmp/debugfs "/tmp/$(hostname)-${DATETIME}-gfshangdata/run2"

if [ "$QUIET" == false ] ; then echo -ne '[#####  ]  Gathering messages logs\t\t\t\t\t\t\t\t\r' ; fi
#Everything gathered so far lives under this directory
gather_dir="/tmp/$(hostname)-${DATETIME}-gfshangdata"

#Get the messages log file
cp /var/log/messages "$gather_dir/"

#Tar up the results and clean up temporary files
if [ "$QUIET" == false ] ; then echo -ne '[###### ]   Cleaning up... 80%.\t\t\t\t\t\t\t\t\r' ; fi
tar cjf "$gather_dir.tar.bz" "$gather_dir/" &> /dev/null
#Remember the result: the gathered files may only be removed if the archive was written
tar_status=$?

umount /tmp/debugfs/
#Only remove the mount point once debugfs is really gone from it, otherwise
#rm -rf would recurse into the live debugfs tree
if grep -q ' /tmp/debugfs ' /proc/mounts ; then
	logger -t gfs2_lockgather 'Warning: Unable to unmount /tmp/debugfs. Leaving it in place.'
else
	rm -rf /tmp/debugfs
fi
rm -f /var/run/gfs2_lockgather.lock

if [ "$tar_status" -ne 0 ] ; then
	#Keep the unarchived data so the gather is not lost
	logger -t gfs2_lockgather "Error: Unable to create $gather_dir.tar.bz. Gathered data left in $gather_dir"
	echo "Error: Unable to create $gather_dir.tar.bz. Gathered data left in $gather_dir" >&2
	exit 1
fi

rm -rf -- "$gather_dir"
logger -t gfs2_lockgather "Gather completed. File is $gather_dir.tar.bz"
if [ "$QUIET" == false ] ; then echo -ne "[#######]  Done. File is $gather_dir.tar.bz\r\t\t\t\t\t\t\t\t\r\n"  ; fi
exit 0
