#!/bin/ksh
# On signals 1, 2, 3 and 15 (HUP, INT, QUIT, TERM): report the abort on
# stderr, wait for any background jobs still running, then exit with status 1.
trap 'echo "Received signal, aborting ..." >&2; wait; exit 1' 1 2 3 15

# -e : leave the script as soon as an unguarded command fails
# -u : treat the expansion of an unset variable as an error
set -eu
#begin
#=======================================================================
#
#     Script dcagen  
#     -------------    
#                         
#     Purpose     : Creates Direct Column Access-files for given ODB
#                   to improve post-processing & direct column access
#                   from ODB tables
#                               
#     Usage       : dcagen [-l dbname] [-t table] [-f] [-d] [-n] [-u] [-i dbdir] [-v] [-N cpus]
#                          [-F] [-b] [-B bufsiz] [-z] [-g] [-c colname@table] [-C colname@table]
#                          [-q] [-x] [-Q] [-m] [-o output_file_prefix] [-D cache_disk] [-a] [-A] [-I] [-P]
#                          [-e errlog] [-E errlog]
#                                            
#     Parameters  : -l dbname - database name (default: from *.dd file name)
#     ----------    -t table  - table name (can be supplied multiple times)
#                   -f        - force to continue despite fatal errors
#                   -F        - force to create despite already exists
#                   -d        - turn debugging on
#                   -n        - do NOT update i.e. DO NOT use -u option for dcagen.x (the default)
#                   -u        - use option -u for dcagen.x
#                   -i dbdir  - Database directory (default: "." i.e. current dir)
#                   -v        - Produce verbose output
#                   -N cpus   - How many CPUs to use to parallelize dcagen over multiple tables (default: $ODB_PARAL, or 4 if unset)
#                   -b        - Create binary DCA-file instead of text
#                   -B bufsiz - Override the ODB_CACHE_LIMIT used with -c/-C (in MBytes) (default=128)
#                   -z        - Remove zero-length DCA-files (by default NOT removed)
#                   -g        - DCA-files (binary or text) will be gzip'ped, if possible (by default: not)
#                   -c c@t    - Create cache of column 'c@t' values to file cache/c@t.<#>
#                   -C c@t    - Create unpacked cache of column 'c@t' values to file cache/c@t.<#>
#                   -q        - Quiet option : Do not print dots (....) to show progress in non-verbose mode
#                   -x        - Extract poolno,nrows,nmdis,cardinality,min,max of column(s) given by -c/-C
#                   -m        - Same as -x, but prints only column name followed by aggregate min & max
#                   -Q        - Quickmode: same as -x, but quits after the first input file has been processed
#                   -o prefix - *ALL* cacheable files will be written to file(s) cache/prefix.cache.%d. Implies -N 1
#                                If set to %s, then prefix will be based on prevailing tablename
#                   -D cachedisk - root-path for cache-disk i.e. ODB_CACHE_DISK (default=cache)
#                   -a        - cache all encountered columns. Do NOT unpack them (-C colname and -A override)
#                   -A        - cache all encountered columns. Unpack them before caching
#                   -I        - Do *NOT* (re-)create $dbname.IOASSIGN-file (even if it did not exist)
#                   -P        - Prevent from running odbprune recursively (f.ex. when dcagen is called from odbprune)
#                   -e errlog - Enforce stderr output to file errlog
#                   -E errlog - Enforce stderr output to file errlog; as -e, but append to the existing errlog
#
#=======================================================================
#end

# Absolute path of this script (read again later to print the usage text out
# of the script's own header comment) and the directory dcagen was started in.
# Every expansion of $0 is quoted so that a script path containing blanks or
# wildcard characters is neither split nor expanded.
cmd=$(\cd "$(dirname "$0")"; pwd)/$(basename "$0")
thisdir=$(pwd)

# The command line as invoked; echoed to stderr later unless running quietly.
if [[ $# -gt 0 ]] ; then
  ARGS="${cmd} $*"
else
  ARGS="${cmd}"
fi

# One-line synopsis of all options. The pieces below are adjacent quoted
# strings joined by line continuations, i.e. they form one single word.
USAGE="Usage: $0 [-l dbname] [-t table] [-f] [-F] [-d] [-u] [-n]"\
" [-i dbdir] [-v] [-N cpus] [-b] [-B bufsiz] [-z] [-g]"\
" [-c col@table] [-C col@table] [-q] [-x] [-m] [-Q] [-o output_file_prefix]"\
" [-D cache_disk] [-a] [-A] [-I] [-P] [-e errlog] [-E errlog]"

#-- Defaults of everything the command line options can change

# Database selection (-l, -i, -t)
dbname=""
dbdir=.
tables=""

# Error handling (-f, -F, -e/-E); errflg becomes non-zero on any usage error
force2continue=0
force2create=0
errflg=0
errlog_given=0
errlog="/dev/null"

# Flags handed on to dcagen.x
debug=""
update="-n" # -u (update) can be 100 times slower than if -n option was used ;-(
bufsiz=""
binary=""
bin=0

# Output handling (-v, -q, -z, -g)
verbose=0
quiet=0
rmzero=0
dogzip=0

# Degree of parallelism: taken from the environment, 4 when not set there
: "${ODB_PARAL:=4}"
export ODB_PARAL
paral=${ODB_PARAL}

# Column caching (-c, -C, -a, -A, -o, -D)
cachec=""
cacheC=""
cacheit=0
allcached=""
ALLcached=""
oprefix=""
oprefix_given=0
: "${ODB_CACHE_DISK:=cache}" # could be f.ex. "/fast/disk"
export ODB_CACHE_DISK
cachedisk=${ODB_CACHE_DISK}

# Extract modes (-x, -m, -Q)
extract=0
extract_arg=""
quick=0
minmax=0

# Housekeeping (-I, -P)
create_Ioassign=1 # ... but only when $dbname.IOASSIGN was not found
prune=1

export OMP_NUM_THREADS=1

# Parse the command line. Letters followed by ':' in the option string take a
# value (delivered in OPTARG). Anything getopts does not accept merely raises
# errflg; the actual error report is made further down.
while getopts "aAbB:c:C:dD:e:E:fFgi:Il:mnN:o:PqQt:uvxz" option
do
  case $option in
    #-- database selection
    l)
      dbname=$OPTARG
      ;;
    i)
      dbdir=$OPTARG
      ;;
    t)
      tables="$tables$OPTARG "
      ;;

    #-- error handling
    f)
      force2continue=1
      ;;
    F)
      force2create=1
      ;;
    e)
      errlog_given=1
      errlog="$OPTARG"
      ;;
    E)
      errlog_given=2
      errlog="$OPTARG"
      ;;

    #-- flags for dcagen.x
    d)
      debug="-d"
      ;;
    n)
      update="-n"
      ;;
    u)
      update="-u"
      ;;
    b)
      binary="-b"
      bin=1
      ;;
    B)
      bufsiz="-i $OPTARG"
      ;;

    #-- output handling
    v)
      verbose=1
      ;;
    q)
      quiet=1
      ;;
    z)
      rmzero=1
      ;;
    g)
      dogzip=1
      ;;
    N)
      paral=$OPTARG
      ;;

    #-- column caching
    a)
      allcached="-a"
      cacheit=1
      ;;
    A)
      ALLcached="-A"
      cacheit=1
      ;;
    c)
      cachec="$cachec $OPTARG"
      cacheit=1
      ;;
    C)
      cacheC="$cacheC $OPTARG"
      cacheit=1
      ;;
    o)
      oprefix="-o $(basename $OPTARG)"
      oprefix_given=1
      ;;
    D)
      cachedisk="$OPTARG"
      ;;

    #-- extract modes
    x)
      extract=1
      quick=0
      ;;
    m)
      minmax=1
      extract=1
      ;;
    Q)
      extract=1
      quick=1
      ;;

    #-- housekeeping
    I)
      create_Ioassign=0
      ;;
    P)
      prune=0
      ;;

    *)
      errflg=1
      ;;
  esac
done

# Extract mode (-x, -m or -Q) overrides a number of other settings: it is
# quiet, serial, text-only, never updates, always (re-)creates its output and
# discards stderr.
if [[ $extract -eq 1 ]] ; then
  extract_arg="-x"
  update="-n"
  binary="" ; bin=0 ; dogzip=0
  quiet=1 ; verbose=0
  paral=1
  force2create=1
  errlog_given=1
  errlog="/dev/null"
fi

# Send stderr to the error log: 1 = truncate the log (-e), above 1 = append (-E)
if [[ $errlog_given -eq 1 ]] ; then
  exec 2>$errlog
elif [[ $errlog_given -gt 1 ]] ; then
  exec 2>>$errlog
fi

# Show how dcagen was invoked, unless asked to keep quiet
[[ $quiet -ne 0 ]] || echo "$ARGS" >&2

# A common output prefix (-o) for cached columns implies serial processing
if [[ $oprefix_given -eq 1 ]] && [[ $cacheit -eq 1 ]] ; then
  paral=1
fi

#-----------------------------------------------------------------------
# resolve_database
#
#   Validates the database directory and works out the database name, the
#   metadata (.dd) file and the number of pools.
#
#   Reads  : dbdir, dbname
#   Sets   : dbdir (made absolute), dbname, ddfile, npools, errflg
#   Output : error messages on stderr. The function never exits by itself;
#            problems are recorded in errflg
#            (2 = no such dbdir, 3 = no database name, 4 = no ddfile).
#-----------------------------------------------------------------------
function resolve_database {
  typeset ddf

  if [[ ! -d $dbdir ]] ; then
    echo "***Error: Given database directory does not exist. Check your '-i dbdir'" >&2
    errflg=2
  else
    dbdir=$(\cd "$dbdir" >/dev/null 2>&1; pwd)
    if [[ "$dbname" = "" ]] ; then
      # No -l given: take the name from the first *.dd file in the directory.
      # An unmatched pattern stays as it is, hence the -f test; dbname then
      # remains empty instead of being turned into ".dd" by basename.
      for ddf in "$dbdir"/*.dd
      do
        if [[ -f $ddf ]] ; then
          dbname=$(basename "$ddf" .dd)
          break
        fi
      done
    fi
  fi

  # Only the part before the first dot is the database name. This is done
  # before the emptiness test so that a name consisting of a suffix only
  # (f.ex. ".dd") is caught there as well.
  dbname=${dbname%%.*}
  ddfile=$dbdir/$dbname.dd
  npools=0

  if [[ "$dbname" = "" ]] ; then
    echo "***Error: Unable to determine the database name. Check your '-l dbname'" >&2
    errflg=3
  elif [[ ! -f $ddfile ]] ; then
    echo "***Error: Unable to locate the main metadata file (ddfile '$ddfile')" >&2
    errflg=4
  else
    # The number of pools is read from the 5th line of the ddfile
    npools=$(head -5 "$ddfile" | tail -1)
  fi
}

resolve_database

#=======================================================================
#   Report errors and exit
#=======================================================================

if [[ $errflg -ne 0 ]] ; then
  echo "***Error(s) were detected" >&2
  # The usage text is the header comment of this very script: print what lies
  # between its two marker lines, without the markers and the leading hash.
  # grep -E replaces the obsolescent egrep, which makes GNU grep emit a
  # warning on stderr.
  awk '/#begin/,/#end/' "$cmd" | grep -E -v '#(begin|end)' | sed 's/^#//' >&2
  exit 1
fi

#=======================================================================
# Begin processing ...
#=======================================================================

# Run odbprune on the database first, unless suppressed with -P. The command
# is kept in a variable because it is quoted in the error message as well.
if [[ $prune -eq 1 ]] ; then
  if [[ $quiet -eq 1 ]] ; then
    prunecmd="odbprune -i $dbdir -q"
  else
    prunecmd="odbprune -i $dbdir"
  fi
  if $prunecmd ; then
    :
  else
    rc=$?
    echo "***Error in $0: '$prunecmd' has failed" >&2
    exit $rc
  fi
fi

# DCA-files are read from and written to one and the same directory
dcadir_in=$dbdir/dca
dcadir_out=$dcadir_in

#-- Make sure $dbdir/$dbname.IOASSIGN exists (unless -I was given)
if [[ $create_Ioassign -eq 1 ]] && [[ ! -f $dbdir/$dbname.IOASSIGN ]] ; then
  ioassign_quiet=""
  if [[ $quiet -eq 1 ]] ; then
    ioassign_quiet="-q"
  fi
  # create_ioassign is run from inside the database directory; its output is
  # sent to stderr
  \cd $dbdir
  create_ioassign -l $dbname $ioassign_quiet >&2
  \cd $thisdir
fi

# A cache disk that is not an absolute path is taken relative to the
# current directory
case $cachedisk in
  /*) ;;
  *)  cachedisk=./$cachedisk ;;
esac
export ODB_CACHE_DISK="$cachedisk"

# cacheall : blank separated list of all columns named with -c/-C
# cachearg : the caching related flags for dcagen.x
cacheall=""
cachearg=""
if [[ $cacheit -eq 1 ]] ; then
  # The cache directory is needed only when columns are really cached
  if [[ $extract -eq 0 && ! -d $cachedisk ]] ; then
    mkdir -p $cachedisk
  fi

  # Column lists may be comma and/or blank separated: first normalize them
  # into blank separated words, then put the flag in front of every word
  cacheallc=""
  if [[ "$cachec" != "" ]] ; then
    cacheallc=$(echo "$cachec" | perl -pe 's/,/ /g; s/\s+/ /g;')
    cachec=$(echo "$cacheallc" | perl -pe 's/(\S+)/-c $1/g')
  fi

  cacheallC=""
  if [[ "$cacheC" != "" ]] ; then
    cacheallC=$(echo "$cacheC" | perl -pe 's/,/ /g; s/\s+/ /g;')
    cacheC=$(echo "$cacheallC" | perl -pe 's/(\S+)/-C $1/g')
  fi

  cacheall="$cacheallc $cacheallC"
  # Note: oprefix (-o flag) MUST BE prior to -c/-C flags
  cachearg="$oprefix $ALLcached $allcached $cachec $cacheC"
fi

if [[ "$tables" = "" ]] ; then
  tables=$(egrep ^@ $ddfile | perl -ne 'print "$1\n" if (m/^\@(\S+)\s+\d+/)' | perl -pe 's/\n/ /g;')

# reorder tables for optimal parallel dcagen: largest tables first
# for this we need the actual cumulative filesizes for each table

  TMPDIR=${TMPDIR:=/tmp}
  sizes=$TMPDIR/sizes.pl.$$
  cat > $sizes <<'EOF'
use strict; my %cnt = (); my $size; my $key;
for (<>) { next if (m/^\s*$/); s/^\s+//; s|\s+\d+/| |; ($size,$key) = split/\s+/; $cnt{$key} += $size; }
foreach $key (keys(%cnt)) { $size = $cnt{$key}; print "$size $key\n"; }
EOF

  #-- AIX doesn't seem to support 'xargs -r', but its 'xargs' by default implies -r ...
  xargs_r=$(echo "" | xargs -r 2>/dev/null && echo "xargs -r" || echo "xargs")

  tables=$(\cd $dbdir >/dev/null 2>&1; \
          (echo "$tables" | perl -pe 's/\s*(\w+)/0 $1\n/g;' ; \
           find [0-9]* -type f -follow 2>/dev/null | $xargs_r ls -C1s) | \
           perl -w $sizes | sort -nr -k1,1 | awk '{print $NF}' | perl -pe 's/\n/ /g;')

  rm -f $sizes
else
  tables=$(echo "$tables" | perl -pe 's/,/ /g; s/\s+/ /g;')
fi

# Verbose mode: switch the shell's -x/-v tracing off here (tracing is enabled
# per table subshell later on) and show where we are
if [[ $verbose -ne 0 ]] ; then
  set +xv
  echo "dcagen: $(pwd) ..."
fi

# The output directory for the DCA-files has to exist
if [[ ! -d $dcadir_out ]] ; then
  mkdir $dcadir_out
fi

# Marker file through which the per-table subshells signal a failure;
# start without one
in_error=$thisdir/in_error
rm -f $in_error

function dcagen_parallel {
  set -eu
  typeset t=$1
  if [[ $cacheit -eq 1 ]] ; then
    typeset ca
    for ca in $cacheall
    do
      typeset col=$(echo "$ca" | perl -pe 's/\@.*//')
      if [[ $extract -eq 0 ]] ; then
        rm -f $cachedisk/${col}@$t.cache.*
      fi
    done
  fi
  typeset cnt=0
  typeset dcafile_out
  if [[ $extract -eq 0 ]] ; then
    dcafile_out=$dcadir_out/$t.dca
  else
    dcafile_out=$dcadir_out/$t.extract
  fi
  typeset rc=0
  typeset nohdr=""
  typeset gzip=""
  [[ $dogzip -eq 0 ]] || gzip=$(whence gzip 2>/dev/null || echo "")
  typeset v=""
  [[ $verbose -eq 0 ]] || v="v"
  if [[ $bin -eq 0 ]] ; then
    typeset dcafile_in=$dcadir_in/$t.dca
    egrep '^#DCA' $dcafile_in 2>/dev/null | egrep 'is_little$' >/dev/null 2>&1 || rc=$?
  fi
  if [[ $rc -ne 0 ]] || [[ $force2create -eq 1 ]] ; then
    [[ $verbose -eq 0 ]] || echo "Creating DCA-information for table $t ..."
    cat /dev/null > $dcafile_out
#    for f in $(\cd $dbdir >/dev/null 2>&1; \
#               find [0-9]* -type f -follow -name $t -print 2>/dev/null | sort -n)
    typeset f
    typeset poolno=0
    while [[ $poolno -lt $npools ]]
    do
     ((poolno += 1))
     f=$poolno/$t
     if [[ -r $dbdir/$f ]] ; then
      typeset cmd
      cmd=$(echo "$ODB_FEBINPATH/dcagen.x $debug $update -f $ddfile -l $dbname \
            -t $t $cachearg -p $poolno $binary $bufsiz $nohdr $extract_arg $f")
      cmd=$(echo "$cmd" | perl -pe 's/\s+/ /g')

      [[ $verbose -eq 0 ]] || echo "cd $dbdir; $cmd"
      (\cd $dbdir >/dev/null 2>&1; $cmd) >> $dcafile_out || {
        echo "***Error in '$cmd'" >&2
        [[ $force2continue -eq 1 ]] || {
           echo "***Error in '$cmd'" >> $in_error
           exit 1
        }
      }
      if [[ $extract -eq 0 && $verbose -eq 0 && $quiet -eq 0 ]] ; then # ...... printout (per every 10th)
        ((cnt += 1))
        typeset tst=$((cnt%10))
        if [[ $tst -eq 1 ]] ; then
          (echo "." | perl -pe 's/\n//' >&2)  # echo -n not available on all Junikses
        fi
      fi
      nohdr="-h"
      if [[ $quick -eq 1 ]] ; then # get out now
        poolno=$npools
      fi
     fi # if [[ -r $f ]] ; then ...
    done

    if [[ $extract -eq 0 && $verbose -eq 0 && $quiet -eq 0 ]] ; then # last ..... printout
      (echo "." | perl -pe 's/\n//' >&2)  # echo -n not available on all Junikses
    fi

    if [[ $extract -eq 1 ]] ; then
      if [[ $minmax -eq 1 ]] ; then 
        typeset cols=$(awk '{print $1}' < $dcafile_out | sort -u)
        typeset ca
        for ca in $cols
        do
          egrep "^$ca " $dcafile_out |\
            awk 'BEGIN { name=""; min=1e+100; max=-1e+100;}  { \
                 name=$1; \
                 if ($NF > max) { max=$NF; } \
                 if ($(NF-1) < min) { min=$(NF-1);}          } \
                 END {print name,min,max;}'
        done
      else
        cat $dcafile_out
      fi
      rm -f $dcafile_out
    else
      if [[ $bin -eq 0 ]] ; then  
        egrep '^#DCA' $dcafile_out | head -1 > $dcafile_out.$$
        egrep -v '^#' $dcafile_out | sort -n -k1,1 -k6,6 >> $dcafile_out.$$
        mv $dcafile_out.$$ $dcafile_out
      fi

      if [[ -s $dcafile_out ]] ; then
        if [[ "$gzip" != "" && -x "$gzip" ]] ; then
          rm -f $dcafile_out.gz
          $gzip -1$v $dcafile_out
          if [[ -f $dcafile_out.gz ]] ; then
            mv $dcafile_out.gz $dcafile_out
          fi
        fi
      elif [[ $rmzero -eq 1 ]] ; then
        rm -f $dcafile_out
      fi
    fi # if [[ $extract -eq 1 ]] ; then ... else ...
  fi
}

# With more than one CPU every table is processed in a background subshell;
# otherwise the tables are processed one after the other
subproc="&"
if [[ $paral -le 1 ]] ; then
  subproc=""
  paral=1
fi

# Verbose mode: trace the commands of every table subshell
setx=""
[[ $verbose -eq 0 ]] || setx="set -x;"

[[ $verbose -eq 0 ]] || date
n=0
for t in $tables
do
  # In serial mode a failing subshell is seen right here. It must not abort
  # the script through 'set -e': the failure is recorded in $in_error (if
  # dcagen_parallel has not done so already), so that the error summary at
  # the end is printed and the marker file is removed, just as in parallel
  # mode. A background job always returns 0 at this point.
  eval "($setx dcagen_parallel $t) $subproc" || {
    [[ -f $in_error ]] || echo "***Error: dcagen has failed for table '$t'" >> "$in_error"
  }
  ((n+=1))
  if [[ -f $in_error && $force2continue -eq 0 ]] ; then
    break
  fi
  # After every $paral started tables wait until all of them have finished
  [[ $((n%$paral)) -eq 0 ]] && wait
  if [[ -f $in_error && $force2continue -eq 0 ]] ; then
    break
  fi
done
wait

# Finish the line of progress dots (non-verbose) or print the end time (verbose)
if [[ $extract -eq 0 && $quiet -eq 0 ]] ; then
  if [[ $verbose -ne 1 ]] ; then
    echo " "
  fi
  if [[ $verbose -ne 0 ]] ; then
    date
  fi
fi

# No marker file means that all went well
if [[ ! -f $in_error ]] ; then
  if [[ $verbose -eq 1 ]] ; then
    # Show what has been produced
    pwd
    ls -Lltr $dcadir_out/*.dca 2>/dev/null || :
    if [[ $cacheit -eq 1 && $extract -eq 0 ]] ; then
      ls -Lltr $cachedisk 2>/dev/null || :
    fi
  fi
  exit 0
fi

# Otherwise show the collected error messages and remove the marker file
echo "***Error: There were error(s) encountered during dcagen run(s)" >&2
cat $in_error >&2
rm -f $in_error
exit 1


