#!/bin/bash

# Public domain notice for all NCBI EDirect scripts is located at:
# https://www.ncbi.nlm.nih.gov/books/NBK179288/#chapter6.Public_Domain_Notice

# archive-nlmnlp

# remember when the run began so the total elapsed time can be reported at the end
total_start=$(date "+%s")

# directory containing this script, as given on the command line
pth=$( dirname "$0" )

# turn a relative directory into an absolute one
if [[ "$pth" != /* ]]
then
  pth=$(cd "$pth" && pwd)
fi

# append the script directory to PATH unless it is already one of its entries
if [[ ":$PATH:" != *":$pth:"* ]]
then
  export PATH="$PATH:$pth"
fi

# stop early unless an executable "go" command can be found on PATH
hasgo=$( command -v go )
[ -x "$hasgo" ] || {
  echo "ERROR: The Go (golang) compiler must be installed locally in order to process the data, EXITING" >&2
  exit 1
}

# database-specific parameters

dbase="pubmed"
fields="CHEM DISZ GENE GRIF GSYN PREF"

# control flags set by command-line arguments

# transfer method
useFtp=true
useHttps=false
noAspera=false

# housekeeping and download
scratch=false
download=true

# processing steps
e2index=false
e2invert=false
e2merge=false
e2post=false

# consume recognized leading arguments, stopping at the first unrecognized one
while [ $# -gt 0 ]
do
  case "$1" in
    daily | -daily )
      e2index=true
      e2invert=true
      ;;
    index | -index | reindex | -reindex )
      e2index=true
      e2invert=true
      e2merge=true
      e2post=true
      ;;
    clean | -clean | clear | -clear | scrub | -scrub | scour | -scour | scratch | -scratch | erase | -erase )
      # delete Scratch directories
      scratch=true
      ;;
    -ftp )
      useFtp=true
      useHttps=false
      noAspera=true
      export EDIRECT_NO_ASPERA=true
      ;;
    -http | -https )
      useFtp=false
      useHttps=true
      ;;
    * )
      # leaves the loop without shifting the unrecognized argument
      break
      ;;
  esac
  shift
done

# cleaning is a stand-alone operation that ends the script
if [ "$scratch" = true ]
then
  if [ "$e2index" = true ]
  then
    echo "ERROR: Cleaning and indexing must be done in separate commands, EXITING" >&2
    exit 1
  fi

  pm-clean -db "$dbase" -fields "$fields" -scratch
  exit 0
fi

# get path to local folder

# kernel name with Windows variants normalized: any "_NT-..." suffix is cut to "_NT"
# and a leading "MINGW<digits>" is replaced by "CYGWIN"
osname=$( uname -s | sed -e 's/_NT-.*$/_NT/' -e 's/^MINGW[0-9]*/CYGWIN/' )

# GetLocalArchiveFolder prints the path of one local archive folder.
#
# Arguments: $1 - database name, $2 - folder name; both are handed to "rchive -local"
# Globals:   osname (read) - when it is CYGWIN_NT and /bin/cygpath is executable,
#            the path is converted with "cygpath -w"
# Outputs:   the path, with one trailing slash removed, on stdout
# Exits:     with status 1, after a message on stderr, when rchive prints nothing.
#            When called inside $( ) this ends only the subshell, so the caller
#            must check the status of the command substitution.
GetLocalArchiveFolder() {

  # locals keep this function from overwriting the script-wide "target" variable
  local dbs="$1"
  local fld="$2"
  local target

  # find selected local archive folder from environment variables or configuration file
  target=$( rchive -local "$dbs" "$fld" )

  if [ -z "$target" ]
  then
    echo "ERROR: Must supply path to local data by setting EDIRECT_LOCAL_ARCHIVE environment variable" >&2
    exit 1
  fi

  if [ "$osname" = "CYGWIN_NT" ] && [ -x /bin/cygpath ]
  then
    target=$( cygpath -w "$target" )
  fi

  # remove trailing slash
  target=${target%/}

  echo "$target"
}

date
echo "" >&2

pm-setup -db "$dbase"

echo "Preparing Drives" >&2
pm-prepare -db "$dbase"
echo "" >&2

# Resolve every folder used below. GetLocalArchiveFolder signals a missing path
# with "exit 1", but inside $( ) that only ends the subshell, so the status of
# each assignment is checked here to keep the script from continuing with an
# empty path.
archiveBase=$( GetLocalArchiveFolder "$dbase" "Archive" ) || exit 1
dataBase=$( GetLocalArchiveFolder "$dbase" "Data" ) || exit 1
extrasBase=$( GetLocalArchiveFolder "$dbase" "Extras" ) || exit 1
indexBase=$( GetLocalArchiveFolder "$dbase" "Index" ) || exit 1
invertBase=$( GetLocalArchiveFolder "$dbase" "Invert" ) || exit 1
mergedBase=$( GetLocalArchiveFolder "$dbase" "Merged" ) || exit 1
postingsBase=$( GetLocalArchiveFolder "$dbase" "Postings" ) || exit 1
scratchBase=$( GetLocalArchiveFolder "$dbase" "Scratch" ) || exit 1
currentBase=$( GetLocalArchiveFolder "$dbase" "Current" ) || exit 1
indexedBase=$( GetLocalArchiveFolder "$dbase" "Indexed" ) || exit 1
invertedBase=$( GetLocalArchiveFolder "$dbase" "Inverted" ) || exit 1

# elapsed seconds of each step, filled in by the steps that actually run
DWN=""

IDX=""
INV=""
MRG=""
PST=""

# downloadFTP pipes selected entries of a directory listing from
# ftp.ncbi.nlm.nih.gov into nquire for download.
#
# Arguments: $1 - directory on the server
#            $2 - file name; listing lines containing this exact text are selected
# Globals:   noAspera (read) - true selects "nquire -dwn", anything else "nquire -asp"
# Outputs:   the names that reach the download step are also copied to stderr
downloadFTP() {
  local dir="$1"
  local msk="$2"
  local mode="-asp"

  if [ "$noAspera" = true ]
  then
    mode="-dwn"
  fi

  # -F matches the name literally, so its dots are not treated as wildcards;
  # skip-if-file-exists presumably drops names already present locally (defined elsewhere)
  nquire -lst ftp.ncbi.nlm.nih.gov "$dir" |
  grep -F -- "$msk" |
  skip-if-file-exists | tee /dev/stderr |
  nquire "$mode" ftp.ncbi.nlm.nih.gov "$dir"
}

# DoBioconcepts downloads into the current directory whichever of the three
# PubTatorCentral tables (chemical, disease, gene) are not already present.
#
# Globals: useFtp (read)   - true downloads through downloadFTP
#          useHttps (read) - true, with useFtp not true, downloads with
#                            "nquire -bulk -get" over HTTPS, the same form
#                            DoGeneRIFs uses for its files
DoBioconcepts() {

  local fl

  for fl in chemical2pubtatorcentral.gz disease2pubtatorcentral.gz gene2pubtatorcentral.gz
  do
    if [ -f "$fl" ]
    then
      continue
    fi

    if [ "$useFtp" = true ]
    then
      downloadFTP "pub/lu/PubTatorCentral" "$fl"
    elif [ "$useHttps" = true ]
    then
      nquire -bulk -get https://ftp.ncbi.nlm.nih.gov pub/lu/PubTatorCentral "$fl" > "$fl"
      # the redirection creates the file even when nothing arrives; an empty
      # file would pass the -f test above and suppress every later retry
      [ -s "$fl" ] || rm -f "$fl"
    fi
  done
}

# DoGeneRIFs fetches the GeneRIF and gene_info tables into the current directory
# when they are absent, then derives geneconv.xml, genename.txt and genesyns.txt
# from them, creating only the files that do not exist yet.
#
# Globals: useFtp, useHttps (read) - choose downloadFTP or an HTTPS "nquire -bulk -get"
#          pth (read)              - directory holding extern/prep-geneinfo.go
DoGeneRIFs() {

  if [[ ! -f generifs_basic.gz ]]; then
    if [[ "$useFtp" == true ]]; then
      downloadFTP "gene/GeneRIF" "generifs_basic.gz"
    elif [[ "$useHttps" == true ]]; then
      nquire -bulk -get https://ftp.ncbi.nlm.nih.gov gene/GeneRIF generifs_basic.gz > generifs_basic.gz
    fi
  fi

  if [[ ! -f gene_info.gz ]]; then
    if [[ "$useFtp" == true ]]; then
      downloadFTP "gene/DATA" "gene_info.gz"
    elif [[ "$useHttps" == true ]]; then
      nquire -bulk -get https://ftp.ncbi.nlm.nih.gov gene/DATA gene_info.gz > gene_info.gz
    fi
  fi

  # build the XML form of gene_info once the compressed table is available
  if [[ -f gene_info.gz && ! -f geneconv.xml ]]; then
    gunzip -c gene_info.gz | go run "$pth/extern/prep-geneinfo.go" > geneconv.xml
  fi

  # both text tables are extracted from geneconv.xml
  [[ -f geneconv.xml ]] || return 0

  # Id and Gene columns, sorted numerically on the first column
  if [[ ! -f genename.txt ]]; then
    xtract -pattern Rec -if Id -and Gene -element Id Gene < geneconv.xml |
    sort-table -k 1,1n > genename.txt
  fi

  # Id and Syns columns, sorted numerically on the first column
  if [[ ! -f genesyns.txt ]]; then
    xtract -pattern Rec -if Id -and Syns -element Id Syns < geneconv.xml |
    sort-table -k 1,1n > genesyns.txt
  fi
}

# Download step: refresh the source tables in the Extras folder and copy the
# derived lookup files to the Data folder.
if [ "$download" = true ]
then
  seconds_start=$(date "+%s")

  # everything below works on relative file names, so it must not run unless
  # the change into the Extras folder succeeded
  if [ -d "${extrasBase}" ] && cd "${extrasBase}"
  then
    echo "Checking for BioConcept and GeneRIF Updates"

    fst=$( nquire -dir ftp.ncbi.nlm.nih.gov "pub/lu/PubTatorCentral" )
    scd=$( nquire -dir ftp.ncbi.nlm.nih.gov "gene/GeneRIF" )
    for fl in chemical2pubtatorcentral.gz disease2pubtatorcentral.gz gene2pubtatorcentral.gz
    do
      if [ -s "$fl" ]
      then
        # first tab-separated field of the listing line versus the local byte count
        one=$( echo "$fst" | grep "$fl" | cut -f 1 )
        two=$( wc -c < "$fl" | tr -d ' ' )
        # an empty remote value means the listing had no entry for this file
        # (for instance when the server could not be reached); keep the local copy
        if [ -n "$one" ] && [ "$one" != "$two" ]
        then
          echo "Removing outdated $fl" >&2
          rm "$fl"
        fi
      fi
    done

    if [ -s "generifs_basic.gz" ]
    then
      one=$( echo "$scd" | grep "generifs_basic.gz" | cut -f 1 )
      two=$( wc -c < "generifs_basic.gz" | tr -d ' ' )
      # same rule as above: only a known, different remote size removes the file
      if [ -n "$one" ] && [ "$one" != "$two" ]
      then
        echo "Removing outdated generifs_basic.gz" >&2
        rm "generifs_basic.gz"
      fi
    fi

    echo "Downloading GeneRIFs" >&2

    DoGeneRIFs
    sleep 1

    echo "Downloading BioConcepts Tables" >&2

    DoBioconcepts
    sleep 1

    # existing files in the Data folder are never overwritten
    echo "Copying to Data Directory"
    for fl in chemconv.xml diszconv.xml geneconv.xml genename.txt genesyns.txt
    do
      if [ ! -f "${dataBase}/$fl" ] && [ -f "${extrasBase}/$fl" ]
      then
        cp "${extrasBase}/$fl" "${dataBase}/$fl"
      fi
    done
  fi

  seconds_end=$(date "+%s")
  seconds=$((seconds_end - seconds_start))
  DWN=$seconds
  echo "DWN $DWN seconds" >&2
  echo "" >&2
  sleep 1
fi

# Clear intermediate files left by an earlier run. Each folder is scanned once,
# deleting both the plain and the gzip-compressed form of its file type.
if [ -d "${scratchBase}" ]
then
  echo "Removing Previous Indices" >&2

  target="${currentBase}"
  find "$target" \( -name "*.xml" -o -name "*.xml.gz" \) -delete

  target="${indexedBase}"
  find "$target" \( -name "*.e2x" -o -name "*.e2x.gz" \) -delete

  target="${invertedBase}"
  find "$target" \( -name "*.inv" -o -name "*.inv.gz" \) -delete
fi

# merged files are cleared whenever the Merged folder exists
if [ -d "${mergedBase}" ]
then
  target="${mergedBase}"
  find "$target" \( -name "*.mrg" -o -name "*.mrg.gz" \) -delete
fi

# BioConcepts runs one PubTatorCentral table through the two Go preparation programs.
#
# Arguments: $1 - domain, selects ${extrasBase}/<domain>2pubtatorcentral.gz as input
#            $2 - transform, selects ${extrasBase}/<transform>.txt for prep-nlmnlp.go
#            $3 - prefix, passed as the last argument to prep-finish.go
# Globals:   pth, extrasBase, indexedBase (read)
BioConcepts() {

  # locals keep the caller's variables of the same names untouched
  local domain="$1"
  local transform="$2"
  local prefix="$3"

  gunzip -c "${extrasBase}/${domain}2pubtatorcentral.gz" |
  go run "$pth/extern/prep-nlmnlp.go" "${extrasBase}/$transform.txt" |
  go run "$pth/extern/prep-finish.go" 5000000 "${indexedBase}" "$prefix"
}

# Indexing step: prepare the three BioConcepts tables, then the GeneRIFs.
if [ "$e2index" = true ]
then
  seconds_start=$(date "+%s")
  echo "Indexing BioConcepts" >&2

  cd "${indexedBase}"

  target="${indexedBase}"

  # arguments are domain, transform table and output prefix
  BioConcepts "chemical" "meshname" "biocchem"
  BioConcepts "disease" "meshname" "biocdisz"
  BioConcepts "gene" "genename" "biocgene"

  echo "Indexing GeneRIFs"
  gunzip -c "${extrasBase}/generifs_basic.gz" |
  go run "$pth/extern/prep-generif.go" "${extrasBase}/genename.txt" "${extrasBase}/genesyns.txt" |
  go run "$pth/extern/prep-finish.go" 5000000 "${indexedBase}" "generifs"

  # record and report how long the step took
  seconds_end=$(date "+%s")
  seconds=$((seconds_end - seconds_start))
  IDX=$seconds
  echo "IDX $IDX seconds" >&2
  echo "" >&2
  sleep 1
fi

# Inversion step: pipe every .e2x.gz file of the Indexed folder through
# "rchive -e2invert" and write the result as .inv.gz in the Inverted folder.
if [ "$e2invert" = true ]
then
  seconds_start=$(date "+%s")
  echo "Inverting BioConcepts Indices" >&2

  # the glob below is relative, so it must not run unless the cd succeeded
  if [ -d "${indexedBase}" ] && cd "${indexedBase}"
  then
    target="${invertedBase}"
    for fl in *.e2x.gz
    do
      # when nothing matches, the loop receives the pattern itself; skipping it
      # avoids writing a file literally named "*.inv.gz"
      [ -f "$fl" ] || continue
      base=${fl%.e2x.gz}
      echo "$base.inv"
      gunzip -c "$fl" |
      rchive -e2invert |
      gzip -1 > "$target/$base.inv.gz"
      sleep 1
    done
  fi

  seconds_end=$(date "+%s")
  seconds=$((seconds_end - seconds_start))
  INV=$seconds
  echo "INV $INV seconds" >&2
  echo "" >&2
  sleep 1
fi

# Merge step: hand all .inv.gz files of the Inverted folder to "rchive -merge",
# with the Merged folder as the destination.
if [ "$e2merge" = true ]
then
  seconds_start=$(date "+%s")
  echo "Merging Inverted Indices" >&2

  # the glob below is relative, so it must not run unless the cd succeeded
  if [ -d "${invertedBase}" ] && cd "${invertedBase}"
  then
    target="${mergedBase}"
    find "$target" -name "*.mrg" -delete
    find "$target" -name "*.mrg.gz" -delete
    osname=$( uname -s | sed -e 's/_NT-.*$/_NT/; s/^MINGW[0-9]*/CYGWIN/' )
    if [ "$osname" = "CYGWIN_NT" ] && [ -x /bin/cygpath ]
    then
      target=$( cygpath -w "$target" )
    fi
    target=${target%/}

    # an unmatched pattern expands to itself, so confirm the first entry exists
    # before passing the list to rchive
    inv_files=( *.inv.gz )
    if [ -e "${inv_files[0]}" ]
    then
      rchive -gzip -merge "$target" "${inv_files[@]}"
    else
      echo "No inverted index files found to merge" >&2
    fi
  fi

  seconds_end=$(date "+%s")
  seconds=$((seconds_end - seconds_start))
  MRG=$seconds
  echo "MRG $MRG seconds" >&2
  echo "" >&2
  sleep 1
fi

# Postings step: pass the .mrg.gz files of the Merged folder to
# "rchive -promote" in sorted batches of at most 100 names.
if [ "$e2post" = true ]
then
  seconds_start=$(date "+%s")
  echo "Producing Postings Files" >&2

  # the glob below is relative, so it must not run unless the cd succeeded
  if [ -d "${mergedBase}" ] && cd "${mergedBase}"
  then
    target="${postingsBase}"
    osname=$( uname -s | sed -e 's/_NT-.*$/_NT/; s/^MINGW[0-9]*/CYGWIN/' )
    if [ "$osname" = "CYGWIN_NT" ] && [ -x /bin/cygpath ]
    then
      target=$( cygpath -w "$target" )
    fi
    target=${target%/}
    for fl in *.mrg.gz
    do
      # skip the pattern itself when no merged file exists
      [ -f "$fl" ] || continue
      echo "$fl"
    done |
    sort |
    xargs -n 100 echo |
    while read -r files
    do
      # some xargs implementations run echo once even on empty input
      [ -n "$files" ] || continue
      # $files is deliberately unquoted: each batch line is split back into names
      rchive -db "$dbase" -promote "$target" "$fields" $files
    done
  fi

  seconds_end=$(date "+%s")
  seconds=$((seconds_end - seconds_start))
  PST=$seconds
  echo "PST $PST seconds" >&2
  echo "" >&2
  sleep 1
fi

wait

# check postings: after a postings run, a known query must return a known
# identifier before the merged files are removed
okay=""
if [ "$e2post" = true ]
then
  okay=$( xsearch -db pubmed -query "rad57 [GRIF]" | grep -w 33421364 )

  if [ -n "$okay" ]
  then
    echo "Archive and Index are OK" >&2
    echo "" >&2

    if [ -d "${mergedBase}" ]
    then
      target="${mergedBase}"
      find "$target" \( -name "*.mrg" -o -name "*.mrg.gz" \) -delete
    fi
  fi
fi

# leave the working folders before printing the summary
cd

echo "ARCHIVE-NLMNLP" >&2

echo "" >&2

# PrintTime writes "<label> <seconds> seconds" to stderr for a step that ran.
#
# Arguments: $1 - flag, the line is printed only when it is the word "true"
#            $2 - label
#            $3 - elapsed seconds
PrintTime() {

  [ "$1" = true ] || return 0

  echo "$2 $3 seconds" >&2
}

# per-step summary, in processing order
PrintTime "$download" "DWN" "$DWN"

PrintTime "$e2index" "IDX" "$IDX"
PrintTime "$e2invert" "INV" "$INV"
PrintTime "$e2merge" "MRG" "$MRG"
PrintTime "$e2post" "PST" "$PST"

echo "" >&2

# PrintTotalElapsedTime writes one line to stderr giving a duration in seconds
# and, from one minute upward, the same duration split into days, hours,
# minutes and seconds (zero-valued units are omitted).
#
# Arguments: $1 - label printed at the start of the line
#            $2 - duration in whole seconds; a missing or empty value counts as 0
function PrintTotalElapsedTime {
  local L=$1
  local T=${2:-0}
  local D=$((T/60/60/24))
  local H=$((T/60/60%24))
  local M=$((T/60%60))
  local S=$((T%60))
  printf '%s %d second' "$L" "$T" 1>&2
  # every count except exactly one takes the plural, including zero
  (( T != 1 )) && printf 's' 1>&2
  if (( T > 59 ))
  then
    printf ', or' 1>&2
    (( D > 0 )) && printf ' %d day' "$D" 1>&2
    (( D > 1 )) && printf 's' 1>&2
    (( H > 0 )) && printf ' %d hour' "$H" 1>&2
    (( H > 1 )) && printf 's' 1>&2
    (( M > 0 )) && printf ' %d minute' "$M" 1>&2
    (( M > 1 )) && printf 's' 1>&2
    (( S > 0 )) && printf ' %d second' "$S" 1>&2
    (( S > 1 )) && printf 's' 1>&2
  fi
  printf '\n' 1>&2
}

# report the time elapsed since the script started
total_end=$(date "+%s")
total=$((total_end - total_start))
TOT=$total
PrintTotalElapsedTime "TOT" "$TOT"
echo "" >&2
