#!/usr/bin/bash
# set -x  # debug if uncommented
# Purpose:
# download a list of valid area codes from wikipedia
#
# Ed Attfield  2019-08-06 new, scrape wikipedia for valid area codes
#                         as input to hangup-fakenum

# Version string printed by the -V option.
VERSION='(NCID) 1.18'

# Uncomment if wget or awk are installed under /usr/local/bin.
#PATH=$PATH:/usr/local/bin

# Backup directory (originally described as the place for copies of
# ncidd.blacklist); nothing further down in this script refers to it.
BAKDIR='/var/backups/ncid'

# NCID configuration directory; the finished list is moved into it.
ConfigDir='/etc/ncid'

# Base name of the generated area code list. The downloaded page is stored
# under the same name with ".html" appended.
AC_list='valid-area-codes'

# Wikipedia page that carries the table of area codes.
URL='https://en.wikipedia.org/wiki/List_of_North_American_Numbering_Plan_area_codes'

# Ruler printed at the start of a run so that separate runs can be told
# apart when stdout/stderr are redirected to log files.
equals='==============================================================================='

# Print the help text on stdout and terminate the script.
# Globals:   prog (read) - script name shown in the "Usage:" line
# Arguments: $1 - exit status to terminate with (optional). Defaults to 1,
#                 so callers reporting a usage error still fail; the -h
#                 option passes 0 because asking for help is not an error.
usage() {
   cat <<EOF

Usage: $prog [options]
       
Options:
       [-h] [-V]

       -h = show this help

       -V = display version

       Scrapes a wikipedia entry to get a list of valid
       North America area code.

EOF

   exit "${1:-1}"
}

# Script name without its directory and without a ".sh" suffix. "$0" is
# quoted so that a path containing spaces does not confuse basename.
prog=$(basename -- "$0" .sh)

# Options on command line. The leading ':' puts getopts in silent mode: an
# unknown option sets opt to '?' and OPTARG to the offending character.
while getopts :hV opt ; do
    case "$opt" in
        h) usage 0 ;;
        V) echo "$prog $VERSION"; exit 0 ;;
        *) echo "Invalid option: -$OPTARG"; usage ;;
    esac
done

# Begin this run's output with an empty line.
printf '\n'

# Root gets a ruler and a time-stamped banner; any other user is told how to
# rerun the script and the run ends with status 1.
if [[ ${EUID} == 0 ]]; then
   printf '%s\n' "${equals}"
   printf '%s' "Running ${prog} at "
   date
else
   printf '%s\n' "${prog} must be run as root, try sudo ${prog}"
   exit 1
fi

# Log the arguments before they are shifted away.
printf '%s\n' "Command line: ${prog} $*"

# Drop the options that getopts has already processed (if any).
shift "$(( OPTIND - 1 ))"

# Write the awk parser to /var/tmp. The here-document delimiter is quoted,
# so the shell copies the text below verbatim without expanding anything.
# NOTE(review): this is a fixed file name in /var/tmp; the later steps of
# this script read and delete exactly this path, so it is kept unchanged.
cat << 'ENDAWK' > /var/tmp/get-areacodes-list.awk
# Extract the valid area codes from the wikipedia area code table.
#
# Input:  the downloaded page with every '<' replaced by a newline, so that
#         each line begins with a tag name ("tr>", "td>201", "/tr>", ...):
#            tr '<' '\012' < page.html | awk -f get-areacodes-list.awk
# Output: one line per accepted table row, holding the area code and, when
#         text was captured from the second cell, "\t # " plus that text in
#         lower case.

# Return the text following the first ">" of line when it is a usable area
# code, otherwise "".  Usable means exactly three digits (so placeholders
# such as "N/A" are not taken for a code) that do not contain "11", which
# leaves out service codes like 511 and 911.
function area_from(line,    pos, text) {
   pos = index(line, ">")
   if (pos == 0) return ""
   text = substr(line, pos + 1)
   if (text !~ /^[0-9][0-9][0-9]$/) return ""
   if (index(text, "11") > 0) return ""
   return text
}

BEGIN {
   found_tr = 0    # 1 while inside a row opened by a plain <tr>
   found_td = 0    # number of <td> cells seen in the current row
   got_area = 0    # area code of the current row, 0 when there is none
   words = ""      # lower-cased text captured from the second cell
}

# <tr>: a new row starts, forget whatever the previous row left behind
/^tr>$/ {
   found_tr = 1
   found_td = 0
   got_area = 0
   words = ""
}

# <td> with nothing behind it: only count the cell
/^td>$/ {
   found_td++
}

# <td> or <i> directly followed by text
/^td>.+$/ || /^i>.+$/ {
   if ($0 ~ /td>/) found_td++
   if (found_tr && found_td == 1) {
      # first cell: the text is the area code candidate
      code = area_from($0)
      if (code != "") got_area = code
   }
   else if (found_tr && found_td == 2) {
      # second cell: keep the text as a description and drop the row when
      # it says that the code is not actually in service
      pos = index($0, ">")
      if (pos > 0) {
         words = tolower(substr($0, pos + 1))
         if (words ~ /not in use|not an area code|reserved for|assigned for numbering relief|planned overlay|reserved to|proposed/) {
            got_area = 0
         }
      }
   }
}

# <a href="/wiki/...">: in the first cell the link text is the candidate
/^a href="\/wiki\// {
   if (found_tr && found_td == 1) {
      code = area_from($0)
      if (code != "") got_area = code
   }
}

# </tr>: end of row, print the code if the row produced one
/^\/tr>$/ {
   if (found_tr && got_area) {
      if (length(words) > 0) {
         print got_area "\t # " words
      }
      else {
         print got_area
      }
      found_tr = 0
      found_td = 0
      got_area = 0
   }
}
ENDAWK

# Change directory so the result is placed in the etc/ncid directory.
# Stop when that fails; otherwise the list would be moved into whatever
# directory the script was started from.
if ! cd "${ConfigDir}"; then
   echo "cannot change to directory ${ConfigDir}"
   rm -f /var/tmp/get-areacodes-list.awk
   exit 1
fi

# Download the wikipedia page that holds the area code table. wget's
# messages are sent to stdout so they end up in the same log as the rest.
wget -nv "${URL}" -O "/var/tmp/${AC_list}.html" 2>&1
rc=$?
if [ "${rc}" != 0 ]; then
   echo "wget ${URL}: download failed with return code ${rc}"
   # remove the partial download and the awk script written earlier
   rm -f "/var/tmp/${AC_list}.html" /var/tmp/get-areacodes-list.awk
   exit 1
fi

echo "Successfully downloaded ${AC_list} for hangup-fakenum."

# Put every tag on a line of its own and let the awk script pick out the
# area codes. rc is the exit status of awk, the last command of the pipeline.
tr '<' '\012' < "/var/tmp/${AC_list}.html" \
   | awk -f /var/tmp/get-areacodes-list.awk > "/var/tmp/${AC_list}"
rc=$?
rm -f /var/tmp/get-areacodes-list.awk

# A failed or empty parse must not replace the list that is already
# installed with an empty file.
if [ "${rc}" != 0 ] || [ ! -s "/var/tmp/${AC_list}" ]; then
   echo "no area codes could be extracted from ${URL}, keeping the existing ${AC_list}"
   rm -f "/var/tmp/${AC_list}"
   exit 1
fi

# Install the new list in the current directory (${ConfigDir}). The
# downloaded page stays in /var/tmp.
if ! mv "/var/tmp/${AC_list}" "${AC_list}"; then
   echo "cannot move /var/tmp/${AC_list} to ${ConfigDir}/${AC_list}"
   exit 1
fi
exit 0

