#! /usr/bin/perl -w
use lib '/usr/lib/perl'; use INN::Config;
use strict;

# @(#)scanspool
#
# Originally written by Landon Curt Noll (chongo was here /\../\)
#
# This code is placed in the public domain.
#
# As this may take a while to run, using the -v switch is recommended to see
# how far the program has progressed.
#
# This program will scan first the active file, noting problems such as:
#
#     - malformed lines
#     - duplicate entries
#     - newsgroups aliased to a non-existent newsgroup
#     - newsgroups aliased to a newsgroup that is also aliased
#
# Then it will examine all articles under your news spool directory,
# complaining about articles that:
#
#     - have a basename that starts with a leading 0
#     - have a basename that is out of range according to the low and high
#       water marks of the active file
#     - do not contain a Newsgroups header field
#     - are in a directory for which there is no newsgroup in the active file
#     - are in a newsgroup to which they do not belong
#
# scanspool will also complain about directories with an all-digit component,
# in case the parent directory is a newsgroup with a high water mark below that
# all-digit component, as the name of such directories will conflict with a
# future article file with the same name.
#
# scanspool understands aliased groups.  Thus, if an article is posted
# to foo.old.name that is aliased to foo.bar, it will be expected to
# be found under foo.bar and not foo.old.name.
#
# scanspool assumes that the path of a valid newsgroup's directory
# from the top of the spool tree will not contain any "." character.
# Thus, directories such as out.going, tmp.dir, in.coming and
# news.archive will not be searched.  This program also assumes that
# article basenames contain only decimal digits.  Last, files under
# the top level directory "lost+found" are not scanned.
#
# The output of scanspool will start with one of 4 forms:
#
#    FATAL:         fatal or internal error                     (to stderr)
#
#    WARNING:       active or article format problem,           (to stderr)
#                   newsgroup alias problem, find(1) error,
#                   article open error
#
#    path/123:      basename starts with 0,                     (to stdout)
#                   article number out of range,
#                   article in the wrong directory,
#                   article in a directory not related to
#                   an active non-aliased newsgroup
#
#    \t ...         verbose message starting with a tab         (to stdout)

# Data structures
#
my %gname2type;
# $gname2type{$name}
#    $name      - newsgroup name in foo.dot.form
#    produces  => 4th active field  (y, n, x, ...)
#                 alias type is "=", not "=foo.bar"
#
my %realgname;
# $realgname{$name}
#    $name      - newsgroup name in foo.dot.form
#    produces  => newsgroup name in foo.dot.form
#                 if type is =, this will be a.b, not $name
#
my %lowart;
# $lowart{$name}
#    $name      - newsgroup name in foo.dot.form
#    produces  => lowest article allowed in the group
#                 if type is =, this is not valid
#
my %highart;
# $highart{$name}
#    $name      - newsgroup name in foo.dot.form
#    produces  => highest article allowed in the group
#                 if type is =, this is not valid
#                 If $highart{$name} < $lowart{$name},
#                 then the group should be empty

use Getopt::Std;

# Define variables Getopt::Std uses for --help and --version.
$Getopt::Std::STANDARD_HELP_VERSION = 1;
our $VERSION = $INN::Config::version;
$VERSION =~ s/INN //;

$0 =~ s!.*/!!;

my $usage = "Usage:
  $0 [-cnrv] [-a active-file] [-s spool-dir]

  Perform a sanity scan over all articles in <patharticles>

Options:
  -a active-file     active file to use (default <pathdb>/active)
  -c                 check article filenames, don't scan the articles
  -n                 don't throttle innd
  -r                 remove articles reported to have a problem
  -s spool-dir       spool tree (default <patharticles>)
  -v                 verbose mode
                     verbose messages begin with a tab
                     show articles found in non-active directories
";

sub HELP_MESSAGE {
    print $usage;
    exit(0);
}

# global constants
#
my $prog = $0;                       # our name
my $spool = "$INN::Config::patharticles";
my $active = "$INN::Config::active";
my $ctlinnd = "$INN::Config::pathbin/ctlinnd";
my $reason = "running scanspool";    # throttle reason

# parse args
#
my %opt;
getopts("a:cnrs:v", \%opt) || die $usage;
$active = $opt{'a'} if defined $opt{'a'};
$spool = $opt{'s'} if defined $opt{'s'};

# throttle innd unless -n
#
if (!defined $opt{'n'}) {
    system("$ctlinnd throttle '$reason' >/dev/null 2>&1");
}

# process the active file
#
parse_active($active);

# check the spool directory
#
check_spool($spool);

# unthrottle innd unless -n
#
if (!defined $opt{'n'}) {
    system("$ctlinnd go '$reason' >/dev/null 2>&1");
}

# all done
exit(0);

# parse_active - parse the active file
#
# From the active file, fill out the %gname2type (type of newsgroup)
# and %realgname (real/non-aliased name of group), %lowart & %highart
# (low and high article numbers).  This routine will also check for
# aliases to missing groups, or groups that are also aliases.
#
sub parse_active {
    my ($active) = @_;    # the name of the active file to use
    my $ACTIVE;           # active file handle
    my $line;             # active file line
    my $name;             # name of newsgroup
    my $low;              # low article number
    my $high;             # high article number
    my $type;             # type of newsgroup (4th active field)
    my $alias;            # realname of an aliased group
    my $linenum;          # active file line number

    # if verbose (-v), say what we are doing
    print "\tscanning $active\n" if defined $opt{'v'};

    # open the active file
    open($ACTIVE, '<', $active) || fatal(1, "cannot open $active");

    # parse each line
    $linenum = 0;
    while ($line = <$ACTIVE>) {
        # count the line
        ++$linenum;

        # verify that we have a correct number of tokens
        if ($line !~ /^\S+ 0*(\d+) 0*(\d+) \S+$/o) {
            problem("WARNING: active line is malformed at line $linenum");
            next;
        }
        ($name, $high, $low, $type)
          = $line =~ /^(\S+) 0*(\d+) 0*(\d+) (\S+)$/o;

        # watch for duplicate entries
        if (defined $realgname{$name}) {
            problem("WARNING: ignoring duplicate group: $name"
                  . ", at active line $linenum");
            next;
        }

        # record which type it is
        $gname2type{$name} = $type;

        # record the low and high article numbers
        $lowart{$name} = $low;
        $highart{$name} = $high;

        # determine the directory and real group name
        # no special rule for "j" or "x"
        if ($type =~ /^=(.+)/o) {
            $alias = $1;
            $gname2type{$name} = "=";    # rename type to be just =
        } else {
            $alias = $name;
        }
        $realgname{$name} = $alias;
    }

    # close the active file
    close $ACTIVE;

    # be sure that any alias type is aliased to a real group
    foreach my $name (keys %realgname) {
        # skip if not an alias type
        next if $gname2type{$name} ne "=";

        # be sure that the alias exists
        $alias = $realgname{$name};
        if (!defined $realgname{$alias}) {
            problem("WARNING: alias for $name: $alias, is not a group");
            next;
        }

        # be sure that the alias is not an alias of something else
        if ($gname2type{$alias} eq "=") {
            problem("WARNING: alias for $name: $alias, is also an alias");
            next;
        }
    }
    return;
}

# problem - report a problem to stdout
#
# Print a message to stdout.  Parameters are space separated.
# A final newline is appended to it.
#
# usage:
#       problem(arg, arg2, ...)
#
sub problem {
    # print the line with the header field and newline
    my $line = join(" ", @_);
    print STDERR $line, "\n";
    return;
}

# rm_file - remove a file
#
# Remove the file given as argument, if the -r flag is used.
# Otherwise, it is a no-op.
# Report a fatal error to stderr and exit if the removal fails.
#
# usage:
#       rm_file(filename)
#
sub rm_file {
    my ($filename) = @_;

    if (defined $opt{'r'}) {
        unlink("$filename") or fatal(4, "cannot remove $filename");
        problem("$filename: successfully removed");
    }

    return;
}

# fatal - report a fatal error to stderr and exit
#
# Print a message to stderr.  The message has the program name prepended
# to it.  Parameters are space separated.  A final newline is appended
# to it.  This function exits with the code of exitval.
#
# usage:
#       fatal(exitval, arg, arg2, ...)
#
sub fatal {
    my ($exitval, @args) = @_;

    # firewall, in case we're called with less than two arguments
    if ($#_ < 1) {
        print STDERR "FATAL: fatal() called with only ", $#_ + 1,
          " arguments\n";
        if ($#_ < 0) {
            $exitval = -1;
        }
    }

    # print the error message
    my $line = join(" ", @args);
    print STDERR "$prog: $line\n";

    # unthrottle innd unless -n
    #
    if (!defined $opt{'n'}) {
        system("$ctlinnd go '$reason' >/dev/null 2>&1");
    }

    # exit
    exit $exitval;
}

# check_spool - check the articles found in the spool directory
#
# This subroutine will check all articles found under the $spool directory.
# It will examine only file path that do not contain any "." or whitespace
# character, and whose basename is completely numeric.  Files under
# lost+found will also be ignored.
#
# given:
#       $spooldir  - top of <patharticles> tree
#
sub check_spool {
    my ($spooldir) = @_;    # top of article tree
    my $filename;           # article pathname under $spool
    my $artgrp;             # group of an article
    my $artnum;             # article number in a group
    my $prevgrp = '';       # previous different value of $artgrp
    my $preverrgrp = '';    # previous non-active $artgrp
    my $ARTICLE;            # article handle
    my $aline;              # header line from an article
    my $newsgroupsField;    # in a continuation line, reading Newsgroups
    my @group;              # array of groups from the Newsgroups header field
    my $remove;             # mark an article as to be removed
    my $FINDFILE;           # find command pipe handle

    # if verbose, say what we are doing
    print "\tfinding articles under $spooldir\n" if defined $opt{'v'};

    # move to the $spool directory
    chdir $spooldir or fatal(2, "cannot chdir to $spool");

    # start finding files
    #
    if (
        !open $FINDFILE, '-|',
        "find . \\( -type d -o -type f -o -type l \\) -name '[0-9]*' "
        . "-print 2>&1",
    ) {
        fatal(3, "cannot start find in $spool");
    }

    # process each history line
    #
    while ($filename = <$FINDFILE>) {
        # if the line contains "find:", assume it is a find error and print it
        chomp($filename);
        if ($filename =~ /find:\s/o) {
            problem("WARNING:", $filename);
            next;
        }

        # remove the leading "./" that find put in our path
        $filename =~ s#^\./##o;

        # skip if this path has a "." in it (beyond a leading "./")
        next if ($filename =~ /\./o);

        # skip if lost+found
        next if ($filename =~ m:^lost+found/:o);

        # skip if not a numeric basename
        next if ($filename !~ m:/\d+$:o);

        # skip timehash and timecaf spools
        next
          if $filename =~ /^time-\d\d\/\d\d/
          or $filename =~ /^time-\d\d\/..\/\d\d/
          or $filename =~ /^timecaf-\d\d\/\d\d/;

        # get the article's newsgroup name (based on its path from $spool)
        $artgrp = $filename;
        $artgrp =~ s#/\d+$##o;
        $artgrp =~ s#/#.#go;

        # get the article number (based on its path from $spool)
        $artnum = $filename;
        $artnum =~ s#^.+/##o;

        # warn if a directory with an all-digit component is found, and the
        # parent directory ($artgrp) is an active newsgroup with a high water
        # mark below that all-digit component
        if (-d "$filename") {
            if (defined $gname2type{$artgrp}
                && $highart{$artgrp} > 1
                && $highart{$artgrp} < $artnum)
            {
                problem(
                    "WARNING:",
                    "$filename: directory with an all-digit component",
                    "(will conflict with a future article with the same name",
                    "in $artgrp)",
                );
            }
            next;
        }

        # if verbose (-v), then note if our group changed
        if (defined $opt{'v'} && $artgrp ne $prevgrp) {
            print "\t$artgrp\n";
            $prevgrp = $artgrp;
        }

        # note if the article is not in a directory that is used by
        # a real (non-aliased) group in the active file
        #
        # If we complained about this group before, don't complain again.
        # If verbose, note files that could be removed.
        #
        if (!defined $gname2type{$artgrp} || $gname2type{$artgrp} =~ /[=jx]/o)
        {
            if ($preverrgrp ne $artgrp) {
                problem("$artgrp: not an active group directory (probably "
                      . "a removed newsgroup; see articles with -v)");
                $preverrgrp = $artgrp;
            }
            if (defined $opt{'v'}) {
                problem("$filename: article found in non-active directory");
            }
            rm_file($filename);
            next;
        }

        # check on the article number
        if ($artnum =~ m/^0/o) {
            problem("$filename: article basename starts with a 0");
            rm_file($filename);
            next if defined $opt{'r'};
        }
        if (defined $gname2type{$artgrp}) {
            if ($lowart{$artgrp} > $highart{$artgrp}) {
                problem("$filename: active indicates group should be empty");
                rm_file($filename);
                next if defined $opt{'r'};
            } elsif ($artnum < $lowart{$artgrp}) {
                problem("$filename: article number is too low "
                      . "(first article has number $lowart{$artgrp} "
                      . "in the active file)");
                rm_file($filename);
                next if defined $opt{'r'};
            } elsif ($artnum > $highart{$artgrp}) {
                problem("$filename: article number is too high "
                      . "(last article has number $highart{$artgrp} "
                      . "in the active file");
                rm_file($filename);
                next if defined $opt{'r'};
            }
        }

        # if check filenames only (-c), then do nothing else with the file
        next if (defined $opt{'c'});

        # don't open a control or junk, they can be from anywhere
        next
          if ($artgrp eq "control"
              || $artgrp =~ /^control\./
              || $artgrp eq "junk");

        if (-z "$filename") {
            problem("WARNING: $filename: empty file");
            rm_file($filename);
            next;
        }

        # try open the file
        if (!open $ARTICLE, '<', $filename) {
            # the find is now gone (expired?), give up on it
            problem("WARNING: cannot open $filename");
            next;
        }

        @group = ();
        $newsgroupsField = 0;
        $remove = 0;

        # read until the Newsgroups header field is found
      AREADLINE:
        while ($aline = <$ARTICLE>) {
            # catch the Newsgroups header field
            if ($aline =~ /^Newsgroups:/io
                or ($aline =~ /^\s+/o and $newsgroupsField > 0))
            {
                # convert $aline into a comma separated list of groups
                # remove both CR and LF from the header field body (because
                # the article may be stored in wire-format)
                $aline =~ s/^Newsgroups://io;
                $aline =~ tr/ \t\r\n//d;

                # form an array of newsgroups, and retain them (in case of a
                # folded Newsgroups header field)
                push(@group, split(",", $aline));
                $newsgroupsField = 1;
            } elsif ($aline =~ /^\r?\n$/o) {
                # end of headers
                problem("WARNING: $filename: no Newsgroups header field");
                $remove++;
                last;
            } else {
                # do not continue parsing the headers as we have just found out
                # the whole contents of the Newsgroups header field
                last if $newsgroupsField > 0;
            }
            if (eof $ARTICLE) {
                problem("WARNING: $filename: article with no body");
                $remove++;
            }
        }

        # check if the Newsgroups header field contains our group
        if ($newsgroupsField > 0) {
            my $found = 0;
            for (my $j = 0; $j <= $#group; ++$j) {
                # look at the group
                if (exists $realgname{ $group[$j] }
                    and $realgname{ $group[$j] } eq $artgrp)
                {
                    # this article was posted to this group
                    $found++;
                    last;
                }
            }

            if ($found == 0) {
                # no group or group alias was found
                problem("$filename: does not belong in $artgrp"
                      . " according to its Newsgroups header field");
                $remove++;
            }
        }

        # close the article
        close $ARTICLE;

        # remove the article if asked to
        rm_file($filename) if $remove > 0;
    }

    # all done with the find
    close $FINDFILE;
    return;
}
