#! /usr/bin/perl -w
use lib '/usr/lib/perl'; use INN::Config;
use strict;

# @(#)scanspool.pl	1.20 4/6/92 00:47:35
#
# Written by:  Landon Curt Noll		(chongo was here  /\../\)
#
# This code is placed in the public domain.
#
# scanspool - Perform a big scan over all articles in <patharticles>
#
# usage:
#    scanspool [-cnv] [-a active-file] [-s spool-dir]
#
#    -a active-file	active file to use (default <pathdb>/active)
#    -s spool-dir	spool tree (default <patharticles>)
#    -v 		verbose mode
#			verbose messages begin with a tab
#			show articles found in non-active directories
#    -c			check article filenames, don't scan the articles
#    -n			don't throttle innd
#
# NOTE: This take a while, -v is a good thing if you want to know
#	how far this program has progressed.
#
# This program will scan first the active file, noting problems such as:
#
#	malformed line
#	group aliased to a non-existent group
#	group aliased to a group that is also aliased
#
# Then it will examine all articles under your news spool directory,
# looking for articles that:
#
#	basename that starts with a leading 0
#	basename that is out of range with the active file
#	does not contain a Newsgroups: line
#	article that is all header and no text
#	is in a directory for which there is no active group
#	article that is in a group to which it does not belong
#
# Scanspool understands aliased groups.  Thus, if an article is posted
# to foo.old.name that is aliased to foo.bar, it will be expected to
# be found under foo.bar and not foo.old.name.
#
# Any group that is of type 'j' or 'x' (4th field of the active file)
# will be allowed to show up under the junk group.
#
# Scanspool assumes that the path of a valid newsgroup's directory
# from the top of the spool tree will not contain any "." character.
# Thus, directories such as out.going, tmp.dir, in.coming and
# news.archive will not be searched.  This program also assumes that
# article basenames contain only decimal digits.  Last, files under
# the top level directory "lost+found" are not scanned.
#
# The output of scanspool will start with one of 4 forms:
#
#    FATAL: 	    fatal or internal error 			(to stderr)
#
#    WARN: 	    active or article format problem,		(to stderr)
#		    group alias problem, find error,
#		    article open error
#
#    path/123:	    basename starts with 0,			(to stdout)
#		    article number out of range,
#		    article in the wrong directory,
#		    article in directory not related to
#		        an active non-aliases newsgroup
#
#    \t ...	    verbose message starting with a tab		(to stdout)


# Data structures
#
my %gname2type;
# $gname2type{$name}
#    $name	- newsgroup name in foo.dot.form
#    produces  => 4th active field  (y, n, x, ...)
#		  alias type is "=", not "=foo.bar"
#
my %realgname;
# $realgname{$name}
#    $name      - newsgroup name in foo.dot.form
#    produces  => newsgroup name in foo.dot.form
#		  if type is =, this will be a.b, not $name
#
my %lowart;
# $lowart{$name}
#    $name      - newsgroup name in foo.dot.form
#    produces  => lowest article allowed in the group
#		  if type is =, this is not valid
#
my %highart;
# $highart{$name}
#    $name      - newsgroup name in foo.dot.form
#    produces  => highest article allowed in the group
#		  if type is =, this is not valid
#		  If $highart{$name} < $lowart{$name},
#		  then the group should be empty

# perl requirements
#
use Getopt::Std;

# global constants
#
my $prog = $0;                          # our name
my $spool = "$INN::Config::patharticles";
my $active = "$INN::Config::active";
my $ctlinnd = "$INN::Config::pathbin/ctlinnd";
my $reason = "running scanspool";       # throttle reason

# parse args
#
my %opt;
getopts("a:s:vcn", \%opt);
$active = $opt{'a'} if defined $opt{'a'};
$spool = $opt{'s'} if defined $opt{'s'};

# throttle innd unless -n
#
if (! defined $opt{'n'}) {
    system("$ctlinnd throttle '$reason' >/dev/null 2>&1");
}

# process the active file
#
parse_active($active);

# check the spool directory
#
check_spool($spool);

# unthrottle innd unless -n
#
if (! defined $opt{'n'}) {
    system("$ctlinnd go '$reason' >/dev/null 2>&1");
}

# all done
exit(0);


# parse_active - parse the active file
#
# From the active file, fill out the %gname2type (type of newsgroup)
# and %realgname (real/non-aliased name of group), %lowart & %highart
# (low and high article numbers).  This routine will also check for
# aliases to missing groups or groups that are also aliases.
#
sub parse_active
{
    my ($active) = @_;  # the name of the active file to use
    my $ACTIVE;         # active file handle
    my $line;           # active file line
    my $name;           # name of newsgroup
    my $low;            # low article number
    my $high;           # high article number
    my $type;           # type of newsgroup (4th active field)
    my $alias;          # realname of an aliased group
    my $linenum;        # active file line number

    # if verbose (-v), say what we are doing
    print "\tscanning $active\n" if defined $opt{'v'};

    # open the active file
    open ($ACTIVE, '<', $active) || fatal(1, "cannot open $active");

    # parse each line
    $linenum = 0;
    while ($line = <$ACTIVE>) {

	# count the line
	++$linenum;

	# verify that we have a correct number of tokens
	if ($line !~ /^\S+ 0*(\d+) 0*(\d+) \S+$/o) {
	    problem("WARNING: active line is mal-formed at line $linenum");
	    next;
	}
	($name, $high, $low, $type) = $line =~ /^(\S+) 0*(\d+) 0*(\d+) (\S+)$/o;

	# watch for duplicate entries
	if (defined $realgname{$name}) {
	    problem("WARNING: ignoring duplicate group: $name, at active line $linenum");
	    next;
	}

	# record which type it is
	$gname2type{$name} = $type;

	# record the low and high article numbers
	$lowart{$name} = $low;
	$highart{$name} = $high;

	# determine the directory and real group name
	if ($type eq "j" || $type eq "x") {
	    $alias = $name;
	} elsif ($type =~ /^=(.+)/o) {
	    $alias = $1;
	    $gname2type{$name} = "=";	# rename type to be just =
	} else {
	    $alias = $name;
	}
	$realgname{$name} = $alias;
    }

    # close the active file
    close $ACTIVE;

    # be sure that any alias type is aliased to a real group
    foreach my $name (keys %realgname) {

	# skip if not an alias type
	next if $gname2type{$name} ne "=";

	# be sure that the alias exists
	$alias = $realgname{$name};
	if (! defined $realgname{$alias}) {
	    problem("WARNING: alias for $name: $alias, is not a group");
	    next;
	}

	# be sure that the alias is not an alias of something else
	if ($gname2type{$alias} eq "=") {
	    problem("WARNING: alias for $name: $alias, is also an alias");
	    next;
	}
    }
    return;
}


# problem - report a problem to stdout
#
# Print a message to stdout.  Parameters are space separated.
# A final newline is appended to it.
#
# usage:
#	problem(arg, arg2, ...)
#
sub problem
{
    # print the line with the header and newline
    my $line = join(" ", @_);
    print STDERR $line, "\n";
    return;
}


# fatal - report a fatal error to stderr and exit
#
# Print a message to stderr.  The message has the program name prepended
# to it.  Parameters are space separated.  A final newline is appended
# to it.  This function exits with the code of exitval.
#
# usage:
#	fatal(exitval, arg, arg2, ...)
#
sub fatal
{
    my ($exitval, @args) = @_;

    # firewall, in case we're called with less than two arguments
    if ($#_ < 1) {
	print STDERR "FATAL: fatal() called with only ", $#_+1, " arguments\n";
	if ($#_ < 0) {
	    $exitval = -1;
	}
    }

    # print the error message
    my $line = join(" ", @args);
    print STDERR "$prog: $line\n";

    # unthrottle innd unless -n
    #
    if (! defined $opt{'n'}) {
	system("$ctlinnd go '$reason' >/dev/null 2>&1");
    }

    # exit
    exit $exitval;
}


# check_spool - check the articles found in the spool directory
#
# This subroutine will check all articles found under the $spool directory.
# It will examine only file path that do not contain any "." or whitespace
# character, and whose basename is completely numeric.  Files under
# lost+found will also be ignored.
#
# given:
#	$spooldir  - top of <patharticles> tree
#
sub check_spool
{
    my ($spooldir) = @_;        # top of article tree
    my $filename;               # article pathname under $spool
    my $artgrp;                 # group of an article
    my $artnum;                 # article number in a group
    my $prevgrp = '';           # previous different value of $artgrp
    my $preverrgrp = '';        # previous non-active $artgrp
    my $ARTICLE;                # article handle
    my $aline;                  # header line from an article
    my @group;                  # array of groups from the Newsgroups: header
    my $FINDFILE;               # find command pipe handle

    # if verbose, say what we are doing
    print "\tfinding articles under $spooldir\n" if defined $opt{'v'};

    # move to the $spool directory
    chdir $spooldir or fatal(2, "cannot chdir to $spool");

    # start finding files
    #
    if (!open $FINDFILE, '-|',
	  "find . \\( -type f -o -type l \\) -name '[0-9]*' -print 2>&1") {
	fatal(3, "cannot start find in $spool");
    }

    # process each history line
    #
    while ($filename = <$FINDFILE>) {

	# if the line contains find:, assume it is a find error and print it
	chomp($filename);
	if ($filename =~ /find:\s/o) {
	    problem("WARNING:", $filename);
	    next;
	}

	# remove the ./ that find put in our path
	$filename =~ s#^\./##o;

	# skip if this path has a . in it (beyond a leading ./)
	next if ($filename =~ /\./o);

	# skip if lost+found
	next if ($filename =~ m:^lost+found/:o);

	# skip if not a numeric basename
	next if ($filename !~ m:/\d+$:o);

	# get the article's newsgroup name (based on its path from $spool)
	$artgrp = $filename;
	$artgrp =~ s#/\d+$##o;
	$artgrp =~ s#/#.#go;

	# if verbose (-v), then note if our group changed
	if (defined $opt{'v'} && $artgrp ne $prevgrp) {
	    print "\t$artgrp\n";
	    $prevgrp = $artgrp;
	}

	# note if the article is not in a directory that is used by
	# a real (non-aliased) group in the active file
	#
	# If we complained about this group before, don't complain again.
	# If verbose, note files that could be removed.
	#
	if (!defined $gname2type{$artgrp} || $gname2type{$artgrp} =~ /[=jx]/o){
	    if ($preverrgrp ne $artgrp) {
		problem("$artgrp: not an active group directory");
		$preverrgrp = $artgrp;
	    }
	    if (defined $opt{'v'}) {
		problem("$filename: article found in non-active directory");
	    }
	    next;
	}

	# check on the article number
	$artnum = $filename;
	$artnum =~ s#^.+/##o;
	if ($artnum =~ m/^0/o) {
	    problem("$filename: article basename starts with a 0");
	}
	if (defined $gname2type{$artgrp}) {
	    if ($lowart{$artgrp} > $highart{$artgrp}) {
		problem("$filename: active indicates group should be empty");
	    } elsif ($artnum < $lowart{$artgrp}) {
		problem("$filename: article number is too low");
	    } elsif ($artnum > $highart{$artgrp}) {
		problem("$filename: article number is too high");
	    }
	}

	# if check filenames only (-c), then do nothing else with the file
	next if (defined $opt{'c'});

	# don't open a control or junk, they can be from anywhere
	next if ($artgrp eq "control" || $artgrp eq "junk");

	# try open the file
	if (!open $ARTICLE, '<', $filename) {

	    # the find is now gone (expired?), give up on it
	    problem("WARNING: cannot open $filename");
	    next;
	}

# read until the Newsgroups: header line is found
	AREADLINE:
	while ($aline = <$ARTICLE>) {

	    # catch the Newsgroups: header
	    if ($aline =~ /^Newsgroups:\w*\W/io) {
		# convert $aline into a comma separated list of groups
                # remove both CR and LF from the header value (because the article may be
                # stored in wire-format).
		$aline =~ s/^Newsgroups://io;
		$aline =~ tr/ \t\r\n//d;

		# form an array of news groups
		@group = split(",", $aline);

		# see if any groups in the Newsgroups: header are our group
		for (my $j=0; $j <= $#group; ++$j) {

		    # look at the group
		    if (exists $realgname{$group[$j]} and $realgname{$group[$j]} eq $artgrp) {
			# this article was posted to this group
			last AREADLINE;
		    }
		}

		# no group or group alias was found
		problem("$filename: does not belong in $artgrp according to its Newsgroups: header");
		last;

	    # else watch for the end of the header
	    } elsif ($aline =~ /^\s*$/o) {

		# no Newsgroups: header found
		problem("WARNING: $filename: no Newsgroups: header");
		last;
	    }
	    if (eof $ARTICLE) {
		problem("WARNING: $filename: EOF found while reading header");
	    }
	}

	# close the article
	close $ARTICLE;
    }

    # all done with the find
    close $FINDFILE;
    return;
}
