#! /usr/bin/perl -w

# vim:syntax=perl

use strict;

use lib '/usr/share/perl5';

use Lire::Email;
use Lire::DlfSchema;
use Lire::Program qw( :msg :dlf );

my $debug = 0;

my $schema = eval { Lire::DlfSchema::load_schema( "email" ); };
lr_err( "error loading email schema: $@" ) if $@;
my $hash2dlf = 
  $schema->make_hashref2asciidlf_func( qw/time msgid from_user from_domain
                                          size delay xdelay to_user to_domain
                                          stat xstat/ );

my $msgno;
my %msgs;

init_dlf_converter( "email");
my $lines       = 0;
my $dlflines    = 0;
my $errorlines  = 0;
while (<>) {
    chomp;
    $lines++;
    my @line = split ' ', $_;

    if ( @line < 4 ) {
        lr_warn( "skipping line '$_': has " . @line .
                 " field, should have at least 4" );
        $errorlines++;
        next;
    }

    # get '972810464.144861'
    my $time = shift @line;

    lr_debug( "number of keys in msgs: ", scalar keys %msgs )
      if $debug;

    if ($line[0] eq 'new') {
        if ($line[1] eq 'msg') {
            # 'new msg 6172'
            $msgno = $line[2];
            if (defined $msgs{$msgno}) {
                lr_info( "deleting msgno '$msgno': number is being reused" );
                delete $msgs{$msgno};
            }
            $msgs{$msgno}{time} = $time;
            $msgs{$msgno}{msgid} = $time . '-' . $msgno;
        } else {
            lr_warn( "skipping line '$_': it has a 'new', but second " .
                     "field is '" . $line[1] . "', not a 'msg'" );
            $errorlines++;
            next;
        }
    } elsif ($line[0] eq 'end') {
        if ($line[1] eq 'msg') {
            $msgno = $line[2];

            my $tmp;
            unless ( defined $msgs{$msgno}{fromaddress} ) {
                lr_notice( "skipping line '$_': fromaddress not " .
                           "found yet" );
                delete $msgs{$msgno};
                next;
            }

            unless (sanitize('emailadress', $msgs{$msgno}{fromaddress},
                             $tmp))
            {
                # delete $msgs{$msgno};
                lr_err ("cannot sanitize 'emailadress', '$tmp'" );
            }
            ($msgs{$msgno}{from_user}, $msgs{$msgno}{from_domain}) =
              splitemailadress($tmp);

            while (my ($dlvno, $dlv) = each %{ $msgs{$msgno}{deliveries} }) {
                # this delivery is inheriting properties from the message
                # it belongs to
                while (my ($k, $v) = each %{ $msgs{$msgno} }) {
                    next if $k eq 'deliveries';
                    if ($k eq 'time') {
                        unless (defined $dlv->{time}) {
                            lr_info( "found 'end msg' line for msg " .
                                     "'$msgno', but delivery '$dlvno' hasn't yet " .
                                     "got a timestamp. using the end timestamp" );
                            $dlv->{time} = $time;
                        }
                        $dlv->{xdelay} = $dlv->{time} - $v;
                        $dlv->{delay} = $dlv->{xdelay}; # XXX is this what
                                                            # we want?
                        $dlv->{time} = $v;
                        next;
                    }
                    if ( defined $dlv->{$k} ) {
                        delete $msgs{$msgno};
                        lr_err ( "key '$k' already in delivery '$dlvno' for " .
                                 "message '$msgno'" );
                    }
                    $dlv->{$k} = $v;
                }

                unless (sanitize('emailadress', $dlv->{toaddress}, $tmp)) {
                    delete $msgs{$msgno};
                    lr_err( "cannot sanitize 'emailadress', '$tmp'" );
                }
                ($dlv->{to_user}, $dlv->{to_domain}) =
                    splitemailadress( $tmp );

                print join( " ", @{ $hash2dlf->( $dlv ) } ), "\n";
                $dlflines++;
                delete $msgs{$msgno}{deliveries}{$dlvno};
            }
            delete $msgs{$msgno};
        } else {
            lr_warn( "skipping line '$_': it has a 'end', but second " .
                     "field is '" . $line[1] . "', not a 'msg'" );
            $errorlines++;
            next;
        }
        # make sure msgno gets flushed in _any_ case XXX

    } elsif ($line[0] eq 'info') {
        if ($line[1] eq 'msg') {
            # 'info msg 6161: bytes 18099 from <logreport@logreport.org> 
            #     qp 16700 uid 1015'
            ($msgno = $line[2]) =~ s/:$//;

            if ( $line[3] ne 'bytes' ) {
                lr_warn ( "evil line '$_': 'info msg', but no 'bytes', " .
                          "skipping" );
                $errorlines++;
                next;
            }

            unless (defined $msgs{$msgno}) {
                lr_info( "found info msg $msgno line, but msg $msgno " .
                         "was not yet announced in a new msg line. " .
                         "pretending it was" );
                $msgs{$msgno} = {};
            }

            if (defined $msgs{$msgno}{size}) {
               lr_info( "size for msg $msgno already defined, but " .
                        "found new one in line '$_'. overwriting." );
            }

            $msgs{$msgno}{size} = $line[4];

            if ( $line[5] ne 'from' ) {
                lr_warn ( "evil line '$_': 'info msg', but no 'from', " .
                          "skipping" );
                $errorlines++;
                next;
            }
            if (defined $msgs{$msgno}{fromaddress}) {
                lr_info( "fromaddress for msg $msgno already " .
                         "defined, but found new one in line '$_'. " . 
                         "overwriting." );
            }
            $msgs{$msgno}{fromaddress} = $line[6];
        } else {
            lr_warn( "skipping line '$_': it has a 'info', but second " .
                     "field is '" . $line[1] . "', not a 'msg'" );
            $errorlines++;
            next;
        }
    } elsif ($line[0] eq 'starting') {
        if ($line[1] eq 'delivery') {
            # starting delivery 4472: msg 6161 to local vanbaal@mdcc.cx

            unless ( @line == 8 ) {
                lr_warn( "evil line '$_': 'starting delivery', but no " .
                         "8 fields, skipping" );
                next;
            }

            (my $dlvno = $line[2]) =~ s/:$//;
            if ( $line[3] ne 'msg' ) {
                lr_warn( "evil line '$_': 'starting delivery', but no " .
                         "'msg', skipping" );
                $errorlines++;
                next;
            }

            $msgno = $line[4];
            if ( $line[5] ne 'to' ) {
                lr_warn( "evil line '$_': 'starting delivery', but no " .
                         "'to', skipping" );
                $errorlines++;
                next;
            }
            my $dest = $line[6]; # 'local' or 'remote'
            my $toaddress = $line[7];

            unless (defined $msgs{$msgno}) {
                lr_info( "found info msg $msgno line, but msg $msgno " .
                         "was not yet announced in a new msg line. " .
                         "pretending it was" );
                $msgs{$msgno} = {};
            }

            unless (defined $msgs{$msgno}{deliveries}) {
                # the first delivery for this message
                $msgs{$msgno}{deliveries} = {};
            }

            # go store this delivery
            if (defined $msgs{$msgno}{deliveries}{$dlvno}) {
                lr_info( "gonna flush msg '$msgno', delivery '$dlvno': ".
                         "reused" );
                delete $msgs{$msgno}{deliveries}{$dlvno};
            }
            $msgs{$msgno}{deliveries}{$dlvno}{dest} = $dest;
            $msgs{$msgno}{deliveries}{$dlvno}{toaddress} = $toaddress;
        } else {
            lr_warn( "skipping line '$_': it has a 'starting', but " .
                     "second field is '" . $line[1] . "', not a 'delivery'" );
            $errorlines++;
            next;
        }
    } elsif ($line[0] eq 'delivery') {
        # 'delivery 4466: success: did_0+0+1/'
        # 'delivery 4469: success: 213.46.240.6_accepted_message./R
        #    emote_host_said:_250_Message_received:_20001029065738.E
        #    ZXJ24328.amsmta02-svc@mdcc.cx/'

        # 'delivery 5037: deferral: Connected_to_216.33.151.135_but_
        #    connection_died._Possible_duplicate!_(#4.4.2)/'
        # 'delivery 5540: success: ezmlm-manage:_info:_qp_32060/did_0+0+3/
        # 'delivery 5641: failure: 64.4.42.7_failed_after_I_sent_the_
        #    message./Remote_host_said:_554_Transaction_failed/'
        # 'delivery 6266: deferral: Connected_to_195.193.176.2_but_se
        #    nder_was_rejected./Remote_host_said:_451_<joostvb@mdcc.c
        #    x>..._Sender_domain_must_resolve/
        # 'delivery 4464: deferral: CNAME_lookup_failed_temporarily._(#4.4.3)/
        # 'delivery 4533: deferral: Connected_to_212.27.32.3_but_send
        #    er_was_rejected./Remote_host_said:_451_DNS_temporary_fai
        #    lure_(#4.3.0)/

        unless ( @line == 4 ) {
            lr_warn( "evil line '$_': 'delivery', " .
                     "but no 4 fields, skipping\n" );
            $errorlines++;
            next;
        }

        (my $dlvno = $line[1]) =~ s/:$//;
        # 'deferral:' 'success:' 'failure:'
        (my $stat = $line[2]) =~ s/:$//;
        my $xstat = $line[3];

        # find to which message this delivery belongs
        my $found = 0;
        # Bizarre copy of HASH in aassign at ./qmail2dlf line 236, <> line 694. 
        # while ((my $msgno, my $msg) = each %msgs) {
        for my $msgno (keys %msgs) {
            if (defined $msgs{$msgno}{deliveries}{$dlvno}) {
                # found it
                if (defined $msgs{$msgno}{deliveries}{$dlvno}{stat}) {
                   lr_info( "found line '$_', but status for delivery " .
                            "'$dlvno' already defined: overwriting" );
                }
                # translate to sendmail-style
                $msgs{$msgno}{deliveries}{$dlvno}{stat} = 
                    $stat eq 'success' ? 'sent' : (
                        $stat eq 'deferral' ? 'deferred' : $stat
                    );
                if (defined $msgs{$msgno}{deliveries}{$dlvno}{xstat}) {
                    lr_info( "found line '$_', but status line for " .
                             "delivery '$dlvno' already defined: " .
                             "overwriting");
                }
                $msgs{$msgno}{deliveries}{$dlvno}{xstat} = $xstat;
                if (defined $msgs{$msgno}{deliveries}{$dlvno}{time}) {
                   lr_info( "found line '$_', but time for delivery " .
                            "'$dlvno' already defined: overwriting" );
                }
                $msgs{$msgno}{deliveries}{$dlvno}{time} = $time;

                $found = 1;
                last;
            }
        }
        lr_info( "cannot find delivery '$dlvno', so skipping line '$_'" )
          unless $found;
    } elsif ($line[0] eq 'status:') {
        ;
    } elsif ($line[0] eq 'warning:') {
        ;
    } elsif ($line[0] eq 'bounce') {
        ; # skip this one: no sendmail or postfix counterpart
    } else {
        lr_warn( "skipping line '$_': first field should be one of ".
                 "'new', 'info', 'starting', 'delivery', 'end', 'bounce', " .
                 "'status:' or 'warning:', not '".  $line[0] . "'");
        $errorlines++;
        next;
    }
}

end_dlf_converter( $lines, $dlflines, $errorlines );

__END__

=pod

=head1 NAME

qmail2dlf - convert sanitized qmail-send logs to dlf

=head1 SYNOPSIS

B<qmail2ldf>

=head1 DESCRIPTION

B<qmail2dlf> reads qmail-send logs from stdin, writes dlf to stdout, and, in
case of errors, complains on stderr.

=head1 TIMESTAMPS

We expect timestamps which look like e.g. `977359048.466280500' in our log:
number of seconds since the epoch, in any precision.  The log should feature 
lines which look something like:

 982584201.511524 info msg 6426: bytes 3537 from 
   <qmail-return-63074-joostvb=mdcc.cx@list.cr.yp.to> qp 21089 uid 70

I<splogger>, as distributed with qmail, writes `a numerical timestamp', ie 
something like 972802273.627578, indicating seconds and nanoseconds since the 
beginning of 1970.

I<multilog>, as distributed with daemontools, 
I<http://cr.yp.to/daemontools.html>, if invoked with action I<t>, writes a 
`@' and a I<tai64n> format timestamp. These timestamps look like e.g.
4000000039ef8532346bb35c. Note that 0x400000000000000000000000 is 2^62.

One can get the current time in I<tai64> format by doing:

 echo 40000000`(echo obase = 16; date +%s) | bc`00000000

(assuming GNU date is installed)

To convert I<tai64n> to numerical timestamps, one can use Russ Allbery's
tai64nfraq. It's in the public domain, available from 
I<http://www.qmail.org/tai64nfrac>.

=head1 EXAMPLE

To process a log as produced by splogger:

 $ lr_desyslog qmail < mail.log | qmail2dlf

To process a log as produced by multilog:

 $ tai64nfrac < current | qmail2dlf

qmail2dlf(1) will be rarely used on its own, but is more likely called
by lr_log2report:

 $ tai64nfrac < /service/qmail-send/log/main/current | \
     lr_log2report qmail

=head1 BUGS

We don't deal with bounces:

 email qmail none qmail2dlf info skipping line '986244190.800217 bounce msg 
  6306 qp 28445': first field should be one of 'new', 'info', 'starting', 
  'delivery', 'end' or 'status:', not 'bounce'

qmail2dlf doesn't use any information from the qmail-smtpd(1) logs.  (These
look like

 2002-06-16 09:34:57.798038500 tcpserver: pid 19385 from 100.61.24.7
 2002-06-16 09:34:58.114198500 tcpserver: ok 19385 foo.example.com:100.163.25.11:25 logreport.iae.nl:212.61.24.7:postfix:1189

.)

=head1 VERSION

$Id: qmail2dlf.in,v 1.24 2006/07/23 13:16:34 vanbaal Exp $

=head1 COPYRIGHT

Copyright (C) 2000, 2001 Stichting LogReport Foundation LogReport@LogReport.org

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program (see COPYING); if not, check with
http://www.gnu.org/copyleft/gpl.html.

=head1 AUTHOR

Joost van Baal <joostvb@logreport.org>

=cut

# Local Variables:
# mode: cperl
# End:
