#! /usr/bin/perl -w
# vim:syntax=perl

use strict;
use lib '/usr/share/perl5';

use vars qw/%conn_cache %skipped @new_conn $dlf_maker $dlflines/;

use Lire::DlfSchema;
use Lire::Email;
use Lire::Program qw( :msg :dlf );
use Time::Local;

sub date2epoch {
    my ( $mon, $day, $year, $hour, $min, $sec, $ampm ) = 
      $_[0] =~ m!^^(\d+)/(\d+)/(\d+) (\d+):(\d+):(\d+) (AM|PM)!
	or die "invalid ArGoSoft timestamp: $_[0] (should be mm/dd/yyyy HH:MM:SS AM|PM)\n";

    # Checks are handled in timelocal
    $mon  -= 1;
    $year -= 1900;
    $hour += 12 if $ampm eq 'PM';

    timelocal( $sec, $min, $hour, $day, $mon, $year );
}

sub lex_record {
    # Remove DOS line ending
    $_[0] =~ s/\r?\n$//;

    # Returns date, conn_id, msg
    return $_[0] =~ /^(.*?) - (?:[\[\(]\s*(\d+)[\]\)] )?(.*)$/
      or die "Invalid ArGoSoft Mail Server log line\n";
}

# We use a FIFO array to hold the different messages because ArGoSoft
# log file doesn't contains any cross-connection (smtp-in, smtp-out,
# local delivery, etc.) identifier. So we use the from and to
# addresses.
#
# INVARIANT: We assume that ArGoSoft will deliver the message in the 
# order it received them.

my @msg_queue;

sub find_msg_in_queue {
    my ( $from, $to, $start_from_end ) = @_;

    # Find a message in the queue that was addressed to
    # those addresses

    if (!$start_from_end) {
	for (my $i=0; $i < @msg_queue; $i++ ) {
	    my $msg = $msg_queue[$i];

	    next if defined $from && defined $msg->{from} && $msg->{from} ne $from;
	    next if defined $to && defined $msg->{to} && $msg->{to} ne $to;

	    # Found, keep the position in the message queue of the message
	    $msg->{_queue_pos} = $i;

	    return $msg;
	}
    } else {
	for (my $i=$#msg_queue; $i >= 0; $i-- ) {
	    my $msg = $msg_queue[$i];

	    next if defined $from && defined $msg->{from} && $msg->{from} ne $from;
	    next if defined $to && defined $msg->{to} && $msg->{to} ne $to;

	    # Found, keep the position in the message queue of the message
	    $msg->{_queue_pos} = $i;

	    return $msg;
	}	
    }

    return undef;
}

sub enqueue_msg {
    my ( $msg ) = @_;
    push @msg_queue, $msg;

    $msg->{_queue_pos} = $#msg_queue;
}

sub dequeue_msg {
    my ( $msg ) = @_;

    die "Invalid msg: missing _queue_pos attribute\n"
      unless defined $msg->{_queue_pos};

    die "Invalid msg: invalid _queue_pos attribute: $msg->{_queue_pos}\n"
      if $msg->{_queue_pos} < 0 || $msg->{_queue_pos} > $#msg_queue;

    splice @msg_queue, $msg->{_queue_pos}, 1;
}

sub flush_queue {
    foreach my $msg ( @msg_queue ) {
	lr_warn( "Message from $msg->{from} to $msg->{to} still in queue but stat isn't 'deferred'" )
	  if $msg->{stat} ne 'deferred';
	print_argo_dlf( $msg );
    }
}

sub delivery_dlf {
    my ( $date, $conn, $stat, $xstat, $final ) = @_;

    my $time = date2epoch( $date );
    foreach my $to ( @{$conn->{rcpt_to}} ) {
	my $msg;

	if ( $conn->{proto} eq 'smtp-in' ) {
	    $msg = {};
	    $msg->{time} = $conn->{time_start};
	    $msg->{from} = $conn->{mail_from};
	    $msg->{to}   = $to;
	    $msg->{from_relay_host} = $conn->{from_host};
	    $msg->{from_relay_ip} = $conn->{from_ip};
	    $msg->{delay}	= $time - $msg->{time};

	    enqueue_msg( $msg );
	} elsif ( $conn->{proto} eq 'smtp-out' || 
		  $conn->{proto} eq 'delivery' ) 
	{
	    $msg = find_msg_in_queue( $conn->{mail_from}, $to );

	    # Try a bounce
	    $msg = find_msg_in_queue( "<>", $to ) unless $msg;
	    unless ($msg ) {
		lr_notice( "can't find origin of message from $conn->{mail_from} to $to at line $. assuming webmail post" );
		$msg = {
			time => $conn->{time_start},
			from => $conn->{mail_from},
			from_relay_host => "webmail",
			from_relay_ip	=> "127.0.0.1",
			to   => $to
		       };
		enqueue_msg( $msg );
	    }
	    if ( $conn->{proto} eq 'delivery' ) {
		$msg->{to_relay_host} = "localhost";
		$msg->{to_relay_ip}	= "127.0.0.1";
	    } else {
		$msg->{to_relay_host} = $conn->{to_host};
		$msg->{to_relay_ip}	= $conn->{to_ip};
	    }
	    $msg->{delay} = $time - $msg->{time};
	} else {
	    die "Unknown protocol in delivery_dlf: $conn->{proto}\n";
	}

	$msg->{stat} = $stat if defined $stat;
	$msg->{xstat} = $xstat if defined $xstat;

	if ( $final ) {
	    print_argo_dlf( $msg );
	    dequeue_msg( $msg )
	} 
    }
}

sub bounce_msg {
    my ( $date, $conn ) = @_;

    my $time = date2epoch( $date );
    my $msg = {};
    $msg->{time} = $conn->{time_start};
    $msg->{from} = "<>";
    $msg->{to}   = $conn->{mail_from};
    $msg->{from_relay_host} = "localhost";
    $msg->{from_relay_ip}   = "127.0.0.1";
    $msg->{delay} = 0;

    enqueue_msg( $msg );
}

sub print_argo_dlf {
    my ( $dlf ) = @_;

    if ( defined $dlf->{from} ) {
	my $email;
	sanitize( "emailadress", $dlf->{from}, $email );
	($dlf->{from_user}, $dlf->{from_domain}) = splitemailadress( $email );
    }

    if ( defined $dlf->{to} ) {
	my $email;
	sanitize( "emailadress", $dlf->{to}, $email );
	($dlf->{to_user}, $dlf->{to_domain}) = splitemailadress( $email );
    }

    sanitize( "relayhost", $dlf->{from_relay_host}, $dlf->{from_relay_host})
      if $dlf->{from_relay_host};
    sanitize( "relayhost", $dlf->{to_relay_host}, $dlf->{to_relay_host})
      if $dlf->{to_relay_host};
    sanitize( "relayip", $dlf->{from_relay_ip}, $dlf->{from_relay_ip})
      if $dlf->{from_relay_host};
    sanitize( "relayip", $dlf->{to_relay_ip}, $dlf->{to_relay_ip})
      if $dlf->{to_relay_ip};

    print join( " ", @{$dlf_maker->( $dlf )}), "\n";
    $dlflines++;
}

sub new_connection {
    my ( $date, $proto, $host, $ip ) = @_;
    push @new_conn, { proto => $proto, from_host => $host, from_ip => $ip };
}

sub start_connection {
    my ( $date, $conn_id, $msg ) = @_;

    my $proto;
    if ( $msg =~ /^220 ArGoSoft/ ) {
	$proto = "smtp-in";
    } elsif ( $msg =~ /^\+OK ArGoSoft/ ) {
	$proto = "pop3";
    } elsif ( $msg =~ /^(GET|POST|PUT|HEAD)/ ) {
	$proto = "web";
    } else {
	die "Can't determine protocol from $msg\n";
    }

    # Try to find a matching new connection for that line
    for (my $i=0; $i < @new_conn; $i++ ) {
	if ( $new_conn[$i]{proto} eq $proto ) {
	    # Found

	    $conn_cache{$conn_id} = {
				     conn_id	  => $conn_id,
				     time_start   => date2epoch( $date ),
				     %{$new_conn[$i]},
				     state	  => "start",
				    };

	    splice @new_conn, $i, 1;
	    return 1;
	}
    }
    return 0;
}

sub end_connection {
    my ( $date, $conn_id ) = @_;

    delete $conn_cache{$conn_id};
}

sub start_delivery {
    my ( $date, $conn_id, $to_email ) = @_;

    my $nrcpt;
    if ( $to_email =~ /^(\d+) recipients$/ ) {
	$nrcpt = $1;
	$to_email = undef;
    } else {
	$nrcpt = 1;
    }

    $conn_cache{$conn_id} =
      {
       conn_id	    => $conn_id,
       proto	    => "delivery",
       time_start   => date2epoch( $date ),
       nrcpt	    => $nrcpt,
       delivering_to => $to_email,
      }
}

my %smtp_state = ( start	    => \&smtp_start,
		   mail_from_reply  => \&smtp_mail_from_reply,
		   rcpt_to	    => \&smtp_rcpt_to,
		   rcpt_to_reply    => \&smtp_rcpt_to_reply,
		   data		    => \&smtp_data,
		 );

sub smtp_reset {
    my ( $conn ) = @_;

    $conn->{state} = "start";
    delete $conn->{mail_from};
    delete $conn->{rcpt_to};
    delete $conn->{last_rcpt};
}

sub smtp_start {
    my ( $date, $conn, $msg ) = @_;

    if ( $msg =~ /^MAIL FROM:\s*(<[^>]*>|\S+)/ ) {
	sanitize( "emailadress", $1, $conn->{mail_from} );
	$conn->{state}	    = "mail_from_reply";
    } elsif ( $msg =~ /^RSET/ ) {
	smtp_reset( $conn );
    } elsif ( $conn->{proto} eq 'smtp-out' ) {
	# Check for delivery status message
	smtp_delivery( $date, $conn, $msg );
    }
    # Ignore other messages in the start state
}

sub smtp_mail_from_reply {
    my ( $date, $conn, $msg ) = @_;

    if ( $msg =~ /^250/ ) {
	if ( $msg =~ /^250 Sender "([^"]*) OK.../ ) {
	    # ArGoSoft code
	    $conn->{mail_from} = $1;
	}
	$conn->{state} = "rcpt_to";
    } elsif ( $msg =~ /^(Checking|Address)/ ) {
	# Ignore those informational messages
    } elsif ( $msg =~ /^451 (.*)/ ) {
	# Mail from address rejected
	# The DLF record will be output when we see the Fatal Error message.
	#
	# FIXME: Check how the log looks in the case of failing SMTP relaying 
	# attempt in inbound connections
	$conn->{state} = "start";
    } elsif ( $msg =~ /^Error: \[(\d+)\](.*)/ ) {
	# WinSock error
	delivery_dlf( $date, $conn, "winsock error", $2, 1);
    } else {
	die "Unknown message in smtp_mail_from state\n";
    }
}

sub smtp_rcpt_to {
    my ( $date, $conn, $msg ) = @_;

    if ( $msg =~ /^RCPT TO:\s*(<[^>]*>|\S+)/ ) {
	sanitize( "emailadress", $1, $conn->{last_rcpt} );
	$conn->{state}	    = "rcpt_to_reply";
    } elsif ( $msg =~ /^RSET/ ) {
	smtp_reset( $conn );
    } elsif ( $msg =~ /^DATA/ ) {
	$conn->{state} = "data";
    } elsif ( $msg =~ /^QUIT/ ) {
	$conn->{state} = "start";
    } else {
	die "Unknown message in rcpt_to state\n";
    }
}

sub smtp_rcpt_to_reply {
    my ( $date, $conn, $msg ) = @_;

    if ( $msg =~ /^250/ ) {
	if ( $msg =~ /^250 Recipient "([^"]*) OK.../ ) {
	    # ArGoSoft code
	    push @{$conn->{rcpt_to}}, $1;
	} elsif ( $msg =~ /^250/ ) {
	    # Other server's code
	    push @{$conn->{rcpt_to}}, $conn->{last_rcpt};
	}
	delete $conn->{last_rcpt};
	$conn->{state} = "rcpt_to";
    } elsif ( $msg =~ /^(451|550) (.*)/ ) {
	# Recipient rejected

	# In the case of outgoing SMTP connections, we will
	# output the DLF record when we see the Fatal Error message
	if ( $conn->{proto} eq 'smtp-in' ) {
	    my $old_to = $conn->{to};
	    $conn->{to} = [ $conn->{last_rcpt} ];
	    delivery_dlf( $date, $conn, $1, $1, 1 );
	    $conn->{to} = $old_to;
	}
	delete $conn->{last_rcpt};
	$conn->{state} = "rcpt_to";
    } else {
	die "Unknown message in smtp_rcpt_to_reply state\n";
    }
}

sub smtp_data {
    my ( $date, $conn, $msg ) = @_;

    # Skip, lines like
    # 3/17/2002 3:45:49 PM - [   601]
    return unless defined $msg && length $msg;
    if ( $msg =~ /^354 /) {
	# Beginning of the message
	# In the case of incoming SMTP connection,
	# we have to enqueue the message now, even though we didn't
	# received the 250 confirmation yet because the delivering message
	# often appears before it.
	delivery_dlf( $date, $conn, "sent", $1 )
	  if $conn->{proto} eq 'smtp-in';
	
    } elsif ( $msg =~ /^550 (.*)/ ) {
	# Message rejected for some reason: content filtering, 
	# message size, etc.

	# In the case of outgoing SMTP connections, we will
	# output the DLF record when we see the Fatal Error message

	if ( $conn->{proto} eq 'smtp-in' ) {
	    my $xstat = $1;
	    my $stat;
	    if ( $xstat =~ /filter/ ) {
		$stat = "content_rejected";
	    } elsif ( $xstat =~ /size/ ) {
		$stat = "size";
	    } else {
		$stat = "unknown_error";
	    }
	    foreach my $to ( @{$conn->{rcpt_to}} ) {
		my $msg = find_msg_in_queue( $conn->{mail_from}, $to );

		# Since we queued the message on the 354, this shoudn't
		# happen
		die "missing message from $conn->{mail_form} to $to\n"
		  unless $msg; 
		$msg->{stat}	= $stat;
		$msg->{xstat}	= $xstat;
		print_argo_dlf( $msg );
		dequeue_msg( $msg );
	    }
	}

	$conn->{state} = "start";
    } elsif ( $msg =~ /^250 (.*)/ ) {
	# Message accepted for delivery
	delivery_dlf( $date, $conn, "sent", $1 )
	  if $conn->{proto} eq 'smtp-out';

	$conn->{state} = "start";
    } elsif ( $msg =~ /^\./ ) {
	# End of the message
	;
    } else {
	die "Unknown message in smtp_data state\n";
    }
}

# HEURISTICS: We assume that ArGoSoft will deliver only to one address at
# a time.
sub smtp_delivery {
    my ( $date, $conn, $msg ) = @_;

    if ( $msg =~ /^Message from.*?relayed/ ) {
	# Completed relaying
	delivery_dlf( $date, $conn, undef, undef, 1 );

	smtp_reset( $conn );
	$conn->{nrcpt}--;
	$conn->{proto} = "delivery";
    } elsif ( $msg =~ /^Fatal SMTP Error: .*?\.\.\. (.*?) Will bounce back/ ) {
	# Bounce
	# Make sure we have a valid rcpt_to
	$conn->{rcpt_to} = [ $conn->{delivering_to} ] 
	  unless defined $conn->{rcpt_to};
	delivery_dlf( $date, $conn, "bounced", $1, 1 );
	
	# Add a bounce message to the queue
	bounce_msg( $date, $conn );

	$conn->{nrcpt}--;
	$conn->{proto} = "delivery";
    } elsif ( $msg =~ /^SMTP Server Error: (.*?) Will retry in/ ) {
	# Temporary failure
	delivery_dlf( $date, $conn, "deferred", $1 );

	$conn->{nrcpt}--;
	$conn->{proto} = "delivery";
    }

    # Remove from connection cache if this was the last recipient
    delete $conn_cache{$conn->{conn_id}} unless $conn->{nrcpt};
}

sub updt_delivery {
    my ( $date, $conn, $msg ) = @_;

    if ( $msg =~ /^Forwarding mail to (.*)/ ) {
	# Forward
	my $forward = $1;
	die "ArGoSoft's logging s*cks: can't determine original destinator in forward to $forward\n"
	  unless $conn->{delivering_to};

	# Change the recipient of the message
	# HEURISTICS: Forward log messages always seem to appear 
	# right after the 354 or 250 answer to the DATA command
	# The related message is thus nearer then end of the queue
	# rather than at the beginning
	$msg = find_msg_in_queue( undef, $conn->{delivering_to}, 1 );
	$msg->{to} = $forward if ( $msg );

	$conn->{nrcpt}--;
    } elsif ( $msg =~ /^Copy of the message from (\S+) kept in mailbox (.*)/ ){
	# Copy before forward
	delivery_dlf( $date, $conn, "sent", undef );
    } elsif ( $msg =~ /^Message from (.*?) delivered to (.*)/ ) {
	# Completed local delivery
	$conn->{mail_from} = $1;
	$conn->{rcpt_to} = [ $2 ];

	delivery_dlf( $date, $conn, "sent", undef, 1);
	$conn->{nrcpt}--;
    } elsif ( $msg =~ /^Trying the server (\S+): ([\d.]+)/ ) {
	# Outgoing connection
	$conn->{proto}	    = "smtp-out";
	$conn->{to_host}    = $1;
	$conn->{to_ip}	    = $2;
	$conn->{state}	    = "start";
    } elsif ( $msg =~ /^(Attempting to deliver|Retrieved \d+ MX records)/) {
	# Start of outgoing SMTP connection
    } else {
	die "Unknown message in updt_delivery state\n";
    }

    # Remove from connection cache if this was the last recipient
    delete $conn_cache{$conn->{conn_id}} unless $conn->{nrcpt};
}

sub parse_record {
    my ( $date, $conn_id, $msg ) = lex_record( $_[0] );
    if ( defined $conn_id ) {
	my $conn = $conn_cache{$conn_id};

	if ( $msg =~ /Delivering to (.*)/ ) {
	    start_delivery( $date, $conn_id, $1 );
	    return;
	} elsif ( ! defined $conn ) {
	    # This may be part of a new connection
	    return if @new_conn && start_connection( $date, $conn_id, $msg );

	    # Connection that started before the beginning of 
	    # the log file
	    lr_notice( "Skipping connection $conn_id which started before the beginning of the log file" ) 
	      unless $skipped{$conn_id};
	    $skipped{$conn_id} = 1;
	    return;
	}

	if ( $conn->{proto} eq 'smtp-in' || $conn->{proto} eq 'smtp-out' ) {
	    $smtp_state{$conn->{state}}->( $date, $conn, $msg );
	} elsif ( $conn->{proto} eq 'delivery' ) {
	    updt_delivery( $date, $conn, $msg );
	}

	# Ignore other connections 'type

    } elsif ( $msg =~ /^Requested (SMTP|POP3|Web) connection from ([\d.]+) \[([^\]]+)\]/ ) {
	# Start of connection
	my $proto = lc $1;
	$proto .= "-in" if $proto eq 'smtp';

	new_connection( $date, $proto, $3, $2 );
    } elsif ( $msg =~ /ended. ID=(\d+)/ ) {
	# End of connection
	end_connection( $date, $1 );
    } elsif ( $msg =~ /stopped|started/ ) {
	# INVARIANT: We assume that the servers can't be 
	# restarted independantly from one another.

	# Reset connection cache
	lr_notice( "Servers restarted at line $. but there are still active connections: ", join ( ", ", keys %conn_cache ) )
	  if keys %conn_cache;
	%conn_cache = ();
	%skipped    = ();
    } else {
	die "Unknown record\n";
    }
}

my $lines	= 0;
$dlflines	= 0;
my $errorlines  = 0;

my $schema = eval { Lire::DlfSchema::load_schema( "email" ); };
lr_err( "error loading email schema: $@" ) if $@;
$dlf_maker =
  $schema->make_hashref2asciidlf_func( qw/time from_user from_domain
					  from_relay_host from_relay_ip
					  delay
					  to_user to_domain
					  to_relay_host to_relay_ip
					  stat xstat/ );

@new_conn   = ();
%conn_cache = ();
%skipped    = ();
@msg_queue  = ();
init_dlf_converter( "email" );
my $line;
while ( defined( $line = <> ) ) {
    $lines++;

    eval { parse_record( $line ) };
    if ($@) {
	lr_warn( $@ );
	lr_warn( "failed to parse '$line'. Skipping." );
	$errorlines++;
    }
}

if ( keys %conn_cache ) {
    lr_notice( "There are still active connections at the end of the log file: ",
	       join( ", ", keys %conn_cache ))
}

# Output all pending messages
flush_queue();

end_dlf_converter( $lines, $dlflines, $errorlines );

__END__

=pod 

=head1 NAME

argomail2dlf - converts ArGoSoft Mail Server log files to the email DLF format

=head1 SYNOPSIS

B<argomail2dlf>

=head1 DESCRIPTION

B<argomail2dlf> converts a ArGoSoft Mail Server log file to email DLF
format. Information on the ArGoSoft Mail Server can be found on
http://www.argosoft.com/applications/mailserver/. The generic email
DLF format is described in email.xml.

This DLF converter was developed and tested with the ArGoSoft Mail
Server Pro. It could work with the Free and Plus versions, but it
wasn't tested.

=head1 LOGGING CONFIGURATION

To operate properly with Lire, you have to make sure that the
following logging features are enabled:

    - Log SMTP commands.
    - Log SMTP conversations.
    - Log to File.

The other two options (Log POP Commands and Log Web Commands) aren't
needed and can be left unchecked.

=head1 LIMITATIONS

Due to the limitations in the logging system of ArGoSoft Mail Server,
the following information isn't available:

    - size of messages
    - message's ID

Also, since ArGoSoft doesn't log "queue identifier" (that is an
identifier that can be used to track the delivery of a message across
the different components), we have to rely on heuristics and assumed
invariants to track the messages. We use the sender and recipient
addresses to track the messages.

=over 4

=item Delivery in order

We assume that ArGoSoft Mail Server will deliver its messages in the
order it received them.

=item One recipient delivery

We assume that ArGoSoft Mail Server will only deliver to one recipient
at a time. That is that if a message was adressed to two recipients to
the same remote demain, it will use two connections to deliver this
message.

=item Local delivery happens immediately

We assume that local delivery (and forward) will happen right after
the 354 or 250 status code is logged. We have to rely on this, because
the log messages related to forwarding don't mention the sender.

=item Unknown messages were delivered through the web

Messages that are delivered but weren't sent through SMTP are presumed
to come from the webmail relay (sent through the web interface). They
could also have been sent before the start of the log file.

=back

=head1 EXAMPLES

To process a log as produced by argomail

 $ argomail2dlf < mail.log

argomail2dlf will be rarely used on its own, but is more likely
called by lr_log2report:

 $ lr_log2report argomail < /var/log/mail.log

=head1 SEE ALSO

exim2dlf(1), postfix2dlf(1), sendmail2dlf(1), qmail2dlf(1)

=head1 VERSION

$Id: argomail2dlf.in,v 1.10 2006/07/23 13:16:34 vanbaal Exp $

=head1 COPYRIGHT

Copyright (C) 2000, 2001 Stichting LogReport Foundation LogReport@LogReport.org

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program (see COPYING); if not, check with
http://www.gnu.org/copyleft/gpl.html.

=head1 CREDITS

We would like to thanks denon from denon.cx for contributing sample
log files and helping debug this DLF converter.

=head1 AUTHOR

Francis J. Lacoste <flacoste@logreport.org>

=cut

# Local Variables:
# mode: cperl
# End:
