#!/usr/bin/perl -w
# gnuhtml2latex html to latex converter
# Copyright (c) 1999 Tomasz Wegrzanowski <maniek@beer.com>
# Maintenance taken over by Gunnar Wolf, 2005
# Copyright (c) 2005-2010 Gunnar Wolf <gwolf@gwolf.org>
#
# gnuhtml2latex is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# On Debian GNU/Linux systems, the complete text of the GNU General
# Public License can be found in `/usr/share/common-licenses/GPL'.
#
# THIS IS VERY ALPHA

use strict;
use Getopt::Std;

# Parse the command line into %main::opts.  Flags followed by `:' in the
# specification take an argument; the others are plain switches.
getopts( 'a:bcf:gh:i:no:pst:HPS:', \%main::opts );

# Options that are always interpolated into the output get a default so
# that later code never has to deal with undef: document class (-o),
# header text (-h) and footer text (-f).
for my $default ( [ o => '{article}' ], [ h => '' ], [ f => '' ] ) {
    my ( $flag, $value ) = @$default;
    $main::opts{$flag} = $value unless defined $main::opts{$flag};
}

# Suffix appended to sectioning commands: empty when -n asks for numbered
# sections, `*' (LaTeX's unnumbered variant) otherwise.
if ( $main::opts{n} ) {
    $main::num = '';
}
else {
    $main::num = '*';
}

{
# LaTeX text emitted when an HTML element opens, keyed by lower-case tag
# name.  Tags missing from this table produce no output of their own.
my %tagstable_start = (
    # Paragraph and line structure.
    p          => '\\par ',
    br         => '\\\\',
    blockquote => '\\begin{quotation}',

    # Headings.  $main::num is `*' for unnumbered sections and empty
    # when -n was given.  <h5> and <h6> share \subparagraph.
    h1 => '\\section'       . $main::num . '{',
    h2 => '\\subsection'    . $main::num . '{',
    h3 => '\\subsubsection' . $main::num . '{',
    h4 => '\\paragraph'     . $main::num . '{',
    h5 => '\\subparagraph'  . $main::num . '{',
    h6 => '\\subparagraph'  . $main::num . '{',

    # Lists.  <dt> opens the optional argument of \item and <dd>
    # closes it, so a term/definition pair becomes `\item[term]text'.
    ul => '\\begin{itemize}',
    ol => '\\begin{enumerate}',
    dl => '\\begin{description}',
    li => '\\item ',
    dt => '\\item[',
    dd => ']',

    # Character-level markup.
    b      => '\\textbf{',
    strong => '\\textbf{',
    i      => '\\textit{',
    var    => '\\textit{',
    em     => '\\emph{',
    u      => '\\underline{',
    tt     => '\\texttt{',
    code   => '\\texttt{',
    samp   => '\\texttt{',
    kbd    => '{\\tt\\bf ',
    dfn    => '{\\bf\\it ',
    cite   => '{\\sc ',

    # Preformatted text.
    pre     => '\\begin{verbatim}',
    listing => '\\begin{verbatim}',
);

# LaTeX text emitted when an HTML element closes, keyed by lower-case tag
# name.  Elements whose opening text starts a brace group just close it;
# elements that open an environment close that environment.
my %tagstable_end = (
    (
        map { ( $_ => '}' ) }
            qw(b i u em strong tt code samp kbd var dfn cite
               h1 h2 h3 h4 h5 h6)
    ),
    ul         => '\\end{itemize}',
    ol         => '\\end{enumerate}',
    dl         => '\\end{description}',
    blockquote => '\\end{quotation}',
    pre        => '\\end{verbatim}',
    listing    => '\\end{verbatim}',
);

# Parser state shared (through closure) by all the HTML::LatexMaker
# callbacks defined below.
#
# $mode tracks which part of the HTML document is being processed; it is
# changed only by start_mode() and end_mode():
#   0 = outside <html>    1 = inside <html>
#   2 = inside <head>     3 = inside <body>
# Text and tag-table output is produced only while $mode is 3.
my $mode = 0;
# True while the file being parsed is the first (resp. last) one of the
# run: start_mode() emits the preamble only when $firstfile is true and
# end_mode() emits the postamble only when $lastfile is true.
my $firstfile = 1;
my $lastfile = 1;
# Set to 0 between <pre>/<listing> and the matching end tag, 1 otherwise;
# text() escapes LaTeX special characters only while it is true.
my $substitution = 1;
# From here to the end of the enclosing block the code belongs to the
# HTML::Parser subclass that does the conversion.
package HTML::LatexMaker;
use HTML::Parser;
use HTML::Entities;
@HTML::LatexMaker::ISA = ( "HTML::Parser" );
1;

# Setter: tell the converter whether the next file parsed is the first
# one of the run (a true value enables the preamble in start_mode).
sub firstfile {
    my ( $self, $flag ) = @_;
    $firstfile = $flag;
}

# Setter: tell the converter whether the next file parsed is the last
# one of the run (a true value enables the postamble in end_mode).
sub lastfile {
    my ( $self, $flag ) = @_;
    $lastfile = $flag;
}

# Emit the opening part of a hyperlink for an <a> start tag.
#
# Arguments: $attr   - hash ref of the tag's attributes
#            $attseq - array ref with the attribute order (unused)
#
# Does nothing unless -H (hyperref) was requested.  The matching `}' is
# printed by end() for every </a> when -H is set, so this function must
# always open exactly one brace group:
#   <a href="URL">  ->  \href{URL}{
#   <a name="...">  ->  {            (plain group, keeps braces balanced)
sub anchor_convert {
    my ( $attr, $attseq ) = @_;

    return unless defined( $main::opts{H} );

    unless ( defined( $attr->{href} ) ) {
        # Anchor without a target (e.g. <a name="top">): there is nothing
        # to link to, but end() will still print the closing brace.
        print '{';
        return;
    }

    # A raw `%' or `#' breaks \href when the link ends up inside the
    # argument of another command (a heading, \textbf, ...); hyperref
    # accepts the backslash-escaped forms everywhere.
    ( my $target = $attr->{href} ) =~ s/([%#])/\\$1/g;

    printf "\\href{%s}{", $target;
}

# Emit a LaTeX figure for an <img> tag, downloading the image if needed.
#
# Arguments: $attr   - hash ref of the tag's attributes (src, title, alt)
#            $attseq - array ref with the attribute order (unused)
#
# Does nothing unless -g was given and the tag has a src attribute.
# The local file name is derived from the URL: scheme and query string
# are dropped, every `/', `?', `&' and `.' becomes `_', and the image
# extension (defaulting to png) is put back at the end.  If that file
# does not exist yet it is fetched with wget when wget is available.
sub image_convert {
    my ( $attr, $attseq ) = @_;

    return unless defined( $main::opts{g} );
    return unless defined( $attr->{src} );

    my $url      = $attr->{src};
    my $localimg = $url;
    $localimg =~ s!(?:http|ftp)://!!;
    $localimg =~ s!\?.*!!;

    my $imgtype;
    if ( $localimg =~ s/\.(png|jpg|eps|gif|tif)$// ) {
        $imgtype = $1;
    }
    else {
        warn "Cannot determine a valid image type for $url - Trying with .png";
        $imgtype = 'png';
    }

    $localimg =~ s![/?&.]!_!g;
    $localimg .= ".$imgtype";

    if ( -f $localimg ) {
        warn "$localimg: Already here, skipping download\n";
    }
    elsif ( my $wget = find_wget() ) {
        # -O must be immediately followed by the output file name, and
        # `--' stops a hostile src attribute from being parsed as a wget
        # option.  List-form system() keeps the shell out of the picture.
        my $status = system( $wget, '-nv', '--load-cookies',
            '/tmp/wget.cookies', '-nc', '-O', $localimg, '--', $url );
        if ( $status != 0 ) {
            warn "Could not download $url to `$localimg'\n";

            # wget -O leaves an empty file behind on failure; remove it
            # so that the next run tries again instead of skipping.
            unlink $localimg if -f $localimg and -z _;
        }
    }
    else {
        warn "wget not found, you will need to create `$localimg'\n"
            . "(Original URL: $url)\n";
    }

    my $caption = $attr->{title} || $attr->{alt};
    unless ($caption) {
        if ( $main::opts{H} ) {
            $caption = sprintf( '\href{%s}{%s}', $url, $url );
        }
        else {
            # \href is only defined when hyperref is loaded (-H), so
            # fall back to the URL as escaped typewriter text.
            ( my $shown = $url ) =~ s/([_&%\{\}\#\$])/\\$1/g;
            $shown =~ s/~/\\textasciitilde{}/g;
            $shown =~ s/\^/\\textasciicircum{}/g;
            $caption = "\\texttt{$shown}";
        }
    }

    printf "\n\\begin{figure}\n"
        . "\\centering\n"
        . "\\includegraphics[width=0.4\\textwidth]{%s}\n"
        . "\\caption{%s} \n"
        . "\\end{figure}", $localimg, $caption;
}

# Look for an executable `wget' in the directories listed in $ENV{PATH}.
#
# Returns the full path of the first match, or nothing (undef in scalar
# context) after printing a warning when none is found.
sub find_wget {
    # PATH may be missing altogether (cron jobs, stripped environments).
    my $search_path = defined $ENV{PATH} ? $ENV{PATH} : '';

    for my $dir ( split /:/, $search_path ) {
        # An empty entry would make us probe `/wget'.
        next if $dir eq '';

        my $wget = "$dir/wget";

        # -x alone is also true for a searchable directory named wget.
        return $wget if -f $wget and -x _;
    }

    warn "wget not found in path - No images will be downloaded\n";
    return;
}

# HTML::Parser callback invoked for every start tag.
#
# Arguments: $self    - the parser object
#            $tag     - lower-case tag name
#            $attr    - hash ref of attributes
#            $attrseq - array ref with the attribute order
#
# Runs the tag-specific handler (mode switches, verbatim tracking,
# anchors, images) and then, while inside <body>, prints the LaTeX text
# registered for the tag in %tagstable_start.
sub start {
    my ( $self, $tag, $attr, $attrseq ) = @_;

    # Were we already inside <pre>/<listing> before this tag?  LaTeX
    # does not interpret commands inside a verbatim environment, so
    # markup nested there must not produce any LaTeX of its own.  This
    # has to be sampled before the handler runs, because the handler for
    # <pre> itself is what switches verbatim on.
    my $in_verbatim = !$substitution;

    my %tag_tbl = (
        html    => sub { start_mode(1) },
        head    => sub { start_mode(2) },
        body    => sub { start_mode(3) },
        pre     => sub { $substitution = 0 },
        listing => sub { $substitution = 0 },
        a       => sub { anchor_convert( $attr, $attrseq ) unless $in_verbatim },
        img     => sub { image_convert( $attr, $attrseq ) unless $in_verbatim },
    );

    $tag_tbl{$tag}->() if $tag_tbl{$tag};

    return if $in_verbatim;
    return unless ( $mode == 3 and defined $tagstable_start{$tag} );
    print $tagstable_start{$tag};
}

# HTML::Parser callback invoked for every end tag.
#
# Arguments: $self - the parser object
#            $tag  - lower-case tag name
#
# Runs the tag-specific handler (mode switches, verbatim tracking,
# closing brace of a hyperlink) and then, while inside <body>, prints
# the LaTeX text registered for the tag in %tagstable_end.
sub end {
    my ( $self, $tag ) = @_;

    # True when this end tag was met inside <pre>/<listing>.  Sampled
    # before the handlers run because </pre> itself switches it off.
    my $was_verbatim = !$substitution;

    my %tag_tbl = (
        html    => sub { end_mode(0) },
        head    => sub { end_mode(1) },
        body    => sub { end_mode(1) },
        pre     => sub { $substitution = 1 },
        listing => sub { $substitution = 1 },

        # Closes the group opened for <a> when -H is in effect; inside a
        # verbatim environment it would be printed literally.
        a => sub { print "}" if $main::opts{H} and not $was_verbatim },
    );

    $tag_tbl{$tag}->() if $tag_tbl{$tag};

    # Still inside verbatim (the tag was not </pre> or </listing>):
    # closing text for nested markup would show up literally.
    return unless $substitution;

    return unless ( $mode == 3 and defined $tagstable_end{$tag} );
    print $tagstable_end{$tag};
}

# HTML::Parser callback invoked for every run of text.
#
# Arguments: $self - the parser object
#            $text - the raw (still entity-encoded) text
#
# Only text inside <body> is printed.  Entities are decoded, characters
# that are special to LaTeX are escaped and a few typographic entities
# (dashes, curly quotes, ellipsis) are mapped to their TeX spelling.
# Inside <pre>/<listing> ($substitution false) the text ends up in a
# verbatim environment, where LaTeX interprets nothing, so no escaping
# or LaTeX commands are produced there.
sub text {
    my ( $self, $text ) = @_;
    return unless ( $mode == 3 );

    # Handle some things that decode_entities doesn't.
    # (This needs to be done *before* calling decode_entities: otherwise
    # there'd be no way of distinguishing `&FOO;' from `&amp;FOO;'.)

    # We use `!' for internal purposes during entity translation, so a
    # real exclamation mark is parked as `!bang;' until the very end.
    $text =~ s/!|&\#(?:0*33|x0*21);/!bang;/g;

    # Handle `&lsquo;&ldquo;', `&ndash;&mdash;' and so on by inserting
    # a thin space between the translations in such cases.
    $text =~ s/&\#(?:x0*2d|0*45);/-/g;
    $text =~ s/(&mdash;|&ndash;|-)(?=(?:&mdash;|&ndash;|-))/$1!thinsp;/g;
    $text =~ s/(&[lr][sd]quo;)(?=(?:&[lr][sd]quo;))/$1!thinsp;/g;

    # The final replacement of these happens later (so that the various
    # quotes are handled the same whether they are literal, numeric or
    # symbolic references).  In the meantime `&FOO;' becomes `!FOO;'.
    $text =~ s/&([mn]dash|[lr][sd]quo|hellip);/!$1;/g;

    $text = decode_entities($text);

    if ($substitution) {
        # Park backslashes first so that the backslashes added by the
        # escaping below are not themselves rewritten.
        $text =~ s/\\/!backslash;/g;
        $text =~ s/([_&%\{\}\#])/\\$1/g;
        $text =~ s/\$/\\\$/g;
        $text =~ s/\^/\\^{}/g;
        $text =~ s/!backslash;/\$\\backslash\$/g;
    }

    $text =~ s/!mdash;/---/g;
    $text =~ s/!ndash;/--/g;
    $text =~ s/!lsquo;/`/g;
    $text =~ s/!rsquo;/'/g;
    $text =~ s/!ldquo;/``/g;
    $text =~ s/!rdquo;/''/g;

    if ($substitution) {
        # The backslash must be doubled: in a substitution replacement
        # a single `\l' is the "lower-case next character" escape and
        # would leave just `dots{}' in the output.
        $text =~ s/!hellip;/\\ldots{}/g;
        $text =~ s/!thinsp;/\$\\,\$/g;
        $text =~ s/!bang;/!/g;
        $text =~ s/\xa0/~/g;

        # Whatever looks like an URL should be made into one
        $text =~ s![[{]?((?:http|ftp)://\S+)[\]}]?!\\url{$1}!g;
    }
    else {
        # Verbatim: plain-text equivalents only.
        $text =~ s/!hellip;/.../g;
        $text =~ s/!thinsp;//g;
        $text =~ s/!bang;/!/g;
        $text =~ s/\xa0/ /g;
    }

    print $text;
}

# Switch the converter to a new document mode and print whatever LaTeX
# belongs at that transition.
#
# Argument: the mode being entered (1 = <html>, 2 = <head>, 3 = <body>).
#
# Output is produced only for the first file of a run: entering <html>
# prints the banner comment and, unless -P was given, the preamble;
# entering <body> prints \begin{document} (again unless -P), the -h
# header text, the optional title block, table of contents and page
# break.
sub start_mode {
    my ($mode_new) = @_;
    my $skip_pre = $main::opts{P};

    if ( $mode_new == 1 && $firstfile ) {
        my @preamble = (
            "% This file was converted from HTML to LaTeX with\n",
            "% gnuhtml2latex program\n",
            "% (c) Tomasz Wegrzanowski <maniek\@beer.com> 1999\n",
            "% (c) Gunnar Wolf <gwolf\@gwolf.org> 2005-2010\n",
            "% Version : $main::version.\n",
        );

        unless ($skip_pre) {
            push @preamble, '\documentclass' . $main::opts{o} . "\n";
            push @preamble, "\\usepackage{hyperref}\n" if $main::opts{H};
            push @preamble,
                "\\usepackage{graphicx}\n",
                "\\DeclareGraphicsExtensions{.png,.jpg,.eps,.gif,.tif}\n"
                if $main::opts{g};
        }

        print join( '', @preamble );
    }
    elsif ( $mode_new == 3 && $firstfile ) {
        my @opening;
        push @opening, "\\begin{document}\n" unless $skip_pre;
        push @opening, $main::opts{h};

        # A title block is produced as soon as either -a or -t is
        # present; \title itself only for a non-empty -t.
        if ( defined $main::opts{a} or defined $main::opts{t} ) {
            push @opening, '\\title{' . $main::opts{t} . '}'
                if $main::opts{t};
            push @opening,
                '\\author{' . ( $main::opts{a} or '' ) . "}\n\\maketitle";
        }

        push @opening, "\n\\tableofcontents\n" if $main::opts{c};
        push @opening, "\n\\newpage"           if $main::opts{p};

        print join( '', @opening );
    }

    $mode = $mode_new;
}

# Leave the current document mode and switch to the given one.
#
# Argument: the mode to fall back to.
#
# When the <body> of the last file of a run is being left, the -f footer
# text is printed, followed by \end{document} unless -P was given.
sub end_mode {
    my ($mode_new) = @_;

    if ( $mode == 3 && $lastfile ) {
        my $closing = $main::opts{P} ? '' : "\\end{document}\n";
        print $main::opts{f} . $closing;
    }

    $mode = $mode_new;
}

}

$main::version = '0.4';

# Open an HTML input file for reading.  `-' means standard input.
# Returns a file handle, or nothing on failure (with $! set by open).
sub open_html_input {
    my ($filename) = @_;
    return \*STDIN if $filename eq '-';
    open my $in, '<', $filename or return;
    return $in;
}

# Send STDOUT to the .tex file that corresponds to the given input file
# name (foo.html / foo.htm -> foo.tex), unless -s asked for plain STDOUT.
sub redirect_stdout_for {
    my ($filename) = @_;
    return if $main::opts{s};
    ( my $outfile = $filename ) =~ s/\.html?$//;
    $outfile .= '.tex';
    open STDOUT, '>', $outfile or die "$outfile: $!\n";
    return;
}

# Build a converter object, honouring the -S list of tags to ignore.
sub new_converter {
    my $doc = HTML::LatexMaker->new;

    # -S takes a comma-separated list; ignore_elements wants one
    # argument per tag name.
    $doc->ignore_elements( split /\s*,\s*/, $main::opts{S} )
        if $main::opts{S};
    return $doc;
}

# -i: take the list of input files from a file, one name per line.
if ( $main::opts{i} ) {
    open my $list, '<', $main::opts{i}
        or die "$main::opts{i}: $!\n";
    my @listed;
    while ( my $line = <$list> ) {
        # Drop the line terminator and surrounding blanks, otherwise
        # the newline ends up inside the generated .tex file name.
        $line =~ s/^\s+|\s+$//g;
        push @listed, $line if length $line;
    }
    close $list;
    @ARGV = @listed;
}

if ( $main::opts{b} ) {
    # Batch mode: all inputs are concatenated into ONE document, written
    # to the .tex file named after the first input (or STDOUT with -s).
    # Each file is parsed exactly once, also when there is only one.
    my @inputs = @ARGV;
    if (@inputs) {
        my $doc = new_converter();
        for my $i ( 0 .. $#inputs ) {
            my $filename = $inputs[$i];
            my $is_first = ( $i == 0 );
            my $is_last  = ( $i == $#inputs );

            my $in = open_html_input($filename);
            unless ($in) {
                # The first and last files carry the preamble and the
                # postamble, so the document is useless without them.
                die "$filename: $!\n" if $is_first or $is_last;
                warn "$filename: $!, skipping\n";
                next;
            }

            redirect_stdout_for($filename) if $is_first;
            $doc->firstfile( $is_first ? 1 : 0 );
            $doc->lastfile( $is_last   ? 1 : 0 );
            $doc->parse_file($in);
            close $in unless $filename eq '-';
        }
    }
}
else {
    # Default mode: every input becomes a document of its own.
    for my $filename (@ARGV) {
        my $in = open_html_input($filename);
        unless ($in) {
            warn "$filename: $!, skipping\n";
            next;
        }
        redirect_stdout_for($filename);
        my $doc = new_converter();
        $doc->parse_file($in);
        close $in unless $filename eq '-';
    }
}

=head1 NAME

gnuhtml2latex - html to latex converter

=head1 SYNOPSIS

B<gnuhtml2latex> F<[options]> F<filename>

=head1 OPTIONS

=over

=item -a [author]

Specify document author

=item -b

Process more than one input HTML file (they all get concatenated and
written to a single output file, or to STDOUT if F<-s> is set)

=item -c

Use table of contents

=item -f [string]

Specify footer (text inserted at the end of the document body)

=item -h [string]

Specify header

=item -i filename

Get the list of files to be converted from the specified filename

=item -n

Use numbered sections

=item -H

Use the hyperref package to process anchors


=item -g

Include images. If wget is installed, it will be used in order to
download the images; otherwise, their position will just be marked in
the resulting TeX document.

=item -o [string]

Specify document style

=item -p

Break page after title / table of contents

=item -P

Partial / plain: Omit preamble and postamble. Note that F<-P> makes
F<-H> and F<-o> meaningless (as they act in the preamble)

=item -S

Skip (ignore) the specified comma-separated tags, along with all of
their content.

=item -s

Write to STDOUT instead of to inputfilename.tex

=item -t [title]

Specify title of document

=back

=head1 DESCRIPTION

This aims to be a replacement for html2latex.

The program takes an HTML file foo.html or foo.htm
and makes a LaTeX file foo.tex from it.

=head1 NOT VERY AMBITIOUS TODO

For people who want only functionality of original html2latex

 bugfixes - I'm sure there are plenty of bugs inside
 clueful backslash escaping
 more entities from outside of iso-8859-1
 tables
 performance boost
 and a lot more

=head1 MORE AMBITIOUS TODO

For people who want a real tool

 make it part of some html processor

=head1 FUTURE OF THIS PACKAGE

It is quite possible that the functions of this package will be included
in some more general project. This package was made mainly to make the world
a bit more free.

=cut
