#!/bin/sh
#================================================================
# Copyright (C) 2015 J.F. Dockes
# There used to be Estraier content in there, but I quite believe that is not
# the case any more.
# This file is licensed under the GPL v2
#================================================================
# Convert a pdf file to  HTML.
#
# We use pdftotext from the xpdf/poppler-utils package. 
#
# pdftotext sometimes outputs unescaped text inside HTML text sections.
# We try to correct.
#
# If pdftotext produces no text and tesseract is available, we try to
# perform OCR. As this can be very slow and the result not always
# good, we only do this if a file named $RECOLL_CONFDIR/ocrpdf exists
#
# We guess the OCR language in order of preference:
#  - From the content of a ".ocrpdflang" file if it exists in the same
#    directory as the PDF
#  - From a RECOLL_TESSERACT_LANG environment variable
#  - From the content of $RECOLL_CONFDIR/ocrpdf
#  - Default to "eng"
#
# Uncomment the following (it enables the pdftotext -raw option) if it
# gives you better results. The pdftotext manual says that the option
# is no longer recommended. The difference in output seems to be mostly
# the removal of soft-hyphens when -raw is not set.
# optionraw=-raw

# Identification of this filter: both values are used in the usage
# message, and progname also tags the error messages sent to stderr.
progname=rclpdf
filetype="pdf"


#RECFILTCOMMONCODE
##############################################################################
# !! Leave the previous line unmodified!! Code imported from the
# recfiltcommon file

# Utility code common to all shell filters. This could be sourced at run
# time, but it's slightly more efficient to include the code in the
# filters at build time (with a sed script).

# Describe error in a way that can be interpreted by our caller, then
# terminate the filter.
#
# Arguments: the error description, as one or more words. Callers in this
#   file pass an error code first (e.g. HELPERNOTFOUND, INPUTNOSUCHFILE).
# Globals:   progname (read)
# Outputs:   stdout: "RECFILTERROR <words>"
#            stderr: ":2:<progname>::: <words>"
# Exits the script with status 1: this function never returns.
senderror()
{
    # "$*" is quoted so that the text is emitted exactly as received:
    # unquoted, the shell would glob-expand it and squeeze its whitespace,
    # which mangles file names. printf is used instead of echo because
    # some echo implementations interpret backslashes.
    printf '%s\n' "RECFILTERROR $*"
    # Also alert on stderr just in case
    printf '%s\n' ":2:$progname::: $*" 1>&2
    exit 1
}

# Tell whether a command is available.
#
# Arguments: $1 - a command name, or a path if it contains a slash
# Returns:   0 if $1 designates an executable file which is not a
#            directory (looked up in the PATH directories when $1 has no
#            slash), else 1
# Globals:   PATH (read); cmd, d, pathleft (written)
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        # Explicit path: test the file itself
        if test -x "$cmd" && test ! -d "$cmd"; then return 0; fi
        return 1 ;;
    *)
        # Bare name: walk the PATH elements. The list is split with
        # parameter expansions rather than by changing IFS, so that
        # directory names containing whitespace or glob characters are
        # kept intact, and the caller's IFS is never touched.
        pathleft=$PATH:
        while test -n "$pathleft"; do
            d=${pathleft%%:*}
            pathleft=${pathleft#*:}
            # Empty PATH elements are not searched
            test -n "$d" || continue
            if test -x "$d/$cmd" && test ! -d "$d/$cmd"; then return 0; fi
        done
        return 1 ;;
    esac
}

# Check that the helper commands which we need are available.
#
# Arguments: the command names (or paths) to check
# Each one is tested with iscmd. For the first one which is not found,
# senderror is called with HELPERNOTFOUND and the command name.
checkcmds()
{
    # "$@" and "$cmd" are quoted so that every argument is checked as
    # received, without word splitting or glob expansion.
    for cmd in "$@"; do
        iscmd "$cmd" || senderror HELPERNOTFOUND "$cmd"
    done
}

# Exactly one argument is expected. Anything else, or an explicit --help
# request, gets the usage message.
if [ $# -ne 1 ] || [ "$1" = "--help" ]; then
  echo "Convert a $filetype file to HTML text for Recoll indexing."
  echo "Usage: $progname [infile]"
  exit 1
fi

infile=$1

# The input must be an existing regular file (may be '-' for stdin)
case "$infile" in
-) ;;
*) [ -f "$infile" ] || senderror INPUTNOSUCHFILE "$infile" ;;
esac

# Whatever we create from now on (temp files and directories) is only
# accessible by us
umask 077

##############################################################################
# !! Leave the following line unmodified !
#ENDRECFILTCOMMONCODE

# These helpers are needed whether we perform OCR or not
checkcmds pdftotext iconv awk

# OCR is only possible if both tesseract and pdftoppm are available, and
# only enabled if the ocrpdf file exists in the configuration directory.
confdir=${RECOLL_CONFDIR:-~/.recoll}
if iscmd tesseract && iscmd pdftoppm && [ -f "$confdir/ocrpdf" ]; then
    ocrpossible=1
else
    ocrpossible=0
fi

# Stays empty as long as no temporary directory was created: this is
# what cleanup tests.
tmpdir=""

# Remove our temporary directory, if there is one.
# Globals: tmpdir (read) - nothing is done while it is empty
cleanup()
{
    # Note that we're using a constant part (rclpdftmp), that hopefully
    # guarantees that we can't do big mistakes with the -rf here.
    # The paths are quoted: unquoted, a temporary directory name
    # containing whitespace would make rm -rf work on the wrong paths.
    if test -n "$tmpdir"; then
        rm -rf -- "$tmpdir/rclpdftmp"
        rmdir -- "$tmpdir"
    fi
}

# Clean up whenever the script exits. A trapped signal must also end the
# script: with cleanup itself as the signal action, execution would resume
# after the handler with the temporary files gone, and could end with a
# success status and an empty output. Exiting from the signal action runs
# the EXIT trap, so cleanup is still performed, once.
trap cleanup EXIT
trap 'exit 1' HUP QUIT INT TERM

# Run pdftotext on the input file and write the resulting HTML, with
# some fixes applied, to stdout:
#  - a charset meta tag is inserted before </head>
#  - the title and the single-line meta fields are HTML-escaped if the
#    pdftotext version does not do it by itself
#  - a "Subject" meta field is renamed to "Description"
#  - the text inside <pre> is HTML-escaped
#  - a word which is hyphenated at the end of a line is joined with the
#    beginning of the next line
#
# Globals: infile (read), optionraw (read, may be unset or empty),
#          XYZ, MAJOR, MINOR, escapeheader, escapebody (written)
# Returns: the status of the pipeline, which is the one of the awk filter
runpdftotext()
{
    # Test poppler version: at some point before 0.24, poppler began
    # to properly escape text inside the header (but not the body).
    XYZ=$(pdftotext -v 2>&1 | awk '/pdftotext/{print $3}')
    MAJOR=${XYZ%%.*}
    MINOR=${XYZ#*.}
    MINOR=${MINOR%%.*}
    # If the version could not be parsed, use 0: this keeps the header
    # escaping active, without error messages from the tests below.
    case $MAJOR in ''|*[!0-9]*) MAJOR=0 ;; esac
    case $MINOR in ''|*[!0-9]*) MINOR=0 ;; esac
    escapeheader=1
    escapebody=1
    if test "$MAJOR" -gt 0 || test "$MINOR" -ge 24; then
        escapeheader=0
    fi

    # Run pdftotext and fix the result (add a charset tag and fix the
    # html escaping). The escaping is a half-hearted job. We do try to
    # fix some header fields, only for those which are single-line.
    # $optionraw is not quoted on purpose: it must vanish when empty.
    pdftotext $optionraw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - |
    iconv -f UTF-8 -t UTF-8 -c -s |
    awk -v escapeheader="$escapeheader" -v escapebody="$escapebody" '
BEGIN {
  # Set to 1 while we are inside the <pre> body text
  inbodypre = 0
  # End of the previous line, to be prepended to the current one
  cont = ""
}
function escapehtml(s)
{
  gsub(/&/, "\\&amp;", s)
  gsub(/</, "\\&lt;", s)
  gsub(/>/, "\\&gt;", s)
  gsub(/"/, "\\&quot;", s)
  return s
}
# Note about all the substr() calls below: awk string positions begin
# at 1. The result for a start position of 0 depends on the awk
# implementation (gawk and mawk return one character less than asked
# for), so we always use explicit positions.
{
  $0 = cont $0
  cont = ""
  # Insert charset meta tag at end of header
  if (inbodypre == 0 && match($0, /<\/head>/)) {
    part1 = substr($0, 1, RSTART-1)
    part2 = substr($0, RSTART)
    charsetmeta = "<meta http-equiv=\"Content-Type\" "\
                  "content=\"text/html; charset=UTF-8\">"
    $0 = part1 charsetmeta "\n" part2
  }
  if (inbodypre == 0 && match($0, /<title>.*<\/title>/)) {
    part1 = substr($0, 1, RSTART-1)
    mid = substr($0, RSTART, RLENGTH)
    part2 = substr($0, RSTART + RLENGTH)
    gsub(/<title>/, "", mid)
    gsub(/<\/title>/, "", mid)
    if (escapeheader) {
        mid = escapehtml(mid)
    }
    mid = "<title>" mid "</title>"
    $0 = part1 mid part2
  }
  # This matches all single-line meta fields
  if (inbodypre == 0 && match($0, /content=".*"\/>/)) {
    part1 = substr($0, 1, RSTART-1)
    mid = substr($0, RSTART, RLENGTH)
    part2 = substr($0, RSTART + RLENGTH)
    gsub(/content="/, "", mid)
    gsub(/"\/>/, "", mid)
    if (escapeheader) {
        mid = escapehtml(mid)
    }
    mid = "content=\"" mid "\"/>"
    $0 = part1 mid part2
  }

  # Recoll treats "Subject" as a "title" element (based on emails). The PDF
  # "Subject" metadata field is more like an HTML "description"
  if (inbodypre == 0 && $0 ~ /<meta ?name="Subject"/) {
      gsub(/="Subject"/, "=\"Description\"", $0)
  }

  if ($0 == "<pre>") {
    # Begin of body text.
    inbodypre++
    print $0
    next
  } else if ($0 ~ /<\/pre>/) {
    inbodypre--
    print $0
    next
  } else if ($0 ~ /[-]$/) {
    # Note : soft-hyphen is iso8859 0xad
    # The line ends with a hyphen: keep its last word, less the hyphen,
    # to be joined with the next line.
    if (match($0, "[ \t][^ \t]+$")) {
      # Break at last whitespace, which goes with the continuation
      line = substr($0, 1, RSTART-1)
      cont = substr($0, RSTART, RLENGTH-1)
    } else {
      # No whitespace: the whole line is the hyphenated word. Without
      # this case, the text of the line would be lost.
      line = ""
      cont = substr($0, 1, length($0)-1)
    }
    $0 = line
  }
  if (inbodypre > 0 && escapebody) {
      $0 = escapehtml($0)
  }
  print $0
}
'
}

# Without OCR capability, the pdftotext output is all that we can
# produce: send it straight to stdout, and end with the status of the
# conversion.
if [ "$ocrpossible" -eq 0 ]; then
    runpdftotext
    exit
fi


# tesseract is installed, prepare for running it.
# We need to check the pdftotext output, but we don't want to run
# it twice. Use a temporary file.
# The location is taken from RECOLL_TMPDIR, then TMPDIR, then defaults
# to /tmp. All the uses of the temporary paths are quoted, so that a
# location with whitespace in its name works.
ttdir=${RECOLL_TMPDIR:-${TMPDIR:-/tmp}}
# tmpdir, which the exit-time cleanup function removes, is only set once
# the directory exists: if mkdir fails (e.g. because the name is already
# taken), we must not remove something which we did not create.
newtmpdir=$ttdir/rclpdf_tmp$$
mkdir "$newtmpdir" || senderror "mkdir $newtmpdir failed"
tmpdir=$newtmpdir
mkdir "$tmpdir/rclpdftmp" || senderror "mkdir $tmpdir/rclpdftmp failed"

# Run pdftotext into the temp file
pdftxtfile=$tmpdir/rclpdftmp/pdftxtfile
runpdftotext > "$pdftxtfile"

# If text is big, or small but not only tags and empty lines, output
# it. Given the contents check which we perform, a file in which the
# only text content is metadata (pdf description field), will be run
# through OCR, which is not necessarily what we would want. It would
# be possible to detect the situation if this proved an issue.
# The size comes from wc rather than from a field of the ls -l output,
# which is misplaced if the owner or group name contains whitespace.
txtsize=$(wc -c < "$pdftxtfile" | awk '{print $1}')
txtempty=0
# Use grep to check if there is regular text in there. Only do it on
# small outputs. With -v, grep selects the lines which are neither blank
# nor made only of tags: if there are none, there is no text.
# (grep -E replaces egrep, which is obsolescent and causes a warning on
# stderr with recent GNU grep versions.)
if test "$txtsize" -lt 5000 ; then
    grep -E -q -v '^[[:space:]]*$|^[[:space:]]*<.*>[[:space:]]*$' \
        "$pdftxtfile" || txtempty=1
fi

if test $txtempty -eq 0; then
    # pdftotext produced actual output, use it. No OCR
    cat "$pdftxtfile"
    exit 0
fi

# PDF has no text content and tesseract is available. Give it a try

# Choose the tesseract language and store it in the tesseractlang
# variable. This should depend on the input file, but we have no general
# way to determine it. So, in order of preference, we use:
#  - the content of the .ocrpdflang file in the directory of $infile
#  - the RECOLL_TESSERACT_LANG environment variable
#  - the content of the $confdir/ocrpdf file
#  - a guess from LANG (only en, de and fr are known for now)
#  - "eng"
# The configuration file is used before the LANG guess because an
# explicit setting should win over a heuristic. (The previous code tested
# a misspelled variable name: the content of the file always replaced the
# guess, and an empty file discarded it in favour of "eng".)
#
# Globals: infile, confdir, RECOLL_TESSERACT_LANG, LANG (read);
#          pdflangfile, localelang, tesseractlang (written)
choosetesseractlang()
{
    pdflangfile=`dirname "$infile"`/.ocrpdflang
    if test -f "$pdflangfile"; then
        tesseractlang=`cat "$pdflangfile"`
    fi

    if test -z "$tesseractlang"; then
        tesseractlang=${RECOLL_TESSERACT_LANG}
    fi

    if test -z "$tesseractlang" && test -f "$confdir/ocrpdf"; then
        tesseractlang=`cat "$confdir/ocrpdf"`
    fi

    if test -z "$tesseractlang"; then
        # Half assed trial to guess from LANG: use what comes before the
        # first underscore.
        localelang=${LANG%%_*}
        # echo localelang "$localelang" >&2
        case "$localelang" in
        en) tesseractlang=eng;;
        de) tesseractlang=deu;;
        fr) tesseractlang=fra;;
        # Someone will have to add more tesseract language codes here.
        esac
    fi

    if test -z "$tesseractlang"; then
        tesseractlang="eng"
    fi
}

choosetesseractlang

# echo tesseractlang "$tesseractlang" >&2

# File receiving the tesseract messages for the page being processed
TESSERRORFILE="$tmpdir/rclpdftmp/tesserrorfile"
# Path prefix for the page images created by pdftoppm, and for the text
# files which tesseract derives from them. (The X's are literal.)
TMPFILE="$tmpdir/rclpdftmp/ocrXXXXXX"

# split pdf-pages
if ! ERR_MSG=$(pdftoppm -r 300 "$infile" "$TMPFILE" 2>&1); then
    senderror "pdftoppm: $ERR_MSG"
fi

# Run tesseract on each non-empty page image. The list of images is
# established before the first tesseract run. All the file names are
# quoted so that a temporary directory with whitespace in its name works.
for i in "$TMPFILE"* ; do
    if [ -s "$i" ] ; then

        # The text for the page goes to "$i".txt
        tesseract "$i" "$i" -l "$tesseractlang" > "$TESSERRORFILE" 2>&1
        TESSERR=$?
        # ignore tesseract start message
        LINECOUNT=$(wc -l < "$TESSERRORFILE")
        if [ "$TESSERR" -ne 0 ] || [ "$LINECOUNT" -gt 1 ] ; then
            echo "tesseract-error $TESSERR page $i in $infile" >&2
            # sort "compacts" leptonica-output
            sort -u "$TESSERRORFILE" >&2
        fi
        # else
            # debugging purpose
            # SICFILE=$(mktemp -p $tmpdir -t sicXXXXXX)
            # echo "no pdftoppm in $infile cp to $SICFILE" >&2
            # cp -a $infile $SICFILE
        # fi
    fi
done

# Only produce an HTML document if OCR yielded at least one character:
# we don't output "empty" HTML-Files. The text goes inside a <pre>
# element, with &, < and > escaped.
CHARS=$(cat "$TMPFILE"*.txt 2>/dev/null | wc -m)
if test "$CHARS" -gt 0; then
    printf '%s\n' '<HTML><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></head><body><pre>'
    cat "$TMPFILE"*.txt |
        awk '
function esc(s)
{
  gsub(/&/, "\\&amp;", s)
  gsub(/</, "\\&lt;", s)
  gsub(/>/, "\\&gt;", s)
  return s
}
{ print esc($0) }
'
    printf '%s\n' '</pre></body></html>'
fi
