#!/bin/bash
#
#    djvuocr.sh is free software. You can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    djvuocr.sh is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with djvuocr.sh.  If not, see <http://www.gnu.org/licenses/>.
#
#
# This script is meant to automate the process of converting
# image-based PDF files into searchable DJVU files by means of OCR.
#
# It was inspired by:
# http://superuser.com/questions/641899/how-to-automatically-find-non-searchable-pdfs
# http://jwilk.net/software/ocrodjvu
#
# It is required to have the following installed:
#   ocrodjvu (tested with 0.7.18 from Debian packages.debian.org)
#   pdftoppm, pdfinfo and pdffonts (from poppler-utils 0.24.5)
#   scantailor (tested with 0.9.11.1)
#   djvm and cjb2 (from djvulibre; djvulibre-bin 3.5.25.4-3)
#   pdf2djvu (used for the pages that are already searchable)
#   uuidgen (used to name the temporary working directory)
#
# Save this script to a file and make it executable:
#   chmod +x djvuocr.sh
#
# Then run it on a terminal with the name of the PDF that you want to
# OCR:
#   djvuocr.sh /directory/MyPdfFile.pdf
#
# Good luck!

# Exactly one argument is expected: the path of the PDF to convert.
# The script name is passed as printf data (never as the format string)
# so that a '%' in the path cannot corrupt the message, and the usage
# error goes to stderr.
if (( $# != 1 )); then
    printf 'Usage: %s /path/to/PDF\n' "$0" >&2
    exit 1
fi

# Collect the packages that provide any required tool missing from PATH.
# The names are kept in an array so that each one reaches the package
# manager as its own argument.
inst=()
for tool in scantailor pdf2djvu ocrodjvu; do
    command -v "$tool" >/dev/null 2>&1 || inst+=("$tool")
done
# pdftoppm, pdfinfo and pdffonts all come from poppler-utils.
for tool in pdftoppm pdfinfo pdffonts; do
    if ! command -v "$tool" >/dev/null 2>&1; then
        inst+=(poppler-utils)
        break
    fi
done
# djvm and cjb2 both come from djvulibre-bin.
for tool in djvm cjb2; do
    if ! command -v "$tool" >/dev/null 2>&1; then
        inst+=(djvulibre-bin)
        break
    fi
done

if (( ${#inst[@]} > 0 )); then
    printf 'The following utilities need to be installed:\n%s\n\n' "${inst[*]}"
    printf 'Do you want to continue? [Y/n]'
    read -r bol_inst
    case "$bol_inst" in
        [yY] | [yY][eE][sS] | "")
            # Without the tools the rest of the script cannot work, so a
            # failed installation is fatal.
            sudo aptitude install "${inst[@]}" || exit 1
            ;;
        *)
            # "no" and any unrecognised answer: do not go on with
            # missing tools.
            printf 'Aborting. Have a nice day.\n'
            exit 1
            ;;
    esac
fi

arch_pdf="$1"

# Ask pdfinfo for the page count (the number on its "Pages:" line).
num_pag=$(pdfinfo "$arch_pdf" | awk '/^Pages:/ {print $2}')
if [[ ! "$num_pag" =~ ^[0-9]+$ ]]; then
    printf 'Could not determine the number of pages of %s\n' "$arch_pdf" >&2
    exit 1
fi

# Go page by page trying to see which ones are searchable
# (searchable pages should have a font).
# pags ends up as a space-separated list of the page numbers that have
# no fonts; it is left empty when every page is searchable.
pags=""
for (( i = 1; i <= num_pag; i++ )); do
    # Count the number of fonts (get rid of the two pdffonts header lines)
    font=$(pdffonts -f "$i" -l "$i" "$arch_pdf" | wc -l)
    font=$(( font - 2 ))
    # If there are no fonts, this page is not searchable, and I want
    # to store it
    if (( font == 0 )); then
        pags="$pags $i"
    fi
done

# Start the conversion.
# Reads:  pags     (space-separated list of non-searchable page numbers),
#         num_pag  (total number of pages) and arch_pdf (the input PDF).
# Steps:  1) render the non-searchable pages to TIFF,
#         2) optionally let the user clean them up in scantailor,
#         3) encode each TIFF to DJVU and OCR it with ocrodjvu,
#         4) convert every other page straight from the PDF with pdf2djvu,
#         5) join all pages, in page order, into one DJVU file,
#         6) optionally remove the intermediate files.
if [[ -n "$pags" ]]; then
    printf '***** It has non-searchable pages %s *****\n' "$pags"

    # nomTmp may be supplied by the caller; otherwise a fresh UUID is used.
    if [[ -z "$nomTmp" ]]; then
        nomTmp="$(uuidgen)"
    fi
    if [[ -z "$nomTmp" ]]; then
        printf 'Could not generate a name for the working directory\n' >&2
        exit 1
    fi
    OCRdir="/tmp/OCR/$nomTmp"
    mkdir -p "$OCRdir" || exit 1
    printf '\nCreated working directory %s' "$OCRdir"

    printf '\nConverting pages to TIFF: '
    # pags is a space-separated list, so word splitting is intended here.
    for i in $pags; do
        printf '%s ' "$i"
        pdftoppm -tiffcompression none -r 300 -tiff -f "$i" -l "$i" "$arch_pdf" "$OCRdir/$nomTmp"
    done
    printf '\n'

    printf 'Do you want to use scantailor to improve the OCR? [y/N] '
    read -r bol_tailor
    # Name of the scantailor output sub-directory; empty when it is skipped.
    tailor_subdir=""
    case "$bol_tailor" in
        [yY] | [yY][eE][sS])
            printf '%s' "When scantailor is opened,
1) click \"New Project\";
2) choose $OCRdir as \"Input Directory\";
3) don't change the \"Output Directory\";
4) fullfil the tasks to the left;
5) in the last task select black and white (not mixed nor grayscale);
6) when done, click on the play symbol (an arrow within a circle), and
7) close scantailor. You don't need to save the project if you don't want to.

You'll be brought back to this terminal when finished. Hit ENTER now."
            read -r
            scantailor
            tailor_subdir="out"
            ;;
        *)
            printf 'Skipping scantailor'
            ;;
    esac
    # Directory holding the TIFFs to encode: the scantailor output when it
    # was used, the working directory itself otherwise.
    if [[ -n "$tailor_subdir" ]]; then
        out_dir="$OCRdir/$tailor_subdir"
    else
        out_dir="$OCRdir"
    fi

    printf '\nConverting pages to DJVU: \n'
    djvu_dir="$OCRdir/djvu"
    mkdir -p "$djvu_dir" || exit 1
    # Check if tesseract is working
    ocr_args=(--in-place)
    if [[ "$(ocrodjvu --list-engines)" =~ tesseract ]]; then
        printf 'Using tesseract OCR\n'
        ocr_args+=(-e tesseract)
    else
        printf 'Not using tesseract. The results will be sub-optimal\n'
    fi
    for i in "$out_dir"/*.tif; do
        [[ -f "$i" ]] || continue
        # http://jwilk.net/software/ocrodjvu
        djvu_arch="$djvu_dir/$(basename -s .tif "$i").djvu"
        # Only OCR pages that were actually encoded; a page cjb2 rejects is
        # later taken straight from the PDF instead.
        if cjb2 "$i" "$djvu_arch"; then
            ocrodjvu "${ocr_args[@]}" "$djvu_arch"
        else
            printf 'cjb2 could not encode %s\n' "$i" >&2
            rm -f -- "$djvu_arch"
        fi
    done

    # Index the OCRed pages by their numeric page number. The number is
    # whatever follows the last '-' in the file name; it is read in base 10
    # so that zero-padded names such as "-005" map to page 5.
    ocr_djvu=()
    for candidate in "$djvu_dir/$nomTmp-"*.djvu; do
        [[ -f "$candidate" ]] || continue
        page_num="${candidate##*-}"
        page_num="${page_num%.djvu}"
        if [[ "$page_num" =~ ^[0-9]+$ ]]; then
            ocr_djvu[10#$page_num]="$candidate"
        fi
    done

    printf '\nExtracting remaining pages from PDF: \n'
    # Build the page list explicitly in page order: a sorted glob would put
    # page 10 before page 2.
    djvu_pages=()
    for (( i = 1; i <= num_pag; i++ )); do
        page_djvu="${ocr_djvu[i]}"
        if [[ -z "$page_djvu" ]]; then
            page_djvu="$djvu_dir/$nomTmp-$i.djvu"
            pdf2djvu -j0 --lines -p "$i" -o "$page_djvu" "$arch_pdf"
        fi
        if [[ -f "$page_djvu" ]]; then
            djvu_pages+=("$page_djvu")
        else
            printf 'Page %s could not be converted and will be missing\n' "$i" >&2
        fi
    done

    printf '\nJoining to DJVU: \n'
    if ! djvm -c "$OCRdir/$(basename -s .pdf "$arch_pdf")-OCR.djvu" "${djvu_pages[@]}"; then
        printf 'djvm failed; the intermediate files are kept in %s\n' "$OCRdir" >&2
        exit 1
    fi

    printf '\n***** Success! *****\n'
    printf 'Your new file is located at %s' "$OCRdir"

    printf '\nDo you want to clean the temporary files? [Y/n] '
    read -r fin
    case "$fin" in
        [yY] | [yY][eE][sS] | "")
            rm -fr -- "${djvu_dir:?}"
            # The glob has to stay outside the quotes to be expanded.
            rm -f -- "$OCRdir/$nomTmp"*.tif
            # Only remove the scantailor output directory. When scantailor
            # was skipped out_dir is the working directory itself, which
            # holds the finished DJVU file.
            if [[ -n "$tailor_subdir" ]]; then
                rm -fr -- "${out_dir:?}"
            fi
            ;;
        [nN] | [nN][oO])
            printf '\n'
            du -h "$OCRdir"
            ;;
    esac
fi