#!/bin/bash
#
# djvuocr.sh is free software. You can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# djvuocr.sh is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with djvuocr.sh. If not, see <http://www.gnu.org/licenses/>.
#
#
# This script is meant to automate the process of converting
# image-based PDF files into searchable DJVU files by means of OCR.
#
# It was inspired by:
#   http://superuser.com/questions/641899/how-to-automatically-find-non-searchable-pdfs
#   http://jwilk.net/software/ocrodjvu
#
# It is required to have the following installed:
#   ocrodjvu (tested with 0.7.18 from Debian packages.debian.org)
#   pdftoppm, pdfinfo and pdffonts (from poppler-utils 0.24.5)
#   scantailor (tested with 0.9.11.1)
#   djvm and cjb2 (from djvulibre; djvulibre-bin 3.5.25.4-3)
#
# Save this script to a file and make it executable:
#   chmod +x djvuocr.sh
#
# Then run it on a terminal with the name of the PDF that you want to
# OCR:
#   djvuocr.sh /directory/MyPdfFile.pdf
#
# Good luck!

#######################################
# Validate the number of arguments it is called with.
# Arguments:
#   The arguments to validate; exactly one is accepted.
# Outputs:
#   Writes a usage line to stdout when the count is not 1.
# Returns:
#   0 when exactly one argument was given; otherwise exits the script
#   with status 1.
#######################################
function check_use() {
  if (( $# != 1 )); then
    # $0 is passed as data, never as the format string, so a '%' or a
    # backslash in the script path cannot corrupt the message.
    printf 'Usage: %s /path/to/directory\n' "$0"
    exit 1
  fi
}

#######################################
# Report whether at least one of the given commands cannot be found.
# Arguments:
#   One or more command names.
# Returns:
#   0 if any of them is missing, 1 if all of them are available.
#######################################
function _any_missing() {
  local tool
  for tool in "$@"; do
    command -v "$tool" >/dev/null 2>&1 || return 0
  done
  return 1
}

#######################################
# Check that the external tools used by this script are available and, if
# some are not, offer to install the packages that provide them.
# Globals:
#   inst - appended with the name of every package found missing
#          (each name preceded by a single space).
# Arguments:
#   None.
# Inputs:
#   Reads the answer to the install prompt from stdin.
# Outputs:
#   Writes the list of missing packages and the prompt to stdout.
# Returns:
#   0 when nothing is missing; the status of the install command when the
#   user accepts. Exits the script with status 1 when the user declines or
#   gives an answer that is not recognised.
#######################################
function check_apps() {
  local bol_inst
  local -a pkgs

  _any_missing scantailor && inst="$inst scantailor"
  _any_missing pdf2djvu && inst="$inst pdf2djvu"
  _any_missing ocrodjvu && inst="$inst ocrodjvu"
  _any_missing pdftoppm pdfinfo pdffonts && inst="$inst poppler-utils"
  _any_missing djvm cjb2 && inst="$inst djvulibre-bin"

  if [[ -n "$inst" ]]; then
    printf 'The following utilities need to be installed:\n%s\n\n' "$inst"
    printf 'Do you want to continue? \n[Y/n]'
    read -r bol_inst
    case "$bol_inst" in
      [yY] | [yY][eE][sS] | "")
        # Split the space-separated list into separate arguments: passing
        # "$inst" quoted would hand aptitude one bogus package name.
        IFS=' ' read -r -a pkgs <<<"$inst"
        sudo aptitude install "${pkgs[@]}"
        ;;
      [nN] | [nN][oO])
        printf 'Aborting. Have a nice day.\n'
        exit 1
        ;;
      *)
        # Do not carry on with tools missing after an ambiguous answer.
        printf 'Unrecognised answer "%s". Aborting.\n' "$bol_inst" >&2
        exit 1
        ;;
    esac
  fi
}

# ---------------------------------------------------------------------------
# NOTE(review): the rest of this script is incomplete in this copy of the
# file. The text between "read bol_tailor" and "/dev/null)" is missing, which
# loses the end of DjVuOCR(), the definition of the `main` that the surviving
# tail calls, and the head of the loop that consumes the `find` output.
# The two surviving fragments are kept below, commented out, because they do
# not parse without the missing text. Restore the missing section from a
# pristine copy of the script before re-enabling them.
# ---------------------------------------------------------------------------
#
# function DjVuOCR() {
#   [[ -z $nomTmp ]] && nomTmp="$(uuidgen)";
#   OCRdir="/tmp/OCR/$nomTmp";
#   mkdir -p "$OCRdir";
#   printf "\nCreated working directory $OCRdir";
#   printf "\nConverting pages to TIFF: ";
#   for i in $pags; do
#     printf "$i ";
#     pdftoppm -tiffcompression none -r 300 -tiff -f "$i" -l "$i" "$arch_pdf" "$OCRdir/$nomTmp";
#   done;
#   printf "\n";
#   printf "Do you want to use scantailor to improve the OCR? [y/N] "
#   read bol_tailor
#
#   [ ... text missing from this copy of the file ... ]
#
#   /dev/null)";
#   # Catch the exit code
#   RET_PDFINFO="$?";
#   # Check if there was an error (0 if there is no error)
#   if [[ ! "$RET_PDFINFO" = "0" ]]; then
#     READ_ERROR=1;
#     printf "Error while reading $FILE. Skipping...\n";
#     # stop this loop and start the next one
#     continue;
#   fi;
#   main;
# done < <(find "$dir" -type f -iname '*pdf' -print0)