Document Scanning incl OCR - Pastebin.com

1 min read Original article ↗

Guest User

a guest

Aug 19th, 2013

380

0

Never

Not a member of Pastebin yet? Sign Up, it unlocks many cool features!

  #!/bin/bash
  #
  # Scan pages from the scanner's ADF, OCR them with gocr, and write one PDF
  # made of the scanned page images followed by the recognized text.
  #
  # Usage: [num_pages=N] $0 DESTINATION
  #
  #   DESTINATION  output path without extension; ".pdf" is appended.
  #   num_pages    environment variable, number of pages to scan (default 1).
  #
  # External tools used: scanimage, convert, gocr, a2ps, gs.

  set -x
  set -euo pipefail

  # Validate the raw argument: checking "${1}.pdf" for emptiness can never
  # fail, because the ".pdf" suffix is always present.
  dest="${1:-}"
  if [[ -z "$dest" ]]; then
    echo >&2 "Usage: $0 DESTINATION"
    exit 1
  fi
  target="${dest}.pdf"

  num_pages=${num_pages:-1}
  if [[ ! "$num_pages" =~ ^[1-9][0-9]*$ ]]; then
    echo >&2 "num_pages must be a positive integer, got: $num_pages"
    exit 1
  fi

  # Intermediate files live directly inside the temp dir, so use only the last
  # path component of DESTINATION for their names.
  name="${dest##*/}"
  name="${name:-scan}"

  echo "Scanning $num_pages page(s) to $target..."

  tempdir=$(mktemp -d)
  # Remove the scratch directory on every exit path, including failures
  # triggered by 'set -e'.
  trap 'rm -rf -- "$tempdir"' EXIT

  # Pick the SANE device name out of the line of 'scanimage -L' that mentions
  # the scanner model. LC_ALL=C keeps the "device `NAME' is a ..." wording the
  # sed expression relies on.
  devices=$(LC_ALL=C scanimage -L)
  scansource=$(printf '%s\n' "$devices" \
    | grep 'HP 7650 Document scanner' \
    | sed "s,^device \`\([^']\+\)'.*,\1,") || scansource=""
  # If several lines matched, use the first one.
  scansource="${scansource%%$'\n'*}"
  if [[ -z "$scansource" ]]; then
    echo >&2 "No 'HP 7650 Document scanner' device reported by scanimage -L"
    exit 1
  fi

  # Batch mode (-b) writes out0.tif, out1.tif, ... into the current directory,
  # so run the scan from inside the temp dir; the subshell keeps the caller's
  # working directory untouched.
  (
    cd "$tempdir"
    scanimage -d"$scansource" -B64 --format=tiff -l0 -t0 -x215 -y297 \
      --batch-start=0 \
      --batch-count="$num_pages" \
      --batch-increment=1 \
      -b \
      --mode Color \
      --resolution 300 \
      --source ADF
  )

  # Collect per-page files in arrays so paths survive word splitting.
  page_images=()
  page_pdfs=()
  for ((i = 0; i < num_pages; ++i)); do
    tif="${tempdir}/out${i}.tif"
    pdf="${tempdir}/out${i}.pdf"
    convert "$tif" "$pdf"
    page_images+=("$tif")
    page_pdfs+=("$pdf")
  done

  # Stack all pages vertically into one image for OCR. -append acts on the
  # images read so far, so it has to come after the last input file.
  convert "${page_images[@]}" -append "${tempdir}/scanimage.pnm"

  gocr -i "${tempdir}/scanimage.pnm" -o "${tempdir}/${name}.txt" -f UTF8
  a2ps -Xutf-8 -i "${tempdir}/${name}.txt" -o "${tempdir}/${name}.ps"

  # Merge the page PDFs and the OCR text (PostScript) into the final PDF;
  # gs writes the result to stdout.
  gs \
    -dBATCH \
    -q \
    -dNOPAUSE \
    -sDEVICE=pdfwrite \
    -sOutputFile=- \
    "${page_pdfs[@]}" \
    "${tempdir}/${name}.ps" > "$target"

  ls -lh "$tempdir"
  ls -lh -- "$target"