Skip to content

Commit

Permalink
Merge pull request #267 from mediamicroservices/makepdf-cleanup
Browse files Browse the repository at this point in the history
Makepdf cleanup
  • Loading branch information
smithmyz authored Sep 4, 2019
2 parents 93f2a7f + 37cea4c commit 39df424
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 51 deletions.
82 changes: 34 additions & 48 deletions makepdf
Original file line number Diff line number Diff line change
@@ -1,49 +1,18 @@
#!/bin/bash
# makepdf, makes a pdf from an image series
VERSION="1.0"
# directory containing this script; used below to locate mmfunctions
SCRIPTDIR=$(dirname $(which "${0}"))
# same lookup with every expansion quoted; this later assignment is the one left in effect
SCRIPTDIR="$(dirname "$(which "${0}")")"
# source the shared function library from the script directory; exit with status 1 if that fails
. "${SCRIPTDIR}/mmfunctions" || { echo "Missing '${SCRIPTDIR}/mmfunctions'. Exiting." ; exit 1 ;};
# external commands this script relies on (the list is echoed by _usage)
DEPENDENCIES=(ffmpeg convert pdfjoin tesseract)
# reassigned without convert; this later list is the one left in effect
DEPENDENCIES=(ffmpeg pdfjoin tesseract)
# NOTE(review): _initialize_make is not defined in the visible part of this file - presumably provided by mmfunctions; confirm
_initialize_make

# fix_tif_name: rename a tif so that its page number becomes a zero-padded,
# three-character field, i.e. <name>_<NNN>.<extension>.
# Arguments: $1 - path to the tif file to rename
# Globals:   NEW_TIF_NAME (written) - the path the file is renamed to
# Outputs:   whatever _report and `mv -v` print
# Returns:   the exit status of mv
fix_tif_name(){
    # work on the path that was passed in rather than on the caller's ${TIF}
    local tif="${1}"
    # declared separately from the assignments so `local` does not mask failures
    local tifname tifdir basename pageno EXTENSION
    tifname="$(basename "${tif}")"
    tifdir="$(dirname "${tif}")"
    # text before the first underscore; if that text itself contains a dot,
    # only its second dot-separated field is kept (cut passes dot-less text through)
    basename="$(echo "${tifname}" | cut -d_ -f1 | cut -d. -f2)"
    # second underscore-separated field, split on the dot into page number and extension
    pageno="$(echo "${tifname}" | cut -d_ -f2 | cut -d. -f1)"
    EXTENSION="$(echo "${tifname}" | cut -d_ -f2 | cut -d. -f2)"
    # left-pad with zeros, then keep the last three characters (tail also keeps
    # echo's trailing newline, which head drops again)
    pageno="$(echo "0000${pageno}" | tail -c 4 | head -c 3)"
    _report -dt "Renaming to fit image sequence naming pattern."
    NEW_TIF_NAME="${tifdir}/${basename}_${pageno}.${EXTENSION}"
    # -n: never overwrite an existing file of the target name
    mv -v -n "${tif}" "${NEW_TIF_NAME}"
}

# check_tif_name: test whether a tif is named <name>_<NNN>.TIF (three-character
# page number, extension exactly "TIF") and hand it to fix_tif_name if not.
# Arguments: $1 - path to the tif file to check
# Globals:   NEW_TIF_NAME (written) - emptied here; set again by fix_tif_name
#            only when a rename is attempted
# Returns:   0 when the name already conforms, otherwise the status of fix_tif_name
check_tif_name(){
    # work on the path that was passed in rather than on the caller's ${TIF}
    local tif="${1}"
    # declared separately from the assignments so `local` does not mask failures
    local tifname pageno EXTENSION
    tifname="$(basename "${tif}")"
    # second underscore-separated field, split on the dot into page number and extension
    pageno="$(echo "${tifname}" | cut -d_ -f2 | cut -d. -f1)"
    EXTENSION="$(echo "${tifname}" | cut -d_ -f2 | cut -d. -f2)"
    # forget the result of any earlier call so the caller's test of
    # NEW_TIF_NAME only succeeds when this file was handed to fix_tif_name
    NEW_TIF_NAME=""
    # measure the page number as found in the name; padding it first would
    # make every name look conforming
    if [[ "${EXTENSION}" = "TIF" && "${#pageno}" = "3" ]] ; then
        # a function cannot `continue` its caller's loop in bash 4.4 and later; just return
        return 0
    fi
    fix_tif_name "${tif}"
}

_usage(){
echo
echo "$(basename "${0}") ${VERSION}"
echo "This application will create a small pdf file (suitable for sharing online) from a collection of image file or package input with the following options."
echo "Dependencies: ${DEPENDENCIES[@]}"
echo "Usage: $(basename ${0}) [ -d /path/to/deliver/to/ ] fileorpackage1 [ fileorpackage2 ...]"
echo " -d directory ( directory to deliver the resulting file to )"
echo "Usage: $(basename "${0}") [ -o /path/to/deliver/to/ ] fileorpackage1 [ fileorpackage2 ...]"
echo " -o directory ( directory to write the resulting file to )"
echo " -n (dry-run mode, show the commands that would be run but don't do anything)"
echo " -e emailaddress ( send an email about the delivery, only valid if -d is used )"
echo " -E emailaddress ( send an email about process outcome )"
echo " -h ( display this help )"
Expand All @@ -54,16 +23,14 @@ _usage(){

# command-line options to set mediaid and original variables
OPTIND=1
while getopts ":o:d:e:E:nh" OPT ; do
while getopts ":o:e:E:h" OPT ; do
case "${OPT}" in
o) OUTPUTDIR_FORCED="${OPTARG}" && _check_outputdir_forced ;;
d) DELIVERDIR="${OPTARG}" && _check_deliverdir ;;
e) EMAILADDRESS_DELIVERY="${OPTARG}" ;;
E) EMAILADDRESS_OUTCOME="${OPTARG}" ;;
n) DRYRUN=true;;
h) _usage ;;
*) echo "bad option -${OPTARG}" ; _usage ;;
:) echo "Option -${OPTARG} requires an argument" ; _writeerrorlog "makepdf" "The option selected required an argument and none was provided. The script had to exit." ; exit 1 ;;
*) echo "bad option -${OPTARG}" ; _usage ;;
esac
done
shift $(( ${OPTIND} - 1 ))
Expand All @@ -85,7 +52,7 @@ while [ "${*}" != "" ] ; do
fi
OUTPUTDIRTEXT="${INPUT}/objects/access/txt_1"
_run mkdir -p "${LOGDIR}"
exec > >(tee "${LOGDIR}/$(basename "${0}")_$(_get_iso8601_c)_$(basename ${0})_${VERSION}.txt")
exec > >(tee "${LOGDIR}/$(basename "${0}")_$(_get_iso8601_c)_$(basename "${0}")_${VERSION}.txt")
exec 2>&1

#_find_input "${INPUT}"
Expand All @@ -108,23 +75,42 @@ while [ "${*}" != "" ] ; do
TMP_JPG_DIR="${TMP_MAKEPDF_DIR}/jpgs"
_run mkdir -p "${TMP_MAKEPDF_DIR}" "${TMP_JPG_DIR}" "${OUTPUTDIRTEXT}"

for TIF in `find "${SOURCEDIR}" -maxdepth 1 -mindepth 1 -iname "*.tif" -type f` ; do
check_tif_name "${TIF}"
TIF_BASE_NAME="$(basename "${TIF%.*}")"
if [ "${NEW_TIF_NAME}" ] ; then
for TIF in $(find "${SOURCEDIR}" -maxdepth 1 -mindepth 1 -iname "*.tif" -type f | sort) ; do
tifname="$(basename "${TIF}")"
_report -dt "Working on ${tifname}..."
pageno="$(echo "${tifname}" | cut -d_ -f2 | cut -d. -f1)"
# check that tif name adheres to filename pattern and rename if not
if [[ "${#pageno}" != "3" ]] ; then
tifdir="$(dirname "${TIF}")"
basename="$(echo "${tifname}" | cut -d_ -f1 | cut -d. -f2)"
EXTENSION="${tifname##*.}"
pageno="$(echo "0000${pageno}" | tail -c 4 | head -c 3)"
_report -dt "Renaming to fit image sequence naming pattern."
NEW_TIF_NAME="${tifdir}/${basename}_${pageno}.${EXTENSION}"
mv -v -n "${TIF}" "${NEW_TIF_NAME}"
TIF="${NEW_TIF_NAME}"
fi
if [[ ! -s "${TMP_JPG_DIR}/$(basename "${TIF%.*}").jpg" ]] ; then
ffmpeg -i "${TIF}" -pix_fmt yuvj420p -s 1275x1650 "${TMP_JPG_DIR}/${TIF_BASE_NAME}.jpg"
TIF_BASE_NAME="$(basename "${TIF%.*}")"
JPG_NAME="${TMP_JPG_DIR}/${TIF_BASE_NAME}.jpg"
if [[ ! -s "${JPG_NAME}" ]] ; then
ffmpeg -hide_banner -nostdin -i "${TIF}" -pix_fmt yuvj420p -s 1275x1650 "${JPG_NAME}"
fi
tesseract "${TMP_JPG_DIR}/${TIF_BASE_NAME}.jpg" "${TMP_JPG_DIR}/${TIF_BASE_NAME}" -l eng --psm 4 pdf "${SCRIPTDIR}/tesseract.conf"
tesseract "${TMP_JPG_DIR}/${TIF_BASE_NAME}.jpg" "${TMP_JPG_DIR}/${TIF_BASE_NAME}" -l eng --psm 4 "${SCRIPTDIR}/tesseract.conf"
TESSERACT_CONFIG=(-c tessedit_char_whitelist="abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^\&*(){}[]\|\"':;?/>.<,~\` " -c textord_min_linesize=2.25 -c preserve_interword_spaces=1)
tesseract "${JPG_NAME}" "${TMP_JPG_DIR}/${TIF_BASE_NAME}" -l eng --psm 4 "${TESSERACT_CONFIG[@]}" pdf
tesseract "${JPG_NAME}" "${TMP_JPG_DIR}/${TIF_BASE_NAME}" -l eng --psm 4 "${TESSERACT_CONFIG[@]}" txt
done

_report -dt "Checking for PBCore data"
SCRIPT_TITLE=$(fmpbcore "${MEDIAID}" | xmlstarlet 'select' -N "p=http://www.pbcore.org/PBCore/PBCoreNamespace.html" -t -v "/p:pbcoreCollection/p:pbcoreDescriptionDocument/p:pbcoreTitle[@titleType='Series']" -o ": " -v "/p:pbcoreCollection/p:pbcoreDescriptionDocument/p:pbcoreTitle[@titleType='Episode']")
if [[ -n "${SCRIPT_TITLE}" ]] ; then
MIDDLE_OPTIONS+=(--pdftitle "${SCRIPT_TITLE}")
fi
SCRIPT_AUTHOR=$(fmpbcore "${MEDIAID}" | xmlstarlet 'select' -N "p=http://www.pbcore.org/PBCore/PBCoreNamespace.html" -t -m "/p:pbcoreCollection/p:pbcoreDescriptionDocument/p:pbcoreCreator" -v "p:creatorRole" -o ": " -v "p:creator" -o " ; ")
if [[ -n "${SCRIPT_AUTHOR}" ]] ; then
MIDDLE_OPTIONS+=(--pdfauthor "${SCRIPT_AUTHOR}")
fi

pdfjoin --pdftitle "${SCRIPT_TITLE}" --pdfauthor "${SCRIPT_AUTHOR}" --pdfkeywords "${MEDIAID}" --fitpaper 'false' --rotateoversize 'false' --paper letter $(find "${TMP_JPG_DIR}" -name "*.pdf" | sort | xargs) --outfile "${OUTPUT}"
pdfjoin "${MIDDLE_OPTIONS[@]}" --pdfkeywords "${MEDIAID}" --fitpaper 'false' --rotateoversize 'false' --paper letter $(find "${TMP_JPG_DIR}" -name "*.pdf" | sort | xargs) --outfile "${OUTPUT}"

find "${TMP_JPG_DIR}" -name "*.txt" | sort | while read page ; do
PAGENAME="$(basename "${page%.*}")"
Expand Down
3 changes: 0 additions & 3 deletions tesseract.conf

This file was deleted.

0 comments on commit 39df424

Please sign in to comment.