-
Notifications
You must be signed in to change notification settings - Fork 1
/
ocr.sh
executable file
·182 lines (176 loc) · 4.9 KB
/
ocr.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
#!/bin/bash
#######################################
# List the image files below a directory, oldest modification time first.
# Arguments: $1 - directory to search recursively
# Outputs:   one path per line on stdout for every regular file whose name
#            ends in .jpg, .jpeg or .png (case-insensitive)
# Returns:   exit status of the final pipeline stage; when $1 is empty the
#            whole script is killed instead of returning
#######################################
getdir(){
    if [ -z "$1" ];then
        echo "无可用参数,退出" >&2
        # This function is run inside command substitutions, where `exit`
        # would only leave the subshell. $$ still expands to the main
        # script's PID there, so signalling it stops the script itself.
        kill -9 $$
        exit 1
    fi
    # Each line is "<mtime key> <path>": sort numerically on the key, then
    # drop it. -f2- keeps everything after the first space so that paths
    # containing spaces are not truncated; the anchored pattern matches the
    # real extension rather than ".jpg" anywhere in the path.
    find "$1" -type f -printf "%TY%Tj%TH%TM%TS %h/%f\n" \
        | grep -E -i '\.(jpg|jpeg|png)$' \
        | sort -n \
        | cut -d' ' -f2-
}
# Signal handler: report that the process is being terminated on stdout,
# then send SIGKILL to the script's own PID.
killme() {
    printf '%s\n' "进程被终止"
    kill -KILL "$$"
}
# Print the usage/help text on stdout and terminate the shell with status 0.
# Every entry is emitted through `echo -e`, so the "\n" sequences inside the
# strings become extra blank lines.
about() {
    local -a help_lines=(
        "用法:$0 -d <图片所在目录> [-s] [-o <输出文件>]"
        "各参数解释:\n"
        " -d | --directory : 该参数用于指定图片所在目录,也可在所有参数都缺省的情况下省略该参数,直接后接图片目录"
        " -s | --show : 该参数用于显示已识别出的结果"
        " -o | --output : 该参数用于指定识别结果输出位置,若其后为文件夹,则以图片文件名输出,若其后为文件,则统一输出至此文件,若其后值为空,则将以图片名输出到原目录,若该参数缺省,则以output.txt输出到原目录"
        " -h | --help : 打印本帮助\n"
        "本脚本由stackzhao<stackzhao@gmail.com>开发\n\n"
    )
    local entry
    for entry in "${help_lines[@]}"; do
        echo -e "${entry}"
    done
    exit 0
}
#######################################
# Download the prebuilt ImageMagick bundle and make `convert` available.
# Fetches im.tar into /tmp, unpacks it under /usr/local through sudo,
# exports MAGICK_HOME / PATH / LD_LIBRARY_PATH in the running shell and
# appends the same settings to ~/.bashrc (only once) for future shells.
# Globals:   MAGICK_HOME, PATH, LD_LIBRARY_PATH (written and exported)
# Outputs:   progress messages on stdout, failure messages on stderr
# Returns:   0 when `convert` can be found afterwards, non-zero otherwise
#######################################
in_IM(){
    local archive=/tmp/im.tar
    local magick_home=/usr/local/im
    local rcfile="${HOME}/.bashrc"

    echo "正在安装一些必要的软件,请稍等..."
    if [ "$(id -u)" != "0" ];then
        echo "当前登录非root用户,以下操作可能需要您输入密码"
        echo
    fi

    # -O always writes to the same path. With -P an im.tar left over from an
    # earlier run would make wget save the new download under another name,
    # and tar would then unpack the stale archive.
    if ! wget -q https://github.com/zyh001/zyh001.github.io/raw/master/im.tar -O "${archive}";then
        echo "下载失败" >&2
        return 1
    fi
    if ! sudo tar xf "${archive}" -C /usr/local;then
        echo "解压失败" >&2
        return 1
    fi

    # Set the environment directly instead of sourcing ~/.bashrc: the rc file
    # may contain arbitrary user code, or return early before reaching the
    # lines appended below.
    export MAGICK_HOME="${magick_home}"
    export PATH="${MAGICK_HOME}/bin:${PATH}"
    export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}${MAGICK_HOME}/lib"

    # Append, never overwrite, so the user's existing ~/.bashrc survives, and
    # skip the append when the settings are already there. The quoted
    # delimiter keeps the $VARS literal so they expand when the rc file is
    # read, not while it is being written.
    if ! grep -qs 'MAGICK_HOME' "${rcfile}";then
        cat << 'EOF' >> "${rcfile}"
export MAGICK_HOME="/usr/local/im"
export PATH="$MAGICK_HOME/bin:$PATH"
LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}$MAGICK_HOME/lib"
export LD_LIBRARY_PATH
EOF
    fi

    command -v convert >/dev/null 2>&1 && echo "安装完成"
}
# ---------------------------------------------------------------------------
# Startup: signal handling, dependency bootstrap and command-line parsing.
# Leaves its results in globals: imgdir (image directory), output ("yes"
# when -o was given without a value), outfile (the value given to -o) and
# filechange (1 when -s was given).
# ---------------------------------------------------------------------------

# Ctrl-C (SIGINT) stops the whole run through killme.
trap killme INT

# Install the bundled ImageMagick when `convert` is not on PATH.
type convert >/dev/null 2>&1 || in_IM

# When no baidu_OCR.conf sits next to this script, run the helper with --init.
script_dir=$(cd "$(dirname "$0")" && pwd)
if [[ ! -f "${script_dir}/baidu_OCR.conf" ]];then
    python3 "${script_dir}/baidu_OCR.py" --init
fi

# getopt needs long option names separated by commas; without the comma
# after "directory:" neither --directory nor --output is recognised.
# "o::" / "output::" take an optional value, which getopt only accepts in
# the attached forms -oVALUE and --output=VALUE.
if ! ARGS=$(getopt -o 'hsd:o::' -l 'help,show,directory:,output::' \
        -n "$(basename "${BASH_SOURCE[0]}")" -q -- "$@");then
    echo "$* 错误的参数" >&2
    # about finishes with `exit 0`; run it in a subshell so that this error
    # path can still report failure.
    (about) >&2
    echo "退出..." >&2
    exit 1
fi
eval set -- "${ARGS}"
while :
do
    case "$1" in
        -d|--directory)
            if [[ -z ${2} ]];then
                echo '无参数,退出!'
                exit 1
            elif [[ ! -d ${2} ]];then
                echo '参数不是一个可执行的目录'
                exit 1
            fi
            imgdir=${2}
            shift 2
            ;;
        -o|--output)
            case "$2" in
                "")
                    # -o without a value: one .txt per image, next to it
                    output=yes
                    shift 2
                    ;;
                *)
                    outfile=$2
                    shift 2
                    ;;
            esac
            ;;
        -s|--show)
            filechange=1
            shift
            ;;
        -h|--help)
            # about prints the help text and exits
            about
            shift
            ;;
        --)
            shift
            break
            ;;
        *)
            echo "传入错误!"
            exit 1
            ;;
    esac
done

# The first remaining positional argument that is a directory becomes the
# image directory (it takes precedence over a value given with -d).
for arg in "$@"
do
    if [[ -n ${arg} && -d ${arg} ]];then
        imgdir=${arg}
        break
    fi
done

# Stop here with a normal error exit when no image directory was supplied.
if [[ -z ${imgdir} ]];then
    echo '无参数,退出!' >&2
    exit 1
fi
# ---------------------------------------------------------------------------
# Main loop: run baidu_OCR.py on every image found by getdir and append the
# recognised text to the output file selected by the -o handling
# (globals read: imgdir, output, outfile, filechange).
# ---------------------------------------------------------------------------

# Directory containing this script; baidu_OCR.py is run from there.
ocr_dir=$(cd "$(dirname "$0")" && pwd)

# Private scratch file for the text of one recognition attempt (replaces a
# fixed, predictable name under /tmp); removed on exit.
ocr_cache=$(mktemp) || exit 1
trap 'rm -f -- "${ocr_cache}"' EXIT

# Read the image list once, one path per line, so that paths containing
# spaces stay intact and the count matches what is processed.
images=()
while IFS= read -r img
do
    images+=("${img}")
done < <(getdir "${imgdir}")

echo "开始处理图片,共${#images[@]}张"
num=1
for img in "${images[@]}"
do
    file="${img}"
    filename=$(basename "${file}")
    filedir=$(dirname "${file}")
    # Strip only the last extension, so "a.b.jpg" maps to "a.b.txt".
    unfilename=${filename%.*}
    unset ff check cimg

    # Pick the output file for this image.
    if [[ ${output} == yes ]]; then
        # -o without a value: <image name>.txt next to the image
        outputfile="${filedir}/${unfilename}.txt"
    elif [[ -d ${outfile} ]]; then
        # -o <directory>: <image name>.txt inside it, replacing an old result
        outfile=${outfile%/}
        outputfile="${outfile}/${unfilename}.txt"
        rm -f -- "${outputfile}"
    elif [[ -z ${output} && -z ${outfile} ]]; then
        # no -o at all: output.txt in the image's own directory; results of
        # an earlier run are cleared once, before the first image
        outputfile="${filedir}/output.txt"
        if [[ ${num} == 1 ]];then
            find "${imgdir}" -name 'output.txt' -exec rm -rf -- {} +
        fi
    else
        # -o <file>: everything goes into that single file, recreated once
        outputfile=${outfile}
        if [[ ${num} == 1 && -e ${outputfile} ]];then
            rm -f -- "${outputfile}"
        fi
    fi

    echo "第${num}张 → ${filename}"
    check=0
    ff=1
    while [[ ${check} == 0 ]];do
        # Compress images of 4000 KiB or more, at most once per image.
        if [[ ${cimg} != 1 && $(du -sk -- "${file}" | awk '{ print $1}') -ge 4000 ]];then
            echo "正在压缩图片,这可能会影响识别结果!!"
            if convert -quality 75 "${file}" "${filename}(new).jpg";then
                file="${filename}(new).jpg"
                # Set only after a successful conversion: the clean-up below
                # removes ${file}, which must never be the original image.
                cimg=1
            else
                echo "压缩失败"
            fi
        fi

        # The attempt succeeds only when the helper itself succeeds and its
        # text could be appended to the output file.
        # NOTE(review): assumes baidu_OCR.py exits non-zero when recognition
        # fails - confirm against the helper.
        if python3 "${ocr_dir}/baidu_OCR.py" -i "${file}" > "${ocr_cache}" 2>/dev/null \
                && cat "${ocr_cache}" >> "${outputfile}";then
            # Drop the last line of the output file when it consists of
            # digits only (presumably a trailing numeric line printed by the
            # helper - confirm).
            sed -i '${/^[0-9]\+$/d}' "${outputfile}"
            if [[ ${filechange} == 1 ]];then
                echo -e "\n=====识别结果====="
                cat "${ocr_cache}"
                echo -e "\n=====The END=====\n"
            else
                echo -e "成功\n"
            fi
            check=1
        else
            echo '失败,重新尝试'
            echo -e "第${ff}次\n"
            if [[ ${ff} == 3 ]];then
                # Third failure: give up on this image and mark the gap.
                check=1
                echo '识别失败,跳过'
                if [[ $(du -sk -- "${file}" | awk '{ print $1}') -ge 4000 ]];then
                    echo '这可能是由于文件过大所导致的,建议对图片进行压缩后再次尝试'
                else
                    echo '图片可能已经损坏,或者图片中不存在文字'
                fi
                echo -e "\n!!本段识别失败!!\n" >> "${outputfile}"
            fi
            (( ff++ ))
        fi
    done

    # Remove the temporary compressed copy whether or not recognition worked.
    if [[ ${cimg} == 1 ]];then
        rm -f -- "${file}"
    fi
    # Advance for every image, failed ones included, so the displayed index
    # stays correct and the "first image" clean-up above runs only once.
    (( num++ ))
done
echo "全部完成,退出"
exit 0