forked from jiangxiluning/chinese-ocr
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmodel.py
633 lines (522 loc) · 29.4 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
# coding:utf-8
import os
import shutil
import sys
import cv2
import re
import numpy as np
sys.path.append('ctpn')
from math import sqrt, degrees, atan2, fabs, sin, cos, radians
from PIL import Image
from random import randint
from ctpn.text_detect import text_detect
from crnn.crnn import crnnOcr
from crnn_preprocessing import preprocessing
from utils import Queue, get_img_difference, get_union_coordinate
# global constant
# 文本框最小高度(0.045*720=32.4px)
G_MIN_BOX_HEIGHT = 0.045
# # 文本框最大高度(0.088*720=63.36px)
# G_MAX_BOX_HEIGHT = 0.088
# TODO:news
# 文本框最大高度(0.1*720=72px)
G_MAX_BOX_HEIGHT = 0.1
# 文本最小高度(与preprocessing输出对比)(0.028*720=20.16px)
G_MIN_SUBTITLE_HEIGHT = 0.028
# 文本最大高度(与preprocessing输出对比)(0.070*720=50.4px)
G_MAX_SUBTITLE_HEIGHT = 0.070
# 需要计算限制参数的帧数 # TODO:finetune
G_RAPID_UPDATE_FRAME = 500
# 信息存储在队列中的帧数
G_COMPATE_FRAME_NUM = 2
# 图片相似度阈值 # TODO:finetune
G_DIFFERENT_THRESHOLD = 10
# global variables
# 字幕框高度(每一帧进行计算,初始值:0.04*720=28.8) TODO:finetune
g_subtitle_height = 0.04
g_top = 0.7
g_bottom = 1
g_frame_num_with_subtitle = 0
g_results_list = []
g_canny_img_queue = Queue(G_COMPATE_FRAME_NUM)
g_recs_queue = Queue(G_COMPATE_FRAME_NUM)
# ui显示用的字符串
g_str_ui = ""
def crnnRec(im, text_recs):
"""
crnn模型,ocr识别
@@model,
@@converter,
@@im:Array
@@text_recs:text box
"""
results = {}
xDim, yDim = im.shape[1], im.shape[0]
for index, rec in enumerate(text_recs):
results[index] = [rec, ]
pt1 = (max(1, rec[0]), max(1, rec[1]))
pt2 = (rec[2], rec[3])
pt3 = (min(rec[6], xDim - 2), min(yDim - 2, rec[7]))
pt4 = (rec[4], rec[5])
degree = degrees(atan2(pt2[1] - pt1[1], pt2[0] - pt1[0])) ##图像倾斜角度
partImg = dumpRotateImage(im, degree, pt1, pt2, pt3, pt4)
image = Image.fromarray(partImg).convert('L')
sim_pred = crnnOcr(image)
results[index].append(sim_pred) ##识别文字
return results
def dumpRotateImage(img, degree, pt1, pt2, pt3, pt4):
height, width = img.shape[:2]
heightNew = int(width * fabs(sin(radians(degree))) + height * fabs(cos(radians(degree))))
widthNew = int(height * fabs(sin(radians(degree))) + width * fabs(cos(radians(degree))))
matRotation = cv2.getRotationMatrix2D((width * 0.5, height * 0.5), degree, 1)
matRotation[0, 2] += (widthNew - width) * 0.5
matRotation[1, 2] += (heightNew - height) * 0.5
imgRotation = cv2.warpAffine(img, matRotation, (widthNew, heightNew), borderValue=(255, 255, 255))
pt1 = list(pt1)
pt3 = list(pt3)
[[pt1[0]], [pt1[1]]] = np.dot(matRotation, np.array([[pt1[0]], [pt1[1]], [1]]))
[[pt3[0]], [pt3[1]]] = np.dot(matRotation, np.array([[pt3[0]], [pt3[1]], [1]]))
ydim, xdim = imgRotation.shape[:2]
imgOut = imgRotation[max(1, int(pt1[1])):min(ydim - 1, int(pt3[1])), max(1, int(pt1[0])):min(xdim - 1, int(pt3[0]))]
# height,width=imgOut.shape[:2]
return imgOut
def sort_box(box):
"""
对box排序,及页面进行排版
text_recs[index, 0] = x1
text_recs[index, 1] = y1
text_recs[index, 2] = x2
text_recs[index, 3] = y2
text_recs[index, 4] = x3
text_recs[index, 5] = y3
text_recs[index, 6] = x4
text_recs[index, 7] = y4
"""
box = sorted(box, key=lambda x: sum([x[1]])) # 按左上角y轴坐标从上到下排序
return box
# 字幕过滤1
def subtitle_filter1(boxes, img_width, img_height):
"""
对以下情况的字幕框进行过滤:1.文字位置不在ROI中,2.CTPN检测得到的字幕框高度较小,3.字幕框高度与长度的比值
:param boxes:
:param img_height:
:param img_width:
:return:
"""
# roi限制(纵向限制由CTPN完成,即传入CTPN的图像已经过纵向裁剪)
ROI_X = 0.5
# 最大字幕框高度与长度比(height/length)TODO:finetune
MAX_HL_RATIO = 1
temp = []
for index, box in enumerate(boxes):
# 将不满足限制条件的文字框加入到待删除列表中
if box[0] > img_width * ROI_X: # 字幕框最左端出现在屏幕右半边
temp.append(index)
if (box[5] - box[1]) < img_height * G_MIN_BOX_HEIGHT or (box[5] - box[1]) > img_height * G_MAX_BOX_HEIGHT: # 字幕框高度较小或较大
temp.append(index)
if (box[5] - box[1]) > (box[2] - box[0]) * MAX_HL_RATIO: # 字幕框高度与长度比值大于1
temp.append(index)
# TODO: 除了滚动字幕外出现在中间的字幕框也去掉(离散框)
boxes = np.delete(boxes, temp, axis=0)
return sort_box(boxes)
# 字幕过滤2
def subtitle_filter2(boxes, resize_height, origin_height, subtitle_height_list, canny_img2_list, output_process=False):
"""
在进行canny图像提取之后再次进行过滤,大幅更新限制参数时使用较为宽松的限制条件,小幅更新时使用严格的限制条件
对以下情况进行过滤:1.字幕高度在合理范围之外,2.字幕位置(Y轴)在合理范围之外(严格条件下)
:param boxes:
:param resize_height:
:param origin_height:
:param subtitle_height_list:
:param canny_img2_list:
:param output_process:
:return:
"""
# 条件收束前字幕高度限制
HEIGHT_MIN_DELTA = 0.01
HEIGHT_MAX_DELTA = 0.03
# 条件收束后字幕高度限制
HEIGHT_DELTA_STRICT = 0.01
# 条件收束后字幕Y轴位置限制
SUBTITLE_LOCATION_Y_DELTA = 0.02
global g_str_ui
temp = []
for index, subtitle_height in enumerate(subtitle_height_list):
if g_frame_num_with_subtitle < G_RAPID_UPDATE_FRAME: # 大幅更新限制参数时应使用较宽松的限制条件
if subtitle_height < origin_height * (g_subtitle_height - HEIGHT_MIN_DELTA) \
or subtitle_height > origin_height * (g_subtitle_height + HEIGHT_MAX_DELTA): # 只对字幕高度进行限制
temp.append(index)
if output_process:
g_str_ui += "loose restriction\n"
print("loose restriction")
g_str_ui += "subtitle_height:" + str(subtitle_height) + ", restriction: max:" + str(
origin_height*(g_subtitle_height + HEIGHT_MAX_DELTA)) + " min:" + str(
origin_height*(g_subtitle_height - HEIGHT_MIN_DELTA)) + "\n"
print("subtitle_height:" + str(subtitle_height) + ", restriction: max:" + str(
origin_height * (g_subtitle_height + HEIGHT_MAX_DELTA)) + " min:" + str(
origin_height * (g_subtitle_height - HEIGHT_MIN_DELTA)))
else: # 小幅更新限制参数时应使用较为严格的限制条件
if abs(subtitle_height / origin_height - g_subtitle_height) > HEIGHT_DELTA_STRICT \
or boxes[index][1] < (g_top - SUBTITLE_LOCATION_Y_DELTA) * resize_height \
or boxes[index][7] > (g_bottom + SUBTITLE_LOCATION_Y_DELTA) * resize_height: # 对字幕高度及在Y轴上的位置进行严格限制
temp.append(index)
if output_process:
g_str_ui += "tight restriction\n"
print("tight restriction")
g_str_ui += "subtitle_height:" + str(subtitle_height) + ", restriction: max:" + str(
origin_height*(g_subtitle_height + HEIGHT_DELTA_STRICT)) + " , min:" + str(
origin_height*(g_subtitle_height - HEIGHT_DELTA_STRICT)) + "\n"
print("subtitle_height:" + str(subtitle_height) + ", restriction: max:" + str(
origin_height*(g_subtitle_height + HEIGHT_DELTA_STRICT)) + " , min:" + str(
origin_height*(g_subtitle_height - HEIGHT_DELTA_STRICT)))
g_str_ui += "boxes top:" + str(boxes[index][1]) + ", restriction: min:" + str(
(g_top - SUBTITLE_LOCATION_Y_DELTA)*resize_height) + "\n"
print("boxes top:" + str(boxes[index][1]) + ", restriction: min:" + str(
(g_top - SUBTITLE_LOCATION_Y_DELTA)*resize_height))
g_str_ui += "boxes bottom:" + str(boxes[index][7]) + ", restriction: max:" + str(
(g_bottom + SUBTITLE_LOCATION_Y_DELTA)*resize_height) + "\n"
print("boxes bottom:" + str(boxes[index][7]) + ", restriction: max:" + str(
(g_bottom + SUBTITLE_LOCATION_Y_DELTA)*resize_height))
boxes = np.delete(boxes, temp, axis=0)
# 对用于比较相似度的canny图像列表也进行更新,去除不符合的图像
canny_img2_list = [canny_img2_list[i] for i in range(len(canny_img2_list)) if i not in temp]
return boxes, canny_img2_list
# 获取原始尺寸下的坐标
def convert_to_origin_coordinate(text_recs, resize_ratio):
tmp = np.zeros((len(text_recs), 8), np.int)
for index1, text_rec in enumerate(text_recs):
for index2, point in enumerate(text_rec):
tmp[index1, index2] = point / resize_ratio
return tmp
def crop_img(img, video_name, output_path, boxes, frameNum):
base_name = video_name.split('/')[-1]
# 对每一帧分别创建一个文件夹存放截取出的字幕
if os.path.exists(os.path.join(output_path, "cropped_pic_{}_{}".format(base_name.split('.')[0], frameNum))):
shutil.rmtree(os.path.join(output_path, "cropped_pic_{}_{}".format(base_name.split('.')[0], frameNum)))
os.makedirs(os.path.join(output_path, "cropped_pic_{}_{}".format(base_name.split('.')[0], frameNum)))
for i, box in enumerate(boxes):
left = max(min(box[0], box[4]), 0)
top = min(box[1], box[3])
right = min(max(box[2], box[6]), img.shape[1])
bottom = max(box[5], box[7])
cropped = img[int(top):int(bottom), int(left):int(right)] # 高度、宽度
cv2.imwrite(os.path.join(output_path, "cropped_pic_{}_{}".format(base_name.split('.')[0], frameNum),
"{}_{}_{}.jpg".format(base_name.split('.')[0], frameNum, str(i))), cropped)
def delete_overlap_get_scroll_list(text_recs, resize_im_height, output_process=False):
"""
返回是否为滚动字幕的列表,以及合并了滚动字幕中重叠部分的字幕框
:param text_recs:
:param resize_im_height:
:param output_process
:return:
"""
no_overlap = []
uncertain_scroll = []
is_scroll = []
result_text_recs = []
MIN_SCROLL_Y_TOP = 0.88
global str_ui
for i in range(len(text_recs)):
# TODO:考虑上方符合要求,但是下方不符合要求的情况,这样处理会出现bug
# TODO:暂时去除对滚动字幕下方的限制,避免bug
# if min(text_recs[i][5], text_recs[i][7]) > resize_im_height * 0.93 and min(text_recs[i][1], text_recs[i][3]) > resize_im_height * 0.88:
if min(text_recs[i][1], text_recs[i][3]) > resize_im_height * MIN_SCROLL_Y_TOP:
uncertain_scroll.append(i)
# TODO:考虑上方符合要求,但是下方不符合要求的情况,这样处理会出现bug
len_of_no_scroll = len(text_recs) - len(uncertain_scroll)
is_scroll.extend([False] * len_of_no_scroll)
result_text_recs.extend(text_recs[:len_of_no_scroll])
if len(uncertain_scroll) > 1:
no_overlap.append(text_recs[uncertain_scroll[0]])
for index in uncertain_scroll[1:]:
for i, rec in enumerate(no_overlap):
# 检测是否有重叠字幕,有则合并重叠字幕,否则将直接将其添加到no_overlap中
union = get_union_coordinate(rec, text_recs[index])
if union:
if output_process:
str_ui += "overlap!!!\tunion_coordinate: " + union + "\n"
print("overlap!!!\tunion_coordinate: ", union)
# 更新rec到no_scroll
no_overlap[i] = [union[2], union[0], union[3], union[0], union[2], union[1], union[3], union[1]]
break
if not union:
no_overlap.append(text_recs[index])
elif len(uncertain_scroll) == 1:
no_overlap.append(text_recs[uncertain_scroll[0]])
is_scroll.extend([True] * len(no_overlap))
result_text_recs.extend(no_overlap)
result_text_recs = sorted(result_text_recs, key=lambda x: sum([x[1]]))
return is_scroll, result_text_recs
def update_subtitle_height_restriction(subtitle_height_list, text_recs, origin_img_height, resize_im_height, output_process=False):
"""
更新字幕高度限制参数,在指定帧数前进行大幅更新,之后小幅更新
:param subtitle_height_list:
:param text_recs:
:param origin_img_height:
:param resize_im_height:
:param output_process:
:return:
"""
global g_subtitle_height
global g_top
global g_bottom
global g_frame_num_with_subtitle
global g_str_ui
UPDATE_RATE_RAPID = 0.1
UPDATE_RATE_SLOW = 0.05
# 字幕最小高度(与preprocessing输出对比)
min_subtitle_height = G_MIN_SUBTITLE_HEIGHT * origin_img_height
max_subtitle_height = G_MAX_SUBTITLE_HEIGHT * origin_img_height
if subtitle_height_list and g_frame_num_with_subtitle < G_RAPID_UPDATE_FRAME: # 存在字幕,且需要大幅度更新限制参数
index = randint(0, len(subtitle_height_list) - 1) # 同一帧中检测到多个疑似字幕,则从中随机选择一个进行数据更新
if subtitle_height_list[index] > min_subtitle_height and subtitle_height_list[index] < max_subtitle_height:
g_subtitle_height = (1 - UPDATE_RATE_RAPID) * g_subtitle_height + UPDATE_RATE_RAPID * subtitle_height_list[index] / origin_img_height
g_top = (1 - UPDATE_RATE_RAPID) * g_top + UPDATE_RATE_RAPID * text_recs[index][1] / resize_im_height
g_bottom = (1 - UPDATE_RATE_RAPID) * g_bottom + UPDATE_RATE_RAPID * text_recs[index][7] / resize_im_height
g_frame_num_with_subtitle = g_frame_num_with_subtitle + 1
elif subtitle_height_list and g_frame_num_with_subtitle >= G_RAPID_UPDATE_FRAME: # 存在字幕,且需要小幅度更新限制参数
index = randint(0, len(subtitle_height_list) - 1)
if subtitle_height_list[index] > min_subtitle_height and subtitle_height_list[index] < max_subtitle_height:
g_subtitle_height = (1 - UPDATE_RATE_SLOW) * g_subtitle_height + UPDATE_RATE_SLOW * subtitle_height_list[index] / origin_img_height
g_top = (1 - UPDATE_RATE_SLOW) * g_top + UPDATE_RATE_SLOW * text_recs[index][1] / resize_im_height
g_bottom = (1 - UPDATE_RATE_SLOW) * g_bottom + UPDATE_RATE_SLOW * text_recs[index][7] / resize_im_height
g_frame_num_with_subtitle = g_frame_num_with_subtitle + 1
if output_process:
g_str_ui += "height of subtitle:" + str(g_subtitle_height) + "\n" + "left_top_y:" + str(g_top) + "\n" + "left_bottom_y:" + str(g_bottom) + "\n"
print("height of subtitle:" + str(g_subtitle_height))
print("left_top_y:" + str(g_top))
print("left_bottom_y:" + str(g_bottom))
def voting(origin_recs, frame_result, canny_img2_list, origin_img_height, origin_img_width, img_no, output_process=False):
"""
投票系统,输出相同字幕中出现次数最多的检测结果
:param origin_recs:
:param frame_result:
:param canny_img2_list:
:param origin_img_height:
:param origin_img_width:
:param img_no:
:param output_process:
:return:
"""
global g_str_ui
# 将当前帧的canny2图像,origin_recs写入队列
if len(canny_img2_list) > 0:
if g_canny_img_queue.is_full(): # 如果某个队列已满,需要一起出队一个元素
g_canny_img_queue.dequeue()
g_recs_queue.dequeue()
g_canny_img_queue.enqueue(canny_img2_list)
g_recs_queue.enqueue(origin_recs)
# g_results_list:【result_dict_1,result_dict_2...】, result_dict:{result_1:times_1,result_2:times_2,...}
global g_results_list
new_results_list = []
DISTANCE_RESTRICT_PER = 0.01
max_distance = sqrt(origin_img_height ** 2 + origin_img_width ** 2)
is_match = False
if g_canny_img_queue.is_full() and g_results_list: # 队列满,则说明队列中存有前后两帧的canny_list,可进行相似对比
for i, curr_canny2 in enumerate(g_canny_img_queue.queue[1]): # 遍历当前帧的canny_list
curr_rec = g_recs_queue.queue[1][i]
# 简化坐标:8 itmes -> 4 items
curr_top = min(curr_rec[1], curr_rec[3])
curr_bottom = max(curr_rec[5], curr_rec[7])
curr_left = min(curr_rec[0], curr_rec[4])
curr_right = max(curr_rec[2], curr_rec[6])
simp_curr_rec = [curr_top, curr_bottom, curr_left, curr_right]
# 计算文本框中心点坐标
curr_center_x = (curr_left + curr_right) * 0.5
curr_center_y = (curr_top + curr_bottom) * 0.5
for j, last_canny2 in enumerate(g_canny_img_queue.queue[0]): # 遍历前一帧的canny_list
last_rec = g_recs_queue.queue[0][j]
# 简化坐标:8 itmes -> 4 items
last_top = min(last_rec[1], last_rec[3])
last_bottom = max(last_rec[5], last_rec[7])
last_left = min(last_rec[0], last_rec[4])
last_right = max(last_rec[2], last_rec[6])
simp_last_rec = [last_top, last_bottom, last_left, last_right]
# 计算文本框中心点
last_center_x = (last_left + last_right) * 0.5
last_center_y = (last_top + last_bottom) * 0.5
# 计算前后两帧中两个文本框中心点的距离
distance = sqrt((last_center_x - curr_center_x) ** 2 + (last_center_y - curr_center_y) ** 2)
if distance < DISTANCE_RESTRICT_PER * max_distance: # 距离小于阈值,能匹配到
# 取x轴上的重叠区域
x_list = sorted([last_left, last_right, curr_left, curr_right])
# 检测前后两帧字幕图像是否重叠
if (x_list[0] != x_list[1] and x_list[0] in simp_curr_rec and x_list[1] in simp_curr_rec) or (x_list[0] != x_list[1] and x_list[0] in simp_last_rec and x_list[1] in simp_last_rec):
raise ValueError("Two subtitles are not overlapped! One is {}, another is {}".format(str(simp_last_rec), str(simp_curr_rec)))
canny_part_xmin = x_list[1]
canny_part_xmax = x_list[2]
canny_len = canny_part_xmax - canny_part_xmin
# 取y轴上的重叠区域
y_list = sorted([last_top, last_bottom, curr_top, curr_bottom])
# 检测前后两帧字幕图像是否重叠
if (y_list[0] != y_list[1] and y_list[0] in simp_curr_rec and y_list[1] in simp_curr_rec) or (y_list[0] != y_list[1] and y_list[0] in simp_last_rec and y_list[1] in simp_last_rec):
raise ValueError("Two subtitles are not overlapped! One is {}, another is {}".format(str(simp_last_rec), str(simp_curr_rec)))
canny_part_ymin = y_list[1]
canny_part_ymax = y_list[2]
# 在x轴上缩小对比图像的面积
compute_part_len = canny_len / 2 // 2
x_min = (canny_part_xmin + canny_part_xmax) // 2 - compute_part_len
x_max = (canny_part_xmin + canny_part_xmax) // 2 + compute_part_len
# 转换成相对坐标
related_last_xmin = int(x_min - last_left)
related_last_xmax = int(x_max - last_left)
related_curr_xmin = int(x_min - curr_left)
related_curr_xmax = int(x_max - curr_left)
related_last_ymin = int(canny_part_ymin - last_top)
related_last_ymax = int(canny_part_ymax - last_top)
related_curr_ymin = int(canny_part_ymin - curr_top)
related_curr_ymax = int(canny_part_ymax - curr_top)
# 计算图片相似度
old_img = g_canny_img_queue.queue[0][j][related_last_ymin:related_last_ymax,
related_last_xmin:related_last_xmax]
new_img = g_canny_img_queue.queue[1][i][related_curr_ymin:related_curr_ymax,
related_curr_xmin:related_curr_xmax]
difference = get_img_difference(old_img, new_img, hash_type="perception")
if output_process:
g_str_ui += "the difference between" + str(img_no) + "_" + str(i) + "and " + str(img_no - 1) + "_" + str(j) + ":" + str(difference) + "\n"
print("the difference between", str(img_no), "_", str(i), "and ", str(img_no - 1), "_", str(j),
":", str(difference))
if difference >= G_DIFFERENT_THRESHOLD: # 匹配到,但是计算相似度的结果说明两个字幕不同
# from random import uniform, randint
# r = round(uniform(0, 1), 3)
# cv2.imwrite(os.path.join(output_path, "{}_{}_part_{}_{}.jpg".format(img_no, img_no - 1, r, round(uniform(0, 1), 3))), old_img)
# cv2.imwrite(os.path.join(output_path, "{}_{}_part_{}_{}.jpg".format(img_no, img_no - 1, r, round(uniform(0, 1), 3))), new_img)
if output_process:
g_str_ui += str(img_no) + "_" + str(i) + " different from" + str(img_no - 1) + "_" + str(j) + "\n"
print(str(img_no), "_", str(i), " different from", str(img_no - 1), "_", str(j))
# 去掉上一帧同位置的投票结果(不从原来的结果中读取前一帧的投票结果),并新增当前帧结果
new_results_list.append({frame_result[i][1]: 1})
else: # 匹配到,计算相似度的结果说明两个字幕相同
# 更新位置(按y轴坐标进行排序写入list),结果(value)+1[{result1:times1,result2:times2,...},{result1:times1,result2:times2,...},...]
result_dict = g_results_list[j] # 读取该文本框在前一帧中的投票结果
if frame_result[i][1] in result_dict: # 该结果之前已存在,数值+1
result_dict[frame_result[i][1]] = int(result_dict[frame_result[i][1]]) + 1
else: # 该结果之前不存在,新增一个投票项,数值为1
result_dict[frame_result[i][1]] = 1
new_results_list.append(result_dict)
is_match = True
break
else: # 距离大于阈值,没匹配到
is_match = False
# j遍历后都未被匹配,新建一个结果
if not is_match:
result_dict = {frame_result[i][1]: 1}
new_results_list.append(result_dict)
is_match = True
# 用当前帧结果替代前一帧结果以去除结果中未被比较的项,只保留本帧中出现字幕位置的结果
g_results_list = new_results_list
else: # 队列未满,则说明这是第一帧,需要初始化RESULT_LIST
for rlt in frame_result:
g_results_list.append({rlt[1]: 1})
if output_process:
g_str_ui += str(g_results_list) + "\n"
print(g_results_list)
def model(img, img_no, video_name, output_path, output_process=False):
global g_str_ui
g_str_ui = ''
origin_img = img.copy()
origin_img_height = origin_img.shape[0]
origin_img_width = origin_img.shape[1]
text_recs, drawn_img, resize_img, resize_ratio = text_detect(origin_img, top=0.7, bottom=1, left=0, right=1)
resize_im_height = resize_img.shape[0]
resize_im_width = resize_img.shape[1]
# 第一次字幕过滤(位置信息)
text_recs = subtitle_filter1(text_recs, resize_im_width, resize_im_height)
if text_recs is None or len(text_recs) == 0:
return [], img, [], resize_ratio, g_str_ui
# 获取原图坐标便于预处理
origin_recs = convert_to_origin_coordinate(text_recs, resize_ratio)
if output_process:
crop_img(img, video_name, output_path, origin_recs, img_no)
# crnn前预处理
origin_img = img.copy()
preprocessed_img, subtitle_height_list, canny_img2_list = preprocessing.p_picture(origin_recs, [False] * len(origin_recs), origin_img, img_no, video_name, output_path, 1)
# TODO: 1.mp4 4374帧字幕高度检测检测失常、拼接时纵向位置的偏差
# if output_process:
# np.savetxt(
# os.path.join(output_path, "subtitle_height_{}_{}.txt".format(video_name.split('/')[-1].split('.')[0], img_no)),
# subtitle_height_list, fmt='%d')
# 更新字幕高度限制
update_subtitle_height_restriction(subtitle_height_list, text_recs, origin_img_height, resize_im_height, output_process=output_process)
# 第二次字幕过滤(高度,更精确的纵坐标)
text_recs, canny_img2_list = subtitle_filter2(text_recs, resize_im_height, origin_img_height, subtitle_height_list, canny_img2_list, output_process=output_process)
if text_recs is None or len(text_recs) == 0:
return [], img, [], resize_ratio, g_str_ui
# 送入CRNN检测
result = crnnRec(resize_img, text_recs)
# 去除检测结果最前端非中文字符,出现重复字符的彻底解决方法应为重新训练
for key in result:
pattern = re.compile(u"^[^\u4e00-\u9fa5]+")
result[key][1] = re.sub(pattern, '', result[key][1])
# 根据位置对比是否同一字幕,进行投票
# 更新原图坐标
origin_recs = convert_to_origin_coordinate(text_recs, resize_ratio)
# 将result字典(index:信息)转换为列表,保持其按index排列
frame_result = [result[i] for i in result]
# 投票
voting(origin_recs, frame_result, canny_img2_list, origin_img_height, origin_img_width, img_no, output_process=output_process)
# 输出出现次数最多的结果
for index, data in enumerate(g_results_list):
for k in sorted(data, key=data.__getitem__, reverse=True):
if output_process:
g_str_ui += "result:" + k + ", times:" + str(data[k]) + "\n"
print("result:" + k + ", times:" + str(data[k]))
result[int(index)][1] = k
break
if output_process:
return result, preprocessed_img, [False] * len(result), resize_ratio, g_str_ui
else:
return result, origin_img, [False] * len(result), resize_ratio, g_str_ui
def model_news(img, img_no, video_name, output_path, output_process=False):
global g_str_ui
g_str_ui = ''
origin_img = img.copy()
origin_img_height = origin_img.shape[0]
origin_img_width = origin_img.shape[1]
# ctpn
text_recs, drawn_img, resize_img, resize_ratio = text_detect(origin_img, top=0.5, bottom=1, left=0, right=1)
resize_im_height = resize_img.shape[0]
resize_im_width = resize_img.shape[1]
# 第一次字幕过滤(位置信息)
text_recs = subtitle_filter1(text_recs, resize_im_width, resize_im_height)
if text_recs is None or len(text_recs) == 0:
return [], origin_img, [], resize_ratio, g_str_ui
# 合并滚动字幕中重叠部分,并获取滚动字幕标记
is_scroll, text_recs = delete_overlap_get_scroll_list(text_recs, resize_im_height, output_process=output_process)
# TODO:画出重叠区域的框,验证算法是否正确
origin_recs = convert_to_origin_coordinate(text_recs, resize_ratio)
if output_process:
crop_img(img, video_name, output_path, origin_recs, img_no)
# crnn前预处理
origin_img = img.copy()
preprocessed_img, canny_img2_list = preprocessing.p_picture(origin_recs, is_scroll, origin_img, img_no, video_name, output_path, 2)
# 送入CRNN检测
result = crnnRec(resize_img, text_recs)
# 去除检测结果最前端非中文字符,出现重复字符的彻底解决方法应为重新训练
for key in result:
pattern = re.compile(u"^[^\u4e00-\u9fa5]+")
result[key][1] = re.sub(pattern, '', result[key][1])
# 根据位置对比是否同一字幕,进行投票
# 去除is_scroll为True的字幕框,不进行投票
no_scroll_result = []
no_scroll_canny_list = []
no_scroll_recs = []
for i in result:
if not is_scroll[i]:
no_scroll_result.append(result[i])
no_scroll_canny_list.append(canny_img2_list[i])
no_scroll_recs.append(origin_recs[i])
# 投票
voting(no_scroll_recs, no_scroll_result, no_scroll_canny_list, origin_img_height, origin_img_width, img_no, output_process=output_process)
# 输出出现次数最多的结果
for index, data in enumerate(g_results_list):
for k in sorted(data, key=data.__getitem__, reverse=True):
if output_process:
g_str_ui += "result:" + k + ", times:" + str(data[k]) + "\n"
print("result:" + k + ", times:" + str(data[k]))
result[int(index)][1] = k
break
if output_process:
return result, preprocessed_img, is_scroll, resize_ratio, g_str_ui
else:
return result, origin_img, is_scroll, resize_ratio, g_str_ui