-
Notifications
You must be signed in to change notification settings - Fork 0
/
ocr_repair.py
100 lines (69 loc) · 3.63 KB
/
ocr_repair.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# -*- coding: utf-8 -*-
import json
from fix_tools import *
import re
import operator
# Load the static resources. Each text file is read as UTF-8, one entry per
# line, with the trailing newline stripped from every line.
with open("resources/testword_list.txt", "r", encoding="utf-8") as f:
    testword_list = [x.rstrip("\n") for x in f]
with open("resources/fix_data.tsv", "r", encoding="utf-8") as f:
    # Every row of the TSV is handed to parse_data_row() (from fix_tools).
    corrections = [parse_data_row(x.rstrip("\n")) for x in f]
with open("resources/test_list.txt", "r", encoding="utf-8") as f:
    test_list = [x.rstrip("\n") for x in f]
with open("resources/trigram_list.json", "r", encoding="utf-8") as f:
    trigram_table = json.load(f)
ensure_dbs()
# The necessary lookup tables are built here, once, at import time.
correction_list = parse_corrections_to_list(corrections)
split_list = get_split_list(correction_list)
character_frequency_table = get_character_frequency_table(testword_list)
replacement_probabilities = build_replacement_probability_table(
    correction_list, character_frequency_table
)
def analyse_word(word):
    """Build a correction matrix for *word* and score the routes through it.

    The word is walked character by character. For every position a row
    (a dict mapping a candidate replacement string to a weight) is appended
    to the matrix. Multi-character ("split") candidates starting at that
    position are taken into account via ``check_split_list``.

    The finished matrix is handed to ``run_through_matrix`` together with
    the word and the module-level ``trigram_table``. According to the
    original author's notes, that function scores each possible route
    through the matrix and keeps only the 100 most probable candidates at
    each step.

    :param word: the string to analyse.
    :return: whatever ``run_through_matrix`` returns (the callers in this
        file treat it as an iterable of dicts with ``"fragment"`` and
        ``"prob"`` keys).
    """
    correction_matrix = []
    for i, c in enumerate(word):
        # Characters missing from the table fall back to the "unknown" entry.
        # NOTE(review): from here on the sentinel string, not the original
        # character, is what ends up in the matrix -- confirm that
        # run_through_matrix() expects this.
        if c not in replacement_probabilities:
            c = "unknown"

        if len(replacement_probabilities[c]) > 1:
            # Candidate source sequences starting at this position: the
            # character itself plus any split sequences found in the
            # remainder of the word.
            possibilities = [c]
            possibilities.extend(check_split_list(split_list, word[i:]))

            # The weight applied to single-character candidates is the
            # self-replacement value of the last multi-character candidate
            # (presumably the probability that the sequence is left intact
            # -- TODO confirm); it stays 1 when there is none.
            factor = 1
            for pos in possibilities:
                if len(pos) > 1:
                    factor = replacement_probabilities[pos][pos]

            row = {}
            for pos in possibilities:
                is_multi = len(pos) > 1
                weight = 1 if is_multi else factor
                for rep in replacement_probabilities[pos]:
                    rep_s = add_split_marks(rep, len(pos))
                    # A multi-character sequence mapping to itself adds no
                    # row entry of its own; it only contributes via `factor`.
                    if is_multi and rep == pos:
                        continue
                    row[rep_s] = replacement_probabilities[pos][rep] * weight
            correction_matrix.append(row)
        else:
            # Only one known replacement: keep the character with weight 1.
            correction_matrix.append({c: 1})

    guesses = run_through_matrix(word, correction_matrix, trigram_table)
    return guesses
def calculate_final_probabilities(guesses):
    """Re-weight each guess by the probability of its fragment as a word.

    :param guesses: iterable of mappings, each with a ``"fragment"`` key
        (the candidate string) and a ``"prob"`` key (its current weight).
    :return: dict mapping each fragment to ``prob`` multiplied by
        ``get_word_probability(fragment)``. If the same fragment occurs
        more than once, the last occurrence wins.
    """
    return {
        guess["fragment"]: guess["prob"] * get_word_probability(guess["fragment"])
        for guess in guesses
    }
def analyse(word, n=3):
    """Return the *n* most probable corrections for *word*.

    The candidates produced by ``analyse_word`` are re-scored by
    ``calculate_final_probabilities`` and then ordered by descending score.

    :param word: the word to be analysed.
    :param n: length of the returned list of most probable outcomes; it is
        passed through ``int()`` before being used as the slice bound.
    :return: list of ``(fragment, probability)`` tuples, best first.
    """
    candidates = analyse_word(word)
    scored = calculate_final_probabilities(candidates)
    ranked = sorted(scored.items(), key=operator.itemgetter(1), reverse=True)
    limit = int(n)
    return ranked[:limit]
# if i < 11 :print(word, guesses[i])
# if guesses[i][0] == "maksakoon":
# print(i, word, guesses[i])
# maksakoon = guesses[i]
#print(word_probability_table["kostaaksensa"]*maksakoon[1]/word_probability_table["nonce"])
#print(correction_probabilities["f"])
#print(scramble_probabilities["f"])