-
Notifications
You must be signed in to change notification settings - Fork 0
/
ocrc.bib
312 lines (297 loc) · 12 KB
/
ocrc.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
Selected Bibliography on OCR correction for Le Temps
June 16, 2016
References
[9] A. Kae, G. Huang, C. Doersch, and E. Learned-Miller. Improving state-of-the-art OCR through high-precision document-specific modeling. In Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition, pages 1935-1942, 2010.
[18] M. Wick, M. Ross, and E. Learned-Miller. Context-Sensitive Error Correction: Using Topic Models to Improve OCR. Ninth International Conference on Document Analysis and Recognition (ICDAR 2007) Vol 2, pages 1168-1172, 2007.
% Master's thesis on unsupervised post-correction of OCR errors.
@mastersthesis{ocr:Niklas,
  author = {Niklas, Kai},
  title  = {Unsupervised Post-Correction of {OCR} Errors},
  school = {Leibniz Universit{\"a}t Hannover},
  year   = {2010},
}
% LREC 2014 paper on text-induced corpus clean-up.
% The conference days are kept in eventdate (ISO range), which classic
% BibTeX styles ignore and biblatex parses.
@inproceedings{ocr:Reynaert:Corpus-Clean,
  author    = {Reynaert, Martin},
  title     = {Synergy of {Nederlab} and {@PhilosTEI}: Diachronic and Multilingual Text-induced Corpus Clean-up},
  booktitle = {Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC'14)},
  editor    = {Calzolari, Nicoletta and Choukri, Khalid and Declerck, Thierry and Loftsson, Hrafn and Maegaard, Bente and Mariani, Joseph and Moreno, Asuncion and Odijk, Jan and Piperidis, Stelios},
  year      = {2014},
  month     = may,
  eventdate = {2014-05-26/2014-05-31},
  pages     = {1224--1230},
  publisher = {European Language Resources Association (ELRA)},
  address   = {Reykjavik, Iceland},
  isbn      = {978-2-9517408-8-4},
  language  = {english},
}
% SPIE Document Recognition and Retrieval XXI paper on multiple-OCR error correction.
@inproceedings{ocr:Lund,
  author    = {Lund, William B. and Ringger, Eric K. and Walker, Daniel D.},
  title     = {How well does multiple {OCR} error correction generalize?},
  booktitle = {Proceedings of SPIE 9021, Document Recognition and Retrieval XXI},
  volume    = {9021},
  pages     = {1--13},
  publisher = {SPIE},
  address   = {San Francisco, California},
  year      = {2013},
  doi       = {10.1117/12.2042502},
  url       = {http://dx.doi.org/10.1117/12.2042502},
}
% Journal article on computer detection and correction of spelling errors.
@article{ocr:Damerau,
  author  = {Damerau, Fred J.},
  title   = {A technique for computer detection and correction of spelling errors},
  journal = {Communications of the ACM},
  volume  = {7},
  number  = {3},
  pages   = {171--176},
  year    = {1964},
}
% PBML 105 article by Eger, vor der Brueck and Mehler.
% NOTE(review): the title previously duplicated the one recorded for ocr:Mei;
% it has been replaced with the title listed for PBML 105, pp. 77-99.
% Confirm against the published article.
@article{ocr:Eger,
  author  = {Eger, Steffen and vor der Br{\"u}ck, Tim and Mehler, Alexander},
  title   = {A Comparison of Four Character-Level String-to-String Translation Models for ({OCR}) Spelling Error Correction},
  journal = {The Prague Bulletin of Mathematical Linguistics},
  volume  = {105},
  pages   = {77--99},
  year    = {2016},
}
% arXiv preprint (dblp CoRR record); eprint/archiveprefix carry the arXiv
% identifier that also appears in the volume and url fields.
@article{ocr:Mei,
  author        = {Jie Mei and
                   Aminul Islam and
                   Yajing Wu and
                   Abidalrahman Moh'd and
                   Evangelos E. Milios},
  title         = {Statistical Learning for {OCR} Text Correction},
  journal       = {CoRR},
  volume        = {abs/1611.06950},
  pages         = {1--10},
  year          = {2016},
  eprint        = {1611.06950},
  archiveprefix = {arXiv},
  url           = {http://arxiv.org/abs/1611.06950},
  timestamp     = {Thu, 01 Dec 2016 19:32:08 +0100},
  biburl        = {http://dblp.uni-trier.de/rec/bib/journals/corr/MeiIWMM16},
  bibsource     = {dblp computer science bibliography, http://dblp.org},
}
% DAS 2016 workshop paper on OCR error correction via character correction
% and feature-based word classification.
@inproceedings{ocr:KissosDershowitz,
  author    = {Kissos, Ido and Dershowitz, Nachum},
  title     = {{OCR} Error Correction Using Character Correction and Feature-Based Word Classification},
  booktitle = {2016 12th IAPR Workshop on Document Analysis Systems (DAS)},
  year      = {2016},
  pages     = {198--203},
  publisher = {IEEE Computer Society},
  address   = {Los Alamitos, CA, USA},
}
% ACM HDP '04 workshop paper on information access over OCR-degraded text.
@inproceedings{ocr:Taghva,
  author    = {Taghva, Kazem and Nartker, Thomas and Borsack, Julie},
  title     = {Information Access in the Presence of {OCR} Errors},
  booktitle = {Proceedings of the 1st ACM Workshop on Hardcopy Document Processing},
  series    = {HDP '04},
  year      = {2004},
  isbn      = {1-58113-976-4},
  location  = {Washington, DC, USA},
  pages     = {1--8},
  url       = {http://doi.acm.org/10.1145/1031442.1031443},
  doi       = {10.1145/1031442.1031443},
  acmid     = {1031443},
  publisher = {ACM},
  address   = {New York, NY, USA},
  keywords  = {categorization, document conversion, information extraction, markup},
}
% Very Large Corpora workshop paper on statistical, context-based OCR error correction.
@inproceedings{ocr:Tong,
  author    = {Tong, Xiang and Evans, David A.},
  title     = {A Statistical Approach to Automatic {OCR} Error Correction in Context},
  booktitle = {Fourth Workshop on Very Large Corpora},
  year      = {1996},
  pages     = {88--100},
  publisher = {Association for Computational Linguistics},
  address   = {Copenhagen},
  url       = {http://www.aclweb.org/anthology/W96-0108},
}
% Pattern Recognition article on unsupervised profiling of OCRed historical documents.
@article{ocr:Reffle,
  author     = {Reffle, Ulrich and Ringlstetter, Christoph},
  title      = {Unsupervised Profiling of {OCRed} Historical Documents},
  journal    = {Pattern Recognition},
  issue_date = {May, 2013},
  volume     = {46},
  number     = {5},
  month      = may,
  year       = {2013},
  issn       = {0031-3203},
  pages      = {1346--1357},
  url        = {http://dx.doi.org/10.1016/j.patcog.2012.10.002},
  doi        = {10.1016/j.patcog.2012.10.002},
  acmid      = {2431004},
  publisher  = {Elsevier Science Inc.},
  address    = {New York, NY, USA},
  keywords   = {Error detection and error correction, OCR postprocessing, Processing of historical documents, Statistical learning},
}
% Journal article on OCR post-processing using Google's online spelling suggestions.
@article{ocr:Bassil,
  author  = {Bassil, Youssef and Alwani, Mohammad},
  title   = {{OCR} Post-Processing Error Correction Algorithm using {Google} Online Spelling Suggestion},
  journal = {Journal of Emerging Trends in Computing and Information Sciences},
  volume  = {3},
  number  = {1},
  pages   = {90--99},
  year    = {2012},
}
% LT4DH 2016 workshop paper combining OCR and machine translation for historical documents.
@inproceedings{ocr:AfliWay,
  author    = {Afli, Haithem and Way, Andy},
  title     = {Integrating Optical Character Recognition and Machine Translation of Historical Documents},
  booktitle = {Proceedings of the Workshop on Language Technology Resources and Tools for Digital Humanities (LT4DH)},
  month     = dec,
  year      = {2016},
  address   = {Osaka, Japan},
  publisher = {The COLING 2016 Organizing Committee},
  pages     = {109--116},
  url       = {http://aclweb.org/anthology/W16-4015},
}
% LREC 2016 paper on using SMT for OCR error correction (dblp record).
@inproceedings{ocr:Afli,
  author    = {Haithem Afli and
               Zhengwei Qiu and
               Andy Way and
               P{\'{a}}raic Sheridan},
  title     = {Using {SMT} for {OCR} Error Correction of Historical Texts},
  booktitle = {Proceedings of the Tenth International Conference on Language Resources
               and Evaluation {LREC} 2016, Portoro{\v{z}}, Slovenia, May 23-28, 2016.},
  year      = {2016},
  pages     = {962--966},
  publisher = {ELRA},
  address   = {Portoro{\v{z}}, Slovenia},
  url       = {http://www.lrec-conf.org/proceedings/lrec2016/summaries/280.html},
  timestamp = {Tue, 30 Aug 2016 18:49:47 +0200},
  biburl    = {http://dblp2.uni-trier.de/rec/bib/conf/lrec/AfliQWS16},
  bibsource = {dblp computer science bibliography, http://dblp.org},
}
% TPDL 2015 paper on how OCR quality affects research tasks in digital archives (dblp record).
@InProceedings{ocr:Traub,
  title     = {Impact Analysis of {OCR} Quality on Research Tasks in Digital Archives},
  author    = {Traub, Myriam C. and van Ossenbruggen, Jacco and Hardman, Lynda},
  booktitle = {Research and Advanced Technology for Digital Libraries - 19th International
               Conference on Theory and Practice of Digital Libraries, {TPDL} 2015,
               Pozna{\'{n}}, Poland, September 14-18, 2015. Proceedings},
  publisher = {Springer},
  address   = {Berlin},
  year      = {2015},
  pages     = {252--263},
  doi       = {10.1007/978-3-319-24592-8_19},
  url       = {http://dx.doi.org/10.1007/978-3-319-24592-8_19},
  biburl    = {http://dblp.uni-trier.de/rec/bib/conf/ercimdl/TraubOH15},
  bibsource = {dblp computer science bibliography, http://dblp.org},
  timestamp = {Mon, 07 Sep 2015 11:21:01 +0200}
}
% DATeCH '14 paper on correcting noisy OCR output.
@inproceedings{ocr:Evershed,
  author    = {Evershed, John and Fitch, Kent},
  title     = {Correcting Noisy {OCR}: Context Beats Confusion},
  booktitle = {Proceedings of the First International Conference on Digital Access to Textual Cultural Heritage},
  series    = {DATeCH '14},
  year      = {2014},
  isbn      = {978-1-4503-2588-2},
  location  = {Madrid, Spain},
  pages     = {45--51},
  url       = {http://doi.acm.org/10.1145/2595188.2595200},
  doi       = {10.1145/2595188.2595200},
  acmid     = {2595200},
  publisher = {ACM},
  address   = {New York, NY, USA},
  keywords  = {OCR, automatic correction, historical documents, noisy text},
}
% Chapter in the IRCDL 2015 revised selected papers volume on OCR
% post-correction for a digitized Finnish newspaper collection.
@incollection{ocr:Kettunen,
  author    = {Kettunen, Kimmo},
  editor    = {Calvanese, Diego and De Nart, Dario and Tasso, Carlo},
  title     = {Keep, Change or Delete? Setting up a Low Resource {OCR} Post-correction Framework for a Digitized {Old Finnish} Newspaper Collection},
  booktitle = {Digital Libraries on the Move: 11th Italian Research Conference on Digital Libraries, IRCDL 2015, Bolzano, Italy, January 29-30, 2015, Revised Selected Papers},
  year      = {2016},
  publisher = {Springer International Publishing},
  address   = {Cham},
  pages     = {95--103},
  isbn      = {978-3-319-41938-1},
  doi       = {10.1007/978-3-319-41938-1_11},
  url       = {http://dx.doi.org/10.1007/978-3-319-41938-1_11},
}
% DATeCH '14 paper on OCR ground truths and post-correction gold standards.
@inproceedings{ocr:Reynaert:2014,
  author    = {Reynaert, Martin},
  title     = {On {OCR} Ground Truths and {OCR} Post-correction Gold Standards, Tools and Formats},
  booktitle = {Proceedings of the First International Conference on Digital Access to Textual Cultural Heritage},
  series    = {DATeCH '14},
  year      = {2014},
  isbn      = {978-1-4503-2588-2},
  location  = {Madrid, Spain},
  pages     = {159--166},
  numpages  = {8},
  url       = {http://doi.acm.org/10.1145/2595188.2595216},
  doi       = {10.1145/2595188.2595216},
  acmid     = {2595216},
  publisher = {ACM},
  address   = {New York, NY, USA},
  keywords  = {FoLiA XML, OCR post-correction, TICCL, evaluation, gold standard, ground truth},
}
% CICLing'08 paper on non-interactive OCR post-correction for large digitization projects.
@inproceedings{ocr:Reynaert:2008,
  author    = {Reynaert, Martin},
  title     = {Non-interactive {OCR} Post-correction for Giga-scale Digitization Projects},
  booktitle = {Proceedings of the 9th International Conference on Computational Linguistics and Intelligent Text Processing},
  series    = {CICLing'08},
  year      = {2008},
  isbn      = {3-540-78134-X, 978-3-540-78134-9},
  location  = {Haifa, Israel},
  pages     = {617--630},
  numpages  = {14},
  url       = {http://dl.acm.org/citation.cfm?id=1787578.1787647},
  acmid     = {1787647},
  publisher = {Springer-Verlag},
  address   = {Berlin, Heidelberg},
}
% LREC 2016 paper evaluating OCR post-correction on Early Dutch Books Online (dblp record).
@inproceedings{ocr:Reynaert:2016,
  author    = {Reynaert, Martin},
  title     = {{OCR} Post-Correction Evaluation of {Early Dutch Books Online} - Revisited},
  booktitle = {Proceedings of the Tenth International Conference on Language Resources
               and Evaluation {LREC} 2016, Portoro{\v{z}}, Slovenia, May 23-28, 2016.},
  year      = {2016},
  pages     = {967--974},
  publisher = {ELRA},
  address   = {Portoro{\v{z}}, Slovenia},
  url       = {http://www.lrec-conf.org/proceedings/lrec2016/summaries/596.html},
  timestamp = {Tue, 30 Aug 2016 18:49:47 +0200},
  biburl    = {http://dblp2.uni-trier.de/rec/bib/conf/lrec/Reynaert16},
  bibsource = {dblp computer science bibliography, http://dblp.org},
}
% Journal article on indexing by latent semantic analysis (JASIS 41(6), 1990).
@Article{cl:Deerwester,
  title     = {Indexing by latent semantic analysis},
  author    = {Deerwester, Scott and
               Dumais, Susan T. and
               Furnas, George W. and
               Landauer, Thomas K. and
               Harshman, Richard},
  journal   = {Journal of the American Society for Information Science},
  year      = {1990},
  volume    = {41},
  number    = {6},
  pages     = {391--407},
  publisher = {Wiley Subscription Services, Inc., A Wiley Company},
  issn      = {1097-4571},
  doi       = {10.1002/(SICI)1097-4571(199009)41:6<391::AID-ASI1>3.0.CO;2-9},
  url       = {http://dx.doi.org/10.1002/(SICI)1097-4571(199009)41:6<391::AID-ASI1>3.0.CO;2-9}
}
% LREC 2010 workshop paper on a software framework for topic modelling.
% Only acronyms are brace-protected so bibliography styles can still apply
% their own title casing.
@inproceedings{cl:Rehurek:Gensim,
  author    = {{\v{R}}eh{\r{u}}{\v{r}}ek, Radim and Sojka, Petr},
  title     = {Software Framework for Topic Modelling with Large Corpora},
  booktitle = {Proceedings of the {LREC} 2010 Workshop on New
               Challenges for {NLP} Frameworks},
  pages     = {45--50},
  year      = {2010},
  month     = may,
  day       = {22},
  publisher = {ELRA},
  address   = {Valletta, Malta},
  url       = {http://is.muni.cz/publication/884893/en},
  language  = {English},
}
% SIGFSM workshop paper on data-driven spelling correction with weighted finite-state methods.
@inproceedings{ocr:Silfverberg,
  author    = {Silfverberg, Miikka and Kauppinen, Pekka and Lind{\'e}n, Krister},
  title     = {Data-driven spelling correction using weighted finite-state methods},
  booktitle = {Proceedings of the SIGFSM Workshop on Statistical NLP and Weighted Automata},
  month     = aug,
  year      = {2016},
  address   = {Berlin, Germany},
  publisher = {Association for Computational Linguistics},
  pages     = {51--59},
}