-
Notifications
You must be signed in to change notification settings - Fork 102
/
Copy pathpdf_search_replace_advanced.go
321 lines (275 loc) · 8.01 KB
/
pdf_search_replace_advanced.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
/*
* pdf_search_replace_advanced.go - More advanced example of search and replace with UniPDF.
* Replaces <text> with <replace text> in the output PDF. Takes font encoding
* into account when finding occurrences of the search term. In addition, it
* attempts to identify occurrences of the search term which are contained within
* multiple PDF text processing operators (Tj, TJ).
*
* Syntax: go run pdf_search_replace_advanced.go <input.pdf> <output.pdf> <text> <replace text>
*/
package main
import (
"errors"
"fmt"
"os"
"strings"
"github.com/unidoc/unipdf/v3/common"
"github.com/unidoc/unipdf/v3/common/license"
"github.com/unidoc/unipdf/v3/contentstream"
"github.com/unidoc/unipdf/v3/core"
"github.com/unidoc/unipdf/v3/model"
"github.com/unidoc/unipdf/v3/model/optimize"
)
// init loads the metered UniDoc license key before the library is used.
// The key is taken from the UNIDOC_LICENSE_API_KEY environment variable; a
// free key can be created at https://cloud.unidoc.io. The program panics if
// SetMeteredKey reports an error.
func init() {
	apiKey := os.Getenv(`UNIDOC_LICENSE_API_KEY`)
	if err := license.SetMeteredKey(apiKey); err != nil {
		panic(err)
	}
}
// textChunk ties one string operand of a text-showing operator to its decoded
// text, so that the operand can later be rewritten in place.
type textChunk struct {
	// val is the decoded text of the operand.
	val string
	// idx is the byte offset of val inside the concatenated text of all chunks.
	idx int

	// font is the font that was selected when the operand was read; it is nil
	// when no font had been loaded yet.
	font *model.PdfFont
	// strObj points at the string object inside the parsed content stream;
	// encode overwrites the pointed-to object.
	strObj *core.PdfObjectString
}
// encode writes the chunk's current text (tc.val) back into the content
// stream string object the chunk was read from.
//
// When the chunk has a font, the text is converted to that font's character
// codes with StringToCharcodeBytes; runes that cannot be encoded are reported
// at debug level together with the text and the resulting bytes. When the
// chunk has no font, the text is written back verbatim instead of being
// replaced by an empty string, so text read without a font is not erased.
func (tc *textChunk) encode() {
	// Default to the raw text; this is what a font-less chunk was read as.
	encoded := tc.val
	if font := tc.font; font != nil {
		encodedBytes, numMisses := font.StringToCharcodeBytes(tc.val)
		if numMisses != 0 {
			common.Log.Debug("WARN: some runes could not be encoded.\n\t%s -> %v", tc.val, encodedBytes)
		}
		encoded = string(encodedBytes)
	}

	// Overwrite the pointed-to object so the parsed operations see the change.
	*tc.strObj = *core.MakeString(encoded)
}
// textChunks collects the text chunks of a page together with the
// concatenation of their decoded values.
type textChunks struct {
	// chunks holds the chunks in the order in which they were appended.
	chunks []*textChunk
	// text is the concatenation of the chunk values; each chunk's idx is an
	// offset into this string.
	text string
}
// replace substitutes every non-overlapping occurrence of search in tc.text
// with replacement and rewrites the string operands of the affected chunks.
//
// Occurrences are located in the concatenated text, so a search term that is
// split across several chunks (e.g. several Tj/TJ operands) is still found.
// For each occurrence:
//   - the chunk that contains the first byte of the occurrence receives the
//     replacement text at that position;
//   - the bytes of the occurrence are removed from every chunk it covers,
//     which may leave following chunks shorter or empty.
//
// Each chunk is rebuilt from its original value in a single pass, so several
// occurrences inside the same chunk are handled correctly even when
// replacement and search differ in length. After the call tc.text holds the
// replaced text and every chunk.idx is refreshed to point into it.
//
// An empty search term is ignored.
func (tc *textChunks) replace(search, replacement string) {
	// An empty term matches at every offset; there is nothing sensible to do.
	if search == "" {
		return
	}

	// Collect the start offsets of all matches, left to right and
	// non-overlapping. These are exactly the occurrences strings.Replace
	// rewrites in tc.text at the end of this method.
	var matches []int
	for from := 0; ; {
		i := strings.Index(tc.text[from:], search)
		if i < 0 {
			break
		}
		matches = append(matches, from+i)
		from += i + len(search)
	}
	if len(matches) == 0 {
		return
	}

	next := 0   // index of the first match that can still overlap a chunk
	newIdx := 0 // offset of the current chunk inside the replaced text
	for _, chunk := range tc.chunks {
		orig := chunk.val
		start, end := chunk.idx, chunk.idx+len(orig)

		// Drop matches that end at or before the start of this chunk.
		for next < len(matches) && matches[next]+len(search) <= start {
			next++
		}

		if next < len(matches) && matches[next] < end {
			// At least one match overlaps this chunk: rebuild its value.
			var b strings.Builder
			pos := start // first original offset not yet copied or erased
			for k := next; k < len(matches) && matches[k] < end; k++ {
				mStart, mEnd := matches[k], matches[k]+len(search)
				if mStart >= start {
					// The match begins in this chunk, so the replacement
					// text is inserted here.
					b.WriteString(orig[pos-start : mStart-start])
					b.WriteString(replacement)
				}
				// Skip the matched bytes; a match may run past this chunk,
				// in which case the rest is erased from the next chunks.
				pos = mEnd
				if pos > end {
					pos = end
				}
			}
			b.WriteString(orig[pos-start:])

			chunk.val = b.String()
			chunk.encode()
		}

		chunk.idx = newIdx
		newIdx += len(chunk.val)
	}

	tc.text = strings.Replace(tc.text, search, replacement, -1)
}
// main parses the command line and runs the search and replace.
//
// Expected arguments: <input.pdf> <output.pdf> <text> <replace text>.
// On a usage error or a processing failure a message is written to standard
// error and the process exits with status 1.
func main() {
	// Uncomment to enable debugging.
	//common.SetLogger(common.NewConsoleLogger(common.LogLevelDebug))

	if len(os.Args) < 5 {
		fmt.Fprintf(os.Stderr, "Usage: go run pdf_search_replace_advanced.go <input.pdf> <output.pdf> <text> <replace text>\n")
		// A missing argument is an error; report it through the exit status.
		os.Exit(1)
	}

	inputPath := os.Args[1]
	outputPath := os.Args[2]
	searchText := os.Args[3]
	replaceText := os.Args[4]

	if err := searchReplace(inputPath, outputPath, searchText, replaceText); err != nil {
		fmt.Fprintf(os.Stderr, "Error: %v\n", err)
		os.Exit(1)
	}

	fmt.Printf("Successfully created %s\n", outputPath)
}
// searchReplace reads the PDF at inputPath, replaces searchText with
// replaceText in the text of every page and writes the result to outputPath.
//
// Encrypted documents are opened with an empty password; if that fails an
// error is returned. The output is written through an optimizer that combines
// duplicate streams and identical indirect objects, uses object streams and
// compresses streams.
//
// It returns the first error encountered. Errors that occur while processing
// a page are wrapped with the page number. An error from closing the output
// file is returned as well, since it can mean that the data was not fully
// written.
func searchReplace(inputPath, outputPath, searchText, replaceText string) error {
	f, err := os.Open(inputPath)
	if err != nil {
		return err
	}
	// The input file stays open until the function returns, i.e. until after
	// the output has been written.
	defer f.Close()

	pdfReader, err := model.NewPdfReader(f)
	if err != nil {
		return err
	}

	encrypted, err := pdfReader.IsEncrypted()
	if err != nil {
		return err
	}
	if encrypted {
		// Try the empty password.
		ok, err := pdfReader.Decrypt([]byte(""))
		if err != nil {
			return err
		}
		if !ok {
			return errors.New("Encrypted")
		}
	}

	numPages, err := pdfReader.GetNumPages()
	if err != nil {
		return err
	}

	pdfWriter := model.NewPdfWriter()
	for n := 1; n <= numPages; n++ {
		page, err := pdfReader.GetPage(n)
		if err != nil {
			return fmt.Errorf("page %d: %w", n, err)
		}
		if err := searchReplacePageText(page, searchText, replaceText); err != nil {
			return fmt.Errorf("page %d: %w", n, err)
		}
		if err := pdfWriter.AddPage(page); err != nil {
			return fmt.Errorf("page %d: %w", n, err)
		}
	}

	opt := optimize.Options{
		CombineDuplicateStreams:         true,
		CombineIdenticalIndirectObjects: true,
		UseObjectStreams:                true,
		CompressStreams:                 true,
	}
	pdfWriter.SetOptimizer(optimize.New(opt))

	fw, err := os.Create(outputPath)
	if err != nil {
		return err
	}
	if err := pdfWriter.Write(fw); err != nil {
		// Best-effort close; the write error is the one worth reporting.
		fw.Close()
		return err
	}
	// Close explicitly (not deferred) so that a failure to flush the output
	// is reported to the caller.
	return fw.Close()
}
// searchReplacePageText replaces searchText with replaceText in the text of
// the given page.
//
// The page's content streams are parsed and processed operation by operation.
// The string operands of the text-showing operators (Tj, ', " and the
// elements of TJ arrays) are decoded with the font selected by the most
// recent Tf operator and collected as text chunks. The replacement is then
// performed on the collected chunks, which rewrites the string operands in
// the parsed operations, and the operations are serialized back into a single
// Flate-encoded content stream.
//
// If searchText is empty or does not occur in the collected text, the page is
// left untouched. Malformed operators are logged at debug level and skipped.
func searchReplacePageText(page *model.PdfPage, searchText, replaceText string) error {
	contents, err := page.GetAllContentStreams()
	if err != nil {
		return err
	}

	ops, err := contentstream.NewContentStreamParser(contents).Parse()
	if err != nil {
		return err
	}

	// Generate text chunks.
	var currFont *model.PdfFont
	tc := textChunks{}

	// textProcFunc records the string object behind objptr as a text chunk.
	// Objects that are not strings (e.g. the numeric adjustments inside a TJ
	// array) are skipped.
	textProcFunc := func(objptr *core.PdfObject) {
		strObj, ok := core.GetString(*objptr)
		if !ok {
			common.Log.Debug("Invalid parameter, skipping")
			return
		}

		// Without a font the raw string value is used as the chunk text.
		str := strObj.String()
		if currFont != nil {
			decoded, _, numMisses := currFont.CharcodeBytesToUnicode(strObj.Bytes())
			if numMisses != 0 {
				common.Log.Debug("WARN: some charcodes could not be decoded.\n\t%v -> %s", strObj.Bytes(), decoded)
			}
			str = decoded
		}

		tc.chunks = append(tc.chunks, &textChunk{
			font:   currFont,
			strObj: strObj,
			val:    str,
			idx:    len(tc.text),
		})
		tc.text += str
	}

	processor := contentstream.NewContentStreamProcessor(*ops)
	processor.AddHandler(contentstream.HandlerConditionEnumAllOperands, "",
		func(op *contentstream.ContentStreamOperation, gs contentstream.GraphicsState, resources *model.PdfPageResources) error {
			switch op.Operand {
			case `Tj`, `'`:
				if len(op.Params) != 1 {
					common.Log.Debug("Invalid: Tj/' with invalid set of parameters - skip")
					return nil
				}
				textProcFunc(&op.Params[0])
			case `"`, `''`:
				// The " operator takes word spacing, character spacing and
				// the string to show, in that order. `''` is kept for
				// backward compatibility with the previous spelling.
				if len(op.Params) != 3 {
					common.Log.Debug("Invalid: \" with invalid set of parameters - skip")
					return nil
				}
				// The string is the last of the three parameters.
				textProcFunc(&op.Params[2])
			case `TJ`:
				if len(op.Params) != 1 {
					common.Log.Debug("Invalid: TJ with invalid set of parameters - skip")
					return nil
				}
				arr, ok := core.GetArray(op.Params[0])
				if !ok {
					common.Log.Debug("Invalid: TJ parameter is not an array - skip")
					return nil
				}
				for i := range arr.Elements() {
					obj := arr.Get(i)
					textProcFunc(&obj)
					arr.Set(i, obj)
				}
			case "Tf":
				if len(op.Params) != 2 {
					common.Log.Debug("Invalid: Tf with invalid set of parameters - skip")
					return nil
				}

				fname, ok := core.GetName(op.Params[0])
				if !ok || fname == nil {
					common.Log.Debug("ERROR: could not get font name")
					return nil
				}

				fObj, has := resources.GetFontByName(*fname)
				if !has {
					common.Log.Debug("ERROR: font %s not found", fname.String())
					return nil
				}

				pdfFont, err := model.NewPdfFontFromPdfObject(fObj)
				if err != nil {
					common.Log.Debug("ERROR: loading font")
					return nil
				}
				// Subsequent text operands are decoded with this font.
				currFont = pdfFont
			}

			return nil
		})

	if err = processor.Process(page.Resources); err != nil {
		return err
	}

	// Nothing to replace: keep the page's original content streams instead of
	// re-serializing them.
	if searchText == "" || !strings.Contains(tc.text, searchText) {
		return nil
	}

	tc.replace(searchText, replaceText)
	return page.SetContentStreams([]string{ops.String()}, core.NewFlateEncoder())
}