A simple Go library which enables reading PDF files.
Features
- Get plain text content (without format)
- Get Content (including all font and formatting information)
go get -u github.com/rsc/pdf
package main
import (
"bytes"
"fmt"
"github.com/rsc/pdf"
)
func main() {
content, err := readPdf("test.pdf") // Read local pdf file
if err != nil {
panic(err)
}
fmt.Println(content)
return
}
func readPdf(path string) (string, error) {
r, err := pdf.Open(path)
if err != nil {
return "", err
}
var buf bytes.Buffer
buf.ReadFrom(p.GetPlainText())
return buf.String(), nil
}
func readPdf2(path string) (string, error) {
r, err := pdf.Open(path)
if err != nil {
return "", err
}
totalPage := r.NumPage()
for pageIndex := 1; pageIndex <= totalPage; pageIndex++ {
p := r.Page(pageIndex)
if p.V.IsNull() {
continue
}
var lastTextStyle pdf.Text
texts := p.Content().Text
for _, text := range texts {
if isSameSentence(text, lastTextStyle) {
lastTextStyle.S = lastTextStyle.S + text.S
} else {
fmt.Printf("Font: %s, Font-size: %f, x: %f, y: %f, content: %s \n", lastTextStyle.Font, lastTextStyle.FontSize, lastTextStyle.X, lastTextStyle.Y, lastTextStyle.S)
lastTextStyle = text
}
}
}
return "", nil
}