-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathapi.go
116 lines (98 loc) · 2.78 KB
/
api.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
package apidemo
import (
"fmt"
"io"
"os"
"path/filepath"
"strings"
"github.com/benoitkugler/pdf/model"
)
// ListAttachments collects, in catalog order, the `UF` field of the file
// specification of every entry found in `doc.Catalog.Names.EmbeddedFiles`.
//
// The returned slice is never nil; it is empty when the document has no
// embedded file.
func ListAttachments(doc model.Document) []string {
	embedded := doc.Catalog.Names.EmbeddedFiles
	names := make([]string, 0, len(embedded))
	for idx := range embedded {
		names = append(names, embedded[idx].FileSpec.UF)
	}
	return names
}
// AddAttachments embeds files into the document and writes the result to w.
//
// Each entry of files is either a file name, or a file name and a description
// separated by a comma ("path/to/file,description"). An entry containing more
// than one comma is rejected with an error.
//
// Entries are processed in order and appended to
// doc.Catalog.Names.EmbeddedFiles one by one: if an entry fails, the error is
// returned immediately, nothing is written to w, and the attachments added by
// the previous entries remain in doc.
//
// enc is passed unchanged to doc.Write.
func AddAttachments(doc *model.Document, enc *model.Encrypt, w io.Writer, files []string) error {
	for _, fn := range files {
		att, err := newAttachment(fn)
		if err != nil {
			return err
		}
		doc.Catalog.Names.EmbeddedFiles = append(doc.Catalog.Names.EmbeddedFiles, att)
	}
	return doc.Write(w, enc)
}

// newAttachment parses one "name[,description]" entry, reads the named file
// from disk and returns the corresponding name tree entry.
//
// It is a separate function so that the deferred Close runs as soon as each
// file has been read, instead of keeping every file open until all entries
// have been processed and the document written.
func newAttachment(spec string) (model.NameToFile, error) {
	// strings.Split always returns at least one element here, so only the
	// upper bound needs checking.
	parts := strings.Split(spec, ",")
	if len(parts) > 2 {
		return model.NameToFile{}, fmt.Errorf("invalid file description : %s", spec)
	}
	fileName, desc := parts[0], ""
	if len(parts) == 2 {
		desc = parts[1]
	}

	f, err := os.Open(fileName)
	if err != nil {
		return model.NameToFile{}, err
	}
	defer f.Close()

	fi, err := f.Stat()
	if err != nil {
		return model.NameToFile{}, err
	}

	content, err := io.ReadAll(f)
	if err != nil {
		return model.NameToFile{}, fmt.Errorf("can't read file : %w", err)
	}

	var emb model.EmbeddedFileStream
	emb.Params.SetChecksumAndSize(content)
	emb.Params.ModDate = fi.ModTime()
	// compression with flate, optional
	emb.Stream = model.NewCompressedStream(content)

	// Only the base name is stored in the document, not the full path.
	fs := model.FileSpec{
		UF:   filepath.Base(fileName),
		EF:   &emb,
		Desc: desc,
	}
	return model.NameToFile{Name: fs.UF, FileSpec: &fs}, nil
}
// ExtractContent dumps "PDF source" files from `doc` into `outDir` for selected pages.
//
// `pageNumbers` holds zero-based indices into the flattened page list of the
// document. Passing `nil` extracts all pages; an empty non-nil slice extracts
// nothing. Invalid page numbers (negative, or beyond the last page) and
// duplicates are silently ignored.
//
// For each selected page, the decoded content streams are concatenated in
// order and written to `outDir/Content_page_<index>.txt`. `outDir` must
// already exist. The first decoding or writing error aborts the extraction
// and is returned; files written before the failure are left in place.
func ExtractContent(doc model.Document, outDir string, pageNumbers []int) error {
	// Note: the parsing of the page selection must have been done previously
	pages := doc.Catalog.Pages.Flatten()
	if pageNumbers == nil {
		pageNumbers = make([]int, len(pages))
		for i := range pages {
			pageNumbers[i] = i
		}
	}

	seen := make(map[int]bool, len(pageNumbers))
	for _, pageNumber := range pageNumbers {
		// Ignore out-of-range indices; the lower bound matters too, since a
		// negative index would otherwise panic when indexing pages.
		if pageNumber < 0 || pageNumber >= len(pages) {
			continue
		}
		if seen[pageNumber] { // avoid duplicate
			continue
		}
		seen[pageNumber] = true

		var totalPageContent []byte
		for _, ct := range pages[pageNumber].Contents {
			ctContent, err := ct.Decode()
			if err != nil {
				return fmt.Errorf("decoding content of page %d: %w", pageNumber, err)
			}
			totalPageContent = append(totalPageContent, ctContent...)
		}

		outPath := filepath.Join(outDir, fmt.Sprintf("Content_page_%d.txt", pageNumber))
		// Plain data files: readable by all, writable by the owner, not executable.
		if err := os.WriteFile(outPath, totalPageContent, 0o644); err != nil {
			return err
		}
	}
	return nil
}