pdf2html converts PDF files to HTML or plain text using Apache Tika. It can also generate thumbnail images of PDF files using Apache PDFBox.
via yarn:
yarn add pdf2html
via npm:
npm install --save pdf2html
A Java Runtime Environment (JRE) is required to run this module.
// Usage examples for pdf2html. Every call takes the path of a PDF file and a
// Node-style callback `(err, result)`; some calls accept an options object as
// an optional second argument.
const pdf2html = require('pdf2html')

// Convert the PDF to HTML.
pdf2html.html('sample.pdf', (err, html) => {
  if (err) {
    console.error('Conversion error: ' + err)
  } else {
    console.log(html)
  }
})

// Convert the PDF to text.
pdf2html.text('sample.pdf', (err, text) => {
  if (err) {
    console.error('Conversion error: ' + err)
  } else {
    console.log(text)
  }
})

// Convert the PDF into HTML pages.
pdf2html.pages('sample.pdf', (err, htmlPages) => {
  if (err) {
    console.error('Conversion error: ' + err)
  } else {
    console.log(htmlPages)
  }
})

// Convert the PDF into text pages by passing `text: true`.
// Named `pageOptions` (not `options`) so this snippet can share a scope with
// the thumbnail example below without redeclaring a `const`.
const pageOptions = { text: true }
pdf2html.pages('sample.pdf', pageOptions, (err, textPages) => {
  if (err) {
    console.error('Conversion error: ' + err)
  } else {
    console.log(textPages)
  }
})

// Read the PDF's metadata.
pdf2html.meta('sample.pdf', (err, meta) => {
  if (err) {
    console.error('Conversion error: ' + err)
  } else {
    console.log(meta)
  }
})

// Generate a thumbnail image with the default settings; the callback receives
// the path of the generated image.
pdf2html.thumbnail('sample.pdf', (err, thumbnailPath) => {
  if (err) {
    console.error('Conversion error: ' + err)
  } else {
    console.log(thumbnailPath)
  }
})

// Generate a thumbnail image with explicit page, image type and dimensions.
const thumbnailOptions = { page: 1, imageType: 'png', width: 160, height: 226 }
pdf2html.thumbnail('sample.pdf', thumbnailOptions, (err, thumbnailPath) => {
  if (err) {
    console.error('Conversion error: ' + err)
  } else {
    console.log(thumbnailPath)
  }
})
Sometimes downloading the dependencies is too slow, or the download fails altogether behind an HTTP proxy. Follow the steps below to fetch the files manually and skip the dependency downloads.
# Manually fetch the PDFBox and Tika JAR files into the module's vendor directory.
cd node_modules/pdf2html/vendor
# These URLs come from https://github.com/shebinleo/pdf2html/blob/master/postinstall.js#L6-L7
wget https://dlcdn.apache.org/pdfbox/2.0.26/pdfbox-app-2.0.26.jar
wget https://dlcdn.apache.org/tika/2.4.0/tika-app-2.4.0.jar