Skip to content

Commit

Permalink
extract annotations v2
Browse files Browse the repository at this point in the history
  • Loading branch information
Joscha Legewie committed Mar 31, 2014
1 parent ab4bba0 commit 4815161
Show file tree
Hide file tree
Showing 17 changed files with 506 additions and 95 deletions.
14 changes: 14 additions & 0 deletions examples/extract/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
## "Extract" overview

This example is a minimal application that extracts annotations from PDF files using pdf.js.

## Getting started

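Serve the pdf.js root folder over HTTP (for example, run
`python -m SimpleHTTPServer 8888` from that folder) and point your browser to
`http://localhost:8888/examples/extract/index.html`. Voilà. Take a peek at
`extract.js` to see how to make basic calls to `pdf.js`.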


## Additional resources

+ [GNUpdf - Introduction to PDF](http://gnupdf.org/Introduction_to_PDF)

101 changes: 101 additions & 0 deletions examples/extract/extract.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
/* -*- Mode: Java; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set shiftwidth=2 tabstop=2 autoindent cindent expandtab: */

'use strict';

// example to extract annotations
// cd to pdf.js folder
// python -m SimpleHTTPServer 8888
// http://localhost:8888/examples/extract/index.html

// Annotation types (as reported in `anno.type`) that this example extracts.
const SUPPORTED_ANNOTS = ["Text", "Highlight", "Underline"];

// Fetch the PDF document from the URL using promises, then walk through its
// pages strictly one after another: each page's annotations are fetched, the
// page is rendered into the shared canvas, the annotations are logged, and only
// then is the next page requested.
PDFJS.getDocument('/examples/extract/pdf/test.pdf').then(function(pdf) {

  var numPages = pdf.numPages;
  // PDF data: pdf.pdfInfo, pdf.isEncrypted(), pdf.getMetadata().then(function(meta) {console.log('Metadata: ' + JSON.stringify(meta));});

  // Plain-text equivalents for typographic ligatures (U+FB00..U+FB06) and
  // "smart" punctuation that may appear in extracted markup text.
  var PLAIN_TEXT = {
    '\ufb00': 'ff',
    '\ufb01': 'fi',
    '\ufb02': 'fl',
    '\ufb03': 'ffi',
    '\ufb04': 'ffl',
    '\ufb05': 'ft',
    '\ufb06': 'st',
    '\u201c': '"',
    '\u201d': '"',
    '\u2019': "'",
    '\u2013': '-'
  };
  // Matches every key of PLAIN_TEXT. The `g` flag matters: a string pattern
  // passed to String.prototype.replace only replaces the FIRST occurrence.
  var SPECIAL_CHARS = /[\ufb00-\ufb06\u201c\u201d\u2019\u2013]/g;

  /**
   * Turn the markup fragments of one annotation into a single plain string.
   * @param {Array} parts - text fragments of the annotation
   *   (assumes an array of strings, since it is joined -- TODO confirm).
   * @returns {string} the fragments joined by spaces, trimmed, with all
   *   ligatures and smart punctuation replaced by their ASCII equivalents.
   */
  function normalizeMarkup(parts) {
    return parts.join(' ').trim().replace(SPECIAL_CHARS, function(ch) {
      return PLAIN_TEXT[ch];
    });
  }

  /**
   * Request the page following `page` and hand it to `extract`.
   * Does nothing when `page` is the last page of the document.
   * @param {Object} page - the page that has just been processed.
   */
  function extractNextPage(page) {
    if (numPages > page.pageNumber) {
      pdf.getPage(page.pageNumber + 1).then(extract, function(err) {
        console.log('error getting the page:' + err);
      });
    }
  }

  /**
   * Log the markup text and the content of every annotation.
   * Both fields are read from the SAME annotation, so an annotation that lacks
   * one of them cannot shift the output of the annotations that follow it.
   * @param {Array} annotations - annotations of a single page.
   */
  function logAnnotations(annotations) {
    annotations.forEach(function(anno) {
      if (typeof anno.markup !== 'undefined') {
        console.log(normalizeMarkup(anno.markup));
      }
      if (typeof anno.content !== 'undefined') {
        console.log(anno.content);
      }
    });
  }

  /**
   * Handle one page: fetch its annotations, render the page and log the
   * supported annotations, then continue with the next page.
   * Errors are logged and never stop the walk through the document.
   * @param {Object} page - page object resolved by `pdf.getPage`.
   */
  function extract(page) {
    var scale = 1;
    var viewport = page.getViewport(scale);
    // Prepare canvas using PDF page dimensions
    var canvas = document.getElementById('the-canvas');
    var context = canvas.getContext('2d');
    canvas.height = viewport.height;
    canvas.width = viewport.width;
    // Render PDF page into canvas context
    var renderContext = {
      canvasContext: context,
      viewport: viewport
    };

    page.getAnnotations().then(function extractAnno(annos) {
      console.log('Page: '+page.pageNumber + " ("+annos.length+" annotations " + JSON.stringify(annos.map(function(a) {return a.type;})) + ")");
      // filter for supported annotations
      var annotations = annos.filter(function(anno) {
        return SUPPORTED_ANNOTS.indexOf(anno.type) >= 0;
      });
      // skip page if there is nothing interesting
      if (annotations.length === 0) {
        extractNextPage(page);
        return;
      }
      // render page
      page.render(renderContext).then(function() {
        console.log('Page: '+page.pageNumber + " (page rendered)");
        // show annotations
        logAnnotations(annotations);
        // render next page
        extractNextPage(page);
      },
      // error handler for rendering
      function(error) {
        console.log('Error rendering the page: ', error);
        // continue with next page
        extractNextPage(page);
      });
    },
    // error handler for fetching the annotations
    function(error) {
      console.log(error);
      // continue with next page
      extractNextPage(page);
    });
  }

  // Using promise to fetch the first page; every page chains to the next one
  pdf.getPage(1).then(extract, function(err) {
    console.log('error getting the page:' + err);
  });

},
function(err) {
  console.log('unable to open pdf: ' + err);
});


45 changes: 45 additions & 0 deletions examples/extract/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
<!doctype html>
<html>

<head>
<!-- In production, only one script (pdf.js) is necessary -->
<!-- In production, change the content of PDFJS.workerSrc below -->
<script type="text/javascript" src="../../src/network.js"></script>
<script type="text/javascript" src="../../src/chunked_stream.js"></script>
<script type="text/javascript" src="../../src/pdf_manager.js"></script>
<script type="text/javascript" src="../../src/core.js"></script>
<script type="text/javascript" src="../../src/util.js"></script>
<script type="text/javascript" src="../../src/api.js"></script>
<script type="text/javascript" src="../../src/canvas.js"></script>
<script type="text/javascript" src="../../src/obj.js"></script>
<script type="text/javascript" src="../../src/function.js"></script>
<script type="text/javascript" src="../../src/charsets.js"></script>
<script type="text/javascript" src="../../src/cidmaps.js"></script>
<script type="text/javascript" src="../../src/colorspace.js"></script>
<script type="text/javascript" src="../../src/crypto.js"></script>
<script type="text/javascript" src="../../src/evaluator.js"></script>
<script type="text/javascript" src="../../src/fonts.js"></script>
<script type="text/javascript" src="../../src/glyphlist.js"></script>
<script type="text/javascript" src="../../src/image.js"></script>
<script type="text/javascript" src="../../src/metrics.js"></script>
<script type="text/javascript" src="../../src/parser.js"></script>
<script type="text/javascript" src="../../src/pattern.js"></script>
<script type="text/javascript" src="../../src/stream.js"></script>
<script type="text/javascript" src="../../src/worker.js"></script>
<script type="text/javascript" src="../../external/jpgjs/jpg.js"></script>
<script type="text/javascript" src="../../src/jpx.js"></script>
<script type="text/javascript" src="../../src/jbig2.js"></script>

<script type="text/javascript">
// Specify the main script used to create a new PDF.JS web worker.
// In production, change this to point to the combined `pdf.js` file.
PDFJS.workerSrc = '../../src/worker_loader.js';
</script>
<script type="text/javascript" src="extract.js"></script>
</head>

<body>
<!-- Target canvas that extract.js looks up by id and renders each page into.
     <canvas> is not a void element: in text/html a trailing "/" is ignored,
     so it needs an explicit end tag. -->
<canvas id="the-canvas" style="border:1px solid black;"></canvas>
</body>

</html>
Binary file added examples/extract/pdf/test.pdf
Binary file not shown.
48 changes: 22 additions & 26 deletions extensions/firefox/components/PdfStreamConverter.js
Original file line number Diff line number Diff line change
Expand Up @@ -713,6 +713,7 @@ PdfStreamConverter.prototype = {
var hash = aRequest.URI.ref;
rangeRequest = contentEncoding === 'identity' &&
acceptRanges === 'bytes' &&
aRequest.contentLength >= 0 &&
hash.indexOf('disableRange=true') < 0;
}

Expand Down Expand Up @@ -764,33 +765,28 @@ PdfStreamConverter.prototype = {
// We get the DOM window here instead of before the request since it
// may have changed during a redirect.
var domWindow = getDOMWindow(channel);
// Double check the url is still the correct one.
if (domWindow.document.documentURIObject.equals(aRequest.URI)) {
var actions;
if (rangeRequest) {
// We are going to be issuing range requests, so cancel the
// original request
aRequest.resume();
aRequest.cancel(Cr.NS_BINDING_ABORTED);
actions = new RangedChromeActions(domWindow,
contentDispositionFilename, aRequest);
} else {
actions = new StandardChromeActions(
domWindow, contentDispositionFilename, dataListener);
}
var requestListener = new RequestListener(actions);
domWindow.addEventListener(PDFJS_EVENT_ID, function(event) {
requestListener.receive(event);
}, false, true);
if (actions.supportsIntegratedFind()) {
var chromeWindow = getChromeWindow(domWindow);
var findEventManager = new FindEventManager(chromeWindow.gFindBar,
domWindow,
chromeWindow);
findEventManager.bind();
}
var actions;
if (rangeRequest) {
// We are going to be issuing range requests, so cancel the
// original request
aRequest.resume();
aRequest.cancel(Cr.NS_BINDING_ABORTED);
actions = new RangedChromeActions(domWindow,
contentDispositionFilename, aRequest);
} else {
log('Dom window url did not match request url.');
actions = new StandardChromeActions(
domWindow, contentDispositionFilename, dataListener);
}
var requestListener = new RequestListener(actions);
domWindow.addEventListener(PDFJS_EVENT_ID, function(event) {
requestListener.receive(event);
}, false, true);
if (actions.supportsIntegratedFind()) {
var chromeWindow = getChromeWindow(domWindow);
var findEventManager = new FindEventManager(chromeWindow.gFindBar,
domWindow,
chromeWindow);
findEventManager.bind();
}
listener.onStopRequest(aRequest, context, statusCode);
}
Expand Down
2 changes: 1 addition & 1 deletion l10n/cs/viewer.properties
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

bookmark.title=Aktuální zobrazení(zkopírovat nebo otevřít v novém okně)
bookmark.title=Aktuální zobrazení (zkopírovat nebo otevřít v novém okně)
previous.title=Předchozí stránka
next.title=Další stránka
print.title=Tisk
Expand Down
36 changes: 19 additions & 17 deletions src/api.js
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -311,25 +311,27 @@ var PDFPageProxy = (function PDFPageProxyClosure() {
var continueCallback = params.continueCallback;

// Once the operatorList and fonts are loaded, do the actual rendering.
this.displayReadyPromise.then(
function pageDisplayReadyPromise() {
if (self.destroyed) {
complete();
return;
}
this.getAnnotations().then(function(annos) {
this.displayReadyPromise.then(
function pageDisplayReadyPromise() {
if (self.destroyed) {
complete();
return;
}
var gfx = new CanvasGraphics(params.canvasContext, this.commonObjs,
this.objs, params.textLayer, params.imageLayer, annos);
try {
this.display(gfx, params.viewport, complete, continueCallback);
} catch (e) {
complete(e);
}

var gfx = new CanvasGraphics(params.canvasContext, this.commonObjs,
this.objs, params.textLayer, params.imageLayer);
try {
this.display(gfx, params.viewport, complete, continueCallback);
} catch (e) {
complete(e);
}.bind(this),
function pageDisplayReadPromiseError(reason) {
complete(reason);
}
}.bind(this),
function pageDisplayReadPromiseError(reason) {
complete(reason);
}
);
);
}.bind(this));

return promise;
},
Expand Down
Loading

0 comments on commit 4815161

Please sign in to comment.