diff --git a/examples/extract/README.md b/examples/extract/README.md new file mode 100755 index 0000000000000..a6e1b2944f007 --- /dev/null +++ b/examples/extract/README.md @@ -0,0 +1,14 @@ +## "Extract" overview + +This example is a minimalistic application for the extraction of annotations from PDFs using pdf.js. + +## Getting started + +Point your browser to `index.html`. Voila. Take a peek at `extract.js` to see +how to make basic calls to `pdf.js`. + + +## Additional resources + ++ [GNUpdf - Introduction to PDF](http://gnupdf.org/Introduction_to_PDF) + diff --git a/examples/extract/extract.js b/examples/extract/extract.js new file mode 100755 index 0000000000000..b2fb4f5f0226f --- /dev/null +++ b/examples/extract/extract.js @@ -0,0 +1,101 @@ +/* -*- Mode: Java; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set shiftwidth=2 tabstop=2 autoindent cindent expandtab: */ + +'use strict'; + +// example to extract annotations +// cd to pdf.js folder +// python -m SimpleHTTPServer 8888 +// http://localhost:8888/examples/extract/index.html + +const SUPPORTED_ANNOTS = ["Text", "Highlight", "Underline"] + +// Fetch the PDF document from the URL using promises +PDFJS.getDocument('/examples/extract/pdf/test.pdf').then(function(pdf) { + + var n_annos = 0, + numPages = pdf.numPages; + // PDF data: pdf.pdfInfo, pdf.isEncrypted(), pdf.getMetadata().then(function(meta) {console.log('Metadata: ' + JSON.stringify(meta));}); + + // function to handle page (render and extract annotations) + var extract = function(page) { + var scale = 1; + var viewport = page.getViewport(scale); + // Prepare canvas using PDF page dimensions + var canvas = document.getElementById('the-canvas'); + var context = canvas.getContext('2d'); + canvas.height = viewport.height; + canvas.width = viewport.width; + // Render PDF page into canvas context + var renderContext = { + canvasContext: context, + viewport: viewport + }; + + // get annotations + var annotations; + 
page.getAnnotations().then(function extractAnno(annos) { + console.log('Page: '+page.pageNumber + " ("+annos.length+" annotations " + JSON.stringify(annos.map(function(a) {return a.type;})) + ")"); + // filter for supported annotations + annotations = annos.filter(function(anno) {return SUPPORTED_ANNOTS.indexOf(anno.type) >= 0;}); + // skip page if there is nothing interesting + if (annotations.length==0) { + if(numPages>page.pageNumber) pdf.getPage(page.pageNumber+1).then(extract,function(err) {console.log('error getting the page:' + err)}); + return; + } + // render page + page.render(renderContext).then(function() { + console.log('Page: '+page.pageNumber + " (page rendered)"); + // console.log(annotations); + // show annotations + var markup = annotations + .filter(function(anno) {return typeof anno.markup !== 'undefined';}) + .map(function(anno) { + return anno.markup.join(' ').trim() + .replace('\ufb00','ff').replace('\ufb01','fi').replace('\ufb02','fl') + .replace(/\ufb03/g,'ffi').replace(/\ufb04/g,'ffl').replace(/\ufb05/g,'ft') + .replace(/\ufb06/g,'st').replace(/\uFB00/g,'ff').replace(/\uFB01/g,'fi') + .replace(/\uFB02/g,'fl').replace(/\uFB03/g,'ffi').replace(/\uFB04/g,'ffl') + .replace(/\uFB05/g,'ft').replace(/\uFB06/g,'st') + .replace(/\u201D/g,'"').replace(/\u201C/g,'"').replace(/\u2019/g,"'") + .replace(/\u2013/g,"-"); + }); + var content = annotations + .filter(function(anno) {return typeof anno.content !== 'undefined';}) + .map(function(anno) { + return anno.content; + }); + + for (var i = 0; i < annotations.length; i++) { + if(markup[i]!==undefined) console.log(markup[i]); + if(content[i]!==undefined) console.log(content[i]); + } + + // render next page + if(numPages>page.pageNumber) pdf.getPage(page.pageNumber+1).then(extract,function(err) {console.log('error getting the page:' + err)}); + }, + // error handler for page + function(error) { + console.log('Error rendering the page: ', error); + // continue with next page + if(numPages>page.pageNumber) 
pdf.getPage(page.pageNumber+1).then(extract,function(err) {console.log('error getting the page:' + err)}); + }); + + }, + // error handler for page + function(error) { + console.log(error); + // continue with next page + if(numPages>page.pageNumber) pdf.getPage(page.pageNumber+1).then(extract,function(err) {console.log('error getting the page:' + err)}); + }); + }; + + // Using promise to fetch the page + pdf.getPage(1).then(extract,function(err) {console.log('error getting the page:' + err)}); + +}, +function(err) { + console.log('unable to open pdf: ' + err); +}); + + diff --git a/examples/extract/index.html b/examples/extract/index.html new file mode 100755 index 0000000000000..14c337f48df7b --- /dev/null +++ b/examples/extract/index.html @@ -0,0 +1,45 @@ + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/examples/extract/pdf/test.pdf b/examples/extract/pdf/test.pdf new file mode 100644 index 0000000000000..3a396b580e70e Binary files /dev/null and b/examples/extract/pdf/test.pdf differ diff --git a/extensions/firefox/components/PdfStreamConverter.js b/extensions/firefox/components/PdfStreamConverter.js index b80de30fbaa59..5332fb7793df8 100644 --- a/extensions/firefox/components/PdfStreamConverter.js +++ b/extensions/firefox/components/PdfStreamConverter.js @@ -713,6 +713,7 @@ PdfStreamConverter.prototype = { var hash = aRequest.URI.ref; rangeRequest = contentEncoding === 'identity' && acceptRanges === 'bytes' && + aRequest.contentLength >= 0 && hash.indexOf('disableRange=true') < 0; } @@ -764,33 +765,28 @@ PdfStreamConverter.prototype = { // We get the DOM window here instead of before the request since it // may have changed during a redirect. var domWindow = getDOMWindow(channel); - // Double check the url is still the correct one. 
- if (domWindow.document.documentURIObject.equals(aRequest.URI)) { - var actions; - if (rangeRequest) { - // We are going to be issuing range requests, so cancel the - // original request - aRequest.resume(); - aRequest.cancel(Cr.NS_BINDING_ABORTED); - actions = new RangedChromeActions(domWindow, - contentDispositionFilename, aRequest); - } else { - actions = new StandardChromeActions( - domWindow, contentDispositionFilename, dataListener); - } - var requestListener = new RequestListener(actions); - domWindow.addEventListener(PDFJS_EVENT_ID, function(event) { - requestListener.receive(event); - }, false, true); - if (actions.supportsIntegratedFind()) { - var chromeWindow = getChromeWindow(domWindow); - var findEventManager = new FindEventManager(chromeWindow.gFindBar, - domWindow, - chromeWindow); - findEventManager.bind(); - } + var actions; + if (rangeRequest) { + // We are going to be issuing range requests, so cancel the + // original request + aRequest.resume(); + aRequest.cancel(Cr.NS_BINDING_ABORTED); + actions = new RangedChromeActions(domWindow, + contentDispositionFilename, aRequest); } else { - log('Dom window url did not match request url.'); + actions = new StandardChromeActions( + domWindow, contentDispositionFilename, dataListener); + } + var requestListener = new RequestListener(actions); + domWindow.addEventListener(PDFJS_EVENT_ID, function(event) { + requestListener.receive(event); + }, false, true); + if (actions.supportsIntegratedFind()) { + var chromeWindow = getChromeWindow(domWindow); + var findEventManager = new FindEventManager(chromeWindow.gFindBar, + domWindow, + chromeWindow); + findEventManager.bind(); } listener.onStopRequest(aRequest, context, statusCode); } diff --git a/l10n/cs/viewer.properties b/l10n/cs/viewer.properties index f8ed476de3e08..83f12b2665f7b 100644 --- a/l10n/cs/viewer.properties +++ b/l10n/cs/viewer.properties @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under 
the License. -bookmark.title=Aktuální zobrazení(zkopírovat nebo otevřít v novém okně) +bookmark.title=Aktuální zobrazení (zkopírovat nebo otevřít v novém okně) previous.title=Předchozí stránka next.title=Další stránka print.title=Tisk diff --git a/src/api.js b/src/api.js old mode 100644 new mode 100755 index e1ec31470c7e0..775f15482a1a3 --- a/src/api.js +++ b/src/api.js @@ -311,25 +311,27 @@ var PDFPageProxy = (function PDFPageProxyClosure() { var continueCallback = params.continueCallback; // Once the operatorList and fonts are loaded, do the actual rendering. - this.displayReadyPromise.then( - function pageDisplayReadyPromise() { - if (self.destroyed) { - complete(); - return; - } + this.getAnnotations().then(function(annos) { + this.displayReadyPromise.then( + function pageDisplayReadyPromise() { + if (self.destroyed) { + complete(); + return; + } + var gfx = new CanvasGraphics(params.canvasContext, this.commonObjs, + this.objs, params.textLayer, params.imageLayer, annos); + try { + this.display(gfx, params.viewport, complete, continueCallback); + } catch (e) { + complete(e); + } - var gfx = new CanvasGraphics(params.canvasContext, this.commonObjs, - this.objs, params.textLayer, params.imageLayer); - try { - this.display(gfx, params.viewport, complete, continueCallback); - } catch (e) { - complete(e); + }.bind(this), + function pageDisplayReadPromiseError(reason) { + complete(reason); } - }.bind(this), - function pageDisplayReadPromiseError(reason) { - complete(reason); - } - ); + ); + }.bind(this), complete); return promise; }, diff --git a/src/canvas.js b/src/canvas.js old mode 100644 new mode 100755 index 6dd5b2b03910e..e82f24bbd48c0 --- a/src/canvas.js +++ b/src/canvas.js @@ -214,7 +214,7 @@ var CanvasGraphics = (function CanvasGraphicsClosure() { // before it stops and shedules a continue of execution. 
var EXECUTION_TIME = 15; - function CanvasGraphics(canvasCtx, commonObjs, objs, textLayer, imageLayer) { + function CanvasGraphics(canvasCtx, commonObjs, objs, textLayer, imageLayer, annotations) { this.ctx = canvasCtx; this.current = new CanvasExtraState(); this.stateStack = []; @@ -224,6 +224,7 @@ var CanvasGraphics = (function CanvasGraphicsClosure() { this.commonObjs = commonObjs; this.objs = objs; this.textLayer = textLayer; + this.annotations = annotations; this.imageLayer = imageLayer; this.groupStack = []; if (canvasCtx) { @@ -437,6 +438,14 @@ var CanvasGraphics = (function CanvasGraphicsClosure() { this.ctx.save(); this.ctx.transform.apply(this.ctx, transform); + // Record the user => device transformation so we can transform annotation + // bounding boxes appropriately + if (this.ctx.mozCurrentTransform) { + this.ctx.user2dev = this.ctx.mozCurrentTransform.slice(0, 6); + } else { + this.ctx.user2dev = IDENTITY_MATRIX; + } + if (this.textLayer) { this.textLayer.beginLayout(); } @@ -931,6 +940,155 @@ var CanvasGraphics = (function CanvasGraphicsClosure() { geometry.fontSize = this.current.fontSize; return geometry; }, + /** Compute the coordinates and width of the character given its width and + * x-offset in font space, a font object, and a matrix used for transforming + * from font space to device space. Returns an object with the character's + * x, y, and width properties, and the width of a space in the given font, + * all in device space. + */ + makeCharDims: function canvasMakeCharDims(charWidth, xOffset, font, font2dev) { + var xy = Util.applyTransform([xOffset, 0], font2dev); + var w = Util.applyTransform([xOffset + charWidth, 0], font2dev); + var dims = {x: xy[0], y: xy[1]}; + dims.width = Math.abs(xy[0] - w[0]); + var spaceWidth = font.coded ? font.spaceWidth : font.spaceWidth * .001; + var sw = Util.applyTransform([spaceWidth,0], font2dev); + // TODO: why 2.0? 
I changed it to 3 + // dims.spaceWidth = (sw[0] - font2dev[4]) / 2.0; + dims.spaceWidth = (sw[0] - font2dev[4]) / 3.0; + return dims; + }, + /** Determines if character, with the given dimensions, falls within the + * bounds of annotation annot. If so, returns the 0-based index of the quad + * region within which the character falls. If the character is outside the + * annotation, returns -1. */ + charInAnnot: function canvasCharInAnnot(annot, glyph, cdims, user2dev) { + if (annot.type && (annot.type == 'Highlight' || + annot.type == 'Underline')) { + for (var i = 0; i < annot.quadPoints.length; i++) { + var quad = annot.quadPoints[i]; + var qxy0 = Util.applyTransform([quad.x, quad.y], user2dev); + var quadOtherCorner = [quad.x + quad.width, quad.y + quad.height]; + var qxy1 = Util.applyTransform(quadOtherCorner, user2dev); + var minX = Math.min(qxy0[0], qxy1[0]); + var maxX = Math.max(qxy0[0], qxy1[0]); + var minY = Math.min(qxy0[1], qxy1[1]); + var maxY = Math.max(qxy0[1], qxy1[1]); + // only grab characters where 50% of the character's + // width lies within the annotation + var xPlusHalfWidth = cdims.x + (0.5 * cdims.width); + if (xPlusHalfWidth >= minX && xPlusHalfWidth <= maxX && + cdims.y >= minY && cdims.y <= maxY) { + return i; + } + } + } + return -1; + }, + /** Update the markup array for annot, placing the given character into the + * string associated with the given quad. */ + updateMarkup: function canvasGraphicsUpdateMarkup(annot, quad, glyph, charDims, isSpace) { + if (quad < 0) return; + // check whether glyph.fontChar is valid Ascii, otherwise use glyph.unicode + // http://stackoverflow.com/a/13522793 + function isAsciiOnly(str) { + for (var i = 0; i < str.length; i++) + if (str.charCodeAt(i) > 127) + return false; + return true; + } + // var character = /^[\u0-\u7f]*$/.test(glyph.fontChar) ? glyph.fontChar : glyph.unicode; + var character = isAsciiOnly(glyph.fontChar) ? 
glyph.fontChar : glyph.unicode; + // exit if unprintable unicode characters + // http://stackoverflow.com/a/11598864 + var re = /[\0-\x1F\x7F-\x9F\xAD\u0378\u0379\u037F-\u0383\u038B\u038D\u03A2\u0528-\u0530\u0557\u0558\u0560\u0588\u058B-\u058E\u0590\u05C8-\u05CF\u05EB-\u05EF\u05F5-\u0605\u061C\u061D\u06DD\u070E\u070F\u074B\u074C\u07B2-\u07BF\u07FB-\u07FF\u082E\u082F\u083F\u085C\u085D\u085F-\u089F\u08A1\u08AD-\u08E3\u08FF\u0978\u0980\u0984\u098D\u098E\u0991\u0992\u09A9\u09B1\u09B3-\u09B5\u09BA\u09BB\u09C5\u09C6\u09C9\u09CA\u09CF-\u09D6\u09D8-\u09DB\u09DE\u09E4\u09E5\u09FC-\u0A00\u0A04\u0A0B-\u0A0E\u0A11\u0A12\u0A29\u0A31\u0A34\u0A37\u0A3A\u0A3B\u0A3D\u0A43-\u0A46\u0A49\u0A4A\u0A4E-\u0A50\u0A52-\u0A58\u0A5D\u0A5F-\u0A65\u0A76-\u0A80\u0A84\u0A8E\u0A92\u0AA9\u0AB1\u0AB4\u0ABA\u0ABB\u0AC6\u0ACA\u0ACE\u0ACF\u0AD1-\u0ADF\u0AE4\u0AE5\u0AF2-\u0B00\u0B04\u0B0D\u0B0E\u0B11\u0B12\u0B29\u0B31\u0B34\u0B3A\u0B3B\u0B45\u0B46\u0B49\u0B4A\u0B4E-\u0B55\u0B58-\u0B5B\u0B5E\u0B64\u0B65\u0B78-\u0B81\u0B84\u0B8B-\u0B8D\u0B91\u0B96-\u0B98\u0B9B\u0B9D\u0BA0-\u0BA2\u0BA5-\u0BA7\u0BAB-\u0BAD\u0BBA-\u0BBD\u0BC3-\u0BC5\u0BC9\u0BCE\u0BCF\u0BD1-\u0BD6\u0BD8-\u0BE5\u0BFB-\u0C00\u0C04\u0C0D\u0C11\u0C29\u0C34\u0C3A-\u0C3C\u0C45\u0C49\u0C4E-\u0C54\u0C57\u0C5A-\u0C5F\u0C64\u0C65\u0C70-\u0C77\u0C80\u0C81\u0C84\u0C8D\u0C91\u0CA9\u0CB4\u0CBA\u0CBB\u0CC5\u0CC9\u0CCE-\u0CD4\u0CD7-\u0CDD\u0CDF\u0CE4\u0CE5\u0CF0\u0CF3-\u0D01\u0D04\u0D0D\u0D11\u0D3B\u0D3C\u0D45\u0D49\u0D4F-\u0D56\u0D58-\u0D5F\u0D64\u0D65\u0D76-\u0D78\u0D80\u0D81\u0D84\u0D97-\u0D99\u0DB2\u0DBC\u0DBE\u0DBF\u0DC7-\u0DC9\u0DCB-\u0DCE\u0DD5\u0DD7\u0DE0-\u0DF1\u0DF5-\u0E00\u0E3B-\u0E3E\u0E5C-\u0E80\u0E83\u0E85\u0E86\u0E89\u0E8B\u0E8C\u0E8E-\u0E93\u0E98\u0EA0\u0EA4\u0EA6\u0EA8\u0EA9\u0EAC\u0EBA\u0EBE\u0EBF\u0EC5\u0EC7\u0ECE\u0ECF\u0EDA\u0EDB\u0EE0-\u0EFF\u0F48\u0F6D-\u0F70\u0F98\u0FBD\u0FCD\u0FDB-\u0FFF\u10C6\u10C8-\u10CC\u10CE\u10CF\u1249\u124E\u124F\u1257\u1259\u125E\u125F\u1289\u128E\u128F\u12B1\u12B6\u12B7\u12BF\u12C1\u12C6\u12C7\u12D7\u1311\u131
6\u1317\u135B\u135C\u137D-\u137F\u139A-\u139F\u13F5-\u13FF\u169D-\u169F\u16F1-\u16FF\u170D\u1715-\u171F\u1737-\u173F\u1754-\u175F\u176D\u1771\u1774-\u177F\u17DE\u17DF\u17EA-\u17EF\u17FA-\u17FF\u180F\u181A-\u181F\u1878-\u187F\u18AB-\u18AF\u18F6-\u18FF\u191D-\u191F\u192C-\u192F\u193C-\u193F\u1941-\u1943\u196E\u196F\u1975-\u197F\u19AC-\u19AF\u19CA-\u19CF\u19DB-\u19DD\u1A1C\u1A1D\u1A5F\u1A7D\u1A7E\u1A8A-\u1A8F\u1A9A-\u1A9F\u1AAE-\u1AFF\u1B4C-\u1B4F\u1B7D-\u1B7F\u1BF4-\u1BFB\u1C38-\u1C3A\u1C4A-\u1C4C\u1C80-\u1CBF\u1CC8-\u1CCF\u1CF7-\u1CFF\u1DE7-\u1DFB\u1F16\u1F17\u1F1E\u1F1F\u1F46\u1F47\u1F4E\u1F4F\u1F58\u1F5A\u1F5C\u1F5E\u1F7E\u1F7F\u1FB5\u1FC5\u1FD4\u1FD5\u1FDC\u1FF0\u1FF1\u1FF5\u1FFF\u200B-\u200F\u202A-\u202E\u2060-\u206F\u2072\u2073\u208F\u209D-\u209F\u20BB-\u20CF\u20F1-\u20FF\u218A-\u218F\u23F4-\u23FF\u2427-\u243F\u244B-\u245F\u2700\u2B4D-\u2B4F\u2B5A-\u2BFF\u2C2F\u2C5F\u2CF4-\u2CF8\u2D26\u2D28-\u2D2C\u2D2E\u2D2F\u2D68-\u2D6E\u2D71-\u2D7E\u2D97-\u2D9F\u2DA7\u2DAF\u2DB7\u2DBF\u2DC7\u2DCF\u2DD7\u2DDF\u2E3C-\u2E7F\u2E9A\u2EF4-\u2EFF\u2FD6-\u2FEF\u2FFC-\u2FFF\u3040\u3097\u3098\u3100-\u3104\u312E-\u3130\u318F\u31BB-\u31BF\u31E4-\u31EF\u321F\u32FF\u4DB6-\u4DBF\u9FCD-\u9FFF\uA48D-\uA48F\uA4C7-\uA4CF\uA62C-\uA63F\uA698-\uA69E\uA6F8-\uA6FF\uA78F\uA794-\uA79F\uA7AB-\uA7F7\uA82C-\uA82F\uA83A-\uA83F\uA878-\uA87F\uA8C5-\uA8CD\uA8DA-\uA8DF\uA8FC-\uA8FF\uA954-\uA95E\uA97D-\uA97F\uA9CE\uA9DA-\uA9DD\uA9E0-\uA9FF\uAA37-\uAA3F\uAA4E\uAA4F\uAA5A\uAA5B\uAA7C-\uAA7F\uAAC3-\uAADA\uAAF7-\uAB00\uAB07\uAB08\uAB0F\uAB10\uAB17-\uAB1F\uAB27\uAB2F-\uABBF\uABEE\uABEF\uABFA-\uABFF\uD7A4-\uD7AF\uD7C7-\uD7CA\uD7FC-\uF8FF\uFA6E\uFA6F\uFADA-\uFAFF\uFB07-\uFB12\uFB18-\uFB1C\uFB37\uFB3D\uFB3F\uFB42\uFB45\uFBC2-\uFBD2\uFD40-\uFD4F\uFD90\uFD91\uFDC8-\uFDEF\uFDFE\uFDFF\uFE1A-\uFE1F\uFE27-\uFE2F\uFE53\uFE67\uFE6C-\uFE6F\uFE75\uFEFD-\uFF00\uFFBF-\uFFC1\uFFC8\uFFC9\uFFD0\uFFD1\uFFD8\uFFD9\uFFDD-\uFFDF\uFFE7\uFFEF-\uFFFB\uFFFE\uFFFF]/g; + if(re.test(character)) return; + if(!annot.markup && character==" ") 
return; + // char details for debugging + var charInfo = {}; + charInfo.fontChar = glyph.fontChar; + charInfo.unicode = glyph.unicode; + charInfo.charDims = charDims; + charInfo.isSpace = isSpace; + // add to annotation object + if (!annot.markup) { + annot.markup = []; + annot.markupGeom = []; + annot.chars = []; + annot.spaceSize = []; + } + if (!annot.markup[quad]) { + // annot.markupGeom[quad].brx ensures that only characters are added that are right of the first one in annotation + annot.markupGeom[quad] = {brx: charDims.x + charDims.width}; + annot.markup[quad] = character; + charInfo.character = character; + annot.chars.push(charInfo); + } else { + var markupEnd = annot.markup[quad].length - 1; + var lastCharSpace = (annot.markup[quad].charAt(markupEnd) == ' '); + // exclude double spaces + if (isSpace && lastCharSpace) return; + // exclude previous space if it is further right then current character + var lastChar = annot.chars.slice(-1)[0]; + if(!isSpace && lastCharSpace && typeof lastChar.charDims.x !== 'undefined' && + lastChar.charDims.x/*-charDims.spaceWidth*/>charDims.x+charDims.spaceWidth) { + annot.markup[quad] = annot.markup[quad].substring(0, markupEnd); + annot.chars = annot.chars.splice(0,annot.chars.length-1); + lastChar = annot.chars.slice(-1)[0]; + annot.markupGeom[quad].brx = lastChar.charDims.x + lastChar.charDims.width; + // reset markupEnd and lastCharSpace + markupEnd = annot.markup[quad].length - 1; + lastCharSpace = (annot.markup[quad].charAt(markupEnd) == ' '); + } + + // show current char + /*var rd = function (x) {return Math.round(x*1000)/1000;} + console.log(JSON.stringify([ + glyph.fontChar, rd(charDims.x), rd(charDims.y), + isSpace, lastCharSpace, + rd(charDims.width), rd(annot.markupGeom[quad].brx), rd(charDims.spaceWidth)]));*/ + + // insert space if ... 
+ if (!isSpace && !lastCharSpace && (charDims.spaceWidth != 0 || /^[\u201C\(]*$/.test(character) ) && + charDims.x > annot.markupGeom[quad].brx + charDims.spaceWidth) { + annot.markup[quad] += ' '; + charInfo.character = ' '; + annot.chars.push(charInfo); + } + + // add current character + if (!isSpace && annot.markupGeom[quad].brx < charDims.x + charDims.width) { + annot.markupGeom[quad].brx = charDims.x + charDims.width; + annot.markup[quad] += character; + charInfo.character = character; + annot.chars.push(charInfo); + } + // add space but exclude mini spaces + if (isSpace) { + // late char (a-z or digits) + var lastChar = annot.chars + .filter(function(c) {return /^[\w]*$/.test(c.character);}) + .slice(-1)[0]; + if (typeof lastChar === 'undefined') lastChar = annot.chars.slice(-1)[0]; + // do not add 'mini' spaces that are between to characters of one word + var relativeSize = charDims.width/lastChar.charDims.width; + if(relativeSize<0.2) return; + + if (annot.spaceSize.length>0) { + var sum = annot.spaceSize.reduce(function(a, b) { return a + b }); + var avg = sum / annot.spaceSize.length; + if(charDims.width/avg<0.3) return; + } + annot.spaceSize.push(charDims.width); + // add space + annot.markupGeom[quad].brx = charDims.x + charDims.width; + annot.markup[quad] += character; + charInfo.character = character; + annot.chars.push(charInfo); + } + } + }, + showText: function CanvasGraphics_showText(str, skipTextSelection) { var ctx = this.ctx; @@ -951,6 +1109,7 @@ var CanvasGraphics = (function CanvasGraphicsClosure() { var canvasWidth = 0.0; var vertical = font.vertical; var defaultVMetrics = font.defaultVMetrics; + var show = false; // Type3 fonts - each glyph is a "mini-PDF" if (font.coded) { @@ -960,7 +1119,7 @@ var CanvasGraphics = (function CanvasGraphicsClosure() { ctx.scale(textHScale, 1); - if (textSelection) { + if (textSelection || this.annotations) { this.save(); ctx.scale(1, -1); geom = this.createTextGeometry(); @@ -989,6 +1148,18 @@ var 
CanvasGraphics = (function CanvasGraphicsClosure() { ctx.translate(width, 0); current.x += width * textHScale; + + if (this.annotations && ctx.user2dev) { + // check if glyph is within an annotation + var chDims = this.makeCharDims(transformed[0] * fontSize, width, + font, ctx.mozCurrentTransform); + for (var j = 0; j < this.annotations.length; j++) { + var annot = this.annotations[j]; + var quad = this.charInAnnot(annot, glyph, chDims, ctx.user2dev); + this.updateMarkup(annot, quad, glyph, chDims, false); + } + } + canvasWidth += width; } ctx.restore(); @@ -1004,7 +1175,7 @@ var CanvasGraphics = (function CanvasGraphicsClosure() { else lineWidth /= scale; - if (textSelection) + if (textSelection || this.annotations) geom = this.createTextGeometry(); if (fontSizeScale != 1.0) { @@ -1086,6 +1257,18 @@ var CanvasGraphics = (function CanvasGraphicsClosure() { } } + if (this.annotations && ctx.user2dev) { + // check if glyph is within an annotation + // var charDims = this.makeCharDims(glyph.width * fontSize * .001, x, font, ctx.mozCurrentTransform); + var charDims = this.makeCharDims(width * fontSize * current.fontMatrix[0], x, font, ctx.mozCurrentTransform); + glyph.print = false; + for (var j = 0; j < this.annotations.length; j++) { + var annot = this.annotations[j]; + var quad = this.charInAnnot(annot, glyph, charDims, ctx.user2dev); + this.updateMarkup(annot, quad, glyph, charDims, false); + } + } + x += charWidth; canvasWidth += charWidth; @@ -1124,16 +1307,17 @@ var CanvasGraphics = (function CanvasGraphicsClosure() { var geom; var canvasWidth = 0.0; var textSelection = textLayer ? 
true : false; + var font2dev = []; var vertical = font.vertical; var spacingAccumulator = 0; - if (textSelection) { + if (textSelection || this.annotations) { ctx.save(); this.applyTextTransforms(); + font2dev = ctx.mozCurrentTransform.slice(0, 6); geom = this.createTextGeometry(); ctx.restore(); } - for (var i = 0; i < arrLength; ++i) { var e = arr[i]; if (isNum(e)) { @@ -1144,12 +1328,25 @@ var CanvasGraphics = (function CanvasGraphicsClosure() { current.x += spacingLength; } - if (textSelection) + if (textSelection || this.annotations) spacingAccumulator += spacingLength; + + if (this.annotations && ctx.user2dev) { + var charDims = + this.makeCharDims(spacingLength, + canvasWidth - spacingLength, + font, font2dev); + for (var j = 0; j < this.annotations.length; j++) { + var annot = this.annotations[j]; + var quad = this.charInAnnot(annot, {'fontChar':' ', 'unicode': ' '}, charDims, ctx.user2dev); + this.updateMarkup(annot, quad, {'fontChar':' ', 'unicode': ' '}, charDims, true); + } + } + } else if (isString(e)) { var shownCanvasWidth = this.showText(e, true); - if (textSelection) { + if (textSelection || this.annotations) { canvasWidth += spacingAccumulator + shownCanvasWidth; spacingAccumulator = 0; } diff --git a/src/core.js b/src/core.js old mode 100644 new mode 100755 index 9b0f0530d7aee..2293dd2080f8f --- a/src/core.js +++ b/src/core.js @@ -66,6 +66,10 @@ var Page = (function PageClosure() { this.pageDict = pageDict; this.xref = xref; this.ref = ref; + this.idCounters = { + font: 0, + obj: 0 + }; } Page.prototype = { @@ -159,17 +163,19 @@ var Page = (function PageClosure() { var contentStreamPromise = pdfManager.ensure(this, 'getContentStream', []); var resourcesPromise = pdfManager.ensure(this, 'resources'); + + var partialEvaluator = new PartialEvaluator( + pdfManager, this.xref, handler, + this.pageIndex, 'p' + this.pageIndex + '_', + this.idCounters); + var dataPromises = Promise.all( [contentStreamPromise, resourcesPromise]); 
dataPromises.then(function(data) { var contentStream = data[0]; var resources = data[1]; - var pe = self.pe = new PartialEvaluator( - pdfManager, - self.xref, handler, self.pageIndex, - 'p' + self.pageIndex + '_'); - pdfManager.ensure(pe, 'getOperatorList', + pdfManager.ensure(partialEvaluator, 'getOperatorList', [contentStream, resources]).then( function(opListPromise) { opListPromise.then(function(data) { @@ -181,11 +187,7 @@ var Page = (function PageClosure() { pdfManager.ensure(this, 'getAnnotationsForDraw', []).then( function(annotations) { - var annotationEvaluator = new PartialEvaluator( - pdfManager, self.xref, handler, self.pageIndex, - 'p' + self.pageIndex + '_annotation'); - - pdfManager.ensure(annotationEvaluator, 'getAnnotationsOperatorList', + pdfManager.ensure(partialEvaluator, 'getAnnotationsOperatorList', [annotations]).then( function(opListPromise) { opListPromise.then(function(data) { @@ -242,12 +244,13 @@ var Page = (function PageClosure() { dataPromises.then(function(data) { var contentStream = data[0]; var resources = data[1]; - var pe = new PartialEvaluator( - pdfManager, - self.xref, handler, self.pageIndex, - 'p' + self.pageIndex + '_'); + var partialEvaluator = new PartialEvaluator( + pdfManager, self.xref, handler, + self.pageIndex, 'p' + self.pageIndex + '_', + self.idCounters); - pe.getTextContent(contentStream, resources).then(function(bidiTexts) { + partialEvaluator.getTextContent( + contentStream, resources).then(function(bidiTexts) { textContentPromise.resolve({ bidiTexts: bidiTexts }); @@ -320,6 +323,7 @@ var Page = (function PageClosure() { }, getAnnotationsBase: function Page_getAnnotationsBase() { + if (this.annotationsList) return this.annotationsList; var xref = this.xref; function getInheritableProperty(annotation, name) { var item = annotation; @@ -362,8 +366,29 @@ var Page = (function PageClosure() { var item = {}; item.type = subtype.name; + // list of quad regions + item.quadPoints = []; + var quadpts = 
annotation.get('QuadPoints') || []; + for (var j = 0; j < quadpts.length; j += 8) { + // NB: we don't transform the quadpoints here, but later once we know + // the user space => device space transformation. + var topLeft = {x: quadpts[j + 4], y: quadpts[j + 5]}; + var bottomRight = {x: quadpts[j + 2], y: quadpts[j + 3]}; + var quad = {}; + quad.x = Math.min(topLeft.x, bottomRight.x); + quad.y = Math.min(topLeft.y, bottomRight.y); + quad.width = Math.abs(topLeft.x - bottomRight.x); + quad.height = Math.abs(topLeft.y - bottomRight.y); + item.quadPoints.push(quad); + } var rect = annotation.get('Rect'); item.rect = Util.normalizeRect(rect); + // var topLeftCorner = this.rotatePoint(rect[0], rect[1]); + // var bottomRightCorner = this.rotatePoint(rect[2], rect[3]); + // item.x = Math.min(topLeftCorner.x, bottomRightCorner.x); + // item.y = Math.min(topLeftCorner.y, bottomRightCorner.y); + // item.width = Math.abs(topLeftCorner.x - bottomRightCorner.x); + // item.height = Math.abs(topLeftCorner.y - bottomRightCorner.y); var includeAnnotation = true; switch (subtype.name) { @@ -460,10 +485,18 @@ var Page = (function PageClosure() { case 'Text': var content = annotation.get('Contents'); var title = annotation.get('T'); + var name = annotation.get('Name'); item.content = stringToPDFString(content || ''); item.title = stringToPDFString(title || ''); - item.name = !annotation.has('Name') ? 'Note' : - annotation.get('Name').name; + item.name = name ? 
name.name : 'Note'; + break; + case 'Highlight': + case 'Underline': + var content = annotation.get('Contents'); + var title = annotation.get('T'); + // sometimes there's no content, only markup + if (content) item.content = stringToPDFString(content); + item.title = stringToPDFString(title || ''); break; default: var appearance = getDefaultAnnotationAppearance(annotation); @@ -479,6 +512,20 @@ var Page = (function PageClosure() { }); } } + // sort: x-disjoint rects left->right, otherwise larger rect[1] first + function sortAnnotations(a, b) { + // rect=[x1, y1, x2, y2] + if (a.item.rect[2] < b.item.rect[0]) return -1; + if (b.item.rect[2] < a.item.rect[0]) return 1; + if (a.item.rect[1] < b.item.rect[1]) return 1; + if (a.item.rect[1] > b.item.rect[1]) return -1; + return 0; + } + items.sort(sortAnnotations); + // items.forEach(function(a) {console.log(a.item.type + ': ' + a.item.rect);}) + + this.annotationsList = items; + return items; } }; diff --git a/src/evaluator.js b/src/evaluator.js index 3fcf03c303fd5..1bd97650c8b3e 100644 --- a/src/evaluator.js +++ b/src/evaluator.js @@ -25,7 +25,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { function PartialEvaluator(pdfManager, xref, handler, pageIndex, - uniquePrefix) { + uniquePrefix, idCounters) { this.state = new EvalState(); this.stateStack = []; @@ -34,8 +34,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { this.handler = handler; this.pageIndex = pageIndex; this.uniquePrefix = uniquePrefix; - this.objIdCounter = 0; - this.fontIdCounter = 0; + this.idCounters = idCounters; } // Specifies properties for each command @@ -277,7 +276,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { // If there is no imageMask, create the PDFImage and a lot // of image processing can be done here. 
var uniquePrefix = this.uniquePrefix || ''; - var objId = 'img_' + uniquePrefix + (++this.objIdCounter); + var objId = 'img_' + uniquePrefix + (++this.idCounters.obj); dependencies[objId] = true; retData.args = [objId, w, h]; @@ -510,11 +509,11 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { font = xref.fetchIfRef(font) || fontRes.get(fontName); if (!isDict(font)) { - ++this.fontIdCounter; + ++this.idCounters.font; promise.resolve({ font: { translated: new ErrorFont('Font ' + fontName + ' is not available'), - loadedName: 'g_font_' + this.uniquePrefix + this.fontIdCounter + loadedName: 'g_font_' + this.uniquePrefix + this.idCounters.font }, dependencies: {} }); @@ -525,7 +524,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { if (!loadedName) { // keep track of each font we translated so the caller can // load them asynchronously before calling display on a page - loadedName = 'g_font_' + this.uniquePrefix + (this.fontIdCounter + 1); + loadedName = 'g_font_' + this.uniquePrefix + (this.idCounters.font + 1); font.loadedName = loadedName; var translated; @@ -575,7 +574,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { }); } - ++this.fontIdCounter; + ++this.idCounters.font; } else { promise.resolve({ font: font, diff --git a/test/pdfs/ArabicCIDTrueType.pdf b/test/pdfs/ArabicCIDTrueType.pdf index 0821ccdbd7616..aaec0b8a0fc0f 100644 Binary files a/test/pdfs/ArabicCIDTrueType.pdf and b/test/pdfs/ArabicCIDTrueType.pdf differ diff --git a/test/pdfs/arial_unicode_ab_cidfont.pdf b/test/pdfs/arial_unicode_ab_cidfont.pdf index e5c97b882ac15..0c0bce7c63d43 100644 Binary files a/test/pdfs/arial_unicode_ab_cidfont.pdf and b/test/pdfs/arial_unicode_ab_cidfont.pdf differ diff --git a/test/pdfs/arial_unicode_en_cidfont.pdf b/test/pdfs/arial_unicode_en_cidfont.pdf index 6c90e41e3355a..c64ed4fa7caf2 100644 Binary files a/test/pdfs/arial_unicode_en_cidfont.pdf and b/test/pdfs/arial_unicode_en_cidfont.pdf differ diff --git 
a/test/pdfs/asciihexdecode.pdf b/test/pdfs/asciihexdecode.pdf index f3bd457ecb62d..45aaf427c47cc 100644 Binary files a/test/pdfs/asciihexdecode.pdf and b/test/pdfs/asciihexdecode.pdf differ diff --git a/test/pdfs/basicapi.pdf b/test/pdfs/basicapi.pdf index 31ffcfe9feb70..cd1b41cd02922 100644 Binary files a/test/pdfs/basicapi.pdf and b/test/pdfs/basicapi.pdf differ diff --git a/test/pdfs/noembed-identity.pdf b/test/pdfs/noembed-identity.pdf index a54233f6882e1..490a874844ce9 100644 Binary files a/test/pdfs/noembed-identity.pdf and b/test/pdfs/noembed-identity.pdf differ diff --git a/web/viewer.js b/web/viewer.js index 034f299f3c6c2..225e95ac9874e 100644 --- a/web/viewer.js +++ b/web/viewer.js @@ -1172,23 +1172,31 @@ var PDFView = { }, navigateTo: function pdfViewNavigateTo(dest) { - if (typeof dest === 'string') - dest = this.destinations[dest]; - if (!(dest instanceof Array)) - return; // invalid destination - // dest array looks like that: