path: root/experimental/PdfViewer/pdfparser/native/SkNativeParsedPDF.cpp
author     edisonn@google.com <edisonn@google.com@2bbb7eff-a529-9590-31e7-b0007b416f81>  2013-07-10 17:09:50 +0000
committer  edisonn@google.com <edisonn@google.com@2bbb7eff-a529-9590-31e7-b0007b416f81>  2013-07-10 17:09:50 +0000
commit  571c70b95f56e22b5a7d6f4f288aa6c9a925a64f (patch)
tree    e0d3377d7e373350706d46722af8fe050abda9d3 /experimental/PdfViewer/pdfparser/native/SkNativeParsedPDF.cpp
parent  89fa4b9ee6bc6039781acbdb6c097a41f894ea1c (diff)
Native PDF parser implementation - don't try it on PDFs that are not generated by Skia, Chrome Print Preview, or Chrome Save As PDF - it will crash, since missing xref tables, PDFs with updates, and other features are not supported yet.
Review URL: https://codereview.chromium.org/18323019 git-svn-id: http://skia.googlecode.com/svn/trunk@9962 2bbb7eff-a529-9590-31e7-b0007b416f81
Diffstat (limited to 'experimental/PdfViewer/pdfparser/native/SkNativeParsedPDF.cpp')
-rw-r--r--  experimental/PdfViewer/pdfparser/native/SkNativeParsedPDF.cpp  466
1 file changed, 462 insertions(+), 4 deletions(-)
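
For orientation, here is a minimal, hypothetical sketch of how the class added in this change might be driven. The constructor, pages() and drawPage() names are taken from the diff below; the SkCanvas.h include, the helper function and the file path are assumptions for illustration only, not part of this commit:

    #include "SkNativeParsedPDF.h"
    #include "SkCanvas.h"   // assumed; any SkCanvas-backed surface works

    static void renderFirstPage(SkCanvas* canvas) {
        // Only PDFs generated by Skia or Chrome (Print Preview / Save As PDF)
        // are expected to parse; files with missing xref tables or incremental
        // updates are not supported yet and may crash.
        SkNativeParsedPDF doc("/tmp/skia_generated.pdf");
        if (doc.pages() > 0) {
            doc.drawPage(0, canvas);   // tokenizes the page content and replays it
        }
    }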
diff --git a/experimental/PdfViewer/pdfparser/native/SkNativeParsedPDF.cpp b/experimental/PdfViewer/pdfparser/native/SkNativeParsedPDF.cpp
index 5d8683899b..04a1c50caf 100644
--- a/experimental/PdfViewer/pdfparser/native/SkNativeParsedPDF.cpp
+++ b/experimental/PdfViewer/pdfparser/native/SkNativeParsedPDF.cpp
@@ -1,11 +1,469 @@
-
#include "SkNativeParsedPDF.h"
+#include "SkPdfNativeTokenizer.h"
+#include "SkPdfBasics.h"
+#include "SkPdfParser.h"
+#include "SkPdfObject.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "SkPdfFileTrailerDictionary_autogen.h"
+#include "SkPdfCatalogDictionary_autogen.h"
+#include "SkPdfPageObjectDictionary_autogen.h"
+#include "SkPdfPageTreeNodeDictionary_autogen.h"
+#include "SkPdfMapper_autogen.h"
+
+
+
+long getFileSize(const char* filename)
+{
+ struct stat stat_buf;
+ int rc = stat(filename, &stat_buf);
+ return rc == 0 ? stat_buf.st_size : -1;
+}
+
+unsigned char* lineHome(unsigned char* start, unsigned char* current) {
+ while (current > start && !isPdfEOL(*(current - 1))) {
+ current--;
+ }
+ return current;
+}
+
+unsigned char* previousLineHome(unsigned char* start, unsigned char* current) {
+ if (current > start && isPdfEOL(*(current - 1))) {
+ current--;
+ }
+
+ // allows CR+LF, LF+CR but not two CR+CR or LF+LF
+ if (current > start && isPdfEOL(*(current - 1)) && *current != *(current - 1)) {
+ current--;
+ }
+
+ while (current > start && !isPdfEOL(*(current - 1))) {
+ current--;
+ }
+
+ return current;
+}
+
+unsigned char* ignoreLine(unsigned char* current, unsigned char* end) {
+ while (current < end && !isPdfEOL(*current)) {
+ current++;
+ }
+ current++;
+ if (current < end && isPdfEOL(*current) && *current != *(current - 1)) {
+ current++;
+ }
+ return current;
+}
+
+
+// TODO(edisonn): NYI
+// TODO(edisonn): 3 constructors: from URL, from stream, from file ...
+// TODO(edisonn): write one that accepts errors in the file and ignores/fixes them
+// TODO(edisonn): testing:
+// 1) run on a lot of files
+// 2) recoverable corrupt file: remove endobj, endstream, remove other keywords, use other white spaces, insert comments randomly, ...
+// 3) irrecoverable corrupt file
+SkNativeParsedPDF::SkNativeParsedPDF(const char* path) : fAllocator(new SkPdfAllocator()) {
+ FILE* file = fopen(path, "r");
+ fContentLength = getFileSize(path);
+ fFileContent = new unsigned char[fContentLength];
+ fread(fFileContent, fContentLength, 1, file);
+ fclose(file);
+ file = NULL;
+
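+ // Per the PDF spec, a file ends with three lines:
+ //     startxref
+ //     <byte offset of the last cross-reference section>
+ //     %%EOF
+ // so walk backwards from the end of the buffer to pick up those lines.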
+ unsigned char* eofLine = lineHome(fFileContent, fFileContent + fContentLength - 1);
+ unsigned char* xrefByteOffsetLine = previousLineHome(fFileContent, eofLine);
+ unsigned char* xrefstartKeywordLine = previousLineHome(fFileContent, xrefByteOffsetLine);
+
+ if (strncmp((const char*)xrefstartKeywordLine, "startxref", strlen("startxref")) != 0) {
+ // TODO(edisonn): report/issue
+ }
+
+ long xrefByteOffset = atol((const char*)xrefByteOffsetLine);
+
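+ // Walk the chain of cross-reference sections: each trailer may carry a /Prev
+ // entry with the byte offset of the previous xref section (incremental updates);
+ // readTrailer() returns -1 when there is no /Prev, which ends the loop.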
+ bool storeCatalog = true;
+ while (xrefByteOffset >= 0) {
+ unsigned char* trailerStart = readCrossReferenceSection(fFileContent + xrefByteOffset, xrefstartKeywordLine);
+ xrefByteOffset = readTrailer(trailerStart, xrefstartKeywordLine, storeCatalog);
+ storeCatalog = false;
+ }
+
+ // TODO(edisonn): warn/error expect fObjects[fRefCatalogId].fGeneration == fRefCatalogGeneration
+ // TODO(edisonn): security, verify that SkPdfCatalogDictionary is indeed using mapper
+ // load catalog
+ fRootCatalog = (SkPdfCatalogDictionary*)resolveReference(fRootCatalogRef);
+ SkPdfPageTreeNodeDictionary* tree = fRootCatalog->Pages(this);
-SkNativeParsedPDF::SkNativeParsedPDF() {
- // TODO(edisonn): Auto-generated constructor stub
+ fillPages(tree);
+ // now actually read all objects if we want, or do it lazily
+ // and resolve references?... or not ...
}
+// TODO(edisonn): NYI
SkNativeParsedPDF::~SkNativeParsedPDF() {
- // TODO(edisonn): Auto-generated destructor stub
+ delete[] fFileContent;
+ delete fAllocator;
+}
+
+unsigned char* SkNativeParsedPDF::readCrossReferenceSection(unsigned char* xrefStart, unsigned char* trailerEnd) {
+ unsigned char* current = ignoreLine(xrefStart, trailerEnd); // TODO(edisonn): verify next keyword is "xref", use nextObject here
+
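+ // A cross-reference section is a sequence of subsections. Each subsection starts
+ // with two integers, "<first object id> <entry count>", followed by that many
+ // entries of the form "<offset> <generation> n|f" ('n' = in use, 'f' = free).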
+ SkPdfObject token;
+ while (current < trailerEnd) {
+ token.reset();
+ unsigned char* previous = current;
+ current = nextObject(current, trailerEnd, &token, NULL);
+ if (!token.isInteger()) {
+ return previous;
+ }
+
+ int startId = token.intValue();
+ token.reset();
+ current = nextObject(current, trailerEnd, &token, NULL);
+
+ if (!token.isInteger()) {
+ // TODO(edisonn): report/warning
+ return current;
+ }
+
+ int entries = token.intValue();
+
+ for (int i = 0; i < entries; i++) {
+ token.reset();
+ current = nextObject(current, trailerEnd, &token, NULL);
+ if (!token.isInteger()) {
+ // TODO(edisonn): report/warning
+ return current;
+ }
+ int offset = token.intValue();
+
+ token.reset();
+ current = nextObject(current, trailerEnd, &token, NULL);
+ if (!token.isInteger()) {
+ // TODO(edisonn): report/warning
+ return current;
+ }
+ int generation = token.intValue();
+
+ token.reset();
+ current = nextObject(current, trailerEnd, &token, NULL);
+ if (!token.isKeyword() || token.len() != 1 || (*token.c_str() != 'f' && *token.c_str() != 'n')) {
+ // TODO(edisonn): report/warning
+ return current;
+ }
+
+ addCrossSectionInfo(startId + i, generation, offset, *token.c_str() == 'f');
+ }
+ }
+ // TODO(edisonn): it should never get here? there is no trailer?
+ return current;
+}
+
+long SkNativeParsedPDF::readTrailer(unsigned char* trailerStart, unsigned char* trailerEnd, bool storeCatalog) {
+ unsigned char* current = ignoreLine(trailerStart, trailerEnd); // TODO(edisonn): verify next keyword is "trailer", use nextObject here
+
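+ // The trailer dictionary holds, among others, /Root (a reference to the document
+ // catalog) and, for files with incremental updates, /Prev (the byte offset of the
+ // previous cross-reference section).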
+ SkPdfObject token;
+ current = nextObject(current, trailerEnd, &token, fAllocator);
+ SkPdfFileTrailerDictionary* trailer = (SkPdfFileTrailerDictionary*)&token;
+
+ if (storeCatalog) {
+ const SkPdfObject* ref = trailer->Root(NULL);
+ if (ref == NULL || !ref->isReference()) {
+ // TODO(edisonn): oops, we have to fix the corrupt pdf file
+ return -1;
+ }
+ fRootCatalogRef = ref;
+ }
+
+ if (trailer->has_Prev()) {
+ return trailer->Prev(NULL);
+ }
+
+ return -1;
+}
+
+void SkNativeParsedPDF::addCrossSectionInfo(int id, int generation, int offset, bool isFreed) {
+ // TODO(edisonn): security here
+ while (fObjects.count() < id + 1) {
+ reset(fObjects.append());
+ }
+
+ fObjects[id].fOffset = offset;
+ fObjects[id].fObj = NULL;
+}
+
+SkPdfObject* SkNativeParsedPDF::readObject(int id/*, int expectedGeneration*/) const {
+ long startOffset = fObjects[id].fOffset;
+ //long endOffset = fObjects[id].fOffsetEnd;
+ // TODO(edisonn): use hinted endOffset
+ // TODO(edisonn): current implementation will result in a lot of memory usage
+ // to decrease memory usage, we either need to be smart and know where objects end, and we will
+ // allocate only the chunks needed, or the tokenizer will not make copies, but then it needs to
+ // cache the results so it does not go twice on the same buffer
+ unsigned char* current = fFileContent + startOffset;
+ unsigned char* end = fFileContent + fContentLength;
+
+ SkPdfNativeTokenizer tokenizer(current, end - current, fMapper, fAllocator);
+
+ SkPdfObject idObj;
+ SkPdfObject generationObj;
+ SkPdfObject objKeyword;
+ SkPdfObject* dict = fAllocator->allocObject();
+
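+ // An indirect object has the form "<id> <generation> obj ... endobj"; read the
+ // three header tokens first, then the object body itself.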
+ current = nextObject(current, end, &idObj, NULL);
+ if (current >= end) {
+ // TODO(edisonn): report warning/error
+ return NULL;
+ }
+
+ current = nextObject(current, end, &generationObj, NULL);
+ if (current >= end) {
+ // TODO(edisonn): report warning/error
+ return NULL;
+ }
+
+ current = nextObject(current, end, &objKeyword, NULL);
+ if (current >= end) {
+ // TODO(edisonn): report warning/error
+ return NULL;
+ }
+
+ if (!idObj.isInteger() || !generationObj.isInteger() || id != idObj.intValue()/* || generation != generationObj.intValue()*/) {
+ // TODO(edisonn): report warning/error
+ }
+
+ if (!objKeyword.isKeyword() || strcmp(objKeyword.c_str(), "obj") != 0) {
+ // TODO(edisonn): report warning/error
+ }
+
+ current = nextObject(current, end, dict, fAllocator);
+
+ // TODO(edisonn): report warning/error - verify last token is endobj
+
+ return dict;
+}
+
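+// The page tree is a hierarchy of Pages nodes whose /Kids arrays contain either
+// further Pages nodes or Page leaves; recurse through it and collect the leaves
+// in document order into fPages.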
+void SkNativeParsedPDF::fillPages(SkPdfPageTreeNodeDictionary* tree) {
+ const SkPdfArray* kids = tree->Kids(this);
+ if (kids == NULL) {
+ *fPages.append() = (SkPdfPageObjectDictionary*)tree;
+ return;
+ }
+
+ int cnt = kids->size();
+ for (int i = 0; i < cnt; i++) {
+ const SkPdfObject* obj = resolveReference(kids->objAtAIndex(i));
+ if (fMapper->mapPageObjectDictionary(obj) != kPageObjectDictionary_SkPdfObjectType) {
+ *fPages.append() = (SkPdfPageObjectDictionary*)obj;
+ } else {
+ // TODO(edisonn): verify that it is a page tree indeed
+ fillPages((SkPdfPageTreeNodeDictionary*)obj);
+ }
+ }
+}
+
+int SkNativeParsedPDF::pages() const {
+ return fPages.count();
+}
+
+SkPdfResourceDictionary* SkNativeParsedPDF::pageResources(int page) {
+ return fPages[page]->Resources(this);
+}
+
+// TODO(edisonn): Partially implemented. MediaBox is inheritable, so walk up the page tree via /Parent until a value is found. Move the logic directly into the code generator for inheritable and default values?
+SkRect SkNativeParsedPDF::MediaBox(int page) const {
+ SkPdfPageObjectDictionary* current = fPages[page];
+ while (!current->has_MediaBox() && current->has_Parent()) {
+ current = (SkPdfPageObjectDictionary*)current->Parent(this);
+ }
+ if (current) {
+ return current->MediaBox(this);
+ }
+ return SkRect::MakeEmpty();
+}
+
+// TODO(edisonn): Contents may be a stream or an array of streams ... ? for now only the single-stream case is handled
+SkPdfNativeTokenizer* SkNativeParsedPDF::tokenizerOfPage(int page) const {
+ if (fPages[page]->isContentsAStream(this)) {
+ return tokenizerOfStream(fPages[page]->getContentsAsStream(this));
+ } else {
+ // TODO(edisonn): NYI, we need to concatenate all streams in the array or make the tokenizer smart
+ // so we don't allocate new memory
+ return NULL;
+ }
+}
+
+SkPdfNativeTokenizer* SkNativeParsedPDF::tokenizerOfStream(SkPdfObject* stream) const {
+ if (stream == NULL) {
+ return NULL;
+ }
+
+ return new SkPdfNativeTokenizer(stream, fMapper, fAllocator);
+}
+
+// TODO(edisonn): NYI
+SkPdfNativeTokenizer* SkNativeParsedPDF::tokenizerOfBuffer(unsigned char* buffer, size_t len) const {
+ // Warning: does not track two calls on the same buffer! The buffer is updated!
+ // make a clean copy if needed!
+ return new SkPdfNativeTokenizer(buffer, len, fMapper, fAllocator);
+}
+
+size_t SkNativeParsedPDF::objects() const {
+ return fObjects.count();
+}
+
+SkPdfObject* SkNativeParsedPDF::object(int i) {
+ SkASSERT(!(i < 0 || i >= fObjects.count()));
+
+ if (i < 0 || i >= fObjects.count()) {
+ return NULL;
+ }
+
+ if (fObjects[i].fObj == NULL) {
+ // TODO(edisonn): when we read the cross reference sections, store the start of the next object
+ // and fill fOffsetEnd
+ fObjects[i].fObj = readObject(i);
+ }
+
+ return fObjects[i].fObj;
+}
+
+const SkPdfMapper* SkNativeParsedPDF::mapper() const {
+ return fMapper;
+}
+
+SkPdfReal* SkNativeParsedPDF::createReal(double value) const {
+ SkPdfObject* obj = fAllocator->allocObject();
+ SkPdfObject::makeReal(value, obj);
+ return (SkPdfReal*)obj;
+}
+
+SkPdfInteger* SkNativeParsedPDF::createInteger(int value) const {
+ SkPdfObject* obj = fAllocator->allocObject();
+ SkPdfObject::makeInteger(value, obj);
+ return (SkPdfInteger*)obj;
+}
+
+SkPdfString* SkNativeParsedPDF::createString(unsigned char* sz, size_t len) const {
+ SkPdfObject* obj = fAllocator->allocObject();
+ SkPdfObject::makeString(sz, len, obj);
+ return (SkPdfString*)obj;
+}
+
+PdfContext* gPdfContext = NULL;
+
+void SkNativeParsedPDF::drawPage(int page, SkCanvas* canvas) {
+ SkPdfNativeTokenizer* tokenizer = tokenizerOfPage(page);
+
+ PdfContext pdfContext(this);
+ pdfContext.fOriginalMatrix = SkMatrix::I();
+ pdfContext.fGraphicsState.fResources = pageResources(page);
+
+ gPdfContext = &pdfContext;
+
+ // TODO(edisonn): get matrix stuff right.
+ // TODO(edisonn): add DPI/scale/zoom.
+ SkScalar z = SkIntToScalar(0);
+ SkRect rect = MediaBox(page);
+ SkScalar w = rect.width();
+ SkScalar h = rect.height();
+
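+ // PDF user space has its origin at the bottom-left corner with y growing upward,
+ // while Skia's origin is at the top-left with y growing downward; map the page
+ // corners onto each other to build a matrix that flips the y axis.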
+ SkPoint pdfSpace[4] = {SkPoint::Make(z, z), SkPoint::Make(w, z), SkPoint::Make(w, h), SkPoint::Make(z, h)};
+// SkPoint skiaSpace[4] = {SkPoint::Make(z, h), SkPoint::Make(w, h), SkPoint::Make(w, z), SkPoint::Make(z, z)};
+
+ // TODO(edisonn): add flag for this app to create a surrounding buffer zone
+ // TODO(edisonn): add flag for no clipping.
+ // Use a larger image to make sure we do not draw anything outside of the page;
+ // could be used in tests.
+
+#ifdef PDF_DEBUG_3X
+ SkPoint skiaSpace[4] = {SkPoint::Make(w+z, h+h), SkPoint::Make(w+w, h+h), SkPoint::Make(w+w, h+z), SkPoint::Make(w+z, h+z)};
+#else
+ SkPoint skiaSpace[4] = {SkPoint::Make(z, h), SkPoint::Make(w, h), SkPoint::Make(w, z), SkPoint::Make(z, z)};
+#endif
+ //SkPoint pdfSpace[2] = {SkPoint::Make(z, z), SkPoint::Make(w, h)};
+ //SkPoint skiaSpace[2] = {SkPoint::Make(w, z), SkPoint::Make(z, h)};
+
+ //SkPoint pdfSpace[2] = {SkPoint::Make(z, z), SkPoint::Make(z, h)};
+ //SkPoint skiaSpace[2] = {SkPoint::Make(z, h), SkPoint::Make(z, z)};
+
+ //SkPoint pdfSpace[3] = {SkPoint::Make(z, z), SkPoint::Make(z, h), SkPoint::Make(w, h)};
+ //SkPoint skiaSpace[3] = {SkPoint::Make(z, h), SkPoint::Make(z, z), SkPoint::Make(w, 0)};
+
+ SkAssertResult(pdfContext.fOriginalMatrix.setPolyToPoly(pdfSpace, skiaSpace, 4));
+ SkTraceMatrix(pdfContext.fOriginalMatrix, "Original matrix");
+
+
+ pdfContext.fGraphicsState.fMatrix = pdfContext.fOriginalMatrix;
+ pdfContext.fGraphicsState.fMatrixTm = pdfContext.fGraphicsState.fMatrix;
+ pdfContext.fGraphicsState.fMatrixTlm = pdfContext.fGraphicsState.fMatrix;
+
+ canvas->setMatrix(pdfContext.fOriginalMatrix);
+
+#ifndef PDF_DEBUG_NO_PAGE_CLIPING
+ canvas->clipRect(SkRect::MakeXYWH(z, z, w, h), SkRegion::kIntersect_Op, true);
+#endif
+
+// erase with red before?
+// SkPaint paint;
+// paint.setColor(SK_ColorRED);
+// canvas->drawRect(rect, paint);
+
+ PdfMainLooper looper(NULL, tokenizer, &pdfContext, canvas);
+ looper.loop();
+
+ delete tokenizer;
+
+ canvas->flush();
+}
+
+SkPdfAllocator* SkNativeParsedPDF::allocator() const {
+ return fAllocator;
+}
+
+SkPdfObject* SkNativeParsedPDF::resolveReference(SkPdfObject* ref) const {
+ return (SkPdfObject*)resolveReference((const SkPdfObject*)ref);
+}
+
+// TODO(edisonn): fix infinite loop if ref to itself!
+// TODO(edisonn): perf, fix refs at load, and resolve will simply return fResolvedReference?
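+// A reference object stands for "<id> <generation> R" in the PDF; resolve it by
+// loading the target object through the xref table and cache the result in
+// fObjects[id].fResolvedReference so repeated lookups are cheap.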
+SkPdfObject* SkNativeParsedPDF::resolveReference(const SkPdfObject* ref) const {
+ if (ref && ref->isReference()) {
+ int id = ref->referenceId();
+ // TODO(edisonn): generation/updates not supported now
+ //int gen = ref->referenceGeneration();
+
+ SkASSERT(!(id < 0 || id >= fObjects.count()));
+
+ if (id < 0 || id >= fObjects.count()) {
+ return NULL;
+ }
+
+ // TODO(edisonn): verify id and gen expected
+
+ if (fObjects[id].fResolvedReference != NULL) {
+ return fObjects[id].fResolvedReference;
+ }
+
+ if (fObjects[id].fObj == NULL) {
+ fObjects[id].fObj = readObject(id);
+ }
+
+ if (fObjects[id].fResolvedReference == NULL) {
+ if (!fObjects[id].fObj->isReference()) {
+ fObjects[id].fResolvedReference = fObjects[id].fObj;
+ } else {
+ fObjects[id].fResolvedReference = resolveReference(fObjects[id].fObj);
+ }
+ }
+
+ return fObjects[id].fResolvedReference;
+ }
+ // TODO(edisonn): fix the mess with const, probably we need to remove it pretty much everywhere
+ return (SkPdfObject*)ref;
}