From 78b38b130deb8bcfa41611039875ce0162542ac1 Mon Sep 17 00:00:00 2001
From: "edisonn@google.com"
 <edisonn@google.com@2bbb7eff-a529-9590-31e7-b0007b416f81>
Date: Mon, 15 Jul 2013 18:20:58 +0000
Subject: pdfviewer: native inline images support

Review URL: https://codereview.chromium.org/19243003

git-svn-id: http://skia.googlecode.com/svn/trunk@10087 2bbb7eff-a529-9590-31e7-b0007b416f81
---
 .../pdfparser/native/SkPdfNativeTokenizer.cpp      | 190 +++++++++++++++++++--
 1 file changed, 176 insertions(+), 14 deletions(-)

(limited to 'experimental/PdfViewer/pdfparser/native/SkPdfNativeTokenizer.cpp')

diff --git a/experimental/PdfViewer/pdfparser/native/SkPdfNativeTokenizer.cpp b/experimental/PdfViewer/pdfparser/native/SkPdfNativeTokenizer.cpp
index de49e35f11..babfedfb05 100644
--- a/experimental/PdfViewer/pdfparser/native/SkPdfNativeTokenizer.cpp
+++ b/experimental/PdfViewer/pdfparser/native/SkPdfNativeTokenizer.cpp
@@ -4,6 +4,30 @@
 #include "SkPdfConfig.h"
 
 #include "SkPdfStreamCommonDictionary_autogen.h"
+#include "SkPdfImageDictionary_autogen.h"
+
+// TODO(edisonn): perf!!!
+// there could be 0s between start and end! but not in the needle.
+static char* strrstrk(char* hayStart, char* hayEnd, const char* needle) {
+    int needleLen = strlen(needle);
+    if ((isPdfWhiteSpaceOrPdfDelimiter(*(hayStart+needleLen)) || (hayStart+needleLen == hayEnd)) &&
+            strncmp(hayStart, needle, needleLen) == 0) {
+        return hayStart;
+    }
+
+    hayStart++;
+
+    while (hayStart < hayEnd) {
+        if (isPdfWhiteSpaceOrPdfDelimiter(*(hayStart-1)) &&
+                (isPdfWhiteSpaceOrPdfDelimiter(*(hayStart+needleLen)) || (hayStart+needleLen == hayEnd)) &&
+                strncmp(hayStart, needle, needleLen) == 0) {
+            return hayStart;
+        }
+        hayStart++;
+    }
+    return NULL;
+}
+
 
 static unsigned char* skipPdfWhiteSpaces(unsigned char* start, unsigned char* end) {
     while (start < end && isPdfWhiteSpace(*start)) {
@@ -68,6 +92,7 @@ static unsigned char* readArray(unsigned char* start, unsigned char* end, SkPdfO
         }
         array->appendInArray(newObj);
     }
+    printf("break;\n");  // DO NOT SUBMIT!
     // TODO(edisonn): report not reached, we should never get here
     // TODO(edisonn): there might be a bug here, enable an assert and run it on files
     // or it might be that the files were actually corrupted
@@ -458,6 +483,11 @@ static unsigned char* readStream(unsigned char* start, unsigned char* end, SkPdf
         start += 2;
     } else if (start[0] == kLF_PdfWhiteSpace) {
         start += 1;
+    } else if (isPdfWhiteSpace(start[0])) {
+        start += 1;
+    } else {
+        // TODO(edisonn): warn it should be isPdfDelimiter(start[0])) ?
+        // TODO(edisonn): warning?
     }
 
     SkPdfStreamCommonDictionary* stream = (SkPdfStreamCommonDictionary*) dict;
@@ -475,17 +505,12 @@ static unsigned char* readStream(unsigned char* start, unsigned char* end, SkPdf
     if (length < 0) {
         // scan the buffer, until we find first endstream
         // TODO(edisonn): all buffers must have a 0 at the end now,
-        // TODO(edisonn): hack (mark end of content with 0)
-        unsigned char lastCh = *end;
-        *end = '\0';
-        //SkASSERT(*end == '\0');
-        unsigned char* endstream = (unsigned char*)strstr((const char*)start, "endstream");
-        *end = lastCh;
+        unsigned char* endstream = (unsigned char*)strrstrk((char*)start, (char*)end, "endstream");
 
         if (endstream) {
             length = endstream - start;
             if (*(endstream-1) == kLF_PdfWhiteSpace) length--;
-            if (*(endstream-1) == kCR_PdfWhiteSpace) length--;
+            if (*(endstream-2) == kCR_PdfWhiteSpace) length--;
         }
     }
     if (length >= 0) {
@@ -507,6 +532,37 @@ static unsigned char* readStream(unsigned char* start, unsigned char* end, SkPdf
     return start;
 }
 
+static unsigned char* readInlineImageStream(unsigned char* start, unsigned char* end, SkPdfImageDictionary* inlineImage, SkNativeParsedPDF* doc) {
+    // We already processed ID keyword, and we should be positioned immediately after it
+
+    // TODO(edisonn): security: read after end check, or make buffers with extra 2 bytes
+    if (start[0] == kCR_PdfWhiteSpace && start[1] == kLF_PdfWhiteSpace) {
+        start += 2;
+    } else if (start[0] == kLF_PdfWhiteSpace) {
+        start += 1;
+    } else if (isPdfWhiteSpace(start[0])) {
+        start += 1;
+    } else {
+        SkASSERT(isPdfDelimiter(start[0]));
+        // TODO(edisonn): warning?
+    }
+
+    unsigned char* endstream = (unsigned char*)strrstrk((char*)start, (char*)end, "EI");
+    unsigned char* endEI = endstream ? endstream + 2 : NULL;  // 2 == strlen("EI")
+
+    if (endstream) {
+        int length = endstream - start;
+        if (*(endstream-1) == kLF_PdfWhiteSpace) length--;
+        if (*(endstream-2) == kCR_PdfWhiteSpace) length--;
+        inlineImage->addStream(start, (size_t)length);
+    } else {
+        // TODO(edisonn): report error in inline image stream (ID-EI) section
+        // TODO(edisonn): based on filter, try to ignore a missing EI, and read data properly
+        return end;
+    }
+    return endEI;
+}
+
 static unsigned char* readDictionary(unsigned char* start, unsigned char* end, SkPdfObject* dict, SkPdfAllocator* allocator, SkNativeParsedPDF* doc) {
     SkPdfObject::makeEmptyDictionary(dict);
 
@@ -563,11 +619,16 @@ static unsigned char* readDictionary(unsigned char* start, unsigned char* end, S
 
     // now we should expect >>
     start = skipPdfWhiteSpaces(start, end);
-    start = endOfPdfToken(start, end);  // >
-    start = endOfPdfToken(start, end);  // >
-
-    // TODO(edisonn): read stream ... put dict and stream in a struct, and have a pointer to struct ...
-    // or alocate 2 objects, and if there is no stream, free it to be used by someone else? or just leave it ?
+    if (*start != kClosedInequityBracket_PdfDelimiter) {
+        // TODO(edisonn): report/warning
+    }
+    *start = '\0';
+    start++;  // skip >
+    if (*start != kClosedInequityBracket_PdfDelimiter) {
+        // TODO(edisonn): report/warning
+    }
+    *start = '\0';
+    start++;  // skip >
 
     start = readStream(start, end, dict, doc);
 
@@ -604,6 +665,7 @@ unsigned char* nextObject(unsigned char* start, unsigned char* end, SkPdfObject*
             case kOpenedInequityBracket_PdfDelimiter:
                 *start = '\0';
                 if (end > start + 1 && start[1] == kOpenedInequityBracket_PdfDelimiter) {
+                    start[1] = '\0';  // optional
                     // TODO(edisonn): pass here the length somehow?
                     return readDictionary(start + 2, end, token, allocator, doc);  // skip <<
                 } else {
@@ -688,7 +750,7 @@ SkPdfNativeTokenizer::SkPdfNativeTokenizer(SkPdfObject* objWithStream, const SkP
     size_t len = 0;
     objWithStream->GetFilteredStreamRef(&buffer, &len, fAllocator);
     // TODO(edisonn): hack, find end of object
-    char* endobj = strstr((char*)buffer, "endobj");
+    char* endobj = strrstrk((char*)buffer, (char*)buffer + len, "endobj");
     if (endobj) {
         len = endobj - (char*)buffer + strlen("endobj");
     }
@@ -699,7 +761,7 @@ SkPdfNativeTokenizer::SkPdfNativeTokenizer(SkPdfObject* objWithStream, const SkP
 
 SkPdfNativeTokenizer::SkPdfNativeTokenizer(unsigned char* buffer, int len, const SkPdfMapper* mapper, SkPdfAllocator* allocator, SkNativeParsedPDF* doc) : fDoc(doc), fMapper(mapper), fAllocator(allocator), fEmpty(false), fHasPutBack(false) {
     // TODO(edisonn): hack, find end of object
-    char* endobj = strstr((char*)buffer, "endobj");
+    char* endobj = strrstrk((char*)buffer, (char*)buffer + len, "endobj");
     if (endobj) {
         len = endobj - (char*)buffer + strlen("endobj");
     }
@@ -775,3 +837,103 @@ bool SkPdfNativeTokenizer::readToken(PdfToken* token) {
 
     return readTokenCore(token);
 }
+
+#define DECLARE_PDF_NAME(longName) SkPdfName longName((char*)#longName)
+
+// keys
+DECLARE_PDF_NAME(BitsPerComponent);
+DECLARE_PDF_NAME(ColorSpace);
+DECLARE_PDF_NAME(Decode);
+DECLARE_PDF_NAME(DecodeParms);
+DECLARE_PDF_NAME(Filter);
+DECLARE_PDF_NAME(Height);
+DECLARE_PDF_NAME(ImageMask);
+DECLARE_PDF_NAME(Intent); // PDF 1.1 - the key, or the abreviations?
+DECLARE_PDF_NAME(Interpolate);
+DECLARE_PDF_NAME(Width);
+
+// values
+DECLARE_PDF_NAME(DeviceGray);
+DECLARE_PDF_NAME(DeviceRGB);
+DECLARE_PDF_NAME(DeviceCMYK);
+DECLARE_PDF_NAME(Indexed);
+DECLARE_PDF_NAME(ASCIIHexDecode);
+DECLARE_PDF_NAME(ASCII85Decode);
+DECLARE_PDF_NAME(LZWDecode);
+DECLARE_PDF_NAME(FlateDecode);  // PDF 1.2
+DECLARE_PDF_NAME(RunLengthDecode);
+DECLARE_PDF_NAME(CCITTFaxDecode);
+DECLARE_PDF_NAME(DCTDecode);
+
+#define HANDLE_NAME_ABBR(obj,longName,shortName) if (obj->isName(#shortName)) return &longName;
+
+
+static SkPdfObject* inlineImageKeyAbbreviationExpand(SkPdfObject* key) {
+    if (!key || !key->isName()) {
+        return key;
+    }
+
+    // TODO(edisonn): use autogenerated code!
+    HANDLE_NAME_ABBR(key, BitsPerComponent, BPC);
+    HANDLE_NAME_ABBR(key, ColorSpace, CS);
+    HANDLE_NAME_ABBR(key, Decode, D);
+    HANDLE_NAME_ABBR(key, DecodeParms, DP);
+    HANDLE_NAME_ABBR(key, Filter, F);
+    HANDLE_NAME_ABBR(key, Height, H);
+    HANDLE_NAME_ABBR(key, ImageMask, IM);
+//    HANDLE_NAME_ABBR(key, Intent, );
+    HANDLE_NAME_ABBR(key, Interpolate, I);
+    HANDLE_NAME_ABBR(key, Width, W);
+
+    return key;
+}
+
+static SkPdfObject* inlineImageValueAbbreviationExpand(SkPdfObject* value) {
+    if (!value || !value->isName()) {
+        return value;
+    }
+
+    // TODO(edisonn): use autogenerated code!
+    HANDLE_NAME_ABBR(value, DeviceGray, G);
+    HANDLE_NAME_ABBR(value, DeviceRGB, RGB);
+    HANDLE_NAME_ABBR(value, DeviceCMYK, CMYK);
+    HANDLE_NAME_ABBR(value, Indexed, I);
+    HANDLE_NAME_ABBR(value, ASCIIHexDecode, AHx);
+    HANDLE_NAME_ABBR(value, ASCII85Decode, A85);
+    HANDLE_NAME_ABBR(value, LZWDecode, LZW);
+    HANDLE_NAME_ABBR(value, FlateDecode, Fl);  // (PDF 1.2)
+    HANDLE_NAME_ABBR(value, RunLengthDecode, RL);
+    HANDLE_NAME_ABBR(value, CCITTFaxDecode, CCF);
+    HANDLE_NAME_ABBR(value, DCTDecode, DCT);
+
+    return value;
+}
+
+SkPdfImageDictionary* SkPdfNativeTokenizer::readInlineImage() {
+    // BI already processed
+    fUncompressedStream = skipPdfWhiteSpaces(fUncompressedStream, fUncompressedStreamEnd);
+    if (fUncompressedStream >= fUncompressedStreamEnd) {
+        return NULL;
+    }
+
+    SkPdfImageDictionary* inlineImage = (SkPdfImageDictionary*)fAllocator->allocObject();
+    SkPdfObject::makeEmptyDictionary(inlineImage);
+
+    while (fUncompressedStream < fUncompressedStreamEnd) {
+        SkPdfObject* key = fAllocator->allocObject();
+        fUncompressedStream = nextObject(fUncompressedStream, fUncompressedStreamEnd, key, fAllocator, fDoc);
+
+        if (key->isKeyword() && key->len() == 2 && key->c_str()[0] == 'I' && key->c_str()[1] == 'D') { // ID
+            fUncompressedStream = readInlineImageStream(fUncompressedStream, fUncompressedStreamEnd, inlineImage, fDoc);
+            return inlineImage;
+        } else {
+            SkPdfObject* obj = fAllocator->allocObject();
+            fUncompressedStream = nextObject(fUncompressedStream, fUncompressedStreamEnd, obj, fAllocator, fDoc);
+            // TODO(edisonn): perf maybe we should not expand abreviation like this
+            inlineImage->set(inlineImageKeyAbbreviationExpand(key),
+                             inlineImageValueAbbreviationExpand(obj));
+        }
+    }
+    // TODO(edisonn): report end of data with inline image without an EI
+    return inlineImage;
+}
-- 
cgit v1.2.3