From f59d18a4ea5ad56975502faf50eae43418ea9c2a Mon Sep 17 00:00:00 2001 From: halcanary Date: Fri, 16 Sep 2016 14:44:57 -0700 Subject: SkPDF: Implement /ActualText to make text extraction correct. For old API: no change. For new API: LTR text is perfectly extracted, RTL needs better testing. CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-Clang-GCE-CPU-AVX2-x86_64-Debug-ASAN-Trybot,Test-Ubuntu-Clang-GCE-CPU-AVX2-x86_64-Debug-MSAN-Trybot BUG=skia:5434 GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2322403002 Review-Url: https://codereview.chromium.org/2322403002 --- src/pdf/SkPDFDevice.cpp | 352 +++++++++++++++++++++++++++++++------ src/pdf/SkPDFMakeToUnicodeCmap.cpp | 14 +- src/pdf/SkPDFUtils.h | 11 +- 3 files changed, 312 insertions(+), 65 deletions(-) (limited to 'src') diff --git a/src/pdf/SkPDFDevice.cpp b/src/pdf/SkPDFDevice.cpp index 1a1bd8a411..0e54bfd765 100644 --- a/src/pdf/SkPDFDevice.cpp +++ b/src/pdf/SkPDFDevice.cpp @@ -7,6 +7,7 @@ #include "SkPDFDevice.h" +#include "SkAdvancedTypefaceMetrics.h" #include "SkAnnotationKeys.h" #include "SkBitmapDevice.h" #include "SkBitmapKey.h" @@ -37,6 +38,7 @@ #include "SkTemplates.h" #include "SkTextBlobRunIterator.h" #include "SkTextFormatParams.h" +#include "SkUtils.h" #include "SkXfermodeInterpretation.h" #define DPI_FOR_RASTER_SCALE_ONE 72 @@ -922,8 +924,205 @@ private: bool fInitialized = false; const bool fDefaultPositioning; }; + +/** Given the m-to-n glyph-to-character mapping data (as returned by + harfbuzz), iterate over the clusters. */ +class Clusterator { +public: + Clusterator() : fClusters(nullptr), fUtf8Text(nullptr), fGlyphCount(0), fTextByteLength(0) {} + explicit Clusterator(uint32_t glyphCount) + : fClusters(nullptr) + , fUtf8Text(nullptr) + , fGlyphCount(glyphCount) + , fTextByteLength(0) {} + // The clusters[] array is an array of offsets into utf8Text[], + // one offset for each glyph. See SkTextBlobBuilder for more info. + Clusterator(const uint32_t* clusters, + const char* utf8Text, + uint32_t glyphCount, + uint32_t textByteLength) + : fClusters(clusters) + , fUtf8Text(utf8Text) + , fGlyphCount(glyphCount) + , fTextByteLength(textByteLength) { + // This is a cheap heuristic for /ReversedChars which seems to + // work for clusters produced by HarfBuzz, which either + // increase from zero (LTR) or decrease to zero (RTL). + // "ReversedChars" is how PDF deals with RTL text. + fReversedChars = + fUtf8Text && fClusters && fGlyphCount && fClusters[0] != 0; + } + struct Cluster { + const char* fUtf8Text; + uint32_t fTextByteLength; + uint32_t fGlyphIndex; + uint32_t fGlyphCount; + explicit operator bool() const { return fGlyphCount != 0; } + }; + // True if this looks like right-to-left text. + bool reversedChars() const { return fReversedChars; } + Cluster next() { + if ((!fUtf8Text || !fClusters) && fGlyphCount) { + // These glyphs have no text. Treat as one "cluster". + uint32_t glyphCount = fGlyphCount; + fGlyphCount = 0; + return Cluster{nullptr, 0, 0, glyphCount}; + } + if (fGlyphCount == 0 || fTextByteLength == 0) { + return Cluster{nullptr, 0, 0, 0}; // empty + } + SkASSERT(fUtf8Text); + SkASSERT(fClusters); + uint32_t cluster = fClusters[0]; + if (cluster >= fTextByteLength) { + return Cluster{nullptr, 0, 0, 0}; // bad input. + } + uint32_t glyphsInCluster = 1; + while (glyphsInCluster < fGlyphCount && + fClusters[glyphsInCluster] == cluster) { + ++glyphsInCluster; + } + SkASSERT(glyphsInCluster <= fGlyphCount); + uint32_t textLength = 0; + if (glyphsInCluster == fGlyphCount) { + // consumes rest of glyphs and rest of text + if (kInvalidCluster == fPreviousCluster) { // LTR text or single cluster + textLength = fTextByteLength - cluster; + } else { // RTL text; last cluster. + SkASSERT(fPreviousCluster < fTextByteLength); + if (fPreviousCluster <= cluster) { // bad input. + return Cluster{nullptr, 0, 0, 0}; + } + textLength = fPreviousCluster - cluster; + } + fGlyphCount = 0; + return Cluster{fUtf8Text + cluster, + textLength, + fGlyphIndex, + glyphsInCluster}; + } + SkASSERT(glyphsInCluster < fGlyphCount); + uint32_t nextCluster = fClusters[glyphsInCluster]; + if (nextCluster >= fTextByteLength) { + return Cluster{nullptr, 0, 0, 0}; // bad input. + } + if (nextCluster > cluster) { // LTR text + if (kInvalidCluster != fPreviousCluster) { + return Cluster{nullptr, 0, 0, 0}; // bad input. + } + textLength = nextCluster - cluster; + } else { // RTL text + SkASSERT(nextCluster < cluster); + if (kInvalidCluster == fPreviousCluster) { // first cluster + textLength = fTextByteLength - cluster; + } else { // later cluster + if (fPreviousCluster <= cluster) { + return Cluster{nullptr, 0, 0, 0}; // bad input. + } + textLength = fPreviousCluster - cluster; + } + fPreviousCluster = cluster; + } + uint32_t glyphIndex = fGlyphIndex; + fGlyphCount -= glyphsInCluster; + fGlyphIndex += glyphsInCluster; + fClusters += glyphsInCluster; + return Cluster{fUtf8Text + cluster, + textLength, + glyphIndex, + glyphsInCluster}; + } + +private: + static constexpr uint32_t kInvalidCluster = 0xFFFFFFFF; + const uint32_t* fClusters; + const char* fUtf8Text; + uint32_t fGlyphCount; + uint32_t fTextByteLength; + uint32_t fGlyphIndex = 0; + uint32_t fPreviousCluster = kInvalidCluster; + bool fReversedChars = false; +}; + +struct TextStorage { + SkAutoTMalloc fUtf8textStorage; + SkAutoTMalloc fClusterStorage; + SkAutoTMalloc fGlyphStorage; +}; } // namespace +/** Given some unicode text (as passed to drawText(), convert to + glyphs (via primitive shaping), while preserving + glyph-to-character mapping information. */ +static Clusterator make_clusterator( + const void* sourceText, + size_t sourceByteCount, + const SkPaint& paint, + TextStorage* storage, + int glyphCount) { + SkASSERT(SkPaint::kGlyphID_TextEncoding != paint.getTextEncoding()); + SkASSERT(glyphCount == paint.textToGlyphs(sourceText, sourceByteCount, nullptr)); + SkASSERT(glyphCount > 0); + storage->fGlyphStorage.reset(SkToSizeT(glyphCount)); + (void)paint.textToGlyphs(sourceText, sourceByteCount, storage->fGlyphStorage.get()); + storage->fClusterStorage.reset(SkToSizeT(glyphCount)); + uint32_t* clusters = storage->fClusterStorage.get(); + uint32_t utf8ByteCount = 0; + const char* utf8Text = nullptr; + switch (paint.getTextEncoding()) { + case SkPaint::kUTF8_TextEncoding: { + const char* txtPtr = (const char*)sourceText; + for (int i = 0; i < glyphCount; ++i) { + clusters[i] = SkToU32(txtPtr - (const char*)sourceText); + txtPtr += SkUTF8_LeadByteToCount(*(const unsigned char*)txtPtr); + SkASSERT(txtPtr <= (const char*)sourceText + sourceByteCount); + } + SkASSERT(txtPtr == (const char*)sourceText + sourceByteCount); + utf8ByteCount = SkToU32(sourceByteCount); + utf8Text = (const char*)sourceText; + break; + } + case SkPaint::kUTF16_TextEncoding: { + const uint16_t* utf16ptr = (const uint16_t*)sourceText; + int utf16count = SkToInt(sourceByteCount / sizeof(uint16_t)); + utf8ByteCount = SkToU32(SkUTF16_ToUTF8(utf16ptr, utf16count)); + storage->fUtf8textStorage.reset(utf8ByteCount); + char* txtPtr = storage->fUtf8textStorage.get(); + utf8Text = txtPtr; + int clusterIndex = 0; + while (utf16ptr < (const uint16_t*)sourceText + utf16count) { + clusters[clusterIndex++] = SkToU32(txtPtr - utf8Text); + SkUnichar uni = SkUTF16_NextUnichar(&utf16ptr); + txtPtr += SkUTF8_FromUnichar(uni, txtPtr); + } + SkASSERT(clusterIndex == glyphCount); + SkASSERT(txtPtr == storage->fUtf8textStorage.get() + utf8ByteCount); + SkASSERT(utf16ptr == (const uint16_t*)sourceText + utf16count); + break; + } + case SkPaint::kUTF32_TextEncoding: { + const SkUnichar* utf32 = (const SkUnichar*)sourceText; + int utf32count = SkToInt(sourceByteCount / sizeof(SkUnichar)); + SkASSERT(glyphCount == utf32count); + for (int i = 0; i < utf32count; ++i) { + utf8ByteCount += SkToU32(SkUTF8_FromUnichar(utf32[i])); + } + storage->fUtf8textStorage.reset(SkToSizeT(utf8ByteCount)); + char* txtPtr = storage->fUtf8textStorage.get(); + utf8Text = txtPtr; + for (int i = 0; i < utf32count; ++i) { + clusters[i] = SkToU32(txtPtr - utf8Text); + txtPtr += SkUTF8_FromUnichar(utf32[i], txtPtr); + } + break; + } + default: + SkDEBUGFAIL(""); + break; + } + return Clusterator(clusters, utf8Text, SkToU32(glyphCount), utf8ByteCount); +} + static void draw_transparent_text(SkPDFDevice* device, const SkDraw& d, const void* text, size_t len, @@ -965,6 +1164,10 @@ static void draw_transparent_text(SkPDFDevice* device, } } +static SkUnichar map_glyph(const SkTDArray& glyphToUnicode, SkGlyphID glyph) { + return SkToInt(glyph) < glyphToUnicode.count() ? glyphToUnicode[SkToInt(glyph)] : -1; +} + static void update_font(SkWStream* wStream, int fontIndex, SkScalar textSize) { wStream->writeText("/"); char prefix = SkPDFResourceDict::GetResourceTypePrefix(SkPDFResourceDict::kFont_ResourceType); @@ -994,19 +1197,9 @@ void SkPDFDevice::internalDrawText( // https://bug.skia.org/5665 return; } - // TODO(halcanary): implement /ActualText with these values. - (void)clusters; - (void)textByteLength; - (void)utf8Text; - if (textByteLength > 0) { - SkASSERT(clusters); - SkASSERT(utf8Text); - SkASSERT(srcPaint.getTextEncoding() == SkPaint::kGlyphID_TextEncoding); - } else { - SkASSERT(nullptr == clusters); - SkASSERT(nullptr == utf8Text); + if (0 == sourceByteCount || !sourceText) { + return; } - SkPaint paint = calculate_text_paint(srcPaint); replace_srcmode_on_opaque_paint(&paint); if (!paint.getTypeface()) { @@ -1028,7 +1221,6 @@ void SkPDFDevice::internalDrawText( return; } // TODO(halcanary): use metrics->fGlyphToUnicode to check Unicode mapping. - const SkGlyphID maxGlyphID = metrics->fLastGlyphID; if (!SkPDFFont::CanEmbedTypeface(typeface, fDocument->canon())) { SkPath path; // https://bug.skia.org/3866 switch (positioning) { @@ -1061,18 +1253,34 @@ void SkPDFDevice::internalDrawText( offset.x(), offset.y(), paint); return; } - SkAutoSTMalloc<128, SkGlyphID> glyphStorage; + + // These three heap buffers are only used in the case where no glyphs + // are passed to drawText() (most clients pass glyphs or a textblob). + TextStorage storage; const SkGlyphID* glyphs = nullptr; - if (paint.getTextEncoding() == SkPaint::kGlyphID_TextEncoding) { + Clusterator clusterator; + if (textByteLength > 0) { + SkASSERT(glyphCount == SkToInt(sourceByteCount / sizeof(SkGlyphID))); glyphs = (const SkGlyphID*)sourceText; - // validate input later. + clusterator = Clusterator(clusters, utf8Text, SkToU32(glyphCount), textByteLength); + SkASSERT(clusters); + SkASSERT(utf8Text); + SkASSERT(srcPaint.getTextEncoding() == SkPaint::kGlyphID_TextEncoding); + SkASSERT(glyphCount == paint.textToGlyphs(sourceText, sourceByteCount, nullptr)); + } else if (SkPaint::kGlyphID_TextEncoding == srcPaint.getTextEncoding()) { + SkASSERT(glyphCount == SkToInt(sourceByteCount / sizeof(SkGlyphID))); + glyphs = (const SkGlyphID*)sourceText; + clusterator = Clusterator(SkToU32(glyphCount)); + SkASSERT(glyphCount == paint.textToGlyphs(sourceText, sourceByteCount, nullptr)); + SkASSERT(nullptr == clusters); + SkASSERT(nullptr == utf8Text); } else { - glyphStorage.reset(SkToSizeT(glyphCount)); - (void)paint.textToGlyphs(sourceText, sourceByteCount, glyphStorage.get()); - glyphs = glyphStorage.get(); - paint.setTextEncoding(SkPaint::kGlyphID_TextEncoding); + SkASSERT(nullptr == clusters); + SkASSERT(nullptr == utf8Text); + clusterator = make_clusterator(sourceText, sourceByteCount, srcPaint, + &storage, glyphCount); + glyphs = storage.fGlyphStorage; } - bool defaultPositioning = (positioning == SkTextBlob::kDefault_Positioning); paint.setHinting(SkPaint::kNo_Hinting); SkAutoGlyphCache glyphCache(paint, nullptr, nullptr); @@ -1094,51 +1302,91 @@ void SkPDFDevice::internalDrawText( } SkDynamicMemoryWStream* out = &content.entry()->fContent; SkScalar textSize = paint.getTextSize(); + const SkTDArray& glyphToUnicode = metrics->fGlyphToUnicode; out->writeText("BT\n"); SK_AT_SCOPE_EXIT(out->writeText("ET\n")); + const SkGlyphID maxGlyphID = metrics->fLastGlyphID; bool multiByteGlyphs = SkPDFFont::IsMultiByte(SkPDFFont::FontType(*metrics)); + if (clusterator.reversedChars()) { + out->writeText("/ReversedChars BMC\n"); + } + SK_AT_SCOPE_EXIT(if (clusterator.reversedChars()) { out->writeText("EMC\n"); } ); GlyphPositioner glyphPositioner(out, paint.getTextSkewX(), multiByteGlyphs, defaultPositioning, offset); SkPDFFont* font = nullptr; - for (int index = 0; index < glyphCount; ++index) { - SkGlyphID gid = glyphs[index]; - if (gid > maxGlyphID) { - continue; // Skip this invalid glyphID. - } - if (!font || !font->hasGlyph(gid)) { - // Either this is the first loop iteration or the current - // PDFFont cannot encode this glyph. - glyphPositioner.flush(); - // Try to get a font which can encode the glyph. - int fontIndex = this->getFontResourceIndex(typeface, gid); - SkASSERT(fontIndex >= 0); - if (fontIndex < 0) { return; } - update_font(out, fontIndex, textSize); - font = fFontResources[fontIndex]; - SkASSERT(font); // All preconditions for SkPDFFont::GetFontResource are met. - if (!font) { return; } - SkASSERT(font->multiByteGlyphs() == multiByteGlyphs); + + while (Clusterator::Cluster c = clusterator.next()) { + int index = c.fGlyphIndex; + int glyphLimit = index + c.fGlyphCount; + + bool actualText = false; + SK_AT_SCOPE_EXIT(if (actualText) { glyphPositioner.flush(); out->writeText("EMC\n"); } ); + if (c.fUtf8Text) { // real cluster + // Check if `/ActualText` needed. + const char* textPtr = c.fUtf8Text; + // TODO(halcanary): validate utf8 input. + SkUnichar unichar = SkUTF8_NextUnichar(&textPtr); + const char* textEnd = c.fUtf8Text + c.fTextByteLength; + if (textPtr < textEnd || // more characters left + glyphLimit > index + 1 || // toUnicode wouldn't work + unichar != map_glyph(glyphToUnicode, glyphs[index])) // test single Unichar map + { + glyphPositioner.flush(); + out->writeText("/Span<writeText("> >> BDC\n"); // begin marked-content sequence + // with an associated property list. + actualText = true; + } } - font->noteGlyphUsage(gid); - SkScalar advance{0.0f}; - SkPoint xy{0.0f, 0.0f}; - if (!defaultPositioning) { - advance = glyphCache->getGlyphIDAdvance(gid).fAdvanceX; - xy = SkTextBlob::kFull_Positioning == positioning - ? SkPoint{pos[2 * index], pos[2 * index + 1]} - : SkPoint{pos[index], 0}; - if (alignment != SkPaint::kLeft_Align) { - xy.offset(alignmentFactor * advance, 0); + for (; index < glyphLimit; ++index) { + SkGlyphID gid = glyphs[index]; + if (gid > maxGlyphID) { + continue; + } + if (!font || !font->hasGlyph(gid)) { + // Not yet specified font or need to switch font. + int fontIndex = this->getFontResourceIndex(typeface, gid); + // All preconditions for SkPDFFont::GetFontResource are met. + SkASSERT(fontIndex >= 0); + if (fontIndex < 0) { + return; + } + glyphPositioner.flush(); + update_font(out, fontIndex, textSize); + font = fFontResources[fontIndex]; + SkASSERT(font); // All preconditions for SkPDFFont::GetFontResource are met. + if (!font) { + return; + } + SkASSERT(font->multiByteGlyphs() == multiByteGlyphs); + } + SkPoint xy{0, 0}; + SkScalar advance{0}; + if (!defaultPositioning) { + advance = glyphCache->getGlyphIDAdvance(gid).fAdvanceX; + xy = SkTextBlob::kFull_Positioning == positioning + ? SkPoint{pos[2 * index], pos[2 * index + 1]} + : SkPoint{pos[index], 0}; + if (alignment != SkPaint::kLeft_Align) { + xy.offset(alignmentFactor * advance, 0); + } } + font->noteGlyphUsage(gid); + SkGlyphID encodedGlyph = multiByteGlyphs ? gid : font->glyphToPDFFontEncoding(gid); + glyphPositioner.writeGlyph(xy, advance, encodedGlyph); } - SkGlyphID encodedGlyph = - multiByteGlyphs ? gid : font->glyphToPDFFontEncoding(gid); - glyphPositioner.writeGlyph(xy, advance, encodedGlyph); } } diff --git a/src/pdf/SkPDFMakeToUnicodeCmap.cpp b/src/pdf/SkPDFMakeToUnicodeCmap.cpp index b610934bcc..afe773207d 100644 --- a/src/pdf/SkPDFMakeToUnicodeCmap.cpp +++ b/src/pdf/SkPDFMakeToUnicodeCmap.cpp @@ -69,16 +69,6 @@ struct BFRange { }; } // namespace -static void write_utf16be(SkDynamicMemoryWStream* wStream, SkUnichar utf32) { - SkGlyphID utf16[2] = {0, 0}; - size_t len = SkUTF16_FromUnichar(utf32, utf16); - SkASSERT(len == 1 || len == 2); - SkPDFUtils::WriteUInt16BE(wStream, utf16[0]); - if (len == 2) { - SkPDFUtils::WriteUInt16BE(wStream, utf16[1]); - } -} - static void write_glyph(SkDynamicMemoryWStream* cmap, bool multiByte, SkGlyphID gid) { @@ -102,7 +92,7 @@ static void append_bfchar_section(const SkTDArray& bfchar, cmap->writeText("<"); write_glyph(cmap, multiByte, bfchar[i + j].fGlyphId); cmap->writeText("> <"); - write_utf16be(cmap, bfchar[i + j].fUnicode); + SkPDFUtils::WriteUTF16beHex(cmap, bfchar[i + j].fUnicode); cmap->writeText(">\n"); } cmap->writeText("endbfchar\n"); @@ -124,7 +114,7 @@ static void append_bfrange_section(const SkTDArray& bfrange, cmap->writeText("> <"); write_glyph(cmap, multiByte, bfrange[i + j].fEnd); cmap->writeText("> <"); - write_utf16be(cmap, bfrange[i + j].fUnicode); + SkPDFUtils::WriteUTF16beHex(cmap, bfrange[i + j].fUnicode); cmap->writeText(">\n"); } cmap->writeText("endbfrange\n"); diff --git a/src/pdf/SkPDFUtils.h b/src/pdf/SkPDFUtils.h index a9194f2e72..964689f4f2 100644 --- a/src/pdf/SkPDFUtils.h +++ b/src/pdf/SkPDFUtils.h @@ -12,6 +12,7 @@ #include "SkPaint.h" #include "SkPath.h" #include "SkStream.h" +#include "SkUtils.h" class SkMatrix; class SkPDFArray; @@ -92,7 +93,15 @@ inline void WriteUInt8(SkDynamicMemoryWStream* wStream, uint8_t value) { result[1] = gHex[0xF & value]; wStream->write(result, 2); } - +inline void WriteUTF16beHex(SkDynamicMemoryWStream* wStream, SkUnichar utf32) { + uint16_t utf16[2] = {0, 0}; + size_t len = SkUTF16_FromUnichar(utf32, utf16); + SkASSERT(len == 1 || len == 2); + SkPDFUtils::WriteUInt16BE(wStream, utf16[0]); + if (len == 2) { + SkPDFUtils::WriteUInt16BE(wStream, utf16[1]); + } +} } // namespace SkPDFUtils #endif -- cgit v1.2.3