// src/google/protobuf/util/internal/json_escaping.cc

// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <google/protobuf/util/internal/json_escaping.h>

#include <google/protobuf/stubs/logging.h>
#include <google/protobuf/stubs/common.h>

namespace google {
namespace protobuf {
namespace util {
namespace converter {

namespace {

// Lowercase hexadecimal digits, indexed by nibble value (0..15). Used to
// render the digits of \uXXXX escapes. A namespace-scope const array already
// has internal linkage in C++, so no storage-class specifier is needed.
const char kHex[] = "0123456789abcdef";

// Characters 0x00 to 0x9f are very commonly used, so we provide a special
// table lookup.
//
// For unicode code point ch < 0xa0:
// kCommonEscapes[ch] is the escaped string of ch, if escaping is needed;
//                    or an empty string, if escaping is not needed.
//
// The table is positional: entry N describes code point N, so entries must
// never be inserted, removed or reordered. The longest entry is a six
// character "\uXXXX" sequence, which together with the terminating NUL gives
// the inner dimension of 7.
static const char kCommonEscapes[160][7] = {
  // C0 (ASCII and derivatives) control characters
  // Backspace, tab, newline, form feed and carriage return use their short
  // two-character escapes; every other control character uses \u00XX.
  "\\u0000", "\\u0001", "\\u0002", "\\u0003",  // 0x00
  "\\u0004", "\\u0005", "\\u0006", "\\u0007",
  "\\b",     "\\t",     "\\n",     "\\u000b",
  "\\f",     "\\r",     "\\u000e", "\\u000f",
  "\\u0010", "\\u0011", "\\u0012", "\\u0013",  // 0x10
  "\\u0014", "\\u0015", "\\u0016", "\\u0017",
  "\\u0018", "\\u0019", "\\u001a", "\\u001b",
  "\\u001c", "\\u001d", "\\u001e", "\\u001f",
  // Escaping of " and \ are required by www.json.org string definition.
  // Escaping of < and > are required for HTML security.
  // Non-empty entries in this printable range: 0x22 ("), 0x3c (<), 0x3e (>),
  // 0x5c (\) and 0x7f (DEL).
  "", "", "\\\"", "", "",        "", "",        "",  // 0x20
  "", "", "",     "", "",        "", "",        "",
  "", "", "",     "", "",        "", "",        "",  // 0x30
  "", "", "",     "", "\\u003c", "", "\\u003e", "",
  "", "", "",     "", "",        "", "",        "",  // 0x40
  "", "", "",     "", "",        "", "",        "",
  "", "", "",     "", "",        "", "",        "",  // 0x50
  "", "", "",     "", "\\\\",    "", "",        "",
  "", "", "",     "", "",        "", "",        "",  // 0x60
  "", "", "",     "", "",        "", "",        "",
  "", "", "",     "", "",        "", "",        "",  // 0x70
  "", "", "",     "", "",        "", "",        "\\u007f",
  // C1 (ISO 8859 and Unicode) extended control characters
  "\\u0080", "\\u0081", "\\u0082", "\\u0083",  // 0x80
  "\\u0084", "\\u0085", "\\u0086", "\\u0087",
  "\\u0088", "\\u0089", "\\u008a", "\\u008b",
  "\\u008c", "\\u008d", "\\u008e", "\\u008f",
  "\\u0090", "\\u0091", "\\u0092", "\\u0093",  // 0x90
  "\\u0094", "\\u0095", "\\u0096", "\\u0097",
  "\\u0098", "\\u0099", "\\u009a", "\\u009b",
  "\\u009c", "\\u009d", "\\u009e", "\\u009f"
};

// Reports whether c is a unicode surrogate code unit, high or low.
//
// Equivalent to
//   c >= kMinHighSurrogate && c <= kMaxLowSurrogate
// but done with a single mask-and-compare: clearing the low 11 bits maps
// every value of the surrogate block onto its first value. This relies on the
// block being 2048 values wide and starting on a 2048-aligned boundary.
inline bool IsSurrogate(uint32 c) {
  const uint32 kBlockMask = 0xfffff800;
  const uint32 block_start = c & kBlockMask;
  return block_start == JsonEscaping::kMinHighSurrogate;
}

// Reports whether cp does not exceed the largest unicode code point
// (JsonEscaping::kMaxCodePoint). cp is unsigned, so no lower bound check is
// required.
inline bool IsValidCodePoint(uint32 cp) {
  const bool above_max = cp > JsonEscaping::kMaxCodePoint;
  return !above_max;
}

// Computes the low (trailing) surrogate of the given unicode code point: the
// low-order bits of cp selected by the width of the low surrogate range,
// offset by the first low surrogate value. The result is only meaningful for
// supplementary code points.
inline uint16 ToLowSurrogate(uint32 cp) {
  // Width of the low surrogate range, used as a bit mask on cp.
  const uint32 payload_mask =
      JsonEscaping::kMaxLowSurrogate - JsonEscaping::kMinLowSurrogate;
  const uint32 payload = cp & payload_mask;
  return payload + JsonEscaping::kMinLowSurrogate;
}

// Computes the high (leading) surrogate of the given unicode code point: the
// bits of cp above the low 10, rebased so that the first supplementary code
// point maps to the first high surrogate value. The result is only meaningful
// for supplementary code points.
inline uint16 ToHighSurrogate(uint32 cp) {
  // Constant that folds "subtract the supplementary base, then add the first
  // high surrogate" into a single addition.
  const uint32 bias = JsonEscaping::kMinHighSurrogate -
                      (JsonEscaping::kMinSupplementaryCodePoint >> 10);
  const uint32 upper_bits = cp >> 10;
  return upper_bits + bias;
}

// Decodes the next unicode code point from UTF-8 input that may arrive in
// several pieces. A code point occupies 1 to 4 bytes in UTF-8, so it can
// straddle two consecutive reads of the ByteSource; the decoder state needed
// to resume is carried in (cp) and (num_left) between calls.
//
// Parameters:
//   str      - the bytes currently available.
//   index    - position in str to start reading from. The caller must
//              guarantee that position is readable when *num_left is 0. In
//              other words, at least one byte can be read.
//   cp       - in: bits accumulated so far when resuming a partial code
//              point; out: the (partial or complete) code point.
//   num_left - in: number of bytes still missing from the previous call, or 0
//              to start a new code point; out: number of bytes still missing
//              after this call (0 when the code point is complete).
//   num_read - out: number of bytes consumed from str by this call.
//
// Returns false on invalid input: a continuation or reserved byte where a
// lead byte is expected, a non-continuation byte inside a sequence, or a
// completed value that is a surrogate or lies above the maximum code point.
// Returns true otherwise, including when str runs out before the code point
// is complete (then *num_left > 0).
bool ReadCodePoint(StringPiece str, int index,
                   uint32 *cp, int* num_left, int *num_read) {
  if (*num_left != 0) {
    // Resuming a code point that was cut off by the end of the previous
    // input; nothing has been consumed from this input yet.
    *num_read = 0;
  } else {
    // Starting a new code point. The lead byte alone determines how long the
    // sequence is and how many payload bits the lead byte itself carries:
    //
    //   0x00..0x7f  0xxxxxxx  1 byte,  the byte is the code point
    //   0x80..0xbf  10xxxxxx  continuation byte, invalid as a lead byte
    //   0xc0..0xdf  110xxxxx  2 bytes, 5 payload bits (11-bit code point)
    //   0xe0..0xef  1110xxxx  3 bytes, 4 payload bits (16-bit code point)
    //   0xf0..0xf7  11110xxx  4 bytes, 3 payload bits (21-bit code point)
    //   0xf8..0xff            reserved, invalid
    const uint32 lead = static_cast<uint8>(str[index++]);
    *cp = lead;
    *num_read = 1;
    if (lead < 0x80) return true;
    if (lead < 0xc0) return false;
    if (lead < 0xe0) {
      *cp = lead & 0x1f;
      *num_left = 1;
    } else if (lead < 0xf0) {
      *cp = lead & 0x0f;
      *num_left = 2;
    } else if (lead < 0xf8) {
      *cp = lead & 0x07;
      *num_left = 3;
    } else {
      return false;
    }
  }

  // Fold in continuation bytes, six payload bits each, until the code point
  // is complete or the input is exhausted.
  for (; *num_left > 0 && index < str.size(); ++index) {
    const uint32 trail = static_cast<uint8>(str[index]);
    --(*num_left);
    ++(*num_read);
    *cp = (*cp << 6) | (trail & 0x3f);
    // The offending byte has already been counted in *num_read, so the
    // caller skips over it together with the rest of the sequence.
    if (trail < 0x80 || trail > 0xbf) return false;
  }

  // An incomplete code point is not an error; the caller resumes later.
  if (*num_left > 0) return true;
  return !IsSurrogate(*cp) && IsValidCodePoint(*cp);
}

// Writes the four lowercase hexadecimal digits of the 16-bit value cp into
// buffer[2..5] and returns a StringPiece covering buffer[0..5].
//
// Only the digits are written here: buffer[0] and buffer[1] are expected to
// already hold the "\u" prefix (JsonEscaping::Escape pre-fills its scratch
// buffer with it). The input buffer needs to be at least 6 bytes long; it
// does not need to be NUL-terminated.
StringPiece ToHex(uint16 cp, char* buffer) {
  buffer[5] = kHex[cp & 0x0f];
  cp >>= 4;
  buffer[4] = kHex[cp & 0x0f];
  cp >>= 4;
  buffer[3] = kHex[cp & 0x0f];
  cp >>= 4;
  buffer[2] = kHex[cp & 0x0f];
  // Pass the length explicitly. Building the StringPiece from the bare
  // pointer would measure the buffer by scanning for a NUL terminator, but
  // the caller's 12-byte scratch buffer has its only NUL at index 11 and
  // ToSurrogateHex overwrites that byte with a hex digit. After any
  // surrogate-pair escape, such a scan would read past the end of the buffer.
  return StringPiece(buffer, 6);
}

// Renders the supplementary code point cp as a UTF-16 surrogate pair, writing
// the hexadecimal digits of the high surrogate to buffer[2..5] and those of
// the low surrogate to buffer[8..11]. Returns a StringPiece covering all 12
// bytes of buffer.
//
// Only the digits are written: buffer[0..1] and buffer[6..7] are expected to
// already hold the two "\u" prefixes. The input buffer needs to be at least
// 12 bytes long.
StringPiece ToSurrogateHex(uint32 cp, char* buffer) {
  uint16 high = ToHighSurrogate(cp);
  uint16 low = ToLowSurrogate(cp);

  // Emit digits from least to most significant. The low surrogate's digits
  // sit exactly 6 bytes after the matching digits of the high surrogate.
  for (int pos = 5; pos >= 2; --pos) {
    buffer[pos] = kHex[high & 0x0f];
    buffer[pos + 6] = kHex[low & 0x0f];
    high >>= 4;
    low >>= 4;
  }

  return StringPiece(buffer, 12);
}

// Returns the escaped form of the unicode code point cp, or an empty
// StringPiece when cp does not need escaping.
//
// A non-empty result points either into the static kCommonEscapes table or
// into the given buffer, so it is only valid until buffer is written again.
// The input buffer needs to be at least 12 bytes long.
StringPiece EscapeCodePoint(uint32 cp, char* buffer) {
  // Everything below 0xa0 is answered by the lookup table (an empty entry
  // means "no escaping needed").
  if (cp < 0xa0) return kCommonEscapes[cp];

  // Code points in the basic multilingual plane that the json spec does not
  // require to be escaped, but which are escaped anyway to prevent security
  // bugs in javascript. These fit in a single \uXXXX sequence.
  const bool escape_as_single_unit =
      cp == 0x00ad ||                    // Soft-hyphen
      (cp >= 0x0600 && cp <= 0x0603) ||  // Arabic signs
      cp == 0x06dd ||                    // Arabic end of ayah
      cp == 0x070f ||                    // Syriac abbreviation mark
      (cp >= 0x17b4 && cp <= 0x17b5) ||  // Khmer vowel inherent Aq, Aa
      (cp >= 0x200b && cp <= 0x200f) ||  // Zero width etc.
      (cp >= 0x2028 && cp <= 0x202e) ||  // Separators etc.
      (cp >= 0x2060 && cp <= 0x2064) ||  // Invisible etc.
      (cp >= 0x206a && cp <= 0x206f) ||  // Shaping etc.
      cp == 0xfeff ||                    // Zero width no-break space
      (cp >= 0xfff9 && cp <= 0xfffb);    // Interlinear annotation marks
  if (escape_as_single_unit) {
    return ToHex(cp, buffer);
  }

  // Supplementary code points that are escaped; these need a surrogate pair.
  const bool escape_as_surrogate_pair =
      (cp >= 0x0001d173 && cp <= 0x0001d17a) ||  // Music formatting
      cp == 0x000e0001 ||                        // Language tag
      (cp >= 0x000e0020 && cp <= 0x000e007f);    // TAG symbols
  if (escape_as_surrogate_pair) {
    return ToSurrogateHex(cp, buffer);
  }

  return StringPiece();
}

// Escapes the code point cp like the two-argument overload. If cp needs no
// escaping and force_output is true, cp is instead encoded as a multi-byte
// UTF-8 sequence (2 to 4 bytes) inside buffer and that sequence is returned.
// With force_output false the result may be empty.
//
// The UTF-8 bytes are written right-aligned so that they end at buffer[5];
// the returned StringPiece starts at the lead byte.
StringPiece EscapeCodePoint(uint32 cp, char* buffer, bool force_output) {
  const StringPiece escaped = EscapeCodePoint(cp, buffer);
  if (!force_output || !escaped.empty()) return escaped;

  // Fill in the bytes from last to first. Each continuation byte carries six
  // payload bits under a 10xxxxxx marker.
  char* out = buffer + 5;
  uint32 rest = cp;
  *out = (rest & 0x3f) | 0x80;
  rest >>= 6;
  if (rest <= 0x1f) {
    // Remaining bits fit in a 110xxxxx lead byte: two-byte sequence.
    *--out = rest | 0xc0;
    return StringPiece(out, 2);
  }

  *--out = (rest & 0x3f) | 0x80;
  rest >>= 6;
  if (rest <= 0x0f) {
    // Remaining bits fit in a 1110xxxx lead byte: three-byte sequence.
    *--out = rest | 0xe0;
    return StringPiece(out, 3);
  }

  // Four-byte sequence: one more continuation byte, then a 11110xxx lead.
  *--out = (rest & 0x3f) | 0x80;
  rest >>= 6;
  *--out = (rest & 0x07) | 0xf0;
  return StringPiece(out, 4);
}

}  // namespace

// Streams the UTF-8 bytes of input to output, replacing every code point
// that needs escaping with its escape sequence. Runs of bytes that need no
// escaping are forwarded with a single CopyTo call per chunk.
//
// A code point may be split across two chunks returned by Peek(). Its bytes
// are consumed as they are seen and, once it is complete, it is written
// either escaped or re-encoded as UTF-8.
//
// Bytes that do not form valid UTF-8 are consumed without being written, and
// no error is reported. The same holds for a code point left incomplete when
// the input ends.
void JsonEscaping::Escape(strings::ByteSource* input,
                          strings::ByteSink* output) {
  // Scratch space for one escape sequence. It is pre-filled so that both
  // "\u" prefixes are already in place; the helpers only overwrite the digits.
  char buffer[12] = "\\udead\\ubee";
  uint32 cp = 0;     // Code point being decoded, possibly still partial.
  int num_left = 0;  // Bytes still missing to complete that code point.
  while (input->Available() > 0) {
    const StringPiece chunk = input->Peek();
    // True when this chunk begins in the middle of a code point.
    const bool resuming_split = num_left > 0;
    StringPiece escaped;
    int prefix_len = 0;  // Leading bytes of chunk that can be copied as is.
    int consumed = 0;    // Bytes of the code point the scan stopped on.
    bool ok = true;
    // Scan forward until one of the following happens:
    //   i) a code point that needs to be escaped is found; or
    //  ii) a split code point has been completely read; or
    // iii) a byte that is not valid utf8 is found; or
    //  iv) the end of chunk is reached.
    for (;;) {
      ok = ReadCodePoint(chunk, prefix_len, &cp, &num_left, &consumed);
      if (!ok || num_left > 0) break;  // case iii or iv
      escaped = EscapeCodePoint(cp, buffer, resuming_split);
      if (!escaped.empty()) break;     // case i or ii
      prefix_len += consumed;
      consumed = 0;
      if (!(prefix_len < chunk.length())) break;  // case iv
    }
    // Forward the un-escaped prefix, if any, then drop the bytes of the code
    // point the scan stopped on.
    if (prefix_len > 0) input->CopyTo(output, prefix_len);
    if (consumed > 0) input->Skip(consumed);
    if (!ok) {
      // Case iii: Report error.
      // TODO(wpoon): Add error reporting.
      num_left = 0;  // Forget the broken code point and start afresh.
    } else if (num_left == 0 && !escaped.empty()) {
      // Case i or ii: emit the replacement for the code point just dropped.
      output->Append(escaped.data(), escaped.size());
    }
  }
  if (num_left > 0) {
    // The input ended in the middle of a code point.
    // Treat as case iii: report error.
    // TODO(wpoon): Add error reporting.
  }
}

}  // namespace converter
}  // namespace util
}  // namespace protobuf
}  // namespace google