summaryrefslogtreecommitdiff
path: root/absl/strings/internal/charconv_parse.cc
blob: 98823def8351173dd050222a6fb2693fc0e9c0fa (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
// Copyright 2018 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "absl/strings/internal/charconv_parse.h"
#include "absl/strings/charconv.h"

#include <cassert>
#include <cstdint>
#include <limits>

#include "absl/strings/internal/memutil.h"

namespace absl {
ABSL_NAMESPACE_BEGIN
namespace {

// ParseFloat<10> will read the first 19 significant digits of the mantissa.
// This number was chosen for multiple reasons.
//
// (a) First, for whatever integer type we choose to represent the mantissa, we
// want to choose the largest possible number of decimal digits for that integer
// type.  We are using uint64_t, which can express any 19-digit unsigned
// integer.
//
// (b) Second, we need to parse enough digits that the binary value of any
// mantissa we capture has more bits of resolution than the mantissa
// representation in the target float.  Our algorithm requires at least 3 bits
// of headway, but 19 decimal digits give a little more than that.
//
// The following static assertions verify the above comments:
constexpr int kDecimalMantissaDigitsMax = 19;

static_assert(std::numeric_limits<uint64_t>::digits10 ==
                  kDecimalMantissaDigitsMax,
              "(a) above");

// IEEE doubles, which we assume in Abseil, have 53 binary bits of mantissa.
static_assert(std::numeric_limits<double>::is_iec559, "IEEE double assumed");
static_assert(std::numeric_limits<double>::radix == 2, "IEEE double fact");
static_assert(std::numeric_limits<double>::digits == 53, "IEEE double fact");

// The lowest valued 19-digit decimal mantissa we can read still contains
// sufficient information to reconstruct a binary mantissa.
static_assert(1000000000000000000u > (uint64_t{1} << (53 + 3)), "(b) above");

// ParseFloat<16> will read the first 15 significant digits of the mantissa.
//
// Because a base-16-to-base-2 conversion can be done exactly, we do not need
// to maximize the number of scanned hex digits to improve our conversion.  What
// is required is to scan two more bits than the mantissa can represent, so that
// we always round correctly.
//
// (One extra bit does not suffice to perform correct rounding, since a number
// exactly halfway between two representable floats has unique rounding rules,
// so we need to differentiate between a "halfway between" number and a "closer
// to the larger value" number.)
constexpr int kHexadecimalMantissaDigitsMax = 15;

// The minimum number of significant bits that will be read from
// kHexadecimalMantissaDigitsMax hex digits.  We must subtract by three, since
// the most significant digit can be a "1", which only contributes a single
// significant bit.
constexpr int kGuaranteedHexadecimalMantissaBitPrecision =
    4 * kHexadecimalMantissaDigitsMax - 3;

static_assert(kGuaranteedHexadecimalMantissaBitPrecision >
                  std::numeric_limits<double>::digits + 2,
              "kHexadecimalMantissaDigitsMax too small");

// We also impose a limit on the number of significant digits we will read from
// an exponent, to avoid having to deal with integer overflow.  We use 9 for
// this purpose.
//
// If we read a 9 digit exponent, the end result of the conversion will
// necessarily be infinity or zero, depending on the sign of the exponent.
// Therefore we can just drop extra digits on the floor without any extra
// logic.
constexpr int kDecimalExponentDigitsMax = 9;
static_assert(std::numeric_limits<int>::digits10 >= kDecimalExponentDigitsMax,
              "int type too small");

// To avoid incredibly large inputs causing integer overflow for our exponent,
// we impose an arbitrary but very large limit on the number of significant
// digits we will accept.  The implementation refuses to match a string with
// more consecutive significant mantissa digits than this.
constexpr int kDecimalDigitLimit = 50000000;

// Corresponding limit for hexadecimal digit inputs.  This is one fourth the
// amount of kDecimalDigitLimit, since each dropped hexadecimal digit requires
// a binary exponent adjustment of 4.
constexpr int kHexadecimalDigitLimit = kDecimalDigitLimit / 4;

// The largest exponent we can read is 999999999 (per
// kDecimalExponentDigitsMax), and the largest exponent adjustment we can get
// from dropped mantissa digits is 2 * kDecimalDigitLimit, and the sum of these
// comfortably fits in an integer.
//
// We count kDecimalDigitLimit twice because there are independent limits for
// numbers before and after the decimal point.  (In the case where there are no
// significant digits before the decimal point, there are independent limits for
// post-decimal-point leading zeroes and for significant digits.)
static_assert(999999999 + 2 * kDecimalDigitLimit <
                  std::numeric_limits<int>::max(),
              "int type too small");
static_assert(999999999 + 2 * (4 * kHexadecimalDigitLimit) <
                  std::numeric_limits<int>::max(),
              "int type too small");

// Returns true if the provided bitfield allows parsing an exponent value
// (e.g., "1.5e100").
bool AllowExponent(chars_format flags) {
  bool fixed = (flags & chars_format::fixed) == chars_format::fixed;
  bool scientific =
      (flags & chars_format::scientific) == chars_format::scientific;
  return scientific || !fixed;
}

// Returns true if the provided bitfield requires an exponent value be present.
bool RequireExponent(chars_format flags) {
  bool fixed = (flags & chars_format::fixed) == chars_format::fixed;
  bool scientific =
      (flags & chars_format::scientific) == chars_format::scientific;
  return scientific && !fixed;
}

const int8_t kAsciiToInt[256] = {
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0,  1,  2,  3,  4,  5,  6,  7,  8,
    9,  -1, -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1};

// Returns true if `ch` is a digit in the given base
template <int base>
bool IsDigit(char ch);

// Converts a valid `ch` to its digit value in the given base.
template <int base>
unsigned ToDigit(char ch);

// Returns true if `ch` is the exponent delimiter for the given base.
template <int base>
bool IsExponentCharacter(char ch);

// Returns the maximum number of significant digits we will read for a float
// in the given base.
template <int base>
constexpr int MantissaDigitsMax();

// Returns the largest consecutive run of digits we will accept when parsing a
// number in the given base.
template <int base>
constexpr int DigitLimit();

// Returns the amount the exponent must be adjusted by for each dropped digit.
// (For decimal this is 1, since the digits are in base 10 and the exponent base
// is also 10, but for hexadecimal this is 4, since the digits are base 16 but
// the exponent base is 2.)
template <int base>
constexpr int DigitMagnitude();

template <>
bool IsDigit<10>(char ch) {
  return ch >= '0' && ch <= '9';
}
template <>
bool IsDigit<16>(char ch) {
  return kAsciiToInt[static_cast<unsigned char>(ch)] >= 0;
}

template <>
unsigned ToDigit<10>(char ch) {
  return static_cast<unsigned>(ch - '0');
}
template <>
unsigned ToDigit<16>(char ch) {
  return static_cast<unsigned>(kAsciiToInt[static_cast<unsigned char>(ch)]);
}

template <>
bool IsExponentCharacter<10>(char ch) {
  return ch == 'e' || ch == 'E';
}

template <>
bool IsExponentCharacter<16>(char ch) {
  return ch == 'p' || ch == 'P';
}

template <>
constexpr int MantissaDigitsMax<10>() {
  return kDecimalMantissaDigitsMax;
}
template <>
constexpr int MantissaDigitsMax<16>() {
  return kHexadecimalMantissaDigitsMax;
}

template <>
constexpr int DigitLimit<10>() {
  return kDecimalDigitLimit;
}
template <>
constexpr int DigitLimit<16>() {
  return kHexadecimalDigitLimit;
}

template <>
constexpr int DigitMagnitude<10>() {
  return 1;
}
template <>
constexpr int DigitMagnitude<16>() {
  return 4;
}

// Reads decimal digits from [begin, end) into *out.  Returns the number of
// digits consumed.
//
// After max_digits has been read, keeps consuming characters, but no longer
// adjusts *out.  If a nonzero digit is dropped this way, *dropped_nonzero_digit
// is set; otherwise, it is left unmodified.
//
// If no digits are matched, returns 0 and leaves *out unchanged.
//
// ConsumeDigits does not protect against overflow on *out; max_digits must
// be chosen with respect to type T to avoid the possibility of overflow.
template <int base, typename T>
int ConsumeDigits(const char* begin, const char* end, int max_digits, T* out,
                  bool* dropped_nonzero_digit) {
  if (base == 10) {
    assert(max_digits <= std::numeric_limits<T>::digits10);
  } else if (base == 16) {
    assert(max_digits * 4 <= std::numeric_limits<T>::digits);
  }
  const char* const original_begin = begin;

  // Skip leading zeros, but only if *out is zero.
  // They don't cause an overflow so we don't have to count them for
  // `max_digits`.
  while (!*out && end != begin && *begin == '0') ++begin;

  T accumulator = *out;
  const char* significant_digits_end =
      (end - begin > max_digits) ? begin + max_digits : end;
  while (begin < significant_digits_end && IsDigit<base>(*begin)) {
    // Do not guard against *out overflow; max_digits was chosen to avoid this.
    // Do assert against it, to detect problems in debug builds.
    auto digit = static_cast<T>(ToDigit<base>(*begin));
    assert(accumulator * base >= accumulator);
    accumulator *= base;
    assert(accumulator + digit >= accumulator);
    accumulator += digit;
    ++begin;
  }
  bool dropped_nonzero = false;
  while (begin < end && IsDigit<base>(*begin)) {
    dropped_nonzero = dropped_nonzero || (*begin != '0');
    ++begin;
  }
  if (dropped_nonzero && dropped_nonzero_digit != nullptr) {
    *dropped_nonzero_digit = true;
  }
  *out = accumulator;
  return static_cast<int>(begin - original_begin);
}

// Returns true if `v` is one of the chars allowed inside parentheses following
// a NaN.
bool IsNanChar(char v) {
  return (v == '_') || (v >= '0' && v <= '9') || (v >= 'a' && v <= 'z') ||
         (v >= 'A' && v <= 'Z');
}

// Checks the range [begin, end) for a strtod()-formatted infinity or NaN.  If
// one is found, sets `out` appropriately and returns true.
bool ParseInfinityOrNan(const char* begin, const char* end,
                        strings_internal::ParsedFloat* out) {
  if (end - begin < 3) {
    return false;
  }
  switch (*begin) {
    case 'i':
    case 'I': {
      // An infinity string consists of the characters "inf" or "infinity",
      // case insensitive.
      if (strings_internal::memcasecmp(begin + 1, "nf", 2) != 0) {
        return false;
      }
      out->type = strings_internal::FloatType::kInfinity;
      if (end - begin >= 8 &&
          strings_internal::memcasecmp(begin + 3, "inity", 5) == 0) {
        out->end = begin + 8;
      } else {
        out->end = begin + 3;
      }
      return true;
    }
    case 'n':
    case 'N': {
      // A NaN consists of the characters "nan", case insensitive, optionally
      // followed by a parenthesized sequence of zero or more alphanumeric
      // characters and/or underscores.
      if (strings_internal::memcasecmp(begin + 1, "an", 2) != 0) {
        return false;
      }
      out->type = strings_internal::FloatType::kNan;
      out->end = begin + 3;
      // NaN is allowed to be followed by a parenthesized string, consisting of
      // only the characters [a-zA-Z0-9_].  Match that if it's present.
      begin += 3;
      if (begin < end && *begin == '(') {
        const char* nan_begin = begin + 1;
        while (nan_begin < end && IsNanChar(*nan_begin)) {
          ++nan_begin;
        }
        if (nan_begin < end && *nan_begin == ')') {
          // We found an extra NaN specifier range
          out->subrange_begin = begin + 1;
          out->subrange_end = nan_begin;
          out->end = nan_begin + 1;
        }
      }
      return true;
    }
    default:
      return false;
  }
}
}  // namespace

namespace strings_internal {

template <int base>
strings_internal::ParsedFloat ParseFloat(const char* begin, const char* end,
                                         chars_format format_flags) {
  strings_internal::ParsedFloat result;

  // Exit early if we're given an empty range.
  if (begin == end) return result;

  // Handle the infinity and NaN cases.
  if (ParseInfinityOrNan(begin, end, &result)) {
    return result;
  }

  const char* const mantissa_begin = begin;
  while (begin < end && *begin == '0') {
    ++begin;  // skip leading zeros
  }
  uint64_t mantissa = 0;

  int exponent_adjustment = 0;
  bool mantissa_is_inexact = false;
  int pre_decimal_digits = ConsumeDigits<base>(
      begin, end, MantissaDigitsMax<base>(), &mantissa, &mantissa_is_inexact);
  begin += pre_decimal_digits;
  int digits_left;
  if (pre_decimal_digits >= DigitLimit<base>()) {
    // refuse to parse pathological inputs
    return result;
  } else if (pre_decimal_digits > MantissaDigitsMax<base>()) {
    // We dropped some non-fraction digits on the floor.  Adjust our exponent
    // to compensate.
    exponent_adjustment =
        static_cast<int>(pre_decimal_digits - MantissaDigitsMax<base>());
    digits_left = 0;
  } else {
    digits_left =
        static_cast<int>(MantissaDigitsMax<base>() - pre_decimal_digits);
  }
  if (begin < end && *begin == '.') {
    ++begin;
    if (mantissa == 0) {
      // If we haven't seen any nonzero digits yet, keep skipping zeros.  We
      // have to adjust the exponent to reflect the changed place value.
      const char* begin_zeros = begin;
      while (begin < end && *begin == '0') {
        ++begin;
      }
      int zeros_skipped = static_cast<int>(begin - begin_zeros);
      if (zeros_skipped >= DigitLimit<base>()) {
        // refuse to parse pathological inputs
        return result;
      }
      exponent_adjustment -= static_cast<int>(zeros_skipped);
    }
    int post_decimal_digits = ConsumeDigits<base>(
        begin, end, digits_left, &mantissa, &mantissa_is_inexact);
    begin += post_decimal_digits;

    // Since `mantissa` is an integer, each significant digit we read after
    // the decimal point requires an adjustment to the exponent. "1.23e0" will
    // be stored as `mantissa` == 123 and `exponent` == -2 (that is,
    // "123e-2").
    if (post_decimal_digits >= DigitLimit<base>()) {
      // refuse to parse pathological inputs
      return result;
    } else if (post_decimal_digits > digits_left) {
      exponent_adjustment -= digits_left;
    } else {
      exponent_adjustment -= post_decimal_digits;
    }
  }
  // If we've found no mantissa whatsoever, this isn't a number.
  if (mantissa_begin == begin) {
    return result;
  }
  // A bare "." doesn't count as a mantissa either.
  if (begin - mantissa_begin == 1 && *mantissa_begin == '.') {
    return result;
  }

  if (mantissa_is_inexact) {
    // We dropped significant digits on the floor.  Handle this appropriately.
    if (base == 10) {
      // If we truncated significant decimal digits, store the full range of the
      // mantissa for future big integer math for exact rounding.
      result.subrange_begin = mantissa_begin;
      result.subrange_end = begin;
    } else if (base == 16) {
      // If we truncated hex digits, reflect this fact by setting the low
      // ("sticky") bit.  This allows for correct rounding in all cases.
      mantissa |= 1;
    }
  }
  result.mantissa = mantissa;

  const char* const exponent_begin = begin;
  result.literal_exponent = 0;
  bool found_exponent = false;
  if (AllowExponent(format_flags) && begin < end &&
      IsExponentCharacter<base>(*begin)) {
    bool negative_exponent = false;
    ++begin;
    if (begin < end && *begin == '-') {
      negative_exponent = true;
      ++begin;
    } else if (begin < end && *begin == '+') {
      ++begin;
    }
    const char* const exponent_digits_begin = begin;
    // Exponent is always expressed in decimal, even for hexadecimal floats.
    begin += ConsumeDigits<10>(begin, end, kDecimalExponentDigitsMax,
                               &result.literal_exponent, nullptr);
    if (begin == exponent_digits_begin) {
      // there were no digits where we expected an exponent.  We failed to read
      // an exponent and should not consume the 'e' after all.  Rewind 'begin'.
      found_exponent = false;
      begin = exponent_begin;
    } else {
      found_exponent = true;
      if (negative_exponent) {
        result.literal_exponent = -result.literal_exponent;
      }
    }
  }

  if (!found_exponent && RequireExponent(format_flags)) {
    // Provided flags required an exponent, but none was found.  This results
    // in a failure to scan.
    return result;
  }

  // Success!
  result.type = strings_internal::FloatType::kNumber;
  if (result.mantissa > 0) {
    result.exponent = result.literal_exponent +
                      (DigitMagnitude<base>() * exponent_adjustment);
  } else {
    result.exponent = 0;
  }
  result.end = begin;
  return result;
}

template ParsedFloat ParseFloat<10>(const char* begin, const char* end,
                                    chars_format format_flags);
template ParsedFloat ParseFloat<16>(const char* begin, const char* end,
                                    chars_format format_flags);

}  // namespace strings_internal
ABSL_NAMESPACE_END
}  // namespace absl