summaryrefslogtreecommitdiff
path: root/goldfishlocale.cc
blob: f7fd20d85fdf6a55bff4db6173c5624dc39b1ad0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
// Copyright 2022 Benjamin Barenblat
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.

#include "goldfishlocale.h"

#include <assert.h>
#include <errno.h>
#include <iconv.h>
#include <langinfo.h>
#include <locale.h>
#include <stddef.h>
#include <stdint.h>

#include <iostream>
#include <limits>
#include <locale>
#include <new>
#include <optional>
#include <stdexcept>
#include <string>
#include <system_error>
#include <type_traits>

namespace goldfishlocale_internal {

namespace {

// Name of the codeset used by the process's locale. Null until
// SetLocaleFromEnvironment stores a value. The pointee is intentionally never
// freed: std::string has a nontrivial destructor, so leaking it is simpler
// than arranging for destruction of a global.
const std::string* system_codeset = nullptr;

// The value returned by iconv_open(3) when things go wrong. This has to be a
// function template rather than a constant so that only the branch matching
// the real representation of iconv_t is ever compiled; just call it whenever
// you need the value and let the compiler inline the result.
//
// Template parameter:
//   T - the type to produce the sentinel for; defaults to iconv_t. Must be a
//       scalar type other than std::nullptr_t or a pointer to member.
//
// Returns "whatever -1 looks like" as a T.
template <typename T = iconv_t>
T InvalidIconv() noexcept {
  // POSIX specifies the invalid iconv_t as (iconv_t)-1, but it doesn't specify
  // the representation of iconv_t. On glibc, it's a pointer, but it could also
  // legally be an index into a table or something. This function thus needs to
  // work for any scalar type that -1 can be cast to.
  static_assert(std::is_scalar_v<T>);

  // It would be very strange if iconv_t were std::nullptr_t.
  static_assert(!std::is_null_pointer_v<T>);

  // -1 cannot be converted to a pointer to member at all, so reject that case
  // with a readable diagnostic instead of a failing cast.
  static_assert(!std::is_member_pointer_v<T>);

  // `if constexpr` in a template keeps the compiler from validating the branch
  // that does not apply. (If iconv_t is a pointer, static_cast<iconv_t>(-1) is
  // invalid; if it is an integer, reinterpret_cast to it may be.)
  if constexpr (std::is_pointer_v<T>) {
    return reinterpret_cast<T>(std::numeric_limits<uintptr_t>::max());
  } else {
    // static_cast rather than T{-1}: brace initialization rejects -1 as a
    // narrowing conversion when T is an unsigned integer type.
    return static_cast<T>(-1);
  }
}

// Failure sentinel returned by iconv(3). POSIX spells it (size_t)-1, which is
// the largest value a size_t can hold.
constexpr size_t kIconvError = std::numeric_limits<size_t>::max();

// Opens an iconv descriptor converting from codeset `from` to codeset `to`
// (note that iconv_open(3) itself takes the arguments in the opposite order).
//
// Returns the descriptor on success, or std::nullopt when iconv_open reports
// EINVAL, i.e., the conversion is not supported. Throws std::system_error
// carrying errno for any other failure.
std::optional<iconv_t> IconvOpen(const char* from, const char* to) {
  const iconv_t descriptor = iconv_open(to, from);
  if (descriptor != InvalidIconv()) {
    return descriptor;
  }

  // Snapshot errno once so the test and the exception agree on its value.
  const int open_errno = errno;
  if (open_errno != EINVAL) {
    throw std::system_error(open_errno, std::system_category(),
                            "goldfishlocale: iconv_open");
  }
  return std::nullopt;
}

// Owns an iconv(3) conversion descriptor and converts whole buffers with it.
// The descriptor is opened in the constructor and closed in the destructor.
//
// This class is thread-compatible.
class Iconv final {
 public:
  // Creates a converter from codeset `from` to codeset `to`.
  //
  // Throws std::system_error with EINVAL if iconv_open(3) supports no variant
  // of the conversion, or with the reported errno if it fails otherwise.
  explicit Iconv(const char* from, std::string to) {
    // Suffixes to try on the target codeset, most desirable first:
    //   //TRANSLIT  glibc's graceful degradation (i.e., converting © to (C) in
    //               locales that only support ASCII).
    //   //IGNORE    We might still be on glibc, in which case this is needed to
    //               get iconv to ignore characters that don't exist in the
    //               target character set instead of erroring out.
    //   (nothing)   The plain codeset name.
    static constexpr const char* kSuffixes[] = {"//TRANSLIT", "//IGNORE", ""};

    const size_t to_size = to.size();
    for (const char* suffix : kSuffixes) {
      // Strip whatever the previous attempt appended before adding the next
      // suffix.
      to.resize(to_size);
      to.append(suffix);
      if (std::optional<iconv_t> conv = IconvOpen(from, to.c_str());
          conv.has_value()) {
        conv_ = *conv;
        return;
      }
    }

    throw std::system_error(EINVAL, std::system_category(),
                            "goldfishlocale: iconv_open");
  }

  // The descriptor has exactly one owner; a copy would close it twice.
  Iconv(const Iconv&) = delete;
  Iconv& operator=(const Iconv&) = delete;

  // Releases the descriptor. A failure from iconv_close(3) is ignored because
  // a destructor has no way to report it.
  ~Iconv() noexcept { iconv_close(conv_); }

  // Converts the `in_bytes_left` bytes starting at `in_buf` and returns the
  // converted bytes. `in_buf` is non-const only because iconv(3) takes a
  // char**.
  //
  // Throws std::system_error carrying errno if iconv(3) fails for any reason
  // other than running out of output space.
  //
  // NOTE(review): POSIX describes a final iconv call with a null input that
  // emits the return-to-initial-state sequence for stateful target encodings.
  // This function does not make that call; confirm whether any supported
  // locale needs it.
  std::string Convert(char* in_buf, size_t in_bytes_left) {
    // Reset the iconv state.
    if (iconv(conv_, nullptr, nullptr, nullptr, nullptr) == kIconvError) {
      throw std::system_error(errno, std::system_category(),
                              "goldfishlocale: iconv");
    }

    // Having the string expand during this translation is unusual. If we're
    // going from UTF-8 to UTF-8, this is just going to be a memcpy; if we're
    // going from UTF-8 to C, most multibyte characters are going to degrade to
    // single-byte equivalents. Start by allocating the same number of bytes in
    // the output buffer as are in the input buffer; we can always expand later.
    std::string result(in_bytes_left, '\0');
    char* out_buf = result.data();
    size_t out_bytes_left = result.size();

    while (iconv(conv_, &in_buf, &in_bytes_left, &out_buf, &out_bytes_left) ==
           kIconvError) {
      if (errno != E2BIG) {
        throw std::system_error(errno, std::system_category(),
                                "goldfishlocale: iconv");
      }

      // result is full, but we still need to decode more characters. We're
      // going to reallocate result, which may invalidate out_buf; save our
      // position as an index so we can recompute out_buf later.
      const size_t result_index = static_cast<size_t>(out_buf - result.data());

      // Double the result buffer size. Keep the arithmetic in size_t so large
      // buffers don't overflow, and always grow by at least one byte so an
      // empty buffer can't make this loop spin forever.
      const size_t increment = result.empty() ? 1 : result.size();
      result.append(increment, '\0');

      out_buf = result.data() + result_index;
      out_bytes_left += increment;
    }

    // Drop the part of the buffer that iconv never wrote to.
    result.resize(result.size() - out_bytes_left);
    return result;
  }

 private:
  iconv_t conv_;
};

#ifndef NDEBUG

// The zero locale_t, which the locale functions return on failure. Value
// initialization produces zero whether locale_t is a pointer or an arithmetic
// type, so no cast is needed. If only iconv_open(3) could have worked this
// way.
constexpr locale_t kZeroLocale{};

// Owning handle for a C locale object: the locale is obtained from
// duplocale(3) on construction and released with freelocale(3) on destruction.
class Locale final {
 public:
  // Returns an owned duplicate of `locale`. Throws std::bad_alloc if
  // duplocale(3) fails with ENOMEM and std::system_error for any other
  // failure.
  static Locale Duplicate(locale_t locale) { return Locale{locale}; }

  // Copying is disabled purely to keep the implementation small; nothing
  // requires it to be. A copy could be built on duplocale(3), and a move on
  // swap.
  Locale(const Locale&) = delete;
  Locale& operator=(const Locale&) = delete;

  ~Locale() noexcept { freelocale(locale_); }

  // Borrows the underlying locale; ownership stays with this object.
  const locale_t& get() noexcept { return locale_; }

 private:
  explicit Locale(locale_t locale) : locale_(duplocale(locale)) {
    if (locale_ != kZeroLocale) {
      return;
    }

    // Duplication failed. Report memory exhaustion the way the rest of C++
    // does, and everything else as a system error.
    const int dup_errno = errno;
    if (dup_errno == ENOMEM) {
      throw std::bad_alloc();
    }
    throw std::system_error(dup_errno, std::system_category(),
                            "goldfishlocale: duplocale");
  }

  locale_t locale_;
};

// Looks up the codeset of the current thread's locale.
std::string CurrentThreadCodeset() {
  // Get an (unowned) reference to the current thread's locale.
  locale_t locale_desc = uselocale(kZeroLocale);
  if (locale_desc == kZeroLocale) {
    throw std::system_error(errno, std::system_category(),
                            "goldfishlocale: uselocale");
  }

  // locale_desc might be LC_GLOBAL_LOCALE, which is an illegal argument to
  // nl_langinfo_l(3). Duplicate the locale before asking for the codeset to get
  // rid of any LC_GLOBAL_LOCALEs.
  return nl_langinfo_l(CODESET, Locale::Duplicate(locale_desc).get());
}

#endif  // !defined(NDEBUG)

}  // namespace

std::string ToSystem(char* in_buf, size_t in_bytes_left) {
#ifndef NDEBUG
  if (system_codeset == nullptr) {
    throw std::logic_error(
        "goldfishlocale: ToSystem was called before SetLocaleFromEnvironment");
  }

  if (CurrentThreadCodeset() != *system_codeset) {
    throw std::logic_error(
        "goldfishlocale: Process locale changed during execution");
  }
#endif

  // Iconv is thread-compatible, not thread-safe. Make it thread-safe and avoid
  // contention by just having one per thread.
  static thread_local Iconv conv("UTF-8", *system_codeset);

  return conv.Convert(in_buf, in_bytes_left);
}

}  // namespace goldfishlocale_internal

namespace goldfishlocale {

// Switches the process to the locale named by the environment: installs it as
// the global C++ locale, imbues the eight standard narrow and wide streams
// with it, and records the resulting codeset name for ToSystem.
//
// In builds without NDEBUG, throws std::logic_error if called a second time.
void SetLocaleFromEnvironment() {
#ifndef NDEBUG
  if (goldfishlocale_internal::system_codeset) {
    throw std::logic_error(
        "goldfishlocale: SetLocaleFromEnvironment has already been called");
  }
#endif

  const std::locale environment_locale("");
  std::locale::global(environment_locale);

  // Imbue each stream in argument order. A generic lambda is used because the
  // narrow and wide streams have unrelated types.
  const auto imbue_all = [&environment_locale](auto&... streams) {
    (static_cast<void>(streams.imbue(environment_locale)), ...);
  };
  imbue_all(std::cin, std::cout, std::cerr, std::clog,  //
            std::wcin, std::wcout, std::wcerr, std::wclog);

  // Intentionally leaked; see the declaration of system_codeset.
  goldfishlocale_internal::system_codeset =
      new std::string(nl_langinfo(CODESET));
}

}  // namespace goldfishlocale