summaryrefslogtreecommitdiff
path: root/goldfishlocale.cc
diff options
context:
space:
mode:
Diffstat (limited to 'goldfishlocale.cc')
-rw-r--r--goldfishlocale.cc268
1 files changed, 268 insertions, 0 deletions
diff --git a/goldfishlocale.cc b/goldfishlocale.cc
new file mode 100644
index 0000000..f7fd20d
--- /dev/null
+++ b/goldfishlocale.cc
@@ -0,0 +1,268 @@
+// Copyright 2022 Benjamin Barenblat
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy of
+// the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations under
+// the License.
+
+#include "goldfishlocale.h"
+
+#include <assert.h>
+#include <errno.h>
+#include <iconv.h>
+#include <langinfo.h>
+#include <locale.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <iostream>
+#include <limits>
+#include <locale>
+#include <new>
+#include <optional>
+#include <stdexcept>
+#include <string>
+#include <system_error>
+#include <type_traits>
+
+namespace goldfishlocale_internal {
+
+namespace {
+
+// The codeset of the current process's locale, or nullptr until
+// goldfishlocale::SetLocaleFromEnvironment has stored one. std::string isn't
+// trivially destructible, so the string is heap-allocated there and we just
+// leak it instead of destroying it at exit.
+const std::string* system_codeset;
+
+// The value returned by iconv_open(3) when things go wrong. This has to be a
+// function template rather than a constexpr constant: the pointer case needs a
+// reinterpret_cast, which is not permitted in constant expressions, and only a
+// template lets the compiler skip the branch that does not fit iconv_t. Just
+// call this function whenever you need the value, and let the compiler inline
+// it.
+//
+// T is the scalar type to produce; it defaults to iconv_t and exists mainly so
+// that the discarded `if constexpr` branch is never validated.
+template <typename T = iconv_t>
+T InvalidIconv() noexcept {
+  // POSIX specifies the invalid iconv_t as (iconv_t)-1, but it doesn't specify
+  // the representation of iconv_t. On glibc, it's a pointer, but it could also
+  // legally be an index into a table or something. This function thus needs to
+  // return "whatever -1 looks like" for any scalar type.
+  static_assert(std::is_scalar_v<T>);
+
+  // It would be very strange if iconv_t were std::nullptr_t or a pointer to
+  // member, and -1 cannot be converted to either of them anyway.
+  static_assert(!std::is_null_pointer_v<T>);
+  static_assert(!std::is_member_pointer_v<T>);
+
+  if constexpr (std::is_pointer_v<T>) {
+    // An integer cannot be static_cast to a pointer, so reinterpret the
+    // all-ones pointer-sized integer instead.
+    return reinterpret_cast<T>(std::numeric_limits<uintptr_t>::max());
+  } else {
+    // static_cast mirrors the POSIX spelling and, unlike T{-1}, is not rejected
+    // as a narrowing conversion when T is unsigned or an enumeration.
+    return static_cast<T>(-1);
+  }
+}
+
+// Sentinel that iconv(3) returns to report a failed conversion. POSIX defines
+// it as (size_t)-1.
+constexpr size_t kIconvError = static_cast<size_t>(-1);
+
+// Thin wrapper around iconv_open(3) that takes the source encoding first.
+// Yields the new conversion descriptor on success and std::nullopt when
+// iconv_open fails with EINVAL, i.e. iconv can't handle the conversion. Any
+// other failure is thrown as a std::system_error carrying errno.
+std::optional<iconv_t> IconvOpen(const char* from, const char* to) {
+  // Mind the argument order: iconv_open wants the target encoding first.
+  const iconv_t descriptor = iconv_open(to, from);
+  if (descriptor != InvalidIconv()) {
+    return descriptor;
+  }
+  if (errno != EINVAL) {
+    throw std::system_error(errno, std::system_category(),
+                            "goldfishlocale: iconv_open");
+  }
+  return std::nullopt;
+}
+
+// Owns an iconv(3) conversion descriptor and closes it on destruction.
+//
+// This class is thread-compatible.
+class Iconv final {
+ public:
+  // Creates a converter from the `from` encoding to the `to` encoding.
+  //
+  // Throws std::system_error with EINVAL if iconv cannot perform the conversion
+  // in any of the variants tried below, or with whatever errno iconv_open(3)
+  // reported if it fails for another reason.
+  explicit Iconv(const char* from, std::string to) {
+    // Variants of the target encoding, most preferred first:
+    //   //TRANSLIT  glibc's graceful-degradation mode (i.e., converting © to
+    //               (C) in locales that only support ASCII).
+    //   //IGNORE    we might still be on glibc, in which case this makes iconv
+    //               skip characters that don't exist in the target character
+    //               set instead of erroring out.
+    //   (nothing)   the plain encoding name.
+    static constexpr const char* kSuffixes[] = {"//TRANSLIT", "//IGNORE", ""};
+
+    const size_t to_size = to.size();
+    for (const char* suffix : kSuffixes) {
+      // Drop the suffix from the previous attempt before adding the next one.
+      to.resize(to_size);
+      to.append(suffix);
+      if (std::optional<iconv_t> conv = IconvOpen(from, to.c_str());
+          conv.has_value()) {
+        conv_ = *conv;
+        return;
+      }
+    }
+
+    throw std::system_error(EINVAL, std::system_category(),
+                            "goldfishlocale: iconv_open");
+  }
+
+  // The descriptor has exactly one owner, so copying is disabled; otherwise two
+  // objects would close the same descriptor.
+  Iconv(const Iconv&) = delete;
+  Iconv& operator=(const Iconv&) = delete;
+
+  // Releases the descriptor. A failure of iconv_close(3) cannot be reported
+  // from a destructor and is ignored.
+  ~Iconv() noexcept { iconv_close(conv_); }
+
+  // Converts the `in_bytes_left` bytes starting at `in_buf` and returns the
+  // converted bytes. `in_buf` is a pointer to non-const char only because that
+  // is the type iconv(3) takes.
+  //
+  // Throws std::system_error carrying errno if iconv(3) fails for any reason
+  // other than running out of output space.
+  std::string Convert(char* in_buf, size_t in_bytes_left) {
+    // Reset the iconv state.
+    if (iconv(conv_, nullptr, nullptr, nullptr, nullptr) == kIconvError) {
+      throw std::system_error(errno, std::system_category(),
+                              "goldfishlocale: iconv");
+    }
+
+    // Having the string expand during this translation is unusual. If we're
+    // going from UTF-8 to UTF-8, this is just going to be a memcpy; if we're
+    // going from UTF-8 to C, most multibyte characters are going to degrade to
+    // single-byte equivalents. Start by allocating the same number of bytes in
+    // the output buffer as are in the input buffer; we can always expand later.
+    std::string result(in_bytes_left, '\0');
+    char* out_buf = result.data();
+    size_t out_bytes_left = result.size();
+
+    // Enlarges result after iconv reported E2BIG. Appending may reallocate and
+    // thereby invalidate out_buf, so the write position is carried across as an
+    // index and out_buf is recomputed afterwards.
+    const auto grow = [&result, &out_buf, &out_bytes_left] {
+      // Growth used when result is still empty; doubling zero would never make
+      // any room.
+      constexpr size_t kMinimumGrowth = 16;
+
+      const size_t used = static_cast<size_t>(out_buf - result.data());
+      // Otherwise just double the result buffer size.
+      const size_t increment = result.empty() ? kMinimumGrowth : result.size();
+      result.append(increment, '\0');
+
+      out_buf = result.data() + used;
+      out_bytes_left += increment;
+    };
+
+    // Convert the input, growing the output whenever it fills up.
+    while (iconv(conv_, &in_buf, &in_bytes_left, &out_buf, &out_bytes_left) ==
+           kIconvError) {
+      if (errno != E2BIG) {
+        throw std::system_error(errno, std::system_category(),
+                                "goldfishlocale: iconv");
+      }
+      grow();
+    }
+
+    // Flush: with a null input and a non-null output, iconv writes whatever
+    // byte sequence is needed to return a stateful target encoding to its
+    // initial shift state. Stateless encodings emit nothing here.
+    while (iconv(conv_, nullptr, nullptr, &out_buf, &out_bytes_left) ==
+           kIconvError) {
+      if (errno != E2BIG) {
+        throw std::system_error(errno, std::system_category(),
+                                "goldfishlocale: iconv");
+      }
+      grow();
+    }
+
+    // Trim the part of the buffer that iconv never wrote to.
+    result.resize(result.size() - out_bytes_left);
+    return result;
+  }
+
+ private:
+  iconv_t conv_;
+};
+
+#ifndef NDEBUG
+
+// The zero locale_t. The locale functions used in this file hand it back to
+// signal an error, and this file also passes it to uselocale(3). Because the
+// value is 0, the definition is valid whether locale_t is a pointer or an
+// arithmetic type. If only iconv_open(3) could have worked this way.
+constexpr locale_t kZeroLocale = 0;
+
+// Owning handle for a C locale; the locale is released with freelocale(3) when
+// the handle is destroyed.
+class Locale final {
+ public:
+  // Makes a privately owned copy of `locale` with duplocale(3). Throws
+  // std::bad_alloc if duplocale fails with ENOMEM and std::system_error
+  // carrying errno for any other failure.
+  static Locale Duplicate(locale_t locale) {
+    const locale_t copy = duplocale(locale);
+    if (copy != kZeroLocale) {
+      return Locale(copy);
+    }
+    if (errno == ENOMEM) {
+      throw std::bad_alloc();
+    }
+    throw std::system_error(errno, std::system_category(),
+                            "goldfishlocale: duplocale");
+  }
+
+  // Copying is switched off only to keep the current implementation simple;
+  // nothing requires it to be absent. Copies could be built on duplocale(3),
+  // and moves are easy to implement via swap.
+  Locale(const Locale&) = delete;
+  Locale& operator=(const Locale&) = delete;
+
+  ~Locale() noexcept { freelocale(locale_); }
+
+  // Borrows the underlying locale_t; this object keeps ownership.
+  const locale_t& get() noexcept { return locale_; }
+
+ private:
+  // Adopts `owned`, which must already be a valid locale that nobody else
+  // frees.
+  explicit Locale(locale_t owned) : locale_(owned) {}
+
+  locale_t locale_;
+};
+
+// Returns the name of the codeset used by the calling thread's locale. Throws
+// std::system_error if uselocale(3) hands back the zero locale; errors from
+// duplicating the locale propagate unchanged.
+std::string CurrentThreadCodeset() {
+  // Ask for an (unowned) reference to the current thread's locale.
+  const locale_t thread_locale = uselocale(kZeroLocale);
+  if (thread_locale == kZeroLocale) {
+    throw std::system_error(errno, std::system_category(),
+                            "goldfishlocale: uselocale");
+  }
+
+  // thread_locale might be LC_GLOBAL_LOCALE, which is an illegal argument to
+  // nl_langinfo_l(3). Querying a duplicate instead gets rid of any
+  // LC_GLOBAL_LOCALEs. The duplicate outlives the construction of the returned
+  // string, which copies the name out of nl_langinfo_l's result.
+  Locale owned_copy = Locale::Duplicate(thread_locale);
+  return nl_langinfo_l(CODESET, owned_copy.get());
+}
+
+#endif // !defined(NDEBUG)
+
+} // namespace
+
+// Converts the `in_bytes_left` bytes of UTF-8 starting at `in_buf` into the
+// codeset stored in system_codeset and returns the converted bytes.
+//
+// Unless NDEBUG is defined, throws std::logic_error if system_codeset has not
+// been set yet or if the calling thread's codeset differs from it.
+std::string ToSystem(char* in_buf, size_t in_bytes_left) {
+#ifndef NDEBUG
+  if (system_codeset == nullptr) {
+    throw std::logic_error(
+        "goldfishlocale: ToSystem was called before SetLocaleFromEnvironment");
+  }
+
+  if (const std::string thread_codeset = CurrentThreadCodeset();
+      thread_codeset != *system_codeset) {
+    throw std::logic_error(
+        "goldfishlocale: Process locale changed during execution");
+  }
+#endif
+
+  // Iconv is thread-compatible, not thread-safe. Giving every thread its own
+  // converter makes this function thread-safe without any contention; each
+  // thread builds its converter the first time it gets here.
+  static thread_local Iconv converter("UTF-8", *system_codeset);
+  return converter.Convert(in_buf, in_bytes_left);
+}
+
+} // namespace goldfishlocale_internal
+
+namespace goldfishlocale {
+
+// Switches the process to the user-preferred locale, std::locale(""): installs
+// it as the global C++ locale, imbues the eight standard narrow and wide
+// streams with it, and stores the resulting codeset name in
+// goldfishlocale_internal::system_codeset.
+//
+// Unless NDEBUG is defined, throws std::logic_error if a codeset has already
+// been stored, i.e. on a second call.
+void SetLocaleFromEnvironment() {
+  namespace internal = goldfishlocale_internal;
+
+#ifndef NDEBUG
+  if (internal::system_codeset != nullptr) {
+    throw std::logic_error(
+        "goldfishlocale: SetLocaleFromEnvironment has already been called");
+  }
+#endif
+
+  const std::locale environment_locale("");
+  std::locale::global(environment_locale);
+
+  // Imbue every stream, left to right, through its own imbue member.
+  const auto imbue_all = [&environment_locale](auto&... streams) {
+    (static_cast<void>(streams.imbue(environment_locale)), ...);
+  };
+  imbue_all(std::cin, std::cout, std::cerr, std::clog,  //
+            std::wcin, std::wcout, std::wcerr, std::wclog);
+
+  // Intentionally never freed; see the declaration of system_codeset.
+  internal::system_codeset = new std::string(nl_langinfo(CODESET));
+}
+
+} // namespace goldfishlocale