// Copyright 2022 Benjamin Barenblat // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy of // the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the // License for the specific language governing permissions and limitations under // the License. #include "goldfishlocale.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace goldfishlocale_internal { namespace { // The codeset of the current process's locale. std::string isn't trivially // destructible, so we just leak this. const std::string* system_codeset; // The value returned by iconv_open(3) when things go wrong. This needs to be a // function for Reasons; just call this function whenever you need the // value, and let the compiler inline the value. template T InvalidIconv() noexcept { // POSIX specifies the invalid iconv_t as as (iconv_t)-1, but it doesn't // specify the representation of iconv_t. On glibc, it's a pointer, but it // could also legally be an index into a table or something. This function // thus needs to return "whatever -1 looks like" for any scalar type. static_assert(std::is_scalar_v); // It would be very strange if iconv_t were std::nullptr_t. static_assert(!std::is_null_pointer_v); // This function is a template so the compiler doesn't try to validate // branches of this if statement. (If iconv_t is a pointer, iconv_t{-1} is // invalid.) if constexpr (std::is_pointer_v || std::is_member_pointer_v) { return reinterpret_cast(std::numeric_limits::max()); } else { return T{-1}; } } // The value returned by iconv(3) when things go wrong. constexpr auto kIconvError = // POSIX specifies this as (size_t)-1. static_cast(-1); // Convenience wrapper for iconv_open(3). Returns a new iconv_t if one can be // constructed and std::nullopt if iconv can't handle the conversion. std::optional IconvOpen(const char* from, const char* to) { iconv_t conv = iconv_open(to, from); if (conv == InvalidIconv()) { if (errno == EINVAL) { return std::nullopt; } throw std::system_error(errno, std::system_category(), "goldfishlocale: iconv_open"); } return conv; } // This class is thread-compatible. class Iconv final { public: // Creates an iconv converter. explicit Iconv(const char* from, std::string to) { size_t to_size = to.size(); // glibc has a nice mode for graceful degradation (i.e., converting © to (C) // in locales that only support ASCII). Try that first. to.append("//TRANSLIT"); if (std::optional conv = IconvOpen(from, to.c_str()); conv.has_value()) { conv_ = *conv; return; } // We might still be on glibc, in which case we need to specify //IGNORE to // get iconv to ignore characters that don't exist in the target character // set instead of erroring out. to.replace(to_size, to.size(), "//IGNORE"); if (std::optional conv = IconvOpen(from, to.c_str()); conv.has_value()) { conv_ = *conv; return; } to.resize(to_size); if (std::optional conv = IconvOpen(from, to.c_str()); conv.has_value()) { conv_ = *conv; return; } throw std::system_error(EINVAL, std::system_category(), "goldfishlocale: iconv_open"); } std::string Convert(char* in_buf, size_t in_bytes_left) { // Reset the iconv state. if (iconv(conv_, nullptr, nullptr, nullptr, nullptr) == kIconvError) { throw std::system_error(errno, std::system_category(), "goldfishlocale: iconv"); } // Having the string expand during this translation is unusual. If we're // going from UTF-8 to UTF-8, this is just going to be a memcpy; if we're // going from UTF-8 to C, most multibyte characters are going to degrade to // single-byte equivalents. Start by allocating the same number of bytes in // the output buffer as are in the input buffer; we can always expand later. std::string result(in_bytes_left, '\0'); char* out_buf = result.data(); size_t out_bytes_left = result.size() * sizeof(decltype(result)::value_type); while (iconv(conv_, &in_buf, &in_bytes_left, &out_buf, &out_bytes_left) == kIconvError) { if (errno == E2BIG) { // result is full, but we still need to decode more characters. We're // going to reallocate result, which may invalidate out_buf; save our // position as an index so we can recompute out_buf later. ptrdiff_t result_index = out_buf - result.data(); // Just double the result buffer size. int increment = result.size(); result.append(increment, '\0'); out_buf = result.data() + result_index; out_bytes_left += increment; } else { throw std::system_error(errno, std::system_category(), "goldfishlocale: iconv"); } } result.resize(result.size() - out_bytes_left); return result; } private: iconv_t conv_; }; #ifndef NDEBUG // The error code returned from locale functions. Since the error code is 0, // this is valid whether locale_t is a pointer or an arithmetic type. If only // iconv_open(3) could have worked this way. constexpr locale_t kZeroLocale{0}; // A C locale that we own. class Locale final { public: static Locale Duplicate(locale_t locale) { return Locale(locale); } // These are deleted for simplicity's sake in the current implementation; // there's no requirement that they be absent. Copies could be implemented // atop duplocale(3), and moves are easy to implement via swap. Locale(const Locale&) = delete; Locale& operator=(const Locale&) = delete; ~Locale() noexcept { freelocale(locale_); } const locale_t& get() noexcept { return locale_; } private: explicit Locale(locale_t locale) : locale_(duplocale(locale)) { if (locale_ == kZeroLocale) { if (errno == ENOMEM) { throw std::bad_alloc(); } throw std::system_error(errno, std::system_category(), "goldfishlocale: duplocale"); } } locale_t locale_; }; // Looks up the codeset of the current thread's locale. std::string CurrentThreadCodeset() { // Get an (unowned) reference to the current thread's locale. locale_t locale_desc = uselocale(kZeroLocale); if (locale_desc == kZeroLocale) { throw std::system_error(errno, std::system_category(), "goldfishlocale: uselocale"); } // locale_desc might be LC_GLOBAL_LOCALE, which is an illegal argument to // nl_langinfo_l(3). Duplicate the locale before asking for the codeset to get // rid of any LC_GLOBAL_LOCALEs. return nl_langinfo_l(CODESET, Locale::Duplicate(locale_desc).get()); } #endif // !defined(NDEBUG) } // namespace std::string ToSystem(char* in_buf, size_t in_bytes_left) { #ifndef NDEBUG if (system_codeset == nullptr) { throw std::logic_error( "goldfishlocale: ToSystem was called before SetLocaleFromEnvironment"); } if (CurrentThreadCodeset() != *system_codeset) { throw std::logic_error( "goldfishlocale: Process locale changed during execution"); } #endif // Iconv is thread-compatible, not thread-safe. Make it thread-safe and avoid // contention by just having one per thread. static thread_local Iconv conv("UTF-8", *system_codeset); return conv.Convert(in_buf, in_bytes_left); } } // namespace goldfishlocale_internal namespace goldfishlocale { void SetLocaleFromEnvironment() { #ifndef NDEBUG if (goldfishlocale_internal::system_codeset != nullptr) { throw std::logic_error( "goldfishlocale: SetLocaleFromEnvironment has already been called"); } #endif std::locale loc(""); std::locale::global(loc); std::cin.imbue(loc); std::cout.imbue(loc); std::cerr.imbue(loc); std::clog.imbue(loc); std::wcin.imbue(loc); std::wcout.imbue(loc); std::wcerr.imbue(loc); std::wclog.imbue(loc); goldfishlocale_internal::system_codeset = new std::string(nl_langinfo(CODESET)); } } // namespace goldfishlocale