From 07c61498bd7fa6166029c1ab093a35f82d926667 Mon Sep 17 00:00:00 2001
From: Benjamin Barenblat
Date: Tue, 25 Jan 2022 18:23:21 -0500
Subject: skiphead, a program to preserve headers in a pipeline

This is a faster, more robust rewrite of a shell script I wrote a few years ago
to preserve headers when grepping through program output. I can never remember
what the headers are when I run things like 'ps', so being able to say
something like 'ps -ef | skiphead grep systemd' is useful.

As a bonus, the program detects your locale and automatically displays help and
error messages using the correct encoding.
---
 goldfishlocale.cc | 294 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 294 insertions(+)
 create mode 100644 goldfishlocale.cc

(limited to 'goldfishlocale.cc')

diff --git a/goldfishlocale.cc b/goldfishlocale.cc
new file mode 100644
index 0000000..f7fd20d
--- /dev/null
+++ b/goldfishlocale.cc
@@ -0,0 +1,294 @@
+// Copyright 2022 Benjamin Barenblat
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy of
+// the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations under
+// the License.
+
+#include "goldfishlocale.h"
+
+#include <errno.h>
+#include <iconv.h>
+#include <langinfo.h>
+#include <locale.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <iostream>
+#include <limits>
+#include <locale>
+#include <new>
+#include <optional>
+#include <stdexcept>
+#include <string>
+#include <system_error>
+#include <type_traits>
+
+namespace goldfishlocale_internal {
+
+namespace {
+
+// The codeset of the current process's locale. std::string isn't trivially
+// destructible, so we just leak this.
+const std::string* system_codeset;
+
+// The value returned by iconv_open(3) when things go wrong. This needs to be a
+// function template (the body explains why); just call this function whenever
+// you need the value, and let the compiler inline the value.
+template <typename T>
+T InvalidIconv() noexcept {
+  // POSIX specifies the invalid iconv_t as (iconv_t)-1, but it doesn't specify
+  // the representation of iconv_t. On glibc, it's a pointer, but it could also
+  // legally be an index into a table or something. This function thus needs to
+  // return "whatever -1 looks like" for any scalar type.
+  static_assert(std::is_scalar_v<T>);
+
+  // It would be very strange if iconv_t were std::nullptr_t.
+  static_assert(!std::is_null_pointer_v<T>);
+
+  // This function is a template so the compiler doesn't try to validate
+  // branches of this if statement. (If iconv_t is a pointer, iconv_t{-1} is
+  // invalid.)
+  if constexpr (std::is_pointer_v<T> || std::is_member_pointer_v<T>) {
+    return reinterpret_cast<T>(std::numeric_limits<uintptr_t>::max());
+  } else {
+    return T{-1};
+  }
+}
+
+// The value returned by iconv(3) when things go wrong.
+constexpr auto kIconvError =
+    // POSIX specifies this as (size_t)-1.
+    static_cast<size_t>(-1);
+
+// Convenience wrapper for iconv_open(3). Returns a new iconv_t if one can be
+// constructed and std::nullopt if iconv can't handle the conversion. Throws
+// std::system_error if iconv_open fails for any other reason.
+std::optional<iconv_t> IconvOpen(const char* from, const char* to) {
+  iconv_t conv = iconv_open(to, from);
+  if (conv == InvalidIconv<iconv_t>()) {
+    if (errno == EINVAL) {
+      return std::nullopt;
+    }
+    throw std::system_error(errno, std::system_category(),
+                            "goldfishlocale: iconv_open");
+  }
+  return conv;
+}
+
+// Owns an iconv(3) conversion descriptor and closes it on destruction. This
+// class is thread-compatible.
+class Iconv final {
+ public:
+  // Creates an iconv converter from codeset `from` to codeset `to`. Throws
+  // std::system_error if iconv cannot perform the conversion.
+  explicit Iconv(const char* from, std::string to) {
+    const size_t to_size = to.size();
+
+    // Try progressively less capable targets. glibc has a nice mode for
+    // graceful degradation (i.e., converting © to (C) in locales that only
+    // support ASCII), so try //TRANSLIT first. Failing that, we might still be
+    // on glibc, in which case we need //IGNORE to get iconv to ignore
+    // characters that don't exist in the target character set instead of
+    // erroring out. Finally, fall back to the bare codeset name.
+    for (const char* suffix : {"//TRANSLIT", "//IGNORE", ""}) {
+      to.resize(to_size);
+      to.append(suffix);
+      if (std::optional<iconv_t> conv = IconvOpen(from, to.c_str());
+          conv.has_value()) {
+        conv_ = *conv;
+        return;
+      }
+    }
+
+    throw std::system_error(EINVAL, std::system_category(),
+                            "goldfishlocale: iconv_open");
+  }
+
+  // The descriptor is uniquely owned, so copying is not allowed.
+  Iconv(const Iconv&) = delete;
+  Iconv& operator=(const Iconv&) = delete;
+
+  // Releases the descriptor. Without this, every thread that has converted a
+  // string would leak its descriptor on exit.
+  ~Iconv() noexcept { iconv_close(conv_); }
+
+  // Converts the in_bytes_left bytes starting at in_buf and returns the
+  // result. Throws std::system_error if iconv reports any failure other than
+  // a full output buffer.
+  std::string Convert(char* in_buf, size_t in_bytes_left) {
+    // Reset the iconv state.
+    if (iconv(conv_, nullptr, nullptr, nullptr, nullptr) == kIconvError) {
+      throw std::system_error(errno, std::system_category(),
+                              "goldfishlocale: iconv");
+    }
+
+    // Having the string expand during this translation is unusual. If we're
+    // going from UTF-8 to UTF-8, this is just going to be a memcpy; if we're
+    // going from UTF-8 to C, most multibyte characters are going to degrade to
+    // single-byte equivalents. Start by allocating the same number of bytes in
+    // the output buffer as are in the input buffer; we can always expand later.
+    std::string result(in_bytes_left, '\0');
+    char* out_buf = result.data();
+    size_t out_bytes_left = result.size();
+
+    // Doubles the size of result (or gives it a small initial size if it is
+    // empty, so that growing always makes progress). Reallocating result may
+    // invalidate out_buf, so save the write position as an index and recompute
+    // out_buf afterward.
+    const auto grow = [&result, &out_buf, &out_bytes_left] {
+      constexpr size_t kMinimumGrowth = 16;
+      const size_t result_index = static_cast<size_t>(out_buf - result.data());
+      const size_t increment = result.empty() ? kMinimumGrowth : result.size();
+      result.append(increment, '\0');
+      out_buf = result.data() + result_index;
+      out_bytes_left += increment;
+    };
+
+    while (iconv(conv_, &in_buf, &in_bytes_left, &out_buf, &out_bytes_left) ==
+           kIconvError) {
+      if (errno != E2BIG) {
+        throw std::system_error(errno, std::system_category(),
+                                "goldfishlocale: iconv");
+      }
+      // result is full, but we still need to decode more characters.
+      grow();
+    }
+
+    // Flush the converter. For stateful target codesets, this emits the byte
+    // sequence that returns the output to its initial shift state; for
+    // stateless ones, it writes nothing.
+    while (iconv(conv_, nullptr, nullptr, &out_buf, &out_bytes_left) ==
+           kIconvError) {
+      if (errno != E2BIG) {
+        throw std::system_error(errno, std::system_category(),
+                                "goldfishlocale: iconv");
+      }
+      grow();
+    }
+
+    // Trim the unused tail of the output buffer.
+    result.resize(result.size() - out_bytes_left);
+    return result;
+  }
+
+ private:
+  iconv_t conv_;
+};
+
+#ifndef NDEBUG
+
+// The error code returned from locale functions. Since the error code is 0,
+// this is valid whether locale_t is a pointer or an arithmetic type. If only
+// iconv_open(3) could have worked this way.
+constexpr locale_t kZeroLocale{0};
+
+// A C locale that we own.
+class Locale final {
+ public:
+  // Returns a Locale owning a duplicate of `locale`. Throws std::bad_alloc or
+  // std::system_error if duplocale(3) fails.
+  static Locale Duplicate(locale_t locale) { return Locale(locale); }
+
+  // These are deleted for simplicity's sake in the current implementation;
+  // there's no requirement that they be absent. Copies could be implemented
+  // atop duplocale(3), and moves are easy to implement via swap.
+  Locale(const Locale&) = delete;
+  Locale& operator=(const Locale&) = delete;
+
+  ~Locale() noexcept { freelocale(locale_); }
+
+  const locale_t& get() noexcept { return locale_; }
+
+ private:
+  explicit Locale(locale_t locale) : locale_(duplocale(locale)) {
+    if (locale_ == kZeroLocale) {
+      if (errno == ENOMEM) {
+        throw std::bad_alloc();
+      }
+      throw std::system_error(errno, std::system_category(),
+                              "goldfishlocale: duplocale");
+    }
+  }
+
+  locale_t locale_;
+};
+
+// Looks up the codeset of the current thread's locale.
+std::string CurrentThreadCodeset() {
+  // Get an (unowned) reference to the current thread's locale.
+  locale_t locale_desc = uselocale(kZeroLocale);
+  if (locale_desc == kZeroLocale) {
+    throw std::system_error(errno, std::system_category(),
+                            "goldfishlocale: uselocale");
+  }
+
+  // locale_desc might be LC_GLOBAL_LOCALE, which is an illegal argument to
+  // nl_langinfo_l(3). Duplicate the locale before asking for the codeset to get
+  // rid of any LC_GLOBAL_LOCALEs.
+  return nl_langinfo_l(CODESET, Locale::Duplicate(locale_desc).get());
+}
+
+#endif  // !defined(NDEBUG)
+
+}  // namespace
+
+// Converts the in_bytes_left bytes of UTF-8 starting at in_buf to the codeset
+// recorded by SetLocaleFromEnvironment.
+std::string ToSystem(char* in_buf, size_t in_bytes_left) {
+#ifndef NDEBUG
+  if (system_codeset == nullptr) {
+    throw std::logic_error(
+        "goldfishlocale: ToSystem was called before SetLocaleFromEnvironment");
+  }
+
+  if (CurrentThreadCodeset() != *system_codeset) {
+    throw std::logic_error(
+        "goldfishlocale: Process locale changed during execution");
+  }
+#endif
+
+  // Iconv is thread-compatible, not thread-safe. Make it thread-safe and avoid
+  // contention by just having one per thread.
+  static thread_local Iconv conv("UTF-8", *system_codeset);
+
+  return conv.Convert(in_buf, in_bytes_left);
+}
+
+}  // namespace goldfishlocale_internal
+
+namespace goldfishlocale {
+
+// Installs the environment's locale as the global C++ locale, imbues the
+// standard streams with it, and records its codeset for ToSystem.
+void SetLocaleFromEnvironment() {
+#ifndef NDEBUG
+  if (goldfishlocale_internal::system_codeset != nullptr) {
+    throw std::logic_error(
+        "goldfishlocale: SetLocaleFromEnvironment has already been called");
+  }
+#endif
+
+  std::locale loc("");
+  std::locale::global(loc);
+  std::cin.imbue(loc);
+  std::cout.imbue(loc);
+  std::cerr.imbue(loc);
+  std::clog.imbue(loc);
+  std::wcin.imbue(loc);
+  std::wcout.imbue(loc);
+  std::wcerr.imbue(loc);
+  std::wclog.imbue(loc);
+
+  goldfishlocale_internal::system_codeset =
+      new std::string(nl_langinfo(CODESET));
+}
+
+}  // namespace goldfishlocale
-- 
cgit v1.2.3