From 0112143fdaae0a6264d9e02355e9dc0ca4f7741c Mon Sep 17 00:00:00 2001 From: wm4 Date: Tue, 17 Dec 2013 02:39:45 +0100 Subject: Split mpvcore/ into common/, misc/, bstr/ --- misc/charset_conv.c | 287 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 287 insertions(+) create mode 100644 misc/charset_conv.c (limited to 'misc/charset_conv.c') diff --git a/misc/charset_conv.c b/misc/charset_conv.c new file mode 100644 index 0000000000..fe396e8ef5 --- /dev/null +++ b/misc/charset_conv.c @@ -0,0 +1,287 @@ +/* + * This file is part of mpv. + * + * Based on code taken from libass (ISC license), which was originally part + * of MPlayer (GPL). + * Copyright (C) 2006 Evgeniy Stepanov + * + * mpv is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with mpv. If not, see . + */ + +#include +#include +#include + +#include "config.h" + +#include "common/msg.h" + +#if HAVE_ENCA +#include +#endif + +#if HAVE_LIBGUESS +#include +#endif + +#if HAVE_ICONV +#include +#endif + +#include "charset_conv.h" + +bool mp_charset_is_utf8(const char *user_cp) +{ + return user_cp && (strcasecmp(user_cp, "utf8") == 0 || + strcasecmp(user_cp, "utf-8") == 0); +} + +// Split the string on ':' into components. +// out_arr is at least max entries long. +// Return number of out_arr entries filled. +static int split_colon(const char *user_cp, int max, bstr *out_arr) +{ + if (!user_cp || max < 1) + return 0; + + int count = 0; + while (1) { + const char *next = strchr(user_cp, ':'); + if (next && max - count > 1) { + out_arr[count++] = (bstr){(char *)user_cp, next - user_cp}; + user_cp = next + 1; + } else { + out_arr[count++] = (bstr){(char *)user_cp, strlen(user_cp)}; + break; + } + } + return count; +} + +// Returns true if user_cp implies that calling mp_charset_guess() on the +// input data is required to determine the real codepage. This is the case +// if user_cp is not a real iconv codepage, but a magic value that requests +// for example ENCA charset auto-detection. +bool mp_charset_requires_guess(const char *user_cp) +{ + bstr res[2] = {{0}}; + int r = split_colon(user_cp, 2, res); + // Note that "utf8" is the UTF-8 codepage, while "utf8:..." specifies UTF-8 + // by default, plus a codepage that is used if the input is not UTF-8. + return bstrcasecmp0(res[0], "enca") == 0 || + bstrcasecmp0(res[0], "guess") == 0 || + (r > 1 && bstrcasecmp0(res[0], "utf-8") == 0) || + (r > 1 && bstrcasecmp0(res[0], "utf8") == 0); +} + +#if HAVE_ENCA +static const char *enca_guess(bstr buf, const char *language) +{ + if (!language || !language[0]) + language = "__"; // neutral language + + const char *detected_cp = NULL; + + EncaAnalyser analyser = enca_analyser_alloc(language); + if (analyser) { + enca_set_termination_strictness(analyser, 0); + EncaEncoding enc = enca_analyse_const(analyser, buf.start, buf.len); + const char *tmp = enca_charset_name(enc.charset, ENCA_NAME_STYLE_ICONV); + if (tmp && enc.charset != ENCA_CS_UNKNOWN) + detected_cp = tmp; + enca_analyser_free(analyser); + } else { + mp_msg(MSGT_SUBREADER, MSGL_ERR, "ENCA doesn't know language '%s'\n", + language); + size_t langcnt; + const char **languages = enca_get_languages(&langcnt); + mp_msg(MSGT_SUBREADER, MSGL_ERR, "ENCA supported languages:"); + for (int i = 0; i < langcnt; i++) + mp_msg(MSGT_SUBREADER, MSGL_ERR, " %s", languages[i]); + mp_msg(MSGT_SUBREADER, MSGL_ERR, "\n"); + free(languages); + } + + return detected_cp; +} +#endif + +#if HAVE_LIBGUESS +static const char *libguess_guess(bstr buf, const char *language) +{ + if (!language || !language[0] || strcmp(language, "help") == 0) { + mp_msg(MSGT_SUBREADER, MSGL_ERR, "libguess needs a language: " + "japanese taiwanese chinese korean russian arabic turkish " + "greek hebrew polish baltic\n"); + return NULL; + } + + return libguess_determine_encoding(buf.start, buf.len, language); +} +#endif + +// Runs charset auto-detection on the input buffer, and returns the result. +// If auto-detection fails, NULL is returned. +// If user_cp doesn't refer to any known auto-detection (for example because +// it's a real iconv codepage), user_cp is returned without even looking at +// the buf data. +const char *mp_charset_guess(bstr buf, const char *user_cp, int flags) +{ + if (!mp_charset_requires_guess(user_cp)) + return user_cp; + + // Do our own UTF-8 detection, because at least ENCA seems to get it + // wrong sometimes (suggested by divVerent). + int r = bstr_validate_utf8(buf); + if (r >= 0 || (r > -8 && (flags & MP_ICONV_ALLOW_CUTOFF))) + return "UTF-8"; + + bstr params[3] = {{0}}; + split_colon(user_cp, 3, params); + + bstr type = params[0]; + char lang[100]; + snprintf(lang, sizeof(lang), "%.*s", BSTR_P(params[1])); + const char *fallback = params[2].start; // last item, already 0-terminated + + const char *res = NULL; + +#if HAVE_ENCA + if (bstrcasecmp0(type, "enca") == 0) + res = enca_guess(buf, lang); +#endif +#if HAVE_LIBGUESS + if (bstrcasecmp0(type, "guess") == 0) + res = libguess_guess(buf, lang); +#endif + if (bstrcasecmp0(type, "utf8") == 0 || bstrcasecmp0(type, "utf-8") == 0) { + if (!fallback) + fallback = params[1].start; // must be already 0-terminated + } + + if (res) { + mp_msg(MSGT_SUBREADER, MSGL_DBG2, "%.*s detected charset: '%s'\n", + BSTR_P(type), res); + } else { + res = fallback; + mp_msg(MSGT_SUBREADER, MSGL_DBG2, + "Detection with %.*s failed: fallback to %s\n", + BSTR_P(type), res && res[0] ? res : "broken UTF-8/Latin1"); + } + + if (!res && !(flags & MP_STRICT_UTF8)) + res = "UTF-8-BROKEN"; + + return res; +} + +// Convert the data in buf to UTF-8. The charset argument can be an iconv +// codepage, a value returned by mp_charset_conv_guess(), or a special value +// that triggers autodetection of the charset (e.g. using ENCA). +// The auto-detection is the only difference to mp_iconv_to_utf8(). +// buf: same as mp_iconv_to_utf8() +// user_cp: iconv codepage, special value, NULL +// flags: same as mp_iconv_to_utf8() +// returns: same as mp_iconv_to_utf8() +bstr mp_charset_guess_and_conv_to_utf8(bstr buf, const char *user_cp, int flags) +{ + return mp_iconv_to_utf8(buf, mp_charset_guess(buf, user_cp, flags), flags); +} + +// Use iconv to convert buf to UTF-8. +// Returns buf.start==NULL on error. Returns buf if cp is NULL, or if there is +// obviously no conversion required (e.g. if cp is "UTF-8"). +// Returns a newly allocated buffer if conversion is done and succeeds. The +// buffer will be terminated with 0 for convenience (the terminating 0 is not +// included in the returned length). +// Free the returned buffer with talloc_free(). +// buf: input data +// cp: iconv codepage (or NULL) +// flags: combination of MP_ICONV_* flags +// returns: buf (no conversion), .start==NULL (error), or allocated buffer +bstr mp_iconv_to_utf8(bstr buf, const char *cp, int flags) +{ +#if HAVE_ICONV + if (!cp || !cp[0] || mp_charset_is_utf8(cp)) + return buf; + + if (strcasecmp(cp, "ASCII") == 0) + return buf; + + if (strcasecmp(cp, "UTF-8-BROKEN") == 0) + return bstr_sanitize_utf8_latin1(NULL, buf); + + iconv_t icdsc; + if ((icdsc = iconv_open("UTF-8", cp)) == (iconv_t) (-1)) { + if (flags & MP_ICONV_VERBOSE) + mp_msg(MSGT_SUBREADER, MSGL_ERR, + "Error opening iconv with codepage '%s'\n", cp); + goto failure; + } + + size_t size = buf.len; + size_t osize = size; + size_t ileft = size; + size_t oleft = size - 1; + + char *outbuf = talloc_size(NULL, osize); + char *ip = buf.start; + char *op = outbuf; + + while (1) { + int clear = 0; + size_t rc; + if (ileft) + rc = iconv(icdsc, &ip, &ileft, &op, &oleft); + else { + clear = 1; // clear the conversion state and leave + rc = iconv(icdsc, NULL, NULL, &op, &oleft); + } + if (rc == (size_t) (-1)) { + if (errno == E2BIG) { + size_t offset = op - outbuf; + outbuf = talloc_realloc_size(NULL, outbuf, osize + size); + op = outbuf + offset; + osize += size; + oleft += size; + } else { + if (errno == EINVAL && (flags & MP_ICONV_ALLOW_CUTOFF)) { + // This is intended for cases where the input buffer is cut + // at a random byte position. If this happens in the middle + // of the buffer, it should still be an error. We say it's + // fine if the error is within 10 bytes of the end. + if (ileft <= 10) + break; + } + if (flags & MP_ICONV_VERBOSE) { + mp_msg(MSGT_SUBREADER, MSGL_ERR, + "Error recoding text with codepage '%s'\n", cp); + } + talloc_free(outbuf); + iconv_close(icdsc); + goto failure; + } + } else if (clear) + break; + } + + iconv_close(icdsc); + + outbuf[osize - oleft - 1] = 0; + return (bstr){outbuf, osize - oleft - 1}; +#endif + +failure: + return (bstr){0}; +} -- cgit v1.2.3