From fb78930d5ccf439719bc42e178aa7dc9a806d969 Mon Sep 17 00:00:00 2001
From: Alexey Yakovenko
Date: Wed, 30 Apr 2014 14:11:34 +0200
Subject: junklib: added shift-jis detection

---
 junklib.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

(limited to 'junklib.c')

diff --git a/junklib.c b/junklib.c
index 1c57ba6e..e7f8a910 100644
--- a/junklib.c
+++ b/junklib.c
@@ -710,6 +710,51 @@ can_be_chinese (const uint8_t *str, int sz) {
     return 0;
 }
 
+// Heuristic check for whether a byte string could be Shift-JIS text.
+//
+// The buffer is scanned one byte at a time (character boundaries are not
+// tracked) for an adjacent pair shaped like a double-byte character: a lead
+// byte in 0x81-0x84, 0x87-0x9f or 0xe0-0xef followed by a byte in 0x40-0xfc.
+// Only when such a pair exists is the whole buffer passed to junk_iconv
+// ("shift-jis" -> UTF8_STR) as confirmation; the converted text is discarded.
+//
+// str:  bytes to inspect
+// size: number of bytes in str
+//
+// Returns 1 if a candidate pair was found and junk_iconv returned a
+// non-negative value, 0 otherwise (including when size < 2).
+static int
+can_be_shift_jis (const unsigned char *str, int size) {
+    if (size < 2) {
+        return 0;
+    }
+
+    int found = 0;
+    int i;
+    for (i = 0; i < size - 1; i++) {
+        unsigned char lead = str[i];
+        unsigned char trail = str[i + 1];
+        int lead_ok = (lead >= 0x81 && lead <= 0x84)
+            || (lead >= 0x87 && lead <= 0x9f)
+            || (lead >= 0xe0 && lead <= 0xef);
+        if (lead_ok && trail >= 0x40 && trail <= 0xfc) {
+            found = 1;
+            break;
+        }
+    }
+    if (!found) {
+        return 0;
+    }
+
+    // Declared only after size >= 2 is established, so the variable-length
+    // array never has a zero or negative length, and no stack is reserved
+    // for inputs rejected above.
+    // NOTE(review): stack use is 4*size bytes; presumably callers pass short
+    // tag strings -- confirm, or switch to a bounded/heap buffer.
+    unsigned char out[size*4];
+    return junk_iconv (str, size, out, sizeof (out), "shift-jis", UTF8_STR) >= 0;
+}
+
 static char *
 convstr_id3v2 (int version, uint8_t encoding, const unsigned char* str, int sz) {
     char out[2048] = "";
@@ -3528,6 +3573,10 @@ junk_detect_charset (const char *s) {
     if (u8_valid (s, len, NULL)) {
         return NULL; // means no recoding required
     }
+    // try shift-jis
+    if (can_be_shift_jis (s, len)) {
+        return "shift-jis";
+    }
     // hack to add cp936 support
     if (can_be_chinese (s, len)) {
         return "cp936";
-- 
cgit v1.2.3