aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar Hoa V. DINH <dinh.viet.hoa@gmail.com>2014-01-02 22:21:45 -0800
committerGravatar Hoa V. DINH <dinh.viet.hoa@gmail.com>2014-01-02 22:21:45 -0800
commit77a673103c05fcee0407bd5afce12e8eb8b2aa72 (patch)
tree95f0a016fee9f2ba67e081dea735dd6419f78a45
parenta7b132b697def28c05760ebac61dd59e229b1df1 (diff)
Fixed charset detection and improved collection of valid charsets (fixed #532)
-rw-r--r--src/core/basetypes/MCData.cc41
1 files changed, 16 insertions, 25 deletions
diff --git a/src/core/basetypes/MCData.cc b/src/core/basetypes/MCData.cc
index 964f7b11..4cd38839 100644
--- a/src/core/basetypes/MCData.cc
+++ b/src/core/basetypes/MCData.cc
@@ -188,32 +188,23 @@ static bool isHintCharsetValid(String * hintCharset)
pthread_mutex_lock(&lock);
if (knownCharset == NULL) {
knownCharset = new Set();
- knownCharset->addObject(MCSTR("utf-8"));
- knownCharset->addObject(MCSTR("utf-16be"));
- knownCharset->addObject(MCSTR("utf-16le"));
- knownCharset->addObject(MCSTR("utf-32be"));
- knownCharset->addObject(MCSTR("utf-32le"));
- knownCharset->addObject(MCSTR("shift_jis"));
- knownCharset->addObject(MCSTR("iso-2022-jp"));
- knownCharset->addObject(MCSTR("iso-2022-jp-2"));
- knownCharset->addObject(MCSTR("iso-2022-cn"));
- knownCharset->addObject(MCSTR("iso-2022-kr"));
+ UCharsetDetector * detector;
+ UEnumeration * iterator;
+ UErrorCode err = U_ZERO_ERROR;
- knownCharset->addObject(MCSTR("gb18030"));
- knownCharset->addObject(MCSTR("big5"));
- knownCharset->addObject(MCSTR("euc-jp"));
- knownCharset->addObject(MCSTR("euc-kr"));
- knownCharset->addObject(MCSTR("iso-8859-1"));
- knownCharset->addObject(MCSTR("iso-8859-2"));
- knownCharset->addObject(MCSTR("iso-8859-5"));
- knownCharset->addObject(MCSTR("iso-8859-6"));
- knownCharset->addObject(MCSTR("iso-8859-7"));
- knownCharset->addObject(MCSTR("iso-8859-8"));
- knownCharset->addObject(MCSTR("iso-8859-9"));
- knownCharset->addObject(MCSTR("windows-1251"));
- knownCharset->addObject(MCSTR("windows-1256"));
- knownCharset->addObject(MCSTR("koi8-r"));
+ detector = ucsdet_open(&err);
+ iterator = ucsdet_getAllDetectableCharsets(detector, &err);
+ while (1) {
+ const char * validCharset = uenum_next(iterator, NULL, &err);
+ if (err != U_ZERO_ERROR)
+ break;
+ if (validCharset == NULL)
+ break;
+ knownCharset->addObject(String::stringWithUTF8Characters(validCharset));
+ }
+ uenum_close(iterator);
+ ucsdet_close(detector);
}
pthread_mutex_unlock(&lock);
@@ -246,7 +237,7 @@ String * Data::stringWithDetectedCharset(String * hintCharset, bool isHTML)
String * result;
String * charset;
- if (isHintCharsetValid(hintCharset)) {
+ if (!isHintCharsetValid(hintCharset)) {
hintCharset = NULL;
}