diff options
Diffstat (limited to 'src/core/basetypes/icu-ucsdet')
90 files changed, 39126 insertions, 0 deletions
diff --git a/src/core/basetypes/icu-ucsdet/cmemory.c b/src/core/basetypes/icu-ucsdet/cmemory.c new file mode 100644 index 00000000..cd3ccac6 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/cmemory.c @@ -0,0 +1,183 @@ +/* +****************************************************************************** +* +* Copyright (C) 2002-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* +* File cmemory.c ICU Heap allocation. +* All ICU heap allocation, both for C and C++ new of ICU +* class types, comes through these functions. +* +* If you have a need to replace ICU allocation, this is the +* place to do it. +* +* Note that uprv_malloc(0) returns a non-NULL pointer, and +* that a subsequent free of that pointer value is a NOP. +* +****************************************************************************** +*/ +#include "unicode/uclean.h" +#include "cmemory.h" +#include "putilimp.h" +#include "uassert.h" +#include <stdlib.h> + +/* uprv_malloc(0) returns a pointer to this read-only data. */ +static const int32_t zeroMem[] = {0, 0, 0, 0, 0, 0}; + +/* Function Pointers for user-supplied heap functions */ +static const void *pContext; +static UMemAllocFn *pAlloc; +static UMemReallocFn *pRealloc; +static UMemFreeFn *pFree; + +/* Flag indicating whether any heap allocations have happened. + * Used to prevent changing out the heap functions after allocations have been made */ +static UBool gHeapInUse; + +#if U_DEBUG && defined(UPRV_MALLOC_COUNT) +#include <stdio.h> +static int n=0; +static long b=0; +#endif + +#if U_DEBUG + +static char gValidMemorySink = 0; + +U_CAPI void uprv_checkValidMemory(const void *p, size_t n) { + /* + * Access the memory to ensure that it's all valid. + * Load and save a computed value to try to ensure that the compiler + * does not throw away the whole loop. + * A thread analyzer might complain about un-mutexed access to gValidMemorySink + * which is true but harmless because no one ever uses the value in gValidMemorySink. + */ + const char *s = (const char *)p; + char c = gValidMemorySink; + size_t i; + U_ASSERT(p != NULL); + for(i = 0; i < n; ++i) { + c ^= s[i]; + } + gValidMemorySink = c; +} + +#endif /* U_DEBUG */ + +U_CAPI void * U_EXPORT2 +uprv_malloc(size_t s) { +#if U_DEBUG && defined(UPRV_MALLOC_COUNT) +#if 1 + putchar('>'); + fflush(stdout); +#else + fprintf(stderr,"MALLOC\t#%d\t%ul bytes\t%ul total\n", ++n,s,(b+=s)); fflush(stderr); +#endif +#endif + if (s > 0) { + gHeapInUse = TRUE; + if (pAlloc) { + return (*pAlloc)(pContext, s); + } else { + return uprv_default_malloc(s); + } + } else { + return (void *)zeroMem; + } +} + +U_CAPI void * U_EXPORT2 +uprv_realloc(void * buffer, size_t size) { +#if U_DEBUG && defined(UPRV_MALLOC_COUNT) + putchar('~'); + fflush(stdout); +#endif + if (buffer == zeroMem) { + return uprv_malloc(size); + } else if (size == 0) { + if (pFree) { + (*pFree)(pContext, buffer); + } else { + uprv_default_free(buffer); + } + return (void *)zeroMem; + } else { + gHeapInUse = TRUE; + if (pRealloc) { + return (*pRealloc)(pContext, buffer, size); + } else { + return uprv_default_realloc(buffer, size); + } + } +} + +U_CAPI void U_EXPORT2 +uprv_free(void *buffer) { +#if U_DEBUG && defined(UPRV_MALLOC_COUNT) + putchar('<'); + fflush(stdout); +#endif + if (buffer != zeroMem) { + if (pFree) { + (*pFree)(pContext, buffer); + } else { + uprv_default_free(buffer); + } + } +} + +U_CAPI void * U_EXPORT2 +uprv_calloc(size_t num, size_t size) { + void *mem = NULL; + size *= num; + mem = uprv_malloc(size); + if (mem) { + uprv_memset(mem, 0, size); + } + return mem; +} + +U_CAPI void U_EXPORT2 +u_setMemoryFunctions(const void *context, UMemAllocFn *a, UMemReallocFn *r, UMemFreeFn *f, UErrorCode *status) +{ + if (U_FAILURE(*status)) { + return; + } + if (a==NULL || r==NULL || f==NULL) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + if (gHeapInUse) { + *status = U_INVALID_STATE_ERROR; + return; + } + pContext = context; + pAlloc = a; + pRealloc = r; + pFree = f; +} + + +U_CFUNC UBool cmemory_cleanup(void) { + pContext = NULL; + pAlloc = NULL; + pRealloc = NULL; + pFree = NULL; + gHeapInUse = FALSE; + return TRUE; +} + + +/* + * gHeapInUse + * Return True if ICU has allocated any memory. + * Used by u_SetMutexFunctions() and similar to verify that ICU has not + * been used, that it is in a pristine initial state. + */ +U_CFUNC UBool cmemory_inUse() { + return gHeapInUse; +} + diff --git a/src/core/basetypes/icu-ucsdet/csdetect.cpp b/src/core/basetypes/icu-ucsdet/csdetect.cpp new file mode 100644 index 00000000..3efbd492 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/csdetect.cpp @@ -0,0 +1,485 @@ +/* + ********************************************************************** + * Copyright (C) 2005-2013, International Business Machines + * Corporation and others. All Rights Reserved. + ********************************************************************** + */ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_CONVERSION + +#include "unicode/ucsdet.h" + +#include "csdetect.h" +#include "csmatch.h" +#include "uenumimp.h" + +#include "cmemory.h" +#include "cstring.h" +#include "umutex.h" +#include "ucln_in.h" +#include "uarrsort.h" +#include "inputext.h" +#include "csrsbcs.h" +#include "csrmbcs.h" +#include "csrutf8.h" +#include "csrucode.h" +#include "csr2022.h" + +#define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) + +#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type)) +#define DELETE_ARRAY(array) uprv_free((void *) (array)) + +U_NAMESPACE_BEGIN + +struct CSRecognizerInfo : public UMemory { + CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled) + : recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {}; + + ~CSRecognizerInfo() {delete recognizer;}; + + CharsetRecognizer *recognizer; + UBool isDefaultEnabled; +}; + +U_NAMESPACE_END + +static icu::CSRecognizerInfo **fCSRecognizers = NULL; +static icu::UInitOnce gCSRecognizersInitOnce; +static int32_t fCSRecognizers_size = 0; + +U_CDECL_BEGIN +static UBool U_CALLCONV csdet_cleanup(void) +{ + U_NAMESPACE_USE + if (fCSRecognizers != NULL) { + for(int32_t r = 0; r < fCSRecognizers_size; r += 1) { + delete fCSRecognizers[r]; + fCSRecognizers[r] = NULL; + } + + DELETE_ARRAY(fCSRecognizers); + fCSRecognizers = NULL; + fCSRecognizers_size = 0; + } + gCSRecognizersInitOnce.reset(); + + return TRUE; +} + +static int32_t U_CALLCONV +charsetMatchComparator(const void * /*context*/, const void *left, const void *right) +{ + U_NAMESPACE_USE + + const CharsetMatch **csm_l = (const CharsetMatch **) left; + const CharsetMatch **csm_r = (const CharsetMatch **) right; + + // NOTE: compare is backwards to sort from highest to lowest. + return (*csm_r)->getConfidence() - (*csm_l)->getConfidence(); +} + +static void U_CALLCONV initRecognizers(UErrorCode &status) { + U_NAMESPACE_USE + ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup); + CSRecognizerInfo *tempArray[] = { + new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE), + + new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE), + new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE), + new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE), + new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE), + + new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE), + new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE), + new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE), + new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE), + new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE), + new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE), + new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE), + new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE), + new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE), + new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE), + new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE), + new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE), + new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE), + new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE), + new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE), + new CSRecognizerInfo(new CharsetRecog_big5(), TRUE), + + new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE), + new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE), + new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE), + + new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), FALSE), + new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE), + new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE), + new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE) + }; + int32_t rCount = ARRAY_SIZE(tempArray); + + fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount); + + if (fCSRecognizers == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + } + else { + fCSRecognizers_size = rCount; + for (int32_t r = 0; r < rCount; r += 1) { + fCSRecognizers[r] = tempArray[r]; + if (fCSRecognizers[r] == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + } + } + } +} + +U_CDECL_END + +U_NAMESPACE_BEGIN + +void CharsetDetector::setRecognizers(UErrorCode &status) +{ + umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status); +} + +CharsetDetector::CharsetDetector(UErrorCode &status) + : textIn(new InputText(status)), resultArray(NULL), + resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE), + fEnabledRecognizers(NULL) +{ + if (U_FAILURE(status)) { + return; + } + + setRecognizers(status); + + if (U_FAILURE(status)) { + return; + } + + resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size); + + if (resultArray == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + + for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { + resultArray[i] = new CharsetMatch(); + + if (resultArray[i] == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + break; + } + } +} + +CharsetDetector::~CharsetDetector() +{ + delete textIn; + + for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { + delete resultArray[i]; + } + + uprv_free(resultArray); + + if (fEnabledRecognizers) { + uprv_free(fEnabledRecognizers); + } +} + +void CharsetDetector::setText(const char *in, int32_t len) +{ + textIn->setText(in, len); + fFreshTextSet = TRUE; +} + +UBool CharsetDetector::setStripTagsFlag(UBool flag) +{ + UBool temp = fStripTags; + fStripTags = flag; + fFreshTextSet = TRUE; + return temp; +} + +UBool CharsetDetector::getStripTagsFlag() const +{ + return fStripTags; +} + +void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const +{ + textIn->setDeclaredEncoding(encoding,len); +} + +int32_t CharsetDetector::getDetectableCount() +{ + UErrorCode status = U_ZERO_ERROR; + + setRecognizers(status); + + return fCSRecognizers_size; +} + +const CharsetMatch *CharsetDetector::detect(UErrorCode &status) +{ + int32_t maxMatchesFound = 0; + + detectAll(maxMatchesFound, status); + + if(maxMatchesFound > 0) { + return resultArray[0]; + } else { + return NULL; + } +} + +const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status) +{ + if(!textIn->isSet()) { + status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set + + return NULL; + } else if (fFreshTextSet) { + CharsetRecognizer *csr; + int32_t i; + + textIn->MungeInput(fStripTags); + + // Iterate over all possible charsets, remember all that + // give a match quality > 0. + resultCount = 0; + for (i = 0; i < fCSRecognizers_size; i += 1) { + csr = fCSRecognizers[i]->recognizer; + if (csr->match(textIn, resultArray[resultCount])) { + resultCount++; + } + } + + if (resultCount > 1) { + uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status); + } + fFreshTextSet = FALSE; + } + + maxMatchesFound = resultCount; + + return resultArray; +} + +void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status) +{ + if (U_FAILURE(status)) { + return; + } + + int32_t modIdx = -1; + UBool isDefaultVal = FALSE; + for (int32_t i = 0; i < fCSRecognizers_size; i++) { + CSRecognizerInfo *csrinfo = fCSRecognizers[i]; + if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) { + modIdx = i; + isDefaultVal = (csrinfo->isDefaultEnabled == enabled); + break; + } + } + if (modIdx < 0) { + // No matching encoding found + status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + + if (fEnabledRecognizers == NULL && !isDefaultVal) { + // Create an array storing the non default setting + fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size); + if (fEnabledRecognizers == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + // Initialize the array with default info + for (int32_t i = 0; i < fCSRecognizers_size; i++) { + fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled; + } + } + + if (fEnabledRecognizers != NULL) { + fEnabledRecognizers[modIdx] = enabled; + } +} + +/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const +{ + if( index > fCSRecognizers_size-1 || index < 0) { + status = U_INDEX_OUTOFBOUNDS_ERROR; + + return 0; + } else { + return fCSRecognizers[index]->getName(); + } +}*/ + +U_NAMESPACE_END + +U_CDECL_BEGIN +typedef struct { + int32_t currIndex; + UBool all; + UBool *enabledRecognizers; +} Context; + + + +static void U_CALLCONV +enumClose(UEnumeration *en) { + if(en->context != NULL) { + DELETE_ARRAY(en->context); + } + + DELETE_ARRAY(en); +} + +static int32_t U_CALLCONV +enumCount(UEnumeration *en, UErrorCode *) { + if (((Context *)en->context)->all) { + // ucsdet_getAllDetectableCharsets, all charset detector names + return fCSRecognizers_size; + } + + // Otherwise, ucsdet_getDetectableCharsets - only enabled ones + int32_t count = 0; + UBool *enabledArray = ((Context *)en->context)->enabledRecognizers; + if (enabledArray != NULL) { + // custom set + for (int32_t i = 0; i < fCSRecognizers_size; i++) { + if (enabledArray[i]) { + count++; + } + } + } else { + // default set + for (int32_t i = 0; i < fCSRecognizers_size; i++) { + if (fCSRecognizers[i]->isDefaultEnabled) { + count++; + } + } + } + return count; +} + +static const char* U_CALLCONV +enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) { + const char *currName = NULL; + + if (((Context *)en->context)->currIndex < fCSRecognizers_size) { + if (((Context *)en->context)->all) { + // ucsdet_getAllDetectableCharsets, all charset detector names + currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); + ((Context *)en->context)->currIndex++; + } else { + // ucsdet_getDetectableCharsets + UBool *enabledArray = ((Context *)en->context)->enabledRecognizers; + if (enabledArray != NULL) { + // custome set + while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) { + if (enabledArray[((Context *)en->context)->currIndex]) { + currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); + } + ((Context *)en->context)->currIndex++; + } + } else { + // default set + while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) { + if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) { + currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); + } + ((Context *)en->context)->currIndex++; + } + } + } + } + + if(resultLength != NULL) { + *resultLength = currName == NULL ? 0 : (int32_t)uprv_strlen(currName); + } + + return currName; +} + + +static void U_CALLCONV +enumReset(UEnumeration *en, UErrorCode *) { + ((Context *)en->context)->currIndex = 0; +} + +static const UEnumeration gCSDetEnumeration = { + NULL, + NULL, + enumClose, + enumCount, + uenum_unextDefault, + enumNext, + enumReset +}; + +U_CDECL_END + +U_NAMESPACE_BEGIN + +UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status) +{ + + /* Initialize recognized charsets. */ + setRecognizers(status); + + if(U_FAILURE(status)) { + return 0; + } + + UEnumeration *en = NEW_ARRAY(UEnumeration, 1); + if (en == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return 0; + } + memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); + en->context = (void*)NEW_ARRAY(Context, 1); + if (en->context == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + DELETE_ARRAY(en); + return 0; + } + uprv_memset(en->context, 0, sizeof(Context)); + ((Context*)en->context)->all = TRUE; + return en; +} + +UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const +{ + if(U_FAILURE(status)) { + return 0; + } + + UEnumeration *en = NEW_ARRAY(UEnumeration, 1); + if (en == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return 0; + } + memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); + en->context = (void*)NEW_ARRAY(Context, 1); + if (en->context == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + DELETE_ARRAY(en); + return 0; + } + uprv_memset(en->context, 0, sizeof(Context)); + ((Context*)en->context)->all = FALSE; + ((Context*)en->context)->enabledRecognizers = fEnabledRecognizers; + return en; +} + +U_NAMESPACE_END + +#endif diff --git a/src/core/basetypes/icu-ucsdet/csmatch.cpp b/src/core/basetypes/icu-ucsdet/csmatch.cpp new file mode 100644 index 00000000..7233e3c2 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/csmatch.cpp @@ -0,0 +1,73 @@ +/* + ********************************************************************** + * Copyright (C) 2005-2012, International Business Machines + * Corporation and others. All Rights Reserved. + ********************************************************************** + */ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_CONVERSION +#include "unicode/unistr.h" +#include "unicode/ucnv.h" + +#include "csmatch.h" + +#include "csrecog.h" +#include "inputext.h" + +U_NAMESPACE_BEGIN + +CharsetMatch::CharsetMatch() + : textIn(NULL), confidence(0), fCharsetName(NULL), fLang(NULL) +{ + // nothing else to do. +} + +void CharsetMatch::set(InputText *input, const CharsetRecognizer *cr, int32_t conf, + const char *csName, const char *lang) +{ + textIn = input; + confidence = conf; + fCharsetName = csName; + fLang = lang; + if (cr != NULL) { + if (fCharsetName == NULL) { + fCharsetName = cr->getName(); + } + if (fLang == NULL) { + fLang = cr->getLanguage(); + } + } +} + +const char* CharsetMatch::getName()const +{ + return fCharsetName; +} + +const char* CharsetMatch::getLanguage()const +{ + return fLang; +} + +int32_t CharsetMatch::getConfidence()const +{ + return confidence; +} + +/* +int32_t CharsetMatch::getUChars(UChar *buf, int32_t cap, UErrorCode *status) const +{ + UConverter *conv = ucnv_open(getName(), status); + int32_t result = ucnv_toUChars(conv, buf, cap, (const char *) textIn->fRawInput, textIn->fRawLength, status); + + ucnv_close(conv); + + return result; +} +*/ + +U_NAMESPACE_END + +#endif diff --git a/src/core/basetypes/icu-ucsdet/csr2022.cpp b/src/core/basetypes/icu-ucsdet/csr2022.cpp new file mode 100644 index 00000000..3db0bc9f --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/csr2022.cpp @@ -0,0 +1,190 @@ +/* + ********************************************************************** + * Copyright (C) 2005-2012, International Business Machines + * Corporation and others. All Rights Reserved. + ********************************************************************** + */ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_CONVERSION + +#include "cstring.h" + +#include "csr2022.h" +#include "csmatch.h" + +U_NAMESPACE_BEGIN + +#define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) + +/** + * Matching function shared among the 2022 detectors JP, CN and KR + * Counts up the number of legal and unrecognized escape sequences in + * the sample of text, and computes a score based on the total number & + * the proportion that fit the encoding. + * + * + * @param text the byte buffer containing text to analyse + * @param textLen the size of the text in the byte. + * @param escapeSequences the byte escape sequences to test for. + * @return match quality, in the range of 0-100. + */ +int32_t CharsetRecog_2022::match_2022(const uint8_t *text, int32_t textLen, const uint8_t escapeSequences[][5], int32_t escapeSequences_length) const +{ + int32_t i, j; + int32_t escN; + int32_t hits = 0; + int32_t misses = 0; + int32_t shifts = 0; + int32_t quality; + + i = 0; + while(i < textLen) { + if(text[i] == 0x1B) { + escN = 0; + while(escN < escapeSequences_length) { + const uint8_t *seq = escapeSequences[escN]; + int32_t seq_length = (int32_t)uprv_strlen((const char *) seq); + + if (textLen-i >= seq_length) { + j = 1; + while(j < seq_length) { + if(seq[j] != text[i+j]) { + goto checkEscapes; + } + + j += 1; + } + + hits += 1; + i += seq_length-1; + goto scanInput; + } + // else we ran out of string to compare this time. +checkEscapes: + escN += 1; + } + + misses += 1; + } + + if( text[i]== 0x0e || text[i] == 0x0f){ + shifts += 1; + } + +scanInput: + i += 1; + } + + if (hits == 0) { + return 0; + } + + // + // Initial quality is based on relative proportion of recongized vs. + // unrecognized escape sequences. + // All good: quality = 100; + // half or less good: quality = 0; + // linear inbetween. + quality = (100*hits - 100*misses) / (hits + misses); + + // Back off quality if there were too few escape sequences seen. + // Include shifts in this computation, so that KR does not get penalized + // for having only a single Escape sequence, but many shifts. + if (hits+shifts < 5) { + quality -= (5-(hits+shifts))*10; + } + + if (quality < 0) { + quality = 0; + } + + return quality; +} + + +static const uint8_t escapeSequences_2022JP[][5] = { + {0x1b, 0x24, 0x28, 0x43, 0x00}, // KS X 1001:1992 + {0x1b, 0x24, 0x28, 0x44, 0x00}, // JIS X 212-1990 + {0x1b, 0x24, 0x40, 0x00, 0x00}, // JIS C 6226-1978 + {0x1b, 0x24, 0x41, 0x00, 0x00}, // GB 2312-80 + {0x1b, 0x24, 0x42, 0x00, 0x00}, // JIS X 208-1983 + {0x1b, 0x26, 0x40, 0x00, 0x00}, // JIS X 208 1990, 1997 + {0x1b, 0x28, 0x42, 0x00, 0x00}, // ASCII + {0x1b, 0x28, 0x48, 0x00, 0x00}, // JIS-Roman + {0x1b, 0x28, 0x49, 0x00, 0x00}, // Half-width katakana + {0x1b, 0x28, 0x4a, 0x00, 0x00}, // JIS-Roman + {0x1b, 0x2e, 0x41, 0x00, 0x00}, // ISO 8859-1 + {0x1b, 0x2e, 0x46, 0x00, 0x00} // ISO 8859-7 +}; + +static const uint8_t escapeSequences_2022KR[][5] = { + {0x1b, 0x24, 0x29, 0x43, 0x00} +}; + +static const uint8_t escapeSequences_2022CN[][5] = { + {0x1b, 0x24, 0x29, 0x41, 0x00}, // GB 2312-80 + {0x1b, 0x24, 0x29, 0x47, 0x00}, // CNS 11643-1992 Plane 1 + {0x1b, 0x24, 0x2A, 0x48, 0x00}, // CNS 11643-1992 Plane 2 + {0x1b, 0x24, 0x29, 0x45, 0x00}, // ISO-IR-165 + {0x1b, 0x24, 0x2B, 0x49, 0x00}, // CNS 11643-1992 Plane 3 + {0x1b, 0x24, 0x2B, 0x4A, 0x00}, // CNS 11643-1992 Plane 4 + {0x1b, 0x24, 0x2B, 0x4B, 0x00}, // CNS 11643-1992 Plane 5 + {0x1b, 0x24, 0x2B, 0x4C, 0x00}, // CNS 11643-1992 Plane 6 + {0x1b, 0x24, 0x2B, 0x4D, 0x00}, // CNS 11643-1992 Plane 7 + {0x1b, 0x4e, 0x00, 0x00, 0x00}, // SS2 + {0x1b, 0x4f, 0x00, 0x00, 0x00}, // SS3 +}; + +CharsetRecog_2022JP::~CharsetRecog_2022JP() {} + +const char *CharsetRecog_2022JP::getName() const { + return "ISO-2022-JP"; +} + +UBool CharsetRecog_2022JP::match(InputText *textIn, CharsetMatch *results) const { + int32_t confidence = match_2022(textIn->fInputBytes, + textIn->fInputLen, + escapeSequences_2022JP, + ARRAY_SIZE(escapeSequences_2022JP)); + results->set(textIn, this, confidence); + return (confidence > 0); +} + +CharsetRecog_2022KR::~CharsetRecog_2022KR() {} + +const char *CharsetRecog_2022KR::getName() const { + return "ISO-2022-KR"; +} + +UBool CharsetRecog_2022KR::match(InputText *textIn, CharsetMatch *results) const { + int32_t confidence = match_2022(textIn->fInputBytes, + textIn->fInputLen, + escapeSequences_2022KR, + ARRAY_SIZE(escapeSequences_2022KR)); + results->set(textIn, this, confidence); + return (confidence > 0); +} + +CharsetRecog_2022CN::~CharsetRecog_2022CN() {} + +const char *CharsetRecog_2022CN::getName() const { + return "ISO-2022-CN"; +} + +UBool CharsetRecog_2022CN::match(InputText *textIn, CharsetMatch *results) const { + int32_t confidence = match_2022(textIn->fInputBytes, + textIn->fInputLen, + escapeSequences_2022CN, + ARRAY_SIZE(escapeSequences_2022CN)); + results->set(textIn, this, confidence); + return (confidence > 0); +} + +CharsetRecog_2022::~CharsetRecog_2022() { + // nothing to do +} + +U_NAMESPACE_END +#endif diff --git a/src/core/basetypes/icu-ucsdet/csrecog.cpp b/src/core/basetypes/icu-ucsdet/csrecog.cpp new file mode 100644 index 00000000..ba70b154 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/csrecog.cpp @@ -0,0 +1,28 @@ +/* + ********************************************************************** + * Copyright (C) 2005-2006, International Business Machines + * Corporation and others. All Rights Reserved. + ********************************************************************** + */ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_CONVERSION + +#include "csrecog.h" + +U_NAMESPACE_BEGIN + +CharsetRecognizer::~CharsetRecognizer() +{ + // nothing to do. +} + +const char *CharsetRecognizer::getLanguage() const +{ + return ""; +} + +U_NAMESPACE_END + +#endif diff --git a/src/core/basetypes/icu-ucsdet/csrmbcs.cpp b/src/core/basetypes/icu-ucsdet/csrmbcs.cpp new file mode 100644 index 00000000..fef2e869 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/csrmbcs.cpp @@ -0,0 +1,529 @@ +/* + ********************************************************************** + * Copyright (C) 2005-2012, International Business Machines + * Corporation and others. All Rights Reserved. + ********************************************************************** + */ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_CONVERSION + +#include "csmatch.h" +#include "csrmbcs.h" + +#include <math.h> + +U_NAMESPACE_BEGIN + +#define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) + +#define min(x,y) (((x)<(y))?(x):(y)) + +static const uint16_t commonChars_sjis [] = { +// TODO: This set of data comes from the character frequency- +// of-occurence analysis tool. The data needs to be moved +// into a resource and loaded from there. +0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0, +0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5, +0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc, +0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341, +0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389, +0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa}; + +static const uint16_t commonChars_euc_jp[] = { +// TODO: This set of data comes from the character frequency- +// of-occurence analysis tool. The data needs to be moved +// into a resource and loaded from there. +0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2, +0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3, +0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4, +0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de, +0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef, +0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af, +0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7, +0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1, +0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee, +0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1}; + +static const uint16_t commonChars_euc_kr[] = { +// TODO: This set of data comes from the character frequency- +// of-occurence analysis tool. The data needs to be moved +// into a resource and loaded from there. +0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc, +0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9, +0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce, +0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce, +0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba, +0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee, +0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7, +0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6, +0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6, +0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad}; + +static const uint16_t commonChars_big5[] = { +// TODO: This set of data comes from the character frequency- +// of-occurence analysis tool. The data needs to be moved +// into a resource and loaded from there. +0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446, +0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3, +0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548, +0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8, +0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da, +0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3, +0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59, +0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c, +0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44, +0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f}; + +static const uint16_t commonChars_gb_18030[] = { +// TODO: This set of data comes from the character frequency- +// of-occurence analysis tool. The data needs to be moved +// into a resource and loaded from there. +0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac, +0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4, +0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4, +0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6, +0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6, +0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7, +0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7, +0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5, +0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2, +0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0}; + +static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value) +{ + int32_t start = 0, end = len-1; + int32_t mid = (start+end)/2; + + while(start <= end) { + if(array[mid] == value) { + return mid; + } + + if(array[mid] < value){ + start = mid+1; + } else { + end = mid-1; + } + + mid = (start+end)/2; + } + + return -1; +} + +IteratedChar::IteratedChar() : +charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE) +{ + // nothing else to do. +} + +/*void IteratedChar::reset() +{ + charValue = 0; + index = -1; + nextIndex = 0; + error = FALSE; + done = FALSE; +}*/ + +int32_t IteratedChar::nextByte(InputText *det) +{ + if (nextIndex >= det->fRawLength) { + done = TRUE; + + return -1; + } + + return det->fRawInput[nextIndex++]; +} + +CharsetRecog_mbcs::~CharsetRecog_mbcs() +{ + // nothing to do. +} + +int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const { + int32_t singleByteCharCount = 0; + int32_t doubleByteCharCount = 0; + int32_t commonCharCount = 0; + int32_t badCharCount = 0; + int32_t totalCharCount = 0; + int32_t confidence = 0; + IteratedChar iter; + + while (nextChar(&iter, det)) { + totalCharCount++; + + if (iter.error) { + badCharCount++; + } else { + if (iter.charValue <= 0xFF) { + singleByteCharCount++; + } else { + doubleByteCharCount++; + + if (commonChars != 0) { + if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){ + commonCharCount += 1; + } + } + } + } + + + if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) { + // Bail out early if the byte data is not matching the encoding scheme. + // break detectBlock; + return confidence; + } + } + + if (doubleByteCharCount <= 10 && badCharCount == 0) { + // Not many multi-byte chars. + if (doubleByteCharCount == 0 && totalCharCount < 10) { + // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes. + // We don't have enough data to have any confidence. + // Statistical analysis of single byte non-ASCII charcters would probably help here. + confidence = 0; + } + else { + // ASCII or ISO file? It's probably not our encoding, + // but is not incompatible with our encoding, so don't give it a zero. + confidence = 10; + } + + return confidence; + } + + // + // No match if there are too many characters that don't fit the encoding scheme. + // (should we have zero tolerance for these?) + // + if (doubleByteCharCount < 20*badCharCount) { + confidence = 0; + + return confidence; + } + + if (commonChars == 0) { + // We have no statistics on frequently occuring characters. + // Assess confidence purely on having a reasonable number of + // multi-byte characters (the more the better) + confidence = 30 + doubleByteCharCount - 20*badCharCount; + + if (confidence > 100) { + confidence = 100; + } + } else { + // + // Frequency of occurence statistics exist. + // + + double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/ + double scaleFactor = 90.0 / maxVal; + confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0); + + confidence = min(confidence, 100); + } + + if (confidence < 0) { + confidence = 0; + } + + return confidence; +} + +CharsetRecog_sjis::~CharsetRecog_sjis() +{ + // nothing to do +} + +UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const { + it->index = it->nextIndex; + it->error = FALSE; + + int32_t firstByte = it->charValue = it->nextByte(det); + + if (firstByte < 0) { + return FALSE; + } + + if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) { + return TRUE; + } + + int32_t secondByte = it->nextByte(det); + if (secondByte >= 0) { + it->charValue = (firstByte << 8) | secondByte; + } + // else we'll handle the error later. + + if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) { + // Illegal second byte value. + it->error = TRUE; + } + + return TRUE; +} + +UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const { + int32_t confidence = match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis)); + results->set(det, this, confidence); + return (confidence > 0); +} + +const char *CharsetRecog_sjis::getName() const +{ + return "Shift_JIS"; +} + +const char *CharsetRecog_sjis::getLanguage() const +{ + return "ja"; +} + +CharsetRecog_euc::~CharsetRecog_euc() +{ + // nothing to do +} + +UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const { + int32_t firstByte = 0; + int32_t secondByte = 0; + int32_t thirdByte = 0; + + it->index = it->nextIndex; + it->error = FALSE; + firstByte = it->charValue = it->nextByte(det); + + if (firstByte < 0) { + // Ran off the end of the input data + return FALSE; + } + + if (firstByte <= 0x8D) { + // single byte char + return TRUE; + } + + secondByte = it->nextByte(det); + if (secondByte >= 0) { + it->charValue = (it->charValue << 8) | secondByte; + } + // else we'll handle the error later. + + if (firstByte >= 0xA1 && firstByte <= 0xFE) { + // Two byte Char + if (secondByte < 0xA1) { + it->error = TRUE; + } + + return TRUE; + } + + if (firstByte == 0x8E) { + // Code Set 2. + // In EUC-JP, total char size is 2 bytes, only one byte of actual char value. + // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value. + // We don't know which we've got. + // Treat it like EUC-JP. If the data really was EUC-TW, the following two + // bytes will look like a well formed 2 byte char. + if (secondByte < 0xA1) { + it->error = TRUE; + } + + return TRUE; + } + + if (firstByte == 0x8F) { + // Code set 3. + // Three byte total char size, two bytes of actual char value. + thirdByte = it->nextByte(det); + it->charValue = (it->charValue << 8) | thirdByte; + + if (thirdByte < 0xa1) { + // Bad second byte or ran off the end of the input data with a non-ASCII first byte. + it->error = TRUE; + } + } + + return TRUE; + +} + +CharsetRecog_euc_jp::~CharsetRecog_euc_jp() +{ + // nothing to do +} + +const char *CharsetRecog_euc_jp::getName() const +{ + return "EUC-JP"; +} + +const char *CharsetRecog_euc_jp::getLanguage() const +{ + return "ja"; +} + +UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const +{ + int32_t confidence = match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp)); + results->set(det, this, confidence); + return (confidence > 0); +} + +CharsetRecog_euc_kr::~CharsetRecog_euc_kr() +{ + // nothing to do +} + +const char *CharsetRecog_euc_kr::getName() const +{ + return "EUC-KR"; +} + +const char *CharsetRecog_euc_kr::getLanguage() const +{ + return "ko"; +} + +UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const +{ + int32_t confidence = match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr)); + results->set(det, this, confidence); + return (confidence > 0); +} + +CharsetRecog_big5::~CharsetRecog_big5() +{ + // nothing to do +} + +UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const +{ + int32_t firstByte; + + it->index = it->nextIndex; + it->error = FALSE; + firstByte = it->charValue = it->nextByte(det); + + if (firstByte < 0) { + return FALSE; + } + + if (firstByte <= 0x7F || firstByte == 0xFF) { + // single byte character. + return TRUE; + } + + int32_t secondByte = it->nextByte(det); + if (secondByte >= 0) { + it->charValue = (it->charValue << 8) | secondByte; + } + // else we'll handle the error later. + + if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) { + it->error = TRUE; + } + + return TRUE; +} + +const char *CharsetRecog_big5::getName() const +{ + return "Big5"; +} + +const char *CharsetRecog_big5::getLanguage() const +{ + return "zh"; +} + +UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const +{ + int32_t confidence = match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5)); + results->set(det, this, confidence); + return (confidence > 0); +} + +CharsetRecog_gb_18030::~CharsetRecog_gb_18030() +{ + // nothing to do +} + +UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const { + int32_t firstByte = 0; + int32_t secondByte = 0; + int32_t thirdByte = 0; + int32_t fourthByte = 0; + + it->index = it->nextIndex; + it->error = FALSE; + firstByte = it->charValue = it->nextByte(det); + + if (firstByte < 0) { + // Ran off the end of the input data + return FALSE; + } + + if (firstByte <= 0x80) { + // single byte char + return TRUE; + } + + secondByte = it->nextByte(det); + if (secondByte >= 0) { + it->charValue = (it->charValue << 8) | secondByte; + } + // else we'll handle the error later. + + if (firstByte >= 0x81 && firstByte <= 0xFE) { + // Two byte Char + if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) { + return TRUE; + } + + // Four byte char + if (secondByte >= 0x30 && secondByte <= 0x39) { + thirdByte = it->nextByte(det); + + if (thirdByte >= 0x81 && thirdByte <= 0xFE) { + fourthByte = it->nextByte(det); + + if (fourthByte >= 0x30 && fourthByte <= 0x39) { + it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte; + + return TRUE; + } + } + } + + // Something wasn't valid, or we ran out of data (-1). + it->error = TRUE; + } + + return TRUE; +} + +const char *CharsetRecog_gb_18030::getName() const +{ + return "GB18030"; +} + +const char *CharsetRecog_gb_18030::getLanguage() const +{ + return "zh"; +} + +UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const +{ + int32_t confidence = match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030)); + results->set(det, this, confidence); + return (confidence > 0); +} + +U_NAMESPACE_END +#endif diff --git a/src/core/basetypes/icu-ucsdet/csrsbcs.cpp b/src/core/basetypes/icu-ucsdet/csrsbcs.cpp new file mode 100644 index 00000000..d03367cc --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/csrsbcs.cpp @@ -0,0 +1,1259 @@ +/* + ********************************************************************** + * Copyright (C) 2005-2013, International Business Machines + * Corporation and others. All Rights Reserved. + ********************************************************************** + */ + +#include "unicode/utypes.h" + +#include "cmemory.h" + +#if !UCONFIG_NO_CONVERSION +#include "csrsbcs.h" +#include "csmatch.h" + +#define N_GRAM_SIZE 3 +#define N_GRAM_MASK 0xFFFFFF +#define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) + +U_NAMESPACE_BEGIN + +NGramParser::NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap) + : ngram(0), byteIndex(0) +{ + ngramList = theNgramList; + charMap = theCharMap; + + ngramCount = hitCount = 0; +} + +/* + * Binary search for value in table, which must have exactly 64 entries. + */ + +int32_t NGramParser::search(const int32_t *table, int32_t value) +{ + int32_t index = 0; + + if (table[index + 32] <= value) { + index += 32; + } + + if (table[index + 16] <= value) { + index += 16; + } + + if (table[index + 8] <= value) { + index += 8; + } + + if (table[index + 4] <= value) { + index += 4; + } + + if (table[index + 2] <= value) { + index += 2; + } + + if (table[index + 1] <= value) { + index += 1; + } + + if (table[index] > value) { + index -= 1; + } + + if (index < 0 || table[index] != value) { + return -1; + } + + return index; +} + +void NGramParser::lookup(int32_t thisNgram) +{ + ngramCount += 1; + + if (search(ngramList, thisNgram) >= 0) { + hitCount += 1; + } + +} + +void NGramParser::addByte(int32_t b) +{ + ngram = ((ngram << 8) + b) & N_GRAM_MASK; + lookup(ngram); +} + +int32_t NGramParser::nextByte(InputText *det) +{ + if (byteIndex >= det->fInputLen) { + return -1; + } + + return det->fInputBytes[byteIndex++]; +} + +void NGramParser::parseCharacters(InputText *det) +{ + int32_t b; + bool ignoreSpace = FALSE; + + while ((b = nextByte(det)) >= 0) { + uint8_t mb = charMap[b]; + + // TODO: 0x20 might not be a space in all character sets... + if (mb != 0) { + if (!(mb == 0x20 && ignoreSpace)) { + addByte(mb); + } + + ignoreSpace = (mb == 0x20); + } + } +} + +int32_t NGramParser::parse(InputText *det) +{ + parseCharacters(det); + + // TODO: Is this OK? The buffer could have ended in the middle of a word... + addByte(0x20); + + double rawPercent = (double) hitCount / (double) ngramCount; + + // if (rawPercent <= 2.0) { + // return 0; + // } + + // TODO - This is a bit of a hack to take care of a case + // were we were getting a confidence of 135... + if (rawPercent > 0.33) { + return 98; + } + + return (int32_t) (rawPercent * 300.0); +} + +static const uint8_t unshapeMap_IBM420[] = { +/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */ +/* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 4- */ 0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, +/* 5- */ 0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F, +/* 6- */ 0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, +/* 7- */ 0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, +/* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x8B, 0x8B, 0x8D, 0x8D, 0x8F, +/* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9A, 0x9A, 0x9A, 0x9E, 0x9E, +/* A- */ 0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0xAB, 0xAB, 0xAD, 0xAD, 0xAF, +/* B- */ 0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0xBB, 0xBB, 0xBD, 0xBD, 0xBF, +/* C- */ 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xBF, 0xCC, 0xBF, 0xCE, 0xCF, +/* D- */ 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDA, 0xDC, 0xDC, 0xDC, 0xDF, +/* E- */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, +/* F- */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, +}; + +NGramParser_IBM420::NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap):NGramParser(theNgramList, theCharMap) +{ + alef = 0x00; +} + + +int32_t NGramParser_IBM420::isLamAlef(int32_t b) +{ + if(b == 0xB2 || b == 0xB3){ + return 0x47; + }else if(b == 0xB4 || b == 0xB5){ + return 0x49; + }else if(b == 0xB8 || b == 0xB9){ + return 0x56; + }else + return 0x00; +} + +/* +* Arabic shaping needs to be done manually. Cannot call ArabicShaping class +* because CharsetDetector is dealing with bytes not Unicode code points. We could +* convert the bytes to Unicode code points but that would leave us dependent +* on CharsetICU which we try to avoid. IBM420 converter amongst different versions +* of JDK can produce different results and therefore is also avoided. +*/ +int32_t NGramParser_IBM420::nextByte(InputText *det) +{ + + if (byteIndex >= det->fInputLen || det->fInputBytes[byteIndex] == 0) { + return -1; + } + int next; + + alef = isLamAlef(det->fInputBytes[byteIndex]); + if(alef != 0x00) + next = 0xB1 & 0xFF; + else + next = unshapeMap_IBM420[det->fInputBytes[byteIndex]& 0xFF] & 0xFF; + + byteIndex++; + + return next; +} + +void NGramParser_IBM420::parseCharacters(InputText *det) +{ + int32_t b; + bool ignoreSpace = FALSE; + + while ((b = nextByte(det)) >= 0) { + uint8_t mb = charMap[b]; + + // TODO: 0x20 might not be a space in all character sets... + if (mb != 0) { + if (!(mb == 0x20 && ignoreSpace)) { + addByte(mb); + } + ignoreSpace = (mb == 0x20); + } + + if(alef != 0x00){ + mb = charMap[alef & 0xFF]; + + // TODO: 0x20 might not be a space in all character sets... + if (mb != 0) { + if (!(mb == 0x20 && ignoreSpace)) { + addByte(mb); + } + + ignoreSpace = (mb == 0x20); + } + + } + } +} + +CharsetRecog_sbcs::CharsetRecog_sbcs() +{ + // nothing else to do +} + +CharsetRecog_sbcs::~CharsetRecog_sbcs() +{ + // nothing to do +} + +int32_t CharsetRecog_sbcs::match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t byteMap[]) const +{ + NGramParser parser(ngrams, byteMap); + int32_t result; + + result = parser.parse(det); + + return result; +} + +static const uint8_t charMap_8859_1[] = { + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20, + 0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, + 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, + 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, +}; + +static const uint8_t charMap_8859_2[] = { + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0x20, + 0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF, + 0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0xB7, + 0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, + 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, + 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20, +}; + +static const uint8_t charMap_8859_5[] = { + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, + 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF, + 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, + 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, + 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, + 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF, +}; + +static const uint8_t charMap_8859_6[] = { + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, + 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, + 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, + 0xD8, 0xD9, 0xDA, 0x20, 0x20, 0x20, 0x20, 0x20, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +}; + +static const uint8_t charMap_8859_7[] = { + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0xA1, 0xA2, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xDC, 0x20, + 0xDD, 0xDE, 0xDF, 0x20, 0xFC, 0x20, 0xFD, 0xFE, + 0xC0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xF0, 0xF1, 0x20, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, + 0xF8, 0xF9, 0xFA, 0xFB, 0xDC, 0xDD, 0xDE, 0xDF, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, + 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20, +}; + +static const uint8_t charMap_8859_8[] = { + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, + 0xF8, 0xF9, 0xFA, 0x20, 0x20, 0x20, 0x20, 0x20, +}; + +static const uint8_t charMap_8859_9[] = { + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20, + 0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, + 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x69, 0xFE, 0xDF, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, + 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, +}; + +static const int32_t ngrams_windows_1251[] = { + 0x20E220, 0x20E2EE, 0x20E4EE, 0x20E7E0, 0x20E820, 0x20EAE0, 0x20EAEE, 0x20EDE0, 0x20EDE5, 0x20EEE1, 0x20EFEE, 0x20EFF0, 0x20F0E0, 0x20F1EE, 0x20F1F2, 0x20F2EE, + 0x20F7F2, 0x20FDF2, 0xE0EDE8, 0xE0F2FC, 0xE3EE20, 0xE5EBFC, 0xE5EDE8, 0xE5F1F2, 0xE5F220, 0xE820EF, 0xE8E520, 0xE8E820, 0xE8FF20, 0xEBE5ED, 0xEBE820, 0xEBFCED, + 0xEDE020, 0xEDE520, 0xEDE8E5, 0xEDE8FF, 0xEDEE20, 0xEDEEE2, 0xEE20E2, 0xEE20EF, 0xEE20F1, 0xEEE220, 0xEEE2E0, 0xEEE3EE, 0xEEE920, 0xEEEBFC, 0xEEEC20, 0xEEF1F2, + 0xEFEEEB, 0xEFF0E5, 0xEFF0E8, 0xEFF0EE, 0xF0E0E2, 0xF0E5E4, 0xF1F2E0, 0xF1F2E2, 0xF1F2E8, 0xF1FF20, 0xF2E5EB, 0xF2EE20, 0xF2EEF0, 0xF2FC20, 0xF7F2EE, 0xFBF520, +}; + +static const uint8_t charMap_windows_1251[] = { + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x90, 0x83, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F, + 0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F, + 0x20, 0xA2, 0xA2, 0xBC, 0x20, 0xB4, 0x20, 0x20, + 0xB8, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0xBF, + 0x20, 0x20, 0xB3, 0xB3, 0xB4, 0xB5, 0x20, 0x20, + 0xB8, 0x20, 0xBA, 0x20, 0xBC, 0xBE, 0xBE, 0xBF, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, + 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, + 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, +}; + +static const int32_t ngrams_windows_1256[] = { + 0x20C7E1, 0x20C7E4, 0x20C8C7, 0x20DAE1, 0x20DDED, 0x20E1E1, 0x20E3E4, 0x20E6C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E120, 0xC7E1C3, 0xC7E1C7, 0xC7E1C8, + 0xC7E1CA, 0xC7E1CC, 0xC7E1CD, 0xC7E1CF, 0xC7E1D3, 0xC7E1DA, 0xC7E1DE, 0xC7E1E3, 0xC7E1E6, 0xC7E1ED, 0xC7E320, 0xC7E420, 0xC7E4CA, 0xC820C7, 0xC920C7, 0xC920DD, + 0xC920E1, 0xC920E3, 0xC920E6, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xDA20C7, 0xDAE1EC, 0xDDED20, 0xE120C7, 0xE1C920, 0xE1EC20, 0xE1ED20, + 0xE320C7, 0xE3C720, 0xE3C920, 0xE3E420, 0xE420C7, 0xE520C7, 0xE5C720, 0xE6C7E1, 0xE6E420, 0xEC20C7, 0xED20C7, 0xED20E3, 0xED20E6, 0xEDC920, 0xEDD120, 0xEDE420, +}; + +static const uint8_t charMap_windows_1256[] = { + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x81, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20, + 0x88, 0x20, 0x8A, 0x20, 0x9C, 0x8D, 0x8E, 0x8F, + 0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x98, 0x20, 0x9A, 0x20, 0x9C, 0x20, 0x20, 0x9F, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, + 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, + 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0x20, + 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0x20, 0x20, 0x20, 0x20, 0xF4, 0x20, 0x20, 0x20, + 0x20, 0xF9, 0x20, 0xFB, 0xFC, 0x20, 0x20, 0xFF, +}; + +static const int32_t ngrams_KOI8_R[] = { + 0x20C4CF, 0x20C920, 0x20CBC1, 0x20CBCF, 0x20CEC1, 0x20CEC5, 0x20CFC2, 0x20D0CF, 0x20D0D2, 0x20D2C1, 0x20D3CF, 0x20D3D4, 0x20D4CF, 0x20D720, 0x20D7CF, 0x20DAC1, + 0x20DCD4, 0x20DED4, 0xC1CEC9, 0xC1D4D8, 0xC5CCD8, 0xC5CEC9, 0xC5D3D4, 0xC5D420, 0xC7CF20, 0xC920D0, 0xC9C520, 0xC9C920, 0xC9D120, 0xCCC5CE, 0xCCC920, 0xCCD8CE, + 0xCEC120, 0xCEC520, 0xCEC9C5, 0xCEC9D1, 0xCECF20, 0xCECFD7, 0xCF20D0, 0xCF20D3, 0xCF20D7, 0xCFC7CF, 0xCFCA20, 0xCFCCD8, 0xCFCD20, 0xCFD3D4, 0xCFD720, 0xCFD7C1, + 0xD0CFCC, 0xD0D2C5, 0xD0D2C9, 0xD0D2CF, 0xD2C1D7, 0xD2C5C4, 0xD3D120, 0xD3D4C1, 0xD3D4C9, 0xD3D4D7, 0xD4C5CC, 0xD4CF20, 0xD4CFD2, 0xD4D820, 0xD9C820, 0xDED4CF, +}; + +static const uint8_t charMap_KOI8_R[] = { + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, + 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, + 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, + 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, + 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, + 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, + 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, + 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, +}; + +static const int32_t ngrams_IBM424_he_rtl[] = { + 0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641, + 0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045, + 0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x544041, 0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x554056, + 0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x684045, 0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x714069, +}; + +static const int32_t ngrams_IBM424_he_ltr[] = { + 0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x405462, 0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x407141, + 0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x414645, 0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x454054, + 0x454055, 0x454057, 0x454068, 0x454071, 0x455440, 0x464140, 0x464540, 0x484140, 0x514140, 0x514240, 0x514540, 0x544045, 0x544055, 0x544071, 0x546240, 0x546940, + 0x555151, 0x555158, 0x555168, 0x564045, 0x564055, 0x564071, 0x564240, 0x564540, 0x624540, 0x694045, 0x694055, 0x694071, 0x694540, 0x714140, 0x714540, 0x714651, +}; + +static const uint8_t charMap_IBM424_he[] = { +/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */ +/* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 4- */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 5- */ 0x40, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 6- */ 0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 7- */ 0x40, 0x71, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x00, 0x40, 0x40, +/* 8- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 9- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* B- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +}; + +static const int32_t ngrams_IBM420_ar_rtl[] = { + 0x4056B1, 0x4056BD, 0x405856, 0x409AB1, 0x40ABDC, 0x40B1B1, 0x40BBBD, 0x40CF56, 0x564056, 0x564640, 0x566340, 0x567540, 0x56B140, 0x56B149, 0x56B156, 0x56B158, + 0x56B163, 0x56B167, 0x56B169, 0x56B173, 0x56B178, 0x56B19A, 0x56B1AD, 0x56B1BB, 0x56B1CF, 0x56B1DC, 0x56BB40, 0x56BD40, 0x56BD63, 0x584056, 0x624056, 0x6240AB, + 0x6240B1, 0x6240BB, 0x6240CF, 0x634056, 0x734056, 0x736240, 0x754056, 0x756240, 0x784056, 0x9A4056, 0x9AB1DA, 0xABDC40, 0xB14056, 0xB16240, 0xB1DA40, 0xB1DC40, + 0xBB4056, 0xBB5640, 0xBB6240, 0xBBBD40, 0xBD4056, 0xBF4056, 0xBF5640, 0xCF56B1, 0xCFBD40, 0xDA4056, 0xDC4056, 0xDC40BB, 0xDC40CF, 0xDC6240, 0xDC7540, 0xDCBD40, +}; + +static const int32_t ngrams_IBM420_ar_ltr[] = { + 0x404656, 0x4056BB, 0x4056BF, 0x406273, 0x406275, 0x4062B1, 0x4062BB, 0x4062DC, 0x406356, 0x407556, 0x4075DC, 0x40B156, 0x40BB56, 0x40BD56, 0x40BDBB, 0x40BDCF, + 0x40BDDC, 0x40DAB1, 0x40DCAB, 0x40DCB1, 0x49B156, 0x564056, 0x564058, 0x564062, 0x564063, 0x564073, 0x564075, 0x564078, 0x56409A, 0x5640B1, 0x5640BB, 0x5640BD, + 0x5640BF, 0x5640DA, 0x5640DC, 0x565840, 0x56B156, 0x56CF40, 0x58B156, 0x63B156, 0x63BD56, 0x67B156, 0x69B156, 0x73B156, 0x78B156, 0x9AB156, 0xAB4062, 0xADB156, + 0xB14062, 0xB15640, 0xB156CF, 0xB19A40, 0xB1B140, 0xBB4062, 0xBB40DC, 0xBBB156, 0xBD5640, 0xBDBB40, 0xCF4062, 0xCF40DC, 0xCFB156, 0xDAB19A, 0xDCAB40, 0xDCB156 +}; + +static const uint8_t charMap_IBM420_ar[]= { +/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */ +/* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 4- */ 0x40, 0x40, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 5- */ 0x40, 0x51, 0x52, 0x40, 0x40, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 6- */ 0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 7- */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F, +/* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F, +/* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, +/* B- */ 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0x40, 0x40, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF, +/* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0xCB, 0x40, 0xCD, 0x40, 0xCF, +/* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, +/* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0xEB, 0x40, 0xED, 0xEE, 0xEF, +/* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0xFB, 0xFC, 0xFD, 0xFE, 0x40, +}; + +//ISO-8859-1,2,5,6,7,8,9 Ngrams + +struct NGramsPlusLang { + const int32_t ngrams[64]; + const char * lang; +}; + +static const NGramsPlusLang ngrams_8859_1[] = { + { + { + 0x206120, 0x20616E, 0x206265, 0x20636F, 0x20666F, 0x206861, 0x206865, 0x20696E, 0x206D61, 0x206F66, 0x207072, 0x207265, 0x207361, 0x207374, 0x207468, 0x20746F, + 0x207768, 0x616964, 0x616C20, 0x616E20, 0x616E64, 0x617320, 0x617420, 0x617465, 0x617469, 0x642061, 0x642074, 0x652061, 0x652073, 0x652074, 0x656420, 0x656E74, + 0x657220, 0x657320, 0x666F72, 0x686174, 0x686520, 0x686572, 0x696420, 0x696E20, 0x696E67, 0x696F6E, 0x697320, 0x6E2061, 0x6E2074, 0x6E6420, 0x6E6720, 0x6E7420, + 0x6F6620, 0x6F6E20, 0x6F7220, 0x726520, 0x727320, 0x732061, 0x732074, 0x736169, 0x737420, 0x742074, 0x746572, 0x746861, 0x746865, 0x74696F, 0x746F20, 0x747320, + }, + "en" + }, + { + { + 0x206166, 0x206174, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207369, 0x207374, 0x207469, 0x207669, 0x616620, + 0x616E20, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646572, 0x646574, 0x652073, 0x656420, 0x656465, 0x656E20, 0x656E64, 0x657220, 0x657265, 0x657320, + 0x657420, 0x666F72, 0x676520, 0x67656E, 0x676572, 0x696765, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6572, 0x6C6967, 0x6C6C65, 0x6D6564, 0x6E6465, 0x6E6520, + 0x6E6720, 0x6E6765, 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722064, 0x722065, 0x722073, 0x726520, 0x737465, 0x742073, 0x746520, 0x746572, 0x74696C, 0x766572, + }, + "da" + }, + { + { + 0x20616E, 0x206175, 0x206265, 0x206461, 0x206465, 0x206469, 0x206569, 0x206765, 0x206861, 0x20696E, 0x206D69, 0x207363, 0x207365, 0x20756E, 0x207665, 0x20766F, + 0x207765, 0x207A75, 0x626572, 0x636820, 0x636865, 0x636874, 0x646173, 0x64656E, 0x646572, 0x646965, 0x652064, 0x652073, 0x65696E, 0x656974, 0x656E20, 0x657220, + 0x657320, 0x67656E, 0x68656E, 0x687420, 0x696368, 0x696520, 0x696E20, 0x696E65, 0x697420, 0x6C6963, 0x6C6C65, 0x6E2061, 0x6E2064, 0x6E2073, 0x6E6420, 0x6E6465, + 0x6E6520, 0x6E6720, 0x6E6765, 0x6E7465, 0x722064, 0x726465, 0x726569, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x756E64, 0x756E67, 0x766572, + }, + "de" + }, + { + { + 0x206120, 0x206361, 0x20636F, 0x206465, 0x20656C, 0x20656E, 0x206573, 0x20696E, 0x206C61, 0x206C6F, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365, + 0x20756E, 0x207920, 0x612063, 0x612064, 0x612065, 0x61206C, 0x612070, 0x616369, 0x61646F, 0x616C20, 0x617220, 0x617320, 0x6369F3, 0x636F6E, 0x646520, 0x64656C, + 0x646F20, 0x652064, 0x652065, 0x65206C, 0x656C20, 0x656E20, 0x656E74, 0x657320, 0x657374, 0x69656E, 0x69F36E, 0x6C6120, 0x6C6F73, 0x6E2065, 0x6E7465, 0x6F2064, + 0x6F2065, 0x6F6E20, 0x6F7220, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732064, 0x732065, 0x732070, 0x736520, 0x746520, 0x746F20, 0x756520, 0xF36E20, + }, + "es" + }, + { + { + 0x206175, 0x20636F, 0x206461, 0x206465, 0x206475, 0x20656E, 0x206574, 0x206C61, 0x206C65, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207365, 0x20736F, 0x20756E, + 0x20E020, 0x616E74, 0x617469, 0x636520, 0x636F6E, 0x646520, 0x646573, 0x647520, 0x652061, 0x652063, 0x652064, 0x652065, 0x65206C, 0x652070, 0x652073, 0x656E20, + 0x656E74, 0x657220, 0x657320, 0x657420, 0x657572, 0x696F6E, 0x697320, 0x697420, 0x6C6120, 0x6C6520, 0x6C6573, 0x6D656E, 0x6E2064, 0x6E6520, 0x6E7320, 0x6E7420, + 0x6F6E20, 0x6F6E74, 0x6F7572, 0x717565, 0x72206C, 0x726520, 0x732061, 0x732064, 0x732065, 0x73206C, 0x732070, 0x742064, 0x746520, 0x74696F, 0x756520, 0x757220, + }, + "fr" + }, + { + { + 0x20616C, 0x206368, 0x20636F, 0x206465, 0x206469, 0x206520, 0x20696C, 0x20696E, 0x206C61, 0x207065, 0x207072, 0x20756E, 0x612063, 0x612064, 0x612070, 0x612073, + 0x61746F, 0x636865, 0x636F6E, 0x64656C, 0x646920, 0x652061, 0x652063, 0x652064, 0x652069, 0x65206C, 0x652070, 0x652073, 0x656C20, 0x656C6C, 0x656E74, 0x657220, + 0x686520, 0x692061, 0x692063, 0x692064, 0x692073, 0x696120, 0x696C20, 0x696E20, 0x696F6E, 0x6C6120, 0x6C6520, 0x6C6920, 0x6C6C61, 0x6E6520, 0x6E6920, 0x6E6F20, + 0x6E7465, 0x6F2061, 0x6F2064, 0x6F2069, 0x6F2073, 0x6F6E20, 0x6F6E65, 0x706572, 0x726120, 0x726520, 0x736920, 0x746120, 0x746520, 0x746920, 0x746F20, 0x7A696F, + }, + "it" + }, + { + { + 0x20616C, 0x206265, 0x206461, 0x206465, 0x206469, 0x206565, 0x20656E, 0x206765, 0x206865, 0x20696E, 0x206D61, 0x206D65, 0x206F70, 0x207465, 0x207661, 0x207665, + 0x20766F, 0x207765, 0x207A69, 0x61616E, 0x616172, 0x616E20, 0x616E64, 0x617220, 0x617420, 0x636874, 0x646520, 0x64656E, 0x646572, 0x652062, 0x652076, 0x65656E, + 0x656572, 0x656E20, 0x657220, 0x657273, 0x657420, 0x67656E, 0x686574, 0x696520, 0x696E20, 0x696E67, 0x697320, 0x6E2062, 0x6E2064, 0x6E2065, 0x6E2068, 0x6E206F, + 0x6E2076, 0x6E6465, 0x6E6720, 0x6F6E64, 0x6F6F72, 0x6F7020, 0x6F7220, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x76616E, 0x766572, 0x766F6F, + }, + "nl" + }, + { + { + 0x206174, 0x206176, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207365, 0x20736B, 0x20736F, 0x207374, 0x207469, + 0x207669, 0x20E520, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646574, 0x652073, 0x656420, 0x656E20, 0x656E65, 0x657220, 0x657265, 0x657420, 0x657474, + 0x666F72, 0x67656E, 0x696B6B, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6520, 0x6C6C65, 0x6D6564, 0x6D656E, 0x6E2073, 0x6E6520, 0x6E6720, 0x6E6765, 0x6E6E65, + 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722073, 0x726520, 0x736F6D, 0x737465, 0x742073, 0x746520, 0x74656E, 0x746572, 0x74696C, 0x747420, 0x747465, 0x766572, + }, + "no" + }, + { + { + 0x206120, 0x20636F, 0x206461, 0x206465, 0x20646F, 0x206520, 0x206573, 0x206D61, 0x206E6F, 0x206F20, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365, + 0x20756D, 0x612061, 0x612063, 0x612064, 0x612070, 0x616465, 0x61646F, 0x616C20, 0x617220, 0x617261, 0x617320, 0x636F6D, 0x636F6E, 0x646120, 0x646520, 0x646F20, + 0x646F73, 0x652061, 0x652064, 0x656D20, 0x656E74, 0x657320, 0x657374, 0x696120, 0x696361, 0x6D656E, 0x6E7465, 0x6E746F, 0x6F2061, 0x6F2063, 0x6F2064, 0x6F2065, + 0x6F2070, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732061, 0x732064, 0x732065, 0x732070, 0x737461, 0x746520, 0x746F20, 0x756520, 0xE36F20, 0xE7E36F, + }, + "pt" + }, + { + { + 0x206174, 0x206176, 0x206465, 0x20656E, 0x2066F6, 0x206861, 0x206920, 0x20696E, 0x206B6F, 0x206D65, 0x206F63, 0x2070E5, 0x20736B, 0x20736F, 0x207374, 0x207469, + 0x207661, 0x207669, 0x20E472, 0x616465, 0x616E20, 0x616E64, 0x617220, 0x617474, 0x636820, 0x646520, 0x64656E, 0x646572, 0x646574, 0x656420, 0x656E20, 0x657220, + 0x657420, 0x66F672, 0x67656E, 0x696C6C, 0x696E67, 0x6B6120, 0x6C6C20, 0x6D6564, 0x6E2073, 0x6E6120, 0x6E6465, 0x6E6720, 0x6E6765, 0x6E696E, 0x6F6368, 0x6F6D20, + 0x6F6E20, 0x70E520, 0x722061, 0x722073, 0x726120, 0x736B61, 0x736F6D, 0x742073, 0x746120, 0x746520, 0x746572, 0x74696C, 0x747420, 0x766172, 0xE47220, 0xF67220, + }, + "sv" + } +}; + + +static const NGramsPlusLang ngrams_8859_2[] = { + { + { + 0x206120, 0x206279, 0x20646F, 0x206A65, 0x206E61, 0x206E65, 0x206F20, 0x206F64, 0x20706F, 0x207072, 0x2070F8, 0x20726F, 0x207365, 0x20736F, 0x207374, 0x20746F, + 0x207620, 0x207679, 0x207A61, 0x612070, 0x636520, 0x636820, 0x652070, 0x652073, 0x652076, 0x656D20, 0x656EED, 0x686F20, 0x686F64, 0x697374, 0x6A6520, 0x6B7465, + 0x6C6520, 0x6C6920, 0x6E6120, 0x6EE920, 0x6EEC20, 0x6EED20, 0x6F2070, 0x6F646E, 0x6F6A69, 0x6F7374, 0x6F7520, 0x6F7661, 0x706F64, 0x706F6A, 0x70726F, 0x70F865, + 0x736520, 0x736F75, 0x737461, 0x737469, 0x73746E, 0x746572, 0x746EED, 0x746F20, 0x752070, 0xBE6520, 0xE16EED, 0xE9686F, 0xED2070, 0xED2073, 0xED6D20, 0xF86564, + }, + "cs" + }, + { + { + 0x206120, 0x20617A, 0x206265, 0x206567, 0x20656C, 0x206665, 0x206861, 0x20686F, 0x206973, 0x206B65, 0x206B69, 0x206BF6, 0x206C65, 0x206D61, 0x206D65, 0x206D69, + 0x206E65, 0x20737A, 0x207465, 0x20E973, 0x612061, 0x61206B, 0x61206D, 0x612073, 0x616B20, 0x616E20, 0x617A20, 0x62616E, 0x62656E, 0x656779, 0x656B20, 0x656C20, + 0x656C65, 0x656D20, 0x656E20, 0x657265, 0x657420, 0x657465, 0x657474, 0x677920, 0x686F67, 0x696E74, 0x697320, 0x6B2061, 0x6BF67A, 0x6D6567, 0x6D696E, 0x6E2061, + 0x6E616B, 0x6E656B, 0x6E656D, 0x6E7420, 0x6F6779, 0x732061, 0x737A65, 0x737A74, 0x737AE1, 0x73E967, 0x742061, 0x747420, 0x74E173, 0x7A6572, 0xE16E20, 0xE97320, + }, + "hu" + }, + { + { + 0x20637A, 0x20646F, 0x206920, 0x206A65, 0x206B6F, 0x206D61, 0x206D69, 0x206E61, 0x206E69, 0x206F64, 0x20706F, 0x207072, 0x207369, 0x207720, 0x207769, 0x207779, + 0x207A20, 0x207A61, 0x612070, 0x612077, 0x616E69, 0x636820, 0x637A65, 0x637A79, 0x646F20, 0x647A69, 0x652070, 0x652073, 0x652077, 0x65207A, 0x65676F, 0x656A20, + 0x656D20, 0x656E69, 0x676F20, 0x696120, 0x696520, 0x69656A, 0x6B6120, 0x6B6920, 0x6B6965, 0x6D6965, 0x6E6120, 0x6E6961, 0x6E6965, 0x6F2070, 0x6F7761, 0x6F7769, + 0x706F6C, 0x707261, 0x70726F, 0x70727A, 0x727A65, 0x727A79, 0x7369EA, 0x736B69, 0x737461, 0x776965, 0x796368, 0x796D20, 0x7A6520, 0x7A6965, 0x7A7920, 0xF37720, + }, + "pl" + }, + { + { + 0x206120, 0x206163, 0x206361, 0x206365, 0x20636F, 0x206375, 0x206465, 0x206469, 0x206C61, 0x206D61, 0x207065, 0x207072, 0x207365, 0x2073E3, 0x20756E, 0x20BA69, + 0x20EE6E, 0x612063, 0x612064, 0x617265, 0x617420, 0x617465, 0x617520, 0x636172, 0x636F6E, 0x637520, 0x63E320, 0x646520, 0x652061, 0x652063, 0x652064, 0x652070, + 0x652073, 0x656120, 0x656920, 0x656C65, 0x656E74, 0x657374, 0x692061, 0x692063, 0x692064, 0x692070, 0x696520, 0x696920, 0x696E20, 0x6C6120, 0x6C6520, 0x6C6F72, + 0x6C7569, 0x6E6520, 0x6E7472, 0x6F7220, 0x70656E, 0x726520, 0x726561, 0x727520, 0x73E320, 0x746520, 0x747275, 0x74E320, 0x756920, 0x756C20, 0xBA6920, 0xEE6E20, + }, + "ro" + } +}; + +static const int32_t ngrams_8859_5_ru[] = { + 0x20D220, 0x20D2DE, 0x20D4DE, 0x20D7D0, 0x20D820, 0x20DAD0, 0x20DADE, 0x20DDD0, 0x20DDD5, 0x20DED1, 0x20DFDE, 0x20DFE0, 0x20E0D0, 0x20E1DE, 0x20E1E2, 0x20E2DE, + 0x20E7E2, 0x20EDE2, 0xD0DDD8, 0xD0E2EC, 0xD3DE20, 0xD5DBEC, 0xD5DDD8, 0xD5E1E2, 0xD5E220, 0xD820DF, 0xD8D520, 0xD8D820, 0xD8EF20, 0xDBD5DD, 0xDBD820, 0xDBECDD, + 0xDDD020, 0xDDD520, 0xDDD8D5, 0xDDD8EF, 0xDDDE20, 0xDDDED2, 0xDE20D2, 0xDE20DF, 0xDE20E1, 0xDED220, 0xDED2D0, 0xDED3DE, 0xDED920, 0xDEDBEC, 0xDEDC20, 0xDEE1E2, + 0xDFDEDB, 0xDFE0D5, 0xDFE0D8, 0xDFE0DE, 0xE0D0D2, 0xE0D5D4, 0xE1E2D0, 0xE1E2D2, 0xE1E2D8, 0xE1EF20, 0xE2D5DB, 0xE2DE20, 0xE2DEE0, 0xE2EC20, 0xE7E2DE, 0xEBE520, +}; + +static const int32_t ngrams_8859_6_ar[] = { + 0x20C7E4, 0x20C7E6, 0x20C8C7, 0x20D9E4, 0x20E1EA, 0x20E4E4, 0x20E5E6, 0x20E8C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E420, 0xC7E4C3, 0xC7E4C7, 0xC7E4C8, + 0xC7E4CA, 0xC7E4CC, 0xC7E4CD, 0xC7E4CF, 0xC7E4D3, 0xC7E4D9, 0xC7E4E2, 0xC7E4E5, 0xC7E4E8, 0xC7E4EA, 0xC7E520, 0xC7E620, 0xC7E6CA, 0xC820C7, 0xC920C7, 0xC920E1, + 0xC920E4, 0xC920E5, 0xC920E8, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xD920C7, 0xD9E4E9, 0xE1EA20, 0xE420C7, 0xE4C920, 0xE4E920, 0xE4EA20, + 0xE520C7, 0xE5C720, 0xE5C920, 0xE5E620, 0xE620C7, 0xE720C7, 0xE7C720, 0xE8C7E4, 0xE8E620, 0xE920C7, 0xEA20C7, 0xEA20E5, 0xEA20E8, 0xEAC920, 0xEAD120, 0xEAE620, +}; + +static const int32_t ngrams_8859_7_el[] = { + 0x20E1ED, 0x20E1F0, 0x20E3E9, 0x20E4E9, 0x20E5F0, 0x20E720, 0x20EAE1, 0x20ECE5, 0x20EDE1, 0x20EF20, 0x20F0E1, 0x20F0EF, 0x20F0F1, 0x20F3F4, 0x20F3F5, 0x20F4E7, + 0x20F4EF, 0xDFE120, 0xE120E1, 0xE120F4, 0xE1E920, 0xE1ED20, 0xE1F0FC, 0xE1F220, 0xE3E9E1, 0xE5E920, 0xE5F220, 0xE720F4, 0xE7ED20, 0xE7F220, 0xE920F4, 0xE9E120, + 0xE9EADE, 0xE9F220, 0xEAE1E9, 0xEAE1F4, 0xECE520, 0xED20E1, 0xED20E5, 0xED20F0, 0xEDE120, 0xEFF220, 0xEFF520, 0xF0EFF5, 0xF0F1EF, 0xF0FC20, 0xF220E1, 0xF220E5, + 0xF220EA, 0xF220F0, 0xF220F4, 0xF3E520, 0xF3E720, 0xF3F4EF, 0xF4E120, 0xF4E1E9, 0xF4E7ED, 0xF4E7F2, 0xF4E9EA, 0xF4EF20, 0xF4EFF5, 0xF4F9ED, 0xF9ED20, 0xFEED20, +}; + +static const int32_t ngrams_8859_8_I_he[] = { + 0x20E0E5, 0x20E0E7, 0x20E0E9, 0x20E0FA, 0x20E1E9, 0x20E1EE, 0x20E4E0, 0x20E4E5, 0x20E4E9, 0x20E4EE, 0x20E4F2, 0x20E4F9, 0x20E4FA, 0x20ECE0, 0x20ECE4, 0x20EEE0, + 0x20F2EC, 0x20F9EC, 0xE0FA20, 0xE420E0, 0xE420E1, 0xE420E4, 0xE420EC, 0xE420EE, 0xE420F9, 0xE4E5E0, 0xE5E020, 0xE5ED20, 0xE5EF20, 0xE5F820, 0xE5FA20, 0xE920E4, + 0xE9E420, 0xE9E5FA, 0xE9E9ED, 0xE9ED20, 0xE9EF20, 0xE9F820, 0xE9FA20, 0xEC20E0, 0xEC20E4, 0xECE020, 0xECE420, 0xED20E0, 0xED20E1, 0xED20E4, 0xED20EC, 0xED20EE, + 0xED20F9, 0xEEE420, 0xEF20E4, 0xF0E420, 0xF0E920, 0xF0E9ED, 0xF2EC20, 0xF820E4, 0xF8E9ED, 0xF9EC20, 0xFA20E0, 0xFA20E1, 0xFA20E4, 0xFA20EC, 0xFA20EE, 0xFA20F9, +}; + +static const int32_t ngrams_8859_8_he[] = { + 0x20E0E5, 0x20E0EC, 0x20E4E9, 0x20E4EC, 0x20E4EE, 0x20E4F0, 0x20E9F0, 0x20ECF2, 0x20ECF9, 0x20EDE5, 0x20EDE9, 0x20EFE5, 0x20EFE9, 0x20F8E5, 0x20F8E9, 0x20FAE0, + 0x20FAE5, 0x20FAE9, 0xE020E4, 0xE020EC, 0xE020ED, 0xE020FA, 0xE0E420, 0xE0E5E4, 0xE0EC20, 0xE0EE20, 0xE120E4, 0xE120ED, 0xE120FA, 0xE420E4, 0xE420E9, 0xE420EC, + 0xE420ED, 0xE420EF, 0xE420F8, 0xE420FA, 0xE4EC20, 0xE5E020, 0xE5E420, 0xE7E020, 0xE9E020, 0xE9E120, 0xE9E420, 0xEC20E4, 0xEC20ED, 0xEC20FA, 0xECF220, 0xECF920, + 0xEDE9E9, 0xEDE9F0, 0xEDE9F8, 0xEE20E4, 0xEE20ED, 0xEE20FA, 0xEEE120, 0xEEE420, 0xF2E420, 0xF920E4, 0xF920ED, 0xF920FA, 0xF9E420, 0xFAE020, 0xFAE420, 0xFAE5E9, +}; + +static const int32_t ngrams_8859_9_tr[] = { + 0x206261, 0x206269, 0x206275, 0x206461, 0x206465, 0x206765, 0x206861, 0x20696C, 0x206B61, 0x206B6F, 0x206D61, 0x206F6C, 0x207361, 0x207461, 0x207665, 0x207961, + 0x612062, 0x616B20, 0x616C61, 0x616D61, 0x616E20, 0x616EFD, 0x617220, 0x617261, 0x6172FD, 0x6173FD, 0x617961, 0x626972, 0x646120, 0x646520, 0x646920, 0x652062, + 0x65206B, 0x656469, 0x656E20, 0x657220, 0x657269, 0x657369, 0x696C65, 0x696E20, 0x696E69, 0x697220, 0x6C616E, 0x6C6172, 0x6C6520, 0x6C6572, 0x6E2061, 0x6E2062, + 0x6E206B, 0x6E6461, 0x6E6465, 0x6E6520, 0x6E6920, 0x6E696E, 0x6EFD20, 0x72696E, 0x72FD6E, 0x766520, 0x796120, 0x796F72, 0xFD6E20, 0xFD6E64, 0xFD6EFD, 0xFDF0FD, +}; + +CharsetRecog_8859_1::~CharsetRecog_8859_1() +{ + // nothing to do +} + +UBool CharsetRecog_8859_1::match(InputText *textIn, CharsetMatch *results) const { + const char *name = textIn->fC1Bytes? "windows-1252" : "ISO-8859-1"; + uint32_t i; + int32_t bestConfidenceSoFar = -1; + for (i=0; i < ARRAY_SIZE(ngrams_8859_1) ; i++) { + const int32_t *ngrams = ngrams_8859_1[i].ngrams; + const char *lang = ngrams_8859_1[i].lang; + int32_t confidence = match_sbcs(textIn, ngrams, charMap_8859_1); + if (confidence > bestConfidenceSoFar) { + results->set(textIn, this, confidence, name, lang); + bestConfidenceSoFar = confidence; + } + } + return (bestConfidenceSoFar > 0); +} + +const char *CharsetRecog_8859_1::getName() const +{ + return "ISO-8859-1"; +} + + +CharsetRecog_8859_2::~CharsetRecog_8859_2() +{ + // nothing to do +} + +UBool CharsetRecog_8859_2::match(InputText *textIn, CharsetMatch *results) const { + const char *name = textIn->fC1Bytes? "windows-1250" : "ISO-8859-2"; + uint32_t i; + int32_t bestConfidenceSoFar = -1; + for (i=0; i < ARRAY_SIZE(ngrams_8859_2) ; i++) { + const int32_t *ngrams = ngrams_8859_2[i].ngrams; + const char *lang = ngrams_8859_2[i].lang; + int32_t confidence = match_sbcs(textIn, ngrams, charMap_8859_2); + if (confidence > bestConfidenceSoFar) { + results->set(textIn, this, confidence, name, lang); + bestConfidenceSoFar = confidence; + } + } + return (bestConfidenceSoFar > 0); +} + +const char *CharsetRecog_8859_2::getName() const +{ + return "ISO-8859-2"; +} + + +CharsetRecog_8859_5::~CharsetRecog_8859_5() +{ + // nothing to do +} + +const char *CharsetRecog_8859_5::getName() const +{ + return "ISO-8859-5"; +} + +CharsetRecog_8859_5_ru::~CharsetRecog_8859_5_ru() +{ + // nothing to do +} + +const char *CharsetRecog_8859_5_ru::getLanguage() const +{ + return "ru"; +} + +UBool CharsetRecog_8859_5_ru::match(InputText *textIn, CharsetMatch *results) const +{ + int32_t confidence = match_sbcs(textIn, ngrams_8859_5_ru, charMap_8859_5); + results->set(textIn, this, confidence); + return (confidence > 0); +} + +CharsetRecog_8859_6::~CharsetRecog_8859_6() +{ + // nothing to do +} + +const char *CharsetRecog_8859_6::getName() const +{ + return "ISO-8859-6"; +} + +CharsetRecog_8859_6_ar::~CharsetRecog_8859_6_ar() +{ + // nothing to do +} + +const char *CharsetRecog_8859_6_ar::getLanguage() const +{ + return "ar"; +} + +UBool CharsetRecog_8859_6_ar::match(InputText *textIn, CharsetMatch *results) const +{ + int32_t confidence = match_sbcs(textIn, ngrams_8859_6_ar, charMap_8859_6); + results->set(textIn, this, confidence); + return (confidence > 0); +} + +CharsetRecog_8859_7::~CharsetRecog_8859_7() +{ + // nothing to do +} + +const char *CharsetRecog_8859_7::getName() const +{ + return "ISO-8859-7"; +} + +CharsetRecog_8859_7_el::~CharsetRecog_8859_7_el() +{ + // nothing to do +} + +const char *CharsetRecog_8859_7_el::getLanguage() const +{ + return "el"; +} + +UBool CharsetRecog_8859_7_el::match(InputText *textIn, CharsetMatch *results) const +{ + const char *name = textIn->fC1Bytes? "windows-1253" : "ISO-8859-7"; + int32_t confidence = match_sbcs(textIn, ngrams_8859_7_el, charMap_8859_7); + results->set(textIn, this, confidence, name, "el"); + return (confidence > 0); +} + +CharsetRecog_8859_8::~CharsetRecog_8859_8() +{ + // nothing to do +} + +const char *CharsetRecog_8859_8::getName() const +{ + return "ISO-8859-8"; +} + +CharsetRecog_8859_8_I_he::~CharsetRecog_8859_8_I_he () +{ + // nothing to do +} + +const char *CharsetRecog_8859_8_I_he::getName() const +{ + return "ISO-8859-8-I"; +} + +const char *CharsetRecog_8859_8_I_he::getLanguage() const +{ + return "he"; +} + +UBool CharsetRecog_8859_8_I_he::match(InputText *textIn, CharsetMatch *results) const +{ + const char *name = textIn->fC1Bytes? "windows-1255" : "ISO-8859-8-I"; + int32_t confidence = match_sbcs(textIn, ngrams_8859_8_I_he, charMap_8859_8); + results->set(textIn, this, confidence, name, "he"); + return (confidence > 0); +} + +CharsetRecog_8859_8_he::~CharsetRecog_8859_8_he() +{ + // od ot gnihton +} + +const char *CharsetRecog_8859_8_he::getLanguage() const +{ + return "he"; +} + +UBool CharsetRecog_8859_8_he::match(InputText *textIn, CharsetMatch *results) const +{ + const char *name = textIn->fC1Bytes? "windows-1255" : "ISO-8859-8"; + int32_t confidence = match_sbcs(textIn, ngrams_8859_8_he, charMap_8859_8); + results->set(textIn, this, confidence, name, "he"); + return (confidence > 0); +} + +CharsetRecog_8859_9::~CharsetRecog_8859_9() +{ + // nothing to do +} + +const char *CharsetRecog_8859_9::getName() const +{ + return "ISO-8859-9"; +} + +CharsetRecog_8859_9_tr::~CharsetRecog_8859_9_tr () +{ + // nothing to do +} + +const char *CharsetRecog_8859_9_tr::getLanguage() const +{ + return "tr"; +} + +UBool CharsetRecog_8859_9_tr::match(InputText *textIn, CharsetMatch *results) const +{ + const char *name = textIn->fC1Bytes? "windows-1254" : "ISO-8859-9"; + int32_t confidence = match_sbcs(textIn, ngrams_8859_9_tr, charMap_8859_9); + results->set(textIn, this, confidence, name, "tr"); + return (confidence > 0); +} + +CharsetRecog_windows_1256::~CharsetRecog_windows_1256() +{ + // nothing to do +} + +const char *CharsetRecog_windows_1256::getName() const +{ + return "windows-1256"; +} + +const char *CharsetRecog_windows_1256::getLanguage() const +{ + return "ar"; +} + +UBool CharsetRecog_windows_1256::match(InputText *textIn, CharsetMatch *results) const +{ + int32_t confidence = match_sbcs(textIn, ngrams_windows_1256, charMap_windows_1256); + results->set(textIn, this, confidence); + return (confidence > 0); +} + +CharsetRecog_windows_1251::~CharsetRecog_windows_1251() +{ + // nothing to do +} + +const char *CharsetRecog_windows_1251::getName() const +{ + return "windows-1251"; +} + +const char *CharsetRecog_windows_1251::getLanguage() const +{ + return "ru"; +} + +UBool CharsetRecog_windows_1251::match(InputText *textIn, CharsetMatch *results) const +{ + int32_t confidence = match_sbcs(textIn, ngrams_windows_1251, charMap_windows_1251); + results->set(textIn, this, confidence); + return (confidence > 0); +} + +CharsetRecog_KOI8_R::~CharsetRecog_KOI8_R() +{ + // nothing to do +} + +const char *CharsetRecog_KOI8_R::getName() const +{ + return "KOI8-R"; +} + +const char *CharsetRecog_KOI8_R::getLanguage() const +{ + return "ru"; +} + +UBool CharsetRecog_KOI8_R::match(InputText *textIn, CharsetMatch *results) const +{ + int32_t confidence = match_sbcs(textIn, ngrams_KOI8_R, charMap_KOI8_R); + results->set(textIn, this, confidence); + return (confidence > 0); +} + +CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he() +{ + // nothing to do +} + +const char *CharsetRecog_IBM424_he::getLanguage() const +{ + return "he"; +} + +CharsetRecog_IBM424_he_rtl::~CharsetRecog_IBM424_he_rtl() +{ + // nothing to do +} + +const char *CharsetRecog_IBM424_he_rtl::getName() const +{ + return "IBM424_rtl"; +} + +UBool CharsetRecog_IBM424_he_rtl::match(InputText *textIn, CharsetMatch *results) const +{ + int32_t confidence = match_sbcs(textIn, ngrams_IBM424_he_rtl, charMap_IBM424_he); + results->set(textIn, this, confidence); + return (confidence > 0); +} + +CharsetRecog_IBM424_he_ltr::~CharsetRecog_IBM424_he_ltr() +{ + // nothing to do +} + +const char *CharsetRecog_IBM424_he_ltr::getName() const +{ + return "IBM424_ltr"; +} + +UBool CharsetRecog_IBM424_he_ltr::match(InputText *textIn, CharsetMatch *results) const +{ + int32_t confidence = match_sbcs(textIn, ngrams_IBM424_he_ltr, charMap_IBM424_he); + results->set(textIn, this, confidence); + return (confidence > 0); +} + +CharsetRecog_IBM420_ar::~CharsetRecog_IBM420_ar() +{ + // nothing to do +} + +const char *CharsetRecog_IBM420_ar::getLanguage() const +{ + return "ar"; +} + + +int32_t CharsetRecog_IBM420_ar::match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t byteMap[]) const +{ + NGramParser_IBM420 parser(ngrams, byteMap); + int32_t result; + + result = parser.parse(det); + + return result; +} + +CharsetRecog_IBM420_ar_rtl::~CharsetRecog_IBM420_ar_rtl() +{ + // nothing to do +} + +const char *CharsetRecog_IBM420_ar_rtl::getName() const +{ + return "IBM420_rtl"; +} + +UBool CharsetRecog_IBM420_ar_rtl::match(InputText *textIn, CharsetMatch *results) const +{ + int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_rtl, charMap_IBM420_ar); + results->set(textIn, this, confidence); + return (confidence > 0); +} + +CharsetRecog_IBM420_ar_ltr::~CharsetRecog_IBM420_ar_ltr() +{ + // nothing to do +} + +const char *CharsetRecog_IBM420_ar_ltr::getName() const +{ + return "IBM420_ltr"; +} + +UBool CharsetRecog_IBM420_ar_ltr::match(InputText *textIn, CharsetMatch *results) const +{ + int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_ltr, charMap_IBM420_ar); + results->set(textIn, this, confidence); + return (confidence > 0); +} + +U_NAMESPACE_END +#endif + diff --git a/src/core/basetypes/icu-ucsdet/csrucode.cpp b/src/core/basetypes/icu-ucsdet/csrucode.cpp new file mode 100644 index 00000000..f0983430 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/csrucode.cpp @@ -0,0 +1,198 @@ +/* + ********************************************************************** + * Copyright (C) 2005-2013, International Business Machines + * Corporation and others. All Rights Reserved. + ********************************************************************** + */ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_CONVERSION + +#include "csrucode.h" +#include "csmatch.h" + +U_NAMESPACE_BEGIN + +CharsetRecog_Unicode::~CharsetRecog_Unicode() +{ + // nothing to do +} + +CharsetRecog_UTF_16_BE::~CharsetRecog_UTF_16_BE() +{ + // nothing to do +} + +const char *CharsetRecog_UTF_16_BE::getName() const +{ + return "UTF-16BE"; +} + +// UTF-16 confidence calculation. Very simple minded, but better than nothing. +// Any 8 bit non-control characters bump the confidence up. These have a zero high byte, +// and are very likely to be UTF-16, although they could also be part of a UTF-32 code. +// NULs are a contra-indication, they will appear commonly if the actual encoding is UTF-32. +// NULs should be rare in actual text. + +static int32_t adjustConfidence(UChar codeUnit, int32_t confidence) { + if (codeUnit == 0) { + confidence -= 10; + } else if ((codeUnit >= 0x20 && codeUnit <= 0xff) || codeUnit == 0x0a) { + confidence += 10; + } + if (confidence < 0) { + confidence = 0; + } else if (confidence > 100) { + confidence = 100; + } + return confidence; +} + + +UBool CharsetRecog_UTF_16_BE::match(InputText* textIn, CharsetMatch *results) const +{ + const uint8_t *input = textIn->fRawInput; + int32_t confidence = 10; + int32_t length = textIn->fRawLength; + + int32_t bytesToCheck = (length > 30) ? 30 : length; + for (int32_t charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) { + UChar codeUnit = (input[charIndex] << 8) | input[charIndex + 1]; + if (charIndex == 0 && codeUnit == 0xFEFF) { + confidence = 100; + break; + } + confidence = adjustConfidence(codeUnit, confidence); + if (confidence == 0 || confidence == 100) { + break; + } + } + if (bytesToCheck < 4 && confidence < 100) { + confidence = 0; + } + results->set(textIn, this, confidence); + return (confidence > 0); +} + +CharsetRecog_UTF_16_LE::~CharsetRecog_UTF_16_LE() +{ + // nothing to do +} + +const char *CharsetRecog_UTF_16_LE::getName() const +{ + return "UTF-16LE"; +} + +UBool CharsetRecog_UTF_16_LE::match(InputText* textIn, CharsetMatch *results) const +{ + const uint8_t *input = textIn->fRawInput; + int32_t confidence = 10; + int32_t length = textIn->fRawLength; + + int32_t bytesToCheck = (length > 30) ? 30 : length; + for (int32_t charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) { + UChar codeUnit = input[charIndex] | (input[charIndex + 1] << 8); + if (charIndex == 0 && codeUnit == 0xFEFF) { + confidence = 100; // UTF-16 BOM + if (length >= 4 && input[2] == 0 && input[3] == 0) { + confidence = 0; // UTF-32 BOM + } + break; + } + confidence = adjustConfidence(codeUnit, confidence); + if (confidence == 0 || confidence == 100) { + break; + } + } + if (bytesToCheck < 4 && confidence < 100) { + confidence = 0; + } + results->set(textIn, this, confidence); + return (confidence > 0); +} + +CharsetRecog_UTF_32::~CharsetRecog_UTF_32() +{ + // nothing to do +} + +UBool CharsetRecog_UTF_32::match(InputText* textIn, CharsetMatch *results) const +{ + const uint8_t *input = textIn->fRawInput; + int32_t limit = (textIn->fRawLength / 4) * 4; + int32_t numValid = 0; + int32_t numInvalid = 0; + bool hasBOM = FALSE; + int32_t confidence = 0; + + if (limit > 0 && getChar(input, 0) == 0x0000FEFFUL) { + hasBOM = TRUE; + } + + for(int32_t i = 0; i < limit; i += 4) { + int32_t ch = getChar(input, i); + + if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) { + numInvalid += 1; + } else { + numValid += 1; + } + } + + + // Cook up some sort of confidence score, based on presense of a BOM + // and the existence of valid and/or invalid multi-byte sequences. + if (hasBOM && numInvalid==0) { + confidence = 100; + } else if (hasBOM && numValid > numInvalid*10) { + confidence = 80; + } else if (numValid > 3 && numInvalid == 0) { + confidence = 100; + } else if (numValid > 0 && numInvalid == 0) { + confidence = 80; + } else if (numValid > numInvalid*10) { + // Probably corruput UTF-32BE data. Valid sequences aren't likely by chance. + confidence = 25; + } + + results->set(textIn, this, confidence); + return (confidence > 0); +} + +CharsetRecog_UTF_32_BE::~CharsetRecog_UTF_32_BE() +{ + // nothing to do +} + +const char *CharsetRecog_UTF_32_BE::getName() const +{ + return "UTF-32BE"; +} + +int32_t CharsetRecog_UTF_32_BE::getChar(const uint8_t *input, int32_t index) const +{ + return input[index + 0] << 24 | input[index + 1] << 16 | + input[index + 2] << 8 | input[index + 3]; +} + +CharsetRecog_UTF_32_LE::~CharsetRecog_UTF_32_LE() +{ + // nothing to do +} + +const char *CharsetRecog_UTF_32_LE::getName() const +{ + return "UTF-32LE"; +} + +int32_t CharsetRecog_UTF_32_LE::getChar(const uint8_t *input, int32_t index) const +{ + return input[index + 3] << 24 | input[index + 2] << 16 | + input[index + 1] << 8 | input[index + 0]; +} + +U_NAMESPACE_END +#endif + diff --git a/src/core/basetypes/icu-ucsdet/csrutf8.cpp b/src/core/basetypes/icu-ucsdet/csrutf8.cpp new file mode 100644 index 00000000..b18aa77e --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/csrutf8.cpp @@ -0,0 +1,109 @@ +/* + ********************************************************************** + * Copyright (C) 2005-2014, International Business Machines + * Corporation and others. All Rights Reserved. + ********************************************************************** + */ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_CONVERSION + +#include "csrutf8.h" +#include "csmatch.h" + +U_NAMESPACE_BEGIN + +CharsetRecog_UTF8::~CharsetRecog_UTF8() +{ + // nothing to do +} + +const char *CharsetRecog_UTF8::getName() const +{ + return "UTF-8"; +} + +UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch *results) const { + bool hasBOM = FALSE; + int32_t numValid = 0; + int32_t numInvalid = 0; + const uint8_t *inputBytes = input->fRawInput; + int32_t i; + int32_t trailBytes = 0; + int32_t confidence; + + if (input->fRawLength >= 3 && + inputBytes[0] == 0xEF && inputBytes[1] == 0xBB && inputBytes[2] == 0xBF) { + hasBOM = TRUE; + } + + // Scan for multi-byte sequences + for (i=0; i < input->fRawLength; i += 1) { + int32_t b = inputBytes[i]; + + if ((b & 0x80) == 0) { + continue; // ASCII + } + + // Hi bit on char found. Figure out how long the sequence should be + if ((b & 0x0E0) == 0x0C0) { + trailBytes = 1; + } else if ((b & 0x0F0) == 0x0E0) { + trailBytes = 2; + } else if ((b & 0x0F8) == 0xF0) { + trailBytes = 3; + } else { + numInvalid += 1; + continue; + } + + // Verify that we've got the right number of trail bytes in the sequence + for (;;) { + i += 1; + + if (i >= input->fRawLength) { + break; + } + + b = inputBytes[i]; + + if ((b & 0xC0) != 0x080) { + numInvalid += 1; + break; + } + + if (--trailBytes == 0) { + numValid += 1; + break; + } + } + + } + + // Cook up some sort of confidence score, based on presence of a BOM + // and the existence of valid and/or invalid multi-byte sequences. + confidence = 0; + if (hasBOM && numInvalid == 0) { + confidence = 100; + } else if (hasBOM && numValid > numInvalid*10) { + confidence = 80; + } else if (numValid > 3 && numInvalid == 0) { + confidence = 100; + } else if (numValid > 0 && numInvalid == 0) { + confidence = 80; + } else if (numValid == 0 && numInvalid == 0) { + // Plain ASCII. Confidence must be > 10, it's more likely than UTF-16, which + // accepts ASCII with confidence = 10. + confidence = 15; + } else if (numValid > numInvalid*10) { + // Probably corruput utf-8 data. Valid sequences aren't likely by chance. + confidence = 25; + } + + results->set(input, this, confidence); + return (confidence > 0); +} + +U_NAMESPACE_END +#endif diff --git a/src/core/basetypes/icu-ucsdet/cstring.c b/src/core/basetypes/icu-ucsdet/cstring.c new file mode 100644 index 00000000..3af959eb --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/cstring.c @@ -0,0 +1,339 @@ +/* +****************************************************************************** +* +* Copyright (C) 1997-2011, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* +* File CSTRING.C +* +* @author Helena Shih +* +* Modification History: +* +* Date Name Description +* 6/18/98 hshih Created +* 09/08/98 stephen Added include for ctype, for Mac Port +* 11/15/99 helena Integrated S/390 IEEE changes. +****************************************************************************** +*/ + + + +#include <stdlib.h> +#include <stdio.h> +#include "unicode/utypes.h" +#include "cmemory.h" +#include "cstring.h" +#include "uassert.h" + +/* + * We hardcode case conversion for invariant characters to match our expectation + * and the compiler execution charset. + * This prevents problems on systems + * - with non-default casing behavior, like Turkish system locales where + * tolower('I') maps to dotless i and toupper('i') maps to dotted I + * - where there are no lowercase Latin characters at all, or using different + * codes (some old EBCDIC codepages) + * + * This works because the compiler usually runs on a platform where the execution + * charset includes all of the invariant characters at their expected + * code positions, so that the char * string literals in ICU code match + * the char literals here. + * + * Note that the set of lowercase Latin letters is discontiguous in EBCDIC + * and the set of uppercase Latin letters is discontiguous as well. + */ + +U_CAPI UBool U_EXPORT2 +uprv_isASCIILetter(char c) { +#if U_CHARSET_FAMILY==U_EBCDIC_FAMILY + return + ('a'<=c && c<='i') || ('j'<=c && c<='r') || ('s'<=c && c<='z') || + ('A'<=c && c<='I') || ('J'<=c && c<='R') || ('S'<=c && c<='Z'); +#else + return ('a'<=c && c<='z') || ('A'<=c && c<='Z'); +#endif +} + +U_CAPI char U_EXPORT2 +uprv_toupper(char c) { +#if U_CHARSET_FAMILY==U_EBCDIC_FAMILY + if(('a'<=c && c<='i') || ('j'<=c && c<='r') || ('s'<=c && c<='z')) { + c=(char)(c+('A'-'a')); + } +#else + if('a'<=c && c<='z') { + c=(char)(c+('A'-'a')); + } +#endif + return c; +} + + +#if 0 +/* + * Commented out because cstring.h defines uprv_tolower() to be + * the same as either uprv_asciitolower() or uprv_ebcdictolower() + * to reduce the amount of code to cover with tests. + * + * Note that this uprv_tolower() definition is likely to work for most + * charset families, not just ASCII and EBCDIC, because its #else branch + * is written generically. + */ +U_CAPI char U_EXPORT2 +uprv_tolower(char c) { +#if U_CHARSET_FAMILY==U_EBCDIC_FAMILY + if(('A'<=c && c<='I') || ('J'<=c && c<='R') || ('S'<=c && c<='Z')) { + c=(char)(c+('a'-'A')); + } +#else + if('A'<=c && c<='Z') { + c=(char)(c+('a'-'A')); + } +#endif + return c; +} +#endif + +U_CAPI char U_EXPORT2 +uprv_asciitolower(char c) { + if(0x41<=c && c<=0x5a) { + c=(char)(c+0x20); + } + return c; +} + +U_CAPI char U_EXPORT2 +uprv_ebcdictolower(char c) { + if( (0xc1<=(uint8_t)c && (uint8_t)c<=0xc9) || + (0xd1<=(uint8_t)c && (uint8_t)c<=0xd9) || + (0xe2<=(uint8_t)c && (uint8_t)c<=0xe9) + ) { + c=(char)(c-0x40); + } + return c; +} + + +U_CAPI char* U_EXPORT2 +T_CString_toLowerCase(char* str) +{ + char* origPtr = str; + + if (str) { + do + *str = (char)uprv_tolower(*str); + while (*(str++)); + } + + return origPtr; +} + +U_CAPI char* U_EXPORT2 +T_CString_toUpperCase(char* str) +{ + char* origPtr = str; + + if (str) { + do + *str = (char)uprv_toupper(*str); + while (*(str++)); + } + + return origPtr; +} + +/* + * Takes a int32_t and fills in a char* string with that number "radix"-based. + * Does not handle negative values (makes an empty string for them). + * Writes at most 12 chars ("-2147483647" plus NUL). + * Returns the length of the string (not including the NUL). + */ +U_CAPI int32_t U_EXPORT2 +T_CString_integerToString(char* buffer, int32_t v, int32_t radix) +{ + char tbuf[30]; + int32_t tbx = sizeof(tbuf); + uint8_t digit; + int32_t length = 0; + uint32_t uval; + + U_ASSERT(radix>=2 && radix<=16); + uval = (uint32_t) v; + if(v<0 && radix == 10) { + /* Only in base 10 do we conside numbers to be signed. */ + uval = (uint32_t)(-v); + buffer[length++] = '-'; + } + + tbx = sizeof(tbuf)-1; + tbuf[tbx] = 0; /* We are generating the digits backwards. Null term the end. */ + do { + digit = (uint8_t)(uval % radix); + tbuf[--tbx] = (char)(T_CString_itosOffset(digit)); + uval = uval / radix; + } while (uval != 0); + + /* copy converted number into user buffer */ + uprv_strcpy(buffer+length, tbuf+tbx); + length += sizeof(tbuf) - tbx -1; + return length; +} + + + +/* + * Takes a int64_t and fills in a char* string with that number "radix"-based. + * Writes at most 21: chars ("-9223372036854775807" plus NUL). + * Returns the length of the string, not including the terminating NULL. + */ +U_CAPI int32_t U_EXPORT2 +T_CString_int64ToString(char* buffer, int64_t v, uint32_t radix) +{ + char tbuf[30]; + int32_t tbx = sizeof(tbuf); + uint8_t digit; + int32_t length = 0; + uint64_t uval; + + U_ASSERT(radix>=2 && radix<=16); + uval = (uint64_t) v; + if(v<0 && radix == 10) { + /* Only in base 10 do we conside numbers to be signed. */ + uval = (uint64_t)(-v); + buffer[length++] = '-'; + } + + tbx = sizeof(tbuf)-1; + tbuf[tbx] = 0; /* We are generating the digits backwards. Null term the end. */ + do { + digit = (uint8_t)(uval % radix); + tbuf[--tbx] = (char)(T_CString_itosOffset(digit)); + uval = uval / radix; + } while (uval != 0); + + /* copy converted number into user buffer */ + uprv_strcpy(buffer+length, tbuf+tbx); + length += sizeof(tbuf) - tbx -1; + return length; +} + + +U_CAPI int32_t U_EXPORT2 +T_CString_stringToInteger(const char *integerString, int32_t radix) +{ + char *end; + return uprv_strtoul(integerString, &end, radix); + +} + +U_CAPI int U_EXPORT2 +uprv_stricmp(const char *str1, const char *str2) { + if(str1==NULL) { + if(str2==NULL) { + return 0; + } else { + return -1; + } + } else if(str2==NULL) { + return 1; + } else { + /* compare non-NULL strings lexically with lowercase */ + int rc; + unsigned char c1, c2; + + for(;;) { + c1=(unsigned char)*str1; + c2=(unsigned char)*str2; + if(c1==0) { + if(c2==0) { + return 0; + } else { + return -1; + } + } else if(c2==0) { + return 1; + } else { + /* compare non-zero characters with lowercase */ + rc=(int)(unsigned char)uprv_tolower(c1)-(int)(unsigned char)uprv_tolower(c2); + if(rc!=0) { + return rc; + } + } + ++str1; + ++str2; + } + } +} + +U_CAPI int U_EXPORT2 +uprv_strnicmp(const char *str1, const char *str2, uint32_t n) { + if(str1==NULL) { + if(str2==NULL) { + return 0; + } else { + return -1; + } + } else if(str2==NULL) { + return 1; + } else { + /* compare non-NULL strings lexically with lowercase */ + int rc; + unsigned char c1, c2; + + for(; n--;) { + c1=(unsigned char)*str1; + c2=(unsigned char)*str2; + if(c1==0) { + if(c2==0) { + return 0; + } else { + return -1; + } + } else if(c2==0) { + return 1; + } else { + /* compare non-zero characters with lowercase */ + rc=(int)(unsigned char)uprv_tolower(c1)-(int)(unsigned char)uprv_tolower(c2); + if(rc!=0) { + return rc; + } + } + ++str1; + ++str2; + } + } + + return 0; +} + +U_CAPI char* U_EXPORT2 +uprv_strdup(const char *src) { + size_t len = uprv_strlen(src) + 1; + char *dup = (char *) uprv_malloc(len); + + if (dup) { + uprv_memcpy(dup, src, len); + } + + return dup; +} + +U_CAPI char* U_EXPORT2 +uprv_strndup(const char *src, int32_t n) { + char *dup; + + if(n < 0) { + dup = uprv_strdup(src); + } else { + dup = (char*)uprv_malloc(n+1); + if (dup) { + uprv_memcpy(dup, src, n); + dup[n] = 0; + } + } + + return dup; +} diff --git a/src/core/basetypes/icu-ucsdet/include/charstr.h b/src/core/basetypes/icu-ucsdet/include/charstr.h new file mode 100644 index 00000000..4b86c835 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/charstr.h @@ -0,0 +1,130 @@ +/* +********************************************************************** +* Copyright (c) 2001-2012, International Business Machines +* Corporation and others. All Rights Reserved. +********************************************************************** +* Date Name Description +* 11/19/2001 aliu Creation. +* 05/19/2010 markus Rewritten from scratch +********************************************************************** +*/ + +#ifndef CHARSTRING_H +#define CHARSTRING_H + +#include "unicode/utypes.h" +#include "unicode/unistr.h" +#include "unicode/uobject.h" +#include "cmemory.h" + +U_NAMESPACE_BEGIN + +// Windows needs us to DLL-export the MaybeStackArray template specialization, +// but MacOS X cannot handle it. Same as in digitlst.h. +#if !U_PLATFORM_IS_DARWIN_BASED +template class U_COMMON_API MaybeStackArray<char, 40>; +#endif + +/** + * ICU-internal char * string class. + * This class does not assume or enforce any particular character encoding. + * Raw bytes can be stored. The string object owns its characters. + * A terminating NUL is stored, but the class does not prevent embedded NUL characters. + * + * This class wants to be convenient but is also deliberately minimalist. + * Please do not add methods if they only add minor convenience. + * For example: + * cs.data()[5]='a'; // no need for setCharAt(5, 'a') + */ +class U_COMMON_API CharString : public UMemory { +public: + CharString() : len(0) { buffer[0]=0; } + CharString(const StringPiece &s, UErrorCode &errorCode) : len(0) { + buffer[0]=0; + append(s, errorCode); + } + CharString(const CharString &s, UErrorCode &errorCode) : len(0) { + buffer[0]=0; + append(s, errorCode); + } + CharString(const char *s, int32_t sLength, UErrorCode &errorCode) : len(0) { + buffer[0]=0; + append(s, sLength, errorCode); + } + ~CharString() {} + + /** + * Replaces this string's contents with the other string's contents. + * CharString does not support the standard copy constructor nor + * the assignment operator, to make copies explicit and to + * use a UErrorCode where memory allocations might be needed. + */ + CharString ©From(const CharString &other, UErrorCode &errorCode); + + UBool isEmpty() const { return len==0; } + int32_t length() const { return len; } + char operator[](int32_t index) const { return buffer[index]; } + StringPiece toStringPiece() const { return StringPiece(buffer.getAlias(), len); } + + const char *data() const { return buffer.getAlias(); } + char *data() { return buffer.getAlias(); } + + CharString &clear() { len=0; buffer[0]=0; return *this; } + CharString &truncate(int32_t newLength); + + CharString &append(char c, UErrorCode &errorCode); + CharString &append(const StringPiece &s, UErrorCode &errorCode) { + return append(s.data(), s.length(), errorCode); + } + CharString &append(const CharString &s, UErrorCode &errorCode) { + return append(s.data(), s.length(), errorCode); + } + CharString &append(const char *s, int32_t sLength, UErrorCode &status); + /** + * Returns a writable buffer for appending and writes the buffer's capacity to + * resultCapacity. Guarantees resultCapacity>=minCapacity if U_SUCCESS(). + * There will additionally be space for a terminating NUL right at resultCapacity. + * (This function is similar to ByteSink.GetAppendBuffer().) + * + * The returned buffer is only valid until the next write operation + * on this string. + * + * After writing at most resultCapacity bytes, call append() with the + * pointer returned from this function and the number of bytes written. + * + * @param minCapacity required minimum capacity of the returned buffer; + * must be non-negative + * @param desiredCapacityHint desired capacity of the returned buffer; + * must be non-negative + * @param resultCapacity will be set to the capacity of the returned buffer + * @param errorCode in/out error code + * @return a buffer with resultCapacity>=min_capacity + */ + char *getAppendBuffer(int32_t minCapacity, + int32_t desiredCapacityHint, + int32_t &resultCapacity, + UErrorCode &errorCode); + + CharString &appendInvariantChars(const UnicodeString &s, UErrorCode &errorCode); + + /** + * Appends a filename/path part, e.g., a directory name. + * First appends a U_FILE_SEP_CHAR if necessary. + * Does nothing if s is empty. + */ + CharString &appendPathPart(const StringPiece &s, UErrorCode &errorCode); + +private: + MaybeStackArray<char, 40> buffer; + int32_t len; + + UBool ensureCapacity(int32_t capacity, int32_t desiredCapacityHint, UErrorCode &errorCode); + + CharString(const CharString &other); // forbid copying of this class + CharString &operator=(const CharString &other); // forbid copying of this class +}; + +U_NAMESPACE_END + +#endif +//eof diff --git a/src/core/basetypes/icu-ucsdet/include/cmemory.h b/src/core/basetypes/icu-ucsdet/include/cmemory.h new file mode 100644 index 00000000..ed29b63e --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/cmemory.h @@ -0,0 +1,607 @@ +/* +****************************************************************************** +* +* Copyright (C) 1997-2014, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* +* File CMEMORY.H +* +* Contains stdlib.h/string.h memory functions +* +* @author Bertrand A. Damiba +* +* Modification History: +* +* Date Name Description +* 6/20/98 Bertrand Created. +* 05/03/99 stephen Changed from functions to macros. +* +****************************************************************************** +*/ + +#ifndef CMEMORY_H +#define CMEMORY_H + +#include "unicode/utypes.h" + +#include <stddef.h> +#include <string.h> +#include "unicode/localpointer.h" + +#if U_DEBUG && defined(UPRV_MALLOC_COUNT) +#include <stdio.h> +#endif + +#if U_DEBUG + +/* + * The C++ standard requires that the source pointer for memcpy() & memmove() + * is valid, not NULL, and not at the end of an allocated memory block. + * In debug mode, we read one byte from the source point to verify that it's + * a valid, readable pointer. + */ + +U_CAPI void uprv_checkValidMemory(const void *p, size_t n); + +#define uprv_memcpy(dst, src, size) ( \ + uprv_checkValidMemory(src, 1), \ + U_STANDARD_CPP_NAMESPACE memcpy(dst, src, size)) +#define uprv_memmove(dst, src, size) ( \ + uprv_checkValidMemory(src, 1), \ + U_STANDARD_CPP_NAMESPACE memmove(dst, src, size)) + +#else + +#define uprv_memcpy(dst, src, size) U_STANDARD_CPP_NAMESPACE memcpy(dst, src, size) +#define uprv_memmove(dst, src, size) U_STANDARD_CPP_NAMESPACE memmove(dst, src, size) + +#endif /* U_DEBUG */ + +/** + * \def UPRV_LENGTHOF + * Convenience macro to determine the length of a fixed array at compile-time. + * @param array A fixed length array + * @return The length of the array, in elements + * @internal + */ +#define UPRV_LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) +#define uprv_memset(buffer, mark, size) U_STANDARD_CPP_NAMESPACE memset(buffer, mark, size) +#define uprv_memcmp(buffer1, buffer2, size) U_STANDARD_CPP_NAMESPACE memcmp(buffer1, buffer2,size) + +U_CAPI void * U_EXPORT2 +uprv_malloc(size_t s) U_MALLOC_ATTR U_ALLOC_SIZE_ATTR(1); + +U_CAPI void * U_EXPORT2 +uprv_realloc(void *mem, size_t size) U_ALLOC_SIZE_ATTR(2); + +U_CAPI void U_EXPORT2 +uprv_free(void *mem); + +U_CAPI void * U_EXPORT2 +uprv_calloc(size_t num, size_t size) U_MALLOC_ATTR U_ALLOC_SIZE_ATTR2(1,2); + +/** + * This should align the memory properly on any machine. + * This is very useful for the safeClone functions. + */ +typedef union { + long t1; + double t2; + void *t3; +} UAlignedMemory; + +/** + * Get the least significant bits of a pointer (a memory address). + * For example, with a mask of 3, the macro gets the 2 least significant bits, + * which will be 0 if the pointer is 32-bit (4-byte) aligned. + * + * ptrdiff_t is the most appropriate integer type to cast to. + * size_t should work too, since on most (or all?) platforms it has the same + * width as ptrdiff_t. + */ +#define U_POINTER_MASK_LSB(ptr, mask) (((ptrdiff_t)(char *)(ptr)) & (mask)) + +/** + * Get the amount of bytes that a pointer is off by from + * the previous UAlignedMemory-aligned pointer. + */ +#define U_ALIGNMENT_OFFSET(ptr) U_POINTER_MASK_LSB(ptr, sizeof(UAlignedMemory) - 1) + +/** + * Get the amount of bytes to add to a pointer + * in order to get the next UAlignedMemory-aligned address. + */ +#define U_ALIGNMENT_OFFSET_UP(ptr) (sizeof(UAlignedMemory) - U_ALIGNMENT_OFFSET(ptr)) + +/** + * Indicate whether the ICU allocation functions have been used. + * This is used to determine whether ICU is in an initial, unused state. + */ +U_CFUNC UBool +cmemory_inUse(void); + +/** + * Heap clean up function, called from u_cleanup() + * Clears any user heap functions from u_setMemoryFunctions() + * Does NOT deallocate any remaining allocated memory. + */ +U_CFUNC UBool +cmemory_cleanup(void); + +/** + * A function called by <TT>uhash_remove</TT>, + * <TT>uhash_close</TT>, or <TT>uhash_put</TT> to delete + * an existing key or value. + * @param obj A key or value stored in a hashtable + * @see uprv_deleteUObject + */ +typedef void U_CALLCONV UObjectDeleter(void* obj); + +/** + * Deleter for UObject instances. + * Works for all subclasses of UObject because it has a virtual destructor. + */ +U_CAPI void U_EXPORT2 +uprv_deleteUObject(void *obj); + +#ifdef __cplusplus + +U_NAMESPACE_BEGIN + +/** + * "Smart pointer" class, deletes memory via uprv_free(). + * For most methods see the LocalPointerBase base class. + * Adds operator[] for array item access. + * + * @see LocalPointerBase + */ +template<typename T> +class LocalMemory : public LocalPointerBase<T> { +public: + /** + * Constructor takes ownership. + * @param p simple pointer to an array of T items that is adopted + */ + explicit LocalMemory(T *p=NULL) : LocalPointerBase<T>(p) {} + /** + * Destructor deletes the memory it owns. + */ + ~LocalMemory() { + uprv_free(LocalPointerBase<T>::ptr); + } + /** + * Deletes the array it owns, + * and adopts (takes ownership of) the one passed in. + * @param p simple pointer to an array of T items that is adopted + */ + void adoptInstead(T *p) { + uprv_free(LocalPointerBase<T>::ptr); + LocalPointerBase<T>::ptr=p; + } + /** + * Deletes the array it owns, allocates a new one and reset its bytes to 0. + * Returns the new array pointer. + * If the allocation fails, then the current array is unchanged and + * this method returns NULL. + * @param newCapacity must be >0 + * @return the allocated array pointer, or NULL if the allocation failed + */ + inline T *allocateInsteadAndReset(int32_t newCapacity=1); + /** + * Deletes the array it owns and allocates a new one, copying length T items. + * Returns the new array pointer. + * If the allocation fails, then the current array is unchanged and + * this method returns NULL. + * @param newCapacity must be >0 + * @param length number of T items to be copied from the old array to the new one; + * must be no more than the capacity of the old array, + * which the caller must track because the LocalMemory does not track it + * @return the allocated array pointer, or NULL if the allocation failed + */ + inline T *allocateInsteadAndCopy(int32_t newCapacity=1, int32_t length=0); + /** + * Array item access (writable). + * No index bounds check. + * @param i array index + * @return reference to the array item + */ + T &operator[](ptrdiff_t i) const { return LocalPointerBase<T>::ptr[i]; } +}; + +template<typename T> +inline T *LocalMemory<T>::allocateInsteadAndReset(int32_t newCapacity) { + if(newCapacity>0) { + T *p=(T *)uprv_malloc(newCapacity*sizeof(T)); + if(p!=NULL) { + uprv_memset(p, 0, newCapacity*sizeof(T)); + uprv_free(LocalPointerBase<T>::ptr); + LocalPointerBase<T>::ptr=p; + } + return p; + } else { + return NULL; + } +} + + +template<typename T> +inline T *LocalMemory<T>::allocateInsteadAndCopy(int32_t newCapacity, int32_t length) { + if(newCapacity>0) { + T *p=(T *)uprv_malloc(newCapacity*sizeof(T)); + if(p!=NULL) { + if(length>0) { + if(length>newCapacity) { + length=newCapacity; + } + uprv_memcpy(p, LocalPointerBase<T>::ptr, length*sizeof(T)); + } + uprv_free(LocalPointerBase<T>::ptr); + LocalPointerBase<T>::ptr=p; + } + return p; + } else { + return NULL; + } +} + +/** + * Simple array/buffer management class using uprv_malloc() and uprv_free(). + * Provides an internal array with fixed capacity. Can alias another array + * or allocate one. + * + * The array address is properly aligned for type T. It might not be properly + * aligned for types larger than T (or larger than the largest subtype of T). + * + * Unlike LocalMemory and LocalArray, this class never adopts + * (takes ownership of) another array. + */ +template<typename T, int32_t stackCapacity> +class MaybeStackArray { +public: + /** + * Default constructor initializes with internal T[stackCapacity] buffer. + */ + MaybeStackArray() : ptr(stackArray), capacity(stackCapacity), needToRelease(FALSE) {} + /** + * Destructor deletes the array (if owned). + */ + ~MaybeStackArray() { releaseArray(); } + /** + * Returns the array capacity (number of T items). + * @return array capacity + */ + int32_t getCapacity() const { return capacity; } + /** + * Access without ownership change. + * @return the array pointer + */ + T *getAlias() const { return ptr; } + /** + * Returns the array limit. Simple convenience method. + * @return getAlias()+getCapacity() + */ + T *getArrayLimit() const { return getAlias()+capacity; } + // No "operator T *() const" because that can make + // expressions like mbs[index] ambiguous for some compilers. + /** + * Array item access (const). + * No index bounds check. + * @param i array index + * @return reference to the array item + */ + const T &operator[](ptrdiff_t i) const { return ptr[i]; } + /** + * Array item access (writable). + * No index bounds check. + * @param i array index + * @return reference to the array item + */ + T &operator[](ptrdiff_t i) { return ptr[i]; } + /** + * Deletes the array (if owned) and aliases another one, no transfer of ownership. + * If the arguments are illegal, then the current array is unchanged. + * @param otherArray must not be NULL + * @param otherCapacity must be >0 + */ + void aliasInstead(T *otherArray, int32_t otherCapacity) { + if(otherArray!=NULL && otherCapacity>0) { + releaseArray(); + ptr=otherArray; + capacity=otherCapacity; + needToRelease=FALSE; + } + } + /** + * Deletes the array (if owned) and allocates a new one, copying length T items. + * Returns the new array pointer. + * If the allocation fails, then the current array is unchanged and + * this method returns NULL. + * @param newCapacity can be less than or greater than the current capacity; + * must be >0 + * @param length number of T items to be copied from the old array to the new one + * @return the allocated array pointer, or NULL if the allocation failed + */ + inline T *resize(int32_t newCapacity, int32_t length=0); + /** + * Gives up ownership of the array if owned, or else clones it, + * copying length T items; resets itself to the internal stack array. + * Returns NULL if the allocation failed. + * @param length number of T items to copy when cloning, + * and capacity of the clone when cloning + * @param resultCapacity will be set to the returned array's capacity (output-only) + * @return the array pointer; + * caller becomes responsible for deleting the array + */ + inline T *orphanOrClone(int32_t length, int32_t &resultCapacity); +private: + T *ptr; + int32_t capacity; + UBool needToRelease; + T stackArray[stackCapacity]; + void releaseArray() { + if(needToRelease) { + uprv_free(ptr); + } + } + /* No comparison operators with other MaybeStackArray's. */ + bool operator==(const MaybeStackArray & /*other*/) {return FALSE;} + bool operator!=(const MaybeStackArray & /*other*/) {return TRUE;} + /* No ownership transfer: No copy constructor, no assignment operator. */ + MaybeStackArray(const MaybeStackArray & /*other*/) {} + void operator=(const MaybeStackArray & /*other*/) {} + + // No heap allocation. Use only on the stack. + // (Declaring these functions private triggers a cascade of problems: + // MSVC insists on exporting an instantiation of MaybeStackArray, which + // requires that all functions be defined. + // An empty implementation of new() is rejected, it must return a value. + // Returning NULL is rejected by gcc for operator new. + // The expedient thing is just not to override operator new. + // While relatively pointless, heap allocated instances will function. + // static void * U_EXPORT2 operator new(size_t size); + // static void * U_EXPORT2 operator new[](size_t size); +#if U_HAVE_PLACEMENT_NEW + // static void * U_EXPORT2 operator new(size_t, void *ptr); +#endif +}; + +template<typename T, int32_t stackCapacity> +inline T *MaybeStackArray<T, stackCapacity>::resize(int32_t newCapacity, int32_t length) { + if(newCapacity>0) { +#if U_DEBUG && defined(UPRV_MALLOC_COUNT) + ::fprintf(::stderr,"MaybeStacArray (resize) alloc %d * %lu\n", newCapacity,sizeof(T)); +#endif + T *p=(T *)uprv_malloc(newCapacity*sizeof(T)); + if(p!=NULL) { + if(length>0) { + if(length>capacity) { + length=capacity; + } + if(length>newCapacity) { + length=newCapacity; + } + uprv_memcpy(p, ptr, length*sizeof(T)); + } + releaseArray(); + ptr=p; + capacity=newCapacity; + needToRelease=TRUE; + } + return p; + } else { + return NULL; + } +} + +template<typename T, int32_t stackCapacity> +inline T *MaybeStackArray<T, stackCapacity>::orphanOrClone(int32_t length, int32_t &resultCapacity) { + T *p; + if(needToRelease) { + p=ptr; + } else if(length<=0) { + return NULL; + } else { + if(length>capacity) { + length=capacity; + } + p=(T *)uprv_malloc(length*sizeof(T)); +#if U_DEBUG && defined(UPRV_MALLOC_COUNT) + ::fprintf(::stderr,"MaybeStacArray (orphan) alloc %d * %lu\n", length,sizeof(T)); +#endif + if(p==NULL) { + return NULL; + } + uprv_memcpy(p, ptr, length*sizeof(T)); + } + resultCapacity=length; + ptr=stackArray; + capacity=stackCapacity; + needToRelease=FALSE; + return p; +} + +/** + * Variant of MaybeStackArray that allocates a header struct and an array + * in one contiguous memory block, using uprv_malloc() and uprv_free(). + * Provides internal memory with fixed array capacity. Can alias another memory + * block or allocate one. + * The stackCapacity is the number of T items in the internal memory, + * not counting the H header. + * Unlike LocalMemory and LocalArray, this class never adopts + * (takes ownership of) another memory block. + */ +template<typename H, typename T, int32_t stackCapacity> +class MaybeStackHeaderAndArray { +public: + /** + * Default constructor initializes with internal H+T[stackCapacity] buffer. + */ + MaybeStackHeaderAndArray() : ptr(&stackHeader), capacity(stackCapacity), needToRelease(FALSE) {} + /** + * Destructor deletes the memory (if owned). + */ + ~MaybeStackHeaderAndArray() { releaseMemory(); } + /** + * Returns the array capacity (number of T items). + * @return array capacity + */ + int32_t getCapacity() const { return capacity; } + /** + * Access without ownership change. + * @return the header pointer + */ + H *getAlias() const { return ptr; } + /** + * Returns the array start. + * @return array start, same address as getAlias()+1 + */ + T *getArrayStart() const { return reinterpret_cast<T *>(getAlias()+1); } + /** + * Returns the array limit. + * @return array limit + */ + T *getArrayLimit() const { return getArrayStart()+capacity; } + /** + * Access without ownership change. Same as getAlias(). + * A class instance can be used directly in expressions that take a T *. + * @return the header pointer + */ + operator H *() const { return ptr; } + /** + * Array item access (writable). + * No index bounds check. + * @param i array index + * @return reference to the array item + */ + T &operator[](ptrdiff_t i) { return getArrayStart()[i]; } + /** + * Deletes the memory block (if owned) and aliases another one, no transfer of ownership. + * If the arguments are illegal, then the current memory is unchanged. + * @param otherArray must not be NULL + * @param otherCapacity must be >0 + */ + void aliasInstead(H *otherMemory, int32_t otherCapacity) { + if(otherMemory!=NULL && otherCapacity>0) { + releaseMemory(); + ptr=otherMemory; + capacity=otherCapacity; + needToRelease=FALSE; + } + } + /** + * Deletes the memory block (if owned) and allocates a new one, + * copying the header and length T array items. + * Returns the new header pointer. + * If the allocation fails, then the current memory is unchanged and + * this method returns NULL. + * @param newCapacity can be less than or greater than the current capacity; + * must be >0 + * @param length number of T items to be copied from the old array to the new one + * @return the allocated pointer, or NULL if the allocation failed + */ + inline H *resize(int32_t newCapacity, int32_t length=0); + /** + * Gives up ownership of the memory if owned, or else clones it, + * copying the header and length T array items; resets itself to the internal memory. + * Returns NULL if the allocation failed. + * @param length number of T items to copy when cloning, + * and array capacity of the clone when cloning + * @param resultCapacity will be set to the returned array's capacity (output-only) + * @return the header pointer; + * caller becomes responsible for deleting the array + */ + inline H *orphanOrClone(int32_t length, int32_t &resultCapacity); +private: + H *ptr; + int32_t capacity; + UBool needToRelease; + // stackHeader must precede stackArray immediately. + H stackHeader; + T stackArray[stackCapacity]; + void releaseMemory() { + if(needToRelease) { + uprv_free(ptr); + } + } + /* No comparison operators with other MaybeStackHeaderAndArray's. */ + bool operator==(const MaybeStackHeaderAndArray & /*other*/) {return FALSE;} + bool operator!=(const MaybeStackHeaderAndArray & /*other*/) {return TRUE;} + /* No ownership transfer: No copy constructor, no assignment operator. */ + MaybeStackHeaderAndArray(const MaybeStackHeaderAndArray & /*other*/) {} + void operator=(const MaybeStackHeaderAndArray & /*other*/) {} + + // No heap allocation. Use only on the stack. + // (Declaring these functions private triggers a cascade of problems; + // see the MaybeStackArray class for details.) + // static void * U_EXPORT2 operator new(size_t size); + // static void * U_EXPORT2 operator new[](size_t size); +#if U_HAVE_PLACEMENT_NEW + // static void * U_EXPORT2 operator new(size_t, void *ptr); +#endif +}; + +template<typename H, typename T, int32_t stackCapacity> +inline H *MaybeStackHeaderAndArray<H, T, stackCapacity>::resize(int32_t newCapacity, + int32_t length) { + if(newCapacity>=0) { +#if U_DEBUG && defined(UPRV_MALLOC_COUNT) + ::fprintf(::stderr,"MaybeStackHeaderAndArray alloc %d + %d * %ul\n", sizeof(H),newCapacity,sizeof(T)); +#endif + H *p=(H *)uprv_malloc(sizeof(H)+newCapacity*sizeof(T)); + if(p!=NULL) { + if(length<0) { + length=0; + } else if(length>0) { + if(length>capacity) { + length=capacity; + } + if(length>newCapacity) { + length=newCapacity; + } + } + uprv_memcpy(p, ptr, sizeof(H)+length*sizeof(T)); + releaseMemory(); + ptr=p; + capacity=newCapacity; + needToRelease=TRUE; + } + return p; + } else { + return NULL; + } +} + +template<typename H, typename T, int32_t stackCapacity> +inline H *MaybeStackHeaderAndArray<H, T, stackCapacity>::orphanOrClone(int32_t length, + int32_t &resultCapacity) { + H *p; + if(needToRelease) { + p=ptr; + } else { + if(length<0) { + length=0; + } else if(length>capacity) { + length=capacity; + } +#if U_DEBUG && defined(UPRV_MALLOC_COUNT) + ::fprintf(::stderr,"MaybeStackHeaderAndArray (orphan) alloc %ul + %d * %lu\n", sizeof(H),length,sizeof(T)); +#endif + p=(H *)uprv_malloc(sizeof(H)+length*sizeof(T)); + if(p==NULL) { + return NULL; + } + uprv_memcpy(p, ptr, sizeof(H)+length*sizeof(T)); + } + resultCapacity=length; + ptr=&stackHeader; + capacity=stackCapacity; + needToRelease=FALSE; + return p; +} + +U_NAMESPACE_END + +#endif /* __cplusplus */ +#endif /* CMEMORY_H */ diff --git a/src/core/basetypes/icu-ucsdet/include/csdetect.h b/src/core/basetypes/icu-ucsdet/include/csdetect.h new file mode 100644 index 00000000..15910453 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/csdetect.h @@ -0,0 +1,65 @@ +/* + ********************************************************************** + * Copyright (C) 2005-2013, International Business Machines + * Corporation and others. All Rights Reserved. + ********************************************************************** + */ + +#ifndef __CSDETECT_H +#define __CSDETECT_H + +#include "unicode/uobject.h" + +#if !UCONFIG_NO_CONVERSION + +U_NAMESPACE_BEGIN + +class InputText; +class CharsetRecognizer; +class CharsetMatch; + +class CharsetDetector : public UMemory +{ +private: + InputText *textIn; + CharsetMatch **resultArray; + int32_t resultCount; + UBool fStripTags; // If true, setText() will strip tags from input text. + UBool fFreshTextSet; + static void setRecognizers(UErrorCode &status); + + UBool *fEnabledRecognizers; // If not null, active set of charset recognizers had + // been changed from the default. The array index is + // corresponding to fCSRecognizers. See setDetectableCharset(). + +public: + CharsetDetector(UErrorCode &status); + + ~CharsetDetector(); + + void setText(const char *in, int32_t len); + + const CharsetMatch * const *detectAll(int32_t &maxMatchesFound, UErrorCode &status); + + const CharsetMatch *detect(UErrorCode& status); + + void setDeclaredEncoding(const char *encoding, int32_t len) const; + + UBool setStripTagsFlag(UBool flag); + + UBool getStripTagsFlag() const; + +// const char *getCharsetName(int32_t index, UErrorCode& status) const; + + static int32_t getDetectableCount(); + + + static UEnumeration * getAllDetectableCharsets(UErrorCode &status); + UEnumeration * getDetectableCharsets(UErrorCode &status) const; + void setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status); +}; + +U_NAMESPACE_END + +#endif +#endif /* __CSDETECT_H */ diff --git a/src/core/basetypes/icu-ucsdet/include/csmatch.h b/src/core/basetypes/icu-ucsdet/include/csmatch.h new file mode 100644 index 00000000..6a3671c4 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/csmatch.h @@ -0,0 +1,69 @@ +/* + ********************************************************************** + * Copyright (C) 2005-2012, International Business Machines + * Corporation and others. All Rights Reserved. + ********************************************************************** + */ + +#ifndef __CSMATCH_H +#define __CSMATCH_H + +#include "unicode/uobject.h" + +#if !UCONFIG_NO_CONVERSION + +U_NAMESPACE_BEGIN + +class InputText; +class CharsetRecognizer; + +/* + * CharsetMatch represents the results produced by one Charset Recognizer for one input text + * Any confidence > 0 indicates a possible match, meaning that the input bytes + * are at least legal. + * + * The full results of a detect are represented by an array of these + * CharsetMatch objects, each representing a possible matching charset. + * + * Note that a single charset recognizer may detect multiple closely related + * charsets, and set different names depending on the exact input bytes seen. + */ +class CharsetMatch : public UMemory +{ + private: + InputText *textIn; + int32_t confidence; + const char *fCharsetName; + const char *fLang; + + public: + CharsetMatch(); + + /** + * fully set the state of this CharsetMatch. + * Called by the CharsetRecognizers to record match results. + * Default (NULL) parameters for names will be filled by calling the + * corresponding getters on the recognizer. + */ + void set(InputText *input, + const CharsetRecognizer *cr, + int32_t conf, + const char *csName=NULL, + const char *lang=NULL); + + /** + * Return the name of the charset for this Match + */ + const char *getName() const; + + const char *getLanguage()const; + + int32_t getConfidence()const; + + int32_t getUChars(UChar *buf, int32_t cap, UErrorCode *status) const; +}; + +U_NAMESPACE_END + +#endif +#endif /* __CSMATCH_H */ diff --git a/src/core/basetypes/icu-ucsdet/include/csr2022.h b/src/core/basetypes/icu-ucsdet/include/csr2022.h new file mode 100644 index 00000000..2ac2b87d --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/csr2022.h @@ -0,0 +1,91 @@ +/* + ********************************************************************** + * Copyright (C) 2005-2012, International Business Machines + * Corporation and others. All Rights Reserved. + ********************************************************************** + */ + +#ifndef __CSR2022_H +#define __CSR2022_H + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_CONVERSION + +#include "csrecog.h" + +U_NAMESPACE_BEGIN + +class CharsetMatch; + +/** + * class CharsetRecog_2022 part of the ICU charset detection imlementation. + * This is a superclass for the individual detectors for + * each of the detectable members of the ISO 2022 family + * of encodings. + * + * The separate classes are nested within this class. + * + * @internal + */ +class CharsetRecog_2022 : public CharsetRecognizer +{ + +public: + virtual ~CharsetRecog_2022() = 0; + +protected: + + /** + * Matching function shared among the 2022 detectors JP, CN and KR + * Counts up the number of legal an unrecognized escape sequences in + * the sample of text, and computes a score based on the total number & + * the proportion that fit the encoding. + * + * + * @param text the byte buffer containing text to analyse + * @param textLen the size of the text in the byte. + * @param escapeSequences the byte escape sequences to test for. + * @return match quality, in the range of 0-100. + */ + int32_t match_2022(const uint8_t *text, + int32_t textLen, + const uint8_t escapeSequences[][5], + int32_t escapeSequences_length) const; + +}; + +class CharsetRecog_2022JP :public CharsetRecog_2022 +{ +public: + virtual ~CharsetRecog_2022JP(); + + const char *getName() const; + + UBool match(InputText *textIn, CharsetMatch *results) const; +}; + +class CharsetRecog_2022KR :public CharsetRecog_2022 { +public: + virtual ~CharsetRecog_2022KR(); + + const char *getName() const; + + UBool match(InputText *textIn, CharsetMatch *results) const; + +}; + +class CharsetRecog_2022CN :public CharsetRecog_2022 +{ +public: + virtual ~CharsetRecog_2022CN(); + + const char* getName() const; + + UBool match(InputText *textIn, CharsetMatch *results) const; +}; + +U_NAMESPACE_END + +#endif +#endif /* __CSR2022_H */ diff --git a/src/core/basetypes/icu-ucsdet/include/csrecog.h b/src/core/basetypes/icu-ucsdet/include/csrecog.h new file mode 100644 index 00000000..6b7573a1 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/csrecog.h @@ -0,0 +1,55 @@ +/* + ********************************************************************** + * Copyright (C) 2005-2012, International Business Machines + * Corporation and others. All Rights Reserved. + ********************************************************************** + */ + +#ifndef __CSRECOG_H +#define __CSRECOG_H + +#include "unicode/uobject.h" + +#if !UCONFIG_NO_CONVERSION + +#include "inputext.h" + +U_NAMESPACE_BEGIN + +class CharsetMatch; + +class CharsetRecognizer : public UMemory +{ + public: + /** + * Get the IANA name of this charset. + * Note that some recognizers can recognize more than one charset, but that this API + * assumes just one name per recognizer. + * TODO: need to account for multiple names in public API that enumerates over the + * known detectable charsets. + * @return the charset name. + */ + virtual const char *getName() const = 0; + + /** + * Get the ISO language code for this charset. + * @return the language code, or <code>null</code> if the language cannot be determined. + */ + virtual const char *getLanguage() const; + + /* + * Try the given input text against this Charset, and fill in the results object + * with the quality of the match plus other information related to the match. + * + * Return TRUE if the the input bytes are a potential match, and + * FALSE if the input data is not compatible with, or illegal in this charset. + */ + virtual UBool match(InputText *textIn, CharsetMatch *results) const = 0; + + virtual ~CharsetRecognizer(); +}; + +U_NAMESPACE_END + +#endif +#endif /* __CSRECOG_H */ diff --git a/src/core/basetypes/icu-ucsdet/include/csrmbcs.h b/src/core/basetypes/icu-ucsdet/include/csrmbcs.h new file mode 100644 index 00000000..9ea9d8f8 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/csrmbcs.h @@ -0,0 +1,205 @@ +/* + ********************************************************************** + * Copyright (C) 2005-2012, International Business Machines + * Corporation and others. All Rights Reserved. + ********************************************************************** + */ + +#ifndef __CSRMBCS_H +#define __CSRMBCS_H + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_CONVERSION + +#include "csrecog.h" + +U_NAMESPACE_BEGIN + +// "Character" iterated character class. +// Recognizers for specific mbcs encodings make their "characters" available +// by providing a nextChar() function that fills in an instance of IteratedChar +// with the next char from the input. +// The returned characters are not converted to Unicode, but remain as the raw +// bytes (concatenated into an int) from the codepage data. +// +// For Asian charsets, use the raw input rather than the input that has been +// stripped of markup. Detection only considers multi-byte chars, effectively +// stripping markup anyway, and double byte chars do occur in markup too. +// +class IteratedChar : public UMemory +{ +public: + uint32_t charValue; // 1-4 bytes from the raw input data + int32_t index; + int32_t nextIndex; + UBool error; + UBool done; + +public: + IteratedChar(); + //void reset(); + int32_t nextByte(InputText* det); +}; + + +class CharsetRecog_mbcs : public CharsetRecognizer { + +protected: + /** + * Test the match of this charset with the input text data + * which is obtained via the CharsetDetector object. + * + * @param det The CharsetDetector, which contains the input text + * to be checked for being in this charset. + * @return Two values packed into one int (Damn java, anyhow) + * <br/> + * bits 0-7: the match confidence, ranging from 0-100 + * <br/> + * bits 8-15: The match reason, an enum-like value. + */ + int32_t match_mbcs(InputText* det, const uint16_t commonChars[], int32_t commonCharsLen) const; + +public: + + virtual ~CharsetRecog_mbcs(); + + /** + * Get the IANA name of this charset. + * @return the charset name. + */ + + const char *getName() const = 0; + const char *getLanguage() const = 0; + UBool match(InputText* input, CharsetMatch *results) const = 0; + + /** + * Get the next character (however many bytes it is) from the input data + * Subclasses for specific charset encodings must implement this function + * to get characters according to the rules of their encoding scheme. + * + * This function is not a method of class IteratedChar only because + * that would require a lot of extra derived classes, which is awkward. + * @param it The IteratedChar "struct" into which the returned char is placed. + * @param det The charset detector, which is needed to get at the input byte data + * being iterated over. + * @return True if a character was returned, false at end of input. + */ + virtual UBool nextChar(IteratedChar *it, InputText *textIn) const = 0; + +}; + + +/** + * Shift-JIS charset recognizer. + * + */ +class CharsetRecog_sjis : public CharsetRecog_mbcs { +public: + virtual ~CharsetRecog_sjis(); + + UBool nextChar(IteratedChar *it, InputText *det) const; + + UBool match(InputText* input, CharsetMatch *results) const; + + const char *getName() const; + const char *getLanguage() const; + +}; + + +/** + * EUC charset recognizers. One abstract class that provides the common function + * for getting the next character according to the EUC encoding scheme, + * and nested derived classes for EUC_KR, EUC_JP, EUC_CN. + * + */ +class CharsetRecog_euc : public CharsetRecog_mbcs +{ +public: + virtual ~CharsetRecog_euc(); + + const char *getName() const = 0; + const char *getLanguage() const = 0; + + UBool match(InputText* input, CharsetMatch *results) const = 0; + /* + * (non-Javadoc) + * Get the next character value for EUC based encodings. + * Character "value" is simply the raw bytes that make up the character + * packed into an int. + */ + UBool nextChar(IteratedChar *it, InputText *det) const; +}; + +/** + * The charset recognize for EUC-JP. A singleton instance of this class + * is created and kept by the public CharsetDetector class + */ +class CharsetRecog_euc_jp : public CharsetRecog_euc +{ +public: + virtual ~CharsetRecog_euc_jp(); + + const char *getName() const; + const char *getLanguage() const; + + UBool match(InputText* input, CharsetMatch *results) const; +}; + +/** + * The charset recognize for EUC-KR. A singleton instance of this class + * is created and kept by the public CharsetDetector class + */ +class CharsetRecog_euc_kr : public CharsetRecog_euc +{ +public: + virtual ~CharsetRecog_euc_kr(); + + const char *getName() const; + const char *getLanguage() const; + + UBool match(InputText* input, CharsetMatch *results) const; +}; + +/** + * + * Big5 charset recognizer. + * + */ +class CharsetRecog_big5 : public CharsetRecog_mbcs +{ +public: + virtual ~CharsetRecog_big5(); + + UBool nextChar(IteratedChar* it, InputText* det) const; + + const char *getName() const; + const char *getLanguage() const; + + UBool match(InputText* input, CharsetMatch *results) const; +}; + + +/** + * + * GB-18030 recognizer. Uses simplified Chinese statistics. + * + */ +class CharsetRecog_gb_18030 : public CharsetRecog_mbcs +{ +public: + virtual ~CharsetRecog_gb_18030(); + + UBool nextChar(IteratedChar* it, InputText* det) const; + + const char *getName() const; + const char *getLanguage() const; + + UBool match(InputText* input, CharsetMatch *results) const; +}; + +U_NAMESPACE_END + +#endif +#endif /* __CSRMBCS_H */ diff --git a/src/core/basetypes/icu-ucsdet/include/csrsbcs.h b/src/core/basetypes/icu-ucsdet/include/csrsbcs.h new file mode 100644 index 00000000..2579c029 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/csrsbcs.h @@ -0,0 +1,287 @@ +/* + ********************************************************************** + * Copyright (C) 2005-2013, International Business Machines + * Corporation and others. All Rights Reserved. + ********************************************************************** + */ + +#ifndef __CSRSBCS_H +#define __CSRSBCS_H + +#include "unicode/uobject.h" + +#if !UCONFIG_NO_CONVERSION + +#include "csrecog.h" + +U_NAMESPACE_BEGIN + +class NGramParser : public UMemory +{ +private: + int32_t ngram; + const int32_t *ngramList; + + int32_t ngramCount; + int32_t hitCount; + +protected: + int32_t byteIndex; + const uint8_t *charMap; + + void addByte(int32_t b); + +public: + NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap); + +private: + /* + * Binary search for value in table, which must have exactly 64 entries. + */ + int32_t search(const int32_t *table, int32_t value); + + void lookup(int32_t thisNgram); + + virtual int32_t nextByte(InputText *det); + virtual void parseCharacters(InputText *det); + +public: + int32_t parse(InputText *det); + +}; + +class NGramParser_IBM420 : public NGramParser +{ +private: + int32_t alef; + int32_t isLamAlef(int32_t b); + int32_t nextByte(InputText *det); + void parseCharacters(InputText *det); + +public: + NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap); +}; + + +class CharsetRecog_sbcs : public CharsetRecognizer +{ +public: + CharsetRecog_sbcs(); + virtual ~CharsetRecog_sbcs(); + virtual const char *getName() const = 0; + virtual UBool match(InputText *det, CharsetMatch *results) const = 0; + virtual int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const; +}; + +class CharsetRecog_8859_1 : public CharsetRecog_sbcs +{ +public: + virtual ~CharsetRecog_8859_1(); + const char *getName() const; + virtual UBool match(InputText *det, CharsetMatch *results) const; +}; + +class CharsetRecog_8859_2 : public CharsetRecog_sbcs +{ +public: + virtual ~CharsetRecog_8859_2(); + const char *getName() const; + virtual UBool match(InputText *det, CharsetMatch *results) const; +}; + +class CharsetRecog_8859_5 : public CharsetRecog_sbcs +{ +public: + virtual ~CharsetRecog_8859_5(); + const char *getName() const; +}; + +class CharsetRecog_8859_6 : public CharsetRecog_sbcs +{ +public: + virtual ~CharsetRecog_8859_6(); + + const char *getName() const; +}; + +class CharsetRecog_8859_7 : public CharsetRecog_sbcs +{ +public: + virtual ~CharsetRecog_8859_7(); + + const char *getName() const; +}; + +class CharsetRecog_8859_8 : public CharsetRecog_sbcs +{ +public: + virtual ~CharsetRecog_8859_8(); + + virtual const char *getName() const; +}; + +class CharsetRecog_8859_9 : public CharsetRecog_sbcs +{ +public: + virtual ~CharsetRecog_8859_9(); + + const char *getName() const; +}; + + + +class CharsetRecog_8859_5_ru : public CharsetRecog_8859_5 +{ +public: + virtual ~CharsetRecog_8859_5_ru(); + + const char *getLanguage() const; + + virtual UBool match(InputText *det, CharsetMatch *results) const; +}; + +class CharsetRecog_8859_6_ar : public CharsetRecog_8859_6 +{ +public: + virtual ~CharsetRecog_8859_6_ar(); + + const char *getLanguage() const; + + virtual UBool match(InputText *det, CharsetMatch *results) const; +}; + +class CharsetRecog_8859_7_el : public CharsetRecog_8859_7 +{ +public: + virtual ~CharsetRecog_8859_7_el(); + + const char *getLanguage() const; + + virtual UBool match(InputText *det, CharsetMatch *results) const; +}; + +class CharsetRecog_8859_8_I_he : public CharsetRecog_8859_8 +{ +public: + virtual ~CharsetRecog_8859_8_I_he(); + + const char *getName() const; + + const char *getLanguage() const; + + virtual UBool match(InputText *det, CharsetMatch *results) const; +}; + +class CharsetRecog_8859_8_he : public CharsetRecog_8859_8 +{ +public: + virtual ~CharsetRecog_8859_8_he (); + + const char *getLanguage() const; + + virtual UBool match(InputText *det, CharsetMatch *results) const; +}; + +class CharsetRecog_8859_9_tr : public CharsetRecog_8859_9 +{ +public: + virtual ~CharsetRecog_8859_9_tr (); + + const char *getLanguage() const; + + virtual UBool match(InputText *det, CharsetMatch *results) const; +}; + +class CharsetRecog_windows_1256 : public CharsetRecog_sbcs +{ +public: + virtual ~CharsetRecog_windows_1256(); + + const char *getName() const; + + const char *getLanguage() const; + + virtual UBool match(InputText *det, CharsetMatch *results) const; +}; + +class CharsetRecog_windows_1251 : public CharsetRecog_sbcs +{ +public: + virtual ~CharsetRecog_windows_1251(); + + const char *getName() const; + + const char *getLanguage() const; + + virtual UBool match(InputText *det, CharsetMatch *results) const; +}; + + +class CharsetRecog_KOI8_R : public CharsetRecog_sbcs +{ +public: + virtual ~CharsetRecog_KOI8_R(); + + const char *getName() const; + + const char *getLanguage() const; + + virtual UBool match(InputText *det, CharsetMatch *results) const; +}; + +class CharsetRecog_IBM424_he : public CharsetRecog_sbcs +{ +public: + virtual ~CharsetRecog_IBM424_he(); + + const char *getLanguage() const; +}; + +class CharsetRecog_IBM424_he_rtl : public CharsetRecog_IBM424_he { +public: + virtual ~CharsetRecog_IBM424_he_rtl(); + + const char *getName() const; + + virtual UBool match(InputText *det, CharsetMatch *results) const; +}; + +class CharsetRecog_IBM424_he_ltr : public CharsetRecog_IBM424_he { + virtual ~CharsetRecog_IBM424_he_ltr(); + + const char *getName() const; + + virtual UBool match(InputText *det, CharsetMatch *results) const; +}; + +class CharsetRecog_IBM420_ar : public CharsetRecog_sbcs +{ +public: + virtual ~CharsetRecog_IBM420_ar(); + + const char *getLanguage() const; + int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const; + +}; + +class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar { +public: + virtual ~CharsetRecog_IBM420_ar_rtl(); + + const char *getName() const; + + virtual UBool match(InputText *det, CharsetMatch *results) const; +}; + +class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar { + virtual ~CharsetRecog_IBM420_ar_ltr(); + + const char *getName() const; + + virtual UBool match(InputText *det, CharsetMatch *results) const; +}; + +U_NAMESPACE_END + +#endif /* !UCONFIG_NO_CONVERSION */ +#endif /* __CSRSBCS_H */ diff --git a/src/core/basetypes/icu-ucsdet/include/csrucode.h b/src/core/basetypes/icu-ucsdet/include/csrucode.h new file mode 100644 index 00000000..a8a4f2bc --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/csrucode.h @@ -0,0 +1,106 @@ +/* + ********************************************************************** + * Copyright (C) 2005-2012, International Business Machines + * Corporation and others. All Rights Reserved. + ********************************************************************** + */ + +#ifndef __CSRUCODE_H +#define __CSRUCODE_H + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_CONVERSION + +#include "csrecog.h" + +U_NAMESPACE_BEGIN + +/** + * This class matches UTF-16 and UTF-32, both big- and little-endian. The + * BOM will be used if it is present. + * + * @internal + */ +class CharsetRecog_Unicode : public CharsetRecognizer +{ + +public: + + virtual ~CharsetRecog_Unicode(); + /* (non-Javadoc) + * @see com.ibm.icu.text.CharsetRecognizer#getName() + */ + const char* getName() const = 0; + + /* (non-Javadoc) + * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector) + */ + UBool match(InputText* textIn, CharsetMatch *results) const = 0; +}; + + +class CharsetRecog_UTF_16_BE : public CharsetRecog_Unicode +{ +public: + + virtual ~CharsetRecog_UTF_16_BE(); + + const char *getName() const; + + UBool match(InputText* textIn, CharsetMatch *results) const; +}; + +class CharsetRecog_UTF_16_LE : public CharsetRecog_Unicode +{ +public: + + virtual ~CharsetRecog_UTF_16_LE(); + + const char *getName() const; + + UBool match(InputText* textIn, CharsetMatch *results) const; +}; + +class CharsetRecog_UTF_32 : public CharsetRecog_Unicode +{ +protected: + virtual int32_t getChar(const uint8_t *input, int32_t index) const = 0; +public: + + virtual ~CharsetRecog_UTF_32(); + + const char* getName() const = 0; + + UBool match(InputText* textIn, CharsetMatch *results) const; +}; + + +class CharsetRecog_UTF_32_BE : public CharsetRecog_UTF_32 +{ +protected: + int32_t getChar(const uint8_t *input, int32_t index) const; + +public: + + virtual ~CharsetRecog_UTF_32_BE(); + + const char *getName() const; +}; + + +class CharsetRecog_UTF_32_LE : public CharsetRecog_UTF_32 +{ +protected: + int32_t getChar(const uint8_t *input, int32_t index) const; + +public: + virtual ~CharsetRecog_UTF_32_LE(); + + const char* getName() const; +}; + +U_NAMESPACE_END + +#endif +#endif /* __CSRUCODE_H */ diff --git a/src/core/basetypes/icu-ucsdet/include/csrutf8.h b/src/core/basetypes/icu-ucsdet/include/csrutf8.h new file mode 100644 index 00000000..82e8f9d7 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/csrutf8.h @@ -0,0 +1,42 @@ +/* + ********************************************************************** + * Copyright (C) 2005-2012, International Business Machines + * Corporation and others. All Rights Reserved. + ********************************************************************** + */ + +#ifndef __CSRUTF8_H +#define __CSRUTF8_H + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_CONVERSION + +#include "csrecog.h" + +U_NAMESPACE_BEGIN + +/** + * Charset recognizer for UTF-8 + * + * @internal + */ +class CharsetRecog_UTF8: public CharsetRecognizer { + + public: + + virtual ~CharsetRecog_UTF8(); + + const char *getName() const; + + /* (non-Javadoc) + * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector) + */ + UBool match(InputText *input, CharsetMatch *results) const; + +}; + +U_NAMESPACE_END + +#endif +#endif /* __CSRUTF8_H */ diff --git a/src/core/basetypes/icu-ucsdet/include/cstring.h b/src/core/basetypes/icu-ucsdet/include/cstring.h new file mode 100644 index 00000000..64b68ffa --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/cstring.h @@ -0,0 +1,140 @@ +/* +****************************************************************************** +* +* Copyright (C) 1997-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* +* File CSTRING.H +* +* Contains CString interface +* +* @author Helena Shih +* +* Modification History: +* +* Date Name Description +* 6/17/98 hshih Created. +* 05/03/99 stephen Changed from functions to macros. +* 06/14/99 stephen Added icu_strncat, icu_strncmp, icu_tolower +* +****************************************************************************** +*/ + +#ifndef CSTRING_H +#define CSTRING_H 1 + +#include "unicode/utypes.h" +#include "cmemory.h" +#include <string.h> +#include <stdlib.h> +#include <ctype.h> + +#define uprv_strcpy(dst, src) U_STANDARD_CPP_NAMESPACE strcpy(dst, src) +#define uprv_strlen(str) U_STANDARD_CPP_NAMESPACE strlen(str) +#define uprv_strcmp(s1, s2) U_STANDARD_CPP_NAMESPACE strcmp(s1, s2) +#define uprv_strcat(dst, src) U_STANDARD_CPP_NAMESPACE strcat(dst, src) +#define uprv_strchr(s, c) U_STANDARD_CPP_NAMESPACE strchr(s, c) +#define uprv_strstr(s, c) U_STANDARD_CPP_NAMESPACE strstr(s, c) +#define uprv_strrchr(s, c) U_STANDARD_CPP_NAMESPACE strrchr(s, c) + +#if U_DEBUG + +#define uprv_strncpy(dst, src, size) ( \ + uprv_checkValidMemory(src, 1), \ + U_STANDARD_CPP_NAMESPACE strncpy(dst, src, size)) +#define uprv_strncmp(s1, s2, n) ( \ + uprv_checkValidMemory(s1, 1), \ + uprv_checkValidMemory(s2, 1), \ + U_STANDARD_CPP_NAMESPACE strncmp(s1, s2, n)) +#define uprv_strncat(dst, src, n) ( \ + uprv_checkValidMemory(src, 1), \ + U_STANDARD_CPP_NAMESPACE strncat(dst, src, n)) + +#else + +#define uprv_strncpy(dst, src, size) U_STANDARD_CPP_NAMESPACE strncpy(dst, src, size) +#define uprv_strncmp(s1, s2, n) U_STANDARD_CPP_NAMESPACE strncmp(s1, s2, n) +#define uprv_strncat(dst, src, n) U_STANDARD_CPP_NAMESPACE strncat(dst, src, n) + +#endif /* U_DEBUG */ + +/** + * Is c an ASCII-repertoire letter a-z or A-Z? + * Note: The implementation is specific to whether ICU is compiled for + * an ASCII-based or EBCDIC-based machine. There just does not seem to be a better name for this. + */ +U_CAPI UBool U_EXPORT2 +uprv_isASCIILetter(char c); + +U_CAPI char U_EXPORT2 +uprv_toupper(char c); + + +U_CAPI char U_EXPORT2 +uprv_asciitolower(char c); + +U_CAPI char U_EXPORT2 +uprv_ebcdictolower(char c); + +#if U_CHARSET_FAMILY==U_ASCII_FAMILY +# define uprv_tolower uprv_asciitolower +#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY +# define uprv_tolower uprv_ebcdictolower +#else +# error U_CHARSET_FAMILY is not valid +#endif + +#define uprv_strtod(source, end) U_STANDARD_CPP_NAMESPACE strtod(source, end) +#define uprv_strtoul(str, end, base) U_STANDARD_CPP_NAMESPACE strtoul(str, end, base) +#define uprv_strtol(str, end, base) U_STANDARD_CPP_NAMESPACE strtol(str, end, base) + +/* Conversion from a digit to the character with radix base from 2-19 */ +/* May need to use U_UPPER_ORDINAL*/ +#define T_CString_itosOffset(a) ((a)<=9?('0'+(a)):('A'+(a)-10)) + +U_CAPI char* U_EXPORT2 +uprv_strdup(const char *src); + +/** + * uprv_malloc n+1 bytes, and copy n bytes from src into the new string. + * Terminate with a null at offset n. If n is -1, works like uprv_strdup + * @param src + * @param n length of the input string, not including null. + * @return new string (owned by caller, use uprv_free to free). + * @internal + */ +U_CAPI char* U_EXPORT2 +uprv_strndup(const char *src, int32_t n); + +U_CAPI char* U_EXPORT2 +T_CString_toLowerCase(char* str); + +U_CAPI char* U_EXPORT2 +T_CString_toUpperCase(char* str); + +U_CAPI int32_t U_EXPORT2 +T_CString_integerToString(char *buffer, int32_t n, int32_t radix); + +U_CAPI int32_t U_EXPORT2 +T_CString_int64ToString(char *buffer, int64_t n, uint32_t radix); + +U_CAPI int32_t U_EXPORT2 +T_CString_stringToInteger(const char *integerString, int32_t radix); + +/** + * Case-insensitive, language-independent string comparison + * limited to the ASCII character repertoire. + */ +U_CAPI int U_EXPORT2 +uprv_stricmp(const char *str1, const char *str2); + +/** + * Case-insensitive, language-independent string comparison + * limited to the ASCII character repertoire. + */ +U_CAPI int U_EXPORT2 +uprv_strnicmp(const char *str1, const char *str2, uint32_t n); + +#endif /* ! CSTRING_H */ diff --git a/src/core/basetypes/icu-ucsdet/include/cwchar.h b/src/core/basetypes/icu-ucsdet/include/cwchar.h new file mode 100644 index 00000000..2ab36c03 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/cwchar.h @@ -0,0 +1,56 @@ +/* +****************************************************************************** +* +* Copyright (C) 2001, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* file name: cwchar.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2001may25 +* created by: Markus W. Scherer +* +* This file contains ICU-internal definitions of wchar_t operations. +* These definitions were moved here from cstring.h so that fewer +* ICU implementation files include wchar.h. +*/ + +#ifndef __CWCHAR_H__ +#define __CWCHAR_H__ + +#include <string.h> +#include <stdlib.h> +#include "unicode/utypes.h" + +/* Do this after utypes.h so that we have U_HAVE_WCHAR_H . */ +#if U_HAVE_WCHAR_H +# include <wchar.h> +#endif + +/*===========================================================================*/ +/* Wide-character functions */ +/*===========================================================================*/ + +/* The following are not available on all systems, defined in wchar.h or string.h. */ +#if U_HAVE_WCSCPY +# define uprv_wcscpy wcscpy +# define uprv_wcscat wcscat +# define uprv_wcslen wcslen +#else +U_CAPI wchar_t* U_EXPORT2 +uprv_wcscpy(wchar_t *dst, const wchar_t *src); +U_CAPI wchar_t* U_EXPORT2 +uprv_wcscat(wchar_t *dst, const wchar_t *src); +U_CAPI size_t U_EXPORT2 +uprv_wcslen(const wchar_t *src); +#endif + +/* The following are part of the ANSI C standard, defined in stdlib.h . */ +#define uprv_wcstombs(mbstr, wcstr, count) U_STANDARD_CPP_NAMESPACE wcstombs(mbstr, wcstr, count) +#define uprv_mbstowcs(wcstr, mbstr, count) U_STANDARD_CPP_NAMESPACE mbstowcs(wcstr, mbstr, count) + + +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/inputext.h b/src/core/basetypes/icu-ucsdet/include/inputext.h new file mode 100644 index 00000000..0c5973d8 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/inputext.h @@ -0,0 +1,61 @@ +/* + ********************************************************************** + * Copyright (C) 2005-2008, International Business Machines + * Corporation and others. All Rights Reserved. + ********************************************************************** + */ + +#ifndef __INPUTEXT_H +#define __INPUTEXT_H + +/** + * \file + * \internal + * + * This is an internal header for the Character Set Detection code. The + * name is probably too generic... + */ + + +#include "unicode/uobject.h" + +#if !UCONFIG_NO_CONVERSION + +U_NAMESPACE_BEGIN + +class InputText : public UMemory +{ + // Prevent copying + InputText(const InputText &); +public: + InputText(UErrorCode &status); + ~InputText(); + + void setText(const char *in, int32_t len); + void setDeclaredEncoding(const char *encoding, int32_t len); + UBool isSet() const; + void MungeInput(UBool fStripTags); + + // The text to be checked. Markup will have been + // removed if appropriate. + uint8_t *fInputBytes; + int32_t fInputLen; // Length of the byte data in fInputBytes. + // byte frequency statistics for the input text. + // Value is percent, not absolute. + // Value is rounded up, so zero really means zero occurences. + int16_t *fByteStats; + UBool fC1Bytes; // True if any bytes in the range 0x80 - 0x9F are in the input;false by default + char *fDeclaredEncoding; + + const uint8_t *fRawInput; // Original, untouched input bytes. + // If user gave us a byte array, this is it. + // If user gave us a stream, it's read to a + // buffer here. + int32_t fRawLength; // Length of data in fRawInput array. + +}; + +U_NAMESPACE_END + +#endif +#endif /* __INPUTEXT_H */ diff --git a/src/core/basetypes/icu-ucsdet/include/locmap.h b/src/core/basetypes/icu-ucsdet/include/locmap.h new file mode 100644 index 00000000..214bbcec --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/locmap.h @@ -0,0 +1,37 @@ +/* +****************************************************************************** +* +* Copyright (C) 1996-2013, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* +* File locmap.h : Locale Mapping Classes +* +* +* Created by: Helena Shih +* +* Modification History: +* +* Date Name Description +* 3/11/97 aliu Added setId(). +* 4/20/99 Madhu Added T_convertToPosix() +* 09/18/00 george Removed the memory leaks. +* 08/23/01 george Convert to C +*============================================================================ +*/ + +#ifndef LOCMAP_H +#define LOCMAP_H + +#include "unicode/utypes.h" + +#define LANGUAGE_LCID(hostID) (uint16_t)(0x03FF & hostID) + +U_CAPI int32_t uprv_convertToPosix(uint32_t hostid, char* posixID, int32_t posixIDCapacity, UErrorCode* status); + +/* Don't call this function directly. Use uloc_getLCID instead. */ +U_CAPI uint32_t uprv_convertToLCID(const char *langID, const char* posixID, UErrorCode* status); + +#endif /* LOCMAP_H */ + diff --git a/src/core/basetypes/icu-ucsdet/include/mutex.h b/src/core/basetypes/icu-ucsdet/include/mutex.h new file mode 100644 index 00000000..07dcdbbc --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/mutex.h @@ -0,0 +1,77 @@ +/* +****************************************************************************** +* +* Copyright (C) 1997-2013, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +*/ +//---------------------------------------------------------------------------- +// File: mutex.h +// +// Lightweight C++ wrapper for umtx_ C mutex functions +// +// Author: Alan Liu 1/31/97 +// History: +// 06/04/97 helena Updated setImplementation as per feedback from 5/21 drop. +// 04/07/1999 srl refocused as a thin wrapper +// +//---------------------------------------------------------------------------- +#ifndef MUTEX_H +#define MUTEX_H + +#include "unicode/utypes.h" +#include "unicode/uobject.h" +#include "umutex.h" + +U_NAMESPACE_BEGIN + +//---------------------------------------------------------------------------- +// Code within that accesses shared static or global data should +// should instantiate a Mutex object while doing so. You should make your own +// private mutex where possible. + +// For example: +// +// UMutex myMutex; +// +// void Function(int arg1, int arg2) +// { +// static Object* foo; // Shared read-write object +// Mutex mutex(&myMutex); // or no args for the global lock +// foo->Method(); +// // When 'mutex' goes out of scope and gets destroyed here, the lock is released +// } +// +// Note: Do NOT use the form 'Mutex mutex();' as that merely forward-declares a function +// returning a Mutex. This is a common mistake which silently slips through the +// compiler!! +// + +class U_COMMON_API Mutex : public UMemory { +public: + inline Mutex(UMutex *mutex = NULL); + inline ~Mutex(); + +private: + UMutex *fMutex; + + Mutex(const Mutex &other); // forbid copying of this class + Mutex &operator=(const Mutex &other); // forbid copying of this class +}; + +inline Mutex::Mutex(UMutex *mutex) + : fMutex(mutex) +{ + umtx_lock(fMutex); +} + +inline Mutex::~Mutex() +{ + umtx_unlock(fMutex); +} + +U_NAMESPACE_END + +#endif //_MUTEX_ +//eof diff --git a/src/core/basetypes/icu-ucsdet/include/putilimp.h b/src/core/basetypes/icu-ucsdet/include/putilimp.h new file mode 100644 index 00000000..d2c1c66f --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/putilimp.h @@ -0,0 +1,611 @@ +/* +****************************************************************************** +* +* Copyright (C) 1997-2014, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* +* FILE NAME : putilimp.h +* +* Date Name Description +* 10/17/04 grhoten Move internal functions from putil.h to this file. +****************************************************************************** +*/ + +#ifndef PUTILIMP_H +#define PUTILIMP_H + +#include "unicode/utypes.h" +#include "unicode/putil.h" + +/** + * \def U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC + * Nearly all CPUs and compilers implement a right-shift of a signed integer + * as an Arithmetic Shift Right which copies the sign bit (the Most Significant Bit (MSB)) + * into the vacated bits (sign extension). + * For example, (int32_t)0xfff5fff3>>4 becomes 0xffff5fff and -1>>1=-1. + * + * This can be useful for storing a signed value in the upper bits + * and another bit field in the lower bits. + * The signed value can be retrieved by simple right-shifting. + * + * This is consistent with the Java language. + * + * However, the C standard allows compilers to implement a right-shift of a signed integer + * as a Logical Shift Right which copies a 0 into the vacated bits. + * For example, (int32_t)0xfff5fff3>>4 becomes 0x0fff5fff and -1>>1=0x7fffffff. + * + * Code that depends on the natural behavior should be guarded with this macro, + * with an alternate path for unusual platforms. + * @internal + */ +#ifdef U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC + /* Use the predefined value. */ +#else + /* + * Nearly all CPUs & compilers implement a right-shift of a signed integer + * as an Arithmetic Shift Right (with sign extension). + */ +# define U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC 1 +#endif + +/** Define this to 1 if your platform supports IEEE 754 floating point, + to 0 if it does not. */ +#ifndef IEEE_754 +# define IEEE_754 1 +#endif + +/** + * uintptr_t is an optional part of the standard definitions in stdint.h. + * The opengroup.org documentation for stdint.h says + * "On XSI-conformant systems, the intptr_t and uintptr_t types are required; + * otherwise, they are optional." + * We assume that when uintptr_t is defined, UINTPTR_MAX is defined as well. + * + * Do not use ptrdiff_t since it is signed. size_t is unsigned. + */ +/* TODO: This check fails on some z environments. Filed a ticket #9357 for this. */ +#if !defined(__intptr_t_defined) && !defined(UINTPTR_MAX) && (U_PLATFORM != U_PF_OS390) +typedef size_t uintptr_t; +#endif + +/** + * \def U_HAVE_MSVC_2003_OR_EARLIER + * Flag for workaround of MSVC 2003 optimization bugs + * @internal + */ +#if !defined(U_HAVE_MSVC_2003_OR_EARLIER) && defined(_MSC_VER) && (_MSC_VER < 1400) +#define U_HAVE_MSVC_2003_OR_EARLIER +#endif + +/*===========================================================================*/ +/** @{ Information about POSIX support */ +/*===========================================================================*/ + +#ifdef U_HAVE_NL_LANGINFO_CODESET + /* Use the predefined value. */ +#elif U_PLATFORM_HAS_WIN32_API || U_PLATFORM == U_PF_ANDROID || U_PLATFORM == U_PF_QNX +# define U_HAVE_NL_LANGINFO_CODESET 0 +#else +# define U_HAVE_NL_LANGINFO_CODESET 1 +#endif + +#ifdef U_NL_LANGINFO_CODESET + /* Use the predefined value. */ +#elif !U_HAVE_NL_LANGINFO_CODESET +# define U_NL_LANGINFO_CODESET -1 +#elif U_PLATFORM == U_PF_OS400 + /* not defined */ +#else +# define U_NL_LANGINFO_CODESET CODESET +#endif + +#ifdef U_TZSET + /* Use the predefined value. */ +#elif U_PLATFORM_USES_ONLY_WIN32_API +# define U_TZSET _tzset +#elif U_PLATFORM == U_PF_OS400 + /* not defined */ +#else +# define U_TZSET tzset +#endif + +#if defined(U_TIMEZONE) || defined(U_HAVE_TIMEZONE) + /* Use the predefined value. */ +#elif U_PLATFORM == U_PF_ANDROID +# define U_TIMEZONE timezone +#elif U_PLATFORM_IS_LINUX_BASED +# if defined(__UCLIBC__) + /* uClibc does not have __timezone or _timezone. */ +# elif defined(_NEWLIB_VERSION) +# define U_TIMEZONE _timezone +# elif defined(__GLIBC__) + /* glibc */ +# define U_TIMEZONE __timezone +# endif +#elif U_PLATFORM_USES_ONLY_WIN32_API +# define U_TIMEZONE _timezone +#elif U_PLATFORM == U_PF_BSD && !defined(__NetBSD__) + /* not defined */ +#elif U_PLATFORM == U_PF_OS400 + /* not defined */ +#elif U_PLATFORM == U_PF_IPHONE + /* not defined */ +#else +# define U_TIMEZONE timezone +#endif + +#ifdef U_TZNAME + /* Use the predefined value. */ +#elif U_PLATFORM_USES_ONLY_WIN32_API +# define U_TZNAME _tzname +#elif U_PLATFORM == U_PF_OS400 + /* not defined */ +#else +# define U_TZNAME tzname +#endif + +#ifdef U_HAVE_MMAP + /* Use the predefined value. */ +#elif U_PLATFORM_HAS_WIN32_API +# define U_HAVE_MMAP 0 +#else +# define U_HAVE_MMAP 1 +#endif + +#ifdef U_HAVE_POPEN + /* Use the predefined value. */ +#elif U_PLATFORM_USES_ONLY_WIN32_API +# define U_HAVE_POPEN 0 +#elif U_PLATFORM == U_PF_OS400 +# define U_HAVE_POPEN 0 +#else +# define U_HAVE_POPEN 1 +#endif + +/** + * \def U_HAVE_DIRENT_H + * Defines whether dirent.h is available. + * @internal + */ +#ifdef U_HAVE_DIRENT_H + /* Use the predefined value. */ +#elif U_PLATFORM_HAS_WIN32_API +# define U_HAVE_DIRENT_H 0 +#else +# define U_HAVE_DIRENT_H 1 +#endif + +/** @} */ + +/*===========================================================================*/ +/** @{ GCC built in functions for atomic memory operations */ +/*===========================================================================*/ + +/** + * \def U_HAVE_GCC_ATOMICS + * @internal + */ +#ifdef U_HAVE_GCC_ATOMICS + /* Use the predefined value. */ +#elif U_PLATFORM == U_PF_MINGW + #define U_HAVE_GCC_ATOMICS 0 +#elif U_GCC_MAJOR_MINOR >= 404 || defined(__clang__) + /* TODO: Intel icc and IBM xlc on AIX also support gcc atomics. (Intel originated them.) + * Add them for these compilers. + * Note: Clang sets __GNUC__ defines for version 4.2, so misses the 4.4 test here. + */ +# define U_HAVE_GCC_ATOMICS 1 +#else +# define U_HAVE_GCC_ATOMICS 0 +#endif + +/** @} */ + +/** + * \def U_HAVE_STD_ATOMICS + * Defines whether the standard C++11 <atomic> is available. + * ICU will use this when avialable, + * otherwise will fall back to compiler or platform specific alternatives. + * @internal + */ +#ifdef U_HAVE_STD_ATOMICS + /* Use the predefined value. */ +#elif !defined(__cplusplus) || __cplusplus<201103L + /* Not C++11, disable use of atomics */ +# define U_HAVE_STD_ATOMICS 0 +#elif __clang__ && __clang_major__==3 && __clang_minor__<=1 + /* Clang 3.1, has atomic variable initializer bug. */ +# define U_HAVE_STD_ATOMICS 0 +#else + /* U_HAVE_ATOMIC is typically set by an autoconf test of #include <atomic> */ + /* Can be set manually, or left undefined, on platforms without autoconf. */ +# if defined(U_HAVE_ATOMIC) && U_HAVE_ATOMIC +# define U_HAVE_STD_ATOMICS 1 +# else +# define U_HAVE_STD_ATOMICS 0 +# endif +#endif + + +/*===========================================================================*/ +/** @{ Code alignment */ +/*===========================================================================*/ + +/** + * \def U_ALIGN_CODE + * This is used to align code fragments to a specific byte boundary. + * This is useful for getting consistent performance test results. + * @internal + */ +#ifdef U_ALIGN_CODE + /* Use the predefined value. */ +#elif defined(_MSC_VER) && defined(_M_IX86) && !defined(_MANAGED) +# define U_ALIGN_CODE(boundarySize) __asm align boundarySize +#else +# define U_ALIGN_CODE(boundarySize) +#endif + +/** @} */ + +/*===========================================================================*/ +/** @{ Programs used by ICU code */ +/*===========================================================================*/ + +/** + * \def U_MAKE_IS_NMAKE + * Defines whether the "make" program is Windows nmake. + */ +#ifdef U_MAKE_IS_NMAKE + /* Use the predefined value. */ +#elif U_PLATFORM == U_PF_WINDOWS +# define U_MAKE_IS_NMAKE 1 +#else +# define U_MAKE_IS_NMAKE 0 +#endif + +/** @} */ + +/*==========================================================================*/ +/* Platform utilities */ +/*==========================================================================*/ + +/** + * Platform utilities isolates the platform dependencies of the + * libarary. For each platform which this code is ported to, these + * functions may have to be re-implemented. + */ + +/** + * Floating point utility to determine if a double is Not a Number (NaN). + * @internal + */ +U_INTERNAL UBool U_EXPORT2 uprv_isNaN(double d); +/** + * Floating point utility to determine if a double has an infinite value. + * @internal + */ +U_INTERNAL UBool U_EXPORT2 uprv_isInfinite(double d); +/** + * Floating point utility to determine if a double has a positive infinite value. + * @internal + */ +U_INTERNAL UBool U_EXPORT2 uprv_isPositiveInfinity(double d); +/** + * Floating point utility to determine if a double has a negative infinite value. + * @internal + */ +U_INTERNAL UBool U_EXPORT2 uprv_isNegativeInfinity(double d); +/** + * Floating point utility that returns a Not a Number (NaN) value. + * @internal + */ +U_INTERNAL double U_EXPORT2 uprv_getNaN(void); +/** + * Floating point utility that returns an infinite value. + * @internal + */ +U_INTERNAL double U_EXPORT2 uprv_getInfinity(void); + +/** + * Floating point utility to truncate a double. + * @internal + */ +U_INTERNAL double U_EXPORT2 uprv_trunc(double d); +/** + * Floating point utility to calculate the floor of a double. + * @internal + */ +U_INTERNAL double U_EXPORT2 uprv_floor(double d); +/** + * Floating point utility to calculate the ceiling of a double. + * @internal + */ +U_INTERNAL double U_EXPORT2 uprv_ceil(double d); +/** + * Floating point utility to calculate the absolute value of a double. + * @internal + */ +U_INTERNAL double U_EXPORT2 uprv_fabs(double d); +/** + * Floating point utility to calculate the fractional and integer parts of a double. + * @internal + */ +U_INTERNAL double U_EXPORT2 uprv_modf(double d, double* pinteger); +/** + * Floating point utility to calculate the remainder of a double divided by another double. + * @internal + */ +U_INTERNAL double U_EXPORT2 uprv_fmod(double d, double y); +/** + * Floating point utility to calculate d to the power of exponent (d^exponent). + * @internal + */ +U_INTERNAL double U_EXPORT2 uprv_pow(double d, double exponent); +/** + * Floating point utility to calculate 10 to the power of exponent (10^exponent). + * @internal + */ +U_INTERNAL double U_EXPORT2 uprv_pow10(int32_t exponent); +/** + * Floating point utility to calculate the maximum value of two doubles. + * @internal + */ +U_INTERNAL double U_EXPORT2 uprv_fmax(double d, double y); +/** + * Floating point utility to calculate the minimum value of two doubles. + * @internal + */ +U_INTERNAL double U_EXPORT2 uprv_fmin(double d, double y); +/** + * Private utility to calculate the maximum value of two integers. + * @internal + */ +U_INTERNAL int32_t U_EXPORT2 uprv_max(int32_t d, int32_t y); +/** + * Private utility to calculate the minimum value of two integers. + * @internal + */ +U_INTERNAL int32_t U_EXPORT2 uprv_min(int32_t d, int32_t y); + +#if U_IS_BIG_ENDIAN +# define uprv_isNegative(number) (*((signed char *)&(number))<0) +#else +# define uprv_isNegative(number) (*((signed char *)&(number)+sizeof(number)-1)<0) +#endif + +/** + * Return the largest positive number that can be represented by an integer + * type of arbitrary bit length. + * @internal + */ +U_INTERNAL double U_EXPORT2 uprv_maxMantissa(void); + +/** + * Floating point utility to calculate the logarithm of a double. + * @internal + */ +U_INTERNAL double U_EXPORT2 uprv_log(double d); + +/** + * Does common notion of rounding e.g. uprv_floor(x + 0.5); + * @param x the double number + * @return the rounded double + * @internal + */ +U_INTERNAL double U_EXPORT2 uprv_round(double x); + +#if 0 +/** + * Returns the number of digits after the decimal point in a double number x. + * + * @param x the double number + * @return the number of digits after the decimal point in a double number x. + * @internal + */ +/*U_INTERNAL int32_t U_EXPORT2 uprv_digitsAfterDecimal(double x);*/ +#endif + +#if !U_CHARSET_IS_UTF8 +/** + * Please use ucnv_getDefaultName() instead. + * Return the default codepage for this platform and locale. + * This function can call setlocale() on Unix platforms. Please read the + * platform documentation on setlocale() before calling this function. + * @return the default codepage for this platform + * @internal + */ +U_INTERNAL const char* U_EXPORT2 uprv_getDefaultCodepage(void); +#endif + +/** + * Please use uloc_getDefault() instead. + * Return the default locale ID string by querying ths system, or + * zero if one cannot be found. + * This function can call setlocale() on Unix platforms. Please read the + * platform documentation on setlocale() before calling this function. + * @return the default locale ID string + * @internal + */ +U_INTERNAL const char* U_EXPORT2 uprv_getDefaultLocaleID(void); + +/** + * Time zone utilities + * + * Wrappers for C runtime library functions relating to timezones. + * The t_tzset() function (similar to tzset) uses the current setting + * of the environment variable TZ to assign values to three global + * variables: daylight, timezone, and tzname. These variables have the + * following meanings, and are declared in <time.h>. + * + * daylight Nonzero if daylight-saving-time zone (DST) is specified + * in TZ; otherwise, 0. Default value is 1. + * timezone Difference in seconds between coordinated universal + * time and local time. E.g., -28,800 for PST (GMT-8hrs) + * tzname(0) Three-letter time-zone name derived from TZ environment + * variable. E.g., "PST". + * tzname(1) Three-letter DST zone name derived from TZ environment + * variable. E.g., "PDT". If DST zone is omitted from TZ, + * tzname(1) is an empty string. + * + * Notes: For example, to set the TZ environment variable to correspond + * to the current time zone in Germany, you can use one of the + * following statements: + * + * set TZ=GST1GDT + * set TZ=GST+1GDT + * + * If the TZ value is not set, t_tzset() attempts to use the time zone + * information specified by the operating system. Under Windows NT + * and Windows 95, this information is specified in the Control Panel's + * Date/Time application. + * @internal + */ +U_INTERNAL void U_EXPORT2 uprv_tzset(void); + +/** + * Difference in seconds between coordinated universal + * time and local time. E.g., -28,800 for PST (GMT-8hrs) + * @return the difference in seconds between coordinated universal time and local time. + * @internal + */ +U_INTERNAL int32_t U_EXPORT2 uprv_timezone(void); + +/** + * tzname(0) Three-letter time-zone name derived from TZ environment + * variable. E.g., "PST". + * tzname(1) Three-letter DST zone name derived from TZ environment + * variable. E.g., "PDT". If DST zone is omitted from TZ, + * tzname(1) is an empty string. + * @internal + */ +U_INTERNAL const char* U_EXPORT2 uprv_tzname(int n); + +/** + * Get UTC (GMT) time measured in milliseconds since 0:00 on 1/1/1970. + * This function is affected by 'faketime' and should be the bottleneck for all user-visible ICU time functions. + * @return the UTC time measured in milliseconds + * @internal + */ +U_INTERNAL UDate U_EXPORT2 uprv_getUTCtime(void); + +/** + * Get UTC (GMT) time measured in milliseconds since 0:00 on 1/1/1970. + * This function is not affected by 'faketime', so it should only be used by low level test functions- not by anything that + * exposes time to the end user. + * @return the UTC time measured in milliseconds + * @internal + */ +U_INTERNAL UDate U_EXPORT2 uprv_getRawUTCtime(void); + +/** + * Determine whether a pathname is absolute or not, as defined by the platform. + * @param path Pathname to test + * @return TRUE if the path is absolute + * @internal (ICU 3.0) + */ +U_INTERNAL UBool U_EXPORT2 uprv_pathIsAbsolute(const char *path); + +/** + * Use U_MAX_PTR instead of this function. + * @param void pointer to test + * @return the largest possible pointer greater than the base + * @internal (ICU 3.8) + */ +U_INTERNAL void * U_EXPORT2 uprv_maximumPtr(void *base); + +/** + * Maximum value of a (void*) - use to indicate the limit of an 'infinite' buffer. + * In fact, buffer sizes must not exceed 2GB so that the difference between + * the buffer limit and the buffer start can be expressed in an int32_t. + * + * The definition of U_MAX_PTR must fulfill the following conditions: + * - return the largest possible pointer greater than base + * - return a valid pointer according to the machine architecture (AS/400, 64-bit, etc.) + * - avoid wrapping around at high addresses + * - make sure that the returned pointer is not farther from base than 0x7fffffff bytes + * + * @param base The beginning of a buffer to find the maximum offset from + * @internal + */ +#ifndef U_MAX_PTR +# if U_PLATFORM == U_PF_OS390 && !defined(_LP64) + /* We have 31-bit pointers. */ +# define U_MAX_PTR(base) ((void *)0x7fffffff) +# elif U_PLATFORM == U_PF_OS400 +# define U_MAX_PTR(base) uprv_maximumPtr((void *)base) +# elif 0 + /* + * For platforms where pointers are scalar values (which is normal, but unlike i5/OS) + * but that do not define uintptr_t. + * + * However, this does not work on modern compilers: + * The C++ standard does not define pointer overflow, and allows compilers to + * assume that p+u>p for any pointer p and any integer u>0. + * Thus, modern compilers optimize away the ">" comparison. + * (See ICU tickets #7187 and #8096.) + */ +# define U_MAX_PTR(base) \ + ((void *)(((char *)(base)+0x7fffffffu) > (char *)(base) \ + ? ((char *)(base)+0x7fffffffu) \ + : (char *)-1)) +# else + /* Default version. C++ standard compliant for scalar pointers. */ +# define U_MAX_PTR(base) \ + ((void *)(((uintptr_t)(base)+0x7fffffffu) > (uintptr_t)(base) \ + ? ((uintptr_t)(base)+0x7fffffffu) \ + : (uintptr_t)-1)) +# endif +#endif + +/* Dynamic Library Functions */ + +typedef void (UVoidFunction)(void); + +#if U_ENABLE_DYLOAD +/** + * Load a library + * @internal (ICU 4.4) + */ +U_INTERNAL void * U_EXPORT2 uprv_dl_open(const char *libName, UErrorCode *status); + +/** + * Close a library + * @internal (ICU 4.4) + */ +U_INTERNAL void U_EXPORT2 uprv_dl_close( void *lib, UErrorCode *status); + +/** + * Extract a symbol from a library (function) + * @internal (ICU 4.8) + */ +U_INTERNAL UVoidFunction* U_EXPORT2 uprv_dlsym_func( void *lib, const char *symbolName, UErrorCode *status); + +/** + * Extract a symbol from a library (function) + * Not implemented, no clients. + * @internal + */ +/* U_INTERNAL void * U_EXPORT2 uprv_dlsym_data( void *lib, const char *symbolName, UErrorCode *status); */ + +#endif + +/** + * Define malloc and related functions + * @internal + */ +#if U_PLATFORM == U_PF_OS400 +# define uprv_default_malloc(x) _C_TS_malloc(x) +# define uprv_default_realloc(x,y) _C_TS_realloc(x,y) +# define uprv_default_free(x) _C_TS_free(x) +/* also _C_TS_calloc(x) */ +#else +/* C defaults */ +# define uprv_default_malloc(x) malloc(x) +# define uprv_default_realloc(x,y) realloc(x,y) +# define uprv_default_free(x) free(x) +#endif + + +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/uarrsort.h b/src/core/basetypes/icu-ucsdet/include/uarrsort.h new file mode 100644 index 00000000..aece6da9 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/uarrsort.h @@ -0,0 +1,101 @@ +/* +******************************************************************************* +* +* Copyright (C) 2003-2013, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: uarrsort.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2003aug04 +* created by: Markus W. Scherer +* +* Internal function for sorting arrays. +*/ + +#ifndef __UARRSORT_H__ +#define __UARRSORT_H__ + +#include "unicode/utypes.h" + +U_CDECL_BEGIN +/** + * Function type for comparing two items as part of sorting an array or similar. + * Callback function for uprv_sortArray(). + * + * @param context Application-specific pointer, passed through by uprv_sortArray(). + * @param left Pointer to the "left" item. + * @param right Pointer to the "right" item. + * @return 32-bit signed integer comparison result: + * <0 if left<right + * ==0 if left==right + * >0 if left>right + * + * @internal + */ +typedef int32_t U_CALLCONV +UComparator(const void *context, const void *left, const void *right); +U_CDECL_END + +/** + * Array sorting function. + * Uses a UComparator for comparing array items to each other, and simple + * memory copying to move items. + * + * @param array The array to be sorted. + * @param length The number of items in the array. + * @param itemSize The size in bytes of each array item. + * @param cmp UComparator function used to compare two items each. + * @param context Application-specific pointer, passed through to the UComparator. + * @param sortStable If true, a stable sorting algorithm must be used. + * @param pErrorCode ICU in/out UErrorCode parameter. + * + * @internal + */ +U_CAPI void U_EXPORT2 +uprv_sortArray(void *array, int32_t length, int32_t itemSize, + UComparator *cmp, const void *context, + UBool sortStable, UErrorCode *pErrorCode); + +/** + * Convenience UComparator implementation for uint16_t arrays. + * @internal + */ +U_CAPI int32_t U_EXPORT2 +uprv_uint16Comparator(const void *context, const void *left, const void *right); + +/** + * Convenience UComparator implementation for int32_t arrays. + * @internal + */ +U_CAPI int32_t U_EXPORT2 +uprv_int32Comparator(const void *context, const void *left, const void *right); + +/** + * Convenience UComparator implementation for uint32_t arrays. + * @internal + */ +U_CAPI int32_t U_EXPORT2 +uprv_uint32Comparator(const void *context, const void *left, const void *right); + +/** + * Much like Java Collections.binarySearch(list, key, comparator). + * + * Except: Java documents "If the list contains multiple elements equal to + * the specified object, there is no guarantee which one will be found." + * + * This version here will return the largest index of any equal item, + * for use in stable sorting. + * + * @return the index>=0 where the item was found: + * the largest such index, if multiple, for stable sorting; + * or the index<0 for inserting the item at ~index in sorted order + */ +U_CAPI int32_t U_EXPORT2 +uprv_stableBinarySearch(char *array, int32_t length, void *item, int32_t itemSize, + UComparator *cmp, const void *context); + +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/uassert.h b/src/core/basetypes/icu-ucsdet/include/uassert.h new file mode 100644 index 00000000..9dc29b24 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/uassert.h @@ -0,0 +1,32 @@ +/* +****************************************************************************** +* +* Copyright (C) 2002-2011, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* +* File uassert.h +* +* Contains U_ASSERT macro +* +* By default, U_ASSERT just wraps the C library assert macro. +* By changing the definition here, the assert behavior for ICU can be changed +* without affecting other non-ICU uses of the C library assert(). +* +****************************************************************************** +*/ + +#ifndef U_ASSERT_H +#define U_ASSERT_H +/* utypes.h is included to get the proper define for uint8_t */ +#include "unicode/utypes.h" +#if U_DEBUG +# include <assert.h> +# define U_ASSERT(exp) assert(exp) +#else +# define U_ASSERT(exp) +#endif +#endif + + diff --git a/src/core/basetypes/icu-ucsdet/include/ucase.h b/src/core/basetypes/icu-ucsdet/include/ucase.h new file mode 100644 index 00000000..8f24769d --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/ucase.h @@ -0,0 +1,409 @@ +/* +******************************************************************************* +* +* Copyright (C) 2004-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: ucase.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2004aug30 +* created by: Markus W. Scherer +* +* Low-level Unicode character/string case mapping code. +*/ + +#ifndef __UCASE_H__ +#define __UCASE_H__ + +#include "unicode/utypes.h" +#include "unicode/uset.h" +#include "putilimp.h" +#include "uset_imp.h" +#include "udataswp.h" + +#ifdef __cplusplus +U_NAMESPACE_BEGIN + +class UnicodeString; + +U_NAMESPACE_END +#endif + +/* library API -------------------------------------------------------------- */ + +U_CDECL_BEGIN + +struct UCaseProps; +typedef struct UCaseProps UCaseProps; + +U_CDECL_END + +U_CAPI const UCaseProps * U_EXPORT2 +ucase_getSingleton(void); + +U_CFUNC void U_EXPORT2 +ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode); + +/** + * Requires non-NULL locale ID but otherwise does the equivalent of + * checking for language codes as if uloc_getLanguage() were called: + * Accepts both 2- and 3-letter codes and accepts case variants. + */ +U_CFUNC int32_t +ucase_getCaseLocale(const char *locale, int32_t *locCache); + +/* Casing locale types for ucase_getCaseLocale */ +enum { + UCASE_LOC_UNKNOWN, + UCASE_LOC_ROOT, + UCASE_LOC_TURKISH, + UCASE_LOC_LITHUANIAN, + UCASE_LOC_DUTCH +}; + +/** + * Bit mask for getting just the options from a string compare options word + * that are relevant for case-insensitive string comparison. + * See uchar.h. Also include _STRNCMP_STYLE and U_COMPARE_CODE_POINT_ORDER. + * @internal + */ +#define _STRCASECMP_OPTIONS_MASK 0xffff + +/** + * Bit mask for getting just the options from a string compare options word + * that are relevant for case folding (of a single string or code point). + * See uchar.h. + * @internal + */ +#define _FOLD_CASE_OPTIONS_MASK 0xff + +/* single-code point functions */ + +U_CAPI UChar32 U_EXPORT2 +ucase_tolower(const UCaseProps *csp, UChar32 c); + +U_CAPI UChar32 U_EXPORT2 +ucase_toupper(const UCaseProps *csp, UChar32 c); + +U_CAPI UChar32 U_EXPORT2 +ucase_totitle(const UCaseProps *csp, UChar32 c); + +U_CAPI UChar32 U_EXPORT2 +ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options); + +/** + * Adds all simple case mappings and the full case folding for c to sa, + * and also adds special case closure mappings. + * c itself is not added. + * For example, the mappings + * - for s include long s + * - for sharp s include ss + * - for k include the Kelvin sign + */ +U_CFUNC void U_EXPORT2 +ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa); + +/** + * Maps the string to single code points and adds the associated case closure + * mappings. + * The string is mapped to code points if it is their full case folding string. + * In other words, this performs a reverse full case folding and then + * adds the case closure items of the resulting code points. + * If the string is found and its closure applied, then + * the string itself is added as well as part of its code points' closure. + * It must be length>=0. + * + * @return TRUE if the string was found + */ +U_CFUNC UBool U_EXPORT2 +ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa); + +#ifdef __cplusplus +U_NAMESPACE_BEGIN + +/** + * Iterator over characters with more than one code point in the full default Case_Folding. + */ +class U_COMMON_API FullCaseFoldingIterator { +public: + /** Constructor. */ + FullCaseFoldingIterator(); + /** + * Returns the next (cp, full) pair where "full" is cp's full default Case_Folding. + * Returns a negative cp value at the end of the iteration. + */ + UChar32 next(UnicodeString &full); +private: + FullCaseFoldingIterator(const FullCaseFoldingIterator &); // no copy + FullCaseFoldingIterator &operator=(const FullCaseFoldingIterator &); // no assignment + + const UChar *unfold; + int32_t unfoldRows; + int32_t unfoldRowWidth; + int32_t unfoldStringWidth; + int32_t currentRow; + int32_t rowCpIndex; +}; + +U_NAMESPACE_END +#endif + +/** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */ +U_CAPI int32_t U_EXPORT2 +ucase_getType(const UCaseProps *csp, UChar32 c); + +/** @return same as ucase_getType(), or <0 if c is case-ignorable */ +U_CAPI int32_t U_EXPORT2 +ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c); + +U_CAPI UBool U_EXPORT2 +ucase_isSoftDotted(const UCaseProps *csp, UChar32 c); + +U_CAPI UBool U_EXPORT2 +ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c); + +/* string case mapping functions */ + +U_CDECL_BEGIN + +/** + * Iterator function for string case mappings, which need to look at the + * context (surrounding text) of a given character for conditional mappings. + * + * The iterator only needs to go backward or forward away from the + * character in question. It does not use any indexes on this interface. + * It does not support random access or an arbitrary change of + * iteration direction. + * + * The code point being case-mapped itself is never returned by + * this iterator. + * + * @param context A pointer to the iterator's working data. + * @param dir If <0 then start iterating backward from the character; + * if >0 then start iterating forward from the character; + * if 0 then continue iterating in the current direction. + * @return Next code point, or <0 when the iteration is done. + */ +typedef UChar32 U_CALLCONV +UCaseContextIterator(void *context, int8_t dir); + +/** + * Sample struct which may be used by some implementations of + * UCaseContextIterator. + */ +struct UCaseContext { + void *p; + int32_t start, index, limit; + int32_t cpStart, cpLimit; + int8_t dir; + int8_t b1, b2, b3; +}; +typedef struct UCaseContext UCaseContext; + +U_CDECL_END + +#define UCASECONTEXT_INITIALIZER { NULL, 0, 0, 0, 0, 0, 0, 0, 0, 0 } + +enum { + /** + * For string case mappings, a single character (a code point) is mapped + * either to itself (in which case in-place mapping functions do nothing), + * or to another single code point, or to a string. + * Aside from the string contents, these are indicated with a single int32_t + * value as follows: + * + * Mapping to self: Negative values (~self instead of -self to support U+0000) + * + * Mapping to another code point: Positive values >UCASE_MAX_STRING_LENGTH + * + * Mapping to a string: The string length (0..UCASE_MAX_STRING_LENGTH) is + * returned. Note that the string result may indeed have zero length. + */ + UCASE_MAX_STRING_LENGTH=0x1f +}; + +/** + * Get the full lowercase mapping for c. + * + * @param csp Case mapping properties. + * @param c Character to be mapped. + * @param iter Character iterator, used for context-sensitive mappings. + * See UCaseContextIterator for details. + * If iter==NULL then a context-independent result is returned. + * @param context Pointer to be passed into iter. + * @param pString If the mapping result is a string, then the pointer is + * written to *pString. + * @param locale Locale ID for locale-dependent mappings. + * @param locCache Initialize to 0; may be used to cache the result of parsing + * the locale ID for subsequent calls. + * Can be NULL. + * @return Output code point or string length, see UCASE_MAX_STRING_LENGTH. + * + * @see UCaseContextIterator + * @see UCASE_MAX_STRING_LENGTH + * @internal + */ +U_CAPI int32_t U_EXPORT2 +ucase_toFullLower(const UCaseProps *csp, UChar32 c, + UCaseContextIterator *iter, void *context, + const UChar **pString, + const char *locale, int32_t *locCache); + +U_CAPI int32_t U_EXPORT2 +ucase_toFullUpper(const UCaseProps *csp, UChar32 c, + UCaseContextIterator *iter, void *context, + const UChar **pString, + const char *locale, int32_t *locCache); + +U_CAPI int32_t U_EXPORT2 +ucase_toFullTitle(const UCaseProps *csp, UChar32 c, + UCaseContextIterator *iter, void *context, + const UChar **pString, + const char *locale, int32_t *locCache); + +U_CAPI int32_t U_EXPORT2 +ucase_toFullFolding(const UCaseProps *csp, UChar32 c, + const UChar **pString, + uint32_t options); + +U_CFUNC int32_t U_EXPORT2 +ucase_hasBinaryProperty(UChar32 c, UProperty which); + + +U_CDECL_BEGIN + +/** + * @internal + */ +typedef int32_t U_CALLCONV +UCaseMapFull(const UCaseProps *csp, UChar32 c, + UCaseContextIterator *iter, void *context, + const UChar **pString, + const char *locale, int32_t *locCache); + +U_CDECL_END + +/* file definitions --------------------------------------------------------- */ + +#define UCASE_DATA_NAME "ucase" +#define UCASE_DATA_TYPE "icu" + +/* format "cAsE" */ +#define UCASE_FMT_0 0x63 +#define UCASE_FMT_1 0x41 +#define UCASE_FMT_2 0x53 +#define UCASE_FMT_3 0x45 + +/* indexes into indexes[] */ +enum { + UCASE_IX_INDEX_TOP, + UCASE_IX_LENGTH, + UCASE_IX_TRIE_SIZE, + UCASE_IX_EXC_LENGTH, + UCASE_IX_UNFOLD_LENGTH, + + UCASE_IX_MAX_FULL_LENGTH=15, + UCASE_IX_TOP=16 +}; + +/* definitions for 16-bit case properties word ------------------------------ */ + +/* 2-bit constants for types of cased characters */ +#define UCASE_TYPE_MASK 3 +enum { + UCASE_NONE, + UCASE_LOWER, + UCASE_UPPER, + UCASE_TITLE +}; + +#define UCASE_GET_TYPE(props) ((props)&UCASE_TYPE_MASK) +#define UCASE_GET_TYPE_AND_IGNORABLE(props) ((props)&7) + +#define UCASE_IGNORABLE 4 +#define UCASE_SENSITIVE 8 +#define UCASE_EXCEPTION 0x10 + +#define UCASE_DOT_MASK 0x60 +enum { + UCASE_NO_DOT=0, /* normal characters with cc=0 */ + UCASE_SOFT_DOTTED=0x20, /* soft-dotted characters with cc=0 */ + UCASE_ABOVE=0x40, /* "above" accents with cc=230 */ + UCASE_OTHER_ACCENT=0x60 /* other accent character (0<cc!=230) */ +}; + +/* no exception: bits 15..7 are a 9-bit signed case mapping delta */ +#define UCASE_DELTA_SHIFT 7 +#define UCASE_DELTA_MASK 0xff80 +#define UCASE_MAX_DELTA 0xff +#define UCASE_MIN_DELTA (-UCASE_MAX_DELTA-1) + +#if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC +# define UCASE_GET_DELTA(props) ((int16_t)(props)>>UCASE_DELTA_SHIFT) +#else +# define UCASE_GET_DELTA(props) (int16_t)(((props)&0x8000) ? (((props)>>UCASE_DELTA_SHIFT)|0xfe00) : ((uint16_t)(props)>>UCASE_DELTA_SHIFT)) +#endif + +/* exception: bits 15..5 are an unsigned 11-bit index into the exceptions array */ +#define UCASE_EXC_SHIFT 5 +#define UCASE_EXC_MASK 0xffe0 +#define UCASE_MAX_EXCEPTIONS ((UCASE_EXC_MASK>>UCASE_EXC_SHIFT)+1) + +/* definitions for 16-bit main exceptions word ------------------------------ */ + +/* first 8 bits indicate values in optional slots */ +enum { + UCASE_EXC_LOWER, + UCASE_EXC_FOLD, + UCASE_EXC_UPPER, + UCASE_EXC_TITLE, + UCASE_EXC_4, /* reserved */ + UCASE_EXC_5, /* reserved */ + UCASE_EXC_CLOSURE, + UCASE_EXC_FULL_MAPPINGS, + UCASE_EXC_ALL_SLOTS /* one past the last slot */ +}; + +/* each slot is 2 uint16_t instead of 1 */ +#define UCASE_EXC_DOUBLE_SLOTS 0x100 + +/* reserved: exception bits 11..9 */ + +/* UCASE_EXC_DOT_MASK=UCASE_DOT_MASK<<UCASE_EXC_DOT_SHIFT */ +#define UCASE_EXC_DOT_SHIFT 7 + +/* normally stored in the main word, but pushed out for larger exception indexes */ +#define UCASE_EXC_DOT_MASK 0x3000 +enum { + UCASE_EXC_NO_DOT=0, + UCASE_EXC_SOFT_DOTTED=0x1000, + UCASE_EXC_ABOVE=0x2000, /* "above" accents with cc=230 */ + UCASE_EXC_OTHER_ACCENT=0x3000 /* other character (0<cc!=230) */ +}; + +/* complex/conditional mappings */ +#define UCASE_EXC_CONDITIONAL_SPECIAL 0x4000 +#define UCASE_EXC_CONDITIONAL_FOLD 0x8000 + +/* definitions for lengths word for full case mappings */ +#define UCASE_FULL_LOWER 0xf +#define UCASE_FULL_FOLDING 0xf0 +#define UCASE_FULL_UPPER 0xf00 +#define UCASE_FULL_TITLE 0xf000 + +/* maximum lengths */ +#define UCASE_FULL_MAPPINGS_MAX_LENGTH (4*0xf) +#define UCASE_CLOSURE_MAX_LENGTH 0xf + +/* constants for reverse case folding ("unfold") data */ +enum { + UCASE_UNFOLD_ROWS, + UCASE_UNFOLD_ROW_WIDTH, + UCASE_UNFOLD_STRING_WIDTH +}; + +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/ucln.h b/src/core/basetypes/icu-ucsdet/include/ucln.h new file mode 100644 index 00000000..cd2630af --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/ucln.h @@ -0,0 +1,89 @@ +/* +****************************************************************************** +* +* Copyright (C) 2001-2013, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* file name: ucln.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2001July05 +* created by: George Rhoten +*/ + +#ifndef __UCLN_H__ +#define __UCLN_H__ + +#include "unicode/utypes.h" + +/** These are the functions used to register a library's memory cleanup + * functions. Each library should define a single library register function + * to call this API. In the i18n library, it is ucln_i18n_registerCleanup(). + * + * None of the cleanup functions should use a mutex to clean up an API's + * allocated memory because a cleanup function is not meant to be thread safe, + * and plenty of data cannot be reference counted in order to make sure that + * no one else needs the allocated data. + * + * In order to make a cleanup function get called when u_cleanup is called, + * You should add your function to the library specific cleanup function. + * If the cleanup function is not in the common library, the code that + * allocates the memory should call the library specific cleanup function. + * For instance, in the i18n library, any memory allocated statically must + * call ucln_i18n_registerCleanup() from the ucln_in.h header. These library + * cleanup functions are needed in order to prevent a circular dependency + * between the common library and any other library. + * + * The order of the cleanup is very important. In general, an API that + * depends on a second API should be cleaned up before the second API. + * For instance, the default converter in ustring depends upon the converter + * API. So the default converter should be closed before the converter API + * has its cache flushed. This will prevent any memory leaks due to + * reference counting. + * + * Please see common/ucln_cmn.{h,c} and i18n/ucln_in.{h,c} for examples. + */ + +/** + * Data Type for cleanup function selector. These roughly correspond to libraries. + */ +typedef enum ECleanupLibraryType { + UCLN_START = -1, + UCLN_UPLUG, /* ICU plugins */ + UCLN_CUSTOM, /* Custom is for anyone else. */ + UCLN_CTESTFW, + UCLN_TOOLUTIL, + UCLN_LAYOUTEX, + UCLN_LAYOUT, + UCLN_IO, + UCLN_I18N, + UCLN_COMMON /* This must be the last one to cleanup. */ +} ECleanupLibraryType; + +/** + * Data type for cleanup function pointer + */ +U_CDECL_BEGIN +typedef UBool U_CALLCONV cleanupFunc(void); +typedef void U_CALLCONV initFunc(UErrorCode *); +U_CDECL_END + +/** + * Register a cleanup function + * @param type which library to register for. + * @param func the function pointer + */ +U_CAPI void U_EXPORT2 ucln_registerCleanup(ECleanupLibraryType type, + cleanupFunc *func); + +/** + * Request cleanup for one specific library. + * Not thread safe. + * @param type which library to cleanup + */ +U_CAPI void U_EXPORT2 ucln_cleanupOne(ECleanupLibraryType type); + +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/ucln_cmn.h b/src/core/basetypes/icu-ucsdet/include/ucln_cmn.h new file mode 100644 index 00000000..3517ca78 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/ucln_cmn.h @@ -0,0 +1,72 @@ +/* +****************************************************************************** +* * +* Copyright (C) 2001-2014, International Business Machines * +* Corporation and others. All Rights Reserved. * +* * +****************************************************************************** +* file name: ucln_cmn.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2001July05 +* created by: George Rhoten +*/ + +#ifndef __UCLN_CMN_H__ +#define __UCLN_CMN_H__ + +#include "unicode/utypes.h" +#include "ucln.h" + +/* These are the cleanup functions for various APIs. */ +/* @return true if cleanup complete successfully.*/ +U_CFUNC UBool umtx_cleanup(void); + +U_CFUNC UBool utrace_cleanup(void); + +U_CFUNC UBool ucln_lib_cleanup(void); + +/* +Please keep the order of enums declared in same order +as the cleanup functions are suppose to be called. */ +typedef enum ECleanupCommonType { + UCLN_COMMON_START = -1, + UCLN_COMMON_USPREP, + UCLN_COMMON_BREAKITERATOR, + UCLN_COMMON_BREAKITERATOR_DICT, + UCLN_COMMON_SERVICE, + UCLN_COMMON_LOCALE_KEY_TYPE, + UCLN_COMMON_LOCALE, + UCLN_COMMON_LOCALE_AVAILABLE, + UCLN_COMMON_ULOC, + UCLN_COMMON_LOADED_NORMALIZER2, + UCLN_COMMON_NORMALIZER2, + UCLN_COMMON_USET, + UCLN_COMMON_UNAMES, + UCLN_COMMON_UPROPS, + UCLN_COMMON_UCNV, + UCLN_COMMON_UCNV_IO, + UCLN_COMMON_UDATA, + UCLN_COMMON_PUTIL, + UCLN_COMMON_LIST_FORMATTER, + UCLN_COMMON_UINIT, + + /* + Unified caches caches collation stuff. Collation data structures + contain resource bundles which means that unified cache cleanup + must happen before resource bundle clean up. + */ + UCLN_COMMON_UNIFIED_CACHE, + UCLN_COMMON_URES, + UCLN_COMMON_COUNT /* This must be last */ +} ECleanupCommonType; + +/* Main library cleanup registration function. */ +/* See common/ucln.h for details on adding a cleanup function. */ +/* Note: the global mutex must not be held when calling this function. */ +U_CFUNC void U_EXPORT2 ucln_common_registerCleanup(ECleanupCommonType type, + cleanupFunc *func); + +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/ucln_imp.h b/src/core/basetypes/icu-ucsdet/include/ucln_imp.h new file mode 100644 index 00000000..d5d202ec --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/ucln_imp.h @@ -0,0 +1,178 @@ +/* +****************************************************************************** +* +* Copyright (C) 2009-2011, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* file name: ucln_imp.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* This file contains the platform specific implementation of per-library cleanup. +* +*/ + + +#ifndef __UCLN_IMP_H__ +#define __UCLN_IMP_H__ + +#include "ucln.h" +#include <stdlib.h> + +/** + * Auto cleanup of ICU libraries + * There are several methods in per library cleanup of icu libraries: + * 1) Compiler/Platform based cleanup: + * a) Windows MSVC uses DllMain() + * b) GCC uses destructor function attribute + * c) Sun Studio, AIX VA, and HP-UX aCC uses a linker option to set the exit function + * 2) Using atexit() + * 3) Implementing own automatic cleanup functions + * + * For option 1, ensure that UCLN_NO_AUTO_CLEANUP is set to 0 by using --enable-auto-cleanup + * configure option or by otherwise setting UCLN_NO_AUTO_CLEANUP to 0 + * For option 2, follow option 1 and also define UCLN_AUTO_ATEXIT + * For option 3, follow option 1 and also define UCLN_AUTO_LOCAL (see below for more information) + */ + +#if !UCLN_NO_AUTO_CLEANUP + +/* + * The following declarations are for when UCLN_AUTO_LOCAL or UCLN_AUTO_ATEXIT + * are defined. They are commented out because they are static and will be defined + * later. The information is still here to provide some guidance for the developer + * who chooses to use UCLN_AUTO_LOCAL. + */ +/** + * Give the library an opportunity to register an automatic cleanup. + * This may be called more than once. + */ +/*static void ucln_registerAutomaticCleanup();*/ +/** + * Unregister an automatic cleanup, if possible. Called from cleanup. + */ +/*static void ucln_unRegisterAutomaticCleanup();*/ + +#ifdef UCLN_TYPE_IS_COMMON +# define UCLN_CLEAN_ME_UP u_cleanup() +#else +# define UCLN_CLEAN_ME_UP ucln_cleanupOne(UCLN_TYPE) +#endif + +/* ------------ automatic cleanup: registration. Choose ONE ------- */ +#if defined(UCLN_AUTO_LOCAL) +/* To use: + * 1. define UCLN_AUTO_LOCAL, + * 2. create ucln_local_hook.c containing implementations of + * static void ucln_registerAutomaticCleanup() + * static void ucln_unRegisterAutomaticCleanup() + */ +#include "ucln_local_hook.c" + +#elif defined(UCLN_AUTO_ATEXIT) +/* + * Use the ANSI C 'atexit' function. Note that this mechanism does not + * guarantee the order of cleanup relative to other users of ICU! + */ +static UBool gAutoCleanRegistered = FALSE; + +static void ucln_atexit_handler() +{ + UCLN_CLEAN_ME_UP; +} + +static void ucln_registerAutomaticCleanup() +{ + if(!gAutoCleanRegistered) { + gAutoCleanRegistered = TRUE; + atexit(&ucln_atexit_handler); + } +} + +static void ucln_unRegisterAutomaticCleanup () { +} +/* ------------end of automatic cleanup: registration. ------- */ + +#elif defined (UCLN_FINI) +/** + * If UCLN_FINI is defined, it is the (versioned, etc) name of a cleanup + * entrypoint. Add a stub to call ucln_cleanupOne + * Used on AIX, Solaris, and HP-UX + */ +U_CAPI void U_EXPORT2 UCLN_FINI (void); + +U_CAPI void U_EXPORT2 UCLN_FINI () +{ + /* This function must be defined, if UCLN_FINI is defined, else link error. */ + UCLN_CLEAN_ME_UP; +} + +/* Windows: DllMain */ +#elif U_PLATFORM_HAS_WIN32_API +/* + * ICU's own DllMain. + */ + +/* these are from putil.c */ +/* READ READ READ READ! Are you getting compilation errors from windows.h? + Any source file which includes this (ucln_imp.h) header MUST + be defined with language extensions ON. */ +# define WIN32_LEAN_AND_MEAN +# define VC_EXTRALEAN +# define NOUSER +# define NOSERVICE +# define NOIME +# define NOMCX +# include <windows.h> +/* + * This is a stub DllMain function with icu specific process handling code. + */ +BOOL WINAPI DllMain(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpvReserved) +{ + BOOL status = TRUE; + + switch(fdwReason) { + case DLL_PROCESS_ATTACH: + /* ICU does not trap process attach, but must pass these through properly. */ + /* ICU specific process attach could go here */ + break; + + case DLL_PROCESS_DETACH: + /* Here is the one we actually care about. */ + + UCLN_CLEAN_ME_UP; + + break; + + case DLL_THREAD_ATTACH: + /* ICU does not trap thread attach, but must pass these through properly. */ + /* ICU specific thread attach could go here */ + break; + + case DLL_THREAD_DETACH: + /* ICU does not trap thread detach, but must pass these through properly. */ + /* ICU specific thread detach could go here */ + break; + + } + return status; +} + +#elif defined(__GNUC__) +/* GCC - use __attribute((destructor)) */ +static void ucln_destructor() __attribute__((destructor)) ; + +static void ucln_destructor() +{ + UCLN_CLEAN_ME_UP; +} + +#endif + +#endif /* UCLN_NO_AUTO_CLEANUP */ + +#else +#error This file can only be included once. +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/ucln_in.h b/src/core/basetypes/icu-ucsdet/include/ucln_in.h new file mode 100644 index 00000000..e9685161 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/ucln_in.h @@ -0,0 +1,64 @@ +/* +****************************************************************************** +* Copyright (C) 2001-2014, International Business Machines +* Corporation and others. All Rights Reserved. +****************************************************************************** +* file name: ucln_cmn.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2001July05 +* created by: George Rhoten +*/ + +#ifndef __UCLN_IN_H__ +#define __UCLN_IN_H__ + +#include "unicode/utypes.h" +#include "ucln.h" + +/* +Please keep the order of enums declared in same order +as the functions are suppose to be called. +It's usually best to have child dependencies called first. */ +typedef enum ECleanupI18NType { + UCLN_I18N_START = -1, + UCLN_I18N_IDENTIFIER_INFO, + UCLN_I18N_SPOOF, + UCLN_I18N_TRANSLITERATOR, + UCLN_I18N_REGEX, + UCLN_I18N_ISLAMIC_CALENDAR, + UCLN_I18N_CHINESE_CALENDAR, + UCLN_I18N_HEBREW_CALENDAR, + UCLN_I18N_ASTRO_CALENDAR, + UCLN_I18N_DANGI_CALENDAR, + UCLN_I18N_CALENDAR, + UCLN_I18N_TIMEZONEFORMAT, + UCLN_I18N_TZDBTIMEZONENAMES, + UCLN_I18N_TIMEZONEGENERICNAMES, + UCLN_I18N_TIMEZONENAMES, + UCLN_I18N_ZONEMETA, + UCLN_I18N_TIMEZONE, + UCLN_I18N_CURRENCY, + UCLN_I18N_DECFMT, + UCLN_I18N_NUMFMT, + UCLN_I18N_SMPDTFMT, + UCLN_I18N_USEARCH, + UCLN_I18N_COLLATOR, + UCLN_I18N_UCOL_RES, + UCLN_I18N_CSDET, + UCLN_I18N_COLLATION_ROOT, + UCLN_I18N_GENDERINFO, + UCLN_I18N_CDFINFO, + UCLN_I18N_REGION, + UCLN_I18N_COUNT /* This must be last */ +} ECleanupI18NType; + +/* Main library cleanup registration function. */ +/* See common/ucln.h for details on adding a cleanup function. */ +/* Note: the global mutex must not be held when calling this function. */ +U_CFUNC void U_EXPORT2 ucln_i18n_registerCleanup(ECleanupI18NType type, + cleanupFunc *func); + +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/ucmndata.h b/src/core/basetypes/icu-ucsdet/include/ucmndata.h new file mode 100644 index 00000000..36163c50 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/ucmndata.h @@ -0,0 +1,111 @@ +/* +****************************************************************************** +* +* Copyright (C) 1999-2011, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************/ + + +/*---------------------------------------------------------------------------------- + * + * UCommonData An abstract interface for dealing with ICU Common Data Files. + * ICU Common Data Files are a grouping of a number of individual + * data items (resources, converters, tables, anything) into a + * single file or dll. The combined format includes a table of + * contents for locating the individual items by name. + * + * Two formats for the table of contents are supported, which is + * why there is an abstract inteface involved. + * + * These functions are part of the ICU internal implementation, and + * are not inteded to be used directly by applications. + */ + +#ifndef __UCMNDATA_H__ +#define __UCMNDATA_H__ + +#include "unicode/udata.h" +#include "umapfile.h" + + +#define COMMON_DATA_NAME U_ICUDATA_NAME + +typedef struct { + uint16_t headerSize; + uint8_t magic1; + uint8_t magic2; +} MappedData; + + +typedef struct { + MappedData dataHeader; + UDataInfo info; +} DataHeader; + +typedef struct { + uint32_t nameOffset; + uint32_t dataOffset; +} UDataOffsetTOCEntry; + +typedef struct { + uint32_t count; + UDataOffsetTOCEntry entry[2]; /* Actual size of array is from count. */ +} UDataOffsetTOC; + +/** + * Get the header size from a const DataHeader *udh. + * Handles opposite-endian data. + * + * @internal + */ +U_CFUNC uint16_t +udata_getHeaderSize(const DataHeader *udh); + +/** + * Get the UDataInfo.size from a const UDataInfo *info. + * Handles opposite-endian data. + * + * @internal + */ +U_CFUNC uint16_t +udata_getInfoSize(const UDataInfo *info); + +U_CDECL_BEGIN +/* + * "Virtual" functions for data lookup. + * To call one, given a UDataMemory *p, the code looks like this: + * p->vFuncs.Lookup(p, tocEntryName, pErrorCode); + * (I sure do wish this was written in C++, not C) + */ + +typedef const DataHeader * +(U_CALLCONV * LookupFn)(const UDataMemory *pData, + const char *tocEntryName, + int32_t *pLength, + UErrorCode *pErrorCode); + +typedef uint32_t +(U_CALLCONV * NumEntriesFn)(const UDataMemory *pData); + +U_CDECL_END + +typedef struct { + LookupFn Lookup; + NumEntriesFn NumEntries; +} commonDataFuncs; + + +/* + * Functions to check whether a UDataMemory refers to memory containing + * a recognizable header and table of contents a Common Data Format + * + * If a valid header and TOC are found, + * set the CommonDataFuncs function dispatch vector in the UDataMemory + * to point to the right functions for the TOC type. + * otherwise + * set an errorcode. + */ +U_CFUNC void udata_checkCommonData(UDataMemory *pData, UErrorCode *pErrorCode); + +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/udataswp.h b/src/core/basetypes/icu-ucsdet/include/udataswp.h new file mode 100644 index 00000000..66c84955 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/udataswp.h @@ -0,0 +1,351 @@ +/* +******************************************************************************* +* +* Copyright (C) 2003-2014, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: udataswp.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2003jun05 +* created by: Markus W. Scherer +* +* Definitions for ICU data transformations for different platforms, +* changing between big- and little-endian data and/or between +* charset families (ASCII<->EBCDIC). +*/ + +#ifndef __UDATASWP_H__ +#define __UDATASWP_H__ + +#include <stdarg.h> +#include "unicode/utypes.h" + +/* forward declaration */ + +U_CDECL_BEGIN + +struct UDataSwapper; +typedef struct UDataSwapper UDataSwapper; + +/** + * Function type for data transformation. + * Transforms data, or just returns the length of the data if + * the input length is -1. + * Swap functions assume that their data pointers are aligned properly. + * + * Quick implementation outline: + * (best to copy and adapt and existing swapper implementation) + * check that the data looks like the expected format + * if(length<0) { + * preflight: + * never dereference outData + * read inData and determine the data size + * assume that inData is long enough for this + * } else { + * outData can be NULL if length==0 + * inData==outData (in-place swapping) possible but not required! + * verify that length>=(actual size) + * if there is a chance that not every byte up to size is reached + * due to padding etc.: + * if(inData!=outData) { + * memcpy(outData, inData, actual size); + * } + * swap contents + * } + * return actual size + * + * Further implementation notes: + * - read integers from inData before swapping them + * because in-place swapping can make them unreadable + * - compareInvChars compares a local Unicode string with already-swapped + * output charset strings + * + * @param ds Pointer to UDataSwapper containing global data about the + * transformation and function pointers for handling primitive + * types. + * @param inData Pointer to the input data to be transformed or examined. + * @param length Length of the data, counting bytes. May be -1 for preflighting. + * If length>=0, then transform the data. + * If length==-1, then only determine the length of the data. + * The length cannot be determined from the data itself for all + * types of data (e.g., not for simple arrays of integers). + * @param outData Pointer to the output data buffer. + * If length>=0 (transformation), then the output buffer must + * have a capacity of at least length. + * If length==-1, then outData will not be used and can be NULL. + * @param pErrorCode ICU UErrorCode parameter, must not be NULL and must + * fulfill U_SUCCESS on input. + * @return The actual length of the data. + * + * @see UDataSwapper + * @internal ICU 2.8 + */ +typedef int32_t U_CALLCONV +UDataSwapFn(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode); + +/** + * Convert one uint16_t from input to platform endianness. + * @internal ICU 2.8 + */ +typedef uint16_t U_CALLCONV +UDataReadUInt16(uint16_t x); + +/** + * Convert one uint32_t from input to platform endianness. + * @internal ICU 2.8 + */ +typedef uint32_t U_CALLCONV +UDataReadUInt32(uint32_t x); + +/** + * Convert one uint16_t from platform to input endianness. + * @internal ICU 2.8 + */ +typedef void U_CALLCONV +UDataWriteUInt16(uint16_t *p, uint16_t x); + +/** + * Convert one uint32_t from platform to input endianness. + * @internal ICU 2.8 + */ +typedef void U_CALLCONV +UDataWriteUInt32(uint32_t *p, uint32_t x); + +/** + * Compare invariant-character strings, one in the output data and the + * other one caller-provided in Unicode. + * An output data string is compared because strings are usually swapped + * before the rest of the data, to allow for sorting of string tables + * according to the output charset. + * You can use -1 for the length parameters of NUL-terminated strings as usual. + * Returns Unicode code point order for invariant characters. + * @internal ICU 2.8 + */ +typedef int32_t U_CALLCONV +UDataCompareInvChars(const UDataSwapper *ds, + const char *outString, int32_t outLength, + const UChar *localString, int32_t localLength); + +/** + * Function for message output when an error occurs during data swapping. + * A format string and variable number of arguments are passed + * like for vprintf(). + * + * @param context A function-specific context pointer. + * @param fmt The format string. + * @param args The arguments for format string inserts. + * + * @internal ICU 2.8 + */ +typedef void U_CALLCONV +UDataPrintError(void *context, const char *fmt, va_list args); + +struct UDataSwapper { + /** Input endianness. @internal ICU 2.8 */ + UBool inIsBigEndian; + /** Input charset family. @see U_CHARSET_FAMILY @internal ICU 2.8 */ + uint8_t inCharset; + /** Output endianness. @internal ICU 2.8 */ + UBool outIsBigEndian; + /** Output charset family. @see U_CHARSET_FAMILY @internal ICU 2.8 */ + uint8_t outCharset; + + /* basic functions for reading data values */ + + /** Convert one uint16_t from input to platform endianness. @internal ICU 2.8 */ + UDataReadUInt16 *readUInt16; + /** Convert one uint32_t from input to platform endianness. @internal ICU 2.8 */ + UDataReadUInt32 *readUInt32; + /** Compare an invariant-character output string with a local one. @internal ICU 2.8 */ + UDataCompareInvChars *compareInvChars; + + /* basic functions for writing data values */ + + /** Convert one uint16_t from platform to input endianness. @internal ICU 2.8 */ + UDataWriteUInt16 *writeUInt16; + /** Convert one uint32_t from platform to input endianness. @internal ICU 2.8 */ + UDataWriteUInt32 *writeUInt32; + + /* basic functions for data transformations */ + + /** Transform an array of 16-bit integers. @internal ICU 2.8 */ + UDataSwapFn *swapArray16; + /** Transform an array of 32-bit integers. @internal ICU 2.8 */ + UDataSwapFn *swapArray32; + /** Transform an array of 64-bit integers. @internal ICU 53 */ + UDataSwapFn *swapArray64; + /** Transform an invariant-character string. @internal ICU 2.8 */ + UDataSwapFn *swapInvChars; + + /** + * Function for message output when an error occurs during data swapping. + * Can be NULL. + * @internal ICU 2.8 + */ + UDataPrintError *printError; + /** Context pointer for printError. @internal ICU 2.8 */ + void *printErrorContext; +}; + +U_CDECL_END + +U_CAPI UDataSwapper * U_EXPORT2 +udata_openSwapper(UBool inIsBigEndian, uint8_t inCharset, + UBool outIsBigEndian, uint8_t outCharset, + UErrorCode *pErrorCode); + +/** + * Open a UDataSwapper for the given input data and the specified output + * characteristics. + * Values of -1 for any of the characteristics mean the local platform's + * characteristics. + * + * @see udata_swap + * @internal ICU 2.8 + */ +U_CAPI UDataSwapper * U_EXPORT2 +udata_openSwapperForInputData(const void *data, int32_t length, + UBool outIsBigEndian, uint8_t outCharset, + UErrorCode *pErrorCode); + +U_CAPI void U_EXPORT2 +udata_closeSwapper(UDataSwapper *ds); + +/** + * Read the beginning of an ICU data piece, recognize magic bytes, + * swap the structure. + * Set a U_UNSUPPORTED_ERROR if it does not look like an ICU data piece. + * + * @return The size of the data header, in bytes. + * + * @internal ICU 2.8 + */ +U_CAPI int32_t U_EXPORT2 +udata_swapDataHeader(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode); + +/** + * Convert one int16_t from input to platform endianness. + * @internal ICU 2.8 + */ +U_CAPI int16_t U_EXPORT2 +udata_readInt16(const UDataSwapper *ds, int16_t x); + +/** + * Convert one int32_t from input to platform endianness. + * @internal ICU 2.8 + */ +U_CAPI int32_t U_EXPORT2 +udata_readInt32(const UDataSwapper *ds, int32_t x); + +/** + * Swap a block of invariant, NUL-terminated strings, but not padding + * bytes after the last string. + * @internal + */ +U_CAPI int32_t U_EXPORT2 +udata_swapInvStringBlock(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode); + +U_CAPI void U_EXPORT2 +udata_printError(const UDataSwapper *ds, + const char *fmt, + ...); + +/* internal exports from putil.c -------------------------------------------- */ + +/* declared here to keep them out of the public putil.h */ + +/** + * Swap invariant char * strings ASCII->EBCDIC. + * @internal + */ +U_CAPI int32_t U_EXPORT2 +uprv_ebcdicFromAscii(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode); + +/** + * Copy invariant ASCII char * strings and verify they are invariant. + * @internal + */ +U_CFUNC int32_t +uprv_copyAscii(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode); + +/** + * Swap invariant char * strings EBCDIC->ASCII. + * @internal + */ +U_CFUNC int32_t +uprv_asciiFromEbcdic(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode); + +/** + * Copy invariant EBCDIC char * strings and verify they are invariant. + * @internal + */ +U_CFUNC int32_t +uprv_copyEbcdic(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode); + +/** + * Compare ASCII invariant char * with Unicode invariant UChar * + * @internal + */ +U_CFUNC int32_t +uprv_compareInvAscii(const UDataSwapper *ds, + const char *outString, int32_t outLength, + const UChar *localString, int32_t localLength); + +/** + * Compare EBCDIC invariant char * with Unicode invariant UChar * + * @internal + */ +U_CFUNC int32_t +uprv_compareInvEbcdic(const UDataSwapper *ds, + const char *outString, int32_t outLength, + const UChar *localString, int32_t localLength); + +/* material... -------------------------------------------------------------- */ + +#if 0 + +/* udata.h */ + +/** + * Public API function in udata.c + * + * Same as udata_openChoice() but automatically swaps the data. + * isAcceptable, if not NULL, may accept data with endianness and charset family + * different from the current platform's properties. + * If the data is acceptable and the platform properties do not match, then + * the swap function is called to swap an allocated version of the data. + * Preflighting may or may not be performed depending on whether the size of + * the loaded data item is known. + * + * @param isAcceptable Same as for udata_openChoice(). May be NULL. + * + * @internal ICU 2.8 + */ +U_CAPI UDataMemory * U_EXPORT2 +udata_openSwap(const char *path, const char *type, const char *name, + UDataMemoryIsAcceptable *isAcceptable, void *isAcceptableContext, + UDataSwapFn *swap, + UDataPrintError *printError, void *printErrorContext, + UErrorCode *pErrorCode); + +#endif + +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/uelement.h b/src/core/basetypes/icu-ucsdet/include/uelement.h new file mode 100644 index 00000000..4eaddd9d --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/uelement.h @@ -0,0 +1,89 @@ +/* +******************************************************************************* +* Copyright (C) 1997-2011, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* file name: uelement.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2011jul04 +* created by: Markus W. Scherer +* +* Common definitions for UHashTable and UVector. +* UHashTok moved here from uhash.h and renamed UElement. +* This allows users of UVector to avoid the confusing #include of uhash.h. +* uhash.h aliases UElement to UHashTok, +* so that we need not change all of its code and its users. +*/ + +#ifndef __UELEMENT_H__ +#define __UELEMENT_H__ + +#include "unicode/utypes.h" + +U_CDECL_BEGIN + +/** + * A UVector element, or a key or value within a UHashtable. + * It may be either a 32-bit integral value or an opaque void* pointer. + * The void* pointer may be smaller than 32 bits (e.g. 24 bits) + * or may be larger (e.g. 64 bits). + * + * Because a UElement is the size of a native pointer or a 32-bit + * integer, we pass it around by value. + */ +union UElement { + void* pointer; + int32_t integer; +}; +typedef union UElement UElement; + +/** + * An element-equality (boolean) comparison function. + * @param e1 An element (object or integer) + * @param e2 An element (object or integer) + * @return TRUE if the two elements are equal. + */ +typedef UBool U_CALLCONV UElementsAreEqual(const UElement e1, const UElement e2); + +/** + * An element sorting (three-way) comparison function. + * @param e1 An element (object or integer) + * @param e2 An element (object or integer) + * @return 0 if the two elements are equal, -1 if e1 is < e2, or +1 if e1 is > e2. + */ +typedef int8_t U_CALLCONV UElementComparator(UElement e1, UElement e2); + +/** + * An element assignment function. It may copy an integer, copy + * a pointer, or clone a pointer, as appropriate. + * @param dst The element to be assigned to + * @param src The element to assign from + */ +typedef void U_CALLCONV UElementAssigner(UElement *dst, UElement *src); + +U_CDECL_END + +/** + * Comparator function for UnicodeString* keys. Implements UElementsAreEqual. + * @param key1 The string for comparison + * @param key2 The string for comparison + * @return true if key1 and key2 are equal, return false otherwise. + */ +U_CAPI UBool U_EXPORT2 +uhash_compareUnicodeString(const UElement key1, const UElement key2); + +/** + * Comparator function for UnicodeString* keys (case insensitive). + * Make sure to use together with uhash_hashCaselessUnicodeString. + * Implements UElementsAreEqual. + * @param key1 The string for comparison + * @param key2 The string for comparison + * @return true if key1 and key2 are equal, return false otherwise. + */ +U_CAPI UBool U_EXPORT2 +uhash_compareCaselessUnicodeString(const UElement key1, const UElement key2); + +#endif /* __UELEMENT_H__ */ diff --git a/src/core/basetypes/icu-ucsdet/include/uenumimp.h b/src/core/basetypes/icu-ucsdet/include/uenumimp.h new file mode 100644 index 00000000..664bc686 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/uenumimp.h @@ -0,0 +1,153 @@ +/* +******************************************************************************* +* +* Copyright (C) 2002-2006, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: uenumimp.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:2 +* +* created on: 2002jul08 +* created by: Vladimir Weinstein +*/ + +#ifndef __UENUMIMP_H +#define __UENUMIMP_H + +#include "unicode/uenum.h" + +U_CDECL_BEGIN + +/** + * following are the type declarations for + * implementations of APIs. If any of these + * functions are NULL, U_UNSUPPORTED_ERROR + * is returned. If close is NULL, the enumeration + * object is going to be released. + * Initial error checking is done in the body + * of API function, so the implementations + * need not to check the initial error condition. + */ + +/** + * Function type declaration for uenum_close(). + * + * This function should cleanup the enumerator object + * + * @param en enumeration to be closed + */ +typedef void U_CALLCONV +UEnumClose(UEnumeration *en); + +/** + * Function type declaration for uenum_count(). + * + * This function should count the number of elements + * in this enumeration + * + * @param en enumeration to be counted + * @param status pointer to UErrorCode variable + * @return number of elements in enumeration + */ +typedef int32_t U_CALLCONV +UEnumCount(UEnumeration *en, UErrorCode *status); + +/** + * Function type declaration for uenum_unext(). + * + * This function returns the next element as a UChar *, + * or NULL after all elements haven been enumerated. + * + * @param en enumeration + * @param resultLength pointer to result length + * @param status pointer to UErrorCode variable + * @return next element as UChar *, + * or NULL after all elements haven been enumerated + */ +typedef const UChar* U_CALLCONV +UEnumUNext(UEnumeration* en, + int32_t* resultLength, + UErrorCode* status); + +/** + * Function type declaration for uenum_next(). + * + * This function returns the next element as a char *, + * or NULL after all elements haven been enumerated. + * + * @param en enumeration + * @param resultLength pointer to result length + * @param status pointer to UErrorCode variable + * @return next element as char *, + * or NULL after all elements haven been enumerated + */ +typedef const char* U_CALLCONV +UEnumNext(UEnumeration* en, + int32_t* resultLength, + UErrorCode* status); + +/** + * Function type declaration for uenum_reset(). + * + * This function should reset the enumeration + * object + * + * @param en enumeration + * @param status pointer to UErrorCode variable + */ +typedef void U_CALLCONV +UEnumReset(UEnumeration* en, + UErrorCode* status); + + +struct UEnumeration { + /* baseContext. For the base class only. Don't touch! */ + void *baseContext; + + /* context. Use it for what you need */ + void *context; + + /** + * these are functions that will + * be used for APIs + */ + /* called from uenum_close */ + UEnumClose *close; + /* called from uenum_count */ + UEnumCount *count; + /* called from uenum_unext */ + UEnumUNext *uNext; + /* called from uenum_next */ + UEnumNext *next; + /* called from uenum_reset */ + UEnumReset *reset; +}; + +U_CDECL_END + +/* This is the default implementation for uenum_unext(). + * It automatically converts the char * string to UChar *. + * Don't call this directly. This is called internally by uenum_unext + * when a UEnumeration is defined with 'uNext' pointing to this + * function. + */ +U_CAPI const UChar* U_EXPORT2 +uenum_unextDefault(UEnumeration* en, + int32_t* resultLength, + UErrorCode* status); + +/* This is the default implementation for uenum_next(). + * It automatically converts the UChar * string to char *. + * Don't call this directly. This is called internally by uenum_next + * when a UEnumeration is defined with 'next' pointing to this + * function. + */ +U_CAPI const char* U_EXPORT2 +uenum_nextDefault(UEnumeration* en, + int32_t* resultLength, + UErrorCode* status); + +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/uinvchar.h b/src/core/basetypes/icu-ucsdet/include/uinvchar.h new file mode 100644 index 00000000..f307bd6a --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/uinvchar.h @@ -0,0 +1,125 @@ +/* +******************************************************************************* +* +* Copyright (C) 1999-2010, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: uinvchar.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:2 +* +* created on: 2004sep14 +* created by: Markus W. Scherer +* +* Definitions for handling invariant characters, moved here from putil.c +* for better modularization. +*/ + +#ifndef __UINVCHAR_H__ +#define __UINVCHAR_H__ + +#include "unicode/utypes.h" + +/** + * Check if a char string only contains invariant characters. + * See utypes.h for details. + * + * @param s Input string pointer. + * @param length Length of the string, can be -1 if NUL-terminated. + * @return TRUE if s contains only invariant characters. + * + * @internal (ICU 2.8) + */ +U_INTERNAL UBool U_EXPORT2 +uprv_isInvariantString(const char *s, int32_t length); + +/** + * Check if a Unicode string only contains invariant characters. + * See utypes.h for details. + * + * @param s Input string pointer. + * @param length Length of the string, can be -1 if NUL-terminated. + * @return TRUE if s contains only invariant characters. + * + * @internal (ICU 2.8) + */ +U_INTERNAL UBool U_EXPORT2 +uprv_isInvariantUString(const UChar *s, int32_t length); + +/** + * \def U_UPPER_ORDINAL + * Get the ordinal number of an uppercase invariant character + * @internal + */ +#if U_CHARSET_FAMILY==U_ASCII_FAMILY +# define U_UPPER_ORDINAL(x) ((x)-'A') +#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY +# define U_UPPER_ORDINAL(x) (((x) < 'J') ? ((x)-'A') : \ + (((x) < 'S') ? ((x)-'J'+9) : \ + ((x)-'S'+18))) +#else +# error Unknown charset family! +#endif + +/** + * Compare two EBCDIC invariant-character strings in ASCII order. + * @internal + */ +U_INTERNAL int32_t U_EXPORT2 +uprv_compareInvEbcdicAsAscii(const char *s1, const char *s2); + +/** + * \def uprv_compareInvCharsAsAscii + * Compare two invariant-character strings in ASCII order. + * @internal + */ +#if U_CHARSET_FAMILY==U_ASCII_FAMILY +# define uprv_compareInvCharsAsAscii(s1, s2) uprv_strcmp(s1, s2) +#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY +# define uprv_compareInvCharsAsAscii(s1, s2) uprv_compareInvEbcdicAsAscii(s1, s2) +#else +# error Unknown charset family! +#endif + +/** + * Converts an EBCDIC invariant character to lowercase ASCII. + * @internal + */ +U_INTERNAL char U_EXPORT2 +uprv_ebcdicToLowercaseAscii(char c); + +/** + * \def uprv_invCharToLowercaseAscii + * Converts an invariant character to lowercase ASCII. + * @internal + */ +#if U_CHARSET_FAMILY==U_ASCII_FAMILY +# define uprv_invCharToLowercaseAscii uprv_asciitolower +#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY +# define uprv_invCharToLowercaseAscii uprv_ebcdicToLowercaseAscii +#else +# error Unknown charset family! +#endif + +/** + * Copy EBCDIC to ASCII + * @internal + * @see uprv_strncpy + */ +U_INTERNAL uint8_t* U_EXPORT2 +uprv_aestrncpy(uint8_t *dst, const uint8_t *src, int32_t n); + + +/** + * Copy ASCII to EBCDIC + * @internal + * @see uprv_strncpy + */ +U_INTERNAL uint8_t* U_EXPORT2 +uprv_eastrncpy(uint8_t *dst, const uint8_t *src, int32_t n); + + + +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/umapfile.h b/src/core/basetypes/icu-ucsdet/include/umapfile.h new file mode 100644 index 00000000..2995e381 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/umapfile.h @@ -0,0 +1,55 @@ +/* +****************************************************************************** +* +* Copyright (C) 1999-2011, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************/ + +/*---------------------------------------------------------------------------------- + * + * Memory mapped file wrappers for use by the ICU Data Implementation + * + * Porting note: The implementation of these functions is very platform specific. + * Not all platforms can do real memory mapping. Those that can't + * still must implement these functions, getting the data into memory using + * whatever means are available. + * + * These functions are part of the ICU internal implementation, and + * are not inteded to be used directly by applications. + * + *----------------------------------------------------------------------------------*/ + +#ifndef __UMAPFILE_H__ +#define __UMAPFILE_H__ + +#include "unicode/putil.h" +#include "unicode/udata.h" +#include "putilimp.h" + +U_CFUNC UBool uprv_mapFile(UDataMemory *pdm, const char *path); +U_CFUNC void uprv_unmapFile(UDataMemory *pData); + +/* MAP_NONE: no memory mapping, no file access at all */ +#define MAP_NONE 0 +#define MAP_WIN32 1 +#define MAP_POSIX 2 +#define MAP_STDIO 3 +#define MAP_390DLL 4 + +#if UCONFIG_NO_FILE_IO +# define MAP_IMPLEMENTATION MAP_NONE +#elif U_PLATFORM_USES_ONLY_WIN32_API +# define MAP_IMPLEMENTATION MAP_WIN32 +#elif U_HAVE_MMAP || U_PLATFORM == U_PF_OS390 +# if U_PLATFORM == U_PF_OS390 && defined (OS390_STUBDATA) + /* No memory mapping for 390 batch mode. Fake it using dll loading. */ +# define MAP_IMPLEMENTATION MAP_390DLL +# else +# define MAP_IMPLEMENTATION MAP_POSIX +# endif +#else /* unknown platform, no memory map implementation: use stdio.h and uprv_malloc() instead */ +# define MAP_IMPLEMENTATION MAP_STDIO +#endif + +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/umutex.h b/src/core/basetypes/icu-ucsdet/include/umutex.h new file mode 100644 index 00000000..e0ad0d3c --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/umutex.h @@ -0,0 +1,424 @@ +/* +********************************************************************** +* Copyright (C) 1997-2014, International Business Machines +* Corporation and others. All Rights Reserved. +********************************************************************** +* +* File UMUTEX.H +* +* Modification History: +* +* Date Name Description +* 04/02/97 aliu Creation. +* 04/07/99 srl rewrite - C interface, multiple mutices +* 05/13/99 stephen Changed to umutex (from cmutex) +****************************************************************************** +*/ + +#ifndef UMUTEX_H +#define UMUTEX_H + +#include "unicode/utypes.h" +#include "unicode/uclean.h" +#include "putilimp.h" + + + +// Forward Declarations. UMutex is not in the ICU namespace (yet) because +// there are some remaining references from plain C. +struct UMutex; +struct UConditionVar; + +U_NAMESPACE_BEGIN +struct UInitOnce; +U_NAMESPACE_END + +// Stringify macros, to allow #include of user supplied atomic & mutex files. +#define U_MUTEX_STR(s) #s +#define U_MUTEX_XSTR(s) U_MUTEX_STR(s) + +/**************************************************************************** + * + * Low Level Atomic Operations. + * Compiler dependent. Not operating system dependent. + * + ****************************************************************************/ +#if defined (U_USER_ATOMICS_H) +#include U_MUTEX_XSTR(U_USER_ATOMICS_H) + +#elif U_HAVE_STD_ATOMICS + +// C++11 atomics are available. + +#include <atomic> + +U_NAMESPACE_BEGIN + +typedef std::atomic<int32_t> u_atomic_int32_t; +#define ATOMIC_INT32_T_INITIALIZER(val) ATOMIC_VAR_INIT(val) + +inline int32_t umtx_loadAcquire(u_atomic_int32_t &var) { + return var.load(std::memory_order_acquire); +} + +inline void umtx_storeRelease(u_atomic_int32_t &var, int32_t val) { + var.store(val, std::memory_order_release); +} + +inline int32_t umtx_atomic_inc(u_atomic_int32_t *var) { + return var->fetch_add(1) + 1; +} + +inline int32_t umtx_atomic_dec(u_atomic_int32_t *var) { + return var->fetch_sub(1) - 1; +} +U_NAMESPACE_END + +#elif U_PLATFORM_HAS_WIN32_API + +// MSVC compiler. Reads and writes of volatile variables have +// acquire and release memory semantics, respectively. +// This is a Microsoft extension, not standard C++ behavior. +// +// Update: can't use this because of MinGW, built with gcc. +// Original plan was to use gcc atomics for MinGW, but they +// aren't supported, so we fold MinGW into this path. + +# define WIN32_LEAN_AND_MEAN +# define VC_EXTRALEAN +# define NOUSER +# define NOSERVICE +# define NOIME +# define NOMCX +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include <windows.h> + +U_NAMESPACE_BEGIN +typedef volatile LONG u_atomic_int32_t; +#define ATOMIC_INT32_T_INITIALIZER(val) val + +inline int32_t umtx_loadAcquire(u_atomic_int32_t &var) { + return InterlockedCompareExchange(&var, 0, 0); +} + +inline void umtx_storeRelease(u_atomic_int32_t &var, int32_t val) { + InterlockedExchange(&var, val); +} + + +inline int32_t umtx_atomic_inc(u_atomic_int32_t *var) { + return InterlockedIncrement(var); +} + +inline int32_t umtx_atomic_dec(u_atomic_int32_t *var) { + return InterlockedDecrement(var); +} +U_NAMESPACE_END + + +#elif U_HAVE_GCC_ATOMICS +/* + * gcc atomic ops. These are available on several other compilers as well. + */ + +U_NAMESPACE_BEGIN +typedef int32_t u_atomic_int32_t; +#define ATOMIC_INT32_T_INITIALIZER(val) val + +inline int32_t umtx_loadAcquire(u_atomic_int32_t &var) { + int32_t val = var; + __sync_synchronize(); + return val; +} + +inline void umtx_storeRelease(u_atomic_int32_t &var, int32_t val) { + __sync_synchronize(); + var = val; +} + +inline int32_t umtx_atomic_inc(u_atomic_int32_t *p) { + return __sync_add_and_fetch(p, 1); +} + +inline int32_t umtx_atomic_dec(u_atomic_int32_t *p) { + return __sync_sub_and_fetch(p, 1); +} +U_NAMESPACE_END + +#else + +/* + * Unknown Platform. Use out-of-line functions, which in turn use mutexes. + * Slow but correct. + */ + +#define U_NO_PLATFORM_ATOMICS + +U_NAMESPACE_BEGIN +typedef int32_t u_atomic_int32_t; +#define ATOMIC_INT32_T_INITIALIZER(val) val + +U_COMMON_API int32_t U_EXPORT2 +umtx_loadAcquire(u_atomic_int32_t &var); + +U_COMMON_API void U_EXPORT2 +umtx_storeRelease(u_atomic_int32_t &var, int32_t val); + +U_COMMON_API int32_t U_EXPORT2 +umtx_atomic_inc(u_atomic_int32_t *p); + +U_COMMON_API int32_t U_EXPORT2 +umtx_atomic_dec(u_atomic_int32_t *p); + +U_NAMESPACE_END + +#endif /* Low Level Atomic Ops Platfrom Chain */ + + + +/************************************************************************************************* + * + * UInitOnce Definitions. + * These are platform neutral. + * + *************************************************************************************************/ + +U_NAMESPACE_BEGIN + +struct UInitOnce { + u_atomic_int32_t fState; + UErrorCode fErrCode; + void reset() {fState = 0;}; + UBool isReset() {return umtx_loadAcquire(fState) == 0;}; +// Note: isReset() is used by service registration code. +// Thread safety of this usage needs review. +}; + +#define U_INITONCE_INITIALIZER {ATOMIC_INT32_T_INITIALIZER(0), U_ZERO_ERROR} + + +U_COMMON_API UBool U_EXPORT2 umtx_initImplPreInit(UInitOnce &); +U_COMMON_API void U_EXPORT2 umtx_initImplPostInit(UInitOnce &); + +template<class T> void umtx_initOnce(UInitOnce &uio, T *obj, void (T::*fp)()) { + if (umtx_loadAcquire(uio.fState) == 2) { + return; + } + if (umtx_initImplPreInit(uio)) { + (obj->*fp)(); + umtx_initImplPostInit(uio); + } +} + + +// umtx_initOnce variant for plain functions, or static class functions. +// No context parameter. +inline void umtx_initOnce(UInitOnce &uio, void (*fp)()) { + if (umtx_loadAcquire(uio.fState) == 2) { + return; + } + if (umtx_initImplPreInit(uio)) { + (*fp)(); + umtx_initImplPostInit(uio); + } +} + +// umtx_initOnce variant for plain functions, or static class functions. +// With ErrorCode, No context parameter. +inline void umtx_initOnce(UInitOnce &uio, void (*fp)(UErrorCode &), UErrorCode &errCode) { + if (U_FAILURE(errCode)) { + return; + } + if (umtx_loadAcquire(uio.fState) != 2 && umtx_initImplPreInit(uio)) { + // We run the initialization. + (*fp)(errCode); + uio.fErrCode = errCode; + umtx_initImplPostInit(uio); + } else { + // Someone else already ran the initialization. + if (U_FAILURE(uio.fErrCode)) { + errCode = uio.fErrCode; + } + } +} + +// umtx_initOnce variant for plain functions, or static class functions, +// with a context parameter. +template<class T> void umtx_initOnce(UInitOnce &uio, void (*fp)(T), T context) { + if (umtx_loadAcquire(uio.fState) == 2) { + return; + } + if (umtx_initImplPreInit(uio)) { + (*fp)(context); + umtx_initImplPostInit(uio); + } +} + +// umtx_initOnce variant for plain functions, or static class functions, +// with a context parameter and an error code. +template<class T> void umtx_initOnce(UInitOnce &uio, void (*fp)(T, UErrorCode &), T context, UErrorCode &errCode) { + if (U_FAILURE(errCode)) { + return; + } + if (umtx_loadAcquire(uio.fState) != 2 && umtx_initImplPreInit(uio)) { + // We run the initialization. + (*fp)(context, errCode); + uio.fErrCode = errCode; + umtx_initImplPostInit(uio); + } else { + // Someone else already ran the initialization. + if (U_FAILURE(uio.fErrCode)) { + errCode = uio.fErrCode; + } + } +} + +U_NAMESPACE_END + + + +/************************************************************************************************* + * + * Mutex Definitions. Platform Dependent, #if platform chain follows. + * TODO: Add a C++11 version. + * Need to convert all mutex using files to C++ first. + * + *************************************************************************************************/ + +#if defined(U_USER_MUTEX_H) +// #inlcude "U_USER_MUTEX_H" +#include U_MUTEX_XSTR(U_USER_MUTEX_H) + +#elif U_PLATFORM_HAS_WIN32_API + +/* Windows Definitions. + * Windows comes first in the platform chain. + * Cygwin (and possibly others) have both WIN32 and POSIX APIs. Prefer Win32 in this case. + */ + + +/* For CRITICAL_SECTION */ + +/* + * Note: there is an earlier include of windows.h in this file, but it is in + * different conditionals. + * This one is needed if we are using C++11 for atomic ops, but + * win32 APIs for Critical Sections. + */ + +# define WIN32_LEAN_AND_MEAN +# define VC_EXTRALEAN +# define NOUSER +# define NOSERVICE +# define NOIME +# define NOMCX +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include <windows.h> + + +typedef struct UMutex { + icu::UInitOnce fInitOnce; + CRITICAL_SECTION fCS; +} UMutex; + +/* Initializer for a static UMUTEX. Deliberately contains no value for the + * CRITICAL_SECTION. + */ +#define U_MUTEX_INITIALIZER {U_INITONCE_INITIALIZER} + +struct UConditionVar { + HANDLE fEntryGate; + HANDLE fExitGate; + int32_t fWaitCount; +}; + +#define U_CONDITION_INITIALIZER {NULL, NULL, 0} + + + +#elif U_PLATFORM_IMPLEMENTS_POSIX + +/* + * POSIX platform + */ + +#include <pthread.h> + +struct UMutex { + pthread_mutex_t fMutex; +}; +typedef struct UMutex UMutex; +#define U_MUTEX_INITIALIZER {PTHREAD_MUTEX_INITIALIZER} + +struct UConditionVar { + pthread_cond_t fCondition; +}; +#define U_CONDITION_INITIALIZER {PTHREAD_COND_INITIALIZER} + +#else + +/* + * Unknow platform type. + * This is an error condition. ICU requires mutexes. + */ + +#error Unknown Platform. + +#endif + + + +/************************************************************************************** + * + * Mutex Implementation function declaratations. + * Declarations are platform neutral. + * Implementations, in umutex.cpp, are platform specific. + * + ************************************************************************************/ + +/* Lock a mutex. + * @param mutex The given mutex to be locked. Pass NULL to specify + * the global ICU mutex. Recursive locks are an error + * and may cause a deadlock on some platforms. + */ +U_INTERNAL void U_EXPORT2 umtx_lock(UMutex* mutex); + +/* Unlock a mutex. + * @param mutex The given mutex to be unlocked. Pass NULL to specify + * the global ICU mutex. + */ +U_INTERNAL void U_EXPORT2 umtx_unlock (UMutex* mutex); + +/* + * Wait on a condition variable. + * The calling thread will unlock the mutex and wait on the condition variable. + * The mutex must be locked by the calling thread when invoking this function. + * + * @param cond the condition variable to wait on. + * @param mutex the associated mutex. + */ + +U_INTERNAL void U_EXPORT2 umtx_condWait(UConditionVar *cond, UMutex *mutex); + + +/* + * Broadcast wakeup of all threads waiting on a Condition. + * The associated mutex must be locked by the calling thread when calling + * this function; this is a temporary ICU restriction. + * + * @param cond the condition variable. + */ +U_INTERNAL void U_EXPORT2 umtx_condBroadcast(UConditionVar *cond); + +/* + * Signal a condition variable, waking up one waiting thread. + * CAUTION: Do not use. Place holder only. Not implemented for Windows. + */ +U_INTERNAL void U_EXPORT2 umtx_condSignal(UConditionVar *cond); + +#endif /* UMUTEX_H */ +/*eof*/ diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/appendable.h b/src/core/basetypes/icu-ucsdet/include/unicode/appendable.h new file mode 100644 index 00000000..a6a83b15 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/appendable.h @@ -0,0 +1,232 @@ +/* +******************************************************************************* +* Copyright (C) 2011-2012, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* file name: appendable.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2010dec07 +* created by: Markus W. Scherer +*/ + +#ifndef __APPENDABLE_H__ +#define __APPENDABLE_H__ + +/** + * \file + * \brief C++ API: Appendable class: Sink for Unicode code points and 16-bit code units (UChars). + */ + +#include "unicode/utypes.h" +#include "unicode/uobject.h" + +U_NAMESPACE_BEGIN + +class UnicodeString; + +/** + * Base class for objects to which Unicode characters and strings can be appended. + * Combines elements of Java Appendable and ICU4C ByteSink. + * + * This class can be used in APIs where it does not matter whether the actual destination is + * a UnicodeString, a UChar[] array, a UnicodeSet, or any other object + * that receives and processes characters and/or strings. + * + * Implementation classes must implement at least appendCodeUnit(UChar). + * The base class provides default implementations for the other methods. + * + * The methods do not take UErrorCode parameters. + * If an error occurs (e.g., out-of-memory), + * in addition to returning FALSE from failing operations, + * the implementation must prevent unexpected behavior (e.g., crashes) + * from further calls and should make the error condition available separately + * (e.g., store a UErrorCode, make/keep a UnicodeString bogus). + * @stable ICU 4.8 + */ +class U_COMMON_API Appendable : public UObject { +public: + /** + * Destructor. + * @stable ICU 4.8 + */ + ~Appendable(); + + /** + * Appends a 16-bit code unit. + * @param c code unit + * @return TRUE if the operation succeeded + * @stable ICU 4.8 + */ + virtual UBool appendCodeUnit(UChar c) = 0; + + /** + * Appends a code point. + * The default implementation calls appendCodeUnit(UChar) once or twice. + * @param c code point 0..0x10ffff + * @return TRUE if the operation succeeded + * @stable ICU 4.8 + */ + virtual UBool appendCodePoint(UChar32 c); + + /** + * Appends a string. + * The default implementation calls appendCodeUnit(UChar) for each code unit. + * @param s string, must not be NULL if length!=0 + * @param length string length, or -1 if NUL-terminated + * @return TRUE if the operation succeeded + * @stable ICU 4.8 + */ + virtual UBool appendString(const UChar *s, int32_t length); + + /** + * Tells the object that the caller is going to append roughly + * appendCapacity UChars. A subclass might use this to pre-allocate + * a larger buffer if necessary. + * The default implementation does nothing. (It always returns TRUE.) + * @param appendCapacity estimated number of UChars that will be appended + * @return TRUE if the operation succeeded + * @stable ICU 4.8 + */ + virtual UBool reserveAppendCapacity(int32_t appendCapacity); + + /** + * Returns a writable buffer for appending and writes the buffer's capacity to + * *resultCapacity. Guarantees *resultCapacity>=minCapacity. + * May return a pointer to the caller-owned scratch buffer which must have + * scratchCapacity>=minCapacity. + * The returned buffer is only valid until the next operation + * on this Appendable. + * + * After writing at most *resultCapacity UChars, call appendString() with the + * pointer returned from this function and the number of UChars written. + * Many appendString() implementations will avoid copying UChars if this function + * returned an internal buffer. + * + * Partial usage example: + * \code + * int32_t capacity; + * UChar* buffer = app.getAppendBuffer(..., &capacity); + * ... Write n UChars into buffer, with n <= capacity. + * app.appendString(buffer, n); + * \endcode + * In many implementations, that call to append will avoid copying UChars. + * + * If the Appendable allocates or reallocates an internal buffer, it should use + * the desiredCapacityHint if appropriate. + * If a caller cannot provide a reasonable guess at the desired capacity, + * it should pass desiredCapacityHint=0. + * + * If a non-scratch buffer is returned, the caller may only pass + * a prefix to it to appendString(). + * That is, it is not correct to pass an interior pointer to appendString(). + * + * The default implementation always returns the scratch buffer. + * + * @param minCapacity required minimum capacity of the returned buffer; + * must be non-negative + * @param desiredCapacityHint desired capacity of the returned buffer; + * must be non-negative + * @param scratch default caller-owned buffer + * @param scratchCapacity capacity of the scratch buffer + * @param resultCapacity pointer to an integer which will be set to the + * capacity of the returned buffer + * @return a buffer with *resultCapacity>=minCapacity + * @stable ICU 4.8 + */ + virtual UChar *getAppendBuffer(int32_t minCapacity, + int32_t desiredCapacityHint, + UChar *scratch, int32_t scratchCapacity, + int32_t *resultCapacity); +}; + +/** + * An Appendable implementation which writes to a UnicodeString. + * + * This class is not intended for public subclassing. + * @stable ICU 4.8 + */ +class U_COMMON_API UnicodeStringAppendable : public Appendable { +public: + /** + * Aliases the UnicodeString (keeps its reference) for writing. + * @param s The UnicodeString to which this Appendable will write. + * @stable ICU 4.8 + */ + explicit UnicodeStringAppendable(UnicodeString &s) : str(s) {} + + /** + * Destructor. + * @stable ICU 4.8 + */ + ~UnicodeStringAppendable(); + + /** + * Appends a 16-bit code unit to the string. + * @param c code unit + * @return TRUE if the operation succeeded + * @stable ICU 4.8 + */ + virtual UBool appendCodeUnit(UChar c); + + /** + * Appends a code point to the string. + * @param c code point 0..0x10ffff + * @return TRUE if the operation succeeded + * @stable ICU 4.8 + */ + virtual UBool appendCodePoint(UChar32 c); + + /** + * Appends a string to the UnicodeString. + * @param s string, must not be NULL if length!=0 + * @param length string length, or -1 if NUL-terminated + * @return TRUE if the operation succeeded + * @stable ICU 4.8 + */ + virtual UBool appendString(const UChar *s, int32_t length); + + /** + * Tells the UnicodeString that the caller is going to append roughly + * appendCapacity UChars. + * @param appendCapacity estimated number of UChars that will be appended + * @return TRUE if the operation succeeded + * @stable ICU 4.8 + */ + virtual UBool reserveAppendCapacity(int32_t appendCapacity); + + /** + * Returns a writable buffer for appending and writes the buffer's capacity to + * *resultCapacity. Guarantees *resultCapacity>=minCapacity. + * May return a pointer to the caller-owned scratch buffer which must have + * scratchCapacity>=minCapacity. + * The returned buffer is only valid until the next write operation + * on the UnicodeString. + * + * For details see Appendable::getAppendBuffer(). + * + * @param minCapacity required minimum capacity of the returned buffer; + * must be non-negative + * @param desiredCapacityHint desired capacity of the returned buffer; + * must be non-negative + * @param scratch default caller-owned buffer + * @param scratchCapacity capacity of the scratch buffer + * @param resultCapacity pointer to an integer which will be set to the + * capacity of the returned buffer + * @return a buffer with *resultCapacity>=minCapacity + * @stable ICU 4.8 + */ + virtual UChar *getAppendBuffer(int32_t minCapacity, + int32_t desiredCapacityHint, + UChar *scratch, int32_t scratchCapacity, + int32_t *resultCapacity); + +private: + UnicodeString &str; +}; + +U_NAMESPACE_END + +#endif // __APPENDABLE_H__ diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/bytestream.h b/src/core/basetypes/icu-ucsdet/include/unicode/bytestream.h new file mode 100644 index 00000000..174aa38a --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/bytestream.h @@ -0,0 +1,257 @@ +// Copyright (C) 2009-2012, International Business Machines +// Corporation and others. All Rights Reserved. +// +// Copyright 2007 Google Inc. All Rights Reserved. +// Author: sanjay@google.com (Sanjay Ghemawat) +// +// Abstract interface that consumes a sequence of bytes (ByteSink). +// +// Used so that we can write a single piece of code that can operate +// on a variety of output string types. +// +// Various implementations of this interface are provided: +// ByteSink: +// CheckedArrayByteSink Write to a flat array, with bounds checking +// StringByteSink Write to an STL string + +// This code is a contribution of Google code, and the style used here is +// a compromise between the original Google code and the ICU coding guidelines. +// For example, data types are ICU-ified (size_t,int->int32_t), +// and API comments doxygen-ified, but function names and behavior are +// as in the original, if possible. +// Assertion-style error handling, not available in ICU, was changed to +// parameter "pinning" similar to UnicodeString. +// +// In addition, this is only a partial port of the original Google code, +// limited to what was needed so far. The (nearly) complete original code +// is in the ICU svn repository at icuhtml/trunk/design/strings/contrib +// (see ICU ticket 6765, r25517). + +#ifndef __BYTESTREAM_H__ +#define __BYTESTREAM_H__ + +/** + * \file + * \brief C++ API: Interface for writing bytes, and implementation classes. + */ + +#include "unicode/utypes.h" +#include "unicode/uobject.h" +#include "unicode/std_string.h" + +U_NAMESPACE_BEGIN + +/** + * A ByteSink can be filled with bytes. + * @stable ICU 4.2 + */ +class U_COMMON_API ByteSink : public UMemory { +public: + /** + * Default constructor. + * @stable ICU 4.2 + */ + ByteSink() { } + /** + * Virtual destructor. + * @stable ICU 4.2 + */ + virtual ~ByteSink(); + + /** + * Append "bytes[0,n-1]" to this. + * @param bytes the pointer to the bytes + * @param n the number of bytes; must be non-negative + * @stable ICU 4.2 + */ + virtual void Append(const char* bytes, int32_t n) = 0; + + /** + * Returns a writable buffer for appending and writes the buffer's capacity to + * *result_capacity. Guarantees *result_capacity>=min_capacity. + * May return a pointer to the caller-owned scratch buffer which must have + * scratch_capacity>=min_capacity. + * The returned buffer is only valid until the next operation + * on this ByteSink. + * + * After writing at most *result_capacity bytes, call Append() with the + * pointer returned from this function and the number of bytes written. + * Many Append() implementations will avoid copying bytes if this function + * returned an internal buffer. + * + * Partial usage example: + * int32_t capacity; + * char* buffer = sink->GetAppendBuffer(..., &capacity); + * ... Write n bytes into buffer, with n <= capacity. + * sink->Append(buffer, n); + * In many implementations, that call to Append will avoid copying bytes. + * + * If the ByteSink allocates or reallocates an internal buffer, it should use + * the desired_capacity_hint if appropriate. + * If a caller cannot provide a reasonable guess at the desired capacity, + * it should pass desired_capacity_hint=0. + * + * If a non-scratch buffer is returned, the caller may only pass + * a prefix to it to Append(). + * That is, it is not correct to pass an interior pointer to Append(). + * + * The default implementation always returns the scratch buffer. + * + * @param min_capacity required minimum capacity of the returned buffer; + * must be non-negative + * @param desired_capacity_hint desired capacity of the returned buffer; + * must be non-negative + * @param scratch default caller-owned buffer + * @param scratch_capacity capacity of the scratch buffer + * @param result_capacity pointer to an integer which will be set to the + * capacity of the returned buffer + * @return a buffer with *result_capacity>=min_capacity + * @stable ICU 4.2 + */ + virtual char* GetAppendBuffer(int32_t min_capacity, + int32_t desired_capacity_hint, + char* scratch, int32_t scratch_capacity, + int32_t* result_capacity); + + /** + * Flush internal buffers. + * Some byte sinks use internal buffers or provide buffering + * and require calling Flush() at the end of the stream. + * The ByteSink should be ready for further Append() calls after Flush(). + * The default implementation of Flush() does nothing. + * @stable ICU 4.2 + */ + virtual void Flush(); + +private: + ByteSink(const ByteSink &); // copy constructor not implemented + ByteSink &operator=(const ByteSink &); // assignment operator not implemented +}; + +// ------------------------------------------------------------- +// Some standard implementations + +/** + * Implementation of ByteSink that writes to a flat byte array, + * with bounds-checking: + * This sink will not write more than capacity bytes to outbuf. + * If more than capacity bytes are Append()ed, then excess bytes are ignored, + * and Overflowed() will return true. + * Overflow does not cause a runtime error. + * @stable ICU 4.2 + */ +class U_COMMON_API CheckedArrayByteSink : public ByteSink { +public: + /** + * Constructs a ByteSink that will write to outbuf[0..capacity-1]. + * @param outbuf buffer to write to + * @param capacity size of the buffer + * @stable ICU 4.2 + */ + CheckedArrayByteSink(char* outbuf, int32_t capacity); + /** + * Destructor. + * @stable ICU 4.2 + */ + virtual ~CheckedArrayByteSink(); + /** + * Returns the sink to its original state, without modifying the buffer. + * Useful for reusing both the buffer and the sink for multiple streams. + * Resets the state to NumberOfBytesWritten()=NumberOfBytesAppended()=0 + * and Overflowed()=FALSE. + * @return *this + * @stable ICU 4.6 + */ + virtual CheckedArrayByteSink& Reset(); + /** + * Append "bytes[0,n-1]" to this. + * @param bytes the pointer to the bytes + * @param n the number of bytes; must be non-negative + * @stable ICU 4.2 + */ + virtual void Append(const char* bytes, int32_t n); + /** + * Returns a writable buffer for appending and writes the buffer's capacity to + * *result_capacity. For details see the base class documentation. + * @param min_capacity required minimum capacity of the returned buffer; + * must be non-negative + * @param desired_capacity_hint desired capacity of the returned buffer; + * must be non-negative + * @param scratch default caller-owned buffer + * @param scratch_capacity capacity of the scratch buffer + * @param result_capacity pointer to an integer which will be set to the + * capacity of the returned buffer + * @return a buffer with *result_capacity>=min_capacity + * @stable ICU 4.2 + */ + virtual char* GetAppendBuffer(int32_t min_capacity, + int32_t desired_capacity_hint, + char* scratch, int32_t scratch_capacity, + int32_t* result_capacity); + /** + * Returns the number of bytes actually written to the sink. + * @return number of bytes written to the buffer + * @stable ICU 4.2 + */ + int32_t NumberOfBytesWritten() const { return size_; } + /** + * Returns true if any bytes were discarded, i.e., if there was an + * attempt to write more than 'capacity' bytes. + * @return TRUE if more than 'capacity' bytes were Append()ed + * @stable ICU 4.2 + */ + UBool Overflowed() const { return overflowed_; } + /** + * Returns the number of bytes appended to the sink. + * If Overflowed() then NumberOfBytesAppended()>NumberOfBytesWritten() + * else they return the same number. + * @return number of bytes written to the buffer + * @stable ICU 4.6 + */ + int32_t NumberOfBytesAppended() const { return appended_; } +private: + char* outbuf_; + const int32_t capacity_; + int32_t size_; + int32_t appended_; + UBool overflowed_; + CheckedArrayByteSink(); ///< default constructor not implemented + CheckedArrayByteSink(const CheckedArrayByteSink &); ///< copy constructor not implemented + CheckedArrayByteSink &operator=(const CheckedArrayByteSink &); ///< assignment operator not implemented +}; + +#if U_HAVE_STD_STRING + +/** + * Implementation of ByteSink that writes to a "string". + * The StringClass is usually instantiated with a std::string. + * @stable ICU 4.2 + */ +template<typename StringClass> +class StringByteSink : public ByteSink { + public: + /** + * Constructs a ByteSink that will append bytes to the dest string. + * @param dest pointer to string object to append to + * @stable ICU 4.2 + */ + StringByteSink(StringClass* dest) : dest_(dest) { } + /** + * Append "bytes[0,n-1]" to this. + * @param data the pointer to the bytes + * @param n the number of bytes; must be non-negative + * @stable ICU 4.2 + */ + virtual void Append(const char* data, int32_t n) { dest_->append(data, n); } + private: + StringClass* dest_; + StringByteSink(); ///< default constructor not implemented + StringByteSink(const StringByteSink &); ///< copy constructor not implemented + StringByteSink &operator=(const StringByteSink &); ///< assignment operator not implemented +}; + +#endif + +U_NAMESPACE_END + +#endif // __BYTESTREAM_H__ diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/localpointer.h b/src/core/basetypes/icu-ucsdet/include/unicode/localpointer.h new file mode 100644 index 00000000..e3ccb258 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/localpointer.h @@ -0,0 +1,304 @@ +/* +******************************************************************************* +* +* Copyright (C) 2009-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: localpointer.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2009nov13 +* created by: Markus W. Scherer +*/ + +#ifndef __LOCALPOINTER_H__ +#define __LOCALPOINTER_H__ + +/** + * \file + * \brief C++ API: "Smart pointers" for use with and in ICU4C C++ code. + * + * These classes are inspired by + * - std::auto_ptr + * - boost::scoped_ptr & boost::scoped_array + * - Taligent Safe Pointers (TOnlyPointerTo) + * + * but none of those provide for all of the goals for ICU smart pointers: + * - Smart pointer owns the object and releases it when it goes out of scope. + * - No transfer of ownership via copy/assignment to reduce misuse. Simpler & more robust. + * - ICU-compatible: No exceptions. + * - Need to be able to orphan/release the pointer and its ownership. + * - Need variants for normal C++ object pointers, C++ arrays, and ICU C service objects. + * + * For details see http://site.icu-project.org/design/cpp/scoped_ptr + */ + +#include "unicode/utypes.h" + +#if U_SHOW_CPLUSPLUS_API + +U_NAMESPACE_BEGIN + +/** + * "Smart pointer" base class; do not use directly: use LocalPointer etc. + * + * Base class for smart pointer classes that do not throw exceptions. + * + * Do not use this base class directly, since it does not delete its pointer. + * A subclass must implement methods that delete the pointer: + * Destructor and adoptInstead(). + * + * There is no operator T *() provided because the programmer must decide + * whether to use getAlias() (without transfer of ownership) or orpan() + * (with transfer of ownership and NULLing of the pointer). + * + * @see LocalPointer + * @see LocalArray + * @see U_DEFINE_LOCAL_OPEN_POINTER + * @stable ICU 4.4 + */ +template<typename T> +class LocalPointerBase { +public: + /** + * Constructor takes ownership. + * @param p simple pointer to an object that is adopted + * @stable ICU 4.4 + */ + explicit LocalPointerBase(T *p=NULL) : ptr(p) {} + /** + * Destructor deletes the object it owns. + * Subclass must override: Base class does nothing. + * @stable ICU 4.4 + */ + ~LocalPointerBase() { /* delete ptr; */ } + /** + * NULL check. + * @return TRUE if ==NULL + * @stable ICU 4.4 + */ + UBool isNull() const { return ptr==NULL; } + /** + * NULL check. + * @return TRUE if !=NULL + * @stable ICU 4.4 + */ + UBool isValid() const { return ptr!=NULL; } + /** + * Comparison with a simple pointer, so that existing code + * with ==NULL need not be changed. + * @param other simple pointer for comparison + * @return true if this pointer value equals other + * @stable ICU 4.4 + */ + bool operator==(const T *other) const { return ptr==other; } + /** + * Comparison with a simple pointer, so that existing code + * with !=NULL need not be changed. + * @param other simple pointer for comparison + * @return true if this pointer value differs from other + * @stable ICU 4.4 + */ + bool operator!=(const T *other) const { return ptr!=other; } + /** + * Access without ownership change. + * @return the pointer value + * @stable ICU 4.4 + */ + T *getAlias() const { return ptr; } + /** + * Access without ownership change. + * @return the pointer value as a reference + * @stable ICU 4.4 + */ + T &operator*() const { return *ptr; } + /** + * Access without ownership change. + * @return the pointer value + * @stable ICU 4.4 + */ + T *operator->() const { return ptr; } + /** + * Gives up ownership; the internal pointer becomes NULL. + * @return the pointer value; + * caller becomes responsible for deleting the object + * @stable ICU 4.4 + */ + T *orphan() { + T *p=ptr; + ptr=NULL; + return p; + } + /** + * Deletes the object it owns, + * and adopts (takes ownership of) the one passed in. + * Subclass must override: Base class does not delete the object. + * @param p simple pointer to an object that is adopted + * @stable ICU 4.4 + */ + void adoptInstead(T *p) { + // delete ptr; + ptr=p; + } +protected: + /** + * Actual pointer. + * @internal + */ + T *ptr; +private: + // No comparison operators with other LocalPointerBases. + bool operator==(const LocalPointerBase &other); + bool operator!=(const LocalPointerBase &other); + // No ownership transfer: No copy constructor, no assignment operator. + LocalPointerBase(const LocalPointerBase &other); + void operator=(const LocalPointerBase &other); + // No heap allocation. Use only on the stack. + static void * U_EXPORT2 operator new(size_t size); + static void * U_EXPORT2 operator new[](size_t size); +#if U_HAVE_PLACEMENT_NEW + static void * U_EXPORT2 operator new(size_t, void *ptr); +#endif +}; + +/** + * "Smart pointer" class, deletes objects via the standard C++ delete operator. + * For most methods see the LocalPointerBase base class. + * + * Usage example: + * \code + * LocalPointer<UnicodeString> s(new UnicodeString((UChar32)0x50005)); + * int32_t length=s->length(); // 2 + * UChar lead=s->charAt(0); // 0xd900 + * if(some condition) { return; } // no need to explicitly delete the pointer + * s.adoptInstead(new UnicodeString((UChar)0xfffc)); + * length=s->length(); // 1 + * // no need to explicitly delete the pointer + * \endcode + * + * @see LocalPointerBase + * @stable ICU 4.4 + */ +template<typename T> +class LocalPointer : public LocalPointerBase<T> { +public: + /** + * Constructor takes ownership. + * @param p simple pointer to an object that is adopted + * @stable ICU 4.4 + */ + explicit LocalPointer(T *p=NULL) : LocalPointerBase<T>(p) {} + /** + * Destructor deletes the object it owns. + * @stable ICU 4.4 + */ + ~LocalPointer() { + delete LocalPointerBase<T>::ptr; + } + /** + * Deletes the object it owns, + * and adopts (takes ownership of) the one passed in. + * @param p simple pointer to an object that is adopted + * @stable ICU 4.4 + */ + void adoptInstead(T *p) { + delete LocalPointerBase<T>::ptr; + LocalPointerBase<T>::ptr=p; + } +}; + +/** + * "Smart pointer" class, deletes objects via the C++ array delete[] operator. + * For most methods see the LocalPointerBase base class. + * Adds operator[] for array item access. + * + * Usage example: + * \code + * LocalArray<UnicodeString> a(new UnicodeString[2]); + * a[0].append((UChar)0x61); + * if(some condition) { return; } // no need to explicitly delete the array + * a.adoptInstead(new UnicodeString[4]); + * a[3].append((UChar)0x62).append((UChar)0x63).reverse(); + * // no need to explicitly delete the array + * \endcode + * + * @see LocalPointerBase + * @stable ICU 4.4 + */ +template<typename T> +class LocalArray : public LocalPointerBase<T> { +public: + /** + * Constructor takes ownership. + * @param p simple pointer to an array of T objects that is adopted + * @stable ICU 4.4 + */ + explicit LocalArray(T *p=NULL) : LocalPointerBase<T>(p) {} + /** + * Destructor deletes the array it owns. + * @stable ICU 4.4 + */ + ~LocalArray() { + delete[] LocalPointerBase<T>::ptr; + } + /** + * Deletes the array it owns, + * and adopts (takes ownership of) the one passed in. + * @param p simple pointer to an array of T objects that is adopted + * @stable ICU 4.4 + */ + void adoptInstead(T *p) { + delete[] LocalPointerBase<T>::ptr; + LocalPointerBase<T>::ptr=p; + } + /** + * Array item access (writable). + * No index bounds check. + * @param i array index + * @return reference to the array item + * @stable ICU 4.4 + */ + T &operator[](ptrdiff_t i) const { return LocalPointerBase<T>::ptr[i]; } +}; + +/** + * \def U_DEFINE_LOCAL_OPEN_POINTER + * "Smart pointer" definition macro, deletes objects via the closeFunction. + * Defines a subclass of LocalPointerBase which works just + * like LocalPointer<Type> except that this subclass will use the closeFunction + * rather than the C++ delete operator. + * + * Requirement: The closeFunction must tolerate a NULL pointer. + * (We could add a NULL check here but it is normally redundant.) + * + * Usage example: + * \code + * LocalUCaseMapPointer csm(ucasemap_open(localeID, options, &errorCode)); + * utf8OutLength=ucasemap_utf8ToLower(csm.getAlias(), + * utf8Out, (int32_t)sizeof(utf8Out), + * utf8In, utf8InLength, &errorCode); + * if(U_FAILURE(errorCode)) { return; } // no need to explicitly delete the UCaseMap + * \endcode + * + * @see LocalPointerBase + * @see LocalPointer + * @stable ICU 4.4 + */ +#define U_DEFINE_LOCAL_OPEN_POINTER(LocalPointerClassName, Type, closeFunction) \ + class LocalPointerClassName : public LocalPointerBase<Type> { \ + public: \ + explicit LocalPointerClassName(Type *p=NULL) : LocalPointerBase<Type>(p) {} \ + ~LocalPointerClassName() { closeFunction(ptr); } \ + void adoptInstead(Type *p) { \ + closeFunction(ptr); \ + ptr=p; \ + } \ + } + +U_NAMESPACE_END + +#endif /* U_SHOW_CPLUSPLUS_API */ +#endif /* __LOCALPOINTER_H__ */ diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/platform.h b/src/core/basetypes/icu-ucsdet/include/unicode/platform.h new file mode 100644 index 00000000..07dce8ad --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/platform.h @@ -0,0 +1,751 @@ +/* +****************************************************************************** +* +* Copyright (C) 1997-2014, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* +* FILE NAME : platform.h +* +* Date Name Description +* 05/13/98 nos Creation (content moved here from ptypes.h). +* 03/02/99 stephen Added AS400 support. +* 03/30/99 stephen Added Linux support. +* 04/13/99 stephen Reworked for autoconf. +****************************************************************************** +*/ + +#ifndef _PLATFORM_H +#define _PLATFORM_H + +#include "unicode/uconfig.h" +#include "unicode/uvernum.h" + +/** + * \file + * \brief Basic types for the platform. + * + * This file used to be generated by autoconf/configure. + * Starting with ICU 49, platform.h is a normal source file, + * to simplify cross-compiling and working with non-autoconf/make build systems. + * + * When a value in this file does not work on a platform, then please + * try to derive it from the U_PLATFORM value + * (for which we might need a new value constant in rare cases) + * and/or from other macros that are predefined by the compiler + * or defined in standard (POSIX or platform or compiler) headers. + * + * As a temporary workaround, you can add an explicit <code>#define</code> for some macros + * before it is first tested, or add an equivalent -D macro definition + * to the compiler's command line. + * + * Note: Some compilers provide ways to show the predefined macros. + * For example, with gcc you can compile an empty .c file and have the compiler + * print the predefined macros with + * \code + * gcc -E -dM -x c /dev/null | sort + * \endcode + * (You can provide an actual empty .c file rather than /dev/null. + * <code>-x c++</code> is for C++.) + */ + +/** + * Define some things so that they can be documented. + * @internal + */ +#ifdef U_IN_DOXYGEN +/* + * Problem: "platform.h:335: warning: documentation for unknown define U_HAVE_STD_STRING found." means that U_HAVE_STD_STRING is not documented. + * Solution: #define any defines for non @internal API here, so that they are visible in the docs. If you just set PREDEFINED in Doxyfile.in, they won't be documented. + */ + +/* None for now. */ +#endif + +/** + * \def U_PLATFORM + * The U_PLATFORM macro defines the platform we're on. + * + * We used to define one different, value-less macro per platform. + * That made it hard to know the set of relevant platforms and macros, + * and hard to deal with variants of platforms. + * + * Starting with ICU 49, we define platforms as numeric macros, + * with ranges of values for related platforms and their variants. + * The U_PLATFORM macro is set to one of these values. + * + * Historical note from the Solaris Wikipedia article: + * AT&T and Sun collaborated on a project to merge the most popular Unix variants + * on the market at that time: BSD, System V, and Xenix. + * This became Unix System V Release 4 (SVR4). + * + * @internal + */ + +/** Unknown platform. @internal */ +#define U_PF_UNKNOWN 0 +/** Windows @internal */ +#define U_PF_WINDOWS 1000 +/** MinGW. Windows, calls to Win32 API, but using GNU gcc and binutils. @internal */ +#define U_PF_MINGW 1800 +/** + * Cygwin. Windows, calls to cygwin1.dll for Posix functions, + * using MSVC or GNU gcc and binutils. + * @internal + */ +#define U_PF_CYGWIN 1900 +/* Reserve 2000 for U_PF_UNIX? */ +/** HP-UX is based on UNIX System V. @internal */ +#define U_PF_HPUX 2100 +/** Solaris is a Unix operating system based on SVR4. @internal */ +#define U_PF_SOLARIS 2600 +/** BSD is a UNIX operating system derivative. @internal */ +#define U_PF_BSD 3000 +/** AIX is based on UNIX System V Releases and 4.3 BSD. @internal */ +#define U_PF_AIX 3100 +/** IRIX is based on UNIX System V with BSD extensions. @internal */ +#define U_PF_IRIX 3200 +/** + * Darwin is a POSIX-compliant operating system, composed of code developed by Apple, + * as well as code derived from NeXTSTEP, BSD, and other projects, + * built around the Mach kernel. + * Darwin forms the core set of components upon which Mac OS X, Apple TV, and iOS are based. + * (Original description modified from WikiPedia.) + * @internal + */ +#define U_PF_DARWIN 3500 +/** iPhone OS (iOS) is a derivative of Mac OS X. @internal */ +#define U_PF_IPHONE 3550 +/** QNX is a commercial Unix-like real-time operating system related to BSD. @internal */ +#define U_PF_QNX 3700 +/** Linux is a Unix-like operating system. @internal */ +#define U_PF_LINUX 4000 +/** Android is based on Linux. @internal */ +#define U_PF_ANDROID 4050 +/** z/OS is the successor to OS/390 which was the successor to MVS. @internal */ +#define U_PF_OS390 9000 +/** "IBM i" is the current name of what used to be i5/OS and earlier OS/400. @internal */ +#define U_PF_OS400 9400 + +#ifdef U_PLATFORM + /* Use the predefined value. */ +#elif defined(__MINGW32__) +# define U_PLATFORM U_PF_MINGW +#elif defined(__CYGWIN__) +# define U_PLATFORM U_PF_CYGWIN +#elif defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +# define U_PLATFORM U_PF_WINDOWS +#elif defined(__ANDROID__) +# define U_PLATFORM U_PF_ANDROID + /* Android wchar_t support depends on the API level. */ +# include <android/api-level.h> +#elif defined(linux) || defined(__linux__) || defined(__linux) +# define U_PLATFORM U_PF_LINUX +#elif defined(__APPLE__) && defined(__MACH__) +# include <TargetConditionals.h> +# if defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE /* variant of TARGET_OS_MAC */ +# define U_PLATFORM U_PF_IPHONE +# else +# define U_PLATFORM U_PF_DARWIN +# endif +#elif defined(BSD) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__MirBSD__) +# define U_PLATFORM U_PF_BSD +#elif defined(sun) || defined(__sun) + /* Check defined(__SVR4) || defined(__svr4__) to distinguish Solaris from SunOS? */ +# define U_PLATFORM U_PF_SOLARIS +# if defined(__GNUC__) + /* Solaris/GCC needs this header file to get the proper endianness. Normally, this + * header file is included with stddef.h but on Solairs/GCC, the GCC version of stddef.h + * is included which does not include this header file. + */ +# include <sys/isa_defs.h> +# endif +#elif defined(_AIX) || defined(__TOS_AIX__) +# define U_PLATFORM U_PF_AIX +#elif defined(_hpux) || defined(hpux) || defined(__hpux) +# define U_PLATFORM U_PF_HPUX +#elif defined(sgi) || defined(__sgi) +# define U_PLATFORM U_PF_IRIX +#elif defined(__QNX__) || defined(__QNXNTO__) +# define U_PLATFORM U_PF_QNX +#elif defined(__TOS_MVS__) +# define U_PLATFORM U_PF_OS390 +#elif defined(__OS400__) || defined(__TOS_OS400__) +# define U_PLATFORM U_PF_OS400 +#else +# define U_PLATFORM U_PF_UNKNOWN +#endif + +/** + * \def CYGWINMSVC + * Defined if this is Windows with Cygwin, but using MSVC rather than gcc. + * Otherwise undefined. + * @internal + */ +/* Commented out because this is already set in mh-cygwin-msvc +#if U_PLATFORM == U_PF_CYGWIN && defined(_MSC_VER) +# define CYGWINMSVC +#endif +*/ + +/** + * \def U_PLATFORM_USES_ONLY_WIN32_API + * Defines whether the platform uses only the Win32 API. + * Set to 1 for Windows/MSVC and MinGW but not Cygwin. + * @internal + */ +#ifdef U_PLATFORM_USES_ONLY_WIN32_API + /* Use the predefined value. */ +#elif (U_PF_WINDOWS <= U_PLATFORM && U_PLATFORM <= U_PF_MINGW) || defined(CYGWINMSVC) +# define U_PLATFORM_USES_ONLY_WIN32_API 1 +#else + /* Cygwin implements POSIX. */ +# define U_PLATFORM_USES_ONLY_WIN32_API 0 +#endif + +/** + * \def U_PLATFORM_HAS_WIN32_API + * Defines whether the Win32 API is available on the platform. + * Set to 1 for Windows/MSVC, MinGW and Cygwin. + * @internal + */ +#ifdef U_PLATFORM_HAS_WIN32_API + /* Use the predefined value. */ +#elif U_PF_WINDOWS <= U_PLATFORM && U_PLATFORM <= U_PF_CYGWIN +# define U_PLATFORM_HAS_WIN32_API 1 +#else +# define U_PLATFORM_HAS_WIN32_API 0 +#endif + +/** + * \def U_PLATFORM_IMPLEMENTS_POSIX + * Defines whether the platform implements (most of) the POSIX API. + * Set to 1 for Cygwin and most other platforms. + * @internal + */ +#ifdef U_PLATFORM_IMPLEMENTS_POSIX + /* Use the predefined value. */ +#elif U_PLATFORM_USES_ONLY_WIN32_API +# define U_PLATFORM_IMPLEMENTS_POSIX 0 +#else +# define U_PLATFORM_IMPLEMENTS_POSIX 1 +#endif + +/** + * \def U_PLATFORM_IS_LINUX_BASED + * Defines whether the platform is Linux or one of its derivatives. + * @internal + */ +#ifdef U_PLATFORM_IS_LINUX_BASED + /* Use the predefined value. */ +#elif U_PF_LINUX <= U_PLATFORM && U_PLATFORM <= U_PF_ANDROID +# define U_PLATFORM_IS_LINUX_BASED 1 +#else +# define U_PLATFORM_IS_LINUX_BASED 0 +#endif + +/** + * \def U_PLATFORM_IS_DARWIN_BASED + * Defines whether the platform is Darwin or one of its derivatives. + * @internal + */ +#ifdef U_PLATFORM_IS_DARWIN_BASED + /* Use the predefined value. */ +#elif U_PF_DARWIN <= U_PLATFORM && U_PLATFORM <= U_PF_IPHONE +# define U_PLATFORM_IS_DARWIN_BASED 1 +#else +# define U_PLATFORM_IS_DARWIN_BASED 0 +#endif + +/** + * \def U_HAVE_STDINT_H + * Defines whether stdint.h is available. It is a C99 standard header. + * We used to include inttypes.h which includes stdint.h but we usually do not need + * the additional definitions from inttypes.h. + * @internal + */ +#ifdef U_HAVE_STDINT_H + /* Use the predefined value. */ +#elif U_PLATFORM_USES_ONLY_WIN32_API +# if defined(__BORLANDC__) || U_PLATFORM == U_PF_MINGW || (defined(_MSC_VER) && _MSC_VER>=1600) + /* Windows Visual Studio 9 and below do not have stdint.h & inttypes.h, but VS 2010 adds them. */ +# define U_HAVE_STDINT_H 1 +# else +# define U_HAVE_STDINT_H 0 +# endif +#elif U_PLATFORM == U_PF_SOLARIS + /* Solaris has inttypes.h but not stdint.h. */ +# define U_HAVE_STDINT_H 0 +#elif U_PLATFORM == U_PF_AIX && !defined(_AIX51) && defined(_POWER) + /* PPC AIX <= 4.3 has inttypes.h but not stdint.h. */ +# define U_HAVE_STDINT_H 0 +#else +# define U_HAVE_STDINT_H 1 +#endif + +/** + * \def U_HAVE_INTTYPES_H + * Defines whether inttypes.h is available. It is a C99 standard header. + * We include inttypes.h where it is available but stdint.h is not. + * @internal + */ +#ifdef U_HAVE_INTTYPES_H + /* Use the predefined value. */ +#elif U_PLATFORM == U_PF_SOLARIS + /* Solaris has inttypes.h but not stdint.h. */ +# define U_HAVE_INTTYPES_H 1 +#elif U_PLATFORM == U_PF_AIX && !defined(_AIX51) && defined(_POWER) + /* PPC AIX <= 4.3 has inttypes.h but not stdint.h. */ +# define U_HAVE_INTTYPES_H 1 +#else + /* Most platforms have both inttypes.h and stdint.h, or neither. */ +# define U_HAVE_INTTYPES_H U_HAVE_STDINT_H +#endif + +/** + * \def U_IOSTREAM_SOURCE + * Defines what support for C++ streams is available. + * + * If U_IOSTREAM_SOURCE is set to 199711, then <iostream> is available + * (the ISO/IEC C++ FDIS was published in November 1997), and then + * one should qualify streams using the std namespace in ICU header + * files. + * Starting with ICU 49, this is the only supported version. + * + * If U_IOSTREAM_SOURCE is set to 198506, then <iostream.h> is + * available instead (in June 1985 Stroustrup published + * "An Extensible I/O Facility for C++" at the summer USENIX conference). + * Starting with ICU 49, this version is not supported any more. + * + * If U_IOSTREAM_SOURCE is 0 (or any value less than 199711), + * then C++ streams are not available and + * support for them will be silently suppressed in ICU. + * + * @internal + */ +#ifndef U_IOSTREAM_SOURCE +#define U_IOSTREAM_SOURCE 199711 +#endif + +/** + * \def U_HAVE_STD_STRING + * Defines whether the standard C++ (STL) <string> header is available. + * @internal + */ +#ifdef U_HAVE_STD_STRING + /* Use the predefined value. */ +#else +# define U_HAVE_STD_STRING 1 +#endif + +/*===========================================================================*/ +/** @{ Compiler and environment features */ +/*===========================================================================*/ + +/** + * \def U_GCC_MAJOR_MINOR + * Indicates whether the compiler is gcc (test for != 0), + * and if so, contains its major (times 100) and minor version numbers. + * If the compiler is not gcc, then U_GCC_MAJOR_MINOR == 0. + * + * For example, for testing for whether we have gcc, and whether it's 4.6 or higher, + * use "#if U_GCC_MAJOR_MINOR >= 406". + * @internal + */ +#ifdef __GNUC__ +# define U_GCC_MAJOR_MINOR (__GNUC__ * 100 + __GNUC_MINOR__) +#else +# define U_GCC_MAJOR_MINOR 0 +#endif + +/** + * \def U_IS_BIG_ENDIAN + * Determines the endianness of the platform. + * @internal + */ +#ifdef U_IS_BIG_ENDIAN + /* Use the predefined value. */ +#elif defined(BYTE_ORDER) && defined(BIG_ENDIAN) +# define U_IS_BIG_ENDIAN (BYTE_ORDER == BIG_ENDIAN) +#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) + /* gcc */ +# define U_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#elif defined(__BIG_ENDIAN__) || defined(_BIG_ENDIAN) +# define U_IS_BIG_ENDIAN 1 +#elif defined(__LITTLE_ENDIAN__) || defined(_LITTLE_ENDIAN) +# define U_IS_BIG_ENDIAN 0 +#elif U_PLATFORM == U_PF_OS390 || U_PLATFORM == U_PF_OS400 || defined(__s390__) || defined(__s390x__) + /* These platforms do not appear to predefine any endianness macros. */ +# define U_IS_BIG_ENDIAN 1 +#elif defined(_PA_RISC1_0) || defined(_PA_RISC1_1) || defined(_PA_RISC2_0) + /* HPPA do not appear to predefine any endianness macros. */ +# define U_IS_BIG_ENDIAN 1 +#elif defined(sparc) || defined(__sparc) || defined(__sparc__) + /* Some sparc based systems (e.g. Linux) do not predefine any endianness macros. */ +# define U_IS_BIG_ENDIAN 1 +#else +# define U_IS_BIG_ENDIAN 0 +#endif + +/** + * \def U_HAVE_PLACEMENT_NEW + * Determines whether to override placement new and delete for STL. + * @stable ICU 2.6 + */ +#ifdef U_HAVE_PLACEMENT_NEW + /* Use the predefined value. */ +#elif defined(__BORLANDC__) +# define U_HAVE_PLACEMENT_NEW 0 +#else +# define U_HAVE_PLACEMENT_NEW 1 +#endif + +/** + * \def U_HAVE_DEBUG_LOCATION_NEW + * Define this to define the MFC debug version of the operator new. + * + * @stable ICU 3.4 + */ +#ifdef U_HAVE_DEBUG_LOCATION_NEW + /* Use the predefined value. */ +#elif defined(_MSC_VER) +# define U_HAVE_DEBUG_LOCATION_NEW 1 +#else +# define U_HAVE_DEBUG_LOCATION_NEW 0 +#endif + +/* Compatibility with non clang compilers */ +#ifndef __has_attribute +# define __has_attribute(x) 0 +#endif + +/** + * \def U_MALLOC_ATTR + * Attribute to mark functions as malloc-like + * @internal + */ +#if defined(__GNUC__) && __GNUC__>=3 +# define U_MALLOC_ATTR __attribute__ ((__malloc__)) +#else +# define U_MALLOC_ATTR +#endif + +/** + * \def U_ALLOC_SIZE_ATTR + * Attribute to specify the size of the allocated buffer for malloc-like functions + * @internal + */ +#if (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))) || __has_attribute(alloc_size) +# define U_ALLOC_SIZE_ATTR(X) __attribute__ ((alloc_size(X))) +# define U_ALLOC_SIZE_ATTR2(X,Y) __attribute__ ((alloc_size(X,Y))) +#else +# define U_ALLOC_SIZE_ATTR(X) +# define U_ALLOC_SIZE_ATTR2(X,Y) +#endif + +/** @} */ + +/*===========================================================================*/ +/** @{ Character data types */ +/*===========================================================================*/ + +/** + * U_CHARSET_FAMILY is equal to this value when the platform is an ASCII based platform. + * @stable ICU 2.0 + */ +#define U_ASCII_FAMILY 0 + +/** + * U_CHARSET_FAMILY is equal to this value when the platform is an EBCDIC based platform. + * @stable ICU 2.0 + */ +#define U_EBCDIC_FAMILY 1 + +/** + * \def U_CHARSET_FAMILY + * + * <p>These definitions allow to specify the encoding of text + * in the char data type as defined by the platform and the compiler. + * It is enough to determine the code point values of "invariant characters", + * which are the ones shared by all encodings that are in use + * on a given platform.</p> + * + * <p>Those "invariant characters" should be all the uppercase and lowercase + * latin letters, the digits, the space, and "basic punctuation". + * Also, '\\n', '\\r', '\\t' should be available.</p> + * + * <p>The list of "invariant characters" is:<br> + * \code + * A-Z a-z 0-9 SPACE " % & ' ( ) * + , - . / : ; < = > ? _ + * \endcode + * <br> + * (52 letters + 10 numbers + 20 punc/sym/space = 82 total)</p> + * + * <p>This matches the IBM Syntactic Character Set (CS 640).</p> + * + * <p>In other words, all the graphic characters in 7-bit ASCII should + * be safely accessible except the following:</p> + * + * \code + * '\' <backslash> + * '[' <left bracket> + * ']' <right bracket> + * '{' <left brace> + * '}' <right brace> + * '^' <circumflex> + * '~' <tilde> + * '!' <exclamation mark> + * '#' <number sign> + * '|' <vertical line> + * '$' <dollar sign> + * '@' <commercial at> + * '`' <grave accent> + * \endcode + * @stable ICU 2.0 + */ +#ifdef U_CHARSET_FAMILY + /* Use the predefined value. */ +#elif U_PLATFORM == U_PF_OS390 && (!defined(__CHARSET_LIB) || !__CHARSET_LIB) +# define U_CHARSET_FAMILY U_EBCDIC_FAMILY +#elif U_PLATFORM == U_PF_OS400 && !defined(__UTF32__) +# define U_CHARSET_FAMILY U_EBCDIC_FAMILY +#else +# define U_CHARSET_FAMILY U_ASCII_FAMILY +#endif + +/** + * \def U_CHARSET_IS_UTF8 + * + * Hardcode the default charset to UTF-8. + * + * If this is set to 1, then + * - ICU will assume that all non-invariant char*, StringPiece, std::string etc. + * contain UTF-8 text, regardless of what the system API uses + * - some ICU code will use fast functions like u_strFromUTF8() + * rather than the more general and more heavy-weight conversion API (ucnv.h) + * - ucnv_getDefaultName() always returns "UTF-8" + * - ucnv_setDefaultName() is disabled and will not change the default charset + * - static builds of ICU are smaller + * - more functionality is available with the UCONFIG_NO_CONVERSION build-time + * configuration option (see unicode/uconfig.h) + * - the UCONFIG_NO_CONVERSION build option in uconfig.h is more usable + * + * @stable ICU 4.2 + * @see UCONFIG_NO_CONVERSION + */ +#ifdef U_CHARSET_IS_UTF8 + /* Use the predefined value. */ +#elif U_PLATFORM == U_PF_ANDROID || U_PLATFORM_IS_DARWIN_BASED +# define U_CHARSET_IS_UTF8 1 +#else +# define U_CHARSET_IS_UTF8 0 +#endif + +/** @} */ + +/*===========================================================================*/ +/** @{ Information about wchar support */ +/*===========================================================================*/ + +/** + * \def U_HAVE_WCHAR_H + * Indicates whether <wchar.h> is available (1) or not (0). Set to 1 by default. + * + * @stable ICU 2.0 + */ +#ifdef U_HAVE_WCHAR_H + /* Use the predefined value. */ +#elif U_PLATFORM == U_PF_ANDROID && __ANDROID_API__ < 9 + /* + * Android before Gingerbread (Android 2.3, API level 9) did not support wchar_t. + * The type and header existed, but the library functions did not work as expected. + * The size of wchar_t was 1 but L"xyz" string literals had 32-bit units anyway. + */ +# define U_HAVE_WCHAR_H 0 +#else +# define U_HAVE_WCHAR_H 1 +#endif + +/** + * \def U_SIZEOF_WCHAR_T + * U_SIZEOF_WCHAR_T==sizeof(wchar_t) + * + * @stable ICU 2.0 + */ +#ifdef U_SIZEOF_WCHAR_T + /* Use the predefined value. */ +#elif (U_PLATFORM == U_PF_ANDROID && __ANDROID_API__ < 9) + /* + * Classic Mac OS and Mac OS X before 10.3 (Panther) did not support wchar_t or wstring. + * Newer Mac OS X has size 4. + */ +# define U_SIZEOF_WCHAR_T 1 +#elif U_PLATFORM_HAS_WIN32_API || U_PLATFORM == U_PF_CYGWIN +# define U_SIZEOF_WCHAR_T 2 +#elif U_PLATFORM == U_PF_AIX + /* + * AIX 6.1 information, section "Wide character data representation": + * "... the wchar_t datatype is 32-bit in the 64-bit environment and + * 16-bit in the 32-bit environment." + * and + * "All locales use Unicode for their wide character code values (process code), + * except the IBM-eucTW codeset." + */ +# ifdef __64BIT__ +# define U_SIZEOF_WCHAR_T 4 +# else +# define U_SIZEOF_WCHAR_T 2 +# endif +#elif U_PLATFORM == U_PF_OS390 + /* + * z/OS V1R11 information center, section "LP64 | ILP32": + * "In 31-bit mode, the size of long and pointers is 4 bytes and the size of wchar_t is 2 bytes. + * Under LP64, the size of long and pointer is 8 bytes and the size of wchar_t is 4 bytes." + */ +# ifdef _LP64 +# define U_SIZEOF_WCHAR_T 4 +# else +# define U_SIZEOF_WCHAR_T 2 +# endif +#elif U_PLATFORM == U_PF_OS400 +# if defined(__UTF32__) + /* + * LOCALETYPE(*LOCALEUTF) is specified. + * Wide-character strings are in UTF-32, + * narrow-character strings are in UTF-8. + */ +# define U_SIZEOF_WCHAR_T 4 +# elif defined(__UCS2__) + /* + * LOCALETYPE(*LOCALEUCS2) is specified. + * Wide-character strings are in UCS-2, + * narrow-character strings are in EBCDIC. + */ +# define U_SIZEOF_WCHAR_T 2 +#else + /* + * LOCALETYPE(*CLD) or LOCALETYPE(*LOCALE) is specified. + * Wide-character strings are in 16-bit EBCDIC, + * narrow-character strings are in EBCDIC. + */ +# define U_SIZEOF_WCHAR_T 2 +# endif +#else +# define U_SIZEOF_WCHAR_T 4 +#endif + +#ifndef U_HAVE_WCSCPY +#define U_HAVE_WCSCPY U_HAVE_WCHAR_H +#endif + +/** @} */ + +/** + * \def U_HAVE_CHAR16_T + * Defines whether the char16_t type is available for UTF-16 + * and u"abc" UTF-16 string literals are supported. + * This is a new standard type and standard string literal syntax in C++0x + * but has been available in some compilers before. + * @internal + */ +#ifdef U_HAVE_CHAR16_T + /* Use the predefined value. */ +#else + /* + * Notes: + * Visual Studio 10 (_MSC_VER>=1600) defines char16_t but + * does not support u"abc" string literals. + * gcc 4.4 defines the __CHAR16_TYPE__ macro to a usable type but + * does not support u"abc" string literals. + * C++11 and C11 require support for UTF-16 literals + */ +# if (defined(__cplusplus) && __cplusplus >= 201103L) || (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) +# define U_HAVE_CHAR16_T 1 +# else +# define U_HAVE_CHAR16_T 0 +# endif +#endif + +/** + * @{ + * \def U_DECLARE_UTF16 + * Do not use this macro because it is not defined on all platforms. + * Use the UNICODE_STRING or U_STRING_DECL macros instead. + * @internal + */ +#ifdef U_DECLARE_UTF16 + /* Use the predefined value. */ +#elif U_HAVE_CHAR16_T \ + || (defined(__xlC__) && defined(__IBM_UTF_LITERAL) && U_SIZEOF_WCHAR_T != 2) \ + || (defined(__HP_aCC) && __HP_aCC >= 035000) \ + || (defined(__HP_cc) && __HP_cc >= 111106) +# define U_DECLARE_UTF16(string) u ## string +#elif U_SIZEOF_WCHAR_T == 2 \ + && (U_CHARSET_FAMILY == 0 || (U_PF_OS390 <= U_PLATFORM && U_PLATFORM <= U_PF_OS400 && defined(__UCS2__))) +# define U_DECLARE_UTF16(string) L ## string +#else + /* Leave U_DECLARE_UTF16 undefined. See unistr.h. */ +#endif + +/** @} */ + +/*===========================================================================*/ +/** @{ Symbol import-export control */ +/*===========================================================================*/ + +#ifdef U_EXPORT + /* Use the predefined value. */ +#elif defined(U_STATIC_IMPLEMENTATION) +# define U_EXPORT +#elif defined(__GNUC__) +# define U_EXPORT __attribute__((visibility("default"))) +#elif (defined(__SUNPRO_CC) && __SUNPRO_CC >= 0x550) \ + || (defined(__SUNPRO_C) && __SUNPRO_C >= 0x550) +# define U_EXPORT __global +/*#elif defined(__HP_aCC) || defined(__HP_cc) +# define U_EXPORT __declspec(dllexport)*/ +#elif defined(_MSC_VER) +# define U_EXPORT __declspec(dllexport) +#else +# define U_EXPORT +#endif + +/* U_CALLCONV is releated to U_EXPORT2 */ +#ifdef U_EXPORT2 + /* Use the predefined value. */ +#elif defined(_MSC_VER) +# define U_EXPORT2 __cdecl +#else +# define U_EXPORT2 +#endif + +#ifdef U_IMPORT + /* Use the predefined value. */ +#elif defined(_MSC_VER) + /* Windows needs to export/import data. */ +# define U_IMPORT __declspec(dllimport) +#else +# define U_IMPORT +#endif + +/** + * \def U_CALLCONV + * Similar to U_CDECL_BEGIN/U_CDECL_END, this qualifier is necessary + * in callback function typedefs to make sure that the calling convention + * is compatible. + * + * This is only used for non-ICU-API functions. + * When a function is a public ICU API, + * you must use the U_CAPI and U_EXPORT2 qualifiers. + * @stable ICU 2.0 + */ +#if U_PLATFORM == U_PF_OS390 && defined(__cplusplus) +# define U_CALLCONV __cdecl +#else +# define U_CALLCONV U_EXPORT2 +#endif + +/* @} */ + +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/ptypes.h b/src/core/basetypes/icu-ucsdet/include/unicode/ptypes.h new file mode 100644 index 00000000..b7f71160 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/ptypes.h @@ -0,0 +1,126 @@ +/* +****************************************************************************** +* +* Copyright (C) 1997-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* +* FILE NAME : ptypes.h +* +* Date Name Description +* 05/13/98 nos Creation (content moved here from ptypes.h). +* 03/02/99 stephen Added AS400 support. +* 03/30/99 stephen Added Linux support. +* 04/13/99 stephen Reworked for autoconf. +* 09/18/08 srl Moved basic types back to ptypes.h from platform.h +****************************************************************************** +*/ + +/** + * \file + * \brief C API: Definitions of integer types of various widths + */ + +#ifndef _PTYPES_H +#define _PTYPES_H + +/** + * \def __STDC_LIMIT_MACROS + * According to the Linux stdint.h, the ISO C99 standard specifies that in C++ implementations + * macros like INT32_MIN and UINTPTR_MAX should only be defined if explicitly requested. + * We need to define __STDC_LIMIT_MACROS before including stdint.h in C++ code + * that uses such limit macros. + * @internal + */ +#ifndef __STDC_LIMIT_MACROS +#define __STDC_LIMIT_MACROS +#endif + +/* NULL, size_t, wchar_t */ +#include <stddef.h> + +/* + * If all compilers provided all of the C99 headers and types, + * we would just unconditionally #include <stdint.h> here + * and not need any of the stuff after including platform.h. + */ + +/* Find out if we have stdint.h etc. */ +#include "unicode/platform.h" + +/*===========================================================================*/ +/* Generic data types */ +/*===========================================================================*/ + +/* If your platform does not have the <stdint.h> header, you may + need to edit the typedefs in the #else section below. + Use #if...#else...#endif with predefined compiler macros if possible. */ +#if U_HAVE_STDINT_H + +/* + * We mostly need <stdint.h> (which defines the standard integer types) but not <inttypes.h>. + * <inttypes.h> includes <stdint.h> and adds the printf/scanf helpers PRId32, SCNx16 etc. + * which we almost never use, plus stuff like imaxabs() which we never use. + */ +#include <stdint.h> + +#if U_PLATFORM == U_PF_OS390 +/* The features header is needed to get (u)int64_t sometimes. */ +#include <features.h> +/* z/OS has <stdint.h>, but some versions are missing uint8_t (APAR PK62248). */ +#if !defined(__uint8_t) +#define __uint8_t 1 +typedef unsigned char uint8_t; +#endif +#endif /* U_PLATFORM == U_PF_OS390 */ + +#elif U_HAVE_INTTYPES_H + +# include <inttypes.h> + +#else /* neither U_HAVE_STDINT_H nor U_HAVE_INTTYPES_H */ + +#if ! U_HAVE_INT8_T +typedef signed char int8_t; +#endif + +#if ! U_HAVE_UINT8_T +typedef unsigned char uint8_t; +#endif + +#if ! U_HAVE_INT16_T +typedef signed short int16_t; +#endif + +#if ! U_HAVE_UINT16_T +typedef unsigned short uint16_t; +#endif + +#if ! U_HAVE_INT32_T +typedef signed int int32_t; +#endif + +#if ! U_HAVE_UINT32_T +typedef unsigned int uint32_t; +#endif + +#if ! U_HAVE_INT64_T +#ifdef _MSC_VER + typedef signed __int64 int64_t; +#else + typedef signed long long int64_t; +#endif +#endif + +#if ! U_HAVE_UINT64_T +#ifdef _MSC_VER + typedef unsigned __int64 uint64_t; +#else + typedef unsigned long long uint64_t; +#endif +#endif + +#endif /* U_HAVE_STDINT_H / U_HAVE_INTTYPES_H */ + +#endif /* _PTYPES_H */ diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/putil.h b/src/core/basetypes/icu-ucsdet/include/unicode/putil.h new file mode 100644 index 00000000..df1b17ba --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/putil.h @@ -0,0 +1,181 @@ +/* +****************************************************************************** +* +* Copyright (C) 1997-2014, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* +* FILE NAME : putil.h +* +* Date Name Description +* 05/14/98 nos Creation (content moved here from utypes.h). +* 06/17/99 erm Added IEEE_754 +* 07/22/98 stephen Added IEEEremainder, max, min, trunc +* 08/13/98 stephen Added isNegativeInfinity, isPositiveInfinity +* 08/24/98 stephen Added longBitsFromDouble +* 03/02/99 stephen Removed openFile(). Added AS400 support. +* 04/15/99 stephen Converted to C +* 11/15/99 helena Integrated S/390 changes for IEEE support. +* 01/11/00 helena Added u_getVersion. +****************************************************************************** +*/ + +#ifndef PUTIL_H +#define PUTIL_H + +#include "unicode/utypes.h" + /** + * \file + * \brief C API: Platform Utilities + */ + +/*==========================================================================*/ +/* Platform utilities */ +/*==========================================================================*/ + +/** + * Platform utilities isolates the platform dependencies of the + * libarary. For each platform which this code is ported to, these + * functions may have to be re-implemented. + */ + +/** + * Return the ICU data directory. + * The data directory is where common format ICU data files (.dat files) + * are loaded from. Note that normal use of the built-in ICU + * facilities does not require loading of an external data file; + * unless you are adding custom data to ICU, the data directory + * does not need to be set. + * + * The data directory is determined as follows: + * If u_setDataDirectory() has been called, that is it, otherwise + * if the ICU_DATA environment variable is set, use that, otherwise + * If a data directory was specifed at ICU build time + * <code> + * \code + * #define ICU_DATA_DIR "path" + * \endcode + * </code> use that, + * otherwise no data directory is available. + * + * @return the data directory, or an empty string ("") if no data directory has + * been specified. + * + * @stable ICU 2.0 + */ +U_STABLE const char* U_EXPORT2 u_getDataDirectory(void); + + +/** + * Set the ICU data directory. + * The data directory is where common format ICU data files (.dat files) + * are loaded from. Note that normal use of the built-in ICU + * facilities does not require loading of an external data file; + * unless you are adding custom data to ICU, the data directory + * does not need to be set. + * + * This function should be called at most once in a process, before the + * first ICU operation (e.g., u_init()) that will require the loading of an + * ICU data file. + * This function is not thread-safe. Use it before calling ICU APIs from + * multiple threads. + * + * @param directory The directory to be set. + * + * @see u_init + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 u_setDataDirectory(const char *directory); + +#ifndef U_HIDE_INTERNAL_API +/** + * Return the time zone files override directory, or an empty string if + * no directory was specified. Certain time zone resources will be preferrentially + * loaded from individual files in this directory. + * + * @return the time zone data override directory. + * @internal + */ +U_INTERNAL const char * U_EXPORT2 u_getTimeZoneFilesDirectory(UErrorCode *status); + +/** + * Set the time zone files override directory. + * This function is not thread safe; it must not be called concurrently with + * u_getTimeZoneFilesDirectory() or any other use of ICU time zone functions. + * This function should only be called before using any ICU service that + * will access the time zone data. + * @internal + */ +U_INTERNAL void U_EXPORT2 u_setTimeZoneFilesDirectory(const char *path, UErrorCode *status); +#endif /* U_HIDE_INTERNAL_API */ + + +/** + * @{ + * Filesystem file and path separator characters. + * Example: '/' and ':' on Unix, '\\' and ';' on Windows. + * @stable ICU 2.0 + */ +#if U_PLATFORM_USES_ONLY_WIN32_API +# define U_FILE_SEP_CHAR '\\' +# define U_FILE_ALT_SEP_CHAR '/' +# define U_PATH_SEP_CHAR ';' +# define U_FILE_SEP_STRING "\\" +# define U_FILE_ALT_SEP_STRING "/" +# define U_PATH_SEP_STRING ";" +#else +# define U_FILE_SEP_CHAR '/' +# define U_FILE_ALT_SEP_CHAR '/' +# define U_PATH_SEP_CHAR ':' +# define U_FILE_SEP_STRING "/" +# define U_FILE_ALT_SEP_STRING "/" +# define U_PATH_SEP_STRING ":" +#endif + +/** @} */ + +/** + * Convert char characters to UChar characters. + * This utility function is useful only for "invariant characters" + * that are encoded in the platform default encoding. + * They are a small, constant subset of the encoding and include + * just the latin letters, digits, and some punctuation. + * For details, see U_CHARSET_FAMILY. + * + * @param cs Input string, points to <code>length</code> + * character bytes from a subset of the platform encoding. + * @param us Output string, points to memory for <code>length</code> + * Unicode characters. + * @param length The number of characters to convert; this may + * include the terminating <code>NUL</code>. + * + * @see U_CHARSET_FAMILY + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 +u_charsToUChars(const char *cs, UChar *us, int32_t length); + +/** + * Convert UChar characters to char characters. + * This utility function is useful only for "invariant characters" + * that can be encoded in the platform default encoding. + * They are a small, constant subset of the encoding and include + * just the latin letters, digits, and some punctuation. + * For details, see U_CHARSET_FAMILY. + * + * @param us Input string, points to <code>length</code> + * Unicode characters that can be encoded with the + * codepage-invariant subset of the platform encoding. + * @param cs Output string, points to memory for <code>length</code> + * character bytes. + * @param length The number of characters to convert; this may + * include the terminating <code>NUL</code>. + * + * @see U_CHARSET_FAMILY + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 +u_UCharsToChars(const UChar *us, char *cs, int32_t length); + +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/rep.h b/src/core/basetypes/icu-ucsdet/include/unicode/rep.h new file mode 100644 index 00000000..4c7eae14 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/rep.h @@ -0,0 +1,261 @@ +/* +************************************************************************** +* Copyright (C) 1999-2012, International Business Machines Corporation and +* others. All Rights Reserved. +************************************************************************** +* Date Name Description +* 11/17/99 aliu Creation. Ported from java. Modified to +* match current UnicodeString API. Forced +* to use name "handleReplaceBetween" because +* of existing methods in UnicodeString. +************************************************************************** +*/ + +#ifndef REP_H +#define REP_H + +#include "unicode/uobject.h" + +/** + * \file + * \brief C++ API: Replaceable String + */ + +U_NAMESPACE_BEGIN + +class UnicodeString; + +/** + * <code>Replaceable</code> is an abstract base class representing a + * string of characters that supports the replacement of a range of + * itself with a new string of characters. It is used by APIs that + * change a piece of text while retaining metadata. Metadata is data + * other than the Unicode characters returned by char32At(). One + * example of metadata is style attributes; another is an edit + * history, marking each character with an author and revision number. + * + * <p>An implicit aspect of the <code>Replaceable</code> API is that + * during a replace operation, new characters take on the metadata of + * the old characters. For example, if the string "the <b>bold</b> + * font" has range (4, 8) replaced with "strong", then it becomes "the + * <b>strong</b> font". + * + * <p><code>Replaceable</code> specifies ranges using a start + * offset and a limit offset. The range of characters thus specified + * includes the characters at offset start..limit-1. That is, the + * start offset is inclusive, and the limit offset is exclusive. + * + * <p><code>Replaceable</code> also includes API to access characters + * in the string: <code>length()</code>, <code>charAt()</code>, + * <code>char32At()</code>, and <code>extractBetween()</code>. + * + * <p>For a subclass to support metadata, typical behavior of + * <code>replace()</code> is the following: + * <ul> + * <li>Set the metadata of the new text to the metadata of the first + * character replaced</li> + * <li>If no characters are replaced, use the metadata of the + * previous character</li> + * <li>If there is no previous character (i.e. start == 0), use the + * following character</li> + * <li>If there is no following character (i.e. the replaceable was + * empty), use default metadata.<br> + * <li>If the code point U+FFFF is seen, it should be interpreted as + * a special marker having no metadata<li> + * </li> + * </ul> + * If this is not the behavior, the subclass should document any differences. + * @author Alan Liu + * @stable ICU 2.0 + */ +class U_COMMON_API Replaceable : public UObject { + +public: + /** + * Destructor. + * @stable ICU 2.0 + */ + virtual ~Replaceable(); + + /** + * Returns the number of 16-bit code units in the text. + * @return number of 16-bit code units in text + * @stable ICU 1.8 + */ + inline int32_t length() const; + + /** + * Returns the 16-bit code unit at the given offset into the text. + * @param offset an integer between 0 and <code>length()</code>-1 + * inclusive + * @return 16-bit code unit of text at given offset + * @stable ICU 1.8 + */ + inline UChar charAt(int32_t offset) const; + + /** + * Returns the 32-bit code point at the given 16-bit offset into + * the text. This assumes the text is stored as 16-bit code units + * with surrogate pairs intermixed. If the offset of a leading or + * trailing code unit of a surrogate pair is given, return the + * code point of the surrogate pair. + * + * @param offset an integer between 0 and <code>length()</code>-1 + * inclusive + * @return 32-bit code point of text at given offset + * @stable ICU 1.8 + */ + inline UChar32 char32At(int32_t offset) const; + + /** + * Copies characters in the range [<tt>start</tt>, <tt>limit</tt>) + * into the UnicodeString <tt>target</tt>. + * @param start offset of first character which will be copied + * @param limit offset immediately following the last character to + * be copied + * @param target UnicodeString into which to copy characters. + * @return A reference to <TT>target</TT> + * @stable ICU 2.1 + */ + virtual void extractBetween(int32_t start, + int32_t limit, + UnicodeString& target) const = 0; + + /** + * Replaces a substring of this object with the given text. If the + * characters being replaced have metadata, the new characters + * that replace them should be given the same metadata. + * + * <p>Subclasses must ensure that if the text between start and + * limit is equal to the replacement text, that replace has no + * effect. That is, any metadata + * should be unaffected. In addition, subclasses are encouraged to + * check for initial and trailing identical characters, and make a + * smaller replacement if possible. This will preserve as much + * metadata as possible. + * @param start the beginning index, inclusive; <code>0 <= start + * <= limit</code>. + * @param limit the ending index, exclusive; <code>start <= limit + * <= length()</code>. + * @param text the text to replace characters <code>start</code> + * to <code>limit - 1</code> + * @stable ICU 2.0 + */ + virtual void handleReplaceBetween(int32_t start, + int32_t limit, + const UnicodeString& text) = 0; + // Note: All other methods in this class take the names of + // existing UnicodeString methods. This method is the exception. + // It is named differently because all replace methods of + // UnicodeString return a UnicodeString&. The 'between' is + // required in order to conform to the UnicodeString naming + // convention; API taking start/length are named <operation>, and + // those taking start/limit are named <operationBetween>. The + // 'handle' is added because 'replaceBetween' and + // 'doReplaceBetween' are already taken. + + /** + * Copies a substring of this object, retaining metadata. + * This method is used to duplicate or reorder substrings. + * The destination index must not overlap the source range. + * + * @param start the beginning index, inclusive; <code>0 <= start <= + * limit</code>. + * @param limit the ending index, exclusive; <code>start <= limit <= + * length()</code>. + * @param dest the destination index. The characters from + * <code>start..limit-1</code> will be copied to <code>dest</code>. + * Implementations of this method may assume that <code>dest <= start || + * dest >= limit</code>. + * @stable ICU 2.0 + */ + virtual void copy(int32_t start, int32_t limit, int32_t dest) = 0; + + /** + * Returns true if this object contains metadata. If a + * Replaceable object has metadata, calls to the Replaceable API + * must be made so as to preserve metadata. If it does not, calls + * to the Replaceable API may be optimized to improve performance. + * The default implementation returns true. + * @return true if this object contains metadata + * @stable ICU 2.2 + */ + virtual UBool hasMetaData() const; + + /** + * Clone this object, an instance of a subclass of Replaceable. + * Clones can be used concurrently in multiple threads. + * If a subclass does not implement clone(), or if an error occurs, + * then NULL is returned. + * The clone functions in all subclasses return a pointer to a Replaceable + * because some compilers do not support covariant (same-as-this) + * return types; cast to the appropriate subclass if necessary. + * The caller must delete the clone. + * + * @return a clone of this object + * + * @see getDynamicClassID + * @stable ICU 2.6 + */ + virtual Replaceable *clone() const; + +protected: + + /** + * Default constructor. + * @stable ICU 2.4 + */ + inline Replaceable(); + + /* + * Assignment operator not declared. The compiler will provide one + * which does nothing since this class does not contain any data members. + * API/code coverage may show the assignment operator as present and + * untested - ignore. + * Subclasses need this assignment operator if they use compiler-provided + * assignment operators of their own. An alternative to not declaring one + * here would be to declare and empty-implement a protected or public one. + Replaceable &Replaceable::operator=(const Replaceable &); + */ + + /** + * Virtual version of length(). + * @stable ICU 2.4 + */ + virtual int32_t getLength() const = 0; + + /** + * Virtual version of charAt(). + * @stable ICU 2.4 + */ + virtual UChar getCharAt(int32_t offset) const = 0; + + /** + * Virtual version of char32At(). + * @stable ICU 2.4 + */ + virtual UChar32 getChar32At(int32_t offset) const = 0; +}; + +inline Replaceable::Replaceable() {} + +inline int32_t +Replaceable::length() const { + return getLength(); +} + +inline UChar +Replaceable::charAt(int32_t offset) const { + return getCharAt(offset); +} + +inline UChar32 +Replaceable::char32At(int32_t offset) const { + return getChar32At(offset); +} + +// There is no rep.cpp, see unistr.cpp for Replaceable function implementations. + +U_NAMESPACE_END + +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/std_string.h b/src/core/basetypes/icu-ucsdet/include/unicode/std_string.h new file mode 100644 index 00000000..05955c5d --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/std_string.h @@ -0,0 +1,37 @@ +/* +******************************************************************************* +* +* Copyright (C) 2009-2014, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: std_string.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2009feb19 +* created by: Markus W. Scherer +*/ + +#ifndef __STD_STRING_H__ +#define __STD_STRING_H__ + +/** + * \file + * \brief C++ API: Central ICU header for including the C++ standard <string> + * header and for related definitions. + */ + +#include "unicode/utypes.h" + +#if U_HAVE_STD_STRING + +#if !defined(_MSC_VER) +namespace std { class type_info; } // WORKAROUND: http://llvm.org/bugs/show_bug.cgi?id=13364 +#endif +#include <string> + +#endif // U_HAVE_STD_STRING + +#endif // __STD_STRING_H__ diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/strenum.h b/src/core/basetypes/icu-ucsdet/include/unicode/strenum.h new file mode 100644 index 00000000..3dbe21c6 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/strenum.h @@ -0,0 +1,276 @@ +/* +******************************************************************************* +* +* Copyright (C) 2002-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +*/ + +#ifndef STRENUM_H +#define STRENUM_H + +#include "unicode/uobject.h" +#include "unicode/unistr.h" + +/** + * \file + * \brief C++ API: String Enumeration + */ + +U_NAMESPACE_BEGIN + +/** + * Base class for 'pure' C++ implementations of uenum api. Adds a + * method that returns the next UnicodeString since in C++ this can + * be a common storage format for strings. + * + * <p>The model is that the enumeration is over strings maintained by + * a 'service.' At any point, the service might change, invalidating + * the enumerator (though this is expected to be rare). The iterator + * returns an error if this has occurred. Lack of the error is no + * guarantee that the service didn't change immediately after the + * call, so the returned string still might not be 'valid' on + * subsequent use.</p> + * + * <p>Strings may take the form of const char*, const UChar*, or const + * UnicodeString*. The type you get is determine by the variant of + * 'next' that you call. In general the StringEnumeration is + * optimized for one of these types, but all StringEnumerations can + * return all types. Returned strings are each terminated with a NUL. + * Depending on the service data, they might also include embedded NUL + * characters, so API is provided to optionally return the true + * length, counting the embedded NULs but not counting the terminating + * NUL.</p> + * + * <p>The pointers returned by next, unext, and snext become invalid + * upon any subsequent call to the enumeration's destructor, next, + * unext, snext, or reset.</p> + * + * ICU 2.8 adds some default implementations and helper functions + * for subclasses. + * + * @stable ICU 2.4 + */ +class U_COMMON_API StringEnumeration : public UObject { +public: + /** + * Destructor. + * @stable ICU 2.4 + */ + virtual ~StringEnumeration(); + + /** + * Clone this object, an instance of a subclass of StringEnumeration. + * Clones can be used concurrently in multiple threads. + * If a subclass does not implement clone(), or if an error occurs, + * then NULL is returned. + * The clone functions in all subclasses return a base class pointer + * because some compilers do not support covariant (same-as-this) + * return types; cast to the appropriate subclass if necessary. + * The caller must delete the clone. + * + * @return a clone of this object + * + * @see getDynamicClassID + * @stable ICU 2.8 + */ + virtual StringEnumeration *clone() const; + + /** + * <p>Return the number of elements that the iterator traverses. If + * the iterator is out of sync with its service, status is set to + * U_ENUM_OUT_OF_SYNC_ERROR, and the return value is zero.</p> + * + * <p>The return value will not change except possibly as a result of + * a subsequent call to reset, or if the iterator becomes out of sync.</p> + * + * <p>This is a convenience function. It can end up being very + * expensive as all the items might have to be pre-fetched + * (depending on the storage format of the data being + * traversed).</p> + * + * @param status the error code. + * @return number of elements in the iterator. + * + * @stable ICU 2.4 */ + virtual int32_t count(UErrorCode& status) const = 0; + + /** + * <p>Returns the next element as a NUL-terminated char*. If there + * are no more elements, returns NULL. If the resultLength pointer + * is not NULL, the length of the string (not counting the + * terminating NUL) is returned at that address. If an error + * status is returned, the value at resultLength is undefined.</p> + * + * <p>The returned pointer is owned by this iterator and must not be + * deleted by the caller. The pointer is valid until the next call + * to next, unext, snext, reset, or the enumerator's destructor.</p> + * + * <p>If the iterator is out of sync with its service, status is set + * to U_ENUM_OUT_OF_SYNC_ERROR and NULL is returned.</p> + * + * <p>If the native service string is a UChar* string, it is + * converted to char* with the invariant converter. If the + * conversion fails (because a character cannot be converted) then + * status is set to U_INVARIANT_CONVERSION_ERROR and the return + * value is undefined (though not NULL).</p> + * + * Starting with ICU 2.8, the default implementation calls snext() + * and handles the conversion. + * Either next() or snext() must be implemented differently by a subclass. + * + * @param status the error code. + * @param resultLength a pointer to receive the length, can be NULL. + * @return a pointer to the string, or NULL. + * + * @stable ICU 2.4 + */ + virtual const char* next(int32_t *resultLength, UErrorCode& status); + + /** + * <p>Returns the next element as a NUL-terminated UChar*. If there + * are no more elements, returns NULL. If the resultLength pointer + * is not NULL, the length of the string (not counting the + * terminating NUL) is returned at that address. If an error + * status is returned, the value at resultLength is undefined.</p> + * + * <p>The returned pointer is owned by this iterator and must not be + * deleted by the caller. The pointer is valid until the next call + * to next, unext, snext, reset, or the enumerator's destructor.</p> + * + * <p>If the iterator is out of sync with its service, status is set + * to U_ENUM_OUT_OF_SYNC_ERROR and NULL is returned.</p> + * + * Starting with ICU 2.8, the default implementation calls snext() + * and handles the conversion. + * + * @param status the error code. + * @param resultLength a ponter to receive the length, can be NULL. + * @return a pointer to the string, or NULL. + * + * @stable ICU 2.4 + */ + virtual const UChar* unext(int32_t *resultLength, UErrorCode& status); + + /** + * <p>Returns the next element a UnicodeString*. If there are no + * more elements, returns NULL.</p> + * + * <p>The returned pointer is owned by this iterator and must not be + * deleted by the caller. The pointer is valid until the next call + * to next, unext, snext, reset, or the enumerator's destructor.</p> + * + * <p>If the iterator is out of sync with its service, status is set + * to U_ENUM_OUT_OF_SYNC_ERROR and NULL is returned.</p> + * + * Starting with ICU 2.8, the default implementation calls next() + * and handles the conversion. + * Either next() or snext() must be implemented differently by a subclass. + * + * @param status the error code. + * @return a pointer to the string, or NULL. + * + * @stable ICU 2.4 + */ + virtual const UnicodeString* snext(UErrorCode& status); + + /** + * <p>Resets the iterator. This re-establishes sync with the + * service and rewinds the iterator to start at the first + * element.</p> + * + * <p>Previous pointers returned by next, unext, or snext become + * invalid, and the value returned by count might change.</p> + * + * @param status the error code. + * + * @stable ICU 2.4 + */ + virtual void reset(UErrorCode& status) = 0; + + /** + * Compares this enumeration to other to check if both are equal + * + * @param that The other string enumeration to compare this object to + * @return TRUE if the enumerations are equal. FALSE if not. + * @stable ICU 3.6 + */ + virtual UBool operator==(const StringEnumeration& that)const; + /** + * Compares this enumeration to other to check if both are not equal + * + * @param that The other string enumeration to compare this object to + * @return TRUE if the enumerations are equal. FALSE if not. + * @stable ICU 3.6 + */ + virtual UBool operator!=(const StringEnumeration& that)const; + +protected: + /** + * UnicodeString field for use with default implementations and subclasses. + * @stable ICU 2.8 + */ + UnicodeString unistr; + /** + * char * default buffer for use with default implementations and subclasses. + * @stable ICU 2.8 + */ + char charsBuffer[32]; + /** + * char * buffer for use with default implementations and subclasses. + * Allocated in constructor and in ensureCharsCapacity(). + * @stable ICU 2.8 + */ + char *chars; + /** + * Capacity of chars, for use with default implementations and subclasses. + * @stable ICU 2.8 + */ + int32_t charsCapacity; + + /** + * Default constructor for use with default implementations and subclasses. + * @stable ICU 2.8 + */ + StringEnumeration(); + + /** + * Ensures that chars is at least as large as the requested capacity. + * For use with default implementations and subclasses. + * + * @param capacity Requested capacity. + * @param status ICU in/out error code. + * @stable ICU 2.8 + */ + void ensureCharsCapacity(int32_t capacity, UErrorCode &status); + + /** + * Converts s to Unicode and sets unistr to the result. + * For use with default implementations and subclasses, + * especially for implementations of snext() in terms of next(). + * This is provided with a helper function instead of a default implementation + * of snext() to avoid potential infinite loops between next() and snext(). + * + * For example: + * \code + * const UnicodeString* snext(UErrorCode& status) { + * int32_t resultLength=0; + * const char *s=next(&resultLength, status); + * return setChars(s, resultLength, status); + * } + * \endcode + * + * @param s String to be converted to Unicode. + * @param length Length of the string. + * @param status ICU in/out error code. + * @return A pointer to unistr. + * @stable ICU 2.8 + */ + UnicodeString *setChars(const char *s, int32_t length, UErrorCode &status); +}; + +U_NAMESPACE_END + +/* STRENUM_H */ +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/stringpiece.h b/src/core/basetypes/icu-ucsdet/include/unicode/stringpiece.h new file mode 100644 index 00000000..b29571d4 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/stringpiece.h @@ -0,0 +1,224 @@ +// Copyright (C) 2009-2013, International Business Machines +// Corporation and others. All Rights Reserved. +// +// Copyright 2001 and onwards Google Inc. +// Author: Sanjay Ghemawat + +// This code is a contribution of Google code, and the style used here is +// a compromise between the original Google code and the ICU coding guidelines. +// For example, data types are ICU-ified (size_t,int->int32_t), +// and API comments doxygen-ified, but function names and behavior are +// as in the original, if possible. +// Assertion-style error handling, not available in ICU, was changed to +// parameter "pinning" similar to UnicodeString. +// +// In addition, this is only a partial port of the original Google code, +// limited to what was needed so far. The (nearly) complete original code +// is in the ICU svn repository at icuhtml/trunk/design/strings/contrib +// (see ICU ticket 6765, r25517). + +#ifndef __STRINGPIECE_H__ +#define __STRINGPIECE_H__ + +/** + * \file + * \brief C++ API: StringPiece: Read-only byte string wrapper class. + */ + +#include "unicode/utypes.h" +#include "unicode/uobject.h" +#include "unicode/std_string.h" + +// Arghh! I wish C++ literals were "string". + +U_NAMESPACE_BEGIN + +/** + * A string-like object that points to a sized piece of memory. + * + * We provide non-explicit singleton constructors so users can pass + * in a "const char*" or a "string" wherever a "StringPiece" is + * expected. + * + * Functions or methods may use const StringPiece& parameters to accept either + * a "const char*" or a "string" value that will be implicitly converted to + * a StringPiece. + * + * Systematic usage of StringPiece is encouraged as it will reduce unnecessary + * conversions from "const char*" to "string" and back again. + * + * @stable ICU 4.2 + */ +class U_COMMON_API StringPiece : public UMemory { + private: + const char* ptr_; + int32_t length_; + + public: + /** + * Default constructor, creates an empty StringPiece. + * @stable ICU 4.2 + */ + StringPiece() : ptr_(NULL), length_(0) { } + /** + * Constructs from a NUL-terminated const char * pointer. + * @param str a NUL-terminated const char * pointer + * @stable ICU 4.2 + */ + StringPiece(const char* str); +#if U_HAVE_STD_STRING + /** + * Constructs from a std::string. + * @stable ICU 4.2 + */ + StringPiece(const std::string& str) + : ptr_(str.data()), length_(static_cast<int32_t>(str.size())) { } +#endif + /** + * Constructs from a const char * pointer and a specified length. + * @param offset a const char * pointer (need not be terminated) + * @param len the length of the string; must be non-negative + * @stable ICU 4.2 + */ + StringPiece(const char* offset, int32_t len) : ptr_(offset), length_(len) { } + /** + * Substring of another StringPiece. + * @param x the other StringPiece + * @param pos start position in x; must be non-negative and <= x.length(). + * @stable ICU 4.2 + */ + StringPiece(const StringPiece& x, int32_t pos); + /** + * Substring of another StringPiece. + * @param x the other StringPiece + * @param pos start position in x; must be non-negative and <= x.length(). + * @param len length of the substring; + * must be non-negative and will be pinned to at most x.length() - pos. + * @stable ICU 4.2 + */ + StringPiece(const StringPiece& x, int32_t pos, int32_t len); + + /** + * Returns the string pointer. May be NULL if it is empty. + * + * data() may return a pointer to a buffer with embedded NULs, and the + * returned buffer may or may not be null terminated. Therefore it is + * typically a mistake to pass data() to a routine that expects a NUL + * terminated string. + * @return the string pointer + * @stable ICU 4.2 + */ + const char* data() const { return ptr_; } + /** + * Returns the string length. Same as length(). + * @return the string length + * @stable ICU 4.2 + */ + int32_t size() const { return length_; } + /** + * Returns the string length. Same as size(). + * @return the string length + * @stable ICU 4.2 + */ + int32_t length() const { return length_; } + /** + * Returns whether the string is empty. + * @return TRUE if the string is empty + * @stable ICU 4.2 + */ + UBool empty() const { return length_ == 0; } + + /** + * Sets to an empty string. + * @stable ICU 4.2 + */ + void clear() { ptr_ = NULL; length_ = 0; } + + /** + * Reset the stringpiece to refer to new data. + * @param xdata pointer the new string data. Need not be nul terminated. + * @param len the length of the new data + * @stable ICU 4.8 + */ + void set(const char* xdata, int32_t len) { ptr_ = xdata; length_ = len; } + + /** + * Reset the stringpiece to refer to new data. + * @param str a pointer to a NUL-terminated string. + * @stable ICU 4.8 + */ + void set(const char* str); + + /** + * Removes the first n string units. + * @param n prefix length, must be non-negative and <=length() + * @stable ICU 4.2 + */ + void remove_prefix(int32_t n) { + if (n >= 0) { + if (n > length_) { + n = length_; + } + ptr_ += n; + length_ -= n; + } + } + + /** + * Removes the last n string units. + * @param n suffix length, must be non-negative and <=length() + * @stable ICU 4.2 + */ + void remove_suffix(int32_t n) { + if (n >= 0) { + if (n <= length_) { + length_ -= n; + } else { + length_ = 0; + } + } + } + + /** + * Maximum integer, used as a default value for substring methods. + * @stable ICU 4.2 + */ + static const int32_t npos; // = 0x7fffffff; + + /** + * Returns a substring of this StringPiece. + * @param pos start position; must be non-negative and <= length(). + * @param len length of the substring; + * must be non-negative and will be pinned to at most length() - pos. + * @return the substring StringPiece + * @stable ICU 4.2 + */ + StringPiece substr(int32_t pos, int32_t len = npos) const { + return StringPiece(*this, pos, len); + } +}; + +/** + * Global operator == for StringPiece + * @param x The first StringPiece to compare. + * @param y The second StringPiece to compare. + * @return TRUE if the string data is equal + * @stable ICU 4.8 + */ +U_EXPORT UBool U_EXPORT2 +operator==(const StringPiece& x, const StringPiece& y); + +/** + * Global operator != for StringPiece + * @param x The first StringPiece to compare. + * @param y The second StringPiece to compare. + * @return TRUE if the string data is not equal + * @stable ICU 4.8 + */ +inline UBool operator!=(const StringPiece& x, const StringPiece& y) { + return !(x == y); +} + +U_NAMESPACE_END + +#endif // __STRINGPIECE_H__ diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/ucasemap.h b/src/core/basetypes/icu-ucsdet/include/unicode/ucasemap.h new file mode 100644 index 00000000..b37e1658 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/ucasemap.h @@ -0,0 +1,423 @@ +/* +******************************************************************************* +* +* Copyright (C) 2005-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: ucasemap.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2005may06 +* created by: Markus W. Scherer +* +* Case mapping service object and functions using it. +*/ + +#ifndef __UCASEMAP_H__ +#define __UCASEMAP_H__ + +#include "unicode/utypes.h" +#include "unicode/ustring.h" +#include "unicode/localpointer.h" + +/** + * \file + * \brief C API: Unicode case mapping functions using a UCaseMap service object. + * + * The service object takes care of memory allocations, data loading, and setup + * for the attributes, as usual. + * + * Currently, the functionality provided here does not overlap with uchar.h + * and ustring.h, except for ucasemap_toTitle(). + * + * ucasemap_utf8XYZ() functions operate directly on UTF-8 strings. + */ + +/** + * UCaseMap is an opaque service object for newer ICU case mapping functions. + * Older functions did not use a service object. + * @stable ICU 3.4 + */ +struct UCaseMap; +typedef struct UCaseMap UCaseMap; /**< C typedef for struct UCaseMap. @stable ICU 3.4 */ + +/** + * Open a UCaseMap service object for a locale and a set of options. + * The locale ID and options are preprocessed so that functions using the + * service object need not process them in each call. + * + * @param locale ICU locale ID, used for language-dependent + * upper-/lower-/title-casing according to the Unicode standard. + * Usual semantics: ""=root, NULL=default locale, etc. + * @param options Options bit set, used for case folding and string comparisons. + * Same flags as for u_foldCase(), u_strFoldCase(), + * u_strCaseCompare(), etc. + * Use 0 or U_FOLD_CASE_DEFAULT for default behavior. + * @param pErrorCode Must be a valid pointer to an error code value, + * which must not indicate a failure before the function call. + * @return Pointer to a UCaseMap service object, if successful. + * + * @see U_FOLD_CASE_DEFAULT + * @see U_FOLD_CASE_EXCLUDE_SPECIAL_I + * @see U_TITLECASE_NO_LOWERCASE + * @see U_TITLECASE_NO_BREAK_ADJUSTMENT + * @stable ICU 3.4 + */ +U_STABLE UCaseMap * U_EXPORT2 +ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode); + +/** + * Close a UCaseMap service object. + * @param csm Object to be closed. + * @stable ICU 3.4 + */ +U_STABLE void U_EXPORT2 +ucasemap_close(UCaseMap *csm); + +#if U_SHOW_CPLUSPLUS_API + +U_NAMESPACE_BEGIN + +/** + * \class LocalUCaseMapPointer + * "Smart pointer" class, closes a UCaseMap via ucasemap_close(). + * For most methods see the LocalPointerBase base class. + * + * @see LocalPointerBase + * @see LocalPointer + * @stable ICU 4.4 + */ +U_DEFINE_LOCAL_OPEN_POINTER(LocalUCaseMapPointer, UCaseMap, ucasemap_close); + +U_NAMESPACE_END + +#endif + +/** + * Get the locale ID that is used for language-dependent case mappings. + * @param csm UCaseMap service object. + * @return locale ID + * @stable ICU 3.4 + */ +U_STABLE const char * U_EXPORT2 +ucasemap_getLocale(const UCaseMap *csm); + +/** + * Get the options bit set that is used for case folding and string comparisons. + * @param csm UCaseMap service object. + * @return options bit set + * @stable ICU 3.4 + */ +U_STABLE uint32_t U_EXPORT2 +ucasemap_getOptions(const UCaseMap *csm); + +/** + * Set the locale ID that is used for language-dependent case mappings. + * + * @param csm UCaseMap service object. + * @param locale Locale ID, see ucasemap_open(). + * @param pErrorCode Must be a valid pointer to an error code value, + * which must not indicate a failure before the function call. + * + * @see ucasemap_open + * @stable ICU 3.4 + */ +U_STABLE void U_EXPORT2 +ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode); + +/** + * Set the options bit set that is used for case folding and string comparisons. + * + * @param csm UCaseMap service object. + * @param options Options bit set, see ucasemap_open(). + * @param pErrorCode Must be a valid pointer to an error code value, + * which must not indicate a failure before the function call. + * + * @see ucasemap_open + * @stable ICU 3.4 + */ +U_STABLE void U_EXPORT2 +ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode); + +/** + * Do not lowercase non-initial parts of words when titlecasing. + * Option bit for titlecasing APIs that take an options bit set. + * + * By default, titlecasing will titlecase the first cased character + * of a word and lowercase all other characters. + * With this option, the other characters will not be modified. + * + * @see ucasemap_setOptions + * @see ucasemap_toTitle + * @see ucasemap_utf8ToTitle + * @see UnicodeString::toTitle + * @stable ICU 3.8 + */ +#define U_TITLECASE_NO_LOWERCASE 0x100 + +/** + * Do not adjust the titlecasing indexes from BreakIterator::next() indexes; + * titlecase exactly the characters at breaks from the iterator. + * Option bit for titlecasing APIs that take an options bit set. + * + * By default, titlecasing will take each break iterator index, + * adjust it by looking for the next cased character, and titlecase that one. + * Other characters are lowercased. + * + * This follows Unicode 4 & 5 section 3.13 Default Case Operations: + * + * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex + * #29, "Text Boundaries." Between each pair of word boundaries, find the first + * cased character F. If F exists, map F to default_title(F); then map each + * subsequent character C to default_lower(C). + * + * @see ucasemap_setOptions + * @see ucasemap_toTitle + * @see ucasemap_utf8ToTitle + * @see UnicodeString::toTitle + * @see U_TITLECASE_NO_LOWERCASE + * @stable ICU 3.8 + */ +#define U_TITLECASE_NO_BREAK_ADJUSTMENT 0x200 + +#if !UCONFIG_NO_BREAK_ITERATION + +/** + * Get the break iterator that is used for titlecasing. + * Do not modify the returned break iterator. + * @param csm UCaseMap service object. + * @return titlecasing break iterator + * @stable ICU 3.8 + */ +U_STABLE const UBreakIterator * U_EXPORT2 +ucasemap_getBreakIterator(const UCaseMap *csm); + +/** + * Set the break iterator that is used for titlecasing. + * The UCaseMap service object releases a previously set break iterator + * and "adopts" this new one, taking ownership of it. + * It will be released in a subsequent call to ucasemap_setBreakIterator() + * or ucasemap_close(). + * + * Break iterator operations are not thread-safe. Therefore, titlecasing + * functions use non-const UCaseMap objects. It is not possible to titlecase + * strings concurrently using the same UCaseMap. + * + * @param csm UCaseMap service object. + * @param iterToAdopt Break iterator to be adopted for titlecasing. + * @param pErrorCode Must be a valid pointer to an error code value, + * which must not indicate a failure before the function call. + * + * @see ucasemap_toTitle + * @see ucasemap_utf8ToTitle + * @stable ICU 3.8 + */ +U_STABLE void U_EXPORT2 +ucasemap_setBreakIterator(UCaseMap *csm, UBreakIterator *iterToAdopt, UErrorCode *pErrorCode); + +/** + * Titlecase a UTF-16 string. This function is almost a duplicate of u_strToTitle(), + * except that it takes ucasemap_setOptions() into account and has performance + * advantages from being able to use a UCaseMap object for multiple case mapping + * operations, saving setup time. + * + * Casing is locale-dependent and context-sensitive. + * Titlecasing uses a break iterator to find the first characters of words + * that are to be titlecased. It titlecases those characters and lowercases + * all others. (This can be modified with ucasemap_setOptions().) + * + * Note: This function takes a non-const UCaseMap pointer because it will + * open a default break iterator if no break iterator was set yet, + * and effectively call ucasemap_setBreakIterator(); + * also because the break iterator is stateful and will be modified during + * the iteration. + * + * The titlecase break iterator can be provided to customize for arbitrary + * styles, using rules and dictionaries beyond the standard iterators. + * The standard titlecase iterator for the root locale implements the + * algorithm of Unicode TR 21. + * + * This function uses only the setUText(), first(), next() and close() methods of the + * provided break iterator. + * + * The result may be longer or shorter than the original. + * The source string and the destination buffer must not overlap. + * + * @param csm UCaseMap service object. This pointer is non-const! + * See the note above for details. + * @param dest A buffer for the result string. The result will be NUL-terminated if + * the buffer is large enough. + * The contents is undefined in case of failure. + * @param destCapacity The size of the buffer (number of bytes). If it is 0, then + * dest may be NULL and the function will only return the length of the result + * without writing any of the result string. + * @param src The original string. + * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. + * @param pErrorCode Must be a valid pointer to an error code value, + * which must not indicate a failure before the function call. + * @return The length of the result string, if successful - or in case of a buffer overflow, + * in which case it will be greater than destCapacity. + * + * @see u_strToTitle + * @stable ICU 3.8 + */ +U_STABLE int32_t U_EXPORT2 +ucasemap_toTitle(UCaseMap *csm, + UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + UErrorCode *pErrorCode); + +#endif + +/** + * Lowercase the characters in a UTF-8 string. + * Casing is locale-dependent and context-sensitive. + * The result may be longer or shorter than the original. + * The source string and the destination buffer must not overlap. + * + * @param csm UCaseMap service object. + * @param dest A buffer for the result string. The result will be NUL-terminated if + * the buffer is large enough. + * The contents is undefined in case of failure. + * @param destCapacity The size of the buffer (number of bytes). If it is 0, then + * dest may be NULL and the function will only return the length of the result + * without writing any of the result string. + * @param src The original string. + * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. + * @param pErrorCode Must be a valid pointer to an error code value, + * which must not indicate a failure before the function call. + * @return The length of the result string, if successful - or in case of a buffer overflow, + * in which case it will be greater than destCapacity. + * + * @see u_strToLower + * @stable ICU 3.4 + */ +U_STABLE int32_t U_EXPORT2 +ucasemap_utf8ToLower(const UCaseMap *csm, + char *dest, int32_t destCapacity, + const char *src, int32_t srcLength, + UErrorCode *pErrorCode); + +/** + * Uppercase the characters in a UTF-8 string. + * Casing is locale-dependent and context-sensitive. + * The result may be longer or shorter than the original. + * The source string and the destination buffer must not overlap. + * + * @param csm UCaseMap service object. + * @param dest A buffer for the result string. The result will be NUL-terminated if + * the buffer is large enough. + * The contents is undefined in case of failure. + * @param destCapacity The size of the buffer (number of bytes). If it is 0, then + * dest may be NULL and the function will only return the length of the result + * without writing any of the result string. + * @param src The original string. + * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. + * @param pErrorCode Must be a valid pointer to an error code value, + * which must not indicate a failure before the function call. + * @return The length of the result string, if successful - or in case of a buffer overflow, + * in which case it will be greater than destCapacity. + * + * @see u_strToUpper + * @stable ICU 3.4 + */ +U_STABLE int32_t U_EXPORT2 +ucasemap_utf8ToUpper(const UCaseMap *csm, + char *dest, int32_t destCapacity, + const char *src, int32_t srcLength, + UErrorCode *pErrorCode); + +#if !UCONFIG_NO_BREAK_ITERATION + +/** + * Titlecase a UTF-8 string. + * Casing is locale-dependent and context-sensitive. + * Titlecasing uses a break iterator to find the first characters of words + * that are to be titlecased. It titlecases those characters and lowercases + * all others. (This can be modified with ucasemap_setOptions().) + * + * Note: This function takes a non-const UCaseMap pointer because it will + * open a default break iterator if no break iterator was set yet, + * and effectively call ucasemap_setBreakIterator(); + * also because the break iterator is stateful and will be modified during + * the iteration. + * + * The titlecase break iterator can be provided to customize for arbitrary + * styles, using rules and dictionaries beyond the standard iterators. + * The standard titlecase iterator for the root locale implements the + * algorithm of Unicode TR 21. + * + * This function uses only the setUText(), first(), next() and close() methods of the + * provided break iterator. + * + * The result may be longer or shorter than the original. + * The source string and the destination buffer must not overlap. + * + * @param csm UCaseMap service object. This pointer is non-const! + * See the note above for details. + * @param dest A buffer for the result string. The result will be NUL-terminated if + * the buffer is large enough. + * The contents is undefined in case of failure. + * @param destCapacity The size of the buffer (number of bytes). If it is 0, then + * dest may be NULL and the function will only return the length of the result + * without writing any of the result string. + * @param src The original string. + * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. + * @param pErrorCode Must be a valid pointer to an error code value, + * which must not indicate a failure before the function call. + * @return The length of the result string, if successful - or in case of a buffer overflow, + * in which case it will be greater than destCapacity. + * + * @see u_strToTitle + * @see U_TITLECASE_NO_LOWERCASE + * @see U_TITLECASE_NO_BREAK_ADJUSTMENT + * @stable ICU 3.8 + */ +U_STABLE int32_t U_EXPORT2 +ucasemap_utf8ToTitle(UCaseMap *csm, + char *dest, int32_t destCapacity, + const char *src, int32_t srcLength, + UErrorCode *pErrorCode); + +#endif + +/** + * Case-folds the characters in a UTF-8 string. + * + * Case-folding is locale-independent and not context-sensitive, + * but there is an option for whether to include or exclude mappings for dotted I + * and dotless i that are marked with 'T' in CaseFolding.txt. + * + * The result may be longer or shorter than the original. + * The source string and the destination buffer must not overlap. + * + * @param csm UCaseMap service object. + * @param dest A buffer for the result string. The result will be NUL-terminated if + * the buffer is large enough. + * The contents is undefined in case of failure. + * @param destCapacity The size of the buffer (number of bytes). If it is 0, then + * dest may be NULL and the function will only return the length of the result + * without writing any of the result string. + * @param src The original string. + * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. + * @param pErrorCode Must be a valid pointer to an error code value, + * which must not indicate a failure before the function call. + * @return The length of the result string, if successful - or in case of a buffer overflow, + * in which case it will be greater than destCapacity. + * + * @see u_strFoldCase + * @see ucasemap_setOptions + * @see U_FOLD_CASE_DEFAULT + * @see U_FOLD_CASE_EXCLUDE_SPECIAL_I + * @stable ICU 3.8 + */ +U_STABLE int32_t U_EXPORT2 +ucasemap_utf8FoldCase(const UCaseMap *csm, + char *dest, int32_t destCapacity, + const char *src, int32_t srcLength, + UErrorCode *pErrorCode); + +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/uchar.h b/src/core/basetypes/icu-ucsdet/include/unicode/uchar.h new file mode 100644 index 00000000..1d3ae0dd --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/uchar.h @@ -0,0 +1,3426 @@ +/* +********************************************************************** +* Copyright (C) 1997-2014, International Business Machines +* Corporation and others. All Rights Reserved. +********************************************************************** +* +* File UCHAR.H +* +* Modification History: +* +* Date Name Description +* 04/02/97 aliu Creation. +* 03/29/99 helena Updated for C APIs. +* 4/15/99 Madhu Updated for C Implementation and Javadoc +* 5/20/99 Madhu Added the function u_getVersion() +* 8/19/1999 srl Upgraded scripts to Unicode 3.0 +* 8/27/1999 schererm UCharDirection constants: U_... +* 11/11/1999 weiv added u_isalnum(), cleaned comments +* 01/11/2000 helena Renamed u_getVersion to u_getUnicodeVersion(). +****************************************************************************** +*/ + +#ifndef UCHAR_H +#define UCHAR_H + +#include "unicode/utypes.h" + +U_CDECL_BEGIN + +/*==========================================================================*/ +/* Unicode version number */ +/*==========================================================================*/ +/** + * Unicode version number, default for the current ICU version. + * The actual Unicode Character Database (UCD) data is stored in uprops.dat + * and may be generated from UCD files from a different Unicode version. + * Call u_getUnicodeVersion to get the actual Unicode version of the data. + * + * @see u_getUnicodeVersion + * @stable ICU 2.0 + */ +#define U_UNICODE_VERSION "7.0" + +/** + * \file + * \brief C API: Unicode Properties + * + * This C API provides low-level access to the Unicode Character Database. + * In addition to raw property values, some convenience functions calculate + * derived properties, for example for Java-style programming. + * + * Unicode assigns each code point (not just assigned character) values for + * many properties. + * Most of them are simple boolean flags, or constants from a small enumerated list. + * For some properties, values are strings or other relatively more complex types. + * + * For more information see + * "About the Unicode Character Database" (http://www.unicode.org/ucd/) + * and the ICU User Guide chapter on Properties (http://icu-project.org/userguide/properties.html). + * + * Many functions are designed to match java.lang.Character functions. + * See the individual function documentation, + * and see the JDK 1.4 java.lang.Character documentation + * at http://java.sun.com/j2se/1.4/docs/api/java/lang/Character.html + * + * There are also functions that provide easy migration from C/POSIX functions + * like isblank(). Their use is generally discouraged because the C/POSIX + * standards do not define their semantics beyond the ASCII range, which means + * that different implementations exhibit very different behavior. + * Instead, Unicode properties should be used directly. + * + * There are also only a few, broad C/POSIX character classes, and they tend + * to be used for conflicting purposes. For example, the "isalpha()" class + * is sometimes used to determine word boundaries, while a more sophisticated + * approach would at least distinguish initial letters from continuation + * characters (the latter including combining marks). + * (In ICU, BreakIterator is the most sophisticated API for word boundaries.) + * Another example: There is no "istitle()" class for titlecase characters. + * + * ICU 3.4 and later provides API access for all twelve C/POSIX character classes. + * ICU implements them according to the Standard Recommendations in + * Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions + * (http://www.unicode.org/reports/tr18/#Compatibility_Properties). + * + * API access for C/POSIX character classes is as follows: + * - alpha: u_isUAlphabetic(c) or u_hasBinaryProperty(c, UCHAR_ALPHABETIC) + * - lower: u_isULowercase(c) or u_hasBinaryProperty(c, UCHAR_LOWERCASE) + * - upper: u_isUUppercase(c) or u_hasBinaryProperty(c, UCHAR_UPPERCASE) + * - punct: u_ispunct(c) + * - digit: u_isdigit(c) or u_charType(c)==U_DECIMAL_DIGIT_NUMBER + * - xdigit: u_isxdigit(c) or u_hasBinaryProperty(c, UCHAR_POSIX_XDIGIT) + * - alnum: u_hasBinaryProperty(c, UCHAR_POSIX_ALNUM) + * - space: u_isUWhiteSpace(c) or u_hasBinaryProperty(c, UCHAR_WHITE_SPACE) + * - blank: u_isblank(c) or u_hasBinaryProperty(c, UCHAR_POSIX_BLANK) + * - cntrl: u_charType(c)==U_CONTROL_CHAR + * - graph: u_hasBinaryProperty(c, UCHAR_POSIX_GRAPH) + * - print: u_hasBinaryProperty(c, UCHAR_POSIX_PRINT) + * + * Note: Some of the u_isxyz() functions in uchar.h predate, and do not match, + * the Standard Recommendations in UTS #18. Instead, they match Java + * functions according to their API documentation. + * + * \htmlonly + * The C/POSIX character classes are also available in UnicodeSet patterns, + * using patterns like [:graph:] or \p{graph}. + * \endhtmlonly + * + * Note: There are several ICU whitespace functions. + * Comparison: + * - u_isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property; + * most of general categories "Z" (separators) + most whitespace ISO controls + * (including no-break spaces, but excluding IS1..IS4 and ZWSP) + * - u_isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces + * - u_isJavaSpaceChar: Java isSpaceChar; just Z (including no-break spaces) + * - u_isspace: Z + whitespace ISO controls (including no-break spaces) + * - u_isblank: "horizontal spaces" = TAB + Zs - ZWSP + */ + +/** + * Constants. + */ + +/** The lowest Unicode code point value. Code points are non-negative. @stable ICU 2.0 */ +#define UCHAR_MIN_VALUE 0 + +/** + * The highest Unicode code point value (scalar value) according to + * The Unicode Standard. This is a 21-bit value (20.1 bits, rounded up). + * For a single character, UChar32 is a simple type that can hold any code point value. + * + * @see UChar32 + * @stable ICU 2.0 + */ +#define UCHAR_MAX_VALUE 0x10ffff + +/** + * Get a single-bit bit set (a flag) from a bit number 0..31. + * @stable ICU 2.1 + */ +#define U_MASK(x) ((uint32_t)1<<(x)) + +/** + * Selection constants for Unicode properties. + * These constants are used in functions like u_hasBinaryProperty to select + * one of the Unicode properties. + * + * The properties APIs are intended to reflect Unicode properties as defined + * in the Unicode Character Database (UCD) and Unicode Technical Reports (UTR). + * For details about the properties see http://www.unicode.org/ucd/ . + * For names of Unicode properties see the UCD file PropertyAliases.txt. + * + * Important: If ICU is built with UCD files from Unicode versions below, e.g., 3.2, + * then properties marked with "new in Unicode 3.2" are not or not fully available. + * Check u_getUnicodeVersion to be sure. + * + * @see u_hasBinaryProperty + * @see u_getIntPropertyValue + * @see u_getUnicodeVersion + * @stable ICU 2.1 + */ +typedef enum UProperty { + /* + * Note: UProperty constants are parsed by preparseucd.py. + * It matches lines like + * UCHAR_<Unicode property name>=<integer>, + */ + + /* Note: Place UCHAR_ALPHABETIC before UCHAR_BINARY_START so that + debuggers display UCHAR_ALPHABETIC as the symbolic name for 0, + rather than UCHAR_BINARY_START. Likewise for other *_START + identifiers. */ + + /** Binary property Alphabetic. Same as u_isUAlphabetic, different from u_isalpha. + Lu+Ll+Lt+Lm+Lo+Nl+Other_Alphabetic @stable ICU 2.1 */ + UCHAR_ALPHABETIC=0, + /** First constant for binary Unicode properties. @stable ICU 2.1 */ + UCHAR_BINARY_START=UCHAR_ALPHABETIC, + /** Binary property ASCII_Hex_Digit. 0-9 A-F a-f @stable ICU 2.1 */ + UCHAR_ASCII_HEX_DIGIT=1, + /** Binary property Bidi_Control. + Format controls which have specific functions + in the Bidi Algorithm. @stable ICU 2.1 */ + UCHAR_BIDI_CONTROL=2, + /** Binary property Bidi_Mirrored. + Characters that may change display in RTL text. + Same as u_isMirrored. + See Bidi Algorithm, UTR 9. @stable ICU 2.1 */ + UCHAR_BIDI_MIRRORED=3, + /** Binary property Dash. Variations of dashes. @stable ICU 2.1 */ + UCHAR_DASH=4, + /** Binary property Default_Ignorable_Code_Point (new in Unicode 3.2). + Ignorable in most processing. + <2060..206F, FFF0..FFFB, E0000..E0FFF>+Other_Default_Ignorable_Code_Point+(Cf+Cc+Cs-White_Space) @stable ICU 2.1 */ + UCHAR_DEFAULT_IGNORABLE_CODE_POINT=5, + /** Binary property Deprecated (new in Unicode 3.2). + The usage of deprecated characters is strongly discouraged. @stable ICU 2.1 */ + UCHAR_DEPRECATED=6, + /** Binary property Diacritic. Characters that linguistically modify + the meaning of another character to which they apply. @stable ICU 2.1 */ + UCHAR_DIACRITIC=7, + /** Binary property Extender. + Extend the value or shape of a preceding alphabetic character, + e.g., length and iteration marks. @stable ICU 2.1 */ + UCHAR_EXTENDER=8, + /** Binary property Full_Composition_Exclusion. + CompositionExclusions.txt+Singleton Decompositions+ + Non-Starter Decompositions. @stable ICU 2.1 */ + UCHAR_FULL_COMPOSITION_EXCLUSION=9, + /** Binary property Grapheme_Base (new in Unicode 3.2). + For programmatic determination of grapheme cluster boundaries. + [0..10FFFF]-Cc-Cf-Cs-Co-Cn-Zl-Zp-Grapheme_Link-Grapheme_Extend-CGJ @stable ICU 2.1 */ + UCHAR_GRAPHEME_BASE=10, + /** Binary property Grapheme_Extend (new in Unicode 3.2). + For programmatic determination of grapheme cluster boundaries. + Me+Mn+Mc+Other_Grapheme_Extend-Grapheme_Link-CGJ @stable ICU 2.1 */ + UCHAR_GRAPHEME_EXTEND=11, + /** Binary property Grapheme_Link (new in Unicode 3.2). + For programmatic determination of grapheme cluster boundaries. @stable ICU 2.1 */ + UCHAR_GRAPHEME_LINK=12, + /** Binary property Hex_Digit. + Characters commonly used for hexadecimal numbers. @stable ICU 2.1 */ + UCHAR_HEX_DIGIT=13, + /** Binary property Hyphen. Dashes used to mark connections + between pieces of words, plus the Katakana middle dot. @stable ICU 2.1 */ + UCHAR_HYPHEN=14, + /** Binary property ID_Continue. + Characters that can continue an identifier. + DerivedCoreProperties.txt also says "NOTE: Cf characters should be filtered out." + ID_Start+Mn+Mc+Nd+Pc @stable ICU 2.1 */ + UCHAR_ID_CONTINUE=15, + /** Binary property ID_Start. + Characters that can start an identifier. + Lu+Ll+Lt+Lm+Lo+Nl @stable ICU 2.1 */ + UCHAR_ID_START=16, + /** Binary property Ideographic. + CJKV ideographs. @stable ICU 2.1 */ + UCHAR_IDEOGRAPHIC=17, + /** Binary property IDS_Binary_Operator (new in Unicode 3.2). + For programmatic determination of + Ideographic Description Sequences. @stable ICU 2.1 */ + UCHAR_IDS_BINARY_OPERATOR=18, + /** Binary property IDS_Trinary_Operator (new in Unicode 3.2). + For programmatic determination of + Ideographic Description Sequences. @stable ICU 2.1 */ + UCHAR_IDS_TRINARY_OPERATOR=19, + /** Binary property Join_Control. + Format controls for cursive joining and ligation. @stable ICU 2.1 */ + UCHAR_JOIN_CONTROL=20, + /** Binary property Logical_Order_Exception (new in Unicode 3.2). + Characters that do not use logical order and + require special handling in most processing. @stable ICU 2.1 */ + UCHAR_LOGICAL_ORDER_EXCEPTION=21, + /** Binary property Lowercase. Same as u_isULowercase, different from u_islower. + Ll+Other_Lowercase @stable ICU 2.1 */ + UCHAR_LOWERCASE=22, + /** Binary property Math. Sm+Other_Math @stable ICU 2.1 */ + UCHAR_MATH=23, + /** Binary property Noncharacter_Code_Point. + Code points that are explicitly defined as illegal + for the encoding of characters. @stable ICU 2.1 */ + UCHAR_NONCHARACTER_CODE_POINT=24, + /** Binary property Quotation_Mark. @stable ICU 2.1 */ + UCHAR_QUOTATION_MARK=25, + /** Binary property Radical (new in Unicode 3.2). + For programmatic determination of + Ideographic Description Sequences. @stable ICU 2.1 */ + UCHAR_RADICAL=26, + /** Binary property Soft_Dotted (new in Unicode 3.2). + Characters with a "soft dot", like i or j. + An accent placed on these characters causes + the dot to disappear. @stable ICU 2.1 */ + UCHAR_SOFT_DOTTED=27, + /** Binary property Terminal_Punctuation. + Punctuation characters that generally mark + the end of textual units. @stable ICU 2.1 */ + UCHAR_TERMINAL_PUNCTUATION=28, + /** Binary property Unified_Ideograph (new in Unicode 3.2). + For programmatic determination of + Ideographic Description Sequences. @stable ICU 2.1 */ + UCHAR_UNIFIED_IDEOGRAPH=29, + /** Binary property Uppercase. Same as u_isUUppercase, different from u_isupper. + Lu+Other_Uppercase @stable ICU 2.1 */ + UCHAR_UPPERCASE=30, + /** Binary property White_Space. + Same as u_isUWhiteSpace, different from u_isspace and u_isWhitespace. + Space characters+TAB+CR+LF-ZWSP-ZWNBSP @stable ICU 2.1 */ + UCHAR_WHITE_SPACE=31, + /** Binary property XID_Continue. + ID_Continue modified to allow closure under + normalization forms NFKC and NFKD. @stable ICU 2.1 */ + UCHAR_XID_CONTINUE=32, + /** Binary property XID_Start. ID_Start modified to allow + closure under normalization forms NFKC and NFKD. @stable ICU 2.1 */ + UCHAR_XID_START=33, + /** Binary property Case_Sensitive. Either the source of a case + mapping or _in_ the target of a case mapping. Not the same as + the general category Cased_Letter. @stable ICU 2.6 */ + UCHAR_CASE_SENSITIVE=34, + /** Binary property STerm (new in Unicode 4.0.1). + Sentence Terminal. Used in UAX #29: Text Boundaries + (http://www.unicode.org/reports/tr29/) + @stable ICU 3.0 */ + UCHAR_S_TERM=35, + /** Binary property Variation_Selector (new in Unicode 4.0.1). + Indicates all those characters that qualify as Variation Selectors. + For details on the behavior of these characters, + see StandardizedVariants.html and 15.6 Variation Selectors. + @stable ICU 3.0 */ + UCHAR_VARIATION_SELECTOR=36, + /** Binary property NFD_Inert. + ICU-specific property for characters that are inert under NFD, + i.e., they do not interact with adjacent characters. + See the documentation for the Normalizer2 class and the + Normalizer2::isInert() method. + @stable ICU 3.0 */ + UCHAR_NFD_INERT=37, + /** Binary property NFKD_Inert. + ICU-specific property for characters that are inert under NFKD, + i.e., they do not interact with adjacent characters. + See the documentation for the Normalizer2 class and the + Normalizer2::isInert() method. + @stable ICU 3.0 */ + UCHAR_NFKD_INERT=38, + /** Binary property NFC_Inert. + ICU-specific property for characters that are inert under NFC, + i.e., they do not interact with adjacent characters. + See the documentation for the Normalizer2 class and the + Normalizer2::isInert() method. + @stable ICU 3.0 */ + UCHAR_NFC_INERT=39, + /** Binary property NFKC_Inert. + ICU-specific property for characters that are inert under NFKC, + i.e., they do not interact with adjacent characters. + See the documentation for the Normalizer2 class and the + Normalizer2::isInert() method. + @stable ICU 3.0 */ + UCHAR_NFKC_INERT=40, + /** Binary Property Segment_Starter. + ICU-specific property for characters that are starters in terms of + Unicode normalization and combining character sequences. + They have ccc=0 and do not occur in non-initial position of the + canonical decomposition of any character + (like a-umlaut in NFD and a Jamo T in an NFD(Hangul LVT)). + ICU uses this property for segmenting a string for generating a set of + canonically equivalent strings, e.g. for canonical closure while + processing collation tailoring rules. + @stable ICU 3.0 */ + UCHAR_SEGMENT_STARTER=41, + /** Binary property Pattern_Syntax (new in Unicode 4.1). + See UAX #31 Identifier and Pattern Syntax + (http://www.unicode.org/reports/tr31/) + @stable ICU 3.4 */ + UCHAR_PATTERN_SYNTAX=42, + /** Binary property Pattern_White_Space (new in Unicode 4.1). + See UAX #31 Identifier and Pattern Syntax + (http://www.unicode.org/reports/tr31/) + @stable ICU 3.4 */ + UCHAR_PATTERN_WHITE_SPACE=43, + /** Binary property alnum (a C/POSIX character class). + Implemented according to the UTS #18 Annex C Standard Recommendation. + See the uchar.h file documentation. + @stable ICU 3.4 */ + UCHAR_POSIX_ALNUM=44, + /** Binary property blank (a C/POSIX character class). + Implemented according to the UTS #18 Annex C Standard Recommendation. + See the uchar.h file documentation. + @stable ICU 3.4 */ + UCHAR_POSIX_BLANK=45, + /** Binary property graph (a C/POSIX character class). + Implemented according to the UTS #18 Annex C Standard Recommendation. + See the uchar.h file documentation. + @stable ICU 3.4 */ + UCHAR_POSIX_GRAPH=46, + /** Binary property print (a C/POSIX character class). + Implemented according to the UTS #18 Annex C Standard Recommendation. + See the uchar.h file documentation. + @stable ICU 3.4 */ + UCHAR_POSIX_PRINT=47, + /** Binary property xdigit (a C/POSIX character class). + Implemented according to the UTS #18 Annex C Standard Recommendation. + See the uchar.h file documentation. + @stable ICU 3.4 */ + UCHAR_POSIX_XDIGIT=48, + /** Binary property Cased. For Lowercase, Uppercase and Titlecase characters. @stable ICU 4.4 */ + UCHAR_CASED=49, + /** Binary property Case_Ignorable. Used in context-sensitive case mappings. @stable ICU 4.4 */ + UCHAR_CASE_IGNORABLE=50, + /** Binary property Changes_When_Lowercased. @stable ICU 4.4 */ + UCHAR_CHANGES_WHEN_LOWERCASED=51, + /** Binary property Changes_When_Uppercased. @stable ICU 4.4 */ + UCHAR_CHANGES_WHEN_UPPERCASED=52, + /** Binary property Changes_When_Titlecased. @stable ICU 4.4 */ + UCHAR_CHANGES_WHEN_TITLECASED=53, + /** Binary property Changes_When_Casefolded. @stable ICU 4.4 */ + UCHAR_CHANGES_WHEN_CASEFOLDED=54, + /** Binary property Changes_When_Casemapped. @stable ICU 4.4 */ + UCHAR_CHANGES_WHEN_CASEMAPPED=55, + /** Binary property Changes_When_NFKC_Casefolded. @stable ICU 4.4 */ + UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED=56, + /** One more than the last constant for binary Unicode properties. @stable ICU 2.1 */ + UCHAR_BINARY_LIMIT=57, + + /** Enumerated property Bidi_Class. + Same as u_charDirection, returns UCharDirection values. @stable ICU 2.2 */ + UCHAR_BIDI_CLASS=0x1000, + /** First constant for enumerated/integer Unicode properties. @stable ICU 2.2 */ + UCHAR_INT_START=UCHAR_BIDI_CLASS, + /** Enumerated property Block. + Same as ublock_getCode, returns UBlockCode values. @stable ICU 2.2 */ + UCHAR_BLOCK=0x1001, + /** Enumerated property Canonical_Combining_Class. + Same as u_getCombiningClass, returns 8-bit numeric values. @stable ICU 2.2 */ + UCHAR_CANONICAL_COMBINING_CLASS=0x1002, + /** Enumerated property Decomposition_Type. + Returns UDecompositionType values. @stable ICU 2.2 */ + UCHAR_DECOMPOSITION_TYPE=0x1003, + /** Enumerated property East_Asian_Width. + See http://www.unicode.org/reports/tr11/ + Returns UEastAsianWidth values. @stable ICU 2.2 */ + UCHAR_EAST_ASIAN_WIDTH=0x1004, + /** Enumerated property General_Category. + Same as u_charType, returns UCharCategory values. @stable ICU 2.2 */ + UCHAR_GENERAL_CATEGORY=0x1005, + /** Enumerated property Joining_Group. + Returns UJoiningGroup values. @stable ICU 2.2 */ + UCHAR_JOINING_GROUP=0x1006, + /** Enumerated property Joining_Type. + Returns UJoiningType values. @stable ICU 2.2 */ + UCHAR_JOINING_TYPE=0x1007, + /** Enumerated property Line_Break. + Returns ULineBreak values. @stable ICU 2.2 */ + UCHAR_LINE_BREAK=0x1008, + /** Enumerated property Numeric_Type. + Returns UNumericType values. @stable ICU 2.2 */ + UCHAR_NUMERIC_TYPE=0x1009, + /** Enumerated property Script. + Same as uscript_getScript, returns UScriptCode values. @stable ICU 2.2 */ + UCHAR_SCRIPT=0x100A, + /** Enumerated property Hangul_Syllable_Type, new in Unicode 4. + Returns UHangulSyllableType values. @stable ICU 2.6 */ + UCHAR_HANGUL_SYLLABLE_TYPE=0x100B, + /** Enumerated property NFD_Quick_Check. + Returns UNormalizationCheckResult values. @stable ICU 3.0 */ + UCHAR_NFD_QUICK_CHECK=0x100C, + /** Enumerated property NFKD_Quick_Check. + Returns UNormalizationCheckResult values. @stable ICU 3.0 */ + UCHAR_NFKD_QUICK_CHECK=0x100D, + /** Enumerated property NFC_Quick_Check. + Returns UNormalizationCheckResult values. @stable ICU 3.0 */ + UCHAR_NFC_QUICK_CHECK=0x100E, + /** Enumerated property NFKC_Quick_Check. + Returns UNormalizationCheckResult values. @stable ICU 3.0 */ + UCHAR_NFKC_QUICK_CHECK=0x100F, + /** Enumerated property Lead_Canonical_Combining_Class. + ICU-specific property for the ccc of the first code point + of the decomposition, or lccc(c)=ccc(NFD(c)[0]). + Useful for checking for canonically ordered text; + see UNORM_FCD and http://www.unicode.org/notes/tn5/#FCD . + Returns 8-bit numeric values like UCHAR_CANONICAL_COMBINING_CLASS. @stable ICU 3.0 */ + UCHAR_LEAD_CANONICAL_COMBINING_CLASS=0x1010, + /** Enumerated property Trail_Canonical_Combining_Class. + ICU-specific property for the ccc of the last code point + of the decomposition, or tccc(c)=ccc(NFD(c)[last]). + Useful for checking for canonically ordered text; + see UNORM_FCD and http://www.unicode.org/notes/tn5/#FCD . + Returns 8-bit numeric values like UCHAR_CANONICAL_COMBINING_CLASS. @stable ICU 3.0 */ + UCHAR_TRAIL_CANONICAL_COMBINING_CLASS=0x1011, + /** Enumerated property Grapheme_Cluster_Break (new in Unicode 4.1). + Used in UAX #29: Text Boundaries + (http://www.unicode.org/reports/tr29/) + Returns UGraphemeClusterBreak values. @stable ICU 3.4 */ + UCHAR_GRAPHEME_CLUSTER_BREAK=0x1012, + /** Enumerated property Sentence_Break (new in Unicode 4.1). + Used in UAX #29: Text Boundaries + (http://www.unicode.org/reports/tr29/) + Returns USentenceBreak values. @stable ICU 3.4 */ + UCHAR_SENTENCE_BREAK=0x1013, + /** Enumerated property Word_Break (new in Unicode 4.1). + Used in UAX #29: Text Boundaries + (http://www.unicode.org/reports/tr29/) + Returns UWordBreakValues values. @stable ICU 3.4 */ + UCHAR_WORD_BREAK=0x1014, + /** Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3). + Used in UAX #9: Unicode Bidirectional Algorithm + (http://www.unicode.org/reports/tr9/) + Returns UBidiPairedBracketType values. @stable ICU 52 */ + UCHAR_BIDI_PAIRED_BRACKET_TYPE=0x1015, + /** One more than the last constant for enumerated/integer Unicode properties. @stable ICU 2.2 */ + UCHAR_INT_LIMIT=0x1016, + + /** Bitmask property General_Category_Mask. + This is the General_Category property returned as a bit mask. + When used in u_getIntPropertyValue(c), same as U_MASK(u_charType(c)), + returns bit masks for UCharCategory values where exactly one bit is set. + When used with u_getPropertyValueName() and u_getPropertyValueEnum(), + a multi-bit mask is used for sets of categories like "Letters". + Mask values should be cast to uint32_t. + @stable ICU 2.4 */ + UCHAR_GENERAL_CATEGORY_MASK=0x2000, + /** First constant for bit-mask Unicode properties. @stable ICU 2.4 */ + UCHAR_MASK_START=UCHAR_GENERAL_CATEGORY_MASK, + /** One more than the last constant for bit-mask Unicode properties. @stable ICU 2.4 */ + UCHAR_MASK_LIMIT=0x2001, + + /** Double property Numeric_Value. + Corresponds to u_getNumericValue. @stable ICU 2.4 */ + UCHAR_NUMERIC_VALUE=0x3000, + /** First constant for double Unicode properties. @stable ICU 2.4 */ + UCHAR_DOUBLE_START=UCHAR_NUMERIC_VALUE, + /** One more than the last constant for double Unicode properties. @stable ICU 2.4 */ + UCHAR_DOUBLE_LIMIT=0x3001, + + /** String property Age. + Corresponds to u_charAge. @stable ICU 2.4 */ + UCHAR_AGE=0x4000, + /** First constant for string Unicode properties. @stable ICU 2.4 */ + UCHAR_STRING_START=UCHAR_AGE, + /** String property Bidi_Mirroring_Glyph. + Corresponds to u_charMirror. @stable ICU 2.4 */ + UCHAR_BIDI_MIRRORING_GLYPH=0x4001, + /** String property Case_Folding. + Corresponds to u_strFoldCase in ustring.h. @stable ICU 2.4 */ + UCHAR_CASE_FOLDING=0x4002, +#ifndef U_HIDE_DEPRECATED_API + /** Deprecated string property ISO_Comment. + Corresponds to u_getISOComment. @deprecated ICU 49 */ + UCHAR_ISO_COMMENT=0x4003, +#endif /* U_HIDE_DEPRECATED_API */ + /** String property Lowercase_Mapping. + Corresponds to u_strToLower in ustring.h. @stable ICU 2.4 */ + UCHAR_LOWERCASE_MAPPING=0x4004, + /** String property Name. + Corresponds to u_charName. @stable ICU 2.4 */ + UCHAR_NAME=0x4005, + /** String property Simple_Case_Folding. + Corresponds to u_foldCase. @stable ICU 2.4 */ + UCHAR_SIMPLE_CASE_FOLDING=0x4006, + /** String property Simple_Lowercase_Mapping. + Corresponds to u_tolower. @stable ICU 2.4 */ + UCHAR_SIMPLE_LOWERCASE_MAPPING=0x4007, + /** String property Simple_Titlecase_Mapping. + Corresponds to u_totitle. @stable ICU 2.4 */ + UCHAR_SIMPLE_TITLECASE_MAPPING=0x4008, + /** String property Simple_Uppercase_Mapping. + Corresponds to u_toupper. @stable ICU 2.4 */ + UCHAR_SIMPLE_UPPERCASE_MAPPING=0x4009, + /** String property Titlecase_Mapping. + Corresponds to u_strToTitle in ustring.h. @stable ICU 2.4 */ + UCHAR_TITLECASE_MAPPING=0x400A, +#ifndef U_HIDE_DEPRECATED_API + /** String property Unicode_1_Name. + This property is of little practical value. + Beginning with ICU 49, ICU APIs return an empty string for this property. + Corresponds to u_charName(U_UNICODE_10_CHAR_NAME). @deprecated ICU 49 */ + UCHAR_UNICODE_1_NAME=0x400B, +#endif /* U_HIDE_DEPRECATED_API */ + /** String property Uppercase_Mapping. + Corresponds to u_strToUpper in ustring.h. @stable ICU 2.4 */ + UCHAR_UPPERCASE_MAPPING=0x400C, + /** String property Bidi_Paired_Bracket (new in Unicode 6.3). + Corresponds to u_getBidiPairedBracket. @stable ICU 52 */ + UCHAR_BIDI_PAIRED_BRACKET=0x400D, + /** One more than the last constant for string Unicode properties. @stable ICU 2.4 */ + UCHAR_STRING_LIMIT=0x400E, + + /** Miscellaneous property Script_Extensions (new in Unicode 6.0). + Some characters are commonly used in multiple scripts. + For more information, see UAX #24: http://www.unicode.org/reports/tr24/. + Corresponds to uscript_hasScript and uscript_getScriptExtensions in uscript.h. + @stable ICU 4.6 */ + UCHAR_SCRIPT_EXTENSIONS=0x7000, + /** First constant for Unicode properties with unusual value types. @stable ICU 4.6 */ + UCHAR_OTHER_PROPERTY_START=UCHAR_SCRIPT_EXTENSIONS, + /** One more than the last constant for Unicode properties with unusual value types. + * @stable ICU 4.6 */ + UCHAR_OTHER_PROPERTY_LIMIT=0x7001, + /** Represents a nonexistent or invalid property or property value. @stable ICU 2.4 */ + UCHAR_INVALID_CODE = -1 +} UProperty; + +/** + * Data for enumerated Unicode general category types. + * See http://www.unicode.org/Public/UNIDATA/UnicodeData.html . + * @stable ICU 2.0 + */ +typedef enum UCharCategory +{ + /* + * Note: UCharCategory constants and their API comments are parsed by preparseucd.py. + * It matches pairs of lines like + * / ** <Unicode 2-letter General_Category value> comment... * / + * U_<[A-Z_]+> = <integer>, + */ + + /** Non-category for unassigned and non-character code points. @stable ICU 2.0 */ + U_UNASSIGNED = 0, + /** Cn "Other, Not Assigned (no characters in [UnicodeData.txt] have this property)" (same as U_UNASSIGNED!) @stable ICU 2.0 */ + U_GENERAL_OTHER_TYPES = 0, + /** Lu @stable ICU 2.0 */ + U_UPPERCASE_LETTER = 1, + /** Ll @stable ICU 2.0 */ + U_LOWERCASE_LETTER = 2, + /** Lt @stable ICU 2.0 */ + U_TITLECASE_LETTER = 3, + /** Lm @stable ICU 2.0 */ + U_MODIFIER_LETTER = 4, + /** Lo @stable ICU 2.0 */ + U_OTHER_LETTER = 5, + /** Mn @stable ICU 2.0 */ + U_NON_SPACING_MARK = 6, + /** Me @stable ICU 2.0 */ + U_ENCLOSING_MARK = 7, + /** Mc @stable ICU 2.0 */ + U_COMBINING_SPACING_MARK = 8, + /** Nd @stable ICU 2.0 */ + U_DECIMAL_DIGIT_NUMBER = 9, + /** Nl @stable ICU 2.0 */ + U_LETTER_NUMBER = 10, + /** No @stable ICU 2.0 */ + U_OTHER_NUMBER = 11, + /** Zs @stable ICU 2.0 */ + U_SPACE_SEPARATOR = 12, + /** Zl @stable ICU 2.0 */ + U_LINE_SEPARATOR = 13, + /** Zp @stable ICU 2.0 */ + U_PARAGRAPH_SEPARATOR = 14, + /** Cc @stable ICU 2.0 */ + U_CONTROL_CHAR = 15, + /** Cf @stable ICU 2.0 */ + U_FORMAT_CHAR = 16, + /** Co @stable ICU 2.0 */ + U_PRIVATE_USE_CHAR = 17, + /** Cs @stable ICU 2.0 */ + U_SURROGATE = 18, + /** Pd @stable ICU 2.0 */ + U_DASH_PUNCTUATION = 19, + /** Ps @stable ICU 2.0 */ + U_START_PUNCTUATION = 20, + /** Pe @stable ICU 2.0 */ + U_END_PUNCTUATION = 21, + /** Pc @stable ICU 2.0 */ + U_CONNECTOR_PUNCTUATION = 22, + /** Po @stable ICU 2.0 */ + U_OTHER_PUNCTUATION = 23, + /** Sm @stable ICU 2.0 */ + U_MATH_SYMBOL = 24, + /** Sc @stable ICU 2.0 */ + U_CURRENCY_SYMBOL = 25, + /** Sk @stable ICU 2.0 */ + U_MODIFIER_SYMBOL = 26, + /** So @stable ICU 2.0 */ + U_OTHER_SYMBOL = 27, + /** Pi @stable ICU 2.0 */ + U_INITIAL_PUNCTUATION = 28, + /** Pf @stable ICU 2.0 */ + U_FINAL_PUNCTUATION = 29, + /** One higher than the last enum UCharCategory constant. @stable ICU 2.0 */ + U_CHAR_CATEGORY_COUNT +} UCharCategory; + +/** + * U_GC_XX_MASK constants are bit flags corresponding to Unicode + * general category values. + * For each category, the nth bit is set if the numeric value of the + * corresponding UCharCategory constant is n. + * + * There are also some U_GC_Y_MASK constants for groups of general categories + * like L for all letter categories. + * + * @see u_charType + * @see U_GET_GC_MASK + * @see UCharCategory + * @stable ICU 2.1 + */ +#define U_GC_CN_MASK U_MASK(U_GENERAL_OTHER_TYPES) + +/** Mask constant for a UCharCategory. @stable ICU 2.1 */ +#define U_GC_LU_MASK U_MASK(U_UPPERCASE_LETTER) +/** Mask constant for a UCharCategory. @stable ICU 2.1 */ +#define U_GC_LL_MASK U_MASK(U_LOWERCASE_LETTER) +/** Mask constant for a UCharCategory. @stable ICU 2.1 */ +#define U_GC_LT_MASK U_MASK(U_TITLECASE_LETTER) +/** Mask constant for a UCharCategory. @stable ICU 2.1 */ +#define U_GC_LM_MASK U_MASK(U_MODIFIER_LETTER) +/** Mask constant for a UCharCategory. @stable ICU 2.1 */ +#define U_GC_LO_MASK U_MASK(U_OTHER_LETTER) + +/** Mask constant for a UCharCategory. @stable ICU 2.1 */ +#define U_GC_MN_MASK U_MASK(U_NON_SPACING_MARK) +/** Mask constant for a UCharCategory. @stable ICU 2.1 */ +#define U_GC_ME_MASK U_MASK(U_ENCLOSING_MARK) +/** Mask constant for a UCharCategory. @stable ICU 2.1 */ +#define U_GC_MC_MASK U_MASK(U_COMBINING_SPACING_MARK) + +/** Mask constant for a UCharCategory. @stable ICU 2.1 */ +#define U_GC_ND_MASK U_MASK(U_DECIMAL_DIGIT_NUMBER) +/** Mask constant for a UCharCategory. @stable ICU 2.1 */ +#define U_GC_NL_MASK U_MASK(U_LETTER_NUMBER) +/** Mask constant for a UCharCategory. @stable ICU 2.1 */ +#define U_GC_NO_MASK U_MASK(U_OTHER_NUMBER) + +/** Mask constant for a UCharCategory. @stable ICU 2.1 */ +#define U_GC_ZS_MASK U_MASK(U_SPACE_SEPARATOR) +/** Mask constant for a UCharCategory. @stable ICU 2.1 */ +#define U_GC_ZL_MASK U_MASK(U_LINE_SEPARATOR) +/** Mask constant for a UCharCategory. @stable ICU 2.1 */ +#define U_GC_ZP_MASK U_MASK(U_PARAGRAPH_SEPARATOR) + +/** Mask constant for a UCharCategory. @stable ICU 2.1 */ +#define U_GC_CC_MASK U_MASK(U_CONTROL_CHAR) +/** Mask constant for a UCharCategory. @stable ICU 2.1 */ +#define U_GC_CF_MASK U_MASK(U_FORMAT_CHAR) +/** Mask constant for a UCharCategory. @stable ICU 2.1 */ +#define U_GC_CO_MASK U_MASK(U_PRIVATE_USE_CHAR) +/** Mask constant for a UCharCategory. @stable ICU 2.1 */ +#define U_GC_CS_MASK U_MASK(U_SURROGATE) + +/** Mask constant for a UCharCategory. @stable ICU 2.1 */ +#define U_GC_PD_MASK U_MASK(U_DASH_PUNCTUATION) +/** Mask constant for a UCharCategory. @stable ICU 2.1 */ +#define U_GC_PS_MASK U_MASK(U_START_PUNCTUATION) +/** Mask constant for a UCharCategory. @stable ICU 2.1 */ +#define U_GC_PE_MASK U_MASK(U_END_PUNCTUATION) +/** Mask constant for a UCharCategory. @stable ICU 2.1 */ +#define U_GC_PC_MASK U_MASK(U_CONNECTOR_PUNCTUATION) +/** Mask constant for a UCharCategory. @stable ICU 2.1 */ +#define U_GC_PO_MASK U_MASK(U_OTHER_PUNCTUATION) + +/** Mask constant for a UCharCategory. @stable ICU 2.1 */ +#define U_GC_SM_MASK U_MASK(U_MATH_SYMBOL) +/** Mask constant for a UCharCategory. @stable ICU 2.1 */ +#define U_GC_SC_MASK U_MASK(U_CURRENCY_SYMBOL) +/** Mask constant for a UCharCategory. @stable ICU 2.1 */ +#define U_GC_SK_MASK U_MASK(U_MODIFIER_SYMBOL) +/** Mask constant for a UCharCategory. @stable ICU 2.1 */ +#define U_GC_SO_MASK U_MASK(U_OTHER_SYMBOL) + +/** Mask constant for a UCharCategory. @stable ICU 2.1 */ +#define U_GC_PI_MASK U_MASK(U_INITIAL_PUNCTUATION) +/** Mask constant for a UCharCategory. @stable ICU 2.1 */ +#define U_GC_PF_MASK U_MASK(U_FINAL_PUNCTUATION) + + +/** Mask constant for multiple UCharCategory bits (L Letters). @stable ICU 2.1 */ +#define U_GC_L_MASK \ + (U_GC_LU_MASK|U_GC_LL_MASK|U_GC_LT_MASK|U_GC_LM_MASK|U_GC_LO_MASK) + +/** Mask constant for multiple UCharCategory bits (LC Cased Letters). @stable ICU 2.1 */ +#define U_GC_LC_MASK \ + (U_GC_LU_MASK|U_GC_LL_MASK|U_GC_LT_MASK) + +/** Mask constant for multiple UCharCategory bits (M Marks). @stable ICU 2.1 */ +#define U_GC_M_MASK (U_GC_MN_MASK|U_GC_ME_MASK|U_GC_MC_MASK) + +/** Mask constant for multiple UCharCategory bits (N Numbers). @stable ICU 2.1 */ +#define U_GC_N_MASK (U_GC_ND_MASK|U_GC_NL_MASK|U_GC_NO_MASK) + +/** Mask constant for multiple UCharCategory bits (Z Separators). @stable ICU 2.1 */ +#define U_GC_Z_MASK (U_GC_ZS_MASK|U_GC_ZL_MASK|U_GC_ZP_MASK) + +/** Mask constant for multiple UCharCategory bits (C Others). @stable ICU 2.1 */ +#define U_GC_C_MASK \ + (U_GC_CN_MASK|U_GC_CC_MASK|U_GC_CF_MASK|U_GC_CO_MASK|U_GC_CS_MASK) + +/** Mask constant for multiple UCharCategory bits (P Punctuation). @stable ICU 2.1 */ +#define U_GC_P_MASK \ + (U_GC_PD_MASK|U_GC_PS_MASK|U_GC_PE_MASK|U_GC_PC_MASK|U_GC_PO_MASK| \ + U_GC_PI_MASK|U_GC_PF_MASK) + +/** Mask constant for multiple UCharCategory bits (S Symbols). @stable ICU 2.1 */ +#define U_GC_S_MASK (U_GC_SM_MASK|U_GC_SC_MASK|U_GC_SK_MASK|U_GC_SO_MASK) + +/** + * This specifies the language directional property of a character set. + * @stable ICU 2.0 + */ +typedef enum UCharDirection { + /* + * Note: UCharDirection constants and their API comments are parsed by preparseucd.py. + * It matches pairs of lines like + * / ** <Unicode 1..3-letter Bidi_Class value> comment... * / + * U_<[A-Z_]+> = <integer>, + */ + + /** L @stable ICU 2.0 */ + U_LEFT_TO_RIGHT = 0, + /** R @stable ICU 2.0 */ + U_RIGHT_TO_LEFT = 1, + /** EN @stable ICU 2.0 */ + U_EUROPEAN_NUMBER = 2, + /** ES @stable ICU 2.0 */ + U_EUROPEAN_NUMBER_SEPARATOR = 3, + /** ET @stable ICU 2.0 */ + U_EUROPEAN_NUMBER_TERMINATOR = 4, + /** AN @stable ICU 2.0 */ + U_ARABIC_NUMBER = 5, + /** CS @stable ICU 2.0 */ + U_COMMON_NUMBER_SEPARATOR = 6, + /** B @stable ICU 2.0 */ + U_BLOCK_SEPARATOR = 7, + /** S @stable ICU 2.0 */ + U_SEGMENT_SEPARATOR = 8, + /** WS @stable ICU 2.0 */ + U_WHITE_SPACE_NEUTRAL = 9, + /** ON @stable ICU 2.0 */ + U_OTHER_NEUTRAL = 10, + /** LRE @stable ICU 2.0 */ + U_LEFT_TO_RIGHT_EMBEDDING = 11, + /** LRO @stable ICU 2.0 */ + U_LEFT_TO_RIGHT_OVERRIDE = 12, + /** AL @stable ICU 2.0 */ + U_RIGHT_TO_LEFT_ARABIC = 13, + /** RLE @stable ICU 2.0 */ + U_RIGHT_TO_LEFT_EMBEDDING = 14, + /** RLO @stable ICU 2.0 */ + U_RIGHT_TO_LEFT_OVERRIDE = 15, + /** PDF @stable ICU 2.0 */ + U_POP_DIRECTIONAL_FORMAT = 16, + /** NSM @stable ICU 2.0 */ + U_DIR_NON_SPACING_MARK = 17, + /** BN @stable ICU 2.0 */ + U_BOUNDARY_NEUTRAL = 18, + /** FSI @stable ICU 52 */ + U_FIRST_STRONG_ISOLATE = 19, + /** LRI @stable ICU 52 */ + U_LEFT_TO_RIGHT_ISOLATE = 20, + /** RLI @stable ICU 52 */ + U_RIGHT_TO_LEFT_ISOLATE = 21, + /** PDI @stable ICU 52 */ + U_POP_DIRECTIONAL_ISOLATE = 22, + /** @stable ICU 2.0 */ + U_CHAR_DIRECTION_COUNT +} UCharDirection; + +/** + * Bidi Paired Bracket Type constants. + * + * @see UCHAR_BIDI_PAIRED_BRACKET_TYPE + * @stable ICU 52 + */ +typedef enum UBidiPairedBracketType { + /* + * Note: UBidiPairedBracketType constants are parsed by preparseucd.py. + * It matches lines like + * U_BPT_<Unicode Bidi_Paired_Bracket_Type value name> + */ + + /** Not a paired bracket. @stable ICU 52 */ + U_BPT_NONE, + /** Open paired bracket. @stable ICU 52 */ + U_BPT_OPEN, + /** Close paired bracket. @stable ICU 52 */ + U_BPT_CLOSE, + /** @stable ICU 52 */ + U_BPT_COUNT /* 3 */ +} UBidiPairedBracketType; + +/** + * Constants for Unicode blocks, see the Unicode Data file Blocks.txt + * @stable ICU 2.0 + */ +enum UBlockCode { + /* + * Note: UBlockCode constants are parsed by preparseucd.py. + * It matches lines like + * UBLOCK_<Unicode Block value name> = <integer>, + */ + + /** New No_Block value in Unicode 4. @stable ICU 2.6 */ + UBLOCK_NO_BLOCK = 0, /*[none]*/ /* Special range indicating No_Block */ + + /** @stable ICU 2.0 */ + UBLOCK_BASIC_LATIN = 1, /*[0000]*/ + + /** @stable ICU 2.0 */ + UBLOCK_LATIN_1_SUPPLEMENT=2, /*[0080]*/ + + /** @stable ICU 2.0 */ + UBLOCK_LATIN_EXTENDED_A =3, /*[0100]*/ + + /** @stable ICU 2.0 */ + UBLOCK_LATIN_EXTENDED_B =4, /*[0180]*/ + + /** @stable ICU 2.0 */ + UBLOCK_IPA_EXTENSIONS =5, /*[0250]*/ + + /** @stable ICU 2.0 */ + UBLOCK_SPACING_MODIFIER_LETTERS =6, /*[02B0]*/ + + /** @stable ICU 2.0 */ + UBLOCK_COMBINING_DIACRITICAL_MARKS =7, /*[0300]*/ + + /** + * Unicode 3.2 renames this block to "Greek and Coptic". + * @stable ICU 2.0 + */ + UBLOCK_GREEK =8, /*[0370]*/ + + /** @stable ICU 2.0 */ + UBLOCK_CYRILLIC =9, /*[0400]*/ + + /** @stable ICU 2.0 */ + UBLOCK_ARMENIAN =10, /*[0530]*/ + + /** @stable ICU 2.0 */ + UBLOCK_HEBREW =11, /*[0590]*/ + + /** @stable ICU 2.0 */ + UBLOCK_ARABIC =12, /*[0600]*/ + + /** @stable ICU 2.0 */ + UBLOCK_SYRIAC =13, /*[0700]*/ + + /** @stable ICU 2.0 */ + UBLOCK_THAANA =14, /*[0780]*/ + + /** @stable ICU 2.0 */ + UBLOCK_DEVANAGARI =15, /*[0900]*/ + + /** @stable ICU 2.0 */ + UBLOCK_BENGALI =16, /*[0980]*/ + + /** @stable ICU 2.0 */ + UBLOCK_GURMUKHI =17, /*[0A00]*/ + + /** @stable ICU 2.0 */ + UBLOCK_GUJARATI =18, /*[0A80]*/ + + /** @stable ICU 2.0 */ + UBLOCK_ORIYA =19, /*[0B00]*/ + + /** @stable ICU 2.0 */ + UBLOCK_TAMIL =20, /*[0B80]*/ + + /** @stable ICU 2.0 */ + UBLOCK_TELUGU =21, /*[0C00]*/ + + /** @stable ICU 2.0 */ + UBLOCK_KANNADA =22, /*[0C80]*/ + + /** @stable ICU 2.0 */ + UBLOCK_MALAYALAM =23, /*[0D00]*/ + + /** @stable ICU 2.0 */ + UBLOCK_SINHALA =24, /*[0D80]*/ + + /** @stable ICU 2.0 */ + UBLOCK_THAI =25, /*[0E00]*/ + + /** @stable ICU 2.0 */ + UBLOCK_LAO =26, /*[0E80]*/ + + /** @stable ICU 2.0 */ + UBLOCK_TIBETAN =27, /*[0F00]*/ + + /** @stable ICU 2.0 */ + UBLOCK_MYANMAR =28, /*[1000]*/ + + /** @stable ICU 2.0 */ + UBLOCK_GEORGIAN =29, /*[10A0]*/ + + /** @stable ICU 2.0 */ + UBLOCK_HANGUL_JAMO =30, /*[1100]*/ + + /** @stable ICU 2.0 */ + UBLOCK_ETHIOPIC =31, /*[1200]*/ + + /** @stable ICU 2.0 */ + UBLOCK_CHEROKEE =32, /*[13A0]*/ + + /** @stable ICU 2.0 */ + UBLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS =33, /*[1400]*/ + + /** @stable ICU 2.0 */ + UBLOCK_OGHAM =34, /*[1680]*/ + + /** @stable ICU 2.0 */ + UBLOCK_RUNIC =35, /*[16A0]*/ + + /** @stable ICU 2.0 */ + UBLOCK_KHMER =36, /*[1780]*/ + + /** @stable ICU 2.0 */ + UBLOCK_MONGOLIAN =37, /*[1800]*/ + + /** @stable ICU 2.0 */ + UBLOCK_LATIN_EXTENDED_ADDITIONAL =38, /*[1E00]*/ + + /** @stable ICU 2.0 */ + UBLOCK_GREEK_EXTENDED =39, /*[1F00]*/ + + /** @stable ICU 2.0 */ + UBLOCK_GENERAL_PUNCTUATION =40, /*[2000]*/ + + /** @stable ICU 2.0 */ + UBLOCK_SUPERSCRIPTS_AND_SUBSCRIPTS =41, /*[2070]*/ + + /** @stable ICU 2.0 */ + UBLOCK_CURRENCY_SYMBOLS =42, /*[20A0]*/ + + /** + * Unicode 3.2 renames this block to "Combining Diacritical Marks for Symbols". + * @stable ICU 2.0 + */ + UBLOCK_COMBINING_MARKS_FOR_SYMBOLS =43, /*[20D0]*/ + + /** @stable ICU 2.0 */ + UBLOCK_LETTERLIKE_SYMBOLS =44, /*[2100]*/ + + /** @stable ICU 2.0 */ + UBLOCK_NUMBER_FORMS =45, /*[2150]*/ + + /** @stable ICU 2.0 */ + UBLOCK_ARROWS =46, /*[2190]*/ + + /** @stable ICU 2.0 */ + UBLOCK_MATHEMATICAL_OPERATORS =47, /*[2200]*/ + + /** @stable ICU 2.0 */ + UBLOCK_MISCELLANEOUS_TECHNICAL =48, /*[2300]*/ + + /** @stable ICU 2.0 */ + UBLOCK_CONTROL_PICTURES =49, /*[2400]*/ + + /** @stable ICU 2.0 */ + UBLOCK_OPTICAL_CHARACTER_RECOGNITION =50, /*[2440]*/ + + /** @stable ICU 2.0 */ + UBLOCK_ENCLOSED_ALPHANUMERICS =51, /*[2460]*/ + + /** @stable ICU 2.0 */ + UBLOCK_BOX_DRAWING =52, /*[2500]*/ + + /** @stable ICU 2.0 */ + UBLOCK_BLOCK_ELEMENTS =53, /*[2580]*/ + + /** @stable ICU 2.0 */ + UBLOCK_GEOMETRIC_SHAPES =54, /*[25A0]*/ + + /** @stable ICU 2.0 */ + UBLOCK_MISCELLANEOUS_SYMBOLS =55, /*[2600]*/ + + /** @stable ICU 2.0 */ + UBLOCK_DINGBATS =56, /*[2700]*/ + + /** @stable ICU 2.0 */ + UBLOCK_BRAILLE_PATTERNS =57, /*[2800]*/ + + /** @stable ICU 2.0 */ + UBLOCK_CJK_RADICALS_SUPPLEMENT =58, /*[2E80]*/ + + /** @stable ICU 2.0 */ + UBLOCK_KANGXI_RADICALS =59, /*[2F00]*/ + + /** @stable ICU 2.0 */ + UBLOCK_IDEOGRAPHIC_DESCRIPTION_CHARACTERS =60, /*[2FF0]*/ + + /** @stable ICU 2.0 */ + UBLOCK_CJK_SYMBOLS_AND_PUNCTUATION =61, /*[3000]*/ + + /** @stable ICU 2.0 */ + UBLOCK_HIRAGANA =62, /*[3040]*/ + + /** @stable ICU 2.0 */ + UBLOCK_KATAKANA =63, /*[30A0]*/ + + /** @stable ICU 2.0 */ + UBLOCK_BOPOMOFO =64, /*[3100]*/ + + /** @stable ICU 2.0 */ + UBLOCK_HANGUL_COMPATIBILITY_JAMO =65, /*[3130]*/ + + /** @stable ICU 2.0 */ + UBLOCK_KANBUN =66, /*[3190]*/ + + /** @stable ICU 2.0 */ + UBLOCK_BOPOMOFO_EXTENDED =67, /*[31A0]*/ + + /** @stable ICU 2.0 */ + UBLOCK_ENCLOSED_CJK_LETTERS_AND_MONTHS =68, /*[3200]*/ + + /** @stable ICU 2.0 */ + UBLOCK_CJK_COMPATIBILITY =69, /*[3300]*/ + + /** @stable ICU 2.0 */ + UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A =70, /*[3400]*/ + + /** @stable ICU 2.0 */ + UBLOCK_CJK_UNIFIED_IDEOGRAPHS =71, /*[4E00]*/ + + /** @stable ICU 2.0 */ + UBLOCK_YI_SYLLABLES =72, /*[A000]*/ + + /** @stable ICU 2.0 */ + UBLOCK_YI_RADICALS =73, /*[A490]*/ + + /** @stable ICU 2.0 */ + UBLOCK_HANGUL_SYLLABLES =74, /*[AC00]*/ + + /** @stable ICU 2.0 */ + UBLOCK_HIGH_SURROGATES =75, /*[D800]*/ + + /** @stable ICU 2.0 */ + UBLOCK_HIGH_PRIVATE_USE_SURROGATES =76, /*[DB80]*/ + + /** @stable ICU 2.0 */ + UBLOCK_LOW_SURROGATES =77, /*[DC00]*/ + + /** + * Same as UBLOCK_PRIVATE_USE. + * Until Unicode 3.1.1, the corresponding block name was "Private Use", + * and multiple code point ranges had this block. + * Unicode 3.2 renames the block for the BMP PUA to "Private Use Area" and + * adds separate blocks for the supplementary PUAs. + * + * @stable ICU 2.0 + */ + UBLOCK_PRIVATE_USE_AREA =78, /*[E000]*/ + /** + * Same as UBLOCK_PRIVATE_USE_AREA. + * Until Unicode 3.1.1, the corresponding block name was "Private Use", + * and multiple code point ranges had this block. + * Unicode 3.2 renames the block for the BMP PUA to "Private Use Area" and + * adds separate blocks for the supplementary PUAs. + * + * @stable ICU 2.0 + */ + UBLOCK_PRIVATE_USE = UBLOCK_PRIVATE_USE_AREA, + + /** @stable ICU 2.0 */ + UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS =79, /*[F900]*/ + + /** @stable ICU 2.0 */ + UBLOCK_ALPHABETIC_PRESENTATION_FORMS =80, /*[FB00]*/ + + /** @stable ICU 2.0 */ + UBLOCK_ARABIC_PRESENTATION_FORMS_A =81, /*[FB50]*/ + + /** @stable ICU 2.0 */ + UBLOCK_COMBINING_HALF_MARKS =82, /*[FE20]*/ + + /** @stable ICU 2.0 */ + UBLOCK_CJK_COMPATIBILITY_FORMS =83, /*[FE30]*/ + + /** @stable ICU 2.0 */ + UBLOCK_SMALL_FORM_VARIANTS =84, /*[FE50]*/ + + /** @stable ICU 2.0 */ + UBLOCK_ARABIC_PRESENTATION_FORMS_B =85, /*[FE70]*/ + + /** @stable ICU 2.0 */ + UBLOCK_SPECIALS =86, /*[FFF0]*/ + + /** @stable ICU 2.0 */ + UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS =87, /*[FF00]*/ + + /* New blocks in Unicode 3.1 */ + + /** @stable ICU 2.0 */ + UBLOCK_OLD_ITALIC = 88, /*[10300]*/ + /** @stable ICU 2.0 */ + UBLOCK_GOTHIC = 89, /*[10330]*/ + /** @stable ICU 2.0 */ + UBLOCK_DESERET = 90, /*[10400]*/ + /** @stable ICU 2.0 */ + UBLOCK_BYZANTINE_MUSICAL_SYMBOLS = 91, /*[1D000]*/ + /** @stable ICU 2.0 */ + UBLOCK_MUSICAL_SYMBOLS = 92, /*[1D100]*/ + /** @stable ICU 2.0 */ + UBLOCK_MATHEMATICAL_ALPHANUMERIC_SYMBOLS = 93, /*[1D400]*/ + /** @stable ICU 2.0 */ + UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B = 94, /*[20000]*/ + /** @stable ICU 2.0 */ + UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT = 95, /*[2F800]*/ + /** @stable ICU 2.0 */ + UBLOCK_TAGS = 96, /*[E0000]*/ + + /* New blocks in Unicode 3.2 */ + + /** @stable ICU 3.0 */ + UBLOCK_CYRILLIC_SUPPLEMENT = 97, /*[0500]*/ + /** + * Unicode 4.0.1 renames the "Cyrillic Supplementary" block to "Cyrillic Supplement". + * @stable ICU 2.2 + */ + UBLOCK_CYRILLIC_SUPPLEMENTARY = UBLOCK_CYRILLIC_SUPPLEMENT, + /** @stable ICU 2.2 */ + UBLOCK_TAGALOG = 98, /*[1700]*/ + /** @stable ICU 2.2 */ + UBLOCK_HANUNOO = 99, /*[1720]*/ + /** @stable ICU 2.2 */ + UBLOCK_BUHID = 100, /*[1740]*/ + /** @stable ICU 2.2 */ + UBLOCK_TAGBANWA = 101, /*[1760]*/ + /** @stable ICU 2.2 */ + UBLOCK_MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A = 102, /*[27C0]*/ + /** @stable ICU 2.2 */ + UBLOCK_SUPPLEMENTAL_ARROWS_A = 103, /*[27F0]*/ + /** @stable ICU 2.2 */ + UBLOCK_SUPPLEMENTAL_ARROWS_B = 104, /*[2900]*/ + /** @stable ICU 2.2 */ + UBLOCK_MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B = 105, /*[2980]*/ + /** @stable ICU 2.2 */ + UBLOCK_SUPPLEMENTAL_MATHEMATICAL_OPERATORS = 106, /*[2A00]*/ + /** @stable ICU 2.2 */ + UBLOCK_KATAKANA_PHONETIC_EXTENSIONS = 107, /*[31F0]*/ + /** @stable ICU 2.2 */ + UBLOCK_VARIATION_SELECTORS = 108, /*[FE00]*/ + /** @stable ICU 2.2 */ + UBLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_A = 109, /*[F0000]*/ + /** @stable ICU 2.2 */ + UBLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_B = 110, /*[100000]*/ + + /* New blocks in Unicode 4 */ + + /** @stable ICU 2.6 */ + UBLOCK_LIMBU = 111, /*[1900]*/ + /** @stable ICU 2.6 */ + UBLOCK_TAI_LE = 112, /*[1950]*/ + /** @stable ICU 2.6 */ + UBLOCK_KHMER_SYMBOLS = 113, /*[19E0]*/ + /** @stable ICU 2.6 */ + UBLOCK_PHONETIC_EXTENSIONS = 114, /*[1D00]*/ + /** @stable ICU 2.6 */ + UBLOCK_MISCELLANEOUS_SYMBOLS_AND_ARROWS = 115, /*[2B00]*/ + /** @stable ICU 2.6 */ + UBLOCK_YIJING_HEXAGRAM_SYMBOLS = 116, /*[4DC0]*/ + /** @stable ICU 2.6 */ + UBLOCK_LINEAR_B_SYLLABARY = 117, /*[10000]*/ + /** @stable ICU 2.6 */ + UBLOCK_LINEAR_B_IDEOGRAMS = 118, /*[10080]*/ + /** @stable ICU 2.6 */ + UBLOCK_AEGEAN_NUMBERS = 119, /*[10100]*/ + /** @stable ICU 2.6 */ + UBLOCK_UGARITIC = 120, /*[10380]*/ + /** @stable ICU 2.6 */ + UBLOCK_SHAVIAN = 121, /*[10450]*/ + /** @stable ICU 2.6 */ + UBLOCK_OSMANYA = 122, /*[10480]*/ + /** @stable ICU 2.6 */ + UBLOCK_CYPRIOT_SYLLABARY = 123, /*[10800]*/ + /** @stable ICU 2.6 */ + UBLOCK_TAI_XUAN_JING_SYMBOLS = 124, /*[1D300]*/ + /** @stable ICU 2.6 */ + UBLOCK_VARIATION_SELECTORS_SUPPLEMENT = 125, /*[E0100]*/ + + /* New blocks in Unicode 4.1 */ + + /** @stable ICU 3.4 */ + UBLOCK_ANCIENT_GREEK_MUSICAL_NOTATION = 126, /*[1D200]*/ + /** @stable ICU 3.4 */ + UBLOCK_ANCIENT_GREEK_NUMBERS = 127, /*[10140]*/ + /** @stable ICU 3.4 */ + UBLOCK_ARABIC_SUPPLEMENT = 128, /*[0750]*/ + /** @stable ICU 3.4 */ + UBLOCK_BUGINESE = 129, /*[1A00]*/ + /** @stable ICU 3.4 */ + UBLOCK_CJK_STROKES = 130, /*[31C0]*/ + /** @stable ICU 3.4 */ + UBLOCK_COMBINING_DIACRITICAL_MARKS_SUPPLEMENT = 131, /*[1DC0]*/ + /** @stable ICU 3.4 */ + UBLOCK_COPTIC = 132, /*[2C80]*/ + /** @stable ICU 3.4 */ + UBLOCK_ETHIOPIC_EXTENDED = 133, /*[2D80]*/ + /** @stable ICU 3.4 */ + UBLOCK_ETHIOPIC_SUPPLEMENT = 134, /*[1380]*/ + /** @stable ICU 3.4 */ + UBLOCK_GEORGIAN_SUPPLEMENT = 135, /*[2D00]*/ + /** @stable ICU 3.4 */ + UBLOCK_GLAGOLITIC = 136, /*[2C00]*/ + /** @stable ICU 3.4 */ + UBLOCK_KHAROSHTHI = 137, /*[10A00]*/ + /** @stable ICU 3.4 */ + UBLOCK_MODIFIER_TONE_LETTERS = 138, /*[A700]*/ + /** @stable ICU 3.4 */ + UBLOCK_NEW_TAI_LUE = 139, /*[1980]*/ + /** @stable ICU 3.4 */ + UBLOCK_OLD_PERSIAN = 140, /*[103A0]*/ + /** @stable ICU 3.4 */ + UBLOCK_PHONETIC_EXTENSIONS_SUPPLEMENT = 141, /*[1D80]*/ + /** @stable ICU 3.4 */ + UBLOCK_SUPPLEMENTAL_PUNCTUATION = 142, /*[2E00]*/ + /** @stable ICU 3.4 */ + UBLOCK_SYLOTI_NAGRI = 143, /*[A800]*/ + /** @stable ICU 3.4 */ + UBLOCK_TIFINAGH = 144, /*[2D30]*/ + /** @stable ICU 3.4 */ + UBLOCK_VERTICAL_FORMS = 145, /*[FE10]*/ + + /* New blocks in Unicode 5.0 */ + + /** @stable ICU 3.6 */ + UBLOCK_NKO = 146, /*[07C0]*/ + /** @stable ICU 3.6 */ + UBLOCK_BALINESE = 147, /*[1B00]*/ + /** @stable ICU 3.6 */ + UBLOCK_LATIN_EXTENDED_C = 148, /*[2C60]*/ + /** @stable ICU 3.6 */ + UBLOCK_LATIN_EXTENDED_D = 149, /*[A720]*/ + /** @stable ICU 3.6 */ + UBLOCK_PHAGS_PA = 150, /*[A840]*/ + /** @stable ICU 3.6 */ + UBLOCK_PHOENICIAN = 151, /*[10900]*/ + /** @stable ICU 3.6 */ + UBLOCK_CUNEIFORM = 152, /*[12000]*/ + /** @stable ICU 3.6 */ + UBLOCK_CUNEIFORM_NUMBERS_AND_PUNCTUATION = 153, /*[12400]*/ + /** @stable ICU 3.6 */ + UBLOCK_COUNTING_ROD_NUMERALS = 154, /*[1D360]*/ + + /* New blocks in Unicode 5.1 */ + + /** @stable ICU 4.0 */ + UBLOCK_SUNDANESE = 155, /*[1B80]*/ + /** @stable ICU 4.0 */ + UBLOCK_LEPCHA = 156, /*[1C00]*/ + /** @stable ICU 4.0 */ + UBLOCK_OL_CHIKI = 157, /*[1C50]*/ + /** @stable ICU 4.0 */ + UBLOCK_CYRILLIC_EXTENDED_A = 158, /*[2DE0]*/ + /** @stable ICU 4.0 */ + UBLOCK_VAI = 159, /*[A500]*/ + /** @stable ICU 4.0 */ + UBLOCK_CYRILLIC_EXTENDED_B = 160, /*[A640]*/ + /** @stable ICU 4.0 */ + UBLOCK_SAURASHTRA = 161, /*[A880]*/ + /** @stable ICU 4.0 */ + UBLOCK_KAYAH_LI = 162, /*[A900]*/ + /** @stable ICU 4.0 */ + UBLOCK_REJANG = 163, /*[A930]*/ + /** @stable ICU 4.0 */ + UBLOCK_CHAM = 164, /*[AA00]*/ + /** @stable ICU 4.0 */ + UBLOCK_ANCIENT_SYMBOLS = 165, /*[10190]*/ + /** @stable ICU 4.0 */ + UBLOCK_PHAISTOS_DISC = 166, /*[101D0]*/ + /** @stable ICU 4.0 */ + UBLOCK_LYCIAN = 167, /*[10280]*/ + /** @stable ICU 4.0 */ + UBLOCK_CARIAN = 168, /*[102A0]*/ + /** @stable ICU 4.0 */ + UBLOCK_LYDIAN = 169, /*[10920]*/ + /** @stable ICU 4.0 */ + UBLOCK_MAHJONG_TILES = 170, /*[1F000]*/ + /** @stable ICU 4.0 */ + UBLOCK_DOMINO_TILES = 171, /*[1F030]*/ + + /* New blocks in Unicode 5.2 */ + + /** @stable ICU 4.4 */ + UBLOCK_SAMARITAN = 172, /*[0800]*/ + /** @stable ICU 4.4 */ + UBLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED = 173, /*[18B0]*/ + /** @stable ICU 4.4 */ + UBLOCK_TAI_THAM = 174, /*[1A20]*/ + /** @stable ICU 4.4 */ + UBLOCK_VEDIC_EXTENSIONS = 175, /*[1CD0]*/ + /** @stable ICU 4.4 */ + UBLOCK_LISU = 176, /*[A4D0]*/ + /** @stable ICU 4.4 */ + UBLOCK_BAMUM = 177, /*[A6A0]*/ + /** @stable ICU 4.4 */ + UBLOCK_COMMON_INDIC_NUMBER_FORMS = 178, /*[A830]*/ + /** @stable ICU 4.4 */ + UBLOCK_DEVANAGARI_EXTENDED = 179, /*[A8E0]*/ + /** @stable ICU 4.4 */ + UBLOCK_HANGUL_JAMO_EXTENDED_A = 180, /*[A960]*/ + /** @stable ICU 4.4 */ + UBLOCK_JAVANESE = 181, /*[A980]*/ + /** @stable ICU 4.4 */ + UBLOCK_MYANMAR_EXTENDED_A = 182, /*[AA60]*/ + /** @stable ICU 4.4 */ + UBLOCK_TAI_VIET = 183, /*[AA80]*/ + /** @stable ICU 4.4 */ + UBLOCK_MEETEI_MAYEK = 184, /*[ABC0]*/ + /** @stable ICU 4.4 */ + UBLOCK_HANGUL_JAMO_EXTENDED_B = 185, /*[D7B0]*/ + /** @stable ICU 4.4 */ + UBLOCK_IMPERIAL_ARAMAIC = 186, /*[10840]*/ + /** @stable ICU 4.4 */ + UBLOCK_OLD_SOUTH_ARABIAN = 187, /*[10A60]*/ + /** @stable ICU 4.4 */ + UBLOCK_AVESTAN = 188, /*[10B00]*/ + /** @stable ICU 4.4 */ + UBLOCK_INSCRIPTIONAL_PARTHIAN = 189, /*[10B40]*/ + /** @stable ICU 4.4 */ + UBLOCK_INSCRIPTIONAL_PAHLAVI = 190, /*[10B60]*/ + /** @stable ICU 4.4 */ + UBLOCK_OLD_TURKIC = 191, /*[10C00]*/ + /** @stable ICU 4.4 */ + UBLOCK_RUMI_NUMERAL_SYMBOLS = 192, /*[10E60]*/ + /** @stable ICU 4.4 */ + UBLOCK_KAITHI = 193, /*[11080]*/ + /** @stable ICU 4.4 */ + UBLOCK_EGYPTIAN_HIEROGLYPHS = 194, /*[13000]*/ + /** @stable ICU 4.4 */ + UBLOCK_ENCLOSED_ALPHANUMERIC_SUPPLEMENT = 195, /*[1F100]*/ + /** @stable ICU 4.4 */ + UBLOCK_ENCLOSED_IDEOGRAPHIC_SUPPLEMENT = 196, /*[1F200]*/ + /** @stable ICU 4.4 */ + UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C = 197, /*[2A700]*/ + + /* New blocks in Unicode 6.0 */ + + /** @stable ICU 4.6 */ + UBLOCK_MANDAIC = 198, /*[0840]*/ + /** @stable ICU 4.6 */ + UBLOCK_BATAK = 199, /*[1BC0]*/ + /** @stable ICU 4.6 */ + UBLOCK_ETHIOPIC_EXTENDED_A = 200, /*[AB00]*/ + /** @stable ICU 4.6 */ + UBLOCK_BRAHMI = 201, /*[11000]*/ + /** @stable ICU 4.6 */ + UBLOCK_BAMUM_SUPPLEMENT = 202, /*[16800]*/ + /** @stable ICU 4.6 */ + UBLOCK_KANA_SUPPLEMENT = 203, /*[1B000]*/ + /** @stable ICU 4.6 */ + UBLOCK_PLAYING_CARDS = 204, /*[1F0A0]*/ + /** @stable ICU 4.6 */ + UBLOCK_MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS = 205, /*[1F300]*/ + /** @stable ICU 4.6 */ + UBLOCK_EMOTICONS = 206, /*[1F600]*/ + /** @stable ICU 4.6 */ + UBLOCK_TRANSPORT_AND_MAP_SYMBOLS = 207, /*[1F680]*/ + /** @stable ICU 4.6 */ + UBLOCK_ALCHEMICAL_SYMBOLS = 208, /*[1F700]*/ + /** @stable ICU 4.6 */ + UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D = 209, /*[2B740]*/ + + /* New blocks in Unicode 6.1 */ + + /** @stable ICU 49 */ + UBLOCK_ARABIC_EXTENDED_A = 210, /*[08A0]*/ + /** @stable ICU 49 */ + UBLOCK_ARABIC_MATHEMATICAL_ALPHABETIC_SYMBOLS = 211, /*[1EE00]*/ + /** @stable ICU 49 */ + UBLOCK_CHAKMA = 212, /*[11100]*/ + /** @stable ICU 49 */ + UBLOCK_MEETEI_MAYEK_EXTENSIONS = 213, /*[AAE0]*/ + /** @stable ICU 49 */ + UBLOCK_MEROITIC_CURSIVE = 214, /*[109A0]*/ + /** @stable ICU 49 */ + UBLOCK_MEROITIC_HIEROGLYPHS = 215, /*[10980]*/ + /** @stable ICU 49 */ + UBLOCK_MIAO = 216, /*[16F00]*/ + /** @stable ICU 49 */ + UBLOCK_SHARADA = 217, /*[11180]*/ + /** @stable ICU 49 */ + UBLOCK_SORA_SOMPENG = 218, /*[110D0]*/ + /** @stable ICU 49 */ + UBLOCK_SUNDANESE_SUPPLEMENT = 219, /*[1CC0]*/ + /** @stable ICU 49 */ + UBLOCK_TAKRI = 220, /*[11680]*/ + + /* New blocks in Unicode 7.0 */ + + /** @stable ICU 54 */ + UBLOCK_BASSA_VAH = 221, /*[16AD0]*/ + /** @stable ICU 54 */ + UBLOCK_CAUCASIAN_ALBANIAN = 222, /*[10530]*/ + /** @stable ICU 54 */ + UBLOCK_COPTIC_EPACT_NUMBERS = 223, /*[102E0]*/ + /** @stable ICU 54 */ + UBLOCK_COMBINING_DIACRITICAL_MARKS_EXTENDED = 224, /*[1AB0]*/ + /** @stable ICU 54 */ + UBLOCK_DUPLOYAN = 225, /*[1BC00]*/ + /** @stable ICU 54 */ + UBLOCK_ELBASAN = 226, /*[10500]*/ + /** @stable ICU 54 */ + UBLOCK_GEOMETRIC_SHAPES_EXTENDED = 227, /*[1F780]*/ + /** @stable ICU 54 */ + UBLOCK_GRANTHA = 228, /*[11300]*/ + /** @stable ICU 54 */ + UBLOCK_KHOJKI = 229, /*[11200]*/ + /** @stable ICU 54 */ + UBLOCK_KHUDAWADI = 230, /*[112B0]*/ + /** @stable ICU 54 */ + UBLOCK_LATIN_EXTENDED_E = 231, /*[AB30]*/ + /** @stable ICU 54 */ + UBLOCK_LINEAR_A = 232, /*[10600]*/ + /** @stable ICU 54 */ + UBLOCK_MAHAJANI = 233, /*[11150]*/ + /** @stable ICU 54 */ + UBLOCK_MANICHAEAN = 234, /*[10AC0]*/ + /** @stable ICU 54 */ + UBLOCK_MENDE_KIKAKUI = 235, /*[1E800]*/ + /** @stable ICU 54 */ + UBLOCK_MODI = 236, /*[11600]*/ + /** @stable ICU 54 */ + UBLOCK_MRO = 237, /*[16A40]*/ + /** @stable ICU 54 */ + UBLOCK_MYANMAR_EXTENDED_B = 238, /*[A9E0]*/ + /** @stable ICU 54 */ + UBLOCK_NABATAEAN = 239, /*[10880]*/ + /** @stable ICU 54 */ + UBLOCK_OLD_NORTH_ARABIAN = 240, /*[10A80]*/ + /** @stable ICU 54 */ + UBLOCK_OLD_PERMIC = 241, /*[10350]*/ + /** @stable ICU 54 */ + UBLOCK_ORNAMENTAL_DINGBATS = 242, /*[1F650]*/ + /** @stable ICU 54 */ + UBLOCK_PAHAWH_HMONG = 243, /*[16B00]*/ + /** @stable ICU 54 */ + UBLOCK_PALMYRENE = 244, /*[10860]*/ + /** @stable ICU 54 */ + UBLOCK_PAU_CIN_HAU = 245, /*[11AC0]*/ + /** @stable ICU 54 */ + UBLOCK_PSALTER_PAHLAVI = 246, /*[10B80]*/ + /** @stable ICU 54 */ + UBLOCK_SHORTHAND_FORMAT_CONTROLS = 247, /*[1BCA0]*/ + /** @stable ICU 54 */ + UBLOCK_SIDDHAM = 248, /*[11580]*/ + /** @stable ICU 54 */ + UBLOCK_SINHALA_ARCHAIC_NUMBERS = 249, /*[111E0]*/ + /** @stable ICU 54 */ + UBLOCK_SUPPLEMENTAL_ARROWS_C = 250, /*[1F800]*/ + /** @stable ICU 54 */ + UBLOCK_TIRHUTA = 251, /*[11480]*/ + /** @stable ICU 54 */ + UBLOCK_WARANG_CITI = 252, /*[118A0]*/ + + /** @stable ICU 2.0 */ + UBLOCK_COUNT = 253, + + /** @stable ICU 2.0 */ + UBLOCK_INVALID_CODE=-1 +}; + +/** @stable ICU 2.0 */ +typedef enum UBlockCode UBlockCode; + +/** + * East Asian Width constants. + * + * @see UCHAR_EAST_ASIAN_WIDTH + * @see u_getIntPropertyValue + * @stable ICU 2.2 + */ +typedef enum UEastAsianWidth { + /* + * Note: UEastAsianWidth constants are parsed by preparseucd.py. + * It matches lines like + * U_EA_<Unicode East_Asian_Width value name> + */ + + U_EA_NEUTRAL, /*[N]*/ + U_EA_AMBIGUOUS, /*[A]*/ + U_EA_HALFWIDTH, /*[H]*/ + U_EA_FULLWIDTH, /*[F]*/ + U_EA_NARROW, /*[Na]*/ + U_EA_WIDE, /*[W]*/ + U_EA_COUNT +} UEastAsianWidth; + +/** + * Selector constants for u_charName(). + * u_charName() returns the "modern" name of a + * Unicode character; or the name that was defined in + * Unicode version 1.0, before the Unicode standard merged + * with ISO-10646; or an "extended" name that gives each + * Unicode code point a unique name. + * + * @see u_charName + * @stable ICU 2.0 + */ +typedef enum UCharNameChoice { + /** Unicode character name (Name property). @stable ICU 2.0 */ + U_UNICODE_CHAR_NAME, +#ifndef U_HIDE_DEPRECATED_API + /** + * The Unicode_1_Name property value which is of little practical value. + * Beginning with ICU 49, ICU APIs return an empty string for this name choice. + * @deprecated ICU 49 + */ + U_UNICODE_10_CHAR_NAME, +#endif /* U_HIDE_DEPRECATED_API */ + /** Standard or synthetic character name. @stable ICU 2.0 */ + U_EXTENDED_CHAR_NAME = U_UNICODE_CHAR_NAME+2, + /** Corrected name from NameAliases.txt. @stable ICU 4.4 */ + U_CHAR_NAME_ALIAS, + /** @stable ICU 2.0 */ + U_CHAR_NAME_CHOICE_COUNT +} UCharNameChoice; + +/** + * Selector constants for u_getPropertyName() and + * u_getPropertyValueName(). These selectors are used to choose which + * name is returned for a given property or value. All properties and + * values have a long name. Most have a short name, but some do not. + * Unicode allows for additional names, beyond the long and short + * name, which would be indicated by U_LONG_PROPERTY_NAME + i, where + * i=1, 2,... + * + * @see u_getPropertyName() + * @see u_getPropertyValueName() + * @stable ICU 2.4 + */ +typedef enum UPropertyNameChoice { + U_SHORT_PROPERTY_NAME, + U_LONG_PROPERTY_NAME, + U_PROPERTY_NAME_CHOICE_COUNT +} UPropertyNameChoice; + +/** + * Decomposition Type constants. + * + * @see UCHAR_DECOMPOSITION_TYPE + * @stable ICU 2.2 + */ +typedef enum UDecompositionType { + /* + * Note: UDecompositionType constants are parsed by preparseucd.py. + * It matches lines like + * U_DT_<Unicode Decomposition_Type value name> + */ + + U_DT_NONE, /*[none]*/ + U_DT_CANONICAL, /*[can]*/ + U_DT_COMPAT, /*[com]*/ + U_DT_CIRCLE, /*[enc]*/ + U_DT_FINAL, /*[fin]*/ + U_DT_FONT, /*[font]*/ + U_DT_FRACTION, /*[fra]*/ + U_DT_INITIAL, /*[init]*/ + U_DT_ISOLATED, /*[iso]*/ + U_DT_MEDIAL, /*[med]*/ + U_DT_NARROW, /*[nar]*/ + U_DT_NOBREAK, /*[nb]*/ + U_DT_SMALL, /*[sml]*/ + U_DT_SQUARE, /*[sqr]*/ + U_DT_SUB, /*[sub]*/ + U_DT_SUPER, /*[sup]*/ + U_DT_VERTICAL, /*[vert]*/ + U_DT_WIDE, /*[wide]*/ + U_DT_COUNT /* 18 */ +} UDecompositionType; + +/** + * Joining Type constants. + * + * @see UCHAR_JOINING_TYPE + * @stable ICU 2.2 + */ +typedef enum UJoiningType { + /* + * Note: UJoiningType constants are parsed by preparseucd.py. + * It matches lines like + * U_JT_<Unicode Joining_Type value name> + */ + + U_JT_NON_JOINING, /*[U]*/ + U_JT_JOIN_CAUSING, /*[C]*/ + U_JT_DUAL_JOINING, /*[D]*/ + U_JT_LEFT_JOINING, /*[L]*/ + U_JT_RIGHT_JOINING, /*[R]*/ + U_JT_TRANSPARENT, /*[T]*/ + U_JT_COUNT /* 6 */ +} UJoiningType; + +/** + * Joining Group constants. + * + * @see UCHAR_JOINING_GROUP + * @stable ICU 2.2 + */ +typedef enum UJoiningGroup { + /* + * Note: UJoiningGroup constants are parsed by preparseucd.py. + * It matches lines like + * U_JG_<Unicode Joining_Group value name> + */ + + U_JG_NO_JOINING_GROUP, + U_JG_AIN, + U_JG_ALAPH, + U_JG_ALEF, + U_JG_BEH, + U_JG_BETH, + U_JG_DAL, + U_JG_DALATH_RISH, + U_JG_E, + U_JG_FEH, + U_JG_FINAL_SEMKATH, + U_JG_GAF, + U_JG_GAMAL, + U_JG_HAH, + U_JG_TEH_MARBUTA_GOAL, /**< @stable ICU 4.6 */ + U_JG_HAMZA_ON_HEH_GOAL=U_JG_TEH_MARBUTA_GOAL, + U_JG_HE, + U_JG_HEH, + U_JG_HEH_GOAL, + U_JG_HETH, + U_JG_KAF, + U_JG_KAPH, + U_JG_KNOTTED_HEH, + U_JG_LAM, + U_JG_LAMADH, + U_JG_MEEM, + U_JG_MIM, + U_JG_NOON, + U_JG_NUN, + U_JG_PE, + U_JG_QAF, + U_JG_QAPH, + U_JG_REH, + U_JG_REVERSED_PE, + U_JG_SAD, + U_JG_SADHE, + U_JG_SEEN, + U_JG_SEMKATH, + U_JG_SHIN, + U_JG_SWASH_KAF, + U_JG_SYRIAC_WAW, + U_JG_TAH, + U_JG_TAW, + U_JG_TEH_MARBUTA, + U_JG_TETH, + U_JG_WAW, + U_JG_YEH, + U_JG_YEH_BARREE, + U_JG_YEH_WITH_TAIL, + U_JG_YUDH, + U_JG_YUDH_HE, + U_JG_ZAIN, + U_JG_FE, /**< @stable ICU 2.6 */ + U_JG_KHAPH, /**< @stable ICU 2.6 */ + U_JG_ZHAIN, /**< @stable ICU 2.6 */ + U_JG_BURUSHASKI_YEH_BARREE, /**< @stable ICU 4.0 */ + U_JG_FARSI_YEH, /**< @stable ICU 4.4 */ + U_JG_NYA, /**< @stable ICU 4.4 */ + U_JG_ROHINGYA_YEH, /**< @stable ICU 49 */ + U_JG_MANICHAEAN_ALEPH, /**< @stable ICU 54 */ + U_JG_MANICHAEAN_AYIN, /**< @stable ICU 54 */ + U_JG_MANICHAEAN_BETH, /**< @stable ICU 54 */ + U_JG_MANICHAEAN_DALETH, /**< @stable ICU 54 */ + U_JG_MANICHAEAN_DHAMEDH, /**< @stable ICU 54 */ + U_JG_MANICHAEAN_FIVE, /**< @stable ICU 54 */ + U_JG_MANICHAEAN_GIMEL, /**< @stable ICU 54 */ + U_JG_MANICHAEAN_HETH, /**< @stable ICU 54 */ + U_JG_MANICHAEAN_HUNDRED, /**< @stable ICU 54 */ + U_JG_MANICHAEAN_KAPH, /**< @stable ICU 54 */ + U_JG_MANICHAEAN_LAMEDH, /**< @stable ICU 54 */ + U_JG_MANICHAEAN_MEM, /**< @stable ICU 54 */ + U_JG_MANICHAEAN_NUN, /**< @stable ICU 54 */ + U_JG_MANICHAEAN_ONE, /**< @stable ICU 54 */ + U_JG_MANICHAEAN_PE, /**< @stable ICU 54 */ + U_JG_MANICHAEAN_QOPH, /**< @stable ICU 54 */ + U_JG_MANICHAEAN_RESH, /**< @stable ICU 54 */ + U_JG_MANICHAEAN_SADHE, /**< @stable ICU 54 */ + U_JG_MANICHAEAN_SAMEKH, /**< @stable ICU 54 */ + U_JG_MANICHAEAN_TAW, /**< @stable ICU 54 */ + U_JG_MANICHAEAN_TEN, /**< @stable ICU 54 */ + U_JG_MANICHAEAN_TETH, /**< @stable ICU 54 */ + U_JG_MANICHAEAN_THAMEDH, /**< @stable ICU 54 */ + U_JG_MANICHAEAN_TWENTY, /**< @stable ICU 54 */ + U_JG_MANICHAEAN_WAW, /**< @stable ICU 54 */ + U_JG_MANICHAEAN_YODH, /**< @stable ICU 54 */ + U_JG_MANICHAEAN_ZAYIN, /**< @stable ICU 54 */ + U_JG_STRAIGHT_WAW, /**< @stable ICU 54 */ + U_JG_COUNT +} UJoiningGroup; + +/** + * Grapheme Cluster Break constants. + * + * @see UCHAR_GRAPHEME_CLUSTER_BREAK + * @stable ICU 3.4 + */ +typedef enum UGraphemeClusterBreak { + /* + * Note: UGraphemeClusterBreak constants are parsed by preparseucd.py. + * It matches lines like + * U_GCB_<Unicode Grapheme_Cluster_Break value name> + */ + + U_GCB_OTHER = 0, /*[XX]*/ + U_GCB_CONTROL = 1, /*[CN]*/ + U_GCB_CR = 2, /*[CR]*/ + U_GCB_EXTEND = 3, /*[EX]*/ + U_GCB_L = 4, /*[L]*/ + U_GCB_LF = 5, /*[LF]*/ + U_GCB_LV = 6, /*[LV]*/ + U_GCB_LVT = 7, /*[LVT]*/ + U_GCB_T = 8, /*[T]*/ + U_GCB_V = 9, /*[V]*/ + U_GCB_SPACING_MARK = 10, /*[SM]*/ /* from here on: new in Unicode 5.1/ICU 4.0 */ + U_GCB_PREPEND = 11, /*[PP]*/ + U_GCB_REGIONAL_INDICATOR = 12, /*[RI]*/ /* new in Unicode 6.2/ICU 50 */ + U_GCB_COUNT = 13 +} UGraphemeClusterBreak; + +/** + * Word Break constants. + * (UWordBreak is a pre-existing enum type in ubrk.h for word break status tags.) + * + * @see UCHAR_WORD_BREAK + * @stable ICU 3.4 + */ +typedef enum UWordBreakValues { + /* + * Note: UWordBreakValues constants are parsed by preparseucd.py. + * It matches lines like + * U_WB_<Unicode Word_Break value name> + */ + + U_WB_OTHER = 0, /*[XX]*/ + U_WB_ALETTER = 1, /*[LE]*/ + U_WB_FORMAT = 2, /*[FO]*/ + U_WB_KATAKANA = 3, /*[KA]*/ + U_WB_MIDLETTER = 4, /*[ML]*/ + U_WB_MIDNUM = 5, /*[MN]*/ + U_WB_NUMERIC = 6, /*[NU]*/ + U_WB_EXTENDNUMLET = 7, /*[EX]*/ + U_WB_CR = 8, /*[CR]*/ /* from here on: new in Unicode 5.1/ICU 4.0 */ + U_WB_EXTEND = 9, /*[Extend]*/ + U_WB_LF = 10, /*[LF]*/ + U_WB_MIDNUMLET =11, /*[MB]*/ + U_WB_NEWLINE =12, /*[NL]*/ + U_WB_REGIONAL_INDICATOR = 13, /*[RI]*/ /* new in Unicode 6.2/ICU 50 */ + U_WB_HEBREW_LETTER = 14, /*[HL]*/ /* from here on: new in Unicode 6.3/ICU 52 */ + U_WB_SINGLE_QUOTE = 15, /*[SQ]*/ + U_WB_DOUBLE_QUOTE = 16, /*[DQ]*/ + U_WB_COUNT = 17 +} UWordBreakValues; + +/** + * Sentence Break constants. + * + * @see UCHAR_SENTENCE_BREAK + * @stable ICU 3.4 + */ +typedef enum USentenceBreak { + /* + * Note: USentenceBreak constants are parsed by preparseucd.py. + * It matches lines like + * U_SB_<Unicode Sentence_Break value name> + */ + + U_SB_OTHER = 0, /*[XX]*/ + U_SB_ATERM = 1, /*[AT]*/ + U_SB_CLOSE = 2, /*[CL]*/ + U_SB_FORMAT = 3, /*[FO]*/ + U_SB_LOWER = 4, /*[LO]*/ + U_SB_NUMERIC = 5, /*[NU]*/ + U_SB_OLETTER = 6, /*[LE]*/ + U_SB_SEP = 7, /*[SE]*/ + U_SB_SP = 8, /*[SP]*/ + U_SB_STERM = 9, /*[ST]*/ + U_SB_UPPER = 10, /*[UP]*/ + U_SB_CR = 11, /*[CR]*/ /* from here on: new in Unicode 5.1/ICU 4.0 */ + U_SB_EXTEND = 12, /*[EX]*/ + U_SB_LF = 13, /*[LF]*/ + U_SB_SCONTINUE = 14, /*[SC]*/ + U_SB_COUNT = 15 +} USentenceBreak; + +/** + * Line Break constants. + * + * @see UCHAR_LINE_BREAK + * @stable ICU 2.2 + */ +typedef enum ULineBreak { + /* + * Note: ULineBreak constants are parsed by preparseucd.py. + * It matches lines like + * U_LB_<Unicode Line_Break value name> + */ + + U_LB_UNKNOWN = 0, /*[XX]*/ + U_LB_AMBIGUOUS = 1, /*[AI]*/ + U_LB_ALPHABETIC = 2, /*[AL]*/ + U_LB_BREAK_BOTH = 3, /*[B2]*/ + U_LB_BREAK_AFTER = 4, /*[BA]*/ + U_LB_BREAK_BEFORE = 5, /*[BB]*/ + U_LB_MANDATORY_BREAK = 6, /*[BK]*/ + U_LB_CONTINGENT_BREAK = 7, /*[CB]*/ + U_LB_CLOSE_PUNCTUATION = 8, /*[CL]*/ + U_LB_COMBINING_MARK = 9, /*[CM]*/ + U_LB_CARRIAGE_RETURN = 10, /*[CR]*/ + U_LB_EXCLAMATION = 11, /*[EX]*/ + U_LB_GLUE = 12, /*[GL]*/ + U_LB_HYPHEN = 13, /*[HY]*/ + U_LB_IDEOGRAPHIC = 14, /*[ID]*/ + /** Renamed from the misspelled "inseperable" in Unicode 4.0.1/ICU 3.0 @stable ICU 3.0 */ + U_LB_INSEPARABLE = 15, /*[IN]*/ + U_LB_INSEPERABLE = U_LB_INSEPARABLE, + U_LB_INFIX_NUMERIC = 16, /*[IS]*/ + U_LB_LINE_FEED = 17, /*[LF]*/ + U_LB_NONSTARTER = 18, /*[NS]*/ + U_LB_NUMERIC = 19, /*[NU]*/ + U_LB_OPEN_PUNCTUATION = 20, /*[OP]*/ + U_LB_POSTFIX_NUMERIC = 21, /*[PO]*/ + U_LB_PREFIX_NUMERIC = 22, /*[PR]*/ + U_LB_QUOTATION = 23, /*[QU]*/ + U_LB_COMPLEX_CONTEXT = 24, /*[SA]*/ + U_LB_SURROGATE = 25, /*[SG]*/ + U_LB_SPACE = 26, /*[SP]*/ + U_LB_BREAK_SYMBOLS = 27, /*[SY]*/ + U_LB_ZWSPACE = 28, /*[ZW]*/ + U_LB_NEXT_LINE = 29, /*[NL]*/ /* from here on: new in Unicode 4/ICU 2.6 */ + U_LB_WORD_JOINER = 30, /*[WJ]*/ + U_LB_H2 = 31, /*[H2]*/ /* from here on: new in Unicode 4.1/ICU 3.4 */ + U_LB_H3 = 32, /*[H3]*/ + U_LB_JL = 33, /*[JL]*/ + U_LB_JT = 34, /*[JT]*/ + U_LB_JV = 35, /*[JV]*/ + U_LB_CLOSE_PARENTHESIS = 36, /*[CP]*/ /* new in Unicode 5.2/ICU 4.4 */ + U_LB_CONDITIONAL_JAPANESE_STARTER = 37,/*[CJ]*/ /* new in Unicode 6.1/ICU 49 */ + U_LB_HEBREW_LETTER = 38, /*[HL]*/ /* new in Unicode 6.1/ICU 49 */ + U_LB_REGIONAL_INDICATOR = 39,/*[RI]*/ /* new in Unicode 6.2/ICU 50 */ + U_LB_COUNT = 40 +} ULineBreak; + +/** + * Numeric Type constants. + * + * @see UCHAR_NUMERIC_TYPE + * @stable ICU 2.2 + */ +typedef enum UNumericType { + /* + * Note: UNumericType constants are parsed by preparseucd.py. + * It matches lines like + * U_NT_<Unicode Numeric_Type value name> + */ + + U_NT_NONE, /*[None]*/ + U_NT_DECIMAL, /*[de]*/ + U_NT_DIGIT, /*[di]*/ + U_NT_NUMERIC, /*[nu]*/ + U_NT_COUNT +} UNumericType; + +/** + * Hangul Syllable Type constants. + * + * @see UCHAR_HANGUL_SYLLABLE_TYPE + * @stable ICU 2.6 + */ +typedef enum UHangulSyllableType { + /* + * Note: UHangulSyllableType constants are parsed by preparseucd.py. + * It matches lines like + * U_HST_<Unicode Hangul_Syllable_Type value name> + */ + + U_HST_NOT_APPLICABLE, /*[NA]*/ + U_HST_LEADING_JAMO, /*[L]*/ + U_HST_VOWEL_JAMO, /*[V]*/ + U_HST_TRAILING_JAMO, /*[T]*/ + U_HST_LV_SYLLABLE, /*[LV]*/ + U_HST_LVT_SYLLABLE, /*[LVT]*/ + U_HST_COUNT +} UHangulSyllableType; + +/** + * Check a binary Unicode property for a code point. + * + * Unicode, especially in version 3.2, defines many more properties than the + * original set in UnicodeData.txt. + * + * The properties APIs are intended to reflect Unicode properties as defined + * in the Unicode Character Database (UCD) and Unicode Technical Reports (UTR). + * For details about the properties see http://www.unicode.org/ucd/ . + * For names of Unicode properties see the UCD file PropertyAliases.txt. + * + * Important: If ICU is built with UCD files from Unicode versions below 3.2, + * then properties marked with "new in Unicode 3.2" are not or not fully available. + * + * @param c Code point to test. + * @param which UProperty selector constant, identifies which binary property to check. + * Must be UCHAR_BINARY_START<=which<UCHAR_BINARY_LIMIT. + * @return TRUE or FALSE according to the binary Unicode property value for c. + * Also FALSE if 'which' is out of bounds or if the Unicode version + * does not have data for the property at all, or not for this code point. + * + * @see UProperty + * @see u_getIntPropertyValue + * @see u_getUnicodeVersion + * @stable ICU 2.1 + */ +U_STABLE UBool U_EXPORT2 +u_hasBinaryProperty(UChar32 c, UProperty which); + +/** + * Check if a code point has the Alphabetic Unicode property. + * Same as u_hasBinaryProperty(c, UCHAR_ALPHABETIC). + * This is different from u_isalpha! + * @param c Code point to test + * @return true if the code point has the Alphabetic Unicode property, false otherwise + * + * @see UCHAR_ALPHABETIC + * @see u_isalpha + * @see u_hasBinaryProperty + * @stable ICU 2.1 + */ +U_STABLE UBool U_EXPORT2 +u_isUAlphabetic(UChar32 c); + +/** + * Check if a code point has the Lowercase Unicode property. + * Same as u_hasBinaryProperty(c, UCHAR_LOWERCASE). + * This is different from u_islower! + * @param c Code point to test + * @return true if the code point has the Lowercase Unicode property, false otherwise + * + * @see UCHAR_LOWERCASE + * @see u_islower + * @see u_hasBinaryProperty + * @stable ICU 2.1 + */ +U_STABLE UBool U_EXPORT2 +u_isULowercase(UChar32 c); + +/** + * Check if a code point has the Uppercase Unicode property. + * Same as u_hasBinaryProperty(c, UCHAR_UPPERCASE). + * This is different from u_isupper! + * @param c Code point to test + * @return true if the code point has the Uppercase Unicode property, false otherwise + * + * @see UCHAR_UPPERCASE + * @see u_isupper + * @see u_hasBinaryProperty + * @stable ICU 2.1 + */ +U_STABLE UBool U_EXPORT2 +u_isUUppercase(UChar32 c); + +/** + * Check if a code point has the White_Space Unicode property. + * Same as u_hasBinaryProperty(c, UCHAR_WHITE_SPACE). + * This is different from both u_isspace and u_isWhitespace! + * + * Note: There are several ICU whitespace functions; please see the uchar.h + * file documentation for a detailed comparison. + * + * @param c Code point to test + * @return true if the code point has the White_Space Unicode property, false otherwise. + * + * @see UCHAR_WHITE_SPACE + * @see u_isWhitespace + * @see u_isspace + * @see u_isJavaSpaceChar + * @see u_hasBinaryProperty + * @stable ICU 2.1 + */ +U_STABLE UBool U_EXPORT2 +u_isUWhiteSpace(UChar32 c); + +/** + * Get the property value for an enumerated or integer Unicode property for a code point. + * Also returns binary and mask property values. + * + * Unicode, especially in version 3.2, defines many more properties than the + * original set in UnicodeData.txt. + * + * The properties APIs are intended to reflect Unicode properties as defined + * in the Unicode Character Database (UCD) and Unicode Technical Reports (UTR). + * For details about the properties see http://www.unicode.org/ . + * For names of Unicode properties see the UCD file PropertyAliases.txt. + * + * Sample usage: + * UEastAsianWidth ea=(UEastAsianWidth)u_getIntPropertyValue(c, UCHAR_EAST_ASIAN_WIDTH); + * UBool b=(UBool)u_getIntPropertyValue(c, UCHAR_IDEOGRAPHIC); + * + * @param c Code point to test. + * @param which UProperty selector constant, identifies which property to check. + * Must be UCHAR_BINARY_START<=which<UCHAR_BINARY_LIMIT + * or UCHAR_INT_START<=which<UCHAR_INT_LIMIT + * or UCHAR_MASK_START<=which<UCHAR_MASK_LIMIT. + * @return Numeric value that is directly the property value or, + * for enumerated properties, corresponds to the numeric value of the enumerated + * constant of the respective property value enumeration type + * (cast to enum type if necessary). + * Returns 0 or 1 (for FALSE/TRUE) for binary Unicode properties. + * Returns a bit-mask for mask properties. + * Returns 0 if 'which' is out of bounds or if the Unicode version + * does not have data for the property at all, or not for this code point. + * + * @see UProperty + * @see u_hasBinaryProperty + * @see u_getIntPropertyMinValue + * @see u_getIntPropertyMaxValue + * @see u_getUnicodeVersion + * @stable ICU 2.2 + */ +U_STABLE int32_t U_EXPORT2 +u_getIntPropertyValue(UChar32 c, UProperty which); + +/** + * Get the minimum value for an enumerated/integer/binary Unicode property. + * Can be used together with u_getIntPropertyMaxValue + * to allocate arrays of UnicodeSet or similar. + * + * @param which UProperty selector constant, identifies which binary property to check. + * Must be UCHAR_BINARY_START<=which<UCHAR_BINARY_LIMIT + * or UCHAR_INT_START<=which<UCHAR_INT_LIMIT. + * @return Minimum value returned by u_getIntPropertyValue for a Unicode property. + * 0 if the property selector is out of range. + * + * @see UProperty + * @see u_hasBinaryProperty + * @see u_getUnicodeVersion + * @see u_getIntPropertyMaxValue + * @see u_getIntPropertyValue + * @stable ICU 2.2 + */ +U_STABLE int32_t U_EXPORT2 +u_getIntPropertyMinValue(UProperty which); + +/** + * Get the maximum value for an enumerated/integer/binary Unicode property. + * Can be used together with u_getIntPropertyMinValue + * to allocate arrays of UnicodeSet or similar. + * + * Examples for min/max values (for Unicode 3.2): + * + * - UCHAR_BIDI_CLASS: 0/18 (U_LEFT_TO_RIGHT/U_BOUNDARY_NEUTRAL) + * - UCHAR_SCRIPT: 0/45 (USCRIPT_COMMON/USCRIPT_TAGBANWA) + * - UCHAR_IDEOGRAPHIC: 0/1 (FALSE/TRUE) + * + * For undefined UProperty constant values, min/max values will be 0/-1. + * + * @param which UProperty selector constant, identifies which binary property to check. + * Must be UCHAR_BINARY_START<=which<UCHAR_BINARY_LIMIT + * or UCHAR_INT_START<=which<UCHAR_INT_LIMIT. + * @return Maximum value returned by u_getIntPropertyValue for a Unicode property. + * <=0 if the property selector is out of range. + * + * @see UProperty + * @see u_hasBinaryProperty + * @see u_getUnicodeVersion + * @see u_getIntPropertyMaxValue + * @see u_getIntPropertyValue + * @stable ICU 2.2 + */ +U_STABLE int32_t U_EXPORT2 +u_getIntPropertyMaxValue(UProperty which); + +/** + * Get the numeric value for a Unicode code point as defined in the + * Unicode Character Database. + * + * A "double" return type is necessary because + * some numeric values are fractions, negative, or too large for int32_t. + * + * For characters without any numeric values in the Unicode Character Database, + * this function will return U_NO_NUMERIC_VALUE. + * Note: This is different from the Unicode Standard which specifies NaN as the default value. + * (NaN is not available on all platforms.) + * + * Similar to java.lang.Character.getNumericValue(), but u_getNumericValue() + * also supports negative values, large values, and fractions, + * while Java's getNumericValue() returns values 10..35 for ASCII letters. + * + * @param c Code point to get the numeric value for. + * @return Numeric value of c, or U_NO_NUMERIC_VALUE if none is defined. + * + * @see U_NO_NUMERIC_VALUE + * @stable ICU 2.2 + */ +U_STABLE double U_EXPORT2 +u_getNumericValue(UChar32 c); + +/** + * Special value that is returned by u_getNumericValue when + * no numeric value is defined for a code point. + * + * @see u_getNumericValue + * @stable ICU 2.2 + */ +#define U_NO_NUMERIC_VALUE ((double)-123456789.) + +/** + * Determines whether the specified code point has the general category "Ll" + * (lowercase letter). + * + * Same as java.lang.Character.isLowerCase(). + * + * This misses some characters that are also lowercase but + * have a different general category value. + * In order to include those, use UCHAR_LOWERCASE. + * + * In addition to being equivalent to a Java function, this also serves + * as a C/POSIX migration function. + * See the comments about C/POSIX character classification functions in the + * documentation at the top of this header file. + * + * @param c the code point to be tested + * @return TRUE if the code point is an Ll lowercase letter + * + * @see UCHAR_LOWERCASE + * @see u_isupper + * @see u_istitle + * @stable ICU 2.0 + */ +U_STABLE UBool U_EXPORT2 +u_islower(UChar32 c); + +/** + * Determines whether the specified code point has the general category "Lu" + * (uppercase letter). + * + * Same as java.lang.Character.isUpperCase(). + * + * This misses some characters that are also uppercase but + * have a different general category value. + * In order to include those, use UCHAR_UPPERCASE. + * + * In addition to being equivalent to a Java function, this also serves + * as a C/POSIX migration function. + * See the comments about C/POSIX character classification functions in the + * documentation at the top of this header file. + * + * @param c the code point to be tested + * @return TRUE if the code point is an Lu uppercase letter + * + * @see UCHAR_UPPERCASE + * @see u_islower + * @see u_istitle + * @see u_tolower + * @stable ICU 2.0 + */ +U_STABLE UBool U_EXPORT2 +u_isupper(UChar32 c); + +/** + * Determines whether the specified code point is a titlecase letter. + * True for general category "Lt" (titlecase letter). + * + * Same as java.lang.Character.isTitleCase(). + * + * @param c the code point to be tested + * @return TRUE if the code point is an Lt titlecase letter + * + * @see u_isupper + * @see u_islower + * @see u_totitle + * @stable ICU 2.0 + */ +U_STABLE UBool U_EXPORT2 +u_istitle(UChar32 c); + +/** + * Determines whether the specified code point is a digit character according to Java. + * True for characters with general category "Nd" (decimal digit numbers). + * Beginning with Unicode 4, this is the same as + * testing for the Numeric_Type of Decimal. + * + * Same as java.lang.Character.isDigit(). + * + * In addition to being equivalent to a Java function, this also serves + * as a C/POSIX migration function. + * See the comments about C/POSIX character classification functions in the + * documentation at the top of this header file. + * + * @param c the code point to be tested + * @return TRUE if the code point is a digit character according to Character.isDigit() + * + * @stable ICU 2.0 + */ +U_STABLE UBool U_EXPORT2 +u_isdigit(UChar32 c); + +/** + * Determines whether the specified code point is a letter character. + * True for general categories "L" (letters). + * + * Same as java.lang.Character.isLetter(). + * + * In addition to being equivalent to a Java function, this also serves + * as a C/POSIX migration function. + * See the comments about C/POSIX character classification functions in the + * documentation at the top of this header file. + * + * @param c the code point to be tested + * @return TRUE if the code point is a letter character + * + * @see u_isdigit + * @see u_isalnum + * @stable ICU 2.0 + */ +U_STABLE UBool U_EXPORT2 +u_isalpha(UChar32 c); + +/** + * Determines whether the specified code point is an alphanumeric character + * (letter or digit) according to Java. + * True for characters with general categories + * "L" (letters) and "Nd" (decimal digit numbers). + * + * Same as java.lang.Character.isLetterOrDigit(). + * + * In addition to being equivalent to a Java function, this also serves + * as a C/POSIX migration function. + * See the comments about C/POSIX character classification functions in the + * documentation at the top of this header file. + * + * @param c the code point to be tested + * @return TRUE if the code point is an alphanumeric character according to Character.isLetterOrDigit() + * + * @stable ICU 2.0 + */ +U_STABLE UBool U_EXPORT2 +u_isalnum(UChar32 c); + +/** + * Determines whether the specified code point is a hexadecimal digit. + * This is equivalent to u_digit(c, 16)>=0. + * True for characters with general category "Nd" (decimal digit numbers) + * as well as Latin letters a-f and A-F in both ASCII and Fullwidth ASCII. + * (That is, for letters with code points + * 0041..0046, 0061..0066, FF21..FF26, FF41..FF46.) + * + * In order to narrow the definition of hexadecimal digits to only ASCII + * characters, use (c<=0x7f && u_isxdigit(c)). + * + * This is a C/POSIX migration function. + * See the comments about C/POSIX character classification functions in the + * documentation at the top of this header file. + * + * @param c the code point to be tested + * @return TRUE if the code point is a hexadecimal digit + * + * @stable ICU 2.6 + */ +U_STABLE UBool U_EXPORT2 +u_isxdigit(UChar32 c); + +/** + * Determines whether the specified code point is a punctuation character. + * True for characters with general categories "P" (punctuation). + * + * This is a C/POSIX migration function. + * See the comments about C/POSIX character classification functions in the + * documentation at the top of this header file. + * + * @param c the code point to be tested + * @return TRUE if the code point is a punctuation character + * + * @stable ICU 2.6 + */ +U_STABLE UBool U_EXPORT2 +u_ispunct(UChar32 c); + +/** + * Determines whether the specified code point is a "graphic" character + * (printable, excluding spaces). + * TRUE for all characters except those with general categories + * "Cc" (control codes), "Cf" (format controls), "Cs" (surrogates), + * "Cn" (unassigned), and "Z" (separators). + * + * This is a C/POSIX migration function. + * See the comments about C/POSIX character classification functions in the + * documentation at the top of this header file. + * + * @param c the code point to be tested + * @return TRUE if the code point is a "graphic" character + * + * @stable ICU 2.6 + */ +U_STABLE UBool U_EXPORT2 +u_isgraph(UChar32 c); + +/** + * Determines whether the specified code point is a "blank" or "horizontal space", + * a character that visibly separates words on a line. + * The following are equivalent definitions: + * + * TRUE for Unicode White_Space characters except for "vertical space controls" + * where "vertical space controls" are the following characters: + * U+000A (LF) U+000B (VT) U+000C (FF) U+000D (CR) U+0085 (NEL) U+2028 (LS) U+2029 (PS) + * + * same as + * + * TRUE for U+0009 (TAB) and characters with general category "Zs" (space separators) + * except Zero Width Space (ZWSP, U+200B). + * + * Note: There are several ICU whitespace functions; please see the uchar.h + * file documentation for a detailed comparison. + * + * This is a C/POSIX migration function. + * See the comments about C/POSIX character classification functions in the + * documentation at the top of this header file. + * + * @param c the code point to be tested + * @return TRUE if the code point is a "blank" + * + * @stable ICU 2.6 + */ +U_STABLE UBool U_EXPORT2 +u_isblank(UChar32 c); + +/** + * Determines whether the specified code point is "defined", + * which usually means that it is assigned a character. + * True for general categories other than "Cn" (other, not assigned), + * i.e., true for all code points mentioned in UnicodeData.txt. + * + * Note that non-character code points (e.g., U+FDD0) are not "defined" + * (they are Cn), but surrogate code points are "defined" (Cs). + * + * Same as java.lang.Character.isDefined(). + * + * @param c the code point to be tested + * @return TRUE if the code point is assigned a character + * + * @see u_isdigit + * @see u_isalpha + * @see u_isalnum + * @see u_isupper + * @see u_islower + * @see u_istitle + * @stable ICU 2.0 + */ +U_STABLE UBool U_EXPORT2 +u_isdefined(UChar32 c); + +/** + * Determines if the specified character is a space character or not. + * + * Note: There are several ICU whitespace functions; please see the uchar.h + * file documentation for a detailed comparison. + * + * This is a C/POSIX migration function. + * See the comments about C/POSIX character classification functions in the + * documentation at the top of this header file. + * + * @param c the character to be tested + * @return true if the character is a space character; false otherwise. + * + * @see u_isJavaSpaceChar + * @see u_isWhitespace + * @see u_isUWhiteSpace + * @stable ICU 2.0 + */ +U_STABLE UBool U_EXPORT2 +u_isspace(UChar32 c); + +/** + * Determine if the specified code point is a space character according to Java. + * True for characters with general categories "Z" (separators), + * which does not include control codes (e.g., TAB or Line Feed). + * + * Same as java.lang.Character.isSpaceChar(). + * + * Note: There are several ICU whitespace functions; please see the uchar.h + * file documentation for a detailed comparison. + * + * @param c the code point to be tested + * @return TRUE if the code point is a space character according to Character.isSpaceChar() + * + * @see u_isspace + * @see u_isWhitespace + * @see u_isUWhiteSpace + * @stable ICU 2.6 + */ +U_STABLE UBool U_EXPORT2 +u_isJavaSpaceChar(UChar32 c); + +/** + * Determines if the specified code point is a whitespace character according to Java/ICU. + * A character is considered to be a Java whitespace character if and only + * if it satisfies one of the following criteria: + * + * - It is a Unicode Separator character (categories "Z" = "Zs" or "Zl" or "Zp"), but is not + * also a non-breaking space (U+00A0 NBSP or U+2007 Figure Space or U+202F Narrow NBSP). + * - It is U+0009 HORIZONTAL TABULATION. + * - It is U+000A LINE FEED. + * - It is U+000B VERTICAL TABULATION. + * - It is U+000C FORM FEED. + * - It is U+000D CARRIAGE RETURN. + * - It is U+001C FILE SEPARATOR. + * - It is U+001D GROUP SEPARATOR. + * - It is U+001E RECORD SEPARATOR. + * - It is U+001F UNIT SEPARATOR. + * + * This API tries to sync with the semantics of Java's + * java.lang.Character.isWhitespace(), but it may not return + * the exact same results because of the Unicode version + * difference. + * + * Note: Unicode 4.0.1 changed U+200B ZERO WIDTH SPACE from a Space Separator (Zs) + * to a Format Control (Cf). Since then, isWhitespace(0x200b) returns false. + * See http://www.unicode.org/versions/Unicode4.0.1/ + * + * Note: There are several ICU whitespace functions; please see the uchar.h + * file documentation for a detailed comparison. + * + * @param c the code point to be tested + * @return TRUE if the code point is a whitespace character according to Java/ICU + * + * @see u_isspace + * @see u_isJavaSpaceChar + * @see u_isUWhiteSpace + * @stable ICU 2.0 + */ +U_STABLE UBool U_EXPORT2 +u_isWhitespace(UChar32 c); + +/** + * Determines whether the specified code point is a control character + * (as defined by this function). + * A control character is one of the following: + * - ISO 8-bit control character (U+0000..U+001f and U+007f..U+009f) + * - U_CONTROL_CHAR (Cc) + * - U_FORMAT_CHAR (Cf) + * - U_LINE_SEPARATOR (Zl) + * - U_PARAGRAPH_SEPARATOR (Zp) + * + * This is a C/POSIX migration function. + * See the comments about C/POSIX character classification functions in the + * documentation at the top of this header file. + * + * @param c the code point to be tested + * @return TRUE if the code point is a control character + * + * @see UCHAR_DEFAULT_IGNORABLE_CODE_POINT + * @see u_isprint + * @stable ICU 2.0 + */ +U_STABLE UBool U_EXPORT2 +u_iscntrl(UChar32 c); + +/** + * Determines whether the specified code point is an ISO control code. + * True for U+0000..U+001f and U+007f..U+009f (general category "Cc"). + * + * Same as java.lang.Character.isISOControl(). + * + * @param c the code point to be tested + * @return TRUE if the code point is an ISO control code + * + * @see u_iscntrl + * @stable ICU 2.6 + */ +U_STABLE UBool U_EXPORT2 +u_isISOControl(UChar32 c); + +/** + * Determines whether the specified code point is a printable character. + * True for general categories <em>other</em> than "C" (controls). + * + * This is a C/POSIX migration function. + * See the comments about C/POSIX character classification functions in the + * documentation at the top of this header file. + * + * @param c the code point to be tested + * @return TRUE if the code point is a printable character + * + * @see UCHAR_DEFAULT_IGNORABLE_CODE_POINT + * @see u_iscntrl + * @stable ICU 2.0 + */ +U_STABLE UBool U_EXPORT2 +u_isprint(UChar32 c); + +/** + * Determines whether the specified code point is a base character. + * True for general categories "L" (letters), "N" (numbers), + * "Mc" (spacing combining marks), and "Me" (enclosing marks). + * + * Note that this is different from the Unicode definition in + * chapter 3.5, conformance clause D13, + * which defines base characters to be all characters (not Cn) + * that do not graphically combine with preceding characters (M) + * and that are neither control (Cc) or format (Cf) characters. + * + * @param c the code point to be tested + * @return TRUE if the code point is a base character according to this function + * + * @see u_isalpha + * @see u_isdigit + * @stable ICU 2.0 + */ +U_STABLE UBool U_EXPORT2 +u_isbase(UChar32 c); + +/** + * Returns the bidirectional category value for the code point, + * which is used in the Unicode bidirectional algorithm + * (UAX #9 http://www.unicode.org/reports/tr9/). + * Note that some <em>unassigned</em> code points have bidi values + * of R or AL because they are in blocks that are reserved + * for Right-To-Left scripts. + * + * Same as java.lang.Character.getDirectionality() + * + * @param c the code point to be tested + * @return the bidirectional category (UCharDirection) value + * + * @see UCharDirection + * @stable ICU 2.0 + */ +U_STABLE UCharDirection U_EXPORT2 +u_charDirection(UChar32 c); + +/** + * Determines whether the code point has the Bidi_Mirrored property. + * This property is set for characters that are commonly used in + * Right-To-Left contexts and need to be displayed with a "mirrored" + * glyph. + * + * Same as java.lang.Character.isMirrored(). + * Same as UCHAR_BIDI_MIRRORED + * + * @param c the code point to be tested + * @return TRUE if the character has the Bidi_Mirrored property + * + * @see UCHAR_BIDI_MIRRORED + * @stable ICU 2.0 + */ +U_STABLE UBool U_EXPORT2 +u_isMirrored(UChar32 c); + +/** + * Maps the specified character to a "mirror-image" character. + * For characters with the Bidi_Mirrored property, implementations + * sometimes need a "poor man's" mapping to another Unicode + * character (code point) such that the default glyph may serve + * as the mirror-image of the default glyph of the specified + * character. This is useful for text conversion to and from + * codepages with visual order, and for displays without glyph + * selection capabilities. + * + * @param c the code point to be mapped + * @return another Unicode code point that may serve as a mirror-image + * substitute, or c itself if there is no such mapping or c + * does not have the Bidi_Mirrored property + * + * @see UCHAR_BIDI_MIRRORED + * @see u_isMirrored + * @stable ICU 2.0 + */ +U_STABLE UChar32 U_EXPORT2 +u_charMirror(UChar32 c); + +/** + * Maps the specified character to its paired bracket character. + * For Bidi_Paired_Bracket_Type!=None, this is the same as u_charMirror(). + * Otherwise c itself is returned. + * See http://www.unicode.org/reports/tr9/ + * + * @param c the code point to be mapped + * @return the paired bracket code point, + * or c itself if there is no such mapping + * (Bidi_Paired_Bracket_Type=None) + * + * @see UCHAR_BIDI_PAIRED_BRACKET + * @see UCHAR_BIDI_PAIRED_BRACKET_TYPE + * @see u_charMirror + * @stable ICU 52 + */ +U_STABLE UChar32 U_EXPORT2 +u_getBidiPairedBracket(UChar32 c); + +/** + * Returns the general category value for the code point. + * + * Same as java.lang.Character.getType(). + * + * @param c the code point to be tested + * @return the general category (UCharCategory) value + * + * @see UCharCategory + * @stable ICU 2.0 + */ +U_STABLE int8_t U_EXPORT2 +u_charType(UChar32 c); + +/** + * Get a single-bit bit set for the general category of a character. + * This bit set can be compared bitwise with U_GC_SM_MASK, U_GC_L_MASK, etc. + * Same as U_MASK(u_charType(c)). + * + * @param c the code point to be tested + * @return a single-bit mask corresponding to the general category (UCharCategory) value + * + * @see u_charType + * @see UCharCategory + * @see U_GC_CN_MASK + * @stable ICU 2.1 + */ +#define U_GET_GC_MASK(c) U_MASK(u_charType(c)) + +/** + * Callback from u_enumCharTypes(), is called for each contiguous range + * of code points c (where start<=c<limit) + * with the same Unicode general category ("character type"). + * + * The callback function can stop the enumeration by returning FALSE. + * + * @param context an opaque pointer, as passed into utrie_enum() + * @param start the first code point in a contiguous range with value + * @param limit one past the last code point in a contiguous range with value + * @param type the general category for all code points in [start..limit[ + * @return FALSE to stop the enumeration + * + * @stable ICU 2.1 + * @see UCharCategory + * @see u_enumCharTypes + */ +typedef UBool U_CALLCONV +UCharEnumTypeRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type); + +/** + * Enumerate efficiently all code points with their Unicode general categories. + * + * This is useful for building data structures (e.g., UnicodeSet's), + * for enumerating all assigned code points (type!=U_UNASSIGNED), etc. + * + * For each contiguous range of code points with a given general category ("character type"), + * the UCharEnumTypeRange function is called. + * Adjacent ranges have different types. + * The Unicode Standard guarantees that the numeric value of the type is 0..31. + * + * @param enumRange a pointer to a function that is called for each contiguous range + * of code points with the same general category + * @param context an opaque pointer that is passed on to the callback function + * + * @stable ICU 2.1 + * @see UCharCategory + * @see UCharEnumTypeRange + */ +U_STABLE void U_EXPORT2 +u_enumCharTypes(UCharEnumTypeRange *enumRange, const void *context); + +#if !UCONFIG_NO_NORMALIZATION + +/** + * Returns the combining class of the code point as specified in UnicodeData.txt. + * + * @param c the code point of the character + * @return the combining class of the character + * @stable ICU 2.0 + */ +U_STABLE uint8_t U_EXPORT2 +u_getCombiningClass(UChar32 c); + +#endif + +/** + * Returns the decimal digit value of a decimal digit character. + * Such characters have the general category "Nd" (decimal digit numbers) + * and a Numeric_Type of Decimal. + * + * Unlike ICU releases before 2.6, no digit values are returned for any + * Han characters because Han number characters are often used with a special + * Chinese-style number format (with characters for powers of 10 in between) + * instead of in decimal-positional notation. + * Unicode 4 explicitly assigns Han number characters the Numeric_Type + * Numeric instead of Decimal. + * See Jitterbug 1483 for more details. + * + * Use u_getIntPropertyValue(c, UCHAR_NUMERIC_TYPE) and u_getNumericValue() + * for complete numeric Unicode properties. + * + * @param c the code point for which to get the decimal digit value + * @return the decimal digit value of c, + * or -1 if c is not a decimal digit character + * + * @see u_getNumericValue + * @stable ICU 2.0 + */ +U_STABLE int32_t U_EXPORT2 +u_charDigitValue(UChar32 c); + +/** + * Returns the Unicode allocation block that contains the character. + * + * @param c the code point to be tested + * @return the block value (UBlockCode) for c + * + * @see UBlockCode + * @stable ICU 2.0 + */ +U_STABLE UBlockCode U_EXPORT2 +ublock_getCode(UChar32 c); + +/** + * Retrieve the name of a Unicode character. + * Depending on <code>nameChoice</code>, the character name written + * into the buffer is the "modern" name or the name that was defined + * in Unicode version 1.0. + * The name contains only "invariant" characters + * like A-Z, 0-9, space, and '-'. + * Unicode 1.0 names are only retrieved if they are different from the modern + * names and if the data file contains the data for them. gennames may or may + * not be called with a command line option to include 1.0 names in unames.dat. + * + * @param code The character (code point) for which to get the name. + * It must be <code>0<=code<=0x10ffff</code>. + * @param nameChoice Selector for which name to get. + * @param buffer Destination address for copying the name. + * The name will always be zero-terminated. + * If there is no name, then the buffer will be set to the empty string. + * @param bufferLength <code>==sizeof(buffer)</code> + * @param pErrorCode Pointer to a UErrorCode variable; + * check for <code>U_SUCCESS()</code> after <code>u_charName()</code> + * returns. + * @return The length of the name, or 0 if there is no name for this character. + * If the bufferLength is less than or equal to the length, then the buffer + * contains the truncated name and the returned length indicates the full + * length of the name. + * The length does not include the zero-termination. + * + * @see UCharNameChoice + * @see u_charFromName + * @see u_enumCharNames + * @stable ICU 2.0 + */ +U_STABLE int32_t U_EXPORT2 +u_charName(UChar32 code, UCharNameChoice nameChoice, + char *buffer, int32_t bufferLength, + UErrorCode *pErrorCode); + +#ifndef U_HIDE_DEPRECATED_API +/** + * Returns an empty string. + * Used to return the ISO 10646 comment for a character. + * The Unicode ISO_Comment property is deprecated and has no values. + * + * @param c The character (code point) for which to get the ISO comment. + * It must be <code>0<=c<=0x10ffff</code>. + * @param dest Destination address for copying the comment. + * The comment will be zero-terminated if possible. + * If there is no comment, then the buffer will be set to the empty string. + * @param destCapacity <code>==sizeof(dest)</code> + * @param pErrorCode Pointer to a UErrorCode variable; + * check for <code>U_SUCCESS()</code> after <code>u_getISOComment()</code> + * returns. + * @return 0 + * + * @deprecated ICU 49 + */ +U_DEPRECATED int32_t U_EXPORT2 +u_getISOComment(UChar32 c, + char *dest, int32_t destCapacity, + UErrorCode *pErrorCode); +#endif /* U_HIDE_DEPRECATED_API */ + +/** + * Find a Unicode character by its name and return its code point value. + * The name is matched exactly and completely. + * If the name does not correspond to a code point, <i>pErrorCode</i> + * is set to <code>U_INVALID_CHAR_FOUND</code>. + * A Unicode 1.0 name is matched only if it differs from the modern name. + * Unicode names are all uppercase. Extended names are lowercase followed + * by an uppercase hexadecimal number, and within angle brackets. + * + * @param nameChoice Selector for which name to match. + * @param name The name to match. + * @param pErrorCode Pointer to a UErrorCode variable + * @return The Unicode value of the code point with the given name, + * or an undefined value if there is no such code point. + * + * @see UCharNameChoice + * @see u_charName + * @see u_enumCharNames + * @stable ICU 1.7 + */ +U_STABLE UChar32 U_EXPORT2 +u_charFromName(UCharNameChoice nameChoice, + const char *name, + UErrorCode *pErrorCode); + +/** + * Type of a callback function for u_enumCharNames() that gets called + * for each Unicode character with the code point value and + * the character name. + * If such a function returns FALSE, then the enumeration is stopped. + * + * @param context The context pointer that was passed to u_enumCharNames(). + * @param code The Unicode code point for the character with this name. + * @param nameChoice Selector for which kind of names is enumerated. + * @param name The character's name, zero-terminated. + * @param length The length of the name. + * @return TRUE if the enumeration should continue, FALSE to stop it. + * + * @see UCharNameChoice + * @see u_enumCharNames + * @stable ICU 1.7 + */ +typedef UBool U_CALLCONV UEnumCharNamesFn(void *context, + UChar32 code, + UCharNameChoice nameChoice, + const char *name, + int32_t length); + +/** + * Enumerate all assigned Unicode characters between the start and limit + * code points (start inclusive, limit exclusive) and call a function + * for each, passing the code point value and the character name. + * For Unicode 1.0 names, only those are enumerated that differ from the + * modern names. + * + * @param start The first code point in the enumeration range. + * @param limit One more than the last code point in the enumeration range + * (the first one after the range). + * @param fn The function that is to be called for each character name. + * @param context An arbitrary pointer that is passed to the function. + * @param nameChoice Selector for which kind of names to enumerate. + * @param pErrorCode Pointer to a UErrorCode variable + * + * @see UCharNameChoice + * @see UEnumCharNamesFn + * @see u_charName + * @see u_charFromName + * @stable ICU 1.7 + */ +U_STABLE void U_EXPORT2 +u_enumCharNames(UChar32 start, UChar32 limit, + UEnumCharNamesFn *fn, + void *context, + UCharNameChoice nameChoice, + UErrorCode *pErrorCode); + +/** + * Return the Unicode name for a given property, as given in the + * Unicode database file PropertyAliases.txt. + * + * In addition, this function maps the property + * UCHAR_GENERAL_CATEGORY_MASK to the synthetic names "gcm" / + * "General_Category_Mask". These names are not in + * PropertyAliases.txt. + * + * @param property UProperty selector other than UCHAR_INVALID_CODE. + * If out of range, NULL is returned. + * + * @param nameChoice selector for which name to get. If out of range, + * NULL is returned. All properties have a long name. Most + * have a short name, but some do not. Unicode allows for + * additional names; if present these will be returned by + * U_LONG_PROPERTY_NAME + i, where i=1, 2,... + * + * @return a pointer to the name, or NULL if either the + * property or the nameChoice is out of range. If a given + * nameChoice returns NULL, then all larger values of + * nameChoice will return NULL, with one exception: if NULL is + * returned for U_SHORT_PROPERTY_NAME, then + * U_LONG_PROPERTY_NAME (and higher) may still return a + * non-NULL value. The returned pointer is valid until + * u_cleanup() is called. + * + * @see UProperty + * @see UPropertyNameChoice + * @stable ICU 2.4 + */ +U_STABLE const char* U_EXPORT2 +u_getPropertyName(UProperty property, + UPropertyNameChoice nameChoice); + +/** + * Return the UProperty enum for a given property name, as specified + * in the Unicode database file PropertyAliases.txt. Short, long, and + * any other variants are recognized. + * + * In addition, this function maps the synthetic names "gcm" / + * "General_Category_Mask" to the property + * UCHAR_GENERAL_CATEGORY_MASK. These names are not in + * PropertyAliases.txt. + * + * @param alias the property name to be matched. The name is compared + * using "loose matching" as described in PropertyAliases.txt. + * + * @return a UProperty enum, or UCHAR_INVALID_CODE if the given name + * does not match any property. + * + * @see UProperty + * @stable ICU 2.4 + */ +U_STABLE UProperty U_EXPORT2 +u_getPropertyEnum(const char* alias); + +/** + * Return the Unicode name for a given property value, as given in the + * Unicode database file PropertyValueAliases.txt. + * + * Note: Some of the names in PropertyValueAliases.txt can only be + * retrieved using UCHAR_GENERAL_CATEGORY_MASK, not + * UCHAR_GENERAL_CATEGORY. These include: "C" / "Other", "L" / + * "Letter", "LC" / "Cased_Letter", "M" / "Mark", "N" / "Number", "P" + * / "Punctuation", "S" / "Symbol", and "Z" / "Separator". + * + * @param property UProperty selector constant. + * Must be UCHAR_BINARY_START<=which<UCHAR_BINARY_LIMIT + * or UCHAR_INT_START<=which<UCHAR_INT_LIMIT + * or UCHAR_MASK_START<=which<UCHAR_MASK_LIMIT. + * If out of range, NULL is returned. + * + * @param value selector for a value for the given property. If out + * of range, NULL is returned. In general, valid values range + * from 0 up to some maximum. There are a few exceptions: + * (1.) UCHAR_BLOCK values begin at the non-zero value + * UBLOCK_BASIC_LATIN. (2.) UCHAR_CANONICAL_COMBINING_CLASS + * values are not contiguous and range from 0..240. (3.) + * UCHAR_GENERAL_CATEGORY_MASK values are not values of + * UCharCategory, but rather mask values produced by + * U_GET_GC_MASK(). This allows grouped categories such as + * [:L:] to be represented. Mask values range + * non-contiguously from 1..U_GC_P_MASK. + * + * @param nameChoice selector for which name to get. If out of range, + * NULL is returned. All values have a long name. Most have + * a short name, but some do not. Unicode allows for + * additional names; if present these will be returned by + * U_LONG_PROPERTY_NAME + i, where i=1, 2,... + + * @return a pointer to the name, or NULL if either the + * property or the nameChoice is out of range. If a given + * nameChoice returns NULL, then all larger values of + * nameChoice will return NULL, with one exception: if NULL is + * returned for U_SHORT_PROPERTY_NAME, then + * U_LONG_PROPERTY_NAME (and higher) may still return a + * non-NULL value. The returned pointer is valid until + * u_cleanup() is called. + * + * @see UProperty + * @see UPropertyNameChoice + * @stable ICU 2.4 + */ +U_STABLE const char* U_EXPORT2 +u_getPropertyValueName(UProperty property, + int32_t value, + UPropertyNameChoice nameChoice); + +/** + * Return the property value integer for a given value name, as + * specified in the Unicode database file PropertyValueAliases.txt. + * Short, long, and any other variants are recognized. + * + * Note: Some of the names in PropertyValueAliases.txt will only be + * recognized with UCHAR_GENERAL_CATEGORY_MASK, not + * UCHAR_GENERAL_CATEGORY. These include: "C" / "Other", "L" / + * "Letter", "LC" / "Cased_Letter", "M" / "Mark", "N" / "Number", "P" + * / "Punctuation", "S" / "Symbol", and "Z" / "Separator". + * + * @param property UProperty selector constant. + * Must be UCHAR_BINARY_START<=which<UCHAR_BINARY_LIMIT + * or UCHAR_INT_START<=which<UCHAR_INT_LIMIT + * or UCHAR_MASK_START<=which<UCHAR_MASK_LIMIT. + * If out of range, UCHAR_INVALID_CODE is returned. + * + * @param alias the value name to be matched. The name is compared + * using "loose matching" as described in + * PropertyValueAliases.txt. + * + * @return a value integer or UCHAR_INVALID_CODE if the given name + * does not match any value of the given property, or if the + * property is invalid. Note: UCHAR_GENERAL_CATEGORY_MASK values + * are not values of UCharCategory, but rather mask values + * produced by U_GET_GC_MASK(). This allows grouped + * categories such as [:L:] to be represented. + * + * @see UProperty + * @stable ICU 2.4 + */ +U_STABLE int32_t U_EXPORT2 +u_getPropertyValueEnum(UProperty property, + const char* alias); + +/** + * Determines if the specified character is permissible as the + * first character in an identifier according to Unicode + * (The Unicode Standard, Version 3.0, chapter 5.16 Identifiers). + * True for characters with general categories "L" (letters) and "Nl" (letter numbers). + * + * Same as java.lang.Character.isUnicodeIdentifierStart(). + * Same as UCHAR_ID_START + * + * @param c the code point to be tested + * @return TRUE if the code point may start an identifier + * + * @see UCHAR_ID_START + * @see u_isalpha + * @see u_isIDPart + * @stable ICU 2.0 + */ +U_STABLE UBool U_EXPORT2 +u_isIDStart(UChar32 c); + +/** + * Determines if the specified character is permissible + * in an identifier according to Java. + * True for characters with general categories "L" (letters), + * "Nl" (letter numbers), "Nd" (decimal digits), + * "Mc" and "Mn" (combining marks), "Pc" (connecting punctuation), and + * u_isIDIgnorable(c). + * + * Same as java.lang.Character.isUnicodeIdentifierPart(). + * Almost the same as Unicode's ID_Continue (UCHAR_ID_CONTINUE) + * except that Unicode recommends to ignore Cf which is less than + * u_isIDIgnorable(c). + * + * @param c the code point to be tested + * @return TRUE if the code point may occur in an identifier according to Java + * + * @see UCHAR_ID_CONTINUE + * @see u_isIDStart + * @see u_isIDIgnorable + * @stable ICU 2.0 + */ +U_STABLE UBool U_EXPORT2 +u_isIDPart(UChar32 c); + +/** + * Determines if the specified character should be regarded + * as an ignorable character in an identifier, + * according to Java. + * True for characters with general category "Cf" (format controls) as well as + * non-whitespace ISO controls + * (U+0000..U+0008, U+000E..U+001B, U+007F..U+009F). + * + * Same as java.lang.Character.isIdentifierIgnorable(). + * + * Note that Unicode just recommends to ignore Cf (format controls). + * + * @param c the code point to be tested + * @return TRUE if the code point is ignorable in identifiers according to Java + * + * @see UCHAR_DEFAULT_IGNORABLE_CODE_POINT + * @see u_isIDStart + * @see u_isIDPart + * @stable ICU 2.0 + */ +U_STABLE UBool U_EXPORT2 +u_isIDIgnorable(UChar32 c); + +/** + * Determines if the specified character is permissible as the + * first character in a Java identifier. + * In addition to u_isIDStart(c), true for characters with + * general categories "Sc" (currency symbols) and "Pc" (connecting punctuation). + * + * Same as java.lang.Character.isJavaIdentifierStart(). + * + * @param c the code point to be tested + * @return TRUE if the code point may start a Java identifier + * + * @see u_isJavaIDPart + * @see u_isalpha + * @see u_isIDStart + * @stable ICU 2.0 + */ +U_STABLE UBool U_EXPORT2 +u_isJavaIDStart(UChar32 c); + +/** + * Determines if the specified character is permissible + * in a Java identifier. + * In addition to u_isIDPart(c), true for characters with + * general category "Sc" (currency symbols). + * + * Same as java.lang.Character.isJavaIdentifierPart(). + * + * @param c the code point to be tested + * @return TRUE if the code point may occur in a Java identifier + * + * @see u_isIDIgnorable + * @see u_isJavaIDStart + * @see u_isalpha + * @see u_isdigit + * @see u_isIDPart + * @stable ICU 2.0 + */ +U_STABLE UBool U_EXPORT2 +u_isJavaIDPart(UChar32 c); + +/** + * The given character is mapped to its lowercase equivalent according to + * UnicodeData.txt; if the character has no lowercase equivalent, the character + * itself is returned. + * + * Same as java.lang.Character.toLowerCase(). + * + * This function only returns the simple, single-code point case mapping. + * Full case mappings should be used whenever possible because they produce + * better results by working on whole strings. + * They take into account the string context and the language and can map + * to a result string with a different length as appropriate. + * Full case mappings are applied by the string case mapping functions, + * see ustring.h and the UnicodeString class. + * See also the User Guide chapter on C/POSIX migration: + * http://icu-project.org/userguide/posix.html#case_mappings + * + * @param c the code point to be mapped + * @return the Simple_Lowercase_Mapping of the code point, if any; + * otherwise the code point itself. + * @stable ICU 2.0 + */ +U_STABLE UChar32 U_EXPORT2 +u_tolower(UChar32 c); + +/** + * The given character is mapped to its uppercase equivalent according to UnicodeData.txt; + * if the character has no uppercase equivalent, the character itself is + * returned. + * + * Same as java.lang.Character.toUpperCase(). + * + * This function only returns the simple, single-code point case mapping. + * Full case mappings should be used whenever possible because they produce + * better results by working on whole strings. + * They take into account the string context and the language and can map + * to a result string with a different length as appropriate. + * Full case mappings are applied by the string case mapping functions, + * see ustring.h and the UnicodeString class. + * See also the User Guide chapter on C/POSIX migration: + * http://icu-project.org/userguide/posix.html#case_mappings + * + * @param c the code point to be mapped + * @return the Simple_Uppercase_Mapping of the code point, if any; + * otherwise the code point itself. + * @stable ICU 2.0 + */ +U_STABLE UChar32 U_EXPORT2 +u_toupper(UChar32 c); + +/** + * The given character is mapped to its titlecase equivalent + * according to UnicodeData.txt; + * if none is defined, the character itself is returned. + * + * Same as java.lang.Character.toTitleCase(). + * + * This function only returns the simple, single-code point case mapping. + * Full case mappings should be used whenever possible because they produce + * better results by working on whole strings. + * They take into account the string context and the language and can map + * to a result string with a different length as appropriate. + * Full case mappings are applied by the string case mapping functions, + * see ustring.h and the UnicodeString class. + * See also the User Guide chapter on C/POSIX migration: + * http://icu-project.org/userguide/posix.html#case_mappings + * + * @param c the code point to be mapped + * @return the Simple_Titlecase_Mapping of the code point, if any; + * otherwise the code point itself. + * @stable ICU 2.0 + */ +U_STABLE UChar32 U_EXPORT2 +u_totitle(UChar32 c); + +/** Option value for case folding: use default mappings defined in CaseFolding.txt. @stable ICU 2.0 */ +#define U_FOLD_CASE_DEFAULT 0 + +/** + * Option value for case folding: + * + * Use the modified set of mappings provided in CaseFolding.txt to handle dotted I + * and dotless i appropriately for Turkic languages (tr, az). + * + * Before Unicode 3.2, CaseFolding.txt contains mappings marked with 'I' that + * are to be included for default mappings and + * excluded for the Turkic-specific mappings. + * + * Unicode 3.2 CaseFolding.txt instead contains mappings marked with 'T' that + * are to be excluded for default mappings and + * included for the Turkic-specific mappings. + * + * @stable ICU 2.0 + */ +#define U_FOLD_CASE_EXCLUDE_SPECIAL_I 1 + +/** + * The given character is mapped to its case folding equivalent according to + * UnicodeData.txt and CaseFolding.txt; + * if the character has no case folding equivalent, the character + * itself is returned. + * + * This function only returns the simple, single-code point case mapping. + * Full case mappings should be used whenever possible because they produce + * better results by working on whole strings. + * They take into account the string context and the language and can map + * to a result string with a different length as appropriate. + * Full case mappings are applied by the string case mapping functions, + * see ustring.h and the UnicodeString class. + * See also the User Guide chapter on C/POSIX migration: + * http://icu-project.org/userguide/posix.html#case_mappings + * + * @param c the code point to be mapped + * @param options Either U_FOLD_CASE_DEFAULT or U_FOLD_CASE_EXCLUDE_SPECIAL_I + * @return the Simple_Case_Folding of the code point, if any; + * otherwise the code point itself. + * @stable ICU 2.0 + */ +U_STABLE UChar32 U_EXPORT2 +u_foldCase(UChar32 c, uint32_t options); + +/** + * Returns the decimal digit value of the code point in the + * specified radix. + * + * If the radix is not in the range <code>2<=radix<=36</code> or if the + * value of <code>c</code> is not a valid digit in the specified + * radix, <code>-1</code> is returned. A character is a valid digit + * if at least one of the following is true: + * <ul> + * <li>The character has a decimal digit value. + * Such characters have the general category "Nd" (decimal digit numbers) + * and a Numeric_Type of Decimal. + * In this case the value is the character's decimal digit value.</li> + * <li>The character is one of the uppercase Latin letters + * <code>'A'</code> through <code>'Z'</code>. + * In this case the value is <code>c-'A'+10</code>.</li> + * <li>The character is one of the lowercase Latin letters + * <code>'a'</code> through <code>'z'</code>. + * In this case the value is <code>ch-'a'+10</code>.</li> + * <li>Latin letters from both the ASCII range (0061..007A, 0041..005A) + * as well as from the Fullwidth ASCII range (FF41..FF5A, FF21..FF3A) + * are recognized.</li> + * </ul> + * + * Same as java.lang.Character.digit(). + * + * @param ch the code point to be tested. + * @param radix the radix. + * @return the numeric value represented by the character in the + * specified radix, + * or -1 if there is no value or if the value exceeds the radix. + * + * @see UCHAR_NUMERIC_TYPE + * @see u_forDigit + * @see u_charDigitValue + * @see u_isdigit + * @stable ICU 2.0 + */ +U_STABLE int32_t U_EXPORT2 +u_digit(UChar32 ch, int8_t radix); + +/** + * Determines the character representation for a specific digit in + * the specified radix. If the value of <code>radix</code> is not a + * valid radix, or the value of <code>digit</code> is not a valid + * digit in the specified radix, the null character + * (<code>U+0000</code>) is returned. + * <p> + * The <code>radix</code> argument is valid if it is greater than or + * equal to 2 and less than or equal to 36. + * The <code>digit</code> argument is valid if + * <code>0 <= digit < radix</code>. + * <p> + * If the digit is less than 10, then + * <code>'0' + digit</code> is returned. Otherwise, the value + * <code>'a' + digit - 10</code> is returned. + * + * Same as java.lang.Character.forDigit(). + * + * @param digit the number to convert to a character. + * @param radix the radix. + * @return the <code>char</code> representation of the specified digit + * in the specified radix. + * + * @see u_digit + * @see u_charDigitValue + * @see u_isdigit + * @stable ICU 2.0 + */ +U_STABLE UChar32 U_EXPORT2 +u_forDigit(int32_t digit, int8_t radix); + +/** + * Get the "age" of the code point. + * The "age" is the Unicode version when the code point was first + * designated (as a non-character or for Private Use) + * or assigned a character. + * This can be useful to avoid emitting code points to receiving + * processes that do not accept newer characters. + * The data is from the UCD file DerivedAge.txt. + * + * @param c The code point. + * @param versionArray The Unicode version number array, to be filled in. + * + * @stable ICU 2.1 + */ +U_STABLE void U_EXPORT2 +u_charAge(UChar32 c, UVersionInfo versionArray); + +/** + * Gets the Unicode version information. + * The version array is filled in with the version information + * for the Unicode standard that is currently used by ICU. + * For example, Unicode version 3.1.1 is represented as an array with + * the values { 3, 1, 1, 0 }. + * + * @param versionArray an output array that will be filled in with + * the Unicode version number + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 +u_getUnicodeVersion(UVersionInfo versionArray); + +#if !UCONFIG_NO_NORMALIZATION +/** + * Get the FC_NFKC_Closure property string for a character. + * See Unicode Standard Annex #15 for details, search for "FC_NFKC_Closure" + * or for "FNC": http://www.unicode.org/reports/tr15/ + * + * @param c The character (code point) for which to get the FC_NFKC_Closure string. + * It must be <code>0<=c<=0x10ffff</code>. + * @param dest Destination address for copying the string. + * The string will be zero-terminated if possible. + * If there is no FC_NFKC_Closure string, + * then the buffer will be set to the empty string. + * @param destCapacity <code>==sizeof(dest)</code> + * @param pErrorCode Pointer to a UErrorCode variable. + * @return The length of the string, or 0 if there is no FC_NFKC_Closure string for this character. + * If the destCapacity is less than or equal to the length, then the buffer + * contains the truncated name and the returned length indicates the full + * length of the name. + * The length does not include the zero-termination. + * + * @stable ICU 2.2 + */ +U_STABLE int32_t U_EXPORT2 +u_getFC_NFKC_Closure(UChar32 c, UChar *dest, int32_t destCapacity, UErrorCode *pErrorCode); + +#endif + + +U_CDECL_END + +#endif /*_UCHAR*/ +/*eof*/ diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/uclean.h b/src/core/basetypes/icu-ucsdet/include/unicode/uclean.h new file mode 100644 index 00000000..d9a1e539 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/uclean.h @@ -0,0 +1,258 @@ +/* +****************************************************************************** +* Copyright (C) 2001-2014, International Business Machines +* Corporation and others. All Rights Reserved. +****************************************************************************** +* file name: uclean.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2001July05 +* created by: George Rhoten +*/ + +#ifndef __UCLEAN_H__ +#define __UCLEAN_H__ + +#include "unicode/utypes.h" +/** + * \file + * \brief C API: Initialize and clean up ICU + */ + +/** + * Initialize ICU. + * + * Use of this function is optional. It is OK to simply use ICU + * services and functions without first having initialized + * ICU by calling u_init(). + * + * u_init() will attempt to load some part of ICU's data, and is + * useful as a test for configuration or installation problems that + * leave the ICU data inaccessible. A successful invocation of u_init() + * does not, however, guarantee that all ICU data is accessible. + * + * Multiple calls to u_init() cause no harm, aside from the small amount + * of time required. + * + * In old versions of ICU, u_init() was required in multi-threaded applications + * to ensure the thread safety of ICU. u_init() is no longer needed for this purpose. + * + * @param status An ICU UErrorCode parameter. It must not be <code>NULL</code>. + * An Error will be returned if some required part of ICU data can not + * be loaded or initialized. + * The function returns immediately if the input error code indicates a + * failure, as usual. + * + * @stable ICU 2.6 + */ +U_STABLE void U_EXPORT2 +u_init(UErrorCode *status); + +#ifndef U_HIDE_SYSTEM_API +/** + * Clean up the system resources, such as allocated memory or open files, + * used in all ICU libraries. This will free/delete all memory owned by the + * ICU libraries, and return them to their original load state. All open ICU + * items (collators, resource bundles, converters, etc.) must be closed before + * calling this function, otherwise ICU may not free its allocated memory + * (e.g. close your converters and resource bundles before calling this + * function). Generally, this function should be called once just before + * an application exits. For applications that dynamically load and unload + * the ICU libraries (relatively uncommon), u_cleanup() should be called + * just before the library unload. + * <p> + * u_cleanup() also clears any ICU heap functions, mutex functions or + * trace functions that may have been set for the process. + * This has the effect of restoring ICU to its initial condition, before + * any of these override functions were installed. Refer to + * u_setMemoryFunctions(), u_setMutexFunctions and + * utrace_setFunctions(). If ICU is to be reinitialized after after + * calling u_cleanup(), these runtime override functions will need to + * be set up again if they are still required. + * <p> + * u_cleanup() is not thread safe. All other threads should stop using ICU + * before calling this function. + * <p> + * Any open ICU items will be left in an undefined state by u_cleanup(), + * and any subsequent attempt to use such an item will give unpredictable + * results. + * <p> + * After calling u_cleanup(), an application may continue to use ICU by + * calling u_init(). An application must invoke u_init() first from one single + * thread before allowing other threads call u_init(). All threads existing + * at the time of the first thread's call to u_init() must also call + * u_init() themselves before continuing with other ICU operations. + * <p> + * The use of u_cleanup() just before an application terminates is optional, + * but it should be called only once for performance reasons. The primary + * benefit is to eliminate reports of memory or resource leaks originating + * in ICU code from the results generated by heap analysis tools. + * <p> + * <strong>Use this function with great care!</strong> + * </p> + * + * @stable ICU 2.0 + * @system + */ +U_STABLE void U_EXPORT2 +u_cleanup(void); + + +/** + * Pointer type for a user supplied memory allocation function. + * @param context user supplied value, obtained from from u_setMemoryFunctions(). + * @param size The number of bytes to be allocated + * @return Pointer to the newly allocated memory, or NULL if the allocation failed. + * @stable ICU 2.8 + * @system + */ +typedef void *U_CALLCONV UMemAllocFn(const void *context, size_t size); +/** + * Pointer type for a user supplied memory re-allocation function. + * @param context user supplied value, obtained from from u_setMemoryFunctions(). + * @param size The number of bytes to be allocated + * @return Pointer to the newly allocated memory, or NULL if the allocation failed. + * @stable ICU 2.8 + * @system + */ +typedef void *U_CALLCONV UMemReallocFn(const void *context, void *mem, size_t size); +/** + * Pointer type for a user supplied memory free function. Behavior should be + * similar the standard C library free(). + * @param context user supplied value, obtained from from u_setMemoryFunctions(). + * @param mem Pointer to the memory block to be resized + * @param size The new size for the block + * @return Pointer to the resized memory block, or NULL if the resizing failed. + * @stable ICU 2.8 + * @system + */ +typedef void U_CALLCONV UMemFreeFn (const void *context, void *mem); + +/** + * Set the functions that ICU will use for memory allocation. + * Use of this function is optional; by default (without this function), ICU will + * use the standard C library malloc() and free() functions. + * This function can only be used when ICU is in an initial, unused state, before + * u_init() has been called. + * @param context This pointer value will be saved, and then (later) passed as + * a parameter to the memory functions each time they + * are called. + * @param a Pointer to a user-supplied malloc function. + * @param r Pointer to a user-supplied realloc function. + * @param f Pointer to a user-supplied free function. + * @param status Receives error values. + * @stable ICU 2.8 + * @system + */ +U_STABLE void U_EXPORT2 +u_setMemoryFunctions(const void *context, UMemAllocFn *a, UMemReallocFn *r, UMemFreeFn *f, + UErrorCode *status); + + +#ifndef U_HIDE_DEPRECATED_API +/********************************************************************************* + * + * Deprecated Functions + * + * The following functions for user supplied mutexes are no longer supported. + * Any attempt to use them will return a U_UNSUPPORTED_ERROR. + * + **********************************************************************************/ + +/** + * An opaque pointer type that represents an ICU mutex. + * For user-implemented mutexes, the value will typically point to a + * struct or object that implements the mutex. + * @deprecated ICU 52. This type is no longer supported. + * @system + */ +typedef void *UMTX; + +/** + * Function Pointer type for a user supplied mutex initialization function. + * The user-supplied function will be called by ICU whenever ICU needs to create a + * new mutex. The function implementation should create a mutex, and store a pointer + * to something that uniquely identifies the mutex into the UMTX that is supplied + * as a paramter. + * @param context user supplied value, obtained from from u_setMutexFunctions(). + * @param mutex Receives a pointer that identifies the new mutex. + * The mutex init function must set the UMTX to a non-null value. + * Subsequent calls by ICU to lock, unlock, or destroy a mutex will + * identify the mutex by the UMTX value. + * @param status Error status. Report errors back to ICU by setting this variable + * with an error code. + * @deprecated ICU 52. This function is no longer supported. + * @system + */ +typedef void U_CALLCONV UMtxInitFn (const void *context, UMTX *mutex, UErrorCode* status); + + +/** + * Function Pointer type for a user supplied mutex functions. + * One of the user-supplied functions with this signature will be called by ICU + * whenever ICU needs to lock, unlock, or destroy a mutex. + * @param context user supplied value, obtained from from u_setMutexFunctions(). + * @param mutex specify the mutex on which to operate. + * @deprecated ICU 52. This function is no longer supported. + * @system + */ +typedef void U_CALLCONV UMtxFn (const void *context, UMTX *mutex); + + +/** + * Set the functions that ICU will use for mutex operations + * Use of this function is optional; by default (without this function), ICU will + * directly access system functions for mutex operations + * This function can only be used when ICU is in an initial, unused state, before + * u_init() has been called. + * @param context This pointer value will be saved, and then (later) passed as + * a parameter to the user-supplied mutex functions each time they + * are called. + * @param init Pointer to a mutex initialization function. Must be non-null. + * @param destroy Pointer to the mutex destroy function. Must be non-null. + * @param lock pointer to the mutex lock function. Must be non-null. + * @param unlock Pointer to the mutex unlock function. Must be non-null. + * @param status Receives error values. + * @deprecated ICU 52. This function is no longer supported. + * @system + */ +U_DEPRECATED void U_EXPORT2 +u_setMutexFunctions(const void *context, UMtxInitFn *init, UMtxFn *destroy, UMtxFn *lock, UMtxFn *unlock, + UErrorCode *status); + + +/** + * Pointer type for a user supplied atomic increment or decrement function. + * @param context user supplied value, obtained from from u_setAtomicIncDecFunctions(). + * @param p Pointer to a 32 bit int to be incremented or decremented + * @return The value of the variable after the inc or dec operation. + * @deprecated ICU 52. This function is no longer supported. + * @system + */ +typedef int32_t U_CALLCONV UMtxAtomicFn(const void *context, int32_t *p); + +/** + * Set the functions that ICU will use for atomic increment and decrement of int32_t values. + * Use of this function is optional; by default (without this function), ICU will + * use its own internal implementation of atomic increment/decrement. + * This function can only be used when ICU is in an initial, unused state, before + * u_init() has been called. + * @param context This pointer value will be saved, and then (later) passed as + * a parameter to the increment and decrement functions each time they + * are called. This function can only be called + * @param inc Pointer to a function to do an atomic increment operation. Must be non-null. + * @param dec Pointer to a function to do an atomic decrement operation. Must be non-null. + * @param status Receives error values. + * @deprecated ICU 52. This function is no longer supported. + * @system + */ +U_DEPRECATED void U_EXPORT2 +u_setAtomicIncDecFunctions(const void *context, UMtxAtomicFn *inc, UMtxAtomicFn *dec, + UErrorCode *status); + +#endif /* U_HIDE_DEPRECATED_API */ +#endif /* U_HIDE_SYSTEM_API */ + +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/ucnv.h b/src/core/basetypes/icu-ucsdet/include/unicode/ucnv.h new file mode 100644 index 00000000..564656c2 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/ucnv.h @@ -0,0 +1,2036 @@ +/* +********************************************************************** +* Copyright (C) 1999-2014, International Business Machines +* Corporation and others. All Rights Reserved. +********************************************************************** + * ucnv.h: + * External APIs for the ICU's codeset conversion library + * Bertrand A. Damiba + * + * Modification History: + * + * Date Name Description + * 04/04/99 helena Fixed internal header inclusion. + * 05/11/00 helena Added setFallback and usesFallback APIs. + * 06/29/2000 helena Major rewrite of the callback APIs. + * 12/07/2000 srl Update of documentation + */ + +/** + * \file + * \brief C API: Character conversion + * + * <h2>Character Conversion C API</h2> + * + * <p>This API is used to convert codepage or character encoded data to and + * from UTF-16. You can open a converter with {@link ucnv_open() }. With that + * converter, you can get its properties, set options, convert your data and + * close the converter.</p> + * + * <p>Since many software programs recogize different converter names for + * different types of converters, there are other functions in this API to + * iterate over the converter aliases. The functions {@link ucnv_getAvailableName() }, + * {@link ucnv_getAlias() } and {@link ucnv_getStandardName() } are some of the + * more frequently used alias functions to get this information.</p> + * + * <p>When a converter encounters an illegal, irregular, invalid or unmappable character + * its default behavior is to use a substitution character to replace the + * bad byte sequence. This behavior can be changed by using {@link ucnv_setFromUCallBack() } + * or {@link ucnv_setToUCallBack() } on the converter. The header ucnv_err.h defines + * many other callback actions that can be used instead of a character substitution.</p> + * + * <p>More information about this API can be found in our + * <a href="http://icu-project.org/userguide/conversion.html">User's + * Guide</a>.</p> + */ + +#ifndef UCNV_H +#define UCNV_H + +#include "unicode/ucnv_err.h" +#include "unicode/uenum.h" +#include "unicode/localpointer.h" + +#ifndef __USET_H__ + +/** + * USet is the C API type for Unicode sets. + * It is forward-declared here to avoid including the header file if related + * conversion APIs are not used. + * See unicode/uset.h + * + * @see ucnv_getUnicodeSet + * @stable ICU 2.6 + */ +struct USet; +/** @stable ICU 2.6 */ +typedef struct USet USet; + +#endif + +#if !UCONFIG_NO_CONVERSION + +U_CDECL_BEGIN + +/** Maximum length of a converter name including the terminating NULL @stable ICU 2.0 */ +#define UCNV_MAX_CONVERTER_NAME_LENGTH 60 +/** Maximum length of a converter name including path and terminating NULL @stable ICU 2.0 */ +#define UCNV_MAX_FULL_FILE_NAME_LENGTH (600+UCNV_MAX_CONVERTER_NAME_LENGTH) + +/** Shift in for EBDCDIC_STATEFUL and iso2022 states @stable ICU 2.0 */ +#define UCNV_SI 0x0F +/** Shift out for EBDCDIC_STATEFUL and iso2022 states @stable ICU 2.0 */ +#define UCNV_SO 0x0E + +/** + * Enum for specifying basic types of converters + * @see ucnv_getType + * @stable ICU 2.0 + */ +typedef enum { + /** @stable ICU 2.0 */ + UCNV_UNSUPPORTED_CONVERTER = -1, + /** @stable ICU 2.0 */ + UCNV_SBCS = 0, + /** @stable ICU 2.0 */ + UCNV_DBCS = 1, + /** @stable ICU 2.0 */ + UCNV_MBCS = 2, + /** @stable ICU 2.0 */ + UCNV_LATIN_1 = 3, + /** @stable ICU 2.0 */ + UCNV_UTF8 = 4, + /** @stable ICU 2.0 */ + UCNV_UTF16_BigEndian = 5, + /** @stable ICU 2.0 */ + UCNV_UTF16_LittleEndian = 6, + /** @stable ICU 2.0 */ + UCNV_UTF32_BigEndian = 7, + /** @stable ICU 2.0 */ + UCNV_UTF32_LittleEndian = 8, + /** @stable ICU 2.0 */ + UCNV_EBCDIC_STATEFUL = 9, + /** @stable ICU 2.0 */ + UCNV_ISO_2022 = 10, + + /** @stable ICU 2.0 */ + UCNV_LMBCS_1 = 11, + /** @stable ICU 2.0 */ + UCNV_LMBCS_2, + /** @stable ICU 2.0 */ + UCNV_LMBCS_3, + /** @stable ICU 2.0 */ + UCNV_LMBCS_4, + /** @stable ICU 2.0 */ + UCNV_LMBCS_5, + /** @stable ICU 2.0 */ + UCNV_LMBCS_6, + /** @stable ICU 2.0 */ + UCNV_LMBCS_8, + /** @stable ICU 2.0 */ + UCNV_LMBCS_11, + /** @stable ICU 2.0 */ + UCNV_LMBCS_16, + /** @stable ICU 2.0 */ + UCNV_LMBCS_17, + /** @stable ICU 2.0 */ + UCNV_LMBCS_18, + /** @stable ICU 2.0 */ + UCNV_LMBCS_19, + /** @stable ICU 2.0 */ + UCNV_LMBCS_LAST = UCNV_LMBCS_19, + /** @stable ICU 2.0 */ + UCNV_HZ, + /** @stable ICU 2.0 */ + UCNV_SCSU, + /** @stable ICU 2.0 */ + UCNV_ISCII, + /** @stable ICU 2.0 */ + UCNV_US_ASCII, + /** @stable ICU 2.0 */ + UCNV_UTF7, + /** @stable ICU 2.2 */ + UCNV_BOCU1, + /** @stable ICU 2.2 */ + UCNV_UTF16, + /** @stable ICU 2.2 */ + UCNV_UTF32, + /** @stable ICU 2.2 */ + UCNV_CESU8, + /** @stable ICU 2.4 */ + UCNV_IMAP_MAILBOX, + /** @stable ICU 4.8 */ + UCNV_COMPOUND_TEXT, + + /* Number of converter types for which we have conversion routines. */ + UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES +} UConverterType; + +/** + * Enum for specifying which platform a converter ID refers to. + * The use of platform/CCSID is not recommended. See ucnv_openCCSID(). + * + * @see ucnv_getPlatform + * @see ucnv_openCCSID + * @see ucnv_getCCSID + * @stable ICU 2.0 + */ +typedef enum { + UCNV_UNKNOWN = -1, + UCNV_IBM = 0 +} UConverterPlatform; + +/** + * Function pointer for error callback in the codepage to unicode direction. + * Called when an error has occured in conversion to unicode, or on open/close of the callback (see reason). + * @param context Pointer to the callback's private data + * @param args Information about the conversion in progress + * @param codeUnits Points to 'length' bytes of the concerned codepage sequence + * @param length Size (in bytes) of the concerned codepage sequence + * @param reason Defines the reason the callback was invoked + * @param pErrorCode ICU error code in/out parameter. + * For converter callback functions, set to a conversion error + * before the call, and the callback may reset it to U_ZERO_ERROR. + * @see ucnv_setToUCallBack + * @see UConverterToUnicodeArgs + * @stable ICU 2.0 + */ +typedef void (U_EXPORT2 *UConverterToUCallback) ( + const void* context, + UConverterToUnicodeArgs *args, + const char *codeUnits, + int32_t length, + UConverterCallbackReason reason, + UErrorCode *pErrorCode); + +/** + * Function pointer for error callback in the unicode to codepage direction. + * Called when an error has occured in conversion from unicode, or on open/close of the callback (see reason). + * @param context Pointer to the callback's private data + * @param args Information about the conversion in progress + * @param codeUnits Points to 'length' UChars of the concerned Unicode sequence + * @param length Size (in bytes) of the concerned codepage sequence + * @param codePoint Single UChar32 (UTF-32) containing the concerend Unicode codepoint. + * @param reason Defines the reason the callback was invoked + * @param pErrorCode ICU error code in/out parameter. + * For converter callback functions, set to a conversion error + * before the call, and the callback may reset it to U_ZERO_ERROR. + * @see ucnv_setFromUCallBack + * @stable ICU 2.0 + */ +typedef void (U_EXPORT2 *UConverterFromUCallback) ( + const void* context, + UConverterFromUnicodeArgs *args, + const UChar* codeUnits, + int32_t length, + UChar32 codePoint, + UConverterCallbackReason reason, + UErrorCode *pErrorCode); + +U_CDECL_END + +/** + * Character that separates converter names from options and options from each other. + * @see ucnv_open + * @stable ICU 2.0 + */ +#define UCNV_OPTION_SEP_CHAR ',' + +/** + * String version of UCNV_OPTION_SEP_CHAR. + * @see ucnv_open + * @stable ICU 2.0 + */ +#define UCNV_OPTION_SEP_STRING "," + +/** + * Character that separates a converter option from its value. + * @see ucnv_open + * @stable ICU 2.0 + */ +#define UCNV_VALUE_SEP_CHAR '=' + +/** + * String version of UCNV_VALUE_SEP_CHAR. + * @see ucnv_open + * @stable ICU 2.0 + */ +#define UCNV_VALUE_SEP_STRING "=" + +/** + * Converter option for specifying a locale. + * For example, ucnv_open("SCSU,locale=ja", &errorCode); + * See convrtrs.txt. + * + * @see ucnv_open + * @stable ICU 2.0 + */ +#define UCNV_LOCALE_OPTION_STRING ",locale=" + +/** + * Converter option for specifying a version selector (0..9) for some converters. + * For example, + * \code + * ucnv_open("UTF-7,version=1", &errorCode); + * \endcode + * See convrtrs.txt. + * + * @see ucnv_open + * @stable ICU 2.4 + */ +#define UCNV_VERSION_OPTION_STRING ",version=" + +/** + * Converter option for EBCDIC SBCS or mixed-SBCS/DBCS (stateful) codepages. + * Swaps Unicode mappings for EBCDIC LF and NL codes, as used on + * S/390 (z/OS) Unix System Services (Open Edition). + * For example, ucnv_open("ibm-1047,swaplfnl", &errorCode); + * See convrtrs.txt. + * + * @see ucnv_open + * @stable ICU 2.4 + */ +#define UCNV_SWAP_LFNL_OPTION_STRING ",swaplfnl" + +/** + * Do a fuzzy compare of two converter/alias names. + * The comparison is case-insensitive, ignores leading zeroes if they are not + * followed by further digits, and ignores all but letters and digits. + * Thus the strings "UTF-8", "utf_8", "u*T@f08" and "Utf 8" are exactly equivalent. + * See section 1.4, Charset Alias Matching in Unicode Technical Standard #22 + * at http://www.unicode.org/reports/tr22/ + * + * @param name1 a converter name or alias, zero-terminated + * @param name2 a converter name or alias, zero-terminated + * @return 0 if the names match, or a negative value if the name1 + * lexically precedes name2, or a positive value if the name1 + * lexically follows name2. + * @stable ICU 2.0 + */ +U_STABLE int U_EXPORT2 +ucnv_compareNames(const char *name1, const char *name2); + + +/** + * Creates a UConverter object with the name of a coded character set specified as a C string. + * The actual name will be resolved with the alias file + * using a case-insensitive string comparison that ignores + * leading zeroes and all non-alphanumeric characters. + * E.g., the names "UTF8", "utf-8", "u*T@f08" and "Utf 8" are all equivalent. + * (See also ucnv_compareNames().) + * If <code>NULL</code> is passed for the converter name, it will create one with the + * getDefaultName return value. + * + * <p>A converter name for ICU 1.5 and above may contain options + * like a locale specification to control the specific behavior of + * the newly instantiated converter. + * The meaning of the options depends on the particular converter. + * If an option is not defined for or recognized by a given converter, then it is ignored.</p> + * + * <p>Options are appended to the converter name string, with a + * <code>UCNV_OPTION_SEP_CHAR</code> between the name and the first option and + * also between adjacent options.</p> + * + * <p>If the alias is ambiguous, then the preferred converter is used + * and the status is set to U_AMBIGUOUS_ALIAS_WARNING.</p> + * + * <p>The conversion behavior and names can vary between platforms. ICU may + * convert some characters differently from other platforms. Details on this topic + * are in the <a href="http://icu-project.org/userguide/conversion.html">User's + * Guide</a>. Aliases starting with a "cp" prefix have no specific meaning + * other than its an alias starting with the letters "cp". Please do not + * associate any meaning to these aliases.</p> + * + * \snippet samples/ucnv/convsamp.cpp ucnv_open + * + * @param converterName Name of the coded character set table. + * This may have options appended to the string. + * IANA alias character set names, IBM CCSIDs starting with "ibm-", + * Windows codepage numbers starting with "windows-" are frequently + * used for this parameter. See ucnv_getAvailableName and + * ucnv_getAlias for a complete list that is available. + * If this parameter is NULL, the default converter will be used. + * @param err outgoing error status <TT>U_MEMORY_ALLOCATION_ERROR, U_FILE_ACCESS_ERROR</TT> + * @return the created Unicode converter object, or <TT>NULL</TT> if an error occured + * @see ucnv_openU + * @see ucnv_openCCSID + * @see ucnv_getAvailableName + * @see ucnv_getAlias + * @see ucnv_getDefaultName + * @see ucnv_close + * @see ucnv_compareNames + * @stable ICU 2.0 + */ +U_STABLE UConverter* U_EXPORT2 +ucnv_open(const char *converterName, UErrorCode *err); + + +/** + * Creates a Unicode converter with the names specified as unicode string. + * The name should be limited to the ASCII-7 alphanumerics range. + * The actual name will be resolved with the alias file + * using a case-insensitive string comparison that ignores + * leading zeroes and all non-alphanumeric characters. + * E.g., the names "UTF8", "utf-8", "u*T@f08" and "Utf 8" are all equivalent. + * (See also ucnv_compareNames().) + * If <TT>NULL</TT> is passed for the converter name, it will create + * one with the ucnv_getDefaultName() return value. + * If the alias is ambiguous, then the preferred converter is used + * and the status is set to U_AMBIGUOUS_ALIAS_WARNING. + * + * <p>See ucnv_open for the complete details</p> + * @param name Name of the UConverter table in a zero terminated + * Unicode string + * @param err outgoing error status <TT>U_MEMORY_ALLOCATION_ERROR, + * U_FILE_ACCESS_ERROR</TT> + * @return the created Unicode converter object, or <TT>NULL</TT> if an + * error occured + * @see ucnv_open + * @see ucnv_openCCSID + * @see ucnv_close + * @see ucnv_compareNames + * @stable ICU 2.0 + */ +U_STABLE UConverter* U_EXPORT2 +ucnv_openU(const UChar *name, + UErrorCode *err); + +/** + * Creates a UConverter object from a CCSID number and platform pair. + * Note that the usefulness of this function is limited to platforms with numeric + * encoding IDs. Only IBM and Microsoft platforms use numeric (16-bit) identifiers for + * encodings. + * + * In addition, IBM CCSIDs and Unicode conversion tables are not 1:1 related. + * For many IBM CCSIDs there are multiple (up to six) Unicode conversion tables, and + * for some Unicode conversion tables there are multiple CCSIDs. + * Some "alternate" Unicode conversion tables are provided by the + * IBM CDRA conversion table registry. + * The most prominent example of a systematic modification of conversion tables that is + * not provided in the form of conversion table files in the repository is + * that S/390 Unix System Services swaps the codes for Line Feed and New Line in all + * EBCDIC codepages, which requires such a swap in the Unicode conversion tables as well. + * + * Only IBM default conversion tables are accessible with ucnv_openCCSID(). + * ucnv_getCCSID() will return the same CCSID for all conversion tables that are associated + * with that CCSID. + * + * Currently, the only "platform" supported in the ICU converter API is UCNV_IBM. + * + * In summary, the use of CCSIDs and the associated API functions is not recommended. + * + * In order to open a converter with the default IBM CDRA Unicode conversion table, + * you can use this function or use the prefix "ibm-": + * \code + * char name[20]; + * sprintf(name, "ibm-%hu", ccsid); + * cnv=ucnv_open(name, &errorCode); + * \endcode + * + * In order to open a converter with the IBM S/390 Unix System Services variant + * of a Unicode/EBCDIC conversion table, + * you can use the prefix "ibm-" together with the option string UCNV_SWAP_LFNL_OPTION_STRING: + * \code + * char name[20]; + * sprintf(name, "ibm-%hu" UCNV_SWAP_LFNL_OPTION_STRING, ccsid); + * cnv=ucnv_open(name, &errorCode); + * \endcode + * + * In order to open a converter from a Microsoft codepage number, use the prefix "cp": + * \code + * char name[20]; + * sprintf(name, "cp%hu", codepageID); + * cnv=ucnv_open(name, &errorCode); + * \endcode + * + * If the alias is ambiguous, then the preferred converter is used + * and the status is set to U_AMBIGUOUS_ALIAS_WARNING. + * + * @param codepage codepage number to create + * @param platform the platform in which the codepage number exists + * @param err error status <TT>U_MEMORY_ALLOCATION_ERROR, U_FILE_ACCESS_ERROR</TT> + * @return the created Unicode converter object, or <TT>NULL</TT> if an error + * occured. + * @see ucnv_open + * @see ucnv_openU + * @see ucnv_close + * @see ucnv_getCCSID + * @see ucnv_getPlatform + * @see UConverterPlatform + * @stable ICU 2.0 + */ +U_STABLE UConverter* U_EXPORT2 +ucnv_openCCSID(int32_t codepage, + UConverterPlatform platform, + UErrorCode * err); + +/** + * <p>Creates a UConverter object specified from a packageName and a converterName.</p> + * + * <p>The packageName and converterName must point to an ICU udata object, as defined by + * <code> udata_open( packageName, "cnv", converterName, err) </code> or equivalent. + * Typically, packageName will refer to a (.dat) file, or to a package registered with + * udata_setAppData(). Using a full file or directory pathname for packageName is deprecated.</p> + * + * <p>The name will NOT be looked up in the alias mechanism, nor will the converter be + * stored in the converter cache or the alias table. The only way to open further converters + * is call this function multiple times, or use the ucnv_safeClone() function to clone a + * 'master' converter.</p> + * + * <p>A future version of ICU may add alias table lookups and/or caching + * to this function.</p> + * + * <p>Example Use: + * <code>cnv = ucnv_openPackage("myapp", "myconverter", &err);</code> + * </p> + * + * @param packageName name of the package (equivalent to 'path' in udata_open() call) + * @param converterName name of the data item to be used, without suffix. + * @param err outgoing error status <TT>U_MEMORY_ALLOCATION_ERROR, U_FILE_ACCESS_ERROR</TT> + * @return the created Unicode converter object, or <TT>NULL</TT> if an error occured + * @see udata_open + * @see ucnv_open + * @see ucnv_safeClone + * @see ucnv_close + * @stable ICU 2.2 + */ +U_STABLE UConverter* U_EXPORT2 +ucnv_openPackage(const char *packageName, const char *converterName, UErrorCode *err); + +/** + * Thread safe converter cloning operation. + * For most efficient operation, pass in a stackBuffer (and a *pBufferSize) + * with at least U_CNV_SAFECLONE_BUFFERSIZE bytes of space. + * If the buffer size is sufficient, then the clone will use the stack buffer; + * otherwise, it will be allocated, and *pBufferSize will indicate + * the actual size. (This should not occur with U_CNV_SAFECLONE_BUFFERSIZE.) + * + * You must ucnv_close() the clone in any case. + * + * If *pBufferSize==0, (regardless of whether stackBuffer==NULL or not) + * then *pBufferSize will be changed to a sufficient size + * for cloning this converter, + * without actually cloning the converter ("pure pre-flighting"). + * + * If *pBufferSize is greater than zero but not large enough for a stack-based + * clone, then the converter is cloned using newly allocated memory + * and *pBufferSize is changed to the necessary size. + * + * If the converter clone fits into the stack buffer but the stack buffer is not + * sufficiently aligned for the clone, then the clone will use an + * adjusted pointer and use an accordingly smaller buffer size. + * + * @param cnv converter to be cloned + * @param stackBuffer <em>Deprecated functionality as of ICU 52, use NULL.</em><br> + * user allocated space for the new clone. If NULL new memory will be allocated. + * If buffer is not large enough, new memory will be allocated. + * Clients can use the U_CNV_SAFECLONE_BUFFERSIZE. This will probably be enough to avoid memory allocations. + * @param pBufferSize <em>Deprecated functionality as of ICU 52, use NULL or 1.</em><br> + * pointer to size of allocated space. + * @param status to indicate whether the operation went on smoothly or there were errors + * An informational status value, U_SAFECLONE_ALLOCATED_WARNING, + * is used if any allocations were necessary. + * However, it is better to check if *pBufferSize grew for checking for + * allocations because warning codes can be overridden by subsequent + * function calls. + * @return pointer to the new clone + * @stable ICU 2.0 + */ +U_STABLE UConverter * U_EXPORT2 +ucnv_safeClone(const UConverter *cnv, + void *stackBuffer, + int32_t *pBufferSize, + UErrorCode *status); + +#ifndef U_HIDE_DEPRECATED_API + +/** + * \def U_CNV_SAFECLONE_BUFFERSIZE + * Definition of a buffer size that is designed to be large enough for + * converters to be cloned with ucnv_safeClone(). + * @deprecated ICU 52. Do not rely on ucnv_safeClone() cloning into any provided buffer. + */ +#define U_CNV_SAFECLONE_BUFFERSIZE 1024 + +#endif /* U_HIDE_DEPRECATED_API */ + +/** + * Deletes the unicode converter and releases resources associated + * with just this instance. + * Does not free up shared converter tables. + * + * @param converter the converter object to be deleted + * @see ucnv_open + * @see ucnv_openU + * @see ucnv_openCCSID + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 +ucnv_close(UConverter * converter); + +#if U_SHOW_CPLUSPLUS_API + +U_NAMESPACE_BEGIN + +/** + * \class LocalUConverterPointer + * "Smart pointer" class, closes a UConverter via ucnv_close(). + * For most methods see the LocalPointerBase base class. + * + * @see LocalPointerBase + * @see LocalPointer + * @stable ICU 4.4 + */ +U_DEFINE_LOCAL_OPEN_POINTER(LocalUConverterPointer, UConverter, ucnv_close); + +U_NAMESPACE_END + +#endif + +/** + * Fills in the output parameter, subChars, with the substitution characters + * as multiple bytes. + * If ucnv_setSubstString() set a Unicode string because the converter is + * stateful, then subChars will be an empty string. + * + * @param converter the Unicode converter + * @param subChars the subsitution characters + * @param len on input the capacity of subChars, on output the number + * of bytes copied to it + * @param err the outgoing error status code. + * If the substitution character array is too small, an + * <TT>U_INDEX_OUTOFBOUNDS_ERROR</TT> will be returned. + * @see ucnv_setSubstString + * @see ucnv_setSubstChars + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 +ucnv_getSubstChars(const UConverter *converter, + char *subChars, + int8_t *len, + UErrorCode *err); + +/** + * Sets the substitution chars when converting from unicode to a codepage. The + * substitution is specified as a string of 1-4 bytes, and may contain + * <TT>NULL</TT> bytes. + * The subChars must represent a single character. The caller needs to know the + * byte sequence of a valid character in the converter's charset. + * For some converters, for example some ISO 2022 variants, only single-byte + * substitution characters may be supported. + * The newer ucnv_setSubstString() function relaxes these limitations. + * + * @param converter the Unicode converter + * @param subChars the substitution character byte sequence we want set + * @param len the number of bytes in subChars + * @param err the error status code. <TT>U_INDEX_OUTOFBOUNDS_ERROR </TT> if + * len is bigger than the maximum number of bytes allowed in subchars + * @see ucnv_setSubstString + * @see ucnv_getSubstChars + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 +ucnv_setSubstChars(UConverter *converter, + const char *subChars, + int8_t len, + UErrorCode *err); + +/** + * Set a substitution string for converting from Unicode to a charset. + * The caller need not know the charset byte sequence for each charset. + * + * Unlike ucnv_setSubstChars() which is designed to set a charset byte sequence + * for a single character, this function takes a Unicode string with + * zero, one or more characters, and immediately verifies that the string can be + * converted to the charset. + * If not, or if the result is too long (more than 32 bytes as of ICU 3.6), + * then the function returns with an error accordingly. + * + * Also unlike ucnv_setSubstChars(), this function works for stateful charsets + * by converting on the fly at the point of substitution rather than setting + * a fixed byte sequence. + * + * @param cnv The UConverter object. + * @param s The Unicode string. + * @param length The number of UChars in s, or -1 for a NUL-terminated string. + * @param err Pointer to a standard ICU error code. Its input value must + * pass the U_SUCCESS() test, or else the function returns + * immediately. Check for U_FAILURE() on output or use with + * function chaining. (See User Guide for details.) + * + * @see ucnv_setSubstChars + * @see ucnv_getSubstChars + * @stable ICU 3.6 + */ +U_STABLE void U_EXPORT2 +ucnv_setSubstString(UConverter *cnv, + const UChar *s, + int32_t length, + UErrorCode *err); + +/** + * Fills in the output parameter, errBytes, with the error characters from the + * last failing conversion. + * + * @param converter the Unicode converter + * @param errBytes the codepage bytes which were in error + * @param len on input the capacity of errBytes, on output the number of + * bytes which were copied to it + * @param err the error status code. + * If the substitution character array is too small, an + * <TT>U_INDEX_OUTOFBOUNDS_ERROR</TT> will be returned. + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 +ucnv_getInvalidChars(const UConverter *converter, + char *errBytes, + int8_t *len, + UErrorCode *err); + +/** + * Fills in the output parameter, errChars, with the error characters from the + * last failing conversion. + * + * @param converter the Unicode converter + * @param errUChars the UChars which were in error + * @param len on input the capacity of errUChars, on output the number of + * UChars which were copied to it + * @param err the error status code. + * If the substitution character array is too small, an + * <TT>U_INDEX_OUTOFBOUNDS_ERROR</TT> will be returned. + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 +ucnv_getInvalidUChars(const UConverter *converter, + UChar *errUChars, + int8_t *len, + UErrorCode *err); + +/** + * Resets the state of a converter to the default state. This is used + * in the case of an error, to restart a conversion from a known default state. + * It will also empty the internal output buffers. + * @param converter the Unicode converter + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 +ucnv_reset(UConverter *converter); + +/** + * Resets the to-Unicode part of a converter state to the default state. + * This is used in the case of an error to restart a conversion to + * Unicode to a known default state. It will also empty the internal + * output buffers used for the conversion to Unicode codepoints. + * @param converter the Unicode converter + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 +ucnv_resetToUnicode(UConverter *converter); + +/** + * Resets the from-Unicode part of a converter state to the default state. + * This is used in the case of an error to restart a conversion from + * Unicode to a known default state. It will also empty the internal output + * buffers used for the conversion from Unicode codepoints. + * @param converter the Unicode converter + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 +ucnv_resetFromUnicode(UConverter *converter); + +/** + * Returns the maximum number of bytes that are output per UChar in conversion + * from Unicode using this converter. + * The returned number can be used with UCNV_GET_MAX_BYTES_FOR_STRING + * to calculate the size of a target buffer for conversion from Unicode. + * + * Note: Before ICU 2.8, this function did not return reliable numbers for + * some stateful converters (EBCDIC_STATEFUL, ISO-2022) and LMBCS. + * + * This number may not be the same as the maximum number of bytes per + * "conversion unit". In other words, it may not be the intuitively expected + * number of bytes per character that would be published for a charset, + * and may not fulfill any other purpose than the allocation of an output + * buffer of guaranteed sufficient size for a given input length and converter. + * + * Examples for special cases that are taken into account: + * - Supplementary code points may convert to more bytes than BMP code points. + * This function returns bytes per UChar (UTF-16 code unit), not per + * Unicode code point, for efficient buffer allocation. + * - State-shifting output (SI/SO, escapes, etc.) from stateful converters. + * - When m input UChars are converted to n output bytes, then the maximum m/n + * is taken into account. + * + * The number returned here does not take into account + * (see UCNV_GET_MAX_BYTES_FOR_STRING): + * - callbacks which output more than one charset character sequence per call, + * like escape callbacks + * - initial and final non-character bytes that are output by some converters + * (automatic BOMs, initial escape sequence, final SI, etc.) + * + * Examples for returned values: + * - SBCS charsets: 1 + * - Shift-JIS: 2 + * - UTF-16: 2 (2 per BMP, 4 per surrogate _pair_, BOM not counted) + * - UTF-8: 3 (3 per BMP, 4 per surrogate _pair_) + * - EBCDIC_STATEFUL (EBCDIC mixed SBCS/DBCS): 3 (SO + DBCS) + * - ISO-2022: 3 (always outputs UTF-8) + * - ISO-2022-JP: 6 (4-byte escape sequences + DBCS) + * - ISO-2022-CN: 8 (4-byte designator sequences + 2-byte SS2/SS3 + DBCS) + * + * @param converter The Unicode converter. + * @return The maximum number of bytes per UChar (16 bit code unit) + * that are output by ucnv_fromUnicode(), + * to be used together with UCNV_GET_MAX_BYTES_FOR_STRING + * for buffer allocation. + * + * @see UCNV_GET_MAX_BYTES_FOR_STRING + * @see ucnv_getMinCharSize + * @stable ICU 2.0 + */ +U_STABLE int8_t U_EXPORT2 +ucnv_getMaxCharSize(const UConverter *converter); + +/** + * Calculates the size of a buffer for conversion from Unicode to a charset. + * The calculated size is guaranteed to be sufficient for this conversion. + * + * It takes into account initial and final non-character bytes that are output + * by some converters. + * It does not take into account callbacks which output more than one charset + * character sequence per call, like escape callbacks. + * The default (substitution) callback only outputs one charset character sequence. + * + * @param length Number of UChars to be converted. + * @param maxCharSize Return value from ucnv_getMaxCharSize() for the converter + * that will be used. + * @return Size of a buffer that will be large enough to hold the output bytes of + * converting length UChars with the converter that returned the maxCharSize. + * + * @see ucnv_getMaxCharSize + * @stable ICU 2.8 + */ +#define UCNV_GET_MAX_BYTES_FOR_STRING(length, maxCharSize) \ + (((int32_t)(length)+10)*(int32_t)(maxCharSize)) + +/** + * Returns the minimum byte length (per codepoint) for characters in this codepage. + * This is usually either 1 or 2. + * @param converter the Unicode converter + * @return the minimum number of bytes per codepoint allowed by this particular converter + * @see ucnv_getMaxCharSize + * @stable ICU 2.0 + */ +U_STABLE int8_t U_EXPORT2 +ucnv_getMinCharSize(const UConverter *converter); + +/** + * Returns the display name of the converter passed in based on the Locale + * passed in. If the locale contains no display name, the internal ASCII + * name will be filled in. + * + * @param converter the Unicode converter. + * @param displayLocale is the specific Locale we want to localised for + * @param displayName user provided buffer to be filled in + * @param displayNameCapacity size of displayName Buffer + * @param err error status code + * @return displayNameLength number of UChar needed in displayName + * @see ucnv_getName + * @stable ICU 2.0 + */ +U_STABLE int32_t U_EXPORT2 +ucnv_getDisplayName(const UConverter *converter, + const char *displayLocale, + UChar *displayName, + int32_t displayNameCapacity, + UErrorCode *err); + +/** + * Gets the internal, canonical name of the converter (zero-terminated). + * The lifetime of the returned string will be that of the converter + * passed to this function. + * @param converter the Unicode converter + * @param err UErrorCode status + * @return the internal name of the converter + * @see ucnv_getDisplayName + * @stable ICU 2.0 + */ +U_STABLE const char * U_EXPORT2 +ucnv_getName(const UConverter *converter, UErrorCode *err); + +/** + * Gets a codepage number associated with the converter. This is not guaranteed + * to be the one used to create the converter. Some converters do not represent + * platform registered codepages and return zero for the codepage number. + * The error code fill-in parameter indicates if the codepage number + * is available. + * Does not check if the converter is <TT>NULL</TT> or if converter's data + * table is <TT>NULL</TT>. + * + * Important: The use of CCSIDs is not recommended because it is limited + * to only two platforms in principle and only one (UCNV_IBM) in the current + * ICU converter API. + * Also, CCSIDs are insufficient to identify IBM Unicode conversion tables precisely. + * For more details see ucnv_openCCSID(). + * + * @param converter the Unicode converter + * @param err the error status code. + * @return If any error occurrs, -1 will be returned otherwise, the codepage number + * will be returned + * @see ucnv_openCCSID + * @see ucnv_getPlatform + * @stable ICU 2.0 + */ +U_STABLE int32_t U_EXPORT2 +ucnv_getCCSID(const UConverter *converter, + UErrorCode *err); + +/** + * Gets a codepage platform associated with the converter. Currently, + * only <TT>UCNV_IBM</TT> will be returned. + * Does not test if the converter is <TT>NULL</TT> or if converter's data + * table is <TT>NULL</TT>. + * @param converter the Unicode converter + * @param err the error status code. + * @return The codepage platform + * @stable ICU 2.0 + */ +U_STABLE UConverterPlatform U_EXPORT2 +ucnv_getPlatform(const UConverter *converter, + UErrorCode *err); + +/** + * Gets the type of the converter + * e.g. SBCS, MBCS, DBCS, UTF8, UTF16_BE, UTF16_LE, ISO_2022, + * EBCDIC_STATEFUL, LATIN_1 + * @param converter a valid, opened converter + * @return the type of the converter + * @stable ICU 2.0 + */ +U_STABLE UConverterType U_EXPORT2 +ucnv_getType(const UConverter * converter); + +/** + * Gets the "starter" (lead) bytes for converters of type MBCS. + * Will fill in an <TT>U_ILLEGAL_ARGUMENT_ERROR</TT> if converter passed in + * is not MBCS. Fills in an array of type UBool, with the value of the byte + * as offset to the array. For example, if (starters[0x20] == TRUE) at return, + * it means that the byte 0x20 is a starter byte in this converter. + * Context pointers are always owned by the caller. + * + * @param converter a valid, opened converter of type MBCS + * @param starters an array of size 256 to be filled in + * @param err error status, <TT>U_ILLEGAL_ARGUMENT_ERROR</TT> if the + * converter is not a type which can return starters. + * @see ucnv_getType + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 +ucnv_getStarters(const UConverter* converter, + UBool starters[256], + UErrorCode* err); + + +/** + * Selectors for Unicode sets that can be returned by ucnv_getUnicodeSet(). + * @see ucnv_getUnicodeSet + * @stable ICU 2.6 + */ +typedef enum UConverterUnicodeSet { + /** Select the set of roundtrippable Unicode code points. @stable ICU 2.6 */ + UCNV_ROUNDTRIP_SET, + /** Select the set of Unicode code points with roundtrip or fallback mappings. @stable ICU 4.0 */ + UCNV_ROUNDTRIP_AND_FALLBACK_SET, + /** Number of UConverterUnicodeSet selectors. @stable ICU 2.6 */ + UCNV_SET_COUNT +} UConverterUnicodeSet; + + +/** + * Returns the set of Unicode code points that can be converted by an ICU converter. + * + * Returns one of several kinds of set: + * + * 1. UCNV_ROUNDTRIP_SET + * + * The set of all Unicode code points that can be roundtrip-converted + * (converted without any data loss) with the converter (ucnv_fromUnicode()). + * This set will not include code points that have fallback mappings + * or are only the result of reverse fallback mappings. + * This set will also not include PUA code points with fallbacks, although + * ucnv_fromUnicode() will always uses those mappings despite ucnv_setFallback(). + * See UTR #22 "Character Mapping Markup Language" + * at http://www.unicode.org/reports/tr22/ + * + * This is useful for example for + * - checking that a string or document can be roundtrip-converted with a converter, + * without/before actually performing the conversion + * - testing if a converter can be used for text for typical text for a certain locale, + * by comparing its roundtrip set with the set of ExemplarCharacters from + * ICU's locale data or other sources + * + * 2. UCNV_ROUNDTRIP_AND_FALLBACK_SET + * + * The set of all Unicode code points that can be converted with the converter (ucnv_fromUnicode()) + * when fallbacks are turned on (see ucnv_setFallback()). + * This set includes all code points with roundtrips and fallbacks (but not reverse fallbacks). + * + * In the future, there may be more UConverterUnicodeSet choices to select + * sets with different properties. + * + * @param cnv The converter for which a set is requested. + * @param setFillIn A valid USet *. It will be cleared by this function before + * the converter's specific set is filled into the USet. + * @param whichSet A UConverterUnicodeSet selector; + * currently UCNV_ROUNDTRIP_SET is the only supported value. + * @param pErrorCode ICU error code in/out parameter. + * Must fulfill U_SUCCESS before the function call. + * + * @see UConverterUnicodeSet + * @see uset_open + * @see uset_close + * @stable ICU 2.6 + */ +U_STABLE void U_EXPORT2 +ucnv_getUnicodeSet(const UConverter *cnv, + USet *setFillIn, + UConverterUnicodeSet whichSet, + UErrorCode *pErrorCode); + +/** + * Gets the current calback function used by the converter when an illegal + * or invalid codepage sequence is found. + * Context pointers are always owned by the caller. + * + * @param converter the unicode converter + * @param action fillin: returns the callback function pointer + * @param context fillin: returns the callback's private void* context + * @see ucnv_setToUCallBack + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 +ucnv_getToUCallBack (const UConverter * converter, + UConverterToUCallback *action, + const void **context); + +/** + * Gets the current callback function used by the converter when illegal + * or invalid Unicode sequence is found. + * Context pointers are always owned by the caller. + * + * @param converter the unicode converter + * @param action fillin: returns the callback function pointer + * @param context fillin: returns the callback's private void* context + * @see ucnv_setFromUCallBack + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 +ucnv_getFromUCallBack (const UConverter * converter, + UConverterFromUCallback *action, + const void **context); + +/** + * Changes the callback function used by the converter when + * an illegal or invalid sequence is found. + * Context pointers are always owned by the caller. + * Predefined actions and contexts can be found in the ucnv_err.h header. + * + * @param converter the unicode converter + * @param newAction the new callback function + * @param newContext the new toUnicode callback context pointer. This can be NULL. + * @param oldAction fillin: returns the old callback function pointer. This can be NULL. + * @param oldContext fillin: returns the old callback's private void* context. This can be NULL. + * @param err The error code status + * @see ucnv_getToUCallBack + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 +ucnv_setToUCallBack (UConverter * converter, + UConverterToUCallback newAction, + const void* newContext, + UConverterToUCallback *oldAction, + const void** oldContext, + UErrorCode * err); + +/** + * Changes the current callback function used by the converter when + * an illegal or invalid sequence is found. + * Context pointers are always owned by the caller. + * Predefined actions and contexts can be found in the ucnv_err.h header. + * + * @param converter the unicode converter + * @param newAction the new callback function + * @param newContext the new fromUnicode callback context pointer. This can be NULL. + * @param oldAction fillin: returns the old callback function pointer. This can be NULL. + * @param oldContext fillin: returns the old callback's private void* context. This can be NULL. + * @param err The error code status + * @see ucnv_getFromUCallBack + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 +ucnv_setFromUCallBack (UConverter * converter, + UConverterFromUCallback newAction, + const void *newContext, + UConverterFromUCallback *oldAction, + const void **oldContext, + UErrorCode * err); + +/** + * Converts an array of unicode characters to an array of codepage + * characters. This function is optimized for converting a continuous + * stream of data in buffer-sized chunks, where the entire source and + * target does not fit in available buffers. + * + * The source pointer is an in/out parameter. It starts out pointing where the + * conversion is to begin, and ends up pointing after the last UChar consumed. + * + * Target similarly starts out pointer at the first available byte in the output + * buffer, and ends up pointing after the last byte written to the output. + * + * The converter always attempts to consume the entire source buffer, unless + * (1.) the target buffer is full, or (2.) a failing error is returned from the + * current callback function. When a successful error status has been + * returned, it means that all of the source buffer has been + * consumed. At that point, the caller should reset the source and + * sourceLimit pointers to point to the next chunk. + * + * At the end of the stream (flush==TRUE), the input is completely consumed + * when *source==sourceLimit and no error code is set. + * The converter object is then automatically reset by this function. + * (This means that a converter need not be reset explicitly between data + * streams if it finishes the previous stream without errors.) + * + * This is a <I>stateful</I> conversion. Additionally, even when all source data has + * been consumed, some data may be in the converters' internal state. + * Call this function repeatedly, updating the target pointers with + * the next empty chunk of target in case of a + * <TT>U_BUFFER_OVERFLOW_ERROR</TT>, and updating the source pointers + * with the next chunk of source when a successful error status is + * returned, until there are no more chunks of source data. + * @param converter the Unicode converter + * @param target I/O parameter. Input : Points to the beginning of the buffer to copy + * codepage characters to. Output : points to after the last codepage character copied + * to <TT>target</TT>. + * @param targetLimit the pointer just after last of the <TT>target</TT> buffer + * @param source I/O parameter, pointer to pointer to the source Unicode character buffer. + * @param sourceLimit the pointer just after the last of the source buffer + * @param offsets if NULL is passed, nothing will happen to it, otherwise it needs to have the same number + * of allocated cells as <TT>target</TT>. Will fill in offsets from target to source pointer + * e.g: <TT>offsets[3]</TT> is equal to 6, it means that the <TT>target[3]</TT> was a result of transcoding <TT>source[6]</TT> + * For output data carried across calls, and other data without a specific source character + * (such as from escape sequences or callbacks) -1 will be placed for offsets. + * @param flush set to <TT>TRUE</TT> if the current source buffer is the last available + * chunk of the source, <TT>FALSE</TT> otherwise. Note that if a failing status is returned, + * this function may have to be called multiple times with flush set to <TT>TRUE</TT> until + * the source buffer is consumed. + * @param err the error status. <TT>U_ILLEGAL_ARGUMENT_ERROR</TT> will be set if the + * converter is <TT>NULL</TT>. + * <code>U_BUFFER_OVERFLOW_ERROR</code> will be set if the target is full and there is + * still data to be written to the target. + * @see ucnv_fromUChars + * @see ucnv_convert + * @see ucnv_getMinCharSize + * @see ucnv_setToUCallBack + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 +ucnv_fromUnicode (UConverter * converter, + char **target, + const char *targetLimit, + const UChar ** source, + const UChar * sourceLimit, + int32_t* offsets, + UBool flush, + UErrorCode * err); + +/** + * Converts a buffer of codepage bytes into an array of unicode UChars + * characters. This function is optimized for converting a continuous + * stream of data in buffer-sized chunks, where the entire source and + * target does not fit in available buffers. + * + * The source pointer is an in/out parameter. It starts out pointing where the + * conversion is to begin, and ends up pointing after the last byte of source consumed. + * + * Target similarly starts out pointer at the first available UChar in the output + * buffer, and ends up pointing after the last UChar written to the output. + * It does NOT necessarily keep UChar sequences together. + * + * The converter always attempts to consume the entire source buffer, unless + * (1.) the target buffer is full, or (2.) a failing error is returned from the + * current callback function. When a successful error status has been + * returned, it means that all of the source buffer has been + * consumed. At that point, the caller should reset the source and + * sourceLimit pointers to point to the next chunk. + * + * At the end of the stream (flush==TRUE), the input is completely consumed + * when *source==sourceLimit and no error code is set + * The converter object is then automatically reset by this function. + * (This means that a converter need not be reset explicitly between data + * streams if it finishes the previous stream without errors.) + * + * This is a <I>stateful</I> conversion. Additionally, even when all source data has + * been consumed, some data may be in the converters' internal state. + * Call this function repeatedly, updating the target pointers with + * the next empty chunk of target in case of a + * <TT>U_BUFFER_OVERFLOW_ERROR</TT>, and updating the source pointers + * with the next chunk of source when a successful error status is + * returned, until there are no more chunks of source data. + * @param converter the Unicode converter + * @param target I/O parameter. Input : Points to the beginning of the buffer to copy + * UChars into. Output : points to after the last UChar copied. + * @param targetLimit the pointer just after the end of the <TT>target</TT> buffer + * @param source I/O parameter, pointer to pointer to the source codepage buffer. + * @param sourceLimit the pointer to the byte after the end of the source buffer + * @param offsets if NULL is passed, nothing will happen to it, otherwise it needs to have the same number + * of allocated cells as <TT>target</TT>. Will fill in offsets from target to source pointer + * e.g: <TT>offsets[3]</TT> is equal to 6, it means that the <TT>target[3]</TT> was a result of transcoding <TT>source[6]</TT> + * For output data carried across calls, and other data without a specific source character + * (such as from escape sequences or callbacks) -1 will be placed for offsets. + * @param flush set to <TT>TRUE</TT> if the current source buffer is the last available + * chunk of the source, <TT>FALSE</TT> otherwise. Note that if a failing status is returned, + * this function may have to be called multiple times with flush set to <TT>TRUE</TT> until + * the source buffer is consumed. + * @param err the error status. <TT>U_ILLEGAL_ARGUMENT_ERROR</TT> will be set if the + * converter is <TT>NULL</TT>. + * <code>U_BUFFER_OVERFLOW_ERROR</code> will be set if the target is full and there is + * still data to be written to the target. + * @see ucnv_fromUChars + * @see ucnv_convert + * @see ucnv_getMinCharSize + * @see ucnv_setFromUCallBack + * @see ucnv_getNextUChar + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 +ucnv_toUnicode(UConverter *converter, + UChar **target, + const UChar *targetLimit, + const char **source, + const char *sourceLimit, + int32_t *offsets, + UBool flush, + UErrorCode *err); + +/** + * Convert the Unicode string into a codepage string using an existing UConverter. + * The output string is NUL-terminated if possible. + * + * This function is a more convenient but less powerful version of ucnv_fromUnicode(). + * It is only useful for whole strings, not for streaming conversion. + * + * The maximum output buffer capacity required (barring output from callbacks) will be + * UCNV_GET_MAX_BYTES_FOR_STRING(srcLength, ucnv_getMaxCharSize(cnv)). + * + * @param cnv the converter object to be used (ucnv_resetFromUnicode() will be called) + * @param src the input Unicode string + * @param srcLength the input string length, or -1 if NUL-terminated + * @param dest destination string buffer, can be NULL if destCapacity==0 + * @param destCapacity the number of chars available at dest + * @param pErrorCode normal ICU error code; + * common error codes that may be set by this function include + * U_BUFFER_OVERFLOW_ERROR, U_STRING_NOT_TERMINATED_WARNING, + * U_ILLEGAL_ARGUMENT_ERROR, and conversion errors + * @return the length of the output string, not counting the terminating NUL; + * if the length is greater than destCapacity, then the string will not fit + * and a buffer of the indicated length would need to be passed in + * @see ucnv_fromUnicode + * @see ucnv_convert + * @see UCNV_GET_MAX_BYTES_FOR_STRING + * @stable ICU 2.0 + */ +U_STABLE int32_t U_EXPORT2 +ucnv_fromUChars(UConverter *cnv, + char *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + UErrorCode *pErrorCode); + +/** + * Convert the codepage string into a Unicode string using an existing UConverter. + * The output string is NUL-terminated if possible. + * + * This function is a more convenient but less powerful version of ucnv_toUnicode(). + * It is only useful for whole strings, not for streaming conversion. + * + * The maximum output buffer capacity required (barring output from callbacks) will be + * 2*srcLength (each char may be converted into a surrogate pair). + * + * @param cnv the converter object to be used (ucnv_resetToUnicode() will be called) + * @param src the input codepage string + * @param srcLength the input string length, or -1 if NUL-terminated + * @param dest destination string buffer, can be NULL if destCapacity==0 + * @param destCapacity the number of UChars available at dest + * @param pErrorCode normal ICU error code; + * common error codes that may be set by this function include + * U_BUFFER_OVERFLOW_ERROR, U_STRING_NOT_TERMINATED_WARNING, + * U_ILLEGAL_ARGUMENT_ERROR, and conversion errors + * @return the length of the output string, not counting the terminating NUL; + * if the length is greater than destCapacity, then the string will not fit + * and a buffer of the indicated length would need to be passed in + * @see ucnv_toUnicode + * @see ucnv_convert + * @stable ICU 2.0 + */ +U_STABLE int32_t U_EXPORT2 +ucnv_toUChars(UConverter *cnv, + UChar *dest, int32_t destCapacity, + const char *src, int32_t srcLength, + UErrorCode *pErrorCode); + +/** + * Convert a codepage buffer into Unicode one character at a time. + * The input is completely consumed when the U_INDEX_OUTOFBOUNDS_ERROR is set. + * + * Advantage compared to ucnv_toUnicode() or ucnv_toUChars(): + * - Faster for small amounts of data, for most converters, e.g., + * US-ASCII, ISO-8859-1, UTF-8/16/32, and most "normal" charsets. + * (For complex converters, e.g., SCSU, UTF-7 and ISO 2022 variants, + * it uses ucnv_toUnicode() internally.) + * - Convenient. + * + * Limitations compared to ucnv_toUnicode(): + * - Always assumes flush=TRUE. + * This makes ucnv_getNextUChar() unsuitable for "streaming" conversion, + * that is, for where the input is supplied in multiple buffers, + * because ucnv_getNextUChar() will assume the end of the input at the end + * of the first buffer. + * - Does not provide offset output. + * + * It is possible to "mix" ucnv_getNextUChar() and ucnv_toUnicode() because + * ucnv_getNextUChar() uses the current state of the converter + * (unlike ucnv_toUChars() which always resets first). + * However, if ucnv_getNextUChar() is called after ucnv_toUnicode() + * stopped in the middle of a character sequence (with flush=FALSE), + * then ucnv_getNextUChar() will always use the slower ucnv_toUnicode() + * internally until the next character boundary. + * (This is new in ICU 2.6. In earlier releases, ucnv_getNextUChar() had to + * start at a character boundary.) + * + * Instead of using ucnv_getNextUChar(), it is recommended + * to convert using ucnv_toUnicode() or ucnv_toUChars() + * and then iterate over the text using U16_NEXT() or a UCharIterator (uiter.h) + * or a C++ CharacterIterator or similar. + * This allows streaming conversion and offset output, for example. + * + * <p>Handling of surrogate pairs and supplementary-plane code points:<br> + * There are two different kinds of codepages that provide mappings for surrogate characters: + * <ul> + * <li>Codepages like UTF-8, UTF-32, and GB 18030 provide direct representations for Unicode + * code points U+10000-U+10ffff as well as for single surrogates U+d800-U+dfff. + * Each valid sequence will result in exactly one returned code point. + * If a sequence results in a single surrogate, then that will be returned + * by itself, even if a neighboring sequence encodes the matching surrogate.</li> + * <li>Codepages like SCSU and LMBCS (and UTF-16) provide direct representations only for BMP code points + * including surrogates. Code points in supplementary planes are represented with + * two sequences, each encoding a surrogate. + * For these codepages, matching pairs of surrogates will be combined into single + * code points for returning from this function. + * (Note that SCSU is actually a mix of these codepage types.)</li> + * </ul></p> + * + * @param converter an open UConverter + * @param source the address of a pointer to the codepage buffer, will be + * updated to point after the bytes consumed in the conversion call. + * @param sourceLimit points to the end of the input buffer + * @param err fills in error status (see ucnv_toUnicode) + * <code>U_INDEX_OUTOFBOUNDS_ERROR</code> will be set if the input + * is empty or does not convert to any output (e.g.: pure state-change + * codes SI/SO, escape sequences for ISO 2022, + * or if the callback did not output anything, ...). + * This function will not set a <code>U_BUFFER_OVERFLOW_ERROR</code> because + * the "buffer" is the return code. However, there might be subsequent output + * stored in the converter object + * that will be returned in following calls to this function. + * @return a UChar32 resulting from the partial conversion of source + * @see ucnv_toUnicode + * @see ucnv_toUChars + * @see ucnv_convert + * @stable ICU 2.0 + */ +U_STABLE UChar32 U_EXPORT2 +ucnv_getNextUChar(UConverter * converter, + const char **source, + const char * sourceLimit, + UErrorCode * err); + +/** + * Convert from one external charset to another using two existing UConverters. + * Internally, two conversions - ucnv_toUnicode() and ucnv_fromUnicode() - + * are used, "pivoting" through 16-bit Unicode. + * + * Important: For streaming conversion (multiple function calls for successive + * parts of a text stream), the caller must provide a pivot buffer explicitly, + * and must preserve the pivot buffer and associated pointers from one + * call to another. (The buffer may be moved if its contents and the relative + * pointer positions are preserved.) + * + * There is a similar function, ucnv_convert(), + * which has the following limitations: + * - it takes charset names, not converter objects, so that + * - two converters are opened for each call + * - only single-string conversion is possible, not streaming operation + * - it does not provide enough information to find out, + * in case of failure, whether the toUnicode or + * the fromUnicode conversion failed + * + * By contrast, ucnv_convertEx() + * - takes UConverter parameters instead of charset names + * - fully exposes the pivot buffer for streaming conversion and complete error handling + * + * ucnv_convertEx() also provides further convenience: + * - an option to reset the converters at the beginning + * (if reset==TRUE, see parameters; + * also sets *pivotTarget=*pivotSource=pivotStart) + * - allow NUL-terminated input + * (only a single NUL byte, will not work for charsets with multi-byte NULs) + * (if sourceLimit==NULL, see parameters) + * - terminate with a NUL on output + * (only a single NUL byte, not useful for charsets with multi-byte NULs), + * or set U_STRING_NOT_TERMINATED_WARNING if the output exactly fills + * the target buffer + * - the pivot buffer can be provided internally; + * possible only for whole-string conversion, not streaming conversion; + * in this case, the caller will not be able to get details about where an + * error occurred + * (if pivotStart==NULL, see below) + * + * The function returns when one of the following is true: + * - the entire source text has been converted successfully to the target buffer + * - a target buffer overflow occurred (U_BUFFER_OVERFLOW_ERROR) + * - a conversion error occurred + * (other U_FAILURE(), see description of pErrorCode) + * + * Limitation compared to the direct use of + * ucnv_fromUnicode() and ucnv_toUnicode(): + * ucnv_convertEx() does not provide offset information. + * + * Limitation compared to ucnv_fromUChars() and ucnv_toUChars(): + * ucnv_convertEx() does not support preflighting directly. + * + * Sample code for converting a single string from + * one external charset to UTF-8, ignoring the location of errors: + * + * \code + * int32_t + * myToUTF8(UConverter *cnv, + * const char *s, int32_t length, + * char *u8, int32_t capacity, + * UErrorCode *pErrorCode) { + * UConverter *utf8Cnv; + * char *target; + * + * if(U_FAILURE(*pErrorCode)) { + * return 0; + * } + * + * utf8Cnv=myGetCachedUTF8Converter(pErrorCode); + * if(U_FAILURE(*pErrorCode)) { + * return 0; + * } + * + * if(length<0) { + * length=strlen(s); + * } + * target=u8; + * ucnv_convertEx(utf8Cnv, cnv, + * &target, u8+capacity, + * &s, s+length, + * NULL, NULL, NULL, NULL, + * TRUE, TRUE, + * pErrorCode); + * + * myReleaseCachedUTF8Converter(utf8Cnv); + * + * // return the output string length, but without preflighting + * return (int32_t)(target-u8); + * } + * \endcode + * + * @param targetCnv Output converter, used to convert from the UTF-16 pivot + * to the target using ucnv_fromUnicode(). + * @param sourceCnv Input converter, used to convert from the source to + * the UTF-16 pivot using ucnv_toUnicode(). + * @param target I/O parameter, same as for ucnv_fromUChars(). + * Input: *target points to the beginning of the target buffer. + * Output: *target points to the first unit after the last char written. + * @param targetLimit Pointer to the first unit after the target buffer. + * @param source I/O parameter, same as for ucnv_toUChars(). + * Input: *source points to the beginning of the source buffer. + * Output: *source points to the first unit after the last char read. + * @param sourceLimit Pointer to the first unit after the source buffer. + * @param pivotStart Pointer to the UTF-16 pivot buffer. If pivotStart==NULL, + * then an internal buffer is used and the other pivot + * arguments are ignored and can be NULL as well. + * @param pivotSource I/O parameter, same as source in ucnv_fromUChars() for + * conversion from the pivot buffer to the target buffer. + * @param pivotTarget I/O parameter, same as target in ucnv_toUChars() for + * conversion from the source buffer to the pivot buffer. + * It must be pivotStart<=*pivotSource<=*pivotTarget<=pivotLimit + * and pivotStart<pivotLimit (unless pivotStart==NULL). + * @param pivotLimit Pointer to the first unit after the pivot buffer. + * @param reset If TRUE, then ucnv_resetToUnicode(sourceCnv) and + * ucnv_resetFromUnicode(targetCnv) are called, and the + * pivot pointers are reset (*pivotTarget=*pivotSource=pivotStart). + * @param flush If true, indicates the end of the input. + * Passed directly to ucnv_toUnicode(), and carried over to + * ucnv_fromUnicode() when the source is empty as well. + * @param pErrorCode ICU error code in/out parameter. + * Must fulfill U_SUCCESS before the function call. + * U_BUFFER_OVERFLOW_ERROR always refers to the target buffer + * because overflows into the pivot buffer are handled internally. + * Other conversion errors are from the source-to-pivot + * conversion if *pivotSource==pivotStart, otherwise from + * the pivot-to-target conversion. + * + * @see ucnv_convert + * @see ucnv_fromAlgorithmic + * @see ucnv_toAlgorithmic + * @see ucnv_fromUnicode + * @see ucnv_toUnicode + * @see ucnv_fromUChars + * @see ucnv_toUChars + * @stable ICU 2.6 + */ +U_STABLE void U_EXPORT2 +ucnv_convertEx(UConverter *targetCnv, UConverter *sourceCnv, + char **target, const char *targetLimit, + const char **source, const char *sourceLimit, + UChar *pivotStart, UChar **pivotSource, + UChar **pivotTarget, const UChar *pivotLimit, + UBool reset, UBool flush, + UErrorCode *pErrorCode); + +/** + * Convert from one external charset to another. + * Internally, two converters are opened according to the name arguments, + * then the text is converted to and from the 16-bit Unicode "pivot" + * using ucnv_convertEx(), then the converters are closed again. + * + * This is a convenience function, not an efficient way to convert a lot of text: + * ucnv_convert() + * - takes charset names, not converter objects, so that + * - two converters are opened for each call + * - only single-string conversion is possible, not streaming operation + * - does not provide enough information to find out, + * in case of failure, whether the toUnicode or + * the fromUnicode conversion failed + * - allows NUL-terminated input + * (only a single NUL byte, will not work for charsets with multi-byte NULs) + * (if sourceLength==-1, see parameters) + * - terminate with a NUL on output + * (only a single NUL byte, not useful for charsets with multi-byte NULs), + * or set U_STRING_NOT_TERMINATED_WARNING if the output exactly fills + * the target buffer + * - a pivot buffer is provided internally + * + * The function returns when one of the following is true: + * - the entire source text has been converted successfully to the target buffer + * and either the target buffer is terminated with a single NUL byte + * or the error code is set to U_STRING_NOT_TERMINATED_WARNING + * - a target buffer overflow occurred (U_BUFFER_OVERFLOW_ERROR) + * and the full output string length is returned ("preflighting") + * - a conversion error occurred + * (other U_FAILURE(), see description of pErrorCode) + * + * @param toConverterName The name of the converter that is used to convert + * from the UTF-16 pivot buffer to the target. + * @param fromConverterName The name of the converter that is used to convert + * from the source to the UTF-16 pivot buffer. + * @param target Pointer to the output buffer. + * @param targetCapacity Capacity of the target, in bytes. + * @param source Pointer to the input buffer. + * @param sourceLength Length of the input text, in bytes, or -1 for NUL-terminated input. + * @param pErrorCode ICU error code in/out parameter. + * Must fulfill U_SUCCESS before the function call. + * @return Length of the complete output text in bytes, even if it exceeds the targetCapacity + * and a U_BUFFER_OVERFLOW_ERROR is set. + * + * @see ucnv_convertEx + * @see ucnv_fromAlgorithmic + * @see ucnv_toAlgorithmic + * @see ucnv_fromUnicode + * @see ucnv_toUnicode + * @see ucnv_fromUChars + * @see ucnv_toUChars + * @see ucnv_getNextUChar + * @stable ICU 2.0 + */ +U_STABLE int32_t U_EXPORT2 +ucnv_convert(const char *toConverterName, + const char *fromConverterName, + char *target, + int32_t targetCapacity, + const char *source, + int32_t sourceLength, + UErrorCode *pErrorCode); + +/** + * Convert from one external charset to another. + * Internally, the text is converted to and from the 16-bit Unicode "pivot" + * using ucnv_convertEx(). ucnv_toAlgorithmic() works exactly like ucnv_convert() + * except that the two converters need not be looked up and opened completely. + * + * The source-to-pivot conversion uses the cnv converter parameter. + * The pivot-to-target conversion uses a purely algorithmic converter + * according to the specified type, e.g., UCNV_UTF8 for a UTF-8 converter. + * + * Internally, the algorithmic converter is opened and closed for each + * function call, which is more efficient than using the public ucnv_open() + * but somewhat less efficient than only resetting an existing converter + * and using ucnv_convertEx(). + * + * This function is more convenient than ucnv_convertEx() for single-string + * conversions, especially when "preflighting" is desired (returning the length + * of the complete output even if it does not fit into the target buffer; + * see the User Guide Strings chapter). See ucnv_convert() for details. + * + * @param algorithmicType UConverterType constant identifying the desired target + * charset as a purely algorithmic converter. + * Those are converters for Unicode charsets like + * UTF-8, BOCU-1, SCSU, UTF-7, IMAP-mailbox-name, etc., + * as well as US-ASCII and ISO-8859-1. + * @param cnv The converter that is used to convert + * from the source to the UTF-16 pivot buffer. + * @param target Pointer to the output buffer. + * @param targetCapacity Capacity of the target, in bytes. + * @param source Pointer to the input buffer. + * @param sourceLength Length of the input text, in bytes + * @param pErrorCode ICU error code in/out parameter. + * Must fulfill U_SUCCESS before the function call. + * @return Length of the complete output text in bytes, even if it exceeds the targetCapacity + * and a U_BUFFER_OVERFLOW_ERROR is set. + * + * @see ucnv_fromAlgorithmic + * @see ucnv_convert + * @see ucnv_convertEx + * @see ucnv_fromUnicode + * @see ucnv_toUnicode + * @see ucnv_fromUChars + * @see ucnv_toUChars + * @stable ICU 2.6 + */ +U_STABLE int32_t U_EXPORT2 +ucnv_toAlgorithmic(UConverterType algorithmicType, + UConverter *cnv, + char *target, int32_t targetCapacity, + const char *source, int32_t sourceLength, + UErrorCode *pErrorCode); + +/** + * Convert from one external charset to another. + * Internally, the text is converted to and from the 16-bit Unicode "pivot" + * using ucnv_convertEx(). ucnv_fromAlgorithmic() works exactly like ucnv_convert() + * except that the two converters need not be looked up and opened completely. + * + * The source-to-pivot conversion uses a purely algorithmic converter + * according to the specified type, e.g., UCNV_UTF8 for a UTF-8 converter. + * The pivot-to-target conversion uses the cnv converter parameter. + * + * Internally, the algorithmic converter is opened and closed for each + * function call, which is more efficient than using the public ucnv_open() + * but somewhat less efficient than only resetting an existing converter + * and using ucnv_convertEx(). + * + * This function is more convenient than ucnv_convertEx() for single-string + * conversions, especially when "preflighting" is desired (returning the length + * of the complete output even if it does not fit into the target buffer; + * see the User Guide Strings chapter). See ucnv_convert() for details. + * + * @param cnv The converter that is used to convert + * from the UTF-16 pivot buffer to the target. + * @param algorithmicType UConverterType constant identifying the desired source + * charset as a purely algorithmic converter. + * Those are converters for Unicode charsets like + * UTF-8, BOCU-1, SCSU, UTF-7, IMAP-mailbox-name, etc., + * as well as US-ASCII and ISO-8859-1. + * @param target Pointer to the output buffer. + * @param targetCapacity Capacity of the target, in bytes. + * @param source Pointer to the input buffer. + * @param sourceLength Length of the input text, in bytes + * @param pErrorCode ICU error code in/out parameter. + * Must fulfill U_SUCCESS before the function call. + * @return Length of the complete output text in bytes, even if it exceeds the targetCapacity + * and a U_BUFFER_OVERFLOW_ERROR is set. + * + * @see ucnv_fromAlgorithmic + * @see ucnv_convert + * @see ucnv_convertEx + * @see ucnv_fromUnicode + * @see ucnv_toUnicode + * @see ucnv_fromUChars + * @see ucnv_toUChars + * @stable ICU 2.6 + */ +U_STABLE int32_t U_EXPORT2 +ucnv_fromAlgorithmic(UConverter *cnv, + UConverterType algorithmicType, + char *target, int32_t targetCapacity, + const char *source, int32_t sourceLength, + UErrorCode *pErrorCode); + +/** + * Frees up memory occupied by unused, cached converter shared data. + * + * @return the number of cached converters successfully deleted + * @see ucnv_close + * @stable ICU 2.0 + */ +U_STABLE int32_t U_EXPORT2 +ucnv_flushCache(void); + +/** + * Returns the number of available converters, as per the alias file. + * + * @return the number of available converters + * @see ucnv_getAvailableName + * @stable ICU 2.0 + */ +U_STABLE int32_t U_EXPORT2 +ucnv_countAvailable(void); + +/** + * Gets the canonical converter name of the specified converter from a list of + * all available converters contaied in the alias file. All converters + * in this list can be opened. + * + * @param n the index to a converter available on the system (in the range <TT>[0..ucnv_countAvaiable()]</TT>) + * @return a pointer a string (library owned), or <TT>NULL</TT> if the index is out of bounds. + * @see ucnv_countAvailable + * @stable ICU 2.0 + */ +U_STABLE const char* U_EXPORT2 +ucnv_getAvailableName(int32_t n); + +/** + * Returns a UEnumeration to enumerate all of the canonical converter + * names, as per the alias file, regardless of the ability to open each + * converter. + * + * @return A UEnumeration object for getting all the recognized canonical + * converter names. + * @see ucnv_getAvailableName + * @see uenum_close + * @see uenum_next + * @stable ICU 2.4 + */ +U_STABLE UEnumeration * U_EXPORT2 +ucnv_openAllNames(UErrorCode *pErrorCode); + +/** + * Gives the number of aliases for a given converter or alias name. + * If the alias is ambiguous, then the preferred converter is used + * and the status is set to U_AMBIGUOUS_ALIAS_WARNING. + * This method only enumerates the listed entries in the alias file. + * @param alias alias name + * @param pErrorCode error status + * @return number of names on alias list for given alias + * @stable ICU 2.0 + */ +U_STABLE uint16_t U_EXPORT2 +ucnv_countAliases(const char *alias, UErrorCode *pErrorCode); + +/** + * Gives the name of the alias at given index of alias list. + * This method only enumerates the listed entries in the alias file. + * If the alias is ambiguous, then the preferred converter is used + * and the status is set to U_AMBIGUOUS_ALIAS_WARNING. + * @param alias alias name + * @param n index in alias list + * @param pErrorCode result of operation + * @return returns the name of the alias at given index + * @see ucnv_countAliases + * @stable ICU 2.0 + */ +U_STABLE const char * U_EXPORT2 +ucnv_getAlias(const char *alias, uint16_t n, UErrorCode *pErrorCode); + +/** + * Fill-up the list of alias names for the given alias. + * This method only enumerates the listed entries in the alias file. + * If the alias is ambiguous, then the preferred converter is used + * and the status is set to U_AMBIGUOUS_ALIAS_WARNING. + * @param alias alias name + * @param aliases fill-in list, aliases is a pointer to an array of + * <code>ucnv_countAliases()</code> string-pointers + * (<code>const char *</code>) that will be filled in. + * The strings themselves are owned by the library. + * @param pErrorCode result of operation + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 +ucnv_getAliases(const char *alias, const char **aliases, UErrorCode *pErrorCode); + +/** + * Return a new UEnumeration object for enumerating all the + * alias names for a given converter that are recognized by a standard. + * This method only enumerates the listed entries in the alias file. + * The convrtrs.txt file can be modified to change the results of + * this function. + * The first result in this list is the same result given by + * <code>ucnv_getStandardName</code>, which is the default alias for + * the specified standard name. The returned object must be closed with + * <code>uenum_close</code> when you are done with the object. + * + * @param convName original converter name + * @param standard name of the standard governing the names; MIME and IANA + * are such standards + * @param pErrorCode The error code + * @return A UEnumeration object for getting all aliases that are recognized + * by a standard. If any of the parameters are invalid, NULL + * is returned. + * @see ucnv_getStandardName + * @see uenum_close + * @see uenum_next + * @stable ICU 2.2 + */ +U_STABLE UEnumeration * U_EXPORT2 +ucnv_openStandardNames(const char *convName, + const char *standard, + UErrorCode *pErrorCode); + +/** + * Gives the number of standards associated to converter names. + * @return number of standards + * @stable ICU 2.0 + */ +U_STABLE uint16_t U_EXPORT2 +ucnv_countStandards(void); + +/** + * Gives the name of the standard at given index of standard list. + * @param n index in standard list + * @param pErrorCode result of operation + * @return returns the name of the standard at given index. Owned by the library. + * @stable ICU 2.0 + */ +U_STABLE const char * U_EXPORT2 +ucnv_getStandard(uint16_t n, UErrorCode *pErrorCode); + +/** + * Returns a standard name for a given converter name. + * <p> + * Example alias table:<br> + * conv alias1 { STANDARD1 } alias2 { STANDARD1* } + * <p> + * Result of ucnv_getStandardName("conv", "STANDARD1") from example + * alias table:<br> + * <b>"alias2"</b> + * + * @param name original converter name + * @param standard name of the standard governing the names; MIME and IANA + * are such standards + * @param pErrorCode result of operation + * @return returns the standard converter name; + * if a standard converter name cannot be determined, + * then <code>NULL</code> is returned. Owned by the library. + * @stable ICU 2.0 + */ +U_STABLE const char * U_EXPORT2 +ucnv_getStandardName(const char *name, const char *standard, UErrorCode *pErrorCode); + +/** + * This function will return the internal canonical converter name of the + * tagged alias. This is the opposite of ucnv_openStandardNames, which + * returns the tagged alias given the canonical name. + * <p> + * Example alias table:<br> + * conv alias1 { STANDARD1 } alias2 { STANDARD1* } + * <p> + * Result of ucnv_getStandardName("alias1", "STANDARD1") from example + * alias table:<br> + * <b>"conv"</b> + * + * @return returns the canonical converter name; + * if a standard or alias name cannot be determined, + * then <code>NULL</code> is returned. The returned string is + * owned by the library. + * @see ucnv_getStandardName + * @stable ICU 2.4 + */ +U_STABLE const char * U_EXPORT2 +ucnv_getCanonicalName(const char *alias, const char *standard, UErrorCode *pErrorCode); + +/** + * Returns the current default converter name. If you want to open + * a default converter, you do not need to use this function. + * It is faster if you pass a NULL argument to ucnv_open the + * default converter. + * + * If U_CHARSET_IS_UTF8 is defined to 1 in utypes.h then this function + * always returns "UTF-8". + * + * @return returns the current default converter name. + * Storage owned by the library + * @see ucnv_setDefaultName + * @stable ICU 2.0 + */ +U_STABLE const char * U_EXPORT2 +ucnv_getDefaultName(void); + +#ifndef U_HIDE_SYSTEM_API +/** + * This function is not thread safe. DO NOT call this function when ANY ICU + * function is being used from more than one thread! This function sets the + * current default converter name. If this function needs to be called, it + * should be called during application initialization. Most of the time, the + * results from ucnv_getDefaultName() or ucnv_open with a NULL string argument + * is sufficient for your application. + * + * If U_CHARSET_IS_UTF8 is defined to 1 in utypes.h then this function + * does nothing. + * + * @param name the converter name to be the default (must be known by ICU). + * @see ucnv_getDefaultName + * @system + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 +ucnv_setDefaultName(const char *name); +#endif /* U_HIDE_SYSTEM_API */ + +/** + * Fixes the backslash character mismapping. For example, in SJIS, the backslash + * character in the ASCII portion is also used to represent the yen currency sign. + * When mapping from Unicode character 0x005C, it's unclear whether to map the + * character back to yen or backslash in SJIS. This function will take the input + * buffer and replace all the yen sign characters with backslash. This is necessary + * when the user tries to open a file with the input buffer on Windows. + * This function will test the converter to see whether such mapping is + * required. You can sometimes avoid using this function by using the correct version + * of Shift-JIS. + * + * @param cnv The converter representing the target codepage. + * @param source the input buffer to be fixed + * @param sourceLen the length of the input buffer + * @see ucnv_isAmbiguous + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 +ucnv_fixFileSeparator(const UConverter *cnv, UChar *source, int32_t sourceLen); + +/** + * Determines if the converter contains ambiguous mappings of the same + * character or not. + * @param cnv the converter to be tested + * @return TRUE if the converter contains ambiguous mapping of the same + * character, FALSE otherwise. + * @stable ICU 2.0 + */ +U_STABLE UBool U_EXPORT2 +ucnv_isAmbiguous(const UConverter *cnv); + +/** + * Sets the converter to use fallback mappings or not. + * Regardless of this flag, the converter will always use + * fallbacks from Unicode Private Use code points, as well as + * reverse fallbacks (to Unicode). + * For details see ".ucm File Format" + * in the Conversion Data chapter of the ICU User Guide: + * http://www.icu-project.org/userguide/conversion-data.html#ucmformat + * + * @param cnv The converter to set the fallback mapping usage on. + * @param usesFallback TRUE if the user wants the converter to take advantage of the fallback + * mapping, FALSE otherwise. + * @stable ICU 2.0 + * @see ucnv_usesFallback + */ +U_STABLE void U_EXPORT2 +ucnv_setFallback(UConverter *cnv, UBool usesFallback); + +/** + * Determines if the converter uses fallback mappings or not. + * This flag has restrictions, see ucnv_setFallback(). + * + * @param cnv The converter to be tested + * @return TRUE if the converter uses fallback, FALSE otherwise. + * @stable ICU 2.0 + * @see ucnv_setFallback + */ +U_STABLE UBool U_EXPORT2 +ucnv_usesFallback(const UConverter *cnv); + +/** + * Detects Unicode signature byte sequences at the start of the byte stream + * and returns the charset name of the indicated Unicode charset. + * NULL is returned when no Unicode signature is recognized. + * The number of bytes in the signature is output as well. + * + * The caller can ucnv_open() a converter using the charset name. + * The first code unit (UChar) from the start of the stream will be U+FEFF + * (the Unicode BOM/signature character) and can usually be ignored. + * + * For most Unicode charsets it is also possible to ignore the indicated + * number of initial stream bytes and start converting after them. + * However, there are stateful Unicode charsets (UTF-7 and BOCU-1) for which + * this will not work. Therefore, it is best to ignore the first output UChar + * instead of the input signature bytes. + * <p> + * Usage: + * \snippet samples/ucnv/convsamp.cpp ucnv_detectUnicodeSignature + * + * @param source The source string in which the signature should be detected. + * @param sourceLength Length of the input string, or -1 if terminated with a NUL byte. + * @param signatureLength A pointer to int32_t to receive the number of bytes that make up the signature + * of the detected UTF. 0 if not detected. + * Can be a NULL pointer. + * @param pErrorCode ICU error code in/out parameter. + * Must fulfill U_SUCCESS before the function call. + * @return The name of the encoding detected. NULL if encoding is not detected. + * @stable ICU 2.4 + */ +U_STABLE const char* U_EXPORT2 +ucnv_detectUnicodeSignature(const char* source, + int32_t sourceLength, + int32_t *signatureLength, + UErrorCode *pErrorCode); + +/** + * Returns the number of UChars held in the converter's internal state + * because more input is needed for completing the conversion. This function is + * useful for mapping semantics of ICU's converter interface to those of iconv, + * and this information is not needed for normal conversion. + * @param cnv The converter in which the input is held + * @param status ICU error code in/out parameter. + * Must fulfill U_SUCCESS before the function call. + * @return The number of UChars in the state. -1 if an error is encountered. + * @stable ICU 3.4 + */ +U_STABLE int32_t U_EXPORT2 +ucnv_fromUCountPending(const UConverter* cnv, UErrorCode* status); + +/** + * Returns the number of chars held in the converter's internal state + * because more input is needed for completing the conversion. This function is + * useful for mapping semantics of ICU's converter interface to those of iconv, + * and this information is not needed for normal conversion. + * @param cnv The converter in which the input is held as internal state + * @param status ICU error code in/out parameter. + * Must fulfill U_SUCCESS before the function call. + * @return The number of chars in the state. -1 if an error is encountered. + * @stable ICU 3.4 + */ +U_STABLE int32_t U_EXPORT2 +ucnv_toUCountPending(const UConverter* cnv, UErrorCode* status); + +/** + * Returns whether or not the charset of the converter has a fixed number of bytes + * per charset character. + * An example of this are converters that are of the type UCNV_SBCS or UCNV_DBCS. + * Another example is UTF-32 which is always 4 bytes per character. + * A Unicode code point may be represented by more than one UTF-8 or UTF-16 code unit + * but a UTF-32 converter encodes each code point with 4 bytes. + * Note: This method is not intended to be used to determine whether the charset has a + * fixed ratio of bytes to Unicode codes <i>units</i> for any particular Unicode encoding form. + * FALSE is returned with the UErrorCode if error occurs or cnv is NULL. + * @param cnv The converter to be tested + * @param status ICU error code in/out paramter + * @return TRUE if the converter is fixed-width + * @stable ICU 4.8 + */ +U_STABLE UBool U_EXPORT2 +ucnv_isFixedWidth(UConverter *cnv, UErrorCode *status); + +#endif + +#endif +/*_UCNV*/ diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/ucnv_err.h b/src/core/basetypes/icu-ucsdet/include/unicode/ucnv_err.h new file mode 100644 index 00000000..e092e95f --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/ucnv_err.h @@ -0,0 +1,463 @@ +/* +********************************************************************** +* Copyright (C) 1999-2009, International Business Machines +* Corporation and others. All Rights Reserved. +********************************************************************** + * + * + * ucnv_err.h: + */ + +/** + * \file + * \brief C UConverter predefined error callbacks + * + * <h2>Error Behaviour Functions</h2> + * Defines some error behaviour functions called by ucnv_{from,to}Unicode + * These are provided as part of ICU and many are stable, but they + * can also be considered only as an example of what can be done with + * callbacks. You may of course write your own. + * + * If you want to write your own, you may also find the functions from + * ucnv_cb.h useful when writing your own callbacks. + * + * These functions, although public, should NEVER be called directly. + * They should be used as parameters to the ucnv_setFromUCallback + * and ucnv_setToUCallback functions, to set the behaviour of a converter + * when it encounters ILLEGAL/UNMAPPED/INVALID sequences. + * + * usage example: 'STOP' doesn't need any context, but newContext + * could be set to something other than 'NULL' if needed. The available + * contexts in this header can modify the default behavior of the callback. + * + * \code + * UErrorCode err = U_ZERO_ERROR; + * UConverter *myConverter = ucnv_open("ibm-949", &err); + * const void *oldContext; + * UConverterFromUCallback oldAction; + * + * + * if (U_SUCCESS(err)) + * { + * ucnv_setFromUCallBack(myConverter, + * UCNV_FROM_U_CALLBACK_STOP, + * NULL, + * &oldAction, + * &oldContext, + * &status); + * } + * \endcode + * + * The code above tells "myConverter" to stop when it encounters an + * ILLEGAL/TRUNCATED/INVALID sequences when it is used to convert from + * Unicode -> Codepage. The behavior from Codepage to Unicode is not changed, + * and ucnv_setToUCallBack would need to be called in order to change + * that behavior too. + * + * Here is an example with a context: + * + * \code + * UErrorCode err = U_ZERO_ERROR; + * UConverter *myConverter = ucnv_open("ibm-949", &err); + * const void *oldContext; + * UConverterFromUCallback oldAction; + * + * + * if (U_SUCCESS(err)) + * { + * ucnv_setToUCallBack(myConverter, + * UCNV_TO_U_CALLBACK_SUBSTITUTE, + * UCNV_SUB_STOP_ON_ILLEGAL, + * &oldAction, + * &oldContext, + * &status); + * } + * \endcode + * + * The code above tells "myConverter" to stop when it encounters an + * ILLEGAL/TRUNCATED/INVALID sequences when it is used to convert from + * Codepage -> Unicode. Any unmapped and legal characters will be + * substituted to be the default substitution character. + */ + +#ifndef UCNV_ERR_H +#define UCNV_ERR_H + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_CONVERSION + +/** Forward declaring the UConverter structure. @stable ICU 2.0 */ +struct UConverter; + +/** @stable ICU 2.0 */ +typedef struct UConverter UConverter; + +/** + * FROM_U, TO_U context options for sub callback + * @stable ICU 2.0 + */ +#define UCNV_SUB_STOP_ON_ILLEGAL "i" + +/** + * FROM_U, TO_U context options for skip callback + * @stable ICU 2.0 + */ +#define UCNV_SKIP_STOP_ON_ILLEGAL "i" + +/** + * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to ICU (%UXXXX) + * @stable ICU 2.0 + */ +#define UCNV_ESCAPE_ICU NULL +/** + * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to JAVA (\\uXXXX) + * @stable ICU 2.0 + */ +#define UCNV_ESCAPE_JAVA "J" +/** + * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to C (\\uXXXX \\UXXXXXXXX) + * TO_U_CALLBACK_ESCAPE option to escape the character value accoding to C (\\xXXXX) + * @stable ICU 2.0 + */ +#define UCNV_ESCAPE_C "C" +/** + * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Decimal escape \htmlonly(&#DDDD;)\endhtmlonly + * TO_U_CALLBACK_ESCAPE context option to escape the character value accoding to XML Decimal escape \htmlonly(&#DDDD;)\endhtmlonly + * @stable ICU 2.0 + */ +#define UCNV_ESCAPE_XML_DEC "D" +/** + * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Hex escape \htmlonly(&#xXXXX;)\endhtmlonly + * TO_U_CALLBACK_ESCAPE context option to escape the character value accoding to XML Hex escape \htmlonly(&#xXXXX;)\endhtmlonly + * @stable ICU 2.0 + */ +#define UCNV_ESCAPE_XML_HEX "X" +/** + * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to Unicode (U+XXXXX) + * @stable ICU 2.0 + */ +#define UCNV_ESCAPE_UNICODE "U" + +/** + * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to CSS2 conventions (\\HH..H<space>, that is, + * a backslash, 1..6 hex digits, and a space) + * @stable ICU 4.0 + */ +#define UCNV_ESCAPE_CSS2 "S" + +/** + * The process condition code to be used with the callbacks. + * Codes which are greater than UCNV_IRREGULAR should be + * passed on to any chained callbacks. + * @stable ICU 2.0 + */ +typedef enum { + UCNV_UNASSIGNED = 0, /**< The code point is unassigned. + The error code U_INVALID_CHAR_FOUND will be set. */ + UCNV_ILLEGAL = 1, /**< The code point is illegal. For example, + \\x81\\x2E is illegal in SJIS because \\x2E + is not a valid trail byte for the \\x81 + lead byte. + Also, starting with Unicode 3.0.1, non-shortest byte sequences + in UTF-8 (like \\xC1\\xA1 instead of \\x61 for U+0061) + are also illegal, not just irregular. + The error code U_ILLEGAL_CHAR_FOUND will be set. */ + UCNV_IRREGULAR = 2, /**< The codepoint is not a regular sequence in + the encoding. For example, \\xED\\xA0\\x80..\\xED\\xBF\\xBF + are irregular UTF-8 byte sequences for single surrogate + code points. + The error code U_INVALID_CHAR_FOUND will be set. */ + UCNV_RESET = 3, /**< The callback is called with this reason when a + 'reset' has occured. Callback should reset all + state. */ + UCNV_CLOSE = 4, /**< Called when the converter is closed. The + callback should release any allocated memory.*/ + UCNV_CLONE = 5 /**< Called when ucnv_safeClone() is called on the + converter. the pointer available as the + 'context' is an alias to the original converters' + context pointer. If the context must be owned + by the new converter, the callback must clone + the data and call ucnv_setFromUCallback + (or setToUCallback) with the correct pointer. + @stable ICU 2.2 + */ +} UConverterCallbackReason; + + +/** + * The structure for the fromUnicode callback function parameter. + * @stable ICU 2.0 + */ +typedef struct { + uint16_t size; /**< The size of this struct. @stable ICU 2.0 */ + UBool flush; /**< The internal state of converter will be reset and data flushed if set to TRUE. @stable ICU 2.0 */ + UConverter *converter; /**< Pointer to the converter that is opened and to which this struct is passed as an argument. @stable ICU 2.0 */ + const UChar *source; /**< Pointer to the source source buffer. @stable ICU 2.0 */ + const UChar *sourceLimit; /**< Pointer to the limit (end + 1) of source buffer. @stable ICU 2.0 */ + char *target; /**< Pointer to the target buffer. @stable ICU 2.0 */ + const char *targetLimit; /**< Pointer to the limit (end + 1) of target buffer. @stable ICU 2.0 */ + int32_t *offsets; /**< Pointer to the buffer that recieves the offsets. *offset = blah ; offset++;. @stable ICU 2.0 */ +} UConverterFromUnicodeArgs; + + +/** + * The structure for the toUnicode callback function parameter. + * @stable ICU 2.0 + */ +typedef struct { + uint16_t size; /**< The size of this struct @stable ICU 2.0 */ + UBool flush; /**< The internal state of converter will be reset and data flushed if set to TRUE. @stable ICU 2.0 */ + UConverter *converter; /**< Pointer to the converter that is opened and to which this struct is passed as an argument. @stable ICU 2.0 */ + const char *source; /**< Pointer to the source source buffer. @stable ICU 2.0 */ + const char *sourceLimit; /**< Pointer to the limit (end + 1) of source buffer. @stable ICU 2.0 */ + UChar *target; /**< Pointer to the target buffer. @stable ICU 2.0 */ + const UChar *targetLimit; /**< Pointer to the limit (end + 1) of target buffer. @stable ICU 2.0 */ + int32_t *offsets; /**< Pointer to the buffer that recieves the offsets. *offset = blah ; offset++;. @stable ICU 2.0 */ +} UConverterToUnicodeArgs; + + +/** + * DO NOT CALL THIS FUNCTION DIRECTLY! + * This From Unicode callback STOPS at the ILLEGAL_SEQUENCE, + * returning the error code back to the caller immediately. + * + * @param context Pointer to the callback's private data + * @param fromUArgs Information about the conversion in progress + * @param codeUnits Points to 'length' UChars of the concerned Unicode sequence + * @param length Size (in bytes) of the concerned codepage sequence + * @param codePoint Single UChar32 (UTF-32) containing the concerend Unicode codepoint. + * @param reason Defines the reason the callback was invoked + * @param err This should always be set to a failure status prior to calling. + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 UCNV_FROM_U_CALLBACK_STOP ( + const void *context, + UConverterFromUnicodeArgs *fromUArgs, + const UChar* codeUnits, + int32_t length, + UChar32 codePoint, + UConverterCallbackReason reason, + UErrorCode * err); + + + +/** + * DO NOT CALL THIS FUNCTION DIRECTLY! + * This To Unicode callback STOPS at the ILLEGAL_SEQUENCE, + * returning the error code back to the caller immediately. + * + * @param context Pointer to the callback's private data + * @param toUArgs Information about the conversion in progress + * @param codeUnits Points to 'length' bytes of the concerned codepage sequence + * @param length Size (in bytes) of the concerned codepage sequence + * @param reason Defines the reason the callback was invoked + * @param err This should always be set to a failure status prior to calling. + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 UCNV_TO_U_CALLBACK_STOP ( + const void *context, + UConverterToUnicodeArgs *toUArgs, + const char* codeUnits, + int32_t length, + UConverterCallbackReason reason, + UErrorCode * err); + +/** + * DO NOT CALL THIS FUNCTION DIRECTLY! + * This From Unicode callback skips any ILLEGAL_SEQUENCE, or + * skips only UNASSINGED_SEQUENCE depending on the context parameter + * simply ignoring those characters. + * + * @param context The function currently recognizes the callback options: + * UCNV_SKIP_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE, + * returning the error code back to the caller immediately. + * NULL: Skips any ILLEGAL_SEQUENCE + * @param fromUArgs Information about the conversion in progress + * @param codeUnits Points to 'length' UChars of the concerned Unicode sequence + * @param length Size (in bytes) of the concerned codepage sequence + * @param codePoint Single UChar32 (UTF-32) containing the concerend Unicode codepoint. + * @param reason Defines the reason the callback was invoked + * @param err Return value will be set to success if the callback was handled, + * otherwise this value will be set to a failure status. + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 UCNV_FROM_U_CALLBACK_SKIP ( + const void *context, + UConverterFromUnicodeArgs *fromUArgs, + const UChar* codeUnits, + int32_t length, + UChar32 codePoint, + UConverterCallbackReason reason, + UErrorCode * err); + +/** + * DO NOT CALL THIS FUNCTION DIRECTLY! + * This From Unicode callback will Substitute the ILLEGAL SEQUENCE, or + * UNASSIGNED_SEQUENCE depending on context parameter, with the + * current substitution string for the converter. This is the default + * callback. + * + * @param context The function currently recognizes the callback options: + * UCNV_SUB_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE, + * returning the error code back to the caller immediately. + * NULL: Substitutes any ILLEGAL_SEQUENCE + * @param fromUArgs Information about the conversion in progress + * @param codeUnits Points to 'length' UChars of the concerned Unicode sequence + * @param length Size (in bytes) of the concerned codepage sequence + * @param codePoint Single UChar32 (UTF-32) containing the concerend Unicode codepoint. + * @param reason Defines the reason the callback was invoked + * @param err Return value will be set to success if the callback was handled, + * otherwise this value will be set to a failure status. + * @see ucnv_setSubstChars + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 UCNV_FROM_U_CALLBACK_SUBSTITUTE ( + const void *context, + UConverterFromUnicodeArgs *fromUArgs, + const UChar* codeUnits, + int32_t length, + UChar32 codePoint, + UConverterCallbackReason reason, + UErrorCode * err); + +/** + * DO NOT CALL THIS FUNCTION DIRECTLY! + * This From Unicode callback will Substitute the ILLEGAL SEQUENCE with the + * hexadecimal representation of the illegal codepoints + * + * @param context The function currently recognizes the callback options: + * <ul> + * <li>UCNV_ESCAPE_ICU: Substitues the ILLEGAL SEQUENCE with the hexadecimal + * representation in the format %UXXXX, e.g. "%uFFFE%u00AC%uC8FE"). + * In the Event the converter doesn't support the characters {%,U}[A-F][0-9], + * it will substitute the illegal sequence with the substitution characters. + * Note that codeUnit(32bit int eg: unit of a surrogate pair) is represented as + * %UD84D%UDC56</li> + * <li>UCNV_ESCAPE_JAVA: Substitues the ILLEGAL SEQUENCE with the hexadecimal + * representation in the format \\uXXXX, e.g. "\\uFFFE\\u00AC\\uC8FE"). + * In the Event the converter doesn't support the characters {\,u}[A-F][0-9], + * it will substitute the illegal sequence with the substitution characters. + * Note that codeUnit(32bit int eg: unit of a surrogate pair) is represented as + * \\uD84D\\uDC56</li> + * <li>UCNV_ESCAPE_C: Substitues the ILLEGAL SEQUENCE with the hexadecimal + * representation in the format \\uXXXX, e.g. "\\uFFFE\\u00AC\\uC8FE"). + * In the Event the converter doesn't support the characters {\,u,U}[A-F][0-9], + * it will substitute the illegal sequence with the substitution characters. + * Note that codeUnit(32bit int eg: unit of a surrogate pair) is represented as + * \\U00023456</li> + * <li>UCNV_ESCAPE_XML_DEC: Substitues the ILLEGAL SEQUENCE with the decimal + * representation in the format \htmlonly&#DDDDDDDD;, e.g. "&#65534;&#172;&#51454;")\endhtmlonly. + * In the Event the converter doesn't support the characters {&,#}[0-9], + * it will substitute the illegal sequence with the substitution characters. + * Note that codeUnit(32bit int eg: unit of a surrogate pair) is represented as + * &#144470; and Zero padding is ignored.</li> + * <li>UCNV_ESCAPE_XML_HEX:Substitues the ILLEGAL SEQUENCE with the decimal + * representation in the format \htmlonly&#xXXXX; e.g. "&#xFFFE;&#x00AC;&#xC8FE;")\endhtmlonly. + * In the Event the converter doesn't support the characters {&,#,x}[0-9], + * it will substitute the illegal sequence with the substitution characters. + * Note that codeUnit(32bit int eg: unit of a surrogate pair) is represented as + * \htmlonly&#x23456;\endhtmlonly</li> + * </ul> + * @param fromUArgs Information about the conversion in progress + * @param codeUnits Points to 'length' UChars of the concerned Unicode sequence + * @param length Size (in bytes) of the concerned codepage sequence + * @param codePoint Single UChar32 (UTF-32) containing the concerend Unicode codepoint. + * @param reason Defines the reason the callback was invoked + * @param err Return value will be set to success if the callback was handled, + * otherwise this value will be set to a failure status. + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 UCNV_FROM_U_CALLBACK_ESCAPE ( + const void *context, + UConverterFromUnicodeArgs *fromUArgs, + const UChar* codeUnits, + int32_t length, + UChar32 codePoint, + UConverterCallbackReason reason, + UErrorCode * err); + + +/** + * DO NOT CALL THIS FUNCTION DIRECTLY! + * This To Unicode callback skips any ILLEGAL_SEQUENCE, or + * skips only UNASSINGED_SEQUENCE depending on the context parameter + * simply ignoring those characters. + * + * @param context The function currently recognizes the callback options: + * UCNV_SKIP_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE, + * returning the error code back to the caller immediately. + * NULL: Skips any ILLEGAL_SEQUENCE + * @param toUArgs Information about the conversion in progress + * @param codeUnits Points to 'length' bytes of the concerned codepage sequence + * @param length Size (in bytes) of the concerned codepage sequence + * @param reason Defines the reason the callback was invoked + * @param err Return value will be set to success if the callback was handled, + * otherwise this value will be set to a failure status. + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 UCNV_TO_U_CALLBACK_SKIP ( + const void *context, + UConverterToUnicodeArgs *toUArgs, + const char* codeUnits, + int32_t length, + UConverterCallbackReason reason, + UErrorCode * err); + +/** + * DO NOT CALL THIS FUNCTION DIRECTLY! + * This To Unicode callback will Substitute the ILLEGAL SEQUENCE,or + * UNASSIGNED_SEQUENCE depending on context parameter, with the + * Unicode substitution character, U+FFFD. + * + * @param context The function currently recognizes the callback options: + * UCNV_SUB_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE, + * returning the error code back to the caller immediately. + * NULL: Substitutes any ILLEGAL_SEQUENCE + * @param toUArgs Information about the conversion in progress + * @param codeUnits Points to 'length' bytes of the concerned codepage sequence + * @param length Size (in bytes) of the concerned codepage sequence + * @param reason Defines the reason the callback was invoked + * @param err Return value will be set to success if the callback was handled, + * otherwise this value will be set to a failure status. + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 UCNV_TO_U_CALLBACK_SUBSTITUTE ( + const void *context, + UConverterToUnicodeArgs *toUArgs, + const char* codeUnits, + int32_t length, + UConverterCallbackReason reason, + UErrorCode * err); + +/** + * DO NOT CALL THIS FUNCTION DIRECTLY! + * This To Unicode callback will Substitute the ILLEGAL SEQUENCE with the + * hexadecimal representation of the illegal bytes + * (in the format %XNN, e.g. "%XFF%X0A%XC8%X03"). + * + * @param context This function currently recognizes the callback options: + * UCNV_ESCAPE_ICU, UCNV_ESCAPE_JAVA, UCNV_ESCAPE_C, UCNV_ESCAPE_XML_DEC, + * UCNV_ESCAPE_XML_HEX and UCNV_ESCAPE_UNICODE. + * @param toUArgs Information about the conversion in progress + * @param codeUnits Points to 'length' bytes of the concerned codepage sequence + * @param length Size (in bytes) of the concerned codepage sequence + * @param reason Defines the reason the callback was invoked + * @param err Return value will be set to success if the callback was handled, + * otherwise this value will be set to a failure status. + * @stable ICU 2.0 + */ + +U_STABLE void U_EXPORT2 UCNV_TO_U_CALLBACK_ESCAPE ( + const void *context, + UConverterToUnicodeArgs *toUArgs, + const char* codeUnits, + int32_t length, + UConverterCallbackReason reason, + UErrorCode * err); + +#endif + +#endif + +/*UCNV_ERR_H*/ diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/uconfig.h b/src/core/basetypes/icu-ucsdet/include/unicode/uconfig.h new file mode 100644 index 00000000..ed073b63 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/uconfig.h @@ -0,0 +1,430 @@ +/* +********************************************************************** +* Copyright (C) 2002-2014, International Business Machines +* Corporation and others. All Rights Reserved. +********************************************************************** +* file name: uconfig.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2002sep19 +* created by: Markus W. Scherer +*/ + +#ifndef __UCONFIG_H__ +#define __UCONFIG_H__ + + +/*! + * \file + * \brief User-configurable settings + * + * Miscellaneous switches: + * + * A number of macros affect a variety of minor aspects of ICU. + * Most of them used to be defined elsewhere (e.g., in utypes.h or platform.h) + * and moved here to make them easier to find. + * + * Switches for excluding parts of ICU library code modules: + * + * Changing these macros allows building partial, smaller libraries for special purposes. + * By default, all modules are built. + * The switches are fairly coarse, controlling large modules. + * Basic services cannot be turned off. + * + * Building with any of these options does not guarantee that the + * ICU build process will completely work. It is recommended that + * the ICU libraries and data be built using the normal build. + * At that time you should remove the data used by those services. + * After building the ICU data library, you should rebuild the ICU + * libraries with these switches customized to your needs. + * + * @stable ICU 2.4 + */ + +/** + * If this switch is defined, ICU will attempt to load a header file named "uconfig_local.h" + * prior to determining default settings for uconfig variables. + * + * @internal ICU 4.0 + */ +#if defined(UCONFIG_USE_LOCAL) +#include "uconfig_local.h" +#endif + +/** + * \def U_DEBUG + * Determines whether to include debugging code. + * Automatically set on Windows, but most compilers do not have + * related predefined macros. + * @internal + */ +#ifdef U_DEBUG + /* Use the predefined value. */ +#elif defined(_DEBUG) + /* + * _DEBUG is defined by Visual Studio debug compilation. + * Do *not* test for its NDEBUG macro: It is an orthogonal macro + * which disables assert(). + */ +# define U_DEBUG 1 +# else +# define U_DEBUG 0 +#endif + +/** + * Determines wheter to enable auto cleanup of libraries. + * @internal + */ +#ifndef UCLN_NO_AUTO_CLEANUP +#define UCLN_NO_AUTO_CLEANUP 1 +#endif + +/** + * \def U_DISABLE_RENAMING + * Determines whether to disable renaming or not. + * @internal + */ +#ifndef U_DISABLE_RENAMING +#define U_DISABLE_RENAMING 0 +#endif + +/** + * \def U_NO_DEFAULT_INCLUDE_UTF_HEADERS + * Determines whether utypes.h includes utf.h, utf8.h, utf16.h and utf_old.h. + * utypes.h includes those headers if this macro is defined to 0. + * Otherwise, each those headers must be included explicitly when using one of their macros. + * Defaults to 0 for backward compatibility, except inside ICU. + * @stable ICU 49 + */ +#ifdef U_NO_DEFAULT_INCLUDE_UTF_HEADERS + /* Use the predefined value. */ +#elif defined(U_COMBINED_IMPLEMENTATION) || defined(U_COMMON_IMPLEMENTATION) || defined(U_I18N_IMPLEMENTATION) || \ + defined(U_IO_IMPLEMENTATION) || defined(U_LAYOUT_IMPLEMENTATION) || defined(U_LAYOUTEX_IMPLEMENTATION) || \ + defined(U_TOOLUTIL_IMPLEMENTATION) +# define U_NO_DEFAULT_INCLUDE_UTF_HEADERS 1 +#else +# define U_NO_DEFAULT_INCLUDE_UTF_HEADERS 0 +#endif + +/** + * \def U_OVERRIDE_CXX_ALLOCATION + * Determines whether to override new and delete. + * ICU is normally built such that all of its C++ classes, via their UMemory base, + * override operators new and delete to use its internal, customizable, + * non-exception-throwing memory allocation functions. (Default value 1 for this macro.) + * + * This is especially important when the application and its libraries use multiple heaps. + * For example, on Windows, this allows the ICU DLL to be used by + * applications that statically link the C Runtime library. + * + * @stable ICU 2.2 + */ +#ifndef U_OVERRIDE_CXX_ALLOCATION +#define U_OVERRIDE_CXX_ALLOCATION 1 +#endif + +/** + * \def U_ENABLE_TRACING + * Determines whether to enable tracing. + * @internal + */ +#ifndef U_ENABLE_TRACING +#define U_ENABLE_TRACING 0 +#endif + +/** + * \def U_ENABLE_DYLOAD + * Whether to enable Dynamic loading in ICU. + * @internal + */ +#ifndef U_ENABLE_DYLOAD +#define U_ENABLE_DYLOAD 1 +#endif + +/** + * \def U_CHECK_DYLOAD + * Whether to test Dynamic loading as an OS capability. + * @internal + */ +#ifndef U_CHECK_DYLOAD +#define U_CHECK_DYLOAD 1 +#endif + + +/** + * \def U_DEFAULT_SHOW_DRAFT + * Do we allow ICU users to use the draft APIs by default? + * @internal + */ +#ifndef U_DEFAULT_SHOW_DRAFT +#define U_DEFAULT_SHOW_DRAFT 1 +#endif + +/*===========================================================================*/ +/* Custom icu entry point renaming */ +/*===========================================================================*/ + +/** + * \def U_HAVE_LIB_SUFFIX + * 1 if a custom library suffix is set. + * @internal + */ +#ifdef U_HAVE_LIB_SUFFIX + /* Use the predefined value. */ +#elif defined(U_LIB_SUFFIX_C_NAME) +# define U_HAVE_LIB_SUFFIX 1 +#endif + +/** + * \def U_LIB_SUFFIX_C_NAME_STRING + * Defines the library suffix as a string with C syntax. + * @internal + */ +#ifdef U_LIB_SUFFIX_C_NAME_STRING + /* Use the predefined value. */ +#elif defined(U_LIB_SUFFIX_C_NAME) +# define CONVERT_TO_STRING(s) #s +# define U_LIB_SUFFIX_C_NAME_STRING CONVERT_TO_STRING(U_LIB_SUFFIX_C_NAME) +#else +# define U_LIB_SUFFIX_C_NAME_STRING "" +#endif + +/* common/i18n library switches --------------------------------------------- */ + +/** + * \def UCONFIG_ONLY_COLLATION + * This switch turns off modules that are not needed for collation. + * + * It does not turn off legacy conversion because that is necessary + * for ICU to work on EBCDIC platforms (for the default converter). + * If you want "only collation" and do not build for EBCDIC, + * then you can define UCONFIG_NO_LEGACY_CONVERSION 1 as well. + * + * @stable ICU 2.4 + */ +#ifndef UCONFIG_ONLY_COLLATION +# define UCONFIG_ONLY_COLLATION 0 +#endif + +#if UCONFIG_ONLY_COLLATION + /* common library */ +# define UCONFIG_NO_BREAK_ITERATION 1 +# define UCONFIG_NO_IDNA 1 + + /* i18n library */ +# if UCONFIG_NO_COLLATION +# error Contradictory collation switches in uconfig.h. +# endif +# define UCONFIG_NO_FORMATTING 1 +# define UCONFIG_NO_TRANSLITERATION 1 +# define UCONFIG_NO_REGULAR_EXPRESSIONS 1 +#endif + +/* common library switches -------------------------------------------------- */ + +/** + * \def UCONFIG_NO_FILE_IO + * This switch turns off all file access in the common library + * where file access is only used for data loading. + * ICU data must then be provided in the form of a data DLL (or with an + * equivalent way to link to the data residing in an executable, + * as in building a combined library with both the common library's code and + * the data), or via udata_setCommonData(). + * Application data must be provided via udata_setAppData() or by using + * "open" functions that take pointers to data, for example ucol_openBinary(). + * + * File access is not used at all in the i18n library. + * + * File access cannot be turned off for the icuio library or for the ICU + * test suites and ICU tools. + * + * @stable ICU 3.6 + */ +#ifndef UCONFIG_NO_FILE_IO +# define UCONFIG_NO_FILE_IO 0 +#endif + +#if UCONFIG_NO_FILE_IO && defined(U_TIMEZONE_FILES_DIR) +# error Contradictory file io switches in uconfig.h. +#endif + +/** + * \def UCONFIG_NO_CONVERSION + * ICU will not completely build with this switch turned on. + * This switch turns off all converters. + * + * You may want to use this together with U_CHARSET_IS_UTF8 defined to 1 + * in utypes.h if char* strings in your environment are always in UTF-8. + * + * @stable ICU 3.2 + * @see U_CHARSET_IS_UTF8 + */ +#ifndef UCONFIG_NO_CONVERSION +# define UCONFIG_NO_CONVERSION 0 +#endif + +#if UCONFIG_NO_CONVERSION +# define UCONFIG_NO_LEGACY_CONVERSION 1 +#endif + +/** + * \def UCONFIG_NO_LEGACY_CONVERSION + * This switch turns off all converters except for + * - Unicode charsets (UTF-7/8/16/32, CESU-8, SCSU, BOCU-1) + * - US-ASCII + * - ISO-8859-1 + * + * Turning off legacy conversion is not possible on EBCDIC platforms + * because they need ibm-37 or ibm-1047 default converters. + * + * @stable ICU 2.4 + */ +#ifndef UCONFIG_NO_LEGACY_CONVERSION +# define UCONFIG_NO_LEGACY_CONVERSION 0 +#endif + +/** + * \def UCONFIG_NO_NORMALIZATION + * This switch turns off normalization. + * It implies turning off several other services as well, for example + * collation and IDNA. + * + * @stable ICU 2.6 + */ +#ifndef UCONFIG_NO_NORMALIZATION +# define UCONFIG_NO_NORMALIZATION 0 +#elif UCONFIG_NO_NORMALIZATION + /* common library */ + /* ICU 50 CJK dictionary BreakIterator uses normalization */ +# define UCONFIG_NO_BREAK_ITERATION 1 + /* IDNA (UTS #46) is implemented via normalization */ +# define UCONFIG_NO_IDNA 1 + + /* i18n library */ +# if UCONFIG_ONLY_COLLATION +# error Contradictory collation switches in uconfig.h. +# endif +# define UCONFIG_NO_COLLATION 1 +# define UCONFIG_NO_TRANSLITERATION 1 +#endif + +/** + * \def UCONFIG_NO_BREAK_ITERATION + * This switch turns off break iteration. + * + * @stable ICU 2.4 + */ +#ifndef UCONFIG_NO_BREAK_ITERATION +# define UCONFIG_NO_BREAK_ITERATION 0 +#endif + +/** + * \def UCONFIG_NO_IDNA + * This switch turns off IDNA. + * + * @stable ICU 2.6 + */ +#ifndef UCONFIG_NO_IDNA +# define UCONFIG_NO_IDNA 0 +#endif + +/** + * \def UCONFIG_MSGPAT_DEFAULT_APOSTROPHE_MODE + * Determines the default UMessagePatternApostropheMode. + * See the documentation for that enum. + * + * @stable ICU 4.8 + */ +#ifndef UCONFIG_MSGPAT_DEFAULT_APOSTROPHE_MODE +# define UCONFIG_MSGPAT_DEFAULT_APOSTROPHE_MODE UMSGPAT_APOS_DOUBLE_OPTIONAL +#endif + +/* i18n library switches ---------------------------------------------------- */ + +/** + * \def UCONFIG_NO_COLLATION + * This switch turns off collation and collation-based string search. + * + * @stable ICU 2.4 + */ +#ifndef UCONFIG_NO_COLLATION +# define UCONFIG_NO_COLLATION 0 +#endif + +/** + * \def UCONFIG_NO_FORMATTING + * This switch turns off formatting and calendar/timezone services. + * + * @stable ICU 2.4 + */ +#ifndef UCONFIG_NO_FORMATTING +# define UCONFIG_NO_FORMATTING 0 +#endif + +/** + * \def UCONFIG_NO_TRANSLITERATION + * This switch turns off transliteration. + * + * @stable ICU 2.4 + */ +#ifndef UCONFIG_NO_TRANSLITERATION +# define UCONFIG_NO_TRANSLITERATION 0 +#endif + +/** + * \def UCONFIG_NO_REGULAR_EXPRESSIONS + * This switch turns off regular expressions. + * + * @stable ICU 2.4 + */ +#ifndef UCONFIG_NO_REGULAR_EXPRESSIONS +# define UCONFIG_NO_REGULAR_EXPRESSIONS 0 +#endif + +/** + * \def UCONFIG_NO_SERVICE + * This switch turns off service registration. + * + * @stable ICU 3.2 + */ +#ifndef UCONFIG_NO_SERVICE +# define UCONFIG_NO_SERVICE 0 +#endif + +/** + * \def UCONFIG_HAVE_PARSEALLINPUT + * This switch turns on the "parse all input" attribute. Binary incompatible. + * + * @internal + */ +#ifndef UCONFIG_HAVE_PARSEALLINPUT +# define UCONFIG_HAVE_PARSEALLINPUT 1 +#endif + + +/** + * \def UCONFIG_FORMAT_FASTPATHS_49 + * This switch turns on other formatting fastpaths. Binary incompatible in object DecimalFormat and DecimalFormatSymbols + * + * @internal + */ +#ifndef UCONFIG_FORMAT_FASTPATHS_49 +# define UCONFIG_FORMAT_FASTPATHS_49 1 +#endif + +/** + * \def UCONFIG_NO_FILTERED_BREAK_ITERATION + * This switch turns off filtered break iteration code. + * + * @internal + */ +#ifndef UCONFIG_NO_FILTERED_BREAK_ITERATION +# define UCONFIG_NO_FILTERED_BREAK_ITERATION 0 + + + +#endif + +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/ucsdet.h b/src/core/basetypes/icu-ucsdet/include/unicode/ucsdet.h new file mode 100644 index 00000000..d3a297be --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/ucsdet.h @@ -0,0 +1,413 @@ +/* + ********************************************************************** + * Copyright (C) 2005-2013, International Business Machines + * Corporation and others. All Rights Reserved. + ********************************************************************** + * file name: ucsdet.h + * encoding: US-ASCII + * indentation:4 + * + * created on: 2005Aug04 + * created by: Andy Heninger + * + * ICU Character Set Detection, API for C + * + * Draft version 18 Oct 2005 + * + */ + +#ifndef __UCSDET_H +#define __UCSDET_H + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_CONVERSION + +#include "unicode/localpointer.h" +#include "unicode/uenum.h" + +/** + * \file + * \brief C API: Charset Detection API + * + * This API provides a facility for detecting the + * charset or encoding of character data in an unknown text format. + * The input data can be from an array of bytes. + * <p> + * Character set detection is at best an imprecise operation. The detection + * process will attempt to identify the charset that best matches the characteristics + * of the byte data, but the process is partly statistical in nature, and + * the results can not be guaranteed to always be correct. + * <p> + * For best accuracy in charset detection, the input data should be primarily + * in a single language, and a minimum of a few hundred bytes worth of plain text + * in the language are needed. The detection process will attempt to + * ignore html or xml style markup that could otherwise obscure the content. + */ + + +struct UCharsetDetector; +/** + * Structure representing a charset detector + * @stable ICU 3.6 + */ +typedef struct UCharsetDetector UCharsetDetector; + +struct UCharsetMatch; +/** + * Opaque structure representing a match that was identified + * from a charset detection operation. + * @stable ICU 3.6 + */ +typedef struct UCharsetMatch UCharsetMatch; + +/** + * Open a charset detector. + * + * @param status Any error conditions occurring during the open + * operation are reported back in this variable. + * @return the newly opened charset detector. + * @stable ICU 3.6 + */ +U_STABLE UCharsetDetector * U_EXPORT2 +ucsdet_open(UErrorCode *status); + +/** + * Close a charset detector. All storage and any other resources + * owned by this charset detector will be released. Failure to + * close a charset detector when finished with it can result in + * memory leaks in the application. + * + * @param ucsd The charset detector to be closed. + * @stable ICU 3.6 + */ +U_STABLE void U_EXPORT2 +ucsdet_close(UCharsetDetector *ucsd); + +#if U_SHOW_CPLUSPLUS_API + +U_NAMESPACE_BEGIN + +/** + * \class LocalUCharsetDetectorPointer + * "Smart pointer" class, closes a UCharsetDetector via ucsdet_close(). + * For most methods see the LocalPointerBase base class. + * + * @see LocalPointerBase + * @see LocalPointer + * @stable ICU 4.4 + */ +U_DEFINE_LOCAL_OPEN_POINTER(LocalUCharsetDetectorPointer, UCharsetDetector, ucsdet_close); + +U_NAMESPACE_END + +#endif + +/** + * Set the input byte data whose charset is to detected. + * + * Ownership of the input text byte array remains with the caller. + * The input string must not be altered or deleted until the charset + * detector is either closed or reset to refer to different input text. + * + * @param ucsd the charset detector to be used. + * @param textIn the input text of unknown encoding. . + * @param len the length of the input text, or -1 if the text + * is NUL terminated. + * @param status any error conditions are reported back in this variable. + * + * @stable ICU 3.6 + */ +U_STABLE void U_EXPORT2 +ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status); + + +/** Set the declared encoding for charset detection. + * The declared encoding of an input text is an encoding obtained + * by the user from an http header or xml declaration or similar source that + * can be provided as an additional hint to the charset detector. + * + * How and whether the declared encoding will be used during the + * detection process is TBD. + * + * @param ucsd the charset detector to be used. + * @param encoding an encoding for the current data obtained from + * a header or declaration or other source outside + * of the byte data itself. + * @param length the length of the encoding name, or -1 if the name string + * is NUL terminated. + * @param status any error conditions are reported back in this variable. + * + * @stable ICU 3.6 + */ +U_STABLE void U_EXPORT2 +ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status); + + +/** + * Return the charset that best matches the supplied input data. + * + * Note though, that because the detection + * only looks at the start of the input data, + * there is a possibility that the returned charset will fail to handle + * the full set of input data. + * <p> + * The returned UCharsetMatch object is owned by the UCharsetDetector. + * It will remain valid until the detector input is reset, or until + * the detector is closed. + * <p> + * The function will fail if + * <ul> + * <li>no charset appears to match the data.</li> + * <li>no input text has been provided</li> + * </ul> + * + * @param ucsd the charset detector to be used. + * @param status any error conditions are reported back in this variable. + * @return a UCharsetMatch representing the best matching charset, + * or NULL if no charset matches the byte data. + * + * @stable ICU 3.6 + */ +U_STABLE const UCharsetMatch * U_EXPORT2 +ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status); + + +/** + * Find all charset matches that appear to be consistent with the input, + * returning an array of results. The results are ordered with the + * best quality match first. + * + * Because the detection only looks at a limited amount of the + * input byte data, some of the returned charsets may fail to handle + * the all of input data. + * <p> + * The returned UCharsetMatch objects are owned by the UCharsetDetector. + * They will remain valid until the detector is closed or modified + * + * <p> + * Return an error if + * <ul> + * <li>no charsets appear to match the input data.</li> + * <li>no input text has been provided</li> + * </ul> + * + * @param ucsd the charset detector to be used. + * @param matchesFound pointer to a variable that will be set to the + * number of charsets identified that are consistent with + * the input data. Output only. + * @param status any error conditions are reported back in this variable. + * @return A pointer to an array of pointers to UCharSetMatch objects. + * This array, and the UCharSetMatch instances to which it refers, + * are owned by the UCharsetDetector, and will remain valid until + * the detector is closed or modified. + * @stable ICU 3.6 + */ +U_STABLE const UCharsetMatch ** U_EXPORT2 +ucsdet_detectAll(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *status); + + + +/** + * Get the name of the charset represented by a UCharsetMatch. + * + * The storage for the returned name string is owned by the + * UCharsetMatch, and will remain valid while the UCharsetMatch + * is valid. + * + * The name returned is suitable for use with the ICU conversion APIs. + * + * @param ucsm The charset match object. + * @param status Any error conditions are reported back in this variable. + * @return The name of the matching charset. + * + * @stable ICU 3.6 + */ +U_STABLE const char * U_EXPORT2 +ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status); + +/** + * Get a confidence number for the quality of the match of the byte + * data with the charset. Confidence numbers range from zero to 100, + * with 100 representing complete confidence and zero representing + * no confidence. + * + * The confidence values are somewhat arbitrary. They define an + * an ordering within the results for any single detection operation + * but are not generally comparable between the results for different input. + * + * A confidence value of ten does have a general meaning - it is used + * for charsets that can represent the input data, but for which there + * is no other indication that suggests that the charset is the correct one. + * Pure 7 bit ASCII data, for example, is compatible with a + * great many charsets, most of which will appear as possible matches + * with a confidence of 10. + * + * @param ucsm The charset match object. + * @param status Any error conditions are reported back in this variable. + * @return A confidence number for the charset match. + * + * @stable ICU 3.6 + */ +U_STABLE int32_t U_EXPORT2 +ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status); + +/** + * Get the RFC 3066 code for the language of the input data. + * + * The Charset Detection service is intended primarily for detecting + * charsets, not language. For some, but not all, charsets, a language is + * identified as a byproduct of the detection process, and that is what + * is returned by this function. + * + * CAUTION: + * 1. Language information is not available for input data encoded in + * all charsets. In particular, no language is identified + * for UTF-8 input data. + * + * 2. Closely related languages may sometimes be confused. + * + * If more accurate language detection is required, a linguistic + * analysis package should be used. + * + * The storage for the returned name string is owned by the + * UCharsetMatch, and will remain valid while the UCharsetMatch + * is valid. + * + * @param ucsm The charset match object. + * @param status Any error conditions are reported back in this variable. + * @return The RFC 3066 code for the language of the input data, or + * an empty string if the language could not be determined. + * + * @stable ICU 3.6 + */ +U_STABLE const char * U_EXPORT2 +ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status); + + +/** + * Get the entire input text as a UChar string, placing it into + * a caller-supplied buffer. A terminating + * NUL character will be appended to the buffer if space is available. + * + * The number of UChars in the output string, not including the terminating + * NUL, is returned. + * + * If the supplied buffer is smaller than required to hold the output, + * the contents of the buffer are undefined. The full output string length + * (in UChars) is returned as always, and can be used to allocate a buffer + * of the correct size. + * + * + * @param ucsm The charset match object. + * @param buf A UChar buffer to be filled with the converted text data. + * @param cap The capacity of the buffer in UChars. + * @param status Any error conditions are reported back in this variable. + * @return The number of UChars in the output string. + * + * @stable ICU 3.6 + */ +U_STABLE int32_t U_EXPORT2 +ucsdet_getUChars(const UCharsetMatch *ucsm, + UChar *buf, int32_t cap, UErrorCode *status); + + + +/** + * Get an iterator over the set of all detectable charsets - + * over the charsets that are known to the charset detection + * service. + * + * The returned UEnumeration provides access to the names of + * the charsets. + * + * <p> + * The state of the Charset detector that is passed in does not + * affect the result of this function, but requiring a valid, open + * charset detector as a parameter insures that the charset detection + * service has been safely initialized and that the required detection + * data is available. + * + * <p> + * <b>Note:</b> Multiple different charset encodings in a same family may use + * a single shared name in this implementation. For example, this method returns + * an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252" + * (Windows Latin 1). However, actual detection result could be "windows-1252" + * when the input data matches Latin 1 code points with any points only available + * in "windows-1252". + * + * @param ucsd a Charset detector. + * @param status Any error conditions are reported back in this variable. + * @return an iterator providing access to the detectable charset names. + * @stable ICU 3.6 + */ +U_STABLE UEnumeration * U_EXPORT2 +ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status); + +/** + * Test whether input filtering is enabled for this charset detector. + * Input filtering removes text that appears to be HTML or xml + * markup from the input before applying the code page detection + * heuristics. + * + * @param ucsd The charset detector to check. + * @return TRUE if filtering is enabled. + * @stable ICU 3.6 + */ + +U_STABLE UBool U_EXPORT2 +ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd); + + +/** + * Enable filtering of input text. If filtering is enabled, + * text within angle brackets ("<" and ">") will be removed + * before detection, which will remove most HTML or xml markup. + * + * @param ucsd the charset detector to be modified. + * @param filter <code>true</code> to enable input text filtering. + * @return The previous setting. + * + * @stable ICU 3.6 + */ +U_STABLE UBool U_EXPORT2 +ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter); + +#ifndef U_HIDE_INTERNAL_API +/** + * Get an iterator over the set of detectable charsets - + * over the charsets that are enabled by the specified charset detector. + * + * The returned UEnumeration provides access to the names of + * the charsets. + * + * @param ucsd a Charset detector. + * @param status Any error conditions are reported back in this variable. + * @return an iterator providing access to the detectable charset names by + * the specified charset detector. + * @internal + */ +U_INTERNAL UEnumeration * U_EXPORT2 +ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status); + +/** + * Enable or disable individual charset encoding. + * A name of charset encoding must be included in the names returned by + * {@link #getAllDetectableCharsets()}. + * + * @param ucsd a Charset detector. + * @param encoding encoding the name of charset encoding. + * @param enabled <code>TRUE</code> to enable, or <code>FALSE</code> to disable the + * charset encoding. + * @param status receives the return status. When the name of charset encoding + * is not supported, U_ILLEGAL_ARGUMENT_ERROR is set. + * @internal + */ +U_INTERNAL void U_EXPORT2 +ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status); +#endif /* U_HIDE_INTERNAL_API */ + +#endif +#endif /* __UCSDET_H */ + + diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/udata.h b/src/core/basetypes/icu-ucsdet/include/unicode/udata.h new file mode 100644 index 00000000..29e46630 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/udata.h @@ -0,0 +1,430 @@ +/* +****************************************************************************** +* +* Copyright (C) 1999-2014, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* file name: udata.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 1999oct25 +* created by: Markus W. Scherer +*/ + +#ifndef __UDATA_H__ +#define __UDATA_H__ + +#include "unicode/utypes.h" +#include "unicode/localpointer.h" + +U_CDECL_BEGIN + +/** + * \file + * \brief C API: Data loading interface + * + * <h2>Information about data loading interface</h2> + * + * This API is used to find and efficiently load data for ICU and applications + * using ICU. It provides an abstract interface that specifies a data type and + * name to find and load the data. Normally this API is used by other ICU APIs + * to load required data out of the ICU data library, but it can be used to + * load data out of other places. + * + * See the User Guide Data Management chapter. + */ + +#ifndef U_HIDE_INTERNAL_API +/** + * Character used to separate package names from tree names + * @internal ICU 3.0 + */ +#define U_TREE_SEPARATOR '-' + +/** + * String used to separate package names from tree names + * @internal ICU 3.0 + */ +#define U_TREE_SEPARATOR_STRING "-" + +/** + * Character used to separate parts of entry names + * @internal ICU 3.0 + */ +#define U_TREE_ENTRY_SEP_CHAR '/' + +/** + * String used to separate parts of entry names + * @internal ICU 3.0 + */ +#define U_TREE_ENTRY_SEP_STRING "/" + +/** + * Alias for standard ICU data + * @internal ICU 3.0 + */ +#define U_ICUDATA_ALIAS "ICUDATA" + +#endif /* U_HIDE_INTERNAL_API */ + +/** + * UDataInfo contains the properties about the requested data. + * This is meta data. + * + * <p>This structure may grow in the future, indicated by the + * <code>size</code> field.</p> + * + * <p>ICU data must be at least 8-aligned, and should be 16-aligned. + * The UDataInfo struct begins 4 bytes after the start of the data item, + * so it is 4-aligned. + * + * <p>The platform data property fields help determine if a data + * file can be efficiently used on a given machine. + * The particular fields are of importance only if the data + * is affected by the properties - if there is integer data + * with word sizes > 1 byte, char* text, or UChar* text.</p> + * + * <p>The implementation for the <code>udata_open[Choice]()</code> + * functions may reject data based on the value in <code>isBigEndian</code>. + * No other field is used by the <code>udata</code> API implementation.</p> + * + * <p>The <code>dataFormat</code> may be used to identify + * the kind of data, e.g. a converter table.</p> + * + * <p>The <code>formatVersion</code> field should be used to + * make sure that the format can be interpreted. + * It may be a good idea to check only for the one or two highest + * of the version elements to allow the data memory to + * get more or somewhat rearranged contents, for as long + * as the using code can still interpret the older contents.</p> + * + * <p>The <code>dataVersion</code> field is intended to be a + * common place to store the source version of the data; + * for data from the Unicode character database, this could + * reflect the Unicode version.</p> + * + * @stable ICU 2.0 + */ +typedef struct { + /** sizeof(UDataInfo) + * @stable ICU 2.0 */ + uint16_t size; + + /** unused, set to 0 + * @stable ICU 2.0*/ + uint16_t reservedWord; + + /* platform data properties */ + /** 0 for little-endian machine, 1 for big-endian + * @stable ICU 2.0 */ + uint8_t isBigEndian; + + /** see U_CHARSET_FAMILY values in utypes.h + * @stable ICU 2.0*/ + uint8_t charsetFamily; + + /** sizeof(UChar), one of { 1, 2, 4 } + * @stable ICU 2.0*/ + uint8_t sizeofUChar; + + /** unused, set to 0 + * @stable ICU 2.0*/ + uint8_t reservedByte; + + /** data format identifier + * @stable ICU 2.0*/ + uint8_t dataFormat[4]; + + /** versions: [0] major [1] minor [2] milli [3] micro + * @stable ICU 2.0*/ + uint8_t formatVersion[4]; + + /** versions: [0] major [1] minor [2] milli [3] micro + * @stable ICU 2.0*/ + uint8_t dataVersion[4]; +} UDataInfo; + +/* API for reading data -----------------------------------------------------*/ + +/** + * Forward declaration of the data memory type. + * @stable ICU 2.0 + */ +typedef struct UDataMemory UDataMemory; + +/** + * Callback function for udata_openChoice(). + * @param context parameter passed into <code>udata_openChoice()</code>. + * @param type The type of the data as passed into <code>udata_openChoice()</code>. + * It may be <code>NULL</code>. + * @param name The name of the data as passed into <code>udata_openChoice()</code>. + * @param pInfo A pointer to the <code>UDataInfo</code> structure + * of data that has been loaded and will be returned + * by <code>udata_openChoice()</code> if this function + * returns <code>TRUE</code>. + * @return TRUE if the current data memory is acceptable + * @stable ICU 2.0 + */ +typedef UBool U_CALLCONV +UDataMemoryIsAcceptable(void *context, + const char *type, const char *name, + const UDataInfo *pInfo); + + +/** + * Convenience function. + * This function works the same as <code>udata_openChoice</code> + * except that any data that matches the type and name + * is assumed to be acceptable. + * @param path Specifies an absolute path and/or a basename for the + * finding of the data in the file system. + * <code>NULL</code> for ICU data. + * @param type A string that specifies the type of data to be loaded. + * For example, resource bundles are loaded with type "res", + * conversion tables with type "cnv". + * This may be <code>NULL</code> or empty. + * @param name A string that specifies the name of the data. + * @param pErrorCode An ICU UErrorCode parameter. It must not be <code>NULL</code>. + * @return A pointer (handle) to a data memory object, or <code>NULL</code> + * if an error occurs. Call <code>udata_getMemory()</code> + * to get a pointer to the actual data. + * + * @see udata_openChoice + * @stable ICU 2.0 + */ +U_STABLE UDataMemory * U_EXPORT2 +udata_open(const char *path, const char *type, const char *name, + UErrorCode *pErrorCode); + +/** + * Data loading function. + * This function is used to find and load efficiently data for + * ICU and applications using ICU. + * It provides an abstract interface that allows to specify a data + * type and name to find and load the data. + * + * <p>The implementation depends on platform properties and user preferences + * and may involve loading shared libraries (DLLs), mapping + * files into memory, or fopen()/fread() files. + * It may also involve using static memory or database queries etc. + * Several or all data items may be combined into one entity + * (DLL, memory-mappable file).</p> + * + * <p>The data is always preceded by a header that includes + * a <code>UDataInfo</code> structure. + * The caller's <code>isAcceptable()</code> function is called to make + * sure that the data is useful. It may be called several times if it + * rejects the data and there is more than one location with data + * matching the type and name.</p> + * + * <p>If <code>path==NULL</code>, then ICU data is loaded. + * Otherwise, it is separated into a basename and a basename-less directory string. + * The basename is used as the data package name, and the directory is + * logically prepended to the ICU data directory string.</p> + * + * <p>For details about ICU data loading see the User Guide + * Data Management chapter. (http://icu-project.org/userguide/icudata.html)</p> + * + * @param path Specifies an absolute path and/or a basename for the + * finding of the data in the file system. + * <code>NULL</code> for ICU data. + * @param type A string that specifies the type of data to be loaded. + * For example, resource bundles are loaded with type "res", + * conversion tables with type "cnv". + * This may be <code>NULL</code> or empty. + * @param name A string that specifies the name of the data. + * @param isAcceptable This function is called to verify that loaded data + * is useful for the client code. If it returns FALSE + * for all data items, then <code>udata_openChoice()</code> + * will return with an error. + * @param context Arbitrary parameter to be passed into isAcceptable. + * @param pErrorCode An ICU UErrorCode parameter. It must not be <code>NULL</code>. + * @return A pointer (handle) to a data memory object, or <code>NULL</code> + * if an error occurs. Call <code>udata_getMemory()</code> + * to get a pointer to the actual data. + * @stable ICU 2.0 + */ +U_STABLE UDataMemory * U_EXPORT2 +udata_openChoice(const char *path, const char *type, const char *name, + UDataMemoryIsAcceptable *isAcceptable, void *context, + UErrorCode *pErrorCode); + +/** + * Close the data memory. + * This function must be called to allow the system to + * release resources associated with this data memory. + * @param pData The pointer to data memory object + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 +udata_close(UDataMemory *pData); + +#if U_SHOW_CPLUSPLUS_API + +U_NAMESPACE_BEGIN + +/** + * \class LocalUDataMemoryPointer + * "Smart pointer" class, closes a UDataMemory via udata_close(). + * For most methods see the LocalPointerBase base class. + * + * @see LocalPointerBase + * @see LocalPointer + * @stable ICU 4.4 + */ +U_DEFINE_LOCAL_OPEN_POINTER(LocalUDataMemoryPointer, UDataMemory, udata_close); + +U_NAMESPACE_END + +#endif + +/** + * Get the pointer to the actual data inside the data memory. + * The data is read-only. + * + * ICU data must be at least 8-aligned, and should be 16-aligned. + * + * @param pData The pointer to data memory object + * @stable ICU 2.0 + */ +U_STABLE const void * U_EXPORT2 +udata_getMemory(UDataMemory *pData); + +/** + * Get the information from the data memory header. + * This allows to get access to the header containing + * platform data properties etc. which is not part of + * the data itself and can therefore not be accessed + * via the pointer that <code>udata_getMemory()</code> returns. + * + * @param pData pointer to the data memory object + * @param pInfo pointer to a UDataInfo object; + * its <code>size</code> field must be set correctly, + * typically to <code>sizeof(UDataInfo)</code>. + * + * <code>*pInfo</code> will be filled with the UDataInfo structure + * in the data memory object. If this structure is smaller than + * <code>pInfo->size</code>, then the <code>size</code> will be + * adjusted and only part of the structure will be filled. + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 +udata_getInfo(UDataMemory *pData, UDataInfo *pInfo); + +/** + * This function bypasses the normal ICU data loading process and + * allows you to force ICU's system data to come out of a user-specified + * area in memory. + * + * ICU data must be at least 8-aligned, and should be 16-aligned. + * See http://userguide.icu-project.org/icudata + * + * The format of this data is that of the icu common data file, as is + * generated by the pkgdata tool with mode=common or mode=dll. + * You can read in a whole common mode file and pass the address to the start of the + * data, or (with the appropriate link options) pass in the pointer to + * the data that has been loaded from a dll by the operating system, + * as shown in this code: + * + * extern const char U_IMPORT U_ICUDATA_ENTRY_POINT []; + * // U_ICUDATA_ENTRY_POINT is same as entry point specified to pkgdata tool + * UErrorCode status = U_ZERO_ERROR; + * + * udata_setCommonData(&U_ICUDATA_ENTRY_POINT, &status); + * + * It is important that the declaration be as above. The entry point + * must not be declared as an extern void*. + * + * Starting with ICU 4.4, it is possible to set several data packages, + * one per call to this function. + * udata_open() will look for data in the multiple data packages in the order + * in which they were set. + * The position of the linked-in or default-name ICU .data package in the + * search list depends on when the first data item is loaded that is not contained + * in the already explicitly set packages. + * If data was loaded implicitly before the first call to this function + * (for example, via opening a converter, constructing a UnicodeString + * from default-codepage data, using formatting or collation APIs, etc.), + * then the default data will be first in the list. + * + * This function has no effect on application (non ICU) data. See udata_setAppData() + * for similar functionality for application data. + * + * @param data pointer to ICU common data + * @param err outgoing error status <code>U_USING_DEFAULT_WARNING, U_UNSUPPORTED_ERROR</code> + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 +udata_setCommonData(const void *data, UErrorCode *err); + + +/** + * This function bypasses the normal ICU data loading process for application-specific + * data and allows you to force the it to come out of a user-specified + * pointer. + * + * ICU data must be at least 8-aligned, and should be 16-aligned. + * See http://userguide.icu-project.org/icudata + * + * The format of this data is that of the icu common data file, like 'icudt26l.dat' + * or the corresponding shared library (DLL) file. + * The application must read in or otherwise construct an image of the data and then + * pass the address of it to this function. + * + * + * Warning: setAppData will set a U_USING_DEFAULT_WARNING code if + * data with the specifed path that has already been opened, or + * if setAppData with the same path has already been called. + * Any such calls to setAppData will have no effect. + * + * + * @param packageName the package name by which the application will refer + * to (open) this data + * @param data pointer to the data + * @param err outgoing error status <code>U_USING_DEFAULT_WARNING, U_UNSUPPORTED_ERROR</code> + * @see udata_setCommonData + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 +udata_setAppData(const char *packageName, const void *data, UErrorCode *err); + +/** + * Possible settings for udata_setFileAccess() + * @see udata_setFileAccess + * @stable ICU 3.4 + */ +typedef enum UDataFileAccess { + /** ICU looks for data in single files first, then in packages. (default) @stable ICU 3.4 */ + UDATA_FILES_FIRST, + /** An alias for the default access mode. @stable ICU 3.4 */ + UDATA_DEFAULT_ACCESS = UDATA_FILES_FIRST, + /** ICU only loads data from packages, not from single files. @stable ICU 3.4 */ + UDATA_ONLY_PACKAGES, + /** ICU loads data from packages first, and only from single files + if the data cannot be found in a package. @stable ICU 3.4 */ + UDATA_PACKAGES_FIRST, + /** ICU does not access the file system for data loading. @stable ICU 3.4 */ + UDATA_NO_FILES, + /** Number of real UDataFileAccess values. @stable ICU 3.4 */ + UDATA_FILE_ACCESS_COUNT +} UDataFileAccess; + +/** + * This function may be called to control how ICU loads data. It must be called + * before any ICU data is loaded, including application data loaded with + * ures/ResourceBundle or udata APIs. This function is not multithread safe. + * The results of calling it while other threads are loading data are undefined. + * @param access The type of file access to be used + * @param status Error code. + * @see UDataFileAccess + * @stable ICU 3.4 + */ +U_STABLE void U_EXPORT2 +udata_setFileAccess(UDataFileAccess access, UErrorCode *status); + +U_CDECL_END + +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/uenum.h b/src/core/basetypes/icu-ucsdet/include/unicode/uenum.h new file mode 100644 index 00000000..5408ec5a --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/uenum.h @@ -0,0 +1,206 @@ +/* +******************************************************************************* +* +* Copyright (C) 2002-2013, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: uenum.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:2 +* +* created on: 2002jul08 +* created by: Vladimir Weinstein +*/ + +#ifndef __UENUM_H +#define __UENUM_H + +#include "unicode/utypes.h" +#include "unicode/localpointer.h" + +#if U_SHOW_CPLUSPLUS_API +#include "unicode/strenum.h" +#endif + +/** + * \file + * \brief C API: String Enumeration + */ + +/** + * An enumeration object. + * For usage in C programs. + * @stable ICU 2.2 + */ +struct UEnumeration; +/** structure representing an enumeration object instance @stable ICU 2.2 */ +typedef struct UEnumeration UEnumeration; + +/** + * Disposes of resources in use by the iterator. If en is NULL, + * does nothing. After this call, any char* or UChar* pointer + * returned by uenum_unext() or uenum_next() is invalid. + * @param en UEnumeration structure pointer + * @stable ICU 2.2 + */ +U_STABLE void U_EXPORT2 +uenum_close(UEnumeration* en); + +#if U_SHOW_CPLUSPLUS_API + +U_NAMESPACE_BEGIN + +/** + * \class LocalUEnumerationPointer + * "Smart pointer" class, closes a UEnumeration via uenum_close(). + * For most methods see the LocalPointerBase base class. + * + * @see LocalPointerBase + * @see LocalPointer + * @stable ICU 4.4 + */ +U_DEFINE_LOCAL_OPEN_POINTER(LocalUEnumerationPointer, UEnumeration, uenum_close); + +U_NAMESPACE_END + +#endif + +/** + * Returns the number of elements that the iterator traverses. If + * the iterator is out-of-sync with its service, status is set to + * U_ENUM_OUT_OF_SYNC_ERROR. + * This is a convenience function. It can end up being very + * expensive as all the items might have to be pre-fetched (depending + * on the type of data being traversed). Use with caution and only + * when necessary. + * @param en UEnumeration structure pointer + * @param status error code, can be U_ENUM_OUT_OF_SYNC_ERROR if the + * iterator is out of sync. + * @return number of elements in the iterator + * @stable ICU 2.2 + */ +U_STABLE int32_t U_EXPORT2 +uenum_count(UEnumeration* en, UErrorCode* status); + +/** + * Returns the next element in the iterator's list. If there are + * no more elements, returns NULL. If the iterator is out-of-sync + * with its service, status is set to U_ENUM_OUT_OF_SYNC_ERROR and + * NULL is returned. If the native service string is a char* string, + * it is converted to UChar* with the invariant converter. + * The result is terminated by (UChar)0. + * @param en the iterator object + * @param resultLength pointer to receive the length of the result + * (not including the terminating \\0). + * If the pointer is NULL it is ignored. + * @param status the error code, set to U_ENUM_OUT_OF_SYNC_ERROR if + * the iterator is out of sync with its service. + * @return a pointer to the string. The string will be + * zero-terminated. The return pointer is owned by this iterator + * and must not be deleted by the caller. The pointer is valid + * until the next call to any uenum_... method, including + * uenum_next() or uenum_unext(). When all strings have been + * traversed, returns NULL. + * @stable ICU 2.2 + */ +U_STABLE const UChar* U_EXPORT2 +uenum_unext(UEnumeration* en, + int32_t* resultLength, + UErrorCode* status); + +/** + * Returns the next element in the iterator's list. If there are + * no more elements, returns NULL. If the iterator is out-of-sync + * with its service, status is set to U_ENUM_OUT_OF_SYNC_ERROR and + * NULL is returned. If the native service string is a UChar* + * string, it is converted to char* with the invariant converter. + * The result is terminated by (char)0. If the conversion fails + * (because a character cannot be converted) then status is set to + * U_INVARIANT_CONVERSION_ERROR and the return value is undefined + * (but non-NULL). + * @param en the iterator object + * @param resultLength pointer to receive the length of the result + * (not including the terminating \\0). + * If the pointer is NULL it is ignored. + * @param status the error code, set to U_ENUM_OUT_OF_SYNC_ERROR if + * the iterator is out of sync with its service. Set to + * U_INVARIANT_CONVERSION_ERROR if the underlying native string is + * UChar* and conversion to char* with the invariant converter + * fails. This error pertains only to current string, so iteration + * might be able to continue successfully. + * @return a pointer to the string. The string will be + * zero-terminated. The return pointer is owned by this iterator + * and must not be deleted by the caller. The pointer is valid + * until the next call to any uenum_... method, including + * uenum_next() or uenum_unext(). When all strings have been + * traversed, returns NULL. + * @stable ICU 2.2 + */ +U_STABLE const char* U_EXPORT2 +uenum_next(UEnumeration* en, + int32_t* resultLength, + UErrorCode* status); + +/** + * Resets the iterator to the current list of service IDs. This + * re-establishes sync with the service and rewinds the iterator + * to start at the first element. + * @param en the iterator object + * @param status the error code, set to U_ENUM_OUT_OF_SYNC_ERROR if + * the iterator is out of sync with its service. + * @stable ICU 2.2 + */ +U_STABLE void U_EXPORT2 +uenum_reset(UEnumeration* en, UErrorCode* status); + +#if U_SHOW_CPLUSPLUS_API + +/** + * Given a StringEnumeration, wrap it in a UEnumeration. The + * StringEnumeration is adopted; after this call, the caller must not + * delete it (regardless of error status). + * @param adopted the C++ StringEnumeration to be wrapped in a UEnumeration. + * @param ec the error code. + * @return a UEnumeration wrapping the adopted StringEnumeration. + * @stable ICU 4.2 + */ +U_STABLE UEnumeration* U_EXPORT2 +uenum_openFromStringEnumeration(icu::StringEnumeration* adopted, UErrorCode* ec); + +#endif + +/** + * Given an array of const UChar* strings, return a UEnumeration. String pointers from 0..count-1 must not be null. + * Do not free or modify either the string array or the characters it points to until this object has been destroyed with uenum_close. + * \snippet test/cintltst/uenumtst.c uenum_openUCharStringsEnumeration + * @param strings array of const UChar* strings (each null terminated). All storage is owned by the caller. + * @param count length of the array + * @param ec error code + * @return the new UEnumeration object. Caller is responsible for calling uenum_close to free memory. + * @see uenum_close + * @stable ICU 50 + */ +U_STABLE UEnumeration* U_EXPORT2 +uenum_openUCharStringsEnumeration(const UChar* const strings[], int32_t count, + UErrorCode* ec); + +/* Note: next function is not hidden as draft, as it is used internally (it was formerly an internal function). */ + +/** + * Given an array of const char* strings (invariant chars only), return a UEnumeration. String pointers from 0..count-1 must not be null. + * Do not free or modify either the string array or the characters it points to until this object has been destroyed with uenum_close. + * \snippet test/cintltst/uenumtst.c uenum_openCharStringsEnumeration + * @param strings array of char* strings (each null terminated). All storage is owned by the caller. + * @param count length of the array + * @param ec error code + * @return the new UEnumeration object. Caller is responsible for calling uenum_close to free memory + * @see uenum_close + * @stable ICU 50 + */ +U_STABLE UEnumeration* U_EXPORT2 +uenum_openCharStringsEnumeration(const char* const strings[], int32_t count, + UErrorCode* ec); + +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/uiter.h b/src/core/basetypes/icu-ucsdet/include/unicode/uiter.h new file mode 100644 index 00000000..0cdb8ffb --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/uiter.h @@ -0,0 +1,707 @@ +/* +******************************************************************************* +* +* Copyright (C) 2002-2011 International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: uiter.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2002jan18 +* created by: Markus W. Scherer +*/ + +#ifndef __UITER_H__ +#define __UITER_H__ + +/** + * \file + * \brief C API: Unicode Character Iteration + * + * @see UCharIterator + */ + +#include "unicode/utypes.h" + +#if U_SHOW_CPLUSPLUS_API + U_NAMESPACE_BEGIN + + class CharacterIterator; + class Replaceable; + + U_NAMESPACE_END +#endif + +U_CDECL_BEGIN + +struct UCharIterator; +typedef struct UCharIterator UCharIterator; /**< C typedef for struct UCharIterator. @stable ICU 2.1 */ + +/** + * Origin constants for UCharIterator.getIndex() and UCharIterator.move(). + * @see UCharIteratorMove + * @see UCharIterator + * @stable ICU 2.1 + */ +typedef enum UCharIteratorOrigin { + UITER_START, UITER_CURRENT, UITER_LIMIT, UITER_ZERO, UITER_LENGTH +} UCharIteratorOrigin; + +/** Constants for UCharIterator. @stable ICU 2.6 */ +enum { + /** + * Constant value that may be returned by UCharIteratorMove + * indicating that the final UTF-16 index is not known, but that the move succeeded. + * This can occur when moving relative to limit or length, or + * when moving relative to the current index after a setState() + * when the current UTF-16 index is not known. + * + * It would be very inefficient to have to count from the beginning of the text + * just to get the current/limit/length index after moving relative to it. + * The actual index can be determined with getIndex(UITER_CURRENT) + * which will count the UChars if necessary. + * + * @stable ICU 2.6 + */ + UITER_UNKNOWN_INDEX=-2 +}; + + +/** + * Constant for UCharIterator getState() indicating an error or + * an unknown state. + * Returned by uiter_getState()/UCharIteratorGetState + * when an error occurs. + * Also, some UCharIterator implementations may not be able to return + * a valid state for each position. This will be clearly documented + * for each such iterator (none of the public ones here). + * + * @stable ICU 2.6 + */ +#define UITER_NO_STATE ((uint32_t)0xffffffff) + +/** + * Function type declaration for UCharIterator.getIndex(). + * + * Gets the current position, or the start or limit of the + * iteration range. + * + * This function may perform slowly for UITER_CURRENT after setState() was called, + * or for UITER_LENGTH, because an iterator implementation may have to count + * UChars if the underlying storage is not UTF-16. + * + * @param iter the UCharIterator structure ("this pointer") + * @param origin get the 0, start, limit, length, or current index + * @return the requested index, or U_SENTINEL in an error condition + * + * @see UCharIteratorOrigin + * @see UCharIterator + * @stable ICU 2.1 + */ +typedef int32_t U_CALLCONV +UCharIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin); + +/** + * Function type declaration for UCharIterator.move(). + * + * Use iter->move(iter, index, UITER_ZERO) like CharacterIterator::setIndex(index). + * + * Moves the current position relative to the start or limit of the + * iteration range, or relative to the current position itself. + * The movement is expressed in numbers of code units forward + * or backward by specifying a positive or negative delta. + * Out of bounds movement will be pinned to the start or limit. + * + * This function may perform slowly for moving relative to UITER_LENGTH + * because an iterator implementation may have to count the rest of the + * UChars if the native storage is not UTF-16. + * + * When moving relative to the limit or length, or + * relative to the current position after setState() was called, + * move() may return UITER_UNKNOWN_INDEX (-2) to avoid an inefficient + * determination of the actual UTF-16 index. + * The actual index can be determined with getIndex(UITER_CURRENT) + * which will count the UChars if necessary. + * See UITER_UNKNOWN_INDEX for details. + * + * @param iter the UCharIterator structure ("this pointer") + * @param delta can be positive, zero, or negative + * @param origin move relative to the 0, start, limit, length, or current index + * @return the new index, or U_SENTINEL on an error condition, + * or UITER_UNKNOWN_INDEX when the index is not known. + * + * @see UCharIteratorOrigin + * @see UCharIterator + * @see UITER_UNKNOWN_INDEX + * @stable ICU 2.1 + */ +typedef int32_t U_CALLCONV +UCharIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin); + +/** + * Function type declaration for UCharIterator.hasNext(). + * + * Check if current() and next() can still + * return another code unit. + * + * @param iter the UCharIterator structure ("this pointer") + * @return boolean value for whether current() and next() can still return another code unit + * + * @see UCharIterator + * @stable ICU 2.1 + */ +typedef UBool U_CALLCONV +UCharIteratorHasNext(UCharIterator *iter); + +/** + * Function type declaration for UCharIterator.hasPrevious(). + * + * Check if previous() can still return another code unit. + * + * @param iter the UCharIterator structure ("this pointer") + * @return boolean value for whether previous() can still return another code unit + * + * @see UCharIterator + * @stable ICU 2.1 + */ +typedef UBool U_CALLCONV +UCharIteratorHasPrevious(UCharIterator *iter); + +/** + * Function type declaration for UCharIterator.current(). + * + * Return the code unit at the current position, + * or U_SENTINEL if there is none (index is at the limit). + * + * @param iter the UCharIterator structure ("this pointer") + * @return the current code unit + * + * @see UCharIterator + * @stable ICU 2.1 + */ +typedef UChar32 U_CALLCONV +UCharIteratorCurrent(UCharIterator *iter); + +/** + * Function type declaration for UCharIterator.next(). + * + * Return the code unit at the current index and increment + * the index (post-increment, like s[i++]), + * or return U_SENTINEL if there is none (index is at the limit). + * + * @param iter the UCharIterator structure ("this pointer") + * @return the current code unit (and post-increment the current index) + * + * @see UCharIterator + * @stable ICU 2.1 + */ +typedef UChar32 U_CALLCONV +UCharIteratorNext(UCharIterator *iter); + +/** + * Function type declaration for UCharIterator.previous(). + * + * Decrement the index and return the code unit from there + * (pre-decrement, like s[--i]), + * or return U_SENTINEL if there is none (index is at the start). + * + * @param iter the UCharIterator structure ("this pointer") + * @return the previous code unit (after pre-decrementing the current index) + * + * @see UCharIterator + * @stable ICU 2.1 + */ +typedef UChar32 U_CALLCONV +UCharIteratorPrevious(UCharIterator *iter); + +/** + * Function type declaration for UCharIterator.reservedFn(). + * Reserved for future use. + * + * @param iter the UCharIterator structure ("this pointer") + * @param something some integer argument + * @return some integer + * + * @see UCharIterator + * @stable ICU 2.1 + */ +typedef int32_t U_CALLCONV +UCharIteratorReserved(UCharIterator *iter, int32_t something); + +/** + * Function type declaration for UCharIterator.getState(). + * + * Get the "state" of the iterator in the form of a single 32-bit word. + * It is recommended that the state value be calculated to be as small as + * is feasible. For strings with limited lengths, fewer than 32 bits may + * be sufficient. + * + * This is used together with setState()/UCharIteratorSetState + * to save and restore the iterator position more efficiently than with + * getIndex()/move(). + * + * The iterator state is defined as a uint32_t value because it is designed + * for use in ucol_nextSortKeyPart() which provides 32 bits to store the state + * of the character iterator. + * + * With some UCharIterator implementations (e.g., UTF-8), + * getting and setting the UTF-16 index with existing functions + * (getIndex(UITER_CURRENT) followed by move(pos, UITER_ZERO)) is possible but + * relatively slow because the iterator has to "walk" from a known index + * to the requested one. + * This takes more time the farther it needs to go. + * + * An opaque state value allows an iterator implementation to provide + * an internal index (UTF-8: the source byte array index) for + * fast, constant-time restoration. + * + * After calling setState(), a getIndex(UITER_CURRENT) may be slow because + * the UTF-16 index may not be restored as well, but the iterator can deliver + * the correct text contents and move relative to the current position + * without performance degradation. + * + * Some UCharIterator implementations may not be able to return + * a valid state for each position, in which case they return UITER_NO_STATE instead. + * This will be clearly documented for each such iterator (none of the public ones here). + * + * @param iter the UCharIterator structure ("this pointer") + * @return the state word + * + * @see UCharIterator + * @see UCharIteratorSetState + * @see UITER_NO_STATE + * @stable ICU 2.6 + */ +typedef uint32_t U_CALLCONV +UCharIteratorGetState(const UCharIterator *iter); + +/** + * Function type declaration for UCharIterator.setState(). + * + * Restore the "state" of the iterator using a state word from a getState() call. + * The iterator object need not be the same one as for which getState() was called, + * but it must be of the same type (set up using the same uiter_setXYZ function) + * and it must iterate over the same string + * (binary identical regardless of memory address). + * For more about the state word see UCharIteratorGetState. + * + * After calling setState(), a getIndex(UITER_CURRENT) may be slow because + * the UTF-16 index may not be restored as well, but the iterator can deliver + * the correct text contents and move relative to the current position + * without performance degradation. + * + * @param iter the UCharIterator structure ("this pointer") + * @param state the state word from a getState() call + * on a same-type, same-string iterator + * @param pErrorCode Must be a valid pointer to an error code value, + * which must not indicate a failure before the function call. + * + * @see UCharIterator + * @see UCharIteratorGetState + * @stable ICU 2.6 + */ +typedef void U_CALLCONV +UCharIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode); + + +/** + * C API for code unit iteration. + * This can be used as a C wrapper around + * CharacterIterator, Replaceable, or implemented using simple strings, etc. + * + * There are two roles for using UCharIterator: + * + * A "provider" sets the necessary function pointers and controls the "protected" + * fields of the UCharIterator structure. A "provider" passes a UCharIterator + * into C APIs that need a UCharIterator as an abstract, flexible string interface. + * + * Implementations of such C APIs are "callers" of UCharIterator functions; + * they only use the "public" function pointers and never access the "protected" + * fields directly. + * + * The current() and next() functions only check the current index against the + * limit, and previous() only checks the current index against the start, + * to see if the iterator already reached the end of the iteration range. + * + * The assumption - in all iterators - is that the index is moved via the API, + * which means it won't go out of bounds, or the index is modified by + * user code that knows enough about the iterator implementation to set valid + * index values. + * + * UCharIterator functions return code unit values 0..0xffff, + * or U_SENTINEL if the iteration bounds are reached. + * + * @stable ICU 2.1 + */ +struct UCharIterator { + /** + * (protected) Pointer to string or wrapped object or similar. + * Not used by caller. + * @stable ICU 2.1 + */ + const void *context; + + /** + * (protected) Length of string or similar. + * Not used by caller. + * @stable ICU 2.1 + */ + int32_t length; + + /** + * (protected) Start index or similar. + * Not used by caller. + * @stable ICU 2.1 + */ + int32_t start; + + /** + * (protected) Current index or similar. + * Not used by caller. + * @stable ICU 2.1 + */ + int32_t index; + + /** + * (protected) Limit index or similar. + * Not used by caller. + * @stable ICU 2.1 + */ + int32_t limit; + + /** + * (protected) Used by UTF-8 iterators and possibly others. + * @stable ICU 2.1 + */ + int32_t reservedField; + + /** + * (public) Returns the current position or the + * start or limit index of the iteration range. + * + * @see UCharIteratorGetIndex + * @stable ICU 2.1 + */ + UCharIteratorGetIndex *getIndex; + + /** + * (public) Moves the current position relative to the start or limit of the + * iteration range, or relative to the current position itself. + * The movement is expressed in numbers of code units forward + * or backward by specifying a positive or negative delta. + * + * @see UCharIteratorMove + * @stable ICU 2.1 + */ + UCharIteratorMove *move; + + /** + * (public) Check if current() and next() can still + * return another code unit. + * + * @see UCharIteratorHasNext + * @stable ICU 2.1 + */ + UCharIteratorHasNext *hasNext; + + /** + * (public) Check if previous() can still return another code unit. + * + * @see UCharIteratorHasPrevious + * @stable ICU 2.1 + */ + UCharIteratorHasPrevious *hasPrevious; + + /** + * (public) Return the code unit at the current position, + * or U_SENTINEL if there is none (index is at the limit). + * + * @see UCharIteratorCurrent + * @stable ICU 2.1 + */ + UCharIteratorCurrent *current; + + /** + * (public) Return the code unit at the current index and increment + * the index (post-increment, like s[i++]), + * or return U_SENTINEL if there is none (index is at the limit). + * + * @see UCharIteratorNext + * @stable ICU 2.1 + */ + UCharIteratorNext *next; + + /** + * (public) Decrement the index and return the code unit from there + * (pre-decrement, like s[--i]), + * or return U_SENTINEL if there is none (index is at the start). + * + * @see UCharIteratorPrevious + * @stable ICU 2.1 + */ + UCharIteratorPrevious *previous; + + /** + * (public) Reserved for future use. Currently NULL. + * + * @see UCharIteratorReserved + * @stable ICU 2.1 + */ + UCharIteratorReserved *reservedFn; + + /** + * (public) Return the state of the iterator, to be restored later with setState(). + * This function pointer is NULL if the iterator does not implement it. + * + * @see UCharIteratorGet + * @stable ICU 2.6 + */ + UCharIteratorGetState *getState; + + /** + * (public) Restore the iterator state from the state word from a call + * to getState(). + * This function pointer is NULL if the iterator does not implement it. + * + * @see UCharIteratorSet + * @stable ICU 2.6 + */ + UCharIteratorSetState *setState; +}; + +/** + * Helper function for UCharIterator to get the code point + * at the current index. + * + * Return the code point that includes the code unit at the current position, + * or U_SENTINEL if there is none (index is at the limit). + * If the current code unit is a lead or trail surrogate, + * then the following or preceding surrogate is used to form + * the code point value. + * + * @param iter the UCharIterator structure ("this pointer") + * @return the current code point + * + * @see UCharIterator + * @see U16_GET + * @see UnicodeString::char32At() + * @stable ICU 2.1 + */ +U_STABLE UChar32 U_EXPORT2 +uiter_current32(UCharIterator *iter); + +/** + * Helper function for UCharIterator to get the next code point. + * + * Return the code point at the current index and increment + * the index (post-increment, like s[i++]), + * or return U_SENTINEL if there is none (index is at the limit). + * + * @param iter the UCharIterator structure ("this pointer") + * @return the current code point (and post-increment the current index) + * + * @see UCharIterator + * @see U16_NEXT + * @stable ICU 2.1 + */ +U_STABLE UChar32 U_EXPORT2 +uiter_next32(UCharIterator *iter); + +/** + * Helper function for UCharIterator to get the previous code point. + * + * Decrement the index and return the code point from there + * (pre-decrement, like s[--i]), + * or return U_SENTINEL if there is none (index is at the start). + * + * @param iter the UCharIterator structure ("this pointer") + * @return the previous code point (after pre-decrementing the current index) + * + * @see UCharIterator + * @see U16_PREV + * @stable ICU 2.1 + */ +U_STABLE UChar32 U_EXPORT2 +uiter_previous32(UCharIterator *iter); + +/** + * Get the "state" of the iterator in the form of a single 32-bit word. + * This is a convenience function that calls iter->getState(iter) + * if iter->getState is not NULL; + * if it is NULL or any other error occurs, then UITER_NO_STATE is returned. + * + * Some UCharIterator implementations may not be able to return + * a valid state for each position, in which case they return UITER_NO_STATE instead. + * This will be clearly documented for each such iterator (none of the public ones here). + * + * @param iter the UCharIterator structure ("this pointer") + * @return the state word + * + * @see UCharIterator + * @see UCharIteratorGetState + * @see UITER_NO_STATE + * @stable ICU 2.6 + */ +U_STABLE uint32_t U_EXPORT2 +uiter_getState(const UCharIterator *iter); + +/** + * Restore the "state" of the iterator using a state word from a getState() call. + * This is a convenience function that calls iter->setState(iter, state, pErrorCode) + * if iter->setState is not NULL; if it is NULL, then U_UNSUPPORTED_ERROR is set. + * + * @param iter the UCharIterator structure ("this pointer") + * @param state the state word from a getState() call + * on a same-type, same-string iterator + * @param pErrorCode Must be a valid pointer to an error code value, + * which must not indicate a failure before the function call. + * + * @see UCharIterator + * @see UCharIteratorSetState + * @stable ICU 2.6 + */ +U_STABLE void U_EXPORT2 +uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode); + +/** + * Set up a UCharIterator to iterate over a string. + * + * Sets the UCharIterator function pointers for iteration over the string s + * with iteration boundaries start=index=0 and length=limit=string length. + * The "provider" may set the start, index, and limit values at any time + * within the range 0..length. + * The length field will be ignored. + * + * The string pointer s is set into UCharIterator.context without copying + * or reallocating the string contents. + * + * getState() simply returns the current index. + * move() will always return the final index. + * + * @param iter UCharIterator structure to be set for iteration + * @param s String to iterate over + * @param length Length of s, or -1 if NUL-terminated + * + * @see UCharIterator + * @stable ICU 2.1 + */ +U_STABLE void U_EXPORT2 +uiter_setString(UCharIterator *iter, const UChar *s, int32_t length); + +/** + * Set up a UCharIterator to iterate over a UTF-16BE string + * (byte vector with a big-endian pair of bytes per UChar). + * + * Everything works just like with a normal UChar iterator (uiter_setString), + * except that UChars are assembled from byte pairs, + * and that the length argument here indicates an even number of bytes. + * + * getState() simply returns the current index. + * move() will always return the final index. + * + * @param iter UCharIterator structure to be set for iteration + * @param s UTF-16BE string to iterate over + * @param length Length of s as an even number of bytes, or -1 if NUL-terminated + * (NUL means pair of 0 bytes at even index from s) + * + * @see UCharIterator + * @see uiter_setString + * @stable ICU 2.6 + */ +U_STABLE void U_EXPORT2 +uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length); + +/** + * Set up a UCharIterator to iterate over a UTF-8 string. + * + * Sets the UCharIterator function pointers for iteration over the UTF-8 string s + * with UTF-8 iteration boundaries 0 and length. + * The implementation counts the UTF-16 index on the fly and + * lazily evaluates the UTF-16 length of the text. + * + * The start field is used as the UTF-8 offset, the limit field as the UTF-8 length. + * When the reservedField is not 0, then it contains a supplementary code point + * and the UTF-16 index is between the two corresponding surrogates. + * At that point, the UTF-8 index is behind that code point. + * + * The UTF-8 string pointer s is set into UCharIterator.context without copying + * or reallocating the string contents. + * + * getState() returns a state value consisting of + * - the current UTF-8 source byte index (bits 31..1) + * - a flag (bit 0) that indicates whether the UChar position is in the middle + * of a surrogate pair + * (from a 4-byte UTF-8 sequence for the corresponding supplementary code point) + * + * getState() cannot also encode the UTF-16 index in the state value. + * move(relative to limit or length), or + * move(relative to current) after setState(), may return UITER_UNKNOWN_INDEX. + * + * @param iter UCharIterator structure to be set for iteration + * @param s UTF-8 string to iterate over + * @param length Length of s in bytes, or -1 if NUL-terminated + * + * @see UCharIterator + * @stable ICU 2.6 + */ +U_STABLE void U_EXPORT2 +uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length); + +#if U_SHOW_CPLUSPLUS_API + +/** + * Set up a UCharIterator to wrap around a C++ CharacterIterator. + * + * Sets the UCharIterator function pointers for iteration using the + * CharacterIterator charIter. + * + * The CharacterIterator pointer charIter is set into UCharIterator.context + * without copying or cloning the CharacterIterator object. + * The other "protected" UCharIterator fields are set to 0 and will be ignored. + * The iteration index and boundaries are controlled by the CharacterIterator. + * + * getState() simply returns the current index. + * move() will always return the final index. + * + * @param iter UCharIterator structure to be set for iteration + * @param charIter CharacterIterator to wrap + * + * @see UCharIterator + * @stable ICU 2.1 + */ +U_STABLE void U_EXPORT2 +uiter_setCharacterIterator(UCharIterator *iter, icu::CharacterIterator *charIter); + +/** + * Set up a UCharIterator to iterate over a C++ Replaceable. + * + * Sets the UCharIterator function pointers for iteration over the + * Replaceable rep with iteration boundaries start=index=0 and + * length=limit=rep->length(). + * The "provider" may set the start, index, and limit values at any time + * within the range 0..length=rep->length(). + * The length field will be ignored. + * + * The Replaceable pointer rep is set into UCharIterator.context without copying + * or cloning/reallocating the Replaceable object. + * + * getState() simply returns the current index. + * move() will always return the final index. + * + * @param iter UCharIterator structure to be set for iteration + * @param rep Replaceable to iterate over + * + * @see UCharIterator + * @stable ICU 2.1 + */ +U_STABLE void U_EXPORT2 +uiter_setReplaceable(UCharIterator *iter, const icu::Replaceable *rep); + +#endif + +U_CDECL_END + +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/umachine.h b/src/core/basetypes/icu-ucsdet/include/unicode/umachine.h new file mode 100644 index 00000000..53215921 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/umachine.h @@ -0,0 +1,356 @@ +/* +****************************************************************************** +* +* Copyright (C) 1999-2014, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* file name: umachine.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 1999sep13 +* created by: Markus W. Scherer +* +* This file defines basic types and constants for ICU to be +* platform-independent. umachine.h and utf.h are included into +* utypes.h to provide all the general definitions for ICU. +* All of these definitions used to be in utypes.h before +* the UTF-handling macros made this unmaintainable. +*/ + +#ifndef __UMACHINE_H__ +#define __UMACHINE_H__ + + +/** + * \file + * \brief Basic types and constants for UTF + * + * <h2> Basic types and constants for UTF </h2> + * This file defines basic types and constants for utf.h to be + * platform-independent. umachine.h and utf.h are included into + * utypes.h to provide all the general definitions for ICU. + * All of these definitions used to be in utypes.h before + * the UTF-handling macros made this unmaintainable. + * + */ +/*==========================================================================*/ +/* Include platform-dependent definitions */ +/* which are contained in the platform-specific file platform.h */ +/*==========================================================================*/ + +#include "unicode/ptypes.h" /* platform.h is included in ptypes.h */ + +/* + * ANSI C headers: + * stddef.h defines wchar_t + */ +#include <stddef.h> + +/*==========================================================================*/ +/* For C wrappers, we use the symbol U_STABLE. */ +/* This works properly if the includer is C or C++. */ +/* Functions are declared U_STABLE return-type U_EXPORT2 function-name()... */ +/*==========================================================================*/ + +/** + * \def U_CFUNC + * This is used in a declaration of a library private ICU C function. + * @stable ICU 2.4 + */ + +/** + * \def U_CDECL_BEGIN + * This is used to begin a declaration of a library private ICU C API. + * @stable ICU 2.4 + */ + +/** + * \def U_CDECL_END + * This is used to end a declaration of a library private ICU C API + * @stable ICU 2.4 + */ + +#ifdef __cplusplus +# define U_CFUNC extern "C" +# define U_CDECL_BEGIN extern "C" { +# define U_CDECL_END } +#else +# define U_CFUNC extern +# define U_CDECL_BEGIN +# define U_CDECL_END +#endif + +#ifndef U_ATTRIBUTE_DEPRECATED +/** + * \def U_ATTRIBUTE_DEPRECATED + * This is used for GCC specific attributes + * @internal + */ +#if U_GCC_MAJOR_MINOR >= 302 +# define U_ATTRIBUTE_DEPRECATED __attribute__ ((deprecated)) +/** + * \def U_ATTRIBUTE_DEPRECATED + * This is used for Visual C++ specific attributes + * @internal + */ +#elif defined(_MSC_VER) && (_MSC_VER >= 1400) +# define U_ATTRIBUTE_DEPRECATED __declspec(deprecated) +#else +# define U_ATTRIBUTE_DEPRECATED +#endif +#endif + +/** This is used to declare a function as a public ICU C API @stable ICU 2.0*/ +#define U_CAPI U_CFUNC U_EXPORT +/** This is used to declare a function as a stable public ICU C API*/ +#define U_STABLE U_CAPI +/** This is used to declare a function as a draft public ICU C API */ +#define U_DRAFT U_CAPI +/** This is used to declare a function as a deprecated public ICU C API */ +#define U_DEPRECATED U_CAPI U_ATTRIBUTE_DEPRECATED +/** This is used to declare a function as an obsolete public ICU C API */ +#define U_OBSOLETE U_CAPI +/** This is used to declare a function as an internal ICU C API */ +#define U_INTERNAL U_CAPI + +/** + * \def U_OVERRIDE + * Defined to the C++11 "override" keyword if available. + * Denotes a class or member which is an override of the base class. + * May result in an error if it applied to something not an override. + * @internal + */ + +/** + * \def U_FINAL + * Defined to the C++11 "final" keyword if available. + * Denotes a class or member which may not be overridden in subclasses. + * May result in an error if subclasses attempt to override. + * @internal + */ + +#if defined(__cplusplus) && __cplusplus>=201103L +/* C++11 */ +#ifndef U_OVERRIDE +#define U_OVERRIDE override +#endif +#ifndef U_FINAL +#define U_FINAL final +#endif +#else +/* not C++11 - define to nothing */ +#ifndef U_OVERRIDE +#define U_OVERRIDE +#endif +#ifndef U_FINAL +#define U_FINAL +#endif +#endif + +/*==========================================================================*/ +/* limits for int32_t etc., like in POSIX inttypes.h */ +/*==========================================================================*/ + +#ifndef INT8_MIN +/** The smallest value an 8 bit signed integer can hold @stable ICU 2.0 */ +# define INT8_MIN ((int8_t)(-128)) +#endif +#ifndef INT16_MIN +/** The smallest value a 16 bit signed integer can hold @stable ICU 2.0 */ +# define INT16_MIN ((int16_t)(-32767-1)) +#endif +#ifndef INT32_MIN +/** The smallest value a 32 bit signed integer can hold @stable ICU 2.0 */ +# define INT32_MIN ((int32_t)(-2147483647-1)) +#endif + +#ifndef INT8_MAX +/** The largest value an 8 bit signed integer can hold @stable ICU 2.0 */ +# define INT8_MAX ((int8_t)(127)) +#endif +#ifndef INT16_MAX +/** The largest value a 16 bit signed integer can hold @stable ICU 2.0 */ +# define INT16_MAX ((int16_t)(32767)) +#endif +#ifndef INT32_MAX +/** The largest value a 32 bit signed integer can hold @stable ICU 2.0 */ +# define INT32_MAX ((int32_t)(2147483647)) +#endif + +#ifndef UINT8_MAX +/** The largest value an 8 bit unsigned integer can hold @stable ICU 2.0 */ +# define UINT8_MAX ((uint8_t)(255U)) +#endif +#ifndef UINT16_MAX +/** The largest value a 16 bit unsigned integer can hold @stable ICU 2.0 */ +# define UINT16_MAX ((uint16_t)(65535U)) +#endif +#ifndef UINT32_MAX +/** The largest value a 32 bit unsigned integer can hold @stable ICU 2.0 */ +# define UINT32_MAX ((uint32_t)(4294967295U)) +#endif + +#if defined(U_INT64_T_UNAVAILABLE) +# error int64_t is required for decimal format and rule-based number format. +#else +# ifndef INT64_C +/** + * Provides a platform independent way to specify a signed 64-bit integer constant. + * note: may be wrong for some 64 bit platforms - ensure your compiler provides INT64_C + * @stable ICU 2.8 + */ +# define INT64_C(c) c ## LL +# endif +# ifndef UINT64_C +/** + * Provides a platform independent way to specify an unsigned 64-bit integer constant. + * note: may be wrong for some 64 bit platforms - ensure your compiler provides UINT64_C + * @stable ICU 2.8 + */ +# define UINT64_C(c) c ## ULL +# endif +# ifndef U_INT64_MIN +/** The smallest value a 64 bit signed integer can hold @stable ICU 2.8 */ +# define U_INT64_MIN ((int64_t)(INT64_C(-9223372036854775807)-1)) +# endif +# ifndef U_INT64_MAX +/** The largest value a 64 bit signed integer can hold @stable ICU 2.8 */ +# define U_INT64_MAX ((int64_t)(INT64_C(9223372036854775807))) +# endif +# ifndef U_UINT64_MAX +/** The largest value a 64 bit unsigned integer can hold @stable ICU 2.8 */ +# define U_UINT64_MAX ((uint64_t)(UINT64_C(18446744073709551615))) +# endif +#endif + +/*==========================================================================*/ +/* Boolean data type */ +/*==========================================================================*/ + +/** The ICU boolean type @stable ICU 2.0 */ +typedef int8_t UBool; + +#ifndef TRUE +/** The TRUE value of a UBool @stable ICU 2.0 */ +# define TRUE 1 +#endif +#ifndef FALSE +/** The FALSE value of a UBool @stable ICU 2.0 */ +# define FALSE 0 +#endif + + +/*==========================================================================*/ +/* Unicode data types */ +/*==========================================================================*/ + +/* wchar_t-related definitions -------------------------------------------- */ + +/* + * \def U_WCHAR_IS_UTF16 + * Defined if wchar_t uses UTF-16. + * + * @stable ICU 2.0 + */ +/* + * \def U_WCHAR_IS_UTF32 + * Defined if wchar_t uses UTF-32. + * + * @stable ICU 2.0 + */ +#if !defined(U_WCHAR_IS_UTF16) && !defined(U_WCHAR_IS_UTF32) +# ifdef __STDC_ISO_10646__ +# if (U_SIZEOF_WCHAR_T==2) +# define U_WCHAR_IS_UTF16 +# elif (U_SIZEOF_WCHAR_T==4) +# define U_WCHAR_IS_UTF32 +# endif +# elif defined __UCS2__ +# if (U_PF_OS390 <= U_PLATFORM && U_PLATFORM <= U_PF_OS400) && (U_SIZEOF_WCHAR_T==2) +# define U_WCHAR_IS_UTF16 +# endif +# elif defined(__UCS4__) || (U_PLATFORM == U_PF_OS400 && defined(__UTF32__)) +# if (U_SIZEOF_WCHAR_T==4) +# define U_WCHAR_IS_UTF32 +# endif +# elif U_PLATFORM_IS_DARWIN_BASED || (U_SIZEOF_WCHAR_T==4 && U_PLATFORM_IS_LINUX_BASED) +# define U_WCHAR_IS_UTF32 +# elif U_PLATFORM_HAS_WIN32_API +# define U_WCHAR_IS_UTF16 +# endif +#endif + +/* UChar and UChar32 definitions -------------------------------------------- */ + +/** Number of bytes in a UChar. @stable ICU 2.0 */ +#define U_SIZEOF_UCHAR 2 + +/** + * \var UChar + * Define UChar to be UCHAR_TYPE, if that is #defined (for example, to char16_t), + * or wchar_t if that is 16 bits wide; always assumed to be unsigned. + * If neither is available, then define UChar to be uint16_t. + * + * This makes the definition of UChar platform-dependent + * but allows direct string type compatibility with platforms with + * 16-bit wchar_t types. + * + * @stable ICU 4.4 + */ +#if defined(UCHAR_TYPE) + typedef UCHAR_TYPE UChar; +/* Not #elif U_HAVE_CHAR16_T -- because that is type-incompatible with pre-C++11 callers + typedef char16_t UChar; */ +#elif U_SIZEOF_WCHAR_T==2 + typedef wchar_t UChar; +#elif defined(__CHAR16_TYPE__) + typedef __CHAR16_TYPE__ UChar; +#else + typedef uint16_t UChar; +#endif + +/** + * Define UChar32 as a type for single Unicode code points. + * UChar32 is a signed 32-bit integer (same as int32_t). + * + * The Unicode code point range is 0..0x10ffff. + * All other values (negative or >=0x110000) are illegal as Unicode code points. + * They may be used as sentinel values to indicate "done", "error" + * or similar non-code point conditions. + * + * Before ICU 2.4 (Jitterbug 2146), UChar32 was defined + * to be wchar_t if that is 32 bits wide (wchar_t may be signed or unsigned) + * or else to be uint32_t. + * That is, the definition of UChar32 was platform-dependent. + * + * @see U_SENTINEL + * @stable ICU 2.4 + */ +typedef int32_t UChar32; + +/** + * This value is intended for sentinel values for APIs that + * (take or) return single code points (UChar32). + * It is outside of the Unicode code point range 0..0x10ffff. + * + * For example, a "done" or "error" value in a new API + * could be indicated with U_SENTINEL. + * + * ICU APIs designed before ICU 2.4 usually define service-specific "done" + * values, mostly 0xffff. + * Those may need to be distinguished from + * actual U+ffff text contents by calling functions like + * CharacterIterator::hasNext() or UnicodeString::length(). + * + * @return -1 + * @see UChar32 + * @stable ICU 2.4 + */ +#define U_SENTINEL (-1) + +#include "unicode/urename.h" + +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/unistr.h b/src/core/basetypes/icu-ucsdet/include/unicode/unistr.h new file mode 100644 index 00000000..c6e8b446 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/unistr.h @@ -0,0 +1,4470 @@ +/* +********************************************************************** +* Copyright (C) 1998-2013, International Business Machines +* Corporation and others. All Rights Reserved. +********************************************************************** +* +* File unistr.h +* +* Modification History: +* +* Date Name Description +* 09/25/98 stephen Creation. +* 11/11/98 stephen Changed per 11/9 code review. +* 04/20/99 stephen Overhauled per 4/16 code review. +* 11/18/99 aliu Made to inherit from Replaceable. Added method +* handleReplaceBetween(); other methods unchanged. +* 06/25/01 grhoten Remove dependency on iostream. +****************************************************************************** +*/ + +#ifndef UNISTR_H +#define UNISTR_H + +/** + * \file + * \brief C++ API: Unicode String + */ + +#include "unicode/utypes.h" +#include "unicode/rep.h" +#include "unicode/std_string.h" +#include "unicode/stringpiece.h" +#include "unicode/bytestream.h" +#include "unicode/ucasemap.h" + +struct UConverter; // unicode/ucnv.h +class StringThreadTest; + +#ifndef U_COMPARE_CODE_POINT_ORDER +/* see also ustring.h and unorm.h */ +/** + * Option bit for u_strCaseCompare, u_strcasecmp, unorm_compare, etc: + * Compare strings in code point order instead of code unit order. + * @stable ICU 2.2 + */ +#define U_COMPARE_CODE_POINT_ORDER 0x8000 +#endif + +#ifndef USTRING_H +/** + * \ingroup ustring_ustrlen + */ +U_STABLE int32_t U_EXPORT2 +u_strlen(const UChar *s); +#endif + +/** + * \def U_STRING_CASE_MAPPER_DEFINED + * @internal + */ +#ifndef U_STRING_CASE_MAPPER_DEFINED +#define U_STRING_CASE_MAPPER_DEFINED + +/** + * Internal string case mapping function type. + * @internal + */ +typedef int32_t U_CALLCONV +UStringCaseMapper(const UCaseMap *csm, + UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + UErrorCode *pErrorCode); + +#endif + +U_NAMESPACE_BEGIN + +class BreakIterator; // unicode/brkiter.h +class Locale; // unicode/locid.h +class StringCharacterIterator; +class UnicodeStringAppendable; // unicode/appendable.h + +/* The <iostream> include has been moved to unicode/ustream.h */ + +/** + * Constant to be used in the UnicodeString(char *, int32_t, EInvariant) constructor + * which constructs a Unicode string from an invariant-character char * string. + * About invariant characters see utypes.h. + * This constructor has no runtime dependency on conversion code and is + * therefore recommended over ones taking a charset name string + * (where the empty string "" indicates invariant-character conversion). + * + * @stable ICU 3.2 + */ +#define US_INV icu::UnicodeString::kInvariant + +/** + * Unicode String literals in C++. + * Dependent on the platform properties, different UnicodeString + * constructors should be used to create a UnicodeString object from + * a string literal. + * The macros are defined for maximum performance. + * They work only for strings that contain "invariant characters", i.e., + * only latin letters, digits, and some punctuation. + * See utypes.h for details. + * + * The string parameter must be a C string literal. + * The length of the string, not including the terminating + * <code>NUL</code>, must be specified as a constant. + * The U_STRING_DECL macro should be invoked exactly once for one + * such string variable before it is used. + * @stable ICU 2.0 + */ +#if defined(U_DECLARE_UTF16) +# define UNICODE_STRING(cs, _length) icu::UnicodeString(TRUE, (const UChar *)U_DECLARE_UTF16(cs), _length) +#elif U_SIZEOF_WCHAR_T==U_SIZEOF_UCHAR && (U_CHARSET_FAMILY==U_ASCII_FAMILY || (U_SIZEOF_UCHAR == 2 && defined(U_WCHAR_IS_UTF16))) +# define UNICODE_STRING(cs, _length) icu::UnicodeString(TRUE, (const UChar *)L ## cs, _length) +#elif U_SIZEOF_UCHAR==1 && U_CHARSET_FAMILY==U_ASCII_FAMILY +# define UNICODE_STRING(cs, _length) icu::UnicodeString(TRUE, (const UChar *)cs, _length) +#else +# define UNICODE_STRING(cs, _length) icu::UnicodeString(cs, _length, US_INV) +#endif + +/** + * Unicode String literals in C++. + * Dependent on the platform properties, different UnicodeString + * constructors should be used to create a UnicodeString object from + * a string literal. + * The macros are defined for improved performance. + * They work only for strings that contain "invariant characters", i.e., + * only latin letters, digits, and some punctuation. + * See utypes.h for details. + * + * The string parameter must be a C string literal. + * @stable ICU 2.0 + */ +#define UNICODE_STRING_SIMPLE(cs) UNICODE_STRING(cs, -1) + +/** + * \def UNISTR_FROM_CHAR_EXPLICIT + * This can be defined to be empty or "explicit". + * If explicit, then the UnicodeString(UChar) and UnicodeString(UChar32) + * constructors are marked as explicit, preventing their inadvertent use. + * @stable ICU 49 + */ +#ifndef UNISTR_FROM_CHAR_EXPLICIT +# if defined(U_COMBINED_IMPLEMENTATION) || defined(U_COMMON_IMPLEMENTATION) || defined(U_I18N_IMPLEMENTATION) || defined(U_IO_IMPLEMENTATION) + // Auto-"explicit" in ICU library code. +# define UNISTR_FROM_CHAR_EXPLICIT explicit +# else + // Empty by default for source code compatibility. +# define UNISTR_FROM_CHAR_EXPLICIT +# endif +#endif + +/** + * \def UNISTR_FROM_STRING_EXPLICIT + * This can be defined to be empty or "explicit". + * If explicit, then the UnicodeString(const char *) and UnicodeString(const UChar *) + * constructors are marked as explicit, preventing their inadvertent use. + * + * In particular, this helps prevent accidentally depending on ICU conversion code + * by passing a string literal into an API with a const UnicodeString & parameter. + * @stable ICU 49 + */ +#ifndef UNISTR_FROM_STRING_EXPLICIT +# if defined(U_COMBINED_IMPLEMENTATION) || defined(U_COMMON_IMPLEMENTATION) || defined(U_I18N_IMPLEMENTATION) || defined(U_IO_IMPLEMENTATION) + // Auto-"explicit" in ICU library code. +# define UNISTR_FROM_STRING_EXPLICIT explicit +# else + // Empty by default for source code compatibility. +# define UNISTR_FROM_STRING_EXPLICIT +# endif +#endif + +/** + * UnicodeString is a string class that stores Unicode characters directly and provides + * similar functionality as the Java String and StringBuffer classes. + * It is a concrete implementation of the abstract class Replaceable (for transliteration). + * + * The UnicodeString class is not suitable for subclassing. + * + * <p>For an overview of Unicode strings in C and C++ see the + * <a href="http://icu-project.org/userguide/strings.html">User Guide Strings chapter</a>.</p> + * + * <p>In ICU, a Unicode string consists of 16-bit Unicode <em>code units</em>. + * A Unicode character may be stored with either one code unit + * (the most common case) or with a matched pair of special code units + * ("surrogates"). The data type for code units is UChar. + * For single-character handling, a Unicode character code <em>point</em> is a value + * in the range 0..0x10ffff. ICU uses the UChar32 type for code points.</p> + * + * <p>Indexes and offsets into and lengths of strings always count code units, not code points. + * This is the same as with multi-byte char* strings in traditional string handling. + * Operations on partial strings typically do not test for code point boundaries. + * If necessary, the user needs to take care of such boundaries by testing for the code unit + * values or by using functions like + * UnicodeString::getChar32Start() and UnicodeString::getChar32Limit() + * (or, in C, the equivalent macros U16_SET_CP_START() and U16_SET_CP_LIMIT(), see utf.h).</p> + * + * UnicodeString methods are more lenient with regard to input parameter values + * than other ICU APIs. In particular: + * - If indexes are out of bounds for a UnicodeString object + * (<0 or >length()) then they are "pinned" to the nearest boundary. + * - If primitive string pointer values (e.g., const UChar * or char *) + * for input strings are NULL, then those input string parameters are treated + * as if they pointed to an empty string. + * However, this is <em>not</em> the case for char * parameters for charset names + * or other IDs. + * - Most UnicodeString methods do not take a UErrorCode parameter because + * there are usually very few opportunities for failure other than a shortage + * of memory, error codes in low-level C++ string methods would be inconvenient, + * and the error code as the last parameter (ICU convention) would prevent + * the use of default parameter values. + * Instead, such methods set the UnicodeString into a "bogus" state + * (see isBogus()) if an error occurs. + * + * In string comparisons, two UnicodeString objects that are both "bogus" + * compare equal (to be transitive and prevent endless loops in sorting), + * and a "bogus" string compares less than any non-"bogus" one. + * + * Const UnicodeString methods are thread-safe. Multiple threads can use + * const methods on the same UnicodeString object simultaneously, + * but non-const methods must not be called concurrently (in multiple threads) + * with any other (const or non-const) methods. + * + * Similarly, const UnicodeString & parameters are thread-safe. + * One object may be passed in as such a parameter concurrently in multiple threads. + * This includes the const UnicodeString & parameters for + * copy construction, assignment, and cloning. + * + * <p>UnicodeString uses several storage methods. + * String contents can be stored inside the UnicodeString object itself, + * in an allocated and shared buffer, or in an outside buffer that is "aliased". + * Most of this is done transparently, but careful aliasing in particular provides + * significant performance improvements. + * Also, the internal buffer is accessible via special functions. + * For details see the + * <a href="http://icu-project.org/userguide/strings.html">User Guide Strings chapter</a>.</p> + * + * @see utf.h + * @see CharacterIterator + * @stable ICU 2.0 + */ +class U_COMMON_API UnicodeString : public Replaceable +{ +public: + + /** + * Constant to be used in the UnicodeString(char *, int32_t, EInvariant) constructor + * which constructs a Unicode string from an invariant-character char * string. + * Use the macro US_INV instead of the full qualification for this value. + * + * @see US_INV + * @stable ICU 3.2 + */ + enum EInvariant { + /** + * @see EInvariant + * @stable ICU 3.2 + */ + kInvariant + }; + + //======================================== + // Read-only operations + //======================================== + + /* Comparison - bitwise only - for international comparison use collation */ + + /** + * Equality operator. Performs only bitwise comparison. + * @param text The UnicodeString to compare to this one. + * @return TRUE if <TT>text</TT> contains the same characters as this one, + * FALSE otherwise. + * @stable ICU 2.0 + */ + inline UBool operator== (const UnicodeString& text) const; + + /** + * Inequality operator. Performs only bitwise comparison. + * @param text The UnicodeString to compare to this one. + * @return FALSE if <TT>text</TT> contains the same characters as this one, + * TRUE otherwise. + * @stable ICU 2.0 + */ + inline UBool operator!= (const UnicodeString& text) const; + + /** + * Greater than operator. Performs only bitwise comparison. + * @param text The UnicodeString to compare to this one. + * @return TRUE if the characters in this are bitwise + * greater than the characters in <code>text</code>, FALSE otherwise + * @stable ICU 2.0 + */ + inline UBool operator> (const UnicodeString& text) const; + + /** + * Less than operator. Performs only bitwise comparison. + * @param text The UnicodeString to compare to this one. + * @return TRUE if the characters in this are bitwise + * less than the characters in <code>text</code>, FALSE otherwise + * @stable ICU 2.0 + */ + inline UBool operator< (const UnicodeString& text) const; + + /** + * Greater than or equal operator. Performs only bitwise comparison. + * @param text The UnicodeString to compare to this one. + * @return TRUE if the characters in this are bitwise + * greater than or equal to the characters in <code>text</code>, FALSE otherwise + * @stable ICU 2.0 + */ + inline UBool operator>= (const UnicodeString& text) const; + + /** + * Less than or equal operator. Performs only bitwise comparison. + * @param text The UnicodeString to compare to this one. + * @return TRUE if the characters in this are bitwise + * less than or equal to the characters in <code>text</code>, FALSE otherwise + * @stable ICU 2.0 + */ + inline UBool operator<= (const UnicodeString& text) const; + + /** + * Compare the characters bitwise in this UnicodeString to + * the characters in <code>text</code>. + * @param text The UnicodeString to compare to this one. + * @return The result of bitwise character comparison: 0 if this + * contains the same characters as <code>text</code>, -1 if the characters in + * this are bitwise less than the characters in <code>text</code>, +1 if the + * characters in this are bitwise greater than the characters + * in <code>text</code>. + * @stable ICU 2.0 + */ + inline int8_t compare(const UnicodeString& text) const; + + /** + * Compare the characters bitwise in the range + * [<TT>start</TT>, <TT>start + length</TT>) with the characters + * in the <b>entire string</b> <TT>text</TT>. + * (The parameters "start" and "length" are not applied to the other text "text".) + * @param start the offset at which the compare operation begins + * @param length the number of characters of text to compare. + * @param text the other text to be compared against this string. + * @return The result of bitwise character comparison: 0 if this + * contains the same characters as <code>text</code>, -1 if the characters in + * this are bitwise less than the characters in <code>text</code>, +1 if the + * characters in this are bitwise greater than the characters + * in <code>text</code>. + * @stable ICU 2.0 + */ + inline int8_t compare(int32_t start, + int32_t length, + const UnicodeString& text) const; + + /** + * Compare the characters bitwise in the range + * [<TT>start</TT>, <TT>start + length</TT>) with the characters + * in <TT>srcText</TT> in the range + * [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>). + * @param start the offset at which the compare operation begins + * @param length the number of characters in this to compare. + * @param srcText the text to be compared + * @param srcStart the offset into <TT>srcText</TT> to start comparison + * @param srcLength the number of characters in <TT>src</TT> to compare + * @return The result of bitwise character comparison: 0 if this + * contains the same characters as <code>srcText</code>, -1 if the characters in + * this are bitwise less than the characters in <code>srcText</code>, +1 if the + * characters in this are bitwise greater than the characters + * in <code>srcText</code>. + * @stable ICU 2.0 + */ + inline int8_t compare(int32_t start, + int32_t length, + const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLength) const; + + /** + * Compare the characters bitwise in this UnicodeString with the first + * <TT>srcLength</TT> characters in <TT>srcChars</TT>. + * @param srcChars The characters to compare to this UnicodeString. + * @param srcLength the number of characters in <TT>srcChars</TT> to compare + * @return The result of bitwise character comparison: 0 if this + * contains the same characters as <code>srcChars</code>, -1 if the characters in + * this are bitwise less than the characters in <code>srcChars</code>, +1 if the + * characters in this are bitwise greater than the characters + * in <code>srcChars</code>. + * @stable ICU 2.0 + */ + inline int8_t compare(const UChar *srcChars, + int32_t srcLength) const; + + /** + * Compare the characters bitwise in the range + * [<TT>start</TT>, <TT>start + length</TT>) with the first + * <TT>length</TT> characters in <TT>srcChars</TT> + * @param start the offset at which the compare operation begins + * @param length the number of characters to compare. + * @param srcChars the characters to be compared + * @return The result of bitwise character comparison: 0 if this + * contains the same characters as <code>srcChars</code>, -1 if the characters in + * this are bitwise less than the characters in <code>srcChars</code>, +1 if the + * characters in this are bitwise greater than the characters + * in <code>srcChars</code>. + * @stable ICU 2.0 + */ + inline int8_t compare(int32_t start, + int32_t length, + const UChar *srcChars) const; + + /** + * Compare the characters bitwise in the range + * [<TT>start</TT>, <TT>start + length</TT>) with the characters + * in <TT>srcChars</TT> in the range + * [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>). + * @param start the offset at which the compare operation begins + * @param length the number of characters in this to compare + * @param srcChars the characters to be compared + * @param srcStart the offset into <TT>srcChars</TT> to start comparison + * @param srcLength the number of characters in <TT>srcChars</TT> to compare + * @return The result of bitwise character comparison: 0 if this + * contains the same characters as <code>srcChars</code>, -1 if the characters in + * this are bitwise less than the characters in <code>srcChars</code>, +1 if the + * characters in this are bitwise greater than the characters + * in <code>srcChars</code>. + * @stable ICU 2.0 + */ + inline int8_t compare(int32_t start, + int32_t length, + const UChar *srcChars, + int32_t srcStart, + int32_t srcLength) const; + + /** + * Compare the characters bitwise in the range + * [<TT>start</TT>, <TT>limit</TT>) with the characters + * in <TT>srcText</TT> in the range + * [<TT>srcStart</TT>, <TT>srcLimit</TT>). + * @param start the offset at which the compare operation begins + * @param limit the offset immediately following the compare operation + * @param srcText the text to be compared + * @param srcStart the offset into <TT>srcText</TT> to start comparison + * @param srcLimit the offset into <TT>srcText</TT> to limit comparison + * @return The result of bitwise character comparison: 0 if this + * contains the same characters as <code>srcText</code>, -1 if the characters in + * this are bitwise less than the characters in <code>srcText</code>, +1 if the + * characters in this are bitwise greater than the characters + * in <code>srcText</code>. + * @stable ICU 2.0 + */ + inline int8_t compareBetween(int32_t start, + int32_t limit, + const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLimit) const; + + /** + * Compare two Unicode strings in code point order. + * The result may be different from the results of compare(), operator<, etc. + * if supplementary characters are present: + * + * In UTF-16, supplementary characters (with code points U+10000 and above) are + * stored with pairs of surrogate code units. These have values from 0xd800 to 0xdfff, + * which means that they compare as less than some other BMP characters like U+feff. + * This function compares Unicode strings in code point order. + * If either of the UTF-16 strings is malformed (i.e., it contains unpaired surrogates), then the result is not defined. + * + * @param text Another string to compare this one to. + * @return a negative/zero/positive integer corresponding to whether + * this string is less than/equal to/greater than the second one + * in code point order + * @stable ICU 2.0 + */ + inline int8_t compareCodePointOrder(const UnicodeString& text) const; + + /** + * Compare two Unicode strings in code point order. + * The result may be different from the results of compare(), operator<, etc. + * if supplementary characters are present: + * + * In UTF-16, supplementary characters (with code points U+10000 and above) are + * stored with pairs of surrogate code units. These have values from 0xd800 to 0xdfff, + * which means that they compare as less than some other BMP characters like U+feff. + * This function compares Unicode strings in code point order. + * If either of the UTF-16 strings is malformed (i.e., it contains unpaired surrogates), then the result is not defined. + * + * @param start The start offset in this string at which the compare operation begins. + * @param length The number of code units from this string to compare. + * @param srcText Another string to compare this one to. + * @return a negative/zero/positive integer corresponding to whether + * this string is less than/equal to/greater than the second one + * in code point order + * @stable ICU 2.0 + */ + inline int8_t compareCodePointOrder(int32_t start, + int32_t length, + const UnicodeString& srcText) const; + + /** + * Compare two Unicode strings in code point order. + * The result may be different from the results of compare(), operator<, etc. + * if supplementary characters are present: + * + * In UTF-16, supplementary characters (with code points U+10000 and above) are + * stored with pairs of surrogate code units. These have values from 0xd800 to 0xdfff, + * which means that they compare as less than some other BMP characters like U+feff. + * This function compares Unicode strings in code point order. + * If either of the UTF-16 strings is malformed (i.e., it contains unpaired surrogates), then the result is not defined. + * + * @param start The start offset in this string at which the compare operation begins. + * @param length The number of code units from this string to compare. + * @param srcText Another string to compare this one to. + * @param srcStart The start offset in that string at which the compare operation begins. + * @param srcLength The number of code units from that string to compare. + * @return a negative/zero/positive integer corresponding to whether + * this string is less than/equal to/greater than the second one + * in code point order + * @stable ICU 2.0 + */ + inline int8_t compareCodePointOrder(int32_t start, + int32_t length, + const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLength) const; + + /** + * Compare two Unicode strings in code point order. + * The result may be different from the results of compare(), operator<, etc. + * if supplementary characters are present: + * + * In UTF-16, supplementary characters (with code points U+10000 and above) are + * stored with pairs of surrogate code units. These have values from 0xd800 to 0xdfff, + * which means that they compare as less than some other BMP characters like U+feff. + * This function compares Unicode strings in code point order. + * If either of the UTF-16 strings is malformed (i.e., it contains unpaired surrogates), then the result is not defined. + * + * @param srcChars A pointer to another string to compare this one to. + * @param srcLength The number of code units from that string to compare. + * @return a negative/zero/positive integer corresponding to whether + * this string is less than/equal to/greater than the second one + * in code point order + * @stable ICU 2.0 + */ + inline int8_t compareCodePointOrder(const UChar *srcChars, + int32_t srcLength) const; + + /** + * Compare two Unicode strings in code point order. + * The result may be different from the results of compare(), operator<, etc. + * if supplementary characters are present: + * + * In UTF-16, supplementary characters (with code points U+10000 and above) are + * stored with pairs of surrogate code units. These have values from 0xd800 to 0xdfff, + * which means that they compare as less than some other BMP characters like U+feff. + * This function compares Unicode strings in code point order. + * If either of the UTF-16 strings is malformed (i.e., it contains unpaired surrogates), then the result is not defined. + * + * @param start The start offset in this string at which the compare operation begins. + * @param length The number of code units from this string to compare. + * @param srcChars A pointer to another string to compare this one to. + * @return a negative/zero/positive integer corresponding to whether + * this string is less than/equal to/greater than the second one + * in code point order + * @stable ICU 2.0 + */ + inline int8_t compareCodePointOrder(int32_t start, + int32_t length, + const UChar *srcChars) const; + + /** + * Compare two Unicode strings in code point order. + * The result may be different from the results of compare(), operator<, etc. + * if supplementary characters are present: + * + * In UTF-16, supplementary characters (with code points U+10000 and above) are + * stored with pairs of surrogate code units. These have values from 0xd800 to 0xdfff, + * which means that they compare as less than some other BMP characters like U+feff. + * This function compares Unicode strings in code point order. + * If either of the UTF-16 strings is malformed (i.e., it contains unpaired surrogates), then the result is not defined. + * + * @param start The start offset in this string at which the compare operation begins. + * @param length The number of code units from this string to compare. + * @param srcChars A pointer to another string to compare this one to. + * @param srcStart The start offset in that string at which the compare operation begins. + * @param srcLength The number of code units from that string to compare. + * @return a negative/zero/positive integer corresponding to whether + * this string is less than/equal to/greater than the second one + * in code point order + * @stable ICU 2.0 + */ + inline int8_t compareCodePointOrder(int32_t start, + int32_t length, + const UChar *srcChars, + int32_t srcStart, + int32_t srcLength) const; + + /** + * Compare two Unicode strings in code point order. + * The result may be different from the results of compare(), operator<, etc. + * if supplementary characters are present: + * + * In UTF-16, supplementary characters (with code points U+10000 and above) are + * stored with pairs of surrogate code units. These have values from 0xd800 to 0xdfff, + * which means that they compare as less than some other BMP characters like U+feff. + * This function compares Unicode strings in code point order. + * If either of the UTF-16 strings is malformed (i.e., it contains unpaired surrogates), then the result is not defined. + * + * @param start The start offset in this string at which the compare operation begins. + * @param limit The offset after the last code unit from this string to compare. + * @param srcText Another string to compare this one to. + * @param srcStart The start offset in that string at which the compare operation begins. + * @param srcLimit The offset after the last code unit from that string to compare. + * @return a negative/zero/positive integer corresponding to whether + * this string is less than/equal to/greater than the second one + * in code point order + * @stable ICU 2.0 + */ + inline int8_t compareCodePointOrderBetween(int32_t start, + int32_t limit, + const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLimit) const; + + /** + * Compare two strings case-insensitively using full case folding. + * This is equivalent to this->foldCase(options).compare(text.foldCase(options)). + * + * @param text Another string to compare this one to. + * @param options A bit set of options: + * - U_FOLD_CASE_DEFAULT or 0 is used for default options: + * Comparison in code unit order with default case folding. + * + * - U_COMPARE_CODE_POINT_ORDER + * Set to choose code point order instead of code unit order + * (see u_strCompare for details). + * + * - U_FOLD_CASE_EXCLUDE_SPECIAL_I + * + * @return A negative, zero, or positive integer indicating the comparison result. + * @stable ICU 2.0 + */ + inline int8_t caseCompare(const UnicodeString& text, uint32_t options) const; + + /** + * Compare two strings case-insensitively using full case folding. + * This is equivalent to this->foldCase(options).compare(srcText.foldCase(options)). + * + * @param start The start offset in this string at which the compare operation begins. + * @param length The number of code units from this string to compare. + * @param srcText Another string to compare this one to. + * @param options A bit set of options: + * - U_FOLD_CASE_DEFAULT or 0 is used for default options: + * Comparison in code unit order with default case folding. + * + * - U_COMPARE_CODE_POINT_ORDER + * Set to choose code point order instead of code unit order + * (see u_strCompare for details). + * + * - U_FOLD_CASE_EXCLUDE_SPECIAL_I + * + * @return A negative, zero, or positive integer indicating the comparison result. + * @stable ICU 2.0 + */ + inline int8_t caseCompare(int32_t start, + int32_t length, + const UnicodeString& srcText, + uint32_t options) const; + + /** + * Compare two strings case-insensitively using full case folding. + * This is equivalent to this->foldCase(options).compare(srcText.foldCase(options)). + * + * @param start The start offset in this string at which the compare operation begins. + * @param length The number of code units from this string to compare. + * @param srcText Another string to compare this one to. + * @param srcStart The start offset in that string at which the compare operation begins. + * @param srcLength The number of code units from that string to compare. + * @param options A bit set of options: + * - U_FOLD_CASE_DEFAULT or 0 is used for default options: + * Comparison in code unit order with default case folding. + * + * - U_COMPARE_CODE_POINT_ORDER + * Set to choose code point order instead of code unit order + * (see u_strCompare for details). + * + * - U_FOLD_CASE_EXCLUDE_SPECIAL_I + * + * @return A negative, zero, or positive integer indicating the comparison result. + * @stable ICU 2.0 + */ + inline int8_t caseCompare(int32_t start, + int32_t length, + const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLength, + uint32_t options) const; + + /** + * Compare two strings case-insensitively using full case folding. + * This is equivalent to this->foldCase(options).compare(srcChars.foldCase(options)). + * + * @param srcChars A pointer to another string to compare this one to. + * @param srcLength The number of code units from that string to compare. + * @param options A bit set of options: + * - U_FOLD_CASE_DEFAULT or 0 is used for default options: + * Comparison in code unit order with default case folding. + * + * - U_COMPARE_CODE_POINT_ORDER + * Set to choose code point order instead of code unit order + * (see u_strCompare for details). + * + * - U_FOLD_CASE_EXCLUDE_SPECIAL_I + * + * @return A negative, zero, or positive integer indicating the comparison result. + * @stable ICU 2.0 + */ + inline int8_t caseCompare(const UChar *srcChars, + int32_t srcLength, + uint32_t options) const; + + /** + * Compare two strings case-insensitively using full case folding. + * This is equivalent to this->foldCase(options).compare(srcChars.foldCase(options)). + * + * @param start The start offset in this string at which the compare operation begins. + * @param length The number of code units from this string to compare. + * @param srcChars A pointer to another string to compare this one to. + * @param options A bit set of options: + * - U_FOLD_CASE_DEFAULT or 0 is used for default options: + * Comparison in code unit order with default case folding. + * + * - U_COMPARE_CODE_POINT_ORDER + * Set to choose code point order instead of code unit order + * (see u_strCompare for details). + * + * - U_FOLD_CASE_EXCLUDE_SPECIAL_I + * + * @return A negative, zero, or positive integer indicating the comparison result. + * @stable ICU 2.0 + */ + inline int8_t caseCompare(int32_t start, + int32_t length, + const UChar *srcChars, + uint32_t options) const; + + /** + * Compare two strings case-insensitively using full case folding. + * This is equivalent to this->foldCase(options).compare(srcChars.foldCase(options)). + * + * @param start The start offset in this string at which the compare operation begins. + * @param length The number of code units from this string to compare. + * @param srcChars A pointer to another string to compare this one to. + * @param srcStart The start offset in that string at which the compare operation begins. + * @param srcLength The number of code units from that string to compare. + * @param options A bit set of options: + * - U_FOLD_CASE_DEFAULT or 0 is used for default options: + * Comparison in code unit order with default case folding. + * + * - U_COMPARE_CODE_POINT_ORDER + * Set to choose code point order instead of code unit order + * (see u_strCompare for details). + * + * - U_FOLD_CASE_EXCLUDE_SPECIAL_I + * + * @return A negative, zero, or positive integer indicating the comparison result. + * @stable ICU 2.0 + */ + inline int8_t caseCompare(int32_t start, + int32_t length, + const UChar *srcChars, + int32_t srcStart, + int32_t srcLength, + uint32_t options) const; + + /** + * Compare two strings case-insensitively using full case folding. + * This is equivalent to this->foldCase(options).compareBetween(text.foldCase(options)). + * + * @param start The start offset in this string at which the compare operation begins. + * @param limit The offset after the last code unit from this string to compare. + * @param srcText Another string to compare this one to. + * @param srcStart The start offset in that string at which the compare operation begins. + * @param srcLimit The offset after the last code unit from that string to compare. + * @param options A bit set of options: + * - U_FOLD_CASE_DEFAULT or 0 is used for default options: + * Comparison in code unit order with default case folding. + * + * - U_COMPARE_CODE_POINT_ORDER + * Set to choose code point order instead of code unit order + * (see u_strCompare for details). + * + * - U_FOLD_CASE_EXCLUDE_SPECIAL_I + * + * @return A negative, zero, or positive integer indicating the comparison result. + * @stable ICU 2.0 + */ + inline int8_t caseCompareBetween(int32_t start, + int32_t limit, + const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLimit, + uint32_t options) const; + + /** + * Determine if this starts with the characters in <TT>text</TT> + * @param text The text to match. + * @return TRUE if this starts with the characters in <TT>text</TT>, + * FALSE otherwise + * @stable ICU 2.0 + */ + inline UBool startsWith(const UnicodeString& text) const; + + /** + * Determine if this starts with the characters in <TT>srcText</TT> + * in the range [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>). + * @param srcText The text to match. + * @param srcStart the offset into <TT>srcText</TT> to start matching + * @param srcLength the number of characters in <TT>srcText</TT> to match + * @return TRUE if this starts with the characters in <TT>text</TT>, + * FALSE otherwise + * @stable ICU 2.0 + */ + inline UBool startsWith(const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLength) const; + + /** + * Determine if this starts with the characters in <TT>srcChars</TT> + * @param srcChars The characters to match. + * @param srcLength the number of characters in <TT>srcChars</TT> + * @return TRUE if this starts with the characters in <TT>srcChars</TT>, + * FALSE otherwise + * @stable ICU 2.0 + */ + inline UBool startsWith(const UChar *srcChars, + int32_t srcLength) const; + + /** + * Determine if this ends with the characters in <TT>srcChars</TT> + * in the range [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>). + * @param srcChars The characters to match. + * @param srcStart the offset into <TT>srcText</TT> to start matching + * @param srcLength the number of characters in <TT>srcChars</TT> to match + * @return TRUE if this ends with the characters in <TT>srcChars</TT>, FALSE otherwise + * @stable ICU 2.0 + */ + inline UBool startsWith(const UChar *srcChars, + int32_t srcStart, + int32_t srcLength) const; + + /** + * Determine if this ends with the characters in <TT>text</TT> + * @param text The text to match. + * @return TRUE if this ends with the characters in <TT>text</TT>, + * FALSE otherwise + * @stable ICU 2.0 + */ + inline UBool endsWith(const UnicodeString& text) const; + + /** + * Determine if this ends with the characters in <TT>srcText</TT> + * in the range [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>). + * @param srcText The text to match. + * @param srcStart the offset into <TT>srcText</TT> to start matching + * @param srcLength the number of characters in <TT>srcText</TT> to match + * @return TRUE if this ends with the characters in <TT>text</TT>, + * FALSE otherwise + * @stable ICU 2.0 + */ + inline UBool endsWith(const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLength) const; + + /** + * Determine if this ends with the characters in <TT>srcChars</TT> + * @param srcChars The characters to match. + * @param srcLength the number of characters in <TT>srcChars</TT> + * @return TRUE if this ends with the characters in <TT>srcChars</TT>, + * FALSE otherwise + * @stable ICU 2.0 + */ + inline UBool endsWith(const UChar *srcChars, + int32_t srcLength) const; + + /** + * Determine if this ends with the characters in <TT>srcChars</TT> + * in the range [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>). + * @param srcChars The characters to match. + * @param srcStart the offset into <TT>srcText</TT> to start matching + * @param srcLength the number of characters in <TT>srcChars</TT> to match + * @return TRUE if this ends with the characters in <TT>srcChars</TT>, + * FALSE otherwise + * @stable ICU 2.0 + */ + inline UBool endsWith(const UChar *srcChars, + int32_t srcStart, + int32_t srcLength) const; + + + /* Searching - bitwise only */ + + /** + * Locate in this the first occurrence of the characters in <TT>text</TT>, + * using bitwise comparison. + * @param text The text to search for. + * @return The offset into this of the start of <TT>text</TT>, + * or -1 if not found. + * @stable ICU 2.0 + */ + inline int32_t indexOf(const UnicodeString& text) const; + + /** + * Locate in this the first occurrence of the characters in <TT>text</TT> + * starting at offset <TT>start</TT>, using bitwise comparison. + * @param text The text to search for. + * @param start The offset at which searching will start. + * @return The offset into this of the start of <TT>text</TT>, + * or -1 if not found. + * @stable ICU 2.0 + */ + inline int32_t indexOf(const UnicodeString& text, + int32_t start) const; + + /** + * Locate in this the first occurrence in the range + * [<TT>start</TT>, <TT>start + length</TT>) of the characters + * in <TT>text</TT>, using bitwise comparison. + * @param text The text to search for. + * @param start The offset at which searching will start. + * @param length The number of characters to search + * @return The offset into this of the start of <TT>text</TT>, + * or -1 if not found. + * @stable ICU 2.0 + */ + inline int32_t indexOf(const UnicodeString& text, + int32_t start, + int32_t length) const; + + /** + * Locate in this the first occurrence in the range + * [<TT>start</TT>, <TT>start + length</TT>) of the characters + * in <TT>srcText</TT> in the range + * [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>), + * using bitwise comparison. + * @param srcText The text to search for. + * @param srcStart the offset into <TT>srcText</TT> at which + * to start matching + * @param srcLength the number of characters in <TT>srcText</TT> to match + * @param start the offset into this at which to start matching + * @param length the number of characters in this to search + * @return The offset into this of the start of <TT>text</TT>, + * or -1 if not found. + * @stable ICU 2.0 + */ + inline int32_t indexOf(const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLength, + int32_t start, + int32_t length) const; + + /** + * Locate in this the first occurrence of the characters in + * <TT>srcChars</TT> + * starting at offset <TT>start</TT>, using bitwise comparison. + * @param srcChars The text to search for. + * @param srcLength the number of characters in <TT>srcChars</TT> to match + * @param start the offset into this at which to start matching + * @return The offset into this of the start of <TT>text</TT>, + * or -1 if not found. + * @stable ICU 2.0 + */ + inline int32_t indexOf(const UChar *srcChars, + int32_t srcLength, + int32_t start) const; + + /** + * Locate in this the first occurrence in the range + * [<TT>start</TT>, <TT>start + length</TT>) of the characters + * in <TT>srcChars</TT>, using bitwise comparison. + * @param srcChars The text to search for. + * @param srcLength the number of characters in <TT>srcChars</TT> + * @param start The offset at which searching will start. + * @param length The number of characters to search + * @return The offset into this of the start of <TT>srcChars</TT>, + * or -1 if not found. + * @stable ICU 2.0 + */ + inline int32_t indexOf(const UChar *srcChars, + int32_t srcLength, + int32_t start, + int32_t length) const; + + /** + * Locate in this the first occurrence in the range + * [<TT>start</TT>, <TT>start + length</TT>) of the characters + * in <TT>srcChars</TT> in the range + * [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>), + * using bitwise comparison. + * @param srcChars The text to search for. + * @param srcStart the offset into <TT>srcChars</TT> at which + * to start matching + * @param srcLength the number of characters in <TT>srcChars</TT> to match + * @param start the offset into this at which to start matching + * @param length the number of characters in this to search + * @return The offset into this of the start of <TT>text</TT>, + * or -1 if not found. + * @stable ICU 2.0 + */ + int32_t indexOf(const UChar *srcChars, + int32_t srcStart, + int32_t srcLength, + int32_t start, + int32_t length) const; + + /** + * Locate in this the first occurrence of the BMP code point <code>c</code>, + * using bitwise comparison. + * @param c The code unit to search for. + * @return The offset into this of <TT>c</TT>, or -1 if not found. + * @stable ICU 2.0 + */ + inline int32_t indexOf(UChar c) const; + + /** + * Locate in this the first occurrence of the code point <TT>c</TT>, + * using bitwise comparison. + * + * @param c The code point to search for. + * @return The offset into this of <TT>c</TT>, or -1 if not found. + * @stable ICU 2.0 + */ + inline int32_t indexOf(UChar32 c) const; + + /** + * Locate in this the first occurrence of the BMP code point <code>c</code>, + * starting at offset <TT>start</TT>, using bitwise comparison. + * @param c The code unit to search for. + * @param start The offset at which searching will start. + * @return The offset into this of <TT>c</TT>, or -1 if not found. + * @stable ICU 2.0 + */ + inline int32_t indexOf(UChar c, + int32_t start) const; + + /** + * Locate in this the first occurrence of the code point <TT>c</TT> + * starting at offset <TT>start</TT>, using bitwise comparison. + * + * @param c The code point to search for. + * @param start The offset at which searching will start. + * @return The offset into this of <TT>c</TT>, or -1 if not found. + * @stable ICU 2.0 + */ + inline int32_t indexOf(UChar32 c, + int32_t start) const; + + /** + * Locate in this the first occurrence of the BMP code point <code>c</code> + * in the range [<TT>start</TT>, <TT>start + length</TT>), + * using bitwise comparison. + * @param c The code unit to search for. + * @param start the offset into this at which to start matching + * @param length the number of characters in this to search + * @return The offset into this of <TT>c</TT>, or -1 if not found. + * @stable ICU 2.0 + */ + inline int32_t indexOf(UChar c, + int32_t start, + int32_t length) const; + + /** + * Locate in this the first occurrence of the code point <TT>c</TT> + * in the range [<TT>start</TT>, <TT>start + length</TT>), + * using bitwise comparison. + * + * @param c The code point to search for. + * @param start the offset into this at which to start matching + * @param length the number of characters in this to search + * @return The offset into this of <TT>c</TT>, or -1 if not found. + * @stable ICU 2.0 + */ + inline int32_t indexOf(UChar32 c, + int32_t start, + int32_t length) const; + + /** + * Locate in this the last occurrence of the characters in <TT>text</TT>, + * using bitwise comparison. + * @param text The text to search for. + * @return The offset into this of the start of <TT>text</TT>, + * or -1 if not found. + * @stable ICU 2.0 + */ + inline int32_t lastIndexOf(const UnicodeString& text) const; + + /** + * Locate in this the last occurrence of the characters in <TT>text</TT> + * starting at offset <TT>start</TT>, using bitwise comparison. + * @param text The text to search for. + * @param start The offset at which searching will start. + * @return The offset into this of the start of <TT>text</TT>, + * or -1 if not found. + * @stable ICU 2.0 + */ + inline int32_t lastIndexOf(const UnicodeString& text, + int32_t start) const; + + /** + * Locate in this the last occurrence in the range + * [<TT>start</TT>, <TT>start + length</TT>) of the characters + * in <TT>text</TT>, using bitwise comparison. + * @param text The text to search for. + * @param start The offset at which searching will start. + * @param length The number of characters to search + * @return The offset into this of the start of <TT>text</TT>, + * or -1 if not found. + * @stable ICU 2.0 + */ + inline int32_t lastIndexOf(const UnicodeString& text, + int32_t start, + int32_t length) const; + + /** + * Locate in this the last occurrence in the range + * [<TT>start</TT>, <TT>start + length</TT>) of the characters + * in <TT>srcText</TT> in the range + * [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>), + * using bitwise comparison. + * @param srcText The text to search for. + * @param srcStart the offset into <TT>srcText</TT> at which + * to start matching + * @param srcLength the number of characters in <TT>srcText</TT> to match + * @param start the offset into this at which to start matching + * @param length the number of characters in this to search + * @return The offset into this of the start of <TT>text</TT>, + * or -1 if not found. + * @stable ICU 2.0 + */ + inline int32_t lastIndexOf(const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLength, + int32_t start, + int32_t length) const; + + /** + * Locate in this the last occurrence of the characters in <TT>srcChars</TT> + * starting at offset <TT>start</TT>, using bitwise comparison. + * @param srcChars The text to search for. + * @param srcLength the number of characters in <TT>srcChars</TT> to match + * @param start the offset into this at which to start matching + * @return The offset into this of the start of <TT>text</TT>, + * or -1 if not found. + * @stable ICU 2.0 + */ + inline int32_t lastIndexOf(const UChar *srcChars, + int32_t srcLength, + int32_t start) const; + + /** + * Locate in this the last occurrence in the range + * [<TT>start</TT>, <TT>start + length</TT>) of the characters + * in <TT>srcChars</TT>, using bitwise comparison. + * @param srcChars The text to search for. + * @param srcLength the number of characters in <TT>srcChars</TT> + * @param start The offset at which searching will start. + * @param length The number of characters to search + * @return The offset into this of the start of <TT>srcChars</TT>, + * or -1 if not found. + * @stable ICU 2.0 + */ + inline int32_t lastIndexOf(const UChar *srcChars, + int32_t srcLength, + int32_t start, + int32_t length) const; + + /** + * Locate in this the last occurrence in the range + * [<TT>start</TT>, <TT>start + length</TT>) of the characters + * in <TT>srcChars</TT> in the range + * [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>), + * using bitwise comparison. + * @param srcChars The text to search for. + * @param srcStart the offset into <TT>srcChars</TT> at which + * to start matching + * @param srcLength the number of characters in <TT>srcChars</TT> to match + * @param start the offset into this at which to start matching + * @param length the number of characters in this to search + * @return The offset into this of the start of <TT>text</TT>, + * or -1 if not found. + * @stable ICU 2.0 + */ + int32_t lastIndexOf(const UChar *srcChars, + int32_t srcStart, + int32_t srcLength, + int32_t start, + int32_t length) const; + + /** + * Locate in this the last occurrence of the BMP code point <code>c</code>, + * using bitwise comparison. + * @param c The code unit to search for. + * @return The offset into this of <TT>c</TT>, or -1 if not found. + * @stable ICU 2.0 + */ + inline int32_t lastIndexOf(UChar c) const; + + /** + * Locate in this the last occurrence of the code point <TT>c</TT>, + * using bitwise comparison. + * + * @param c The code point to search for. + * @return The offset into this of <TT>c</TT>, or -1 if not found. + * @stable ICU 2.0 + */ + inline int32_t lastIndexOf(UChar32 c) const; + + /** + * Locate in this the last occurrence of the BMP code point <code>c</code> + * starting at offset <TT>start</TT>, using bitwise comparison. + * @param c The code unit to search for. + * @param start The offset at which searching will start. + * @return The offset into this of <TT>c</TT>, or -1 if not found. + * @stable ICU 2.0 + */ + inline int32_t lastIndexOf(UChar c, + int32_t start) const; + + /** + * Locate in this the last occurrence of the code point <TT>c</TT> + * starting at offset <TT>start</TT>, using bitwise comparison. + * + * @param c The code point to search for. + * @param start The offset at which searching will start. + * @return The offset into this of <TT>c</TT>, or -1 if not found. + * @stable ICU 2.0 + */ + inline int32_t lastIndexOf(UChar32 c, + int32_t start) const; + + /** + * Locate in this the last occurrence of the BMP code point <code>c</code> + * in the range [<TT>start</TT>, <TT>start + length</TT>), + * using bitwise comparison. + * @param c The code unit to search for. + * @param start the offset into this at which to start matching + * @param length the number of characters in this to search + * @return The offset into this of <TT>c</TT>, or -1 if not found. + * @stable ICU 2.0 + */ + inline int32_t lastIndexOf(UChar c, + int32_t start, + int32_t length) const; + + /** + * Locate in this the last occurrence of the code point <TT>c</TT> + * in the range [<TT>start</TT>, <TT>start + length</TT>), + * using bitwise comparison. + * + * @param c The code point to search for. + * @param start the offset into this at which to start matching + * @param length the number of characters in this to search + * @return The offset into this of <TT>c</TT>, or -1 if not found. + * @stable ICU 2.0 + */ + inline int32_t lastIndexOf(UChar32 c, + int32_t start, + int32_t length) const; + + + /* Character access */ + + /** + * Return the code unit at offset <tt>offset</tt>. + * If the offset is not valid (0..length()-1) then U+ffff is returned. + * @param offset a valid offset into the text + * @return the code unit at offset <tt>offset</tt> + * or 0xffff if the offset is not valid for this string + * @stable ICU 2.0 + */ + inline UChar charAt(int32_t offset) const; + + /** + * Return the code unit at offset <tt>offset</tt>. + * If the offset is not valid (0..length()-1) then U+ffff is returned. + * @param offset a valid offset into the text + * @return the code unit at offset <tt>offset</tt> + * @stable ICU 2.0 + */ + inline UChar operator[] (int32_t offset) const; + + /** + * Return the code point that contains the code unit + * at offset <tt>offset</tt>. + * If the offset is not valid (0..length()-1) then U+ffff is returned. + * @param offset a valid offset into the text + * that indicates the text offset of any of the code units + * that will be assembled into a code point (21-bit value) and returned + * @return the code point of text at <tt>offset</tt> + * or 0xffff if the offset is not valid for this string + * @stable ICU 2.0 + */ + UChar32 char32At(int32_t offset) const; + + /** + * Adjust a random-access offset so that + * it points to the beginning of a Unicode character. + * The offset that is passed in points to + * any code unit of a code point, + * while the returned offset will point to the first code unit + * of the same code point. + * In UTF-16, if the input offset points to a second surrogate + * of a surrogate pair, then the returned offset will point + * to the first surrogate. + * @param offset a valid offset into one code point of the text + * @return offset of the first code unit of the same code point + * @see U16_SET_CP_START + * @stable ICU 2.0 + */ + int32_t getChar32Start(int32_t offset) const; + + /** + * Adjust a random-access offset so that + * it points behind a Unicode character. + * The offset that is passed in points behind + * any code unit of a code point, + * while the returned offset will point behind the last code unit + * of the same code point. + * In UTF-16, if the input offset points behind the first surrogate + * (i.e., to the second surrogate) + * of a surrogate pair, then the returned offset will point + * behind the second surrogate (i.e., to the first surrogate). + * @param offset a valid offset after any code unit of a code point of the text + * @return offset of the first code unit after the same code point + * @see U16_SET_CP_LIMIT + * @stable ICU 2.0 + */ + int32_t getChar32Limit(int32_t offset) const; + + /** + * Move the code unit index along the string by delta code points. + * Interpret the input index as a code unit-based offset into the string, + * move the index forward or backward by delta code points, and + * return the resulting index. + * The input index should point to the first code unit of a code point, + * if there is more than one. + * + * Both input and output indexes are code unit-based as for all + * string indexes/offsets in ICU (and other libraries, like MBCS char*). + * If delta<0 then the index is moved backward (toward the start of the string). + * If delta>0 then the index is moved forward (toward the end of the string). + * + * This behaves like CharacterIterator::move32(delta, kCurrent). + * + * Behavior for out-of-bounds indexes: + * <code>moveIndex32</code> pins the input index to 0..length(), i.e., + * if the input index<0 then it is pinned to 0; + * if it is index>length() then it is pinned to length(). + * Afterwards, the index is moved by <code>delta</code> code points + * forward or backward, + * but no further backward than to 0 and no further forward than to length(). + * The resulting index return value will be in between 0 and length(), inclusively. + * + * Examples: + * <pre> + * // s has code points 'a' U+10000 'b' U+10ffff U+2029 + * UnicodeString s=UNICODE_STRING("a\\U00010000b\\U0010ffff\\u2029", 31).unescape(); + * + * // initial index: position of U+10000 + * int32_t index=1; + * + * // the following examples will all result in index==4, position of U+10ffff + * + * // skip 2 code points from some position in the string + * index=s.moveIndex32(index, 2); // skips U+10000 and 'b' + * + * // go to the 3rd code point from the start of s (0-based) + * index=s.moveIndex32(0, 3); // skips 'a', U+10000, and 'b' + * + * // go to the next-to-last code point of s + * index=s.moveIndex32(s.length(), -2); // backward-skips U+2029 and U+10ffff + * </pre> + * + * @param index input code unit index + * @param delta (signed) code point count to move the index forward or backward + * in the string + * @return the resulting code unit index + * @stable ICU 2.0 + */ + int32_t moveIndex32(int32_t index, int32_t delta) const; + + /* Substring extraction */ + + /** + * Copy the characters in the range + * [<tt>start</tt>, <tt>start + length</tt>) into the array <tt>dst</tt>, + * beginning at <tt>dstStart</tt>. + * If the string aliases to <code>dst</code> itself as an external buffer, + * then extract() will not copy the contents. + * + * @param start offset of first character which will be copied into the array + * @param length the number of characters to extract + * @param dst array in which to copy characters. The length of <tt>dst</tt> + * must be at least (<tt>dstStart + length</tt>). + * @param dstStart the offset in <TT>dst</TT> where the first character + * will be extracted + * @stable ICU 2.0 + */ + inline void extract(int32_t start, + int32_t length, + UChar *dst, + int32_t dstStart = 0) const; + + /** + * Copy the contents of the string into dest. + * This is a convenience function that + * checks if there is enough space in dest, + * extracts the entire string if possible, + * and NUL-terminates dest if possible. + * + * If the string fits into dest but cannot be NUL-terminated + * (length()==destCapacity) then the error code is set to U_STRING_NOT_TERMINATED_WARNING. + * If the string itself does not fit into dest + * (length()>destCapacity) then the error code is set to U_BUFFER_OVERFLOW_ERROR. + * + * If the string aliases to <code>dest</code> itself as an external buffer, + * then extract() will not copy the contents. + * + * @param dest Destination string buffer. + * @param destCapacity Number of UChars available at dest. + * @param errorCode ICU error code. + * @return length() + * @stable ICU 2.0 + */ + int32_t + extract(UChar *dest, int32_t destCapacity, + UErrorCode &errorCode) const; + + /** + * Copy the characters in the range + * [<tt>start</tt>, <tt>start + length</tt>) into the UnicodeString + * <tt>target</tt>. + * @param start offset of first character which will be copied + * @param length the number of characters to extract + * @param target UnicodeString into which to copy characters. + * @return A reference to <TT>target</TT> + * @stable ICU 2.0 + */ + inline void extract(int32_t start, + int32_t length, + UnicodeString& target) const; + + /** + * Copy the characters in the range [<tt>start</tt>, <tt>limit</tt>) + * into the array <tt>dst</tt>, beginning at <tt>dstStart</tt>. + * @param start offset of first character which will be copied into the array + * @param limit offset immediately following the last character to be copied + * @param dst array in which to copy characters. The length of <tt>dst</tt> + * must be at least (<tt>dstStart + (limit - start)</tt>). + * @param dstStart the offset in <TT>dst</TT> where the first character + * will be extracted + * @stable ICU 2.0 + */ + inline void extractBetween(int32_t start, + int32_t limit, + UChar *dst, + int32_t dstStart = 0) const; + + /** + * Copy the characters in the range [<tt>start</tt>, <tt>limit</tt>) + * into the UnicodeString <tt>target</tt>. Replaceable API. + * @param start offset of first character which will be copied + * @param limit offset immediately following the last character to be copied + * @param target UnicodeString into which to copy characters. + * @return A reference to <TT>target</TT> + * @stable ICU 2.0 + */ + virtual void extractBetween(int32_t start, + int32_t limit, + UnicodeString& target) const; + + /** + * Copy the characters in the range + * [<tt>start</TT>, <tt>start + length</TT>) into an array of characters. + * All characters must be invariant (see utypes.h). + * Use US_INV as the last, signature-distinguishing parameter. + * + * This function does not write any more than <code>targetLength</code> + * characters but returns the length of the entire output string + * so that one can allocate a larger buffer and call the function again + * if necessary. + * The output string is NUL-terminated if possible. + * + * @param start offset of first character which will be copied + * @param startLength the number of characters to extract + * @param target the target buffer for extraction, can be NULL + * if targetLength is 0 + * @param targetCapacity the length of the target buffer + * @param inv Signature-distinguishing paramater, use US_INV. + * @return the output string length, not including the terminating NUL + * @stable ICU 3.2 + */ + int32_t extract(int32_t start, + int32_t startLength, + char *target, + int32_t targetCapacity, + enum EInvariant inv) const; + +#if U_CHARSET_IS_UTF8 || !UCONFIG_NO_CONVERSION + + /** + * Copy the characters in the range + * [<tt>start</TT>, <tt>start + length</TT>) into an array of characters + * in the platform's default codepage. + * This function does not write any more than <code>targetLength</code> + * characters but returns the length of the entire output string + * so that one can allocate a larger buffer and call the function again + * if necessary. + * The output string is NUL-terminated if possible. + * + * @param start offset of first character which will be copied + * @param startLength the number of characters to extract + * @param target the target buffer for extraction + * @param targetLength the length of the target buffer + * If <TT>target</TT> is NULL, then the number of bytes required for + * <TT>target</TT> is returned. + * @return the output string length, not including the terminating NUL + * @stable ICU 2.0 + */ + int32_t extract(int32_t start, + int32_t startLength, + char *target, + uint32_t targetLength) const; + +#endif + +#if !UCONFIG_NO_CONVERSION + + /** + * Copy the characters in the range + * [<tt>start</TT>, <tt>start + length</TT>) into an array of characters + * in a specified codepage. + * The output string is NUL-terminated. + * + * Recommendation: For invariant-character strings use + * extract(int32_t start, int32_t length, char *target, int32_t targetCapacity, enum EInvariant inv) const + * because it avoids object code dependencies of UnicodeString on + * the conversion code. + * + * @param start offset of first character which will be copied + * @param startLength the number of characters to extract + * @param target the target buffer for extraction + * @param codepage the desired codepage for the characters. 0 has + * the special meaning of the default codepage + * If <code>codepage</code> is an empty string (<code>""</code>), + * then a simple conversion is performed on the codepage-invariant + * subset ("invariant characters") of the platform encoding. See utypes.h. + * If <TT>target</TT> is NULL, then the number of bytes required for + * <TT>target</TT> is returned. It is assumed that the target is big enough + * to fit all of the characters. + * @return the output string length, not including the terminating NUL + * @stable ICU 2.0 + */ + inline int32_t extract(int32_t start, + int32_t startLength, + char *target, + const char *codepage = 0) const; + + /** + * Copy the characters in the range + * [<tt>start</TT>, <tt>start + length</TT>) into an array of characters + * in a specified codepage. + * This function does not write any more than <code>targetLength</code> + * characters but returns the length of the entire output string + * so that one can allocate a larger buffer and call the function again + * if necessary. + * The output string is NUL-terminated if possible. + * + * Recommendation: For invariant-character strings use + * extract(int32_t start, int32_t length, char *target, int32_t targetCapacity, enum EInvariant inv) const + * because it avoids object code dependencies of UnicodeString on + * the conversion code. + * + * @param start offset of first character which will be copied + * @param startLength the number of characters to extract + * @param target the target buffer for extraction + * @param targetLength the length of the target buffer + * @param codepage the desired codepage for the characters. 0 has + * the special meaning of the default codepage + * If <code>codepage</code> is an empty string (<code>""</code>), + * then a simple conversion is performed on the codepage-invariant + * subset ("invariant characters") of the platform encoding. See utypes.h. + * If <TT>target</TT> is NULL, then the number of bytes required for + * <TT>target</TT> is returned. + * @return the output string length, not including the terminating NUL + * @stable ICU 2.0 + */ + int32_t extract(int32_t start, + int32_t startLength, + char *target, + uint32_t targetLength, + const char *codepage) const; + + /** + * Convert the UnicodeString into a codepage string using an existing UConverter. + * The output string is NUL-terminated if possible. + * + * This function avoids the overhead of opening and closing a converter if + * multiple strings are extracted. + * + * @param dest destination string buffer, can be NULL if destCapacity==0 + * @param destCapacity the number of chars available at dest + * @param cnv the converter object to be used (ucnv_resetFromUnicode() will be called), + * or NULL for the default converter + * @param errorCode normal ICU error code + * @return the length of the output string, not counting the terminating NUL; + * if the length is greater than destCapacity, then the string will not fit + * and a buffer of the indicated length would need to be passed in + * @stable ICU 2.0 + */ + int32_t extract(char *dest, int32_t destCapacity, + UConverter *cnv, + UErrorCode &errorCode) const; + +#endif + + /** + * Create a temporary substring for the specified range. + * Unlike the substring constructor and setTo() functions, + * the object returned here will be a read-only alias (using getBuffer()) + * rather than copying the text. + * As a result, this substring operation is much faster but requires + * that the original string not be modified or deleted during the lifetime + * of the returned substring object. + * @param start offset of the first character visible in the substring + * @param length length of the substring + * @return a read-only alias UnicodeString object for the substring + * @stable ICU 4.4 + */ + UnicodeString tempSubString(int32_t start=0, int32_t length=INT32_MAX) const; + + /** + * Create a temporary substring for the specified range. + * Same as tempSubString(start, length) except that the substring range + * is specified as a (start, limit) pair (with an exclusive limit index) + * rather than a (start, length) pair. + * @param start offset of the first character visible in the substring + * @param limit offset immediately following the last character visible in the substring + * @return a read-only alias UnicodeString object for the substring + * @stable ICU 4.4 + */ + inline UnicodeString tempSubStringBetween(int32_t start, int32_t limit=INT32_MAX) const; + + /** + * Convert the UnicodeString to UTF-8 and write the result + * to a ByteSink. This is called by toUTF8String(). + * Unpaired surrogates are replaced with U+FFFD. + * Calls u_strToUTF8WithSub(). + * + * @param sink A ByteSink to which the UTF-8 version of the string is written. + * sink.Flush() is called at the end. + * @stable ICU 4.2 + * @see toUTF8String + */ + void toUTF8(ByteSink &sink) const; + +#if U_HAVE_STD_STRING + + /** + * Convert the UnicodeString to UTF-8 and append the result + * to a standard string. + * Unpaired surrogates are replaced with U+FFFD. + * Calls toUTF8(). + * + * @param result A standard string (or a compatible object) + * to which the UTF-8 version of the string is appended. + * @return The string object. + * @stable ICU 4.2 + * @see toUTF8 + */ + template<typename StringClass> + StringClass &toUTF8String(StringClass &result) const { + StringByteSink<StringClass> sbs(&result); + toUTF8(sbs); + return result; + } + +#endif + + /** + * Convert the UnicodeString to UTF-32. + * Unpaired surrogates are replaced with U+FFFD. + * Calls u_strToUTF32WithSub(). + * + * @param utf32 destination string buffer, can be NULL if capacity==0 + * @param capacity the number of UChar32s available at utf32 + * @param errorCode Standard ICU error code. Its input value must + * pass the U_SUCCESS() test, or else the function returns + * immediately. Check for U_FAILURE() on output or use with + * function chaining. (See User Guide for details.) + * @return The length of the UTF-32 string. + * @see fromUTF32 + * @stable ICU 4.2 + */ + int32_t toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const; + + /* Length operations */ + + /** + * Return the length of the UnicodeString object. + * The length is the number of UChar code units are in the UnicodeString. + * If you want the number of code points, please use countChar32(). + * @return the length of the UnicodeString object + * @see countChar32 + * @stable ICU 2.0 + */ + inline int32_t length(void) const; + + /** + * Count Unicode code points in the length UChar code units of the string. + * A code point may occupy either one or two UChar code units. + * Counting code points involves reading all code units. + * + * This functions is basically the inverse of moveIndex32(). + * + * @param start the index of the first code unit to check + * @param length the number of UChar code units to check + * @return the number of code points in the specified code units + * @see length + * @stable ICU 2.0 + */ + int32_t + countChar32(int32_t start=0, int32_t length=INT32_MAX) const; + + /** + * Check if the length UChar code units of the string + * contain more Unicode code points than a certain number. + * This is more efficient than counting all code points in this part of the string + * and comparing that number with a threshold. + * This function may not need to scan the string at all if the length + * falls within a certain range, and + * never needs to count more than 'number+1' code points. + * Logically equivalent to (countChar32(start, length)>number). + * A Unicode code point may occupy either one or two UChar code units. + * + * @param start the index of the first code unit to check (0 for the entire string) + * @param length the number of UChar code units to check + * (use INT32_MAX for the entire string; remember that start/length + * values are pinned) + * @param number The number of code points in the (sub)string is compared against + * the 'number' parameter. + * @return Boolean value for whether the string contains more Unicode code points + * than 'number'. Same as (u_countChar32(s, length)>number). + * @see countChar32 + * @see u_strHasMoreChar32Than + * @stable ICU 2.4 + */ + UBool + hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const; + + /** + * Determine if this string is empty. + * @return TRUE if this string contains 0 characters, FALSE otherwise. + * @stable ICU 2.0 + */ + inline UBool isEmpty(void) const; + + /** + * Return the capacity of the internal buffer of the UnicodeString object. + * This is useful together with the getBuffer functions. + * See there for details. + * + * @return the number of UChars available in the internal buffer + * @see getBuffer + * @stable ICU 2.0 + */ + inline int32_t getCapacity(void) const; + + /* Other operations */ + + /** + * Generate a hash code for this object. + * @return The hash code of this UnicodeString. + * @stable ICU 2.0 + */ + inline int32_t hashCode(void) const; + + /** + * Determine if this object contains a valid string. + * A bogus string has no value. It is different from an empty string, + * although in both cases isEmpty() returns TRUE and length() returns 0. + * setToBogus() and isBogus() can be used to indicate that no string value is available. + * For a bogus string, getBuffer() and getTerminatedBuffer() return NULL, and + * length() returns 0. + * + * @return TRUE if the string is bogus/invalid, FALSE otherwise + * @see setToBogus() + * @stable ICU 2.0 + */ + inline UBool isBogus(void) const; + + + //======================================== + // Write operations + //======================================== + + /* Assignment operations */ + + /** + * Assignment operator. Replace the characters in this UnicodeString + * with the characters from <TT>srcText</TT>. + * @param srcText The text containing the characters to replace + * @return a reference to this + * @stable ICU 2.0 + */ + UnicodeString &operator=(const UnicodeString &srcText); + + /** + * Almost the same as the assignment operator. + * Replace the characters in this UnicodeString + * with the characters from <code>srcText</code>. + * + * This function works the same as the assignment operator + * for all strings except for ones that are readonly aliases. + * + * Starting with ICU 2.4, the assignment operator and the copy constructor + * allocate a new buffer and copy the buffer contents even for readonly aliases. + * This function implements the old, more efficient but less safe behavior + * of making this string also a readonly alias to the same buffer. + * + * The fastCopyFrom function must be used only if it is known that the lifetime of + * this UnicodeString does not exceed the lifetime of the aliased buffer + * including its contents, for example for strings from resource bundles + * or aliases to string constants. + * + * @param src The text containing the characters to replace. + * @return a reference to this + * @stable ICU 2.4 + */ + UnicodeString &fastCopyFrom(const UnicodeString &src); + + /** + * Assignment operator. Replace the characters in this UnicodeString + * with the code unit <TT>ch</TT>. + * @param ch the code unit to replace + * @return a reference to this + * @stable ICU 2.0 + */ + inline UnicodeString& operator= (UChar ch); + + /** + * Assignment operator. Replace the characters in this UnicodeString + * with the code point <TT>ch</TT>. + * @param ch the code point to replace + * @return a reference to this + * @stable ICU 2.0 + */ + inline UnicodeString& operator= (UChar32 ch); + + /** + * Set the text in the UnicodeString object to the characters + * in <TT>srcText</TT> in the range + * [<TT>srcStart</TT>, <TT>srcText.length()</TT>). + * <TT>srcText</TT> is not modified. + * @param srcText the source for the new characters + * @param srcStart the offset into <TT>srcText</TT> where new characters + * will be obtained + * @return a reference to this + * @stable ICU 2.2 + */ + inline UnicodeString& setTo(const UnicodeString& srcText, + int32_t srcStart); + + /** + * Set the text in the UnicodeString object to the characters + * in <TT>srcText</TT> in the range + * [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>). + * <TT>srcText</TT> is not modified. + * @param srcText the source for the new characters + * @param srcStart the offset into <TT>srcText</TT> where new characters + * will be obtained + * @param srcLength the number of characters in <TT>srcText</TT> in the + * replace string. + * @return a reference to this + * @stable ICU 2.0 + */ + inline UnicodeString& setTo(const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLength); + + /** + * Set the text in the UnicodeString object to the characters in + * <TT>srcText</TT>. + * <TT>srcText</TT> is not modified. + * @param srcText the source for the new characters + * @return a reference to this + * @stable ICU 2.0 + */ + inline UnicodeString& setTo(const UnicodeString& srcText); + + /** + * Set the characters in the UnicodeString object to the characters + * in <TT>srcChars</TT>. <TT>srcChars</TT> is not modified. + * @param srcChars the source for the new characters + * @param srcLength the number of Unicode characters in srcChars. + * @return a reference to this + * @stable ICU 2.0 + */ + inline UnicodeString& setTo(const UChar *srcChars, + int32_t srcLength); + + /** + * Set the characters in the UnicodeString object to the code unit + * <TT>srcChar</TT>. + * @param srcChar the code unit which becomes the UnicodeString's character + * content + * @return a reference to this + * @stable ICU 2.0 + */ + UnicodeString& setTo(UChar srcChar); + + /** + * Set the characters in the UnicodeString object to the code point + * <TT>srcChar</TT>. + * @param srcChar the code point which becomes the UnicodeString's character + * content + * @return a reference to this + * @stable ICU 2.0 + */ + UnicodeString& setTo(UChar32 srcChar); + + /** + * Aliasing setTo() function, analogous to the readonly-aliasing UChar* constructor. + * The text will be used for the UnicodeString object, but + * it will not be released when the UnicodeString is destroyed. + * This has copy-on-write semantics: + * When the string is modified, then the buffer is first copied into + * newly allocated memory. + * The aliased buffer is never modified. + * + * In an assignment to another UnicodeString, when using the copy constructor + * or the assignment operator, the text will be copied. + * When using fastCopyFrom(), the text will be aliased again, + * so that both strings then alias the same readonly-text. + * + * @param isTerminated specifies if <code>text</code> is <code>NUL</code>-terminated. + * This must be true if <code>textLength==-1</code>. + * @param text The characters to alias for the UnicodeString. + * @param textLength The number of Unicode characters in <code>text</code> to alias. + * If -1, then this constructor will determine the length + * by calling <code>u_strlen()</code>. + * @return a reference to this + * @stable ICU 2.0 + */ + UnicodeString &setTo(UBool isTerminated, + const UChar *text, + int32_t textLength); + + /** + * Aliasing setTo() function, analogous to the writable-aliasing UChar* constructor. + * The text will be used for the UnicodeString object, but + * it will not be released when the UnicodeString is destroyed. + * This has write-through semantics: + * For as long as the capacity of the buffer is sufficient, write operations + * will directly affect the buffer. When more capacity is necessary, then + * a new buffer will be allocated and the contents copied as with regularly + * constructed strings. + * In an assignment to another UnicodeString, the buffer will be copied. + * The extract(UChar *dst) function detects whether the dst pointer is the same + * as the string buffer itself and will in this case not copy the contents. + * + * @param buffer The characters to alias for the UnicodeString. + * @param buffLength The number of Unicode characters in <code>buffer</code> to alias. + * @param buffCapacity The size of <code>buffer</code> in UChars. + * @return a reference to this + * @stable ICU 2.0 + */ + UnicodeString &setTo(UChar *buffer, + int32_t buffLength, + int32_t buffCapacity); + + /** + * Make this UnicodeString object invalid. + * The string will test TRUE with isBogus(). + * + * A bogus string has no value. It is different from an empty string. + * It can be used to indicate that no string value is available. + * getBuffer() and getTerminatedBuffer() return NULL, and + * length() returns 0. + * + * This utility function is used throughout the UnicodeString + * implementation to indicate that a UnicodeString operation failed, + * and may be used in other functions, + * especially but not exclusively when such functions do not + * take a UErrorCode for simplicity. + * + * The following methods, and no others, will clear a string object's bogus flag: + * - remove() + * - remove(0, INT32_MAX) + * - truncate(0) + * - operator=() (assignment operator) + * - setTo(...) + * + * The simplest ways to turn a bogus string into an empty one + * is to use the remove() function. + * Examples for other functions that are equivalent to "set to empty string": + * \code + * if(s.isBogus()) { + * s.remove(); // set to an empty string (remove all), or + * s.remove(0, INT32_MAX); // set to an empty string (remove all), or + * s.truncate(0); // set to an empty string (complete truncation), or + * s=UnicodeString(); // assign an empty string, or + * s.setTo((UChar32)-1); // set to a pseudo code point that is out of range, or + * static const UChar nul=0; + * s.setTo(&nul, 0); // set to an empty C Unicode string + * } + * \endcode + * + * @see isBogus() + * @stable ICU 2.0 + */ + void setToBogus(); + + /** + * Set the character at the specified offset to the specified character. + * @param offset A valid offset into the text of the character to set + * @param ch The new character + * @return A reference to this + * @stable ICU 2.0 + */ + UnicodeString& setCharAt(int32_t offset, + UChar ch); + + + /* Append operations */ + + /** + * Append operator. Append the code unit <TT>ch</TT> to the UnicodeString + * object. + * @param ch the code unit to be appended + * @return a reference to this + * @stable ICU 2.0 + */ + inline UnicodeString& operator+= (UChar ch); + + /** + * Append operator. Append the code point <TT>ch</TT> to the UnicodeString + * object. + * @param ch the code point to be appended + * @return a reference to this + * @stable ICU 2.0 + */ + inline UnicodeString& operator+= (UChar32 ch); + + /** + * Append operator. Append the characters in <TT>srcText</TT> to the + * UnicodeString object. <TT>srcText</TT> is not modified. + * @param srcText the source for the new characters + * @return a reference to this + * @stable ICU 2.0 + */ + inline UnicodeString& operator+= (const UnicodeString& srcText); + + /** + * Append the characters + * in <TT>srcText</TT> in the range + * [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>) to the + * UnicodeString object at offset <TT>start</TT>. <TT>srcText</TT> + * is not modified. + * @param srcText the source for the new characters + * @param srcStart the offset into <TT>srcText</TT> where new characters + * will be obtained + * @param srcLength the number of characters in <TT>srcText</TT> in + * the append string + * @return a reference to this + * @stable ICU 2.0 + */ + inline UnicodeString& append(const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLength); + + /** + * Append the characters in <TT>srcText</TT> to the UnicodeString object. + * <TT>srcText</TT> is not modified. + * @param srcText the source for the new characters + * @return a reference to this + * @stable ICU 2.0 + */ + inline UnicodeString& append(const UnicodeString& srcText); + + /** + * Append the characters in <TT>srcChars</TT> in the range + * [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>) to the UnicodeString + * object at offset + * <TT>start</TT>. <TT>srcChars</TT> is not modified. + * @param srcChars the source for the new characters + * @param srcStart the offset into <TT>srcChars</TT> where new characters + * will be obtained + * @param srcLength the number of characters in <TT>srcChars</TT> in + * the append string; can be -1 if <TT>srcChars</TT> is NUL-terminated + * @return a reference to this + * @stable ICU 2.0 + */ + inline UnicodeString& append(const UChar *srcChars, + int32_t srcStart, + int32_t srcLength); + + /** + * Append the characters in <TT>srcChars</TT> to the UnicodeString object + * at offset <TT>start</TT>. <TT>srcChars</TT> is not modified. + * @param srcChars the source for the new characters + * @param srcLength the number of Unicode characters in <TT>srcChars</TT>; + * can be -1 if <TT>srcChars</TT> is NUL-terminated + * @return a reference to this + * @stable ICU 2.0 + */ + inline UnicodeString& append(const UChar *srcChars, + int32_t srcLength); + + /** + * Append the code unit <TT>srcChar</TT> to the UnicodeString object. + * @param srcChar the code unit to append + * @return a reference to this + * @stable ICU 2.0 + */ + inline UnicodeString& append(UChar srcChar); + + /** + * Append the code point <TT>srcChar</TT> to the UnicodeString object. + * @param srcChar the code point to append + * @return a reference to this + * @stable ICU 2.0 + */ + UnicodeString& append(UChar32 srcChar); + + + /* Insert operations */ + + /** + * Insert the characters in <TT>srcText</TT> in the range + * [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>) into the UnicodeString + * object at offset <TT>start</TT>. <TT>srcText</TT> is not modified. + * @param start the offset where the insertion begins + * @param srcText the source for the new characters + * @param srcStart the offset into <TT>srcText</TT> where new characters + * will be obtained + * @param srcLength the number of characters in <TT>srcText</TT> in + * the insert string + * @return a reference to this + * @stable ICU 2.0 + */ + inline UnicodeString& insert(int32_t start, + const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLength); + + /** + * Insert the characters in <TT>srcText</TT> into the UnicodeString object + * at offset <TT>start</TT>. <TT>srcText</TT> is not modified. + * @param start the offset where the insertion begins + * @param srcText the source for the new characters + * @return a reference to this + * @stable ICU 2.0 + */ + inline UnicodeString& insert(int32_t start, + const UnicodeString& srcText); + + /** + * Insert the characters in <TT>srcChars</TT> in the range + * [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>) into the UnicodeString + * object at offset <TT>start</TT>. <TT>srcChars</TT> is not modified. + * @param start the offset at which the insertion begins + * @param srcChars the source for the new characters + * @param srcStart the offset into <TT>srcChars</TT> where new characters + * will be obtained + * @param srcLength the number of characters in <TT>srcChars</TT> + * in the insert string + * @return a reference to this + * @stable ICU 2.0 + */ + inline UnicodeString& insert(int32_t start, + const UChar *srcChars, + int32_t srcStart, + int32_t srcLength); + + /** + * Insert the characters in <TT>srcChars</TT> into the UnicodeString object + * at offset <TT>start</TT>. <TT>srcChars</TT> is not modified. + * @param start the offset where the insertion begins + * @param srcChars the source for the new characters + * @param srcLength the number of Unicode characters in srcChars. + * @return a reference to this + * @stable ICU 2.0 + */ + inline UnicodeString& insert(int32_t start, + const UChar *srcChars, + int32_t srcLength); + + /** + * Insert the code unit <TT>srcChar</TT> into the UnicodeString object at + * offset <TT>start</TT>. + * @param start the offset at which the insertion occurs + * @param srcChar the code unit to insert + * @return a reference to this + * @stable ICU 2.0 + */ + inline UnicodeString& insert(int32_t start, + UChar srcChar); + + /** + * Insert the code point <TT>srcChar</TT> into the UnicodeString object at + * offset <TT>start</TT>. + * @param start the offset at which the insertion occurs + * @param srcChar the code point to insert + * @return a reference to this + * @stable ICU 2.0 + */ + inline UnicodeString& insert(int32_t start, + UChar32 srcChar); + + + /* Replace operations */ + + /** + * Replace the characters in the range + * [<TT>start</TT>, <TT>start + length</TT>) with the characters in + * <TT>srcText</TT> in the range + * [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>). + * <TT>srcText</TT> is not modified. + * @param start the offset at which the replace operation begins + * @param length the number of characters to replace. The character at + * <TT>start + length</TT> is not modified. + * @param srcText the source for the new characters + * @param srcStart the offset into <TT>srcText</TT> where new characters + * will be obtained + * @param srcLength the number of characters in <TT>srcText</TT> in + * the replace string + * @return a reference to this + * @stable ICU 2.0 + */ + UnicodeString& replace(int32_t start, + int32_t length, + const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLength); + + /** + * Replace the characters in the range + * [<TT>start</TT>, <TT>start + length</TT>) + * with the characters in <TT>srcText</TT>. <TT>srcText</TT> is + * not modified. + * @param start the offset at which the replace operation begins + * @param length the number of characters to replace. The character at + * <TT>start + length</TT> is not modified. + * @param srcText the source for the new characters + * @return a reference to this + * @stable ICU 2.0 + */ + UnicodeString& replace(int32_t start, + int32_t length, + const UnicodeString& srcText); + + /** + * Replace the characters in the range + * [<TT>start</TT>, <TT>start + length</TT>) with the characters in + * <TT>srcChars</TT> in the range + * [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>). <TT>srcChars</TT> + * is not modified. + * @param start the offset at which the replace operation begins + * @param length the number of characters to replace. The character at + * <TT>start + length</TT> is not modified. + * @param srcChars the source for the new characters + * @param srcStart the offset into <TT>srcChars</TT> where new characters + * will be obtained + * @param srcLength the number of characters in <TT>srcChars</TT> + * in the replace string + * @return a reference to this + * @stable ICU 2.0 + */ + UnicodeString& replace(int32_t start, + int32_t length, + const UChar *srcChars, + int32_t srcStart, + int32_t srcLength); + + /** + * Replace the characters in the range + * [<TT>start</TT>, <TT>start + length</TT>) with the characters in + * <TT>srcChars</TT>. <TT>srcChars</TT> is not modified. + * @param start the offset at which the replace operation begins + * @param length number of characters to replace. The character at + * <TT>start + length</TT> is not modified. + * @param srcChars the source for the new characters + * @param srcLength the number of Unicode characters in srcChars + * @return a reference to this + * @stable ICU 2.0 + */ + inline UnicodeString& replace(int32_t start, + int32_t length, + const UChar *srcChars, + int32_t srcLength); + + /** + * Replace the characters in the range + * [<TT>start</TT>, <TT>start + length</TT>) with the code unit + * <TT>srcChar</TT>. + * @param start the offset at which the replace operation begins + * @param length the number of characters to replace. The character at + * <TT>start + length</TT> is not modified. + * @param srcChar the new code unit + * @return a reference to this + * @stable ICU 2.0 + */ + inline UnicodeString& replace(int32_t start, + int32_t length, + UChar srcChar); + + /** + * Replace the characters in the range + * [<TT>start</TT>, <TT>start + length</TT>) with the code point + * <TT>srcChar</TT>. + * @param start the offset at which the replace operation begins + * @param length the number of characters to replace. The character at + * <TT>start + length</TT> is not modified. + * @param srcChar the new code point + * @return a reference to this + * @stable ICU 2.0 + */ + UnicodeString& replace(int32_t start, int32_t length, UChar32 srcChar); + + /** + * Replace the characters in the range [<TT>start</TT>, <TT>limit</TT>) + * with the characters in <TT>srcText</TT>. <TT>srcText</TT> is not modified. + * @param start the offset at which the replace operation begins + * @param limit the offset immediately following the replace range + * @param srcText the source for the new characters + * @return a reference to this + * @stable ICU 2.0 + */ + inline UnicodeString& replaceBetween(int32_t start, + int32_t limit, + const UnicodeString& srcText); + + /** + * Replace the characters in the range [<TT>start</TT>, <TT>limit</TT>) + * with the characters in <TT>srcText</TT> in the range + * [<TT>srcStart</TT>, <TT>srcLimit</TT>). <TT>srcText</TT> is not modified. + * @param start the offset at which the replace operation begins + * @param limit the offset immediately following the replace range + * @param srcText the source for the new characters + * @param srcStart the offset into <TT>srcChars</TT> where new characters + * will be obtained + * @param srcLimit the offset immediately following the range to copy + * in <TT>srcText</TT> + * @return a reference to this + * @stable ICU 2.0 + */ + inline UnicodeString& replaceBetween(int32_t start, + int32_t limit, + const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLimit); + + /** + * Replace a substring of this object with the given text. + * @param start the beginning index, inclusive; <code>0 <= start + * <= limit</code>. + * @param limit the ending index, exclusive; <code>start <= limit + * <= length()</code>. + * @param text the text to replace characters <code>start</code> + * to <code>limit - 1</code> + * @stable ICU 2.0 + */ + virtual void handleReplaceBetween(int32_t start, + int32_t limit, + const UnicodeString& text); + + /** + * Replaceable API + * @return TRUE if it has MetaData + * @stable ICU 2.4 + */ + virtual UBool hasMetaData() const; + + /** + * Copy a substring of this object, retaining attribute (out-of-band) + * information. This method is used to duplicate or reorder substrings. + * The destination index must not overlap the source range. + * + * @param start the beginning index, inclusive; <code>0 <= start <= + * limit</code>. + * @param limit the ending index, exclusive; <code>start <= limit <= + * length()</code>. + * @param dest the destination index. The characters from + * <code>start..limit-1</code> will be copied to <code>dest</code>. + * Implementations of this method may assume that <code>dest <= start || + * dest >= limit</code>. + * @stable ICU 2.0 + */ + virtual void copy(int32_t start, int32_t limit, int32_t dest); + + /* Search and replace operations */ + + /** + * Replace all occurrences of characters in oldText with the characters + * in newText + * @param oldText the text containing the search text + * @param newText the text containing the replacement text + * @return a reference to this + * @stable ICU 2.0 + */ + inline UnicodeString& findAndReplace(const UnicodeString& oldText, + const UnicodeString& newText); + + /** + * Replace all occurrences of characters in oldText with characters + * in newText + * in the range [<TT>start</TT>, <TT>start + length</TT>). + * @param start the start of the range in which replace will performed + * @param length the length of the range in which replace will be performed + * @param oldText the text containing the search text + * @param newText the text containing the replacement text + * @return a reference to this + * @stable ICU 2.0 + */ + inline UnicodeString& findAndReplace(int32_t start, + int32_t length, + const UnicodeString& oldText, + const UnicodeString& newText); + + /** + * Replace all occurrences of characters in oldText in the range + * [<TT>oldStart</TT>, <TT>oldStart + oldLength</TT>) with the characters + * in newText in the range + * [<TT>newStart</TT>, <TT>newStart + newLength</TT>) + * in the range [<TT>start</TT>, <TT>start + length</TT>). + * @param start the start of the range in which replace will performed + * @param length the length of the range in which replace will be performed + * @param oldText the text containing the search text + * @param oldStart the start of the search range in <TT>oldText</TT> + * @param oldLength the length of the search range in <TT>oldText</TT> + * @param newText the text containing the replacement text + * @param newStart the start of the replacement range in <TT>newText</TT> + * @param newLength the length of the replacement range in <TT>newText</TT> + * @return a reference to this + * @stable ICU 2.0 + */ + UnicodeString& findAndReplace(int32_t start, + int32_t length, + const UnicodeString& oldText, + int32_t oldStart, + int32_t oldLength, + const UnicodeString& newText, + int32_t newStart, + int32_t newLength); + + + /* Remove operations */ + + /** + * Remove all characters from the UnicodeString object. + * @return a reference to this + * @stable ICU 2.0 + */ + inline UnicodeString& remove(void); + + /** + * Remove the characters in the range + * [<TT>start</TT>, <TT>start + length</TT>) from the UnicodeString object. + * @param start the offset of the first character to remove + * @param length the number of characters to remove + * @return a reference to this + * @stable ICU 2.0 + */ + inline UnicodeString& remove(int32_t start, + int32_t length = (int32_t)INT32_MAX); + + /** + * Remove the characters in the range + * [<TT>start</TT>, <TT>limit</TT>) from the UnicodeString object. + * @param start the offset of the first character to remove + * @param limit the offset immediately following the range to remove + * @return a reference to this + * @stable ICU 2.0 + */ + inline UnicodeString& removeBetween(int32_t start, + int32_t limit = (int32_t)INT32_MAX); + + /** + * Retain only the characters in the range + * [<code>start</code>, <code>limit</code>) from the UnicodeString object. + * Removes characters before <code>start</code> and at and after <code>limit</code>. + * @param start the offset of the first character to retain + * @param limit the offset immediately following the range to retain + * @return a reference to this + * @stable ICU 4.4 + */ + inline UnicodeString &retainBetween(int32_t start, int32_t limit = INT32_MAX); + + /* Length operations */ + + /** + * Pad the start of this UnicodeString with the character <TT>padChar</TT>. + * If the length of this UnicodeString is less than targetLength, + * length() - targetLength copies of padChar will be added to the + * beginning of this UnicodeString. + * @param targetLength the desired length of the string + * @param padChar the character to use for padding. Defaults to + * space (U+0020) + * @return TRUE if the text was padded, FALSE otherwise. + * @stable ICU 2.0 + */ + UBool padLeading(int32_t targetLength, + UChar padChar = 0x0020); + + /** + * Pad the end of this UnicodeString with the character <TT>padChar</TT>. + * If the length of this UnicodeString is less than targetLength, + * length() - targetLength copies of padChar will be added to the + * end of this UnicodeString. + * @param targetLength the desired length of the string + * @param padChar the character to use for padding. Defaults to + * space (U+0020) + * @return TRUE if the text was padded, FALSE otherwise. + * @stable ICU 2.0 + */ + UBool padTrailing(int32_t targetLength, + UChar padChar = 0x0020); + + /** + * Truncate this UnicodeString to the <TT>targetLength</TT>. + * @param targetLength the desired length of this UnicodeString. + * @return TRUE if the text was truncated, FALSE otherwise + * @stable ICU 2.0 + */ + inline UBool truncate(int32_t targetLength); + + /** + * Trims leading and trailing whitespace from this UnicodeString. + * @return a reference to this + * @stable ICU 2.0 + */ + UnicodeString& trim(void); + + + /* Miscellaneous operations */ + + /** + * Reverse this UnicodeString in place. + * @return a reference to this + * @stable ICU 2.0 + */ + inline UnicodeString& reverse(void); + + /** + * Reverse the range [<TT>start</TT>, <TT>start + length</TT>) in + * this UnicodeString. + * @param start the start of the range to reverse + * @param length the number of characters to to reverse + * @return a reference to this + * @stable ICU 2.0 + */ + inline UnicodeString& reverse(int32_t start, + int32_t length); + + /** + * Convert the characters in this to UPPER CASE following the conventions of + * the default locale. + * @return A reference to this. + * @stable ICU 2.0 + */ + UnicodeString& toUpper(void); + + /** + * Convert the characters in this to UPPER CASE following the conventions of + * a specific locale. + * @param locale The locale containing the conventions to use. + * @return A reference to this. + * @stable ICU 2.0 + */ + UnicodeString& toUpper(const Locale& locale); + + /** + * Convert the characters in this to lower case following the conventions of + * the default locale. + * @return A reference to this. + * @stable ICU 2.0 + */ + UnicodeString& toLower(void); + + /** + * Convert the characters in this to lower case following the conventions of + * a specific locale. + * @param locale The locale containing the conventions to use. + * @return A reference to this. + * @stable ICU 2.0 + */ + UnicodeString& toLower(const Locale& locale); + +#if !UCONFIG_NO_BREAK_ITERATION + + /** + * Titlecase this string, convenience function using the default locale. + * + * Casing is locale-dependent and context-sensitive. + * Titlecasing uses a break iterator to find the first characters of words + * that are to be titlecased. It titlecases those characters and lowercases + * all others. + * + * The titlecase break iterator can be provided to customize for arbitrary + * styles, using rules and dictionaries beyond the standard iterators. + * It may be more efficient to always provide an iterator to avoid + * opening and closing one for each string. + * The standard titlecase iterator for the root locale implements the + * algorithm of Unicode TR 21. + * + * This function uses only the setText(), first() and next() methods of the + * provided break iterator. + * + * @param titleIter A break iterator to find the first characters of words + * that are to be titlecased. + * If none is provided (0), then a standard titlecase + * break iterator is opened. + * Otherwise the provided iterator is set to the string's text. + * @return A reference to this. + * @stable ICU 2.1 + */ + UnicodeString &toTitle(BreakIterator *titleIter); + + /** + * Titlecase this string. + * + * Casing is locale-dependent and context-sensitive. + * Titlecasing uses a break iterator to find the first characters of words + * that are to be titlecased. It titlecases those characters and lowercases + * all others. + * + * The titlecase break iterator can be provided to customize for arbitrary + * styles, using rules and dictionaries beyond the standard iterators. + * It may be more efficient to always provide an iterator to avoid + * opening and closing one for each string. + * The standard titlecase iterator for the root locale implements the + * algorithm of Unicode TR 21. + * + * This function uses only the setText(), first() and next() methods of the + * provided break iterator. + * + * @param titleIter A break iterator to find the first characters of words + * that are to be titlecased. + * If none is provided (0), then a standard titlecase + * break iterator is opened. + * Otherwise the provided iterator is set to the string's text. + * @param locale The locale to consider. + * @return A reference to this. + * @stable ICU 2.1 + */ + UnicodeString &toTitle(BreakIterator *titleIter, const Locale &locale); + + /** + * Titlecase this string, with options. + * + * Casing is locale-dependent and context-sensitive. + * Titlecasing uses a break iterator to find the first characters of words + * that are to be titlecased. It titlecases those characters and lowercases + * all others. (This can be modified with options.) + * + * The titlecase break iterator can be provided to customize for arbitrary + * styles, using rules and dictionaries beyond the standard iterators. + * It may be more efficient to always provide an iterator to avoid + * opening and closing one for each string. + * The standard titlecase iterator for the root locale implements the + * algorithm of Unicode TR 21. + * + * This function uses only the setText(), first() and next() methods of the + * provided break iterator. + * + * @param titleIter A break iterator to find the first characters of words + * that are to be titlecased. + * If none is provided (0), then a standard titlecase + * break iterator is opened. + * Otherwise the provided iterator is set to the string's text. + * @param locale The locale to consider. + * @param options Options bit set, see ucasemap_open(). + * @return A reference to this. + * @see U_TITLECASE_NO_LOWERCASE + * @see U_TITLECASE_NO_BREAK_ADJUSTMENT + * @see ucasemap_open + * @stable ICU 3.8 + */ + UnicodeString &toTitle(BreakIterator *titleIter, const Locale &locale, uint32_t options); + +#endif + + /** + * Case-folds the characters in this string. + * + * Case-folding is locale-independent and not context-sensitive, + * but there is an option for whether to include or exclude mappings for dotted I + * and dotless i that are marked with 'T' in CaseFolding.txt. + * + * The result may be longer or shorter than the original. + * + * @param options Either U_FOLD_CASE_DEFAULT or U_FOLD_CASE_EXCLUDE_SPECIAL_I + * @return A reference to this. + * @stable ICU 2.0 + */ + UnicodeString &foldCase(uint32_t options=0 /*U_FOLD_CASE_DEFAULT*/); + + //======================================== + // Access to the internal buffer + //======================================== + + /** + * Get a read/write pointer to the internal buffer. + * The buffer is guaranteed to be large enough for at least minCapacity UChars, + * writable, and is still owned by the UnicodeString object. + * Calls to getBuffer(minCapacity) must not be nested, and + * must be matched with calls to releaseBuffer(newLength). + * If the string buffer was read-only or shared, + * then it will be reallocated and copied. + * + * An attempted nested call will return 0, and will not further modify the + * state of the UnicodeString object. + * It also returns 0 if the string is bogus. + * + * The actual capacity of the string buffer may be larger than minCapacity. + * getCapacity() returns the actual capacity. + * For many operations, the full capacity should be used to avoid reallocations. + * + * While the buffer is "open" between getBuffer(minCapacity) + * and releaseBuffer(newLength), the following applies: + * - The string length is set to 0. + * - Any read API call on the UnicodeString object will behave like on a 0-length string. + * - Any write API call on the UnicodeString object is disallowed and will have no effect. + * - You can read from and write to the returned buffer. + * - The previous string contents will still be in the buffer; + * if you want to use it, then you need to call length() before getBuffer(minCapacity). + * If the length() was greater than minCapacity, then any contents after minCapacity + * may be lost. + * The buffer contents is not NUL-terminated by getBuffer(). + * If length()<getCapacity() then you can terminate it by writing a NUL + * at index length(). + * - You must call releaseBuffer(newLength) before and in order to + * return to normal UnicodeString operation. + * + * @param minCapacity the minimum number of UChars that are to be available + * in the buffer, starting at the returned pointer; + * default to the current string capacity if minCapacity==-1 + * @return a writable pointer to the internal string buffer, + * or 0 if an error occurs (nested calls, out of memory) + * + * @see releaseBuffer + * @see getTerminatedBuffer() + * @stable ICU 2.0 + */ + UChar *getBuffer(int32_t minCapacity); + + /** + * Release a read/write buffer on a UnicodeString object with an + * "open" getBuffer(minCapacity). + * This function must be called in a matched pair with getBuffer(minCapacity). + * releaseBuffer(newLength) must be called if and only if a getBuffer(minCapacity) is "open". + * + * It will set the string length to newLength, at most to the current capacity. + * If newLength==-1 then it will set the length according to the + * first NUL in the buffer, or to the capacity if there is no NUL. + * + * After calling releaseBuffer(newLength) the UnicodeString is back to normal operation. + * + * @param newLength the new length of the UnicodeString object; + * defaults to the current capacity if newLength is greater than that; + * if newLength==-1, it defaults to u_strlen(buffer) but not more than + * the current capacity of the string + * + * @see getBuffer(int32_t minCapacity) + * @stable ICU 2.0 + */ + void releaseBuffer(int32_t newLength=-1); + + /** + * Get a read-only pointer to the internal buffer. + * This can be called at any time on a valid UnicodeString. + * + * It returns 0 if the string is bogus, or + * during an "open" getBuffer(minCapacity). + * + * It can be called as many times as desired. + * The pointer that it returns will remain valid until the UnicodeString object is modified, + * at which time the pointer is semantically invalidated and must not be used any more. + * + * The capacity of the buffer can be determined with getCapacity(). + * The part after length() may or may not be initialized and valid, + * depending on the history of the UnicodeString object. + * + * The buffer contents is (probably) not NUL-terminated. + * You can check if it is with + * <code>(s.length()<s.getCapacity() && buffer[s.length()]==0)</code>. + * (See getTerminatedBuffer().) + * + * The buffer may reside in read-only memory. Its contents must not + * be modified. + * + * @return a read-only pointer to the internal string buffer, + * or 0 if the string is empty or bogus + * + * @see getBuffer(int32_t minCapacity) + * @see getTerminatedBuffer() + * @stable ICU 2.0 + */ + inline const UChar *getBuffer() const; + + /** + * Get a read-only pointer to the internal buffer, + * making sure that it is NUL-terminated. + * This can be called at any time on a valid UnicodeString. + * + * It returns 0 if the string is bogus, or + * during an "open" getBuffer(minCapacity), or if the buffer cannot + * be NUL-terminated (because memory allocation failed). + * + * It can be called as many times as desired. + * The pointer that it returns will remain valid until the UnicodeString object is modified, + * at which time the pointer is semantically invalidated and must not be used any more. + * + * The capacity of the buffer can be determined with getCapacity(). + * The part after length()+1 may or may not be initialized and valid, + * depending on the history of the UnicodeString object. + * + * The buffer contents is guaranteed to be NUL-terminated. + * getTerminatedBuffer() may reallocate the buffer if a terminating NUL + * is written. + * For this reason, this function is not const, unlike getBuffer(). + * Note that a UnicodeString may also contain NUL characters as part of its contents. + * + * The buffer may reside in read-only memory. Its contents must not + * be modified. + * + * @return a read-only pointer to the internal string buffer, + * or 0 if the string is empty or bogus + * + * @see getBuffer(int32_t minCapacity) + * @see getBuffer() + * @stable ICU 2.2 + */ + const UChar *getTerminatedBuffer(); + + //======================================== + // Constructors + //======================================== + + /** Construct an empty UnicodeString. + * @stable ICU 2.0 + */ + inline UnicodeString(); + + /** + * Construct a UnicodeString with capacity to hold <TT>capacity</TT> UChars + * @param capacity the number of UChars this UnicodeString should hold + * before a resize is necessary; if count is greater than 0 and count + * code points c take up more space than capacity, then capacity is adjusted + * accordingly. + * @param c is used to initially fill the string + * @param count specifies how many code points c are to be written in the + * string + * @stable ICU 2.0 + */ + UnicodeString(int32_t capacity, UChar32 c, int32_t count); + + /** + * Single UChar (code unit) constructor. + * + * It is recommended to mark this constructor "explicit" by + * <code>-DUNISTR_FROM_CHAR_EXPLICIT=explicit</code> + * on the compiler command line or similar. + * @param ch the character to place in the UnicodeString + * @stable ICU 2.0 + */ + UNISTR_FROM_CHAR_EXPLICIT UnicodeString(UChar ch); + + /** + * Single UChar32 (code point) constructor. + * + * It is recommended to mark this constructor "explicit" by + * <code>-DUNISTR_FROM_CHAR_EXPLICIT=explicit</code> + * on the compiler command line or similar. + * @param ch the character to place in the UnicodeString + * @stable ICU 2.0 + */ + UNISTR_FROM_CHAR_EXPLICIT UnicodeString(UChar32 ch); + + /** + * UChar* constructor. + * + * It is recommended to mark this constructor "explicit" by + * <code>-DUNISTR_FROM_STRING_EXPLICIT=explicit</code> + * on the compiler command line or similar. + * @param text The characters to place in the UnicodeString. <TT>text</TT> + * must be NULL (U+0000) terminated. + * @stable ICU 2.0 + */ + UNISTR_FROM_STRING_EXPLICIT UnicodeString(const UChar *text); + + /** + * UChar* constructor. + * @param text The characters to place in the UnicodeString. + * @param textLength The number of Unicode characters in <TT>text</TT> + * to copy. + * @stable ICU 2.0 + */ + UnicodeString(const UChar *text, + int32_t textLength); + + /** + * Readonly-aliasing UChar* constructor. + * The text will be used for the UnicodeString object, but + * it will not be released when the UnicodeString is destroyed. + * This has copy-on-write semantics: + * When the string is modified, then the buffer is first copied into + * newly allocated memory. + * The aliased buffer is never modified. + * + * In an assignment to another UnicodeString, when using the copy constructor + * or the assignment operator, the text will be copied. + * When using fastCopyFrom(), the text will be aliased again, + * so that both strings then alias the same readonly-text. + * + * @param isTerminated specifies if <code>text</code> is <code>NUL</code>-terminated. + * This must be true if <code>textLength==-1</code>. + * @param text The characters to alias for the UnicodeString. + * @param textLength The number of Unicode characters in <code>text</code> to alias. + * If -1, then this constructor will determine the length + * by calling <code>u_strlen()</code>. + * @stable ICU 2.0 + */ + UnicodeString(UBool isTerminated, + const UChar *text, + int32_t textLength); + + /** + * Writable-aliasing UChar* constructor. + * The text will be used for the UnicodeString object, but + * it will not be released when the UnicodeString is destroyed. + * This has write-through semantics: + * For as long as the capacity of the buffer is sufficient, write operations + * will directly affect the buffer. When more capacity is necessary, then + * a new buffer will be allocated and the contents copied as with regularly + * constructed strings. + * In an assignment to another UnicodeString, the buffer will be copied. + * The extract(UChar *dst) function detects whether the dst pointer is the same + * as the string buffer itself and will in this case not copy the contents. + * + * @param buffer The characters to alias for the UnicodeString. + * @param buffLength The number of Unicode characters in <code>buffer</code> to alias. + * @param buffCapacity The size of <code>buffer</code> in UChars. + * @stable ICU 2.0 + */ + UnicodeString(UChar *buffer, int32_t buffLength, int32_t buffCapacity); + +#if U_CHARSET_IS_UTF8 || !UCONFIG_NO_CONVERSION + + /** + * char* constructor. + * Uses the default converter (and thus depends on the ICU conversion code) + * unless U_CHARSET_IS_UTF8 is set to 1. + * + * For ASCII (really "invariant character") strings it is more efficient to use + * the constructor that takes a US_INV (for its enum EInvariant). + * For ASCII (invariant-character) string literals, see UNICODE_STRING and + * UNICODE_STRING_SIMPLE. + * + * It is recommended to mark this constructor "explicit" by + * <code>-DUNISTR_FROM_STRING_EXPLICIT=explicit</code> + * on the compiler command line or similar. + * @param codepageData an array of bytes, null-terminated, + * in the platform's default codepage. + * @stable ICU 2.0 + * @see UNICODE_STRING + * @see UNICODE_STRING_SIMPLE + */ + UNISTR_FROM_STRING_EXPLICIT UnicodeString(const char *codepageData); + + /** + * char* constructor. + * Uses the default converter (and thus depends on the ICU conversion code) + * unless U_CHARSET_IS_UTF8 is set to 1. + * @param codepageData an array of bytes in the platform's default codepage. + * @param dataLength The number of bytes in <TT>codepageData</TT>. + * @stable ICU 2.0 + */ + UnicodeString(const char *codepageData, int32_t dataLength); + +#endif + +#if !UCONFIG_NO_CONVERSION + + /** + * char* constructor. + * @param codepageData an array of bytes, null-terminated + * @param codepage the encoding of <TT>codepageData</TT>. The special + * value 0 for <TT>codepage</TT> indicates that the text is in the + * platform's default codepage. + * + * If <code>codepage</code> is an empty string (<code>""</code>), + * then a simple conversion is performed on the codepage-invariant + * subset ("invariant characters") of the platform encoding. See utypes.h. + * Recommendation: For invariant-character strings use the constructor + * UnicodeString(const char *src, int32_t length, enum EInvariant inv) + * because it avoids object code dependencies of UnicodeString on + * the conversion code. + * + * @stable ICU 2.0 + */ + UnicodeString(const char *codepageData, const char *codepage); + + /** + * char* constructor. + * @param codepageData an array of bytes. + * @param dataLength The number of bytes in <TT>codepageData</TT>. + * @param codepage the encoding of <TT>codepageData</TT>. The special + * value 0 for <TT>codepage</TT> indicates that the text is in the + * platform's default codepage. + * If <code>codepage</code> is an empty string (<code>""</code>), + * then a simple conversion is performed on the codepage-invariant + * subset ("invariant characters") of the platform encoding. See utypes.h. + * Recommendation: For invariant-character strings use the constructor + * UnicodeString(const char *src, int32_t length, enum EInvariant inv) + * because it avoids object code dependencies of UnicodeString on + * the conversion code. + * + * @stable ICU 2.0 + */ + UnicodeString(const char *codepageData, int32_t dataLength, const char *codepage); + + /** + * char * / UConverter constructor. + * This constructor uses an existing UConverter object to + * convert the codepage string to Unicode and construct a UnicodeString + * from that. + * + * The converter is reset at first. + * If the error code indicates a failure before this constructor is called, + * or if an error occurs during conversion or construction, + * then the string will be bogus. + * + * This function avoids the overhead of opening and closing a converter if + * multiple strings are constructed. + * + * @param src input codepage string + * @param srcLength length of the input string, can be -1 for NUL-terminated strings + * @param cnv converter object (ucnv_resetToUnicode() will be called), + * can be NULL for the default converter + * @param errorCode normal ICU error code + * @stable ICU 2.0 + */ + UnicodeString( + const char *src, int32_t srcLength, + UConverter *cnv, + UErrorCode &errorCode); + +#endif + + /** + * Constructs a Unicode string from an invariant-character char * string. + * About invariant characters see utypes.h. + * This constructor has no runtime dependency on conversion code and is + * therefore recommended over ones taking a charset name string + * (where the empty string "" indicates invariant-character conversion). + * + * Use the macro US_INV as the third, signature-distinguishing parameter. + * + * For example: + * \code + * void fn(const char *s) { + * UnicodeString ustr(s, -1, US_INV); + * // use ustr ... + * } + * \endcode + * + * @param src String using only invariant characters. + * @param length Length of src, or -1 if NUL-terminated. + * @param inv Signature-distinguishing paramater, use US_INV. + * + * @see US_INV + * @stable ICU 3.2 + */ + UnicodeString(const char *src, int32_t length, enum EInvariant inv); + + + /** + * Copy constructor. + * @param that The UnicodeString object to copy. + * @stable ICU 2.0 + */ + UnicodeString(const UnicodeString& that); + + /** + * 'Substring' constructor from tail of source string. + * @param src The UnicodeString object to copy. + * @param srcStart The offset into <tt>src</tt> at which to start copying. + * @stable ICU 2.2 + */ + UnicodeString(const UnicodeString& src, int32_t srcStart); + + /** + * 'Substring' constructor from subrange of source string. + * @param src The UnicodeString object to copy. + * @param srcStart The offset into <tt>src</tt> at which to start copying. + * @param srcLength The number of characters from <tt>src</tt> to copy. + * @stable ICU 2.2 + */ + UnicodeString(const UnicodeString& src, int32_t srcStart, int32_t srcLength); + + /** + * Clone this object, an instance of a subclass of Replaceable. + * Clones can be used concurrently in multiple threads. + * If a subclass does not implement clone(), or if an error occurs, + * then NULL is returned. + * The clone functions in all subclasses return a pointer to a Replaceable + * because some compilers do not support covariant (same-as-this) + * return types; cast to the appropriate subclass if necessary. + * The caller must delete the clone. + * + * @return a clone of this object + * + * @see Replaceable::clone + * @see getDynamicClassID + * @stable ICU 2.6 + */ + virtual Replaceable *clone() const; + + /** Destructor. + * @stable ICU 2.0 + */ + virtual ~UnicodeString(); + + /** + * Create a UnicodeString from a UTF-8 string. + * Illegal input is replaced with U+FFFD. Otherwise, errors result in a bogus string. + * Calls u_strFromUTF8WithSub(). + * + * @param utf8 UTF-8 input string. + * Note that a StringPiece can be implicitly constructed + * from a std::string or a NUL-terminated const char * string. + * @return A UnicodeString with equivalent UTF-16 contents. + * @see toUTF8 + * @see toUTF8String + * @stable ICU 4.2 + */ + static UnicodeString fromUTF8(const StringPiece &utf8); + + /** + * Create a UnicodeString from a UTF-32 string. + * Illegal input is replaced with U+FFFD. Otherwise, errors result in a bogus string. + * Calls u_strFromUTF32WithSub(). + * + * @param utf32 UTF-32 input string. Must not be NULL. + * @param length Length of the input string, or -1 if NUL-terminated. + * @return A UnicodeString with equivalent UTF-16 contents. + * @see toUTF32 + * @stable ICU 4.2 + */ + static UnicodeString fromUTF32(const UChar32 *utf32, int32_t length); + + /* Miscellaneous operations */ + + /** + * Unescape a string of characters and return a string containing + * the result. The following escape sequences are recognized: + * + * \\uhhhh 4 hex digits; h in [0-9A-Fa-f] + * \\Uhhhhhhhh 8 hex digits + * \\xhh 1-2 hex digits + * \\ooo 1-3 octal digits; o in [0-7] + * \\cX control-X; X is masked with 0x1F + * + * as well as the standard ANSI C escapes: + * + * \\a => U+0007, \\b => U+0008, \\t => U+0009, \\n => U+000A, + * \\v => U+000B, \\f => U+000C, \\r => U+000D, \\e => U+001B, + * \\" => U+0022, \\' => U+0027, \\? => U+003F, \\\\ => U+005C + * + * Anything else following a backslash is generically escaped. For + * example, "[a\\-z]" returns "[a-z]". + * + * If an escape sequence is ill-formed, this method returns an empty + * string. An example of an ill-formed sequence is "\\u" followed by + * fewer than 4 hex digits. + * + * This function is similar to u_unescape() but not identical to it. + * The latter takes a source char*, so it does escape recognition + * and also invariant conversion. + * + * @return a string with backslash escapes interpreted, or an + * empty string on error. + * @see UnicodeString#unescapeAt() + * @see u_unescape() + * @see u_unescapeAt() + * @stable ICU 2.0 + */ + UnicodeString unescape() const; + + /** + * Unescape a single escape sequence and return the represented + * character. See unescape() for a listing of the recognized escape + * sequences. The character at offset-1 is assumed (without + * checking) to be a backslash. If the escape sequence is + * ill-formed, or the offset is out of range, U_SENTINEL=-1 is + * returned. + * + * @param offset an input output parameter. On input, it is the + * offset into this string where the escape sequence is located, + * after the initial backslash. On output, it is advanced after the + * last character parsed. On error, it is not advanced at all. + * @return the character represented by the escape sequence at + * offset, or U_SENTINEL=-1 on error. + * @see UnicodeString#unescape() + * @see u_unescape() + * @see u_unescapeAt() + * @stable ICU 2.0 + */ + UChar32 unescapeAt(int32_t &offset) const; + + /** + * ICU "poor man's RTTI", returns a UClassID for this class. + * + * @stable ICU 2.2 + */ + static UClassID U_EXPORT2 getStaticClassID(); + + /** + * ICU "poor man's RTTI", returns a UClassID for the actual class. + * + * @stable ICU 2.2 + */ + virtual UClassID getDynamicClassID() const; + + //======================================== + // Implementation methods + //======================================== + +protected: + /** + * Implement Replaceable::getLength() (see jitterbug 1027). + * @stable ICU 2.4 + */ + virtual int32_t getLength() const; + + /** + * The change in Replaceable to use virtual getCharAt() allows + * UnicodeString::charAt() to be inline again (see jitterbug 709). + * @stable ICU 2.4 + */ + virtual UChar getCharAt(int32_t offset) const; + + /** + * The change in Replaceable to use virtual getChar32At() allows + * UnicodeString::char32At() to be inline again (see jitterbug 709). + * @stable ICU 2.4 + */ + virtual UChar32 getChar32At(int32_t offset) const; + +private: + // For char* constructors. Could be made public. + UnicodeString &setToUTF8(const StringPiece &utf8); + // For extract(char*). + // We could make a toUTF8(target, capacity, errorCode) public but not + // this version: New API will be cleaner if we make callers create substrings + // rather than having start+length on every method, + // and it should take a UErrorCode&. + int32_t + toUTF8(int32_t start, int32_t len, + char *target, int32_t capacity) const; + + /** + * Internal string contents comparison, called by operator==. + * Requires: this & text not bogus and have same lengths. + */ + UBool doEquals(const UnicodeString &text, int32_t len) const; + + inline int8_t + doCompare(int32_t start, + int32_t length, + const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLength) const; + + int8_t doCompare(int32_t start, + int32_t length, + const UChar *srcChars, + int32_t srcStart, + int32_t srcLength) const; + + inline int8_t + doCompareCodePointOrder(int32_t start, + int32_t length, + const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLength) const; + + int8_t doCompareCodePointOrder(int32_t start, + int32_t length, + const UChar *srcChars, + int32_t srcStart, + int32_t srcLength) const; + + inline int8_t + doCaseCompare(int32_t start, + int32_t length, + const UnicodeString &srcText, + int32_t srcStart, + int32_t srcLength, + uint32_t options) const; + + int8_t + doCaseCompare(int32_t start, + int32_t length, + const UChar *srcChars, + int32_t srcStart, + int32_t srcLength, + uint32_t options) const; + + int32_t doIndexOf(UChar c, + int32_t start, + int32_t length) const; + + int32_t doIndexOf(UChar32 c, + int32_t start, + int32_t length) const; + + int32_t doLastIndexOf(UChar c, + int32_t start, + int32_t length) const; + + int32_t doLastIndexOf(UChar32 c, + int32_t start, + int32_t length) const; + + void doExtract(int32_t start, + int32_t length, + UChar *dst, + int32_t dstStart) const; + + inline void doExtract(int32_t start, + int32_t length, + UnicodeString& target) const; + + inline UChar doCharAt(int32_t offset) const; + + UnicodeString& doReplace(int32_t start, + int32_t length, + const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLength); + + UnicodeString& doReplace(int32_t start, + int32_t length, + const UChar *srcChars, + int32_t srcStart, + int32_t srcLength); + + UnicodeString& doReverse(int32_t start, + int32_t length); + + // calculate hash code + int32_t doHashCode(void) const; + + // get pointer to start of array + // these do not check for kOpenGetBuffer, unlike the public getBuffer() function + inline UChar* getArrayStart(void); + inline const UChar* getArrayStart(void) const; + + // A UnicodeString object (not necessarily its current buffer) + // is writable unless it isBogus() or it has an "open" getBuffer(minCapacity). + inline UBool isWritable() const; + + // Is the current buffer writable? + inline UBool isBufferWritable() const; + + // None of the following does releaseArray(). + inline void setLength(int32_t len); // sets only fShortLength and fLength + inline void setToEmpty(); // sets fFlags=kShortString + inline void setArray(UChar *array, int32_t len, int32_t capacity); // does not set fFlags + + // allocate the array; result may be fStackBuffer + // sets refCount to 1 if appropriate + // sets fArray, fCapacity, and fFlags + // returns boolean for success or failure + UBool allocate(int32_t capacity); + + // release the array if owned + void releaseArray(void); + + // turn a bogus string into an empty one + void unBogus(); + + // implements assigment operator, copy constructor, and fastCopyFrom() + UnicodeString ©From(const UnicodeString &src, UBool fastCopy=FALSE); + + // Pin start and limit to acceptable values. + inline void pinIndex(int32_t& start) const; + inline void pinIndices(int32_t& start, + int32_t& length) const; + +#if !UCONFIG_NO_CONVERSION + + /* Internal extract() using UConverter. */ + int32_t doExtract(int32_t start, int32_t length, + char *dest, int32_t destCapacity, + UConverter *cnv, + UErrorCode &errorCode) const; + + /* + * Real constructor for converting from codepage data. + * It assumes that it is called with !fRefCounted. + * + * If <code>codepage==0</code>, then the default converter + * is used for the platform encoding. + * If <code>codepage</code> is an empty string (<code>""</code>), + * then a simple conversion is performed on the codepage-invariant + * subset ("invariant characters") of the platform encoding. See utypes.h. + */ + void doCodepageCreate(const char *codepageData, + int32_t dataLength, + const char *codepage); + + /* + * Worker function for creating a UnicodeString from + * a codepage string using a UConverter. + */ + void + doCodepageCreate(const char *codepageData, + int32_t dataLength, + UConverter *converter, + UErrorCode &status); + +#endif + + /* + * This function is called when write access to the array + * is necessary. + * + * We need to make a copy of the array if + * the buffer is read-only, or + * the buffer is refCounted (shared), and refCount>1, or + * the buffer is too small. + * + * Return FALSE if memory could not be allocated. + */ + UBool cloneArrayIfNeeded(int32_t newCapacity = -1, + int32_t growCapacity = -1, + UBool doCopyArray = TRUE, + int32_t **pBufferToDelete = 0, + UBool forceClone = FALSE); + + /** + * Common function for UnicodeString case mappings. + * The stringCaseMapper has the same type UStringCaseMapper + * as in ustr_imp.h for ustrcase_map(). + */ + UnicodeString & + caseMap(const UCaseMap *csm, UStringCaseMapper *stringCaseMapper); + + // ref counting + void addRef(void); + int32_t removeRef(void); + int32_t refCount(void) const; + + // constants + enum { + // Set the stack buffer size so that sizeof(UnicodeString) is, + // naturally (without padding), a multiple of sizeof(pointer). + US_STACKBUF_SIZE= sizeof(void *)==4 ? 13 : 15, // Size of stack buffer for short strings + kInvalidUChar=0xffff, // invalid UChar index + kGrowSize=128, // grow size for this buffer + kInvalidHashCode=0, // invalid hash code + kEmptyHashCode=1, // hash code for empty string + + // bit flag values for fFlags + kIsBogus=1, // this string is bogus, i.e., not valid or NULL + kUsingStackBuffer=2,// using fUnion.fStackBuffer instead of fUnion.fFields + kRefCounted=4, // there is a refCount field before the characters in fArray + kBufferIsReadonly=8,// do not write to this buffer + kOpenGetBuffer=16, // getBuffer(minCapacity) was called (is "open"), + // and releaseBuffer(newLength) must be called + + // combined values for convenience + kShortString=kUsingStackBuffer, + kLongString=kRefCounted, + kReadonlyAlias=kBufferIsReadonly, + kWritableAlias=0 + }; + + friend class StringThreadTest; + friend class UnicodeStringAppendable; + + union StackBufferOrFields; // forward declaration necessary before friend declaration + friend union StackBufferOrFields; // make US_STACKBUF_SIZE visible inside fUnion + + /* + * The following are all the class fields that are stored + * in each UnicodeString object. + * Note that UnicodeString has virtual functions, + * therefore there is an implicit vtable pointer + * as the first real field. + * The fields should be aligned such that no padding is necessary. + * On 32-bit machines, the size should be 32 bytes, + * on 64-bit machines (8-byte pointers), it should be 40 bytes. + * + * We use a hack to achieve this. + * + * With at least some compilers, each of the following is forced to + * a multiple of sizeof(pointer) [the largest field base unit here is a data pointer], + * rounded up with additional padding if the fields do not already fit that requirement: + * - sizeof(class UnicodeString) + * - offsetof(UnicodeString, fUnion) + * - sizeof(fUnion) + * - sizeof(fFields) + * + * In order to avoid padding, we make sizeof(fStackBuffer)=16 (=8 UChars) + * which is at least as large as sizeof(fFields) on 32-bit and 64-bit machines. + * (Padding at the end of fFields is ok: + * As long as there is no padding after fStackBuffer, it is not wasted space.) + * + * We further assume that the compiler does not reorder the fields, + * so that fRestOfStackBuffer (which holds a few more UChars) immediately follows after fUnion, + * with at most some padding (but no other field) in between. + * (Padding there would be wasted space, but functionally harmless.) + * + * We use a few more sizeof(pointer)'s chunks of space with + * fRestOfStackBuffer, fShortLength and fFlags, + * to get up exactly to the intended sizeof(UnicodeString). + */ + // (implicit) *vtable; + union StackBufferOrFields { + // fStackBuffer is used iff (fFlags&kUsingStackBuffer) + // else fFields is used + UChar fStackBuffer[8]; // buffer for short strings, together with fRestOfStackBuffer + struct { + UChar *fArray; // the Unicode data + int32_t fCapacity; // capacity of fArray (in UChars) + int32_t fLength; // number of characters in fArray if >127; else undefined + } fFields; + } fUnion; + UChar fRestOfStackBuffer[US_STACKBUF_SIZE-8]; + int8_t fShortLength; // 0..127: length <0: real length is in fUnion.fFields.fLength + uint8_t fFlags; // bit flags: see constants above +}; + +/** + * Create a new UnicodeString with the concatenation of two others. + * + * @param s1 The first string to be copied to the new one. + * @param s2 The second string to be copied to the new one, after s1. + * @return UnicodeString(s1).append(s2) + * @stable ICU 2.8 + */ +U_COMMON_API UnicodeString U_EXPORT2 +operator+ (const UnicodeString &s1, const UnicodeString &s2); + +//======================================== +// Inline members +//======================================== + +//======================================== +// Privates +//======================================== + +inline void +UnicodeString::pinIndex(int32_t& start) const +{ + // pin index + if(start < 0) { + start = 0; + } else if(start > length()) { + start = length(); + } +} + +inline void +UnicodeString::pinIndices(int32_t& start, + int32_t& _length) const +{ + // pin indices + int32_t len = length(); + if(start < 0) { + start = 0; + } else if(start > len) { + start = len; + } + if(_length < 0) { + _length = 0; + } else if(_length > (len - start)) { + _length = (len - start); + } +} + +inline UChar* +UnicodeString::getArrayStart() +{ return (fFlags&kUsingStackBuffer) ? fUnion.fStackBuffer : fUnion.fFields.fArray; } + +inline const UChar* +UnicodeString::getArrayStart() const +{ return (fFlags&kUsingStackBuffer) ? fUnion.fStackBuffer : fUnion.fFields.fArray; } + +//======================================== +// Default constructor +//======================================== + +inline +UnicodeString::UnicodeString() + : fShortLength(0), + fFlags(kShortString) +{} + +//======================================== +// Read-only implementation methods +//======================================== +inline int32_t +UnicodeString::length() const +{ return fShortLength>=0 ? fShortLength : fUnion.fFields.fLength; } + +inline int32_t +UnicodeString::getCapacity() const +{ return (fFlags&kUsingStackBuffer) ? US_STACKBUF_SIZE : fUnion.fFields.fCapacity; } + +inline int32_t +UnicodeString::hashCode() const +{ return doHashCode(); } + +inline UBool +UnicodeString::isBogus() const +{ return (UBool)(fFlags & kIsBogus); } + +inline UBool +UnicodeString::isWritable() const +{ return (UBool)!(fFlags&(kOpenGetBuffer|kIsBogus)); } + +inline UBool +UnicodeString::isBufferWritable() const +{ + return (UBool)( + !(fFlags&(kOpenGetBuffer|kIsBogus|kBufferIsReadonly)) && + (!(fFlags&kRefCounted) || refCount()==1)); +} + +inline const UChar * +UnicodeString::getBuffer() const { + if(fFlags&(kIsBogus|kOpenGetBuffer)) { + return 0; + } else if(fFlags&kUsingStackBuffer) { + return fUnion.fStackBuffer; + } else { + return fUnion.fFields.fArray; + } +} + +//======================================== +// Read-only alias methods +//======================================== +inline int8_t +UnicodeString::doCompare(int32_t start, + int32_t thisLength, + const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLength) const +{ + if(srcText.isBogus()) { + return (int8_t)!isBogus(); // 0 if both are bogus, 1 otherwise + } else { + srcText.pinIndices(srcStart, srcLength); + return doCompare(start, thisLength, srcText.getArrayStart(), srcStart, srcLength); + } +} + +inline UBool +UnicodeString::operator== (const UnicodeString& text) const +{ + if(isBogus()) { + return text.isBogus(); + } else { + int32_t len = length(), textLength = text.length(); + return !text.isBogus() && len == textLength && doEquals(text, len); + } +} + +inline UBool +UnicodeString::operator!= (const UnicodeString& text) const +{ return (! operator==(text)); } + +inline UBool +UnicodeString::operator> (const UnicodeString& text) const +{ return doCompare(0, length(), text, 0, text.length()) == 1; } + +inline UBool +UnicodeString::operator< (const UnicodeString& text) const +{ return doCompare(0, length(), text, 0, text.length()) == -1; } + +inline UBool +UnicodeString::operator>= (const UnicodeString& text) const +{ return doCompare(0, length(), text, 0, text.length()) != -1; } + +inline UBool +UnicodeString::operator<= (const UnicodeString& text) const +{ return doCompare(0, length(), text, 0, text.length()) != 1; } + +inline int8_t +UnicodeString::compare(const UnicodeString& text) const +{ return doCompare(0, length(), text, 0, text.length()); } + +inline int8_t +UnicodeString::compare(int32_t start, + int32_t _length, + const UnicodeString& srcText) const +{ return doCompare(start, _length, srcText, 0, srcText.length()); } + +inline int8_t +UnicodeString::compare(const UChar *srcChars, + int32_t srcLength) const +{ return doCompare(0, length(), srcChars, 0, srcLength); } + +inline int8_t +UnicodeString::compare(int32_t start, + int32_t _length, + const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLength) const +{ return doCompare(start, _length, srcText, srcStart, srcLength); } + +inline int8_t +UnicodeString::compare(int32_t start, + int32_t _length, + const UChar *srcChars) const +{ return doCompare(start, _length, srcChars, 0, _length); } + +inline int8_t +UnicodeString::compare(int32_t start, + int32_t _length, + const UChar *srcChars, + int32_t srcStart, + int32_t srcLength) const +{ return doCompare(start, _length, srcChars, srcStart, srcLength); } + +inline int8_t +UnicodeString::compareBetween(int32_t start, + int32_t limit, + const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLimit) const +{ return doCompare(start, limit - start, + srcText, srcStart, srcLimit - srcStart); } + +inline int8_t +UnicodeString::doCompareCodePointOrder(int32_t start, + int32_t thisLength, + const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLength) const +{ + if(srcText.isBogus()) { + return (int8_t)!isBogus(); // 0 if both are bogus, 1 otherwise + } else { + srcText.pinIndices(srcStart, srcLength); + return doCompareCodePointOrder(start, thisLength, srcText.getArrayStart(), srcStart, srcLength); + } +} + +inline int8_t +UnicodeString::compareCodePointOrder(const UnicodeString& text) const +{ return doCompareCodePointOrder(0, length(), text, 0, text.length()); } + +inline int8_t +UnicodeString::compareCodePointOrder(int32_t start, + int32_t _length, + const UnicodeString& srcText) const +{ return doCompareCodePointOrder(start, _length, srcText, 0, srcText.length()); } + +inline int8_t +UnicodeString::compareCodePointOrder(const UChar *srcChars, + int32_t srcLength) const +{ return doCompareCodePointOrder(0, length(), srcChars, 0, srcLength); } + +inline int8_t +UnicodeString::compareCodePointOrder(int32_t start, + int32_t _length, + const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLength) const +{ return doCompareCodePointOrder(start, _length, srcText, srcStart, srcLength); } + +inline int8_t +UnicodeString::compareCodePointOrder(int32_t start, + int32_t _length, + const UChar *srcChars) const +{ return doCompareCodePointOrder(start, _length, srcChars, 0, _length); } + +inline int8_t +UnicodeString::compareCodePointOrder(int32_t start, + int32_t _length, + const UChar *srcChars, + int32_t srcStart, + int32_t srcLength) const +{ return doCompareCodePointOrder(start, _length, srcChars, srcStart, srcLength); } + +inline int8_t +UnicodeString::compareCodePointOrderBetween(int32_t start, + int32_t limit, + const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLimit) const +{ return doCompareCodePointOrder(start, limit - start, + srcText, srcStart, srcLimit - srcStart); } + +inline int8_t +UnicodeString::doCaseCompare(int32_t start, + int32_t thisLength, + const UnicodeString &srcText, + int32_t srcStart, + int32_t srcLength, + uint32_t options) const +{ + if(srcText.isBogus()) { + return (int8_t)!isBogus(); // 0 if both are bogus, 1 otherwise + } else { + srcText.pinIndices(srcStart, srcLength); + return doCaseCompare(start, thisLength, srcText.getArrayStart(), srcStart, srcLength, options); + } +} + +inline int8_t +UnicodeString::caseCompare(const UnicodeString &text, uint32_t options) const { + return doCaseCompare(0, length(), text, 0, text.length(), options); +} + +inline int8_t +UnicodeString::caseCompare(int32_t start, + int32_t _length, + const UnicodeString &srcText, + uint32_t options) const { + return doCaseCompare(start, _length, srcText, 0, srcText.length(), options); +} + +inline int8_t +UnicodeString::caseCompare(const UChar *srcChars, + int32_t srcLength, + uint32_t options) const { + return doCaseCompare(0, length(), srcChars, 0, srcLength, options); +} + +inline int8_t +UnicodeString::caseCompare(int32_t start, + int32_t _length, + const UnicodeString &srcText, + int32_t srcStart, + int32_t srcLength, + uint32_t options) const { + return doCaseCompare(start, _length, srcText, srcStart, srcLength, options); +} + +inline int8_t +UnicodeString::caseCompare(int32_t start, + int32_t _length, + const UChar *srcChars, + uint32_t options) const { + return doCaseCompare(start, _length, srcChars, 0, _length, options); +} + +inline int8_t +UnicodeString::caseCompare(int32_t start, + int32_t _length, + const UChar *srcChars, + int32_t srcStart, + int32_t srcLength, + uint32_t options) const { + return doCaseCompare(start, _length, srcChars, srcStart, srcLength, options); +} + +inline int8_t +UnicodeString::caseCompareBetween(int32_t start, + int32_t limit, + const UnicodeString &srcText, + int32_t srcStart, + int32_t srcLimit, + uint32_t options) const { + return doCaseCompare(start, limit - start, srcText, srcStart, srcLimit - srcStart, options); +} + +inline int32_t +UnicodeString::indexOf(const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLength, + int32_t start, + int32_t _length) const +{ + if(!srcText.isBogus()) { + srcText.pinIndices(srcStart, srcLength); + if(srcLength > 0) { + return indexOf(srcText.getArrayStart(), srcStart, srcLength, start, _length); + } + } + return -1; +} + +inline int32_t +UnicodeString::indexOf(const UnicodeString& text) const +{ return indexOf(text, 0, text.length(), 0, length()); } + +inline int32_t +UnicodeString::indexOf(const UnicodeString& text, + int32_t start) const { + pinIndex(start); + return indexOf(text, 0, text.length(), start, length() - start); +} + +inline int32_t +UnicodeString::indexOf(const UnicodeString& text, + int32_t start, + int32_t _length) const +{ return indexOf(text, 0, text.length(), start, _length); } + +inline int32_t +UnicodeString::indexOf(const UChar *srcChars, + int32_t srcLength, + int32_t start) const { + pinIndex(start); + return indexOf(srcChars, 0, srcLength, start, length() - start); +} + +inline int32_t +UnicodeString::indexOf(const UChar *srcChars, + int32_t srcLength, + int32_t start, + int32_t _length) const +{ return indexOf(srcChars, 0, srcLength, start, _length); } + +inline int32_t +UnicodeString::indexOf(UChar c, + int32_t start, + int32_t _length) const +{ return doIndexOf(c, start, _length); } + +inline int32_t +UnicodeString::indexOf(UChar32 c, + int32_t start, + int32_t _length) const +{ return doIndexOf(c, start, _length); } + +inline int32_t +UnicodeString::indexOf(UChar c) const +{ return doIndexOf(c, 0, length()); } + +inline int32_t +UnicodeString::indexOf(UChar32 c) const +{ return indexOf(c, 0, length()); } + +inline int32_t +UnicodeString::indexOf(UChar c, + int32_t start) const { + pinIndex(start); + return doIndexOf(c, start, length() - start); +} + +inline int32_t +UnicodeString::indexOf(UChar32 c, + int32_t start) const { + pinIndex(start); + return indexOf(c, start, length() - start); +} + +inline int32_t +UnicodeString::lastIndexOf(const UChar *srcChars, + int32_t srcLength, + int32_t start, + int32_t _length) const +{ return lastIndexOf(srcChars, 0, srcLength, start, _length); } + +inline int32_t +UnicodeString::lastIndexOf(const UChar *srcChars, + int32_t srcLength, + int32_t start) const { + pinIndex(start); + return lastIndexOf(srcChars, 0, srcLength, start, length() - start); +} + +inline int32_t +UnicodeString::lastIndexOf(const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLength, + int32_t start, + int32_t _length) const +{ + if(!srcText.isBogus()) { + srcText.pinIndices(srcStart, srcLength); + if(srcLength > 0) { + return lastIndexOf(srcText.getArrayStart(), srcStart, srcLength, start, _length); + } + } + return -1; +} + +inline int32_t +UnicodeString::lastIndexOf(const UnicodeString& text, + int32_t start, + int32_t _length) const +{ return lastIndexOf(text, 0, text.length(), start, _length); } + +inline int32_t +UnicodeString::lastIndexOf(const UnicodeString& text, + int32_t start) const { + pinIndex(start); + return lastIndexOf(text, 0, text.length(), start, length() - start); +} + +inline int32_t +UnicodeString::lastIndexOf(const UnicodeString& text) const +{ return lastIndexOf(text, 0, text.length(), 0, length()); } + +inline int32_t +UnicodeString::lastIndexOf(UChar c, + int32_t start, + int32_t _length) const +{ return doLastIndexOf(c, start, _length); } + +inline int32_t +UnicodeString::lastIndexOf(UChar32 c, + int32_t start, + int32_t _length) const { + return doLastIndexOf(c, start, _length); +} + +inline int32_t +UnicodeString::lastIndexOf(UChar c) const +{ return doLastIndexOf(c, 0, length()); } + +inline int32_t +UnicodeString::lastIndexOf(UChar32 c) const { + return lastIndexOf(c, 0, length()); +} + +inline int32_t +UnicodeString::lastIndexOf(UChar c, + int32_t start) const { + pinIndex(start); + return doLastIndexOf(c, start, length() - start); +} + +inline int32_t +UnicodeString::lastIndexOf(UChar32 c, + int32_t start) const { + pinIndex(start); + return lastIndexOf(c, start, length() - start); +} + +inline UBool +UnicodeString::startsWith(const UnicodeString& text) const +{ return compare(0, text.length(), text, 0, text.length()) == 0; } + +inline UBool +UnicodeString::startsWith(const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLength) const +{ return doCompare(0, srcLength, srcText, srcStart, srcLength) == 0; } + +inline UBool +UnicodeString::startsWith(const UChar *srcChars, int32_t srcLength) const { + if(srcLength < 0) { + srcLength = u_strlen(srcChars); + } + return doCompare(0, srcLength, srcChars, 0, srcLength) == 0; +} + +inline UBool +UnicodeString::startsWith(const UChar *srcChars, int32_t srcStart, int32_t srcLength) const { + if(srcLength < 0) { + srcLength = u_strlen(srcChars); + } + return doCompare(0, srcLength, srcChars, srcStart, srcLength) == 0; +} + +inline UBool +UnicodeString::endsWith(const UnicodeString& text) const +{ return doCompare(length() - text.length(), text.length(), + text, 0, text.length()) == 0; } + +inline UBool +UnicodeString::endsWith(const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLength) const { + srcText.pinIndices(srcStart, srcLength); + return doCompare(length() - srcLength, srcLength, + srcText, srcStart, srcLength) == 0; +} + +inline UBool +UnicodeString::endsWith(const UChar *srcChars, + int32_t srcLength) const { + if(srcLength < 0) { + srcLength = u_strlen(srcChars); + } + return doCompare(length() - srcLength, srcLength, + srcChars, 0, srcLength) == 0; +} + +inline UBool +UnicodeString::endsWith(const UChar *srcChars, + int32_t srcStart, + int32_t srcLength) const { + if(srcLength < 0) { + srcLength = u_strlen(srcChars + srcStart); + } + return doCompare(length() - srcLength, srcLength, + srcChars, srcStart, srcLength) == 0; +} + +//======================================== +// replace +//======================================== +inline UnicodeString& +UnicodeString::replace(int32_t start, + int32_t _length, + const UnicodeString& srcText) +{ return doReplace(start, _length, srcText, 0, srcText.length()); } + +inline UnicodeString& +UnicodeString::replace(int32_t start, + int32_t _length, + const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLength) +{ return doReplace(start, _length, srcText, srcStart, srcLength); } + +inline UnicodeString& +UnicodeString::replace(int32_t start, + int32_t _length, + const UChar *srcChars, + int32_t srcLength) +{ return doReplace(start, _length, srcChars, 0, srcLength); } + +inline UnicodeString& +UnicodeString::replace(int32_t start, + int32_t _length, + const UChar *srcChars, + int32_t srcStart, + int32_t srcLength) +{ return doReplace(start, _length, srcChars, srcStart, srcLength); } + +inline UnicodeString& +UnicodeString::replace(int32_t start, + int32_t _length, + UChar srcChar) +{ return doReplace(start, _length, &srcChar, 0, 1); } + +inline UnicodeString& +UnicodeString::replaceBetween(int32_t start, + int32_t limit, + const UnicodeString& srcText) +{ return doReplace(start, limit - start, srcText, 0, srcText.length()); } + +inline UnicodeString& +UnicodeString::replaceBetween(int32_t start, + int32_t limit, + const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLimit) +{ return doReplace(start, limit - start, srcText, srcStart, srcLimit - srcStart); } + +inline UnicodeString& +UnicodeString::findAndReplace(const UnicodeString& oldText, + const UnicodeString& newText) +{ return findAndReplace(0, length(), oldText, 0, oldText.length(), + newText, 0, newText.length()); } + +inline UnicodeString& +UnicodeString::findAndReplace(int32_t start, + int32_t _length, + const UnicodeString& oldText, + const UnicodeString& newText) +{ return findAndReplace(start, _length, oldText, 0, oldText.length(), + newText, 0, newText.length()); } + +// ============================ +// extract +// ============================ +inline void +UnicodeString::doExtract(int32_t start, + int32_t _length, + UnicodeString& target) const +{ target.replace(0, target.length(), *this, start, _length); } + +inline void +UnicodeString::extract(int32_t start, + int32_t _length, + UChar *target, + int32_t targetStart) const +{ doExtract(start, _length, target, targetStart); } + +inline void +UnicodeString::extract(int32_t start, + int32_t _length, + UnicodeString& target) const +{ doExtract(start, _length, target); } + +#if !UCONFIG_NO_CONVERSION + +inline int32_t +UnicodeString::extract(int32_t start, + int32_t _length, + char *dst, + const char *codepage) const + +{ + // This dstSize value will be checked explicitly + return extract(start, _length, dst, dst!=0 ? 0xffffffff : 0, codepage); +} + +#endif + +inline void +UnicodeString::extractBetween(int32_t start, + int32_t limit, + UChar *dst, + int32_t dstStart) const { + pinIndex(start); + pinIndex(limit); + doExtract(start, limit - start, dst, dstStart); +} + +inline UnicodeString +UnicodeString::tempSubStringBetween(int32_t start, int32_t limit) const { + return tempSubString(start, limit - start); +} + +inline UChar +UnicodeString::doCharAt(int32_t offset) const +{ + if((uint32_t)offset < (uint32_t)length()) { + return getArrayStart()[offset]; + } else { + return kInvalidUChar; + } +} + +inline UChar +UnicodeString::charAt(int32_t offset) const +{ return doCharAt(offset); } + +inline UChar +UnicodeString::operator[] (int32_t offset) const +{ return doCharAt(offset); } + +inline UBool +UnicodeString::isEmpty() const { + return fShortLength == 0; +} + +//======================================== +// Write implementation methods +//======================================== +inline void +UnicodeString::setLength(int32_t len) { + if(len <= 127) { + fShortLength = (int8_t)len; + } else { + fShortLength = (int8_t)-1; + fUnion.fFields.fLength = len; + } +} + +inline void +UnicodeString::setToEmpty() { + fShortLength = 0; + fFlags = kShortString; +} + +inline void +UnicodeString::setArray(UChar *array, int32_t len, int32_t capacity) { + setLength(len); + fUnion.fFields.fArray = array; + fUnion.fFields.fCapacity = capacity; +} + +inline UnicodeString& +UnicodeString::operator= (UChar ch) +{ return doReplace(0, length(), &ch, 0, 1); } + +inline UnicodeString& +UnicodeString::operator= (UChar32 ch) +{ return replace(0, length(), ch); } + +inline UnicodeString& +UnicodeString::setTo(const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLength) +{ + unBogus(); + return doReplace(0, length(), srcText, srcStart, srcLength); +} + +inline UnicodeString& +UnicodeString::setTo(const UnicodeString& srcText, + int32_t srcStart) +{ + unBogus(); + srcText.pinIndex(srcStart); + return doReplace(0, length(), srcText, srcStart, srcText.length() - srcStart); +} + +inline UnicodeString& +UnicodeString::setTo(const UnicodeString& srcText) +{ + return copyFrom(srcText); +} + +inline UnicodeString& +UnicodeString::setTo(const UChar *srcChars, + int32_t srcLength) +{ + unBogus(); + return doReplace(0, length(), srcChars, 0, srcLength); +} + +inline UnicodeString& +UnicodeString::setTo(UChar srcChar) +{ + unBogus(); + return doReplace(0, length(), &srcChar, 0, 1); +} + +inline UnicodeString& +UnicodeString::setTo(UChar32 srcChar) +{ + unBogus(); + return replace(0, length(), srcChar); +} + +inline UnicodeString& +UnicodeString::append(const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLength) +{ return doReplace(length(), 0, srcText, srcStart, srcLength); } + +inline UnicodeString& +UnicodeString::append(const UnicodeString& srcText) +{ return doReplace(length(), 0, srcText, 0, srcText.length()); } + +inline UnicodeString& +UnicodeString::append(const UChar *srcChars, + int32_t srcStart, + int32_t srcLength) +{ return doReplace(length(), 0, srcChars, srcStart, srcLength); } + +inline UnicodeString& +UnicodeString::append(const UChar *srcChars, + int32_t srcLength) +{ return doReplace(length(), 0, srcChars, 0, srcLength); } + +inline UnicodeString& +UnicodeString::append(UChar srcChar) +{ return doReplace(length(), 0, &srcChar, 0, 1); } + +inline UnicodeString& +UnicodeString::operator+= (UChar ch) +{ return doReplace(length(), 0, &ch, 0, 1); } + +inline UnicodeString& +UnicodeString::operator+= (UChar32 ch) { + return append(ch); +} + +inline UnicodeString& +UnicodeString::operator+= (const UnicodeString& srcText) +{ return doReplace(length(), 0, srcText, 0, srcText.length()); } + +inline UnicodeString& +UnicodeString::insert(int32_t start, + const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLength) +{ return doReplace(start, 0, srcText, srcStart, srcLength); } + +inline UnicodeString& +UnicodeString::insert(int32_t start, + const UnicodeString& srcText) +{ return doReplace(start, 0, srcText, 0, srcText.length()); } + +inline UnicodeString& +UnicodeString::insert(int32_t start, + const UChar *srcChars, + int32_t srcStart, + int32_t srcLength) +{ return doReplace(start, 0, srcChars, srcStart, srcLength); } + +inline UnicodeString& +UnicodeString::insert(int32_t start, + const UChar *srcChars, + int32_t srcLength) +{ return doReplace(start, 0, srcChars, 0, srcLength); } + +inline UnicodeString& +UnicodeString::insert(int32_t start, + UChar srcChar) +{ return doReplace(start, 0, &srcChar, 0, 1); } + +inline UnicodeString& +UnicodeString::insert(int32_t start, + UChar32 srcChar) +{ return replace(start, 0, srcChar); } + + +inline UnicodeString& +UnicodeString::remove() +{ + // remove() of a bogus string makes the string empty and non-bogus + if(isBogus()) { + setToEmpty(); + } else { + fShortLength = 0; + } + return *this; +} + +inline UnicodeString& +UnicodeString::remove(int32_t start, + int32_t _length) +{ + if(start <= 0 && _length == INT32_MAX) { + // remove(guaranteed everything) of a bogus string makes the string empty and non-bogus + return remove(); + } + return doReplace(start, _length, NULL, 0, 0); +} + +inline UnicodeString& +UnicodeString::removeBetween(int32_t start, + int32_t limit) +{ return doReplace(start, limit - start, NULL, 0, 0); } + +inline UnicodeString & +UnicodeString::retainBetween(int32_t start, int32_t limit) { + truncate(limit); + return doReplace(0, start, NULL, 0, 0); +} + +inline UBool +UnicodeString::truncate(int32_t targetLength) +{ + if(isBogus() && targetLength == 0) { + // truncate(0) of a bogus string makes the string empty and non-bogus + unBogus(); + return FALSE; + } else if((uint32_t)targetLength < (uint32_t)length()) { + setLength(targetLength); + return TRUE; + } else { + return FALSE; + } +} + +inline UnicodeString& +UnicodeString::reverse() +{ return doReverse(0, length()); } + +inline UnicodeString& +UnicodeString::reverse(int32_t start, + int32_t _length) +{ return doReverse(start, _length); } + +U_NAMESPACE_END + +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/uobject.h b/src/core/basetypes/icu-ucsdet/include/unicode/uobject.h new file mode 100644 index 00000000..54ceace6 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/uobject.h @@ -0,0 +1,320 @@ +/* +****************************************************************************** +* +* Copyright (C) 2002-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* file name: uobject.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2002jun26 +* created by: Markus W. Scherer +*/ + +#ifndef __UOBJECT_H__ +#define __UOBJECT_H__ + +#include "unicode/utypes.h" + +/** + * \file + * \brief C++ API: Common ICU base class UObject. + */ + +/** + * @{ + * \def U_NO_THROW + * Define this to define the throw() specification so + * certain functions do not throw any exceptions + * + * UMemory operator new methods should have the throw() specification + * appended to them, so that the compiler adds the additional NULL check + * before calling constructors. Without, if <code>operator new</code> returns NULL the + * constructor is still called, and if the constructor references member + * data, (which it typically does), the result is a segmentation violation. + * + * @stable ICU 4.2 + */ +#ifndef U_NO_THROW +#define U_NO_THROW throw() +#endif + +/** @} */ + +/*===========================================================================*/ +/* UClassID-based RTTI */ +/*===========================================================================*/ + +/** + * UClassID is used to identify classes without using the compiler's RTTI. + * This was used before C++ compilers consistently supported RTTI. + * ICU 4.6 requires compiler RTTI to be turned on. + * + * Each class hierarchy which needs + * to implement polymorphic clone() or operator==() defines two methods, + * described in detail below. UClassID values can be compared using + * operator==(). Nothing else should be done with them. + * + * \par + * In class hierarchies that implement "poor man's RTTI", + * each concrete subclass implements getDynamicClassID() in the same way: + * + * \code + * class Derived { + * public: + * virtual UClassID getDynamicClassID() const + * { return Derived::getStaticClassID(); } + * } + * \endcode + * + * Each concrete class implements getStaticClassID() as well, which allows + * clients to test for a specific type. + * + * \code + * class Derived { + * public: + * static UClassID U_EXPORT2 getStaticClassID(); + * private: + * static char fgClassID; + * } + * + * // In Derived.cpp: + * UClassID Derived::getStaticClassID() + * { return (UClassID)&Derived::fgClassID; } + * char Derived::fgClassID = 0; // Value is irrelevant + * \endcode + * @stable ICU 2.0 + */ +typedef void* UClassID; + +U_NAMESPACE_BEGIN + +/** + * UMemory is the common ICU base class. + * All other ICU C++ classes are derived from UMemory (starting with ICU 2.4). + * + * This is primarily to make it possible and simple to override the + * C++ memory management by adding new/delete operators to this base class. + * + * To override ALL ICU memory management, including that from plain C code, + * replace the allocation functions declared in cmemory.h + * + * UMemory does not contain any virtual functions. + * Common "boilerplate" functions are defined in UObject. + * + * @stable ICU 2.4 + */ +class U_COMMON_API UMemory { +public: + +/* test versions for debugging shaper heap memory problems */ +#ifdef SHAPER_MEMORY_DEBUG + static void * NewArray(int size, int count); + static void * GrowArray(void * array, int newSize ); + static void FreeArray(void * array ); +#endif + +#if U_OVERRIDE_CXX_ALLOCATION + /** + * Override for ICU4C C++ memory management. + * simple, non-class types are allocated using the macros in common/cmemory.h + * (uprv_malloc(), uprv_free(), uprv_realloc()); + * they or something else could be used here to implement C++ new/delete + * for ICU4C C++ classes + * @stable ICU 2.4 + */ + static void * U_EXPORT2 operator new(size_t size) U_NO_THROW; + + /** + * Override for ICU4C C++ memory management. + * See new(). + * @stable ICU 2.4 + */ + static void * U_EXPORT2 operator new[](size_t size) U_NO_THROW; + + /** + * Override for ICU4C C++ memory management. + * simple, non-class types are allocated using the macros in common/cmemory.h + * (uprv_malloc(), uprv_free(), uprv_realloc()); + * they or something else could be used here to implement C++ new/delete + * for ICU4C C++ classes + * @stable ICU 2.4 + */ + static void U_EXPORT2 operator delete(void *p) U_NO_THROW; + + /** + * Override for ICU4C C++ memory management. + * See delete(). + * @stable ICU 2.4 + */ + static void U_EXPORT2 operator delete[](void *p) U_NO_THROW; + +#if U_HAVE_PLACEMENT_NEW + /** + * Override for ICU4C C++ memory management for STL. + * See new(). + * @stable ICU 2.6 + */ + static inline void * U_EXPORT2 operator new(size_t, void *ptr) U_NO_THROW { return ptr; } + + /** + * Override for ICU4C C++ memory management for STL. + * See delete(). + * @stable ICU 2.6 + */ + static inline void U_EXPORT2 operator delete(void *, void *) U_NO_THROW {} +#endif /* U_HAVE_PLACEMENT_NEW */ +#if U_HAVE_DEBUG_LOCATION_NEW + /** + * This method overrides the MFC debug version of the operator new + * + * @param size The requested memory size + * @param file The file where the allocation was requested + * @param line The line where the allocation was requested + */ + static void * U_EXPORT2 operator new(size_t size, const char* file, int line) U_NO_THROW; + /** + * This method provides a matching delete for the MFC debug new + * + * @param p The pointer to the allocated memory + * @param file The file where the allocation was requested + * @param line The line where the allocation was requested + */ + static void U_EXPORT2 operator delete(void* p, const char* file, int line) U_NO_THROW; +#endif /* U_HAVE_DEBUG_LOCATION_NEW */ +#endif /* U_OVERRIDE_CXX_ALLOCATION */ + + /* + * Assignment operator not declared. The compiler will provide one + * which does nothing since this class does not contain any data members. + * API/code coverage may show the assignment operator as present and + * untested - ignore. + * Subclasses need this assignment operator if they use compiler-provided + * assignment operators of their own. An alternative to not declaring one + * here would be to declare and empty-implement a protected or public one. + UMemory &UMemory::operator=(const UMemory &); + */ +}; + +/** + * UObject is the common ICU "boilerplate" class. + * UObject inherits UMemory (starting with ICU 2.4), + * and all other public ICU C++ classes + * are derived from UObject (starting with ICU 2.2). + * + * UObject contains common virtual functions, in particular a virtual destructor. + * + * The clone() function is not available in UObject because it is not + * implemented by all ICU classes. + * Many ICU services provide a clone() function for their class trees, + * defined on the service's C++ base class, and all subclasses within that + * service class tree return a pointer to the service base class + * (which itself is a subclass of UObject). + * This is because some compilers do not support covariant (same-as-this) + * return types; cast to the appropriate subclass if necessary. + * + * @stable ICU 2.2 + */ +class U_COMMON_API UObject : public UMemory { +public: + /** + * Destructor. + * + * @stable ICU 2.2 + */ + virtual ~UObject(); + + /** + * ICU4C "poor man's RTTI", returns a UClassID for the actual ICU class. + * The base class implementation returns a dummy value. + * + * Use compiler RTTI rather than ICU's "poor man's RTTI". + * Since ICU 4.6, new ICU C++ class hierarchies do not implement "poor man's RTTI". + * + * @stable ICU 2.2 + */ + virtual UClassID getDynamicClassID() const; + +protected: + // the following functions are protected to prevent instantiation and + // direct use of UObject itself + + // default constructor + // inline UObject() {} + + // copy constructor + // inline UObject(const UObject &other) {} + +#if 0 + // TODO Sometime in the future. Implement operator==(). + // (This comment inserted in 2.2) + // some or all of the following "boilerplate" functions may be made public + // in a future ICU4C release when all subclasses implement them + + // assignment operator + // (not virtual, see "Taligent's Guide to Designing Programs" pp.73..74) + // commented out because the implementation is the same as a compiler's default + // UObject &operator=(const UObject &other) { return *this; } + + // comparison operators + virtual inline UBool operator==(const UObject &other) const { return this==&other; } + inline UBool operator!=(const UObject &other) const { return !operator==(other); } + + // clone() commented out from the base class: + // some compilers do not support co-variant return types + // (i.e., subclasses would have to return UObject * as well, instead of SubClass *) + // see also UObject class documentation. + // virtual UObject *clone() const; +#endif + + /* + * Assignment operator not declared. The compiler will provide one + * which does nothing since this class does not contain any data members. + * API/code coverage may show the assignment operator as present and + * untested - ignore. + * Subclasses need this assignment operator if they use compiler-provided + * assignment operators of their own. An alternative to not declaring one + * here would be to declare and empty-implement a protected or public one. + UObject &UObject::operator=(const UObject &); + */ +}; + +#ifndef U_HIDE_INTERNAL_API +/** + * This is a simple macro to add ICU RTTI to an ICU object implementation. + * This does not go into the header. This should only be used in *.cpp files. + * + * @param myClass The name of the class that needs RTTI defined. + * @internal + */ +#define UOBJECT_DEFINE_RTTI_IMPLEMENTATION(myClass) \ + UClassID U_EXPORT2 myClass::getStaticClassID() { \ + static char classID = 0; \ + return (UClassID)&classID; \ + } \ + UClassID myClass::getDynamicClassID() const \ + { return myClass::getStaticClassID(); } + + +/** + * This macro adds ICU RTTI to an ICU abstract class implementation. + * This macro should be invoked in *.cpp files. The corresponding + * header should declare getStaticClassID. + * + * @param myClass The name of the class that needs RTTI defined. + * @internal + */ +#define UOBJECT_DEFINE_ABSTRACT_RTTI_IMPLEMENTATION(myClass) \ + UClassID U_EXPORT2 myClass::getStaticClassID() { \ + static char classID = 0; \ + return (UClassID)&classID; \ + } + +#endif /* U_HIDE_INTERNAL_API */ + +U_NAMESPACE_END + +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/urename.h b/src/core/basetypes/icu-ucsdet/include/unicode/urename.h new file mode 100644 index 00000000..a8172626 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/urename.h @@ -0,0 +1,1784 @@ +/* +******************************************************************************* +* Copyright (C) 2002-2014, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* +* file name: urename.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* Created by: Perl script tools/genren.pl written by Vladimir Weinstein +* +* Contains data for renaming ICU exports. +* Gets included by umachine.h +* +* THIS FILE IS MACHINE-GENERATED, DON'T PLAY WITH IT IF YOU DON'T KNOW WHAT +* YOU ARE DOING, OTHERWISE VERY BAD THINGS WILL HAPPEN! +*/ + +#ifndef URENAME_H +#define URENAME_H + +/* U_DISABLE_RENAMING can be defined in the following ways: + * - when running configure, e.g. + * runConfigureICU Linux --disable-renaming + * - by changing the default setting of U_DISABLE_RENAMING in uconfig.h + */ + +#include "unicode/uconfig.h" + +#if !U_DISABLE_RENAMING + +/* We need the U_ICU_ENTRY_POINT_RENAME definition. There's a default one in unicode/uvernum.h we can use, but we will give + the platform a chance to define it first. + Normally (if utypes.h or umachine.h was included first) this will not be necessary as it will already be defined. + */ + +#ifndef U_ICU_ENTRY_POINT_RENAME +#include "unicode/umachine.h" +#endif + +/* If we still don't have U_ICU_ENTRY_POINT_RENAME use the default. */ +#ifndef U_ICU_ENTRY_POINT_RENAME +#include "unicode/uvernum.h" +#endif + +/* Error out before the following defines cause very strange and unexpected code breakage */ +#ifndef U_ICU_ENTRY_POINT_RENAME +#error U_ICU_ENTRY_POINT_RENAME is not defined - cannot continue. Consider defining U_DISABLE_RENAMING if renaming should not be used. +#endif + + +/* C exports renaming data */ + +#define T_CString_int64ToString U_ICU_ENTRY_POINT_RENAME(T_CString_int64ToString) +#define T_CString_integerToString U_ICU_ENTRY_POINT_RENAME(T_CString_integerToString) +#define T_CString_stringToInteger U_ICU_ENTRY_POINT_RENAME(T_CString_stringToInteger) +#define T_CString_toLowerCase U_ICU_ENTRY_POINT_RENAME(T_CString_toLowerCase) +#define T_CString_toUpperCase U_ICU_ENTRY_POINT_RENAME(T_CString_toUpperCase) +#define UCNV_FROM_U_CALLBACK_ESCAPE U_ICU_ENTRY_POINT_RENAME(UCNV_FROM_U_CALLBACK_ESCAPE) +#define UCNV_FROM_U_CALLBACK_SKIP U_ICU_ENTRY_POINT_RENAME(UCNV_FROM_U_CALLBACK_SKIP) +#define UCNV_FROM_U_CALLBACK_STOP U_ICU_ENTRY_POINT_RENAME(UCNV_FROM_U_CALLBACK_STOP) +#define UCNV_FROM_U_CALLBACK_SUBSTITUTE U_ICU_ENTRY_POINT_RENAME(UCNV_FROM_U_CALLBACK_SUBSTITUTE) +#define UCNV_TO_U_CALLBACK_ESCAPE U_ICU_ENTRY_POINT_RENAME(UCNV_TO_U_CALLBACK_ESCAPE) +#define UCNV_TO_U_CALLBACK_SKIP U_ICU_ENTRY_POINT_RENAME(UCNV_TO_U_CALLBACK_SKIP) +#define UCNV_TO_U_CALLBACK_STOP U_ICU_ENTRY_POINT_RENAME(UCNV_TO_U_CALLBACK_STOP) +#define UCNV_TO_U_CALLBACK_SUBSTITUTE U_ICU_ENTRY_POINT_RENAME(UCNV_TO_U_CALLBACK_SUBSTITUTE) +#define UDataMemory_createNewInstance U_ICU_ENTRY_POINT_RENAME(UDataMemory_createNewInstance) +#define UDataMemory_init U_ICU_ENTRY_POINT_RENAME(UDataMemory_init) +#define UDataMemory_isLoaded U_ICU_ENTRY_POINT_RENAME(UDataMemory_isLoaded) +#define UDataMemory_normalizeDataPointer U_ICU_ENTRY_POINT_RENAME(UDataMemory_normalizeDataPointer) +#define UDataMemory_setData U_ICU_ENTRY_POINT_RENAME(UDataMemory_setData) +#define UDatamemory_assign U_ICU_ENTRY_POINT_RENAME(UDatamemory_assign) +#define _ASCIIData U_ICU_ENTRY_POINT_RENAME(_ASCIIData) +#define _Bocu1Data U_ICU_ENTRY_POINT_RENAME(_Bocu1Data) +#define _CESU8Data U_ICU_ENTRY_POINT_RENAME(_CESU8Data) +#define _CompoundTextData U_ICU_ENTRY_POINT_RENAME(_CompoundTextData) +#define _HZData U_ICU_ENTRY_POINT_RENAME(_HZData) +#define _IMAPData U_ICU_ENTRY_POINT_RENAME(_IMAPData) +#define _ISCIIData U_ICU_ENTRY_POINT_RENAME(_ISCIIData) +#define _ISO2022Data U_ICU_ENTRY_POINT_RENAME(_ISO2022Data) +#define _LMBCSData1 U_ICU_ENTRY_POINT_RENAME(_LMBCSData1) +#define _LMBCSData11 U_ICU_ENTRY_POINT_RENAME(_LMBCSData11) +#define _LMBCSData16 U_ICU_ENTRY_POINT_RENAME(_LMBCSData16) +#define _LMBCSData17 U_ICU_ENTRY_POINT_RENAME(_LMBCSData17) +#define _LMBCSData18 U_ICU_ENTRY_POINT_RENAME(_LMBCSData18) +#define _LMBCSData19 U_ICU_ENTRY_POINT_RENAME(_LMBCSData19) +#define _LMBCSData2 U_ICU_ENTRY_POINT_RENAME(_LMBCSData2) +#define _LMBCSData3 U_ICU_ENTRY_POINT_RENAME(_LMBCSData3) +#define _LMBCSData4 U_ICU_ENTRY_POINT_RENAME(_LMBCSData4) +#define _LMBCSData5 U_ICU_ENTRY_POINT_RENAME(_LMBCSData5) +#define _LMBCSData6 U_ICU_ENTRY_POINT_RENAME(_LMBCSData6) +#define _LMBCSData8 U_ICU_ENTRY_POINT_RENAME(_LMBCSData8) +#define _Latin1Data U_ICU_ENTRY_POINT_RENAME(_Latin1Data) +#define _MBCSData U_ICU_ENTRY_POINT_RENAME(_MBCSData) +#define _SCSUData U_ICU_ENTRY_POINT_RENAME(_SCSUData) +#define _UTF16BEData U_ICU_ENTRY_POINT_RENAME(_UTF16BEData) +#define _UTF16Data U_ICU_ENTRY_POINT_RENAME(_UTF16Data) +#define _UTF16LEData U_ICU_ENTRY_POINT_RENAME(_UTF16LEData) +#define _UTF32BEData U_ICU_ENTRY_POINT_RENAME(_UTF32BEData) +#define _UTF32Data U_ICU_ENTRY_POINT_RENAME(_UTF32Data) +#define _UTF32LEData U_ICU_ENTRY_POINT_RENAME(_UTF32LEData) +#define _UTF7Data U_ICU_ENTRY_POINT_RENAME(_UTF7Data) +#define _UTF8Data U_ICU_ENTRY_POINT_RENAME(_UTF8Data) +#define cmemory_cleanup U_ICU_ENTRY_POINT_RENAME(cmemory_cleanup) +#define cmemory_inUse U_ICU_ENTRY_POINT_RENAME(cmemory_inUse) +#define gTimeZoneFilesInitOnce U_ICU_ENTRY_POINT_RENAME(gTimeZoneFilesInitOnce) +#define izrule_clone U_ICU_ENTRY_POINT_RENAME(izrule_clone) +#define izrule_close U_ICU_ENTRY_POINT_RENAME(izrule_close) +#define izrule_equals U_ICU_ENTRY_POINT_RENAME(izrule_equals) +#define izrule_getDSTSavings U_ICU_ENTRY_POINT_RENAME(izrule_getDSTSavings) +#define izrule_getDynamicClassID U_ICU_ENTRY_POINT_RENAME(izrule_getDynamicClassID) +#define izrule_getFinalStart U_ICU_ENTRY_POINT_RENAME(izrule_getFinalStart) +#define izrule_getFirstStart U_ICU_ENTRY_POINT_RENAME(izrule_getFirstStart) +#define izrule_getName U_ICU_ENTRY_POINT_RENAME(izrule_getName) +#define izrule_getNextStart U_ICU_ENTRY_POINT_RENAME(izrule_getNextStart) +#define izrule_getPreviousStart U_ICU_ENTRY_POINT_RENAME(izrule_getPreviousStart) +#define izrule_getRawOffset U_ICU_ENTRY_POINT_RENAME(izrule_getRawOffset) +#define izrule_getStaticClassID U_ICU_ENTRY_POINT_RENAME(izrule_getStaticClassID) +#define izrule_isEquivalentTo U_ICU_ENTRY_POINT_RENAME(izrule_isEquivalentTo) +#define izrule_open U_ICU_ENTRY_POINT_RENAME(izrule_open) +#define le_close U_ICU_ENTRY_POINT_RENAME(le_close) +#define le_create U_ICU_ENTRY_POINT_RENAME(le_create) +#define le_getCharIndices U_ICU_ENTRY_POINT_RENAME(le_getCharIndices) +#define le_getCharIndicesWithBase U_ICU_ENTRY_POINT_RENAME(le_getCharIndicesWithBase) +#define le_getGlyphCount U_ICU_ENTRY_POINT_RENAME(le_getGlyphCount) +#define le_getGlyphPosition U_ICU_ENTRY_POINT_RENAME(le_getGlyphPosition) +#define le_getGlyphPositions U_ICU_ENTRY_POINT_RENAME(le_getGlyphPositions) +#define le_getGlyphs U_ICU_ENTRY_POINT_RENAME(le_getGlyphs) +#define le_layoutChars U_ICU_ENTRY_POINT_RENAME(le_layoutChars) +#define le_reset U_ICU_ENTRY_POINT_RENAME(le_reset) +#define locale_getKeywords U_ICU_ENTRY_POINT_RENAME(locale_getKeywords) +#define locale_getKeywordsStart U_ICU_ENTRY_POINT_RENAME(locale_getKeywordsStart) +#define locale_get_default U_ICU_ENTRY_POINT_RENAME(locale_get_default) +#define locale_set_default U_ICU_ENTRY_POINT_RENAME(locale_set_default) +#define pl_addFontRun U_ICU_ENTRY_POINT_RENAME(pl_addFontRun) +#define pl_addLocaleRun U_ICU_ENTRY_POINT_RENAME(pl_addLocaleRun) +#define pl_addValueRun U_ICU_ENTRY_POINT_RENAME(pl_addValueRun) +#define pl_close U_ICU_ENTRY_POINT_RENAME(pl_close) +#define pl_closeFontRuns U_ICU_ENTRY_POINT_RENAME(pl_closeFontRuns) +#define pl_closeLine U_ICU_ENTRY_POINT_RENAME(pl_closeLine) +#define pl_closeLocaleRuns U_ICU_ENTRY_POINT_RENAME(pl_closeLocaleRuns) +#define pl_closeValueRuns U_ICU_ENTRY_POINT_RENAME(pl_closeValueRuns) +#define pl_countLineRuns U_ICU_ENTRY_POINT_RENAME(pl_countLineRuns) +#define pl_create U_ICU_ENTRY_POINT_RENAME(pl_create) +#define pl_getAscent U_ICU_ENTRY_POINT_RENAME(pl_getAscent) +#define pl_getDescent U_ICU_ENTRY_POINT_RENAME(pl_getDescent) +#define pl_getFontRunCount U_ICU_ENTRY_POINT_RENAME(pl_getFontRunCount) +#define pl_getFontRunFont U_ICU_ENTRY_POINT_RENAME(pl_getFontRunFont) +#define pl_getFontRunLastLimit U_ICU_ENTRY_POINT_RENAME(pl_getFontRunLastLimit) +#define pl_getFontRunLimit U_ICU_ENTRY_POINT_RENAME(pl_getFontRunLimit) +#define pl_getLeading U_ICU_ENTRY_POINT_RENAME(pl_getLeading) +#define pl_getLineAscent U_ICU_ENTRY_POINT_RENAME(pl_getLineAscent) +#define pl_getLineDescent U_ICU_ENTRY_POINT_RENAME(pl_getLineDescent) +#define pl_getLineLeading U_ICU_ENTRY_POINT_RENAME(pl_getLineLeading) +#define pl_getLineVisualRun U_ICU_ENTRY_POINT_RENAME(pl_getLineVisualRun) +#define pl_getLineWidth U_ICU_ENTRY_POINT_RENAME(pl_getLineWidth) +#define pl_getLocaleRunCount U_ICU_ENTRY_POINT_RENAME(pl_getLocaleRunCount) +#define pl_getLocaleRunLastLimit U_ICU_ENTRY_POINT_RENAME(pl_getLocaleRunLastLimit) +#define pl_getLocaleRunLimit U_ICU_ENTRY_POINT_RENAME(pl_getLocaleRunLimit) +#define pl_getLocaleRunLocale U_ICU_ENTRY_POINT_RENAME(pl_getLocaleRunLocale) +#define pl_getParagraphLevel U_ICU_ENTRY_POINT_RENAME(pl_getParagraphLevel) +#define pl_getTextDirection U_ICU_ENTRY_POINT_RENAME(pl_getTextDirection) +#define pl_getValueRunCount U_ICU_ENTRY_POINT_RENAME(pl_getValueRunCount) +#define pl_getValueRunLastLimit U_ICU_ENTRY_POINT_RENAME(pl_getValueRunLastLimit) +#define pl_getValueRunLimit U_ICU_ENTRY_POINT_RENAME(pl_getValueRunLimit) +#define pl_getValueRunValue U_ICU_ENTRY_POINT_RENAME(pl_getValueRunValue) +#define pl_getVisualRunAscent U_ICU_ENTRY_POINT_RENAME(pl_getVisualRunAscent) +#define pl_getVisualRunDescent U_ICU_ENTRY_POINT_RENAME(pl_getVisualRunDescent) +#define pl_getVisualRunDirection U_ICU_ENTRY_POINT_RENAME(pl_getVisualRunDirection) +#define pl_getVisualRunFont U_ICU_ENTRY_POINT_RENAME(pl_getVisualRunFont) +#define pl_getVisualRunGlyphCount U_ICU_ENTRY_POINT_RENAME(pl_getVisualRunGlyphCount) +#define pl_getVisualRunGlyphToCharMap U_ICU_ENTRY_POINT_RENAME(pl_getVisualRunGlyphToCharMap) +#define pl_getVisualRunGlyphs U_ICU_ENTRY_POINT_RENAME(pl_getVisualRunGlyphs) +#define pl_getVisualRunLeading U_ICU_ENTRY_POINT_RENAME(pl_getVisualRunLeading) +#define pl_getVisualRunPositions U_ICU_ENTRY_POINT_RENAME(pl_getVisualRunPositions) +#define pl_isComplex U_ICU_ENTRY_POINT_RENAME(pl_isComplex) +#define pl_nextLine U_ICU_ENTRY_POINT_RENAME(pl_nextLine) +#define pl_openEmptyFontRuns U_ICU_ENTRY_POINT_RENAME(pl_openEmptyFontRuns) +#define pl_openEmptyLocaleRuns U_ICU_ENTRY_POINT_RENAME(pl_openEmptyLocaleRuns) +#define pl_openEmptyValueRuns U_ICU_ENTRY_POINT_RENAME(pl_openEmptyValueRuns) +#define pl_openFontRuns U_ICU_ENTRY_POINT_RENAME(pl_openFontRuns) +#define pl_openLocaleRuns U_ICU_ENTRY_POINT_RENAME(pl_openLocaleRuns) +#define pl_openValueRuns U_ICU_ENTRY_POINT_RENAME(pl_openValueRuns) +#define pl_reflow U_ICU_ENTRY_POINT_RENAME(pl_reflow) +#define pl_resetFontRuns U_ICU_ENTRY_POINT_RENAME(pl_resetFontRuns) +#define pl_resetLocaleRuns U_ICU_ENTRY_POINT_RENAME(pl_resetLocaleRuns) +#define pl_resetValueRuns U_ICU_ENTRY_POINT_RENAME(pl_resetValueRuns) +#define res_countArrayItems U_ICU_ENTRY_POINT_RENAME(res_countArrayItems) +#define res_findResource U_ICU_ENTRY_POINT_RENAME(res_findResource) +#define res_getAlias U_ICU_ENTRY_POINT_RENAME(res_getAlias) +#define res_getArrayItem U_ICU_ENTRY_POINT_RENAME(res_getArrayItem) +#define res_getBinary U_ICU_ENTRY_POINT_RENAME(res_getBinary) +#define res_getIntVector U_ICU_ENTRY_POINT_RENAME(res_getIntVector) +#define res_getPublicType U_ICU_ENTRY_POINT_RENAME(res_getPublicType) +#define res_getResource U_ICU_ENTRY_POINT_RENAME(res_getResource) +#define res_getString U_ICU_ENTRY_POINT_RENAME(res_getString) +#define res_getTableItemByIndex U_ICU_ENTRY_POINT_RENAME(res_getTableItemByIndex) +#define res_getTableItemByKey U_ICU_ENTRY_POINT_RENAME(res_getTableItemByKey) +#define res_load U_ICU_ENTRY_POINT_RENAME(res_load) +#define res_read U_ICU_ENTRY_POINT_RENAME(res_read) +#define res_unload U_ICU_ENTRY_POINT_RENAME(res_unload) +#define u_UCharsToChars U_ICU_ENTRY_POINT_RENAME(u_UCharsToChars) +#define u_austrcpy U_ICU_ENTRY_POINT_RENAME(u_austrcpy) +#define u_austrncpy U_ICU_ENTRY_POINT_RENAME(u_austrncpy) +#define u_catclose U_ICU_ENTRY_POINT_RENAME(u_catclose) +#define u_catgets U_ICU_ENTRY_POINT_RENAME(u_catgets) +#define u_catopen U_ICU_ENTRY_POINT_RENAME(u_catopen) +#define u_charAge U_ICU_ENTRY_POINT_RENAME(u_charAge) +#define u_charDigitValue U_ICU_ENTRY_POINT_RENAME(u_charDigitValue) +#define u_charDirection U_ICU_ENTRY_POINT_RENAME(u_charDirection) +#define u_charFromName U_ICU_ENTRY_POINT_RENAME(u_charFromName) +#define u_charMirror U_ICU_ENTRY_POINT_RENAME(u_charMirror) +#define u_charName U_ICU_ENTRY_POINT_RENAME(u_charName) +#define u_charType U_ICU_ENTRY_POINT_RENAME(u_charType) +#define u_charsToUChars U_ICU_ENTRY_POINT_RENAME(u_charsToUChars) +#define u_cleanup U_ICU_ENTRY_POINT_RENAME(u_cleanup) +#define u_countChar32 U_ICU_ENTRY_POINT_RENAME(u_countChar32) +#define u_digit U_ICU_ENTRY_POINT_RENAME(u_digit) +#define u_enumCharNames U_ICU_ENTRY_POINT_RENAME(u_enumCharNames) +#define u_enumCharTypes U_ICU_ENTRY_POINT_RENAME(u_enumCharTypes) +#define u_errorName U_ICU_ENTRY_POINT_RENAME(u_errorName) +#define u_fadopt U_ICU_ENTRY_POINT_RENAME(u_fadopt) +#define u_fclose U_ICU_ENTRY_POINT_RENAME(u_fclose) +#define u_feof U_ICU_ENTRY_POINT_RENAME(u_feof) +#define u_fflush U_ICU_ENTRY_POINT_RENAME(u_fflush) +#define u_fgetConverter U_ICU_ENTRY_POINT_RENAME(u_fgetConverter) +#define u_fgetNumberFormat U_ICU_ENTRY_POINT_RENAME(u_fgetNumberFormat) +#define u_fgetc U_ICU_ENTRY_POINT_RENAME(u_fgetc) +#define u_fgetcodepage U_ICU_ENTRY_POINT_RENAME(u_fgetcodepage) +#define u_fgetcx U_ICU_ENTRY_POINT_RENAME(u_fgetcx) +#define u_fgetfile U_ICU_ENTRY_POINT_RENAME(u_fgetfile) +#define u_fgetlocale U_ICU_ENTRY_POINT_RENAME(u_fgetlocale) +#define u_fgets U_ICU_ENTRY_POINT_RENAME(u_fgets) +#define u_file_read U_ICU_ENTRY_POINT_RENAME(u_file_read) +#define u_file_write U_ICU_ENTRY_POINT_RENAME(u_file_write) +#define u_file_write_flush U_ICU_ENTRY_POINT_RENAME(u_file_write_flush) +#define u_finit U_ICU_ENTRY_POINT_RENAME(u_finit) +#define u_flushDefaultConverter U_ICU_ENTRY_POINT_RENAME(u_flushDefaultConverter) +#define u_foldCase U_ICU_ENTRY_POINT_RENAME(u_foldCase) +#define u_fopen U_ICU_ENTRY_POINT_RENAME(u_fopen) +#define u_fopen_u U_ICU_ENTRY_POINT_RENAME(u_fopen_u) +#define u_forDigit U_ICU_ENTRY_POINT_RENAME(u_forDigit) +#define u_formatMessage U_ICU_ENTRY_POINT_RENAME(u_formatMessage) +#define u_formatMessageWithError U_ICU_ENTRY_POINT_RENAME(u_formatMessageWithError) +#define u_fprintf U_ICU_ENTRY_POINT_RENAME(u_fprintf) +#define u_fprintf_u U_ICU_ENTRY_POINT_RENAME(u_fprintf_u) +#define u_fputc U_ICU_ENTRY_POINT_RENAME(u_fputc) +#define u_fputs U_ICU_ENTRY_POINT_RENAME(u_fputs) +#define u_frewind U_ICU_ENTRY_POINT_RENAME(u_frewind) +#define u_fscanf U_ICU_ENTRY_POINT_RENAME(u_fscanf) +#define u_fscanf_u U_ICU_ENTRY_POINT_RENAME(u_fscanf_u) +#define u_fsetcodepage U_ICU_ENTRY_POINT_RENAME(u_fsetcodepage) +#define u_fsetlocale U_ICU_ENTRY_POINT_RENAME(u_fsetlocale) +#define u_fsettransliterator U_ICU_ENTRY_POINT_RENAME(u_fsettransliterator) +#define u_fstropen U_ICU_ENTRY_POINT_RENAME(u_fstropen) +#define u_fungetc U_ICU_ENTRY_POINT_RENAME(u_fungetc) +#define u_getBidiPairedBracket U_ICU_ENTRY_POINT_RENAME(u_getBidiPairedBracket) +#define u_getCombiningClass U_ICU_ENTRY_POINT_RENAME(u_getCombiningClass) +#define u_getDataDirectory U_ICU_ENTRY_POINT_RENAME(u_getDataDirectory) +#define u_getDataVersion U_ICU_ENTRY_POINT_RENAME(u_getDataVersion) +#define u_getDefaultConverter U_ICU_ENTRY_POINT_RENAME(u_getDefaultConverter) +#define u_getFC_NFKC_Closure U_ICU_ENTRY_POINT_RENAME(u_getFC_NFKC_Closure) +#define u_getISOComment U_ICU_ENTRY_POINT_RENAME(u_getISOComment) +#define u_getIntPropertyMaxValue U_ICU_ENTRY_POINT_RENAME(u_getIntPropertyMaxValue) +#define u_getIntPropertyMinValue U_ICU_ENTRY_POINT_RENAME(u_getIntPropertyMinValue) +#define u_getIntPropertyValue U_ICU_ENTRY_POINT_RENAME(u_getIntPropertyValue) +#define u_getMainProperties U_ICU_ENTRY_POINT_RENAME(u_getMainProperties) +#define u_getNumericValue U_ICU_ENTRY_POINT_RENAME(u_getNumericValue) +#define u_getPropertyEnum U_ICU_ENTRY_POINT_RENAME(u_getPropertyEnum) +#define u_getPropertyName U_ICU_ENTRY_POINT_RENAME(u_getPropertyName) +#define u_getPropertyValueEnum U_ICU_ENTRY_POINT_RENAME(u_getPropertyValueEnum) +#define u_getPropertyValueName U_ICU_ENTRY_POINT_RENAME(u_getPropertyValueName) +#define u_getTimeZoneFilesDirectory U_ICU_ENTRY_POINT_RENAME(u_getTimeZoneFilesDirectory) +#define u_getUnicodeProperties U_ICU_ENTRY_POINT_RENAME(u_getUnicodeProperties) +#define u_getUnicodeVersion U_ICU_ENTRY_POINT_RENAME(u_getUnicodeVersion) +#define u_getVersion U_ICU_ENTRY_POINT_RENAME(u_getVersion) +#define u_get_stdout U_ICU_ENTRY_POINT_RENAME(u_get_stdout) +#define u_hasBinaryProperty U_ICU_ENTRY_POINT_RENAME(u_hasBinaryProperty) +#define u_init U_ICU_ENTRY_POINT_RENAME(u_init) +#define u_isIDIgnorable U_ICU_ENTRY_POINT_RENAME(u_isIDIgnorable) +#define u_isIDPart U_ICU_ENTRY_POINT_RENAME(u_isIDPart) +#define u_isIDStart U_ICU_ENTRY_POINT_RENAME(u_isIDStart) +#define u_isISOControl U_ICU_ENTRY_POINT_RENAME(u_isISOControl) +#define u_isJavaIDPart U_ICU_ENTRY_POINT_RENAME(u_isJavaIDPart) +#define u_isJavaIDStart U_ICU_ENTRY_POINT_RENAME(u_isJavaIDStart) +#define u_isJavaSpaceChar U_ICU_ENTRY_POINT_RENAME(u_isJavaSpaceChar) +#define u_isMirrored U_ICU_ENTRY_POINT_RENAME(u_isMirrored) +#define u_isUAlphabetic U_ICU_ENTRY_POINT_RENAME(u_isUAlphabetic) +#define u_isULowercase U_ICU_ENTRY_POINT_RENAME(u_isULowercase) +#define u_isUUppercase U_ICU_ENTRY_POINT_RENAME(u_isUUppercase) +#define u_isUWhiteSpace U_ICU_ENTRY_POINT_RENAME(u_isUWhiteSpace) +#define u_isWhitespace U_ICU_ENTRY_POINT_RENAME(u_isWhitespace) +#define u_isalnum U_ICU_ENTRY_POINT_RENAME(u_isalnum) +#define u_isalnumPOSIX U_ICU_ENTRY_POINT_RENAME(u_isalnumPOSIX) +#define u_isalpha U_ICU_ENTRY_POINT_RENAME(u_isalpha) +#define u_isbase U_ICU_ENTRY_POINT_RENAME(u_isbase) +#define u_isblank U_ICU_ENTRY_POINT_RENAME(u_isblank) +#define u_iscntrl U_ICU_ENTRY_POINT_RENAME(u_iscntrl) +#define u_isdefined U_ICU_ENTRY_POINT_RENAME(u_isdefined) +#define u_isdigit U_ICU_ENTRY_POINT_RENAME(u_isdigit) +#define u_isgraph U_ICU_ENTRY_POINT_RENAME(u_isgraph) +#define u_isgraphPOSIX U_ICU_ENTRY_POINT_RENAME(u_isgraphPOSIX) +#define u_islower U_ICU_ENTRY_POINT_RENAME(u_islower) +#define u_isprint U_ICU_ENTRY_POINT_RENAME(u_isprint) +#define u_isprintPOSIX U_ICU_ENTRY_POINT_RENAME(u_isprintPOSIX) +#define u_ispunct U_ICU_ENTRY_POINT_RENAME(u_ispunct) +#define u_isspace U_ICU_ENTRY_POINT_RENAME(u_isspace) +#define u_istitle U_ICU_ENTRY_POINT_RENAME(u_istitle) +#define u_isupper U_ICU_ENTRY_POINT_RENAME(u_isupper) +#define u_isxdigit U_ICU_ENTRY_POINT_RENAME(u_isxdigit) +#define u_locbund_close U_ICU_ENTRY_POINT_RENAME(u_locbund_close) +#define u_locbund_getNumberFormat U_ICU_ENTRY_POINT_RENAME(u_locbund_getNumberFormat) +#define u_locbund_init U_ICU_ENTRY_POINT_RENAME(u_locbund_init) +#define u_memcasecmp U_ICU_ENTRY_POINT_RENAME(u_memcasecmp) +#define u_memchr U_ICU_ENTRY_POINT_RENAME(u_memchr) +#define u_memchr32 U_ICU_ENTRY_POINT_RENAME(u_memchr32) +#define u_memcmp U_ICU_ENTRY_POINT_RENAME(u_memcmp) +#define u_memcmpCodePointOrder U_ICU_ENTRY_POINT_RENAME(u_memcmpCodePointOrder) +#define u_memcpy U_ICU_ENTRY_POINT_RENAME(u_memcpy) +#define u_memmove U_ICU_ENTRY_POINT_RENAME(u_memmove) +#define u_memrchr U_ICU_ENTRY_POINT_RENAME(u_memrchr) +#define u_memrchr32 U_ICU_ENTRY_POINT_RENAME(u_memrchr32) +#define u_memset U_ICU_ENTRY_POINT_RENAME(u_memset) +#define u_parseMessage U_ICU_ENTRY_POINT_RENAME(u_parseMessage) +#define u_parseMessageWithError U_ICU_ENTRY_POINT_RENAME(u_parseMessageWithError) +#define u_printf U_ICU_ENTRY_POINT_RENAME(u_printf) +#define u_printf_parse U_ICU_ENTRY_POINT_RENAME(u_printf_parse) +#define u_printf_u U_ICU_ENTRY_POINT_RENAME(u_printf_u) +#define u_releaseDefaultConverter U_ICU_ENTRY_POINT_RENAME(u_releaseDefaultConverter) +#define u_scanf_parse U_ICU_ENTRY_POINT_RENAME(u_scanf_parse) +#define u_setAtomicIncDecFunctions U_ICU_ENTRY_POINT_RENAME(u_setAtomicIncDecFunctions) +#define u_setDataDirectory U_ICU_ENTRY_POINT_RENAME(u_setDataDirectory) +#define u_setMemoryFunctions U_ICU_ENTRY_POINT_RENAME(u_setMemoryFunctions) +#define u_setMutexFunctions U_ICU_ENTRY_POINT_RENAME(u_setMutexFunctions) +#define u_setTimeZoneFilesDirectory U_ICU_ENTRY_POINT_RENAME(u_setTimeZoneFilesDirectory) +#define u_shapeArabic U_ICU_ENTRY_POINT_RENAME(u_shapeArabic) +#define u_snprintf U_ICU_ENTRY_POINT_RENAME(u_snprintf) +#define u_snprintf_u U_ICU_ENTRY_POINT_RENAME(u_snprintf_u) +#define u_sprintf U_ICU_ENTRY_POINT_RENAME(u_sprintf) +#define u_sprintf_u U_ICU_ENTRY_POINT_RENAME(u_sprintf_u) +#define u_sscanf U_ICU_ENTRY_POINT_RENAME(u_sscanf) +#define u_sscanf_u U_ICU_ENTRY_POINT_RENAME(u_sscanf_u) +#define u_strCaseCompare U_ICU_ENTRY_POINT_RENAME(u_strCaseCompare) +#define u_strCompare U_ICU_ENTRY_POINT_RENAME(u_strCompare) +#define u_strCompareIter U_ICU_ENTRY_POINT_RENAME(u_strCompareIter) +#define u_strFindFirst U_ICU_ENTRY_POINT_RENAME(u_strFindFirst) +#define u_strFindLast U_ICU_ENTRY_POINT_RENAME(u_strFindLast) +#define u_strFoldCase U_ICU_ENTRY_POINT_RENAME(u_strFoldCase) +#define u_strFromJavaModifiedUTF8WithSub U_ICU_ENTRY_POINT_RENAME(u_strFromJavaModifiedUTF8WithSub) +#define u_strFromPunycode U_ICU_ENTRY_POINT_RENAME(u_strFromPunycode) +#define u_strFromUTF32 U_ICU_ENTRY_POINT_RENAME(u_strFromUTF32) +#define u_strFromUTF32WithSub U_ICU_ENTRY_POINT_RENAME(u_strFromUTF32WithSub) +#define u_strFromUTF8 U_ICU_ENTRY_POINT_RENAME(u_strFromUTF8) +#define u_strFromUTF8Lenient U_ICU_ENTRY_POINT_RENAME(u_strFromUTF8Lenient) +#define u_strFromUTF8WithSub U_ICU_ENTRY_POINT_RENAME(u_strFromUTF8WithSub) +#define u_strFromWCS U_ICU_ENTRY_POINT_RENAME(u_strFromWCS) +#define u_strHasMoreChar32Than U_ICU_ENTRY_POINT_RENAME(u_strHasMoreChar32Than) +#define u_strToJavaModifiedUTF8 U_ICU_ENTRY_POINT_RENAME(u_strToJavaModifiedUTF8) +#define u_strToLower U_ICU_ENTRY_POINT_RENAME(u_strToLower) +#define u_strToPunycode U_ICU_ENTRY_POINT_RENAME(u_strToPunycode) +#define u_strToTitle U_ICU_ENTRY_POINT_RENAME(u_strToTitle) +#define u_strToUTF32 U_ICU_ENTRY_POINT_RENAME(u_strToUTF32) +#define u_strToUTF32WithSub U_ICU_ENTRY_POINT_RENAME(u_strToUTF32WithSub) +#define u_strToUTF8 U_ICU_ENTRY_POINT_RENAME(u_strToUTF8) +#define u_strToUTF8WithSub U_ICU_ENTRY_POINT_RENAME(u_strToUTF8WithSub) +#define u_strToUpper U_ICU_ENTRY_POINT_RENAME(u_strToUpper) +#define u_strToWCS U_ICU_ENTRY_POINT_RENAME(u_strToWCS) +#define u_strcasecmp U_ICU_ENTRY_POINT_RENAME(u_strcasecmp) +#define u_strcat U_ICU_ENTRY_POINT_RENAME(u_strcat) +#define u_strchr U_ICU_ENTRY_POINT_RENAME(u_strchr) +#define u_strchr32 U_ICU_ENTRY_POINT_RENAME(u_strchr32) +#define u_strcmp U_ICU_ENTRY_POINT_RENAME(u_strcmp) +#define u_strcmpCodePointOrder U_ICU_ENTRY_POINT_RENAME(u_strcmpCodePointOrder) +#define u_strcmpFold U_ICU_ENTRY_POINT_RENAME(u_strcmpFold) +#define u_strcpy U_ICU_ENTRY_POINT_RENAME(u_strcpy) +#define u_strcspn U_ICU_ENTRY_POINT_RENAME(u_strcspn) +#define u_strlen U_ICU_ENTRY_POINT_RENAME(u_strlen) +#define u_strncasecmp U_ICU_ENTRY_POINT_RENAME(u_strncasecmp) +#define u_strncat U_ICU_ENTRY_POINT_RENAME(u_strncat) +#define u_strncmp U_ICU_ENTRY_POINT_RENAME(u_strncmp) +#define u_strncmpCodePointOrder U_ICU_ENTRY_POINT_RENAME(u_strncmpCodePointOrder) +#define u_strncpy U_ICU_ENTRY_POINT_RENAME(u_strncpy) +#define u_strpbrk U_ICU_ENTRY_POINT_RENAME(u_strpbrk) +#define u_strrchr U_ICU_ENTRY_POINT_RENAME(u_strrchr) +#define u_strrchr32 U_ICU_ENTRY_POINT_RENAME(u_strrchr32) +#define u_strrstr U_ICU_ENTRY_POINT_RENAME(u_strrstr) +#define u_strspn U_ICU_ENTRY_POINT_RENAME(u_strspn) +#define u_strstr U_ICU_ENTRY_POINT_RENAME(u_strstr) +#define u_strtok_r U_ICU_ENTRY_POINT_RENAME(u_strtok_r) +#define u_terminateChars U_ICU_ENTRY_POINT_RENAME(u_terminateChars) +#define u_terminateUChar32s U_ICU_ENTRY_POINT_RENAME(u_terminateUChar32s) +#define u_terminateUChars U_ICU_ENTRY_POINT_RENAME(u_terminateUChars) +#define u_terminateWChars U_ICU_ENTRY_POINT_RENAME(u_terminateWChars) +#define u_tolower U_ICU_ENTRY_POINT_RENAME(u_tolower) +#define u_totitle U_ICU_ENTRY_POINT_RENAME(u_totitle) +#define u_toupper U_ICU_ENTRY_POINT_RENAME(u_toupper) +#define u_uastrcpy U_ICU_ENTRY_POINT_RENAME(u_uastrcpy) +#define u_uastrncpy U_ICU_ENTRY_POINT_RENAME(u_uastrncpy) +#define u_unescape U_ICU_ENTRY_POINT_RENAME(u_unescape) +#define u_unescapeAt U_ICU_ENTRY_POINT_RENAME(u_unescapeAt) +#define u_versionFromString U_ICU_ENTRY_POINT_RENAME(u_versionFromString) +#define u_versionFromUString U_ICU_ENTRY_POINT_RENAME(u_versionFromUString) +#define u_versionToString U_ICU_ENTRY_POINT_RENAME(u_versionToString) +#define u_vformatMessage U_ICU_ENTRY_POINT_RENAME(u_vformatMessage) +#define u_vformatMessageWithError U_ICU_ENTRY_POINT_RENAME(u_vformatMessageWithError) +#define u_vfprintf U_ICU_ENTRY_POINT_RENAME(u_vfprintf) +#define u_vfprintf_u U_ICU_ENTRY_POINT_RENAME(u_vfprintf_u) +#define u_vfscanf U_ICU_ENTRY_POINT_RENAME(u_vfscanf) +#define u_vfscanf_u U_ICU_ENTRY_POINT_RENAME(u_vfscanf_u) +#define u_vparseMessage U_ICU_ENTRY_POINT_RENAME(u_vparseMessage) +#define u_vparseMessageWithError U_ICU_ENTRY_POINT_RENAME(u_vparseMessageWithError) +#define u_vsnprintf U_ICU_ENTRY_POINT_RENAME(u_vsnprintf) +#define u_vsnprintf_u U_ICU_ENTRY_POINT_RENAME(u_vsnprintf_u) +#define u_vsprintf U_ICU_ENTRY_POINT_RENAME(u_vsprintf) +#define u_vsprintf_u U_ICU_ENTRY_POINT_RENAME(u_vsprintf_u) +#define u_vsscanf U_ICU_ENTRY_POINT_RENAME(u_vsscanf) +#define u_vsscanf_u U_ICU_ENTRY_POINT_RENAME(u_vsscanf_u) +#define u_writeIdenticalLevelRun U_ICU_ENTRY_POINT_RENAME(u_writeIdenticalLevelRun) +#define ubidi_addPropertyStarts U_ICU_ENTRY_POINT_RENAME(ubidi_addPropertyStarts) +#define ubidi_close U_ICU_ENTRY_POINT_RENAME(ubidi_close) +#define ubidi_countParagraphs U_ICU_ENTRY_POINT_RENAME(ubidi_countParagraphs) +#define ubidi_countRuns U_ICU_ENTRY_POINT_RENAME(ubidi_countRuns) +#define ubidi_getBaseDirection U_ICU_ENTRY_POINT_RENAME(ubidi_getBaseDirection) +#define ubidi_getClass U_ICU_ENTRY_POINT_RENAME(ubidi_getClass) +#define ubidi_getClassCallback U_ICU_ENTRY_POINT_RENAME(ubidi_getClassCallback) +#define ubidi_getCustomizedClass U_ICU_ENTRY_POINT_RENAME(ubidi_getCustomizedClass) +#define ubidi_getDirection U_ICU_ENTRY_POINT_RENAME(ubidi_getDirection) +#define ubidi_getJoiningGroup U_ICU_ENTRY_POINT_RENAME(ubidi_getJoiningGroup) +#define ubidi_getJoiningType U_ICU_ENTRY_POINT_RENAME(ubidi_getJoiningType) +#define ubidi_getLength U_ICU_ENTRY_POINT_RENAME(ubidi_getLength) +#define ubidi_getLevelAt U_ICU_ENTRY_POINT_RENAME(ubidi_getLevelAt) +#define ubidi_getLevels U_ICU_ENTRY_POINT_RENAME(ubidi_getLevels) +#define ubidi_getLogicalIndex U_ICU_ENTRY_POINT_RENAME(ubidi_getLogicalIndex) +#define ubidi_getLogicalMap U_ICU_ENTRY_POINT_RENAME(ubidi_getLogicalMap) +#define ubidi_getLogicalRun U_ICU_ENTRY_POINT_RENAME(ubidi_getLogicalRun) +#define ubidi_getMaxValue U_ICU_ENTRY_POINT_RENAME(ubidi_getMaxValue) +#define ubidi_getMemory U_ICU_ENTRY_POINT_RENAME(ubidi_getMemory) +#define ubidi_getMirror U_ICU_ENTRY_POINT_RENAME(ubidi_getMirror) +#define ubidi_getPairedBracket U_ICU_ENTRY_POINT_RENAME(ubidi_getPairedBracket) +#define ubidi_getPairedBracketType U_ICU_ENTRY_POINT_RENAME(ubidi_getPairedBracketType) +#define ubidi_getParaLevel U_ICU_ENTRY_POINT_RENAME(ubidi_getParaLevel) +#define ubidi_getParaLevelAtIndex U_ICU_ENTRY_POINT_RENAME(ubidi_getParaLevelAtIndex) +#define ubidi_getParagraph U_ICU_ENTRY_POINT_RENAME(ubidi_getParagraph) +#define ubidi_getParagraphByIndex U_ICU_ENTRY_POINT_RENAME(ubidi_getParagraphByIndex) +#define ubidi_getProcessedLength U_ICU_ENTRY_POINT_RENAME(ubidi_getProcessedLength) +#define ubidi_getReorderingMode U_ICU_ENTRY_POINT_RENAME(ubidi_getReorderingMode) +#define ubidi_getReorderingOptions U_ICU_ENTRY_POINT_RENAME(ubidi_getReorderingOptions) +#define ubidi_getResultLength U_ICU_ENTRY_POINT_RENAME(ubidi_getResultLength) +#define ubidi_getRuns U_ICU_ENTRY_POINT_RENAME(ubidi_getRuns) +#define ubidi_getSingleton U_ICU_ENTRY_POINT_RENAME(ubidi_getSingleton) +#define ubidi_getText U_ICU_ENTRY_POINT_RENAME(ubidi_getText) +#define ubidi_getVisualIndex U_ICU_ENTRY_POINT_RENAME(ubidi_getVisualIndex) +#define ubidi_getVisualMap U_ICU_ENTRY_POINT_RENAME(ubidi_getVisualMap) +#define ubidi_getVisualRun U_ICU_ENTRY_POINT_RENAME(ubidi_getVisualRun) +#define ubidi_invertMap U_ICU_ENTRY_POINT_RENAME(ubidi_invertMap) +#define ubidi_isBidiControl U_ICU_ENTRY_POINT_RENAME(ubidi_isBidiControl) +#define ubidi_isInverse U_ICU_ENTRY_POINT_RENAME(ubidi_isInverse) +#define ubidi_isJoinControl U_ICU_ENTRY_POINT_RENAME(ubidi_isJoinControl) +#define ubidi_isMirrored U_ICU_ENTRY_POINT_RENAME(ubidi_isMirrored) +#define ubidi_isOrderParagraphsLTR U_ICU_ENTRY_POINT_RENAME(ubidi_isOrderParagraphsLTR) +#define ubidi_open U_ICU_ENTRY_POINT_RENAME(ubidi_open) +#define ubidi_openSized U_ICU_ENTRY_POINT_RENAME(ubidi_openSized) +#define ubidi_orderParagraphsLTR U_ICU_ENTRY_POINT_RENAME(ubidi_orderParagraphsLTR) +#define ubidi_reorderLogical U_ICU_ENTRY_POINT_RENAME(ubidi_reorderLogical) +#define ubidi_reorderVisual U_ICU_ENTRY_POINT_RENAME(ubidi_reorderVisual) +#define ubidi_setClassCallback U_ICU_ENTRY_POINT_RENAME(ubidi_setClassCallback) +#define ubidi_setContext U_ICU_ENTRY_POINT_RENAME(ubidi_setContext) +#define ubidi_setInverse U_ICU_ENTRY_POINT_RENAME(ubidi_setInverse) +#define ubidi_setLine U_ICU_ENTRY_POINT_RENAME(ubidi_setLine) +#define ubidi_setPara U_ICU_ENTRY_POINT_RENAME(ubidi_setPara) +#define ubidi_setReorderingMode U_ICU_ENTRY_POINT_RENAME(ubidi_setReorderingMode) +#define ubidi_setReorderingOptions U_ICU_ENTRY_POINT_RENAME(ubidi_setReorderingOptions) +#define ubidi_writeReordered U_ICU_ENTRY_POINT_RENAME(ubidi_writeReordered) +#define ubidi_writeReverse U_ICU_ENTRY_POINT_RENAME(ubidi_writeReverse) +#define ublock_getCode U_ICU_ENTRY_POINT_RENAME(ublock_getCode) +#define ubrk_close U_ICU_ENTRY_POINT_RENAME(ubrk_close) +#define ubrk_countAvailable U_ICU_ENTRY_POINT_RENAME(ubrk_countAvailable) +#define ubrk_current U_ICU_ENTRY_POINT_RENAME(ubrk_current) +#define ubrk_first U_ICU_ENTRY_POINT_RENAME(ubrk_first) +#define ubrk_following U_ICU_ENTRY_POINT_RENAME(ubrk_following) +#define ubrk_getAvailable U_ICU_ENTRY_POINT_RENAME(ubrk_getAvailable) +#define ubrk_getLocaleByType U_ICU_ENTRY_POINT_RENAME(ubrk_getLocaleByType) +#define ubrk_getRuleStatus U_ICU_ENTRY_POINT_RENAME(ubrk_getRuleStatus) +#define ubrk_getRuleStatusVec U_ICU_ENTRY_POINT_RENAME(ubrk_getRuleStatusVec) +#define ubrk_isBoundary U_ICU_ENTRY_POINT_RENAME(ubrk_isBoundary) +#define ubrk_last U_ICU_ENTRY_POINT_RENAME(ubrk_last) +#define ubrk_next U_ICU_ENTRY_POINT_RENAME(ubrk_next) +#define ubrk_open U_ICU_ENTRY_POINT_RENAME(ubrk_open) +#define ubrk_openRules U_ICU_ENTRY_POINT_RENAME(ubrk_openRules) +#define ubrk_preceding U_ICU_ENTRY_POINT_RENAME(ubrk_preceding) +#define ubrk_previous U_ICU_ENTRY_POINT_RENAME(ubrk_previous) +#define ubrk_refreshUText U_ICU_ENTRY_POINT_RENAME(ubrk_refreshUText) +#define ubrk_safeClone U_ICU_ENTRY_POINT_RENAME(ubrk_safeClone) +#define ubrk_setText U_ICU_ENTRY_POINT_RENAME(ubrk_setText) +#define ubrk_setUText U_ICU_ENTRY_POINT_RENAME(ubrk_setUText) +#define ubrk_swap U_ICU_ENTRY_POINT_RENAME(ubrk_swap) +#define ucache_compareKeys U_ICU_ENTRY_POINT_RENAME(ucache_compareKeys) +#define ucache_deleteKey U_ICU_ENTRY_POINT_RENAME(ucache_deleteKey) +#define ucache_hashKeys U_ICU_ENTRY_POINT_RENAME(ucache_hashKeys) +#define ucal_add U_ICU_ENTRY_POINT_RENAME(ucal_add) +#define ucal_clear U_ICU_ENTRY_POINT_RENAME(ucal_clear) +#define ucal_clearField U_ICU_ENTRY_POINT_RENAME(ucal_clearField) +#define ucal_clone U_ICU_ENTRY_POINT_RENAME(ucal_clone) +#define ucal_close U_ICU_ENTRY_POINT_RENAME(ucal_close) +#define ucal_countAvailable U_ICU_ENTRY_POINT_RENAME(ucal_countAvailable) +#define ucal_equivalentTo U_ICU_ENTRY_POINT_RENAME(ucal_equivalentTo) +#define ucal_get U_ICU_ENTRY_POINT_RENAME(ucal_get) +#define ucal_getAttribute U_ICU_ENTRY_POINT_RENAME(ucal_getAttribute) +#define ucal_getAvailable U_ICU_ENTRY_POINT_RENAME(ucal_getAvailable) +#define ucal_getCanonicalTimeZoneID U_ICU_ENTRY_POINT_RENAME(ucal_getCanonicalTimeZoneID) +#define ucal_getDSTSavings U_ICU_ENTRY_POINT_RENAME(ucal_getDSTSavings) +#define ucal_getDayOfWeekType U_ICU_ENTRY_POINT_RENAME(ucal_getDayOfWeekType) +#define ucal_getDefaultTimeZone U_ICU_ENTRY_POINT_RENAME(ucal_getDefaultTimeZone) +#define ucal_getFieldDifference U_ICU_ENTRY_POINT_RENAME(ucal_getFieldDifference) +#define ucal_getGregorianChange U_ICU_ENTRY_POINT_RENAME(ucal_getGregorianChange) +#define ucal_getKeywordValuesForLocale U_ICU_ENTRY_POINT_RENAME(ucal_getKeywordValuesForLocale) +#define ucal_getLimit U_ICU_ENTRY_POINT_RENAME(ucal_getLimit) +#define ucal_getLocaleByType U_ICU_ENTRY_POINT_RENAME(ucal_getLocaleByType) +#define ucal_getMillis U_ICU_ENTRY_POINT_RENAME(ucal_getMillis) +#define ucal_getNow U_ICU_ENTRY_POINT_RENAME(ucal_getNow) +#define ucal_getTZDataVersion U_ICU_ENTRY_POINT_RENAME(ucal_getTZDataVersion) +#define ucal_getTimeZoneDisplayName U_ICU_ENTRY_POINT_RENAME(ucal_getTimeZoneDisplayName) +#define ucal_getTimeZoneID U_ICU_ENTRY_POINT_RENAME(ucal_getTimeZoneID) +#define ucal_getTimeZoneIDForWindowsID U_ICU_ENTRY_POINT_RENAME(ucal_getTimeZoneIDForWindowsID) +#define ucal_getTimeZoneTransitionDate U_ICU_ENTRY_POINT_RENAME(ucal_getTimeZoneTransitionDate) +#define ucal_getType U_ICU_ENTRY_POINT_RENAME(ucal_getType) +#define ucal_getWeekendTransition U_ICU_ENTRY_POINT_RENAME(ucal_getWeekendTransition) +#define ucal_getWindowsTimeZoneID U_ICU_ENTRY_POINT_RENAME(ucal_getWindowsTimeZoneID) +#define ucal_inDaylightTime U_ICU_ENTRY_POINT_RENAME(ucal_inDaylightTime) +#define ucal_isSet U_ICU_ENTRY_POINT_RENAME(ucal_isSet) +#define ucal_isWeekend U_ICU_ENTRY_POINT_RENAME(ucal_isWeekend) +#define ucal_open U_ICU_ENTRY_POINT_RENAME(ucal_open) +#define ucal_openCountryTimeZones U_ICU_ENTRY_POINT_RENAME(ucal_openCountryTimeZones) +#define ucal_openTimeZoneIDEnumeration U_ICU_ENTRY_POINT_RENAME(ucal_openTimeZoneIDEnumeration) +#define ucal_openTimeZones U_ICU_ENTRY_POINT_RENAME(ucal_openTimeZones) +#define ucal_roll U_ICU_ENTRY_POINT_RENAME(ucal_roll) +#define ucal_set U_ICU_ENTRY_POINT_RENAME(ucal_set) +#define ucal_setAttribute U_ICU_ENTRY_POINT_RENAME(ucal_setAttribute) +#define ucal_setDate U_ICU_ENTRY_POINT_RENAME(ucal_setDate) +#define ucal_setDateTime U_ICU_ENTRY_POINT_RENAME(ucal_setDateTime) +#define ucal_setDefaultTimeZone U_ICU_ENTRY_POINT_RENAME(ucal_setDefaultTimeZone) +#define ucal_setGregorianChange U_ICU_ENTRY_POINT_RENAME(ucal_setGregorianChange) +#define ucal_setMillis U_ICU_ENTRY_POINT_RENAME(ucal_setMillis) +#define ucal_setTimeZone U_ICU_ENTRY_POINT_RENAME(ucal_setTimeZone) +#define ucase_addCaseClosure U_ICU_ENTRY_POINT_RENAME(ucase_addCaseClosure) +#define ucase_addPropertyStarts U_ICU_ENTRY_POINT_RENAME(ucase_addPropertyStarts) +#define ucase_addStringCaseClosure U_ICU_ENTRY_POINT_RENAME(ucase_addStringCaseClosure) +#define ucase_fold U_ICU_ENTRY_POINT_RENAME(ucase_fold) +#define ucase_getCaseLocale U_ICU_ENTRY_POINT_RENAME(ucase_getCaseLocale) +#define ucase_getSingleton U_ICU_ENTRY_POINT_RENAME(ucase_getSingleton) +#define ucase_getType U_ICU_ENTRY_POINT_RENAME(ucase_getType) +#define ucase_getTypeOrIgnorable U_ICU_ENTRY_POINT_RENAME(ucase_getTypeOrIgnorable) +#define ucase_hasBinaryProperty U_ICU_ENTRY_POINT_RENAME(ucase_hasBinaryProperty) +#define ucase_isCaseSensitive U_ICU_ENTRY_POINT_RENAME(ucase_isCaseSensitive) +#define ucase_isSoftDotted U_ICU_ENTRY_POINT_RENAME(ucase_isSoftDotted) +#define ucase_toFullFolding U_ICU_ENTRY_POINT_RENAME(ucase_toFullFolding) +#define ucase_toFullLower U_ICU_ENTRY_POINT_RENAME(ucase_toFullLower) +#define ucase_toFullTitle U_ICU_ENTRY_POINT_RENAME(ucase_toFullTitle) +#define ucase_toFullUpper U_ICU_ENTRY_POINT_RENAME(ucase_toFullUpper) +#define ucase_tolower U_ICU_ENTRY_POINT_RENAME(ucase_tolower) +#define ucase_totitle U_ICU_ENTRY_POINT_RENAME(ucase_totitle) +#define ucase_toupper U_ICU_ENTRY_POINT_RENAME(ucase_toupper) +#define ucasemap_close U_ICU_ENTRY_POINT_RENAME(ucasemap_close) +#define ucasemap_getBreakIterator U_ICU_ENTRY_POINT_RENAME(ucasemap_getBreakIterator) +#define ucasemap_getLocale U_ICU_ENTRY_POINT_RENAME(ucasemap_getLocale) +#define ucasemap_getOptions U_ICU_ENTRY_POINT_RENAME(ucasemap_getOptions) +#define ucasemap_internalUTF8ToTitle U_ICU_ENTRY_POINT_RENAME(ucasemap_internalUTF8ToTitle) +#define ucasemap_mapUTF8 U_ICU_ENTRY_POINT_RENAME(ucasemap_mapUTF8) +#define ucasemap_open U_ICU_ENTRY_POINT_RENAME(ucasemap_open) +#define ucasemap_setBreakIterator U_ICU_ENTRY_POINT_RENAME(ucasemap_setBreakIterator) +#define ucasemap_setLocale U_ICU_ENTRY_POINT_RENAME(ucasemap_setLocale) +#define ucasemap_setOptions U_ICU_ENTRY_POINT_RENAME(ucasemap_setOptions) +#define ucasemap_toTitle U_ICU_ENTRY_POINT_RENAME(ucasemap_toTitle) +#define ucasemap_utf8FoldCase U_ICU_ENTRY_POINT_RENAME(ucasemap_utf8FoldCase) +#define ucasemap_utf8ToLower U_ICU_ENTRY_POINT_RENAME(ucasemap_utf8ToLower) +#define ucasemap_utf8ToTitle U_ICU_ENTRY_POINT_RENAME(ucasemap_utf8ToTitle) +#define ucasemap_utf8ToUpper U_ICU_ENTRY_POINT_RENAME(ucasemap_utf8ToUpper) +#define uchar_addPropertyStarts U_ICU_ENTRY_POINT_RENAME(uchar_addPropertyStarts) +#define uchar_swapNames U_ICU_ENTRY_POINT_RENAME(uchar_swapNames) +#define ucln_cleanupOne U_ICU_ENTRY_POINT_RENAME(ucln_cleanupOne) +#define ucln_common_registerCleanup U_ICU_ENTRY_POINT_RENAME(ucln_common_registerCleanup) +#define ucln_i18n_registerCleanup U_ICU_ENTRY_POINT_RENAME(ucln_i18n_registerCleanup) +#define ucln_io_registerCleanup U_ICU_ENTRY_POINT_RENAME(ucln_io_registerCleanup) +#define ucln_lib_cleanup U_ICU_ENTRY_POINT_RENAME(ucln_lib_cleanup) +#define ucln_registerCleanup U_ICU_ENTRY_POINT_RENAME(ucln_registerCleanup) +#define ucnv_MBCSFromUChar32 U_ICU_ENTRY_POINT_RENAME(ucnv_MBCSFromUChar32) +#define ucnv_MBCSFromUnicodeWithOffsets U_ICU_ENTRY_POINT_RENAME(ucnv_MBCSFromUnicodeWithOffsets) +#define ucnv_MBCSGetFilteredUnicodeSetForUnicode U_ICU_ENTRY_POINT_RENAME(ucnv_MBCSGetFilteredUnicodeSetForUnicode) +#define ucnv_MBCSGetType U_ICU_ENTRY_POINT_RENAME(ucnv_MBCSGetType) +#define ucnv_MBCSGetUnicodeSetForUnicode U_ICU_ENTRY_POINT_RENAME(ucnv_MBCSGetUnicodeSetForUnicode) +#define ucnv_MBCSIsLeadByte U_ICU_ENTRY_POINT_RENAME(ucnv_MBCSIsLeadByte) +#define ucnv_MBCSSimpleGetNextUChar U_ICU_ENTRY_POINT_RENAME(ucnv_MBCSSimpleGetNextUChar) +#define ucnv_MBCSToUnicodeWithOffsets U_ICU_ENTRY_POINT_RENAME(ucnv_MBCSToUnicodeWithOffsets) +#define ucnv_bld_countAvailableConverters U_ICU_ENTRY_POINT_RENAME(ucnv_bld_countAvailableConverters) +#define ucnv_bld_getAvailableConverter U_ICU_ENTRY_POINT_RENAME(ucnv_bld_getAvailableConverter) +#define ucnv_canCreateConverter U_ICU_ENTRY_POINT_RENAME(ucnv_canCreateConverter) +#define ucnv_cbFromUWriteBytes U_ICU_ENTRY_POINT_RENAME(ucnv_cbFromUWriteBytes) +#define ucnv_cbFromUWriteSub U_ICU_ENTRY_POINT_RENAME(ucnv_cbFromUWriteSub) +#define ucnv_cbFromUWriteUChars U_ICU_ENTRY_POINT_RENAME(ucnv_cbFromUWriteUChars) +#define ucnv_cbToUWriteSub U_ICU_ENTRY_POINT_RENAME(ucnv_cbToUWriteSub) +#define ucnv_cbToUWriteUChars U_ICU_ENTRY_POINT_RENAME(ucnv_cbToUWriteUChars) +#define ucnv_close U_ICU_ENTRY_POINT_RENAME(ucnv_close) +#define ucnv_compareNames U_ICU_ENTRY_POINT_RENAME(ucnv_compareNames) +#define ucnv_convert U_ICU_ENTRY_POINT_RENAME(ucnv_convert) +#define ucnv_convertEx U_ICU_ENTRY_POINT_RENAME(ucnv_convertEx) +#define ucnv_countAliases U_ICU_ENTRY_POINT_RENAME(ucnv_countAliases) +#define ucnv_countAvailable U_ICU_ENTRY_POINT_RENAME(ucnv_countAvailable) +#define ucnv_countStandards U_ICU_ENTRY_POINT_RENAME(ucnv_countStandards) +#define ucnv_createAlgorithmicConverter U_ICU_ENTRY_POINT_RENAME(ucnv_createAlgorithmicConverter) +#define ucnv_createConverter U_ICU_ENTRY_POINT_RENAME(ucnv_createConverter) +#define ucnv_createConverterFromPackage U_ICU_ENTRY_POINT_RENAME(ucnv_createConverterFromPackage) +#define ucnv_createConverterFromSharedData U_ICU_ENTRY_POINT_RENAME(ucnv_createConverterFromSharedData) +#define ucnv_detectUnicodeSignature U_ICU_ENTRY_POINT_RENAME(ucnv_detectUnicodeSignature) +#define ucnv_extContinueMatchFromU U_ICU_ENTRY_POINT_RENAME(ucnv_extContinueMatchFromU) +#define ucnv_extContinueMatchToU U_ICU_ENTRY_POINT_RENAME(ucnv_extContinueMatchToU) +#define ucnv_extGetUnicodeSet U_ICU_ENTRY_POINT_RENAME(ucnv_extGetUnicodeSet) +#define ucnv_extInitialMatchFromU U_ICU_ENTRY_POINT_RENAME(ucnv_extInitialMatchFromU) +#define ucnv_extInitialMatchToU U_ICU_ENTRY_POINT_RENAME(ucnv_extInitialMatchToU) +#define ucnv_extSimpleMatchFromU U_ICU_ENTRY_POINT_RENAME(ucnv_extSimpleMatchFromU) +#define ucnv_extSimpleMatchToU U_ICU_ENTRY_POINT_RENAME(ucnv_extSimpleMatchToU) +#define ucnv_fixFileSeparator U_ICU_ENTRY_POINT_RENAME(ucnv_fixFileSeparator) +#define ucnv_flushCache U_ICU_ENTRY_POINT_RENAME(ucnv_flushCache) +#define ucnv_fromAlgorithmic U_ICU_ENTRY_POINT_RENAME(ucnv_fromAlgorithmic) +#define ucnv_fromUChars U_ICU_ENTRY_POINT_RENAME(ucnv_fromUChars) +#define ucnv_fromUCountPending U_ICU_ENTRY_POINT_RENAME(ucnv_fromUCountPending) +#define ucnv_fromUWriteBytes U_ICU_ENTRY_POINT_RENAME(ucnv_fromUWriteBytes) +#define ucnv_fromUnicode U_ICU_ENTRY_POINT_RENAME(ucnv_fromUnicode) +#define ucnv_fromUnicode_UTF8 U_ICU_ENTRY_POINT_RENAME(ucnv_fromUnicode_UTF8) +#define ucnv_fromUnicode_UTF8_OFFSETS_LOGIC U_ICU_ENTRY_POINT_RENAME(ucnv_fromUnicode_UTF8_OFFSETS_LOGIC) +#define ucnv_getAlias U_ICU_ENTRY_POINT_RENAME(ucnv_getAlias) +#define ucnv_getAliases U_ICU_ENTRY_POINT_RENAME(ucnv_getAliases) +#define ucnv_getAvailableName U_ICU_ENTRY_POINT_RENAME(ucnv_getAvailableName) +#define ucnv_getCCSID U_ICU_ENTRY_POINT_RENAME(ucnv_getCCSID) +#define ucnv_getCanonicalName U_ICU_ENTRY_POINT_RENAME(ucnv_getCanonicalName) +#define ucnv_getCompleteUnicodeSet U_ICU_ENTRY_POINT_RENAME(ucnv_getCompleteUnicodeSet) +#define ucnv_getDefaultName U_ICU_ENTRY_POINT_RENAME(ucnv_getDefaultName) +#define ucnv_getDisplayName U_ICU_ENTRY_POINT_RENAME(ucnv_getDisplayName) +#define ucnv_getFromUCallBack U_ICU_ENTRY_POINT_RENAME(ucnv_getFromUCallBack) +#define ucnv_getInvalidChars U_ICU_ENTRY_POINT_RENAME(ucnv_getInvalidChars) +#define ucnv_getInvalidUChars U_ICU_ENTRY_POINT_RENAME(ucnv_getInvalidUChars) +#define ucnv_getMaxCharSize U_ICU_ENTRY_POINT_RENAME(ucnv_getMaxCharSize) +#define ucnv_getMinCharSize U_ICU_ENTRY_POINT_RENAME(ucnv_getMinCharSize) +#define ucnv_getName U_ICU_ENTRY_POINT_RENAME(ucnv_getName) +#define ucnv_getNextUChar U_ICU_ENTRY_POINT_RENAME(ucnv_getNextUChar) +#define ucnv_getNonSurrogateUnicodeSet U_ICU_ENTRY_POINT_RENAME(ucnv_getNonSurrogateUnicodeSet) +#define ucnv_getPlatform U_ICU_ENTRY_POINT_RENAME(ucnv_getPlatform) +#define ucnv_getStandard U_ICU_ENTRY_POINT_RENAME(ucnv_getStandard) +#define ucnv_getStandardName U_ICU_ENTRY_POINT_RENAME(ucnv_getStandardName) +#define ucnv_getStarters U_ICU_ENTRY_POINT_RENAME(ucnv_getStarters) +#define ucnv_getSubstChars U_ICU_ENTRY_POINT_RENAME(ucnv_getSubstChars) +#define ucnv_getToUCallBack U_ICU_ENTRY_POINT_RENAME(ucnv_getToUCallBack) +#define ucnv_getType U_ICU_ENTRY_POINT_RENAME(ucnv_getType) +#define ucnv_getUnicodeSet U_ICU_ENTRY_POINT_RENAME(ucnv_getUnicodeSet) +#define ucnv_incrementRefCount U_ICU_ENTRY_POINT_RENAME(ucnv_incrementRefCount) +#define ucnv_io_countKnownConverters U_ICU_ENTRY_POINT_RENAME(ucnv_io_countKnownConverters) +#define ucnv_io_getConverterName U_ICU_ENTRY_POINT_RENAME(ucnv_io_getConverterName) +#define ucnv_io_stripASCIIForCompare U_ICU_ENTRY_POINT_RENAME(ucnv_io_stripASCIIForCompare) +#define ucnv_io_stripEBCDICForCompare U_ICU_ENTRY_POINT_RENAME(ucnv_io_stripEBCDICForCompare) +#define ucnv_isAmbiguous U_ICU_ENTRY_POINT_RENAME(ucnv_isAmbiguous) +#define ucnv_isFixedWidth U_ICU_ENTRY_POINT_RENAME(ucnv_isFixedWidth) +#define ucnv_load U_ICU_ENTRY_POINT_RENAME(ucnv_load) +#define ucnv_loadSharedData U_ICU_ENTRY_POINT_RENAME(ucnv_loadSharedData) +#define ucnv_open U_ICU_ENTRY_POINT_RENAME(ucnv_open) +#define ucnv_openAllNames U_ICU_ENTRY_POINT_RENAME(ucnv_openAllNames) +#define ucnv_openCCSID U_ICU_ENTRY_POINT_RENAME(ucnv_openCCSID) +#define ucnv_openPackage U_ICU_ENTRY_POINT_RENAME(ucnv_openPackage) +#define ucnv_openStandardNames U_ICU_ENTRY_POINT_RENAME(ucnv_openStandardNames) +#define ucnv_openU U_ICU_ENTRY_POINT_RENAME(ucnv_openU) +#define ucnv_reset U_ICU_ENTRY_POINT_RENAME(ucnv_reset) +#define ucnv_resetFromUnicode U_ICU_ENTRY_POINT_RENAME(ucnv_resetFromUnicode) +#define ucnv_resetToUnicode U_ICU_ENTRY_POINT_RENAME(ucnv_resetToUnicode) +#define ucnv_safeClone U_ICU_ENTRY_POINT_RENAME(ucnv_safeClone) +#define ucnv_setDefaultName U_ICU_ENTRY_POINT_RENAME(ucnv_setDefaultName) +#define ucnv_setFallback U_ICU_ENTRY_POINT_RENAME(ucnv_setFallback) +#define ucnv_setFromUCallBack U_ICU_ENTRY_POINT_RENAME(ucnv_setFromUCallBack) +#define ucnv_setSubstChars U_ICU_ENTRY_POINT_RENAME(ucnv_setSubstChars) +#define ucnv_setSubstString U_ICU_ENTRY_POINT_RENAME(ucnv_setSubstString) +#define ucnv_setToUCallBack U_ICU_ENTRY_POINT_RENAME(ucnv_setToUCallBack) +#define ucnv_swap U_ICU_ENTRY_POINT_RENAME(ucnv_swap) +#define ucnv_swapAliases U_ICU_ENTRY_POINT_RENAME(ucnv_swapAliases) +#define ucnv_toAlgorithmic U_ICU_ENTRY_POINT_RENAME(ucnv_toAlgorithmic) +#define ucnv_toUChars U_ICU_ENTRY_POINT_RENAME(ucnv_toUChars) +#define ucnv_toUCountPending U_ICU_ENTRY_POINT_RENAME(ucnv_toUCountPending) +#define ucnv_toUWriteCodePoint U_ICU_ENTRY_POINT_RENAME(ucnv_toUWriteCodePoint) +#define ucnv_toUWriteUChars U_ICU_ENTRY_POINT_RENAME(ucnv_toUWriteUChars) +#define ucnv_toUnicode U_ICU_ENTRY_POINT_RENAME(ucnv_toUnicode) +#define ucnv_unload U_ICU_ENTRY_POINT_RENAME(ucnv_unload) +#define ucnv_unloadSharedDataIfReady U_ICU_ENTRY_POINT_RENAME(ucnv_unloadSharedDataIfReady) +#define ucnv_usesFallback U_ICU_ENTRY_POINT_RENAME(ucnv_usesFallback) +#define ucnvsel_close U_ICU_ENTRY_POINT_RENAME(ucnvsel_close) +#define ucnvsel_open U_ICU_ENTRY_POINT_RENAME(ucnvsel_open) +#define ucnvsel_openFromSerialized U_ICU_ENTRY_POINT_RENAME(ucnvsel_openFromSerialized) +#define ucnvsel_selectForString U_ICU_ENTRY_POINT_RENAME(ucnvsel_selectForString) +#define ucnvsel_selectForUTF8 U_ICU_ENTRY_POINT_RENAME(ucnvsel_selectForUTF8) +#define ucnvsel_serialize U_ICU_ENTRY_POINT_RENAME(ucnvsel_serialize) +#define ucol_cloneBinary U_ICU_ENTRY_POINT_RENAME(ucol_cloneBinary) +#define ucol_close U_ICU_ENTRY_POINT_RENAME(ucol_close) +#define ucol_closeElements U_ICU_ENTRY_POINT_RENAME(ucol_closeElements) +#define ucol_countAvailable U_ICU_ENTRY_POINT_RENAME(ucol_countAvailable) +#define ucol_equal U_ICU_ENTRY_POINT_RENAME(ucol_equal) +#define ucol_equals U_ICU_ENTRY_POINT_RENAME(ucol_equals) +#define ucol_getAttribute U_ICU_ENTRY_POINT_RENAME(ucol_getAttribute) +#define ucol_getAvailable U_ICU_ENTRY_POINT_RENAME(ucol_getAvailable) +#define ucol_getBound U_ICU_ENTRY_POINT_RENAME(ucol_getBound) +#define ucol_getContractions U_ICU_ENTRY_POINT_RENAME(ucol_getContractions) +#define ucol_getContractionsAndExpansions U_ICU_ENTRY_POINT_RENAME(ucol_getContractionsAndExpansions) +#define ucol_getDisplayName U_ICU_ENTRY_POINT_RENAME(ucol_getDisplayName) +#define ucol_getEquivalentReorderCodes U_ICU_ENTRY_POINT_RENAME(ucol_getEquivalentReorderCodes) +#define ucol_getFunctionalEquivalent U_ICU_ENTRY_POINT_RENAME(ucol_getFunctionalEquivalent) +#define ucol_getKeywordValues U_ICU_ENTRY_POINT_RENAME(ucol_getKeywordValues) +#define ucol_getKeywordValuesForLocale U_ICU_ENTRY_POINT_RENAME(ucol_getKeywordValuesForLocale) +#define ucol_getKeywords U_ICU_ENTRY_POINT_RENAME(ucol_getKeywords) +#define ucol_getLocale U_ICU_ENTRY_POINT_RENAME(ucol_getLocale) +#define ucol_getLocaleByType U_ICU_ENTRY_POINT_RENAME(ucol_getLocaleByType) +#define ucol_getMaxExpansion U_ICU_ENTRY_POINT_RENAME(ucol_getMaxExpansion) +#define ucol_getMaxVariable U_ICU_ENTRY_POINT_RENAME(ucol_getMaxVariable) +#define ucol_getOffset U_ICU_ENTRY_POINT_RENAME(ucol_getOffset) +#define ucol_getReorderCodes U_ICU_ENTRY_POINT_RENAME(ucol_getReorderCodes) +#define ucol_getRules U_ICU_ENTRY_POINT_RENAME(ucol_getRules) +#define ucol_getRulesEx U_ICU_ENTRY_POINT_RENAME(ucol_getRulesEx) +#define ucol_getShortDefinitionString U_ICU_ENTRY_POINT_RENAME(ucol_getShortDefinitionString) +#define ucol_getSortKey U_ICU_ENTRY_POINT_RENAME(ucol_getSortKey) +#define ucol_getStrength U_ICU_ENTRY_POINT_RENAME(ucol_getStrength) +#define ucol_getTailoredSet U_ICU_ENTRY_POINT_RENAME(ucol_getTailoredSet) +#define ucol_getUCAVersion U_ICU_ENTRY_POINT_RENAME(ucol_getUCAVersion) +#define ucol_getUnsafeSet U_ICU_ENTRY_POINT_RENAME(ucol_getUnsafeSet) +#define ucol_getVariableTop U_ICU_ENTRY_POINT_RENAME(ucol_getVariableTop) +#define ucol_getVersion U_ICU_ENTRY_POINT_RENAME(ucol_getVersion) +#define ucol_greater U_ICU_ENTRY_POINT_RENAME(ucol_greater) +#define ucol_greaterOrEqual U_ICU_ENTRY_POINT_RENAME(ucol_greaterOrEqual) +#define ucol_keyHashCode U_ICU_ENTRY_POINT_RENAME(ucol_keyHashCode) +#define ucol_looksLikeCollationBinary U_ICU_ENTRY_POINT_RENAME(ucol_looksLikeCollationBinary) +#define ucol_mergeSortkeys U_ICU_ENTRY_POINT_RENAME(ucol_mergeSortkeys) +#define ucol_next U_ICU_ENTRY_POINT_RENAME(ucol_next) +#define ucol_nextSortKeyPart U_ICU_ENTRY_POINT_RENAME(ucol_nextSortKeyPart) +#define ucol_normalizeShortDefinitionString U_ICU_ENTRY_POINT_RENAME(ucol_normalizeShortDefinitionString) +#define ucol_open U_ICU_ENTRY_POINT_RENAME(ucol_open) +#define ucol_openAvailableLocales U_ICU_ENTRY_POINT_RENAME(ucol_openAvailableLocales) +#define ucol_openBinary U_ICU_ENTRY_POINT_RENAME(ucol_openBinary) +#define ucol_openElements U_ICU_ENTRY_POINT_RENAME(ucol_openElements) +#define ucol_openFromShortString U_ICU_ENTRY_POINT_RENAME(ucol_openFromShortString) +#define ucol_openRules U_ICU_ENTRY_POINT_RENAME(ucol_openRules) +#define ucol_prepareShortStringOpen U_ICU_ENTRY_POINT_RENAME(ucol_prepareShortStringOpen) +#define ucol_previous U_ICU_ENTRY_POINT_RENAME(ucol_previous) +#define ucol_primaryOrder U_ICU_ENTRY_POINT_RENAME(ucol_primaryOrder) +#define ucol_reset U_ICU_ENTRY_POINT_RENAME(ucol_reset) +#define ucol_restoreVariableTop U_ICU_ENTRY_POINT_RENAME(ucol_restoreVariableTop) +#define ucol_safeClone U_ICU_ENTRY_POINT_RENAME(ucol_safeClone) +#define ucol_secondaryOrder U_ICU_ENTRY_POINT_RENAME(ucol_secondaryOrder) +#define ucol_setAttribute U_ICU_ENTRY_POINT_RENAME(ucol_setAttribute) +#define ucol_setMaxVariable U_ICU_ENTRY_POINT_RENAME(ucol_setMaxVariable) +#define ucol_setOffset U_ICU_ENTRY_POINT_RENAME(ucol_setOffset) +#define ucol_setReorderCodes U_ICU_ENTRY_POINT_RENAME(ucol_setReorderCodes) +#define ucol_setStrength U_ICU_ENTRY_POINT_RENAME(ucol_setStrength) +#define ucol_setText U_ICU_ENTRY_POINT_RENAME(ucol_setText) +#define ucol_setVariableTop U_ICU_ENTRY_POINT_RENAME(ucol_setVariableTop) +#define ucol_strcoll U_ICU_ENTRY_POINT_RENAME(ucol_strcoll) +#define ucol_strcollIter U_ICU_ENTRY_POINT_RENAME(ucol_strcollIter) +#define ucol_strcollUTF8 U_ICU_ENTRY_POINT_RENAME(ucol_strcollUTF8) +#define ucol_swap U_ICU_ENTRY_POINT_RENAME(ucol_swap) +#define ucol_swapInverseUCA U_ICU_ENTRY_POINT_RENAME(ucol_swapInverseUCA) +#define ucol_tertiaryOrder U_ICU_ENTRY_POINT_RENAME(ucol_tertiaryOrder) +#define ucsdet_close U_ICU_ENTRY_POINT_RENAME(ucsdet_close) +#define ucsdet_detect U_ICU_ENTRY_POINT_RENAME(ucsdet_detect) +#define ucsdet_detectAll U_ICU_ENTRY_POINT_RENAME(ucsdet_detectAll) +#define ucsdet_enableInputFilter U_ICU_ENTRY_POINT_RENAME(ucsdet_enableInputFilter) +#define ucsdet_getAllDetectableCharsets U_ICU_ENTRY_POINT_RENAME(ucsdet_getAllDetectableCharsets) +#define ucsdet_getConfidence U_ICU_ENTRY_POINT_RENAME(ucsdet_getConfidence) +#define ucsdet_getDetectableCharsets U_ICU_ENTRY_POINT_RENAME(ucsdet_getDetectableCharsets) +#define ucsdet_getLanguage U_ICU_ENTRY_POINT_RENAME(ucsdet_getLanguage) +#define ucsdet_getName U_ICU_ENTRY_POINT_RENAME(ucsdet_getName) +#define ucsdet_getUChars U_ICU_ENTRY_POINT_RENAME(ucsdet_getUChars) +#define ucsdet_isInputFilterEnabled U_ICU_ENTRY_POINT_RENAME(ucsdet_isInputFilterEnabled) +#define ucsdet_open U_ICU_ENTRY_POINT_RENAME(ucsdet_open) +#define ucsdet_setDeclaredEncoding U_ICU_ENTRY_POINT_RENAME(ucsdet_setDeclaredEncoding) +#define ucsdet_setDetectableCharset U_ICU_ENTRY_POINT_RENAME(ucsdet_setDetectableCharset) +#define ucsdet_setText U_ICU_ENTRY_POINT_RENAME(ucsdet_setText) +#define ucurr_countCurrencies U_ICU_ENTRY_POINT_RENAME(ucurr_countCurrencies) +#define ucurr_forLocale U_ICU_ENTRY_POINT_RENAME(ucurr_forLocale) +#define ucurr_forLocaleAndDate U_ICU_ENTRY_POINT_RENAME(ucurr_forLocaleAndDate) +#define ucurr_getDefaultFractionDigits U_ICU_ENTRY_POINT_RENAME(ucurr_getDefaultFractionDigits) +#define ucurr_getDefaultFractionDigitsForUsage U_ICU_ENTRY_POINT_RENAME(ucurr_getDefaultFractionDigitsForUsage) +#define ucurr_getKeywordValuesForLocale U_ICU_ENTRY_POINT_RENAME(ucurr_getKeywordValuesForLocale) +#define ucurr_getName U_ICU_ENTRY_POINT_RENAME(ucurr_getName) +#define ucurr_getNumericCode U_ICU_ENTRY_POINT_RENAME(ucurr_getNumericCode) +#define ucurr_getPluralName U_ICU_ENTRY_POINT_RENAME(ucurr_getPluralName) +#define ucurr_getRoundingIncrement U_ICU_ENTRY_POINT_RENAME(ucurr_getRoundingIncrement) +#define ucurr_getRoundingIncrementForUsage U_ICU_ENTRY_POINT_RENAME(ucurr_getRoundingIncrementForUsage) +#define ucurr_isAvailable U_ICU_ENTRY_POINT_RENAME(ucurr_isAvailable) +#define ucurr_openISOCurrencies U_ICU_ENTRY_POINT_RENAME(ucurr_openISOCurrencies) +#define ucurr_register U_ICU_ENTRY_POINT_RENAME(ucurr_register) +#define ucurr_unregister U_ICU_ENTRY_POINT_RENAME(ucurr_unregister) +#define udat_adoptNumberFormat U_ICU_ENTRY_POINT_RENAME(udat_adoptNumberFormat) +#define udat_adoptNumberFormatForFields U_ICU_ENTRY_POINT_RENAME(udat_adoptNumberFormatForFields) +#define udat_applyPattern U_ICU_ENTRY_POINT_RENAME(udat_applyPattern) +#define udat_applyPatternRelative U_ICU_ENTRY_POINT_RENAME(udat_applyPatternRelative) +#define udat_clone U_ICU_ENTRY_POINT_RENAME(udat_clone) +#define udat_close U_ICU_ENTRY_POINT_RENAME(udat_close) +#define udat_countAvailable U_ICU_ENTRY_POINT_RENAME(udat_countAvailable) +#define udat_countSymbols U_ICU_ENTRY_POINT_RENAME(udat_countSymbols) +#define udat_format U_ICU_ENTRY_POINT_RENAME(udat_format) +#define udat_get2DigitYearStart U_ICU_ENTRY_POINT_RENAME(udat_get2DigitYearStart) +#define udat_getAvailable U_ICU_ENTRY_POINT_RENAME(udat_getAvailable) +#define udat_getBooleanAttribute U_ICU_ENTRY_POINT_RENAME(udat_getBooleanAttribute) +#define udat_getCalendar U_ICU_ENTRY_POINT_RENAME(udat_getCalendar) +#define udat_getContext U_ICU_ENTRY_POINT_RENAME(udat_getContext) +#define udat_getLocaleByType U_ICU_ENTRY_POINT_RENAME(udat_getLocaleByType) +#define udat_getNumberFormat U_ICU_ENTRY_POINT_RENAME(udat_getNumberFormat) +#define udat_getNumberFormatForField U_ICU_ENTRY_POINT_RENAME(udat_getNumberFormatForField) +#define udat_getSymbols U_ICU_ENTRY_POINT_RENAME(udat_getSymbols) +#define udat_isLenient U_ICU_ENTRY_POINT_RENAME(udat_isLenient) +#define udat_open U_ICU_ENTRY_POINT_RENAME(udat_open) +#define udat_parse U_ICU_ENTRY_POINT_RENAME(udat_parse) +#define udat_parseCalendar U_ICU_ENTRY_POINT_RENAME(udat_parseCalendar) +#define udat_registerOpener U_ICU_ENTRY_POINT_RENAME(udat_registerOpener) +#define udat_set2DigitYearStart U_ICU_ENTRY_POINT_RENAME(udat_set2DigitYearStart) +#define udat_setBooleanAttribute U_ICU_ENTRY_POINT_RENAME(udat_setBooleanAttribute) +#define udat_setCalendar U_ICU_ENTRY_POINT_RENAME(udat_setCalendar) +#define udat_setContext U_ICU_ENTRY_POINT_RENAME(udat_setContext) +#define udat_setLenient U_ICU_ENTRY_POINT_RENAME(udat_setLenient) +#define udat_setNumberFormat U_ICU_ENTRY_POINT_RENAME(udat_setNumberFormat) +#define udat_setSymbols U_ICU_ENTRY_POINT_RENAME(udat_setSymbols) +#define udat_toCalendarDateField U_ICU_ENTRY_POINT_RENAME(udat_toCalendarDateField) +#define udat_toPattern U_ICU_ENTRY_POINT_RENAME(udat_toPattern) +#define udat_toPatternRelativeDate U_ICU_ENTRY_POINT_RENAME(udat_toPatternRelativeDate) +#define udat_toPatternRelativeTime U_ICU_ENTRY_POINT_RENAME(udat_toPatternRelativeTime) +#define udat_unregisterOpener U_ICU_ENTRY_POINT_RENAME(udat_unregisterOpener) +#define udata_checkCommonData U_ICU_ENTRY_POINT_RENAME(udata_checkCommonData) +#define udata_close U_ICU_ENTRY_POINT_RENAME(udata_close) +#define udata_closeSwapper U_ICU_ENTRY_POINT_RENAME(udata_closeSwapper) +#define udata_getHeaderSize U_ICU_ENTRY_POINT_RENAME(udata_getHeaderSize) +#define udata_getInfo U_ICU_ENTRY_POINT_RENAME(udata_getInfo) +#define udata_getInfoSize U_ICU_ENTRY_POINT_RENAME(udata_getInfoSize) +#define udata_getLength U_ICU_ENTRY_POINT_RENAME(udata_getLength) +#define udata_getMemory U_ICU_ENTRY_POINT_RENAME(udata_getMemory) +#define udata_getRawMemory U_ICU_ENTRY_POINT_RENAME(udata_getRawMemory) +#define udata_open U_ICU_ENTRY_POINT_RENAME(udata_open) +#define udata_openChoice U_ICU_ENTRY_POINT_RENAME(udata_openChoice) +#define udata_openSwapper U_ICU_ENTRY_POINT_RENAME(udata_openSwapper) +#define udata_openSwapperForInputData U_ICU_ENTRY_POINT_RENAME(udata_openSwapperForInputData) +#define udata_printError U_ICU_ENTRY_POINT_RENAME(udata_printError) +#define udata_readInt16 U_ICU_ENTRY_POINT_RENAME(udata_readInt16) +#define udata_readInt32 U_ICU_ENTRY_POINT_RENAME(udata_readInt32) +#define udata_setAppData U_ICU_ENTRY_POINT_RENAME(udata_setAppData) +#define udata_setCommonData U_ICU_ENTRY_POINT_RENAME(udata_setCommonData) +#define udata_setFileAccess U_ICU_ENTRY_POINT_RENAME(udata_setFileAccess) +#define udata_swapDataHeader U_ICU_ENTRY_POINT_RENAME(udata_swapDataHeader) +#define udata_swapInvStringBlock U_ICU_ENTRY_POINT_RENAME(udata_swapInvStringBlock) +#define udatpg_addPattern U_ICU_ENTRY_POINT_RENAME(udatpg_addPattern) +#define udatpg_clone U_ICU_ENTRY_POINT_RENAME(udatpg_clone) +#define udatpg_close U_ICU_ENTRY_POINT_RENAME(udatpg_close) +#define udatpg_getAppendItemFormat U_ICU_ENTRY_POINT_RENAME(udatpg_getAppendItemFormat) +#define udatpg_getAppendItemName U_ICU_ENTRY_POINT_RENAME(udatpg_getAppendItemName) +#define udatpg_getBaseSkeleton U_ICU_ENTRY_POINT_RENAME(udatpg_getBaseSkeleton) +#define udatpg_getBestPattern U_ICU_ENTRY_POINT_RENAME(udatpg_getBestPattern) +#define udatpg_getBestPatternWithOptions U_ICU_ENTRY_POINT_RENAME(udatpg_getBestPatternWithOptions) +#define udatpg_getDateTimeFormat U_ICU_ENTRY_POINT_RENAME(udatpg_getDateTimeFormat) +#define udatpg_getDecimal U_ICU_ENTRY_POINT_RENAME(udatpg_getDecimal) +#define udatpg_getPatternForSkeleton U_ICU_ENTRY_POINT_RENAME(udatpg_getPatternForSkeleton) +#define udatpg_getSkeleton U_ICU_ENTRY_POINT_RENAME(udatpg_getSkeleton) +#define udatpg_open U_ICU_ENTRY_POINT_RENAME(udatpg_open) +#define udatpg_openBaseSkeletons U_ICU_ENTRY_POINT_RENAME(udatpg_openBaseSkeletons) +#define udatpg_openEmpty U_ICU_ENTRY_POINT_RENAME(udatpg_openEmpty) +#define udatpg_openSkeletons U_ICU_ENTRY_POINT_RENAME(udatpg_openSkeletons) +#define udatpg_replaceFieldTypes U_ICU_ENTRY_POINT_RENAME(udatpg_replaceFieldTypes) +#define udatpg_replaceFieldTypesWithOptions U_ICU_ENTRY_POINT_RENAME(udatpg_replaceFieldTypesWithOptions) +#define udatpg_setAppendItemFormat U_ICU_ENTRY_POINT_RENAME(udatpg_setAppendItemFormat) +#define udatpg_setAppendItemName U_ICU_ENTRY_POINT_RENAME(udatpg_setAppendItemName) +#define udatpg_setDateTimeFormat U_ICU_ENTRY_POINT_RENAME(udatpg_setDateTimeFormat) +#define udatpg_setDecimal U_ICU_ENTRY_POINT_RENAME(udatpg_setDecimal) +#define udict_swap U_ICU_ENTRY_POINT_RENAME(udict_swap) +#define udtitvfmt_close U_ICU_ENTRY_POINT_RENAME(udtitvfmt_close) +#define udtitvfmt_format U_ICU_ENTRY_POINT_RENAME(udtitvfmt_format) +#define udtitvfmt_open U_ICU_ENTRY_POINT_RENAME(udtitvfmt_open) +#define uenum_close U_ICU_ENTRY_POINT_RENAME(uenum_close) +#define uenum_count U_ICU_ENTRY_POINT_RENAME(uenum_count) +#define uenum_next U_ICU_ENTRY_POINT_RENAME(uenum_next) +#define uenum_nextDefault U_ICU_ENTRY_POINT_RENAME(uenum_nextDefault) +#define uenum_openCharStringsEnumeration U_ICU_ENTRY_POINT_RENAME(uenum_openCharStringsEnumeration) +#define uenum_openFromStringEnumeration U_ICU_ENTRY_POINT_RENAME(uenum_openFromStringEnumeration) +#define uenum_openUCharStringsEnumeration U_ICU_ENTRY_POINT_RENAME(uenum_openUCharStringsEnumeration) +#define uenum_reset U_ICU_ENTRY_POINT_RENAME(uenum_reset) +#define uenum_unext U_ICU_ENTRY_POINT_RENAME(uenum_unext) +#define uenum_unextDefault U_ICU_ENTRY_POINT_RENAME(uenum_unextDefault) +#define ufile_close_translit U_ICU_ENTRY_POINT_RENAME(ufile_close_translit) +#define ufile_fill_uchar_buffer U_ICU_ENTRY_POINT_RENAME(ufile_fill_uchar_buffer) +#define ufile_flush_io U_ICU_ENTRY_POINT_RENAME(ufile_flush_io) +#define ufile_flush_translit U_ICU_ENTRY_POINT_RENAME(ufile_flush_translit) +#define ufile_getch U_ICU_ENTRY_POINT_RENAME(ufile_getch) +#define ufile_getch32 U_ICU_ENTRY_POINT_RENAME(ufile_getch32) +#define ufmt_64tou U_ICU_ENTRY_POINT_RENAME(ufmt_64tou) +#define ufmt_close U_ICU_ENTRY_POINT_RENAME(ufmt_close) +#define ufmt_defaultCPToUnicode U_ICU_ENTRY_POINT_RENAME(ufmt_defaultCPToUnicode) +#define ufmt_digitvalue U_ICU_ENTRY_POINT_RENAME(ufmt_digitvalue) +#define ufmt_getArrayItemByIndex U_ICU_ENTRY_POINT_RENAME(ufmt_getArrayItemByIndex) +#define ufmt_getArrayLength U_ICU_ENTRY_POINT_RENAME(ufmt_getArrayLength) +#define ufmt_getDate U_ICU_ENTRY_POINT_RENAME(ufmt_getDate) +#define ufmt_getDecNumChars U_ICU_ENTRY_POINT_RENAME(ufmt_getDecNumChars) +#define ufmt_getDouble U_ICU_ENTRY_POINT_RENAME(ufmt_getDouble) +#define ufmt_getInt64 U_ICU_ENTRY_POINT_RENAME(ufmt_getInt64) +#define ufmt_getLong U_ICU_ENTRY_POINT_RENAME(ufmt_getLong) +#define ufmt_getObject U_ICU_ENTRY_POINT_RENAME(ufmt_getObject) +#define ufmt_getType U_ICU_ENTRY_POINT_RENAME(ufmt_getType) +#define ufmt_getUChars U_ICU_ENTRY_POINT_RENAME(ufmt_getUChars) +#define ufmt_isNumeric U_ICU_ENTRY_POINT_RENAME(ufmt_isNumeric) +#define ufmt_isdigit U_ICU_ENTRY_POINT_RENAME(ufmt_isdigit) +#define ufmt_open U_ICU_ENTRY_POINT_RENAME(ufmt_open) +#define ufmt_ptou U_ICU_ENTRY_POINT_RENAME(ufmt_ptou) +#define ufmt_uto64 U_ICU_ENTRY_POINT_RENAME(ufmt_uto64) +#define ufmt_utop U_ICU_ENTRY_POINT_RENAME(ufmt_utop) +#define ugender_getInstance U_ICU_ENTRY_POINT_RENAME(ugender_getInstance) +#define ugender_getListGender U_ICU_ENTRY_POINT_RENAME(ugender_getListGender) +#define uhash_close U_ICU_ENTRY_POINT_RENAME(uhash_close) +#define uhash_compareCaselessUnicodeString U_ICU_ENTRY_POINT_RENAME(uhash_compareCaselessUnicodeString) +#define uhash_compareChars U_ICU_ENTRY_POINT_RENAME(uhash_compareChars) +#define uhash_compareIChars U_ICU_ENTRY_POINT_RENAME(uhash_compareIChars) +#define uhash_compareLong U_ICU_ENTRY_POINT_RENAME(uhash_compareLong) +#define uhash_compareScriptSet U_ICU_ENTRY_POINT_RENAME(uhash_compareScriptSet) +#define uhash_compareUChars U_ICU_ENTRY_POINT_RENAME(uhash_compareUChars) +#define uhash_compareUnicodeString U_ICU_ENTRY_POINT_RENAME(uhash_compareUnicodeString) +#define uhash_count U_ICU_ENTRY_POINT_RENAME(uhash_count) +#define uhash_deleteHashtable U_ICU_ENTRY_POINT_RENAME(uhash_deleteHashtable) +#define uhash_deleteScriptSet U_ICU_ENTRY_POINT_RENAME(uhash_deleteScriptSet) +#define uhash_equals U_ICU_ENTRY_POINT_RENAME(uhash_equals) +#define uhash_equalsScriptSet U_ICU_ENTRY_POINT_RENAME(uhash_equalsScriptSet) +#define uhash_find U_ICU_ENTRY_POINT_RENAME(uhash_find) +#define uhash_get U_ICU_ENTRY_POINT_RENAME(uhash_get) +#define uhash_geti U_ICU_ENTRY_POINT_RENAME(uhash_geti) +#define uhash_hashCaselessUnicodeString U_ICU_ENTRY_POINT_RENAME(uhash_hashCaselessUnicodeString) +#define uhash_hashChars U_ICU_ENTRY_POINT_RENAME(uhash_hashChars) +#define uhash_hashIChars U_ICU_ENTRY_POINT_RENAME(uhash_hashIChars) +#define uhash_hashLong U_ICU_ENTRY_POINT_RENAME(uhash_hashLong) +#define uhash_hashScriptSet U_ICU_ENTRY_POINT_RENAME(uhash_hashScriptSet) +#define uhash_hashUChars U_ICU_ENTRY_POINT_RENAME(uhash_hashUChars) +#define uhash_hashUnicodeString U_ICU_ENTRY_POINT_RENAME(uhash_hashUnicodeString) +#define uhash_iget U_ICU_ENTRY_POINT_RENAME(uhash_iget) +#define uhash_igeti U_ICU_ENTRY_POINT_RENAME(uhash_igeti) +#define uhash_init U_ICU_ENTRY_POINT_RENAME(uhash_init) +#define uhash_iput U_ICU_ENTRY_POINT_RENAME(uhash_iput) +#define uhash_iputi U_ICU_ENTRY_POINT_RENAME(uhash_iputi) +#define uhash_iremove U_ICU_ENTRY_POINT_RENAME(uhash_iremove) +#define uhash_iremovei U_ICU_ENTRY_POINT_RENAME(uhash_iremovei) +#define uhash_nextElement U_ICU_ENTRY_POINT_RENAME(uhash_nextElement) +#define uhash_open U_ICU_ENTRY_POINT_RENAME(uhash_open) +#define uhash_openSize U_ICU_ENTRY_POINT_RENAME(uhash_openSize) +#define uhash_put U_ICU_ENTRY_POINT_RENAME(uhash_put) +#define uhash_puti U_ICU_ENTRY_POINT_RENAME(uhash_puti) +#define uhash_remove U_ICU_ENTRY_POINT_RENAME(uhash_remove) +#define uhash_removeAll U_ICU_ENTRY_POINT_RENAME(uhash_removeAll) +#define uhash_removeElement U_ICU_ENTRY_POINT_RENAME(uhash_removeElement) +#define uhash_removei U_ICU_ENTRY_POINT_RENAME(uhash_removei) +#define uhash_setKeyComparator U_ICU_ENTRY_POINT_RENAME(uhash_setKeyComparator) +#define uhash_setKeyDeleter U_ICU_ENTRY_POINT_RENAME(uhash_setKeyDeleter) +#define uhash_setKeyHasher U_ICU_ENTRY_POINT_RENAME(uhash_setKeyHasher) +#define uhash_setResizePolicy U_ICU_ENTRY_POINT_RENAME(uhash_setResizePolicy) +#define uhash_setValueComparator U_ICU_ENTRY_POINT_RENAME(uhash_setValueComparator) +#define uhash_setValueDeleter U_ICU_ENTRY_POINT_RENAME(uhash_setValueDeleter) +#define uidna_IDNToASCII U_ICU_ENTRY_POINT_RENAME(uidna_IDNToASCII) +#define uidna_IDNToUnicode U_ICU_ENTRY_POINT_RENAME(uidna_IDNToUnicode) +#define uidna_close U_ICU_ENTRY_POINT_RENAME(uidna_close) +#define uidna_compare U_ICU_ENTRY_POINT_RENAME(uidna_compare) +#define uidna_labelToASCII U_ICU_ENTRY_POINT_RENAME(uidna_labelToASCII) +#define uidna_labelToASCII_UTF8 U_ICU_ENTRY_POINT_RENAME(uidna_labelToASCII_UTF8) +#define uidna_labelToUnicode U_ICU_ENTRY_POINT_RENAME(uidna_labelToUnicode) +#define uidna_labelToUnicodeUTF8 U_ICU_ENTRY_POINT_RENAME(uidna_labelToUnicodeUTF8) +#define uidna_nameToASCII U_ICU_ENTRY_POINT_RENAME(uidna_nameToASCII) +#define uidna_nameToASCII_UTF8 U_ICU_ENTRY_POINT_RENAME(uidna_nameToASCII_UTF8) +#define uidna_nameToUnicode U_ICU_ENTRY_POINT_RENAME(uidna_nameToUnicode) +#define uidna_nameToUnicodeUTF8 U_ICU_ENTRY_POINT_RENAME(uidna_nameToUnicodeUTF8) +#define uidna_openUTS46 U_ICU_ENTRY_POINT_RENAME(uidna_openUTS46) +#define uidna_toASCII U_ICU_ENTRY_POINT_RENAME(uidna_toASCII) +#define uidna_toUnicode U_ICU_ENTRY_POINT_RENAME(uidna_toUnicode) +#define uiter_current32 U_ICU_ENTRY_POINT_RENAME(uiter_current32) +#define uiter_getState U_ICU_ENTRY_POINT_RENAME(uiter_getState) +#define uiter_next32 U_ICU_ENTRY_POINT_RENAME(uiter_next32) +#define uiter_previous32 U_ICU_ENTRY_POINT_RENAME(uiter_previous32) +#define uiter_setCharacterIterator U_ICU_ENTRY_POINT_RENAME(uiter_setCharacterIterator) +#define uiter_setReplaceable U_ICU_ENTRY_POINT_RENAME(uiter_setReplaceable) +#define uiter_setState U_ICU_ENTRY_POINT_RENAME(uiter_setState) +#define uiter_setString U_ICU_ENTRY_POINT_RENAME(uiter_setString) +#define uiter_setUTF16BE U_ICU_ENTRY_POINT_RENAME(uiter_setUTF16BE) +#define uiter_setUTF8 U_ICU_ENTRY_POINT_RENAME(uiter_setUTF8) +#define uldn_close U_ICU_ENTRY_POINT_RENAME(uldn_close) +#define uldn_getContext U_ICU_ENTRY_POINT_RENAME(uldn_getContext) +#define uldn_getDialectHandling U_ICU_ENTRY_POINT_RENAME(uldn_getDialectHandling) +#define uldn_getLocale U_ICU_ENTRY_POINT_RENAME(uldn_getLocale) +#define uldn_keyDisplayName U_ICU_ENTRY_POINT_RENAME(uldn_keyDisplayName) +#define uldn_keyValueDisplayName U_ICU_ENTRY_POINT_RENAME(uldn_keyValueDisplayName) +#define uldn_languageDisplayName U_ICU_ENTRY_POINT_RENAME(uldn_languageDisplayName) +#define uldn_localeDisplayName U_ICU_ENTRY_POINT_RENAME(uldn_localeDisplayName) +#define uldn_open U_ICU_ENTRY_POINT_RENAME(uldn_open) +#define uldn_openForContext U_ICU_ENTRY_POINT_RENAME(uldn_openForContext) +#define uldn_regionDisplayName U_ICU_ENTRY_POINT_RENAME(uldn_regionDisplayName) +#define uldn_scriptCodeDisplayName U_ICU_ENTRY_POINT_RENAME(uldn_scriptCodeDisplayName) +#define uldn_scriptDisplayName U_ICU_ENTRY_POINT_RENAME(uldn_scriptDisplayName) +#define uldn_variantDisplayName U_ICU_ENTRY_POINT_RENAME(uldn_variantDisplayName) +#define ulist_addItemBeginList U_ICU_ENTRY_POINT_RENAME(ulist_addItemBeginList) +#define ulist_addItemEndList U_ICU_ENTRY_POINT_RENAME(ulist_addItemEndList) +#define ulist_close_keyword_values_iterator U_ICU_ENTRY_POINT_RENAME(ulist_close_keyword_values_iterator) +#define ulist_containsString U_ICU_ENTRY_POINT_RENAME(ulist_containsString) +#define ulist_count_keyword_values U_ICU_ENTRY_POINT_RENAME(ulist_count_keyword_values) +#define ulist_createEmptyList U_ICU_ENTRY_POINT_RENAME(ulist_createEmptyList) +#define ulist_deleteList U_ICU_ENTRY_POINT_RENAME(ulist_deleteList) +#define ulist_getListFromEnum U_ICU_ENTRY_POINT_RENAME(ulist_getListFromEnum) +#define ulist_getListSize U_ICU_ENTRY_POINT_RENAME(ulist_getListSize) +#define ulist_getNext U_ICU_ENTRY_POINT_RENAME(ulist_getNext) +#define ulist_next_keyword_value U_ICU_ENTRY_POINT_RENAME(ulist_next_keyword_value) +#define ulist_resetList U_ICU_ENTRY_POINT_RENAME(ulist_resetList) +#define ulist_reset_keyword_values_iterator U_ICU_ENTRY_POINT_RENAME(ulist_reset_keyword_values_iterator) +#define uloc_acceptLanguage U_ICU_ENTRY_POINT_RENAME(uloc_acceptLanguage) +#define uloc_acceptLanguageFromHTTP U_ICU_ENTRY_POINT_RENAME(uloc_acceptLanguageFromHTTP) +#define uloc_addLikelySubtags U_ICU_ENTRY_POINT_RENAME(uloc_addLikelySubtags) +#define uloc_canonicalize U_ICU_ENTRY_POINT_RENAME(uloc_canonicalize) +#define uloc_countAvailable U_ICU_ENTRY_POINT_RENAME(uloc_countAvailable) +#define uloc_forLanguageTag U_ICU_ENTRY_POINT_RENAME(uloc_forLanguageTag) +#define uloc_getAvailable U_ICU_ENTRY_POINT_RENAME(uloc_getAvailable) +#define uloc_getBaseName U_ICU_ENTRY_POINT_RENAME(uloc_getBaseName) +#define uloc_getCharacterOrientation U_ICU_ENTRY_POINT_RENAME(uloc_getCharacterOrientation) +#define uloc_getCountry U_ICU_ENTRY_POINT_RENAME(uloc_getCountry) +#define uloc_getCurrentCountryID U_ICU_ENTRY_POINT_RENAME(uloc_getCurrentCountryID) +#define uloc_getCurrentLanguageID U_ICU_ENTRY_POINT_RENAME(uloc_getCurrentLanguageID) +#define uloc_getDefault U_ICU_ENTRY_POINT_RENAME(uloc_getDefault) +#define uloc_getDisplayCountry U_ICU_ENTRY_POINT_RENAME(uloc_getDisplayCountry) +#define uloc_getDisplayKeyword U_ICU_ENTRY_POINT_RENAME(uloc_getDisplayKeyword) +#define uloc_getDisplayKeywordValue U_ICU_ENTRY_POINT_RENAME(uloc_getDisplayKeywordValue) +#define uloc_getDisplayLanguage U_ICU_ENTRY_POINT_RENAME(uloc_getDisplayLanguage) +#define uloc_getDisplayName U_ICU_ENTRY_POINT_RENAME(uloc_getDisplayName) +#define uloc_getDisplayScript U_ICU_ENTRY_POINT_RENAME(uloc_getDisplayScript) +#define uloc_getDisplayScriptInContext U_ICU_ENTRY_POINT_RENAME(uloc_getDisplayScriptInContext) +#define uloc_getDisplayVariant U_ICU_ENTRY_POINT_RENAME(uloc_getDisplayVariant) +#define uloc_getISO3Country U_ICU_ENTRY_POINT_RENAME(uloc_getISO3Country) +#define uloc_getISO3Language U_ICU_ENTRY_POINT_RENAME(uloc_getISO3Language) +#define uloc_getISOCountries U_ICU_ENTRY_POINT_RENAME(uloc_getISOCountries) +#define uloc_getISOLanguages U_ICU_ENTRY_POINT_RENAME(uloc_getISOLanguages) +#define uloc_getKeywordValue U_ICU_ENTRY_POINT_RENAME(uloc_getKeywordValue) +#define uloc_getLCID U_ICU_ENTRY_POINT_RENAME(uloc_getLCID) +#define uloc_getLanguage U_ICU_ENTRY_POINT_RENAME(uloc_getLanguage) +#define uloc_getLineOrientation U_ICU_ENTRY_POINT_RENAME(uloc_getLineOrientation) +#define uloc_getLocaleForLCID U_ICU_ENTRY_POINT_RENAME(uloc_getLocaleForLCID) +#define uloc_getName U_ICU_ENTRY_POINT_RENAME(uloc_getName) +#define uloc_getParent U_ICU_ENTRY_POINT_RENAME(uloc_getParent) +#define uloc_getScript U_ICU_ENTRY_POINT_RENAME(uloc_getScript) +#define uloc_getTableStringWithFallback U_ICU_ENTRY_POINT_RENAME(uloc_getTableStringWithFallback) +#define uloc_getVariant U_ICU_ENTRY_POINT_RENAME(uloc_getVariant) +#define uloc_isRightToLeft U_ICU_ENTRY_POINT_RENAME(uloc_isRightToLeft) +#define uloc_minimizeSubtags U_ICU_ENTRY_POINT_RENAME(uloc_minimizeSubtags) +#define uloc_openKeywordList U_ICU_ENTRY_POINT_RENAME(uloc_openKeywordList) +#define uloc_openKeywords U_ICU_ENTRY_POINT_RENAME(uloc_openKeywords) +#define uloc_setDefault U_ICU_ENTRY_POINT_RENAME(uloc_setDefault) +#define uloc_setKeywordValue U_ICU_ENTRY_POINT_RENAME(uloc_setKeywordValue) +#define uloc_toLanguageTag U_ICU_ENTRY_POINT_RENAME(uloc_toLanguageTag) +#define uloc_toLegacyKey U_ICU_ENTRY_POINT_RENAME(uloc_toLegacyKey) +#define uloc_toLegacyType U_ICU_ENTRY_POINT_RENAME(uloc_toLegacyType) +#define uloc_toUnicodeLocaleKey U_ICU_ENTRY_POINT_RENAME(uloc_toUnicodeLocaleKey) +#define uloc_toUnicodeLocaleType U_ICU_ENTRY_POINT_RENAME(uloc_toUnicodeLocaleType) +#define ulocdata_close U_ICU_ENTRY_POINT_RENAME(ulocdata_close) +#define ulocdata_getCLDRVersion U_ICU_ENTRY_POINT_RENAME(ulocdata_getCLDRVersion) +#define ulocdata_getDelimiter U_ICU_ENTRY_POINT_RENAME(ulocdata_getDelimiter) +#define ulocdata_getExemplarSet U_ICU_ENTRY_POINT_RENAME(ulocdata_getExemplarSet) +#define ulocdata_getLocaleDisplayPattern U_ICU_ENTRY_POINT_RENAME(ulocdata_getLocaleDisplayPattern) +#define ulocdata_getLocaleSeparator U_ICU_ENTRY_POINT_RENAME(ulocdata_getLocaleSeparator) +#define ulocdata_getMeasurementSystem U_ICU_ENTRY_POINT_RENAME(ulocdata_getMeasurementSystem) +#define ulocdata_getNoSubstitute U_ICU_ENTRY_POINT_RENAME(ulocdata_getNoSubstitute) +#define ulocdata_getPaperSize U_ICU_ENTRY_POINT_RENAME(ulocdata_getPaperSize) +#define ulocdata_open U_ICU_ENTRY_POINT_RENAME(ulocdata_open) +#define ulocdata_setNoSubstitute U_ICU_ENTRY_POINT_RENAME(ulocdata_setNoSubstitute) +#define ulocimp_getCountry U_ICU_ENTRY_POINT_RENAME(ulocimp_getCountry) +#define ulocimp_getLanguage U_ICU_ENTRY_POINT_RENAME(ulocimp_getLanguage) +#define ulocimp_getScript U_ICU_ENTRY_POINT_RENAME(ulocimp_getScript) +#define ulocimp_toBcpKey U_ICU_ENTRY_POINT_RENAME(ulocimp_toBcpKey) +#define ulocimp_toBcpType U_ICU_ENTRY_POINT_RENAME(ulocimp_toBcpType) +#define ulocimp_toLegacyKey U_ICU_ENTRY_POINT_RENAME(ulocimp_toLegacyKey) +#define ulocimp_toLegacyType U_ICU_ENTRY_POINT_RENAME(ulocimp_toLegacyType) +#define ultag_isUnicodeLocaleKey U_ICU_ENTRY_POINT_RENAME(ultag_isUnicodeLocaleKey) +#define ultag_isUnicodeLocaleType U_ICU_ENTRY_POINT_RENAME(ultag_isUnicodeLocaleType) +#define umsg_applyPattern U_ICU_ENTRY_POINT_RENAME(umsg_applyPattern) +#define umsg_autoQuoteApostrophe U_ICU_ENTRY_POINT_RENAME(umsg_autoQuoteApostrophe) +#define umsg_clone U_ICU_ENTRY_POINT_RENAME(umsg_clone) +#define umsg_close U_ICU_ENTRY_POINT_RENAME(umsg_close) +#define umsg_format U_ICU_ENTRY_POINT_RENAME(umsg_format) +#define umsg_getLocale U_ICU_ENTRY_POINT_RENAME(umsg_getLocale) +#define umsg_open U_ICU_ENTRY_POINT_RENAME(umsg_open) +#define umsg_parse U_ICU_ENTRY_POINT_RENAME(umsg_parse) +#define umsg_setLocale U_ICU_ENTRY_POINT_RENAME(umsg_setLocale) +#define umsg_toPattern U_ICU_ENTRY_POINT_RENAME(umsg_toPattern) +#define umsg_vformat U_ICU_ENTRY_POINT_RENAME(umsg_vformat) +#define umsg_vparse U_ICU_ENTRY_POINT_RENAME(umsg_vparse) +#define umtx_condBroadcast U_ICU_ENTRY_POINT_RENAME(umtx_condBroadcast) +#define umtx_condSignal U_ICU_ENTRY_POINT_RENAME(umtx_condSignal) +#define umtx_condWait U_ICU_ENTRY_POINT_RENAME(umtx_condWait) +#define umtx_lock U_ICU_ENTRY_POINT_RENAME(umtx_lock) +#define umtx_unlock U_ICU_ENTRY_POINT_RENAME(umtx_unlock) +#define uniset_getUnicode32Instance U_ICU_ENTRY_POINT_RENAME(uniset_getUnicode32Instance) +#define unorm2_append U_ICU_ENTRY_POINT_RENAME(unorm2_append) +#define unorm2_close U_ICU_ENTRY_POINT_RENAME(unorm2_close) +#define unorm2_composePair U_ICU_ENTRY_POINT_RENAME(unorm2_composePair) +#define unorm2_getCombiningClass U_ICU_ENTRY_POINT_RENAME(unorm2_getCombiningClass) +#define unorm2_getDecomposition U_ICU_ENTRY_POINT_RENAME(unorm2_getDecomposition) +#define unorm2_getInstance U_ICU_ENTRY_POINT_RENAME(unorm2_getInstance) +#define unorm2_getNFCInstance U_ICU_ENTRY_POINT_RENAME(unorm2_getNFCInstance) +#define unorm2_getNFDInstance U_ICU_ENTRY_POINT_RENAME(unorm2_getNFDInstance) +#define unorm2_getNFKCCasefoldInstance U_ICU_ENTRY_POINT_RENAME(unorm2_getNFKCCasefoldInstance) +#define unorm2_getNFKCInstance U_ICU_ENTRY_POINT_RENAME(unorm2_getNFKCInstance) +#define unorm2_getNFKDInstance U_ICU_ENTRY_POINT_RENAME(unorm2_getNFKDInstance) +#define unorm2_getRawDecomposition U_ICU_ENTRY_POINT_RENAME(unorm2_getRawDecomposition) +#define unorm2_hasBoundaryAfter U_ICU_ENTRY_POINT_RENAME(unorm2_hasBoundaryAfter) +#define unorm2_hasBoundaryBefore U_ICU_ENTRY_POINT_RENAME(unorm2_hasBoundaryBefore) +#define unorm2_isInert U_ICU_ENTRY_POINT_RENAME(unorm2_isInert) +#define unorm2_isNormalized U_ICU_ENTRY_POINT_RENAME(unorm2_isNormalized) +#define unorm2_normalize U_ICU_ENTRY_POINT_RENAME(unorm2_normalize) +#define unorm2_normalizeSecondAndAppend U_ICU_ENTRY_POINT_RENAME(unorm2_normalizeSecondAndAppend) +#define unorm2_openFiltered U_ICU_ENTRY_POINT_RENAME(unorm2_openFiltered) +#define unorm2_quickCheck U_ICU_ENTRY_POINT_RENAME(unorm2_quickCheck) +#define unorm2_spanQuickCheckYes U_ICU_ENTRY_POINT_RENAME(unorm2_spanQuickCheckYes) +#define unorm2_swap U_ICU_ENTRY_POINT_RENAME(unorm2_swap) +#define unorm_compare U_ICU_ENTRY_POINT_RENAME(unorm_compare) +#define unorm_concatenate U_ICU_ENTRY_POINT_RENAME(unorm_concatenate) +#define unorm_getFCD16 U_ICU_ENTRY_POINT_RENAME(unorm_getFCD16) +#define unorm_getQuickCheck U_ICU_ENTRY_POINT_RENAME(unorm_getQuickCheck) +#define unorm_isNormalized U_ICU_ENTRY_POINT_RENAME(unorm_isNormalized) +#define unorm_isNormalizedWithOptions U_ICU_ENTRY_POINT_RENAME(unorm_isNormalizedWithOptions) +#define unorm_next U_ICU_ENTRY_POINT_RENAME(unorm_next) +#define unorm_normalize U_ICU_ENTRY_POINT_RENAME(unorm_normalize) +#define unorm_previous U_ICU_ENTRY_POINT_RENAME(unorm_previous) +#define unorm_quickCheck U_ICU_ENTRY_POINT_RENAME(unorm_quickCheck) +#define unorm_quickCheckWithOptions U_ICU_ENTRY_POINT_RENAME(unorm_quickCheckWithOptions) +#define unum_applyPattern U_ICU_ENTRY_POINT_RENAME(unum_applyPattern) +#define unum_clone U_ICU_ENTRY_POINT_RENAME(unum_clone) +#define unum_close U_ICU_ENTRY_POINT_RENAME(unum_close) +#define unum_countAvailable U_ICU_ENTRY_POINT_RENAME(unum_countAvailable) +#define unum_format U_ICU_ENTRY_POINT_RENAME(unum_format) +#define unum_formatDecimal U_ICU_ENTRY_POINT_RENAME(unum_formatDecimal) +#define unum_formatDouble U_ICU_ENTRY_POINT_RENAME(unum_formatDouble) +#define unum_formatDoubleCurrency U_ICU_ENTRY_POINT_RENAME(unum_formatDoubleCurrency) +#define unum_formatInt64 U_ICU_ENTRY_POINT_RENAME(unum_formatInt64) +#define unum_formatUFormattable U_ICU_ENTRY_POINT_RENAME(unum_formatUFormattable) +#define unum_getAttribute U_ICU_ENTRY_POINT_RENAME(unum_getAttribute) +#define unum_getAvailable U_ICU_ENTRY_POINT_RENAME(unum_getAvailable) +#define unum_getContext U_ICU_ENTRY_POINT_RENAME(unum_getContext) +#define unum_getDoubleAttribute U_ICU_ENTRY_POINT_RENAME(unum_getDoubleAttribute) +#define unum_getLocaleByType U_ICU_ENTRY_POINT_RENAME(unum_getLocaleByType) +#define unum_getSymbol U_ICU_ENTRY_POINT_RENAME(unum_getSymbol) +#define unum_getTextAttribute U_ICU_ENTRY_POINT_RENAME(unum_getTextAttribute) +#define unum_open U_ICU_ENTRY_POINT_RENAME(unum_open) +#define unum_parse U_ICU_ENTRY_POINT_RENAME(unum_parse) +#define unum_parseDecimal U_ICU_ENTRY_POINT_RENAME(unum_parseDecimal) +#define unum_parseDouble U_ICU_ENTRY_POINT_RENAME(unum_parseDouble) +#define unum_parseDoubleCurrency U_ICU_ENTRY_POINT_RENAME(unum_parseDoubleCurrency) +#define unum_parseInt64 U_ICU_ENTRY_POINT_RENAME(unum_parseInt64) +#define unum_parseToUFormattable U_ICU_ENTRY_POINT_RENAME(unum_parseToUFormattable) +#define unum_setAttribute U_ICU_ENTRY_POINT_RENAME(unum_setAttribute) +#define unum_setContext U_ICU_ENTRY_POINT_RENAME(unum_setContext) +#define unum_setDoubleAttribute U_ICU_ENTRY_POINT_RENAME(unum_setDoubleAttribute) +#define unum_setSymbol U_ICU_ENTRY_POINT_RENAME(unum_setSymbol) +#define unum_setTextAttribute U_ICU_ENTRY_POINT_RENAME(unum_setTextAttribute) +#define unum_toPattern U_ICU_ENTRY_POINT_RENAME(unum_toPattern) +#define unumsys_close U_ICU_ENTRY_POINT_RENAME(unumsys_close) +#define unumsys_getDescription U_ICU_ENTRY_POINT_RENAME(unumsys_getDescription) +#define unumsys_getName U_ICU_ENTRY_POINT_RENAME(unumsys_getName) +#define unumsys_getRadix U_ICU_ENTRY_POINT_RENAME(unumsys_getRadix) +#define unumsys_isAlgorithmic U_ICU_ENTRY_POINT_RENAME(unumsys_isAlgorithmic) +#define unumsys_open U_ICU_ENTRY_POINT_RENAME(unumsys_open) +#define unumsys_openAvailableNames U_ICU_ENTRY_POINT_RENAME(unumsys_openAvailableNames) +#define unumsys_openByName U_ICU_ENTRY_POINT_RENAME(unumsys_openByName) +#define uplrules_close U_ICU_ENTRY_POINT_RENAME(uplrules_close) +#define uplrules_open U_ICU_ENTRY_POINT_RENAME(uplrules_open) +#define uplrules_openForType U_ICU_ENTRY_POINT_RENAME(uplrules_openForType) +#define uplrules_select U_ICU_ENTRY_POINT_RENAME(uplrules_select) +#define uplug_closeLibrary U_ICU_ENTRY_POINT_RENAME(uplug_closeLibrary) +#define uplug_findLibrary U_ICU_ENTRY_POINT_RENAME(uplug_findLibrary) +#define uplug_getConfiguration U_ICU_ENTRY_POINT_RENAME(uplug_getConfiguration) +#define uplug_getContext U_ICU_ENTRY_POINT_RENAME(uplug_getContext) +#define uplug_getCurrentLevel U_ICU_ENTRY_POINT_RENAME(uplug_getCurrentLevel) +#define uplug_getLibrary U_ICU_ENTRY_POINT_RENAME(uplug_getLibrary) +#define uplug_getLibraryName U_ICU_ENTRY_POINT_RENAME(uplug_getLibraryName) +#define uplug_getPlugInternal U_ICU_ENTRY_POINT_RENAME(uplug_getPlugInternal) +#define uplug_getPlugLevel U_ICU_ENTRY_POINT_RENAME(uplug_getPlugLevel) +#define uplug_getPlugLoadStatus U_ICU_ENTRY_POINT_RENAME(uplug_getPlugLoadStatus) +#define uplug_getPlugName U_ICU_ENTRY_POINT_RENAME(uplug_getPlugName) +#define uplug_getPluginFile U_ICU_ENTRY_POINT_RENAME(uplug_getPluginFile) +#define uplug_getSymbolName U_ICU_ENTRY_POINT_RENAME(uplug_getSymbolName) +#define uplug_init U_ICU_ENTRY_POINT_RENAME(uplug_init) +#define uplug_loadPlugFromEntrypoint U_ICU_ENTRY_POINT_RENAME(uplug_loadPlugFromEntrypoint) +#define uplug_loadPlugFromLibrary U_ICU_ENTRY_POINT_RENAME(uplug_loadPlugFromLibrary) +#define uplug_nextPlug U_ICU_ENTRY_POINT_RENAME(uplug_nextPlug) +#define uplug_openLibrary U_ICU_ENTRY_POINT_RENAME(uplug_openLibrary) +#define uplug_removePlug U_ICU_ENTRY_POINT_RENAME(uplug_removePlug) +#define uplug_setContext U_ICU_ENTRY_POINT_RENAME(uplug_setContext) +#define uplug_setPlugLevel U_ICU_ENTRY_POINT_RENAME(uplug_setPlugLevel) +#define uplug_setPlugName U_ICU_ENTRY_POINT_RENAME(uplug_setPlugName) +#define uplug_setPlugNoUnload U_ICU_ENTRY_POINT_RENAME(uplug_setPlugNoUnload) +#define uprops_getSource U_ICU_ENTRY_POINT_RENAME(uprops_getSource) +#define upropsvec_addPropertyStarts U_ICU_ENTRY_POINT_RENAME(upropsvec_addPropertyStarts) +#define uprv_aestrncpy U_ICU_ENTRY_POINT_RENAME(uprv_aestrncpy) +#define uprv_asciiFromEbcdic U_ICU_ENTRY_POINT_RENAME(uprv_asciiFromEbcdic) +#define uprv_asciitolower U_ICU_ENTRY_POINT_RENAME(uprv_asciitolower) +#define uprv_calloc U_ICU_ENTRY_POINT_RENAME(uprv_calloc) +#define uprv_ceil U_ICU_ENTRY_POINT_RENAME(uprv_ceil) +#define uprv_compareASCIIPropertyNames U_ICU_ENTRY_POINT_RENAME(uprv_compareASCIIPropertyNames) +#define uprv_compareEBCDICPropertyNames U_ICU_ENTRY_POINT_RENAME(uprv_compareEBCDICPropertyNames) +#define uprv_compareInvAscii U_ICU_ENTRY_POINT_RENAME(uprv_compareInvAscii) +#define uprv_compareInvEbcdic U_ICU_ENTRY_POINT_RENAME(uprv_compareInvEbcdic) +#define uprv_compareInvEbcdicAsAscii U_ICU_ENTRY_POINT_RENAME(uprv_compareInvEbcdicAsAscii) +#define uprv_convertToLCID U_ICU_ENTRY_POINT_RENAME(uprv_convertToLCID) +#define uprv_convertToPosix U_ICU_ENTRY_POINT_RENAME(uprv_convertToPosix) +#define uprv_copyAscii U_ICU_ENTRY_POINT_RENAME(uprv_copyAscii) +#define uprv_copyEbcdic U_ICU_ENTRY_POINT_RENAME(uprv_copyEbcdic) +#define uprv_decContextClearStatus U_ICU_ENTRY_POINT_RENAME(uprv_decContextClearStatus) +#define uprv_decContextDefault U_ICU_ENTRY_POINT_RENAME(uprv_decContextDefault) +#define uprv_decContextGetRounding U_ICU_ENTRY_POINT_RENAME(uprv_decContextGetRounding) +#define uprv_decContextGetStatus U_ICU_ENTRY_POINT_RENAME(uprv_decContextGetStatus) +#define uprv_decContextRestoreStatus U_ICU_ENTRY_POINT_RENAME(uprv_decContextRestoreStatus) +#define uprv_decContextSaveStatus U_ICU_ENTRY_POINT_RENAME(uprv_decContextSaveStatus) +#define uprv_decContextSetRounding U_ICU_ENTRY_POINT_RENAME(uprv_decContextSetRounding) +#define uprv_decContextSetStatus U_ICU_ENTRY_POINT_RENAME(uprv_decContextSetStatus) +#define uprv_decContextSetStatusFromString U_ICU_ENTRY_POINT_RENAME(uprv_decContextSetStatusFromString) +#define uprv_decContextSetStatusFromStringQuiet U_ICU_ENTRY_POINT_RENAME(uprv_decContextSetStatusFromStringQuiet) +#define uprv_decContextSetStatusQuiet U_ICU_ENTRY_POINT_RENAME(uprv_decContextSetStatusQuiet) +#define uprv_decContextStatusToString U_ICU_ENTRY_POINT_RENAME(uprv_decContextStatusToString) +#define uprv_decContextTestSavedStatus U_ICU_ENTRY_POINT_RENAME(uprv_decContextTestSavedStatus) +#define uprv_decContextTestStatus U_ICU_ENTRY_POINT_RENAME(uprv_decContextTestStatus) +#define uprv_decContextZeroStatus U_ICU_ENTRY_POINT_RENAME(uprv_decContextZeroStatus) +#define uprv_decNumberAbs U_ICU_ENTRY_POINT_RENAME(uprv_decNumberAbs) +#define uprv_decNumberAdd U_ICU_ENTRY_POINT_RENAME(uprv_decNumberAdd) +#define uprv_decNumberAnd U_ICU_ENTRY_POINT_RENAME(uprv_decNumberAnd) +#define uprv_decNumberClass U_ICU_ENTRY_POINT_RENAME(uprv_decNumberClass) +#define uprv_decNumberClassToString U_ICU_ENTRY_POINT_RENAME(uprv_decNumberClassToString) +#define uprv_decNumberCompare U_ICU_ENTRY_POINT_RENAME(uprv_decNumberCompare) +#define uprv_decNumberCompareSignal U_ICU_ENTRY_POINT_RENAME(uprv_decNumberCompareSignal) +#define uprv_decNumberCompareTotal U_ICU_ENTRY_POINT_RENAME(uprv_decNumberCompareTotal) +#define uprv_decNumberCompareTotalMag U_ICU_ENTRY_POINT_RENAME(uprv_decNumberCompareTotalMag) +#define uprv_decNumberCopy U_ICU_ENTRY_POINT_RENAME(uprv_decNumberCopy) +#define uprv_decNumberCopyAbs U_ICU_ENTRY_POINT_RENAME(uprv_decNumberCopyAbs) +#define uprv_decNumberCopyNegate U_ICU_ENTRY_POINT_RENAME(uprv_decNumberCopyNegate) +#define uprv_decNumberCopySign U_ICU_ENTRY_POINT_RENAME(uprv_decNumberCopySign) +#define uprv_decNumberDivide U_ICU_ENTRY_POINT_RENAME(uprv_decNumberDivide) +#define uprv_decNumberDivideInteger U_ICU_ENTRY_POINT_RENAME(uprv_decNumberDivideInteger) +#define uprv_decNumberExp U_ICU_ENTRY_POINT_RENAME(uprv_decNumberExp) +#define uprv_decNumberFMA U_ICU_ENTRY_POINT_RENAME(uprv_decNumberFMA) +#define uprv_decNumberFromInt32 U_ICU_ENTRY_POINT_RENAME(uprv_decNumberFromInt32) +#define uprv_decNumberFromString U_ICU_ENTRY_POINT_RENAME(uprv_decNumberFromString) +#define uprv_decNumberFromUInt32 U_ICU_ENTRY_POINT_RENAME(uprv_decNumberFromUInt32) +#define uprv_decNumberGetBCD U_ICU_ENTRY_POINT_RENAME(uprv_decNumberGetBCD) +#define uprv_decNumberInvert U_ICU_ENTRY_POINT_RENAME(uprv_decNumberInvert) +#define uprv_decNumberIsNormal U_ICU_ENTRY_POINT_RENAME(uprv_decNumberIsNormal) +#define uprv_decNumberIsSubnormal U_ICU_ENTRY_POINT_RENAME(uprv_decNumberIsSubnormal) +#define uprv_decNumberLn U_ICU_ENTRY_POINT_RENAME(uprv_decNumberLn) +#define uprv_decNumberLog10 U_ICU_ENTRY_POINT_RENAME(uprv_decNumberLog10) +#define uprv_decNumberLogB U_ICU_ENTRY_POINT_RENAME(uprv_decNumberLogB) +#define uprv_decNumberMax U_ICU_ENTRY_POINT_RENAME(uprv_decNumberMax) +#define uprv_decNumberMaxMag U_ICU_ENTRY_POINT_RENAME(uprv_decNumberMaxMag) +#define uprv_decNumberMin U_ICU_ENTRY_POINT_RENAME(uprv_decNumberMin) +#define uprv_decNumberMinMag U_ICU_ENTRY_POINT_RENAME(uprv_decNumberMinMag) +#define uprv_decNumberMinus U_ICU_ENTRY_POINT_RENAME(uprv_decNumberMinus) +#define uprv_decNumberMultiply U_ICU_ENTRY_POINT_RENAME(uprv_decNumberMultiply) +#define uprv_decNumberNextMinus U_ICU_ENTRY_POINT_RENAME(uprv_decNumberNextMinus) +#define uprv_decNumberNextPlus U_ICU_ENTRY_POINT_RENAME(uprv_decNumberNextPlus) +#define uprv_decNumberNextToward U_ICU_ENTRY_POINT_RENAME(uprv_decNumberNextToward) +#define uprv_decNumberNormalize U_ICU_ENTRY_POINT_RENAME(uprv_decNumberNormalize) +#define uprv_decNumberOr U_ICU_ENTRY_POINT_RENAME(uprv_decNumberOr) +#define uprv_decNumberPlus U_ICU_ENTRY_POINT_RENAME(uprv_decNumberPlus) +#define uprv_decNumberPower U_ICU_ENTRY_POINT_RENAME(uprv_decNumberPower) +#define uprv_decNumberQuantize U_ICU_ENTRY_POINT_RENAME(uprv_decNumberQuantize) +#define uprv_decNumberReduce U_ICU_ENTRY_POINT_RENAME(uprv_decNumberReduce) +#define uprv_decNumberRemainder U_ICU_ENTRY_POINT_RENAME(uprv_decNumberRemainder) +#define uprv_decNumberRemainderNear U_ICU_ENTRY_POINT_RENAME(uprv_decNumberRemainderNear) +#define uprv_decNumberRescale U_ICU_ENTRY_POINT_RENAME(uprv_decNumberRescale) +#define uprv_decNumberRotate U_ICU_ENTRY_POINT_RENAME(uprv_decNumberRotate) +#define uprv_decNumberSameQuantum U_ICU_ENTRY_POINT_RENAME(uprv_decNumberSameQuantum) +#define uprv_decNumberScaleB U_ICU_ENTRY_POINT_RENAME(uprv_decNumberScaleB) +#define uprv_decNumberSetBCD U_ICU_ENTRY_POINT_RENAME(uprv_decNumberSetBCD) +#define uprv_decNumberShift U_ICU_ENTRY_POINT_RENAME(uprv_decNumberShift) +#define uprv_decNumberSquareRoot U_ICU_ENTRY_POINT_RENAME(uprv_decNumberSquareRoot) +#define uprv_decNumberSubtract U_ICU_ENTRY_POINT_RENAME(uprv_decNumberSubtract) +#define uprv_decNumberToEngString U_ICU_ENTRY_POINT_RENAME(uprv_decNumberToEngString) +#define uprv_decNumberToInt32 U_ICU_ENTRY_POINT_RENAME(uprv_decNumberToInt32) +#define uprv_decNumberToIntegralExact U_ICU_ENTRY_POINT_RENAME(uprv_decNumberToIntegralExact) +#define uprv_decNumberToIntegralValue U_ICU_ENTRY_POINT_RENAME(uprv_decNumberToIntegralValue) +#define uprv_decNumberToString U_ICU_ENTRY_POINT_RENAME(uprv_decNumberToString) +#define uprv_decNumberToUInt32 U_ICU_ENTRY_POINT_RENAME(uprv_decNumberToUInt32) +#define uprv_decNumberTrim U_ICU_ENTRY_POINT_RENAME(uprv_decNumberTrim) +#define uprv_decNumberVersion U_ICU_ENTRY_POINT_RENAME(uprv_decNumberVersion) +#define uprv_decNumberXor U_ICU_ENTRY_POINT_RENAME(uprv_decNumberXor) +#define uprv_decNumberZero U_ICU_ENTRY_POINT_RENAME(uprv_decNumberZero) +#define uprv_deleteConditionalCE32 U_ICU_ENTRY_POINT_RENAME(uprv_deleteConditionalCE32) +#define uprv_deleteUObject U_ICU_ENTRY_POINT_RENAME(uprv_deleteUObject) +#define uprv_dl_close U_ICU_ENTRY_POINT_RENAME(uprv_dl_close) +#define uprv_dl_open U_ICU_ENTRY_POINT_RENAME(uprv_dl_open) +#define uprv_dlsym_func U_ICU_ENTRY_POINT_RENAME(uprv_dlsym_func) +#define uprv_eastrncpy U_ICU_ENTRY_POINT_RENAME(uprv_eastrncpy) +#define uprv_ebcdicFromAscii U_ICU_ENTRY_POINT_RENAME(uprv_ebcdicFromAscii) +#define uprv_ebcdicToLowercaseAscii U_ICU_ENTRY_POINT_RENAME(uprv_ebcdicToLowercaseAscii) +#define uprv_ebcdictolower U_ICU_ENTRY_POINT_RENAME(uprv_ebcdictolower) +#define uprv_fabs U_ICU_ENTRY_POINT_RENAME(uprv_fabs) +#define uprv_floor U_ICU_ENTRY_POINT_RENAME(uprv_floor) +#define uprv_fmax U_ICU_ENTRY_POINT_RENAME(uprv_fmax) +#define uprv_fmin U_ICU_ENTRY_POINT_RENAME(uprv_fmin) +#define uprv_fmod U_ICU_ENTRY_POINT_RENAME(uprv_fmod) +#define uprv_free U_ICU_ENTRY_POINT_RENAME(uprv_free) +#define uprv_getCharNameCharacters U_ICU_ENTRY_POINT_RENAME(uprv_getCharNameCharacters) +#define uprv_getDefaultCodepage U_ICU_ENTRY_POINT_RENAME(uprv_getDefaultCodepage) +#define uprv_getDefaultLocaleID U_ICU_ENTRY_POINT_RENAME(uprv_getDefaultLocaleID) +#define uprv_getInfinity U_ICU_ENTRY_POINT_RENAME(uprv_getInfinity) +#define uprv_getMaxCharNameLength U_ICU_ENTRY_POINT_RENAME(uprv_getMaxCharNameLength) +#define uprv_getMaxValues U_ICU_ENTRY_POINT_RENAME(uprv_getMaxValues) +#define uprv_getNaN U_ICU_ENTRY_POINT_RENAME(uprv_getNaN) +#define uprv_getRawUTCtime U_ICU_ENTRY_POINT_RENAME(uprv_getRawUTCtime) +#define uprv_getStaticCurrencyName U_ICU_ENTRY_POINT_RENAME(uprv_getStaticCurrencyName) +#define uprv_getUTCtime U_ICU_ENTRY_POINT_RENAME(uprv_getUTCtime) +#define uprv_haveProperties U_ICU_ENTRY_POINT_RENAME(uprv_haveProperties) +#define uprv_int32Comparator U_ICU_ENTRY_POINT_RENAME(uprv_int32Comparator) +#define uprv_isASCIILetter U_ICU_ENTRY_POINT_RENAME(uprv_isASCIILetter) +#define uprv_isInfinite U_ICU_ENTRY_POINT_RENAME(uprv_isInfinite) +#define uprv_isInvariantString U_ICU_ENTRY_POINT_RENAME(uprv_isInvariantString) +#define uprv_isInvariantUString U_ICU_ENTRY_POINT_RENAME(uprv_isInvariantUString) +#define uprv_isNaN U_ICU_ENTRY_POINT_RENAME(uprv_isNaN) +#define uprv_isNegativeInfinity U_ICU_ENTRY_POINT_RENAME(uprv_isNegativeInfinity) +#define uprv_isPositiveInfinity U_ICU_ENTRY_POINT_RENAME(uprv_isPositiveInfinity) +#define uprv_itou U_ICU_ENTRY_POINT_RENAME(uprv_itou) +#define uprv_log U_ICU_ENTRY_POINT_RENAME(uprv_log) +#define uprv_malloc U_ICU_ENTRY_POINT_RENAME(uprv_malloc) +#define uprv_mapFile U_ICU_ENTRY_POINT_RENAME(uprv_mapFile) +#define uprv_max U_ICU_ENTRY_POINT_RENAME(uprv_max) +#define uprv_maxMantissa U_ICU_ENTRY_POINT_RENAME(uprv_maxMantissa) +#define uprv_maximumPtr U_ICU_ENTRY_POINT_RENAME(uprv_maximumPtr) +#define uprv_min U_ICU_ENTRY_POINT_RENAME(uprv_min) +#define uprv_modf U_ICU_ENTRY_POINT_RENAME(uprv_modf) +#define uprv_parseCurrency U_ICU_ENTRY_POINT_RENAME(uprv_parseCurrency) +#define uprv_pathIsAbsolute U_ICU_ENTRY_POINT_RENAME(uprv_pathIsAbsolute) +#define uprv_pow U_ICU_ENTRY_POINT_RENAME(uprv_pow) +#define uprv_pow10 U_ICU_ENTRY_POINT_RENAME(uprv_pow10) +#define uprv_realloc U_ICU_ENTRY_POINT_RENAME(uprv_realloc) +#define uprv_round U_ICU_ENTRY_POINT_RENAME(uprv_round) +#define uprv_sortArray U_ICU_ENTRY_POINT_RENAME(uprv_sortArray) +#define uprv_stableBinarySearch U_ICU_ENTRY_POINT_RENAME(uprv_stableBinarySearch) +#define uprv_strCompare U_ICU_ENTRY_POINT_RENAME(uprv_strCompare) +#define uprv_strdup U_ICU_ENTRY_POINT_RENAME(uprv_strdup) +#define uprv_stricmp U_ICU_ENTRY_POINT_RENAME(uprv_stricmp) +#define uprv_strndup U_ICU_ENTRY_POINT_RENAME(uprv_strndup) +#define uprv_strnicmp U_ICU_ENTRY_POINT_RENAME(uprv_strnicmp) +#define uprv_syntaxError U_ICU_ENTRY_POINT_RENAME(uprv_syntaxError) +#define uprv_timezone U_ICU_ENTRY_POINT_RENAME(uprv_timezone) +#define uprv_toupper U_ICU_ENTRY_POINT_RENAME(uprv_toupper) +#define uprv_trunc U_ICU_ENTRY_POINT_RENAME(uprv_trunc) +#define uprv_tzname U_ICU_ENTRY_POINT_RENAME(uprv_tzname) +#define uprv_tzset U_ICU_ENTRY_POINT_RENAME(uprv_tzset) +#define uprv_uint16Comparator U_ICU_ENTRY_POINT_RENAME(uprv_uint16Comparator) +#define uprv_uint32Comparator U_ICU_ENTRY_POINT_RENAME(uprv_uint32Comparator) +#define uprv_unmapFile U_ICU_ENTRY_POINT_RENAME(uprv_unmapFile) +#define upvec_cloneArray U_ICU_ENTRY_POINT_RENAME(upvec_cloneArray) +#define upvec_close U_ICU_ENTRY_POINT_RENAME(upvec_close) +#define upvec_compact U_ICU_ENTRY_POINT_RENAME(upvec_compact) +#define upvec_compactToUTrie2Handler U_ICU_ENTRY_POINT_RENAME(upvec_compactToUTrie2Handler) +#define upvec_compactToUTrie2WithRowIndexes U_ICU_ENTRY_POINT_RENAME(upvec_compactToUTrie2WithRowIndexes) +#define upvec_getArray U_ICU_ENTRY_POINT_RENAME(upvec_getArray) +#define upvec_getRow U_ICU_ENTRY_POINT_RENAME(upvec_getRow) +#define upvec_getValue U_ICU_ENTRY_POINT_RENAME(upvec_getValue) +#define upvec_open U_ICU_ENTRY_POINT_RENAME(upvec_open) +#define upvec_setValue U_ICU_ENTRY_POINT_RENAME(upvec_setValue) +#define uregex_appendReplacement U_ICU_ENTRY_POINT_RENAME(uregex_appendReplacement) +#define uregex_appendReplacementUText U_ICU_ENTRY_POINT_RENAME(uregex_appendReplacementUText) +#define uregex_appendTail U_ICU_ENTRY_POINT_RENAME(uregex_appendTail) +#define uregex_appendTailUText U_ICU_ENTRY_POINT_RENAME(uregex_appendTailUText) +#define uregex_clone U_ICU_ENTRY_POINT_RENAME(uregex_clone) +#define uregex_close U_ICU_ENTRY_POINT_RENAME(uregex_close) +#define uregex_end U_ICU_ENTRY_POINT_RENAME(uregex_end) +#define uregex_end64 U_ICU_ENTRY_POINT_RENAME(uregex_end64) +#define uregex_find U_ICU_ENTRY_POINT_RENAME(uregex_find) +#define uregex_find64 U_ICU_ENTRY_POINT_RENAME(uregex_find64) +#define uregex_findNext U_ICU_ENTRY_POINT_RENAME(uregex_findNext) +#define uregex_flags U_ICU_ENTRY_POINT_RENAME(uregex_flags) +#define uregex_getFindProgressCallback U_ICU_ENTRY_POINT_RENAME(uregex_getFindProgressCallback) +#define uregex_getMatchCallback U_ICU_ENTRY_POINT_RENAME(uregex_getMatchCallback) +#define uregex_getStackLimit U_ICU_ENTRY_POINT_RENAME(uregex_getStackLimit) +#define uregex_getText U_ICU_ENTRY_POINT_RENAME(uregex_getText) +#define uregex_getTimeLimit U_ICU_ENTRY_POINT_RENAME(uregex_getTimeLimit) +#define uregex_getUText U_ICU_ENTRY_POINT_RENAME(uregex_getUText) +#define uregex_group U_ICU_ENTRY_POINT_RENAME(uregex_group) +#define uregex_groupCount U_ICU_ENTRY_POINT_RENAME(uregex_groupCount) +#define uregex_groupUText U_ICU_ENTRY_POINT_RENAME(uregex_groupUText) +#define uregex_groupUTextDeep U_ICU_ENTRY_POINT_RENAME(uregex_groupUTextDeep) +#define uregex_hasAnchoringBounds U_ICU_ENTRY_POINT_RENAME(uregex_hasAnchoringBounds) +#define uregex_hasTransparentBounds U_ICU_ENTRY_POINT_RENAME(uregex_hasTransparentBounds) +#define uregex_hitEnd U_ICU_ENTRY_POINT_RENAME(uregex_hitEnd) +#define uregex_lookingAt U_ICU_ENTRY_POINT_RENAME(uregex_lookingAt) +#define uregex_lookingAt64 U_ICU_ENTRY_POINT_RENAME(uregex_lookingAt64) +#define uregex_matches U_ICU_ENTRY_POINT_RENAME(uregex_matches) +#define uregex_matches64 U_ICU_ENTRY_POINT_RENAME(uregex_matches64) +#define uregex_open U_ICU_ENTRY_POINT_RENAME(uregex_open) +#define uregex_openC U_ICU_ENTRY_POINT_RENAME(uregex_openC) +#define uregex_openUText U_ICU_ENTRY_POINT_RENAME(uregex_openUText) +#define uregex_pattern U_ICU_ENTRY_POINT_RENAME(uregex_pattern) +#define uregex_patternUText U_ICU_ENTRY_POINT_RENAME(uregex_patternUText) +#define uregex_refreshUText U_ICU_ENTRY_POINT_RENAME(uregex_refreshUText) +#define uregex_regionEnd U_ICU_ENTRY_POINT_RENAME(uregex_regionEnd) +#define uregex_regionEnd64 U_ICU_ENTRY_POINT_RENAME(uregex_regionEnd64) +#define uregex_regionStart U_ICU_ENTRY_POINT_RENAME(uregex_regionStart) +#define uregex_regionStart64 U_ICU_ENTRY_POINT_RENAME(uregex_regionStart64) +#define uregex_replaceAll U_ICU_ENTRY_POINT_RENAME(uregex_replaceAll) +#define uregex_replaceAllUText U_ICU_ENTRY_POINT_RENAME(uregex_replaceAllUText) +#define uregex_replaceFirst U_ICU_ENTRY_POINT_RENAME(uregex_replaceFirst) +#define uregex_replaceFirstUText U_ICU_ENTRY_POINT_RENAME(uregex_replaceFirstUText) +#define uregex_requireEnd U_ICU_ENTRY_POINT_RENAME(uregex_requireEnd) +#define uregex_reset U_ICU_ENTRY_POINT_RENAME(uregex_reset) +#define uregex_reset64 U_ICU_ENTRY_POINT_RENAME(uregex_reset64) +#define uregex_setFindProgressCallback U_ICU_ENTRY_POINT_RENAME(uregex_setFindProgressCallback) +#define uregex_setMatchCallback U_ICU_ENTRY_POINT_RENAME(uregex_setMatchCallback) +#define uregex_setRegion U_ICU_ENTRY_POINT_RENAME(uregex_setRegion) +#define uregex_setRegion64 U_ICU_ENTRY_POINT_RENAME(uregex_setRegion64) +#define uregex_setRegionAndStart U_ICU_ENTRY_POINT_RENAME(uregex_setRegionAndStart) +#define uregex_setStackLimit U_ICU_ENTRY_POINT_RENAME(uregex_setStackLimit) +#define uregex_setText U_ICU_ENTRY_POINT_RENAME(uregex_setText) +#define uregex_setTimeLimit U_ICU_ENTRY_POINT_RENAME(uregex_setTimeLimit) +#define uregex_setUText U_ICU_ENTRY_POINT_RENAME(uregex_setUText) +#define uregex_split U_ICU_ENTRY_POINT_RENAME(uregex_split) +#define uregex_splitUText U_ICU_ENTRY_POINT_RENAME(uregex_splitUText) +#define uregex_start U_ICU_ENTRY_POINT_RENAME(uregex_start) +#define uregex_start64 U_ICU_ENTRY_POINT_RENAME(uregex_start64) +#define uregex_ucstr_unescape_charAt U_ICU_ENTRY_POINT_RENAME(uregex_ucstr_unescape_charAt) +#define uregex_useAnchoringBounds U_ICU_ENTRY_POINT_RENAME(uregex_useAnchoringBounds) +#define uregex_useTransparentBounds U_ICU_ENTRY_POINT_RENAME(uregex_useTransparentBounds) +#define uregex_utext_unescape_charAt U_ICU_ENTRY_POINT_RENAME(uregex_utext_unescape_charAt) +#define uregion_areEqual U_ICU_ENTRY_POINT_RENAME(uregion_areEqual) +#define uregion_contains U_ICU_ENTRY_POINT_RENAME(uregion_contains) +#define uregion_getAvailable U_ICU_ENTRY_POINT_RENAME(uregion_getAvailable) +#define uregion_getContainedRegions U_ICU_ENTRY_POINT_RENAME(uregion_getContainedRegions) +#define uregion_getContainedRegionsOfType U_ICU_ENTRY_POINT_RENAME(uregion_getContainedRegionsOfType) +#define uregion_getContainingRegion U_ICU_ENTRY_POINT_RENAME(uregion_getContainingRegion) +#define uregion_getContainingRegionOfType U_ICU_ENTRY_POINT_RENAME(uregion_getContainingRegionOfType) +#define uregion_getNumericCode U_ICU_ENTRY_POINT_RENAME(uregion_getNumericCode) +#define uregion_getPreferredValues U_ICU_ENTRY_POINT_RENAME(uregion_getPreferredValues) +#define uregion_getRegionCode U_ICU_ENTRY_POINT_RENAME(uregion_getRegionCode) +#define uregion_getRegionFromCode U_ICU_ENTRY_POINT_RENAME(uregion_getRegionFromCode) +#define uregion_getRegionFromNumericCode U_ICU_ENTRY_POINT_RENAME(uregion_getRegionFromNumericCode) +#define uregion_getType U_ICU_ENTRY_POINT_RENAME(uregion_getType) +#define ures_close U_ICU_ENTRY_POINT_RENAME(ures_close) +#define ures_copyResb U_ICU_ENTRY_POINT_RENAME(ures_copyResb) +#define ures_countArrayItems U_ICU_ENTRY_POINT_RENAME(ures_countArrayItems) +#define ures_findResource U_ICU_ENTRY_POINT_RENAME(ures_findResource) +#define ures_findSubResource U_ICU_ENTRY_POINT_RENAME(ures_findSubResource) +#define ures_getBinary U_ICU_ENTRY_POINT_RENAME(ures_getBinary) +#define ures_getByIndex U_ICU_ENTRY_POINT_RENAME(ures_getByIndex) +#define ures_getByKey U_ICU_ENTRY_POINT_RENAME(ures_getByKey) +#define ures_getByKeyWithFallback U_ICU_ENTRY_POINT_RENAME(ures_getByKeyWithFallback) +#define ures_getFunctionalEquivalent U_ICU_ENTRY_POINT_RENAME(ures_getFunctionalEquivalent) +#define ures_getInt U_ICU_ENTRY_POINT_RENAME(ures_getInt) +#define ures_getIntVector U_ICU_ENTRY_POINT_RENAME(ures_getIntVector) +#define ures_getKey U_ICU_ENTRY_POINT_RENAME(ures_getKey) +#define ures_getKeywordValues U_ICU_ENTRY_POINT_RENAME(ures_getKeywordValues) +#define ures_getLocale U_ICU_ENTRY_POINT_RENAME(ures_getLocale) +#define ures_getLocaleByType U_ICU_ENTRY_POINT_RENAME(ures_getLocaleByType) +#define ures_getLocaleInternal U_ICU_ENTRY_POINT_RENAME(ures_getLocaleInternal) +#define ures_getName U_ICU_ENTRY_POINT_RENAME(ures_getName) +#define ures_getNextResource U_ICU_ENTRY_POINT_RENAME(ures_getNextResource) +#define ures_getNextString U_ICU_ENTRY_POINT_RENAME(ures_getNextString) +#define ures_getSize U_ICU_ENTRY_POINT_RENAME(ures_getSize) +#define ures_getString U_ICU_ENTRY_POINT_RENAME(ures_getString) +#define ures_getStringByIndex U_ICU_ENTRY_POINT_RENAME(ures_getStringByIndex) +#define ures_getStringByKey U_ICU_ENTRY_POINT_RENAME(ures_getStringByKey) +#define ures_getStringByKeyWithFallback U_ICU_ENTRY_POINT_RENAME(ures_getStringByKeyWithFallback) +#define ures_getType U_ICU_ENTRY_POINT_RENAME(ures_getType) +#define ures_getUInt U_ICU_ENTRY_POINT_RENAME(ures_getUInt) +#define ures_getUTF8String U_ICU_ENTRY_POINT_RENAME(ures_getUTF8String) +#define ures_getUTF8StringByIndex U_ICU_ENTRY_POINT_RENAME(ures_getUTF8StringByIndex) +#define ures_getUTF8StringByKey U_ICU_ENTRY_POINT_RENAME(ures_getUTF8StringByKey) +#define ures_getVersion U_ICU_ENTRY_POINT_RENAME(ures_getVersion) +#define ures_getVersionByKey U_ICU_ENTRY_POINT_RENAME(ures_getVersionByKey) +#define ures_getVersionNumber U_ICU_ENTRY_POINT_RENAME(ures_getVersionNumber) +#define ures_getVersionNumberInternal U_ICU_ENTRY_POINT_RENAME(ures_getVersionNumberInternal) +#define ures_hasNext U_ICU_ENTRY_POINT_RENAME(ures_hasNext) +#define ures_initStackObject U_ICU_ENTRY_POINT_RENAME(ures_initStackObject) +#define ures_open U_ICU_ENTRY_POINT_RENAME(ures_open) +#define ures_openAvailableLocales U_ICU_ENTRY_POINT_RENAME(ures_openAvailableLocales) +#define ures_openDirect U_ICU_ENTRY_POINT_RENAME(ures_openDirect) +#define ures_openFillIn U_ICU_ENTRY_POINT_RENAME(ures_openFillIn) +#define ures_openU U_ICU_ENTRY_POINT_RENAME(ures_openU) +#define ures_resetIterator U_ICU_ENTRY_POINT_RENAME(ures_resetIterator) +#define ures_swap U_ICU_ENTRY_POINT_RENAME(ures_swap) +#define uscript_breaksBetweenLetters U_ICU_ENTRY_POINT_RENAME(uscript_breaksBetweenLetters) +#define uscript_closeRun U_ICU_ENTRY_POINT_RENAME(uscript_closeRun) +#define uscript_getCode U_ICU_ENTRY_POINT_RENAME(uscript_getCode) +#define uscript_getName U_ICU_ENTRY_POINT_RENAME(uscript_getName) +#define uscript_getSampleString U_ICU_ENTRY_POINT_RENAME(uscript_getSampleString) +#define uscript_getSampleUnicodeString U_ICU_ENTRY_POINT_RENAME(uscript_getSampleUnicodeString) +#define uscript_getScript U_ICU_ENTRY_POINT_RENAME(uscript_getScript) +#define uscript_getScriptExtensions U_ICU_ENTRY_POINT_RENAME(uscript_getScriptExtensions) +#define uscript_getShortName U_ICU_ENTRY_POINT_RENAME(uscript_getShortName) +#define uscript_getUsage U_ICU_ENTRY_POINT_RENAME(uscript_getUsage) +#define uscript_hasScript U_ICU_ENTRY_POINT_RENAME(uscript_hasScript) +#define uscript_isCased U_ICU_ENTRY_POINT_RENAME(uscript_isCased) +#define uscript_isRightToLeft U_ICU_ENTRY_POINT_RENAME(uscript_isRightToLeft) +#define uscript_nextRun U_ICU_ENTRY_POINT_RENAME(uscript_nextRun) +#define uscript_openRun U_ICU_ENTRY_POINT_RENAME(uscript_openRun) +#define uscript_resetRun U_ICU_ENTRY_POINT_RENAME(uscript_resetRun) +#define uscript_setRunText U_ICU_ENTRY_POINT_RENAME(uscript_setRunText) +#define usearch_close U_ICU_ENTRY_POINT_RENAME(usearch_close) +#define usearch_first U_ICU_ENTRY_POINT_RENAME(usearch_first) +#define usearch_following U_ICU_ENTRY_POINT_RENAME(usearch_following) +#define usearch_getAttribute U_ICU_ENTRY_POINT_RENAME(usearch_getAttribute) +#define usearch_getBreakIterator U_ICU_ENTRY_POINT_RENAME(usearch_getBreakIterator) +#define usearch_getCollator U_ICU_ENTRY_POINT_RENAME(usearch_getCollator) +#define usearch_getMatchedLength U_ICU_ENTRY_POINT_RENAME(usearch_getMatchedLength) +#define usearch_getMatchedStart U_ICU_ENTRY_POINT_RENAME(usearch_getMatchedStart) +#define usearch_getMatchedText U_ICU_ENTRY_POINT_RENAME(usearch_getMatchedText) +#define usearch_getOffset U_ICU_ENTRY_POINT_RENAME(usearch_getOffset) +#define usearch_getPattern U_ICU_ENTRY_POINT_RENAME(usearch_getPattern) +#define usearch_getText U_ICU_ENTRY_POINT_RENAME(usearch_getText) +#define usearch_handleNextCanonical U_ICU_ENTRY_POINT_RENAME(usearch_handleNextCanonical) +#define usearch_handleNextExact U_ICU_ENTRY_POINT_RENAME(usearch_handleNextExact) +#define usearch_handlePreviousCanonical U_ICU_ENTRY_POINT_RENAME(usearch_handlePreviousCanonical) +#define usearch_handlePreviousExact U_ICU_ENTRY_POINT_RENAME(usearch_handlePreviousExact) +#define usearch_last U_ICU_ENTRY_POINT_RENAME(usearch_last) +#define usearch_next U_ICU_ENTRY_POINT_RENAME(usearch_next) +#define usearch_open U_ICU_ENTRY_POINT_RENAME(usearch_open) +#define usearch_openFromCollator U_ICU_ENTRY_POINT_RENAME(usearch_openFromCollator) +#define usearch_preceding U_ICU_ENTRY_POINT_RENAME(usearch_preceding) +#define usearch_previous U_ICU_ENTRY_POINT_RENAME(usearch_previous) +#define usearch_reset U_ICU_ENTRY_POINT_RENAME(usearch_reset) +#define usearch_search U_ICU_ENTRY_POINT_RENAME(usearch_search) +#define usearch_searchBackwards U_ICU_ENTRY_POINT_RENAME(usearch_searchBackwards) +#define usearch_setAttribute U_ICU_ENTRY_POINT_RENAME(usearch_setAttribute) +#define usearch_setBreakIterator U_ICU_ENTRY_POINT_RENAME(usearch_setBreakIterator) +#define usearch_setCollator U_ICU_ENTRY_POINT_RENAME(usearch_setCollator) +#define usearch_setOffset U_ICU_ENTRY_POINT_RENAME(usearch_setOffset) +#define usearch_setPattern U_ICU_ENTRY_POINT_RENAME(usearch_setPattern) +#define usearch_setText U_ICU_ENTRY_POINT_RENAME(usearch_setText) +#define uset_add U_ICU_ENTRY_POINT_RENAME(uset_add) +#define uset_addAll U_ICU_ENTRY_POINT_RENAME(uset_addAll) +#define uset_addAllCodePoints U_ICU_ENTRY_POINT_RENAME(uset_addAllCodePoints) +#define uset_addRange U_ICU_ENTRY_POINT_RENAME(uset_addRange) +#define uset_addString U_ICU_ENTRY_POINT_RENAME(uset_addString) +#define uset_applyIntPropertyValue U_ICU_ENTRY_POINT_RENAME(uset_applyIntPropertyValue) +#define uset_applyPattern U_ICU_ENTRY_POINT_RENAME(uset_applyPattern) +#define uset_applyPropertyAlias U_ICU_ENTRY_POINT_RENAME(uset_applyPropertyAlias) +#define uset_charAt U_ICU_ENTRY_POINT_RENAME(uset_charAt) +#define uset_clear U_ICU_ENTRY_POINT_RENAME(uset_clear) +#define uset_clone U_ICU_ENTRY_POINT_RENAME(uset_clone) +#define uset_cloneAsThawed U_ICU_ENTRY_POINT_RENAME(uset_cloneAsThawed) +#define uset_close U_ICU_ENTRY_POINT_RENAME(uset_close) +#define uset_closeOver U_ICU_ENTRY_POINT_RENAME(uset_closeOver) +#define uset_compact U_ICU_ENTRY_POINT_RENAME(uset_compact) +#define uset_complement U_ICU_ENTRY_POINT_RENAME(uset_complement) +#define uset_complementAll U_ICU_ENTRY_POINT_RENAME(uset_complementAll) +#define uset_contains U_ICU_ENTRY_POINT_RENAME(uset_contains) +#define uset_containsAll U_ICU_ENTRY_POINT_RENAME(uset_containsAll) +#define uset_containsAllCodePoints U_ICU_ENTRY_POINT_RENAME(uset_containsAllCodePoints) +#define uset_containsNone U_ICU_ENTRY_POINT_RENAME(uset_containsNone) +#define uset_containsRange U_ICU_ENTRY_POINT_RENAME(uset_containsRange) +#define uset_containsSome U_ICU_ENTRY_POINT_RENAME(uset_containsSome) +#define uset_containsString U_ICU_ENTRY_POINT_RENAME(uset_containsString) +#define uset_equals U_ICU_ENTRY_POINT_RENAME(uset_equals) +#define uset_freeze U_ICU_ENTRY_POINT_RENAME(uset_freeze) +#define uset_getItem U_ICU_ENTRY_POINT_RENAME(uset_getItem) +#define uset_getItemCount U_ICU_ENTRY_POINT_RENAME(uset_getItemCount) +#define uset_getSerializedRange U_ICU_ENTRY_POINT_RENAME(uset_getSerializedRange) +#define uset_getSerializedRangeCount U_ICU_ENTRY_POINT_RENAME(uset_getSerializedRangeCount) +#define uset_getSerializedSet U_ICU_ENTRY_POINT_RENAME(uset_getSerializedSet) +#define uset_indexOf U_ICU_ENTRY_POINT_RENAME(uset_indexOf) +#define uset_isEmpty U_ICU_ENTRY_POINT_RENAME(uset_isEmpty) +#define uset_isFrozen U_ICU_ENTRY_POINT_RENAME(uset_isFrozen) +#define uset_open U_ICU_ENTRY_POINT_RENAME(uset_open) +#define uset_openEmpty U_ICU_ENTRY_POINT_RENAME(uset_openEmpty) +#define uset_openPattern U_ICU_ENTRY_POINT_RENAME(uset_openPattern) +#define uset_openPatternOptions U_ICU_ENTRY_POINT_RENAME(uset_openPatternOptions) +#define uset_remove U_ICU_ENTRY_POINT_RENAME(uset_remove) +#define uset_removeAll U_ICU_ENTRY_POINT_RENAME(uset_removeAll) +#define uset_removeAllStrings U_ICU_ENTRY_POINT_RENAME(uset_removeAllStrings) +#define uset_removeRange U_ICU_ENTRY_POINT_RENAME(uset_removeRange) +#define uset_removeString U_ICU_ENTRY_POINT_RENAME(uset_removeString) +#define uset_resemblesPattern U_ICU_ENTRY_POINT_RENAME(uset_resemblesPattern) +#define uset_retain U_ICU_ENTRY_POINT_RENAME(uset_retain) +#define uset_retainAll U_ICU_ENTRY_POINT_RENAME(uset_retainAll) +#define uset_serialize U_ICU_ENTRY_POINT_RENAME(uset_serialize) +#define uset_serializedContains U_ICU_ENTRY_POINT_RENAME(uset_serializedContains) +#define uset_set U_ICU_ENTRY_POINT_RENAME(uset_set) +#define uset_setSerializedToOne U_ICU_ENTRY_POINT_RENAME(uset_setSerializedToOne) +#define uset_size U_ICU_ENTRY_POINT_RENAME(uset_size) +#define uset_span U_ICU_ENTRY_POINT_RENAME(uset_span) +#define uset_spanBack U_ICU_ENTRY_POINT_RENAME(uset_spanBack) +#define uset_spanBackUTF8 U_ICU_ENTRY_POINT_RENAME(uset_spanBackUTF8) +#define uset_spanUTF8 U_ICU_ENTRY_POINT_RENAME(uset_spanUTF8) +#define uset_toPattern U_ICU_ENTRY_POINT_RENAME(uset_toPattern) +#define uspoof_areConfusable U_ICU_ENTRY_POINT_RENAME(uspoof_areConfusable) +#define uspoof_areConfusableUTF8 U_ICU_ENTRY_POINT_RENAME(uspoof_areConfusableUTF8) +#define uspoof_areConfusableUnicodeString U_ICU_ENTRY_POINT_RENAME(uspoof_areConfusableUnicodeString) +#define uspoof_check U_ICU_ENTRY_POINT_RENAME(uspoof_check) +#define uspoof_checkUTF8 U_ICU_ENTRY_POINT_RENAME(uspoof_checkUTF8) +#define uspoof_checkUnicodeString U_ICU_ENTRY_POINT_RENAME(uspoof_checkUnicodeString) +#define uspoof_clone U_ICU_ENTRY_POINT_RENAME(uspoof_clone) +#define uspoof_close U_ICU_ENTRY_POINT_RENAME(uspoof_close) +#define uspoof_getAllowedChars U_ICU_ENTRY_POINT_RENAME(uspoof_getAllowedChars) +#define uspoof_getAllowedLocales U_ICU_ENTRY_POINT_RENAME(uspoof_getAllowedLocales) +#define uspoof_getAllowedUnicodeSet U_ICU_ENTRY_POINT_RENAME(uspoof_getAllowedUnicodeSet) +#define uspoof_getChecks U_ICU_ENTRY_POINT_RENAME(uspoof_getChecks) +#define uspoof_getInclusionSet U_ICU_ENTRY_POINT_RENAME(uspoof_getInclusionSet) +#define uspoof_getInclusionUnicodeSet U_ICU_ENTRY_POINT_RENAME(uspoof_getInclusionUnicodeSet) +#define uspoof_getRecommendedSet U_ICU_ENTRY_POINT_RENAME(uspoof_getRecommendedSet) +#define uspoof_getRecommendedUnicodeSet U_ICU_ENTRY_POINT_RENAME(uspoof_getRecommendedUnicodeSet) +#define uspoof_getRestrictionLevel U_ICU_ENTRY_POINT_RENAME(uspoof_getRestrictionLevel) +#define uspoof_getSkeleton U_ICU_ENTRY_POINT_RENAME(uspoof_getSkeleton) +#define uspoof_getSkeletonUTF8 U_ICU_ENTRY_POINT_RENAME(uspoof_getSkeletonUTF8) +#define uspoof_getSkeletonUnicodeString U_ICU_ENTRY_POINT_RENAME(uspoof_getSkeletonUnicodeString) +#define uspoof_open U_ICU_ENTRY_POINT_RENAME(uspoof_open) +#define uspoof_openFromSerialized U_ICU_ENTRY_POINT_RENAME(uspoof_openFromSerialized) +#define uspoof_openFromSource U_ICU_ENTRY_POINT_RENAME(uspoof_openFromSource) +#define uspoof_serialize U_ICU_ENTRY_POINT_RENAME(uspoof_serialize) +#define uspoof_setAllowedChars U_ICU_ENTRY_POINT_RENAME(uspoof_setAllowedChars) +#define uspoof_setAllowedLocales U_ICU_ENTRY_POINT_RENAME(uspoof_setAllowedLocales) +#define uspoof_setAllowedUnicodeSet U_ICU_ENTRY_POINT_RENAME(uspoof_setAllowedUnicodeSet) +#define uspoof_setChecks U_ICU_ENTRY_POINT_RENAME(uspoof_setChecks) +#define uspoof_setRestrictionLevel U_ICU_ENTRY_POINT_RENAME(uspoof_setRestrictionLevel) +#define uspoof_swap U_ICU_ENTRY_POINT_RENAME(uspoof_swap) +#define usprep_close U_ICU_ENTRY_POINT_RENAME(usprep_close) +#define usprep_open U_ICU_ENTRY_POINT_RENAME(usprep_open) +#define usprep_openByType U_ICU_ENTRY_POINT_RENAME(usprep_openByType) +#define usprep_prepare U_ICU_ENTRY_POINT_RENAME(usprep_prepare) +#define usprep_swap U_ICU_ENTRY_POINT_RENAME(usprep_swap) +#define ustr_hashCharsN U_ICU_ENTRY_POINT_RENAME(ustr_hashCharsN) +#define ustr_hashICharsN U_ICU_ENTRY_POINT_RENAME(ustr_hashICharsN) +#define ustr_hashUCharsN U_ICU_ENTRY_POINT_RENAME(ustr_hashUCharsN) +#define ustrcase_internalFold U_ICU_ENTRY_POINT_RENAME(ustrcase_internalFold) +#define ustrcase_internalToLower U_ICU_ENTRY_POINT_RENAME(ustrcase_internalToLower) +#define ustrcase_internalToTitle U_ICU_ENTRY_POINT_RENAME(ustrcase_internalToTitle) +#define ustrcase_internalToUpper U_ICU_ENTRY_POINT_RENAME(ustrcase_internalToUpper) +#define ustrcase_map U_ICU_ENTRY_POINT_RENAME(ustrcase_map) +#define ustrcase_setTempCaseMapLocale U_ICU_ENTRY_POINT_RENAME(ustrcase_setTempCaseMapLocale) +#define utext_char32At U_ICU_ENTRY_POINT_RENAME(utext_char32At) +#define utext_clone U_ICU_ENTRY_POINT_RENAME(utext_clone) +#define utext_close U_ICU_ENTRY_POINT_RENAME(utext_close) +#define utext_copy U_ICU_ENTRY_POINT_RENAME(utext_copy) +#define utext_current32 U_ICU_ENTRY_POINT_RENAME(utext_current32) +#define utext_equals U_ICU_ENTRY_POINT_RENAME(utext_equals) +#define utext_extract U_ICU_ENTRY_POINT_RENAME(utext_extract) +#define utext_freeze U_ICU_ENTRY_POINT_RENAME(utext_freeze) +#define utext_getNativeIndex U_ICU_ENTRY_POINT_RENAME(utext_getNativeIndex) +#define utext_getPreviousNativeIndex U_ICU_ENTRY_POINT_RENAME(utext_getPreviousNativeIndex) +#define utext_hasMetaData U_ICU_ENTRY_POINT_RENAME(utext_hasMetaData) +#define utext_isLengthExpensive U_ICU_ENTRY_POINT_RENAME(utext_isLengthExpensive) +#define utext_isWritable U_ICU_ENTRY_POINT_RENAME(utext_isWritable) +#define utext_moveIndex32 U_ICU_ENTRY_POINT_RENAME(utext_moveIndex32) +#define utext_nativeLength U_ICU_ENTRY_POINT_RENAME(utext_nativeLength) +#define utext_next32 U_ICU_ENTRY_POINT_RENAME(utext_next32) +#define utext_next32From U_ICU_ENTRY_POINT_RENAME(utext_next32From) +#define utext_openCharacterIterator U_ICU_ENTRY_POINT_RENAME(utext_openCharacterIterator) +#define utext_openConstUnicodeString U_ICU_ENTRY_POINT_RENAME(utext_openConstUnicodeString) +#define utext_openReplaceable U_ICU_ENTRY_POINT_RENAME(utext_openReplaceable) +#define utext_openUChars U_ICU_ENTRY_POINT_RENAME(utext_openUChars) +#define utext_openUTF8 U_ICU_ENTRY_POINT_RENAME(utext_openUTF8) +#define utext_openUnicodeString U_ICU_ENTRY_POINT_RENAME(utext_openUnicodeString) +#define utext_previous32 U_ICU_ENTRY_POINT_RENAME(utext_previous32) +#define utext_previous32From U_ICU_ENTRY_POINT_RENAME(utext_previous32From) +#define utext_replace U_ICU_ENTRY_POINT_RENAME(utext_replace) +#define utext_setNativeIndex U_ICU_ENTRY_POINT_RENAME(utext_setNativeIndex) +#define utext_setup U_ICU_ENTRY_POINT_RENAME(utext_setup) +#define utf8_appendCharSafeBody U_ICU_ENTRY_POINT_RENAME(utf8_appendCharSafeBody) +#define utf8_back1SafeBody U_ICU_ENTRY_POINT_RENAME(utf8_back1SafeBody) +#define utf8_countTrailBytes U_ICU_ENTRY_POINT_RENAME(utf8_countTrailBytes) +#define utf8_nextCharSafeBody U_ICU_ENTRY_POINT_RENAME(utf8_nextCharSafeBody) +#define utf8_prevCharSafeBody U_ICU_ENTRY_POINT_RENAME(utf8_prevCharSafeBody) +#define utmscale_fromInt64 U_ICU_ENTRY_POINT_RENAME(utmscale_fromInt64) +#define utmscale_getTimeScaleValue U_ICU_ENTRY_POINT_RENAME(utmscale_getTimeScaleValue) +#define utmscale_toInt64 U_ICU_ENTRY_POINT_RENAME(utmscale_toInt64) +#define utrace_cleanup U_ICU_ENTRY_POINT_RENAME(utrace_cleanup) +#define utrace_data U_ICU_ENTRY_POINT_RENAME(utrace_data) +#define utrace_entry U_ICU_ENTRY_POINT_RENAME(utrace_entry) +#define utrace_exit U_ICU_ENTRY_POINT_RENAME(utrace_exit) +#define utrace_format U_ICU_ENTRY_POINT_RENAME(utrace_format) +#define utrace_functionName U_ICU_ENTRY_POINT_RENAME(utrace_functionName) +#define utrace_getFunctions U_ICU_ENTRY_POINT_RENAME(utrace_getFunctions) +#define utrace_getLevel U_ICU_ENTRY_POINT_RENAME(utrace_getLevel) +#define utrace_level U_ICU_ENTRY_POINT_RENAME(utrace_level) +#define utrace_setFunctions U_ICU_ENTRY_POINT_RENAME(utrace_setFunctions) +#define utrace_setLevel U_ICU_ENTRY_POINT_RENAME(utrace_setLevel) +#define utrace_vformat U_ICU_ENTRY_POINT_RENAME(utrace_vformat) +#define utrans_clone U_ICU_ENTRY_POINT_RENAME(utrans_clone) +#define utrans_close U_ICU_ENTRY_POINT_RENAME(utrans_close) +#define utrans_countAvailableIDs U_ICU_ENTRY_POINT_RENAME(utrans_countAvailableIDs) +#define utrans_getAvailableID U_ICU_ENTRY_POINT_RENAME(utrans_getAvailableID) +#define utrans_getID U_ICU_ENTRY_POINT_RENAME(utrans_getID) +#define utrans_getSourceSet U_ICU_ENTRY_POINT_RENAME(utrans_getSourceSet) +#define utrans_getUnicodeID U_ICU_ENTRY_POINT_RENAME(utrans_getUnicodeID) +#define utrans_open U_ICU_ENTRY_POINT_RENAME(utrans_open) +#define utrans_openIDs U_ICU_ENTRY_POINT_RENAME(utrans_openIDs) +#define utrans_openInverse U_ICU_ENTRY_POINT_RENAME(utrans_openInverse) +#define utrans_openU U_ICU_ENTRY_POINT_RENAME(utrans_openU) +#define utrans_register U_ICU_ENTRY_POINT_RENAME(utrans_register) +#define utrans_rep_caseContextIterator U_ICU_ENTRY_POINT_RENAME(utrans_rep_caseContextIterator) +#define utrans_setFilter U_ICU_ENTRY_POINT_RENAME(utrans_setFilter) +#define utrans_stripRules U_ICU_ENTRY_POINT_RENAME(utrans_stripRules) +#define utrans_toRules U_ICU_ENTRY_POINT_RENAME(utrans_toRules) +#define utrans_trans U_ICU_ENTRY_POINT_RENAME(utrans_trans) +#define utrans_transIncremental U_ICU_ENTRY_POINT_RENAME(utrans_transIncremental) +#define utrans_transIncrementalUChars U_ICU_ENTRY_POINT_RENAME(utrans_transIncrementalUChars) +#define utrans_transUChars U_ICU_ENTRY_POINT_RENAME(utrans_transUChars) +#define utrans_transliterator_cleanup U_ICU_ENTRY_POINT_RENAME(utrans_transliterator_cleanup) +#define utrans_unregister U_ICU_ENTRY_POINT_RENAME(utrans_unregister) +#define utrans_unregisterID U_ICU_ENTRY_POINT_RENAME(utrans_unregisterID) +#define utrie2_clone U_ICU_ENTRY_POINT_RENAME(utrie2_clone) +#define utrie2_cloneAsThawed U_ICU_ENTRY_POINT_RENAME(utrie2_cloneAsThawed) +#define utrie2_close U_ICU_ENTRY_POINT_RENAME(utrie2_close) +#define utrie2_enum U_ICU_ENTRY_POINT_RENAME(utrie2_enum) +#define utrie2_enumForLeadSurrogate U_ICU_ENTRY_POINT_RENAME(utrie2_enumForLeadSurrogate) +#define utrie2_freeze U_ICU_ENTRY_POINT_RENAME(utrie2_freeze) +#define utrie2_fromUTrie U_ICU_ENTRY_POINT_RENAME(utrie2_fromUTrie) +#define utrie2_get32 U_ICU_ENTRY_POINT_RENAME(utrie2_get32) +#define utrie2_get32FromLeadSurrogateCodeUnit U_ICU_ENTRY_POINT_RENAME(utrie2_get32FromLeadSurrogateCodeUnit) +#define utrie2_getVersion U_ICU_ENTRY_POINT_RENAME(utrie2_getVersion) +#define utrie2_internalU8NextIndex U_ICU_ENTRY_POINT_RENAME(utrie2_internalU8NextIndex) +#define utrie2_internalU8PrevIndex U_ICU_ENTRY_POINT_RENAME(utrie2_internalU8PrevIndex) +#define utrie2_isFrozen U_ICU_ENTRY_POINT_RENAME(utrie2_isFrozen) +#define utrie2_open U_ICU_ENTRY_POINT_RENAME(utrie2_open) +#define utrie2_openDummy U_ICU_ENTRY_POINT_RENAME(utrie2_openDummy) +#define utrie2_openFromSerialized U_ICU_ENTRY_POINT_RENAME(utrie2_openFromSerialized) +#define utrie2_serialize U_ICU_ENTRY_POINT_RENAME(utrie2_serialize) +#define utrie2_set32 U_ICU_ENTRY_POINT_RENAME(utrie2_set32) +#define utrie2_set32ForLeadSurrogateCodeUnit U_ICU_ENTRY_POINT_RENAME(utrie2_set32ForLeadSurrogateCodeUnit) +#define utrie2_setRange32 U_ICU_ENTRY_POINT_RENAME(utrie2_setRange32) +#define utrie2_swap U_ICU_ENTRY_POINT_RENAME(utrie2_swap) +#define utrie2_swapAnyVersion U_ICU_ENTRY_POINT_RENAME(utrie2_swapAnyVersion) +#define utrie_clone U_ICU_ENTRY_POINT_RENAME(utrie_clone) +#define utrie_close U_ICU_ENTRY_POINT_RENAME(utrie_close) +#define utrie_defaultGetFoldingOffset U_ICU_ENTRY_POINT_RENAME(utrie_defaultGetFoldingOffset) +#define utrie_enum U_ICU_ENTRY_POINT_RENAME(utrie_enum) +#define utrie_get32 U_ICU_ENTRY_POINT_RENAME(utrie_get32) +#define utrie_getData U_ICU_ENTRY_POINT_RENAME(utrie_getData) +#define utrie_open U_ICU_ENTRY_POINT_RENAME(utrie_open) +#define utrie_serialize U_ICU_ENTRY_POINT_RENAME(utrie_serialize) +#define utrie_set32 U_ICU_ENTRY_POINT_RENAME(utrie_set32) +#define utrie_setRange32 U_ICU_ENTRY_POINT_RENAME(utrie_setRange32) +#define utrie_swap U_ICU_ENTRY_POINT_RENAME(utrie_swap) +#define utrie_unserialize U_ICU_ENTRY_POINT_RENAME(utrie_unserialize) +#define utrie_unserializeDummy U_ICU_ENTRY_POINT_RENAME(utrie_unserializeDummy) +#define vzone_clone U_ICU_ENTRY_POINT_RENAME(vzone_clone) +#define vzone_close U_ICU_ENTRY_POINT_RENAME(vzone_close) +#define vzone_countTransitionRules U_ICU_ENTRY_POINT_RENAME(vzone_countTransitionRules) +#define vzone_equals U_ICU_ENTRY_POINT_RENAME(vzone_equals) +#define vzone_getDynamicClassID U_ICU_ENTRY_POINT_RENAME(vzone_getDynamicClassID) +#define vzone_getLastModified U_ICU_ENTRY_POINT_RENAME(vzone_getLastModified) +#define vzone_getNextTransition U_ICU_ENTRY_POINT_RENAME(vzone_getNextTransition) +#define vzone_getOffset U_ICU_ENTRY_POINT_RENAME(vzone_getOffset) +#define vzone_getOffset2 U_ICU_ENTRY_POINT_RENAME(vzone_getOffset2) +#define vzone_getOffset3 U_ICU_ENTRY_POINT_RENAME(vzone_getOffset3) +#define vzone_getPreviousTransition U_ICU_ENTRY_POINT_RENAME(vzone_getPreviousTransition) +#define vzone_getRawOffset U_ICU_ENTRY_POINT_RENAME(vzone_getRawOffset) +#define vzone_getStaticClassID U_ICU_ENTRY_POINT_RENAME(vzone_getStaticClassID) +#define vzone_getTZURL U_ICU_ENTRY_POINT_RENAME(vzone_getTZURL) +#define vzone_hasSameRules U_ICU_ENTRY_POINT_RENAME(vzone_hasSameRules) +#define vzone_inDaylightTime U_ICU_ENTRY_POINT_RENAME(vzone_inDaylightTime) +#define vzone_openData U_ICU_ENTRY_POINT_RENAME(vzone_openData) +#define vzone_openID U_ICU_ENTRY_POINT_RENAME(vzone_openID) +#define vzone_setLastModified U_ICU_ENTRY_POINT_RENAME(vzone_setLastModified) +#define vzone_setRawOffset U_ICU_ENTRY_POINT_RENAME(vzone_setRawOffset) +#define vzone_setTZURL U_ICU_ENTRY_POINT_RENAME(vzone_setTZURL) +#define vzone_useDaylightTime U_ICU_ENTRY_POINT_RENAME(vzone_useDaylightTime) +#define vzone_write U_ICU_ENTRY_POINT_RENAME(vzone_write) +#define vzone_writeFromStart U_ICU_ENTRY_POINT_RENAME(vzone_writeFromStart) +#define vzone_writeSimple U_ICU_ENTRY_POINT_RENAME(vzone_writeSimple) +#define zrule_close U_ICU_ENTRY_POINT_RENAME(zrule_close) +#define zrule_equals U_ICU_ENTRY_POINT_RENAME(zrule_equals) +#define zrule_getDSTSavings U_ICU_ENTRY_POINT_RENAME(zrule_getDSTSavings) +#define zrule_getName U_ICU_ENTRY_POINT_RENAME(zrule_getName) +#define zrule_getRawOffset U_ICU_ENTRY_POINT_RENAME(zrule_getRawOffset) +#define zrule_isEquivalentTo U_ICU_ENTRY_POINT_RENAME(zrule_isEquivalentTo) +#define ztrans_adoptFrom U_ICU_ENTRY_POINT_RENAME(ztrans_adoptFrom) +#define ztrans_adoptTo U_ICU_ENTRY_POINT_RENAME(ztrans_adoptTo) +#define ztrans_clone U_ICU_ENTRY_POINT_RENAME(ztrans_clone) +#define ztrans_close U_ICU_ENTRY_POINT_RENAME(ztrans_close) +#define ztrans_equals U_ICU_ENTRY_POINT_RENAME(ztrans_equals) +#define ztrans_getDynamicClassID U_ICU_ENTRY_POINT_RENAME(ztrans_getDynamicClassID) +#define ztrans_getFrom U_ICU_ENTRY_POINT_RENAME(ztrans_getFrom) +#define ztrans_getStaticClassID U_ICU_ENTRY_POINT_RENAME(ztrans_getStaticClassID) +#define ztrans_getTime U_ICU_ENTRY_POINT_RENAME(ztrans_getTime) +#define ztrans_getTo U_ICU_ENTRY_POINT_RENAME(ztrans_getTo) +#define ztrans_open U_ICU_ENTRY_POINT_RENAME(ztrans_open) +#define ztrans_openEmpty U_ICU_ENTRY_POINT_RENAME(ztrans_openEmpty) +#define ztrans_setFrom U_ICU_ENTRY_POINT_RENAME(ztrans_setFrom) +#define ztrans_setTime U_ICU_ENTRY_POINT_RENAME(ztrans_setTime) +#define ztrans_setTo U_ICU_ENTRY_POINT_RENAME(ztrans_setTo) + +#endif + +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/uset.h b/src/core/basetypes/icu-ucsdet/include/unicode/uset.h new file mode 100644 index 00000000..eb3c9e6a --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/uset.h @@ -0,0 +1,1126 @@ +/* +******************************************************************************* +* +* Copyright (C) 2002-2014, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: uset.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2002mar07 +* created by: Markus W. Scherer +* +* C version of UnicodeSet. +*/ + + +/** + * \file + * \brief C API: Unicode Set + * + * <p>This is a C wrapper around the C++ UnicodeSet class.</p> + */ + +#ifndef __USET_H__ +#define __USET_H__ + +#include "unicode/utypes.h" +#include "unicode/uchar.h" +#include "unicode/localpointer.h" + +#ifndef UCNV_H +struct USet; +/** + * A UnicodeSet. Use the uset_* API to manipulate. Create with + * uset_open*, and destroy with uset_close. + * @stable ICU 2.4 + */ +typedef struct USet USet; +#endif + +/** + * Bitmask values to be passed to uset_openPatternOptions() or + * uset_applyPattern() taking an option parameter. + * @stable ICU 2.4 + */ +enum { + /** + * Ignore white space within patterns unless quoted or escaped. + * @stable ICU 2.4 + */ + USET_IGNORE_SPACE = 1, + + /** + * Enable case insensitive matching. E.g., "[ab]" with this flag + * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will + * match all except 'a', 'A', 'b', and 'B'. This performs a full + * closure over case mappings, e.g. U+017F for s. + * + * The resulting set is a superset of the input for the code points but + * not for the strings. + * It performs a case mapping closure of the code points and adds + * full case folding strings for the code points, and reduces strings of + * the original set to their full case folding equivalents. + * + * This is designed for case-insensitive matches, for example + * in regular expressions. The full code point case closure allows checking of + * an input character directly against the closure set. + * Strings are matched by comparing the case-folded form from the closure + * set with an incremental case folding of the string in question. + * + * The closure set will also contain single code points if the original + * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.). + * This is not necessary (that is, redundant) for the above matching method + * but results in the same closure sets regardless of whether the original + * set contained the code point or a string. + * + * @stable ICU 2.4 + */ + USET_CASE_INSENSITIVE = 2, + + /** + * Enable case insensitive matching. E.g., "[ab]" with this flag + * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will + * match all except 'a', 'A', 'b', and 'B'. This adds the lower-, + * title-, and uppercase mappings as well as the case folding + * of each existing element in the set. + * @stable ICU 3.2 + */ + USET_ADD_CASE_MAPPINGS = 4 +}; + +/** + * Argument values for whether span() and similar functions continue while + * the current character is contained vs. not contained in the set. + * + * The functionality is straightforward for sets with only single code points, + * without strings (which is the common case): + * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE work the same. + * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE are inverses of USET_SPAN_NOT_CONTAINED. + * - span() and spanBack() partition any string the same way when + * alternating between span(USET_SPAN_NOT_CONTAINED) and + * span(either "contained" condition). + * - Using a complemented (inverted) set and the opposite span conditions + * yields the same results. + * + * When a set contains multi-code point strings, then these statements may not + * be true, depending on the strings in the set (for example, whether they + * overlap with each other) and the string that is processed. + * For a set with strings: + * - The complement of the set contains the opposite set of code points, + * but the same set of strings. + * Therefore, complementing both the set and the span conditions + * may yield different results. + * - When starting spans at different positions in a string + * (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different + * because a set string may start before the later position. + * - span(USET_SPAN_SIMPLE) may be shorter than + * span(USET_SPAN_CONTAINED) because it will not recursively try + * all possible paths. + * For example, with a set which contains the three strings "xy", "xya" and "ax", + * span("xyax", USET_SPAN_CONTAINED) will return 4 but + * span("xyax", USET_SPAN_SIMPLE) will return 3. + * span(USET_SPAN_SIMPLE) will never be longer than + * span(USET_SPAN_CONTAINED). + * - With either "contained" condition, span() and spanBack() may partition + * a string in different ways. + * For example, with a set which contains the two strings "ab" and "ba", + * and when processing the string "aba", + * span() will yield contained/not-contained boundaries of { 0, 2, 3 } + * while spanBack() will yield boundaries of { 0, 1, 3 }. + * + * Note: If it is important to get the same boundaries whether iterating forward + * or backward through a string, then either only span() should be used and + * the boundaries cached for backward operation, or an ICU BreakIterator + * could be used. + * + * Note: Unpaired surrogates are treated like surrogate code points. + * Similarly, set strings match only on code point boundaries, + * never in the middle of a surrogate pair. + * Illegal UTF-8 sequences are treated like U+FFFD. + * When processing UTF-8 strings, malformed set strings + * (strings with unpaired surrogates which cannot be converted to UTF-8) + * are ignored. + * + * @stable ICU 3.8 + */ +typedef enum USetSpanCondition { + /** + * Continues a span() while there is no set element at the current position. + * Increments by one code point at a time. + * Stops before the first set element (character or string). + * (For code points only, this is like while contains(current)==FALSE). + * + * When span() returns, the substring between where it started and the position + * it returned consists only of characters that are not in the set, + * and none of its strings overlap with the span. + * + * @stable ICU 3.8 + */ + USET_SPAN_NOT_CONTAINED = 0, + /** + * Spans the longest substring that is a concatenation of set elements (characters or strings). + * (For characters only, this is like while contains(current)==TRUE). + * + * When span() returns, the substring between where it started and the position + * it returned consists only of set elements (characters or strings) that are in the set. + * + * If a set contains strings, then the span will be the longest substring for which there + * exists at least one non-overlapping concatenation of set elements (characters or strings). + * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>. + * (Java/ICU/Perl regex stops at the first match of an OR.) + * + * @stable ICU 3.8 + */ + USET_SPAN_CONTAINED = 1, + /** + * Continues a span() while there is a set element at the current position. + * Increments by the longest matching element at each position. + * (For characters only, this is like while contains(current)==TRUE). + * + * When span() returns, the substring between where it started and the position + * it returned consists only of set elements (characters or strings) that are in the set. + * + * If a set only contains single characters, then this is the same + * as USET_SPAN_CONTAINED. + * + * If a set contains strings, then the span will be the longest substring + * with a match at each position with the longest single set element (character or string). + * + * Use this span condition together with other longest-match algorithms, + * such as ICU converters (ucnv_getUnicodeSet()). + * + * @stable ICU 3.8 + */ + USET_SPAN_SIMPLE = 2, + /** + * One more than the last span condition. + * @stable ICU 3.8 + */ + USET_SPAN_CONDITION_COUNT +} USetSpanCondition; + +enum { + /** + * Capacity of USerializedSet::staticArray. + * Enough for any single-code point set. + * Also provides padding for nice sizeof(USerializedSet). + * @stable ICU 2.4 + */ + USET_SERIALIZED_STATIC_ARRAY_CAPACITY=8 +}; + +/** + * A serialized form of a Unicode set. Limited manipulations are + * possible directly on a serialized set. See below. + * @stable ICU 2.4 + */ +typedef struct USerializedSet { + /** + * The serialized Unicode Set. + * @stable ICU 2.4 + */ + const uint16_t *array; + /** + * The length of the array that contains BMP characters. + * @stable ICU 2.4 + */ + int32_t bmpLength; + /** + * The total length of the array. + * @stable ICU 2.4 + */ + int32_t length; + /** + * A small buffer for the array to reduce memory allocations. + * @stable ICU 2.4 + */ + uint16_t staticArray[USET_SERIALIZED_STATIC_ARRAY_CAPACITY]; +} USerializedSet; + +/********************************************************************* + * USet API + *********************************************************************/ + +/** + * Create an empty USet object. + * Equivalent to uset_open(1, 0). + * @return a newly created USet. The caller must call uset_close() on + * it when done. + * @stable ICU 4.2 + */ +U_STABLE USet* U_EXPORT2 +uset_openEmpty(void); + +/** + * Creates a USet object that contains the range of characters + * start..end, inclusive. If <code>start > end</code> + * then an empty set is created (same as using uset_openEmpty()). + * @param start first character of the range, inclusive + * @param end last character of the range, inclusive + * @return a newly created USet. The caller must call uset_close() on + * it when done. + * @stable ICU 2.4 + */ +U_STABLE USet* U_EXPORT2 +uset_open(UChar32 start, UChar32 end); + +/** + * Creates a set from the given pattern. See the UnicodeSet class + * description for the syntax of the pattern language. + * @param pattern a string specifying what characters are in the set + * @param patternLength the length of the pattern, or -1 if null + * terminated + * @param ec the error code + * @stable ICU 2.4 + */ +U_STABLE USet* U_EXPORT2 +uset_openPattern(const UChar* pattern, int32_t patternLength, + UErrorCode* ec); + +/** + * Creates a set from the given pattern. See the UnicodeSet class + * description for the syntax of the pattern language. + * @param pattern a string specifying what characters are in the set + * @param patternLength the length of the pattern, or -1 if null + * terminated + * @param options bitmask for options to apply to the pattern. + * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. + * @param ec the error code + * @stable ICU 2.4 + */ +U_STABLE USet* U_EXPORT2 +uset_openPatternOptions(const UChar* pattern, int32_t patternLength, + uint32_t options, + UErrorCode* ec); + +/** + * Disposes of the storage used by a USet object. This function should + * be called exactly once for objects returned by uset_open(). + * @param set the object to dispose of + * @stable ICU 2.4 + */ +U_STABLE void U_EXPORT2 +uset_close(USet* set); + +#if U_SHOW_CPLUSPLUS_API + +U_NAMESPACE_BEGIN + +/** + * \class LocalUSetPointer + * "Smart pointer" class, closes a USet via uset_close(). + * For most methods see the LocalPointerBase base class. + * + * @see LocalPointerBase + * @see LocalPointer + * @stable ICU 4.4 + */ +U_DEFINE_LOCAL_OPEN_POINTER(LocalUSetPointer, USet, uset_close); + +U_NAMESPACE_END + +#endif + +/** + * Returns a copy of this object. + * If this set is frozen, then the clone will be frozen as well. + * Use uset_cloneAsThawed() for a mutable clone of a frozen set. + * @param set the original set + * @return the newly allocated copy of the set + * @see uset_cloneAsThawed + * @stable ICU 3.8 + */ +U_STABLE USet * U_EXPORT2 +uset_clone(const USet *set); + +/** + * Determines whether the set has been frozen (made immutable) or not. + * See the ICU4J Freezable interface for details. + * @param set the set + * @return TRUE/FALSE for whether the set has been frozen + * @see uset_freeze + * @see uset_cloneAsThawed + * @stable ICU 3.8 + */ +U_STABLE UBool U_EXPORT2 +uset_isFrozen(const USet *set); + +/** + * Freeze the set (make it immutable). + * Once frozen, it cannot be unfrozen and is therefore thread-safe + * until it is deleted. + * See the ICU4J Freezable interface for details. + * Freezing the set may also make some operations faster, for example + * uset_contains() and uset_span(). + * A frozen set will not be modified. (It remains frozen.) + * @param set the set + * @return the same set, now frozen + * @see uset_isFrozen + * @see uset_cloneAsThawed + * @stable ICU 3.8 + */ +U_STABLE void U_EXPORT2 +uset_freeze(USet *set); + +/** + * Clone the set and make the clone mutable. + * See the ICU4J Freezable interface for details. + * @param set the set + * @return the mutable clone + * @see uset_freeze + * @see uset_isFrozen + * @see uset_clone + * @stable ICU 3.8 + */ +U_STABLE USet * U_EXPORT2 +uset_cloneAsThawed(const USet *set); + +/** + * Causes the USet object to represent the range <code>start - end</code>. + * If <code>start > end</code> then this USet is set to an empty range. + * A frozen set will not be modified. + * @param set the object to set to the given range + * @param start first character in the set, inclusive + * @param end last character in the set, inclusive + * @stable ICU 3.2 + */ +U_STABLE void U_EXPORT2 +uset_set(USet* set, + UChar32 start, UChar32 end); + +/** + * Modifies the set to represent the set specified by the given + * pattern. See the UnicodeSet class description for the syntax of + * the pattern language. See also the User Guide chapter about UnicodeSet. + * <em>Empties the set passed before applying the pattern.</em> + * A frozen set will not be modified. + * @param set The set to which the pattern is to be applied. + * @param pattern A pointer to UChar string specifying what characters are in the set. + * The character at pattern[0] must be a '['. + * @param patternLength The length of the UChar string. -1 if NUL terminated. + * @param options A bitmask for options to apply to the pattern. + * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. + * @param status Returns an error if the pattern cannot be parsed. + * @return Upon successful parse, the value is either + * the index of the character after the closing ']' + * of the parsed pattern. + * If the status code indicates failure, then the return value + * is the index of the error in the source. + * + * @stable ICU 2.8 + */ +U_STABLE int32_t U_EXPORT2 +uset_applyPattern(USet *set, + const UChar *pattern, int32_t patternLength, + uint32_t options, + UErrorCode *status); + +/** + * Modifies the set to contain those code points which have the given value + * for the given binary or enumerated property, as returned by + * u_getIntPropertyValue. Prior contents of this set are lost. + * A frozen set will not be modified. + * + * @param set the object to contain the code points defined by the property + * + * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1 + * or UCHAR_INT_START..UCHAR_INT_LIMIT-1 + * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1. + * + * @param value a value in the range u_getIntPropertyMinValue(prop).. + * u_getIntPropertyMaxValue(prop), with one exception. If prop is + * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but + * rather a mask value produced by U_GET_GC_MASK(). This allows grouped + * categories such as [:L:] to be represented. + * + * @param ec error code input/output parameter + * + * @stable ICU 3.2 + */ +U_STABLE void U_EXPORT2 +uset_applyIntPropertyValue(USet* set, + UProperty prop, int32_t value, UErrorCode* ec); + +/** + * Modifies the set to contain those code points which have the + * given value for the given property. Prior contents of this + * set are lost. + * A frozen set will not be modified. + * + * @param set the object to contain the code points defined by the given + * property and value alias + * + * @param prop a string specifying a property alias, either short or long. + * The name is matched loosely. See PropertyAliases.txt for names and a + * description of loose matching. If the value string is empty, then this + * string is interpreted as either a General_Category value alias, a Script + * value alias, a binary property alias, or a special ID. Special IDs are + * matched loosely and correspond to the following sets: + * + * "ANY" = [\\u0000-\\U0010FFFF], + * "ASCII" = [\\u0000-\\u007F], + * "Assigned" = [:^Cn:]. + * + * @param propLength the length of the prop, or -1 if NULL + * + * @param value a string specifying a value alias, either short or long. + * The name is matched loosely. See PropertyValueAliases.txt for names + * and a description of loose matching. In addition to aliases listed, + * numeric values and canonical combining classes may be expressed + * numerically, e.g., ("nv", "0.5") or ("ccc", "220"). The value string + * may also be empty. + * + * @param valueLength the length of the value, or -1 if NULL + * + * @param ec error code input/output parameter + * + * @stable ICU 3.2 + */ +U_STABLE void U_EXPORT2 +uset_applyPropertyAlias(USet* set, + const UChar *prop, int32_t propLength, + const UChar *value, int32_t valueLength, + UErrorCode* ec); + +/** + * Return true if the given position, in the given pattern, appears + * to be the start of a UnicodeSet pattern. + * + * @param pattern a string specifying the pattern + * @param patternLength the length of the pattern, or -1 if NULL + * @param pos the given position + * @stable ICU 3.2 + */ +U_STABLE UBool U_EXPORT2 +uset_resemblesPattern(const UChar *pattern, int32_t patternLength, + int32_t pos); + +/** + * Returns a string representation of this set. If the result of + * calling this function is passed to a uset_openPattern(), it + * will produce another set that is equal to this one. + * @param set the set + * @param result the string to receive the rules, may be NULL + * @param resultCapacity the capacity of result, may be 0 if result is NULL + * @param escapeUnprintable if TRUE then convert unprintable + * character to their hex escape representations, \\uxxxx or + * \\Uxxxxxxxx. Unprintable characters are those other than + * U+000A, U+0020..U+007E. + * @param ec error code. + * @return length of string, possibly larger than resultCapacity + * @stable ICU 2.4 + */ +U_STABLE int32_t U_EXPORT2 +uset_toPattern(const USet* set, + UChar* result, int32_t resultCapacity, + UBool escapeUnprintable, + UErrorCode* ec); + +/** + * Adds the given character to the given USet. After this call, + * uset_contains(set, c) will return TRUE. + * A frozen set will not be modified. + * @param set the object to which to add the character + * @param c the character to add + * @stable ICU 2.4 + */ +U_STABLE void U_EXPORT2 +uset_add(USet* set, UChar32 c); + +/** + * Adds all of the elements in the specified set to this set if + * they're not already present. This operation effectively + * modifies this set so that its value is the <i>union</i> of the two + * sets. The behavior of this operation is unspecified if the specified + * collection is modified while the operation is in progress. + * A frozen set will not be modified. + * + * @param set the object to which to add the set + * @param additionalSet the source set whose elements are to be added to this set. + * @stable ICU 2.6 + */ +U_STABLE void U_EXPORT2 +uset_addAll(USet* set, const USet *additionalSet); + +/** + * Adds the given range of characters to the given USet. After this call, + * uset_contains(set, start, end) will return TRUE. + * A frozen set will not be modified. + * @param set the object to which to add the character + * @param start the first character of the range to add, inclusive + * @param end the last character of the range to add, inclusive + * @stable ICU 2.2 + */ +U_STABLE void U_EXPORT2 +uset_addRange(USet* set, UChar32 start, UChar32 end); + +/** + * Adds the given string to the given USet. After this call, + * uset_containsString(set, str, strLen) will return TRUE. + * A frozen set will not be modified. + * @param set the object to which to add the character + * @param str the string to add + * @param strLen the length of the string or -1 if null terminated. + * @stable ICU 2.4 + */ +U_STABLE void U_EXPORT2 +uset_addString(USet* set, const UChar* str, int32_t strLen); + +/** + * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"} + * If this set already any particular character, it has no effect on that character. + * A frozen set will not be modified. + * @param set the object to which to add the character + * @param str the source string + * @param strLen the length of the string or -1 if null terminated. + * @stable ICU 3.4 + */ +U_STABLE void U_EXPORT2 +uset_addAllCodePoints(USet* set, const UChar *str, int32_t strLen); + +/** + * Removes the given character from the given USet. After this call, + * uset_contains(set, c) will return FALSE. + * A frozen set will not be modified. + * @param set the object from which to remove the character + * @param c the character to remove + * @stable ICU 2.4 + */ +U_STABLE void U_EXPORT2 +uset_remove(USet* set, UChar32 c); + +/** + * Removes the given range of characters from the given USet. After this call, + * uset_contains(set, start, end) will return FALSE. + * A frozen set will not be modified. + * @param set the object to which to add the character + * @param start the first character of the range to remove, inclusive + * @param end the last character of the range to remove, inclusive + * @stable ICU 2.2 + */ +U_STABLE void U_EXPORT2 +uset_removeRange(USet* set, UChar32 start, UChar32 end); + +/** + * Removes the given string to the given USet. After this call, + * uset_containsString(set, str, strLen) will return FALSE. + * A frozen set will not be modified. + * @param set the object to which to add the character + * @param str the string to remove + * @param strLen the length of the string or -1 if null terminated. + * @stable ICU 2.4 + */ +U_STABLE void U_EXPORT2 +uset_removeString(USet* set, const UChar* str, int32_t strLen); + +/** + * Removes from this set all of its elements that are contained in the + * specified set. This operation effectively modifies this + * set so that its value is the <i>asymmetric set difference</i> of + * the two sets. + * A frozen set will not be modified. + * @param set the object from which the elements are to be removed + * @param removeSet the object that defines which elements will be + * removed from this set + * @stable ICU 3.2 + */ +U_STABLE void U_EXPORT2 +uset_removeAll(USet* set, const USet* removeSet); + +/** + * Retain only the elements in this set that are contained in the + * specified range. If <code>start > end</code> then an empty range is + * retained, leaving the set empty. This is equivalent to + * a boolean logic AND, or a set INTERSECTION. + * A frozen set will not be modified. + * + * @param set the object for which to retain only the specified range + * @param start first character, inclusive, of range to be retained + * to this set. + * @param end last character, inclusive, of range to be retained + * to this set. + * @stable ICU 3.2 + */ +U_STABLE void U_EXPORT2 +uset_retain(USet* set, UChar32 start, UChar32 end); + +/** + * Retains only the elements in this set that are contained in the + * specified set. In other words, removes from this set all of + * its elements that are not contained in the specified set. This + * operation effectively modifies this set so that its value is + * the <i>intersection</i> of the two sets. + * A frozen set will not be modified. + * + * @param set the object on which to perform the retain + * @param retain set that defines which elements this set will retain + * @stable ICU 3.2 + */ +U_STABLE void U_EXPORT2 +uset_retainAll(USet* set, const USet* retain); + +/** + * Reallocate this objects internal structures to take up the least + * possible space, without changing this object's value. + * A frozen set will not be modified. + * + * @param set the object on which to perfrom the compact + * @stable ICU 3.2 + */ +U_STABLE void U_EXPORT2 +uset_compact(USet* set); + +/** + * Inverts this set. This operation modifies this set so that + * its value is its complement. This operation does not affect + * the multicharacter strings, if any. + * A frozen set will not be modified. + * @param set the set + * @stable ICU 2.4 + */ +U_STABLE void U_EXPORT2 +uset_complement(USet* set); + +/** + * Complements in this set all elements contained in the specified + * set. Any character in the other set will be removed if it is + * in this set, or will be added if it is not in this set. + * A frozen set will not be modified. + * + * @param set the set with which to complement + * @param complement set that defines which elements will be xor'ed + * from this set. + * @stable ICU 3.2 + */ +U_STABLE void U_EXPORT2 +uset_complementAll(USet* set, const USet* complement); + +/** + * Removes all of the elements from this set. This set will be + * empty after this call returns. + * A frozen set will not be modified. + * @param set the set + * @stable ICU 2.4 + */ +U_STABLE void U_EXPORT2 +uset_clear(USet* set); + +/** + * Close this set over the given attribute. For the attribute + * USET_CASE, the result is to modify this set so that: + * + * 1. For each character or string 'a' in this set, all strings or + * characters 'b' such that foldCase(a) == foldCase(b) are added + * to this set. + * + * 2. For each string 'e' in the resulting set, if e != + * foldCase(e), 'e' will be removed. + * + * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}] + * + * (Here foldCase(x) refers to the operation u_strFoldCase, and a + * == b denotes that the contents are the same, not pointer + * comparison.) + * + * A frozen set will not be modified. + * + * @param set the set + * + * @param attributes bitmask for attributes to close over. + * Currently only the USET_CASE bit is supported. Any undefined bits + * are ignored. + * @stable ICU 4.2 + */ +U_STABLE void U_EXPORT2 +uset_closeOver(USet* set, int32_t attributes); + +/** + * Remove all strings from this set. + * + * @param set the set + * @stable ICU 4.2 + */ +U_STABLE void U_EXPORT2 +uset_removeAllStrings(USet* set); + +/** + * Returns TRUE if the given USet contains no characters and no + * strings. + * @param set the set + * @return true if set is empty + * @stable ICU 2.4 + */ +U_STABLE UBool U_EXPORT2 +uset_isEmpty(const USet* set); + +/** + * Returns TRUE if the given USet contains the given character. + * This function works faster with a frozen set. + * @param set the set + * @param c The codepoint to check for within the set + * @return true if set contains c + * @stable ICU 2.4 + */ +U_STABLE UBool U_EXPORT2 +uset_contains(const USet* set, UChar32 c); + +/** + * Returns TRUE if the given USet contains all characters c + * where start <= c && c <= end. + * @param set the set + * @param start the first character of the range to test, inclusive + * @param end the last character of the range to test, inclusive + * @return TRUE if set contains the range + * @stable ICU 2.2 + */ +U_STABLE UBool U_EXPORT2 +uset_containsRange(const USet* set, UChar32 start, UChar32 end); + +/** + * Returns TRUE if the given USet contains the given string. + * @param set the set + * @param str the string + * @param strLen the length of the string or -1 if null terminated. + * @return true if set contains str + * @stable ICU 2.4 + */ +U_STABLE UBool U_EXPORT2 +uset_containsString(const USet* set, const UChar* str, int32_t strLen); + +/** + * Returns the index of the given character within this set, where + * the set is ordered by ascending code point. If the character + * is not in this set, return -1. The inverse of this method is + * <code>charAt()</code>. + * @param set the set + * @param c the character to obtain the index for + * @return an index from 0..size()-1, or -1 + * @stable ICU 3.2 + */ +U_STABLE int32_t U_EXPORT2 +uset_indexOf(const USet* set, UChar32 c); + +/** + * Returns the character at the given index within this set, where + * the set is ordered by ascending code point. If the index is + * out of range, return (UChar32)-1. The inverse of this method is + * <code>indexOf()</code>. + * @param set the set + * @param charIndex an index from 0..size()-1 to obtain the char for + * @return the character at the given index, or (UChar32)-1. + * @stable ICU 3.2 + */ +U_STABLE UChar32 U_EXPORT2 +uset_charAt(const USet* set, int32_t charIndex); + +/** + * Returns the number of characters and strings contained in the given + * USet. + * @param set the set + * @return a non-negative integer counting the characters and strings + * contained in set + * @stable ICU 2.4 + */ +U_STABLE int32_t U_EXPORT2 +uset_size(const USet* set); + +/** + * Returns the number of items in this set. An item is either a range + * of characters or a single multicharacter string. + * @param set the set + * @return a non-negative integer counting the character ranges + * and/or strings contained in set + * @stable ICU 2.4 + */ +U_STABLE int32_t U_EXPORT2 +uset_getItemCount(const USet* set); + +/** + * Returns an item of this set. An item is either a range of + * characters or a single multicharacter string. + * @param set the set + * @param itemIndex a non-negative integer in the range 0.. + * uset_getItemCount(set)-1 + * @param start pointer to variable to receive first character + * in range, inclusive + * @param end pointer to variable to receive last character in range, + * inclusive + * @param str buffer to receive the string, may be NULL + * @param strCapacity capacity of str, or 0 if str is NULL + * @param ec error code + * @return the length of the string (>= 2), or 0 if the item is a + * range, in which case it is the range *start..*end, or -1 if + * itemIndex is out of range + * @stable ICU 2.4 + */ +U_STABLE int32_t U_EXPORT2 +uset_getItem(const USet* set, int32_t itemIndex, + UChar32* start, UChar32* end, + UChar* str, int32_t strCapacity, + UErrorCode* ec); + +/** + * Returns true if set1 contains all the characters and strings + * of set2. It answers the question, 'Is set1 a superset of set2?' + * @param set1 set to be checked for containment + * @param set2 set to be checked for containment + * @return true if the test condition is met + * @stable ICU 3.2 + */ +U_STABLE UBool U_EXPORT2 +uset_containsAll(const USet* set1, const USet* set2); + +/** + * Returns true if this set contains all the characters + * of the given string. This is does not check containment of grapheme + * clusters, like uset_containsString. + * @param set set of characters to be checked for containment + * @param str string containing codepoints to be checked for containment + * @param strLen the length of the string or -1 if null terminated. + * @return true if the test condition is met + * @stable ICU 3.4 + */ +U_STABLE UBool U_EXPORT2 +uset_containsAllCodePoints(const USet* set, const UChar *str, int32_t strLen); + +/** + * Returns true if set1 contains none of the characters and strings + * of set2. It answers the question, 'Is set1 a disjoint set of set2?' + * @param set1 set to be checked for containment + * @param set2 set to be checked for containment + * @return true if the test condition is met + * @stable ICU 3.2 + */ +U_STABLE UBool U_EXPORT2 +uset_containsNone(const USet* set1, const USet* set2); + +/** + * Returns true if set1 contains some of the characters and strings + * of set2. It answers the question, 'Does set1 and set2 have an intersection?' + * @param set1 set to be checked for containment + * @param set2 set to be checked for containment + * @return true if the test condition is met + * @stable ICU 3.2 + */ +U_STABLE UBool U_EXPORT2 +uset_containsSome(const USet* set1, const USet* set2); + +/** + * Returns the length of the initial substring of the input string which + * consists only of characters and strings that are contained in this set + * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), + * or only of characters and strings that are not contained + * in this set (USET_SPAN_NOT_CONTAINED). + * See USetSpanCondition for details. + * Similar to the strspn() C library function. + * Unpaired surrogates are treated according to contains() of their surrogate code points. + * This function works faster with a frozen set and with a non-negative string length argument. + * @param set the set + * @param s start of the string + * @param length of the string; can be -1 for NUL-terminated + * @param spanCondition specifies the containment condition + * @return the length of the initial substring according to the spanCondition; + * 0 if the start of the string does not fit the spanCondition + * @stable ICU 3.8 + * @see USetSpanCondition + */ +U_STABLE int32_t U_EXPORT2 +uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition); + +/** + * Returns the start of the trailing substring of the input string which + * consists only of characters and strings that are contained in this set + * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), + * or only of characters and strings that are not contained + * in this set (USET_SPAN_NOT_CONTAINED). + * See USetSpanCondition for details. + * Unpaired surrogates are treated according to contains() of their surrogate code points. + * This function works faster with a frozen set and with a non-negative string length argument. + * @param set the set + * @param s start of the string + * @param length of the string; can be -1 for NUL-terminated + * @param spanCondition specifies the containment condition + * @return the start of the trailing substring according to the spanCondition; + * the string length if the end of the string does not fit the spanCondition + * @stable ICU 3.8 + * @see USetSpanCondition + */ +U_STABLE int32_t U_EXPORT2 +uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition); + +/** + * Returns the length of the initial substring of the input string which + * consists only of characters and strings that are contained in this set + * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), + * or only of characters and strings that are not contained + * in this set (USET_SPAN_NOT_CONTAINED). + * See USetSpanCondition for details. + * Similar to the strspn() C library function. + * Malformed byte sequences are treated according to contains(0xfffd). + * This function works faster with a frozen set and with a non-negative string length argument. + * @param set the set + * @param s start of the string (UTF-8) + * @param length of the string; can be -1 for NUL-terminated + * @param spanCondition specifies the containment condition + * @return the length of the initial substring according to the spanCondition; + * 0 if the start of the string does not fit the spanCondition + * @stable ICU 3.8 + * @see USetSpanCondition + */ +U_STABLE int32_t U_EXPORT2 +uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition); + +/** + * Returns the start of the trailing substring of the input string which + * consists only of characters and strings that are contained in this set + * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), + * or only of characters and strings that are not contained + * in this set (USET_SPAN_NOT_CONTAINED). + * See USetSpanCondition for details. + * Malformed byte sequences are treated according to contains(0xfffd). + * This function works faster with a frozen set and with a non-negative string length argument. + * @param set the set + * @param s start of the string (UTF-8) + * @param length of the string; can be -1 for NUL-terminated + * @param spanCondition specifies the containment condition + * @return the start of the trailing substring according to the spanCondition; + * the string length if the end of the string does not fit the spanCondition + * @stable ICU 3.8 + * @see USetSpanCondition + */ +U_STABLE int32_t U_EXPORT2 +uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition); + +/** + * Returns true if set1 contains all of the characters and strings + * of set2, and vis versa. It answers the question, 'Is set1 equal to set2?' + * @param set1 set to be checked for containment + * @param set2 set to be checked for containment + * @return true if the test condition is met + * @stable ICU 3.2 + */ +U_STABLE UBool U_EXPORT2 +uset_equals(const USet* set1, const USet* set2); + +/********************************************************************* + * Serialized set API + *********************************************************************/ + +/** + * Serializes this set into an array of 16-bit integers. Serialization + * (currently) only records the characters in the set; multicharacter + * strings are ignored. + * + * The array + * has following format (each line is one 16-bit integer): + * + * length = (n+2*m) | (m!=0?0x8000:0) + * bmpLength = n; present if m!=0 + * bmp[0] + * bmp[1] + * ... + * bmp[n-1] + * supp-high[0] + * supp-low[0] + * supp-high[1] + * supp-low[1] + * ... + * supp-high[m-1] + * supp-low[m-1] + * + * The array starts with a header. After the header are n bmp + * code points, then m supplementary code points. Either n or m + * or both may be zero. n+2*m is always <= 0x7FFF. + * + * If there are no supplementary characters (if m==0) then the + * header is one 16-bit integer, 'length', with value n. + * + * If there are supplementary characters (if m!=0) then the header + * is two 16-bit integers. The first, 'length', has value + * (n+2*m)|0x8000. The second, 'bmpLength', has value n. + * + * After the header the code points are stored in ascending order. + * Supplementary code points are stored as most significant 16 + * bits followed by least significant 16 bits. + * + * @param set the set + * @param dest pointer to buffer of destCapacity 16-bit integers. + * May be NULL only if destCapacity is zero. + * @param destCapacity size of dest, or zero. Must not be negative. + * @param pErrorCode pointer to the error code. Will be set to + * U_INDEX_OUTOFBOUNDS_ERROR if n+2*m > 0x7FFF. Will be set to + * U_BUFFER_OVERFLOW_ERROR if n+2*m+(m!=0?2:1) > destCapacity. + * @return the total length of the serialized format, including + * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other + * than U_BUFFER_OVERFLOW_ERROR. + * @stable ICU 2.4 + */ +U_STABLE int32_t U_EXPORT2 +uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* pErrorCode); + +/** + * Given a serialized array, fill in the given serialized set object. + * @param fillSet pointer to result + * @param src pointer to start of array + * @param srcLength length of array + * @return true if the given array is valid, otherwise false + * @stable ICU 2.4 + */ +U_STABLE UBool U_EXPORT2 +uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength); + +/** + * Set the USerializedSet to contain the given character (and nothing + * else). + * @param fillSet pointer to result + * @param c The codepoint to set + * @stable ICU 2.4 + */ +U_STABLE void U_EXPORT2 +uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c); + +/** + * Returns TRUE if the given USerializedSet contains the given + * character. + * @param set the serialized set + * @param c The codepoint to check for within the set + * @return true if set contains c + * @stable ICU 2.4 + */ +U_STABLE UBool U_EXPORT2 +uset_serializedContains(const USerializedSet* set, UChar32 c); + +/** + * Returns the number of disjoint ranges of characters contained in + * the given serialized set. Ignores any strings contained in the + * set. + * @param set the serialized set + * @return a non-negative integer counting the character ranges + * contained in set + * @stable ICU 2.4 + */ +U_STABLE int32_t U_EXPORT2 +uset_getSerializedRangeCount(const USerializedSet* set); + +/** + * Returns a range of characters contained in the given serialized + * set. + * @param set the serialized set + * @param rangeIndex a non-negative integer in the range 0.. + * uset_getSerializedRangeCount(set)-1 + * @param pStart pointer to variable to receive first character + * in range, inclusive + * @param pEnd pointer to variable to receive last character in range, + * inclusive + * @return true if rangeIndex is valid, otherwise false + * @stable ICU 2.4 + */ +U_STABLE UBool U_EXPORT2 +uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex, + UChar32* pStart, UChar32* pEnd); + +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/ustring.h b/src/core/basetypes/icu-ucsdet/include/unicode/ustring.h new file mode 100644 index 00000000..6d141e8d --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/ustring.h @@ -0,0 +1,1700 @@ +/* +********************************************************************** +* Copyright (C) 1998-2014, International Business Machines +* Corporation and others. All Rights Reserved. +********************************************************************** +* +* File ustring.h +* +* Modification History: +* +* Date Name Description +* 12/07/98 bertrand Creation. +****************************************************************************** +*/ + +#ifndef USTRING_H +#define USTRING_H + +#include "unicode/utypes.h" +#include "unicode/putil.h" +#include "unicode/uiter.h" + +/** + * \def UBRK_TYPEDEF_UBREAK_ITERATOR + * @internal + */ + +#ifndef UBRK_TYPEDEF_UBREAK_ITERATOR +# define UBRK_TYPEDEF_UBREAK_ITERATOR +/** Simple declaration for u_strToTitle() to avoid including unicode/ubrk.h. @stable ICU 2.1*/ + typedef struct UBreakIterator UBreakIterator; +#endif + +/** + * \file + * \brief C API: Unicode string handling functions + * + * These C API functions provide general Unicode string handling. + * + * Some functions are equivalent in name, signature, and behavior to the ANSI C <string.h> + * functions. (For example, they do not check for bad arguments like NULL string pointers.) + * In some cases, only the thread-safe variant of such a function is implemented here + * (see u_strtok_r()). + * + * Other functions provide more Unicode-specific functionality like locale-specific + * upper/lower-casing and string comparison in code point order. + * + * ICU uses 16-bit Unicode (UTF-16) in the form of arrays of UChar code units. + * UTF-16 encodes each Unicode code point with either one or two UChar code units. + * (This is the default form of Unicode, and a forward-compatible extension of the original, + * fixed-width form that was known as UCS-2. UTF-16 superseded UCS-2 with Unicode 2.0 + * in 1996.) + * + * Some APIs accept a 32-bit UChar32 value for a single code point. + * + * ICU also handles 16-bit Unicode text with unpaired surrogates. + * Such text is not well-formed UTF-16. + * Code-point-related functions treat unpaired surrogates as surrogate code points, + * i.e., as separate units. + * + * Although UTF-16 is a variable-width encoding form (like some legacy multi-byte encodings), + * it is much more efficient even for random access because the code unit values + * for single-unit characters vs. lead units vs. trail units are completely disjoint. + * This means that it is easy to determine character (code point) boundaries from + * random offsets in the string. + * + * Unicode (UTF-16) string processing is optimized for the single-unit case. + * Although it is important to support supplementary characters + * (which use pairs of lead/trail code units called "surrogates"), + * their occurrence is rare. Almost all characters in modern use require only + * a single UChar code unit (i.e., their code point values are <=0xffff). + * + * For more details see the User Guide Strings chapter (http://icu-project.org/userguide/strings.html). + * For a discussion of the handling of unpaired surrogates see also + * Jitterbug 2145 and its icu mailing list proposal on 2002-sep-18. + */ + +/** + * \defgroup ustring_ustrlen String Length + * \ingroup ustring_strlen + */ +/*@{*/ +/** + * Determine the length of an array of UChar. + * + * @param s The array of UChars, NULL (U+0000) terminated. + * @return The number of UChars in <code>chars</code>, minus the terminator. + * @stable ICU 2.0 + */ +U_STABLE int32_t U_EXPORT2 +u_strlen(const UChar *s); +/*@}*/ + +/** + * Count Unicode code points in the length UChar code units of the string. + * A code point may occupy either one or two UChar code units. + * Counting code points involves reading all code units. + * + * This functions is basically the inverse of the U16_FWD_N() macro (see utf.h). + * + * @param s The input string. + * @param length The number of UChar code units to be checked, or -1 to count all + * code points before the first NUL (U+0000). + * @return The number of code points in the specified code units. + * @stable ICU 2.0 + */ +U_STABLE int32_t U_EXPORT2 +u_countChar32(const UChar *s, int32_t length); + +/** + * Check if the string contains more Unicode code points than a certain number. + * This is more efficient than counting all code points in the entire string + * and comparing that number with a threshold. + * This function may not need to scan the string at all if the length is known + * (not -1 for NUL-termination) and falls within a certain range, and + * never needs to count more than 'number+1' code points. + * Logically equivalent to (u_countChar32(s, length)>number). + * A Unicode code point may occupy either one or two UChar code units. + * + * @param s The input string. + * @param length The length of the string, or -1 if it is NUL-terminated. + * @param number The number of code points in the string is compared against + * the 'number' parameter. + * @return Boolean value for whether the string contains more Unicode code points + * than 'number'. Same as (u_countChar32(s, length)>number). + * @stable ICU 2.4 + */ +U_STABLE UBool U_EXPORT2 +u_strHasMoreChar32Than(const UChar *s, int32_t length, int32_t number); + +/** + * Concatenate two ustrings. Appends a copy of <code>src</code>, + * including the null terminator, to <code>dst</code>. The initial copied + * character from <code>src</code> overwrites the null terminator in <code>dst</code>. + * + * @param dst The destination string. + * @param src The source string. + * @return A pointer to <code>dst</code>. + * @stable ICU 2.0 + */ +U_STABLE UChar* U_EXPORT2 +u_strcat(UChar *dst, + const UChar *src); + +/** + * Concatenate two ustrings. + * Appends at most <code>n</code> characters from <code>src</code> to <code>dst</code>. + * Adds a terminating NUL. + * If src is too long, then only <code>n-1</code> characters will be copied + * before the terminating NUL. + * If <code>n<=0</code> then dst is not modified. + * + * @param dst The destination string. + * @param src The source string (can be NULL/invalid if n<=0). + * @param n The maximum number of characters to append; no-op if <=0. + * @return A pointer to <code>dst</code>. + * @stable ICU 2.0 + */ +U_STABLE UChar* U_EXPORT2 +u_strncat(UChar *dst, + const UChar *src, + int32_t n); + +/** + * Find the first occurrence of a substring in a string. + * The substring is found at code point boundaries. + * That means that if the substring begins with + * a trail surrogate or ends with a lead surrogate, + * then it is found only if these surrogates stand alone in the text. + * Otherwise, the substring edge units would be matched against + * halves of surrogate pairs. + * + * @param s The string to search (NUL-terminated). + * @param substring The substring to find (NUL-terminated). + * @return A pointer to the first occurrence of <code>substring</code> in <code>s</code>, + * or <code>s</code> itself if the <code>substring</code> is empty, + * or <code>NULL</code> if <code>substring</code> is not in <code>s</code>. + * @stable ICU 2.0 + * + * @see u_strrstr + * @see u_strFindFirst + * @see u_strFindLast + */ +U_STABLE UChar * U_EXPORT2 +u_strstr(const UChar *s, const UChar *substring); + +/** + * Find the first occurrence of a substring in a string. + * The substring is found at code point boundaries. + * That means that if the substring begins with + * a trail surrogate or ends with a lead surrogate, + * then it is found only if these surrogates stand alone in the text. + * Otherwise, the substring edge units would be matched against + * halves of surrogate pairs. + * + * @param s The string to search. + * @param length The length of s (number of UChars), or -1 if it is NUL-terminated. + * @param substring The substring to find (NUL-terminated). + * @param subLength The length of substring (number of UChars), or -1 if it is NUL-terminated. + * @return A pointer to the first occurrence of <code>substring</code> in <code>s</code>, + * or <code>s</code> itself if the <code>substring</code> is empty, + * or <code>NULL</code> if <code>substring</code> is not in <code>s</code>. + * @stable ICU 2.4 + * + * @see u_strstr + * @see u_strFindLast + */ +U_STABLE UChar * U_EXPORT2 +u_strFindFirst(const UChar *s, int32_t length, const UChar *substring, int32_t subLength); + +/** + * Find the first occurrence of a BMP code point in a string. + * A surrogate code point is found only if its match in the text is not + * part of a surrogate pair. + * A NUL character is found at the string terminator. + * + * @param s The string to search (NUL-terminated). + * @param c The BMP code point to find. + * @return A pointer to the first occurrence of <code>c</code> in <code>s</code> + * or <code>NULL</code> if <code>c</code> is not in <code>s</code>. + * @stable ICU 2.0 + * + * @see u_strchr32 + * @see u_memchr + * @see u_strstr + * @see u_strFindFirst + */ +U_STABLE UChar * U_EXPORT2 +u_strchr(const UChar *s, UChar c); + +/** + * Find the first occurrence of a code point in a string. + * A surrogate code point is found only if its match in the text is not + * part of a surrogate pair. + * A NUL character is found at the string terminator. + * + * @param s The string to search (NUL-terminated). + * @param c The code point to find. + * @return A pointer to the first occurrence of <code>c</code> in <code>s</code> + * or <code>NULL</code> if <code>c</code> is not in <code>s</code>. + * @stable ICU 2.0 + * + * @see u_strchr + * @see u_memchr32 + * @see u_strstr + * @see u_strFindFirst + */ +U_STABLE UChar * U_EXPORT2 +u_strchr32(const UChar *s, UChar32 c); + +/** + * Find the last occurrence of a substring in a string. + * The substring is found at code point boundaries. + * That means that if the substring begins with + * a trail surrogate or ends with a lead surrogate, + * then it is found only if these surrogates stand alone in the text. + * Otherwise, the substring edge units would be matched against + * halves of surrogate pairs. + * + * @param s The string to search (NUL-terminated). + * @param substring The substring to find (NUL-terminated). + * @return A pointer to the last occurrence of <code>substring</code> in <code>s</code>, + * or <code>s</code> itself if the <code>substring</code> is empty, + * or <code>NULL</code> if <code>substring</code> is not in <code>s</code>. + * @stable ICU 2.4 + * + * @see u_strstr + * @see u_strFindFirst + * @see u_strFindLast + */ +U_STABLE UChar * U_EXPORT2 +u_strrstr(const UChar *s, const UChar *substring); + +/** + * Find the last occurrence of a substring in a string. + * The substring is found at code point boundaries. + * That means that if the substring begins with + * a trail surrogate or ends with a lead surrogate, + * then it is found only if these surrogates stand alone in the text. + * Otherwise, the substring edge units would be matched against + * halves of surrogate pairs. + * + * @param s The string to search. + * @param length The length of s (number of UChars), or -1 if it is NUL-terminated. + * @param substring The substring to find (NUL-terminated). + * @param subLength The length of substring (number of UChars), or -1 if it is NUL-terminated. + * @return A pointer to the last occurrence of <code>substring</code> in <code>s</code>, + * or <code>s</code> itself if the <code>substring</code> is empty, + * or <code>NULL</code> if <code>substring</code> is not in <code>s</code>. + * @stable ICU 2.4 + * + * @see u_strstr + * @see u_strFindLast + */ +U_STABLE UChar * U_EXPORT2 +u_strFindLast(const UChar *s, int32_t length, const UChar *substring, int32_t subLength); + +/** + * Find the last occurrence of a BMP code point in a string. + * A surrogate code point is found only if its match in the text is not + * part of a surrogate pair. + * A NUL character is found at the string terminator. + * + * @param s The string to search (NUL-terminated). + * @param c The BMP code point to find. + * @return A pointer to the last occurrence of <code>c</code> in <code>s</code> + * or <code>NULL</code> if <code>c</code> is not in <code>s</code>. + * @stable ICU 2.4 + * + * @see u_strrchr32 + * @see u_memrchr + * @see u_strrstr + * @see u_strFindLast + */ +U_STABLE UChar * U_EXPORT2 +u_strrchr(const UChar *s, UChar c); + +/** + * Find the last occurrence of a code point in a string. + * A surrogate code point is found only if its match in the text is not + * part of a surrogate pair. + * A NUL character is found at the string terminator. + * + * @param s The string to search (NUL-terminated). + * @param c The code point to find. + * @return A pointer to the last occurrence of <code>c</code> in <code>s</code> + * or <code>NULL</code> if <code>c</code> is not in <code>s</code>. + * @stable ICU 2.4 + * + * @see u_strrchr + * @see u_memchr32 + * @see u_strrstr + * @see u_strFindLast + */ +U_STABLE UChar * U_EXPORT2 +u_strrchr32(const UChar *s, UChar32 c); + +/** + * Locates the first occurrence in the string <code>string</code> of any of the characters + * in the string <code>matchSet</code>. + * Works just like C's strpbrk but with Unicode. + * + * @param string The string in which to search, NUL-terminated. + * @param matchSet A NUL-terminated string defining a set of code points + * for which to search in the text string. + * @return A pointer to the character in <code>string</code> that matches one of the + * characters in <code>matchSet</code>, or NULL if no such character is found. + * @stable ICU 2.0 + */ +U_STABLE UChar * U_EXPORT2 +u_strpbrk(const UChar *string, const UChar *matchSet); + +/** + * Returns the number of consecutive characters in <code>string</code>, + * beginning with the first, that do not occur somewhere in <code>matchSet</code>. + * Works just like C's strcspn but with Unicode. + * + * @param string The string in which to search, NUL-terminated. + * @param matchSet A NUL-terminated string defining a set of code points + * for which to search in the text string. + * @return The number of initial characters in <code>string</code> that do not + * occur in <code>matchSet</code>. + * @see u_strspn + * @stable ICU 2.0 + */ +U_STABLE int32_t U_EXPORT2 +u_strcspn(const UChar *string, const UChar *matchSet); + +/** + * Returns the number of consecutive characters in <code>string</code>, + * beginning with the first, that occur somewhere in <code>matchSet</code>. + * Works just like C's strspn but with Unicode. + * + * @param string The string in which to search, NUL-terminated. + * @param matchSet A NUL-terminated string defining a set of code points + * for which to search in the text string. + * @return The number of initial characters in <code>string</code> that do + * occur in <code>matchSet</code>. + * @see u_strcspn + * @stable ICU 2.0 + */ +U_STABLE int32_t U_EXPORT2 +u_strspn(const UChar *string, const UChar *matchSet); + +/** + * The string tokenizer API allows an application to break a string into + * tokens. Unlike strtok(), the saveState (the current pointer within the + * original string) is maintained in saveState. In the first call, the + * argument src is a pointer to the string. In subsequent calls to + * return successive tokens of that string, src must be specified as + * NULL. The value saveState is set by this function to maintain the + * function's position within the string, and on each subsequent call + * you must give this argument the same variable. This function does + * handle surrogate pairs. This function is similar to the strtok_r() + * the POSIX Threads Extension (1003.1c-1995) version. + * + * @param src String containing token(s). This string will be modified. + * After the first call to u_strtok_r(), this argument must + * be NULL to get to the next token. + * @param delim Set of delimiter characters (Unicode code points). + * @param saveState The current pointer within the original string, + * which is set by this function. The saveState + * parameter should the address of a local variable of type + * UChar *. (i.e. defined "Uhar *myLocalSaveState" and use + * &myLocalSaveState for this parameter). + * @return A pointer to the next token found in src, or NULL + * when there are no more tokens. + * @stable ICU 2.0 + */ +U_STABLE UChar * U_EXPORT2 +u_strtok_r(UChar *src, + const UChar *delim, + UChar **saveState); + +/** + * Compare two Unicode strings for bitwise equality (code unit order). + * + * @param s1 A string to compare. + * @param s2 A string to compare. + * @return 0 if <code>s1</code> and <code>s2</code> are bitwise equal; a negative + * value if <code>s1</code> is bitwise less than <code>s2,</code>; a positive + * value if <code>s1</code> is bitwise greater than <code>s2</code>. + * @stable ICU 2.0 + */ +U_STABLE int32_t U_EXPORT2 +u_strcmp(const UChar *s1, + const UChar *s2); + +/** + * Compare two Unicode strings in code point order. + * See u_strCompare for details. + * + * @param s1 A string to compare. + * @param s2 A string to compare. + * @return a negative/zero/positive integer corresponding to whether + * the first string is less than/equal to/greater than the second one + * in code point order + * @stable ICU 2.0 + */ +U_STABLE int32_t U_EXPORT2 +u_strcmpCodePointOrder(const UChar *s1, const UChar *s2); + +/** + * Compare two Unicode strings (binary order). + * + * The comparison can be done in code unit order or in code point order. + * They differ only in UTF-16 when + * comparing supplementary code points (U+10000..U+10ffff) + * to BMP code points near the end of the BMP (i.e., U+e000..U+ffff). + * In code unit order, high BMP code points sort after supplementary code points + * because they are stored as pairs of surrogates which are at U+d800..U+dfff. + * + * This functions works with strings of different explicitly specified lengths + * unlike the ANSI C-like u_strcmp() and u_memcmp() etc. + * NUL-terminated strings are possible with length arguments of -1. + * + * @param s1 First source string. + * @param length1 Length of first source string, or -1 if NUL-terminated. + * + * @param s2 Second source string. + * @param length2 Length of second source string, or -1 if NUL-terminated. + * + * @param codePointOrder Choose between code unit order (FALSE) + * and code point order (TRUE). + * + * @return <0 or 0 or >0 as usual for string comparisons + * + * @stable ICU 2.2 + */ +U_STABLE int32_t U_EXPORT2 +u_strCompare(const UChar *s1, int32_t length1, + const UChar *s2, int32_t length2, + UBool codePointOrder); + +/** + * Compare two Unicode strings (binary order) + * as presented by UCharIterator objects. + * Works otherwise just like u_strCompare(). + * + * Both iterators are reset to their start positions. + * When the function returns, it is undefined where the iterators + * have stopped. + * + * @param iter1 First source string iterator. + * @param iter2 Second source string iterator. + * @param codePointOrder Choose between code unit order (FALSE) + * and code point order (TRUE). + * + * @return <0 or 0 or >0 as usual for string comparisons + * + * @see u_strCompare + * + * @stable ICU 2.6 + */ +U_STABLE int32_t U_EXPORT2 +u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder); + +#ifndef U_COMPARE_CODE_POINT_ORDER +/* see also unistr.h and unorm.h */ +/** + * Option bit for u_strCaseCompare, u_strcasecmp, unorm_compare, etc: + * Compare strings in code point order instead of code unit order. + * @stable ICU 2.2 + */ +#define U_COMPARE_CODE_POINT_ORDER 0x8000 +#endif + +/** + * Compare two strings case-insensitively using full case folding. + * This is equivalent to + * u_strCompare(u_strFoldCase(s1, options), + * u_strFoldCase(s2, options), + * (options&U_COMPARE_CODE_POINT_ORDER)!=0). + * + * The comparison can be done in UTF-16 code unit order or in code point order. + * They differ only when comparing supplementary code points (U+10000..U+10ffff) + * to BMP code points near the end of the BMP (i.e., U+e000..U+ffff). + * In code unit order, high BMP code points sort after supplementary code points + * because they are stored as pairs of surrogates which are at U+d800..U+dfff. + * + * This functions works with strings of different explicitly specified lengths + * unlike the ANSI C-like u_strcmp() and u_memcmp() etc. + * NUL-terminated strings are possible with length arguments of -1. + * + * @param s1 First source string. + * @param length1 Length of first source string, or -1 if NUL-terminated. + * + * @param s2 Second source string. + * @param length2 Length of second source string, or -1 if NUL-terminated. + * + * @param options A bit set of options: + * - U_FOLD_CASE_DEFAULT or 0 is used for default options: + * Comparison in code unit order with default case folding. + * + * - U_COMPARE_CODE_POINT_ORDER + * Set to choose code point order instead of code unit order + * (see u_strCompare for details). + * + * - U_FOLD_CASE_EXCLUDE_SPECIAL_I + * + * @param pErrorCode Must be a valid pointer to an error code value, + * which must not indicate a failure before the function call. + * + * @return <0 or 0 or >0 as usual for string comparisons + * + * @stable ICU 2.2 + */ +U_STABLE int32_t U_EXPORT2 +u_strCaseCompare(const UChar *s1, int32_t length1, + const UChar *s2, int32_t length2, + uint32_t options, + UErrorCode *pErrorCode); + +/** + * Compare two ustrings for bitwise equality. + * Compares at most <code>n</code> characters. + * + * @param ucs1 A string to compare (can be NULL/invalid if n<=0). + * @param ucs2 A string to compare (can be NULL/invalid if n<=0). + * @param n The maximum number of characters to compare; always returns 0 if n<=0. + * @return 0 if <code>s1</code> and <code>s2</code> are bitwise equal; a negative + * value if <code>s1</code> is bitwise less than <code>s2</code>; a positive + * value if <code>s1</code> is bitwise greater than <code>s2</code>. + * @stable ICU 2.0 + */ +U_STABLE int32_t U_EXPORT2 +u_strncmp(const UChar *ucs1, + const UChar *ucs2, + int32_t n); + +/** + * Compare two Unicode strings in code point order. + * This is different in UTF-16 from u_strncmp() if supplementary characters are present. + * For details, see u_strCompare(). + * + * @param s1 A string to compare. + * @param s2 A string to compare. + * @param n The maximum number of characters to compare. + * @return a negative/zero/positive integer corresponding to whether + * the first string is less than/equal to/greater than the second one + * in code point order + * @stable ICU 2.0 + */ +U_STABLE int32_t U_EXPORT2 +u_strncmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t n); + +/** + * Compare two strings case-insensitively using full case folding. + * This is equivalent to u_strcmp(u_strFoldCase(s1, options), u_strFoldCase(s2, options)). + * + * @param s1 A string to compare. + * @param s2 A string to compare. + * @param options A bit set of options: + * - U_FOLD_CASE_DEFAULT or 0 is used for default options: + * Comparison in code unit order with default case folding. + * + * - U_COMPARE_CODE_POINT_ORDER + * Set to choose code point order instead of code unit order + * (see u_strCompare for details). + * + * - U_FOLD_CASE_EXCLUDE_SPECIAL_I + * + * @return A negative, zero, or positive integer indicating the comparison result. + * @stable ICU 2.0 + */ +U_STABLE int32_t U_EXPORT2 +u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options); + +/** + * Compare two strings case-insensitively using full case folding. + * This is equivalent to u_strcmp(u_strFoldCase(s1, at most n, options), + * u_strFoldCase(s2, at most n, options)). + * + * @param s1 A string to compare. + * @param s2 A string to compare. + * @param n The maximum number of characters each string to case-fold and then compare. + * @param options A bit set of options: + * - U_FOLD_CASE_DEFAULT or 0 is used for default options: + * Comparison in code unit order with default case folding. + * + * - U_COMPARE_CODE_POINT_ORDER + * Set to choose code point order instead of code unit order + * (see u_strCompare for details). + * + * - U_FOLD_CASE_EXCLUDE_SPECIAL_I + * + * @return A negative, zero, or positive integer indicating the comparison result. + * @stable ICU 2.0 + */ +U_STABLE int32_t U_EXPORT2 +u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options); + +/** + * Compare two strings case-insensitively using full case folding. + * This is equivalent to u_strcmp(u_strFoldCase(s1, n, options), + * u_strFoldCase(s2, n, options)). + * + * @param s1 A string to compare. + * @param s2 A string to compare. + * @param length The number of characters in each string to case-fold and then compare. + * @param options A bit set of options: + * - U_FOLD_CASE_DEFAULT or 0 is used for default options: + * Comparison in code unit order with default case folding. + * + * - U_COMPARE_CODE_POINT_ORDER + * Set to choose code point order instead of code unit order + * (see u_strCompare for details). + * + * - U_FOLD_CASE_EXCLUDE_SPECIAL_I + * + * @return A negative, zero, or positive integer indicating the comparison result. + * @stable ICU 2.0 + */ +U_STABLE int32_t U_EXPORT2 +u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options); + +/** + * Copy a ustring. Adds a null terminator. + * + * @param dst The destination string. + * @param src The source string. + * @return A pointer to <code>dst</code>. + * @stable ICU 2.0 + */ +U_STABLE UChar* U_EXPORT2 +u_strcpy(UChar *dst, + const UChar *src); + +/** + * Copy a ustring. + * Copies at most <code>n</code> characters. The result will be null terminated + * if the length of <code>src</code> is less than <code>n</code>. + * + * @param dst The destination string. + * @param src The source string (can be NULL/invalid if n<=0). + * @param n The maximum number of characters to copy; no-op if <=0. + * @return A pointer to <code>dst</code>. + * @stable ICU 2.0 + */ +U_STABLE UChar* U_EXPORT2 +u_strncpy(UChar *dst, + const UChar *src, + int32_t n); + +#if !UCONFIG_NO_CONVERSION + +/** + * Copy a byte string encoded in the default codepage to a ustring. + * Adds a null terminator. + * Performs a host byte to UChar conversion + * + * @param dst The destination string. + * @param src The source string. + * @return A pointer to <code>dst</code>. + * @stable ICU 2.0 + */ +U_STABLE UChar* U_EXPORT2 u_uastrcpy(UChar *dst, + const char *src ); + +/** + * Copy a byte string encoded in the default codepage to a ustring. + * Copies at most <code>n</code> characters. The result will be null terminated + * if the length of <code>src</code> is less than <code>n</code>. + * Performs a host byte to UChar conversion + * + * @param dst The destination string. + * @param src The source string. + * @param n The maximum number of characters to copy. + * @return A pointer to <code>dst</code>. + * @stable ICU 2.0 + */ +U_STABLE UChar* U_EXPORT2 u_uastrncpy(UChar *dst, + const char *src, + int32_t n); + +/** + * Copy ustring to a byte string encoded in the default codepage. + * Adds a null terminator. + * Performs a UChar to host byte conversion + * + * @param dst The destination string. + * @param src The source string. + * @return A pointer to <code>dst</code>. + * @stable ICU 2.0 + */ +U_STABLE char* U_EXPORT2 u_austrcpy(char *dst, + const UChar *src ); + +/** + * Copy ustring to a byte string encoded in the default codepage. + * Copies at most <code>n</code> characters. The result will be null terminated + * if the length of <code>src</code> is less than <code>n</code>. + * Performs a UChar to host byte conversion + * + * @param dst The destination string. + * @param src The source string. + * @param n The maximum number of characters to copy. + * @return A pointer to <code>dst</code>. + * @stable ICU 2.0 + */ +U_STABLE char* U_EXPORT2 u_austrncpy(char *dst, + const UChar *src, + int32_t n ); + +#endif + +/** + * Synonym for memcpy(), but with UChars only. + * @param dest The destination string + * @param src The source string (can be NULL/invalid if count<=0) + * @param count The number of characters to copy; no-op if <=0 + * @return A pointer to <code>dest</code> + * @stable ICU 2.0 + */ +U_STABLE UChar* U_EXPORT2 +u_memcpy(UChar *dest, const UChar *src, int32_t count); + +/** + * Synonym for memmove(), but with UChars only. + * @param dest The destination string + * @param src The source string (can be NULL/invalid if count<=0) + * @param count The number of characters to move; no-op if <=0 + * @return A pointer to <code>dest</code> + * @stable ICU 2.0 + */ +U_STABLE UChar* U_EXPORT2 +u_memmove(UChar *dest, const UChar *src, int32_t count); + +/** + * Initialize <code>count</code> characters of <code>dest</code> to <code>c</code>. + * + * @param dest The destination string. + * @param c The character to initialize the string. + * @param count The maximum number of characters to set. + * @return A pointer to <code>dest</code>. + * @stable ICU 2.0 + */ +U_STABLE UChar* U_EXPORT2 +u_memset(UChar *dest, UChar c, int32_t count); + +/** + * Compare the first <code>count</code> UChars of each buffer. + * + * @param buf1 The first string to compare. + * @param buf2 The second string to compare. + * @param count The maximum number of UChars to compare. + * @return When buf1 < buf2, a negative number is returned. + * When buf1 == buf2, 0 is returned. + * When buf1 > buf2, a positive number is returned. + * @stable ICU 2.0 + */ +U_STABLE int32_t U_EXPORT2 +u_memcmp(const UChar *buf1, const UChar *buf2, int32_t count); + +/** + * Compare two Unicode strings in code point order. + * This is different in UTF-16 from u_memcmp() if supplementary characters are present. + * For details, see u_strCompare(). + * + * @param s1 A string to compare. + * @param s2 A string to compare. + * @param count The maximum number of characters to compare. + * @return a negative/zero/positive integer corresponding to whether + * the first string is less than/equal to/greater than the second one + * in code point order + * @stable ICU 2.0 + */ +U_STABLE int32_t U_EXPORT2 +u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count); + +/** + * Find the first occurrence of a BMP code point in a string. + * A surrogate code point is found only if its match in the text is not + * part of a surrogate pair. + * A NUL character is found at the string terminator. + * + * @param s The string to search (contains <code>count</code> UChars). + * @param c The BMP code point to find. + * @param count The length of the string. + * @return A pointer to the first occurrence of <code>c</code> in <code>s</code> + * or <code>NULL</code> if <code>c</code> is not in <code>s</code>. + * @stable ICU 2.0 + * + * @see u_strchr + * @see u_memchr32 + * @see u_strFindFirst + */ +U_STABLE UChar* U_EXPORT2 +u_memchr(const UChar *s, UChar c, int32_t count); + +/** + * Find the first occurrence of a code point in a string. + * A surrogate code point is found only if its match in the text is not + * part of a surrogate pair. + * A NUL character is found at the string terminator. + * + * @param s The string to search (contains <code>count</code> UChars). + * @param c The code point to find. + * @param count The length of the string. + * @return A pointer to the first occurrence of <code>c</code> in <code>s</code> + * or <code>NULL</code> if <code>c</code> is not in <code>s</code>. + * @stable ICU 2.0 + * + * @see u_strchr32 + * @see u_memchr + * @see u_strFindFirst + */ +U_STABLE UChar* U_EXPORT2 +u_memchr32(const UChar *s, UChar32 c, int32_t count); + +/** + * Find the last occurrence of a BMP code point in a string. + * A surrogate code point is found only if its match in the text is not + * part of a surrogate pair. + * A NUL character is found at the string terminator. + * + * @param s The string to search (contains <code>count</code> UChars). + * @param c The BMP code point to find. + * @param count The length of the string. + * @return A pointer to the last occurrence of <code>c</code> in <code>s</code> + * or <code>NULL</code> if <code>c</code> is not in <code>s</code>. + * @stable ICU 2.4 + * + * @see u_strrchr + * @see u_memrchr32 + * @see u_strFindLast + */ +U_STABLE UChar* U_EXPORT2 +u_memrchr(const UChar *s, UChar c, int32_t count); + +/** + * Find the last occurrence of a code point in a string. + * A surrogate code point is found only if its match in the text is not + * part of a surrogate pair. + * A NUL character is found at the string terminator. + * + * @param s The string to search (contains <code>count</code> UChars). + * @param c The code point to find. + * @param count The length of the string. + * @return A pointer to the last occurrence of <code>c</code> in <code>s</code> + * or <code>NULL</code> if <code>c</code> is not in <code>s</code>. + * @stable ICU 2.4 + * + * @see u_strrchr32 + * @see u_memrchr + * @see u_strFindLast + */ +U_STABLE UChar* U_EXPORT2 +u_memrchr32(const UChar *s, UChar32 c, int32_t count); + +/** + * Unicode String literals in C. + * We need one macro to declare a variable for the string + * and to statically preinitialize it if possible, + * and a second macro to dynamically intialize such a string variable if necessary. + * + * The macros are defined for maximum performance. + * They work only for strings that contain "invariant characters", i.e., + * only latin letters, digits, and some punctuation. + * See utypes.h for details. + * + * A pair of macros for a single string must be used with the same + * parameters. + * The string parameter must be a C string literal. + * The length of the string, not including the terminating + * <code>NUL</code>, must be specified as a constant. + * The U_STRING_DECL macro should be invoked exactly once for one + * such string variable before it is used. + * + * Usage: + * <pre> + * U_STRING_DECL(ustringVar1, "Quick-Fox 2", 11); + * U_STRING_DECL(ustringVar2, "jumps 5%", 8); + * static UBool didInit=FALSE; + * + * int32_t function() { + * if(!didInit) { + * U_STRING_INIT(ustringVar1, "Quick-Fox 2", 11); + * U_STRING_INIT(ustringVar2, "jumps 5%", 8); + * didInit=TRUE; + * } + * return u_strcmp(ustringVar1, ustringVar2); + * } + * </pre> + * + * Note that the macros will NOT consistently work if their argument is another <code>#define</code>. + * The following will not work on all platforms, don't use it. + * + * <pre> + * #define GLUCK "Mr. Gluck" + * U_STRING_DECL(var, GLUCK, 9) + * U_STRING_INIT(var, GLUCK, 9) + * </pre> + * + * Instead, use the string literal "Mr. Gluck" as the argument to both macro + * calls. + * + * + * @stable ICU 2.0 + */ +#if defined(U_DECLARE_UTF16) +# define U_STRING_DECL(var, cs, length) static const UChar *var=(const UChar *)U_DECLARE_UTF16(cs) + /**@stable ICU 2.0 */ +# define U_STRING_INIT(var, cs, length) +#elif U_SIZEOF_WCHAR_T==U_SIZEOF_UCHAR && (U_CHARSET_FAMILY==U_ASCII_FAMILY || (U_SIZEOF_UCHAR == 2 && defined(U_WCHAR_IS_UTF16))) +# define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]=L ## cs + /**@stable ICU 2.0 */ +# define U_STRING_INIT(var, cs, length) +#elif U_SIZEOF_UCHAR==1 && U_CHARSET_FAMILY==U_ASCII_FAMILY +# define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]=cs + /**@stable ICU 2.0 */ +# define U_STRING_INIT(var, cs, length) +#else +# define U_STRING_DECL(var, cs, length) static UChar var[(length)+1] + /**@stable ICU 2.0 */ +# define U_STRING_INIT(var, cs, length) u_charsToUChars(cs, var, length+1) +#endif + +/** + * Unescape a string of characters and write the resulting + * Unicode characters to the destination buffer. The following escape + * sequences are recognized: + * + * \\uhhhh 4 hex digits; h in [0-9A-Fa-f] + * \\Uhhhhhhhh 8 hex digits + * \\xhh 1-2 hex digits + * \\x{h...} 1-8 hex digits + * \\ooo 1-3 octal digits; o in [0-7] + * \\cX control-X; X is masked with 0x1F + * + * as well as the standard ANSI C escapes: + * + * \\a => U+0007, \\b => U+0008, \\t => U+0009, \\n => U+000A, + * \\v => U+000B, \\f => U+000C, \\r => U+000D, \\e => U+001B, + * \\" => U+0022, \\' => U+0027, \\? => U+003F, \\\\ => U+005C + * + * Anything else following a backslash is generically escaped. For + * example, "[a\\-z]" returns "[a-z]". + * + * If an escape sequence is ill-formed, this method returns an empty + * string. An example of an ill-formed sequence is "\\u" followed by + * fewer than 4 hex digits. + * + * The above characters are recognized in the compiler's codepage, + * that is, they are coded as 'u', '\\', etc. Characters that are + * not parts of escape sequences are converted using u_charsToUChars(). + * + * This function is similar to UnicodeString::unescape() but not + * identical to it. The latter takes a source UnicodeString, so it + * does escape recognition but no conversion. + * + * @param src a zero-terminated string of invariant characters + * @param dest pointer to buffer to receive converted and unescaped + * text and, if there is room, a zero terminator. May be NULL for + * preflighting, in which case no UChars will be written, but the + * return value will still be valid. On error, an empty string is + * stored here (if possible). + * @param destCapacity the number of UChars that may be written at + * dest. Ignored if dest == NULL. + * @return the length of unescaped string. + * @see u_unescapeAt + * @see UnicodeString#unescape() + * @see UnicodeString#unescapeAt() + * @stable ICU 2.0 + */ +U_STABLE int32_t U_EXPORT2 +u_unescape(const char *src, + UChar *dest, int32_t destCapacity); + +U_CDECL_BEGIN +/** + * Callback function for u_unescapeAt() that returns a character of + * the source text given an offset and a context pointer. The context + * pointer will be whatever is passed into u_unescapeAt(). + * + * @param offset pointer to the offset that will be passed to u_unescapeAt(). + * @param context an opaque pointer passed directly into u_unescapeAt() + * @return the character represented by the escape sequence at + * offset + * @see u_unescapeAt + * @stable ICU 2.0 + */ +typedef UChar (U_CALLCONV *UNESCAPE_CHAR_AT)(int32_t offset, void *context); +U_CDECL_END + +/** + * Unescape a single sequence. The character at offset-1 is assumed + * (without checking) to be a backslash. This method takes a callback + * pointer to a function that returns the UChar at a given offset. By + * varying this callback, ICU functions are able to unescape char* + * strings, UnicodeString objects, and UFILE pointers. + * + * If offset is out of range, or if the escape sequence is ill-formed, + * (UChar32)0xFFFFFFFF is returned. See documentation of u_unescape() + * for a list of recognized sequences. + * + * @param charAt callback function that returns a UChar of the source + * text given an offset and a context pointer. + * @param offset pointer to the offset that will be passed to charAt. + * The offset value will be updated upon return to point after the + * last parsed character of the escape sequence. On error the offset + * is unchanged. + * @param length the number of characters in the source text. The + * last character of the source text is considered to be at offset + * length-1. + * @param context an opaque pointer passed directly into charAt. + * @return the character represented by the escape sequence at + * offset, or (UChar32)0xFFFFFFFF on error. + * @see u_unescape() + * @see UnicodeString#unescape() + * @see UnicodeString#unescapeAt() + * @stable ICU 2.0 + */ +U_STABLE UChar32 U_EXPORT2 +u_unescapeAt(UNESCAPE_CHAR_AT charAt, + int32_t *offset, + int32_t length, + void *context); + +/** + * Uppercase the characters in a string. + * Casing is locale-dependent and context-sensitive. + * The result may be longer or shorter than the original. + * The source string and the destination buffer are allowed to overlap. + * + * @param dest A buffer for the result string. The result will be zero-terminated if + * the buffer is large enough. + * @param destCapacity The size of the buffer (number of UChars). If it is 0, then + * dest may be NULL and the function will only return the length of the result + * without writing any of the result string. + * @param src The original string + * @param srcLength The length of the original string. If -1, then src must be zero-terminated. + * @param locale The locale to consider, or "" for the root locale or NULL for the default locale. + * @param pErrorCode Must be a valid pointer to an error code value, + * which must not indicate a failure before the function call. + * @return The length of the result string. It may be greater than destCapacity. In that case, + * only some of the result was written to the destination buffer. + * @stable ICU 2.0 + */ +U_STABLE int32_t U_EXPORT2 +u_strToUpper(UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + const char *locale, + UErrorCode *pErrorCode); + +/** + * Lowercase the characters in a string. + * Casing is locale-dependent and context-sensitive. + * The result may be longer or shorter than the original. + * The source string and the destination buffer are allowed to overlap. + * + * @param dest A buffer for the result string. The result will be zero-terminated if + * the buffer is large enough. + * @param destCapacity The size of the buffer (number of UChars). If it is 0, then + * dest may be NULL and the function will only return the length of the result + * without writing any of the result string. + * @param src The original string + * @param srcLength The length of the original string. If -1, then src must be zero-terminated. + * @param locale The locale to consider, or "" for the root locale or NULL for the default locale. + * @param pErrorCode Must be a valid pointer to an error code value, + * which must not indicate a failure before the function call. + * @return The length of the result string. It may be greater than destCapacity. In that case, + * only some of the result was written to the destination buffer. + * @stable ICU 2.0 + */ +U_STABLE int32_t U_EXPORT2 +u_strToLower(UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + const char *locale, + UErrorCode *pErrorCode); + +#if !UCONFIG_NO_BREAK_ITERATION + +/** + * Titlecase a string. + * Casing is locale-dependent and context-sensitive. + * Titlecasing uses a break iterator to find the first characters of words + * that are to be titlecased. It titlecases those characters and lowercases + * all others. + * + * The titlecase break iterator can be provided to customize for arbitrary + * styles, using rules and dictionaries beyond the standard iterators. + * It may be more efficient to always provide an iterator to avoid + * opening and closing one for each string. + * The standard titlecase iterator for the root locale implements the + * algorithm of Unicode TR 21. + * + * This function uses only the setText(), first() and next() methods of the + * provided break iterator. + * + * The result may be longer or shorter than the original. + * The source string and the destination buffer are allowed to overlap. + * + * @param dest A buffer for the result string. The result will be zero-terminated if + * the buffer is large enough. + * @param destCapacity The size of the buffer (number of UChars). If it is 0, then + * dest may be NULL and the function will only return the length of the result + * without writing any of the result string. + * @param src The original string + * @param srcLength The length of the original string. If -1, then src must be zero-terminated. + * @param titleIter A break iterator to find the first characters of words + * that are to be titlecased. + * If none is provided (NULL), then a standard titlecase + * break iterator is opened. + * @param locale The locale to consider, or "" for the root locale or NULL for the default locale. + * @param pErrorCode Must be a valid pointer to an error code value, + * which must not indicate a failure before the function call. + * @return The length of the result string. It may be greater than destCapacity. In that case, + * only some of the result was written to the destination buffer. + * @stable ICU 2.1 + */ +U_STABLE int32_t U_EXPORT2 +u_strToTitle(UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + UBreakIterator *titleIter, + const char *locale, + UErrorCode *pErrorCode); + +#endif + +/** + * Case-folds the characters in a string. + * + * Case-folding is locale-independent and not context-sensitive, + * but there is an option for whether to include or exclude mappings for dotted I + * and dotless i that are marked with 'T' in CaseFolding.txt. + * + * The result may be longer or shorter than the original. + * The source string and the destination buffer are allowed to overlap. + * + * @param dest A buffer for the result string. The result will be zero-terminated if + * the buffer is large enough. + * @param destCapacity The size of the buffer (number of UChars). If it is 0, then + * dest may be NULL and the function will only return the length of the result + * without writing any of the result string. + * @param src The original string + * @param srcLength The length of the original string. If -1, then src must be zero-terminated. + * @param options Either U_FOLD_CASE_DEFAULT or U_FOLD_CASE_EXCLUDE_SPECIAL_I + * @param pErrorCode Must be a valid pointer to an error code value, + * which must not indicate a failure before the function call. + * @return The length of the result string. It may be greater than destCapacity. In that case, + * only some of the result was written to the destination buffer. + * @stable ICU 2.0 + */ +U_STABLE int32_t U_EXPORT2 +u_strFoldCase(UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + uint32_t options, + UErrorCode *pErrorCode); + +#if defined(U_WCHAR_IS_UTF16) || defined(U_WCHAR_IS_UTF32) || !UCONFIG_NO_CONVERSION +/** + * Convert a UTF-16 string to a wchar_t string. + * If it is known at compile time that wchar_t strings are in UTF-16 or UTF-32, then + * this function simply calls the fast, dedicated function for that. + * Otherwise, two conversions UTF-16 -> default charset -> wchar_t* are performed. + * + * @param dest A buffer for the result string. The result will be zero-terminated if + * the buffer is large enough. + * @param destCapacity The size of the buffer (number of wchar_t's). If it is 0, then + * dest may be NULL and the function will only return the length of the + * result without writing any of the result string (pre-flighting). + * @param pDestLength A pointer to receive the number of units written to the destination. If + * pDestLength!=NULL then *pDestLength is always set to the + * number of output units corresponding to the transformation of + * all the input units, even in case of a buffer overflow. + * @param src The original source string + * @param srcLength The length of the original string. If -1, then src must be zero-terminated. + * @param pErrorCode Must be a valid pointer to an error code value, + * which must not indicate a failure before the function call. + * @return The pointer to destination buffer. + * @stable ICU 2.0 + */ +U_STABLE wchar_t* U_EXPORT2 +u_strToWCS(wchar_t *dest, + int32_t destCapacity, + int32_t *pDestLength, + const UChar *src, + int32_t srcLength, + UErrorCode *pErrorCode); +/** + * Convert a wchar_t string to UTF-16. + * If it is known at compile time that wchar_t strings are in UTF-16 or UTF-32, then + * this function simply calls the fast, dedicated function for that. + * Otherwise, two conversions wchar_t* -> default charset -> UTF-16 are performed. + * + * @param dest A buffer for the result string. The result will be zero-terminated if + * the buffer is large enough. + * @param destCapacity The size of the buffer (number of UChars). If it is 0, then + * dest may be NULL and the function will only return the length of the + * result without writing any of the result string (pre-flighting). + * @param pDestLength A pointer to receive the number of units written to the destination. If + * pDestLength!=NULL then *pDestLength is always set to the + * number of output units corresponding to the transformation of + * all the input units, even in case of a buffer overflow. + * @param src The original source string + * @param srcLength The length of the original string. If -1, then src must be zero-terminated. + * @param pErrorCode Must be a valid pointer to an error code value, + * which must not indicate a failure before the function call. + * @return The pointer to destination buffer. + * @stable ICU 2.0 + */ +U_STABLE UChar* U_EXPORT2 +u_strFromWCS(UChar *dest, + int32_t destCapacity, + int32_t *pDestLength, + const wchar_t *src, + int32_t srcLength, + UErrorCode *pErrorCode); +#endif /* defined(U_WCHAR_IS_UTF16) || defined(U_WCHAR_IS_UTF32) || !UCONFIG_NO_CONVERSION */ + +/** + * Convert a UTF-16 string to UTF-8. + * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set. + * + * @param dest A buffer for the result string. The result will be zero-terminated if + * the buffer is large enough. + * @param destCapacity The size of the buffer (number of chars). If it is 0, then + * dest may be NULL and the function will only return the length of the + * result without writing any of the result string (pre-flighting). + * @param pDestLength A pointer to receive the number of units written to the destination. If + * pDestLength!=NULL then *pDestLength is always set to the + * number of output units corresponding to the transformation of + * all the input units, even in case of a buffer overflow. + * @param src The original source string + * @param srcLength The length of the original string. If -1, then src must be zero-terminated. + * @param pErrorCode Must be a valid pointer to an error code value, + * which must not indicate a failure before the function call. + * @return The pointer to destination buffer. + * @stable ICU 2.0 + * @see u_strToUTF8WithSub + * @see u_strFromUTF8 + */ +U_STABLE char* U_EXPORT2 +u_strToUTF8(char *dest, + int32_t destCapacity, + int32_t *pDestLength, + const UChar *src, + int32_t srcLength, + UErrorCode *pErrorCode); + +/** + * Convert a UTF-8 string to UTF-16. + * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set. + * + * @param dest A buffer for the result string. The result will be zero-terminated if + * the buffer is large enough. + * @param destCapacity The size of the buffer (number of UChars). If it is 0, then + * dest may be NULL and the function will only return the length of the + * result without writing any of the result string (pre-flighting). + * @param pDestLength A pointer to receive the number of units written to the destination. If + * pDestLength!=NULL then *pDestLength is always set to the + * number of output units corresponding to the transformation of + * all the input units, even in case of a buffer overflow. + * @param src The original source string + * @param srcLength The length of the original string. If -1, then src must be zero-terminated. + * @param pErrorCode Must be a valid pointer to an error code value, + * which must not indicate a failure before the function call. + * @return The pointer to destination buffer. + * @stable ICU 2.0 + * @see u_strFromUTF8WithSub + * @see u_strFromUTF8Lenient + */ +U_STABLE UChar* U_EXPORT2 +u_strFromUTF8(UChar *dest, + int32_t destCapacity, + int32_t *pDestLength, + const char *src, + int32_t srcLength, + UErrorCode *pErrorCode); + +/** + * Convert a UTF-16 string to UTF-8. + * + * Same as u_strToUTF8() except for the additional subchar which is output for + * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code. + * With subchar==U_SENTINEL, this function behaves exactly like u_strToUTF8(). + * + * @param dest A buffer for the result string. The result will be zero-terminated if + * the buffer is large enough. + * @param destCapacity The size of the buffer (number of chars). If it is 0, then + * dest may be NULL and the function will only return the length of the + * result without writing any of the result string (pre-flighting). + * @param pDestLength A pointer to receive the number of units written to the destination. If + * pDestLength!=NULL then *pDestLength is always set to the + * number of output units corresponding to the transformation of + * all the input units, even in case of a buffer overflow. + * @param src The original source string + * @param srcLength The length of the original string. If -1, then src must be zero-terminated. + * @param subchar The substitution character to use in place of an illegal input sequence, + * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead. + * A substitution character can be any valid Unicode code point (up to U+10FFFF) + * except for surrogate code points (U+D800..U+DFFF). + * The recommended value is U+FFFD "REPLACEMENT CHARACTER". + * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0. + * Set to 0 if no substitutions occur or subchar<0. + * pNumSubstitutions can be NULL. + * @param pErrorCode Pointer to a standard ICU error code. Its input value must + * pass the U_SUCCESS() test, or else the function returns + * immediately. Check for U_FAILURE() on output or use with + * function chaining. (See User Guide for details.) + * @return The pointer to destination buffer. + * @see u_strToUTF8 + * @see u_strFromUTF8WithSub + * @stable ICU 3.6 + */ +U_STABLE char* U_EXPORT2 +u_strToUTF8WithSub(char *dest, + int32_t destCapacity, + int32_t *pDestLength, + const UChar *src, + int32_t srcLength, + UChar32 subchar, int32_t *pNumSubstitutions, + UErrorCode *pErrorCode); + +/** + * Convert a UTF-8 string to UTF-16. + * + * Same as u_strFromUTF8() except for the additional subchar which is output for + * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code. + * With subchar==U_SENTINEL, this function behaves exactly like u_strFromUTF8(). + * + * @param dest A buffer for the result string. The result will be zero-terminated if + * the buffer is large enough. + * @param destCapacity The size of the buffer (number of UChars). If it is 0, then + * dest may be NULL and the function will only return the length of the + * result without writing any of the result string (pre-flighting). + * @param pDestLength A pointer to receive the number of units written to the destination. If + * pDestLength!=NULL then *pDestLength is always set to the + * number of output units corresponding to the transformation of + * all the input units, even in case of a buffer overflow. + * @param src The original source string + * @param srcLength The length of the original string. If -1, then src must be zero-terminated. + * @param subchar The substitution character to use in place of an illegal input sequence, + * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead. + * A substitution character can be any valid Unicode code point (up to U+10FFFF) + * except for surrogate code points (U+D800..U+DFFF). + * The recommended value is U+FFFD "REPLACEMENT CHARACTER". + * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0. + * Set to 0 if no substitutions occur or subchar<0. + * pNumSubstitutions can be NULL. + * @param pErrorCode Pointer to a standard ICU error code. Its input value must + * pass the U_SUCCESS() test, or else the function returns + * immediately. Check for U_FAILURE() on output or use with + * function chaining. (See User Guide for details.) + * @return The pointer to destination buffer. + * @see u_strFromUTF8 + * @see u_strFromUTF8Lenient + * @see u_strToUTF8WithSub + * @stable ICU 3.6 + */ +U_STABLE UChar* U_EXPORT2 +u_strFromUTF8WithSub(UChar *dest, + int32_t destCapacity, + int32_t *pDestLength, + const char *src, + int32_t srcLength, + UChar32 subchar, int32_t *pNumSubstitutions, + UErrorCode *pErrorCode); + +/** + * Convert a UTF-8 string to UTF-16. + * + * Same as u_strFromUTF8() except that this function is designed to be very fast, + * which it achieves by being lenient about malformed UTF-8 sequences. + * This function is intended for use in environments where UTF-8 text is + * expected to be well-formed. + * + * Its semantics are: + * - Well-formed UTF-8 text is correctly converted to well-formed UTF-16 text. + * - The function will not read beyond the input string, nor write beyond + * the destCapacity. + * - Malformed UTF-8 results in "garbage" 16-bit Unicode strings which may not + * be well-formed UTF-16. + * The function will resynchronize to valid code point boundaries + * within a small number of code points after an illegal sequence. + * - Non-shortest forms are not detected and will result in "spoofing" output. + * + * For further performance improvement, if srcLength is given (>=0), + * then it must be destCapacity>=srcLength. + * + * There is no inverse u_strToUTF8Lenient() function because there is practically + * no performance gain from not checking that a UTF-16 string is well-formed. + * + * @param dest A buffer for the result string. The result will be zero-terminated if + * the buffer is large enough. + * @param destCapacity The size of the buffer (number of UChars). If it is 0, then + * dest may be NULL and the function will only return the length of the + * result without writing any of the result string (pre-flighting). + * Unlike for other ICU functions, if srcLength>=0 then it + * must be destCapacity>=srcLength. + * @param pDestLength A pointer to receive the number of units written to the destination. If + * pDestLength!=NULL then *pDestLength is always set to the + * number of output units corresponding to the transformation of + * all the input units, even in case of a buffer overflow. + * Unlike for other ICU functions, if srcLength>=0 but + * destCapacity<srcLength, then *pDestLength will be set to srcLength + * (and U_BUFFER_OVERFLOW_ERROR will be set) + * regardless of the actual result length. + * @param src The original source string + * @param srcLength The length of the original string. If -1, then src must be zero-terminated. + * @param pErrorCode Pointer to a standard ICU error code. Its input value must + * pass the U_SUCCESS() test, or else the function returns + * immediately. Check for U_FAILURE() on output or use with + * function chaining. (See User Guide for details.) + * @return The pointer to destination buffer. + * @see u_strFromUTF8 + * @see u_strFromUTF8WithSub + * @see u_strToUTF8WithSub + * @stable ICU 3.6 + */ +U_STABLE UChar * U_EXPORT2 +u_strFromUTF8Lenient(UChar *dest, + int32_t destCapacity, + int32_t *pDestLength, + const char *src, + int32_t srcLength, + UErrorCode *pErrorCode); + +/** + * Convert a UTF-16 string to UTF-32. + * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set. + * + * @param dest A buffer for the result string. The result will be zero-terminated if + * the buffer is large enough. + * @param destCapacity The size of the buffer (number of UChar32s). If it is 0, then + * dest may be NULL and the function will only return the length of the + * result without writing any of the result string (pre-flighting). + * @param pDestLength A pointer to receive the number of units written to the destination. If + * pDestLength!=NULL then *pDestLength is always set to the + * number of output units corresponding to the transformation of + * all the input units, even in case of a buffer overflow. + * @param src The original source string + * @param srcLength The length of the original string. If -1, then src must be zero-terminated. + * @param pErrorCode Must be a valid pointer to an error code value, + * which must not indicate a failure before the function call. + * @return The pointer to destination buffer. + * @see u_strToUTF32WithSub + * @see u_strFromUTF32 + * @stable ICU 2.0 + */ +U_STABLE UChar32* U_EXPORT2 +u_strToUTF32(UChar32 *dest, + int32_t destCapacity, + int32_t *pDestLength, + const UChar *src, + int32_t srcLength, + UErrorCode *pErrorCode); + +/** + * Convert a UTF-32 string to UTF-16. + * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set. + * + * @param dest A buffer for the result string. The result will be zero-terminated if + * the buffer is large enough. + * @param destCapacity The size of the buffer (number of UChars). If it is 0, then + * dest may be NULL and the function will only return the length of the + * result without writing any of the result string (pre-flighting). + * @param pDestLength A pointer to receive the number of units written to the destination. If + * pDestLength!=NULL then *pDestLength is always set to the + * number of output units corresponding to the transformation of + * all the input units, even in case of a buffer overflow. + * @param src The original source string + * @param srcLength The length of the original string. If -1, then src must be zero-terminated. + * @param pErrorCode Must be a valid pointer to an error code value, + * which must not indicate a failure before the function call. + * @return The pointer to destination buffer. + * @see u_strFromUTF32WithSub + * @see u_strToUTF32 + * @stable ICU 2.0 + */ +U_STABLE UChar* U_EXPORT2 +u_strFromUTF32(UChar *dest, + int32_t destCapacity, + int32_t *pDestLength, + const UChar32 *src, + int32_t srcLength, + UErrorCode *pErrorCode); + +/** + * Convert a UTF-16 string to UTF-32. + * + * Same as u_strToUTF32() except for the additional subchar which is output for + * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code. + * With subchar==U_SENTINEL, this function behaves exactly like u_strToUTF32(). + * + * @param dest A buffer for the result string. The result will be zero-terminated if + * the buffer is large enough. + * @param destCapacity The size of the buffer (number of UChar32s). If it is 0, then + * dest may be NULL and the function will only return the length of the + * result without writing any of the result string (pre-flighting). + * @param pDestLength A pointer to receive the number of units written to the destination. If + * pDestLength!=NULL then *pDestLength is always set to the + * number of output units corresponding to the transformation of + * all the input units, even in case of a buffer overflow. + * @param src The original source string + * @param srcLength The length of the original string. If -1, then src must be zero-terminated. + * @param subchar The substitution character to use in place of an illegal input sequence, + * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead. + * A substitution character can be any valid Unicode code point (up to U+10FFFF) + * except for surrogate code points (U+D800..U+DFFF). + * The recommended value is U+FFFD "REPLACEMENT CHARACTER". + * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0. + * Set to 0 if no substitutions occur or subchar<0. + * pNumSubstitutions can be NULL. + * @param pErrorCode Pointer to a standard ICU error code. Its input value must + * pass the U_SUCCESS() test, or else the function returns + * immediately. Check for U_FAILURE() on output or use with + * function chaining. (See User Guide for details.) + * @return The pointer to destination buffer. + * @see u_strToUTF32 + * @see u_strFromUTF32WithSub + * @stable ICU 4.2 + */ +U_STABLE UChar32* U_EXPORT2 +u_strToUTF32WithSub(UChar32 *dest, + int32_t destCapacity, + int32_t *pDestLength, + const UChar *src, + int32_t srcLength, + UChar32 subchar, int32_t *pNumSubstitutions, + UErrorCode *pErrorCode); + +/** + * Convert a UTF-32 string to UTF-16. + * + * Same as u_strFromUTF32() except for the additional subchar which is output for + * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code. + * With subchar==U_SENTINEL, this function behaves exactly like u_strFromUTF32(). + * + * @param dest A buffer for the result string. The result will be zero-terminated if + * the buffer is large enough. + * @param destCapacity The size of the buffer (number of UChars). If it is 0, then + * dest may be NULL and the function will only return the length of the + * result without writing any of the result string (pre-flighting). + * @param pDestLength A pointer to receive the number of units written to the destination. If + * pDestLength!=NULL then *pDestLength is always set to the + * number of output units corresponding to the transformation of + * all the input units, even in case of a buffer overflow. + * @param src The original source string + * @param srcLength The length of the original string. If -1, then src must be zero-terminated. + * @param subchar The substitution character to use in place of an illegal input sequence, + * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead. + * A substitution character can be any valid Unicode code point (up to U+10FFFF) + * except for surrogate code points (U+D800..U+DFFF). + * The recommended value is U+FFFD "REPLACEMENT CHARACTER". + * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0. + * Set to 0 if no substitutions occur or subchar<0. + * pNumSubstitutions can be NULL. + * @param pErrorCode Pointer to a standard ICU error code. Its input value must + * pass the U_SUCCESS() test, or else the function returns + * immediately. Check for U_FAILURE() on output or use with + * function chaining. (See User Guide for details.) + * @return The pointer to destination buffer. + * @see u_strFromUTF32 + * @see u_strToUTF32WithSub + * @stable ICU 4.2 + */ +U_STABLE UChar* U_EXPORT2 +u_strFromUTF32WithSub(UChar *dest, + int32_t destCapacity, + int32_t *pDestLength, + const UChar32 *src, + int32_t srcLength, + UChar32 subchar, int32_t *pNumSubstitutions, + UErrorCode *pErrorCode); + +/** + * Convert a 16-bit Unicode string to Java Modified UTF-8. + * See http://java.sun.com/javase/6/docs/api/java/io/DataInput.html#modified-utf-8 + * + * This function behaves according to the documentation for Java DataOutput.writeUTF() + * except that it does not encode the output length in the destination buffer + * and does not have an output length restriction. + * See http://java.sun.com/javase/6/docs/api/java/io/DataOutput.html#writeUTF(java.lang.String) + * + * The input string need not be well-formed UTF-16. + * (Therefore there is no subchar parameter.) + * + * @param dest A buffer for the result string. The result will be zero-terminated if + * the buffer is large enough. + * @param destCapacity The size of the buffer (number of chars). If it is 0, then + * dest may be NULL and the function will only return the length of the + * result without writing any of the result string (pre-flighting). + * @param pDestLength A pointer to receive the number of units written to the destination. If + * pDestLength!=NULL then *pDestLength is always set to the + * number of output units corresponding to the transformation of + * all the input units, even in case of a buffer overflow. + * @param src The original source string + * @param srcLength The length of the original string. If -1, then src must be zero-terminated. + * @param pErrorCode Pointer to a standard ICU error code. Its input value must + * pass the U_SUCCESS() test, or else the function returns + * immediately. Check for U_FAILURE() on output or use with + * function chaining. (See User Guide for details.) + * @return The pointer to destination buffer. + * @stable ICU 4.4 + * @see u_strToUTF8WithSub + * @see u_strFromJavaModifiedUTF8WithSub + */ +U_STABLE char* U_EXPORT2 +u_strToJavaModifiedUTF8( + char *dest, + int32_t destCapacity, + int32_t *pDestLength, + const UChar *src, + int32_t srcLength, + UErrorCode *pErrorCode); + +/** + * Convert a Java Modified UTF-8 string to a 16-bit Unicode string. + * If the input string is not well-formed and no substitution char is specified, + * then the U_INVALID_CHAR_FOUND error code is set. + * + * This function behaves according to the documentation for Java DataInput.readUTF() + * except that it takes a length parameter rather than + * interpreting the first two input bytes as the length. + * See http://java.sun.com/javase/6/docs/api/java/io/DataInput.html#readUTF() + * + * The output string may not be well-formed UTF-16. + * + * @param dest A buffer for the result string. The result will be zero-terminated if + * the buffer is large enough. + * @param destCapacity The size of the buffer (number of UChars). If it is 0, then + * dest may be NULL and the function will only return the length of the + * result without writing any of the result string (pre-flighting). + * @param pDestLength A pointer to receive the number of units written to the destination. If + * pDestLength!=NULL then *pDestLength is always set to the + * number of output units corresponding to the transformation of + * all the input units, even in case of a buffer overflow. + * @param src The original source string + * @param srcLength The length of the original string. If -1, then src must be zero-terminated. + * @param subchar The substitution character to use in place of an illegal input sequence, + * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead. + * A substitution character can be any valid Unicode code point (up to U+10FFFF) + * except for surrogate code points (U+D800..U+DFFF). + * The recommended value is U+FFFD "REPLACEMENT CHARACTER". + * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0. + * Set to 0 if no substitutions occur or subchar<0. + * pNumSubstitutions can be NULL. + * @param pErrorCode Pointer to a standard ICU error code. Its input value must + * pass the U_SUCCESS() test, or else the function returns + * immediately. Check for U_FAILURE() on output or use with + * function chaining. (See User Guide for details.) + * @return The pointer to destination buffer. + * @see u_strFromUTF8WithSub + * @see u_strFromUTF8Lenient + * @see u_strToJavaModifiedUTF8 + * @stable ICU 4.4 + */ +U_STABLE UChar* U_EXPORT2 +u_strFromJavaModifiedUTF8WithSub( + UChar *dest, + int32_t destCapacity, + int32_t *pDestLength, + const char *src, + int32_t srcLength, + UChar32 subchar, int32_t *pNumSubstitutions, + UErrorCode *pErrorCode); + +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/utf.h b/src/core/basetypes/icu-ucsdet/include/unicode/utf.h new file mode 100644 index 00000000..f5954fe9 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/utf.h @@ -0,0 +1,223 @@ +/* +******************************************************************************* +* +* Copyright (C) 1999-2011, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: utf.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 1999sep09 +* created by: Markus W. Scherer +*/ + +/** + * \file + * \brief C API: Code point macros + * + * This file defines macros for checking whether a code point is + * a surrogate or a non-character etc. + * + * The UChar and UChar32 data types for Unicode code units and code points + * are defined in umachine.h because they can be machine-dependent. + * + * If U_NO_DEFAULT_INCLUDE_UTF_HEADERS is 0 then utf.h is included by utypes.h + * and itself includes utf8.h and utf16.h after some + * common definitions. + * If U_NO_DEFAULT_INCLUDE_UTF_HEADERS is 1 then each of these headers must be + * included explicitly if their definitions are used. + * + * utf8.h and utf16.h define macros for efficiently getting code points + * in and out of UTF-8/16 strings. + * utf16.h macros have "U16_" prefixes. + * utf8.h defines similar macros with "U8_" prefixes for UTF-8 string handling. + * + * ICU mostly processes 16-bit Unicode strings. + * Most of the time, such strings are well-formed UTF-16. + * Single, unpaired surrogates must be handled as well, and are treated in ICU + * like regular code points where possible. + * (Pairs of surrogate code points are indistinguishable from supplementary + * code points encoded as pairs of supplementary code units.) + * + * In fact, almost all Unicode code points in normal text (>99%) + * are on the BMP (<=U+ffff) and even <=U+d7ff. + * ICU functions handle supplementary code points (U+10000..U+10ffff) + * but are optimized for the much more frequently occurring BMP code points. + * + * umachine.h defines UChar to be an unsigned 16-bit integer. + * Where available, UChar is defined to be a char16_t + * or a wchar_t (if that is an unsigned 16-bit type), otherwise uint16_t. + * + * UChar32 is defined to be a signed 32-bit integer (int32_t), large enough for a 21-bit + * Unicode code point (Unicode scalar value, 0..0x10ffff). + * Before ICU 2.4, the definition of UChar32 was similarly platform-dependent as + * the definition of UChar. For details see the documentation for UChar32 itself. + * + * utf.h defines a small number of C macros for single Unicode code points. + * These are simple checks for surrogates and non-characters. + * For actual Unicode character properties see uchar.h. + * + * By default, string operations must be done with error checking in case + * a string is not well-formed UTF-16. + * The macros will detect if a surrogate code unit is unpaired + * (lead unit without trail unit or vice versa) and just return the unit itself + * as the code point. + * + * The regular "safe" macros require that the initial, passed-in string index + * is within bounds. They only check the index when they read more than one + * code unit. This is usually done with code similar to the following loop: + * <pre>while(i<length) { + * U16_NEXT(s, i, length, c); + * // use c + * }</pre> + * + * When it is safe to assume that text is well-formed UTF-16 + * (does not contain single, unpaired surrogates), then one can use + * U16_..._UNSAFE macros. + * These do not check for proper code unit sequences or truncated text and may + * yield wrong results or even cause a crash if they are used with "malformed" + * text. + * In practice, U16_..._UNSAFE macros will produce slightly less code but + * should not be faster because the processing is only different when a + * surrogate code unit is detected, which will be rare. + * + * Similarly for UTF-8, there are "safe" macros without a suffix, + * and U8_..._UNSAFE versions. + * The performance differences are much larger here because UTF-8 provides so + * many opportunities for malformed sequences. + * The unsafe UTF-8 macros are entirely implemented inside the macro definitions + * and are fast, while the safe UTF-8 macros call functions for all but the + * trivial (ASCII) cases. + * (ICU 3.6 optimizes U8_NEXT() and U8_APPEND() to handle most other common + * characters inline as well.) + * + * Unlike with UTF-16, malformed sequences cannot be expressed with distinct + * code point values (0..U+10ffff). They are indicated with negative values instead. + * + * For more information see the ICU User Guide Strings chapter + * (http://userguide.icu-project.org/strings). + * + * <em>Usage:</em> + * ICU coding guidelines for if() statements should be followed when using these macros. + * Compound statements (curly braces {}) must be used for if-else-while... + * bodies and all macro statements should be terminated with semicolon. + * + * @stable ICU 2.4 + */ + +#ifndef __UTF_H__ +#define __UTF_H__ + +#include "unicode/umachine.h" +/* include the utfXX.h after the following definitions */ + +/* single-code point definitions -------------------------------------------- */ + +/** + * Is this code point a Unicode noncharacter? + * @param c 32-bit code point + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define U_IS_UNICODE_NONCHAR(c) \ + ((c)>=0xfdd0 && \ + ((uint32_t)(c)<=0xfdef || ((c)&0xfffe)==0xfffe) && \ + (uint32_t)(c)<=0x10ffff) + +/** + * Is c a Unicode code point value (0..U+10ffff) + * that can be assigned a character? + * + * Code points that are not characters include: + * - single surrogate code points (U+d800..U+dfff, 2048 code points) + * - the last two code points on each plane (U+__fffe and U+__ffff, 34 code points) + * - U+fdd0..U+fdef (new with Unicode 3.1, 32 code points) + * - the highest Unicode code point value is U+10ffff + * + * This means that all code points below U+d800 are character code points, + * and that boundary is tested first for performance. + * + * @param c 32-bit code point + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define U_IS_UNICODE_CHAR(c) \ + ((uint32_t)(c)<0xd800 || \ + ((uint32_t)(c)>0xdfff && \ + (uint32_t)(c)<=0x10ffff && \ + !U_IS_UNICODE_NONCHAR(c))) + +/** + * Is this code point a BMP code point (U+0000..U+ffff)? + * @param c 32-bit code point + * @return TRUE or FALSE + * @stable ICU 2.8 + */ +#define U_IS_BMP(c) ((uint32_t)(c)<=0xffff) + +/** + * Is this code point a supplementary code point (U+10000..U+10ffff)? + * @param c 32-bit code point + * @return TRUE or FALSE + * @stable ICU 2.8 + */ +#define U_IS_SUPPLEMENTARY(c) ((uint32_t)((c)-0x10000)<=0xfffff) + +/** + * Is this code point a lead surrogate (U+d800..U+dbff)? + * @param c 32-bit code point + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define U_IS_LEAD(c) (((c)&0xfffffc00)==0xd800) + +/** + * Is this code point a trail surrogate (U+dc00..U+dfff)? + * @param c 32-bit code point + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define U_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00) + +/** + * Is this code point a surrogate (U+d800..U+dfff)? + * @param c 32-bit code point + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define U_IS_SURROGATE(c) (((c)&0xfffff800)==0xd800) + +/** + * Assuming c is a surrogate code point (U_IS_SURROGATE(c)), + * is it a lead surrogate? + * @param c 32-bit code point + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define U_IS_SURROGATE_LEAD(c) (((c)&0x400)==0) + +/** + * Assuming c is a surrogate code point (U_IS_SURROGATE(c)), + * is it a trail surrogate? + * @param c 32-bit code point + * @return TRUE or FALSE + * @stable ICU 4.2 + */ +#define U_IS_SURROGATE_TRAIL(c) (((c)&0x400)!=0) + +/* include the utfXX.h ------------------------------------------------------ */ + +#if !U_NO_DEFAULT_INCLUDE_UTF_HEADERS + +#include "unicode/utf8.h" +#include "unicode/utf16.h" + +/* utf_old.h contains deprecated, pre-ICU 2.4 definitions */ +#include "unicode/utf_old.h" + +#endif /* !U_NO_DEFAULT_INCLUDE_UTF_HEADERS */ + +#endif /* __UTF_H__ */ diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/utf16.h b/src/core/basetypes/icu-ucsdet/include/unicode/utf16.h new file mode 100644 index 00000000..bdd88a8b --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/utf16.h @@ -0,0 +1,623 @@ +/* +******************************************************************************* +* +* Copyright (C) 1999-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: utf16.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 1999sep09 +* created by: Markus W. Scherer +*/ + +/** + * \file + * \brief C API: 16-bit Unicode handling macros + * + * This file defines macros to deal with 16-bit Unicode (UTF-16) code units and strings. + * + * For more information see utf.h and the ICU User Guide Strings chapter + * (http://userguide.icu-project.org/strings). + * + * <em>Usage:</em> + * ICU coding guidelines for if() statements should be followed when using these macros. + * Compound statements (curly braces {}) must be used for if-else-while... + * bodies and all macro statements should be terminated with semicolon. + */ + +#ifndef __UTF16_H__ +#define __UTF16_H__ + +#include "unicode/umachine.h" +#ifndef __UTF_H__ +# include "unicode/utf.h" +#endif + +/* single-code point definitions -------------------------------------------- */ + +/** + * Does this code unit alone encode a code point (BMP, not a surrogate)? + * @param c 16-bit code unit + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define U16_IS_SINGLE(c) !U_IS_SURROGATE(c) + +/** + * Is this code unit a lead surrogate (U+d800..U+dbff)? + * @param c 16-bit code unit + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define U16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800) + +/** + * Is this code unit a trail surrogate (U+dc00..U+dfff)? + * @param c 16-bit code unit + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define U16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00) + +/** + * Is this code unit a surrogate (U+d800..U+dfff)? + * @param c 16-bit code unit + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define U16_IS_SURROGATE(c) U_IS_SURROGATE(c) + +/** + * Assuming c is a surrogate code point (U16_IS_SURROGATE(c)), + * is it a lead surrogate? + * @param c 16-bit code unit + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define U16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0) + +/** + * Assuming c is a surrogate code point (U16_IS_SURROGATE(c)), + * is it a trail surrogate? + * @param c 16-bit code unit + * @return TRUE or FALSE + * @stable ICU 4.2 + */ +#define U16_IS_SURROGATE_TRAIL(c) (((c)&0x400)!=0) + +/** + * Helper constant for U16_GET_SUPPLEMENTARY. + * @internal + */ +#define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000) + +/** + * Get a supplementary code point value (U+10000..U+10ffff) + * from its lead and trail surrogates. + * The result is undefined if the input values are not + * lead and trail surrogates. + * + * @param lead lead surrogate (U+d800..U+dbff) + * @param trail trail surrogate (U+dc00..U+dfff) + * @return supplementary code point (U+10000..U+10ffff) + * @stable ICU 2.4 + */ +#define U16_GET_SUPPLEMENTARY(lead, trail) \ + (((UChar32)(lead)<<10UL)+(UChar32)(trail)-U16_SURROGATE_OFFSET) + + +/** + * Get the lead surrogate (0xd800..0xdbff) for a + * supplementary code point (0x10000..0x10ffff). + * @param supplementary 32-bit code point (U+10000..U+10ffff) + * @return lead surrogate (U+d800..U+dbff) for supplementary + * @stable ICU 2.4 + */ +#define U16_LEAD(supplementary) (UChar)(((supplementary)>>10)+0xd7c0) + +/** + * Get the trail surrogate (0xdc00..0xdfff) for a + * supplementary code point (0x10000..0x10ffff). + * @param supplementary 32-bit code point (U+10000..U+10ffff) + * @return trail surrogate (U+dc00..U+dfff) for supplementary + * @stable ICU 2.4 + */ +#define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00) + +/** + * How many 16-bit code units are used to encode this Unicode code point? (1 or 2) + * The result is not defined if c is not a Unicode code point (U+0000..U+10ffff). + * @param c 32-bit code point + * @return 1 or 2 + * @stable ICU 2.4 + */ +#define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2) + +/** + * The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff). + * @return 2 + * @stable ICU 2.4 + */ +#define U16_MAX_LENGTH 2 + +/** + * Get a code point from a string at a random-access offset, + * without changing the offset. + * "Unsafe" macro, assumes well-formed UTF-16. + * + * The offset may point to either the lead or trail surrogate unit + * for a supplementary code point, in which case the macro will read + * the adjacent matching surrogate as well. + * The result is undefined if the offset points to a single, unpaired surrogate. + * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT. + * + * @param s const UChar * string + * @param i string offset + * @param c output UChar32 variable + * @see U16_GET + * @stable ICU 2.4 + */ +#define U16_GET_UNSAFE(s, i, c) { \ + (c)=(s)[i]; \ + if(U16_IS_SURROGATE(c)) { \ + if(U16_IS_SURROGATE_LEAD(c)) { \ + (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)+1]); \ + } else { \ + (c)=U16_GET_SUPPLEMENTARY((s)[(i)-1], (c)); \ + } \ + } \ +} + +/** + * Get a code point from a string at a random-access offset, + * without changing the offset. + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * The offset may point to either the lead or trail surrogate unit + * for a supplementary code point, in which case the macro will read + * the adjacent matching surrogate as well. + * + * The length can be negative for a NUL-terminated string. + * + * If the offset points to a single, unpaired surrogate, then that itself + * will be returned as the code point. + * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT. + * + * @param s const UChar * string + * @param start starting string offset (usually 0) + * @param i string offset, must be start<=i<length + * @param length string length + * @param c output UChar32 variable + * @see U16_GET_UNSAFE + * @stable ICU 2.4 + */ +#define U16_GET(s, start, i, length, c) { \ + (c)=(s)[i]; \ + if(U16_IS_SURROGATE(c)) { \ + uint16_t __c2; \ + if(U16_IS_SURROGATE_LEAD(c)) { \ + if((i)+1!=(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \ + (c)=U16_GET_SUPPLEMENTARY((c), __c2); \ + } \ + } else { \ + if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \ + (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \ + } \ + } \ + } \ +} + +/* definitions with forward iteration --------------------------------------- */ + +/** + * Get a code point from a string at a code point boundary offset, + * and advance the offset to the next code point boundary. + * (Post-incrementing forward iteration.) + * "Unsafe" macro, assumes well-formed UTF-16. + * + * The offset may point to the lead surrogate unit + * for a supplementary code point, in which case the macro will read + * the following trail surrogate as well. + * If the offset points to a trail surrogate, then that itself + * will be returned as the code point. + * The result is undefined if the offset points to a single, unpaired lead surrogate. + * + * @param s const UChar * string + * @param i string offset + * @param c output UChar32 variable + * @see U16_NEXT + * @stable ICU 2.4 + */ +#define U16_NEXT_UNSAFE(s, i, c) { \ + (c)=(s)[(i)++]; \ + if(U16_IS_LEAD(c)) { \ + (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)++]); \ + } \ +} + +/** + * Get a code point from a string at a code point boundary offset, + * and advance the offset to the next code point boundary. + * (Post-incrementing forward iteration.) + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * The length can be negative for a NUL-terminated string. + * + * The offset may point to the lead surrogate unit + * for a supplementary code point, in which case the macro will read + * the following trail surrogate as well. + * If the offset points to a trail surrogate or + * to a single, unpaired lead surrogate, then that itself + * will be returned as the code point. + * + * @param s const UChar * string + * @param i string offset, must be i<length + * @param length string length + * @param c output UChar32 variable + * @see U16_NEXT_UNSAFE + * @stable ICU 2.4 + */ +#define U16_NEXT(s, i, length, c) { \ + (c)=(s)[(i)++]; \ + if(U16_IS_LEAD(c)) { \ + uint16_t __c2; \ + if((i)!=(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \ + ++(i); \ + (c)=U16_GET_SUPPLEMENTARY((c), __c2); \ + } \ + } \ +} + +/** + * Append a code point to a string, overwriting 1 or 2 code units. + * The offset points to the current end of the string contents + * and is advanced (post-increment). + * "Unsafe" macro, assumes a valid code point and sufficient space in the string. + * Otherwise, the result is undefined. + * + * @param s const UChar * string buffer + * @param i string offset + * @param c code point to append + * @see U16_APPEND + * @stable ICU 2.4 + */ +#define U16_APPEND_UNSAFE(s, i, c) { \ + if((uint32_t)(c)<=0xffff) { \ + (s)[(i)++]=(uint16_t)(c); \ + } else { \ + (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \ + (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \ + } \ +} + +/** + * Append a code point to a string, overwriting 1 or 2 code units. + * The offset points to the current end of the string contents + * and is advanced (post-increment). + * "Safe" macro, checks for a valid code point. + * If a surrogate pair is written, checks for sufficient space in the string. + * If the code point is not valid or a trail surrogate does not fit, + * then isError is set to TRUE. + * + * @param s const UChar * string buffer + * @param i string offset, must be i<capacity + * @param capacity size of the string buffer + * @param c code point to append + * @param isError output UBool set to TRUE if an error occurs, otherwise not modified + * @see U16_APPEND_UNSAFE + * @stable ICU 2.4 + */ +#define U16_APPEND(s, i, capacity, c, isError) { \ + if((uint32_t)(c)<=0xffff) { \ + (s)[(i)++]=(uint16_t)(c); \ + } else if((uint32_t)(c)<=0x10ffff && (i)+1<(capacity)) { \ + (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \ + (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \ + } else /* c>0x10ffff or not enough space */ { \ + (isError)=TRUE; \ + } \ +} + +/** + * Advance the string offset from one code point boundary to the next. + * (Post-incrementing iteration.) + * "Unsafe" macro, assumes well-formed UTF-16. + * + * @param s const UChar * string + * @param i string offset + * @see U16_FWD_1 + * @stable ICU 2.4 + */ +#define U16_FWD_1_UNSAFE(s, i) { \ + if(U16_IS_LEAD((s)[(i)++])) { \ + ++(i); \ + } \ +} + +/** + * Advance the string offset from one code point boundary to the next. + * (Post-incrementing iteration.) + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * The length can be negative for a NUL-terminated string. + * + * @param s const UChar * string + * @param i string offset, must be i<length + * @param length string length + * @see U16_FWD_1_UNSAFE + * @stable ICU 2.4 + */ +#define U16_FWD_1(s, i, length) { \ + if(U16_IS_LEAD((s)[(i)++]) && (i)!=(length) && U16_IS_TRAIL((s)[i])) { \ + ++(i); \ + } \ +} + +/** + * Advance the string offset from one code point boundary to the n-th next one, + * i.e., move forward by n code points. + * (Post-incrementing iteration.) + * "Unsafe" macro, assumes well-formed UTF-16. + * + * @param s const UChar * string + * @param i string offset + * @param n number of code points to skip + * @see U16_FWD_N + * @stable ICU 2.4 + */ +#define U16_FWD_N_UNSAFE(s, i, n) { \ + int32_t __N=(n); \ + while(__N>0) { \ + U16_FWD_1_UNSAFE(s, i); \ + --__N; \ + } \ +} + +/** + * Advance the string offset from one code point boundary to the n-th next one, + * i.e., move forward by n code points. + * (Post-incrementing iteration.) + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * The length can be negative for a NUL-terminated string. + * + * @param s const UChar * string + * @param i int32_t string offset, must be i<length + * @param length int32_t string length + * @param n number of code points to skip + * @see U16_FWD_N_UNSAFE + * @stable ICU 2.4 + */ +#define U16_FWD_N(s, i, length, n) { \ + int32_t __N=(n); \ + while(__N>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \ + U16_FWD_1(s, i, length); \ + --__N; \ + } \ +} + +/** + * Adjust a random-access offset to a code point boundary + * at the start of a code point. + * If the offset points to the trail surrogate of a surrogate pair, + * then the offset is decremented. + * Otherwise, it is not modified. + * "Unsafe" macro, assumes well-formed UTF-16. + * + * @param s const UChar * string + * @param i string offset + * @see U16_SET_CP_START + * @stable ICU 2.4 + */ +#define U16_SET_CP_START_UNSAFE(s, i) { \ + if(U16_IS_TRAIL((s)[i])) { \ + --(i); \ + } \ +} + +/** + * Adjust a random-access offset to a code point boundary + * at the start of a code point. + * If the offset points to the trail surrogate of a surrogate pair, + * then the offset is decremented. + * Otherwise, it is not modified. + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * @param s const UChar * string + * @param start starting string offset (usually 0) + * @param i string offset, must be start<=i + * @see U16_SET_CP_START_UNSAFE + * @stable ICU 2.4 + */ +#define U16_SET_CP_START(s, start, i) { \ + if(U16_IS_TRAIL((s)[i]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \ + --(i); \ + } \ +} + +/* definitions with backward iteration -------------------------------------- */ + +/** + * Move the string offset from one code point boundary to the previous one + * and get the code point between them. + * (Pre-decrementing backward iteration.) + * "Unsafe" macro, assumes well-formed UTF-16. + * + * The input offset may be the same as the string length. + * If the offset is behind a trail surrogate unit + * for a supplementary code point, then the macro will read + * the preceding lead surrogate as well. + * If the offset is behind a lead surrogate, then that itself + * will be returned as the code point. + * The result is undefined if the offset is behind a single, unpaired trail surrogate. + * + * @param s const UChar * string + * @param i string offset + * @param c output UChar32 variable + * @see U16_PREV + * @stable ICU 2.4 + */ +#define U16_PREV_UNSAFE(s, i, c) { \ + (c)=(s)[--(i)]; \ + if(U16_IS_TRAIL(c)) { \ + (c)=U16_GET_SUPPLEMENTARY((s)[--(i)], (c)); \ + } \ +} + +/** + * Move the string offset from one code point boundary to the previous one + * and get the code point between them. + * (Pre-decrementing backward iteration.) + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * The input offset may be the same as the string length. + * If the offset is behind a trail surrogate unit + * for a supplementary code point, then the macro will read + * the preceding lead surrogate as well. + * If the offset is behind a lead surrogate or behind a single, unpaired + * trail surrogate, then that itself + * will be returned as the code point. + * + * @param s const UChar * string + * @param start starting string offset (usually 0) + * @param i string offset, must be start<i + * @param c output UChar32 variable + * @see U16_PREV_UNSAFE + * @stable ICU 2.4 + */ +#define U16_PREV(s, start, i, c) { \ + (c)=(s)[--(i)]; \ + if(U16_IS_TRAIL(c)) { \ + uint16_t __c2; \ + if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \ + --(i); \ + (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \ + } \ + } \ +} + +/** + * Move the string offset from one code point boundary to the previous one. + * (Pre-decrementing backward iteration.) + * The input offset may be the same as the string length. + * "Unsafe" macro, assumes well-formed UTF-16. + * + * @param s const UChar * string + * @param i string offset + * @see U16_BACK_1 + * @stable ICU 2.4 + */ +#define U16_BACK_1_UNSAFE(s, i) { \ + if(U16_IS_TRAIL((s)[--(i)])) { \ + --(i); \ + } \ +} + +/** + * Move the string offset from one code point boundary to the previous one. + * (Pre-decrementing backward iteration.) + * The input offset may be the same as the string length. + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * @param s const UChar * string + * @param start starting string offset (usually 0) + * @param i string offset, must be start<i + * @see U16_BACK_1_UNSAFE + * @stable ICU 2.4 + */ +#define U16_BACK_1(s, start, i) { \ + if(U16_IS_TRAIL((s)[--(i)]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \ + --(i); \ + } \ +} + +/** + * Move the string offset from one code point boundary to the n-th one before it, + * i.e., move backward by n code points. + * (Pre-decrementing backward iteration.) + * The input offset may be the same as the string length. + * "Unsafe" macro, assumes well-formed UTF-16. + * + * @param s const UChar * string + * @param i string offset + * @param n number of code points to skip + * @see U16_BACK_N + * @stable ICU 2.4 + */ +#define U16_BACK_N_UNSAFE(s, i, n) { \ + int32_t __N=(n); \ + while(__N>0) { \ + U16_BACK_1_UNSAFE(s, i); \ + --__N; \ + } \ +} + +/** + * Move the string offset from one code point boundary to the n-th one before it, + * i.e., move backward by n code points. + * (Pre-decrementing backward iteration.) + * The input offset may be the same as the string length. + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * @param s const UChar * string + * @param start start of string + * @param i string offset, must be start<i + * @param n number of code points to skip + * @see U16_BACK_N_UNSAFE + * @stable ICU 2.4 + */ +#define U16_BACK_N(s, start, i, n) { \ + int32_t __N=(n); \ + while(__N>0 && (i)>(start)) { \ + U16_BACK_1(s, start, i); \ + --__N; \ + } \ +} + +/** + * Adjust a random-access offset to a code point boundary after a code point. + * If the offset is behind the lead surrogate of a surrogate pair, + * then the offset is incremented. + * Otherwise, it is not modified. + * The input offset may be the same as the string length. + * "Unsafe" macro, assumes well-formed UTF-16. + * + * @param s const UChar * string + * @param i string offset + * @see U16_SET_CP_LIMIT + * @stable ICU 2.4 + */ +#define U16_SET_CP_LIMIT_UNSAFE(s, i) { \ + if(U16_IS_LEAD((s)[(i)-1])) { \ + ++(i); \ + } \ +} + +/** + * Adjust a random-access offset to a code point boundary after a code point. + * If the offset is behind the lead surrogate of a surrogate pair, + * then the offset is incremented. + * Otherwise, it is not modified. + * The input offset may be the same as the string length. + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * The length can be negative for a NUL-terminated string. + * + * @param s const UChar * string + * @param start int32_t starting string offset (usually 0) + * @param i int32_t string offset, start<=i<=length + * @param length int32_t string length + * @see U16_SET_CP_LIMIT_UNSAFE + * @stable ICU 2.4 + */ +#define U16_SET_CP_LIMIT(s, start, i, length) { \ + if((start)<(i) && ((i)<(length) || (length)<0) && U16_IS_LEAD((s)[(i)-1]) && U16_IS_TRAIL((s)[i])) { \ + ++(i); \ + } \ +} + +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/utf8.h b/src/core/basetypes/icu-ucsdet/include/unicode/utf8.h new file mode 100644 index 00000000..7bd0b0e8 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/utf8.h @@ -0,0 +1,824 @@ +/* +******************************************************************************* +* +* Copyright (C) 1999-2014, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: utf8.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 1999sep13 +* created by: Markus W. Scherer +*/ + +/** + * \file + * \brief C API: 8-bit Unicode handling macros + * + * This file defines macros to deal with 8-bit Unicode (UTF-8) code units (bytes) and strings. + * + * For more information see utf.h and the ICU User Guide Strings chapter + * (http://userguide.icu-project.org/strings). + * + * <em>Usage:</em> + * ICU coding guidelines for if() statements should be followed when using these macros. + * Compound statements (curly braces {}) must be used for if-else-while... + * bodies and all macro statements should be terminated with semicolon. + */ + +#ifndef __UTF8_H__ +#define __UTF8_H__ + +#include "unicode/umachine.h" +#ifndef __UTF_H__ +# include "unicode/utf.h" +#endif + +/* internal definitions ----------------------------------------------------- */ + +/** + * \var utf8_countTrailBytes + * Internal array with numbers of trail bytes for any given byte used in + * lead byte position. + * + * This is internal since it is not meant to be called directly by external clients; + * however it is called by public macros in this file and thus must remain stable, + * and should not be hidden when other internal functions are hidden (otherwise + * public macros would fail to compile). + * @internal + */ +#ifdef U_UTF8_IMPL +U_EXPORT const uint8_t +#elif defined(U_STATIC_IMPLEMENTATION) || defined(U_COMMON_IMPLEMENTATION) +U_CFUNC const uint8_t +#else +U_CFUNC U_IMPORT const uint8_t /* U_IMPORT2? */ /*U_IMPORT*/ +#endif +utf8_countTrailBytes[256]; + +/** + * Counts the trail bytes for a UTF-8 lead byte. + * Returns 0 for 0..0xbf as well as for 0xfe and 0xff. + * + * This is internal since it is not meant to be called directly by external clients; + * however it is called by public macros in this file and thus must remain stable. + * + * Note: Beginning with ICU 50, the implementation uses a multi-condition expression + * which was shown in 2012 (on x86-64) to compile to fast, branch-free code. + * leadByte is evaluated multiple times. + * + * The pre-ICU 50 implementation used the exported array utf8_countTrailBytes: + * #define U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[leadByte]) + * leadByte was evaluated exactly once. + * + * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. + * @internal + */ +#define U8_COUNT_TRAIL_BYTES(leadByte) \ + ((uint8_t)(leadByte)<0xf0 ? \ + ((uint8_t)(leadByte)>=0xc0)+((uint8_t)(leadByte)>=0xe0) : \ + (uint8_t)(leadByte)<0xfe ? 3+((uint8_t)(leadByte)>=0xf8)+((uint8_t)(leadByte)>=0xfc) : 0) + +/** + * Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence. + * The maximum supported lead byte is 0xf4 corresponding to U+10FFFF. + * leadByte might be evaluated multiple times. + * + * This is internal since it is not meant to be called directly by external clients; + * however it is called by public macros in this file and thus must remain stable. + * + * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. + * @internal + */ +#define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \ + (((leadByte)>=0xc0)+((leadByte)>=0xe0)+((leadByte)>=0xf0)) + +/** + * Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value. + * + * This is internal since it is not meant to be called directly by external clients; + * however it is called by public macros in this file and thus must remain stable. + * @internal + */ +#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1) + +/** + * Function for handling "next code point" with error-checking. + * + * This is internal since it is not meant to be called directly by external clients; + * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this + * file and thus must remain stable, and should not be hidden when other internal + * functions are hidden (otherwise public macros would fail to compile). + * @internal + */ +U_STABLE UChar32 U_EXPORT2 +utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict); + +/** + * Function for handling "append code point" with error-checking. + * + * This is internal since it is not meant to be called directly by external clients; + * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this + * file and thus must remain stable, and should not be hidden when other internal + * functions are hidden (otherwise public macros would fail to compile). + * @internal + */ +U_STABLE int32_t U_EXPORT2 +utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError); + +/** + * Function for handling "previous code point" with error-checking. + * + * This is internal since it is not meant to be called directly by external clients; + * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this + * file and thus must remain stable, and should not be hidden when other internal + * functions are hidden (otherwise public macros would fail to compile). + * @internal + */ +U_STABLE UChar32 U_EXPORT2 +utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict); + +/** + * Function for handling "skip backward one code point" with error-checking. + * + * This is internal since it is not meant to be called directly by external clients; + * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this + * file and thus must remain stable, and should not be hidden when other internal + * functions are hidden (otherwise public macros would fail to compile). + * @internal + */ +U_STABLE int32_t U_EXPORT2 +utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); + +/* single-code point definitions -------------------------------------------- */ + +/** + * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)? + * @param c 8-bit code unit (byte) + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define U8_IS_SINGLE(c) (((c)&0x80)==0) + +/** + * Is this code unit (byte) a UTF-8 lead byte? + * @param c 8-bit code unit (byte) + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc0)<0x3e) + +/** + * Is this code unit (byte) a UTF-8 trail byte? + * @param c 8-bit code unit (byte) + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define U8_IS_TRAIL(c) (((c)&0xc0)==0x80) + +/** + * How many code units (bytes) are used for the UTF-8 encoding + * of this Unicode code point? + * @param c 32-bit code point + * @return 1..4, or 0 if c is a surrogate or not a Unicode code point + * @stable ICU 2.4 + */ +#define U8_LENGTH(c) \ + ((uint32_t)(c)<=0x7f ? 1 : \ + ((uint32_t)(c)<=0x7ff ? 2 : \ + ((uint32_t)(c)<=0xd7ff ? 3 : \ + ((uint32_t)(c)<=0xdfff || (uint32_t)(c)>0x10ffff ? 0 : \ + ((uint32_t)(c)<=0xffff ? 3 : 4)\ + ) \ + ) \ + ) \ + ) + +/** + * The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff). + * @return 4 + * @stable ICU 2.4 + */ +#define U8_MAX_LENGTH 4 + +/** + * Get a code point from a string at a random-access offset, + * without changing the offset. + * The offset may point to either the lead byte or one of the trail bytes + * for a code point, in which case the macro will read all of the bytes + * for the code point. + * The result is undefined if the offset points to an illegal UTF-8 + * byte sequence. + * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT. + * + * @param s const uint8_t * string + * @param i string offset + * @param c output UChar32 variable + * @see U8_GET + * @stable ICU 2.4 + */ +#define U8_GET_UNSAFE(s, i, c) { \ + int32_t _u8_get_unsafe_index=(int32_t)(i); \ + U8_SET_CP_START_UNSAFE(s, _u8_get_unsafe_index); \ + U8_NEXT_UNSAFE(s, _u8_get_unsafe_index, c); \ +} + +/** + * Get a code point from a string at a random-access offset, + * without changing the offset. + * The offset may point to either the lead byte or one of the trail bytes + * for a code point, in which case the macro will read all of the bytes + * for the code point. + * + * The length can be negative for a NUL-terminated string. + * + * If the offset points to an illegal UTF-8 byte sequence, then + * c is set to a negative value. + * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT. + * + * @param s const uint8_t * string + * @param start int32_t starting string offset + * @param i int32_t string offset, must be start<=i<length + * @param length int32_t string length + * @param c output UChar32 variable, set to <0 in case of an error + * @see U8_GET_UNSAFE + * @stable ICU 2.4 + */ +#define U8_GET(s, start, i, length, c) { \ + int32_t _u8_get_index=(i); \ + U8_SET_CP_START(s, start, _u8_get_index); \ + U8_NEXT(s, _u8_get_index, length, c); \ +} + +/** + * Get a code point from a string at a random-access offset, + * without changing the offset. + * The offset may point to either the lead byte or one of the trail bytes + * for a code point, in which case the macro will read all of the bytes + * for the code point. + * + * The length can be negative for a NUL-terminated string. + * + * If the offset points to an illegal UTF-8 byte sequence, then + * c is set to U+FFFD. + * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT_OR_FFFD. + * + * This macro does not distinguish between a real U+FFFD in the text + * and U+FFFD returned for an ill-formed sequence. + * Use U8_GET() if that distinction is important. + * + * @param s const uint8_t * string + * @param start int32_t starting string offset + * @param i int32_t string offset, must be start<=i<length + * @param length int32_t string length + * @param c output UChar32 variable, set to U+FFFD in case of an error + * @see U8_GET + * @stable ICU 51 + */ +#define U8_GET_OR_FFFD(s, start, i, length, c) { \ + int32_t _u8_get_index=(i); \ + U8_SET_CP_START(s, start, _u8_get_index); \ + U8_NEXT_OR_FFFD(s, _u8_get_index, length, c); \ +} + +/* definitions with forward iteration --------------------------------------- */ + +/** + * Get a code point from a string at a code point boundary offset, + * and advance the offset to the next code point boundary. + * (Post-incrementing forward iteration.) + * "Unsafe" macro, assumes well-formed UTF-8. + * + * The offset may point to the lead byte of a multi-byte sequence, + * in which case the macro will read the whole sequence. + * The result is undefined if the offset points to a trail byte + * or an illegal UTF-8 sequence. + * + * @param s const uint8_t * string + * @param i string offset + * @param c output UChar32 variable + * @see U8_NEXT + * @stable ICU 2.4 + */ +#define U8_NEXT_UNSAFE(s, i, c) { \ + (c)=(uint8_t)(s)[(i)++]; \ + if((c)>=0x80) { \ + if((c)<0xe0) { \ + (c)=(((c)&0x1f)<<6)|((s)[(i)++]&0x3f); \ + } else if((c)<0xf0) { \ + /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \ + (c)=(UChar)(((c)<<12)|(((s)[i]&0x3f)<<6)|((s)[(i)+1]&0x3f)); \ + (i)+=2; \ + } else { \ + (c)=(((c)&7)<<18)|(((s)[i]&0x3f)<<12)|(((s)[(i)+1]&0x3f)<<6)|((s)[(i)+2]&0x3f); \ + (i)+=3; \ + } \ + } \ +} + +/** + * Get a code point from a string at a code point boundary offset, + * and advance the offset to the next code point boundary. + * (Post-incrementing forward iteration.) + * "Safe" macro, checks for illegal sequences and for string boundaries. + * + * The length can be negative for a NUL-terminated string. + * + * The offset may point to the lead byte of a multi-byte sequence, + * in which case the macro will read the whole sequence. + * If the offset points to a trail byte or an illegal UTF-8 sequence, then + * c is set to a negative value. + * + * @param s const uint8_t * string + * @param i int32_t string offset, must be i<length + * @param length int32_t string length + * @param c output UChar32 variable, set to <0 in case of an error + * @see U8_NEXT_UNSAFE + * @stable ICU 2.4 + */ +#define U8_NEXT(s, i, length, c) { \ + (c)=(uint8_t)(s)[(i)++]; \ + if((c)>=0x80) { \ + uint8_t __t1, __t2; \ + if( /* handle U+1000..U+CFFF inline */ \ + (0xe0<(c) && (c)<=0xec) && \ + (((i)+1)<(length) || (length)<0) && \ + (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \ + (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \ + ) { \ + /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \ + (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \ + (i)+=2; \ + } else if( /* handle U+0080..U+07FF inline */ \ + ((c)<0xe0 && (c)>=0xc2) && \ + ((i)!=(length)) && \ + (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \ + ) { \ + (c)=(((c)&0x1f)<<6)|__t1; \ + ++(i); \ + } else { \ + /* function call for "complicated" and error cases */ \ + (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (length), c, -1); \ + } \ + } \ +} + +/** + * Get a code point from a string at a code point boundary offset, + * and advance the offset to the next code point boundary. + * (Post-incrementing forward iteration.) + * "Safe" macro, checks for illegal sequences and for string boundaries. + * + * The length can be negative for a NUL-terminated string. + * + * The offset may point to the lead byte of a multi-byte sequence, + * in which case the macro will read the whole sequence. + * If the offset points to a trail byte or an illegal UTF-8 sequence, then + * c is set to U+FFFD. + * + * This macro does not distinguish between a real U+FFFD in the text + * and U+FFFD returned for an ill-formed sequence. + * Use U8_NEXT() if that distinction is important. + * + * @param s const uint8_t * string + * @param i int32_t string offset, must be i<length + * @param length int32_t string length + * @param c output UChar32 variable, set to U+FFFD in case of an error + * @see U8_NEXT + * @stable ICU 51 + */ +#define U8_NEXT_OR_FFFD(s, i, length, c) { \ + (c)=(uint8_t)(s)[(i)++]; \ + if((c)>=0x80) { \ + uint8_t __t1, __t2; \ + if( /* handle U+1000..U+CFFF inline */ \ + (0xe0<(c) && (c)<=0xec) && \ + (((i)+1)<(length) || (length)<0) && \ + (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \ + (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \ + ) { \ + /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \ + (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \ + (i)+=2; \ + } else if( /* handle U+0080..U+07FF inline */ \ + ((c)<0xe0 && (c)>=0xc2) && \ + ((i)!=(length)) && \ + (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \ + ) { \ + (c)=(((c)&0x1f)<<6)|__t1; \ + ++(i); \ + } else { \ + /* function call for "complicated" and error cases */ \ + (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (length), c, -3); \ + } \ + } \ +} + +/** + * Append a code point to a string, overwriting 1 to 4 bytes. + * The offset points to the current end of the string contents + * and is advanced (post-increment). + * "Unsafe" macro, assumes a valid code point and sufficient space in the string. + * Otherwise, the result is undefined. + * + * @param s const uint8_t * string buffer + * @param i string offset + * @param c code point to append + * @see U8_APPEND + * @stable ICU 2.4 + */ +#define U8_APPEND_UNSAFE(s, i, c) { \ + if((uint32_t)(c)<=0x7f) { \ + (s)[(i)++]=(uint8_t)(c); \ + } else { \ + if((uint32_t)(c)<=0x7ff) { \ + (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \ + } else { \ + if((uint32_t)(c)<=0xffff) { \ + (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \ + } else { \ + (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0); \ + (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80); \ + } \ + (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \ + } \ + (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \ + } \ +} + +/** + * Append a code point to a string, overwriting 1 to 4 bytes. + * The offset points to the current end of the string contents + * and is advanced (post-increment). + * "Safe" macro, checks for a valid code point. + * If a non-ASCII code point is written, checks for sufficient space in the string. + * If the code point is not valid or trail bytes do not fit, + * then isError is set to TRUE. + * + * @param s const uint8_t * string buffer + * @param i int32_t string offset, must be i<capacity + * @param capacity int32_t size of the string buffer + * @param c UChar32 code point to append + * @param isError output UBool set to TRUE if an error occurs, otherwise not modified + * @see U8_APPEND_UNSAFE + * @stable ICU 2.4 + */ +#define U8_APPEND(s, i, capacity, c, isError) { \ + if((uint32_t)(c)<=0x7f) { \ + (s)[(i)++]=(uint8_t)(c); \ + } else if((uint32_t)(c)<=0x7ff && (i)+1<(capacity)) { \ + (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \ + (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \ + } else if((uint32_t)(c)<=0xd7ff && (i)+2<(capacity)) { \ + (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \ + (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \ + (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \ + } else { \ + (i)=utf8_appendCharSafeBody(s, (i), (capacity), c, &(isError)); \ + } \ +} + +/** + * Advance the string offset from one code point boundary to the next. + * (Post-incrementing iteration.) + * "Unsafe" macro, assumes well-formed UTF-8. + * + * @param s const uint8_t * string + * @param i string offset + * @see U8_FWD_1 + * @stable ICU 2.4 + */ +#define U8_FWD_1_UNSAFE(s, i) { \ + (i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((uint8_t)(s)[i]); \ +} + +/** + * Advance the string offset from one code point boundary to the next. + * (Post-incrementing iteration.) + * "Safe" macro, checks for illegal sequences and for string boundaries. + * + * The length can be negative for a NUL-terminated string. + * + * @param s const uint8_t * string + * @param i int32_t string offset, must be i<length + * @param length int32_t string length + * @see U8_FWD_1_UNSAFE + * @stable ICU 2.4 + */ +#define U8_FWD_1(s, i, length) { \ + uint8_t __b=(uint8_t)(s)[(i)++]; \ + if(U8_IS_LEAD(__b)) { \ + uint8_t __count=U8_COUNT_TRAIL_BYTES(__b); \ + if((i)+__count>(length) && (length)>=0) { \ + __count=(uint8_t)((length)-(i)); \ + } \ + while(__count>0 && U8_IS_TRAIL((s)[i])) { \ + ++(i); \ + --__count; \ + } \ + } \ +} + +/** + * Advance the string offset from one code point boundary to the n-th next one, + * i.e., move forward by n code points. + * (Post-incrementing iteration.) + * "Unsafe" macro, assumes well-formed UTF-8. + * + * @param s const uint8_t * string + * @param i string offset + * @param n number of code points to skip + * @see U8_FWD_N + * @stable ICU 2.4 + */ +#define U8_FWD_N_UNSAFE(s, i, n) { \ + int32_t __N=(n); \ + while(__N>0) { \ + U8_FWD_1_UNSAFE(s, i); \ + --__N; \ + } \ +} + +/** + * Advance the string offset from one code point boundary to the n-th next one, + * i.e., move forward by n code points. + * (Post-incrementing iteration.) + * "Safe" macro, checks for illegal sequences and for string boundaries. + * + * The length can be negative for a NUL-terminated string. + * + * @param s const uint8_t * string + * @param i int32_t string offset, must be i<length + * @param length int32_t string length + * @param n number of code points to skip + * @see U8_FWD_N_UNSAFE + * @stable ICU 2.4 + */ +#define U8_FWD_N(s, i, length, n) { \ + int32_t __N=(n); \ + while(__N>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \ + U8_FWD_1(s, i, length); \ + --__N; \ + } \ +} + +/** + * Adjust a random-access offset to a code point boundary + * at the start of a code point. + * If the offset points to a UTF-8 trail byte, + * then the offset is moved backward to the corresponding lead byte. + * Otherwise, it is not modified. + * "Unsafe" macro, assumes well-formed UTF-8. + * + * @param s const uint8_t * string + * @param i string offset + * @see U8_SET_CP_START + * @stable ICU 2.4 + */ +#define U8_SET_CP_START_UNSAFE(s, i) { \ + while(U8_IS_TRAIL((s)[i])) { --(i); } \ +} + +/** + * Adjust a random-access offset to a code point boundary + * at the start of a code point. + * If the offset points to a UTF-8 trail byte, + * then the offset is moved backward to the corresponding lead byte. + * Otherwise, it is not modified. + * "Safe" macro, checks for illegal sequences and for string boundaries. + * + * @param s const uint8_t * string + * @param start int32_t starting string offset (usually 0) + * @param i int32_t string offset, must be start<=i + * @see U8_SET_CP_START_UNSAFE + * @stable ICU 2.4 + */ +#define U8_SET_CP_START(s, start, i) { \ + if(U8_IS_TRAIL((s)[(i)])) { \ + (i)=utf8_back1SafeBody(s, start, (i)); \ + } \ +} + +/* definitions with backward iteration -------------------------------------- */ + +/** + * Move the string offset from one code point boundary to the previous one + * and get the code point between them. + * (Pre-decrementing backward iteration.) + * "Unsafe" macro, assumes well-formed UTF-8. + * + * The input offset may be the same as the string length. + * If the offset is behind a multi-byte sequence, then the macro will read + * the whole sequence. + * If the offset is behind a lead byte, then that itself + * will be returned as the code point. + * The result is undefined if the offset is behind an illegal UTF-8 sequence. + * + * @param s const uint8_t * string + * @param i string offset + * @param c output UChar32 variable + * @see U8_PREV + * @stable ICU 2.4 + */ +#define U8_PREV_UNSAFE(s, i, c) { \ + (c)=(uint8_t)(s)[--(i)]; \ + if(U8_IS_TRAIL(c)) { \ + uint8_t __b, __count=1, __shift=6; \ +\ + /* c is a trail byte */ \ + (c)&=0x3f; \ + for(;;) { \ + __b=(uint8_t)(s)[--(i)]; \ + if(__b>=0xc0) { \ + U8_MASK_LEAD_BYTE(__b, __count); \ + (c)|=(UChar32)__b<<__shift; \ + break; \ + } else { \ + (c)|=(UChar32)(__b&0x3f)<<__shift; \ + ++__count; \ + __shift+=6; \ + } \ + } \ + } \ +} + +/** + * Move the string offset from one code point boundary to the previous one + * and get the code point between them. + * (Pre-decrementing backward iteration.) + * "Safe" macro, checks for illegal sequences and for string boundaries. + * + * The input offset may be the same as the string length. + * If the offset is behind a multi-byte sequence, then the macro will read + * the whole sequence. + * If the offset is behind a lead byte, then that itself + * will be returned as the code point. + * If the offset is behind an illegal UTF-8 sequence, then c is set to a negative value. + * + * @param s const uint8_t * string + * @param start int32_t starting string offset (usually 0) + * @param i int32_t string offset, must be start<i + * @param c output UChar32 variable, set to <0 in case of an error + * @see U8_PREV_UNSAFE + * @stable ICU 2.4 + */ +#define U8_PREV(s, start, i, c) { \ + (c)=(uint8_t)(s)[--(i)]; \ + if((c)>=0x80) { \ + (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \ + } \ +} + +/** + * Move the string offset from one code point boundary to the previous one + * and get the code point between them. + * (Pre-decrementing backward iteration.) + * "Safe" macro, checks for illegal sequences and for string boundaries. + * + * The input offset may be the same as the string length. + * If the offset is behind a multi-byte sequence, then the macro will read + * the whole sequence. + * If the offset is behind a lead byte, then that itself + * will be returned as the code point. + * If the offset is behind an illegal UTF-8 sequence, then c is set to U+FFFD. + * + * This macro does not distinguish between a real U+FFFD in the text + * and U+FFFD returned for an ill-formed sequence. + * Use U8_PREV() if that distinction is important. + * + * @param s const uint8_t * string + * @param start int32_t starting string offset (usually 0) + * @param i int32_t string offset, must be start<i + * @param c output UChar32 variable, set to U+FFFD in case of an error + * @see U8_PREV + * @stable ICU 51 + */ +#define U8_PREV_OR_FFFD(s, start, i, c) { \ + (c)=(uint8_t)(s)[--(i)]; \ + if((c)>=0x80) { \ + (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -3); \ + } \ +} + +/** + * Move the string offset from one code point boundary to the previous one. + * (Pre-decrementing backward iteration.) + * The input offset may be the same as the string length. + * "Unsafe" macro, assumes well-formed UTF-8. + * + * @param s const uint8_t * string + * @param i string offset + * @see U8_BACK_1 + * @stable ICU 2.4 + */ +#define U8_BACK_1_UNSAFE(s, i) { \ + while(U8_IS_TRAIL((s)[--(i)])) {} \ +} + +/** + * Move the string offset from one code point boundary to the previous one. + * (Pre-decrementing backward iteration.) + * The input offset may be the same as the string length. + * "Safe" macro, checks for illegal sequences and for string boundaries. + * + * @param s const uint8_t * string + * @param start int32_t starting string offset (usually 0) + * @param i int32_t string offset, must be start<i + * @see U8_BACK_1_UNSAFE + * @stable ICU 2.4 + */ +#define U8_BACK_1(s, start, i) { \ + if(U8_IS_TRAIL((s)[--(i)])) { \ + (i)=utf8_back1SafeBody(s, start, (i)); \ + } \ +} + +/** + * Move the string offset from one code point boundary to the n-th one before it, + * i.e., move backward by n code points. + * (Pre-decrementing backward iteration.) + * The input offset may be the same as the string length. + * "Unsafe" macro, assumes well-formed UTF-8. + * + * @param s const uint8_t * string + * @param i string offset + * @param n number of code points to skip + * @see U8_BACK_N + * @stable ICU 2.4 + */ +#define U8_BACK_N_UNSAFE(s, i, n) { \ + int32_t __N=(n); \ + while(__N>0) { \ + U8_BACK_1_UNSAFE(s, i); \ + --__N; \ + } \ +} + +/** + * Move the string offset from one code point boundary to the n-th one before it, + * i.e., move backward by n code points. + * (Pre-decrementing backward iteration.) + * The input offset may be the same as the string length. + * "Safe" macro, checks for illegal sequences and for string boundaries. + * + * @param s const uint8_t * string + * @param start int32_t index of the start of the string + * @param i int32_t string offset, must be start<i + * @param n number of code points to skip + * @see U8_BACK_N_UNSAFE + * @stable ICU 2.4 + */ +#define U8_BACK_N(s, start, i, n) { \ + int32_t __N=(n); \ + while(__N>0 && (i)>(start)) { \ + U8_BACK_1(s, start, i); \ + --__N; \ + } \ +} + +/** + * Adjust a random-access offset to a code point boundary after a code point. + * If the offset is behind a partial multi-byte sequence, + * then the offset is incremented to behind the whole sequence. + * Otherwise, it is not modified. + * The input offset may be the same as the string length. + * "Unsafe" macro, assumes well-formed UTF-8. + * + * @param s const uint8_t * string + * @param i string offset + * @see U8_SET_CP_LIMIT + * @stable ICU 2.4 + */ +#define U8_SET_CP_LIMIT_UNSAFE(s, i) { \ + U8_BACK_1_UNSAFE(s, i); \ + U8_FWD_1_UNSAFE(s, i); \ +} + +/** + * Adjust a random-access offset to a code point boundary after a code point. + * If the offset is behind a partial multi-byte sequence, + * then the offset is incremented to behind the whole sequence. + * Otherwise, it is not modified. + * The input offset may be the same as the string length. + * "Safe" macro, checks for illegal sequences and for string boundaries. + * + * The length can be negative for a NUL-terminated string. + * + * @param s const uint8_t * string + * @param start int32_t starting string offset (usually 0) + * @param i int32_t string offset, must be start<=i<=length + * @param length int32_t string length + * @see U8_SET_CP_LIMIT_UNSAFE + * @stable ICU 2.4 + */ +#define U8_SET_CP_LIMIT(s, start, i, length) { \ + if((start)<(i) && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \ + U8_BACK_1(s, start, i); \ + U8_FWD_1(s, i, length); \ + } \ +} + +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/utf_old.h b/src/core/basetypes/icu-ucsdet/include/unicode/utf_old.h new file mode 100644 index 00000000..f9125b1d --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/utf_old.h @@ -0,0 +1,1169 @@ +/* +******************************************************************************* +* +* Copyright (C) 2002-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: utf_old.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2002sep21 +* created by: Markus W. Scherer +*/ + +/** + * \file + * \brief C API: Deprecated macros for Unicode string handling + */ + +/** + * + * The macros in utf_old.h are all deprecated and their use discouraged. + * Some of the design principles behind the set of UTF macros + * have changed or proved impractical. + * Almost all of the old "UTF macros" are at least renamed. + * If you are looking for a new equivalent to an old macro, please see the + * comment at the old one. + * + * Brief summary of reasons for deprecation: + * - Switch on UTF_SIZE (selection of UTF-8/16/32 default string processing) + * was impractical. + * - Switch on UTF_SAFE etc. (selection of unsafe/safe/strict default string processing) + * was of little use and impractical. + * - Whole classes of macros became obsolete outside of the UTF_SIZE/UTF_SAFE + * selection framework: UTF32_ macros (all trivial) + * and UTF_ default and intermediate macros (all aliases). + * - The selection framework also caused many macro aliases. + * - Change in Unicode standard: "irregular" sequences (3.0) became illegal (3.2). + * - Change of language in Unicode standard: + * Growing distinction between internal x-bit Unicode strings and external UTF-x + * forms, with the former more lenient. + * Suggests renaming of UTF16_ macros to U16_. + * - The prefix "UTF_" without a width number confused some users. + * - "Safe" append macros needed the addition of an error indicator output. + * - "Safe" UTF-8 macros used legitimate (if rarely used) code point values + * to indicate error conditions. + * - The use of the "_CHAR" infix for code point operations confused some users. + * + * More details: + * + * Until ICU 2.2, utf.h theoretically allowed to choose among UTF-8/16/32 + * for string processing, and among unsafe/safe/strict default macros for that. + * + * It proved nearly impossible to write non-trivial, high-performance code + * that is UTF-generic. + * Unsafe default macros would be dangerous for default string processing, + * and the main reason for the "strict" versions disappeared: + * Between Unicode 3.0 and 3.2 all "irregular" UTF-8 sequences became illegal. + * The only other conditions that "strict" checked for were non-characters, + * which are valid during processing. Only during text input/output should they + * be checked, and at that time other well-formedness checks may be + * necessary or useful as well. + * This can still be done by using U16_NEXT and U_IS_UNICODE_NONCHAR + * or U_IS_UNICODE_CHAR. + * + * The old UTF8_..._SAFE macros also used some normal Unicode code points + * to indicate malformed sequences. + * The new UTF8_ macros without suffix use negative values instead. + * + * The entire contents of utf32.h was moved here without replacement + * because all those macros were trivial and + * were meaningful only in the framework of choosing the UTF size. + * + * See Jitterbug 2150 and its discussion on the ICU mailing list + * in September 2002. + * + * <hr> + * + * <em>Obsolete part</em> of pre-ICU 2.4 utf.h file documentation: + * + * <p>The original concept for these files was for ICU to allow + * in principle to set which UTF (UTF-8/16/32) is used internally + * by defining UTF_SIZE to either 8, 16, or 32. utf.h would then define the UChar type + * accordingly. UTF-16 was the default.</p> + * + * <p>This concept has been abandoned. + * A lot of the ICU source code assumes UChar strings are in UTF-16. + * This is especially true for low-level code like + * conversion, normalization, and collation. + * The utf.h header enforces the default of UTF-16. + * The UTF-8 and UTF-32 macros remain for now for completeness and backward compatibility.</p> + * + * <p>Accordingly, utf.h defines UChar to be an unsigned 16-bit integer. If this matches wchar_t, then + * UChar is defined to be exactly wchar_t, otherwise uint16_t.</p> + * + * <p>UChar32 is defined to be a signed 32-bit integer (int32_t), large enough for a 21-bit + * Unicode code point (Unicode scalar value, 0..0x10ffff). + * Before ICU 2.4, the definition of UChar32 was similarly platform-dependent as + * the definition of UChar. For details see the documentation for UChar32 itself.</p> + * + * <p>utf.h also defines a number of C macros for handling single Unicode code points and + * for using UTF Unicode strings. It includes utf8.h, utf16.h, and utf32.h for the actual + * implementations of those macros and then aliases one set of them (for UTF-16) for general use. + * The UTF-specific macros have the UTF size in the macro name prefixes (UTF16_...), while + * the general alias macros always begin with UTF_...</p> + * + * <p>Many string operations can be done with or without error checking. + * Where such a distinction is useful, there are two versions of the macros, "unsafe" and "safe" + * ones with ..._UNSAFE and ..._SAFE suffixes. The unsafe macros are fast but may cause + * program failures if the strings are not well-formed. The safe macros have an additional, boolean + * parameter "strict". If strict is FALSE, then only illegal sequences are detected. + * Otherwise, irregular sequences and non-characters are detected as well (like single surrogates). + * Safe macros return special error code points for illegal/irregular sequences: + * Typically, U+ffff, or values that would result in a code unit sequence of the same length + * as the erroneous input sequence.<br> + * Note that _UNSAFE macros have fewer parameters: They do not have the strictness parameter, and + * they do not have start/length parameters for boundary checking.</p> + * + * <p>Here, the macros are aliased in two steps: + * In the first step, the UTF-specific macros with UTF16_ prefix and _UNSAFE and _SAFE suffixes are + * aliased according to the UTF_SIZE to macros with UTF_ prefix and the same suffixes and signatures. + * Then, in a second step, the default, general alias macros are set to use either the unsafe or + * the safe/not strict (default) or the safe/strict macro; + * these general macros do not have a strictness parameter.</p> + * + * <p>It is possible to change the default choice for the general alias macros to be unsafe, safe/not strict or safe/strict. + * The default is safe/not strict. It is not recommended to select the unsafe macros as the basis for + * Unicode string handling in ICU! To select this, define UTF_SAFE, UTF_STRICT, or UTF_UNSAFE.</p> + * + * <p>For general use, one should use the default, general macros with UTF_ prefix and no _SAFE/_UNSAFE suffix. + * Only in some cases it may be necessary to control the choice of macro directly and use a less generic alias. + * For example, if it can be assumed that a string is well-formed and the index will stay within the bounds, + * then the _UNSAFE version may be used. + * If a UTF-8 string is to be processed, then the macros with UTF8_ prefixes need to be used.</p> + * + * <hr> + * + * @deprecated ICU 2.4. Use the macros in utf.h, utf16.h, utf8.h instead. + */ + +#ifndef __UTF_OLD_H__ +#define __UTF_OLD_H__ + +#ifndef U_HIDE_DEPRECATED_API + +#include "unicode/utf.h" +#include "unicode/utf8.h" +#include "unicode/utf16.h" + +/* Formerly utf.h, part 1 --------------------------------------------------- */ + +#ifdef U_USE_UTF_DEPRECATES +/** + * Unicode string and array offset and index type. + * ICU always counts Unicode code units (UChars) for + * string offsets, indexes, and lengths, not Unicode code points. + * + * @obsolete ICU 2.6. Use int32_t directly instead since this API will be removed in that release. + */ +typedef int32_t UTextOffset; +#endif + +/** Number of bits in a Unicode string code unit - ICU uses 16-bit Unicode. @deprecated ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF_SIZE 16 + +/** + * The default choice for general Unicode string macros is to use the ..._SAFE macro implementations + * with strict=FALSE. + * + * @deprecated ICU 2.4. Obsolete, see utf_old.h. + */ +#define UTF_SAFE +/** @deprecated ICU 2.4. Obsolete, see utf_old.h. */ +#undef UTF_UNSAFE +/** @deprecated ICU 2.4. Obsolete, see utf_old.h. */ +#undef UTF_STRICT + +/** + * UTF8_ERROR_VALUE_1 and UTF8_ERROR_VALUE_2 are special error values for UTF-8, + * which need 1 or 2 bytes in UTF-8: + * \code + * U+0015 = NAK = Negative Acknowledge, C0 control character + * U+009f = highest C1 control character + * \endcode + * + * These are used by UTF8_..._SAFE macros so that they can return an error value + * that needs the same number of code units (bytes) as were seen by + * a macro. They should be tested with UTF_IS_ERROR() or UTF_IS_VALID(). + * + * @deprecated ICU 2.4. Obsolete, see utf_old.h. + */ +#define UTF8_ERROR_VALUE_1 0x15 + +/** + * See documentation on UTF8_ERROR_VALUE_1 for details. + * + * @deprecated ICU 2.4. Obsolete, see utf_old.h. + */ +#define UTF8_ERROR_VALUE_2 0x9f + +/** + * Error value for all UTFs. This code point value will be set by macros with error + * checking if an error is detected. + * + * @deprecated ICU 2.4. Obsolete, see utf_old.h. + */ +#define UTF_ERROR_VALUE 0xffff + +/** + * Is a given 32-bit code an error value + * as returned by one of the macros for any UTF? + * + * @deprecated ICU 2.4. Obsolete, see utf_old.h. + */ +#define UTF_IS_ERROR(c) \ + (((c)&0xfffe)==0xfffe || (c)==UTF8_ERROR_VALUE_1 || (c)==UTF8_ERROR_VALUE_2) + +/** + * This is a combined macro: Is c a valid Unicode value _and_ not an error code? + * + * @deprecated ICU 2.4. Obsolete, see utf_old.h. + */ +#define UTF_IS_VALID(c) \ + (UTF_IS_UNICODE_CHAR(c) && \ + (c)!=UTF8_ERROR_VALUE_1 && (c)!=UTF8_ERROR_VALUE_2) + +/** + * Is this code unit or code point a surrogate (U+d800..U+dfff)? + * @deprecated ICU 2.4. Renamed to U_IS_SURROGATE and U16_IS_SURROGATE, see utf_old.h. + */ +#define UTF_IS_SURROGATE(uchar) (((uchar)&0xfffff800)==0xd800) + +/** + * Is a given 32-bit code point a Unicode noncharacter? + * + * @deprecated ICU 2.4. Renamed to U_IS_UNICODE_NONCHAR, see utf_old.h. + */ +#define UTF_IS_UNICODE_NONCHAR(c) \ + ((c)>=0xfdd0 && \ + ((uint32_t)(c)<=0xfdef || ((c)&0xfffe)==0xfffe) && \ + (uint32_t)(c)<=0x10ffff) + +/** + * Is a given 32-bit value a Unicode code point value (0..U+10ffff) + * that can be assigned a character? + * + * Code points that are not characters include: + * - single surrogate code points (U+d800..U+dfff, 2048 code points) + * - the last two code points on each plane (U+__fffe and U+__ffff, 34 code points) + * - U+fdd0..U+fdef (new with Unicode 3.1, 32 code points) + * - the highest Unicode code point value is U+10ffff + * + * This means that all code points below U+d800 are character code points, + * and that boundary is tested first for performance. + * + * @deprecated ICU 2.4. Renamed to U_IS_UNICODE_CHAR, see utf_old.h. + */ +#define UTF_IS_UNICODE_CHAR(c) \ + ((uint32_t)(c)<0xd800 || \ + ((uint32_t)(c)>0xdfff && \ + (uint32_t)(c)<=0x10ffff && \ + !UTF_IS_UNICODE_NONCHAR(c))) + +/* Formerly utf8.h ---------------------------------------------------------- */ + +/** + * Count the trail bytes for a UTF-8 lead byte. + * @deprecated ICU 2.4. Renamed to U8_COUNT_TRAIL_BYTES, see utf_old.h. + */ +#define UTF8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[(uint8_t)leadByte]) + +/** + * Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value. + * @deprecated ICU 2.4. Renamed to U8_MASK_LEAD_BYTE, see utf_old.h. + */ +#define UTF8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1) + +/** Is this this code point a single code unit (byte)? @deprecated ICU 2.4. Renamed to U8_IS_SINGLE, see utf_old.h. */ +#define UTF8_IS_SINGLE(uchar) (((uchar)&0x80)==0) +/** Is this this code unit the lead code unit (byte) of a code point? @deprecated ICU 2.4. Renamed to U8_IS_LEAD, see utf_old.h. */ +#define UTF8_IS_LEAD(uchar) ((uint8_t)((uchar)-0xc0)<0x3e) +/** Is this this code unit a trailing code unit (byte) of a code point? @deprecated ICU 2.4. Renamed to U8_IS_TRAIL, see utf_old.h. */ +#define UTF8_IS_TRAIL(uchar) (((uchar)&0xc0)==0x80) + +/** Does this scalar Unicode value need multiple code units for storage? @deprecated ICU 2.4. Use U8_LENGTH or test ((uint32_t)(c)>0x7f) instead, see utf_old.h. */ +#define UTF8_NEED_MULTIPLE_UCHAR(c) ((uint32_t)(c)>0x7f) + +/** + * Given the lead character, how many bytes are taken by this code point. + * ICU does not deal with code points >0x10ffff + * unless necessary for advancing in the byte stream. + * + * These length macros take into account that for values >0x10ffff + * the UTF8_APPEND_CHAR_SAFE macros would write the error code point 0xffff + * with 3 bytes. + * Code point comparisons need to be in uint32_t because UChar32 + * may be a signed type, and negative values must be recognized. + * + * @deprecated ICU 2.4. Use U8_LENGTH instead, see utf.h. + */ +#if 1 +# define UTF8_CHAR_LENGTH(c) \ + ((uint32_t)(c)<=0x7f ? 1 : \ + ((uint32_t)(c)<=0x7ff ? 2 : \ + ((uint32_t)((c)-0x10000)>0xfffff ? 3 : 4) \ + ) \ + ) +#else +# define UTF8_CHAR_LENGTH(c) \ + ((uint32_t)(c)<=0x7f ? 1 : \ + ((uint32_t)(c)<=0x7ff ? 2 : \ + ((uint32_t)(c)<=0xffff ? 3 : \ + ((uint32_t)(c)<=0x10ffff ? 4 : \ + ((uint32_t)(c)<=0x3ffffff ? 5 : \ + ((uint32_t)(c)<=0x7fffffff ? 6 : 3) \ + ) \ + ) \ + ) \ + ) \ + ) +#endif + +/** The maximum number of bytes per code point. @deprecated ICU 2.4. Renamed to U8_MAX_LENGTH, see utf_old.h. */ +#define UTF8_MAX_CHAR_LENGTH 4 + +/** Average number of code units compared to UTF-16. @deprecated ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF8_ARRAY_SIZE(size) ((5*(size))/2) + +/** @deprecated ICU 2.4. Renamed to U8_GET_UNSAFE, see utf_old.h. */ +#define UTF8_GET_CHAR_UNSAFE(s, i, c) { \ + int32_t _utf8_get_char_unsafe_index=(int32_t)(i); \ + UTF8_SET_CHAR_START_UNSAFE(s, _utf8_get_char_unsafe_index); \ + UTF8_NEXT_CHAR_UNSAFE(s, _utf8_get_char_unsafe_index, c); \ +} + +/** @deprecated ICU 2.4. Use U8_GET instead, see utf_old.h. */ +#define UTF8_GET_CHAR_SAFE(s, start, i, length, c, strict) { \ + int32_t _utf8_get_char_safe_index=(int32_t)(i); \ + UTF8_SET_CHAR_START_SAFE(s, start, _utf8_get_char_safe_index); \ + UTF8_NEXT_CHAR_SAFE(s, _utf8_get_char_safe_index, length, c, strict); \ +} + +/** @deprecated ICU 2.4. Renamed to U8_NEXT_UNSAFE, see utf_old.h. */ +#define UTF8_NEXT_CHAR_UNSAFE(s, i, c) { \ + (c)=(s)[(i)++]; \ + if((uint8_t)((c)-0xc0)<0x35) { \ + uint8_t __count=UTF8_COUNT_TRAIL_BYTES(c); \ + UTF8_MASK_LEAD_BYTE(c, __count); \ + switch(__count) { \ + /* each following branch falls through to the next one */ \ + case 3: \ + (c)=((c)<<6)|((s)[(i)++]&0x3f); \ + case 2: \ + (c)=((c)<<6)|((s)[(i)++]&0x3f); \ + case 1: \ + (c)=((c)<<6)|((s)[(i)++]&0x3f); \ + /* no other branches to optimize switch() */ \ + break; \ + } \ + } \ +} + +/** @deprecated ICU 2.4. Renamed to U8_APPEND_UNSAFE, see utf_old.h. */ +#define UTF8_APPEND_CHAR_UNSAFE(s, i, c) { \ + if((uint32_t)(c)<=0x7f) { \ + (s)[(i)++]=(uint8_t)(c); \ + } else { \ + if((uint32_t)(c)<=0x7ff) { \ + (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \ + } else { \ + if((uint32_t)(c)<=0xffff) { \ + (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \ + } else { \ + (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0); \ + (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80); \ + } \ + (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \ + } \ + (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \ + } \ +} + +/** @deprecated ICU 2.4. Renamed to U8_FWD_1_UNSAFE, see utf_old.h. */ +#define UTF8_FWD_1_UNSAFE(s, i) { \ + (i)+=1+UTF8_COUNT_TRAIL_BYTES((s)[i]); \ +} + +/** @deprecated ICU 2.4. Renamed to U8_FWD_N_UNSAFE, see utf_old.h. */ +#define UTF8_FWD_N_UNSAFE(s, i, n) { \ + int32_t __N=(n); \ + while(__N>0) { \ + UTF8_FWD_1_UNSAFE(s, i); \ + --__N; \ + } \ +} + +/** @deprecated ICU 2.4. Renamed to U8_SET_CP_START_UNSAFE, see utf_old.h. */ +#define UTF8_SET_CHAR_START_UNSAFE(s, i) { \ + while(UTF8_IS_TRAIL((s)[i])) { --(i); } \ +} + +/** @deprecated ICU 2.4. Use U8_NEXT instead, see utf_old.h. */ +#define UTF8_NEXT_CHAR_SAFE(s, i, length, c, strict) { \ + (c)=(s)[(i)++]; \ + if((c)>=0x80) { \ + if(UTF8_IS_LEAD(c)) { \ + (c)=utf8_nextCharSafeBody(s, &(i), (int32_t)(length), c, strict); \ + } else { \ + (c)=UTF8_ERROR_VALUE_1; \ + } \ + } \ +} + +/** @deprecated ICU 2.4. Use U8_APPEND instead, see utf_old.h. */ +#define UTF8_APPEND_CHAR_SAFE(s, i, length, c) { \ + if((uint32_t)(c)<=0x7f) { \ + (s)[(i)++]=(uint8_t)(c); \ + } else { \ + (i)=utf8_appendCharSafeBody(s, (int32_t)(i), (int32_t)(length), c, NULL); \ + } \ +} + +/** @deprecated ICU 2.4. Renamed to U8_FWD_1, see utf_old.h. */ +#define UTF8_FWD_1_SAFE(s, i, length) U8_FWD_1(s, i, length) + +/** @deprecated ICU 2.4. Renamed to U8_FWD_N, see utf_old.h. */ +#define UTF8_FWD_N_SAFE(s, i, length, n) U8_FWD_N(s, i, length, n) + +/** @deprecated ICU 2.4. Renamed to U8_SET_CP_START, see utf_old.h. */ +#define UTF8_SET_CHAR_START_SAFE(s, start, i) U8_SET_CP_START(s, start, i) + +/** @deprecated ICU 2.4. Renamed to U8_PREV_UNSAFE, see utf_old.h. */ +#define UTF8_PREV_CHAR_UNSAFE(s, i, c) { \ + (c)=(s)[--(i)]; \ + if(UTF8_IS_TRAIL(c)) { \ + uint8_t __b, __count=1, __shift=6; \ +\ + /* c is a trail byte */ \ + (c)&=0x3f; \ + for(;;) { \ + __b=(s)[--(i)]; \ + if(__b>=0xc0) { \ + UTF8_MASK_LEAD_BYTE(__b, __count); \ + (c)|=(UChar32)__b<<__shift; \ + break; \ + } else { \ + (c)|=(UChar32)(__b&0x3f)<<__shift; \ + ++__count; \ + __shift+=6; \ + } \ + } \ + } \ +} + +/** @deprecated ICU 2.4. Renamed to U8_BACK_1_UNSAFE, see utf_old.h. */ +#define UTF8_BACK_1_UNSAFE(s, i) { \ + while(UTF8_IS_TRAIL((s)[--(i)])) {} \ +} + +/** @deprecated ICU 2.4. Renamed to U8_BACK_N_UNSAFE, see utf_old.h. */ +#define UTF8_BACK_N_UNSAFE(s, i, n) { \ + int32_t __N=(n); \ + while(__N>0) { \ + UTF8_BACK_1_UNSAFE(s, i); \ + --__N; \ + } \ +} + +/** @deprecated ICU 2.4. Renamed to U8_SET_CP_LIMIT_UNSAFE, see utf_old.h. */ +#define UTF8_SET_CHAR_LIMIT_UNSAFE(s, i) { \ + UTF8_BACK_1_UNSAFE(s, i); \ + UTF8_FWD_1_UNSAFE(s, i); \ +} + +/** @deprecated ICU 2.4. Use U8_PREV instead, see utf_old.h. */ +#define UTF8_PREV_CHAR_SAFE(s, start, i, c, strict) { \ + (c)=(s)[--(i)]; \ + if((c)>=0x80) { \ + if((c)<=0xbf) { \ + (c)=utf8_prevCharSafeBody(s, start, &(i), c, strict); \ + } else { \ + (c)=UTF8_ERROR_VALUE_1; \ + } \ + } \ +} + +/** @deprecated ICU 2.4. Renamed to U8_BACK_1, see utf_old.h. */ +#define UTF8_BACK_1_SAFE(s, start, i) U8_BACK_1(s, start, i) + +/** @deprecated ICU 2.4. Renamed to U8_BACK_N, see utf_old.h. */ +#define UTF8_BACK_N_SAFE(s, start, i, n) U8_BACK_N(s, start, i, n) + +/** @deprecated ICU 2.4. Renamed to U8_SET_CP_LIMIT, see utf_old.h. */ +#define UTF8_SET_CHAR_LIMIT_SAFE(s, start, i, length) U8_SET_CP_LIMIT(s, start, i, length) + +/* Formerly utf16.h --------------------------------------------------------- */ + +/** Is uchar a first/lead surrogate? @deprecated ICU 2.4. Renamed to U_IS_LEAD and U16_IS_LEAD, see utf_old.h. */ +#define UTF_IS_FIRST_SURROGATE(uchar) (((uchar)&0xfffffc00)==0xd800) + +/** Is uchar a second/trail surrogate? @deprecated ICU 2.4. Renamed to U_IS_TRAIL and U16_IS_TRAIL, see utf_old.h. */ +#define UTF_IS_SECOND_SURROGATE(uchar) (((uchar)&0xfffffc00)==0xdc00) + +/** Assuming c is a surrogate, is it a first/lead surrogate? @deprecated ICU 2.4. Renamed to U_IS_SURROGATE_LEAD and U16_IS_SURROGATE_LEAD, see utf_old.h. */ +#define UTF_IS_SURROGATE_FIRST(c) (((c)&0x400)==0) + +/** Helper constant for UTF16_GET_PAIR_VALUE. @deprecated ICU 2.4. Renamed to U16_SURROGATE_OFFSET, see utf_old.h. */ +#define UTF_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000) + +/** Get the UTF-32 value from the surrogate code units. @deprecated ICU 2.4. Renamed to U16_GET_SUPPLEMENTARY, see utf_old.h. */ +#define UTF16_GET_PAIR_VALUE(first, second) \ + (((first)<<10UL)+(second)-UTF_SURROGATE_OFFSET) + +/** @deprecated ICU 2.4. Renamed to U16_LEAD, see utf_old.h. */ +#define UTF_FIRST_SURROGATE(supplementary) (UChar)(((supplementary)>>10)+0xd7c0) + +/** @deprecated ICU 2.4. Renamed to U16_TRAIL, see utf_old.h. */ +#define UTF_SECOND_SURROGATE(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00) + +/** @deprecated ICU 2.4. Renamed to U16_LEAD, see utf_old.h. */ +#define UTF16_LEAD(supplementary) UTF_FIRST_SURROGATE(supplementary) + +/** @deprecated ICU 2.4. Renamed to U16_TRAIL, see utf_old.h. */ +#define UTF16_TRAIL(supplementary) UTF_SECOND_SURROGATE(supplementary) + +/** @deprecated ICU 2.4. Renamed to U16_IS_SINGLE, see utf_old.h. */ +#define UTF16_IS_SINGLE(uchar) !UTF_IS_SURROGATE(uchar) + +/** @deprecated ICU 2.4. Renamed to U16_IS_LEAD, see utf_old.h. */ +#define UTF16_IS_LEAD(uchar) UTF_IS_FIRST_SURROGATE(uchar) + +/** @deprecated ICU 2.4. Renamed to U16_IS_TRAIL, see utf_old.h. */ +#define UTF16_IS_TRAIL(uchar) UTF_IS_SECOND_SURROGATE(uchar) + +/** Does this scalar Unicode value need multiple code units for storage? @deprecated ICU 2.4. Use U16_LENGTH or test ((uint32_t)(c)>0xffff) instead, see utf_old.h. */ +#define UTF16_NEED_MULTIPLE_UCHAR(c) ((uint32_t)(c)>0xffff) + +/** @deprecated ICU 2.4. Renamed to U16_LENGTH, see utf_old.h. */ +#define UTF16_CHAR_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2) + +/** @deprecated ICU 2.4. Renamed to U16_MAX_LENGTH, see utf_old.h. */ +#define UTF16_MAX_CHAR_LENGTH 2 + +/** Average number of code units compared to UTF-16. @deprecated ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF16_ARRAY_SIZE(size) (size) + +/** + * Get a single code point from an offset that points to any + * of the code units that belong to that code point. + * Assume 0<=i<length. + * + * This could be used for iteration together with + * UTF16_CHAR_LENGTH() and UTF_IS_ERROR(), + * but the use of UTF16_NEXT_CHAR[_UNSAFE]() and + * UTF16_PREV_CHAR[_UNSAFE]() is more efficient for that. + * @deprecated ICU 2.4. Renamed to U16_GET_UNSAFE, see utf_old.h. + */ +#define UTF16_GET_CHAR_UNSAFE(s, i, c) { \ + (c)=(s)[i]; \ + if(UTF_IS_SURROGATE(c)) { \ + if(UTF_IS_SURROGATE_FIRST(c)) { \ + (c)=UTF16_GET_PAIR_VALUE((c), (s)[(i)+1]); \ + } else { \ + (c)=UTF16_GET_PAIR_VALUE((s)[(i)-1], (c)); \ + } \ + } \ +} + +/** @deprecated ICU 2.4. Use U16_GET instead, see utf_old.h. */ +#define UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict) { \ + (c)=(s)[i]; \ + if(UTF_IS_SURROGATE(c)) { \ + uint16_t __c2; \ + if(UTF_IS_SURROGATE_FIRST(c)) { \ + if((i)+1<(length) && UTF_IS_SECOND_SURROGATE(__c2=(s)[(i)+1])) { \ + (c)=UTF16_GET_PAIR_VALUE((c), __c2); \ + /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \ + } else if(strict) {\ + /* unmatched first surrogate */ \ + (c)=UTF_ERROR_VALUE; \ + } \ + } else { \ + if((i)-1>=(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \ + (c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \ + /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \ + } else if(strict) {\ + /* unmatched second surrogate */ \ + (c)=UTF_ERROR_VALUE; \ + } \ + } \ + } else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \ + (c)=UTF_ERROR_VALUE; \ + } \ +} + +/** @deprecated ICU 2.4. Renamed to U16_NEXT_UNSAFE, see utf_old.h. */ +#define UTF16_NEXT_CHAR_UNSAFE(s, i, c) { \ + (c)=(s)[(i)++]; \ + if(UTF_IS_FIRST_SURROGATE(c)) { \ + (c)=UTF16_GET_PAIR_VALUE((c), (s)[(i)++]); \ + } \ +} + +/** @deprecated ICU 2.4. Renamed to U16_APPEND_UNSAFE, see utf_old.h. */ +#define UTF16_APPEND_CHAR_UNSAFE(s, i, c) { \ + if((uint32_t)(c)<=0xffff) { \ + (s)[(i)++]=(uint16_t)(c); \ + } else { \ + (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \ + (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \ + } \ +} + +/** @deprecated ICU 2.4. Renamed to U16_FWD_1_UNSAFE, see utf_old.h. */ +#define UTF16_FWD_1_UNSAFE(s, i) { \ + if(UTF_IS_FIRST_SURROGATE((s)[(i)++])) { \ + ++(i); \ + } \ +} + +/** @deprecated ICU 2.4. Renamed to U16_FWD_N_UNSAFE, see utf_old.h. */ +#define UTF16_FWD_N_UNSAFE(s, i, n) { \ + int32_t __N=(n); \ + while(__N>0) { \ + UTF16_FWD_1_UNSAFE(s, i); \ + --__N; \ + } \ +} + +/** @deprecated ICU 2.4. Renamed to U16_SET_CP_START_UNSAFE, see utf_old.h. */ +#define UTF16_SET_CHAR_START_UNSAFE(s, i) { \ + if(UTF_IS_SECOND_SURROGATE((s)[i])) { \ + --(i); \ + } \ +} + +/** @deprecated ICU 2.4. Use U16_NEXT instead, see utf_old.h. */ +#define UTF16_NEXT_CHAR_SAFE(s, i, length, c, strict) { \ + (c)=(s)[(i)++]; \ + if(UTF_IS_FIRST_SURROGATE(c)) { \ + uint16_t __c2; \ + if((i)<(length) && UTF_IS_SECOND_SURROGATE(__c2=(s)[(i)])) { \ + ++(i); \ + (c)=UTF16_GET_PAIR_VALUE((c), __c2); \ + /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \ + } else if(strict) {\ + /* unmatched first surrogate */ \ + (c)=UTF_ERROR_VALUE; \ + } \ + } else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \ + /* unmatched second surrogate or other non-character */ \ + (c)=UTF_ERROR_VALUE; \ + } \ +} + +/** @deprecated ICU 2.4. Use U16_APPEND instead, see utf_old.h. */ +#define UTF16_APPEND_CHAR_SAFE(s, i, length, c) { \ + if((uint32_t)(c)<=0xffff) { \ + (s)[(i)++]=(uint16_t)(c); \ + } else if((uint32_t)(c)<=0x10ffff) { \ + if((i)+1<(length)) { \ + (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \ + (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \ + } else /* not enough space */ { \ + (s)[(i)++]=UTF_ERROR_VALUE; \ + } \ + } else /* c>0x10ffff, write error value */ { \ + (s)[(i)++]=UTF_ERROR_VALUE; \ + } \ +} + +/** @deprecated ICU 2.4. Renamed to U16_FWD_1, see utf_old.h. */ +#define UTF16_FWD_1_SAFE(s, i, length) U16_FWD_1(s, i, length) + +/** @deprecated ICU 2.4. Renamed to U16_FWD_N, see utf_old.h. */ +#define UTF16_FWD_N_SAFE(s, i, length, n) U16_FWD_N(s, i, length, n) + +/** @deprecated ICU 2.4. Renamed to U16_SET_CP_START, see utf_old.h. */ +#define UTF16_SET_CHAR_START_SAFE(s, start, i) U16_SET_CP_START(s, start, i) + +/** @deprecated ICU 2.4. Renamed to U16_PREV_UNSAFE, see utf_old.h. */ +#define UTF16_PREV_CHAR_UNSAFE(s, i, c) { \ + (c)=(s)[--(i)]; \ + if(UTF_IS_SECOND_SURROGATE(c)) { \ + (c)=UTF16_GET_PAIR_VALUE((s)[--(i)], (c)); \ + } \ +} + +/** @deprecated ICU 2.4. Renamed to U16_BACK_1_UNSAFE, see utf_old.h. */ +#define UTF16_BACK_1_UNSAFE(s, i) { \ + if(UTF_IS_SECOND_SURROGATE((s)[--(i)])) { \ + --(i); \ + } \ +} + +/** @deprecated ICU 2.4. Renamed to U16_BACK_N_UNSAFE, see utf_old.h. */ +#define UTF16_BACK_N_UNSAFE(s, i, n) { \ + int32_t __N=(n); \ + while(__N>0) { \ + UTF16_BACK_1_UNSAFE(s, i); \ + --__N; \ + } \ +} + +/** @deprecated ICU 2.4. Renamed to U16_SET_CP_LIMIT_UNSAFE, see utf_old.h. */ +#define UTF16_SET_CHAR_LIMIT_UNSAFE(s, i) { \ + if(UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \ + ++(i); \ + } \ +} + +/** @deprecated ICU 2.4. Use U16_PREV instead, see utf_old.h. */ +#define UTF16_PREV_CHAR_SAFE(s, start, i, c, strict) { \ + (c)=(s)[--(i)]; \ + if(UTF_IS_SECOND_SURROGATE(c)) { \ + uint16_t __c2; \ + if((i)>(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \ + --(i); \ + (c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \ + /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \ + } else if(strict) {\ + /* unmatched second surrogate */ \ + (c)=UTF_ERROR_VALUE; \ + } \ + } else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \ + /* unmatched first surrogate or other non-character */ \ + (c)=UTF_ERROR_VALUE; \ + } \ +} + +/** @deprecated ICU 2.4. Renamed to U16_BACK_1, see utf_old.h. */ +#define UTF16_BACK_1_SAFE(s, start, i) U16_BACK_1(s, start, i) + +/** @deprecated ICU 2.4. Renamed to U16_BACK_N, see utf_old.h. */ +#define UTF16_BACK_N_SAFE(s, start, i, n) U16_BACK_N(s, start, i, n) + +/** @deprecated ICU 2.4. Renamed to U16_SET_CP_LIMIT, see utf_old.h. */ +#define UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length) U16_SET_CP_LIMIT(s, start, i, length) + +/* Formerly utf32.h --------------------------------------------------------- */ + +/* +* Old documentation: +* +* This file defines macros to deal with UTF-32 code units and code points. +* Signatures and semantics are the same as for the similarly named macros +* in utf16.h. +* utf32.h is included by utf.h after unicode/umachine.h</p> +* and some common definitions. +* <p><b>Usage:</b> ICU coding guidelines for if() statements should be followed when using these macros. +* Compound statements (curly braces {}) must be used for if-else-while... +* bodies and all macro statements should be terminated with semicolon.</p> +*/ + +/* internal definitions ----------------------------------------------------- */ + +/** @deprecated ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_IS_SAFE(c, strict) \ + (!(strict) ? \ + (uint32_t)(c)<=0x10ffff : \ + UTF_IS_UNICODE_CHAR(c)) + +/* + * For the semantics of all of these macros, see utf16.h. + * The UTF-32 versions are trivial because any code point is + * encoded using exactly one code unit. + */ + +/* single-code point definitions -------------------------------------------- */ + +/* classes of code unit values */ + +/** @deprecated ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_IS_SINGLE(uchar) 1 +/** @deprecated ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_IS_LEAD(uchar) 0 +/** @deprecated ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_IS_TRAIL(uchar) 0 + +/* number of code units per code point */ + +/** @deprecated ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_NEED_MULTIPLE_UCHAR(c) 0 +/** @deprecated ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_CHAR_LENGTH(c) 1 +/** @deprecated ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_MAX_CHAR_LENGTH 1 + +/* average number of code units compared to UTF-16 */ + +/** @deprecated ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_ARRAY_SIZE(size) (size) + +/** @deprecated ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_GET_CHAR_UNSAFE(s, i, c) { \ + (c)=(s)[i]; \ +} + +/** @deprecated ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_GET_CHAR_SAFE(s, start, i, length, c, strict) { \ + (c)=(s)[i]; \ + if(!UTF32_IS_SAFE(c, strict)) { \ + (c)=UTF_ERROR_VALUE; \ + } \ +} + +/* definitions with forward iteration --------------------------------------- */ + +/** @deprecated ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_NEXT_CHAR_UNSAFE(s, i, c) { \ + (c)=(s)[(i)++]; \ +} + +/** @deprecated ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_APPEND_CHAR_UNSAFE(s, i, c) { \ + (s)[(i)++]=(c); \ +} + +/** @deprecated ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_FWD_1_UNSAFE(s, i) { \ + ++(i); \ +} + +/** @deprecated ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_FWD_N_UNSAFE(s, i, n) { \ + (i)+=(n); \ +} + +/** @deprecated ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_SET_CHAR_START_UNSAFE(s, i) { \ +} + +/** @deprecated ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_NEXT_CHAR_SAFE(s, i, length, c, strict) { \ + (c)=(s)[(i)++]; \ + if(!UTF32_IS_SAFE(c, strict)) { \ + (c)=UTF_ERROR_VALUE; \ + } \ +} + +/** @deprecated ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_APPEND_CHAR_SAFE(s, i, length, c) { \ + if((uint32_t)(c)<=0x10ffff) { \ + (s)[(i)++]=(c); \ + } else /* c>0x10ffff, write 0xfffd */ { \ + (s)[(i)++]=0xfffd; \ + } \ +} + +/** @deprecated ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_FWD_1_SAFE(s, i, length) { \ + ++(i); \ +} + +/** @deprecated ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_FWD_N_SAFE(s, i, length, n) { \ + if(((i)+=(n))>(length)) { \ + (i)=(length); \ + } \ +} + +/** @deprecated ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_SET_CHAR_START_SAFE(s, start, i) { \ +} + +/* definitions with backward iteration -------------------------------------- */ + +/** @deprecated ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_PREV_CHAR_UNSAFE(s, i, c) { \ + (c)=(s)[--(i)]; \ +} + +/** @deprecated ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_BACK_1_UNSAFE(s, i) { \ + --(i); \ +} + +/** @deprecated ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_BACK_N_UNSAFE(s, i, n) { \ + (i)-=(n); \ +} + +/** @deprecated ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_SET_CHAR_LIMIT_UNSAFE(s, i) { \ +} + +/** @deprecated ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_PREV_CHAR_SAFE(s, start, i, c, strict) { \ + (c)=(s)[--(i)]; \ + if(!UTF32_IS_SAFE(c, strict)) { \ + (c)=UTF_ERROR_VALUE; \ + } \ +} + +/** @deprecated ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_BACK_1_SAFE(s, start, i) { \ + --(i); \ +} + +/** @deprecated ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_BACK_N_SAFE(s, start, i, n) { \ + (i)-=(n); \ + if((i)<(start)) { \ + (i)=(start); \ + } \ +} + +/** @deprecated ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_SET_CHAR_LIMIT_SAFE(s, i, length) { \ +} + +/* Formerly utf.h, part 2 --------------------------------------------------- */ + +/** + * Estimate the number of code units for a string based on the number of UTF-16 code units. + * + * @deprecated ICU 2.4. Obsolete, see utf_old.h. + */ +#define UTF_ARRAY_SIZE(size) UTF16_ARRAY_SIZE(size) + +/** @deprecated ICU 2.4. Renamed to U16_GET_UNSAFE, see utf_old.h. */ +#define UTF_GET_CHAR_UNSAFE(s, i, c) UTF16_GET_CHAR_UNSAFE(s, i, c) + +/** @deprecated ICU 2.4. Use U16_GET instead, see utf_old.h. */ +#define UTF_GET_CHAR_SAFE(s, start, i, length, c, strict) UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict) + + +/** @deprecated ICU 2.4. Renamed to U16_NEXT_UNSAFE, see utf_old.h. */ +#define UTF_NEXT_CHAR_UNSAFE(s, i, c) UTF16_NEXT_CHAR_UNSAFE(s, i, c) + +/** @deprecated ICU 2.4. Use U16_NEXT instead, see utf_old.h. */ +#define UTF_NEXT_CHAR_SAFE(s, i, length, c, strict) UTF16_NEXT_CHAR_SAFE(s, i, length, c, strict) + + +/** @deprecated ICU 2.4. Renamed to U16_APPEND_UNSAFE, see utf_old.h. */ +#define UTF_APPEND_CHAR_UNSAFE(s, i, c) UTF16_APPEND_CHAR_UNSAFE(s, i, c) + +/** @deprecated ICU 2.4. Use U16_APPEND instead, see utf_old.h. */ +#define UTF_APPEND_CHAR_SAFE(s, i, length, c) UTF16_APPEND_CHAR_SAFE(s, i, length, c) + + +/** @deprecated ICU 2.4. Renamed to U16_FWD_1_UNSAFE, see utf_old.h. */ +#define UTF_FWD_1_UNSAFE(s, i) UTF16_FWD_1_UNSAFE(s, i) + +/** @deprecated ICU 2.4. Renamed to U16_FWD_1, see utf_old.h. */ +#define UTF_FWD_1_SAFE(s, i, length) UTF16_FWD_1_SAFE(s, i, length) + + +/** @deprecated ICU 2.4. Renamed to U16_FWD_N_UNSAFE, see utf_old.h. */ +#define UTF_FWD_N_UNSAFE(s, i, n) UTF16_FWD_N_UNSAFE(s, i, n) + +/** @deprecated ICU 2.4. Renamed to U16_FWD_N, see utf_old.h. */ +#define UTF_FWD_N_SAFE(s, i, length, n) UTF16_FWD_N_SAFE(s, i, length, n) + + +/** @deprecated ICU 2.4. Renamed to U16_SET_CP_START_UNSAFE, see utf_old.h. */ +#define UTF_SET_CHAR_START_UNSAFE(s, i) UTF16_SET_CHAR_START_UNSAFE(s, i) + +/** @deprecated ICU 2.4. Renamed to U16_SET_CP_START, see utf_old.h. */ +#define UTF_SET_CHAR_START_SAFE(s, start, i) UTF16_SET_CHAR_START_SAFE(s, start, i) + + +/** @deprecated ICU 2.4. Renamed to U16_PREV_UNSAFE, see utf_old.h. */ +#define UTF_PREV_CHAR_UNSAFE(s, i, c) UTF16_PREV_CHAR_UNSAFE(s, i, c) + +/** @deprecated ICU 2.4. Use U16_PREV instead, see utf_old.h. */ +#define UTF_PREV_CHAR_SAFE(s, start, i, c, strict) UTF16_PREV_CHAR_SAFE(s, start, i, c, strict) + + +/** @deprecated ICU 2.4. Renamed to U16_BACK_1_UNSAFE, see utf_old.h. */ +#define UTF_BACK_1_UNSAFE(s, i) UTF16_BACK_1_UNSAFE(s, i) + +/** @deprecated ICU 2.4. Renamed to U16_BACK_1, see utf_old.h. */ +#define UTF_BACK_1_SAFE(s, start, i) UTF16_BACK_1_SAFE(s, start, i) + + +/** @deprecated ICU 2.4. Renamed to U16_BACK_N_UNSAFE, see utf_old.h. */ +#define UTF_BACK_N_UNSAFE(s, i, n) UTF16_BACK_N_UNSAFE(s, i, n) + +/** @deprecated ICU 2.4. Renamed to U16_BACK_N, see utf_old.h. */ +#define UTF_BACK_N_SAFE(s, start, i, n) UTF16_BACK_N_SAFE(s, start, i, n) + + +/** @deprecated ICU 2.4. Renamed to U16_SET_CP_LIMIT_UNSAFE, see utf_old.h. */ +#define UTF_SET_CHAR_LIMIT_UNSAFE(s, i) UTF16_SET_CHAR_LIMIT_UNSAFE(s, i) + +/** @deprecated ICU 2.4. Renamed to U16_SET_CP_LIMIT, see utf_old.h. */ +#define UTF_SET_CHAR_LIMIT_SAFE(s, start, i, length) UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length) + +/* Define default macros (UTF-16 "safe") ------------------------------------ */ + +/** + * Does this code unit alone encode a code point (BMP, not a surrogate)? + * Same as UTF16_IS_SINGLE. + * @deprecated ICU 2.4. Renamed to U_IS_SINGLE and U16_IS_SINGLE, see utf_old.h. + */ +#define UTF_IS_SINGLE(uchar) U16_IS_SINGLE(uchar) + +/** + * Is this code unit the first one of several (a lead surrogate)? + * Same as UTF16_IS_LEAD. + * @deprecated ICU 2.4. Renamed to U_IS_LEAD and U16_IS_LEAD, see utf_old.h. + */ +#define UTF_IS_LEAD(uchar) U16_IS_LEAD(uchar) + +/** + * Is this code unit one of several but not the first one (a trail surrogate)? + * Same as UTF16_IS_TRAIL. + * @deprecated ICU 2.4. Renamed to U_IS_TRAIL and U16_IS_TRAIL, see utf_old.h. + */ +#define UTF_IS_TRAIL(uchar) U16_IS_TRAIL(uchar) + +/** + * Does this code point require multiple code units (is it a supplementary code point)? + * Same as UTF16_NEED_MULTIPLE_UCHAR. + * @deprecated ICU 2.4. Use U16_LENGTH or test ((uint32_t)(c)>0xffff) instead. + */ +#define UTF_NEED_MULTIPLE_UCHAR(c) UTF16_NEED_MULTIPLE_UCHAR(c) + +/** + * How many code units are used to encode this code point (1 or 2)? + * Same as UTF16_CHAR_LENGTH. + * @deprecated ICU 2.4. Renamed to U16_LENGTH, see utf_old.h. + */ +#define UTF_CHAR_LENGTH(c) U16_LENGTH(c) + +/** + * How many code units are used at most for any Unicode code point (2)? + * Same as UTF16_MAX_CHAR_LENGTH. + * @deprecated ICU 2.4. Renamed to U16_MAX_LENGTH, see utf_old.h. + */ +#define UTF_MAX_CHAR_LENGTH U16_MAX_LENGTH + +/** + * Set c to the code point that contains the code unit i. + * i could point to the lead or the trail surrogate for the code point. + * i is not modified. + * Same as UTF16_GET_CHAR. + * \pre 0<=i<length + * + * @deprecated ICU 2.4. Renamed to U16_GET, see utf_old.h. + */ +#define UTF_GET_CHAR(s, start, i, length, c) U16_GET(s, start, i, length, c) + +/** + * Set c to the code point that starts at code unit i + * and advance i to beyond the code units of this code point (post-increment). + * i must point to the first code unit of a code point. + * Otherwise c is set to the trail unit (surrogate) itself. + * Same as UTF16_NEXT_CHAR. + * \pre 0<=i<length + * \post 0<i<=length + * + * @deprecated ICU 2.4. Renamed to U16_NEXT, see utf_old.h. + */ +#define UTF_NEXT_CHAR(s, i, length, c) U16_NEXT(s, i, length, c) + +/** + * Append the code units of code point c to the string at index i + * and advance i to beyond the new code units (post-increment). + * The code units beginning at index i will be overwritten. + * Same as UTF16_APPEND_CHAR. + * \pre 0<=c<=0x10ffff + * \pre 0<=i<length + * \post 0<i<=length + * + * @deprecated ICU 2.4. Use U16_APPEND instead, see utf_old.h. + */ +#define UTF_APPEND_CHAR(s, i, length, c) UTF16_APPEND_CHAR_SAFE(s, i, length, c) + +/** + * Advance i to beyond the code units of the code point that begins at i. + * I.e., advance i by one code point. + * Same as UTF16_FWD_1. + * \pre 0<=i<length + * \post 0<i<=length + * + * @deprecated ICU 2.4. Renamed to U16_FWD_1, see utf_old.h. + */ +#define UTF_FWD_1(s, i, length) U16_FWD_1(s, i, length) + +/** + * Advance i to beyond the code units of the n code points where the first one begins at i. + * I.e., advance i by n code points. + * Same as UT16_FWD_N. + * \pre 0<=i<length + * \post 0<i<=length + * + * @deprecated ICU 2.4. Renamed to U16_FWD_N, see utf_old.h. + */ +#define UTF_FWD_N(s, i, length, n) U16_FWD_N(s, i, length, n) + +/** + * Take the random-access index i and adjust it so that it points to the beginning + * of a code point. + * The input index points to any code unit of a code point and is moved to point to + * the first code unit of the same code point. i is never incremented. + * In other words, if i points to a trail surrogate that is preceded by a matching + * lead surrogate, then i is decremented. Otherwise it is not modified. + * This can be used to start an iteration with UTF_NEXT_CHAR() from a random index. + * Same as UTF16_SET_CHAR_START. + * \pre start<=i<length + * \post start<=i<length + * + * @deprecated ICU 2.4. Renamed to U16_SET_CP_START, see utf_old.h. + */ +#define UTF_SET_CHAR_START(s, start, i) U16_SET_CP_START(s, start, i) + +/** + * Set c to the code point that has code units before i + * and move i backward (towards the beginning of the string) + * to the first code unit of this code point (pre-increment). + * i must point to the first code unit after the last unit of a code point (i==length is allowed). + * Same as UTF16_PREV_CHAR. + * \pre start<i<=length + * \post start<=i<length + * + * @deprecated ICU 2.4. Renamed to U16_PREV, see utf_old.h. + */ +#define UTF_PREV_CHAR(s, start, i, c) U16_PREV(s, start, i, c) + +/** + * Move i backward (towards the beginning of the string) + * to the first code unit of the code point that has code units before i. + * I.e., move i backward by one code point. + * i must point to the first code unit after the last unit of a code point (i==length is allowed). + * Same as UTF16_BACK_1. + * \pre start<i<=length + * \post start<=i<length + * + * @deprecated ICU 2.4. Renamed to U16_BACK_1, see utf_old.h. + */ +#define UTF_BACK_1(s, start, i) U16_BACK_1(s, start, i) + +/** + * Move i backward (towards the beginning of the string) + * to the first code unit of the n code points that have code units before i. + * I.e., move i backward by n code points. + * i must point to the first code unit after the last unit of a code point (i==length is allowed). + * Same as UTF16_BACK_N. + * \pre start<i<=length + * \post start<=i<length + * + * @deprecated ICU 2.4. Renamed to U16_BACK_N, see utf_old.h. + */ +#define UTF_BACK_N(s, start, i, n) U16_BACK_N(s, start, i, n) + +/** + * Take the random-access index i and adjust it so that it points beyond + * a code point. The input index points beyond any code unit + * of a code point and is moved to point beyond the last code unit of the same + * code point. i is never decremented. + * In other words, if i points to a trail surrogate that is preceded by a matching + * lead surrogate, then i is incremented. Otherwise it is not modified. + * This can be used to start an iteration with UTF_PREV_CHAR() from a random index. + * Same as UTF16_SET_CHAR_LIMIT. + * \pre start<i<=length + * \post start<i<=length + * + * @deprecated ICU 2.4. Renamed to U16_SET_CP_LIMIT, see utf_old.h. + */ +#define UTF_SET_CHAR_LIMIT(s, start, i, length) U16_SET_CP_LIMIT(s, start, i, length) + +#endif /* U_HIDE_DEPRECATED_API */ + +#endif + diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/utrace.h b/src/core/basetypes/icu-ucsdet/include/unicode/utrace.h new file mode 100644 index 00000000..2621cf9c --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/utrace.h @@ -0,0 +1,359 @@ +/* +******************************************************************************* +* +* Copyright (C) 2003-2013, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: utrace.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2003aug06 +* created by: Markus W. Scherer +* +* Definitions for ICU tracing/logging. +* +*/ + +#ifndef __UTRACE_H__ +#define __UTRACE_H__ + +#include <stdarg.h> +#include "unicode/utypes.h" + +/** + * \file + * \brief C API: Definitions for ICU tracing/logging. + * + * This provides API for debugging the internals of ICU without the use of + * a traditional debugger. + * + * By default, tracing is disabled in ICU. If you need to debug ICU with + * tracing, please compile ICU with the --enable-tracing configure option. + */ + +U_CDECL_BEGIN + +/** + * Trace severity levels. Higher levels increase the verbosity of the trace output. + * @see utrace_setLevel + * @stable ICU 2.8 + */ +typedef enum UTraceLevel { + /** Disable all tracing @stable ICU 2.8*/ + UTRACE_OFF=-1, + /** Trace error conditions only @stable ICU 2.8*/ + UTRACE_ERROR=0, + /** Trace errors and warnings @stable ICU 2.8*/ + UTRACE_WARNING=3, + /** Trace opens and closes of ICU services @stable ICU 2.8*/ + UTRACE_OPEN_CLOSE=5, + /** Trace an intermediate number of ICU operations @stable ICU 2.8*/ + UTRACE_INFO=7, + /** Trace the maximum number of ICU operations @stable ICU 2.8*/ + UTRACE_VERBOSE=9 +} UTraceLevel; + +/** + * These are the ICU functions that will be traced when tracing is enabled. + * @stable ICU 2.8 + */ +typedef enum UTraceFunctionNumber { + UTRACE_FUNCTION_START=0, + UTRACE_U_INIT=UTRACE_FUNCTION_START, + UTRACE_U_CLEANUP, + UTRACE_FUNCTION_LIMIT, + + UTRACE_CONVERSION_START=0x1000, + UTRACE_UCNV_OPEN=UTRACE_CONVERSION_START, + UTRACE_UCNV_OPEN_PACKAGE, + UTRACE_UCNV_OPEN_ALGORITHMIC, + UTRACE_UCNV_CLONE, + UTRACE_UCNV_CLOSE, + UTRACE_UCNV_FLUSH_CACHE, + UTRACE_UCNV_LOAD, + UTRACE_UCNV_UNLOAD, + UTRACE_CONVERSION_LIMIT, + + UTRACE_COLLATION_START=0x2000, + UTRACE_UCOL_OPEN=UTRACE_COLLATION_START, + UTRACE_UCOL_CLOSE, + UTRACE_UCOL_STRCOLL, + UTRACE_UCOL_GET_SORTKEY, + UTRACE_UCOL_GETLOCALE, + UTRACE_UCOL_NEXTSORTKEYPART, + UTRACE_UCOL_STRCOLLITER, + UTRACE_UCOL_OPEN_FROM_SHORT_STRING, + UTRACE_UCOL_STRCOLLUTF8, /**< @stable ICU 50 */ + UTRACE_COLLATION_LIMIT +} UTraceFunctionNumber; + +/** + * Setter for the trace level. + * @param traceLevel A UTraceLevel value. + * @stable ICU 2.8 + */ +U_STABLE void U_EXPORT2 +utrace_setLevel(int32_t traceLevel); + +/** + * Getter for the trace level. + * @return The UTraceLevel value being used by ICU. + * @stable ICU 2.8 + */ +U_STABLE int32_t U_EXPORT2 +utrace_getLevel(void); + +/* Trace function pointers types ----------------------------- */ + +/** + * Type signature for the trace function to be called when entering a function. + * @param context value supplied at the time the trace functions are set. + * @param fnNumber Enum value indicating the ICU function being entered. + * @stable ICU 2.8 + */ +typedef void U_CALLCONV +UTraceEntry(const void *context, int32_t fnNumber); + +/** + * Type signature for the trace function to be called when exiting from a function. + * @param context value supplied at the time the trace functions are set. + * @param fnNumber Enum value indicating the ICU function being exited. + * @param fmt A formatting string that describes the number and types + * of arguments included with the variable args. The fmt + * string has the same form as the utrace_vformat format + * string. + * @param args A variable arguments list. Contents are described by + * the fmt parameter. + * @see utrace_vformat + * @stable ICU 2.8 + */ +typedef void U_CALLCONV +UTraceExit(const void *context, int32_t fnNumber, + const char *fmt, va_list args); + +/** + * Type signature for the trace function to be called from within an ICU function + * to display data or messages. + * @param context value supplied at the time the trace functions are set. + * @param fnNumber Enum value indicating the ICU function being exited. + * @param level The current tracing level + * @param fmt A format string describing the tracing data that is supplied + * as variable args + * @param args The data being traced, passed as variable args. + * @stable ICU 2.8 + */ +typedef void U_CALLCONV +UTraceData(const void *context, int32_t fnNumber, int32_t level, + const char *fmt, va_list args); + +/** + * Set ICU Tracing functions. Installs application-provided tracing + * functions into ICU. After doing this, subsequent ICU operations + * will call back to the installed functions, providing a trace + * of the use of ICU. Passing a NULL pointer for a tracing function + * is allowed, and inhibits tracing action at points where that function + * would be called. + * <p> + * Tracing and Threads: Tracing functions are global to a process, and + * will be called in response to ICU operations performed by any + * thread. If tracing of an individual thread is desired, the + * tracing functions must themselves filter by checking that the + * current thread is the desired thread. + * + * @param context an uninterpretted pointer. Whatever is passed in + * here will in turn be passed to each of the tracing + * functions UTraceEntry, UTraceExit and UTraceData. + * ICU does not use or alter this pointer. + * @param e Callback function to be called on entry to a + * a traced ICU function. + * @param x Callback function to be called on exit from a + * traced ICU function. + * @param d Callback function to be called from within a + * traced ICU function, for the purpose of providing + * data to the trace. + * + * @stable ICU 2.8 + */ +U_STABLE void U_EXPORT2 +utrace_setFunctions(const void *context, + UTraceEntry *e, UTraceExit *x, UTraceData *d); + +/** + * Get the currently installed ICU tracing functions. Note that a null function + * pointer will be returned if no trace function has been set. + * + * @param context The currently installed tracing context. + * @param e The currently installed UTraceEntry function. + * @param x The currently installed UTraceExit function. + * @param d The currently installed UTraceData function. + * @stable ICU 2.8 + */ +U_STABLE void U_EXPORT2 +utrace_getFunctions(const void **context, + UTraceEntry **e, UTraceExit **x, UTraceData **d); + + + +/* + * + * ICU trace format string syntax + * + * Format Strings are passed to UTraceData functions, and define the + * number and types of the trace data being passed on each call. + * + * The UTraceData function, which is supplied by the application, + * not by ICU, can either forward the trace data (passed via + * varargs) and the format string back to ICU for formatting into + * a displayable string, or it can interpret the format itself, + * and do as it wishes with the trace data. + * + * + * Goals for the format string + * - basic data output + * - easy to use for trace programmer + * - sufficient provision for data types for trace output readability + * - well-defined types and binary portable APIs + * + * Non-goals + * - printf compatibility + * - fancy formatting + * - argument reordering and other internationalization features + * + * ICU trace format strings contain plain text with argument inserts, + * much like standard printf format strings. + * Each insert begins with a '%', then optionally contains a 'v', + * then exactly one type character. + * Two '%' in a row represent a '%' instead of an insert. + * The trace format strings need not have \n at the end. + * + * + * Types + * ----- + * + * Type characters: + * - c A char character in the default codepage. + * - s A NUL-terminated char * string in the default codepage. + * - S A UChar * string. Requires two params, (ptr, length). Length=-1 for nul term. + * - b A byte (8-bit integer). + * - h A 16-bit integer. Also a 16 bit Unicode code unit. + * - d A 32-bit integer. Also a 20 bit Unicode code point value. + * - l A 64-bit integer. + * - p A data pointer. + * + * Vectors + * ------- + * + * If the 'v' is not specified, then one item of the specified type + * is passed in. + * If the 'v' (for "vector") is specified, then a vector of items of the + * specified type is passed in, via a pointer to the first item + * and an int32_t value for the length of the vector. + * Length==-1 means zero or NUL termination. Works for vectors of all types. + * + * Note: %vS is a vector of (UChar *) strings. The strings must + * be nul terminated as there is no way to provide a + * separate length parameter for each string. The length + * parameter (required for all vectors) is the number of + * strings, not the length of the strings. + * + * Examples + * -------- + * + * These examples show the parameters that will be passed to an application's + * UTraceData() function for various formats. + * + * - the precise formatting is up to the application! + * - the examples use type casts for arguments only to _show_ the types of + * arguments without needing variable declarations in the examples; + * the type casts will not be necessary in actual code + * + * UTraceDataFunc(context, fnNumber, level, + * "There is a character %c in the string %s.", // Format String + * (char)c, (const char *)s); // varargs parameters + * -> There is a character 0x42 'B' in the string "Bravo". + * + * UTraceDataFunc(context, fnNumber, level, + * "Vector of bytes %vb vector of chars %vc", + * (const uint8_t *)bytes, (int32_t)bytesLength, + * (const char *)chars, (int32_t)charsLength); + * -> Vector of bytes + * 42 63 64 3f [4] + * vector of chars + * "Bcd?"[4] + * + * UTraceDataFunc(context, fnNumber, level, + * "An int32_t %d and a whole bunch of them %vd", + * (int32_t)-5, (const int32_t *)ints, (int32_t)intsLength); + * -> An int32_t 0xfffffffb and a whole bunch of them + * fffffffb 00000005 0000010a [3] + * + */ + + + +/** + * Trace output Formatter. An application's UTraceData tracing functions may call + * back to this function to format the trace output in a + * human readable form. Note that a UTraceData function may choose + * to not format the data; it could, for example, save it in + * in the raw form it was received (more compact), leaving + * formatting for a later trace analyis tool. + * @param outBuf pointer to a buffer to receive the formatted output. Output + * will be nul terminated if there is space in the buffer - + * if the length of the requested output < the output buffer size. + * @param capacity Length of the output buffer. + * @param indent Number of spaces to indent the output. Intended to allow + * data displayed from nested functions to be indented for readability. + * @param fmt Format specification for the data to output + * @param args Data to be formatted. + * @return Length of formatted output, including the terminating NUL. + * If buffer capacity is insufficient, the required capacity is returned. + * @stable ICU 2.8 + */ +U_STABLE int32_t U_EXPORT2 +utrace_vformat(char *outBuf, int32_t capacity, + int32_t indent, const char *fmt, va_list args); + +/** + * Trace output Formatter. An application's UTraceData tracing functions may call + * this function to format any additional trace data, beyond that + * provided by default, in human readable form with the same + * formatting conventions used by utrace_vformat(). + * @param outBuf pointer to a buffer to receive the formatted output. Output + * will be nul terminated if there is space in the buffer - + * if the length of the requested output < the output buffer size. + * @param capacity Length of the output buffer. + * @param indent Number of spaces to indent the output. Intended to allow + * data displayed from nested functions to be indented for readability. + * @param fmt Format specification for the data to output + * @param ... Data to be formatted. + * @return Length of formatted output, including the terminating NUL. + * If buffer capacity is insufficient, the required capacity is returned. + * @stable ICU 2.8 + */ +U_STABLE int32_t U_EXPORT2 +utrace_format(char *outBuf, int32_t capacity, + int32_t indent, const char *fmt, ...); + + + +/* Trace function numbers --------------------------------------------------- */ + +/** + * Get the name of a function from its trace function number. + * + * @param fnNumber The trace number for an ICU function. + * @return The name string for the function. + * + * @see UTraceFunctionNumber + * @stable ICU 2.8 + */ +U_STABLE const char * U_EXPORT2 +utrace_functionName(int32_t fnNumber); + +U_CDECL_END + +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/utypes.h b/src/core/basetypes/icu-ucsdet/include/unicode/utypes.h new file mode 100644 index 00000000..704089c7 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/utypes.h @@ -0,0 +1,723 @@ +/* +********************************************************************** +* Copyright (C) 1996-2014, International Business Machines +* Corporation and others. All Rights Reserved. +********************************************************************** +* +* FILE NAME : UTYPES.H (formerly ptypes.h) +* +* Date Name Description +* 12/11/96 helena Creation. +* 02/27/97 aliu Added typedefs for UClassID, int8, int16, int32, +* uint8, uint16, and uint32. +* 04/01/97 aliu Added XP_CPLUSPLUS and modified to work under C as +* well as C++. +* Modified to use memcpy() for uprv_arrayCopy() fns. +* 04/14/97 aliu Added TPlatformUtilities. +* 05/07/97 aliu Added import/export specifiers (replacing the old +* broken EXT_CLASS). Added version number for our +* code. Cleaned up header. +* 6/20/97 helena Java class name change. +* 08/11/98 stephen UErrorCode changed from typedef to enum +* 08/12/98 erm Changed T_ANALYTIC_PACKAGE_VERSION to 3 +* 08/14/98 stephen Added uprv_arrayCopy() for int8_t, int16_t, int32_t +* 12/09/98 jfitz Added BUFFER_OVERFLOW_ERROR (bug 1100066) +* 04/20/99 stephen Cleaned up & reworked for autoconf. +* Renamed to utypes.h. +* 05/05/99 stephen Changed to use <inttypes.h> +* 12/07/99 helena Moved copyright notice string from ucnv_bld.h here. +******************************************************************************* +*/ + +#ifndef UTYPES_H +#define UTYPES_H + + +#include "unicode/umachine.h" +#include "unicode/uversion.h" +#include "unicode/uconfig.h" +#include <float.h> + +#if !U_NO_DEFAULT_INCLUDE_UTF_HEADERS +# include "unicode/utf.h" +#endif + +/*! + * \file + * \brief Basic definitions for ICU, for both C and C++ APIs + * + * This file defines basic types, constants, and enumerations directly or + * indirectly by including other header files, especially utf.h for the + * basic character and string definitions and umachine.h for consistent + * integer and other types. + */ + + +/** + * \def U_SHOW_CPLUSPLUS_API + * @internal + */ +#ifdef __cplusplus +# ifndef U_SHOW_CPLUSPLUS_API +# define U_SHOW_CPLUSPLUS_API 1 +# endif +#else +# undef U_SHOW_CPLUSPLUS_API +# define U_SHOW_CPLUSPLUS_API 0 +#endif + +/** @{ API visibility control */ + +/** + * \def U_HIDE_DRAFT_API + * Define this to 1 to request that draft API be "hidden" + * @internal + */ +/** + * \def U_HIDE_INTERNAL_API + * Define this to 1 to request that internal API be "hidden" + * @internal + */ +#if !U_DEFAULT_SHOW_DRAFT && !defined(U_SHOW_DRAFT_API) +#define U_HIDE_DRAFT_API 1 +#endif +#if !U_DEFAULT_SHOW_DRAFT && !defined(U_SHOW_INTERNAL_API) +#define U_HIDE_INTERNAL_API 1 +#endif + +/** @} */ + +/*===========================================================================*/ +/* ICUDATA naming scheme */ +/*===========================================================================*/ + +/** + * \def U_ICUDATA_TYPE_LETTER + * + * This is a platform-dependent string containing one letter: + * - b for big-endian, ASCII-family platforms + * - l for little-endian, ASCII-family platforms + * - e for big-endian, EBCDIC-family platforms + * This letter is part of the common data file name. + * @stable ICU 2.0 + */ + +/** + * \def U_ICUDATA_TYPE_LITLETTER + * The non-string form of U_ICUDATA_TYPE_LETTER + * @stable ICU 2.0 + */ +#if U_CHARSET_FAMILY +# if U_IS_BIG_ENDIAN + /* EBCDIC - should always be BE */ +# define U_ICUDATA_TYPE_LETTER "e" +# define U_ICUDATA_TYPE_LITLETTER e +# else +# error "Don't know what to do with little endian EBCDIC!" +# define U_ICUDATA_TYPE_LETTER "x" +# define U_ICUDATA_TYPE_LITLETTER x +# endif +#else +# if U_IS_BIG_ENDIAN + /* Big-endian ASCII */ +# define U_ICUDATA_TYPE_LETTER "b" +# define U_ICUDATA_TYPE_LITLETTER b +# else + /* Little-endian ASCII */ +# define U_ICUDATA_TYPE_LETTER "l" +# define U_ICUDATA_TYPE_LITLETTER l +# endif +#endif + +/** + * A single string literal containing the icudata stub name. i.e. 'icudt18e' for + * ICU 1.8.x on EBCDIC, etc.. + * @stable ICU 2.0 + */ +#define U_ICUDATA_NAME "icudt" U_ICU_VERSION_SHORT U_ICUDATA_TYPE_LETTER +#ifndef U_HIDE_INTERNAL_API +#define U_USRDATA_NAME "usrdt" U_ICU_VERSION_SHORT U_ICUDATA_TYPE_LETTER /**< @internal */ +#define U_USE_USRDATA 1 /**< @internal */ +#endif /* U_HIDE_INTERNAL_API */ + +/** + * U_ICU_ENTRY_POINT is the name of the DLL entry point to the ICU data library. + * Defined as a literal, not a string. + * Tricky Preprocessor use - ## operator replaces macro paramters with the literal string + * from the corresponding macro invocation, _before_ other macro substitutions. + * Need a nested \#defines to get the actual version numbers rather than + * the literal text U_ICU_VERSION_MAJOR_NUM into the name. + * The net result will be something of the form + * \#define U_ICU_ENTRY_POINT icudt19_dat + * @stable ICU 2.4 + */ +#define U_ICUDATA_ENTRY_POINT U_DEF2_ICUDATA_ENTRY_POINT(U_ICU_VERSION_MAJOR_NUM,U_LIB_SUFFIX_C_NAME) + +#ifndef U_HIDE_INTERNAL_API +/** + * Do not use. Note that it's OK for the 2nd argument to be undefined (literal). + * @internal + */ +#define U_DEF2_ICUDATA_ENTRY_POINT(major,suff) U_DEF_ICUDATA_ENTRY_POINT(major,suff) + +/** + * Do not use. + * @internal + */ +#ifndef U_DEF_ICUDATA_ENTRY_POINT +/* affected by symbol renaming. See platform.h */ +#ifndef U_LIB_SUFFIX_C_NAME +#define U_DEF_ICUDATA_ENTRY_POINT(major, suff) icudt##major##_dat +#else +#define U_DEF_ICUDATA_ENTRY_POINT(major, suff) icudt##suff ## major##_dat +#endif +#endif +#endif /* U_HIDE_INTERNAL_API */ + +/** + * \def NULL + * Define NULL if necessary, to 0 for C++ and to ((void *)0) for C. + * @stable ICU 2.0 + */ +#ifndef NULL +#ifdef __cplusplus +#define NULL 0 +#else +#define NULL ((void *)0) +#endif +#endif + +/*===========================================================================*/ +/* Calendar/TimeZone data types */ +/*===========================================================================*/ + +/** + * Date and Time data type. + * This is a primitive data type that holds the date and time + * as the number of milliseconds since 1970-jan-01, 00:00 UTC. + * UTC leap seconds are ignored. + * @stable ICU 2.0 + */ +typedef double UDate; + +/** The number of milliseconds per second @stable ICU 2.0 */ +#define U_MILLIS_PER_SECOND (1000) +/** The number of milliseconds per minute @stable ICU 2.0 */ +#define U_MILLIS_PER_MINUTE (60000) +/** The number of milliseconds per hour @stable ICU 2.0 */ +#define U_MILLIS_PER_HOUR (3600000) +/** The number of milliseconds per day @stable ICU 2.0 */ +#define U_MILLIS_PER_DAY (86400000) + +/** + * Maximum UDate value + * @stable ICU 4.8 + */ +#define U_DATE_MAX DBL_MAX + +/** + * Minimum UDate value + * @stable ICU 4.8 + */ +#define U_DATE_MIN -U_DATE_MAX + +/*===========================================================================*/ +/* Shared library/DLL import-export API control */ +/*===========================================================================*/ + +/* + * Control of symbol import/export. + * ICU is separated into three libraries. + */ + +/** + * \def U_COMBINED_IMPLEMENTATION + * Set to export library symbols from inside the ICU library + * when all of ICU is in a single library. + * This can be set as a compiler option while building ICU, and it + * needs to be the first one tested to override U_COMMON_API, U_I18N_API, etc. + * @stable ICU 2.0 + */ + +/** + * \def U_DATA_API + * Set to export library symbols from inside the stubdata library, + * and to import them from outside. + * @stable ICU 3.0 + */ + +/** + * \def U_COMMON_API + * Set to export library symbols from inside the common library, + * and to import them from outside. + * @stable ICU 2.0 + */ + +/** + * \def U_I18N_API + * Set to export library symbols from inside the i18n library, + * and to import them from outside. + * @stable ICU 2.0 + */ + +/** + * \def U_LAYOUT_API + * Set to export library symbols from inside the layout engine library, + * and to import them from outside. + * @stable ICU 2.0 + */ + +/** + * \def U_LAYOUTEX_API + * Set to export library symbols from inside the layout extensions library, + * and to import them from outside. + * @stable ICU 2.6 + */ + +/** + * \def U_IO_API + * Set to export library symbols from inside the ustdio library, + * and to import them from outside. + * @stable ICU 2.0 + */ + +/** + * \def U_TOOLUTIL_API + * Set to export library symbols from inside the toolutil library, + * and to import them from outside. + * @stable ICU 3.4 + */ + +#if defined(U_COMBINED_IMPLEMENTATION) +#define U_DATA_API U_EXPORT +#define U_COMMON_API U_EXPORT +#define U_I18N_API U_EXPORT +#define U_LAYOUT_API U_EXPORT +#define U_LAYOUTEX_API U_EXPORT +#define U_IO_API U_EXPORT +#define U_TOOLUTIL_API U_EXPORT +#elif defined(U_STATIC_IMPLEMENTATION) +#define U_DATA_API +#define U_COMMON_API +#define U_I18N_API +#define U_LAYOUT_API +#define U_LAYOUTEX_API +#define U_IO_API +#define U_TOOLUTIL_API +#elif defined(U_COMMON_IMPLEMENTATION) +#define U_DATA_API U_IMPORT +#define U_COMMON_API U_EXPORT +#define U_I18N_API U_IMPORT +#define U_LAYOUT_API U_IMPORT +#define U_LAYOUTEX_API U_IMPORT +#define U_IO_API U_IMPORT +#define U_TOOLUTIL_API U_IMPORT +#elif defined(U_I18N_IMPLEMENTATION) +#define U_DATA_API U_IMPORT +#define U_COMMON_API U_IMPORT +#define U_I18N_API U_EXPORT +#define U_LAYOUT_API U_IMPORT +#define U_LAYOUTEX_API U_IMPORT +#define U_IO_API U_IMPORT +#define U_TOOLUTIL_API U_IMPORT +#elif defined(U_LAYOUT_IMPLEMENTATION) +#define U_DATA_API U_IMPORT +#define U_COMMON_API U_IMPORT +#define U_I18N_API U_IMPORT +#define U_LAYOUT_API U_EXPORT +#define U_LAYOUTEX_API U_IMPORT +#define U_IO_API U_IMPORT +#define U_TOOLUTIL_API U_IMPORT +#elif defined(U_LAYOUTEX_IMPLEMENTATION) +#define U_DATA_API U_IMPORT +#define U_COMMON_API U_IMPORT +#define U_I18N_API U_IMPORT +#define U_LAYOUT_API U_IMPORT +#define U_LAYOUTEX_API U_EXPORT +#define U_IO_API U_IMPORT +#define U_TOOLUTIL_API U_IMPORT +#elif defined(U_IO_IMPLEMENTATION) +#define U_DATA_API U_IMPORT +#define U_COMMON_API U_IMPORT +#define U_I18N_API U_IMPORT +#define U_LAYOUT_API U_IMPORT +#define U_LAYOUTEX_API U_IMPORT +#define U_IO_API U_EXPORT +#define U_TOOLUTIL_API U_IMPORT +#elif defined(U_TOOLUTIL_IMPLEMENTATION) +#define U_DATA_API U_IMPORT +#define U_COMMON_API U_IMPORT +#define U_I18N_API U_IMPORT +#define U_LAYOUT_API U_IMPORT +#define U_LAYOUTEX_API U_IMPORT +#define U_IO_API U_IMPORT +#define U_TOOLUTIL_API U_EXPORT +#else +#define U_DATA_API U_IMPORT +#define U_COMMON_API U_IMPORT +#define U_I18N_API U_IMPORT +#define U_LAYOUT_API U_IMPORT +#define U_LAYOUTEX_API U_IMPORT +#define U_IO_API U_IMPORT +#define U_TOOLUTIL_API U_IMPORT +#endif + +/** + * \def U_STANDARD_CPP_NAMESPACE + * Control of C++ Namespace + * @stable ICU 2.0 + */ +#ifdef __cplusplus +#define U_STANDARD_CPP_NAMESPACE :: +#else +#define U_STANDARD_CPP_NAMESPACE +#endif + + +/*===========================================================================*/ +/* Global delete operator */ +/*===========================================================================*/ + +/* + * The ICU4C library must not use the global new and delete operators. + * These operators here are defined to enable testing for this. + * See Jitterbug 2581 for details of why this is necessary. + * + * Verification that ICU4C's memory usage is correct, i.e., + * that global new/delete are not used: + * + * a) Check for imports of global new/delete (see uobject.cpp for details) + * b) Verify that new is never imported. + * c) Verify that delete is only imported from object code for interface/mixin classes. + * d) Add global delete and delete[] only for the ICU4C library itself + * and define them in a way that crashes or otherwise easily shows a problem. + * + * The following implements d). + * The operator implementations crash; this is intentional and used for library debugging. + * + * Note: This is currently only done on Windows because + * some Linux/Unix compilers have problems with defining global new/delete. + * On Windows, it is _MSC_VER>=1200 for MSVC 6.0 and higher. + */ +#if defined(__cplusplus) && U_DEBUG && U_OVERRIDE_CXX_ALLOCATION && (_MSC_VER>=1200) && !defined(U_STATIC_IMPLEMENTATION) && (defined(U_COMMON_IMPLEMENTATION) || defined(U_I18N_IMPLEMENTATION) || defined(U_IO_IMPLEMENTATION) || defined(U_LAYOUT_IMPLEMENTATION) || defined(U_LAYOUTEX_IMPLEMENTATION)) + +#ifndef U_HIDE_INTERNAL_API +/** + * Global operator new, defined only inside ICU4C, must not be used. + * Crashes intentionally. + * @internal + */ +inline void * +operator new(size_t /*size*/) { + char *q=NULL; + *q=5; /* break it */ + return q; +} + +#ifdef _Ret_bytecap_ +/* This is only needed to suppress a Visual C++ 2008 warning for operator new[]. */ +_Ret_bytecap_(_Size) +#endif +/** + * Global operator new[], defined only inside ICU4C, must not be used. + * Crashes intentionally. + * @internal + */ +inline void * +operator new[](size_t /*size*/) { + char *q=NULL; + *q=5; /* break it */ + return q; +} + +/** + * Global operator delete, defined only inside ICU4C, must not be used. + * Crashes intentionally. + * @internal + */ +inline void +operator delete(void * /*p*/) { + char *q=NULL; + *q=5; /* break it */ +} + +/** + * Global operator delete[], defined only inside ICU4C, must not be used. + * Crashes intentionally. + * @internal + */ +inline void +operator delete[](void * /*p*/) { + char *q=NULL; + *q=5; /* break it */ +} + +#endif /* U_HIDE_INTERNAL_API */ +#endif + +/*===========================================================================*/ +/* UErrorCode */ +/*===========================================================================*/ + +/** + * Error code to replace exception handling, so that the code is compatible with all C++ compilers, + * and to use the same mechanism for C and C++. + * + * \par + * ICU functions that take a reference (C++) or a pointer (C) to a UErrorCode + * first test if(U_FAILURE(errorCode)) { return immediately; } + * so that in a chain of such functions the first one that sets an error code + * causes the following ones to not perform any operations. + * + * \par + * Error codes should be tested using U_FAILURE() and U_SUCCESS(). + * @stable ICU 2.0 + */ +typedef enum UErrorCode { + /* The ordering of U_ERROR_INFO_START Vs U_USING_FALLBACK_WARNING looks weird + * and is that way because VC++ debugger displays first encountered constant, + * which is not the what the code is used for + */ + + U_USING_FALLBACK_WARNING = -128, /**< A resource bundle lookup returned a fallback result (not an error) */ + + U_ERROR_WARNING_START = -128, /**< Start of information results (semantically successful) */ + + U_USING_DEFAULT_WARNING = -127, /**< A resource bundle lookup returned a result from the root locale (not an error) */ + + U_SAFECLONE_ALLOCATED_WARNING = -126, /**< A SafeClone operation required allocating memory (informational only) */ + + U_STATE_OLD_WARNING = -125, /**< ICU has to use compatibility layer to construct the service. Expect performance/memory usage degradation. Consider upgrading */ + + U_STRING_NOT_TERMINATED_WARNING = -124,/**< An output string could not be NUL-terminated because output length==destCapacity. */ + + U_SORT_KEY_TOO_SHORT_WARNING = -123, /**< Number of levels requested in getBound is higher than the number of levels in the sort key */ + + U_AMBIGUOUS_ALIAS_WARNING = -122, /**< This converter alias can go to different converter implementations */ + + U_DIFFERENT_UCA_VERSION = -121, /**< ucol_open encountered a mismatch between UCA version and collator image version, so the collator was constructed from rules. No impact to further function */ + + U_PLUGIN_CHANGED_LEVEL_WARNING = -120, /**< A plugin caused a level change. May not be an error, but later plugins may not load. */ + + U_ERROR_WARNING_LIMIT, /**< This must always be the last warning value to indicate the limit for UErrorCode warnings (last warning code +1) */ + + + U_ZERO_ERROR = 0, /**< No error, no warning. */ + + U_ILLEGAL_ARGUMENT_ERROR = 1, /**< Start of codes indicating failure */ + U_MISSING_RESOURCE_ERROR = 2, /**< The requested resource cannot be found */ + U_INVALID_FORMAT_ERROR = 3, /**< Data format is not what is expected */ + U_FILE_ACCESS_ERROR = 4, /**< The requested file cannot be found */ + U_INTERNAL_PROGRAM_ERROR = 5, /**< Indicates a bug in the library code */ + U_MESSAGE_PARSE_ERROR = 6, /**< Unable to parse a message (message format) */ + U_MEMORY_ALLOCATION_ERROR = 7, /**< Memory allocation error */ + U_INDEX_OUTOFBOUNDS_ERROR = 8, /**< Trying to access the index that is out of bounds */ + U_PARSE_ERROR = 9, /**< Equivalent to Java ParseException */ + U_INVALID_CHAR_FOUND = 10, /**< Character conversion: Unmappable input sequence. In other APIs: Invalid character. */ + U_TRUNCATED_CHAR_FOUND = 11, /**< Character conversion: Incomplete input sequence. */ + U_ILLEGAL_CHAR_FOUND = 12, /**< Character conversion: Illegal input sequence/combination of input units. */ + U_INVALID_TABLE_FORMAT = 13, /**< Conversion table file found, but corrupted */ + U_INVALID_TABLE_FILE = 14, /**< Conversion table file not found */ + U_BUFFER_OVERFLOW_ERROR = 15, /**< A result would not fit in the supplied buffer */ + U_UNSUPPORTED_ERROR = 16, /**< Requested operation not supported in current context */ + U_RESOURCE_TYPE_MISMATCH = 17, /**< an operation is requested over a resource that does not support it */ + U_ILLEGAL_ESCAPE_SEQUENCE = 18, /**< ISO-2022 illlegal escape sequence */ + U_UNSUPPORTED_ESCAPE_SEQUENCE = 19, /**< ISO-2022 unsupported escape sequence */ + U_NO_SPACE_AVAILABLE = 20, /**< No space available for in-buffer expansion for Arabic shaping */ + U_CE_NOT_FOUND_ERROR = 21, /**< Currently used only while setting variable top, but can be used generally */ + U_PRIMARY_TOO_LONG_ERROR = 22, /**< User tried to set variable top to a primary that is longer than two bytes */ + U_STATE_TOO_OLD_ERROR = 23, /**< ICU cannot construct a service from this state, as it is no longer supported */ + U_TOO_MANY_ALIASES_ERROR = 24, /**< There are too many aliases in the path to the requested resource. + It is very possible that a circular alias definition has occured */ + U_ENUM_OUT_OF_SYNC_ERROR = 25, /**< UEnumeration out of sync with underlying collection */ + U_INVARIANT_CONVERSION_ERROR = 26, /**< Unable to convert a UChar* string to char* with the invariant converter. */ + U_INVALID_STATE_ERROR = 27, /**< Requested operation can not be completed with ICU in its current state */ + U_COLLATOR_VERSION_MISMATCH = 28, /**< Collator version is not compatible with the base version */ + U_USELESS_COLLATOR_ERROR = 29, /**< Collator is options only and no base is specified */ + U_NO_WRITE_PERMISSION = 30, /**< Attempt to modify read-only or constant data. */ + + U_STANDARD_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for standard errors */ + /* + * the error code range 0x10000 0x10100 are reserved for Transliterator + */ + U_BAD_VARIABLE_DEFINITION=0x10000,/**< Missing '$' or duplicate variable name */ + U_PARSE_ERROR_START = 0x10000, /**< Start of Transliterator errors */ + U_MALFORMED_RULE, /**< Elements of a rule are misplaced */ + U_MALFORMED_SET, /**< A UnicodeSet pattern is invalid*/ + U_MALFORMED_SYMBOL_REFERENCE, /**< UNUSED as of ICU 2.4 */ + U_MALFORMED_UNICODE_ESCAPE, /**< A Unicode escape pattern is invalid*/ + U_MALFORMED_VARIABLE_DEFINITION, /**< A variable definition is invalid */ + U_MALFORMED_VARIABLE_REFERENCE, /**< A variable reference is invalid */ + U_MISMATCHED_SEGMENT_DELIMITERS, /**< UNUSED as of ICU 2.4 */ + U_MISPLACED_ANCHOR_START, /**< A start anchor appears at an illegal position */ + U_MISPLACED_CURSOR_OFFSET, /**< A cursor offset occurs at an illegal position */ + U_MISPLACED_QUANTIFIER, /**< A quantifier appears after a segment close delimiter */ + U_MISSING_OPERATOR, /**< A rule contains no operator */ + U_MISSING_SEGMENT_CLOSE, /**< UNUSED as of ICU 2.4 */ + U_MULTIPLE_ANTE_CONTEXTS, /**< More than one ante context */ + U_MULTIPLE_CURSORS, /**< More than one cursor */ + U_MULTIPLE_POST_CONTEXTS, /**< More than one post context */ + U_TRAILING_BACKSLASH, /**< A dangling backslash */ + U_UNDEFINED_SEGMENT_REFERENCE, /**< A segment reference does not correspond to a defined segment */ + U_UNDEFINED_VARIABLE, /**< A variable reference does not correspond to a defined variable */ + U_UNQUOTED_SPECIAL, /**< A special character was not quoted or escaped */ + U_UNTERMINATED_QUOTE, /**< A closing single quote is missing */ + U_RULE_MASK_ERROR, /**< A rule is hidden by an earlier more general rule */ + U_MISPLACED_COMPOUND_FILTER, /**< A compound filter is in an invalid location */ + U_MULTIPLE_COMPOUND_FILTERS, /**< More than one compound filter */ + U_INVALID_RBT_SYNTAX, /**< A "::id" rule was passed to the RuleBasedTransliterator parser */ + U_INVALID_PROPERTY_PATTERN, /**< UNUSED as of ICU 2.4 */ + U_MALFORMED_PRAGMA, /**< A 'use' pragma is invlalid */ + U_UNCLOSED_SEGMENT, /**< A closing ')' is missing */ + U_ILLEGAL_CHAR_IN_SEGMENT, /**< UNUSED as of ICU 2.4 */ + U_VARIABLE_RANGE_EXHAUSTED, /**< Too many stand-ins generated for the given variable range */ + U_VARIABLE_RANGE_OVERLAP, /**< The variable range overlaps characters used in rules */ + U_ILLEGAL_CHARACTER, /**< A special character is outside its allowed context */ + U_INTERNAL_TRANSLITERATOR_ERROR, /**< Internal transliterator system error */ + U_INVALID_ID, /**< A "::id" rule specifies an unknown transliterator */ + U_INVALID_FUNCTION, /**< A "&fn()" rule specifies an unknown transliterator */ + U_PARSE_ERROR_LIMIT, /**< The limit for Transliterator errors */ + + /* + * the error code range 0x10100 0x10200 are reserved for formatting API parsing error + */ + U_UNEXPECTED_TOKEN=0x10100, /**< Syntax error in format pattern */ + U_FMT_PARSE_ERROR_START=0x10100, /**< Start of format library errors */ + U_MULTIPLE_DECIMAL_SEPARATORS, /**< More than one decimal separator in number pattern */ + U_MULTIPLE_DECIMAL_SEPERATORS = U_MULTIPLE_DECIMAL_SEPARATORS, /**< Typo: kept for backward compatibility. Use U_MULTIPLE_DECIMAL_SEPARATORS */ + U_MULTIPLE_EXPONENTIAL_SYMBOLS, /**< More than one exponent symbol in number pattern */ + U_MALFORMED_EXPONENTIAL_PATTERN, /**< Grouping symbol in exponent pattern */ + U_MULTIPLE_PERCENT_SYMBOLS, /**< More than one percent symbol in number pattern */ + U_MULTIPLE_PERMILL_SYMBOLS, /**< More than one permill symbol in number pattern */ + U_MULTIPLE_PAD_SPECIFIERS, /**< More than one pad symbol in number pattern */ + U_PATTERN_SYNTAX_ERROR, /**< Syntax error in format pattern */ + U_ILLEGAL_PAD_POSITION, /**< Pad symbol misplaced in number pattern */ + U_UNMATCHED_BRACES, /**< Braces do not match in message pattern */ + U_UNSUPPORTED_PROPERTY, /**< UNUSED as of ICU 2.4 */ + U_UNSUPPORTED_ATTRIBUTE, /**< UNUSED as of ICU 2.4 */ + U_ARGUMENT_TYPE_MISMATCH, /**< Argument name and argument index mismatch in MessageFormat functions */ + U_DUPLICATE_KEYWORD, /**< Duplicate keyword in PluralFormat */ + U_UNDEFINED_KEYWORD, /**< Undefined Plural keyword */ + U_DEFAULT_KEYWORD_MISSING, /**< Missing DEFAULT rule in plural rules */ + U_DECIMAL_NUMBER_SYNTAX_ERROR, /**< Decimal number syntax error */ + U_FORMAT_INEXACT_ERROR, /**< Cannot format a number exactly and rounding mode is ROUND_UNNECESSARY @stable ICU 4.8 */ + U_FMT_PARSE_ERROR_LIMIT, /**< The limit for format library errors */ + + /* + * the error code range 0x10200 0x102ff are reserved for Break Iterator related error + */ + U_BRK_INTERNAL_ERROR=0x10200, /**< An internal error (bug) was detected. */ + U_BRK_ERROR_START=0x10200, /**< Start of codes indicating Break Iterator failures */ + U_BRK_HEX_DIGITS_EXPECTED, /**< Hex digits expected as part of a escaped char in a rule. */ + U_BRK_SEMICOLON_EXPECTED, /**< Missing ';' at the end of a RBBI rule. */ + U_BRK_RULE_SYNTAX, /**< Syntax error in RBBI rule. */ + U_BRK_UNCLOSED_SET, /**< UnicodeSet witing an RBBI rule missing a closing ']'. */ + U_BRK_ASSIGN_ERROR, /**< Syntax error in RBBI rule assignment statement. */ + U_BRK_VARIABLE_REDFINITION, /**< RBBI rule $Variable redefined. */ + U_BRK_MISMATCHED_PAREN, /**< Mis-matched parentheses in an RBBI rule. */ + U_BRK_NEW_LINE_IN_QUOTED_STRING, /**< Missing closing quote in an RBBI rule. */ + U_BRK_UNDEFINED_VARIABLE, /**< Use of an undefined $Variable in an RBBI rule. */ + U_BRK_INIT_ERROR, /**< Initialization failure. Probable missing ICU Data. */ + U_BRK_RULE_EMPTY_SET, /**< Rule contains an empty Unicode Set. */ + U_BRK_UNRECOGNIZED_OPTION, /**< !!option in RBBI rules not recognized. */ + U_BRK_MALFORMED_RULE_TAG, /**< The {nnn} tag on a rule is mal formed */ + U_BRK_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for Break Iterator failures */ + + /* + * The error codes in the range 0x10300-0x103ff are reserved for regular expression related errrs + */ + U_REGEX_INTERNAL_ERROR=0x10300, /**< An internal error (bug) was detected. */ + U_REGEX_ERROR_START=0x10300, /**< Start of codes indicating Regexp failures */ + U_REGEX_RULE_SYNTAX, /**< Syntax error in regexp pattern. */ + U_REGEX_INVALID_STATE, /**< RegexMatcher in invalid state for requested operation */ + U_REGEX_BAD_ESCAPE_SEQUENCE, /**< Unrecognized backslash escape sequence in pattern */ + U_REGEX_PROPERTY_SYNTAX, /**< Incorrect Unicode property */ + U_REGEX_UNIMPLEMENTED, /**< Use of regexp feature that is not yet implemented. */ + U_REGEX_MISMATCHED_PAREN, /**< Incorrectly nested parentheses in regexp pattern. */ + U_REGEX_NUMBER_TOO_BIG, /**< Decimal number is too large. */ + U_REGEX_BAD_INTERVAL, /**< Error in {min,max} interval */ + U_REGEX_MAX_LT_MIN, /**< In {min,max}, max is less than min. */ + U_REGEX_INVALID_BACK_REF, /**< Back-reference to a non-existent capture group. */ + U_REGEX_INVALID_FLAG, /**< Invalid value for match mode flags. */ + U_REGEX_LOOK_BEHIND_LIMIT, /**< Look-Behind pattern matches must have a bounded maximum length. */ + U_REGEX_SET_CONTAINS_STRING, /**< Regexps cannot have UnicodeSets containing strings.*/ + U_REGEX_OCTAL_TOO_BIG, /**< Octal character constants must be <= 0377. @deprecated ICU 54. This error cannot occur. */ + U_REGEX_MISSING_CLOSE_BRACKET, /**< Missing closing bracket on a bracket expression. */ + U_REGEX_INVALID_RANGE, /**< In a character range [x-y], x is greater than y. */ + U_REGEX_STACK_OVERFLOW, /**< Regular expression backtrack stack overflow. */ + U_REGEX_TIME_OUT, /**< Maximum allowed match time exceeded */ + U_REGEX_STOPPED_BY_CALLER, /**< Matching operation aborted by user callback fn. */ + U_REGEX_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for regexp errors */ + + /* + * The error code in the range 0x10400-0x104ff are reserved for IDNA related error codes + */ + U_IDNA_PROHIBITED_ERROR=0x10400, + U_IDNA_ERROR_START=0x10400, + U_IDNA_UNASSIGNED_ERROR, + U_IDNA_CHECK_BIDI_ERROR, + U_IDNA_STD3_ASCII_RULES_ERROR, + U_IDNA_ACE_PREFIX_ERROR, + U_IDNA_VERIFICATION_ERROR, + U_IDNA_LABEL_TOO_LONG_ERROR, + U_IDNA_ZERO_LENGTH_LABEL_ERROR, + U_IDNA_DOMAIN_NAME_TOO_LONG_ERROR, + U_IDNA_ERROR_LIMIT, + /* + * Aliases for StringPrep + */ + U_STRINGPREP_PROHIBITED_ERROR = U_IDNA_PROHIBITED_ERROR, + U_STRINGPREP_UNASSIGNED_ERROR = U_IDNA_UNASSIGNED_ERROR, + U_STRINGPREP_CHECK_BIDI_ERROR = U_IDNA_CHECK_BIDI_ERROR, + + /* + * The error code in the range 0x10500-0x105ff are reserved for Plugin related error codes + */ + U_PLUGIN_ERROR_START=0x10500, /**< Start of codes indicating plugin failures */ + U_PLUGIN_TOO_HIGH=0x10500, /**< The plugin's level is too high to be loaded right now. */ + U_PLUGIN_DIDNT_SET_LEVEL, /**< The plugin didn't call uplug_setPlugLevel in response to a QUERY */ + U_PLUGIN_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for plugin errors */ + + U_ERROR_LIMIT=U_PLUGIN_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */ +} UErrorCode; + +/* Use the following to determine if an UErrorCode represents */ +/* operational success or failure. */ + +#ifdef __cplusplus + /** + * Does the error code indicate success? + * @stable ICU 2.0 + */ + static + inline UBool U_SUCCESS(UErrorCode code) { return (UBool)(code<=U_ZERO_ERROR); } + /** + * Does the error code indicate a failure? + * @stable ICU 2.0 + */ + static + inline UBool U_FAILURE(UErrorCode code) { return (UBool)(code>U_ZERO_ERROR); } +#else + /** + * Does the error code indicate success? + * @stable ICU 2.0 + */ +# define U_SUCCESS(x) ((x)<=U_ZERO_ERROR) + /** + * Does the error code indicate a failure? + * @stable ICU 2.0 + */ +# define U_FAILURE(x) ((x)>U_ZERO_ERROR) +#endif + +/** + * Return a string for a UErrorCode value. + * The string will be the same as the name of the error code constant + * in the UErrorCode enum above. + * @stable ICU 2.0 + */ +U_STABLE const char * U_EXPORT2 +u_errorName(UErrorCode code); + + +#endif /* _UTYPES */ diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/uvernum.h b/src/core/basetypes/icu-ucsdet/include/unicode/uvernum.h new file mode 100644 index 00000000..5af40e33 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/uvernum.h @@ -0,0 +1,170 @@ +/* +******************************************************************************* +* Copyright (C) 2000-2014, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* +* file name: uvernum.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* Created by: Vladimir Weinstein +* Updated by: Steven R. Loomis +* +*/ + +/** + * \file + * \brief C API: definitions of ICU version numbers + * + * This file is included by uversion.h and other files. This file contains only + * macros and definitions. The actual version numbers are defined here. + */ + + /* + * IMPORTANT: When updating version, the following things need to be done: + * source/common/unicode/uvernum.h - this file: update major, minor, + * patchlevel, suffix, version, short version constants, namespace, + * renaming macro, and copyright + * + * The following files need to be updated as well, which can be done + * by running the UNIX makefile target 'update-windows-makefiles' in icu/source. + * + * + * source/common/common.vcproj - update 'Output file name' on the link tab so + * that it contains the new major/minor combination + * source/i18n/i18n.vcproj - same as for the common.vcproj + * source/layout/layout.vcproj - same as for the common.vcproj + * source/layoutex/layoutex.vcproj - same + * source/stubdata/stubdata.vcproj - same as for the common.vcproj + * source/io/io.vcproj - same as for the common.vcproj + * source/data/makedata.mak - change U_ICUDATA_NAME so that it contains + * the new major/minor combination and the Unicode version. + */ + +#ifndef UVERNUM_H +#define UVERNUM_H + +/** The standard copyright notice that gets compiled into each library. + * This value will change in the subsequent releases of ICU + * @stable ICU 2.4 + */ +#define U_COPYRIGHT_STRING \ + " Copyright (C) 2014, International Business Machines Corporation and others. All Rights Reserved. " + +/** The current ICU major version as an integer. + * This value will change in the subsequent releases of ICU + * @stable ICU 2.4 + */ +#define U_ICU_VERSION_MAJOR_NUM 54 + +/** The current ICU minor version as an integer. + * This value will change in the subsequent releases of ICU + * @stable ICU 2.6 + */ +#define U_ICU_VERSION_MINOR_NUM 1 + +/** The current ICU patchlevel version as an integer. + * This value will change in the subsequent releases of ICU + * @stable ICU 2.4 + */ +#define U_ICU_VERSION_PATCHLEVEL_NUM 0 + +/** The current ICU build level version as an integer. + * This value is for use by ICU clients. It defaults to 0. + * @stable ICU 4.0 + */ +#ifndef U_ICU_VERSION_BUILDLEVEL_NUM +#define U_ICU_VERSION_BUILDLEVEL_NUM 0 +#endif + +/** Glued version suffix for renamers + * This value will change in the subsequent releases of ICU + * @stable ICU 2.6 + */ +#define U_ICU_VERSION_SUFFIX _54_mc + +/** + * \def U_DEF2_ICU_ENTRY_POINT_RENAME + * @internal + */ +/** + * \def U_DEF_ICU_ENTRY_POINT_RENAME + * @internal + */ +/** Glued version suffix function for renamers + * This value will change in the subsequent releases of ICU. + * If a custom suffix (such as matching library suffixes) is desired, this can be modified. + * Note that if present, platform.h may contain an earlier definition of this macro. + * \def U_ICU_ENTRY_POINT_RENAME + * @stable ICU 4.2 + */ + +#ifndef U_ICU_ENTRY_POINT_RENAME +#ifdef U_HAVE_LIB_SUFFIX +#define U_DEF_ICU_ENTRY_POINT_RENAME(x,y,z) x ## y ## z +#define U_DEF2_ICU_ENTRY_POINT_RENAME(x,y,z) U_DEF_ICU_ENTRY_POINT_RENAME(x,y,z) +#define U_ICU_ENTRY_POINT_RENAME(x) U_DEF2_ICU_ENTRY_POINT_RENAME(x,U_ICU_VERSION_SUFFIX,U_LIB_SUFFIX_C_NAME) +#else +#define U_DEF_ICU_ENTRY_POINT_RENAME(x,y) x ## y +#define U_DEF2_ICU_ENTRY_POINT_RENAME(x,y) U_DEF_ICU_ENTRY_POINT_RENAME(x,y) +#define U_ICU_ENTRY_POINT_RENAME(x) U_DEF2_ICU_ENTRY_POINT_RENAME(x,U_ICU_VERSION_SUFFIX) +#endif +#endif + +/** The current ICU library version as a dotted-decimal string. The patchlevel + * only appears in this string if it non-zero. + * This value will change in the subsequent releases of ICU + * @stable ICU 2.4 + */ +#define U_ICU_VERSION "54.1" + +/** The current ICU library major/minor version as a string without dots, for library name suffixes. + * This value will change in the subsequent releases of ICU + * @stable ICU 2.6 + */ +#define U_ICU_VERSION_SHORT "54" + +#ifndef U_HIDE_INTERNAL_API +/** Data version in ICU4C. + * @internal ICU 4.4 Internal Use Only + **/ +#define U_ICU_DATA_VERSION "54.1" +#endif /* U_HIDE_INTERNAL_API */ + +/*=========================================================================== + * ICU collation framework version information + * Version info that can be obtained from a collator is affected by these + * numbers in a secret and magic way. Please use collator version as whole + *=========================================================================== + */ + +/** + * Collation runtime version (sort key generator, strcoll). + * If the version is different, sort keys for the same string could be different. + * This value may change in subsequent releases of ICU. + * @stable ICU 2.4 + */ +#define UCOL_RUNTIME_VERSION 8 + +/** + * Collation builder code version. + * When this is different, the same tailoring might result + * in assigning different collation elements to code points. + * This value may change in subsequent releases of ICU. + * @stable ICU 2.4 + */ +#define UCOL_BUILDER_VERSION 9 + +#ifndef U_HIDE_DEPRECATED_API +/** + * Constant 1. + * This was intended to be the version of collation tailorings, + * but instead the tailoring data carries a version number. + * @deprecated ICU 54 + */ +#define UCOL_TAILORINGS_VERSION 1 +#endif /* U_HIDE_DEPRECATED_API */ + +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/unicode/uversion.h b/src/core/basetypes/icu-ucsdet/include/unicode/uversion.h new file mode 100644 index 00000000..74e30910 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/unicode/uversion.h @@ -0,0 +1,193 @@ +/* +******************************************************************************* +* Copyright (C) 2000-2011, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* +* file name: uversion.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* Created by: Vladimir Weinstein +* +* Gets included by utypes.h and Windows .rc files +*/ + +/** + * \file + * \brief C API: API for accessing ICU version numbers. + */ +/*===========================================================================*/ +/* Main ICU version information */ +/*===========================================================================*/ + +#ifndef UVERSION_H +#define UVERSION_H + +#include "unicode/umachine.h" + +/* Actual version info lives in uvernum.h */ +#include "unicode/uvernum.h" + +/** Maximum length of the copyright string. + * @stable ICU 2.4 + */ +#define U_COPYRIGHT_STRING_LENGTH 128 + +/** An ICU version consists of up to 4 numbers from 0..255. + * @stable ICU 2.4 + */ +#define U_MAX_VERSION_LENGTH 4 + +/** In a string, ICU version fields are delimited by dots. + * @stable ICU 2.4 + */ +#define U_VERSION_DELIMITER '.' + +/** The maximum length of an ICU version string. + * @stable ICU 2.4 + */ +#define U_MAX_VERSION_STRING_LENGTH 20 + +/** The binary form of a version on ICU APIs is an array of 4 uint8_t. + * To compare two versions, use memcmp(v1,v2,sizeof(UVersionInfo)). + * @stable ICU 2.4 + */ +typedef uint8_t UVersionInfo[U_MAX_VERSION_LENGTH]; + +/*===========================================================================*/ +/* C++ namespace if supported. Versioned unless versioning is disabled. */ +/*===========================================================================*/ + +/** + * \def U_NAMESPACE_BEGIN + * This is used to begin a declaration of a public ICU C++ API. + * When not compiling for C++, it does nothing. + * When compiling for C++, it begins an extern "C++" linkage block (to protect + * against cases in which an external client includes ICU header files inside + * an extern "C" linkage block). + * + * It also begins a versioned-ICU-namespace block. + * @stable ICU 2.4 + */ + +/** + * \def U_NAMESPACE_END + * This is used to end a declaration of a public ICU C++ API. + * When not compiling for C++, it does nothing. + * When compiling for C++, it ends the extern "C++" block begun by + * U_NAMESPACE_BEGIN. + * + * It also ends the versioned-ICU-namespace block begun by U_NAMESPACE_BEGIN. + * @stable ICU 2.4 + */ + +/** + * \def U_NAMESPACE_USE + * This is used to specify that the rest of the code uses the + * public ICU C++ API namespace. + * This is invoked by default; we recommend that you turn it off: + * See the "Recommended Build Options" section of the ICU4C readme + * (http://source.icu-project.org/repos/icu/icu/trunk/readme.html#RecBuild) + * @stable ICU 2.4 + */ + +/** + * \def U_NAMESPACE_QUALIFIER + * This is used to qualify that a function or class is part of + * the public ICU C++ API namespace. + * + * This macro is unnecessary since ICU 49 requires namespace support. + * You can just use "icu::" instead. + * @stable ICU 2.4 + */ + +/* Define namespace symbols if the compiler supports it. */ +#ifdef __cplusplus +# if U_DISABLE_RENAMING +# define U_ICU_NAMESPACE icu + namespace U_ICU_NAMESPACE { } +# else +# define U_ICU_NAMESPACE U_ICU_ENTRY_POINT_RENAME(icu) + namespace U_ICU_NAMESPACE { } + namespace icu = U_ICU_NAMESPACE; +# endif + +# define U_NAMESPACE_BEGIN extern "C++" { namespace U_ICU_NAMESPACE { +# define U_NAMESPACE_END } } +# define U_NAMESPACE_USE using namespace U_ICU_NAMESPACE; +# define U_NAMESPACE_QUALIFIER U_ICU_NAMESPACE:: + +# ifndef U_USING_ICU_NAMESPACE +# define U_USING_ICU_NAMESPACE 1 +# endif +# if U_USING_ICU_NAMESPACE + U_NAMESPACE_USE +# endif +#else +# define U_NAMESPACE_BEGIN +# define U_NAMESPACE_END +# define U_NAMESPACE_USE +# define U_NAMESPACE_QUALIFIER +#endif + +/*===========================================================================*/ +/* General version helper functions. Definitions in putil.c */ +/*===========================================================================*/ + +/** + * Parse a string with dotted-decimal version information and + * fill in a UVersionInfo structure with the result. + * Definition of this function lives in putil.c + * + * @param versionArray The destination structure for the version information. + * @param versionString A string with dotted-decimal version information, + * with up to four non-negative number fields with + * values of up to 255 each. + * @stable ICU 2.4 + */ +U_STABLE void U_EXPORT2 +u_versionFromString(UVersionInfo versionArray, const char *versionString); + +/** + * Parse a Unicode string with dotted-decimal version information and + * fill in a UVersionInfo structure with the result. + * Definition of this function lives in putil.c + * + * @param versionArray The destination structure for the version information. + * @param versionString A Unicode string with dotted-decimal version + * information, with up to four non-negative number + * fields with values of up to 255 each. + * @stable ICU 4.2 + */ +U_STABLE void U_EXPORT2 +u_versionFromUString(UVersionInfo versionArray, const UChar *versionString); + + +/** + * Write a string with dotted-decimal version information according + * to the input UVersionInfo. + * Definition of this function lives in putil.c + * + * @param versionArray The version information to be written as a string. + * @param versionString A string buffer that will be filled in with + * a string corresponding to the numeric version + * information in versionArray. + * The buffer size must be at least U_MAX_VERSION_STRING_LENGTH. + * @stable ICU 2.4 + */ +U_STABLE void U_EXPORT2 +u_versionToString(const UVersionInfo versionArray, char *versionString); + +/** + * Gets the ICU release version. The version array stores the version information + * for ICU. For example, release "1.3.31.2" is then represented as 0x01031F02. + * Definition of this function lives in putil.c + * + * @param versionArray the version # information, the result will be filled in + * @stable ICU 2.0 + */ +U_STABLE void U_EXPORT2 +u_getVersion(UVersionInfo versionArray); +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/uposixdefs.h b/src/core/basetypes/icu-ucsdet/include/uposixdefs.h new file mode 100644 index 00000000..bd64d91a --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/uposixdefs.h @@ -0,0 +1,73 @@ +/* +******************************************************************************* +* Copyright (C) 2011-2012, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* file name: uposixdefs.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2011jul25 +* created by: Markus W. Scherer +* +* Common definitions for implementation files working with POSIX functions. +* *Important*: #include this file before any other header files! +*/ + +#ifndef __UPOSIXDEFS_H__ +#define __UPOSIXDEFS_H__ + +/* + * Define _XOPEN_SOURCE for access to POSIX functions. + * + * We cannot use U_PLATFORM from platform.h/utypes.h because + * "The Open Group Base Specifications" + * chapter "2.2 The Compilation Environment" says: + * "In the compilation of an application that #defines a feature test macro + * specified by IEEE Std 1003.1-2001, + * no header defined by IEEE Std 1003.1-2001 shall be included prior to + * the definition of the feature test macro." + */ +#ifdef _XOPEN_SOURCE + /* Use the predefined value. */ +#else + /* + * Version 6.0: + * The Open Group Base Specifications Issue 6 (IEEE Std 1003.1, 2004 Edition) + * also known as + * SUSv3 = Open Group Single UNIX Specification, Version 3 (UNIX03) + * + * Note: This definition used to be in C source code (e.g., putil.c) + * and define _XOPEN_SOURCE to different values depending on __STDC_VERSION__. + * In C++ source code (e.g., putil.cpp), __STDC_VERSION__ is not defined at all. + */ +# define _XOPEN_SOURCE 600 +#endif + +/* + * Make sure things like readlink and such functions work. + * Poorly upgraded Solaris machines can't have this defined. + * Cleanly installed Solaris can use this #define. + * + * z/OS needs this definition for timeval and to get usleep. + */ +#if !defined(_XOPEN_SOURCE_EXTENDED) +# define _XOPEN_SOURCE_EXTENDED 1 +#endif + +/* + * There is an issue with turning on _XOPEN_SOURCE_EXTENDED on certain platforms. + * A compatibility issue exists between turning on _XOPEN_SOURCE_EXTENDED and using + * standard C++ string class. As a result, standard C++ string class needs to be + * turned off for the follwing platforms: + * -AIX/VACPP + * -Solaris/GCC + */ +#if (U_PLATFORM == U_PF_AIX && !defined(__GNUC__)) || (U_PLATFORM == U_PF_SOLARIS && defined(__GNUC__)) +# if _XOPEN_SOURCE_EXTENDED && !defined(U_HAVE_STD_STRING) +# define U_HAVE_STD_STRING 0 +# endif +#endif + +#endif /* __UPOSIXDEFS_H__ */ diff --git a/src/core/basetypes/icu-ucsdet/include/uset_imp.h b/src/core/basetypes/icu-ucsdet/include/uset_imp.h new file mode 100644 index 00000000..07a7381e --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/uset_imp.h @@ -0,0 +1,60 @@ +/* +******************************************************************************* +* +* Copyright (C) 2004-2007, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: uset_imp.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2004sep07 +* created by: Markus W. Scherer +* +* Internal USet definitions. +*/ + +#ifndef __USET_IMP_H__ +#define __USET_IMP_H__ + +#include "unicode/utypes.h" +#include "unicode/uset.h" + +U_CDECL_BEGIN + +typedef void U_CALLCONV +USetAdd(USet *set, UChar32 c); + +typedef void U_CALLCONV +USetAddRange(USet *set, UChar32 start, UChar32 end); + +typedef void U_CALLCONV +USetAddString(USet *set, const UChar *str, int32_t length); + +typedef void U_CALLCONV +USetRemove(USet *set, UChar32 c); + +typedef void U_CALLCONV +USetRemoveRange(USet *set, UChar32 start, UChar32 end); + +/** + * Interface for adding items to a USet, to keep low-level code from + * statically depending on the USet implementation. + * Calls will look like sa->add(sa->set, c); + */ +struct USetAdder { + USet *set; + USetAdd *add; + USetAddRange *addRange; + USetAddString *addString; + USetRemove *remove; + USetRemoveRange *removeRange; +}; +typedef struct USetAdder USetAdder; + +U_CDECL_END + +#endif + diff --git a/src/core/basetypes/icu-ucsdet/include/ustr_imp.h b/src/core/basetypes/icu-ucsdet/include/ustr_imp.h new file mode 100644 index 00000000..ee54d332 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/ustr_imp.h @@ -0,0 +1,247 @@ +/* +********************************************************************** +* Copyright (C) 1999-2011, International Business Machines +* Corporation and others. All Rights Reserved. +********************************************************************** +* file name: ustr_imp.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2001jan30 +* created by: Markus W. Scherer +*/ + +#ifndef __USTR_IMP_H__ +#define __USTR_IMP_H__ + +#include "unicode/utypes.h" +#include "unicode/uiter.h" +#include "ucase.h" + +/** Simple declaration to avoid including unicode/ubrk.h. */ +#ifndef UBRK_TYPEDEF_UBREAK_ITERATOR +# define UBRK_TYPEDEF_UBREAK_ITERATOR + typedef struct UBreakIterator UBreakIterator; +#endif + +#ifndef U_COMPARE_IGNORE_CASE +/* see also unorm.h */ +/** + * Option bit for unorm_compare: + * Perform case-insensitive comparison. + */ +#define U_COMPARE_IGNORE_CASE 0x10000 +#endif + +/** + * Internal option for unorm_cmpEquivFold() for strncmp style. + * If set, checks for both string length and terminating NUL. + */ +#define _STRNCMP_STYLE 0x1000 + +/** + * Compare two strings in code point order or code unit order. + * Works in strcmp style (both lengths -1), + * strncmp style (lengths equal and >=0, flag TRUE), + * and memcmp/UnicodeString style (at least one length >=0). + */ +U_CFUNC int32_t U_EXPORT2 +uprv_strCompare(const UChar *s1, int32_t length1, + const UChar *s2, int32_t length2, + UBool strncmpStyle, UBool codePointOrder); + +/** + * Internal API, used by u_strcasecmp() etc. + * Compare strings case-insensitively, + * in code point order or code unit order. + */ +U_CFUNC int32_t +u_strcmpFold(const UChar *s1, int32_t length1, + const UChar *s2, int32_t length2, + uint32_t options, + UErrorCode *pErrorCode); + +/** + * Are the Unicode properties loaded? + * This must be used before internal functions are called that do + * not perform this check. + * Generate a debug assertion failure if data is not loaded. + */ +U_CFUNC UBool +uprv_haveProperties(UErrorCode *pErrorCode); + +/** + * Load the Unicode property data. + * Intended primarily for use from u_init(). + * Has no effect if property data is already loaded. + * NOT thread safe. + */ +/*U_CFUNC int8_t +uprv_loadPropsData(UErrorCode *errorCode);*/ + +/* + * Internal string casing functions implementing + * ustring.h/ustrcase.c and UnicodeString case mapping functions. + */ + +struct UCaseMap { + const UCaseProps *csp; +#if !UCONFIG_NO_BREAK_ITERATION + UBreakIterator *iter; /* We adopt the iterator, so we own it. */ +#endif + char locale[32]; + int32_t locCache; + uint32_t options; +}; + +#ifndef __UCASEMAP_H__ +typedef struct UCaseMap UCaseMap; +#endif + +#if UCONFIG_NO_BREAK_ITERATION +# define UCASEMAP_INITIALIZER { NULL, { 0 }, 0, 0 } +#else +# define UCASEMAP_INITIALIZER { NULL, NULL, { 0 }, 0, 0 } +#endif + +U_CFUNC void +ustrcase_setTempCaseMapLocale(UCaseMap *csm, const char *locale); + +#ifndef U_STRING_CASE_MAPPER_DEFINED +#define U_STRING_CASE_MAPPER_DEFINED + +/** + * String case mapping function type, used by ustrcase_map(). + * All error checking must be done. + * The UCaseMap must be fully initialized, with locale and/or iter set as needed. + * src and dest must not overlap. + */ +typedef int32_t U_CALLCONV +UStringCaseMapper(const UCaseMap *csm, + UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + UErrorCode *pErrorCode); + +#endif + +/** Implements UStringCaseMapper. */ +U_CFUNC int32_t U_CALLCONV +ustrcase_internalToLower(const UCaseMap *csm, + UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + UErrorCode *pErrorCode); + +/** Implements UStringCaseMapper. */ +U_CFUNC int32_t U_CALLCONV +ustrcase_internalToUpper(const UCaseMap *csm, + UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + UErrorCode *pErrorCode); + +#if !UCONFIG_NO_BREAK_ITERATION + +/** Implements UStringCaseMapper. */ +U_CFUNC int32_t U_CALLCONV +ustrcase_internalToTitle(const UCaseMap *csm, + UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + UErrorCode *pErrorCode); + +#endif + +/** Implements UStringCaseMapper. */ +U_CFUNC int32_t U_CALLCONV +ustrcase_internalFold(const UCaseMap *csm, + UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + UErrorCode *pErrorCode); + +/** + * Implements argument checking and buffer handling + * for string case mapping as a common function. + */ +U_CFUNC int32_t +ustrcase_map(const UCaseMap *csm, + UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + UStringCaseMapper *stringCaseMapper, + UErrorCode *pErrorCode); + +/** + * UTF-8 string case mapping function type, used by ucasemap_mapUTF8(). + * UTF-8 version of UStringCaseMapper. + * All error checking must be done. + * The UCaseMap must be fully initialized, with locale and/or iter set as needed. + * src and dest must not overlap. + */ +typedef int32_t U_CALLCONV +UTF8CaseMapper(const UCaseMap *csm, + uint8_t *dest, int32_t destCapacity, + const uint8_t *src, int32_t srcLength, + UErrorCode *pErrorCode); + +/** Implements UTF8CaseMapper. */ +U_CFUNC int32_t U_CALLCONV +ucasemap_internalUTF8ToTitle(const UCaseMap *csm, + uint8_t *dest, int32_t destCapacity, + const uint8_t *src, int32_t srcLength, + UErrorCode *pErrorCode); + +/** + * Implements argument checking and buffer handling + * for UTF-8 string case mapping as a common function. + */ +U_CFUNC int32_t +ucasemap_mapUTF8(const UCaseMap *csm, + uint8_t *dest, int32_t destCapacity, + const uint8_t *src, int32_t srcLength, + UTF8CaseMapper *stringCaseMapper, + UErrorCode *pErrorCode); + +U_CAPI int32_t U_EXPORT2 +ustr_hashUCharsN(const UChar *str, int32_t length); + +U_CAPI int32_t U_EXPORT2 +ustr_hashCharsN(const char *str, int32_t length); + +U_CAPI int32_t U_EXPORT2 +ustr_hashICharsN(const char *str, int32_t length); + +/** + * NUL-terminate a UChar * string if possible. + * If length < destCapacity then NUL-terminate. + * If length == destCapacity then do not terminate but set U_STRING_NOT_TERMINATED_WARNING. + * If length > destCapacity then do not terminate but set U_BUFFER_OVERFLOW_ERROR. + * + * @param dest Destination buffer, can be NULL if destCapacity==0. + * @param destCapacity Number of UChars available at dest. + * @param length Number of UChars that were (to be) written to dest. + * @param pErrorCode ICU error code. + * @return length + */ +U_CAPI int32_t U_EXPORT2 +u_terminateUChars(UChar *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode); + +/** + * NUL-terminate a char * string if possible. + * Same as u_terminateUChars() but for a different string type. + */ +U_CAPI int32_t U_EXPORT2 +u_terminateChars(char *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode); + +/** + * NUL-terminate a UChar32 * string if possible. + * Same as u_terminateUChars() but for a different string type. + */ +U_CAPI int32_t U_EXPORT2 +u_terminateUChar32s(UChar32 *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode); + +/** + * NUL-terminate a wchar_t * string if possible. + * Same as u_terminateUChars() but for a different string type. + */ +U_CAPI int32_t U_EXPORT2 +u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode); + +#endif diff --git a/src/core/basetypes/icu-ucsdet/include/utracimp.h b/src/core/basetypes/icu-ucsdet/include/utracimp.h new file mode 100644 index 00000000..317fbe35 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/include/utracimp.h @@ -0,0 +1,384 @@ +/* +******************************************************************************* +* +* Copyright (C) 2003-2009, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: utracimp.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2003aug06 +* created by: Markus W. Scherer +* +* Internal header for ICU tracing/logging. +* +* +* Various notes: +* - using a trace level variable to only call trace functions +* when the level is sufficient +* - using the same variable for tracing on/off to never make a function +* call when off +* - the function number is put into a local variable by the entry macro +* and used implicitly to avoid copy&paste/typing mistakes by the developer +* - the application must call utrace_setFunctions() and pass in +* implementations for the trace functions +* - ICU trace macros call ICU functions that route through the function +* pointers if they have been set; +* this avoids an indirection at the call site +* (which would cost more code for another check and for the indirection) +* +* ### TODO Issues: +* - Verify that va_list is portable among compilers for the same platform. +* va_list should be portable because printf() would fail otherwise! +* - Should enum values like UTraceLevel be passed into int32_t-type arguments, +* or should enum types be used? +*/ + +#ifndef __UTRACIMP_H__ +#define __UTRACIMP_H__ + +#include "unicode/utrace.h" +#include <stdarg.h> + +U_CDECL_BEGIN + +/** + * \var utrace_level + * Trace level variable. Negative for "off". + * Use only via UTRACE_ macros. + * @internal + */ +#ifdef UTRACE_IMPL +U_EXPORT int32_t +#else +U_CFUNC U_COMMON_API int32_t +#endif +utrace_level; + + +/** + * Traced Function Exit return types. + * Flags indicating the number and types of varargs included in a call + * to a UTraceExit function. + * Bits 0-3: The function return type. First variable param. + * Bit 4: Flag for presence of U_ErrorCode status param. + * @internal + */ +typedef enum UTraceExitVal { + /** The traced function returns no value @internal */ + UTRACE_EXITV_NONE = 0, + /** The traced function returns an int32_t, or compatible, type. @internal */ + UTRACE_EXITV_I32 = 1, + /** The traced function returns a pointer @internal */ + UTRACE_EXITV_PTR = 2, + /** The traced function returns a UBool @internal */ + UTRACE_EXITV_BOOL = 3, + /** Mask to extract the return type values from a UTraceExitVal @internal */ + UTRACE_EXITV_MASK = 0xf, + /** Bit indicating that the traced function includes a UErrorCode parameter @internal */ + UTRACE_EXITV_STATUS = 0x10 +} UTraceExitVal; + +/** + * Trace function for the entry point of a function. + * Do not use directly, use UTRACE_ENTRY instead. + * @param fnNumber The UTraceFunctionNumber for the current function. + * @internal + */ +U_CAPI void U_EXPORT2 +utrace_entry(int32_t fnNumber); + +/** + * Trace function for each exit point of a function. + * Do not use directly, use UTRACE_EXIT* instead. + * @param fnNumber The UTraceFunctionNumber for the current function. + * @param returnType The type of the value returned by the function. + * @param errorCode The UErrorCode value at function exit. See UTRACE_EXIT. + * @internal + */ +U_CAPI void U_EXPORT2 +utrace_exit(int32_t fnNumber, int32_t returnType, ...); + + +/** + * Trace function used inside functions that have a UTRACE_ENTRY() statement. + * Do not use directly, use UTRACE_DATAX() macros instead. + * + * @param utraceFnNumber The number of the current function, from the local + * variable of the same name. + * @param level The trace level for this message. + * @param fmt The trace format string. + * + * @internal + */ +U_CAPI void U_EXPORT2 +utrace_data(int32_t utraceFnNumber, int32_t level, const char *fmt, ...); + +U_CDECL_END + +#if U_ENABLE_TRACING + +/** + * Boolean expression to see if ICU tracing is turned on + * to at least the specified level. + * @internal + */ +#define UTRACE_LEVEL(level) (utrace_getLevel()>=(level)) + +/** + * Flag bit in utraceFnNumber, the local variable added to each function + * with tracing code to contains the function number. + * + * Set the flag if the function's entry is traced, which will cause the + * function's exit to also be traced. utraceFnNumber is uncoditionally + * set at entry, whether or not the entry is traced, so that it will + * always be available for error trace output. + * @internal + */ +#define UTRACE_TRACED_ENTRY 0x80000000 + +/** + * Trace statement for the entry point of a function. + * Stores the function number in a local variable. + * In C code, must be placed immediately after the last variable declaration. + * Must be matched with UTRACE_EXIT() at all function exit points. + * + * Tracing should start with UTRACE_ENTRY after checking for + * U_FAILURE at function entry, so that if a function returns immediately + * because of a pre-existing error condition, it does not show up in the trace, + * consistent with ICU's error handling model. + * + * @param fnNumber The UTraceFunctionNumber for the current function. + * @internal + */ +#define UTRACE_ENTRY(fnNumber) \ + int32_t utraceFnNumber=(fnNumber); \ + if(utrace_getLevel()>=UTRACE_INFO) { \ + utrace_entry(fnNumber); \ + utraceFnNumber |= UTRACE_TRACED_ENTRY; \ + } + + +/** + * Trace statement for the entry point of open and close functions. + * Produces trace output at a less verbose setting than plain UTRACE_ENTRY + * Stores the function number in a local variable. + * In C code, must be placed immediately after the last variable declaration. + * Must be matched with UTRACE_EXIT() at all function exit points. + * + * @param fnNumber The UTraceFunctionNumber for the current function. + * @internal + */ +#define UTRACE_ENTRY_OC(fnNumber) \ + int32_t utraceFnNumber=(fnNumber); \ + if(utrace_getLevel()>=UTRACE_OPEN_CLOSE) { \ + utrace_entry(fnNumber); \ + utraceFnNumber |= UTRACE_TRACED_ENTRY; \ + } + +/** + * Trace statement for each exit point of a function that has a UTRACE_ENTRY() + * statement. + * + * @param errorCode The function's ICU UErrorCode value at function exit, + * or U_ZERO_ERROR if the function does not use a UErrorCode. + * 0==U_ZERO_ERROR indicates success, + * positive values an error (see u_errorName()), + * negative values an informational status. + * + * @internal + */ +#define UTRACE_EXIT() \ + {if(utraceFnNumber & UTRACE_TRACED_ENTRY) { \ + utrace_exit(utraceFnNumber & ~UTRACE_TRACED_ENTRY, UTRACE_EXITV_NONE); \ + }} + +/** + * Trace statement for each exit point of a function that has a UTRACE_ENTRY() + * statement, and that returns a value. + * + * @param val The function's return value, int32_t or comatible type. + * + * @internal + */ +#define UTRACE_EXIT_VALUE(val) \ + {if(utraceFnNumber & UTRACE_TRACED_ENTRY) { \ + utrace_exit(utraceFnNumber & ~UTRACE_TRACED_ENTRY, UTRACE_EXITV_I32, val); \ + }} + +#define UTRACE_EXIT_STATUS(status) \ + {if(utraceFnNumber & UTRACE_TRACED_ENTRY) { \ + utrace_exit(utraceFnNumber & ~UTRACE_TRACED_ENTRY, UTRACE_EXITV_STATUS, status); \ + }} + +#define UTRACE_EXIT_VALUE_STATUS(val, status) \ + {if(utraceFnNumber & UTRACE_TRACED_ENTRY) { \ + utrace_exit(utraceFnNumber & ~UTRACE_TRACED_ENTRY, (UTRACE_EXITV_I32 | UTRACE_EXITV_STATUS), val, status); \ + }} + +#define UTRACE_EXIT_PTR_STATUS(ptr, status) \ + {if(utraceFnNumber & UTRACE_TRACED_ENTRY) { \ + utrace_exit(utraceFnNumber & ~UTRACE_TRACED_ENTRY, (UTRACE_EXITV_PTR | UTRACE_EXITV_STATUS), ptr, status); \ + }} + +/** + * Trace statement used inside functions that have a UTRACE_ENTRY() statement. + * Takes no data arguments. + * The number of arguments for this macro must match the number of inserts + * in the format string. Vector inserts count as two arguments. + * Calls utrace_data() if the level is high enough. + * @internal + */ +#define UTRACE_DATA0(level, fmt) \ + if(UTRACE_LEVEL(level)) { \ + utrace_data(utraceFnNumber & ~UTRACE_TRACED_ENTRY, (level), (fmt)); \ + } + +/** + * Trace statement used inside functions that have a UTRACE_ENTRY() statement. + * Takes one data argument. + * The number of arguments for this macro must match the number of inserts + * in the format string. Vector inserts count as two arguments. + * Calls utrace_data() if the level is high enough. + * @internal + */ +#define UTRACE_DATA1(level, fmt, a) \ + if(UTRACE_LEVEL(level)) { \ + utrace_data(utraceFnNumber & ~UTRACE_TRACED_ENTRY , (level), (fmt), (a)); \ + } + +/** + * Trace statement used inside functions that have a UTRACE_ENTRY() statement. + * Takes two data arguments. + * The number of arguments for this macro must match the number of inserts + * in the format string. Vector inserts count as two arguments. + * Calls utrace_data() if the level is high enough. + * @internal + */ +#define UTRACE_DATA2(level, fmt, a, b) \ + if(UTRACE_LEVEL(level)) { \ + utrace_data(utraceFnNumber & ~UTRACE_TRACED_ENTRY , (level), (fmt), (a), (b)); \ + } + +/** + * Trace statement used inside functions that have a UTRACE_ENTRY() statement. + * Takes three data arguments. + * The number of arguments for this macro must match the number of inserts + * in the format string. Vector inserts count as two arguments. + * Calls utrace_data() if the level is high enough. + * @internal + */ +#define UTRACE_DATA3(level, fmt, a, b, c) \ + if(UTRACE_LEVEL(level)) { \ + utrace_data(utraceFnNumber & ~UTRACE_TRACED_ENTRY, (level), (fmt), (a), (b), (c)); \ + } + +/** + * Trace statement used inside functions that have a UTRACE_ENTRY() statement. + * Takes four data arguments. + * The number of arguments for this macro must match the number of inserts + * in the format string. Vector inserts count as two arguments. + * Calls utrace_data() if the level is high enough. + * @internal + */ +#define UTRACE_DATA4(level, fmt, a, b, c, d) \ + if(UTRACE_LEVEL(level)) { \ + utrace_data(utraceFnNumber & ~UTRACE_TRACED_ENTRY, (level), (fmt), (a), (b), (c), (d)); \ + } + +/** + * Trace statement used inside functions that have a UTRACE_ENTRY() statement. + * Takes five data arguments. + * The number of arguments for this macro must match the number of inserts + * in the format string. Vector inserts count as two arguments. + * Calls utrace_data() if the level is high enough. + * @internal + */ +#define UTRACE_DATA5(level, fmt, a, b, c, d, e) \ + if(UTRACE_LEVEL(level)) { \ + utrace_data(utraceFnNumber & ~UTRACE_TRACED_ENTRY, (level), (fmt), (a), (b), (c), (d), (e)); \ + } + +/** + * Trace statement used inside functions that have a UTRACE_ENTRY() statement. + * Takes six data arguments. + * The number of arguments for this macro must match the number of inserts + * in the format string. Vector inserts count as two arguments. + * Calls utrace_data() if the level is high enough. + * @internal + */ +#define UTRACE_DATA6(level, fmt, a, b, c, d, e, f) \ + if(UTRACE_LEVEL(level)) { \ + utrace_data(utraceFnNumber & ~UTRACE_TRACED_ENTRY, (level), (fmt), (a), (b), (c), (d), (e), (f)); \ + } + +/** + * Trace statement used inside functions that have a UTRACE_ENTRY() statement. + * Takes seven data arguments. + * The number of arguments for this macro must match the number of inserts + * in the format string. Vector inserts count as two arguments. + * Calls utrace_data() if the level is high enough. + * @internal + */ +#define UTRACE_DATA7(level, fmt, a, b, c, d, e, f, g) \ + if(UTRACE_LEVEL(level)) { \ + utrace_data(utraceFnNumber & ~UTRACE_TRACED_ENTRY, (level), (fmt), (a), (b), (c), (d), (e), (f), (g)); \ + } + +/** + * Trace statement used inside functions that have a UTRACE_ENTRY() statement. + * Takes eight data arguments. + * The number of arguments for this macro must match the number of inserts + * in the format string. Vector inserts count as two arguments. + * Calls utrace_data() if the level is high enough. + * @internal + */ +#define UTRACE_DATA8(level, fmt, a, b, c, d, e, f, g, h) \ + if(UTRACE_LEVEL(level)) { \ + utrace_data(utraceFnNumber & ~UTRACE_TRACED_ENTRY, (level), (fmt), (a), (b), (c), (d), (e), (f), (g), (h)); \ + } + +/** + * Trace statement used inside functions that have a UTRACE_ENTRY() statement. + * Takes nine data arguments. + * The number of arguments for this macro must match the number of inserts + * in the format string. Vector inserts count as two arguments. + * Calls utrace_data() if the level is high enough. + * @internal + */ +#define UTRACE_DATA9(level, fmt, a, b, c, d, e, f, g, h, i) \ + if(UTRACE_LEVEL(level)) { \ + utrace_data(utraceFnNumber & ~UTRACE_TRACED_ENTRY, (level), (fmt), (a), (b), (c), (d), (e), (f), (g), (h), (i)); \ + } + +#else + +/* + * When tracing is disabled, the following macros become empty + */ + +#define UTRACE_LEVEL(level) 0 +#define UTRACE_ENTRY(fnNumber) +#define UTRACE_ENTRY_OC(fnNumber) +#define UTRACE_EXIT() +#define UTRACE_EXIT_VALUE(val) +#define UTRACE_EXIT_STATUS(status) +#define UTRACE_EXIT_VALUE_STATUS(val, status) +#define UTRACE_EXIT_PTR_STATUS(ptr, status) +#define UTRACE_DATA0(level, fmt) +#define UTRACE_DATA1(level, fmt, a) +#define UTRACE_DATA2(level, fmt, a, b) +#define UTRACE_DATA3(level, fmt, a, b, c) +#define UTRACE_DATA4(level, fmt, a, b, c, d) +#define UTRACE_DATA5(level, fmt, a, b, c, d, e) +#define UTRACE_DATA6(level, fmt, a, b, c, d, e, f) +#define UTRACE_DATA7(level, fmt, a, b, c, d, e, f, g) +#define UTRACE_DATA8(level, fmt, a, b, c, d, e, f, g, h) +#define UTRACE_DATA9(level, fmt, a, b, c, d, e, f, g, h, i) + +#endif + +#endif diff --git a/src/core/basetypes/icu-ucsdet/inputext.cpp b/src/core/basetypes/icu-ucsdet/inputext.cpp new file mode 100644 index 00000000..e5a8ee18 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/inputext.cpp @@ -0,0 +1,164 @@ +/* + ********************************************************************** + * Copyright (C) 2005-2009, International Business Machines + * Corporation and others. All Rights Reserved. + ********************************************************************** + */ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_CONVERSION + +#include "inputext.h" + +#include "cmemory.h" +#include "cstring.h" + +#include <string.h> + +U_NAMESPACE_BEGIN + +#define BUFFER_SIZE 8192 + +#define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) + +#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type)) +#define DELETE_ARRAY(array) uprv_free((void *) (array)) + +InputText::InputText(UErrorCode &status) + : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked. Markup will have been + // removed if appropriate. + fByteStats(NEW_ARRAY(int16_t, 256)), // byte frequency statistics for the input text. + // Value is percent, not absolute. + fDeclaredEncoding(0), + fRawInput(0), + fRawLength(0) +{ + if (fInputBytes == NULL || fByteStats == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + } +} + +InputText::~InputText() +{ + DELETE_ARRAY(fDeclaredEncoding); + DELETE_ARRAY(fByteStats); + DELETE_ARRAY(fInputBytes); +} + +void InputText::setText(const char *in, int32_t len) +{ + fInputLen = 0; + fC1Bytes = FALSE; + fRawInput = (const uint8_t *) in; + fRawLength = len == -1? (int32_t)uprv_strlen(in) : len; +} + +void InputText::setDeclaredEncoding(const char* encoding, int32_t len) +{ + if(encoding) { + if (len == -1) { + len = (int32_t)uprv_strlen(encoding); + } + + len += 1; // to make place for the \0 at the end. + uprv_free(fDeclaredEncoding); + fDeclaredEncoding = NEW_ARRAY(char, len); + uprv_strncpy(fDeclaredEncoding, encoding, len); + } +} + +UBool InputText::isSet() const +{ + return fRawInput != NULL; +} + +/** +* MungeInput - after getting a set of raw input data to be analyzed, preprocess +* it by removing what appears to be html markup. +* +* @internal +*/ +void InputText::MungeInput(UBool fStripTags) { + int srci = 0; + int dsti = 0; + uint8_t b; + bool inMarkup = FALSE; + int32_t openTags = 0; + int32_t badTags = 0; + + // + // html / xml markup stripping. + // quick and dirty, not 100% accurate, but hopefully good enough, statistically. + // discard everything within < brackets > + // Count how many total '<' and illegal (nested) '<' occur, so we can make some + // guess as to whether the input was actually marked up at all. + // TODO: Think about how this interacts with EBCDIC charsets that are detected. + if (fStripTags) { + for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) { + b = fRawInput[srci]; + + if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */ + if (inMarkup) { + badTags += 1; + } + + inMarkup = TRUE; + openTags += 1; + } + + if (! inMarkup) { + fInputBytes[dsti++] = b; + } + + if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */ + inMarkup = FALSE; + } + } + + fInputLen = dsti; + } + + // + // If it looks like this input wasn't marked up, or if it looks like it's + // essentially nothing but markup abandon the markup stripping. + // Detection will have to work on the unstripped input. + // + if (openTags<5 || openTags/5 < badTags || + (fInputLen < 100 && fRawLength>600)) + { + int32_t limit = fRawLength; + + if (limit > BUFFER_SIZE) { + limit = BUFFER_SIZE; + } + + for (srci=0; srci<limit; srci++) { + fInputBytes[srci] = fRawInput[srci]; + } + + fInputLen = srci; + } + + // + // Tally up the byte occurence statistics. + // These are available for use by the various detectors. + // + + uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256); + + for (srci = 0; srci < fInputLen; srci += 1) { + fByteStats[fInputBytes[srci]] += 1; + } + + for (int32_t i = 0x80; i <= 0x9F; i += 1) { + if (fByteStats[i] != 0) { + fC1Bytes = TRUE; + break; + } + } +} + +U_NAMESPACE_END +#endif + diff --git a/src/core/basetypes/icu-ucsdet/uarrsort.c b/src/core/basetypes/icu-ucsdet/uarrsort.c new file mode 100644 index 00000000..22c76972 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/uarrsort.c @@ -0,0 +1,283 @@ +/* +******************************************************************************* +* +* Copyright (C) 2003-2013, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: uarrsort.c +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2003aug04 +* created by: Markus W. Scherer +* +* Internal function for sorting arrays. +*/ + +#include "unicode/utypes.h" +#include "cmemory.h" +#include "uarrsort.h" + +enum { + /** + * "from Knuth" + * + * A binary search over 8 items performs 4 comparisons: + * log2(8)=3 to subdivide, +1 to check for equality. + * A linear search over 8 items on average also performs 4 comparisons. + */ + MIN_QSORT=9, + STACK_ITEM_SIZE=200 +}; + +/* UComparator convenience implementations ---------------------------------- */ + +U_CAPI int32_t U_EXPORT2 +uprv_uint16Comparator(const void *context, const void *left, const void *right) { + return (int32_t)*(const uint16_t *)left - (int32_t)*(const uint16_t *)right; +} + +U_CAPI int32_t U_EXPORT2 +uprv_int32Comparator(const void *context, const void *left, const void *right) { + return *(const int32_t *)left - *(const int32_t *)right; +} + +U_CAPI int32_t U_EXPORT2 +uprv_uint32Comparator(const void *context, const void *left, const void *right) { + uint32_t l=*(const uint32_t *)left, r=*(const uint32_t *)right; + + /* compare directly because (l-r) would overflow the int32_t result */ + if(l<r) { + return -1; + } else if(l==r) { + return 0; + } else /* l>r */ { + return 1; + } +} + +/* Insertion sort using binary search --------------------------------------- */ + +U_CAPI int32_t U_EXPORT2 +uprv_stableBinarySearch(char *array, int32_t limit, void *item, int32_t itemSize, + UComparator *cmp, const void *context) { + int32_t start=0; + UBool found=FALSE; + + /* Binary search until we get down to a tiny sub-array. */ + while((limit-start)>=MIN_QSORT) { + int32_t i=(start+limit)/2; + int32_t diff=cmp(context, item, array+i*itemSize); + if(diff==0) { + /* + * Found the item. We look for the *last* occurrence of such + * an item, for stable sorting. + * If we knew that there will be only few equal items, + * we could break now and enter the linear search. + * However, if there are many equal items, then it should be + * faster to continue with the binary search. + * It seems likely that we either have all unique items + * (where found will never become TRUE in the insertion sort) + * or potentially many duplicates. + */ + found=TRUE; + start=i+1; + } else if(diff<0) { + limit=i; + } else { + start=i; + } + } + + /* Linear search over the remaining tiny sub-array. */ + while(start<limit) { + int32_t diff=cmp(context, item, array+start*itemSize); + if(diff==0) { + found=TRUE; + } else if(diff<0) { + break; + } + ++start; + } + return found ? (start-1) : ~start; +} + +static void +doInsertionSort(char *array, int32_t length, int32_t itemSize, + UComparator *cmp, const void *context, void *pv) { + int32_t j; + + for(j=1; j<length; ++j) { + char *item=array+j*itemSize; + int32_t insertionPoint=uprv_stableBinarySearch(array, j, item, itemSize, cmp, context); + if(insertionPoint<0) { + insertionPoint=~insertionPoint; + } else { + ++insertionPoint; /* one past the last equal item */ + } + if(insertionPoint<j) { + char *dest=array+insertionPoint*itemSize; + uprv_memcpy(pv, item, itemSize); /* v=array[j] */ + uprv_memmove(dest+itemSize, dest, (j-insertionPoint)*itemSize); + uprv_memcpy(dest, pv, itemSize); /* array[insertionPoint]=v */ + } + } +} + +static void +insertionSort(char *array, int32_t length, int32_t itemSize, + UComparator *cmp, const void *context, UErrorCode *pErrorCode) { + UAlignedMemory v[STACK_ITEM_SIZE/sizeof(UAlignedMemory)+1]; + void *pv; + + /* allocate an intermediate item variable (v) */ + if(itemSize<=STACK_ITEM_SIZE) { + pv=v; + } else { + pv=uprv_malloc(itemSize); + if(pv==NULL) { + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; + return; + } + } + + doInsertionSort(array, length, itemSize, cmp, context, pv); + + if(pv!=v) { + uprv_free(pv); + } +} + +/* QuickSort ---------------------------------------------------------------- */ + +/* + * This implementation is semi-recursive: + * It recurses for the smaller sub-array to shorten the recursion depth, + * and loops for the larger sub-array. + * + * Loosely after QuickSort algorithms in + * Niklaus Wirth + * Algorithmen und Datenstrukturen mit Modula-2 + * B.G. Teubner Stuttgart + * 4. Auflage 1986 + * ISBN 3-519-02260-5 + */ +static void +subQuickSort(char *array, int32_t start, int32_t limit, int32_t itemSize, + UComparator *cmp, const void *context, + void *px, void *pw) { + int32_t left, right; + + /* start and left are inclusive, limit and right are exclusive */ + do { + if((start+MIN_QSORT)>=limit) { + doInsertionSort(array+start*itemSize, limit-start, itemSize, cmp, context, px); + break; + } + + left=start; + right=limit; + + /* x=array[middle] */ + uprv_memcpy(px, array+((start+limit)/2)*itemSize, itemSize); + + do { + while(/* array[left]<x */ + cmp(context, array+left*itemSize, px)<0 + ) { + ++left; + } + while(/* x<array[right-1] */ + cmp(context, px, array+(right-1)*itemSize)<0 + ) { + --right; + } + + /* swap array[left] and array[right-1] via w; ++left; --right */ + if(left<right) { + --right; + + if(left<right) { + uprv_memcpy(pw, array+left*itemSize, itemSize); + uprv_memcpy(array+left*itemSize, array+right*itemSize, itemSize); + uprv_memcpy(array+right*itemSize, pw, itemSize); + } + + ++left; + } + } while(left<right); + + /* sort sub-arrays */ + if((right-start)<(limit-left)) { + /* sort [start..right[ */ + if(start<(right-1)) { + subQuickSort(array, start, right, itemSize, cmp, context, px, pw); + } + + /* sort [left..limit[ */ + start=left; + } else { + /* sort [left..limit[ */ + if(left<(limit-1)) { + subQuickSort(array, left, limit, itemSize, cmp, context, px, pw); + } + + /* sort [start..right[ */ + limit=right; + } + } while(start<(limit-1)); +} + +static void +quickSort(char *array, int32_t length, int32_t itemSize, + UComparator *cmp, const void *context, UErrorCode *pErrorCode) { + UAlignedMemory xw[(2*STACK_ITEM_SIZE)/sizeof(UAlignedMemory)+1]; + void *p; + + /* allocate two intermediate item variables (x and w) */ + if(itemSize<=STACK_ITEM_SIZE) { + p=xw; + } else { + p=uprv_malloc(2*itemSize); + if(p==NULL) { + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; + return; + } + } + + subQuickSort(array, 0, length, itemSize, + cmp, context, p, (char *)p+itemSize); + + if(p!=xw) { + uprv_free(p); + } +} + +/* uprv_sortArray() API ----------------------------------------------------- */ + +/* + * Check arguments, select an appropriate implementation, + * cast the array to char * so that array+i*itemSize works. + */ +U_CAPI void U_EXPORT2 +uprv_sortArray(void *array, int32_t length, int32_t itemSize, + UComparator *cmp, const void *context, + UBool sortStable, UErrorCode *pErrorCode) { + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return; + } + if((length>0 && array==NULL) || length<0 || itemSize<=0 || cmp==NULL) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return; + } + + if(length<=1) { + return; + } else if(length<MIN_QSORT || sortStable) { + insertionSort((char *)array, length, itemSize, cmp, context, pErrorCode); + } else { + quickSort((char *)array, length, itemSize, cmp, context, pErrorCode); + } +} diff --git a/src/core/basetypes/icu-ucsdet/ucln_cmn.cpp b/src/core/basetypes/icu-ucsdet/ucln_cmn.cpp new file mode 100644 index 00000000..2480c9d3 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/ucln_cmn.cpp @@ -0,0 +1,111 @@ +/* +****************************************************************************** +* Copyright (C) 2001-2014, International Business Machines +* Corporation and others. All Rights Reserved. +****************************************************************************** +* file name: ucln_cmn.cpp +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2001July05 +* created by: George Rhoten +*/ + +#include "unicode/utypes.h" +#include "unicode/uclean.h" +#include "cmemory.h" +#include "mutex.h" +#include "uassert.h" +#include "ucln.h" +#include "ucln_cmn.h" +#include "utracimp.h" +#include "umutex.h" + +/** Auto-client for UCLN_COMMON **/ +#define UCLN_TYPE_IS_COMMON +#include "ucln_imp.h" + +static cleanupFunc *gCommonCleanupFunctions[UCLN_COMMON_COUNT]; +static cleanupFunc *gLibCleanupFunctions[UCLN_COMMON]; + + +/************************************************ + The cleanup order is important in this function. + Please be sure that you have read ucln.h + ************************************************/ +U_CAPI void U_EXPORT2 +u_cleanup(void) +{ + UTRACE_ENTRY_OC(UTRACE_U_CLEANUP); + umtx_lock(NULL); /* Force a memory barrier, so that we are sure to see */ + umtx_unlock(NULL); /* all state left around by any other threads. */ + + ucln_lib_cleanup(); + + cmemory_cleanup(); /* undo any heap functions set by u_setMemoryFunctions(). */ + UTRACE_EXIT(); /* Must be before utrace_cleanup(), which turns off tracing. */ +/*#if U_ENABLE_TRACING*/ + utrace_cleanup(); +/*#endif*/ +} + +U_CAPI void U_EXPORT2 ucln_cleanupOne(ECleanupLibraryType libType) +{ + if (gLibCleanupFunctions[libType]) + { + gLibCleanupFunctions[libType](); + gLibCleanupFunctions[libType] = NULL; + } +} + +U_CFUNC void +ucln_common_registerCleanup(ECleanupCommonType type, + cleanupFunc *func) +{ + U_ASSERT(UCLN_COMMON_START < type && type < UCLN_COMMON_COUNT); + if (UCLN_COMMON_START < type && type < UCLN_COMMON_COUNT) + { + icu::Mutex m; // See ticket 10295 for discussion. + gCommonCleanupFunctions[type] = func; + } +#if !UCLN_NO_AUTO_CLEANUP && (defined(UCLN_AUTO_ATEXIT) || defined(UCLN_AUTO_LOCAL)) + ucln_registerAutomaticCleanup(); +#endif +} + +// Note: ucln_registerCleanup() is called with the ICU global mutex locked. +// Be aware if adding anything to the function. +// See ticket 10295 for discussion. + +U_CAPI void U_EXPORT2 +ucln_registerCleanup(ECleanupLibraryType type, + cleanupFunc *func) +{ + U_ASSERT(UCLN_START < type && type < UCLN_COMMON); + if (UCLN_START < type && type < UCLN_COMMON) + { + gLibCleanupFunctions[type] = func; + } +} + +U_CFUNC UBool ucln_lib_cleanup(void) { + int32_t libType = UCLN_START; + int32_t commonFunc = UCLN_COMMON_START; + + for (libType++; libType<UCLN_COMMON; libType++) { + ucln_cleanupOne(static_cast<ECleanupLibraryType>(libType)); + } + + for (commonFunc++; commonFunc<UCLN_COMMON_COUNT; commonFunc++) { + if (gCommonCleanupFunctions[commonFunc]) + { + gCommonCleanupFunctions[commonFunc](); + gCommonCleanupFunctions[commonFunc] = NULL; + } + } +#if !UCLN_NO_AUTO_CLEANUP && (defined(UCLN_AUTO_ATEXIT) || defined(UCLN_AUTO_LOCAL)) + ucln_unRegisterAutomaticCleanup(); +#endif + return TRUE; +} diff --git a/src/core/basetypes/icu-ucsdet/ucln_in.cpp b/src/core/basetypes/icu-ucsdet/ucln_in.cpp new file mode 100644 index 00000000..431d4316 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/ucln_in.cpp @@ -0,0 +1,63 @@ +/* +****************************************************************************** +* * +* Copyright (C) 2001-2014, International Business Machines * +* Corporation and others. All Rights Reserved. * +* * +****************************************************************************** +* file name: ucln_in.cpp +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2001July05 +* created by: George Rhoten +*/ + +#include "ucln.h" +#include "ucln_in.h" +#include "mutex.h" +#include "uassert.h" + +/** Auto-client for UCLN_I18N **/ +#define UCLN_TYPE UCLN_I18N +#include "ucln_imp.h" + +/* Leave this copyright notice here! It needs to go somewhere in this library. */ +static const char copyright[] = U_COPYRIGHT_STRING; + +static cleanupFunc *gCleanupFunctions[UCLN_I18N_COUNT]; + +static UBool i18n_cleanup(void) +{ + int32_t libType = UCLN_I18N_START; + (void)copyright; /* Suppress unused variable warning with clang. */ + + while (++libType<UCLN_I18N_COUNT) { + if (gCleanupFunctions[libType]) + { + gCleanupFunctions[libType](); + gCleanupFunctions[libType] = NULL; + } + } +#if !UCLN_NO_AUTO_CLEANUP && (defined(UCLN_AUTO_ATEXIT) || defined(UCLN_AUTO_LOCAL)) + ucln_unRegisterAutomaticCleanup(); +#endif + return TRUE; +} + +void ucln_i18n_registerCleanup(ECleanupI18NType type, + cleanupFunc *func) { + U_ASSERT(UCLN_I18N_START < type && type < UCLN_I18N_COUNT); + { + icu::Mutex m; // See ticket 10295 for discussion. + ucln_registerCleanup(UCLN_I18N, i18n_cleanup); + if (UCLN_I18N_START < type && type < UCLN_I18N_COUNT) { + gCleanupFunctions[type] = func; + } + } +#if !UCLN_NO_AUTO_CLEANUP && (defined(UCLN_AUTO_ATEXIT) || defined(UCLN_AUTO_LOCAL)) + ucln_registerAutomaticCleanup(); +#endif +} + diff --git a/src/core/basetypes/icu-ucsdet/ucsdet.cpp b/src/core/basetypes/icu-ucsdet/ucsdet.cpp new file mode 100644 index 00000000..b42758d5 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/ucsdet.cpp @@ -0,0 +1,207 @@ +/* + ******************************************************************************** + * Copyright (C) 2005-2013, International Business Machines + * Corporation and others. All Rights Reserved. + ******************************************************************************** + */ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_CONVERSION +#include "unicode/ucsdet.h" +#include "csdetect.h" +#include "csmatch.h" +#include "csrsbcs.h" +#include "csrmbcs.h" +#include "csrutf8.h" +#include "csrucode.h" +#include "csr2022.h" + +#include "cmemory.h" + +U_NAMESPACE_USE + +#define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) + +#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type)) +#define DELETE_ARRAY(array) uprv_free((void *) (array)) + +U_CDECL_BEGIN + +U_CAPI UCharsetDetector * U_EXPORT2 +ucsdet_open(UErrorCode *status) +{ + if(U_FAILURE(*status)) { + return 0; + } + + CharsetDetector* csd = new CharsetDetector(*status); + + if (U_FAILURE(*status)) { + delete csd; + csd = NULL; + } + + return (UCharsetDetector *) csd; +} + +U_CAPI void U_EXPORT2 +ucsdet_close(UCharsetDetector *ucsd) +{ + CharsetDetector *csd = (CharsetDetector *) ucsd; + delete csd; +} + +U_CAPI void U_EXPORT2 +ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status) +{ + if(U_FAILURE(*status)) { + return; + } + + ((CharsetDetector *) ucsd)->setText(textIn, len); +} + +U_CAPI const char * U_EXPORT2 +ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status) +{ + if(U_FAILURE(*status)) { + return NULL; + } + + return ((CharsetMatch *) ucsm)->getName(); +} + +U_CAPI int32_t U_EXPORT2 +ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status) +{ + if(U_FAILURE(*status)) { + return 0; + } + + return ((CharsetMatch *) ucsm)->getConfidence(); +} + +U_CAPI const char * U_EXPORT2 +ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status) +{ + if(U_FAILURE(*status)) { + return NULL; + } + + return ((CharsetMatch *) ucsm)->getLanguage(); +} + +U_CAPI const UCharsetMatch * U_EXPORT2 +ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status) +{ + if(U_FAILURE(*status)) { + return NULL; + } + + return (const UCharsetMatch *) ((CharsetDetector *) ucsd)->detect(*status); +} + +U_CAPI void U_EXPORT2 +ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status) +{ + if(U_FAILURE(*status)) { + return; + } + + ((CharsetDetector *) ucsd)->setDeclaredEncoding(encoding,length); +} + +U_CAPI const UCharsetMatch** +ucsdet_detectAll(UCharsetDetector *ucsd, + int32_t *maxMatchesFound, UErrorCode *status) +{ + if(U_FAILURE(*status)) { + return NULL; + } + + CharsetDetector *csd = (CharsetDetector *) ucsd; + + return (const UCharsetMatch**)csd->detectAll(*maxMatchesFound,*status); +} + +// U_CAPI const char * U_EXPORT2 +// ucsdet_getDetectableCharsetName(const UCharsetDetector *csd, int32_t index, UErrorCode *status) +// { +// if(U_FAILURE(*status)) { +// return 0; +// } +// return csd->getCharsetName(index,*status); +// } + +// U_CAPI int32_t U_EXPORT2 +// ucsdet_getDetectableCharsetsCount(const UCharsetDetector *csd, UErrorCode *status) +// { +// if(U_FAILURE(*status)) { +// return -1; +// } +// return UCharsetDetector::getDetectableCount(); +// } + +U_CAPI UBool U_EXPORT2 +ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd) +{ + // todo: could use an error return... + if (ucsd == NULL) { + return FALSE; + } + + return ((CharsetDetector *) ucsd)->getStripTagsFlag(); +} + +U_CAPI UBool U_EXPORT2 +ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter) +{ + // todo: could use an error return... + if (ucsd == NULL) { + return FALSE; + } + + CharsetDetector *csd = (CharsetDetector *) ucsd; + UBool prev = csd->getStripTagsFlag(); + + csd->setStripTagsFlag(filter); + + return prev; +} + +/* +U_CAPI int32_t U_EXPORT2 +ucsdet_getUChars(const UCharsetMatch *ucsm, + UChar *buf, int32_t cap, UErrorCode *status) +{ + if(U_FAILURE(*status)) { + return 0; + } + + return ((CharsetMatch *) ucsm)->getUChars(buf, cap, status); +} +*/ + +U_CAPI void U_EXPORT2 +ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status) +{ + ((CharsetDetector *)ucsd)->setDetectableCharset(encoding, enabled, *status); +} + +U_CAPI UEnumeration * U_EXPORT2 +ucsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status) +{ + return CharsetDetector::getAllDetectableCharsets(*status); +} + +U_DRAFT UEnumeration * U_EXPORT2 +ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status) +{ + return ((CharsetDetector *)ucsd)->getDetectableCharsets(*status); +} + +U_CDECL_END + + +#endif diff --git a/src/core/basetypes/icu-ucsdet/udataswp.c b/src/core/basetypes/icu-ucsdet/udataswp.c new file mode 100644 index 00000000..06fe85bc --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/udataswp.c @@ -0,0 +1,471 @@ +/* +******************************************************************************* +* +* Copyright (C) 2003-2014, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: udataswp.c +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2003jun05 +* created by: Markus W. Scherer +* +* Definitions for ICU data transformations for different platforms, +* changing between big- and little-endian data and/or between +* charset families (ASCII<->EBCDIC). +*/ + +#include <stdarg.h> +#include "unicode/utypes.h" +#include "unicode/udata.h" /* UDataInfo */ +#include "ucmndata.h" /* DataHeader */ +#include "cmemory.h" +#include "udataswp.h" + +/* swapping primitives ------------------------------------------------------ */ + +static int32_t U_CALLCONV +uprv_swapArray16(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode) { + const uint16_t *p; + uint16_t *q; + int32_t count; + uint16_t x; + + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return 0; + } + if(ds==NULL || inData==NULL || length<0 || (length&1)!=0 || outData==NULL) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + /* setup and swapping */ + p=(const uint16_t *)inData; + q=(uint16_t *)outData; + count=length/2; + while(count>0) { + x=*p++; + *q++=(uint16_t)((x<<8)|(x>>8)); + --count; + } + + return length; +} + +static int32_t U_CALLCONV +uprv_copyArray16(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode) { + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return 0; + } + if(ds==NULL || inData==NULL || length<0 || (length&1)!=0 || outData==NULL) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + if(length>0 && inData!=outData) { + uprv_memcpy(outData, inData, length); + } + return length; +} + +static int32_t U_CALLCONV +uprv_swapArray32(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode) { + const uint32_t *p; + uint32_t *q; + int32_t count; + uint32_t x; + + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return 0; + } + if(ds==NULL || inData==NULL || length<0 || (length&3)!=0 || outData==NULL) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + /* setup and swapping */ + p=(const uint32_t *)inData; + q=(uint32_t *)outData; + count=length/4; + while(count>0) { + x=*p++; + *q++=(uint32_t)((x<<24)|((x<<8)&0xff0000)|((x>>8)&0xff00)|(x>>24)); + --count; + } + + return length; +} + +static int32_t U_CALLCONV +uprv_copyArray32(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode) { + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return 0; + } + if(ds==NULL || inData==NULL || length<0 || (length&3)!=0 || outData==NULL) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + if(length>0 && inData!=outData) { + uprv_memcpy(outData, inData, length); + } + return length; +} + +static int32_t U_CALLCONV +uprv_swapArray64(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode) { + const uint64_t *p; + uint64_t *q; + int32_t count; + + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return 0; + } + if(ds==NULL || inData==NULL || length<0 || (length&7)!=0 || outData==NULL) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + /* setup and swapping */ + p=(const uint64_t *)inData; + q=(uint64_t *)outData; + count=length/8; + while(count>0) { + uint64_t x=*p++; + x=(x<<56)|((x&0xff00)<<40)|((x&0xff0000)<<24)|((x&0xff000000)<<8)| + ((x>>8)&0xff000000)|((x>>24)&0xff0000)|((x>>40)&0xff00)|(x>>56); + *q++=x; + --count; + } + + return length; +} + +static int32_t U_CALLCONV +uprv_copyArray64(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode) { + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return 0; + } + if(ds==NULL || inData==NULL || length<0 || (length&7)!=0 || outData==NULL) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + if(length>0 && inData!=outData) { + uprv_memcpy(outData, inData, length); + } + return length; +} + +static uint16_t U_CALLCONV +uprv_readSwapUInt16(uint16_t x) { + return (uint16_t)((x<<8)|(x>>8)); +} + +static uint16_t U_CALLCONV +uprv_readDirectUInt16(uint16_t x) { + return x; +} + +static uint32_t U_CALLCONV +uprv_readSwapUInt32(uint32_t x) { + return (uint32_t)((x<<24)|((x<<8)&0xff0000)|((x>>8)&0xff00)|(x>>24)); +} + +static uint32_t U_CALLCONV +uprv_readDirectUInt32(uint32_t x) { + return x; +} + +static void U_CALLCONV +uprv_writeSwapUInt16(uint16_t *p, uint16_t x) { + *p=(uint16_t)((x<<8)|(x>>8)); +} + +static void U_CALLCONV +uprv_writeDirectUInt16(uint16_t *p, uint16_t x) { + *p=x; +} + +static void U_CALLCONV +uprv_writeSwapUInt32(uint32_t *p, uint32_t x) { + *p=(uint32_t)((x<<24)|((x<<8)&0xff0000)|((x>>8)&0xff00)|(x>>24)); +} + +static void U_CALLCONV +uprv_writeDirectUInt32(uint32_t *p, uint32_t x) { + *p=x; +} + +U_CAPI int16_t U_EXPORT2 +udata_readInt16(const UDataSwapper *ds, int16_t x) { + return (int16_t)ds->readUInt16((uint16_t)x); +} + +U_CAPI int32_t U_EXPORT2 +udata_readInt32(const UDataSwapper *ds, int32_t x) { + return (int32_t)ds->readUInt32((uint32_t)x); +} + +/** + * Swap a block of invariant, NUL-terminated strings, but not padding + * bytes after the last string. + * @internal + */ +U_CAPI int32_t U_EXPORT2 +udata_swapInvStringBlock(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode) { + const char *inChars; + int32_t stringsLength; + + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return 0; + } + if(ds==NULL || inData==NULL || length<0 || (length>0 && outData==NULL)) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + /* reduce the strings length to not include bytes after the last NUL */ + inChars=(const char *)inData; + stringsLength=length; + while(stringsLength>0 && inChars[stringsLength-1]!=0) { + --stringsLength; + } + + /* swap up to the last NUL */ + ds->swapInvChars(ds, inData, stringsLength, outData, pErrorCode); + + /* copy the bytes after the last NUL */ + if(inData!=outData && length>stringsLength) { + uprv_memcpy((char *)outData+stringsLength, inChars+stringsLength, length-stringsLength); + } + + /* return the length including padding bytes */ + if(U_SUCCESS(*pErrorCode)) { + return length; + } else { + return 0; + } +} + +U_CAPI void U_EXPORT2 +udata_printError(const UDataSwapper *ds, + const char *fmt, + ...) { + va_list args; + + if(ds->printError!=NULL) { + va_start(args, fmt); + ds->printError(ds->printErrorContext, fmt, args); + va_end(args); + } +} + +/* swap a data header ------------------------------------------------------- */ + +U_CAPI int32_t U_EXPORT2 +udata_swapDataHeader(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode) { + const DataHeader *pHeader; + uint16_t headerSize, infoSize; + + /* argument checking */ + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return 0; + } + if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + /* check minimum length and magic bytes */ + pHeader=(const DataHeader *)inData; + if( (length>=0 && length<sizeof(DataHeader)) || + pHeader->dataHeader.magic1!=0xda || + pHeader->dataHeader.magic2!=0x27 || + pHeader->info.sizeofUChar!=2 + ) { + udata_printError(ds, "udata_swapDataHeader(): initial bytes do not look like ICU data\n"); + *pErrorCode=U_UNSUPPORTED_ERROR; + return 0; + } + + headerSize=ds->readUInt16(pHeader->dataHeader.headerSize); + infoSize=ds->readUInt16(pHeader->info.size); + + if( headerSize<sizeof(DataHeader) || + infoSize<sizeof(UDataInfo) || + headerSize<(sizeof(pHeader->dataHeader)+infoSize) || + (length>=0 && length<headerSize) + ) { + udata_printError(ds, "udata_swapDataHeader(): header size mismatch - headerSize %d infoSize %d length %d\n", + headerSize, infoSize, length); + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + + if(length>0) { + DataHeader *outHeader; + const char *s; + int32_t maxLength; + + /* Most of the fields are just bytes and need no swapping. */ + if(inData!=outData) { + uprv_memcpy(outData, inData, headerSize); + } + outHeader=(DataHeader *)outData; + + outHeader->info.isBigEndian = ds->outIsBigEndian; + outHeader->info.charsetFamily = ds->outCharset; + + /* swap headerSize */ + ds->swapArray16(ds, &pHeader->dataHeader.headerSize, 2, &outHeader->dataHeader.headerSize, pErrorCode); + + /* swap UDataInfo size and reservedWord */ + ds->swapArray16(ds, &pHeader->info.size, 4, &outHeader->info.size, pErrorCode); + + /* swap copyright statement after the UDataInfo */ + infoSize+=sizeof(pHeader->dataHeader); + s=(const char *)inData+infoSize; + maxLength=headerSize-infoSize; + /* get the length of the string */ + for(length=0; length<maxLength && s[length]!=0; ++length) {} + /* swap the string contents */ + ds->swapInvChars(ds, s, length, (char *)outData+infoSize, pErrorCode); + } + + return headerSize; +} + +/* API functions ------------------------------------------------------------ */ + +U_CAPI UDataSwapper * U_EXPORT2 +udata_openSwapper(UBool inIsBigEndian, uint8_t inCharset, + UBool outIsBigEndian, uint8_t outCharset, + UErrorCode *pErrorCode) { + UDataSwapper *swapper; + + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return NULL; + } + if(inCharset>U_EBCDIC_FAMILY || outCharset>U_EBCDIC_FAMILY) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return NULL; + } + + /* allocate the swapper */ + swapper=uprv_malloc(sizeof(UDataSwapper)); + if(swapper==NULL) { + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + uprv_memset(swapper, 0, sizeof(UDataSwapper)); + + /* set values and functions pointers according to in/out parameters */ + swapper->inIsBigEndian=inIsBigEndian; + swapper->inCharset=inCharset; + swapper->outIsBigEndian=outIsBigEndian; + swapper->outCharset=outCharset; + + swapper->readUInt16= inIsBigEndian==U_IS_BIG_ENDIAN ? uprv_readDirectUInt16 : uprv_readSwapUInt16; + swapper->readUInt32= inIsBigEndian==U_IS_BIG_ENDIAN ? uprv_readDirectUInt32 : uprv_readSwapUInt32; + + swapper->writeUInt16= outIsBigEndian==U_IS_BIG_ENDIAN ? uprv_writeDirectUInt16 : uprv_writeSwapUInt16; + swapper->writeUInt32= outIsBigEndian==U_IS_BIG_ENDIAN ? uprv_writeDirectUInt32 : uprv_writeSwapUInt32; + + swapper->compareInvChars= outCharset==U_ASCII_FAMILY ? uprv_compareInvAscii : uprv_compareInvEbcdic; + + if(inIsBigEndian==outIsBigEndian) { + swapper->swapArray16=uprv_copyArray16; + swapper->swapArray32=uprv_copyArray32; + swapper->swapArray64=uprv_copyArray64; + } else { + swapper->swapArray16=uprv_swapArray16; + swapper->swapArray32=uprv_swapArray32; + swapper->swapArray64=uprv_swapArray64; + } + + if(inCharset==U_ASCII_FAMILY) { + swapper->swapInvChars= outCharset==U_ASCII_FAMILY ? uprv_copyAscii : uprv_ebcdicFromAscii; + } else /* U_EBCDIC_FAMILY */ { + swapper->swapInvChars= outCharset==U_EBCDIC_FAMILY ? uprv_copyEbcdic : uprv_asciiFromEbcdic; + } + + return swapper; +} + +U_CAPI UDataSwapper * U_EXPORT2 +udata_openSwapperForInputData(const void *data, int32_t length, + UBool outIsBigEndian, uint8_t outCharset, + UErrorCode *pErrorCode) { + const DataHeader *pHeader; + uint16_t headerSize, infoSize; + UBool inIsBigEndian; + int8_t inCharset; + + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return NULL; + } + if( data==NULL || + (length>=0 && length<sizeof(DataHeader)) || + outCharset>U_EBCDIC_FAMILY + ) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return NULL; + } + + pHeader=(const DataHeader *)data; + if( (length>=0 && length<sizeof(DataHeader)) || + pHeader->dataHeader.magic1!=0xda || + pHeader->dataHeader.magic2!=0x27 || + pHeader->info.sizeofUChar!=2 + ) { + *pErrorCode=U_UNSUPPORTED_ERROR; + return 0; + } + + inIsBigEndian=(UBool)pHeader->info.isBigEndian; + inCharset=pHeader->info.charsetFamily; + + if(inIsBigEndian==U_IS_BIG_ENDIAN) { + headerSize=pHeader->dataHeader.headerSize; + infoSize=pHeader->info.size; + } else { + headerSize=uprv_readSwapUInt16(pHeader->dataHeader.headerSize); + infoSize=uprv_readSwapUInt16(pHeader->info.size); + } + + if( headerSize<sizeof(DataHeader) || + infoSize<sizeof(UDataInfo) || + headerSize<(sizeof(pHeader->dataHeader)+infoSize) || + (length>=0 && length<headerSize) + ) { + *pErrorCode=U_UNSUPPORTED_ERROR; + return 0; + } + + return udata_openSwapper(inIsBigEndian, inCharset, outIsBigEndian, outCharset, pErrorCode); +} + +U_CAPI void U_EXPORT2 +udata_closeSwapper(UDataSwapper *ds) { + uprv_free(ds); +} diff --git a/src/core/basetypes/icu-ucsdet/uenum.c b/src/core/basetypes/icu-ucsdet/uenum.c new file mode 100644 index 00000000..9a3d9e14 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/uenum.c @@ -0,0 +1,187 @@ +/* +******************************************************************************* +* +* Copyright (C) 2002-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: uenum.c +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:2 +* +* created on: 2002jul08 +* created by: Vladimir Weinstein +*/ + +#include "unicode/putil.h" +#include "uenumimp.h" +#include "cmemory.h" + +/* Layout of the baseContext buffer. */ +typedef struct { + int32_t len; /* number of bytes available starting at 'data' */ + char data; /* actual data starts here */ +} _UEnumBuffer; + +/* Extra bytes to allocate in the baseContext buffer. */ +static const int32_t PAD = 8; + +/* Return a pointer to the baseContext buffer, possibly allocating + or reallocating it if at least 'capacity' bytes are not available. */ +static void* _getBuffer(UEnumeration* en, int32_t capacity) { + + if (en->baseContext != NULL) { + if (((_UEnumBuffer*) en->baseContext)->len < capacity) { + capacity += PAD; + en->baseContext = uprv_realloc(en->baseContext, + sizeof(int32_t) + capacity); + if (en->baseContext == NULL) { + return NULL; + } + ((_UEnumBuffer*) en->baseContext)->len = capacity; + } + } else { + capacity += PAD; + en->baseContext = uprv_malloc(sizeof(int32_t) + capacity); + if (en->baseContext == NULL) { + return NULL; + } + ((_UEnumBuffer*) en->baseContext)->len = capacity; + } + + return (void*) & ((_UEnumBuffer*) en->baseContext)->data; +} + +U_CAPI void U_EXPORT2 +uenum_close(UEnumeration* en) +{ + if (en) { + if (en->close != NULL) { + if (en->baseContext) { + uprv_free(en->baseContext); + } + en->close(en); + } else { /* this seems dangerous, but we better kill the object */ + uprv_free(en); + } + } +} + +U_CAPI int32_t U_EXPORT2 +uenum_count(UEnumeration* en, UErrorCode* status) +{ + if (!en || U_FAILURE(*status)) { + return -1; + } + if (en->count != NULL) { + return en->count(en, status); + } else { + *status = U_UNSUPPORTED_ERROR; + return -1; + } +} + +/* Don't call this directly. Only uenum_unext should be calling this. */ +U_CAPI const UChar* U_EXPORT2 +uenum_unextDefault(UEnumeration* en, + int32_t* resultLength, + UErrorCode* status) +{ + UChar *ustr = NULL; + int32_t len = 0; + if (en->next != NULL) { + const char *cstr = en->next(en, &len, status); + if (cstr != NULL) { + ustr = (UChar*) _getBuffer(en, (len+1) * sizeof(UChar)); + if (ustr == NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + } else { + u_charsToUChars(cstr, ustr, len+1); + } + } + } else { + *status = U_UNSUPPORTED_ERROR; + } + if (resultLength) { + *resultLength = len; + } + return ustr; +} + +/* Don't call this directly. Only uenum_next should be calling this. */ +U_CAPI const char* U_EXPORT2 +uenum_nextDefault(UEnumeration* en, + int32_t* resultLength, + UErrorCode* status) +{ + if (en->uNext != NULL) { + char *tempCharVal; + const UChar *tempUCharVal = en->uNext(en, resultLength, status); + if (tempUCharVal == NULL) { + return NULL; + } + tempCharVal = (char*) + _getBuffer(en, (*resultLength+1) * sizeof(char)); + if (!tempCharVal) { + *status = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + u_UCharsToChars(tempUCharVal, tempCharVal, *resultLength + 1); + return tempCharVal; + } else { + *status = U_UNSUPPORTED_ERROR; + return NULL; + } +} + +U_CAPI const UChar* U_EXPORT2 +uenum_unext(UEnumeration* en, + int32_t* resultLength, + UErrorCode* status) +{ + if (!en || U_FAILURE(*status)) { + return NULL; + } + if (en->uNext != NULL) { + return en->uNext(en, resultLength, status); + } else { + *status = U_UNSUPPORTED_ERROR; + return NULL; + } +} + +U_CAPI const char* U_EXPORT2 +uenum_next(UEnumeration* en, + int32_t* resultLength, + UErrorCode* status) +{ + if (!en || U_FAILURE(*status)) { + return NULL; + } + if (en->next != NULL) { + if (resultLength != NULL) { + return en->next(en, resultLength, status); + } + else { + int32_t dummyLength=0; + return en->next(en, &dummyLength, status); + } + } else { + *status = U_UNSUPPORTED_ERROR; + return NULL; + } +} + +U_CAPI void U_EXPORT2 +uenum_reset(UEnumeration* en, UErrorCode* status) +{ + if (!en || U_FAILURE(*status)) { + return; + } + if (en->reset != NULL) { + en->reset(en, status); + } else { + *status = U_UNSUPPORTED_ERROR; + } +} diff --git a/src/core/basetypes/icu-ucsdet/uinvchar.c b/src/core/basetypes/icu-ucsdet/uinvchar.c new file mode 100644 index 00000000..f874edd9 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/uinvchar.c @@ -0,0 +1,611 @@ +/* +******************************************************************************* +* +* Copyright (C) 1999-2010, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: uinvchar.c +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:2 +* +* created on: 2004sep14 +* created by: Markus W. Scherer +* +* Functions for handling invariant characters, moved here from putil.c +* for better modularization. +*/ + +#include "unicode/utypes.h" +#include "unicode/ustring.h" +#include "udataswp.h" +#include "cstring.h" +#include "cmemory.h" +#include "uassert.h" +#include "uinvchar.h" + +/* invariant-character handling --------------------------------------------- */ + +/* + * These maps for ASCII to/from EBCDIC map invariant characters (see utypes.h) + * appropriately for most EBCDIC codepages. + * + * They currently also map most other ASCII graphic characters, + * appropriately for codepages 37 and 1047. + * Exceptions: The characters for []^ have different codes in 37 & 1047. + * Both versions are mapped to ASCII. + * + * ASCII 37 1047 + * [ 5B BA AD + * ] 5D BB BD + * ^ 5E B0 5F + * + * There are no mappings for variant characters from Unicode to EBCDIC. + * + * Currently, C0 control codes are also included in these maps. + * Exceptions: S/390 Open Edition swaps LF and NEL codes compared with other + * EBCDIC platforms; both codes (15 and 25) are mapped to ASCII LF (0A), + * but there is no mapping for ASCII LF back to EBCDIC. + * + * ASCII EBCDIC S/390-OE + * LF 0A 25 15 + * NEL 85 15 25 + * + * The maps below explicitly exclude the variant + * control and graphical characters that are in ASCII-based + * codepages at 0x80 and above. + * "No mapping" is expressed by mapping to a 00 byte. + * + * These tables do not establish a converter or a codepage. + */ + +static const uint8_t asciiFromEbcdic[256]={ + 0x00, 0x01, 0x02, 0x03, 0x00, 0x09, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x00, 0x0a, 0x08, 0x00, 0x18, 0x19, 0x00, 0x00, 0x1c, 0x1d, 0x1e, 0x1f, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x17, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x06, 0x07, + 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x14, 0x15, 0x00, 0x1a, + + 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2e, 0x3c, 0x28, 0x2b, 0x7c, + 0x26, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0x5e, + 0x2d, 0x2f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2c, 0x25, 0x5f, 0x3e, 0x3f, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x60, 0x3a, 0x23, 0x40, 0x27, 0x3d, 0x22, + + 0x00, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x7e, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, + 0x5e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5b, 0x5d, 0x00, 0x5d, 0x00, 0x00, + + 0x7b, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x7d, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x5c, 0x00, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static const uint8_t ebcdicFromAscii[256]={ + 0x00, 0x01, 0x02, 0x03, 0x37, 0x2d, 0x2e, 0x2f, 0x16, 0x05, 0x00, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x3c, 0x3d, 0x32, 0x26, 0x18, 0x19, 0x3f, 0x27, 0x1c, 0x1d, 0x1e, 0x1f, + 0x40, 0x00, 0x7f, 0x00, 0x00, 0x6c, 0x50, 0x7d, 0x4d, 0x5d, 0x5c, 0x4e, 0x6b, 0x60, 0x4b, 0x61, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0x7a, 0x5e, 0x4c, 0x7e, 0x6e, 0x6f, + + 0x00, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, + 0xd7, 0xd8, 0xd9, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0x00, 0x00, 0x00, 0x00, 0x6d, + 0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, + 0x97, 0x98, 0x99, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0x00, 0x00, 0x00, 0x00, 0x07, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* Same as asciiFromEbcdic[] except maps all letters to lowercase. */ +static const uint8_t lowercaseAsciiFromEbcdic[256]={ + 0x00, 0x01, 0x02, 0x03, 0x00, 0x09, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x00, 0x0a, 0x08, 0x00, 0x18, 0x19, 0x00, 0x00, 0x1c, 0x1d, 0x1e, 0x1f, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x17, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x06, 0x07, + 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x14, 0x15, 0x00, 0x1a, + + 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2e, 0x3c, 0x28, 0x2b, 0x7c, + 0x26, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0x5e, + 0x2d, 0x2f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2c, 0x25, 0x5f, 0x3e, 0x3f, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x60, 0x3a, 0x23, 0x40, 0x27, 0x3d, 0x22, + + 0x00, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x7e, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, + 0x5e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5b, 0x5d, 0x00, 0x5d, 0x00, 0x00, + + 0x7b, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x7d, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x7c, 0x00, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +/* + * Bit sets indicating which characters of the ASCII repertoire + * (by ASCII/Unicode code) are "invariant". + * See utypes.h for more details. + * + * As invariant are considered the characters of the ASCII repertoire except + * for the following: + * 21 '!' <exclamation mark> + * 23 '#' <number sign> + * 24 '$' <dollar sign> + * + * 40 '@' <commercial at> + * + * 5b '[' <left bracket> + * 5c '\' <backslash> + * 5d ']' <right bracket> + * 5e '^' <circumflex> + * + * 60 '`' <grave accent> + * + * 7b '{' <left brace> + * 7c '|' <vertical line> + * 7d '}' <right brace> + * 7e '~' <tilde> + */ +static const uint32_t invariantChars[4]={ + 0xfffffbff, /* 00..1f but not 0a */ + 0xffffffe5, /* 20..3f but not 21 23 24 */ + 0x87fffffe, /* 40..5f but not 40 5b..5e */ + 0x87fffffe /* 60..7f but not 60 7b..7e */ +}; + +/* + * test unsigned types (or values known to be non-negative) for invariant characters, + * tests ASCII-family character values + */ +#define UCHAR_IS_INVARIANT(c) (((c)<=0x7f) && (invariantChars[(c)>>5]&((uint32_t)1<<((c)&0x1f)))!=0) + +/* test signed types for invariant characters, adds test for positive values */ +#define SCHAR_IS_INVARIANT(c) ((0<=(c)) && UCHAR_IS_INVARIANT(c)) + +#if U_CHARSET_FAMILY==U_ASCII_FAMILY +#define CHAR_TO_UCHAR(c) c +#define UCHAR_TO_CHAR(c) c +#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY +#define CHAR_TO_UCHAR(u) asciiFromEbcdic[u] +#define UCHAR_TO_CHAR(u) ebcdicFromAscii[u] +#else +# error U_CHARSET_FAMILY is not valid +#endif + + +U_CAPI void U_EXPORT2 +u_charsToUChars(const char *cs, UChar *us, int32_t length) { + UChar u; + uint8_t c; + + /* + * Allow the entire ASCII repertoire to be mapped _to_ Unicode. + * For EBCDIC systems, this works for characters with codes from + * codepages 37 and 1047 or compatible. + */ + while(length>0) { + c=(uint8_t)(*cs++); + u=(UChar)CHAR_TO_UCHAR(c); + U_ASSERT((u!=0 || c==0)); /* only invariant chars converted? */ + *us++=u; + --length; + } +} + +U_CAPI void U_EXPORT2 +u_UCharsToChars(const UChar *us, char *cs, int32_t length) { + UChar u; + + while(length>0) { + u=*us++; + if(!UCHAR_IS_INVARIANT(u)) { + U_ASSERT(FALSE); /* Variant characters were used. These are not portable in ICU. */ + u=0; + } + *cs++=(char)UCHAR_TO_CHAR(u); + --length; + } +} + +U_CAPI UBool U_EXPORT2 +uprv_isInvariantString(const char *s, int32_t length) { + uint8_t c; + + for(;;) { + if(length<0) { + /* NUL-terminated */ + c=(uint8_t)*s++; + if(c==0) { + break; + } + } else { + /* count length */ + if(length==0) { + break; + } + --length; + c=(uint8_t)*s++; + if(c==0) { + continue; /* NUL is invariant */ + } + } + /* c!=0 now, one branch below checks c==0 for variant characters */ + + /* + * no assertions here because these functions are legitimately called + * for strings with variant characters + */ +#if U_CHARSET_FAMILY==U_ASCII_FAMILY + if(!UCHAR_IS_INVARIANT(c)) { + return FALSE; /* found a variant char */ + } +#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY + c=CHAR_TO_UCHAR(c); + if(c==0 || !UCHAR_IS_INVARIANT(c)) { + return FALSE; /* found a variant char */ + } +#else +# error U_CHARSET_FAMILY is not valid +#endif + } + return TRUE; +} + +U_CAPI UBool U_EXPORT2 +uprv_isInvariantUString(const UChar *s, int32_t length) { + UChar c; + + for(;;) { + if(length<0) { + /* NUL-terminated */ + c=*s++; + if(c==0) { + break; + } + } else { + /* count length */ + if(length==0) { + break; + } + --length; + c=*s++; + } + + /* + * no assertions here because these functions are legitimately called + * for strings with variant characters + */ + if(!UCHAR_IS_INVARIANT(c)) { + return FALSE; /* found a variant char */ + } + } + return TRUE; +} + +/* UDataSwapFn implementations used in udataswp.c ------- */ + +/* convert ASCII to EBCDIC and verify that all characters are invariant */ +U_CAPI int32_t U_EXPORT2 +uprv_ebcdicFromAscii(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode) { + const uint8_t *s; + uint8_t *t; + uint8_t c; + + int32_t count; + + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return 0; + } + if(ds==NULL || inData==NULL || length<0 || (length>0 && outData==NULL)) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + /* setup and swapping */ + s=(const uint8_t *)inData; + t=(uint8_t *)outData; + count=length; + while(count>0) { + c=*s++; + if(!UCHAR_IS_INVARIANT(c)) { + udata_printError(ds, "uprv_ebcdicFromAscii() string[%d] contains a variant character in position %d\n", + length, length-count); + *pErrorCode=U_INVALID_CHAR_FOUND; + return 0; + } + *t++=ebcdicFromAscii[c]; + --count; + } + + return length; +} + +/* this function only checks and copies ASCII strings without conversion */ +U_CFUNC int32_t +uprv_copyAscii(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode) { + const uint8_t *s; + uint8_t c; + + int32_t count; + + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return 0; + } + if(ds==NULL || inData==NULL || length<0 || (length>0 && outData==NULL)) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + /* setup and checking */ + s=(const uint8_t *)inData; + count=length; + while(count>0) { + c=*s++; + if(!UCHAR_IS_INVARIANT(c)) { + udata_printError(ds, "uprv_copyFromAscii() string[%d] contains a variant character in position %d\n", + length, length-count); + *pErrorCode=U_INVALID_CHAR_FOUND; + return 0; + } + --count; + } + + if(length>0 && inData!=outData) { + uprv_memcpy(outData, inData, length); + } + + return length; +} + +/* convert EBCDIC to ASCII and verify that all characters are invariant */ +U_CFUNC int32_t +uprv_asciiFromEbcdic(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode) { + const uint8_t *s; + uint8_t *t; + uint8_t c; + + int32_t count; + + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return 0; + } + if(ds==NULL || inData==NULL || length<0 || (length>0 && outData==NULL)) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + /* setup and swapping */ + s=(const uint8_t *)inData; + t=(uint8_t *)outData; + count=length; + while(count>0) { + c=*s++; + if(c!=0 && ((c=asciiFromEbcdic[c])==0 || !UCHAR_IS_INVARIANT(c))) { + udata_printError(ds, "uprv_asciiFromEbcdic() string[%d] contains a variant character in position %d\n", + length, length-count); + *pErrorCode=U_INVALID_CHAR_FOUND; + return 0; + } + *t++=c; + --count; + } + + return length; +} + +/* this function only checks and copies EBCDIC strings without conversion */ +U_CFUNC int32_t +uprv_copyEbcdic(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode) { + const uint8_t *s; + uint8_t c; + + int32_t count; + + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return 0; + } + if(ds==NULL || inData==NULL || length<0 || (length>0 && outData==NULL)) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + /* setup and checking */ + s=(const uint8_t *)inData; + count=length; + while(count>0) { + c=*s++; + if(c!=0 && ((c=asciiFromEbcdic[c])==0 || !UCHAR_IS_INVARIANT(c))) { + udata_printError(ds, "uprv_copyEbcdic() string[%] contains a variant character in position %d\n", + length, length-count); + *pErrorCode=U_INVALID_CHAR_FOUND; + return 0; + } + --count; + } + + if(length>0 && inData!=outData) { + uprv_memcpy(outData, inData, length); + } + + return length; +} + +/* compare invariant strings; variant characters compare less than others and unlike each other */ +U_CFUNC int32_t +uprv_compareInvAscii(const UDataSwapper *ds, + const char *outString, int32_t outLength, + const UChar *localString, int32_t localLength) { + int32_t minLength; + UChar32 c1, c2; + uint8_t c; + + if(outString==NULL || outLength<-1 || localString==NULL || localLength<-1) { + return 0; + } + + if(outLength<0) { + outLength=(int32_t)uprv_strlen(outString); + } + if(localLength<0) { + localLength=u_strlen(localString); + } + + minLength= outLength<localLength ? outLength : localLength; + + while(minLength>0) { + c=(uint8_t)*outString++; + if(UCHAR_IS_INVARIANT(c)) { + c1=c; + } else { + c1=-1; + } + + c2=*localString++; + if(!UCHAR_IS_INVARIANT(c2)) { + c2=-2; + } + + if((c1-=c2)!=0) { + return c1; + } + + --minLength; + } + + /* strings start with same prefix, compare lengths */ + return outLength-localLength; +} + +U_CFUNC int32_t +uprv_compareInvEbcdic(const UDataSwapper *ds, + const char *outString, int32_t outLength, + const UChar *localString, int32_t localLength) { + int32_t minLength; + UChar32 c1, c2; + uint8_t c; + + if(outString==NULL || outLength<-1 || localString==NULL || localLength<-1) { + return 0; + } + + if(outLength<0) { + outLength=(int32_t)uprv_strlen(outString); + } + if(localLength<0) { + localLength=u_strlen(localString); + } + + minLength= outLength<localLength ? outLength : localLength; + + while(minLength>0) { + c=(uint8_t)*outString++; + if(c==0) { + c1=0; + } else if((c1=asciiFromEbcdic[c])!=0 && UCHAR_IS_INVARIANT(c1)) { + /* c1 is set */ + } else { + c1=-1; + } + + c2=*localString++; + if(!UCHAR_IS_INVARIANT(c2)) { + c2=-2; + } + + if((c1-=c2)!=0) { + return c1; + } + + --minLength; + } + + /* strings start with same prefix, compare lengths */ + return outLength-localLength; +} + +U_CAPI int32_t U_EXPORT2 +uprv_compareInvEbcdicAsAscii(const char *s1, const char *s2) { + int32_t c1, c2; + + for(;; ++s1, ++s2) { + c1=(uint8_t)*s1; + c2=(uint8_t)*s2; + if(c1!=c2) { + if(c1!=0 && ((c1=asciiFromEbcdic[c1])==0 || !UCHAR_IS_INVARIANT(c1))) { + c1=-(int32_t)(uint8_t)*s1; + } + if(c2!=0 && ((c2=asciiFromEbcdic[c2])==0 || !UCHAR_IS_INVARIANT(c2))) { + c2=-(int32_t)(uint8_t)*s2; + } + return c1-c2; + } else if(c1==0) { + return 0; + } + } +} + +U_CAPI char U_EXPORT2 +uprv_ebcdicToLowercaseAscii(char c) { + return (char)lowercaseAsciiFromEbcdic[(uint8_t)c]; +} + +U_INTERNAL uint8_t* U_EXPORT2 +uprv_aestrncpy(uint8_t *dst, const uint8_t *src, int32_t n) +{ + uint8_t *orig_dst = dst; + + if(n==-1) { + n = uprv_strlen((const char*)src)+1; /* copy NUL */ + } + /* copy non-null */ + while(*src && n>0) { + *(dst++) = asciiFromEbcdic[*(src++)]; + n--; + } + /* pad */ + while(n>0) { + *(dst++) = 0; + n--; + } + return orig_dst; +} + +U_INTERNAL uint8_t* U_EXPORT2 +uprv_eastrncpy(uint8_t *dst, const uint8_t *src, int32_t n) +{ + uint8_t *orig_dst = dst; + + if(n==-1) { + n = uprv_strlen((const char*)src)+1; /* copy NUL */ + } + /* copy non-null */ + while(*src && n>0) { + char ch = ebcdicFromAscii[*(src++)]; + if(ch == 0) { + ch = ebcdicFromAscii[0x3f]; /* questionmark (subchar) */ + } + *(dst++) = ch; + n--; + } + /* pad */ + while(n>0) { + *(dst++) = 0; + n--; + } + return orig_dst; +} + diff --git a/src/core/basetypes/icu-ucsdet/umutex.cpp b/src/core/basetypes/icu-ucsdet/umutex.cpp new file mode 100644 index 00000000..0c1fdc80 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/umutex.cpp @@ -0,0 +1,392 @@ +/* +****************************************************************************** +* +* Copyright (C) 1997-2014, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* +* File umutex.cpp +* +* Modification History: +* +* Date Name Description +* 04/02/97 aliu Creation. +* 04/07/99 srl updated +* 05/13/99 stephen Changed to umutex (from cmutex). +* 11/22/99 aliu Make non-global mutex autoinitialize [j151] +****************************************************************************** +*/ + +#include "umutex.h" + +#include "unicode/utypes.h" +#include "uassert.h" +#include "cmemory.h" + + +// The ICU global mutex. Used when ICU implementation code passes NULL for the mutex pointer. +static UMutex globalMutex = U_MUTEX_INITIALIZER; + +/* + * ICU Mutex wrappers. Wrap operating system mutexes, giving the rest of ICU a + * platform independent set of mutex operations. For internal ICU use only. + */ + +#if defined(U_USER_MUTEX_CPP) +// Build time user mutex hook: #include "U_USER_MUTEX_CPP" +#include U_MUTEX_XSTR(U_USER_MUTEX_CPP) + +#elif U_PLATFORM_HAS_WIN32_API + +//------------------------------------------------------------------------------------------- +// +// Windows Specific Definitions +// +// Note: Cygwin (and possibly others) have both WIN32 and POSIX. +// Prefer Win32 in these cases. (Win32 comes ahead in the #if chain) +// +//------------------------------------------------------------------------------------------- + +#if defined U_NO_PLATFORM_ATOMICS +#error ICU on Win32 requires support for low level atomic operations. +// Visual Studio, gcc, clang are OK. Shouldn't get here. +#endif + + +// This function is called when a test of a UInitOnce::fState reveals that +// initialization has not completed, that we either need to call the +// function on this thread, or wait for some other thread to complete. +// +// The actual call to the init function is made inline by template code +// that knows the C++ types involved. This function returns TRUE if +// the caller needs to call the Init function. +// + +U_NAMESPACE_BEGIN + +U_COMMON_API UBool U_EXPORT2 umtx_initImplPreInit(UInitOnce &uio) { + for (;;) { + int32_t previousState = InterlockedCompareExchange( +#if (U_PLATFORM == U_PF_MINGW) || (U_PLATFORM == U_PF_CYGWIN) || defined(__clang__) + (LONG volatile *) // this is the type given in the API doc for this function. +#endif + &uio.fState, // Destination + 1, // Exchange Value + 0); // Compare value + + if (previousState == 0) { + return true; // Caller will next call the init function. + // Current state == 1. + } else if (previousState == 2) { + // Another thread already completed the initialization. + // We can simply return FALSE, indicating no + // further action is needed by the caller. + return FALSE; + } else { + // Another thread is currently running the initialization. + // Wait until it completes. + do { + Sleep(1); + previousState = umtx_loadAcquire(uio.fState); + } while (previousState == 1); + } + } +} + +// This function is called by the thread that ran an initialization function, +// just after completing the function. +// +// success: True: the inialization succeeded. No further calls to the init +// function will be made. +// False: the initializtion failed. The next call to umtx_initOnce() +// will retry the initialization. + +U_COMMON_API void U_EXPORT2 umtx_initImplPostInit(UInitOnce &uio) { + umtx_storeRelease(uio.fState, 2); +} + +U_NAMESPACE_END + +static void winMutexInit(CRITICAL_SECTION *cs) { + InitializeCriticalSection(cs); + return; +} + +U_CAPI void U_EXPORT2 +umtx_lock(UMutex *mutex) { + if (mutex == NULL) { + mutex = &globalMutex; + } + CRITICAL_SECTION *cs = &mutex->fCS; + umtx_initOnce(mutex->fInitOnce, winMutexInit, cs); + EnterCriticalSection(cs); +} + +U_CAPI void U_EXPORT2 +umtx_unlock(UMutex* mutex) +{ + if (mutex == NULL) { + mutex = &globalMutex; + } + LeaveCriticalSection(&mutex->fCS); +} + + +U_CAPI void U_EXPORT2 +umtx_condBroadcast(UConditionVar *condition) { + // We require that the associated mutex be held by the caller, + // so access to fWaitCount is protected and safe. No other thread can + // call condWait() while we are here. + if (condition->fWaitCount == 0) { + return; + } + ResetEvent(condition->fExitGate); + SetEvent(condition->fEntryGate); +} + +U_CAPI void U_EXPORT2 +umtx_condSignal(UConditionVar *condition) { + // Function not implemented. There is no immediate requirement from ICU to have it. + // Once ICU drops support for Windows XP and Server 2003, ICU Condition Variables will be + // changed to be thin wrappers on native Windows CONDITION_VARIABLEs, and this function + // becomes trivial to provide. + U_ASSERT(FALSE); +} + +U_CAPI void U_EXPORT2 +umtx_condWait(UConditionVar *condition, UMutex *mutex) { + if (condition->fEntryGate == NULL) { + // Note: because the associated mutex must be locked when calling + // wait, we know that there can not be multiple threads + // running here with the same condition variable. + // Meaning that lazy initialization is safe. + U_ASSERT(condition->fExitGate == NULL); + condition->fEntryGate = CreateEvent(NULL, // Security Attributes + TRUE, // Manual Reset + FALSE, // Initially reset + NULL); // Name. + U_ASSERT(condition->fEntryGate != NULL); + condition->fExitGate = CreateEvent(NULL, TRUE, TRUE, NULL); + U_ASSERT(condition->fExitGate != NULL); + } + + condition->fWaitCount++; + umtx_unlock(mutex); + WaitForSingleObject(condition->fEntryGate, INFINITE); + umtx_lock(mutex); + condition->fWaitCount--; + if (condition->fWaitCount == 0) { + // All threads that were waiting at the entry gate have woken up + // and moved through. Shut the entry gate and open the exit gate. + ResetEvent(condition->fEntryGate); + SetEvent(condition->fExitGate); + } else { + umtx_unlock(mutex); + WaitForSingleObject(condition->fExitGate, INFINITE); + umtx_lock(mutex); + } +} + + +#elif U_PLATFORM_IMPLEMENTS_POSIX + +//------------------------------------------------------------------------------------------- +// +// POSIX specific definitions +// +//------------------------------------------------------------------------------------------- + +# include <pthread.h> + +// Each UMutex consists of a pthread_mutex_t. +// All are statically initialized and ready for use. +// There is no runtime mutex initialization code needed. + +U_CAPI void U_EXPORT2 +umtx_lock(UMutex *mutex) { + if (mutex == NULL) { + mutex = &globalMutex; + } + int sysErr = pthread_mutex_lock(&mutex->fMutex); + (void)sysErr; // Suppress unused variable warnings. + U_ASSERT(sysErr == 0); +} + + +U_CAPI void U_EXPORT2 +umtx_unlock(UMutex* mutex) +{ + if (mutex == NULL) { + mutex = &globalMutex; + } + int sysErr = pthread_mutex_unlock(&mutex->fMutex); + (void)sysErr; // Suppress unused variable warnings. + U_ASSERT(sysErr == 0); +} + + +U_CAPI void U_EXPORT2 +umtx_condWait(UConditionVar *cond, UMutex *mutex) { + if (mutex == NULL) { + mutex = &globalMutex; + } + int sysErr = pthread_cond_wait(&cond->fCondition, &mutex->fMutex); + (void)sysErr; + U_ASSERT(sysErr == 0); +} + +U_CAPI void U_EXPORT2 +umtx_condBroadcast(UConditionVar *cond) { + int sysErr = pthread_cond_broadcast(&cond->fCondition); + (void)sysErr; + U_ASSERT(sysErr == 0); +} + +U_CAPI void U_EXPORT2 +umtx_condSignal(UConditionVar *cond) { + int sysErr = pthread_cond_signal(&cond->fCondition); + (void)sysErr; + U_ASSERT(sysErr == 0); +} + + + +U_NAMESPACE_BEGIN + +static pthread_mutex_t initMutex = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t initCondition = PTHREAD_COND_INITIALIZER; + + +// This function is called when a test of a UInitOnce::fState reveals that +// initialization has not completed, that we either need to call the +// function on this thread, or wait for some other thread to complete. +// +// The actual call to the init function is made inline by template code +// that knows the C++ types involved. This function returns TRUE if +// the caller needs to call the Init function. +// +U_COMMON_API UBool U_EXPORT2 +umtx_initImplPreInit(UInitOnce &uio) { + pthread_mutex_lock(&initMutex); + int32_t state = uio.fState; + if (state == 0) { + umtx_storeRelease(uio.fState, 1); + pthread_mutex_unlock(&initMutex); + return TRUE; // Caller will next call the init function. + } else { + while (uio.fState == 1) { + // Another thread is currently running the initialization. + // Wait until it completes. + pthread_cond_wait(&initCondition, &initMutex); + } + pthread_mutex_unlock(&initMutex); + U_ASSERT(uio.fState == 2); + return FALSE; + } +} + + + +// This function is called by the thread that ran an initialization function, +// just after completing the function. +// Some threads may be waiting on the condition, requiring the broadcast wakeup. +// Some threads may be racing to test the fState variable outside of the mutex, +// requiring the use of store/release when changing its value. + +U_COMMON_API void U_EXPORT2 +umtx_initImplPostInit(UInitOnce &uio) { + pthread_mutex_lock(&initMutex); + umtx_storeRelease(uio.fState, 2); + pthread_cond_broadcast(&initCondition); + pthread_mutex_unlock(&initMutex); +} + +U_NAMESPACE_END + +// End of POSIX specific umutex implementation. + +#else // Platform #define chain. + +#error Unknown Platform + +#endif // Platform #define chain. + + +//------------------------------------------------------------------------------- +// +// Atomic Operations, out-of-line versions. +// These are conditional, only defined if better versions +// were not available for the platform. +// +// These versions are platform neutral. +// +//-------------------------------------------------------------------------------- + +#if defined U_NO_PLATFORM_ATOMICS +static UMutex gIncDecMutex = U_MUTEX_INITIALIZER; + +U_NAMESPACE_BEGIN + +U_COMMON_API int32_t U_EXPORT2 +umtx_atomic_inc(u_atomic_int32_t *p) { + int32_t retVal; + umtx_lock(&gIncDecMutex); + retVal = ++(*p); + umtx_unlock(&gIncDecMutex); + return retVal; +} + + +U_COMMON_API int32_t U_EXPORT2 +umtx_atomic_dec(u_atomic_int32_t *p) { + int32_t retVal; + umtx_lock(&gIncDecMutex); + retVal = --(*p); + umtx_unlock(&gIncDecMutex); + return retVal; +} + +U_COMMON_API int32_t U_EXPORT2 +umtx_loadAcquire(u_atomic_int32_t &var) { + int32_t val = var; + umtx_lock(&gIncDecMutex); + umtx_unlock(&gIncDecMutex); + return val; +} + +U_COMMON_API void U_EXPORT2 +umtx_storeRelease(u_atomic_int32_t &var, int32_t val) { + umtx_lock(&gIncDecMutex); + umtx_unlock(&gIncDecMutex); + var = val; +} + +U_NAMESPACE_END +#endif + +//-------------------------------------------------------------------------- +// +// Deprecated functions for setting user mutexes. +// +//-------------------------------------------------------------------------- + +U_DEPRECATED void U_EXPORT2 +u_setMutexFunctions(const void * /*context */, UMtxInitFn *, UMtxFn *, + UMtxFn *, UMtxFn *, UErrorCode *status) { + if (U_SUCCESS(*status)) { + *status = U_UNSUPPORTED_ERROR; + } + return; +} + + + +U_DEPRECATED void U_EXPORT2 +u_setAtomicIncDecFunctions(const void * /*context */, UMtxAtomicFn *, UMtxAtomicFn *, + UErrorCode *status) { + if (U_SUCCESS(*status)) { + *status = U_UNSUPPORTED_ERROR; + } + return; +} diff --git a/src/core/basetypes/icu-ucsdet/uobject.cpp b/src/core/basetypes/icu-ucsdet/uobject.cpp new file mode 100644 index 00000000..900e0345 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/uobject.cpp @@ -0,0 +1,103 @@ +/* +****************************************************************************** +* +* Copyright (C) 2002-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* file name: uobject.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2002jun26 +* created by: Markus W. Scherer +*/ + +#include "unicode/uobject.h" +#include "cmemory.h" + +U_NAMESPACE_BEGIN + +#if U_OVERRIDE_CXX_ALLOCATION + +/* + * Default implementation of UMemory::new/delete + * using uprv_malloc() and uprv_free(). + * + * For testing, this is used together with a list of imported symbols to verify + * that ICU is not using the global ::new and ::delete operators. + * + * These operators can be implemented like this or any other appropriate way + * when customizing ICU for certain environments. + * Whenever ICU is customized in binary incompatible ways please be sure + * to use library name suffixes to distinguish such libraries from + * the standard build. + * + * Instead of just modifying these C++ new/delete operators, it is usually best + * to modify the uprv_malloc()/uprv_free()/uprv_realloc() functions in cmemory.c. + * + * Memory test on Windows/MSVC 6: + * The global operators new and delete look as follows: + * 04F 00000000 UNDEF notype () External | ??2@YAPAXI@Z (void * __cdecl operator new(unsigned int)) + * 03F 00000000 UNDEF notype () External | ??3@YAXPAX@Z (void __cdecl operator delete(void *)) + * + * These lines are from output generated by the MSVC 6 tool dumpbin with + * dumpbin /symbols *.obj + * + * ??2@YAPAXI@Z and ??3@YAXPAX@Z are the linker symbols in the .obj + * files and are imported from msvcrtd.dll (in a debug build). + * + * Make sure that with the UMemory operators new and delete defined these two symbols + * do not appear in the dumpbin /symbols output for the ICU libraries! + * + * If such a symbol appears in the output then look in the preceding lines in the output + * for which file and function calls the global new or delete operator, + * and replace with uprv_malloc/uprv_free. + */ + +void * U_EXPORT2 UMemory::operator new(size_t size) U_NO_THROW { + return uprv_malloc(size); +} + +void U_EXPORT2 UMemory::operator delete(void *p) U_NO_THROW { + if(p!=NULL) { + uprv_free(p); + } +} + +void * U_EXPORT2 UMemory::operator new[](size_t size) U_NO_THROW { + return uprv_malloc(size); +} + +void U_EXPORT2 UMemory::operator delete[](void *p) U_NO_THROW { + if(p!=NULL) { + uprv_free(p); + } +} + +#if U_HAVE_DEBUG_LOCATION_NEW +void * U_EXPORT2 UMemory::operator new(size_t size, const char* /*file*/, int /*line*/) U_NO_THROW { + return UMemory::operator new(size); +} + +void U_EXPORT2 UMemory::operator delete(void* p, const char* /*file*/, int /*line*/) U_NO_THROW { + UMemory::operator delete(p); +} +#endif /* U_HAVE_DEBUG_LOCATION_NEW */ + + +#endif + +UObject::~UObject() {} + +UClassID UObject::getDynamicClassID() const { return NULL; } + +U_NAMESPACE_END + +U_NAMESPACE_USE + +U_CAPI void U_EXPORT2 +uprv_deleteUObject(void *obj) { + delete static_cast<UObject *>(obj); +} diff --git a/src/core/basetypes/icu-ucsdet/ustring.cpp b/src/core/basetypes/icu-ucsdet/ustring.cpp new file mode 100644 index 00000000..40d23c06 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/ustring.cpp @@ -0,0 +1,1516 @@ +/* +****************************************************************************** +* +* Copyright (C) 1998-2014, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* +* File ustring.cpp +* +* Modification History: +* +* Date Name Description +* 12/07/98 bertrand Creation. +****************************************************************************** +*/ + +#include "unicode/utypes.h" +#include "unicode/putil.h" +#include "unicode/ustring.h" +#include "unicode/utf16.h" +#include "cstring.h" +#include "cwchar.h" +#include "cmemory.h" +#include "ustr_imp.h" + +/* ANSI string.h - style functions ------------------------------------------ */ + +/* U+ffff is the highest BMP code point, the highest one that fits into a 16-bit UChar */ +#define U_BMP_MAX 0xffff + +/* Forward binary string search functions ----------------------------------- */ + +/* + * Test if a substring match inside a string is at code point boundaries. + * All pointers refer to the same buffer. + * The limit pointer may be NULL, all others must be real pointers. + */ +static inline UBool +isMatchAtCPBoundary(const UChar *start, const UChar *match, const UChar *matchLimit, const UChar *limit) { + if(U16_IS_TRAIL(*match) && start!=match && U16_IS_LEAD(*(match-1))) { + /* the leading edge of the match is in the middle of a surrogate pair */ + return FALSE; + } + if(U16_IS_LEAD(*(matchLimit-1)) && match!=limit && U16_IS_TRAIL(*matchLimit)) { + /* the trailing edge of the match is in the middle of a surrogate pair */ + return FALSE; + } + return TRUE; +} + +U_CAPI UChar * U_EXPORT2 +u_strFindFirst(const UChar *s, int32_t length, + const UChar *sub, int32_t subLength) { + const UChar *start, *p, *q, *subLimit; + UChar c, cs, cq; + + if(sub==NULL || subLength<-1) { + return (UChar *)s; + } + if(s==NULL || length<-1) { + return NULL; + } + + start=s; + + if(length<0 && subLength<0) { + /* both strings are NUL-terminated */ + if((cs=*sub++)==0) { + return (UChar *)s; + } + if(*sub==0 && !U16_IS_SURROGATE(cs)) { + /* the substring consists of a single, non-surrogate BMP code point */ + return u_strchr(s, cs); + } + + while((c=*s++)!=0) { + if(c==cs) { + /* found first substring UChar, compare rest */ + p=s; + q=sub; + for(;;) { + if((cq=*q)==0) { + if(isMatchAtCPBoundary(start, s-1, p, NULL)) { + return (UChar *)(s-1); /* well-formed match */ + } else { + break; /* no match because surrogate pair is split */ + } + } + if((c=*p)==0) { + return NULL; /* no match, and none possible after s */ + } + if(c!=cq) { + break; /* no match */ + } + ++p; + ++q; + } + } + } + + /* not found */ + return NULL; + } + + if(subLength<0) { + subLength=u_strlen(sub); + } + if(subLength==0) { + return (UChar *)s; + } + + /* get sub[0] to search for it fast */ + cs=*sub++; + --subLength; + subLimit=sub+subLength; + + if(subLength==0 && !U16_IS_SURROGATE(cs)) { + /* the substring consists of a single, non-surrogate BMP code point */ + return length<0 ? u_strchr(s, cs) : u_memchr(s, cs, length); + } + + if(length<0) { + /* s is NUL-terminated */ + while((c=*s++)!=0) { + if(c==cs) { + /* found first substring UChar, compare rest */ + p=s; + q=sub; + for(;;) { + if(q==subLimit) { + if(isMatchAtCPBoundary(start, s-1, p, NULL)) { + return (UChar *)(s-1); /* well-formed match */ + } else { + break; /* no match because surrogate pair is split */ + } + } + if((c=*p)==0) { + return NULL; /* no match, and none possible after s */ + } + if(c!=*q) { + break; /* no match */ + } + ++p; + ++q; + } + } + } + } else { + const UChar *limit, *preLimit; + + /* subLength was decremented above */ + if(length<=subLength) { + return NULL; /* s is shorter than sub */ + } + + limit=s+length; + + /* the substring must start before preLimit */ + preLimit=limit-subLength; + + while(s!=preLimit) { + c=*s++; + if(c==cs) { + /* found first substring UChar, compare rest */ + p=s; + q=sub; + for(;;) { + if(q==subLimit) { + if(isMatchAtCPBoundary(start, s-1, p, limit)) { + return (UChar *)(s-1); /* well-formed match */ + } else { + break; /* no match because surrogate pair is split */ + } + } + if(*p!=*q) { + break; /* no match */ + } + ++p; + ++q; + } + } + } + } + + /* not found */ + return NULL; +} + +U_CAPI UChar * U_EXPORT2 +u_strstr(const UChar *s, const UChar *substring) { + return u_strFindFirst(s, -1, substring, -1); +} + +U_CAPI UChar * U_EXPORT2 +u_strchr(const UChar *s, UChar c) { + if(U16_IS_SURROGATE(c)) { + /* make sure to not find half of a surrogate pair */ + return u_strFindFirst(s, -1, &c, 1); + } else { + UChar cs; + + /* trivial search for a BMP code point */ + for(;;) { + if((cs=*s)==c) { + return (UChar *)s; + } + if(cs==0) { + return NULL; + } + ++s; + } + } +} + +U_CAPI UChar * U_EXPORT2 +u_strchr32(const UChar *s, UChar32 c) { + if((uint32_t)c<=U_BMP_MAX) { + /* find BMP code point */ + return u_strchr(s, (UChar)c); + } else if((uint32_t)c<=UCHAR_MAX_VALUE) { + /* find supplementary code point as surrogate pair */ + UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c); + + while((cs=*s++)!=0) { + if(cs==lead && *s==trail) { + return (UChar *)(s-1); + } + } + return NULL; + } else { + /* not a Unicode code point, not findable */ + return NULL; + } +} + +U_CAPI UChar * U_EXPORT2 +u_memchr(const UChar *s, UChar c, int32_t count) { + if(count<=0) { + return NULL; /* no string */ + } else if(U16_IS_SURROGATE(c)) { + /* make sure to not find half of a surrogate pair */ + return u_strFindFirst(s, count, &c, 1); + } else { + /* trivial search for a BMP code point */ + const UChar *limit=s+count; + do { + if(*s==c) { + return (UChar *)s; + } + } while(++s!=limit); + return NULL; + } +} + +U_CAPI UChar * U_EXPORT2 +u_memchr32(const UChar *s, UChar32 c, int32_t count) { + if((uint32_t)c<=U_BMP_MAX) { + /* find BMP code point */ + return u_memchr(s, (UChar)c, count); + } else if(count<2) { + /* too short for a surrogate pair */ + return NULL; + } else if((uint32_t)c<=UCHAR_MAX_VALUE) { + /* find supplementary code point as surrogate pair */ + const UChar *limit=s+count-1; /* -1 so that we do not need a separate check for the trail unit */ + UChar lead=U16_LEAD(c), trail=U16_TRAIL(c); + + do { + if(*s==lead && *(s+1)==trail) { + return (UChar *)s; + } + } while(++s!=limit); + return NULL; + } else { + /* not a Unicode code point, not findable */ + return NULL; + } +} + +/* Backward binary string search functions ---------------------------------- */ + +U_CAPI UChar * U_EXPORT2 +u_strFindLast(const UChar *s, int32_t length, + const UChar *sub, int32_t subLength) { + const UChar *start, *limit, *p, *q, *subLimit; + UChar c, cs; + + if(sub==NULL || subLength<-1) { + return (UChar *)s; + } + if(s==NULL || length<-1) { + return NULL; + } + + /* + * This implementation is more lazy than the one for u_strFindFirst(): + * There is no special search code for NUL-terminated strings. + * It does not seem to be worth it for searching substrings to + * search forward and find all matches like in u_strrchr() and similar. + * Therefore, we simply get both string lengths and search backward. + * + * markus 2002oct23 + */ + + if(subLength<0) { + subLength=u_strlen(sub); + } + if(subLength==0) { + return (UChar *)s; + } + + /* get sub[subLength-1] to search for it fast */ + subLimit=sub+subLength; + cs=*(--subLimit); + --subLength; + + if(subLength==0 && !U16_IS_SURROGATE(cs)) { + /* the substring consists of a single, non-surrogate BMP code point */ + return length<0 ? u_strrchr(s, cs) : u_memrchr(s, cs, length); + } + + if(length<0) { + length=u_strlen(s); + } + + /* subLength was decremented above */ + if(length<=subLength) { + return NULL; /* s is shorter than sub */ + } + + start=s; + limit=s+length; + + /* the substring must start no later than s+subLength */ + s+=subLength; + + while(s!=limit) { + c=*(--limit); + if(c==cs) { + /* found last substring UChar, compare rest */ + p=limit; + q=subLimit; + for(;;) { + if(q==sub) { + if(isMatchAtCPBoundary(start, p, limit+1, start+length)) { + return (UChar *)p; /* well-formed match */ + } else { + break; /* no match because surrogate pair is split */ + } + } + if(*(--p)!=*(--q)) { + break; /* no match */ + } + } + } + } + + /* not found */ + return NULL; +} + +U_CAPI UChar * U_EXPORT2 +u_strrstr(const UChar *s, const UChar *substring) { + return u_strFindLast(s, -1, substring, -1); +} + +U_CAPI UChar * U_EXPORT2 +u_strrchr(const UChar *s, UChar c) { + if(U16_IS_SURROGATE(c)) { + /* make sure to not find half of a surrogate pair */ + return u_strFindLast(s, -1, &c, 1); + } else { + const UChar *result=NULL; + UChar cs; + + /* trivial search for a BMP code point */ + for(;;) { + if((cs=*s)==c) { + result=s; + } + if(cs==0) { + return (UChar *)result; + } + ++s; + } + } +} + +U_CAPI UChar * U_EXPORT2 +u_strrchr32(const UChar *s, UChar32 c) { + if((uint32_t)c<=U_BMP_MAX) { + /* find BMP code point */ + return u_strrchr(s, (UChar)c); + } else if((uint32_t)c<=UCHAR_MAX_VALUE) { + /* find supplementary code point as surrogate pair */ + const UChar *result=NULL; + UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c); + + while((cs=*s++)!=0) { + if(cs==lead && *s==trail) { + result=s-1; + } + } + return (UChar *)result; + } else { + /* not a Unicode code point, not findable */ + return NULL; + } +} + +U_CAPI UChar * U_EXPORT2 +u_memrchr(const UChar *s, UChar c, int32_t count) { + if(count<=0) { + return NULL; /* no string */ + } else if(U16_IS_SURROGATE(c)) { + /* make sure to not find half of a surrogate pair */ + return u_strFindLast(s, count, &c, 1); + } else { + /* trivial search for a BMP code point */ + const UChar *limit=s+count; + do { + if(*(--limit)==c) { + return (UChar *)limit; + } + } while(s!=limit); + return NULL; + } +} + +U_CAPI UChar * U_EXPORT2 +u_memrchr32(const UChar *s, UChar32 c, int32_t count) { + if((uint32_t)c<=U_BMP_MAX) { + /* find BMP code point */ + return u_memrchr(s, (UChar)c, count); + } else if(count<2) { + /* too short for a surrogate pair */ + return NULL; + } else if((uint32_t)c<=UCHAR_MAX_VALUE) { + /* find supplementary code point as surrogate pair */ + const UChar *limit=s+count-1; + UChar lead=U16_LEAD(c), trail=U16_TRAIL(c); + + do { + if(*limit==trail && *(limit-1)==lead) { + return (UChar *)(limit-1); + } + } while(s!=--limit); + return NULL; + } else { + /* not a Unicode code point, not findable */ + return NULL; + } +} + +/* Tokenization functions --------------------------------------------------- */ + +/* + * Match each code point in a string against each code point in the matchSet. + * Return the index of the first string code point that + * is (polarity==TRUE) or is not (FALSE) contained in the matchSet. + * Return -(string length)-1 if there is no such code point. + */ +static int32_t +_matchFromSet(const UChar *string, const UChar *matchSet, UBool polarity) { + int32_t matchLen, matchBMPLen, strItr, matchItr; + UChar32 stringCh, matchCh; + UChar c, c2; + + /* first part of matchSet contains only BMP code points */ + matchBMPLen = 0; + while((c = matchSet[matchBMPLen]) != 0 && U16_IS_SINGLE(c)) { + ++matchBMPLen; + } + + /* second part of matchSet contains BMP and supplementary code points */ + matchLen = matchBMPLen; + while(matchSet[matchLen] != 0) { + ++matchLen; + } + + for(strItr = 0; (c = string[strItr]) != 0;) { + ++strItr; + if(U16_IS_SINGLE(c)) { + if(polarity) { + for(matchItr = 0; matchItr < matchLen; ++matchItr) { + if(c == matchSet[matchItr]) { + return strItr - 1; /* one matches */ + } + } + } else { + for(matchItr = 0; matchItr < matchLen; ++matchItr) { + if(c == matchSet[matchItr]) { + goto endloop; + } + } + return strItr - 1; /* none matches */ + } + } else { + /* + * No need to check for string length before U16_IS_TRAIL + * because c2 could at worst be the terminating NUL. + */ + if(U16_IS_SURROGATE_LEAD(c) && U16_IS_TRAIL(c2 = string[strItr])) { + ++strItr; + stringCh = U16_GET_SUPPLEMENTARY(c, c2); + } else { + stringCh = c; /* unpaired trail surrogate */ + } + + if(polarity) { + for(matchItr = matchBMPLen; matchItr < matchLen;) { + U16_NEXT(matchSet, matchItr, matchLen, matchCh); + if(stringCh == matchCh) { + return strItr - U16_LENGTH(stringCh); /* one matches */ + } + } + } else { + for(matchItr = matchBMPLen; matchItr < matchLen;) { + U16_NEXT(matchSet, matchItr, matchLen, matchCh); + if(stringCh == matchCh) { + goto endloop; + } + } + return strItr - U16_LENGTH(stringCh); /* none matches */ + } + } +endloop: + /* wish C had continue with labels like Java... */; + } + + /* Didn't find it. */ + return -strItr-1; +} + +/* Search for a codepoint in a string that matches one of the matchSet codepoints. */ +U_CAPI UChar * U_EXPORT2 +u_strpbrk(const UChar *string, const UChar *matchSet) +{ + int32_t idx = _matchFromSet(string, matchSet, TRUE); + if(idx >= 0) { + return (UChar *)string + idx; + } else { + return NULL; + } +} + +/* Search for a codepoint in a string that matches one of the matchSet codepoints. */ +U_CAPI int32_t U_EXPORT2 +u_strcspn(const UChar *string, const UChar *matchSet) +{ + int32_t idx = _matchFromSet(string, matchSet, TRUE); + if(idx >= 0) { + return idx; + } else { + return -idx - 1; /* == u_strlen(string) */ + } +} + +/* Search for a codepoint in a string that does not match one of the matchSet codepoints. */ +U_CAPI int32_t U_EXPORT2 +u_strspn(const UChar *string, const UChar *matchSet) +{ + int32_t idx = _matchFromSet(string, matchSet, FALSE); + if(idx >= 0) { + return idx; + } else { + return -idx - 1; /* == u_strlen(string) */ + } +} + +/* ----- Text manipulation functions --- */ + +U_CAPI UChar* U_EXPORT2 +u_strtok_r(UChar *src, + const UChar *delim, + UChar **saveState) +{ + UChar *tokSource; + UChar *nextToken; + uint32_t nonDelimIdx; + + /* If saveState is NULL, the user messed up. */ + if (src != NULL) { + tokSource = src; + *saveState = src; /* Set to "src" in case there are no delimiters */ + } + else if (*saveState) { + tokSource = *saveState; + } + else { + /* src == NULL && *saveState == NULL */ + /* This shouldn't happen. We already finished tokenizing. */ + return NULL; + } + + /* Skip initial delimiters */ + nonDelimIdx = u_strspn(tokSource, delim); + tokSource = &tokSource[nonDelimIdx]; + + if (*tokSource) { + nextToken = u_strpbrk(tokSource, delim); + if (nextToken != NULL) { + /* Create a token */ + *(nextToken++) = 0; + *saveState = nextToken; + return tokSource; + } + else if (*saveState) { + /* Return the last token */ + *saveState = NULL; + return tokSource; + } + } + else { + /* No tokens were found. Only delimiters were left. */ + *saveState = NULL; + } + return NULL; +} + +/* Miscellaneous functions -------------------------------------------------- */ + +U_CAPI UChar* U_EXPORT2 +u_strcat(UChar *dst, + const UChar *src) +{ + UChar *anchor = dst; /* save a pointer to start of dst */ + + while(*dst != 0) { /* To end of first string */ + ++dst; + } + while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */ + } + + return anchor; +} + +U_CAPI UChar* U_EXPORT2 +u_strncat(UChar *dst, + const UChar *src, + int32_t n ) +{ + if(n > 0) { + UChar *anchor = dst; /* save a pointer to start of dst */ + + while(*dst != 0) { /* To end of first string */ + ++dst; + } + while((*dst = *src) != 0) { /* copy string 2 over */ + ++dst; + if(--n == 0) { + *dst = 0; + break; + } + ++src; + } + + return anchor; + } else { + return dst; + } +} + +/* ----- Text property functions --- */ + +U_CAPI int32_t U_EXPORT2 +u_strcmp(const UChar *s1, + const UChar *s2) +{ + UChar c1, c2; + + for(;;) { + c1=*s1++; + c2=*s2++; + if (c1 != c2 || c1 == 0) { + break; + } + } + return (int32_t)c1 - (int32_t)c2; +} + +U_CFUNC int32_t U_EXPORT2 +uprv_strCompare(const UChar *s1, int32_t length1, + const UChar *s2, int32_t length2, + UBool strncmpStyle, UBool codePointOrder) { + const UChar *start1, *start2, *limit1, *limit2; + UChar c1, c2; + + /* setup for fix-up */ + start1=s1; + start2=s2; + + /* compare identical prefixes - they do not need to be fixed up */ + if(length1<0 && length2<0) { + /* strcmp style, both NUL-terminated */ + if(s1==s2) { + return 0; + } + + for(;;) { + c1=*s1; + c2=*s2; + if(c1!=c2) { + break; + } + if(c1==0) { + return 0; + } + ++s1; + ++s2; + } + + /* setup for fix-up */ + limit1=limit2=NULL; + } else if(strncmpStyle) { + /* special handling for strncmp, assume length1==length2>=0 but also check for NUL */ + if(s1==s2) { + return 0; + } + + limit1=start1+length1; + + for(;;) { + /* both lengths are same, check only one limit */ + if(s1==limit1) { + return 0; + } + + c1=*s1; + c2=*s2; + if(c1!=c2) { + break; + } + if(c1==0) { + return 0; + } + ++s1; + ++s2; + } + + /* setup for fix-up */ + limit2=start2+length1; /* use length1 here, too, to enforce assumption */ + } else { + /* memcmp/UnicodeString style, both length-specified */ + int32_t lengthResult; + + if(length1<0) { + length1=u_strlen(s1); + } + if(length2<0) { + length2=u_strlen(s2); + } + + /* limit1=start1+min(lenght1, length2) */ + if(length1<length2) { + lengthResult=-1; + limit1=start1+length1; + } else if(length1==length2) { + lengthResult=0; + limit1=start1+length1; + } else /* length1>length2 */ { + lengthResult=1; + limit1=start1+length2; + } + + if(s1==s2) { + return lengthResult; + } + + for(;;) { + /* check pseudo-limit */ + if(s1==limit1) { + return lengthResult; + } + + c1=*s1; + c2=*s2; + if(c1!=c2) { + break; + } + ++s1; + ++s2; + } + + /* setup for fix-up */ + limit1=start1+length1; + limit2=start2+length2; + } + + /* if both values are in or above the surrogate range, fix them up */ + if(c1>=0xd800 && c2>=0xd800 && codePointOrder) { + /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ + if( + (c1<=0xdbff && (s1+1)!=limit1 && U16_IS_TRAIL(*(s1+1))) || + (U16_IS_TRAIL(c1) && start1!=s1 && U16_IS_LEAD(*(s1-1))) + ) { + /* part of a surrogate pair, leave >=d800 */ + } else { + /* BMP code point - may be surrogate code point - make <d800 */ + c1-=0x2800; + } + + if( + (c2<=0xdbff && (s2+1)!=limit2 && U16_IS_TRAIL(*(s2+1))) || + (U16_IS_TRAIL(c2) && start2!=s2 && U16_IS_LEAD(*(s2-1))) + ) { + /* part of a surrogate pair, leave >=d800 */ + } else { + /* BMP code point - may be surrogate code point - make <d800 */ + c2-=0x2800; + } + } + + /* now c1 and c2 are in the requested (code unit or code point) order */ + return (int32_t)c1-(int32_t)c2; +} + +/* + * Compare two strings as presented by UCharIterators. + * Use code unit or code point order. + * When the function returns, it is undefined where the iterators + * have stopped. + */ +U_CAPI int32_t U_EXPORT2 +u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder) { + UChar32 c1, c2; + + /* argument checking */ + if(iter1==NULL || iter2==NULL) { + return 0; /* bad arguments */ + } + if(iter1==iter2) { + return 0; /* identical iterators */ + } + + /* reset iterators to start? */ + iter1->move(iter1, 0, UITER_START); + iter2->move(iter2, 0, UITER_START); + + /* compare identical prefixes - they do not need to be fixed up */ + for(;;) { + c1=iter1->next(iter1); + c2=iter2->next(iter2); + if(c1!=c2) { + break; + } + if(c1==-1) { + return 0; + } + } + + /* if both values are in or above the surrogate range, fix them up */ + if(c1>=0xd800 && c2>=0xd800 && codePointOrder) { + /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ + if( + (c1<=0xdbff && U16_IS_TRAIL(iter1->current(iter1))) || + (U16_IS_TRAIL(c1) && (iter1->previous(iter1), U16_IS_LEAD(iter1->previous(iter1)))) + ) { + /* part of a surrogate pair, leave >=d800 */ + } else { + /* BMP code point - may be surrogate code point - make <d800 */ + c1-=0x2800; + } + + if( + (c2<=0xdbff && U16_IS_TRAIL(iter2->current(iter2))) || + (U16_IS_TRAIL(c2) && (iter2->previous(iter2), U16_IS_LEAD(iter2->previous(iter2)))) + ) { + /* part of a surrogate pair, leave >=d800 */ + } else { + /* BMP code point - may be surrogate code point - make <d800 */ + c2-=0x2800; + } + } + + /* now c1 and c2 are in the requested (code unit or code point) order */ + return (int32_t)c1-(int32_t)c2; +} + +#if 0 +/* + * u_strCompareIter() does not leave the iterators _on_ the different units. + * This is possible but would cost a few extra indirect function calls to back + * up if the last unit (c1 or c2 respectively) was >=0. + * + * Consistently leaving them _behind_ the different units is not an option + * because the current "unit" is the end of the string if that is reached, + * and in such a case the iterator does not move. + * For example, when comparing "ab" with "abc", both iterators rest _on_ the end + * of their strings. Calling previous() on each does not move them to where + * the comparison fails. + * + * So the simplest semantics is to not define where the iterators end up. + * + * The following fragment is part of what would need to be done for backing up. + */ +void fragment { + /* iff a surrogate is part of a surrogate pair, leave >=d800 */ + if(c1<=0xdbff) { + if(!U16_IS_TRAIL(iter1->current(iter1))) { + /* lead surrogate code point - make <d800 */ + c1-=0x2800; + } + } else if(c1<=0xdfff) { + int32_t idx=iter1->getIndex(iter1, UITER_CURRENT); + iter1->previous(iter1); /* ==c1 */ + if(!U16_IS_LEAD(iter1->previous(iter1))) { + /* trail surrogate code point - make <d800 */ + c1-=0x2800; + } + /* go back to behind where the difference is */ + iter1->move(iter1, idx, UITER_ZERO); + } else /* 0xe000<=c1<=0xffff */ { + /* BMP code point - make <d800 */ + c1-=0x2800; + } +} +#endif + +U_CAPI int32_t U_EXPORT2 +u_strCompare(const UChar *s1, int32_t length1, + const UChar *s2, int32_t length2, + UBool codePointOrder) { + /* argument checking */ + if(s1==NULL || length1<-1 || s2==NULL || length2<-1) { + return 0; + } + return uprv_strCompare(s1, length1, s2, length2, FALSE, codePointOrder); +} + +/* String compare in code point order - u_strcmp() compares in code unit order. */ +U_CAPI int32_t U_EXPORT2 +u_strcmpCodePointOrder(const UChar *s1, const UChar *s2) { + return uprv_strCompare(s1, -1, s2, -1, FALSE, TRUE); +} + +U_CAPI int32_t U_EXPORT2 +u_strncmp(const UChar *s1, + const UChar *s2, + int32_t n) +{ + if(n > 0) { + int32_t rc; + for(;;) { + rc = (int32_t)*s1 - (int32_t)*s2; + if(rc != 0 || *s1 == 0 || --n == 0) { + return rc; + } + ++s1; + ++s2; + } + } else { + return 0; + } +} + +U_CAPI int32_t U_EXPORT2 +u_strncmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t n) { + return uprv_strCompare(s1, n, s2, n, TRUE, TRUE); +} + +U_CAPI UChar* U_EXPORT2 +u_strcpy(UChar *dst, + const UChar *src) +{ + UChar *anchor = dst; /* save a pointer to start of dst */ + + while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */ + } + + return anchor; +} + +U_CAPI UChar* U_EXPORT2 +u_strncpy(UChar *dst, + const UChar *src, + int32_t n) +{ + UChar *anchor = dst; /* save a pointer to start of dst */ + + /* copy string 2 over */ + while(n > 0 && (*(dst++) = *(src++)) != 0) { + --n; + } + + return anchor; +} + +U_CAPI int32_t U_EXPORT2 +u_strlen(const UChar *s) +{ +#if U_SIZEOF_WCHAR_T == U_SIZEOF_UCHAR + return (int32_t)uprv_wcslen(s); +#else + const UChar *t = s; + while(*t != 0) { + ++t; + } + return t - s; +#endif +} + +U_CAPI int32_t U_EXPORT2 +u_countChar32(const UChar *s, int32_t length) { + int32_t count; + + if(s==NULL || length<-1) { + return 0; + } + + count=0; + if(length>=0) { + while(length>0) { + ++count; + if(U16_IS_LEAD(*s) && length>=2 && U16_IS_TRAIL(*(s+1))) { + s+=2; + length-=2; + } else { + ++s; + --length; + } + } + } else /* length==-1 */ { + UChar c; + + for(;;) { + if((c=*s++)==0) { + break; + } + ++count; + + /* + * sufficient to look ahead one because of UTF-16; + * safe to look ahead one because at worst that would be the terminating NUL + */ + if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) { + ++s; + } + } + } + return count; +} + +U_CAPI UBool U_EXPORT2 +u_strHasMoreChar32Than(const UChar *s, int32_t length, int32_t number) { + + if(number<0) { + return TRUE; + } + if(s==NULL || length<-1) { + return FALSE; + } + + if(length==-1) { + /* s is NUL-terminated */ + UChar c; + + /* count code points until they exceed */ + for(;;) { + if((c=*s++)==0) { + return FALSE; + } + if(number==0) { + return TRUE; + } + if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) { + ++s; + } + --number; + } + } else { + /* length>=0 known */ + const UChar *limit; + int32_t maxSupplementary; + + /* s contains at least (length+1)/2 code points: <=2 UChars per cp */ + if(((length+1)/2)>number) { + return TRUE; + } + + /* check if s does not even contain enough UChars */ + maxSupplementary=length-number; + if(maxSupplementary<=0) { + return FALSE; + } + /* there are maxSupplementary=length-number more UChars than asked-for code points */ + + /* + * count code points until they exceed and also check that there are + * no more than maxSupplementary supplementary code points (UChar pairs) + */ + limit=s+length; + for(;;) { + if(s==limit) { + return FALSE; + } + if(number==0) { + return TRUE; + } + if(U16_IS_LEAD(*s++) && s!=limit && U16_IS_TRAIL(*s)) { + ++s; + if(--maxSupplementary<=0) { + /* too many pairs - too few code points */ + return FALSE; + } + } + --number; + } + } +} + +U_CAPI UChar * U_EXPORT2 +u_memcpy(UChar *dest, const UChar *src, int32_t count) { + if(count > 0) { + uprv_memcpy(dest, src, count*U_SIZEOF_UCHAR); + } + return dest; +} + +U_CAPI UChar * U_EXPORT2 +u_memmove(UChar *dest, const UChar *src, int32_t count) { + if(count > 0) { + uprv_memmove(dest, src, count*U_SIZEOF_UCHAR); + } + return dest; +} + +U_CAPI UChar * U_EXPORT2 +u_memset(UChar *dest, UChar c, int32_t count) { + if(count > 0) { + UChar *ptr = dest; + UChar *limit = dest + count; + + while (ptr < limit) { + *(ptr++) = c; + } + } + return dest; +} + +U_CAPI int32_t U_EXPORT2 +u_memcmp(const UChar *buf1, const UChar *buf2, int32_t count) { + if(count > 0) { + const UChar *limit = buf1 + count; + int32_t result; + + while (buf1 < limit) { + result = (int32_t)(uint16_t)*buf1 - (int32_t)(uint16_t)*buf2; + if (result != 0) { + return result; + } + buf1++; + buf2++; + } + } + return 0; +} + +U_CAPI int32_t U_EXPORT2 +u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count) { + return uprv_strCompare(s1, count, s2, count, FALSE, TRUE); +} + +/* u_unescape & support fns ------------------------------------------------- */ + +/* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */ +static const UChar UNESCAPE_MAP[] = { + /*" 0x22, 0x22 */ + /*' 0x27, 0x27 */ + /*? 0x3F, 0x3F */ + /*\ 0x5C, 0x5C */ + /*a*/ 0x61, 0x07, + /*b*/ 0x62, 0x08, + /*e*/ 0x65, 0x1b, + /*f*/ 0x66, 0x0c, + /*n*/ 0x6E, 0x0a, + /*r*/ 0x72, 0x0d, + /*t*/ 0x74, 0x09, + /*v*/ 0x76, 0x0b +}; +enum { UNESCAPE_MAP_LENGTH = sizeof(UNESCAPE_MAP) / sizeof(UNESCAPE_MAP[0]) }; + +/* Convert one octal digit to a numeric value 0..7, or -1 on failure */ +static int8_t _digit8(UChar c) { + if (c >= 0x0030 && c <= 0x0037) { + return (int8_t)(c - 0x0030); + } + return -1; +} + +/* Convert one hex digit to a numeric value 0..F, or -1 on failure */ +static int8_t _digit16(UChar c) { + if (c >= 0x0030 && c <= 0x0039) { + return (int8_t)(c - 0x0030); + } + if (c >= 0x0041 && c <= 0x0046) { + return (int8_t)(c - (0x0041 - 10)); + } + if (c >= 0x0061 && c <= 0x0066) { + return (int8_t)(c - (0x0061 - 10)); + } + return -1; +} + +/* Parse a single escape sequence. Although this method deals in + * UChars, it does not use C++ or UnicodeString. This allows it to + * be used from C contexts. */ +U_CAPI UChar32 U_EXPORT2 +u_unescapeAt(UNESCAPE_CHAR_AT charAt, + int32_t *offset, + int32_t length, + void *context) { + + int32_t start = *offset; + UChar c; + UChar32 result = 0; + int8_t n = 0; + int8_t minDig = 0; + int8_t maxDig = 0; + int8_t bitsPerDigit = 4; + int8_t dig; + int32_t i; + UBool braces = FALSE; + + /* Check that offset is in range */ + if (*offset < 0 || *offset >= length) { + goto err; + } + + /* Fetch first UChar after '\\' */ + c = charAt((*offset)++, context); + + /* Convert hexadecimal and octal escapes */ + switch (c) { + case 0x0075 /*'u'*/: + minDig = maxDig = 4; + break; + case 0x0055 /*'U'*/: + minDig = maxDig = 8; + break; + case 0x0078 /*'x'*/: + minDig = 1; + if (*offset < length && charAt(*offset, context) == 0x7B /*{*/) { + ++(*offset); + braces = TRUE; + maxDig = 8; + } else { + maxDig = 2; + } + break; + default: + dig = _digit8(c); + if (dig >= 0) { + minDig = 1; + maxDig = 3; + n = 1; /* Already have first octal digit */ + bitsPerDigit = 3; + result = dig; + } + break; + } + if (minDig != 0) { + while (*offset < length && n < maxDig) { + c = charAt(*offset, context); + dig = (int8_t)((bitsPerDigit == 3) ? _digit8(c) : _digit16(c)); + if (dig < 0) { + break; + } + result = (result << bitsPerDigit) | dig; + ++(*offset); + ++n; + } + if (n < minDig) { + goto err; + } + if (braces) { + if (c != 0x7D /*}*/) { + goto err; + } + ++(*offset); + } + if (result < 0 || result >= 0x110000) { + goto err; + } + /* If an escape sequence specifies a lead surrogate, see if + * there is a trail surrogate after it, either as an escape or + * as a literal. If so, join them up into a supplementary. + */ + if (*offset < length && U16_IS_LEAD(result)) { + int32_t ahead = *offset + 1; + c = charAt(*offset, context); + if (c == 0x5C /*'\\'*/ && ahead < length) { + c = (UChar) u_unescapeAt(charAt, &ahead, length, context); + } + if (U16_IS_TRAIL(c)) { + *offset = ahead; + result = U16_GET_SUPPLEMENTARY(result, c); + } + } + return result; + } + + /* Convert C-style escapes in table */ + for (i=0; i<UNESCAPE_MAP_LENGTH; i+=2) { + if (c == UNESCAPE_MAP[i]) { + return UNESCAPE_MAP[i+1]; + } else if (c < UNESCAPE_MAP[i]) { + break; + } + } + + /* Map \cX to control-X: X & 0x1F */ + if (c == 0x0063 /*'c'*/ && *offset < length) { + c = charAt((*offset)++, context); + if (U16_IS_LEAD(c) && *offset < length) { + UChar c2 = charAt(*offset, context); + if (U16_IS_TRAIL(c2)) { + ++(*offset); + c = (UChar) U16_GET_SUPPLEMENTARY(c, c2); /* [sic] */ + } + } + return 0x1F & c; + } + + /* If no special forms are recognized, then consider + * the backslash to generically escape the next character. + * Deal with surrogate pairs. */ + if (U16_IS_LEAD(c) && *offset < length) { + UChar c2 = charAt(*offset, context); + if (U16_IS_TRAIL(c2)) { + ++(*offset); + return U16_GET_SUPPLEMENTARY(c, c2); + } + } + return c; + + err: + /* Invalid escape sequence */ + *offset = start; /* Reset to initial value */ + return (UChar32)0xFFFFFFFF; +} + +/* u_unescapeAt() callback to return a UChar from a char* */ +static UChar U_CALLCONV +_charPtr_charAt(int32_t offset, void *context) { + UChar c16; + /* It would be more efficient to access the invariant tables + * directly but there is no API for that. */ + u_charsToUChars(((char*) context) + offset, &c16, 1); + return c16; +} + +/* Append an escape-free segment of the text; used by u_unescape() */ +static void _appendUChars(UChar *dest, int32_t destCapacity, + const char *src, int32_t srcLen) { + if (destCapacity < 0) { + destCapacity = 0; + } + if (srcLen > destCapacity) { + srcLen = destCapacity; + } + u_charsToUChars(src, dest, srcLen); +} + +/* Do an invariant conversion of char* -> UChar*, with escape parsing */ +U_CAPI int32_t U_EXPORT2 +u_unescape(const char *src, UChar *dest, int32_t destCapacity) { + const char *segment = src; + int32_t i = 0; + char c; + + while ((c=*src) != 0) { + /* '\\' intentionally written as compiler-specific + * character constant to correspond to compiler-specific + * char* constants. */ + if (c == '\\') { + int32_t lenParsed = 0; + UChar32 c32; + if (src != segment) { + if (dest != NULL) { + _appendUChars(dest + i, destCapacity - i, + segment, (int32_t)(src - segment)); + } + i += (int32_t)(src - segment); + } + ++src; /* advance past '\\' */ + c32 = (UChar32)u_unescapeAt(_charPtr_charAt, &lenParsed, (int32_t)uprv_strlen(src), (void*)src); + if (lenParsed == 0) { + goto err; + } + src += lenParsed; /* advance past escape seq. */ + if (dest != NULL && U16_LENGTH(c32) <= (destCapacity - i)) { + U16_APPEND_UNSAFE(dest, i, c32); + } else { + i += U16_LENGTH(c32); + } + segment = src; + } else { + ++src; + } + } + if (src != segment) { + if (dest != NULL) { + _appendUChars(dest + i, destCapacity - i, + segment, (int32_t)(src - segment)); + } + i += (int32_t)(src - segment); + } + if (dest != NULL && i < destCapacity) { + dest[i] = 0; + } + return i; + + err: + if (dest != NULL && destCapacity > 0) { + *dest = 0; + } + return 0; +} + +/* NUL-termination of strings ----------------------------------------------- */ + +/** + * NUL-terminate a string no matter what its type. + * Set warning and error codes accordingly. + */ +#define __TERMINATE_STRING(dest, destCapacity, length, pErrorCode) \ + if(pErrorCode!=NULL && U_SUCCESS(*pErrorCode)) { \ + /* not a public function, so no complete argument checking */ \ + \ + if(length<0) { \ + /* assume that the caller handles this */ \ + } else if(length<destCapacity) { \ + /* NUL-terminate the string, the NUL fits */ \ + dest[length]=0; \ + /* unset the not-terminated warning but leave all others */ \ + if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) { \ + *pErrorCode=U_ZERO_ERROR; \ + } \ + } else if(length==destCapacity) { \ + /* unable to NUL-terminate, but the string itself fit - set a warning code */ \ + *pErrorCode=U_STRING_NOT_TERMINATED_WARNING; \ + } else /* length>destCapacity */ { \ + /* even the string itself did not fit - set an error code */ \ + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; \ + } \ + } + +U_CAPI int32_t U_EXPORT2 +u_terminateUChars(UChar *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { + __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); + return length; +} + +U_CAPI int32_t U_EXPORT2 +u_terminateChars(char *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { + __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); + return length; +} + +U_CAPI int32_t U_EXPORT2 +u_terminateUChar32s(UChar32 *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { + __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); + return length; +} + +U_CAPI int32_t U_EXPORT2 +u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { + __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); + return length; +} + +// Compute the hash code for a string -------------------------------------- *** + +// Moved here from uhash.c so that UnicodeString::hashCode() does not depend +// on UHashtable code. + +/* + Compute the hash by iterating sparsely over about 32 (up to 63) + characters spaced evenly through the string. For each character, + multiply the previous hash value by a prime number and add the new + character in, like a linear congruential random number generator, + producing a pseudorandom deterministic value well distributed over + the output range. [LIU] +*/ + +#define STRING_HASH(TYPE, STR, STRLEN, DEREF) \ + uint32_t hash = 0; \ + const TYPE *p = (const TYPE*) STR; \ + if (p != NULL) { \ + int32_t len = (int32_t)(STRLEN); \ + int32_t inc = ((len - 32) / 32) + 1; \ + const TYPE *limit = p + len; \ + while (p<limit) { \ + hash = (hash * 37) + DEREF; \ + p += inc; \ + } \ + } \ + return static_cast<int32_t>(hash) + +/* Used by UnicodeString to compute its hashcode - Not public API. */ +U_CAPI int32_t U_EXPORT2 +ustr_hashUCharsN(const UChar *str, int32_t length) { + STRING_HASH(UChar, str, length, *p); +} + +U_CAPI int32_t U_EXPORT2 +ustr_hashCharsN(const char *str, int32_t length) { + STRING_HASH(uint8_t, str, length, *p); +} + +U_CAPI int32_t U_EXPORT2 +ustr_hashICharsN(const char *str, int32_t length) { + STRING_HASH(char, str, length, (uint8_t)uprv_tolower(*p)); +} diff --git a/src/core/basetypes/icu-ucsdet/utrace.c b/src/core/basetypes/icu-ucsdet/utrace.c new file mode 100644 index 00000000..c2f36c48 --- /dev/null +++ b/src/core/basetypes/icu-ucsdet/utrace.c @@ -0,0 +1,488 @@ +/* +******************************************************************************* +* Copyright (C) 2003-2014, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* file name: utrace.c +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +*/ + +#define UTRACE_IMPL +#include "unicode/utrace.h" +#include "utracimp.h" +#include "cstring.h" +#include "uassert.h" +#include "ucln_cmn.h" + + +static UTraceEntry *pTraceEntryFunc = NULL; +static UTraceExit *pTraceExitFunc = NULL; +static UTraceData *pTraceDataFunc = NULL; +static const void *gTraceContext = NULL; + +U_EXPORT int32_t +utrace_level = UTRACE_ERROR; + +U_CAPI void U_EXPORT2 +utrace_entry(int32_t fnNumber) { + if (pTraceEntryFunc != NULL) { + (*pTraceEntryFunc)(gTraceContext, fnNumber); + } +} + + +static const char gExitFmt[] = "Returns."; +static const char gExitFmtValue[] = "Returns %d."; +static const char gExitFmtStatus[] = "Returns. Status = %d."; +static const char gExitFmtValueStatus[] = "Returns %d. Status = %d."; +static const char gExitFmtPtrStatus[] = "Returns %d. Status = %p."; + +U_CAPI void U_EXPORT2 +utrace_exit(int32_t fnNumber, int32_t returnType, ...) { + if (pTraceExitFunc != NULL) { + va_list args; + const char *fmt; + + switch (returnType) { + case 0: + fmt = gExitFmt; + break; + case UTRACE_EXITV_I32: + fmt = gExitFmtValue; + break; + case UTRACE_EXITV_STATUS: + fmt = gExitFmtStatus; + break; + case UTRACE_EXITV_I32 | UTRACE_EXITV_STATUS: + fmt = gExitFmtValueStatus; + break; + case UTRACE_EXITV_PTR | UTRACE_EXITV_STATUS: + fmt = gExitFmtPtrStatus; + break; + default: + U_ASSERT(FALSE); + fmt = gExitFmt; + } + + va_start(args, returnType); + (*pTraceExitFunc)(gTraceContext, fnNumber, fmt, args); + va_end(args); + } +} + + + +U_CAPI void U_EXPORT2 +utrace_data(int32_t fnNumber, int32_t level, const char *fmt, ...) { + if (pTraceDataFunc != NULL) { + va_list args; + va_start(args, fmt ); + (*pTraceDataFunc)(gTraceContext, fnNumber, level, fmt, args); + va_end(args); + } +} + + +static void outputChar(char c, char *outBuf, int32_t *outIx, int32_t capacity, int32_t indent) { + int32_t i; + /* Check whether a start of line indenting is needed. Three cases: + * 1. At the start of the first line (output index == 0). + * 2. At the start of subsequent lines (preceeding char in buffer == '\n') + * 3. When preflighting buffer len (buffer capacity is exceeded), when + * a \n is output. Ideally we wouldn't do the indent until the following char + * is received, but that won't work because there's no place to remember that + * the preceding char was \n. Meaning that we may overstimate the + * buffer size needed. No harm done. + */ + if (*outIx==0 || /* case 1. */ + (c!='\n' && c!=0 && *outIx < capacity && outBuf[(*outIx)-1]=='\n') || /* case 2. */ + (c=='\n' && *outIx>=capacity)) /* case 3 */ + { + /* At the start of a line. Indent. */ + for(i=0; i<indent; i++) { + if (*outIx < capacity) { + outBuf[*outIx] = ' '; + } + (*outIx)++; + } + } + + if (*outIx < capacity) { + outBuf[*outIx] = c; + } + if (c != 0) { + /* Nulls only appear as end-of-string terminators. Move them to the output + * buffer, but do not update the length of the buffer, so that any + * following output will overwrite the null. */ + (*outIx)++; + } +} + +static void outputHexBytes(int64_t val, int32_t charsToOutput, + char *outBuf, int32_t *outIx, int32_t capacity) { + static const char gHexChars[] = "0123456789abcdef"; + int32_t shiftCount; + for (shiftCount=(charsToOutput-1)*4; shiftCount >= 0; shiftCount-=4) { + char c = gHexChars[(val >> shiftCount) & 0xf]; + outputChar(c, outBuf, outIx, capacity, 0); + } +} + +/* Output a pointer value in hex. Work with any size of pointer */ +static void outputPtrBytes(void *val, char *outBuf, int32_t *outIx, int32_t capacity) { + int32_t i; + int32_t incVal = 1; /* +1 for big endian, -1 for little endian */ + char *p = (char *)&val; /* point to current byte to output in the ptr val */ + +#if !U_IS_BIG_ENDIAN + /* Little Endian. Move p to most significant end of the value */ + incVal = -1; + p += sizeof(void *) - 1; +#endif + + /* Loop through the bytes of the ptr as it sits in memory, from + * most significant to least significant end */ + for (i=0; i<sizeof(void *); i++) { + outputHexBytes(*p, 2, outBuf, outIx, capacity); + p += incVal; + } +} + +static void outputString(const char *s, char *outBuf, int32_t *outIx, int32_t capacity, int32_t indent) { + int32_t i = 0; + char c; + if (s==NULL) { + s = "*NULL*"; + } + do { + c = s[i++]; + outputChar(c, outBuf, outIx, capacity, indent); + } while (c != 0); +} + + + +static void outputUString(const UChar *s, int32_t len, + char *outBuf, int32_t *outIx, int32_t capacity, int32_t indent) { + int32_t i = 0; + UChar c; + if (s==NULL) { + outputString(NULL, outBuf, outIx, capacity, indent); + return; + } + + for (i=0; i<len || len==-1; i++) { + c = s[i]; + outputHexBytes(c, 4, outBuf, outIx, capacity); + outputChar(' ', outBuf, outIx, capacity, indent); + if (len == -1 && c==0) { + break; + } + } +} + +U_CAPI int32_t U_EXPORT2 +utrace_vformat(char *outBuf, int32_t capacity, int32_t indent, const char *fmt, va_list args) { + int32_t outIx = 0; + int32_t fmtIx = 0; + char fmtC; + char c; + int32_t intArg; + int64_t longArg = 0; + char *ptrArg; + + /* Loop runs once for each character in the format string. + */ + for (;;) { + fmtC = fmt[fmtIx++]; + if (fmtC != '%') { + /* Literal character, not part of a %sequence. Just copy it to the output. */ + outputChar(fmtC, outBuf, &outIx, capacity, indent); + if (fmtC == 0) { + /* We hit the null that terminates the format string. + * This is the normal (and only) exit from the loop that + * interprets the format + */ + break; + } + continue; + } + + /* We encountered a '%'. Pick up the following format char */ + fmtC = fmt[fmtIx++]; + + switch (fmtC) { + case 'c': + /* single 8 bit char */ + c = (char)va_arg(args, int32_t); + outputChar(c, outBuf, &outIx, capacity, indent); + break; + + case 's': + /* char * string, null terminated. */ + ptrArg = va_arg(args, char *); + outputString((const char *)ptrArg, outBuf, &outIx, capacity, indent); + break; + + case 'S': + /* UChar * string, with length, len==-1 for null terminated. */ + ptrArg = va_arg(args, void *); /* Ptr */ + intArg =(int32_t)va_arg(args, int32_t); /* Length */ + outputUString((const UChar *)ptrArg, intArg, outBuf, &outIx, capacity, indent); + break; + + case 'b': + /* 8 bit int */ + intArg = va_arg(args, int); + outputHexBytes(intArg, 2, outBuf, &outIx, capacity); + break; + + case 'h': + /* 16 bit int */ + intArg = va_arg(args, int); + outputHexBytes(intArg, 4, outBuf, &outIx, capacity); + break; + + case 'd': + /* 32 bit int */ + intArg = va_arg(args, int); + outputHexBytes(intArg, 8, outBuf, &outIx, capacity); + break; + + case 'l': + /* 64 bit long */ + longArg = va_arg(args, int64_t); + outputHexBytes(longArg, 16, outBuf, &outIx, capacity); + break; + + case 'p': + /* Pointers. */ + ptrArg = va_arg(args, void *); + outputPtrBytes(ptrArg, outBuf, &outIx, capacity); + break; + + case 0: + /* Single '%' at end of fmt string. Output as literal '%'. + * Back up index into format string so that the terminating null will be + * re-fetched in the outer loop, causing it to terminate. + */ + outputChar('%', outBuf, &outIx, capacity, indent); + fmtIx--; + break; + + case 'v': + { + /* Vector of values, e.g. %vh */ + char vectorType; + int32_t vectorLen; + const char *i8Ptr; + int16_t *i16Ptr; + int32_t *i32Ptr; + int64_t *i64Ptr; + void **ptrPtr; + int32_t charsToOutput = 0; + int32_t i; + + vectorType = fmt[fmtIx]; /* b, h, d, l, p, etc. */ + if (vectorType != 0) { + fmtIx++; + } + i8Ptr = (const char *)va_arg(args, void*); + i16Ptr = (int16_t *)i8Ptr; + i32Ptr = (int32_t *)i8Ptr; + i64Ptr = (int64_t *)i8Ptr; + ptrPtr = (void **)i8Ptr; + vectorLen =(int32_t)va_arg(args, int32_t); + if (ptrPtr == NULL) { + outputString("*NULL* ", outBuf, &outIx, capacity, indent); + } else { + for (i=0; i<vectorLen || vectorLen==-1; i++) { + switch (vectorType) { + case 'b': + charsToOutput = 2; + longArg = *i8Ptr++; + break; + case 'h': + charsToOutput = 4; + longArg = *i16Ptr++; + break; + case 'd': + charsToOutput = 8; + longArg = *i32Ptr++; + break; + case 'l': + charsToOutput = 16; + longArg = *i64Ptr++; + break; + case 'p': + charsToOutput = 0; + outputPtrBytes(*ptrPtr, outBuf, &outIx, capacity); + longArg = *ptrPtr==NULL? 0: 1; /* test for null terminated array. */ + ptrPtr++; + break; + case 'c': + charsToOutput = 0; + outputChar(*i8Ptr, outBuf, &outIx, capacity, indent); + longArg = *i8Ptr; /* for test for null terminated array. */ + i8Ptr++; + break; + case 's': + charsToOutput = 0; + outputString(*ptrPtr, outBuf, &outIx, capacity, indent); + outputChar('\n', outBuf, &outIx, capacity, indent); + longArg = *ptrPtr==NULL? 0: 1; /* for test for null term. array. */ + ptrPtr++; + break; + + case 'S': + charsToOutput = 0; + outputUString((const UChar *)*ptrPtr, -1, outBuf, &outIx, capacity, indent); + outputChar('\n', outBuf, &outIx, capacity, indent); + longArg = *ptrPtr==NULL? 0: 1; /* for test for null term. array. */ + ptrPtr++; + break; + + + } + if (charsToOutput > 0) { + outputHexBytes(longArg, charsToOutput, outBuf, &outIx, capacity); + outputChar(' ', outBuf, &outIx, capacity, indent); + } + if (vectorLen == -1 && longArg == 0) { + break; + } + } + } + outputChar('[', outBuf, &outIx, capacity, indent); + outputHexBytes(vectorLen, 8, outBuf, &outIx, capacity); + outputChar(']', outBuf, &outIx, capacity, indent); + } + break; + + + default: + /* %. in format string, where . is some character not in the set + * of recognized format chars. Just output it as if % wasn't there. + * (Covers "%%" outputing a single '%') + */ + outputChar(fmtC, outBuf, &outIx, capacity, indent); + } + } + outputChar(0, outBuf, &outIx, capacity, indent); /* Make sure that output is null terminated */ + return outIx + 1; /* outIx + 1 because outIx does not increment when outputing final null. */ +} + + + + +U_CAPI int32_t U_EXPORT2 +utrace_format(char *outBuf, int32_t capacity, + int32_t indent, const char *fmt, ...) { + int32_t retVal; + va_list args; + va_start(args, fmt ); + retVal = utrace_vformat(outBuf, capacity, indent, fmt, args); + va_end(args); + return retVal; +} + + +U_CAPI void U_EXPORT2 +utrace_setFunctions(const void *context, + UTraceEntry *e, UTraceExit *x, UTraceData *d) { + pTraceEntryFunc = e; + pTraceExitFunc = x; + pTraceDataFunc = d; + gTraceContext = context; +} + + +U_CAPI void U_EXPORT2 +utrace_getFunctions(const void **context, + UTraceEntry **e, UTraceExit **x, UTraceData **d) { + *e = pTraceEntryFunc; + *x = pTraceExitFunc; + *d = pTraceDataFunc; + *context = gTraceContext; +} + +U_CAPI void U_EXPORT2 +utrace_setLevel(int32_t level) { + if (level < UTRACE_OFF) { + level = UTRACE_OFF; + } + if (level > UTRACE_VERBOSE) { + level = UTRACE_VERBOSE; + } + utrace_level = level; +} + +U_CAPI int32_t U_EXPORT2 +utrace_getLevel() { + return utrace_level; +} + + +U_CFUNC UBool +utrace_cleanup() { + pTraceEntryFunc = NULL; + pTraceExitFunc = NULL; + pTraceDataFunc = NULL; + utrace_level = UTRACE_OFF; + gTraceContext = NULL; + return TRUE; +} + + +static const char * const +trFnName[] = { + "u_init", + "u_cleanup", + NULL +}; + + +static const char * const +trConvNames[] = { + "ucnv_open", + "ucnv_openPackage", + "ucnv_openAlgorithmic", + "ucnv_clone", + "ucnv_close", + "ucnv_flushCache", + "ucnv_load", + "ucnv_unload", + NULL +}; + + +static const char * const +trCollNames[] = { + "ucol_open", + "ucol_close", + "ucol_strcoll", + "ucol_getSortKey", + "ucol_getLocale", + "ucol_nextSortKeyPart", + "ucol_strcollIter", + "ucol_openFromShortString", + "ucol_strcollUTF8", + NULL +}; + + +U_CAPI const char * U_EXPORT2 +utrace_functionName(int32_t fnNumber) { + if(UTRACE_FUNCTION_START <= fnNumber && fnNumber < UTRACE_FUNCTION_LIMIT) { + return trFnName[fnNumber]; + } else if(UTRACE_CONVERSION_START <= fnNumber && fnNumber < UTRACE_CONVERSION_LIMIT) { + return trConvNames[fnNumber - UTRACE_CONVERSION_START]; + } else if(UTRACE_COLLATION_START <= fnNumber && fnNumber < UTRACE_COLLATION_LIMIT){ + return trCollNames[fnNumber - UTRACE_COLLATION_START]; + } else { + return "[BOGUS Trace Function Number]"; + } +} + |