summary | refs | log | tree | commit | diff
path: root/utf8.c
diff options
context:
space:
mode:
author: Alexey Yakovenko <wakeroid@gmail.com>, 2010-05-02 12:41:20 +0200
committer: Alexey Yakovenko <wakeroid@gmail.com>, 2010-05-02 12:41:20 +0200
commit: 2e53930ee9f06102ee39bf95b21775a563679d31 (patch)
tree: c9b65db0e54b576e91eed4e61dc1aafb2453aa3f /utf8.c
parent: 95c829fb75ed7c4ecb933073f5ed59d2b647c7f4 (diff)
optimizations in search and sorting
Diffstat (limited to 'utf8.c')
-rw-r--r-- utf8.c 45
1 files changed, 41 insertions, 4 deletions
diff --git a/utf8.c b/utf8.c
index b55bc0de..6e6bb100 100644
--- a/utf8.c
+++ b/utf8.c
@@ -26,6 +26,7 @@
//#include <alloca.h>
#include "ctype.h"
#include "utf8.h"
+#include "u8_lc_map.h"
static const uint32_t offsetsFromUTF8[6] = {
0x00000000UL, 0x00003080UL, 0x000E2080UL,
@@ -599,17 +600,33 @@ int u8_valid (const char *str,
return 1;
}
-static const char lowerchars[] = "záéíñóúüäöåæøàçèéêабвгдеёжзийклмнорпстуфхцчшщъыьэюя";
-static const char upperchars[] = "ZÁÉÍÑÓÚÜÄÖÅÆØÀÇÈÉÊАБВГДЕЁЖЗИЙКЛМНОРПСТУФХЦЧШЩЪЫЬЭЮЯ";
+#if 0
+static const char lowerchars[] = "áéíñóúüäöåæøàçèêабвгдеёжзийклмнорпстуфхцчшщъыьэюя";
+static const char upperchars[] = "ÁÉÍÑÓÚÜÄÖÅÆØÀÇÈÊАБВГДЕЁЖЗИЙКЛМНОРПСТУФХЦЧШЩЪЫЬЭЮЯ";
+#endif
int
u8_tolower (const signed char *c, int l, char *out) {
- if (*c > 0) {
- *out = tolower (*c);
+ if (*c >= 65 && *c <= 90) {
+ *out = *c + 0x20;//tolower (*c);
+ out[1] = 0;
+ return 1;
+ }
+ else if (*c > 0) {
+ *out = *c;
out[1] = 0;
return 1;
}
else {
+#if 1
+ struct u8_case_map_t *lc = u8_lc_in_word_set (c, l);
+ if (lc) {
+ int ll = 2;//strlen (lc->lower);
+ memcpy (out, lc->lower, ll);
+ out[ll] = 0;
+ return ll;
+ }
+#else
for (int i = 0; i < sizeof (upperchars)-l; i++) {
if (!memcmp (upperchars+i, c, l)) {
// found!
@@ -618,6 +635,7 @@ u8_tolower (const signed char *c, int l, char *out) {
return l;
}
}
+#endif
memcpy (out, c, l);
out[l] = 0;
return l;
@@ -670,3 +688,22 @@ utfcasestr (const char *s1, const char *s2) {
}
return NULL;
}
+
+void
+u8_lc_map_test (void) {
+ struct u8_case_map_t *lc;
+ lc = u8_lc_in_word_set ("Á", 2);
+ printf ("%s -> %s\n", lc->name, lc->lower);
+ lc = u8_lc_in_word_set ("É", 2);
+ printf ("%s -> %s\n", lc->name, lc->lower);
+ lc = u8_lc_in_word_set ("Í", 2);
+ printf ("%s -> %s\n", lc->name, lc->lower);
+ lc = u8_lc_in_word_set ("Ñ", 2);
+ printf ("%s -> %s\n", lc->name, lc->lower);
+ lc = u8_lc_in_word_set ("П", 2);
+ printf ("%s -> %s\n", lc->name, lc->lower);
+ lc = u8_lc_in_word_set ("Л", 2);
+ printf ("%s -> %s\n", lc->name, lc->lower);
+ lc = u8_lc_in_word_set ("А", 2);
+ printf ("%s -> %s\n", lc->name, lc->lower);
+}