From eb86dffeeec897d17905f3adff84e6acfd018330 Mon Sep 17 00:00:00 2001
From: Denis Redozubov <denis.redozubov@gmail.com>
Date: Wed, 22 Aug 2018 15:11:32 +0300
Subject: Rough same page anchors

---
 include/urweb/urweb_cpp.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/urweb')

diff --git a/include/urweb/urweb_cpp.h b/include/urweb/urweb_cpp.h
index 5f1144b8..1351cfbc 100644
--- a/include/urweb/urweb_cpp.h
+++ b/include/urweb/urweb_cpp.h
@@ -242,6 +242,7 @@ uw_Basis_string uw_Basis_blessEnvVar(struct uw_context *, uw_Basis_string);
 uw_Basis_string uw_Basis_blessMeta(struct uw_context *, uw_Basis_string);
 
 uw_Basis_string uw_Basis_checkUrl(struct uw_context *, uw_Basis_string);
+uw_Basis_string uw_Basis_anchorUrl(struct uw_context *, uw_Basis_string);
 uw_Basis_string uw_Basis_checkMime(struct uw_context *, uw_Basis_string);
 uw_Basis_string uw_Basis_checkRequestHeader(struct uw_context *, uw_Basis_string);
 uw_Basis_string uw_Basis_checkResponseHeader(struct uw_context *, uw_Basis_string);
-- 
cgit v1.2.3


From c2a217f9121dd865122bc6150c53e77bd662050d Mon Sep 17 00:00:00 2001
From: fab <fabrice.leal.ch@gmail.com>
Date: Sat, 3 Nov 2018 20:09:20 +0000
Subject: utf-8 aware functions for basis. unit-testing.

---
 .travis.yml               |   2 +-
 include/urweb/types_cpp.h |   3 +-
 src/c/Makefile.am         |   2 +-
 src/c/urweb.c             | 197 +++++++++++++-------
 src/compiler.sml          |   4 +-
 tests/Makefile            |   2 +
 tests/utf8.py             | 449 ++++++++++++++++++++++++++++++++++++++++++++++
 tests/utf8.ur             | 431 ++++++++++++++++++++++++++++++++++++++++++++
 tests/utf8.urp            |   5 +
 9 files changed, 1024 insertions(+), 71 deletions(-)
 create mode 100644 tests/utf8.py
 create mode 100644 tests/utf8.ur
 create mode 100644 tests/utf8.urp

(limited to 'include/urweb')

diff --git a/.travis.yml b/.travis.yml
index df4e4abc..86d731cc 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -18,7 +18,7 @@ compiler:
 before_install:
   - export CONFIGURE_ARGS=""
   - if command -v apt-get &>/dev/null; then sudo apt-get update -qq; fi
-  - if command -v apt-get &>/dev/null; then sudo apt-get install -y mlton; fi
+  - if command -v apt-get &>/dev/null; then sudo apt-get install -y mlton libicu-dev; fi
   - if command -v brew &>/dev/null; then brew update; fi
   - if command -v brew &>/dev/null; then brew uninstall libtool; fi
   - if command -v brew &>/dev/null; then brew install libtool; fi
diff --git a/include/urweb/types_cpp.h b/include/urweb/types_cpp.h
index 0c546d1c..c6c0dd3e 100644
--- a/include/urweb/types_cpp.h
+++ b/include/urweb/types_cpp.h
@@ -4,11 +4,12 @@
 #include <time.h>
 #include <unistd.h>
 #include <stdint.h>
+#include <unicode/utypes.h>
 
 typedef long long uw_Basis_int;
 typedef double uw_Basis_float;
 typedef char* uw_Basis_string;
-typedef char uw_Basis_char;
+typedef UChar32 uw_Basis_char;
 typedef struct {
   time_t seconds;
   unsigned microseconds;
diff --git a/src/c/Makefile.am b/src/c/Makefile.am
index 027b1458..96c1d92f 100644
--- a/src/c/Makefile.am
+++ b/src/c/Makefile.am
@@ -11,7 +11,7 @@ AM_CFLAGS = -Wall -Wunused-parameter -Werror -Wno-format-security -Wno-deprecate
 liburweb_la_LDFLAGS = $(AM_LDFLAGS) $(OPENSSL_LDFLAGS) \
 	-export-symbols-regex '^(client_pruner|pthread_create_big|strcmp_nullsafe|uw_.*)' \
 	-version-info 1:0:0
-liburweb_la_LIBADD = $(PTHREAD_LIBS) -lm $(OPENSSL_LIBS)
+liburweb_la_LIBADD = $(PTHREAD_LIBS) -lm $(OPENSSL_LIBS) -licui18n -licuuc -licudata
 liburweb_http_la_LIBADD = liburweb.la
 liburweb_http_la_LDFLAGS = -export-symbols-regex '^(main|uw_.*)' \
 	-version-info 1:0:0
diff --git a/src/c/urweb.c b/src/c/urweb.c
index 2e3e18bc..69c3da94 100644
--- a/src/c/urweb.c
+++ b/src/c/urweb.c
@@ -20,6 +20,9 @@
 
 #include <pthread.h>
 
+#include <unicode/utf8.h>
+#include <unicode/uchar.h>
+
 #include "types.h"
 
 #include "uthash.h"
@@ -2421,28 +2424,34 @@ uw_unit uw_Basis_htmlifySource_w(uw_context ctx, uw_Basis_source src) {
   return uw_unit_v;
 }
 
-uw_Basis_char uw_Basis_strsub(uw_context ctx, uw_Basis_string s, uw_Basis_int n) {
+uw_Basis_char uw_Basis_strsub(uw_context ctx, uw_Basis_string s, uw_Basis_int n) {  
+  uw_Basis_char c;
+  int offset = 0;
+  
   while (n >= 0) {
-    if (*s == 0)
+    
+    if (s[offset] == 0)
       uw_error(ctx, FATAL, "Out-of-bounds strsub");
 
+    U8_NEXT(s, offset, -1, c);
+    
     if (n == 0)
-      return *s;
+      return c;
 
     --n;
-    ++s;
   }
 
   uw_error(ctx, FATAL, "Negative strsub bound");
 }
 
 uw_Basis_string uw_Basis_strsuffix(uw_context ctx, uw_Basis_string s, uw_Basis_int n) {
+  int offset = 0;
   while (n >= 0) {
-    if (*s == 0 || n == 0)
-      return s;
+    if (s[offset] == 0 || n == 0)
+      return s + offset;
 
+    U8_FWD_1(s, offset, -1);
     --n;
-    ++s;
   }
 
   uw_error(ctx, FATAL, "Negative strsuffix bound");
@@ -2450,40 +2459,80 @@ uw_Basis_string uw_Basis_strsuffix(uw_context ctx, uw_Basis_string s, uw_Basis_i
 
 uw_Basis_int uw_Basis_strlen(uw_context ctx, uw_Basis_string s) {
   (void)ctx;
-  return strlen(s);
+  int offset = 0, iterations = 0;
+  while (s[offset] != 0) {
+    U8_FWD_1(s, offset, -1);
+    ++iterations;
+  }
+  return iterations;
 }
 
 uw_Basis_bool uw_Basis_strlenGe(uw_context ctx, uw_Basis_string s, uw_Basis_int n) {
   (void)ctx;
-
+  int offset = 0;
   while (n > 0) {
-    if (*s == 0)
+    if (s[offset] == 0)
       return uw_Basis_False;
-
+        
+    U8_FWD_1(s, offset, -1);
     --n;
-    ++s;
   }
 
   return uw_Basis_True;
 }
 
+/* Find code point ch in the UTF-8 string s. Returns its code-point index and stores in
+ * *o_offset the byte offset at which the match starts; both are -1 when ch is absent. */
+int aux_strchr(uw_Basis_string s, uw_Basis_char ch, int* o_offset) {
+  int u8idx = 0, offset = 0;
+  uw_Basis_char c;
+  while (s[offset] != 0) {
+    int start = offset;  /* U8_NEXT advances offset past the code point it decodes */
+    U8_NEXT(s, offset, -1, c);
+    if (c == ch) {
+      *o_offset = start;
+      return u8idx;
+    }
+    ++u8idx;
+  }
+  *o_offset = -1;
+  return -1;
+}
+
 uw_Basis_string uw_Basis_strchr(uw_context ctx, uw_Basis_string s, uw_Basis_char ch) {
   (void)ctx;
-  return strchr(s, ch);
+  int offset = -1;
+  if (aux_strchr(s, ch, &offset) > -1) {
+    return s + offset;
+  }
+  return NULL;  
 }
 
 uw_Basis_int uw_Basis_strcspn(uw_context ctx, uw_Basis_string s, uw_Basis_string chs) {
   (void)ctx;
-  return strcspn(s, chs);
+  int offset = 0, u8idx = 0, offsetChs = 0;
+  uw_Basis_char c;
+  
+  while (s[offset] != 0) {
+    U8_NEXT(s, offset, -1, c);
+    if (aux_strchr(chs, c, &offsetChs) > -1) {
+      return u8idx;
+    }
+    ++u8idx;
+  }
+
+  return u8idx;
 }
 
 uw_Basis_int *uw_Basis_strindex(uw_context ctx, uw_Basis_string s, uw_Basis_char ch) {
-  uw_Basis_string r = strchr(s, ch);
-  if (r == NULL)
+  (void)ctx;
+  int offset = -1;
+  int r = aux_strchr(s, ch, &offset);
+  if (r == -1)
     return NULL;
   else {
     uw_Basis_int *nr = uw_malloc(ctx, sizeof(uw_Basis_int));
-    *nr = r - s;
+    *nr = r;
     return nr;
   }
 }
@@ -2494,13 +2543,19 @@ uw_Basis_int *uw_Basis_strsindex(uw_context ctx, const char *haystack, const cha
     return NULL;
   else {
     uw_Basis_int *nr = uw_malloc(ctx, sizeof(uw_Basis_int));
-    *nr = r - haystack;
+    int src = r - haystack, offset = 0, utf8idx = 0;
+    while (offset < src) {
+      U8_FWD_1(haystack, offset, -1);
+      ++utf8idx;
+    }
+    
+    *nr = utf8idx;
     return nr;
   }
 }
 
 uw_Basis_string uw_Basis_strcat(uw_context ctx, uw_Basis_string s1, uw_Basis_string s2) {
-  int len = uw_Basis_strlen(ctx, s1) + uw_Basis_strlen(ctx, s2) + 1;
+  int len = strlen(s1) + strlen(s2) + 1;
   char *s;
 
   uw_check_heap(ctx, len);
@@ -2515,8 +2570,8 @@ uw_Basis_string uw_Basis_strcat(uw_context ctx, uw_Basis_string s1, uw_Basis_str
 }
 
 uw_Basis_string uw_Basis_substring(uw_context ctx, uw_Basis_string s, uw_Basis_int start, uw_Basis_int len) {
-  size_t full_len = uw_Basis_strlen(ctx, s);
-
+  int full_len = uw_Basis_strlen(ctx, s);
+  
   if (start < 0)
     uw_error(ctx, FATAL, "substring: Negative start index");
   if (len < 0)
@@ -2524,32 +2579,41 @@ uw_Basis_string uw_Basis_substring(uw_context ctx, uw_Basis_string s, uw_Basis_i
   if (start + len > full_len)
     uw_error(ctx, FATAL, "substring: Start index plus length is too large");
 
-  if (start + len == full_len)
-    return &s[start];
-  else {
-    uw_Basis_string r = uw_malloc(ctx, len+1);
-    memcpy(r, s+start, len);
-    r[len] = 0;
+  int offset = 0;
+  U8_FWD_N(s, offset, -1, start);
+  
+  if (start + len == full_len) {
+    return s + offset;
+  } else {
+    int end = offset;
+    U8_FWD_N(s, end, -1, len);
+
+    int actual_len = end - offset;
+
+    uw_Basis_string r = uw_malloc(ctx, actual_len + 1);
+    memcpy(r, s + offset, actual_len);
+    r[actual_len] = 0;
     return r;
   }
-
 }
 
 uw_Basis_string uw_Basis_str1(uw_context ctx, uw_Basis_char ch) {
   char *r;
-
-  uw_check_heap(ctx, 2);
+  int req = U8_LENGTH(ch);
+  int offset = 0;
+  
+  uw_check_heap(ctx, req + 1);
   r = ctx->heap.front;
-  r[0] = ch;
-  r[1] = 0;
 
-  ctx->heap.front += 2;
+  U8_APPEND_UNSAFE(r, offset, ch);  
+  r[req] = 0;
 
-  return r;
+  ctx->heap.front += req + 1;
+  return r; 
 }
 
 uw_Basis_string uw_strdup(uw_context ctx, uw_Basis_string s1) {
-  int len = uw_Basis_strlen(ctx, s1) + 1;
+  int len = strlen(s1) + 1;
   char *s;
 
   uw_check_heap(ctx, len);
@@ -2676,7 +2740,6 @@ uw_Basis_string uw_Basis_sqlifyString(uw_context ctx, uw_Basis_string s) {
 
 uw_Basis_string uw_Basis_sqlifyChar(uw_context ctx, uw_Basis_char c) {
   char *r, *s2;
-
   uw_check_heap(ctx, 5 + uw_Estrings + strlen(uw_sqlsuffixChar));
 
   r = s2 = ctx->heap.front;
@@ -2934,10 +2997,7 @@ uw_Basis_string uw_Basis_floatToString(uw_context ctx, uw_Basis_float n) {
 }
 
 uw_Basis_string uw_Basis_charToString(uw_context ctx, uw_Basis_char ch) {
-  char *r = uw_malloc(ctx, 2);
-  r[0] = ch;
-  r[1] = 0;
-  return r;
+  return uw_Basis_str1(ctx, ch);
 }
 
 uw_Basis_string uw_Basis_boolToString(uw_context ctx, uw_Basis_bool b) {
@@ -2997,11 +3057,12 @@ uw_Basis_char *uw_Basis_stringToChar(uw_context ctx, uw_Basis_string s) {
     uw_Basis_char *r = uw_malloc(ctx, 1);
     r[0] = 0;
     return r;
-  } else if (s[1] != 0)
+  } else if (uw_Basis_strlenGe(ctx, s, 2) == uw_Basis_True)
     return NULL;
   else {
     uw_Basis_char *r = uw_malloc(ctx, 1);
-    r[0] = s[0];
+    int offset = 0;
+    U8_NEXT(s, offset, -1, *r);
     return r;
   }
 }
@@ -3126,10 +3187,14 @@ uw_Basis_float uw_Basis_stringToFloat_error(uw_context ctx, uw_Basis_string s) {
 uw_Basis_char uw_Basis_stringToChar_error(uw_context ctx, uw_Basis_string s) {
   if (s[0] == 0)
     return 0;
-  else if (s[1] != 0)
+  else if (uw_Basis_strlenGe(ctx, s, 2) == uw_Basis_True)
     uw_error(ctx, FATAL, "Can't parse char: %s", uw_Basis_htmlifyString(ctx, s));
-  else
-    return s[0];
+  else {
+    uw_Basis_char c;
+    int offset = 0;
+    U8_NEXT(s, offset, -1, c);
+    return c;
+  }
 }
 
 uw_Basis_bool uw_Basis_stringToBool_error(uw_context ctx, uw_Basis_string s) {
@@ -4328,82 +4393,82 @@ void uw_set_global(uw_context ctx, char *name, void *data, void (*free)(void*))
 
 uw_Basis_bool uw_Basis_isalnum(uw_context ctx, uw_Basis_char c) {
   (void)ctx;
-  return !!isalnum((int)c);
+  return !!u_hasBinaryProperty(c, UCHAR_POSIX_ALNUM);
 }
 
 uw_Basis_bool uw_Basis_isalpha(uw_context ctx, uw_Basis_char c) {
   (void)ctx;
-  return !!isalpha((int)c);
+  return !!u_hasBinaryProperty(c, UCHAR_ALPHABETIC);
 }
 
 uw_Basis_bool uw_Basis_isblank(uw_context ctx, uw_Basis_char c) {
   (void)ctx;
-  return !!isblank((int)c);
+  return !!u_hasBinaryProperty(c, UCHAR_POSIX_BLANK);
 }
 
 uw_Basis_bool uw_Basis_iscntrl(uw_context ctx, uw_Basis_char c) {
   (void)ctx;
-  return !!iscntrl((int)c);
+  return !!(u_charType(c)==U_CONTROL_CHAR);
 }
 
 uw_Basis_bool uw_Basis_isdigit(uw_context ctx, uw_Basis_char c) {
   (void)ctx;
-  return !!isdigit((int)c);
+  return !!u_isdigit(c);
 }
 
 uw_Basis_bool uw_Basis_isgraph(uw_context ctx, uw_Basis_char c) {
   (void)ctx;
-  return !!isgraph((int)c);
+  return !!u_hasBinaryProperty(c, UCHAR_POSIX_GRAPH);
 }
 
 uw_Basis_bool uw_Basis_islower(uw_context ctx, uw_Basis_char c) {
   (void)ctx;
-  return !!islower((int)c);
+  return !!u_hasBinaryProperty(c, UCHAR_LOWERCASE);
 }
 
 uw_Basis_bool uw_Basis_isprint(uw_context ctx, uw_Basis_char c) {
   (void)ctx;
-  return !!isprint((int)c);
+  return !!u_hasBinaryProperty(c, UCHAR_POSIX_PRINT);
 }
 
 uw_Basis_bool uw_Basis_ispunct(uw_context ctx, uw_Basis_char c) {
   (void)ctx;
-  return !!ispunct((int)c);
+  return !!u_ispunct(c);
 }
 
 uw_Basis_bool uw_Basis_isspace(uw_context ctx, uw_Basis_char c) {
   (void)ctx;
-  return !!isspace((int)c);
+  return !!u_hasBinaryProperty(c, UCHAR_WHITE_SPACE);
 }
 
 uw_Basis_bool uw_Basis_isupper(uw_context ctx, uw_Basis_char c) {
   (void)ctx;
-  return !!isupper((int)c);
+  return !!u_hasBinaryProperty(c, UCHAR_UPPERCASE);
 }
 
 uw_Basis_bool uw_Basis_isxdigit(uw_context ctx, uw_Basis_char c) {
   (void)ctx;
-  return !!isxdigit((int)c);
+  return !!u_hasBinaryProperty(c, UCHAR_POSIX_XDIGIT);
 }
 
 uw_Basis_char uw_Basis_tolower(uw_context ctx, uw_Basis_char c) {
   (void)ctx;
-  return tolower((int)c);
+  return u_tolower(c);
 }
 
 uw_Basis_char uw_Basis_toupper(uw_context ctx, uw_Basis_char c) {
   (void)ctx;
-  return toupper((int)c);
+  return u_toupper(c);
 }
 
 uw_Basis_int uw_Basis_ord(uw_context ctx, uw_Basis_char c) {
   (void)ctx;
-  return (unsigned char)c;
+  return (uw_Basis_int)c;
 }
 
 uw_Basis_char uw_Basis_chr(uw_context ctx, uw_Basis_int n) {
   (void)ctx;
-  return n;
+  return (uw_Basis_char)n;
 }
 
 uw_Basis_string uw_Basis_currentUrl(uw_context ctx) {
@@ -4657,7 +4722,7 @@ uw_Basis_string uw_Basis_atom(uw_context ctx, uw_Basis_string s) {
 
   for (p = s; *p; ++p) {
-    char c = *p;
-    if (!isalnum((int)c) && c != '+' && c != '-' && c != '.' && c != '%' && c != '#')
+    char c = *p;  /* only ASCII bytes are validated; UTF-8 multi-byte units pass through */
+    if (U8_IS_SINGLE(c) && !isalnum((int)c) && c != '+' && c != '-' && c != '.' && c != '%' && c != '#')
       uw_error(ctx, FATAL, "Disallowed character in CSS atom");
   }
 
@@ -4669,7 +4734,7 @@ uw_Basis_string uw_Basis_css_url(uw_context ctx, uw_Basis_string s) {
 
   for (p = s; *p; ++p) {
-    char c = *p;
-    if (!isalnum((int)c) && c != ':' && c != '/' && c != '.' && c != '_' && c != '+'
+    char c = *p;  /* only ASCII bytes are validated; UTF-8 multi-byte units pass through */
+    if (U8_IS_SINGLE(c) && !isalnum((int)c) && c != ':' && c != '/' && c != '.' && c != '_' && c != '+'
         && c != '-' && c != '%' && c != '?' && c != '&' && c != '=' && c != '#')
       uw_error(ctx, FATAL, "Disallowed character in CSS URL");
   }
@@ -4688,7 +4753,7 @@ uw_Basis_string uw_Basis_property(uw_context ctx, uw_Basis_string s) {
 
   for (p = s; *p; ++p) {
-    char c = *p;
-    if (!islower((int)c) && !isdigit((int)c) && c != '_' && c != '-')
+    char c = *p;  /* only ASCII bytes are validated; UTF-8 multi-byte units pass through */
+    if (U8_IS_SINGLE(c) && !islower((int)c) && !isdigit((int)c) && c != '_' && c != '-')
       uw_error(ctx, FATAL, "Disallowed character in CSS property");
   }
 
@@ -5064,7 +5129,7 @@ void uw_Sqlcache_flush(uw_context ctx, uw_Sqlcache_Cache *cache, char **keys) {
   pthread_rwlock_unlock(&cache->lockIn);
 }
 
-int strcmp_nullsafe(const char *str1, const char *str2) {
+int strcmp_nullsafe(const char *str1, const char *str2) {  
   if (str1)
     return strcmp(str1, str2);
   else
@@ -5073,7 +5138,7 @@ int strcmp_nullsafe(const char *str1, const char *str2) {
 
 static int is_valid_hash(uw_Basis_string hash) {
   for (; *hash; ++hash)
-    if (!isxdigit(*hash))
+    if (!U8_IS_SINGLE(*hash) || !isxdigit(*hash))  /* reject non-ASCII bytes and non-hex ASCII */
       return 0;
 
   return 1;
diff --git a/src/compiler.sml b/src/compiler.sml
index f724bf56..9ee88c9b 100644
--- a/src/compiler.sml
+++ b/src/compiler.sml
@@ -1585,9 +1585,9 @@ fun compileC {cname, oname, ename, libs, profile, debug, linker, link = link'} =
         val proto = Settings.currentProtocol ()
 
         val lib = if Settings.getBootLinking () then
-                      !Settings.configLib ^ "/" ^ #linkStatic proto ^ " " ^ !Settings.configLib ^ "/liburweb.a"
+                      !Settings.configLib ^ "/" ^ #linkStatic proto ^ " " ^ !Settings.configLib ^ "/liburweb.a -licui18n -licuuc -licudata"
                   else if Settings.getStaticLinking () then
-                      " -static " ^ !Settings.configLib ^ "/" ^ #linkStatic proto ^ " " ^ !Settings.configLib ^ "/liburweb.a"
+                      " -static " ^ !Settings.configLib ^ "/" ^ #linkStatic proto ^ " " ^ !Settings.configLib ^ "/liburweb.a -licui18n -licuuc -licudata"
                   else
                       "-L" ^ !Settings.configLib ^ " " ^ #linkDynamic proto ^ " -lurweb"
 
diff --git a/tests/Makefile b/tests/Makefile
index ecf5557b..03e37e4b 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -28,3 +28,5 @@ simple::
 	./driver.sh fact
 	./driver.sh filter
 	./driver.sh jsbspace
+	./driver.sh utf8
+
diff --git a/tests/utf8.py b/tests/utf8.py
new file mode 100644
index 00000000..ff9b737a
--- /dev/null
+++ b/tests/utf8.py
@@ -0,0 +1,449 @@
+import unittest
+import base
+
+class Suite(base.Base):
+    def test_1(self):
+        """Test case: substring (1)"""
+        self.start('Utf8/substrings')
+
+        pre = self.xpath('pre[1]')
+        self.assertEqual('abc', pre.text)
+
+        pre = self.xpath('pre[2]')
+        self.assertEqual('bc', pre.text)
+
+        pre = self.xpath('pre[3]')
+        self.assertEqual('c', pre.text)
+
+        pre = self.xpath('pre[4]')
+        self.assertEqual('ábó', pre.text)
+
+        pre = self.xpath('pre[5]')
+        self.assertEqual('bó', pre.text)
+
+        pre = self.xpath('pre[6]')
+        self.assertEqual('ó', pre.text)
+        
+        pre = self.xpath('pre[7]')
+        self.assertEqual('çãó', pre.text)
+
+        pre = self.xpath('pre[8]')
+        self.assertEqual('ãó', pre.text)
+
+        pre = self.xpath('pre[9]')
+        self.assertEqual('ó', pre.text)
+
+        pre = self.xpath('pre[10]')
+        self.assertEqual('', pre.text)
+
+        pre = self.xpath('pre[11]')
+        self.assertEqual('', pre.text)
+
+        
+    def test_2(self):
+        """Test case: strlen (2)"""
+        self.start('Utf8/strlens')
+
+        pre = self.xpath('pre[1]')
+        self.assertEqual('3', pre.text)
+
+        pre = self.xpath('pre[2]')
+        self.assertEqual('3', pre.text)
+
+        pre = self.xpath('pre[3]')
+        self.assertEqual('3', pre.text)
+
+        pre = self.xpath('pre[4]')
+        self.assertEqual('3', pre.text)
+
+        pre = self.xpath('pre[5]')
+        self.assertEqual('1', pre.text)
+
+        pre = self.xpath('pre[6]')
+        self.assertEqual('1', pre.text)
+
+        pre = self.xpath('pre[7]')
+        self.assertEqual('0', pre.text)
+        
+        pre = self.xpath('pre[8]')
+        self.assertEqual('1', pre.text)
+        
+        pre = self.xpath('pre[9]')
+        self.assertEqual('1', pre.text)
+
+        pre = self.xpath('pre[10]')
+        self.assertEqual('1', pre.text)
+
+        pre = self.xpath('pre[11]')
+        self.assertEqual('6', pre.text)
+
+        pre = self.xpath('pre[12]')
+        self.assertEqual('2', pre.text)
+
+        pre = self.xpath('pre[13]')
+        self.assertEqual('14', pre.text)
+
+        
+    def test_3(self):
+        """Test case: strlenGe (3)"""
+        self.start('Utf8/strlenGens')
+        
+        pre = self.xpath('pre[1]')
+        self.assertEqual('False', pre.text)
+
+        pre = self.xpath('pre[2]')
+        self.assertEqual('True', pre.text)
+
+        pre = self.xpath('pre[3]')
+        self.assertEqual('False', pre.text)
+
+        pre = self.xpath('pre[4]')
+        self.assertEqual('True', pre.text)
+
+        pre = self.xpath('pre[5]')
+        self.assertEqual('True', pre.text)
+
+        pre = self.xpath('pre[6]')
+        self.assertEqual('False', pre.text)
+
+        pre = self.xpath('pre[7]')
+        self.assertEqual('True', pre.text)
+
+        pre = self.xpath('pre[8]')
+        self.assertEqual('True', pre.text)
+
+    def test_4(self):
+        """Test case: strcat (4)"""
+        self.start('Utf8/strcats')
+        
+        pre = self.xpath('pre[1]')
+        self.assertEqual('', pre.text)
+        
+        pre = self.xpath('pre[2]')
+        self.assertEqual('0', pre.text)
+
+        pre = self.xpath('pre[3]')
+        self.assertEqual('aabb', pre.text)
+        
+        pre = self.xpath('pre[4]')
+        self.assertEqual('4', pre.text)
+        
+        pre = self.xpath('pre[5]')
+        self.assertEqual('bb', pre.text)
+        
+        pre = self.xpath('pre[6]')
+        self.assertEqual('2', pre.text)
+        
+        pre = self.xpath('pre[7]')
+        self.assertEqual('aa', pre.text)
+        
+        pre = self.xpath('pre[8]')
+        self.assertEqual('2', pre.text)
+        
+        pre = self.xpath('pre[9]')
+        self.assertEqual('ààáá', pre.text)
+        
+        pre = self.xpath('pre[10]')
+        self.assertEqual('4', pre.text)
+        
+        pre = self.xpath('pre[11]')
+        self.assertEqual('áá', pre.text)
+        
+        pre = self.xpath('pre[12]')
+        self.assertEqual('2', pre.text)
+        
+        pre = self.xpath('pre[13]')
+        self.assertEqual('àà', pre.text)
+        
+        pre = self.xpath('pre[14]')
+        self.assertEqual('2', pre.text)
+
+    def test_5(self):
+        """Test case: strsub (5)"""
+        self.start('Utf8/strsubs')
+
+        pre = self.xpath('pre[1]')
+        self.assertEqual('a', pre.text)
+
+        pre = self.xpath('pre[2]')
+        self.assertEqual('b', pre.text)
+
+        pre = self.xpath('pre[3]')
+        self.assertEqual('à', pre.text)
+
+        pre = self.xpath('pre[4]')
+        self.assertEqual('ç', pre.text)
+
+    def test_6(self):
+        """Test case: strsuffix (6)"""
+        self.start('Utf8/strsuffixs')
+
+        pre = self.xpath('pre[1]')
+        self.assertEqual('abàç', pre.text)
+
+        pre = self.xpath('pre[2]')
+        self.assertEqual('bàç', pre.text)
+        
+        pre = self.xpath('pre[3]')
+        self.assertEqual('àç', pre.text)
+        
+        pre = self.xpath('pre[4]')
+        self.assertEqual('ç', pre.text)
+
+    def test_7(self):
+        """Test case: strchr (7)"""
+        self.start('Utf8/strchrs')
+
+        pre = self.xpath('pre[1]')
+        self.assertEqual('None', pre.text)
+
+        pre = self.xpath('pre[2]')
+        self.assertEqual('Some "bàç"', pre.text)
+        
+        pre = self.xpath('pre[3]')
+        self.assertEqual('Some "àç"', pre.text)
+
+        pre = self.xpath('pre[4]')
+        self.assertEqual('Some "ç"', pre.text)
+
+        pre = self.xpath('pre[5]')
+        self.assertEqual('Some ""', pre.text)
+        
+    def test_8(self):
+        """Test case: strindex (8)"""
+        self.start('Utf8/strindexs')
+        
+        pre = self.xpath('pre[1]')
+        self.assertEqual('None', pre.text)
+
+        pre = self.xpath('pre[2]')
+        self.assertEqual('Some 0', pre.text)
+        
+        pre = self.xpath('pre[3]')
+        self.assertEqual('Some 1', pre.text)
+
+        pre = self.xpath('pre[4]')
+        self.assertEqual('Some 2', pre.text)
+
+        pre = self.xpath('pre[5]')
+        self.assertEqual('Some 3', pre.text)
+
+    def test_9(self):
+        """Test case: strsindex (9)"""
+        self.start('Utf8/strsindexs')
+
+        pre = self.xpath('pre[1]')
+        # mirrors the behavior of the C strstr function
+        self.assertEqual('Some 0', pre.text)
+        
+        pre = self.xpath('pre[2]')
+        self.assertEqual('Some 0', pre.text)
+        
+        pre = self.xpath('pre[3]')
+        self.assertEqual('None', pre.text)
+        
+        pre = self.xpath('pre[4]')
+        self.assertEqual('Some 1', pre.text)
+
+        pre = self.xpath('pre[5]')
+        self.assertEqual('None', pre.text)
+        
+        pre = self.xpath('pre[6]')
+        self.assertEqual('Some 2', pre.text)
+
+        pre = self.xpath('pre[7]')
+        self.assertEqual('None', pre.text)
+        
+        pre = self.xpath('pre[8]')
+        self.assertEqual('None', pre.text)
+
+        pre = self.xpath('pre[9]')
+        self.assertEqual('Some 3', pre.text)
+
+    def test_10(self):
+        """Test case: strcspn (10)"""
+        self.start('Utf8/strcspns')
+
+        pre = self.xpath('pre[1]')
+        self.assertEqual('4', pre.text)
+
+        pre = self.xpath('pre[2]')
+        self.assertEqual('0', pre.text)
+        
+        pre = self.xpath('pre[3]')
+        self.assertEqual('0', pre.text)
+        
+        pre = self.xpath('pre[4]')
+        self.assertEqual('1', pre.text)
+
+        pre = self.xpath('pre[5]')
+        self.assertEqual('2', pre.text)
+
+        pre = self.xpath('pre[6]')
+        self.assertEqual('3', pre.text)
+
+    def test_11(self):
+        """Test case: str1 (11)"""
+        self.start('Utf8/str1s')
+
+        pre = self.xpath('pre[1]')
+        self.assertEqual('a', pre.text)
+
+        pre = self.xpath('pre[2]')
+        self.assertEqual('à', pre.text)
+
+        pre = self.xpath('pre[3]')
+        self.assertEqual('á', pre.text)
+
+    def test_12(self):
+        """Test case: isalnum (12)"""
+        self.start('Utf8/isalnums')
+                               
+        for idx in range(1, 9):
+            pre = self.xpath('pre[' + str(idx) + ']')
+            self.assertEqual('True', pre.text, 'Failed isalnum: assert ' + str(idx))
+        
+    def test_13(self):
+        """Test case: isalpha (13)"""
+        self.start('Utf8/isalphas')
+                       
+        for idx in range(1, 9):
+            pre = self.xpath('pre[' + str(idx) + ']')
+            self.assertEqual('True', pre.text, 'Failed isalpha: assert ' + str(idx))
+        
+    def test_14(self):
+        """Test case: isblank (14)"""
+        self.start('Utf8/isblanks')
+               
+        for idx in range(1, 11):
+            pre = self.xpath('pre[' + str(idx) + ']')
+            self.assertEqual('True', pre.text, 'Failed isblank: assert ' + str(idx))
+
+    def test_15(self):
+        """Test case: iscntrl (15)"""
+        self.start('Utf8/iscntrls')
+               
+        for idx in range(1, 11):
+            pre = self.xpath('pre[' + str(idx) + ']')
+            self.assertEqual('True', pre.text, 'Failed iscntrl: assert ' + str(idx))
+        
+    def test_16(self):
+        """Test case: isdigit (16)"""
+        self.start('Utf8/isdigits')
+               
+        for idx in range(1, 11):
+            pre = self.xpath('pre[' + str(idx) + ']')
+            self.assertEqual('True', pre.text, 'Failed isdigit: assert ' + str(idx))
+
+        
+    def test_17(self):
+        """Test case: isgraph (17)"""
+        self.start('Utf8/isgraphs')
+        
+        for idx in range(1, 11):
+            pre = self.xpath('pre[' + str(idx) + ']')
+            self.assertEqual('True', pre.text, 'Failed isgraph: assert ' + str(idx))
+    
+    def test_18(self):
+        """Test case: islower (18)"""
+        self.start('Utf8/islowers')
+        
+        for idx in range(1, 11):
+            pre = self.xpath('pre[' + str(idx) + ']')
+            self.assertEqual('True', pre.text, 'Failed islower: assert ' + str(idx))
+        
+    def test_19(self):
+        """Test case: isprint (19)"""
+        self.start('Utf8/isprints')
+
+        for idx in range(1, 11):
+            pre = self.xpath('pre[' + str(idx) + ']')
+            self.assertEqual('True', pre.text, 'Failed isprint: assert ' + str(idx))
+        
+    def test_20(self):
+        """Test case: ispunct (20)"""
+        self.start('Utf8/ispuncts')
+
+        for idx in range(1, 11):
+            pre = self.xpath('pre[' + str(idx) + ']')
+            self.assertEqual('True', pre.text, 'Failed ispunct: assert ' + str(idx))
+        
+    def test_21(self):
+        """Test case: isspace (21)"""
+        self.start('Utf8/isspaces')
+
+        for idx in range(1, 11):
+            pre = self.xpath('pre[' + str(idx) + ']')
+            self.assertEqual('True', pre.text, 'Failed isspace: assert ' + str(idx))
+
+    def test_22(self):
+        """Test case: isupper (22)"""
+        self.start('Utf8/isuppers')
+
+        for idx in range(1, 11):
+            pre = self.xpath('pre[' + str(idx) + ']')
+            self.assertEqual('True', pre.text, 'Failed isupper: assert ' + str(idx))
+
+    def test_23(self):
+        """Test case: isxdigit (23)"""
+        self.start('Utf8/isxdigits')
+
+        for idx in range(1, 11):
+            pre = self.xpath('pre[' + str(idx) + ']')
+            self.assertEqual('True', pre.text, 'Failed isxdigit: assert ' + str(idx))
+
+    def test_24(self):
+        """Test case: toupper (24)"""
+        self.start('Utf8/touppers')
+
+        for idx in range(1, 6):
+            pre = self.xpath('pre[' + str(idx) + ']')
+            self.assertEqual('True', pre.text, 'Failed toupper: assert ' + str(idx))
+
+    def test_25(self):
+        """Test case: ord (25)"""
+        self.start('Utf8/ord_and_chrs')
+
+        for idx in range(1, 8):
+            pre = self.xpath('pre[' + str(idx) + ']')
+            self.assertEqual('True', pre.text, 'Failed ord: assert ' + str(idx))
+
+    def test_26 (self):
+        """Test case: test_db (26) """
+        self.start('Utf8/test_db')
+
+        pre = self.xpath('pre[1]')
+        self.assertEqual('abc', pre.text)
+
+        pre = self.xpath('pre[2]')
+        self.assertEqual('3', pre.text)
+
+        pre = self.xpath('pre[3]')
+        self.assertEqual('çãó', pre.text)
+
+        pre = self.xpath('pre[4]')
+        self.assertEqual('3', pre.text)
+
+        pre = self.xpath('pre[5]')
+        self.assertEqual('が', pre.text)
+
+        pre = self.xpath('pre[6]')
+        self.assertEqual('1', pre.text)
+
+        pre = self.xpath('pre[7]')
+        self.assertEqual('漢', pre.text)
+
+        pre = self.xpath('pre[8]')
+        self.assertEqual('1', pre.text)
+
+        pre = self.xpath('pre[9]')
+        self.assertEqual('カ', pre.text)
+
+        pre = self.xpath('pre[10]')
+        self.assertEqual('1', pre.text)
+
+        pre = self.xpath('pre[11]')
+        self.assertEqual('وظيفية', pre.text)
+
+        pre = self.xpath('pre[12]')
+        self.assertEqual('6', pre.text)
diff --git a/tests/utf8.ur b/tests/utf8.ur
new file mode 100644
index 00000000..0dedc726
--- /dev/null
+++ b/tests/utf8.ur
@@ -0,0 +1,431 @@
+fun substrings () : transaction page = return <xml>
+  <body>
+    <pre>{[substring "abc" 0 3]}</pre>   
+    <pre>{[substring "abc" 1 2]}</pre>
+    <pre>{[substring "abc" 2 1]}</pre>
+    <pre>{[substring "ábó" 0 3]}</pre>    
+    <pre>{[substring "ábó" 1 2]}</pre>
+    <pre>{[substring "ábó" 2 1]}</pre>    
+    <pre>{[substring "çãó" 0 3]}</pre>
+    <pre>{[substring "çãó" 1 2]}</pre>
+    <pre>{[substring "çãó" 2 1]}</pre>
+    <pre>{[substring "çãó" 2 0]}</pre>
+    <pre>{[substring "" 0 0]}</pre>
+  </body>
+</xml>
+
+fun strlens () : transaction page = return <xml>
+  <body>
+    <pre>{[strlen "abc"]}</pre>
+    <pre>{[strlen "çbc"]}</pre>
+    <pre>{[strlen "çãc"]}</pre>
+    <pre>{[strlen "çãó"]}</pre>
+    <pre>{[strlen "ç"]}</pre>
+    <pre>{[strlen "c"]}</pre>
+    <pre>{[strlen ""]}</pre>
+    <pre>{[strlen "が"]}</pre>
+    <pre>{[strlen "漢"]}</pre>
+    <pre>{[strlen "カ"]}</pre>
+    <pre>{[strlen "وظيفية"]}</pre>
+    <pre>{[strlen "函數"]}</pre>
+    <pre>{[strlen "Функциональное"]}</pre>
+  </body>
+  </xml>
+				       
+fun strlenGens () : transaction page = return <xml>
+  <body>
+    <pre>{[strlenGe "" 1]}</pre>
+    <pre>{[strlenGe "" 0]}</pre>
+    <pre>{[strlenGe "aba" 4]}</pre>
+    <pre>{[strlenGe "aba" 3]}</pre>
+    <pre>{[strlenGe "aba" 2]}</pre>
+    <pre>{[strlenGe "áçà" 4]}</pre>
+    <pre>{[strlenGe "áçà" 3]}</pre>
+    <pre>{[strlenGe "áçà" 2]}</pre>
+    
+  </body>
+  </xml>
+
+fun strcats () : transaction page =
+    let
+	fun catAndLen a b =
+	    <xml>
+	      <pre>{[strcat a b]}</pre>
+	      <pre>{[strlen (strcat a b)]}</pre>
+	    </xml>
+    in
+	return <xml>
+	  <body>
+	    {catAndLen "" ""}
+	    {catAndLen "aa" "bb"}
+	    {catAndLen "" "bb"}
+	    {catAndLen "aa" ""}
+	    {catAndLen "àà" "áá"}
+	    {catAndLen "" "áá"}
+	    {catAndLen "àà" ""}	    
+	  </body>
+	</xml>
+end
+
+fun strsubs () : transaction page =
+    return <xml>
+      <body>
+	<pre>{[strsub "abàç" 0]}</pre>
+	<pre>{[strsub "abàç" 1]}</pre>
+	<pre>{[strsub "abàç" 2]}</pre>
+	<pre>{[strsub "abàç" 3]}</pre>
+      </body>
+      </xml>
+
+fun strsuffixs () : transaction page =
+    return <xml>
+      <body>
+	<pre>{[strsuffix "abàç" 0]}</pre>
+	<pre>{[strsuffix "abàç" 1]}</pre>
+	<pre>{[strsuffix "abàç" 2]}</pre>
+	<pre>{[strsuffix "abàç" 3]}</pre>
+      </body>
+    </xml>
+
+fun strchrs () : transaction page =
+    let
+	fun optToStr ms =
+	    case ms of
+		None => "None"
+	      | Some s => "Some \"" ^ s ^ "\""
+
+    in
+	return <xml>
+	  <body>
+	    <pre>{[optToStr (strchr "abàç" #"c")]}</pre>
+	    <pre>{[optToStr (strchr "abàç" #"a")]}</pre>
+	    <pre>{[optToStr (strchr "abàç" #"b")]}</pre>
+	    <pre>{[optToStr (strchr "abàç" (strsub "à" 0))]}</pre>
+	    <pre>{[optToStr (strchr "abàç" (strsub "ç" 0))]}</pre>
+	  </body>
+	</xml>
+    end
+
+fun strindexs () : transaction page =
+    let
+	fun optToStr ms =
+	    case ms of
+		None => "None"
+	      | Some s => "Some " ^ (show s)
+
+    in
+	return <xml>
+	  <body>
+	    <pre>{[optToStr (strindex "abàç" #"c")]}</pre>
+	    <pre>{[optToStr (strindex "abàç" #"a")]}</pre>
+	    <pre>{[optToStr (strindex "abàç" #"b")]}</pre>
+	    <pre>{[optToStr (strindex "abàç" (strsub "à" 0))]}</pre>
+	    <pre>{[optToStr (strindex "abàç" (strsub "ç" 0))]}</pre>
+	  </body>
+	</xml>
+    end
+
+fun strsindexs () : transaction page =
+    let
+	fun optToStr ms =
+	    case ms of
+		None => "None"
+	      | Some s => "Some " ^ (show s)
+
+    in
+	return <xml>
+	  <body>
+	    <pre>{[optToStr (strsindex "abàç" "")]}</pre>
+	    <pre>{[optToStr (strsindex "abàç" "abàç")]}</pre>
+	    <pre>{[optToStr (strsindex "abàç" "abàc")]}</pre>
+	    <pre>{[optToStr (strsindex "abàç" "bàç")]}</pre>
+	    <pre>{[optToStr (strsindex "abàç" "bàc")]}</pre>
+	    <pre>{[optToStr (strsindex "abàç" "àç")]}</pre>
+	    <pre>{[optToStr (strsindex "abàç" "àc")]}</pre>
+	    <pre>{[optToStr (strsindex "abàç" "ac")]}</pre>
+	    <pre>{[optToStr (strsindex "abàç" "ç")]}</pre>
+	  </body>
+	</xml>
+    end
+	
+fun strcspns () : transaction page =
+    return <xml>
+      <body>
+	<pre>{[strcspn "abàç" ""]}</pre>
+	<pre>{[strcspn "abàç" "abàç"]}</pre>
+	<pre>{[strcspn "abàç" "a"]}</pre>
+	<pre>{[strcspn "abàç" "bàç"]}</pre>
+	<pre>{[strcspn "abàç" "àç"]}</pre>
+	<pre>{[strcspn "abàç" "ç"]}</pre>
+      </body>
+      </xml>
+
+fun str1s () : transaction page = return <xml>
+  <body>
+    <pre>{[str1 #"a"]}</pre>
+    <pre>{[str1 (strsub "à" 0)]}</pre>
+    <pre>{[str1 (strsub "aá" 1)]}</pre>
+  </body>
+  </xml>
+
+fun isalnums () : transaction page = return <xml>
+  <body>
+    <pre>{[isalnum #"a"]}</pre>
+    <pre>{[isalnum (strsub "à" 0)]}</pre>
+    <pre>{[isalnum #"A"]}</pre>
+    <pre>{[isalnum (strsub "À" 0)]}</pre>
+    <pre>{[isalnum #"1"]}</pre>
+    <pre>{[not (isalnum #"!")]}</pre>
+    <pre>{[not (isalnum #"#")]}</pre>
+    <pre>{[not (isalnum #" ")]}</pre>
+  </body>
+</xml>
+
+fun isalphas () : transaction page = return <xml>
+  <body>
+    <pre>{[isalpha #"a"]}</pre>
+    <pre>{[isalpha (strsub "à" 0)]}</pre>
+    <pre>{[isalpha #"A"]}</pre>
+    <pre>{[isalpha (strsub "À" 0)]}</pre>
+    <pre>{[not (isalpha #"1")]}</pre>
+    <pre>{[not (isalpha #"!")]}</pre>
+    <pre>{[not (isalpha #"#")]}</pre>
+    <pre>{[not (isalpha #" ")]}</pre>
+  </body>
+</xml>
+
+fun isblanks () : transaction page = 
+    return <xml>
+      <body>
+	<pre>{[not (isblank #"a")]}</pre>
+	<pre>{[not (isblank (strsub "à" 0))]}</pre>
+	<pre>{[not (isblank #"A")]}</pre>
+	<pre>{[not (isblank (strsub "À" 0))]}</pre>
+	<pre>{[not (isblank #"1")]}</pre>
+	<pre>{[not (isblank #"!")]}</pre>
+	<pre>{[not (isblank #"#")]}</pre>
+	<pre>{[isblank #" "]}</pre>
+	<pre>{[isblank #"\t"]}</pre>
+	<pre>{[not (isblank #"\n")]}</pre>
+      </body>
+    </xml>
+
+fun iscntrls () : transaction page =
+    return <xml>
+      <body>
+	<pre>{[not (iscntrl #"a")]}</pre>
+	<pre>{[not (iscntrl (strsub "à" 0))]}</pre>
+	<pre>{[not (iscntrl #"A")]}</pre>
+	<pre>{[not (iscntrl (strsub "À" 0))]}</pre>
+	<pre>{[not (iscntrl #"1")]}</pre>
+	<pre>{[not (iscntrl #"!")]}</pre>
+	<pre>{[not (iscntrl #"#")]}</pre>
+	<pre>{[not (iscntrl #" ")]}</pre>
+	<pre>{[iscntrl #"\t"]}</pre>
+	<pre>{[iscntrl #"\n"]}</pre>
+      </body>
+      </xml>
+
+fun isdigits () : transaction page =
+    return <xml>
+      <body>
+	<pre>{[not (isdigit #"a")]}</pre>
+	<pre>{[not (isdigit (strsub "à" 0))]}</pre>
+	<pre>{[not (isdigit #"A")]}</pre>
+	<pre>{[not (isdigit (strsub "À" 0))]}</pre>
+	<pre>{[isdigit #"1"]}</pre>
+	<pre>{[not (isdigit #"!")]}</pre>
+	<pre>{[not (isdigit #"#")]}</pre>
+	<pre>{[not (isdigit #" ")]}</pre>
+	<pre>{[not (isdigit #"\t")]}</pre>
+	<pre>{[not (isdigit #"\n")]}</pre>
+      </body>
+      </xml>
+
+fun isgraphs () : transaction page =
+    return <xml>
+      <body>
+	<pre>{[isgraph #"a"]}</pre>
+	<pre>{[isgraph (strsub "à" 0)]}</pre>
+	<pre>{[isgraph #"A"]}</pre>
+	<pre>{[isgraph (strsub "À" 0)]}</pre>
+	<pre>{[isgraph #"1"]}</pre>
+	<pre>{[isgraph #"!"]}</pre>
+	<pre>{[isgraph #"#"]}</pre>
+	<pre>{[not (isgraph #" ")]}</pre>
+	<pre>{[not (isgraph #"\t")]}</pre>
+	<pre>{[not (isgraph #"\n")]}</pre>
+      </body>
+      </xml>
+
+fun islowers () : transaction page =
+    return <xml>
+      <body>
+	<pre>{[islower #"a"]}</pre>
+	<pre>{[islower (strsub "à" 0)]}</pre>
+	<pre>{[not (islower #"A")]}</pre>
+	<pre>{[not (islower (strsub "À" 0))]}</pre>
+	<pre>{[not (islower #"1")]}</pre>
+	<pre>{[not (islower #"!")]}</pre>
+	<pre>{[not (islower #"#")]}</pre>
+	<pre>{[not (islower #" ")]}</pre>
+	<pre>{[not (islower #"\t")]}</pre>
+	<pre>{[not (islower #"\n")]}</pre>
+      </body>
+      </xml>
+
+fun isprints () : transaction page =
+    return <xml>
+      <body>
+	<pre>{[isprint #"a"]}</pre>
+	<pre>{[isprint (strsub "à" 0)]}</pre>
+	<pre>{[isprint #"A"]}</pre>
+	<pre>{[isprint (strsub "À" 0)]}</pre>
+	<pre>{[isprint #"1"]}</pre>
+	<pre>{[isprint #"!"]}</pre>
+	<pre>{[isprint #"#"]}</pre>
+	<pre>{[isprint #" "]}</pre>
+	<pre>{[not (isprint #"\t")]}</pre>
+	<pre>{[not (isprint #"\n")]}</pre>
+      </body>
+      </xml>
+
+fun ispuncts () : transaction page =
+    return <xml>
+      <body>
+	<pre>{[not (ispunct #"a")]}</pre>
+	<pre>{[not (ispunct (strsub "à" 0))]}</pre>
+	<pre>{[not (ispunct #"A")]}</pre>
+	<pre>{[not (ispunct (strsub "À" 0))]}</pre>
+	<pre>{[not (ispunct #"1")]}</pre>
+	<pre>{[ispunct #"!"]}</pre>
+	<pre>{[ispunct #"#"]}</pre>
+	<pre>{[not (ispunct #" ")]}</pre>
+	<pre>{[not (ispunct #"\t")]}</pre>
+	<pre>{[not (ispunct #"\n")]}</pre>
+      </body>
+      </xml>
+
+fun isspaces () : transaction page =
+    return <xml>
+      <body>
+	<pre>{[not (isspace #"a")]}</pre>
+	<pre>{[not (isspace (strsub "à" 0))]}</pre>
+	<pre>{[not (isspace #"A")]}</pre>
+	<pre>{[not (isspace (strsub "À" 0))]}</pre>
+	<pre>{[not (isspace #"1")]}</pre>
+	<pre>{[not (isspace #"!")]}</pre>
+	<pre>{[not (isspace #"#")]}</pre>
+	<pre>{[isspace #" "]}</pre>
+	<pre>{[isspace #"\t"]}</pre>
+	<pre>{[isspace #"\n"]}</pre>
+      </body>
+    </xml>
+    
+fun isuppers () : transaction page =
+    return <xml>
+      <body>
+	<pre>{[not (isupper #"a")]}</pre>
+	<pre>{[not (isupper (strsub "à" 0))]}</pre>
+	<pre>{[isupper #"A"]}</pre>
+	<pre>{[isupper (strsub "À" 0)]}</pre>
+	<pre>{[not (isupper #"1")]}</pre>
+	<pre>{[not (isupper #"!")]}</pre>
+	<pre>{[not (isupper #"#")]}</pre>
+	<pre>{[not (isupper #" ")]}</pre>
+	<pre>{[not (isupper #"\t")]}</pre>
+	<pre>{[not (isupper #"\n")]}</pre>
+      </body>
+      </xml>
+
+fun isxdigits () : transaction page =
+    return <xml>
+      <body>
+	<pre>{[isxdigit #"a"]}</pre>
+	<pre>{[not (isxdigit (strsub "à" 0))]}</pre>
+	<pre>{[isxdigit #"A"]}</pre>
+	<pre>{[not (isxdigit (strsub "À" 0))]}</pre>
+	<pre>{[isxdigit #"1"]}</pre>
+	<pre>{[not (isxdigit #"!")]}</pre>
+	<pre>{[not (isxdigit #"#")]}</pre>
+	<pre>{[not (isxdigit #" ")]}</pre>
+	<pre>{[not (isxdigit #"\t")]}</pre>
+	<pre>{[not (isxdigit #"\n")]}</pre>
+      </body>
+      </xml>
+
+fun tolowers () : transaction page =
+    return <xml>
+      <body>
+	<pre>{[tolower #"A" = #"a"]}</pre>
+	<pre>{[tolower #"a" = #"a"]}</pre>
+	<pre>{[tolower (strsub "á" 0) = (strsub "á" 0)]}</pre>
+	<pre>{[tolower (strsub "Á" 0) = (strsub "á" 0)]}</pre>
+	<pre>{[tolower #"1" = #"1"]}</pre>
+      </body>
+    </xml>
+    
+fun touppers () : transaction page =
+    return <xml>
+      <body>
+	<pre>{[toupper #"A" = #"A"]}</pre>
+	<pre>{[toupper #"a" = #"A"]}</pre>
+	<pre>{[toupper (strsub "á" 0) = (strsub "Á" 0)]}</pre>
+	<pre>{[toupper (strsub "Á" 0) = (strsub "Á" 0)]}</pre>
+	<pre>{[toupper #"1" = #"1"]}</pre>
+      </body>
+      </xml>
+
+fun ord_and_chrs () : transaction page =
+    return <xml>
+      <body>
+	<pre>{[chr (ord #"A") = #"A"]}</pre>
+	<pre>{[chr (ord #"a") = #"a"]}</pre>
+	<pre>{[chr (ord (strsub "á" 0)) = (strsub "á" 0)]}</pre>
+	<pre>{[chr (ord (strsub "Á" 0)) = (strsub "Á" 0)]}</pre>
+	<pre>{[chr (ord #"1") = #"1"]}</pre>
+	<pre>{[chr (ord #"\n") = #"\n"]}</pre>
+	<pre>{[chr (ord (strsub "が" 0)) = (strsub "が" 0)]}</pre>
+	<pre>{[chr (ord (strsub "漢" 0)) = (strsub "漢" 0)]}</pre>
+	<pre>{[chr (ord (strsub "カ" 0)) = (strsub "カ" 0)]}</pre>
+      </body>
+      </xml>
+
+table t : { Id : int, Text : string }
+
+
+fun test_db () : transaction page =
+    dml (INSERT INTO t (Id, Text) VALUES({[1]}, {["abc"]}));
+    t1 <- oneRow (SELECT t.Text FROM t WHERE t.Id = 1);
+
+    dml (INSERT INTO t (Id, Text) VALUES({[2]}, {["çãó"]}));
+    t2 <- oneRow (SELECT t.Text FROM t WHERE t.Id = 2);
+
+    dml (INSERT INTO t (Id, Text) VALUES({[3]}, {["が"]}));
+    t3 <- oneRow (SELECT t.Text FROM t WHERE t.Id = 3);
+
+    dml (INSERT INTO t (Id, Text) VALUES({[4]}, {["漢"]}));
+    t4 <- oneRow (SELECT t.Text FROM t WHERE t.Id = 4);
+
+    dml (INSERT INTO t (Id, Text) VALUES({[5]}, {["カ"]}));
+    t5 <- oneRow (SELECT t.Text FROM t WHERE t.Id = 5);
+
+    dml (INSERT INTO t (Id, Text) VALUES({[6]}, {["وظيفية"]}));
+    t6 <- oneRow (SELECT t.Text FROM t WHERE t.Id = 6);
+
+    return <xml>
+      <body>
+	<pre>{[t1.T.Text]}</pre>
+	<pre>{[strlen t1.T.Text]}</pre>
+	<pre>{[t2.T.Text]}</pre>
+	<pre>{[strlen t2.T.Text]}</pre>
+	<pre>{[t3.T.Text]}</pre>
+	<pre>{[strlen t3.T.Text]}</pre>
+	<pre>{[t4.T.Text]}</pre>
+	<pre>{[strlen t4.T.Text]}</pre>
+	<pre>{[t5.T.Text]}</pre>
+	<pre>{[strlen t5.T.Text]}</pre>
+	<pre>{[t6.T.Text]}</pre>
+	<pre>{[strlen t6.T.Text]}</pre>
+      </body>
+      </xml>
diff --git a/tests/utf8.urp b/tests/utf8.urp
new file mode 100644
index 00000000..9b3067af
--- /dev/null
+++ b/tests/utf8.urp
@@ -0,0 +1,5 @@
+database dbname=utf8
+sql utf8.sql
+safeGet Utf8/test_db
+
+utf8
\ No newline at end of file
-- 
cgit v1.2.3


From 5cc729b48aad084757a049b7e5cdbadae5e9e400 Mon Sep 17 00:00:00 2001
From: fab <fabrice.leal.ch@gmail.com>
Date: Fri, 30 Nov 2018 23:29:14 +0000
Subject: reject invalid codepoints. Basis.iscodepoint. fix german char in js

---
 include/urweb/urweb_cpp.h |   5 +-
 lib/js/urweb.js           |   7 +-
 lib/ur/basis.urs          |   2 +
 src/c/urweb.c             | 265 +++++++++++++++++++++++++++-------------------
 4 files changed, 168 insertions(+), 111 deletions(-)

(limited to 'include/urweb')

diff --git a/include/urweb/urweb_cpp.h b/include/urweb/urweb_cpp.h
index 5f1144b8..25f97fb3 100644
--- a/include/urweb/urweb_cpp.h
+++ b/include/urweb/urweb_cpp.h
@@ -103,7 +103,7 @@ char *uw_Basis_htmlifyFloat(struct uw_context *, uw_Basis_float);
 char *uw_Basis_htmlifyString(struct uw_context *, uw_Basis_string);
 char *uw_Basis_htmlifyBool(struct uw_context *, uw_Basis_bool);
 char *uw_Basis_htmlifyTime(struct uw_context *, uw_Basis_time);
-char *uw_Basis_htmlifySpecialChar(struct uw_context *, unsigned char);
+char *uw_Basis_htmlifySpecialChar(struct uw_context *, uw_Basis_char);
 char *uw_Basis_htmlifySource(struct uw_context *, uw_Basis_source);
 
 uw_unit uw_Basis_htmlifyInt_w(struct uw_context *, uw_Basis_int);
@@ -111,7 +111,7 @@ uw_unit uw_Basis_htmlifyFloat_w(struct uw_context *, uw_Basis_float);
 uw_unit uw_Basis_htmlifyString_w(struct uw_context *, uw_Basis_string);
 uw_unit uw_Basis_htmlifyBool_w(struct uw_context *, uw_Basis_bool);
 uw_unit uw_Basis_htmlifyTime_w(struct uw_context *, uw_Basis_time);
-uw_unit uw_Basis_htmlifySpecialChar_w(struct uw_context *, unsigned char);
+uw_unit uw_Basis_htmlifySpecialChar_w(struct uw_context *, uw_Basis_char);
 uw_unit uw_Basis_htmlifySource_w(struct uw_context *, uw_Basis_source);
 
 char *uw_Basis_attrifyInt(struct uw_context *, uw_Basis_int);
@@ -327,6 +327,7 @@ uw_Basis_bool uw_Basis_isxdigit(struct uw_context *, uw_Basis_char);
 uw_Basis_char uw_Basis_tolower(struct uw_context *, uw_Basis_char);
 uw_Basis_char uw_Basis_toupper(struct uw_context *, uw_Basis_char);
 
+uw_Basis_bool uw_Basis_iscodepoint(struct uw_context *, uw_Basis_int);
 uw_Basis_int uw_Basis_ord(struct uw_context *, uw_Basis_char);
 uw_Basis_char uw_Basis_chr(struct uw_context *, uw_Basis_int);
 
diff --git a/lib/js/urweb.js b/lib/js/urweb.js
index de1a2ad0..c7725e28 100644
--- a/lib/js/urweb.js
+++ b/lib/js/urweb.js
@@ -38,7 +38,12 @@ function isXdigit(c) { return isDigit(c) || (c >= 'a' && c <= 'f') || (c >= 'A'
 function ord(c) { return c.charCodeAt(0); }
 function isPrint(c) { return ord(c) > 31 && ord(c) != 127; }
 function toLower(c) { return c.toLowerCase(); }
-function toUpper(c) { return c.toUpperCase(); }
+function toUpper(c) {
+    if (ord(c) == 223)
+	return c;
+    else
+	return c.toUpperCase();
+}
 
 // Lists
 
diff --git a/lib/ur/basis.urs b/lib/ur/basis.urs
index 878f2793..c9d6556b 100644
--- a/lib/ur/basis.urs
+++ b/lib/ur/basis.urs
@@ -79,6 +79,8 @@ val toupper : char -> char
 val ord : char -> int
 val chr : int -> char
 
+val iscodepoint : int -> bool
+
 (** String operations *)
 
 val strlen : string -> int
diff --git a/src/c/urweb.c b/src/c/urweb.c
index be65afcc..195ddada 100644
--- a/src/c/urweb.c
+++ b/src/c/urweb.c
@@ -1559,101 +1559,89 @@ const char *uw_Basis_get_settings(uw_context ctx, uw_unit u) {
   }
 }
 
-uw_Basis_string uw_Basis_jsifyString(uw_context ctx, uw_Basis_string s) {
-  char *r, *s2;
-
-  uw_check_heap(ctx, strlen(s) * 4 + 3);
-
-  r = s2 = ctx->heap.front;
-  *s2++ = '"';
-
-  for (; *s; s++) {
-    unsigned char c = *s;
-
-    switch (c) {
-    case '"':
-      strcpy(s2, "\\\"");
-      s2 += 2;
-      break;
-    case '\'':
-      strcpy(s2, "\\047");
-      s2 += 4;
-      break;
-    case '\\':
-      strcpy(s2, "\\\\");
-      s2 += 2;
-      break;
-    case '<':
-      strcpy(s2, "\\074");
-      s2 += 4;
-      break;
-    case '&':
-      strcpy(s2, "\\046");
-      s2 += 4;
-      break;
-    default:
-      if (isprint((int)c) || c >= 128)
-        *s2++ = c;
-      else {
-        sprintf(s2, "\\%03o", c);
-        s2 += 4;
-      }
-    }
-  }
-
-  strcpy(s2, "\"");
-  ctx->heap.front = s2 + 2;
-  return r;
-}
-
 uw_Basis_bool uw_Basis_isprint(uw_context ctx, uw_Basis_char ch);
-
-uw_Basis_string uw_Basis_jsifyChar(uw_context ctx, uw_Basis_char c1) {
-  char *r, *s2;
-
-  uw_check_heap(ctx, 7);
-
-  r = s2 = ctx->heap.front;
-  *s2++ = '"';
-
+void jsifyChar(char**buffer_ptr, uw_context ctx, uw_Basis_char c1) {
+  char* buffer = *buffer_ptr;
+  
   switch (c1) {
   case '"':
-    strcpy(s2, "\\\"");
-    s2 += 2;
+    strcpy(buffer, "\\\"");
+    buffer += 2;
     break;
   case '\'':
-    strcpy(s2, "\\047");
-    s2 += 4;
+    strcpy(buffer, "\\047");
+    buffer += 4;
     break;
   case '\\':
-    strcpy(s2, "\\\\");
-    s2 += 2;
+    strcpy(buffer, "\\\\");
+    buffer += 2;
     break;
   case '<':
-    strcpy(s2, "\\074");
-    s2 += 4;
+    strcpy(buffer, "\\074");
+    buffer += 4;
     break;
   case '&':
-    strcpy(s2, "\\046");
-    s2 += 4;
+    strcpy(buffer, "\\046");
+    buffer += 4;
     break;
   default:
     
     if (uw_Basis_isprint(ctx, c1) == uw_Basis_True)
       {
 	int offset = 0;
-	U8_APPEND_UNSAFE(s2, offset, c1);
-	s2 += offset;
+	U8_APPEND_UNSAFE(buffer, offset, c1);
+	buffer += offset;
       }
     else {
-      assert(0777 >= c1);
-      sprintf(s2, "\\%03o", (unsigned char)c1);
-      s2 += 4;
+      assert(65536 > c1); /* only BMP code points fit a 4-digit \uXXXX escape */
+      sprintf(buffer, "\\u%04x", (unsigned int)c1 & 0xffffu); /* keep all 16 bits; mask guarantees exactly 4 hex digits */
+      buffer += 6;
     }
   }
 
+ 
+  *buffer_ptr = buffer;
+}
+
+uw_Basis_string uw_Basis_jsifyString(uw_context ctx, uw_Basis_string s) {
+  char *r, *s2;
+  uw_Basis_char c;
+
+  uw_check_heap(ctx, strlen(s) * 6 + 3);
+
+  r = s2 = ctx->heap.front;
+  *s2++ = '"';
+
+  int offset = 0;
+  while(s[offset] != 0)
+    {
+      U8_NEXT(s, offset, -1, c);
+      
+      jsifyChar(&s2, ctx, c);      
+    }
+
   strcpy(s2, "\"");
   ctx->heap.front = s2 + 2;
+
+  return r;
+}
+
+uw_Basis_int uw_Basis_ord(uw_context ctx, uw_Basis_char c);
+
+uw_Basis_string uw_Basis_jsifyChar(uw_context ctx, uw_Basis_char c1) {
+  char *r, *s2;
+
+  uw_check_heap(ctx, 9); /* opening quote + up to 6 bytes of escape + closing quote + NUL */
+
+  r = s2 = ctx->heap.front;
+  
+  *s2++ = '"';
+  
+  jsifyChar(&s2, ctx, c1);
+
+  strcpy(s2, "\"");
+  ctx->heap.front = s2 + 2; /* skip the closing quote and its NUL terminator */
+
   return r;
 }
 
@@ -1697,6 +1685,7 @@ uw_Basis_string uw_Basis_jsifyString_ws(uw_context ctx, uw_Basis_string s) {
 
   strcpy(s2, "\"");
   ctx->script.front = s2 + 1;
+
   return r;
 }
 
@@ -2262,25 +2251,27 @@ uw_unit uw_Basis_htmlifyInt_w(uw_context ctx, uw_Basis_int n) {
   return uw_unit_v;
 }
 
-char *uw_Basis_htmlifySpecialChar(uw_context ctx, unsigned char ch) {
+char *uw_Basis_htmlifySpecialChar(uw_context ctx, uw_Basis_char ch) {
   unsigned int n = ch;
   int len;
   char *r;
 
-  uw_check_heap(ctx, INTS_MAX+3);
+  uw_check_heap(ctx, INTS_MAX+3 + 1);
   r = ctx->heap.front;
-  sprintf(r, "&#%u;%n", n, &len);
+  len = sprintf(r, "&#%u;", n);
   ctx->heap.front += len+1;
+
   return r;
 }
 
-uw_unit uw_Basis_htmlifySpecialChar_w(uw_context ctx, unsigned char ch) {
+uw_unit uw_Basis_htmlifySpecialChar_w(uw_context ctx, uw_Basis_char ch) {
   unsigned int n = ch;
   int len;
 
   uw_check(ctx, INTS_MAX+3);
-  sprintf(ctx->page.front, "&#%u;%n", n, &len);
+  len = sprintf(ctx->page.front, "&#%u;", n);
   ctx->page.front += len;
+
   return uw_unit_v;
 }
 
@@ -2328,48 +2319,69 @@ uw_unit uw_Basis_jsifyInt_w(uw_context ctx, uw_Basis_int n) {
 
 char *uw_Basis_htmlifyString(uw_context ctx, const char *s) {
   char *r, *s2;
+  uw_Basis_char c1;
+  int offset = 0, len = 0;
+  
+  uw_check_heap(ctx, strlen(s) * (INTS_MAX + 3) + 1);
 
-  uw_check_heap(ctx, strlen(s) * 5 + 1);
-
-  for (r = s2 = ctx->heap.front; *s; s++) {
-    unsigned char c = *s;
-
-    switch (c) {
-    case '<':
-      strcpy(s2, "&lt;");
-      s2 += 4;
-      break;
-    case '&':
-      strcpy(s2, "&amp;");
-      s2 += 5;
-      break;
-    default:
-      *s2++ = c;
+  r = s2 = ctx->heap.front;
+  
+  while (s[offset] != 0) {
+    
+    U8_NEXT(s, offset, -1, c1);
+     
+    
+    if (U8_IS_SINGLE(c1) && uw_Basis_isprint(ctx, c1)) {
+      switch (c1) {
+      case '<':
+	strcpy(s2, "&lt;");
+	s2 += 4;
+	break;
+      case '&':
+	strcpy(s2, "&amp;");
+	s2 += 5;
+	break;
+      default:
+	*s2++ = c1;	
+      }      
+    } else {
+      len = sprintf(s2, "&#%u;", c1);
+      s2 += len;
     }
   }
-
+  
   *s2++ = 0;
   ctx->heap.front = s2;
+
   return r;
 }
 
 uw_unit uw_Basis_htmlifyString_w(uw_context ctx, uw_Basis_string s) {
   uw_check(ctx, strlen(s) * 6);
-
-  for (; *s; s++) {
-    unsigned char c = *s;
-
-    switch (c) {
-    case '<':
-      uw_write_unsafe(ctx, "&lt;");
-      break;
-    case '&':
-      uw_write_unsafe(ctx, "&amp;");
-      break;
-    default:
-      uw_writec_unsafe(ctx, c);
+  int offset = 0;
+  uw_Basis_char c1;
+  
+  while(s[offset] != 0){
+
+    U8_NEXT(s, offset, -1, c1);
+ 
+    if (U8_IS_SINGLE(c1) && uw_Basis_isprint(ctx, c1)) {
+	
+      switch (c1) {
+      case '<':
+	uw_write_unsafe(ctx, "&lt;");
+	break;
+      case '&':
+	uw_write_unsafe(ctx, "&amp;");
+	break;
+      default:
+	uw_writec_unsafe(ctx, c1);
+      }
     }
-  }
+    else {
+      uw_Basis_htmlifySpecialChar_w(ctx, c1);
+    }    
+  }  
 
   return uw_unit_v;
 }
@@ -4474,9 +4486,46 @@ uw_Basis_int uw_Basis_ord(uw_context ctx, uw_Basis_char c) {
   return (uw_Basis_int)c;
 }
 
+uw_Basis_bool uw_Basis_iscodepoint (uw_context ctx, uw_Basis_int n) {
+  (void)ctx;
+  uw_Basis_char ch = (uw_Basis_char)n;
+
+  if (UCHAR_MIN_VALUE <= n && UCHAR_MAX_VALUE > n) { /* range-check n itself: ch may already be truncated */
+
+    if (U8_LENGTH(ch) == 0) {
+      return uw_Basis_False;
+    }
+
+    if (u_charType(ch) == U_UNASSIGNED) {
+      return uw_Basis_False;
+    }
+
+  } else {
+    return uw_Basis_False;
+  }
+
+  return uw_Basis_True;
+}
+
 uw_Basis_char uw_Basis_chr(uw_context ctx, uw_Basis_int n) {
   (void)ctx;
-  return (uw_Basis_char)n;
+  uw_Basis_char ch = (uw_Basis_char)n;
+
+  if (UCHAR_MIN_VALUE <= n && UCHAR_MAX_VALUE > n) { /* range-check n itself: ch may already be truncated */
+
+    if (U8_LENGTH(ch) == 0) {
+      uw_error(ctx, FATAL, "The integer %lld cannot be converted to a char", n);
+    }
+
+    if (u_charType(ch) == U_UNASSIGNED) {
+      uw_error(ctx, FATAL, "The integer %lld is not a valid char codepoint", n);
+    }
+
+  } else {
+    uw_error(ctx, FATAL, "Integer %lld out of range of unicode chars", n);
+  }
+ 
+  return ch;
 }
 
 uw_Basis_string uw_Basis_currentUrl(uw_context ctx) {
-- 
cgit v1.2.3


From 28d130c8c3c2ef9cd229d09afe14fbcbcb954223 Mon Sep 17 00:00:00 2001
From: fab <fabrice.leal.ch@gmail.com>
Date: Wed, 9 Jan 2019 22:34:53 +0000
Subject: urlifyChar needs to be added to .h file as well

---
 include/urweb/urweb_cpp.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/urweb')

diff --git a/include/urweb/urweb_cpp.h b/include/urweb/urweb_cpp.h
index 25f97fb3..25f26e1b 100644
--- a/include/urweb/urweb_cpp.h
+++ b/include/urweb/urweb_cpp.h
@@ -138,6 +138,7 @@ char *uw_Basis_urlifySource(struct uw_context *, uw_Basis_source);
 
 uw_unit uw_Basis_urlifyInt_w(struct uw_context *, uw_Basis_int);
 uw_unit uw_Basis_urlifyFloat_w(struct uw_context *, uw_Basis_float);
+uw_unit uw_Basis_urlifyChar_w(struct uw_context *, uw_Basis_char);
 uw_unit uw_Basis_urlifyString_w(struct uw_context *, uw_Basis_string);
 uw_unit uw_Basis_urlifyBool_w(struct uw_context *, uw_Basis_bool);
 uw_unit uw_Basis_urlifyTime_w(struct uw_context *, uw_Basis_time);
-- 
cgit v1.2.3


From 87d2eab53f8e9f81cc459429675123c9ff36f41e Mon Sep 17 00:00:00 2001
From: Adam Chlipala <adam@chlipala.net>
Date: Mon, 21 Jan 2019 18:09:59 -0500
Subject: Basis.textOfBlob; try creating filecache directory if it doesn't
 exist

---
 include/urweb/urweb_cpp.h |  1 +
 lib/ur/basis.urs          |  2 ++
 src/c/urweb.c             | 16 +++++++++++++++-
 src/cjr_print.sml         | 24 +++++++++++++++++++++++-
 4 files changed, 41 insertions(+), 2 deletions(-)

(limited to 'include/urweb')

diff --git a/include/urweb/urweb_cpp.h b/include/urweb/urweb_cpp.h
index 25f97fb3..67312015 100644
--- a/include/urweb/urweb_cpp.h
+++ b/include/urweb/urweb_cpp.h
@@ -262,6 +262,7 @@ uw_Basis_string uw_Basis_fileMimeType(struct uw_context *, uw_Basis_file);
 uw_Basis_blob uw_Basis_fileData(struct uw_context *, uw_Basis_file);
 uw_Basis_int uw_Basis_blobSize(struct uw_context *, uw_Basis_blob);
 uw_Basis_blob uw_Basis_textBlob(struct uw_context *, uw_Basis_string);
+uw_Basis_string uw_Basis_textOfBlob(struct uw_context *, uw_Basis_blob);
 
 uw_Basis_string uw_Basis_postType(struct uw_context *, uw_Basis_postBody);
 uw_Basis_string uw_Basis_postData(struct uw_context *, uw_Basis_postBody);
diff --git a/lib/ur/basis.urs b/lib/ur/basis.urs
index c893e65d..be13c684 100644
--- a/lib/ur/basis.urs
+++ b/lib/ur/basis.urs
@@ -1019,6 +1019,8 @@ val checkMime : string -> option mimeType
 val returnBlob : t ::: Type -> blob -> mimeType -> transaction t
 val blobSize : blob -> int
 val textBlob : string -> blob
+val textOfBlob : blob -> option string
+(* Returns [Some] exactly when the blob contains no zero bytes. *)
 
 type postBody
 val postType : postBody -> string
diff --git a/src/c/urweb.c b/src/c/urweb.c
index ae2fc0a8..c8cfb0c6 100644
--- a/src/c/urweb.c
+++ b/src/c/urweb.c
@@ -4075,6 +4075,20 @@ uw_Basis_blob uw_Basis_textBlob(uw_context ctx, uw_Basis_string s) {
   return b;
 }
 
+uw_Basis_string uw_Basis_textOfBlob(uw_context ctx, uw_Basis_blob b) {
+  size_t i;
+  uw_Basis_string r;
+
+  for (i = 0; i < b.size; ++i)
+    if (b.data[i] == 0)
+      return NULL;
+
+  r = uw_malloc(ctx, b.size + 1);
+  memcpy(r, b.data, b.size);
+  r[b.size] = 0;
+  return r;
+}
+
 uw_Basis_blob uw_Basis_fileData(uw_context ctx, uw_Basis_file f) {
   (void)ctx;
   return f.data;
@@ -5207,7 +5221,7 @@ uw_unit uw_Basis_cache_file(uw_context ctx, uw_Basis_blob contents) {
 
   fd = mkstemp(tempfile);
   if (fd < 0)
-    uw_error(ctx, FATAL, "Error creating temporary file for cache");
+    uw_error(ctx, FATAL, "Error creating temporary file %s for cache", tempfile);
 
   while (written_so_far < contents.size) {
     ssize_t written_just_now = write(fd, contents.data + written_so_far, contents.size - written_so_far);
diff --git a/src/cjr_print.sml b/src/cjr_print.sml
index 31653a74..09cd9c7f 100644
--- a/src/cjr_print.sml
+++ b/src/cjr_print.sml
@@ -3391,6 +3391,14 @@ fun p_file env (ds, ps) =
              newline,
              string "#include <time.h>",
              newline,
+             (case Settings.getFileCache () of
+                  NONE => box []
+                | SOME _ => box [string "#include <sys/types.h>",
+                                 newline,
+                                 string "#include <sys/stat.h>",
+                                 newline,
+                                 string "#include <unistd.h>",
+                                 newline]),
              if hasDb then
                  box [string ("#include <" ^ #header (Settings.currentDbms ()) ^ ">"),
                       newline]
@@ -3655,7 +3663,21 @@ fun p_file env (ds, ps) =
              newline,
              string "static void uw_initializer(uw_context ctx) {",
              newline,
-             box [string "uw_begin_initializing(ctx);",
+             box [(case Settings.getFileCache () of
+                       NONE => box []
+                     | SOME dir => box [newline,
+                                        string "struct stat st = {0};",
+                                        newline,
+                                        newline,
+                                        string "if (stat(\"",
+                                        string (Prim.toCString dir),
+                                        string "\", &st) == -1)",
+                                        newline,
+                                        box [string "mkdir(\"",
+                                             string (Prim.toCString dir),
+                                             string "\", 0700);",
+                                             newline]]),
+                  string "uw_begin_initializing(ctx);",
                   newline,
                   p_list_sep newline (fn x => x) (rev (!global_initializers)),
                   string "uw_end_initializing(ctx);",
-- 
cgit v1.2.3


From 3f119f5c0a5f210ed442841dfed3ae98786004e9 Mon Sep 17 00:00:00 2001
From: Adam Chlipala <adam@chlipala.net>
Date: Sat, 23 Mar 2019 20:16:15 -0400
Subject: Supporting 'char' arguments to handlers called from client code

---
 include/urweb/urweb_cpp.h |  1 +
 src/c/urweb.c             | 17 +++++++++++++++++
 src/settings.sml          |  2 ++
 3 files changed, 20 insertions(+)

(limited to 'include/urweb')

diff --git a/include/urweb/urweb_cpp.h b/include/urweb/urweb_cpp.h
index 18b5f583..dcf67fef 100644
--- a/include/urweb/urweb_cpp.h
+++ b/include/urweb/urweb_cpp.h
@@ -149,6 +149,7 @@ uw_Basis_unit uw_Basis_unurlifyUnit(struct uw_context * ctx, char **s);
 uw_Basis_int uw_Basis_unurlifyInt(struct uw_context *, char **);
 uw_Basis_float uw_Basis_unurlifyFloat(struct uw_context *, char **);
 uw_Basis_string uw_Basis_unurlifyString(struct uw_context *, char **);
+uw_Basis_char uw_Basis_unurlifyChar(struct uw_context *, char **);
 uw_Basis_string uw_Basis_unurlifyString_fromClient(struct uw_context *, char **);
 uw_Basis_bool uw_Basis_unurlifyBool(struct uw_context *, char **);
 uw_Basis_time uw_Basis_unurlifyTime(struct uw_context *, char **);
diff --git a/src/c/urweb.c b/src/c/urweb.c
index 58f7884d..4d9e8630 100644
--- a/src/c/urweb.c
+++ b/src/c/urweb.c
@@ -2267,6 +2267,23 @@ uw_Basis_string uw_Basis_unurlifyString(uw_context ctx, char **s) {
   return r;
 }
 
+uw_Basis_char uw_Basis_unurlifyChar(uw_context ctx, char **s) {
+  char *new_s = uw_unurlify_advance(*s);
+  char *r;
+  int len;
+
+  len = strlen(*s);
+  uw_check_heap(ctx, len + 1);
+
+  r = ctx->heap.front;
+  ctx->heap.front = uw_unurlifyString_to(0, ctx, ctx->heap.front, *s);
+  *s = new_s;
+  if (strlen(r) == 1)
+    return r[0];
+  else
+    uw_error(ctx, FATAL, "Unurlified character is multiple characters long");
+}
+
 uw_Basis_unit uw_Basis_unurlifyUnit(uw_context ctx, char **s) {
   (void)ctx;
   *s = uw_unurlify_advance(*s);
diff --git a/src/settings.sml b/src/settings.sml
index abb26f72..edc03d4c 100644
--- a/src/settings.sml
+++ b/src/settings.sml
@@ -117,6 +117,7 @@ fun basis x = S.addList (S.empty, map (fn x : string => ("Basis", x)) x)
 val clientToServerBase = basis ["int",
                                 "float",
                                 "string",
+                                "char",
                                 "time",
                                 "file",
                                 "unit",
@@ -277,6 +278,7 @@ val jsFuncsBase = basisM [("alert", "alert"),
                           ("urlifyFloat", "ts"),
                           ("urlifyTime", "ts"),
                           ("urlifyString", "uf"),
+                          ("urlifyChar", "uf"),
                           ("urlifyBool", "ub"),
                           ("recv", "rv"),
                           ("strcat", "cat"),
-- 
cgit v1.2.3


From 8728f397bee2b567611dcd7a7c359c7e92159c1c Mon Sep 17 00:00:00 2001
From: Adam Chlipala <adam@chlipala.net>
Date: Wed, 25 Sep 2019 19:54:59 -0400
Subject: Unicode escapes in JSON

---
 include/urweb/urweb_cpp.h |  1 +
 lib/ur/basis.urs          |  1 +
 lib/ur/json.ur            | 29 +++++++++++++++++++++++++++++
 src/c/urweb.c             | 12 ++++++++++++
 4 files changed, 43 insertions(+)

(limited to 'include/urweb')

diff --git a/include/urweb/urweb_cpp.h b/include/urweb/urweb_cpp.h
index dcf67fef..e4ad6e61 100644
--- a/include/urweb/urweb_cpp.h
+++ b/include/urweb/urweb_cpp.h
@@ -166,6 +166,7 @@ uw_Basis_string uw_Basis_strchr(struct uw_context *, const char *, uw_Basis_char
 uw_Basis_int uw_Basis_strcspn(struct uw_context *, const char *, const char *);
 uw_Basis_string uw_Basis_substring(struct uw_context *, const char *, uw_Basis_int, uw_Basis_int);
 uw_Basis_string uw_Basis_str1(struct uw_context *, uw_Basis_char);
+uw_Basis_string uw_Basis_ofUnicode(struct uw_context *, uw_Basis_int);
 
 uw_Basis_string uw_strdup(struct uw_context *, const char *);
 uw_Basis_string uw_maybe_strdup(struct uw_context *, const char *);
diff --git a/lib/ur/basis.urs b/lib/ur/basis.urs
index 2a98bf6f..d29bf6e6 100644
--- a/lib/ur/basis.urs
+++ b/lib/ur/basis.urs
@@ -95,6 +95,7 @@ val strsindex : string -> string -> option int
 val strcspn : string -> string -> int
 val substring : string -> int -> int -> string
 val str1 : char -> string
+val ofUnicode : int -> string
 
 class show
 val show : t ::: Type -> show t -> t -> string
diff --git a/lib/ur/json.ur b/lib/ur/json.ur
index 05406739..70f0c797 100644
--- a/lib/ur/json.ur
+++ b/lib/ur/json.ur
@@ -59,6 +59,17 @@ fun escape s =
         "\"" ^ esc s
     end
 
+(* Numeric value of hexadecimal digit ch (either case); errors on any other character. *)
+fun unhex ch =
+    let
+        val code = Char.toInt ch
+    in
+        if Char.isDigit ch then code - Char.toInt #"0"
+        else if not (Char.isXdigit ch) then error <xml>Invalid hexadecimal digit "{[ch]}"</xml>
+        else if Char.isUpper ch then 10 + (code - Char.toInt #"A")
+        else 10 + (code - Char.toInt #"a")
+    end
+    
 fun unescape s =
     let
         val len = String.length s
@@ -75,6 +86,11 @@ fun unescape s =
                       | #"\\" =>
                         if i+1 >= len then
                             error <xml>JSON unescape: Bad escape sequence: {[s]}</xml>
+                        else if String.sub s (i + 1) = #"u" then
+                            if i+5 >= len then
+                                error <xml>JSON unescape: Bad escape sequence: {[s]}</xml>
+                            else
+                                findEnd (i+6)
                         else
                             findEnd (i+2)
                       | _ => findEnd (i+1)
@@ -93,6 +109,19 @@ fun unescape s =
                         #"\\" =>
                         if i+1 >= len then
                             error <xml>JSON unescape: Bad escape sequence: {[s]}</xml>
+                        else if String.sub s (i+1) = #"u" then
+                            if i+5 >= len then
+                                error <xml>JSON unescape: Unicode ends early</xml>
+                            else
+                                let
+                                    val n =
+                                        unhex (String.sub s (i+2)) * (256*16)
+                                        + unhex (String.sub s (i+3)) * 256
+                                        + unhex (String.sub s (i+4)) * 16
+                                        + unhex (String.sub s (i+5))
+                                in
+                                    ofUnicode n ^ unesc (i+6)
+                                end
                         else
 			    (case String.sub s (i+1) of
 				 #"n" => "\n"
diff --git a/src/c/urweb.c b/src/c/urweb.c
index af929269..8c445f39 100644
--- a/src/c/urweb.c
+++ b/src/c/urweb.c
@@ -2724,6 +2724,18 @@ uw_Basis_string uw_Basis_str1(uw_context ctx, uw_Basis_char ch) {
   return r; 
 }
 
+/* UTF-8 encode UTF-16 code unit n; fatal error if ICU fails or emits nothing. */
+uw_Basis_string uw_Basis_ofUnicode(uw_context ctx, uw_Basis_int n) {
+  UChar buf16[] = {n};
+  /* One UTF-16 unit yields at most 3 UTF-8 bytes; the 4th holds the NUL. */
+  uw_Basis_string out = uw_malloc(ctx, 4);
+  int32_t outLen;
+  UErrorCode pErrorCode = U_ZERO_ERROR;
+  if (u_strToUTF8(out, 4, &outLen, buf16, 1, &pErrorCode) == NULL || outLen == 0)
+    uw_error(ctx, FATAL, "Bad Unicode string to unescape (error %s)", u_errorName(pErrorCode));
+  return out;
+}
+
 uw_Basis_string uw_strdup(uw_context ctx, uw_Basis_string s1) {
   int len = strlen(s1) + 1;
   char *s;
-- 
cgit v1.2.3