From ebff207e853e7dda8f5dc0bf364c6578d86c5c55 Mon Sep 17 00:00:00 2001
From: Adam Chlipala <adamc@hcoop.net>
Date: Thu, 3 Dec 2009 11:50:51 -0500
Subject: UTF-8 in dynamic escaping

---
 src/c/urweb.c    | 78 ++++++++++++++++++++++++++++++++++++++++++++++++--------
 src/mono_opt.sml | 35 ++++++++++++++++++++++---
 src/prim.sml     | 18 ++++++++++---
 3 files changed, 114 insertions(+), 17 deletions(-)

(limited to 'src')

diff --git a/src/c/urweb.c b/src/c/urweb.c
index 344ef2ad..6e2b9e22 100644
--- a/src/c/urweb.c
+++ b/src/c/urweb.c
@@ -1410,6 +1410,10 @@ char *uw_Basis_attrifyFloat(uw_context ctx, uw_Basis_float n) {
   return result;
 }
 
+static int isCont(unsigned char ch) {
+  return (ch & 0xC0) == 0x80;  /* UTF-8 continuation byte: top two bits are 10 */
+}
+
 char *uw_Basis_attrifyString(uw_context ctx, uw_Basis_string s) {
   int len = strlen(s);
   char *result, *p;
@@ -1418,7 +1422,7 @@ char *uw_Basis_attrifyString(uw_context ctx, uw_Basis_string s) {
   result = p = ctx->heap.front;
 
   for (; *s; s++) {
-    char c = *s;
+    unsigned char c = *s;
 
     if (c == '"') {
       strcpy(p, "&quot;");
@@ -1429,7 +1433,19 @@ char *uw_Basis_attrifyString(uw_context ctx, uw_Basis_string s) {
     }
     else if (isprint(c))
       *p++ = c;
-    else {
+    else if (c / 32 == 6 && isCont(s[1])) {
+      memcpy(p, s, 2);
+      p += 2;
+      ++s;
+    } else if (c / 16 == 14 && isCont(s[1]) && isCont(s[2])) {
+      memcpy(p, s, 3);
+      p += 3;
+      s += 2;
+    } else if (c / 8 == 30 && isCont(s[1]) && isCont(s[2]) && isCont(s[3])) {
+      memcpy(p, s, 4);
+      p += 4;
+      s += 3;
+    } else {
       int len2;
       sprintf(p, "&#%d;%n", c, &len2);
       p += len2;
@@ -1499,7 +1515,7 @@ uw_unit uw_Basis_attrifyString_w(uw_context ctx, uw_Basis_string s) {
   uw_check(ctx, strlen(s) * 6);
 
   for (; *s; s++) {
-    char c = *s;
+    unsigned char c = *s;
 
     if (c == '"')
       uw_write_unsafe(ctx, "&quot;");
@@ -1507,7 +1523,22 @@ uw_unit uw_Basis_attrifyString_w(uw_context ctx, uw_Basis_string s) {
       uw_write_unsafe(ctx, "&amp;");
     else if (isprint(c))
       uw_writec_unsafe(ctx, c);
-    else {
+    else if (c / 32 == 6 && isCont(s[1])) {
+      uw_writec_unsafe(ctx, c);
+      uw_writec_unsafe(ctx, s[1]);
+      ++s;
+    } else if (c / 16 == 14 && isCont(s[1]) && isCont(s[2])) {
+      uw_writec_unsafe(ctx, c);
+      uw_writec_unsafe(ctx, s[1]);
+      uw_writec_unsafe(ctx, s[2]);
+      s += 2;
+    } else if (c / 8 == 30 && isCont(s[1]) && isCont(s[2]) && isCont(s[3])) {
+      uw_writec_unsafe(ctx, c);
+      uw_writec_unsafe(ctx, s[1]);
+      uw_writec_unsafe(ctx, s[2]);
+      uw_writec_unsafe(ctx, s[3]);
+      s += 3;
+    } else {
       uw_write_unsafe(ctx, "&#");
       uw_Basis_attrifyInt_w_unsafe(ctx, c);
       uw_writec_unsafe(ctx, ';');
@@ -1847,7 +1878,7 @@ char *uw_Basis_htmlifyString(uw_context ctx, uw_Basis_string s) {
   uw_check_heap(ctx, strlen(s) * 5 + 1);
 
   for (r = s2 = ctx->heap.front; *s; s++) {
-    char c = *s;
+    unsigned char c = *s;
 
     switch (c) {
     case '<':
@@ -1859,9 +1890,21 @@ char *uw_Basis_htmlifyString(uw_context ctx, uw_Basis_string s) {
       s2 += 5;
       break;
     default:
-      if (isprint(c))
+      if (isprint(c) || isspace(c))
         *s2++ = c;
-      else {
+      else if (c / 32 == 6 && isCont(s[1])) {
+        memcpy(s2, s, 2);
+        s2 += 2;
+        ++s;
+      } else if (c / 16 == 14 && isCont(s[1]) && isCont(s[2])) {
+        memcpy(s2, s, 3);
+        s2 += 3;
+        s += 2;
+      } else if (c / 8 == 30 && isCont(s[1]) && isCont(s[2]) && isCont(s[3])) {
+        memcpy(s2, s, 4);
+        s2 += 4;
+        s += 3;
+      } else {
         int len2;
         sprintf(s2, "&#%d;%n", c, &len2);
         s2 += len2;
@@ -1878,7 +1921,7 @@ uw_unit uw_Basis_htmlifyString_w(uw_context ctx, uw_Basis_string s) {
   uw_check(ctx, strlen(s) * 6);
 
   for (; *s; s++) {
-    char c = *s;
+    unsigned char c = *s;
 
     switch (c) {
     case '<':
@@ -1888,9 +1931,24 @@ uw_unit uw_Basis_htmlifyString_w(uw_context ctx, uw_Basis_string s) {
       uw_write_unsafe(ctx, "&amp;");
       break;
     default:
-      if (isprint(c))
+      if (isprint(c) || isspace(c))
         uw_writec_unsafe(ctx, c);
-      else {
+      else if (c / 32 == 6 && isCont(s[1])) {
+        uw_writec_unsafe(ctx, c);
+        uw_writec_unsafe(ctx, s[1]);
+        ++s;
+      } else if (c / 16 == 14 && isCont(s[1]) && isCont(s[2])) {
+        uw_writec_unsafe(ctx, c);
+        uw_writec_unsafe(ctx, s[1]);
+        uw_writec_unsafe(ctx, s[2]);
+        s += 2;
+      } else if (c / 8 == 30 && isCont(s[1]) && isCont(s[2]) && isCont(s[3])) {
+        uw_writec_unsafe(ctx, c);
+        uw_writec_unsafe(ctx, s[1]);
+        uw_writec_unsafe(ctx, s[2]);
+        uw_writec_unsafe(ctx, s[3]);
+        s += 3;
+      } else {
         uw_write_unsafe(ctx, "&#");
         uw_Basis_attrifyInt_w_unsafe(ctx, c);
         uw_writec_unsafe(ctx, ';');
diff --git a/src/mono_opt.sml b/src/mono_opt.sml
index 3a5b4f4c..bda4d93a 100644
--- a/src/mono_opt.sml
+++ b/src/mono_opt.sml
@@ -45,6 +45,37 @@ fun attrifyFloat n =
     else
         Real.toString n
 
+(* Escape [s] for a double-quoted attribute value: quote and ampersand become
+ * entities, printable ASCII and well-formed UTF-8 sequences are copied, and
+ * any other byte (including control whitespace) becomes a decimal "&#N;"
+ * reference, matching what uw_Basis_attrifyString does at run time. *)
+fun attrifyString s =
+    let
+        fun hs (pos, acc) =
+            if pos >= size s then
+                String.concat (rev acc)
+            else
+                case String.sub (s, pos) of
+                    #"\"" => hs (pos+1, "&quot;" :: acc)
+                  | #"&" => hs (pos+1, "&amp;" :: acc)
+                  | ch =>
+                    let
+                        val n = ord ch
+                        (* Is the byte k positions ahead a UTF-8 continuation byte (10xxxxxx)? *)
+                        fun isCont k = pos + k < size s andalso ord (String.sub (s, pos + k)) div 64 = 2
+                        (* Copy the lead byte plus k continuation bytes through untouched. *)
+                        fun unicode k = hs (pos+k+1, String.substring (s, pos, k+1) :: acc)
+                    in
+                        if Char.isPrint ch then hs (pos+1, str ch :: acc)
+                        else if n div 32 = 6 andalso isCont 1 then unicode 1
+                        else if n div 16 = 14 andalso isCont 1 andalso isCont 2 then unicode 2
+                        else if n div 8 = 30 andalso isCont 1 andalso isCont 2 andalso isCont 3 then unicode 3
+                        else hs (pos+1, "&#" ^ Int.toString n ^ ";" :: acc)
+                    end
+    in
+        hs (0, [])
+    end
+
 fun attrifyChar ch =
     case ch of
         #"\"" => "&quot;"
@@ -54,8 +85,6 @@ fun attrifyChar ch =
               else
                   "&#" ^ Int.toString (ord ch) ^ ";"
 
-val attrifyString = String.translate attrifyChar
-
 val urlifyInt = attrifyInt
 val urlifyFloat = attrifyFloat
 
@@ -78,7 +107,7 @@ fun htmlifyString s =
                                        andalso ord (String.sub (s, pos + k)) div 64 = 2
                         fun unicode k = hs (pos+k+1, String.substring (s, pos, k+1) :: acc)
                     in
-                        if Char.isPrint ch orelse Char.isSpace ch then
+                        if Char.isPrint ch then
                             hs (pos+1, str ch :: acc)
                         else if n div 32 = 6 andalso isCont 1 then
                             unicode 1
diff --git a/src/prim.sml b/src/prim.sml
index c4b7e839..e094e0b1 100644
--- a/src/prim.sml
+++ b/src/prim.sml
@@ -74,10 +74,20 @@ fun pad (n, ch, s) =
     else
         str ch ^ pad (n-1, ch, s)
 
-val gccify = String.translate (fn ch => if Char.isPrint ch then
-                                            str ch
-                                        else
-                                            "\\" ^ pad (3, #"0", Int.fmt StringCvt.OCT (ord ch)))
+(* Escape a string for use inside a C string literal: symbolic escapes where
+ * listed below, printable characters as-is, anything else as backslash-octal. *)
+val gccify =
+    let
+        fun octal c = "\\" ^ pad (3, #"0", Int.fmt StringCvt.OCT (ord c))
+        fun escape #"\"" = "\\\""
+          | escape #"\\" = "\\\\"
+          | escape #"'" = "\\'"
+          | escape #"\n" = "\\n"
+          | escape #"\r" = "\\r"
+          | escape #"\t" = "\\t"
+          | escape #" " = " "
+          | escape c = if Char.isPrint c then str c else octal c
+    in String.translate escape end
 
 fun p_t_GCC t =
     case t of
-- 
cgit v1.2.3