From ebff207e853e7dda8f5dc0bf364c6578d86c5c55 Mon Sep 17 00:00:00 2001 From: Adam Chlipala Date: Thu, 3 Dec 2009 11:50:51 -0500 Subject: UTF-8 in dynamic escaping --- src/c/urweb.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++-------- src/mono_opt.sml | 35 ++++++++++++++++++++++--- src/prim.sml | 18 ++++++++++--- 3 files changed, 114 insertions(+), 17 deletions(-) (limited to 'src') diff --git a/src/c/urweb.c b/src/c/urweb.c index 344ef2ad..6e2b9e22 100644 --- a/src/c/urweb.c +++ b/src/c/urweb.c @@ -1410,6 +1410,10 @@ char *uw_Basis_attrifyFloat(uw_context ctx, uw_Basis_float n) { return result; } +static int isCont(unsigned char ch) { + return ch / 64 == 2; +} + char *uw_Basis_attrifyString(uw_context ctx, uw_Basis_string s) { int len = strlen(s); char *result, *p; @@ -1418,7 +1422,7 @@ char *uw_Basis_attrifyString(uw_context ctx, uw_Basis_string s) { result = p = ctx->heap.front; for (; *s; s++) { - char c = *s; + unsigned char c = *s; if (c == '"') { strcpy(p, """); @@ -1429,7 +1433,19 @@ char *uw_Basis_attrifyString(uw_context ctx, uw_Basis_string s) { } else if (isprint(c)) *p++ = c; - else { + else if (c / 32 == 6 && isCont(s[1])) { + memcpy(p, s, 2); + p += 2; + ++s; + } else if (c / 16 == 14 && isCont(s[1]) && isCont(s[2])) { + memcpy(p, s, 3); + p += 3; + s += 2; + } else if (c / 8 == 30 && isCont(s[1]) && isCont(s[2]) && isCont(s[3])) { + memcpy(p, s, 4); + p += 4; + s += 3; + } else { int len2; sprintf(p, "&#%d;%n", c, &len2); p += len2; @@ -1499,7 +1515,7 @@ uw_unit uw_Basis_attrifyString_w(uw_context ctx, uw_Basis_string s) { uw_check(ctx, strlen(s) * 6); for (; *s; s++) { - char c = *s; + unsigned char c = *s; if (c == '"') uw_write_unsafe(ctx, """); @@ -1507,7 +1523,22 @@ uw_unit uw_Basis_attrifyString_w(uw_context ctx, uw_Basis_string s) { uw_write_unsafe(ctx, "&"); else if (isprint(c)) uw_writec_unsafe(ctx, c); - else { + else if (c / 32 == 6 && isCont(s[1])) { + uw_writec_unsafe(ctx, c); + uw_writec_unsafe(ctx, s[1]); + ++s; + } else if (c / 16 == 14 && isCont(s[1]) && isCont(s[2])) { + uw_writec_unsafe(ctx, c); + uw_writec_unsafe(ctx, s[1]); + uw_writec_unsafe(ctx, s[2]); + s += 2; + } else if (c / 8 == 30 && isCont(s[1]) && isCont(s[2]) && isCont(s[3])) { + uw_writec_unsafe(ctx, c); + uw_writec_unsafe(ctx, s[1]); + uw_writec_unsafe(ctx, s[2]); + uw_writec_unsafe(ctx, s[3]); + s += 3; + } else { uw_write_unsafe(ctx, "&#"); uw_Basis_attrifyInt_w_unsafe(ctx, c); uw_writec_unsafe(ctx, ';'); @@ -1847,7 +1878,7 @@ char *uw_Basis_htmlifyString(uw_context ctx, uw_Basis_string s) { uw_check_heap(ctx, strlen(s) * 5 + 1); for (r = s2 = ctx->heap.front; *s; s++) { - char c = *s; + unsigned char c = *s; switch (c) { case '<': @@ -1859,9 +1890,21 @@ char *uw_Basis_htmlifyString(uw_context ctx, uw_Basis_string s) { s2 += 5; break; default: - if (isprint(c)) + if (isprint(c) || isspace(c)) *s2++ = c; - else { + else if (c / 32 == 6 && isCont(s[1])) { + memcpy(s2, s, 2); + s2 += 2; + ++s; + } else if (c / 16 == 14 && isCont(s[1]) && isCont(s[2])) { + memcpy(s2, s, 3); + s2 += 3; + s += 2; + } else if (c / 8 == 30 && isCont(s[1]) && isCont(s[2]) && isCont(s[3])) { + memcpy(s2, s, 4); + s2 += 4; + s += 3; + } else { int len2; sprintf(s2, "&#%d;%n", c, &len2); s2 += len2; @@ -1878,7 +1921,7 @@ uw_unit uw_Basis_htmlifyString_w(uw_context ctx, uw_Basis_string s) { uw_check(ctx, strlen(s) * 6); for (; *s; s++) { - char c = *s; + unsigned char c = *s; switch (c) { case '<': @@ -1888,9 +1931,24 @@ uw_unit uw_Basis_htmlifyString_w(uw_context ctx, uw_Basis_string s) { uw_write_unsafe(ctx, "&"); break; default: - if (isprint(c)) + if (isprint(c) || isspace(c)) uw_writec_unsafe(ctx, c); - else { + else if (c / 32 == 6 && isCont(s[1])) { + uw_writec_unsafe(ctx, c); + uw_writec_unsafe(ctx, s[1]); + ++s; + } else if (c / 16 == 14 && isCont(s[1]) && isCont(s[2])) { + uw_writec_unsafe(ctx, c); + uw_writec_unsafe(ctx, s[1]); + uw_writec_unsafe(ctx, s[2]); + s += 2; + } else if (c / 8 == 30 && isCont(s[1]) && isCont(s[2]) && isCont(s[3])) { + uw_writec_unsafe(ctx, c); + uw_writec_unsafe(ctx, s[1]); + uw_writec_unsafe(ctx, s[2]); + uw_writec_unsafe(ctx, s[3]); + s += 3; + } else { uw_write_unsafe(ctx, "&#"); uw_Basis_attrifyInt_w_unsafe(ctx, c); uw_writec_unsafe(ctx, ';'); diff --git a/src/mono_opt.sml b/src/mono_opt.sml index 3a5b4f4c..bda4d93a 100644 --- a/src/mono_opt.sml +++ b/src/mono_opt.sml @@ -45,6 +45,37 @@ fun attrifyFloat n = else Real.toString n +fun attrifyString s = + let + fun hs (pos, acc) = + if pos >= size s then + String.concat (rev acc) + else + case String.sub (s, pos) of + #"\"" => hs (pos+1, """ :: acc) + | #"&" => hs (pos+1, "&" :: acc) + | ch => + let + val n = ord ch + fun isCont k = pos + k < size s + andalso ord (String.sub (s, pos + k)) div 64 = 2 + fun unicode k = hs (pos+k+1, String.substring (s, pos, k+1) :: acc) + in + if Char.isPrint ch orelse Char.isSpace ch then + hs (pos+1, str ch :: acc) + else if n div 32 = 6 andalso isCont 1 then + unicode 1 + else if n div 16 = 14 andalso isCont 1 andalso isCont 2 then + unicode 2 + else if n div 8 = 30 andalso isCont 1 andalso isCont 2 andalso isCont 3 then + unicode 3 + else + hs (pos+1, "&#" ^ Int.toString (ord ch) ^ ";" :: acc) + end + in + hs (0, []) + end + fun attrifyChar ch = case ch of #"\"" => """ @@ -54,8 +85,6 @@ fun attrifyChar ch = else "&#" ^ Int.toString (ord ch) ^ ";" -val attrifyString = String.translate attrifyChar - val urlifyInt = attrifyInt val urlifyFloat = attrifyFloat @@ -78,7 +107,7 @@ fun htmlifyString s = andalso ord (String.sub (s, pos + k)) div 64 = 2 fun unicode k = hs (pos+k+1, String.substring (s, pos, k+1) :: acc) in - if Char.isPrint ch orelse Char.isSpace ch then + if Char.isPrint ch then hs (pos+1, str ch :: acc) else if n div 32 = 6 andalso isCont 1 then unicode 1 diff --git a/src/prim.sml b/src/prim.sml index c4b7e839..e094e0b1 100644 --- a/src/prim.sml +++ b/src/prim.sml @@ -74,10 +74,20 @@ fun pad (n, ch, s) = else str ch ^ pad (n-1, ch, s) -val gccify = String.translate (fn ch => if Char.isPrint ch then - str ch - else - "\\" ^ pad (3, #"0", Int.fmt StringCvt.OCT (ord ch))) +val gccify = String.translate (fn ch => + case ch of + #"\"" => "\\\"" + | #"\\" => "\\\\" + | #"'" => "\\'" + | #"\n" => "\\n" + | #"\r" => "\\r" + | #"\t" => "\\t" + | #" " => " " + | _ => + if Char.isPrint ch then + str ch + else + "\\" ^ pad (3, #"0", Int.fmt StringCvt.OCT (ord ch))) fun p_t_GCC t = case t of -- cgit v1.2.3