summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Adam Chlipala <adamc@hcoop.net> 2009-12-03 11:50:51 -0500
committer: Adam Chlipala <adamc@hcoop.net> 2009-12-03 11:50:51 -0500
commit: 2304eece8f5f2b9067cd66d860a332f8721c5321 (patch)
tree: a8e6fae2fe503e60c209eeab70b9ff462ab808c8
parent: e7b50987708729b9c3d0c8c0acb58e801142e6b9 (diff)
UTF-8 in dynamic escaping
-rw-r--r-- src/c/urweb.c 78
-rw-r--r-- src/mono_opt.sml 35
-rw-r--r-- src/prim.sml 18
3 files changed, 114 insertions, 17 deletions
diff --git a/src/c/urweb.c b/src/c/urweb.c
index 344ef2ad..6e2b9e22 100644
--- a/src/c/urweb.c
+++ b/src/c/urweb.c
@@ -1410,6 +1410,10 @@ char *uw_Basis_attrifyFloat(uw_context ctx, uw_Basis_float n) {
return result;
}
+static int isCont(unsigned char ch) {
+ return ch / 64 == 2;
+}
+
char *uw_Basis_attrifyString(uw_context ctx, uw_Basis_string s) {
int len = strlen(s);
char *result, *p;
@@ -1418,7 +1422,7 @@ char *uw_Basis_attrifyString(uw_context ctx, uw_Basis_string s) {
result = p = ctx->heap.front;
for (; *s; s++) {
- char c = *s;
+ unsigned char c = *s;
if (c == '"') {
strcpy(p, "&quot;");
@@ -1429,7 +1433,19 @@ char *uw_Basis_attrifyString(uw_context ctx, uw_Basis_string s) {
}
else if (isprint(c))
*p++ = c;
- else {
+ else if (c / 32 == 6 && isCont(s[1])) {
+ memcpy(p, s, 2);
+ p += 2;
+ ++s;
+ } else if (c / 16 == 14 && isCont(s[1]) && isCont(s[2])) {
+ memcpy(p, s, 3);
+ p += 3;
+ s += 2;
+ } else if (c / 8 == 30 && isCont(s[1]) && isCont(s[2]) && isCont(s[3])) {
+ memcpy(p, s, 4);
+ p += 4;
+ s += 3;
+ } else {
int len2;
sprintf(p, "&#%d;%n", c, &len2);
p += len2;
@@ -1499,7 +1515,7 @@ uw_unit uw_Basis_attrifyString_w(uw_context ctx, uw_Basis_string s) {
uw_check(ctx, strlen(s) * 6);
for (; *s; s++) {
- char c = *s;
+ unsigned char c = *s;
if (c == '"')
uw_write_unsafe(ctx, "&quot;");
@@ -1507,7 +1523,22 @@ uw_unit uw_Basis_attrifyString_w(uw_context ctx, uw_Basis_string s) {
uw_write_unsafe(ctx, "&amp;");
else if (isprint(c))
uw_writec_unsafe(ctx, c);
- else {
+ else if (c / 32 == 6 && isCont(s[1])) {
+ uw_writec_unsafe(ctx, c);
+ uw_writec_unsafe(ctx, s[1]);
+ ++s;
+ } else if (c / 16 == 14 && isCont(s[1]) && isCont(s[2])) {
+ uw_writec_unsafe(ctx, c);
+ uw_writec_unsafe(ctx, s[1]);
+ uw_writec_unsafe(ctx, s[2]);
+ s += 2;
+ } else if (c / 8 == 30 && isCont(s[1]) && isCont(s[2]) && isCont(s[3])) {
+ uw_writec_unsafe(ctx, c);
+ uw_writec_unsafe(ctx, s[1]);
+ uw_writec_unsafe(ctx, s[2]);
+ uw_writec_unsafe(ctx, s[3]);
+ s += 3;
+ } else {
uw_write_unsafe(ctx, "&#");
uw_Basis_attrifyInt_w_unsafe(ctx, c);
uw_writec_unsafe(ctx, ';');
@@ -1847,7 +1878,7 @@ char *uw_Basis_htmlifyString(uw_context ctx, uw_Basis_string s) {
uw_check_heap(ctx, strlen(s) * 5 + 1);
for (r = s2 = ctx->heap.front; *s; s++) {
- char c = *s;
+ unsigned char c = *s;
switch (c) {
case '<':
@@ -1859,9 +1890,21 @@ char *uw_Basis_htmlifyString(uw_context ctx, uw_Basis_string s) {
s2 += 5;
break;
default:
- if (isprint(c))
+ if (isprint(c) || isspace(c))
*s2++ = c;
- else {
+ else if (c / 32 == 6 && isCont(s[1])) {
+ memcpy(s2, s, 2);
+ s2 += 2;
+ ++s;
+ } else if (c / 16 == 14 && isCont(s[1]) && isCont(s[2])) {
+ memcpy(s2, s, 3);
+ s2 += 3;
+ s += 2;
+ } else if (c / 8 == 30 && isCont(s[1]) && isCont(s[2]) && isCont(s[3])) {
+ memcpy(s2, s, 4);
+ s2 += 4;
+ s += 3;
+ } else {
int len2;
sprintf(s2, "&#%d;%n", c, &len2);
s2 += len2;
@@ -1878,7 +1921,7 @@ uw_unit uw_Basis_htmlifyString_w(uw_context ctx, uw_Basis_string s) {
uw_check(ctx, strlen(s) * 6);
for (; *s; s++) {
- char c = *s;
+ unsigned char c = *s;
switch (c) {
case '<':
@@ -1888,9 +1931,24 @@ uw_unit uw_Basis_htmlifyString_w(uw_context ctx, uw_Basis_string s) {
uw_write_unsafe(ctx, "&amp;");
break;
default:
- if (isprint(c))
+ if (isprint(c) || isspace(c))
uw_writec_unsafe(ctx, c);
- else {
+ else if (c / 32 == 6 && isCont(s[1])) {
+ uw_writec_unsafe(ctx, c);
+ uw_writec_unsafe(ctx, s[1]);
+ ++s;
+ } else if (c / 16 == 14 && isCont(s[1]) && isCont(s[2])) {
+ uw_writec_unsafe(ctx, c);
+ uw_writec_unsafe(ctx, s[1]);
+ uw_writec_unsafe(ctx, s[2]);
+ s += 2;
+ } else if (c / 8 == 30 && isCont(s[1]) && isCont(s[2]) && isCont(s[3])) {
+ uw_writec_unsafe(ctx, c);
+ uw_writec_unsafe(ctx, s[1]);
+ uw_writec_unsafe(ctx, s[2]);
+ uw_writec_unsafe(ctx, s[3]);
+ s += 3;
+ } else {
uw_write_unsafe(ctx, "&#");
uw_Basis_attrifyInt_w_unsafe(ctx, c);
uw_writec_unsafe(ctx, ';');
diff --git a/src/mono_opt.sml b/src/mono_opt.sml
index 3a5b4f4c..bda4d93a 100644
--- a/src/mono_opt.sml
+++ b/src/mono_opt.sml
@@ -45,6 +45,37 @@ fun attrifyFloat n =
else
Real.toString n
+fun attrifyString s =
+ let
+ fun hs (pos, acc) =
+ if pos >= size s then
+ String.concat (rev acc)
+ else
+ case String.sub (s, pos) of
+ #"\"" => hs (pos+1, "&quot;" :: acc)
+ | #"&" => hs (pos+1, "&amp;" :: acc)
+ | ch =>
+ let
+ val n = ord ch
+ fun isCont k = pos + k < size s
+ andalso ord (String.sub (s, pos + k)) div 64 = 2
+ fun unicode k = hs (pos+k+1, String.substring (s, pos, k+1) :: acc)
+ in
+ if Char.isPrint ch orelse Char.isSpace ch then
+ hs (pos+1, str ch :: acc)
+ else if n div 32 = 6 andalso isCont 1 then
+ unicode 1
+ else if n div 16 = 14 andalso isCont 1 andalso isCont 2 then
+ unicode 2
+ else if n div 8 = 30 andalso isCont 1 andalso isCont 2 andalso isCont 3 then
+ unicode 3
+ else
+ hs (pos+1, "&#" ^ Int.toString (ord ch) ^ ";" :: acc)
+ end
+ in
+ hs (0, [])
+ end
+
fun attrifyChar ch =
case ch of
#"\"" => "&quot;"
@@ -54,8 +85,6 @@ fun attrifyChar ch =
else
"&#" ^ Int.toString (ord ch) ^ ";"
-val attrifyString = String.translate attrifyChar
-
val urlifyInt = attrifyInt
val urlifyFloat = attrifyFloat
@@ -78,7 +107,7 @@ fun htmlifyString s =
andalso ord (String.sub (s, pos + k)) div 64 = 2
fun unicode k = hs (pos+k+1, String.substring (s, pos, k+1) :: acc)
in
- if Char.isPrint ch orelse Char.isSpace ch then
+ if Char.isPrint ch then
hs (pos+1, str ch :: acc)
else if n div 32 = 6 andalso isCont 1 then
unicode 1
diff --git a/src/prim.sml b/src/prim.sml
index c4b7e839..e094e0b1 100644
--- a/src/prim.sml
+++ b/src/prim.sml
@@ -74,10 +74,20 @@ fun pad (n, ch, s) =
else
str ch ^ pad (n-1, ch, s)
-val gccify = String.translate (fn ch => if Char.isPrint ch then
- str ch
- else
- "\\" ^ pad (3, #"0", Int.fmt StringCvt.OCT (ord ch)))
+val gccify = String.translate (fn ch =>
+ case ch of
+ #"\"" => "\\\""
+ | #"\\" => "\\\\"
+ | #"'" => "\\'"
+ | #"\n" => "\\n"
+ | #"\r" => "\\r"
+ | #"\t" => "\\t"
+ | #" " => " "
+ | _ =>
+ if Char.isPrint ch then
+ str ch
+ else
+ "\\" ^ pad (3, #"0", Int.fmt StringCvt.OCT (ord ch)))
fun p_t_GCC t =
case t of