From e7b50987708729b9c3d0c8c0acb58e801142e6b9 Mon Sep 17 00:00:00 2001 From: Adam Chlipala Date: Thu, 3 Dec 2009 11:20:13 -0500 Subject: Escaping UTF-8 in MonoOpt --- src/mono_opt.sml | 39 +++++++++++++++++++++++++++++++-------- src/prim.sml | 15 +++++++++++++-- tests/cyrillic.ur | 2 ++ tests/cyrillic.urp | 2 ++ tests/cyrillic.urs | 1 + 5 files changed, 49 insertions(+), 10 deletions(-) create mode 100644 tests/cyrillic.ur create mode 100644 tests/cyrillic.urp create mode 100644 tests/cyrillic.urs diff --git a/src/mono_opt.sml b/src/mono_opt.sml index d4441ac7..3a5b4f4c 100644 --- a/src/mono_opt.sml +++ b/src/mono_opt.sml @@ -61,14 +61,37 @@ val urlifyFloat = attrifyFloat val htmlifyInt = attrifyInt val htmlifyFloat = attrifyFloat -val htmlifyString = String.translate (fn ch => case ch of - #"<" => "<" - | #"&" => "&" - | _ => - if Char.isPrint ch orelse Char.isSpace ch then - str ch - else - "&#" ^ Int.toString (ord ch) ^ ";") + +fun htmlifyString s = + let + fun hs (pos, acc) = + if pos >= size s then + String.concat (rev acc) + else + case String.sub (s, pos) of + #"<" => hs (pos+1, "<" :: acc) + | #"&" => hs (pos+1, "&" :: acc) + | ch => + let + val n = ord ch + fun isCont k = pos + k < size s + andalso ord (String.sub (s, pos + k)) div 64 = 2 + fun unicode k = hs (pos+k+1, String.substring (s, pos, k+1) :: acc) + in + if Char.isPrint ch orelse Char.isSpace ch then + hs (pos+1, str ch :: acc) + else if n div 32 = 6 andalso isCont 1 then + unicode 1 + else if n div 16 = 14 andalso isCont 1 andalso isCont 2 then + unicode 2 + else if n div 8 = 30 andalso isCont 1 andalso isCont 2 andalso isCont 3 then + unicode 3 + else + hs (pos+1, "&#" ^ Int.toString (ord ch) ^ ";" :: acc) + end + in + hs (0, []) + end fun hexIt ch = let diff --git a/src/prim.sml b/src/prim.sml index c84c557e..c4b7e839 100644 --- a/src/prim.sml +++ b/src/prim.sml @@ -68,12 +68,23 @@ fun toString t = | String s => s | Char ch => str ch +fun pad (n, ch, s) = + if size s >= n then + s + else + str ch ^ pad (n-1, ch, s) + +val gccify = String.translate (fn ch => if Char.isPrint ch then + str ch + else + "\\" ^ pad (3, #"0", Int.fmt StringCvt.OCT (ord ch))) + fun p_t_GCC t = case t of Int n => string (int2s n) | Float n => string (float2s n) - | String s => box [string "\"", string (String.toString s), string "\""] - | Char ch => box [string "'", string (String.toString (str ch)), string "'"] + | String s => box [string "\"", string (gccify s), string "\""] + | Char ch => box [string "'", string (gccify (str ch)), string "'"] fun equal x = case x of diff --git a/tests/cyrillic.ur b/tests/cyrillic.ur new file mode 100644 index 00000000..f125792f --- /dev/null +++ b/tests/cyrillic.ur @@ -0,0 +1,2 @@ +fun main () = return одел +Hi! diff --git a/tests/cyrillic.urp b/tests/cyrillic.urp new file mode 100644 index 00000000..279d4af5 --- /dev/null +++ b/tests/cyrillic.urp @@ -0,0 +1,2 @@ + +cyrillic diff --git a/tests/cyrillic.urs b/tests/cyrillic.urs new file mode 100644 index 00000000..6ac44e0b --- /dev/null +++ b/tests/cyrillic.urs @@ -0,0 +1 @@ +val main : unit -> transaction page -- cgit v1.2.3