From d7e10798f1905161e5790444e604f439281d4220 Mon Sep 17 00:00:00 2001 From: Oisín Mac Fhearaí Date: Sun, 11 Aug 2019 05:04:43 +0100 Subject: * When htmlifying characters, don't use numeric escapes if they're printable -- instead, try to convert them to UTF-8. * Add libicuio to linked C libraries --- src/c/Makefile.am | 2 +- src/c/urweb.c | 19 ++++++++++++++++--- src/compiler.sml | 8 ++++++-- 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/src/c/Makefile.am b/src/c/Makefile.am index 95582793..ff4b6eaf 100644 --- a/src/c/Makefile.am +++ b/src/c/Makefile.am @@ -11,7 +11,7 @@ AM_CFLAGS = -Wall -Wunused-parameter -Werror -Wno-format-security -Wno-deprecate liburweb_la_LDFLAGS = $(AM_LDFLAGS) $(OPENSSL_LDFLAGS) \ -export-symbols-regex '^(client_pruner|pthread_create_big|strcmp_nullsafe|uw_.*)' \ -version-info 1:0:0 -liburweb_la_LIBADD = $(PTHREAD_LIBS) -lm $(OPENSSL_LIBS) $(ICU_LIBS) -licui18n -licuuc -licudata +liburweb_la_LIBADD = $(PTHREAD_LIBS) -lm $(OPENSSL_LIBS) $(ICU_LIBS) -licui18n -licuuc -licudata -licuio liburweb_http_la_LIBADD = liburweb.la liburweb_http_la_LDFLAGS = -export-symbols-regex '^(main|uw_.*)' \ -version-info 1:0:0 diff --git a/src/c/urweb.c b/src/c/urweb.c index b820354f..dad15568 100644 --- a/src/c/urweb.c +++ b/src/c/urweb.c @@ -20,7 +20,6 @@ #include -#include #include #include "types.h" @@ -2347,7 +2346,21 @@ uw_unit uw_Basis_htmlifySpecialChar_w(uw_context ctx, uw_Basis_char ch) { int len; uw_check(ctx, INTS_MAX+3); - len = sprintf(ctx->page.front, "&#%u;", n); + + if(uw_Basis_isprint(ctx, ch)) { + + UChar32 ins[1] = { ch }; + char buf[5]; + int32_t len_written = 0; + UErrorCode err = U_ZERO_ERROR; + + u_strToUTF8(buf, 5, &len_written, ins, 1, &err); + sprintf(ctx->page.front, "%s", buf); + // printf("buf: %s, hex: %x, len_written: %d, err: %s\n", buf, ch, len_written, u_errorName(err)); + len = len_written; + } else { + len = sprintf(ctx->page.front, "&#%u;", n); + } ctx->page.front += len; return uw_unit_v; @@ -2459,7 +2472,7 @@ uw_unit uw_Basis_htmlifyString_w(uw_context ctx, uw_Basis_string s) { else { uw_Basis_htmlifySpecialChar_w(ctx, c1); } - } + } return uw_unit_v; } diff --git a/src/compiler.sml b/src/compiler.sml index 0aba3a40..c00fe807 100644 --- a/src/compiler.sml +++ b/src/compiler.sml @@ -1610,9 +1610,13 @@ fun compileC {cname, oname, ename, libs, profile, debug, linker, link = link'} = val proto = Settings.currentProtocol () val lib = if Settings.getBootLinking () then - !Settings.configLib ^ "/" ^ #linkStatic proto ^ " " ^ !Settings.configLib ^ "/liburweb.a " ^ !Settings.configIcuLibs ^ " -licui18n -licuuc -licudata" + !Settings.configLib ^ "/" ^ #linkStatic proto ^ " " ^ + !Settings.configLib ^ "/liburweb.a " ^ + !Settings.configIcuLibs ^ " -licui18n -licuuc -licudata -licuio" else if Settings.getStaticLinking () then - " -static " ^ !Settings.configLib ^ "/" ^ #linkStatic proto ^ " " ^ !Settings.configLib ^ "/liburweb.a " ^ !Settings.configIcuLibs ^ " -licui18n -licuuc -licudata" + " -static " ^ !Settings.configLib ^ "/" ^ #linkStatic + proto ^ " " ^ !Settings.configLib ^ "/liburweb.a " ^ + !Settings.configIcuLibs ^ " -licui18n -licuuc -licudata -licuio" else "-L" ^ !Settings.configLib ^ " " ^ #linkDynamic proto ^ " -lurweb" -- cgit v1.2.3 From 71e1eb7be7ebd566a19be3cab381f813d9c2e4fc Mon Sep 17 00:00:00 2001 From: Oisín Mac Fhearaí Date: Sun, 11 Aug 2019 06:04:35 +0100 Subject: Add missing include --- src/c/urweb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/c/urweb.c b/src/c/urweb.c index dad15568..509ba10d 100644 --- a/src/c/urweb.c +++ b/src/c/urweb.c @@ -21,6 +21,7 @@ #include #include +#include #include "types.h" -- cgit v1.2.3 From 35eaf23643fcd2eb4376f07a490c959737179eef Mon Sep 17 00:00:00 2001 From: Oisín Mac Fhearaí Date: Sun, 11 Aug 2019 06:14:10 +0100 Subject: Try to avoid a pointer conversion error --- src/c/urweb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/c/urweb.c b/src/c/urweb.c index 509ba10d..a76f0004 100644 --- a/src/c/urweb.c +++ b/src/c/urweb.c @@ -20,8 +20,8 @@ #include -#include #include +#include #include "types.h" @@ -2350,7 +2350,7 @@ uw_unit uw_Basis_htmlifySpecialChar_w(uw_context ctx, uw_Basis_char ch) { if(uw_Basis_isprint(ctx, ch)) { - UChar32 ins[1] = { ch }; + const UChar ins[1] = { ch }; char buf[5]; int32_t len_written = 0; UErrorCode err = U_ZERO_ERROR; -- cgit v1.2.3 From 5e2ebc973f19fe8e5fdbe20e102e445329b528b0 Mon Sep 17 00:00:00 2001 From: Oisín Mac Fhearaí Date: Wed, 28 Aug 2019 01:56:53 +0100 Subject: Minor cleanup -- handle the case where we couldn't successfully generate a UTF8 codepoint by outputting a HTML escape (the default behaviour before for all multi-byte characters). --- src/c/urweb.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/c/urweb.c b/src/c/urweb.c index a76f0004..62561828 100644 --- a/src/c/urweb.c +++ b/src/c/urweb.c @@ -20,6 +20,7 @@ #include +#include #include #include @@ -2344,7 +2345,7 @@ char *uw_Basis_htmlifySpecialChar(uw_context ctx, uw_Basis_char ch) { uw_unit uw_Basis_htmlifySpecialChar_w(uw_context ctx, uw_Basis_char ch) { unsigned int n = ch; - int len; + int len = 0; uw_check(ctx, INTS_MAX+3); @@ -2359,7 +2360,10 @@ uw_unit uw_Basis_htmlifySpecialChar_w(uw_context ctx, uw_Basis_char ch) { sprintf(ctx->page.front, "%s", buf); // printf("buf: %s, hex: %x, len_written: %d, err: %s\n", buf, ch, len_written, u_errorName(err)); len = len_written; - } else { + } + + // either it's a non-printable character, or we failed to convert to UTF-8 + if(len == 0) { len = sprintf(ctx->page.front, "&#%u;", n); } ctx->page.front += len; -- cgit v1.2.3 From 0490176b675eb3ea36cd51fa5d1fd41a3126c10c Mon Sep 17 00:00:00 2001 From: Oisín Mac Fhearaí Date: Thu, 29 Aug 2019 21:39:53 +0100 Subject: PR suggestions (with thanks to @fabriceleal). --- src/c/urweb.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/c/urweb.c b/src/c/urweb.c index 62561828..af929269 100644 --- a/src/c/urweb.c +++ b/src/c/urweb.c @@ -2351,14 +2351,10 @@ uw_unit uw_Basis_htmlifySpecialChar_w(uw_context ctx, uw_Basis_char ch) { if(uw_Basis_isprint(ctx, ch)) { - const UChar ins[1] = { ch }; - char buf[5]; int32_t len_written = 0; UErrorCode err = U_ZERO_ERROR; - u_strToUTF8(buf, 5, &len_written, ins, 1, &err); - sprintf(ctx->page.front, "%s", buf); - // printf("buf: %s, hex: %x, len_written: %d, err: %s\n", buf, ch, len_written, u_errorName(err)); + u_strToUTF8(ctx->page.front, 5, &len_written, (const UChar*)&ch, 1, &err); len = len_written; } -- cgit v1.2.3