summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar xleroy <xleroy@fca1b0fc-160b-0410-b1d3-a4f43f01ea2e>2014-07-27 07:35:49 +0000
committerGravatar xleroy <xleroy@fca1b0fc-160b-0410-b1d3-a4f43f01ea2e>2014-07-27 07:35:49 +0000
commitf4e106d4fc1cce484678b5cdd86ab57d7a43076a (patch)
tree3de9bddc63b80fb1b695bbdb8fa5bd6aa893a13a
parent04ff02a9f4bc4f766a450e5463729102ee26882e (diff)
ARM port: add support for Thumb2. To be tested.
git-svn-id: https://yquem.inria.fr/compcert/svn/compcert/trunk@2549 fca1b0fc-160b-0410-b1d3-a4f43f01ea2e
-rw-r--r--arm/Archi.v4
-rw-r--r--arm/Asm.v169
-rw-r--r--arm/Asmgen.v251
-rw-r--r--arm/Asmgenproof.v35
-rw-r--r--arm/Asmgenproof1.v383
-rw-r--r--arm/PrintAsm.ml486
-rw-r--r--arm/SelectOp.vp12
-rw-r--r--driver/Clflags.ml1
-rw-r--r--driver/Compopts.v3
-rw-r--r--driver/Driver.ml2
-rw-r--r--extraction/extraction.v2
-rw-r--r--runtime/arm/i64_dtos.S67
-rw-r--r--runtime/arm/i64_dtou.S56
-rw-r--r--runtime/arm/i64_sar.S (renamed from runtime/arm/i64_sar.s)24
-rw-r--r--runtime/arm/i64_sdiv.S (renamed from runtime/arm/i64_smod.s)39
-rw-r--r--runtime/arm/i64_shl.S (renamed from runtime/arm/i64_shl.s)29
-rw-r--r--runtime/arm/i64_shr.S (renamed from runtime/arm/i64_shr.s)25
-rw-r--r--runtime/arm/i64_smod.S (renamed from runtime/arm/i64_sdiv.s)39
-rw-r--r--runtime/arm/i64_stod.S21
-rw-r--r--runtime/arm/i64_stof.S42
-rw-r--r--runtime/arm/i64_udiv.S (renamed from runtime/arm/i64_udiv.s)13
-rw-r--r--runtime/arm/i64_udivmod.S (renamed from runtime/arm/i64_udivmod.s)25
-rw-r--r--runtime/arm/i64_umod.S (renamed from runtime/arm/i64_umod.s)11
-rw-r--r--runtime/arm/i64_utod.S28
-rw-r--r--runtime/arm/i64_utof.S38
-rw-r--r--runtime/arm/sysdeps.h84
-rw-r--r--runtime/arm/vararg.S36
-rw-r--r--runtime/ia32/sysdeps.h68
28 files changed, 1194 insertions, 799 deletions
diff --git a/arm/Archi.v b/arm/Archi.v
index 1306459..00d9895 100644
--- a/arm/Archi.v
+++ b/arm/Archi.v
@@ -47,5 +47,9 @@ Global Opaque big_endian
default_pl_64 choose_binop_pl_64
default_pl_32 choose_binop_pl_32.
+(** Which ABI to use: either the standard ARM EABI with floats passed
+ in integer registers, or the "hardfloat" variant of the EABI
+ that uses FP registers instead. *)
+
Inductive abi_kind := Softfloat | Hardfloat.
Parameter abi: abi_kind.
diff --git a/arm/Asm.v b/arm/Asm.v
index f054db0..0790c6f 100644
--- a/arm/Asm.v
+++ b/arm/Asm.v
@@ -91,29 +91,19 @@ Notation "'RA'" := IR14 (only parsing).
reference manuals for more details. Some instructions,
described below, are pseudo-instructions: they expand to
canned instruction sequences during the printing of the assembly
- code. *)
+ code. Most instructions are common to Thumb2 and ARM classic.
+ We use a few Thumb2-specific instructions when available, and avoid
+ using ARM classic features that are not in Thumb2. *)
Definition label := positive.
Inductive shift_op : Type :=
| SOimm: int -> shift_op
| SOreg: ireg -> shift_op
- | SOlslimm: ireg -> int -> shift_op
- | SOlslreg: ireg -> ireg -> shift_op
- | SOlsrimm: ireg -> int -> shift_op
- | SOlsrreg: ireg -> ireg -> shift_op
- | SOasrimm: ireg -> int -> shift_op
- | SOasrreg: ireg -> ireg -> shift_op
- | SOrorimm: ireg -> int -> shift_op
- | SOrorreg: ireg -> ireg -> shift_op.
-
-Inductive shift_addr : Type :=
- | SAimm: int -> shift_addr
- | SAreg: ireg -> shift_addr
- | SAlsl: ireg -> int -> shift_addr
- | SAlsr: ireg -> int -> shift_addr
- | SAasr: ireg -> int -> shift_addr
- | SAror: ireg -> int -> shift_addr.
+ | SOlsl: ireg -> int -> shift_op
+ | SOlsr: ireg -> int -> shift_op
+ | SOasr: ireg -> int -> shift_op
+ | SOror: ireg -> int -> shift_op.
Inductive testcond : Type :=
| TCeq: testcond (**r equal *)
@@ -133,6 +123,7 @@ Inductive instruction : Type :=
(* Core instructions *)
| Padd: ireg -> ireg -> shift_op -> instruction (**r integer addition *)
| Pand: ireg -> ireg -> shift_op -> instruction (**r bitwise and *)
+ | Pasr: ireg -> ireg -> ireg -> instruction (**r arithmetic shift right *)
| Pb: label -> instruction (**r branch to label *)
| Pbc: testcond -> label -> instruction (**r conditional branch to label *)
| Pbsymb: ident -> signature -> instruction (**r branch to symbol *)
@@ -142,23 +133,27 @@ Inductive instruction : Type :=
| Pbic: ireg -> ireg -> shift_op -> instruction (**r bitwise bit-clear *)
| Pcmp: ireg -> shift_op -> instruction (**r integer comparison *)
| Peor: ireg -> ireg -> shift_op -> instruction (**r bitwise exclusive or *)
- | Pldr: ireg -> ireg -> shift_addr -> instruction (**r int32 load *)
- | Pldr_a: ireg -> ireg -> shift_addr -> instruction (**r any32 load to int register *)
- | Pldrb: ireg -> ireg -> shift_addr -> instruction (**r unsigned int8 load *)
- | Pldrh: ireg -> ireg -> shift_addr -> instruction (**r unsigned int16 load *)
- | Pldrsb: ireg -> ireg -> shift_addr -> instruction (**r signed int8 load *)
- | Pldrsh: ireg -> ireg -> shift_addr -> instruction (**r unsigned int16 load *)
+ | Pldr: ireg -> ireg -> shift_op -> instruction (**r int32 load *)
+ | Pldr_a: ireg -> ireg -> shift_op -> instruction (**r any32 load to int register *)
+ | Pldrb: ireg -> ireg -> shift_op -> instruction (**r unsigned int8 load *)
+ | Pldrh: ireg -> ireg -> shift_op -> instruction (**r unsigned int16 load *)
+ | Pldrsb: ireg -> ireg -> shift_op -> instruction (**r signed int8 load *)
+ | Pldrsh: ireg -> ireg -> shift_op -> instruction (**r signed int16 load *)
+ | Plsl: ireg -> ireg -> ireg -> instruction (**r shift left *)
+ | Plsr: ireg -> ireg -> ireg -> instruction (**r logical shift right *)
| Pmla: ireg -> ireg -> ireg -> ireg -> instruction (**r integer multiply-add *)
| Pmov: ireg -> shift_op -> instruction (**r integer move *)
- | Pmovc: testcond -> ireg -> shift_op -> instruction (**r integer conditional move *)
+ | Pmovw: ireg -> int -> instruction (**r move 16-bit immediate *)
+ | Pmovt: ireg -> int -> instruction (**r set high 16 bits *)
| Pmul: ireg -> ireg -> ireg -> instruction (**r integer multiplication *)
| Pmvn: ireg -> shift_op -> instruction (**r integer complement *)
| Porr: ireg -> ireg -> shift_op -> instruction (**r bitwise or *)
| Prsb: ireg -> ireg -> shift_op -> instruction (**r integer reverse subtraction *)
- | Pstr: ireg -> ireg -> shift_addr -> instruction (**r int32 store *)
- | Pstr_a: ireg -> ireg -> shift_addr -> instruction (**r any32 store from int register *)
- | Pstrb: ireg -> ireg -> shift_addr -> instruction (**r int8 store *)
- | Pstrh: ireg -> ireg -> shift_addr -> instruction (**r int16 store *)
+ | Psbfx: ireg -> ireg -> int -> int -> instruction (**r signed bitfield extract *)
+ | Pstr: ireg -> ireg -> shift_op -> instruction (**r int32 store *)
+ | Pstr_a: ireg -> ireg -> shift_op -> instruction (**r any32 store from int register *)
+ | Pstrb: ireg -> ireg -> shift_op -> instruction (**r int8 store *)
+ | Pstrh: ireg -> ireg -> shift_op -> instruction (**r int16 store *)
| Psdiv: instruction (**r signed division *)
| Psmull: ireg -> ireg -> ireg -> ireg -> instruction (**r signed multiply long *)
| Psub: ireg -> ireg -> shift_op -> instruction (**r integer subtraction *)
@@ -206,6 +201,7 @@ Inductive instruction : Type :=
| Pfreeframe: Z -> int -> instruction (**r deallocate stack frame and restore previous frame *)
| Plabel: label -> instruction (**r define a code label *)
| Ploadsymbol: ireg -> ident -> int -> instruction (**r load the address of a symbol *)
+ | Pmovite: testcond -> ireg -> shift_op -> shift_op -> instruction (**r integer conditional move *)
| Pbtbl: ireg -> list label -> instruction (**r N-way branch through a jump table *)
| Pbuiltin: external_function -> list preg -> list preg -> instruction (**r built-in function *)
| Pannot: external_function -> list annot_param -> instruction (**r annotation statement *)
@@ -288,6 +284,11 @@ Fixpoint undef_regs (l: list preg) (rs: regset) : regset :=
| r :: l' => undef_regs l' (rs#r <- Vundef)
end.
+(** Undefining the condition codes *)
+
+Definition undef_flags (rs: regset) : regset :=
+ fun r => match r with CR _ => Vundef | _ => rs r end.
+
(** Assigning multiple registers *)
Fixpoint set_regs (rl: list preg) (vl: list val) (rs: regset) : regset :=
@@ -347,6 +348,9 @@ Inductive outcome: Type :=
Definition nextinstr (rs: regset) :=
rs#PC <- (Val.add rs#PC Vone).
+Definition nextinstr_nf (rs: regset) :=
+ nextinstr (undef_flags rs).
+
Definition goto_label (f: function) (lbl: label) (rs: regset) (m: mem) :=
match label_pos lbl 0 (fn_code f) with
| None => Stuck
@@ -363,29 +367,12 @@ Definition eval_shift_op (so: shift_op) (rs: regset) :=
match so with
| SOimm n => Vint n
| SOreg r => rs r
- | SOlslimm r n => Val.shl (rs r) (Vint n)
- | SOlslreg r r' => Val.shl (rs r) (rs r')
- | SOlsrimm r n => Val.shru (rs r) (Vint n)
- | SOlsrreg r r' => Val.shru (rs r) (rs r')
- | SOasrimm r n => Val.shr (rs r) (Vint n)
- | SOasrreg r r' => Val.shr (rs r) (rs r')
- | SOrorimm r n => Val.ror (rs r) (Vint n)
- | SOrorreg r r' => Val.ror (rs r) (rs r')
- end.
-
-(** Evaluation of [shift_addr] operands *)
-
-Definition eval_shift_addr (sa: shift_addr) (rs: regset) :=
- match sa with
- | SAimm n => Vint n
- | SAreg r => rs r
- | SAlsl r n => Val.shl (rs r) (Vint n)
- | SAlsr r n => Val.shru (rs r) (Vint n)
- | SAasr r n => Val.shr (rs r) (Vint n)
- | SAror r n => Val.ror (rs r) (Vint n)
+ | SOlsl r n => Val.shl (rs r) (Vint n)
+ | SOlsr r n => Val.shru (rs r) (Vint n)
+ | SOasr r n => Val.shr (rs r) (Vint n)
+ | SOror r n => Val.ror (rs r) (Vint n)
end.
-
(** Auxiliaries for memory accesses *)
Definition exec_load (chunk: memory_chunk) (addr: val) (r: preg)
@@ -515,22 +502,29 @@ Definition eval_testcond (c: testcond) (rs: regset) : option bool :=
(** Execution of a single instruction [i] in initial state
[rs] and [m]. Return updated state. For instructions
- that correspond to actual PowerPC instructions, the cases are
+ that correspond to actual ARM instructions, the cases are
straightforward transliterations of the informal descriptions
- given in the PowerPC reference manuals. For pseudo-instructions,
- refer to the informal descriptions given above. Note that
- we set to [Vundef] the registers used as temporaries by the
- expansions of the pseudo-instructions, so that the PPC code
- we generate cannot use those registers to hold values that
- must survive the execution of the pseudo-instruction.
+ given in the ARM reference manuals. For pseudo-instructions,
+ refer to the informal descriptions given above.
+
+ Note that we set to [Vundef] the registers used as temporaries by
+ the expansions of the pseudo-instructions, so that the ARM code we
+ generate cannot use those registers to hold values that must
+ survive the execution of the pseudo-instruction.
+
+ Likewise, for several instructions we set the condition flags
+ to [Vundef], so that we can expand them later to the S form
+ or to the non-S form, whichever is more compact in Thumb2.
*)
Definition exec_instr (f: function) (i: instruction) (rs: regset) (m: mem) : outcome :=
match i with
| Padd r1 r2 so =>
- Next (nextinstr (rs#r1 <- (Val.add rs#r2 (eval_shift_op so rs)))) m
+ Next (nextinstr_nf (rs#r1 <- (Val.add rs#r2 (eval_shift_op so rs)))) m
| Pand r1 r2 so =>
- Next (nextinstr (rs#r1 <- (Val.and rs#r2 (eval_shift_op so rs)))) m
+ Next (nextinstr_nf (rs#r1 <- (Val.and rs#r2 (eval_shift_op so rs)))) m
+ | Pasr r1 r2 r3 =>
+ Next (nextinstr_nf (rs#r1 <- (Val.shr rs#r2 rs#r3))) m
| Pb lbl =>
goto_label f lbl rs m
| Pbc cond lbl =>
@@ -548,49 +542,52 @@ Definition exec_instr (f: function) (i: instruction) (rs: regset) (m: mem) : out
| Pblreg r sg =>
Next (rs#IR14 <- (Val.add rs#PC Vone) #PC <- (rs#r)) m
| Pbic r1 r2 so =>
- Next (nextinstr (rs#r1 <- (Val.and rs#r2 (Val.notint (eval_shift_op so rs))))) m
+ Next (nextinstr_nf (rs#r1 <- (Val.and rs#r2 (Val.notint (eval_shift_op so rs))))) m
| Pcmp r1 so =>
Next (nextinstr (compare_int rs rs#r1 (eval_shift_op so rs) m)) m
| Peor r1 r2 so =>
- Next (nextinstr (rs#r1 <- (Val.xor rs#r2 (eval_shift_op so rs)))) m
+ Next (nextinstr_nf (rs#r1 <- (Val.xor rs#r2 (eval_shift_op so rs)))) m
| Pldr r1 r2 sa =>
- exec_load Mint32 (Val.add rs#r2 (eval_shift_addr sa rs)) r1 rs m
+ exec_load Mint32 (Val.add rs#r2 (eval_shift_op sa rs)) r1 rs m
| Pldr_a r1 r2 sa =>
- exec_load Many32 (Val.add rs#r2 (eval_shift_addr sa rs)) r1 rs m
+ exec_load Many32 (Val.add rs#r2 (eval_shift_op sa rs)) r1 rs m
| Pldrb r1 r2 sa =>
- exec_load Mint8unsigned (Val.add rs#r2 (eval_shift_addr sa rs)) r1 rs m
+ exec_load Mint8unsigned (Val.add rs#r2 (eval_shift_op sa rs)) r1 rs m
| Pldrh r1 r2 sa =>
- exec_load Mint16unsigned (Val.add rs#r2 (eval_shift_addr sa rs)) r1 rs m
+ exec_load Mint16unsigned (Val.add rs#r2 (eval_shift_op sa rs)) r1 rs m
| Pldrsb r1 r2 sa =>
- exec_load Mint8signed (Val.add rs#r2 (eval_shift_addr sa rs)) r1 rs m
+ exec_load Mint8signed (Val.add rs#r2 (eval_shift_op sa rs)) r1 rs m
| Pldrsh r1 r2 sa =>
- exec_load Mint16signed (Val.add rs#r2 (eval_shift_addr sa rs)) r1 rs m
+ exec_load Mint16signed (Val.add rs#r2 (eval_shift_op sa rs)) r1 rs m
+ | Plsl r1 r2 r3 =>
+ Next (nextinstr_nf (rs#r1 <- (Val.shl rs#r2 rs#r3))) m
+ | Plsr r1 r2 r3 =>
+ Next (nextinstr_nf (rs#r1 <- (Val.shru rs#r2 rs#r3))) m
| Pmla r1 r2 r3 r4 =>
Next (nextinstr (rs#r1 <- (Val.add (Val.mul rs#r2 rs#r3) rs#r4))) m
| Pmov r1 so =>
- Next (nextinstr (rs#r1 <- (eval_shift_op so rs))) m
- | Pmovc cond r1 so =>
- match eval_testcond cond rs with
- | Some true => Next (nextinstr (rs#r1 <- (eval_shift_op so rs))) m
- | Some false => Next (nextinstr rs) m
- | None => Next (nextinstr (rs#r1 <- Vundef)) m
- end
+ Next (nextinstr_nf (rs#r1 <- (eval_shift_op so rs))) m
+ | Pmovw r n =>
+ Next (nextinstr (rs#r <- (Vint n))) m
+ | Pmovt r n =>
+ Next (nextinstr (rs#r <- (Val.or (Val.and rs#r (Vint (Int.repr 65535)))
+ (Vint (Int.shl n (Int.repr 16)))))) m
| Pmul r1 r2 r3 =>
Next (nextinstr (rs#r1 <- (Val.mul rs#r2 rs#r3))) m
| Pmvn r1 so =>
- Next (nextinstr (rs#r1 <- (Val.notint (eval_shift_op so rs)))) m
+ Next (nextinstr_nf (rs#r1 <- (Val.notint (eval_shift_op so rs)))) m
| Porr r1 r2 so =>
- Next (nextinstr (rs#r1 <- (Val.or rs#r2 (eval_shift_op so rs)))) m
+ Next (nextinstr_nf (rs#r1 <- (Val.or rs#r2 (eval_shift_op so rs)))) m
| Prsb r1 r2 so =>
- Next (nextinstr (rs#r1 <- (Val.sub (eval_shift_op so rs) rs#r2))) m
+ Next (nextinstr_nf (rs#r1 <- (Val.sub (eval_shift_op so rs) rs#r2))) m
| Pstr r1 r2 sa =>
- exec_store Mint32 (Val.add rs#r2 (eval_shift_addr sa rs)) r1 rs m
+ exec_store Mint32 (Val.add rs#r2 (eval_shift_op sa rs)) r1 rs m
| Pstr_a r1 r2 sa =>
- exec_store Many32 (Val.add rs#r2 (eval_shift_addr sa rs)) r1 rs m
+ exec_store Many32 (Val.add rs#r2 (eval_shift_op sa rs)) r1 rs m
| Pstrb r1 r2 sa =>
- exec_store Mint8unsigned (Val.add rs#r2 (eval_shift_addr sa rs)) r1 rs m
+ exec_store Mint8unsigned (Val.add rs#r2 (eval_shift_op sa rs)) r1 rs m
| Pstrh r1 r2 sa =>
- exec_store Mint16unsigned (Val.add rs#r2 (eval_shift_addr sa rs)) r1 rs m
+ exec_store Mint16unsigned (Val.add rs#r2 (eval_shift_op sa rs)) r1 rs m
| Psdiv =>
match Val.divs rs#IR0 rs#IR1 with
| Some v => Next (nextinstr (rs#IR0 <- v
@@ -598,11 +595,13 @@ Definition exec_instr (f: function) (i: instruction) (rs: regset) (m: mem) : out
#IR3 <- Vundef #IR12 <- Vundef)) m
| None => Stuck
end
+ | Psbfx r1 r2 lsb sz =>
+ Next (nextinstr (rs#r1 <- (Val.sign_ext (Int.unsigned sz) (Val.shru rs#r2 (Vint lsb))))) m
| Psmull rdl rdh r1 r2 =>
Next (nextinstr (rs#rdl <- (Val.mul rs#r1 rs#r2)
#rdh <- (Val.mulhs rs#r1 rs#r2))) m
| Psub r1 r2 so =>
- Next (nextinstr (rs#r1 <- (Val.sub rs#r2 (eval_shift_op so rs)))) m
+ Next (nextinstr_nf (rs#r1 <- (Val.sub rs#r2 (eval_shift_op so rs)))) m
| Pudiv =>
match Val.divu rs#IR0 rs#IR1 with
| Some v => Next (nextinstr (rs#IR0 <- v
@@ -709,6 +708,14 @@ Definition exec_instr (f: function) (i: instruction) (rs: regset) (m: mem) : out
Next (nextinstr rs) m
| Ploadsymbol r1 lbl ofs =>
Next (nextinstr (rs#r1 <- (Genv.symbol_address ge lbl ofs))) m
+ | Pmovite cond r1 ifso ifnot =>
+ let v :=
+ match eval_testcond cond rs with
+ | Some true => eval_shift_op ifso rs
+ | Some false => eval_shift_op ifnot rs
+ | None => Vundef
+ end in
+ Next (nextinstr (rs#r1 <- v)) m
| Pbtbl r tbl =>
match rs#r with
| Vint n =>
diff --git a/arm/Asmgen.v b/arm/Asmgen.v
index fa4faa6..de4b87f 100644
--- a/arm/Asmgen.v
+++ b/arm/Asmgen.v
@@ -21,6 +21,7 @@ Require Import Op.
Require Import Locations.
Require Import Mach.
Require Import Asm.
+Require Import Compopts.
Open Local Scope string_scope.
Open Local Scope error_monad_scope.
@@ -33,62 +34,120 @@ Definition ireg_of (r: mreg) : res ireg :=
Definition freg_of (r: mreg) : res freg :=
match preg_of r with FR mr => OK mr | _ => Error(msg "Asmgen.freg_of") end.
-(** Recognition of integer immediate arguments.
-- For arithmetic operations, immediates are
- 8-bit quantities zero-extended and rotated right by 0, 2, 4, ... 30 bits.
-- For memory accesses of type [Mint32], immediate offsets are
- 12-bit quantities plus a sign bit.
-- For other memory accesses, immediate offsets are
- 8-bit quantities plus a sign bit. *)
-
-Fixpoint is_immed_arith_aux (n: nat) (x msk: int) {struct n}: bool :=
+(** Recognition of integer immediate arguments for arithmetic operations.
+- ARM: immediates are 8-bit quantities zero-extended and rotated right
+ by 0, 2, 4, ... 30 bits. In other words, [n] is an immediate iff
+ [rotate-left(n, p)] is between 0 and 255 for some [p = 0, 2, 4, ..., 30].
+- Thumb: immediates are 8-bit quantities zero-extended and shifted left
+ by 0, 1, ..., 31 bits. In other words, [n] is an immediate if
+ all bits are 0 except a run of 8 adjacent bits. In addition,
+ [00XY00XY] and [XY00XY00] and [XYXYXYXY] are immediates for
+ a given [XY] 8-bit constant.
+*)
+
+Fixpoint is_immed_arith_arm (n: nat) (x: int) {struct n}: bool :=
match n with
| Datatypes.O => false
- | Datatypes.S n' =>
- Int.eq (Int.and x (Int.not msk)) Int.zero ||
- is_immed_arith_aux n' x (Int.ror msk (Int.repr 2))
+ | Datatypes.S n =>
+ Int.eq x (Int.and x (Int.repr 255)) ||
+ is_immed_arith_arm n (Int.rol x (Int.repr 2))
+ end.
+
+Fixpoint is_immed_arith_thumb (n: nat) (x: int) {struct n}: bool :=
+ match n with
+ | Datatypes.O => true
+ | Datatypes.S n =>
+ Int.eq x (Int.and x (Int.repr 255)) ||
+ (Int.eq (Int.and x Int.one) Int.zero
+ && is_immed_arith_thumb n (Int.shru x Int.one))
end.
-Definition is_immed_arith (x: int) : bool :=
- is_immed_arith_aux 16%nat x (Int.repr 255).
-
-Definition is_immed_mem_word (x: int) : bool :=
- Int.lt x (Int.repr 4096) && Int.lt (Int.repr (-4096)) x.
-
-Definition mk_immed_mem_word (x: int) : int :=
- Int.sign_ext 13 x.
-
-Definition is_immed_mem_small (x: int) : bool :=
- Int.lt x (Int.repr 256) && Int.lt (Int.repr (-256)) x.
-
-Definition mk_immed_mem_small (x: int) : int :=
- Int.sign_ext 9 x.
-
-Definition is_immed_mem_float (x: int) : bool :=
- Int.eq (Int.and x (Int.repr 3)) Int.zero
- && Int.lt x (Int.repr 1024) && Int.lt (Int.repr (-1024)) x.
-
-Definition mk_immed_mem_float (x: int) : int :=
- Int.and (Int.sign_ext 11 x) (Int.repr 4294967288). (**r 0xfffffff8 *)
-
+Definition is_immed_arith_thumb_special (x: int): bool :=
+ let l1 := Int.and x (Int.repr 255) in
+ let l2 := Int.shl l1 (Int.repr 8) in
+ let l3 := Int.shl l2 (Int.repr 8) in
+ let l4 := Int.shl l3 (Int.repr 8) in
+ let l13 := Int.or l1 l3 in
+ let l24 := Int.or l2 l4 in
+ Int.eq x l13 || Int.eq x l24 || Int.eq x (Int.or l13 l24).
+
+Definition is_immed_arith (x: int): bool :=
+ if thumb tt
+ then is_immed_arith_thumb 24%nat x || is_immed_arith_thumb_special x
+ else is_immed_arith_arm 16%nat x.
+
+(** Recognition of integer immediate arguments for indexed memory accesses.
+- For 32-bit integers, immediate offsets are [(-2^12,2^12)] for ARM classic
+ and [(-2^8,2^12)] for Thumb2.
+- For 8- and 16-bit integers, immediate offsets are [(-2^8,2^8)].
+- For 32- and 64-bit integers, immediate offsets are multiples of 4
+ in [(-2^10,2^10)].
+
+For all 3 kinds of accesses, we provide not a recognizer but a synthesizer:
+a function taking an arbitrary offset [n] and returning a valid offset [n']
+that contains as many useful bits of [n] as possible, so that the
+computation of the remainder [n - n'] is as simple as possible.
+In particular, if [n] is a representable immediate argument, we should have
+[n' = n].
+*)
+
+Definition mk_immed_mem_word (x: int): int :=
+ if Int.ltu x Int.zero then
+ Int.neg (Int.zero_ext (if thumb tt then 8 else 12) (Int.neg x))
+ else
+ Int.zero_ext 12 x.
+
+Definition mk_immed_mem_small (x: int): int :=
+ if Int.ltu x Int.zero then
+ Int.neg (Int.zero_ext 8 (Int.neg x))
+ else
+ Int.zero_ext 8 x.
+
+Definition mk_immed_mem_float (x: int): int :=
+ let x := Int.and x (Int.repr (-4)) in (**r mask low 2 bits off *)
+ if Int.ltu x Int.zero then
+ Int.neg (Int.zero_ext 10 (Int.neg x))
+ else
+ Int.zero_ext 10 x.
+
(** Decomposition of a 32-bit integer into a list of immediate arguments,
whose sum or "or" or "xor" equals the integer. *)
-Fixpoint decompose_int_rec (N: nat) (n p: int) : list int :=
+Fixpoint decompose_int_arm (N: nat) (n p: int) : list int :=
match N with
| Datatypes.O =>
if Int.eq n Int.zero then nil else n :: nil
| Datatypes.S M =>
if Int.eq (Int.and n (Int.shl (Int.repr 3) p)) Int.zero then
- decompose_int_rec M n (Int.add p (Int.repr 2))
+ decompose_int_arm M n (Int.add p (Int.repr 2))
+ else
+ let m := Int.shl (Int.repr 255) p in
+ Int.and n m ::
+ decompose_int_arm M (Int.and n (Int.not m)) (Int.add p (Int.repr 2))
+ end.
+
+Fixpoint decompose_int_thumb (N: nat) (n p: int) : list int :=
+ match N with
+ | Datatypes.O =>
+ if Int.eq n Int.zero then nil else n :: nil
+ | Datatypes.S M =>
+ if Int.eq (Int.and n (Int.shl Int.one p)) Int.zero then
+ decompose_int_thumb M n (Int.add p Int.one)
else
let m := Int.shl (Int.repr 255) p in
Int.and n m ::
- decompose_int_rec M (Int.and n (Int.not m)) (Int.add p (Int.repr 2))
+ decompose_int_thumb M (Int.and n (Int.not m)) (Int.add p Int.one)
end.
+Definition decompose_int_base (n: int): list int :=
+ if thumb tt
+ then if is_immed_arith_thumb_special n
+ then n :: nil
+ else decompose_int_thumb 24%nat n Int.zero
+ else decompose_int_arm 12%nat n Int.zero.
+
Definition decompose_int (n: int) : list int :=
- match decompose_int_rec 12%nat n Int.zero with
+ match decompose_int_base n with
| nil => Int.zero :: nil
| l => l
end.
@@ -103,28 +162,46 @@ Definition iterate_op (op1 op2: shift_op -> instruction) (l: list int) (k: code)
(** Smart constructors for integer immediate arguments. *)
+Definition loadimm_thumb (r: ireg) (n: int) (k: code) :=
+ let hi := Int.shru n (Int.repr 16) in
+ if Int.eq hi Int.zero
+ then Pmovw r n :: k
+ else Pmovw r (Int.zero_ext 16 n) :: Pmovt r hi :: k.
+
Definition loadimm (r: ireg) (n: int) (k: code) :=
let d1 := decompose_int n in
let d2 := decompose_int (Int.not n) in
- if NPeano.leb (List.length d1) (List.length d2)
- then iterate_op (Pmov r) (Porr r r) d1 k
- else iterate_op (Pmvn r) (Pbic r r) d2 k.
+ let l1 := List.length d1 in
+ let l2 := List.length d2 in
+ if NPeano.leb l1 1%nat then
+ Pmov r (SOimm n) :: k
+ else if NPeano.leb l2 1%nat then
+ Pmvn r (SOimm (Int.not n)) :: k
+ else if thumb tt then
+ loadimm_thumb r n k
+ else if NPeano.leb l1 l2 then
+ iterate_op (Pmov r) (Porr r r) d1 k
+ else
+ iterate_op (Pmvn r) (Pbic r r) d2 k.
Definition addimm (r1 r2: ireg) (n: int) (k: code) :=
- let d1 := decompose_int n in
- let d2 := decompose_int (Int.neg n) in
- if NPeano.leb (List.length d1) (List.length d2)
- then iterate_op (Padd r1 r2) (Padd r1 r1) d1 k
- else iterate_op (Psub r1 r2) (Psub r1 r1) d2 k.
+ if Int.ltu (Int.repr (-256)) n then
+ Psub r1 r2 (SOimm (Int.neg n)) :: k
+ else
+ (let d1 := decompose_int n in
+ let d2 := decompose_int (Int.neg n) in
+ if NPeano.leb (List.length d1) (List.length d2)
+ then iterate_op (Padd r1 r2) (Padd r1 r1) d1 k
+ else iterate_op (Psub r1 r2) (Psub r1 r1) d2 k).
+
+Definition rsubimm (r1 r2: ireg) (n: int) (k: code) :=
+ iterate_op (Prsb r1 r2) (Padd r1 r1) (decompose_int n) k.
Definition andimm (r1 r2: ireg) (n: int) (k: code) :=
if is_immed_arith n
then Pand r1 r2 (SOimm n) :: k
else iterate_op (Pbic r1 r2) (Pbic r1 r1) (decompose_int (Int.not n)) k.
-Definition rsubimm (r1 r2: ireg) (n: int) (k: code) :=
- iterate_op (Prsb r1 r2) (Padd r1 r1) (decompose_int n) k.
-
Definition orimm (r1 r2: ireg) (n: int) (k: code) :=
iterate_op (Porr r1 r2) (Porr r1 r1) (decompose_int n) k.
@@ -135,10 +212,10 @@ Definition xorimm (r1 r2: ireg) (n: int) (k: code) :=
Definition transl_shift (s: shift) (r: ireg) : shift_op :=
match s with
- | Slsl n => SOlslimm r (s_amount n)
- | Slsr n => SOlsrimm r (s_amount n)
- | Sasr n => SOasrimm r (s_amount n)
- | Sror n => SOrorimm r (s_amount n)
+ | Slsl n => SOlsl r (s_amount n)
+ | Slsr n => SOlsr r (s_amount n)
+ | Sasr n => SOasr r (s_amount n)
+ | Sror n => SOror r (s_amount n)
end.
(** Translation of a condition. Prepends to [k] the instructions
@@ -288,12 +365,18 @@ Definition transl_op
OK (addimm r IR13 n k)
| Ocast8signed, a1 :: nil =>
do r <- ireg_of res; do r1 <- ireg_of a1;
- OK (Pmov r (SOlslimm r1 (Int.repr 24)) ::
- Pmov r (SOasrimm r (Int.repr 24)) :: k)
+ OK (if thumb tt then
+ Psbfx r r1 Int.zero (Int.repr 8) :: k
+ else
+ Pmov r (SOlsl r1 (Int.repr 24)) ::
+ Pmov r (SOasr r (Int.repr 24)) :: k)
| Ocast16signed, a1 :: nil =>
do r <- ireg_of res; do r1 <- ireg_of a1;
- OK (Pmov r (SOlslimm r1 (Int.repr 16)) ::
- Pmov r (SOasrimm r (Int.repr 16)) :: k)
+ OK (if thumb tt then
+ Psbfx r r1 Int.zero (Int.repr 16) :: k
+ else
+ Pmov r (SOlsl r1 (Int.repr 16)) ::
+ Pmov r (SOasr r (Int.repr 16)) :: k)
| Oadd, a1 :: a2 :: nil =>
do r <- ireg_of res; do r1 <- ireg_of a1; do r2 <- ireg_of a2;
OK (Padd r r1 (SOreg r2) :: k)
@@ -317,15 +400,11 @@ Definition transl_op
OK (rsubimm r r1 n k)
| Omul, a1 :: a2 :: nil =>
do r <- ireg_of res; do r1 <- ireg_of a1; do r2 <- ireg_of a2;
- OK (if negb (ireg_eq r r1) then Pmul r r1 r2 :: k
- else if negb (ireg_eq r r2) then Pmul r r2 r1 :: k
- else Pmul IR14 r1 r2 :: Pmov r (SOreg IR14) :: k)
+ OK (Pmul r r1 r2 :: k)
| Omla, a1 :: a2 :: a3 :: nil =>
do r <- ireg_of res; do r1 <- ireg_of a1;
do r2 <- ireg_of a2; do r3 <- ireg_of a3;
- OK (if negb (ireg_eq r r1) then Pmla r r1 r2 r3 :: k
- else if negb (ireg_eq r r2) then Pmla r r2 r1 r3 :: k
- else Pmla IR14 r1 r2 r3 :: Pmov r (SOreg IR14) :: k)
+ OK (Pmla r r1 r2 r3 :: k)
| Omulhs, a1 :: a2 :: nil =>
do r <- ireg_of res; do r1 <- ireg_of a1; do r2 <- ireg_of a2;
OK (Psmull IR14 r r1 r2 :: k)
@@ -383,13 +462,13 @@ Definition transl_op
OK (Pmvn r (transl_shift s r1) :: k)
| Oshl, a1 :: a2 :: nil =>
do r <- ireg_of res; do r1 <- ireg_of a1; do r2 <- ireg_of a2;
- OK (Pmov r (SOlslreg r1 r2) :: k)
+ OK (Plsl r r1 r2 :: k)
| Oshr, a1 :: a2 :: nil =>
do r <- ireg_of res; do r1 <- ireg_of a1; do r2 <- ireg_of a2;
- OK (Pmov r (SOasrreg r1 r2) :: k)
+ OK (Pasr r r1 r2 :: k)
| Oshru, a1 :: a2 :: nil =>
do r <- ireg_of res; do r1 <- ireg_of a1; do r2 <- ireg_of a2;
- OK (Pmov r (SOlsrreg r1 r2) :: k)
+ OK (Plsr r r1 r2 :: k)
| Oshift s, a1 :: nil =>
do r <- ireg_of res; do r1 <- ireg_of a1;
OK (Pmov r (transl_shift s r1) :: k)
@@ -398,9 +477,9 @@ Definition transl_op
if Int.eq n Int.zero then
OK (Pmov r (SOreg r1) :: k)
else
- OK (Pmov IR14 (SOasrimm r1 (Int.repr 31)) ::
- Padd IR14 r1 (SOlsrimm IR14 (Int.sub Int.iwordsize n)) ::
- Pmov r (SOasrimm IR14 n) :: k)
+ OK (Pmov IR14 (SOasr r1 (Int.repr 31)) ::
+ Padd IR14 r1 (SOlsr IR14 (Int.sub Int.iwordsize n)) ::
+ Pmov r (SOasr IR14 n) :: k)
| Onegf, a1 :: nil =>
do r <- freg_of res; do r1 <- freg_of a1;
OK (Pfnegd r r1 :: k)
@@ -470,9 +549,7 @@ Definition transl_op
| Ocmp cmp, _ =>
do r <- ireg_of res;
transl_cond cmp args
- (Pmov r (SOimm Int.zero) ::
- Pmovc (cond_for_cond cmp) r (SOimm Int.one) ::
- k)
+ (Pmovite (cond_for_cond cmp) r (SOimm Int.one) (SOimm Int.zero) :: k)
| _, _ =>
Error(msg "Asmgen.transl_op")
end.
@@ -489,14 +566,14 @@ Definition indexed_memory_access
else addimm IR14 base (Int.sub n n1) (mk_instr IR14 n1 :: k).
Definition loadind_int (base: ireg) (ofs: int) (dst: ireg) (k: code) :=
- indexed_memory_access (fun base n => Pldr dst base (SAimm n)) mk_immed_mem_word base ofs k.
+ indexed_memory_access (fun base n => Pldr dst base (SOimm n)) mk_immed_mem_word base ofs k.
Definition loadind (base: ireg) (ofs: int) (ty: typ) (dst: mreg) (k: code) :=
match ty, preg_of dst with
| Tint, IR r =>
- OK (indexed_memory_access (fun base n => Pldr r base (SAimm n)) mk_immed_mem_word base ofs k)
+ OK (indexed_memory_access (fun base n => Pldr r base (SOimm n)) mk_immed_mem_word base ofs k)
| Tany32, IR r =>
- OK (indexed_memory_access (fun base n => Pldr_a r base (SAimm n)) mk_immed_mem_word base ofs k)
+ OK (indexed_memory_access (fun base n => Pldr_a r base (SOimm n)) mk_immed_mem_word base ofs k)
| Tsingle, FR r =>
OK (indexed_memory_access (Pflds r) mk_immed_mem_float base ofs k)
| Tfloat, FR r =>
@@ -510,9 +587,9 @@ Definition loadind (base: ireg) (ofs: int) (ty: typ) (dst: mreg) (k: code) :=
Definition storeind (src: mreg) (base: ireg) (ofs: int) (ty: typ) (k: code) :=
match ty, preg_of src with
| Tint, IR r =>
- OK (indexed_memory_access (fun base n => Pstr r base (SAimm n)) mk_immed_mem_word base ofs k)
+ OK (indexed_memory_access (fun base n => Pstr r base (SOimm n)) mk_immed_mem_word base ofs k)
| Tany32, IR r =>
- OK (indexed_memory_access (fun base n => Pstr_a r base (SAimm n)) mk_immed_mem_word base ofs k)
+ OK (indexed_memory_access (fun base n => Pstr_a r base (SOimm n)) mk_immed_mem_word base ofs k)
| Tsingle, FR r =>
OK (indexed_memory_access (Pfsts r) mk_immed_mem_float base ofs k)
| Tfloat, FR r =>
@@ -525,17 +602,9 @@ Definition storeind (src: mreg) (base: ireg) (ofs: int) (ty: typ) (k: code) :=
(** Translation of memory accesses *)
-Definition transl_shift_addr (s: shift) (r: ireg) : shift_addr :=
- match s with
- | Slsl n => SAlsl r (s_amount n)
- | Slsr n => SAlsr r (s_amount n)
- | Sasr n => SAasr r (s_amount n)
- | Sror n => SAror r (s_amount n)
- end.
-
Definition transl_memory_access
(mk_instr_imm: ireg -> int -> instruction)
- (mk_instr_gen: option (ireg -> shift_addr -> instruction))
+ (mk_instr_gen: option (ireg -> shift_op -> instruction))
(mk_immed: int -> int)
(addr: addressing) (args: list mreg) (k: code) :=
match addr, args with
@@ -546,7 +615,7 @@ Definition transl_memory_access
match mk_instr_gen with
| Some f =>
do r1 <- ireg_of a1; do r2 <- ireg_of a2;
- OK (f r1 (SAreg r2) :: k)
+ OK (f r1 (SOreg r2) :: k)
| None =>
Error (msg "Asmgen.Aindexed2")
end
@@ -554,7 +623,7 @@ Definition transl_memory_access
match mk_instr_gen with
| Some f =>
do r1 <- ireg_of a1; do r2 <- ireg_of a2;
- OK (f r1 (transl_shift_addr s r2) :: k)
+ OK (f r1 (transl_shift s r2) :: k)
| None =>
Error (msg "Asmgen.Aindexed2shift")
end
@@ -565,12 +634,12 @@ Definition transl_memory_access
end.
Definition transl_memory_access_int
- (mk_instr: ireg -> ireg -> shift_addr -> instruction)
+ (mk_instr: ireg -> ireg -> shift_op -> instruction)
(mk_immed: int -> int)
(dst: mreg) (addr: addressing) (args: list mreg) (k: code) :=
do rd <- ireg_of dst;
transl_memory_access
- (fun r n => mk_instr rd r (SAimm n))
+ (fun r n => mk_instr rd r (SOimm n))
(Some (mk_instr rd))
mk_immed addr args k.
@@ -729,7 +798,7 @@ Definition transl_function (f: Mach.function) :=
do c <- transl_code f f.(Mach.fn_code) true;
OK (mkfunction f.(Mach.fn_sig)
(Pallocframe f.(fn_stacksize) f.(fn_link_ofs) ::
- Pstr IR14 IR13 (SAimm f.(fn_retaddr_ofs)) :: c)).
+ Pstr IR14 IR13 (SOimm f.(fn_retaddr_ofs)) :: c)).
Definition transf_function (f: Mach.function) : res Asm.function :=
do tf <- transl_function f;
diff --git a/arm/Asmgenproof.v b/arm/Asmgenproof.v
index 341f6a0..713e012 100644
--- a/arm/Asmgenproof.v
+++ b/arm/Asmgenproof.v
@@ -27,6 +27,7 @@ Require Import Op.
Require Import Locations.
Require Import Conventions.
Require Import Mach.
+Require Import Compopts.
Require Import Asm.
Require Import Asmgen.
Require Import Asmgenproof0.
@@ -179,8 +180,13 @@ Remark loadimm_label:
forall r n k, tail_nolabel k (loadimm r n k).
Proof.
intros. unfold loadimm.
- destruct (NPeano.leb (length (decompose_int n)) (length (decompose_int (Int.not n))));
- auto with labels.
+ set (l1 := length (decompose_int n)).
+ set (l2 := length (decompose_int (Int.not n))).
+ destruct (NPeano.leb l1 1%nat). TailNoLabel.
+ destruct (NPeano.leb l2 1%nat). TailNoLabel.
+ destruct (thumb tt). unfold loadimm_thumb.
+ destruct (Int.eq (Int.shru n (Int.repr 16)) Int.zero); TailNoLabel.
+ destruct (NPeano.leb l1 l2); auto with labels.
Qed.
Hint Resolve loadimm_label: labels.
@@ -188,6 +194,7 @@ Remark addimm_label:
forall r1 r2 n k, tail_nolabel k (addimm r1 r2 n k).
Proof.
intros; unfold addimm.
+ destruct (Int.ltu (Int.repr (-256)) n). TailNoLabel.
destruct (NPeano.leb (length (decompose_int n)) (length (decompose_int (Int.neg n))));
auto with labels.
Qed.
@@ -262,15 +269,15 @@ Proof.
Opaque Int.eq.
unfold transl_op; intros; destruct op; TailNoLabel.
destruct (preg_of r); try discriminate; destruct (preg_of m); inv H; TailNoLabel.
- destruct (negb (ireg_eq x x0)). TailNoLabel. destruct (negb (ireg_eq x x1)); TailNoLabel.
- destruct (negb (ireg_eq x x0)). TailNoLabel. destruct (negb (ireg_eq x x1)); TailNoLabel.
+ destruct (thumb tt); TailNoLabel.
+ destruct (thumb tt); TailNoLabel.
eapply tail_nolabel_trans; TailNoLabel.
eapply tail_nolabel_trans. eapply transl_cond_label; eauto. TailNoLabel.
Qed.
Remark transl_memory_access_label:
forall (mk_instr_imm: ireg -> int -> instruction)
- (mk_instr_gen: option (ireg -> shift_addr -> instruction))
+ (mk_instr_gen: option (ireg -> shift_op -> instruction))
(mk_immed: int -> int)
(addr: addressing) (args: list mreg) c k,
transl_memory_access mk_instr_imm mk_instr_gen mk_immed addr args k = OK c ->
@@ -556,8 +563,6 @@ Proof.
exists rs'; split. eauto.
split. eapply agree_undef_regs; eauto with asmgen.
simpl; intros. rewrite Q; auto with asmgen.
-Local Transparent destroyed_by_setstack.
- destruct ty; simpl; auto with asmgen.
- (* Mgetparam *)
assert (f0 = f) by congruence; subst f0.
@@ -687,7 +692,7 @@ Opaque loadind.
k rs2 m2'
/\ rs2#SP = parent_sp s
/\ rs2#RA = parent_ra s
- /\ forall r, r <> PC -> r <> SP -> r <> IR14 -> rs2#r = rs0#r).
+ /\ forall r, if_preg r = true -> r <> SP -> r <> IR14 -> rs2#r = rs0#r).
{
intros.
exploit loadind_int_correct. eexact C. intros [rs1 [P [Q R]]].
@@ -695,9 +700,7 @@ Opaque loadind.
eapply exec_straight_trans. eexact P. apply exec_straight_one.
simpl. rewrite R; auto with asmgen. unfold chunk_of_type in A. rewrite A.
rewrite <- (sp_val _ _ _ AG). rewrite E. eauto. auto.
- split. Simpl.
- split. Simpl.
- intros. Simpl.
+ split. Simpl. split. Simpl. intros. Simpl.
}
destruct ros as [rf|fid]; simpl in H; monadInv H7.
+ (* Indirect call *)
@@ -794,19 +797,19 @@ Opaque loadind.
left; eapply exec_straight_steps_goto; eauto.
intros. simpl in TR.
destruct (transl_cond_correct tge tf cond args _ rs0 m' _ TR) as [rs' [A [B C]]].
- rewrite EC in B.
+ rewrite EC in B. destruct B as [Bpos Bneg].
econstructor; econstructor; econstructor; split. eexact A.
split. eapply agree_undef_regs; eauto with asmgen.
- simpl. rewrite B. reflexivity.
+ simpl. rewrite Bpos. reflexivity.
- (* Mcond false *)
exploit eval_condition_lessdef. eapply preg_vals; eauto. eauto. eauto. intros EC.
left; eapply exec_straight_steps; eauto. intros. simpl in TR.
destruct (transl_cond_correct tge tf cond args _ rs0 m' _ TR) as [rs' [A [B C]]].
- rewrite EC in B.
+ rewrite EC in B. destruct B as [Bpos Bneg].
econstructor; split.
eapply exec_straight_trans. eexact A.
- apply exec_straight_one. simpl. rewrite B. reflexivity. auto.
+ apply exec_straight_one. simpl. rewrite Bpos. reflexivity. auto.
split. eapply agree_undef_regs; eauto with asmgen.
intros; Simpl.
simpl. congruence.
@@ -849,7 +852,7 @@ Opaque loadind.
k rs2 m2'
/\ rs2#SP = parent_sp s
/\ rs2#RA = parent_ra s
- /\ forall r, r <> PC -> r <> SP -> r <> IR14 -> rs2#r = rs0#r).
+ /\ forall r, if_preg r = true -> r <> SP -> r <> IR14 -> rs2#r = rs0#r).
{
intros.
exploit loadind_int_correct. eexact C. intros [rs1 [P [Q R]]].
diff --git a/arm/Asmgenproof1.v b/arm/Asmgenproof1.v
index a0d6752..c859434 100644
--- a/arm/Asmgenproof1.v
+++ b/arm/Asmgenproof1.v
@@ -24,6 +24,7 @@ Require Import Globalenvs.
Require Import Op.
Require Import Locations.
Require Import Mach.
+Require Import Compopts.
Require Import Asm.
Require Import Asmgen.
Require Import Conventions.
@@ -45,13 +46,55 @@ Proof.
Qed.
Hint Resolve ireg_of_not_R14': asmgen.
+(** [undef_flags] and [nextinstr_nf] *)
+
+Lemma nextinstr_nf_pc:
+ forall rs, (nextinstr_nf rs)#PC = Val.add rs#PC Vone.
+Proof.
+ intros. reflexivity.
+Qed.
+
+Definition if_preg (r: preg) : bool :=
+ match r with
+ | IR _ => true
+ | FR _ => true
+ | CR _ => false
+ | PC => false
+ end.
+
+Lemma data_if_preg: forall r, data_preg r = true -> if_preg r = true.
+Proof.
+ intros. destruct r; reflexivity || discriminate.
+Qed.
+
+Lemma if_preg_not_PC: forall r, if_preg r = true -> r <> PC.
+Proof.
+ intros; red; intros; subst; discriminate.
+Qed.
+
+Hint Resolve data_if_preg if_preg_not_PC: asmgen.
+
+Lemma nextinstr_nf_inv:
+ forall r rs, if_preg r = true -> (nextinstr_nf rs)#r = rs#r.
+Proof.
+ intros. destruct r; reflexivity || discriminate.
+Qed.
+
+Lemma nextinstr_nf_inv1:
+ forall r rs, data_preg r = true -> (nextinstr_nf rs)#r = rs#r.
+Proof.
+ intros. destruct r; reflexivity || discriminate.
+Qed.
+
(** Useful simplification tactic *)
Ltac Simplif :=
((rewrite nextinstr_inv by eauto with asmgen)
|| (rewrite nextinstr_inv1 by eauto with asmgen)
+ || (rewrite nextinstr_nf_inv by eauto with asmgen)
|| (rewrite Pregmap.gss)
|| (rewrite nextinstr_pc)
+ || (rewrite nextinstr_nf_pc)
|| (rewrite Pregmap.gso by eauto with asmgen)); auto with asmgen.
Ltac Simpl := repeat Simplif.
@@ -65,8 +108,8 @@ Variable fn: function.
(** Decomposition of an integer constant *)
-Lemma decompose_int_rec_or:
- forall N n p x, List.fold_left Int.or (decompose_int_rec N n p) x = Int.or x n.
+Lemma decompose_int_arm_or:
+ forall N n p x, List.fold_left Int.or (decompose_int_arm N n p) x = Int.or x n.
Proof.
induction N; intros; simpl.
predSpec Int.eq Int.eq_spec n Int.zero; simpl.
@@ -78,8 +121,8 @@ Proof.
rewrite Int.or_not_self. apply Int.and_mone.
Qed.
-Lemma decompose_int_rec_xor:
- forall N n p x, List.fold_left Int.xor (decompose_int_rec N n p) x = Int.xor x n.
+Lemma decompose_int_arm_xor:
+ forall N n p x, List.fold_left Int.xor (decompose_int_arm N n p) x = Int.xor x n.
Proof.
induction N; intros; simpl.
predSpec Int.eq Int.eq_spec n Int.zero; simpl.
@@ -91,8 +134,8 @@ Proof.
rewrite Int.xor_not_self. apply Int.and_mone.
Qed.
-Lemma decompose_int_rec_add:
- forall N n p x, List.fold_left Int.add (decompose_int_rec N n p) x = Int.add x n.
+Lemma decompose_int_arm_add:
+ forall N n p x, List.fold_left Int.add (decompose_int_arm N n p) x = Int.add x n.
Proof.
induction N; intros; simpl.
predSpec Int.eq Int.eq_spec n Int.zero; simpl.
@@ -104,10 +147,56 @@ Proof.
rewrite Int.or_not_self. apply Int.and_mone. apply Int.and_not_self.
Qed.
-Remark decompose_int_rec_nil:
- forall N n p, decompose_int_rec N n p = nil -> n = Int.zero.
+Remark decompose_int_arm_nil:
+ forall N n p, decompose_int_arm N n p = nil -> n = Int.zero.
Proof.
- intros. generalize (decompose_int_rec_or N n p Int.zero). rewrite H. simpl.
+ intros. generalize (decompose_int_arm_or N n p Int.zero). rewrite H. simpl.
+ rewrite Int.or_commut; rewrite Int.or_zero; auto.
+Qed.
+
+Lemma decompose_int_thumb_or:
+ forall N n p x, List.fold_left Int.or (decompose_int_thumb N n p) x = Int.or x n.
+Proof.
+ induction N; intros; simpl.
+ predSpec Int.eq Int.eq_spec n Int.zero; simpl.
+ subst n. rewrite Int.or_zero. auto.
+ auto.
+ predSpec Int.eq Int.eq_spec (Int.and n (Int.shl Int.one p)) Int.zero.
+ auto.
+ simpl. rewrite IHN. rewrite Int.or_assoc. decEq. rewrite <- Int.and_or_distrib.
+ rewrite Int.or_not_self. apply Int.and_mone.
+Qed.
+
+Lemma decompose_int_thumb_xor:
+ forall N n p x, List.fold_left Int.xor (decompose_int_thumb N n p) x = Int.xor x n.
+Proof.
+ induction N; intros; simpl.
+ predSpec Int.eq Int.eq_spec n Int.zero; simpl.
+ subst n. rewrite Int.xor_zero. auto.
+ auto.
+ predSpec Int.eq Int.eq_spec (Int.and n (Int.shl Int.one p)) Int.zero.
+ auto.
+ simpl. rewrite IHN. rewrite Int.xor_assoc. decEq. rewrite <- Int.and_xor_distrib.
+ rewrite Int.xor_not_self. apply Int.and_mone.
+Qed.
+
+Lemma decompose_int_thumb_add:
+ forall N n p x, List.fold_left Int.add (decompose_int_thumb N n p) x = Int.add x n.
+Proof.
+ induction N; intros; simpl.
+ predSpec Int.eq Int.eq_spec n Int.zero; simpl.
+ subst n. rewrite Int.add_zero. auto.
+ auto.
+ predSpec Int.eq Int.eq_spec (Int.and n (Int.shl Int.one p)) Int.zero.
+ auto.
+ simpl. rewrite IHN. rewrite Int.add_assoc. decEq. rewrite Int.add_and.
+ rewrite Int.or_not_self. apply Int.and_mone. apply Int.and_not_self.
+Qed.
+
+Remark decompose_int_thumb_nil:
+ forall N n p, decompose_int_thumb N n p = nil -> n = Int.zero.
+Proof.
+ intros. generalize (decompose_int_thumb_or N n p Int.zero). rewrite H. simpl.
rewrite Int.or_commut; rewrite Int.or_zero; auto.
Qed.
@@ -116,23 +205,31 @@ Lemma decompose_int_general:
(forall v1 n2 n3, f (f v1 n2) n3 = f v1 (g n2 n3)) ->
(forall n1 n2 n3, g (g n1 n2) n3 = g n1 (g n2 n3)) ->
(forall n, g Int.zero n = n) ->
- (forall N n p x, List.fold_left g (decompose_int_rec N n p) x = g x n) ->
+ (forall N n p x, List.fold_left g (decompose_int_arm N n p) x = g x n) ->
+ (forall N n p x, List.fold_left g (decompose_int_thumb N n p) x = g x n) ->
forall n v,
List.fold_left f (decompose_int n) v = f v n.
Proof.
- intros f g DISTR ASSOC ZERO DECOMP.
+ intros f g DISTR ASSOC ZERO DECOMP1 DECOMP2.
assert (A: forall l x y, g x (fold_left g l y) = fold_left g l (g x y)).
induction l; intros; simpl. auto. rewrite IHl. decEq. rewrite ASSOC; auto.
assert (B: forall l v n, fold_left f l (f v n) = f v (fold_left g l n)).
induction l; intros; simpl.
auto.
rewrite IHl. rewrite DISTR. decEq. decEq. auto.
- intros. unfold decompose_int.
- destruct (decompose_int_rec 12 n Int.zero) eqn:?.
- simpl. exploit decompose_int_rec_nil; eauto. congruence.
- simpl. rewrite B. decEq.
- generalize (DECOMP 12%nat n Int.zero Int.zero).
- rewrite Heql. simpl. repeat rewrite ZERO. auto.
+ intros. unfold decompose_int, decompose_int_base.
+ destruct (thumb tt); [destruct (is_immed_arith_thumb_special n)|].
+- reflexivity.
+- destruct (decompose_int_thumb 24%nat n Int.zero) eqn:DB.
+ + simpl. exploit decompose_int_thumb_nil; eauto. congruence.
+ + simpl. rewrite B. decEq.
+ generalize (DECOMP2 24%nat n Int.zero Int.zero).
+ rewrite DB; simpl. rewrite ! ZERO. auto.
+- destruct (decompose_int_arm 12%nat n Int.zero) eqn:DB.
+ + simpl. exploit decompose_int_arm_nil; eauto. congruence.
+ + simpl. rewrite B. decEq.
+ generalize (DECOMP1 12%nat n Int.zero Int.zero).
+ rewrite DB; simpl. rewrite ! ZERO. auto.
Qed.
Lemma decompose_int_or:
@@ -143,7 +240,7 @@ Proof.
intros. rewrite Val.or_assoc. auto.
apply Int.or_assoc.
intros. rewrite Int.or_commut. apply Int.or_zero.
- apply decompose_int_rec_or.
+ apply decompose_int_arm_or. apply decompose_int_thumb_or.
Qed.
Lemma decompose_int_bic:
@@ -154,7 +251,7 @@ Proof.
intros. rewrite Val.and_assoc. simpl. decEq. decEq. rewrite Int.not_or_and_not. auto.
apply Int.or_assoc.
intros. rewrite Int.or_commut. apply Int.or_zero.
- apply decompose_int_rec_or.
+ apply decompose_int_arm_or. apply decompose_int_thumb_or.
Qed.
Lemma decompose_int_xor:
@@ -165,7 +262,7 @@ Proof.
intros. rewrite Val.xor_assoc. auto.
apply Int.xor_assoc.
intros. rewrite Int.xor_commut. apply Int.xor_zero.
- apply decompose_int_rec_xor.
+ apply decompose_int_arm_xor. apply decompose_int_thumb_xor.
Qed.
Lemma decompose_int_add:
@@ -176,7 +273,7 @@ Proof.
intros. rewrite Val.add_assoc. auto.
apply Int.add_assoc.
intros. rewrite Int.add_commut. apply Int.add_zero.
- apply decompose_int_rec_add.
+ apply decompose_int_arm_add. apply decompose_int_thumb_add.
Qed.
Lemma decompose_int_sub:
@@ -188,26 +285,26 @@ Proof.
rewrite Int.neg_add_distr; auto.
apply Int.add_assoc.
intros. rewrite Int.add_commut. apply Int.add_zero.
- apply decompose_int_rec_add.
+ apply decompose_int_arm_add. apply decompose_int_thumb_add.
Qed.
Lemma iterate_op_correct:
forall op1 op2 (f: val -> int -> val) (rs: regset) (r: ireg) m v0 n k,
(forall (rs:regset) n,
exec_instr ge fn (op2 (SOimm n)) rs m =
- Next (nextinstr (rs#r <- (f (rs#r) n))) m) ->
+ Next (nextinstr_nf (rs#r <- (f (rs#r) n))) m) ->
(forall n,
exec_instr ge fn (op1 (SOimm n)) rs m =
- Next (nextinstr (rs#r <- (f v0 n))) m) ->
+ Next (nextinstr_nf (rs#r <- (f v0 n))) m) ->
exists rs',
exec_straight ge fn (iterate_op op1 op2 (decompose_int n) k) rs m k rs' m
/\ rs'#r = List.fold_left f (decompose_int n) v0
- /\ forall r': preg, r' <> r -> r' <> PC -> rs'#r' = rs#r'.
+ /\ forall r': preg, r' <> r -> if_preg r' = true -> rs'#r' = rs#r'.
Proof.
intros until k; intros SEM2 SEM1.
unfold iterate_op.
- destruct (decompose_int n) as [ | i tl] eqn:?.
- unfold decompose_int in Heql. destruct (decompose_int_rec 12%nat n Int.zero); congruence.
+ destruct (decompose_int n) as [ | i tl] eqn:DI.
+ unfold decompose_int in DI. destruct (decompose_int_base n); congruence.
revert k. pattern tl. apply List.rev_ind.
(* base case *)
intros; simpl. econstructor.
@@ -231,22 +328,52 @@ Lemma loadimm_correct:
exists rs',
exec_straight ge fn (loadimm r n k) rs m k rs' m
/\ rs'#r = Vint n
- /\ forall r': preg, r' <> r -> r' <> PC -> rs'#r' = rs#r'.
+ /\ forall r': preg, r' <> r -> if_preg r' = true -> rs'#r' = rs#r'.
Proof.
intros. unfold loadimm.
- destruct (NPeano.leb (length (decompose_int n)) (length (decompose_int (Int.not n)))).
- (* mov - orr* *)
+ set (l1 := length (decompose_int n)).
+ set (l2 := length (decompose_int (Int.not n))).
+ destruct (NPeano.leb l1 1%nat).
+{ (* single mov *)
+ econstructor; split. apply exec_straight_one. simpl; reflexivity. auto.
+ split; intros; Simpl. }
+ destruct (NPeano.leb l2 1%nat).
+{ (* single movn *)
+ econstructor; split. apply exec_straight_one.
+ simpl. rewrite Int.not_involutive. reflexivity. auto.
+ split; intros; Simpl. }
+ destruct (thumb tt).
+{ (* movw / movt *)
+ unfold loadimm_thumb. destruct (Int.eq (Int.shru n (Int.repr 16)) Int.zero).
+ econstructor; split.
+ apply exec_straight_one. simpl; eauto. auto. split; intros; Simpl.
+ econstructor; split.
+ eapply exec_straight_two. simpl; reflexivity. simpl; reflexivity. auto. auto.
+ split; intros; Simpl. simpl. f_equal. rewrite Int.zero_ext_and by omega.
+ rewrite Int.and_assoc. change 65535 with (two_p 16 - 1). rewrite Int.and_idem.
+ apply Int.same_bits_eq; intros.
+ rewrite Int.bits_or, Int.bits_and, Int.bits_shl, Int.testbit_repr by auto.
+ rewrite Int.Ztestbit_two_p_m1 by omega. change (Int.unsigned (Int.repr 16)) with 16.
+ destruct (zlt i 16).
+ rewrite andb_true_r, orb_false_r; auto.
+ rewrite andb_false_r; simpl. rewrite Int.bits_shru by omega.
+ change (Int.unsigned (Int.repr 16)) with 16. rewrite zlt_true by omega. f_equal; omega.
+}
+ destruct (NPeano.leb l1 l2).
+{ (* mov - orr* *)
replace (Vint n) with (List.fold_left (fun v i => Val.or v (Vint i)) (decompose_int n) Vzero).
apply iterate_op_correct.
auto.
intros; simpl. rewrite Int.or_commut; rewrite Int.or_zero; auto.
rewrite decompose_int_or. simpl. rewrite Int.or_commut; rewrite Int.or_zero; auto.
- (* mvn - bic* *)
+}
+{ (* mvn - bic* *)
replace (Vint n) with (List.fold_left (fun v i => Val.and v (Vint (Int.not i))) (decompose_int (Int.not n)) (Vint Int.mone)).
apply iterate_op_correct.
auto.
intros. simpl. rewrite Int.and_commut; rewrite Int.and_mone; auto.
rewrite decompose_int_bic. simpl. rewrite Int.not_involutive. rewrite Int.and_commut. rewrite Int.and_mone; auto.
+}
Qed.
(** Add integer immediate. *)
@@ -256,9 +383,13 @@ Lemma addimm_correct:
exists rs',
exec_straight ge fn (addimm r1 r2 n k) rs m k rs' m
/\ rs'#r1 = Val.add rs#r2 (Vint n)
- /\ forall r': preg, r' <> r1 -> r' <> PC -> rs'#r' = rs#r'.
+ /\ forall r': preg, r' <> r1 -> if_preg r' = true -> rs'#r' = rs#r'.
Proof.
intros. unfold addimm.
+ destruct (Int.ltu (Int.repr (-256)) n).
+ (* sub *)
+ econstructor; split. apply exec_straight_one; simpl; auto.
+ split; intros; Simpl. apply Val.sub_opp_add.
destruct (NPeano.leb (length (decompose_int n)) (length (decompose_int (Int.neg n)))).
(* add - add* *)
replace (Val.add (rs r2) (Vint n))
@@ -283,21 +414,17 @@ Lemma andimm_correct:
exists rs',
exec_straight ge fn (andimm r1 r2 n k) rs m k rs' m
/\ rs'#r1 = Val.and rs#r2 (Vint n)
- /\ forall r': preg, r' <> r1 -> r' <> PC -> rs'#r' = rs#r'.
+ /\ forall r': preg, r' <> r1 -> if_preg r' = true -> rs'#r' = rs#r'.
Proof.
- intros. unfold andimm.
+ intros. unfold andimm. destruct (is_immed_arith n).
(* andi *)
- case (is_immed_arith n).
- exists (nextinstr (rs#r1 <- (Val.and rs#r2 (Vint n)))).
- split. apply exec_straight_one; auto.
- split. rewrite nextinstr_inv; auto with asmgen. apply Pregmap.gss.
- intros. rewrite nextinstr_inv; auto. apply Pregmap.gso; auto.
+ exists (nextinstr_nf (rs#r1 <- (Val.and rs#r2 (Vint n)))).
+ split. apply exec_straight_one; auto. split; intros; Simpl.
(* bic - bic* *)
replace (Val.and (rs r2) (Vint n))
with (List.fold_left (fun v i => Val.and v (Vint (Int.not i))) (decompose_int (Int.not n)) (rs r2)).
apply iterate_op_correct.
- auto.
- auto.
+ auto. auto.
rewrite decompose_int_bic. rewrite Int.not_involutive. auto.
Qed.
@@ -308,7 +435,7 @@ Lemma rsubimm_correct:
exists rs',
exec_straight ge fn (rsubimm r1 r2 n k) rs m k rs' m
/\ rs'#r1 = Val.sub (Vint n) rs#r2
- /\ forall r': preg, r' <> r1 -> r' <> PC -> rs'#r' = rs#r'.
+ /\ forall r': preg, r' <> r1 -> if_preg r' = true -> rs'#r' = rs#r'.
Proof.
intros. unfold rsubimm.
(* rsb - add* *)
@@ -329,7 +456,7 @@ Lemma orimm_correct:
exists rs',
exec_straight ge fn (orimm r1 r2 n k) rs m k rs' m
/\ rs'#r1 = Val.or rs#r2 (Vint n)
- /\ forall r': preg, r' <> r1 -> r' <> PC -> rs'#r' = rs#r'.
+ /\ forall r': preg, r' <> r1 -> if_preg r' = true -> rs'#r' = rs#r'.
Proof.
intros. unfold orimm.
(* ori - ori* *)
@@ -348,7 +475,7 @@ Lemma xorimm_correct:
exists rs',
exec_straight ge fn (xorimm r1 r2 n k) rs m k rs' m
/\ rs'#r1 = Val.xor rs#r2 (Vint n)
- /\ forall r': preg, r' <> r1 -> r' <> PC -> rs'#r' = rs#r'.
+ /\ forall r': preg, r' <> r1 -> if_preg r' = true -> rs'#r' = rs#r'.
Proof.
intros. unfold xorimm.
(* xori - xori* *)
@@ -367,7 +494,7 @@ Lemma indexed_memory_access_correct:
(mk_immed: int -> int) (base: ireg) n k (rs: regset) m m',
(forall (r1: ireg) (rs1: regset) n1 k,
Val.add rs1#r1 (Vint n1) = Val.add rs#base (Vint n) ->
- (forall (r: preg), r <> PC -> r <> IR14 -> rs1 r = rs r) ->
+ (forall (r: preg), if_preg r = true -> r <> IR14 -> rs1 r = rs r) ->
exists rs',
exec_straight ge fn (mk_instr r1 n1 :: k) rs1 m k rs' m' /\ P rs') ->
exists rs',
@@ -397,12 +524,12 @@ Lemma loadind_int_correct:
exists rs',
exec_straight ge fn (loadind_int base ofs dst k) rs m k rs' m
/\ rs'#dst = v
- /\ forall r, r <> PC -> r <> IR14 -> r <> dst -> rs'#r = rs#r.
+ /\ forall r, if_preg r = true -> r <> IR14 -> r <> dst -> rs'#r = rs#r.
Proof.
intros; unfold loadind_int. apply indexed_memory_access_correct; intros.
econstructor; split.
apply exec_straight_one. simpl. unfold exec_load. rewrite H0; rewrite H; eauto. auto.
- split. Simpl. intros; Simpl.
+ split; intros; Simpl.
Qed.
Lemma loadind_correct:
@@ -412,7 +539,7 @@ Lemma loadind_correct:
exists rs',
exec_straight ge fn c rs m k rs' m
/\ rs'#(preg_of dst) = v
- /\ forall r, r <> PC -> r <> IR14 -> r <> preg_of dst -> rs'#r = rs#r.
+ /\ forall r, if_preg r = true -> r <> IR14 -> r <> preg_of dst -> rs'#r = rs#r.
Proof.
unfold loadind; intros. destruct ty; destruct (preg_of dst); inv H; simpl in H0.
- (* int *)
@@ -421,22 +548,22 @@ Proof.
apply indexed_memory_access_correct; intros.
econstructor; split.
apply exec_straight_one. simpl. unfold exec_load. rewrite H. rewrite H0. eauto. auto.
- split. Simpl. intros; Simpl.
+ split; intros; Simpl.
- (* single *)
apply indexed_memory_access_correct; intros.
econstructor; split.
apply exec_straight_one. simpl. unfold exec_load. rewrite H. rewrite H0. eauto. auto.
- split. Simpl. intros; Simpl.
+ split; intros; Simpl.
- (* any32 *)
apply indexed_memory_access_correct; intros.
econstructor; split.
apply exec_straight_one. simpl. unfold exec_load. rewrite H. rewrite H0. eauto. auto.
- split. Simpl. intros; Simpl.
+ split; intros; Simpl.
- (* any64 *)
apply indexed_memory_access_correct; intros.
econstructor; split.
apply exec_straight_one. simpl. unfold exec_load. rewrite H. rewrite H0. eauto. auto.
- split. Simpl. intros; Simpl.
+ split; intros; Simpl.
Qed.
(** Indexed memory stores. *)
@@ -447,10 +574,10 @@ Lemma storeind_correct:
Mem.storev (chunk_of_type ty) m (Val.add rs#base (Vint ofs)) (rs#(preg_of src)) = Some m' ->
exists rs',
exec_straight ge fn c rs m k rs' m'
- /\ forall r, r <> PC -> r <> IR14 -> preg_notin r (destroyed_by_setstack ty) -> rs'#r = rs#r.
+ /\ forall r, if_preg r = true -> r <> IR14 -> rs'#r = rs#r.
Proof.
unfold storeind; intros.
- assert (NOT14: preg_of src <> IR IR14) by eauto with asmgen.
+ assert (DATA: data_preg (preg_of src) = true) by eauto with asmgen.
destruct ty; destruct (preg_of src); inv H; simpl in H0.
- (* int *)
apply indexed_memory_access_correct; intros.
@@ -493,13 +620,6 @@ Proof.
intros. destruct s; simpl; auto.
Qed.
-Lemma transl_shift_addr_correct:
- forall s (r: ireg) (rs: regset),
- eval_shift_addr (transl_shift_addr s r) rs = eval_shift s (rs#r).
-Proof.
- intros. destruct s; simpl; auto.
-Qed.
-
(** Translation of conditions *)
Lemma compare_int_spec:
@@ -871,6 +991,7 @@ Lemma transl_cond_correct:
exec_straight ge fn c rs m k rs' m
/\ match eval_condition cond (map rs (map preg_of args)) m with
| Some b => eval_testcond (cond_for_cond cond) rs' = Some b
+ /\ eval_testcond (cond_for_cond (negate_condition cond)) rs' = Some (negb b)
| None => True
end
/\ forall r, data_preg r = true -> rs'#r = rs r.
@@ -880,57 +1001,66 @@ Proof.
- (* Ccomp *)
econstructor.
split. apply exec_straight_one. simpl. eauto. auto.
- split. destruct (Val.cmp_bool c0 (rs x) (rs x0)) eqn:CMP; auto. apply cond_for_signed_cmp_correct; auto.
+ split. destruct (Val.cmp_bool c0 (rs x) (rs x0)) eqn:CMP; auto.
+ split; apply cond_for_signed_cmp_correct; auto. rewrite Val.negate_cmp_bool, CMP; auto.
apply compare_int_inv.
- (* Ccompu *)
econstructor.
split. apply exec_straight_one. simpl. eauto. auto.
- split. destruct (Val.cmpu_bool (Mem.valid_pointer m) c0 (rs x) (rs x0)) eqn:CMP; auto. apply cond_for_unsigned_cmp_correct; auto.
+ split. destruct (Val.cmpu_bool (Mem.valid_pointer m) c0 (rs x) (rs x0)) eqn:CMP; auto.
+ split; apply cond_for_unsigned_cmp_correct; auto. rewrite Val.negate_cmpu_bool, CMP; auto.
apply compare_int_inv.
- (* Ccompshift *)
econstructor.
split. apply exec_straight_one. simpl. eauto. auto.
split. rewrite transl_shift_correct.
- destruct (Val.cmp_bool c0 (rs x) (eval_shift s (rs x0))) eqn:CMP; auto. apply cond_for_signed_cmp_correct; auto.
+ destruct (Val.cmp_bool c0 (rs x) (eval_shift s (rs x0))) eqn:CMP; auto.
+ split; apply cond_for_signed_cmp_correct; auto. rewrite Val.negate_cmp_bool, CMP; auto.
apply compare_int_inv.
- (* Ccompushift *)
econstructor.
split. apply exec_straight_one. simpl. eauto. auto.
split. rewrite transl_shift_correct.
- destruct (Val.cmpu_bool (Mem.valid_pointer m) c0 (rs x) (eval_shift s (rs x0))) eqn:CMP; auto. apply cond_for_unsigned_cmp_correct; auto.
+ destruct (Val.cmpu_bool (Mem.valid_pointer m) c0 (rs x) (eval_shift s (rs x0))) eqn:CMP; auto.
+ split; apply cond_for_unsigned_cmp_correct; auto. rewrite Val.negate_cmpu_bool, CMP; auto.
apply compare_int_inv.
- (* Ccompimm *)
destruct (is_immed_arith i).
econstructor.
split. apply exec_straight_one. simpl. eauto. auto.
- split. destruct (Val.cmp_bool c0 (rs x) (Vint i)) eqn:CMP; auto. apply cond_for_signed_cmp_correct; auto.
+ split. destruct (Val.cmp_bool c0 (rs x) (Vint i)) eqn:CMP; auto.
+ split; apply cond_for_signed_cmp_correct; auto. rewrite Val.negate_cmp_bool, CMP; auto.
apply compare_int_inv.
exploit (loadimm_correct IR14). intros [rs' [P [Q R]]].
econstructor.
split. eapply exec_straight_trans. eexact P. apply exec_straight_one. simpl.
rewrite Q. rewrite R; eauto with asmgen. auto.
split. rewrite <- R by (eauto with asmgen).
- destruct (Val.cmp_bool c0 (rs' x) (Vint i)) eqn:CMP; auto. apply cond_for_signed_cmp_correct; auto.
+ destruct (Val.cmp_bool c0 (rs' x) (Vint i)) eqn:CMP; auto.
+ split; apply cond_for_signed_cmp_correct; auto. rewrite Val.negate_cmp_bool, CMP; auto.
intros. rewrite compare_int_inv by auto. auto with asmgen.
- (* Ccompuimm *)
destruct (is_immed_arith i).
econstructor.
split. apply exec_straight_one. simpl. eauto. auto.
- split. destruct (Val.cmpu_bool (Mem.valid_pointer m) c0 (rs x) (Vint i)) eqn:CMP; auto. apply cond_for_unsigned_cmp_correct; auto.
+ split. destruct (Val.cmpu_bool (Mem.valid_pointer m) c0 (rs x) (Vint i)) eqn:CMP; auto.
+ split; apply cond_for_unsigned_cmp_correct; auto. rewrite Val.negate_cmpu_bool, CMP; auto.
apply compare_int_inv.
exploit (loadimm_correct IR14). intros [rs' [P [Q R]]].
econstructor.
split. eapply exec_straight_trans. eexact P. apply exec_straight_one. simpl.
rewrite Q. rewrite R; eauto with asmgen. auto.
split. rewrite <- R by (eauto with asmgen).
- destruct (Val.cmpu_bool (Mem.valid_pointer m) c0 (rs' x) (Vint i)) eqn:CMP; auto. apply cond_for_unsigned_cmp_correct; auto.
+ destruct (Val.cmpu_bool (Mem.valid_pointer m) c0 (rs' x) (Vint i)) eqn:CMP; auto.
+ split; apply cond_for_unsigned_cmp_correct; auto. rewrite Val.negate_cmpu_bool, CMP; auto.
intros. rewrite compare_int_inv by auto. auto with asmgen.
- (* Ccompf *)
econstructor.
split. apply exec_straight_one. simpl. eauto. apply compare_float_nextpc.
split. destruct (Val.cmpf_bool c0 (rs x) (rs x0)) eqn:CMP; auto.
destruct (rs x); try discriminate. destruct (rs x0); try discriminate.
- simpl in CMP. inv CMP. apply cond_for_float_cmp_correct.
+ simpl in CMP. inv CMP.
+ split. apply cond_for_float_cmp_correct. apply cond_for_float_not_cmp_correct.
apply compare_float_inv.
- (* Cnotcompf *)
econstructor.
@@ -938,7 +1068,8 @@ Proof.
split. destruct (Val.cmpf_bool c0 (rs x) (rs x0)) eqn:CMP; auto.
destruct (rs x); try discriminate. destruct (rs x0); try discriminate.
simpl in CMP. inv CMP.
-Local Opaque compare_float. simpl. apply cond_for_float_not_cmp_correct.
+Local Opaque compare_float. simpl.
+ split. apply cond_for_float_not_cmp_correct. rewrite negb_involutive. apply cond_for_float_cmp_correct.
exact I.
apply compare_float_inv.
- (* Ccompfzero *)
@@ -946,14 +1077,16 @@ Local Opaque compare_float. simpl. apply cond_for_float_not_cmp_correct.
split. apply exec_straight_one. simpl. eauto. apply compare_float_nextpc.
split. destruct (Val.cmpf_bool c0 (rs x) (Vfloat Float.zero)) eqn:CMP; auto.
destruct (rs x); try discriminate.
- simpl in CMP. inv CMP. apply cond_for_float_cmp_correct.
+ simpl in CMP. inv CMP.
+ split. apply cond_for_float_cmp_correct. apply cond_for_float_not_cmp_correct.
apply compare_float_inv.
- (* Cnotcompfzero *)
econstructor.
split. apply exec_straight_one. simpl. eauto. apply compare_float_nextpc.
split. destruct (Val.cmpf_bool c0 (rs x) (Vfloat Float.zero)) eqn:CMP; auto.
destruct (rs x); try discriminate. simpl in CMP. inv CMP.
-Local Opaque compare_float. simpl. apply cond_for_float_not_cmp_correct.
+Local Opaque compare_float. simpl.
+ split. apply cond_for_float_not_cmp_correct. rewrite negb_involutive. apply cond_for_float_cmp_correct.
exact I.
apply compare_float_inv.
- (* Ccompfs *)
@@ -961,7 +1094,8 @@ Local Opaque compare_float. simpl. apply cond_for_float_not_cmp_correct.
split. apply exec_straight_one. simpl. eauto. apply compare_float32_nextpc.
split. destruct (Val.cmpfs_bool c0 (rs x) (rs x0)) eqn:CMP; auto.
destruct (rs x); try discriminate. destruct (rs x0); try discriminate.
- simpl in CMP. inv CMP. apply cond_for_float32_cmp_correct.
+ simpl in CMP. inv CMP.
+ split. apply cond_for_float32_cmp_correct. apply cond_for_float32_not_cmp_correct.
apply compare_float32_inv.
- (* Cnotcompfs *)
econstructor.
@@ -969,7 +1103,8 @@ Local Opaque compare_float. simpl. apply cond_for_float_not_cmp_correct.
split. destruct (Val.cmpfs_bool c0 (rs x) (rs x0)) eqn:CMP; auto.
destruct (rs x); try discriminate. destruct (rs x0); try discriminate.
simpl in CMP. inv CMP.
-Local Opaque compare_float32. simpl. apply cond_for_float32_not_cmp_correct.
+Local Opaque compare_float32. simpl.
+ split. apply cond_for_float32_not_cmp_correct. rewrite negb_involutive. apply cond_for_float32_cmp_correct.
exact I.
apply compare_float32_inv.
- (* Ccompfszero *)
@@ -977,14 +1112,15 @@ Local Opaque compare_float32. simpl. apply cond_for_float32_not_cmp_correct.
split. apply exec_straight_one. simpl. eauto. apply compare_float32_nextpc.
split. destruct (Val.cmpfs_bool c0 (rs x) (Vsingle Float32.zero)) eqn:CMP; auto.
destruct (rs x); try discriminate.
- simpl in CMP. inv CMP. apply cond_for_float32_cmp_correct.
+ simpl in CMP. inv CMP.
+ split. apply cond_for_float32_cmp_correct. apply cond_for_float32_not_cmp_correct.
apply compare_float32_inv.
- (* Cnotcompfzero *)
econstructor.
split. apply exec_straight_one. simpl. eauto. apply compare_float32_nextpc.
split. destruct (Val.cmpfs_bool c0 (rs x) (Vsingle Float32.zero)) eqn:CMP; auto.
destruct (rs x); try discriminate. simpl in CMP. inv CMP.
- simpl. apply cond_for_float32_not_cmp_correct.
+ simpl. split. apply cond_for_float32_not_cmp_correct. rewrite negb_involutive. apply cond_for_float32_cmp_correct.
exact I.
apply compare_float32_inv.
Qed.
@@ -1009,13 +1145,10 @@ Proof.
intros until v; intros TR EV NOCMP.
unfold transl_op in TR; destruct op; ArgsInv; simpl in EV; inv EV; try (TranslOpSimpl; fail).
(* Omove *)
- exists (nextinstr (rs#(preg_of res) <- (rs#(preg_of m0)))).
- split.
destruct (preg_of res) eqn:RES; try discriminate;
destruct (preg_of m0) eqn:ARG; inv TR.
- apply exec_straight_one; auto.
- apply exec_straight_one; auto.
- intuition Simpl.
+ econstructor; split. apply exec_straight_one; simpl; eauto. intuition Simpl.
+ econstructor; split. apply exec_straight_one; simpl; eauto. intuition Simpl.
(* Ointconst *)
generalize (loadimm_correct x i k rs m). intros [rs' [A [B C]]].
exists rs'; auto with asmgen.
@@ -1024,8 +1157,11 @@ Proof.
intros [rs' [EX [RES OTH]]].
exists rs'; auto with asmgen.
(* Ocast8signed *)
- set (rs1 := nextinstr (rs#x <- (Val.shl rs#x0 (Vint (Int.repr 24))))).
- set (rs2 := nextinstr (rs1#x <- (Val.shr rs1#x (Vint (Int.repr 24))))).
+ destruct (thumb tt).
+ econstructor; split. apply exec_straight_one; simpl; eauto. intuition Simpl.
+ destruct (rs x0); auto; simpl. rewrite Int.shru_zero. reflexivity.
+ set (rs1 := nextinstr_nf (rs#x <- (Val.shl rs#x0 (Vint (Int.repr 24))))).
+ set (rs2 := nextinstr_nf (rs1#x <- (Val.shr rs1#x (Vint (Int.repr 24))))).
exists rs2.
split. apply exec_straight_two with rs1 m; auto.
split. unfold rs2; Simpl. unfold rs1; Simpl.
@@ -1034,8 +1170,11 @@ Proof.
f_equal. symmetry. apply (Int.sign_ext_shr_shl 8). compute; auto.
intros. unfold rs2, rs1; Simpl.
(* Ocast16signed *)
- set (rs1 := nextinstr (rs#x <- (Val.shl rs#x0 (Vint (Int.repr 16))))).
- set (rs2 := nextinstr (rs1#x <- (Val.shr rs1#x (Vint (Int.repr 16))))).
+ destruct (thumb tt).
+ econstructor; split. apply exec_straight_one; simpl; eauto. intuition Simpl.
+ destruct (rs x0); auto; simpl. rewrite Int.shru_zero. reflexivity.
+ set (rs1 := nextinstr_nf (rs#x <- (Val.shl rs#x0 (Vint (Int.repr 16))))).
+ set (rs2 := nextinstr_nf (rs1#x <- (Val.shr rs1#x (Vint (Int.repr 16))))).
exists rs2.
split. apply exec_straight_two with rs1 m; auto.
split. unfold rs2; Simpl. unfold rs1; Simpl.
@@ -1051,22 +1190,6 @@ Proof.
generalize (rsubimm_correct x x0 i k rs m).
intros [rs' [A [B C]]].
exists rs'; auto with asmgen.
- (* Omul *)
- destruct (negb (ireg_eq x x0)).
- TranslOpSimpl.
- destruct (negb (ireg_eq x x1)).
- rewrite Val.mul_commut. TranslOpSimpl.
- econstructor; split.
- eapply exec_straight_two. simpl; eauto. simpl; eauto. auto. auto.
- intuition Simpl.
- (* Omla *)
- destruct (negb (ireg_eq x x0)).
- TranslOpSimpl.
- destruct (negb (ireg_eq x x1)).
- rewrite Val.mul_commut. TranslOpSimpl.
- econstructor; split.
- eapply exec_straight_two. simpl; eauto. simpl; eauto. auto. auto.
- intuition Simpl.
(* divs *)
Local Transparent destroyed_by_op.
econstructor. split. apply exec_straight_one. simpl. rewrite H0. reflexivity. auto.
@@ -1117,9 +1240,9 @@ Local Transparent destroyed_by_op.
omega.
}
set (j := Int.sub Int.iwordsize i) in *.
- set (rs1 := nextinstr (rs#IR14 <- (Val.shr (Vint i0) (Vint (Int.repr 31))))).
- set (rs2 := nextinstr (rs1#IR14 <- (Val.add (Vint i0) (Val.shru rs1#IR14 (Vint j))))).
- set (rs3 := nextinstr (rs2#x <- (Val.shr rs2#IR14 (Vint i)))).
+ set (rs1 := nextinstr_nf (rs#IR14 <- (Val.shr (Vint i0) (Vint (Int.repr 31))))).
+ set (rs2 := nextinstr_nf (rs1#IR14 <- (Val.add (Vint i0) (Val.shru rs1#IR14 (Vint j))))).
+ set (rs3 := nextinstr_nf (rs2#x <- (Val.shr rs2#IR14 (Vint i)))).
exists rs3; split.
apply exec_straight_three with rs1 m rs2 m.
simpl. rewrite X0; reflexivity.
@@ -1180,23 +1303,11 @@ Proof.
subst op. simpl in H. monadInv H. simpl in H0. inv H0.
rewrite (ireg_of_eq _ _ EQ).
exploit transl_cond_correct; eauto. instantiate (1 := rs). instantiate (1 := m). intros [rs1 [A [B C]]].
- set (rs2 := nextinstr (rs1#x <- (Vint Int.zero))).
- set (rs3 := nextinstr (match eval_testcond (cond_for_cond cmp) rs2 with
- | Some true => rs2 # x <- (Vint Int.one)
- | Some false => rs2
- | None => rs2 # x <- Vundef end)).
- exists rs3; split.
- eapply exec_straight_trans. eexact A. apply exec_straight_two with rs2 m.
- auto.
- simpl. unfold rs3. destruct (eval_testcond (cond_for_cond cmp) rs2) as [[]|]; auto.
- auto. unfold rs3. destruct (eval_testcond (cond_for_cond cmp) rs2) as [[]|]; auto.
- split. destruct (eval_condition cmp rs ## (preg_of ## args) m) as [b|]; simpl; auto.
- unfold rs3.
- change (eval_testcond (cond_for_cond cmp) rs2) with (eval_testcond (cond_for_cond cmp) rs1). rewrite B.
- Simpl. destruct b. Simpl. unfold rs2. Simpl.
- intros. transitivity (rs2 r).
- unfold rs3. Simpl. destruct (eval_testcond (cond_for_cond cmp) rs2) as [[]|]; Simpl; auto.
- unfold rs2. Simpl.
+ econstructor; split.
+ eapply exec_straight_trans. eexact A. apply exec_straight_one. simpl; eauto. auto.
+ split; intros; Simpl.
+ destruct (eval_condition cmp rs ## (preg_of ## args) m) as [b|]; simpl; auto.
+ destruct B as [B1 B2]; rewrite B1. destruct b; auto.
Qed.
(** Translation of loads and stores. *)
@@ -1209,21 +1320,21 @@ Qed.
Lemma transl_memory_access_correct:
forall (P: regset -> Prop) (mk_instr_imm: ireg -> int -> instruction)
- (mk_instr_gen: option (ireg -> shift_addr -> instruction))
+ (mk_instr_gen: option (ireg -> shift_op -> instruction))
(mk_immed: int -> int)
addr args k c (rs: regset) a m m',
transl_memory_access mk_instr_imm mk_instr_gen mk_immed addr args k = OK c ->
eval_addressing ge (rs#SP) addr (map rs (map preg_of args)) = Some a ->
(forall (r1: ireg) (rs1: regset) n k,
Val.add rs1#r1 (Vint n) = a ->
- (forall (r: preg), r <> PC -> r <> IR14 -> rs1 r = rs r) ->
+ (forall (r: preg), if_preg r = true -> r <> IR14 -> rs1 r = rs r) ->
exists rs',
exec_straight ge fn (mk_instr_imm r1 n :: k) rs1 m k rs' m' /\ P rs') ->
match mk_instr_gen with
| None => True
| Some mk =>
- (forall (r1: ireg) (sa: shift_addr) k,
- Val.add rs#r1 (eval_shift_addr sa rs) = a ->
+ (forall (r1: ireg) (sa: shift_op) k,
+ Val.add rs#r1 (eval_shift_op sa rs) = a ->
exists rs',
exec_straight ge fn (mk r1 sa :: k) rs m k rs' m' /\ P rs')
end ->
@@ -1239,7 +1350,7 @@ Proof.
simpl. erewrite ! ireg_of_eq; eauto.
(* Aindexed2shift *)
destruct mk_instr_gen as [mk | ]; monadInv TR. apply MK2.
- erewrite ! ireg_of_eq; eauto. rewrite transl_shift_addr_correct. auto.
+ erewrite ! ireg_of_eq; eauto. rewrite transl_shift_correct. auto.
(* Ainstack *)
inv TR. apply indexed_memory_access_correct. exact MK1.
Qed.
@@ -1249,9 +1360,9 @@ Lemma transl_load_int_correct:
transl_memory_access_int mk_instr is_immed dst addr args k = OK c ->
eval_addressing ge (rs#SP) addr (map rs (map preg_of args)) = Some a ->
Mem.loadv chunk m a = Some v ->
- (forall (r1 r2: ireg) (sa: shift_addr) (rs1: regset),
+ (forall (r1 r2: ireg) (sa: shift_op) (rs1: regset),
exec_instr ge fn (mk_instr r1 r2 sa) rs1 m =
- exec_load chunk (Val.add rs1#r2 (eval_shift_addr sa rs1)) r1 rs1 m) ->
+ exec_load chunk (Val.add rs1#r2 (eval_shift_op sa rs1)) r1 rs1 m) ->
exists rs',
exec_straight ge fn c rs m k rs' m
/\ rs'#(preg_of dst) = v
@@ -1260,7 +1371,7 @@ Proof.
intros. monadInv H. erewrite ireg_of_eq by eauto.
eapply transl_memory_access_correct; eauto.
intros; simpl. econstructor; split. apply exec_straight_one.
- rewrite H2. unfold exec_load. simpl eval_shift_addr. rewrite H. rewrite H1. eauto. auto.
+ rewrite H2. unfold exec_load. simpl eval_shift_op. rewrite H. rewrite H1. eauto. auto.
split. Simpl. intros; Simpl.
simpl; intros.
econstructor; split. apply exec_straight_one.
@@ -1294,17 +1405,18 @@ Lemma transl_store_int_correct:
transl_memory_access_int mk_instr is_immed src addr args k = OK c ->
eval_addressing ge (rs#SP) addr (map rs (map preg_of args)) = Some a ->
Mem.storev chunk m a rs#(preg_of src) = Some m' ->
- (forall (r1 r2: ireg) (sa: shift_addr) (rs1: regset),
+ (forall (r1 r2: ireg) (sa: shift_op) (rs1: regset),
exec_instr ge fn (mk_instr r1 r2 sa) rs1 m =
- exec_store chunk (Val.add rs1#r2 (eval_shift_addr sa rs1)) r1 rs1 m) ->
+ exec_store chunk (Val.add rs1#r2 (eval_shift_op sa rs1)) r1 rs1 m) ->
exists rs',
exec_straight ge fn c rs m k rs' m'
/\ forall r, data_preg r = true -> preg_notin r mr -> rs'#r = rs#r.
Proof.
- intros. monadInv H. erewrite ireg_of_eq in * by eauto.
+ intros. assert (DR: data_preg (preg_of src) = true) by eauto with asmgen.
+ monadInv H. erewrite ireg_of_eq in * by eauto.
eapply transl_memory_access_correct; eauto.
intros; simpl. econstructor; split. apply exec_straight_one.
- rewrite H2. unfold exec_store. simpl eval_shift_addr. rewrite H. rewrite H3; eauto with asmgen.
+ rewrite H2. unfold exec_store. simpl eval_shift_op. rewrite H. rewrite H3; eauto with asmgen.
rewrite H1. eauto. auto.
intros; Simpl.
simpl; intros.
@@ -1325,7 +1437,8 @@ Lemma transl_store_float_correct:
exec_straight ge fn c rs m k rs' m'
/\ forall r, data_preg r = true -> preg_notin r mr -> rs'#r = rs#r.
Proof.
- intros. monadInv H. erewrite freg_of_eq in * by eauto.
+ intros. assert (DR: data_preg (preg_of src) = true) by eauto with asmgen.
+ monadInv H. erewrite freg_of_eq in * by eauto.
eapply transl_memory_access_correct; eauto.
intros; simpl. econstructor; split. apply exec_straight_one.
rewrite H2. unfold exec_store. rewrite H. rewrite H3; auto with asmgen. rewrite H1. eauto. auto.
diff --git a/arm/PrintAsm.ml b/arm/PrintAsm.ml
index 2ad9114..bb738a1 100644
--- a/arm/PrintAsm.ml
+++ b/arm/PrintAsm.ml
@@ -20,6 +20,11 @@ open AST
open Memdata
open Asm
+(* Code generation options. To be turned into proper options. *)
+
+let vfpv3 = ref true
+let literals_in_code = ref true
+
(* On-the-fly label renaming *)
let next_label = ref 100
@@ -102,35 +107,41 @@ let condition_name = function
| TCgt -> "gt"
| TCle -> "le"
-let movimm oc dst n =
- match Asmgen.decompose_int n with
- | [] -> assert false
- | hd::tl as l ->
- fprintf oc " mov %s, #%a\n" dst coqint hd;
- List.iter
- (fun n -> fprintf oc " orr %s, %s, #%a\n" dst dst coqint n)
- tl;
- List.length l
-
-let addimm oc dst src n =
- match Asmgen.decompose_int n with
- | [] -> assert false
- | hd::tl as l ->
- fprintf oc " add %s, %s, #%a\n" dst src coqint hd;
- List.iter
- (fun n -> fprintf oc " add %s, %s, #%a\n" dst dst coqint n)
- tl;
- List.length l
-
-let subimm oc dst src n =
- match Asmgen.decompose_int n with
- | [] -> assert false
- | hd::tl as l ->
- fprintf oc " sub %s, %s, #%a\n" dst src coqint hd;
- List.iter
- (fun n -> fprintf oc " sub %s, %s, #%a\n" dst dst coqint n)
- tl;
- List.length l
+let neg_condition_name = function
+ | TCeq -> "ne"
+ | TCne -> "eq"
+ | TChs -> "lo"
+ | TClo -> "hs"
+ | TCmi -> "pl"
+ | TCpl -> "mi"
+ | TChi -> "ls"
+ | TCls -> "hi"
+ | TCge -> "lt"
+ | TClt -> "ge"
+ | TCgt -> "le"
+ | TCle -> "gt"
+
+(* In Thumb2 mode, some arithmetic instructions have shorter encodings
+ if they carry the "S" flag (update condition flags):
+ add (but not sp + imm)
+ and
+ asr
+ bic
+ eor
+ lsl
+ lsr
+ mov
+ mvn
+ orr
+ rsb
+ sub (but not sp - imm)
+ The proof of Asmgen shows that CompCert-generated code behaves the
+ same whether flags are updated or not by those instructions. The
+ following printing function adds a "S" suffix if we are in Thumb2
+ mode. *)
+
+let thumbS oc =
+ if !Clflags.option_fthumb then output_char oc 's'
(* Names of sections *)
@@ -156,15 +167,16 @@ let size_constants = ref 0
let max_pos_constants = ref max_int
let distance_to_emit_constants () =
- !max_pos_constants - (!currpos + !size_constants)
+ if !literals_in_code
+ then !max_pos_constants - (!currpos + !size_constants)
+ else max_int
(* Associate labels to floating-point constants and to symbols *)
let float_labels = (Hashtbl.create 39 : (int64, int) Hashtbl.t)
let float32_labels = (Hashtbl.create 39 : (int32, int) Hashtbl.t)
-let label_float f =
- let bf = camlint64_of_coqint(Floats.Float.to_bits f) in
+let label_float bf =
try
Hashtbl.find float_labels bf
with Not_found ->
@@ -174,8 +186,7 @@ let label_float f =
max_pos_constants := min !max_pos_constants (!currpos + 1024);
lbl'
-let label_float32 f =
- let bf = camlint_of_coqint(Floats.Float32.to_bits f) in
+let label_float32 bf =
try
Hashtbl.find float32_labels bf
with Not_found ->
@@ -206,6 +217,7 @@ let reset_constants () =
max_pos_constants := max_int
let emit_constants oc =
+ fprintf oc " .balign 4\n";
Hashtbl.iter
(fun bf lbl ->
(* Little-endian floats *)
@@ -224,6 +236,67 @@ let emit_constants oc =
symbol_labels;
reset_constants ()
+(* Generate code to load the address of id + ofs in register r *)
+
+let loadsymbol oc r id ofs =
+ if !Clflags.option_fthumb then begin
+ fprintf oc " movw %a, #:lower16:%a\n"
+ ireg r print_symb_ofs (id, ofs);
+ fprintf oc " movt %a, #:upper16:%a\n"
+ ireg r print_symb_ofs (id, ofs); 2
+ end else begin
+ let lbl = label_symbol id ofs in
+ fprintf oc " ldr %a, .L%d @ %a\n"
+ ireg r lbl print_symb_ofs (id, ofs); 1
+ end
+
+(* Emit instruction sequences that set or offset a register by a constant. *)
+(* No S suffix because they are applied to SP most of the time. *)
+
+let movimm oc dst n =
+ match Asmgen.decompose_int n with
+ | [] -> assert false
+ | hd::tl as l ->
+ fprintf oc " mov %s, #%a\n" dst coqint hd;
+ List.iter
+ (fun n -> fprintf oc " orr %s, %s, #%a\n" dst dst coqint n)
+ tl;
+ List.length l
+
+let addimm oc dst src n =
+ match Asmgen.decompose_int n with
+ | [] -> assert false
+ | hd::tl as l ->
+ fprintf oc " add %s, %s, #%a\n" dst src coqint hd;
+ List.iter
+ (fun n -> fprintf oc " add %s, %s, #%a\n" dst dst coqint n)
+ tl;
+ List.length l
+
+let subimm oc dst src n =
+ match Asmgen.decompose_int n with
+ | [] -> assert false
+ | hd::tl as l ->
+ fprintf oc " sub %s, %s, #%a\n" dst src coqint hd;
+ List.iter
+ (fun n -> fprintf oc " sub %s, %s, #%a\n" dst dst coqint n)
+ tl;
+ List.length l
+
+(* Recognition of float constants appropriate for VMOV.
+ "a normalized binary floating point encoding with 1 sign bit, 4
+ bits of fraction and a 3-bit exponent" *)
+
+let is_immediate_float64 bits =
+ let exp = (Int64.(to_int (shift_right_logical bits 52)) land 0x7FF) - 1023 in
+ let mant = Int64.logand bits 0xF_FFFF_FFFF_FFFFL in
+ exp >= -3 && exp <= 4 && Int64.logand mant 0xF_0000_0000_0000L = mant
+
+let is_immediate_float32 bits =
+ let exp = (Int32.(to_int (shift_right_logical bits 23)) land 0xFF) - 127 in
+ let mant = Int32.logand bits 0x7F_FFFFl in
+ exp >= -3 && exp <= 4 && Int32.logand mant 0x78_0000l = mant
+
(* Emit .file / .loc debugging directives *)
let filename_num : (string, int) Hashtbl.t = Hashtbl.create 7
@@ -267,7 +340,7 @@ let cfi_rel_offset oc reg ofs =
- annotation statements: take their arguments in registers or stack
locations; generate no code;
- inlined by the compiler: take their arguments in arbitrary
- registers; preserve all registers except IR2, IR3, IR12 and FP6.
+ registers.
*)
(* Handling of annotations *)
@@ -365,9 +438,9 @@ let print_builtin_vload_common oc chunk args res =
fprintf oc " ldr %a, [%a, #0]\n" ireg res2 ireg addr
end; 2
| Mfloat32, [IR addr], [FR res] ->
- fprintf oc " flds %a, [%a, #0]\n" freg_single res ireg addr; 1
+ fprintf oc " vldr %a, [%a, #0]\n" freg_single res ireg addr; 1
| Mfloat64, [IR addr], [FR res] ->
- fprintf oc " fldd %a, [%a, #0]\n" freg res ireg addr; 1
+ fprintf oc " vldr %a, [%a, #0]\n" freg res ireg addr; 1
| _ ->
assert false
@@ -378,10 +451,9 @@ let print_builtin_vload oc chunk args res =
let print_builtin_vload_global oc chunk id ofs args res =
fprintf oc "%s begin builtin __builtin_volatile_read\n" comment;
- let lbl = label_symbol id ofs in
- fprintf oc " ldr %a, .L%d @ %a\n" ireg IR14 lbl print_symb_ofs (id, ofs);
- let n = print_builtin_vload_common oc chunk [IR IR14] res in
- fprintf oc "%s end builtin __builtin_volatile_read\n" comment; n + 1
+ let n1 = loadsymbol oc IR14 id ofs in
+ let n2 = print_builtin_vload_common oc chunk [IR IR14] res in
+ fprintf oc "%s end builtin __builtin_volatile_read\n" comment; n1 + n2
let print_builtin_vstore_common oc chunk args =
match chunk, args with
@@ -395,9 +467,9 @@ let print_builtin_vstore_common oc chunk args =
fprintf oc " str %a, [%a, #0]\n" ireg src2 ireg addr;
fprintf oc " str %a, [%a, #4]\n" ireg src1 ireg addr; 2
| Mfloat32, [IR addr; FR src] ->
- fprintf oc " fsts %a, [%a, #0]\n" freg_single src ireg addr; 1
+ fprintf oc " vstr %a, [%a, #0]\n" freg_single src ireg addr; 1
| Mfloat64, [IR addr; FR src] ->
- fprintf oc " fstd %a, [%a, #0]\n" freg src ireg addr; 1
+ fprintf oc " vstr %a, [%a, #0]\n" freg src ireg addr; 1
| _ ->
assert false
@@ -408,10 +480,9 @@ let print_builtin_vstore oc chunk args =
let print_builtin_vstore_global oc chunk id ofs args =
fprintf oc "%s begin builtin __builtin_volatile_write\n" comment;
- let lbl = label_symbol id ofs in
- fprintf oc " ldr %a, .L%d @ %a\n" ireg IR14 lbl print_symb_ofs (id, ofs);
- let n = print_builtin_vstore_common oc chunk (IR IR14 :: args) in
- fprintf oc "%s end builtin __builtin_volatile_write\n" comment; n + 1
+ let n1 = loadsymbol oc IR14 id ofs in
+ let n2 = print_builtin_vstore_common oc chunk (IR IR14 :: args) in
+ fprintf oc "%s end builtin __builtin_volatile_write\n" comment; n1 + n2
(* Handling of varargs *)
@@ -458,24 +529,30 @@ let print_builtin_inline oc name args res =
fprintf oc " clz %a, %a\n" ireg res ireg a1; 1
(* Float arithmetic *)
| "__builtin_fabs", [FR a1], [FR res] ->
- fprintf oc " fabsd %a, %a\n" freg res freg a1; 1
+ fprintf oc " vabs.f64 %a, %a\n" freg res freg a1; 1
| "__builtin_fsqrt", [FR a1], [FR res] ->
- fprintf oc " fsqrtd %a, %a\n" freg res freg a1; 1
+ fprintf oc " vsqrt.f64 %a, %a\n" freg res freg a1; 1
(* 64-bit integer arithmetic *)
| "__builtin_negl", [IR ah; IR al], [IR rh; IR rl] ->
- if rl = ah then begin
- fprintf oc " rsbs %a, %a, #0\n" ireg IR14 ireg al;
- fprintf oc " rsc %a, %a, #0\n" ireg rh ireg ah;
- fprintf oc " mov %a, %a\n" ireg rl ireg IR14; 3
- end else begin
- fprintf oc " rsbs %a, %a, #0\n" ireg rl ireg al;
- fprintf oc " rsc %a, %a, #0\n" ireg rh ireg ah; 2
- end
+ fprintf oc " rsbs %a, %a, #0\n"
+ ireg (if rl = ah then IR14 else rl) ireg al;
+ (* No "rsc" instruction in Thumb2. Emulate based on
+ rsc a, b, #0 == a <- AddWithCarry(~b, 0, carry)
+ == mvn a, b; adc a, a, #0 *)
+ if !Clflags.option_fthumb then begin
+ fprintf oc " mvn %a, %a\n" ireg rh ireg ah;
+ fprintf oc " adc %a, %a, #0\n" ireg rh ireg rh
+ end else begin
+ fprintf oc " rsc %a, %a, #0\n" ireg rh ireg ah
+ end;
+ if rl = ah then
+ fprintf oc " mov%t %a, %a\n" thumbS ireg rl ireg IR14;
+ 4
| "__builtin_addl", [IR ah; IR al; IR bh; IR bl], [IR rh; IR rl] ->
if rl = ah || rl = bh then begin
fprintf oc " adds %a, %a, %a\n" ireg IR14 ireg al ireg bl;
fprintf oc " adc %a, %a, %a\n" ireg rh ireg ah ireg bh;
- fprintf oc " mov %a, %a\n" ireg rl ireg IR14; 3
+ fprintf oc " mov%t %a, %a\n" thumbS ireg rl ireg IR14; 3
end else begin
fprintf oc " adds %a, %a, %a\n" ireg rl ireg al ireg bl;
fprintf oc " adc %a, %a, %a\n" ireg rh ireg ah ireg bh; 2
@@ -484,18 +561,13 @@ let print_builtin_inline oc name args res =
if rl = ah || rl = bh then begin
fprintf oc " subs %a, %a, %a\n" ireg IR14 ireg al ireg bl;
fprintf oc " sbc %a, %a, %a\n" ireg rh ireg ah ireg bh;
- fprintf oc " mov %a, %a\n" ireg rl ireg IR14; 3
+ fprintf oc " mov%t %a, %a\n" thumbS ireg rl ireg IR14; 3
end else begin
fprintf oc " subs %a, %a, %a\n" ireg rl ireg al ireg bl;
fprintf oc " sbc %a, %a, %a\n" ireg rh ireg ah ireg bh; 2
end
| "__builtin_mull", [IR a; IR b], [IR rh; IR rl] ->
- if rl = a || rh = a then begin
- fprintf oc " mov %a, %a\n" ireg IR14 ireg a;
- fprintf oc " umull %a, %a, %a, %a\n" ireg rl ireg rh ireg IR14 ireg b; 2
- end else begin
- fprintf oc " umull %a, %a, %a, %a\n" ireg rl ireg rh ireg a ireg b; 1
- end
+ fprintf oc " umull %a, %a, %a, %a\n" ireg rl ireg rh ireg a ireg b; 1
(* Memory accesses *)
| "__builtin_read16_reversed", [IR a1], [IR res] ->
fprintf oc " ldrh %a, [%a, #0]\n" ireg res ireg a1;
@@ -534,16 +606,16 @@ module FixupEABI = struct
let fixup_double oc dir f i1 i2 =
match dir with
| Incoming -> (* f <- (i1, i2) *)
- fprintf oc " fmdrr %a, %a, %a\n" freg f ireg i1 ireg i2
+ fprintf oc " vmov %a, %a, %a\n" freg f ireg i1 ireg i2
| Outgoing -> (* (i1, i2) <- f *)
- fprintf oc " fmrrd %a, %a, %a\n" ireg i1 ireg i2 freg f
+ fprintf oc " vmov %a, %a, %a\n" ireg i1 ireg i2 freg f
let fixup_single oc dir f i =
match dir with
| Incoming -> (* f <- i *)
- fprintf oc " fmsr %a, %a\n" freg_single f ireg i
+ fprintf oc " vmov %a, %a\n" freg_single f ireg i
| Outgoing -> (* i <- f *)
- fprintf oc " fmrs %a, %a\n" ireg i freg_single f
+ fprintf oc " vmov %a, %a\n" ireg i freg_single f
let fixup_conventions oc dir tyl =
let rec fixup i tyl =
@@ -608,11 +680,11 @@ module FixupHF = struct
| [] -> 0
| (fr, Double, dr) :: act ->
if fr = dr then fixup_outgoing oc act else begin
- fprintf oc " fcpyd d%d, d%d\n" dr fr;
+ fprintf oc " vmov.f64 d%d, d%d\n" dr fr;
1 + fixup_outgoing oc act
end
| (fr, Single, sr) :: act ->
- fprintf oc " fcpys s%d, s%d\n" sr (2*fr);
+ fprintf oc " vmov.f32 s%d, s%d\n" sr (2*fr);
1 + fixup_outgoing oc act
let rec fixup_incoming oc = function
@@ -620,13 +692,13 @@ module FixupHF = struct
| (fr, Double, dr) :: act ->
let n = fixup_incoming oc act in
if fr = dr then n else begin
- fprintf oc " fcpyd d%d, d%d\n" fr dr;
+ fprintf oc " vmov.f64 d%d, d%d\n" fr dr;
1 + n
end
| (fr, Single, sr) :: act ->
let n = fixup_incoming oc act in
if fr = sr then n else begin
- fprintf oc " fcpys s%d, s%d\n" (2*fr) sr;
+ fprintf oc " vmov.f32 s%d, s%d\n" (2*fr) sr;
1 + n
end
@@ -658,29 +730,23 @@ let (fixup_arguments, fixup_result) =
let shift_op oc = function
| SOimm n -> fprintf oc "#%a" coqint n
| SOreg r -> ireg oc r
- | SOlslimm(r, n) -> fprintf oc "%a, lsl #%a" ireg r coqint n
- | SOlslreg(r, r') -> fprintf oc "%a, lsl %a" ireg r ireg r'
- | SOlsrimm(r, n) -> fprintf oc "%a, lsr #%a" ireg r coqint n
- | SOlsrreg(r, r') -> fprintf oc "%a, lsr %a" ireg r ireg r'
- | SOasrimm(r, n) -> fprintf oc "%a, asr #%a" ireg r coqint n
- | SOasrreg(r, r') -> fprintf oc "%a, asr %a" ireg r ireg r'
- | SOrorimm(r, n) -> fprintf oc "%a, ror #%a" ireg r coqint n
- | SOrorreg(r, r') -> fprintf oc "%a, ror %a" ireg r ireg r'
-
-let shift_addr oc = function
- | SAimm n -> fprintf oc "#%a" coqint n
- | SAreg r -> ireg oc r
- | SAlsl(r, n) -> fprintf oc "%a, lsl #%a" ireg r coqint n
- | SAlsr(r, n) -> fprintf oc "%a, lsr #%a" ireg r coqint n
- | SAasr(r, n) -> fprintf oc "%a, asr #%a" ireg r coqint n
- | SAror(r, n) -> fprintf oc "%a, ror #%a" ireg r coqint n
+ | SOlsl(r, n) -> fprintf oc "%a, lsl #%a" ireg r coqint n
+ | SOlsr(r, n) -> fprintf oc "%a, lsr #%a" ireg r coqint n
+ | SOasr(r, n) -> fprintf oc "%a, asr #%a" ireg r coqint n
+ | SOror(r, n) -> fprintf oc "%a, ror #%a" ireg r coqint n
let print_instruction oc = function
(* Core instructions *)
| Padd(r1, r2, so) ->
- fprintf oc " add %a, %a, %a\n" ireg r1 ireg r2 shift_op so; 1
+ fprintf oc " add%s %a, %a, %a\n"
+ (if !Clflags.option_fthumb && r2 <> IR14 then "s" else "")
+ ireg r1 ireg r2 shift_op so; 1
| Pand(r1, r2, so) ->
- fprintf oc " and %a, %a, %a\n" ireg r1 ireg r2 shift_op so; 1
+ fprintf oc " and%t %a, %a, %a\n"
+ thumbS ireg r1 ireg r2 shift_op so; 1
+ | Pasr(r1, r2, r3) ->
+ fprintf oc " asr%t %a, %a, %a\n"
+ thumbS ireg r1 ireg r2 ireg r3; 1
| Pb lbl ->
fprintf oc " b %a\n" print_label lbl; 1
| Pbc(bit, lbl) ->
@@ -707,141 +773,181 @@ let print_instruction oc = function
let n2 = fixup_result oc Incoming sg in
n1 + 1 + n2
| Pbic(r1, r2, so) ->
- fprintf oc " bic %a, %a, %a\n" ireg r1 ireg r2 shift_op so; 1
+ fprintf oc " bic%t %a, %a, %a\n"
+ thumbS ireg r1 ireg r2 shift_op so; 1
| Pcmp(r1, so) ->
fprintf oc " cmp %a, %a\n" ireg r1 shift_op so; 1
| Peor(r1, r2, so) ->
- fprintf oc " eor %a, %a, %a\n" ireg r1 ireg r2 shift_op so; 1
+ fprintf oc " eor%t %a, %a, %a\n"
+ thumbS ireg r1 ireg r2 shift_op so; 1
| Pldr(r1, r2, sa) | Pldr_a(r1, r2, sa) ->
- fprintf oc " ldr %a, [%a, %a]\n" ireg r1 ireg r2 shift_addr sa; 1
+ fprintf oc " ldr %a, [%a, %a]\n" ireg r1 ireg r2 shift_op sa; 1
| Pldrb(r1, r2, sa) ->
- fprintf oc " ldrb %a, [%a, %a]\n" ireg r1 ireg r2 shift_addr sa; 1
+ fprintf oc " ldrb %a, [%a, %a]\n" ireg r1 ireg r2 shift_op sa; 1
| Pldrh(r1, r2, sa) ->
- fprintf oc " ldrh %a, [%a, %a]\n" ireg r1 ireg r2 shift_addr sa; 1
+ fprintf oc " ldrh %a, [%a, %a]\n" ireg r1 ireg r2 shift_op sa; 1
| Pldrsb(r1, r2, sa) ->
- fprintf oc " ldrsb %a, [%a, %a]\n" ireg r1 ireg r2 shift_addr sa; 1
+ fprintf oc " ldrsb %a, [%a, %a]\n" ireg r1 ireg r2 shift_op sa; 1
| Pldrsh(r1, r2, sa) ->
- fprintf oc " ldrsh %a, [%a, %a]\n" ireg r1 ireg r2 shift_addr sa; 1
+ fprintf oc " ldrsh %a, [%a, %a]\n" ireg r1 ireg r2 shift_op sa; 1
+ | Plsl(r1, r2, r3) ->
+ fprintf oc " lsl%t %a, %a, %a\n"
+ thumbS ireg r1 ireg r2 ireg r3; 1
+ | Plsr(r1, r2, r3) ->
+ fprintf oc " lsr%t %a, %a, %a\n"
+ thumbS ireg r1 ireg r2 ireg r3; 1
| Pmla(r1, r2, r3, r4) ->
fprintf oc " mla %a, %a, %a, %a\n" ireg r1 ireg r2 ireg r3 ireg r4; 1
| Pmov(r1, so) ->
- fprintf oc " mov %a, %a\n" ireg r1 shift_op so; 1
- | Pmovc(bit, r1, so) ->
- fprintf oc " mov%s %a, %a\n" (condition_name bit) ireg r1 shift_op so; 1
+ fprintf oc " mov%t %a, %a\n" thumbS ireg r1 shift_op so; 1
+ | Pmovw(r1, n) ->
+ fprintf oc " movw %a, #%a\n" ireg r1 coqint n; 1
+ | Pmovt(r1, n) ->
+ fprintf oc " movt %a, #%a\n" ireg r1 coqint n; 1
| Pmul(r1, r2, r3) ->
fprintf oc " mul %a, %a, %a\n" ireg r1 ireg r2 ireg r3; 1
| Pmvn(r1, so) ->
- fprintf oc " mvn %a, %a\n" ireg r1 shift_op so; 1
+ fprintf oc " mvn%t %a, %a\n" thumbS ireg r1 shift_op so; 1
| Porr(r1, r2, so) ->
- fprintf oc " orr %a, %a, %a\n" ireg r1 ireg r2 shift_op so; 1
+ fprintf oc " orr%t %a, %a, %a\n"
+ thumbS ireg r1 ireg r2 shift_op so; 1
| Prsb(r1, r2, so) ->
- fprintf oc " rsb %a, %a, %a\n" ireg r1 ireg r2 shift_op so; 1
+ fprintf oc " rsb%t %a, %a, %a\n"
+ thumbS ireg r1 ireg r2 shift_op so; 1
| Pstr(r1, r2, sa) | Pstr_a(r1, r2, sa) ->
- fprintf oc " str %a, [%a, %a]\n" ireg r1 ireg r2 shift_addr sa;
+ fprintf oc " str %a, [%a, %a]\n" ireg r1 ireg r2 shift_op sa;
begin match r1, r2, sa with
- | IR14, IR13, SAimm n -> cfi_rel_offset oc "lr" (camlint_of_coqint n)
+ | IR14, IR13, SOimm n -> cfi_rel_offset oc "lr" (camlint_of_coqint n)
| _ -> ()
end;
1
| Pstrb(r1, r2, sa) ->
- fprintf oc " strb %a, [%a, %a]\n" ireg r1 ireg r2 shift_addr sa; 1
+ fprintf oc " strb %a, [%a, %a]\n" ireg r1 ireg r2 shift_op sa; 1
| Pstrh(r1, r2, sa) ->
- fprintf oc " strh %a, [%a, %a]\n" ireg r1 ireg r2 shift_addr sa; 1
+ fprintf oc " strh %a, [%a, %a]\n" ireg r1 ireg r2 shift_op sa; 1
| Psdiv ->
fprintf oc " bl __aeabi_idiv\n"; 1
+ | Psbfx(r1, r2, lsb, sz) ->
+ fprintf oc " sbfx %a, %a, #%a, #%a\n" ireg r1 ireg r2 coqint lsb coqint sz; 1
| Psmull(r1, r2, r3, r4) ->
fprintf oc " smull %a, %a, %a, %a\n" ireg r1 ireg r2 ireg r3 ireg r4; 1
| Psub(r1, r2, so) ->
- fprintf oc " sub %a, %a, %a\n" ireg r1 ireg r2 shift_op so; 1
+ fprintf oc " sub%t %a, %a, %a\n"
+ thumbS ireg r1 ireg r2 shift_op so; 1
| Pudiv ->
fprintf oc " bl __aeabi_uidiv\n"; 1
| Pumull(r1, r2, r3, r4) ->
fprintf oc " umull %a, %a, %a, %a\n" ireg r1 ireg r2 ireg r3 ireg r4; 1
- (* Floating-point coprocessor instructions *)
+ (* Floating-point VFP instructions *)
| Pfcpyd(r1, r2) ->
- fprintf oc " fcpyd %a, %a\n" freg r1 freg r2; 1
+ fprintf oc " vmov.f64 %a, %a\n" freg r1 freg r2; 1
| Pfabsd(r1, r2) ->
- fprintf oc " fabsd %a, %a\n" freg r1 freg r2; 1
+ fprintf oc " vabs.f64 %a, %a\n" freg r1 freg r2; 1
| Pfnegd(r1, r2) ->
- fprintf oc " fnegd %a, %a\n" freg r1 freg r2; 1
+ fprintf oc " vneg.f64 %a, %a\n" freg r1 freg r2; 1
| Pfaddd(r1, r2, r3) ->
- fprintf oc " faddd %a, %a, %a\n" freg r1 freg r2 freg r3; 1
+ fprintf oc " vadd.f64 %a, %a, %a\n" freg r1 freg r2 freg r3; 1
| Pfdivd(r1, r2, r3) ->
- fprintf oc " fdivd %a, %a, %a\n" freg r1 freg r2 freg r3; 1
+ fprintf oc " vdiv.f64 %a, %a, %a\n" freg r1 freg r2 freg r3; 1
| Pfmuld(r1, r2, r3) ->
- fprintf oc " fmuld %a, %a, %a\n" freg r1 freg r2 freg r3; 1
+ fprintf oc " vmul.f64 %a, %a, %a\n" freg r1 freg r2 freg r3; 1
| Pfsubd(r1, r2, r3) ->
- fprintf oc " fsubd %a, %a, %a\n" freg r1 freg r2 freg r3; 1
+ fprintf oc " vsub.f64 %a, %a, %a\n" freg r1 freg r2 freg r3; 1
| Pflid(r1, f) ->
- (* We could make good use of the fconstd instruction, but it's available
- in VFD v3 and up, not in v1 nor v2 *)
- let lbl = label_float f in
- fprintf oc " fldd %a, .L%d @ %.12g\n" freg r1 lbl (camlfloat_of_coqfloat f); 1
+ let f = camlint64_of_coqint(Floats.Float.to_bits f) in
+ if !vfpv3 && is_immediate_float64 f then begin
+ fprintf oc " vmov.f64 %a, #%F\n"
+ freg r1 (Int64.float_of_bits f); 1
+ (* immediate floats have at most 4 bits of fraction, so they
+ should print exactly with OCaml's F decimal format. *)
+ end else if !literals_in_code then begin
+ let lbl = label_float f in
+ fprintf oc " vldr %a, .L%d @ %.12g\n"
+ freg r1 lbl (Int64.float_of_bits f); 1
+ end else begin
+ let lbl = label_float f in
+ fprintf oc " movw r14, #:lower16:.L%d\n" lbl;
+ fprintf oc " movt r14, #:upper16:.L%d\n" lbl;
+ fprintf oc " vldr %a, [r14, #0] @ %.12g\n"
+ freg r1 (Int64.float_of_bits f); 3
+ end
| Pfcmpd(r1, r2) ->
- fprintf oc " fcmpd %a, %a\n" freg r1 freg r2;
- fprintf oc " fmstat\n"; 2
+ fprintf oc " vcmp.f64 %a, %a\n" freg r1 freg r2;
+ fprintf oc " vmrs APSR_nzcv, FPSCR\n"; 2
| Pfcmpzd(r1) ->
- fprintf oc " fcmpzd %a\n" freg r1;
- fprintf oc " fmstat\n"; 2
+ fprintf oc " vcmp.f64 %a, #0\n" freg r1;
+ fprintf oc " vmrs APSR_nzcv, FPSCR\n"; 2
| Pfsitod(r1, r2) ->
- fprintf oc " fmsr %a, %a\n" freg_single r1 ireg r2;
- fprintf oc " fsitod %a, %a\n" freg r1 freg_single r1; 2
+ fprintf oc " vmov %a, %a\n" freg_single r1 ireg r2;
+ fprintf oc " vcvt.f64.s32 %a, %a\n" freg r1 freg_single r1; 2
| Pfuitod(r1, r2) ->
- fprintf oc " fmsr %a, %a\n" freg_single r1 ireg r2;
- fprintf oc " fuitod %a, %a\n" freg r1 freg_single r1; 2
+ fprintf oc " vmov %a, %a\n" freg_single r1 ireg r2;
+ fprintf oc " vcvt.f64.u32 %a, %a\n" freg r1 freg_single r1; 2
| Pftosizd(r1, r2) ->
- fprintf oc " ftosizd %a, %a\n" freg_single FR6 freg r2;
- fprintf oc " fmrs %a, %a\n" ireg r1 freg_single FR6; 2
+ fprintf oc " vcvt.s32.f64 %a, %a\n" freg_single FR6 freg r2;
+ fprintf oc " vmov %a, %a\n" ireg r1 freg_single FR6; 2
| Pftouizd(r1, r2) ->
- fprintf oc " ftouizd %a, %a\n" freg_single FR6 freg r2;
- fprintf oc " fmrs %a, %a\n" ireg r1 freg_single FR6; 2
+ fprintf oc " vcvt.u32.f64 %a, %a\n" freg_single FR6 freg r2;
+ fprintf oc " vmov %a, %a\n" ireg r1 freg_single FR6; 2
| Pfabss(r1, r2) ->
- fprintf oc " fabss %a, %a\n" freg_single r1 freg_single r2; 1
+ fprintf oc " vabs.f32 %a, %a\n" freg_single r1 freg_single r2; 1
| Pfnegs(r1, r2) ->
- fprintf oc " fnegs %a, %a\n" freg_single r1 freg_single r2; 1
+ fprintf oc " vneg.f32 %a, %a\n" freg_single r1 freg_single r2; 1
| Pfadds(r1, r2, r3) ->
- fprintf oc " fadds %a, %a, %a\n" freg_single r1 freg_single r2 freg_single r3; 1
+ fprintf oc " vadd.f32 %a, %a, %a\n" freg_single r1 freg_single r2 freg_single r3; 1
| Pfdivs(r1, r2, r3) ->
- fprintf oc " fdivs %a, %a, %a\n" freg_single r1 freg_single r2 freg_single r3; 1
+ fprintf oc " vdiv.f32 %a, %a, %a\n" freg_single r1 freg_single r2 freg_single r3; 1
| Pfmuls(r1, r2, r3) ->
- fprintf oc " fmuls %a, %a, %a\n" freg_single r1 freg_single r2 freg_single r3; 1
+ fprintf oc " vmul.f32 %a, %a, %a\n" freg_single r1 freg_single r2 freg_single r3; 1
| Pfsubs(r1, r2, r3) ->
- fprintf oc " fsubs %a, %a, %a\n" freg_single r1 freg_single r2 freg_single r3; 1
+ fprintf oc " vsub.f32 %a, %a, %a\n" freg_single r1 freg_single r2 freg_single r3; 1
| Pflis(r1, f) ->
- (* We could make good use of the fconsts instruction, but it's available
- in VFD v3 and up, not in v1 nor v2 *)
- let lbl = label_float32 f in
- fprintf oc " flds %a, .L%d @ %.12g\n" freg_single r1 lbl (camlfloat_of_coqfloat32 f); 1
+ let f = camlint_of_coqint(Floats.Float32.to_bits f) in
+ if !vfpv3 && is_immediate_float32 f then begin
+ fprintf oc " vmov.f32 %a, #%F\n"
+ freg_single r1 (Int32.float_of_bits f); 1
+ (* immediate floats have at most 4 bits of fraction, so they
+ should print exactly with OCaml's F decimal format. *)
+ end else if !literals_in_code then begin
+ let lbl = label_float32 f in
+ fprintf oc " vldr %a, .L%d @ %.12g\n"
+ freg_single r1 lbl (Int32.float_of_bits f); 1
+ end else begin
+ fprintf oc " movw r14, #%ld\n" (Int32.logand f 0xFFFFl);
+ fprintf oc " movt r14, #%ld\n" (Int32.shift_right_logical f 16);
+ fprintf oc " vmov %a, r14 @ %.12g\n"
+ freg_single r1 (Int32.float_of_bits f); 3
+ end
| Pfcmps(r1, r2) ->
- fprintf oc " fcmps %a, %a\n" freg_single r1 freg_single r2;
- fprintf oc " fmstat\n"; 2
+ fprintf oc " vcmp.f32 %a, %a\n" freg_single r1 freg_single r2;
+ fprintf oc " vmrs APSR_nzcv, FPSCR\n"; 2
| Pfcmpzs(r1) ->
- fprintf oc " fcmpzs %a\n" freg_single r1;
- fprintf oc " fmstat\n"; 2
+ fprintf oc " vcmp.f32 %a, #0\n" freg_single r1;
+ fprintf oc " vmrs APSR_nzcv, FPSCR\n"; 2
| Pfsitos(r1, r2) ->
- fprintf oc " fmsr %a, %a\n" freg_single r1 ireg r2;
- fprintf oc " fsitos %a, %a\n" freg_single r1 freg_single r1; 2
+ fprintf oc " vmov %a, %a\n" freg_single r1 ireg r2;
+ fprintf oc " vcvt.f32.s32 %a, %a\n" freg_single r1 freg_single r1; 2
| Pfuitos(r1, r2) ->
- fprintf oc " fmsr %a, %a\n" freg_single r1 ireg r2;
- fprintf oc " fuitos %a, %a\n" freg_single r1 freg_single r1; 2
+ fprintf oc " vmov %a, %a\n" freg_single r1 ireg r2;
+ fprintf oc " vcvt.f32.u32 %a, %a\n" freg_single r1 freg_single r1; 2
| Pftosizs(r1, r2) ->
- fprintf oc " ftosizs %a, %a\n" freg_single FR6 freg_single r2;
- fprintf oc " fmrs %a, %a\n" ireg r1 freg_single FR6; 2
+ fprintf oc " vcvt.s32.f32 %a, %a\n" freg_single FR6 freg_single r2;
+ fprintf oc " vmov %a, %a\n" ireg r1 freg_single FR6; 2
| Pftouizs(r1, r2) ->
- fprintf oc " ftouizs %a, %a\n" freg_single FR6 freg_single r2;
- fprintf oc " fmrs %a, %a\n" ireg r1 freg_single FR6; 2
+ fprintf oc " vcvt.u32.f32 %a, %a\n" freg_single FR6 freg_single r2;
+ fprintf oc " vmov %a, %a\n" ireg r1 freg_single FR6; 2
| Pfcvtsd(r1, r2) ->
- fprintf oc " fcvtsd %a, %a\n" freg_single r1 freg r2; 1
+ fprintf oc " vcvt.f32.f64 %a, %a\n" freg_single r1 freg r2; 1
| Pfcvtds(r1, r2) ->
- fprintf oc " fcvtds %a, %a\n" freg r1 freg_single r2; 1
+ fprintf oc " vcvt.f64.f32 %a, %a\n" freg r1 freg_single r2; 1
| Pfldd(r1, r2, n) | Pfldd_a(r1, r2, n) ->
- fprintf oc " fldd %a, [%a, #%a]\n" freg r1 ireg r2 coqint n; 1
+ fprintf oc " vldr %a, [%a, #%a]\n" freg r1 ireg r2 coqint n; 1
| Pflds(r1, r2, n) ->
- fprintf oc " flds %a, [%a, #%a]\n" freg_single r1 ireg r2 coqint n; 1
+ fprintf oc " vldr %a, [%a, #%a]\n" freg_single r1 ireg r2 coqint n; 1
| Pfstd(r1, r2, n) | Pfstd_a(r1, r2, n) ->
- fprintf oc " fstd %a, [%a, #%a]\n" freg r1 ireg r2 coqint n; 1
+ fprintf oc " vstr %a, [%a, #%a]\n" freg r1 ireg r2 coqint n; 1
| Pfsts(r1, r2, n) ->
- fprintf oc " fsts %a, [%a, #%a]\n" freg_single r1 ireg r2 coqint n; 1
+ fprintf oc " vstr %a, [%a, #%a]\n" freg_single r1 ireg r2 coqint n; 1
(* Pseudo-instructions *)
| Pallocframe(sz, ofs) ->
fprintf oc " mov r12, sp\n";
@@ -866,17 +972,32 @@ let print_instruction oc = function
| Plabel lbl ->
fprintf oc "%a:\n" print_label lbl; 0
| Ploadsymbol(r1, id, ofs) ->
- let lbl = label_symbol id ofs in
- fprintf oc " ldr %a, .L%d @ %a\n"
- ireg r1 lbl print_symb_ofs (id, ofs); 1
+ loadsymbol oc r1 id ofs
+ | Pmovite(cond, r1, ifso, ifnot) ->
+ fprintf oc " ite %s\n" (condition_name cond);
+ fprintf oc " mov%s %a, %a\n"
+ (condition_name cond) ireg r1 shift_op ifso;
+ fprintf oc " mov%s %a, %a\n"
+ (neg_condition_name cond) ireg r1 shift_op ifnot; 2
| Pbtbl(r, tbl) ->
- fprintf oc " mov r14, %a, lsl #2\n" ireg r;
- fprintf oc " ldr pc, [pc, r14]\n";
- fprintf oc " mov r0, r0\n"; (* no-op *)
- List.iter
- (fun l -> fprintf oc " .word %a\n" print_label l)
- tbl;
- 3 + List.length tbl
+ if !Clflags.option_fthumb then begin
+ let lbl = new_label() in
+ fprintf oc " adr r14, .L%d\n" lbl;
+ fprintf oc " add r14, r14, %a, lsl #2\n" ireg r;
+ fprintf oc " mov pc, r14\n";
+ fprintf oc ".L%d:\n" lbl;
+ List.iter
+ (fun l -> fprintf oc " b.w %a\n" print_label l)
+ tbl;
+ 3 + List.length tbl
+ end else begin
+ fprintf oc " add pc, pc, %a, lsl #2\n" ireg r;
+ fprintf oc " nop\n";
+ List.iter
+ (fun l -> fprintf oc " b %a\n" print_label l)
+ tbl;
+ 2 + List.length tbl
+ end
| Pbuiltin(ef, args, res) ->
begin match ef with
| EF_builtin(name, sg) ->
@@ -938,25 +1059,31 @@ let print_function oc name fn =
reset_constants();
current_function_sig := fn.fn_sig;
currpos := 0;
- let text =
+ let (text, lit) =
match C2C.atom_sections name with
- | t :: _ -> t
- | _ -> Section_text in
+ | t :: l :: _ -> (t, l)
+ | _ -> (Section_text, Section_literal) in
section oc text;
let alignment =
match !Clflags.option_falignfunctions with Some n -> n | None -> 4 in
fprintf oc " .balign %d\n" alignment;
if not (C2C.atom_is_static name) then
fprintf oc " .global %a\n" print_symb name;
+ if !Clflags.option_fthumb then
+ fprintf oc " .thumb_func\n";
fprintf oc "%a:\n" print_symb name;
print_location oc (C2C.atom_location name);
cfi_startproc oc;
ignore (fixup_arguments oc Incoming fn.fn_sig);
print_instructions oc fn.fn_code;
- emit_constants oc;
+ if !literals_in_code then emit_constants oc;
cfi_endproc oc;
fprintf oc " .type %a, %%function\n" print_symb name;
- fprintf oc " .size %a, . - %a\n" print_symb name print_symb name
+ fprintf oc " .size %a, . - %a\n" print_symb name print_symb name;
+ if not !literals_in_code && !size_constants > 0 then begin
+ section oc lit;
+ emit_constants oc
+ end
(* Data *)
@@ -1031,9 +1158,14 @@ let print_globdef oc (name, gdef) =
| Gvar v -> print_var oc name v
let print_program oc p =
-(* fprintf oc " .fpu vfp\n"; *)
PrintAnnot.print_version_and_options oc comment;
Hashtbl.clear filename_num;
+ fprintf oc " .syntax unified\n";
+ fprintf oc " .arch %s\n"
+ (if !Clflags.option_fthumb then "armv7" else "armv6");
+ fprintf oc " .fpu %s\n"
+ (if !vfpv3 then "vfpv3-d16" else "vfpv2");
+ fprintf oc " .%s\n" (if !Clflags.option_fthumb then "thumb" else "arm");
List.iter (print_globdef oc) p.prog_defs
diff --git a/arm/SelectOp.vp b/arm/SelectOp.vp
index ad8a945..6102d82 100644
--- a/arm/SelectOp.vp
+++ b/arm/SelectOp.vp
@@ -454,7 +454,7 @@ Definition can_use_Aindexed2 (chunk: memory_chunk): bool :=
| Many64 => false
end.
-Definition can_use_Aindexed2shift (chunk: memory_chunk): bool :=
+Definition can_use_Aindexed2shift (chunk: memory_chunk) (s: shift): bool :=
match chunk with
| Mint8signed => false
| Mint8unsigned => true
@@ -466,14 +466,20 @@ Definition can_use_Aindexed2shift (chunk: memory_chunk): bool :=
| Mfloat64 => false
| Many32 => true
| Many64 => false
- end.
+ end &&
+ (if thumb tt then
+ match s with
+ | Slsl s => Int.ltu s (Int.repr 4)
+ | _ => false
+ end
+ else true).
Nondetfunction addressing (chunk: memory_chunk) (e: expr) :=
match e with
| Eop (Oaddrstack n) Enil => (Ainstack n, Enil)
| Eop (Oaddimm n) (e1:::Enil) => (Aindexed n, e1:::Enil)
| Eop (Oaddshift s) (e1:::e2:::Enil) =>
- if can_use_Aindexed2shift chunk
+ if can_use_Aindexed2shift chunk s
then (Aindexed2shift s, e1:::e2:::Enil)
else (Aindexed Int.zero, Eop (Oaddshift s) (e1:::e2:::Enil) ::: Enil)
| Eop Oadd (e1:::e2:::Enil) =>
diff --git a/driver/Clflags.ml b/driver/Clflags.ml
index 610d807..7104c32 100644
--- a/driver/Clflags.ml
+++ b/driver/Clflags.ml
@@ -28,6 +28,7 @@ let option_falignfunctions = ref (None: int option)
let option_falignbranchtargets = ref 0
let option_faligncondbranchs = ref 0
let option_finline_asm = ref false
+let option_fthumb = ref false
let option_Osize = ref false
let option_dparse = ref false
let option_dcmedium = ref false
diff --git a/driver/Compopts.v b/driver/Compopts.v
index 205f768..01109f5 100644
--- a/driver/Compopts.v
+++ b/driver/Compopts.v
@@ -30,4 +30,5 @@ Parameter va_strict: unit -> bool.
(** Flag -ftailcalls. For tail call optimization. *)
Parameter eliminate_tailcalls: unit -> bool.
-
+(** Flag -fthumb. For the ARM back-end. *)
+Parameter thumb: unit -> bool.
diff --git a/driver/Driver.ml b/driver/Driver.ml
index 88871f7..556a476 100644
--- a/driver/Driver.ml
+++ b/driver/Driver.ml
@@ -429,6 +429,7 @@ Code generation options: (use -fno-<opt> to turn off -f<opt>) :
-O Optimize for speed [on by default]
-Os Optimize for code size
-ffpu Use FP registers for some integer operations [on]
+ -fthumb (ARM only) Use Thumb2 instruction encoding
-fsmall-data <n> Set maximal size <n> for allocation in small data area
-fsmall-const <n> Set maximal size <n> for allocation in small constant area
-ffloat-const-prop <n> Control constant propagation of floats
@@ -552,6 +553,7 @@ let cmdline_actions =
@ f_opt "inline-asm" option_finline_asm
@ f_opt "fpu" option_ffpu
@ f_opt "sse" option_ffpu (* backward compatibility *)
+ @ f_opt "thumb" option_fthumb
let _ =
Gc.set { (Gc.get()) with
diff --git a/extraction/extraction.v b/extraction/extraction.v
index 0f23264..f0033af 100644
--- a/extraction/extraction.v
+++ b/extraction/extraction.v
@@ -93,6 +93,8 @@ Extract Constant Compopts.generate_float_constants =>
"fun _ -> !Clflags.option_ffloatconstprop >= 2".
Extract Constant Compopts.eliminate_tailcalls =>
"fun _ -> !Clflags.option_ftailcalls".
+Extract Constant Compopts.thumb =>
+ "fun _ -> !Clflags.option_fthumb".
(* Compiler *)
Extract Constant Compiler.print_Clight => "PrintClight.print_if".
diff --git a/runtime/arm/i64_dtos.S b/runtime/arm/i64_dtos.S
index e454fd4..638a5ff 100644
--- a/runtime/arm/i64_dtos.S
+++ b/runtime/arm/i64_dtos.S
@@ -34,66 +34,67 @@
@ Helper functions for 64-bit integer arithmetic. ARM version.
- .text
+#include "sysdeps.h"
@@@ Conversion from double float to signed 64-bit integer
- .global __i64_dtos
-__i64_dtos:
-#ifdef VARIANT_hardfloat
- fmrrd r0, r1, d0
+FUNCTION(__i64_dtos)
+#ifndef VARIANT_eabi
+ vmov r0, r1, d0
#endif
- mov r12, r1, asr #31 @ save sign of result in r12
+ ASR r12, r1, #31 @ save sign of result in r12
@ extract unbiased exponent ((HI & 0x7FF00000) >> 20) - (1023 + 52) in r2
@ note: 1023 + 52 = 1075 = 1024 + 51
@ note: (HI & 0x7FF00000) >> 20 = (HI << 1) >> 21
- mov r2, r1, lsl #1
- mov r2, r2, lsr #21
- sub r2, r2, #51
- sub r2, r2, #1024
+ LSL r2, r1, #1
+ LSR r2, r2, #21
+ SUB r2, r2, #51
+ SUB r2, r2, #1024
@ check range of exponent
cmn r2, #52 @ if EXP < -52, |double| is < 1.0
blt 1f
cmp r2, #11 @ if EXP >= 63 - 52, |double| is >= 2^63
bge 2f
@ extract true mantissa
- bic r1, r1, #0xFF000000
- bic r1, r1, #0x00F00000 @ HI &= ~0xFFF00000
- orr r1, r1, #0x00100000 @ HI |= 0x00100000
+ BIC r1, r1, #0xFF000000
+ BIC r1, r1, #0x00F00000 @ HI &= ~0xFFF00000
+ ORR r1, r1, #0x00100000 @ HI |= 0x00100000
@ shift it appropriately
cmp r2, #0
blt 3f
@ EXP >= 0: shift left by EXP. Note that EXP < 12
rsb r3, r2, #32 @ r3 = 32 - amount
- mov r1, r1, lsl r2
- orr r1, r1, r0, lsr r3
- mov r0, r0, lsl r2
+ LSL r1, r1, r2
+ LSR r3, r0, r3
+ ORR r1, r1, r3
+ LSL r0, r0, r2
b 4f
@ EXP < 0: shift right by -EXP. Note that -EXP <= 52 but can be >= 32
-3: rsb r2, r2, #0 @ r2 = -EXP - shift amount
- rsb r3, r2, #32 @ r3 = 32 - amount
- mov r0, r0, lsr r2
- orr r0, r0, r1, lsl r3
- sub r3, r2, #32 @ r3 = amount - 32 (see i64_shr.s)
- orr r0, r0, r1, lsr r3
- mov r1, r1, lsr r2
+3: RSB r2, r2, #0 @ r2 = -EXP = shift amount
+ RSB r3, r2, #32 @ r3 = 32 - amount
+ LSR r0, r0, r2
+ LSL r3, r1, r3
+ ORR r0, r0, r3
+ SUB r3, r2, #32 @ r3 = amount - 32 (see i64_shr.s)
+ LSR r3, r1, r3
+ ORR r0, r0, r3
+ LSR r1, r1, r2
@ apply sign to result
-4: eor r0, r0, r12
- eor r1, r1, r12
+4: EOR r0, r0, r12
+ EOR r1, r1, r12
subs r0, r0, r12
sbc r1, r1, r12
bx lr
@ special cases
-1: mov r0, #0 @ result is 0
- mov r1, #0
+1: MOV r0, #0 @ result is 0
+ MOV r1, #0
bx lr
-2: cmp r4, #0
+2: cmp r12, #0
blt 6f
mvn r0, #0 @ result is 0x7F....FF (MAX_SINT)
- mov r1, r0, lsr #1
+ LSR r1, r0, #1
bx lr
-6: mov r0, #0 @ result is 0x80....00 (MIN_SINT)
- mov r1, #0x80000000
+6: MOV r0, #0 @ result is 0x80....00 (MIN_SINT)
+ MOV r1, #0x80000000
bx lr
- .type __i64_dtos, %function
- .size __i64_dtos, . - __i64_dtos
+ENDFUNCTION(__i64_dtos)
diff --git a/runtime/arm/i64_dtou.S b/runtime/arm/i64_dtou.S
index a9e7b65..1c632d2 100644
--- a/runtime/arm/i64_dtou.S
+++ b/runtime/arm/i64_dtou.S
@@ -34,58 +34,58 @@
@ Helper functions for 64-bit integer arithmetic. ARM version.
- .text
+#include "sysdeps.h"
@@@ Conversion from double float to unsigned 64-bit integer
- .global __i64_dtou
-__i64_dtou:
-#ifdef VARIANT_hardfloat
- fmrrd r0, r1, d0
+FUNCTION(__i64_dtou)
+#ifndef VARIANT_eabi
+ vmov r0, r1, d0
#endif
cmp r1, #0 @ is double < 0 ?
blt 1f @ then it converts to 0
@ extract unbiased exponent ((HI & 0x7FF00000) >> 20) - (1023 + 52) in r2
@ note: 1023 + 52 = 1075 = 1024 + 51
@ note: (HI & 0x7FF00000) >> 20 = (HI << 1) >> 21
- mov r2, r1, lsl #1
- mov r2, r2, lsr #21
- sub r2, r2, #51
- sub r2, r2, #1024
+ LSL r2, r1, #1
+ LSR r2, r2, #21
+ SUB r2, r2, #51
+ SUB r2, r2, #1024
@ check range of exponent
cmn r2, #52 @ if EXP < -52, double is < 1.0
blt 1f
cmp r2, #12 @ if EXP >= 64 - 52, double is >= 2^64
bge 2f
@ extract true mantissa
- bic r1, r1, #0xFF000000
- bic r1, r1, #0x00F00000 @ HI &= ~0xFFF00000
- orr r1, r1, #0x00100000 @ HI |= 0x00100000
+ BIC r1, r1, #0xFF000000
+ BIC r1, r1, #0x00F00000 @ HI &= ~0xFFF00000
+ ORR r1, r1, #0x00100000 @ HI |= 0x00100000
@ shift it appropriately
cmp r2, #0
blt 3f
@ EXP >= 0: shift left by EXP. Note that EXP < 12
rsb r3, r2, #32 @ r3 = 32 - amount
- mov r1, r1, lsl r2
- orr r1, r1, r0, lsr r3
- mov r0, r0, lsl r2
+ LSL r1, r1, r2
+ LSR r3, r0, r3
+ ORR r1, r1, r3
+ LSL r0, r0, r2
bx lr
@ EXP < 0: shift right by -EXP. Note that -EXP <= 52 but can be >= 32
-3: rsb r2, r2, #0 @ r2 = -EXP - shift amount
- rsb r3, r2, #32 @ r3 = 32 - amount
- mov r0, r0, lsr r2
- orr r0, r0, r1, lsl r3
- sub r3, r2, #32 @ r3 = amount - 32 (see i64_shr.s)
- orr r0, r0, r1, lsr r3
- mov r1, r1, lsr r2
+3: RSB r2, r2, #0 @ r2 = -EXP = shift amount
+ RSB r3, r2, #32 @ r3 = 32 - amount
+ LSR r0, r0, r2
+ LSL r3, r1, r3
+ ORR r0, r0, r3
+ SUB r3, r2, #32 @ r3 = amount - 32 (see i64_shr.s)
+ LSR r3, r1, r3
+ ORR r0, r0, r3
+ LSR r1, r1, r2
bx lr
@ special cases
-1: mov r0, #0 @ result is 0
- mov r1, #0
+1: MOV r0, #0 @ result is 0
+ MOV r1, #0
bx lr
2: mvn r0, #0 @ result is 0xFF....FF (MAX_UINT)
- mvn r1, #0
+ MOV r1, r0
bx lr
- .type __i64_dtou, %function
- .size __i64_dtou, . - __i64_dtou
-
+ENDFUNCTION(__i64_dtou)
diff --git a/runtime/arm/i64_sar.s b/runtime/arm/i64_sar.S
index a96d092..1bbd8a7 100644
--- a/runtime/arm/i64_sar.s
+++ b/runtime/arm/i64_sar.S
@@ -34,24 +34,24 @@
@ Helper functions for 64-bit integer arithmetic. ARM version.
- .text
+#include "sysdeps.h"
@@@ Shift right signed
- .global __i64_sar
-__i64_sar:
- and r2, r2, #63 @ normalize amount to 0...63
+FUNCTION(__i64_sar)
+ AND r2, r2, #63 @ normalize amount to 0...63
rsbs r3, r2, #32 @ r3 = 32 - amount
ble 1f @ branch if <= 0, namely if amount >= 32
- mov r0, r0, lsr r2
- orr r0, r1, lsl r3
- mov r1, r1, asr r2
+ LSR r0, r0, r2
+ LSL r3, r1, r3
+ ORR r0, r1, r3
+ ASR r1, r1, r2
bx lr
1:
- sub r2, r2, #32
- mov r0, r1, asr r2
- mov r1, r1, asr #31
+ SUB r2, r2, #32
+ ASR r0, r1, r2
+ ASR r1, r1, #31
bx lr
- .type __i64_sar, %function
- .size __i64_sar, . - __i64_sar
+ENDFUNCTION(__i64_sar)
+
diff --git a/runtime/arm/i64_smod.s b/runtime/arm/i64_sdiv.S
index 8e766a0..dd88c12 100644
--- a/runtime/arm/i64_smod.s
+++ b/runtime/arm/i64_sdiv.S
@@ -34,31 +34,28 @@
@ Helper functions for 64-bit integer arithmetic. ARM version.
- .text
+#include "sysdeps.h"
-@@@ Signed modulus
-
- .global __i64_smod
-__i64_smod:
+@@@ Signed division
+
+FUNCTION(__i64_sdiv)
push {r4, r5, r6, r7, r8, r10, lr}
- mov r10, r1 @ r10 = sign of result
- mov r4, r1, asr#31 @ take absolute value of N
- eor r0, r0, r4 @ N = (N ^ (N >>s 31)) - (N >>s 31)
- eor r1, r1, r4
+ ASR r4, r1, #31 @ r4 = sign of N
+ ASR r5, r3, #31 @ r5 = sign of D
+ EOR r10, r4, r5 @ r10 = sign of result
+ EOR r0, r0, r4 @ take absolute value of N
+ EOR r1, r1, r4 @ N = (N ^ (N >>s 31)) - (N >>s 31)
subs r0, r0, r4
sbc r1, r1, r4
- mov r4, r3, asr #31 @ take absolute value of D
- eor r2, r2, r4
- eor r3, r3, r4
- subs r2, r2, r4
- sbc r3, r3, r4
+ EOR r2, r2, r5 @ take absolute value of D
+ EOR r3, r3, r5
+ subs r2, r2, r5
+ sbc r3, r3, r5
bl __i64_udivmod @ do unsigned division
- eor r0, r0, r10, asr#31 @ apply expected sign
- eor r1, r1, r10, asr#31
- subs r0, r0, r10, asr#31
- sbc r1, r1, r10, asr#31
+ EOR r0, r4, r10 @ apply expected sign
+ EOR r1, r5, r10
+ subs r0, r0, r10
+ sbc r1, r1, r10
pop {r4, r5, r6, r7, r8, r10, lr}
bx lr
- .type __i64_smod, %function
- .size __i64_smod, . - __i64_smod
-
+ENDFUNCTION(__i64_sdiv)
diff --git a/runtime/arm/i64_shl.s b/runtime/arm/i64_shl.S
index afd92db..66569d3 100644
--- a/runtime/arm/i64_shl.s
+++ b/runtime/arm/i64_shl.S
@@ -34,7 +34,7 @@
@ Helper functions for 64-bit integer arithmetic. ARM version.
- .text
+#include "sysdeps.h"
@@@ Shift left
@@ -47,26 +47,25 @@
@ If N = 0:
@ RH = XH | 0 | 0
@ RL = XL
-@ If 1 <= N <= 31: 1 <= 32-N <= 31 and 255 <= N-32 mod 256 <= 255
+@ If 1 <= N <= 31: 1 <= 32-N <= 31 and 2s5 <= N-32 mod 256 <= 255
@ RH = (XH << N) | (XL >> (32-N) | 0
@ RL = XL << N
@ If N = 32:
@ RH = 0 | XL | 0
@ RL = 0
-@ If 33 <= N <= 63: 255 <= 32-N mod 256 <= 255 and 1 <= N-32 <= 31
+@ If 33 <= N <= 63: 225 <= 32-N mod 256 <= 255 and 1 <= N-32 <= 31
@ RH = 0 | 0 | (XL << (N-32))
@ RL = 0
- .global __i64_shl
-__i64_shl:
- and r2, r2, #63 @ normalize amount to 0...63
- rsb r3, r2, #32 @ r3 = 32 - amount
- mov r1, r1, lsl r2
- orr r1, r1, r0, lsr r3
- sub r3, r2, #32 @ r3 = amount - 32
- orr r1, r1, r0, lsl r3
- mov r0, r0, lsl r2
+FUNCTION(__i64_shl)
+ AND r2, r2, #63 @ normalize amount to 0...63
+ RSB r3, r2, #32 @ r3 = 32 - amount
+ LSL r1, r1, r2
+ LSR r3, r0, r3
+ ORR r1, r1, r3
+ SUB r3, r2, #32 @ r3 = amount - 32
+ LSL r3, r0, r3
+ ORR r1, r1, r3
+ LSL r0, r0, r2
bx lr
- .type __i64_shl, %function
- .size __i64_shl, . - __i64_shl
-
+ENDFUNCTION(__i64_shl)
diff --git a/runtime/arm/i64_shr.s b/runtime/arm/i64_shr.S
index 9d60441..a5418f4 100644
--- a/runtime/arm/i64_shr.s
+++ b/runtime/arm/i64_shr.S
@@ -34,7 +34,7 @@
@ Helper functions for 64-bit integer arithmetic. ARM version.
- .text
+#include "sysdeps.h"
@@@ Shift right unsigned
@@ -57,16 +57,15 @@
@ RL = 0 | 0 | (XH >> (N-32))
@ RH = 0
- .global __i64_shr
-__i64_shr:
- and r2, r2, #63 @ normalize amount to 0...63
- rsb r3, r2, #32 @ r3 = 32 - amount
- mov r0, r0, lsr r2
- orr r0, r0, r1, lsl r3
- sub r3, r2, #32 @ r3 = amount - 32
- orr r0, r0, r1, lsr r3
- mov r1, r1, lsr r2
+FUNCTION(__i64_shr)
+ AND r2, r2, #63 @ normalize amount to 0...63
+ RSB r3, r2, #32 @ r3 = 32 - amount
+ LSR r0, r0, r2
+ LSL r3, r1, r3
+ ORR r0, r0, r3
+ SUB r3, r2, #32 @ r3 = amount - 32
+ LSR r3, r1, r3
+ ORR r0, r0, r3
+ LSR r1, r1, r2
bx lr
- .type __i64_shr, %function
- .size __i64_shr, . - __i64_shr
-
+ENDFUNCTION(__i64_shr)
diff --git a/runtime/arm/i64_sdiv.s b/runtime/arm/i64_smod.S
index e96008d..b109ecc 100644
--- a/runtime/arm/i64_sdiv.s
+++ b/runtime/arm/i64_smod.S
@@ -34,33 +34,28 @@
@ Helper functions for 64-bit integer arithmetic. ARM version.
- .text
+#include "sysdeps.h"
-@@@ Signed division
+@@@ Signed modulus
- .global __i64_sdiv
-__i64_sdiv:
+FUNCTION(__i64_smod)
push {r4, r5, r6, r7, r8, r10, lr}
- eor r10, r1, r3 @ r10 = sign of result
- mov r4, r1, asr #31 @ take absolute value of N
- eor r0, r0, r4 @ N = (N ^ (N >>s 31)) - (N >>s 31)
- eor r1, r1, r4
+ ASR r4, r1, #31 @ r4 = sign of N
+ ASR r5, r3, #31 @ r5 = sign of D
+ MOV r10, r4 @ r10 = sign of result
+ EOR r0, r0, r4 @ take absolute value of N
+ EOR r1, r1, r4 @ N = (N ^ (N >>s 31)) - (N >>s 31)
subs r0, r0, r4
sbc r1, r1, r4
- mov r4, r3, asr #31 @ take absolute value of D
- eor r2, r2, r4
- eor r3, r3, r4
- subs r2, r2, r4
- sbc r3, r3, r4
+ EOR r2, r2, r5 @ take absolute value of D
+ EOR r3, r3, r5
+ subs r2, r2, r5
+ sbc r3, r3, r5
bl __i64_udivmod @ do unsigned division
- mov r0, r4
- mov r1, r5
- eor r0, r0, r10, asr#31 @ apply expected sign
- eor r1, r1, r10, asr#31
- subs r0, r0, r10, asr#31
- sbc r1, r1, r10, asr#31
+ EOR r0, r0, r10 @ apply expected sign
+ EOR r1, r1, r10
+ subs r0, r0, r10
+ sbc r1, r1, r10
pop {r4, r5, r6, r7, r8, r10, lr}
bx lr
- .type __i64_sdiv, %function
- .size __i64_sdiv, . - __i64_sdiv
-
+ENDFUNCTION(__i64_smod)
diff --git a/runtime/arm/i64_stod.S b/runtime/arm/i64_stod.S
index 7e5a06f..81e43e2 100644
--- a/runtime/arm/i64_stod.S
+++ b/runtime/arm/i64_stod.S
@@ -34,24 +34,23 @@
@ Helper functions for 64-bit integer arithmetic. ARM version.
- .text
+#include "sysdeps.h"
@@@ Conversion from signed 64-bit integer to double float
- .global __i64_stod
+FUNCTION(__i64_stod)
__i64_stod:
- fmsr s0, r0
- fuitod d0, s0 @ convert low half to double (unsigned)
- fmsr s2, r1
- fsitod d1, s2 @ convert high half to double (signed)
- fldd d2, .LC1 @ d2 = 2^32
- fmacd d0, d1, d2 @ d0 = d0 + d1 * d2 = double value of int64
+ vmov s0, r0
+ vcvt.f64.u32 d0, s0 @ convert low half to double (unsigned)
+ vmov s2, r1
+ vcvt.f64.s32 d1, s2 @ convert high half to double (signed)
+ vldr d2, .LC1 @ d2 = 2^32
+ vmla.f64 d0, d1, d2 @ d0 = d0 + d1 * d2 = double value of int64
#ifdef VARIANT_eabi
- fmrrd r0, r1, d0 @ return result in r0, r1
+ vmov r0, r1, d0 @ return result in r0, r1
#endif
bx lr
- .type __i64_stod, %function
- .size __i64_stod, . - __i64_stod
+ENDFUNCTION(__i64_stod)
.balign 8
.LC1: .quad 0x41f0000000000000 @ 2^32 in double precision
diff --git a/runtime/arm/i64_stof.S b/runtime/arm/i64_stof.S
index 3f33f04..f1051f5 100644
--- a/runtime/arm/i64_stof.S
+++ b/runtime/arm/i64_stof.S
@@ -34,15 +34,14 @@
@ Helper functions for 64-bit integer arithmetic. ARM version.
- .text
+#include "sysdeps.h"
@@@ Conversion from signed 64-bit integer to single float
- .global __i64_stof
-__i64_stof:
+FUNCTION(__i64_stof)
@ Check whether -2^53 <= X < 2^53
- mov r2, r1, asr #21
- mov r3, r1, asr #31 @ (r2,r3) = X >> 53
+ ASR r2, r1, #21
+ ASR r3, r1, #31 @ (r2,r3) = X >> 53
adds r2, r2, #1
adc r3, r3, #0 @ (r2,r3) = X >> 53 + 1
cmp r3, #2
@@ -50,30 +49,29 @@ __i64_stof:
@ X is large enough that double rounding can occur.
@ Avoid it by nudging X away from the points where double rounding
@ occurs (the "round to odd" technique)
- mov r2, #0x700
- orr r2, r2, #0xFF @ r2 = 0x7FF
- and r3, r0, r2 @ extract bits 0 to 11 of X
- add r3, r3, r2 @ r3 = (X & 0x7FF) + 0x7FF
+ MOV r2, #0x700
+ ORR r2, r2, #0xFF @ r2 = 0x7FF
+ AND r3, r0, r2 @ extract bits 0 to 11 of X
+ ADD r3, r3, r2 @ r3 = (X & 0x7FF) + 0x7FF
@ bit 12 of r3 is 0 if all low 12 bits of X are 0, 1 otherwise
@ bits 13-31 of r3 are 0
- orr r0, r0, r3 @ correct bit number 12 of X
- bic r0, r0, r2 @ set to 0 bits 0 to 11 of X
+ ORR r0, r0, r3 @ correct bit number 12 of X
+ BIC r0, r0, r2 @ set to 0 bits 0 to 11 of X
@ Convert to double
-1: fmsr s0, r0
- fuitod d0, s0 @ convert low half to double (unsigned)
- fmsr s2, r1
- fsitod d1, s2 @ convert high half to double (signed)
- fldd d2, .LC1 @ d2 = 2^32
- fmacd d0, d1, d2 @ d0 = d0 + d1 * d2 = double value of int64
+1: vmov s0, r0
+ vcvt.f64.u32 d0, s0 @ convert low half to double (unsigned)
+ vmov s2, r1
+ vcvt.f64.s32 d1, s2 @ convert high half to double (signed)
+ vldr d2, .LC1 @ d2 = 2^32
+ vmla.f64 d0, d1, d2 @ d0 = d0 + d1 * d2 = double value of int64
@ Round to single
- fcvtsd s0, d0
+ vcvt.f32.f64 s0, d0
#ifdef VARIANT_eabi
@ Return result in r0
- fmrs r0, s0
+ vmov r0, s0
#endif
bx lr
- .type __i64_stof, %function
- .size __i64_stof, . - __i64_stof
-
+ENDFUNCTION(__i64_stof)
+
.balign 8
.LC1: .quad 0x41f0000000000000 @ 2^32 in double precision
diff --git a/runtime/arm/i64_udiv.s b/runtime/arm/i64_udiv.S
index de41e00..3b59944 100644
--- a/runtime/arm/i64_udiv.s
+++ b/runtime/arm/i64_udiv.S
@@ -34,18 +34,15 @@
@ Helper functions for 64-bit integer arithmetic. ARM version.
- .text
+#include "sysdeps.h"
@@@ Unsigned division
- .global __i64_udiv
-__i64_udiv:
+FUNCTION(__i64_udiv)
push {r4, r5, r6, r7, r8, lr}
bl __i64_udivmod
- mov r0, r4
- mov r1, r5
+ MOV r0, r4
+ MOV r1, r5
pop {r4, r5, r6, r7, r8, lr}
bx lr
- .type __i64_udiv, %function
- .size __i64_udiv, . - __i64_udiv
-
+ENDFUNCTION(__i64_udiv)
diff --git a/runtime/arm/i64_udivmod.s b/runtime/arm/i64_udivmod.S
index e3d9c87..e5373ad 100644
--- a/runtime/arm/i64_udivmod.s
+++ b/runtime/arm/i64_udivmod.S
@@ -34,7 +34,7 @@
@ Helper functions for 64-bit integer arithmetic. ARM version.
- .text
+#include "sysdeps.h"
@@@ Auxiliary function for division and modulus. Don't call from C
@@ -42,14 +42,14 @@
@ On exit: Q = (r4, r5) quotient R = (r0, r1) remainder
@ Locals: M = (r6, r7) mask TMP = r8 temporary
- .global __i64_udivmod
-__i64_udivmod:
+FUNCTION(__i64_udivmod)
orrs r8, r2, r3 @ is D == 0?
+ it eq
bxeq lr @ if so, return with unspecified results
- mov r4, #0 @ Q = 0
- mov r5, #0
- mov r6, #1 @ M = 1
- mov r7, #0
+ MOV r4, #0 @ Q = 0
+ MOV r5, #0
+ MOV r6, #1 @ M = 1
+ MOV r7, #0
1: cmp r3, #0 @ while ((signed) D >= 0) ...
blt 2f
subs r8, r0, r2 @ ... and N >= D ...
@@ -69,12 +69,11 @@ __i64_udivmod:
adc r1, r1, r3 @ N = N + D
bic r4, r4, r6 @ Q = Q & ~M
bic r5, r5, r7
-3: movs r7, r7, lsr #1 @ M = M >> 1
- mov r6, r6, rrx
- movs r3, r3, lsr #1 @ D = D >> 1
- mov r2, r2, rrx
+3: lsrs r7, r7, #1 @ M = M >> 1
+ rrx r6, r6
+ lsrs r3, r3, #1 @ D = D >> 1
+ rrx r2, r2
orrs r8, r6, r7 @ repeat while (M != 0) ...
bne 2b
bx lr
- .type __i64_udivmod, %function
- .size __i64_udivmod, . - __i64_udivmod
+ENDFUNCTION(__i64_udivmod)
diff --git a/runtime/arm/i64_umod.s b/runtime/arm/i64_umod.S
index 80af56e..e59fd20 100644
--- a/runtime/arm/i64_umod.s
+++ b/runtime/arm/i64_umod.S
@@ -34,16 +34,13 @@
@ Helper functions for 64-bit integer arithmetic. ARM version.
- .text
+#include "sysdeps.h"
-@@@ Unsigned modulus
+@@@ Unsigned remainder
- .global __i64_umod
-__i64_umod:
+FUNCTION(__i64_umod)
push {r4, r5, r6, r7, r8, lr}
bl __i64_udivmod @ remainder is already in r0,r1
pop {r4, r5, r6, r7, r8, lr}
bx lr
- .type __i64_umod, %function
- .size __i64_umod, . - __i64_umod
-
+ENDFUNCTION(__i64_umod)
diff --git a/runtime/arm/i64_utod.S b/runtime/arm/i64_utod.S
index 1110874..b12d7c2 100644
--- a/runtime/arm/i64_utod.S
+++ b/runtime/arm/i64_utod.S
@@ -34,25 +34,23 @@
@ Helper functions for 64-bit integer arithmetic. ARM version.
- .text
+#include "sysdeps.h"
@@@ Conversion from unsigned 64-bit integer to double float
- .global __i64_utod
-__i64_utod:
- fmsr s0, r0
- fuitod d0, s0 @ convert low half to double (unsigned)
- fmsr s2, r1
- fuitod d1, s2 @ convert high half to double (unsigned)
- fldd d2, .LC1 @ d2 = 2^32
- fmacd d0, d1, d2 @ d0 = d0 + d1 * d2 = double value of int64
+FUNCTION(__i64_utod)
+__i64_stod:
+ vmov s0, r0
+ vcvt.f64.u32 d0, s0 @ convert low half to double (unsigned)
+ vmov s2, r1
+ vcvt.f64.u32 d1, s2 @ convert high half to double (unsigned)
+ vldr d2, .LC1 @ d2 = 2^32
+ vmla.f64 d0, d1, d2 @ d0 = d0 + d1 * d2 = double value of int64
#ifdef VARIANT_eabi
- fmrrd r0, r1, d0 @ return result in r0, r1
-#endif
+ vmov r0, r1, d0 @ return result in r0, r1
+#endif
bx lr
- .type __i64_utod, %function
- .size __i64_utod, . - __i64_utod
-
+ENDFUNCTION(__i64_utod)
+
.balign 8
.LC1: .quad 0x41f0000000000000 @ 2^32 in double precision
-
diff --git a/runtime/arm/i64_utof.S b/runtime/arm/i64_utof.S
index a959076..711cda0 100644
--- a/runtime/arm/i64_utof.S
+++ b/runtime/arm/i64_utof.S
@@ -34,42 +34,40 @@
@ Helper functions for 64-bit integer arithmetic. ARM version.
- .text
+#include "sysdeps.h"
@@@ Conversion from unsigned 64-bit integer to single float
- .global __i64_utof
-__i64_utof:
+FUNCTION(__i64_utof)
@ Check whether X < 2^53
- movs r2, r1, lsr #21 @ test if X >> 53 == 0
+ lsrs r2, r1, #21 @ test if X >> 53 == 0
beq 1f
@ X is large enough that double rounding can occur.
@ Avoid it by nudging X away from the points where double rounding
@ occurs (the "round to odd" technique)
- mov r2, #0x700
- orr r2, r2, #0xFF @ r2 = 0x7FF
- and r3, r0, r2 @ extract bits 0 to 11 of X
- add r3, r3, r2 @ r3 = (X & 0x7FF) + 0x7FF
+ MOV r2, #0x700
+ ORR r2, r2, #0xFF @ r2 = 0x7FF
+ AND r3, r0, r2 @ extract bits 0 to 11 of X
+ ADD r3, r3, r2 @ r3 = (X & 0x7FF) + 0x7FF
@ bit 12 of r3 is 0 if all low 12 bits of X are 0, 1 otherwise
@ bits 13-31 of r3 are 0
- orr r0, r0, r3 @ correct bit number 12 of X
- bic r0, r0, r2 @ set to 0 bits 0 to 11 of X
+ ORR r0, r0, r3 @ correct bit number 12 of X
+ BIC r0, r0, r2 @ set to 0 bits 0 to 11 of X
@ Convert to double
-1: fmsr s0, r0
- fuitod d0, s0 @ convert low half to double (unsigned)
- fmsr s2, r1
- fuitod d1, s2 @ convert high half to double (unsigned)
- fldd d2, .LC1 @ d2 = 2^32
- fmacd d0, d1, d2 @ d0 = d0 + d1 * d2 = double value of int64
+1: vmov s0, r0
+ vcvt.f64.u32 d0, s0 @ convert low half to double (unsigned)
+ vmov s2, r1
+ vcvt.f64.u32 d1, s2 @ convert high half to double (unsigned)
+ vldr d2, .LC1 @ d2 = 2^32
+ vmla.f64 d0, d1, d2 @ d0 = d0 + d1 * d2 = double value of int64
@ Round to single
- fcvtsd s0, d0
+ vcvt.f32.f64 s0, d0
#ifdef VARIANT_eabi
@ Return result in r0
- fmrs r0, s0
+ vmov r0, s0
#endif
bx lr
- .type __i64_utof, %function
- .size __i64_utof, . - __i64_utof
+ENDFUNCTION(__i64_utof)
.balign 8
.LC1: .quad 0x41f0000000000000 @ 2^32 in double precision
diff --git a/runtime/arm/sysdeps.h b/runtime/arm/sysdeps.h
new file mode 100644
index 0000000..b72af0f
--- /dev/null
+++ b/runtime/arm/sysdeps.h
@@ -0,0 +1,84 @@
+// *****************************************************************
+//
+// The Compcert verified compiler
+//
+// Xavier Leroy, INRIA Paris-Rocquencourt
+//
+// Copyright (c) 2013 Institut National de Recherche en Informatique et
+// en Automatique.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+// * Neither the name of the <organization> nor the
+// names of its contributors may be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT
+// HOLDER> BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// *********************************************************************
+
+// System dependencies
+
+#ifdef THUMB
+#define FUNCTION(f) \
+ .text; \
+ .thumb; \
+ .thumb_func; \
+ .global f; \
+f:
+#else
+#define FUNCTION(f) \
+ .text; \
+ .arm; \
+ .global f; \
+f:
+#endif
+
+#define ENDFUNCTION(f) \
+ .type f, %function; .size f, . - f
+
+// In Thumb2 mode, some instructions have shorter encodings in their "S" form
+// (update condition flags). For cases where the condition flags do not
+// matter, we use the following macros for these instructions.
+// In classic ARM mode, we prefer not to use the "S" form unless necessary.
+
+#ifdef THUMB
+#define THUMB_S(x) x##s
+#else
+#define THUMB_S(x) x
+#endif
+
+#define ADD THUMB_S(add)
+#define AND THUMB_S(and)
+#define ASR THUMB_S(asr)
+#define BIC THUMB_S(bic)
+#define EOR THUMB_S(eor)
+#define LSL THUMB_S(lsl)
+#define LSR THUMB_S(lsr)
+#define MOV THUMB_S(mov)
+#define ORR THUMB_S(orr)
+#define RSB THUMB_S(rsb)
+#define SUB THUMB_S(sub)
+
+ .syntax unified
+#ifdef THUMB
+ .arch armv7
+#else
+ .arch armv6
+#endif
+ .fpu vfpv2
diff --git a/runtime/arm/vararg.S b/runtime/arm/vararg.S
index 8d431d3..e352582 100644
--- a/runtime/arm/vararg.S
+++ b/runtime/arm/vararg.S
@@ -34,50 +34,44 @@
@ Helper functions for variadic functions <stdarg.h>. ARM version
- .text
+#include "sysdeps.h"
@ typedef void * va_list;
@ unsigned int __compcert_va_int32(va_list * ap);
@ unsigned long long __compcert_va_int64(va_list * ap);
@ double __compcert_va_float64(va_list * ap);
- .global __compcert_va_int32
-__compcert_va_int32:
+FUNCTION(__compcert_va_int32)
@ r0 = ap parameter
ldr r1, [r0, #0] @ r1 = pointer to next argument
- add r1, r1, #4 @ advance ap by 4
+ ADD r1, r1, #4 @ advance ap by 4
str r1, [r0, #0]
ldr r0, [r1, #-4] @ load next argument and return it in r0
bx lr
- .type __compcert_va_int32, %function
- .size __compcert_va_int32, . - __compcert_va_int32
+ENDFUNCTION(__compcert_va_int32)
- .global __compcert_va_int64
-__compcert_va_int64:
+FUNCTION(__compcert_va_int64)
@ r0 = ap parameter
ldr r1, [r0, #0] @ r1 = pointer to next argument
- add r1, r1, #15 @ 8-align and advance by 8
- bic r1, r1, #7
+ ADD r1, r1, #15 @ 8-align and advance by 8
+ BIC r1, r1, #7
str r1, [r0, #0] @ update ap
ldr r0, [r1, #-8] @ load next argument and return it in r0,r1
ldr r1, [r1, #-4]
bx lr
- .type __compcert_va_int64, %function
- .size __compcert_va_int64, . - __compcert_va_int64
+ENDFUNCTION(__compcert_va_int64)
- .global __compcert_va_float64
-__compcert_va_float64:
+FUNCTION(__compcert_va_float64)
@ r0 = ap parameter
ldr r1, [r0, #0] @ r1 = pointer to next argument
- add r1, r1, #15 @ 8-align and advance by 8
- bic r1, r1, #7
+ ADD r1, r1, #15 @ 8-align and advance by 8
+ BIC r1, r1, #7
str r1, [r0, #0] @ update ap
-#ifdef VARIANT_hardfloat
- fldd d0, [r1, #-8] @ load next argument and return it in d0
-#else
+#ifdef VARIANT_eabi
ldr r0, [r1, #-8] @ load next argument and return it in r0,r1
ldr r1, [r1, #-4]
+#else
+ vldr d0, [r1, #-8] @ load next argument and return it in d0
#endif
bx lr
- .type __compcert_va_float64, %function
- .size __compcert_va_float64, . - __compcert_va_float64
+ENDFUNCTION(__compcert_va_float64)
diff --git a/runtime/ia32/sysdeps.h b/runtime/ia32/sysdeps.h
index 1aa54d3..9d957a8 100644
--- a/runtime/ia32/sysdeps.h
+++ b/runtime/ia32/sysdeps.h
@@ -1,38 +1,38 @@
-# *****************************************************************
-#
-# The Compcert verified compiler
-#
-# Xavier Leroy, INRIA Paris-Rocquencourt
-#
-# Copyright (c) 2013 Institut National de Recherche en Informatique et
-# en Automatique.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# * Neither the name of the <organization> nor the
-# names of its contributors may be used to endorse or promote products
-# derived from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT
-# HOLDER> BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-# *********************************************************************
+// *****************************************************************
+//
+// The Compcert verified compiler
+//
+// Xavier Leroy, INRIA Paris-Rocquencourt
+//
+// Copyright (c) 2013 Institut National de Recherche en Informatique et
+// en Automatique.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+// * Neither the name of the <organization> nor the
+// names of its contributors may be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT
+// HOLDER> BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// *********************************************************************
-# System dependencies
+// System dependencies
#if defined(SYS_linux) || defined(SYS_bsd)