aboutsummaryrefslogtreecommitdiffhomepage
path: root/lib/xmlunicode.el
blob: cc19d9cf561bb2adb4d916dbc320709ff01a5303 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
;;; xmlunicode.el --- Unicode support for XML -*- coding: utf-8 -*-

;; $Id$

;; Copyright (C) 2003 Norman Walsh
;; Inspired in part by sgml-input, Copyright (C) 2001 Dave Love
;; Inspired in part by http://www.tbray.org/ongoing/When/200x/2003/09/27/UniEmacs

;; Author: Norman Walsh <ndw@nwalsh.com>
;; Maintainer: Norman Walsh <ndw@nwalsh.com>
;; Created: 2004-07-21
;; Version: 1.6
;; CVS ID: $Id$
;; Keywords: utf-8 unicode xml characters

;; This file is NOT part of GNU emacs.

;; This is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2, or (at your option)
;; any later version.

;; This software is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with GNU Emacs; see the file COPYING.  If not, write to the
;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
;; Boston, MA 02111-1307, USA.

;;; Commentary

;; This file provides a suite of functions designed to make it easier
;; to enter Unicode into Emacs. It is not, in fact, particularly XML-specific though
;; it does define an 'xml input-mode and does support the ISO 8879 entity names.

;;; Usage

;; 1. Before loading this file, make sure that the variable unicode-character-list is
;;    defined. The unicode-character-list is a list of triples of the form:
;;
;;    (codepoint "unicode name" "iso name") ; iso name can be nil
;;
;;    e.g.:   (defvar unicode-character-list
;;             '(
;;               ;Codept   Unicode name                            ISO Name
;;               (#x000000 "NULL"                                   nil     )
;;               (#x000001 "START OF HEADING"                       nil     )
;;               ...
;;               (#x0000a0 "NO-BREAK SPACE"                         "nbsp"  )
;;               (#x0000a1 "INVERTED EXCLAMATION MARK"              "iexcl" )
;;               (#x0000a2 "CENT SIGN"                              "cent"  )
;;               ...))
;;
;;
;;    The easiest way to define this list is to load "unichars.el"
;;    which should be available where you got this file.
;;
;; 2. Bind the functions defined in this file to keys you find convenient.
;;
;;    The likely candidates are:
;;
;;    unicode-character-insert            insert a character by unicode name
;;                                        (with completion)
;;    iso8879-character-insert            insert a character by ISO entity name
;;                                        (with completion)
;;    unicode-smart-double-quote          inserts an appropriate double quote
;;    unicode-smart-single-quote          inserts an appropriate single quote
;;    unicode-character-menu-insert       choose special character from a popup menu
;;    unicode-character-shortcut-insert   enter a two-character shortcut for a
;;                                        unicode character
;;
;;    You can also create a standard Emacs menu for the character menu list
;;    (instead of, or in addition to, the popup). To do that:
;;
;;    (define-key APPROPRIATE-MAP [menu-bar unichar]
;;      (cons "UniChar" unicode-character-menu-map))
;;
;;    Where APPROPRIATE-MAP is the name of the emacs keymap to bind into
;;
;; 3. If you want to use the xml input-mode, which provides automatic replacement for the
;;    ISO entity names:
;;
;;    (set-input-method 'xml)
;;
;;    in the appropriate context. Unlike sgml-input, xml-input only inserts the
;;    characters for which you have glyphs. It inserts other characters as numeric
;;    character references. (If you want to insert a literal character even if
;;    you don't have it in your fonts, use unicode-character-insert or
;;    iso8879-character-insert with a prefix.)

;;; Changes

;; v1.7
;;   Require "cl" because, well, because it's required. Also fiddled with
;;   the way single quotes are handled; the apostrophe is now part of the
;;   cycle
;; v1.6
;;   Remove debugging code. Embarrassed again. :-(
;; v1.5
;;   Fixed bug in unicode-smart-single-quote. It wasn't cycling through all
;;   three quotes correctly because of a typo in the function definition.
;;   Make sure smart semicolon insertion only happens if we're right at the
;;   end of a numeric character reference.
;; v1.4
;;   Fixed bug in insert-smart-semicolon. It wasn't careful to tie the search
;;   to the most recent preceding ampersand.
;; v1.3
;;   Fixed bug in (in-comment)
;;   Added unicode-smart-semicolon as another convenience for entering Unicode chars
;;   Added show-unicode-character-list
;; v1.2
;;   Added unicode-smart-hyphen for easy insert of mdash and ndash
;;   Added unicode-smart-period for easy insert of hellip
;;   Fixed a bug in unicode-smart-single-quote
;; v1.1
;;   Fixed a few bugs with respect to how numeric character references are entered.
;;   Added xml-tag-search-limit and unicode-charref-format
;; v1.0
;;   First release. Nearly a complete rewrite from the former xmlchars.el file

;;; Code:

(require 'cl)

(defvar unicode-ldquo  (decode-char 'ucs #x00201c))
(defvar unicode-rdquo  (decode-char 'ucs #x00201d))
(defvar unicode-lsquo  (decode-char 'ucs #x002018))
(defvar unicode-rsquo  (decode-char 'ucs #x002019))
(defvar unicode-quot   (decode-char 'ucs #x000022))
(defvar unicode-apos   (decode-char 'ucs #x000027))
(defvar unicode-capos  (decode-char 'ucs #x0002bc))
(defvar unicode-ndash  (decode-char 'ucs #x002013))
(defvar unicode-mdash  (decode-char 'ucs #x002014))
(defvar unicode-hellip (decode-char 'ucs #x002026))

(defvar unicode-charref-format "&#x%x;"
  "The format for numeric character references")

(defvar xml-tag-search-limit 4096
  "Maximum distance to search from point for tag start characters")

(defvar unicode-character-list-file "/define/this/before/you/load/me"
  "The name of the file that contains your unicode-character-list. unichars.el should be available where you got this file.")

(if (not (boundp 'unicode-character-list))
    (load-file unicode-character-list-file))

(defvar unicode-character-alist '()
  "Mapping of Unicode character names to codepoints.")

(let ((ulist unicode-character-list))
  (setq unicode-character-alist
	(list (cons (cadr (car ulist)) (car (car ulist)))))
  (setq ulist (cdr ulist))
  (while ulist
    (nconc unicode-character-alist
	   (list (cons (cadr (car ulist)) (car (car ulist)))))
    (setq ulist (cdr ulist))))

(defvar iso8879-character-alist '()
  "Mapping of ISO 8879 entity names names to codepoints.")

(let ((ulist unicode-character-list))
  (while (and ulist (not (caddr (car ulist))))
    (setq ulist (cdr ulist)))
  (setq iso8879-character-alist
	(list (cons (caddr (car ulist)) (car (car ulist)))))
  (setq ulist (cdr ulist))
  (while ulist
    (if (caddr (car ulist))
	(nconc iso8879-character-alist
	       (list (cons (caddr (car ulist)) (car (car ulist))))))
    (setq ulist (cdr ulist))))

(defun iso8879-to-codepoints (&optional isolist)
  "Converts a list of ISO 8879 entity names to a list of codepoints. This is a convenience function for defining the glyph list."
  (let (codepoint-list)
    (setq codepoint-list (list 0))
    (while isolist
      (nconc codepoint-list
	     (list (cdr (assoc (car isolist) iso8879-character-alist))))
      (setq isolist (cdr isolist)))
    (cdr codepoint-list)))

(defun unicode-to-codepoints (&optional unilist)
  "Converts a list of Unicode character names to a list of codepoints. This is a convenience function for defining the glyph list."
  (let (codepoint-list)
    (setq codepoint-list (list 0))
    (while unilist
      (nconc codepoint-list
	     (list (cdr (assoc (car isolist) unicode-character-alist))))
      (setq unilist (cdr unilist)))
    (cdr codepoint-list)))

(defvar unicode-glyph-list
  (append
   '(?A ?B ?C ?D ?E ?F ?G ?H ?I ?J ?K ?L ?M
     ?N ?O ?P ?Q ?R ?S ?T ?U ?V ?W ?X ?Y ?Z
     ?a ?b ?c ?d ?e ?f ?g ?h ?i ?j ?k ?l ?m
     ?n ?o ?p ?q ?r ?s ?t ?u ?v ?w ?x ?y ?z
     ?0 ?1 ?2 ?3 ?4 ?5 ?6 ?7 ?8 ?9 ?! ?@ ?#
     ?$ ?% ?^ ?* ?( ?) ?- ?_ ?= ?+ ?\ ?|
     ?[ ?] ?{ ?} 59 ?: ?/ ?? ?. 44 96 126)
   (iso8879-to-codepoints
    '("AElig"  "Aacute" "Abreve" "Acirc"  "Agrave"  "Amacr"  "Aogon"
      "Aring"  "Atilde" "Auml"   "Cacute" "Ccaron"  "Ccedil" "Ccirc"
      "Cdot"   "Dagger" "Dcaron" "Dot"    "Dstrok"  "ENG"    "ETH"
      "Eacute" "Ecaron" "Ecirc"  "Edot"   "Egrave"  "Emacr"  "Eogon"
      "Euml"   "Gbreve" "Gcedil" "Gcirc"  "Gdot"    "Hcirc"
      "Hstrok" "IJlig"  "Iacute" "Icirc"  "Idot"    "Igrave"
      "Imacr"  "Iogon"  "Itilde" "Iuml"   "Jcirc"   "Kcedil"
      "Lacute" "Lcaron" "Lcedil" "Lmidot" "Lstrok"  "Nacute"
      "Ncaron" "Ncedil" "Ntilde" "OElig"  "Oacute"  "Ocirc"
      "Odblac" "Ograve" "Omacr"  "Oslash" "Otilde"  "Ouml"
      "Racute" "Rcaron" "Rcedil" "Sacute" "Scaron"  "Scedil"
      "Scirc"  "THORN"  "Tcaron" "Tcedil" "Tstrok"  "Uacute"
      "Ubreve" "Ucirc"  "Udblac" "Ugrave" "Umacr"   "Uogon"
      "Uring"  "Utilde" "Uuml"   "Wcirc"  "Yacute"  "Ycirc"
      "Yuml"   "Zacute" "Zcaron" "Zdot"   "aacute"  "abreve"
      "acirc"  "acute"  "aelig"  "agrave" "amacr"   "angst"
      "aogon"           "aring"  "ast"    "atilde"  "auml"
      "b.mu"   "bdquo"  "blank"  "blk12"  "blk14"   "blk34"
      "block"  "boxDL"  "boxDR"  "boxH"   "boxHD"   "boxHU"
      "boxUL"  "boxUR"  "boxV"   "boxVH"  "boxVL"   "boxVR"
      "boxVh"  "boxdl"  "boxdr"  "boxh"   "boxhd"   "boxhu"
      "boxul"  "boxur"  "boxv"   "boxvH"  "boxvh"   "boxvl"
      "boxvr"  "breve"  "brvbar" "bsol"   "bull"    "cacute"
      "caron"  "ccaron" "ccedil" "ccirc"  "cdot"    "cedil"
      "cent"   "circ"   "colon"  "comma"  "commat"  "copy"
      "curren" "dagger" "dash"   "dblac"  "dcaron"  "deg"
      "die"    "divide" "dollar" "dot"    "dstrok"  "eacute"
      "ecaron" "ecirc"  "edot"   "egrave" "emacr"   "emsp"
      "emsp13" "emsp14" "eng"    "ensp"   "eogon"   "equals"
      "equiv"  "eth"    "euml"   "excl"   "exist"   "fnof"
      "forall" "frac12" "frac14" "frac34" "frasl"   "gacute"
      "gbreve" "gcedil" "gcirc"  "gdot"   "ge"      "ges"
      "grave"           "hairsp" "half"   "hcirc"   "hellip"
      "horbar" "hstrok" "hyphen" "iacute" "icirc"   "iexcl"
      "igrave" "ijlig"  "imacr"  "inodot" "inodot"  "iogon"
      "iquest" "itilde" "iuml"   "jcirc"  "kcedil"  "kgreen"
      "lacute" "laquo"  "lcaron" "lcedil" "lcub"    "ldquo"
      "ldquor" "le"     "les"    "lhblk"  "lmidot"  "lowbar"
      "lpar"   "lsaquo" "lsqb"   "lsquo"  "lsquor"  "lstrok"
      "macr"   "mdash"  "mgr"    "micro"  "middot"  "minus"
      "mldr"   "mu"     "nacute" "napos"  "nbsp"    "ncaron"
      "ncedil" "ndash"  "ne"     "nequiv" "nexist"  "nge"
      "nges"   "ngt"    "nle"    "nles"   "nlt"     "not"
      "ntilde" "num"    "numsp"  "oacute" "ocirc"   "odblac"
      "oelig"  "ogon"   "ograve" "omacr"  "ordf"    "ordm"
      "oslash" "otilde" "ouml"   "para"   "percnt"  "period"
      "permil" "plus"   "plusmn" "pound"  "puncsp"  "quest"
               "racute" "raquo"  "rcaron" "rcedil"  "rcub"
      "rdquo"  "rdquor" "reg"    "ring"   "rpar"    "rsaquo"
      "rsqb"   "rsquo"  "rsquor" "sacute" "sbquo"   "sbsol"
      "scaron" "scedil" "scirc"  "sect"   "semi"    "shy"
      "sol"    "sup1"   "sup2"   "sup3"   "szlig"   "tcaron"
      "tcedil" "thinsp" "thorn"  "tilde"  "times"   "trade"
      "tstrok" "uacute" "ubreve" "ucirc"  "udblac"  "ugrave"
      "uhblk"  "umacr"  "uml"    "uogon"  "uring"   "utilde"
      "uuml"   "verbar" "wcirc"  "wedgeq" "yacute"  "ycirc"
      "yen"    "yuml"   "zacute" "zcaron" "zdot")))
  "A list of Unicode codepoints identifying the characters that display correctly in your Emacs with your fonts.")

;; Insert characters by Unicode name (with completion)

(defun unicode-character-insert (arg &optional argname)
  "Insert a Unicode character by character name. If a prefix is given, the character will be inserted regardless of whether or not it has a displayable glyph; otherwise, a numeric character reference is inserted if the codepoint is not in the unicode-glyph-list. If argname is given, it is used for the prompt. If argname uniquely identifies a character, that character is inserted without the prompt."
  (interactive "P")
  (let* ((completion-ignore-case t)
	 (uniname (if (stringp argname) argname ""))
	 (charname
	  (if (eq (try-completion uniname unicode-character-alist) t)
	      uniname
	    (completing-read
	     "Unicode name: "
	     unicode-character-alist
	     nil t uniname)))
	 codepoint glyph)
    (setq codepoint (cdr (assoc charname unicode-character-alist)))
    (xml-unicode-insert arg codepoint)))

;; Insert characters by iso8879 name

(defun iso8879-character-insert (arg &optional argname)
  "Insert a Unicode character by ISO 8879 entity name. If a prefix is given, the character will be inserted regardless of whether or not it has a displayable glyph; otherwise, a numeric character reference is inserted if the codepoint is not in the unicode-glyph-list. If argname is given, it is used for the prompt. If argname uniquely identifies a character, that character is inserted without the prompt."
  (interactive "P")
  (let* ((isoname (if (stringp argname) argname ""))
	 (charname
	  (if (eq (try-completion isoname iso8879-character-alist) t)
	      isoname
	    (completing-read
	     "ISO name: "
	     iso8879-character-alist
	     nil t isoname)))
	 codepoint glyph)
    (setq codepoint (cdr (assoc charname iso8879-character-alist)))
    (xml-unicode-insert arg codepoint)))

(defun xml-unicode-insert (arg codepoint)
  "Insert the Unicode character identified by codepoint taking into account available glyphs and XML predefined entities."
  (interactive "P")
  (let ((glyph (memq codepoint unicode-glyph-list)))
    (cond
     ((and (decode-char 'ucs codepoint) (or arg glyph))
      (ucs-insert codepoint))
     ((= codepoint 34)
      (insert "&quot;"))
     ((= codepoint 38)
      (insert "&amp;"))
     ((= codepoint 39)
      (insert "&apos;"))
     ((= codepoint 60)
      (insert "&lt;"))
     ((= codepoint 62)
      (insert "&gt;"))
     (t
      (insert (format unicode-charref-format codepoint))))))

;; Menus

(defvar unicode-character-menu-alist
  '(
    ("angst"     . #x212B)
    ("cent"      . #x00A2)
    ("copy"      . #x00A9)
    ("Dagger"    . #x2021)
    ("dagger"    . #x2020)
    ("deg"       . #x00B0)
    ("emsp"      . #x2003)
    ("ensp"      . #x2002)
    ("ETH"       . #x00D0)
    ("eth"       . #x00F0)
    ("euro"      . #x20AC)
    ("half"      . #x00BD)
    ("laquo"     . #x00AB)
    ("ldquo"     . #x201c)
    ("lsquo"     . #x2018)
    ("mdash"     . #x2014)
    ("micro"     . #x00B5)
    ("middot"    . #x00B7)
    ("nbsp"      . #x00A0)
    ("ndash"     . #x2013)
    ("not"       . #x00AC)
    ("numsp"     . #x2007)
    ("para"      . #x00B6)
    ("permil"    . #x2030)
    ("puncsp"    . #x2008)
    ("raquo"     . #x00BB)
    ("rdquo"     . #x201d)
    ("rsquo"     . #x2019)
    ("reg"       . #x00AE)
    ("sect"      . #x00A7)
    ("THORN"     . #x00DE)
    ("thorn"     . #x00FE)
    ("trade"     . #x2122)
    )
  "Mapping of names to codepoints for use in the popup or Emacs menu.")

(defun  unicode-character-menu-insert ()
  "Popup a menu for inserting unicode characters."
  (interactive)
  (let* ((xml-chars-menu
	  (list "Special char" (append (list "") unicode-character-menu-alist)))
	 (value (x-popup-menu t xml-chars-menu)))
    (if value (xml-unicode-insert nil value))))

(defvar unicode-character-menu-map (make-sparse-keymap "UniChar")
  "A menu map for inserting Unicode characters.")

(defun make-unicode-character-menu-bar ()
  "Builds the unicode-character-menu-map for the currently defined unicode-character-menu-alist."
  (let ((alist (reverse unicode-character-menu-alist))
	name codepoint)
    (setq unicode-character-menu-map (make-sparse-keymap "UniChar"))
    (while alist
      (setq name (car (car alist))
	    codepoint (cdr (car alist)))
      (define-key unicode-character-menu-map (vector (intern name))
	`(,name . (lambda () (interactive) (xml-unicode-insert nil ,codepoint))))
      (setq alist (cdr alist)))))

(make-unicode-character-menu-bar)

;; Simple XML tests

(defun in-start-tag ()
  "Crude test to see if point is inside an open start tag."
  (interactive)
  (let (slim here pgt plt)
    (setq here (point))
    (setq slim
	  (if (> here xml-tag-search-limit)
	      (- here xml-tag-search-limit)
	    0))
    (setq pgt (search-backward ">" slim t))
    (goto-char here)
    (setq plt (search-backward "<" slim t))
    (goto-char here)
    (if (and pgt plt)
	(> plt pgt)
      plt)))

(defun after-start-tag ()
  "Crude test to see if point is just after a start tag"
  (interactive)
  (if (and (char-before) (char-equal (char-before) ?>))
      (let (slim here plt psl)
	(setq here (point))
	(setq slim
	      (if (> here xml-tag-search-limit)
		  (- here xml-tag-search-limit)
		0))
	(setq plt (search-backward "<" slim t))
	(goto-char here)
	(setq psl (search-backward "/" slim t))
	(goto-char here)
	(or (and plt (not psl))
	    (and plt psl (< psl plt))))))

(defun in-comment ()
  "Crude test to see if point is inside a comment."
  (interactive)
  (let (slim here pgt pcmt)
    (setq here (point))
    (setq slim
	  (if (> here xml-tag-search-limit)
	      (- here xml-tag-search-limit)
	    0))
    (setq pgt (search-backward "-->" slim t))
    (goto-char here)
    (setq pcmt (search-backward "<!" slim t))
    (goto-char here)
    (if (and pgt pcmt)
	(> pcmt pgt)
      pcmt)))

;;stolen from hen.el which in turn claims to have stolen it from cxref
(defun unicode-looking-backward-at (regexp)
  "Return t if text before point matches regular expression REGEXP.
This function modifies the match data that `match-beginning',
`match-end' and `match-data' access; save and restore the match
data if you want to preserve them."
  (save-excursion
    (let ((here (point)))
      (if (re-search-backward regexp (point-min) t)
          (if (re-search-forward regexp here t)
              (= (point) here))))))

;; Smart quotes

(defun unicode-smart-double-quote ()
  "Insert a left or right double quote as appropriate. Left quotes are inserted after a space, newline, or start tag. Right quotes are inserted after any other character, except if the preceding character is a quote, in which case we cycle through the three quote styles."
  (interactive)
  (if (char-before)
      (let ((ch (char-before)))
	(cond
	 ((in-start-tag)
	  (insert "\""))
	 ((or
	   (after-start-tag)
	   (char-equal ch 40)  ; (
	   (char-equal ch 91)  ; [
	   (char-equal ch ?{)) ; {
	  (insert unicode-ldquo))
	 ((or
	   (char-equal ch ?>)  ; >
	   (char-equal ch 41)  ; )
	   (char-equal ch 93)  ; ]
	   (char-equal ch ?})) ; }
	  (insert unicode-rdquo))
	 ((or (char-equal ch 32)
	      (char-equal ch 10))
	  (insert unicode-ldquo))
	 ((char-equal ch unicode-ldquo)
	  (progn
	    (delete-backward-char 1)
	    (insert "\"")))
	 ((char-equal ch unicode-quot)
	  (progn
	    (delete-backward-char 1)
	    (insert unicode-rdquo)))
	 ((char-equal ch unicode-rdquo)
	  (progn
	    (delete-backward-char 1)
	    (insert unicode-ldquo)))
	 ((char-equal ch unicode-ldquo)
	  (progn
	    (delete-backward-char 1)
	    (insert unicode-rdquo)))
	 ((char-equal ch unicode-lsquo)
	  (insert unicode-ldquo))
	 (t (insert unicode-rdquo))))
    (insert unicode-ldquo)))

(defun unicode-smart-single-quote ()
  "Insert a left or right single quote, or an apostrophe, as appropriate. Left quotes are inserted after a space, newline, or start tag. An apostrophe is inserted after any other character, except if the preceding character is a quote or apostrophe, in which case we cycle through the styles."
  (interactive)
  (if (char-before)
      (let ((ch (char-before)))
	(cond
	 ((in-start-tag)
	  (insert "'"))
	 ((or
	   (after-start-tag)
	   (char-equal ch 40)  ; (
	   (char-equal ch 91)  ; [
	   (char-equal ch ?{)) ; {
	  (insert unicode-lsquo))
	 ((or
	   (char-equal ch ?>)  ; >
	   (char-equal ch 41)  ; )
	   (char-equal ch 93)  ; ]
	   (char-equal ch ?})) ; }
	  (insert unicode-rsquo))
	 ((or (char-equal ch 32)
	      (char-equal ch 10))
	  (insert unicode-lsquo))
	 ((char-equal ch unicode-apos)  ; ' -> rsquo
	  (progn
	    (delete-backward-char 1)
	    (insert unicode-rsquo)))
	 ((char-equal ch unicode-rsquo) ; rsquo -> lsquo
	  (progn
	    (delete-backward-char 1)
	    (insert unicode-lsquo)))
	 ((char-equal ch unicode-lsquo) ; lsquo -> '
	  (progn
	    (delete-backward-char 1)
	    (insert unicode-apos)))
	 (t (insert unicode-apos))))
    (insert unicode-lsquo)))

(defun unicode-smart-hyphen ()
  "Insert a hyphen, mdash, or ndash as appropriate. A hyphen, an mdash, and then an ndash is inserted."
  (interactive)
  (if (char-before)
      (let ((ch (char-before)))
	(cond
	 ((in-comment)
	  (insert "-"))
	 ((char-equal ch ?-)
	  (progn
	    (delete-backward-char 1)
	    (insert unicode-mdash)))
	 ((char-equal ch unicode-mdash)
	  (progn
	    (delete-backward-char 1)
	    (insert unicode-ndash)))
	 ((char-equal ch unicode-ndash)
	  (progn
	    (delete-backward-char 1)
	    (insert "-")))
	 (t (insert "-"))))
    (insert "-")))

(defun unicode-smart-period ()
  "Insert an hellipsis for three dots."
  (interactive)
  (if (> (point) 2)
      (let ((ch1 (char-before))
	    (ch2 (char-before (- (point) 1)))
	    (ch3 (char-before (- (point) 2))))
	(cond
	 ((in-comment)
	  (insert "."))
	 ((char-equal ch1 unicode-hellip)
	  (progn
	    (delete-backward-char 1)
	    (insert "....")))
	 ((and ch3 (char-equal ch1 ?.) (char-equal ch2 ?.) (char-equal ch3 ?.))
	  (insert "."))
	 ((and (char-equal ch1 ?.) (char-equal ch2 ?.))
	  (progn
	    (delete-backward-char 2)
	    (insert unicode-hellip)))
	 (t (insert "."))))
    (insert ".")))

(defun unicode-smart-semicolon ()
  "Detect numeric character references and replace them with the appropriate char."
  (interactive)
  (let ((pos (point))
	amppos codept)
    (search-backward "&" nil t nil)
    (setq amppos (point))
    (goto-char pos)
    (cond
     ((unicode-looking-backward-at "&#[xX][0-9a-fA-F]+")
      (progn
	(re-search-backward "&#[xX]\\([0-9a-fA-F]+\\)" nil t nil)
	(if (= amppos (point))
	    (progn
	      (setq codept (string-to-number (match-string 1) 16))
	      (if (memq codept unicode-glyph-list)
		  (replace-match (format "%c" (decode-char 'ucs codept)))
		(progn
		  (goto-char pos)
		  (insert ";"))))
	  (progn
	    (goto-char pos)
	    (insert ";")))))
     ((unicode-looking-backward-at "&#[0-9]+")
      (progn
	(re-search-backward "&#\\([0-9]+\\)" nil t nil)
	(if (= amppos (point))
	    (progn
	      (setq codept (string-to-number (match-string 1) 10))
	      (if (memq codept unicode-glyph-list)
		  (replace-match (format "%c" (decode-char 'ucs codept)))
		(progn
		  (goto-char pos)
		  (insert ";"))))
	  (progn
	    (goto-char pos)
	    (insert ";")))))
     (t
      (insert ";")))))

;; Setup quail for XML mode

(require 'quail)

(quail-define-package
 "xml" "UTF-8" "&" t
 "Unicode characters input method using ISO 8879 entitie names from the unicode-character-list"
 nil t nil nil nil nil nil nil nil nil t)

(defvar xml-quail-define-rules '()
  "The default xml-input rules. Built dynamically from the unicode-character-list and the unicode-glyph-list.")

(let ((ulist iso8879-character-alist)
      codepoint glyph entname)
  (setq xml-quail-define-rules (list 'quail-define-rules))
  (while ulist
    (setq codepoint (cdr (car ulist)))
    (setq glyph (memq codepoint unicode-glyph-list))
    (setq entname (concat "&" (car (car ulist)) ";"))
    (cond
     ((and glyph (decode-char 'ucs codepoint))
      (nconc xml-quail-define-rules
	     (list (list entname (decode-char 'ucs codepoint)))))
     ((= codepoint 34)
      (nconc xml-quail-define-rules
	     (list (list entname (vector "&quot;")))))
     ((= codepoint 38)
      (nconc xml-quail-define-rules
	     (list (list entname (vector "&amp;")))))
     ((= codepoint 39)
      (nconc xml-quail-define-rules
	     (list (list entname (vector "&apos;")))))
     ((= codepoint 60)
      (nconc xml-quail-define-rules
	     (list (list entname (vector "&lt;")))))
     ((= codepoint 62)
      (nconc xml-quail-define-rules
	     (list (list entname (vector "&gt;")))))
     (t
      (nconc xml-quail-define-rules
	     (list (list entname (vector (format unicode-charref-format codepoint)))))))
    (setq ulist (cdr ulist))))

(eval xml-quail-define-rules)

;; Read two keys

(defvar unicode-character-shortcut-alist
  (list
   (cons "AE"  (cdr (assoc "AElig"  iso8879-character-alist)))
   (cons "A'"  (cdr (assoc "Aacute" iso8879-character-alist)))
   (cons "A^"  (cdr (assoc "Acirc"  iso8879-character-alist)))
   (cons "A`"  (cdr (assoc "Agrave" iso8879-character-alist)))
   (cons "Ao"  (cdr (assoc "Aring"  iso8879-character-alist)))
   (cons "A~"  (cdr (assoc "Atilde" iso8879-character-alist)))
   (cons "A\"" (cdr (assoc "Auml"   iso8879-character-alist)))
   (cons "C,"  (cdr (assoc "Ccedil" iso8879-character-alist)))
   (cons "E'"  (cdr (assoc "Eacute" iso8879-character-alist)))
   (cons "E^"  (cdr (assoc "Ecirc"  iso8879-character-alist)))
   (cons "E`"  (cdr (assoc "Egrave" iso8879-character-alist)))
   (cons "E\"" (cdr (assoc "Euml"   iso8879-character-alist)))
   (cons "I'"  (cdr (assoc "Iacute" iso8879-character-alist)))
   (cons "I^"  (cdr (assoc "Icirc"  iso8879-character-alist)))
   (cons "I`"  (cdr (assoc "Igrave" iso8879-character-alist)))
   (cons "I\"" (cdr (assoc "Iuml"   iso8879-character-alist)))
   (cons "N~"  (cdr (assoc "Ntilde" iso8879-character-alist)))
   (cons "O'"  (cdr (assoc "Oacute" iso8879-character-alist)))
   (cons "O^"  (cdr (assoc "Ocirc"  iso8879-character-alist)))
   (cons "O`"  (cdr (assoc "Ograve" iso8879-character-alist)))
   (cons "O/"  (cdr (assoc "Oslash" iso8879-character-alist)))
   (cons "O~"  (cdr (assoc "Otilde" iso8879-character-alist)))
   (cons "O\"" (cdr (assoc "Ouml"   iso8879-character-alist)))
   (cons "U'"  (cdr (assoc "Uacute" iso8879-character-alist)))
   (cons "U^"  (cdr (assoc "Ucirc"  iso8879-character-alist)))
   (cons "U`"  (cdr (assoc "Ugrave" iso8879-character-alist)))
   (cons "U\"" (cdr (assoc "Uuml"   iso8879-character-alist)))
   (cons "Y'"  (cdr (assoc "Yacute" iso8879-character-alist)))
   (cons "a'"  (cdr (assoc "aacute" iso8879-character-alist)))
   (cons "a^"  (cdr (assoc "acirc"  iso8879-character-alist)))
   (cons "ae"  (cdr (assoc "aelig"  iso8879-character-alist)))
   (cons "a`"  (cdr (assoc  "agrave" iso8879-character-alist)))
   (cons "ao"  (cdr (assoc "aring"  iso8879-character-alist)))
   (cons "a~"  (cdr (assoc "atilde" iso8879-character-alist)))
   (cons "a\"" (cdr (assoc "auml"   iso8879-character-alist)))
   (cons "c,"  (cdr (assoc "ccedil" iso8879-character-alist)))
   (cons "e'"  (cdr (assoc "eacute" iso8879-character-alist)))
   (cons "e^"  (cdr (assoc "ecirc"  iso8879-character-alist)))
   (cons "e`"  (cdr (assoc "egrave" iso8879-character-alist)))
   (cons "e\"" (cdr (assoc "euml"   iso8879-character-alist)))
   (cons "i'"  (cdr (assoc "iacute" iso8879-character-alist)))
   (cons "i^"  (cdr (assoc "icirc"  iso8879-character-alist)))
   (cons "i`"  (cdr (assoc "igrave" iso8879-character-alist)))
   (cons "i\"" (cdr (assoc "iuml"   iso8879-character-alist)))
   (cons "n~"  (cdr (assoc "ntilde" iso8879-character-alist)))
   (cons "o'"  (cdr (assoc "oacute" iso8879-character-alist)))
   (cons "o^"  (cdr (assoc "ocirc"  iso8879-character-alist)))
   (cons "o`"  (cdr (assoc "ograve" iso8879-character-alist)))
   (cons "o-"  (cdr (assoc "omacr"  iso8879-character-alist)))
   (cons "o/"  (cdr (assoc "oslash" iso8879-character-alist)))
   (cons "o~"  (cdr (assoc "otilde" iso8879-character-alist)))
   (cons "o\"" (cdr (assoc "ouml"   iso8879-character-alist)))
   (cons "sz"  (cdr (assoc "szlig"  iso8879-character-alist)))
   (cons "u'"  (cdr (assoc "uacute" iso8879-character-alist)))
   (cons "u^"  (cdr (assoc "ucirc"  iso8879-character-alist)))
   (cons "u`"  (cdr (assoc "ugrave" iso8879-character-alist)))
   (cons "u\"" (cdr (assoc "uuml"   iso8879-character-alist)))
   (cons "y'"  (cdr (assoc "yacute" iso8879-character-alist)))
   (cons "y\"" (cdr (assoc "yuml"   iso8879-character-alist)))
   (cons "12"  (cdr (assoc "frac12" iso8879-character-alist)))
   (cons "13"  (cdr (assoc "frac13" iso8879-character-alist)))
   (cons "14"  (cdr (assoc "frac14" iso8879-character-alist)))
   (cons "15"  (cdr (assoc "frac15" iso8879-character-alist)))
   (cons "16"  (cdr (assoc "frac16" iso8879-character-alist)))
   (cons "18"  (cdr (assoc "frac18" iso8879-character-alist)))
   (cons "23"  (cdr (assoc "frac23" iso8879-character-alist)))
   (cons "25"  (cdr (assoc "frac25" iso8879-character-alist)))
   (cons "34"  (cdr (assoc "frac34" iso8879-character-alist)))
   (cons "35"  (cdr (assoc "frac35" iso8879-character-alist)))
   (cons "38"  (cdr (assoc "frac38" iso8879-character-alist)))
   (cons "45"  (cdr (assoc "frac45" iso8879-character-alist)))
   (cons "56"  (cdr (assoc "frac56" iso8879-character-alist)))
   (cons "58"  (cdr (assoc "frac58" iso8879-character-alist)))
   (cons "78"  (cdr (assoc "frac78" iso8879-character-alist)))
   (cons "<<"  (cdr (assoc "laquo"  iso8879-character-alist)))
   (cons ".."  (cdr (assoc "hellip" iso8879-character-alist)))
   (cons "!i"  (cdr (assoc "iexcl"  iso8879-character-alist)))
   (cons "?i"  (cdr (assoc "iquest" iso8879-character-alist)))
   (cons "  "  (cdr (assoc "nbsp"   iso8879-character-alist)))
   (cons "+-"  (cdr (assoc "plusmn" iso8879-character-alist)))
   (cons "--"  (cdr (assoc "mdash"  iso8879-character-alist)))
   (cons "$c"  (cdr (assoc "cent"   iso8879-character-alist)))
   (cons "$e"  (cdr (assoc "euro"   iso8879-character-alist)))
   (cons "$p"  (cdr (assoc "pound"  iso8879-character-alist)))
   (cons "$y"  (cdr (assoc "yen"    iso8879-character-alist))))
  "Defines a list of two-character shortcuts for keyboard entry of Unicode characters.")

(defun unicode-character-shortcut-insert ()
  "Read a (two-character) keyboard shortcut and insert the corresponding character."
  (interactive)
  (let* ((c1 (read-char))
	 (c2 (read-char))
	 (str (concat (char-to-string c1) (char-to-string c2))))
    (cond
     ((assoc str unicode-character-shortcut-alist)
      (xml-unicode-insert nil
			  (cdr (assoc str unicode-character-shortcut-alist))))
     (t (beep)))))

(defun show-unicode-character-list ()
  "Insert each Unicode character into a buffer. Let's you see which characters are available for literal display in your emacs font."
  (let ((chars unicode-character-list)
	char codept name)
    (while chars
      (setq char (car chars))
      (setq chars (cdr chars))
      (setq codept (car char))
      (setq name (cadr char))

      (if (< codept #xffff)
	  (progn
	    (insert (format "#x%06x " codept))
	    (ucs-insert codept)
	    (insert (format " %s\n" name)))))))

;; EOF