1 ;;; latin-unity-vars.el --- Common variables and objects of latin-unity
3 ;; Copyright (C) 2002 Free Software Foundation, Inc
5 ;; Author: Stephen J. Turnbull
6 ;; Keywords: mule, charsets
7 ;; Created: 2002 January 26
8 ;; Last-modified: 2002 March 23
10 ;; This file is part of XEmacs.
12 ;; XEmacs is free software; you can redistribute it and/or modify
13 ;; it under the terms of the GNU General Public License as published by
14 ;; the Free Software Foundation; either version 2, or (at your option)
17 ;; XEmacs is distributed in the hope that it will be useful,
18 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ;; GNU General Public License for more details.
22 ;; You should have received a copy of the GNU General Public License
23 ;; along with XEmacs; see the file COPYING. If not, write to the
24 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
25 ;; Boston, MA 02111-1307, USA.
30 ;; Mule bogusly considers the various ISO-8859 extended character sets
31 ;; as disjoint, when ISO 8859 itself clearly considers them to be subsets
32 ;; of a larger character set. This library provides functions which
33 ;; determine the list of coding systems which can encode all of the
34 ;; characters in the buffer.
38 (provide 'latin-unity-vars)
;; Load the latin{7,8,9,10} language environments, character sets, and
;; coding systems.
42 (require 'latin-euro-standards)
44 ;;; User customization is in latin-unity.el
46 ;; latin-unity-equivalence-table
47 ;; could speed things up a tiny bit by splitting out the bit-vector, but
48 ;; this is constant-time (a char-table ref plus an aref)
(defvar latin-unity-equivalences (make-char-table 'generic)
  "Char-table of Latin character equivalence vectors.

Each vector takes integral elements (or nil, meaning void).  The zero-th
element is interpreted as the bit vector representation of the set of
character sets that can represent the character.  A nil value will cause
an error if accessed, so probably zero should be used instead.  The next
\(length latin-unity-character-sets) elements are the mapping of the
char-table index to code points in the other character sets.  One last
element follows them.
#### The description of that last element is missing from this docstring;
check latin-unity-tables.el.

Note that because this is a char-table, many characters will refer to
the same vector.  Thus whenever updating a character's value, you must
use `copy-sequence', or there will be side-effects.

The table is actually loaded from latin-unity-tables.el.")
(defcustom latin-unity-debug nil
  "If non-nil, make file write operations as slow as molasses.
If there were bugs, this might help find them, but there aren't. ;^)"
  ;; The docstring describes a nil/non-nil switch, hence a boolean.
  :type 'boolean
  ;; NOTE(review): group name assumed from the package name; confirm it
  ;; matches the `defgroup' in latin-unity.el.
  :group 'latin-unity)
(defvar latin-unity-help-buffer " *Coding system conflict*"
  "Buffer name string for the latin-unity help display.
NOTE(review): purpose inferred from the variable name; the code that
uses this value is not in this file.")
(defconst latin-unity-coding-systems
  ;; Keep only the candidates for which `find-coding-system' succeeds at
  ;; load time.  Consing onto LUCS builds the list backwards, so it is
  ;; reversed at the end to restore the order written below.
  (let (lucs)
    (mapc (lambda (x)
            (when (find-coding-system x)
              (setq lucs (cons x lucs))))
          '(iso-8859-1 iso-8859-2 iso-8859-3 iso-8859-4 iso-8859-9
            iso-8859-10 iso-8859-13 iso-8859-14 iso-8859-15 iso-8859-16))
    (nreverse lucs))
  "List of coding systems \"unified\" by latin-unity.

Only candidates for which `find-coding-system' returns non-nil when this
file is loaded are included.

Cf. `latin-unity-character-sets'.")
(defconst latin-unity-character-sets
  ;; Keep only the candidates for which `find-charset' succeeds at load
  ;; time.  Consing onto LUCS builds the list backwards, so it is
  ;; reversed at the end to restore the order written below.
  (let (lucs)
    (mapc (lambda (x)
            (when (find-charset x)
              (setq lucs (cons x lucs))))
          '(latin-iso8859-1 latin-iso8859-2 latin-iso8859-3 latin-iso8859-4
            latin-iso8859-9 latin-iso8859-10 latin-iso8859-13 latin-iso8859-14
            latin-iso8859-15 latin-iso8859-16
            ;; above are all GR sets, below are normally GL
            ascii latin-jisx0201))
    (nreverse lucs))
  "List of character sets \"unified\" by latin-unity.

\"Unified\" is a misnomer, since actually these character sets are all
subsets of a larger set.  Characters which are identified by this
library are actually the same characters according to ISO 8859.  The
exception is the Japanese JIS X 0201 left half (JIS Roman), which is
controversial.  It will by default be identified with ASCII, but also
may take values elsewhere according to user preference.  (Unimplemented.)

The ISO 8859 character sets are actually Latin-1 to Latin-10, the right
halves of the ISO 8859 Latin character sets.

ASCII and Unicode are treated implicitly.  All of the listed character
sets are the GR of a coded character set that supports ASCII, except
for JIS Roman.  Whether JIS Roman is considered to be identical to
ASCII, or a slight revision, depends on user preference.  Unicode is a
\"universal\" character set which is always a \"safe\" encoding for
streams that receive buffer contents.")
(defvar latin-unity-ascii-and-jis-roman "\000-\177"
  "Skip-chars set of the ASCII characters that are also in JIS Roman.

#### The default treats JIS Roman as identical to ASCII, which is not
consistent with the equivalence table.")
(defconst latin-unity-cset-codesys-alist
  ;; Keep only the pairs whose coding system (the cdr) is found by
  ;; `find-coding-system' at load time.  Consing onto LUCS builds the
  ;; list backwards, so it is reversed at the end to restore the order
  ;; written below.
  (let (lucs)
    (mapc (lambda (x)
            (when (find-coding-system (cdr x))
              (setq lucs (cons x lucs))))
          '((latin-iso8859-1 . iso-8859-1)
            (latin-iso8859-2 . iso-8859-2)
            (latin-iso8859-3 . iso-8859-3)
            (latin-iso8859-4 . iso-8859-4)
            (latin-iso8859-9 . iso-8859-9)
            (latin-iso8859-10 . iso-8859-10)
            (latin-iso8859-13 . iso-8859-13)
            (latin-iso8859-14 . iso-8859-14)
            (latin-iso8859-15 . iso-8859-15)
            (latin-iso8859-16 . iso-8859-16)
            ;; the following mappings are bogus, the RightThang not clear
            (ascii . iso-8859-1)              ; any will do
            (latin-jisx0201 . jisx0201)))     ; doesn't currently exist
    (nreverse lucs))
  "Map Latin charsets to corresponding coding systems or classes.

Each element is a cons (CHARSET . CODING-SYSTEM).  Pairs whose coding
system is not found by `find-coding-system' when this file is loaded
are left out.")
146 ;; bit vectors for checking the feasible character sets
(defconst latin-unity-all-flags
  (let ((charset-count (length latin-unity-character-sets)))
    ;; Shift an all-ones word left by CHARSET-COUNT, then invert it, so
    ;; that only the low CHARSET-COUNT bits remain set.
    (lognot (lsh -1 charset-count)))
  "Bit vector representing the set of all Latin character sets.
It has one low-order bit set for each element of
`latin-unity-character-sets'.")
;; put the character set indices and flag bits in reasonable places
(defconst latin-unity-non-latin-bit-flag
  (let ((index 1) (bit 1))
    ;; Refuse to assign flag bits when there are more than 25 charsets.
    (if (> (length latin-unity-character-sets) 25)
        (error "representation too small to support so many charsets!"))
    ;; Give each charset symbol its own flag bit and a 1-based index,
    ;; in list order.
    (mapc (lambda (cs)
            (put cs 'latin-unity-flag-bit bit)
            (put cs 'latin-unity-index index)
            (setq bit (lsh bit 1)
                  index (1+ index)))
          latin-unity-character-sets)
    ;; After the loop BIT is the first bit not assigned to any charset.
    bit)
  "A bit-flag indicating charsets not handled by latin-unity.

Computing this value also stores a `latin-unity-flag-bit' and a
`latin-unity-index' property on every charset symbol in
`latin-unity-character-sets'.  The value itself is the next bit after
the ones assigned to those charsets.")
166 ;;; end of latin-unity-vars.