Coverity fixes from Nelson
[sxemacs] / src / mule / mule-charset.c
1 /* Functions to handle multilingual characters.
2    Copyright (C) 1992, 1995 Free Software Foundation, Inc.
3    Copyright (C) 1995 Sun Microsystems, Inc.
4
5 This file is part of SXEmacs
6
7 SXEmacs is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
11
12 SXEmacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program.  If not, see <http://www.gnu.org/licenses/>. */
19
20
21 /* Synched up with: FSF 20.3.  Not in FSF. */
22
23 /* Rewritten by Ben Wing <ben@xemacs.org>. */
24
25 #include <config.h>
26 #include "lisp.h"
27
28 #include "buffer.h"
29 #include "chartab.h"
30 #include "elhash.h"
31 #include "lstream.h"
32 #include "ui/device.h"
33 #include "ui/faces.h"
34 #include "mule-ccl.h"
35
36 /* The various pre-defined charsets. */
37
38 Lisp_Object Vcharset_ascii;
39 Lisp_Object Vcharset_control_1;
40 Lisp_Object Vcharset_latin_iso8859_1;
41 Lisp_Object Vcharset_latin_iso8859_2;
42 Lisp_Object Vcharset_latin_iso8859_3;
43 Lisp_Object Vcharset_latin_iso8859_4;
44 Lisp_Object Vcharset_thai_tis620;
45 Lisp_Object Vcharset_greek_iso8859_7;
46 Lisp_Object Vcharset_arabic_iso8859_6;
47 Lisp_Object Vcharset_hebrew_iso8859_8;
48 Lisp_Object Vcharset_katakana_jisx0201;
49 Lisp_Object Vcharset_latin_jisx0201;
50 Lisp_Object Vcharset_cyrillic_iso8859_5;
51 Lisp_Object Vcharset_latin_iso8859_9;
52 Lisp_Object Vcharset_japanese_jisx0208_1978;
53 Lisp_Object Vcharset_chinese_gb2312;
54 Lisp_Object Vcharset_japanese_jisx0208;
55 Lisp_Object Vcharset_korean_ksc5601;
56 Lisp_Object Vcharset_japanese_jisx0212;
57 Lisp_Object Vcharset_chinese_cns11643_1;
58 Lisp_Object Vcharset_chinese_cns11643_2;
59 Lisp_Object Vcharset_chinese_big5_1;
60 Lisp_Object Vcharset_chinese_big5_2;
61
62 #ifdef ENABLE_COMPOSITE_CHARS
63 Lisp_Object Vcharset_composite;
64
65 /* Hash tables for composite chars.  One maps string representing
66    composed chars to their equivalent chars; one goes the
67    other way. */
68 Lisp_Object Vcomposite_char_char2string_hash_table;
69 Lisp_Object Vcomposite_char_string2char_hash_table;
70
71 static int composite_char_row_next;
72 static int composite_char_col_next;
73
74 #endif                          /* ENABLE_COMPOSITE_CHARS */
75
76 struct charset_lookup *chlook;
77
78 static const struct lrecord_description charset_lookup_description_1[] = {
79         {XD_LISP_OBJECT_ARRAY,
80          offsetof(struct charset_lookup, charset_by_leading_byte),
81          128 + 4 * 128 * 2},
82         {XD_END}
83 };
84
85 static const struct struct_description charset_lookup_description = {
86         sizeof(struct charset_lookup),
87         charset_lookup_description_1
88 };
89
90 /* Table of number of bytes in the string representation of a character
91    indexed by the first byte of that representation.
92
93    rep_bytes_by_first_byte(c) is more efficient than the equivalent
94    canonical computation:
95
96    XCHARSET_REP_BYTES (CHARSET_BY_LEADING_BYTE (c)) */
97
98 const Bytecount rep_bytes_by_first_byte[0xA0] = {       /* 0x00 - 0x7f are for straight ASCII */
99         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
100         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
101         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
102         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
103         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
104         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
105         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
106         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
107         /* 0x80 - 0x8f are for Dimension-1 official charsets */
108         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
109         /* 0x90 - 0x9d are for Dimension-2 official charsets */
110         /* 0x9e is for Dimension-1 private charsets */
111         /* 0x9f is for Dimension-2 private charsets */
112         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4
113 };
114
115 Lisp_Object Qcharsetp;
116
117 /* Qdoc_string, Qdimension, Qchars defined in general.c */
118 Lisp_Object Qregistry, Qfinal, Qgraphic;
119 Lisp_Object Qdirection;
120 Lisp_Object Qreverse_direction_charset;
121 Lisp_Object Qleading_byte;
122 Lisp_Object Qshort_name, Qlong_name;
123
124 Lisp_Object Qascii,
125     Qcontrol_1,
126     Qlatin_iso8859_1,
127     Qlatin_iso8859_2,
128     Qlatin_iso8859_3,
129     Qlatin_iso8859_4,
130     Qthai_tis620,
131     Qgreek_iso8859_7,
132     Qarabic_iso8859_6,
133     Qhebrew_iso8859_8,
134     Qkatakana_jisx0201,
135     Qlatin_jisx0201,
136     Qcyrillic_iso8859_5,
137     Qlatin_iso8859_9,
138     Qjapanese_jisx0208_1978,
139     Qchinese_gb2312,
140     Qjapanese_jisx0208,
141     Qkorean_ksc5601,
142     Qjapanese_jisx0212,
143     Qchinese_cns11643_1,
144     Qchinese_cns11643_2, Qchinese_big5_1, Qchinese_big5_2, Qcomposite;
145
146 Lisp_Object Ql2r, Qr2l;
147
148 Lisp_Object Vcharset_hash_table;
149
150 /* Composite characters are characters constructed by overstriking two
151    or more regular characters.
152
153    1) The old Mule implementation involves storing composite characters
154       in a buffer as a tag followed by all of the actual characters
155       used to make up the composite character.  I think this is a bad
156       idea; it greatly complicates code that wants to handle strings
157       one character at a time because it has to deal with the possibility
158       of great big ungainly characters.  It's much more reasonable to
159       simply store an index into a table of composite characters.
160
161    2) The current implementation only allows for 16,384 separate
162       composite characters over the lifetime of the SXEmacs process.
163       This could become a potential problem if the user
164       edited lots of different files that use composite characters.
165       Due to FSF bogosity, increasing the number of allowable
166       composite characters under Mule would decrease the number
167       of possible faces that can exist.  Mule already has shrunk
168       this to 2048, and further shrinkage would become uncomfortable.
169       No such problems exist in SXEmacs.
170
171       Composite characters could be represented as 0x80 C1 C2 C3,
172       where each C[1-3] is in the range 0xA0 - 0xFF.  This allows
173       for slightly under 2^20 (one million) composite characters
174       over the SXEmacs process lifetime, and you only need to
175       increase the size of a Mule character from 19 to 21 bits.
176       Or you could use 0x80 C1 C2 C3 C4, allowing for about
177       85 million (slightly over 2^26) composite characters. */
178 \f
179 /************************************************************************/
180 /*                       Basic Emchar functions                         */
181 /************************************************************************/
182
183 /* Convert a non-ASCII Mule character C into a one-character Mule-encoded
184    string in STR.  Returns the number of bytes stored.
185    Do not call this directly.  Use the macro set_charptr_emchar() instead.
186  */
187
188 Bytecount non_ascii_set_charptr_emchar(Bufbyte * str, Emchar c)
189 {
190         Bufbyte *p;
191         Bufbyte lb;
192         int c1, c2;
193         Lisp_Object charset;
194
195         p = str;
196         BREAKUP_CHAR(c, charset, c1, c2);
197         lb = CHAR_LEADING_BYTE(c);
198         if (LEADING_BYTE_PRIVATE_P(lb))
199                 *p++ = PRIVATE_LEADING_BYTE_PREFIX(lb);
200         *p++ = lb;
201         if (EQ(charset, Vcharset_control_1))
202                 c1 += 0x20;
203         *p++ = c1 | 0x80;
204         if (c2)
205                 *p++ = c2 | 0x80;
206
207         return (p - str);
208 }
209
210 /* Return the first character from a Mule-encoded string in STR,
211    assuming it's non-ASCII.  Do not call this directly.
212    Use the macro charptr_emchar() instead. */
213
214 Emchar non_ascii_charptr_emchar(const Bufbyte * str)
215 {
216         Bufbyte i0 = *str, i1, i2 = 0;
217         Lisp_Object charset;
218
219         if (i0 == LEADING_BYTE_CONTROL_1)
220                 return (Emchar) (*++str - 0x20);
221
222         if (LEADING_BYTE_PREFIX_P(i0))
223                 i0 = *++str;
224
225         i1 = *++str & 0x7F;
226
227         charset = CHARSET_BY_LEADING_BYTE(i0);
228         if (XCHARSET_DIMENSION(charset) == 2)
229                 i2 = *++str & 0x7F;
230
231         return MAKE_CHAR(charset, i1, i2);
232 }
233
234 /* Return whether CH is a valid Emchar, assuming it's non-ASCII.
235    Do not call this directly.  Use the macro valid_char_p() instead. */
236
237 int non_ascii_valid_char_p(Emchar ch)
238 {
239         int f1, f2, f3;
240
241         /* Must have only lowest 19 bits set */
242         if (ch & ~0x7FFFF)
243                 return 0;
244
245         f1 = CHAR_FIELD1(ch);
246         f2 = CHAR_FIELD2(ch);
247         f3 = CHAR_FIELD3(ch);
248
249         if (f1 == 0) {
250                 Lisp_Object charset;
251
252                 if (f2 < MIN_CHAR_FIELD2_OFFICIAL ||
253                     (f2 > MAX_CHAR_FIELD2_OFFICIAL
254                      && f2 < MIN_CHAR_FIELD2_PRIVATE)
255                     || f2 > MAX_CHAR_FIELD2_PRIVATE)
256                         return 0;
257                 if (f3 < 0x20)
258                         return 0;
259
260                 if (f3 != 0x20 && f3 != 0x7F
261                     && !(f2 >= MIN_CHAR_FIELD2_PRIVATE
262                          && f2 <= MAX_CHAR_FIELD2_PRIVATE))
263                         return 1;
264
265                 /*
266                    NOTE: This takes advantage of the fact that
267                    FIELD2_TO_OFFICIAL_LEADING_BYTE and
268                    FIELD2_TO_PRIVATE_LEADING_BYTE are the same.
269                  */
270                 charset =
271                     CHARSET_BY_LEADING_BYTE(f2 +
272                                             FIELD2_TO_OFFICIAL_LEADING_BYTE);
273                 if (EQ(charset, Qnil))
274                         return 0;
275                 return (XCHARSET_CHARS(charset) == 96);
276         } else {
277                 Lisp_Object charset;
278
279                 if (f1 < MIN_CHAR_FIELD1_OFFICIAL ||
280                     (f1 > MAX_CHAR_FIELD1_OFFICIAL
281                      && f1 < MIN_CHAR_FIELD1_PRIVATE)
282                     || f1 > MAX_CHAR_FIELD1_PRIVATE)
283                         return 0;
284                 if (f2 < 0x20 || f3 < 0x20)
285                         return 0;
286
287 #ifdef ENABLE_COMPOSITE_CHARS
288                 if (f1 + FIELD1_TO_OFFICIAL_LEADING_BYTE ==
289                     LEADING_BYTE_COMPOSITE) {
290                         if (UNBOUNDP
291                             (Fgethash
292                              (make_int(ch),
293                               Vcomposite_char_char2string_hash_table,
294                               Qunbound)))
295                                 return 0;
296                         return 1;
297                 }
298 #endif                          /* ENABLE_COMPOSITE_CHARS */
299
300                 if (f2 != 0x20 && f2 != 0x7F && f3 != 0x20 && f3 != 0x7F
301                     && !(f1 >= MIN_CHAR_FIELD1_PRIVATE
302                          && f1 <= MAX_CHAR_FIELD1_PRIVATE))
303                         return 1;
304
305                 if (f1 <= MAX_CHAR_FIELD1_OFFICIAL)
306                         charset =
307                             CHARSET_BY_LEADING_BYTE(f1 +
308                                                     FIELD1_TO_OFFICIAL_LEADING_BYTE);
309                 else
310                         charset =
311                             CHARSET_BY_LEADING_BYTE(f1 +
312                                                     FIELD1_TO_PRIVATE_LEADING_BYTE);
313
314                 if (EQ(charset, Qnil))
315                         return 0;
316                 return (XCHARSET_CHARS(charset) == 96);
317         }
318 }
319 \f
320 /************************************************************************/
321 /*                       Basic string functions                         */
322 /************************************************************************/
323
324 /* Copy the character pointed to by SRC into DST.  Do not call this
325    directly.  Use the macro charptr_copy_char() instead.
326    Return the number of bytes copied.  */
327
328 Bytecount non_ascii_charptr_copy_char(const Bufbyte * src, Bufbyte * dst)
329 {
330         unsigned int bytes = REP_BYTES_BY_FIRST_BYTE(*src);
331         unsigned int i;
332         for (i = bytes; i; i--, dst++, src++)
333                 *dst = *src;
334         return bytes;
335 }
336 \f
337 /************************************************************************/
338 /*                        streams of Emchars                            */
339 /************************************************************************/
340
341 /* Treat a stream as a stream of Emchar's rather than a stream of bytes.
342    The functions below are not meant to be called directly; use
343    the macros in insdel.h. */
344
345 Emchar Lstream_get_emchar_1(Lstream * stream, int ch)
346 {
347         Bufbyte str[MAX_EMCHAR_LEN];
348         Bufbyte *strptr = str;
349         unsigned int bytes;
350
351         str[0] = (Bufbyte) ch;
352
353         for (bytes = REP_BYTES_BY_FIRST_BYTE(ch) - 1; bytes; bytes--) {
354                 int c = Lstream_getc(stream);
355                 bufpos_checking_assert(c >= 0);
356                 *++strptr = (Bufbyte) c;
357         }
358         return charptr_emchar(str);
359 }
360
361 int Lstream_fput_emchar(Lstream * stream, Emchar ch)
362 {
363         Bufbyte str[MAX_EMCHAR_LEN];
364         Bytecount len = set_charptr_emchar(str, ch);
365         return Lstream_write(stream, str, len);
366 }
367
368 void Lstream_funget_emchar(Lstream * stream, Emchar ch)
369 {
370         Bufbyte str[MAX_EMCHAR_LEN];
371         Bytecount len = set_charptr_emchar(str, ch);
372         Lstream_unread(stream, str, len);
373 }
374 \f
375 /************************************************************************/
376 /*                            charset object                            */
377 /************************************************************************/
378
379 static Lisp_Object mark_charset(Lisp_Object obj)
380 {
381         Lisp_Charset *cs = XCHARSET(obj);
382
383         mark_object(cs->short_name);
384         mark_object(cs->long_name);
385         mark_object(cs->doc_string);
386         mark_object(cs->registry);
387         mark_object(cs->ccl_program);
388         return cs->name;
389 }
390
391 static void
392 print_charset(Lisp_Object obj, Lisp_Object printcharfun, int escapeflag)
393 {
394         Lisp_Charset *cs = XCHARSET(obj);
395
396         if (print_readably)
397                 error("printing unreadable object #<charset %s 0x%x>",
398                       string_data(XSYMBOL(CHARSET_NAME(cs))->name),
399                       cs->header.uid);
400
401         write_c_string("#<charset ", printcharfun);
402         print_internal(CHARSET_NAME(cs), printcharfun, 0);
403         write_c_string(" ", printcharfun);
404         print_internal(CHARSET_SHORT_NAME(cs), printcharfun, 1);
405         write_c_string(" ", printcharfun);
406         print_internal(CHARSET_LONG_NAME(cs), printcharfun, 1);
407         write_c_string(" ", printcharfun);
408         print_internal(CHARSET_DOC_STRING(cs), printcharfun, 1);
409         write_fmt_string(printcharfun, " %s %s cols=%d g%d final='%c' reg=",
410                          (CHARSET_TYPE(cs) == CHARSET_TYPE_94 ? "94" :
411                           CHARSET_TYPE(cs) == CHARSET_TYPE_96 ? "96" :
412                           CHARSET_TYPE(cs) == CHARSET_TYPE_94X94 ? "94x94" :
413                           "96x96"),
414                          (CHARSET_DIRECTION(cs) == CHARSET_LEFT_TO_RIGHT ? "l2r" : "r2l"),
415                          CHARSET_COLUMNS(cs), CHARSET_GRAPHIC(cs), CHARSET_FINAL(cs));
416         print_internal(CHARSET_REGISTRY(cs), printcharfun, 0);
417         write_fmt_str(printcharfun, " 0x%x>", cs->header.uid);
418 }
419
420 static const struct lrecord_description charset_description[] = {
421         {XD_LISP_OBJECT, offsetof(Lisp_Charset, name)},
422         {XD_LISP_OBJECT, offsetof(Lisp_Charset, doc_string)},
423         {XD_LISP_OBJECT, offsetof(Lisp_Charset, registry)},
424         {XD_LISP_OBJECT, offsetof(Lisp_Charset, short_name)},
425         {XD_LISP_OBJECT, offsetof(Lisp_Charset, long_name)},
426         {XD_LISP_OBJECT, offsetof(Lisp_Charset, reverse_direction_charset)},
427         {XD_LISP_OBJECT, offsetof(Lisp_Charset, ccl_program)},
428         {XD_END}
429 };
430
431 DEFINE_LRECORD_IMPLEMENTATION("charset", charset,
432                               mark_charset, print_charset, 0, 0, 0,
433                               charset_description, Lisp_Charset);
434
435 /* Make a new charset. */
436 /* #### SJT Should generic properties be allowed? */
437 static Lisp_Object
438 make_charset(int id, Lisp_Object name, unsigned char rep_bytes,
439              unsigned char type, unsigned char columns, unsigned char graphic,
440              Bufbyte final, unsigned char direction, Lisp_Object short_name,
441              Lisp_Object long_name, Lisp_Object doc, Lisp_Object reg)
442 {
443         Lisp_Object obj;
444         Lisp_Charset *cs = alloc_lcrecord_type(Lisp_Charset, &lrecord_charset);
445
446         zero_lcrecord(cs);
447
448         XSETCHARSET(obj, cs);
449
450         CHARSET_ID(cs) = id;
451         CHARSET_NAME(cs) = name;
452         CHARSET_SHORT_NAME(cs) = short_name;
453         CHARSET_LONG_NAME(cs) = long_name;
454         CHARSET_REP_BYTES(cs) = rep_bytes;
455         CHARSET_DIRECTION(cs) = direction;
456         CHARSET_TYPE(cs) = type;
457         CHARSET_COLUMNS(cs) = columns;
458         CHARSET_GRAPHIC(cs) = graphic;
459         CHARSET_FINAL(cs) = final;
460         CHARSET_DOC_STRING(cs) = doc;
461         CHARSET_REGISTRY(cs) = reg;
462         CHARSET_CCL_PROGRAM(cs) = Qnil;
463         CHARSET_REVERSE_DIRECTION_CHARSET(cs) = Qnil;
464
465         CHARSET_DIMENSION(cs) = (CHARSET_TYPE(cs) == CHARSET_TYPE_94 ||
466                                  CHARSET_TYPE(cs) == CHARSET_TYPE_96) ? 1 : 2;
467         CHARSET_CHARS(cs) = (CHARSET_TYPE(cs) == CHARSET_TYPE_94 ||
468                              CHARSET_TYPE(cs) == CHARSET_TYPE_94X94) ? 94 : 96;
469
470         if (final) {
471                 /* some charsets do not have final characters.  This includes
472                    ASCII, Control-1, Composite, and the two faux private
473                    charsets. */
474                 assert(NILP
475                        (chlook->charset_by_attributes[type][final][direction]));
476                 chlook->charset_by_attributes[type][final][direction] = obj;
477         }
478
479         assert(NILP(chlook->charset_by_leading_byte[id - 128]));
480         chlook->charset_by_leading_byte[id - 128] = obj;
481
482         /* Some charsets are "faux" and don't have names or really exist at
483            all except in the leading-byte table. */
484         if (!NILP(name))
485                 Fputhash(name, obj, Vcharset_hash_table);
486         return obj;
487 }
488
489 static int get_unallocated_leading_byte(int dimension)
490 {
491         int lb;
492
493         if (dimension == 1) {
494                 if (chlook->next_allocated_1_byte_leading_byte >
495                     (Bufbyte)MAX_LEADING_BYTE_PRIVATE_1)
496                         lb = 0;
497                 else
498                         lb = chlook->next_allocated_1_byte_leading_byte++;
499 #if MAX_LEADING_BYTE_PRIVATE_2 == 255
500         } else if (chlook->next_allocated_2_byte_leading_byte == 0) {
501                 lb = 0;
502 #else
503         } else if (chlook->next_allocated_2_byte_leading_byte >
504                    (Bufbyte)MAX_LEADING_BYTE_PRIVATE_2) {
505                 lb = 0;
506 #endif
507         } else {
508                         lb = chlook->next_allocated_2_byte_leading_byte++;
509         }
510
511         if (!lb)
512                 signal_simple_error
513                     ("No more character sets free for this dimension",
514                      make_int(dimension));
515
516         return lb;
517 }
518 \f
519 /************************************************************************/
520 /*                      Basic charset Lisp functions                    */
521 /************************************************************************/
522
523 DEFUN("charsetp", Fcharsetp, 1, 1, 0,   /*
524 Return non-nil if OBJECT is a charset.
525 */
526       (object))
527 {
528         return CHARSETP(object) ? Qt : Qnil;
529 }
530
531 DEFUN("find-charset", Ffind_charset, 1, 1, 0,   /*
532 Retrieve the charset of the given name.
533 If CHARSET-OR-NAME is a charset object, it is simply returned.
534 Otherwise, CHARSET-OR-NAME should be a symbol.  If there is no such charset,
535 nil is returned.  Otherwise the associated charset object is returned.
536 */
537       (charset_or_name))
538 {
539         if (CHARSETP(charset_or_name))
540                 return charset_or_name;
541
542         CHECK_SYMBOL(charset_or_name);
543         return Fgethash(charset_or_name, Vcharset_hash_table, Qnil);
544 }
545
546 DEFUN("get-charset", Fget_charset, 1, 1, 0,     /*
547 Retrieve the charset of the given name.
548 Same as `find-charset' except an error is signalled if there is no such
549 charset instead of returning nil.
550 */
551       (name))
552 {
553         Lisp_Object charset = Ffind_charset(name);
554
555         if (NILP(charset))
556                 signal_simple_error("No such charset", name);
557         return charset;
558 }
559
560 /* We store the charsets in hash tables with the names as the key and the
561    actual charset object as the value.  Occasionally we need to use them
562    in a list format.  These routines provide us with that. */
563 struct charset_list_closure {
564         Lisp_Object *charset_list;
565 };
566
567 static int
568 add_charset_to_list_mapper(Lisp_Object key, Lisp_Object value,
569                            void *charset_list_closure)
570 {
571         /* This function can GC */
572         struct charset_list_closure *chcl =
573             (struct charset_list_closure *)charset_list_closure;
574         Lisp_Object *charset_list = chcl->charset_list;
575
576         *charset_list = Fcons(XCHARSET_NAME(value), *charset_list);
577         return 0;
578 }
579
580 DEFUN("charset-list", Fcharset_list, 0, 0, 0,   /*
581 Return a list of the names of all defined charsets.
582 */
583       ())
584 {
585         Lisp_Object charset_list = Qnil;
586         struct gcpro gcpro1;
587         struct charset_list_closure charset_list_closure;
588
589         GCPRO1(charset_list);
590         charset_list_closure.charset_list = &charset_list;
591         elisp_maphash(add_charset_to_list_mapper, Vcharset_hash_table,
592                       &charset_list_closure);
593         UNGCPRO;
594
595         return charset_list;
596 }
597
598 DEFUN("charset-name", Fcharset_name, 1, 1, 0,   /*
599 Return the name of charset CHARSET.
600 */
601       (charset))
602 {
603         Lisp_Object tmp = Fget_charset(charset);
604         return XCHARSET_NAME(tmp);
605 }
606
607 /* #### SJT Should generic properties be allowed? */
608 DEFUN("make-charset", Fmake_charset, 3, 3, 0,   /*
609 Define a new character set.
610 This function is for use with Mule support.
611 NAME is a symbol, the name by which the character set is normally referred.
612 DOC-STRING is a string describing the character set.
613 PROPS is a property list, describing the specific nature of the
614 character set.  Recognized properties are:
615
616 'short-name   Short version of the charset name (ex: Latin-1)
617 'long-name    Long version of the charset name (ex: ISO8859-1 (Latin-1))
618 'registry     A regular expression matching the font registry field for
619 this character set.
620 'dimension    Number of octets used to index a character in this charset.
621 Either 1 or 2.  Defaults to 1.
622 'columns      Number of columns used to display a character in this charset.
623 Only used in TTY mode. (Under X, the actual width of a
624 character can be derived from the font used to display the
625 characters.) If unspecified, defaults to the dimension
626 (this is almost       always the correct value).
627 'chars                Number of characters in each dimension (94 or 96).
628 Defaults to 94.  Note that if the dimension is 2, the
629 character set thus described is 94x94 or 96x96.
630 'final                Final byte of ISO 2022 escape sequence.  Must be
631 supplied.  Each combination of (DIMENSION, CHARS) defines a
632 separate namespace for final bytes.  Note that ISO
633 2022 restricts the final byte to the range
634 0x30 - 0x7E if dimension == 1, and 0x30 - 0x5F if
635 dimension == 2.  Note also that final bytes in the range
636 0x30 - 0x3F are reserved for user-defined (not official)
637 character sets.
638 'graphic      0 (use left half of font on output) or 1 (use right half
639 of font on output).  Defaults to 0.  For example, for
640 a font whose registry is ISO8859-1, the left half
641 (octets 0x20 - 0x7F) is the `ascii' character set, while
642 the right half (octets 0xA0 - 0xFF) is the `latin-1'
643 character set.  With 'graphic set to 0, the octets
644 will have their high bit cleared; with it set to 1,
645 the octets will have their high bit set.
646 'direction    'l2r (left-to-right) or 'r2l (right-to-left).
647 Defaults to 'l2r.
648 'ccl-program  A compiled CCL program used to convert a character in
649 this charset into an index into the font.  This is in
650 addition to the 'graphic property.  The CCL program
651 is passed the octets of the character, with the high
652 bit cleared and set depending upon whether the value
653 of the 'graphic property is 0 or 1.
654 */
655       (name, doc_string, props))
656 {
657         int id, dimension = 1, chars = 94, graphic = 0, final = 0, columns = -1;
658         int direction = CHARSET_LEFT_TO_RIGHT;
659         int type;
660         Lisp_Object registry = Qnil;
661         Lisp_Object charset;
662         Lisp_Object ccl_program = Qnil;
663         Lisp_Object short_name = Qnil, long_name = Qnil;
664
665         CHECK_SYMBOL(name);
666         if (!NILP(doc_string))
667                 CHECK_STRING(doc_string);
668
669         charset = Ffind_charset(name);
670         if (!NILP(charset))
671                 signal_simple_error("Cannot redefine existing charset", name);
672
673         {
674                 EXTERNAL_PROPERTY_LIST_LOOP_3(keyword, value, props) {
675                         if (EQ(keyword, Qshort_name)) {
676                                 CHECK_STRING(value);
677                                 short_name = value;
678                         }
679
680                         else if (EQ(keyword, Qlong_name)) {
681                                 CHECK_STRING(value);
682                                 long_name = value;
683                         }
684
685                         else if (EQ(keyword, Qdimension)) {
686                                 CHECK_INT(value);
687                                 dimension = XINT(value);
688                                 if (dimension < 1 || dimension > 2)
689                                         signal_simple_error
690                                             ("Invalid value for 'dimension",
691                                              value);
692                         }
693
694                         else if (EQ(keyword, Qchars)) {
695                                 CHECK_INT(value);
696                                 chars = XINT(value);
697                                 if (chars != 94 && chars != 96)
698                                         signal_simple_error
699                                             ("Invalid value for 'chars", value);
700                         }
701
702                         else if (EQ(keyword, Qcolumns)) {
703                                 CHECK_INT(value);
704                                 columns = XINT(value);
705                                 if (columns != 1 && columns != 2)
706                                         signal_simple_error
707                                             ("Invalid value for 'columns",
708                                              value);
709                         }
710
711                         else if (EQ(keyword, Qgraphic)) {
712                                 CHECK_INT(value);
713                                 graphic = XINT(value);
714                                 if (graphic < 0 || graphic > 1)
715                                         signal_simple_error
716                                             ("Invalid value for 'graphic",
717                                              value);
718                         }
719
720                         else if (EQ(keyword, Qregistry)) {
721                                 CHECK_STRING(value);
722                                 registry = value;
723                         }
724
725                         else if (EQ(keyword, Qdirection)) {
726                                 if (EQ(value, Ql2r))
727                                         direction = CHARSET_LEFT_TO_RIGHT;
728                                 else if (EQ(value, Qr2l))
729                                         direction = CHARSET_RIGHT_TO_LEFT;
730                                 else
731                                         signal_simple_error
732                                             ("Invalid value for 'direction",
733                                              value);
734                         }
735
736                         else if (EQ(keyword, Qfinal)) {
737                                 CHECK_CHAR_COERCE_INT(value);
738                                 final = XCHAR(value);
739                                 if (final < '0' || final > '~')
740                                         signal_simple_error
741                                             ("Invalid value for 'final", value);
742                         }
743
744                         else if (EQ(keyword, Qccl_program)) {
745                                 struct ccl_program test_ccl;
746
747                                 if (setup_ccl_program(&test_ccl, value) < 0)
748                                         signal_simple_error
749                                             ("Invalid value for 'ccl-program",
750                                              value);
751                                 ccl_program = value;
752                         }
753
754                         else
755                                 signal_simple_error("Unrecognized property",
756                                                     keyword);
757                 }
758         }
759
760         if (!final)
761                 error("'final must be specified");
762         if (dimension == 2 && final > 0x5F)
763                 signal_simple_error
764                     ("Final must be in the range 0x30 - 0x5F for dimension == 2",
765                      make_char(final));
766
767         if (dimension == 1)
768                 type = (chars == 94) ? CHARSET_TYPE_94 : CHARSET_TYPE_96;
769         else
770                 type = (chars == 94) ? CHARSET_TYPE_94X94 : CHARSET_TYPE_96X96;
771
772         if (!NILP(CHARSET_BY_ATTRIBUTES(type, final, CHARSET_LEFT_TO_RIGHT)) ||
773             !NILP(CHARSET_BY_ATTRIBUTES(type, final, CHARSET_RIGHT_TO_LEFT)))
774                 error
775                     ("Character set already defined for this DIMENSION/CHARS/FINAL combo");
776
777         id = get_unallocated_leading_byte(dimension);
778
779         if (NILP(doc_string))
780                 doc_string = build_string("");
781
782         if (NILP(registry))
783                 registry = build_string("");
784
785         if (NILP(short_name))
786                 XSETSTRING(short_name, XSYMBOL(name)->name);
787
788         if (NILP(long_name))
789                 long_name = doc_string;
790
791         if (columns == -1)
792                 columns = dimension;
793         charset = make_charset(id, name, dimension + 2, type, columns, graphic,
794                                final, direction, short_name, long_name,
795                                doc_string, registry);
796         if (!NILP(ccl_program))
797                 XCHARSET_CCL_PROGRAM(charset) = ccl_program;
798         return charset;
799 }
800
801 DEFUN("make-reverse-direction-charset", Fmake_reverse_direction_charset, 2, 2, 0,       /*
802 Make a charset equivalent to CHARSET but which goes in the opposite direction.
803 NEW-NAME is the name of the new charset.  Return the new charset.
804 */
805       (charset, new_name))
806 {
807         Lisp_Object new_charset = Qnil;
808         int id, dimension, columns, graphic, final;
809         int direction, type;
810         Lisp_Object registry, doc_string, short_name, long_name;
811         Lisp_Charset *cs;
812
813         charset = Fget_charset(charset);
814         if (!NILP(XCHARSET_REVERSE_DIRECTION_CHARSET(charset)))
815                 signal_simple_error
816                     ("Charset already has reverse-direction charset", charset);
817
818         CHECK_SYMBOL(new_name);
819         if (!NILP(Ffind_charset(new_name)))
820                 signal_simple_error("Cannot redefine existing charset",
821                                     new_name);
822
823         cs = XCHARSET(charset);
824
825         type = CHARSET_TYPE(cs);
826         columns = CHARSET_COLUMNS(cs);
827         dimension = CHARSET_DIMENSION(cs);
828         id = get_unallocated_leading_byte(dimension);
829
830         graphic = CHARSET_GRAPHIC(cs);
831         final = CHARSET_FINAL(cs);
832         direction = CHARSET_RIGHT_TO_LEFT;
833         if (CHARSET_DIRECTION(cs) == CHARSET_RIGHT_TO_LEFT)
834                 direction = CHARSET_LEFT_TO_RIGHT;
835         doc_string = CHARSET_DOC_STRING(cs);
836         short_name = CHARSET_SHORT_NAME(cs);
837         long_name = CHARSET_LONG_NAME(cs);
838         registry = CHARSET_REGISTRY(cs);
839
840         new_charset = make_charset(id, new_name, dimension + 2, type, columns,
841                                    graphic, final, direction, short_name,
842                                    long_name, doc_string, registry);
843
844         CHARSET_REVERSE_DIRECTION_CHARSET(cs) = new_charset;
845         XCHARSET_REVERSE_DIRECTION_CHARSET(new_charset) = charset;
846
847         return new_charset;
848 }
849
850 /* #### Reverse direction charsets not yet implemented.  */
851 #if 0
852 DEFUN("charset-reverse-direction-charset", Fcharset_reverse_direction_charset, 1, 1, 0, /*
853 Return the reverse-direction charset parallel to CHARSET, if any.
854 This is the charset with the same properties (in particular, the same
855 dimension, number of characters per dimension, and final byte) as
856 CHARSET but whose characters are displayed in the opposite direction.
857 */
858       (charset))
859 {
860         charset = Fget_charset(charset);
861         return XCHARSET_REVERSE_DIRECTION_CHARSET(charset);
862 }
863 #endif
864
865 DEFUN("charset-from-attributes", Fcharset_from_attributes, 3, 4, 0,     /*
866 Return a charset with the given DIMENSION, CHARS, FINAL, and DIRECTION.
867 If DIRECTION is omitted, both directions will be checked (left-to-right
868 will be returned if character sets exist for both directions).
869 */
870       (dimension, chars, final, direction))
871 {
872         int dm, ch, fi, di = -1;
873         int type;
874         Lisp_Object obj = Qnil;
875
876         CHECK_INT(dimension);
877         dm = XINT(dimension);
878         if (dm < 1 || dm > 2)
879                 signal_simple_error("Invalid value for DIMENSION", dimension);
880
881         CHECK_INT(chars);
882         ch = XINT(chars);
883         if (ch != 94 && ch != 96)
884                 signal_simple_error("Invalid value for CHARS", chars);
885
886         CHECK_CHAR_COERCE_INT(final);
887         fi = XCHAR(final);
888         if (fi < '0' || fi > '~')
889                 signal_simple_error("Invalid value for FINAL", final);
890
891         if (EQ(direction, Ql2r))
892                 di = CHARSET_LEFT_TO_RIGHT;
893         else if (EQ(direction, Qr2l))
894                 di = CHARSET_RIGHT_TO_LEFT;
895         else if (!NILP(direction))
896                 signal_simple_error("Invalid value for DIRECTION", direction);
897
898         if (dm == 2 && fi > 0x5F)
899                 signal_simple_error
900                     ("Final must be in the range 0x30 - 0x5F for dimension == 2",
901                      final);
902
903         if (dm == 1)
904                 type = (ch == 94) ? CHARSET_TYPE_94 : CHARSET_TYPE_96;
905         else
906                 type = (ch == 94) ? CHARSET_TYPE_94X94 : CHARSET_TYPE_96X96;
907
908         if (di == -1) {
909                 obj = CHARSET_BY_ATTRIBUTES(type, fi, CHARSET_LEFT_TO_RIGHT);
910                 if (NILP(obj))
911                         obj =
912                             CHARSET_BY_ATTRIBUTES(type, fi,
913                                                   CHARSET_RIGHT_TO_LEFT);
914         } else
915                 obj = CHARSET_BY_ATTRIBUTES(type, fi, di);
916
917         if (CHARSETP(obj))
918                 return XCHARSET_NAME(obj);
919         return obj;
920 }
921
922 DEFUN("charset-short-name", Fcharset_short_name, 1, 1, 0,       /*
923 Return short name of CHARSET.
924 */
925       (charset))
926 {
927         Lisp_Object tmp = Fget_charset(charset);
928         return XCHARSET_SHORT_NAME(tmp);
929 }
930
931 DEFUN("charset-long-name", Fcharset_long_name, 1, 1, 0, /*
932 Return long name of CHARSET.
933 */
934       (charset))
935 {
936         Lisp_Object tmp = Fget_charset(charset);
937         return XCHARSET_LONG_NAME(tmp);
938 }
939
940 DEFUN("charset-description", Fcharset_description, 1, 1, 0,     /*
941 Return description of CHARSET.
942 */
943       (charset))
944 {
945         Lisp_Object tmp = Fget_charset(charset);
946         return XCHARSET_DOC_STRING(tmp);
947 }
948
949 DEFUN("charset-dimension", Fcharset_dimension, 1, 1, 0, /*
950 Return dimension of CHARSET.
951 */
952       (charset))
953 {
954         Lisp_Object tmp = Fget_charset(charset);
955         return make_int(XCHARSET_DIMENSION(tmp));
956 }
957
958 DEFUN("charset-property", Fcharset_property, 2, 2, 0,   /*
959 Return property PROP of CHARSET, a charset object or symbol naming a charset.
960 Recognized properties are those listed in `make-charset', as well as
961 'name and 'doc-string.
962 */
963       (charset, prop))
964 {
965         Lisp_Charset *cs;
966
967         charset = Fget_charset(charset);
968         cs = XCHARSET(charset);
969
970         CHECK_SYMBOL(prop);
971         if (EQ(prop, Qname))
972                 return CHARSET_NAME(cs);
973         if (EQ(prop, Qshort_name))
974                 return CHARSET_SHORT_NAME(cs);
975         if (EQ(prop, Qlong_name))
976                 return CHARSET_LONG_NAME(cs);
977         if (EQ(prop, Qdoc_string))
978                 return CHARSET_DOC_STRING(cs);
979         if (EQ(prop, Qdimension))
980                 return make_int(CHARSET_DIMENSION(cs));
981         if (EQ(prop, Qcolumns))
982                 return make_int(CHARSET_COLUMNS(cs));
983         if (EQ(prop, Qgraphic))
984                 return make_int(CHARSET_GRAPHIC(cs));
985         if (EQ(prop, Qfinal))
986                 return make_char(CHARSET_FINAL(cs));
987         if (EQ(prop, Qchars))
988                 return make_int(CHARSET_CHARS(cs));
989         if (EQ(prop, Qregistry))
990                 return CHARSET_REGISTRY(cs);
991         if (EQ(prop, Qccl_program))
992                 return CHARSET_CCL_PROGRAM(cs);
993         if (EQ(prop, Qdirection))
994                 return CHARSET_DIRECTION(cs) ==
995                     CHARSET_LEFT_TO_RIGHT ? Ql2r : Qr2l;
996         if (EQ(prop, Qreverse_direction_charset)) {
997                 Lisp_Object obj = CHARSET_REVERSE_DIRECTION_CHARSET(cs);
998                 /* #### Is this translation OK?  If so, error checking sufficient? */
999                 return CHARSETP(obj) ? XCHARSET_NAME(obj) : obj;
1000         }
1001         signal_simple_error("Unrecognized charset property name", prop);
1002         return Qnil;            /* not reached */
1003 }
1004
1005 DEFUN("charset-id", Fcharset_id, 1, 1, 0,       /*
1006 Return charset identification number of CHARSET.
1007 */
1008       (charset))
1009 {
1010         Lisp_Object tmp = Fget_charset(charset);
1011         return make_int(XCHARSET_LEADING_BYTE(tmp));
1012 }
1013
1014 /* #### We need to figure out which properties we really want to
1015    allow to be set. */
1016
1017 DEFUN("set-charset-ccl-program", Fset_charset_ccl_program, 2, 2, 0,     /*
1018 Set the 'ccl-program property of CHARSET to CCL-PROGRAM.
1019 */
1020       (charset, ccl_program))
1021 {
1022         struct ccl_program test_ccl;
1023
1024         charset = Fget_charset(charset);
1025         if (setup_ccl_program(&test_ccl, ccl_program) < 0)
1026                 signal_simple_error("Invalid ccl-program", ccl_program);
1027         XCHARSET_CCL_PROGRAM(charset) = ccl_program;
1028         return Qnil;
1029 }
1030
1031 static void invalidate_charset_font_caches(Lisp_Object charset)
1032 {
1033         /* Invalidate font cache entries for charset on all devices. */
1034         Lisp_Object devcons, concons, hash_table;
1035         DEVICE_LOOP_NO_BREAK(devcons, concons) {
1036                 struct device *d = XDEVICE(XCAR(devcons));
1037                 hash_table = Fgethash(charset, d->charset_font_cache, Qunbound);
1038                 if (!UNBOUNDP(hash_table))
1039                         Fclrhash(hash_table);
1040         }
1041 }
1042
1043 /* Japanese folks may want to (set-charset-registry 'ascii "jisx0201") */
1044 DEFUN("set-charset-registry", Fset_charset_registry, 2, 2, 0,   /*
1045 Set the 'registry property of CHARSET to REGISTRY.
1046 */
1047       (charset, registry))
1048 {
1049         charset = Fget_charset(charset);
1050         CHECK_STRING(registry);
1051         XCHARSET_REGISTRY(charset) = registry;
1052         invalidate_charset_font_caches(charset);
1053         face_property_was_changed(Vdefault_face, Qfont, Qglobal);
1054         return Qnil;
1055 }
1056 \f
1057 /************************************************************************/
1058 /*              Lisp primitives for working with characters             */
1059 /************************************************************************/
1060
1061 DEFUN("make-char", Fmake_char, 2, 3, 0, /*
1062 Make a character from CHARSET and octets ARG1 and ARG2.
1063 ARG2 is required only for characters from two-dimensional charsets.
1064 For example, (make-char 'latin-iso8859-2 185) will return the Latin 2
1065 character s with caron.
1066 */
1067       (charset, arg1, arg2))
1068 {
1069         Lisp_Charset *cs;
1070         int a1, a2;
1071         int lowlim, highlim;
1072
1073         charset = Fget_charset(charset);
1074         cs = XCHARSET(charset);
1075
1076         if (EQ(charset, Vcharset_ascii))
1077                 lowlim = 0, highlim = 127;
1078         else if (EQ(charset, Vcharset_control_1))
1079                 lowlim = 0, highlim = 31;
1080         else if (CHARSET_CHARS(cs) == 94)
1081                 lowlim = 33, highlim = 126;
1082         else                    /* CHARSET_CHARS (cs) == 96) */
1083                 lowlim = 32, highlim = 127;
1084
1085         CHECK_INT(arg1);
1086         /* It is useful (and safe, according to Olivier Galibert) to strip
1087            the 8th bit off ARG1 and ARG2 because it allows programmers to
1088            write (make-char 'latin-iso8859-2 CODE) where code is the actual
1089            Latin 2 code of the character.  */
1090         a1 = XINT(arg1) & 0x7f;
1091         if (a1 < lowlim || a1 > highlim)
1092                 args_out_of_range_3(arg1, make_int(lowlim), make_int(highlim));
1093
1094         if (CHARSET_DIMENSION(cs) == 1) {
1095                 if (!NILP(arg2))
1096                         signal_simple_error
1097                             ("Charset is of dimension one; second octet must be nil",
1098                              arg2);
1099                 return make_char(MAKE_CHAR(charset, a1, 0));
1100         }
1101
1102         CHECK_INT(arg2);
1103         a2 = XINT(arg2) & 0x7f;
1104         if (a2 < lowlim || a2 > highlim)
1105                 args_out_of_range_3(arg2, make_int(lowlim), make_int(highlim));
1106
1107         return make_char(MAKE_CHAR(charset, a1, a2));
1108 }
1109
1110 DEFUN("char-charset", Fchar_charset, 1, 1, 0,   /*
1111 Return the character set of CHARACTER.
1112 */
1113       (character))
1114 {
1115         Lisp_Object tmp;
1116
1117         CHECK_CHAR_COERCE_INT(character);
1118
1119         tmp = CHARSET_BY_LEADING_BYTE(CHAR_LEADING_BYTE(XCHAR(character)));
1120         return XCHARSET_NAME(tmp);
1121 }
1122
1123 DEFUN("char-octet", Fchar_octet, 1, 2, 0,       /*
1124 Return the octet numbered N (should be 0 or 1) of CHARACTER.
1125 N defaults to 0 if omitted.
1126 */
1127       (character, n))
1128 {
1129         Lisp_Object charset;
1130         int octet0, octet1;
1131
1132         CHECK_CHAR_COERCE_INT(character);
1133
1134         BREAKUP_CHAR(XCHAR(character), charset, octet0, octet1);
1135
1136         if (NILP(n) || EQ(n, Qzero))
1137                 return make_int(octet0);
1138         else if (EQ(n, make_int(1)))
1139                 return make_int(octet1);
1140         else
1141                 signal_simple_error("Octet number must be 0 or 1", n);
1142 }
1143
1144 DEFUN("split-char", Fsplit_char, 1, 1, 0,       /*
1145 Return list of charset and one or two position-codes of CHARACTER.
1146 */
1147       (character))
1148 {
1149         /* This function can GC */
1150         struct gcpro gcpro1, gcpro2;
1151         Lisp_Object charset = Qnil;
1152         Lisp_Object rc = Qnil;
1153         int c1, c2;
1154         Lisp_Object tmp;
1155
1156         GCPRO2(charset, rc);
1157         CHECK_CHAR_COERCE_INT(character);
1158
1159         BREAKUP_CHAR(XCHAR(character), charset, c1, c2);
1160
1161         tmp = Fget_charset(charset);
1162         if (XCHARSET_DIMENSION(tmp) == 2) {
1163                 rc = list3(XCHARSET_NAME(charset), make_int(c1), make_int(c2));
1164         } else {
1165                 rc = list2(XCHARSET_NAME(charset), make_int(c1));
1166         }
1167         UNGCPRO;
1168
1169         return rc;
1170 }
1171 \f
1172 #ifdef ENABLE_COMPOSITE_CHARS
1173 /************************************************************************/
1174 /*                     composite character functions                    */
1175 /************************************************************************/
1176
1177 Emchar lookup_composite_char(Bufbyte * str, int len)
1178 {
1179         Lisp_Object lispstr = make_string(str, len);
1180         Lisp_Object ch = Fgethash(lispstr,
1181                                   Vcomposite_char_string2char_hash_table,
1182                                   Qunbound);
1183         Emchar emch;
1184
1185         if (UNBOUNDP(ch)) {
1186                 if (composite_char_row_next >= 128)
1187                         signal_simple_error("No more composite chars available",
1188                                             lispstr);
1189                 emch =
1190                     MAKE_CHAR(Vcharset_composite, composite_char_row_next,
1191                               composite_char_col_next);
1192                 Fputhash(make_char(emch), lispstr,
1193                          Vcomposite_char_char2string_hash_table);
1194                 Fputhash(lispstr, make_char(emch),
1195                          Vcomposite_char_string2char_hash_table);
1196                 composite_char_col_next++;
1197                 if (composite_char_col_next >= 128) {
1198                         composite_char_col_next = 32;
1199                         composite_char_row_next++;
1200                 }
1201         } else
1202                 emch = XCHAR(ch);
1203         return emch;
1204 }
1205
1206 Lisp_Object composite_char_string(Emchar ch)
1207 {
1208         Lisp_Object str = Fgethash(make_char(ch),
1209                                    Vcomposite_char_char2string_hash_table,
1210                                    Qunbound);
1211         assert(!UNBOUNDP(str));
1212         return str;
1213 }
1214
1215 xxDEFUN("make-composite-char", Fmake_composite_char, 1, 1, 0,   /*
1216 Convert a string into a single composite character.
1217 The character is the result of overstriking all the characters in
1218 the string.
1219                                                                  */
1220         (string))
1221 {
1222         CHECK_STRING(string);
1223         return make_char(lookup_composite_char(XSTRING_DATA(string),
1224                                                XSTRING_LENGTH(string)));
1225 }
1226
1227 xxDEFUN("composite-char-string", Fcomposite_char_string, 1, 1, 0,       /*
1228 Return a string of the characters comprising a composite character.
1229                                                                          */
1230         (ch))
1231 {
1232         Emchar emch;
1233
1234         CHECK_CHAR(ch);
1235         emch = XCHAR(ch);
1236         if (CHAR_LEADING_BYTE(emch) != LEADING_BYTE_COMPOSITE)
1237                 signal_simple_error("Must be composite char", ch);
1238         return composite_char_string(emch);
1239 }
1240 #endif                          /* ENABLE_COMPOSITE_CHARS */
1241 \f
1242 /************************************************************************/
1243 /*                            initialization                            */
1244 /************************************************************************/
1245
1246 void syms_of_mule_charset(void)
1247 {
1248         INIT_LRECORD_IMPLEMENTATION(charset);
1249
1250         DEFSUBR(Fcharsetp);
1251         DEFSUBR(Ffind_charset);
1252         DEFSUBR(Fget_charset);
1253         DEFSUBR(Fcharset_list);
1254         DEFSUBR(Fcharset_name);
1255         DEFSUBR(Fmake_charset);
1256         DEFSUBR(Fmake_reverse_direction_charset);
1257         /*  DEFSUBR (Freverse_direction_charset); */
1258         DEFSUBR(Fcharset_from_attributes);
1259         DEFSUBR(Fcharset_short_name);
1260         DEFSUBR(Fcharset_long_name);
1261         DEFSUBR(Fcharset_description);
1262         DEFSUBR(Fcharset_dimension);
1263         DEFSUBR(Fcharset_property);
1264         DEFSUBR(Fcharset_id);
1265         DEFSUBR(Fset_charset_ccl_program);
1266         DEFSUBR(Fset_charset_registry);
1267
1268         DEFSUBR(Fmake_char);
1269         DEFSUBR(Fchar_charset);
1270         DEFSUBR(Fchar_octet);
1271         DEFSUBR(Fsplit_char);
1272
1273 #ifdef ENABLE_COMPOSITE_CHARS
1274         DEFSUBR(Fmake_composite_char);
1275         DEFSUBR(Fcomposite_char_string);
1276 #endif
1277
1278         defsymbol(&Qcharsetp, "charsetp");
1279         defsymbol(&Qregistry, "registry");
1280         defsymbol(&Qfinal, "final");
1281         defsymbol(&Qgraphic, "graphic");
1282         defsymbol(&Qdirection, "direction");
1283         defsymbol(&Qreverse_direction_charset, "reverse-direction-charset");
1284         defsymbol(&Qshort_name, "short-name");
1285         defsymbol(&Qlong_name, "long-name");
1286
1287         defsymbol(&Ql2r, "l2r");
1288         defsymbol(&Qr2l, "r2l");
1289
1290         /* Charsets, compatible with FSF 20.3
1291            Naming convention is Script-Charset[-Edition] */
1292         defsymbol(&Qascii, "ascii");
1293         defsymbol(&Qcontrol_1, "control-1");
1294         defsymbol(&Qlatin_iso8859_1, "latin-iso8859-1");
1295         defsymbol(&Qlatin_iso8859_2, "latin-iso8859-2");
1296         defsymbol(&Qlatin_iso8859_3, "latin-iso8859-3");
1297         defsymbol(&Qlatin_iso8859_4, "latin-iso8859-4");
1298         defsymbol(&Qthai_tis620, "thai-tis620");
1299         defsymbol(&Qgreek_iso8859_7, "greek-iso8859-7");
1300         defsymbol(&Qarabic_iso8859_6, "arabic-iso8859-6");
1301         defsymbol(&Qhebrew_iso8859_8, "hebrew-iso8859-8");
1302         defsymbol(&Qkatakana_jisx0201, "katakana-jisx0201");
1303         defsymbol(&Qlatin_jisx0201, "latin-jisx0201");
1304         defsymbol(&Qcyrillic_iso8859_5, "cyrillic-iso8859-5");
1305         defsymbol(&Qlatin_iso8859_9, "latin-iso8859-9");
1306         defsymbol(&Qjapanese_jisx0208_1978, "japanese-jisx0208-1978");
1307         defsymbol(&Qchinese_gb2312, "chinese-gb2312");
1308         defsymbol(&Qjapanese_jisx0208, "japanese-jisx0208");
1309         defsymbol(&Qkorean_ksc5601, "korean-ksc5601");
1310         defsymbol(&Qjapanese_jisx0212, "japanese-jisx0212");
1311         defsymbol(&Qchinese_cns11643_1, "chinese-cns11643-1");
1312         defsymbol(&Qchinese_cns11643_2, "chinese-cns11643-2");
1313         defsymbol(&Qchinese_big5_1, "chinese-big5-1");
1314         defsymbol(&Qchinese_big5_2, "chinese-big5-2");
1315
1316         defsymbol(&Qcomposite, "composite");
1317 }
1318
1319 void vars_of_mule_charset(void)
1320 {
1321         int i, j, k;
1322
1323         chlook = xnew_and_zero(struct charset_lookup);  /* zero for Purify. */
1324         dump_add_root_struct_ptr(&chlook, &charset_lookup_description);
1325
1326         /* Table of charsets indexed by leading byte. */
1327         for (i = 0; i < countof(chlook->charset_by_leading_byte); i++)
1328                 chlook->charset_by_leading_byte[i] = Qnil;
1329
1330         /* Table of charsets indexed by type/final-byte/direction. */
1331         for (i = 0; i < countof(chlook->charset_by_attributes); i++)
1332                 for (j = 0; j < countof(chlook->charset_by_attributes[0]); j++)
1333                         for (k = 0;
1334                              k < countof(chlook->charset_by_attributes[0][0]);
1335                              k++)
1336                                 chlook->charset_by_attributes[i][j][k] = Qnil;
1337
1338         chlook->next_allocated_1_byte_leading_byte = MIN_LEADING_BYTE_PRIVATE_1;
1339         chlook->next_allocated_2_byte_leading_byte = MIN_LEADING_BYTE_PRIVATE_2;
1340 }
1341
1342 void complex_vars_of_mule_charset(void)
1343 {
1344         staticpro(&Vcharset_hash_table);
1345         Vcharset_hash_table =
1346             make_lisp_hash_table(50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
1347
1348         /* Predefined character sets.  We store them into variables for
1349            ease of access. */
1350
1351         staticpro(&Vcharset_ascii);
1352         Vcharset_ascii =
1353             make_charset(LEADING_BYTE_ASCII, Qascii, 1,
1354                          CHARSET_TYPE_94, 1, 0, 'B',
1355                          CHARSET_LEFT_TO_RIGHT,
1356                          build_string("ASCII"),
1357                          build_string("ASCII)"),
1358                          build_string("ASCII (ISO646 IRV)"),
1359                          build_string("\\(iso8859-[0-9]*\\|-ascii\\)"));
1360         staticpro(&Vcharset_control_1);
1361         Vcharset_control_1 =
1362             make_charset(LEADING_BYTE_CONTROL_1, Qcontrol_1, 2,
1363                          CHARSET_TYPE_94, 1, 1, 0,
1364                          CHARSET_LEFT_TO_RIGHT,
1365                          build_string("C1"),
1366                          build_string("Control characters"),
1367                          build_string("Control characters 128-191"),
1368                          build_string(""));
1369         staticpro(&Vcharset_latin_iso8859_1);
1370         Vcharset_latin_iso8859_1 =
1371             make_charset(LEADING_BYTE_LATIN_ISO8859_1, Qlatin_iso8859_1, 2,
1372                          CHARSET_TYPE_96, 1, 1, 'A',
1373                          CHARSET_LEFT_TO_RIGHT,
1374                          build_string("Latin-1"),
1375                          build_string("ISO8859-1 (Latin-1)"),
1376                          build_string("ISO8859-1 (Latin-1)"),
1377                          build_string("iso8859-1"));
1378         staticpro(&Vcharset_latin_iso8859_2);
1379         Vcharset_latin_iso8859_2 =
1380             make_charset(LEADING_BYTE_LATIN_ISO8859_2, Qlatin_iso8859_2, 2,
1381                          CHARSET_TYPE_96, 1, 1, 'B',
1382                          CHARSET_LEFT_TO_RIGHT,
1383                          build_string("Latin-2"),
1384                          build_string("ISO8859-2 (Latin-2)"),
1385                          build_string("ISO8859-2 (Latin-2)"),
1386                          build_string("iso8859-2"));
1387         staticpro(&Vcharset_latin_iso8859_3);
1388         Vcharset_latin_iso8859_3 =
1389             make_charset(LEADING_BYTE_LATIN_ISO8859_3, Qlatin_iso8859_3, 2,
1390                          CHARSET_TYPE_96, 1, 1, 'C',
1391                          CHARSET_LEFT_TO_RIGHT,
1392                          build_string("Latin-3"),
1393                          build_string("ISO8859-3 (Latin-3)"),
1394                          build_string("ISO8859-3 (Latin-3)"),
1395                          build_string("iso8859-3"));
1396         staticpro(&Vcharset_latin_iso8859_4);
1397         Vcharset_latin_iso8859_4 =
1398             make_charset(LEADING_BYTE_LATIN_ISO8859_4, Qlatin_iso8859_4, 2,
1399                          CHARSET_TYPE_96, 1, 1, 'D',
1400                          CHARSET_LEFT_TO_RIGHT,
1401                          build_string("Latin-4"),
1402                          build_string("ISO8859-4 (Latin-4)"),
1403                          build_string("ISO8859-4 (Latin-4)"),
1404                          build_string("iso8859-4"));
1405         staticpro(&Vcharset_thai_tis620);
1406         Vcharset_thai_tis620 =
1407             make_charset(LEADING_BYTE_THAI_TIS620, Qthai_tis620, 2,
1408                          CHARSET_TYPE_96, 1, 1, 'T',
1409                          CHARSET_LEFT_TO_RIGHT,
1410                          build_string("TIS620"),
1411                          build_string("TIS620 (Thai)"),
1412                          build_string("TIS620.2529 (Thai)"),
1413                          build_string("tis620"));
1414         staticpro(&Vcharset_greek_iso8859_7);
1415         Vcharset_greek_iso8859_7 =
1416             make_charset(LEADING_BYTE_GREEK_ISO8859_7, Qgreek_iso8859_7, 2,
1417                          CHARSET_TYPE_96, 1, 1, 'F',
1418                          CHARSET_LEFT_TO_RIGHT,
1419                          build_string("ISO8859-7"),
1420                          build_string("ISO8859-7 (Greek)"),
1421                          build_string("ISO8859-7 (Greek)"),
1422                          build_string("iso8859-7"));
1423         staticpro(&Vcharset_arabic_iso8859_6);
1424         Vcharset_arabic_iso8859_6 =
1425             make_charset(LEADING_BYTE_ARABIC_ISO8859_6, Qarabic_iso8859_6, 2,
1426                          CHARSET_TYPE_96, 1, 1, 'G',
1427                          CHARSET_RIGHT_TO_LEFT,
1428                          build_string("ISO8859-6"),
1429                          build_string("ISO8859-6 (Arabic)"),
1430                          build_string("ISO8859-6 (Arabic)"),
1431                          build_string("iso8859-6"));
1432         staticpro(&Vcharset_hebrew_iso8859_8);
1433         Vcharset_hebrew_iso8859_8 =
1434             make_charset(LEADING_BYTE_HEBREW_ISO8859_8, Qhebrew_iso8859_8, 2,
1435                          CHARSET_TYPE_96, 1, 1, 'H',
1436                          CHARSET_RIGHT_TO_LEFT,
1437                          build_string("ISO8859-8"),
1438                          build_string("ISO8859-8 (Hebrew)"),
1439                          build_string("ISO8859-8 (Hebrew)"),
1440                          build_string("iso8859-8"));
1441         staticpro(&Vcharset_katakana_jisx0201);
1442         Vcharset_katakana_jisx0201 =
1443             make_charset(LEADING_BYTE_KATAKANA_JISX0201, Qkatakana_jisx0201, 2,
1444                          CHARSET_TYPE_94, 1, 1, 'I',
1445                          CHARSET_LEFT_TO_RIGHT,
1446                          build_string("JISX0201 Kana"),
1447                          build_string("JISX0201.1976 (Japanese Kana)"),
1448                          build_string("JISX0201.1976 Japanese Kana"),
1449                          build_string("jisx0201.1976"));
1450         staticpro(&Vcharset_latin_jisx0201);
1451         Vcharset_latin_jisx0201 =
1452             make_charset(LEADING_BYTE_LATIN_JISX0201, Qlatin_jisx0201, 2,
1453                          CHARSET_TYPE_94, 1, 0, 'J',
1454                          CHARSET_LEFT_TO_RIGHT,
1455                          build_string("JISX0201 Roman"),
1456                          build_string("JISX0201.1976 (Japanese Roman)"),
1457                          build_string("JISX0201.1976 Japanese Roman"),
1458                          build_string("jisx0201.1976"));
1459         staticpro(&Vcharset_cyrillic_iso8859_5);
1460         Vcharset_cyrillic_iso8859_5 =
1461             make_charset(LEADING_BYTE_CYRILLIC_ISO8859_5, Qcyrillic_iso8859_5,
1462                          2, CHARSET_TYPE_96, 1, 1, 'L', CHARSET_LEFT_TO_RIGHT,
1463                          build_string("ISO8859-5"),
1464                          build_string("ISO8859-5 (Cyrillic)"),
1465                          build_string("ISO8859-5 (Cyrillic)"),
1466                          build_string("iso8859-5"));
1467         staticpro(&Vcharset_latin_iso8859_9);
1468         Vcharset_latin_iso8859_9 =
1469             make_charset(LEADING_BYTE_LATIN_ISO8859_9, Qlatin_iso8859_9, 2,
1470                          CHARSET_TYPE_96, 1, 1, 'M',
1471                          CHARSET_LEFT_TO_RIGHT,
1472                          build_string("Latin-5"),
1473                          build_string("ISO8859-9 (Latin-5)"),
1474                          build_string("ISO8859-9 (Latin-5)"),
1475                          build_string("iso8859-9"));
1476         staticpro(&Vcharset_japanese_jisx0208_1978);
1477         Vcharset_japanese_jisx0208_1978 =
1478             make_charset(LEADING_BYTE_JAPANESE_JISX0208_1978,
1479                          Qjapanese_jisx0208_1978, 3, CHARSET_TYPE_94X94, 2, 0,
1480                          '@', CHARSET_LEFT_TO_RIGHT,
1481                          build_string("JISX0208.1978"),
1482                          build_string("JISX0208.1978 (Japanese)"),
1483                          build_string
1484                          ("JISX0208.1978 Japanese Kanji (so called \"old JIS\")"),
1485                          build_string("\\(jisx0208\\|jisc6226\\)\\.1978"));
1486         staticpro(&Vcharset_chinese_gb2312);
1487         Vcharset_chinese_gb2312 =
1488             make_charset(LEADING_BYTE_CHINESE_GB2312, Qchinese_gb2312, 3,
1489                          CHARSET_TYPE_94X94, 2, 0, 'A',
1490                          CHARSET_LEFT_TO_RIGHT,
1491                          build_string("GB2312"),
1492                          build_string("GB2312)"),
1493                          build_string("GB2312 Chinese simplified"),
1494                          build_string("gb2312"));
1495         staticpro(&Vcharset_japanese_jisx0208);
1496         Vcharset_japanese_jisx0208 =
1497             make_charset(LEADING_BYTE_JAPANESE_JISX0208, Qjapanese_jisx0208, 3,
1498                          CHARSET_TYPE_94X94, 2, 0, 'B',
1499                          CHARSET_LEFT_TO_RIGHT,
1500                          build_string("JISX0208"),
1501                          build_string("JISX0208.1983/1990 (Japanese)"),
1502                          build_string("JISX0208.1983/1990 Japanese Kanji"),
1503                          build_string("jisx0208.19\\(83\\|90\\)"));
1504         staticpro(&Vcharset_korean_ksc5601);
1505         Vcharset_korean_ksc5601 =
1506             make_charset(LEADING_BYTE_KOREAN_KSC5601, Qkorean_ksc5601, 3,
1507                          CHARSET_TYPE_94X94, 2, 0, 'C',
1508                          CHARSET_LEFT_TO_RIGHT,
1509                          build_string("KSC5601"),
1510                          build_string("KSC5601 (Korean"),
1511                          build_string("KSC5601 Korean Hangul and Hanja"),
1512                          build_string("ksc5601"));
1513         staticpro(&Vcharset_japanese_jisx0212);
1514         Vcharset_japanese_jisx0212 =
1515             make_charset(LEADING_BYTE_JAPANESE_JISX0212, Qjapanese_jisx0212, 3,
1516                          CHARSET_TYPE_94X94, 2, 0, 'D',
1517                          CHARSET_LEFT_TO_RIGHT,
1518                          build_string("JISX0212"),
1519                          build_string("JISX0212 (Japanese)"),
1520                          build_string("JISX0212 Japanese Supplement"),
1521                          build_string("jisx0212"));
1522
1523 #define CHINESE_CNS_PLANE_RE(n) "cns11643[.-]\\(.*[.-]\\)?" n "$"
1524         staticpro(&Vcharset_chinese_cns11643_1);
1525         Vcharset_chinese_cns11643_1 =
1526             make_charset(LEADING_BYTE_CHINESE_CNS11643_1, Qchinese_cns11643_1,
1527                          3, CHARSET_TYPE_94X94, 2, 0, 'G',
1528                          CHARSET_LEFT_TO_RIGHT, build_string("CNS11643-1"),
1529                          build_string("CNS11643-1 (Chinese traditional)"),
1530                          build_string("CNS 11643 Plane 1 Chinese traditional"),
1531                          build_string(CHINESE_CNS_PLANE_RE("1")));
1532         staticpro(&Vcharset_chinese_cns11643_2);
1533         Vcharset_chinese_cns11643_2 =
1534             make_charset(LEADING_BYTE_CHINESE_CNS11643_2, Qchinese_cns11643_2,
1535                          3, CHARSET_TYPE_94X94, 2, 0, 'H',
1536                          CHARSET_LEFT_TO_RIGHT, build_string("CNS11643-2"),
1537                          build_string("CNS11643-2 (Chinese traditional)"),
1538                          build_string("CNS 11643 Plane 2 Chinese traditional"),
1539                          build_string(CHINESE_CNS_PLANE_RE("2")));
1540         staticpro(&Vcharset_chinese_big5_1);
1541         Vcharset_chinese_big5_1 =
1542             make_charset(LEADING_BYTE_CHINESE_BIG5_1, Qchinese_big5_1, 3,
1543                          CHARSET_TYPE_94X94, 2, 0, '0',
1544                          CHARSET_LEFT_TO_RIGHT,
1545                          build_string("Big5"),
1546                          build_string("Big5 (Level-1)"),
1547                          build_string
1548                          ("Big5 Level-1 Chinese traditional"),
1549                          build_string("big5"));
1550         staticpro(&Vcharset_chinese_big5_2);
1551         Vcharset_chinese_big5_2 =
1552             make_charset(LEADING_BYTE_CHINESE_BIG5_2, Qchinese_big5_2, 3,
1553                          CHARSET_TYPE_94X94, 2, 0, '1',
1554                          CHARSET_LEFT_TO_RIGHT,
1555                          build_string("Big5"),
1556                          build_string("Big5 (Level-2)"),
1557                          build_string
1558                          ("Big5 Level-2 Chinese traditional"),
1559                          build_string("big5"));
1560
1561 #ifdef ENABLE_COMPOSITE_CHARS
1562         /* #### For simplicity, we put composite chars into a 96x96 charset.
1563            This is going to lead to problems because you can run out of
1564            room, esp. as we don't yet recycle numbers. */
1565         staticpro(&Vcharset_composite);
1566         Vcharset_composite =
1567             make_charset(LEADING_BYTE_COMPOSITE, Qcomposite, 3,
1568                          CHARSET_TYPE_96X96, 2, 0, 0,
1569                          CHARSET_LEFT_TO_RIGHT,
1570                          build_string("Composite"),
1571                          build_string("Composite characters"),
1572                          build_string("Composite characters"),
1573                          build_string(""));
1574
1575         /* #### not dumped properly */
1576         composite_char_row_next = 32;
1577         composite_char_col_next = 32;
1578
1579         Vcomposite_char_string2char_hash_table =
1580             make_lisp_hash_table(500, HASH_TABLE_NON_WEAK, HASH_TABLE_EQUAL);
1581         Vcomposite_char_char2string_hash_table =
1582             make_lisp_hash_table(500, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
1583         staticpro(&Vcomposite_char_string2char_hash_table);
1584         staticpro(&Vcomposite_char_char2string_hash_table);
1585 #endif                          /* ENABLE_COMPOSITE_CHARS */
1586
1587 }