1 /* Functions to handle multilingual characters.
2 Copyright (C) 1992, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of SXEmacs
7 SXEmacs is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
12 SXEmacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
21 /* Synched up with: FSF 20.3. Not in FSF. */
23 /* Rewritten by Ben Wing <ben@xemacs.org>. */
32 #include "ui/device.h"
36 /* The various pre-defined charsets. */
38 Lisp_Object Vcharset_ascii;
39 Lisp_Object Vcharset_control_1;
40 Lisp_Object Vcharset_latin_iso8859_1;
41 Lisp_Object Vcharset_latin_iso8859_2;
42 Lisp_Object Vcharset_latin_iso8859_3;
43 Lisp_Object Vcharset_latin_iso8859_4;
44 Lisp_Object Vcharset_thai_tis620;
45 Lisp_Object Vcharset_greek_iso8859_7;
46 Lisp_Object Vcharset_arabic_iso8859_6;
47 Lisp_Object Vcharset_hebrew_iso8859_8;
48 Lisp_Object Vcharset_katakana_jisx0201;
49 Lisp_Object Vcharset_latin_jisx0201;
50 Lisp_Object Vcharset_cyrillic_iso8859_5;
51 Lisp_Object Vcharset_latin_iso8859_9;
52 Lisp_Object Vcharset_japanese_jisx0208_1978;
53 Lisp_Object Vcharset_chinese_gb2312;
54 Lisp_Object Vcharset_japanese_jisx0208;
55 Lisp_Object Vcharset_korean_ksc5601;
56 Lisp_Object Vcharset_japanese_jisx0212;
57 Lisp_Object Vcharset_chinese_cns11643_1;
58 Lisp_Object Vcharset_chinese_cns11643_2;
59 Lisp_Object Vcharset_chinese_big5_1;
60 Lisp_Object Vcharset_chinese_big5_2;
62 #ifdef ENABLE_COMPOSITE_CHARS
63 Lisp_Object Vcharset_composite;
65 /* Hash tables for composite chars. One maps string representing
66 composed chars to their equivalent chars; one goes the
68 Lisp_Object Vcomposite_char_char2string_hash_table;
69 Lisp_Object Vcomposite_char_string2char_hash_table;
71 static int composite_char_row_next;
72 static int composite_char_col_next;
74 #endif /* ENABLE_COMPOSITE_CHARS */
76 struct charset_lookup *chlook;
78 static const struct lrecord_description charset_lookup_description_1[] = {
79 {XD_LISP_OBJECT_ARRAY,
80 offsetof(struct charset_lookup, charset_by_leading_byte),
85 static const struct struct_description charset_lookup_description = {
86 sizeof(struct charset_lookup),
87 charset_lookup_description_1
90 /* Table of number of bytes in the string representation of a character
91 indexed by the first byte of that representation.
93 rep_bytes_by_first_byte(c) is more efficient than the equivalent
94 canonical computation:
96 XCHARSET_REP_BYTES (CHARSET_BY_LEADING_BYTE (c)) */
98 const Bytecount rep_bytes_by_first_byte[0xA0] = { /* 0x00 - 0x7f are for straight ASCII */
99 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
100 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
101 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
102 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
103 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
104 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
105 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
106 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
107 /* 0x80 - 0x8f are for Dimension-1 official charsets */
108 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
109 /* 0x90 - 0x9d are for Dimension-2 official charsets */
110 /* 0x9e is for Dimension-1 private charsets */
111 /* 0x9f is for Dimension-2 private charsets */
112 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4
115 Lisp_Object Qcharsetp;
117 /* Qdoc_string, Qdimension, Qchars defined in general.c */
118 Lisp_Object Qregistry, Qfinal, Qgraphic;
119 Lisp_Object Qdirection;
120 Lisp_Object Qreverse_direction_charset;
121 Lisp_Object Qleading_byte;
122 Lisp_Object Qshort_name, Qlong_name;
138 Qjapanese_jisx0208_1978,
144 Qchinese_cns11643_2, Qchinese_big5_1, Qchinese_big5_2, Qcomposite;
146 Lisp_Object Ql2r, Qr2l;
148 Lisp_Object Vcharset_hash_table;
150 /* Composite characters are characters constructed by overstriking two
151 or more regular characters.
153 1) The old Mule implementation involves storing composite characters
154 in a buffer as a tag followed by all of the actual characters
155 used to make up the composite character. I think this is a bad
156 idea; it greatly complicates code that wants to handle strings
157 one character at a time because it has to deal with the possibility
158 of great big ungainly characters. It's much more reasonable to
159 simply store an index into a table of composite characters.
161 2) The current implementation only allows for 16,384 separate
162 composite characters over the lifetime of the SXEmacs process.
163 This could become a potential problem if the user
164 edited lots of different files that use composite characters.
165 Due to FSF bogosity, increasing the number of allowable
166 composite characters under Mule would decrease the number
167 of possible faces that can exist. Mule already has shrunk
168 this to 2048, and further shrinkage would become uncomfortable.
169 No such problems exist in SXEmacs.
171 Composite characters could be represented as 0x80 C1 C2 C3,
172 where each C[1-3] is in the range 0xA0 - 0xFF. This allows
173 for slightly under 2^20 (one million) composite characters
174 over the SXEmacs process lifetime, and you only need to
175 increase the size of a Mule character from 19 to 21 bits.
176 Or you could use 0x80 C1 C2 C3 C4, allowing for about
177 85 million (slightly over 2^26) composite characters. */
179 /************************************************************************/
180 /* Basic Emchar functions */
181 /************************************************************************/
183 /* Convert a non-ASCII Mule character C into a one-character Mule-encoded
184 string in STR. Returns the number of bytes stored.
185 Do not call this directly. Use the macro set_charptr_emchar() instead.
188 Bytecount non_ascii_set_charptr_emchar(Bufbyte * str, Emchar c)
196 BREAKUP_CHAR(c, charset, c1, c2);
197 lb = CHAR_LEADING_BYTE(c);
198 if (LEADING_BYTE_PRIVATE_P(lb))
199 *p++ = PRIVATE_LEADING_BYTE_PREFIX(lb);
201 if (EQ(charset, Vcharset_control_1))
210 /* Return the first character from a Mule-encoded string in STR,
211 assuming it's non-ASCII. Do not call this directly.
212 Use the macro charptr_emchar() instead. */
214 Emchar non_ascii_charptr_emchar(const Bufbyte * str)
216 Bufbyte i0 = *str, i1, i2 = 0;
219 if (i0 == LEADING_BYTE_CONTROL_1)
220 return (Emchar) (*++str - 0x20);
222 if (LEADING_BYTE_PREFIX_P(i0))
227 charset = CHARSET_BY_LEADING_BYTE(i0);
228 if (XCHARSET_DIMENSION(charset) == 2)
231 return MAKE_CHAR(charset, i1, i2);
234 /* Return whether CH is a valid Emchar, assuming it's non-ASCII.
235 Do not call this directly. Use the macro valid_char_p() instead. */
237 int non_ascii_valid_char_p(Emchar ch)
241 /* Must have only lowest 19 bits set */
245 f1 = CHAR_FIELD1(ch);
246 f2 = CHAR_FIELD2(ch);
247 f3 = CHAR_FIELD3(ch);
252 if (f2 < MIN_CHAR_FIELD2_OFFICIAL ||
253 (f2 > MAX_CHAR_FIELD2_OFFICIAL
254 && f2 < MIN_CHAR_FIELD2_PRIVATE)
255 || f2 > MAX_CHAR_FIELD2_PRIVATE)
260 if (f3 != 0x20 && f3 != 0x7F
261 && !(f2 >= MIN_CHAR_FIELD2_PRIVATE
262 && f2 <= MAX_CHAR_FIELD2_PRIVATE))
266 NOTE: This takes advantage of the fact that
267 FIELD2_TO_OFFICIAL_LEADING_BYTE and
268 FIELD2_TO_PRIVATE_LEADING_BYTE are the same.
271 CHARSET_BY_LEADING_BYTE(f2 +
272 FIELD2_TO_OFFICIAL_LEADING_BYTE);
273 if (EQ(charset, Qnil))
275 return (XCHARSET_CHARS(charset) == 96);
279 if (f1 < MIN_CHAR_FIELD1_OFFICIAL ||
280 (f1 > MAX_CHAR_FIELD1_OFFICIAL
281 && f1 < MIN_CHAR_FIELD1_PRIVATE)
282 || f1 > MAX_CHAR_FIELD1_PRIVATE)
284 if (f2 < 0x20 || f3 < 0x20)
287 #ifdef ENABLE_COMPOSITE_CHARS
288 if (f1 + FIELD1_TO_OFFICIAL_LEADING_BYTE ==
289 LEADING_BYTE_COMPOSITE) {
293 Vcomposite_char_char2string_hash_table,
298 #endif /* ENABLE_COMPOSITE_CHARS */
300 if (f2 != 0x20 && f2 != 0x7F && f3 != 0x20 && f3 != 0x7F
301 && !(f1 >= MIN_CHAR_FIELD1_PRIVATE
302 && f1 <= MAX_CHAR_FIELD1_PRIVATE))
305 if (f1 <= MAX_CHAR_FIELD1_OFFICIAL)
307 CHARSET_BY_LEADING_BYTE(f1 +
308 FIELD1_TO_OFFICIAL_LEADING_BYTE);
311 CHARSET_BY_LEADING_BYTE(f1 +
312 FIELD1_TO_PRIVATE_LEADING_BYTE);
314 if (EQ(charset, Qnil))
316 return (XCHARSET_CHARS(charset) == 96);
320 /************************************************************************/
321 /* Basic string functions */
322 /************************************************************************/
324 /* Copy the character pointed to by SRC into DST. Do not call this
325 directly. Use the macro charptr_copy_char() instead.
326 Return the number of bytes copied. */
328 Bytecount non_ascii_charptr_copy_char(const Bufbyte * src, Bufbyte * dst)
330 unsigned int bytes = REP_BYTES_BY_FIRST_BYTE(*src);
332 for (i = bytes; i; i--, dst++, src++)
337 /************************************************************************/
338 /* streams of Emchars */
339 /************************************************************************/
341 /* Treat a stream as a stream of Emchar's rather than a stream of bytes.
342 The functions below are not meant to be called directly; use
343 the macros in insdel.h. */
345 Emchar Lstream_get_emchar_1(Lstream * stream, int ch)
347 Bufbyte str[MAX_EMCHAR_LEN];
348 Bufbyte *strptr = str;
351 str[0] = (Bufbyte) ch;
353 for (bytes = REP_BYTES_BY_FIRST_BYTE(ch) - 1; bytes; bytes--) {
354 int c = Lstream_getc(stream);
355 bufpos_checking_assert(c >= 0);
356 *++strptr = (Bufbyte) c;
358 return charptr_emchar(str);
361 int Lstream_fput_emchar(Lstream * stream, Emchar ch)
363 Bufbyte str[MAX_EMCHAR_LEN];
364 Bytecount len = set_charptr_emchar(str, ch);
365 return Lstream_write(stream, str, len);
368 void Lstream_funget_emchar(Lstream * stream, Emchar ch)
370 Bufbyte str[MAX_EMCHAR_LEN];
371 Bytecount len = set_charptr_emchar(str, ch);
372 Lstream_unread(stream, str, len);
375 /************************************************************************/
377 /************************************************************************/
379 static Lisp_Object mark_charset(Lisp_Object obj)
381 Lisp_Charset *cs = XCHARSET(obj);
383 mark_object(cs->short_name);
384 mark_object(cs->long_name);
385 mark_object(cs->doc_string);
386 mark_object(cs->registry);
387 mark_object(cs->ccl_program);
392 print_charset(Lisp_Object obj, Lisp_Object printcharfun, int escapeflag)
394 Lisp_Charset *cs = XCHARSET(obj);
397 error("printing unreadable object #<charset %s 0x%x>",
398 string_data(XSYMBOL(CHARSET_NAME(cs))->name),
401 write_c_string("#<charset ", printcharfun);
402 print_internal(CHARSET_NAME(cs), printcharfun, 0);
403 write_c_string(" ", printcharfun);
404 print_internal(CHARSET_SHORT_NAME(cs), printcharfun, 1);
405 write_c_string(" ", printcharfun);
406 print_internal(CHARSET_LONG_NAME(cs), printcharfun, 1);
407 write_c_string(" ", printcharfun);
408 print_internal(CHARSET_DOC_STRING(cs), printcharfun, 1);
409 write_fmt_string(printcharfun, " %s %s cols=%d g%d final='%c' reg=",
410 (CHARSET_TYPE(cs) == CHARSET_TYPE_94 ? "94" :
411 CHARSET_TYPE(cs) == CHARSET_TYPE_96 ? "96" :
412 CHARSET_TYPE(cs) == CHARSET_TYPE_94X94 ? "94x94" :
414 (CHARSET_DIRECTION(cs) == CHARSET_LEFT_TO_RIGHT ? "l2r" : "r2l"),
415 CHARSET_COLUMNS(cs), CHARSET_GRAPHIC(cs), CHARSET_FINAL(cs));
416 print_internal(CHARSET_REGISTRY(cs), printcharfun, 0);
417 write_fmt_str(printcharfun, " 0x%x>", cs->header.uid);
420 static const struct lrecord_description charset_description[] = {
421 {XD_LISP_OBJECT, offsetof(Lisp_Charset, name)},
422 {XD_LISP_OBJECT, offsetof(Lisp_Charset, doc_string)},
423 {XD_LISP_OBJECT, offsetof(Lisp_Charset, registry)},
424 {XD_LISP_OBJECT, offsetof(Lisp_Charset, short_name)},
425 {XD_LISP_OBJECT, offsetof(Lisp_Charset, long_name)},
426 {XD_LISP_OBJECT, offsetof(Lisp_Charset, reverse_direction_charset)},
427 {XD_LISP_OBJECT, offsetof(Lisp_Charset, ccl_program)},
431 DEFINE_LRECORD_IMPLEMENTATION("charset", charset,
432 mark_charset, print_charset, 0, 0, 0,
433 charset_description, Lisp_Charset);
435 /* Make a new charset. */
436 /* #### SJT Should generic properties be allowed? */
438 make_charset(int id, Lisp_Object name, unsigned char rep_bytes,
439 unsigned char type, unsigned char columns, unsigned char graphic,
440 Bufbyte final, unsigned char direction, Lisp_Object short_name,
441 Lisp_Object long_name, Lisp_Object doc, Lisp_Object reg)
444 Lisp_Charset *cs = alloc_lcrecord_type(Lisp_Charset, &lrecord_charset);