1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of SXEmacs
7 SXEmacs is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
12 SXEmacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
21 /* Synched up with: Mule 2.3. Not in FSF. */
23 /* Rewritten by Ben Wing <ben@xemacs.org>. */
30 #include "ui/insdel.h"
37 #include "file-coding.h"
39 Lisp_Object Qcoding_system_error;
41 Lisp_Object Vkeyboard_coding_system;
42 Lisp_Object Vterminal_coding_system;
43 Lisp_Object Vcoding_system_for_read;
44 Lisp_Object Vcoding_system_for_write;
45 Lisp_Object Vfile_name_coding_system;
47 /* Table of symbols identifying each coding category. */
48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
50 struct file_coding_dump {
51 /* Coding system currently associated with each coding category. */
52 Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
54 /* Table of all coding categories in decreasing order of priority.
55 This describes a permutation of the possible coding categories. */
56 int coding_category_by_priority[CODING_CATEGORY_LAST];
59 Lisp_Object ucs_to_mule_table[65536];
63 static const struct lrecord_description fcd_description_1[] = {
64 {XD_LISP_OBJECT_ARRAY,
65 offsetof(struct file_coding_dump, coding_category_system),
66 CODING_CATEGORY_LAST},
68 {XD_LISP_OBJECT_ARRAY,
69 offsetof(struct file_coding_dump, ucs_to_mule_table),
70 countof(fcd->ucs_to_mule_table)},
75 static const struct struct_description fcd_description = {
76 sizeof(struct file_coding_dump),
80 Lisp_Object mule_to_ucs_table;
82 Lisp_Object Qcoding_systemp;
84 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
85 /* Qinternal in general.c */
87 Lisp_Object Qmnemonic, Qeol_type;
88 Lisp_Object Qcr, Qcrlf, Qlf;
89 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
90 Lisp_Object Qpost_read_conversion;
91 Lisp_Object Qpre_write_conversion;
94 Lisp_Object Qucs4, Qutf8;
95 Lisp_Object Qbig5, Qshift_jis;
96 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
97 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
98 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
99 Lisp_Object Qno_iso6429;
100 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
101 Lisp_Object Qescape_quoted;
102 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
104 Lisp_Object Qencode, Qdecode;
106 Lisp_Object Vcoding_system_hash_table;
108 int enable_multibyte_characters;
111 /* Additional information used by the ISO2022 decoder and detector. */
112 struct iso2022_decoder {
113 /* CHARSET holds the character sets currently assigned to the G0
114 through G3 variables. It is initialized from the array
115 INITIAL_CHARSET in CODESYS. */
116 Lisp_Object charset[4];
118 /* Which registers are currently invoked into the left (GL) and
119 right (GR) halves of the 8-bit encoding space? */
120 int register_left, register_right;
122 /* ISO_ESC holds a value indicating part of an escape sequence
123 that has already been seen. */
124 enum iso_esc_flag esc;
126 /* This records the bytes we've seen so far in an escape sequence,
127 in case the sequence is invalid (we spit out the bytes unchanged). */
128 unsigned char esc_bytes[8];
130 /* Index for next byte to store in ISO escape sequence. */
133 #ifdef ENABLE_COMPOSITE_CHARS
134 /* Stuff seen so far when composing a string. */
135 unsigned_char_dynarr *composite_chars;
138 /* If we saw an invalid designation sequence for a particular
139 register, we flag it here and switch to ASCII. The next time we
140 see a valid designation for this register, we turn off the flag
141 and do the designation normally, but pretend the sequence was
142 invalid. The effect of all this is that (most of the time) the
143 escape sequences for both the switch to the unknown charset, and
144 the switch back to the known charset, get inserted literally into
145 the buffer and saved out as such. The hope is that we can
146 preserve the escape sequences so that the resulting written out
147 file makes sense. If we don't do any of this, the designation
148 to the invalid charset will be preserved but that switch back
149 to the known charset will probably get eaten because it was
150 the same charset that was already present in the register. */
151 unsigned char invalid_designated[4];
153 /* We try to do similar things as above for direction-switching
154 sequences. If we encountered a direction switch while an
155 invalid designation was present, or an invalid designation
156 just after a direction switch (i.e. no valid designation
157 encountered yet), we insert the direction-switch escape
158 sequence literally into the output stream, and later on
159 insert the corresponding direction-restoring escape sequence
161 unsigned int switched_dir_and_no_valid_charset_yet:1;
162 unsigned int invalid_switch_dir:1;
164 /* Tells the decoder to output the escape sequence literally
165 even though it was valid. Used in the games we play to
166 avoid lossage when we encounter invalid designations. */
167 unsigned int output_literally:1;
168 /* We encountered a direction switch followed by an invalid
169 designation. We didn't output the direction switch
170 literally because we didn't know about the invalid designation;
171 but we have to do so now. */
172 unsigned int output_direction_sequence:1;
175 EXFUN(Fcopy_coding_system, 2);
177 struct detection_state;
178 static int detect_coding_sjis(struct detection_state *st,
179 const Extbyte * src, Lstream_data_count n);
180 static void decode_coding_sjis(lstream_t decoding, const Extbyte * src,
181 unsigned_char_dynarr * dst,
182 Lstream_data_count n);
183 static void encode_coding_sjis(lstream_t encoding, const Bufbyte * src,
184 unsigned_char_dynarr * dst,
185 Lstream_data_count n);
186 static int detect_coding_big5(struct detection_state *st, const Extbyte * src,
187 Lstream_data_count n);
188 static void decode_coding_big5(lstream_t decoding, const Extbyte * src,
189 unsigned_char_dynarr * dst,
190 Lstream_data_count n);
191 static void encode_coding_big5(lstream_t encoding, const Bufbyte * src,
192 unsigned_char_dynarr * dst,
193 Lstream_data_count n);
194 static int detect_coding_ucs4(struct detection_state *st, const Extbyte * src,
195 Lstream_data_count n);
196 static void decode_coding_ucs4(lstream_t decoding, const Extbyte * src,
197 unsigned_char_dynarr * dst,
198 Lstream_data_count n);
199 static void encode_coding_ucs4(lstream_t encoding, const Bufbyte * src,
200 unsigned_char_dynarr * dst,
201 Lstream_data_count n);
202 static int detect_coding_utf8(struct detection_state *st, const Extbyte * src,
203 Lstream_data_count n);
204 static void decode_coding_utf8(lstream_t decoding, const Extbyte * src,
205 unsigned_char_dynarr * dst,
206 Lstream_data_count n);
207 static void encode_coding_utf8(lstream_t encoding, const Bufbyte * src,
208 unsigned_char_dynarr * dst,
209 Lstream_data_count n);
210 static int postprocess_iso2022_mask(int mask);
211 static void reset_iso2022(Lisp_Object coding_system,
212 struct iso2022_decoder *iso);
213 static int detect_coding_iso2022(struct detection_state *st,
214 const Extbyte * src, Lstream_data_count n);
215 static void decode_coding_iso2022(lstream_t decoding, const Extbyte * src,
216 unsigned_char_dynarr * dst,
217 Lstream_data_count n);
218 static void encode_coding_iso2022(lstream_t encoding, const Bufbyte * src,
219 unsigned_char_dynarr * dst,
220 Lstream_data_count n);
222 static void decode_coding_no_conversion(lstream_t decoding, const Extbyte * src,
223 unsigned_char_dynarr * dst,
224 Lstream_data_count n);
225 static void encode_coding_no_conversion(lstream_t encoding, const Bufbyte * src,
226 unsigned_char_dynarr * dst,
227 Lstream_data_count n);
228 static void mule_decode(lstream_t decoding, const Extbyte * src,
229 unsigned_char_dynarr * dst, Lstream_data_count n);
230 static void mule_encode(lstream_t encoding, const Bufbyte * src,
231 unsigned_char_dynarr * dst, Lstream_data_count n);
233 typedef struct codesys_prop codesys_prop;
234 struct codesys_prop {
240 Dynarr_declare(codesys_prop);
241 } codesys_prop_dynarr;
243 static const struct lrecord_description codesys_prop_description_1[] = {
244 {XD_LISP_OBJECT, offsetof(codesys_prop, sym)},
248 static const struct struct_description codesys_prop_description = {
249 sizeof(codesys_prop),
250 codesys_prop_description_1
253 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
254 XD_DYNARR_DESC(codesys_prop_dynarr, &codesys_prop_description),
258 static const struct struct_description codesys_prop_dynarr_description = {
259 sizeof(codesys_prop_dynarr),
260 codesys_prop_dynarr_description_1
263 codesys_prop_dynarr *the_codesys_prop_dynarr;
265 enum codesys_prop_enum {
267 CODESYS_PROP_ISO2022,
271 /************************************************************************/
272 /* Coding system functions */
273 /************************************************************************/
275 static Lisp_Object mark_coding_system(Lisp_Object);
276 static void print_coding_system(Lisp_Object, Lisp_Object, int);
277 static void finalize_coding_system(void *header, int for_disksave);
280 static const struct lrecord_description ccs_description_1[] = {
281 {XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset)},
282 {XD_LISP_OBJECT, offsetof(charset_conversion_spec, to_charset)},
286 static const struct struct_description ccs_description = {
287 sizeof(charset_conversion_spec),
291 static const struct lrecord_description ccsd_description_1[] = {
292 XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
296 static const struct struct_description ccsd_description = {
297 sizeof(charset_conversion_spec_dynarr),
302 static const struct lrecord_description coding_system_description[] = {
303 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, name)},
304 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, doc_string)},
305 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, mnemonic)},
306 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, post_read_conversion)},
307 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, pre_write_conversion)},
308 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_lf)},
309 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_crlf)},
310 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_cr)},
312 {XD_LISP_OBJECT_ARRAY,
313 offsetof(Lisp_Coding_System, iso2022.initial_charset), 4},
314 {XD_STRUCT_PTR, offsetof(Lisp_Coding_System, iso2022.input_conv), 1,
316 {XD_STRUCT_PTR, offsetof(Lisp_Coding_System, iso2022.output_conv), 1,
318 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, ccl.decode)},
319 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, ccl.encode)},
324 DEFINE_LRECORD_IMPLEMENTATION("coding-system", coding_system,
325 mark_coding_system, print_coding_system,
326 finalize_coding_system,
327 0, 0, coding_system_description,
330 static Lisp_Object mark_coding_system(Lisp_Object obj)
332 Lisp_Coding_System *codesys = XCODING_SYSTEM(obj);
334 mark_object(CODING_SYSTEM_NAME(codesys));
335 mark_object(CODING_SYSTEM_DOC_STRING(codesys));
336 mark_object(CODING_SYSTEM_MNEMONIC(codesys));
337 mark_object(CODING_SYSTEM_EOL_LF(codesys));
338 mark_object(CODING_SYSTEM_EOL_CRLF(codesys));
339 mark_object(CODING_SYSTEM_EOL_CR(codesys));
341 switch (CODING_SYSTEM_TYPE(codesys)) {
344 case CODESYS_ISO2022:
345 for (i = 0; i < 4; i++)
346 mark_object(CODING_SYSTEM_ISO2022_INITIAL_CHARSET
348 if (codesys->iso2022.input_conv) {
350 i < Dynarr_length(codesys->iso2022.input_conv);
352 struct charset_conversion_spec *ccs =
353 Dynarr_atp(codesys->iso2022.input_conv, i);
354 mark_object(ccs->from_charset);
355 mark_object(ccs->to_charset);
358 if (codesys->iso2022.output_conv) {
360 i < Dynarr_length(codesys->iso2022.output_conv);
362 struct charset_conversion_spec *ccs =
363 Dynarr_atp(codesys->iso2022.output_conv, i);
364 mark_object(ccs->from_charset);
365 mark_object(ccs->to_charset);
371 mark_object(CODING_SYSTEM_CCL_DECODE(codesys));
372 mark_object(CODING_SYSTEM_CCL_ENCODE(codesys));
375 /* list the rest of them lot explicitly */
376 case CODESYS_AUTODETECT:
377 case CODESYS_SHIFT_JIS:
381 case CODESYS_NO_CONVERSION:
383 case CODESYS_INTERNAL:
390 mark_object(CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys));
391 return CODING_SYSTEM_POST_READ_CONVERSION(codesys);
395 print_coding_system(Lisp_Object obj, Lisp_Object printcharfun, int escapeflag)
397 Lisp_Coding_System *c = XCODING_SYSTEM(obj);
399 error("printing unreadable object #<coding-system 0x%x>",
402 write_c_string("#<coding-system ", printcharfun);
403 print_internal(c->name, printcharfun, 1);
404 write_c_string(">", printcharfun);
407 static void finalize_coding_system(void *header, int for_disksave)
409 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
410 /* Since coding systems never go away, this function is not
411 necessary. But it would be necessary if we changed things
412 so that coding systems could go away. */
413 if (!for_disksave) { /* see comment in lstream.c */
414 switch (CODING_SYSTEM_TYPE(c)) {
416 case CODESYS_ISO2022:
417 if (c->iso2022.input_conv) {
418 Dynarr_free(c->iso2022.input_conv);
419 c->iso2022.input_conv = 0;
421 if (c->iso2022.output_conv) {
422 Dynarr_free(c->iso2022.output_conv);
423 c->iso2022.output_conv = 0;
427 /* list the rest of them lot explicitly */
428 case CODESYS_AUTODETECT:
429 case CODESYS_SHIFT_JIS:
434 case CODESYS_NO_CONVERSION:
436 case CODESYS_INTERNAL:
445 static eol_type_t symbol_to_eol_type(Lisp_Object symbol)
447 CHECK_SYMBOL(symbol);
449 return EOL_AUTODETECT;
452 if (EQ(symbol, Qcrlf))
457 signal_simple_error("Unrecognized eol type", symbol);
458 return EOL_AUTODETECT; /* not reached */
461 static Lisp_Object eol_type_to_symbol(eol_type_t type)
477 static void setup_eol_coding_systems(Lisp_Coding_System * codesys)
479 Lisp_Object codesys_obj;
480 int len = string_length(XSYMBOL(CODING_SYSTEM_NAME(codesys))->name);
481 char *codesys_name = (char *)alloca(len + 7);
483 char *codesys_mnemonic = 0;
485 Lisp_Object codesys_name_sym, sub_codesys_obj;
489 XSETCODING_SYSTEM(codesys_obj, codesys);
492 string_data(XSYMBOL(CODING_SYSTEM_NAME(codesys))->name), len);
494 if (STRINGP(CODING_SYSTEM_MNEMONIC(codesys))) {
495 mlen = XSTRING_LENGTH(CODING_SYSTEM_MNEMONIC(codesys));
496 codesys_mnemonic = (char *)alloca(mlen + 7);
497 memcpy(codesys_mnemonic,
498 XSTRING_DATA(CODING_SYSTEM_MNEMONIC(codesys)), mlen);
500 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
501 strcpy (codesys_name + len, "-" op_sys); \
503 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
504 codesys_name_sym = intern (codesys_name); \
505 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
506 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
508 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
509 build_string (codesys_mnemonic); \
510 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
513 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
514 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
515 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
518 DEFUN("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
519 Return t if OBJECT is a coding system.
520 A coding system is an object that defines how text containing multiple
521 character sets is encoded into a stream of (typically 8-bit) bytes.
522 The coding system is used to decode the stream into a series of
523 characters (which may be from multiple charsets) when the text is read
524 from a file or process, and is used to encode the text back into the
525 same format when it is written out to a file or process.
527 For example, many ISO2022-compliant coding systems (such as Compound
528 Text, which is used for inter-client data under the X Window System)
529 use escape sequences to switch between different charsets -- Japanese
530 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
531 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
532 `make-coding-system' for more information.
534 Coding systems are normally identified using a symbol, and the
535 symbol is accepted in place of the actual coding system object whenever
536 a coding system is called for. (This is similar to how faces work.)
540 return CODING_SYSTEMP(object) ? Qt : Qnil;
543 DEFUN("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
544 Retrieve the coding system of the given name.
546 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
547 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
548 If there is no such coding system, nil is returned. Otherwise the
549 associated coding system object is returned.
551 (coding_system_or_name))
553 if (NILP(coding_system_or_name))
554 coding_system_or_name = Qbinary;
555 else if (CODING_SYSTEMP(coding_system_or_name))
556 return coding_system_or_name;
558 CHECK_SYMBOL(coding_system_or_name);
561 coding_system_or_name =
562 Fgethash(coding_system_or_name, Vcoding_system_hash_table,
565 if (CODING_SYSTEMP(coding_system_or_name)
566 || NILP(coding_system_or_name))
567 return coding_system_or_name;
571 DEFUN("get-coding-system", Fget_coding_system, 1, 1, 0, /*
572 Retrieve the coding system of the given name.
573 Same as `find-coding-system' except that if there is no such
574 coding system, an error is signaled instead of returning nil.
578 Lisp_Object coding_system = Ffind_coding_system(name);
580 if (NILP(coding_system))
581 signal_simple_error("No such coding system", name);
582 return coding_system;
585 /* We store the coding systems in hash tables with the names as the key and the
586 actual coding system object as the value. Occasionally we need to use them
587 in a list format. These routines provide us with that. */
588 struct coding_system_list_closure {
589 Lisp_Object *coding_system_list;
593 add_coding_system_to_list_mapper(Lisp_Object key, Lisp_Object value,
594 void *coding_system_list_closure)
596 /* This function can GC */
597 struct coding_system_list_closure *cscl =
598 (struct coding_system_list_closure *)coding_system_list_closure;
599 Lisp_Object *coding_system_list = cscl->coding_system_list;
601 *coding_system_list = Fcons(key, *coding_system_list);
605 DEFUN("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
606 Return a list of the names of all defined coding systems.
610 Lisp_Object coding_system_list = Qnil;
612 struct coding_system_list_closure coding_system_list_closure;
614 GCPRO1(coding_system_list);
615 coding_system_list_closure.coding_system_list = &coding_system_list;
616 elisp_maphash(add_coding_system_to_list_mapper,
617 Vcoding_system_hash_table, &coding_system_list_closure);
620 return coding_system_list;
623 DEFUN("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
624 Return the name of the given coding system.
628 coding_system = Fget_coding_system(coding_system);
629 return XCODING_SYSTEM_NAME(coding_system);
632 static Lisp_Coding_System *allocate_coding_system(enum coding_system_type type,
635 Lisp_Coding_System *codesys =
636 alloc_lcrecord_type(Lisp_Coding_System, &lrecord_coding_system);
638 zero_lcrecord(codesys);
639 CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys) = Qnil;
640 CODING_SYSTEM_POST_READ_CONVERSION(codesys) = Qnil;
641 CODING_SYSTEM_EOL_TYPE(codesys) = EOL_AUTODETECT;
642 CODING_SYSTEM_EOL_CRLF(codesys) = Qnil;
643 CODING_SYSTEM_EOL_CR(codesys) = Qnil;
644 CODING_SYSTEM_EOL_LF(codesys) = Qnil;
645 CODING_SYSTEM_TYPE(codesys) = type;
646 CODING_SYSTEM_MNEMONIC(codesys) = Qnil;
648 if (type == CODESYS_ISO2022) {
650 for (i = 0; i < 4; i++)
651 CODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, i) =
653 } else if (type == CODESYS_CCL) {
654 CODING_SYSTEM_CCL_DECODE(codesys) = Qnil;
655 CODING_SYSTEM_CCL_ENCODE(codesys) = Qnil;
658 CODING_SYSTEM_NAME(codesys) = name;
664 /* Given a list of charset conversion specs as specified in a Lisp
665 program, parse it into STORE_HERE. */
668 parse_charset_conversion_specs(charset_conversion_spec_dynarr * store_here,
669 Lisp_Object spec_list)
673 EXTERNAL_LIST_LOOP(rest, spec_list) {
674 Lisp_Object car = XCAR(rest);
675 Lisp_Object from, to;
676 struct charset_conversion_spec spec;
678 if (!CONSP(car) || !CONSP(XCDR(car)) || !NILP(XCDR(XCDR(car))))
679 signal_simple_error("Invalid charset conversion spec",
681 from = Fget_charset(XCAR(car));
682 to = Fget_charset(XCAR(XCDR(car)));
683 if (XCHARSET_TYPE(from) != XCHARSET_TYPE(to))
684 signal_simple_error_2
685 ("Attempted conversion between different charset types",
687 spec.from_charset = from;
688 spec.to_charset = to;
690 Dynarr_add(store_here, spec);
694 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
695 specs, return the equivalent as the Lisp programmer would see it.
697 If LOAD_HERE is 0, return Qnil. */
700 unparse_charset_conversion_specs(charset_conversion_spec_dynarr * load_here)
707 for (i = 0, result = Qnil; i < Dynarr_length(load_here); i++) {
708 struct charset_conversion_spec *ccs = Dynarr_atp(load_here, i);
710 Fcons(list2(ccs->from_charset, ccs->to_charset), result);
713 return Fnreverse(result);
718 DEFUN("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
719 Register symbol NAME as a coding system.
721 TYPE describes the conversion method used and should be one of
724 Automatic conversion. SXEmacs attempts to detect the coding system
727 No conversion. Use this for binary files and such. On output,
728 graphic characters that are not in ASCII or Latin-1 will be
729 replaced by a ?. (For a no-conversion-encoded buffer, these
730 characters will only be present if you explicitly insert them.)
732 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
734 ISO 10646 UCS-4 encoding.
736 ISO 10646 UTF-8 encoding.
738 Any ISO2022-compliant encoding. Among other things, this includes
739 JIS (the Japanese encoding commonly used for e-mail), EUC (the
740 standard Unix encoding for Japanese and other languages), and
741 Compound Text (the encoding used in X11). You can specify more
742 specific information about the conversion with the PROPS argument.
744 Big5 (the encoding commonly used for Taiwanese).
746 The conversion is performed using a user-written pseudo-code
747 program. CCL (Code Conversion Language) is the name of this
750 Write out or read in the raw contents of the memory representing
751 the buffer's text. This is primarily useful for debugging
752 purposes, and is only enabled when SXEmacs has been compiled with
753 DEBUG_SXEMACS defined (via the --debug configure option).
754 WARNING: Reading in a file using 'internal conversion can result
755 in an internal inconsistency in the memory representing a
756 buffer's text, which will produce unpredictable results and may
757 cause SXEmacs to crash. Under normal circumstances you should
758 never use 'internal conversion.
760 DOC-STRING is a string describing the coding system.
762 PROPS is a property list, describing the specific nature of the
763 character set. Recognized properties are:
766 String to be displayed in the modeline when this coding system is
770 End-of-line conversion to be used. It should be one of
773 Automatically detect the end-of-line type (LF, CRLF,
774 or CR). Also generate subsidiary coding systems named
775 `NAME-unix', `NAME-dos', and `NAME-mac', that are
776 identical to this coding system but have an EOL-TYPE
777 value of 'lf, 'crlf, and 'cr, respectively.
779 The end of a line is marked externally using ASCII LF.
780 Since this is also the way that SXEmacs represents an
781 end-of-line internally, specifying this option results
782 in no end-of-line conversion. This is the standard
783 format for Unix text files.
785 The end of a line is marked externally using ASCII
786 CRLF. This is the standard format for MS-DOS text
789 The end of a line is marked externally using ASCII CR.
790 This is the standard format for Macintosh text files.
792 Automatically detect the end-of-line type but do not
793 generate subsidiary coding systems. (This value is
794 converted to nil when stored internally, and
795 `coding-system-property' will return nil.)
797 'post-read-conversion
798 Function called after a file has been read in, to perform the
799 decoding. Called with two arguments, START and END, denoting
800 a region of the current buffer to be decoded.
802 'pre-write-conversion
803 Function called before a file is written out, to perform the
804 encoding. Called with two arguments, START and END, denoting
805 a region of the current buffer to be encoded.
807 The following additional properties are recognized if TYPE is 'iso2022:
813 The character set initially designated to the G0 - G3 registers.
814 The value should be one of
816 -- A charset object (designate that character set)
817 -- nil (do not ever use this register)
818 -- t (no character set is initially designated to
819 the register, but may be later on; this automatically
820 sets the corresponding `force-g*-on-output' property)
826 If non-nil, send an explicit designation sequence on output before
827 using the specified register.
830 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
831 "ESC $ B" on output in place of the full designation sequences
832 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
835 If non-nil, don't designate ASCII to G0 at each end of line on output.
836 Setting this to non-nil also suppresses other state-resetting that
837 normally happens at the end of a line.
840 If non-nil, don't designate ASCII to G0 before control chars on output.
843 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
847 If non-nil, use locking-shift (SO/SI) instead of single-shift
848 or designation by escape sequence.
851 If non-nil, don't use ISO6429's direction specification.
854 If non-nil, literal control characters that are the same as
855 the beginning of a recognized ISO2022 or ISO6429 escape sequence
856 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
857 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
858 so that they can be properly distinguished from an escape sequence.
859 (Note that doing this results in a non-portable encoding.) This
860 encoding flag is used for byte-compiled files. Note that ESC
861 is a good choice for a quoting character because there are no
862 escape sequences whose second byte is a character from the Control-0
863 or Control-1 character sets; this is explicitly disallowed by the
866 'input-charset-conversion
867 A list of conversion specifications, specifying conversion of
868 characters in one charset to another when decoding is performed.
869 Each specification is a list of two elements: the source charset,
870 and the destination charset.
872 'output-charset-conversion
873 A list of conversion specifications, specifying conversion of
874 characters in one charset to another when encoding is performed.
875 The form of each specification is the same as for
876 'input-charset-conversion.
878 The following additional properties are recognized (and required)
882 CCL program used for decoding (converting to internal format).
885 CCL program used for encoding (converting to external format).
887 (name, type, doc_string, props))
889 Lisp_Coding_System *codesys;
890 enum coding_system_type ty;
891 int need_to_setup_eol_systems = 1;
893 /* Convert type to constant */
894 if (NILP(type) || EQ(type, Qundecided)) {
895 ty = CODESYS_AUTODETECT;
898 else if (EQ(type, Qshift_jis)) {
899 ty = CODESYS_SHIFT_JIS;
900 } else if (EQ(type, Qiso2022)) {
901 ty = CODESYS_ISO2022;
902 } else if (EQ(type, Qbig5)) {
904 } else if (EQ(type, Qucs4)) {
906 } else if (EQ(type, Qutf8)) {
908 } else if (EQ(type, Qccl)) {
912 else if (EQ(type, Qno_conversion)) {
913 ty = CODESYS_NO_CONVERSION;
916 else if (EQ(type, Qinternal)) {
917 ty = CODESYS_INTERNAL;
921 signal_simple_error("Invalid coding system type", type);
925 codesys = allocate_coding_system(ty, name);
927 if (NILP(doc_string))
928 doc_string = build_string("");
930 CHECK_STRING(doc_string);
931 CODING_SYSTEM_DOC_STRING(codesys) = doc_string;
934 EXTERNAL_PROPERTY_LIST_LOOP_3(key, value, props) {
935 if (EQ(key, Qmnemonic)) {
938 CODING_SYSTEM_MNEMONIC(codesys) = value;
941 else if (EQ(key, Qeol_type)) {
942 need_to_setup_eol_systems = NILP(value);
945 CODING_SYSTEM_EOL_TYPE(codesys) =
946 symbol_to_eol_type(value);
949 else if (EQ(key, Qpost_read_conversion))
950 CODING_SYSTEM_POST_READ_CONVERSION(codesys) =
952 else if (EQ(key, Qpre_write_conversion))
953 CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys) =
956 else if (ty == CODESYS_ISO2022) {
957 #define FROB_INITIAL_CHARSET(charset_num) \
958 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
959 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
961 if (EQ(key, Qcharset_g0))
962 FROB_INITIAL_CHARSET(0);
963 else if (EQ(key, Qcharset_g1))
964 FROB_INITIAL_CHARSET(1);
965 else if (EQ(key, Qcharset_g2))
966 FROB_INITIAL_CHARSET(2);
967 else if (EQ(key, Qcharset_g3))
968 FROB_INITIAL_CHARSET(3);
970 #define FROB_FORCE_CHARSET(charset_num) \
971 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
973 else if (EQ(key, Qforce_g0_on_output))
974 FROB_FORCE_CHARSET(0);
975 else if (EQ(key, Qforce_g1_on_output))
976 FROB_FORCE_CHARSET(1);
977 else if (EQ(key, Qforce_g2_on_output))
978 FROB_FORCE_CHARSET(2);
979 else if (EQ(key, Qforce_g3_on_output))
980 FROB_FORCE_CHARSET(3);
982 #define FROB_BOOLEAN_PROPERTY(prop) \
983 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
985 else if (EQ(key, Qshort))
986 FROB_BOOLEAN_PROPERTY(SHORT);
987 else if (EQ(key, Qno_ascii_eol))
988 FROB_BOOLEAN_PROPERTY(NO_ASCII_EOL);
989 else if (EQ(key, Qno_ascii_cntl))
990 FROB_BOOLEAN_PROPERTY(NO_ASCII_CNTL);
991 else if (EQ(key, Qseven))
992 FROB_BOOLEAN_PROPERTY(SEVEN);
993 else if (EQ(key, Qlock_shift))
994 FROB_BOOLEAN_PROPERTY(LOCK_SHIFT);
995 else if (EQ(key, Qno_iso6429))
996 FROB_BOOLEAN_PROPERTY(NO_ISO6429);
997 else if (EQ(key, Qescape_quoted))
998 FROB_BOOLEAN_PROPERTY(ESCAPE_QUOTED);
1000 else if (EQ(key, Qinput_charset_conversion)) {
1001 codesys->iso2022.input_conv =
1002 Dynarr_new(charset_conversion_spec);
1003 parse_charset_conversion_specs(codesys->
1007 } else if (EQ(key, Qoutput_charset_conversion)) {
1008 codesys->iso2022.output_conv =
1009 Dynarr_new(charset_conversion_spec);
1010 parse_charset_conversion_specs(codesys->
1016 ("Unrecognized property", key);
1017 } else if (EQ(type, Qccl)) {
1019 struct ccl_program test_ccl;
1022 /* Check key first. */
1023 if (EQ(key, Qdecode))
1024 suffix = "-ccl-decode";
1025 else if (EQ(key, Qencode))
1026 suffix = "-ccl-encode";
1029 ("Unrecognized property", key);
1031 /* If value is vector, register it as a ccl program
1032 associated with an newly created symbol for
1033 backward compatibility. */
1034 if (VECTORP(value)) {
1037 (Fsymbol_name(name),
1038 build_string(suffix)),
1040 Fregister_ccl_program(sym, value);
1042 CHECK_SYMBOL(value);
1045 /* check if the given ccl programs are valid. */
1046 if (setup_ccl_program(&test_ccl, sym) < 0)
1048 ("Invalid CCL program", value);
1050 if (EQ(key, Qdecode))
1051 CODING_SYSTEM_CCL_DECODE(codesys) = sym;
1052 else if (EQ(key, Qencode))
1053 CODING_SYSTEM_CCL_ENCODE(codesys) = sym;
1058 signal_simple_error("Unrecognized property",
1063 if (need_to_setup_eol_systems)
1064 setup_eol_coding_systems(codesys);
1067 Lisp_Object codesys_obj;
1068 XSETCODING_SYSTEM(codesys_obj, codesys);
1069 Fputhash(name, codesys_obj, Vcoding_system_hash_table);
1074 DEFUN("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1075 Copy OLD-CODING-SYSTEM to NEW-NAME.
1076 If NEW-NAME does not name an existing coding system, a new one will
1079 (old_coding_system, new_name))
1081 Lisp_Object new_coding_system;
1082 old_coding_system = Fget_coding_system(old_coding_system);
1083 new_coding_system = Ffind_coding_system(new_name);
1084 if (NILP(new_coding_system)) {
1085 XSETCODING_SYSTEM(new_coding_system,
1086 allocate_coding_system
1087 (XCODING_SYSTEM_TYPE(old_coding_system),
1089 Fputhash(new_name, new_coding_system,
1090 Vcoding_system_hash_table);
1094 Lisp_Coding_System *to = XCODING_SYSTEM(new_coding_system);
1095 Lisp_Coding_System *from = XCODING_SYSTEM(old_coding_system);
1096 memcpy(((char *)to) + sizeof(to->header),
1097 ((char *)from) + sizeof(from->header),
1098 sizeof(*from) - sizeof(from->header));
1099 to->name = new_name;
1101 return new_coding_system;
1104 DEFUN("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1105 Return t if OBJECT names a coding system, and is not a coding system alias.
1109 Lisp_Object val = Fgethash(object, Vcoding_system_hash_table, Qnil);
1110 return CODING_SYSTEMP(val) ? Qt : Qnil;
1113 DEFUN("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1114 Return t if OBJECT is a coding system alias.
1115 All coding system aliases are created by `define-coding-system-alias'.
1119 Lisp_Object val = Fgethash(object, Vcoding_system_hash_table, Qzero);
1120 return SYMBOLP(val) ? Qt : Qnil;
1123 DEFUN("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1124 Return the coding-system symbol for which symbol ALIAS is an alias.
1128 Lisp_Object aliasee = Fgethash(alias, Vcoding_system_hash_table, Qnil);
1129 if (SYMBOLP(aliasee)) {
1132 signal_simple_error(
1133 "Symbol is not a coding system alias", alias);
1135 /* To keep the compiler happy */
1140 append_suffix_to_symbol(Lisp_Object symbol, char *ascii_string)
1142 return Fintern(concat2(Fsymbol_name(symbol),
1143 build_string(ascii_string)), Qnil);
1146 /* A maphash function, for removing dangling coding system aliases. */
1148 dangling_coding_system_alias_p(Lisp_Object alias,
1149 Lisp_Object aliasee, void *dangling_aliases)
1151 if (SYMBOLP(aliasee)
1152 && NILP(Fgethash(aliasee, Vcoding_system_hash_table, Qnil))) {
1153 (*(int *)dangling_aliases)++;
1160 DEFUN("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1161 Define symbol ALIAS as an alias for coding system ALIASEE.
1163 You can use this function to redefine an alias that has already been defined,
1164 but you cannot redefine a name which is the canonical name for a coding system.
1165 \(a canonical name of a coding system is what is returned when you call
1166 `coding-system-name' on a coding system).
1168 ALIASEE itself can be an alias, which allows you to define nested aliases.
1170 You are forbidden, however, from creating alias loops or `dangling' aliases.
1171 These will be detected, and an error will be signaled if you attempt to do so.
1173 If ALIASEE is nil, then ALIAS will simply be undefined.
1175 See also `coding-system-alias-p', `coding-system-aliasee',
1176 and `coding-system-canonical-name-p'.
1180 Lisp_Object real_coding_system, probe;
1182 CHECK_SYMBOL(alias);
1184 if (!NILP(Fcoding_system_canonical_name_p(alias)))
1186 ("Symbol is the canonical name of a coding system and cannot be redefined",
1189 if (NILP(aliasee)) {
1190 Lisp_Object subsidiary_unix =
1191 append_suffix_to_symbol(alias, "-unix");
1192 Lisp_Object subsidiary_dos =
1193 append_suffix_to_symbol(alias, "-dos");
1194 Lisp_Object subsidiary_mac =
1195 append_suffix_to_symbol(alias, "-mac");
1197 Fremhash(alias, Vcoding_system_hash_table);
1199 /* Undefine subsidiary aliases,
1200 presumably created by a previous call to this function */
1201 if (!NILP(Fcoding_system_alias_p(subsidiary_unix)) &&
1202 !NILP(Fcoding_system_alias_p(subsidiary_dos)) &&
1203 !NILP(Fcoding_system_alias_p(subsidiary_mac))) {
1204 Fdefine_coding_system_alias(subsidiary_unix, Qnil);
1205 Fdefine_coding_system_alias(subsidiary_dos, Qnil);
1206 Fdefine_coding_system_alias(subsidiary_mac, Qnil);
1209 /* Undefine dangling coding system aliases. */
1211 int dangling_aliases;
1214 dangling_aliases = 0;
1216 (dangling_coding_system_alias_p,
1217 Vcoding_system_hash_table,
1219 } while (dangling_aliases > 0);
1225 if (CODING_SYSTEMP(aliasee))
1226 aliasee = XCODING_SYSTEM_NAME(aliasee);
1228 /* Checks that aliasee names a coding-system */
1229 real_coding_system = Fget_coding_system(aliasee);
1231 /* Check for coding system alias loops */
1232 if (EQ(alias, aliasee))
1233 alias_loop:signal_simple_error_2
1234 ("Attempt to create a coding system alias loop", alias,
1237 for (probe = aliasee;
1239 probe = Fgethash(probe, Vcoding_system_hash_table, Qzero)) {
1240 if (EQ(probe, alias))
1244 Fputhash(alias, aliasee, Vcoding_system_hash_table);
1246 /* Set up aliases for subsidiaries.
1247 #### There must be a better way to handle subsidiary coding
1250 static char *suffixes[] = { "-unix", "-dos", "-mac" };
1252 for (int i = 0; i < countof(suffixes); i++) {
1253 Lisp_Object alias_subsidiary =
1254 append_suffix_to_symbol(alias, suffixes[i]);
1255 Lisp_Object aliasee_subsidiary =
1256 append_suffix_to_symbol(aliasee, suffixes[i]);
1258 if (!NILP(Ffind_coding_system(aliasee_subsidiary))) {
1259 Fdefine_coding_system_alias(alias_subsidiary,
1260 aliasee_subsidiary);
1264 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1265 but it doesn't look intentional, so I'd rather return something
1266 meaningful or nothing at all. */
1271 subsidiary_coding_system(Lisp_Object coding_system, eol_type_t type)
1273 Lisp_Coding_System *cs = XCODING_SYSTEM(coding_system);
1274 Lisp_Object new_coding_system;
1276 if (CODING_SYSTEM_EOL_TYPE(cs) != EOL_AUTODETECT)
1277 return coding_system;
1280 case EOL_AUTODETECT:
1281 return coding_system;
1283 new_coding_system = CODING_SYSTEM_EOL_LF(cs);
1286 new_coding_system = CODING_SYSTEM_EOL_CR(cs);
1289 new_coding_system = CODING_SYSTEM_EOL_CRLF(cs);
1296 return NILP(new_coding_system) ? coding_system : new_coding_system;
1299 DEFUN("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1300 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1302 (coding_system, eol_type))
1304 coding_system = Fget_coding_system(coding_system);
1306 return subsidiary_coding_system(coding_system,
1307 symbol_to_eol_type(eol_type));
1310 /************************************************************************/
1311 /* Coding system accessors */
1312 /************************************************************************/
1314 DEFUN("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1315 Return the doc string for CODING-SYSTEM.
1319 coding_system = Fget_coding_system(coding_system);
1320 return XCODING_SYSTEM_DOC_STRING(coding_system);
1323 DEFUN("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1324 Return the type of CODING-SYSTEM.
1328 Lisp_Object tmp = Fget_coding_system(coding_system);
1330 switch (XCODING_SYSTEM_TYPE(tmp)) {
1333 case CODESYS_AUTODETECT:
1336 case CODESYS_SHIFT_JIS:
1338 case CODESYS_ISO2022:
1349 case CODESYS_NO_CONVERSION:
1350 return Qno_conversion;
1351 #ifdef DEBUG_SXEMACS
1352 case CODESYS_INTERNAL:
1360 Lisp_Object coding_system_charset(Lisp_Object coding_system, int gnum)
1363 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET(coding_system, gnum);
1365 return CHARSETP(cs) ? XCHARSET_NAME(cs) : Qnil;
1368 DEFUN("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1369 Return initial charset of CODING-SYSTEM designated to GNUM.
1372 (coding_system, gnum))
1374 coding_system = Fget_coding_system(coding_system);
1377 return coding_system_charset(coding_system, XINT(gnum));
1381 DEFUN("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1382 Return the PROP property of CODING-SYSTEM.
1384 (coding_system, prop))
1387 enum coding_system_type type;
1389 coding_system = Fget_coding_system(coding_system);
1391 type = XCODING_SYSTEM_TYPE(coding_system);
1393 for (i = 0; !ok && i < Dynarr_length(the_codesys_prop_dynarr); i++)
1394 if (EQ(Dynarr_at(the_codesys_prop_dynarr, i).sym, prop)) {
1396 switch (Dynarr_at(the_codesys_prop_dynarr, i).prop_type) {
1397 case CODESYS_PROP_ALL_OK:
1400 case CODESYS_PROP_ISO2022:
1401 if (type != CODESYS_ISO2022)
1403 ("Property only valid in ISO2022 coding systems",
1407 case CODESYS_PROP_CCL:
1408 if (type != CODESYS_CCL)
1410 ("Property only valid in CCL coding systems",
1420 signal_simple_error("Unrecognized property", prop);
1422 if (EQ(prop, Qname))
1423 return XCODING_SYSTEM_NAME(coding_system);
1424 else if (EQ(prop, Qtype))
1425 return Fcoding_system_type(coding_system);
1426 else if (EQ(prop, Qdoc_string))
1427 return XCODING_SYSTEM_DOC_STRING(coding_system);
1428 else if (EQ(prop, Qmnemonic))
1429 return XCODING_SYSTEM_MNEMONIC(coding_system);
1430 else if (EQ(prop, Qeol_type))
1432 eol_type_to_symbol(XCODING_SYSTEM_EOL_TYPE(coding_system));
1433 else if (EQ(prop, Qeol_lf))
1434 return XCODING_SYSTEM_EOL_LF(coding_system);
1435 else if (EQ(prop, Qeol_crlf))
1436 return XCODING_SYSTEM_EOL_CRLF(coding_system);
1437 else if (EQ(prop, Qeol_cr))
1438 return XCODING_SYSTEM_EOL_CR(coding_system);
1439 else if (EQ(prop, Qpost_read_conversion))
1440 return XCODING_SYSTEM_POST_READ_CONVERSION(coding_system);
1441 else if (EQ(prop, Qpre_write_conversion))
1442 return XCODING_SYSTEM_PRE_WRITE_CONVERSION(coding_system);
1444 else if (type == CODESYS_ISO2022) {
1445 if (EQ(prop, Qcharset_g0))
1446 return coding_system_charset(coding_system, 0);
1447 else if (EQ(prop, Qcharset_g1))
1448 return coding_system_charset(coding_system, 1);
1449 else if (EQ(prop, Qcharset_g2))
1450 return coding_system_charset(coding_system, 2);
1451 else if (EQ(prop, Qcharset_g3))
1452 return coding_system_charset(coding_system, 3);
1454 #define FORCE_CHARSET(charset_num) \
1455 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1456 (coding_system, charset_num) ? Qt : Qnil)
1458 else if (EQ(prop, Qforce_g0_on_output))
1459 return FORCE_CHARSET(0);
1460 else if (EQ(prop, Qforce_g1_on_output))
1461 return FORCE_CHARSET(1);
1462 else if (EQ(prop, Qforce_g2_on_output))
1463 return FORCE_CHARSET(2);
1464 else if (EQ(prop, Qforce_g3_on_output))
1465 return FORCE_CHARSET(3);
1467 #define LISP_BOOLEAN(prop) \
1468 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1470 else if (EQ(prop, Qshort))
1471 return LISP_BOOLEAN(SHORT);
1472 else if (EQ(prop, Qno_ascii_eol))
1473 return LISP_BOOLEAN(NO_ASCII_EOL);
1474 else if (EQ(prop, Qno_ascii_cntl))
1475 return LISP_BOOLEAN(NO_ASCII_CNTL);
1476 else if (EQ(prop, Qseven))
1477 return LISP_BOOLEAN(SEVEN);
1478 else if (EQ(prop, Qlock_shift))
1479 return LISP_BOOLEAN(LOCK_SHIFT);
1480 else if (EQ(prop, Qno_iso6429))
1481 return LISP_BOOLEAN(NO_ISO6429);
1482 else if (EQ(prop, Qescape_quoted))
1483 return LISP_BOOLEAN(ESCAPE_QUOTED);
1485 else if (EQ(prop, Qinput_charset_conversion))
1487 unparse_charset_conversion_specs
1488 (XCODING_SYSTEM(coding_system)->iso2022.input_conv);
1489 else if (EQ(prop, Qoutput_charset_conversion))
1491 unparse_charset_conversion_specs
1492 (XCODING_SYSTEM(coding_system)->iso2022.
1496 } else if (type == CODESYS_CCL) {
1497 if (EQ(prop, Qdecode))
1498 return XCODING_SYSTEM_CCL_DECODE(coding_system);
1499 else if (EQ(prop, Qencode))
1500 return XCODING_SYSTEM_CCL_ENCODE(coding_system);
1508 return Qnil; /* not reached */
1511 /************************************************************************/
1512 /* Coding category functions */
1513 /************************************************************************/
1515 static int decode_coding_category(Lisp_Object symbol)
1519 CHECK_SYMBOL(symbol);
1520 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1521 if (EQ(coding_category_symbol[i], symbol))
1524 signal_simple_error("Unrecognized coding category", symbol);
1525 return 0; /* not reached */
1528 DEFUN("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1529 Return a list of all recognized coding categories.
1534 Lisp_Object list = Qnil;
1536 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1537 list = Fcons(coding_category_symbol[i], list);
1541 DEFUN("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1542 Change the priority order of the coding categories.
1543 LIST should be list of coding categories, in descending order of
1544 priority. Unspecified coding categories will be lower in priority
1545 than all specified ones, in the same relative order they were in
1550 int category_to_priority[CODING_CATEGORY_LAST];
1554 /* First generate a list that maps coding categories to priorities. */
1556 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1557 category_to_priority[i] = -1;
1559 /* Highest priority comes from the specified list. */
1561 EXTERNAL_LIST_LOOP(rest, list) {
1562 int cat = decode_coding_category(XCAR(rest));
1564 if (category_to_priority[cat] >= 0)
1565 signal_simple_error("Duplicate coding category in list",
1567 category_to_priority[cat] = i++;
1570 /* Now go through the existing categories by priority to retrieve
1571 the categories not yet specified and preserve their priority
1573 for (j = 0; j < CODING_CATEGORY_LAST; j++) {
1574 int cat = fcd->coding_category_by_priority[j];
1575 if (category_to_priority[cat] < 0)
1576 category_to_priority[cat] = i++;
1579 /* Now we need to construct the inverse of the mapping we just
1582 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1583 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1585 /* Phew! That was confusing. */
1589 DEFUN("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1590 Return a list of coding categories in descending order of priority.
1595 Lisp_Object list = Qnil;
1597 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1599 Fcons(coding_category_symbol
1600 [fcd->coding_category_by_priority[i]], list);
1604 DEFUN("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1605 Change the coding system associated with a coding category.
1607 (coding_category, coding_system))
1609 int cat = decode_coding_category(coding_category);
1611 coding_system = Fget_coding_system(coding_system);
1612 fcd->coding_category_system[cat] = coding_system;
1616 DEFUN("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1617 Return the coding system associated with a coding category.
1621 int cat = decode_coding_category(coding_category);
1622 Lisp_Object sys = fcd->coding_category_system[cat];
1625 return XCODING_SYSTEM_NAME(sys);
1629 /************************************************************************/
1630 /* Detecting the encoding of data */
1631 /************************************************************************/
1633 struct detection_state {
1634 eol_type_t eol_type;
1661 struct iso2022_decoder iso;
1663 int high_byte_count;
1664 unsigned int saw_single_shift:1;
1673 static int acceptable_control_char_p(int c)
1676 /* Allow and ignore control characters that you might
1677 reasonably see in a text file */
1682 case 8: /* backspace */
1683 case 11: /* vertical tab */
1684 case 12: /* form feed */
1685 case 26: /* MS-DOS C-z junk */
1686 case 31: /* '^_' -- for info */
1693 static int mask_has_at_most_one_bit_p(int mask)
1695 /* Perhaps the only thing useful you learn from intensive Microsoft
1696 technical interviews */
1697 return (mask & (mask - 1)) == 0;
1701 detect_eol_type(struct detection_state *st, const Extbyte * src,
1702 Lstream_data_count n)
1705 const unsigned char c = *(const unsigned char*)src++;
1707 if (st->eol.just_saw_cr)
1709 else if (st->eol.seen_anything)
1711 } else if (st->eol.just_saw_cr)
1714 st->eol.just_saw_cr = 1;
1716 st->eol.just_saw_cr = 0;
1717 st->eol.seen_anything = 1;
1720 return EOL_AUTODETECT;
1723 /* Attempt to determine the encoding and EOL type of the given text.
1724 Before calling this function for the first type, you must initialize
1725 st->eol_type as appropriate and initialize st->mask to ~0.
1727 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1730 st->mask holds the determined coding category mask, or ~0 if only
1731 ASCII has been seen so far.
1735 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1736 is present in st->mask
1737 1 == definitive answers are here for both st->eol_type and st->mask
1741 detect_coding_type(struct detection_state *st, const Extbyte * src,
1742 Lstream_data_count n, int just_do_eol)
1744 if (st->eol_type == EOL_AUTODETECT)
1745 st->eol_type = detect_eol_type(st, src, n);
1748 return st->eol_type != EOL_AUTODETECT;
1750 if (!st->seen_non_ascii) {
1751 for (; n; n--, src++) {
1752 const unsigned char c = *(const unsigned char *)src;
1753 if ((c < 0x20 && !acceptable_control_char_p(c))
1755 st->seen_non_ascii = 1;
1757 st->shift_jis.mask = ~0;
1761 st->iso2022.mask = ~0;
1772 if (!mask_has_at_most_one_bit_p(st->iso2022.mask))
1773 st->iso2022.mask = detect_coding_iso2022(st, src, n);
1774 if (!mask_has_at_most_one_bit_p(st->shift_jis.mask))
1775 st->shift_jis.mask = detect_coding_sjis(st, src, n);
1776 if (!mask_has_at_most_one_bit_p(st->big5.mask))
1777 st->big5.mask = detect_coding_big5(st, src, n);
1778 if (!mask_has_at_most_one_bit_p(st->utf8.mask))
1779 st->utf8.mask = detect_coding_utf8(st, src, n);
1780 if (!mask_has_at_most_one_bit_p(st->ucs4.mask))
1781 st->ucs4.mask = detect_coding_ucs4(st, src, n);
1783 st->mask = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1784 | st->utf8.mask | st->ucs4.mask;
1787 int retval = mask_has_at_most_one_bit_p(st->mask);
1788 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1789 return retval && st->eol_type != EOL_AUTODETECT;
1793 static Lisp_Object coding_system_from_mask(int mask)
1796 /* If the file was entirely or basically ASCII, use the
1797 default value of `buffer-file-coding-system'. */
1798 Lisp_Object retval =
1799 XBUFFER(Vbuffer_defaults)->buffer_file_coding_system;
1800 if (!NILP(retval)) {
1801 retval = Ffind_coding_system(retval);
1804 (Qbad_variable, Qwarning,
1805 "Invalid `default-buffer-file-coding-system', set to nil");
1806 XBUFFER(Vbuffer_defaults)->
1807 buffer_file_coding_system = Qnil;
1811 retval = Fget_coding_system(Qraw_text);
1817 mask = postprocess_iso2022_mask(mask);
1819 /* Look through the coding categories by priority and find
1820 the first one that is allowed. */
1821 for (i = 0; i < CODING_CATEGORY_LAST; i++) {
1822 cat = fcd->coding_category_by_priority[i];
1823 if ((mask & (1 << cat)) &&
1824 !NILP(fcd->coding_category_system[cat]))
1828 return fcd->coding_category_system[cat];
1830 return Fget_coding_system(Qraw_text);
1834 /* Given a seekable read stream and potential coding system and EOL type
1835 as specified, do any autodetection that is called for. If the
1836 coding system and/or EOL type are not `autodetect', they will be left
1837 alone; but this function will never return an autodetect coding system
1840 This function does not automatically fetch subsidiary coding systems;
1841 that should be unnecessary with the explicit eol-type argument. */
1843 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1844 /* number of leading lines to check for a coding cookie */
1845 #define LINES_TO_CHECK 2
1848 determine_real_coding_system(lstream_t stream, Lisp_Object * codesys_in_out,
1849 eol_type_t * eol_type_in_out)
1851 struct detection_state decst;
1853 if (*eol_type_in_out == EOL_AUTODETECT)
1854 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE(*codesys_in_out);
1857 decst.eol_type = *eol_type_in_out;
1860 /* If autodetection is called for, do it now. */
1861 if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT
1862 || *eol_type_in_out == EOL_AUTODETECT) {
1864 Lisp_Object coding_system = Qnil;
1866 Lstream_data_count nread =
1867 Lstream_read(stream, buf, sizeof(buf));
1869 int lines_checked = 0;
1871 /* Look for initial "-*-"; mode line prefix */
1873 scan_end = buf + nread - LENGTH("-*-coding:?-*-");
1874 p <= scan_end && lines_checked < LINES_TO_CHECK; p++)
1875 if (*p == '-' && *(p + 1) == '*' && *(p + 2) == '-') {
1876 Extbyte *local_vars_beg = p + 3;
1877 /* Look for final "-*-"; mode line suffix */
1878 for (p = local_vars_beg,
1879 scan_end = buf + nread - LENGTH("-*-");
1881 && lines_checked < LINES_TO_CHECK; p++)
1882 if (*p == '-' && *(p + 1) == '*'
1883 && *(p + 2) == '-') {
1884 Extbyte *suffix = p;
1885 /* Look for "coding:" */
1886 for (p = local_vars_beg,
1914 /* Get coding system name */
1917 /* Characters valid in a MIME charset name (rfc 1521),
1918 and in a Lisp symbol name. */
1921 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1922 "abcdefghijklmnopqrstuvwxyz"
1944 /* #### file must use standard EOLs or we miss 2d line */
1945 /* #### not to mention this is broken for UTF-16 DOS files */
1946 else if (*p == '\n' || *p == '\r') {
1948 /* skip past multibyte (DOS) newline */
1950 && *(p + 1) == '\n')
1955 /* #### file must use standard EOLs or we miss 2d line */
1956 /* #### not to mention this is broken for UTF-16 DOS files */
1957 else if (*p == '\n' || *p == '\r') {
1959 /* skip past multibyte (DOS) newline */
1960 if (*p == '\r' && *(p + 1) == '\n')
1964 if (NILP(coding_system))
1966 if (detect_coding_type(&decst, buf, nread,
1969 != CODESYS_AUTODETECT))
1971 nread = Lstream_read(stream, buf, sizeof(buf));
1977 else if (XCODING_SYSTEM_TYPE(*codesys_in_out) ==
1979 && XCODING_SYSTEM_EOL_TYPE(coding_system) ==
1982 if (detect_coding_type(&decst, buf, nread, 1))
1984 nread = Lstream_read(stream, buf, sizeof(buf));
1990 *eol_type_in_out = decst.eol_type;
1991 if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT) {
1992 if (NILP(coding_system))
1994 coding_system_from_mask(decst.mask);
1996 *codesys_in_out = coding_system;
2000 /* If we absolutely can't determine the EOL type, just assume LF. */
2001 if (*eol_type_in_out == EOL_AUTODETECT)
2002 *eol_type_in_out = EOL_LF;
2004 Lstream_rewind(stream);
2007 DEFUN("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
2008 Detect coding system of the text in the region between START and END.
2009 Return a list of possible coding systems ordered by priority.
2010 If only ASCII characters are found, return 'undecided or one of
2011 its subsidiary coding systems according to a detected end-of-line
2012 type. Optional arg BUFFER defaults to the current buffer.
2014 (start, end, buffer))
2016 Lisp_Object val = Qnil;
2017 struct buffer *buf = decode_buffer(buffer, 0);
2019 Lisp_Object instream, lb_instream;
2020 lstream_t istr, lb_istr;
2021 struct detection_state decst;
2022 struct gcpro gcpro1, gcpro2;
2024 get_buffer_range_char(buf, start, end, &b, &e, 0);
2025 lb_instream = make_lisp_buffer_input_stream(buf, b, e, 0);
2026 lb_istr = XLSTREAM(lb_instream);
2028 make_encoding_input_stream(lb_istr, Fget_coding_system(Qbinary));
2029 istr = XLSTREAM(instream);
2030 GCPRO2(instream, lb_instream);
2032 decst.eol_type = EOL_AUTODETECT;
2035 Extbyte random_buffer[4096];
2036 Lstream_data_count nread =
2037 Lstream_read(istr, random_buffer, sizeof(random_buffer));
2041 if (detect_coding_type(&decst, random_buffer, nread, 0))
2045 if (decst.mask == ~0)
2046 val = subsidiary_coding_system(Fget_coding_system(Qundecided),
2053 decst.mask = postprocess_iso2022_mask(decst.mask);
2055 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--) {
2056 int sys = fcd->coding_category_by_priority[i];
2057 if (decst.mask & (1 << sys)) {
2058 Lisp_Object codesys =
2059 fcd->coding_category_system[sys];
2062 subsidiary_coding_system(codesys,
2065 val = Fcons(codesys, val);
2069 Lstream_close(istr);
2071 Lstream_delete(istr);
2072 Lstream_delete(lb_istr);
2076 /************************************************************************/
2077 /* Converting to internal Mule format ("decoding") */
2078 /************************************************************************/
2080 /* A decoding stream is a stream used for decoding text (i.e.
2081 converting from some external format to internal format).
2082 The decoding-stream object keeps track of the actual coding
2083 stream, the stream that is at the other end, and data that
2084 needs to be persistent across the lifetime of the stream. */
2086 /* Handle the EOL stuff related to just-read-in character C.
2087 EOL_TYPE is the EOL type of the coding stream.
2088 FLAGS is the current value of FLAGS in the coding stream, and may
2089 be modified by this macro. (The macro only looks at the
2090 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
2091 bytes are to be written. You need to also define a local goto
2092 label "label_continue_loop" that is at the end of the main
2093 character-reading loop.
2095 If C is a CR character, then this macro handles it entirely and
2096 jumps to label_continue_loop. Otherwise, this macro does not add
2097 anything to DST, and continues normally. You should continue
2098 processing C normally after this macro. */
2100 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2104 if (eol_type == EOL_CR) \
2105 Dynarr_add (dst, '\n'); \
2106 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2107 Dynarr_add (dst, c); \
2109 flags |= CODING_STATE_CR; \
2110 goto label_continue_loop; \
2112 else if (flags & CODING_STATE_CR) \
2113 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2115 Dynarr_add (dst, '\r'); \
2116 flags &= ~CODING_STATE_CR; \
2120 /* C should be a binary character in the range 0 - 255; convert
2121 to internal format and add to Dynarr DST. */
2123 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2125 if (BYTE_ASCII_P (c)) \
2126 Dynarr_add (dst, c); \
2127 else if (BYTE_C1_P (c)) \
2129 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2130 Dynarr_add (dst, c + 0x20); \
2134 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2135 Dynarr_add (dst, c); \
2139 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2143 DECODE_ADD_BINARY_CHAR (ch, dst); \
2148 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2150 if (flags & CODING_STATE_END) \
2152 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2153 if (flags & CODING_STATE_CR) \
2154 Dynarr_add (dst, '\r'); \
2158 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2160 typedef struct decoding_stream_s *decoding_stream_t;
2161 struct decoding_stream_s {
2162 /* Coding system that governs the conversion. */
2163 Lisp_Coding_System *codesys;
2165 /* Stream that we read the encoded data from or
2166 write the decoded data to. */
2167 lstream_t other_end;
2169 /* If we are reading, then we can return only a fixed amount of
2170 data, so if the conversion resulted in too much data, we store it
2171 here for retrieval the next time around. */
2172 unsigned_char_dynarr *runoff;
2174 /* FLAGS holds flags indicating the current state of the decoding.
2175 Some of these flags are dependent on the coding system. */
2178 /* CH holds a partially built-up character. Since we only deal
2179 with one- and two-byte characters at the moment, we only use
2180 this to store the first byte of a two-byte character. */
2183 /* EOL_TYPE specifies the type of end-of-line conversion that
2184 currently applies. We need to keep this separate from the
2185 EOL type stored in CODESYS because the latter might indicate
2186 automatic EOL-type detection while the former will always
2187 indicate a particular EOL type. */
2188 eol_type_t eol_type;
2190 /* Additional ISO2022 information. We define the structure above
2191 because it's also needed by the detection routines. */
2192 struct iso2022_decoder iso2022;
2194 /* Additional information (the state of the running CCL program)
2195 used by the CCL decoder. */
2196 struct ccl_program ccl;
2198 /* counter for UTF-8 or UCS-4 */
2199 unsigned char counter;
2201 struct detection_state decst;
2204 static Lstream_data_count
2205 decoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size);
2206 static Lstream_data_count
2207 decoding_writer(lstream_t stream,
2208 const unsigned char *data, Lstream_data_count size);
2209 static int decoding_rewinder(lstream_t stream);
2210 static int decoding_seekable_p(lstream_t stream);
2211 static int decoding_flusher(lstream_t stream);
2212 static int decoding_closer(lstream_t stream);
2214 static Lisp_Object decoding_marker(Lisp_Object stream);
2216 DEFINE_LSTREAM_IMPLEMENTATION("decoding", lstream_decoding,
2217 sizeof(struct decoding_stream_s));
2220 decoding_marker(Lisp_Object stream)
2222 lstream_t str = DECODING_STREAM_DATA(XLSTREAM(stream))->other_end;
2223 Lisp_Object str_obj;
2225 /* We do not need to mark the coding systems or charsets stored
2226 within the stream because they are stored in a global list
2227 and automatically marked. */
2229 XSETLSTREAM(str_obj, str);
2230 mark_object(str_obj);
2231 if (str->imp->marker) {
2232 return str->imp->marker(str_obj);
2238 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2239 so we read data from the other end, decode it, and store it into DATA. */
2241 static Lstream_data_count
2242 decoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size)
2244 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2245 unsigned char *orig_data = data;
2246 Lstream_data_count read_size;
2247 int error_occurred = 0;
2249 /* We need to interface to mule_decode(), which expects to take some
2250 amount of data and store the result into a Dynarr. We have
2251 mule_decode() store into str->runoff, and take data from there
2254 /* We loop until we have enough data, reading chunks from the other
2255 end and decoding it. */
2257 /* Take data from the runoff if we can. Make sure to take at
2258 most SIZE bytes, and delete the data from the runoff. */
2259 if (Dynarr_length(str->runoff) > 0) {
2260 Lstream_data_count chunk =
2262 (Lstream_data_count)
2263 Dynarr_length(str->runoff));
2264 memcpy(data, Dynarr_atp(str->runoff, 0), chunk);
2265 Dynarr_delete_many(str->runoff, 0, chunk);
2271 /* No more room for data */
2275 if (str->flags & CODING_STATE_END) {
2276 /* This means that on the previous iteration, we hit the
2277 EOF on the other end. We loop once more so that
2278 mule_decode() can output any final stuff it may be
2279 holding, or any "go back to a sane state" escape
2280 sequences. (This latter makes sense during
2285 /* Exhausted the runoff, so get some more. DATA has at least
2286 SIZE bytes left of storage in it, so it's OK to read directly
2287 into it. (We'll be overwriting above, after we've decoded it
2288 into the runoff.) */
2289 read_size = Lstream_read(str->other_end, data, size);
2290 if (read_size < 0) {
2294 if (read_size == 0) {
2295 /* There might be some more end data produced in the
2296 translation. See the comment above. */
2297 str->flags |= CODING_STATE_END;
2299 mule_decode(stream, (Extbyte *) data, str->runoff, read_size);
2302 if (data - orig_data == 0) {
2303 return error_occurred ? -1 : 0;
2305 return data - orig_data;
2309 static Lstream_data_count
2310 decoding_writer(lstream_t stream, const unsigned char *data,
2311 Lstream_data_count size)
2313 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2314 Lstream_data_count retval;
2316 /* Decode all our data into the runoff, and then attempt to write
2317 it all out to the other end. Remove whatever chunk we succeeded
2319 mule_decode(stream, (const Extbyte *)data, str->runoff, size);
2320 retval = Lstream_write(str->other_end, Dynarr_atp(str->runoff, 0),
2321 Dynarr_length(str->runoff));
2323 Dynarr_delete_many(str->runoff, 0, retval);
2325 /* Do NOT return retval. The return value indicates how much
2326 of the incoming data was written, not how many bytes were
2332 reset_decoding_stream(decoding_stream_t str)
2335 if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_ISO2022) {
2336 Lisp_Object coding_system;
2337 XSETCODING_SYSTEM(coding_system, str->codesys);
2338 reset_iso2022(coding_system, &str->iso2022);
2339 } else if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_CCL) {
2340 setup_ccl_program(&str->ccl,
2341 CODING_SYSTEM_CCL_DECODE(str->codesys));
2345 if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT
2346 || CODING_SYSTEM_EOL_TYPE(str->codesys) == EOL_AUTODETECT) {
2348 str->decst.eol_type = EOL_AUTODETECT;
2349 str->decst.mask = ~0;
2351 str->flags = str->ch = 0;
2355 decoding_rewinder(lstream_t stream)
2357 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2358 reset_decoding_stream(str);
2359 Dynarr_reset(str->runoff);
2360 return Lstream_rewind(str->other_end);
2364 decoding_seekable_p(lstream_t stream)
2366 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2367 return Lstream_seekable_p(str->other_end);
2371 decoding_flusher(lstream_t stream)
2373 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2374 return Lstream_flush(str->other_end);
2378 decoding_closer(lstream_t stream)
2380 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2381 if (stream->flags & LSTREAM_FL_WRITE) {
2382 str->flags |= CODING_STATE_END;
2383 decoding_writer(stream, 0, 0);
2385 Dynarr_free(str->runoff);
2387 #ifdef ENABLE_COMPOSITE_CHARS
2388 if (str->iso2022.composite_chars) {
2389 Dynarr_free(str->iso2022.composite_chars);
2393 return Lstream_close(str->other_end);
2397 decoding_stream_coding_system(lstream_t stream)
2399 Lisp_Object coding_system;
2400 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2402 XSETCODING_SYSTEM(coding_system, str->codesys);
2403 return subsidiary_coding_system(coding_system, str->eol_type);
2407 set_decoding_stream_coding_system(lstream_t lstr, Lisp_Object codesys)
2409 Lisp_Coding_System *cs = XCODING_SYSTEM(codesys);
2410 decoding_stream_t str = DECODING_STREAM_DATA(lstr);
2412 if (CODING_SYSTEM_EOL_TYPE(cs) != EOL_AUTODETECT) {
2413 str->eol_type = CODING_SYSTEM_EOL_TYPE(cs);
2415 reset_decoding_stream(str);
2419 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2420 stream for writing, no automatic code detection will be performed.
2421 The reason for this is that automatic code detection requires a
2422 seekable input. Things will also fail if you open a decoding
2423 stream for reading using a non-fully-specified coding system and
2424 a non-seekable input stream. */
2427 make_decoding_stream_1(lstream_t stream, Lisp_Object codesys, const char *mode)
2429 lstream_t lstr = Lstream_new(lstream_decoding, mode);
2430 decoding_stream_t str = DECODING_STREAM_DATA(lstr);
2434 str->other_end = stream;
2435 str->runoff = (unsigned_char_dynarr *) Dynarr_new(unsigned_char);
2436 str->eol_type = EOL_AUTODETECT;
2437 if (!strcmp(mode, "r") && Lstream_seekable_p(stream)) {
2438 /* We can determine the coding system now. */
2439 determine_real_coding_system(stream, &codesys, &str->eol_type);
2441 set_decoding_stream_coding_system(lstr, codesys);
2442 str->decst.eol_type = str->eol_type;
2443 str->decst.mask = ~0;
2444 XSETLSTREAM(obj, lstr);
2449 make_decoding_input_stream(lstream_t stream, Lisp_Object codesys)
2451 return make_decoding_stream_1(stream, codesys, "r");
2455 make_decoding_output_stream(lstream_t stream, Lisp_Object codesys)
2457 return make_decoding_stream_1(stream, codesys, "w");
2460 /* Note: the decode_coding_* functions all take the same
2461 arguments as mule_decode(), which is to say some SRC data of
2462 size N, which is to be stored into dynamic array DST.
2463 DECODING is the stream within which the decoding is
2464 taking place, but no data is actually read from or
2465 written to that stream; that is handled in decoding_reader()
2466 or decoding_writer(). This allows the same functions to
2467 be used for both reading and writing. */
2470 mule_decode(lstream_t decoding, const Extbyte * src,
2471 unsigned_char_dynarr * dst, Lstream_data_count n)
2473 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
2475 /* If necessary, do encoding-detection now. We do this when
2476 we're a writing stream or a non-seekable reading stream,
2477 meaning that we can't just process the whole input,
2478 rewind, and start over. */
2480 if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT ||
2481 str->eol_type == EOL_AUTODETECT) {
2482 Lisp_Object codesys;
2484 XSETCODING_SYSTEM(codesys, str->codesys);
2485 detect_coding_type(&str->decst, src, n,
2486 CODING_SYSTEM_TYPE(str->codesys) !=
2487 CODESYS_AUTODETECT);
2488 if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT &&
2489 str->decst.mask != ~0)
2490 /* #### This is cheesy. What we really ought to do is
2491 buffer up a certain amount of data so as to get a
2492 less random result. */
2493 codesys = coding_system_from_mask(str->decst.mask);
2494 str->eol_type = str->decst.eol_type;
2495 if (XCODING_SYSTEM(codesys) != str->codesys) {
2496 /* Preserve the CODING_STATE_END flag in case it was set.
2497 If we erase it, bad things might happen. */
2498 int was_end = str->flags & CODING_STATE_END;
2499 set_decoding_stream_coding_system(decoding, codesys);
2501 str->flags |= CODING_STATE_END;
2505 switch (CODING_SYSTEM_TYPE(str->codesys)) {
2506 #ifdef DEBUG_SXEMACS
2507 case CODESYS_INTERNAL:
2508 Dynarr_add_many(dst, src, n);
2511 case CODESYS_AUTODETECT:
2512 /* If we got this far and still haven't decided on the coding
2513 system, then do no conversion. */
2514 case CODESYS_NO_CONVERSION:
2515 decode_coding_no_conversion(decoding, src, dst, n);
2518 case CODESYS_SHIFT_JIS:
2519 decode_coding_sjis(decoding, src, dst, n);
2522 decode_coding_big5(decoding, src, dst, n);
2525 decode_coding_ucs4(decoding, src, dst, n);
2528 decode_coding_utf8(decoding, src, dst, n);
2531 str->ccl.last_block = str->flags & CODING_STATE_END;
2532 /* When applying ccl program to stream, MUST NOT set NULL
2534 ccl_driver(&str->ccl,
2536 ? (const unsigned char *)src
2537 : (const unsigned char *)""),
2538 dst, n, 0, CCL_MODE_DECODING);
2540 case CODESYS_ISO2022:
2541 decode_coding_iso2022(decoding, src, dst, n);
2549 DEFUN("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2550 Decode the text between START and END which is encoded in CODING-SYSTEM.
2551 This is useful if you've read in encoded text from a file without decoding
2552 it (e.g. you read in a JIS-formatted file but used the `binary' or
2553 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2554 Return length of decoded text.
2555 BUFFER defaults to the current buffer if unspecified.
2557 (start, end, coding_system, buffer))
2560 struct buffer *buf = decode_buffer(buffer, 0);
2561 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2562 lstream_t istr, ostr;
2563 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2565 get_buffer_range_char(buf, start, end, &b, &e, 0);
2567 barf_if_buffer_read_only(buf, b, e);
2569 coding_system = Fget_coding_system(coding_system);
2570 instream = make_lisp_buffer_input_stream(buf, b, e, 0);
2571 lb_outstream = make_lisp_buffer_output_stream(buf, b, 0);
2572 de_outstream = make_decoding_output_stream(XLSTREAM(lb_outstream),
2574 outstream = make_encoding_output_stream(XLSTREAM(de_outstream),
2575 Fget_coding_system(Qbinary));
2576 istr = XLSTREAM(instream);
2577 ostr = XLSTREAM(outstream);
2578 GCPRO4(instream, lb_outstream, de_outstream, outstream);
2580 /* The chain of streams looks like this:
2582 [BUFFER] <----- send through
2583 ------> [ENCODE AS BINARY]
2584 ------> [DECODE AS SPECIFIED]
2589 char tempbuf[1024]; /* some random amount */
2590 Bufpos newpos, even_newer_pos;
2591 Bufpos oldpos = lisp_buffer_stream_startpos(istr);
2592 Lstream_data_count size_in_bytes =
2593 Lstream_read(istr, tempbuf, sizeof(tempbuf));
2597 newpos = lisp_buffer_stream_startpos(istr);
2598 Lstream_write(ostr, tempbuf, size_in_bytes);
2599 even_newer_pos = lisp_buffer_stream_startpos(istr);
2600 buffer_delete_range(buf, even_newer_pos - (newpos - oldpos),
2603 Lstream_close(istr);
2604 Lstream_close(ostr);
2606 Lstream_delete(istr);
2607 Lstream_delete(ostr);
2608 Lstream_delete(XLSTREAM(de_outstream));
2609 Lstream_delete(XLSTREAM(lb_outstream));
2613 /************************************************************************/
2614 /* Converting to an external encoding ("encoding") */
2615 /************************************************************************/
2617 /* An encoding stream is an output stream. When you create the
2618 stream, you specify the coding system that governs the encoding
2619 and another stream that the resulting encoded data is to be
2620 sent to, and then start sending data to it. */
2622 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2624 typedef struct encoding_stream_s *encoding_stream_t;
2625 struct encoding_stream_s {
2626 /* Coding system that governs the conversion. */
2627 Lisp_Coding_System *codesys;
2629 /* Stream that we read the encoded data from or
2630 write the decoded data to. */
2631 lstream_t other_end;
2633 /* If we are reading, then we can return only a fixed amount of
2634 data, so if the conversion resulted in too much data, we store it
2635 here for retrieval the next time around. */
2636 unsigned_char_dynarr *runoff;
2638 /* FLAGS holds flags indicating the current state of the encoding.
2639 Some of these flags are dependent on the coding system. */
2642 /* CH holds a partially built-up character. Since we only deal
2643 with one- and two-byte characters at the moment, we only use
2644 this to store the first byte of a two-byte character. */
2647 /* Additional information used by the ISO2022 encoder. */
2649 /* CHARSET holds the character sets currently assigned to the G0
2650 through G3 registers. It is initialized from the array
2651 INITIAL_CHARSET in CODESYS. */
2652 Lisp_Object charset[4];
2654 /* Which registers are currently invoked into the left (GL) and
2655 right (GR) halves of the 8-bit encoding space? */
2656 int register_left, register_right;
2658 /* Whether we need to explicitly designate the charset in the
2659 G? register before using it. It is initialized from the
2660 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2661 unsigned char force_charset_on_output[4];
2663 /* Other state variables that need to be preserved across
2665 Lisp_Object current_charset;
2667 int current_char_boundary;
2670 /* Additional information (the state of the running CCL program)
2671 used by the CCL encoder. */
2672 struct ccl_program ccl;
2676 static Lstream_data_count
2677 encoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size);
2678 static Lstream_data_count
2679 encoding_writer(lstream_t stream,
2680 const unsigned char *data, Lstream_data_count size);
2681 static int encoding_rewinder(lstream_t stream);
2682 static int encoding_seekable_p(lstream_t stream);
2683 static int encoding_flusher(lstream_t stream);
2684 static int encoding_closer(lstream_t stream);
2686 static Lisp_Object encoding_marker(Lisp_Object stream);
2688 DEFINE_LSTREAM_IMPLEMENTATION("encoding", lstream_encoding,
2689 sizeof(struct encoding_stream_s));
2692 encoding_marker(Lisp_Object stream)
2694 lstream_t str = ENCODING_STREAM_DATA(XLSTREAM(stream))->other_end;
2695 Lisp_Object str_obj;
2697 /* We do not need to mark the coding systems or charsets stored
2698 within the stream because they are stored in a global list
2699 and automatically marked. */
2701 XSETLSTREAM(str_obj, str);
2702 mark_object(str_obj);
2703 if (str->imp->marker) {
2704 return str->imp->marker(str_obj);
2710 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2711 so we read data from the other end, encode it, and store it into DATA. */
2713 static Lstream_data_count
2714 encoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size)
2716 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2717 unsigned char *orig_data = data;
2718 Lstream_data_count read_size;
2719 int error_occurred = 0;
2721 /* We need to interface to mule_encode(), which expects to take some
2722 amount of data and store the result into a Dynarr. We have
2723 mule_encode() store into str->runoff, and take data from there
2726 /* We loop until we have enough data, reading chunks from the other
2727 end and encoding it. */
2729 /* Take data from the runoff if we can. Make sure to take at
2730 most SIZE bytes, and delete the data from the runoff. */
2731 if (Dynarr_length(str->runoff) > 0) {
2732 int chunk = min((int)size, Dynarr_length(str->runoff));
2733 memcpy(data, Dynarr_atp(str->runoff, 0), chunk);
2734 Dynarr_delete_many(str->runoff, 0, chunk);
2740 /* No more room for data */
2744 if (str->flags & CODING_STATE_END) {
2745 /* This means that on the previous iteration, we hit the
2746 EOF on the other end. We loop once more so that
2747 mule_encode() can output any final stuff it may be
2748 holding, or any "go back to a sane state" escape
2749 sequences. (This latter makes sense during
2754 /* Exhausted the runoff, so get some more. DATA at least SIZE
2755 bytes left of storage in it, so it's OK to read directly into
2756 it. (We'll be overwriting above, after we've encoded it into
2758 read_size = Lstream_read(str->other_end, data, size);
2759 if (read_size < 0) {
2763 if (read_size == 0) {
2764 /* There might be some more end data produced in the
2765 translation. See the comment above. */
2766 str->flags |= CODING_STATE_END;
2768 mule_encode(stream, data, str->runoff, read_size);
2771 if (data == orig_data) {
2772 return error_occurred ? -1 : 0;
2774 return data - orig_data;
2778 static Lstream_data_count
2779 encoding_writer(lstream_t stream, const unsigned char *data,
2780 Lstream_data_count size)
2782 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2783 Lstream_data_count retval;
2785 /* Encode all our data into the runoff, and then attempt to write
2786 it all out to the other end. Remove whatever chunk we succeeded
2788 mule_encode(stream, data, str->runoff, size);
2789 retval = Lstream_write(str->other_end, Dynarr_atp(str->runoff, 0),
2790 Dynarr_length(str->runoff));
2792 Dynarr_delete_many(str->runoff, 0, retval);
2794 /* Do NOT return retval. The return value indicates how much
2795 of the incoming data was written, not how many bytes were
2801 reset_encoding_stream(encoding_stream_t str)
2804 switch (CODING_SYSTEM_TYPE(str->codesys)) {
2805 case CODESYS_ISO2022: {
2808 for (i = 0; i < 4; i++) {
2809 str->iso2022.charset[i] =
2810 CODING_SYSTEM_ISO2022_INITIAL_CHARSET(
2812 str->iso2022.force_charset_on_output[i] =
2813 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT(
2816 str->iso2022.register_left = 0;
2817 str->iso2022.register_right = 1;
2818 str->iso2022.current_charset = Qnil;
2819 str->iso2022.current_half = 0;
2820 str->iso2022.current_char_boundary = 1;
2824 setup_ccl_program(&str->ccl,
2825 CODING_SYSTEM_CCL_ENCODE(str->codesys));
2828 /* list the rest of them lot explicitly */
2829 case CODESYS_AUTODETECT:
2830 case CODESYS_SHIFT_JIS:
2834 case CODESYS_NO_CONVERSION:
2835 #ifdef DEBUG_SXEMACS
2836 case CODESYS_INTERNAL:
2843 str->flags = str->ch = 0;
2847 encoding_rewinder(lstream_t stream)
2849 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2850 reset_encoding_stream(str);
2851 Dynarr_reset(str->runoff);
2852 return Lstream_rewind(str->other_end);
2856 encoding_seekable_p(lstream_t stream)
2858 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2859 return Lstream_seekable_p(str->other_end);
2863 encoding_flusher(lstream_t stream)
2865 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2866 return Lstream_flush(str->other_end);
2870 encoding_closer(lstream_t stream)
2872 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2873 if (stream->flags & LSTREAM_FL_WRITE) {
2874 str->flags |= CODING_STATE_END;
2875 encoding_writer(stream, 0, 0);
2877 Dynarr_free(str->runoff);
2878 return Lstream_close(str->other_end);
2882 encoding_stream_coding_system(lstream_t stream)
2884 Lisp_Object coding_system;
2885 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2887 XSETCODING_SYSTEM(coding_system, str->codesys);
2888 return coding_system;
2892 set_encoding_stream_coding_system(lstream_t lstr, Lisp_Object codesys)
2894 Lisp_Coding_System *cs = XCODING_SYSTEM(codesys);
2895 encoding_stream_t str = ENCODING_STREAM_DATA(lstr);
2897 reset_encoding_stream(str);
2901 make_encoding_stream_1(lstream_t stream, Lisp_Object codesys, const char *mode)
2903 lstream_t lstr = Lstream_new(lstream_encoding, mode);
2904 encoding_stream_t str = ENCODING_STREAM_DATA(lstr);
2908 str->runoff = Dynarr_new(unsigned_char);
2909 str->other_end = stream;
2910 set_encoding_stream_coding_system(lstr, codesys);
2911 XSETLSTREAM(obj, lstr);
2916 make_encoding_input_stream(lstream_t stream, Lisp_Object codesys)
2918 return make_encoding_stream_1(stream, codesys, "r");
2922 make_encoding_output_stream(lstream_t stream, Lisp_Object codesys)
2924 return make_encoding_stream_1(stream, codesys, "w");
2927 /* Convert N bytes of internally-formatted data stored in SRC to an
2928 external format, according to the encoding stream ENCODING.
2929 Store the encoded data into DST. */
2932 mule_encode(lstream_t encoding, const Bufbyte * src,
2933 unsigned_char_dynarr * dst, Lstream_data_count n)
2935 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
2937 switch (CODING_SYSTEM_TYPE(str->codesys)) {
2938 #ifdef DEBUG_SXEMACS
2939 case CODESYS_INTERNAL:
2940 Dynarr_add_many(dst, src, n);
2943 case CODESYS_AUTODETECT:
2944 /* If we got this far and still haven't decided on the coding
2945 system, then do no conversion. */
2946 case CODESYS_NO_CONVERSION:
2947 encode_coding_no_conversion(encoding, src, dst, n);
2950 case CODESYS_SHIFT_JIS:
2951 encode_coding_sjis(encoding, src, dst, n);
2954 encode_coding_big5(encoding, src, dst, n);
2957 encode_coding_ucs4(encoding, src, dst, n);
2960 encode_coding_utf8(encoding, src, dst, n);
2963 str->ccl.last_block = str->flags & CODING_STATE_END;
2964 /* When applying ccl program to stream, MUST NOT set NULL
2966 ccl_driver(&str->ccl, ((src) ? src : (unsigned char *)""),
2967 dst, n, 0, CCL_MODE_ENCODING);
2969 case CODESYS_ISO2022:
2970 encode_coding_iso2022(encoding, src, dst, n);
2978 DEFUN("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2979 Encode the text between START and END using CODING-SYSTEM.
2980 This will, for example, convert Japanese characters into stuff such as
2981 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2982 text. BUFFER defaults to the current buffer if unspecified.
2984 (start, end, coding_system, buffer))
2987 struct buffer *buf = decode_buffer(buffer, 0);
2988 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2989 lstream_t istr, ostr;
2990 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2992 get_buffer_range_char(buf, start, end, &b, &e, 0);
2994 barf_if_buffer_read_only(buf, b, e);
2996 coding_system = Fget_coding_system(coding_system);
2997 instream = make_lisp_buffer_input_stream(buf, b, e, 0);
2998 lb_outstream = make_lisp_buffer_output_stream(buf, b, 0);
2999 de_outstream = make_decoding_output_stream(XLSTREAM(lb_outstream),
3000 Fget_coding_system(Qbinary));
3001 outstream = make_encoding_output_stream(XLSTREAM(de_outstream),
3003 istr = XLSTREAM(instream);
3004 ostr = XLSTREAM(outstream);
3005 GCPRO4(instream, outstream, de_outstream, lb_outstream);
3006 /* The chain of streams looks like this:
3008 [BUFFER] <----- send through
3009 ------> [ENCODE AS SPECIFIED]
3010 ------> [DECODE AS BINARY]
3014 char tempbuf[1024]; /* some random amount */
3015 Bufpos newpos, even_newer_pos;
3016 Bufpos oldpos = lisp_buffer_stream_startpos(istr);
3017 Lstream_data_count size_in_bytes =
3018 Lstream_read(istr, tempbuf, sizeof(tempbuf));
3022 newpos = lisp_buffer_stream_startpos(istr);
3023 Lstream_write(ostr, tempbuf, size_in_bytes);
3024 even_newer_pos = lisp_buffer_stream_startpos(istr);
3025 buffer_delete_range(buf, even_newer_pos - (newpos - oldpos),
3031 lisp_buffer_stream_startpos(XLSTREAM(instream)) - b;
3032 Lstream_close(istr);
3033 Lstream_close(ostr);
3035 Lstream_delete(istr);
3036 Lstream_delete(ostr);
3037 Lstream_delete(XLSTREAM(de_outstream));
3038 Lstream_delete(XLSTREAM(lb_outstream));
3039 return make_int(retlen);
3045 /************************************************************************/
3046 /* Shift-JIS methods */
3047 /************************************************************************/
3049 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3050 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3051 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3052 encoded by "position-code + 0x80". A character of JISX0208
3053 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3054 position-codes are divided and shifted so that it fit in the range
3057 --- CODE RANGE of Shift-JIS ---
3058 (character set) (range)
3060 JISX0201-Kana 0xA0 .. 0xDF
3061 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
3062 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3063 -------------------------------
3067 /* Is this the first byte of a Shift-JIS two-byte char? */
3069 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3070 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3072 /* Is this the second byte of a Shift-JIS two-byte char? */
3074 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3075 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3077 #define BYTE_SJIS_KATAKANA_P(c) \
3078 ((c) >= 0xA1 && (c) <= 0xDF)
3081 detect_coding_sjis(struct detection_state *st, const Extbyte * src,
3082 Lstream_data_count n)
3085 const unsigned char c = *(const unsigned char *)src++;
3086 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3088 if (st->shift_jis.in_second_byte) {
3089 st->shift_jis.in_second_byte = 0;
3092 } else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3093 st->shift_jis.in_second_byte = 1;
3095 return CODING_CATEGORY_SHIFT_JIS_MASK;
3098 /* Convert Shift-JIS data to internal format. */
3101 decode_coding_sjis(lstream_t decoding, const Extbyte * src,
3102 unsigned_char_dynarr * dst, Lstream_data_count n)
3104 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3105 unsigned int flags = str->flags;
3106 unsigned int ch = str->ch;
3107 eol_type_t eol_type = str->eol_type;
3110 const unsigned char c = *(const unsigned char *)src++;
3113 /* Previous character was first byte of Shift-JIS Kanji
3115 if (BYTE_SJIS_TWO_BYTE_2_P(c)) {
3116 unsigned char e1, e2;
3118 Dynarr_add(dst, LEADING_BYTE_JAPANESE_JISX0208);
3119 DECODE_SJIS(ch, c, e1, e2);
3120 Dynarr_add(dst, e1);
3121 Dynarr_add(dst, e2);
3123 DECODE_ADD_BINARY_CHAR(ch, dst);
3124 DECODE_ADD_BINARY_CHAR(c, dst);
3128 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3129 if (BYTE_SJIS_TWO_BYTE_1_P(c))
3131 else if (BYTE_SJIS_KATAKANA_P(c)) {
3132 Dynarr_add(dst, LEADING_BYTE_KATAKANA_JISX0201);
3135 DECODE_ADD_BINARY_CHAR(c, dst);
3137 label_continue_loop:;
3140 DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
3146 /* Convert internally-formatted data to Shift-JIS. */
3149 encode_coding_sjis(lstream_t encoding, const Bufbyte * src,
3150 unsigned_char_dynarr * dst, Lstream_data_count n)
3152 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3153 unsigned int flags = str->flags;
3154 unsigned int ch = str->ch;
3155 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3160 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3161 Dynarr_add(dst, '\r');
3162 if (eol_type != EOL_CR)
3163 Dynarr_add(dst, '\n');
3165 } else if (BYTE_ASCII_P(c)) {
3168 } else if (BUFBYTE_LEADING_BYTE_P(c))
3169 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
3170 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3171 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
3173 if (ch == LEADING_BYTE_KATAKANA_JISX0201) {
3176 } else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3177 ch == LEADING_BYTE_JAPANESE_JISX0208)
3180 /* j1 is bessel j1 function,
3181 * so we use something else */
3182 /* unsigned char j1, j2; */
3183 unsigned char tt1, tt2;
3185 ENCODE_SJIS(ch, c, tt1, tt2);
3186 Dynarr_add(dst, tt1);
3187 Dynarr_add(dst, tt2);
3197 DEFUN("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3198 Decode a JISX0208 character of Shift-JIS coding-system.
3199 CODE is the character code in Shift-JIS as a cons of type bytes.
3200 Return the corresponding character.
3204 unsigned char c1, c2, s1, s2;
3207 CHECK_INT(XCAR(code));
3208 CHECK_INT(XCDR(code));
3209 s1 = XINT(XCAR(code));
3210 s2 = XINT(XCDR(code));
3211 if (BYTE_SJIS_TWO_BYTE_1_P(s1) && BYTE_SJIS_TWO_BYTE_2_P(s2)) {
3212 DECODE_SJIS(s1, s2, c1, c2);
3213 return make_char(MAKE_CHAR(Vcharset_japanese_jisx0208,
3214 c1 & 0x7F, c2 & 0x7F));
3219 DEFUN("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3220 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3221 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3225 Lisp_Object charset;
3228 CHECK_CHAR_COERCE_INT(character);
3229 BREAKUP_CHAR(XCHAR(character), charset, c1, c2);
3230 if (EQ(charset, Vcharset_japanese_jisx0208)) {
3231 ENCODE_SJIS(c1 | 0x80, c2 | 0x80, s1, s2);
3232 return Fcons(make_int(s1), make_int(s2));
3237 /************************************************************************/
3239 /************************************************************************/
3241 /* BIG5 is a coding system encoding two character sets: ASCII and
3242 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3243 character set and is encoded in two-byte.
3245 --- CODE RANGE of BIG5 ---
3246 (character set) (range)
3248 Big5 (1st byte) 0xA1 .. 0xFE
3249 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3250 --------------------------
3252 Since the number of characters in Big5 is larger than maximum
3253 characters in Emacs' charset (96x96), it can't be handled as one
3254 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3255 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3256 contains frequently used characters and the latter contains less
3257 frequently used characters. */
3259 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3260 ((c) >= 0xA1 && (c) <= 0xFE)
3262 /* Is this the second byte of a Shift-JIS two-byte char? */
3264 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3265 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3267 /* Number of Big5 characters which have the same code in 1st byte. */
3269 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3271 /* Code conversion macros. These are macros because they are used in
3272 inner loops during code conversion.
3274 Note that temporary variables in macros introduce the classic
3275 dynamic-scoping problems with variable names. We use capital-
3276 lettered variables in the assumption that SXEmacs does not use
3277 capital letters in variables except in a very formalized way
3280 /* Convert Big5 code (b1, b2) into its internal string representation
3283 /* There is a much simpler way to split the Big5 charset into two.
3284 For the moment I'm going to leave the algorithm as-is because it
3285 claims to separate out the most-used characters into a single
3286 charset, which perhaps will lead to optimizations in various
3289 The way the algorithm works is something like this:
3291 Big5 can be viewed as a 94x157 charset, where the row is
3292 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3293 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3294 the split between low and high column numbers is apparently
3295 meaningless; ascending rows produce less and less frequent chars.
3296 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3297 the first charset, and the upper half (0xC9 .. 0xFE) to the
3298 second. To do the conversion, we convert the character into
3299 a single number where 0 .. 156 is the first row, 157 .. 313
3300 is the second, etc. That way, the characters are ordered by
3301 decreasing frequency. Then we just chop the space in two
3302 and coerce the result into a 94x94 space.
3305 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3307 int B1 = b1, B2 = b2; \
3309 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3313 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3317 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3318 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3320 c1 = I / (0xFF - 0xA1) + 0xA1; \
3321 c2 = I % (0xFF - 0xA1) + 0xA1; \
3324 /* Convert the internal string representation of a Big5 character
3325 (lb, c1, c2) into Big5 code (b1, b2). */
3327 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3329 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3331 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3333 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3335 b1 = I / BIG5_SAME_ROW + 0xA1; \
3336 b2 = I % BIG5_SAME_ROW; \
3337 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3341 detect_coding_big5(struct detection_state *st, const Extbyte * src,
3342 Lstream_data_count n)
3345 const unsigned char c = *(const unsigned char *)src++;
3346 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3347 (c >= 0x80 && c <= 0xA0))
3349 if (st->big5.in_second_byte) {
3350 st->big5.in_second_byte = 0;
3351 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3353 } else if (c >= 0xA1)
3354 st->big5.in_second_byte = 1;
3356 return CODING_CATEGORY_BIG5_MASK;
3359 /* Convert Big5 data to internal format. */
3362 decode_coding_big5(lstream_t decoding, const Extbyte * src,
3363 unsigned_char_dynarr * dst, Lstream_data_count n)
3365 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3366 unsigned int flags = str->flags;
3367 unsigned int ch = str->ch;
3368 eol_type_t eol_type = str->eol_type;
3371 const unsigned char c = *(const unsigned char *)src++;
3373 /* Previous character was first byte of Big5 char. */
3374 if (BYTE_BIG5_TWO_BYTE_2_P(c)) {
3375 unsigned char b1, b2, b3;
3376 DECODE_BIG5(ch, c, b1, b2, b3);
3377 Dynarr_add(dst, b1);
3378 Dynarr_add(dst, b2);
3379 Dynarr_add(dst, b3);
3381 DECODE_ADD_BINARY_CHAR(ch, dst);
3382 DECODE_ADD_BINARY_CHAR(c, dst);
3386 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3387 if (BYTE_BIG5_TWO_BYTE_1_P(c))
3390 DECODE_ADD_BINARY_CHAR(c, dst);
3392 label_continue_loop:;
3395 DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
3401 /* Convert internally-formatted data to Big5. */
3404 encode_coding_big5(lstream_t encoding, const Bufbyte * src,
3405 unsigned_char_dynarr * dst, Lstream_data_count n)
3408 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3409 unsigned int flags = str->flags;
3410 unsigned int ch = str->ch;
3411 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3416 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3417 Dynarr_add(dst, '\r');
3418 if (eol_type != EOL_CR)
3419 Dynarr_add(dst, '\n');
3420 } else if (BYTE_ASCII_P(c)) {
3423 } else if (BUFBYTE_LEADING_BYTE_P(c)) {
3424 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3425 c == LEADING_BYTE_CHINESE_BIG5_2) {
3426 /* A recognized leading byte. */
3428 continue; /* not done with this character. */
3430 /* otherwise just ignore this character. */
3431 } else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3432 ch == LEADING_BYTE_CHINESE_BIG5_2) {
3433 /* Previous char was a recognized leading byte. */
3435 continue; /* not done with this character. */
3437 /* Encountering second byte of a Big5 character. */
3438 unsigned char b1, b2;
3440 ENCODE_BIG5(ch >> 8, ch & 0xFF, c, b1, b2);
3441 Dynarr_add(dst, b1);
3442 Dynarr_add(dst, b2);
3452 DEFUN("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3453 Decode a Big5 character CODE of BIG5 coding-system.
3454 CODE is the character code in BIG5, a cons of two integers.
3455 Return the corresponding character.
3459 unsigned char c1, c2, b1, b2;
3462 CHECK_INT(XCAR(code));
3463 CHECK_INT(XCDR(code));
3464 b1 = XINT(XCAR(code));
3465 b2 = XINT(XCDR(code));
3466 if (BYTE_BIG5_TWO_BYTE_1_P(b1) && BYTE_BIG5_TWO_BYTE_2_P(b2)) {
3468 Lisp_Object charset;
3469 DECODE_BIG5(b1, b2, leading_byte, c1, c2);
3470 charset = CHARSET_BY_LEADING_BYTE(leading_byte);
3471 return make_char(MAKE_CHAR(charset, c1 & 0x7F, c2 & 0x7F));
3476 DEFUN("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3477 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3478 Return the corresponding character code in Big5.
3482 Lisp_Object charset;
3485 CHECK_CHAR_COERCE_INT(character);
3486 BREAKUP_CHAR(XCHAR(character), charset, c1, c2);
3487 if (EQ(charset, Vcharset_chinese_big5_1) ||
3488 EQ(charset, Vcharset_chinese_big5_2)) {
3489 ENCODE_BIG5(XCHARSET_LEADING_BYTE(charset), c1 | 0x80,
3491 return Fcons(make_int(b1), make_int(b2));
3496 /************************************************************************/
3499 /* UCS-4 character codes are implemented as nonnegative integers. */
3501 /************************************************************************/
3503 DEFUN("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3504 Map UCS-4 code CODE to Mule character CHARACTER.
3506 Return T on success, NIL on failure.
3512 CHECK_CHAR(character);
3516 if (c < countof(fcd->ucs_to_mule_table)) {
3517 fcd->ucs_to_mule_table[c] = character;
3523 static Lisp_Object ucs_to_char(unsigned long code)
3525 if (code < countof(fcd->ucs_to_mule_table)) {
3526 return fcd->ucs_to_mule_table[code];
3527 } else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14)) {
3531 c = code % (94 * 94);
3533 (MAKE_CHAR(CHARSET_BY_ATTRIBUTES
3534 (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3535 CHARSET_LEFT_TO_RIGHT),
3536 c / 94 + 33, c % 94 + 33));
3541 DEFUN("ucs-char", Fucs_char, 1, 1, 0, /*
3542 Return Mule character corresponding to UCS code CODE (a positive integer).
3547 return ucs_to_char(XINT(code));
3550 DEFUN("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3551 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3555 /* #### Isn't this gilding the lily? Fput_char_table checks its args.
3556 Fset_char_ucs is more restrictive on index arg, but should
3557 check code arg in a char_table method. */
3558 CHECK_CHAR(character);
3560 return Fput_char_table(character, code, mule_to_ucs_table);
3563 DEFUN("char-ucs", Fchar_ucs, 1, 1, 0, /*
3564 Return the UCS code (a positive integer) corresponding to CHARACTER.
3568 return Fget_char_table(character, mule_to_ucs_table);
3571 /* Decode a UCS-4 character into a buffer. If the lookup fails, use
3572 <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3573 is not found, instead.
3574 #### do something more appropriate (use blob?)
3575 Danger, Will Robinson! Data loss. Should we signal user? */
3576 static void decode_ucs4(unsigned long ch, unsigned_char_dynarr * dst)
3578 Lisp_Object chr = ucs_to_char(ch);
3581 Bufbyte work[MAX_EMCHAR_LEN];
3586 simple_set_charptr_emchar(work, ch) :
3587 non_ascii_set_charptr_emchar(work, ch);
3588 Dynarr_add_many(dst, work, len);
3590 Dynarr_add(dst, LEADING_BYTE_JAPANESE_JISX0208);
3591 Dynarr_add(dst, 34 + 128);
3592 Dynarr_add(dst, 46 + 128);
3596 static unsigned long
3597 mule_char_to_ucs4(Lisp_Object charset, unsigned char h, unsigned char l)
3600 = Fget_char_table(make_char(MAKE_CHAR(charset, h & 127, l & 127)),
3605 } else if ((XCHARSET_DIMENSION(charset) == 2) &&
3606 (XCHARSET_CHARS(charset) == 94)) {
3607 unsigned char final = XCHARSET_FINAL(charset);
3609 if (('@' <= final) && (final < 0x7f)) {
3610 return 0xe00000 + (final - '@') * 94 * 94
3611 + ((h & 127) - 33) * 94 + (l & 127) - 33;
3621 encode_ucs4(Lisp_Object charset,
3622 unsigned char h, unsigned char l, unsigned_char_dynarr * dst)
3624 unsigned long code = mule_char_to_ucs4(charset, h, l);
3625 Dynarr_add(dst, code >> 24);
3626 Dynarr_add(dst, (code >> 16) & 255);
3627 Dynarr_add(dst, (code >> 8) & 255);
3628 Dynarr_add(dst, code & 255);
3632 detect_coding_ucs4(struct detection_state *st, const Extbyte * src,
3633 Lstream_data_count n)
3636 const unsigned char c = *(const unsigned char *)src++;
3637 switch (st->ucs4.in_byte) {
3645 st->ucs4.in_byte = 0;
3651 return CODING_CATEGORY_UCS4_MASK;
3655 decode_coding_ucs4(lstream_t decoding, const Extbyte * src,
3656 unsigned_char_dynarr * dst, Lstream_data_count n)
3658 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3659 unsigned int flags = str->flags;
3660 unsigned int ch = str->ch;
3661 unsigned char counter = str->counter;
3664 const unsigned char c = *(const unsigned char *)src++;
3671 decode_ucs4((ch << 8) | c, dst);
3680 if (counter & CODING_STATE_END)
3681 DECODE_OUTPUT_PARTIAL_CHAR(ch);
3685 str->counter = counter;
3689 encode_coding_ucs4(lstream_t encoding, const Bufbyte * src,
3690 unsigned_char_dynarr * dst, Lstream_data_count n)
3692 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3693 unsigned int flags = str->flags;
3694 unsigned int ch = str->ch;
3695 unsigned char char_boundary = str->iso2022.current_char_boundary;
3696 Lisp_Object charset = str->iso2022.current_charset;
3698 #ifdef ENABLE_COMPOSITE_CHARS
3699 /* flags for handling composite chars. We do a little switcharoo
3700 on the source while we're outputting the composite char. */
3701 unsigned int saved_n = 0;
3702 const unsigned char *saved_src = NULL;
3703 int in_composite = 0;
3709 unsigned char c = *src++;
3711 if (BYTE_ASCII_P(c)) { /* Processing ASCII character */
3713 encode_ucs4(Vcharset_ascii, c, 0, dst);
3715 } else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) { /* Processing Leading Byte */
3717 charset = CHARSET_BY_LEADING_BYTE(c);
3718 if (LEADING_BYTE_PREFIX_P(c))
3721 } else { /* Processing Non-ASCII character */
3723 if (EQ(charset, Vcharset_control_1)) {
3724 encode_ucs4(Vcharset_control_1, c, 0, dst);
3726 switch (XCHARSET_REP_BYTES(charset)) {
3728 encode_ucs4(charset, c, 0, dst);
3731 if (XCHARSET_PRIVATE_P(charset)) {
3732 encode_ucs4(charset, c, 0, dst);
3735 #ifdef ENABLE_COMPOSITE_CHARS
3738 Vcharset_composite)) {
3740 /* #### Bother! We don't know how to
3753 (Vcharset_composite,
3758 composite_char_string
3767 n = XSTRING_LENGTH(lstr);
3770 #endif /* ENABLE_COMPOSITE_CHARS */
3772 encode_ucs4(charset, ch,
3783 encode_ucs4(charset, ch, c,
3798 #ifdef ENABLE_COMPOSITE_CHARS
3803 goto back_to_square_n; /* Wheeeeeeeee ..... */
3805 #endif /* ENABLE_COMPOSITE_CHARS */
3809 str->iso2022.current_char_boundary = char_boundary;
3810 str->iso2022.current_charset = charset;
3812 /* Verbum caro factum est! */
3815 /************************************************************************/
3817 /************************************************************************/
3820 detect_coding_utf8(struct detection_state *st, const Extbyte * src,
3821 Lstream_data_count n)
3824 const unsigned char c = *(const unsigned char *)src++;
3825 switch (st->utf8.in_byte) {
3827 if (c == ISO_CODE_ESC || c == ISO_CODE_SI
3828 || c == ISO_CODE_SO)
3831 st->utf8.in_byte = 5;
3833 st->utf8.in_byte = 4;
3835 st->utf8.in_byte = 3;
3837 st->utf8.in_byte = 2;
3839 st->utf8.in_byte = 1;
3844 if ((c & 0xc0) != 0x80)
3850 return CODING_CATEGORY_UTF8_MASK;
3854 decode_coding_utf8(lstream_t decoding, const Extbyte * src,
3855 unsigned_char_dynarr * dst, Lstream_data_count n)
3857 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3858 unsigned int flags = str->flags;
3859 unsigned int ch = str->ch;
3860 eol_type_t eol_type = str->eol_type;
3861 unsigned char counter = str->counter;
3864 const unsigned char c = *(const unsigned char *)src++;
3870 } else if (c >= 0xf8) {
3873 } else if (c >= 0xf0) {
3876 } else if (c >= 0xe0) {
3879 } else if (c >= 0xc0) {
3883 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3884 decode_ucs4(c, dst);
3888 ch = (ch << 6) | (c & 0x3f);
3889 decode_ucs4(ch, dst);
3894 ch = (ch << 6) | (c & 0x3f);
3897 label_continue_loop:;
3900 if (flags & CODING_STATE_END)
3901 DECODE_OUTPUT_PARTIAL_CHAR(ch);
3905 str->counter = counter;
3909 encode_utf8(Lisp_Object charset,
3910 unsigned char h, unsigned char l, unsigned_char_dynarr * dst)
3912 unsigned long code = mule_char_to_ucs4(charset, h, l);
3914 Dynarr_add(dst, code);
3915 } else if (code <= 0x7ff) {
3916 Dynarr_add(dst, (code >> 6) | 0xc0);
3917 Dynarr_add(dst, (code & 0x3f) | 0x80);
3918 } else if (code <= 0xffff) {
3919 Dynarr_add(dst, (code >> 12) | 0xe0);
3920 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3921 Dynarr_add(dst, (code & 0x3f) | 0x80);
3922 } else if (code <= 0x1fffff) {
3923 Dynarr_add(dst, (code >> 18) | 0xf0);
3924 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3925 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3926 Dynarr_add(dst, (code & 0x3f) | 0x80);
3927 } else if (code <= 0x3ffffff) {
3928 Dynarr_add(dst, (code >> 24) | 0xf8);
3929 Dynarr_add(dst, ((code >> 18) & 0x3f) | 0x80);
3930 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3931 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3932 Dynarr_add(dst, (code & 0x3f) | 0x80);
3934 Dynarr_add(dst, (code >> 30) | 0xfc);
3935 Dynarr_add(dst, ((code >> 24) & 0x3f) | 0x80);
3936 Dynarr_add(dst, ((code >> 18) & 0x3f) | 0x80);
3937 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3938 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3939 Dynarr_add(dst, (code & 0x3f) | 0x80);
3944 encode_coding_utf8(lstream_t encoding, const Bufbyte * src,
3945 unsigned_char_dynarr * dst, Lstream_data_count n)
3947 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3948 unsigned int flags = str->flags;
3949 unsigned int ch = str->ch;
3950 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3951 unsigned char char_boundary = str->iso2022.current_char_boundary;
3952 Lisp_Object charset = str->iso2022.current_charset;
3954 #ifdef ENABLE_COMPOSITE_CHARS
3955 /* flags for handling composite chars. We do a little switcharoo
3956 on the source while we're outputting the composite char. */
3957 unsigned int saved_n = 0;
3958 const unsigned char *saved_src = NULL;
3959 int in_composite = 0;
3962 #endif /* ENABLE_COMPOSITE_CHARS */
3965 unsigned char c = *src++;
3967 if (BYTE_ASCII_P(c)) { /* Processing ASCII character */
3970 if (eol_type != EOL_LF
3971 && eol_type != EOL_AUTODETECT)
3972 Dynarr_add(dst, '\r');
3973 if (eol_type != EOL_CR)
3976 encode_utf8(Vcharset_ascii, c, 0, dst);
3978 } else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) { /* Processing Leading Byte */
3980 charset = CHARSET_BY_LEADING_BYTE(c);
3981 if (LEADING_BYTE_PREFIX_P(c))
3984 } else { /* Processing Non-ASCII character */
3986 if (EQ(charset, Vcharset_control_1)) {
3987 encode_utf8(Vcharset_control_1, c, 0, dst);
3989 switch (XCHARSET_REP_BYTES(charset)) {
3991 encode_utf8(charset, c, 0, dst);
3994 if (XCHARSET_PRIVATE_P(charset)) {
3995 encode_utf8(charset, c, 0, dst);
3998 #ifdef ENABLE_COMPOSITE_CHARS
4001 Vcharset_composite)) {
4003 /* #### Bother! We don't know how to
4012 (Vcharset_composite,
4017 composite_char_string
4026 n = XSTRING_LENGTH(lstr);
4029 #endif /* ENABLE_COMPOSITE_CHARS */
4031 encode_utf8(charset, ch,
4042 encode_utf8(charset, ch, c,
4057 #ifdef ENABLE_COMPOSITE_CHARS
4062 goto back_to_square_n; /* Wheeeeeeeee ..... */
4068 str->iso2022.current_char_boundary = char_boundary;
4069 str->iso2022.current_charset = charset;
4071 /* Verbum caro factum est! */
4074 /************************************************************************/
4075 /* ISO2022 methods */
4076 /************************************************************************/
4078 /* The following note describes the coding system ISO2022 briefly.
4079 Since the intention of this note is to help understand the
4080 functions in this file, some parts are NOT ACCURATE or OVERLY
4081 SIMPLIFIED. For thorough understanding, please refer to the
4082 original document of ISO2022.
4084 ISO2022 provides many mechanisms to encode several character sets
4085 in 7-bit and 8-bit environments. For 7-bit environments, all text
4086 is encoded using bytes less than 128. This may make the encoded
4087 text a little bit longer, but the text passes more easily through
4088 several gateways, some of which strip off MSB (Most Signigant Bit).
4090 There are two kinds of character sets: control character set and
4091 graphic character set. The former contains control characters such
4092 as `newline' and `escape' to provide control functions (control
4093 functions are also provided by escape sequences). The latter
4094 contains graphic characters such as 'A' and '-'. Emacs recognizes
4095 two control character sets and many graphic character sets.
4097 Graphic character sets are classified into one of the following
4098 four classes, according to the number of bytes (DIMENSION) and
4099 number of characters in one dimension (CHARS) of the set:
4100 - DIMENSION1_CHARS94
4101 - DIMENSION1_CHARS96
4102 - DIMENSION2_CHARS94
4103 - DIMENSION2_CHARS96
4105 In addition, each character set is assigned an identification tag,
4106 unique for each set, called "final character" (denoted as <F>
4107 hereafter). The <F> of each character set is decided by ECMA(*)
4108 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4109 (0x30..0x3F are for private use only).
4111 Note (*): ECMA = European Computer Manufacturers Association
4113 Here are examples of graphic character set [NAME(<F>)]:
4114 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4115 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4116 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4117 o DIMENSION2_CHARS96 -- none for the moment
4119 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4120 C0 [0x00..0x1F] -- control character plane 0
4121 GL [0x20..0x7F] -- graphic character plane 0
4122 C1 [0x80..0x9F] -- control character plane 1
4123 GR [0xA0..0xFF] -- graphic character plane 1
4125 A control character set is directly designated and invoked to C0 or
4126 C1 by an escape sequence. The most common case is that:
4127 - ISO646's control character set is designated/invoked to C0, and
4128 - ISO6429's control character set is designated/invoked to C1,
4129 and usually these designations/invocations are omitted in encoded
4130 text. In a 7-bit environment, only C0 can be used, and a control
4131 character for C1 is encoded by an appropriate escape sequence to
4132 fit into the environment. All control characters for C1 are
4133 defined to have corresponding escape sequences.
4135 A graphic character set is at first designated to one of four
4136 graphic registers (G0 through G3), then these graphic registers are
4137 invoked to GL or GR. These designations and invocations can be
4138 done independently. The most common case is that G0 is invoked to
4139 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4140 these invocations and designations are omitted in encoded text.
4141 In a 7-bit environment, only GL can be used.
4143 When a graphic character set of CHARS94 is invoked to GL, codes
4144 0x20 and 0x7F of the GL area work as control characters SPACE and
4145 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4148 There are two ways of invocation: locking-shift and single-shift.
4149 With locking-shift, the invocation lasts until the next different
4150 invocation, whereas with single-shift, the invocation affects the
4151 following character only and doesn't affect the locking-shift
4152 state. Invocations are done by the following control characters or
4155 ----------------------------------------------------------------------
4156 abbrev function cntrl escape seq description
4157 ----------------------------------------------------------------------
4158 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4159 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4160 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4161 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4162 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4163 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4164 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4165 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4166 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4167 ----------------------------------------------------------------------
4168 (*) These are not used by any known coding system.
4170 Control characters for these functions are defined by macros
4171 ISO_CODE_XXX in `coding.h'.
4173 Designations are done by the following escape sequences:
4174 ----------------------------------------------------------------------
4175 escape sequence description
4176 ----------------------------------------------------------------------
4177 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4178 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4179 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4180 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4181 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4182 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4183 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4184 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4185 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4186 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4187 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4188 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4189 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4190 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4191 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4192 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4193 ----------------------------------------------------------------------
4195 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4196 of dimension 1, chars 94, and final character <F>, etc...
4198 Note (*): Although these designations are not allowed in ISO2022,
4199 Emacs accepts them on decoding, and produces them on encoding
4200 CHARS96 character sets in a coding system which is characterized as
4201 7-bit environment, non-locking-shift, and non-single-shift.
4203 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4204 '(' can be omitted. We refer to this as "short-form" hereafter.
4206 Now you may notice that there are a lot of ways for encoding the
4207 same multilingual text in ISO2022. Actually, there exist many
4208 coding systems such as Compound Text (used in X11's inter client
4209 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4210 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4211 localized platforms), and all of these are variants of ISO2022.
4213 In addition to the above, Emacs handles two more kinds of escape
4214 sequences: ISO6429's direction specification and Emacs' private
4215 sequence for specifying character composition.
4217 ISO6429's direction specification takes the following form:
4218 o CSI ']' -- end of the current direction
4219 o CSI '0' ']' -- end of the current direction
4220 o CSI '1' ']' -- start of left-to-right text
4221 o CSI '2' ']' -- start of right-to-left text
4222 The control character CSI (0x9B: control sequence introducer) is
4223 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4225 Character composition specification takes the following form:
4226 o ESC '0' -- start character composition
4227 o ESC '1' -- end character composition
4228 Since these are not standard escape sequences of any ISO standard,
4229 their use with these meanings is restricted to Emacs only. */
4232 reset_iso2022(Lisp_Object coding_system, struct iso2022_decoder *iso)
4236 for (i = 0; i < 4; i++) {
4237 if (!NILP(coding_system))
4239 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET
4242 iso->charset[i] = Qt;
4243 iso->invalid_designated[i] = 0;
4245 iso->esc = ISO_ESC_NOTHING;
4246 iso->esc_bytes_index = 0;
4247 iso->register_left = 0;
4248 iso->register_right = 1;
4249 iso->switched_dir_and_no_valid_charset_yet = 0;
4250 iso->invalid_switch_dir = 0;
4251 iso->output_direction_sequence = 0;
4252 iso->output_literally = 0;
4253 #ifdef ENABLE_COMPOSITE_CHARS
4254 if (iso->composite_chars)
4255 Dynarr_reset(iso->composite_chars);
4259 static int fit_to_be_escape_quoted(unsigned char c)
4275 /* Parse one byte of an ISO2022 escape sequence.
4276 If the result is an invalid escape sequence, return 0 and
4277 do not change anything in STR. Otherwise, if the result is
4278 an incomplete escape sequence, update ISO2022.ESC and
4279 ISO2022.ESC_BYTES and return -1. Otherwise, update
4280 all the state variables (but not ISO2022.ESC_BYTES) and
4283 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4284 or invocation of an invalid character set and treat that as
4285 an unrecognized escape sequence.
4287 ********************************************************************
4289 #### Strategies for error annotation and coding orthogonalization
4291 We really want to separate out a number of things. Conceptually,
4292 there is a nested syntax.
4294 At the top level is the ISO 2022 extension syntax, including charset
4295 designation and invocation, and certain auxiliary controls such as the
4296 ISO 6429 direction specification. These are octet-oriented, with the
4297 single exception (AFAIK) of the "exit Unicode" sequence which uses the
4298 UTF's natural width (1 byte for UTF-7 and UTF-8, 2 bytes for UCS-2 and
4299 UTF-16, and 4 bytes for UCS-4 and UTF-32). This will be treated as a
4300 (deprecated) special case in Unicode processing.
4302 The middle layer is ISO 2022 character interpretation. This will depend
4303 on the current state of the ISO 2022 registers, and assembles octets
4304 into the character's internal representation.
4306 The lowest level is translating system control conventions. At present
4307 this is restricted to newline translation, but one could imagine doing
4308 tab conversion or line wrapping here. "Escape from Unicode" processing
4309 would be done at this level.
4311 At each level the parser will verify the syntax. In the case of a
4312 syntax error or warning (such as a redundant escape sequence that affects
4313 no characters), the parser will take some action, typically inserting the
4314 erroneous octets directly into the output and creating an annotation
4315 which can be used by higher level I/O to mark the affected region.
4317 This should make it possible to do something sensible about separating
4318 newline convention processing from character construction, and about
4319 preventing ISO 2022 escape sequences from being recognized
4322 The basic strategy will be to have octet classification tables, and
4323 switch processing according to the table entry.
4325 It's possible that, by doing the processing with tables of functions or
4326 the like, the parser can be used for both detection and translation. */
4329 parse_iso2022_esc(Lisp_Object codesys, struct iso2022_decoder *iso,
4330 unsigned char c, unsigned int *flags,
4331 int check_invalid_charsets)
4333 /* (1) If we're at the end of a designation sequence, CS is the
4334 charset being designated and REG is the register to designate
4337 (2) If we're at the end of a locking-shift sequence, REG is
4338 the register to invoke and HALF (0 == left, 1 == right) is
4339 the half to invoke it into.
4341 (3) If we're at the end of a single-shift sequence, REG is
4342 the register to invoke. */
4343 Lisp_Object cs = Qnil;
4346 /* NOTE: This code does goto's all over the fucking place.
4347 The reason for this is that we're basically implementing
4348 a state machine here, and hierarchical languages like C
4349 don't really provide a clean way of doing this. */
4351 if (!(*flags & CODING_STATE_ESCAPE))
4352 /* At beginning of escape sequence; we need to reset our
4353 escape-state variables. */
4354 iso->esc = ISO_ESC_NOTHING;
4356 iso->output_literally = 0;
4357 iso->output_direction_sequence = 0;
4360 case ISO_ESC_NOTHING:
4361 iso->esc_bytes_index = 0;
4363 case ISO_CODE_ESC: /* Start escape sequence */
4364 *flags |= CODING_STATE_ESCAPE;
4368 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4369 *flags |= CODING_STATE_ESCAPE;
4370 iso->esc = ISO_ESC_5_11;
4373 case ISO_CODE_SO: /* locking shift 1 */
4377 case ISO_CODE_SI: /* locking shift 0 */
4382 case ISO_CODE_SS2: /* single shift */
4385 case ISO_CODE_SS3: /* single shift */
4389 default: /* Other control characters */
4395 /**** single shift ****/
4397 case 'N': /* single shift 2 */
4400 case 'O': /* single shift 3 */
4404 /**** locking shift ****/
4406 case '~': /* locking shift 1 right */
4410 case 'n': /* locking shift 2 */
4414 case '}': /* locking shift 2 right */
4418 case 'o': /* locking shift 3 */
4422 case '|': /* locking shift 3 right */
4427 #ifdef ENABLE_COMPOSITE_CHARS
4428 /**** composite ****/
4431 iso->esc = ISO_ESC_START_COMPOSITE;
4432 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4433 CODING_STATE_COMPOSITE;
4437 iso->esc = ISO_ESC_END_COMPOSITE;
4438 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4439 ~CODING_STATE_COMPOSITE;
4441 #endif /* ENABLE_COMPOSITE_CHARS */
4443 /**** directionality ****/
4446 iso->esc = ISO_ESC_5_11;
4449 /**** designation ****/
4451 case '$': /* multibyte charset prefix */
4452 iso->esc = ISO_ESC_2_4;
4456 if (0x28 <= c && c <= 0x2F) {
4458 (enum iso_esc_flag)(c - 0x28 + ISO_ESC_2_8);
4462 /* This function is called with CODESYS equal to nil when
4463 doing coding-system detection. */
4465 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
4466 && fit_to_be_escape_quoted(c)) {
4467 iso->esc = ISO_ESC_LITERAL;
4468 *flags &= CODING_STATE_ISO2022_LOCK;
4476 /**** directionality ****/
4478 case ISO_ESC_5_11: /* ISO6429 direction control */
4481 (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4482 goto directionality;
4485 iso->esc = ISO_ESC_5_11_0;
4487 iso->esc = ISO_ESC_5_11_1;
4489 iso->esc = ISO_ESC_5_11_2;
4494 case ISO_ESC_5_11_0:
4497 (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4498 goto directionality;
4502 case ISO_ESC_5_11_1:
4505 (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4506 goto directionality;
4510 case ISO_ESC_5_11_2:
4513 (*flags & CODING_STATE_ISO2022_LOCK) |
4515 goto directionality;
4520 iso->esc = ISO_ESC_DIRECTIONALITY;
4521 /* Various junk here to attempt to preserve the direction
4522 sequences literally in the text if they would otherwise be
4523 swallowed due to invalid designations that don't show up as
4524 actual charset changes in the text. */
4525 if (iso->invalid_switch_dir) {
4526 /* We already inserted a direction switch literally into
4527 the text. We assume (#### this may not be right)
4528 that the next direction switch is the one going the
4529 other way, and we need to output that literally as
4531 iso->output_literally = 1;
4532 iso->invalid_switch_dir = 0;
4536 /* If we are in the thrall of an invalid designation,
4537 then stick the directionality sequence literally into
4538 the output stream so it ends up in the original text
4540 for (jj = 0; jj < 4; jj++)
4541 if (iso->invalid_designated[jj])
4544 iso->output_literally = 1;
4545 iso->invalid_switch_dir = 1;
4547 /* Indicate that we haven't yet seen a valid
4548 designation, so that if a switch-dir is
4549 directly followed by an invalid designation,
4550 both get inserted literally. */
4551 iso->switched_dir_and_no_valid_charset_yet = 1;
4555 /**** designation ****/
4558 if (0x28 <= c && c <= 0x2F) {
4560 (enum iso_esc_flag)(c - 0x28 + ISO_ESC_2_4_8);
4563 if (0x40 <= c && c <= 0x42) {
4564 cs = CHARSET_BY_ATTRIBUTES(CHARSET_TYPE_94X94, c,
4565 *flags & CODING_STATE_R2L ?
4566 CHARSET_RIGHT_TO_LEFT :
4567 CHARSET_LEFT_TO_RIGHT);
4584 case ISO_ESC_2_4_10:
4585 case ISO_ESC_2_4_11:
4586 case ISO_ESC_2_4_12:
4587 case ISO_ESC_2_4_13:
4588 case ISO_ESC_2_4_14:
4589 case ISO_ESC_2_4_15:
4590 case ISO_ESC_SINGLE_SHIFT:
4591 case ISO_ESC_LOCKING_SHIFT:
4592 case ISO_ESC_DESIGNATE:
4593 case ISO_ESC_DIRECTIONALITY:
4594 case ISO_ESC_LITERAL:
4599 if (c < '0' || c > '~')
4600 return 0; /* bad final byte */
4602 if (iso->esc >= ISO_ESC_2_8 && iso->esc <= ISO_ESC_2_15) {
4603 type = ((iso->esc >= ISO_ESC_2_12) ?
4604 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4605 reg = (iso->esc - ISO_ESC_2_8) & 3;
4606 } else if (iso->esc >= ISO_ESC_2_4_8 &&
4607 iso->esc <= ISO_ESC_2_4_15) {
4608 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4609 CHARSET_TYPE_96X96 :
4610 CHARSET_TYPE_94X94);
4611 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4613 /* Can this ever be reached? -slb */
4618 cs = CHARSET_BY_ATTRIBUTES(type, c,
4619 *flags & CODING_STATE_R2L ?
4620 CHARSET_RIGHT_TO_LEFT :
4621 CHARSET_LEFT_TO_RIGHT);
4627 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char)c;
4631 if (check_invalid_charsets && !CHARSETP(iso->charset[reg]))
4632 /* can't invoke something that ain't there. */
4634 iso->esc = ISO_ESC_SINGLE_SHIFT;
4635 *flags &= CODING_STATE_ISO2022_LOCK;
4637 *flags |= CODING_STATE_SS2;
4639 *flags |= CODING_STATE_SS3;
4643 if (check_invalid_charsets && !CHARSETP(iso->charset[reg]))
4644 /* can't invoke something that ain't there. */
4647 iso->register_right = reg;
4649 iso->register_left = reg;
4650 *flags &= CODING_STATE_ISO2022_LOCK;
4651 iso->esc = ISO_ESC_LOCKING_SHIFT;
4655 if (NILP(cs) && check_invalid_charsets) {
4656 iso->invalid_designated[reg] = 1;
4657 iso->charset[reg] = Vcharset_ascii;
4658 iso->esc = ISO_ESC_DESIGNATE;
4659 *flags &= CODING_STATE_ISO2022_LOCK;
4660 iso->output_literally = 1;
4661 if (iso->switched_dir_and_no_valid_charset_yet) {
4662 /* We encountered a switch-direction followed by an
4663 invalid designation. Ensure that the switch-direction
4664 gets outputted; otherwise it will probably get eaten
4665 when the text is written out again. */
4666 iso->switched_dir_and_no_valid_charset_yet = 0;
4667 iso->output_direction_sequence = 1;
4668 /* And make sure that the switch-dir going the other
4669 way gets outputted, as well. */
4670 iso->invalid_switch_dir = 1;
4674 /* This function is called with CODESYS equal to nil when
4675 doing coding-system detection. */
4676 if (!NILP(codesys)) {
4677 charset_conversion_spec_dynarr *dyn =
4678 XCODING_SYSTEM(codesys)->iso2022.input_conv;
4683 for (i = 0; i < Dynarr_length(dyn); i++) {
4684 struct charset_conversion_spec *spec =
4686 if (EQ(cs, spec->from_charset))
4687 cs = spec->to_charset;
4692 iso->charset[reg] = cs;
4693 iso->esc = ISO_ESC_DESIGNATE;
4694 *flags &= CODING_STATE_ISO2022_LOCK;
4695 if (iso->invalid_designated[reg]) {
4696 iso->invalid_designated[reg] = 0;
4697 iso->output_literally = 1;
4699 if (iso->switched_dir_and_no_valid_charset_yet)
4700 iso->switched_dir_and_no_valid_charset_yet = 0;
4705 detect_coding_iso2022(struct detection_state *st, const Extbyte * src,
4706 Lstream_data_count n)
4710 /* #### There are serious deficiencies in the recognition mechanism
4711 here. This needs to be much smarter if it's going to cut it.
4712 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4713 it should be detected as Latin-1.
4714 All the ISO2022 stuff in this file should be synced up with the
4715 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4716 Perhaps we should wait till R2L works in FSF Emacs? */
4718 if (!st->iso2022.initted) {
4719 reset_iso2022(Qnil, &st->iso2022.iso);
4720 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4721 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4722 CODING_CATEGORY_ISO_8_1_MASK |
4723 CODING_CATEGORY_ISO_8_2_MASK |
4724 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4725 st->iso2022.flags = 0;
4726 st->iso2022.high_byte_count = 0;
4727 st->iso2022.saw_single_shift = 0;
4728 st->iso2022.initted = 1;
4731 mask = st->iso2022.mask;
4734 const unsigned char c = *(const unsigned char *)src++;
4736 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4737 st->iso2022.high_byte_count++;
4739 if (st->iso2022.high_byte_count
4740 && !st->iso2022.saw_single_shift) {
4741 if (st->iso2022.high_byte_count & 1)
4742 /* odd number of high bytes; assume not iso-8-2 */
4743 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4745 st->iso2022.high_byte_count = 0;
4746 st->iso2022.saw_single_shift = 0;
4748 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4750 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4751 && (BYTE_C0_P(c) || BYTE_C1_P(c))) { /* control chars */
4753 /* Allow and ignore control characters that you might
4754 reasonably see in a text file */
4759 case 8: /* backspace */
4760 case 11: /* vertical tab */
4761 case 12: /* form feed */
4762 case 26: /* MS-DOS C-z junk */
4763 case 31: /* '^_' -- for info */
4764 goto label_continue_loop;
4771 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P(c)
4773 if (parse_iso2022_esc(Qnil, &st->iso2022.iso, c,
4774 &st->iso2022.flags, 0)) {
4775 switch (st->iso2022.iso.esc) {
4776 case ISO_ESC_DESIGNATE:
4777 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4778 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4780 case ISO_ESC_LOCKING_SHIFT:
4781 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4782 goto ran_out_of_chars;
4783 case ISO_ESC_SINGLE_SHIFT:
4784 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4785 st->iso2022.saw_single_shift = 1;
4789 case ISO_ESC_NOTHING:
4802 case ISO_ESC_2_4_10:
4803 case ISO_ESC_2_4_11:
4804 case ISO_ESC_2_4_12:
4805 case ISO_ESC_2_4_13:
4806 case ISO_ESC_2_4_14:
4807 case ISO_ESC_2_4_15:
4809 case ISO_ESC_5_11_0:
4810 case ISO_ESC_5_11_1:
4811 case ISO_ESC_5_11_2:
4812 case ISO_ESC_DIRECTIONALITY:
4813 case ISO_ESC_LITERAL:
4819 goto ran_out_of_chars;
4822 label_continue_loop:;
4829 static int postprocess_iso2022_mask(int mask)
4831 /* #### kind of cheesy */
4832 /* If seven-bit ISO is allowed, then assume that the encoding is
4833 entirely seven-bit and turn off the eight-bit ones. */
4834 if (mask & CODING_CATEGORY_ISO_7_MASK)
4835 mask &= ~(CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4836 CODING_CATEGORY_ISO_8_1_MASK |
4837 CODING_CATEGORY_ISO_8_2_MASK);
4841 /* If FLAGS is a null pointer or specifies right-to-left motion,
4842 output a switch-dir-to-left-to-right sequence to DST.
4843 Also update FLAGS if it is not a null pointer.
4844 If INTERNAL_P is set, we are outputting in internal format and
4845 need to handle the CSI differently. */
4848 restore_left_to_right_direction(Lisp_Coding_System * codesys,
4849 unsigned_char_dynarr * dst,
4850 unsigned int *flags, int internal_p)
4852 if (!flags || (*flags & CODING_STATE_R2L)) {
4853 if (CODING_SYSTEM_ISO2022_SEVEN(codesys)) {
4854 Dynarr_add(dst, ISO_CODE_ESC);
4855 Dynarr_add(dst, '[');
4856 } else if (internal_p)
4857 DECODE_ADD_BINARY_CHAR(ISO_CODE_CSI, dst);
4859 Dynarr_add(dst, ISO_CODE_CSI);
4860 Dynarr_add(dst, '0');
4861 Dynarr_add(dst, ']');
4863 *flags &= ~CODING_STATE_R2L;
4867 /* If FLAGS is a null pointer or specifies a direction different from
4868 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4869 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4870 sequence to DST. Also update FLAGS if it is not a null pointer.
4871 If INTERNAL_P is set, we are outputting in internal format and
4872 need to handle the CSI differently. */
4875 ensure_correct_direction(int direction, Lisp_Coding_System * codesys,
4876 unsigned_char_dynarr * dst, unsigned int *flags,
4879 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4880 direction == CHARSET_LEFT_TO_RIGHT)
4881 restore_left_to_right_direction(codesys, dst, flags,
4883 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429(codesys)
4884 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4885 direction == CHARSET_RIGHT_TO_LEFT) {
4886 if (CODING_SYSTEM_ISO2022_SEVEN(codesys)) {
4887 Dynarr_add(dst, ISO_CODE_ESC);
4888 Dynarr_add(dst, '[');
4889 } else if (internal_p)
4890 DECODE_ADD_BINARY_CHAR(ISO_CODE_CSI, dst);
4892 Dynarr_add(dst, ISO_CODE_CSI);
4893 Dynarr_add(dst, '2');
4894 Dynarr_add(dst, ']');
4896 *flags |= CODING_STATE_R2L;
4900 /* Convert ISO2022-format data to internal format. */
4903 decode_coding_iso2022(lstream_t decoding, const Extbyte * src,
4904 unsigned_char_dynarr * dst, Lstream_data_count n)
4906 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
4907 unsigned int flags = str->flags;
4908 unsigned int ch = str->ch;
4909 eol_type_t eol_type = str->eol_type;
4910 #ifdef ENABLE_COMPOSITE_CHARS
4911 unsigned_char_dynarr *real_dst = dst;
4913 Lisp_Object coding_system;
4915 XSETCODING_SYSTEM(coding_system, str->codesys);
4917 #ifdef ENABLE_COMPOSITE_CHARS
4918 if (flags & CODING_STATE_COMPOSITE)
4919 dst = str->iso2022.composite_chars;
4920 #endif /* ENABLE_COMPOSITE_CHARS */
4923 const unsigned char c = *(const unsigned char *)src++;
4924 if (flags & CODING_STATE_ESCAPE) {
4925 /* Within ESC sequence */
4926 int retval = parse_iso2022_esc(
4927 coding_system, &str->iso2022, c, &flags, 1);
4930 switch (str->iso2022.esc) {
4931 #ifdef ENABLE_COMPOSITE_CHARS
4932 case ISO_ESC_START_COMPOSITE:
4933 if (str->iso2022.composite_chars)
4934 Dynarr_reset(str->iso2022.
4937 str->iso2022.composite_chars =
4938 Dynarr_new(unsigned_char);
4939 dst = str->iso2022.composite_chars;
4941 case ISO_ESC_END_COMPOSITE:
4943 Bufbyte comstr[MAX_EMCHAR_LEN];
4946 lookup_composite_char
4947 (Dynarr_atp(dst, 0),
4948 Dynarr_length(dst));
4951 set_charptr_emchar(comstr,
4953 Dynarr_add_many(dst, comstr,
4957 #endif /* ENABLE_COMPOSITE_CHARS */
4959 case ISO_ESC_LITERAL:
4960 DECODE_ADD_BINARY_CHAR(c, dst);
4963 case ISO_ESC_NOTHING:
4976 case ISO_ESC_2_4_10:
4977 case ISO_ESC_2_4_11:
4978 case ISO_ESC_2_4_12:
4979 case ISO_ESC_2_4_13:
4980 case ISO_ESC_2_4_14:
4981 case ISO_ESC_2_4_15:
4983 case ISO_ESC_5_11_0:
4984 case ISO_ESC_5_11_1:
4985 case ISO_ESC_5_11_2:
4986 case ISO_ESC_SINGLE_SHIFT:
4987 case ISO_ESC_LOCKING_SHIFT:
4988 case ISO_ESC_DESIGNATE:
4989 case ISO_ESC_DIRECTIONALITY:
4992 /* Everything else handled already */
4997 /* Attempted error recovery. */
4998 if (str->iso2022.output_direction_sequence)
4999 ensure_correct_direction(flags &
5001 CHARSET_RIGHT_TO_LEFT :
5002 CHARSET_LEFT_TO_RIGHT,
5003 str->codesys, dst, 0,
5005 /* More error recovery. */
5006 if (!retval || str->iso2022.output_literally) {
5007 /* Output the (possibly invalid) sequence */
5009 for (i = 0; i < str->iso2022.esc_bytes_index;
5011 DECODE_ADD_BINARY_CHAR(str->iso2022.
5014 flags &= CODING_STATE_ISO2022_LOCK;
5016 n++, src--; /* Repeat the loop with the same character. */
5018 /* No sense in reprocessing the final byte of the
5019 escape sequence; it could mess things up anyway.
5021 DECODE_ADD_BINARY_CHAR(c, dst);
5025 } else if (BYTE_C0_P(c) || BYTE_C1_P(c)) { /* Control characters */
5027 /***** Error-handling *****/
5029 /* If we were in the middle of a character, dump out the
5030 partial character. */
5031 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5033 /* If we just saw a single-shift character, dump it out.
5034 This may dump out the wrong sort of single-shift character,
5035 but least it will give an indication that something went
5037 if (flags & CODING_STATE_SS2) {
5038 DECODE_ADD_BINARY_CHAR(ISO_CODE_SS2, dst);
5039 flags &= ~CODING_STATE_SS2;
5041 if (flags & CODING_STATE_SS3) {
5042 DECODE_ADD_BINARY_CHAR(ISO_CODE_SS3, dst);
5043 flags &= ~CODING_STATE_SS3;
5046 /***** Now handle the control characters. *****/
5049 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5051 flags &= CODING_STATE_ISO2022_LOCK;
5053 if (!parse_iso2022_esc
5054 (coding_system, &str->iso2022, c, &flags, 1))
5055 DECODE_ADD_BINARY_CHAR(c, dst);
5056 } else { /* Graphic characters */
5057 Lisp_Object charset;
5061 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5063 /* Now determine the charset. */
5064 reg = ((flags & CODING_STATE_SS2) ? 2
5065 : (flags & CODING_STATE_SS3) ? 3
5066 : !BYTE_ASCII_P(c) ? str->iso2022.register_right
5067 : str->iso2022.register_left);
5068 charset = str->iso2022.charset[reg];
5070 /* Error checking: */
5071 if (!CHARSETP(charset)
5072 || str->iso2022.invalid_designated[reg]
5074 (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5075 && XCHARSET_CHARS(charset) == 94))
5076 /* Mrmph. We are trying to invoke a register that has no
5077 or an invalid charset in it, or trying to add a character
5078 outside the range of the charset. Insert that char literally
5079 to preserve it for the output. */
5081 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5082 DECODE_ADD_BINARY_CHAR(c, dst);
5086 /* Things are probably hunky-dorey. */
5088 /* Fetch reverse charset, maybe. */
5089 if (((flags & CODING_STATE_R2L) &&
5090 XCHARSET_DIRECTION(charset) ==
5091 CHARSET_LEFT_TO_RIGHT)
5092 || (!(flags & CODING_STATE_R2L)
5093 && XCHARSET_DIRECTION(charset) ==
5094 CHARSET_RIGHT_TO_LEFT)) {
5095 Lisp_Object new_charset =
5096 XCHARSET_REVERSE_DIRECTION_CHARSET
5098 if (!NILP(new_charset))
5099 charset = new_charset;
5102 lb = XCHARSET_LEADING_BYTE(charset);
5103 switch (XCHARSET_REP_BYTES(charset)) {
5105 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5106 Dynarr_add(dst, c & 0x7F);
5109 case 2: /* one-byte official */
5110 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5111 Dynarr_add(dst, lb);
5112 Dynarr_add(dst, c | 0x80);
5115 case 3: /* one-byte private or two-byte official */
5116 if (XCHARSET_PRIVATE_P(charset)) {
5117 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5119 PRE_LEADING_BYTE_PRIVATE_1);
5120 Dynarr_add(dst, lb);
5121 Dynarr_add(dst, c | 0x80);
5124 Dynarr_add(dst, lb);
5135 default: /* two-byte private */
5138 PRE_LEADING_BYTE_PRIVATE_2);
5139 Dynarr_add(dst, lb);
5140 Dynarr_add(dst, ch | 0x80);
5141 Dynarr_add(dst, c | 0x80);
5149 flags &= CODING_STATE_ISO2022_LOCK;
5152 label_continue_loop:;
5155 if (flags & CODING_STATE_END)
5156 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5162 /***** ISO2022 encoder *****/
5164 /* Designate CHARSET into register REG. */
5167 iso2022_designate(Lisp_Object charset, unsigned char reg,
5168 encoding_stream_t str, unsigned_char_dynarr * dst)
5170 static const char inter94[] = "()*+";
5171 static const char inter96[] = ",-./";
5173 unsigned char final;
5174 Lisp_Object old_charset = str->iso2022.charset[reg];
5176 str->iso2022.charset[reg] = charset;
5177 if (!CHARSETP(charset))
5178 /* charset might be an initial nil or t. */
5180 type = XCHARSET_TYPE(charset);
5181 final = XCHARSET_FINAL(charset);
5182 if (!str->iso2022.force_charset_on_output[reg] &&
5183 CHARSETP(old_charset) &&
5184 XCHARSET_TYPE(old_charset) == type &&
5185 XCHARSET_FINAL(old_charset) == final)
5188 str->iso2022.force_charset_on_output[reg] = 0;
5191 charset_conversion_spec_dynarr *dyn =
5192 str->codesys->iso2022.output_conv;
5197 for (i = 0; i < Dynarr_length(dyn); i++) {
5198 struct charset_conversion_spec *spec =
5200 if (EQ(charset, spec->from_charset))
5201 charset = spec->to_charset;
5206 Dynarr_add(dst, ISO_CODE_ESC);
5208 case CHARSET_TYPE_94:
5209 Dynarr_add(dst, inter94[reg]);
5211 case CHARSET_TYPE_96:
5212 Dynarr_add(dst, inter96[reg]);
5214 case CHARSET_TYPE_94X94:
5215 Dynarr_add(dst, '$');
5216 if (reg != 0 || !(CODING_SYSTEM_ISO2022_SHORT(str->codesys))
5217 || final < '@' || final > 'B')
5218 Dynarr_add(dst, inter94[reg]);
5220 case CHARSET_TYPE_96X96:
5221 Dynarr_add(dst, '$');
5222 Dynarr_add(dst, inter96[reg]);
5227 Dynarr_add(dst, final);
5231 ensure_normal_shift(encoding_stream_t str, unsigned_char_dynarr * dst)
5233 if (str->iso2022.register_left != 0) {
5234 Dynarr_add(dst, ISO_CODE_SI);
5235 str->iso2022.register_left = 0;
5240 ensure_shift_out(encoding_stream_t str, unsigned_char_dynarr * dst)
5242 if (str->iso2022.register_left != 1) {
5243 Dynarr_add(dst, ISO_CODE_SO);
5244 str->iso2022.register_left = 1;
5248 /* Convert internally-formatted data to ISO2022 format. */
5251 encode_coding_iso2022(lstream_t encoding, const Bufbyte * src,
5252 unsigned_char_dynarr * dst, Lstream_data_count n)
5254 unsigned char charmask, c;
5255 unsigned char char_boundary;
5256 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
5257 unsigned int flags = str->flags;
5258 unsigned int ch = str->ch;
5259 Lisp_Coding_System *codesys = str->codesys;
5260 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
5262 Lisp_Object charset;
5265 #ifdef ENABLE_COMPOSITE_CHARS
5266 /* flags for handling composite chars. We do a little switcharoo
5267 on the source while we're outputting the composite char. */
5268 unsigned int saved_n = 0;
5269 const unsigned char *saved_src = NULL;
5270 int in_composite = 0;
5271 #endif /* ENABLE_COMPOSITE_CHARS */
5273 char_boundary = str->iso2022.current_char_boundary;
5274 charset = str->iso2022.current_charset;
5275 half = str->iso2022.current_half;
5277 #ifdef ENABLE_COMPOSITE_CHARS
5283 if (BYTE_ASCII_P(c)) { /* Processing ASCII character */
5286 restore_left_to_right_direction(codesys, dst, &flags,
5289 /* Make sure G0 contains ASCII */
5290 if ((c > ' ' && c < ISO_CODE_DEL) ||
5291 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL(codesys)) {
5292 ensure_normal_shift(str, dst);
5293 iso2022_designate(Vcharset_ascii, 0, str, dst);
5296 /* If necessary, restore everything to the default state
5299 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL(codesys))) {
5300 restore_left_to_right_direction(codesys, dst,
5303 ensure_normal_shift(str, dst);
5305 for (i = 0; i < 4; i++) {
5306 Lisp_Object initial_charset =
5307 CODING_SYSTEM_ISO2022_INITIAL_CHARSET
5309 iso2022_designate(initial_charset, i,
5314 if (eol_type != EOL_LF
5315 && eol_type != EOL_AUTODETECT)
5316 Dynarr_add(dst, '\r');
5317 if (eol_type != EOL_CR)
5320 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
5321 && fit_to_be_escape_quoted(c))
5322 Dynarr_add(dst, ISO_CODE_ESC);
5328 else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) { /* Processing Leading Byte */
5330 charset = CHARSET_BY_LEADING_BYTE(c);
5331 if (LEADING_BYTE_PREFIX_P(c))
5333 else if (!EQ(charset, Vcharset_control_1)
5334 #ifdef ENABLE_COMPOSITE_CHARS
5335 && !EQ(charset, Vcharset_composite)
5340 ensure_correct_direction(XCHARSET_DIRECTION
5344 /* Now determine which register to use. */
5346 for (i = 0; i < 4; i++) {
5347 if (EQ(charset, str->iso2022.charset[i])
5349 CODING_SYSTEM_ISO2022_INITIAL_CHARSET
5357 if (XCHARSET_GRAPHIC(charset) != 0) {
5359 (str->iso2022.charset[1])
5361 (!CODING_SYSTEM_ISO2022_SEVEN
5364 CODING_SYSTEM_ISO2022_LOCK_SHIFT
5381 iso2022_designate(charset, reg, str, dst);
5383 /* Now invoke that register. */
5386 ensure_normal_shift(str, dst);
5391 if (CODING_SYSTEM_ISO2022_SEVEN
5393 ensure_shift_out(str, dst);
5400 if (CODING_SYSTEM_ISO2022_SEVEN
5402 Dynarr_add(dst, ISO_CODE_ESC);
5403 Dynarr_add(dst, 'N');
5406 Dynarr_add(dst, ISO_CODE_SS2);
5412 if (CODING_SYSTEM_ISO2022_SEVEN
5414 Dynarr_add(dst, ISO_CODE_ESC);
5415 Dynarr_add(dst, 'O');
5418 Dynarr_add(dst, ISO_CODE_SS3);
5428 } else { /* Processing Non-ASCII character */
5429 charmask = (half == 0 ? 0x7F : 0xFF);
5431 if (EQ(charset, Vcharset_control_1)) {
5432 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
5433 && fit_to_be_escape_quoted(c))
5434 Dynarr_add(dst, ISO_CODE_ESC);
5435 /* you asked for it ... */
5436 Dynarr_add(dst, c - 0x20);
5438 switch (XCHARSET_REP_BYTES(charset)) {
5440 Dynarr_add(dst, c & charmask);
5443 if (XCHARSET_PRIVATE_P(charset)) {
5444 Dynarr_add(dst, c & charmask);
5447 #ifdef ENABLE_COMPOSITE_CHARS
5450 Vcharset_composite)) {
5452 /* #### Bother! We don't know how to
5459 (Vcharset_composite,
5464 composite_char_string
5473 n = XSTRING_LENGTH(lstr);
5476 Dynarr_add(dst, '0'); /* start composing */
5479 #endif /* ENABLE_COMPOSITE_CHARS */
5496 Dynarr_add(dst, ch & charmask);
5497 Dynarr_add(dst, c & charmask);
5511 #ifdef ENABLE_COMPOSITE_CHARS
5516 Dynarr_add(dst, ISO_CODE_ESC);
5517 Dynarr_add(dst, '1'); /* end composing */
5518 goto back_to_square_n; /* Wheeeeeeeee ..... */
5520 #endif /* ENABLE_COMPOSITE_CHARS */
5522 if (char_boundary && flags & CODING_STATE_END) {
5523 restore_left_to_right_direction(codesys, dst, &flags, 0);
5524 ensure_normal_shift(str, dst);
5525 for (i = 0; i < 4; i++) {
5526 Lisp_Object initial_charset =
5527 CODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, i);
5528 iso2022_designate(initial_charset, i, str, dst);
5534 str->iso2022.current_char_boundary = char_boundary;
5535 str->iso2022.current_charset = charset;
5536 str->iso2022.current_half = half;
5538 /* Verbum caro factum est! */
5542 /************************************************************************/
5543 /* No-conversion methods */
5544 /************************************************************************/
5546 /* This is used when reading in "binary" files -- i.e. files that may
5547 contain all 256 possible byte values and that are not to be
5548 interpreted as being in any particular decoding. */
5550 decode_coding_no_conversion(lstream_t decoding, const Extbyte * src,
5551 unsigned_char_dynarr * dst, Lstream_data_count n)
5553 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
5554 unsigned int flags = str->flags;
5555 unsigned int ch = str->ch;
5556 eol_type_t eol_type = str->eol_type;
5559 const unsigned char c = *(const unsigned char *)src++;
5561 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5562 DECODE_ADD_BINARY_CHAR(c, dst);
5563 label_continue_loop:;
5566 DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
5573 encode_coding_no_conversion(lstream_t encoding, const Bufbyte * src,
5574 unsigned_char_dynarr * dst, Lstream_data_count n)
5577 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
5578 unsigned int flags = str->flags;
5579 unsigned int ch = str->ch;
5580 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
5585 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5586 Dynarr_add(dst, '\r');
5587 if (eol_type != EOL_CR)
5588 Dynarr_add(dst, '\n');
5590 } else if (BYTE_ASCII_P(c)) {
5593 } else if (BUFBYTE_LEADING_BYTE_P(c)) {
5595 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5596 c == LEADING_BYTE_CONTROL_1)
5599 Dynarr_add(dst, '~'); /* untranslatable character */
5601 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5603 else if (ch == LEADING_BYTE_CONTROL_1) {
5605 Dynarr_add(dst, c - 0x20);
5607 /* else it should be the second or third byte of an
5608 untranslatable character, so ignore it */
5617 /************************************************************************/
5618 /* Initialization */
5619 /************************************************************************/
5621 void syms_of_file_coding(void)
5623 INIT_LRECORD_IMPLEMENTATION(coding_system);
5625 DEFERROR_STANDARD(Qcoding_system_error, Qio_error);
5627 DEFSUBR(Fcoding_system_p);
5628 DEFSUBR(Ffind_coding_system);
5629 DEFSUBR(Fget_coding_system);
5630 DEFSUBR(Fcoding_system_list);
5631 DEFSUBR(Fcoding_system_name);
5632 DEFSUBR(Fmake_coding_system);
5633 DEFSUBR(Fcopy_coding_system);
5634 DEFSUBR(Fcoding_system_canonical_name_p);
5635 DEFSUBR(Fcoding_system_alias_p);
5636 DEFSUBR(Fcoding_system_aliasee);
5637 DEFSUBR(Fdefine_coding_system_alias);
5638 DEFSUBR(Fsubsidiary_coding_system);
5640 DEFSUBR(Fcoding_system_type);
5641 DEFSUBR(Fcoding_system_doc_string);
5643 DEFSUBR(Fcoding_system_charset);
5645 DEFSUBR(Fcoding_system_property);
5647 DEFSUBR(Fcoding_category_list);
5648 DEFSUBR(Fset_coding_priority_list);
5649 DEFSUBR(Fcoding_priority_list);
5650 DEFSUBR(Fset_coding_category_system);
5651 DEFSUBR(Fcoding_category_system);
5653 DEFSUBR(Fdetect_coding_region);
5654 DEFSUBR(Fdecode_coding_region);
5655 DEFSUBR(Fencode_coding_region);
5657 DEFSUBR(Fdecode_shift_jis_char);
5658 DEFSUBR(Fencode_shift_jis_char);
5659 DEFSUBR(Fdecode_big5_char);
5660 DEFSUBR(Fencode_big5_char);
5661 DEFSUBR(Fset_ucs_char);
5663 DEFSUBR(Fset_char_ucs);
5666 defsymbol(&Qcoding_systemp, "coding-system-p");
5667 defsymbol(&Qno_conversion, "no-conversion");
5668 defsymbol(&Qraw_text, "raw-text");
5670 defsymbol(&Qbig5, "big5");
5671 defsymbol(&Qshift_jis, "shift-jis");
5672 defsymbol(&Qucs4, "ucs-4");
5673 defsymbol(&Qutf8, "utf-8");
5674 defsymbol(&Qccl, "ccl");
5675 defsymbol(&Qiso2022, "iso2022");
5677 defsymbol(&Qmnemonic, "mnemonic");
5678 defsymbol(&Qeol_type, "eol-type");
5679 defsymbol(&Qpost_read_conversion, "post-read-conversion");
5680 defsymbol(&Qpre_write_conversion, "pre-write-conversion");
5682 defsymbol(&Qcr, "cr");
5683 defsymbol(&Qlf, "lf");
5684 defsymbol(&Qcrlf, "crlf");
5685 defsymbol(&Qeol_cr, "eol-cr");
5686 defsymbol(&Qeol_lf, "eol-lf");
5687 defsymbol(&Qeol_crlf, "eol-crlf");
5689 defsymbol(&Qcharset_g0, "charset-g0");
5690 defsymbol(&Qcharset_g1, "charset-g1");
5691 defsymbol(&Qcharset_g2, "charset-g2");
5692 defsymbol(&Qcharset_g3, "charset-g3");
5693 defsymbol(&Qforce_g0_on_output, "force-g0-on-output");
5694 defsymbol(&Qforce_g1_on_output, "force-g1-on-output");
5695 defsymbol(&Qforce_g2_on_output, "force-g2-on-output");
5696 defsymbol(&Qforce_g3_on_output, "force-g3-on-output");
5697 defsymbol(&Qno_iso6429, "no-iso6429");
5698 defsymbol(&Qinput_charset_conversion, "input-charset-conversion");
5699 defsymbol(&Qoutput_charset_conversion, "output-charset-conversion");
5701 defsymbol(&Qshort, "short");
5702 defsymbol(&Qno_ascii_eol, "no-ascii-eol");
5703 defsymbol(&Qno_ascii_cntl, "no-ascii-cntl");
5704 defsymbol(&Qseven, "seven");
5705 defsymbol(&Qlock_shift, "lock-shift");
5706 defsymbol(&Qescape_quoted, "escape-quoted");
5708 defsymbol(&Qencode, "encode");
5709 defsymbol(&Qdecode, "decode");
5712 defsymbol(&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5714 defsymbol(&coding_category_symbol[CODING_CATEGORY_BIG5], "big5");
5715 defsymbol(&coding_category_symbol[CODING_CATEGORY_UCS4], "ucs-4");
5716 defsymbol(&coding_category_symbol[CODING_CATEGORY_UTF8], "utf-8");
5717 defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_7], "iso-7");
5718 defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5720 defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_1], "iso-8-1");
5721 defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_2], "iso-8-2");
5722 defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5725 defsymbol(&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5729 void lstream_type_create_file_coding(void)
5731 LSTREAM_HAS_METHOD(decoding, reader);
5732 LSTREAM_HAS_METHOD(decoding, writer);
5733 LSTREAM_HAS_METHOD(decoding, rewinder);
5734 LSTREAM_HAS_METHOD(decoding, seekable_p);
5735 LSTREAM_HAS_METHOD(decoding, flusher);
5736 LSTREAM_HAS_METHOD(decoding, closer);
5737 LSTREAM_HAS_METHOD(decoding, marker);
5739 LSTREAM_HAS_METHOD(encoding, reader);
5740 LSTREAM_HAS_METHOD(encoding, writer);
5741 LSTREAM_HAS_METHOD(encoding, rewinder);
5742 LSTREAM_HAS_METHOD(encoding, seekable_p);
5743 LSTREAM_HAS_METHOD(encoding, flusher);
5744 LSTREAM_HAS_METHOD(encoding, closer);
5745 LSTREAM_HAS_METHOD(encoding, marker);
5748 void vars_of_file_coding(void)
5752 fcd = xnew(struct file_coding_dump);
5753 dump_add_root_struct_ptr(&fcd, &fcd_description);
5755 /* Initialize to something reasonable ... */
5756 for (i = 0; i < CODING_CATEGORY_LAST; i++) {
5757 fcd->coding_category_system[i] = Qnil;
5758 fcd->coding_category_by_priority[i] = i;
5761 Fprovide(intern("file-coding"));
5763 DEFVAR_LISP("keyboard-coding-system", &Vkeyboard_coding_system /*
5764 Coding system used for TTY keyboard input.
5765 Not used under a windowing system.
5767 Vkeyboard_coding_system = Qnil;
5769 DEFVAR_LISP("terminal-coding-system", &Vterminal_coding_system /*
5770 Coding system used for TTY display output.
5771 Not used under a windowing system.
5773 Vterminal_coding_system = Qnil;
5775 DEFVAR_LISP("coding-system-for-read", &Vcoding_system_for_read /*
5776 Overriding coding system used when reading from a file or process.
5777 You should bind this variable with `let', but do not set it globally.
5778 If this is non-nil, it specifies the coding system that will be used
5779 to decode input on read operations, such as from a file or process.
5780 It overrides `buffer-file-coding-system-for-read',
5781 `insert-file-contents-pre-hook', etc. Use those variables instead of
5782 this one for permanent changes to the environment. */ );
5783 Vcoding_system_for_read = Qnil;
5785 DEFVAR_LISP("coding-system-for-write", &Vcoding_system_for_write /*
5786 Overriding coding system used when writing to a file or process.
5787 You should bind this variable with `let', but do not set it globally.
5788 If this is non-nil, it specifies the coding system that will be used
5789 to encode output for write operations, such as to a file or process.
5790 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
5791 Use those variables instead of this one for permanent changes to the
5793 Vcoding_system_for_write = Qnil;
5795 DEFVAR_LISP("file-name-coding-system", &Vfile_name_coding_system /*
5796 Coding system used to convert pathnames when accessing files.
5798 Vfile_name_coding_system = Qnil;
5800 DEFVAR_BOOL("enable-multibyte-characters", &enable_multibyte_characters /*
5801 Non-nil means the buffer contents are regarded as multi-byte form
5802 of characters, not a binary code. This affects the display, file I/O,
5803 and behaviors of various editing commands.
5805 Setting this to nil does not do anything.
5807 enable_multibyte_characters = 1;
5810 void complex_vars_of_file_coding(void)
5812 staticpro(&Vcoding_system_hash_table);
5813 Vcoding_system_hash_table =
5814 make_lisp_hash_table(50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5816 the_codesys_prop_dynarr = Dynarr_new(codesys_prop);
5817 dump_add_root_struct_ptr(&the_codesys_prop_dynarr,
5818 &codesys_prop_dynarr_description);
5820 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5822 struct codesys_prop csp; \
5824 csp.prop_type = (Prop_Type); \
5825 Dynarr_add (the_codesys_prop_dynarr, csp); \
5828 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qmnemonic);
5829 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_type);
5830 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_cr);
5831 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_crlf);
5832 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_lf);
5833 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5834 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5836 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g0);
5837 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g1);
5838 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g2);
5839 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g3);
5840 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5841 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5842 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5843 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5844 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qshort);
5845 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_ascii_eol);
5846 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5847 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qseven);
5848 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qlock_shift);
5849 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_iso6429);
5850 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qescape_quoted);
5851 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5852 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5854 DEFINE_CODESYS_PROP(CODESYS_PROP_CCL, Qencode);
5855 DEFINE_CODESYS_PROP(CODESYS_PROP_CCL, Qdecode);
5857 /* Need to create this here or we're really screwed. */
5859 (Qraw_text, Qno_conversion,
5861 ("Raw text, which means it converts only line-break-codes."),
5862 list2(Qmnemonic, build_string("Raw")));
5865 (Qbinary, Qno_conversion,
5866 build_string("Binary, which means it does not convert anything."),
5867 list4(Qeol_type, Qlf, Qmnemonic, build_string("Binary")));
5869 Fdefine_coding_system_alias(Qno_conversion, Qraw_text);
5871 Fdefine_coding_system_alias(Qfile_name, Qbinary);
5873 Fdefine_coding_system_alias(Qterminal, Qbinary);
5874 Fdefine_coding_system_alias(Qkeyboard, Qbinary);
5876 /* Need this for bootstrapping */
5877 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5878 Fget_coding_system(Qraw_text);
5884 for (i = 0; i < countof(fcd->ucs_to_mule_table); i++)
5885 fcd->ucs_to_mule_table[i] = Qnil;
5887 staticpro(&mule_to_ucs_table);
5888 mule_to_ucs_table = Fmake_char_table(Qgeneric);