1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of SXEmacs
7 SXEmacs is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
12 SXEmacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
21 /* Synched up with: Mule 2.3. Not in FSF. */
23 /* Rewritten by Ben Wing <ben@xemacs.org>. */
30 #include "ui/insdel.h"
37 #include "file-coding.h"
39 Lisp_Object Qcoding_system_error;
41 Lisp_Object Vkeyboard_coding_system;
42 Lisp_Object Vterminal_coding_system;
43 Lisp_Object Vcoding_system_for_read;
44 Lisp_Object Vcoding_system_for_write;
45 Lisp_Object Vfile_name_coding_system;
47 /* Table of symbols identifying each coding category. */
48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
50 struct file_coding_dump {
51 /* Coding system currently associated with each coding category. */
52 Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
54 /* Table of all coding categories in decreasing order of priority.
55 This describes a permutation of the possible coding categories. */
56 int coding_category_by_priority[CODING_CATEGORY_LAST];
59 Lisp_Object ucs_to_mule_table[65536];
63 static const struct lrecord_description fcd_description_1[] = {
64 {XD_LISP_OBJECT_ARRAY,
65 offsetof(struct file_coding_dump, coding_category_system),
66 CODING_CATEGORY_LAST},
68 {XD_LISP_OBJECT_ARRAY,
69 offsetof(struct file_coding_dump, ucs_to_mule_table),
70 countof(fcd->ucs_to_mule_table)},
75 static const struct struct_description fcd_description = {
76 sizeof(struct file_coding_dump),
80 Lisp_Object mule_to_ucs_table;
82 Lisp_Object Qcoding_systemp;
84 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
85 /* Qinternal in general.c */
87 Lisp_Object Qmnemonic, Qeol_type;
88 Lisp_Object Qcr, Qcrlf, Qlf;
89 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
90 Lisp_Object Qpost_read_conversion;
91 Lisp_Object Qpre_write_conversion;
94 Lisp_Object Qucs4, Qutf8;
95 Lisp_Object Qbig5, Qshift_jis;
96 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
97 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
98 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
99 Lisp_Object Qno_iso6429;
100 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
101 Lisp_Object Qescape_quoted;
102 Lisp_Object Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
104 Lisp_Object Qencode, Qdecode;
106 Lisp_Object Vcoding_system_hash_table;
108 int enable_multibyte_characters;
111 /* Additional information used by the ISO2022 decoder and detector. */
112 struct iso2022_decoder {
113 /* CHARSET holds the character sets currently assigned to the G0
114 through G3 variables. It is initialized from the array
115 INITIAL_CHARSET in CODESYS. */
116 Lisp_Object charset[4];
118 /* Which registers are currently invoked into the left (GL) and
119 right (GR) halves of the 8-bit encoding space? */
120 int register_left, register_right;
122 /* ISO_ESC holds a value indicating part of an escape sequence
123 that has already been seen. */
124 enum iso_esc_flag esc;
126 /* This records the bytes we've seen so far in an escape sequence,
127 in case the sequence is invalid (we spit out the bytes unchanged). */
128 unsigned char esc_bytes[8];
130 /* Index for next byte to store in ISO escape sequence. */
133 #ifdef ENABLE_COMPOSITE_CHARS
134 /* Stuff seen so far when composing a string. */
135 unsigned_char_dynarr *composite_chars;
138 /* If we saw an invalid designation sequence for a particular
139 register, we flag it here and switch to ASCII. The next time we
140 see a valid designation for this register, we turn off the flag
141 and do the designation normally, but pretend the sequence was
142 invalid. The effect of all this is that (most of the time) the
143 escape sequences for both the switch to the unknown charset, and
144 the switch back to the known charset, get inserted literally into
145 the buffer and saved out as such. The hope is that we can
146 preserve the escape sequences so that the resulting written out
147 file makes sense. If we don't do any of this, the designation
148 to the invalid charset will be preserved but that switch back
149 to the known charset will probably get eaten because it was
150 the same charset that was already present in the register. */
151 unsigned char invalid_designated[4];
153 /* We try to do similar things as above for direction-switching
154 sequences. If we encountered a direction switch while an
155 invalid designation was present, or an invalid designation
156 just after a direction switch (i.e. no valid designation
157 encountered yet), we insert the direction-switch escape
158 sequence literally into the output stream, and later on
159 insert the corresponding direction-restoring escape sequence
161 unsigned int switched_dir_and_no_valid_charset_yet:1;
162 unsigned int invalid_switch_dir:1;
164 /* Tells the decoder to output the escape sequence literally
165 even though it was valid. Used in the games we play to
166 avoid lossage when we encounter invalid designations. */
167 unsigned int output_literally:1;
168 /* We encountered a direction switch followed by an invalid
169 designation. We didn't output the direction switch
170 literally because we didn't know about the invalid designation;
171 but we have to do so now. */
172 unsigned int output_direction_sequence:1;
175 EXFUN(Fcopy_coding_system, 2);
177 struct detection_state;
178 static int detect_coding_sjis(struct detection_state *st,
179 const Extbyte * src, Lstream_data_count n);
180 static void decode_coding_sjis(lstream_t decoding, const Extbyte * src,
181 unsigned_char_dynarr * dst,
182 Lstream_data_count n);
183 static void encode_coding_sjis(lstream_t encoding, const Bufbyte * src,
184 unsigned_char_dynarr * dst,
185 Lstream_data_count n);
186 static int detect_coding_big5(struct detection_state *st, const Extbyte * src,
187 Lstream_data_count n);
188 static void decode_coding_big5(lstream_t decoding, const Extbyte * src,
189 unsigned_char_dynarr * dst,
190 Lstream_data_count n);
191 static void encode_coding_big5(lstream_t encoding, const Bufbyte * src,
192 unsigned_char_dynarr * dst,
193 Lstream_data_count n);
194 static int detect_coding_ucs4(struct detection_state *st, const Extbyte * src,
195 Lstream_data_count n);
196 static void decode_coding_ucs4(lstream_t decoding, const Extbyte * src,
197 unsigned_char_dynarr * dst,
198 Lstream_data_count n);
199 static void encode_coding_ucs4(lstream_t encoding, const Bufbyte * src,
200 unsigned_char_dynarr * dst,
201 Lstream_data_count n);
202 static int detect_coding_utf8(struct detection_state *st, const Extbyte * src,
203 Lstream_data_count n);
204 static void decode_coding_utf8(lstream_t decoding, const Extbyte * src,
205 unsigned_char_dynarr * dst,
206 Lstream_data_count n);
207 static void encode_coding_utf8(lstream_t encoding, const Bufbyte * src,
208 unsigned_char_dynarr * dst,
209 Lstream_data_count n);
210 static int postprocess_iso2022_mask(int mask);
211 static void reset_iso2022(Lisp_Object coding_system,
212 struct iso2022_decoder *iso);
213 static int detect_coding_iso2022(struct detection_state *st,
214 const Extbyte * src, Lstream_data_count n);
215 static void decode_coding_iso2022(lstream_t decoding, const Extbyte * src,
216 unsigned_char_dynarr * dst,
217 Lstream_data_count n);
218 static void encode_coding_iso2022(lstream_t encoding, const Bufbyte * src,
219 unsigned_char_dynarr * dst,
220 Lstream_data_count n);
222 static void decode_coding_no_conversion(lstream_t decoding, const Extbyte * src,
223 unsigned_char_dynarr * dst,
224 Lstream_data_count n);
225 static void encode_coding_no_conversion(lstream_t encoding, const Bufbyte * src,
226 unsigned_char_dynarr * dst,
227 Lstream_data_count n);
228 static void mule_decode(lstream_t decoding, const Extbyte * src,
229 unsigned_char_dynarr * dst, Lstream_data_count n);
230 static void mule_encode(lstream_t encoding, const Bufbyte * src,
231 unsigned_char_dynarr * dst, Lstream_data_count n);
233 typedef struct codesys_prop codesys_prop;
234 struct codesys_prop {
240 Dynarr_declare(codesys_prop);
241 } codesys_prop_dynarr;
243 static const struct lrecord_description codesys_prop_description_1[] = {
244 {XD_LISP_OBJECT, offsetof(codesys_prop, sym)},
248 static const struct struct_description codesys_prop_description = {
249 sizeof(codesys_prop),
250 codesys_prop_description_1
253 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
254 XD_DYNARR_DESC(codesys_prop_dynarr, &codesys_prop_description),
258 static const struct struct_description codesys_prop_dynarr_description = {
259 sizeof(codesys_prop_dynarr),
260 codesys_prop_dynarr_description_1
263 codesys_prop_dynarr *the_codesys_prop_dynarr;
265 enum codesys_prop_enum {
267 CODESYS_PROP_ISO2022,
271 /************************************************************************/
272 /* Coding system functions */
273 /************************************************************************/
275 static Lisp_Object mark_coding_system(Lisp_Object);
276 static void print_coding_system(Lisp_Object, Lisp_Object, int);
277 static void finalize_coding_system(void *header, int for_disksave);
280 static const struct lrecord_description ccs_description_1[] = {
281 {XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset)},
282 {XD_LISP_OBJECT, offsetof(charset_conversion_spec, to_charset)},
286 static const struct struct_description ccs_description = {
287 sizeof(charset_conversion_spec),
291 static const struct lrecord_description ccsd_description_1[] = {
292 XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
296 static const struct struct_description ccsd_description = {
297 sizeof(charset_conversion_spec_dynarr),
302 static const struct lrecord_description coding_system_description[] = {
303 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, name)},
304 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, doc_string)},
305 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, mnemonic)},
306 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, post_read_conversion)},
307 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, pre_write_conversion)},
308 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_lf)},
309 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_crlf)},
310 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_cr)},
312 {XD_LISP_OBJECT_ARRAY,
313 offsetof(Lisp_Coding_System, iso2022.initial_charset), 4},
314 {XD_STRUCT_PTR, offsetof(Lisp_Coding_System, iso2022.input_conv), 1,
316 {XD_STRUCT_PTR, offsetof(Lisp_Coding_System, iso2022.output_conv), 1,
318 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, ccl.decode)},
319 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, ccl.encode)},
324 DEFINE_LRECORD_IMPLEMENTATION("coding-system", coding_system,
325 mark_coding_system, print_coding_system,
326 finalize_coding_system,
327 0, 0, coding_system_description,
330 static Lisp_Object mark_coding_system(Lisp_Object obj)
332 Lisp_Coding_System *codesys = XCODING_SYSTEM(obj);
334 mark_object(CODING_SYSTEM_NAME(codesys));
335 mark_object(CODING_SYSTEM_DOC_STRING(codesys));
336 mark_object(CODING_SYSTEM_MNEMONIC(codesys));
337 mark_object(CODING_SYSTEM_EOL_LF(codesys));
338 mark_object(CODING_SYSTEM_EOL_CRLF(codesys));
339 mark_object(CODING_SYSTEM_EOL_CR(codesys));
341 switch (CODING_SYSTEM_TYPE(codesys)) {
344 case CODESYS_ISO2022:
345 for (i = 0; i < 4; i++)
346 mark_object(CODING_SYSTEM_ISO2022_INITIAL_CHARSET
348 if (codesys->iso2022.input_conv) {
350 i < Dynarr_length(codesys->iso2022.input_conv);
352 struct charset_conversion_spec *ccs =
353 Dynarr_atp(codesys->iso2022.input_conv, i);
354 mark_object(ccs->from_charset);
355 mark_object(ccs->to_charset);
358 if (codesys->iso2022.output_conv) {
360 i < Dynarr_length(codesys->iso2022.output_conv);
362 struct charset_conversion_spec *ccs =
363 Dynarr_atp(codesys->iso2022.output_conv, i);
364 mark_object(ccs->from_charset);
365 mark_object(ccs->to_charset);
371 mark_object(CODING_SYSTEM_CCL_DECODE(codesys));
372 mark_object(CODING_SYSTEM_CCL_ENCODE(codesys));
375 /* list the rest of them lot explicitly */
376 case CODESYS_AUTODETECT:
377 case CODESYS_SHIFT_JIS:
381 case CODESYS_NO_CONVERSION:
383 case CODESYS_INTERNAL:
390 mark_object(CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys));
391 return CODING_SYSTEM_POST_READ_CONVERSION(codesys);
395 print_coding_system(Lisp_Object obj, Lisp_Object printcharfun, int escapeflag)
397 Lisp_Coding_System *c = XCODING_SYSTEM(obj);
399 error("printing unreadable object #<coding-system 0x%x>",
402 write_c_string("#<coding-system ", printcharfun);
403 print_internal(c->name, printcharfun, 1);
404 write_c_string(">", printcharfun);
407 static void finalize_coding_system(void *header, int for_disksave)
409 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
410 /* Since coding systems never go away, this function is not
411 necessary. But it would be necessary if we changed things
412 so that coding systems could go away. */
413 if (!for_disksave) { /* see comment in lstream.c */
414 switch (CODING_SYSTEM_TYPE(c)) {
416 case CODESYS_ISO2022:
417 if (c->iso2022.input_conv) {
418 Dynarr_free(c->iso2022.input_conv);
419 c->iso2022.input_conv = 0;
421 if (c->iso2022.output_conv) {
422 Dynarr_free(c->iso2022.output_conv);
423 c->iso2022.output_conv = 0;
427 /* list the rest of them lot explicitly */
428 case CODESYS_AUTODETECT:
429 case CODESYS_SHIFT_JIS:
434 case CODESYS_NO_CONVERSION:
436 case CODESYS_INTERNAL:
445 static eol_type_t symbol_to_eol_type(Lisp_Object symbol)
447 CHECK_SYMBOL(symbol);
449 return EOL_AUTODETECT;
452 if (EQ(symbol, Qcrlf))
457 signal_simple_error("Unrecognized eol type", symbol);
458 return EOL_AUTODETECT; /* not reached */
461 static Lisp_Object eol_type_to_symbol(eol_type_t type)
477 static void setup_eol_coding_systems(Lisp_Coding_System * codesys)
479 Lisp_Object codesys_obj;
480 int len = string_length(XSYMBOL(CODING_SYSTEM_NAME(codesys))->name);
481 char *codesys_name = (char *)alloca(len + 7);
483 char *codesys_mnemonic = 0;
485 Lisp_Object codesys_name_sym, sub_codesys_obj;
489 XSETCODING_SYSTEM(codesys_obj, codesys);
492 string_data(XSYMBOL(CODING_SYSTEM_NAME(codesys))->name), len);
494 if (STRINGP(CODING_SYSTEM_MNEMONIC(codesys))) {
495 mlen = XSTRING_LENGTH(CODING_SYSTEM_MNEMONIC(codesys));
496 codesys_mnemonic = (char *)alloca(mlen + 7);
497 memcpy(codesys_mnemonic,
498 XSTRING_DATA(CODING_SYSTEM_MNEMONIC(codesys)), mlen);
500 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
501 strcpy (codesys_name + len, "-" op_sys); \
503 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
504 codesys_name_sym = intern (codesys_name); \
505 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
506 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
508 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
509 build_string (codesys_mnemonic); \
510 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
513 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
514 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
515 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
518 DEFUN("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
519 Return t if OBJECT is a coding system.
520 A coding system is an object that defines how text containing multiple
521 character sets is encoded into a stream of (typically 8-bit) bytes.
522 The coding system is used to decode the stream into a series of
523 characters (which may be from multiple charsets) when the text is read
524 from a file or process, and is used to encode the text back into the
525 same format when it is written out to a file or process.
527 For example, many ISO2022-compliant coding systems (such as Compound
528 Text, which is used for inter-client data under the X Window System)
529 use escape sequences to switch between different charsets -- Japanese
530 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
531 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
532 `make-coding-system' for more information.
534 Coding systems are normally identified using a symbol, and the
535 symbol is accepted in place of the actual coding system object whenever
536 a coding system is called for. (This is similar to how faces work.)
540 return CODING_SYSTEMP(object) ? Qt : Qnil;
543 DEFUN("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
544 Retrieve the coding system of the given name.
546 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
547 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
548 If there is no such coding system, nil is returned. Otherwise the
549 associated coding system object is returned.
551 (coding_system_or_name))
553 if (NILP(coding_system_or_name))
554 coding_system_or_name = Qbinary;
555 else if (CODING_SYSTEMP(coding_system_or_name))
556 return coding_system_or_name;
558 CHECK_SYMBOL(coding_system_or_name);
561 coding_system_or_name =
562 Fgethash(coding_system_or_name, Vcoding_system_hash_table,
565 if (CODING_SYSTEMP(coding_system_or_name)
566 || NILP(coding_system_or_name))
567 return coding_system_or_name;
571 DEFUN("get-coding-system", Fget_coding_system, 1, 1, 0, /*
572 Retrieve the coding system of the given name.
573 Same as `find-coding-system' except that if there is no such
574 coding system, an error is signaled instead of returning nil.
578 Lisp_Object coding_system = Ffind_coding_system(name);
580 if (NILP(coding_system))
581 signal_simple_error("No such coding system", name);
582 return coding_system;
585 /* We store the coding systems in hash tables with the names as the key and the
586 actual coding system object as the value. Occasionally we need to use them
587 in a list format. These routines provide us with that. */
588 struct coding_system_list_closure {
589 Lisp_Object *coding_system_list;
593 add_coding_system_to_list_mapper(Lisp_Object key, Lisp_Object value,
594 void *coding_system_list_closure)
596 /* This function can GC */
597 struct coding_system_list_closure *cscl =
598 (struct coding_system_list_closure *)coding_system_list_closure;
599 Lisp_Object *coding_system_list = cscl->coding_system_list;
601 *coding_system_list = Fcons(key, *coding_system_list);
605 DEFUN("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
606 Return a list of the names of all defined coding systems.
610 Lisp_Object coding_system_list = Qnil;
612 struct coding_system_list_closure coding_system_list_closure;
614 GCPRO1(coding_system_list);
615 coding_system_list_closure.coding_system_list = &coding_system_list;
616 elisp_maphash(add_coding_system_to_list_mapper,
617 Vcoding_system_hash_table, &coding_system_list_closure);
620 return coding_system_list;
623 DEFUN("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
624 Return the name of the given coding system.
628 coding_system = Fget_coding_system(coding_system);
629 return XCODING_SYSTEM_NAME(coding_system);
632 static Lisp_Coding_System *allocate_coding_system(enum coding_system_type type,
635 Lisp_Coding_System *codesys =
636 alloc_lcrecord_type(Lisp_Coding_System, &lrecord_coding_system);
638 zero_lcrecord(codesys);
639 CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys) = Qnil;
640 CODING_SYSTEM_POST_READ_CONVERSION(codesys) = Qnil;
641 CODING_SYSTEM_EOL_TYPE(codesys) = EOL_AUTODETECT;
642 CODING_SYSTEM_EOL_CRLF(codesys) = Qnil;
643 CODING_SYSTEM_EOL_CR(codesys) = Qnil;
644 CODING_SYSTEM_EOL_LF(codesys) = Qnil;
645 CODING_SYSTEM_TYPE(codesys) = type;
646 CODING_SYSTEM_MNEMONIC(codesys) = Qnil;
648 if (type == CODESYS_ISO2022) {
650 for (i = 0; i < 4; i++)
651 CODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, i) =
653 } else if (type == CODESYS_CCL) {
654 CODING_SYSTEM_CCL_DECODE(codesys) = Qnil;
655 CODING_SYSTEM_CCL_ENCODE(codesys) = Qnil;
658 CODING_SYSTEM_NAME(codesys) = name;
664 /* Given a list of charset conversion specs as specified in a Lisp
665 program, parse it into STORE_HERE. */
668 parse_charset_conversion_specs(charset_conversion_spec_dynarr * store_here,
669 Lisp_Object spec_list)
673 EXTERNAL_LIST_LOOP(rest, spec_list) {
674 Lisp_Object car = XCAR(rest);
675 Lisp_Object from, to;
676 struct charset_conversion_spec spec;
678 if (!CONSP(car) || !CONSP(XCDR(car)) || !NILP(XCDR(XCDR(car))))
679 signal_simple_error("Invalid charset conversion spec",
681 from = Fget_charset(XCAR(car));
682 to = Fget_charset(XCAR(XCDR(car)));
683 if (XCHARSET_TYPE(from) != XCHARSET_TYPE(to))
684 signal_simple_error_2
685 ("Attempted conversion between different charset types",
687 spec.from_charset = from;
688 spec.to_charset = to;
690 Dynarr_add(store_here, spec);
694 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
695 specs, return the equivalent as the Lisp programmer would see it.
697 If LOAD_HERE is 0, return Qnil. */
700 unparse_charset_conversion_specs(charset_conversion_spec_dynarr * load_here)
707 for (i = 0, result = Qnil; i < Dynarr_length(load_here); i++) {
708 struct charset_conversion_spec *ccs = Dynarr_atp(load_here, i);
710 Fcons(list2(ccs->from_charset, ccs->to_charset), result);
713 return Fnreverse(result);
718 DEFUN("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
719 Register symbol NAME as a coding system.
721 TYPE describes the conversion method used and should be one of
724 Automatic conversion. SXEmacs attempts to detect the coding system
727 No conversion. Use this for binary files and such. On output,
728 graphic characters that are not in ASCII or Latin-1 will be
729 replaced by a ?. (For a no-conversion-encoded buffer, these
730 characters will only be present if you explicitly insert them.)
732 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
734 ISO 10646 UCS-4 encoding.
736 ISO 10646 UTF-8 encoding.
738 Any ISO2022-compliant encoding. Among other things, this includes
739 JIS (the Japanese encoding commonly used for e-mail), EUC (the
740 standard Unix encoding for Japanese and other languages), and
741 Compound Text (the encoding used in X11). You can specify more
742 specific information about the conversion with the PROPS argument.
744 Big5 (the encoding commonly used for Taiwanese).
746 The conversion is performed using a user-written pseudo-code
747 program. CCL (Code Conversion Language) is the name of this
750 Write out or read in the raw contents of the memory representing
751 the buffer's text. This is primarily useful for debugging
752 purposes, and is only enabled when SXEmacs has been compiled with
753 DEBUG_SXEMACS defined (via the --debug configure option).
754 WARNING: Reading in a file using 'internal conversion can result
755 in an internal inconsistency in the memory representing a
756 buffer's text, which will produce unpredictable results and may
757 cause SXEmacs to crash. Under normal circumstances you should
758 never use 'internal conversion.
760 DOC-STRING is a string describing the coding system.
762 PROPS is a property list, describing the specific nature of the
763 character set. Recognized properties are:
766 String to be displayed in the modeline when this coding system is
770 End-of-line conversion to be used. It should be one of
773 Automatically detect the end-of-line type (LF, CRLF,
774 or CR). Also generate subsidiary coding systems named
775 `NAME-unix', `NAME-dos', and `NAME-mac', that are
776 identical to this coding system but have an EOL-TYPE
777 value of 'lf, 'crlf, and 'cr, respectively.
779 The end of a line is marked externally using ASCII LF.
780 Since this is also the way that SXEmacs represents an
781 end-of-line internally, specifying this option results
782 in no end-of-line conversion. This is the standard
783 format for Unix text files.
785 The end of a line is marked externally using ASCII
786 CRLF. This is the standard format for MS-DOS text
789 The end of a line is marked externally using ASCII CR.
790 This is the standard format for Macintosh text files.
792 Automatically detect the end-of-line type but do not
793 generate subsidiary coding systems. (This value is
794 converted to nil when stored internally, and
795 `coding-system-property' will return nil.)
797 'post-read-conversion
798 Function called after a file has been read in, to perform the
799 decoding. Called with two arguments, START and END, denoting
800 a region of the current buffer to be decoded.
802 'pre-write-conversion
803 Function called before a file is written out, to perform the
804 encoding. Called with two arguments, START and END, denoting
805 a region of the current buffer to be encoded.
807 The following additional properties are recognized if TYPE is 'iso2022:
813 The character set initially designated to the G0 - G3 registers.
814 The value should be one of
816 -- A charset object (designate that character set)
817 -- nil (do not ever use this register)
818 -- t (no character set is initially designated to
819 the register, but may be later on; this automatically
820 sets the corresponding `force-g*-on-output' property)
826 If non-nil, send an explicit designation sequence on output before
827 using the specified register.
830 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
831 "ESC $ B" on output in place of the full designation sequences
832 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
835 If non-nil, don't designate ASCII to G0 at each end of line on output.
836 Setting this to non-nil also suppresses other state-resetting that
837 normally happens at the end of a line.
840 If non-nil, don't designate ASCII to G0 before control chars on output.
843 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
847 If non-nil, use locking-shift (SO/SI) instead of single-shift
848 or designation by escape sequence.
851 If non-nil, don't use ISO6429's direction specification.
854 If non-nil, literal control characters that are the same as
855 the beginning of a recognized ISO2022 or ISO6429 escape sequence
856 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
857 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
858 so that they can be properly distinguished from an escape sequence.
859 (Note that doing this results in a non-portable encoding.) This
860 encoding flag is used for byte-compiled files. Note that ESC
861 is a good choice for a quoting character because there are no
862 escape sequences whose second byte is a character from the Control-0
863 or Control-1 character sets; this is explicitly disallowed by the
866 'input-charset-conversion
867 A list of conversion specifications, specifying conversion of
868 characters in one charset to another when decoding is performed.
869 Each specification is a list of two elements: the source charset,
870 and the destination charset.
872 'output-charset-conversion
873 A list of conversion specifications, specifying conversion of
874 characters in one charset to another when encoding is performed.
875 The form of each specification is the same as for
876 'input-charset-conversion.
878 The following additional properties are recognized (and required)
882 CCL program used for decoding (converting to internal format).
885 CCL program used for encoding (converting to external format).
887 (name, type, doc_string, props))
889 Lisp_Coding_System *codesys;
890 enum coding_system_type ty;
891 int need_to_setup_eol_systems = 1;
893 /* Convert type to constant */
894 if (NILP(type) || EQ(type, Qundecided)) {
895 ty = CODESYS_AUTODETECT;
898 else if (EQ(type, Qshift_jis)) {
899 ty = CODESYS_SHIFT_JIS;
900 } else if (EQ(type, Qiso2022)) {
901 ty = CODESYS_ISO2022;
902 } else if (EQ(type, Qbig5)) {
904 } else if (EQ(type, Qucs4)) {
906 } else if (EQ(type, Qutf8)) {
908 } else if (EQ(type, Qccl)) {
912 else if (EQ(type, Qno_conversion)) {
913 ty = CODESYS_NO_CONVERSION;
916 else if (EQ(type, Qinternal)) {
917 ty = CODESYS_INTERNAL;
921 signal_simple_error("Invalid coding system type", type);
925 codesys = allocate_coding_system(ty, name);
927 if (NILP(doc_string))
928 doc_string = build_string("");
930 CHECK_STRING(doc_string);
931 CODING_SYSTEM_DOC_STRING(codesys) = doc_string;
934 EXTERNAL_PROPERTY_LIST_LOOP_3(key, value, props) {
935 if (EQ(key, Qmnemonic)) {
938 CODING_SYSTEM_MNEMONIC(codesys) = value;
941 else if (EQ(key, Qeol_type)) {
942 need_to_setup_eol_systems = NILP(value);
945 CODING_SYSTEM_EOL_TYPE(codesys) =
946 symbol_to_eol_type(value);
949 else if (EQ(key, Qpost_read_conversion))
950 CODING_SYSTEM_POST_READ_CONVERSION(codesys) =
952 else if (EQ(key, Qpre_write_conversion))
953 CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys) =
956 else if (ty == CODESYS_ISO2022) {
957 #define FROB_INITIAL_CHARSET(charset_num) \
958 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
959 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
961 if (EQ(key, Qcharset_g0))
962 FROB_INITIAL_CHARSET(0);
963 else if (EQ(key, Qcharset_g1))
964 FROB_INITIAL_CHARSET(1);
965 else if (EQ(key, Qcharset_g2))
966 FROB_INITIAL_CHARSET(2);
967 else if (EQ(key, Qcharset_g3))
968 FROB_INITIAL_CHARSET(3);
970 #define FROB_FORCE_CHARSET(charset_num) \
971 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
973 else if (EQ(key, Qforce_g0_on_output))
974 FROB_FORCE_CHARSET(0);
975 else if (EQ(key, Qforce_g1_on_output))
976 FROB_FORCE_CHARSET(1);
977 else if (EQ(key, Qforce_g2_on_output))
978 FROB_FORCE_CHARSET(2);
979 else if (EQ(key, Qforce_g3_on_output))
980 FROB_FORCE_CHARSET(3);
982 #define FROB_BOOLEAN_PROPERTY(prop) \
983 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
985 else if (EQ(key, Qshort))
986 FROB_BOOLEAN_PROPERTY(SHORT);
987 else if (EQ(key, Qno_ascii_eol))
988 FROB_BOOLEAN_PROPERTY(NO_ASCII_EOL);
989 else if (EQ(key, Qno_ascii_cntl))
990 FROB_BOOLEAN_PROPERTY(NO_ASCII_CNTL);
991 else if (EQ(key, Qseven))
992 FROB_BOOLEAN_PROPERTY(SEVEN);
993 else if (EQ(key, Qlock_shift))
994 FROB_BOOLEAN_PROPERTY(LOCK_SHIFT);
995 else if (EQ(key, Qno_iso6429))
996 FROB_BOOLEAN_PROPERTY(NO_ISO6429);
997 else if (EQ(key, Qescape_quoted))
998 FROB_BOOLEAN_PROPERTY(ESCAPE_QUOTED);
1000 else if (EQ(key, Qinput_charset_conversion)) {
1001 codesys->iso2022.input_conv =
1002 Dynarr_new(charset_conversion_spec);
1003 parse_charset_conversion_specs(codesys->
1007 } else if (EQ(key, Qoutput_charset_conversion)) {
1008 codesys->iso2022.output_conv =
1009 Dynarr_new(charset_conversion_spec);
1010 parse_charset_conversion_specs(codesys->
1016 ("Unrecognized property", key);
1017 } else if (EQ(type, Qccl)) {
1019 struct ccl_program test_ccl;
1022 /* Check key first. */
1023 if (EQ(key, Qdecode))
1024 suffix = "-ccl-decode";
1025 else if (EQ(key, Qencode))
1026 suffix = "-ccl-encode";
1029 ("Unrecognized property", key);
1031 /* If value is vector, register it as a ccl program
1032 associated with an newly created symbol for
1033 backward compatibility. */
1034 if (VECTORP(value)) {
1037 (Fsymbol_name(name),
1038 build_string(suffix)),
1040 Fregister_ccl_program(sym, value);
1042 CHECK_SYMBOL(value);
1045 /* check if the given ccl programs are valid. */
1046 if (setup_ccl_program(&test_ccl, sym) < 0)
1048 ("Invalid CCL program", value);
1050 if (EQ(key, Qdecode))
1051 CODING_SYSTEM_CCL_DECODE(codesys) = sym;
1052 else if (EQ(key, Qencode))
1053 CODING_SYSTEM_CCL_ENCODE(codesys) = sym;
1058 signal_simple_error("Unrecognized property",
1063 if (need_to_setup_eol_systems)
1064 setup_eol_coding_systems(codesys);
1067 Lisp_Object codesys_obj;
1068 XSETCODING_SYSTEM(codesys_obj, codesys);
1069 Fputhash(name, codesys_obj, Vcoding_system_hash_table);
1074 DEFUN("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1075 Copy OLD-CODING-SYSTEM to NEW-NAME.
1076 If NEW-NAME does not name an existing coding system, a new one will
1079 (old_coding_system, new_name))
1081 Lisp_Object new_coding_system;
1082 old_coding_system = Fget_coding_system(old_coding_system);
1083 new_coding_system = Ffind_coding_system(new_name);
1084 if (NILP(new_coding_system)) {
1085 XSETCODING_SYSTEM(new_coding_system,
1086 allocate_coding_system
1087 (XCODING_SYSTEM_TYPE(old_coding_system),
1089 Fputhash(new_name, new_coding_system,
1090 Vcoding_system_hash_table);
1094 Lisp_Coding_System *to = XCODING_SYSTEM(new_coding_system);
1095 Lisp_Coding_System *from = XCODING_SYSTEM(old_coding_system);
1096 memcpy(((char *)to) + sizeof(to->header),
1097 ((char *)from) + sizeof(from->header),
1098 sizeof(*from) - sizeof(from->header));
1099 to->name = new_name;
1101 return new_coding_system;
1104 DEFUN("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1105 Return t if OBJECT names a coding system, and is not a coding system alias.
1109 Lisp_Object val = Fgethash(object, Vcoding_system_hash_table, Qnil);
1110 return CODING_SYSTEMP(val) ? Qt : Qnil;
1113 DEFUN("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1114 Return t if OBJECT is a coding system alias.
1115 All coding system aliases are created by `define-coding-system-alias'.
1119 Lisp_Object val = Fgethash(object, Vcoding_system_hash_table, Qzero);
1120 return SYMBOLP(val) ? Qt : Qnil;
1123 DEFUN("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1124 Return the coding-system symbol for which symbol ALIAS is an alias.
1128 Lisp_Object aliasee = Fgethash(alias, Vcoding_system_hash_table, Qnil);
1129 if (SYMBOLP(aliasee)) {
1132 signal_simple_error(
1133 "Symbol is not a coding system alias", alias);
1135 /* To keep the compiler happy */
1140 append_suffix_to_symbol(Lisp_Object symbol, char *ascii_string)
1142 return Fintern(concat2(Fsymbol_name(symbol),
1143 build_string(ascii_string)), Qnil);
1146 /* A maphash function, for removing dangling coding system aliases. */
1148 dangling_coding_system_alias_p(Lisp_Object alias,
1149 Lisp_Object aliasee, void *dangling_aliases)
1151 if (SYMBOLP(aliasee)
1152 && NILP(Fgethash(aliasee, Vcoding_system_hash_table, Qnil))) {
1153 (*(int *)dangling_aliases)++;
1160 DEFUN("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1161 Define symbol ALIAS as an alias for coding system ALIASEE.
1163 You can use this function to redefine an alias that has already been defined,
1164 but you cannot redefine a name which is the canonical name for a coding system.
1165 \(a canonical name of a coding system is what is returned when you call
1166 `coding-system-name' on a coding system).
1168 ALIASEE itself can be an alias, which allows you to define nested aliases.
1170 You are forbidden, however, from creating alias loops or `dangling' aliases.
1171 These will be detected, and an error will be signaled if you attempt to do so.
1173 If ALIASEE is nil, then ALIAS will simply be undefined.
1175 See also `coding-system-alias-p', `coding-system-aliasee',
1176 and `coding-system-canonical-name-p'.
1180 Lisp_Object real_coding_system, probe;
1182 CHECK_SYMBOL(alias);
1184 if (!NILP(Fcoding_system_canonical_name_p(alias)))
1186 ("Symbol is the canonical name of a coding system and cannot be redefined",
1189 if (NILP(aliasee)) {
1190 Lisp_Object subsidiary_unix =
1191 append_suffix_to_symbol(alias, "-unix");
1192 Lisp_Object subsidiary_dos =
1193 append_suffix_to_symbol(alias, "-dos");
1194 Lisp_Object subsidiary_mac =
1195 append_suffix_to_symbol(alias, "-mac");
1197 Fremhash(alias, Vcoding_system_hash_table);
1199 /* Undefine subsidiary aliases,
1200 presumably created by a previous call to this function */
1201 if (!NILP(Fcoding_system_alias_p(subsidiary_unix)) &&
1202 !NILP(Fcoding_system_alias_p(subsidiary_dos)) &&
1203 !NILP(Fcoding_system_alias_p(subsidiary_mac))) {
1204 Fdefine_coding_system_alias(subsidiary_unix, Qnil);
1205 Fdefine_coding_system_alias(subsidiary_dos, Qnil);
1206 Fdefine_coding_system_alias(subsidiary_mac, Qnil);
1209 /* Undefine dangling coding system aliases. */
1211 int dangling_aliases;
1214 dangling_aliases = 0;
1216 (dangling_coding_system_alias_p,
1217 Vcoding_system_hash_table,
1219 } while (dangling_aliases > 0);
1225 if (CODING_SYSTEMP(aliasee))
1226 aliasee = XCODING_SYSTEM_NAME(aliasee);
1228 /* Checks that aliasee names a coding-system */
1229 real_coding_system = Fget_coding_system(aliasee);
1231 /* Check for coding system alias loops */
1232 if (EQ(alias, aliasee))
1233 alias_loop:signal_simple_error_2
1234 ("Attempt to create a coding system alias loop", alias,
1237 for (probe = aliasee;
1239 probe = Fgethash(probe, Vcoding_system_hash_table, Qzero)) {
1240 if (EQ(probe, alias))
1244 Fputhash(alias, aliasee, Vcoding_system_hash_table);
1246 /* Set up aliases for subsidiaries.
1247 #### There must be a better way to handle subsidiary coding
1250 static char *suffixes[] = { "-unix", "-dos", "-mac" };
1252 for (int i = 0; i < countof(suffixes); i++) {
1253 Lisp_Object alias_subsidiary =
1254 append_suffix_to_symbol(alias, suffixes[i]);
1255 Lisp_Object aliasee_subsidiary =
1256 append_suffix_to_symbol(aliasee, suffixes[i]);
1258 if (!NILP(Ffind_coding_system(aliasee_subsidiary))) {
1259 Fdefine_coding_system_alias(alias_subsidiary,
1260 aliasee_subsidiary);
1264 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1265 but it doesn't look intentional, so I'd rather return something
1266 meaningful or nothing at all. */
1271 subsidiary_coding_system(Lisp_Object coding_system, eol_type_t type)
1273 Lisp_Coding_System *cs = XCODING_SYSTEM(coding_system);
1274 Lisp_Object new_coding_system;
1276 if (CODING_SYSTEM_EOL_TYPE(cs) != EOL_AUTODETECT)
1277 return coding_system;
1280 case EOL_AUTODETECT:
1281 return coding_system;
1283 new_coding_system = CODING_SYSTEM_EOL_LF(cs);
1286 new_coding_system = CODING_SYSTEM_EOL_CR(cs);
1289 new_coding_system = CODING_SYSTEM_EOL_CRLF(cs);
1296 return NILP(new_coding_system) ? coding_system : new_coding_system;
1299 DEFUN("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1300 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1302 (coding_system, eol_type))
1304 coding_system = Fget_coding_system(coding_system);
1306 return subsidiary_coding_system(coding_system,
1307 symbol_to_eol_type(eol_type));
1310 /************************************************************************/
1311 /* Coding system accessors */
1312 /************************************************************************/
1314 DEFUN("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1315 Return the doc string for CODING-SYSTEM.
1319 coding_system = Fget_coding_system(coding_system);
1320 return XCODING_SYSTEM_DOC_STRING(coding_system);
1323 DEFUN("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1324 Return the type of CODING-SYSTEM.
1328 Lisp_Object tmp = Fget_coding_system(coding_system);
1330 switch (XCODING_SYSTEM_TYPE(tmp)) {
1333 case CODESYS_AUTODETECT:
1336 case CODESYS_SHIFT_JIS:
1338 case CODESYS_ISO2022:
1349 case CODESYS_NO_CONVERSION:
1350 return Qno_conversion;
1351 #ifdef DEBUG_SXEMACS
1352 case CODESYS_INTERNAL:
1360 Lisp_Object coding_system_charset(Lisp_Object coding_system, int gnum)
1363 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET(coding_system, gnum);
1365 return CHARSETP(cs) ? XCHARSET_NAME(cs) : Qnil;
1368 DEFUN("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1369 Return initial charset of CODING-SYSTEM designated to GNUM.
1372 (coding_system, gnum))
1374 coding_system = Fget_coding_system(coding_system);
1377 return coding_system_charset(coding_system, XINT(gnum));
1381 DEFUN("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1382 Return the PROP property of CODING-SYSTEM.
1384 (coding_system, prop))
1387 enum coding_system_type type;
1389 coding_system = Fget_coding_system(coding_system);
1391 type = XCODING_SYSTEM_TYPE(coding_system);
1393 for (i = 0; !ok && i < Dynarr_length(the_codesys_prop_dynarr); i++)
1394 if (EQ(Dynarr_at(the_codesys_prop_dynarr, i).sym, prop)) {
1396 switch (Dynarr_at(the_codesys_prop_dynarr, i).prop_type) {
1397 case CODESYS_PROP_ALL_OK:
1400 case CODESYS_PROP_ISO2022:
1401 if (type != CODESYS_ISO2022)
1403 ("Property only valid in ISO2022 coding systems",
1407 case CODESYS_PROP_CCL:
1408 if (type != CODESYS_CCL)
1410 ("Property only valid in CCL coding systems",
1420 signal_simple_error("Unrecognized property", prop);
1422 if (EQ(prop, Qname))
1423 return XCODING_SYSTEM_NAME(coding_system);
1424 else if (EQ(prop, Qtype))
1425 return Fcoding_system_type(coding_system);
1426 else if (EQ(prop, Qdoc_string))
1427 return XCODING_SYSTEM_DOC_STRING(coding_system);
1428 else if (EQ(prop, Qmnemonic))
1429 return XCODING_SYSTEM_MNEMONIC(coding_system);
1430 else if (EQ(prop, Qeol_type))
1432 eol_type_to_symbol(XCODING_SYSTEM_EOL_TYPE(coding_system));
1433 else if (EQ(prop, Qeol_lf))
1434 return XCODING_SYSTEM_EOL_LF(coding_system);
1435 else if (EQ(prop, Qeol_crlf))
1436 return XCODING_SYSTEM_EOL_CRLF(coding_system);
1437 else if (EQ(prop, Qeol_cr))
1438 return XCODING_SYSTEM_EOL_CR(coding_system);
1439 else if (EQ(prop, Qpost_read_conversion))
1440 return XCODING_SYSTEM_POST_READ_CONVERSION(coding_system);
1441 else if (EQ(prop, Qpre_write_conversion))
1442 return XCODING_SYSTEM_PRE_WRITE_CONVERSION(coding_system);
1444 else if (type == CODESYS_ISO2022) {
1445 if (EQ(prop, Qcharset_g0))
1446 return coding_system_charset(coding_system, 0);
1447 else if (EQ(prop, Qcharset_g1))
1448 return coding_system_charset(coding_system, 1);
1449 else if (EQ(prop, Qcharset_g2))
1450 return coding_system_charset(coding_system, 2);
1451 else if (EQ(prop, Qcharset_g3))
1452 return coding_system_charset(coding_system, 3);
1454 #define FORCE_CHARSET(charset_num) \
1455 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1456 (coding_system, charset_num) ? Qt : Qnil)
1458 else if (EQ(prop, Qforce_g0_on_output))
1459 return FORCE_CHARSET(0);
1460 else if (EQ(prop, Qforce_g1_on_output))
1461 return FORCE_CHARSET(1);
1462 else if (EQ(prop, Qforce_g2_on_output))
1463 return FORCE_CHARSET(2);
1464 else if (EQ(prop, Qforce_g3_on_output))
1465 return FORCE_CHARSET(3);
1467 #define LISP_BOOLEAN(prop) \
1468 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1470 else if (EQ(prop, Qshort))
1471 return LISP_BOOLEAN(SHORT);
1472 else if (EQ(prop, Qno_ascii_eol))
1473 return LISP_BOOLEAN(NO_ASCII_EOL);
1474 else if (EQ(prop, Qno_ascii_cntl))
1475 return LISP_BOOLEAN(NO_ASCII_CNTL);
1476 else if (EQ(prop, Qseven))
1477 return LISP_BOOLEAN(SEVEN);
1478 else if (EQ(prop, Qlock_shift))
1479 return LISP_BOOLEAN(LOCK_SHIFT);
1480 else if (EQ(prop, Qno_iso6429))
1481 return LISP_BOOLEAN(NO_ISO6429);
1482 else if (EQ(prop, Qescape_quoted))
1483 return LISP_BOOLEAN(ESCAPE_QUOTED);
1485 else if (EQ(prop, Qinput_charset_conversion))
1487 unparse_charset_conversion_specs
1488 (XCODING_SYSTEM(coding_system)->iso2022.input_conv);
1489 else if (EQ(prop, Qoutput_charset_conversion))
1491 unparse_charset_conversion_specs
1492 (XCODING_SYSTEM(coding_system)->iso2022.
1496 } else if (type == CODESYS_CCL) {
1497 if (EQ(prop, Qdecode))
1498 return XCODING_SYSTEM_CCL_DECODE(coding_system);
1499 else if (EQ(prop, Qencode))
1500 return XCODING_SYSTEM_CCL_ENCODE(coding_system);
1508 return Qnil; /* not reached */
1511 /************************************************************************/
1512 /* Coding category functions */
1513 /************************************************************************/
1515 static int decode_coding_category(Lisp_Object symbol)
1519 CHECK_SYMBOL(symbol);
1520 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1521 if (EQ(coding_category_symbol[i], symbol))
1524 signal_simple_error("Unrecognized coding category", symbol);
1525 return 0; /* not reached */
1528 DEFUN("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1529 Return a list of all recognized coding categories.
1534 Lisp_Object list = Qnil;
1536 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1537 list = Fcons(coding_category_symbol[i], list);
1541 DEFUN("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1542 Change the priority order of the coding categories.
1543 LIST should be list of coding categories, in descending order of
1544 priority. Unspecified coding categories will be lower in priority
1545 than all specified ones, in the same relative order they were in
1550 int category_to_priority[CODING_CATEGORY_LAST];
1554 /* First generate a list that maps coding categories to priorities. */
1556 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1557 category_to_priority[i] = -1;
1559 /* Highest priority comes from the specified list. */
1561 EXTERNAL_LIST_LOOP(rest, list) {
1562 int cat = decode_coding_category(XCAR(rest));
1564 if (category_to_priority[cat] >= 0)
1565 signal_simple_error("Duplicate coding category in list",
1567 category_to_priority[cat] = i++;
1570 /* Now go through the existing categories by priority to retrieve
1571 the categories not yet specified and preserve their priority
1573 for (j = 0; j < CODING_CATEGORY_LAST; j++) {
1574 int cat = fcd->coding_category_by_priority[j];
1575 if (category_to_priority[cat] < 0)
1576 category_to_priority[cat] = i++;
1579 /* Now we need to construct the inverse of the mapping we just
1582 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1583 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1585 /* Phew! That was confusing. */
1589 DEFUN("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1590 Return a list of coding categories in descending order of priority.
1595 Lisp_Object list = Qnil;
1597 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1599 Fcons(coding_category_symbol
1600 [fcd->coding_category_by_priority[i]], list);
1604 DEFUN("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1605 Change the coding system associated with a coding category.
1607 (coding_category, coding_system))
1609 int cat = decode_coding_category(coding_category);
1611 coding_system = Fget_coding_system(coding_system);
1612 fcd->coding_category_system[cat] = coding_system;
1616 DEFUN("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1617 Return the coding system associated with a coding category.
1621 int cat = decode_coding_category(coding_category);
1622 Lisp_Object sys = fcd->coding_category_system[cat];
1625 return XCODING_SYSTEM_NAME(sys);
1629 /************************************************************************/
1630 /* Detecting the encoding of data */
1631 /************************************************************************/
1633 struct detection_state {
1634 eol_type_t eol_type;
1661 struct iso2022_decoder iso;
1663 int high_byte_count;
1664 unsigned int saw_single_shift:1;
1673 static int acceptable_control_char_p(int c)
1676 /* Allow and ignore control characters that you might
1677 reasonably see in a text file */
1682 case 8: /* backspace */
1683 case 11: /* vertical tab */
1684 case 12: /* form feed */
1685 case 26: /* MS-DOS C-z junk */
1686 case 31: /* '^_' -- for info */
1693 static int mask_has_at_most_one_bit_p(int mask)
1695 /* Perhaps the only thing useful you learn from intensive Microsoft
1696 technical interviews */
1697 return (mask & (mask - 1)) == 0;
1701 detect_eol_type(struct detection_state *st, const Extbyte * src,
1702 Lstream_data_count n)
1705 const unsigned char c = *(const unsigned char*)src++;
1707 if (st->eol.just_saw_cr)
1709 else if (st->eol.seen_anything)
1711 } else if (st->eol.just_saw_cr)
1714 st->eol.just_saw_cr = 1;
1716 st->eol.just_saw_cr = 0;
1717 st->eol.seen_anything = 1;
1720 return EOL_AUTODETECT;
1723 /* Attempt to determine the encoding and EOL type of the given text.
1724 Before calling this function for the first type, you must initialize
1725 st->eol_type as appropriate and initialize st->mask to ~0.
1727 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1730 st->mask holds the determined coding category mask, or ~0 if only
1731 ASCII has been seen so far.
1735 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1736 is present in st->mask
1737 1 == definitive answers are here for both st->eol_type and st->mask
1741 detect_coding_type(struct detection_state *st, const Extbyte * src,
1742 Lstream_data_count n, int just_do_eol)
1744 if (st->eol_type == EOL_AUTODETECT)
1745 st->eol_type = detect_eol_type(st, src, n);
1748 return st->eol_type != EOL_AUTODETECT;
1750 if (!st->seen_non_ascii) {
1751 for (; n; n--, src++) {
1752 const unsigned char c = *(const unsigned char *)src;
1753 if ((c < 0x20 && !acceptable_control_char_p(c))
1755 st->seen_non_ascii = 1;
1757 st->shift_jis.mask = ~0;
1761 st->iso2022.mask = ~0;
1772 if (!mask_has_at_most_one_bit_p(st->iso2022.mask))
1773 st->iso2022.mask = detect_coding_iso2022(st, src, n);
1774 if (!mask_has_at_most_one_bit_p(st->shift_jis.mask))
1775 st->shift_jis.mask = detect_coding_sjis(st, src, n);
1776 if (!mask_has_at_most_one_bit_p(st->big5.mask))
1777 st->big5.mask = detect_coding_big5(st, src, n);
1778 if (!mask_has_at_most_one_bit_p(st->utf8.mask))
1779 st->utf8.mask = detect_coding_utf8(st, src, n);
1780 if (!mask_has_at_most_one_bit_p(st->ucs4.mask))
1781 st->ucs4.mask = detect_coding_ucs4(st, src, n);
1783 st->mask = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1784 | st->utf8.mask | st->ucs4.mask;
1787 int retval = mask_has_at_most_one_bit_p(st->mask);
1788 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1789 return retval && st->eol_type != EOL_AUTODETECT;
1793 static Lisp_Object coding_system_from_mask(int mask)
1796 /* If the file was entirely or basically ASCII, use the
1797 default value of `buffer-file-coding-system'. */
1798 Lisp_Object retval =
1799 XBUFFER(Vbuffer_defaults)->buffer_file_coding_system;
1800 if (!NILP(retval)) {
1801 retval = Ffind_coding_system(retval);
1804 (Qbad_variable, Qwarning,
1805 "Invalid `default-buffer-file-coding-system', set to nil");
1806 XBUFFER(Vbuffer_defaults)->
1807 buffer_file_coding_system = Qnil;
1811 retval = Fget_coding_system(Qraw_text);
1817 mask = postprocess_iso2022_mask(mask);
1819 /* Look through the coding categories by priority and find
1820 the first one that is allowed. */
1821 for (i = 0; i < CODING_CATEGORY_LAST; i++) {
1822 cat = fcd->coding_category_by_priority[i];
1825 if ((mask & (1 << cat)) &&
1826 !NILP(fcd->coding_category_system[cat]))
1830 return fcd->coding_category_system[cat];
1832 return Fget_coding_system(Qraw_text);
1836 /* Given a seekable read stream and potential coding system and EOL type
1837 as specified, do any autodetection that is called for. If the
1838 coding system and/or EOL type are not `autodetect', they will be left
1839 alone; but this function will never return an autodetect coding system
1842 This function does not automatically fetch subsidiary coding systems;
1843 that should be unnecessary with the explicit eol-type argument. */
1845 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1846 /* number of leading lines to check for a coding cookie */
1847 #define LINES_TO_CHECK 2
1850 determine_real_coding_system(lstream_t stream, Lisp_Object * codesys_in_out,
1851 eol_type_t * eol_type_in_out)
1853 struct detection_state decst;
1855 if (*eol_type_in_out == EOL_AUTODETECT)
1856 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE(*codesys_in_out);
1859 decst.eol_type = *eol_type_in_out;
1862 /* If autodetection is called for, do it now. */
1863 if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT
1864 || *eol_type_in_out == EOL_AUTODETECT) {
1866 Lisp_Object coding_system = Qnil;
1868 Lstream_data_count nread =
1869 Lstream_read(stream, buf, sizeof(buf));
1871 int lines_checked = 0;
1873 /* Look for initial "-*-"; mode line prefix */
1875 scan_end = buf + nread - LENGTH("-*-coding:?-*-");
1876 p <= scan_end && lines_checked < LINES_TO_CHECK; p++)
1877 if (*p == '-' && *(p + 1) == '*' && *(p + 2) == '-') {
1878 Extbyte *local_vars_beg = p + 3;
1879 /* Look for final "-*-"; mode line suffix */
1880 for (p = local_vars_beg,
1881 scan_end = buf + nread - LENGTH("-*-");
1883 && lines_checked < LINES_TO_CHECK; p++)
1884 if (*p == '-' && *(p + 1) == '*'
1885 && *(p + 2) == '-') {
1886 Extbyte *suffix = p;
1887 /* Look for "coding:" */
1888 for (p = local_vars_beg,
1916 /* Get coding system name */
1919 /* Characters valid in a MIME charset name (rfc 1521),
1920 and in a Lisp symbol name. */
1923 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1924 "abcdefghijklmnopqrstuvwxyz"
1946 /* #### file must use standard EOLs or we miss 2d line */
1947 /* #### not to mention this is broken for UTF-16 DOS files */
1948 else if (*p == '\n' || *p == '\r') {
1950 /* skip past multibyte (DOS) newline */
1952 && *(p + 1) == '\n')
1957 /* #### file must use standard EOLs or we miss 2d line */
1958 /* #### not to mention this is broken for UTF-16 DOS files */
1959 else if (*p == '\n' || *p == '\r') {
1961 /* skip past multibyte (DOS) newline */
1962 if (*p == '\r' && *(p + 1) == '\n')
1966 if (NILP(coding_system))
1968 if (detect_coding_type(&decst, buf, nread,
1971 != CODESYS_AUTODETECT))
1973 nread = Lstream_read(stream, buf, sizeof(buf));
1979 else if (XCODING_SYSTEM_TYPE(*codesys_in_out) ==
1981 && XCODING_SYSTEM_EOL_TYPE(coding_system) ==
1984 if (detect_coding_type(&decst, buf, nread, 1))
1986 nread = Lstream_read(stream, buf, sizeof(buf));
1992 *eol_type_in_out = decst.eol_type;
1993 if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT) {
1994 if (NILP(coding_system))
1996 coding_system_from_mask(decst.mask);
1998 *codesys_in_out = coding_system;
2002 /* If we absolutely can't determine the EOL type, just assume LF. */
2003 if (*eol_type_in_out == EOL_AUTODETECT)
2004 *eol_type_in_out = EOL_LF;
2006 Lstream_rewind(stream);
2009 DEFUN("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
2010 Detect coding system of the text in the region between START and END.
2011 Return a list of possible coding systems ordered by priority.
2012 If only ASCII characters are found, return 'undecided or one of
2013 its subsidiary coding systems according to a detected end-of-line
2014 type. Optional arg BUFFER defaults to the current buffer.
2016 (start, end, buffer))
2018 Lisp_Object val = Qnil;
2019 struct buffer *buf = decode_buffer(buffer, 0);
2021 Lisp_Object instream, lb_instream;
2022 lstream_t istr, lb_istr;
2023 struct detection_state decst;
2024 struct gcpro gcpro1, gcpro2;
2026 get_buffer_range_char(buf, start, end, &b, &e, 0);
2027 lb_instream = make_lisp_buffer_input_stream(buf, b, e, 0);
2028 lb_istr = XLSTREAM(lb_instream);
2030 make_encoding_input_stream(lb_istr, Fget_coding_system(Qbinary));
2031 istr = XLSTREAM(instream);
2032 GCPRO2(instream, lb_instream);
2034 decst.eol_type = EOL_AUTODETECT;
2037 Extbyte random_buffer[4096];
2038 Lstream_data_count nread =
2039 Lstream_read(istr, random_buffer, sizeof(random_buffer));
2043 if (detect_coding_type(&decst, random_buffer, nread, 0))
2047 if (decst.mask == ~0)
2048 val = subsidiary_coding_system(Fget_coding_system(Qundecided),
2055 decst.mask = postprocess_iso2022_mask(decst.mask);
2057 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--) {
2058 int sys = fcd->coding_category_by_priority[i];
2059 if (decst.mask & (1 << sys)) {
2060 Lisp_Object codesys =
2061 fcd->coding_category_system[sys];
2064 subsidiary_coding_system(codesys,
2067 val = Fcons(codesys, val);
2071 Lstream_close(istr);
2073 Lstream_delete(istr);
2074 Lstream_delete(lb_istr);
2078 /************************************************************************/
2079 /* Converting to internal Mule format ("decoding") */
2080 /************************************************************************/
2082 /* A decoding stream is a stream used for decoding text (i.e.
2083 converting from some external format to internal format).
2084 The decoding-stream object keeps track of the actual coding
2085 stream, the stream that is at the other end, and data that
2086 needs to be persistent across the lifetime of the stream. */
2088 /* Handle the EOL stuff related to just-read-in character C.
2089 EOL_TYPE is the EOL type of the coding stream.
2090 FLAGS is the current value of FLAGS in the coding stream, and may
2091 be modified by this macro. (The macro only looks at the
2092 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
2093 bytes are to be written. You need to also define a local goto
2094 label "label_continue_loop" that is at the end of the main
2095 character-reading loop.
2097 If C is a CR character, then this macro handles it entirely and
2098 jumps to label_continue_loop. Otherwise, this macro does not add
2099 anything to DST, and continues normally. You should continue
2100 processing C normally after this macro. */
2102 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2106 if (eol_type == EOL_CR) \
2107 Dynarr_add (dst, '\n'); \
2108 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2109 Dynarr_add (dst, c); \
2111 flags |= CODING_STATE_CR; \
2112 goto label_continue_loop; \
2114 else if (flags & CODING_STATE_CR) \
2115 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2117 Dynarr_add (dst, '\r'); \
2118 flags &= ~CODING_STATE_CR; \
2122 /* C should be a binary character in the range 0 - 255; convert
2123 to internal format and add to Dynarr DST. */
2125 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2127 if (BYTE_ASCII_P (c)) \
2128 Dynarr_add (dst, c); \
2129 else if (BYTE_C1_P (c)) \
2131 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2132 Dynarr_add (dst, c + 0x20); \
2136 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2137 Dynarr_add (dst, c); \
2141 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2145 DECODE_ADD_BINARY_CHAR (ch, dst); \
2150 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2152 if (flags & CODING_STATE_END) \
2154 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2155 if (flags & CODING_STATE_CR) \
2156 Dynarr_add (dst, '\r'); \
2160 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2162 typedef struct decoding_stream_s *decoding_stream_t;
2163 struct decoding_stream_s {
2164 /* Coding system that governs the conversion. */
2165 Lisp_Coding_System *codesys;
2167 /* Stream that we read the encoded data from or
2168 write the decoded data to. */
2169 lstream_t other_end;
2171 /* If we are reading, then we can return only a fixed amount of
2172 data, so if the conversion resulted in too much data, we store it
2173 here for retrieval the next time around. */
2174 unsigned_char_dynarr *runoff;
2176 /* FLAGS holds flags indicating the current state of the decoding.
2177 Some of these flags are dependent on the coding system. */
2180 /* CH holds a partially built-up character. Since we only deal
2181 with one- and two-byte characters at the moment, we only use
2182 this to store the first byte of a two-byte character. */
2185 /* EOL_TYPE specifies the type of end-of-line conversion that
2186 currently applies. We need to keep this separate from the
2187 EOL type stored in CODESYS because the latter might indicate
2188 automatic EOL-type detection while the former will always
2189 indicate a particular EOL type. */
2190 eol_type_t eol_type;
2192 /* Additional ISO2022 information. We define the structure above
2193 because it's also needed by the detection routines. */
2194 struct iso2022_decoder iso2022;
2196 /* Additional information (the state of the running CCL program)
2197 used by the CCL decoder. */
2198 struct ccl_program ccl;
2200 /* counter for UTF-8 or UCS-4 */
2201 unsigned char counter;
2203 struct detection_state decst;
2206 static Lstream_data_count
2207 decoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size);
2208 static Lstream_data_count
2209 decoding_writer(lstream_t stream,
2210 const unsigned char *data, Lstream_data_count size);
2211 static int decoding_rewinder(lstream_t stream);
2212 static int decoding_seekable_p(lstream_t stream);
2213 static int decoding_flusher(lstream_t stream);
2214 static int decoding_closer(lstream_t stream);
2216 static Lisp_Object decoding_marker(Lisp_Object stream);
2218 DEFINE_LSTREAM_IMPLEMENTATION("decoding", lstream_decoding,
2219 sizeof(struct decoding_stream_s));
2222 decoding_marker(Lisp_Object stream)
2224 lstream_t str = DECODING_STREAM_DATA(XLSTREAM(stream))->other_end;
2225 Lisp_Object str_obj;
2227 /* We do not need to mark the coding systems or charsets stored
2228 within the stream because they are stored in a global list
2229 and automatically marked. */
2231 XSETLSTREAM(str_obj, str);
2232 mark_object(str_obj);
2233 if (str->imp->marker) {
2234 return str->imp->marker(str_obj);
2240 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2241 so we read data from the other end, decode it, and store it into DATA. */
2243 static Lstream_data_count
2244 decoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size)
2246 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2247 unsigned char *orig_data = data;
2248 Lstream_data_count read_size;
2249 int error_occurred = 0;
2251 /* We need to interface to mule_decode(), which expects to take some
2252 amount of data and store the result into a Dynarr. We have
2253 mule_decode() store into str->runoff, and take data from there
2256 /* We loop until we have enough data, reading chunks from the other
2257 end and decoding it. */
2259 /* Take data from the runoff if we can. Make sure to take at
2260 most SIZE bytes, and delete the data from the runoff. */
2261 if (Dynarr_length(str->runoff) > 0) {
2262 Lstream_data_count chunk =
2264 (Lstream_data_count)
2265 Dynarr_length(str->runoff));
2266 memcpy(data, Dynarr_atp(str->runoff, 0), chunk);
2267 Dynarr_delete_many(str->runoff, 0, chunk);
2273 /* No more room for data */
2277 if (str->flags & CODING_STATE_END) {
2278 /* This means that on the previous iteration, we hit the
2279 EOF on the other end. We loop once more so that
2280 mule_decode() can output any final stuff it may be
2281 holding, or any "go back to a sane state" escape
2282 sequences. (This latter makes sense during
2287 /* Exhausted the runoff, so get some more. DATA has at least
2288 SIZE bytes left of storage in it, so it's OK to read directly
2289 into it. (We'll be overwriting above, after we've decoded it
2290 into the runoff.) */
2291 read_size = Lstream_read(str->other_end, data, size);
2292 if (read_size < 0) {
2296 if (read_size == 0) {
2297 /* There might be some more end data produced in the
2298 translation. See the comment above. */
2299 str->flags |= CODING_STATE_END;
2301 mule_decode(stream, (Extbyte *) data, str->runoff, read_size);
2304 if (data - orig_data == 0) {
2305 return error_occurred ? -1 : 0;
2307 return data - orig_data;
2311 static Lstream_data_count
2312 decoding_writer(lstream_t stream, const unsigned char *data,
2313 Lstream_data_count size)
2315 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2316 Lstream_data_count retval;
2318 /* Decode all our data into the runoff, and then attempt to write
2319 it all out to the other end. Remove whatever chunk we succeeded
2321 mule_decode(stream, (const Extbyte *)data, str->runoff, size);
2322 retval = Lstream_write(str->other_end, Dynarr_atp(str->runoff, 0),
2323 Dynarr_length(str->runoff));
2325 Dynarr_delete_many(str->runoff, 0, retval);
2327 /* Do NOT return retval. The return value indicates how much
2328 of the incoming data was written, not how many bytes were
2334 reset_decoding_stream(decoding_stream_t str)
2337 if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_ISO2022) {
2338 Lisp_Object coding_system;
2339 XSETCODING_SYSTEM(coding_system, str->codesys);
2340 reset_iso2022(coding_system, &str->iso2022);
2341 } else if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_CCL) {
2342 setup_ccl_program(&str->ccl,
2343 CODING_SYSTEM_CCL_DECODE(str->codesys));
2347 if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT
2348 || CODING_SYSTEM_EOL_TYPE(str->codesys) == EOL_AUTODETECT) {
2350 str->decst.eol_type = EOL_AUTODETECT;
2351 str->decst.mask = ~0;
2353 str->flags = str->ch = 0;
2357 decoding_rewinder(lstream_t stream)
2359 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2360 reset_decoding_stream(str);
2361 Dynarr_reset(str->runoff);
2362 return Lstream_rewind(str->other_end);
2366 decoding_seekable_p(lstream_t stream)
2368 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2369 return Lstream_seekable_p(str->other_end);
2373 decoding_flusher(lstream_t stream)
2375 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2376 return Lstream_flush(str->other_end);
2380 decoding_closer(lstream_t stream)
2382 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2383 if (stream->flags & LSTREAM_FL_WRITE) {
2384 str->flags |= CODING_STATE_END;
2385 decoding_writer(stream, 0, 0);
2387 Dynarr_free(str->runoff);
2389 #ifdef ENABLE_COMPOSITE_CHARS
2390 if (str->iso2022.composite_chars) {
2391 Dynarr_free(str->iso2022.composite_chars);
2395 return Lstream_close(str->other_end);
2399 decoding_stream_coding_system(lstream_t stream)
2401 Lisp_Object coding_system;
2402 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2404 XSETCODING_SYSTEM(coding_system, str->codesys);
2405 return subsidiary_coding_system(coding_system, str->eol_type);
2409 set_decoding_stream_coding_system(lstream_t lstr, Lisp_Object codesys)
2411 Lisp_Coding_System *cs = XCODING_SYSTEM(codesys);
2412 decoding_stream_t str = DECODING_STREAM_DATA(lstr);
2414 if (CODING_SYSTEM_EOL_TYPE(cs) != EOL_AUTODETECT) {
2415 str->eol_type = CODING_SYSTEM_EOL_TYPE(cs);
2417 reset_decoding_stream(str);
2421 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2422 stream for writing, no automatic code detection will be performed.
2423 The reason for this is that automatic code detection requires a
2424 seekable input. Things will also fail if you open a decoding
2425 stream for reading using a non-fully-specified coding system and
2426 a non-seekable input stream. */
2429 make_decoding_stream_1(lstream_t stream, Lisp_Object codesys, const char *mode)
2431 lstream_t lstr = Lstream_new(lstream_decoding, mode);
2432 decoding_stream_t str = DECODING_STREAM_DATA(lstr);
2436 str->other_end = stream;
2437 str->runoff = (unsigned_char_dynarr *) Dynarr_new(unsigned_char);
2438 str->eol_type = EOL_AUTODETECT;
2439 if (!strcmp(mode, "r") && Lstream_seekable_p(stream)) {
2440 /* We can determine the coding system now. */
2441 determine_real_coding_system(stream, &codesys, &str->eol_type);
2443 set_decoding_stream_coding_system(lstr, codesys);
2444 str->decst.eol_type = str->eol_type;
2445 str->decst.mask = ~0;
2446 XSETLSTREAM(obj, lstr);
2451 make_decoding_input_stream(lstream_t stream, Lisp_Object codesys)
2453 return make_decoding_stream_1(stream, codesys, "r");
2457 make_decoding_output_stream(lstream_t stream, Lisp_Object codesys)
2459 return make_decoding_stream_1(stream, codesys, "w");
2462 /* Note: the decode_coding_* functions all take the same
2463 arguments as mule_decode(), which is to say some SRC data of
2464 size N, which is to be stored into dynamic array DST.
2465 DECODING is the stream within which the decoding is
2466 taking place, but no data is actually read from or
2467 written to that stream; that is handled in decoding_reader()
2468 or decoding_writer(). This allows the same functions to
2469 be used for both reading and writing. */
2472 mule_decode(lstream_t decoding, const Extbyte * src,
2473 unsigned_char_dynarr * dst, Lstream_data_count n)
2475 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
2477 /* If necessary, do encoding-detection now. We do this when
2478 we're a writing stream or a non-seekable reading stream,
2479 meaning that we can't just process the whole input,
2480 rewind, and start over. */
2482 if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT ||
2483 str->eol_type == EOL_AUTODETECT) {
2484 Lisp_Object codesys;
2486 XSETCODING_SYSTEM(codesys, str->codesys);
2487 detect_coding_type(&str->decst, src, n,
2488 CODING_SYSTEM_TYPE(str->codesys) !=
2489 CODESYS_AUTODETECT);
2490 if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT &&
2491 str->decst.mask != ~0)
2492 /* #### This is cheesy. What we really ought to do is
2493 buffer up a certain amount of data so as to get a
2494 less random result. */
2495 codesys = coding_system_from_mask(str->decst.mask);
2496 str->eol_type = str->decst.eol_type;
2497 if (XCODING_SYSTEM(codesys) != str->codesys) {
2498 /* Preserve the CODING_STATE_END flag in case it was set.
2499 If we erase it, bad things might happen. */
2500 int was_end = str->flags & CODING_STATE_END;
2501 set_decoding_stream_coding_system(decoding, codesys);
2503 str->flags |= CODING_STATE_END;
2507 switch (CODING_SYSTEM_TYPE(str->codesys)) {
2508 #ifdef DEBUG_SXEMACS
2509 case CODESYS_INTERNAL:
2510 Dynarr_add_many(dst, src, n);
2513 case CODESYS_AUTODETECT:
2514 /* If we got this far and still haven't decided on the coding
2515 system, then do no conversion. */
2516 case CODESYS_NO_CONVERSION:
2517 decode_coding_no_conversion(decoding, src, dst, n);
2520 case CODESYS_SHIFT_JIS:
2521 decode_coding_sjis(decoding, src, dst, n);
2524 decode_coding_big5(decoding, src, dst, n);
2527 decode_coding_ucs4(decoding, src, dst, n);
2530 decode_coding_utf8(decoding, src, dst, n);
2533 str->ccl.last_block = str->flags & CODING_STATE_END;
2534 /* When applying ccl program to stream, MUST NOT set NULL
2536 ccl_driver(&str->ccl,
2538 ? (const unsigned char *)src
2539 : (const unsigned char *)""),
2540 dst, n, 0, CCL_MODE_DECODING);
2542 case CODESYS_ISO2022:
2543 decode_coding_iso2022(decoding, src, dst, n);
2551 DEFUN("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2552 Decode the text between START and END which is encoded in CODING-SYSTEM.
2553 This is useful if you've read in encoded text from a file without decoding
2554 it (e.g. you read in a JIS-formatted file but used the `binary' or
2555 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2556 Return length of decoded text.
2557 BUFFER defaults to the current buffer if unspecified.
2559 (start, end, coding_system, buffer))
2562 struct buffer *buf = decode_buffer(buffer, 0);
2563 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2564 lstream_t istr, ostr;
2565 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2567 get_buffer_range_char(buf, start, end, &b, &e, 0);
2569 barf_if_buffer_read_only(buf, b, e);
2571 coding_system = Fget_coding_system(coding_system);
2572 instream = make_lisp_buffer_input_stream(buf, b, e, 0);
2573 lb_outstream = make_lisp_buffer_output_stream(buf, b, 0);
2574 de_outstream = make_decoding_output_stream(XLSTREAM(lb_outstream),
2576 outstream = make_encoding_output_stream(XLSTREAM(de_outstream),
2577 Fget_coding_system(Qbinary));
2578 istr = XLSTREAM(instream);
2579 ostr = XLSTREAM(outstream);
2580 GCPRO4(instream, lb_outstream, de_outstream, outstream);
2582 /* The chain of streams looks like this:
2584 [BUFFER] <----- send through
2585 ------> [ENCODE AS BINARY]
2586 ------> [DECODE AS SPECIFIED]
2591 char tempbuf[1024]; /* some random amount */
2592 Bufpos newpos, even_newer_pos;
2593 Bufpos oldpos = lisp_buffer_stream_startpos(istr);
2594 Lstream_data_count size_in_bytes =
2595 Lstream_read(istr, tempbuf, sizeof(tempbuf));
2599 newpos = lisp_buffer_stream_startpos(istr);
2600 Lstream_write(ostr, tempbuf, size_in_bytes);
2601 even_newer_pos = lisp_buffer_stream_startpos(istr);
2602 buffer_delete_range(buf, even_newer_pos - (newpos - oldpos),
2605 Lstream_close(istr);
2606 Lstream_close(ostr);
2608 Lstream_delete(istr);
2609 Lstream_delete(ostr);
2610 Lstream_delete(XLSTREAM(de_outstream));
2611 Lstream_delete(XLSTREAM(lb_outstream));
2615 /************************************************************************/
2616 /* Converting to an external encoding ("encoding") */
2617 /************************************************************************/
2619 /* An encoding stream is an output stream. When you create the
2620 stream, you specify the coding system that governs the encoding
2621 and another stream that the resulting encoded data is to be
2622 sent to, and then start sending data to it. */
2624 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2626 typedef struct encoding_stream_s *encoding_stream_t;
2627 struct encoding_stream_s {
2628 /* Coding system that governs the conversion. */
2629 Lisp_Coding_System *codesys;
2631 /* Stream that we read the encoded data from or
2632 write the decoded data to. */
2633 lstream_t other_end;
2635 /* If we are reading, then we can return only a fixed amount of
2636 data, so if the conversion resulted in too much data, we store it
2637 here for retrieval the next time around. */
2638 unsigned_char_dynarr *runoff;
2640 /* FLAGS holds flags indicating the current state of the encoding.
2641 Some of these flags are dependent on the coding system. */
2644 /* CH holds a partially built-up character. Since we only deal
2645 with one- and two-byte characters at the moment, we only use
2646 this to store the first byte of a two-byte character. */
2649 /* Additional information used by the ISO2022 encoder. */
2651 /* CHARSET holds the character sets currently assigned to the G0
2652 through G3 registers. It is initialized from the array
2653 INITIAL_CHARSET in CODESYS. */
2654 Lisp_Object charset[4];
2656 /* Which registers are currently invoked into the left (GL) and
2657 right (GR) halves of the 8-bit encoding space? */
2658 int register_left, register_right;
2660 /* Whether we need to explicitly designate the charset in the
2661 G? register before using it. It is initialized from the
2662 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2663 unsigned char force_charset_on_output[4];
2665 /* Other state variables that need to be preserved across
2667 Lisp_Object current_charset;
2669 int current_char_boundary;
2672 /* Additional information (the state of the running CCL program)
2673 used by the CCL encoder. */
2674 struct ccl_program ccl;
2678 static Lstream_data_count
2679 encoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size);
2680 static Lstream_data_count
2681 encoding_writer(lstream_t stream,
2682 const unsigned char *data, Lstream_data_count size);
2683 static int encoding_rewinder(lstream_t stream);
2684 static int encoding_seekable_p(lstream_t stream);
2685 static int encoding_flusher(lstream_t stream);
2686 static int encoding_closer(lstream_t stream);
2688 static Lisp_Object encoding_marker(Lisp_Object stream);
2690 DEFINE_LSTREAM_IMPLEMENTATION("encoding", lstream_encoding,
2691 sizeof(struct encoding_stream_s));
2694 encoding_marker(Lisp_Object stream)
2696 lstream_t str = ENCODING_STREAM_DATA(XLSTREAM(stream))->other_end;
2697 Lisp_Object str_obj;
2699 /* We do not need to mark the coding systems or charsets stored
2700 within the stream because they are stored in a global list
2701 and automatically marked. */
2703 XSETLSTREAM(str_obj, str);
2704 mark_object(str_obj);
2705 if (str->imp->marker) {
2706 return str->imp->marker(str_obj);
2712 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2713 so we read data from the other end, encode it, and store it into DATA. */
2715 static Lstream_data_count
2716 encoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size)
2718 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2719 unsigned char *orig_data = data;
2720 Lstream_data_count read_size;
2721 int error_occurred = 0;
2723 /* We need to interface to mule_encode(), which expects to take some
2724 amount of data and store the result into a Dynarr. We have
2725 mule_encode() store into str->runoff, and take data from there
2728 /* We loop until we have enough data, reading chunks from the other
2729 end and encoding it. */
2731 /* Take data from the runoff if we can. Make sure to take at
2732 most SIZE bytes, and delete the data from the runoff. */
2733 if (Dynarr_length(str->runoff) > 0) {
2734 int chunk = min((int)size, Dynarr_length(str->runoff));
2735 memcpy(data, Dynarr_atp(str->runoff, 0), chunk);
2736 Dynarr_delete_many(str->runoff, 0, chunk);
2742 /* No more room for data */
2746 if (str->flags & CODING_STATE_END) {
2747 /* This means that on the previous iteration, we hit the
2748 EOF on the other end. We loop once more so that
2749 mule_encode() can output any final stuff it may be
2750 holding, or any "go back to a sane state" escape
2751 sequences. (This latter makes sense during
2756 /* Exhausted the runoff, so get some more. DATA at least SIZE
2757 bytes left of storage in it, so it's OK to read directly into
2758 it. (We'll be overwriting above, after we've encoded it into
2760 read_size = Lstream_read(str->other_end, data, size);
2761 if (read_size < 0) {
2765 if (read_size == 0) {
2766 /* There might be some more end data produced in the
2767 translation. See the comment above. */
2768 str->flags |= CODING_STATE_END;
2770 mule_encode(stream, data, str->runoff, read_size);
2773 if (data == orig_data) {
2774 return error_occurred ? -1 : 0;
2776 return data - orig_data;
2780 static Lstream_data_count
2781 encoding_writer(lstream_t stream, const unsigned char *data,
2782 Lstream_data_count size)
2784 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2785 Lstream_data_count retval;
2787 /* Encode all our data into the runoff, and then attempt to write
2788 it all out to the other end. Remove whatever chunk we succeeded
2790 mule_encode(stream, data, str->runoff, size);
2791 retval = Lstream_write(str->other_end, Dynarr_atp(str->runoff, 0),
2792 Dynarr_length(str->runoff));
2794 Dynarr_delete_many(str->runoff, 0, retval);
2796 /* Do NOT return retval. The return value indicates how much
2797 of the incoming data was written, not how many bytes were
2803 reset_encoding_stream(encoding_stream_t str)
2806 switch (CODING_SYSTEM_TYPE(str->codesys)) {
2807 case CODESYS_ISO2022: {
2810 for (i = 0; i < 4; i++) {
2811 str->iso2022.charset[i] =
2812 CODING_SYSTEM_ISO2022_INITIAL_CHARSET(
2814 str->iso2022.force_charset_on_output[i] =
2815 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT(
2818 str->iso2022.register_left = 0;
2819 str->iso2022.register_right = 1;
2820 str->iso2022.current_charset = Qnil;
2821 str->iso2022.current_half = 0;
2822 str->iso2022.current_char_boundary = 1;
2826 setup_ccl_program(&str->ccl,
2827 CODING_SYSTEM_CCL_ENCODE(str->codesys));
2830 /* list the rest of them lot explicitly */
2831 case CODESYS_AUTODETECT:
2832 case CODESYS_SHIFT_JIS:
2836 case CODESYS_NO_CONVERSION:
2837 #ifdef DEBUG_SXEMACS
2838 case CODESYS_INTERNAL:
2845 str->flags = str->ch = 0;
2849 encoding_rewinder(lstream_t stream)
2851 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2852 reset_encoding_stream(str);
2853 Dynarr_reset(str->runoff);
2854 return Lstream_rewind(str->other_end);
2858 encoding_seekable_p(lstream_t stream)
2860 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2861 return Lstream_seekable_p(str->other_end);
2865 encoding_flusher(lstream_t stream)
2867 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2868 return Lstream_flush(str->other_end);
2872 encoding_closer(lstream_t stream)
2874 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2875 if (stream->flags & LSTREAM_FL_WRITE) {
2876 str->flags |= CODING_STATE_END;
2877 encoding_writer(stream, 0, 0);
2879 Dynarr_free(str->runoff);
2880 return Lstream_close(str->other_end);
2884 encoding_stream_coding_system(lstream_t stream)
2886 Lisp_Object coding_system;
2887 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2889 XSETCODING_SYSTEM(coding_system, str->codesys);
2890 return coding_system;
2894 set_encoding_stream_coding_system(lstream_t lstr, Lisp_Object codesys)
2896 Lisp_Coding_System *cs = XCODING_SYSTEM(codesys);
2897 encoding_stream_t str = ENCODING_STREAM_DATA(lstr);
2899 reset_encoding_stream(str);
2903 make_encoding_stream_1(lstream_t stream, Lisp_Object codesys, const char *mode)
2905 lstream_t lstr = Lstream_new(lstream_encoding, mode);
2906 encoding_stream_t str = ENCODING_STREAM_DATA(lstr);
2910 str->runoff = Dynarr_new(unsigned_char);
2911 str->other_end = stream;
2912 set_encoding_stream_coding_system(lstr, codesys);
2913 XSETLSTREAM(obj, lstr);
2918 make_encoding_input_stream(lstream_t stream, Lisp_Object codesys)
2920 return make_encoding_stream_1(stream, codesys, "r");
2924 make_encoding_output_stream(lstream_t stream, Lisp_Object codesys)
2926 return make_encoding_stream_1(stream, codesys, "w");
2929 /* Convert N bytes of internally-formatted data stored in SRC to an
2930 external format, according to the encoding stream ENCODING.
2931 Store the encoded data into DST. */
2934 mule_encode(lstream_t encoding, const Bufbyte * src,
2935 unsigned_char_dynarr * dst, Lstream_data_count n)
2937 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
2939 switch (CODING_SYSTEM_TYPE(str->codesys)) {
2940 #ifdef DEBUG_SXEMACS
2941 case CODESYS_INTERNAL:
2942 Dynarr_add_many(dst, src, n);
2945 case CODESYS_AUTODETECT:
2946 /* If we got this far and still haven't decided on the coding
2947 system, then do no conversion. */
2948 case CODESYS_NO_CONVERSION:
2949 encode_coding_no_conversion(encoding, src, dst, n);
2952 case CODESYS_SHIFT_JIS:
2953 encode_coding_sjis(encoding, src, dst, n);
2956 encode_coding_big5(encoding, src, dst, n);
2959 encode_coding_ucs4(encoding, src, dst, n);
2962 encode_coding_utf8(encoding, src, dst, n);
2965 str->ccl.last_block = str->flags & CODING_STATE_END;
2966 /* When applying ccl program to stream, MUST NOT set NULL
2968 ccl_driver(&str->ccl, ((src) ? src : (unsigned char *)""),
2969 dst, n, 0, CCL_MODE_ENCODING);
2971 case CODESYS_ISO2022:
2972 encode_coding_iso2022(encoding, src, dst, n);
2980 DEFUN("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2981 Encode the text between START and END using CODING-SYSTEM.
2982 This will, for example, convert Japanese characters into stuff such as
2983 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2984 text. BUFFER defaults to the current buffer if unspecified.
2986 (start, end, coding_system, buffer))
2989 struct buffer *buf = decode_buffer(buffer, 0);
2990 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2991 lstream_t istr, ostr;
2992 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2994 get_buffer_range_char(buf, start, end, &b, &e, 0);
2996 barf_if_buffer_read_only(buf, b, e);
2998 coding_system = Fget_coding_system(coding_system);
2999 instream = make_lisp_buffer_input_stream(buf, b, e, 0);
3000 lb_outstream = make_lisp_buffer_output_stream(buf, b, 0);
3001 de_outstream = make_decoding_output_stream(XLSTREAM(lb_outstream),
3002 Fget_coding_system(Qbinary));
3003 outstream = make_encoding_output_stream(XLSTREAM(de_outstream),
3005 istr = XLSTREAM(instream);
3006 ostr = XLSTREAM(outstream);
3007 GCPRO4(instream, outstream, de_outstream, lb_outstream);
3008 /* The chain of streams looks like this:
3010 [BUFFER] <----- send through
3011 ------> [ENCODE AS SPECIFIED]
3012 ------> [DECODE AS BINARY]
3016 char tempbuf[1024]; /* some random amount */
3017 Bufpos newpos, even_newer_pos;
3018 Bufpos oldpos = lisp_buffer_stream_startpos(istr);
3019 Lstream_data_count size_in_bytes =
3020 Lstream_read(istr, tempbuf, sizeof(tempbuf));
3024 newpos = lisp_buffer_stream_startpos(istr);
3025 Lstream_write(ostr, tempbuf, size_in_bytes);
3026 even_newer_pos = lisp_buffer_stream_startpos(istr);
3027 buffer_delete_range(buf, even_newer_pos - (newpos - oldpos),
3033 lisp_buffer_stream_startpos(XLSTREAM(instream)) - b;
3034 Lstream_close(istr);
3035 Lstream_close(ostr);
3037 Lstream_delete(istr);
3038 Lstream_delete(ostr);
3039 Lstream_delete(XLSTREAM(de_outstream));
3040 Lstream_delete(XLSTREAM(lb_outstream));
3041 return make_int(retlen);
3047 /************************************************************************/
3048 /* Shift-JIS methods */
3049 /************************************************************************/
3051 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3052 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3053 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3054 encoded by "position-code + 0x80". A character of JISX0208
3055 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3056 position-codes are divided and shifted so that it fit in the range
3059 --- CODE RANGE of Shift-JIS ---
3060 (character set) (range)
3062 JISX0201-Kana 0xA0 .. 0xDF
3063 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
3064 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3065 -------------------------------
3069 /* Is this the first byte of a Shift-JIS two-byte char? */
3071 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3072 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3074 /* Is this the second byte of a Shift-JIS two-byte char? */
3076 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3077 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3079 #define BYTE_SJIS_KATAKANA_P(c) \
3080 ((c) >= 0xA1 && (c) <= 0xDF)
3083 detect_coding_sjis(struct detection_state *st, const Extbyte * src,
3084 Lstream_data_count n)
3087 const unsigned char c = *(const unsigned char *)src++;
3088 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3090 if (st->shift_jis.in_second_byte) {
3091 st->shift_jis.in_second_byte = 0;
3094 } else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3095 st->shift_jis.in_second_byte = 1;
3097 return CODING_CATEGORY_SHIFT_JIS_MASK;
3100 /* Convert Shift-JIS data to internal format. */
3103 decode_coding_sjis(lstream_t decoding, const Extbyte * src,
3104 unsigned_char_dynarr * dst, Lstream_data_count n)
3106 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3107 unsigned int flags = str->flags;
3108 unsigned int ch = str->ch;
3109 eol_type_t eol_type = str->eol_type;
3112 const unsigned char c = *(const unsigned char *)src++;
3115 /* Previous character was first byte of Shift-JIS Kanji
3117 if (BYTE_SJIS_TWO_BYTE_2_P(c)) {
3118 unsigned char e1, e2;
3120 Dynarr_add(dst, LEADING_BYTE_JAPANESE_JISX0208);
3121 DECODE_SJIS(ch, c, e1, e2);
3122 Dynarr_add(dst, e1);
3123 Dynarr_add(dst, e2);
3125 DECODE_ADD_BINARY_CHAR(ch, dst);
3126 DECODE_ADD_BINARY_CHAR(c, dst);
3130 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3131 if (BYTE_SJIS_TWO_BYTE_1_P(c))
3133 else if (BYTE_SJIS_KATAKANA_P(c)) {
3134 Dynarr_add(dst, LEADING_BYTE_KATAKANA_JISX0201);
3137 DECODE_ADD_BINARY_CHAR(c, dst);
3139 label_continue_loop:;
3142 DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
3148 /* Convert internally-formatted data to Shift-JIS. */
3151 encode_coding_sjis(lstream_t encoding, const Bufbyte * src,
3152 unsigned_char_dynarr * dst, Lstream_data_count n)
3154 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3155 unsigned int flags = str->flags;
3156 unsigned int ch = str->ch;
3157 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3162 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3163 Dynarr_add(dst, '\r');
3164 if (eol_type != EOL_CR)
3165 Dynarr_add(dst, '\n');
3167 } else if (BYTE_ASCII_P(c)) {
3170 } else if (BUFBYTE_LEADING_BYTE_P(c))
3171 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
3172 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3173 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
3175 if (ch == LEADING_BYTE_KATAKANA_JISX0201) {
3178 } else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3179 ch == LEADING_BYTE_JAPANESE_JISX0208)
3182 /* j1 is bessel j1 function,
3183 * so we use something else */
3184 /* unsigned char j1, j2; */
3185 unsigned char tt1, tt2;
3187 ENCODE_SJIS(ch, c, tt1, tt2);
3188 Dynarr_add(dst, tt1);
3189 Dynarr_add(dst, tt2);
3199 DEFUN("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3200 Decode a JISX0208 character of Shift-JIS coding-system.
3201 CODE is the character code in Shift-JIS as a cons of type bytes.
3202 Return the corresponding character.
3206 unsigned char c1, c2, s1, s2;
3209 CHECK_INT(XCAR(code));
3210 CHECK_INT(XCDR(code));
3211 s1 = XINT(XCAR(code));
3212 s2 = XINT(XCDR(code));
3213 if (BYTE_SJIS_TWO_BYTE_1_P(s1) && BYTE_SJIS_TWO_BYTE_2_P(s2)) {
3214 DECODE_SJIS(s1, s2, c1, c2);
3215 return make_char(MAKE_CHAR(Vcharset_japanese_jisx0208,
3216 c1 & 0x7F, c2 & 0x7F));
3221 DEFUN("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3222 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3223 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3227 Lisp_Object charset;
3230 CHECK_CHAR_COERCE_INT(character);
3231 BREAKUP_CHAR(XCHAR(character), charset, c1, c2);
3232 if (EQ(charset, Vcharset_japanese_jisx0208)) {
3233 ENCODE_SJIS(c1 | 0x80, c2 | 0x80, s1, s2);
3234 return Fcons(make_int(s1), make_int(s2));
3239 /************************************************************************/
3241 /************************************************************************/
3243 /* BIG5 is a coding system encoding two character sets: ASCII and
3244 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3245 character set and is encoded in two-byte.
3247 --- CODE RANGE of BIG5 ---
3248 (character set) (range)
3250 Big5 (1st byte) 0xA1 .. 0xFE
3251 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3252 --------------------------
3254 Since the number of characters in Big5 is larger than maximum
3255 characters in Emacs' charset (96x96), it can't be handled as one
3256 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3257 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3258 contains frequently used characters and the latter contains less
3259 frequently used characters. */
3261 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3262 ((c) >= 0xA1 && (c) <= 0xFE)
3264 /* Is this the second byte of a Shift-JIS two-byte char? */
3266 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3267 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3269 /* Number of Big5 characters which have the same code in 1st byte. */
3271 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3273 /* Code conversion macros. These are macros because they are used in
3274 inner loops during code conversion.
3276 Note that temporary variables in macros introduce the classic
3277 dynamic-scoping problems with variable names. We use capital-
3278 lettered variables in the assumption that SXEmacs does not use
3279 capital letters in variables except in a very formalized way
3282 /* Convert Big5 code (b1, b2) into its internal string representation
3285 /* There is a much simpler way to split the Big5 charset into two.
3286 For the moment I'm going to leave the algorithm as-is because it
3287 claims to separate out the most-used characters into a single
3288 charset, which perhaps will lead to optimizations in various
3291 The way the algorithm works is something like this:
3293 Big5 can be viewed as a 94x157 charset, where the row is
3294 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3295 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3296 the split between low and high column numbers is apparently
3297 meaningless; ascending rows produce less and less frequent chars.
3298 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3299 the first charset, and the upper half (0xC9 .. 0xFE) to the
3300 second. To do the conversion, we convert the character into
3301 a single number where 0 .. 156 is the first row, 157 .. 313
3302 is the second, etc. That way, the characters are ordered by
3303 decreasing frequency. Then we just chop the space in two
3304 and coerce the result into a 94x94 space.
3307 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3309 int B1 = b1, B2 = b2; \
3311 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3315 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3319 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3320 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3322 c1 = I / (0xFF - 0xA1) + 0xA1; \
3323 c2 = I % (0xFF - 0xA1) + 0xA1; \
3326 /* Convert the internal string representation of a Big5 character
3327 (lb, c1, c2) into Big5 code (b1, b2). */
3329 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3331 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3333 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3335 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3337 b1 = I / BIG5_SAME_ROW + 0xA1; \
3338 b2 = I % BIG5_SAME_ROW; \
3339 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3343 detect_coding_big5(struct detection_state *st, const Extbyte * src,
3344 Lstream_data_count n)
3347 const unsigned char c = *(const unsigned char *)src++;
3348 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3349 (c >= 0x80 && c <= 0xA0))
3351 if (st->big5.in_second_byte) {
3352 st->big5.in_second_byte = 0;
3353 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3355 } else if (c >= 0xA1)
3356 st->big5.in_second_byte = 1;
3358 return CODING_CATEGORY_BIG5_MASK;
3361 /* Convert Big5 data to internal format. */
3364 decode_coding_big5(lstream_t decoding, const Extbyte * src,
3365 unsigned_char_dynarr * dst, Lstream_data_count n)
3367 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3368 unsigned int flags = str->flags;
3369 unsigned int ch = str->ch;
3370 eol_type_t eol_type = str->eol_type;
3373 const unsigned char c = *(const unsigned char *)src++;
3375 /* Previous character was first byte of Big5 char. */
3376 if (BYTE_BIG5_TWO_BYTE_2_P(c)) {
3377 unsigned char b1, b2, b3;
3378 DECODE_BIG5(ch, c, b1, b2, b3);
3379 Dynarr_add(dst, b1);
3380 Dynarr_add(dst, b2);
3381 Dynarr_add(dst, b3);
3383 DECODE_ADD_BINARY_CHAR(ch, dst);
3384 DECODE_ADD_BINARY_CHAR(c, dst);
3388 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3389 if (BYTE_BIG5_TWO_BYTE_1_P(c))
3392 DECODE_ADD_BINARY_CHAR(c, dst);
3394 label_continue_loop:;
3397 DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
3403 /* Convert internally-formatted data to Big5. */
3406 encode_coding_big5(lstream_t encoding, const Bufbyte * src,
3407 unsigned_char_dynarr * dst, Lstream_data_count n)
3410 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3411 unsigned int flags = str->flags;
3412 unsigned int ch = str->ch;
3413 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3418 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3419 Dynarr_add(dst, '\r');
3420 if (eol_type != EOL_CR)
3421 Dynarr_add(dst, '\n');
3422 } else if (BYTE_ASCII_P(c)) {
3425 } else if (BUFBYTE_LEADING_BYTE_P(c)) {
3426 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3427 c == LEADING_BYTE_CHINESE_BIG5_2) {
3428 /* A recognized leading byte. */
3430 continue; /* not done with this character. */
3432 /* otherwise just ignore this character. */
3433 } else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3434 ch == LEADING_BYTE_CHINESE_BIG5_2) {
3435 /* Previous char was a recognized leading byte. */
3437 continue; /* not done with this character. */
3439 /* Encountering second byte of a Big5 character. */
3440 unsigned char b1, b2;
3442 ENCODE_BIG5(ch >> 8, ch & 0xFF, c, b1, b2);
3443 Dynarr_add(dst, b1);
3444 Dynarr_add(dst, b2);
3454 DEFUN("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3455 Decode a Big5 character CODE of BIG5 coding-system.
3456 CODE is the character code in BIG5, a cons of two integers.
3457 Return the corresponding character.
3461 unsigned char c1, c2, b1, b2;
3464 CHECK_INT(XCAR(code));
3465 CHECK_INT(XCDR(code));
3466 b1 = XINT(XCAR(code));
3467 b2 = XINT(XCDR(code));
3468 if (BYTE_BIG5_TWO_BYTE_1_P(b1) && BYTE_BIG5_TWO_BYTE_2_P(b2)) {
3470 Lisp_Object charset;
3471 DECODE_BIG5(b1, b2, leading_byte, c1, c2);
3472 charset = CHARSET_BY_LEADING_BYTE(leading_byte);
3473 return make_char(MAKE_CHAR(charset, c1 & 0x7F, c2 & 0x7F));
3478 DEFUN("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3479 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3480 Return the corresponding character code in Big5.
3484 Lisp_Object charset;
3487 CHECK_CHAR_COERCE_INT(character);
3488 BREAKUP_CHAR(XCHAR(character), charset, c1, c2);
3489 if (EQ(charset, Vcharset_chinese_big5_1) ||
3490 EQ(charset, Vcharset_chinese_big5_2)) {
3491 ENCODE_BIG5(XCHARSET_LEADING_BYTE(charset), c1 | 0x80,
3493 return Fcons(make_int(b1), make_int(b2));
3498 /************************************************************************/
3501 /* UCS-4 character codes are implemented as nonnegative integers. */
3503 /************************************************************************/
3505 DEFUN("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3506 Map UCS-4 code CODE to Mule character CHARACTER.
3508 Return T on success, NIL on failure.
3514 CHECK_CHAR(character);
3518 if (c < countof(fcd->ucs_to_mule_table)) {
3519 fcd->ucs_to_mule_table[c] = character;
3525 static Lisp_Object ucs_to_char(unsigned long code)
3527 if (code < countof(fcd->ucs_to_mule_table)) {
3528 return fcd->ucs_to_mule_table[code];
3529 } else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14)) {
3533 c = code % (94 * 94);
3535 (MAKE_CHAR(CHARSET_BY_ATTRIBUTES
3536 (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3537 CHARSET_LEFT_TO_RIGHT),
3538 c / 94 + 33, c % 94 + 33));
3543 DEFUN("ucs-char", Fucs_char, 1, 1, 0, /*
3544 Return Mule character corresponding to UCS code CODE (a positive integer).
3549 return ucs_to_char(XINT(code));
3552 DEFUN("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3553 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3557 /* #### Isn't this gilding the lily? Fput_char_table checks its args.
3558 Fset_char_ucs is more restrictive on index arg, but should
3559 check code arg in a char_table method. */
3560 CHECK_CHAR(character);
3562 return Fput_char_table(character, code, mule_to_ucs_table);
3565 DEFUN("char-ucs", Fchar_ucs, 1, 1, 0, /*
3566 Return the UCS code (a positive integer) corresponding to CHARACTER.
3570 return Fget_char_table(character, mule_to_ucs_table);
3573 /* Decode a UCS-4 character into a buffer. If the lookup fails, use
3574 <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3575 is not found, instead.
3576 #### do something more appropriate (use blob?)
3577 Danger, Will Robinson! Data loss. Should we signal user? */
3578 static void decode_ucs4(unsigned long ch, unsigned_char_dynarr * dst)
3580 Lisp_Object chr = ucs_to_char(ch);
3583 Bufbyte work[MAX_EMCHAR_LEN];
3588 simple_set_charptr_emchar(work, ch) :
3589 non_ascii_set_charptr_emchar(work, ch);
3590 Dynarr_add_many(dst, work, len);
3592 Dynarr_add(dst, LEADING_BYTE_JAPANESE_JISX0208);
3593 Dynarr_add(dst, 34 + 128);
3594 Dynarr_add(dst, 46 + 128);
3598 static unsigned long
3599 mule_char_to_ucs4(Lisp_Object charset, unsigned char h, unsigned char l)
3602 = Fget_char_table(make_char(MAKE_CHAR(charset, h & 127, l & 127)),
3607 } else if ((XCHARSET_DIMENSION(charset) == 2) &&
3608 (XCHARSET_CHARS(charset) == 94)) {
3609 unsigned char final = XCHARSET_FINAL(charset);
3611 if (('@' <= final) && (final < 0x7f)) {
3612 return 0xe00000 + (final - '@') * 94 * 94
3613 + ((h & 127) - 33) * 94 + (l & 127) - 33;
3623 encode_ucs4(Lisp_Object charset,
3624 unsigned char h, unsigned char l, unsigned_char_dynarr * dst)
3626 unsigned long code = mule_char_to_ucs4(charset, h, l);
3627 Dynarr_add(dst, code >> 24);
3628 Dynarr_add(dst, (code >> 16) & 255);
3629 Dynarr_add(dst, (code >> 8) & 255);
3630 Dynarr_add(dst, code & 255);
3634 detect_coding_ucs4(struct detection_state *st, const Extbyte * src,
3635 Lstream_data_count n)
3638 const unsigned char c = *(const unsigned char *)src++;
3639 switch (st->ucs4.in_byte) {
3647 st->ucs4.in_byte = 0;
3653 return CODING_CATEGORY_UCS4_MASK;
3657 decode_coding_ucs4(lstream_t decoding, const Extbyte * src,
3658 unsigned_char_dynarr * dst, Lstream_data_count n)
3660 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3661 unsigned int flags = str->flags;
3662 unsigned int ch = str->ch;
3663 unsigned char counter = str->counter;
3666 const unsigned char c = *(const unsigned char *)src++;
3673 decode_ucs4((ch << 8) | c, dst);
3682 if (counter & CODING_STATE_END)
3683 DECODE_OUTPUT_PARTIAL_CHAR(ch);
3687 str->counter = counter;
3691 encode_coding_ucs4(lstream_t encoding, const Bufbyte * src,
3692 unsigned_char_dynarr * dst, Lstream_data_count n)
3694 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3695 unsigned int flags = str->flags;
3696 unsigned int ch = str->ch;
3697 unsigned char char_boundary = str->iso2022.current_char_boundary;
3698 Lisp_Object charset = str->iso2022.current_charset;
3700 #ifdef ENABLE_COMPOSITE_CHARS
3701 /* flags for handling composite chars. We do a little switcharoo
3702 on the source while we're outputting the composite char. */
3703 unsigned int saved_n = 0;
3704 const unsigned char *saved_src = NULL;
3705 int in_composite = 0;
3711 unsigned char c = *src++;
3713 if (BYTE_ASCII_P(c)) { /* Processing ASCII character */
3715 encode_ucs4(Vcharset_ascii, c, 0, dst);
3717 } else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) { /* Processing Leading Byte */
3719 charset = CHARSET_BY_LEADING_BYTE(c);
3720 if (LEADING_BYTE_PREFIX_P(c))
3723 } else { /* Processing Non-ASCII character */
3725 if (EQ(charset, Vcharset_control_1)) {
3726 encode_ucs4(Vcharset_control_1, c, 0, dst);
3728 switch (XCHARSET_REP_BYTES(charset)) {
3730 encode_ucs4(charset, c, 0, dst);
3733 if (XCHARSET_PRIVATE_P(charset)) {
3734 encode_ucs4(charset, c, 0, dst);
3737 #ifdef ENABLE_COMPOSITE_CHARS
3740 Vcharset_composite)) {
3742 /* #### Bother! We don't know how to
3755 (Vcharset_composite,
3760 composite_char_string
3769 n = XSTRING_LENGTH(lstr);
3772 #endif /* ENABLE_COMPOSITE_CHARS */
3774 encode_ucs4(charset, ch,
3785 encode_ucs4(charset, ch, c,
3800 #ifdef ENABLE_COMPOSITE_CHARS
3805 goto back_to_square_n; /* Wheeeeeeeee ..... */
3807 #endif /* ENABLE_COMPOSITE_CHARS */
3811 str->iso2022.current_char_boundary = char_boundary;
3812 str->iso2022.current_charset = charset;
3814 /* Verbum caro factum est! */
3817 /************************************************************************/
3819 /************************************************************************/
3822 detect_coding_utf8(struct detection_state *st, const Extbyte * src,
3823 Lstream_data_count n)
3826 const unsigned char c = *(const unsigned char *)src++;
3827 switch (st->utf8.in_byte) {
3829 if (c == ISO_CODE_ESC || c == ISO_CODE_SI
3830 || c == ISO_CODE_SO)
3833 st->utf8.in_byte = 5;
3835 st->utf8.in_byte = 4;
3837 st->utf8.in_byte = 3;
3839 st->utf8.in_byte = 2;
3841 st->utf8.in_byte = 1;
3846 if ((c & 0xc0) != 0x80)
3852 return CODING_CATEGORY_UTF8_MASK;
3856 decode_coding_utf8(lstream_t decoding, const Extbyte * src,
3857 unsigned_char_dynarr * dst, Lstream_data_count n)
3859 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3860 unsigned int flags = str->flags;
3861 unsigned int ch = str->ch;
3862 eol_type_t eol_type = str->eol_type;
3863 unsigned char counter = str->counter;
3866 const unsigned char c = *(const unsigned char *)src++;
3872 } else if (c >= 0xf8) {
3875 } else if (c >= 0xf0) {
3878 } else if (c >= 0xe0) {
3881 } else if (c >= 0xc0) {
3885 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3886 decode_ucs4(c, dst);
3890 ch = (ch << 6) | (c & 0x3f);
3891 decode_ucs4(ch, dst);
3896 ch = (ch << 6) | (c & 0x3f);
3899 label_continue_loop:;
3902 if (flags & CODING_STATE_END)
3903 DECODE_OUTPUT_PARTIAL_CHAR(ch);
3907 str->counter = counter;
3911 encode_utf8(Lisp_Object charset,
3912 unsigned char h, unsigned char l, unsigned_char_dynarr * dst)
3914 unsigned long code = mule_char_to_ucs4(charset, h, l);
3916 Dynarr_add(dst, code);
3917 } else if (code <= 0x7ff) {
3918 Dynarr_add(dst, (code >> 6) | 0xc0);
3919 Dynarr_add(dst, (code & 0x3f) | 0x80);
3920 } else if (code <= 0xffff) {
3921 Dynarr_add(dst, (code >> 12) | 0xe0);
3922 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3923 Dynarr_add(dst, (code & 0x3f) | 0x80);
3924 } else if (code <= 0x1fffff) {
3925 Dynarr_add(dst, (code >> 18) | 0xf0);
3926 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3927 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3928 Dynarr_add(dst, (code & 0x3f) | 0x80);
3929 } else if (code <= 0x3ffffff) {
3930 Dynarr_add(dst, (code >> 24) | 0xf8);
3931 Dynarr_add(dst, ((code >> 18) & 0x3f) | 0x80);
3932 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3933 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3934 Dynarr_add(dst, (code & 0x3f) | 0x80);
3936 Dynarr_add(dst, (code >> 30) | 0xfc);
3937 Dynarr_add(dst, ((code >> 24) & 0x3f) | 0x80);
3938 Dynarr_add(dst, ((code >> 18) & 0x3f) | 0x80);
3939 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3940 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3941 Dynarr_add(dst, (code & 0x3f) | 0x80);
3946 encode_coding_utf8(lstream_t encoding, const Bufbyte * src,
3947 unsigned_char_dynarr * dst, Lstream_data_count n)
3949 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3950 unsigned int flags = str->flags;
3951 unsigned int ch = str->ch;
3952 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3953 unsigned char char_boundary = str->iso2022.current_char_boundary;
3954 Lisp_Object charset = str->iso2022.current_charset;
3956 #ifdef ENABLE_COMPOSITE_CHARS
3957 /* flags for handling composite chars. We do a little switcharoo
3958 on the source while we're outputting the composite char. */
3959 unsigned int saved_n = 0;
3960 const unsigned char *saved_src = NULL;
3961 int in_composite = 0;
3964 #endif /* ENABLE_COMPOSITE_CHARS */
3967 unsigned char c = *src++;
3969 if (BYTE_ASCII_P(c)) { /* Processing ASCII character */
3972 if (eol_type != EOL_LF
3973 && eol_type != EOL_AUTODETECT)
3974 Dynarr_add(dst, '\r');
3975 if (eol_type != EOL_CR)
3978 encode_utf8(Vcharset_ascii, c, 0, dst);
3980 } else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) { /* Processing Leading Byte */
3982 charset = CHARSET_BY_LEADING_BYTE(c);
3983 if (LEADING_BYTE_PREFIX_P(c))
3986 } else { /* Processing Non-ASCII character */
3988 if (EQ(charset, Vcharset_control_1)) {
3989 encode_utf8(Vcharset_control_1, c, 0, dst);
3991 switch (XCHARSET_REP_BYTES(charset)) {
3993 encode_utf8(charset, c, 0, dst);
3996 if (XCHARSET_PRIVATE_P(charset)) {
3997 encode_utf8(charset, c, 0, dst);
4000 #ifdef ENABLE_COMPOSITE_CHARS
4003 Vcharset_composite)) {
4005 /* #### Bother! We don't know how to
4014 (Vcharset_composite,
4019 composite_char_string
4028 n = XSTRING_LENGTH(lstr);
4031 #endif /* ENABLE_COMPOSITE_CHARS */
4033 encode_utf8(charset, ch,
4044 encode_utf8(charset, ch, c,
4059 #ifdef ENABLE_COMPOSITE_CHARS
4064 goto back_to_square_n; /* Wheeeeeeeee ..... */
4070 str->iso2022.current_char_boundary = char_boundary;
4071 str->iso2022.current_charset = charset;
4073 /* Verbum caro factum est! */
4076 /************************************************************************/
4077 /* ISO2022 methods */
4078 /************************************************************************/
4080 /* The following note describes the coding system ISO2022 briefly.
4081 Since the intention of this note is to help understand the
4082 functions in this file, some parts are NOT ACCURATE or OVERLY
4083 SIMPLIFIED. For thorough understanding, please refer to the
4084 original document of ISO2022.
4086 ISO2022 provides many mechanisms to encode several character sets
4087 in 7-bit and 8-bit environments. For 7-bit environments, all text
4088 is encoded using bytes less than 128. This may make the encoded
4089 text a little bit longer, but the text passes more easily through
4090 several gateways, some of which strip off MSB (Most Signigant Bit).
4092 There are two kinds of character sets: control character set and
4093 graphic character set. The former contains control characters such
4094 as `newline' and `escape' to provide control functions (control
4095 functions are also provided by escape sequences). The latter
4096 contains graphic characters such as 'A' and '-'. Emacs recognizes
4097 two control character sets and many graphic character sets.
4099 Graphic character sets are classified into one of the following
4100 four classes, according to the number of bytes (DIMENSION) and
4101 number of characters in one dimension (CHARS) of the set:
4102 - DIMENSION1_CHARS94
4103 - DIMENSION1_CHARS96
4104 - DIMENSION2_CHARS94
4105 - DIMENSION2_CHARS96
4107 In addition, each character set is assigned an identification tag,
4108 unique for each set, called "final character" (denoted as <F>
4109 hereafter). The <F> of each character set is decided by ECMA(*)
4110 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4111 (0x30..0x3F are for private use only).
4113 Note (*): ECMA = European Computer Manufacturers Association
4115 Here are examples of graphic character set [NAME(<F>)]:
4116 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4117 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4118 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4119 o DIMENSION2_CHARS96 -- none for the moment
4121 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4122 C0 [0x00..0x1F] -- control character plane 0
4123 GL [0x20..0x7F] -- graphic character plane 0
4124 C1 [0x80..0x9F] -- control character plane 1
4125 GR [0xA0..0xFF] -- graphic character plane 1
4127 A control character set is directly designated and invoked to C0 or
4128 C1 by an escape sequence. The most common case is that:
4129 - ISO646's control character set is designated/invoked to C0, and
4130 - ISO6429's control character set is designated/invoked to C1,
4131 and usually these designations/invocations are omitted in encoded
4132 text. In a 7-bit environment, only C0 can be used, and a control
4133 character for C1 is encoded by an appropriate escape sequence to
4134 fit into the environment. All control characters for C1 are
4135 defined to have corresponding escape sequences.
4137 A graphic character set is at first designated to one of four
4138 graphic registers (G0 through G3), then these graphic registers are
4139 invoked to GL or GR. These designations and invocations can be
4140 done independently. The most common case is that G0 is invoked to
4141 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4142 these invocations and designations are omitted in encoded text.
4143 In a 7-bit environment, only GL can be used.
4145 When a graphic character set of CHARS94 is invoked to GL, codes
4146 0x20 and 0x7F of the GL area work as control characters SPACE and
4147 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4150 There are two ways of invocation: locking-shift and single-shift.
4151 With locking-shift, the invocation lasts until the next different
4152 invocation, whereas with single-shift, the invocation affects the
4153 following character only and doesn't affect the locking-shift
4154 state. Invocations are done by the following control characters or
4157 ----------------------------------------------------------------------
4158 abbrev function cntrl escape seq description
4159 ----------------------------------------------------------------------
4160 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4161 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4162 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4163 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4164 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4165 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4166 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4167 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4168 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4169 ----------------------------------------------------------------------
4170 (*) These are not used by any known coding system.
4172 Control characters for these functions are defined by macros
4173 ISO_CODE_XXX in `coding.h'.
4175 Designations are done by the following escape sequences:
4176 ----------------------------------------------------------------------
4177 escape sequence description
4178 ----------------------------------------------------------------------
4179 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4180 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4181 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4182 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4183 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4184 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4185 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4186 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4187 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4188 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4189 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4190 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4191 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4192 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4193 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4194 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4195 ----------------------------------------------------------------------
4197 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4198 of dimension 1, chars 94, and final character <F>, etc...
4200 Note (*): Although these designations are not allowed in ISO2022,
4201 Emacs accepts them on decoding, and produces them on encoding
4202 CHARS96 character sets in a coding system which is characterized as
4203 7-bit environment, non-locking-shift, and non-single-shift.
4205 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4206 '(' can be omitted. We refer to this as "short-form" hereafter.
4208 Now you may notice that there are a lot of ways for encoding the
4209 same multilingual text in ISO2022. Actually, there exist many
4210 coding systems such as Compound Text (used in X11's inter client
4211 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4212 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4213 localized platforms), and all of these are variants of ISO2022.
4215 In addition to the above, Emacs handles two more kinds of escape
4216 sequences: ISO6429's direction specification and Emacs' private
4217 sequence for specifying character composition.
4219 ISO6429's direction specification takes the following form:
4220 o CSI ']' -- end of the current direction
4221 o CSI '0' ']' -- end of the current direction
4222 o CSI '1' ']' -- start of left-to-right text
4223 o CSI '2' ']' -- start of right-to-left text
4224 The control character CSI (0x9B: control sequence introducer) is
4225 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4227 Character composition specification takes the following form:
4228 o ESC '0' -- start character composition
4229 o ESC '1' -- end character composition
4230 Since these are not standard escape sequences of any ISO standard,
4231 their use with these meanings is restricted to Emacs only. */
4234 reset_iso2022(Lisp_Object coding_system, struct iso2022_decoder *iso)
4238 for (i = 0; i < 4; i++) {
4239 if (!NILP(coding_system))
4241 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET
4244 iso->charset[i] = Qt;
4245 iso->invalid_designated[i] = 0;
4247 iso->esc = ISO_ESC_NOTHING;
4248 iso->esc_bytes_index = 0;
4249 iso->register_left = 0;
4250 iso->register_right = 1;
4251 iso->switched_dir_and_no_valid_charset_yet = 0;
4252 iso->invalid_switch_dir = 0;
4253 iso->output_direction_sequence = 0;
4254 iso->output_literally = 0;
4255 #ifdef ENABLE_COMPOSITE_CHARS
4256 if (iso->composite_chars)
4257 Dynarr_reset(iso->composite_chars);
4261 static int fit_to_be_escape_quoted(unsigned char c)
4277 /* Parse one byte of an ISO2022 escape sequence.
4278 If the result is an invalid escape sequence, return 0 and
4279 do not change anything in STR. Otherwise, if the result is
4280 an incomplete escape sequence, update ISO2022.ESC and
4281 ISO2022.ESC_BYTES and return -1. Otherwise, update
4282 all the state variables (but not ISO2022.ESC_BYTES) and
4285 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4286 or invocation of an invalid character set and treat that as
4287 an unrecognized escape sequence.
4289 ********************************************************************
4291 #### Strategies for error annotation and coding orthogonalization
4293 We really want to separate out a number of things. Conceptually,
4294 there is a nested syntax.
4296 At the top level is the ISO 2022 extension syntax, including charset
4297 designation and invocation, and certain auxiliary controls such as the
4298 ISO 6429 direction specification. These are octet-oriented, with the
4299 single exception (AFAIK) of the "exit Unicode" sequence which uses the
4300 UTF's natural width (1 byte for UTF-7 and UTF-8, 2 bytes for UCS-2 and
4301 UTF-16, and 4 bytes for UCS-4 and UTF-32). This will be treated as a
4302 (deprecated) special case in Unicode processing.
4304 The middle layer is ISO 2022 character interpretation. This will depend
4305 on the current state of the ISO 2022 registers, and assembles octets
4306 into the character's internal representation.
4308 The lowest level is translating system control conventions. At present
4309 this is restricted to newline translation, but one could imagine doing
4310 tab conversion or line wrapping here. "Escape from Unicode" processing
4311 would be done at this level.
4313 At each level the parser will verify the syntax. In the case of a
4314 syntax error or warning (such as a redundant escape sequence that affects
4315 no characters), the parser will take some action, typically inserting the
4316 erroneous octets directly into the output and creating an annotation
4317 which can be used by higher level I/O to mark the affected region.
4319 This should make it possible to do something sensible about separating
4320 newline convention processing from character construction, and about
4321 preventing ISO 2022 escape sequences from being recognized
4324 The basic strategy will be to have octet classification tables, and
4325 switch processing according to the table entry.
4327 It's possible that, by doing the processing with tables of functions or
4328 the like, the parser can be used for both detection and translation. */
4331 parse_iso2022_esc(Lisp_Object codesys, struct iso2022_decoder *iso,
4332 unsigned char c, unsigned int *flags,
4333 int check_invalid_charsets)
4335 /* (1) If we're at the end of a designation sequence, CS is the
4336 charset being designated and REG is the register to designate
4339 (2) If we're at the end of a locking-shift sequence, REG is
4340 the register to invoke and HALF (0 == left, 1 == right) is
4341 the half to invoke it into.
4343 (3) If we're at the end of a single-shift sequence, REG is
4344 the register to invoke. */
4345 Lisp_Object cs = Qnil;
4348 /* NOTE: This code does goto's all over the fucking place.
4349 The reason for this is that we're basically implementing
4350 a state machine here, and hierarchical languages like C
4351 don't really provide a clean way of doing this. */
4353 if (!(*flags & CODING_STATE_ESCAPE))
4354 /* At beginning of escape sequence; we need to reset our
4355 escape-state variables. */
4356 iso->esc = ISO_ESC_NOTHING;
4358 iso->output_literally = 0;
4359 iso->output_direction_sequence = 0;
4362 case ISO_ESC_NOTHING:
4363 iso->esc_bytes_index = 0;
4365 case ISO_CODE_ESC: /* Start escape sequence */
4366 *flags |= CODING_STATE_ESCAPE;
4370 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4371 *flags |= CODING_STATE_ESCAPE;
4372 iso->esc = ISO_ESC_5_11;
4375 case ISO_CODE_SO: /* locking shift 1 */
4379 case ISO_CODE_SI: /* locking shift 0 */
4384 case ISO_CODE_SS2: /* single shift */
4387 case ISO_CODE_SS3: /* single shift */
4391 default: /* Other control characters */
4397 /**** single shift ****/
4399 case 'N': /* single shift 2 */
4402 case 'O': /* single shift 3 */
4406 /**** locking shift ****/
4408 case '~': /* locking shift 1 right */
4412 case 'n': /* locking shift 2 */
4416 case '}': /* locking shift 2 right */
4420 case 'o': /* locking shift 3 */
4424 case '|': /* locking shift 3 right */
4429 #ifdef ENABLE_COMPOSITE_CHARS
4430 /**** composite ****/
4433 iso->esc = ISO_ESC_START_COMPOSITE;
4434 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4435 CODING_STATE_COMPOSITE;
4439 iso->esc = ISO_ESC_END_COMPOSITE;
4440 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4441 ~CODING_STATE_COMPOSITE;
4443 #endif /* ENABLE_COMPOSITE_CHARS */
4445 /**** directionality ****/
4448 iso->esc = ISO_ESC_5_11;
4451 /**** designation ****/
4453 case '$': /* multibyte charset prefix */
4454 iso->esc = ISO_ESC_2_4;
4458 if (0x28 <= c && c <= 0x2F) {
4460 (enum iso_esc_flag)(c - 0x28 + ISO_ESC_2_8);
4464 /* This function is called with CODESYS equal to nil when
4465 doing coding-system detection. */
4467 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
4468 && fit_to_be_escape_quoted(c)) {
4469 iso->esc = ISO_ESC_LITERAL;
4470 *flags &= CODING_STATE_ISO2022_LOCK;
4478 /**** directionality ****/
4480 case ISO_ESC_5_11: /* ISO6429 direction control */
4483 (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4484 goto directionality;
4487 iso->esc = ISO_ESC_5_11_0;
4489 iso->esc = ISO_ESC_5_11_1;
4491 iso->esc = ISO_ESC_5_11_2;
4496 case ISO_ESC_5_11_0:
4499 (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4500 goto directionality;
4504 case ISO_ESC_5_11_1:
4507 (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4508 goto directionality;
4512 case ISO_ESC_5_11_2:
4515 (*flags & CODING_STATE_ISO2022_LOCK) |
4517 goto directionality;
4522 iso->esc = ISO_ESC_DIRECTIONALITY;
4523 /* Various junk here to attempt to preserve the direction
4524 sequences literally in the text if they would otherwise be
4525 swallowed due to invalid designations that don't show up as
4526 actual charset changes in the text. */
4527 if (iso->invalid_switch_dir) {
4528 /* We already inserted a direction switch literally into
4529 the text. We assume (#### this may not be right)
4530 that the next direction switch is the one going the
4531 other way, and we need to output that literally as
4533 iso->output_literally = 1;
4534 iso->invalid_switch_dir = 0;
4538 /* If we are in the thrall of an invalid designation,
4539 then stick the directionality sequence literally into
4540 the output stream so it ends up in the original text
4542 for (jj = 0; jj < 4; jj++)
4543 if (iso->invalid_designated[jj])
4546 iso->output_literally = 1;
4547 iso->invalid_switch_dir = 1;
4549 /* Indicate that we haven't yet seen a valid
4550 designation, so that if a switch-dir is
4551 directly followed by an invalid designation,
4552 both get inserted literally. */
4553 iso->switched_dir_and_no_valid_charset_yet = 1;
4557 /**** designation ****/
4560 if (0x28 <= c && c <= 0x2F) {
4562 (enum iso_esc_flag)(c - 0x28 + ISO_ESC_2_4_8);
4565 if (0x40 <= c && c <= 0x42) {
4566 cs = CHARSET_BY_ATTRIBUTES(CHARSET_TYPE_94X94, c,
4567 *flags & CODING_STATE_R2L ?
4568 CHARSET_RIGHT_TO_LEFT :
4569 CHARSET_LEFT_TO_RIGHT);
4586 case ISO_ESC_2_4_10:
4587 case ISO_ESC_2_4_11:
4588 case ISO_ESC_2_4_12:
4589 case ISO_ESC_2_4_13:
4590 case ISO_ESC_2_4_14:
4591 case ISO_ESC_2_4_15:
4592 case ISO_ESC_SINGLE_SHIFT:
4593 case ISO_ESC_LOCKING_SHIFT:
4594 case ISO_ESC_DESIGNATE:
4595 case ISO_ESC_DIRECTIONALITY:
4596 case ISO_ESC_LITERAL:
4601 if (c < '0' || c > '~')
4602 return 0; /* bad final byte */
4604 if (iso->esc >= ISO_ESC_2_8 && iso->esc <= ISO_ESC_2_15) {
4605 type = ((iso->esc >= ISO_ESC_2_12) ?
4606 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4607 reg = (iso->esc - ISO_ESC_2_8) & 3;
4608 } else if (iso->esc >= ISO_ESC_2_4_8 &&
4609 iso->esc <= ISO_ESC_2_4_15) {
4610 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4611 CHARSET_TYPE_96X96 :
4612 CHARSET_TYPE_94X94);
4613 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4615 /* Can this ever be reached? -slb */
4620 cs = CHARSET_BY_ATTRIBUTES(type, c,
4621 *flags & CODING_STATE_R2L ?
4622 CHARSET_RIGHT_TO_LEFT :
4623 CHARSET_LEFT_TO_RIGHT);
4629 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char)c;
4633 if (check_invalid_charsets && !CHARSETP(iso->charset[reg]))
4634 /* can't invoke something that ain't there. */
4636 iso->esc = ISO_ESC_SINGLE_SHIFT;
4637 *flags &= CODING_STATE_ISO2022_LOCK;
4639 *flags |= CODING_STATE_SS2;
4641 *flags |= CODING_STATE_SS3;
4645 if (check_invalid_charsets && !CHARSETP(iso->charset[reg]))
4646 /* can't invoke something that ain't there. */
4649 iso->register_right = reg;
4651 iso->register_left = reg;
4652 *flags &= CODING_STATE_ISO2022_LOCK;
4653 iso->esc = ISO_ESC_LOCKING_SHIFT;
4657 if (NILP(cs) && check_invalid_charsets) {
4658 iso->invalid_designated[reg] = 1;
4659 iso->charset[reg] = Vcharset_ascii;
4660 iso->esc = ISO_ESC_DESIGNATE;
4661 *flags &= CODING_STATE_ISO2022_LOCK;
4662 iso->output_literally = 1;
4663 if (iso->switched_dir_and_no_valid_charset_yet) {
4664 /* We encountered a switch-direction followed by an
4665 invalid designation. Ensure that the switch-direction
4666 gets outputted; otherwise it will probably get eaten
4667 when the text is written out again. */
4668 iso->switched_dir_and_no_valid_charset_yet = 0;
4669 iso->output_direction_sequence = 1;
4670 /* And make sure that the switch-dir going the other
4671 way gets outputted, as well. */
4672 iso->invalid_switch_dir = 1;
4676 /* This function is called with CODESYS equal to nil when
4677 doing coding-system detection. */
4678 if (!NILP(codesys)) {
4679 charset_conversion_spec_dynarr *dyn =
4680 XCODING_SYSTEM(codesys)->iso2022.input_conv;
4685 for (i = 0; i < Dynarr_length(dyn); i++) {
4686 struct charset_conversion_spec *spec =
4688 if (EQ(cs, spec->from_charset))
4689 cs = spec->to_charset;
4694 iso->charset[reg] = cs;
4695 iso->esc = ISO_ESC_DESIGNATE;
4696 *flags &= CODING_STATE_ISO2022_LOCK;
4697 if (iso->invalid_designated[reg]) {
4698 iso->invalid_designated[reg] = 0;
4699 iso->output_literally = 1;
4701 if (iso->switched_dir_and_no_valid_charset_yet)
4702 iso->switched_dir_and_no_valid_charset_yet = 0;
4707 detect_coding_iso2022(struct detection_state *st, const Extbyte * src,
4708 Lstream_data_count n)
4712 /* #### There are serious deficiencies in the recognition mechanism
4713 here. This needs to be much smarter if it's going to cut it.
4714 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4715 it should be detected as Latin-1.
4716 All the ISO2022 stuff in this file should be synced up with the
4717 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4718 Perhaps we should wait till R2L works in FSF Emacs? */
4720 if (!st->iso2022.initted) {
4721 reset_iso2022(Qnil, &st->iso2022.iso);
4722 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4723 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4724 CODING_CATEGORY_ISO_8_1_MASK |
4725 CODING_CATEGORY_ISO_8_2_MASK |
4726 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4727 st->iso2022.flags = 0;
4728 st->iso2022.high_byte_count = 0;
4729 st->iso2022.saw_single_shift = 0;
4730 st->iso2022.initted = 1;
4733 mask = st->iso2022.mask;
4736 const unsigned char c = *(const unsigned char *)src++;
4738 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4739 st->iso2022.high_byte_count++;
4741 if (st->iso2022.high_byte_count
4742 && !st->iso2022.saw_single_shift) {
4743 if (st->iso2022.high_byte_count & 1)
4744 /* odd number of high bytes; assume not iso-8-2 */
4745 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4747 st->iso2022.high_byte_count = 0;
4748 st->iso2022.saw_single_shift = 0;
4750 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4752 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4753 && (BYTE_C0_P(c) || BYTE_C1_P(c))) { /* control chars */
4755 /* Allow and ignore control characters that you might
4756 reasonably see in a text file */
4761 case 8: /* backspace */
4762 case 11: /* vertical tab */
4763 case 12: /* form feed */
4764 case 26: /* MS-DOS C-z junk */
4765 case 31: /* '^_' -- for info */
4766 goto label_continue_loop;
4773 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P(c)
4775 if (parse_iso2022_esc(Qnil, &st->iso2022.iso, c,
4776 &st->iso2022.flags, 0)) {
4777 switch (st->iso2022.iso.esc) {
4778 case ISO_ESC_DESIGNATE:
4779 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4780 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4782 case ISO_ESC_LOCKING_SHIFT:
4783 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4784 goto ran_out_of_chars;
4785 case ISO_ESC_SINGLE_SHIFT:
4786 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4787 st->iso2022.saw_single_shift = 1;
4791 case ISO_ESC_NOTHING:
4804 case ISO_ESC_2_4_10:
4805 case ISO_ESC_2_4_11:
4806 case ISO_ESC_2_4_12:
4807 case ISO_ESC_2_4_13:
4808 case ISO_ESC_2_4_14:
4809 case ISO_ESC_2_4_15:
4811 case ISO_ESC_5_11_0:
4812 case ISO_ESC_5_11_1:
4813 case ISO_ESC_5_11_2:
4814 case ISO_ESC_DIRECTIONALITY:
4815 case ISO_ESC_LITERAL:
4821 goto ran_out_of_chars;
4824 label_continue_loop:;
4831 static int postprocess_iso2022_mask(int mask)
4833 /* #### kind of cheesy */
4834 /* If seven-bit ISO is allowed, then assume that the encoding is
4835 entirely seven-bit and turn off the eight-bit ones. */
4836 if (mask & CODING_CATEGORY_ISO_7_MASK)
4837 mask &= ~(CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4838 CODING_CATEGORY_ISO_8_1_MASK |
4839 CODING_CATEGORY_ISO_8_2_MASK);
4843 /* If FLAGS is a null pointer or specifies right-to-left motion,
4844 output a switch-dir-to-left-to-right sequence to DST.
4845 Also update FLAGS if it is not a null pointer.
4846 If INTERNAL_P is set, we are outputting in internal format and
4847 need to handle the CSI differently. */
4850 restore_left_to_right_direction(Lisp_Coding_System * codesys,
4851 unsigned_char_dynarr * dst,
4852 unsigned int *flags, int internal_p)
4854 if (!flags || (*flags & CODING_STATE_R2L)) {
4855 if (CODING_SYSTEM_ISO2022_SEVEN(codesys)) {
4856 Dynarr_add(dst, ISO_CODE_ESC);
4857 Dynarr_add(dst, '[');
4858 } else if (internal_p)
4859 DECODE_ADD_BINARY_CHAR(ISO_CODE_CSI, dst);
4861 Dynarr_add(dst, ISO_CODE_CSI);
4862 Dynarr_add(dst, '0');
4863 Dynarr_add(dst, ']');
4865 *flags &= ~CODING_STATE_R2L;
4869 /* If FLAGS is a null pointer or specifies a direction different from
4870 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4871 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4872 sequence to DST. Also update FLAGS if it is not a null pointer.
4873 If INTERNAL_P is set, we are outputting in internal format and
4874 need to handle the CSI differently. */
4877 ensure_correct_direction(int direction, Lisp_Coding_System * codesys,
4878 unsigned_char_dynarr * dst, unsigned int *flags,
4881 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4882 direction == CHARSET_LEFT_TO_RIGHT)
4883 restore_left_to_right_direction(codesys, dst, flags,
4885 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429(codesys)
4886 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4887 direction == CHARSET_RIGHT_TO_LEFT) {
4888 if (CODING_SYSTEM_ISO2022_SEVEN(codesys)) {
4889 Dynarr_add(dst, ISO_CODE_ESC);
4890 Dynarr_add(dst, '[');
4891 } else if (internal_p)
4892 DECODE_ADD_BINARY_CHAR(ISO_CODE_CSI, dst);
4894 Dynarr_add(dst, ISO_CODE_CSI);
4895 Dynarr_add(dst, '2');
4896 Dynarr_add(dst, ']');
4898 *flags |= CODING_STATE_R2L;
4902 /* Convert ISO2022-format data to internal format. */
4905 decode_coding_iso2022(lstream_t decoding, const Extbyte * src,
4906 unsigned_char_dynarr * dst, Lstream_data_count n)
4908 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
4909 unsigned int flags = str->flags;
4910 unsigned int ch = str->ch;
4911 eol_type_t eol_type = str->eol_type;
4912 #ifdef ENABLE_COMPOSITE_CHARS
4913 unsigned_char_dynarr *real_dst = dst;
4915 Lisp_Object coding_system;
4917 XSETCODING_SYSTEM(coding_system, str->codesys);
4919 #ifdef ENABLE_COMPOSITE_CHARS
4920 if (flags & CODING_STATE_COMPOSITE)
4921 dst = str->iso2022.composite_chars;
4922 #endif /* ENABLE_COMPOSITE_CHARS */
4925 const unsigned char c = *(const unsigned char *)src++;
4926 if (flags & CODING_STATE_ESCAPE) {
4927 /* Within ESC sequence */
4928 int retval = parse_iso2022_esc(
4929 coding_system, &str->iso2022, c, &flags, 1);
4932 switch (str->iso2022.esc) {
4933 #ifdef ENABLE_COMPOSITE_CHARS
4934 case ISO_ESC_START_COMPOSITE:
4935 if (str->iso2022.composite_chars)
4936 Dynarr_reset(str->iso2022.
4939 str->iso2022.composite_chars =
4940 Dynarr_new(unsigned_char);
4941 dst = str->iso2022.composite_chars;
4943 case ISO_ESC_END_COMPOSITE:
4945 Bufbyte comstr[MAX_EMCHAR_LEN];
4948 lookup_composite_char
4949 (Dynarr_atp(dst, 0),
4950 Dynarr_length(dst));
4953 set_charptr_emchar(comstr,
4955 Dynarr_add_many(dst, comstr,
4959 #endif /* ENABLE_COMPOSITE_CHARS */
4961 case ISO_ESC_LITERAL:
4962 DECODE_ADD_BINARY_CHAR(c, dst);
4965 case ISO_ESC_NOTHING:
4978 case ISO_ESC_2_4_10:
4979 case ISO_ESC_2_4_11:
4980 case ISO_ESC_2_4_12:
4981 case ISO_ESC_2_4_13:
4982 case ISO_ESC_2_4_14:
4983 case ISO_ESC_2_4_15:
4985 case ISO_ESC_5_11_0:
4986 case ISO_ESC_5_11_1:
4987 case ISO_ESC_5_11_2:
4988 case ISO_ESC_SINGLE_SHIFT:
4989 case ISO_ESC_LOCKING_SHIFT:
4990 case ISO_ESC_DESIGNATE:
4991 case ISO_ESC_DIRECTIONALITY:
4994 /* Everything else handled already */
4999 /* Attempted error recovery. */
5000 if (str->iso2022.output_direction_sequence)
5001 ensure_correct_direction(flags &
5003 CHARSET_RIGHT_TO_LEFT :
5004 CHARSET_LEFT_TO_RIGHT,
5005 str->codesys, dst, 0,
5007 /* More error recovery. */
5008 if (!retval || str->iso2022.output_literally) {
5009 /* Output the (possibly invalid) sequence */
5011 for (i = 0; i < str->iso2022.esc_bytes_index;
5013 DECODE_ADD_BINARY_CHAR(str->iso2022.
5016 flags &= CODING_STATE_ISO2022_LOCK;
5018 n++, src--; /* Repeat the loop with the same character. */
5020 /* No sense in reprocessing the final byte of the
5021 escape sequence; it could mess things up anyway.
5023 DECODE_ADD_BINARY_CHAR(c, dst);
5027 } else if (BYTE_C0_P(c) || BYTE_C1_P(c)) { /* Control characters */
5029 /***** Error-handling *****/
5031 /* If we were in the middle of a character, dump out the
5032 partial character. */
5033 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5035 /* If we just saw a single-shift character, dump it out.
5036 This may dump out the wrong sort of single-shift character,
5037 but least it will give an indication that something went
5039 if (flags & CODING_STATE_SS2) {
5040 DECODE_ADD_BINARY_CHAR(ISO_CODE_SS2, dst);
5041 flags &= ~CODING_STATE_SS2;
5043 if (flags & CODING_STATE_SS3) {
5044 DECODE_ADD_BINARY_CHAR(ISO_CODE_SS3, dst);
5045 flags &= ~CODING_STATE_SS3;
5048 /***** Now handle the control characters. *****/
5051 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5053 flags &= CODING_STATE_ISO2022_LOCK;
5055 if (!parse_iso2022_esc
5056 (coding_system, &str->iso2022, c, &flags, 1))
5057 DECODE_ADD_BINARY_CHAR(c, dst);
5058 } else { /* Graphic characters */
5059 Lisp_Object charset;
5063 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5065 /* Now determine the charset. */
5066 reg = ((flags & CODING_STATE_SS2) ? 2
5067 : (flags & CODING_STATE_SS3) ? 3
5068 : !BYTE_ASCII_P(c) ? str->iso2022.register_right
5069 : str->iso2022.register_left);
5070 charset = str->iso2022.charset[reg];
5072 /* Error checking: */
5073 if (!CHARSETP(charset)
5074 || str->iso2022.invalid_designated[reg]
5076 (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5077 && XCHARSET_CHARS(charset) == 94))
5078 /* Mrmph. We are trying to invoke a register that has no
5079 or an invalid charset in it, or trying to add a character
5080 outside the range of the charset. Insert that char literally
5081 to preserve it for the output. */
5083 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5084 DECODE_ADD_BINARY_CHAR(c, dst);
5088 /* Things are probably hunky-dorey. */
5090 /* Fetch reverse charset, maybe. */
5091 if (((flags & CODING_STATE_R2L) &&
5092 XCHARSET_DIRECTION(charset) ==
5093 CHARSET_LEFT_TO_RIGHT)
5094 || (!(flags & CODING_STATE_R2L)
5095 && XCHARSET_DIRECTION(charset) ==
5096 CHARSET_RIGHT_TO_LEFT)) {
5097 Lisp_Object new_charset =
5098 XCHARSET_REVERSE_DIRECTION_CHARSET
5100 if (!NILP(new_charset))
5101 charset = new_charset;
5104 lb = XCHARSET_LEADING_BYTE(charset);
5105 switch (XCHARSET_REP_BYTES(charset)) {
5107 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5108 Dynarr_add(dst, c & 0x7F);
5111 case 2: /* one-byte official */
5112 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5113 Dynarr_add(dst, lb);
5114 Dynarr_add(dst, c | 0x80);
5117 case 3: /* one-byte private or two-byte official */
5118 if (XCHARSET_PRIVATE_P(charset)) {
5119 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5121 PRE_LEADING_BYTE_PRIVATE_1);
5122 Dynarr_add(dst, lb);
5123 Dynarr_add(dst, c | 0x80);
5126 Dynarr_add(dst, lb);
5137 default: /* two-byte private */
5140 PRE_LEADING_BYTE_PRIVATE_2);
5141 Dynarr_add(dst, lb);
5142 Dynarr_add(dst, ch | 0x80);
5143 Dynarr_add(dst, c | 0x80);
5151 flags &= CODING_STATE_ISO2022_LOCK;
5154 label_continue_loop:;
5157 if (flags & CODING_STATE_END)
5158 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5164 /***** ISO2022 encoder *****/
5166 /* Designate CHARSET into register REG. */
5169 iso2022_designate(Lisp_Object charset, unsigned char reg,
5170 encoding_stream_t str, unsigned_char_dynarr * dst)
5172 static const char inter94[] = "()*+";
5173 static const char inter96[] = ",-./";
5175 unsigned char final;
5176 Lisp_Object old_charset = str->iso2022.charset[reg];
5178 str->iso2022.charset[reg] = charset;
5179 if (!CHARSETP(charset))
5180 /* charset might be an initial nil or t. */
5182 type = XCHARSET_TYPE(charset);
5183 final = XCHARSET_FINAL(charset);
5184 if (!str->iso2022.force_charset_on_output[reg] &&
5185 CHARSETP(old_charset) &&
5186 XCHARSET_TYPE(old_charset) == type &&
5187 XCHARSET_FINAL(old_charset) == final)
5190 str->iso2022.force_charset_on_output[reg] = 0;
5193 charset_conversion_spec_dynarr *dyn =
5194 str->codesys->iso2022.output_conv;
5199 for (i = 0; i < Dynarr_length(dyn); i++) {
5200 struct charset_conversion_spec *spec =
5202 if (EQ(charset, spec->from_charset))
5203 charset = spec->to_charset;
5208 Dynarr_add(dst, ISO_CODE_ESC);
5210 case CHARSET_TYPE_94:
5211 Dynarr_add(dst, inter94[reg]);
5213 case CHARSET_TYPE_96:
5214 Dynarr_add(dst, inter96[reg]);
5216 case CHARSET_TYPE_94X94:
5217 Dynarr_add(dst, '$');
5218 if (reg != 0 || !(CODING_SYSTEM_ISO2022_SHORT(str->codesys))
5219 || final < '@' || final > 'B')
5220 Dynarr_add(dst, inter94[reg]);
5222 case CHARSET_TYPE_96X96:
5223 Dynarr_add(dst, '$');
5224 Dynarr_add(dst, inter96[reg]);
5229 Dynarr_add(dst, final);
5233 ensure_normal_shift(encoding_stream_t str, unsigned_char_dynarr * dst)
5235 if (str->iso2022.register_left != 0) {
5236 Dynarr_add(dst, ISO_CODE_SI);
5237 str->iso2022.register_left = 0;
5242 ensure_shift_out(encoding_stream_t str, unsigned_char_dynarr * dst)
5244 if (str->iso2022.register_left != 1) {
5245 Dynarr_add(dst, ISO_CODE_SO);
5246 str->iso2022.register_left = 1;
5250 /* Convert internally-formatted data to ISO2022 format. */
5253 encode_coding_iso2022(lstream_t encoding, const Bufbyte * src,
5254 unsigned_char_dynarr * dst, Lstream_data_count n)
5256 unsigned char charmask, c;
5257 unsigned char char_boundary;
5258 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
5259 unsigned int flags = str->flags;
5260 unsigned int ch = str->ch;
5261 Lisp_Coding_System *codesys = str->codesys;
5262 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
5264 Lisp_Object charset;
5267 #ifdef ENABLE_COMPOSITE_CHARS
5268 /* flags for handling composite chars. We do a little switcharoo
5269 on the source while we're outputting the composite char. */
5270 unsigned int saved_n = 0;
5271 const unsigned char *saved_src = NULL;
5272 int in_composite = 0;
5273 #endif /* ENABLE_COMPOSITE_CHARS */
5275 char_boundary = str->iso2022.current_char_boundary;
5276 charset = str->iso2022.current_charset;
5277 half = str->iso2022.current_half;
5279 #ifdef ENABLE_COMPOSITE_CHARS
5285 if (BYTE_ASCII_P(c)) { /* Processing ASCII character */
5288 restore_left_to_right_direction(codesys, dst, &flags,
5291 /* Make sure G0 contains ASCII */
5292 if ((c > ' ' && c < ISO_CODE_DEL) ||
5293 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL(codesys)) {
5294 ensure_normal_shift(str, dst);
5295 iso2022_designate(Vcharset_ascii, 0, str, dst);
5298 /* If necessary, restore everything to the default state
5301 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL(codesys))) {
5302 restore_left_to_right_direction(codesys, dst,
5305 ensure_normal_shift(str, dst);
5307 for (i = 0; i < 4; i++) {
5308 Lisp_Object initial_charset =
5309 CODING_SYSTEM_ISO2022_INITIAL_CHARSET
5311 iso2022_designate(initial_charset, i,
5316 if (eol_type != EOL_LF
5317 && eol_type != EOL_AUTODETECT)
5318 Dynarr_add(dst, '\r');
5319 if (eol_type != EOL_CR)
5322 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
5323 && fit_to_be_escape_quoted(c))
5324 Dynarr_add(dst, ISO_CODE_ESC);
5330 else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) { /* Processing Leading Byte */
5332 charset = CHARSET_BY_LEADING_BYTE(c);
5333 if (LEADING_BYTE_PREFIX_P(c))
5335 else if (!EQ(charset, Vcharset_control_1)
5336 #ifdef ENABLE_COMPOSITE_CHARS
5337 && !EQ(charset, Vcharset_composite)
5342 ensure_correct_direction(XCHARSET_DIRECTION
5346 /* Now determine which register to use. */
5348 for (i = 0; i < 4; i++) {
5349 if (EQ(charset, str->iso2022.charset[i])
5351 CODING_SYSTEM_ISO2022_INITIAL_CHARSET
5359 if (XCHARSET_GRAPHIC(charset) != 0) {
5361 (str->iso2022.charset[1])
5363 (!CODING_SYSTEM_ISO2022_SEVEN
5366 CODING_SYSTEM_ISO2022_LOCK_SHIFT
5383 iso2022_designate(charset, reg, str, dst);
5385 /* Now invoke that register. */
5388 ensure_normal_shift(str, dst);
5393 if (CODING_SYSTEM_ISO2022_SEVEN
5395 ensure_shift_out(str, dst);
5402 if (CODING_SYSTEM_ISO2022_SEVEN
5404 Dynarr_add(dst, ISO_CODE_ESC);
5405 Dynarr_add(dst, 'N');
5408 Dynarr_add(dst, ISO_CODE_SS2);
5414 if (CODING_SYSTEM_ISO2022_SEVEN
5416 Dynarr_add(dst, ISO_CODE_ESC);
5417 Dynarr_add(dst, 'O');
5420 Dynarr_add(dst, ISO_CODE_SS3);
5430 } else { /* Processing Non-ASCII character */
5431 charmask = (half == 0 ? 0x7F : 0xFF);
5433 if (EQ(charset, Vcharset_control_1)) {
5434 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
5435 && fit_to_be_escape_quoted(c))
5436 Dynarr_add(dst, ISO_CODE_ESC);
5437 /* you asked for it ... */
5438 Dynarr_add(dst, c - 0x20);
5440 switch (XCHARSET_REP_BYTES(charset)) {
5442 Dynarr_add(dst, c & charmask);
5445 if (XCHARSET_PRIVATE_P(charset)) {
5446 Dynarr_add(dst, c & charmask);
5449 #ifdef ENABLE_COMPOSITE_CHARS
5452 Vcharset_composite)) {
5454 /* #### Bother! We don't know how to
5461 (Vcharset_composite,
5466 composite_char_string
5475 n = XSTRING_LENGTH(lstr);
5478 Dynarr_add(dst, '0'); /* start composing */
5481 #endif /* ENABLE_COMPOSITE_CHARS */
5498 Dynarr_add(dst, ch & charmask);
5499 Dynarr_add(dst, c & charmask);
5513 #ifdef ENABLE_COMPOSITE_CHARS
5518 Dynarr_add(dst, ISO_CODE_ESC);
5519 Dynarr_add(dst, '1'); /* end composing */
5520 goto back_to_square_n; /* Wheeeeeeeee ..... */
5522 #endif /* ENABLE_COMPOSITE_CHARS */
5524 if (char_boundary && flags & CODING_STATE_END) {
5525 restore_left_to_right_direction(codesys, dst, &flags, 0);
5526 ensure_normal_shift(str, dst);
5527 for (i = 0; i < 4; i++) {
5528 Lisp_Object initial_charset =
5529 CODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, i);
5530 iso2022_designate(initial_charset, i, str, dst);
5536 str->iso2022.current_char_boundary = char_boundary;
5537 str->iso2022.current_charset = charset;
5538 str->iso2022.current_half = half;
5540 /* Verbum caro factum est! */
5544 /************************************************************************/
5545 /* No-conversion methods */
5546 /************************************************************************/
5548 /* This is used when reading in "binary" files -- i.e. files that may
5549 contain all 256 possible byte values and that are not to be
5550 interpreted as being in any particular decoding. */
5552 decode_coding_no_conversion(lstream_t decoding, const Extbyte * src,
5553 unsigned_char_dynarr * dst, Lstream_data_count n)
5555 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
5556 unsigned int flags = str->flags;
5557 unsigned int ch = str->ch;
5558 eol_type_t eol_type = str->eol_type;
5561 const unsigned char c = *(const unsigned char *)src++;
5563 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5564 DECODE_ADD_BINARY_CHAR(c, dst);
5565 label_continue_loop:;
5568 DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
5575 encode_coding_no_conversion(lstream_t encoding, const Bufbyte * src,
5576 unsigned_char_dynarr * dst, Lstream_data_count n)
5579 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
5580 unsigned int flags = str->flags;
5581 unsigned int ch = str->ch;
5582 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
5587 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5588 Dynarr_add(dst, '\r');
5589 if (eol_type != EOL_CR)
5590 Dynarr_add(dst, '\n');
5592 } else if (BYTE_ASCII_P(c)) {
5595 } else if (BUFBYTE_LEADING_BYTE_P(c)) {
5597 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5598 c == LEADING_BYTE_CONTROL_1)
5601 Dynarr_add(dst, '~'); /* untranslatable character */
5603 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5605 else if (ch == LEADING_BYTE_CONTROL_1) {
5607 Dynarr_add(dst, c - 0x20);
5609 /* else it should be the second or third byte of an
5610 untranslatable character, so ignore it */
5619 /************************************************************************/
5620 /* Initialization */
5621 /************************************************************************/
5623 void syms_of_file_coding(void)
5625 INIT_LRECORD_IMPLEMENTATION(coding_system);
5627 DEFERROR_STANDARD(Qcoding_system_error, Qio_error);
5629 DEFSUBR(Fcoding_system_p);
5630 DEFSUBR(Ffind_coding_system);
5631 DEFSUBR(Fget_coding_system);
5632 DEFSUBR(Fcoding_system_list);
5633 DEFSUBR(Fcoding_system_name);
5634 DEFSUBR(Fmake_coding_system);
5635 DEFSUBR(Fcopy_coding_system);
5636 DEFSUBR(Fcoding_system_canonical_name_p);
5637 DEFSUBR(Fcoding_system_alias_p);
5638 DEFSUBR(Fcoding_system_aliasee);
5639 DEFSUBR(Fdefine_coding_system_alias);
5640 DEFSUBR(Fsubsidiary_coding_system);
5642 DEFSUBR(Fcoding_system_type);
5643 DEFSUBR(Fcoding_system_doc_string);
5645 DEFSUBR(Fcoding_system_charset);
5647 DEFSUBR(Fcoding_system_property);
5649 DEFSUBR(Fcoding_category_list);
5650 DEFSUBR(Fset_coding_priority_list);
5651 DEFSUBR(Fcoding_priority_list);
5652 DEFSUBR(Fset_coding_category_system);
5653 DEFSUBR(Fcoding_category_system);
5655 DEFSUBR(Fdetect_coding_region);
5656 DEFSUBR(Fdecode_coding_region);
5657 DEFSUBR(Fencode_coding_region);
5659 DEFSUBR(Fdecode_shift_jis_char);
5660 DEFSUBR(Fencode_shift_jis_char);
5661 DEFSUBR(Fdecode_big5_char);
5662 DEFSUBR(Fencode_big5_char);
5663 DEFSUBR(Fset_ucs_char);
5665 DEFSUBR(Fset_char_ucs);
5668 defsymbol(&Qcoding_systemp, "coding-system-p");
5669 defsymbol(&Qno_conversion, "no-conversion");
5670 defsymbol(&Qraw_text, "raw-text");
5672 defsymbol(&Qbig5, "big5");
5673 defsymbol(&Qshift_jis, "shift-jis");
5674 defsymbol(&Qucs4, "ucs-4");
5675 defsymbol(&Qutf8, "utf-8");
5676 defsymbol(&Qccl, "ccl");
5677 defsymbol(&Qiso2022, "iso2022");
5679 defsymbol(&Qmnemonic, "mnemonic");
5680 defsymbol(&Qeol_type, "eol-type");
5681 defsymbol(&Qpost_read_conversion, "post-read-conversion");
5682 defsymbol(&Qpre_write_conversion, "pre-write-conversion");
5684 defsymbol(&Qcr, "cr");
5685 defsymbol(&Qlf, "lf");
5686 defsymbol(&Qcrlf, "crlf");
5687 defsymbol(&Qeol_cr, "eol-cr");
5688 defsymbol(&Qeol_lf, "eol-lf");
5689 defsymbol(&Qeol_crlf, "eol-crlf");
5691 defsymbol(&Qcharset_g0, "charset-g0");
5692 defsymbol(&Qcharset_g1, "charset-g1");
5693 defsymbol(&Qcharset_g2, "charset-g2");
5694 defsymbol(&Qcharset_g3, "charset-g3");
5695 defsymbol(&Qforce_g0_on_output, "force-g0-on-output");
5696 defsymbol(&Qforce_g1_on_output, "force-g1-on-output");
5697 defsymbol(&Qforce_g2_on_output, "force-g2-on-output");
5698 defsymbol(&Qforce_g3_on_output, "force-g3-on-output");
5699 defsymbol(&Qno_iso6429, "no-iso6429");
5700 defsymbol(&Qinput_charset_conversion, "input-charset-conversion");
5701 defsymbol(&Qoutput_charset_conversion, "output-charset-conversion");
5703 defsymbol(&Qno_ascii_eol, "no-ascii-eol");
5704 defsymbol(&Qno_ascii_cntl, "no-ascii-cntl");
5705 defsymbol(&Qseven, "seven");
5706 defsymbol(&Qlock_shift, "lock-shift");
5707 defsymbol(&Qescape_quoted, "escape-quoted");
5709 defsymbol(&Qencode, "encode");
5710 defsymbol(&Qdecode, "decode");
5713 defsymbol(&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5715 defsymbol(&coding_category_symbol[CODING_CATEGORY_BIG5], "big5");
5716 defsymbol(&coding_category_symbol[CODING_CATEGORY_UCS4], "ucs-4");
5717 defsymbol(&coding_category_symbol[CODING_CATEGORY_UTF8], "utf-8");
5718 defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_7], "iso-7");
5719 defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5721 defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_1], "iso-8-1");
5722 defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_2], "iso-8-2");
5723 defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5726 defsymbol(&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5730 void lstream_type_create_file_coding(void)
5732 LSTREAM_HAS_METHOD(decoding, reader);
5733 LSTREAM_HAS_METHOD(decoding, writer);
5734 LSTREAM_HAS_METHOD(decoding, rewinder);
5735 LSTREAM_HAS_METHOD(decoding, seekable_p);
5736 LSTREAM_HAS_METHOD(decoding, flusher);
5737 LSTREAM_HAS_METHOD(decoding, closer);
5738 LSTREAM_HAS_METHOD(decoding, marker);
5740 LSTREAM_HAS_METHOD(encoding, reader);
5741 LSTREAM_HAS_METHOD(encoding, writer);
5742 LSTREAM_HAS_METHOD(encoding, rewinder);
5743 LSTREAM_HAS_METHOD(encoding, seekable_p);
5744 LSTREAM_HAS_METHOD(encoding, flusher);
5745 LSTREAM_HAS_METHOD(encoding, closer);
5746 LSTREAM_HAS_METHOD(encoding, marker);
5749 void vars_of_file_coding(void)
5753 fcd = xnew(struct file_coding_dump);
5754 dump_add_root_struct_ptr(&fcd, &fcd_description);
5756 /* Initialize to something reasonable ... */
5757 for (i = 0; i < CODING_CATEGORY_LAST; i++) {
5758 fcd->coding_category_system[i] = Qnil;
5759 fcd->coding_category_by_priority[i] = i;
5762 Fprovide(intern("file-coding"));
5764 DEFVAR_LISP("keyboard-coding-system", &Vkeyboard_coding_system /*
5765 Coding system used for TTY keyboard input.
5766 Not used under a windowing system.
5768 Vkeyboard_coding_system = Qnil;
5770 DEFVAR_LISP("terminal-coding-system", &Vterminal_coding_system /*
5771 Coding system used for TTY display output.
5772 Not used under a windowing system.
5774 Vterminal_coding_system = Qnil;
5776 DEFVAR_LISP("coding-system-for-read", &Vcoding_system_for_read /*
5777 Overriding coding system used when reading from a file or process.
5778 You should bind this variable with `let', but do not set it globally.
5779 If this is non-nil, it specifies the coding system that will be used
5780 to decode input on read operations, such as from a file or process.
5781 It overrides `buffer-file-coding-system-for-read',
5782 `insert-file-contents-pre-hook', etc. Use those variables instead of
5783 this one for permanent changes to the environment. */ );
5784 Vcoding_system_for_read = Qnil;
5786 DEFVAR_LISP("coding-system-for-write", &Vcoding_system_for_write /*
5787 Overriding coding system used when writing to a file or process.
5788 You should bind this variable with `let', but do not set it globally.
5789 If this is non-nil, it specifies the coding system that will be used
5790 to encode output for write operations, such as to a file or process.
5791 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
5792 Use those variables instead of this one for permanent changes to the
5794 Vcoding_system_for_write = Qnil;
5796 DEFVAR_LISP("file-name-coding-system", &Vfile_name_coding_system /*
5797 Coding system used to convert pathnames when accessing files.
5799 Vfile_name_coding_system = Qnil;
5801 DEFVAR_BOOL("enable-multibyte-characters", &enable_multibyte_characters /*
5802 Non-nil means the buffer contents are regarded as multi-byte form
5803 of characters, not a binary code. This affects the display, file I/O,
5804 and behaviors of various editing commands.
5806 Setting this to nil does not do anything.
5808 enable_multibyte_characters = 1;
5811 void complex_vars_of_file_coding(void)
5813 staticpro(&Vcoding_system_hash_table);
5814 Vcoding_system_hash_table =
5815 make_lisp_hash_table(50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5817 the_codesys_prop_dynarr = Dynarr_new(codesys_prop);
5818 dump_add_root_struct_ptr(&the_codesys_prop_dynarr,
5819 &codesys_prop_dynarr_description);
5821 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5823 struct codesys_prop csp; \
5825 csp.prop_type = (Prop_Type); \
5826 Dynarr_add (the_codesys_prop_dynarr, csp); \
5829 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qmnemonic);
5830 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_type);
5831 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_cr);
5832 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_crlf);
5833 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_lf);
5834 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5835 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5837 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g0);
5838 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g1);
5839 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g2);
5840 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g3);
5841 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5842 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5843 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5844 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5845 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qshort);
5846 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_ascii_eol);
5847 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5848 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qseven);
5849 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qlock_shift);
5850 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_iso6429);
5851 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qescape_quoted);
5852 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5853 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5855 DEFINE_CODESYS_PROP(CODESYS_PROP_CCL, Qencode);
5856 DEFINE_CODESYS_PROP(CODESYS_PROP_CCL, Qdecode);
5858 /* Need to create this here or we're really screwed. */
5860 (Qraw_text, Qno_conversion,
5862 ("Raw text, which means it converts only line-break-codes."),
5863 list2(Qmnemonic, build_string("Raw")));
5866 (Qbinary, Qno_conversion,
5867 build_string("Binary, which means it does not convert anything."),
5868 list4(Qeol_type, Qlf, Qmnemonic, build_string("Binary")));
5870 Fdefine_coding_system_alias(Qno_conversion, Qraw_text);
5872 Fdefine_coding_system_alias(Qfile_name, Qbinary);
5874 Fdefine_coding_system_alias(Qterminal, Qbinary);
5875 Fdefine_coding_system_alias(Qkeyboard, Qbinary);
5877 /* Need this for bootstrapping */
5878 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5879 Fget_coding_system(Qraw_text);
5885 for (i = 0; i < countof(fcd->ucs_to_mule_table); i++)
5886 fcd->ucs_to_mule_table[i] = Qnil;
5888 staticpro(&mule_to_ucs_table);
5889 mule_to_ucs_table = Fmake_char_table(Qgeneric);