1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of SXEmacs
7 SXEmacs is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
12 SXEmacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
21 /* Synched up with: Mule 2.3. Not in FSF. */
23 /* Rewritten by Ben Wing <ben@xemacs.org>. */
30 #include "ui/insdel.h"
37 #include "file-coding.h"
39 Lisp_Object Qcoding_system_error;
41 Lisp_Object Vkeyboard_coding_system;
42 Lisp_Object Vterminal_coding_system;
43 Lisp_Object Vcoding_system_for_read;
44 Lisp_Object Vcoding_system_for_write;
45 Lisp_Object Vfile_name_coding_system;
47 /* Table of symbols identifying each coding category. */
48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
50 struct file_coding_dump {
51 /* Coding system currently associated with each coding category. */
52 Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
54 /* Table of all coding categories in decreasing order of priority.
55 This describes a permutation of the possible coding categories. */
56 int coding_category_by_priority[CODING_CATEGORY_LAST];
59 Lisp_Object ucs_to_mule_table[65536];
63 static const struct lrecord_description fcd_description_1[] = {
64 {XD_LISP_OBJECT_ARRAY,
65 offsetof(struct file_coding_dump, coding_category_system),
66 CODING_CATEGORY_LAST},
68 {XD_LISP_OBJECT_ARRAY,
69 offsetof(struct file_coding_dump, ucs_to_mule_table),
70 countof(fcd->ucs_to_mule_table)},
75 static const struct struct_description fcd_description = {
76 sizeof(struct file_coding_dump),
80 Lisp_Object mule_to_ucs_table;
82 Lisp_Object Qcoding_systemp;
84 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
85 /* Qinternal in general.c */
87 Lisp_Object Qmnemonic, Qeol_type;
88 Lisp_Object Qcr, Qcrlf, Qlf;
89 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
90 Lisp_Object Qpost_read_conversion;
91 Lisp_Object Qpre_write_conversion;
94 Lisp_Object Qucs4, Qutf8;
95 Lisp_Object Qbig5, Qshift_jis;
96 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
97 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
98 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
99 Lisp_Object Qno_iso6429;
100 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
101 Lisp_Object Qescape_quoted;
102 Lisp_Object Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
104 Lisp_Object Qencode, Qdecode;
106 Lisp_Object Vcoding_system_hash_table;
108 int enable_multibyte_characters;
111 /* Additional information used by the ISO2022 decoder and detector. */
112 struct iso2022_decoder {
113 /* CHARSET holds the character sets currently assigned to the G0
114 through G3 variables. It is initialized from the array
115 INITIAL_CHARSET in CODESYS. */
116 Lisp_Object charset[4];
118 /* Which registers are currently invoked into the left (GL) and
119 right (GR) halves of the 8-bit encoding space? */
120 int register_left, register_right;
122 /* ISO_ESC holds a value indicating part of an escape sequence
123 that has already been seen. */
124 enum iso_esc_flag esc;
126 /* This records the bytes we've seen so far in an escape sequence,
127 in case the sequence is invalid (we spit out the bytes unchanged). */
128 unsigned char esc_bytes[8];
130 /* Index for next byte to store in ISO escape sequence. */
133 #ifdef ENABLE_COMPOSITE_CHARS
134 /* Stuff seen so far when composing a string. */
135 unsigned_char_dynarr *composite_chars;
138 /* If we saw an invalid designation sequence for a particular
139 register, we flag it here and switch to ASCII. The next time we
140 see a valid designation for this register, we turn off the flag
141 and do the designation normally, but pretend the sequence was
142 invalid. The effect of all this is that (most of the time) the
143 escape sequences for both the switch to the unknown charset, and
144 the switch back to the known charset, get inserted literally into
145 the buffer and saved out as such. The hope is that we can
146 preserve the escape sequences so that the resulting written out
147 file makes sense. If we don't do any of this, the designation
148 to the invalid charset will be preserved but that switch back
149 to the known charset will probably get eaten because it was
150 the same charset that was already present in the register. */
151 unsigned char invalid_designated[4];
153 /* We try to do similar things as above for direction-switching
154 sequences. If we encountered a direction switch while an
155 invalid designation was present, or an invalid designation
156 just after a direction switch (i.e. no valid designation
157 encountered yet), we insert the direction-switch escape
158 sequence literally into the output stream, and later on
159 insert the corresponding direction-restoring escape sequence
161 unsigned int switched_dir_and_no_valid_charset_yet:1;
162 unsigned int invalid_switch_dir:1;
164 /* Tells the decoder to output the escape sequence literally
165 even though it was valid. Used in the games we play to
166 avoid lossage when we encounter invalid designations. */
167 unsigned int output_literally:1;
168 /* We encountered a direction switch followed by an invalid
169 designation. We didn't output the direction switch
170 literally because we didn't know about the invalid designation;
171 but we have to do so now. */
172 unsigned int output_direction_sequence:1;
175 EXFUN(Fcopy_coding_system, 2);
177 struct detection_state;
178 static int detect_coding_sjis(struct detection_state *st,
179 const Extbyte * src, Lstream_data_count n);
180 static void decode_coding_sjis(lstream_t decoding, const Extbyte * src,
181 unsigned_char_dynarr * dst,
182 Lstream_data_count n);
183 static void encode_coding_sjis(lstream_t encoding, const Bufbyte * src,
184 unsigned_char_dynarr * dst,
185 Lstream_data_count n);
186 static int detect_coding_big5(struct detection_state *st, const Extbyte * src,
187 Lstream_data_count n);
188 static void decode_coding_big5(lstream_t decoding, const Extbyte * src,
189 unsigned_char_dynarr * dst,
190 Lstream_data_count n);
191 static void encode_coding_big5(lstream_t encoding, const Bufbyte * src,
192 unsigned_char_dynarr * dst,
193 Lstream_data_count n);
194 static int detect_coding_ucs4(struct detection_state *st, const Extbyte * src,
195 Lstream_data_count n);
196 static void decode_coding_ucs4(lstream_t decoding, const Extbyte * src,
197 unsigned_char_dynarr * dst,
198 Lstream_data_count n);
199 static void encode_coding_ucs4(lstream_t encoding, const Bufbyte * src,
200 unsigned_char_dynarr * dst,
201 Lstream_data_count n);
202 static int detect_coding_utf8(struct detection_state *st, const Extbyte * src,
203 Lstream_data_count n);
204 static void decode_coding_utf8(lstream_t decoding, const Extbyte * src,
205 unsigned_char_dynarr * dst,
206 Lstream_data_count n);
207 static void encode_coding_utf8(lstream_t encoding, const Bufbyte * src,
208 unsigned_char_dynarr * dst,
209 Lstream_data_count n);
210 static int postprocess_iso2022_mask(int mask);
211 static void reset_iso2022(Lisp_Object coding_system,
212 struct iso2022_decoder *iso);
213 static int detect_coding_iso2022(struct detection_state *st,
214 const Extbyte * src, Lstream_data_count n);
215 static void decode_coding_iso2022(lstream_t decoding, const Extbyte * src,
216 unsigned_char_dynarr * dst,
217 Lstream_data_count n);
218 static void encode_coding_iso2022(lstream_t encoding, const Bufbyte * src,
219 unsigned_char_dynarr * dst,
220 Lstream_data_count n);
222 static void decode_coding_no_conversion(lstream_t decoding, const Extbyte * src,
223 unsigned_char_dynarr * dst,
224 Lstream_data_count n);
225 static void encode_coding_no_conversion(lstream_t encoding, const Bufbyte * src,
226 unsigned_char_dynarr * dst,
227 Lstream_data_count n);
228 static void mule_decode(lstream_t decoding, const Extbyte * src,
229 unsigned_char_dynarr * dst, Lstream_data_count n);
230 static void mule_encode(lstream_t encoding, const Bufbyte * src,
231 unsigned_char_dynarr * dst, Lstream_data_count n);
233 typedef struct codesys_prop codesys_prop;
234 struct codesys_prop {
240 Dynarr_declare(codesys_prop);
241 } codesys_prop_dynarr;
243 static const struct lrecord_description codesys_prop_description_1[] = {
244 {XD_LISP_OBJECT, offsetof(codesys_prop, sym)},
248 static const struct struct_description codesys_prop_description = {
249 sizeof(codesys_prop),
250 codesys_prop_description_1
253 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
254 XD_DYNARR_DESC(codesys_prop_dynarr, &codesys_prop_description),
258 static const struct struct_description codesys_prop_dynarr_description = {
259 sizeof(codesys_prop_dynarr),
260 codesys_prop_dynarr_description_1
263 codesys_prop_dynarr *the_codesys_prop_dynarr;
265 enum codesys_prop_enum {
267 CODESYS_PROP_ISO2022,
271 /************************************************************************/
272 /* Coding system functions */
273 /************************************************************************/
275 static Lisp_Object mark_coding_system(Lisp_Object);
276 static void print_coding_system(Lisp_Object, Lisp_Object, int);
277 static void finalize_coding_system(void *header, int for_disksave);
280 static const struct lrecord_description ccs_description_1[] = {
281 {XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset)},
282 {XD_LISP_OBJECT, offsetof(charset_conversion_spec, to_charset)},
286 static const struct struct_description ccs_description = {
287 sizeof(charset_conversion_spec),
291 static const struct lrecord_description ccsd_description_1[] = {
292 XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
296 static const struct struct_description ccsd_description = {
297 sizeof(charset_conversion_spec_dynarr),
302 static const struct lrecord_description coding_system_description[] = {
303 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, name)},
304 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, doc_string)},
305 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, mnemonic)},
306 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, post_read_conversion)},
307 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, pre_write_conversion)},
308 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_lf)},
309 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_crlf)},
310 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_cr)},
312 {XD_LISP_OBJECT_ARRAY,
313 offsetof(Lisp_Coding_System, iso2022.initial_charset), 4},
314 {XD_STRUCT_PTR, offsetof(Lisp_Coding_System, iso2022.input_conv), 1,
316 {XD_STRUCT_PTR, offsetof(Lisp_Coding_System, iso2022.output_conv), 1,
318 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, ccl.decode)},
319 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, ccl.encode)},
324 DEFINE_LRECORD_IMPLEMENTATION("coding-system", coding_system,
325 mark_coding_system, print_coding_system,
326 finalize_coding_system,
327 0, 0, coding_system_description,
330 static Lisp_Object mark_coding_system(Lisp_Object obj)
332 Lisp_Coding_System *codesys = XCODING_SYSTEM(obj);
334 mark_object(CODING_SYSTEM_NAME(codesys));
335 mark_object(CODING_SYSTEM_DOC_STRING(codesys));
336 mark_object(CODING_SYSTEM_MNEMONIC(codesys));
337 mark_object(CODING_SYSTEM_EOL_LF(codesys));
338 mark_object(CODING_SYSTEM_EOL_CRLF(codesys));
339 mark_object(CODING_SYSTEM_EOL_CR(codesys));
341 switch (CODING_SYSTEM_TYPE(codesys)) {
344 case CODESYS_ISO2022:
345 for (i = 0; i < 4; i++)
346 mark_object(CODING_SYSTEM_ISO2022_INITIAL_CHARSET
348 if (codesys->iso2022.input_conv) {
350 i < Dynarr_length(codesys->iso2022.input_conv);
352 struct charset_conversion_spec *ccs =
353 Dynarr_atp(codesys->iso2022.input_conv, i);
354 mark_object(ccs->from_charset);
355 mark_object(ccs->to_charset);
358 if (codesys->iso2022.output_conv) {
360 i < Dynarr_length(codesys->iso2022.output_conv);
362 struct charset_conversion_spec *ccs =
363 Dynarr_atp(codesys->iso2022.output_conv, i);
364 mark_object(ccs->from_charset);
365 mark_object(ccs->to_charset);
371 mark_object(CODING_SYSTEM_CCL_DECODE(codesys));
372 mark_object(CODING_SYSTEM_CCL_ENCODE(codesys));
375 /* list the rest of them lot explicitly */
376 case CODESYS_AUTODETECT:
377 case CODESYS_SHIFT_JIS:
381 case CODESYS_NO_CONVERSION:
383 case CODESYS_INTERNAL:
390 mark_object(CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys));
391 return CODING_SYSTEM_POST_READ_CONVERSION(codesys);
395 print_coding_system(Lisp_Object obj, Lisp_Object printcharfun, int escapeflag)
397 Lisp_Coding_System *c = XCODING_SYSTEM(obj);
399 error("printing unreadable object #<coding-system 0x%x>",
402 write_c_string("#<coding-system ", printcharfun);
403 print_internal(c->name, printcharfun, 1);
404 write_c_string(">", printcharfun);
407 static void finalize_coding_system(void *header, int for_disksave)
409 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
410 /* Since coding systems never go away, this function is not
411 necessary. But it would be necessary if we changed things
412 so that coding systems could go away. */
413 if (!for_disksave) { /* see comment in lstream.c */
414 switch (CODING_SYSTEM_TYPE(c)) {
416 case CODESYS_ISO2022:
417 if (c->iso2022.input_conv) {
418 Dynarr_free(c->iso2022.input_conv);
419 c->iso2022.input_conv = 0;
421 if (c->iso2022.output_conv) {
422 Dynarr_free(c->iso2022.output_conv);
423 c->iso2022.output_conv = 0;
427 /* list the rest of them lot explicitly */
428 case CODESYS_AUTODETECT:
429 case CODESYS_SHIFT_JIS:
434 case CODESYS_NO_CONVERSION:
436 case CODESYS_INTERNAL:
445 static eol_type_t symbol_to_eol_type(Lisp_Object symbol)
447 CHECK_SYMBOL(symbol);
449 return EOL_AUTODETECT;
452 if (EQ(symbol, Qcrlf))
457 signal_simple_error("Unrecognized eol type", symbol);
458 return EOL_AUTODETECT; /* not reached */
461 static Lisp_Object eol_type_to_symbol(eol_type_t type)
478 static void setup_eol_coding_systems(Lisp_Coding_System * codesys)
480 Lisp_Object codesys_obj;
481 int len = string_length(XSYMBOL(CODING_SYSTEM_NAME(codesys))->name);
482 char *codesys_name = (char *)alloca(len + 7);
484 char *codesys_mnemonic = 0;
486 Lisp_Object codesys_name_sym, sub_codesys_obj;
490 XSETCODING_SYSTEM(codesys_obj, codesys);
493 string_data(XSYMBOL(CODING_SYSTEM_NAME(codesys))->name), len);
495 if (STRINGP(CODING_SYSTEM_MNEMONIC(codesys))) {
496 mlen = XSTRING_LENGTH(CODING_SYSTEM_MNEMONIC(codesys));
497 codesys_mnemonic = (char *)alloca(mlen + 7);
498 memcpy(codesys_mnemonic,
499 XSTRING_DATA(CODING_SYSTEM_MNEMONIC(codesys)), mlen);
501 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
502 strcpy (codesys_name + len, "-" op_sys); \
504 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
505 codesys_name_sym = intern (codesys_name); \
506 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
507 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
509 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
510 build_string (codesys_mnemonic); \
511 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
514 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
515 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
516 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
519 DEFUN("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
520 Return t if OBJECT is a coding system.
521 A coding system is an object that defines how text containing multiple
522 character sets is encoded into a stream of (typically 8-bit) bytes.
523 The coding system is used to decode the stream into a series of
524 characters (which may be from multiple charsets) when the text is read
525 from a file or process, and is used to encode the text back into the
526 same format when it is written out to a file or process.
528 For example, many ISO2022-compliant coding systems (such as Compound
529 Text, which is used for inter-client data under the X Window System)
530 use escape sequences to switch between different charsets -- Japanese
531 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
532 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
533 `make-coding-system' for more information.
535 Coding systems are normally identified using a symbol, and the
536 symbol is accepted in place of the actual coding system object whenever
537 a coding system is called for. (This is similar to how faces work.)
541 return CODING_SYSTEMP(object) ? Qt : Qnil;
544 DEFUN("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
545 Retrieve the coding system of the given name.
547 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
548 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
549 If there is no such coding system, nil is returned. Otherwise the
550 associated coding system object is returned.
552 (coding_system_or_name))
554 if (NILP(coding_system_or_name))
555 coding_system_or_name = Qbinary;
556 else if (CODING_SYSTEMP(coding_system_or_name))
557 return coding_system_or_name;
559 CHECK_SYMBOL(coding_system_or_name);
562 coding_system_or_name =
563 Fgethash(coding_system_or_name, Vcoding_system_hash_table,
566 if (CODING_SYSTEMP(coding_system_or_name)
567 || NILP(coding_system_or_name))
568 return coding_system_or_name;
572 DEFUN("get-coding-system", Fget_coding_system, 1, 1, 0, /*
573 Retrieve the coding system of the given name.
574 Same as `find-coding-system' except that if there is no such
575 coding system, an error is signaled instead of returning nil.
579 Lisp_Object coding_system = Ffind_coding_system(name);
581 if (NILP(coding_system))
582 signal_simple_error("No such coding system", name);
583 return coding_system;
586 /* We store the coding systems in hash tables with the names as the key and the
587 actual coding system object as the value. Occasionally we need to use them
588 in a list format. These routines provide us with that. */
589 struct coding_system_list_closure {
590 Lisp_Object *coding_system_list;
594 add_coding_system_to_list_mapper(Lisp_Object key, Lisp_Object value,
595 void *coding_system_list_closure)
597 /* This function can GC */
598 struct coding_system_list_closure *cscl =
599 (struct coding_system_list_closure *)coding_system_list_closure;
600 Lisp_Object *coding_system_list = cscl->coding_system_list;
602 *coding_system_list = Fcons(key, *coding_system_list);
606 DEFUN("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
607 Return a list of the names of all defined coding systems.
611 Lisp_Object coding_system_list = Qnil;
613 struct coding_system_list_closure coding_system_list_closure;
615 GCPRO1(coding_system_list);
616 coding_system_list_closure.coding_system_list = &coding_system_list;
617 elisp_maphash(add_coding_system_to_list_mapper,
618 Vcoding_system_hash_table, &coding_system_list_closure);
621 return coding_system_list;
624 DEFUN("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
625 Return the name of the given coding system.
629 coding_system = Fget_coding_system(coding_system);
630 return XCODING_SYSTEM_NAME(coding_system);
633 static Lisp_Coding_System *allocate_coding_system(enum coding_system_type type,
636 Lisp_Coding_System *codesys =
637 alloc_lcrecord_type(Lisp_Coding_System, &lrecord_coding_system);
639 zero_lcrecord(codesys);
640 CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys) = Qnil;
641 CODING_SYSTEM_POST_READ_CONVERSION(codesys) = Qnil;
642 CODING_SYSTEM_EOL_TYPE(codesys) = EOL_AUTODETECT;
643 CODING_SYSTEM_EOL_CRLF(codesys) = Qnil;
644 CODING_SYSTEM_EOL_CR(codesys) = Qnil;
645 CODING_SYSTEM_EOL_LF(codesys) = Qnil;
646 CODING_SYSTEM_TYPE(codesys) = type;
647 CODING_SYSTEM_MNEMONIC(codesys) = Qnil;
649 if (type == CODESYS_ISO2022) {
651 for (i = 0; i < 4; i++)
652 CODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, i) =
654 } else if (type == CODESYS_CCL) {
655 CODING_SYSTEM_CCL_DECODE(codesys) = Qnil;
656 CODING_SYSTEM_CCL_ENCODE(codesys) = Qnil;
659 CODING_SYSTEM_NAME(codesys) = name;
665 /* Given a list of charset conversion specs as specified in a Lisp
666 program, parse it into STORE_HERE. */
669 parse_charset_conversion_specs(charset_conversion_spec_dynarr * store_here,
670 Lisp_Object spec_list)
674 EXTERNAL_LIST_LOOP(rest, spec_list) {
675 Lisp_Object car = XCAR(rest);
676 Lisp_Object from, to;
677 struct charset_conversion_spec spec;
679 if (!CONSP(car) || !CONSP(XCDR(car)) || !NILP(XCDR(XCDR(car))))
680 signal_simple_error("Invalid charset conversion spec",
682 from = Fget_charset(XCAR(car));
683 to = Fget_charset(XCAR(XCDR(car)));
684 if (XCHARSET_TYPE(from) != XCHARSET_TYPE(to))
685 signal_simple_error_2
686 ("Attempted conversion between different charset types",
688 spec.from_charset = from;
689 spec.to_charset = to;
691 Dynarr_add(store_here, spec);
695 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
696 specs, return the equivalent as the Lisp programmer would see it.
698 If LOAD_HERE is 0, return Qnil. */
701 unparse_charset_conversion_specs(charset_conversion_spec_dynarr * load_here)
708 for (i = 0, result = Qnil; i < Dynarr_length(load_here); i++) {
709 struct charset_conversion_spec *ccs = Dynarr_atp(load_here, i);
711 Fcons(list2(ccs->from_charset, ccs->to_charset), result);
714 return Fnreverse(result);
719 DEFUN("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
720 Register symbol NAME as a coding system.
722 TYPE describes the conversion method used and should be one of
725 Automatic conversion. SXEmacs attempts to detect the coding system
728 No conversion. Use this for binary files and such. On output,
729 graphic characters that are not in ASCII or Latin-1 will be
730 replaced by a ?. (For a no-conversion-encoded buffer, these
731 characters will only be present if you explicitly insert them.)
733 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
735 ISO 10646 UCS-4 encoding.
737 ISO 10646 UTF-8 encoding.
739 Any ISO2022-compliant encoding. Among other things, this includes
740 JIS (the Japanese encoding commonly used for e-mail), EUC (the
741 standard Unix encoding for Japanese and other languages), and
742 Compound Text (the encoding used in X11). You can specify more
743 specific information about the conversion with the PROPS argument.
745 Big5 (the encoding commonly used for Taiwanese).
747 The conversion is performed using a user-written pseudo-code
748 program. CCL (Code Conversion Language) is the name of this
751 Write out or read in the raw contents of the memory representing
752 the buffer's text. This is primarily useful for debugging
753 purposes, and is only enabled when SXEmacs has been compiled with
754 DEBUG_SXEMACS defined (via the --debug configure option).
755 WARNING: Reading in a file using 'internal conversion can result
756 in an internal inconsistency in the memory representing a
757 buffer's text, which will produce unpredictable results and may
758 cause SXEmacs to crash. Under normal circumstances you should
759 never use 'internal conversion.
761 DOC-STRING is a string describing the coding system.
763 PROPS is a property list, describing the specific nature of the
764 character set. Recognized properties are:
767 String to be displayed in the modeline when this coding system is
771 End-of-line conversion to be used. It should be one of
774 Automatically detect the end-of-line type (LF, CRLF,
775 or CR). Also generate subsidiary coding systems named
776 `NAME-unix', `NAME-dos', and `NAME-mac', that are
777 identical to this coding system but have an EOL-TYPE
778 value of 'lf, 'crlf, and 'cr, respectively.
780 The end of a line is marked externally using ASCII LF.
781 Since this is also the way that SXEmacs represents an
782 end-of-line internally, specifying this option results
783 in no end-of-line conversion. This is the standard
784 format for Unix text files.
786 The end of a line is marked externally using ASCII
787 CRLF. This is the standard format for MS-DOS text
790 The end of a line is marked externally using ASCII CR.
791 This is the standard format for Macintosh text files.
793 Automatically detect the end-of-line type but do not
794 generate subsidiary coding systems. (This value is
795 converted to nil when stored internally, and
796 `coding-system-property' will return nil.)
798 'post-read-conversion
799 Function called after a file has been read in, to perform the
800 decoding. Called with two arguments, START and END, denoting
801 a region of the current buffer to be decoded.
803 'pre-write-conversion
804 Function called before a file is written out, to perform the
805 encoding. Called with two arguments, START and END, denoting
806 a region of the current buffer to be encoded.
808 The following additional properties are recognized if TYPE is 'iso2022:
814 The character set initially designated to the G0 - G3 registers.
815 The value should be one of
817 -- A charset object (designate that character set)
818 -- nil (do not ever use this register)
819 -- t (no character set is initially designated to
820 the register, but may be later on; this automatically
821 sets the corresponding `force-g*-on-output' property)
827 If non-nil, send an explicit designation sequence on output before
828 using the specified register.
831 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
832 "ESC $ B" on output in place of the full designation sequences
833 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
836 If non-nil, don't designate ASCII to G0 at each end of line on output.
837 Setting this to non-nil also suppresses other state-resetting that
838 normally happens at the end of a line.
841 If non-nil, don't designate ASCII to G0 before control chars on output.
844 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
848 If non-nil, use locking-shift (SO/SI) instead of single-shift
849 or designation by escape sequence.
852 If non-nil, don't use ISO6429's direction specification.
855 If non-nil, literal control characters that are the same as
856 the beginning of a recognized ISO2022 or ISO6429 escape sequence
857 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
858 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
859 so that they can be properly distinguished from an escape sequence.
860 (Note that doing this results in a non-portable encoding.) This
861 encoding flag is used for byte-compiled files. Note that ESC
862 is a good choice for a quoting character because there are no
863 escape sequences whose second byte is a character from the Control-0
864 or Control-1 character sets; this is explicitly disallowed by the
867 'input-charset-conversion
868 A list of conversion specifications, specifying conversion of
869 characters in one charset to another when decoding is performed.
870 Each specification is a list of two elements: the source charset,
871 and the destination charset.
873 'output-charset-conversion
874 A list of conversion specifications, specifying conversion of
875 characters in one charset to another when encoding is performed.
876 The form of each specification is the same as for
877 'input-charset-conversion.
879 The following additional properties are recognized (and required)
883 CCL program used for decoding (converting to internal format).
886 CCL program used for encoding (converting to external format).
888 (name, type, doc_string, props))
890 Lisp_Coding_System *codesys;
891 enum coding_system_type ty;
892 int need_to_setup_eol_systems = 1;
894 /* Convert type to constant */
895 if (NILP(type) || EQ(type, Qundecided)) {
896 ty = CODESYS_AUTODETECT;
899 else if (EQ(type, Qshift_jis)) {
900 ty = CODESYS_SHIFT_JIS;
901 } else if (EQ(type, Qiso2022)) {
902 ty = CODESYS_ISO2022;
903 } else if (EQ(type, Qbig5)) {
905 } else if (EQ(type, Qucs4)) {
907 } else if (EQ(type, Qutf8)) {
909 } else if (EQ(type, Qccl)) {
913 else if (EQ(type, Qno_conversion)) {
914 ty = CODESYS_NO_CONVERSION;
917 else if (EQ(type, Qinternal)) {
918 ty = CODESYS_INTERNAL;
922 signal_simple_error("Invalid coding system type", type);
926 codesys = allocate_coding_system(ty, name);
928 if (NILP(doc_string))
929 doc_string = build_string("");
931 CHECK_STRING(doc_string);
932 CODING_SYSTEM_DOC_STRING(codesys) = doc_string;
935 EXTERNAL_PROPERTY_LIST_LOOP_3(key, value, props) {
936 if (EQ(key, Qmnemonic)) {
939 CODING_SYSTEM_MNEMONIC(codesys) = value;
942 else if (EQ(key, Qeol_type)) {
943 need_to_setup_eol_systems = NILP(value);
946 CODING_SYSTEM_EOL_TYPE(codesys) =
947 symbol_to_eol_type(value);
950 else if (EQ(key, Qpost_read_conversion))
951 CODING_SYSTEM_POST_READ_CONVERSION(codesys) =
953 else if (EQ(key, Qpre_write_conversion))
954 CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys) =
957 else if (ty == CODESYS_ISO2022) {
958 #define FROB_INITIAL_CHARSET(charset_num) \
959 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
960 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
962 if (EQ(key, Qcharset_g0))
963 FROB_INITIAL_CHARSET(0);
964 else if (EQ(key, Qcharset_g1))
965 FROB_INITIAL_CHARSET(1);
966 else if (EQ(key, Qcharset_g2))
967 FROB_INITIAL_CHARSET(2);
968 else if (EQ(key, Qcharset_g3))
969 FROB_INITIAL_CHARSET(3);
971 #define FROB_FORCE_CHARSET(charset_num) \
972 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
974 else if (EQ(key, Qforce_g0_on_output))
975 FROB_FORCE_CHARSET(0);
976 else if (EQ(key, Qforce_g1_on_output))
977 FROB_FORCE_CHARSET(1);
978 else if (EQ(key, Qforce_g2_on_output))
979 FROB_FORCE_CHARSET(2);
980 else if (EQ(key, Qforce_g3_on_output))
981 FROB_FORCE_CHARSET(3);
983 #define FROB_BOOLEAN_PROPERTY(prop) \
984 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
986 else if (EQ(key, Qshort))
987 FROB_BOOLEAN_PROPERTY(SHORT);
988 else if (EQ(key, Qno_ascii_eol))
989 FROB_BOOLEAN_PROPERTY(NO_ASCII_EOL);
990 else if (EQ(key, Qno_ascii_cntl))
991 FROB_BOOLEAN_PROPERTY(NO_ASCII_CNTL);
992 else if (EQ(key, Qseven))
993 FROB_BOOLEAN_PROPERTY(SEVEN);
994 else if (EQ(key, Qlock_shift))
995 FROB_BOOLEAN_PROPERTY(LOCK_SHIFT);
996 else if (EQ(key, Qno_iso6429))
997 FROB_BOOLEAN_PROPERTY(NO_ISO6429);
998 else if (EQ(key, Qescape_quoted))
999 FROB_BOOLEAN_PROPERTY(ESCAPE_QUOTED);
1001 else if (EQ(key, Qinput_charset_conversion)) {
1002 codesys->iso2022.input_conv =
1003 Dynarr_new(charset_conversion_spec);
1004 parse_charset_conversion_specs(codesys->
1008 } else if (EQ(key, Qoutput_charset_conversion)) {
1009 codesys->iso2022.output_conv =
1010 Dynarr_new(charset_conversion_spec);
1011 parse_charset_conversion_specs(codesys->
1017 ("Unrecognized property", key);
1018 } else if (EQ(type, Qccl)) {
1020 struct ccl_program test_ccl;
1023 /* Check key first. */
1024 if (EQ(key, Qdecode))
1025 suffix = "-ccl-decode";
1026 else if (EQ(key, Qencode))
1027 suffix = "-ccl-encode";
1030 ("Unrecognized property", key);
1032 /* If value is vector, register it as a ccl program
1033 associated with an newly created symbol for
1034 backward compatibility. */
1035 if (VECTORP(value)) {
1038 (Fsymbol_name(name),
1039 build_string(suffix)),
1041 Fregister_ccl_program(sym, value);
1043 CHECK_SYMBOL(value);
1046 /* check if the given ccl programs are valid. */
1047 if (setup_ccl_program(&test_ccl, sym) < 0)
1049 ("Invalid CCL program", value);
1051 if (EQ(key, Qdecode))
1052 CODING_SYSTEM_CCL_DECODE(codesys) = sym;
1053 else if (EQ(key, Qencode))
1054 CODING_SYSTEM_CCL_ENCODE(codesys) = sym;
1059 signal_simple_error("Unrecognized property",
1064 if (need_to_setup_eol_systems)
1065 setup_eol_coding_systems(codesys);
1068 Lisp_Object codesys_obj;
1069 XSETCODING_SYSTEM(codesys_obj, codesys);
1070 Fputhash(name, codesys_obj, Vcoding_system_hash_table);
1075 DEFUN("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1076 Copy OLD-CODING-SYSTEM to NEW-NAME.
1077 If NEW-NAME does not name an existing coding system, a new one will
1080 (old_coding_system, new_name))
1082 Lisp_Object new_coding_system;
1083 old_coding_system = Fget_coding_system(old_coding_system);
1084 new_coding_system = Ffind_coding_system(new_name);
1085 if (NILP(new_coding_system)) {
1086 XSETCODING_SYSTEM(new_coding_system,
1087 allocate_coding_system
1088 (XCODING_SYSTEM_TYPE(old_coding_system),
1090 Fputhash(new_name, new_coding_system,
1091 Vcoding_system_hash_table);
1095 Lisp_Coding_System *to = XCODING_SYSTEM(new_coding_system);
1096 Lisp_Coding_System *from = XCODING_SYSTEM(old_coding_system);
1097 memcpy(((char *)to) + sizeof(to->header),
1098 ((char *)from) + sizeof(from->header),
1099 sizeof(*from) - sizeof(from->header));
1100 to->name = new_name;
1102 return new_coding_system;
1105 DEFUN("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1106 Return t if OBJECT names a coding system, and is not a coding system alias.
1110 Lisp_Object val = Fgethash(object, Vcoding_system_hash_table, Qnil);
1111 return CODING_SYSTEMP(val) ? Qt : Qnil;
1114 DEFUN("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1115 Return t if OBJECT is a coding system alias.
1116 All coding system aliases are created by `define-coding-system-alias'.
1120 Lisp_Object val = Fgethash(object, Vcoding_system_hash_table, Qzero);
1121 return SYMBOLP(val) ? Qt : Qnil;
1124 DEFUN("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1125 Return the coding-system symbol for which symbol ALIAS is an alias.
1129 Lisp_Object aliasee = Fgethash(alias, Vcoding_system_hash_table, Qnil);
1130 if (SYMBOLP(aliasee)) {
1133 signal_simple_error(
1134 "Symbol is not a coding system alias", alias);
1136 /* To keep the compiler happy */
1141 append_suffix_to_symbol(Lisp_Object symbol, char *ascii_string)
1143 return Fintern(concat2(Fsymbol_name(symbol),
1144 build_string(ascii_string)), Qnil);
1147 /* A maphash function, for removing dangling coding system aliases. */
1149 dangling_coding_system_alias_p(Lisp_Object alias,
1150 Lisp_Object aliasee, void *dangling_aliases)
1152 if (SYMBOLP(aliasee)
1153 && NILP(Fgethash(aliasee, Vcoding_system_hash_table, Qnil))) {
1154 (*(int *)dangling_aliases)++;
1161 DEFUN("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1162 Define symbol ALIAS as an alias for coding system ALIASEE.
1164 You can use this function to redefine an alias that has already been defined,
1165 but you cannot redefine a name which is the canonical name for a coding system.
1166 \(a canonical name of a coding system is what is returned when you call
1167 `coding-system-name' on a coding system).
1169 ALIASEE itself can be an alias, which allows you to define nested aliases.
1171 You are forbidden, however, from creating alias loops or `dangling' aliases.
1172 These will be detected, and an error will be signaled if you attempt to do so.
1174 If ALIASEE is nil, then ALIAS will simply be undefined.
1176 See also `coding-system-alias-p', `coding-system-aliasee',
1177 and `coding-system-canonical-name-p'.
1181 Lisp_Object real_coding_system, probe;
1183 CHECK_SYMBOL(alias);
1185 if (!NILP(Fcoding_system_canonical_name_p(alias)))
1187 ("Symbol is the canonical name of a coding system and cannot be redefined",
1190 if (NILP(aliasee)) {
1191 Lisp_Object subsidiary_unix =
1192 append_suffix_to_symbol(alias, "-unix");
1193 Lisp_Object subsidiary_dos =
1194 append_suffix_to_symbol(alias, "-dos");
1195 Lisp_Object subsidiary_mac =
1196 append_suffix_to_symbol(alias, "-mac");
1198 Fremhash(alias, Vcoding_system_hash_table);
1200 /* Undefine subsidiary aliases,
1201 presumably created by a previous call to this function */
1202 if (!NILP(Fcoding_system_alias_p(subsidiary_unix)) &&
1203 !NILP(Fcoding_system_alias_p(subsidiary_dos)) &&
1204 !NILP(Fcoding_system_alias_p(subsidiary_mac))) {
1205 Fdefine_coding_system_alias(subsidiary_unix, Qnil);
1206 Fdefine_coding_system_alias(subsidiary_dos, Qnil);
1207 Fdefine_coding_system_alias(subsidiary_mac, Qnil);
1210 /* Undefine dangling coding system aliases. */
1212 int dangling_aliases;
1215 dangling_aliases = 0;
1217 (dangling_coding_system_alias_p,
1218 Vcoding_system_hash_table,
1220 } while (dangling_aliases > 0);
1226 if (CODING_SYSTEMP(aliasee))
1227 aliasee = XCODING_SYSTEM_NAME(aliasee);
1229 /* Checks that aliasee names a coding-system */
1230 real_coding_system = Fget_coding_system(aliasee);
1232 /* Check for coding system alias loops */
1233 if (EQ(alias, aliasee))
1234 alias_loop:signal_simple_error_2
1235 ("Attempt to create a coding system alias loop", alias,
1238 for (probe = aliasee;
1240 probe = Fgethash(probe, Vcoding_system_hash_table, Qzero)) {
1241 if (EQ(probe, alias))
1245 Fputhash(alias, aliasee, Vcoding_system_hash_table);
1247 /* Set up aliases for subsidiaries.
1248 #### There must be a better way to handle subsidiary coding
1251 static char *suffixes[] = { "-unix", "-dos", "-mac" };
1253 for (int i = 0; i < countof(suffixes); i++) {
1254 Lisp_Object alias_subsidiary =
1255 append_suffix_to_symbol(alias, suffixes[i]);
1256 Lisp_Object aliasee_subsidiary =
1257 append_suffix_to_symbol(aliasee, suffixes[i]);
1259 if (!NILP(Ffind_coding_system(aliasee_subsidiary))) {
1260 Fdefine_coding_system_alias(alias_subsidiary,
1261 aliasee_subsidiary);
1265 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1266 but it doesn't look intentional, so I'd rather return something
1267 meaningful or nothing at all. */
1272 subsidiary_coding_system(Lisp_Object coding_system, eol_type_t type)
1274 Lisp_Coding_System *cs = XCODING_SYSTEM(coding_system);
1275 Lisp_Object new_coding_system;
1277 if (CODING_SYSTEM_EOL_TYPE(cs) != EOL_AUTODETECT)
1278 return coding_system;
1281 case EOL_AUTODETECT:
1282 return coding_system;
1284 new_coding_system = CODING_SYSTEM_EOL_LF(cs);
1287 new_coding_system = CODING_SYSTEM_EOL_CR(cs);
1290 new_coding_system = CODING_SYSTEM_EOL_CRLF(cs);
1297 return NILP(new_coding_system) ? coding_system : new_coding_system;
1300 DEFUN("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1301 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1303 (coding_system, eol_type))
1305 coding_system = Fget_coding_system(coding_system);
1307 return subsidiary_coding_system(coding_system,
1308 symbol_to_eol_type(eol_type));
1311 /************************************************************************/
1312 /* Coding system accessors */
1313 /************************************************************************/
1315 DEFUN("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1316 Return the doc string for CODING-SYSTEM.
1320 coding_system = Fget_coding_system(coding_system);
1321 return XCODING_SYSTEM_DOC_STRING(coding_system);
1324 DEFUN("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1325 Return the type of CODING-SYSTEM.
1329 Lisp_Object tmp = Fget_coding_system(coding_system);
1331 switch (XCODING_SYSTEM_TYPE(tmp)) {
1335 case CODESYS_AUTODETECT:
1338 case CODESYS_SHIFT_JIS:
1340 case CODESYS_ISO2022:
1351 case CODESYS_NO_CONVERSION:
1352 return Qno_conversion;
1353 #ifdef DEBUG_SXEMACS
1354 case CODESYS_INTERNAL:
1362 Lisp_Object coding_system_charset(Lisp_Object coding_system, int gnum)
1365 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET(coding_system, gnum);
1367 return CHARSETP(cs) ? XCHARSET_NAME(cs) : Qnil;
1370 DEFUN("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1371 Return initial charset of CODING-SYSTEM designated to GNUM.
1374 (coding_system, gnum))
1376 coding_system = Fget_coding_system(coding_system);
1379 return coding_system_charset(coding_system, XINT(gnum));
1383 DEFUN("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1384 Return the PROP property of CODING-SYSTEM.
1386 (coding_system, prop))
1389 enum coding_system_type type;
1391 coding_system = Fget_coding_system(coding_system);
1393 type = XCODING_SYSTEM_TYPE(coding_system);
1395 for (i = 0; !ok && i < Dynarr_length(the_codesys_prop_dynarr); i++)
1396 if (EQ(Dynarr_at(the_codesys_prop_dynarr, i).sym, prop)) {
1398 switch (Dynarr_at(the_codesys_prop_dynarr, i).prop_type) {
1399 case CODESYS_PROP_ALL_OK:
1402 case CODESYS_PROP_ISO2022:
1403 if (type != CODESYS_ISO2022)
1405 ("Property only valid in ISO2022 coding systems",
1409 case CODESYS_PROP_CCL:
1410 if (type != CODESYS_CCL)
1412 ("Property only valid in CCL coding systems",
1422 signal_simple_error("Unrecognized property", prop);
1424 if (EQ(prop, Qname))
1425 return XCODING_SYSTEM_NAME(coding_system);
1426 else if (EQ(prop, Qtype))
1427 return Fcoding_system_type(coding_system);
1428 else if (EQ(prop, Qdoc_string))
1429 return XCODING_SYSTEM_DOC_STRING(coding_system);
1430 else if (EQ(prop, Qmnemonic))
1431 return XCODING_SYSTEM_MNEMONIC(coding_system);
1432 else if (EQ(prop, Qeol_type))
1434 eol_type_to_symbol(XCODING_SYSTEM_EOL_TYPE(coding_system));
1435 else if (EQ(prop, Qeol_lf))
1436 return XCODING_SYSTEM_EOL_LF(coding_system);
1437 else if (EQ(prop, Qeol_crlf))
1438 return XCODING_SYSTEM_EOL_CRLF(coding_system);
1439 else if (EQ(prop, Qeol_cr))
1440 return XCODING_SYSTEM_EOL_CR(coding_system);
1441 else if (EQ(prop, Qpost_read_conversion))
1442 return XCODING_SYSTEM_POST_READ_CONVERSION(coding_system);
1443 else if (EQ(prop, Qpre_write_conversion))
1444 return XCODING_SYSTEM_PRE_WRITE_CONVERSION(coding_system);
1446 else if (type == CODESYS_ISO2022) {
1447 if (EQ(prop, Qcharset_g0))
1448 return coding_system_charset(coding_system, 0);
1449 else if (EQ(prop, Qcharset_g1))
1450 return coding_system_charset(coding_system, 1);
1451 else if (EQ(prop, Qcharset_g2))
1452 return coding_system_charset(coding_system, 2);
1453 else if (EQ(prop, Qcharset_g3))
1454 return coding_system_charset(coding_system, 3);
1456 #define FORCE_CHARSET(charset_num) \
1457 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1458 (coding_system, charset_num) ? Qt : Qnil)
1460 else if (EQ(prop, Qforce_g0_on_output))
1461 return FORCE_CHARSET(0);
1462 else if (EQ(prop, Qforce_g1_on_output))
1463 return FORCE_CHARSET(1);
1464 else if (EQ(prop, Qforce_g2_on_output))
1465 return FORCE_CHARSET(2);
1466 else if (EQ(prop, Qforce_g3_on_output))
1467 return FORCE_CHARSET(3);
1469 #define LISP_BOOLEAN(prop) \
1470 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1472 else if (EQ(prop, Qshort))
1473 return LISP_BOOLEAN(SHORT);
1474 else if (EQ(prop, Qno_ascii_eol))
1475 return LISP_BOOLEAN(NO_ASCII_EOL);
1476 else if (EQ(prop, Qno_ascii_cntl))
1477 return LISP_BOOLEAN(NO_ASCII_CNTL);
1478 else if (EQ(prop, Qseven))
1479 return LISP_BOOLEAN(SEVEN);
1480 else if (EQ(prop, Qlock_shift))
1481 return LISP_BOOLEAN(LOCK_SHIFT);
1482 else if (EQ(prop, Qno_iso6429))
1483 return LISP_BOOLEAN(NO_ISO6429);
1484 else if (EQ(prop, Qescape_quoted))
1485 return LISP_BOOLEAN(ESCAPE_QUOTED);
1487 else if (EQ(prop, Qinput_charset_conversion))
1489 unparse_charset_conversion_specs
1490 (XCODING_SYSTEM(coding_system)->iso2022.input_conv);
1491 else if (EQ(prop, Qoutput_charset_conversion))
1493 unparse_charset_conversion_specs
1494 (XCODING_SYSTEM(coding_system)->iso2022.
1498 } else if (type == CODESYS_CCL) {
1499 if (EQ(prop, Qdecode))
1500 return XCODING_SYSTEM_CCL_DECODE(coding_system);
1501 else if (EQ(prop, Qencode))
1502 return XCODING_SYSTEM_CCL_ENCODE(coding_system);
1510 return Qnil; /* not reached */
1513 /************************************************************************/
1514 /* Coding category functions */
1515 /************************************************************************/
1517 static int decode_coding_category(Lisp_Object symbol)
1521 CHECK_SYMBOL(symbol);
1522 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1523 if (EQ(coding_category_symbol[i], symbol))
1526 signal_simple_error("Unrecognized coding category", symbol);
1527 return 0; /* not reached */
1530 DEFUN("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1531 Return a list of all recognized coding categories.
1536 Lisp_Object list = Qnil;
1538 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1539 list = Fcons(coding_category_symbol[i], list);
1543 DEFUN("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1544 Change the priority order of the coding categories.
1545 LIST should be list of coding categories, in descending order of
1546 priority. Unspecified coding categories will be lower in priority
1547 than all specified ones, in the same relative order they were in
1552 int category_to_priority[CODING_CATEGORY_LAST];
1556 /* First generate a list that maps coding categories to priorities. */
1558 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1559 category_to_priority[i] = -1;
1561 /* Highest priority comes from the specified list. */
1563 EXTERNAL_LIST_LOOP(rest, list) {
1564 int cat = decode_coding_category(XCAR(rest));
1566 if (category_to_priority[cat] >= 0)
1567 signal_simple_error("Duplicate coding category in list",
1569 category_to_priority[cat] = i++;
1572 /* Now go through the existing categories by priority to retrieve
1573 the categories not yet specified and preserve their priority
1575 for (j = 0; j < CODING_CATEGORY_LAST; j++) {
1576 int cat = fcd->coding_category_by_priority[j];
1577 if (category_to_priority[cat] < 0)
1578 category_to_priority[cat] = i++;
1581 /* Now we need to construct the inverse of the mapping we just
1584 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1585 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1587 /* Phew! That was confusing. */
1591 DEFUN("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1592 Return a list of coding categories in descending order of priority.
1597 Lisp_Object list = Qnil;
1599 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1601 Fcons(coding_category_symbol
1602 [fcd->coding_category_by_priority[i]], list);
1606 DEFUN("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1607 Change the coding system associated with a coding category.
1609 (coding_category, coding_system))
1611 int cat = decode_coding_category(coding_category);
1613 coding_system = Fget_coding_system(coding_system);
1614 fcd->coding_category_system[cat] = coding_system;
1618 DEFUN("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1619 Return the coding system associated with a coding category.
1623 int cat = decode_coding_category(coding_category);
1624 Lisp_Object sys = fcd->coding_category_system[cat];
1627 return XCODING_SYSTEM_NAME(sys);
1631 /************************************************************************/
1632 /* Detecting the encoding of data */
1633 /************************************************************************/
1635 struct detection_state {
1636 eol_type_t eol_type;
1663 struct iso2022_decoder iso;
1665 int high_byte_count;
1666 unsigned int saw_single_shift:1;
1675 static int acceptable_control_char_p(int c)
1678 /* Allow and ignore control characters that you might
1679 reasonably see in a text file */
1684 case 8: /* backspace */
1685 case 11: /* vertical tab */
1686 case 12: /* form feed */
1687 case 26: /* MS-DOS C-z junk */
1688 case 31: /* '^_' -- for info */
1695 static int mask_has_at_most_one_bit_p(int mask)
1697 /* Perhaps the only thing useful you learn from intensive Microsoft
1698 technical interviews */
1699 return (mask & (mask - 1)) == 0;
1703 detect_eol_type(struct detection_state *st, const Extbyte * src,
1704 Lstream_data_count n)
1707 const unsigned char c = *(const unsigned char*)src++;
1709 if (st->eol.just_saw_cr)
1711 else if (st->eol.seen_anything)
1713 } else if (st->eol.just_saw_cr)
1716 st->eol.just_saw_cr = 1;
1718 st->eol.just_saw_cr = 0;
1719 st->eol.seen_anything = 1;
1722 return EOL_AUTODETECT;
1725 /* Attempt to determine the encoding and EOL type of the given text.
1726 Before calling this function for the first type, you must initialize
1727 st->eol_type as appropriate and initialize st->mask to ~0.
1729 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1732 st->mask holds the determined coding category mask, or ~0 if only
1733 ASCII has been seen so far.
1737 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1738 is present in st->mask
1739 1 == definitive answers are here for both st->eol_type and st->mask
1743 detect_coding_type(struct detection_state *st, const Extbyte * src,
1744 Lstream_data_count n, int just_do_eol)
1746 if (st->eol_type == EOL_AUTODETECT)
1747 st->eol_type = detect_eol_type(st, src, n);
1750 return st->eol_type != EOL_AUTODETECT;
1752 if (!st->seen_non_ascii) {
1753 for (; n; n--, src++) {
1754 const unsigned char c = *(const unsigned char *)src;
1755 if ((c < 0x20 && !acceptable_control_char_p(c))
1757 st->seen_non_ascii = 1;
1759 st->shift_jis.mask = ~0;
1763 st->iso2022.mask = ~0;
1774 if (!mask_has_at_most_one_bit_p(st->iso2022.mask))
1775 st->iso2022.mask = detect_coding_iso2022(st, src, n);
1776 if (!mask_has_at_most_one_bit_p(st->shift_jis.mask))
1777 st->shift_jis.mask = detect_coding_sjis(st, src, n);
1778 if (!mask_has_at_most_one_bit_p(st->big5.mask))
1779 st->big5.mask = detect_coding_big5(st, src, n);
1780 if (!mask_has_at_most_one_bit_p(st->utf8.mask))
1781 st->utf8.mask = detect_coding_utf8(st, src, n);
1782 if (!mask_has_at_most_one_bit_p(st->ucs4.mask))
1783 st->ucs4.mask = detect_coding_ucs4(st, src, n);
1785 st->mask = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1786 | st->utf8.mask | st->ucs4.mask;
1789 int retval = mask_has_at_most_one_bit_p(st->mask);
1790 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1791 return retval && st->eol_type != EOL_AUTODETECT;
1795 static Lisp_Object coding_system_from_mask(int mask)
1798 /* If the file was entirely or basically ASCII, use the
1799 default value of `buffer-file-coding-system'. */
1800 Lisp_Object retval =
1801 XBUFFER(Vbuffer_defaults)->buffer_file_coding_system;
1802 if (!NILP(retval)) {
1803 retval = Ffind_coding_system(retval);
1806 (Qbad_variable, Qwarning,
1807 "Invalid `default-buffer-file-coding-system', set to nil");
1808 XBUFFER(Vbuffer_defaults)->
1809 buffer_file_coding_system = Qnil;
1813 retval = Fget_coding_system(Qraw_text);
1819 mask = postprocess_iso2022_mask(mask);
1821 /* Look through the coding categories by priority and find
1822 the first one that is allowed. */
1823 for (i = 0; i < CODING_CATEGORY_LAST; i++) {
1824 cat = fcd->coding_category_by_priority[i];
1827 if ((mask & (1 << cat)) &&
1828 !NILP(fcd->coding_category_system[cat]))
1832 return fcd->coding_category_system[cat];
1834 return Fget_coding_system(Qraw_text);
1838 /* Given a seekable read stream and potential coding system and EOL type
1839 as specified, do any autodetection that is called for. If the
1840 coding system and/or EOL type are not `autodetect', they will be left
1841 alone; but this function will never return an autodetect coding system
1844 This function does not automatically fetch subsidiary coding systems;
1845 that should be unnecessary with the explicit eol-type argument. */
1847 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1848 /* number of leading lines to check for a coding cookie */
1849 #define LINES_TO_CHECK 2
1852 determine_real_coding_system(lstream_t stream, Lisp_Object * codesys_in_out,
1853 eol_type_t * eol_type_in_out)
1855 static const char mime_name_valid_chars[] =
1856 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1857 "abcdefghijklmnopqrstuvwxyz"
1861 struct detection_state decst;
1863 if (*eol_type_in_out == EOL_AUTODETECT)
1864 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE(*codesys_in_out);
1867 decst.eol_type = *eol_type_in_out;
1870 /* If autodetection is called for, do it now. */
1871 if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT
1872 || *eol_type_in_out == EOL_AUTODETECT) {
1874 Lisp_Object coding_system = Qnil;
1876 Lstream_data_count nread =
1877 Lstream_read(stream, buf, sizeof(buf));
1879 int lines_checked = 0;
1881 /* Look for initial "-*-"; mode line prefix */
1882 for (p = buf, scan_end = buf + nread - LENGTH("-*-coding:?-*-");
1883 p <= scan_end && lines_checked < LINES_TO_CHECK; p++)
1884 if (*p == '-' && *(p + 1) == '*' && *(p + 2) == '-') {
1885 Extbyte *local_vars_beg = p + 3;
1886 /* Look for final "-*-"; mode line suffix */
1887 for (p = local_vars_beg, scan_end = buf + nread - LENGTH("-*-");
1888 p <= scan_end && lines_checked < LINES_TO_CHECK; p++)
1889 if (*p == '-' && *(p + 1) == '*' && *(p + 2) == '-') {
1890 Extbyte *suffix = p;
1891 /* Look for "coding:" */
1892 for (p = local_vars_beg, scan_end = suffix - LENGTH("coding:?");
1893 p <= scan_end; p++) {
1894 if (memcmp("coding:", p, LENGTH("coding:")) != 0)
1896 if (p != local_vars_beg && strchr(" \t;", *p) == NULL )
1900 p += LENGTH("coding:");
1901 while (*p == ' ' || *p == '\t') {
1905 /* Get coding system name */
1908 /* Characters valid in a MIME charset name (rfc 1521),
1909 and in a Lisp symbol name. */
1910 n = strspn((char *)p, mime_name_valid_chars);
1915 coding_system = Ffind_coding_system(intern((char *)p));
1922 /* #### file must use standard EOLs or we miss 2d line */
1923 /* #### not to mention this is broken for UTF-16 DOS files */
1924 else if (*p == '\n' || *p == '\r') {
1926 /* skip past multibyte (DOS) newline */
1928 && *(p + 1) == '\n')
1933 /* #### file must use standard EOLs or we miss 2d line */
1934 /* #### not to mention this is broken for UTF-16 DOS files */
1935 else if (*p == '\n' || *p == '\r') {
1937 /* skip past multibyte (DOS) newline */
1938 if (*p == '\r' && *(p + 1) == '\n')
1942 if (NILP(coding_system)) {
1944 if (detect_coding_type(&decst, buf, nread,
1945 XCODING_SYSTEM_TYPE(*codesys_in_out)
1946 != CODESYS_AUTODETECT))
1948 nread = Lstream_read(stream, buf, sizeof(buf));
1953 } else if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT
1954 && XCODING_SYSTEM_EOL_TYPE(coding_system) == EOL_AUTODETECT) {
1956 if (detect_coding_type(&decst, buf, nread, 1))
1958 nread = Lstream_read(stream, buf, sizeof(buf));
1964 *eol_type_in_out = decst.eol_type;
1965 if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT) {
1966 if (NILP(coding_system))
1968 coding_system_from_mask(decst.mask);
1970 *codesys_in_out = coding_system;
1974 /* If we absolutely can't determine the EOL type, just assume LF. */
1975 if (*eol_type_in_out == EOL_AUTODETECT)
1976 *eol_type_in_out = EOL_LF;
1978 Lstream_rewind(stream);
1981 DEFUN("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1982 Detect coding system of the text in the region between START and END.
1983 Return a list of possible coding systems ordered by priority.
1984 If only ASCII characters are found, return 'undecided or one of
1985 its subsidiary coding systems according to a detected end-of-line
1986 type. Optional arg BUFFER defaults to the current buffer.
1988 (start, end, buffer))
1990 Lisp_Object val = Qnil;
1991 struct buffer *buf = decode_buffer(buffer, 0);
1993 Lisp_Object instream, lb_instream;
1994 lstream_t istr, lb_istr;
1995 struct detection_state decst;
1996 struct gcpro gcpro1, gcpro2;
1998 get_buffer_range_char(buf, start, end, &b, &e, 0);
1999 lb_instream = make_lisp_buffer_input_stream(buf, b, e, 0);
2000 lb_istr = XLSTREAM(lb_instream);
2002 make_encoding_input_stream(lb_istr, Fget_coding_system(Qbinary));
2003 istr = XLSTREAM(instream);
2004 GCPRO2(instream, lb_instream);
2006 decst.eol_type = EOL_AUTODETECT;
2009 Extbyte random_buffer[4096];
2010 Lstream_data_count nread =
2011 Lstream_read(istr, random_buffer, sizeof(random_buffer));
2015 if (detect_coding_type(&decst, random_buffer, nread, 0))
2019 if (decst.mask == ~0)
2020 val = subsidiary_coding_system(Fget_coding_system(Qundecided),
2027 decst.mask = postprocess_iso2022_mask(decst.mask);
2029 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--) {
2030 int sys = fcd->coding_category_by_priority[i];
2031 if (decst.mask & (1 << sys)) {
2032 Lisp_Object codesys =
2033 fcd->coding_category_system[sys];
2036 subsidiary_coding_system(codesys,
2039 val = Fcons(codesys, val);
2043 Lstream_close(istr);
2045 Lstream_delete(istr);
2046 Lstream_delete(lb_istr);
2050 /************************************************************************/
2051 /* Converting to internal Mule format ("decoding") */
2052 /************************************************************************/
2054 /* A decoding stream is a stream used for decoding text (i.e.
2055 converting from some external format to internal format).
2056 The decoding-stream object keeps track of the actual coding
2057 stream, the stream that is at the other end, and data that
2058 needs to be persistent across the lifetime of the stream. */
2060 /* Handle the EOL stuff related to just-read-in character C.
2061 EOL_TYPE is the EOL type of the coding stream.
2062 FLAGS is the current value of FLAGS in the coding stream, and may
2063 be modified by this macro. (The macro only looks at the
2064 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
2065 bytes are to be written. You need to also define a local goto
2066 label "label_continue_loop" that is at the end of the main
2067 character-reading loop.
2069 If C is a CR character, then this macro handles it entirely and
2070 jumps to label_continue_loop. Otherwise, this macro does not add
2071 anything to DST, and continues normally. You should continue
2072 processing C normally after this macro. */
2074 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2078 if (eol_type == EOL_CR) \
2079 Dynarr_add (dst, '\n'); \
2080 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2081 Dynarr_add (dst, c); \
2083 flags |= CODING_STATE_CR; \
2084 goto label_continue_loop; \
2086 else if (flags & CODING_STATE_CR) \
2087 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2089 Dynarr_add (dst, '\r'); \
2090 flags &= ~CODING_STATE_CR; \
2094 /* C should be a binary character in the range 0 - 255; convert
2095 to internal format and add to Dynarr DST. */
2097 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2099 if (BYTE_ASCII_P (c)) \
2100 Dynarr_add (dst, c); \
2101 else if (BYTE_C1_P (c)) \
2103 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2104 Dynarr_add (dst, c + 0x20); \
2108 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2109 Dynarr_add (dst, c); \
2113 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2117 DECODE_ADD_BINARY_CHAR (ch, dst); \
2122 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2124 if (flags & CODING_STATE_END) \
2126 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2127 if (flags & CODING_STATE_CR) \
2128 Dynarr_add (dst, '\r'); \
2132 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2134 typedef struct decoding_stream_s *decoding_stream_t;
2135 struct decoding_stream_s {
2136 /* Coding system that governs the conversion. */
2137 Lisp_Coding_System *codesys;
2139 /* Stream that we read the encoded data from or
2140 write the decoded data to. */
2141 lstream_t other_end;
2143 /* If we are reading, then we can return only a fixed amount of
2144 data, so if the conversion resulted in too much data, we store it
2145 here for retrieval the next time around. */
2146 unsigned_char_dynarr *runoff;
2148 /* FLAGS holds flags indicating the current state of the decoding.
2149 Some of these flags are dependent on the coding system. */
2152 /* CH holds a partially built-up character. Since we only deal
2153 with one- and two-byte characters at the moment, we only use
2154 this to store the first byte of a two-byte character. */
2157 /* EOL_TYPE specifies the type of end-of-line conversion that
2158 currently applies. We need to keep this separate from the
2159 EOL type stored in CODESYS because the latter might indicate
2160 automatic EOL-type detection while the former will always
2161 indicate a particular EOL type. */
2162 eol_type_t eol_type;
2164 /* Additional ISO2022 information. We define the structure above
2165 because it's also needed by the detection routines. */
2166 struct iso2022_decoder iso2022;
2168 /* Additional information (the state of the running CCL program)
2169 used by the CCL decoder. */
2170 struct ccl_program ccl;
2172 /* counter for UTF-8 or UCS-4 */
2173 unsigned char counter;
2175 struct detection_state decst;
2178 static Lstream_data_count
2179 decoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size);
2180 static Lstream_data_count
2181 decoding_writer(lstream_t stream,
2182 const unsigned char *data, Lstream_data_count size);
2183 static int decoding_rewinder(lstream_t stream);
2184 static int decoding_seekable_p(lstream_t stream);
2185 static int decoding_flusher(lstream_t stream);
2186 static int decoding_closer(lstream_t stream);
2188 static Lisp_Object decoding_marker(Lisp_Object stream);
2190 DEFINE_LSTREAM_IMPLEMENTATION("decoding", lstream_decoding,
2191 sizeof(struct decoding_stream_s));
2194 decoding_marker(Lisp_Object stream)
2196 lstream_t str = DECODING_STREAM_DATA(XLSTREAM(stream))->other_end;
2197 Lisp_Object str_obj;
2199 /* We do not need to mark the coding systems or charsets stored
2200 within the stream because they are stored in a global list
2201 and automatically marked. */
2203 XSETLSTREAM(str_obj, str);
2204 mark_object(str_obj);
2205 if (str->imp->marker) {
2206 return str->imp->marker(str_obj);
2212 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2213 so we read data from the other end, decode it, and store it into DATA. */
2215 static Lstream_data_count
2216 decoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size)
2218 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2219 unsigned char *orig_data = data;
2220 Lstream_data_count read_size;
2221 int error_occurred = 0;
2223 /* We need to interface to mule_decode(), which expects to take some
2224 amount of data and store the result into a Dynarr. We have
2225 mule_decode() store into str->runoff, and take data from there
2228 /* We loop until we have enough data, reading chunks from the other
2229 end and decoding it. */
2231 /* Take data from the runoff if we can. Make sure to take at
2232 most SIZE bytes, and delete the data from the runoff. */
2233 if (Dynarr_length(str->runoff) > 0) {
2234 Lstream_data_count chunk =
2236 (Lstream_data_count)
2237 Dynarr_length(str->runoff));
2238 memcpy(data, Dynarr_atp(str->runoff, 0), chunk);
2239 Dynarr_delete_many(str->runoff, 0, chunk);
2245 /* No more room for data */
2249 if (str->flags & CODING_STATE_END) {
2250 /* This means that on the previous iteration, we hit the
2251 EOF on the other end. We loop once more so that
2252 mule_decode() can output any final stuff it may be
2253 holding, or any "go back to a sane state" escape
2254 sequences. (This latter makes sense during
2259 /* Exhausted the runoff, so get some more. DATA has at least
2260 SIZE bytes left of storage in it, so it's OK to read directly
2261 into it. (We'll be overwriting above, after we've decoded it
2262 into the runoff.) */
2263 read_size = Lstream_read(str->other_end, data, size);
2264 if (read_size < 0) {
2268 if (read_size == 0) {
2269 /* There might be some more end data produced in the
2270 translation. See the comment above. */
2271 str->flags |= CODING_STATE_END;
2273 mule_decode(stream, (Extbyte *) data, str->runoff, read_size);
2276 if (data - orig_data == 0) {
2277 return error_occurred ? -1 : 0;
2279 return data - orig_data;
2283 static Lstream_data_count
2284 decoding_writer(lstream_t stream, const unsigned char *data,
2285 Lstream_data_count size)
2287 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2288 Lstream_data_count retval;
2290 /* Decode all our data into the runoff, and then attempt to write
2291 it all out to the other end. Remove whatever chunk we succeeded
2293 mule_decode(stream, (const Extbyte *)data, str->runoff, size);
2294 retval = Lstream_write(str->other_end, Dynarr_atp(str->runoff, 0),
2295 Dynarr_length(str->runoff));
2297 Dynarr_delete_many(str->runoff, 0, retval);
2299 /* Do NOT return retval. The return value indicates how much
2300 of the incoming data was written, not how many bytes were
2306 reset_decoding_stream(decoding_stream_t str)
2309 if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_ISO2022) {
2310 Lisp_Object coding_system;
2311 XSETCODING_SYSTEM(coding_system, str->codesys);
2312 reset_iso2022(coding_system, &str->iso2022);
2313 } else if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_CCL) {
2314 setup_ccl_program(&str->ccl,
2315 CODING_SYSTEM_CCL_DECODE(str->codesys));
2319 if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT
2320 || CODING_SYSTEM_EOL_TYPE(str->codesys) == EOL_AUTODETECT) {
2322 str->decst.eol_type = EOL_AUTODETECT;
2323 str->decst.mask = ~0;
2325 str->flags = str->ch = 0;
2329 decoding_rewinder(lstream_t stream)
2331 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2332 reset_decoding_stream(str);
2333 Dynarr_reset(str->runoff);
2334 return Lstream_rewind(str->other_end);
2338 decoding_seekable_p(lstream_t stream)
2340 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2341 return Lstream_seekable_p(str->other_end);
2345 decoding_flusher(lstream_t stream)
2347 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2348 return Lstream_flush(str->other_end);
2352 decoding_closer(lstream_t stream)
2354 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2355 if (stream->flags & LSTREAM_FL_WRITE) {
2356 str->flags |= CODING_STATE_END;
2357 decoding_writer(stream, 0, 0);
2359 Dynarr_free(str->runoff);
2361 #ifdef ENABLE_COMPOSITE_CHARS
2362 if (str->iso2022.composite_chars) {
2363 Dynarr_free(str->iso2022.composite_chars);
2367 return Lstream_close(str->other_end);
2371 decoding_stream_coding_system(lstream_t stream)
2373 Lisp_Object coding_system;
2374 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2376 XSETCODING_SYSTEM(coding_system, str->codesys);
2377 return subsidiary_coding_system(coding_system, str->eol_type);
2381 set_decoding_stream_coding_system(lstream_t lstr, Lisp_Object codesys)
2383 Lisp_Coding_System *cs = XCODING_SYSTEM(codesys);
2384 decoding_stream_t str = DECODING_STREAM_DATA(lstr);
2386 if (CODING_SYSTEM_EOL_TYPE(cs) != EOL_AUTODETECT) {
2387 str->eol_type = CODING_SYSTEM_EOL_TYPE(cs);
2389 reset_decoding_stream(str);
2393 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2394 stream for writing, no automatic code detection will be performed.
2395 The reason for this is that automatic code detection requires a
2396 seekable input. Things will also fail if you open a decoding
2397 stream for reading using a non-fully-specified coding system and
2398 a non-seekable input stream. */
2401 make_decoding_stream_1(lstream_t stream, Lisp_Object codesys, const char *mode)
2403 lstream_t lstr = Lstream_new(lstream_decoding, mode);
2404 decoding_stream_t str = DECODING_STREAM_DATA(lstr);
2408 str->other_end = stream;
2409 str->runoff = (unsigned_char_dynarr *) Dynarr_new(unsigned_char);
2410 str->eol_type = EOL_AUTODETECT;
2411 if (!strcmp(mode, "r") && Lstream_seekable_p(stream)) {
2412 /* We can determine the coding system now. */
2413 determine_real_coding_system(stream, &codesys, &str->eol_type);
2415 set_decoding_stream_coding_system(lstr, codesys);
2416 str->decst.eol_type = str->eol_type;
2417 str->decst.mask = ~0;
2418 XSETLSTREAM(obj, lstr);
2423 make_decoding_input_stream(lstream_t stream, Lisp_Object codesys)
2425 return make_decoding_stream_1(stream, codesys, "r");
2429 make_decoding_output_stream(lstream_t stream, Lisp_Object codesys)
2431 return make_decoding_stream_1(stream, codesys, "w");
2434 /* Note: the decode_coding_* functions all take the same
2435 arguments as mule_decode(), which is to say some SRC data of
2436 size N, which is to be stored into dynamic array DST.
2437 DECODING is the stream within which the decoding is
2438 taking place, but no data is actually read from or
2439 written to that stream; that is handled in decoding_reader()
2440 or decoding_writer(). This allows the same functions to
2441 be used for both reading and writing. */
2444 mule_decode(lstream_t decoding, const Extbyte * src,
2445 unsigned_char_dynarr * dst, Lstream_data_count n)
2447 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
2449 /* If necessary, do encoding-detection now. We do this when
2450 we're a writing stream or a non-seekable reading stream,
2451 meaning that we can't just process the whole input,
2452 rewind, and start over. */
2454 if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT ||
2455 str->eol_type == EOL_AUTODETECT) {
2456 Lisp_Object codesys;
2458 XSETCODING_SYSTEM(codesys, str->codesys);
2459 detect_coding_type(&str->decst, src, n,
2460 CODING_SYSTEM_TYPE(str->codesys) !=
2461 CODESYS_AUTODETECT);
2462 if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT &&
2463 str->decst.mask != ~0)
2464 /* #### This is cheesy. What we really ought to do is
2465 buffer up a certain amount of data so as to get a
2466 less random result. */
2467 codesys = coding_system_from_mask(str->decst.mask);
2468 str->eol_type = str->decst.eol_type;
2469 if (XCODING_SYSTEM(codesys) != str->codesys) {
2470 /* Preserve the CODING_STATE_END flag in case it was set.
2471 If we erase it, bad things might happen. */
2472 int was_end = str->flags & CODING_STATE_END;
2473 set_decoding_stream_coding_system(decoding, codesys);
2475 str->flags |= CODING_STATE_END;
2479 switch (CODING_SYSTEM_TYPE(str->codesys)) {
2480 #ifdef DEBUG_SXEMACS
2481 case CODESYS_INTERNAL:
2482 Dynarr_add_many(dst, src, n);
2485 case CODESYS_AUTODETECT:
2486 /* If we got this far and still haven't decided on the coding
2487 system, then do no conversion. */
2488 case CODESYS_NO_CONVERSION:
2489 decode_coding_no_conversion(decoding, src, dst, n);
2492 case CODESYS_SHIFT_JIS:
2493 decode_coding_sjis(decoding, src, dst, n);
2496 decode_coding_big5(decoding, src, dst, n);
2499 decode_coding_ucs4(decoding, src, dst, n);
2502 decode_coding_utf8(decoding, src, dst, n);
2505 str->ccl.last_block = str->flags & CODING_STATE_END;
2506 /* When applying ccl program to stream, MUST NOT set NULL
2508 ccl_driver(&str->ccl,
2510 ? (const unsigned char *)src
2511 : (const unsigned char *)""),
2512 dst, n, 0, CCL_MODE_DECODING);
2514 case CODESYS_ISO2022:
2515 decode_coding_iso2022(decoding, src, dst, n);
2523 DEFUN("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2524 Decode the text between START and END which is encoded in CODING-SYSTEM.
2525 This is useful if you've read in encoded text from a file without decoding
2526 it (e.g. you read in a JIS-formatted file but used the `binary' or
2527 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2528 Return length of decoded text.
2529 BUFFER defaults to the current buffer if unspecified.
2531 (start, end, coding_system, buffer))
2534 struct buffer *buf = decode_buffer(buffer, 0);
2535 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2536 lstream_t istr, ostr;
2537 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2539 get_buffer_range_char(buf, start, end, &b, &e, 0);
2541 barf_if_buffer_read_only(buf, b, e);
2543 coding_system = Fget_coding_system(coding_system);
2544 instream = make_lisp_buffer_input_stream(buf, b, e, 0);
2545 lb_outstream = make_lisp_buffer_output_stream(buf, b, 0);
2546 de_outstream = make_decoding_output_stream(XLSTREAM(lb_outstream),
2548 outstream = make_encoding_output_stream(XLSTREAM(de_outstream),
2549 Fget_coding_system(Qbinary));
2550 istr = XLSTREAM(instream);
2551 ostr = XLSTREAM(outstream);
2552 GCPRO4(instream, lb_outstream, de_outstream, outstream);
2554 /* The chain of streams looks like this:
2556 [BUFFER] <----- send through
2557 ------> [ENCODE AS BINARY]
2558 ------> [DECODE AS SPECIFIED]
2563 char tempbuf[1024]; /* some random amount */
2564 Bufpos newpos, even_newer_pos;
2565 Bufpos oldpos = lisp_buffer_stream_startpos(istr);
2566 Lstream_data_count size_in_bytes =
2567 Lstream_read(istr, tempbuf, sizeof(tempbuf));
2571 newpos = lisp_buffer_stream_startpos(istr);
2572 Lstream_write(ostr, tempbuf, size_in_bytes);
2573 even_newer_pos = lisp_buffer_stream_startpos(istr);
2574 buffer_delete_range(buf, even_newer_pos - (newpos - oldpos),
2577 Lstream_close(istr);
2578 Lstream_close(ostr);
2580 Lstream_delete(istr);
2581 Lstream_delete(ostr);
2582 Lstream_delete(XLSTREAM(de_outstream));
2583 Lstream_delete(XLSTREAM(lb_outstream));
2587 /************************************************************************/
2588 /* Converting to an external encoding ("encoding") */
2589 /************************************************************************/
2591 /* An encoding stream is an output stream. When you create the
2592 stream, you specify the coding system that governs the encoding
2593 and another stream that the resulting encoded data is to be
2594 sent to, and then start sending data to it. */
2596 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2598 typedef struct encoding_stream_s *encoding_stream_t;
2599 struct encoding_stream_s {
2600 /* Coding system that governs the conversion. */
2601 Lisp_Coding_System *codesys;
2603 /* Stream that we read the encoded data from or
2604 write the decoded data to. */
2605 lstream_t other_end;
2607 /* If we are reading, then we can return only a fixed amount of
2608 data, so if the conversion resulted in too much data, we store it
2609 here for retrieval the next time around. */
2610 unsigned_char_dynarr *runoff;
2612 /* FLAGS holds flags indicating the current state of the encoding.
2613 Some of these flags are dependent on the coding system. */
2616 /* CH holds a partially built-up character. Since we only deal
2617 with one- and two-byte characters at the moment, we only use
2618 this to store the first byte of a two-byte character. */
2621 /* Additional information used by the ISO2022 encoder. */
2623 /* CHARSET holds the character sets currently assigned to the G0
2624 through G3 registers. It is initialized from the array
2625 INITIAL_CHARSET in CODESYS. */
2626 Lisp_Object charset[4];
2628 /* Which registers are currently invoked into the left (GL) and
2629 right (GR) halves of the 8-bit encoding space? */
2630 int register_left, register_right;
2632 /* Whether we need to explicitly designate the charset in the
2633 G? register before using it. It is initialized from the
2634 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2635 unsigned char force_charset_on_output[4];
2637 /* Other state variables that need to be preserved across
2639 Lisp_Object current_charset;
2641 int current_char_boundary;
2644 /* Additional information (the state of the running CCL program)
2645 used by the CCL encoder. */
2646 struct ccl_program ccl;
2650 static Lstream_data_count
2651 encoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size);
2652 static Lstream_data_count
2653 encoding_writer(lstream_t stream,
2654 const unsigned char *data, Lstream_data_count size);
2655 static int encoding_rewinder(lstream_t stream);
2656 static int encoding_seekable_p(lstream_t stream);
2657 static int encoding_flusher(lstream_t stream);
2658 static int encoding_closer(lstream_t stream);
2660 static Lisp_Object encoding_marker(Lisp_Object stream);
2662 DEFINE_LSTREAM_IMPLEMENTATION("encoding", lstream_encoding,
2663 sizeof(struct encoding_stream_s));
2666 encoding_marker(Lisp_Object stream)
2668 lstream_t str = ENCODING_STREAM_DATA(XLSTREAM(stream))->other_end;
2669 Lisp_Object str_obj;
2671 /* We do not need to mark the coding systems or charsets stored
2672 within the stream because they are stored in a global list
2673 and automatically marked. */
2675 XSETLSTREAM(str_obj, str);
2676 mark_object(str_obj);
2677 if (str->imp->marker) {
2678 return str->imp->marker(str_obj);
2684 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2685 so we read data from the other end, encode it, and store it into DATA. */
2687 static Lstream_data_count
2688 encoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size)
2690 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2691 unsigned char *orig_data = data;
2692 Lstream_data_count read_size;
2693 int error_occurred = 0;
2695 /* We need to interface to mule_encode(), which expects to take some
2696 amount of data and store the result into a Dynarr. We have
2697 mule_encode() store into str->runoff, and take data from there
2700 /* We loop until we have enough data, reading chunks from the other
2701 end and encoding it. */
2703 /* Take data from the runoff if we can. Make sure to take at
2704 most SIZE bytes, and delete the data from the runoff. */
2705 if (Dynarr_length(str->runoff) > 0) {
2706 int chunk = min((int)size, Dynarr_length(str->runoff));
2707 memcpy(data, Dynarr_atp(str->runoff, 0), chunk);
2708 Dynarr_delete_many(str->runoff, 0, chunk);
2714 /* No more room for data */
2718 if (str->flags & CODING_STATE_END) {
2719 /* This means that on the previous iteration, we hit the
2720 EOF on the other end. We loop once more so that
2721 mule_encode() can output any final stuff it may be
2722 holding, or any "go back to a sane state" escape
2723 sequences. (This latter makes sense during
2728 /* Exhausted the runoff, so get some more. DATA at least SIZE
2729 bytes left of storage in it, so it's OK to read directly into
2730 it. (We'll be overwriting above, after we've encoded it into
2732 read_size = Lstream_read(str->other_end, data, size);
2733 if (read_size < 0) {
2737 if (read_size == 0) {
2738 /* There might be some more end data produced in the
2739 translation. See the comment above. */
2740 str->flags |= CODING_STATE_END;
2742 mule_encode(stream, data, str->runoff, read_size);
2745 if (data == orig_data) {
2746 return error_occurred ? -1 : 0;
2748 return data - orig_data;
2752 static Lstream_data_count
2753 encoding_writer(lstream_t stream, const unsigned char *data,
2754 Lstream_data_count size)
2756 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2757 Lstream_data_count retval;
2759 /* Encode all our data into the runoff, and then attempt to write
2760 it all out to the other end. Remove whatever chunk we succeeded
2762 mule_encode(stream, data, str->runoff, size);
2763 retval = Lstream_write(str->other_end, Dynarr_atp(str->runoff, 0),
2764 Dynarr_length(str->runoff));
2766 Dynarr_delete_many(str->runoff, 0, retval);
2768 /* Do NOT return retval. The return value indicates how much
2769 of the incoming data was written, not how many bytes were
2775 reset_encoding_stream(encoding_stream_t str)
2778 switch (CODING_SYSTEM_TYPE(str->codesys)) {
2779 case CODESYS_ISO2022: {
2782 for (i = 0; i < 4; i++) {
2783 str->iso2022.charset[i] =
2784 CODING_SYSTEM_ISO2022_INITIAL_CHARSET(
2786 str->iso2022.force_charset_on_output[i] =
2787 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT(
2790 str->iso2022.register_left = 0;
2791 str->iso2022.register_right = 1;
2792 str->iso2022.current_charset = Qnil;
2793 str->iso2022.current_half = 0;
2794 str->iso2022.current_char_boundary = 1;
2798 setup_ccl_program(&str->ccl,
2799 CODING_SYSTEM_CCL_ENCODE(str->codesys));
2802 /* list the rest of them lot explicitly */
2803 case CODESYS_AUTODETECT:
2804 case CODESYS_SHIFT_JIS:
2808 case CODESYS_NO_CONVERSION:
2809 #ifdef DEBUG_SXEMACS
2810 case CODESYS_INTERNAL:
2817 str->flags = str->ch = 0;
2821 encoding_rewinder(lstream_t stream)
2823 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2824 reset_encoding_stream(str);
2825 Dynarr_reset(str->runoff);
2826 return Lstream_rewind(str->other_end);
2830 encoding_seekable_p(lstream_t stream)
2832 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2833 return Lstream_seekable_p(str->other_end);
2837 encoding_flusher(lstream_t stream)
2839 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2840 return Lstream_flush(str->other_end);
2844 encoding_closer(lstream_t stream)
2846 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2847 if (stream->flags & LSTREAM_FL_WRITE) {
2848 str->flags |= CODING_STATE_END;
2849 encoding_writer(stream, 0, 0);
2851 Dynarr_free(str->runoff);
2852 return Lstream_close(str->other_end);
2856 encoding_stream_coding_system(lstream_t stream)
2858 Lisp_Object coding_system;
2859 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2861 XSETCODING_SYSTEM(coding_system, str->codesys);
2862 return coding_system;
2866 set_encoding_stream_coding_system(lstream_t lstr, Lisp_Object codesys)
2868 Lisp_Coding_System *cs = XCODING_SYSTEM(codesys);
2869 encoding_stream_t str = ENCODING_STREAM_DATA(lstr);
2871 reset_encoding_stream(str);
2875 make_encoding_stream_1(lstream_t stream, Lisp_Object codesys, const char *mode)
2877 lstream_t lstr = Lstream_new(lstream_encoding, mode);
2878 encoding_stream_t str = ENCODING_STREAM_DATA(lstr);
2882 str->runoff = Dynarr_new(unsigned_char);
2883 str->other_end = stream;
2884 set_encoding_stream_coding_system(lstr, codesys);
2885 XSETLSTREAM(obj, lstr);
2890 make_encoding_input_stream(lstream_t stream, Lisp_Object codesys)
2892 return make_encoding_stream_1(stream, codesys, "r");
2896 make_encoding_output_stream(lstream_t stream, Lisp_Object codesys)
2898 return make_encoding_stream_1(stream, codesys, "w");
2901 /* Convert N bytes of internally-formatted data stored in SRC to an
2902 external format, according to the encoding stream ENCODING.
2903 Store the encoded data into DST. */
2906 mule_encode(lstream_t encoding, const Bufbyte * src,
2907 unsigned_char_dynarr * dst, Lstream_data_count n)
2909 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
2911 switch (CODING_SYSTEM_TYPE(str->codesys)) {
2912 #ifdef DEBUG_SXEMACS
2913 case CODESYS_INTERNAL:
2914 Dynarr_add_many(dst, src, n);
2917 case CODESYS_AUTODETECT:
2918 /* If we got this far and still haven't decided on the coding
2919 system, then do no conversion. */
2920 case CODESYS_NO_CONVERSION:
2921 encode_coding_no_conversion(encoding, src, dst, n);
2924 case CODESYS_SHIFT_JIS:
2925 encode_coding_sjis(encoding, src, dst, n);
2928 encode_coding_big5(encoding, src, dst, n);
2931 encode_coding_ucs4(encoding, src, dst, n);
2934 encode_coding_utf8(encoding, src, dst, n);
2937 str->ccl.last_block = str->flags & CODING_STATE_END;
2938 /* When applying ccl program to stream, MUST NOT set NULL
2940 ccl_driver(&str->ccl, ((src) ? src : (unsigned char *)""),
2941 dst, n, 0, CCL_MODE_ENCODING);
2943 case CODESYS_ISO2022:
2944 encode_coding_iso2022(encoding, src, dst, n);
2952 DEFUN("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2953 Encode the text between START and END using CODING-SYSTEM.
2954 This will, for example, convert Japanese characters into stuff such as
2955 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2956 text. BUFFER defaults to the current buffer if unspecified.
2958 (start, end, coding_system, buffer))
2961 struct buffer *buf = decode_buffer(buffer, 0);
2962 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2963 lstream_t istr, ostr;
2964 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2966 get_buffer_range_char(buf, start, end, &b, &e, 0);
2968 barf_if_buffer_read_only(buf, b, e);
2970 coding_system = Fget_coding_system(coding_system);
2971 instream = make_lisp_buffer_input_stream(buf, b, e, 0);
2972 lb_outstream = make_lisp_buffer_output_stream(buf, b, 0);
2973 de_outstream = make_decoding_output_stream(XLSTREAM(lb_outstream),
2974 Fget_coding_system(Qbinary));
2975 outstream = make_encoding_output_stream(XLSTREAM(de_outstream),
2977 istr = XLSTREAM(instream);
2978 ostr = XLSTREAM(outstream);
2979 GCPRO4(instream, outstream, de_outstream, lb_outstream);
2980 /* The chain of streams looks like this:
2982 [BUFFER] <----- send through
2983 ------> [ENCODE AS SPECIFIED]
2984 ------> [DECODE AS BINARY]
2988 char tempbuf[1024]; /* some random amount */
2989 Bufpos newpos, even_newer_pos;
2990 Bufpos oldpos = lisp_buffer_stream_startpos(istr);
2991 Lstream_data_count size_in_bytes =
2992 Lstream_read(istr, tempbuf, sizeof(tempbuf));
2996 newpos = lisp_buffer_stream_startpos(istr);
2997 Lstream_write(ostr, tempbuf, size_in_bytes);
2998 even_newer_pos = lisp_buffer_stream_startpos(istr);
2999 buffer_delete_range(buf, even_newer_pos - (newpos - oldpos),
3005 lisp_buffer_stream_startpos(XLSTREAM(instream)) - b;
3006 Lstream_close(istr);
3007 Lstream_close(ostr);
3009 Lstream_delete(istr);
3010 Lstream_delete(ostr);
3011 Lstream_delete(XLSTREAM(de_outstream));
3012 Lstream_delete(XLSTREAM(lb_outstream));
3013 return make_int(retlen);
3019 /************************************************************************/
3020 /* Shift-JIS methods */
3021 /************************************************************************/
3023 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3024 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3025 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3026 encoded by "position-code + 0x80". A character of JISX0208
3027 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3028 position-codes are divided and shifted so that it fit in the range
3031 --- CODE RANGE of Shift-JIS ---
3032 (character set) (range)
3034 JISX0201-Kana 0xA0 .. 0xDF
3035 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
3036 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3037 -------------------------------
3041 /* Is this the first byte of a Shift-JIS two-byte char? */
3043 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3044 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3046 /* Is this the second byte of a Shift-JIS two-byte char? */
3048 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3049 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3051 #define BYTE_SJIS_KATAKANA_P(c) \
3052 ((c) >= 0xA1 && (c) <= 0xDF)
3055 detect_coding_sjis(struct detection_state *st, const Extbyte * src,
3056 Lstream_data_count n)
3059 const unsigned char c = *(const unsigned char *)src++;
3060 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3062 if (st->shift_jis.in_second_byte) {
3063 st->shift_jis.in_second_byte = 0;
3066 } else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3067 st->shift_jis.in_second_byte = 1;
3069 return CODING_CATEGORY_SHIFT_JIS_MASK;
3072 /* Convert Shift-JIS data to internal format. */
3075 decode_coding_sjis(lstream_t decoding, const Extbyte * src,
3076 unsigned_char_dynarr * dst, Lstream_data_count n)
3078 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3079 unsigned int flags = str->flags;
3080 unsigned int ch = str->ch;
3081 eol_type_t eol_type = str->eol_type;
3084 const unsigned char c = *(const unsigned char *)src++;
3087 /* Previous character was first byte of Shift-JIS Kanji
3089 if (BYTE_SJIS_TWO_BYTE_2_P(c)) {
3090 unsigned char e1, e2;
3092 Dynarr_add(dst, LEADING_BYTE_JAPANESE_JISX0208);
3093 DECODE_SJIS(ch, c, e1, e2);
3094 Dynarr_add(dst, e1);
3095 Dynarr_add(dst, e2);
3097 DECODE_ADD_BINARY_CHAR(ch, dst);
3098 DECODE_ADD_BINARY_CHAR(c, dst);
3102 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3103 if (BYTE_SJIS_TWO_BYTE_1_P(c))
3105 else if (BYTE_SJIS_KATAKANA_P(c)) {
3106 Dynarr_add(dst, LEADING_BYTE_KATAKANA_JISX0201);
3109 DECODE_ADD_BINARY_CHAR(c, dst);
3111 label_continue_loop:;
3114 DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
3120 /* Convert internally-formatted data to Shift-JIS. */
3123 encode_coding_sjis(lstream_t encoding, const Bufbyte * src,
3124 unsigned_char_dynarr * dst, Lstream_data_count n)
3126 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3127 unsigned int flags = str->flags;
3128 unsigned int ch = str->ch;
3129 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3134 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3135 Dynarr_add(dst, '\r');
3136 if (eol_type != EOL_CR)
3137 Dynarr_add(dst, '\n');
3139 } else if (BYTE_ASCII_P(c)) {
3142 } else if (BUFBYTE_LEADING_BYTE_P(c))
3143 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
3144 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3145 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
3147 if (ch == LEADING_BYTE_KATAKANA_JISX0201) {
3150 } else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3151 ch == LEADING_BYTE_JAPANESE_JISX0208)
3154 /* j1 is bessel j1 function,
3155 * so we use something else */
3156 /* unsigned char j1, j2; */
3157 unsigned char tt1, tt2;
3159 ENCODE_SJIS(ch, c, tt1, tt2);
3160 Dynarr_add(dst, tt1);
3161 Dynarr_add(dst, tt2);
3171 DEFUN("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3172 Decode a JISX0208 character of Shift-JIS coding-system.
3173 CODE is the character code in Shift-JIS as a cons of type bytes.
3174 Return the corresponding character.
3178 unsigned char c1, c2, s1, s2;
3181 CHECK_INT(XCAR(code));
3182 CHECK_INT(XCDR(code));
3183 s1 = XINT(XCAR(code));
3184 s2 = XINT(XCDR(code));
3185 if (BYTE_SJIS_TWO_BYTE_1_P(s1) && BYTE_SJIS_TWO_BYTE_2_P(s2)) {
3186 DECODE_SJIS(s1, s2, c1, c2);
3187 return make_char(MAKE_CHAR(Vcharset_japanese_jisx0208,
3188 c1 & 0x7F, c2 & 0x7F));
3193 DEFUN("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3194 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3195 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3199 Lisp_Object charset;
3202 CHECK_CHAR_COERCE_INT(character);
3203 BREAKUP_CHAR(XCHAR(character), charset, c1, c2);
3204 if (EQ(charset, Vcharset_japanese_jisx0208)) {
3205 ENCODE_SJIS(c1 | 0x80, c2 | 0x80, s1, s2);
3206 return Fcons(make_int(s1), make_int(s2));
3211 /************************************************************************/
3213 /************************************************************************/
3215 /* BIG5 is a coding system encoding two character sets: ASCII and
3216 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3217 character set and is encoded in two-byte.
3219 --- CODE RANGE of BIG5 ---
3220 (character set) (range)
3222 Big5 (1st byte) 0xA1 .. 0xFE
3223 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3224 --------------------------
3226 Since the number of characters in Big5 is larger than maximum
3227 characters in Emacs' charset (96x96), it can't be handled as one
3228 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3229 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3230 contains frequently used characters and the latter contains less
3231 frequently used characters. */
3233 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3234 ((c) >= 0xA1 && (c) <= 0xFE)
3236 /* Is this the second byte of a Shift-JIS two-byte char? */
3238 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3239 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3241 /* Number of Big5 characters which have the same code in 1st byte. */
3243 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3245 /* Code conversion macros. These are macros because they are used in
3246 inner loops during code conversion.
3248 Note that temporary variables in macros introduce the classic
3249 dynamic-scoping problems with variable names. We use capital-
3250 lettered variables in the assumption that SXEmacs does not use
3251 capital letters in variables except in a very formalized way
3254 /* Convert Big5 code (b1, b2) into its internal string representation
3257 /* There is a much simpler way to split the Big5 charset into two.
3258 For the moment I'm going to leave the algorithm as-is because it
3259 claims to separate out the most-used characters into a single
3260 charset, which perhaps will lead to optimizations in various
3263 The way the algorithm works is something like this:
3265 Big5 can be viewed as a 94x157 charset, where the row is
3266 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3267 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3268 the split between low and high column numbers is apparently
3269 meaningless; ascending rows produce less and less frequent chars.
3270 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3271 the first charset, and the upper half (0xC9 .. 0xFE) to the
3272 second. To do the conversion, we convert the character into
3273 a single number where 0 .. 156 is the first row, 157 .. 313
3274 is the second, etc. That way, the characters are ordered by
3275 decreasing frequency. Then we just chop the space in two
3276 and coerce the result into a 94x94 space.
3279 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3281 int B1 = b1, B2 = b2; \
3283 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3287 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3291 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3292 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3294 c1 = I / (0xFF - 0xA1) + 0xA1; \
3295 c2 = I % (0xFF - 0xA1) + 0xA1; \
3298 /* Convert the internal string representation of a Big5 character
3299 (lb, c1, c2) into Big5 code (b1, b2). */
3301 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3303 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3305 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3307 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3309 b1 = I / BIG5_SAME_ROW + 0xA1; \
3310 b2 = I % BIG5_SAME_ROW; \
3311 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3315 detect_coding_big5(struct detection_state *st, const Extbyte * src,
3316 Lstream_data_count n)
3319 const unsigned char c = *(const unsigned char *)src++;
3320 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3321 (c >= 0x80 && c <= 0xA0))
3323 if (st->big5.in_second_byte) {
3324 st->big5.in_second_byte = 0;
3325 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3327 } else if (c >= 0xA1)
3328 st->big5.in_second_byte = 1;
3330 return CODING_CATEGORY_BIG5_MASK;
3333 /* Convert Big5 data to internal format. */
3336 decode_coding_big5(lstream_t decoding, const Extbyte * src,
3337 unsigned_char_dynarr * dst, Lstream_data_count n)
3339 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3340 unsigned int flags = str->flags;
3341 unsigned int ch = str->ch;
3342 eol_type_t eol_type = str->eol_type;
3345 const unsigned char c = *(const unsigned char *)src++;
3347 /* Previous character was first byte of Big5 char. */
3348 if (BYTE_BIG5_TWO_BYTE_2_P(c)) {
3349 unsigned char b1, b2, b3;
3350 DECODE_BIG5(ch, c, b1, b2, b3);
3351 Dynarr_add(dst, b1);
3352 Dynarr_add(dst, b2);
3353 Dynarr_add(dst, b3);
3355 DECODE_ADD_BINARY_CHAR(ch, dst);
3356 DECODE_ADD_BINARY_CHAR(c, dst);
3360 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3361 if (BYTE_BIG5_TWO_BYTE_1_P(c))
3364 DECODE_ADD_BINARY_CHAR(c, dst);
3366 label_continue_loop:;
3369 DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
3375 /* Convert internally-formatted data to Big5. */
3378 encode_coding_big5(lstream_t encoding, const Bufbyte * src,
3379 unsigned_char_dynarr * dst, Lstream_data_count n)
3382 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3383 unsigned int flags = str->flags;
3384 unsigned int ch = str->ch;
3385 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3390 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3391 Dynarr_add(dst, '\r');
3392 if (eol_type != EOL_CR)
3393 Dynarr_add(dst, '\n');
3394 } else if (BYTE_ASCII_P(c)) {
3397 } else if (BUFBYTE_LEADING_BYTE_P(c)) {
3398 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3399 c == LEADING_BYTE_CHINESE_BIG5_2) {
3400 /* A recognized leading byte. */
3402 continue; /* not done with this character. */
3404 /* otherwise just ignore this character. */
3405 } else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3406 ch == LEADING_BYTE_CHINESE_BIG5_2) {
3407 /* Previous char was a recognized leading byte. */
3409 continue; /* not done with this character. */
3411 /* Encountering second byte of a Big5 character. */
3412 unsigned char b1, b2;
3414 ENCODE_BIG5(ch >> 8, ch & 0xFF, c, b1, b2);
3415 Dynarr_add(dst, b1);
3416 Dynarr_add(dst, b2);
3426 DEFUN("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3427 Decode a Big5 character CODE of BIG5 coding-system.
3428 CODE is the character code in BIG5, a cons of two integers.
3429 Return the corresponding character.
3433 unsigned char c1, c2, b1, b2;
3436 CHECK_INT(XCAR(code));
3437 CHECK_INT(XCDR(code));
3438 b1 = XINT(XCAR(code));
3439 b2 = XINT(XCDR(code));
3440 if (BYTE_BIG5_TWO_BYTE_1_P(b1) && BYTE_BIG5_TWO_BYTE_2_P(b2)) {
3442 Lisp_Object charset;
3443 DECODE_BIG5(b1, b2, leading_byte, c1, c2);
3444 charset = CHARSET_BY_LEADING_BYTE(leading_byte);
3445 return make_char(MAKE_CHAR(charset, c1 & 0x7F, c2 & 0x7F));
3450 DEFUN("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3451 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3452 Return the corresponding character code in Big5.
3456 Lisp_Object charset;
3459 CHECK_CHAR_COERCE_INT(character);
3460 BREAKUP_CHAR(XCHAR(character), charset, c1, c2);
3461 if (EQ(charset, Vcharset_chinese_big5_1) ||
3462 EQ(charset, Vcharset_chinese_big5_2)) {
3463 ENCODE_BIG5(XCHARSET_LEADING_BYTE(charset), c1 | 0x80,
3465 return Fcons(make_int(b1), make_int(b2));
3470 /************************************************************************/
3473 /* UCS-4 character codes are implemented as nonnegative integers. */
3475 /************************************************************************/
3477 DEFUN("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3478 Map UCS-4 code CODE to Mule character CHARACTER.
3480 Return T on success, NIL on failure.
3486 CHECK_CHAR(character);
3490 if (c < countof(fcd->ucs_to_mule_table)) {
3491 fcd->ucs_to_mule_table[c] = character;
3497 static Lisp_Object ucs_to_char(unsigned long code)
3499 if (code < countof(fcd->ucs_to_mule_table)) {
3500 return fcd->ucs_to_mule_table[code];
3501 } else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14)) {
3505 c = code % (94 * 94);
3507 (MAKE_CHAR(CHARSET_BY_ATTRIBUTES
3508 (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3509 CHARSET_LEFT_TO_RIGHT),
3510 c / 94 + 33, c % 94 + 33));
3515 DEFUN("ucs-char", Fucs_char, 1, 1, 0, /*
3516 Return Mule character corresponding to UCS code CODE (a positive integer).
3521 return ucs_to_char(XINT(code));
3524 DEFUN("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3525 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3529 /* #### Isn't this gilding the lily? Fput_char_table checks its args.
3530 Fset_char_ucs is more restrictive on index arg, but should
3531 check code arg in a char_table method. */
3532 CHECK_CHAR(character);
3534 return Fput_char_table(character, code, mule_to_ucs_table);
3537 DEFUN("char-ucs", Fchar_ucs, 1, 1, 0, /*
3538 Return the UCS code (a positive integer) corresponding to CHARACTER.
3542 return Fget_char_table(character, mule_to_ucs_table);
3545 /* Decode a UCS-4 character into a buffer. If the lookup fails, use
3546 <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3547 is not found, instead.
3548 #### do something more appropriate (use blob?)
3549 Danger, Will Robinson! Data loss. Should we signal user? */
3550 static void decode_ucs4(unsigned long ch, unsigned_char_dynarr * dst)
3552 Lisp_Object chr = ucs_to_char(ch);
3555 Bufbyte work[MAX_EMCHAR_LEN];
3560 simple_set_charptr_emchar(work, ch) :
3561 non_ascii_set_charptr_emchar(work, ch);
3562 Dynarr_add_many(dst, work, len);
3564 Dynarr_add(dst, LEADING_BYTE_JAPANESE_JISX0208);
3565 Dynarr_add(dst, 34 + 128);
3566 Dynarr_add(dst, 46 + 128);
3570 static unsigned long
3571 mule_char_to_ucs4(Lisp_Object charset, unsigned char h, unsigned char l)
3574 = Fget_char_table(make_char(MAKE_CHAR(charset, h & 127, l & 127)),
3579 } else if ((XCHARSET_DIMENSION(charset) == 2) &&
3580 (XCHARSET_CHARS(charset) == 94)) {
3581 unsigned char final = XCHARSET_FINAL(charset);
3583 if (('@' <= final) && (final < 0x7f)) {
3584 return 0xe00000 + (final - '@') * 94 * 94
3585 + ((h & 127) - 33) * 94 + (l & 127) - 33;
3595 encode_ucs4(Lisp_Object charset,
3596 unsigned char h, unsigned char l, unsigned_char_dynarr * dst)
3598 unsigned long code = mule_char_to_ucs4(charset, h, l);
3599 Dynarr_add(dst, code >> 24);
3600 Dynarr_add(dst, (code >> 16) & 255);
3601 Dynarr_add(dst, (code >> 8) & 255);
3602 Dynarr_add(dst, code & 255);
3606 detect_coding_ucs4(struct detection_state *st, const Extbyte * src,
3607 Lstream_data_count n)
3610 const unsigned char c = *(const unsigned char *)src++;
3611 switch (st->ucs4.in_byte) {
3619 st->ucs4.in_byte = 0;
3625 return CODING_CATEGORY_UCS4_MASK;
3629 decode_coding_ucs4(lstream_t decoding, const Extbyte * src,
3630 unsigned_char_dynarr * dst, Lstream_data_count n)
3632 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3633 unsigned int flags = str->flags;
3634 unsigned int ch = str->ch;
3635 unsigned char counter = str->counter;
3638 const unsigned char c = *(const unsigned char *)src++;
3645 decode_ucs4((ch << 8) | c, dst);
3654 if (counter & CODING_STATE_END)
3655 DECODE_OUTPUT_PARTIAL_CHAR(ch);
3659 str->counter = counter;
3663 encode_coding_ucs4(lstream_t encoding, const Bufbyte * src,
3664 unsigned_char_dynarr * dst, Lstream_data_count n)
3666 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3667 unsigned int flags = str->flags;
3668 unsigned int ch = str->ch;
3669 unsigned char char_boundary = str->iso2022.current_char_boundary;
3670 Lisp_Object charset = str->iso2022.current_charset;
3672 #ifdef ENABLE_COMPOSITE_CHARS
3673 /* flags for handling composite chars. We do a little switcharoo
3674 on the source while we're outputting the composite char. */
3675 unsigned int saved_n = 0;
3676 const unsigned char *saved_src = NULL;
3677 int in_composite = 0;
3683 unsigned char c = *src++;
3685 if (BYTE_ASCII_P(c)) { /* Processing ASCII character */
3687 encode_ucs4(Vcharset_ascii, c, 0, dst);
3689 } else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) { /* Processing Leading Byte */
3691 charset = CHARSET_BY_LEADING_BYTE(c);
3692 if (LEADING_BYTE_PREFIX_P(c))
3695 } else { /* Processing Non-ASCII character */
3697 if (EQ(charset, Vcharset_control_1)) {
3698 encode_ucs4(Vcharset_control_1, c, 0, dst);
3700 switch (XCHARSET_REP_BYTES(charset)) {
3702 encode_ucs4(charset, c, 0, dst);
3705 if (XCHARSET_PRIVATE_P(charset)) {
3706 encode_ucs4(charset, c, 0, dst);
3709 #ifdef ENABLE_COMPOSITE_CHARS
3712 Vcharset_composite)) {
3714 /* #### Bother! We don't know how to
3727 (Vcharset_composite,
3732 composite_char_string
3741 n = XSTRING_LENGTH(lstr);
3744 #endif /* ENABLE_COMPOSITE_CHARS */
3746 encode_ucs4(charset, ch,
3757 encode_ucs4(charset, ch, c,
3772 #ifdef ENABLE_COMPOSITE_CHARS
3777 goto back_to_square_n; /* Wheeeeeeeee ..... */
3779 #endif /* ENABLE_COMPOSITE_CHARS */
3783 str->iso2022.current_char_boundary = char_boundary;
3784 str->iso2022.current_charset = charset;
3786 /* Verbum caro factum est! */
3789 /************************************************************************/
3791 /************************************************************************/
3794 detect_coding_utf8(struct detection_state *st, const Extbyte * src,
3795 Lstream_data_count n)
3798 const unsigned char c = *(const unsigned char *)src++;
3799 switch (st->utf8.in_byte) {
3801 if (c == ISO_CODE_ESC || c == ISO_CODE_SI
3802 || c == ISO_CODE_SO)
3805 st->utf8.in_byte = 5;
3807 st->utf8.in_byte = 4;
3809 st->utf8.in_byte = 3;
3811 st->utf8.in_byte = 2;
3813 st->utf8.in_byte = 1;
3818 if ((c & 0xc0) != 0x80)
3824 return CODING_CATEGORY_UTF8_MASK;
3828 decode_coding_utf8(lstream_t decoding, const Extbyte * src,
3829 unsigned_char_dynarr * dst, Lstream_data_count n)
3831 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3832 unsigned int flags = str->flags;
3833 unsigned int ch = str->ch;
3834 eol_type_t eol_type = str->eol_type;
3835 unsigned char counter = str->counter;
3838 const unsigned char c = *(const unsigned char *)src++;
3844 } else if (c >= 0xf8) {
3847 } else if (c >= 0xf0) {
3850 } else if (c >= 0xe0) {
3853 } else if (c >= 0xc0) {
3857 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3858 decode_ucs4(c, dst);
3862 ch = (ch << 6) | (c & 0x3f);
3863 decode_ucs4(ch, dst);
3868 ch = (ch << 6) | (c & 0x3f);
3871 label_continue_loop:;
3874 if (flags & CODING_STATE_END)
3875 DECODE_OUTPUT_PARTIAL_CHAR(ch);
3879 str->counter = counter;
3883 encode_utf8(Lisp_Object charset,
3884 unsigned char h, unsigned char l, unsigned_char_dynarr * dst)
3886 unsigned long code = mule_char_to_ucs4(charset, h, l);
3888 Dynarr_add(dst, code);
3889 } else if (code <= 0x7ff) {
3890 Dynarr_add(dst, (code >> 6) | 0xc0);
3891 Dynarr_add(dst, (code & 0x3f) | 0x80);
3892 } else if (code <= 0xffff) {
3893 Dynarr_add(dst, (code >> 12) | 0xe0);
3894 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3895 Dynarr_add(dst, (code & 0x3f) | 0x80);
3896 } else if (code <= 0x1fffff) {
3897 Dynarr_add(dst, (code >> 18) | 0xf0);
3898 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3899 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3900 Dynarr_add(dst, (code & 0x3f) | 0x80);
3901 } else if (code <= 0x3ffffff) {
3902 Dynarr_add(dst, (code >> 24) | 0xf8);
3903 Dynarr_add(dst, ((code >> 18) & 0x3f) | 0x80);
3904 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3905 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3906 Dynarr_add(dst, (code & 0x3f) | 0x80);
3908 Dynarr_add(dst, (code >> 30) | 0xfc);
3909 Dynarr_add(dst, ((code >> 24) & 0x3f) | 0x80);
3910 Dynarr_add(dst, ((code >> 18) & 0x3f) | 0x80);
3911 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3912 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3913 Dynarr_add(dst, (code & 0x3f) | 0x80);
3918 encode_coding_utf8(lstream_t encoding, const Bufbyte * src,
3919 unsigned_char_dynarr * dst, Lstream_data_count n)
3921 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3922 unsigned int flags = str->flags;
3923 unsigned int ch = str->ch;
3924 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3925 unsigned char char_boundary = str->iso2022.current_char_boundary;
3926 Lisp_Object charset = str->iso2022.current_charset;
3928 #ifdef ENABLE_COMPOSITE_CHARS
3929 /* flags for handling composite chars. We do a little switcharoo
3930 on the source while we're outputting the composite char. */
3931 unsigned int saved_n = 0;
3932 const unsigned char *saved_src = NULL;
3933 int in_composite = 0;
3936 #endif /* ENABLE_COMPOSITE_CHARS */
3939 unsigned char c = *src++;
3941 if (BYTE_ASCII_P(c)) { /* Processing ASCII character */
3944 if (eol_type != EOL_LF
3945 && eol_type != EOL_AUTODETECT)
3946 Dynarr_add(dst, '\r');
3947 if (eol_type != EOL_CR)
3950 encode_utf8(Vcharset_ascii, c, 0, dst);
3952 } else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) { /* Processing Leading Byte */
3954 charset = CHARSET_BY_LEADING_BYTE(c);
3955 if (LEADING_BYTE_PREFIX_P(c))
3958 } else { /* Processing Non-ASCII character */
3960 if (EQ(charset, Vcharset_control_1)) {
3961 encode_utf8(Vcharset_control_1, c, 0, dst);
3963 switch (XCHARSET_REP_BYTES(charset)) {
3965 encode_utf8(charset, c, 0, dst);
3968 if (XCHARSET_PRIVATE_P(charset)) {
3969 encode_utf8(charset, c, 0, dst);
3972 #ifdef ENABLE_COMPOSITE_CHARS
3975 Vcharset_composite)) {
3977 /* #### Bother! We don't know how to
3986 (Vcharset_composite,
3991 composite_char_string
4000 n = XSTRING_LENGTH(lstr);
4003 #endif /* ENABLE_COMPOSITE_CHARS */
4005 encode_utf8(charset, ch,
4016 encode_utf8(charset, ch, c,
4031 #ifdef ENABLE_COMPOSITE_CHARS
4036 goto back_to_square_n; /* Wheeeeeeeee ..... */
4042 str->iso2022.current_char_boundary = char_boundary;
4043 str->iso2022.current_charset = charset;
4045 /* Verbum caro factum est! */
4048 /************************************************************************/
4049 /* ISO2022 methods */
4050 /************************************************************************/
4052 /* The following note describes the coding system ISO2022 briefly.
4053 Since the intention of this note is to help understand the
4054 functions in this file, some parts are NOT ACCURATE or OVERLY
4055 SIMPLIFIED. For thorough understanding, please refer to the
4056 original document of ISO2022.
4058 ISO2022 provides many mechanisms to encode several character sets
4059 in 7-bit and 8-bit environments. For 7-bit environments, all text
4060 is encoded using bytes less than 128. This may make the encoded
4061 text a little bit longer, but the text passes more easily through
4062 several gateways, some of which strip off MSB (Most Signigant Bit).
4064 There are two kinds of character sets: control character set and
4065 graphic character set. The former contains control characters such
4066 as `newline' and `escape' to provide control functions (control
4067 functions are also provided by escape sequences). The latter
4068 contains graphic characters such as 'A' and '-'. Emacs recognizes
4069 two control character sets and many graphic character sets.
4071 Graphic character sets are classified into one of the following
4072 four classes, according to the number of bytes (DIMENSION) and
4073 number of characters in one dimension (CHARS) of the set:
4074 - DIMENSION1_CHARS94
4075 - DIMENSION1_CHARS96
4076 - DIMENSION2_CHARS94
4077 - DIMENSION2_CHARS96
4079 In addition, each character set is assigned an identification tag,
4080 unique for each set, called "final character" (denoted as <F>
4081 hereafter). The <F> of each character set is decided by ECMA(*)
4082 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4083 (0x30..0x3F are for private use only).
4085 Note (*): ECMA = European Computer Manufacturers Association
4087 Here are examples of graphic character set [NAME(<F>)]:
4088 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4089 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4090 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4091 o DIMENSION2_CHARS96 -- none for the moment
4093 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4094 C0 [0x00..0x1F] -- control character plane 0
4095 GL [0x20..0x7F] -- graphic character plane 0
4096 C1 [0x80..0x9F] -- control character plane 1
4097 GR [0xA0..0xFF] -- graphic character plane 1
4099 A control character set is directly designated and invoked to C0 or
4100 C1 by an escape sequence. The most common case is that:
4101 - ISO646's control character set is designated/invoked to C0, and
4102 - ISO6429's control character set is designated/invoked to C1,
4103 and usually these designations/invocations are omitted in encoded
4104 text. In a 7-bit environment, only C0 can be used, and a control
4105 character for C1 is encoded by an appropriate escape sequence to
4106 fit into the environment. All control characters for C1 are
4107 defined to have corresponding escape sequences.
4109 A graphic character set is at first designated to one of four
4110 graphic registers (G0 through G3), then these graphic registers are
4111 invoked to GL or GR. These designations and invocations can be
4112 done independently. The most common case is that G0 is invoked to
4113 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4114 these invocations and designations are omitted in encoded text.
4115 In a 7-bit environment, only GL can be used.
4117 When a graphic character set of CHARS94 is invoked to GL, codes
4118 0x20 and 0x7F of the GL area work as control characters SPACE and
4119 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4122 There are two ways of invocation: locking-shift and single-shift.
4123 With locking-shift, the invocation lasts until the next different
4124 invocation, whereas with single-shift, the invocation affects the
4125 following character only and doesn't affect the locking-shift
4126 state. Invocations are done by the following control characters or
4129 ----------------------------------------------------------------------
4130 abbrev function cntrl escape seq description
4131 ----------------------------------------------------------------------
4132 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4133 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4134 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4135 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4136 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4137 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4138 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4139 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4140 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4141 ----------------------------------------------------------------------
4142 (*) These are not used by any known coding system.
4144 Control characters for these functions are defined by macros
4145 ISO_CODE_XXX in `coding.h'.
4147 Designations are done by the following escape sequences:
4148 ----------------------------------------------------------------------
4149 escape sequence description
4150 ----------------------------------------------------------------------
4151 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4152 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4153 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4154 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4155 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4156 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4157 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4158 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4159 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4160 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4161 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4162 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4163 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4164 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4165 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4166 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4167 ----------------------------------------------------------------------
4169 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4170 of dimension 1, chars 94, and final character <F>, etc...
4172 Note (*): Although these designations are not allowed in ISO2022,
4173 Emacs accepts them on decoding, and produces them on encoding
4174 CHARS96 character sets in a coding system which is characterized as
4175 7-bit environment, non-locking-shift, and non-single-shift.
4177 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4178 '(' can be omitted. We refer to this as "short-form" hereafter.
4180 Now you may notice that there are a lot of ways for encoding the
4181 same multilingual text in ISO2022. Actually, there exist many
4182 coding systems such as Compound Text (used in X11's inter client
4183 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4184 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4185 localized platforms), and all of these are variants of ISO2022.
4187 In addition to the above, Emacs handles two more kinds of escape
4188 sequences: ISO6429's direction specification and Emacs' private
4189 sequence for specifying character composition.
4191 ISO6429's direction specification takes the following form:
4192 o CSI ']' -- end of the current direction
4193 o CSI '0' ']' -- end of the current direction
4194 o CSI '1' ']' -- start of left-to-right text
4195 o CSI '2' ']' -- start of right-to-left text
4196 The control character CSI (0x9B: control sequence introducer) is
4197 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4199 Character composition specification takes the following form:
4200 o ESC '0' -- start character composition
4201 o ESC '1' -- end character composition
4202 Since these are not standard escape sequences of any ISO standard,
4203 their use with these meanings is restricted to Emacs only. */
4206 reset_iso2022(Lisp_Object coding_system, struct iso2022_decoder *iso)
4210 for (i = 0; i < 4; i++) {
4211 if (!NILP(coding_system))
4213 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET
4216 iso->charset[i] = Qt;
4217 iso->invalid_designated[i] = 0;
4219 iso->esc = ISO_ESC_NOTHING;
4220 iso->esc_bytes_index = 0;
4221 iso->register_left = 0;
4222 iso->register_right = 1;
4223 iso->switched_dir_and_no_valid_charset_yet = 0;
4224 iso->invalid_switch_dir = 0;
4225 iso->output_direction_sequence = 0;
4226 iso->output_literally = 0;
4227 #ifdef ENABLE_COMPOSITE_CHARS
4228 if (iso->composite_chars)
4229 Dynarr_reset(iso->composite_chars);
4233 static int fit_to_be_escape_quoted(unsigned char c)
4249 /* Parse one byte of an ISO2022 escape sequence.
4250 If the result is an invalid escape sequence, return 0 and
4251 do not change anything in STR. Otherwise, if the result is
4252 an incomplete escape sequence, update ISO2022.ESC and
4253 ISO2022.ESC_BYTES and return -1. Otherwise, update
4254 all the state variables (but not ISO2022.ESC_BYTES) and
4257 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4258 or invocation of an invalid character set and treat that as
4259 an unrecognized escape sequence.
4261 ********************************************************************
4263 #### Strategies for error annotation and coding orthogonalization
4265 We really want to separate out a number of things. Conceptually,
4266 there is a nested syntax.
4268 At the top level is the ISO 2022 extension syntax, including charset
4269 designation and invocation, and certain auxiliary controls such as the
4270 ISO 6429 direction specification. These are octet-oriented, with the
4271 single exception (AFAIK) of the "exit Unicode" sequence which uses the
4272 UTF's natural width (1 byte for UTF-7 and UTF-8, 2 bytes for UCS-2 and
4273 UTF-16, and 4 bytes for UCS-4 and UTF-32). This will be treated as a
4274 (deprecated) special case in Unicode processing.
4276 The middle layer is ISO 2022 character interpretation. This will depend
4277 on the current state of the ISO 2022 registers, and assembles octets
4278 into the character's internal representation.
4280 The lowest level is translating system control conventions. At present
4281 this is restricted to newline translation, but one could imagine doing
4282 tab conversion or line wrapping here. "Escape from Unicode" processing
4283 would be done at this level.
4285 At each level the parser will verify the syntax. In the case of a
4286 syntax error or warning (such as a redundant escape sequence that affects
4287 no characters), the parser will take some action, typically inserting the
4288 erroneous octets directly into the output and creating an annotation
4289 which can be used by higher level I/O to mark the affected region.
4291 This should make it possible to do something sensible about separating
4292 newline convention processing from character construction, and about
4293 preventing ISO 2022 escape sequences from being recognized
4296 The basic strategy will be to have octet classification tables, and
4297 switch processing according to the table entry.
4299 It's possible that, by doing the processing with tables of functions or
4300 the like, the parser can be used for both detection and translation. */
4303 parse_iso2022_esc(Lisp_Object codesys, struct iso2022_decoder *iso,
4304 unsigned char c, unsigned int *flags,
4305 int check_invalid_charsets)
4307 /* (1) If we're at the end of a designation sequence, CS is the
4308 charset being designated and REG is the register to designate
4311 (2) If we're at the end of a locking-shift sequence, REG is
4312 the register to invoke and HALF (0 == left, 1 == right) is
4313 the half to invoke it into.
4315 (3) If we're at the end of a single-shift sequence, REG is
4316 the register to invoke. */
4317 Lisp_Object cs = Qnil;
4320 /* NOTE: This code does goto's all over the fucking place.
4321 The reason for this is that we're basically implementing
4322 a state machine here, and hierarchical languages like C
4323 don't really provide a clean way of doing this. */
4325 if (!(*flags & CODING_STATE_ESCAPE))
4326 /* At beginning of escape sequence; we need to reset our
4327 escape-state variables. */
4328 iso->esc = ISO_ESC_NOTHING;
4330 iso->output_literally = 0;
4331 iso->output_direction_sequence = 0;
4334 case ISO_ESC_NOTHING:
4335 iso->esc_bytes_index = 0;
4337 case ISO_CODE_ESC: /* Start escape sequence */
4338 *flags |= CODING_STATE_ESCAPE;
4342 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4343 *flags |= CODING_STATE_ESCAPE;
4344 iso->esc = ISO_ESC_5_11;
4347 case ISO_CODE_SO: /* locking shift 1 */
4351 case ISO_CODE_SI: /* locking shift 0 */
4356 case ISO_CODE_SS2: /* single shift */
4359 case ISO_CODE_SS3: /* single shift */
4363 default: /* Other control characters */
4369 /**** single shift ****/
4371 case 'N': /* single shift 2 */
4374 case 'O': /* single shift 3 */
4378 /**** locking shift ****/
4380 case '~': /* locking shift 1 right */
4384 case 'n': /* locking shift 2 */
4388 case '}': /* locking shift 2 right */
4392 case 'o': /* locking shift 3 */
4396 case '|': /* locking shift 3 right */
4401 #ifdef ENABLE_COMPOSITE_CHARS
4402 /**** composite ****/
4405 iso->esc = ISO_ESC_START_COMPOSITE;
4406 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4407 CODING_STATE_COMPOSITE;
4411 iso->esc = ISO_ESC_END_COMPOSITE;
4412 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4413 ~CODING_STATE_COMPOSITE;
4415 #endif /* ENABLE_COMPOSITE_CHARS */
4417 /**** directionality ****/
4420 iso->esc = ISO_ESC_5_11;
4423 /**** designation ****/
4425 case '$': /* multibyte charset prefix */
4426 iso->esc = ISO_ESC_2_4;
4430 if (0x28 <= c && c <= 0x2F) {
4432 (enum iso_esc_flag)(c - 0x28 + ISO_ESC_2_8);
4436 /* This function is called with CODESYS equal to nil when
4437 doing coding-system detection. */
4439 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
4440 && fit_to_be_escape_quoted(c)) {
4441 iso->esc = ISO_ESC_LITERAL;
4442 *flags &= CODING_STATE_ISO2022_LOCK;
4450 /**** directionality ****/
4452 case ISO_ESC_5_11: /* ISO6429 direction control */
4455 (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4456 goto directionality;
4459 iso->esc = ISO_ESC_5_11_0;
4461 iso->esc = ISO_ESC_5_11_1;
4463 iso->esc = ISO_ESC_5_11_2;
4468 case ISO_ESC_5_11_0:
4471 (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4472 goto directionality;
4476 case ISO_ESC_5_11_1:
4479 (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4480 goto directionality;
4484 case ISO_ESC_5_11_2:
4487 (*flags & CODING_STATE_ISO2022_LOCK) |
4489 goto directionality;
4494 iso->esc = ISO_ESC_DIRECTIONALITY;
4495 /* Various junk here to attempt to preserve the direction
4496 sequences literally in the text if they would otherwise be
4497 swallowed due to invalid designations that don't show up as
4498 actual charset changes in the text. */
4499 if (iso->invalid_switch_dir) {
4500 /* We already inserted a direction switch literally into
4501 the text. We assume (#### this may not be right)
4502 that the next direction switch is the one going the
4503 other way, and we need to output that literally as
4505 iso->output_literally = 1;
4506 iso->invalid_switch_dir = 0;
4510 /* If we are in the thrall of an invalid designation,
4511 then stick the directionality sequence literally into
4512 the output stream so it ends up in the original text
4514 for (jj = 0; jj < 4; jj++)
4515 if (iso->invalid_designated[jj])
4518 iso->output_literally = 1;
4519 iso->invalid_switch_dir = 1;
4521 /* Indicate that we haven't yet seen a valid
4522 designation, so that if a switch-dir is
4523 directly followed by an invalid designation,
4524 both get inserted literally. */
4525 iso->switched_dir_and_no_valid_charset_yet = 1;
4529 /**** designation ****/
4532 if (0x28 <= c && c <= 0x2F) {
4534 (enum iso_esc_flag)(c - 0x28 + ISO_ESC_2_4_8);
4537 if (0x40 <= c && c <= 0x42) {
4538 cs = CHARSET_BY_ATTRIBUTES(CHARSET_TYPE_94X94, c,
4539 *flags & CODING_STATE_R2L ?
4540 CHARSET_RIGHT_TO_LEFT :
4541 CHARSET_LEFT_TO_RIGHT);
4558 case ISO_ESC_2_4_10:
4559 case ISO_ESC_2_4_11:
4560 case ISO_ESC_2_4_12:
4561 case ISO_ESC_2_4_13:
4562 case ISO_ESC_2_4_14:
4563 case ISO_ESC_2_4_15:
4564 case ISO_ESC_SINGLE_SHIFT:
4565 case ISO_ESC_LOCKING_SHIFT:
4566 case ISO_ESC_DESIGNATE:
4567 case ISO_ESC_DIRECTIONALITY:
4568 case ISO_ESC_LITERAL:
4573 if (c < '0' || c > '~')
4574 return 0; /* bad final byte */
4576 if (iso->esc >= ISO_ESC_2_8 && iso->esc <= ISO_ESC_2_15) {
4577 type = ((iso->esc >= ISO_ESC_2_12) ?
4578 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4579 reg = (iso->esc - ISO_ESC_2_8) & 3;
4580 } else if (iso->esc >= ISO_ESC_2_4_8 &&
4581 iso->esc <= ISO_ESC_2_4_15) {
4582 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4583 CHARSET_TYPE_96X96 :
4584 CHARSET_TYPE_94X94);
4585 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4587 /* Can this ever be reached? -slb */
4592 cs = CHARSET_BY_ATTRIBUTES(type, c,
4593 *flags & CODING_STATE_R2L ?
4594 CHARSET_RIGHT_TO_LEFT :
4595 CHARSET_LEFT_TO_RIGHT);
4601 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char)c;
4605 if (check_invalid_charsets && !CHARSETP(iso->charset[reg]))
4606 /* can't invoke something that ain't there. */
4608 iso->esc = ISO_ESC_SINGLE_SHIFT;
4609 *flags &= CODING_STATE_ISO2022_LOCK;
4611 *flags |= CODING_STATE_SS2;
4613 *flags |= CODING_STATE_SS3;
4617 if (check_invalid_charsets && !CHARSETP(iso->charset[reg]))
4618 /* can't invoke something that ain't there. */
4621 iso->register_right = reg;
4623 iso->register_left = reg;
4624 *flags &= CODING_STATE_ISO2022_LOCK;
4625 iso->esc = ISO_ESC_LOCKING_SHIFT;
4629 if (NILP(cs) && check_invalid_charsets) {
4630 iso->invalid_designated[reg] = 1;
4631 iso->charset[reg] = Vcharset_ascii;
4632 iso->esc = ISO_ESC_DESIGNATE;
4633 *flags &= CODING_STATE_ISO2022_LOCK;
4634 iso->output_literally = 1;
4635 if (iso->switched_dir_and_no_valid_charset_yet) {
4636 /* We encountered a switch-direction followed by an
4637 invalid designation. Ensure that the switch-direction
4638 gets outputted; otherwise it will probably get eaten
4639 when the text is written out again. */
4640 iso->switched_dir_and_no_valid_charset_yet = 0;
4641 iso->output_direction_sequence = 1;
4642 /* And make sure that the switch-dir going the other
4643 way gets outputted, as well. */
4644 iso->invalid_switch_dir = 1;
4648 /* This function is called with CODESYS equal to nil when
4649 doing coding-system detection. */
4650 if (!NILP(codesys)) {
4651 charset_conversion_spec_dynarr *dyn =
4652 XCODING_SYSTEM(codesys)->iso2022.input_conv;
4657 for (i = 0; i < Dynarr_length(dyn); i++) {
4658 struct charset_conversion_spec *spec =
4660 if (EQ(cs, spec->from_charset))
4661 cs = spec->to_charset;
4666 iso->charset[reg] = cs;
4667 iso->esc = ISO_ESC_DESIGNATE;
4668 *flags &= CODING_STATE_ISO2022_LOCK;
4669 if (iso->invalid_designated[reg]) {
4670 iso->invalid_designated[reg] = 0;
4671 iso->output_literally = 1;
4673 if (iso->switched_dir_and_no_valid_charset_yet)
4674 iso->switched_dir_and_no_valid_charset_yet = 0;
4679 detect_coding_iso2022(struct detection_state *st, const Extbyte * src,
4680 Lstream_data_count n)
4684 /* #### There are serious deficiencies in the recognition mechanism
4685 here. This needs to be much smarter if it's going to cut it.
4686 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4687 it should be detected as Latin-1.
4688 All the ISO2022 stuff in this file should be synced up with the
4689 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4690 Perhaps we should wait till R2L works in FSF Emacs? */
4692 if (!st->iso2022.initted) {
4693 reset_iso2022(Qnil, &st->iso2022.iso);
4694 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4695 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4696 CODING_CATEGORY_ISO_8_1_MASK |
4697 CODING_CATEGORY_ISO_8_2_MASK |
4698 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4699 st->iso2022.flags = 0;
4700 st->iso2022.high_byte_count = 0;
4701 st->iso2022.saw_single_shift = 0;
4702 st->iso2022.initted = 1;
4705 mask = st->iso2022.mask;
4708 const unsigned char c = *(const unsigned char *)src++;
4710 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4711 st->iso2022.high_byte_count++;
4713 if (st->iso2022.high_byte_count
4714 && !st->iso2022.saw_single_shift) {
4715 if (st->iso2022.high_byte_count & 1)
4716 /* odd number of high bytes; assume not iso-8-2 */
4717 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4719 st->iso2022.high_byte_count = 0;
4720 st->iso2022.saw_single_shift = 0;
4722 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4724 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4725 && (BYTE_C0_P(c) || BYTE_C1_P(c))) { /* control chars */
4727 /* Allow and ignore control characters that you might
4728 reasonably see in a text file */
4733 case 8: /* backspace */
4734 case 11: /* vertical tab */
4735 case 12: /* form feed */
4736 case 26: /* MS-DOS C-z junk */
4737 case 31: /* '^_' -- for info */
4738 goto label_continue_loop;
4745 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P(c)
4747 if (parse_iso2022_esc(Qnil, &st->iso2022.iso, c,
4748 &st->iso2022.flags, 0)) {
4749 switch (st->iso2022.iso.esc) {
4750 case ISO_ESC_DESIGNATE:
4751 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4752 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4754 case ISO_ESC_LOCKING_SHIFT:
4755 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4756 goto ran_out_of_chars;
4757 case ISO_ESC_SINGLE_SHIFT:
4758 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4759 st->iso2022.saw_single_shift = 1;
4763 case ISO_ESC_NOTHING:
4776 case ISO_ESC_2_4_10:
4777 case ISO_ESC_2_4_11:
4778 case ISO_ESC_2_4_12:
4779 case ISO_ESC_2_4_13:
4780 case ISO_ESC_2_4_14:
4781 case ISO_ESC_2_4_15:
4783 case ISO_ESC_5_11_0:
4784 case ISO_ESC_5_11_1:
4785 case ISO_ESC_5_11_2:
4786 case ISO_ESC_DIRECTIONALITY:
4787 case ISO_ESC_LITERAL:
4793 goto ran_out_of_chars;
4796 label_continue_loop:;
4803 static int postprocess_iso2022_mask(int mask)
4805 /* #### kind of cheesy */
4806 /* If seven-bit ISO is allowed, then assume that the encoding is
4807 entirely seven-bit and turn off the eight-bit ones. */
4808 if (mask & CODING_CATEGORY_ISO_7_MASK)
4809 mask &= ~(CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4810 CODING_CATEGORY_ISO_8_1_MASK |
4811 CODING_CATEGORY_ISO_8_2_MASK);
4815 /* If FLAGS is a null pointer or specifies right-to-left motion,
4816 output a switch-dir-to-left-to-right sequence to DST.
4817 Also update FLAGS if it is not a null pointer.
4818 If INTERNAL_P is set, we are outputting in internal format and
4819 need to handle the CSI differently. */
4822 restore_left_to_right_direction(Lisp_Coding_System * codesys,
4823 unsigned_char_dynarr * dst,
4824 unsigned int *flags, int internal_p)
4826 if (!flags || (*flags & CODING_STATE_R2L)) {
4827 if (CODING_SYSTEM_ISO2022_SEVEN(codesys)) {
4828 Dynarr_add(dst, ISO_CODE_ESC);
4829 Dynarr_add(dst, '[');
4830 } else if (internal_p)
4831 DECODE_ADD_BINARY_CHAR(ISO_CODE_CSI, dst);
4833 Dynarr_add(dst, ISO_CODE_CSI);
4834 Dynarr_add(dst, '0');
4835 Dynarr_add(dst, ']');
4837 *flags &= ~CODING_STATE_R2L;
4841 /* If FLAGS is a null pointer or specifies a direction different from
4842 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4843 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4844 sequence to DST. Also update FLAGS if it is not a null pointer.
4845 If INTERNAL_P is set, we are outputting in internal format and
4846 need to handle the CSI differently. */
4849 ensure_correct_direction(int direction, Lisp_Coding_System * codesys,
4850 unsigned_char_dynarr * dst, unsigned int *flags,
4853 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4854 direction == CHARSET_LEFT_TO_RIGHT)
4855 restore_left_to_right_direction(codesys, dst, flags,
4857 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429(codesys)
4858 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4859 direction == CHARSET_RIGHT_TO_LEFT) {
4860 if (CODING_SYSTEM_ISO2022_SEVEN(codesys)) {
4861 Dynarr_add(dst, ISO_CODE_ESC);
4862 Dynarr_add(dst, '[');
4863 } else if (internal_p)
4864 DECODE_ADD_BINARY_CHAR(ISO_CODE_CSI, dst);
4866 Dynarr_add(dst, ISO_CODE_CSI);
4867 Dynarr_add(dst, '2');
4868 Dynarr_add(dst, ']');
4870 *flags |= CODING_STATE_R2L;
4874 /* Convert ISO2022-format data to internal format. */
4877 decode_coding_iso2022(lstream_t decoding, const Extbyte * src,
4878 unsigned_char_dynarr * dst, Lstream_data_count n)
4880 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
4881 unsigned int flags = str->flags;
4882 unsigned int ch = str->ch;
4883 eol_type_t eol_type = str->eol_type;
4884 #ifdef ENABLE_COMPOSITE_CHARS
4885 unsigned_char_dynarr *real_dst = dst;
4887 Lisp_Object coding_system;
4889 XSETCODING_SYSTEM(coding_system, str->codesys);
4891 #ifdef ENABLE_COMPOSITE_CHARS
4892 if (flags & CODING_STATE_COMPOSITE)
4893 dst = str->iso2022.composite_chars;
4894 #endif /* ENABLE_COMPOSITE_CHARS */
4897 const unsigned char c = *(const unsigned char *)src++;
4898 if (flags & CODING_STATE_ESCAPE) {
4899 /* Within ESC sequence */
4900 int retval = parse_iso2022_esc(
4901 coding_system, &str->iso2022, c, &flags, 1);
4904 switch (str->iso2022.esc) {
4905 #ifdef ENABLE_COMPOSITE_CHARS
4906 case ISO_ESC_START_COMPOSITE:
4907 if (str->iso2022.composite_chars)
4908 Dynarr_reset(str->iso2022.
4911 str->iso2022.composite_chars =
4912 Dynarr_new(unsigned_char);
4913 dst = str->iso2022.composite_chars;
4915 case ISO_ESC_END_COMPOSITE:
4917 Bufbyte comstr[MAX_EMCHAR_LEN];
4920 lookup_composite_char
4921 (Dynarr_atp(dst, 0),
4922 Dynarr_length(dst));
4925 set_charptr_emchar(comstr,
4927 Dynarr_add_many(dst, comstr,
4931 #endif /* ENABLE_COMPOSITE_CHARS */
4933 case ISO_ESC_LITERAL:
4934 DECODE_ADD_BINARY_CHAR(c, dst);
4937 case ISO_ESC_NOTHING:
4950 case ISO_ESC_2_4_10:
4951 case ISO_ESC_2_4_11:
4952 case ISO_ESC_2_4_12:
4953 case ISO_ESC_2_4_13:
4954 case ISO_ESC_2_4_14:
4955 case ISO_ESC_2_4_15:
4957 case ISO_ESC_5_11_0:
4958 case ISO_ESC_5_11_1:
4959 case ISO_ESC_5_11_2:
4960 case ISO_ESC_SINGLE_SHIFT:
4961 case ISO_ESC_LOCKING_SHIFT:
4962 case ISO_ESC_DESIGNATE:
4963 case ISO_ESC_DIRECTIONALITY:
4966 /* Everything else handled already */
4971 /* Attempted error recovery. */
4972 if (str->iso2022.output_direction_sequence)
4973 ensure_correct_direction(flags &
4975 CHARSET_RIGHT_TO_LEFT :
4976 CHARSET_LEFT_TO_RIGHT,
4977 str->codesys, dst, 0,
4979 /* More error recovery. */
4980 if (!retval || str->iso2022.output_literally) {
4981 /* Output the (possibly invalid) sequence */
4983 for (i = 0; i < str->iso2022.esc_bytes_index;
4985 DECODE_ADD_BINARY_CHAR(str->iso2022.
4988 flags &= CODING_STATE_ISO2022_LOCK;
4990 n++, src--; /* Repeat the loop with the same character. */
4992 /* No sense in reprocessing the final byte of the
4993 escape sequence; it could mess things up anyway.
4995 DECODE_ADD_BINARY_CHAR(c, dst);
4999 } else if (BYTE_C0_P(c) || BYTE_C1_P(c)) { /* Control characters */
5001 /***** Error-handling *****/
5003 /* If we were in the middle of a character, dump out the
5004 partial character. */
5005 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5007 /* If we just saw a single-shift character, dump it out.
5008 This may dump out the wrong sort of single-shift character,
5009 but least it will give an indication that something went
5011 if (flags & CODING_STATE_SS2) {
5012 DECODE_ADD_BINARY_CHAR(ISO_CODE_SS2, dst);
5013 flags &= ~CODING_STATE_SS2;
5015 if (flags & CODING_STATE_SS3) {
5016 DECODE_ADD_BINARY_CHAR(ISO_CODE_SS3, dst);
5017 flags &= ~CODING_STATE_SS3;
5020 /***** Now handle the control characters. *****/
5023 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5025 flags &= CODING_STATE_ISO2022_LOCK;
5027 if (!parse_iso2022_esc
5028 (coding_system, &str->iso2022, c, &flags, 1))
5029 DECODE_ADD_BINARY_CHAR(c, dst);
5030 } else { /* Graphic characters */
5031 Lisp_Object charset;
5035 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5037 /* Now determine the charset. */
5038 reg = ((flags & CODING_STATE_SS2) ? 2
5039 : (flags & CODING_STATE_SS3) ? 3
5040 : !BYTE_ASCII_P(c) ? str->iso2022.register_right
5041 : str->iso2022.register_left);
5042 charset = str->iso2022.charset[reg];
5044 /* Error checking: */
5045 if (!CHARSETP(charset)
5046 || str->iso2022.invalid_designated[reg]
5048 (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5049 && XCHARSET_CHARS(charset) == 94))
5050 /* Mrmph. We are trying to invoke a register that has no
5051 or an invalid charset in it, or trying to add a character
5052 outside the range of the charset. Insert that char literally
5053 to preserve it for the output. */
5055 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5056 DECODE_ADD_BINARY_CHAR(c, dst);
5060 /* Things are probably hunky-dorey. */
5062 /* Fetch reverse charset, maybe. */
5063 if (((flags & CODING_STATE_R2L) &&
5064 XCHARSET_DIRECTION(charset) ==
5065 CHARSET_LEFT_TO_RIGHT)
5066 || (!(flags & CODING_STATE_R2L)
5067 && XCHARSET_DIRECTION(charset) ==
5068 CHARSET_RIGHT_TO_LEFT)) {
5069 Lisp_Object new_charset =
5070 XCHARSET_REVERSE_DIRECTION_CHARSET
5072 if (!NILP(new_charset))
5073 charset = new_charset;
5076 lb = XCHARSET_LEADING_BYTE(charset);
5077 switch (XCHARSET_REP_BYTES(charset)) {
5079 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5080 Dynarr_add(dst, c & 0x7F);
5083 case 2: /* one-byte official */
5084 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5085 Dynarr_add(dst, lb);
5086 Dynarr_add(dst, c | 0x80);
5089 case 3: /* one-byte private or two-byte official */
5090 if (XCHARSET_PRIVATE_P(charset)) {
5091 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5093 PRE_LEADING_BYTE_PRIVATE_1);
5094 Dynarr_add(dst, lb);
5095 Dynarr_add(dst, c | 0x80);
5098 Dynarr_add(dst, lb);
5109 default: /* two-byte private */
5112 PRE_LEADING_BYTE_PRIVATE_2);
5113 Dynarr_add(dst, lb);
5114 Dynarr_add(dst, ch | 0x80);
5115 Dynarr_add(dst, c | 0x80);
5123 flags &= CODING_STATE_ISO2022_LOCK;
5126 label_continue_loop:;
5129 if (flags & CODING_STATE_END)
5130 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5136 /***** ISO2022 encoder *****/
5138 /* Designate CHARSET into register REG. */
5141 iso2022_designate(Lisp_Object charset, unsigned char reg,
5142 encoding_stream_t str, unsigned_char_dynarr * dst)
5144 static const char inter94[] = "()*+";
5145 static const char inter96[] = ",-./";
5147 unsigned char final;
5148 Lisp_Object old_charset = str->iso2022.charset[reg];
5150 str->iso2022.charset[reg] = charset;
5151 if (!CHARSETP(charset))
5152 /* charset might be an initial nil or t. */
5154 type = XCHARSET_TYPE(charset);
5155 final = XCHARSET_FINAL(charset);
5156 if (!str->iso2022.force_charset_on_output[reg] &&
5157 CHARSETP(old_charset) &&
5158 XCHARSET_TYPE(old_charset) == type &&
5159 XCHARSET_FINAL(old_charset) == final)
5162 str->iso2022.force_charset_on_output[reg] = 0;
5165 charset_conversion_spec_dynarr *dyn =
5166 str->codesys->iso2022.output_conv;
5171 for (i = 0; i < Dynarr_length(dyn); i++) {
5172 struct charset_conversion_spec *spec =
5174 if (EQ(charset, spec->from_charset))
5175 charset = spec->to_charset;
5180 Dynarr_add(dst, ISO_CODE_ESC);
5182 case CHARSET_TYPE_94:
5183 Dynarr_add(dst, inter94[reg]);
5185 case CHARSET_TYPE_96:
5186 Dynarr_add(dst, inter96[reg]);
5188 case CHARSET_TYPE_94X94:
5189 Dynarr_add(dst, '$');
5190 if (reg != 0 || !(CODING_SYSTEM_ISO2022_SHORT(str->codesys))
5191 || final < '@' || final > 'B')
5192 Dynarr_add(dst, inter94[reg]);
5194 case CHARSET_TYPE_96X96:
5195 Dynarr_add(dst, '$');
5196 Dynarr_add(dst, inter96[reg]);
5201 Dynarr_add(dst, final);
5205 ensure_normal_shift(encoding_stream_t str, unsigned_char_dynarr * dst)
5207 if (str->iso2022.register_left != 0) {
5208 Dynarr_add(dst, ISO_CODE_SI);
5209 str->iso2022.register_left = 0;
5214 ensure_shift_out(encoding_stream_t str, unsigned_char_dynarr * dst)
5216 if (str->iso2022.register_left != 1) {
5217 Dynarr_add(dst, ISO_CODE_SO);
5218 str->iso2022.register_left = 1;
5222 /* Convert internally-formatted data to ISO2022 format. */
5225 encode_coding_iso2022(lstream_t encoding, const Bufbyte * src,
5226 unsigned_char_dynarr * dst, Lstream_data_count n)
5228 unsigned char charmask, c;
5229 unsigned char char_boundary;
5230 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
5231 unsigned int flags = str->flags;
5232 unsigned int ch = str->ch;
5233 Lisp_Coding_System *codesys = str->codesys;
5234 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
5236 Lisp_Object charset;
5239 #ifdef ENABLE_COMPOSITE_CHARS
5240 /* flags for handling composite chars. We do a little switcharoo
5241 on the source while we're outputting the composite char. */
5242 unsigned int saved_n = 0;
5243 const unsigned char *saved_src = NULL;
5244 int in_composite = 0;
5245 #endif /* ENABLE_COMPOSITE_CHARS */
5247 char_boundary = str->iso2022.current_char_boundary;
5248 charset = str->iso2022.current_charset;
5249 half = str->iso2022.current_half;
5251 #ifdef ENABLE_COMPOSITE_CHARS
5257 if (BYTE_ASCII_P(c)) { /* Processing ASCII character */
5260 restore_left_to_right_direction(codesys, dst, &flags,
5263 /* Make sure G0 contains ASCII */
5264 if ((c > ' ' && c < ISO_CODE_DEL) ||
5265 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL(codesys)) {
5266 ensure_normal_shift(str, dst);
5267 iso2022_designate(Vcharset_ascii, 0, str, dst);
5270 /* If necessary, restore everything to the default state
5273 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL(codesys))) {
5274 restore_left_to_right_direction(codesys, dst,
5277 ensure_normal_shift(str, dst);
5279 for (i = 0; i < 4; i++) {
5280 Lisp_Object initial_charset =
5281 CODING_SYSTEM_ISO2022_INITIAL_CHARSET
5283 iso2022_designate(initial_charset, i,
5288 if (eol_type != EOL_LF
5289 && eol_type != EOL_AUTODETECT)
5290 Dynarr_add(dst, '\r');
5291 if (eol_type != EOL_CR)
5294 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
5295 && fit_to_be_escape_quoted(c))
5296 Dynarr_add(dst, ISO_CODE_ESC);
5302 else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) { /* Processing Leading Byte */
5304 charset = CHARSET_BY_LEADING_BYTE(c);
5305 if (LEADING_BYTE_PREFIX_P(c))
5307 else if (!EQ(charset, Vcharset_control_1)
5308 #ifdef ENABLE_COMPOSITE_CHARS
5309 && !EQ(charset, Vcharset_composite)
5314 ensure_correct_direction(XCHARSET_DIRECTION
5318 /* Now determine which register to use. */
5320 for (i = 0; i < 4; i++) {
5321 if (EQ(charset, str->iso2022.charset[i])
5323 CODING_SYSTEM_ISO2022_INITIAL_CHARSET
5331 if (XCHARSET_GRAPHIC(charset) != 0) {
5333 (str->iso2022.charset[1])
5335 (!CODING_SYSTEM_ISO2022_SEVEN
5338 CODING_SYSTEM_ISO2022_LOCK_SHIFT
5355 iso2022_designate(charset, reg, str, dst);
5357 /* Now invoke that register. */
5360 ensure_normal_shift(str, dst);
5365 if (CODING_SYSTEM_ISO2022_SEVEN
5367 ensure_shift_out(str, dst);
5374 if (CODING_SYSTEM_ISO2022_SEVEN
5376 Dynarr_add(dst, ISO_CODE_ESC);
5377 Dynarr_add(dst, 'N');
5380 Dynarr_add(dst, ISO_CODE_SS2);
5386 if (CODING_SYSTEM_ISO2022_SEVEN
5388 Dynarr_add(dst, ISO_CODE_ESC);
5389 Dynarr_add(dst, 'O');
5392 Dynarr_add(dst, ISO_CODE_SS3);
5402 } else { /* Processing Non-ASCII character */
5403 charmask = (half == 0 ? 0x7F : 0xFF);
5405 if (EQ(charset, Vcharset_control_1)) {
5406 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
5407 && fit_to_be_escape_quoted(c))
5408 Dynarr_add(dst, ISO_CODE_ESC);
5409 /* you asked for it ... */
5410 Dynarr_add(dst, c - 0x20);
5412 switch (XCHARSET_REP_BYTES(charset)) {
5414 Dynarr_add(dst, c & charmask);
5417 if (XCHARSET_PRIVATE_P(charset)) {
5418 Dynarr_add(dst, c & charmask);
5421 #ifdef ENABLE_COMPOSITE_CHARS
5424 Vcharset_composite)) {
5426 /* #### Bother! We don't know how to
5433 (Vcharset_composite,
5438 composite_char_string
5447 n = XSTRING_LENGTH(lstr);
5450 Dynarr_add(dst, '0'); /* start composing */
5453 #endif /* ENABLE_COMPOSITE_CHARS */
5470 Dynarr_add(dst, ch & charmask);
5471 Dynarr_add(dst, c & charmask);
5485 #ifdef ENABLE_COMPOSITE_CHARS
5490 Dynarr_add(dst, ISO_CODE_ESC);
5491 Dynarr_add(dst, '1'); /* end composing */
5492 goto back_to_square_n; /* Wheeeeeeeee ..... */
5494 #endif /* ENABLE_COMPOSITE_CHARS */
5496 if (char_boundary && flags & CODING_STATE_END) {
5497 restore_left_to_right_direction(codesys, dst, &flags, 0);
5498 ensure_normal_shift(str, dst);
5499 for (i = 0; i < 4; i++) {
5500 Lisp_Object initial_charset =
5501 CODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, i);
5502 iso2022_designate(initial_charset, i, str, dst);
5508 str->iso2022.current_char_boundary = char_boundary;
5509 str->iso2022.current_charset = charset;
5510 str->iso2022.current_half = half;
5512 /* Verbum caro factum est! */
5516 /************************************************************************/
5517 /* No-conversion methods */
5518 /************************************************************************/
5520 /* This is used when reading in "binary" files -- i.e. files that may
5521 contain all 256 possible byte values and that are not to be
5522 interpreted as being in any particular decoding. */
5524 decode_coding_no_conversion(lstream_t decoding, const Extbyte * src,
5525 unsigned_char_dynarr * dst, Lstream_data_count n)
5527 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
5528 unsigned int flags = str->flags;
5529 unsigned int ch = str->ch;
5530 eol_type_t eol_type = str->eol_type;
5533 const unsigned char c = *(const unsigned char *)src++;
5535 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5536 DECODE_ADD_BINARY_CHAR(c, dst);
5537 label_continue_loop:;
5540 DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
5547 encode_coding_no_conversion(lstream_t encoding, const Bufbyte * src,
5548 unsigned_char_dynarr * dst, Lstream_data_count n)
5551 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
5552 unsigned int flags = str->flags;
5553 unsigned int ch = str->ch;
5554 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
5559 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5560 Dynarr_add(dst, '\r');
5561 if (eol_type != EOL_CR)
5562 Dynarr_add(dst, '\n');
5564 } else if (BYTE_ASCII_P(c)) {
5567 } else if (BUFBYTE_LEADING_BYTE_P(c)) {
5569 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5570 c == LEADING_BYTE_CONTROL_1)
5573 Dynarr_add(dst, '~'); /* untranslatable character */
5575 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5577 else if (ch == LEADING_BYTE_CONTROL_1) {
5579 Dynarr_add(dst, c - 0x20);
5581 /* else it should be the second or third byte of an
5582 untranslatable character, so ignore it */
5591 /************************************************************************/
5592 /* Initialization */
5593 /************************************************************************/
5595 void syms_of_file_coding(void)
5597 INIT_LRECORD_IMPLEMENTATION(coding_system);
5599 DEFERROR_STANDARD(Qcoding_system_error, Qio_error);
5601 DEFSUBR(Fcoding_system_p);
5602 DEFSUBR(Ffind_coding_system);
5603 DEFSUBR(Fget_coding_system);
5604 DEFSUBR(Fcoding_system_list);
5605 DEFSUBR(Fcoding_system_name);
5606 DEFSUBR(Fmake_coding_system);
5607 DEFSUBR(Fcopy_coding_system);
5608 DEFSUBR(Fcoding_system_canonical_name_p);
5609 DEFSUBR(Fcoding_system_alias_p);
5610 DEFSUBR(Fcoding_system_aliasee);
5611 DEFSUBR(Fdefine_coding_system_alias);
5612 DEFSUBR(Fsubsidiary_coding_system);
5614 DEFSUBR(Fcoding_system_type);
5615 DEFSUBR(Fcoding_system_doc_string);
5617 DEFSUBR(Fcoding_system_charset);
5619 DEFSUBR(Fcoding_system_property);
5621 DEFSUBR(Fcoding_category_list);
5622 DEFSUBR(Fset_coding_priority_list);
5623 DEFSUBR(Fcoding_priority_list);
5624 DEFSUBR(Fset_coding_category_system);
5625 DEFSUBR(Fcoding_category_system);
5627 DEFSUBR(Fdetect_coding_region);
5628 DEFSUBR(Fdecode_coding_region);
5629 DEFSUBR(Fencode_coding_region);
5631 DEFSUBR(Fdecode_shift_jis_char);
5632 DEFSUBR(Fencode_shift_jis_char);
5633 DEFSUBR(Fdecode_big5_char);
5634 DEFSUBR(Fencode_big5_char);
5635 DEFSUBR(Fset_ucs_char);
5637 DEFSUBR(Fset_char_ucs);
5640 defsymbol(&Qcoding_systemp, "coding-system-p");
5641 defsymbol(&Qno_conversion, "no-conversion");
5642 defsymbol(&Qraw_text, "raw-text");
5644 defsymbol(&Qbig5, "big5");
5645 defsymbol(&Qshift_jis, "shift-jis");
5646 defsymbol(&Qucs4, "ucs-4");
5647 defsymbol(&Qutf8, "utf-8");
5648 defsymbol(&Qccl, "ccl");
5649 defsymbol(&Qiso2022, "iso2022");
5651 defsymbol(&Qmnemonic, "mnemonic");
5652 defsymbol(&Qeol_type, "eol-type");
5653 defsymbol(&Qpost_read_conversion, "post-read-conversion");
5654 defsymbol(&Qpre_write_conversion, "pre-write-conversion");
5656 defsymbol(&Qcr, "cr");
5657 defsymbol(&Qlf, "lf");
5658 defsymbol(&Qcrlf, "crlf");
5659 defsymbol(&Qeol_cr, "eol-cr");
5660 defsymbol(&Qeol_lf, "eol-lf");
5661 defsymbol(&Qeol_crlf, "eol-crlf");
5663 defsymbol(&Qcharset_g0, "charset-g0");
5664 defsymbol(&Qcharset_g1, "charset-g1");
5665 defsymbol(&Qcharset_g2, "charset-g2");
5666 defsymbol(&Qcharset_g3, "charset-g3");
5667 defsymbol(&Qforce_g0_on_output, "force-g0-on-output");
5668 defsymbol(&Qforce_g1_on_output, "force-g1-on-output");
5669 defsymbol(&Qforce_g2_on_output, "force-g2-on-output");
5670 defsymbol(&Qforce_g3_on_output, "force-g3-on-output");
5671 defsymbol(&Qno_iso6429, "no-iso6429");
5672 defsymbol(&Qinput_charset_conversion, "input-charset-conversion");
5673 defsymbol(&Qoutput_charset_conversion, "output-charset-conversion");
5675 defsymbol(&Qno_ascii_eol, "no-ascii-eol");
5676 defsymbol(&Qno_ascii_cntl, "no-ascii-cntl");
5677 defsymbol(&Qseven, "seven");
5678 defsymbol(&Qlock_shift, "lock-shift");
5679 defsymbol(&Qescape_quoted, "escape-quoted");
5681 defsymbol(&Qencode, "encode");
5682 defsymbol(&Qdecode, "decode");
5685 defsymbol(&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5687 defsymbol(&coding_category_symbol[CODING_CATEGORY_BIG5], "big5");
5688 defsymbol(&coding_category_symbol[CODING_CATEGORY_UCS4], "ucs-4");
5689 defsymbol(&coding_category_symbol[CODING_CATEGORY_UTF8], "utf-8");
5690 defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_7], "iso-7");
5691 defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5693 defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_1], "iso-8-1");
5694 defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_2], "iso-8-2");
5695 defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5698 defsymbol(&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5702 void lstream_type_create_file_coding(void)
5704 LSTREAM_HAS_METHOD(decoding, reader);
5705 LSTREAM_HAS_METHOD(decoding, writer);
5706 LSTREAM_HAS_METHOD(decoding, rewinder);
5707 LSTREAM_HAS_METHOD(decoding, seekable_p);
5708 LSTREAM_HAS_METHOD(decoding, flusher);
5709 LSTREAM_HAS_METHOD(decoding, closer);
5710 LSTREAM_HAS_METHOD(decoding, marker);
5712 LSTREAM_HAS_METHOD(encoding, reader);
5713 LSTREAM_HAS_METHOD(encoding, writer);
5714 LSTREAM_HAS_METHOD(encoding, rewinder);
5715 LSTREAM_HAS_METHOD(encoding, seekable_p);
5716 LSTREAM_HAS_METHOD(encoding, flusher);
5717 LSTREAM_HAS_METHOD(encoding, closer);
5718 LSTREAM_HAS_METHOD(encoding, marker);
5721 void vars_of_file_coding(void)
5725 fcd = xnew(struct file_coding_dump);
5726 dump_add_root_struct_ptr(&fcd, &fcd_description);
5728 /* Initialize to something reasonable ... */
5729 for (i = 0; i < CODING_CATEGORY_LAST; i++) {
5730 fcd->coding_category_system[i] = Qnil;
5731 fcd->coding_category_by_priority[i] = i;
5734 Fprovide(intern("file-coding"));
5736 DEFVAR_LISP("keyboard-coding-system", &Vkeyboard_coding_system /*
5737 Coding system used for TTY keyboard input.
5738 Not used under a windowing system.
5740 Vkeyboard_coding_system = Qnil;
5742 DEFVAR_LISP("terminal-coding-system", &Vterminal_coding_system /*
5743 Coding system used for TTY display output.
5744 Not used under a windowing system.
5746 Vterminal_coding_system = Qnil;
5748 DEFVAR_LISP("coding-system-for-read", &Vcoding_system_for_read /*
5749 Overriding coding system used when reading from a file or process.
5750 You should bind this variable with `let', but do not set it globally.
5751 If this is non-nil, it specifies the coding system that will be used
5752 to decode input on read operations, such as from a file or process.
5753 It overrides `buffer-file-coding-system-for-read',
5754 `insert-file-contents-pre-hook', etc. Use those variables instead of
5755 this one for permanent changes to the environment. */ );
5756 Vcoding_system_for_read = Qnil;
5758 DEFVAR_LISP("coding-system-for-write", &Vcoding_system_for_write /*
5759 Overriding coding system used when writing to a file or process.
5760 You should bind this variable with `let', but do not set it globally.
5761 If this is non-nil, it specifies the coding system that will be used
5762 to encode output for write operations, such as to a file or process.
5763 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
5764 Use those variables instead of this one for permanent changes to the
5766 Vcoding_system_for_write = Qnil;
5768 DEFVAR_LISP("file-name-coding-system", &Vfile_name_coding_system /*
5769 Coding system used to convert pathnames when accessing files.
5771 Vfile_name_coding_system = Qnil;
5773 DEFVAR_BOOL("enable-multibyte-characters", &enable_multibyte_characters /*
5774 Non-nil means the buffer contents are regarded as multi-byte form
5775 of characters, not a binary code. This affects the display, file I/O,
5776 and behaviors of various editing commands.
5778 Setting this to nil does not do anything.
5780 enable_multibyte_characters = 1;
5783 void complex_vars_of_file_coding(void)
5785 staticpro(&Vcoding_system_hash_table);
5786 Vcoding_system_hash_table =
5787 make_lisp_hash_table(50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5789 the_codesys_prop_dynarr = Dynarr_new(codesys_prop);
5790 dump_add_root_struct_ptr(&the_codesys_prop_dynarr,
5791 &codesys_prop_dynarr_description);
5793 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5795 struct codesys_prop csp; \
5797 csp.prop_type = (Prop_Type); \
5798 Dynarr_add (the_codesys_prop_dynarr, csp); \
5801 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qmnemonic);
5802 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_type);
5803 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_cr);
5804 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_crlf);
5805 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_lf);
5806 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5807 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5809 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g0);
5810 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g1);
5811 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g2);
5812 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g3);
5813 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5814 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5815 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5816 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5817 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qshort);
5818 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_ascii_eol);
5819 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5820 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qseven);
5821 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qlock_shift);
5822 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_iso6429);
5823 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qescape_quoted);
5824 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5825 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5827 DEFINE_CODESYS_PROP(CODESYS_PROP_CCL, Qencode);
5828 DEFINE_CODESYS_PROP(CODESYS_PROP_CCL, Qdecode);
5830 /* Need to create this here or we're really screwed. */
5832 (Qraw_text, Qno_conversion,
5834 ("Raw text, which means it converts only line-break-codes."),
5835 list2(Qmnemonic, build_string("Raw")));
5838 (Qbinary, Qno_conversion,
5839 build_string("Binary, which means it does not convert anything."),
5840 list4(Qeol_type, Qlf, Qmnemonic, build_string("Binary")));
5842 Fdefine_coding_system_alias(Qno_conversion, Qraw_text);
5844 Fdefine_coding_system_alias(Qfile_name, Qbinary);
5846 Fdefine_coding_system_alias(Qterminal, Qbinary);
5847 Fdefine_coding_system_alias(Qkeyboard, Qbinary);
5849 /* Need this for bootstrapping */
5850 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5851 Fget_coding_system(Qraw_text);
5857 for (i = 0; i < countof(fcd->ucs_to_mule_table); i++)
5858 fcd->ucs_to_mule_table[i] = Qnil;
5860 staticpro(&mule_to_ucs_table);
5861 mule_to_ucs_table = Fmake_char_table(Qgeneric);