1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of SXEmacs
7 SXEmacs is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
12 SXEmacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
21 /* Synched up with: Mule 2.3. Not in FSF. */
23 /* Rewritten by Ben Wing <ben@xemacs.org>. */
30 #include "ui/insdel.h"
37 #include "file-coding.h"
39 Lisp_Object Qcoding_system_error;
41 Lisp_Object Vkeyboard_coding_system;
42 Lisp_Object Vterminal_coding_system;
43 Lisp_Object Vcoding_system_for_read;
44 Lisp_Object Vcoding_system_for_write;
45 Lisp_Object Vfile_name_coding_system;
47 /* Table of symbols identifying each coding category. */
48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
50 struct file_coding_dump {
51 /* Coding system currently associated with each coding category. */
52 Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
54 /* Table of all coding categories in decreasing order of priority.
55 This describes a permutation of the possible coding categories. */
56 int coding_category_by_priority[CODING_CATEGORY_LAST];
59 Lisp_Object ucs_to_mule_table[65536];
63 static const struct lrecord_description fcd_description_1[] = {
64 {XD_LISP_OBJECT_ARRAY,
65 offsetof(struct file_coding_dump, coding_category_system),
66 CODING_CATEGORY_LAST},
68 {XD_LISP_OBJECT_ARRAY,
69 offsetof(struct file_coding_dump, ucs_to_mule_table),
70 countof(fcd->ucs_to_mule_table)},
75 static const struct struct_description fcd_description = {
76 sizeof(struct file_coding_dump),
80 Lisp_Object mule_to_ucs_table;
82 Lisp_Object Qcoding_systemp;
84 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
85 /* Qinternal in general.c */
87 Lisp_Object Qmnemonic, Qeol_type;
88 Lisp_Object Qcr, Qcrlf, Qlf;
89 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
90 Lisp_Object Qpost_read_conversion;
91 Lisp_Object Qpre_write_conversion;
94 Lisp_Object Qucs4, Qutf8;
95 Lisp_Object Qbig5, Qshift_jis;
96 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
97 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
98 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
99 Lisp_Object Qno_iso6429;
100 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
101 Lisp_Object Qescape_quoted;
102 Lisp_Object Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
104 Lisp_Object Qencode, Qdecode;
106 Lisp_Object Vcoding_system_hash_table;
108 int enable_multibyte_characters;
111 /* Additional information used by the ISO2022 decoder and detector. */
112 struct iso2022_decoder {
113 /* CHARSET holds the character sets currently assigned to the G0
114 through G3 variables. It is initialized from the array
115 INITIAL_CHARSET in CODESYS. */
116 Lisp_Object charset[4];
118 /* Which registers are currently invoked into the left (GL) and
119 right (GR) halves of the 8-bit encoding space? */
120 int register_left, register_right;
122 /* ISO_ESC holds a value indicating part of an escape sequence
123 that has already been seen. */
124 enum iso_esc_flag esc;
126 /* This records the bytes we've seen so far in an escape sequence,
127 in case the sequence is invalid (we spit out the bytes unchanged). */
128 unsigned char esc_bytes[8];
130 /* Index for next byte to store in ISO escape sequence. */
133 #ifdef ENABLE_COMPOSITE_CHARS
134 /* Stuff seen so far when composing a string. */
135 unsigned_char_dynarr *composite_chars;
138 /* If we saw an invalid designation sequence for a particular
139 register, we flag it here and switch to ASCII. The next time we
140 see a valid designation for this register, we turn off the flag
141 and do the designation normally, but pretend the sequence was
142 invalid. The effect of all this is that (most of the time) the
143 escape sequences for both the switch to the unknown charset, and
144 the switch back to the known charset, get inserted literally into
145 the buffer and saved out as such. The hope is that we can
146 preserve the escape sequences so that the resulting written out
147 file makes sense. If we don't do any of this, the designation
148 to the invalid charset will be preserved but that switch back
149 to the known charset will probably get eaten because it was
150 the same charset that was already present in the register. */
151 unsigned char invalid_designated[4];
153 /* We try to do similar things as above for direction-switching
154 sequences. If we encountered a direction switch while an
155 invalid designation was present, or an invalid designation
156 just after a direction switch (i.e. no valid designation
157 encountered yet), we insert the direction-switch escape
158 sequence literally into the output stream, and later on
159 insert the corresponding direction-restoring escape sequence
161 unsigned int switched_dir_and_no_valid_charset_yet:1;
162 unsigned int invalid_switch_dir:1;
164 /* Tells the decoder to output the escape sequence literally
165 even though it was valid. Used in the games we play to
166 avoid lossage when we encounter invalid designations. */
167 unsigned int output_literally:1;
168 /* We encountered a direction switch followed by an invalid
169 designation. We didn't output the direction switch
170 literally because we didn't know about the invalid designation;
171 but we have to do so now. */
172 unsigned int output_direction_sequence:1;
175 EXFUN(Fcopy_coding_system, 2);
177 struct detection_state;
178 static int detect_coding_sjis(struct detection_state *st,
179 const Extbyte * src, Lstream_data_count n);
180 static void decode_coding_sjis(lstream_t decoding, const Extbyte * src,
181 unsigned_char_dynarr * dst,
182 Lstream_data_count n);
183 static void encode_coding_sjis(lstream_t encoding, const Bufbyte * src,
184 unsigned_char_dynarr * dst,
185 Lstream_data_count n);
186 static int detect_coding_big5(struct detection_state *st, const Extbyte * src,
187 Lstream_data_count n);
188 static void decode_coding_big5(lstream_t decoding, const Extbyte * src,
189 unsigned_char_dynarr * dst,
190 Lstream_data_count n);
191 static void encode_coding_big5(lstream_t encoding, const Bufbyte * src,
192 unsigned_char_dynarr * dst,
193 Lstream_data_count n);
194 static int detect_coding_ucs4(struct detection_state *st, const Extbyte * src,
195 Lstream_data_count n);
196 static void decode_coding_ucs4(lstream_t decoding, const Extbyte * src,
197 unsigned_char_dynarr * dst,
198 Lstream_data_count n);
199 static void encode_coding_ucs4(lstream_t encoding, const Bufbyte * src,
200 unsigned_char_dynarr * dst,
201 Lstream_data_count n);
202 static int detect_coding_utf8(struct detection_state *st, const Extbyte * src,
203 Lstream_data_count n);
204 static void decode_coding_utf8(lstream_t decoding, const Extbyte * src,
205 unsigned_char_dynarr * dst,
206 Lstream_data_count n);
207 static void encode_coding_utf8(lstream_t encoding, const Bufbyte * src,
208 unsigned_char_dynarr * dst,
209 Lstream_data_count n);
210 static int postprocess_iso2022_mask(int mask);
211 static void reset_iso2022(Lisp_Object coding_system,
212 struct iso2022_decoder *iso);
213 static int detect_coding_iso2022(struct detection_state *st,
214 const Extbyte * src, Lstream_data_count n);
215 static void decode_coding_iso2022(lstream_t decoding, const Extbyte * src,
216 unsigned_char_dynarr * dst,
217 Lstream_data_count n);
218 static void encode_coding_iso2022(lstream_t encoding, const Bufbyte * src,
219 unsigned_char_dynarr * dst,
220 Lstream_data_count n);
222 static void decode_coding_no_conversion(lstream_t decoding, const Extbyte * src,
223 unsigned_char_dynarr * dst,
224 Lstream_data_count n);
225 static void encode_coding_no_conversion(lstream_t encoding, const Bufbyte * src,
226 unsigned_char_dynarr * dst,
227 Lstream_data_count n);
228 static void mule_decode(lstream_t decoding, const Extbyte * src,
229 unsigned_char_dynarr * dst, Lstream_data_count n);
230 static void mule_encode(lstream_t encoding, const Bufbyte * src,
231 unsigned_char_dynarr * dst, Lstream_data_count n);
233 typedef struct codesys_prop codesys_prop;
234 struct codesys_prop {
240 Dynarr_declare(codesys_prop);
241 } codesys_prop_dynarr;
243 static const struct lrecord_description codesys_prop_description_1[] = {
244 {XD_LISP_OBJECT, offsetof(codesys_prop, sym)},
248 static const struct struct_description codesys_prop_description = {
249 sizeof(codesys_prop),
250 codesys_prop_description_1
253 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
254 XD_DYNARR_DESC(codesys_prop_dynarr, &codesys_prop_description),
258 static const struct struct_description codesys_prop_dynarr_description = {
259 sizeof(codesys_prop_dynarr),
260 codesys_prop_dynarr_description_1
263 codesys_prop_dynarr *the_codesys_prop_dynarr;
265 enum codesys_prop_enum {
267 CODESYS_PROP_ISO2022,
271 /************************************************************************/
272 /* Coding system functions */
273 /************************************************************************/
275 static Lisp_Object mark_coding_system(Lisp_Object);
276 static void print_coding_system(Lisp_Object, Lisp_Object, int);
277 static void finalize_coding_system(void *header, int for_disksave);
280 static const struct lrecord_description ccs_description_1[] = {
281 {XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset)},
282 {XD_LISP_OBJECT, offsetof(charset_conversion_spec, to_charset)},
286 static const struct struct_description ccs_description = {
287 sizeof(charset_conversion_spec),
291 static const struct lrecord_description ccsd_description_1[] = {
292 XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
296 static const struct struct_description ccsd_description = {
297 sizeof(charset_conversion_spec_dynarr),
302 static const struct lrecord_description coding_system_description[] = {
303 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, name)},
304 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, doc_string)},
305 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, mnemonic)},
306 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, post_read_conversion)},
307 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, pre_write_conversion)},
308 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_lf)},
309 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_crlf)},
310 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_cr)},
312 {XD_LISP_OBJECT_ARRAY,
313 offsetof(Lisp_Coding_System, iso2022.initial_charset), 4},
314 {XD_STRUCT_PTR, offsetof(Lisp_Coding_System, iso2022.input_conv), 1,
316 {XD_STRUCT_PTR, offsetof(Lisp_Coding_System, iso2022.output_conv), 1,
318 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, ccl.decode)},
319 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, ccl.encode)},
324 DEFINE_LRECORD_IMPLEMENTATION("coding-system", coding_system,
325 mark_coding_system, print_coding_system,
326 finalize_coding_system,
327 0, 0, coding_system_description,
330 static Lisp_Object mark_coding_system(Lisp_Object obj)
332 Lisp_Coding_System *codesys = XCODING_SYSTEM(obj);
334 mark_object(CODING_SYSTEM_NAME(codesys));
335 mark_object(CODING_SYSTEM_DOC_STRING(codesys));
336 mark_object(CODING_SYSTEM_MNEMONIC(codesys));
337 mark_object(CODING_SYSTEM_EOL_LF(codesys));
338 mark_object(CODING_SYSTEM_EOL_CRLF(codesys));
339 mark_object(CODING_SYSTEM_EOL_CR(codesys));
341 switch (CODING_SYSTEM_TYPE(codesys)) {
344 case CODESYS_ISO2022:
345 for (i = 0; i < 4; i++)
346 mark_object(CODING_SYSTEM_ISO2022_INITIAL_CHARSET
348 if (codesys->iso2022.input_conv) {
350 i < Dynarr_length(codesys->iso2022.input_conv);
352 struct charset_conversion_spec *ccs =
353 Dynarr_atp(codesys->iso2022.input_conv, i);
354 mark_object(ccs->from_charset);
355 mark_object(ccs->to_charset);
358 if (codesys->iso2022.output_conv) {
360 i < Dynarr_length(codesys->iso2022.output_conv);
362 struct charset_conversion_spec *ccs =
363 Dynarr_atp(codesys->iso2022.output_conv, i);
364 mark_object(ccs->from_charset);
365 mark_object(ccs->to_charset);
371 mark_object(CODING_SYSTEM_CCL_DECODE(codesys));
372 mark_object(CODING_SYSTEM_CCL_ENCODE(codesys));
375 /* list the rest of them lot explicitly */
376 case CODESYS_AUTODETECT:
377 case CODESYS_SHIFT_JIS:
381 case CODESYS_NO_CONVERSION:
383 case CODESYS_INTERNAL:
390 mark_object(CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys));
391 return CODING_SYSTEM_POST_READ_CONVERSION(codesys);
395 print_coding_system(Lisp_Object obj, Lisp_Object printcharfun, int escapeflag)
397 Lisp_Coding_System *c = XCODING_SYSTEM(obj);
399 error("printing unreadable object #<coding-system 0x%x>",
402 write_c_string("#<coding-system ", printcharfun);
403 print_internal(c->name, printcharfun, 1);
404 write_c_string(">", printcharfun);
407 static void finalize_coding_system(void *header, int for_disksave)
409 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
410 /* Since coding systems never go away, this function is not
411 necessary. But it would be necessary if we changed things
412 so that coding systems could go away. */
413 if (!for_disksave) { /* see comment in lstream.c */
414 switch (CODING_SYSTEM_TYPE(c)) {
416 case CODESYS_ISO2022:
417 if (c->iso2022.input_conv) {
418 Dynarr_free(c->iso2022.input_conv);
419 c->iso2022.input_conv = 0;
421 if (c->iso2022.output_conv) {
422 Dynarr_free(c->iso2022.output_conv);
423 c->iso2022.output_conv = 0;
427 /* list the rest of them lot explicitly */
428 case CODESYS_AUTODETECT:
429 case CODESYS_SHIFT_JIS:
434 case CODESYS_NO_CONVERSION:
436 case CODESYS_INTERNAL:
445 static eol_type_t symbol_to_eol_type(Lisp_Object symbol)
447 CHECK_SYMBOL(symbol);
449 return EOL_AUTODETECT;
452 if (EQ(symbol, Qcrlf))
457 signal_simple_error("Unrecognized eol type", symbol);
458 return EOL_AUTODETECT; /* not reached */
461 static Lisp_Object eol_type_to_symbol(eol_type_t type)
478 static void setup_eol_coding_systems(Lisp_Coding_System * codesys)
480 Lisp_Object codesys_obj;
481 int len = string_length(XSYMBOL(CODING_SYSTEM_NAME(codesys))->name);
482 char *codesys_name = (char *)alloca(len + 7);
484 char *codesys_mnemonic = 0;
486 Lisp_Object codesys_name_sym, sub_codesys_obj;
490 XSETCODING_SYSTEM(codesys_obj, codesys);
493 string_data(XSYMBOL(CODING_SYSTEM_NAME(codesys))->name), len);
495 if (STRINGP(CODING_SYSTEM_MNEMONIC(codesys))) {
496 mlen = XSTRING_LENGTH(CODING_SYSTEM_MNEMONIC(codesys));
497 codesys_mnemonic = (char *)alloca(mlen + 7);
498 memcpy(codesys_mnemonic,
499 XSTRING_DATA(CODING_SYSTEM_MNEMONIC(codesys)), mlen);
501 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
502 strcpy (codesys_name + len, "-" op_sys); \
504 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
505 codesys_name_sym = intern (codesys_name); \
506 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
507 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
509 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
510 build_string (codesys_mnemonic); \
511 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
514 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
515 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
516 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
519 DEFUN("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
520 Return t if OBJECT is a coding system.
521 A coding system is an object that defines how text containing multiple
522 character sets is encoded into a stream of (typically 8-bit) bytes.
523 The coding system is used to decode the stream into a series of
524 characters (which may be from multiple charsets) when the text is read
525 from a file or process, and is used to encode the text back into the
526 same format when it is written out to a file or process.
528 For example, many ISO2022-compliant coding systems (such as Compound
529 Text, which is used for inter-client data under the X Window System)
530 use escape sequences to switch between different charsets -- Japanese
531 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
532 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
533 `make-coding-system' for more information.
535 Coding systems are normally identified using a symbol, and the
536 symbol is accepted in place of the actual coding system object whenever
537 a coding system is called for. (This is similar to how faces work.)
541 return CODING_SYSTEMP(object) ? Qt : Qnil;
544 DEFUN("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
545 Retrieve the coding system of the given name.
547 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
548 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
549 If there is no such coding system, nil is returned. Otherwise the
550 associated coding system object is returned.
552 (coding_system_or_name))
554 if (NILP(coding_system_or_name))
555 coding_system_or_name = Qbinary;
556 else if (CODING_SYSTEMP(coding_system_or_name))
557 return coding_system_or_name;
559 CHECK_SYMBOL(coding_system_or_name);
562 coding_system_or_name =
563 Fgethash(coding_system_or_name, Vcoding_system_hash_table,
566 if (CODING_SYSTEMP(coding_system_or_name)
567 || NILP(coding_system_or_name))
568 return coding_system_or_name;
572 DEFUN("get-coding-system", Fget_coding_system, 1, 1, 0, /*
573 Retrieve the coding system of the given name.
574 Same as `find-coding-system' except that if there is no such
575 coding system, an error is signaled instead of returning nil.
579 Lisp_Object coding_system = Ffind_coding_system(name);
581 if (NILP(coding_system))
582 signal_simple_error("No such coding system", name);
583 return coding_system;
586 /* We store the coding systems in hash tables with the names as the key and the
587 actual coding system object as the value. Occasionally we need to use them
588 in a list format. These routines provide us with that. */
589 struct coding_system_list_closure {
590 Lisp_Object *coding_system_list;
594 add_coding_system_to_list_mapper(Lisp_Object key, Lisp_Object value,
595 void *coding_system_list_closure)
597 /* This function can GC */
598 struct coding_system_list_closure *cscl =
599 (struct coding_system_list_closure *)coding_system_list_closure;
600 Lisp_Object *coding_system_list = cscl->coding_system_list;
602 *coding_system_list = Fcons(key, *coding_system_list);
606 DEFUN("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
607 Return a list of the names of all defined coding systems.
611 Lisp_Object coding_system_list = Qnil;
613 struct coding_system_list_closure coding_system_list_closure;
615 GCPRO1(coding_system_list);
616 coding_system_list_closure.coding_system_list = &coding_system_list;
617 elisp_maphash(add_coding_system_to_list_mapper,
618 Vcoding_system_hash_table, &coding_system_list_closure);
621 return coding_system_list;
624 DEFUN("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
625 Return the name of the given coding system.
629 coding_system = Fget_coding_system(coding_system);
630 return XCODING_SYSTEM_NAME(coding_system);
633 static Lisp_Coding_System *allocate_coding_system(enum coding_system_type type,
636 Lisp_Coding_System *codesys =
637 alloc_lcrecord_type(Lisp_Coding_System, &lrecord_coding_system);
639 zero_lcrecord(codesys);
640 CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys) = Qnil;
641 CODING_SYSTEM_POST_READ_CONVERSION(codesys) = Qnil;
642 CODING_SYSTEM_EOL_TYPE(codesys) = EOL_AUTODETECT;
643 CODING_SYSTEM_EOL_CRLF(codesys) = Qnil;
644 CODING_SYSTEM_EOL_CR(codesys) = Qnil;
645 CODING_SYSTEM_EOL_LF(codesys) = Qnil;
646 CODING_SYSTEM_TYPE(codesys) = type;
647 CODING_SYSTEM_MNEMONIC(codesys) = Qnil;
649 if (type == CODESYS_ISO2022) {
651 for (i = 0; i < 4; i++)
652 CODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, i) =
654 } else if (type == CODESYS_CCL) {
655 CODING_SYSTEM_CCL_DECODE(codesys) = Qnil;
656 CODING_SYSTEM_CCL_ENCODE(codesys) = Qnil;
659 CODING_SYSTEM_NAME(codesys) = name;
665 /* Given a list of charset conversion specs as specified in a Lisp
666 program, parse it into STORE_HERE. */
669 parse_charset_conversion_specs(charset_conversion_spec_dynarr * store_here,
670 Lisp_Object spec_list)
674 EXTERNAL_LIST_LOOP(rest, spec_list) {
675 Lisp_Object car = XCAR(rest);
676 Lisp_Object from, to;
677 struct charset_conversion_spec spec;
679 if (!CONSP(car) || !CONSP(XCDR(car)) || !NILP(XCDR(XCDR(car))))
680 signal_simple_error("Invalid charset conversion spec",
682 from = Fget_charset(XCAR(car));
683 to = Fget_charset(XCAR(XCDR(car)));
684 if (XCHARSET_TYPE(from) != XCHARSET_TYPE(to))
685 signal_simple_error_2
686 ("Attempted conversion between different charset types",
688 spec.from_charset = from;
689 spec.to_charset = to;
691 Dynarr_add(store_here, spec);
695 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
696 specs, return the equivalent as the Lisp programmer would see it.
698 If LOAD_HERE is 0, return Qnil. */
701 unparse_charset_conversion_specs(charset_conversion_spec_dynarr * load_here)
708 for (i = 0, result = Qnil; i < Dynarr_length(load_here); i++) {
709 struct charset_conversion_spec *ccs = Dynarr_atp(load_here, i);
711 Fcons(list2(ccs->from_charset, ccs->to_charset), result);
714 return Fnreverse(result);
719 DEFUN("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
720 Register symbol NAME as a coding system.
722 TYPE describes the conversion method used and should be one of
725 Automatic conversion. SXEmacs attempts to detect the coding system
728 No conversion. Use this for binary files and such. On output,
729 graphic characters that are not in ASCII or Latin-1 will be
730 replaced by a ?. (For a no-conversion-encoded buffer, these
731 characters will only be present if you explicitly insert them.)
733 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
735 ISO 10646 UCS-4 encoding.
737 ISO 10646 UTF-8 encoding.
739 Any ISO2022-compliant encoding. Among other things, this includes
740 JIS (the Japanese encoding commonly used for e-mail), EUC (the
741 standard Unix encoding for Japanese and other languages), and
742 Compound Text (the encoding used in X11). You can specify more
743 specific information about the conversion with the PROPS argument.
745 Big5 (the encoding commonly used for Taiwanese).
747 The conversion is performed using a user-written pseudo-code
748 program. CCL (Code Conversion Language) is the name of this
751 Write out or read in the raw contents of the memory representing
752 the buffer's text. This is primarily useful for debugging
753 purposes, and is only enabled when SXEmacs has been compiled with
754 DEBUG_SXEMACS defined (via the --debug configure option).
755 WARNING: Reading in a file using 'internal conversion can result
756 in an internal inconsistency in the memory representing a
757 buffer's text, which will produce unpredictable results and may
758 cause SXEmacs to crash. Under normal circumstances you should
759 never use 'internal conversion.
761 DOC-STRING is a string describing the coding system.
763 PROPS is a property list, describing the specific nature of the
764 character set. Recognized properties are:
767 String to be displayed in the modeline when this coding system is
771 End-of-line conversion to be used. It should be one of
774 Automatically detect the end-of-line type (LF, CRLF,
775 or CR). Also generate subsidiary coding systems named
776 `NAME-unix', `NAME-dos', and `NAME-mac', that are
777 identical to this coding system but have an EOL-TYPE
778 value of 'lf, 'crlf, and 'cr, respectively.
780 The end of a line is marked externally using ASCII LF.
781 Since this is also the way that SXEmacs represents an
782 end-of-line internally, specifying this option results
783 in no end-of-line conversion. This is the standard
784 format for Unix text files.
786 The end of a line is marked externally using ASCII
787 CRLF. This is the standard format for MS-DOS text
790 The end of a line is marked externally using ASCII CR.
791 This is the standard format for Macintosh text files.
793 Automatically detect the end-of-line type but do not
794 generate subsidiary coding systems. (This value is
795 converted to nil when stored internally, and
796 `coding-system-property' will return nil.)
798 'post-read-conversion
799 Function called after a file has been read in, to perform the
800 decoding. Called with two arguments, START and END, denoting
801 a region of the current buffer to be decoded.
803 'pre-write-conversion
804 Function called before a file is written out, to perform the
805 encoding. Called with two arguments, START and END, denoting
806 a region of the current buffer to be encoded.
808 The following additional properties are recognized if TYPE is 'iso2022:
814 The character set initially designated to the G0 - G3 registers.
815 The value should be one of
817 -- A charset object (designate that character set)
818 -- nil (do not ever use this register)
819 -- t (no character set is initially designated to
820 the register, but may be later on; this automatically
821 sets the corresponding `force-g*-on-output' property)
827 If non-nil, send an explicit designation sequence on output before
828 using the specified register.
831 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
832 "ESC $ B" on output in place of the full designation sequences
833 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
836 If non-nil, don't designate ASCII to G0 at each end of line on output.
837 Setting this to non-nil also suppresses other state-resetting that
838 normally happens at the end of a line.
841 If non-nil, don't designate ASCII to G0 before control chars on output.
844 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
848 If non-nil, use locking-shift (SO/SI) instead of single-shift
849 or designation by escape sequence.
852 If non-nil, don't use ISO6429's direction specification.
855 If non-nil, literal control characters that are the same as
856 the beginning of a recognized ISO2022 or ISO6429 escape sequence
857 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
858 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
859 so that they can be properly distinguished from an escape sequence.
860 (Note that doing this results in a non-portable encoding.) This
861 encoding flag is used for byte-compiled files. Note that ESC
862 is a good choice for a quoting character because there are no
863 escape sequences whose second byte is a character from the Control-0
864 or Control-1 character sets; this is explicitly disallowed by the
867 'input-charset-conversion
868 A list of conversion specifications, specifying conversion of
869 characters in one charset to another when decoding is performed.
870 Each specification is a list of two elements: the source charset,
871 and the destination charset.
873 'output-charset-conversion
874 A list of conversion specifications, specifying conversion of
875 characters in one charset to another when encoding is performed.
876 The form of each specification is the same as for
877 'input-charset-conversion.
879 The following additional properties are recognized (and required)
883 CCL program used for decoding (converting to internal format).
886 CCL program used for encoding (converting to external format).
888 (name, type, doc_string, props))
890 Lisp_Coding_System *codesys;
891 enum coding_system_type ty;
892 int need_to_setup_eol_systems = 1;
894 /* Convert type to constant */
895 if (NILP(type) || EQ(type, Qundecided)) {
896 ty = CODESYS_AUTODETECT;
899 else if (EQ(type, Qshift_jis)) {
900 ty = CODESYS_SHIFT_JIS;
901 } else if (EQ(type, Qiso2022)) {
902 ty = CODESYS_ISO2022;
903 } else if (EQ(type, Qbig5)) {
905 } else if (EQ(type, Qucs4)) {
907 } else if (EQ(type, Qutf8)) {
909 } else if (EQ(type, Qccl)) {
913 else if (EQ(type, Qno_conversion)) {
914 ty = CODESYS_NO_CONVERSION;
917 else if (EQ(type, Qinternal)) {
918 ty = CODESYS_INTERNAL;
922 signal_simple_error("Invalid coding system type", type);
926 codesys = allocate_coding_system(ty, name);
928 if (NILP(doc_string))
929 doc_string = build_string("");
931 CHECK_STRING(doc_string);
932 CODING_SYSTEM_DOC_STRING(codesys) = doc_string;
935 EXTERNAL_PROPERTY_LIST_LOOP_3(key, value, props) {
936 if (EQ(key, Qmnemonic)) {
939 CODING_SYSTEM_MNEMONIC(codesys) = value;
942 else if (EQ(key, Qeol_type)) {
943 need_to_setup_eol_systems = NILP(value);
946 CODING_SYSTEM_EOL_TYPE(codesys) =
947 symbol_to_eol_type(value);
950 else if (EQ(key, Qpost_read_conversion))
951 CODING_SYSTEM_POST_READ_CONVERSION(codesys) =
953 else if (EQ(key, Qpre_write_conversion))
954 CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys) =
957 else if (ty == CODESYS_ISO2022) {
958 #define FROB_INITIAL_CHARSET(charset_num) \
959 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
960 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
962 if (EQ(key, Qcharset_g0))
963 FROB_INITIAL_CHARSET(0);
964 else if (EQ(key, Qcharset_g1))
965 FROB_INITIAL_CHARSET(1);
966 else if (EQ(key, Qcharset_g2))
967 FROB_INITIAL_CHARSET(2);
968 else if (EQ(key, Qcharset_g3))
969 FROB_INITIAL_CHARSET(3);
971 #define FROB_FORCE_CHARSET(charset_num) \
972 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
974 else if (EQ(key, Qforce_g0_on_output))
975 FROB_FORCE_CHARSET(0);
976 else if (EQ(key, Qforce_g1_on_output))
977 FROB_FORCE_CHARSET(1);
978 else if (EQ(key, Qforce_g2_on_output))
979 FROB_FORCE_CHARSET(2);
980 else if (EQ(key, Qforce_g3_on_output))
981 FROB_FORCE_CHARSET(3);
983 #define FROB_BOOLEAN_PROPERTY(prop) \
984 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
986 else if (EQ(key, Qshort))
987 FROB_BOOLEAN_PROPERTY(SHORT);
988 else if (EQ(key, Qno_ascii_eol))
989 FROB_BOOLEAN_PROPERTY(NO_ASCII_EOL);
990 else if (EQ(key, Qno_ascii_cntl))
991 FROB_BOOLEAN_PROPERTY(NO_ASCII_CNTL);
992 else if (EQ(key, Qseven))
993 FROB_BOOLEAN_PROPERTY(SEVEN);
994 else if (EQ(key, Qlock_shift))
995 FROB_BOOLEAN_PROPERTY(LOCK_SHIFT);
996 else if (EQ(key, Qno_iso6429))
997 FROB_BOOLEAN_PROPERTY(NO_ISO6429);
998 else if (EQ(key, Qescape_quoted))
999 FROB_BOOLEAN_PROPERTY(ESCAPE_QUOTED);
1001 else if (EQ(key, Qinput_charset_conversion)) {
1002 codesys->iso2022.input_conv =
1003 Dynarr_new(charset_conversion_spec);
1004 parse_charset_conversion_specs(codesys->
1008 } else if (EQ(key, Qoutput_charset_conversion)) {
1009 codesys->iso2022.output_conv =
1010 Dynarr_new(charset_conversion_spec);
1011 parse_charset_conversion_specs(codesys->
1017 ("Unrecognized property", key);
1018 } else if (EQ(type, Qccl)) {
1020 struct ccl_program test_ccl;
1023 /* Check key first. */
1024 if (EQ(key, Qdecode))
1025 suffix = "-ccl-decode";
1026 else if (EQ(key, Qencode))
1027 suffix = "-ccl-encode";
1030 ("Unrecognized property", key);
1032 /* If value is vector, register it as a ccl program
1033 associated with an newly created symbol for
1034 backward compatibility. */
1035 if (VECTORP(value)) {
1038 (Fsymbol_name(name),
1039 build_string(suffix)),
1041 Fregister_ccl_program(sym, value);
1043 CHECK_SYMBOL(value);
1046 /* check if the given ccl programs are valid. */
1047 if (setup_ccl_program(&test_ccl, sym) < 0)
1049 ("Invalid CCL program", value);
1051 if (EQ(key, Qdecode))
1052 CODING_SYSTEM_CCL_DECODE(codesys) = sym;
1053 else if (EQ(key, Qencode))
1054 CODING_SYSTEM_CCL_ENCODE(codesys) = sym;
1059 signal_simple_error("Unrecognized property",
1064 if (need_to_setup_eol_systems)
1065 setup_eol_coding_systems(codesys);
1068 Lisp_Object codesys_obj;
1069 XSETCODING_SYSTEM(codesys_obj, codesys);
1070 Fputhash(name, codesys_obj, Vcoding_system_hash_table);
1075 DEFUN("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1076 Copy OLD-CODING-SYSTEM to NEW-NAME.
1077 If NEW-NAME does not name an existing coding system, a new one will
1080 (old_coding_system, new_name))
1082 Lisp_Object new_coding_system;
1083 old_coding_system = Fget_coding_system(old_coding_system);
1084 new_coding_system = Ffind_coding_system(new_name);
1085 if (NILP(new_coding_system)) {
1086 XSETCODING_SYSTEM(new_coding_system,
1087 allocate_coding_system
1088 (XCODING_SYSTEM_TYPE(old_coding_system),
1090 Fputhash(new_name, new_coding_system,
1091 Vcoding_system_hash_table);
1095 Lisp_Coding_System *to = XCODING_SYSTEM(new_coding_system);
1096 Lisp_Coding_System *from = XCODING_SYSTEM(old_coding_system);
1097 memcpy(((char *)to) + sizeof(to->header),
1098 ((char *)from) + sizeof(from->header),
1099 sizeof(*from) - sizeof(from->header));
1100 to->name = new_name;
1102 return new_coding_system;
1105 DEFUN("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1106 Return t if OBJECT names a coding system, and is not a coding system alias.
1110 Lisp_Object val = Fgethash(object, Vcoding_system_hash_table, Qnil);
1111 return CODING_SYSTEMP(val) ? Qt : Qnil;
1114 DEFUN("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1115 Return t if OBJECT is a coding system alias.
1116 All coding system aliases are created by `define-coding-system-alias'.
1120 Lisp_Object val = Fgethash(object, Vcoding_system_hash_table, Qzero);
1121 return SYMBOLP(val) ? Qt : Qnil;
1124 DEFUN("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1125 Return the coding-system symbol for which symbol ALIAS is an alias.
1129 Lisp_Object aliasee = Fgethash(alias, Vcoding_system_hash_table, Qnil);
1130 if (SYMBOLP(aliasee)) {
1133 signal_simple_error(
1134 "Symbol is not a coding system alias", alias);
1136 /* To keep the compiler happy */
1141 append_suffix_to_symbol(Lisp_Object symbol, char *ascii_string)
1143 return Fintern(concat2(Fsymbol_name(symbol),
1144 build_string(ascii_string)), Qnil);
1147 /* A maphash function, for removing dangling coding system aliases. */
1149 dangling_coding_system_alias_p(Lisp_Object alias,
1150 Lisp_Object aliasee, void *dangling_aliases)
1152 if (SYMBOLP(aliasee)
1153 && NILP(Fgethash(aliasee, Vcoding_system_hash_table, Qnil))) {
1154 (*(int *)dangling_aliases)++;
1161 DEFUN("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1162 Define symbol ALIAS as an alias for coding system ALIASEE.
1164 You can use this function to redefine an alias that has already been defined,
1165 but you cannot redefine a name which is the canonical name for a coding system.
1166 \(a canonical name of a coding system is what is returned when you call
1167 `coding-system-name' on a coding system).
1169 ALIASEE itself can be an alias, which allows you to define nested aliases.
1171 You are forbidden, however, from creating alias loops or `dangling' aliases.
1172 These will be detected, and an error will be signaled if you attempt to do so.
1174 If ALIASEE is nil, then ALIAS will simply be undefined.
1176 See also `coding-system-alias-p', `coding-system-aliasee',
1177 and `coding-system-canonical-name-p'.
1181 Lisp_Object real_coding_system, probe;
1183 CHECK_SYMBOL(alias);
1185 if (!NILP(Fcoding_system_canonical_name_p(alias)))
1187 ("Symbol is the canonical name of a coding system and cannot be redefined",
1190 if (NILP(aliasee)) {
1191 Lisp_Object subsidiary_unix =
1192 append_suffix_to_symbol(alias, "-unix");
1193 Lisp_Object subsidiary_dos =
1194 append_suffix_to_symbol(alias, "-dos");
1195 Lisp_Object subsidiary_mac =
1196 append_suffix_to_symbol(alias, "-mac");
1198 Fremhash(alias, Vcoding_system_hash_table);
1200 /* Undefine subsidiary aliases,
1201 presumably created by a previous call to this function */
1202 if (!NILP(Fcoding_system_alias_p(subsidiary_unix)) &&
1203 !NILP(Fcoding_system_alias_p(subsidiary_dos)) &&
1204 !NILP(Fcoding_system_alias_p(subsidiary_mac))) {
1205 Fdefine_coding_system_alias(subsidiary_unix, Qnil);
1206 Fdefine_coding_system_alias(subsidiary_dos, Qnil);
1207 Fdefine_coding_system_alias(subsidiary_mac, Qnil);
1210 /* Undefine dangling coding system aliases. */
1212 int dangling_aliases;
1215 dangling_aliases = 0;
1217 (dangling_coding_system_alias_p,
1218 Vcoding_system_hash_table,
1220 } while (dangling_aliases > 0);
1226 if (CODING_SYSTEMP(aliasee))
1227 aliasee = XCODING_SYSTEM_NAME(aliasee);
1229 /* Checks that aliasee names a coding-system */
1230 real_coding_system = Fget_coding_system(aliasee);
1232 /* Check for coding system alias loops */
1233 if (EQ(alias, aliasee))
1234 alias_loop:signal_simple_error_2
1235 ("Attempt to create a coding system alias loop", alias,
1238 for (probe = aliasee;
1240 probe = Fgethash(probe, Vcoding_system_hash_table, Qzero)) {
1241 if (EQ(probe, alias))
1245 Fputhash(alias, aliasee, Vcoding_system_hash_table);
1247 /* Set up aliases for subsidiaries.
1248 #### There must be a better way to handle subsidiary coding
1251 static char *suffixes[] = { "-unix", "-dos", "-mac" };
1253 for (int i = 0; i < countof(suffixes); i++) {
1254 Lisp_Object alias_subsidiary =
1255 append_suffix_to_symbol(alias, suffixes[i]);
1256 Lisp_Object aliasee_subsidiary =
1257 append_suffix_to_symbol(aliasee, suffixes[i]);
1259 if (!NILP(Ffind_coding_system(aliasee_subsidiary))) {
1260 Fdefine_coding_system_alias(alias_subsidiary,
1261 aliasee_subsidiary);
1265 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1266 but it doesn't look intentional, so I'd rather return something
1267 meaningful or nothing at all. */
1272 subsidiary_coding_system(Lisp_Object coding_system, eol_type_t type)
1274 Lisp_Coding_System *cs = XCODING_SYSTEM(coding_system);
1275 Lisp_Object new_coding_system;
1277 if (CODING_SYSTEM_EOL_TYPE(cs) != EOL_AUTODETECT)
1278 return coding_system;
1281 case EOL_AUTODETECT:
1282 return coding_system;
1284 new_coding_system = CODING_SYSTEM_EOL_LF(cs);
1287 new_coding_system = CODING_SYSTEM_EOL_CR(cs);
1290 new_coding_system = CODING_SYSTEM_EOL_CRLF(cs);
1297 return NILP(new_coding_system) ? coding_system : new_coding_system;
1300 DEFUN("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1301 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1303 (coding_system, eol_type))
1305 coding_system = Fget_coding_system(coding_system);
1307 return subsidiary_coding_system(coding_system,
1308 symbol_to_eol_type(eol_type));
1311 /************************************************************************/
1312 /* Coding system accessors */
1313 /************************************************************************/
1315 DEFUN("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1316 Return the doc string for CODING-SYSTEM.
1320 coding_system = Fget_coding_system(coding_system);
1321 return XCODING_SYSTEM_DOC_STRING(coding_system);
1324 DEFUN("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1325 Return the type of CODING-SYSTEM.
1329 Lisp_Object tmp = Fget_coding_system(coding_system);
1331 switch (XCODING_SYSTEM_TYPE(tmp)) {
1335 case CODESYS_AUTODETECT:
1338 case CODESYS_SHIFT_JIS:
1340 case CODESYS_ISO2022:
1351 case CODESYS_NO_CONVERSION:
1352 return Qno_conversion;
1353 #ifdef DEBUG_SXEMACS
1354 case CODESYS_INTERNAL:
1362 Lisp_Object coding_system_charset(Lisp_Object coding_system, int gnum)
1365 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET(coding_system, gnum);
1367 return CHARSETP(cs) ? XCHARSET_NAME(cs) : Qnil;
1370 DEFUN("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1371 Return initial charset of CODING-SYSTEM designated to GNUM.
1374 (coding_system, gnum))
1376 coding_system = Fget_coding_system(coding_system);
1379 return coding_system_charset(coding_system, XINT(gnum));
1383 DEFUN("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1384 Return the PROP property of CODING-SYSTEM.
1386 (coding_system, prop))
1389 enum coding_system_type type;
1391 coding_system = Fget_coding_system(coding_system);
1393 type = XCODING_SYSTEM_TYPE(coding_system);
1395 for (i = 0; !ok && i < Dynarr_length(the_codesys_prop_dynarr); i++)
1396 if (EQ(Dynarr_at(the_codesys_prop_dynarr, i).sym, prop)) {
1398 switch (Dynarr_at(the_codesys_prop_dynarr, i).prop_type) {
1399 case CODESYS_PROP_ALL_OK:
1402 case CODESYS_PROP_ISO2022:
1403 if (type != CODESYS_ISO2022)
1405 ("Property only valid in ISO2022 coding systems",
1409 case CODESYS_PROP_CCL:
1410 if (type != CODESYS_CCL)
1412 ("Property only valid in CCL coding systems",
1422 signal_simple_error("Unrecognized property", prop);
1424 if (EQ(prop, Qname))
1425 return XCODING_SYSTEM_NAME(coding_system);
1426 else if (EQ(prop, Qtype))
1427 return Fcoding_system_type(coding_system);
1428 else if (EQ(prop, Qdoc_string))
1429 return XCODING_SYSTEM_DOC_STRING(coding_system);
1430 else if (EQ(prop, Qmnemonic))
1431 return XCODING_SYSTEM_MNEMONIC(coding_system);
1432 else if (EQ(prop, Qeol_type))
1434 eol_type_to_symbol(XCODING_SYSTEM_EOL_TYPE(coding_system));
1435 else if (EQ(prop, Qeol_lf))
1436 return XCODING_SYSTEM_EOL_LF(coding_system);
1437 else if (EQ(prop, Qeol_crlf))
1438 return XCODING_SYSTEM_EOL_CRLF(coding_system);
1439 else if (EQ(prop, Qeol_cr))
1440 return XCODING_SYSTEM_EOL_CR(coding_system);
1441 else if (EQ(prop, Qpost_read_conversion))
1442 return XCODING_SYSTEM_POST_READ_CONVERSION(coding_system);
1443 else if (EQ(prop, Qpre_write_conversion))
1444 return XCODING_SYSTEM_PRE_WRITE_CONVERSION(coding_system);
1446 else if (type == CODESYS_ISO2022) {
1447 if (EQ(prop, Qcharset_g0))
1448 return coding_system_charset(coding_system, 0);
1449 else if (EQ(prop, Qcharset_g1))
1450 return coding_system_charset(coding_system, 1);
1451 else if (EQ(prop, Qcharset_g2))
1452 return coding_system_charset(coding_system, 2);
1453 else if (EQ(prop, Qcharset_g3))
1454 return coding_system_charset(coding_system, 3);
1456 #define FORCE_CHARSET(charset_num) \
1457 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1458 (coding_system, charset_num) ? Qt : Qnil)
1460 else if (EQ(prop, Qforce_g0_on_output))
1461 return FORCE_CHARSET(0);
1462 else if (EQ(prop, Qforce_g1_on_output))
1463 return FORCE_CHARSET(1);
1464 else if (EQ(prop, Qforce_g2_on_output))
1465 return FORCE_CHARSET(2);
1466 else if (EQ(prop, Qforce_g3_on_output))
1467 return FORCE_CHARSET(3);
1469 #define LISP_BOOLEAN(prop) \
1470 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1472 else if (EQ(prop, Qshort))
1473 return LISP_BOOLEAN(SHORT);
1474 else if (EQ(prop, Qno_ascii_eol))
1475 return LISP_BOOLEAN(NO_ASCII_EOL);
1476 else if (EQ(prop, Qno_ascii_cntl))
1477 return LISP_BOOLEAN(NO_ASCII_CNTL);
1478 else if (EQ(prop, Qseven))
1479 return LISP_BOOLEAN(SEVEN);
1480 else if (EQ(prop, Qlock_shift))
1481 return LISP_BOOLEAN(LOCK_SHIFT);
1482 else if (EQ(prop, Qno_iso6429))
1483 return LISP_BOOLEAN(NO_ISO6429);
1484 else if (EQ(prop, Qescape_quoted))
1485 return LISP_BOOLEAN(ESCAPE_QUOTED);
1487 else if (EQ(prop, Qinput_charset_conversion))
1489 unparse_charset_conversion_specs
1490 (XCODING_SYSTEM(coding_system)->iso2022.input_conv);
1491 else if (EQ(prop, Qoutput_charset_conversion))
1493 unparse_charset_conversion_specs
1494 (XCODING_SYSTEM(coding_system)->iso2022.
1498 } else if (type == CODESYS_CCL) {
1499 if (EQ(prop, Qdecode))
1500 return XCODING_SYSTEM_CCL_DECODE(coding_system);
1501 else if (EQ(prop, Qencode))
1502 return XCODING_SYSTEM_CCL_ENCODE(coding_system);
1510 return Qnil; /* not reached */
1513 /************************************************************************/
1514 /* Coding category functions */
1515 /************************************************************************/
1517 static int decode_coding_category(Lisp_Object symbol)
1521 CHECK_SYMBOL(symbol);
1522 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1523 if (EQ(coding_category_symbol[i], symbol))
1526 signal_simple_error("Unrecognized coding category", symbol);
1527 return 0; /* not reached */
1530 DEFUN("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1531 Return a list of all recognized coding categories.
1536 Lisp_Object list = Qnil;
1538 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1539 list = Fcons(coding_category_symbol[i], list);
1543 DEFUN("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1544 Change the priority order of the coding categories.
1545 LIST should be list of coding categories, in descending order of
1546 priority. Unspecified coding categories will be lower in priority
1547 than all specified ones, in the same relative order they were in
1552 int category_to_priority[CODING_CATEGORY_LAST];
1556 /* First generate a list that maps coding categories to priorities. */
1558 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1559 category_to_priority[i] = -1;
1561 /* Highest priority comes from the specified list. */
1563 EXTERNAL_LIST_LOOP(rest, list) {
1564 int cat = decode_coding_category(XCAR(rest));
1566 if (category_to_priority[cat] >= 0)
1567 signal_simple_error("Duplicate coding category in list",
1569 category_to_priority[cat] = i++;
1572 /* Now go through the existing categories by priority to retrieve
1573 the categories not yet specified and preserve their priority
1575 for (j = 0; j < CODING_CATEGORY_LAST; j++) {
1576 int cat = fcd->coding_category_by_priority[j];
1577 if (category_to_priority[cat] < 0)
1578 category_to_priority[cat] = i++;
1581 /* Now we need to construct the inverse of the mapping we just
1584 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1585 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1587 /* Phew! That was confusing. */
1591 DEFUN("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1592 Return a list of coding categories in descending order of priority.
1597 Lisp_Object list = Qnil;
1599 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1601 Fcons(coding_category_symbol
1602 [fcd->coding_category_by_priority[i]], list);
1606 DEFUN("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1607 Change the coding system associated with a coding category.
1609 (coding_category, coding_system))
1611 int cat = decode_coding_category(coding_category);
1613 coding_system = Fget_coding_system(coding_system);
1614 fcd->coding_category_system[cat] = coding_system;
1618 DEFUN("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1619 Return the coding system associated with a coding category.
1623 int cat = decode_coding_category(coding_category);
1624 Lisp_Object sys = fcd->coding_category_system[cat];
1627 return XCODING_SYSTEM_NAME(sys);
1631 /************************************************************************/
1632 /* Detecting the encoding of data */
1633 /************************************************************************/
1635 struct detection_state {
1636 eol_type_t eol_type;
1663 struct iso2022_decoder iso;
1665 int high_byte_count;
1666 unsigned int saw_single_shift:1;
1675 static int acceptable_control_char_p(int c)
1678 /* Allow and ignore control characters that you might
1679 reasonably see in a text file */
1684 case 8: /* backspace */
1685 case 11: /* vertical tab */
1686 case 12: /* form feed */
1687 case 26: /* MS-DOS C-z junk */
1688 case 31: /* '^_' -- for info */
1695 static int mask_has_at_most_one_bit_p(int mask)
1697 /* Perhaps the only thing useful you learn from intensive Microsoft
1698 technical interviews */
1699 return (mask & (mask - 1)) == 0;
1703 detect_eol_type(struct detection_state *st, const Extbyte * src,
1704 Lstream_data_count n)
1707 const unsigned char c = *(const unsigned char*)src++;
1709 if (st->eol.just_saw_cr)
1711 else if (st->eol.seen_anything)
1713 } else if (st->eol.just_saw_cr)
1716 st->eol.just_saw_cr = 1;
1718 st->eol.just_saw_cr = 0;
1719 st->eol.seen_anything = 1;
1722 return EOL_AUTODETECT;
1725 /* Attempt to determine the encoding and EOL type of the given text.
1726 Before calling this function for the first type, you must initialize
1727 st->eol_type as appropriate and initialize st->mask to ~0.
1729 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1732 st->mask holds the determined coding category mask, or ~0 if only
1733 ASCII has been seen so far.
1737 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1738 is present in st->mask
1739 1 == definitive answers are here for both st->eol_type and st->mask
1743 detect_coding_type(struct detection_state *st, const Extbyte * src,
1744 Lstream_data_count n, int just_do_eol)
1746 if (st->eol_type == EOL_AUTODETECT)
1747 st->eol_type = detect_eol_type(st, src, n);
1750 return st->eol_type != EOL_AUTODETECT;
1752 if (!st->seen_non_ascii) {
1753 for (; n; n--, src++) {
1754 const unsigned char c = *(const unsigned char *)src;
1755 if ((c < 0x20 && !acceptable_control_char_p(c))
1757 st->seen_non_ascii = 1;
1759 st->shift_jis.mask = ~0;
1763 st->iso2022.mask = ~0;
1774 if (!mask_has_at_most_one_bit_p(st->iso2022.mask))
1775 st->iso2022.mask = detect_coding_iso2022(st, src, n);
1776 if (!mask_has_at_most_one_bit_p(st->shift_jis.mask))
1777 st->shift_jis.mask = detect_coding_sjis(st, src, n);
1778 if (!mask_has_at_most_one_bit_p(st->big5.mask))
1779 st->big5.mask = detect_coding_big5(st, src, n);
1780 if (!mask_has_at_most_one_bit_p(st->utf8.mask))
1781 st->utf8.mask = detect_coding_utf8(st, src, n);
1782 if (!mask_has_at_most_one_bit_p(st->ucs4.mask))
1783 st->ucs4.mask = detect_coding_ucs4(st, src, n);
1785 st->mask = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1786 | st->utf8.mask | st->ucs4.mask;
1789 int retval = mask_has_at_most_one_bit_p(st->mask);
1790 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1791 return retval && st->eol_type != EOL_AUTODETECT;
1795 static Lisp_Object coding_system_from_mask(int mask)
1798 /* If the file was entirely or basically ASCII, use the
1799 default value of `buffer-file-coding-system'. */
1800 Lisp_Object retval =
1801 XBUFFER(Vbuffer_defaults)->buffer_file_coding_system;
1802 if (!NILP(retval)) {
1803 retval = Ffind_coding_system(retval);
1806 (Qbad_variable, Qwarning,
1807 "Invalid `default-buffer-file-coding-system', set to nil");
1808 XBUFFER(Vbuffer_defaults)->
1809 buffer_file_coding_system = Qnil;
1813 retval = Fget_coding_system(Qraw_text);
1819 mask = postprocess_iso2022_mask(mask);
1821 /* Look through the coding categories by priority and find
1822 the first one that is allowed. */
1823 for (i = 0; i < CODING_CATEGORY_LAST; i++) {
1824 cat = fcd->coding_category_by_priority[i];
1827 if ((mask & (1 << cat)) &&
1828 !NILP(fcd->coding_category_system[cat]))
1832 return fcd->coding_category_system[cat];
1834 return Fget_coding_system(Qraw_text);
1838 /* Given a seekable read stream and potential coding system and EOL type
1839 as specified, do any autodetection that is called for. If the
1840 coding system and/or EOL type are not `autodetect', they will be left
1841 alone; but this function will never return an autodetect coding system
1844 This function does not automatically fetch subsidiary coding systems;
1845 that should be unnecessary with the explicit eol-type argument. */
1847 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1848 /* number of leading lines to check for a coding cookie */
1849 #define LINES_TO_CHECK 2
1852 determine_real_coding_system(lstream_t stream, Lisp_Object * codesys_in_out,
1853 eol_type_t * eol_type_in_out)
1855 struct detection_state decst;
1857 if (*eol_type_in_out == EOL_AUTODETECT)
1858 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE(*codesys_in_out);
1861 decst.eol_type = *eol_type_in_out;
1864 /* If autodetection is called for, do it now. */
1865 if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT
1866 || *eol_type_in_out == EOL_AUTODETECT) {
1868 Lisp_Object coding_system = Qnil;
1870 Lstream_data_count nread =
1871 Lstream_read(stream, buf, sizeof(buf));
1873 int lines_checked = 0;
1875 /* Look for initial "-*-"; mode line prefix */
1877 scan_end = buf + nread - LENGTH("-*-coding:?-*-");
1878 p <= scan_end && lines_checked < LINES_TO_CHECK; p++)
1879 if (*p == '-' && *(p + 1) == '*' && *(p + 2) == '-') {
1880 Extbyte *local_vars_beg = p + 3;
1881 /* Look for final "-*-"; mode line suffix */
1882 for (p = local_vars_beg,
1883 scan_end = buf + nread - LENGTH("-*-");
1885 && lines_checked < LINES_TO_CHECK; p++)
1886 if (*p == '-' && *(p + 1) == '*'
1887 && *(p + 2) == '-') {
1888 Extbyte *suffix = p;
1889 /* Look for "coding:" */
1890 for (p = local_vars_beg,
1918 /* Get coding system name */
1921 /* Characters valid in a MIME charset name (rfc 1521),
1922 and in a Lisp symbol name. */
1925 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1926 "abcdefghijklmnopqrstuvwxyz"
1948 /* #### file must use standard EOLs or we miss 2d line */
1949 /* #### not to mention this is broken for UTF-16 DOS files */
1950 else if (*p == '\n' || *p == '\r') {
1952 /* skip past multibyte (DOS) newline */
1954 && *(p + 1) == '\n')
1959 /* #### file must use standard EOLs or we miss 2d line */
1960 /* #### not to mention this is broken for UTF-16 DOS files */
1961 else if (*p == '\n' || *p == '\r') {
1963 /* skip past multibyte (DOS) newline */
1964 if (*p == '\r' && *(p + 1) == '\n')
1968 if (NILP(coding_system))
1970 if (detect_coding_type(&decst, buf, nread,
1973 != CODESYS_AUTODETECT))
1975 nread = Lstream_read(stream, buf, sizeof(buf));
1981 else if (XCODING_SYSTEM_TYPE(*codesys_in_out) ==
1983 && XCODING_SYSTEM_EOL_TYPE(coding_system) ==
1986 if (detect_coding_type(&decst, buf, nread, 1))
1988 nread = Lstream_read(stream, buf, sizeof(buf));
1994 *eol_type_in_out = decst.eol_type;
1995 if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT) {
1996 if (NILP(coding_system))
1998 coding_system_from_mask(decst.mask);
2000 *codesys_in_out = coding_system;
2004 /* If we absolutely can't determine the EOL type, just assume LF. */
2005 if (*eol_type_in_out == EOL_AUTODETECT)
2006 *eol_type_in_out = EOL_LF;
2008 Lstream_rewind(stream);
2011 DEFUN("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
2012 Detect coding system of the text in the region between START and END.
2013 Return a list of possible coding systems ordered by priority.
2014 If only ASCII characters are found, return 'undecided or one of
2015 its subsidiary coding systems according to a detected end-of-line
2016 type. Optional arg BUFFER defaults to the current buffer.
2018 (start, end, buffer))
2020 Lisp_Object val = Qnil;
2021 struct buffer *buf = decode_buffer(buffer, 0);
2023 Lisp_Object instream, lb_instream;
2024 lstream_t istr, lb_istr;
2025 struct detection_state decst;
2026 struct gcpro gcpro1, gcpro2;
2028 get_buffer_range_char(buf, start, end, &b, &e, 0);
2029 lb_instream = make_lisp_buffer_input_stream(buf, b, e, 0);
2030 lb_istr = XLSTREAM(lb_instream);
2032 make_encoding_input_stream(lb_istr, Fget_coding_system(Qbinary));
2033 istr = XLSTREAM(instream);
2034 GCPRO2(instream, lb_instream);
2036 decst.eol_type = EOL_AUTODETECT;
2039 Extbyte random_buffer[4096];
2040 Lstream_data_count nread =
2041 Lstream_read(istr, random_buffer, sizeof(random_buffer));
2045 if (detect_coding_type(&decst, random_buffer, nread, 0))
2049 if (decst.mask == ~0)
2050 val = subsidiary_coding_system(Fget_coding_system(Qundecided),
2057 decst.mask = postprocess_iso2022_mask(decst.mask);
2059 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--) {
2060 int sys = fcd->coding_category_by_priority[i];
2061 if (decst.mask & (1 << sys)) {
2062 Lisp_Object codesys =
2063 fcd->coding_category_system[sys];
2066 subsidiary_coding_system(codesys,
2069 val = Fcons(codesys, val);
2073 Lstream_close(istr);
2075 Lstream_delete(istr);
2076 Lstream_delete(lb_istr);
2080 /************************************************************************/
2081 /* Converting to internal Mule format ("decoding") */
2082 /************************************************************************/
2084 /* A decoding stream is a stream used for decoding text (i.e.
2085 converting from some external format to internal format).
2086 The decoding-stream object keeps track of the actual coding
2087 stream, the stream that is at the other end, and data that
2088 needs to be persistent across the lifetime of the stream. */
2090 /* Handle the EOL stuff related to just-read-in character C.
2091 EOL_TYPE is the EOL type of the coding stream.
2092 FLAGS is the current value of FLAGS in the coding stream, and may
2093 be modified by this macro. (The macro only looks at the
2094 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
2095 bytes are to be written. You need to also define a local goto
2096 label "label_continue_loop" that is at the end of the main
2097 character-reading loop.
2099 If C is a CR character, then this macro handles it entirely and
2100 jumps to label_continue_loop. Otherwise, this macro does not add
2101 anything to DST, and continues normally. You should continue
2102 processing C normally after this macro. */
2104 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2108 if (eol_type == EOL_CR) \
2109 Dynarr_add (dst, '\n'); \
2110 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2111 Dynarr_add (dst, c); \
2113 flags |= CODING_STATE_CR; \
2114 goto label_continue_loop; \
2116 else if (flags & CODING_STATE_CR) \
2117 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2119 Dynarr_add (dst, '\r'); \
2120 flags &= ~CODING_STATE_CR; \
2124 /* C should be a binary character in the range 0 - 255; convert
2125 to internal format and add to Dynarr DST. */
2127 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2129 if (BYTE_ASCII_P (c)) \
2130 Dynarr_add (dst, c); \
2131 else if (BYTE_C1_P (c)) \
2133 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2134 Dynarr_add (dst, c + 0x20); \
2138 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2139 Dynarr_add (dst, c); \
2143 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2147 DECODE_ADD_BINARY_CHAR (ch, dst); \
2152 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2154 if (flags & CODING_STATE_END) \
2156 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2157 if (flags & CODING_STATE_CR) \
2158 Dynarr_add (dst, '\r'); \
2162 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2164 typedef struct decoding_stream_s *decoding_stream_t;
2165 struct decoding_stream_s {
2166 /* Coding system that governs the conversion. */
2167 Lisp_Coding_System *codesys;
2169 /* Stream that we read the encoded data from or
2170 write the decoded data to. */
2171 lstream_t other_end;
2173 /* If we are reading, then we can return only a fixed amount of
2174 data, so if the conversion resulted in too much data, we store it
2175 here for retrieval the next time around. */
2176 unsigned_char_dynarr *runoff;
2178 /* FLAGS holds flags indicating the current state of the decoding.
2179 Some of these flags are dependent on the coding system. */
2182 /* CH holds a partially built-up character. Since we only deal
2183 with one- and two-byte characters at the moment, we only use
2184 this to store the first byte of a two-byte character. */
2187 /* EOL_TYPE specifies the type of end-of-line conversion that
2188 currently applies. We need to keep this separate from the
2189 EOL type stored in CODESYS because the latter might indicate
2190 automatic EOL-type detection while the former will always
2191 indicate a particular EOL type. */
2192 eol_type_t eol_type;
2194 /* Additional ISO2022 information. We define the structure above
2195 because it's also needed by the detection routines. */
2196 struct iso2022_decoder iso2022;
2198 /* Additional information (the state of the running CCL program)
2199 used by the CCL decoder. */
2200 struct ccl_program ccl;
2202 /* counter for UTF-8 or UCS-4 */
2203 unsigned char counter;
2205 struct detection_state decst;
2208 static Lstream_data_count
2209 decoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size);
2210 static Lstream_data_count
2211 decoding_writer(lstream_t stream,
2212 const unsigned char *data, Lstream_data_count size);
2213 static int decoding_rewinder(lstream_t stream);
2214 static int decoding_seekable_p(lstream_t stream);
2215 static int decoding_flusher(lstream_t stream);
2216 static int decoding_closer(lstream_t stream);
2218 static Lisp_Object decoding_marker(Lisp_Object stream);
2220 DEFINE_LSTREAM_IMPLEMENTATION("decoding", lstream_decoding,
2221 sizeof(struct decoding_stream_s));
2224 decoding_marker(Lisp_Object stream)
2226 lstream_t str = DECODING_STREAM_DATA(XLSTREAM(stream))->other_end;
2227 Lisp_Object str_obj;
2229 /* We do not need to mark the coding systems or charsets stored
2230 within the stream because they are stored in a global list
2231 and automatically marked. */
2233 XSETLSTREAM(str_obj, str);
2234 mark_object(str_obj);
2235 if (str->imp->marker) {
2236 return str->imp->marker(str_obj);
2242 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2243 so we read data from the other end, decode it, and store it into DATA. */
2245 static Lstream_data_count
2246 decoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size)
2248 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2249 unsigned char *orig_data = data;
2250 Lstream_data_count read_size;
2251 int error_occurred = 0;
2253 /* We need to interface to mule_decode(), which expects to take some
2254 amount of data and store the result into a Dynarr. We have
2255 mule_decode() store into str->runoff, and take data from there
2258 /* We loop until we have enough data, reading chunks from the other
2259 end and decoding it. */
2261 /* Take data from the runoff if we can. Make sure to take at
2262 most SIZE bytes, and delete the data from the runoff. */
2263 if (Dynarr_length(str->runoff) > 0) {
2264 Lstream_data_count chunk =
2266 (Lstream_data_count)
2267 Dynarr_length(str->runoff));
2268 memcpy(data, Dynarr_atp(str->runoff, 0), chunk);
2269 Dynarr_delete_many(str->runoff, 0, chunk);
2275 /* No more room for data */
2279 if (str->flags & CODING_STATE_END) {
2280 /* This means that on the previous iteration, we hit the
2281 EOF on the other end. We loop once more so that
2282 mule_decode() can output any final stuff it may be
2283 holding, or any "go back to a sane state" escape
2284 sequences. (This latter makes sense during
2289 /* Exhausted the runoff, so get some more. DATA has at least
2290 SIZE bytes left of storage in it, so it's OK to read directly
2291 into it. (We'll be overwriting above, after we've decoded it
2292 into the runoff.) */
2293 read_size = Lstream_read(str->other_end, data, size);
2294 if (read_size < 0) {
2298 if (read_size == 0) {
2299 /* There might be some more end data produced in the
2300 translation. See the comment above. */
2301 str->flags |= CODING_STATE_END;
2303 mule_decode(stream, (Extbyte *) data, str->runoff, read_size);
2306 if (data - orig_data == 0) {
2307 return error_occurred ? -1 : 0;
2309 return data - orig_data;
2313 static Lstream_data_count
2314 decoding_writer(lstream_t stream, const unsigned char *data,
2315 Lstream_data_count size)
2317 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2318 Lstream_data_count retval;
2320 /* Decode all our data into the runoff, and then attempt to write
2321 it all out to the other end. Remove whatever chunk we succeeded
2323 mule_decode(stream, (const Extbyte *)data, str->runoff, size);
2324 retval = Lstream_write(str->other_end, Dynarr_atp(str->runoff, 0),
2325 Dynarr_length(str->runoff));
2327 Dynarr_delete_many(str->runoff, 0, retval);
2329 /* Do NOT return retval. The return value indicates how much
2330 of the incoming data was written, not how many bytes were
2336 reset_decoding_stream(decoding_stream_t str)
2339 if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_ISO2022) {
2340 Lisp_Object coding_system;
2341 XSETCODING_SYSTEM(coding_system, str->codesys);
2342 reset_iso2022(coding_system, &str->iso2022);
2343 } else if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_CCL) {
2344 setup_ccl_program(&str->ccl,
2345 CODING_SYSTEM_CCL_DECODE(str->codesys));
2349 if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT
2350 || CODING_SYSTEM_EOL_TYPE(str->codesys) == EOL_AUTODETECT) {
2352 str->decst.eol_type = EOL_AUTODETECT;
2353 str->decst.mask = ~0;
2355 str->flags = str->ch = 0;
2359 decoding_rewinder(lstream_t stream)
2361 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2362 reset_decoding_stream(str);
2363 Dynarr_reset(str->runoff);
2364 return Lstream_rewind(str->other_end);
2368 decoding_seekable_p(lstream_t stream)
2370 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2371 return Lstream_seekable_p(str->other_end);
2375 decoding_flusher(lstream_t stream)
2377 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2378 return Lstream_flush(str->other_end);
2382 decoding_closer(lstream_t stream)
2384 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2385 if (stream->flags & LSTREAM_FL_WRITE) {
2386 str->flags |= CODING_STATE_END;
2387 decoding_writer(stream, 0, 0);
2389 Dynarr_free(str->runoff);
2391 #ifdef ENABLE_COMPOSITE_CHARS
2392 if (str->iso2022.composite_chars) {
2393 Dynarr_free(str->iso2022.composite_chars);
2397 return Lstream_close(str->other_end);
2401 decoding_stream_coding_system(lstream_t stream)
2403 Lisp_Object coding_system;
2404 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2406 XSETCODING_SYSTEM(coding_system, str->codesys);
2407 return subsidiary_coding_system(coding_system, str->eol_type);
2411 set_decoding_stream_coding_system(lstream_t lstr, Lisp_Object codesys)
2413 Lisp_Coding_System *cs = XCODING_SYSTEM(codesys);
2414 decoding_stream_t str = DECODING_STREAM_DATA(lstr);
2416 if (CODING_SYSTEM_EOL_TYPE(cs) != EOL_AUTODETECT) {
2417 str->eol_type = CODING_SYSTEM_EOL_TYPE(cs);
2419 reset_decoding_stream(str);
2423 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2424 stream for writing, no automatic code detection will be performed.
2425 The reason for this is that automatic code detection requires a
2426 seekable input. Things will also fail if you open a decoding
2427 stream for reading using a non-fully-specified coding system and
2428 a non-seekable input stream. */
2431 make_decoding_stream_1(lstream_t stream, Lisp_Object codesys, const char *mode)
2433 lstream_t lstr = Lstream_new(lstream_decoding, mode);
2434 decoding_stream_t str = DECODING_STREAM_DATA(lstr);
2438 str->other_end = stream;
2439 str->runoff = (unsigned_char_dynarr *) Dynarr_new(unsigned_char);
2440 str->eol_type = EOL_AUTODETECT;
2441 if (!strcmp(mode, "r") && Lstream_seekable_p(stream)) {
2442 /* We can determine the coding system now. */
2443 determine_real_coding_system(stream, &codesys, &str->eol_type);
2445 set_decoding_stream_coding_system(lstr, codesys);
2446 str->decst.eol_type = str->eol_type;
2447 str->decst.mask = ~0;
2448 XSETLSTREAM(obj, lstr);
2453 make_decoding_input_stream(lstream_t stream, Lisp_Object codesys)
2455 return make_decoding_stream_1(stream, codesys, "r");
2459 make_decoding_output_stream(lstream_t stream, Lisp_Object codesys)
2461 return make_decoding_stream_1(stream, codesys, "w");
2464 /* Note: the decode_coding_* functions all take the same
2465 arguments as mule_decode(), which is to say some SRC data of
2466 size N, which is to be stored into dynamic array DST.
2467 DECODING is the stream within which the decoding is
2468 taking place, but no data is actually read from or
2469 written to that stream; that is handled in decoding_reader()
2470 or decoding_writer(). This allows the same functions to
2471 be used for both reading and writing. */
2474 mule_decode(lstream_t decoding, const Extbyte * src,
2475 unsigned_char_dynarr * dst, Lstream_data_count n)
2477 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
2479 /* If necessary, do encoding-detection now. We do this when
2480 we're a writing stream or a non-seekable reading stream,
2481 meaning that we can't just process the whole input,
2482 rewind, and start over. */
2484 if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT ||
2485 str->eol_type == EOL_AUTODETECT) {
2486 Lisp_Object codesys;
2488 XSETCODING_SYSTEM(codesys, str->codesys);
2489 detect_coding_type(&str->decst, src, n,
2490 CODING_SYSTEM_TYPE(str->codesys) !=
2491 CODESYS_AUTODETECT);
2492 if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT &&
2493 str->decst.mask != ~0)
2494 /* #### This is cheesy. What we really ought to do is
2495 buffer up a certain amount of data so as to get a
2496 less random result. */
2497 codesys = coding_system_from_mask(str->decst.mask);
2498 str->eol_type = str->decst.eol_type;
2499 if (XCODING_SYSTEM(codesys) != str->codesys) {
2500 /* Preserve the CODING_STATE_END flag in case it was set.
2501 If we erase it, bad things might happen. */
2502 int was_end = str->flags & CODING_STATE_END;
2503 set_decoding_stream_coding_system(decoding, codesys);
2505 str->flags |= CODING_STATE_END;
2509 switch (CODING_SYSTEM_TYPE(str->codesys)) {
2510 #ifdef DEBUG_SXEMACS
2511 case CODESYS_INTERNAL:
2512 Dynarr_add_many(dst, src, n);
2515 case CODESYS_AUTODETECT:
2516 /* If we got this far and still haven't decided on the coding
2517 system, then do no conversion. */
2518 case CODESYS_NO_CONVERSION:
2519 decode_coding_no_conversion(decoding, src, dst, n);
2522 case CODESYS_SHIFT_JIS:
2523 decode_coding_sjis(decoding, src, dst, n);
2526 decode_coding_big5(decoding, src, dst, n);
2529 decode_coding_ucs4(decoding, src, dst, n);
2532 decode_coding_utf8(decoding, src, dst, n);
2535 str->ccl.last_block = str->flags & CODING_STATE_END;
2536 /* When applying ccl program to stream, MUST NOT set NULL
2538 ccl_driver(&str->ccl,
2540 ? (const unsigned char *)src
2541 : (const unsigned char *)""),
2542 dst, n, 0, CCL_MODE_DECODING);
2544 case CODESYS_ISO2022:
2545 decode_coding_iso2022(decoding, src, dst, n);
2553 DEFUN("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2554 Decode the text between START and END which is encoded in CODING-SYSTEM.
2555 This is useful if you've read in encoded text from a file without decoding
2556 it (e.g. you read in a JIS-formatted file but used the `binary' or
2557 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2558 Return length of decoded text.
2559 BUFFER defaults to the current buffer if unspecified.
2561 (start, end, coding_system, buffer))
2564 struct buffer *buf = decode_buffer(buffer, 0);
2565 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2566 lstream_t istr, ostr;
2567 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2569 get_buffer_range_char(buf, start, end, &b, &e, 0);
2571 barf_if_buffer_read_only(buf, b, e);
2573 coding_system = Fget_coding_system(coding_system);
2574 instream = make_lisp_buffer_input_stream(buf, b, e, 0);
2575 lb_outstream = make_lisp_buffer_output_stream(buf, b, 0);
2576 de_outstream = make_decoding_output_stream(XLSTREAM(lb_outstream),
2578 outstream = make_encoding_output_stream(XLSTREAM(de_outstream),
2579 Fget_coding_system(Qbinary));
2580 istr = XLSTREAM(instream);
2581 ostr = XLSTREAM(outstream);
2582 GCPRO4(instream, lb_outstream, de_outstream, outstream);
2584 /* The chain of streams looks like this:
2586 [BUFFER] <----- send through
2587 ------> [ENCODE AS BINARY]
2588 ------> [DECODE AS SPECIFIED]
2593 char tempbuf[1024]; /* some random amount */
2594 Bufpos newpos, even_newer_pos;
2595 Bufpos oldpos = lisp_buffer_stream_startpos(istr);
2596 Lstream_data_count size_in_bytes =
2597 Lstream_read(istr, tempbuf, sizeof(tempbuf));
2601 newpos = lisp_buffer_stream_startpos(istr);
2602 Lstream_write(ostr, tempbuf, size_in_bytes);
2603 even_newer_pos = lisp_buffer_stream_startpos(istr);
2604 buffer_delete_range(buf, even_newer_pos - (newpos - oldpos),
2607 Lstream_close(istr);
2608 Lstream_close(ostr);
2610 Lstream_delete(istr);
2611 Lstream_delete(ostr);
2612 Lstream_delete(XLSTREAM(de_outstream));
2613 Lstream_delete(XLSTREAM(lb_outstream));
2617 /************************************************************************/
2618 /* Converting to an external encoding ("encoding") */
2619 /************************************************************************/
2621 /* An encoding stream is an output stream. When you create the
2622 stream, you specify the coding system that governs the encoding
2623 and another stream that the resulting encoded data is to be
2624 sent to, and then start sending data to it. */
2626 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2628 typedef struct encoding_stream_s *encoding_stream_t;
2629 struct encoding_stream_s {
2630 /* Coding system that governs the conversion. */
2631 Lisp_Coding_System *codesys;
2633 /* Stream that we read the encoded data from or
2634 write the decoded data to. */
2635 lstream_t other_end;
2637 /* If we are reading, then we can return only a fixed amount of
2638 data, so if the conversion resulted in too much data, we store it
2639 here for retrieval the next time around. */
2640 unsigned_char_dynarr *runoff;
2642 /* FLAGS holds flags indicating the current state of the encoding.
2643 Some of these flags are dependent on the coding system. */
2646 /* CH holds a partially built-up character. Since we only deal
2647 with one- and two-byte characters at the moment, we only use
2648 this to store the first byte of a two-byte character. */
2651 /* Additional information used by the ISO2022 encoder. */
2653 /* CHARSET holds the character sets currently assigned to the G0
2654 through G3 registers. It is initialized from the array
2655 INITIAL_CHARSET in CODESYS. */
2656 Lisp_Object charset[4];
2658 /* Which registers are currently invoked into the left (GL) and
2659 right (GR) halves of the 8-bit encoding space? */
2660 int register_left, register_right;
2662 /* Whether we need to explicitly designate the charset in the
2663 G? register before using it. It is initialized from the
2664 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2665 unsigned char force_charset_on_output[4];
2667 /* Other state variables that need to be preserved across
2669 Lisp_Object current_charset;
2671 int current_char_boundary;
2674 /* Additional information (the state of the running CCL program)
2675 used by the CCL encoder. */
2676 struct ccl_program ccl;
2680 static Lstream_data_count
2681 encoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size);
2682 static Lstream_data_count
2683 encoding_writer(lstream_t stream,
2684 const unsigned char *data, Lstream_data_count size);
2685 static int encoding_rewinder(lstream_t stream);
2686 static int encoding_seekable_p(lstream_t stream);
2687 static int encoding_flusher(lstream_t stream);
2688 static int encoding_closer(lstream_t stream);
2690 static Lisp_Object encoding_marker(Lisp_Object stream);
2692 DEFINE_LSTREAM_IMPLEMENTATION("encoding", lstream_encoding,
2693 sizeof(struct encoding_stream_s));
2696 encoding_marker(Lisp_Object stream)
2698 lstream_t str = ENCODING_STREAM_DATA(XLSTREAM(stream))->other_end;
2699 Lisp_Object str_obj;
2701 /* We do not need to mark the coding systems or charsets stored
2702 within the stream because they are stored in a global list
2703 and automatically marked. */
2705 XSETLSTREAM(str_obj, str);
2706 mark_object(str_obj);
2707 if (str->imp->marker) {
2708 return str->imp->marker(str_obj);
2714 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2715 so we read data from the other end, encode it, and store it into DATA. */
2717 static Lstream_data_count
2718 encoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size)
2720 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2721 unsigned char *orig_data = data;
2722 Lstream_data_count read_size;
2723 int error_occurred = 0;
2725 /* We need to interface to mule_encode(), which expects to take some
2726 amount of data and store the result into a Dynarr. We have
2727 mule_encode() store into str->runoff, and take data from there
2730 /* We loop until we have enough data, reading chunks from the other
2731 end and encoding it. */
2733 /* Take data from the runoff if we can. Make sure to take at
2734 most SIZE bytes, and delete the data from the runoff. */
2735 if (Dynarr_length(str->runoff) > 0) {
2736 int chunk = min((int)size, Dynarr_length(str->runoff));
2737 memcpy(data, Dynarr_atp(str->runoff, 0), chunk);
2738 Dynarr_delete_many(str->runoff, 0, chunk);
2744 /* No more room for data */
2748 if (str->flags & CODING_STATE_END) {
2749 /* This means that on the previous iteration, we hit the
2750 EOF on the other end. We loop once more so that
2751 mule_encode() can output any final stuff it may be
2752 holding, or any "go back to a sane state" escape
2753 sequences. (This latter makes sense during
2758 /* Exhausted the runoff, so get some more. DATA at least SIZE
2759 bytes left of storage in it, so it's OK to read directly into
2760 it. (We'll be overwriting above, after we've encoded it into
2762 read_size = Lstream_read(str->other_end, data, size);
2763 if (read_size < 0) {
2767 if (read_size == 0) {
2768 /* There might be some more end data produced in the
2769 translation. See the comment above. */
2770 str->flags |= CODING_STATE_END;
2772 mule_encode(stream, data, str->runoff, read_size);
2775 if (data == orig_data) {
2776 return error_occurred ? -1 : 0;
2778 return data - orig_data;
2782 static Lstream_data_count
2783 encoding_writer(lstream_t stream, const unsigned char *data,
2784 Lstream_data_count size)
2786 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2787 Lstream_data_count retval;
2789 /* Encode all our data into the runoff, and then attempt to write
2790 it all out to the other end. Remove whatever chunk we succeeded
2792 mule_encode(stream, data, str->runoff, size);
2793 retval = Lstream_write(str->other_end, Dynarr_atp(str->runoff, 0),
2794 Dynarr_length(str->runoff));
2796 Dynarr_delete_many(str->runoff, 0, retval);
2798 /* Do NOT return retval. The return value indicates how much
2799 of the incoming data was written, not how many bytes were
2805 reset_encoding_stream(encoding_stream_t str)
2808 switch (CODING_SYSTEM_TYPE(str->codesys)) {
2809 case CODESYS_ISO2022: {
2812 for (i = 0; i < 4; i++) {
2813 str->iso2022.charset[i] =
2814 CODING_SYSTEM_ISO2022_INITIAL_CHARSET(
2816 str->iso2022.force_charset_on_output[i] =
2817 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT(
2820 str->iso2022.register_left = 0;
2821 str->iso2022.register_right = 1;
2822 str->iso2022.current_charset = Qnil;
2823 str->iso2022.current_half = 0;
2824 str->iso2022.current_char_boundary = 1;
2828 setup_ccl_program(&str->ccl,
2829 CODING_SYSTEM_CCL_ENCODE(str->codesys));
2832 /* list the rest of them lot explicitly */
2833 case CODESYS_AUTODETECT:
2834 case CODESYS_SHIFT_JIS:
2838 case CODESYS_NO_CONVERSION:
2839 #ifdef DEBUG_SXEMACS
2840 case CODESYS_INTERNAL:
2847 str->flags = str->ch = 0;
2851 encoding_rewinder(lstream_t stream)
2853 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2854 reset_encoding_stream(str);
2855 Dynarr_reset(str->runoff);
2856 return Lstream_rewind(str->other_end);
2860 encoding_seekable_p(lstream_t stream)
2862 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2863 return Lstream_seekable_p(str->other_end);
2867 encoding_flusher(lstream_t stream)
2869 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2870 return Lstream_flush(str->other_end);
2874 encoding_closer(lstream_t stream)
2876 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2877 if (stream->flags & LSTREAM_FL_WRITE) {
2878 str->flags |= CODING_STATE_END;
2879 encoding_writer(stream, 0, 0);
2881 Dynarr_free(str->runoff);
2882 return Lstream_close(str->other_end);
2886 encoding_stream_coding_system(lstream_t stream)
2888 Lisp_Object coding_system;
2889 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2891 XSETCODING_SYSTEM(coding_system, str->codesys);
2892 return coding_system;
2896 set_encoding_stream_coding_system(lstream_t lstr, Lisp_Object codesys)
2898 Lisp_Coding_System *cs = XCODING_SYSTEM(codesys);
2899 encoding_stream_t str = ENCODING_STREAM_DATA(lstr);
2901 reset_encoding_stream(str);
2905 make_encoding_stream_1(lstream_t stream, Lisp_Object codesys, const char *mode)
2907 lstream_t lstr = Lstream_new(lstream_encoding, mode);
2908 encoding_stream_t str = ENCODING_STREAM_DATA(lstr);
2912 str->runoff = Dynarr_new(unsigned_char);
2913 str->other_end = stream;
2914 set_encoding_stream_coding_system(lstr, codesys);
2915 XSETLSTREAM(obj, lstr);
2920 make_encoding_input_stream(lstream_t stream, Lisp_Object codesys)
2922 return make_encoding_stream_1(stream, codesys, "r");
2926 make_encoding_output_stream(lstream_t stream, Lisp_Object codesys)
2928 return make_encoding_stream_1(stream, codesys, "w");
2931 /* Convert N bytes of internally-formatted data stored in SRC to an
2932 external format, according to the encoding stream ENCODING.
2933 Store the encoded data into DST. */
2936 mule_encode(lstream_t encoding, const Bufbyte * src,
2937 unsigned_char_dynarr * dst, Lstream_data_count n)
2939 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
2941 switch (CODING_SYSTEM_TYPE(str->codesys)) {
2942 #ifdef DEBUG_SXEMACS
2943 case CODESYS_INTERNAL:
2944 Dynarr_add_many(dst, src, n);
2947 case CODESYS_AUTODETECT:
2948 /* If we got this far and still haven't decided on the coding
2949 system, then do no conversion. */
2950 case CODESYS_NO_CONVERSION:
2951 encode_coding_no_conversion(encoding, src, dst, n);
2954 case CODESYS_SHIFT_JIS:
2955 encode_coding_sjis(encoding, src, dst, n);
2958 encode_coding_big5(encoding, src, dst, n);
2961 encode_coding_ucs4(encoding, src, dst, n);
2964 encode_coding_utf8(encoding, src, dst, n);
2967 str->ccl.last_block = str->flags & CODING_STATE_END;
2968 /* When applying ccl program to stream, MUST NOT set NULL
2970 ccl_driver(&str->ccl, ((src) ? src : (unsigned char *)""),
2971 dst, n, 0, CCL_MODE_ENCODING);
2973 case CODESYS_ISO2022:
2974 encode_coding_iso2022(encoding, src, dst, n);
2982 DEFUN("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2983 Encode the text between START and END using CODING-SYSTEM.
2984 This will, for example, convert Japanese characters into stuff such as
2985 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2986 text. BUFFER defaults to the current buffer if unspecified.
2988 (start, end, coding_system, buffer))
2991 struct buffer *buf = decode_buffer(buffer, 0);
2992 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2993 lstream_t istr, ostr;
2994 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2996 get_buffer_range_char(buf, start, end, &b, &e, 0);
2998 barf_if_buffer_read_only(buf, b, e);
3000 coding_system = Fget_coding_system(coding_system);
3001 instream = make_lisp_buffer_input_stream(buf, b, e, 0);
3002 lb_outstream = make_lisp_buffer_output_stream(buf, b, 0);
3003 de_outstream = make_decoding_output_stream(XLSTREAM(lb_outstream),
3004 Fget_coding_system(Qbinary));
3005 outstream = make_encoding_output_stream(XLSTREAM(de_outstream),
3007 istr = XLSTREAM(instream);
3008 ostr = XLSTREAM(outstream);
3009 GCPRO4(instream, outstream, de_outstream, lb_outstream);
3010 /* The chain of streams looks like this:
3012 [BUFFER] <----- send through
3013 ------> [ENCODE AS SPECIFIED]
3014 ------> [DECODE AS BINARY]
3018 char tempbuf[1024]; /* some random amount */
3019 Bufpos newpos, even_newer_pos;
3020 Bufpos oldpos = lisp_buffer_stream_startpos(istr);
3021 Lstream_data_count size_in_bytes =
3022 Lstream_read(istr, tempbuf, sizeof(tempbuf));
3026 newpos = lisp_buffer_stream_startpos(istr);
3027 Lstream_write(ostr, tempbuf, size_in_bytes);
3028 even_newer_pos = lisp_buffer_stream_startpos(istr);
3029 buffer_delete_range(buf, even_newer_pos - (newpos - oldpos),
3035 lisp_buffer_stream_startpos(XLSTREAM(instream)) - b;
3036 Lstream_close(istr);
3037 Lstream_close(ostr);
3039 Lstream_delete(istr);
3040 Lstream_delete(ostr);
3041 Lstream_delete(XLSTREAM(de_outstream));
3042 Lstream_delete(XLSTREAM(lb_outstream));
3043 return make_int(retlen);
3049 /************************************************************************/
3050 /* Shift-JIS methods */
3051 /************************************************************************/
3053 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3054 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3055 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3056 encoded by "position-code + 0x80". A character of JISX0208
3057 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3058 position-codes are divided and shifted so that it fit in the range
3061 --- CODE RANGE of Shift-JIS ---
3062 (character set) (range)
3064 JISX0201-Kana 0xA0 .. 0xDF
3065 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
3066 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3067 -------------------------------
3071 /* Is this the first byte of a Shift-JIS two-byte char? */
3073 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3074 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3076 /* Is this the second byte of a Shift-JIS two-byte char? */
3078 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3079 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3081 #define BYTE_SJIS_KATAKANA_P(c) \
3082 ((c) >= 0xA1 && (c) <= 0xDF)
3085 detect_coding_sjis(struct detection_state *st, const Extbyte * src,
3086 Lstream_data_count n)
3089 const unsigned char c = *(const unsigned char *)src++;
3090 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3092 if (st->shift_jis.in_second_byte) {
3093 st->shift_jis.in_second_byte = 0;
3096 } else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3097 st->shift_jis.in_second_byte = 1;
3099 return CODING_CATEGORY_SHIFT_JIS_MASK;
3102 /* Convert Shift-JIS data to internal format. */
3105 decode_coding_sjis(lstream_t decoding, const Extbyte * src,
3106 unsigned_char_dynarr * dst, Lstream_data_count n)
3108 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3109 unsigned int flags = str->flags;
3110 unsigned int ch = str->ch;
3111 eol_type_t eol_type = str->eol_type;
3114 const unsigned char c = *(const unsigned char *)src++;
3117 /* Previous character was first byte of Shift-JIS Kanji
3119 if (BYTE_SJIS_TWO_BYTE_2_P(c)) {
3120 unsigned char e1, e2;
3122 Dynarr_add(dst, LEADING_BYTE_JAPANESE_JISX0208);
3123 DECODE_SJIS(ch, c, e1, e2);
3124 Dynarr_add(dst, e1);
3125 Dynarr_add(dst, e2);
3127 DECODE_ADD_BINARY_CHAR(ch, dst);
3128 DECODE_ADD_BINARY_CHAR(c, dst);
3132 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3133 if (BYTE_SJIS_TWO_BYTE_1_P(c))
3135 else if (BYTE_SJIS_KATAKANA_P(c)) {
3136 Dynarr_add(dst, LEADING_BYTE_KATAKANA_JISX0201);
3139 DECODE_ADD_BINARY_CHAR(c, dst);
3141 label_continue_loop:;
3144 DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
3150 /* Convert internally-formatted data to Shift-JIS. */
3153 encode_coding_sjis(lstream_t encoding, const Bufbyte * src,
3154 unsigned_char_dynarr * dst, Lstream_data_count n)
3156 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3157 unsigned int flags = str->flags;
3158 unsigned int ch = str->ch;
3159 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3164 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3165 Dynarr_add(dst, '\r');
3166 if (eol_type != EOL_CR)
3167 Dynarr_add(dst, '\n');
3169 } else if (BYTE_ASCII_P(c)) {
3172 } else if (BUFBYTE_LEADING_BYTE_P(c))
3173 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
3174 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3175 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
3177 if (ch == LEADING_BYTE_KATAKANA_JISX0201) {
3180 } else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3181 ch == LEADING_BYTE_JAPANESE_JISX0208)
3184 /* j1 is bessel j1 function,
3185 * so we use something else */
3186 /* unsigned char j1, j2; */
3187 unsigned char tt1, tt2;
3189 ENCODE_SJIS(ch, c, tt1, tt2);
3190 Dynarr_add(dst, tt1);
3191 Dynarr_add(dst, tt2);
3201 DEFUN("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3202 Decode a JISX0208 character of Shift-JIS coding-system.
3203 CODE is the character code in Shift-JIS as a cons of type bytes.
3204 Return the corresponding character.
3208 unsigned char c1, c2, s1, s2;
3211 CHECK_INT(XCAR(code));
3212 CHECK_INT(XCDR(code));
3213 s1 = XINT(XCAR(code));
3214 s2 = XINT(XCDR(code));
3215 if (BYTE_SJIS_TWO_BYTE_1_P(s1) && BYTE_SJIS_TWO_BYTE_2_P(s2)) {
3216 DECODE_SJIS(s1, s2, c1, c2);
3217 return make_char(MAKE_CHAR(Vcharset_japanese_jisx0208,
3218 c1 & 0x7F, c2 & 0x7F));
3223 DEFUN("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3224 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3225 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3229 Lisp_Object charset;
3232 CHECK_CHAR_COERCE_INT(character);
3233 BREAKUP_CHAR(XCHAR(character), charset, c1, c2);
3234 if (EQ(charset, Vcharset_japanese_jisx0208)) {
3235 ENCODE_SJIS(c1 | 0x80, c2 | 0x80, s1, s2);
3236 return Fcons(make_int(s1), make_int(s2));
3241 /************************************************************************/
3243 /************************************************************************/
3245 /* BIG5 is a coding system encoding two character sets: ASCII and
3246 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3247 character set and is encoded in two-byte.
3249 --- CODE RANGE of BIG5 ---
3250 (character set) (range)
3252 Big5 (1st byte) 0xA1 .. 0xFE
3253 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3254 --------------------------
3256 Since the number of characters in Big5 is larger than maximum
3257 characters in Emacs' charset (96x96), it can't be handled as one
3258 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3259 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3260 contains frequently used characters and the latter contains less
3261 frequently used characters. */
3263 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3264 ((c) >= 0xA1 && (c) <= 0xFE)
3266 /* Is this the second byte of a Shift-JIS two-byte char? */
3268 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3269 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3271 /* Number of Big5 characters which have the same code in 1st byte. */
3273 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3275 /* Code conversion macros. These are macros because they are used in
3276 inner loops during code conversion.
3278 Note that temporary variables in macros introduce the classic
3279 dynamic-scoping problems with variable names. We use capital-
3280 lettered variables in the assumption that SXEmacs does not use
3281 capital letters in variables except in a very formalized way
3284 /* Convert Big5 code (b1, b2) into its internal string representation
3287 /* There is a much simpler way to split the Big5 charset into two.
3288 For the moment I'm going to leave the algorithm as-is because it
3289 claims to separate out the most-used characters into a single
3290 charset, which perhaps will lead to optimizations in various
3293 The way the algorithm works is something like this:
3295 Big5 can be viewed as a 94x157 charset, where the row is
3296 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3297 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3298 the split between low and high column numbers is apparently
3299 meaningless; ascending rows produce less and less frequent chars.
3300 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3301 the first charset, and the upper half (0xC9 .. 0xFE) to the
3302 second. To do the conversion, we convert the character into
3303 a single number where 0 .. 156 is the first row, 157 .. 313
3304 is the second, etc. That way, the characters are ordered by
3305 decreasing frequency. Then we just chop the space in two
3306 and coerce the result into a 94x94 space.
3309 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3311 int B1 = b1, B2 = b2; \
3313 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3317 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3321 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3322 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3324 c1 = I / (0xFF - 0xA1) + 0xA1; \
3325 c2 = I % (0xFF - 0xA1) + 0xA1; \
3328 /* Convert the internal string representation of a Big5 character
3329 (lb, c1, c2) into Big5 code (b1, b2). */
3331 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3333 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3335 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3337 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3339 b1 = I / BIG5_SAME_ROW + 0xA1; \
3340 b2 = I % BIG5_SAME_ROW; \
3341 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3345 detect_coding_big5(struct detection_state *st, const Extbyte * src,
3346 Lstream_data_count n)
3349 const unsigned char c = *(const unsigned char *)src++;
3350 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3351 (c >= 0x80 && c <= 0xA0))
3353 if (st->big5.in_second_byte) {
3354 st->big5.in_second_byte = 0;
3355 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3357 } else if (c >= 0xA1)
3358 st->big5.in_second_byte = 1;
3360 return CODING_CATEGORY_BIG5_MASK;
3363 /* Convert Big5 data to internal format. */
3366 decode_coding_big5(lstream_t decoding, const Extbyte * src,
3367 unsigned_char_dynarr * dst, Lstream_data_count n)
3369 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3370 unsigned int flags = str->flags;
3371 unsigned int ch = str->ch;
3372 eol_type_t eol_type = str->eol_type;
3375 const unsigned char c = *(const unsigned char *)src++;
3377 /* Previous character was first byte of Big5 char. */
3378 if (BYTE_BIG5_TWO_BYTE_2_P(c)) {
3379 unsigned char b1, b2, b3;
3380 DECODE_BIG5(ch, c, b1, b2, b3);
3381 Dynarr_add(dst, b1);
3382 Dynarr_add(dst, b2);
3383 Dynarr_add(dst, b3);
3385 DECODE_ADD_BINARY_CHAR(ch, dst);
3386 DECODE_ADD_BINARY_CHAR(c, dst);
3390 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3391 if (BYTE_BIG5_TWO_BYTE_1_P(c))
3394 DECODE_ADD_BINARY_CHAR(c, dst);
3396 label_continue_loop:;
3399 DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
3405 /* Convert internally-formatted data to Big5. */
3408 encode_coding_big5(lstream_t encoding, const Bufbyte * src,
3409 unsigned_char_dynarr * dst, Lstream_data_count n)
3412 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3413 unsigned int flags = str->flags;
3414 unsigned int ch = str->ch;
3415 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3420 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3421 Dynarr_add(dst, '\r');
3422 if (eol_type != EOL_CR)
3423 Dynarr_add(dst, '\n');
3424 } else if (BYTE_ASCII_P(c)) {
3427 } else if (BUFBYTE_LEADING_BYTE_P(c)) {
3428 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3429 c == LEADING_BYTE_CHINESE_BIG5_2) {
3430 /* A recognized leading byte. */
3432 continue; /* not done with this character. */
3434 /* otherwise just ignore this character. */
3435 } else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3436 ch == LEADING_BYTE_CHINESE_BIG5_2) {
3437 /* Previous char was a recognized leading byte. */
3439 continue; /* not done with this character. */
3441 /* Encountering second byte of a Big5 character. */
3442 unsigned char b1, b2;
3444 ENCODE_BIG5(ch >> 8, ch & 0xFF, c, b1, b2);
3445 Dynarr_add(dst, b1);
3446 Dynarr_add(dst, b2);
3456 DEFUN("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3457 Decode a Big5 character CODE of BIG5 coding-system.
3458 CODE is the character code in BIG5, a cons of two integers.
3459 Return the corresponding character.
3463 unsigned char c1, c2, b1, b2;
3466 CHECK_INT(XCAR(code));
3467 CHECK_INT(XCDR(code));
3468 b1 = XINT(XCAR(code));
3469 b2 = XINT(XCDR(code));
3470 if (BYTE_BIG5_TWO_BYTE_1_P(b1) && BYTE_BIG5_TWO_BYTE_2_P(b2)) {
3472 Lisp_Object charset;
3473 DECODE_BIG5(b1, b2, leading_byte, c1, c2);
3474 charset = CHARSET_BY_LEADING_BYTE(leading_byte);
3475 return make_char(MAKE_CHAR(charset, c1 & 0x7F, c2 & 0x7F));
3480 DEFUN("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3481 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3482 Return the corresponding character code in Big5.
3486 Lisp_Object charset;
3489 CHECK_CHAR_COERCE_INT(character);
3490 BREAKUP_CHAR(XCHAR(character), charset, c1, c2);
3491 if (EQ(charset, Vcharset_chinese_big5_1) ||
3492 EQ(charset, Vcharset_chinese_big5_2)) {
3493 ENCODE_BIG5(XCHARSET_LEADING_BYTE(charset), c1 | 0x80,
3495 return Fcons(make_int(b1), make_int(b2));
3500 /************************************************************************/
3503 /* UCS-4 character codes are implemented as nonnegative integers. */
3505 /************************************************************************/
3507 DEFUN("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3508 Map UCS-4 code CODE to Mule character CHARACTER.
3510 Return T on success, NIL on failure.
3516 CHECK_CHAR(character);
3520 if (c < countof(fcd->ucs_to_mule_table)) {
3521 fcd->ucs_to_mule_table[c] = character;
3527 static Lisp_Object ucs_to_char(unsigned long code)
3529 if (code < countof(fcd->ucs_to_mule_table)) {
3530 return fcd->ucs_to_mule_table[code];
3531 } else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14)) {
3535 c = code % (94 * 94);
3537 (MAKE_CHAR(CHARSET_BY_ATTRIBUTES
3538 (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3539 CHARSET_LEFT_TO_RIGHT),
3540 c / 94 + 33, c % 94 + 33));
3545 DEFUN("ucs-char", Fucs_char, 1, 1, 0, /*
3546 Return Mule character corresponding to UCS code CODE (a positive integer).
3551 return ucs_to_char(XINT(code));
3554 DEFUN("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3555 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3559 /* #### Isn't this gilding the lily? Fput_char_table checks its args.
3560 Fset_char_ucs is more restrictive on index arg, but should
3561 check code arg in a char_table method. */
3562 CHECK_CHAR(character);
3564 return Fput_char_table(character, code, mule_to_ucs_table);
3567 DEFUN("char-ucs", Fchar_ucs, 1, 1, 0, /*
3568 Return the UCS code (a positive integer) corresponding to CHARACTER.
3572 return Fget_char_table(character, mule_to_ucs_table);
3575 /* Decode a UCS-4 character into a buffer. If the lookup fails, use
3576 <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3577 is not found, instead.
3578 #### do something more appropriate (use blob?)
3579 Danger, Will Robinson! Data loss. Should we signal user? */
3580 static void decode_ucs4(unsigned long ch, unsigned_char_dynarr * dst)
3582 Lisp_Object chr = ucs_to_char(ch);
3585 Bufbyte work[MAX_EMCHAR_LEN];
3590 simple_set_charptr_emchar(work, ch) :
3591 non_ascii_set_charptr_emchar(work, ch);
3592 Dynarr_add_many(dst, work, len);
3594 Dynarr_add(dst, LEADING_BYTE_JAPANESE_JISX0208);
3595 Dynarr_add(dst, 34 + 128);
3596 Dynarr_add(dst, 46 + 128);
3600 static unsigned long
3601 mule_char_to_ucs4(Lisp_Object charset, unsigned char h, unsigned char l)
3604 = Fget_char_table(make_char(MAKE_CHAR(charset, h & 127, l & 127)),
3609 } else if ((XCHARSET_DIMENSION(charset) == 2) &&
3610 (XCHARSET_CHARS(charset) == 94)) {
3611 unsigned char final = XCHARSET_FINAL(charset);
3613 if (('@' <= final) && (final < 0x7f)) {
3614 return 0xe00000 + (final - '@') * 94 * 94
3615 + ((h & 127) - 33) * 94 + (l & 127) - 33;
3625 encode_ucs4(Lisp_Object charset,
3626 unsigned char h, unsigned char l, unsigned_char_dynarr * dst)
3628 unsigned long code = mule_char_to_ucs4(charset, h, l);
3629 Dynarr_add(dst, code >> 24);
3630 Dynarr_add(dst, (code >> 16) & 255);
3631 Dynarr_add(dst, (code >> 8) & 255);
3632 Dynarr_add(dst, code & 255);
3636 detect_coding_ucs4(struct detection_state *st, const Extbyte * src,
3637 Lstream_data_count n)
3640 const unsigned char c = *(const unsigned char *)src++;
3641 switch (st->ucs4.in_byte) {
3649 st->ucs4.in_byte = 0;
3655 return CODING_CATEGORY_UCS4_MASK;
3659 decode_coding_ucs4(lstream_t decoding, const Extbyte * src,
3660 unsigned_char_dynarr * dst, Lstream_data_count n)
3662 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3663 unsigned int flags = str->flags;
3664 unsigned int ch = str->ch;
3665 unsigned char counter = str->counter;
3668 const unsigned char c = *(const unsigned char *)src++;
3675 decode_ucs4((ch << 8) | c, dst);
3684 if (counter & CODING_STATE_END)
3685 DECODE_OUTPUT_PARTIAL_CHAR(ch);
3689 str->counter = counter;
3693 encode_coding_ucs4(lstream_t encoding, const Bufbyte * src,
3694 unsigned_char_dynarr * dst, Lstream_data_count n)
3696 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3697 unsigned int flags = str->flags;
3698 unsigned int ch = str->ch;
3699 unsigned char char_boundary = str->iso2022.current_char_boundary;
3700 Lisp_Object charset = str->iso2022.current_charset;
3702 #ifdef ENABLE_COMPOSITE_CHARS
3703 /* flags for handling composite chars. We do a little switcharoo
3704 on the source while we're outputting the composite char. */
3705 unsigned int saved_n = 0;
3706 const unsigned char *saved_src = NULL;
3707 int in_composite = 0;
3713 unsigned char c = *src++;
3715 if (BYTE_ASCII_P(c)) { /* Processing ASCII character */
3717 encode_ucs4(Vcharset_ascii, c, 0, dst);
3719 } else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) { /* Processing Leading Byte */
3721 charset = CHARSET_BY_LEADING_BYTE(c);
3722 if (LEADING_BYTE_PREFIX_P(c))
3725 } else { /* Processing Non-ASCII character */
3727 if (EQ(charset, Vcharset_control_1)) {
3728 encode_ucs4(Vcharset_control_1, c, 0, dst);
3730 switch (XCHARSET_REP_BYTES(charset)) {
3732 encode_ucs4(charset, c, 0, dst);
3735 if (XCHARSET_PRIVATE_P(charset)) {
3736 encode_ucs4(charset, c, 0, dst);
3739 #ifdef ENABLE_COMPOSITE_CHARS
3742 Vcharset_composite)) {
3744 /* #### Bother! We don't know how to
3757 (Vcharset_composite,
3762 composite_char_string
3771 n = XSTRING_LENGTH(lstr);
3774 #endif /* ENABLE_COMPOSITE_CHARS */
3776 encode_ucs4(charset, ch,
3787 encode_ucs4(charset, ch, c,
3802 #ifdef ENABLE_COMPOSITE_CHARS
3807 goto back_to_square_n; /* Wheeeeeeeee ..... */
3809 #endif /* ENABLE_COMPOSITE_CHARS */
3813 str->iso2022.current_char_boundary = char_boundary;
3814 str->iso2022.current_charset = charset;
3816 /* Verbum caro factum est! */
3819 /************************************************************************/
3821 /************************************************************************/
3824 detect_coding_utf8(struct detection_state *st, const Extbyte * src,
3825 Lstream_data_count n)
3828 const unsigned char c = *(const unsigned char *)src++;
3829 switch (st->utf8.in_byte) {
3831 if (c == ISO_CODE_ESC || c == ISO_CODE_SI
3832 || c == ISO_CODE_SO)
3835 st->utf8.in_byte = 5;
3837 st->utf8.in_byte = 4;
3839 st->utf8.in_byte = 3;
3841 st->utf8.in_byte = 2;
3843 st->utf8.in_byte = 1;
3848 if ((c & 0xc0) != 0x80)
3854 return CODING_CATEGORY_UTF8_MASK;
3858 decode_coding_utf8(lstream_t decoding, const Extbyte * src,
3859 unsigned_char_dynarr * dst, Lstream_data_count n)
3861 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3862 unsigned int flags = str->flags;
3863 unsigned int ch = str->ch;
3864 eol_type_t eol_type = str->eol_type;
3865 unsigned char counter = str->counter;
3868 const unsigned char c = *(const unsigned char *)src++;
3874 } else if (c >= 0xf8) {
3877 } else if (c >= 0xf0) {
3880 } else if (c >= 0xe0) {
3883 } else if (c >= 0xc0) {
3887 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3888 decode_ucs4(c, dst);
3892 ch = (ch << 6) | (c & 0x3f);
3893 decode_ucs4(ch, dst);
3898 ch = (ch << 6) | (c & 0x3f);
3901 label_continue_loop:;
3904 if (flags & CODING_STATE_END)
3905 DECODE_OUTPUT_PARTIAL_CHAR(ch);
3909 str->counter = counter;
3913 encode_utf8(Lisp_Object charset,
3914 unsigned char h, unsigned char l, unsigned_char_dynarr * dst)
3916 unsigned long code = mule_char_to_ucs4(charset, h, l);
3918 Dynarr_add(dst, code);
3919 } else if (code <= 0x7ff) {
3920 Dynarr_add(dst, (code >> 6) | 0xc0);
3921 Dynarr_add(dst, (code & 0x3f) | 0x80);
3922 } else if (code <= 0xffff) {
3923 Dynarr_add(dst, (code >> 12) | 0xe0);
3924 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3925 Dynarr_add(dst, (code & 0x3f) | 0x80);
3926 } else if (code <= 0x1fffff) {
3927 Dynarr_add(dst, (code >> 18) | 0xf0);
3928 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3929 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3930 Dynarr_add(dst, (code & 0x3f) | 0x80);
3931 } else if (code <= 0x3ffffff) {
3932 Dynarr_add(dst, (code >> 24) | 0xf8);
3933 Dynarr_add(dst, ((code >> 18) & 0x3f) | 0x80);
3934 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3935 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3936 Dynarr_add(dst, (code & 0x3f) | 0x80);
3938 Dynarr_add(dst, (code >> 30) | 0xfc);
3939 Dynarr_add(dst, ((code >> 24) & 0x3f) | 0x80);
3940 Dynarr_add(dst, ((code >> 18) & 0x3f) | 0x80);
3941 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3942 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3943 Dynarr_add(dst, (code & 0x3f) | 0x80);
3948 encode_coding_utf8(lstream_t encoding, const Bufbyte * src,
3949 unsigned_char_dynarr * dst, Lstream_data_count n)
3951 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3952 unsigned int flags = str->flags;
3953 unsigned int ch = str->ch;
3954 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3955 unsigned char char_boundary = str->iso2022.current_char_boundary;
3956 Lisp_Object charset = str->iso2022.current_charset;
3958 #ifdef ENABLE_COMPOSITE_CHARS
3959 /* flags for handling composite chars. We do a little switcharoo
3960 on the source while we're outputting the composite char. */
3961 unsigned int saved_n = 0;
3962 const unsigned char *saved_src = NULL;
3963 int in_composite = 0;
3966 #endif /* ENABLE_COMPOSITE_CHARS */
3969 unsigned char c = *src++;
3971 if (BYTE_ASCII_P(c)) { /* Processing ASCII character */
3974 if (eol_type != EOL_LF
3975 && eol_type != EOL_AUTODETECT)
3976 Dynarr_add(dst, '\r');
3977 if (eol_type != EOL_CR)
3980 encode_utf8(Vcharset_ascii, c, 0, dst);
3982 } else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) { /* Processing Leading Byte */
3984 charset = CHARSET_BY_LEADING_BYTE(c);
3985 if (LEADING_BYTE_PREFIX_P(c))
3988 } else { /* Processing Non-ASCII character */
3990 if (EQ(charset, Vcharset_control_1)) {
3991 encode_utf8(Vcharset_control_1, c, 0, dst);
3993 switch (XCHARSET_REP_BYTES(charset)) {
3995 encode_utf8(charset, c, 0, dst);
3998 if (XCHARSET_PRIVATE_P(charset)) {
3999 encode_utf8(charset, c, 0, dst);
4002 #ifdef ENABLE_COMPOSITE_CHARS
4005 Vcharset_composite)) {
4007 /* #### Bother! We don't know how to
4016 (Vcharset_composite,
4021 composite_char_string
4030 n = XSTRING_LENGTH(lstr);
4033 #endif /* ENABLE_COMPOSITE_CHARS */
4035 encode_utf8(charset, ch,
4046 encode_utf8(charset, ch, c,
4061 #ifdef ENABLE_COMPOSITE_CHARS
4066 goto back_to_square_n; /* Wheeeeeeeee ..... */
4072 str->iso2022.current_char_boundary = char_boundary;
4073 str->iso2022.current_charset = charset;
4075 /* Verbum caro factum est! */
4078 /************************************************************************/
4079 /* ISO2022 methods */
4080 /************************************************************************/
4082 /* The following note describes the coding system ISO2022 briefly.
4083 Since the intention of this note is to help understand the
4084 functions in this file, some parts are NOT ACCURATE or OVERLY
4085 SIMPLIFIED. For thorough understanding, please refer to the
4086 original document of ISO2022.
4088 ISO2022 provides many mechanisms to encode several character sets
4089 in 7-bit and 8-bit environments. For 7-bit environments, all text
4090 is encoded using bytes less than 128. This may make the encoded
4091 text a little bit longer, but the text passes more easily through
4092 several gateways, some of which strip off MSB (Most Signigant Bit).
4094 There are two kinds of character sets: control character set and
4095 graphic character set. The former contains control characters such
4096 as `newline' and `escape' to provide control functions (control
4097 functions are also provided by escape sequences). The latter
4098 contains graphic characters such as 'A' and '-'. Emacs recognizes
4099 two control character sets and many graphic character sets.
4101 Graphic character sets are classified into one of the following
4102 four classes, according to the number of bytes (DIMENSION) and
4103 number of characters in one dimension (CHARS) of the set:
4104 - DIMENSION1_CHARS94
4105 - DIMENSION1_CHARS96
4106 - DIMENSION2_CHARS94
4107 - DIMENSION2_CHARS96
4109 In addition, each character set is assigned an identification tag,
4110 unique for each set, called "final character" (denoted as <F>
4111 hereafter). The <F> of each character set is decided by ECMA(*)
4112 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4113 (0x30..0x3F are for private use only).
4115 Note (*): ECMA = European Computer Manufacturers Association
4117 Here are examples of graphic character set [NAME(<F>)]:
4118 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4119 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4120 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4121 o DIMENSION2_CHARS96 -- none for the moment
4123 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4124 C0 [0x00..0x1F] -- control character plane 0
4125 GL [0x20..0x7F] -- graphic character plane 0
4126 C1 [0x80..0x9F] -- control character plane 1
4127 GR [0xA0..0xFF] -- graphic character plane 1
4129 A control character set is directly designated and invoked to C0 or
4130 C1 by an escape sequence. The most common case is that:
4131 - ISO646's control character set is designated/invoked to C0, and
4132 - ISO6429's control character set is designated/invoked to C1,
4133 and usually these designations/invocations are omitted in encoded
4134 text. In a 7-bit environment, only C0 can be used, and a control
4135 character for C1 is encoded by an appropriate escape sequence to
4136 fit into the environment. All control characters for C1 are
4137 defined to have corresponding escape sequences.
4139 A graphic character set is at first designated to one of four
4140 graphic registers (G0 through G3), then these graphic registers are
4141 invoked to GL or GR. These designations and invocations can be
4142 done independently. The most common case is that G0 is invoked to
4143 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4144 these invocations and designations are omitted in encoded text.
4145 In a 7-bit environment, only GL can be used.
4147 When a graphic character set of CHARS94 is invoked to GL, codes
4148 0x20 and 0x7F of the GL area work as control characters SPACE and
4149 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4152 There are two ways of invocation: locking-shift and single-shift.
4153 With locking-shift, the invocation lasts until the next different
4154 invocation, whereas with single-shift, the invocation affects the
4155 following character only and doesn't affect the locking-shift
4156 state. Invocations are done by the following control characters or
4159 ----------------------------------------------------------------------
4160 abbrev function cntrl escape seq description
4161 ----------------------------------------------------------------------
4162 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4163 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4164 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4165 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4166 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4167 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4168 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4169 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4170 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4171 ----------------------------------------------------------------------
4172 (*) These are not used by any known coding system.
4174 Control characters for these functions are defined by macros
4175 ISO_CODE_XXX in `coding.h'.
4177 Designations are done by the following escape sequences:
4178 ----------------------------------------------------------------------
4179 escape sequence description
4180 ----------------------------------------------------------------------
4181 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4182 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4183 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4184 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4185 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4186 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4187 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4188 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4189 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4190 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4191 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4192 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4193 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4194 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4195 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4196 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4197 ----------------------------------------------------------------------
4199 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4200 of dimension 1, chars 94, and final character <F>, etc...
4202 Note (*): Although these designations are not allowed in ISO2022,
4203 Emacs accepts them on decoding, and produces them on encoding
4204 CHARS96 character sets in a coding system which is characterized as
4205 7-bit environment, non-locking-shift, and non-single-shift.
4207 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4208 '(' can be omitted. We refer to this as "short-form" hereafter.
4210 Now you may notice that there are a lot of ways for encoding the
4211 same multilingual text in ISO2022. Actually, there exist many
4212 coding systems such as Compound Text (used in X11's inter client
4213 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4214 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4215 localized platforms), and all of these are variants of ISO2022.
4217 In addition to the above, Emacs handles two more kinds of escape
4218 sequences: ISO6429's direction specification and Emacs' private
4219 sequence for specifying character composition.
4221 ISO6429's direction specification takes the following form:
4222 o CSI ']' -- end of the current direction
4223 o CSI '0' ']' -- end of the current direction
4224 o CSI '1' ']' -- start of left-to-right text
4225 o CSI '2' ']' -- start of right-to-left text
4226 The control character CSI (0x9B: control sequence introducer) is
4227 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4229 Character composition specification takes the following form:
4230 o ESC '0' -- start character composition
4231 o ESC '1' -- end character composition
4232 Since these are not standard escape sequences of any ISO standard,
4233 their use with these meanings is restricted to Emacs only. */
4236 reset_iso2022(Lisp_Object coding_system, struct iso2022_decoder *iso)
4240 for (i = 0; i < 4; i++) {
4241 if (!NILP(coding_system))
4243 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET
4246 iso->charset[i] = Qt;
4247 iso->invalid_designated[i] = 0;
4249 iso->esc = ISO_ESC_NOTHING;
4250 iso->esc_bytes_index = 0;
4251 iso->register_left = 0;
4252 iso->register_right = 1;
4253 iso->switched_dir_and_no_valid_charset_yet = 0;
4254 iso->invalid_switch_dir = 0;
4255 iso->output_direction_sequence = 0;
4256 iso->output_literally = 0;
4257 #ifdef ENABLE_COMPOSITE_CHARS
4258 if (iso->composite_chars)
4259 Dynarr_reset(iso->composite_chars);
4263 static int fit_to_be_escape_quoted(unsigned char c)
4279 /* Parse one byte of an ISO2022 escape sequence.
4280 If the result is an invalid escape sequence, return 0 and
4281 do not change anything in STR. Otherwise, if the result is
4282 an incomplete escape sequence, update ISO2022.ESC and
4283 ISO2022.ESC_BYTES and return -1. Otherwise, update
4284 all the state variables (but not ISO2022.ESC_BYTES) and
4287 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4288 or invocation of an invalid character set and treat that as
4289 an unrecognized escape sequence.
4291 ********************************************************************
4293 #### Strategies for error annotation and coding orthogonalization
4295 We really want to separate out a number of things. Conceptually,
4296 there is a nested syntax.
4298 At the top level is the ISO 2022 extension syntax, including charset
4299 designation and invocation, and certain auxiliary controls such as the
4300 ISO 6429 direction specification. These are octet-oriented, with the
4301 single exception (AFAIK) of the "exit Unicode" sequence which uses the
4302 UTF's natural width (1 byte for UTF-7 and UTF-8, 2 bytes for UCS-2 and
4303 UTF-16, and 4 bytes for UCS-4 and UTF-32). This will be treated as a
4304 (deprecated) special case in Unicode processing.
4306 The middle layer is ISO 2022 character interpretation. This will depend
4307 on the current state of the ISO 2022 registers, and assembles octets
4308 into the character's internal representation.
4310 The lowest level is translating system control conventions. At present
4311 this is restricted to newline translation, but one could imagine doing
4312 tab conversion or line wrapping here. "Escape from Unicode" processing
4313 would be done at this level.
4315 At each level the parser will verify the syntax. In the case of a
4316 syntax error or warning (such as a redundant escape sequence that affects
4317 no characters), the parser will take some action, typically inserting the
4318 erroneous octets directly into the output and creating an annotation
4319 which can be used by higher level I/O to mark the affected region.
4321 This should make it possible to do something sensible about separating
4322 newline convention processing from character construction, and about
4323 preventing ISO 2022 escape sequences from being recognized
4326 The basic strategy will be to have octet classification tables, and
4327 switch processing according to the table entry.
4329 It's possible that, by doing the processing with tables of functions or
4330 the like, the parser can be used for both detection and translation. */
4333 parse_iso2022_esc(Lisp_Object codesys, struct iso2022_decoder *iso,
4334 unsigned char c, unsigned int *flags,
4335 int check_invalid_charsets)
4337 /* (1) If we're at the end of a designation sequence, CS is the
4338 charset being designated and REG is the register to designate
4341 (2) If we're at the end of a locking-shift sequence, REG is
4342 the register to invoke and HALF (0 == left, 1 == right) is
4343 the half to invoke it into.
4345 (3) If we're at the end of a single-shift sequence, REG is
4346 the register to invoke. */
4347 Lisp_Object cs = Qnil;
4350 /* NOTE: This code does goto's all over the fucking place.
4351 The reason for this is that we're basically implementing
4352 a state machine here, and hierarchical languages like C
4353 don't really provide a clean way of doing this. */
4355 if (!(*flags & CODING_STATE_ESCAPE))
4356 /* At beginning of escape sequence; we need to reset our
4357 escape-state variables. */
4358 iso->esc = ISO_ESC_NOTHING;
4360 iso->output_literally = 0;
4361 iso->output_direction_sequence = 0;
4364 case ISO_ESC_NOTHING:
4365 iso->esc_bytes_index = 0;
4367 case ISO_CODE_ESC: /* Start escape sequence */
4368 *flags |= CODING_STATE_ESCAPE;
4372 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4373 *flags |= CODING_STATE_ESCAPE;
4374 iso->esc = ISO_ESC_5_11;
4377 case ISO_CODE_SO: /* locking shift 1 */
4381 case ISO_CODE_SI: /* locking shift 0 */
4386 case ISO_CODE_SS2: /* single shift */
4389 case ISO_CODE_SS3: /* single shift */
4393 default: /* Other control characters */
4399 /**** single shift ****/
4401 case 'N': /* single shift 2 */
4404 case 'O': /* single shift 3 */
4408 /**** locking shift ****/
4410 case '~': /* locking shift 1 right */
4414 case 'n': /* locking shift 2 */
4418 case '}': /* locking shift 2 right */
4422 case 'o': /* locking shift 3 */
4426 case '|': /* locking shift 3 right */
4431 #ifdef ENABLE_COMPOSITE_CHARS
4432 /**** composite ****/
4435 iso->esc = ISO_ESC_START_COMPOSITE;
4436 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4437 CODING_STATE_COMPOSITE;
4441 iso->esc = ISO_ESC_END_COMPOSITE;
4442 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4443 ~CODING_STATE_COMPOSITE;
4445 #endif /* ENABLE_COMPOSITE_CHARS */
4447 /**** directionality ****/
4450 iso->esc = ISO_ESC_5_11;
4453 /**** designation ****/
4455 case '$': /* multibyte charset prefix */
4456 iso->esc = ISO_ESC_2_4;
4460 if (0x28 <= c && c <= 0x2F) {
4462 (enum iso_esc_flag)(c - 0x28 + ISO_ESC_2_8);
4466 /* This function is called with CODESYS equal to nil when
4467 doing coding-system detection. */
4469 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
4470 && fit_to_be_escape_quoted(c)) {
4471 iso->esc = ISO_ESC_LITERAL;
4472 *flags &= CODING_STATE_ISO2022_LOCK;
4480 /**** directionality ****/
4482 case ISO_ESC_5_11: /* ISO6429 direction control */
4485 (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4486 goto directionality;
4489 iso->esc = ISO_ESC_5_11_0;
4491 iso->esc = ISO_ESC_5_11_1;
4493 iso->esc = ISO_ESC_5_11_2;
4498 case ISO_ESC_5_11_0:
4501 (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4502 goto directionality;
4506 case ISO_ESC_5_11_1:
4509 (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4510 goto directionality;
4514 case ISO_ESC_5_11_2:
4517 (*flags & CODING_STATE_ISO2022_LOCK) |
4519 goto directionality;
4524 iso->esc = ISO_ESC_DIRECTIONALITY;
4525 /* Various junk here to attempt to preserve the direction
4526 sequences literally in the text if they would otherwise be
4527 swallowed due to invalid designations that don't show up as
4528 actual charset changes in the text. */
4529 if (iso->invalid_switch_dir) {
4530 /* We already inserted a direction switch literally into
4531 the text. We assume (#### this may not be right)
4532 that the next direction switch is the one going the
4533 other way, and we need to output that literally as
4535 iso->output_literally = 1;
4536 iso->invalid_switch_dir = 0;
4540 /* If we are in the thrall of an invalid designation,
4541 then stick the directionality sequence literally into
4542 the output stream so it ends up in the original text
4544 for (jj = 0; jj < 4; jj++)
4545 if (iso->invalid_designated[jj])
4548 iso->output_literally = 1;
4549 iso->invalid_switch_dir = 1;
4551 /* Indicate that we haven't yet seen a valid
4552 designation, so that if a switch-dir is
4553 directly followed by an invalid designation,
4554 both get inserted literally. */
4555 iso->switched_dir_and_no_valid_charset_yet = 1;
4559 /**** designation ****/
4562 if (0x28 <= c && c <= 0x2F) {
4564 (enum iso_esc_flag)(c - 0x28 + ISO_ESC_2_4_8);
4567 if (0x40 <= c && c <= 0x42) {
4568 cs = CHARSET_BY_ATTRIBUTES(CHARSET_TYPE_94X94, c,
4569 *flags & CODING_STATE_R2L ?
4570 CHARSET_RIGHT_TO_LEFT :
4571 CHARSET_LEFT_TO_RIGHT);
4588 case ISO_ESC_2_4_10:
4589 case ISO_ESC_2_4_11:
4590 case ISO_ESC_2_4_12:
4591 case ISO_ESC_2_4_13:
4592 case ISO_ESC_2_4_14:
4593 case ISO_ESC_2_4_15:
4594 case ISO_ESC_SINGLE_SHIFT:
4595 case ISO_ESC_LOCKING_SHIFT:
4596 case ISO_ESC_DESIGNATE:
4597 case ISO_ESC_DIRECTIONALITY:
4598 case ISO_ESC_LITERAL:
4603 if (c < '0' || c > '~')
4604 return 0; /* bad final byte */
4606 if (iso->esc >= ISO_ESC_2_8 && iso->esc <= ISO_ESC_2_15) {
4607 type = ((iso->esc >= ISO_ESC_2_12) ?
4608 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4609 reg = (iso->esc - ISO_ESC_2_8) & 3;
4610 } else if (iso->esc >= ISO_ESC_2_4_8 &&
4611 iso->esc <= ISO_ESC_2_4_15) {
4612 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4613 CHARSET_TYPE_96X96 :
4614 CHARSET_TYPE_94X94);
4615 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4617 /* Can this ever be reached? -slb */
4622 cs = CHARSET_BY_ATTRIBUTES(type, c,
4623 *flags & CODING_STATE_R2L ?
4624 CHARSET_RIGHT_TO_LEFT :
4625 CHARSET_LEFT_TO_RIGHT);
4631 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char)c;
4635 if (check_invalid_charsets && !CHARSETP(iso->charset[reg]))
4636 /* can't invoke something that ain't there. */
4638 iso->esc = ISO_ESC_SINGLE_SHIFT;
4639 *flags &= CODING_STATE_ISO2022_LOCK;
4641 *flags |= CODING_STATE_SS2;
4643 *flags |= CODING_STATE_SS3;
4647 if (check_invalid_charsets && !CHARSETP(iso->charset[reg]))
4648 /* can't invoke something that ain't there. */
4651 iso->register_right = reg;
4653 iso->register_left = reg;
4654 *flags &= CODING_STATE_ISO2022_LOCK;
4655 iso->esc = ISO_ESC_LOCKING_SHIFT;
4659 if (NILP(cs) && check_invalid_charsets) {
4660 iso->invalid_designated[reg] = 1;
4661 iso->charset[reg] = Vcharset_ascii;
4662 iso->esc = ISO_ESC_DESIGNATE;
4663 *flags &= CODING_STATE_ISO2022_LOCK;
4664 iso->output_literally = 1;
4665 if (iso->switched_dir_and_no_valid_charset_yet) {
4666 /* We encountered a switch-direction followed by an
4667 invalid designation. Ensure that the switch-direction
4668 gets outputted; otherwise it will probably get eaten
4669 when the text is written out again. */
4670 iso->switched_dir_and_no_valid_charset_yet = 0;
4671 iso->output_direction_sequence = 1;
4672 /* And make sure that the switch-dir going the other
4673 way gets outputted, as well. */
4674 iso->invalid_switch_dir = 1;
4678 /* This function is called with CODESYS equal to nil when
4679 doing coding-system detection. */
4680 if (!NILP(codesys)) {
4681 charset_conversion_spec_dynarr *dyn =
4682 XCODING_SYSTEM(codesys)->iso2022.input_conv;
4687 for (i = 0; i < Dynarr_length(dyn); i++) {
4688 struct charset_conversion_spec *spec =
4690 if (EQ(cs, spec->from_charset))
4691 cs = spec->to_charset;
4696 iso->charset[reg] = cs;
4697 iso->esc = ISO_ESC_DESIGNATE;
4698 *flags &= CODING_STATE_ISO2022_LOCK;
4699 if (iso->invalid_designated[reg]) {
4700 iso->invalid_designated[reg] = 0;
4701 iso->output_literally = 1;
4703 if (iso->switched_dir_and_no_valid_charset_yet)
4704 iso->switched_dir_and_no_valid_charset_yet = 0;
4709 detect_coding_iso2022(struct detection_state *st, const Extbyte * src,
4710 Lstream_data_count n)
4714 /* #### There are serious deficiencies in the recognition mechanism
4715 here. This needs to be much smarter if it's going to cut it.
4716 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4717 it should be detected as Latin-1.
4718 All the ISO2022 stuff in this file should be synced up with the
4719 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4720 Perhaps we should wait till R2L works in FSF Emacs? */
4722 if (!st->iso2022.initted) {
4723 reset_iso2022(Qnil, &st->iso2022.iso);
4724 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4725 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4726 CODING_CATEGORY_ISO_8_1_MASK |
4727 CODING_CATEGORY_ISO_8_2_MASK |
4728 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4729 st->iso2022.flags = 0;
4730 st->iso2022.high_byte_count = 0;
4731 st->iso2022.saw_single_shift = 0;
4732 st->iso2022.initted = 1;
4735 mask = st->iso2022.mask;
4738 const unsigned char c = *(const unsigned char *)src++;
4740 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4741 st->iso2022.high_byte_count++;
4743 if (st->iso2022.high_byte_count
4744 && !st->iso2022.saw_single_shift) {
4745 if (st->iso2022.high_byte_count & 1)
4746 /* odd number of high bytes; assume not iso-8-2 */
4747 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4749 st->iso2022.high_byte_count = 0;
4750 st->iso2022.saw_single_shift = 0;
4752 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4754 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4755 && (BYTE_C0_P(c) || BYTE_C1_P(c))) { /* control chars */
4757 /* Allow and ignore control characters that you might
4758 reasonably see in a text file */
4763 case 8: /* backspace */
4764 case 11: /* vertical tab */
4765 case 12: /* form feed */
4766 case 26: /* MS-DOS C-z junk */
4767 case 31: /* '^_' -- for info */
4768 goto label_continue_loop;
4775 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P(c)
4777 if (parse_iso2022_esc(Qnil, &st->iso2022.iso, c,
4778 &st->iso2022.flags, 0)) {
4779 switch (st->iso2022.iso.esc) {
4780 case ISO_ESC_DESIGNATE:
4781 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4782 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4784 case ISO_ESC_LOCKING_SHIFT:
4785 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4786 goto ran_out_of_chars;
4787 case ISO_ESC_SINGLE_SHIFT:
4788 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4789 st->iso2022.saw_single_shift = 1;
4793 case ISO_ESC_NOTHING:
4806 case ISO_ESC_2_4_10:
4807 case ISO_ESC_2_4_11:
4808 case ISO_ESC_2_4_12:
4809 case ISO_ESC_2_4_13:
4810 case ISO_ESC_2_4_14:
4811 case ISO_ESC_2_4_15:
4813 case ISO_ESC_5_11_0:
4814 case ISO_ESC_5_11_1:
4815 case ISO_ESC_5_11_2:
4816 case ISO_ESC_DIRECTIONALITY:
4817 case ISO_ESC_LITERAL:
4823 goto ran_out_of_chars;
4826 label_continue_loop:;
4833 static int postprocess_iso2022_mask(int mask)
4835 /* #### kind of cheesy */
4836 /* If seven-bit ISO is allowed, then assume that the encoding is
4837 entirely seven-bit and turn off the eight-bit ones. */
4838 if (mask & CODING_CATEGORY_ISO_7_MASK)
4839 mask &= ~(CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4840 CODING_CATEGORY_ISO_8_1_MASK |
4841 CODING_CATEGORY_ISO_8_2_MASK);
4845 /* If FLAGS is a null pointer or specifies right-to-left motion,
4846 output a switch-dir-to-left-to-right sequence to DST.
4847 Also update FLAGS if it is not a null pointer.
4848 If INTERNAL_P is set, we are outputting in internal format and
4849 need to handle the CSI differently. */
4852 restore_left_to_right_direction(Lisp_Coding_System * codesys,
4853 unsigned_char_dynarr * dst,
4854 unsigned int *flags, int internal_p)
4856 if (!flags || (*flags & CODING_STATE_R2L)) {
4857 if (CODING_SYSTEM_ISO2022_SEVEN(codesys)) {
4858 Dynarr_add(dst, ISO_CODE_ESC);
4859 Dynarr_add(dst, '[');
4860 } else if (internal_p)
4861 DECODE_ADD_BINARY_CHAR(ISO_CODE_CSI, dst);
4863 Dynarr_add(dst, ISO_CODE_CSI);
4864 Dynarr_add(dst, '0');
4865 Dynarr_add(dst, ']');
4867 *flags &= ~CODING_STATE_R2L;
4871 /* If FLAGS is a null pointer or specifies a direction different from
4872 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4873 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4874 sequence to DST. Also update FLAGS if it is not a null pointer.
4875 If INTERNAL_P is set, we are outputting in internal format and
4876 need to handle the CSI differently. */
4879 ensure_correct_direction(int direction, Lisp_Coding_System * codesys,
4880 unsigned_char_dynarr * dst, unsigned int *flags,
4883 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4884 direction == CHARSET_LEFT_TO_RIGHT)
4885 restore_left_to_right_direction(codesys, dst, flags,
4887 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429(codesys)
4888 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4889 direction == CHARSET_RIGHT_TO_LEFT) {
4890 if (CODING_SYSTEM_ISO2022_SEVEN(codesys)) {
4891 Dynarr_add(dst, ISO_CODE_ESC);
4892 Dynarr_add(dst, '[');
4893 } else if (internal_p)
4894 DECODE_ADD_BINARY_CHAR(ISO_CODE_CSI, dst);
4896 Dynarr_add(dst, ISO_CODE_CSI);
4897 Dynarr_add(dst, '2');
4898 Dynarr_add(dst, ']');
4900 *flags |= CODING_STATE_R2L;
4904 /* Convert ISO2022-format data to internal format. */
4907 decode_coding_iso2022(lstream_t decoding, const Extbyte * src,
4908 unsigned_char_dynarr * dst, Lstream_data_count n)
4910 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
4911 unsigned int flags = str->flags;
4912 unsigned int ch = str->ch;
4913 eol_type_t eol_type = str->eol_type;
4914 #ifdef ENABLE_COMPOSITE_CHARS
4915 unsigned_char_dynarr *real_dst = dst;
4917 Lisp_Object coding_system;
4919 XSETCODING_SYSTEM(coding_system, str->codesys);
4921 #ifdef ENABLE_COMPOSITE_CHARS
4922 if (flags & CODING_STATE_COMPOSITE)
4923 dst = str->iso2022.composite_chars;
4924 #endif /* ENABLE_COMPOSITE_CHARS */
4927 const unsigned char c = *(const unsigned char *)src++;
4928 if (flags & CODING_STATE_ESCAPE) {
4929 /* Within ESC sequence */
4930 int retval = parse_iso2022_esc(
4931 coding_system, &str->iso2022, c, &flags, 1);
4934 switch (str->iso2022.esc) {
4935 #ifdef ENABLE_COMPOSITE_CHARS
4936 case ISO_ESC_START_COMPOSITE:
4937 if (str->iso2022.composite_chars)
4938 Dynarr_reset(str->iso2022.
4941 str->iso2022.composite_chars =
4942 Dynarr_new(unsigned_char);
4943 dst = str->iso2022.composite_chars;
4945 case ISO_ESC_END_COMPOSITE:
4947 Bufbyte comstr[MAX_EMCHAR_LEN];
4950 lookup_composite_char
4951 (Dynarr_atp(dst, 0),
4952 Dynarr_length(dst));
4955 set_charptr_emchar(comstr,
4957 Dynarr_add_many(dst, comstr,
4961 #endif /* ENABLE_COMPOSITE_CHARS */
4963 case ISO_ESC_LITERAL:
4964 DECODE_ADD_BINARY_CHAR(c, dst);
4967 case ISO_ESC_NOTHING:
4980 case ISO_ESC_2_4_10:
4981 case ISO_ESC_2_4_11:
4982 case ISO_ESC_2_4_12:
4983 case ISO_ESC_2_4_13:
4984 case ISO_ESC_2_4_14:
4985 case ISO_ESC_2_4_15:
4987 case ISO_ESC_5_11_0:
4988 case ISO_ESC_5_11_1:
4989 case ISO_ESC_5_11_2:
4990 case ISO_ESC_SINGLE_SHIFT:
4991 case ISO_ESC_LOCKING_SHIFT:
4992 case ISO_ESC_DESIGNATE:
4993 case ISO_ESC_DIRECTIONALITY:
4996 /* Everything else handled already */
5001 /* Attempted error recovery. */
5002 if (str->iso2022.output_direction_sequence)
5003 ensure_correct_direction(flags &
5005 CHARSET_RIGHT_TO_LEFT :
5006 CHARSET_LEFT_TO_RIGHT,
5007 str->codesys, dst, 0,
5009 /* More error recovery. */
5010 if (!retval || str->iso2022.output_literally) {
5011 /* Output the (possibly invalid) sequence */
5013 for (i = 0; i < str->iso2022.esc_bytes_index;
5015 DECODE_ADD_BINARY_CHAR(str->iso2022.
5018 flags &= CODING_STATE_ISO2022_LOCK;
5020 n++, src--; /* Repeat the loop with the same character. */
5022 /* No sense in reprocessing the final byte of the
5023 escape sequence; it could mess things up anyway.
5025 DECODE_ADD_BINARY_CHAR(c, dst);
5029 } else if (BYTE_C0_P(c) || BYTE_C1_P(c)) { /* Control characters */
5031 /***** Error-handling *****/
5033 /* If we were in the middle of a character, dump out the
5034 partial character. */
5035 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5037 /* If we just saw a single-shift character, dump it out.
5038 This may dump out the wrong sort of single-shift character,
5039 but least it will give an indication that something went
5041 if (flags & CODING_STATE_SS2) {
5042 DECODE_ADD_BINARY_CHAR(ISO_CODE_SS2, dst);
5043 flags &= ~CODING_STATE_SS2;
5045 if (flags & CODING_STATE_SS3) {
5046 DECODE_ADD_BINARY_CHAR(ISO_CODE_SS3, dst);
5047 flags &= ~CODING_STATE_SS3;
5050 /***** Now handle the control characters. *****/
5053 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5055 flags &= CODING_STATE_ISO2022_LOCK;
5057 if (!parse_iso2022_esc
5058 (coding_system, &str->iso2022, c, &flags, 1))
5059 DECODE_ADD_BINARY_CHAR(c, dst);
5060 } else { /* Graphic characters */
5061 Lisp_Object charset;
5065 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5067 /* Now determine the charset. */
5068 reg = ((flags & CODING_STATE_SS2) ? 2
5069 : (flags & CODING_STATE_SS3) ? 3
5070 : !BYTE_ASCII_P(c) ? str->iso2022.register_right
5071 : str->iso2022.register_left);
5072 charset = str->iso2022.charset[reg];
5074 /* Error checking: */
5075 if (!CHARSETP(charset)
5076 || str->iso2022.invalid_designated[reg]
5078 (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5079 && XCHARSET_CHARS(charset) == 94))
5080 /* Mrmph. We are trying to invoke a register that has no
5081 or an invalid charset in it, or trying to add a character
5082 outside the range of the charset. Insert that char literally
5083 to preserve it for the output. */
5085 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5086 DECODE_ADD_BINARY_CHAR(c, dst);
5090 /* Things are probably hunky-dorey. */
5092 /* Fetch reverse charset, maybe. */
5093 if (((flags & CODING_STATE_R2L) &&
5094 XCHARSET_DIRECTION(charset) ==
5095 CHARSET_LEFT_TO_RIGHT)
5096 || (!(flags & CODING_STATE_R2L)
5097 && XCHARSET_DIRECTION(charset) ==
5098 CHARSET_RIGHT_TO_LEFT)) {
5099 Lisp_Object new_charset =
5100 XCHARSET_REVERSE_DIRECTION_CHARSET
5102 if (!NILP(new_charset))
5103 charset = new_charset;
5106 lb = XCHARSET_LEADING_BYTE(charset);
5107 switch (XCHARSET_REP_BYTES(charset)) {
5109 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5110 Dynarr_add(dst, c & 0x7F);
5113 case 2: /* one-byte official */
5114 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5115 Dynarr_add(dst, lb);
5116 Dynarr_add(dst, c | 0x80);
5119 case 3: /* one-byte private or two-byte official */
5120 if (XCHARSET_PRIVATE_P(charset)) {
5121 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5123 PRE_LEADING_BYTE_PRIVATE_1);
5124 Dynarr_add(dst, lb);
5125 Dynarr_add(dst, c | 0x80);
5128 Dynarr_add(dst, lb);
5139 default: /* two-byte private */
5142 PRE_LEADING_BYTE_PRIVATE_2);
5143 Dynarr_add(dst, lb);
5144 Dynarr_add(dst, ch | 0x80);
5145 Dynarr_add(dst, c | 0x80);
5153 flags &= CODING_STATE_ISO2022_LOCK;
5156 label_continue_loop:;
5159 if (flags & CODING_STATE_END)
5160 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5166 /***** ISO2022 encoder *****/
5168 /* Designate CHARSET into register REG. */
5171 iso2022_designate(Lisp_Object charset, unsigned char reg,
5172 encoding_stream_t str, unsigned_char_dynarr * dst)
5174 static const char inter94[] = "()*+";
5175 static const char inter96[] = ",-./";
5177 unsigned char final;
5178 Lisp_Object old_charset = str->iso2022.charset[reg];
5180 str->iso2022.charset[reg] = charset;
5181 if (!CHARSETP(charset))
5182 /* charset might be an initial nil or t. */
5184 type = XCHARSET_TYPE(charset);
5185 final = XCHARSET_FINAL(charset);
5186 if (!str->iso2022.force_charset_on_output[reg] &&
5187 CHARSETP(old_charset) &&
5188 XCHARSET_TYPE(old_charset) == type &&
5189 XCHARSET_FINAL(old_charset) == final)
5192 str->iso2022.force_charset_on_output[reg] = 0;
5195 charset_conversion_spec_dynarr *dyn =
5196 str->codesys->iso2022.output_conv;
5201 for (i = 0; i < Dynarr_length(dyn); i++) {
5202 struct charset_conversion_spec *spec =
5204 if (EQ(charset, spec->from_charset))
5205 charset = spec->to_charset;
5210 Dynarr_add(dst, ISO_CODE_ESC);
5212 case CHARSET_TYPE_94:
5213 Dynarr_add(dst, inter94[reg]);
5215 case CHARSET_TYPE_96:
5216 Dynarr_add(dst, inter96[reg]);
5218 case CHARSET_TYPE_94X94:
5219 Dynarr_add(dst, '$');
5220 if (reg != 0 || !(CODING_SYSTEM_ISO2022_SHORT(str->codesys))
5221 || final < '@' || final > 'B')
5222 Dynarr_add(dst, inter94[reg]);
5224 case CHARSET_TYPE_96X96:
5225 Dynarr_add(dst, '$');
5226 Dynarr_add(dst, inter96[reg]);
5231 Dynarr_add(dst, final);
5235 ensure_normal_shift(encoding_stream_t str, unsigned_char_dynarr * dst)
5237 if (str->iso2022.register_left != 0) {
5238 Dynarr_add(dst, ISO_CODE_SI);
5239 str->iso2022.register_left = 0;
5244 ensure_shift_out(encoding_stream_t str, unsigned_char_dynarr * dst)
5246 if (str->iso2022.register_left != 1) {
5247 Dynarr_add(dst, ISO_CODE_SO);
5248 str->iso2022.register_left = 1;
5252 /* Convert internally-formatted data to ISO2022 format. */
5255 encode_coding_iso2022(lstream_t encoding, const Bufbyte * src,
5256 unsigned_char_dynarr * dst, Lstream_data_count n)
5258 unsigned char charmask, c;
5259 unsigned char char_boundary;
5260 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
5261 unsigned int flags = str->flags;
5262 unsigned int ch = str->ch;
5263 Lisp_Coding_System *codesys = str->codesys;
5264 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
5266 Lisp_Object charset;
5269 #ifdef ENABLE_COMPOSITE_CHARS
5270 /* flags for handling composite chars. We do a little switcharoo
5271 on the source while we're outputting the composite char. */
5272 unsigned int saved_n = 0;
5273 const unsigned char *saved_src = NULL;
5274 int in_composite = 0;
5275 #endif /* ENABLE_COMPOSITE_CHARS */
5277 char_boundary = str->iso2022.current_char_boundary;
5278 charset = str->iso2022.current_charset;
5279 half = str->iso2022.current_half;
5281 #ifdef ENABLE_COMPOSITE_CHARS
5287 if (BYTE_ASCII_P(c)) { /* Processing ASCII character */
5290 restore_left_to_right_direction(codesys, dst, &flags,
5293 /* Make sure G0 contains ASCII */
5294 if ((c > ' ' && c < ISO_CODE_DEL) ||
5295 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL(codesys)) {
5296 ensure_normal_shift(str, dst);
5297 iso2022_designate(Vcharset_ascii, 0, str, dst);
5300 /* If necessary, restore everything to the default state
5303 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL(codesys))) {
5304 restore_left_to_right_direction(codesys, dst,
5307 ensure_normal_shift(str, dst);
5309 for (i = 0; i < 4; i++) {
5310 Lisp_Object initial_charset =
5311 CODING_SYSTEM_ISO2022_INITIAL_CHARSET
5313 iso2022_designate(initial_charset, i,
5318 if (eol_type != EOL_LF
5319 && eol_type != EOL_AUTODETECT)
5320 Dynarr_add(dst, '\r');
5321 if (eol_type != EOL_CR)
5324 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
5325 && fit_to_be_escape_quoted(c))
5326 Dynarr_add(dst, ISO_CODE_ESC);
5332 else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) { /* Processing Leading Byte */
5334 charset = CHARSET_BY_LEADING_BYTE(c);
5335 if (LEADING_BYTE_PREFIX_P(c))
5337 else if (!EQ(charset, Vcharset_control_1)
5338 #ifdef ENABLE_COMPOSITE_CHARS
5339 && !EQ(charset, Vcharset_composite)
5344 ensure_correct_direction(XCHARSET_DIRECTION
5348 /* Now determine which register to use. */
5350 for (i = 0; i < 4; i++) {
5351 if (EQ(charset, str->iso2022.charset[i])
5353 CODING_SYSTEM_ISO2022_INITIAL_CHARSET
5361 if (XCHARSET_GRAPHIC(charset) != 0) {
5363 (str->iso2022.charset[1])
5365 (!CODING_SYSTEM_ISO2022_SEVEN
5368 CODING_SYSTEM_ISO2022_LOCK_SHIFT
5385 iso2022_designate(charset, reg, str, dst);
5387 /* Now invoke that register. */
5390 ensure_normal_shift(str, dst);
5395 if (CODING_SYSTEM_ISO2022_SEVEN
5397 ensure_shift_out(str, dst);
5404 if (CODING_SYSTEM_ISO2022_SEVEN
5406 Dynarr_add(dst, ISO_CODE_ESC);
5407 Dynarr_add(dst, 'N');
5410 Dynarr_add(dst, ISO_CODE_SS2);
5416 if (CODING_SYSTEM_ISO2022_SEVEN
5418 Dynarr_add(dst, ISO_CODE_ESC);
5419 Dynarr_add(dst, 'O');
5422 Dynarr_add(dst, ISO_CODE_SS3);
5432 } else { /* Processing Non-ASCII character */
5433 charmask = (half == 0 ? 0x7F : 0xFF);
5435 if (EQ(charset, Vcharset_control_1)) {
5436 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
5437 && fit_to_be_escape_quoted(c))
5438 Dynarr_add(dst, ISO_CODE_ESC);
5439 /* you asked for it ... */
5440 Dynarr_add(dst, c - 0x20);
5442 switch (XCHARSET_REP_BYTES(charset)) {
5444 Dynarr_add(dst, c & charmask);
5447 if (XCHARSET_PRIVATE_P(charset)) {
5448 Dynarr_add(dst, c & charmask);
5451 #ifdef ENABLE_COMPOSITE_CHARS
5454 Vcharset_composite)) {
5456 /* #### Bother! We don't know how to
5463 (Vcharset_composite,
5468 composite_char_string
5477 n = XSTRING_LENGTH(lstr);
5480 Dynarr_add(dst, '0'); /* start composing */
5483 #endif /* ENABLE_COMPOSITE_CHARS */
5500 Dynarr_add(dst, ch & charmask);
5501 Dynarr_add(dst, c & charmask);
5515 #ifdef ENABLE_COMPOSITE_CHARS
5520 Dynarr_add(dst, ISO_CODE_ESC);
5521 Dynarr_add(dst, '1'); /* end composing */
5522 goto back_to_square_n; /* Wheeeeeeeee ..... */
5524 #endif /* ENABLE_COMPOSITE_CHARS */
5526 if (char_boundary && flags & CODING_STATE_END) {
5527 restore_left_to_right_direction(codesys, dst, &flags, 0);
5528 ensure_normal_shift(str, dst);
5529 for (i = 0; i < 4; i++) {
5530 Lisp_Object initial_charset =
5531 CODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, i);
5532 iso2022_designate(initial_charset, i, str, dst);
5538 str->iso2022.current_char_boundary = char_boundary;
5539 str->iso2022.current_charset = charset;
5540 str->iso2022.current_half = half;
5542 /* Verbum caro factum est! */
5546 /************************************************************************/
5547 /* No-conversion methods */
5548 /************************************************************************/
5550 /* This is used when reading in "binary" files -- i.e. files that may
5551 contain all 256 possible byte values and that are not to be
5552 interpreted as being in any particular decoding. */
5554 decode_coding_no_conversion(lstream_t decoding, const Extbyte * src,
5555 unsigned_char_dynarr * dst, Lstream_data_count n)
5557 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
5558 unsigned int flags = str->flags;
5559 unsigned int ch = str->ch;
5560 eol_type_t eol_type = str->eol_type;
5563 const unsigned char c = *(const unsigned char *)src++;
5565 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5566 DECODE_ADD_BINARY_CHAR(c, dst);
5567 label_continue_loop:;
5570 DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
5577 encode_coding_no_conversion(lstream_t encoding, const Bufbyte * src,
5578 unsigned_char_dynarr * dst, Lstream_data_count n)
5581 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
5582 unsigned int flags = str->flags;
5583 unsigned int ch = str->ch;
5584 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
5589 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5590 Dynarr_add(dst, '\r');
5591 if (eol_type != EOL_CR)
5592 Dynarr_add(dst, '\n');
5594 } else if (BYTE_ASCII_P(c)) {
5597 } else if (BUFBYTE_LEADING_BYTE_P(c)) {
5599 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5600 c == LEADING_BYTE_CONTROL_1)
5603 Dynarr_add(dst, '~'); /* untranslatable character */
5605 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5607 else if (ch == LEADING_BYTE_CONTROL_1) {
5609 Dynarr_add(dst, c - 0x20);
5611 /* else it should be the second or third byte of an
5612 untranslatable character, so ignore it */
5621 /************************************************************************/
5622 /* Initialization */
5623 /************************************************************************/
5625 void syms_of_file_coding(void)
5627 INIT_LRECORD_IMPLEMENTATION(coding_system);
5629 DEFERROR_STANDARD(Qcoding_system_error, Qio_error);
5631 DEFSUBR(Fcoding_system_p);
5632 DEFSUBR(Ffind_coding_system);
5633 DEFSUBR(Fget_coding_system);
5634 DEFSUBR(Fcoding_system_list);
5635 DEFSUBR(Fcoding_system_name);
5636 DEFSUBR(Fmake_coding_system);
5637 DEFSUBR(Fcopy_coding_system);
5638 DEFSUBR(Fcoding_system_canonical_name_p);
5639 DEFSUBR(Fcoding_system_alias_p);
5640 DEFSUBR(Fcoding_system_aliasee);
5641 DEFSUBR(Fdefine_coding_system_alias);
5642 DEFSUBR(Fsubsidiary_coding_system);
5644 DEFSUBR(Fcoding_system_type);
5645 DEFSUBR(Fcoding_system_doc_string);
5647 DEFSUBR(Fcoding_system_charset);
5649 DEFSUBR(Fcoding_system_property);
5651 DEFSUBR(Fcoding_category_list);
5652 DEFSUBR(Fset_coding_priority_list);
5653 DEFSUBR(Fcoding_priority_list);
5654 DEFSUBR(Fset_coding_category_system);
5655 DEFSUBR(Fcoding_category_system);
5657 DEFSUBR(Fdetect_coding_region);
5658 DEFSUBR(Fdecode_coding_region);
5659 DEFSUBR(Fencode_coding_region);
5661 DEFSUBR(Fdecode_shift_jis_char);
5662 DEFSUBR(Fencode_shift_jis_char);
5663 DEFSUBR(Fdecode_big5_char);
5664 DEFSUBR(Fencode_big5_char);
5665 DEFSUBR(Fset_ucs_char);
5667 DEFSUBR(Fset_char_ucs);
5670 defsymbol(&Qcoding_systemp, "coding-system-p");
5671 defsymbol(&Qno_conversion, "no-conversion");
5672 defsymbol(&Qraw_text, "raw-text");
5674 defsymbol(&Qbig5, "big5");
5675 defsymbol(&Qshift_jis, "shift-jis");
5676 defsymbol(&Qucs4, "ucs-4");
5677 defsymbol(&Qutf8, "utf-8");
5678 defsymbol(&Qccl, "ccl");
5679 defsymbol(&Qiso2022, "iso2022");
5681 defsymbol(&Qmnemonic, "mnemonic");
5682 defsymbol(&Qeol_type, "eol-type");
5683 defsymbol(&Qpost_read_conversion, "post-read-conversion");
5684 defsymbol(&Qpre_write_conversion, "pre-write-conversion");
5686 defsymbol(&Qcr, "cr");
5687 defsymbol(&Qlf, "lf");
5688 defsymbol(&Qcrlf, "crlf");
5689 defsymbol(&Qeol_cr, "eol-cr");
5690 defsymbol(&Qeol_lf, "eol-lf");
5691 defsymbol(&Qeol_crlf, "eol-crlf");
5693 defsymbol(&Qcharset_g0, "charset-g0");
5694 defsymbol(&Qcharset_g1, "charset-g1");
5695 defsymbol(&Qcharset_g2, "charset-g2");
5696 defsymbol(&Qcharset_g3, "charset-g3");
5697 defsymbol(&Qforce_g0_on_output, "force-g0-on-output");
5698 defsymbol(&Qforce_g1_on_output, "force-g1-on-output");
5699 defsymbol(&Qforce_g2_on_output, "force-g2-on-output");
5700 defsymbol(&Qforce_g3_on_output, "force-g3-on-output");
5701 defsymbol(&Qno_iso6429, "no-iso6429");
5702 defsymbol(&Qinput_charset_conversion, "input-charset-conversion");
5703 defsymbol(&Qoutput_charset_conversion, "output-charset-conversion");
5705 defsymbol(&Qno_ascii_eol, "no-ascii-eol");
5706 defsymbol(&Qno_ascii_cntl, "no-ascii-cntl");
5707 defsymbol(&Qseven, "seven");
5708 defsymbol(&Qlock_shift, "lock-shift");
5709 defsymbol(&Qescape_quoted, "escape-quoted");
5711 defsymbol(&Qencode, "encode");
5712 defsymbol(&Qdecode, "decode");
5715 defsymbol(&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5717 defsymbol(&coding_category_symbol[CODING_CATEGORY_BIG5], "big5");
5718 defsymbol(&coding_category_symbol[CODING_CATEGORY_UCS4], "ucs-4");
5719 defsymbol(&coding_category_symbol[CODING_CATEGORY_UTF8], "utf-8");
5720 defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_7], "iso-7");
5721 defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5723 defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_1], "iso-8-1");
5724 defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_2], "iso-8-2");
5725 defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5728 defsymbol(&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5732 void lstream_type_create_file_coding(void)
5734 LSTREAM_HAS_METHOD(decoding, reader);
5735 LSTREAM_HAS_METHOD(decoding, writer);
5736 LSTREAM_HAS_METHOD(decoding, rewinder);
5737 LSTREAM_HAS_METHOD(decoding, seekable_p);
5738 LSTREAM_HAS_METHOD(decoding, flusher);
5739 LSTREAM_HAS_METHOD(decoding, closer);
5740 LSTREAM_HAS_METHOD(decoding, marker);
5742 LSTREAM_HAS_METHOD(encoding, reader);
5743 LSTREAM_HAS_METHOD(encoding, writer);
5744 LSTREAM_HAS_METHOD(encoding, rewinder);
5745 LSTREAM_HAS_METHOD(encoding, seekable_p);
5746 LSTREAM_HAS_METHOD(encoding, flusher);
5747 LSTREAM_HAS_METHOD(encoding, closer);
5748 LSTREAM_HAS_METHOD(encoding, marker);
5751 void vars_of_file_coding(void)
5755 fcd = xnew(struct file_coding_dump);
5756 dump_add_root_struct_ptr(&fcd, &fcd_description);
5758 /* Initialize to something reasonable ... */
5759 for (i = 0; i < CODING_CATEGORY_LAST; i++) {
5760 fcd->coding_category_system[i] = Qnil;
5761 fcd->coding_category_by_priority[i] = i;
5764 Fprovide(intern("file-coding"));
5766 DEFVAR_LISP("keyboard-coding-system", &Vkeyboard_coding_system /*
5767 Coding system used for TTY keyboard input.
5768 Not used under a windowing system.
5770 Vkeyboard_coding_system = Qnil;
5772 DEFVAR_LISP("terminal-coding-system", &Vterminal_coding_system /*
5773 Coding system used for TTY display output.
5774 Not used under a windowing system.
5776 Vterminal_coding_system = Qnil;
5778 DEFVAR_LISP("coding-system-for-read", &Vcoding_system_for_read /*
5779 Overriding coding system used when reading from a file or process.
5780 You should bind this variable with `let', but do not set it globally.
5781 If this is non-nil, it specifies the coding system that will be used
5782 to decode input on read operations, such as from a file or process.
5783 It overrides `buffer-file-coding-system-for-read',
5784 `insert-file-contents-pre-hook', etc. Use those variables instead of
5785 this one for permanent changes to the environment. */ );
5786 Vcoding_system_for_read = Qnil;
5788 DEFVAR_LISP("coding-system-for-write", &Vcoding_system_for_write /*
5789 Overriding coding system used when writing to a file or process.
5790 You should bind this variable with `let', but do not set it globally.
5791 If this is non-nil, it specifies the coding system that will be used
5792 to encode output for write operations, such as to a file or process.
5793 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
5794 Use those variables instead of this one for permanent changes to the
5796 Vcoding_system_for_write = Qnil;
5798 DEFVAR_LISP("file-name-coding-system", &Vfile_name_coding_system /*
5799 Coding system used to convert pathnames when accessing files.
5801 Vfile_name_coding_system = Qnil;
5803 DEFVAR_BOOL("enable-multibyte-characters", &enable_multibyte_characters /*
5804 Non-nil means the buffer contents are regarded as multi-byte form
5805 of characters, not a binary code. This affects the display, file I/O,
5806 and behaviors of various editing commands.
5808 Setting this to nil does not do anything.
5810 enable_multibyte_characters = 1;
5813 void complex_vars_of_file_coding(void)
5815 staticpro(&Vcoding_system_hash_table);
5816 Vcoding_system_hash_table =
5817 make_lisp_hash_table(50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5819 the_codesys_prop_dynarr = Dynarr_new(codesys_prop);
5820 dump_add_root_struct_ptr(&the_codesys_prop_dynarr,
5821 &codesys_prop_dynarr_description);
5823 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5825 struct codesys_prop csp; \
5827 csp.prop_type = (Prop_Type); \
5828 Dynarr_add (the_codesys_prop_dynarr, csp); \
5831 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qmnemonic);
5832 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_type);
5833 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_cr);
5834 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_crlf);
5835 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_lf);
5836 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5837 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5839 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g0);
5840 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g1);
5841 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g2);
5842 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g3);
5843 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5844 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5845 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5846 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5847 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qshort);
5848 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_ascii_eol);
5849 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5850 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qseven);
5851 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qlock_shift);
5852 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_iso6429);
5853 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qescape_quoted);
5854 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5855 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5857 DEFINE_CODESYS_PROP(CODESYS_PROP_CCL, Qencode);
5858 DEFINE_CODESYS_PROP(CODESYS_PROP_CCL, Qdecode);
5860 /* Need to create this here or we're really screwed. */
5862 (Qraw_text, Qno_conversion,
5864 ("Raw text, which means it converts only line-break-codes."),
5865 list2(Qmnemonic, build_string("Raw")));
5868 (Qbinary, Qno_conversion,
5869 build_string("Binary, which means it does not convert anything."),
5870 list4(Qeol_type, Qlf, Qmnemonic, build_string("Binary")));
5872 Fdefine_coding_system_alias(Qno_conversion, Qraw_text);
5874 Fdefine_coding_system_alias(Qfile_name, Qbinary);
5876 Fdefine_coding_system_alias(Qterminal, Qbinary);
5877 Fdefine_coding_system_alias(Qkeyboard, Qbinary);
5879 /* Need this for bootstrapping */
5880 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5881 Fget_coding_system(Qraw_text);
5887 for (i = 0; i < countof(fcd->ucs_to_mule_table); i++)
5888 fcd->ucs_to_mule_table[i] = Qnil;
5890 staticpro(&mule_to_ucs_table);
5891 mule_to_ucs_table = Fmake_char_table(Qgeneric);