1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of SXEmacs
7 SXEmacs is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
12 SXEmacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
21 /* Synched up with: Mule 2.3. Not in FSF. */
23 /* Rewritten by Ben Wing <ben@xemacs.org>. */
30 #include "ui/insdel.h"
37 #include "file-coding.h"
39 Lisp_Object Qcoding_system_error;
41 Lisp_Object Vkeyboard_coding_system;
42 Lisp_Object Vterminal_coding_system;
43 Lisp_Object Vcoding_system_for_read;
44 Lisp_Object Vcoding_system_for_write;
45 Lisp_Object Vfile_name_coding_system;
47 /* Table of symbols identifying each coding category. */
48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
50 struct file_coding_dump {
51 /* Coding system currently associated with each coding category. */
52 Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
54 /* Table of all coding categories in decreasing order of priority.
55 This describes a permutation of the possible coding categories. */
56 int coding_category_by_priority[CODING_CATEGORY_LAST];
59 Lisp_Object ucs_to_mule_table[65536];
63 static const struct lrecord_description fcd_description_1[] = {
64 {XD_LISP_OBJECT_ARRAY,
65 offsetof(struct file_coding_dump, coding_category_system),
66 CODING_CATEGORY_LAST},
68 {XD_LISP_OBJECT_ARRAY,
69 offsetof(struct file_coding_dump, ucs_to_mule_table),
70 countof(fcd->ucs_to_mule_table)},
75 static const struct struct_description fcd_description = {
76 sizeof(struct file_coding_dump),
80 Lisp_Object mule_to_ucs_table;
82 Lisp_Object Qcoding_systemp;
84 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
85 /* Qinternal in general.c */
87 Lisp_Object Qmnemonic, Qeol_type;
88 Lisp_Object Qcr, Qcrlf, Qlf;
89 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
90 Lisp_Object Qpost_read_conversion;
91 Lisp_Object Qpre_write_conversion;
94 Lisp_Object Qucs4, Qutf8;
95 Lisp_Object Qbig5, Qshift_jis;
96 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
97 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
98 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
99 Lisp_Object Qno_iso6429;
100 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
101 Lisp_Object Qescape_quoted;
102 Lisp_Object Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
104 Lisp_Object Qencode, Qdecode;
106 Lisp_Object Vcoding_system_hash_table;
108 int enable_multibyte_characters;
111 /* Additional information used by the ISO2022 decoder and detector. */
112 struct iso2022_decoder {
113 /* CHARSET holds the character sets currently assigned to the G0
114 through G3 variables. It is initialized from the array
115 INITIAL_CHARSET in CODESYS. */
116 Lisp_Object charset[4];
118 /* Which registers are currently invoked into the left (GL) and
119 right (GR) halves of the 8-bit encoding space? */
120 int register_left, register_right;
122 /* ISO_ESC holds a value indicating part of an escape sequence
123 that has already been seen. */
124 enum iso_esc_flag esc;
126 /* This records the bytes we've seen so far in an escape sequence,
127 in case the sequence is invalid (we spit out the bytes unchanged). */
128 unsigned char esc_bytes[8];
130 /* Index for next byte to store in ISO escape sequence. */
133 #ifdef ENABLE_COMPOSITE_CHARS
134 /* Stuff seen so far when composing a string. */
135 unsigned_char_dynarr *composite_chars;
138 /* If we saw an invalid designation sequence for a particular
139 register, we flag it here and switch to ASCII. The next time we
140 see a valid designation for this register, we turn off the flag
141 and do the designation normally, but pretend the sequence was
142 invalid. The effect of all this is that (most of the time) the
143 escape sequences for both the switch to the unknown charset, and
144 the switch back to the known charset, get inserted literally into
145 the buffer and saved out as such. The hope is that we can
146 preserve the escape sequences so that the resulting written out
147 file makes sense. If we don't do any of this, the designation
148 to the invalid charset will be preserved but that switch back
149 to the known charset will probably get eaten because it was
150 the same charset that was already present in the register. */
151 unsigned char invalid_designated[4];
153 /* We try to do similar things as above for direction-switching
154 sequences. If we encountered a direction switch while an
155 invalid designation was present, or an invalid designation
156 just after a direction switch (i.e. no valid designation
157 encountered yet), we insert the direction-switch escape
158 sequence literally into the output stream, and later on
159 insert the corresponding direction-restoring escape sequence
161 unsigned int switched_dir_and_no_valid_charset_yet:1;
162 unsigned int invalid_switch_dir:1;
164 /* Tells the decoder to output the escape sequence literally
165 even though it was valid. Used in the games we play to
166 avoid lossage when we encounter invalid designations. */
167 unsigned int output_literally:1;
168 /* We encountered a direction switch followed by an invalid
169 designation. We didn't output the direction switch
170 literally because we didn't know about the invalid designation;
171 but we have to do so now. */
172 unsigned int output_direction_sequence:1;
175 EXFUN(Fcopy_coding_system, 2);
177 struct detection_state;
178 static int detect_coding_sjis(struct detection_state *st,
179 const Extbyte * src, Lstream_data_count n);
180 static void decode_coding_sjis(lstream_t decoding, const Extbyte * src,
181 unsigned_char_dynarr * dst,
182 Lstream_data_count n);
183 static void encode_coding_sjis(lstream_t encoding, const Bufbyte * src,
184 unsigned_char_dynarr * dst,
185 Lstream_data_count n);
186 static int detect_coding_big5(struct detection_state *st, const Extbyte * src,
187 Lstream_data_count n);
188 static void decode_coding_big5(lstream_t decoding, const Extbyte * src,
189 unsigned_char_dynarr * dst,
190 Lstream_data_count n);
191 static void encode_coding_big5(lstream_t encoding, const Bufbyte * src,
192 unsigned_char_dynarr * dst,
193 Lstream_data_count n);
194 static int detect_coding_ucs4(struct detection_state *st, const Extbyte * src,
195 Lstream_data_count n);
196 static void decode_coding_ucs4(lstream_t decoding, const Extbyte * src,
197 unsigned_char_dynarr * dst,
198 Lstream_data_count n);
199 static void encode_coding_ucs4(lstream_t encoding, const Bufbyte * src,
200 unsigned_char_dynarr * dst,
201 Lstream_data_count n);
202 static int detect_coding_utf8(struct detection_state *st, const Extbyte * src,
203 Lstream_data_count n);
204 static void decode_coding_utf8(lstream_t decoding, const Extbyte * src,
205 unsigned_char_dynarr * dst,
206 Lstream_data_count n);
207 static void encode_coding_utf8(lstream_t encoding, const Bufbyte * src,
208 unsigned_char_dynarr * dst,
209 Lstream_data_count n);
210 static int postprocess_iso2022_mask(int mask);
211 static void reset_iso2022(Lisp_Object coding_system,
212 struct iso2022_decoder *iso);
213 static int detect_coding_iso2022(struct detection_state *st,
214 const Extbyte * src, Lstream_data_count n);
215 static void decode_coding_iso2022(lstream_t decoding, const Extbyte * src,
216 unsigned_char_dynarr * dst,
217 Lstream_data_count n);
218 static void encode_coding_iso2022(lstream_t encoding, const Bufbyte * src,
219 unsigned_char_dynarr * dst,
220 Lstream_data_count n);
222 static void decode_coding_no_conversion(lstream_t decoding, const Extbyte * src,
223 unsigned_char_dynarr * dst,
224 Lstream_data_count n);
225 static void encode_coding_no_conversion(lstream_t encoding, const Bufbyte * src,
226 unsigned_char_dynarr * dst,
227 Lstream_data_count n);
228 static void mule_decode(lstream_t decoding, const Extbyte * src,
229 unsigned_char_dynarr * dst, Lstream_data_count n);
230 static void mule_encode(lstream_t encoding, const Bufbyte * src,
231 unsigned_char_dynarr * dst, Lstream_data_count n);
233 typedef struct codesys_prop codesys_prop;
234 struct codesys_prop {
240 Dynarr_declare(codesys_prop);
241 } codesys_prop_dynarr;
243 static const struct lrecord_description codesys_prop_description_1[] = {
244 {XD_LISP_OBJECT, offsetof(codesys_prop, sym)},
248 static const struct struct_description codesys_prop_description = {
249 sizeof(codesys_prop),
250 codesys_prop_description_1
253 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
254 XD_DYNARR_DESC(codesys_prop_dynarr, &codesys_prop_description),
258 static const struct struct_description codesys_prop_dynarr_description = {
259 sizeof(codesys_prop_dynarr),
260 codesys_prop_dynarr_description_1
263 codesys_prop_dynarr *the_codesys_prop_dynarr;
265 enum codesys_prop_enum {
267 CODESYS_PROP_ISO2022,
271 /************************************************************************/
272 /* Coding system functions */
273 /************************************************************************/
275 static Lisp_Object mark_coding_system(Lisp_Object);
276 static void print_coding_system(Lisp_Object, Lisp_Object, int);
277 static void finalize_coding_system(void *header, int for_disksave);
280 static const struct lrecord_description ccs_description_1[] = {
281 {XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset)},
282 {XD_LISP_OBJECT, offsetof(charset_conversion_spec, to_charset)},
286 static const struct struct_description ccs_description = {
287 sizeof(charset_conversion_spec),
291 static const struct lrecord_description ccsd_description_1[] = {
292 XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
296 static const struct struct_description ccsd_description = {
297 sizeof(charset_conversion_spec_dynarr),
302 static const struct lrecord_description coding_system_description[] = {
303 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, name)},
304 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, doc_string)},
305 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, mnemonic)},
306 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, post_read_conversion)},
307 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, pre_write_conversion)},
308 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_lf)},
309 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_crlf)},
310 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_cr)},
312 {XD_LISP_OBJECT_ARRAY,
313 offsetof(Lisp_Coding_System, iso2022.initial_charset), 4},
314 {XD_STRUCT_PTR, offsetof(Lisp_Coding_System, iso2022.input_conv), 1,
316 {XD_STRUCT_PTR, offsetof(Lisp_Coding_System, iso2022.output_conv), 1,
318 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, ccl.decode)},
319 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, ccl.encode)},
324 DEFINE_LRECORD_IMPLEMENTATION("coding-system", coding_system,
325 mark_coding_system, print_coding_system,
326 finalize_coding_system,
327 0, 0, coding_system_description,
330 static Lisp_Object mark_coding_system(Lisp_Object obj)
332 Lisp_Coding_System *codesys = XCODING_SYSTEM(obj);
334 mark_object(CODING_SYSTEM_NAME(codesys));
335 mark_object(CODING_SYSTEM_DOC_STRING(codesys));
336 mark_object(CODING_SYSTEM_MNEMONIC(codesys));
337 mark_object(CODING_SYSTEM_EOL_LF(codesys));
338 mark_object(CODING_SYSTEM_EOL_CRLF(codesys));
339 mark_object(CODING_SYSTEM_EOL_CR(codesys));
341 switch (CODING_SYSTEM_TYPE(codesys)) {
344 case CODESYS_ISO2022:
345 for (i = 0; i < 4; i++)
346 mark_object(CODING_SYSTEM_ISO2022_INITIAL_CHARSET
348 if (codesys->iso2022.input_conv) {
350 i < Dynarr_length(codesys->iso2022.input_conv);
352 struct charset_conversion_spec *ccs =
353 Dynarr_atp(codesys->iso2022.input_conv, i);
354 mark_object(ccs->from_charset);
355 mark_object(ccs->to_charset);
358 if (codesys->iso2022.output_conv) {
360 i < Dynarr_length(codesys->iso2022.output_conv);
362 struct charset_conversion_spec *ccs =
363 Dynarr_atp(codesys->iso2022.output_conv, i);
364 mark_object(ccs->from_charset);
365 mark_object(ccs->to_charset);
371 mark_object(CODING_SYSTEM_CCL_DECODE(codesys));
372 mark_object(CODING_SYSTEM_CCL_ENCODE(codesys));
375 /* list the rest of them lot explicitly */
376 case CODESYS_AUTODETECT:
377 case CODESYS_SHIFT_JIS:
381 case CODESYS_NO_CONVERSION:
383 case CODESYS_INTERNAL:
390 mark_object(CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys));
391 return CODING_SYSTEM_POST_READ_CONVERSION(codesys);
395 print_coding_system(Lisp_Object obj, Lisp_Object printcharfun, int escapeflag)
397 Lisp_Coding_System *c = XCODING_SYSTEM(obj);
399 error("printing unreadable object #<coding-system 0x%x>",
402 write_c_string("#<coding-system ", printcharfun);
403 print_internal(c->name, printcharfun, 1);
404 write_c_string(">", printcharfun);
407 static void finalize_coding_system(void *header, int for_disksave)
409 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
410 /* Since coding systems never go away, this function is not
411 necessary. But it would be necessary if we changed things
412 so that coding systems could go away. */
413 if (!for_disksave) { /* see comment in lstream.c */
414 switch (CODING_SYSTEM_TYPE(c)) {
416 case CODESYS_ISO2022:
417 if (c->iso2022.input_conv) {
418 Dynarr_free(c->iso2022.input_conv);
419 c->iso2022.input_conv = 0;
421 if (c->iso2022.output_conv) {
422 Dynarr_free(c->iso2022.output_conv);
423 c->iso2022.output_conv = 0;
427 /* list the rest of them lot explicitly */
428 case CODESYS_AUTODETECT:
429 case CODESYS_SHIFT_JIS:
434 case CODESYS_NO_CONVERSION:
436 case CODESYS_INTERNAL:
445 static eol_type_t symbol_to_eol_type(Lisp_Object symbol)
447 CHECK_SYMBOL(symbol);
449 return EOL_AUTODETECT;
452 if (EQ(symbol, Qcrlf))
457 signal_simple_error("Unrecognized eol type", symbol);
458 return EOL_AUTODETECT; /* not reached */
461 static Lisp_Object eol_type_to_symbol(eol_type_t type)
478 static void setup_eol_coding_systems(Lisp_Coding_System * codesys)
480 Lisp_Object codesys_obj;
481 int len = string_length(XSYMBOL(CODING_SYSTEM_NAME(codesys))->name);
482 char *codesys_name = (char *)alloca(len + 7);
484 char *codesys_mnemonic = 0;
486 Lisp_Object codesys_name_sym, sub_codesys_obj;
490 XSETCODING_SYSTEM(codesys_obj, codesys);
493 string_data(XSYMBOL(CODING_SYSTEM_NAME(codesys))->name), len);
495 if (STRINGP(CODING_SYSTEM_MNEMONIC(codesys))) {
496 mlen = XSTRING_LENGTH(CODING_SYSTEM_MNEMONIC(codesys));
497 codesys_mnemonic = (char *)alloca(mlen + 7);
498 memcpy(codesys_mnemonic,
499 XSTRING_DATA(CODING_SYSTEM_MNEMONIC(codesys)), mlen);
501 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
502 strcpy (codesys_name + len, "-" op_sys); \
504 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
505 codesys_name_sym = intern (codesys_name); \
506 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
507 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
509 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
510 build_string (codesys_mnemonic); \
511 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
514 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
515 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
516 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
519 DEFUN("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
520 Return t if OBJECT is a coding system.
521 A coding system is an object that defines how text containing multiple
522 character sets is encoded into a stream of (typically 8-bit) bytes.
523 The coding system is used to decode the stream into a series of
524 characters (which may be from multiple charsets) when the text is read
525 from a file or process, and is used to encode the text back into the
526 same format when it is written out to a file or process.
528 For example, many ISO2022-compliant coding systems (such as Compound
529 Text, which is used for inter-client data under the X Window System)
530 use escape sequences to switch between different charsets -- Japanese
531 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
532 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
533 `make-coding-system' for more information.
535 Coding systems are normally identified using a symbol, and the
536 symbol is accepted in place of the actual coding system object whenever
537 a coding system is called for. (This is similar to how faces work.)
541 return CODING_SYSTEMP(object) ? Qt : Qnil;
544 DEFUN("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
545 Retrieve the coding system of the given name.
547 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
548 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
549 If there is no such coding system, nil is returned. Otherwise the
550 associated coding system object is returned.
552 (coding_system_or_name))
554 if (NILP(coding_system_or_name))
555 coding_system_or_name = Qbinary;
556 else if (CODING_SYSTEMP(coding_system_or_name))
557 return coding_system_or_name;
559 CHECK_SYMBOL(coding_system_or_name);
562 coding_system_or_name =
563 Fgethash(coding_system_or_name, Vcoding_system_hash_table,
566 if (CODING_SYSTEMP(coding_system_or_name)
567 || NILP(coding_system_or_name))
568 return coding_system_or_name;
572 DEFUN("get-coding-system", Fget_coding_system, 1, 1, 0, /*
573 Retrieve the coding system of the given name.
574 Same as `find-coding-system' except that if there is no such
575 coding system, an error is signaled instead of returning nil.
579 Lisp_Object coding_system = Ffind_coding_system(name);
581 if (NILP(coding_system))
582 signal_simple_error("No such coding system", name);
583 return coding_system;
586 /* We store the coding systems in hash tables with the names as the key and the
587 actual coding system object as the value. Occasionally we need to use them
588 in a list format. These routines provide us with that. */
589 struct coding_system_list_closure {
590 Lisp_Object *coding_system_list;
594 add_coding_system_to_list_mapper(Lisp_Object key, Lisp_Object value,
595 void *coding_system_list_closure)
597 /* This function can GC */
598 struct coding_system_list_closure *cscl =
599 (struct coding_system_list_closure *)coding_system_list_closure;
600 Lisp_Object *coding_system_list = cscl->coding_system_list;
602 *coding_system_list = Fcons(key, *coding_system_list);
606 DEFUN("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
607 Return a list of the names of all defined coding systems.
611 Lisp_Object coding_system_list = Qnil;
613 struct coding_system_list_closure coding_system_list_closure;
615 GCPRO1(coding_system_list);
616 coding_system_list_closure.coding_system_list = &coding_system_list;
617 elisp_maphash(add_coding_system_to_list_mapper,
618 Vcoding_system_hash_table, &coding_system_list_closure);
621 return coding_system_list;
624 DEFUN("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
625 Return the name of the given coding system.
629 coding_system = Fget_coding_system(coding_system);
630 return XCODING_SYSTEM_NAME(coding_system);
633 static Lisp_Coding_System *allocate_coding_system(enum coding_system_type type,
636 Lisp_Coding_System *codesys =
637 alloc_lcrecord_type(Lisp_Coding_System, &lrecord_coding_system);
639 zero_lcrecord(codesys);
640 CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys) = Qnil;
641 CODING_SYSTEM_POST_READ_CONVERSION(codesys) = Qnil;
642 CODING_SYSTEM_EOL_TYPE(codesys) = EOL_AUTODETECT;
643 CODING_SYSTEM_EOL_CRLF(codesys) = Qnil;
644 CODING_SYSTEM_EOL_CR(codesys) = Qnil;
645 CODING_SYSTEM_EOL_LF(codesys) = Qnil;
646 CODING_SYSTEM_TYPE(codesys) = type;
647 CODING_SYSTEM_MNEMONIC(codesys) = Qnil;
649 if (type == CODESYS_ISO2022) {
651 for (i = 0; i < 4; i++)
652 CODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, i) =
654 } else if (type == CODESYS_CCL) {
655 CODING_SYSTEM_CCL_DECODE(codesys) = Qnil;
656 CODING_SYSTEM_CCL_ENCODE(codesys) = Qnil;
659 CODING_SYSTEM_NAME(codesys) = name;
665 /* Given a list of charset conversion specs as specified in a Lisp
666 program, parse it into STORE_HERE. */
669 parse_charset_conversion_specs(charset_conversion_spec_dynarr * store_here,
670 Lisp_Object spec_list)
674 EXTERNAL_LIST_LOOP(rest, spec_list) {
675 Lisp_Object car = XCAR(rest);
676 Lisp_Object from, to;
677 struct charset_conversion_spec spec;
679 if (!CONSP(car) || !CONSP(XCDR(car)) || !NILP(XCDR(XCDR(car))))
680 signal_simple_error("Invalid charset conversion spec",
682 from = Fget_charset(XCAR(car));
683 to = Fget_charset(XCAR(XCDR(car)));
684 if (XCHARSET_TYPE(from) != XCHARSET_TYPE(to))
685 signal_simple_error_2
686 ("Attempted conversion between different charset types",
688 spec.from_charset = from;
689 spec.to_charset = to;
691 Dynarr_add(store_here, spec);
695 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
696 specs, return the equivalent as the Lisp programmer would see it.
698 If LOAD_HERE is 0, return Qnil. */
701 unparse_charset_conversion_specs(charset_conversion_spec_dynarr * load_here)
708 for (i = 0, result = Qnil; i < Dynarr_length(load_here); i++) {
709 struct charset_conversion_spec *ccs = Dynarr_atp(load_here, i);
711 Fcons(list2(ccs->from_charset, ccs->to_charset), result);
714 return Fnreverse(result);
719 DEFUN("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
720 Register symbol NAME as a coding system.
722 TYPE describes the conversion method used and should be one of
725 Automatic conversion. SXEmacs attempts to detect the coding system
728 No conversion. Use this for binary files and such. On output,
729 graphic characters that are not in ASCII or Latin-1 will be
730 replaced by a ?. (For a no-conversion-encoded buffer, these
731 characters will only be present if you explicitly insert them.)
733 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
735 ISO 10646 UCS-4 encoding.
737 ISO 10646 UTF-8 encoding.
739 Any ISO2022-compliant encoding. Among other things, this includes
740 JIS (the Japanese encoding commonly used for e-mail), EUC (the
741 standard Unix encoding for Japanese and other languages), and
742 Compound Text (the encoding used in X11). You can specify more
743 specific information about the conversion with the PROPS argument.
745 Big5 (the encoding commonly used for Taiwanese).
747 The conversion is performed using a user-written pseudo-code
748 program. CCL (Code Conversion Language) is the name of this
751 Write out or read in the raw contents of the memory representing
752 the buffer's text. This is primarily useful for debugging
753 purposes, and is only enabled when SXEmacs has been compiled with
754 DEBUG_SXEMACS defined (via the --debug configure option).
755 WARNING: Reading in a file using 'internal conversion can result
756 in an internal inconsistency in the memory representing a
757 buffer's text, which will produce unpredictable results and may
758 cause SXEmacs to crash. Under normal circumstances you should
759 never use 'internal conversion.
761 DOC-STRING is a string describing the coding system.
763 PROPS is a property list, describing the specific nature of the
764 character set. Recognized properties are:
767 String to be displayed in the modeline when this coding system is
771 End-of-line conversion to be used. It should be one of
774 Automatically detect the end-of-line type (LF, CRLF,
775 or CR). Also generate subsidiary coding systems named
776 `NAME-unix', `NAME-dos', and `NAME-mac', that are
777 identical to this coding system but have an EOL-TYPE
778 value of 'lf, 'crlf, and 'cr, respectively.
780 The end of a line is marked externally using ASCII LF.
781 Since this is also the way that SXEmacs represents an
782 end-of-line internally, specifying this option results
783 in no end-of-line conversion. This is the standard
784 format for Unix text files.
786 The end of a line is marked externally using ASCII
787 CRLF. This is the standard format for MS-DOS text
790 The end of a line is marked externally using ASCII CR.
791 This is the standard format for Macintosh text files.
793 Automatically detect the end-of-line type but do not
794 generate subsidiary coding systems. (This value is
795 converted to nil when stored internally, and
796 `coding-system-property' will return nil.)
798 'post-read-conversion
799 Function called after a file has been read in, to perform the
800 decoding. Called with two arguments, START and END, denoting
801 a region of the current buffer to be decoded.
803 'pre-write-conversion
804 Function called before a file is written out, to perform the
805 encoding. Called with two arguments, START and END, denoting
806 a region of the current buffer to be encoded.
808 The following additional properties are recognized if TYPE is 'iso2022:
814 The character set initially designated to the G0 - G3 registers.
815 The value should be one of
817 -- A charset object (designate that character set)
818 -- nil (do not ever use this register)
819 -- t (no character set is initially designated to
820 the register, but may be later on; this automatically
821 sets the corresponding `force-g*-on-output' property)
827 If non-nil, send an explicit designation sequence on output before
828 using the specified register.
831 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
832 "ESC $ B" on output in place of the full designation sequences
833 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
836 If non-nil, don't designate ASCII to G0 at each end of line on output.
837 Setting this to non-nil also suppresses other state-resetting that
838 normally happens at the end of a line.
841 If non-nil, don't designate ASCII to G0 before control chars on output.
844 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
848 If non-nil, use locking-shift (SO/SI) instead of single-shift
849 or designation by escape sequence.
852 If non-nil, don't use ISO6429's direction specification.
855 If non-nil, literal control characters that are the same as
856 the beginning of a recognized ISO2022 or ISO6429 escape sequence
857 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
858 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
859 so that they can be properly distinguished from an escape sequence.
860 (Note that doing this results in a non-portable encoding.) This
861 encoding flag is used for byte-compiled files. Note that ESC
862 is a good choice for a quoting character because there are no
863 escape sequences whose second byte is a character from the Control-0
864 or Control-1 character sets; this is explicitly disallowed by the
867 'input-charset-conversion
868 A list of conversion specifications, specifying conversion of
869 characters in one charset to another when decoding is performed.
870 Each specification is a list of two elements: the source charset,
871 and the destination charset.
873 'output-charset-conversion
874 A list of conversion specifications, specifying conversion of
875 characters in one charset to another when encoding is performed.
876 The form of each specification is the same as for
877 'input-charset-conversion.
879 The following additional properties are recognized (and required)
883 CCL program used for decoding (converting to internal format).
886 CCL program used for encoding (converting to external format).
888 (name, type, doc_string, props))
890 Lisp_Coding_System *codesys;
891 enum coding_system_type ty;
892 int need_to_setup_eol_systems = 1;
894 /* Convert type to constant */
895 if (NILP(type) || EQ(type, Qundecided)) {
896 ty = CODESYS_AUTODETECT;
899 else if (EQ(type, Qshift_jis)) {
900 ty = CODESYS_SHIFT_JIS;
901 } else if (EQ(type, Qiso2022)) {
902 ty = CODESYS_ISO2022;
903 } else if (EQ(type, Qbig5)) {
905 } else if (EQ(type, Qucs4)) {
907 } else if (EQ(type, Qutf8)) {
909 } else if (EQ(type, Qccl)) {
913 else if (EQ(type, Qno_conversion)) {
914 ty = CODESYS_NO_CONVERSION;
917 else if (EQ(type, Qinternal)) {
918 ty = CODESYS_INTERNAL;
922 signal_simple_error("Invalid coding system type", type);
926 codesys = allocate_coding_system(ty, name);
928 if (NILP(doc_string))
929 doc_string = build_string("");
931 CHECK_STRING(doc_string);
932 CODING_SYSTEM_DOC_STRING(codesys) = doc_string;
935 EXTERNAL_PROPERTY_LIST_LOOP_3(key, value, props) {
936 if (EQ(key, Qmnemonic)) {
939 CODING_SYSTEM_MNEMONIC(codesys) = value;
942 else if (EQ(key, Qeol_type)) {
943 need_to_setup_eol_systems = NILP(value);
946 CODING_SYSTEM_EOL_TYPE(codesys) =
947 symbol_to_eol_type(value);
950 else if (EQ(key, Qpost_read_conversion))
951 CODING_SYSTEM_POST_READ_CONVERSION(codesys) =
953 else if (EQ(key, Qpre_write_conversion))
954 CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys) =
957 else if (ty == CODESYS_ISO2022) {
958 #define FROB_INITIAL_CHARSET(charset_num) \
959 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
960 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
962 if (EQ(key, Qcharset_g0))
963 FROB_INITIAL_CHARSET(0);
964 else if (EQ(key, Qcharset_g1))
965 FROB_INITIAL_CHARSET(1);
966 else if (EQ(key, Qcharset_g2))
967 FROB_INITIAL_CHARSET(2);
968 else if (EQ(key, Qcharset_g3))
969 FROB_INITIAL_CHARSET(3);
971 #define FROB_FORCE_CHARSET(charset_num) \
972 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
974 else if (EQ(key, Qforce_g0_on_output))
975 FROB_FORCE_CHARSET(0);
976 else if (EQ(key, Qforce_g1_on_output))
977 FROB_FORCE_CHARSET(1);
978 else if (EQ(key, Qforce_g2_on_output))
979 FROB_FORCE_CHARSET(2);
980 else if (EQ(key, Qforce_g3_on_output))
981 FROB_FORCE_CHARSET(3);
983 #define FROB_BOOLEAN_PROPERTY(prop) \
984 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
986 else if (EQ(key, Qshort))
987 FROB_BOOLEAN_PROPERTY(SHORT);
988 else if (EQ(key, Qno_ascii_eol))
989 FROB_BOOLEAN_PROPERTY(NO_ASCII_EOL);
990 else if (EQ(key, Qno_ascii_cntl))
991 FROB_BOOLEAN_PROPERTY(NO_ASCII_CNTL);
992 else if (EQ(key, Qseven))
993 FROB_BOOLEAN_PROPERTY(SEVEN);
994 else if (EQ(key, Qlock_shift))
995 FROB_BOOLEAN_PROPERTY(LOCK_SHIFT);
996 else if (EQ(key, Qno_iso6429))
997 FROB_BOOLEAN_PROPERTY(NO_ISO6429);
998 else if (EQ(key, Qescape_quoted))
999 FROB_BOOLEAN_PROPERTY(ESCAPE_QUOTED);
1001 else if (EQ(key, Qinput_charset_conversion)) {
1002 codesys->iso2022.input_conv =
1003 Dynarr_new(charset_conversion_spec);
1004 parse_charset_conversion_specs(codesys->
1008 } else if (EQ(key, Qoutput_charset_conversion)) {
1009 codesys->iso2022.output_conv =
1010 Dynarr_new(charset_conversion_spec);
1011 parse_charset_conversion_specs(codesys->
1017 ("Unrecognized property", key);
1018 } else if (EQ(type, Qccl)) {
1020 struct ccl_program test_ccl;
1023 /* Check key first. */
1024 if (EQ(key, Qdecode))
1025 suffix = "-ccl-decode";
1026 else if (EQ(key, Qencode))
1027 suffix = "-ccl-encode";
1030 ("Unrecognized property", key);
1032 /* If value is vector, register it as a ccl program
1033 associated with an newly created symbol for
1034 backward compatibility. */
1035 if (VECTORP(value)) {
1038 (Fsymbol_name(name),
1039 build_string(suffix)),
1041 Fregister_ccl_program(sym, value);
1043 CHECK_SYMBOL(value);
1046 /* check if the given ccl programs are valid. */
1047 if (setup_ccl_program(&test_ccl, sym) < 0)
1049 ("Invalid CCL program", value);
1051 if (EQ(key, Qdecode))
1052 CODING_SYSTEM_CCL_DECODE(codesys) = sym;
1053 else if (EQ(key, Qencode))
1054 CODING_SYSTEM_CCL_ENCODE(codesys) = sym;
1059 signal_simple_error("Unrecognized property",
1064 if (need_to_setup_eol_systems)
1065 setup_eol_coding_systems(codesys);
1068 Lisp_Object codesys_obj;
1069 XSETCODING_SYSTEM(codesys_obj, codesys);
1070 Fputhash(name, codesys_obj, Vcoding_system_hash_table);
1075 DEFUN("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1076 Copy OLD-CODING-SYSTEM to NEW-NAME.
1077 If NEW-NAME does not name an existing coding system, a new one will
1080 (old_coding_system, new_name))
1082 Lisp_Object new_coding_system;
1083 old_coding_system = Fget_coding_system(old_coding_system);
1084 new_coding_system = Ffind_coding_system(new_name);
1085 if (NILP(new_coding_system)) {
1086 XSETCODING_SYSTEM(new_coding_system,
1087 allocate_coding_system
1088 (XCODING_SYSTEM_TYPE(old_coding_system),
1090 Fputhash(new_name, new_coding_system,
1091 Vcoding_system_hash_table);
1095 Lisp_Coding_System *to = XCODING_SYSTEM(new_coding_system);
1096 Lisp_Coding_System *from = XCODING_SYSTEM(old_coding_system);
1097 memcpy(((char *)to) + sizeof(to->header),
1098 ((char *)from) + sizeof(from->header),
1099 sizeof(*from) - sizeof(from->header));
1100 to->name = new_name;
1102 return new_coding_system;
1105 DEFUN("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1106 Return t if OBJECT names a coding system, and is not a coding system alias.
1110 Lisp_Object val = Fgethash(object, Vcoding_system_hash_table, Qnil);
1111 return CODING_SYSTEMP(val) ? Qt : Qnil;
1114 DEFUN("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1115 Return t if OBJECT is a coding system alias.
1116 All coding system aliases are created by `define-coding-system-alias'.
1120 Lisp_Object val = Fgethash(object, Vcoding_system_hash_table, Qzero);
1121 return SYMBOLP(val) ? Qt : Qnil;
1124 DEFUN("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1125 Return the coding-system symbol for which symbol ALIAS is an alias.
1129 Lisp_Object aliasee = Fgethash(alias, Vcoding_system_hash_table, Qnil);
1130 if (SYMBOLP(aliasee)) {
1133 signal_simple_error(
1134 "Symbol is not a coding system alias", alias);
1136 /* To keep the compiler happy */
1141 append_suffix_to_symbol(Lisp_Object symbol, char *ascii_string)
1143 return Fintern(concat2(Fsymbol_name(symbol),
1144 build_string(ascii_string)), Qnil);
1147 /* A maphash function, for removing dangling coding system aliases. */
1149 dangling_coding_system_alias_p(Lisp_Object alias,
1150 Lisp_Object aliasee, void *dangling_aliases)
1152 if (SYMBOLP(aliasee)
1153 && NILP(Fgethash(aliasee, Vcoding_system_hash_table, Qnil))) {
1154 (*(int *)dangling_aliases)++;
1161 DEFUN("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1162 Define symbol ALIAS as an alias for coding system ALIASEE.
1164 You can use this function to redefine an alias that has already been defined,
1165 but you cannot redefine a name which is the canonical name for a coding system.
1166 \(a canonical name of a coding system is what is returned when you call
1167 `coding-system-name' on a coding system).
1169 ALIASEE itself can be an alias, which allows you to define nested aliases.
1171 You are forbidden, however, from creating alias loops or `dangling' aliases.
1172 These will be detected, and an error will be signaled if you attempt to do so.
1174 If ALIASEE is nil, then ALIAS will simply be undefined.
1176 See also `coding-system-alias-p', `coding-system-aliasee',
1177 and `coding-system-canonical-name-p'.
1181 Lisp_Object real_coding_system, probe;
1183 CHECK_SYMBOL(alias);
1185 if (!NILP(Fcoding_system_canonical_name_p(alias)))
1187 ("Symbol is the canonical name of a coding system and cannot be redefined",
1190 if (NILP(aliasee)) {
1191 Lisp_Object subsidiary_unix =
1192 append_suffix_to_symbol(alias, "-unix");
1193 Lisp_Object subsidiary_dos =
1194 append_suffix_to_symbol(alias, "-dos");
1195 Lisp_Object subsidiary_mac =
1196 append_suffix_to_symbol(alias, "-mac");
1198 Fremhash(alias, Vcoding_system_hash_table);
1200 /* Undefine subsidiary aliases,
1201 presumably created by a previous call to this function */
1202 if (!NILP(Fcoding_system_alias_p(subsidiary_unix)) &&
1203 !NILP(Fcoding_system_alias_p(subsidiary_dos)) &&
1204 !NILP(Fcoding_system_alias_p(subsidiary_mac))) {
1205 Fdefine_coding_system_alias(subsidiary_unix, Qnil);
1206 Fdefine_coding_system_alias(subsidiary_dos, Qnil);
1207 Fdefine_coding_system_alias(subsidiary_mac, Qnil);
1210 /* Undefine dangling coding system aliases. */
1212 int dangling_aliases;
1215 dangling_aliases = 0;
1217 (dangling_coding_system_alias_p,
1218 Vcoding_system_hash_table,
1220 } while (dangling_aliases > 0);
1226 if (CODING_SYSTEMP(aliasee))
1227 aliasee = XCODING_SYSTEM_NAME(aliasee);
1229 /* Checks that aliasee names a coding-system */
1230 real_coding_system = Fget_coding_system(aliasee);
1232 /* Check for coding system alias loops */
1233 if (EQ(alias, aliasee))
1234 alias_loop:signal_simple_error_2
1235 ("Attempt to create a coding system alias loop", alias,
1238 for (probe = aliasee;
1240 probe = Fgethash(probe, Vcoding_system_hash_table, Qzero)) {
1241 if (EQ(probe, alias))
1245 Fputhash(alias, aliasee, Vcoding_system_hash_table);
1247 /* Set up aliases for subsidiaries.
1248 #### There must be a better way to handle subsidiary coding
1251 static char *suffixes[] = { "-unix", "-dos", "-mac" };
1253 for (int i = 0; i < countof(suffixes); i++) {
1254 Lisp_Object alias_subsidiary =
1255 append_suffix_to_symbol(alias, suffixes[i]);
1256 Lisp_Object aliasee_subsidiary =
1257 append_suffix_to_symbol(aliasee, suffixes[i]);
1259 if (!NILP(Ffind_coding_system(aliasee_subsidiary))) {
1260 Fdefine_coding_system_alias(alias_subsidiary,
1261 aliasee_subsidiary);
1265 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1266 but it doesn't look intentional, so I'd rather return something
1267 meaningful or nothing at all. */
1272 subsidiary_coding_system(Lisp_Object coding_system, eol_type_t type)
1274 Lisp_Coding_System *cs = XCODING_SYSTEM(coding_system);
1275 Lisp_Object new_coding_system;
1277 if (CODING_SYSTEM_EOL_TYPE(cs) != EOL_AUTODETECT)
1278 return coding_system;
1281 case EOL_AUTODETECT:
1282 return coding_system;
1284 new_coding_system = CODING_SYSTEM_EOL_LF(cs);
1287 new_coding_system = CODING_SYSTEM_EOL_CR(cs);
1290 new_coding_system = CODING_SYSTEM_EOL_CRLF(cs);
1297 return NILP(new_coding_system) ? coding_system : new_coding_system;
1300 DEFUN("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1301 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1303 (coding_system, eol_type))
1305 coding_system = Fget_coding_system(coding_system);
1307 return subsidiary_coding_system(coding_system,
1308 symbol_to_eol_type(eol_type));
1311 /************************************************************************/
1312 /* Coding system accessors */
1313 /************************************************************************/
1315 DEFUN("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1316 Return the doc string for CODING-SYSTEM.
1320 coding_system = Fget_coding_system(coding_system);
1321 return XCODING_SYSTEM_DOC_STRING(coding_system);
1324 DEFUN("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1325 Return the type of CODING-SYSTEM.
1329 Lisp_Object tmp = Fget_coding_system(coding_system);
1331 switch (XCODING_SYSTEM_TYPE(tmp)) {
1334 case CODESYS_AUTODETECT:
1337 case CODESYS_SHIFT_JIS:
1339 case CODESYS_ISO2022:
1350 case CODESYS_NO_CONVERSION:
1351 return Qno_conversion;
1352 #ifdef DEBUG_SXEMACS
1353 case CODESYS_INTERNAL:
1361 Lisp_Object coding_system_charset(Lisp_Object coding_system, int gnum)
1364 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET(coding_system, gnum);
1366 return CHARSETP(cs) ? XCHARSET_NAME(cs) : Qnil;
1369 DEFUN("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1370 Return initial charset of CODING-SYSTEM designated to GNUM.
1373 (coding_system, gnum))
1375 coding_system = Fget_coding_system(coding_system);
1378 return coding_system_charset(coding_system, XINT(gnum));
1382 DEFUN("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1383 Return the PROP property of CODING-SYSTEM.
1385 (coding_system, prop))
1388 enum coding_system_type type;
1390 coding_system = Fget_coding_system(coding_system);
1392 type = XCODING_SYSTEM_TYPE(coding_system);
1394 for (i = 0; !ok && i < Dynarr_length(the_codesys_prop_dynarr); i++)
1395 if (EQ(Dynarr_at(the_codesys_prop_dynarr, i).sym, prop)) {
1397 switch (Dynarr_at(the_codesys_prop_dynarr, i).prop_type) {
1398 case CODESYS_PROP_ALL_OK:
1401 case CODESYS_PROP_ISO2022:
1402 if (type != CODESYS_ISO2022)
1404 ("Property only valid in ISO2022 coding systems",
1408 case CODESYS_PROP_CCL:
1409 if (type != CODESYS_CCL)
1411 ("Property only valid in CCL coding systems",
1421 signal_simple_error("Unrecognized property", prop);
1423 if (EQ(prop, Qname))
1424 return XCODING_SYSTEM_NAME(coding_system);
1425 else if (EQ(prop, Qtype))
1426 return Fcoding_system_type(coding_system);
1427 else if (EQ(prop, Qdoc_string))
1428 return XCODING_SYSTEM_DOC_STRING(coding_system);
1429 else if (EQ(prop, Qmnemonic))
1430 return XCODING_SYSTEM_MNEMONIC(coding_system);
1431 else if (EQ(prop, Qeol_type))
1433 eol_type_to_symbol(XCODING_SYSTEM_EOL_TYPE(coding_system));
1434 else if (EQ(prop, Qeol_lf))
1435 return XCODING_SYSTEM_EOL_LF(coding_system);
1436 else if (EQ(prop, Qeol_crlf))
1437 return XCODING_SYSTEM_EOL_CRLF(coding_system);
1438 else if (EQ(prop, Qeol_cr))
1439 return XCODING_SYSTEM_EOL_CR(coding_system);
1440 else if (EQ(prop, Qpost_read_conversion))
1441 return XCODING_SYSTEM_POST_READ_CONVERSION(coding_system);
1442 else if (EQ(prop, Qpre_write_conversion))
1443 return XCODING_SYSTEM_PRE_WRITE_CONVERSION(coding_system);
1445 else if (type == CODESYS_ISO2022) {
1446 if (EQ(prop, Qcharset_g0))
1447 return coding_system_charset(coding_system, 0);
1448 else if (EQ(prop, Qcharset_g1))
1449 return coding_system_charset(coding_system, 1);
1450 else if (EQ(prop, Qcharset_g2))
1451 return coding_system_charset(coding_system, 2);
1452 else if (EQ(prop, Qcharset_g3))
1453 return coding_system_charset(coding_system, 3);
1455 #define FORCE_CHARSET(charset_num) \
1456 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1457 (coding_system, charset_num) ? Qt : Qnil)
1459 else if (EQ(prop, Qforce_g0_on_output))
1460 return FORCE_CHARSET(0);
1461 else if (EQ(prop, Qforce_g1_on_output))
1462 return FORCE_CHARSET(1);
1463 else if (EQ(prop, Qforce_g2_on_output))
1464 return FORCE_CHARSET(2);
1465 else if (EQ(prop, Qforce_g3_on_output))
1466 return FORCE_CHARSET(3);
1468 #define LISP_BOOLEAN(prop) \
1469 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1471 else if (EQ(prop, Qshort))
1472 return LISP_BOOLEAN(SHORT);
1473 else if (EQ(prop, Qno_ascii_eol))
1474 return LISP_BOOLEAN(NO_ASCII_EOL);
1475 else if (EQ(prop, Qno_ascii_cntl))
1476 return LISP_BOOLEAN(NO_ASCII_CNTL);
1477 else if (EQ(prop, Qseven))
1478 return LISP_BOOLEAN(SEVEN);
1479 else if (EQ(prop, Qlock_shift))
1480 return LISP_BOOLEAN(LOCK_SHIFT);
1481 else if (EQ(prop, Qno_iso6429))
1482 return LISP_BOOLEAN(NO_ISO6429);
1483 else if (EQ(prop, Qescape_quoted))
1484 return LISP_BOOLEAN(ESCAPE_QUOTED);
1486 else if (EQ(prop, Qinput_charset_conversion))
1488 unparse_charset_conversion_specs
1489 (XCODING_SYSTEM(coding_system)->iso2022.input_conv);
1490 else if (EQ(prop, Qoutput_charset_conversion))
1492 unparse_charset_conversion_specs
1493 (XCODING_SYSTEM(coding_system)->iso2022.
1497 } else if (type == CODESYS_CCL) {
1498 if (EQ(prop, Qdecode))
1499 return XCODING_SYSTEM_CCL_DECODE(coding_system);
1500 else if (EQ(prop, Qencode))
1501 return XCODING_SYSTEM_CCL_ENCODE(coding_system);
1509 return Qnil; /* not reached */
1512 /************************************************************************/
1513 /* Coding category functions */
1514 /************************************************************************/
1516 static int decode_coding_category(Lisp_Object symbol)
1520 CHECK_SYMBOL(symbol);
1521 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1522 if (EQ(coding_category_symbol[i], symbol))
1525 signal_simple_error("Unrecognized coding category", symbol);
1526 return 0; /* not reached */
1529 DEFUN("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1530 Return a list of all recognized coding categories.
1535 Lisp_Object list = Qnil;
1537 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1538 list = Fcons(coding_category_symbol[i], list);
1542 DEFUN("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1543 Change the priority order of the coding categories.
1544 LIST should be list of coding categories, in descending order of
1545 priority. Unspecified coding categories will be lower in priority
1546 than all specified ones, in the same relative order they were in
1551 int category_to_priority[CODING_CATEGORY_LAST];
1555 /* First generate a list that maps coding categories to priorities. */
1557 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1558 category_to_priority[i] = -1;
1560 /* Highest priority comes from the specified list. */
1562 EXTERNAL_LIST_LOOP(rest, list) {
1563 int cat = decode_coding_category(XCAR(rest));
1565 if (category_to_priority[cat] >= 0)
1566 signal_simple_error("Duplicate coding category in list",
1568 category_to_priority[cat] = i++;
1571 /* Now go through the existing categories by priority to retrieve
1572 the categories not yet specified and preserve their priority
1574 for (j = 0; j < CODING_CATEGORY_LAST; j++) {
1575 int cat = fcd->coding_category_by_priority[j];
1576 if (category_to_priority[cat] < 0)
1577 category_to_priority[cat] = i++;
1580 /* Now we need to construct the inverse of the mapping we just
1583 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1584 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1586 /* Phew! That was confusing. */
1590 DEFUN("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1591 Return a list of coding categories in descending order of priority.
1596 Lisp_Object list = Qnil;
1598 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1600 Fcons(coding_category_symbol
1601 [fcd->coding_category_by_priority[i]], list);
1605 DEFUN("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1606 Change the coding system associated with a coding category.
1608 (coding_category, coding_system))
1610 int cat = decode_coding_category(coding_category);
1612 coding_system = Fget_coding_system(coding_system);
1613 fcd->coding_category_system[cat] = coding_system;
1617 DEFUN("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1618 Return the coding system associated with a coding category.
1622 int cat = decode_coding_category(coding_category);
1623 Lisp_Object sys = fcd->coding_category_system[cat];
1626 return XCODING_SYSTEM_NAME(sys);
1630 /************************************************************************/
1631 /* Detecting the encoding of data */
1632 /************************************************************************/
1634 struct detection_state {
1635 eol_type_t eol_type;
1662 struct iso2022_decoder iso;
1664 int high_byte_count;
1665 unsigned int saw_single_shift:1;
1674 static int acceptable_control_char_p(int c)
1677 /* Allow and ignore control characters that you might
1678 reasonably see in a text file */
1683 case 8: /* backspace */
1684 case 11: /* vertical tab */
1685 case 12: /* form feed */
1686 case 26: /* MS-DOS C-z junk */
1687 case 31: /* '^_' -- for info */
1694 static int mask_has_at_most_one_bit_p(int mask)
1696 /* Perhaps the only thing useful you learn from intensive Microsoft
1697 technical interviews */
1698 return (mask & (mask - 1)) == 0;
1702 detect_eol_type(struct detection_state *st, const Extbyte * src,
1703 Lstream_data_count n)
1706 const unsigned char c = *(const unsigned char*)src++;
1708 if (st->eol.just_saw_cr)
1710 else if (st->eol.seen_anything)
1712 } else if (st->eol.just_saw_cr)
1715 st->eol.just_saw_cr = 1;
1717 st->eol.just_saw_cr = 0;
1718 st->eol.seen_anything = 1;
1721 return EOL_AUTODETECT;
1724 /* Attempt to determine the encoding and EOL type of the given text.
1725 Before calling this function for the first type, you must initialize
1726 st->eol_type as appropriate and initialize st->mask to ~0.
1728 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1731 st->mask holds the determined coding category mask, or ~0 if only
1732 ASCII has been seen so far.
1736 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1737 is present in st->mask
1738 1 == definitive answers are here for both st->eol_type and st->mask
1742 detect_coding_type(struct detection_state *st, const Extbyte * src,
1743 Lstream_data_count n, int just_do_eol)
1745 if (st->eol_type == EOL_AUTODETECT)
1746 st->eol_type = detect_eol_type(st, src, n);
1749 return st->eol_type != EOL_AUTODETECT;
1751 if (!st->seen_non_ascii) {
1752 for (; n; n--, src++) {
1753 const unsigned char c = *(const unsigned char *)src;
1754 if ((c < 0x20 && !acceptable_control_char_p(c))
1756 st->seen_non_ascii = 1;
1758 st->shift_jis.mask = ~0;
1762 st->iso2022.mask = ~0;
1773 if (!mask_has_at_most_one_bit_p(st->iso2022.mask))
1774 st->iso2022.mask = detect_coding_iso2022(st, src, n);
1775 if (!mask_has_at_most_one_bit_p(st->shift_jis.mask))
1776 st->shift_jis.mask = detect_coding_sjis(st, src, n);
1777 if (!mask_has_at_most_one_bit_p(st->big5.mask))
1778 st->big5.mask = detect_coding_big5(st, src, n);
1779 if (!mask_has_at_most_one_bit_p(st->utf8.mask))
1780 st->utf8.mask = detect_coding_utf8(st, src, n);
1781 if (!mask_has_at_most_one_bit_p(st->ucs4.mask))
1782 st->ucs4.mask = detect_coding_ucs4(st, src, n);
1784 st->mask = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1785 | st->utf8.mask | st->ucs4.mask;
1788 int retval = mask_has_at_most_one_bit_p(st->mask);
1789 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1790 return retval && st->eol_type != EOL_AUTODETECT;
1794 static Lisp_Object coding_system_from_mask(int mask)
1797 /* If the file was entirely or basically ASCII, use the
1798 default value of `buffer-file-coding-system'. */
1799 Lisp_Object retval =
1800 XBUFFER(Vbuffer_defaults)->buffer_file_coding_system;
1801 if (!NILP(retval)) {
1802 retval = Ffind_coding_system(retval);
1805 (Qbad_variable, Qwarning,
1806 "Invalid `default-buffer-file-coding-system', set to nil");
1807 XBUFFER(Vbuffer_defaults)->
1808 buffer_file_coding_system = Qnil;
1812 retval = Fget_coding_system(Qraw_text);
1818 mask = postprocess_iso2022_mask(mask);
1820 /* Look through the coding categories by priority and find
1821 the first one that is allowed. */
1822 for (i = 0; i < CODING_CATEGORY_LAST; i++) {
1823 cat = fcd->coding_category_by_priority[i];
1826 if ((mask & (1 << cat)) &&
1827 !NILP(fcd->coding_category_system[cat]))
1831 return fcd->coding_category_system[cat];
1833 return Fget_coding_system(Qraw_text);
1837 /* Given a seekable read stream and potential coding system and EOL type
1838 as specified, do any autodetection that is called for. If the
1839 coding system and/or EOL type are not `autodetect', they will be left
1840 alone; but this function will never return an autodetect coding system
1843 This function does not automatically fetch subsidiary coding systems;
1844 that should be unnecessary with the explicit eol-type argument. */
1846 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1847 /* number of leading lines to check for a coding cookie */
1848 #define LINES_TO_CHECK 2
1851 determine_real_coding_system(lstream_t stream, Lisp_Object * codesys_in_out,
1852 eol_type_t * eol_type_in_out)
1854 struct detection_state decst;
1856 if (*eol_type_in_out == EOL_AUTODETECT)
1857 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE(*codesys_in_out);
1860 decst.eol_type = *eol_type_in_out;
1863 /* If autodetection is called for, do it now. */
1864 if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT
1865 || *eol_type_in_out == EOL_AUTODETECT) {
1867 Lisp_Object coding_system = Qnil;
1869 Lstream_data_count nread =
1870 Lstream_read(stream, buf, sizeof(buf));
1872 int lines_checked = 0;
1874 /* Look for initial "-*-"; mode line prefix */
1876 scan_end = buf + nread - LENGTH("-*-coding:?-*-");
1877 p <= scan_end && lines_checked < LINES_TO_CHECK; p++)
1878 if (*p == '-' && *(p + 1) == '*' && *(p + 2) == '-') {
1879 Extbyte *local_vars_beg = p + 3;
1880 /* Look for final "-*-"; mode line suffix */
1881 for (p = local_vars_beg,
1882 scan_end = buf + nread - LENGTH("-*-");
1884 && lines_checked < LINES_TO_CHECK; p++)
1885 if (*p == '-' && *(p + 1) == '*'
1886 && *(p + 2) == '-') {
1887 Extbyte *suffix = p;
1888 /* Look for "coding:" */
1889 for (p = local_vars_beg,
1917 /* Get coding system name */
1920 /* Characters valid in a MIME charset name (rfc 1521),
1921 and in a Lisp symbol name. */
1924 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1925 "abcdefghijklmnopqrstuvwxyz"
1947 /* #### file must use standard EOLs or we miss 2d line */
1948 /* #### not to mention this is broken for UTF-16 DOS files */
1949 else if (*p == '\n' || *p == '\r') {
1951 /* skip past multibyte (DOS) newline */
1953 && *(p + 1) == '\n')
1958 /* #### file must use standard EOLs or we miss 2d line */
1959 /* #### not to mention this is broken for UTF-16 DOS files */
1960 else if (*p == '\n' || *p == '\r') {
1962 /* skip past multibyte (DOS) newline */
1963 if (*p == '\r' && *(p + 1) == '\n')
1967 if (NILP(coding_system))
1969 if (detect_coding_type(&decst, buf, nread,
1972 != CODESYS_AUTODETECT))
1974 nread = Lstream_read(stream, buf, sizeof(buf));
1980 else if (XCODING_SYSTEM_TYPE(*codesys_in_out) ==
1982 && XCODING_SYSTEM_EOL_TYPE(coding_system) ==
1985 if (detect_coding_type(&decst, buf, nread, 1))
1987 nread = Lstream_read(stream, buf, sizeof(buf));
1993 *eol_type_in_out = decst.eol_type;
1994 if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT) {
1995 if (NILP(coding_system))
1997 coding_system_from_mask(decst.mask);
1999 *codesys_in_out = coding_system;
2003 /* If we absolutely can't determine the EOL type, just assume LF. */
2004 if (*eol_type_in_out == EOL_AUTODETECT)
2005 *eol_type_in_out = EOL_LF;
2007 Lstream_rewind(stream);
2010 DEFUN("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
2011 Detect coding system of the text in the region between START and END.
2012 Return a list of possible coding systems ordered by priority.
2013 If only ASCII characters are found, return 'undecided or one of
2014 its subsidiary coding systems according to a detected end-of-line
2015 type. Optional arg BUFFER defaults to the current buffer.
2017 (start, end, buffer))
2019 Lisp_Object val = Qnil;
2020 struct buffer *buf = decode_buffer(buffer, 0);
2022 Lisp_Object instream, lb_instream;
2023 lstream_t istr, lb_istr;
2024 struct detection_state decst;
2025 struct gcpro gcpro1, gcpro2;
2027 get_buffer_range_char(buf, start, end, &b, &e, 0);
2028 lb_instream = make_lisp_buffer_input_stream(buf, b, e, 0);
2029 lb_istr = XLSTREAM(lb_instream);
2031 make_encoding_input_stream(lb_istr, Fget_coding_system(Qbinary));
2032 istr = XLSTREAM(instream);
2033 GCPRO2(instream, lb_instream);
2035 decst.eol_type = EOL_AUTODETECT;
2038 Extbyte random_buffer[4096];
2039 Lstream_data_count nread =
2040 Lstream_read(istr, random_buffer, sizeof(random_buffer));
2044 if (detect_coding_type(&decst, random_buffer, nread, 0))
2048 if (decst.mask == ~0)
2049 val = subsidiary_coding_system(Fget_coding_system(Qundecided),
2056 decst.mask = postprocess_iso2022_mask(decst.mask);
2058 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--) {
2059 int sys = fcd->coding_category_by_priority[i];
2060 if (decst.mask & (1 << sys)) {
2061 Lisp_Object codesys =
2062 fcd->coding_category_system[sys];
2065 subsidiary_coding_system(codesys,
2068 val = Fcons(codesys, val);
2072 Lstream_close(istr);
2074 Lstream_delete(istr);
2075 Lstream_delete(lb_istr);
2079 /************************************************************************/
2080 /* Converting to internal Mule format ("decoding") */
2081 /************************************************************************/
2083 /* A decoding stream is a stream used for decoding text (i.e.
2084 converting from some external format to internal format).
2085 The decoding-stream object keeps track of the actual coding
2086 stream, the stream that is at the other end, and data that
2087 needs to be persistent across the lifetime of the stream. */
2089 /* Handle the EOL stuff related to just-read-in character C.
2090 EOL_TYPE is the EOL type of the coding stream.
2091 FLAGS is the current value of FLAGS in the coding stream, and may
2092 be modified by this macro. (The macro only looks at the
2093 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
2094 bytes are to be written. You need to also define a local goto
2095 label "label_continue_loop" that is at the end of the main
2096 character-reading loop.
2098 If C is a CR character, then this macro handles it entirely and
2099 jumps to label_continue_loop. Otherwise, this macro does not add
2100 anything to DST, and continues normally. You should continue
2101 processing C normally after this macro. */
2103 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2107 if (eol_type == EOL_CR) \
2108 Dynarr_add (dst, '\n'); \
2109 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2110 Dynarr_add (dst, c); \
2112 flags |= CODING_STATE_CR; \
2113 goto label_continue_loop; \
2115 else if (flags & CODING_STATE_CR) \
2116 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2118 Dynarr_add (dst, '\r'); \
2119 flags &= ~CODING_STATE_CR; \
2123 /* C should be a binary character in the range 0 - 255; convert
2124 to internal format and add to Dynarr DST. */
2126 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2128 if (BYTE_ASCII_P (c)) \
2129 Dynarr_add (dst, c); \
2130 else if (BYTE_C1_P (c)) \
2132 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2133 Dynarr_add (dst, c + 0x20); \
2137 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2138 Dynarr_add (dst, c); \
2142 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2146 DECODE_ADD_BINARY_CHAR (ch, dst); \
2151 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2153 if (flags & CODING_STATE_END) \
2155 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2156 if (flags & CODING_STATE_CR) \
2157 Dynarr_add (dst, '\r'); \
2161 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2163 typedef struct decoding_stream_s *decoding_stream_t;
2164 struct decoding_stream_s {
2165 /* Coding system that governs the conversion. */
2166 Lisp_Coding_System *codesys;
2168 /* Stream that we read the encoded data from or
2169 write the decoded data to. */
2170 lstream_t other_end;
2172 /* If we are reading, then we can return only a fixed amount of
2173 data, so if the conversion resulted in too much data, we store it
2174 here for retrieval the next time around. */
2175 unsigned_char_dynarr *runoff;
2177 /* FLAGS holds flags indicating the current state of the decoding.
2178 Some of these flags are dependent on the coding system. */
2181 /* CH holds a partially built-up character. Since we only deal
2182 with one- and two-byte characters at the moment, we only use
2183 this to store the first byte of a two-byte character. */
2186 /* EOL_TYPE specifies the type of end-of-line conversion that
2187 currently applies. We need to keep this separate from the
2188 EOL type stored in CODESYS because the latter might indicate
2189 automatic EOL-type detection while the former will always
2190 indicate a particular EOL type. */
2191 eol_type_t eol_type;
2193 /* Additional ISO2022 information. We define the structure above
2194 because it's also needed by the detection routines. */
2195 struct iso2022_decoder iso2022;
2197 /* Additional information (the state of the running CCL program)
2198 used by the CCL decoder. */
2199 struct ccl_program ccl;
2201 /* counter for UTF-8 or UCS-4 */
2202 unsigned char counter;
2204 struct detection_state decst;
2207 static Lstream_data_count
2208 decoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size);
2209 static Lstream_data_count
2210 decoding_writer(lstream_t stream,
2211 const unsigned char *data, Lstream_data_count size);
2212 static int decoding_rewinder(lstream_t stream);
2213 static int decoding_seekable_p(lstream_t stream);
2214 static int decoding_flusher(lstream_t stream);
2215 static int decoding_closer(lstream_t stream);
2217 static Lisp_Object decoding_marker(Lisp_Object stream);
2219 DEFINE_LSTREAM_IMPLEMENTATION("decoding", lstream_decoding,
2220 sizeof(struct decoding_stream_s));
2223 decoding_marker(Lisp_Object stream)
2225 lstream_t str = DECODING_STREAM_DATA(XLSTREAM(stream))->other_end;
2226 Lisp_Object str_obj;
2228 /* We do not need to mark the coding systems or charsets stored
2229 within the stream because they are stored in a global list
2230 and automatically marked. */
2232 XSETLSTREAM(str_obj, str);
2233 mark_object(str_obj);
2234 if (str->imp->marker) {
2235 return str->imp->marker(str_obj);
2241 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2242 so we read data from the other end, decode it, and store it into DATA. */
2244 static Lstream_data_count
2245 decoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size)
2247 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2248 unsigned char *orig_data = data;
2249 Lstream_data_count read_size;
2250 int error_occurred = 0;
2252 /* We need to interface to mule_decode(), which expects to take some
2253 amount of data and store the result into a Dynarr. We have
2254 mule_decode() store into str->runoff, and take data from there
2257 /* We loop until we have enough data, reading chunks from the other
2258 end and decoding it. */
2260 /* Take data from the runoff if we can. Make sure to take at
2261 most SIZE bytes, and delete the data from the runoff. */
2262 if (Dynarr_length(str->runoff) > 0) {
2263 Lstream_data_count chunk =
2265 (Lstream_data_count)
2266 Dynarr_length(str->runoff));
2267 memcpy(data, Dynarr_atp(str->runoff, 0), chunk);
2268 Dynarr_delete_many(str->runoff, 0, chunk);
2274 /* No more room for data */
2278 if (str->flags & CODING_STATE_END) {
2279 /* This means that on the previous iteration, we hit the
2280 EOF on the other end. We loop once more so that
2281 mule_decode() can output any final stuff it may be
2282 holding, or any "go back to a sane state" escape
2283 sequences. (This latter makes sense during
2288 /* Exhausted the runoff, so get some more. DATA has at least
2289 SIZE bytes left of storage in it, so it's OK to read directly
2290 into it. (We'll be overwriting above, after we've decoded it
2291 into the runoff.) */
2292 read_size = Lstream_read(str->other_end, data, size);
2293 if (read_size < 0) {
2297 if (read_size == 0) {
2298 /* There might be some more end data produced in the
2299 translation. See the comment above. */
2300 str->flags |= CODING_STATE_END;
2302 mule_decode(stream, (Extbyte *) data, str->runoff, read_size);
2305 if (data - orig_data == 0) {
2306 return error_occurred ? -1 : 0;
2308 return data - orig_data;
2312 static Lstream_data_count
2313 decoding_writer(lstream_t stream, const unsigned char *data,
2314 Lstream_data_count size)
2316 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2317 Lstream_data_count retval;
2319 /* Decode all our data into the runoff, and then attempt to write
2320 it all out to the other end. Remove whatever chunk we succeeded
2322 mule_decode(stream, (const Extbyte *)data, str->runoff, size);
2323 retval = Lstream_write(str->other_end, Dynarr_atp(str->runoff, 0),
2324 Dynarr_length(str->runoff));
2326 Dynarr_delete_many(str->runoff, 0, retval);
2328 /* Do NOT return retval. The return value indicates how much
2329 of the incoming data was written, not how many bytes were
2335 reset_decoding_stream(decoding_stream_t str)
2338 if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_ISO2022) {
2339 Lisp_Object coding_system;
2340 XSETCODING_SYSTEM(coding_system, str->codesys);
2341 reset_iso2022(coding_system, &str->iso2022);
2342 } else if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_CCL) {
2343 setup_ccl_program(&str->ccl,
2344 CODING_SYSTEM_CCL_DECODE(str->codesys));
2348 if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT
2349 || CODING_SYSTEM_EOL_TYPE(str->codesys) == EOL_AUTODETECT) {
2351 str->decst.eol_type = EOL_AUTODETECT;
2352 str->decst.mask = ~0;
2354 str->flags = str->ch = 0;
2358 decoding_rewinder(lstream_t stream)
2360 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2361 reset_decoding_stream(str);
2362 Dynarr_reset(str->runoff);
2363 return Lstream_rewind(str->other_end);
2367 decoding_seekable_p(lstream_t stream)
2369 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2370 return Lstream_seekable_p(str->other_end);
2374 decoding_flusher(lstream_t stream)
2376 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2377 return Lstream_flush(str->other_end);
2381 decoding_closer(lstream_t stream)
2383 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2384 if (stream->flags & LSTREAM_FL_WRITE) {
2385 str->flags |= CODING_STATE_END;
2386 decoding_writer(stream, 0, 0);
2388 Dynarr_free(str->runoff);
2390 #ifdef ENABLE_COMPOSITE_CHARS
2391 if (str->iso2022.composite_chars) {
2392 Dynarr_free(str->iso2022.composite_chars);
2396 return Lstream_close(str->other_end);
2400 decoding_stream_coding_system(lstream_t stream)
2402 Lisp_Object coding_system;
2403 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2405 XSETCODING_SYSTEM(coding_system, str->codesys);
2406 return subsidiary_coding_system(coding_system, str->eol_type);
2410 set_decoding_stream_coding_system(lstream_t lstr, Lisp_Object codesys)
2412 Lisp_Coding_System *cs = XCODING_SYSTEM(codesys);
2413 decoding_stream_t str = DECODING_STREAM_DATA(lstr);
2415 if (CODING_SYSTEM_EOL_TYPE(cs) != EOL_AUTODETECT) {
2416 str->eol_type = CODING_SYSTEM_EOL_TYPE(cs);
2418 reset_decoding_stream(str);
2422 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2423 stream for writing, no automatic code detection will be performed.
2424 The reason for this is that automatic code detection requires a
2425 seekable input. Things will also fail if you open a decoding
2426 stream for reading using a non-fully-specified coding system and
2427 a non-seekable input stream. */
2430 make_decoding_stream_1(lstream_t stream, Lisp_Object codesys, const char *mode)
2432 lstream_t lstr = Lstream_new(lstream_decoding, mode);
2433 decoding_stream_t str = DECODING_STREAM_DATA(lstr);
2437 str->other_end = stream;
2438 str->runoff = (unsigned_char_dynarr *) Dynarr_new(unsigned_char);
2439 str->eol_type = EOL_AUTODETECT;
2440 if (!strcmp(mode, "r") && Lstream_seekable_p(stream)) {
2441 /* We can determine the coding system now. */
2442 determine_real_coding_system(stream, &codesys, &str->eol_type);
2444 set_decoding_stream_coding_system(lstr, codesys);
2445 str->decst.eol_type = str->eol_type;
2446 str->decst.mask = ~0;
2447 XSETLSTREAM(obj, lstr);
2452 make_decoding_input_stream(lstream_t stream, Lisp_Object codesys)
2454 return make_decoding_stream_1(stream, codesys, "r");
2458 make_decoding_output_stream(lstream_t stream, Lisp_Object codesys)
2460 return make_decoding_stream_1(stream, codesys, "w");
2463 /* Note: the decode_coding_* functions all take the same
2464 arguments as mule_decode(), which is to say some SRC data of
2465 size N, which is to be stored into dynamic array DST.
2466 DECODING is the stream within which the decoding is
2467 taking place, but no data is actually read from or
2468 written to that stream; that is handled in decoding_reader()
2469 or decoding_writer(). This allows the same functions to
2470 be used for both reading and writing. */
2473 mule_decode(lstream_t decoding, const Extbyte * src,
2474 unsigned_char_dynarr * dst, Lstream_data_count n)
2476 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
2478 /* If necessary, do encoding-detection now. We do this when
2479 we're a writing stream or a non-seekable reading stream,
2480 meaning that we can't just process the whole input,
2481 rewind, and start over. */
2483 if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT ||
2484 str->eol_type == EOL_AUTODETECT) {
2485 Lisp_Object codesys;
2487 XSETCODING_SYSTEM(codesys, str->codesys);
2488 detect_coding_type(&str->decst, src, n,
2489 CODING_SYSTEM_TYPE(str->codesys) !=
2490 CODESYS_AUTODETECT);
2491 if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT &&
2492 str->decst.mask != ~0)
2493 /* #### This is cheesy. What we really ought to do is
2494 buffer up a certain amount of data so as to get a
2495 less random result. */
2496 codesys = coding_system_from_mask(str->decst.mask);
2497 str->eol_type = str->decst.eol_type;
2498 if (XCODING_SYSTEM(codesys) != str->codesys) {
2499 /* Preserve the CODING_STATE_END flag in case it was set.
2500 If we erase it, bad things might happen. */
2501 int was_end = str->flags & CODING_STATE_END;
2502 set_decoding_stream_coding_system(decoding, codesys);
2504 str->flags |= CODING_STATE_END;
2508 switch (CODING_SYSTEM_TYPE(str->codesys)) {
2509 #ifdef DEBUG_SXEMACS
2510 case CODESYS_INTERNAL:
2511 Dynarr_add_many(dst, src, n);
2514 case CODESYS_AUTODETECT:
2515 /* If we got this far and still haven't decided on the coding
2516 system, then do no conversion. */
2517 case CODESYS_NO_CONVERSION:
2518 decode_coding_no_conversion(decoding, src, dst, n);
2521 case CODESYS_SHIFT_JIS:
2522 decode_coding_sjis(decoding, src, dst, n);
2525 decode_coding_big5(decoding, src, dst, n);
2528 decode_coding_ucs4(decoding, src, dst, n);
2531 decode_coding_utf8(decoding, src, dst, n);
2534 str->ccl.last_block = str->flags & CODING_STATE_END;
2535 /* When applying ccl program to stream, MUST NOT set NULL
2537 ccl_driver(&str->ccl,
2539 ? (const unsigned char *)src
2540 : (const unsigned char *)""),
2541 dst, n, 0, CCL_MODE_DECODING);
2543 case CODESYS_ISO2022:
2544 decode_coding_iso2022(decoding, src, dst, n);
2552 DEFUN("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2553 Decode the text between START and END which is encoded in CODING-SYSTEM.
2554 This is useful if you've read in encoded text from a file without decoding
2555 it (e.g. you read in a JIS-formatted file but used the `binary' or
2556 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2557 Return length of decoded text.
2558 BUFFER defaults to the current buffer if unspecified.
2560 (start, end, coding_system, buffer))
2563 struct buffer *buf = decode_buffer(buffer, 0);
2564 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2565 lstream_t istr, ostr;
2566 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2568 get_buffer_range_char(buf, start, end, &b, &e, 0);
2570 barf_if_buffer_read_only(buf, b, e);
2572 coding_system = Fget_coding_system(coding_system);
2573 instream = make_lisp_buffer_input_stream(buf, b, e, 0);
2574 lb_outstream = make_lisp_buffer_output_stream(buf, b, 0);
2575 de_outstream = make_decoding_output_stream(XLSTREAM(lb_outstream),
2577 outstream = make_encoding_output_stream(XLSTREAM(de_outstream),
2578 Fget_coding_system(Qbinary));
2579 istr = XLSTREAM(instream);
2580 ostr = XLSTREAM(outstream);
2581 GCPRO4(instream, lb_outstream, de_outstream, outstream);
2583 /* The chain of streams looks like this:
2585 [BUFFER] <----- send through
2586 ------> [ENCODE AS BINARY]
2587 ------> [DECODE AS SPECIFIED]
2592 char tempbuf[1024]; /* some random amount */
2593 Bufpos newpos, even_newer_pos;
2594 Bufpos oldpos = lisp_buffer_stream_startpos(istr);
2595 Lstream_data_count size_in_bytes =
2596 Lstream_read(istr, tempbuf, sizeof(tempbuf));
2600 newpos = lisp_buffer_stream_startpos(istr);
2601 Lstream_write(ostr, tempbuf, size_in_bytes);
2602 even_newer_pos = lisp_buffer_stream_startpos(istr);
2603 buffer_delete_range(buf, even_newer_pos - (newpos - oldpos),
2606 Lstream_close(istr);
2607 Lstream_close(ostr);
2609 Lstream_delete(istr);
2610 Lstream_delete(ostr);
2611 Lstream_delete(XLSTREAM(de_outstream));
2612 Lstream_delete(XLSTREAM(lb_outstream));
2616 /************************************************************************/
2617 /* Converting to an external encoding ("encoding") */
2618 /************************************************************************/
2620 /* An encoding stream is an output stream. When you create the
2621 stream, you specify the coding system that governs the encoding
2622 and another stream that the resulting encoded data is to be
2623 sent to, and then start sending data to it. */
2625 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2627 typedef struct encoding_stream_s *encoding_stream_t;
2628 struct encoding_stream_s {
2629 /* Coding system that governs the conversion. */
2630 Lisp_Coding_System *codesys;
2632 /* Stream that we read the encoded data from or
2633 write the decoded data to. */
2634 lstream_t other_end;
2636 /* If we are reading, then we can return only a fixed amount of
2637 data, so if the conversion resulted in too much data, we store it
2638 here for retrieval the next time around. */
2639 unsigned_char_dynarr *runoff;
2641 /* FLAGS holds flags indicating the current state of the encoding.
2642 Some of these flags are dependent on the coding system. */
2645 /* CH holds a partially built-up character. Since we only deal
2646 with one- and two-byte characters at the moment, we only use
2647 this to store the first byte of a two-byte character. */
2650 /* Additional information used by the ISO2022 encoder. */
2652 /* CHARSET holds the character sets currently assigned to the G0
2653 through G3 registers. It is initialized from the array
2654 INITIAL_CHARSET in CODESYS. */
2655 Lisp_Object charset[4];
2657 /* Which registers are currently invoked into the left (GL) and
2658 right (GR) halves of the 8-bit encoding space? */
2659 int register_left, register_right;
2661 /* Whether we need to explicitly designate the charset in the
2662 G? register before using it. It is initialized from the
2663 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2664 unsigned char force_charset_on_output[4];
2666 /* Other state variables that need to be preserved across
2668 Lisp_Object current_charset;
2670 int current_char_boundary;
2673 /* Additional information (the state of the running CCL program)
2674 used by the CCL encoder. */
2675 struct ccl_program ccl;
2679 static Lstream_data_count
2680 encoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size);
2681 static Lstream_data_count
2682 encoding_writer(lstream_t stream,
2683 const unsigned char *data, Lstream_data_count size);
2684 static int encoding_rewinder(lstream_t stream);
2685 static int encoding_seekable_p(lstream_t stream);
2686 static int encoding_flusher(lstream_t stream);
2687 static int encoding_closer(lstream_t stream);
2689 static Lisp_Object encoding_marker(Lisp_Object stream);
2691 DEFINE_LSTREAM_IMPLEMENTATION("encoding", lstream_encoding,
2692 sizeof(struct encoding_stream_s));
2695 encoding_marker(Lisp_Object stream)
2697 lstream_t str = ENCODING_STREAM_DATA(XLSTREAM(stream))->other_end;
2698 Lisp_Object str_obj;
2700 /* We do not need to mark the coding systems or charsets stored
2701 within the stream because they are stored in a global list
2702 and automatically marked. */
2704 XSETLSTREAM(str_obj, str);
2705 mark_object(str_obj);
2706 if (str->imp->marker) {
2707 return str->imp->marker(str_obj);
2713 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2714 so we read data from the other end, encode it, and store it into DATA. */
2716 static Lstream_data_count
2717 encoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size)
2719 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2720 unsigned char *orig_data = data;
2721 Lstream_data_count read_size;
2722 int error_occurred = 0;
2724 /* We need to interface to mule_encode(), which expects to take some
2725 amount of data and store the result into a Dynarr. We have
2726 mule_encode() store into str->runoff, and take data from there
2729 /* We loop until we have enough data, reading chunks from the other
2730 end and encoding it. */
2732 /* Take data from the runoff if we can. Make sure to take at
2733 most SIZE bytes, and delete the data from the runoff. */
2734 if (Dynarr_length(str->runoff) > 0) {
2735 int chunk = min((int)size, Dynarr_length(str->runoff));
2736 memcpy(data, Dynarr_atp(str->runoff, 0), chunk);
2737 Dynarr_delete_many(str->runoff, 0, chunk);
2743 /* No more room for data */
2747 if (str->flags & CODING_STATE_END) {
2748 /* This means that on the previous iteration, we hit the
2749 EOF on the other end. We loop once more so that
2750 mule_encode() can output any final stuff it may be
2751 holding, or any "go back to a sane state" escape
2752 sequences. (This latter makes sense during
2757 /* Exhausted the runoff, so get some more. DATA at least SIZE
2758 bytes left of storage in it, so it's OK to read directly into
2759 it. (We'll be overwriting above, after we've encoded it into
2761 read_size = Lstream_read(str->other_end, data, size);
2762 if (read_size < 0) {
2766 if (read_size == 0) {
2767 /* There might be some more end data produced in the
2768 translation. See the comment above. */
2769 str->flags |= CODING_STATE_END;
2771 mule_encode(stream, data, str->runoff, read_size);
2774 if (data == orig_data) {
2775 return error_occurred ? -1 : 0;
2777 return data - orig_data;
2781 static Lstream_data_count
2782 encoding_writer(lstream_t stream, const unsigned char *data,
2783 Lstream_data_count size)
2785 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2786 Lstream_data_count retval;
2788 /* Encode all our data into the runoff, and then attempt to write
2789 it all out to the other end. Remove whatever chunk we succeeded
2791 mule_encode(stream, data, str->runoff, size);
2792 retval = Lstream_write(str->other_end, Dynarr_atp(str->runoff, 0),
2793 Dynarr_length(str->runoff));
2795 Dynarr_delete_many(str->runoff, 0, retval);
2797 /* Do NOT return retval. The return value indicates how much
2798 of the incoming data was written, not how many bytes were
2804 reset_encoding_stream(encoding_stream_t str)
2807 switch (CODING_SYSTEM_TYPE(str->codesys)) {
2808 case CODESYS_ISO2022: {
2811 for (i = 0; i < 4; i++) {
2812 str->iso2022.charset[i] =
2813 CODING_SYSTEM_ISO2022_INITIAL_CHARSET(
2815 str->iso2022.force_charset_on_output[i] =
2816 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT(
2819 str->iso2022.register_left = 0;
2820 str->iso2022.register_right = 1;
2821 str->iso2022.current_charset = Qnil;
2822 str->iso2022.current_half = 0;
2823 str->iso2022.current_char_boundary = 1;
2827 setup_ccl_program(&str->ccl,
2828 CODING_SYSTEM_CCL_ENCODE(str->codesys));
2831 /* list the rest of them lot explicitly */
2832 case CODESYS_AUTODETECT:
2833 case CODESYS_SHIFT_JIS:
2837 case CODESYS_NO_CONVERSION:
2838 #ifdef DEBUG_SXEMACS
2839 case CODESYS_INTERNAL:
2846 str->flags = str->ch = 0;
2850 encoding_rewinder(lstream_t stream)
2852 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2853 reset_encoding_stream(str);
2854 Dynarr_reset(str->runoff);
2855 return Lstream_rewind(str->other_end);
2859 encoding_seekable_p(lstream_t stream)
2861 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2862 return Lstream_seekable_p(str->other_end);
2866 encoding_flusher(lstream_t stream)
2868 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2869 return Lstream_flush(str->other_end);
2873 encoding_closer(lstream_t stream)
2875 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2876 if (stream->flags & LSTREAM_FL_WRITE) {
2877 str->flags |= CODING_STATE_END;
2878 encoding_writer(stream, 0, 0);
2880 Dynarr_free(str->runoff);
2881 return Lstream_close(str->other_end);
2885 encoding_stream_coding_system(lstream_t stream)
2887 Lisp_Object coding_system;
2888 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2890 XSETCODING_SYSTEM(coding_system, str->codesys);
2891 return coding_system;
2895 set_encoding_stream_coding_system(lstream_t lstr, Lisp_Object codesys)
2897 Lisp_Coding_System *cs = XCODING_SYSTEM(codesys);
2898 encoding_stream_t str = ENCODING_STREAM_DATA(lstr);
2900 reset_encoding_stream(str);
2904 make_encoding_stream_1(lstream_t stream, Lisp_Object codesys, const char *mode)
2906 lstream_t lstr = Lstream_new(lstream_encoding, mode);
2907 encoding_stream_t str = ENCODING_STREAM_DATA(lstr);
2911 str->runoff = Dynarr_new(unsigned_char);
2912 str->other_end = stream;
2913 set_encoding_stream_coding_system(lstr, codesys);
2914 XSETLSTREAM(obj, lstr);
2919 make_encoding_input_stream(lstream_t stream, Lisp_Object codesys)
2921 return make_encoding_stream_1(stream, codesys, "r");
2925 make_encoding_output_stream(lstream_t stream, Lisp_Object codesys)
2927 return make_encoding_stream_1(stream, codesys, "w");
2930 /* Convert N bytes of internally-formatted data stored in SRC to an
2931 external format, according to the encoding stream ENCODING.
2932 Store the encoded data into DST. */
2935 mule_encode(lstream_t encoding, const Bufbyte * src,
2936 unsigned_char_dynarr * dst, Lstream_data_count n)
2938 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
2940 switch (CODING_SYSTEM_TYPE(str->codesys)) {
2941 #ifdef DEBUG_SXEMACS
2942 case CODESYS_INTERNAL:
2943 Dynarr_add_many(dst, src, n);
2946 case CODESYS_AUTODETECT:
2947 /* If we got this far and still haven't decided on the coding
2948 system, then do no conversion. */
2949 case CODESYS_NO_CONVERSION:
2950 encode_coding_no_conversion(encoding, src, dst, n);
2953 case CODESYS_SHIFT_JIS:
2954 encode_coding_sjis(encoding, src, dst, n);
2957 encode_coding_big5(encoding, src, dst, n);
2960 encode_coding_ucs4(encoding, src, dst, n);
2963 encode_coding_utf8(encoding, src, dst, n);
2966 str->ccl.last_block = str->flags & CODING_STATE_END;
2967 /* When applying ccl program to stream, MUST NOT set NULL
2969 ccl_driver(&str->ccl, ((src) ? src : (unsigned char *)""),
2970 dst, n, 0, CCL_MODE_ENCODING);
2972 case CODESYS_ISO2022:
2973 encode_coding_iso2022(encoding, src, dst, n);
2981 DEFUN("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2982 Encode the text between START and END using CODING-SYSTEM.
2983 This will, for example, convert Japanese characters into stuff such as
2984 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2985 text. BUFFER defaults to the current buffer if unspecified.
2987 (start, end, coding_system, buffer))
2990 struct buffer *buf = decode_buffer(buffer, 0);
2991 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2992 lstream_t istr, ostr;
2993 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2995 get_buffer_range_char(buf, start, end, &b, &e, 0);
2997 barf_if_buffer_read_only(buf, b, e);
2999 coding_system = Fget_coding_system(coding_system);
3000 instream = make_lisp_buffer_input_stream(buf, b, e, 0);
3001 lb_outstream = make_lisp_buffer_output_stream(buf, b, 0);
3002 de_outstream = make_decoding_output_stream(XLSTREAM(lb_outstream),
3003 Fget_coding_system(Qbinary));
3004 outstream = make_encoding_output_stream(XLSTREAM(de_outstream),
3006 istr = XLSTREAM(instream);
3007 ostr = XLSTREAM(outstream);
3008 GCPRO4(instream, outstream, de_outstream, lb_outstream);
3009 /* The chain of streams looks like this:
3011 [BUFFER] <----- send through
3012 ------> [ENCODE AS SPECIFIED]
3013 ------> [DECODE AS BINARY]
3017 char tempbuf[1024]; /* some random amount */
3018 Bufpos newpos, even_newer_pos;
3019 Bufpos oldpos = lisp_buffer_stream_startpos(istr);
3020 Lstream_data_count size_in_bytes =
3021 Lstream_read(istr, tempbuf, sizeof(tempbuf));
3025 newpos = lisp_buffer_stream_startpos(istr);
3026 Lstream_write(ostr, tempbuf, size_in_bytes);
3027 even_newer_pos = lisp_buffer_stream_startpos(istr);
3028 buffer_delete_range(buf, even_newer_pos - (newpos - oldpos),
3034 lisp_buffer_stream_startpos(XLSTREAM(instream)) - b;
3035 Lstream_close(istr);
3036 Lstream_close(ostr);
3038 Lstream_delete(istr);
3039 Lstream_delete(ostr);
3040 Lstream_delete(XLSTREAM(de_outstream));
3041 Lstream_delete(XLSTREAM(lb_outstream));
3042 return make_int(retlen);
3048 /************************************************************************/
3049 /* Shift-JIS methods */
3050 /************************************************************************/
3052 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3053 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3054 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3055 encoded by "position-code + 0x80". A character of JISX0208
3056 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3057 position-codes are divided and shifted so that it fit in the range
3060 --- CODE RANGE of Shift-JIS ---
3061 (character set) (range)
3063 JISX0201-Kana 0xA0 .. 0xDF
3064 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
3065 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3066 -------------------------------
3070 /* Is this the first byte of a Shift-JIS two-byte char? */
3072 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3073 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3075 /* Is this the second byte of a Shift-JIS two-byte char? */
3077 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3078 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3080 #define BYTE_SJIS_KATAKANA_P(c) \
3081 ((c) >= 0xA1 && (c) <= 0xDF)
3084 detect_coding_sjis(struct detection_state *st, const Extbyte * src,
3085 Lstream_data_count n)
3088 const unsigned char c = *(const unsigned char *)src++;
3089 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3091 if (st->shift_jis.in_second_byte) {
3092 st->shift_jis.in_second_byte = 0;
3095 } else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3096 st->shift_jis.in_second_byte = 1;
3098 return CODING_CATEGORY_SHIFT_JIS_MASK;
3101 /* Convert Shift-JIS data to internal format. */
3104 decode_coding_sjis(lstream_t decoding, const Extbyte * src,
3105 unsigned_char_dynarr * dst, Lstream_data_count n)
3107 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3108 unsigned int flags = str->flags;
3109 unsigned int ch = str->ch;
3110 eol_type_t eol_type = str->eol_type;
3113 const unsigned char c = *(const unsigned char *)src++;
3116 /* Previous character was first byte of Shift-JIS Kanji
3118 if (BYTE_SJIS_TWO_BYTE_2_P(c)) {
3119 unsigned char e1, e2;
3121 Dynarr_add(dst, LEADING_BYTE_JAPANESE_JISX0208);
3122 DECODE_SJIS(ch, c, e1, e2);
3123 Dynarr_add(dst, e1);
3124 Dynarr_add(dst, e2);
3126 DECODE_ADD_BINARY_CHAR(ch, dst);
3127 DECODE_ADD_BINARY_CHAR(c, dst);
3131 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3132 if (BYTE_SJIS_TWO_BYTE_1_P(c))
3134 else if (BYTE_SJIS_KATAKANA_P(c)) {
3135 Dynarr_add(dst, LEADING_BYTE_KATAKANA_JISX0201);
3138 DECODE_ADD_BINARY_CHAR(c, dst);
3140 label_continue_loop:;
3143 DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
3149 /* Convert internally-formatted data to Shift-JIS. */
3152 encode_coding_sjis(lstream_t encoding, const Bufbyte * src,
3153 unsigned_char_dynarr * dst, Lstream_data_count n)
3155 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3156 unsigned int flags = str->flags;
3157 unsigned int ch = str->ch;
3158 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3163 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3164 Dynarr_add(dst, '\r');
3165 if (eol_type != EOL_CR)
3166 Dynarr_add(dst, '\n');
3168 } else if (BYTE_ASCII_P(c)) {
3171 } else if (BUFBYTE_LEADING_BYTE_P(c))
3172 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
3173 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3174 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
3176 if (ch == LEADING_BYTE_KATAKANA_JISX0201) {
3179 } else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3180 ch == LEADING_BYTE_JAPANESE_JISX0208)
3183 /* j1 is bessel j1 function,
3184 * so we use something else */
3185 /* unsigned char j1, j2; */
3186 unsigned char tt1, tt2;
3188 ENCODE_SJIS(ch, c, tt1, tt2);
3189 Dynarr_add(dst, tt1);
3190 Dynarr_add(dst, tt2);
3200 DEFUN("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3201 Decode a JISX0208 character of Shift-JIS coding-system.
3202 CODE is the character code in Shift-JIS as a cons of type bytes.
3203 Return the corresponding character.
3207 unsigned char c1, c2, s1, s2;
3210 CHECK_INT(XCAR(code));
3211 CHECK_INT(XCDR(code));
3212 s1 = XINT(XCAR(code));
3213 s2 = XINT(XCDR(code));
3214 if (BYTE_SJIS_TWO_BYTE_1_P(s1) && BYTE_SJIS_TWO_BYTE_2_P(s2)) {
3215 DECODE_SJIS(s1, s2, c1, c2);
3216 return make_char(MAKE_CHAR(Vcharset_japanese_jisx0208,
3217 c1 & 0x7F, c2 & 0x7F));
3222 DEFUN("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3223 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3224 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3228 Lisp_Object charset;
3231 CHECK_CHAR_COERCE_INT(character);
3232 BREAKUP_CHAR(XCHAR(character), charset, c1, c2);
3233 if (EQ(charset, Vcharset_japanese_jisx0208)) {
3234 ENCODE_SJIS(c1 | 0x80, c2 | 0x80, s1, s2);
3235 return Fcons(make_int(s1), make_int(s2));
3240 /************************************************************************/
3242 /************************************************************************/
3244 /* BIG5 is a coding system encoding two character sets: ASCII and
3245 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3246 character set and is encoded in two-byte.
3248 --- CODE RANGE of BIG5 ---
3249 (character set) (range)
3251 Big5 (1st byte) 0xA1 .. 0xFE
3252 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3253 --------------------------
3255 Since the number of characters in Big5 is larger than maximum
3256 characters in Emacs' charset (96x96), it can't be handled as one
3257 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3258 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3259 contains frequently used characters and the latter contains less
3260 frequently used characters. */
3262 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3263 ((c) >= 0xA1 && (c) <= 0xFE)
3265 /* Is this the second byte of a Shift-JIS two-byte char? */
3267 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3268 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3270 /* Number of Big5 characters which have the same code in 1st byte. */
3272 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3274 /* Code conversion macros. These are macros because they are used in
3275 inner loops during code conversion.
3277 Note that temporary variables in macros introduce the classic
3278 dynamic-scoping problems with variable names. We use capital-
3279 lettered variables in the assumption that SXEmacs does not use
3280 capital letters in variables except in a very formalized way
3283 /* Convert Big5 code (b1, b2) into its internal string representation
3286 /* There is a much simpler way to split the Big5 charset into two.
3287 For the moment I'm going to leave the algorithm as-is because it
3288 claims to separate out the most-used characters into a single
3289 charset, which perhaps will lead to optimizations in various
3292 The way the algorithm works is something like this:
3294 Big5 can be viewed as a 94x157 charset, where the row is
3295 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3296 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3297 the split between low and high column numbers is apparently
3298 meaningless; ascending rows produce less and less frequent chars.
3299 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3300 the first charset, and the upper half (0xC9 .. 0xFE) to the
3301 second. To do the conversion, we convert the character into
3302 a single number where 0 .. 156 is the first row, 157 .. 313
3303 is the second, etc. That way, the characters are ordered by
3304 decreasing frequency. Then we just chop the space in two
3305 and coerce the result into a 94x94 space.
3308 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3310 int B1 = b1, B2 = b2; \
3312 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3316 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3320 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3321 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3323 c1 = I / (0xFF - 0xA1) + 0xA1; \
3324 c2 = I % (0xFF - 0xA1) + 0xA1; \
3327 /* Convert the internal string representation of a Big5 character
3328 (lb, c1, c2) into Big5 code (b1, b2). */
3330 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3332 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3334 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3336 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3338 b1 = I / BIG5_SAME_ROW + 0xA1; \
3339 b2 = I % BIG5_SAME_ROW; \
3340 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3344 detect_coding_big5(struct detection_state *st, const Extbyte * src,
3345 Lstream_data_count n)
3348 const unsigned char c = *(const unsigned char *)src++;
3349 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3350 (c >= 0x80 && c <= 0xA0))
3352 if (st->big5.in_second_byte) {
3353 st->big5.in_second_byte = 0;
3354 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3356 } else if (c >= 0xA1)
3357 st->big5.in_second_byte = 1;
3359 return CODING_CATEGORY_BIG5_MASK;
3362 /* Convert Big5 data to internal format. */
3365 decode_coding_big5(lstream_t decoding, const Extbyte * src,
3366 unsigned_char_dynarr * dst, Lstream_data_count n)
3368 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3369 unsigned int flags = str->flags;
3370 unsigned int ch = str->ch;
3371 eol_type_t eol_type = str->eol_type;
3374 const unsigned char c = *(const unsigned char *)src++;
3376 /* Previous character was first byte of Big5 char. */
3377 if (BYTE_BIG5_TWO_BYTE_2_P(c)) {
3378 unsigned char b1, b2, b3;
3379 DECODE_BIG5(ch, c, b1, b2, b3);
3380 Dynarr_add(dst, b1);
3381 Dynarr_add(dst, b2);
3382 Dynarr_add(dst, b3);
3384 DECODE_ADD_BINARY_CHAR(ch, dst);
3385 DECODE_ADD_BINARY_CHAR(c, dst);
3389 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3390 if (BYTE_BIG5_TWO_BYTE_1_P(c))
3393 DECODE_ADD_BINARY_CHAR(c, dst);
3395 label_continue_loop:;
3398 DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
3404 /* Convert internally-formatted data to Big5. */
3407 encode_coding_big5(lstream_t encoding, const Bufbyte * src,
3408 unsigned_char_dynarr * dst, Lstream_data_count n)
3411 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3412 unsigned int flags = str->flags;
3413 unsigned int ch = str->ch;
3414 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3419 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3420 Dynarr_add(dst, '\r');
3421 if (eol_type != EOL_CR)
3422 Dynarr_add(dst, '\n');
3423 } else if (BYTE_ASCII_P(c)) {
3426 } else if (BUFBYTE_LEADING_BYTE_P(c)) {
3427 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3428 c == LEADING_BYTE_CHINESE_BIG5_2) {
3429 /* A recognized leading byte. */
3431 continue; /* not done with this character. */
3433 /* otherwise just ignore this character. */
3434 } else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3435 ch == LEADING_BYTE_CHINESE_BIG5_2) {
3436 /* Previous char was a recognized leading byte. */
3438 continue; /* not done with this character. */
3440 /* Encountering second byte of a Big5 character. */
3441 unsigned char b1, b2;
3443 ENCODE_BIG5(ch >> 8, ch & 0xFF, c, b1, b2);
3444 Dynarr_add(dst, b1);
3445 Dynarr_add(dst, b2);
3455 DEFUN("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3456 Decode a Big5 character CODE of BIG5 coding-system.
3457 CODE is the character code in BIG5, a cons of two integers.
3458 Return the corresponding character.
3462 unsigned char c1, c2, b1, b2;
3465 CHECK_INT(XCAR(code));
3466 CHECK_INT(XCDR(code));
3467 b1 = XINT(XCAR(code));
3468 b2 = XINT(XCDR(code));
3469 if (BYTE_BIG5_TWO_BYTE_1_P(b1) && BYTE_BIG5_TWO_BYTE_2_P(b2)) {
3471 Lisp_Object charset;
3472 DECODE_BIG5(b1, b2, leading_byte, c1, c2);
3473 charset = CHARSET_BY_LEADING_BYTE(leading_byte);
3474 return make_char(MAKE_CHAR(charset, c1 & 0x7F, c2 & 0x7F));
3479 DEFUN("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3480 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3481 Return the corresponding character code in Big5.
3485 Lisp_Object charset;
3488 CHECK_CHAR_COERCE_INT(character);
3489 BREAKUP_CHAR(XCHAR(character), charset, c1, c2);
3490 if (EQ(charset, Vcharset_chinese_big5_1) ||
3491 EQ(charset, Vcharset_chinese_big5_2)) {
3492 ENCODE_BIG5(XCHARSET_LEADING_BYTE(charset), c1 | 0x80,
3494 return Fcons(make_int(b1), make_int(b2));
3499 /************************************************************************/
3502 /* UCS-4 character codes are implemented as nonnegative integers. */
3504 /************************************************************************/
3506 DEFUN("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3507 Map UCS-4 code CODE to Mule character CHARACTER.
3509 Return T on success, NIL on failure.
3515 CHECK_CHAR(character);
3519 if (c < countof(fcd->ucs_to_mule_table)) {
3520 fcd->ucs_to_mule_table[c] = character;
3526 static Lisp_Object ucs_to_char(unsigned long code)
3528 if (code < countof(fcd->ucs_to_mule_table)) {
3529 return fcd->ucs_to_mule_table[code];
3530 } else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14)) {
3534 c = code % (94 * 94);
3536 (MAKE_CHAR(CHARSET_BY_ATTRIBUTES
3537 (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3538 CHARSET_LEFT_TO_RIGHT),
3539 c / 94 + 33, c % 94 + 33));
3544 DEFUN("ucs-char", Fucs_char, 1, 1, 0, /*
3545 Return Mule character corresponding to UCS code CODE (a positive integer).
3550 return ucs_to_char(XINT(code));
3553 DEFUN("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3554 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3558 /* #### Isn't this gilding the lily? Fput_char_table checks its args.
3559 Fset_char_ucs is more restrictive on index arg, but should
3560 check code arg in a char_table method. */
3561 CHECK_CHAR(character);
3563 return Fput_char_table(character, code, mule_to_ucs_table);
3566 DEFUN("char-ucs", Fchar_ucs, 1, 1, 0, /*
3567 Return the UCS code (a positive integer) corresponding to CHARACTER.
3571 return Fget_char_table(character, mule_to_ucs_table);
3574 /* Decode a UCS-4 character into a buffer. If the lookup fails, use
3575 <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3576 is not found, instead.
3577 #### do something more appropriate (use blob?)
3578 Danger, Will Robinson! Data loss. Should we signal user? */
3579 static void decode_ucs4(unsigned long ch, unsigned_char_dynarr * dst)
3581 Lisp_Object chr = ucs_to_char(ch);
3584 Bufbyte work[MAX_EMCHAR_LEN];
3589 simple_set_charptr_emchar(work, ch) :
3590 non_ascii_set_charptr_emchar(work, ch);
3591 Dynarr_add_many(dst, work, len);
3593 Dynarr_add(dst, LEADING_BYTE_JAPANESE_JISX0208);
3594 Dynarr_add(dst, 34 + 128);
3595 Dynarr_add(dst, 46 + 128);
3599 static unsigned long
3600 mule_char_to_ucs4(Lisp_Object charset, unsigned char h, unsigned char l)
3603 = Fget_char_table(make_char(MAKE_CHAR(charset, h & 127, l & 127)),
3608 } else if ((XCHARSET_DIMENSION(charset) == 2) &&
3609 (XCHARSET_CHARS(charset) == 94)) {
3610 unsigned char final = XCHARSET_FINAL(charset);
3612 if (('@' <= final) && (final < 0x7f)) {
3613 return 0xe00000 + (final - '@') * 94 * 94
3614 + ((h & 127) - 33) * 94 + (l & 127) - 33;
3624 encode_ucs4(Lisp_Object charset,
3625 unsigned char h, unsigned char l, unsigned_char_dynarr * dst)
3627 unsigned long code = mule_char_to_ucs4(charset, h, l);
3628 Dynarr_add(dst, code >> 24);
3629 Dynarr_add(dst, (code >> 16) & 255);
3630 Dynarr_add(dst, (code >> 8) & 255);
3631 Dynarr_add(dst, code & 255);
3635 detect_coding_ucs4(struct detection_state *st, const Extbyte * src,
3636 Lstream_data_count n)
3639 const unsigned char c = *(const unsigned char *)src++;
3640 switch (st->ucs4.in_byte) {
3648 st->ucs4.in_byte = 0;
3654 return CODING_CATEGORY_UCS4_MASK;
3658 decode_coding_ucs4(lstream_t decoding, const Extbyte * src,
3659 unsigned_char_dynarr * dst, Lstream_data_count n)
3661 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3662 unsigned int flags = str->flags;
3663 unsigned int ch = str->ch;
3664 unsigned char counter = str->counter;
3667 const unsigned char c = *(const unsigned char *)src++;
3674 decode_ucs4((ch << 8) | c, dst);
3683 if (counter & CODING_STATE_END)
3684 DECODE_OUTPUT_PARTIAL_CHAR(ch);
3688 str->counter = counter;
3692 encode_coding_ucs4(lstream_t encoding, const Bufbyte * src,
3693 unsigned_char_dynarr * dst, Lstream_data_count n)
3695 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3696 unsigned int flags = str->flags;
3697 unsigned int ch = str->ch;
3698 unsigned char char_boundary = str->iso2022.current_char_boundary;
3699 Lisp_Object charset = str->iso2022.current_charset;
3701 #ifdef ENABLE_COMPOSITE_CHARS
3702 /* flags for handling composite chars. We do a little switcharoo
3703 on the source while we're outputting the composite char. */
3704 unsigned int saved_n = 0;
3705 const unsigned char *saved_src = NULL;
3706 int in_composite = 0;
3712 unsigned char c = *src++;
3714 if (BYTE_ASCII_P(c)) { /* Processing ASCII character */
3716 encode_ucs4(Vcharset_ascii, c, 0, dst);
3718 } else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) { /* Processing Leading Byte */
3720 charset = CHARSET_BY_LEADING_BYTE(c);
3721 if (LEADING_BYTE_PREFIX_P(c))
3724 } else { /* Processing Non-ASCII character */
3726 if (EQ(charset, Vcharset_control_1)) {
3727 encode_ucs4(Vcharset_control_1, c, 0, dst);
3729 switch (XCHARSET_REP_BYTES(charset)) {
3731 encode_ucs4(charset, c, 0, dst);
3734 if (XCHARSET_PRIVATE_P(charset)) {
3735 encode_ucs4(charset, c, 0, dst);
3738 #ifdef ENABLE_COMPOSITE_CHARS
3741 Vcharset_composite)) {
3743 /* #### Bother! We don't know how to
3756 (Vcharset_composite,
3761 composite_char_string
3770 n = XSTRING_LENGTH(lstr);
3773 #endif /* ENABLE_COMPOSITE_CHARS */
3775 encode_ucs4(charset, ch,
3786 encode_ucs4(charset, ch, c,
3801 #ifdef ENABLE_COMPOSITE_CHARS
3806 goto back_to_square_n; /* Wheeeeeeeee ..... */
3808 #endif /* ENABLE_COMPOSITE_CHARS */
3812 str->iso2022.current_char_boundary = char_boundary;
3813 str->iso2022.current_charset = charset;
3815 /* Verbum caro factum est! */
3818 /************************************************************************/
3820 /************************************************************************/
3823 detect_coding_utf8(struct detection_state *st, const Extbyte * src,
3824 Lstream_data_count n)
3827 const unsigned char c = *(const unsigned char *)src++;
3828 switch (st->utf8.in_byte) {
3830 if (c == ISO_CODE_ESC || c == ISO_CODE_SI
3831 || c == ISO_CODE_SO)
3834 st->utf8.in_byte = 5;
3836 st->utf8.in_byte = 4;
3838 st->utf8.in_byte = 3;
3840 st->utf8.in_byte = 2;
3842 st->utf8.in_byte = 1;
3847 if ((c & 0xc0) != 0x80)
3853 return CODING_CATEGORY_UTF8_MASK;
3857 decode_coding_utf8(lstream_t decoding, const Extbyte * src,
3858 unsigned_char_dynarr * dst, Lstream_data_count n)
3860 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3861 unsigned int flags = str->flags;
3862 unsigned int ch = str->ch;
3863 eol_type_t eol_type = str->eol_type;
3864 unsigned char counter = str->counter;
3867 const unsigned char c = *(const unsigned char *)src++;
3873 } else if (c >= 0xf8) {
3876 } else if (c >= 0xf0) {
3879 } else if (c >= 0xe0) {
3882 } else if (c >= 0xc0) {
3886 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3887 decode_ucs4(c, dst);
3891 ch = (ch << 6) | (c & 0x3f);
3892 decode_ucs4(ch, dst);
3897 ch = (ch << 6) | (c & 0x3f);
3900 label_continue_loop:;
3903 if (flags & CODING_STATE_END)
3904 DECODE_OUTPUT_PARTIAL_CHAR(ch);
3908 str->counter = counter;
3912 encode_utf8(Lisp_Object charset,
3913 unsigned char h, unsigned char l, unsigned_char_dynarr * dst)
3915 unsigned long code = mule_char_to_ucs4(charset, h, l);
3917 Dynarr_add(dst, code);
3918 } else if (code <= 0x7ff) {
3919 Dynarr_add(dst, (code >> 6) | 0xc0);
3920 Dynarr_add(dst, (code & 0x3f) | 0x80);
3921 } else if (code <= 0xffff) {
3922 Dynarr_add(dst, (code >> 12) | 0xe0);
3923 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3924 Dynarr_add(dst, (code & 0x3f) | 0x80);
3925 } else if (code <= 0x1fffff) {
3926 Dynarr_add(dst, (code >> 18) | 0xf0);
3927 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3928 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3929 Dynarr_add(dst, (code & 0x3f) | 0x80);
3930 } else if (code <= 0x3ffffff) {
3931 Dynarr_add(dst, (code >> 24) | 0xf8);
3932 Dynarr_add(dst, ((code >> 18) & 0x3f) | 0x80);
3933 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3934 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3935 Dynarr_add(dst, (code & 0x3f) | 0x80);
3937 Dynarr_add(dst, (code >> 30) | 0xfc);
3938 Dynarr_add(dst, ((code >> 24) & 0x3f) | 0x80);
3939 Dynarr_add(dst, ((code >> 18) & 0x3f) | 0x80);
3940 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3941 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3942 Dynarr_add(dst, (code & 0x3f) | 0x80);
3947 encode_coding_utf8(lstream_t encoding, const Bufbyte * src,
3948 unsigned_char_dynarr * dst, Lstream_data_count n)
3950 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3951 unsigned int flags = str->flags;
3952 unsigned int ch = str->ch;
3953 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3954 unsigned char char_boundary = str->iso2022.current_char_boundary;
3955 Lisp_Object charset = str->iso2022.current_charset;
3957 #ifdef ENABLE_COMPOSITE_CHARS
3958 /* flags for handling composite chars. We do a little switcharoo
3959 on the source while we're outputting the composite char. */
3960 unsigned int saved_n = 0;
3961 const unsigned char *saved_src = NULL;
3962 int in_composite = 0;
3965 #endif /* ENABLE_COMPOSITE_CHARS */
3968 unsigned char c = *src++;
3970 if (BYTE_ASCII_P(c)) { /* Processing ASCII character */
3973 if (eol_type != EOL_LF
3974 && eol_type != EOL_AUTODETECT)
3975 Dynarr_add(dst, '\r');
3976 if (eol_type != EOL_CR)
3979 encode_utf8(Vcharset_ascii, c, 0, dst);
3981 } else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) { /* Processing Leading Byte */
3983 charset = CHARSET_BY_LEADING_BYTE(c);
3984 if (LEADING_BYTE_PREFIX_P(c))
3987 } else { /* Processing Non-ASCII character */
3989 if (EQ(charset, Vcharset_control_1)) {
3990 encode_utf8(Vcharset_control_1, c, 0, dst);
3992 switch (XCHARSET_REP_BYTES(charset)) {
3994 encode_utf8(charset, c, 0, dst);
3997 if (XCHARSET_PRIVATE_P(charset)) {
3998 encode_utf8(charset, c, 0, dst);
4001 #ifdef ENABLE_COMPOSITE_CHARS
4004 Vcharset_composite)) {
4006 /* #### Bother! We don't know how to
4015 (Vcharset_composite,
4020 composite_char_string
4029 n = XSTRING_LENGTH(lstr);
4032 #endif /* ENABLE_COMPOSITE_CHARS */
4034 encode_utf8(charset, ch,
4045 encode_utf8(charset, ch, c,
4060 #ifdef ENABLE_COMPOSITE_CHARS
4065 goto back_to_square_n; /* Wheeeeeeeee ..... */
4071 str->iso2022.current_char_boundary = char_boundary;
4072 str->iso2022.current_charset = charset;
4074 /* Verbum caro factum est! */
4077 /************************************************************************/
4078 /* ISO2022 methods */
4079 /************************************************************************/
4081 /* The following note describes the coding system ISO2022 briefly.
4082 Since the intention of this note is to help understand the
4083 functions in this file, some parts are NOT ACCURATE or OVERLY
4084 SIMPLIFIED. For thorough understanding, please refer to the
4085 original document of ISO2022.
4087 ISO2022 provides many mechanisms to encode several character sets
4088 in 7-bit and 8-bit environments. For 7-bit environments, all text
4089 is encoded using bytes less than 128. This may make the encoded
4090 text a little bit longer, but the text passes more easily through
4091 several gateways, some of which strip off MSB (Most Signigant Bit).
4093 There are two kinds of character sets: control character set and
4094 graphic character set. The former contains control characters such
4095 as `newline' and `escape' to provide control functions (control
4096 functions are also provided by escape sequences). The latter
4097 contains graphic characters such as 'A' and '-'. Emacs recognizes
4098 two control character sets and many graphic character sets.
4100 Graphic character sets are classified into one of the following
4101 four classes, according to the number of bytes (DIMENSION) and
4102 number of characters in one dimension (CHARS) of the set:
4103 - DIMENSION1_CHARS94
4104 - DIMENSION1_CHARS96
4105 - DIMENSION2_CHARS94
4106 - DIMENSION2_CHARS96
4108 In addition, each character set is assigned an identification tag,
4109 unique for each set, called "final character" (denoted as <F>
4110 hereafter). The <F> of each character set is decided by ECMA(*)
4111 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4112 (0x30..0x3F are for private use only).
4114 Note (*): ECMA = European Computer Manufacturers Association
4116 Here are examples of graphic character set [NAME(<F>)]:
4117 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4118 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4119 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4120 o DIMENSION2_CHARS96 -- none for the moment
4122 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4123 C0 [0x00..0x1F] -- control character plane 0
4124 GL [0x20..0x7F] -- graphic character plane 0
4125 C1 [0x80..0x9F] -- control character plane 1
4126 GR [0xA0..0xFF] -- graphic character plane 1
4128 A control character set is directly designated and invoked to C0 or
4129 C1 by an escape sequence. The most common case is that:
4130 - ISO646's control character set is designated/invoked to C0, and
4131 - ISO6429's control character set is designated/invoked to C1,
4132 and usually these designations/invocations are omitted in encoded
4133 text. In a 7-bit environment, only C0 can be used, and a control
4134 character for C1 is encoded by an appropriate escape sequence to
4135 fit into the environment. All control characters for C1 are
4136 defined to have corresponding escape sequences.
4138 A graphic character set is at first designated to one of four
4139 graphic registers (G0 through G3), then these graphic registers are
4140 invoked to GL or GR. These designations and invocations can be
4141 done independently. The most common case is that G0 is invoked to
4142 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4143 these invocations and designations are omitted in encoded text.
4144 In a 7-bit environment, only GL can be used.
4146 When a graphic character set of CHARS94 is invoked to GL, codes
4147 0x20 and 0x7F of the GL area work as control characters SPACE and
4148 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4151 There are two ways of invocation: locking-shift and single-shift.
4152 With locking-shift, the invocation lasts until the next different
4153 invocation, whereas with single-shift, the invocation affects the
4154 following character only and doesn't affect the locking-shift
4155 state. Invocations are done by the following control characters or
4158 ----------------------------------------------------------------------
4159 abbrev function cntrl escape seq description
4160 ----------------------------------------------------------------------
4161 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4162 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4163 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4164 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4165 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4166 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4167 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4168 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4169 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4170 ----------------------------------------------------------------------
4171 (*) These are not used by any known coding system.
4173 Control characters for these functions are defined by macros
4174 ISO_CODE_XXX in `coding.h'.
4176 Designations are done by the following escape sequences:
4177 ----------------------------------------------------------------------
4178 escape sequence description
4179 ----------------------------------------------------------------------
4180 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4181 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4182 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4183 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4184 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4185 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4186 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4187 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4188 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4189 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4190 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4191 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4192 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4193 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4194 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4195 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4196 ----------------------------------------------------------------------
4198 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4199 of dimension 1, chars 94, and final character <F>, etc...
4201 Note (*): Although these designations are not allowed in ISO2022,
4202 Emacs accepts them on decoding, and produces them on encoding
4203 CHARS96 character sets in a coding system which is characterized as
4204 7-bit environment, non-locking-shift, and non-single-shift.
4206 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4207 '(' can be omitted. We refer to this as "short-form" hereafter.
4209 Now you may notice that there are a lot of ways for encoding the
4210 same multilingual text in ISO2022. Actually, there exist many
4211 coding systems such as Compound Text (used in X11's inter client
4212 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4213 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4214 localized platforms), and all of these are variants of ISO2022.
4216 In addition to the above, Emacs handles two more kinds of escape
4217 sequences: ISO6429's direction specification and Emacs' private
4218 sequence for specifying character composition.
4220 ISO6429's direction specification takes the following form:
4221 o CSI ']' -- end of the current direction
4222 o CSI '0' ']' -- end of the current direction
4223 o CSI '1' ']' -- start of left-to-right text
4224 o CSI '2' ']' -- start of right-to-left text
4225 The control character CSI (0x9B: control sequence introducer) is
4226 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4228 Character composition specification takes the following form:
4229 o ESC '0' -- start character composition
4230 o ESC '1' -- end character composition
4231 Since these are not standard escape sequences of any ISO standard,
4232 their use with these meanings is restricted to Emacs only. */
4235 reset_iso2022(Lisp_Object coding_system, struct iso2022_decoder *iso)
4239 for (i = 0; i < 4; i++) {
4240 if (!NILP(coding_system))
4242 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET
4245 iso->charset[i] = Qt;
4246 iso->invalid_designated[i] = 0;
4248 iso->esc = ISO_ESC_NOTHING;
4249 iso->esc_bytes_index = 0;
4250 iso->register_left = 0;
4251 iso->register_right = 1;
4252 iso->switched_dir_and_no_valid_charset_yet = 0;
4253 iso->invalid_switch_dir = 0;
4254 iso->output_direction_sequence = 0;
4255 iso->output_literally = 0;
4256 #ifdef ENABLE_COMPOSITE_CHARS
4257 if (iso->composite_chars)
4258 Dynarr_reset(iso->composite_chars);
4262 static int fit_to_be_escape_quoted(unsigned char c)
4278 /* Parse one byte of an ISO2022 escape sequence.
4279 If the result is an invalid escape sequence, return 0 and
4280 do not change anything in STR. Otherwise, if the result is
4281 an incomplete escape sequence, update ISO2022.ESC and
4282 ISO2022.ESC_BYTES and return -1. Otherwise, update
4283 all the state variables (but not ISO2022.ESC_BYTES) and
4286 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4287 or invocation of an invalid character set and treat that as
4288 an unrecognized escape sequence.
4290 ********************************************************************
4292 #### Strategies for error annotation and coding orthogonalization
4294 We really want to separate out a number of things. Conceptually,
4295 there is a nested syntax.
4297 At the top level is the ISO 2022 extension syntax, including charset
4298 designation and invocation, and certain auxiliary controls such as the
4299 ISO 6429 direction specification. These are octet-oriented, with the
4300 single exception (AFAIK) of the "exit Unicode" sequence which uses the
4301 UTF's natural width (1 byte for UTF-7 and UTF-8, 2 bytes for UCS-2 and
4302 UTF-16, and 4 bytes for UCS-4 and UTF-32). This will be treated as a
4303 (deprecated) special case in Unicode processing.
4305 The middle layer is ISO 2022 character interpretation. This will depend
4306 on the current state of the ISO 2022 registers, and assembles octets
4307 into the character's internal representation.
4309 The lowest level is translating system control conventions. At present
4310 this is restricted to newline translation, but one could imagine doing
4311 tab conversion or line wrapping here. "Escape from Unicode" processing
4312 would be done at this level.
4314 At each level the parser will verify the syntax. In the case of a
4315 syntax error or warning (such as a redundant escape sequence that affects
4316 no characters), the parser will take some action, typically inserting the
4317 erroneous octets directly into the output and creating an annotation
4318 which can be used by higher level I/O to mark the affected region.
4320 This should make it possible to do something sensible about separating
4321 newline convention processing from character construction, and about
4322 preventing ISO 2022 escape sequences from being recognized
4325 The basic strategy will be to have octet classification tables, and
4326 switch processing according to the table entry.
4328 It's possible that, by doing the processing with tables of functions or
4329 the like, the parser can be used for both detection and translation. */
4332 parse_iso2022_esc(Lisp_Object codesys, struct iso2022_decoder *iso,
4333 unsigned char c, unsigned int *flags,
4334 int check_invalid_charsets)
4336 /* (1) If we're at the end of a designation sequence, CS is the
4337 charset being designated and REG is the register to designate
4340 (2) If we're at the end of a locking-shift sequence, REG is
4341 the register to invoke and HALF (0 == left, 1 == right) is
4342 the half to invoke it into.
4344 (3) If we're at the end of a single-shift sequence, REG is
4345 the register to invoke. */
4346 Lisp_Object cs = Qnil;
4349 /* NOTE: This code does goto's all over the fucking place.
4350 The reason for this is that we're basically implementing
4351 a state machine here, and hierarchical languages like C
4352 don't really provide a clean way of doing this. */
4354 if (!(*flags & CODING_STATE_ESCAPE))
4355 /* At beginning of escape sequence; we need to reset our
4356 escape-state variables. */
4357 iso->esc = ISO_ESC_NOTHING;
4359 iso->output_literally = 0;
4360 iso->output_direction_sequence = 0;
4363 case ISO_ESC_NOTHING:
4364 iso->esc_bytes_index = 0;
4366 case ISO_CODE_ESC: /* Start escape sequence */
4367 *flags |= CODING_STATE_ESCAPE;
4371 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4372 *flags |= CODING_STATE_ESCAPE;
4373 iso->esc = ISO_ESC_5_11;
4376 case ISO_CODE_SO: /* locking shift 1 */
4380 case ISO_CODE_SI: /* locking shift 0 */
4385 case ISO_CODE_SS2: /* single shift */
4388 case ISO_CODE_SS3: /* single shift */
4392 default: /* Other control characters */
4398 /**** single shift ****/
4400 case 'N': /* single shift 2 */
4403 case 'O': /* single shift 3 */
4407 /**** locking shift ****/
4409 case '~': /* locking shift 1 right */
4413 case 'n': /* locking shift 2 */
4417 case '}': /* locking shift 2 right */
4421 case 'o': /* locking shift 3 */
4425 case '|': /* locking shift 3 right */
4430 #ifdef ENABLE_COMPOSITE_CHARS
4431 /**** composite ****/
4434 iso->esc = ISO_ESC_START_COMPOSITE;
4435 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4436 CODING_STATE_COMPOSITE;
4440 iso->esc = ISO_ESC_END_COMPOSITE;
4441 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4442 ~CODING_STATE_COMPOSITE;
4444 #endif /* ENABLE_COMPOSITE_CHARS */
4446 /**** directionality ****/
4449 iso->esc = ISO_ESC_5_11;
4452 /**** designation ****/
4454 case '$': /* multibyte charset prefix */
4455 iso->esc = ISO_ESC_2_4;
4459 if (0x28 <= c && c <= 0x2F) {
4461 (enum iso_esc_flag)(c - 0x28 + ISO_ESC_2_8);
4465 /* This function is called with CODESYS equal to nil when
4466 doing coding-system detection. */
4468 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
4469 && fit_to_be_escape_quoted(c)) {
4470 iso->esc = ISO_ESC_LITERAL;
4471 *flags &= CODING_STATE_ISO2022_LOCK;
4479 /**** directionality ****/
4481 case ISO_ESC_5_11: /* ISO6429 direction control */
4484 (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4485 goto directionality;
4488 iso->esc = ISO_ESC_5_11_0;
4490 iso->esc = ISO_ESC_5_11_1;
4492 iso->esc = ISO_ESC_5_11_2;
4497 case ISO_ESC_5_11_0:
4500 (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4501 goto directionality;
4505 case ISO_ESC_5_11_1:
4508 (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4509 goto directionality;
4513 case ISO_ESC_5_11_2:
4516 (*flags & CODING_STATE_ISO2022_LOCK) |
4518 goto directionality;
4523 iso->esc = ISO_ESC_DIRECTIONALITY;
4524 /* Various junk here to attempt to preserve the direction
4525 sequences literally in the text if they would otherwise be
4526 swallowed due to invalid designations that don't show up as
4527 actual charset changes in the text. */
4528 if (iso->invalid_switch_dir) {
4529 /* We already inserted a direction switch literally into
4530 the text. We assume (#### this may not be right)
4531 that the next direction switch is the one going the
4532 other way, and we need to output that literally as
4534 iso->output_literally = 1;
4535 iso->invalid_switch_dir = 0;
4539 /* If we are in the thrall of an invalid designation,
4540 then stick the directionality sequence literally into
4541 the output stream so it ends up in the original text
4543 for (jj = 0; jj < 4; jj++)
4544 if (iso->invalid_designated[jj])
4547 iso->output_literally = 1;
4548 iso->invalid_switch_dir = 1;
4550 /* Indicate that we haven't yet seen a valid
4551 designation, so that if a switch-dir is
4552 directly followed by an invalid designation,
4553 both get inserted literally. */
4554 iso->switched_dir_and_no_valid_charset_yet = 1;
4558 /**** designation ****/
4561 if (0x28 <= c && c <= 0x2F) {
4563 (enum iso_esc_flag)(c - 0x28 + ISO_ESC_2_4_8);
4566 if (0x40 <= c && c <= 0x42) {
4567 cs = CHARSET_BY_ATTRIBUTES(CHARSET_TYPE_94X94, c,
4568 *flags & CODING_STATE_R2L ?
4569 CHARSET_RIGHT_TO_LEFT :
4570 CHARSET_LEFT_TO_RIGHT);
4587 case ISO_ESC_2_4_10:
4588 case ISO_ESC_2_4_11:
4589 case ISO_ESC_2_4_12:
4590 case ISO_ESC_2_4_13:
4591 case ISO_ESC_2_4_14:
4592 case ISO_ESC_2_4_15:
4593 case ISO_ESC_SINGLE_SHIFT:
4594 case ISO_ESC_LOCKING_SHIFT:
4595 case ISO_ESC_DESIGNATE:
4596 case ISO_ESC_DIRECTIONALITY:
4597 case ISO_ESC_LITERAL:
4602 if (c < '0' || c > '~')
4603 return 0; /* bad final byte */
4605 if (iso->esc >= ISO_ESC_2_8 && iso->esc <= ISO_ESC_2_15) {
4606 type = ((iso->esc >= ISO_ESC_2_12) ?
4607 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4608 reg = (iso->esc - ISO_ESC_2_8) & 3;
4609 } else if (iso->esc >= ISO_ESC_2_4_8 &&
4610 iso->esc <= ISO_ESC_2_4_15) {
4611 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4612 CHARSET_TYPE_96X96 :
4613 CHARSET_TYPE_94X94);
4614 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4616 /* Can this ever be reached? -slb */
4621 cs = CHARSET_BY_ATTRIBUTES(type, c,
4622 *flags & CODING_STATE_R2L ?
4623 CHARSET_RIGHT_TO_LEFT :
4624 CHARSET_LEFT_TO_RIGHT);
4630 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char)c;
4634 if (check_invalid_charsets && !CHARSETP(iso->charset[reg]))
4635 /* can't invoke something that ain't there. */
4637 iso->esc = ISO_ESC_SINGLE_SHIFT;
4638 *flags &= CODING_STATE_ISO2022_LOCK;
4640 *flags |= CODING_STATE_SS2;
4642 *flags |= CODING_STATE_SS3;
4646 if (check_invalid_charsets && !CHARSETP(iso->charset[reg]))
4647 /* can't invoke something that ain't there. */
4650 iso->register_right = reg;
4652 iso->register_left = reg;
4653 *flags &= CODING_STATE_ISO2022_LOCK;
4654 iso->esc = ISO_ESC_LOCKING_SHIFT;
4658 if (NILP(cs) && check_invalid_charsets) {
4659 iso->invalid_designated[reg] = 1;
4660 iso->charset[reg] = Vcharset_ascii;
4661 iso->esc = ISO_ESC_DESIGNATE;
4662 *flags &= CODING_STATE_ISO2022_LOCK;
4663 iso->output_literally = 1;
4664 if (iso->switched_dir_and_no_valid_charset_yet) {
4665 /* We encountered a switch-direction followed by an
4666 invalid designation. Ensure that the switch-direction
4667 gets outputted; otherwise it will probably get eaten
4668 when the text is written out again. */
4669 iso->switched_dir_and_no_valid_charset_yet = 0;
4670 iso->output_direction_sequence = 1;
4671 /* And make sure that the switch-dir going the other
4672 way gets outputted, as well. */
4673 iso->invalid_switch_dir = 1;
4677 /* This function is called with CODESYS equal to nil when
4678 doing coding-system detection. */
4679 if (!NILP(codesys)) {
4680 charset_conversion_spec_dynarr *dyn =
4681 XCODING_SYSTEM(codesys)->iso2022.input_conv;
4686 for (i = 0; i < Dynarr_length(dyn); i++) {
4687 struct charset_conversion_spec *spec =
4689 if (EQ(cs, spec->from_charset))
4690 cs = spec->to_charset;
4695 iso->charset[reg] = cs;
4696 iso->esc = ISO_ESC_DESIGNATE;
4697 *flags &= CODING_STATE_ISO2022_LOCK;
4698 if (iso->invalid_designated[reg]) {
4699 iso->invalid_designated[reg] = 0;
4700 iso->output_literally = 1;
4702 if (iso->switched_dir_and_no_valid_charset_yet)
4703 iso->switched_dir_and_no_valid_charset_yet = 0;
4708 detect_coding_iso2022(struct detection_state *st, const Extbyte * src,
4709 Lstream_data_count n)
4713 /* #### There are serious deficiencies in the recognition mechanism
4714 here. This needs to be much smarter if it's going to cut it.
4715 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4716 it should be detected as Latin-1.
4717 All the ISO2022 stuff in this file should be synced up with the
4718 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4719 Perhaps we should wait till R2L works in FSF Emacs? */
4721 if (!st->iso2022.initted) {
4722 reset_iso2022(Qnil, &st->iso2022.iso);
4723 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4724 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4725 CODING_CATEGORY_ISO_8_1_MASK |
4726 CODING_CATEGORY_ISO_8_2_MASK |
4727 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4728 st->iso2022.flags = 0;
4729 st->iso2022.high_byte_count = 0;
4730 st->iso2022.saw_single_shift = 0;
4731 st->iso2022.initted = 1;
4734 mask = st->iso2022.mask;
4737 const unsigned char c = *(const unsigned char *)src++;
4739 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4740 st->iso2022.high_byte_count++;
4742 if (st->iso2022.high_byte_count
4743 && !st->iso2022.saw_single_shift) {
4744 if (st->iso2022.high_byte_count & 1)
4745 /* odd number of high bytes; assume not iso-8-2 */
4746 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4748 st->iso2022.high_byte_count = 0;
4749 st->iso2022.saw_single_shift = 0;
4751 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4753 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4754 && (BYTE_C0_P(c) || BYTE_C1_P(c))) { /* control chars */
4756 /* Allow and ignore control characters that you might
4757 reasonably see in a text file */
4762 case 8: /* backspace */
4763 case 11: /* vertical tab */
4764 case 12: /* form feed */
4765 case 26: /* MS-DOS C-z junk */
4766 case 31: /* '^_' -- for info */
4767 goto label_continue_loop;
4774 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P(c)
4776 if (parse_iso2022_esc(Qnil, &st->iso2022.iso, c,
4777 &st->iso2022.flags, 0)) {
4778 switch (st->iso2022.iso.esc) {
4779 case ISO_ESC_DESIGNATE:
4780 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4781 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4783 case ISO_ESC_LOCKING_SHIFT:
4784 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4785 goto ran_out_of_chars;
4786 case ISO_ESC_SINGLE_SHIFT:
4787 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4788 st->iso2022.saw_single_shift = 1;
4792 case ISO_ESC_NOTHING:
4805 case ISO_ESC_2_4_10:
4806 case ISO_ESC_2_4_11:
4807 case ISO_ESC_2_4_12:
4808 case ISO_ESC_2_4_13:
4809 case ISO_ESC_2_4_14:
4810 case ISO_ESC_2_4_15:
4812 case ISO_ESC_5_11_0:
4813 case ISO_ESC_5_11_1:
4814 case ISO_ESC_5_11_2:
4815 case ISO_ESC_DIRECTIONALITY:
4816 case ISO_ESC_LITERAL:
4822 goto ran_out_of_chars;
4825 label_continue_loop:;
4832 static int postprocess_iso2022_mask(int mask)
4834 /* #### kind of cheesy */
4835 /* If seven-bit ISO is allowed, then assume that the encoding is
4836 entirely seven-bit and turn off the eight-bit ones. */
4837 if (mask & CODING_CATEGORY_ISO_7_MASK)
4838 mask &= ~(CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4839 CODING_CATEGORY_ISO_8_1_MASK |
4840 CODING_CATEGORY_ISO_8_2_MASK);
4844 /* If FLAGS is a null pointer or specifies right-to-left motion,
4845 output a switch-dir-to-left-to-right sequence to DST.
4846 Also update FLAGS if it is not a null pointer.
4847 If INTERNAL_P is set, we are outputting in internal format and
4848 need to handle the CSI differently. */
4851 restore_left_to_right_direction(Lisp_Coding_System * codesys,
4852 unsigned_char_dynarr * dst,
4853 unsigned int *flags, int internal_p)
4855 if (!flags || (*flags & CODING_STATE_R2L)) {
4856 if (CODING_SYSTEM_ISO2022_SEVEN(codesys)) {
4857 Dynarr_add(dst, ISO_CODE_ESC);
4858 Dynarr_add(dst, '[');
4859 } else if (internal_p)
4860 DECODE_ADD_BINARY_CHAR(ISO_CODE_CSI, dst);
4862 Dynarr_add(dst, ISO_CODE_CSI);
4863 Dynarr_add(dst, '0');
4864 Dynarr_add(dst, ']');
4866 *flags &= ~CODING_STATE_R2L;
4870 /* If FLAGS is a null pointer or specifies a direction different from
4871 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4872 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4873 sequence to DST. Also update FLAGS if it is not a null pointer.
4874 If INTERNAL_P is set, we are outputting in internal format and
4875 need to handle the CSI differently. */
4878 ensure_correct_direction(int direction, Lisp_Coding_System * codesys,
4879 unsigned_char_dynarr * dst, unsigned int *flags,
4882 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4883 direction == CHARSET_LEFT_TO_RIGHT)
4884 restore_left_to_right_direction(codesys, dst, flags,
4886 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429(codesys)
4887 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4888 direction == CHARSET_RIGHT_TO_LEFT) {
4889 if (CODING_SYSTEM_ISO2022_SEVEN(codesys)) {
4890 Dynarr_add(dst, ISO_CODE_ESC);
4891 Dynarr_add(dst, '[');
4892 } else if (internal_p)
4893 DECODE_ADD_BINARY_CHAR(ISO_CODE_CSI, dst);
4895 Dynarr_add(dst, ISO_CODE_CSI);
4896 Dynarr_add(dst, '2');
4897 Dynarr_add(dst, ']');
4899 *flags |= CODING_STATE_R2L;
4903 /* Convert ISO2022-format data to internal format. */
4906 decode_coding_iso2022(lstream_t decoding, const Extbyte * src,
4907 unsigned_char_dynarr * dst, Lstream_data_count n)
4909 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
4910 unsigned int flags = str->flags;
4911 unsigned int ch = str->ch;
4912 eol_type_t eol_type = str->eol_type;
4913 #ifdef ENABLE_COMPOSITE_CHARS
4914 unsigned_char_dynarr *real_dst = dst;
4916 Lisp_Object coding_system;
4918 XSETCODING_SYSTEM(coding_system, str->codesys);
4920 #ifdef ENABLE_COMPOSITE_CHARS
4921 if (flags & CODING_STATE_COMPOSITE)
4922 dst = str->iso2022.composite_chars;
4923 #endif /* ENABLE_COMPOSITE_CHARS */
4926 const unsigned char c = *(const unsigned char *)src++;
4927 if (flags & CODING_STATE_ESCAPE) {
4928 /* Within ESC sequence */
4929 int retval = parse_iso2022_esc(
4930 coding_system, &str->iso2022, c, &flags, 1);
4933 switch (str->iso2022.esc) {
4934 #ifdef ENABLE_COMPOSITE_CHARS
4935 case ISO_ESC_START_COMPOSITE:
4936 if (str->iso2022.composite_chars)
4937 Dynarr_reset(str->iso2022.
4940 str->iso2022.composite_chars =
4941 Dynarr_new(unsigned_char);
4942 dst = str->iso2022.composite_chars;
4944 case ISO_ESC_END_COMPOSITE:
4946 Bufbyte comstr[MAX_EMCHAR_LEN];
4949 lookup_composite_char
4950 (Dynarr_atp(dst, 0),
4951 Dynarr_length(dst));
4954 set_charptr_emchar(comstr,
4956 Dynarr_add_many(dst, comstr,
4960 #endif /* ENABLE_COMPOSITE_CHARS */
4962 case ISO_ESC_LITERAL:
4963 DECODE_ADD_BINARY_CHAR(c, dst);
4966 case ISO_ESC_NOTHING:
4979 case ISO_ESC_2_4_10:
4980 case ISO_ESC_2_4_11:
4981 case ISO_ESC_2_4_12:
4982 case ISO_ESC_2_4_13:
4983 case ISO_ESC_2_4_14:
4984 case ISO_ESC_2_4_15:
4986 case ISO_ESC_5_11_0:
4987 case ISO_ESC_5_11_1:
4988 case ISO_ESC_5_11_2:
4989 case ISO_ESC_SINGLE_SHIFT:
4990 case ISO_ESC_LOCKING_SHIFT:
4991 case ISO_ESC_DESIGNATE:
4992 case ISO_ESC_DIRECTIONALITY:
4995 /* Everything else handled already */
5000 /* Attempted error recovery. */
5001 if (str->iso2022.output_direction_sequence)
5002 ensure_correct_direction(flags &
5004 CHARSET_RIGHT_TO_LEFT :
5005 CHARSET_LEFT_TO_RIGHT,
5006 str->codesys, dst, 0,
5008 /* More error recovery. */
5009 if (!retval || str->iso2022.output_literally) {
5010 /* Output the (possibly invalid) sequence */
5012 for (i = 0; i < str->iso2022.esc_bytes_index;
5014 DECODE_ADD_BINARY_CHAR(str->iso2022.
5017 flags &= CODING_STATE_ISO2022_LOCK;
5019 n++, src--; /* Repeat the loop with the same character. */
5021 /* No sense in reprocessing the final byte of the
5022 escape sequence; it could mess things up anyway.
5024 DECODE_ADD_BINARY_CHAR(c, dst);
5028 } else if (BYTE_C0_P(c) || BYTE_C1_P(c)) { /* Control characters */
5030 /***** Error-handling *****/
5032 /* If we were in the middle of a character, dump out the
5033 partial character. */
5034 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5036 /* If we just saw a single-shift character, dump it out.
5037 This may dump out the wrong sort of single-shift character,
5038 but least it will give an indication that something went
5040 if (flags & CODING_STATE_SS2) {
5041 DECODE_ADD_BINARY_CHAR(ISO_CODE_SS2, dst);
5042 flags &= ~CODING_STATE_SS2;
5044 if (flags & CODING_STATE_SS3) {
5045 DECODE_ADD_BINARY_CHAR(ISO_CODE_SS3, dst);
5046 flags &= ~CODING_STATE_SS3;
5049 /***** Now handle the control characters. *****/
5052 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5054 flags &= CODING_STATE_ISO2022_LOCK;
5056 if (!parse_iso2022_esc
5057 (coding_system, &str->iso2022, c, &flags, 1))
5058 DECODE_ADD_BINARY_CHAR(c, dst);
5059 } else { /* Graphic characters */
5060 Lisp_Object charset;
5064 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5066 /* Now determine the charset. */
5067 reg = ((flags & CODING_STATE_SS2) ? 2
5068 : (flags & CODING_STATE_SS3) ? 3
5069 : !BYTE_ASCII_P(c) ? str->iso2022.register_right
5070 : str->iso2022.register_left);
5071 charset = str->iso2022.charset[reg];
5073 /* Error checking: */
5074 if (!CHARSETP(charset)
5075 || str->iso2022.invalid_designated[reg]
5077 (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5078 && XCHARSET_CHARS(charset) == 94))
5079 /* Mrmph. We are trying to invoke a register that has no
5080 or an invalid charset in it, or trying to add a character
5081 outside the range of the charset. Insert that char literally
5082 to preserve it for the output. */
5084 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5085 DECODE_ADD_BINARY_CHAR(c, dst);
5089 /* Things are probably hunky-dorey. */
5091 /* Fetch reverse charset, maybe. */
5092 if (((flags & CODING_STATE_R2L) &&
5093 XCHARSET_DIRECTION(charset) ==
5094 CHARSET_LEFT_TO_RIGHT)
5095 || (!(flags & CODING_STATE_R2L)
5096 && XCHARSET_DIRECTION(charset) ==
5097 CHARSET_RIGHT_TO_LEFT)) {
5098 Lisp_Object new_charset =
5099 XCHARSET_REVERSE_DIRECTION_CHARSET
5101 if (!NILP(new_charset))
5102 charset = new_charset;
5105 lb = XCHARSET_LEADING_BYTE(charset);
5106 switch (XCHARSET_REP_BYTES(charset)) {
5108 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5109 Dynarr_add(dst, c & 0x7F);
5112 case 2: /* one-byte official */
5113 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5114 Dynarr_add(dst, lb);
5115 Dynarr_add(dst, c | 0x80);
5118 case 3: /* one-byte private or two-byte official */
5119 if (XCHARSET_PRIVATE_P(charset)) {
5120 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5122 PRE_LEADING_BYTE_PRIVATE_1);
5123 Dynarr_add(dst, lb);
5124 Dynarr_add(dst, c | 0x80);
5127 Dynarr_add(dst, lb);
5138 default: /* two-byte private */
5141 PRE_LEADING_BYTE_PRIVATE_2);
5142 Dynarr_add(dst, lb);
5143 Dynarr_add(dst, ch | 0x80);
5144 Dynarr_add(dst, c | 0x80);
5152 flags &= CODING_STATE_ISO2022_LOCK;
5155 label_continue_loop:;
5158 if (flags & CODING_STATE_END)
5159 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5165 /***** ISO2022 encoder *****/
5167 /* Designate CHARSET into register REG. */
5170 iso2022_designate(Lisp_Object charset, unsigned char reg,
5171 encoding_stream_t str, unsigned_char_dynarr * dst)
5173 static const char inter94[] = "()*+";
5174 static const char inter96[] = ",-./";
5176 unsigned char final;
5177 Lisp_Object old_charset = str->iso2022.charset[reg];
5179 str->iso2022.charset[reg] = charset;
5180 if (!CHARSETP(charset))
5181 /* charset might be an initial nil or t. */
5183 type = XCHARSET_TYPE(charset);
5184 final = XCHARSET_FINAL(charset);
5185 if (!str->iso2022.force_charset_on_output[reg] &&
5186 CHARSETP(old_charset) &&
5187 XCHARSET_TYPE(old_charset) == type &&
5188 XCHARSET_FINAL(old_charset) == final)
5191 str->iso2022.force_charset_on_output[reg] = 0;
5194 charset_conversion_spec_dynarr *dyn =
5195 str->codesys->iso2022.output_conv;
5200 for (i = 0; i < Dynarr_length(dyn); i++) {
5201 struct charset_conversion_spec *spec =
5203 if (EQ(charset, spec->from_charset))
5204 charset = spec->to_charset;
5209 Dynarr_add(dst, ISO_CODE_ESC);
5211 case CHARSET_TYPE_94:
5212 Dynarr_add(dst, inter94[reg]);
5214 case CHARSET_TYPE_96:
5215 Dynarr_add(dst, inter96[reg]);
5217 case CHARSET_TYPE_94X94:
5218 Dynarr_add(dst, '$');
5219 if (reg != 0 || !(CODING_SYSTEM_ISO2022_SHORT(str->codesys))
5220 || final < '@' || final > 'B')
5221 Dynarr_add(dst, inter94[reg]);
5223 case CHARSET_TYPE_96X96:
5224 Dynarr_add(dst, '$');
5225 Dynarr_add(dst, inter96[reg]);
5230 Dynarr_add(dst, final);
5234 ensure_normal_shift(encoding_stream_t str, unsigned_char_dynarr * dst)
5236 if (str->iso2022.register_left != 0) {
5237 Dynarr_add(dst, ISO_CODE_SI);
5238 str->iso2022.register_left = 0;
5243 ensure_shift_out(encoding_stream_t str, unsigned_char_dynarr * dst)
5245 if (str->iso2022.register_left != 1) {
5246 Dynarr_add(dst, ISO_CODE_SO);
5247 str->iso2022.register_left = 1;
5251 /* Convert internally-formatted data to ISO2022 format. */
5254 encode_coding_iso2022(lstream_t encoding, const Bufbyte * src,
5255 unsigned_char_dynarr * dst, Lstream_data_count n)
5257 unsigned char charmask, c;
5258 unsigned char char_boundary;
5259 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
5260 unsigned int flags = str->flags;
5261 unsigned int ch = str->ch;
5262 Lisp_Coding_System *codesys = str->codesys;
5263 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
5265 Lisp_Object charset;
5268 #ifdef ENABLE_COMPOSITE_CHARS
5269 /* flags for handling composite chars. We do a little switcharoo
5270 on the source while we're outputting the composite char. */
5271 unsigned int saved_n = 0;
5272 const unsigned char *saved_src = NULL;
5273 int in_composite = 0;
5274 #endif /* ENABLE_COMPOSITE_CHARS */
5276 char_boundary = str->iso2022.current_char_boundary;
5277 charset = str->iso2022.current_charset;
5278 half = str->iso2022.current_half;
5280 #ifdef ENABLE_COMPOSITE_CHARS
5286 if (BYTE_ASCII_P(c)) { /* Processing ASCII character */
5289 restore_left_to_right_direction(codesys, dst, &flags,
5292 /* Make sure G0 contains ASCII */
5293 if ((c > ' ' && c < ISO_CODE_DEL) ||
5294 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL(codesys)) {
5295 ensure_normal_shift(str, dst);
5296 iso2022_designate(Vcharset_ascii, 0, str, dst);
5299 /* If necessary, restore everything to the default state
5302 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL(codesys))) {
5303 restore_left_to_right_direction(codesys, dst,
5306 ensure_normal_shift(str, dst);
5308 for (i = 0; i < 4; i++) {
5309 Lisp_Object initial_charset =
5310 CODING_SYSTEM_ISO2022_INITIAL_CHARSET
5312 iso2022_designate(initial_charset, i,
5317 if (eol_type != EOL_LF
5318 && eol_type != EOL_AUTODETECT)
5319 Dynarr_add(dst, '\r');
5320 if (eol_type != EOL_CR)
5323 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
5324 && fit_to_be_escape_quoted(c))
5325 Dynarr_add(dst, ISO_CODE_ESC);
5331 else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) { /* Processing Leading Byte */
5333 charset = CHARSET_BY_LEADING_BYTE(c);
5334 if (LEADING_BYTE_PREFIX_P(c))
5336 else if (!EQ(charset, Vcharset_control_1)
5337 #ifdef ENABLE_COMPOSITE_CHARS
5338 && !EQ(charset, Vcharset_composite)
5343 ensure_correct_direction(XCHARSET_DIRECTION
5347 /* Now determine which register to use. */
5349 for (i = 0; i < 4; i++) {
5350 if (EQ(charset, str->iso2022.charset[i])
5352 CODING_SYSTEM_ISO2022_INITIAL_CHARSET
5360 if (XCHARSET_GRAPHIC(charset) != 0) {
5362 (str->iso2022.charset[1])
5364 (!CODING_SYSTEM_ISO2022_SEVEN
5367 CODING_SYSTEM_ISO2022_LOCK_SHIFT
5384 iso2022_designate(charset, reg, str, dst);
5386 /* Now invoke that register. */
5389 ensure_normal_shift(str, dst);
5394 if (CODING_SYSTEM_ISO2022_SEVEN
5396 ensure_shift_out(str, dst);
5403 if (CODING_SYSTEM_ISO2022_SEVEN
5405 Dynarr_add(dst, ISO_CODE_ESC);
5406 Dynarr_add(dst, 'N');
5409 Dynarr_add(dst, ISO_CODE_SS2);
5415 if (CODING_SYSTEM_ISO2022_SEVEN
5417 Dynarr_add(dst, ISO_CODE_ESC);
5418 Dynarr_add(dst, 'O');
5421 Dynarr_add(dst, ISO_CODE_SS3);
5431 } else { /* Processing Non-ASCII character */
5432 charmask = (half == 0 ? 0x7F : 0xFF);
5434 if (EQ(charset, Vcharset_control_1)) {
5435 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
5436 && fit_to_be_escape_quoted(c))
5437 Dynarr_add(dst, ISO_CODE_ESC);
5438 /* you asked for it ... */
5439 Dynarr_add(dst, c - 0x20);
5441 switch (XCHARSET_REP_BYTES(charset)) {
5443 Dynarr_add(dst, c & charmask);
5446 if (XCHARSET_PRIVATE_P(charset)) {
5447 Dynarr_add(dst, c & charmask);
5450 #ifdef ENABLE_COMPOSITE_CHARS
5453 Vcharset_composite)) {
5455 /* #### Bother! We don't know how to
5462 (Vcharset_composite,
5467 composite_char_string
5476 n = XSTRING_LENGTH(lstr);
5479 Dynarr_add(dst, '0'); /* start composing */
5482 #endif /* ENABLE_COMPOSITE_CHARS */
5499 Dynarr_add(dst, ch & charmask);
5500 Dynarr_add(dst, c & charmask);
5514 #ifdef ENABLE_COMPOSITE_CHARS
5519 Dynarr_add(dst, ISO_CODE_ESC);
5520 Dynarr_add(dst, '1'); /* end composing */
5521 goto back_to_square_n; /* Wheeeeeeeee ..... */
5523 #endif /* ENABLE_COMPOSITE_CHARS */
5525 if (char_boundary && flags & CODING_STATE_END) {
5526 restore_left_to_right_direction(codesys, dst, &flags, 0);
5527 ensure_normal_shift(str, dst);
5528 for (i = 0; i < 4; i++) {
5529 Lisp_Object initial_charset =
5530 CODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, i);
5531 iso2022_designate(initial_charset, i, str, dst);
5537 str->iso2022.current_char_boundary = char_boundary;
5538 str->iso2022.current_charset = charset;
5539 str->iso2022.current_half = half;
5541 /* Verbum caro factum est! */
5545 /************************************************************************/
5546 /* No-conversion methods */
5547 /************************************************************************/
5549 /* This is used when reading in "binary" files -- i.e. files that may
5550 contain all 256 possible byte values and that are not to be
5551 interpreted as being in any particular decoding. */
5553 decode_coding_no_conversion(lstream_t decoding, const Extbyte * src,
5554 unsigned_char_dynarr * dst, Lstream_data_count n)
5556 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
5557 unsigned int flags = str->flags;
5558 unsigned int ch = str->ch;
5559 eol_type_t eol_type = str->eol_type;
5562 const unsigned char c = *(const unsigned char *)src++;
5564 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5565 DECODE_ADD_BINARY_CHAR(c, dst);
5566 label_continue_loop:;
5569 DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
5576 encode_coding_no_conversion(lstream_t encoding, const Bufbyte * src,
5577 unsigned_char_dynarr * dst, Lstream_data_count n)
5580 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
5581 unsigned int flags = str->flags;
5582 unsigned int ch = str->ch;
5583 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
5588 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5589 Dynarr_add(dst, '\r');
5590 if (eol_type != EOL_CR)
5591 Dynarr_add(dst, '\n');
5593 } else if (BYTE_ASCII_P(c)) {
5596 } else if (BUFBYTE_LEADING_BYTE_P(c)) {
5598 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5599 c == LEADING_BYTE_CONTROL_1)
5602 Dynarr_add(dst, '~'); /* untranslatable character */
5604 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5606 else if (ch == LEADING_BYTE_CONTROL_1) {
5608 Dynarr_add(dst, c - 0x20);
5610 /* else it should be the second or third byte of an
5611 untranslatable character, so ignore it */
5620 /************************************************************************/
5621 /* Initialization */
5622 /************************************************************************/
5624 void syms_of_file_coding(void)
5626 INIT_LRECORD_IMPLEMENTATION(coding_system);
5628 DEFERROR_STANDARD(Qcoding_system_error, Qio_error);
5630 DEFSUBR(Fcoding_system_p);
5631 DEFSUBR(Ffind_coding_system);
5632 DEFSUBR(Fget_coding_system);
5633 DEFSUBR(Fcoding_system_list);
5634 DEFSUBR(Fcoding_system_name);
5635 DEFSUBR(Fmake_coding_system);
5636 DEFSUBR(Fcopy_coding_system);
5637 DEFSUBR(Fcoding_system_canonical_name_p);
5638 DEFSUBR(Fcoding_system_alias_p);
5639 DEFSUBR(Fcoding_system_aliasee);
5640 DEFSUBR(Fdefine_coding_system_alias);
5641 DEFSUBR(Fsubsidiary_coding_system);
5643 DEFSUBR(Fcoding_system_type);
5644 DEFSUBR(Fcoding_system_doc_string);
5646 DEFSUBR(Fcoding_system_charset);
5648 DEFSUBR(Fcoding_system_property);
5650 DEFSUBR(Fcoding_category_list);
5651 DEFSUBR(Fset_coding_priority_list);
5652 DEFSUBR(Fcoding_priority_list);
5653 DEFSUBR(Fset_coding_category_system);
5654 DEFSUBR(Fcoding_category_system);
5656 DEFSUBR(Fdetect_coding_region);
5657 DEFSUBR(Fdecode_coding_region);
5658 DEFSUBR(Fencode_coding_region);
5660 DEFSUBR(Fdecode_shift_jis_char);
5661 DEFSUBR(Fencode_shift_jis_char);
5662 DEFSUBR(Fdecode_big5_char);
5663 DEFSUBR(Fencode_big5_char);
5664 DEFSUBR(Fset_ucs_char);
5666 DEFSUBR(Fset_char_ucs);
5669 defsymbol(&Qcoding_systemp, "coding-system-p");
5670 defsymbol(&Qno_conversion, "no-conversion");
5671 defsymbol(&Qraw_text, "raw-text");
5673 defsymbol(&Qbig5, "big5");
5674 defsymbol(&Qshift_jis, "shift-jis");
5675 defsymbol(&Qucs4, "ucs-4");
5676 defsymbol(&Qutf8, "utf-8");
5677 defsymbol(&Qccl, "ccl");
5678 defsymbol(&Qiso2022, "iso2022");
5680 defsymbol(&Qmnemonic, "mnemonic");
5681 defsymbol(&Qeol_type, "eol-type");
5682 defsymbol(&Qpost_read_conversion, "post-read-conversion");
5683 defsymbol(&Qpre_write_conversion, "pre-write-conversion");
5685 defsymbol(&Qcr, "cr");
5686 defsymbol(&Qlf, "lf");
5687 defsymbol(&Qcrlf, "crlf");
5688 defsymbol(&Qeol_cr, "eol-cr");
5689 defsymbol(&Qeol_lf, "eol-lf");
5690 defsymbol(&Qeol_crlf, "eol-crlf");
5692 defsymbol(&Qcharset_g0, "charset-g0");
5693 defsymbol(&Qcharset_g1, "charset-g1");
5694 defsymbol(&Qcharset_g2, "charset-g2");
5695 defsymbol(&Qcharset_g3, "charset-g3");
5696 defsymbol(&Qforce_g0_on_output, "force-g0-on-output");
5697 defsymbol(&Qforce_g1_on_output, "force-g1-on-output");
5698 defsymbol(&Qforce_g2_on_output, "force-g2-on-output");
5699 defsymbol(&Qforce_g3_on_output, "force-g3-on-output");
5700 defsymbol(&Qno_iso6429, "no-iso6429");
5701 defsymbol(&Qinput_charset_conversion, "input-charset-conversion");
5702 defsymbol(&Qoutput_charset_conversion, "output-charset-conversion");
5704 defsymbol(&Qno_ascii_eol, "no-ascii-eol");
5705 defsymbol(&Qno_ascii_cntl, "no-ascii-cntl");
5706 defsymbol(&Qseven, "seven");
5707 defsymbol(&Qlock_shift, "lock-shift");
5708 defsymbol(&Qescape_quoted, "escape-quoted");
5710 defsymbol(&Qencode, "encode");
5711 defsymbol(&Qdecode, "decode");
5714 defsymbol(&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5716 defsymbol(&coding_category_symbol[CODING_CATEGORY_BIG5], "big5");
5717 defsymbol(&coding_category_symbol[CODING_CATEGORY_UCS4], "ucs-4");
5718 defsymbol(&coding_category_symbol[CODING_CATEGORY_UTF8], "utf-8");
5719 defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_7], "iso-7");
5720 defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5722 defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_1], "iso-8-1");
5723 defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_2], "iso-8-2");
5724 defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5727 defsymbol(&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5731 void lstream_type_create_file_coding(void)
5733 LSTREAM_HAS_METHOD(decoding, reader);
5734 LSTREAM_HAS_METHOD(decoding, writer);
5735 LSTREAM_HAS_METHOD(decoding, rewinder);
5736 LSTREAM_HAS_METHOD(decoding, seekable_p);
5737 LSTREAM_HAS_METHOD(decoding, flusher);
5738 LSTREAM_HAS_METHOD(decoding, closer);
5739 LSTREAM_HAS_METHOD(decoding, marker);
5741 LSTREAM_HAS_METHOD(encoding, reader);
5742 LSTREAM_HAS_METHOD(encoding, writer);
5743 LSTREAM_HAS_METHOD(encoding, rewinder);
5744 LSTREAM_HAS_METHOD(encoding, seekable_p);
5745 LSTREAM_HAS_METHOD(encoding, flusher);
5746 LSTREAM_HAS_METHOD(encoding, closer);
5747 LSTREAM_HAS_METHOD(encoding, marker);
5750 void vars_of_file_coding(void)
5754 fcd = xnew(struct file_coding_dump);
5755 dump_add_root_struct_ptr(&fcd, &fcd_description);
5757 /* Initialize to something reasonable ... */
5758 for (i = 0; i < CODING_CATEGORY_LAST; i++) {
5759 fcd->coding_category_system[i] = Qnil;
5760 fcd->coding_category_by_priority[i] = i;
5763 Fprovide(intern("file-coding"));
5765 DEFVAR_LISP("keyboard-coding-system", &Vkeyboard_coding_system /*
5766 Coding system used for TTY keyboard input.
5767 Not used under a windowing system.
5769 Vkeyboard_coding_system = Qnil;
5771 DEFVAR_LISP("terminal-coding-system", &Vterminal_coding_system /*
5772 Coding system used for TTY display output.
5773 Not used under a windowing system.
5775 Vterminal_coding_system = Qnil;
5777 DEFVAR_LISP("coding-system-for-read", &Vcoding_system_for_read /*
5778 Overriding coding system used when reading from a file or process.
5779 You should bind this variable with `let', but do not set it globally.
5780 If this is non-nil, it specifies the coding system that will be used
5781 to decode input on read operations, such as from a file or process.
5782 It overrides `buffer-file-coding-system-for-read',
5783 `insert-file-contents-pre-hook', etc. Use those variables instead of
5784 this one for permanent changes to the environment. */ );
5785 Vcoding_system_for_read = Qnil;
5787 DEFVAR_LISP("coding-system-for-write", &Vcoding_system_for_write /*
5788 Overriding coding system used when writing to a file or process.
5789 You should bind this variable with `let', but do not set it globally.
5790 If this is non-nil, it specifies the coding system that will be used
5791 to encode output for write operations, such as to a file or process.
5792 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
5793 Use those variables instead of this one for permanent changes to the
5795 Vcoding_system_for_write = Qnil;
5797 DEFVAR_LISP("file-name-coding-system", &Vfile_name_coding_system /*
5798 Coding system used to convert pathnames when accessing files.
5800 Vfile_name_coding_system = Qnil;
5802 DEFVAR_BOOL("enable-multibyte-characters", &enable_multibyte_characters /*
5803 Non-nil means the buffer contents are regarded as multi-byte form
5804 of characters, not a binary code. This affects the display, file I/O,
5805 and behaviors of various editing commands.
5807 Setting this to nil does not do anything.
5809 enable_multibyte_characters = 1;
5812 void complex_vars_of_file_coding(void)
5814 staticpro(&Vcoding_system_hash_table);
5815 Vcoding_system_hash_table =
5816 make_lisp_hash_table(50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5818 the_codesys_prop_dynarr = Dynarr_new(codesys_prop);
5819 dump_add_root_struct_ptr(&the_codesys_prop_dynarr,
5820 &codesys_prop_dynarr_description);
5822 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5824 struct codesys_prop csp; \
5826 csp.prop_type = (Prop_Type); \
5827 Dynarr_add (the_codesys_prop_dynarr, csp); \
5830 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qmnemonic);
5831 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_type);
5832 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_cr);
5833 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_crlf);
5834 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_lf);
5835 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5836 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5838 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g0);
5839 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g1);
5840 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g2);
5841 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g3);
5842 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5843 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5844 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5845 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5846 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qshort);
5847 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_ascii_eol);
5848 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5849 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qseven);
5850 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qlock_shift);
5851 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_iso6429);
5852 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qescape_quoted);
5853 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5854 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5856 DEFINE_CODESYS_PROP(CODESYS_PROP_CCL, Qencode);
5857 DEFINE_CODESYS_PROP(CODESYS_PROP_CCL, Qdecode);
5859 /* Need to create this here or we're really screwed. */
5861 (Qraw_text, Qno_conversion,
5863 ("Raw text, which means it converts only line-break-codes."),
5864 list2(Qmnemonic, build_string("Raw")));
5867 (Qbinary, Qno_conversion,
5868 build_string("Binary, which means it does not convert anything."),
5869 list4(Qeol_type, Qlf, Qmnemonic, build_string("Binary")));
5871 Fdefine_coding_system_alias(Qno_conversion, Qraw_text);
5873 Fdefine_coding_system_alias(Qfile_name, Qbinary);
5875 Fdefine_coding_system_alias(Qterminal, Qbinary);
5876 Fdefine_coding_system_alias(Qkeyboard, Qbinary);
5878 /* Need this for bootstrapping */
5879 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5880 Fget_coding_system(Qraw_text);
5886 for (i = 0; i < countof(fcd->ucs_to_mule_table); i++)
5887 fcd->ucs_to_mule_table[i] = Qnil;
5889 staticpro(&mule_to_ucs_table);
5890 mule_to_ucs_table = Fmake_char_table(Qgeneric);