1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of SXEmacs
7 SXEmacs is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
12 SXEmacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
21 /* Synched up with: Mule 2.3. Not in FSF. */
23 /* Rewritten by Ben Wing <ben@xemacs.org>. */
30 #include "ui/insdel.h"
37 #include "file-coding.h"
39 Lisp_Object Qcoding_system_error;
41 Lisp_Object Vkeyboard_coding_system;
42 Lisp_Object Vterminal_coding_system;
43 Lisp_Object Vcoding_system_for_read;
44 Lisp_Object Vcoding_system_for_write;
45 Lisp_Object Vfile_name_coding_system;
47 /* Table of symbols identifying each coding category. */
48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
50 struct file_coding_dump {
51 /* Coding system currently associated with each coding category. */
52 Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
54 /* Table of all coding categories in decreasing order of priority.
55 This describes a permutation of the possible coding categories. */
56 int coding_category_by_priority[CODING_CATEGORY_LAST];
59 Lisp_Object ucs_to_mule_table[65536];
63 static const struct lrecord_description fcd_description_1[] = {
64 {XD_LISP_OBJECT_ARRAY,
65 offsetof(struct file_coding_dump, coding_category_system),
66 CODING_CATEGORY_LAST},
68 {XD_LISP_OBJECT_ARRAY,
69 offsetof(struct file_coding_dump, ucs_to_mule_table),
70 countof(fcd->ucs_to_mule_table)},
75 static const struct struct_description fcd_description = {
76 sizeof(struct file_coding_dump),
80 Lisp_Object mule_to_ucs_table;
82 Lisp_Object Qcoding_systemp;
84 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
85 /* Qinternal in general.c */
87 Lisp_Object Qmnemonic, Qeol_type;
88 Lisp_Object Qcr, Qcrlf, Qlf;
89 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
90 Lisp_Object Qpost_read_conversion;
91 Lisp_Object Qpre_write_conversion;
94 Lisp_Object Qucs4, Qutf8;
95 Lisp_Object Qbig5, Qshift_jis;
96 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
97 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
98 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
99 Lisp_Object Qno_iso6429;
100 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
101 Lisp_Object Qescape_quoted;
102 Lisp_Object Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
104 Lisp_Object Qencode, Qdecode;
106 Lisp_Object Vcoding_system_hash_table;
108 int enable_multibyte_characters;
111 /* Additional information used by the ISO2022 decoder and detector. */
112 struct iso2022_decoder {
113 /* CHARSET holds the character sets currently assigned to the G0
114 through G3 variables. It is initialized from the array
115 INITIAL_CHARSET in CODESYS. */
116 Lisp_Object charset[4];
118 /* Which registers are currently invoked into the left (GL) and
119 right (GR) halves of the 8-bit encoding space? */
120 int register_left, register_right;
122 /* ISO_ESC holds a value indicating part of an escape sequence
123 that has already been seen. */
124 enum iso_esc_flag esc;
126 /* This records the bytes we've seen so far in an escape sequence,
127 in case the sequence is invalid (we spit out the bytes unchanged). */
128 unsigned char esc_bytes[8];
130 /* Index for next byte to store in ISO escape sequence. */
133 #ifdef ENABLE_COMPOSITE_CHARS
134 /* Stuff seen so far when composing a string. */
135 unsigned_char_dynarr *composite_chars;
138 /* If we saw an invalid designation sequence for a particular
139 register, we flag it here and switch to ASCII. The next time we
140 see a valid designation for this register, we turn off the flag
141 and do the designation normally, but pretend the sequence was
142 invalid. The effect of all this is that (most of the time) the
143 escape sequences for both the switch to the unknown charset, and
144 the switch back to the known charset, get inserted literally into
145 the buffer and saved out as such. The hope is that we can
146 preserve the escape sequences so that the resulting written out
147 file makes sense. If we don't do any of this, the designation
148 to the invalid charset will be preserved but that switch back
149 to the known charset will probably get eaten because it was
150 the same charset that was already present in the register. */
151 unsigned char invalid_designated[4];
153 /* We try to do similar things as above for direction-switching
154 sequences. If we encountered a direction switch while an
155 invalid designation was present, or an invalid designation
156 just after a direction switch (i.e. no valid designation
157 encountered yet), we insert the direction-switch escape
158 sequence literally into the output stream, and later on
159 insert the corresponding direction-restoring escape sequence
161 unsigned int switched_dir_and_no_valid_charset_yet:1;
162 unsigned int invalid_switch_dir:1;
164 /* Tells the decoder to output the escape sequence literally
165 even though it was valid. Used in the games we play to
166 avoid lossage when we encounter invalid designations. */
167 unsigned int output_literally:1;
168 /* We encountered a direction switch followed by an invalid
169 designation. We didn't output the direction switch
170 literally because we didn't know about the invalid designation;
171 but we have to do so now. */
172 unsigned int output_direction_sequence:1;
175 EXFUN(Fcopy_coding_system, 2);
177 struct detection_state;
178 static int detect_coding_sjis(struct detection_state *st,
179 const Extbyte * src, Lstream_data_count n);
180 static void decode_coding_sjis(lstream_t decoding, const Extbyte * src,
181 unsigned_char_dynarr * dst,
182 Lstream_data_count n);
183 static void encode_coding_sjis(lstream_t encoding, const Bufbyte * src,
184 unsigned_char_dynarr * dst,
185 Lstream_data_count n);
186 static int detect_coding_big5(struct detection_state *st, const Extbyte * src,
187 Lstream_data_count n);
188 static void decode_coding_big5(lstream_t decoding, const Extbyte * src,
189 unsigned_char_dynarr * dst,
190 Lstream_data_count n);
191 static void encode_coding_big5(lstream_t encoding, const Bufbyte * src,
192 unsigned_char_dynarr * dst,
193 Lstream_data_count n);
194 static int detect_coding_ucs4(struct detection_state *st, const Extbyte * src,
195 Lstream_data_count n);
196 static void decode_coding_ucs4(lstream_t decoding, const Extbyte * src,
197 unsigned_char_dynarr * dst,
198 Lstream_data_count n);
199 static void encode_coding_ucs4(lstream_t encoding, const Bufbyte * src,
200 unsigned_char_dynarr * dst,
201 Lstream_data_count n);
202 static int detect_coding_utf8(struct detection_state *st, const Extbyte * src,
203 Lstream_data_count n);
204 static void decode_coding_utf8(lstream_t decoding, const Extbyte * src,
205 unsigned_char_dynarr * dst,
206 Lstream_data_count n);
207 static void encode_coding_utf8(lstream_t encoding, const Bufbyte * src,
208 unsigned_char_dynarr * dst,
209 Lstream_data_count n);
210 static int postprocess_iso2022_mask(int mask);
211 static void reset_iso2022(Lisp_Object coding_system,
212 struct iso2022_decoder *iso);
213 static int detect_coding_iso2022(struct detection_state *st,
214 const Extbyte * src, Lstream_data_count n);
215 static void decode_coding_iso2022(lstream_t decoding, const Extbyte * src,
216 unsigned_char_dynarr * dst,
217 Lstream_data_count n);
218 static void encode_coding_iso2022(lstream_t encoding, const Bufbyte * src,
219 unsigned_char_dynarr * dst,
220 Lstream_data_count n);
222 static void decode_coding_no_conversion(lstream_t decoding, const Extbyte * src,
223 unsigned_char_dynarr * dst,
224 Lstream_data_count n);
225 static void encode_coding_no_conversion(lstream_t encoding, const Bufbyte * src,
226 unsigned_char_dynarr * dst,
227 Lstream_data_count n);
228 static void mule_decode(lstream_t decoding, const Extbyte * src,
229 unsigned_char_dynarr * dst, Lstream_data_count n);
230 static void mule_encode(lstream_t encoding, const Bufbyte * src,
231 unsigned_char_dynarr * dst, Lstream_data_count n);
233 typedef struct codesys_prop codesys_prop;
234 struct codesys_prop {
240 Dynarr_declare(codesys_prop);
241 } codesys_prop_dynarr;
243 static const struct lrecord_description codesys_prop_description_1[] = {
244 {XD_LISP_OBJECT, offsetof(codesys_prop, sym)},
248 static const struct struct_description codesys_prop_description = {
249 sizeof(codesys_prop),
250 codesys_prop_description_1
253 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
254 XD_DYNARR_DESC(codesys_prop_dynarr, &codesys_prop_description),
258 static const struct struct_description codesys_prop_dynarr_description = {
259 sizeof(codesys_prop_dynarr),
260 codesys_prop_dynarr_description_1
263 codesys_prop_dynarr *the_codesys_prop_dynarr;
265 enum codesys_prop_enum {
267 CODESYS_PROP_ISO2022,
271 /************************************************************************/
272 /* Coding system functions */
273 /************************************************************************/
275 static Lisp_Object mark_coding_system(Lisp_Object);
276 static void print_coding_system(Lisp_Object, Lisp_Object, int);
277 static void finalize_coding_system(void *header, int for_disksave);
280 static const struct lrecord_description ccs_description_1[] = {
281 {XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset)},
282 {XD_LISP_OBJECT, offsetof(charset_conversion_spec, to_charset)},
286 static const struct struct_description ccs_description = {
287 sizeof(charset_conversion_spec),
291 static const struct lrecord_description ccsd_description_1[] = {
292 XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
296 static const struct struct_description ccsd_description = {
297 sizeof(charset_conversion_spec_dynarr),
302 static const struct lrecord_description coding_system_description[] = {
303 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, name)},
304 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, doc_string)},
305 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, mnemonic)},
306 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, post_read_conversion)},
307 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, pre_write_conversion)},
308 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_lf)},
309 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_crlf)},
310 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_cr)},
312 {XD_LISP_OBJECT_ARRAY,
313 offsetof(Lisp_Coding_System, iso2022.initial_charset), 4},
314 {XD_STRUCT_PTR, offsetof(Lisp_Coding_System, iso2022.input_conv), 1,
316 {XD_STRUCT_PTR, offsetof(Lisp_Coding_System, iso2022.output_conv), 1,
318 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, ccl.decode)},
319 {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, ccl.encode)},
324 DEFINE_LRECORD_IMPLEMENTATION("coding-system", coding_system,
325 mark_coding_system, print_coding_system,
326 finalize_coding_system,
327 0, 0, coding_system_description,
330 static Lisp_Object mark_coding_system(Lisp_Object obj)
332 Lisp_Coding_System *codesys = XCODING_SYSTEM(obj);
334 mark_object(CODING_SYSTEM_NAME(codesys));
335 mark_object(CODING_SYSTEM_DOC_STRING(codesys));
336 mark_object(CODING_SYSTEM_MNEMONIC(codesys));
337 mark_object(CODING_SYSTEM_EOL_LF(codesys));
338 mark_object(CODING_SYSTEM_EOL_CRLF(codesys));
339 mark_object(CODING_SYSTEM_EOL_CR(codesys));
341 switch (CODING_SYSTEM_TYPE(codesys)) {
344 case CODESYS_ISO2022:
345 for (i = 0; i < 4; i++)
346 mark_object(CODING_SYSTEM_ISO2022_INITIAL_CHARSET
348 if (codesys->iso2022.input_conv) {
350 i < Dynarr_length(codesys->iso2022.input_conv);
352 struct charset_conversion_spec *ccs =
353 Dynarr_atp(codesys->iso2022.input_conv, i);
354 mark_object(ccs->from_charset);
355 mark_object(ccs->to_charset);
358 if (codesys->iso2022.output_conv) {
360 i < Dynarr_length(codesys->iso2022.output_conv);
362 struct charset_conversion_spec *ccs =
363 Dynarr_atp(codesys->iso2022.output_conv, i);
364 mark_object(ccs->from_charset);
365 mark_object(ccs->to_charset);
371 mark_object(CODING_SYSTEM_CCL_DECODE(codesys));
372 mark_object(CODING_SYSTEM_CCL_ENCODE(codesys));
375 /* list the rest of them lot explicitly */
376 case CODESYS_AUTODETECT:
377 case CODESYS_SHIFT_JIS:
381 case CODESYS_NO_CONVERSION:
383 case CODESYS_INTERNAL:
390 mark_object(CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys));
391 return CODING_SYSTEM_POST_READ_CONVERSION(codesys);
395 print_coding_system(Lisp_Object obj, Lisp_Object printcharfun, int escapeflag)
397 Lisp_Coding_System *c = XCODING_SYSTEM(obj);
399 error("printing unreadable object #<coding-system 0x%x>",
402 write_c_string("#<coding-system ", printcharfun);
403 print_internal(c->name, printcharfun, 1);
404 write_c_string(">", printcharfun);
407 static void finalize_coding_system(void *header, int for_disksave)
409 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
410 /* Since coding systems never go away, this function is not
411 necessary. But it would be necessary if we changed things
412 so that coding systems could go away. */
413 if (!for_disksave) { /* see comment in lstream.c */
414 switch (CODING_SYSTEM_TYPE(c)) {
416 case CODESYS_ISO2022:
417 if (c->iso2022.input_conv) {
418 Dynarr_free(c->iso2022.input_conv);
419 c->iso2022.input_conv = 0;
421 if (c->iso2022.output_conv) {
422 Dynarr_free(c->iso2022.output_conv);
423 c->iso2022.output_conv = 0;
427 /* list the rest of them lot explicitly */
428 case CODESYS_AUTODETECT:
429 case CODESYS_SHIFT_JIS:
434 case CODESYS_NO_CONVERSION:
436 case CODESYS_INTERNAL:
445 static eol_type_t symbol_to_eol_type(Lisp_Object symbol)
447 CHECK_SYMBOL(symbol);
449 return EOL_AUTODETECT;
452 if (EQ(symbol, Qcrlf))
457 signal_simple_error("Unrecognized eol type", symbol);
458 return EOL_AUTODETECT; /* not reached */
461 static Lisp_Object eol_type_to_symbol(eol_type_t type)
478 static void setup_eol_coding_systems(Lisp_Coding_System * codesys)
480 Lisp_Object codesys_obj;
481 int len = string_length(XSYMBOL(CODING_SYSTEM_NAME(codesys))->name);
482 char *codesys_name = (char *)alloca(len + 7);
484 char *codesys_mnemonic = 0;
486 Lisp_Object codesys_name_sym, sub_codesys_obj;
490 XSETCODING_SYSTEM(codesys_obj, codesys);
493 string_data(XSYMBOL(CODING_SYSTEM_NAME(codesys))->name), len);
495 if (STRINGP(CODING_SYSTEM_MNEMONIC(codesys))) {
496 mlen = XSTRING_LENGTH(CODING_SYSTEM_MNEMONIC(codesys));
497 codesys_mnemonic = (char *)alloca(mlen + 7);
498 memcpy(codesys_mnemonic,
499 XSTRING_DATA(CODING_SYSTEM_MNEMONIC(codesys)), mlen);
501 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
502 strcpy (codesys_name + len, "-" op_sys); \
504 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
505 codesys_name_sym = intern (codesys_name); \
506 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
507 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
509 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
510 build_string (codesys_mnemonic); \
511 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
514 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
515 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
516 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
519 DEFUN("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
520 Return t if OBJECT is a coding system.
521 A coding system is an object that defines how text containing multiple
522 character sets is encoded into a stream of (typically 8-bit) bytes.
523 The coding system is used to decode the stream into a series of
524 characters (which may be from multiple charsets) when the text is read
525 from a file or process, and is used to encode the text back into the
526 same format when it is written out to a file or process.
528 For example, many ISO2022-compliant coding systems (such as Compound
529 Text, which is used for inter-client data under the X Window System)
530 use escape sequences to switch between different charsets -- Japanese
531 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
532 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
533 `make-coding-system' for more information.
535 Coding systems are normally identified using a symbol, and the
536 symbol is accepted in place of the actual coding system object whenever
537 a coding system is called for. (This is similar to how faces work.)
541 return CODING_SYSTEMP(object) ? Qt : Qnil;
544 DEFUN("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
545 Retrieve the coding system of the given name.
547 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
548 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
549 If there is no such coding system, nil is returned. Otherwise the
550 associated coding system object is returned.
552 (coding_system_or_name))
554 if (NILP(coding_system_or_name))
555 coding_system_or_name = Qbinary;
556 else if (CODING_SYSTEMP(coding_system_or_name))
557 return coding_system_or_name;
559 CHECK_SYMBOL(coding_system_or_name);
562 coding_system_or_name =
563 Fgethash(coding_system_or_name, Vcoding_system_hash_table,
566 if (CODING_SYSTEMP(coding_system_or_name)
567 || NILP(coding_system_or_name))
568 return coding_system_or_name;
572 DEFUN("get-coding-system", Fget_coding_system, 1, 1, 0, /*
573 Retrieve the coding system of the given name.
574 Same as `find-coding-system' except that if there is no such
575 coding system, an error is signaled instead of returning nil.
579 Lisp_Object coding_system = Ffind_coding_system(name);
581 if (NILP(coding_system))
582 signal_simple_error("No such coding system", name);
583 return coding_system;
586 /* We store the coding systems in hash tables with the names as the key and the
587 actual coding system object as the value. Occasionally we need to use them
588 in a list format. These routines provide us with that. */
589 struct coding_system_list_closure {
590 Lisp_Object *coding_system_list;
594 add_coding_system_to_list_mapper(Lisp_Object key, Lisp_Object value,
595 void *coding_system_list_closure)
597 /* This function can GC */
598 struct coding_system_list_closure *cscl =
599 (struct coding_system_list_closure *)coding_system_list_closure;
600 Lisp_Object *coding_system_list = cscl->coding_system_list;
602 *coding_system_list = Fcons(key, *coding_system_list);
606 DEFUN("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
607 Return a list of the names of all defined coding systems.
611 Lisp_Object coding_system_list = Qnil;
613 struct coding_system_list_closure coding_system_list_closure;
615 GCPRO1(coding_system_list);
616 coding_system_list_closure.coding_system_list = &coding_system_list;
617 elisp_maphash(add_coding_system_to_list_mapper,
618 Vcoding_system_hash_table, &coding_system_list_closure);
621 return coding_system_list;
624 DEFUN("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
625 Return the name of the given coding system.
629 coding_system = Fget_coding_system(coding_system);
630 return XCODING_SYSTEM_NAME(coding_system);
633 static Lisp_Coding_System *allocate_coding_system(enum coding_system_type type,
636 Lisp_Coding_System *codesys =
637 alloc_lcrecord_type(Lisp_Coding_System, &lrecord_coding_system);
639 zero_lcrecord(codesys);
640 CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys) = Qnil;
641 CODING_SYSTEM_POST_READ_CONVERSION(codesys) = Qnil;
642 CODING_SYSTEM_EOL_TYPE(codesys) = EOL_AUTODETECT;
643 CODING_SYSTEM_EOL_CRLF(codesys) = Qnil;
644 CODING_SYSTEM_EOL_CR(codesys) = Qnil;
645 CODING_SYSTEM_EOL_LF(codesys) = Qnil;
646 CODING_SYSTEM_TYPE(codesys) = type;
647 CODING_SYSTEM_MNEMONIC(codesys) = Qnil;
649 if (type == CODESYS_ISO2022) {
651 for (i = 0; i < 4; i++)
652 CODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, i) =
654 } else if (type == CODESYS_CCL) {
655 CODING_SYSTEM_CCL_DECODE(codesys) = Qnil;
656 CODING_SYSTEM_CCL_ENCODE(codesys) = Qnil;
659 CODING_SYSTEM_NAME(codesys) = name;
665 /* Given a list of charset conversion specs as specified in a Lisp
666 program, parse it into STORE_HERE. */
669 parse_charset_conversion_specs(charset_conversion_spec_dynarr * store_here,
670 Lisp_Object spec_list)
674 EXTERNAL_LIST_LOOP(rest, spec_list) {
675 Lisp_Object car = XCAR(rest);
676 Lisp_Object from, to;
677 struct charset_conversion_spec spec;
679 if (!CONSP(car) || !CONSP(XCDR(car)) || !NILP(XCDR(XCDR(car))))
680 signal_simple_error("Invalid charset conversion spec",
682 from = Fget_charset(XCAR(car));
683 to = Fget_charset(XCAR(XCDR(car)));
684 if (XCHARSET_TYPE(from) != XCHARSET_TYPE(to))
685 signal_simple_error_2
686 ("Attempted conversion between different charset types",
688 spec.from_charset = from;
689 spec.to_charset = to;
691 Dynarr_add(store_here, spec);
695 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
696 specs, return the equivalent as the Lisp programmer would see it.
698 If LOAD_HERE is 0, return Qnil. */
701 unparse_charset_conversion_specs(charset_conversion_spec_dynarr * load_here)
708 for (i = 0, result = Qnil; i < Dynarr_length(load_here); i++) {
709 struct charset_conversion_spec *ccs = Dynarr_atp(load_here, i);
711 Fcons(list2(ccs->from_charset, ccs->to_charset), result);
714 return Fnreverse(result);
719 DEFUN("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
720 Register symbol NAME as a coding system.
722 TYPE describes the conversion method used and should be one of
725 Automatic conversion. SXEmacs attempts to detect the coding system
728 No conversion. Use this for binary files and such. On output,
729 graphic characters that are not in ASCII or Latin-1 will be
730 replaced by a ?. (For a no-conversion-encoded buffer, these
731 characters will only be present if you explicitly insert them.)
733 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
735 ISO 10646 UCS-4 encoding.
737 ISO 10646 UTF-8 encoding.
739 Any ISO2022-compliant encoding. Among other things, this includes
740 JIS (the Japanese encoding commonly used for e-mail), EUC (the
741 standard Unix encoding for Japanese and other languages), and
742 Compound Text (the encoding used in X11). You can specify more
743 specific information about the conversion with the PROPS argument.
745 Big5 (the encoding commonly used for Taiwanese).
747 The conversion is performed using a user-written pseudo-code
748 program. CCL (Code Conversion Language) is the name of this
751 Write out or read in the raw contents of the memory representing
752 the buffer's text. This is primarily useful for debugging
753 purposes, and is only enabled when SXEmacs has been compiled with
754 DEBUG_SXEMACS defined (via the --debug configure option).
755 WARNING: Reading in a file using 'internal conversion can result
756 in an internal inconsistency in the memory representing a
757 buffer's text, which will produce unpredictable results and may
758 cause SXEmacs to crash. Under normal circumstances you should
759 never use 'internal conversion.
761 DOC-STRING is a string describing the coding system.
763 PROPS is a property list, describing the specific nature of the
764 character set. Recognized properties are:
767 String to be displayed in the modeline when this coding system is
771 End-of-line conversion to be used. It should be one of
774 Automatically detect the end-of-line type (LF, CRLF,
775 or CR). Also generate subsidiary coding systems named
776 `NAME-unix', `NAME-dos', and `NAME-mac', that are
777 identical to this coding system but have an EOL-TYPE
778 value of 'lf, 'crlf, and 'cr, respectively.
780 The end of a line is marked externally using ASCII LF.
781 Since this is also the way that SXEmacs represents an
782 end-of-line internally, specifying this option results
783 in no end-of-line conversion. This is the standard
784 format for Unix text files.
786 The end of a line is marked externally using ASCII
787 CRLF. This is the standard format for MS-DOS text
790 The end of a line is marked externally using ASCII CR.
791 This is the standard format for Macintosh text files.
793 Automatically detect the end-of-line type but do not
794 generate subsidiary coding systems. (This value is
795 converted to nil when stored internally, and
796 `coding-system-property' will return nil.)
798 'post-read-conversion
799 Function called after a file has been read in, to perform the
800 decoding. Called with two arguments, START and END, denoting
801 a region of the current buffer to be decoded.
803 'pre-write-conversion
804 Function called before a file is written out, to perform the
805 encoding. Called with two arguments, START and END, denoting
806 a region of the current buffer to be encoded.
808 The following additional properties are recognized if TYPE is 'iso2022:
814 The character set initially designated to the G0 - G3 registers.
815 The value should be one of
817 -- A charset object (designate that character set)
818 -- nil (do not ever use this register)
819 -- t (no character set is initially designated to
820 the register, but may be later on; this automatically
821 sets the corresponding `force-g*-on-output' property)
827 If non-nil, send an explicit designation sequence on output before
828 using the specified register.
831 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
832 "ESC $ B" on output in place of the full designation sequences
833 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
836 If non-nil, don't designate ASCII to G0 at each end of line on output.
837 Setting this to non-nil also suppresses other state-resetting that
838 normally happens at the end of a line.
841 If non-nil, don't designate ASCII to G0 before control chars on output.
844 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
848 If non-nil, use locking-shift (SO/SI) instead of single-shift
849 or designation by escape sequence.
852 If non-nil, don't use ISO6429's direction specification.
855 If non-nil, literal control characters that are the same as
856 the beginning of a recognized ISO2022 or ISO6429 escape sequence
857 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
858 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
859 so that they can be properly distinguished from an escape sequence.
860 (Note that doing this results in a non-portable encoding.) This
861 encoding flag is used for byte-compiled files. Note that ESC
862 is a good choice for a quoting character because there are no
863 escape sequences whose second byte is a character from the Control-0
864 or Control-1 character sets; this is explicitly disallowed by the
867 'input-charset-conversion
868 A list of conversion specifications, specifying conversion of
869 characters in one charset to another when decoding is performed.
870 Each specification is a list of two elements: the source charset,
871 and the destination charset.
873 'output-charset-conversion
874 A list of conversion specifications, specifying conversion of
875 characters in one charset to another when encoding is performed.
876 The form of each specification is the same as for
877 'input-charset-conversion.
879 The following additional properties are recognized (and required)
883 CCL program used for decoding (converting to internal format).
886 CCL program used for encoding (converting to external format).
888 (name, type, doc_string, props))
890 Lisp_Coding_System *codesys;
891 enum coding_system_type ty;
892 int need_to_setup_eol_systems = 1;
894 /* Convert type to constant */
895 if (NILP(type) || EQ(type, Qundecided)) {
896 ty = CODESYS_AUTODETECT;
899 else if (EQ(type, Qshift_jis)) {
900 ty = CODESYS_SHIFT_JIS;
901 } else if (EQ(type, Qiso2022)) {
902 ty = CODESYS_ISO2022;
903 } else if (EQ(type, Qbig5)) {
905 } else if (EQ(type, Qucs4)) {
907 } else if (EQ(type, Qutf8)) {
909 } else if (EQ(type, Qccl)) {
913 else if (EQ(type, Qno_conversion)) {
914 ty = CODESYS_NO_CONVERSION;
917 else if (EQ(type, Qinternal)) {
918 ty = CODESYS_INTERNAL;
922 signal_simple_error("Invalid coding system type", type);
926 codesys = allocate_coding_system(ty, name);
928 if (NILP(doc_string))
929 doc_string = build_string("");
931 CHECK_STRING(doc_string);
932 CODING_SYSTEM_DOC_STRING(codesys) = doc_string;
935 EXTERNAL_PROPERTY_LIST_LOOP_3(key, value, props) {
936 if (EQ(key, Qmnemonic)) {
939 CODING_SYSTEM_MNEMONIC(codesys) = value;
942 else if (EQ(key, Qeol_type)) {
943 need_to_setup_eol_systems = NILP(value);
946 CODING_SYSTEM_EOL_TYPE(codesys) =
947 symbol_to_eol_type(value);
950 else if (EQ(key, Qpost_read_conversion))
951 CODING_SYSTEM_POST_READ_CONVERSION(codesys) =
953 else if (EQ(key, Qpre_write_conversion))
954 CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys) =
957 else if (ty == CODESYS_ISO2022) {
958 #define FROB_INITIAL_CHARSET(charset_num) \
959 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
960 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
962 if (EQ(key, Qcharset_g0))
963 FROB_INITIAL_CHARSET(0);
964 else if (EQ(key, Qcharset_g1))
965 FROB_INITIAL_CHARSET(1);
966 else if (EQ(key, Qcharset_g2))
967 FROB_INITIAL_CHARSET(2);
968 else if (EQ(key, Qcharset_g3))
969 FROB_INITIAL_CHARSET(3);
971 #define FROB_FORCE_CHARSET(charset_num) \
972 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
974 else if (EQ(key, Qforce_g0_on_output))
975 FROB_FORCE_CHARSET(0);
976 else if (EQ(key, Qforce_g1_on_output))
977 FROB_FORCE_CHARSET(1);
978 else if (EQ(key, Qforce_g2_on_output))
979 FROB_FORCE_CHARSET(2);
980 else if (EQ(key, Qforce_g3_on_output))
981 FROB_FORCE_CHARSET(3);
983 #define FROB_BOOLEAN_PROPERTY(prop) \
984 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
986 else if (EQ(key, Qshort))
987 FROB_BOOLEAN_PROPERTY(SHORT);
988 else if (EQ(key, Qno_ascii_eol))
989 FROB_BOOLEAN_PROPERTY(NO_ASCII_EOL);
990 else if (EQ(key, Qno_ascii_cntl))
991 FROB_BOOLEAN_PROPERTY(NO_ASCII_CNTL);
992 else if (EQ(key, Qseven))
993 FROB_BOOLEAN_PROPERTY(SEVEN);
994 else if (EQ(key, Qlock_shift))
995 FROB_BOOLEAN_PROPERTY(LOCK_SHIFT);
996 else if (EQ(key, Qno_iso6429))
997 FROB_BOOLEAN_PROPERTY(NO_ISO6429);
998 else if (EQ(key, Qescape_quoted))
999 FROB_BOOLEAN_PROPERTY(ESCAPE_QUOTED);
1001 else if (EQ(key, Qinput_charset_conversion)) {
1002 codesys->iso2022.input_conv =
1003 Dynarr_new(charset_conversion_spec);
1004 parse_charset_conversion_specs(codesys->
1008 } else if (EQ(key, Qoutput_charset_conversion)) {
1009 codesys->iso2022.output_conv =
1010 Dynarr_new(charset_conversion_spec);
1011 parse_charset_conversion_specs(codesys->
1017 ("Unrecognized property", key);
1018 } else if (EQ(type, Qccl)) {
1020 struct ccl_program test_ccl;
1023 /* Check key first. */
1024 if (EQ(key, Qdecode))
1025 suffix = "-ccl-decode";
1026 else if (EQ(key, Qencode))
1027 suffix = "-ccl-encode";
1030 ("Unrecognized property", key);
1032 /* If value is vector, register it as a ccl program
1033 associated with an newly created symbol for
1034 backward compatibility. */
1035 if (VECTORP(value)) {
1038 (Fsymbol_name(name),
1039 build_string(suffix)),
1041 Fregister_ccl_program(sym, value);
1043 CHECK_SYMBOL(value);
1046 /* check if the given ccl programs are valid. */
1047 if (setup_ccl_program(&test_ccl, sym) < 0)
1049 ("Invalid CCL program", value);
1051 if (EQ(key, Qdecode))
1052 CODING_SYSTEM_CCL_DECODE(codesys) = sym;
1053 else if (EQ(key, Qencode))
1054 CODING_SYSTEM_CCL_ENCODE(codesys) = sym;
1059 signal_simple_error("Unrecognized property",
1064 if (need_to_setup_eol_systems)
1065 setup_eol_coding_systems(codesys);
1068 Lisp_Object codesys_obj;
1069 XSETCODING_SYSTEM(codesys_obj, codesys);
1070 Fputhash(name, codesys_obj, Vcoding_system_hash_table);
1075 DEFUN("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1076 Copy OLD-CODING-SYSTEM to NEW-NAME.
1077 If NEW-NAME does not name an existing coding system, a new one will
1080 (old_coding_system, new_name))
1082 Lisp_Object new_coding_system;
1083 old_coding_system = Fget_coding_system(old_coding_system);
1084 new_coding_system = Ffind_coding_system(new_name);
1085 if (NILP(new_coding_system)) {
1086 XSETCODING_SYSTEM(new_coding_system,
1087 allocate_coding_system
1088 (XCODING_SYSTEM_TYPE(old_coding_system),
1090 Fputhash(new_name, new_coding_system,
1091 Vcoding_system_hash_table);
1095 Lisp_Coding_System *to = XCODING_SYSTEM(new_coding_system);
1096 Lisp_Coding_System *from = XCODING_SYSTEM(old_coding_system);
1097 memcpy(((char *)to) + sizeof(to->header),
1098 ((char *)from) + sizeof(from->header),
1099 sizeof(*from) - sizeof(from->header));
1100 to->name = new_name;
1102 return new_coding_system;
1105 DEFUN("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1106 Return t if OBJECT names a coding system, and is not a coding system alias.
1110 Lisp_Object val = Fgethash(object, Vcoding_system_hash_table, Qnil);
1111 return CODING_SYSTEMP(val) ? Qt : Qnil;
1114 DEFUN("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1115 Return t if OBJECT is a coding system alias.
1116 All coding system aliases are created by `define-coding-system-alias'.
1120 Lisp_Object val = Fgethash(object, Vcoding_system_hash_table, Qzero);
1121 return SYMBOLP(val) ? Qt : Qnil;
1124 DEFUN("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1125 Return the coding-system symbol for which symbol ALIAS is an alias.
1129 Lisp_Object aliasee = Fgethash(alias, Vcoding_system_hash_table, Qnil);
1130 if (SYMBOLP(aliasee)) {
1133 signal_simple_error(
1134 "Symbol is not a coding system alias", alias);
1136 /* To keep the compiler happy */
1141 append_suffix_to_symbol(Lisp_Object symbol, char *ascii_string)
1143 return Fintern(concat2(Fsymbol_name(symbol),
1144 build_string(ascii_string)), Qnil);
1147 /* A maphash function, for removing dangling coding system aliases. */
1149 dangling_coding_system_alias_p(Lisp_Object alias,
1150 Lisp_Object aliasee, void *dangling_aliases)
1152 if (SYMBOLP(aliasee)
1153 && NILP(Fgethash(aliasee, Vcoding_system_hash_table, Qnil))) {
1154 (*(int *)dangling_aliases)++;
1161 DEFUN("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1162 Define symbol ALIAS as an alias for coding system ALIASEE.
1164 You can use this function to redefine an alias that has already been defined,
1165 but you cannot redefine a name which is the canonical name for a coding system.
1166 \(a canonical name of a coding system is what is returned when you call
1167 `coding-system-name' on a coding system).
1169 ALIASEE itself can be an alias, which allows you to define nested aliases.
1171 You are forbidden, however, from creating alias loops or `dangling' aliases.
1172 These will be detected, and an error will be signaled if you attempt to do so.
1174 If ALIASEE is nil, then ALIAS will simply be undefined.
1176 See also `coding-system-alias-p', `coding-system-aliasee',
1177 and `coding-system-canonical-name-p'.
1181 Lisp_Object real_coding_system, probe;
1183 CHECK_SYMBOL(alias);
1185 if (!NILP(Fcoding_system_canonical_name_p(alias)))
1187 ("Symbol is the canonical name of a coding system and cannot be redefined",
1190 if (NILP(aliasee)) {
1191 Lisp_Object subsidiary_unix =
1192 append_suffix_to_symbol(alias, "-unix");
1193 Lisp_Object subsidiary_dos =
1194 append_suffix_to_symbol(alias, "-dos");
1195 Lisp_Object subsidiary_mac =
1196 append_suffix_to_symbol(alias, "-mac");
1198 Fremhash(alias, Vcoding_system_hash_table);
1200 /* Undefine subsidiary aliases,
1201 presumably created by a previous call to this function */
1202 if (!NILP(Fcoding_system_alias_p(subsidiary_unix)) &&
1203 !NILP(Fcoding_system_alias_p(subsidiary_dos)) &&
1204 !NILP(Fcoding_system_alias_p(subsidiary_mac))) {
1205 Fdefine_coding_system_alias(subsidiary_unix, Qnil);
1206 Fdefine_coding_system_alias(subsidiary_dos, Qnil);
1207 Fdefine_coding_system_alias(subsidiary_mac, Qnil);
1210 /* Undefine dangling coding system aliases. */
1212 int dangling_aliases;
1215 dangling_aliases = 0;
1217 (dangling_coding_system_alias_p,
1218 Vcoding_system_hash_table,
1220 } while (dangling_aliases > 0);
1226 if (CODING_SYSTEMP(aliasee))
1227 aliasee = XCODING_SYSTEM_NAME(aliasee);
1229 /* Checks that aliasee names a coding-system */
1230 real_coding_system = Fget_coding_system(aliasee);
1231 SXE_SET_UNUSED(real_coding_system);
1233 /* Check for coding system alias loops */
1234 if (EQ(alias, aliasee))
1235 alias_loop:signal_simple_error_2
1236 ("Attempt to create a coding system alias loop", alias,
1239 for (probe = aliasee;
1241 probe = Fgethash(probe, Vcoding_system_hash_table, Qzero)) {
1242 if (EQ(probe, alias))
1246 Fputhash(alias, aliasee, Vcoding_system_hash_table);
1248 /* Set up aliases for subsidiaries.
1249 #### There must be a better way to handle subsidiary coding
1252 static char *suffixes[] = { "-unix", "-dos", "-mac" };
1254 for (int i = 0; i < countof(suffixes); i++) {
1255 Lisp_Object alias_subsidiary =
1256 append_suffix_to_symbol(alias, suffixes[i]);
1257 Lisp_Object aliasee_subsidiary =
1258 append_suffix_to_symbol(aliasee, suffixes[i]);
1260 if (!NILP(Ffind_coding_system(aliasee_subsidiary))) {
1261 Fdefine_coding_system_alias(alias_subsidiary,
1262 aliasee_subsidiary);
1266 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1267 but it doesn't look intentional, so I'd rather return something
1268 meaningful or nothing at all. */
1273 subsidiary_coding_system(Lisp_Object coding_system, eol_type_t type)
1275 Lisp_Coding_System *cs = XCODING_SYSTEM(coding_system);
1276 Lisp_Object new_coding_system;
1278 if (CODING_SYSTEM_EOL_TYPE(cs) != EOL_AUTODETECT)
1279 return coding_system;
1282 case EOL_AUTODETECT:
1283 return coding_system;
1285 new_coding_system = CODING_SYSTEM_EOL_LF(cs);
1288 new_coding_system = CODING_SYSTEM_EOL_CR(cs);
1291 new_coding_system = CODING_SYSTEM_EOL_CRLF(cs);
1298 return NILP(new_coding_system) ? coding_system : new_coding_system;
1301 DEFUN("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1302 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1304 (coding_system, eol_type))
1306 coding_system = Fget_coding_system(coding_system);
1308 return subsidiary_coding_system(coding_system,
1309 symbol_to_eol_type(eol_type));
1312 /************************************************************************/
1313 /* Coding system accessors */
1314 /************************************************************************/
1316 DEFUN("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1317 Return the doc string for CODING-SYSTEM.
1321 coding_system = Fget_coding_system(coding_system);
1322 return XCODING_SYSTEM_DOC_STRING(coding_system);
1325 DEFUN("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1326 Return the type of CODING-SYSTEM.
1330 Lisp_Object tmp = Fget_coding_system(coding_system);
1332 switch (XCODING_SYSTEM_TYPE(tmp)) {
1336 case CODESYS_AUTODETECT:
1339 case CODESYS_SHIFT_JIS:
1341 case CODESYS_ISO2022:
1352 case CODESYS_NO_CONVERSION:
1353 return Qno_conversion;
1354 #ifdef DEBUG_SXEMACS
1355 case CODESYS_INTERNAL:
1363 Lisp_Object coding_system_charset(Lisp_Object coding_system, int gnum)
1366 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET(coding_system, gnum);
1368 return CHARSETP(cs) ? XCHARSET_NAME(cs) : Qnil;
1371 DEFUN("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1372 Return initial charset of CODING-SYSTEM designated to GNUM.
1375 (coding_system, gnum))
1377 coding_system = Fget_coding_system(coding_system);
1380 return coding_system_charset(coding_system, XINT(gnum));
1384 DEFUN("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1385 Return the PROP property of CODING-SYSTEM.
1387 (coding_system, prop))
1390 enum coding_system_type type;
1392 coding_system = Fget_coding_system(coding_system);
1394 type = XCODING_SYSTEM_TYPE(coding_system);
1396 for (i = 0; !ok && i < Dynarr_length(the_codesys_prop_dynarr); i++)
1397 if (EQ(Dynarr_at(the_codesys_prop_dynarr, i).sym, prop)) {
1399 switch (Dynarr_at(the_codesys_prop_dynarr, i).prop_type) {
1400 case CODESYS_PROP_ALL_OK:
1403 case CODESYS_PROP_ISO2022:
1404 if (type != CODESYS_ISO2022)
1406 ("Property only valid in ISO2022 coding systems",
1410 case CODESYS_PROP_CCL:
1411 if (type != CODESYS_CCL)
1413 ("Property only valid in CCL coding systems",
1423 signal_simple_error("Unrecognized property", prop);
1425 if (EQ(prop, Qname))
1426 return XCODING_SYSTEM_NAME(coding_system);
1427 else if (EQ(prop, Qtype))
1428 return Fcoding_system_type(coding_system);
1429 else if (EQ(prop, Qdoc_string))
1430 return XCODING_SYSTEM_DOC_STRING(coding_system);
1431 else if (EQ(prop, Qmnemonic))
1432 return XCODING_SYSTEM_MNEMONIC(coding_system);
1433 else if (EQ(prop, Qeol_type))
1435 eol_type_to_symbol(XCODING_SYSTEM_EOL_TYPE(coding_system));
1436 else if (EQ(prop, Qeol_lf))
1437 return XCODING_SYSTEM_EOL_LF(coding_system);
1438 else if (EQ(prop, Qeol_crlf))
1439 return XCODING_SYSTEM_EOL_CRLF(coding_system);
1440 else if (EQ(prop, Qeol_cr))
1441 return XCODING_SYSTEM_EOL_CR(coding_system);
1442 else if (EQ(prop, Qpost_read_conversion))
1443 return XCODING_SYSTEM_POST_READ_CONVERSION(coding_system);
1444 else if (EQ(prop, Qpre_write_conversion))
1445 return XCODING_SYSTEM_PRE_WRITE_CONVERSION(coding_system);
1447 else if (type == CODESYS_ISO2022) {
1448 if (EQ(prop, Qcharset_g0))
1449 return coding_system_charset(coding_system, 0);
1450 else if (EQ(prop, Qcharset_g1))
1451 return coding_system_charset(coding_system, 1);
1452 else if (EQ(prop, Qcharset_g2))
1453 return coding_system_charset(coding_system, 2);
1454 else if (EQ(prop, Qcharset_g3))
1455 return coding_system_charset(coding_system, 3);
1457 #define FORCE_CHARSET(charset_num) \
1458 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1459 (coding_system, charset_num) ? Qt : Qnil)
1461 else if (EQ(prop, Qforce_g0_on_output))
1462 return FORCE_CHARSET(0);
1463 else if (EQ(prop, Qforce_g1_on_output))
1464 return FORCE_CHARSET(1);
1465 else if (EQ(prop, Qforce_g2_on_output))
1466 return FORCE_CHARSET(2);
1467 else if (EQ(prop, Qforce_g3_on_output))
1468 return FORCE_CHARSET(3);
1470 #define LISP_BOOLEAN(prop) \
1471 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1473 else if (EQ(prop, Qshort))
1474 return LISP_BOOLEAN(SHORT);
1475 else if (EQ(prop, Qno_ascii_eol))
1476 return LISP_BOOLEAN(NO_ASCII_EOL);
1477 else if (EQ(prop, Qno_ascii_cntl))
1478 return LISP_BOOLEAN(NO_ASCII_CNTL);
1479 else if (EQ(prop, Qseven))
1480 return LISP_BOOLEAN(SEVEN);
1481 else if (EQ(prop, Qlock_shift))
1482 return LISP_BOOLEAN(LOCK_SHIFT);
1483 else if (EQ(prop, Qno_iso6429))
1484 return LISP_BOOLEAN(NO_ISO6429);
1485 else if (EQ(prop, Qescape_quoted))
1486 return LISP_BOOLEAN(ESCAPE_QUOTED);
1488 else if (EQ(prop, Qinput_charset_conversion))
1490 unparse_charset_conversion_specs
1491 (XCODING_SYSTEM(coding_system)->iso2022.input_conv);
1492 else if (EQ(prop, Qoutput_charset_conversion))
1494 unparse_charset_conversion_specs
1495 (XCODING_SYSTEM(coding_system)->iso2022.
1499 } else if (type == CODESYS_CCL) {
1500 if (EQ(prop, Qdecode))
1501 return XCODING_SYSTEM_CCL_DECODE(coding_system);
1502 else if (EQ(prop, Qencode))
1503 return XCODING_SYSTEM_CCL_ENCODE(coding_system);
1511 return Qnil; /* not reached */
1514 /************************************************************************/
1515 /* Coding category functions */
1516 /************************************************************************/
1518 static int decode_coding_category(Lisp_Object symbol)
1522 CHECK_SYMBOL(symbol);
1523 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1524 if (EQ(coding_category_symbol[i], symbol))
1527 signal_simple_error("Unrecognized coding category", symbol);
1528 return 0; /* not reached */
1531 DEFUN("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1532 Return a list of all recognized coding categories.
1537 Lisp_Object list = Qnil;
1539 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1540 list = Fcons(coding_category_symbol[i], list);
1544 DEFUN("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1545 Change the priority order of the coding categories.
1546 LIST should be list of coding categories, in descending order of
1547 priority. Unspecified coding categories will be lower in priority
1548 than all specified ones, in the same relative order they were in
1553 int category_to_priority[CODING_CATEGORY_LAST];
1557 /* First generate a list that maps coding categories to priorities. */
1559 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1560 category_to_priority[i] = -1;
1562 /* Highest priority comes from the specified list. */
1564 EXTERNAL_LIST_LOOP(rest, list) {
1565 int cat = decode_coding_category(XCAR(rest));
1567 if (category_to_priority[cat] >= 0)
1568 signal_simple_error("Duplicate coding category in list",
1570 category_to_priority[cat] = i++;
1573 /* Now go through the existing categories by priority to retrieve
1574 the categories not yet specified and preserve their priority
1576 for (j = 0; j < CODING_CATEGORY_LAST; j++) {
1577 int cat = fcd->coding_category_by_priority[j];
1578 if (category_to_priority[cat] < 0)
1579 category_to_priority[cat] = i++;
1582 /* Now we need to construct the inverse of the mapping we just
1585 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1586 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1588 /* Phew! That was confusing. */
1592 DEFUN("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1593 Return a list of coding categories in descending order of priority.
1598 Lisp_Object list = Qnil;
1600 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1602 Fcons(coding_category_symbol
1603 [fcd->coding_category_by_priority[i]], list);
1607 DEFUN("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1608 Change the coding system associated with a coding category.
1610 (coding_category, coding_system))
1612 int cat = decode_coding_category(coding_category);
1614 coding_system = Fget_coding_system(coding_system);
1615 fcd->coding_category_system[cat] = coding_system;
1619 DEFUN("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1620 Return the coding system associated with a coding category.
1624 int cat = decode_coding_category(coding_category);
1625 Lisp_Object sys = fcd->coding_category_system[cat];
1628 return XCODING_SYSTEM_NAME(sys);
1632 /************************************************************************/
1633 /* Detecting the encoding of data */
1634 /************************************************************************/
1636 struct detection_state {
1637 eol_type_t eol_type;
1664 struct iso2022_decoder iso;
1666 int high_byte_count;
1667 unsigned int saw_single_shift:1;
1676 static int acceptable_control_char_p(int c)
1679 /* Allow and ignore control characters that you might
1680 reasonably see in a text file */
1685 case 8: /* backspace */
1686 case 11: /* vertical tab */
1687 case 12: /* form feed */
1688 case 26: /* MS-DOS C-z junk */
1689 case 31: /* '^_' -- for info */
1696 static int mask_has_at_most_one_bit_p(int mask)
1698 /* Perhaps the only thing useful you learn from intensive Microsoft
1699 technical interviews */
1700 return (mask & (mask - 1)) == 0;
1704 detect_eol_type(struct detection_state *st, const Extbyte * src,
1705 Lstream_data_count n)
1708 const unsigned char c = *(const unsigned char*)src++;
1710 if (st->eol.just_saw_cr)
1712 else if (st->eol.seen_anything)
1714 } else if (st->eol.just_saw_cr)
1717 st->eol.just_saw_cr = 1;
1719 st->eol.just_saw_cr = 0;
1720 st->eol.seen_anything = 1;
1723 return EOL_AUTODETECT;
1726 /* Attempt to determine the encoding and EOL type of the given text.
1727 Before calling this function for the first type, you must initialize
1728 st->eol_type as appropriate and initialize st->mask to ~0.
1730 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1733 st->mask holds the determined coding category mask, or ~0 if only
1734 ASCII has been seen so far.
1738 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1739 is present in st->mask
1740 1 == definitive answers are here for both st->eol_type and st->mask
1744 detect_coding_type(struct detection_state *st, const Extbyte * src,
1745 Lstream_data_count n, int just_do_eol)
1747 if (st->eol_type == EOL_AUTODETECT)
1748 st->eol_type = detect_eol_type(st, src, n);
1751 return st->eol_type != EOL_AUTODETECT;
1753 if (!st->seen_non_ascii) {
1754 for (; n; n--, src++) {
1755 const unsigned char c = *(const unsigned char *)src;
1756 if ((c < 0x20 && !acceptable_control_char_p(c))
1758 st->seen_non_ascii = 1;
1760 st->shift_jis.mask = ~0;
1764 st->iso2022.mask = ~0;
1775 if (!mask_has_at_most_one_bit_p(st->iso2022.mask))
1776 st->iso2022.mask = detect_coding_iso2022(st, src, n);
1777 if (!mask_has_at_most_one_bit_p(st->shift_jis.mask))
1778 st->shift_jis.mask = detect_coding_sjis(st, src, n);
1779 if (!mask_has_at_most_one_bit_p(st->big5.mask))
1780 st->big5.mask = detect_coding_big5(st, src, n);
1781 if (!mask_has_at_most_one_bit_p(st->utf8.mask))
1782 st->utf8.mask = detect_coding_utf8(st, src, n);
1783 if (!mask_has_at_most_one_bit_p(st->ucs4.mask))
1784 st->ucs4.mask = detect_coding_ucs4(st, src, n);
1786 st->mask = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1787 | st->utf8.mask | st->ucs4.mask;
1790 int retval = mask_has_at_most_one_bit_p(st->mask);
1791 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1792 return retval && st->eol_type != EOL_AUTODETECT;
1796 static Lisp_Object coding_system_from_mask(int mask)
1799 /* If the file was entirely or basically ASCII, use the
1800 default value of `buffer-file-coding-system'. */
1801 Lisp_Object retval =
1802 XBUFFER(Vbuffer_defaults)->buffer_file_coding_system;
1803 if (!NILP(retval)) {
1804 retval = Ffind_coding_system(retval);
1807 (Qbad_variable, Qwarning,
1808 "Invalid `default-buffer-file-coding-system', set to nil");
1809 XBUFFER(Vbuffer_defaults)->
1810 buffer_file_coding_system = Qnil;
1814 retval = Fget_coding_system(Qraw_text);
1820 mask = postprocess_iso2022_mask(mask);
1822 /* Look through the coding categories by priority and find
1823 the first one that is allowed. */
1824 for (i = 0; i < CODING_CATEGORY_LAST; i++) {
1825 cat = fcd->coding_category_by_priority[i];
1828 if ((mask & (1 << cat)) &&
1829 !NILP(fcd->coding_category_system[cat]))
1833 return fcd->coding_category_system[cat];
1835 return Fget_coding_system(Qraw_text);
1839 /* Given a seekable read stream and potential coding system and EOL type
1840 as specified, do any autodetection that is called for. If the
1841 coding system and/or EOL type are not `autodetect', they will be left
1842 alone; but this function will never return an autodetect coding system
1845 This function does not automatically fetch subsidiary coding systems;
1846 that should be unnecessary with the explicit eol-type argument. */
1848 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1849 /* number of leading lines to check for a coding cookie */
1850 #define LINES_TO_CHECK 2
1854 autodetect_real_coding_system(lstream_t stream, Lisp_Object * codesys_in_out,
1855 eol_type_t * eol_type_in_out)
1857 static const char mime_name_valid_chars[] =
1858 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1859 "abcdefghijklmnopqrstuvwxyz"
1863 struct detection_state decst;
1866 Lisp_Object coding_system = Qnil;
1869 int lines_checked = 0;
1870 Lstream_data_count nread =
1871 Lstream_read(stream, buf, sizeof(buf));
1874 /* Nothing more can be done here */
1878 decst.eol_type = *eol_type_in_out;
1882 /* Look for initial "-*-"; mode line prefix */
1883 for (p = buf, scan_end = buf + nread - LENGTH("-*-coding:?-*-");
1884 p <= scan_end && lines_checked < LINES_TO_CHECK; p++) {
1885 Extbyte *local_vars_beg = p + 3;
1887 if (*p == '\n' || *p == '\r') {
1888 /* file must use standard EOLs or we miss 2d
1889 line not to mention this is broken for
1892 /* skip past multibyte (DOS) newline */
1893 if (*p == '\r' && *(p + 1) == '\n')
1897 if (*p != '-' || *(p + 1) != '*' || *(p + 2) != '-') {
1901 /* Look for final "-*-"; mode line suffix */
1902 for (p = local_vars_beg, scan_end = buf + nread - LENGTH("-*-");
1903 p <= scan_end && lines_checked < LINES_TO_CHECK; p++) {
1904 Extbyte *suffix = p;
1905 if (*p == '\n' || *p == '\r') {
1906 /* file must use standard EOLs or we
1907 miss 2d line not to mention this is
1908 broken for UTF-16 DOS files */
1910 /* skip past multibyte (DOS) newline */
1912 && *(p + 1) == '\n')
1916 if (*p != '-' || *(p + 1) != '*' || *(p + 2) != '-') {
1920 /* Look for "coding:" */
1921 for (p = local_vars_beg, scan_end = suffix - LENGTH("coding:?");
1922 p <= scan_end; p++) {
1926 if (memcmp("coding:", p, LENGTH("coding:")) != 0) {
1929 if (p != local_vars_beg && strchr(" \t;", *p) == NULL ) {
1932 p += LENGTH("coding:");
1933 while (*p == ' ' || *p == '\t') {
1937 /* Get coding system name */
1940 /* Characters valid in a MIME charset
1941 name (rfc 1521), and in a Lisp
1943 n = strspn((char *)p, mime_name_valid_chars);
1948 coding_system = Ffind_coding_system(
1954 /* Try " coding:" */
1955 if (NILP(coding_system)) {
1956 for (p = local_vars_beg, scan_end = suffix - LENGTH(" coding:?");
1957 p <= scan_end; p++) {
1961 if (memcmp(" coding:", p, LENGTH(" coding:")) != 0) {
1964 if (p != local_vars_beg && strchr(" \t;", *p) == NULL ) {
1967 p += LENGTH(" coding:");
1968 while (*p == ' ' || *p == '\t') {
1972 /* Get coding system name */
1975 /* Characters valid in a MIME charset
1976 name (rfc 1521), and in a Lisp
1978 n = strspn((char *)p, mime_name_valid_chars);
1983 coding_system = Ffind_coding_system(
1994 if (NILP(coding_system)) {
1996 if (detect_coding_type(&decst, buf, nread,
1997 XCODING_SYSTEM_TYPE(*codesys_in_out)
1998 != CODESYS_AUTODETECT))
2000 nread = Lstream_read(stream, buf, sizeof(buf));
2005 } else if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT
2006 && XCODING_SYSTEM_EOL_TYPE(coding_system) == EOL_AUTODETECT) {
2008 if (detect_coding_type(&decst, buf, nread, 1))
2010 nread = Lstream_read(stream, buf, sizeof(buf));
2016 *eol_type_in_out = decst.eol_type;
2017 if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT) {
2018 if (NILP(coding_system))
2020 coding_system_from_mask(decst.mask);
2022 *codesys_in_out = coding_system;
2027 determine_real_coding_system(lstream_t stream, Lisp_Object * codesys_in_out,
2028 eol_type_t * eol_type_in_out)
2031 if (*eol_type_in_out == EOL_AUTODETECT)
2032 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE(*codesys_in_out);
2035 /* If autodetection is called for, do it now. */
2036 if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT
2037 || *eol_type_in_out == EOL_AUTODETECT) {
2038 autodetect_real_coding_system(stream, codesys_in_out,
2042 /* If we absolutely can't determine the EOL type, just assume LF. */
2043 if (*eol_type_in_out == EOL_AUTODETECT)
2044 *eol_type_in_out = EOL_LF;
2046 Lstream_rewind(stream);
2049 DEFUN("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
2050 Detect coding system of the text in the region between START and END.
2051 Return a list of possible coding systems ordered by priority.
2052 If only ASCII characters are found, return 'undecided or one of
2053 its subsidiary coding systems according to a detected end-of-line
2054 type. Optional arg BUFFER defaults to the current buffer.
2056 (start, end, buffer))
2058 Lisp_Object val = Qnil;
2059 struct buffer *buf = decode_buffer(buffer, 0);
2061 Lisp_Object instream, lb_instream;
2062 lstream_t istr, lb_istr;
2063 struct detection_state decst;
2064 struct gcpro gcpro1, gcpro2;
2066 get_buffer_range_char(buf, start, end, &b, &e, 0);
2067 lb_instream = make_lisp_buffer_input_stream(buf, b, e, 0);
2068 lb_istr = XLSTREAM(lb_instream);
2070 make_encoding_input_stream(lb_istr, Fget_coding_system(Qbinary));
2071 istr = XLSTREAM(instream);
2072 GCPRO2(instream, lb_instream);
2074 decst.eol_type = EOL_AUTODETECT;
2077 Extbyte random_buffer[4096];
2078 Lstream_data_count nread =
2079 Lstream_read(istr, random_buffer, sizeof(random_buffer));
2083 if (detect_coding_type(&decst, random_buffer, nread, 0))
2087 if (decst.mask == ~0)
2088 val = subsidiary_coding_system(Fget_coding_system(Qundecided),
2095 decst.mask = postprocess_iso2022_mask(decst.mask);
2097 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--) {
2098 int sys = fcd->coding_category_by_priority[i];
2099 if (decst.mask & (1 << sys)) {
2100 Lisp_Object codesys =
2101 fcd->coding_category_system[sys];
2104 subsidiary_coding_system(codesys,
2107 val = Fcons(codesys, val);
2111 Lstream_close(istr);
2113 Lstream_delete(istr);
2114 Lstream_delete(lb_istr);
2118 /************************************************************************/
2119 /* Converting to internal Mule format ("decoding") */
2120 /************************************************************************/
2122 /* A decoding stream is a stream used for decoding text (i.e.
2123 converting from some external format to internal format).
2124 The decoding-stream object keeps track of the actual coding
2125 stream, the stream that is at the other end, and data that
2126 needs to be persistent across the lifetime of the stream. */
2128 /* Handle the EOL stuff related to just-read-in character C.
2129 EOL_TYPE is the EOL type of the coding stream.
2130 FLAGS is the current value of FLAGS in the coding stream, and may
2131 be modified by this macro. (The macro only looks at the
2132 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
2133 bytes are to be written. You need to also define a local goto
2134 label "label_continue_loop" that is at the end of the main
2135 character-reading loop.
2137 If C is a CR character, then this macro handles it entirely and
2138 jumps to label_continue_loop. Otherwise, this macro does not add
2139 anything to DST, and continues normally. You should continue
2140 processing C normally after this macro. */
2142 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2146 if (eol_type == EOL_CR) \
2147 Dynarr_add (dst, '\n'); \
2148 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2149 Dynarr_add (dst, c); \
2151 flags |= CODING_STATE_CR; \
2152 goto label_continue_loop; \
2154 else if (flags & CODING_STATE_CR) \
2155 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2157 Dynarr_add (dst, '\r'); \
2158 flags &= ~CODING_STATE_CR; \
2162 /* C should be a binary character in the range 0 - 255; convert
2163 to internal format and add to Dynarr DST. */
2165 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2167 if (BYTE_ASCII_P (c)) \
2168 Dynarr_add (dst, c); \
2169 else if (BYTE_C1_P (c)) \
2171 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2172 Dynarr_add (dst, c + 0x20); \
2176 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2177 Dynarr_add (dst, c); \
2181 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2185 DECODE_ADD_BINARY_CHAR (ch, dst); \
2190 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2192 if (flags & CODING_STATE_END) \
2194 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2195 if (flags & CODING_STATE_CR) \
2196 Dynarr_add (dst, '\r'); \
2200 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2202 typedef struct decoding_stream_s *decoding_stream_t;
2203 struct decoding_stream_s {
2204 /* Coding system that governs the conversion. */
2205 Lisp_Coding_System *codesys;
2207 /* Stream that we read the encoded data from or
2208 write the decoded data to. */
2209 lstream_t other_end;
2211 /* If we are reading, then we can return only a fixed amount of
2212 data, so if the conversion resulted in too much data, we store it
2213 here for retrieval the next time around. */
2214 unsigned_char_dynarr *runoff;
2216 /* FLAGS holds flags indicating the current state of the decoding.
2217 Some of these flags are dependent on the coding system. */
2220 /* CH holds a partially built-up character. Since we only deal
2221 with one- and two-byte characters at the moment, we only use
2222 this to store the first byte of a two-byte character. */
2225 /* EOL_TYPE specifies the type of end-of-line conversion that
2226 currently applies. We need to keep this separate from the
2227 EOL type stored in CODESYS because the latter might indicate
2228 automatic EOL-type detection while the former will always
2229 indicate a particular EOL type. */
2230 eol_type_t eol_type;
2232 /* Additional ISO2022 information. We define the structure above
2233 because it's also needed by the detection routines. */
2234 struct iso2022_decoder iso2022;
2236 /* Additional information (the state of the running CCL program)
2237 used by the CCL decoder. */
2238 struct ccl_program ccl;
2240 /* counter for UTF-8 or UCS-4 */
2241 unsigned char counter;
2243 struct detection_state decst;
2246 static Lstream_data_count
2247 decoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size);
2248 static Lstream_data_count
2249 decoding_writer(lstream_t stream,
2250 const unsigned char *data, Lstream_data_count size);
2251 static int decoding_rewinder(lstream_t stream);
2252 static int decoding_seekable_p(lstream_t stream);
2253 static int decoding_flusher(lstream_t stream);
2254 static int decoding_closer(lstream_t stream);
2256 static Lisp_Object decoding_marker(Lisp_Object stream);
2258 DEFINE_LSTREAM_IMPLEMENTATION("decoding", lstream_decoding,
2259 sizeof(struct decoding_stream_s));
2262 decoding_marker(Lisp_Object stream)
2264 lstream_t str = DECODING_STREAM_DATA(XLSTREAM(stream))->other_end;
2265 Lisp_Object str_obj;
2267 /* We do not need to mark the coding systems or charsets stored
2268 within the stream because they are stored in a global list
2269 and automatically marked. */
2271 XSETLSTREAM(str_obj, str);
2272 mark_object(str_obj);
2273 if (str->imp->marker) {
2274 return str->imp->marker(str_obj);
2280 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2281 so we read data from the other end, decode it, and store it into DATA. */
2283 static Lstream_data_count
2284 decoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size)
2286 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2287 unsigned char *orig_data = data;
2288 Lstream_data_count read_size;
2289 int error_occurred = 0;
2291 /* We need to interface to mule_decode(), which expects to take some
2292 amount of data and store the result into a Dynarr. We have
2293 mule_decode() store into str->runoff, and take data from there
2296 /* We loop until we have enough data, reading chunks from the other
2297 end and decoding it. */
2299 /* Take data from the runoff if we can. Make sure to take at
2300 most SIZE bytes, and delete the data from the runoff. */
2301 if (Dynarr_length(str->runoff) > 0) {
2302 Lstream_data_count chunk =
2304 (Lstream_data_count)
2305 Dynarr_length(str->runoff));
2306 memcpy(data, Dynarr_atp(str->runoff, 0), chunk);
2307 Dynarr_delete_many(str->runoff, 0, chunk);
2313 /* No more room for data */
2317 if (str->flags & CODING_STATE_END) {
2318 /* This means that on the previous iteration, we hit the
2319 EOF on the other end. We loop once more so that
2320 mule_decode() can output any final stuff it may be
2321 holding, or any "go back to a sane state" escape
2322 sequences. (This latter makes sense during
2327 /* Exhausted the runoff, so get some more. DATA has at least
2328 SIZE bytes left of storage in it, so it's OK to read directly
2329 into it. (We'll be overwriting above, after we've decoded it
2330 into the runoff.) */
2331 read_size = Lstream_read(str->other_end, data, size);
2332 if (read_size < 0) {
2336 if (read_size == 0) {
2337 /* There might be some more end data produced in the
2338 translation. See the comment above. */
2339 str->flags |= CODING_STATE_END;
2341 mule_decode(stream, (Extbyte *) data, str->runoff, read_size);
2344 if (data - orig_data == 0) {
2345 return error_occurred ? -1 : 0;
2347 return data - orig_data;
2351 static Lstream_data_count
2352 decoding_writer(lstream_t stream, const unsigned char *data,
2353 Lstream_data_count size)
2355 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2356 Lstream_data_count retval;
2358 /* Decode all our data into the runoff, and then attempt to write
2359 it all out to the other end. Remove whatever chunk we succeeded
2361 mule_decode(stream, (const Extbyte *)data, str->runoff, size);
2362 retval = Lstream_write(str->other_end, Dynarr_atp(str->runoff, 0),
2363 Dynarr_length(str->runoff));
2365 Dynarr_delete_many(str->runoff, 0, retval);
2367 /* Do NOT return retval. The return value indicates how much
2368 of the incoming data was written, not how many bytes were
2374 reset_decoding_stream(decoding_stream_t str)
2377 if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_ISO2022) {
2378 Lisp_Object coding_system;
2379 XSETCODING_SYSTEM(coding_system, str->codesys);
2380 reset_iso2022(coding_system, &str->iso2022);
2381 } else if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_CCL) {
2382 setup_ccl_program(&str->ccl,
2383 CODING_SYSTEM_CCL_DECODE(str->codesys));
2387 if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT
2388 || CODING_SYSTEM_EOL_TYPE(str->codesys) == EOL_AUTODETECT) {
2390 str->decst.eol_type = EOL_AUTODETECT;
2391 str->decst.mask = ~0;
2393 str->flags = str->ch = 0;
2397 decoding_rewinder(lstream_t stream)
2399 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2400 reset_decoding_stream(str);
2401 Dynarr_reset(str->runoff);
2402 return Lstream_rewind(str->other_end);
2406 decoding_seekable_p(lstream_t stream)
2408 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2409 return Lstream_seekable_p(str->other_end);
2413 decoding_flusher(lstream_t stream)
2415 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2416 return Lstream_flush(str->other_end);
2420 decoding_closer(lstream_t stream)
2422 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2423 if (stream->flags & LSTREAM_FL_WRITE) {
2424 str->flags |= CODING_STATE_END;
2425 decoding_writer(stream, 0, 0);
2427 Dynarr_free(str->runoff);
2429 #ifdef ENABLE_COMPOSITE_CHARS
2430 if (str->iso2022.composite_chars) {
2431 Dynarr_free(str->iso2022.composite_chars);
2435 return Lstream_close(str->other_end);
2439 decoding_stream_coding_system(lstream_t stream)
2441 Lisp_Object coding_system;
2442 decoding_stream_t str = DECODING_STREAM_DATA(stream);
2444 XSETCODING_SYSTEM(coding_system, str->codesys);
2445 return subsidiary_coding_system(coding_system, str->eol_type);
2449 set_decoding_stream_coding_system(lstream_t lstr, Lisp_Object codesys)
2451 Lisp_Coding_System *cs = XCODING_SYSTEM(codesys);
2452 decoding_stream_t str = DECODING_STREAM_DATA(lstr);
2454 if (CODING_SYSTEM_EOL_TYPE(cs) != EOL_AUTODETECT) {
2455 str->eol_type = CODING_SYSTEM_EOL_TYPE(cs);
2457 reset_decoding_stream(str);
2461 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2462 stream for writing, no automatic code detection will be performed.
2463 The reason for this is that automatic code detection requires a
2464 seekable input. Things will also fail if you open a decoding
2465 stream for reading using a non-fully-specified coding system and
2466 a non-seekable input stream. */
2469 make_decoding_stream_1(lstream_t stream, Lisp_Object codesys, const char *mode)
2471 lstream_t lstr = Lstream_new(lstream_decoding, mode);
2472 decoding_stream_t str = DECODING_STREAM_DATA(lstr);
2476 str->other_end = stream;
2477 str->runoff = (unsigned_char_dynarr *) Dynarr_new(unsigned_char);
2478 str->eol_type = EOL_AUTODETECT;
2479 if (!strcmp(mode, "r") && Lstream_seekable_p(stream)) {
2480 /* We can determine the coding system now. */
2481 determine_real_coding_system(stream, &codesys, &str->eol_type);
2483 set_decoding_stream_coding_system(lstr, codesys);
2484 str->decst.eol_type = str->eol_type;
2485 str->decst.mask = ~0;
2486 XSETLSTREAM(obj, lstr);
2491 make_decoding_input_stream(lstream_t stream, Lisp_Object codesys)
2493 return make_decoding_stream_1(stream, codesys, "r");
2497 make_decoding_output_stream(lstream_t stream, Lisp_Object codesys)
2499 return make_decoding_stream_1(stream, codesys, "w");
2502 /* Note: the decode_coding_* functions all take the same
2503 arguments as mule_decode(), which is to say some SRC data of
2504 size N, which is to be stored into dynamic array DST.
2505 DECODING is the stream within which the decoding is
2506 taking place, but no data is actually read from or
2507 written to that stream; that is handled in decoding_reader()
2508 or decoding_writer(). This allows the same functions to
2509 be used for both reading and writing. */
2512 mule_decode(lstream_t decoding, const Extbyte * src,
2513 unsigned_char_dynarr * dst, Lstream_data_count n)
2515 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
2517 /* If necessary, do encoding-detection now. We do this when
2518 we're a writing stream or a non-seekable reading stream,
2519 meaning that we can't just process the whole input,
2520 rewind, and start over. */
2522 if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT ||
2523 str->eol_type == EOL_AUTODETECT) {
2524 Lisp_Object codesys;
2526 XSETCODING_SYSTEM(codesys, str->codesys);
2527 detect_coding_type(&str->decst, src, n,
2528 CODING_SYSTEM_TYPE(str->codesys) !=
2529 CODESYS_AUTODETECT);
2530 if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT &&
2531 str->decst.mask != ~0)
2532 /* #### This is cheesy. What we really ought to do is
2533 buffer up a certain amount of data so as to get a
2534 less random result. */
2535 codesys = coding_system_from_mask(str->decst.mask);
2536 str->eol_type = str->decst.eol_type;
2537 if (XCODING_SYSTEM(codesys) != str->codesys) {
2538 /* Preserve the CODING_STATE_END flag in case it was set.
2539 If we erase it, bad things might happen. */
2540 int was_end = str->flags & CODING_STATE_END;
2541 set_decoding_stream_coding_system(decoding, codesys);
2543 str->flags |= CODING_STATE_END;
2547 switch (CODING_SYSTEM_TYPE(str->codesys)) {
2548 #ifdef DEBUG_SXEMACS
2549 case CODESYS_INTERNAL:
2550 Dynarr_add_many(dst, src, n);
2553 case CODESYS_AUTODETECT:
2554 /* If we got this far and still haven't decided on the coding
2555 system, then do no conversion. */
2556 case CODESYS_NO_CONVERSION:
2557 decode_coding_no_conversion(decoding, src, dst, n);
2560 case CODESYS_SHIFT_JIS:
2561 decode_coding_sjis(decoding, src, dst, n);
2564 decode_coding_big5(decoding, src, dst, n);
2567 decode_coding_ucs4(decoding, src, dst, n);
2570 decode_coding_utf8(decoding, src, dst, n);
2573 str->ccl.last_block = str->flags & CODING_STATE_END;
2574 /* When applying ccl program to stream, MUST NOT set NULL
2576 ccl_driver(&str->ccl,
2578 ? (const unsigned char *)src
2579 : (const unsigned char *)""),
2580 dst, n, 0, CCL_MODE_DECODING);
2582 case CODESYS_ISO2022:
2583 decode_coding_iso2022(decoding, src, dst, n);
2591 DEFUN("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2592 Decode the text between START and END which is encoded in CODING-SYSTEM.
2593 This is useful if you've read in encoded text from a file without decoding
2594 it (e.g. you read in a JIS-formatted file but used the `binary' or
2595 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2596 Return length of decoded text.
2597 BUFFER defaults to the current buffer if unspecified.
2599 (start, end, coding_system, buffer))
2602 struct buffer *buf = decode_buffer(buffer, 0);
2603 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2604 lstream_t istr, ostr;
2605 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2607 get_buffer_range_char(buf, start, end, &b, &e, 0);
2609 barf_if_buffer_read_only(buf, b, e);
2611 coding_system = Fget_coding_system(coding_system);
2612 instream = make_lisp_buffer_input_stream(buf, b, e, 0);
2613 lb_outstream = make_lisp_buffer_output_stream(buf, b, 0);
2614 de_outstream = make_decoding_output_stream(XLSTREAM(lb_outstream),
2616 outstream = make_encoding_output_stream(XLSTREAM(de_outstream),
2617 Fget_coding_system(Qbinary));
2618 istr = XLSTREAM(instream);
2619 ostr = XLSTREAM(outstream);
2620 GCPRO4(instream, lb_outstream, de_outstream, outstream);
2622 /* The chain of streams looks like this:
2624 [BUFFER] <----- send through
2625 ------> [ENCODE AS BINARY]
2626 ------> [DECODE AS SPECIFIED]
2631 char tempbuf[1024]; /* some random amount */
2632 Bufpos newpos, even_newer_pos;
2633 Bufpos oldpos = lisp_buffer_stream_startpos(istr);
2634 Lstream_data_count size_in_bytes =
2635 Lstream_read(istr, tempbuf, sizeof(tempbuf));
2639 newpos = lisp_buffer_stream_startpos(istr);
2640 Lstream_write(ostr, tempbuf, size_in_bytes);
2641 even_newer_pos = lisp_buffer_stream_startpos(istr);
2642 buffer_delete_range(buf, even_newer_pos - (newpos - oldpos),
2645 Lstream_close(istr);
2646 Lstream_close(ostr);
2648 Lstream_delete(istr);
2649 Lstream_delete(ostr);
2650 Lstream_delete(XLSTREAM(de_outstream));
2651 Lstream_delete(XLSTREAM(lb_outstream));
2655 /************************************************************************/
2656 /* Converting to an external encoding ("encoding") */
2657 /************************************************************************/
2659 /* An encoding stream is an output stream. When you create the
2660 stream, you specify the coding system that governs the encoding
2661 and another stream that the resulting encoded data is to be
2662 sent to, and then start sending data to it. */
2664 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2666 typedef struct encoding_stream_s *encoding_stream_t;
2667 struct encoding_stream_s {
2668 /* Coding system that governs the conversion. */
2669 Lisp_Coding_System *codesys;
2671 /* Stream that we read the encoded data from or
2672 write the decoded data to. */
2673 lstream_t other_end;
2675 /* If we are reading, then we can return only a fixed amount of
2676 data, so if the conversion resulted in too much data, we store it
2677 here for retrieval the next time around. */
2678 unsigned_char_dynarr *runoff;
2680 /* FLAGS holds flags indicating the current state of the encoding.
2681 Some of these flags are dependent on the coding system. */
2684 /* CH holds a partially built-up character. Since we only deal
2685 with one- and two-byte characters at the moment, we only use
2686 this to store the first byte of a two-byte character. */
2689 /* Additional information used by the ISO2022 encoder. */
2691 /* CHARSET holds the character sets currently assigned to the G0
2692 through G3 registers. It is initialized from the array
2693 INITIAL_CHARSET in CODESYS. */
2694 Lisp_Object charset[4];
2696 /* Which registers are currently invoked into the left (GL) and
2697 right (GR) halves of the 8-bit encoding space? */
2698 int register_left, register_right;
2700 /* Whether we need to explicitly designate the charset in the
2701 G? register before using it. It is initialized from the
2702 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2703 unsigned char force_charset_on_output[4];
2705 /* Other state variables that need to be preserved across
2707 Lisp_Object current_charset;
2709 int current_char_boundary;
2712 /* Additional information (the state of the running CCL program)
2713 used by the CCL encoder. */
2714 struct ccl_program ccl;
2718 static Lstream_data_count
2719 encoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size);
2720 static Lstream_data_count
2721 encoding_writer(lstream_t stream,
2722 const unsigned char *data, Lstream_data_count size);
2723 static int encoding_rewinder(lstream_t stream);
2724 static int encoding_seekable_p(lstream_t stream);
2725 static int encoding_flusher(lstream_t stream);
2726 static int encoding_closer(lstream_t stream);
2728 static Lisp_Object encoding_marker(Lisp_Object stream);
2730 DEFINE_LSTREAM_IMPLEMENTATION("encoding", lstream_encoding,
2731 sizeof(struct encoding_stream_s));
2734 encoding_marker(Lisp_Object stream)
2736 lstream_t str = ENCODING_STREAM_DATA(XLSTREAM(stream))->other_end;
2737 Lisp_Object str_obj;
2739 /* We do not need to mark the coding systems or charsets stored
2740 within the stream because they are stored in a global list
2741 and automatically marked. */
2743 XSETLSTREAM(str_obj, str);
2744 mark_object(str_obj);
2745 if (str->imp->marker) {
2746 return str->imp->marker(str_obj);
2752 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2753 so we read data from the other end, encode it, and store it into DATA. */
2755 static Lstream_data_count
2756 encoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size)
2758 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2759 unsigned char *orig_data = data;
2760 Lstream_data_count read_size;
2761 int error_occurred = 0;
2763 /* We need to interface to mule_encode(), which expects to take some
2764 amount of data and store the result into a Dynarr. We have
2765 mule_encode() store into str->runoff, and take data from there
2768 /* We loop until we have enough data, reading chunks from the other
2769 end and encoding it. */
2771 /* Take data from the runoff if we can. Make sure to take at
2772 most SIZE bytes, and delete the data from the runoff. */
2773 if (Dynarr_length(str->runoff) > 0) {
2774 int chunk = min((int)size, Dynarr_length(str->runoff));
2775 memcpy(data, Dynarr_atp(str->runoff, 0), chunk);
2776 Dynarr_delete_many(str->runoff, 0, chunk);
2782 /* No more room for data */
2786 if (str->flags & CODING_STATE_END) {
2787 /* This means that on the previous iteration, we hit the
2788 EOF on the other end. We loop once more so that
2789 mule_encode() can output any final stuff it may be
2790 holding, or any "go back to a sane state" escape
2791 sequences. (This latter makes sense during
2796 /* Exhausted the runoff, so get some more. DATA at least SIZE
2797 bytes left of storage in it, so it's OK to read directly into
2798 it. (We'll be overwriting above, after we've encoded it into
2800 read_size = Lstream_read(str->other_end, data, size);
2801 if (read_size < 0) {
2805 if (read_size == 0) {
2806 /* There might be some more end data produced in the
2807 translation. See the comment above. */
2808 str->flags |= CODING_STATE_END;
2810 mule_encode(stream, data, str->runoff, read_size);
2813 if (data == orig_data) {
2814 return error_occurred ? -1 : 0;
2816 return data - orig_data;
2820 static Lstream_data_count
2821 encoding_writer(lstream_t stream, const unsigned char *data,
2822 Lstream_data_count size)
2824 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2825 Lstream_data_count retval;
2827 /* Encode all our data into the runoff, and then attempt to write
2828 it all out to the other end. Remove whatever chunk we succeeded
2830 mule_encode(stream, data, str->runoff, size);
2831 retval = Lstream_write(str->other_end, Dynarr_atp(str->runoff, 0),
2832 Dynarr_length(str->runoff));
2834 Dynarr_delete_many(str->runoff, 0, retval);
2836 /* Do NOT return retval. The return value indicates how much
2837 of the incoming data was written, not how many bytes were
2843 reset_encoding_stream(encoding_stream_t str)
2846 switch (CODING_SYSTEM_TYPE(str->codesys)) {
2847 case CODESYS_ISO2022: {
2850 for (i = 0; i < 4; i++) {
2851 str->iso2022.charset[i] =
2852 CODING_SYSTEM_ISO2022_INITIAL_CHARSET(
2854 str->iso2022.force_charset_on_output[i] =
2855 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT(
2858 str->iso2022.register_left = 0;
2859 str->iso2022.register_right = 1;
2860 str->iso2022.current_charset = Qnil;
2861 str->iso2022.current_half = 0;
2862 str->iso2022.current_char_boundary = 1;
2866 setup_ccl_program(&str->ccl,
2867 CODING_SYSTEM_CCL_ENCODE(str->codesys));
2870 /* list the rest of them lot explicitly */
2871 case CODESYS_AUTODETECT:
2872 case CODESYS_SHIFT_JIS:
2876 case CODESYS_NO_CONVERSION:
2877 #ifdef DEBUG_SXEMACS
2878 case CODESYS_INTERNAL:
2885 str->flags = str->ch = 0;
2889 encoding_rewinder(lstream_t stream)
2891 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2892 reset_encoding_stream(str);
2893 Dynarr_reset(str->runoff);
2894 return Lstream_rewind(str->other_end);
2898 encoding_seekable_p(lstream_t stream)
2900 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2901 return Lstream_seekable_p(str->other_end);
2905 encoding_flusher(lstream_t stream)
2907 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2908 return Lstream_flush(str->other_end);
2912 encoding_closer(lstream_t stream)
2914 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2915 if (stream->flags & LSTREAM_FL_WRITE) {
2916 str->flags |= CODING_STATE_END;
2917 encoding_writer(stream, 0, 0);
2919 Dynarr_free(str->runoff);
2920 return Lstream_close(str->other_end);
2924 encoding_stream_coding_system(lstream_t stream)
2926 Lisp_Object coding_system;
2927 encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2929 XSETCODING_SYSTEM(coding_system, str->codesys);
2930 return coding_system;
2934 set_encoding_stream_coding_system(lstream_t lstr, Lisp_Object codesys)
2936 Lisp_Coding_System *cs = XCODING_SYSTEM(codesys);
2937 encoding_stream_t str = ENCODING_STREAM_DATA(lstr);
2939 reset_encoding_stream(str);
2943 make_encoding_stream_1(lstream_t stream, Lisp_Object codesys, const char *mode)
2945 lstream_t lstr = Lstream_new(lstream_encoding, mode);
2946 encoding_stream_t str = ENCODING_STREAM_DATA(lstr);
2950 str->runoff = Dynarr_new(unsigned_char);
2951 str->other_end = stream;
2952 set_encoding_stream_coding_system(lstr, codesys);
2953 XSETLSTREAM(obj, lstr);
2958 make_encoding_input_stream(lstream_t stream, Lisp_Object codesys)
2960 return make_encoding_stream_1(stream, codesys, "r");
2964 make_encoding_output_stream(lstream_t stream, Lisp_Object codesys)
2966 return make_encoding_stream_1(stream, codesys, "w");
2969 /* Convert N bytes of internally-formatted data stored in SRC to an
2970 external format, according to the encoding stream ENCODING.
2971 Store the encoded data into DST. */
2974 mule_encode(lstream_t encoding, const Bufbyte * src,
2975 unsigned_char_dynarr * dst, Lstream_data_count n)
2977 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
2979 switch (CODING_SYSTEM_TYPE(str->codesys)) {
2980 #ifdef DEBUG_SXEMACS
2981 case CODESYS_INTERNAL:
2982 Dynarr_add_many(dst, src, n);
2985 case CODESYS_AUTODETECT:
2986 /* If we got this far and still haven't decided on the coding
2987 system, then do no conversion. */
2988 case CODESYS_NO_CONVERSION:
2989 encode_coding_no_conversion(encoding, src, dst, n);
2992 case CODESYS_SHIFT_JIS:
2993 encode_coding_sjis(encoding, src, dst, n);
2996 encode_coding_big5(encoding, src, dst, n);
2999 encode_coding_ucs4(encoding, src, dst, n);
3002 encode_coding_utf8(encoding, src, dst, n);
3005 str->ccl.last_block = str->flags & CODING_STATE_END;
3006 /* When applying ccl program to stream, MUST NOT set NULL
3008 ccl_driver(&str->ccl, ((src) ? src : (unsigned char *)""),
3009 dst, n, 0, CCL_MODE_ENCODING);
3011 case CODESYS_ISO2022:
3012 encode_coding_iso2022(encoding, src, dst, n);
3020 DEFUN("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
3021 Encode the text between START and END using CODING-SYSTEM.
3022 This will, for example, convert Japanese characters into stuff such as
3023 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
3024 text. BUFFER defaults to the current buffer if unspecified.
3026 (start, end, coding_system, buffer))
3029 struct buffer *buf = decode_buffer(buffer, 0);
3030 Lisp_Object instream, lb_outstream, de_outstream, outstream;
3031 lstream_t istr, ostr;
3032 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
3034 get_buffer_range_char(buf, start, end, &b, &e, 0);
3036 barf_if_buffer_read_only(buf, b, e);
3038 coding_system = Fget_coding_system(coding_system);
3039 instream = make_lisp_buffer_input_stream(buf, b, e, 0);
3040 lb_outstream = make_lisp_buffer_output_stream(buf, b, 0);
3041 de_outstream = make_decoding_output_stream(XLSTREAM(lb_outstream),
3042 Fget_coding_system(Qbinary));
3043 outstream = make_encoding_output_stream(XLSTREAM(de_outstream),
3045 istr = XLSTREAM(instream);
3046 ostr = XLSTREAM(outstream);
3047 GCPRO4(instream, outstream, de_outstream, lb_outstream);
3048 /* The chain of streams looks like this:
3050 [BUFFER] <----- send through
3051 ------> [ENCODE AS SPECIFIED]
3052 ------> [DECODE AS BINARY]
3056 char tempbuf[1024]; /* some random amount */
3057 Bufpos newpos, even_newer_pos;
3058 Bufpos oldpos = lisp_buffer_stream_startpos(istr);
3059 Lstream_data_count size_in_bytes =
3060 Lstream_read(istr, tempbuf, sizeof(tempbuf));
3064 newpos = lisp_buffer_stream_startpos(istr);
3065 Lstream_write(ostr, tempbuf, size_in_bytes);
3066 even_newer_pos = lisp_buffer_stream_startpos(istr);
3067 buffer_delete_range(buf, even_newer_pos - (newpos - oldpos),
3073 lisp_buffer_stream_startpos(XLSTREAM(instream)) - b;
3074 Lstream_close(istr);
3075 Lstream_close(ostr);
3077 Lstream_delete(istr);
3078 Lstream_delete(ostr);
3079 Lstream_delete(XLSTREAM(de_outstream));
3080 Lstream_delete(XLSTREAM(lb_outstream));
3081 return make_int(retlen);
3087 /************************************************************************/
3088 /* Shift-JIS methods */
3089 /************************************************************************/
3091 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3092 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3093 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3094 encoded by "position-code + 0x80". A character of JISX0208
3095 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3096 position-codes are divided and shifted so that it fit in the range
3099 --- CODE RANGE of Shift-JIS ---
3100 (character set) (range)
3102 JISX0201-Kana 0xA0 .. 0xDF
3103 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
3104 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3105 -------------------------------
3109 /* Is this the first byte of a Shift-JIS two-byte char? */
3111 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3112 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3114 /* Is this the second byte of a Shift-JIS two-byte char? */
3116 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3117 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3119 #define BYTE_SJIS_KATAKANA_P(c) \
3120 ((c) >= 0xA1 && (c) <= 0xDF)
3123 detect_coding_sjis(struct detection_state *st, const Extbyte * src,
3124 Lstream_data_count n)
3127 const unsigned char c = *(const unsigned char *)src++;
3128 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3130 if (st->shift_jis.in_second_byte) {
3131 st->shift_jis.in_second_byte = 0;
3134 } else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3135 st->shift_jis.in_second_byte = 1;
3137 return CODING_CATEGORY_SHIFT_JIS_MASK;
3140 /* Convert Shift-JIS data to internal format. */
3143 decode_coding_sjis(lstream_t decoding, const Extbyte * src,
3144 unsigned_char_dynarr * dst, Lstream_data_count n)
3146 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3147 unsigned int flags = str->flags;
3148 unsigned int ch = str->ch;
3149 eol_type_t eol_type = str->eol_type;
3152 const unsigned char c = *(const unsigned char *)src++;
3155 /* Previous character was first byte of Shift-JIS Kanji
3157 if (BYTE_SJIS_TWO_BYTE_2_P(c)) {
3158 unsigned char e1, e2;
3160 Dynarr_add(dst, LEADING_BYTE_JAPANESE_JISX0208);
3161 DECODE_SJIS(ch, c, e1, e2);
3162 Dynarr_add(dst, e1);
3163 Dynarr_add(dst, e2);
3165 DECODE_ADD_BINARY_CHAR(ch, dst);
3166 DECODE_ADD_BINARY_CHAR(c, dst);
3170 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3171 if (BYTE_SJIS_TWO_BYTE_1_P(c))
3173 else if (BYTE_SJIS_KATAKANA_P(c)) {
3174 Dynarr_add(dst, LEADING_BYTE_KATAKANA_JISX0201);
3177 DECODE_ADD_BINARY_CHAR(c, dst);
3179 label_continue_loop:;
3182 DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
3188 /* Convert internally-formatted data to Shift-JIS. */
3191 encode_coding_sjis(lstream_t encoding, const Bufbyte * src,
3192 unsigned_char_dynarr * dst, Lstream_data_count n)
3194 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3195 unsigned int flags = str->flags;
3196 unsigned int ch = str->ch;
3197 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3202 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3203 Dynarr_add(dst, '\r');
3204 if (eol_type != EOL_CR)
3205 Dynarr_add(dst, '\n');
3207 } else if (BYTE_ASCII_P(c)) {
3210 } else if (BUFBYTE_LEADING_BYTE_P(c))
3211 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
3212 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3213 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
3215 if (ch == LEADING_BYTE_KATAKANA_JISX0201) {
3218 } else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3219 ch == LEADING_BYTE_JAPANESE_JISX0208)
3222 /* j1 is bessel j1 function,
3223 * so we use something else */
3224 /* unsigned char j1, j2; */
3225 unsigned char tt1, tt2;
3227 ENCODE_SJIS(ch, c, tt1, tt2);
3228 Dynarr_add(dst, tt1);
3229 Dynarr_add(dst, tt2);
3239 DEFUN("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3240 Decode a JISX0208 character of Shift-JIS coding-system.
3241 CODE is the character code in Shift-JIS as a cons of type bytes.
3242 Return the corresponding character.
3246 unsigned char c1, c2, s1, s2;
3249 CHECK_INT(XCAR(code));
3250 CHECK_INT(XCDR(code));
3251 s1 = XINT(XCAR(code));
3252 s2 = XINT(XCDR(code));
3253 if (BYTE_SJIS_TWO_BYTE_1_P(s1) && BYTE_SJIS_TWO_BYTE_2_P(s2)) {
3254 DECODE_SJIS(s1, s2, c1, c2);
3255 return make_char(MAKE_CHAR(Vcharset_japanese_jisx0208,
3256 c1 & 0x7F, c2 & 0x7F));
3261 DEFUN("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3262 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3263 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3267 Lisp_Object charset;
3270 CHECK_CHAR_COERCE_INT(character);
3271 BREAKUP_CHAR(XCHAR(character), charset, c1, c2);
3272 if (EQ(charset, Vcharset_japanese_jisx0208)) {
3273 ENCODE_SJIS(c1 | 0x80, c2 | 0x80, s1, s2);
3274 return Fcons(make_int(s1), make_int(s2));
3279 /************************************************************************/
3281 /************************************************************************/
3283 /* BIG5 is a coding system encoding two character sets: ASCII and
3284 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3285 character set and is encoded in two-byte.
3287 --- CODE RANGE of BIG5 ---
3288 (character set) (range)
3290 Big5 (1st byte) 0xA1 .. 0xFE
3291 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3292 --------------------------
3294 Since the number of characters in Big5 is larger than maximum
3295 characters in Emacs' charset (96x96), it can't be handled as one
3296 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3297 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3298 contains frequently used characters and the latter contains less
3299 frequently used characters. */
3301 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3302 ((c) >= 0xA1 && (c) <= 0xFE)
3304 /* Is this the second byte of a Shift-JIS two-byte char? */
3306 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3307 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3309 /* Number of Big5 characters which have the same code in 1st byte. */
3311 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3313 /* Code conversion macros. These are macros because they are used in
3314 inner loops during code conversion.
3316 Note that temporary variables in macros introduce the classic
3317 dynamic-scoping problems with variable names. We use capital-
3318 lettered variables in the assumption that SXEmacs does not use
3319 capital letters in variables except in a very formalized way
3322 /* Convert Big5 code (b1, b2) into its internal string representation
3325 /* There is a much simpler way to split the Big5 charset into two.
3326 For the moment I'm going to leave the algorithm as-is because it
3327 claims to separate out the most-used characters into a single
3328 charset, which perhaps will lead to optimizations in various
3331 The way the algorithm works is something like this:
3333 Big5 can be viewed as a 94x157 charset, where the row is
3334 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3335 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3336 the split between low and high column numbers is apparently
3337 meaningless; ascending rows produce less and less frequent chars.
3338 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3339 the first charset, and the upper half (0xC9 .. 0xFE) to the
3340 second. To do the conversion, we convert the character into
3341 a single number where 0 .. 156 is the first row, 157 .. 313
3342 is the second, etc. That way, the characters are ordered by
3343 decreasing frequency. Then we just chop the space in two
3344 and coerce the result into a 94x94 space.
3347 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3349 int B1 = b1, B2 = b2; \
3351 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3355 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3359 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3360 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3362 c1 = I / (0xFF - 0xA1) + 0xA1; \
3363 c2 = I % (0xFF - 0xA1) + 0xA1; \
3366 /* Convert the internal string representation of a Big5 character
3367 (lb, c1, c2) into Big5 code (b1, b2). */
3369 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3371 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3373 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3375 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3377 b1 = I / BIG5_SAME_ROW + 0xA1; \
3378 b2 = I % BIG5_SAME_ROW; \
3379 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3383 detect_coding_big5(struct detection_state *st, const Extbyte * src,
3384 Lstream_data_count n)
3387 const unsigned char c = *(const unsigned char *)src++;
3388 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3389 (c >= 0x80 && c <= 0xA0))
3391 if (st->big5.in_second_byte) {
3392 st->big5.in_second_byte = 0;
3393 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3395 } else if (c >= 0xA1)
3396 st->big5.in_second_byte = 1;
3398 return CODING_CATEGORY_BIG5_MASK;
3401 /* Convert Big5 data to internal format. */
3404 decode_coding_big5(lstream_t decoding, const Extbyte * src,
3405 unsigned_char_dynarr * dst, Lstream_data_count n)
3407 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3408 unsigned int flags = str->flags;
3409 unsigned int ch = str->ch;
3410 eol_type_t eol_type = str->eol_type;
3413 const unsigned char c = *(const unsigned char *)src++;
3415 /* Previous character was first byte of Big5 char. */
3416 if (BYTE_BIG5_TWO_BYTE_2_P(c)) {
3417 unsigned char b1, b2, b3;
3418 DECODE_BIG5(ch, c, b1, b2, b3);
3419 Dynarr_add(dst, b1);
3420 Dynarr_add(dst, b2);
3421 Dynarr_add(dst, b3);
3423 DECODE_ADD_BINARY_CHAR(ch, dst);
3424 DECODE_ADD_BINARY_CHAR(c, dst);
3428 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3429 if (BYTE_BIG5_TWO_BYTE_1_P(c))
3432 DECODE_ADD_BINARY_CHAR(c, dst);
3434 label_continue_loop:;
3437 DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
3443 /* Convert internally-formatted data to Big5. */
3446 encode_coding_big5(lstream_t encoding, const Bufbyte * src,
3447 unsigned_char_dynarr * dst, Lstream_data_count n)
3450 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3451 unsigned int flags = str->flags;
3452 unsigned int ch = str->ch;
3453 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3458 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3459 Dynarr_add(dst, '\r');
3460 if (eol_type != EOL_CR)
3461 Dynarr_add(dst, '\n');
3462 } else if (BYTE_ASCII_P(c)) {
3465 } else if (BUFBYTE_LEADING_BYTE_P(c)) {
3466 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3467 c == LEADING_BYTE_CHINESE_BIG5_2) {
3468 /* A recognized leading byte. */
3470 continue; /* not done with this character. */
3472 /* otherwise just ignore this character. */
3473 } else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3474 ch == LEADING_BYTE_CHINESE_BIG5_2) {
3475 /* Previous char was a recognized leading byte. */
3477 continue; /* not done with this character. */
3479 /* Encountering second byte of a Big5 character. */
3480 unsigned char b1, b2;
3482 ENCODE_BIG5(ch >> 8, ch & 0xFF, c, b1, b2);
3483 Dynarr_add(dst, b1);
3484 Dynarr_add(dst, b2);
3494 DEFUN("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3495 Decode a Big5 character CODE of BIG5 coding-system.
3496 CODE is the character code in BIG5, a cons of two integers.
3497 Return the corresponding character.
3501 unsigned char c1, c2, b1, b2;
3504 CHECK_INT(XCAR(code));
3505 CHECK_INT(XCDR(code));
3506 b1 = XINT(XCAR(code));
3507 b2 = XINT(XCDR(code));
3508 if (BYTE_BIG5_TWO_BYTE_1_P(b1) && BYTE_BIG5_TWO_BYTE_2_P(b2)) {
3510 Lisp_Object charset;
3511 DECODE_BIG5(b1, b2, leading_byte, c1, c2);
3512 charset = CHARSET_BY_LEADING_BYTE(leading_byte);
3513 return make_char(MAKE_CHAR(charset, c1 & 0x7F, c2 & 0x7F));
3518 DEFUN("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3519 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3520 Return the corresponding character code in Big5.
3524 Lisp_Object charset;
3527 CHECK_CHAR_COERCE_INT(character);
3528 BREAKUP_CHAR(XCHAR(character), charset, c1, c2);
3529 if (EQ(charset, Vcharset_chinese_big5_1) ||
3530 EQ(charset, Vcharset_chinese_big5_2)) {
3531 ENCODE_BIG5(XCHARSET_LEADING_BYTE(charset), c1 | 0x80,
3533 return Fcons(make_int(b1), make_int(b2));
3538 /************************************************************************/
3541 /* UCS-4 character codes are implemented as nonnegative integers. */
3543 /************************************************************************/
3545 DEFUN("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3546 Map UCS-4 code CODE to Mule character CHARACTER.
3548 Return T on success, NIL on failure.
3554 CHECK_CHAR(character);
3558 if (c < countof(fcd->ucs_to_mule_table)) {
3559 fcd->ucs_to_mule_table[c] = character;
3565 static Lisp_Object ucs_to_char(unsigned long code)
3567 if (code < countof(fcd->ucs_to_mule_table)) {
3568 return fcd->ucs_to_mule_table[code];
3569 } else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14)) {
3573 c = code % (94 * 94);
3575 (MAKE_CHAR(CHARSET_BY_ATTRIBUTES
3576 (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3577 CHARSET_LEFT_TO_RIGHT),
3578 c / 94 + 33, c % 94 + 33));
3583 DEFUN("ucs-char", Fucs_char, 1, 1, 0, /*
3584 Return Mule character corresponding to UCS code CODE (a positive integer).
3589 return ucs_to_char(XINT(code));
3592 DEFUN("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3593 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3597 /* #### Isn't this gilding the lily? Fput_char_table checks its args.
3598 Fset_char_ucs is more restrictive on index arg, but should
3599 check code arg in a char_table method. */
3600 CHECK_CHAR(character);
3602 return Fput_char_table(character, code, mule_to_ucs_table);
3605 DEFUN("char-ucs", Fchar_ucs, 1, 1, 0, /*
3606 Return the UCS code (a positive integer) corresponding to CHARACTER.
3610 return Fget_char_table(character, mule_to_ucs_table);
3613 /* Decode a UCS-4 character into a buffer. If the lookup fails, use
3614 <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3615 is not found, instead.
3616 #### do something more appropriate (use blob?)
3617 Danger, Will Robinson! Data loss. Should we signal user? */
3618 static void decode_ucs4(unsigned long ch, unsigned_char_dynarr * dst)
3620 Lisp_Object chr = ucs_to_char(ch);
3623 Bufbyte work[MAX_EMCHAR_LEN];
3628 simple_set_charptr_emchar(work, ch) :
3629 non_ascii_set_charptr_emchar(work, ch);
3630 Dynarr_add_many(dst, work, len);
3632 Dynarr_add(dst, LEADING_BYTE_JAPANESE_JISX0208);
3633 Dynarr_add(dst, 34 + 128);
3634 Dynarr_add(dst, 46 + 128);
3638 static unsigned long
3639 mule_char_to_ucs4(Lisp_Object charset, unsigned char h, unsigned char l)
3642 = Fget_char_table(make_char(MAKE_CHAR(charset, h & 127, l & 127)),
3647 } else if ((XCHARSET_DIMENSION(charset) == 2) &&
3648 (XCHARSET_CHARS(charset) == 94)) {
3649 unsigned char final = XCHARSET_FINAL(charset);
3651 if (('@' <= final) && (final < 0x7f)) {
3652 return 0xe00000 + (final - '@') * 94 * 94
3653 + ((h & 127) - 33) * 94 + (l & 127) - 33;
3663 encode_ucs4(Lisp_Object charset,
3664 unsigned char h, unsigned char l, unsigned_char_dynarr * dst)
3666 unsigned long code = mule_char_to_ucs4(charset, h, l);
3667 Dynarr_add(dst, code >> 24);
3668 Dynarr_add(dst, (code >> 16) & 255);
3669 Dynarr_add(dst, (code >> 8) & 255);
3670 Dynarr_add(dst, code & 255);
3674 detect_coding_ucs4(struct detection_state *st, const Extbyte * src,
3675 Lstream_data_count n)
3678 const unsigned char c = *(const unsigned char *)src++;
3679 switch (st->ucs4.in_byte) {
3687 st->ucs4.in_byte = 0;
3693 return CODING_CATEGORY_UCS4_MASK;
3697 decode_coding_ucs4(lstream_t decoding, const Extbyte * src,
3698 unsigned_char_dynarr * dst, Lstream_data_count n)
3700 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3701 unsigned int flags = str->flags;
3702 unsigned int ch = str->ch;
3703 unsigned char counter = str->counter;
3706 const unsigned char c = *(const unsigned char *)src++;
3713 decode_ucs4((ch << 8) | c, dst);
3722 if (counter & CODING_STATE_END)
3723 DECODE_OUTPUT_PARTIAL_CHAR(ch);
3727 str->counter = counter;
3731 encode_coding_ucs4(lstream_t encoding, const Bufbyte * src,
3732 unsigned_char_dynarr * dst, Lstream_data_count n)
3734 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3735 unsigned int flags = str->flags;
3736 unsigned int ch = str->ch;
3737 unsigned char char_boundary = str->iso2022.current_char_boundary;
3738 Lisp_Object charset = str->iso2022.current_charset;
3740 #ifdef ENABLE_COMPOSITE_CHARS
3741 /* flags for handling composite chars. We do a little switcharoo
3742 on the source while we're outputting the composite char. */
3743 unsigned int saved_n = 0;
3744 const unsigned char *saved_src = NULL;
3745 int in_composite = 0;
3751 unsigned char c = *src++;
3753 if (BYTE_ASCII_P(c)) { /* Processing ASCII character */
3755 encode_ucs4(Vcharset_ascii, c, 0, dst);
3757 } else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) { /* Processing Leading Byte */
3759 charset = CHARSET_BY_LEADING_BYTE(c);
3760 if (LEADING_BYTE_PREFIX_P(c))
3763 } else { /* Processing Non-ASCII character */
3765 if (EQ(charset, Vcharset_control_1)) {
3766 encode_ucs4(Vcharset_control_1, c, 0, dst);
3768 switch (XCHARSET_REP_BYTES(charset)) {
3770 encode_ucs4(charset, c, 0, dst);
3773 if (XCHARSET_PRIVATE_P(charset)) {
3774 encode_ucs4(charset, c, 0, dst);
3777 #ifdef ENABLE_COMPOSITE_CHARS
3780 Vcharset_composite)) {
3782 /* #### Bother! We don't know how to
3795 (Vcharset_composite,
3800 composite_char_string
3809 n = XSTRING_LENGTH(lstr);
3812 #endif /* ENABLE_COMPOSITE_CHARS */
3814 encode_ucs4(charset, ch,
3825 encode_ucs4(charset, ch, c,
3840 #ifdef ENABLE_COMPOSITE_CHARS
3845 goto back_to_square_n; /* Wheeeeeeeee ..... */
3847 #endif /* ENABLE_COMPOSITE_CHARS */
3851 str->iso2022.current_char_boundary = char_boundary;
3852 str->iso2022.current_charset = charset;
3854 /* Verbum caro factum est! */
3857 /************************************************************************/
3859 /************************************************************************/
3862 detect_coding_utf8(struct detection_state *st, const Extbyte * src,
3863 Lstream_data_count n)
3866 const unsigned char c = *(const unsigned char *)src++;
3867 switch (st->utf8.in_byte) {
3869 if (c == ISO_CODE_ESC || c == ISO_CODE_SI
3870 || c == ISO_CODE_SO)
3873 st->utf8.in_byte = 5;
3875 st->utf8.in_byte = 4;
3877 st->utf8.in_byte = 3;
3879 st->utf8.in_byte = 2;
3881 st->utf8.in_byte = 1;
3886 if ((c & 0xc0) != 0x80)
3892 return CODING_CATEGORY_UTF8_MASK;
3896 decode_coding_utf8(lstream_t decoding, const Extbyte * src,
3897 unsigned_char_dynarr * dst, Lstream_data_count n)
3899 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3900 unsigned int flags = str->flags;
3901 unsigned int ch = str->ch;
3902 eol_type_t eol_type = str->eol_type;
3903 unsigned char counter = str->counter;
3906 const unsigned char c = *(const unsigned char *)src++;
3912 } else if (c >= 0xf8) {
3915 } else if (c >= 0xf0) {
3918 } else if (c >= 0xe0) {
3921 } else if (c >= 0xc0) {
3925 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3926 decode_ucs4(c, dst);
3930 ch = (ch << 6) | (c & 0x3f);
3931 decode_ucs4(ch, dst);
3936 ch = (ch << 6) | (c & 0x3f);
3939 label_continue_loop:;
3942 if (flags & CODING_STATE_END)
3943 DECODE_OUTPUT_PARTIAL_CHAR(ch);
3947 str->counter = counter;
3951 encode_utf8(Lisp_Object charset,
3952 unsigned char h, unsigned char l, unsigned_char_dynarr * dst)
3954 unsigned long code = mule_char_to_ucs4(charset, h, l);
3956 Dynarr_add(dst, code);
3957 } else if (code <= 0x7ff) {
3958 Dynarr_add(dst, (code >> 6) | 0xc0);
3959 Dynarr_add(dst, (code & 0x3f) | 0x80);
3960 } else if (code <= 0xffff) {
3961 Dynarr_add(dst, (code >> 12) | 0xe0);
3962 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3963 Dynarr_add(dst, (code & 0x3f) | 0x80);
3964 } else if (code <= 0x1fffff) {
3965 Dynarr_add(dst, (code >> 18) | 0xf0);
3966 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3967 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3968 Dynarr_add(dst, (code & 0x3f) | 0x80);
3969 } else if (code <= 0x3ffffff) {
3970 Dynarr_add(dst, (code >> 24) | 0xf8);
3971 Dynarr_add(dst, ((code >> 18) & 0x3f) | 0x80);
3972 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3973 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3974 Dynarr_add(dst, (code & 0x3f) | 0x80);
3976 Dynarr_add(dst, (code >> 30) | 0xfc);
3977 Dynarr_add(dst, ((code >> 24) & 0x3f) | 0x80);
3978 Dynarr_add(dst, ((code >> 18) & 0x3f) | 0x80);
3979 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3980 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3981 Dynarr_add(dst, (code & 0x3f) | 0x80);
3986 encode_coding_utf8(lstream_t encoding, const Bufbyte * src,
3987 unsigned_char_dynarr * dst, Lstream_data_count n)
3989 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3990 unsigned int flags = str->flags;
3991 unsigned int ch = str->ch;
3992 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3993 unsigned char char_boundary = str->iso2022.current_char_boundary;
3994 Lisp_Object charset = str->iso2022.current_charset;
3996 #ifdef ENABLE_COMPOSITE_CHARS
3997 /* flags for handling composite chars. We do a little switcharoo
3998 on the source while we're outputting the composite char. */
3999 unsigned int saved_n = 0;
4000 const unsigned char *saved_src = NULL;
4001 int in_composite = 0;
4004 #endif /* ENABLE_COMPOSITE_CHARS */
4007 unsigned char c = *src++;
4009 if (BYTE_ASCII_P(c)) { /* Processing ASCII character */
4012 if (eol_type != EOL_LF
4013 && eol_type != EOL_AUTODETECT)
4014 Dynarr_add(dst, '\r');
4015 if (eol_type != EOL_CR)
4018 encode_utf8(Vcharset_ascii, c, 0, dst);
4020 } else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) { /* Processing Leading Byte */
4022 charset = CHARSET_BY_LEADING_BYTE(c);
4023 if (LEADING_BYTE_PREFIX_P(c))
4026 } else { /* Processing Non-ASCII character */
4028 if (EQ(charset, Vcharset_control_1)) {
4029 encode_utf8(Vcharset_control_1, c, 0, dst);
4031 switch (XCHARSET_REP_BYTES(charset)) {
4033 encode_utf8(charset, c, 0, dst);
4036 if (XCHARSET_PRIVATE_P(charset)) {
4037 encode_utf8(charset, c, 0, dst);
4040 #ifdef ENABLE_COMPOSITE_CHARS
4043 Vcharset_composite)) {
4045 /* #### Bother! We don't know how to
4054 (Vcharset_composite,
4059 composite_char_string
4068 n = XSTRING_LENGTH(lstr);
4071 #endif /* ENABLE_COMPOSITE_CHARS */
4073 encode_utf8(charset, ch,
4084 encode_utf8(charset, ch, c,
4099 #ifdef ENABLE_COMPOSITE_CHARS
4104 goto back_to_square_n; /* Wheeeeeeeee ..... */
4110 str->iso2022.current_char_boundary = char_boundary;
4111 str->iso2022.current_charset = charset;
4113 /* Verbum caro factum est! */
4116 /************************************************************************/
4117 /* ISO2022 methods */
4118 /************************************************************************/
4120 /* The following note describes the coding system ISO2022 briefly.
4121 Since the intention of this note is to help understand the
4122 functions in this file, some parts are NOT ACCURATE or OVERLY
4123 SIMPLIFIED. For thorough understanding, please refer to the
4124 original document of ISO2022.
4126 ISO2022 provides many mechanisms to encode several character sets
4127 in 7-bit and 8-bit environments. For 7-bit environments, all text
4128 is encoded using bytes less than 128. This may make the encoded
4129 text a little bit longer, but the text passes more easily through
4130 several gateways, some of which strip off MSB (Most Signigant Bit).
4132 There are two kinds of character sets: control character set and
4133 graphic character set. The former contains control characters such
4134 as `newline' and `escape' to provide control functions (control
4135 functions are also provided by escape sequences). The latter
4136 contains graphic characters such as 'A' and '-'. Emacs recognizes
4137 two control character sets and many graphic character sets.
4139 Graphic character sets are classified into one of the following
4140 four classes, according to the number of bytes (DIMENSION) and
4141 number of characters in one dimension (CHARS) of the set:
4142 - DIMENSION1_CHARS94
4143 - DIMENSION1_CHARS96
4144 - DIMENSION2_CHARS94
4145 - DIMENSION2_CHARS96
4147 In addition, each character set is assigned an identification tag,
4148 unique for each set, called "final character" (denoted as <F>
4149 hereafter). The <F> of each character set is decided by ECMA(*)
4150 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4151 (0x30..0x3F are for private use only).
4153 Note (*): ECMA = European Computer Manufacturers Association
4155 Here are examples of graphic character set [NAME(<F>)]:
4156 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4157 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4158 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4159 o DIMENSION2_CHARS96 -- none for the moment
4161 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4162 C0 [0x00..0x1F] -- control character plane 0
4163 GL [0x20..0x7F] -- graphic character plane 0
4164 C1 [0x80..0x9F] -- control character plane 1
4165 GR [0xA0..0xFF] -- graphic character plane 1
4167 A control character set is directly designated and invoked to C0 or
4168 C1 by an escape sequence. The most common case is that:
4169 - ISO646's control character set is designated/invoked to C0, and
4170 - ISO6429's control character set is designated/invoked to C1,
4171 and usually these designations/invocations are omitted in encoded
4172 text. In a 7-bit environment, only C0 can be used, and a control
4173 character for C1 is encoded by an appropriate escape sequence to
4174 fit into the environment. All control characters for C1 are
4175 defined to have corresponding escape sequences.
4177 A graphic character set is at first designated to one of four
4178 graphic registers (G0 through G3), then these graphic registers are
4179 invoked to GL or GR. These designations and invocations can be
4180 done independently. The most common case is that G0 is invoked to
4181 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4182 these invocations and designations are omitted in encoded text.
4183 In a 7-bit environment, only GL can be used.
4185 When a graphic character set of CHARS94 is invoked to GL, codes
4186 0x20 and 0x7F of the GL area work as control characters SPACE and
4187 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4190 There are two ways of invocation: locking-shift and single-shift.
4191 With locking-shift, the invocation lasts until the next different
4192 invocation, whereas with single-shift, the invocation affects the
4193 following character only and doesn't affect the locking-shift
4194 state. Invocations are done by the following control characters or
4197 ----------------------------------------------------------------------
4198 abbrev function cntrl escape seq description
4199 ----------------------------------------------------------------------
4200 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4201 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4202 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4203 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4204 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4205 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4206 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4207 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4208 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4209 ----------------------------------------------------------------------
4210 (*) These are not used by any known coding system.
4212 Control characters for these functions are defined by macros
4213 ISO_CODE_XXX in `coding.h'.
4215 Designations are done by the following escape sequences:
4216 ----------------------------------------------------------------------
4217 escape sequence description
4218 ----------------------------------------------------------------------
4219 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4220 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4221 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4222 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4223 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4224 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4225 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4226 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4227 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4228 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4229 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4230 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4231 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4232 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4233 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4234 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4235 ----------------------------------------------------------------------
4237 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4238 of dimension 1, chars 94, and final character <F>, etc...
4240 Note (*): Although these designations are not allowed in ISO2022,
4241 Emacs accepts them on decoding, and produces them on encoding
4242 CHARS96 character sets in a coding system which is characterized as
4243 7-bit environment, non-locking-shift, and non-single-shift.
4245 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4246 '(' can be omitted. We refer to this as "short-form" hereafter.
4248 Now you may notice that there are a lot of ways for encoding the
4249 same multilingual text in ISO2022. Actually, there exist many
4250 coding systems such as Compound Text (used in X11's inter client
4251 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4252 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4253 localized platforms), and all of these are variants of ISO2022.
4255 In addition to the above, Emacs handles two more kinds of escape
4256 sequences: ISO6429's direction specification and Emacs' private
4257 sequence for specifying character composition.
4259 ISO6429's direction specification takes the following form:
4260 o CSI ']' -- end of the current direction
4261 o CSI '0' ']' -- end of the current direction
4262 o CSI '1' ']' -- start of left-to-right text
4263 o CSI '2' ']' -- start of right-to-left text
4264 The control character CSI (0x9B: control sequence introducer) is
4265 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4267 Character composition specification takes the following form:
4268 o ESC '0' -- start character composition
4269 o ESC '1' -- end character composition
4270 Since these are not standard escape sequences of any ISO standard,
4271 their use with these meanings is restricted to Emacs only. */
4274 reset_iso2022(Lisp_Object coding_system, struct iso2022_decoder *iso)
4278 for (i = 0; i < 4; i++) {
4279 if (!NILP(coding_system))
4281 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET
4284 iso->charset[i] = Qt;
4285 iso->invalid_designated[i] = 0;
4287 iso->esc = ISO_ESC_NOTHING;
4288 iso->esc_bytes_index = 0;
4289 iso->register_left = 0;
4290 iso->register_right = 1;
4291 iso->switched_dir_and_no_valid_charset_yet = 0;
4292 iso->invalid_switch_dir = 0;
4293 iso->output_direction_sequence = 0;
4294 iso->output_literally = 0;
4295 #ifdef ENABLE_COMPOSITE_CHARS
4296 if (iso->composite_chars)
4297 Dynarr_reset(iso->composite_chars);
4301 static int fit_to_be_escape_quoted(unsigned char c)
4317 /* Parse one byte of an ISO2022 escape sequence.
4318 If the result is an invalid escape sequence, return 0 and
4319 do not change anything in STR. Otherwise, if the result is
4320 an incomplete escape sequence, update ISO2022.ESC and
4321 ISO2022.ESC_BYTES and return -1. Otherwise, update
4322 all the state variables (but not ISO2022.ESC_BYTES) and
4325 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4326 or invocation of an invalid character set and treat that as
4327 an unrecognized escape sequence.
4329 ********************************************************************
4331 #### Strategies for error annotation and coding orthogonalization
4333 We really want to separate out a number of things. Conceptually,
4334 there is a nested syntax.
4336 At the top level is the ISO 2022 extension syntax, including charset
4337 designation and invocation, and certain auxiliary controls such as the
4338 ISO 6429 direction specification. These are octet-oriented, with the
4339 single exception (AFAIK) of the "exit Unicode" sequence which uses the
4340 UTF's natural width (1 byte for UTF-7 and UTF-8, 2 bytes for UCS-2 and
4341 UTF-16, and 4 bytes for UCS-4 and UTF-32). This will be treated as a
4342 (deprecated) special case in Unicode processing.
4344 The middle layer is ISO 2022 character interpretation. This will depend
4345 on the current state of the ISO 2022 registers, and assembles octets
4346 into the character's internal representation.
4348 The lowest level is translating system control conventions. At present
4349 this is restricted to newline translation, but one could imagine doing
4350 tab conversion or line wrapping here. "Escape from Unicode" processing
4351 would be done at this level.
4353 At each level the parser will verify the syntax. In the case of a
4354 syntax error or warning (such as a redundant escape sequence that affects
4355 no characters), the parser will take some action, typically inserting the
4356 erroneous octets directly into the output and creating an annotation
4357 which can be used by higher level I/O to mark the affected region.
4359 This should make it possible to do something sensible about separating
4360 newline convention processing from character construction, and about
4361 preventing ISO 2022 escape sequences from being recognized
4364 The basic strategy will be to have octet classification tables, and
4365 switch processing according to the table entry.
4367 It's possible that, by doing the processing with tables of functions or
4368 the like, the parser can be used for both detection and translation. */
4371 parse_iso2022_esc(Lisp_Object codesys, struct iso2022_decoder *iso,
4372 unsigned char c, unsigned int *flags,
4373 int check_invalid_charsets)
4375 /* (1) If we're at the end of a designation sequence, CS is the
4376 charset being designated and REG is the register to designate
4379 (2) If we're at the end of a locking-shift sequence, REG is
4380 the register to invoke and HALF (0 == left, 1 == right) is
4381 the half to invoke it into.
4383 (3) If we're at the end of a single-shift sequence, REG is
4384 the register to invoke. */
4385 Lisp_Object cs = Qnil;
4388 /* NOTE: This code does goto's all over the fucking place.
4389 The reason for this is that we're basically implementing
4390 a state machine here, and hierarchical languages like C
4391 don't really provide a clean way of doing this. */
4393 if (!(*flags & CODING_STATE_ESCAPE))
4394 /* At beginning of escape sequence; we need to reset our
4395 escape-state variables. */
4396 iso->esc = ISO_ESC_NOTHING;
4398 iso->output_literally = 0;
4399 iso->output_direction_sequence = 0;
4402 case ISO_ESC_NOTHING:
4403 iso->esc_bytes_index = 0;
4405 case ISO_CODE_ESC: /* Start escape sequence */
4406 *flags |= CODING_STATE_ESCAPE;
4410 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4411 *flags |= CODING_STATE_ESCAPE;
4412 iso->esc = ISO_ESC_5_11;
4415 case ISO_CODE_SO: /* locking shift 1 */
4419 case ISO_CODE_SI: /* locking shift 0 */
4424 case ISO_CODE_SS2: /* single shift */
4427 case ISO_CODE_SS3: /* single shift */
4431 default: /* Other control characters */
4437 /**** single shift ****/
4439 case 'N': /* single shift 2 */
4442 case 'O': /* single shift 3 */
4446 /**** locking shift ****/
4448 case '~': /* locking shift 1 right */
4452 case 'n': /* locking shift 2 */
4456 case '}': /* locking shift 2 right */
4460 case 'o': /* locking shift 3 */
4464 case '|': /* locking shift 3 right */
4469 #ifdef ENABLE_COMPOSITE_CHARS
4470 /**** composite ****/
4473 iso->esc = ISO_ESC_START_COMPOSITE;
4474 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4475 CODING_STATE_COMPOSITE;
4479 iso->esc = ISO_ESC_END_COMPOSITE;
4480 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4481 ~CODING_STATE_COMPOSITE;
4483 #endif /* ENABLE_COMPOSITE_CHARS */
4485 /**** directionality ****/
4488 iso->esc = ISO_ESC_5_11;
4491 /**** designation ****/
4493 case '$': /* multibyte charset prefix */
4494 iso->esc = ISO_ESC_2_4;
4498 if (0x28 <= c && c <= 0x2F) {
4500 (enum iso_esc_flag)(c - 0x28 + ISO_ESC_2_8);
4504 /* This function is called with CODESYS equal to nil when
4505 doing coding-system detection. */
4507 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
4508 && fit_to_be_escape_quoted(c)) {
4509 iso->esc = ISO_ESC_LITERAL;
4510 *flags &= CODING_STATE_ISO2022_LOCK;
4518 /**** directionality ****/
4520 case ISO_ESC_5_11: /* ISO6429 direction control */
4523 (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4524 goto directionality;
4527 iso->esc = ISO_ESC_5_11_0;
4529 iso->esc = ISO_ESC_5_11_1;
4531 iso->esc = ISO_ESC_5_11_2;
4536 case ISO_ESC_5_11_0:
4539 (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4540 goto directionality;
4544 case ISO_ESC_5_11_1:
4547 (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4548 goto directionality;
4552 case ISO_ESC_5_11_2:
4555 (*flags & CODING_STATE_ISO2022_LOCK) |
4557 goto directionality;
4562 iso->esc = ISO_ESC_DIRECTIONALITY;
4563 /* Various junk here to attempt to preserve the direction
4564 sequences literally in the text if they would otherwise be
4565 swallowed due to invalid designations that don't show up as
4566 actual charset changes in the text. */
4567 if (iso->invalid_switch_dir) {
4568 /* We already inserted a direction switch literally into
4569 the text. We assume (#### this may not be right)
4570 that the next direction switch is the one going the
4571 other way, and we need to output that literally as
4573 iso->output_literally = 1;
4574 iso->invalid_switch_dir = 0;
4578 /* If we are in the thrall of an invalid designation,
4579 then stick the directionality sequence literally into
4580 the output stream so it ends up in the original text
4582 for (jj = 0; jj < 4; jj++)
4583 if (iso->invalid_designated[jj])
4586 iso->output_literally = 1;
4587 iso->invalid_switch_dir = 1;
4589 /* Indicate that we haven't yet seen a valid
4590 designation, so that if a switch-dir is
4591 directly followed by an invalid designation,
4592 both get inserted literally. */
4593 iso->switched_dir_and_no_valid_charset_yet = 1;
4597 /**** designation ****/
4600 if (0x28 <= c && c <= 0x2F) {
4602 (enum iso_esc_flag)(c - 0x28 + ISO_ESC_2_4_8);
4605 if (0x40 <= c && c <= 0x42) {
4606 cs = CHARSET_BY_ATTRIBUTES(CHARSET_TYPE_94X94, c,
4607 *flags & CODING_STATE_R2L ?
4608 CHARSET_RIGHT_TO_LEFT :
4609 CHARSET_LEFT_TO_RIGHT);
4626 case ISO_ESC_2_4_10:
4627 case ISO_ESC_2_4_11:
4628 case ISO_ESC_2_4_12:
4629 case ISO_ESC_2_4_13:
4630 case ISO_ESC_2_4_14:
4631 case ISO_ESC_2_4_15:
4632 case ISO_ESC_SINGLE_SHIFT:
4633 case ISO_ESC_LOCKING_SHIFT:
4634 case ISO_ESC_DESIGNATE:
4635 case ISO_ESC_DIRECTIONALITY:
4636 case ISO_ESC_LITERAL:
4641 if (c < '0' || c > '~')
4642 return 0; /* bad final byte */
4644 if (iso->esc >= ISO_ESC_2_8 && iso->esc <= ISO_ESC_2_15) {
4645 type = ((iso->esc >= ISO_ESC_2_12) ?
4646 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4647 reg = (iso->esc - ISO_ESC_2_8) & 3;
4648 } else if (iso->esc >= ISO_ESC_2_4_8 &&
4649 iso->esc <= ISO_ESC_2_4_15) {
4650 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4651 CHARSET_TYPE_96X96 :
4652 CHARSET_TYPE_94X94);
4653 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4655 /* Can this ever be reached? -slb */
4660 cs = CHARSET_BY_ATTRIBUTES(type, c,
4661 *flags & CODING_STATE_R2L ?
4662 CHARSET_RIGHT_TO_LEFT :
4663 CHARSET_LEFT_TO_RIGHT);
4669 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char)c;
4673 if (check_invalid_charsets && !CHARSETP(iso->charset[reg]))
4674 /* can't invoke something that ain't there. */
4676 iso->esc = ISO_ESC_SINGLE_SHIFT;
4677 *flags &= CODING_STATE_ISO2022_LOCK;
4679 *flags |= CODING_STATE_SS2;
4681 *flags |= CODING_STATE_SS3;
4685 if (check_invalid_charsets && !CHARSETP(iso->charset[reg]))
4686 /* can't invoke something that ain't there. */
4689 iso->register_right = reg;
4691 iso->register_left = reg;
4692 *flags &= CODING_STATE_ISO2022_LOCK;
4693 iso->esc = ISO_ESC_LOCKING_SHIFT;
4697 if (NILP(cs) && check_invalid_charsets) {
4698 iso->invalid_designated[reg] = 1;
4699 iso->charset[reg] = Vcharset_ascii;
4700 iso->esc = ISO_ESC_DESIGNATE;
4701 *flags &= CODING_STATE_ISO2022_LOCK;
4702 iso->output_literally = 1;
4703 if (iso->switched_dir_and_no_valid_charset_yet) {
4704 /* We encountered a switch-direction followed by an
4705 invalid designation. Ensure that the switch-direction
4706 gets outputted; otherwise it will probably get eaten
4707 when the text is written out again. */
4708 iso->switched_dir_and_no_valid_charset_yet = 0;
4709 iso->output_direction_sequence = 1;
4710 /* And make sure that the switch-dir going the other
4711 way gets outputted, as well. */
4712 iso->invalid_switch_dir = 1;
4716 /* This function is called with CODESYS equal to nil when
4717 doing coding-system detection. */
4718 if (!NILP(codesys)) {
4719 charset_conversion_spec_dynarr *dyn =
4720 XCODING_SYSTEM(codesys)->iso2022.input_conv;
4725 for (i = 0; i < Dynarr_length(dyn); i++) {
4726 struct charset_conversion_spec *spec =
4728 if (EQ(cs, spec->from_charset))
4729 cs = spec->to_charset;
4734 iso->charset[reg] = cs;
4735 iso->esc = ISO_ESC_DESIGNATE;
4736 *flags &= CODING_STATE_ISO2022_LOCK;
4737 if (iso->invalid_designated[reg]) {
4738 iso->invalid_designated[reg] = 0;
4739 iso->output_literally = 1;
4741 if (iso->switched_dir_and_no_valid_charset_yet)
4742 iso->switched_dir_and_no_valid_charset_yet = 0;
4747 detect_coding_iso2022(struct detection_state *st, const Extbyte * src,
4748 Lstream_data_count n)
4752 /* #### There are serious deficiencies in the recognition mechanism
4753 here. This needs to be much smarter if it's going to cut it.
4754 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4755 it should be detected as Latin-1.
4756 All the ISO2022 stuff in this file should be synced up with the
4757 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4758 Perhaps we should wait till R2L works in FSF Emacs? */
4760 if (!st->iso2022.initted) {
4761 reset_iso2022(Qnil, &st->iso2022.iso);
4762 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4763 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4764 CODING_CATEGORY_ISO_8_1_MASK |
4765 CODING_CATEGORY_ISO_8_2_MASK |
4766 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4767 st->iso2022.flags = 0;
4768 st->iso2022.high_byte_count = 0;
4769 st->iso2022.saw_single_shift = 0;
4770 st->iso2022.initted = 1;
4773 mask = st->iso2022.mask;
4776 const unsigned char c = *(const unsigned char *)src++;
4778 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4779 st->iso2022.high_byte_count++;
4781 if (st->iso2022.high_byte_count
4782 && !st->iso2022.saw_single_shift) {
4783 if (st->iso2022.high_byte_count & 1)
4784 /* odd number of high bytes; assume not iso-8-2 */
4785 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4787 st->iso2022.high_byte_count = 0;
4788 st->iso2022.saw_single_shift = 0;
4790 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4792 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4793 && (BYTE_C0_P(c) || BYTE_C1_P(c))) { /* control chars */
4795 /* Allow and ignore control characters that you might
4796 reasonably see in a text file */
4801 case 8: /* backspace */
4802 case 11: /* vertical tab */
4803 case 12: /* form feed */
4804 case 26: /* MS-DOS C-z junk */
4805 case 31: /* '^_' -- for info */
4806 goto label_continue_loop;
4813 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P(c)
4815 if (parse_iso2022_esc(Qnil, &st->iso2022.iso, c,
4816 &st->iso2022.flags, 0)) {
4817 switch (st->iso2022.iso.esc) {
4818 case ISO_ESC_DESIGNATE:
4819 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4820 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4822 case ISO_ESC_LOCKING_SHIFT:
4823 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4824 goto ran_out_of_chars;
4825 case ISO_ESC_SINGLE_SHIFT:
4826 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4827 st->iso2022.saw_single_shift = 1;
4831 case ISO_ESC_NOTHING:
4844 case ISO_ESC_2_4_10:
4845 case ISO_ESC_2_4_11:
4846 case ISO_ESC_2_4_12:
4847 case ISO_ESC_2_4_13:
4848 case ISO_ESC_2_4_14:
4849 case ISO_ESC_2_4_15:
4851 case ISO_ESC_5_11_0:
4852 case ISO_ESC_5_11_1:
4853 case ISO_ESC_5_11_2:
4854 case ISO_ESC_DIRECTIONALITY:
4855 case ISO_ESC_LITERAL:
4861 goto ran_out_of_chars;
4864 label_continue_loop:;
4871 static int postprocess_iso2022_mask(int mask)
4873 /* #### kind of cheesy */
4874 /* If seven-bit ISO is allowed, then assume that the encoding is
4875 entirely seven-bit and turn off the eight-bit ones. */
4876 if (mask & CODING_CATEGORY_ISO_7_MASK)
4877 mask &= ~(CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4878 CODING_CATEGORY_ISO_8_1_MASK |
4879 CODING_CATEGORY_ISO_8_2_MASK);
4883 /* If FLAGS is a null pointer or specifies right-to-left motion,
4884 output a switch-dir-to-left-to-right sequence to DST.
4885 Also update FLAGS if it is not a null pointer.
4886 If INTERNAL_P is set, we are outputting in internal format and
4887 need to handle the CSI differently. */
4890 restore_left_to_right_direction(Lisp_Coding_System * codesys,
4891 unsigned_char_dynarr * dst,
4892 unsigned int *flags, int internal_p)
4894 if (!flags || (*flags & CODING_STATE_R2L)) {
4895 if (CODING_SYSTEM_ISO2022_SEVEN(codesys)) {
4896 Dynarr_add(dst, ISO_CODE_ESC);
4897 Dynarr_add(dst, '[');
4898 } else if (internal_p)
4899 DECODE_ADD_BINARY_CHAR(ISO_CODE_CSI, dst);
4901 Dynarr_add(dst, ISO_CODE_CSI);
4902 Dynarr_add(dst, '0');
4903 Dynarr_add(dst, ']');
4905 *flags &= ~CODING_STATE_R2L;
4909 /* If FLAGS is a null pointer or specifies a direction different from
4910 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4911 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4912 sequence to DST. Also update FLAGS if it is not a null pointer.
4913 If INTERNAL_P is set, we are outputting in internal format and
4914 need to handle the CSI differently. */
4917 ensure_correct_direction(int direction, Lisp_Coding_System * codesys,
4918 unsigned_char_dynarr * dst, unsigned int *flags,
4921 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4922 direction == CHARSET_LEFT_TO_RIGHT)
4923 restore_left_to_right_direction(codesys, dst, flags,
4925 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429(codesys)
4926 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4927 direction == CHARSET_RIGHT_TO_LEFT) {
4928 if (CODING_SYSTEM_ISO2022_SEVEN(codesys)) {
4929 Dynarr_add(dst, ISO_CODE_ESC);
4930 Dynarr_add(dst, '[');
4931 } else if (internal_p)
4932 DECODE_ADD_BINARY_CHAR(ISO_CODE_CSI, dst);
4934 Dynarr_add(dst, ISO_CODE_CSI);
4935 Dynarr_add(dst, '2');
4936 Dynarr_add(dst, ']');
4938 *flags |= CODING_STATE_R2L;
4942 /* Convert ISO2022-format data to internal format. */
4945 decode_coding_iso2022(lstream_t decoding, const Extbyte * src,
4946 unsigned_char_dynarr * dst, Lstream_data_count n)
4948 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
4949 unsigned int flags = str->flags;
4950 unsigned int ch = str->ch;
4951 eol_type_t eol_type = str->eol_type;
4952 #ifdef ENABLE_COMPOSITE_CHARS
4953 unsigned_char_dynarr *real_dst = dst;
4955 Lisp_Object coding_system;
4957 XSETCODING_SYSTEM(coding_system, str->codesys);
4959 #ifdef ENABLE_COMPOSITE_CHARS
4960 if (flags & CODING_STATE_COMPOSITE)
4961 dst = str->iso2022.composite_chars;
4962 #endif /* ENABLE_COMPOSITE_CHARS */
4965 const unsigned char c = *(const unsigned char *)src++;
4966 if (flags & CODING_STATE_ESCAPE) {
4967 /* Within ESC sequence */
4968 int retval = parse_iso2022_esc(
4969 coding_system, &str->iso2022, c, &flags, 1);
4972 switch (str->iso2022.esc) {
4973 #ifdef ENABLE_COMPOSITE_CHARS
4974 case ISO_ESC_START_COMPOSITE:
4975 if (str->iso2022.composite_chars)
4976 Dynarr_reset(str->iso2022.
4979 str->iso2022.composite_chars =
4980 Dynarr_new(unsigned_char);
4981 dst = str->iso2022.composite_chars;
4983 case ISO_ESC_END_COMPOSITE:
4985 Bufbyte comstr[MAX_EMCHAR_LEN];
4988 lookup_composite_char
4989 (Dynarr_atp(dst, 0),
4990 Dynarr_length(dst));
4993 set_charptr_emchar(comstr,
4995 Dynarr_add_many(dst, comstr,
4999 #endif /* ENABLE_COMPOSITE_CHARS */
5001 case ISO_ESC_LITERAL:
5002 DECODE_ADD_BINARY_CHAR(c, dst);
5005 case ISO_ESC_NOTHING:
5018 case ISO_ESC_2_4_10:
5019 case ISO_ESC_2_4_11:
5020 case ISO_ESC_2_4_12:
5021 case ISO_ESC_2_4_13:
5022 case ISO_ESC_2_4_14:
5023 case ISO_ESC_2_4_15:
5025 case ISO_ESC_5_11_0:
5026 case ISO_ESC_5_11_1:
5027 case ISO_ESC_5_11_2:
5028 case ISO_ESC_SINGLE_SHIFT:
5029 case ISO_ESC_LOCKING_SHIFT:
5030 case ISO_ESC_DESIGNATE:
5031 case ISO_ESC_DIRECTIONALITY:
5034 /* Everything else handled already */
5039 /* Attempted error recovery. */
5040 if (str->iso2022.output_direction_sequence)
5041 ensure_correct_direction(flags &
5043 CHARSET_RIGHT_TO_LEFT :
5044 CHARSET_LEFT_TO_RIGHT,
5045 str->codesys, dst, 0,
5047 /* More error recovery. */
5048 if (!retval || str->iso2022.output_literally) {
5049 /* Output the (possibly invalid) sequence */
5051 for (i = 0; i < str->iso2022.esc_bytes_index;
5053 DECODE_ADD_BINARY_CHAR(str->iso2022.
5056 flags &= CODING_STATE_ISO2022_LOCK;
5058 n++, src--; /* Repeat the loop with the same character. */
5060 /* No sense in reprocessing the final byte of the
5061 escape sequence; it could mess things up anyway.
5063 DECODE_ADD_BINARY_CHAR(c, dst);
5067 } else if (BYTE_C0_P(c) || BYTE_C1_P(c)) { /* Control characters */
5069 /***** Error-handling *****/
5071 /* If we were in the middle of a character, dump out the
5072 partial character. */
5073 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5075 /* If we just saw a single-shift character, dump it out.
5076 This may dump out the wrong sort of single-shift character,
5077 but least it will give an indication that something went
5079 if (flags & CODING_STATE_SS2) {
5080 DECODE_ADD_BINARY_CHAR(ISO_CODE_SS2, dst);
5081 flags &= ~CODING_STATE_SS2;
5083 if (flags & CODING_STATE_SS3) {
5084 DECODE_ADD_BINARY_CHAR(ISO_CODE_SS3, dst);
5085 flags &= ~CODING_STATE_SS3;
5088 /***** Now handle the control characters. *****/
5091 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5093 flags &= CODING_STATE_ISO2022_LOCK;
5095 if (!parse_iso2022_esc
5096 (coding_system, &str->iso2022, c, &flags, 1))
5097 DECODE_ADD_BINARY_CHAR(c, dst);
5098 } else { /* Graphic characters */
5099 Lisp_Object charset;
5103 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5105 /* Now determine the charset. */
5106 reg = ((flags & CODING_STATE_SS2) ? 2
5107 : (flags & CODING_STATE_SS3) ? 3
5108 : !BYTE_ASCII_P(c) ? str->iso2022.register_right
5109 : str->iso2022.register_left);
5110 charset = str->iso2022.charset[reg];
5112 /* Error checking: */
5113 if (!CHARSETP(charset)
5114 || str->iso2022.invalid_designated[reg]
5116 (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5117 && XCHARSET_CHARS(charset) == 94))
5118 /* Mrmph. We are trying to invoke a register that has no
5119 or an invalid charset in it, or trying to add a character
5120 outside the range of the charset. Insert that char literally
5121 to preserve it for the output. */
5123 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5124 DECODE_ADD_BINARY_CHAR(c, dst);
5128 /* Things are probably hunky-dorey. */
5130 /* Fetch reverse charset, maybe. */
5131 if (((flags & CODING_STATE_R2L) &&
5132 XCHARSET_DIRECTION(charset) ==
5133 CHARSET_LEFT_TO_RIGHT)
5134 || (!(flags & CODING_STATE_R2L)
5135 && XCHARSET_DIRECTION(charset) ==
5136 CHARSET_RIGHT_TO_LEFT)) {
5137 Lisp_Object new_charset =
5138 XCHARSET_REVERSE_DIRECTION_CHARSET
5140 if (!NILP(new_charset))
5141 charset = new_charset;
5144 lb = XCHARSET_LEADING_BYTE(charset);
5145 switch (XCHARSET_REP_BYTES(charset)) {
5147 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5148 Dynarr_add(dst, c & 0x7F);
5151 case 2: /* one-byte official */
5152 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5153 Dynarr_add(dst, lb);
5154 Dynarr_add(dst, c | 0x80);
5157 case 3: /* one-byte private or two-byte official */
5158 if (XCHARSET_PRIVATE_P(charset)) {
5159 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5161 PRE_LEADING_BYTE_PRIVATE_1);
5162 Dynarr_add(dst, lb);
5163 Dynarr_add(dst, c | 0x80);
5166 Dynarr_add(dst, lb);
5177 default: /* two-byte private */
5180 PRE_LEADING_BYTE_PRIVATE_2);
5181 Dynarr_add(dst, lb);
5182 Dynarr_add(dst, ch | 0x80);
5183 Dynarr_add(dst, c | 0x80);
5191 flags &= CODING_STATE_ISO2022_LOCK;
5194 label_continue_loop:;
5197 if (flags & CODING_STATE_END)
5198 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5204 /***** ISO2022 encoder *****/
5206 /* Designate CHARSET into register REG. */
5209 iso2022_designate(Lisp_Object charset, unsigned char reg,
5210 encoding_stream_t str, unsigned_char_dynarr * dst)
5212 static const char inter94[] = "()*+";
5213 static const char inter96[] = ",-./";
5215 unsigned char final;
5216 Lisp_Object old_charset = str->iso2022.charset[reg];
5218 str->iso2022.charset[reg] = charset;
5219 if (!CHARSETP(charset))
5220 /* charset might be an initial nil or t. */
5222 type = XCHARSET_TYPE(charset);
5223 final = XCHARSET_FINAL(charset);
5224 if (!str->iso2022.force_charset_on_output[reg] &&
5225 CHARSETP(old_charset) &&
5226 XCHARSET_TYPE(old_charset) == type &&
5227 XCHARSET_FINAL(old_charset) == final)
5230 str->iso2022.force_charset_on_output[reg] = 0;
5233 charset_conversion_spec_dynarr *dyn =
5234 str->codesys->iso2022.output_conv;
5239 for (i = 0; i < Dynarr_length(dyn); i++) {
5240 struct charset_conversion_spec *spec =
5242 if (EQ(charset, spec->from_charset))
5243 charset = spec->to_charset;
5248 Dynarr_add(dst, ISO_CODE_ESC);
5250 case CHARSET_TYPE_94:
5251 Dynarr_add(dst, inter94[reg]);
5253 case CHARSET_TYPE_96:
5254 Dynarr_add(dst, inter96[reg]);
5256 case CHARSET_TYPE_94X94:
5257 Dynarr_add(dst, '$');
5258 if (reg != 0 || !(CODING_SYSTEM_ISO2022_SHORT(str->codesys))
5259 || final < '@' || final > 'B')
5260 Dynarr_add(dst, inter94[reg]);
5262 case CHARSET_TYPE_96X96:
5263 Dynarr_add(dst, '$');
5264 Dynarr_add(dst, inter96[reg]);
5269 Dynarr_add(dst, final);
5273 ensure_normal_shift(encoding_stream_t str, unsigned_char_dynarr * dst)
5275 if (str->iso2022.register_left != 0) {
5276 Dynarr_add(dst, ISO_CODE_SI);
5277 str->iso2022.register_left = 0;
5282 ensure_shift_out(encoding_stream_t str, unsigned_char_dynarr * dst)
5284 if (str->iso2022.register_left != 1) {
5285 Dynarr_add(dst, ISO_CODE_SO);
5286 str->iso2022.register_left = 1;
5290 /* Convert internally-formatted data to ISO2022 format. */
5293 encode_coding_iso2022(lstream_t encoding, const Bufbyte * src,
5294 unsigned_char_dynarr * dst, Lstream_data_count n)
5296 unsigned char charmask, c;
5297 unsigned char char_boundary;
5298 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
5299 unsigned int flags = str->flags;
5300 unsigned int ch = str->ch;
5301 Lisp_Coding_System *codesys = str->codesys;
5302 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
5304 Lisp_Object charset;
5307 #ifdef ENABLE_COMPOSITE_CHARS
5308 /* flags for handling composite chars. We do a little switcharoo
5309 on the source while we're outputting the composite char. */
5310 unsigned int saved_n = 0;
5311 const unsigned char *saved_src = NULL;
5312 int in_composite = 0;
5313 #endif /* ENABLE_COMPOSITE_CHARS */
5315 char_boundary = str->iso2022.current_char_boundary;
5316 charset = str->iso2022.current_charset;
5317 half = str->iso2022.current_half;
5319 #ifdef ENABLE_COMPOSITE_CHARS
5325 if (BYTE_ASCII_P(c)) { /* Processing ASCII character */
5328 restore_left_to_right_direction(codesys, dst, &flags,
5331 /* Make sure G0 contains ASCII */
5332 if ((c > ' ' && c < ISO_CODE_DEL) ||
5333 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL(codesys)) {
5334 ensure_normal_shift(str, dst);
5335 iso2022_designate(Vcharset_ascii, 0, str, dst);
5338 /* If necessary, restore everything to the default state
5341 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL(codesys))) {
5342 restore_left_to_right_direction(codesys, dst,
5345 ensure_normal_shift(str, dst);
5347 for (i = 0; i < 4; i++) {
5348 Lisp_Object initial_charset =
5349 CODING_SYSTEM_ISO2022_INITIAL_CHARSET
5351 iso2022_designate(initial_charset, i,
5356 if (eol_type != EOL_LF
5357 && eol_type != EOL_AUTODETECT)
5358 Dynarr_add(dst, '\r');
5359 if (eol_type != EOL_CR)
5362 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
5363 && fit_to_be_escape_quoted(c))
5364 Dynarr_add(dst, ISO_CODE_ESC);
5370 else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) { /* Processing Leading Byte */
5372 charset = CHARSET_BY_LEADING_BYTE(c);
5373 if (LEADING_BYTE_PREFIX_P(c))
5375 else if (!EQ(charset, Vcharset_control_1)
5376 #ifdef ENABLE_COMPOSITE_CHARS
5377 && !EQ(charset, Vcharset_composite)
5382 ensure_correct_direction(XCHARSET_DIRECTION
5386 /* Now determine which register to use. */
5388 for (i = 0; i < 4; i++) {
5389 if (EQ(charset, str->iso2022.charset[i])
5391 CODING_SYSTEM_ISO2022_INITIAL_CHARSET
5399 if (XCHARSET_GRAPHIC(charset) != 0) {
5401 (str->iso2022.charset[1])
5403 (!CODING_SYSTEM_ISO2022_SEVEN
5406 CODING_SYSTEM_ISO2022_LOCK_SHIFT
5423 iso2022_designate(charset, reg, str, dst);
5425 /* Now invoke that register. */
5428 ensure_normal_shift(str, dst);
5433 if (CODING_SYSTEM_ISO2022_SEVEN
5435 ensure_shift_out(str, dst);
5442 if (CODING_SYSTEM_ISO2022_SEVEN
5444 Dynarr_add(dst, ISO_CODE_ESC);
5445 Dynarr_add(dst, 'N');
5448 Dynarr_add(dst, ISO_CODE_SS2);
5454 if (CODING_SYSTEM_ISO2022_SEVEN
5456 Dynarr_add(dst, ISO_CODE_ESC);
5457 Dynarr_add(dst, 'O');
5460 Dynarr_add(dst, ISO_CODE_SS3);
5470 } else { /* Processing Non-ASCII character */
5471 charmask = (half == 0 ? 0x7F : 0xFF);
5473 if (EQ(charset, Vcharset_control_1)) {
5474 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
5475 && fit_to_be_escape_quoted(c))
5476 Dynarr_add(dst, ISO_CODE_ESC);
5477 /* you asked for it ... */
5478 Dynarr_add(dst, c - 0x20);
5480 switch (XCHARSET_REP_BYTES(charset)) {
5482 Dynarr_add(dst, c & charmask);
5485 if (XCHARSET_PRIVATE_P(charset)) {
5486 Dynarr_add(dst, c & charmask);
5489 #ifdef ENABLE_COMPOSITE_CHARS
5492 Vcharset_composite)) {
5494 /* #### Bother! We don't know how to
5501 (Vcharset_composite,
5506 composite_char_string
5515 n = XSTRING_LENGTH(lstr);
5518 Dynarr_add(dst, '0'); /* start composing */
5521 #endif /* ENABLE_COMPOSITE_CHARS */
5538 Dynarr_add(dst, ch & charmask);
5539 Dynarr_add(dst, c & charmask);
5553 #ifdef ENABLE_COMPOSITE_CHARS
5558 Dynarr_add(dst, ISO_CODE_ESC);
5559 Dynarr_add(dst, '1'); /* end composing */
5560 goto back_to_square_n; /* Wheeeeeeeee ..... */
5562 #endif /* ENABLE_COMPOSITE_CHARS */
5564 if (char_boundary && flags & CODING_STATE_END) {
5565 restore_left_to_right_direction(codesys, dst, &flags, 0);
5566 ensure_normal_shift(str, dst);
5567 for (i = 0; i < 4; i++) {
5568 Lisp_Object initial_charset =
5569 CODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, i);
5570 iso2022_designate(initial_charset, i, str, dst);
5576 str->iso2022.current_char_boundary = char_boundary;
5577 str->iso2022.current_charset = charset;
5578 str->iso2022.current_half = half;
5580 /* Verbum caro factum est! */
5584 /************************************************************************/
5585 /* No-conversion methods */
5586 /************************************************************************/
5588 /* This is used when reading in "binary" files -- i.e. files that may
5589 contain all 256 possible byte values and that are not to be
5590 interpreted as being in any particular decoding. */
5592 decode_coding_no_conversion(lstream_t decoding, const Extbyte * src,
5593 unsigned_char_dynarr * dst, Lstream_data_count n)
5595 decoding_stream_t str = DECODING_STREAM_DATA(decoding);
5596 unsigned int flags = str->flags;
5597 unsigned int ch = str->ch;
5598 eol_type_t eol_type = str->eol_type;
5601 const unsigned char c = *(const unsigned char *)src++;
5603 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5604 DECODE_ADD_BINARY_CHAR(c, dst);
5605 label_continue_loop:;
5608 DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
5615 encode_coding_no_conversion(lstream_t encoding, const Bufbyte * src,
5616 unsigned_char_dynarr * dst, Lstream_data_count n)
5619 encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
5620 unsigned int flags = str->flags;
5621 unsigned int ch = str->ch;
5622 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
5627 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5628 Dynarr_add(dst, '\r');
5629 if (eol_type != EOL_CR)
5630 Dynarr_add(dst, '\n');
5632 } else if (BYTE_ASCII_P(c)) {
5635 } else if (BUFBYTE_LEADING_BYTE_P(c)) {
5637 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5638 c == LEADING_BYTE_CONTROL_1)
5641 Dynarr_add(dst, '~'); /* untranslatable character */
5643 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5645 else if (ch == LEADING_BYTE_CONTROL_1) {
5647 Dynarr_add(dst, c - 0x20);
5649 /* else it should be the second or third byte of an
5650 untranslatable character, so ignore it */
5659 /************************************************************************/
5660 /* Initialization */
5661 /************************************************************************/
5663 void syms_of_file_coding(void)
5665 INIT_LRECORD_IMPLEMENTATION(coding_system);
5667 DEFERROR_STANDARD(Qcoding_system_error, Qio_error);
5669 DEFSUBR(Fcoding_system_p);
5670 DEFSUBR(Ffind_coding_system);
5671 DEFSUBR(Fget_coding_system);
5672 DEFSUBR(Fcoding_system_list);
5673 DEFSUBR(Fcoding_system_name);
5674 DEFSUBR(Fmake_coding_system);
5675 DEFSUBR(Fcopy_coding_system);
5676 DEFSUBR(Fcoding_system_canonical_name_p);
5677 DEFSUBR(Fcoding_system_alias_p);
5678 DEFSUBR(Fcoding_system_aliasee);
5679 DEFSUBR(Fdefine_coding_system_alias);
5680 DEFSUBR(Fsubsidiary_coding_system);
5682 DEFSUBR(Fcoding_system_type);
5683 DEFSUBR(Fcoding_system_doc_string);
5685 DEFSUBR(Fcoding_system_charset);
5687 DEFSUBR(Fcoding_system_property);
5689 DEFSUBR(Fcoding_category_list);
5690 DEFSUBR(Fset_coding_priority_list);
5691 DEFSUBR(Fcoding_priority_list);
5692 DEFSUBR(Fset_coding_category_system);
5693 DEFSUBR(Fcoding_category_system);
5695 DEFSUBR(Fdetect_coding_region);
5696 DEFSUBR(Fdecode_coding_region);
5697 DEFSUBR(Fencode_coding_region);
5699 DEFSUBR(Fdecode_shift_jis_char);
5700 DEFSUBR(Fencode_shift_jis_char);
5701 DEFSUBR(Fdecode_big5_char);
5702 DEFSUBR(Fencode_big5_char);
5703 DEFSUBR(Fset_ucs_char);
5705 DEFSUBR(Fset_char_ucs);
5708 defsymbol(&Qcoding_systemp, "coding-system-p");
5709 defsymbol(&Qno_conversion, "no-conversion");
5710 defsymbol(&Qraw_text, "raw-text");
5712 defsymbol(&Qbig5, "big5");
5713 defsymbol(&Qshift_jis, "shift-jis");
5714 defsymbol(&Qucs4, "ucs-4");
5715 defsymbol(&Qutf8, "utf-8");
5716 defsymbol(&Qccl, "ccl");
5717 defsymbol(&Qiso2022, "iso2022");
5719 defsymbol(&Qmnemonic, "mnemonic");
5720 defsymbol(&Qeol_type, "eol-type");
5721 defsymbol(&Qpost_read_conversion, "post-read-conversion");
5722 defsymbol(&Qpre_write_conversion, "pre-write-conversion");
5724 defsymbol(&Qcr, "cr");
5725 defsymbol(&Qlf, "lf");
5726 defsymbol(&Qcrlf, "crlf");
5727 defsymbol(&Qeol_cr, "eol-cr");
5728 defsymbol(&Qeol_lf, "eol-lf");
5729 defsymbol(&Qeol_crlf, "eol-crlf");
5731 defsymbol(&Qcharset_g0, "charset-g0");
5732 defsymbol(&Qcharset_g1, "charset-g1");
5733 defsymbol(&Qcharset_g2, "charset-g2");
5734 defsymbol(&Qcharset_g3, "charset-g3");
5735 defsymbol(&Qforce_g0_on_output, "force-g0-on-output");
5736 defsymbol(&Qforce_g1_on_output, "force-g1-on-output");
5737 defsymbol(&Qforce_g2_on_output, "force-g2-on-output");
5738 defsymbol(&Qforce_g3_on_output, "force-g3-on-output");
5739 defsymbol(&Qno_iso6429, "no-iso6429");
5740 defsymbol(&Qinput_charset_conversion, "input-charset-conversion");
5741 defsymbol(&Qoutput_charset_conversion, "output-charset-conversion");
5743 defsymbol(&Qno_ascii_eol, "no-ascii-eol");
5744 defsymbol(&Qno_ascii_cntl, "no-ascii-cntl");
5745 defsymbol(&Qseven, "seven");
5746 defsymbol(&Qlock_shift, "lock-shift");
5747 defsymbol(&Qescape_quoted, "escape-quoted");
5749 defsymbol(&Qencode, "encode");
5750 defsymbol(&Qdecode, "decode");
5753 defsymbol(&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5755 defsymbol(&coding_category_symbol[CODING_CATEGORY_BIG5], "big5");
5756 defsymbol(&coding_category_symbol[CODING_CATEGORY_UCS4], "ucs-4");
5757 defsymbol(&coding_category_symbol[CODING_CATEGORY_UTF8], "utf-8");
5758 defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_7], "iso-7");
5759 defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5761 defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_1], "iso-8-1");
5762 defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_2], "iso-8-2");
5763 defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5766 defsymbol(&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5770 void lstream_type_create_file_coding(void)
5772 LSTREAM_HAS_METHOD(decoding, reader);
5773 LSTREAM_HAS_METHOD(decoding, writer);
5774 LSTREAM_HAS_METHOD(decoding, rewinder);
5775 LSTREAM_HAS_METHOD(decoding, seekable_p);
5776 LSTREAM_HAS_METHOD(decoding, flusher);
5777 LSTREAM_HAS_METHOD(decoding, closer);
5778 LSTREAM_HAS_METHOD(decoding, marker);
5780 LSTREAM_HAS_METHOD(encoding, reader);
5781 LSTREAM_HAS_METHOD(encoding, writer);
5782 LSTREAM_HAS_METHOD(encoding, rewinder);
5783 LSTREAM_HAS_METHOD(encoding, seekable_p);
5784 LSTREAM_HAS_METHOD(encoding, flusher);
5785 LSTREAM_HAS_METHOD(encoding, closer);
5786 LSTREAM_HAS_METHOD(encoding, marker);
5789 void vars_of_file_coding(void)
5793 fcd = xnew(struct file_coding_dump);
5794 dump_add_root_struct_ptr(&fcd, &fcd_description);
5796 /* Initialize to something reasonable ... */
5797 for (i = 0; i < CODING_CATEGORY_LAST; i++) {
5798 fcd->coding_category_system[i] = Qnil;
5799 fcd->coding_category_by_priority[i] = i;
5802 Fprovide(intern("file-coding"));
5804 DEFVAR_LISP("keyboard-coding-system", &Vkeyboard_coding_system /*
5805 Coding system used for TTY keyboard input.
5806 Not used under a windowing system.
5808 Vkeyboard_coding_system = Qnil;
5810 DEFVAR_LISP("terminal-coding-system", &Vterminal_coding_system /*
5811 Coding system used for TTY display output.
5812 Not used under a windowing system.
5814 Vterminal_coding_system = Qnil;
5816 DEFVAR_LISP("coding-system-for-read", &Vcoding_system_for_read /*
5817 Overriding coding system used when reading from a file or process.
5818 You should bind this variable with `let', but do not set it globally.
5819 If this is non-nil, it specifies the coding system that will be used
5820 to decode input on read operations, such as from a file or process.
5821 It overrides `buffer-file-coding-system-for-read',
5822 `insert-file-contents-pre-hook', etc. Use those variables instead of
5823 this one for permanent changes to the environment. */ );
5824 Vcoding_system_for_read = Qnil;
5826 DEFVAR_LISP("coding-system-for-write", &Vcoding_system_for_write /*
5827 Overriding coding system used when writing to a file or process.
5828 You should bind this variable with `let', but do not set it globally.
5829 If this is non-nil, it specifies the coding system that will be used
5830 to encode output for write operations, such as to a file or process.
5831 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
5832 Use those variables instead of this one for permanent changes to the
5834 Vcoding_system_for_write = Qnil;
5836 DEFVAR_LISP("file-name-coding-system", &Vfile_name_coding_system /*
5837 Coding system used to convert pathnames when accessing files.
5839 Vfile_name_coding_system = Qnil;
5841 DEFVAR_BOOL("enable-multibyte-characters", &enable_multibyte_characters /*
5842 Non-nil means the buffer contents are regarded as multi-byte form
5843 of characters, not a binary code. This affects the display, file I/O,
5844 and behaviors of various editing commands.
5846 Setting this to nil does not do anything.
5848 enable_multibyte_characters = 1;
5851 void complex_vars_of_file_coding(void)
5853 staticpro(&Vcoding_system_hash_table);
5854 Vcoding_system_hash_table =
5855 make_lisp_hash_table(50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5857 the_codesys_prop_dynarr = Dynarr_new(codesys_prop);
5858 dump_add_root_struct_ptr(&the_codesys_prop_dynarr,
5859 &codesys_prop_dynarr_description);
5861 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5863 struct codesys_prop csp; \
5865 csp.prop_type = (Prop_Type); \
5866 Dynarr_add (the_codesys_prop_dynarr, csp); \
5869 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qmnemonic);
5870 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_type);
5871 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_cr);
5872 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_crlf);
5873 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_lf);
5874 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5875 DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5877 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g0);
5878 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g1);
5879 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g2);
5880 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g3);
5881 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5882 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5883 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5884 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5885 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qshort);
5886 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_ascii_eol);
5887 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5888 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qseven);
5889 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qlock_shift);
5890 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_iso6429);
5891 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qescape_quoted);
5892 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5893 DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5895 DEFINE_CODESYS_PROP(CODESYS_PROP_CCL, Qencode);
5896 DEFINE_CODESYS_PROP(CODESYS_PROP_CCL, Qdecode);
5898 /* Need to create this here or we're really screwed. */
5900 (Qraw_text, Qno_conversion,
5902 ("Raw text, which means it converts only line-break-codes."),
5903 list2(Qmnemonic, build_string("Raw")));
5906 (Qbinary, Qno_conversion,
5907 build_string("Binary, which means it does not convert anything."),
5908 list4(Qeol_type, Qlf, Qmnemonic, build_string("Binary")));
5910 Fdefine_coding_system_alias(Qno_conversion, Qraw_text);
5912 Fdefine_coding_system_alias(Qfile_name, Qbinary);
5914 Fdefine_coding_system_alias(Qterminal, Qbinary);
5915 Fdefine_coding_system_alias(Qkeyboard, Qbinary);
5917 /* Need this for bootstrapping */
5918 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5919 Fget_coding_system(Qraw_text);
5925 for (i = 0; i < countof(fcd->ucs_to_mule_table); i++)
5926 fcd->ucs_to_mule_table[i] = Qnil;
5928 staticpro(&mule_to_ucs_table);
5929 mule_to_ucs_table = Fmake_char_table(Qgeneric);