Fix build
[sxemacs] / src / mule / file-coding.c
1 /* Code conversion functions.
2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3    Copyright (C) 1995 Sun Microsystems, Inc.
4
5 This file is part of SXEmacs
6
7 SXEmacs is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
11
12 SXEmacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program.  If not, see <http://www.gnu.org/licenses/>. */
19
20
21 /* Synched up with: Mule 2.3.   Not in FSF. */
22
23 /* Rewritten by Ben Wing <ben@xemacs.org>. */
24
25 #include <config.h>
26 #include "lisp.h"
27
28 #include "buffer.h"
29 #include "elhash.h"
30 #include "ui/insdel.h"
31 #include "lstream.h"
32 #include "opaque.h"
33 #ifdef MULE
34 #include "mule-ccl.h"
35 #include "chartab.h"
36 #endif
37 #include "file-coding.h"
38
39 Lisp_Object Qcoding_system_error;
40
41 Lisp_Object Vkeyboard_coding_system;
42 Lisp_Object Vterminal_coding_system;
43 Lisp_Object Vcoding_system_for_read;
44 Lisp_Object Vcoding_system_for_write;
45 Lisp_Object Vfile_name_coding_system;
46
47 /* Table of symbols identifying each coding category. */
48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
49
50 struct file_coding_dump {
51         /* Coding system currently associated with each coding category. */
52         Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
53
54         /* Table of all coding categories in decreasing order of priority.
55            This describes a permutation of the possible coding categories. */
56         int coding_category_by_priority[CODING_CATEGORY_LAST];
57
58 #ifdef MULE
59         Lisp_Object ucs_to_mule_table[65536];
60 #endif
61 } *fcd;
62
63 static const struct lrecord_description fcd_description_1[] = {
64         {XD_LISP_OBJECT_ARRAY,
65          offsetof(struct file_coding_dump, coding_category_system),
66          CODING_CATEGORY_LAST},
67 #ifdef MULE
68         {XD_LISP_OBJECT_ARRAY,
69          offsetof(struct file_coding_dump, ucs_to_mule_table),
70          countof(fcd->ucs_to_mule_table)},
71 #endif
72         {XD_END}
73 };
74
75 static const struct struct_description fcd_description = {
76         sizeof(struct file_coding_dump),
77         fcd_description_1
78 };
79
80 Lisp_Object mule_to_ucs_table;
81
82 Lisp_Object Qcoding_systemp;
83
84 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
85 /* Qinternal in general.c */
86
87 Lisp_Object Qmnemonic, Qeol_type;
88 Lisp_Object Qcr, Qcrlf, Qlf;
89 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
90 Lisp_Object Qpost_read_conversion;
91 Lisp_Object Qpre_write_conversion;
92
93 #ifdef MULE
94 Lisp_Object Qucs4, Qutf8;
95 Lisp_Object Qbig5, Qshift_jis;
96 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
97 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
98 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
99 Lisp_Object Qno_iso6429;
100 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
101 Lisp_Object Qescape_quoted;
102 Lisp_Object Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
103 #endif
104 Lisp_Object Qencode, Qdecode;
105
106 Lisp_Object Vcoding_system_hash_table;
107
108 int enable_multibyte_characters;
109
#ifdef MULE
/* Extra state needed by the ISO2022 decoder and by the ISO2022 detector. */
struct iso2022_decoder {
        /* Character sets currently assigned to the G0 .. G3 variables.
           Initialized from the INITIAL_CHARSET array in CODESYS. */
        Lisp_Object charset[4];

        /* Register currently invoked into the left (GL) half of the
           8-bit encoding space. */
        int register_left;
        /* Register currently invoked into the right (GR) half. */
        int register_right;

        /* Indicates which part of an escape sequence has been seen
           so far. */
        enum iso_esc_flag esc;

        /* Bytes of the escape sequence collected so far; if the sequence
           turns out to be invalid they are emitted unchanged. */
        unsigned char esc_bytes[8];

        /* Position in ESC_BYTES where the next escape byte is stored. */
        int esc_bytes_index;

#ifdef ENABLE_COMPOSITE_CHARS
        /* What has been collected so far while composing a string. */
        unsigned_char_dynarr *composite_chars;
#endif

        /* Per-register flag set when an invalid designation sequence was
           seen for that register; we then switch to ASCII.  When the next
           valid designation for the register arrives, the flag is cleared
           and the designation is carried out normally, but the sequence
           is treated as though it were invalid.  The net effect (most of
           the time) is that the escape sequences for the switch to the
           unknown charset and for the switch back to the known one are
           both inserted literally into the buffer and written out as
           such, so the resulting file still makes sense.  Without this,
           the designation of the invalid charset would be preserved but
           the switch back would probably be swallowed, because it names
           the charset that was already present in the register. */
        unsigned char invalid_designated[4];

        /* Direction-switching sequences get similar treatment.  When a
           direction switch is seen while an invalid designation is
           present, or an invalid designation follows directly after a
           direction switch (no valid designation seen yet), the
           direction-switch escape sequence is inserted literally into the
           output, and later the matching direction-restoring sequence is
           inserted literally as well. */
        unsigned int switched_dir_and_no_valid_charset_yet:1;
        unsigned int invalid_switch_dir:1;

        /* Ask the decoder to emit the escape sequence literally although
           it was valid.  Part of the tricks used to avoid lossage on
           invalid designations. */
        unsigned int output_literally:1;
        /* A direction switch was followed by an invalid designation.  The
           switch was not emitted literally at the time because the
           invalid designation was not yet known; it must be emitted
           now. */
        unsigned int output_direction_sequence:1;
};
#endif                          /* MULE */
175 EXFUN(Fcopy_coding_system, 2);
176 #ifdef MULE
177 struct detection_state;
178 static int detect_coding_sjis(struct detection_state *st,
179                               const Extbyte * src, Lstream_data_count n);
180 static void decode_coding_sjis(lstream_t decoding, const Extbyte * src,
181                                unsigned_char_dynarr * dst,
182                                Lstream_data_count n);
183 static void encode_coding_sjis(lstream_t encoding, const Bufbyte * src,
184                                unsigned_char_dynarr * dst,
185                                Lstream_data_count n);
186 static int detect_coding_big5(struct detection_state *st, const Extbyte * src,
187                               Lstream_data_count n);
188 static void decode_coding_big5(lstream_t decoding, const Extbyte * src,
189                                unsigned_char_dynarr * dst,
190                                Lstream_data_count n);
191 static void encode_coding_big5(lstream_t encoding, const Bufbyte * src,
192                                unsigned_char_dynarr * dst,
193                                Lstream_data_count n);
194 static int detect_coding_ucs4(struct detection_state *st, const Extbyte * src,
195                               Lstream_data_count n);
196 static void decode_coding_ucs4(lstream_t decoding, const Extbyte * src,
197                                unsigned_char_dynarr * dst,
198                                Lstream_data_count n);
199 static void encode_coding_ucs4(lstream_t encoding, const Bufbyte * src,
200                                unsigned_char_dynarr * dst,
201                                Lstream_data_count n);
202 static int detect_coding_utf8(struct detection_state *st, const Extbyte * src,
203                               Lstream_data_count n);
204 static void decode_coding_utf8(lstream_t decoding, const Extbyte * src,
205                                unsigned_char_dynarr * dst,
206                                Lstream_data_count n);
207 static void encode_coding_utf8(lstream_t encoding, const Bufbyte * src,
208                                unsigned_char_dynarr * dst,
209                                Lstream_data_count n);
210 static int postprocess_iso2022_mask(int mask);
211 static void reset_iso2022(Lisp_Object coding_system,
212                           struct iso2022_decoder *iso);
213 static int detect_coding_iso2022(struct detection_state *st,
214                                  const Extbyte * src, Lstream_data_count n);
215 static void decode_coding_iso2022(lstream_t decoding, const Extbyte * src,
216                                   unsigned_char_dynarr * dst,
217                                   Lstream_data_count n);
218 static void encode_coding_iso2022(lstream_t encoding, const Bufbyte * src,
219                                   unsigned_char_dynarr * dst,
220                                   Lstream_data_count n);
221 #endif                          /* MULE */
222 static void decode_coding_no_conversion(lstream_t decoding, const Extbyte * src,
223                                         unsigned_char_dynarr * dst,
224                                         Lstream_data_count n);
225 static void encode_coding_no_conversion(lstream_t encoding, const Bufbyte * src,
226                                         unsigned_char_dynarr * dst,
227                                         Lstream_data_count n);
228 static void mule_decode(lstream_t decoding, const Extbyte * src,
229                         unsigned_char_dynarr * dst, Lstream_data_count n);
230 static void mule_encode(lstream_t encoding, const Bufbyte * src,
231                         unsigned_char_dynarr * dst, Lstream_data_count n);
232
233 typedef struct codesys_prop codesys_prop;
234 struct codesys_prop {
235         Lisp_Object sym;
236         int prop_type;
237 };
238
239 typedef struct {
240         Dynarr_declare(codesys_prop);
241 } codesys_prop_dynarr;
242
243 static const struct lrecord_description codesys_prop_description_1[] = {
244         {XD_LISP_OBJECT, offsetof(codesys_prop, sym)},
245         {XD_END}
246 };
247
248 static const struct struct_description codesys_prop_description = {
249         sizeof(codesys_prop),
250         codesys_prop_description_1
251 };
252
253 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
254         XD_DYNARR_DESC(codesys_prop_dynarr, &codesys_prop_description),
255         {XD_END}
256 };
257
258 static const struct struct_description codesys_prop_dynarr_description = {
259         sizeof(codesys_prop_dynarr),
260         codesys_prop_dynarr_description_1
261 };
262
263 codesys_prop_dynarr *the_codesys_prop_dynarr;
264
265 enum codesys_prop_enum {
266         CODESYS_PROP_ALL_OK,
267         CODESYS_PROP_ISO2022,
268         CODESYS_PROP_CCL
269 };
270 \f
271 /************************************************************************/
272 /*                       Coding system functions                        */
273 /************************************************************************/
274
275 static Lisp_Object mark_coding_system(Lisp_Object);
276 static void print_coding_system(Lisp_Object, Lisp_Object, int);
277 static void finalize_coding_system(void *header, int for_disksave);
278
#ifdef MULE
/* Both members of a charset_conversion_spec are Lisp objects. */
static const struct lrecord_description ccs_description_1[] = {
        { XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset) },
        { XD_LISP_OBJECT, offsetof(charset_conversion_spec, to_charset) },
        { XD_END }
};

static const struct struct_description ccs_description = {
        sizeof(charset_conversion_spec),
        ccs_description_1
};

/* Description of a dynarr whose elements are charset_conversion_specs. */
static const struct lrecord_description ccsd_description_1[] = {
        XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
        { XD_END }
};

static const struct struct_description ccsd_description = {
        sizeof(charset_conversion_spec_dynarr),
        ccsd_description_1
};
#endif                          /* MULE */
301
302 static const struct lrecord_description coding_system_description[] = {
303         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, name)},
304         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, doc_string)},
305         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, mnemonic)},
306         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, post_read_conversion)},
307         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, pre_write_conversion)},
308         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_lf)},
309         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_crlf)},
310         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_cr)},
311 #ifdef MULE
312         {XD_LISP_OBJECT_ARRAY,
313          offsetof(Lisp_Coding_System, iso2022.initial_charset), 4},
314         {XD_STRUCT_PTR, offsetof(Lisp_Coding_System, iso2022.input_conv), 1,
315          &ccsd_description},
316         {XD_STRUCT_PTR, offsetof(Lisp_Coding_System, iso2022.output_conv), 1,
317          &ccsd_description},
318         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, ccl.decode)},
319         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, ccl.encode)},
320 #endif
321         {XD_END}
322 };
323
324 DEFINE_LRECORD_IMPLEMENTATION("coding-system", coding_system,
325                               mark_coding_system, print_coding_system,
326                               finalize_coding_system,
327                               0, 0, coding_system_description,
328                               Lisp_Coding_System);
329
330 static Lisp_Object mark_coding_system(Lisp_Object obj)
331 {
332         Lisp_Coding_System *codesys = XCODING_SYSTEM(obj);
333
334         mark_object(CODING_SYSTEM_NAME(codesys));
335         mark_object(CODING_SYSTEM_DOC_STRING(codesys));
336         mark_object(CODING_SYSTEM_MNEMONIC(codesys));
337         mark_object(CODING_SYSTEM_EOL_LF(codesys));
338         mark_object(CODING_SYSTEM_EOL_CRLF(codesys));
339         mark_object(CODING_SYSTEM_EOL_CR(codesys));
340
341         switch (CODING_SYSTEM_TYPE(codesys)) {
342 #ifdef MULE
343                 int i;
344         case CODESYS_ISO2022:
345                 for (i = 0; i < 4; i++)
346                         mark_object(CODING_SYSTEM_ISO2022_INITIAL_CHARSET
347                                     (codesys, i));
348                 if (codesys->iso2022.input_conv) {
349                         for (i = 0;
350                              i < Dynarr_length(codesys->iso2022.input_conv);
351                              i++) {
352                                 struct charset_conversion_spec *ccs =
353                                     Dynarr_atp(codesys->iso2022.input_conv, i);
354                                 mark_object(ccs->from_charset);
355                                 mark_object(ccs->to_charset);
356                         }
357                 }
358                 if (codesys->iso2022.output_conv) {
359                         for (i = 0;
360                              i < Dynarr_length(codesys->iso2022.output_conv);
361                              i++) {
362                                 struct charset_conversion_spec *ccs =
363                                     Dynarr_atp(codesys->iso2022.output_conv, i);
364                                 mark_object(ccs->from_charset);
365                                 mark_object(ccs->to_charset);
366                         }
367                 }
368                 break;
369
370         case CODESYS_CCL:
371                 mark_object(CODING_SYSTEM_CCL_DECODE(codesys));
372                 mark_object(CODING_SYSTEM_CCL_ENCODE(codesys));
373                 break;
374
375                 /* list the rest of them lot explicitly */
376         case CODESYS_AUTODETECT:
377         case CODESYS_SHIFT_JIS:
378         case CODESYS_BIG5:
379         case CODESYS_UCS4:
380         case CODESYS_UTF8:
381         case CODESYS_NO_CONVERSION:
382 #ifdef DEBUG_SXEMACS
383         case CODESYS_INTERNAL:
384 #endif
385 #endif                          /* MULE */
386         default:
387                 break;
388         }
389
390         mark_object(CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys));
391         return CODING_SYSTEM_POST_READ_CONVERSION(codesys);
392 }
393
394 static void
395 print_coding_system(Lisp_Object obj, Lisp_Object printcharfun, int escapeflag)
396 {
397         Lisp_Coding_System *c = XCODING_SYSTEM(obj);
398         if (print_readably)
399                 error("printing unreadable object #<coding-system 0x%x>",
400                       c->header.uid);
401
402         write_c_string("#<coding-system ", printcharfun);
403         print_internal(c->name, printcharfun, 1);
404         write_c_string(">", printcharfun);
405 }
406
407 static void finalize_coding_system(void *header, int for_disksave)
408 {
409         Lisp_Coding_System *c = (Lisp_Coding_System *) header;
410         /* Since coding systems never go away, this function is not
411            necessary.  But it would be necessary if we changed things
412            so that coding systems could go away. */
413         if (!for_disksave) {    /* see comment in lstream.c */
414                 switch (CODING_SYSTEM_TYPE(c)) {
415 #ifdef MULE
416                 case CODESYS_ISO2022:
417                         if (c->iso2022.input_conv) {
418                                 Dynarr_free(c->iso2022.input_conv);
419                                 c->iso2022.input_conv = 0;
420                         }
421                         if (c->iso2022.output_conv) {
422                                 Dynarr_free(c->iso2022.output_conv);
423                                 c->iso2022.output_conv = 0;
424                         }
425                         break;
426
427                         /* list the rest of them lot explicitly */
428                 case CODESYS_AUTODETECT:
429                 case CODESYS_SHIFT_JIS:
430                 case CODESYS_BIG5:
431                 case CODESYS_UCS4:
432                 case CODESYS_UTF8:
433                 case CODESYS_CCL:
434                 case CODESYS_NO_CONVERSION:
435 #ifdef DEBUG_SXEMACS
436                 case CODESYS_INTERNAL:
437 #endif
438 #endif                          /* MULE */
439                 default:
440                         break;
441                 }
442         }
443 }
444
445 static eol_type_t symbol_to_eol_type(Lisp_Object symbol)
446 {
447         CHECK_SYMBOL(symbol);
448         if (NILP(symbol))
449                 return EOL_AUTODETECT;
450         if (EQ(symbol, Qlf))
451                 return EOL_LF;
452         if (EQ(symbol, Qcrlf))
453                 return EOL_CRLF;
454         if (EQ(symbol, Qcr))
455                 return EOL_CR;
456
457         signal_simple_error("Unrecognized eol type", symbol);
458         return EOL_AUTODETECT;  /* not reached */
459 }
460
461 static Lisp_Object eol_type_to_symbol(eol_type_t type)
462 {
463         switch (type) {
464         default:
465                 abort();
466                 break;
467         case EOL_LF:
468                 return Qlf;
469         case EOL_CRLF:
470                 return Qcrlf;
471         case EOL_CR:
472                 return Qcr;
473         case EOL_AUTODETECT:
474                 return Qnil;
475         }
476 }
477
478 static void setup_eol_coding_systems(Lisp_Coding_System * codesys)
479 {
480         Lisp_Object codesys_obj;
481         int len = string_length(XSYMBOL(CODING_SYSTEM_NAME(codesys))->name);
482         char *codesys_name = (char *)alloca(len + 7);
483         int mlen = -1;
484         char *codesys_mnemonic = 0;
485
486         Lisp_Object codesys_name_sym, sub_codesys_obj;
487
488         /* kludge */
489
490         XSETCODING_SYSTEM(codesys_obj, codesys);
491
492         memcpy(codesys_name,
493                string_data(XSYMBOL(CODING_SYSTEM_NAME(codesys))->name), len);
494
495         if (STRINGP(CODING_SYSTEM_MNEMONIC(codesys))) {
496                 mlen = XSTRING_LENGTH(CODING_SYSTEM_MNEMONIC(codesys));
497                 codesys_mnemonic = (char *)alloca(mlen + 7);
498                 memcpy(codesys_mnemonic,
499                        XSTRING_DATA(CODING_SYSTEM_MNEMONIC(codesys)), mlen);
500         }
501 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
502   strcpy (codesys_name + len, "-" op_sys);                                      \
503   if (mlen != -1)                                                               \
504     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
505   codesys_name_sym = intern (codesys_name);                                     \
506   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
507   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
508   if (mlen != -1)                                                               \
509     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
510       build_string (codesys_mnemonic);                                          \
511   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
512 } while (0)
513
514         DEFINE_SUB_CODESYS("unix", "", EOL_LF);
515         DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
516         DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
517 }
518
519 DEFUN("coding-system-p", Fcoding_system_p, 1, 1, 0,     /*
520 Return t if OBJECT is a coding system.
521 A coding system is an object that defines how text containing multiple
522 character sets is encoded into a stream of (typically 8-bit) bytes.
523 The coding system is used to decode the stream into a series of
524 characters (which may be from multiple charsets) when the text is read
525 from a file or process, and is used to encode the text back into the
526 same format when it is written out to a file or process.
527
528 For example, many ISO2022-compliant coding systems (such as Compound
529 Text, which is used for inter-client data under the X Window System)
530 use escape sequences to switch between different charsets -- Japanese
531 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
532 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
533 `make-coding-system' for more information.
534
535 Coding systems are normally identified using a symbol, and the
536 symbol is accepted in place of the actual coding system object whenever
537 a coding system is called for. (This is similar to how faces work.)
538 */
539       (object))
540 {
541         return CODING_SYSTEMP(object) ? Qt : Qnil;
542 }
543
544 DEFUN("find-coding-system", Ffind_coding_system, 1, 1, 0,       /*
545 Retrieve the coding system of the given name.
546
547 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
548 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
549 If there is no such coding system, nil is returned.  Otherwise the
550 associated coding system object is returned.
551 */
552       (coding_system_or_name))
553 {
554         if (NILP(coding_system_or_name))
555                 coding_system_or_name = Qbinary;
556         else if (CODING_SYSTEMP(coding_system_or_name))
557                 return coding_system_or_name;
558         else
559                 CHECK_SYMBOL(coding_system_or_name);
560
561         while (1) {
562                 coding_system_or_name =
563                     Fgethash(coding_system_or_name, Vcoding_system_hash_table,
564                              Qnil);
565
566                 if (CODING_SYSTEMP(coding_system_or_name)
567                     || NILP(coding_system_or_name))
568                         return coding_system_or_name;
569         }
570 }
571
572 DEFUN("get-coding-system", Fget_coding_system, 1, 1, 0, /*
573 Retrieve the coding system of the given name.
574 Same as `find-coding-system' except that if there is no such
575 coding system, an error is signaled instead of returning nil.
576 */
577       (name))
578 {
579         Lisp_Object coding_system = Ffind_coding_system(name);
580
581         if (NILP(coding_system))
582                 signal_simple_error("No such coding system", name);
583         return coding_system;
584 }
585
586 /* We store the coding systems in hash tables with the names as the key and the
587    actual coding system object as the value.  Occasionally we need to use them
588    in a list format.  These routines provide us with that. */
589 struct coding_system_list_closure {
590         Lisp_Object *coding_system_list;
591 };
592
593 static int
594 add_coding_system_to_list_mapper(Lisp_Object key, Lisp_Object value,
595                                  void *coding_system_list_closure)
596 {
597         /* This function can GC */
598         struct coding_system_list_closure *cscl =
599             (struct coding_system_list_closure *)coding_system_list_closure;
600         Lisp_Object *coding_system_list = cscl->coding_system_list;
601
602         *coding_system_list = Fcons(key, *coding_system_list);
603         return 0;
604 }
605
606 DEFUN("coding-system-list", Fcoding_system_list, 0, 0, 0,       /*
607 Return a list of the names of all defined coding systems.
608 */
609       ())
610 {
611         Lisp_Object coding_system_list = Qnil;
612         struct gcpro gcpro1;
613         struct coding_system_list_closure coding_system_list_closure;
614
615         GCPRO1(coding_system_list);
616         coding_system_list_closure.coding_system_list = &coding_system_list;
617         elisp_maphash(add_coding_system_to_list_mapper,
618                       Vcoding_system_hash_table, &coding_system_list_closure);
619         UNGCPRO;
620
621         return coding_system_list;
622 }
623
624 DEFUN("coding-system-name", Fcoding_system_name, 1, 1, 0,       /*
625 Return the name of the given coding system.
626 */
627       (coding_system))
628 {
629         coding_system = Fget_coding_system(coding_system);
630         return XCODING_SYSTEM_NAME(coding_system);
631 }
632
633 static Lisp_Coding_System *allocate_coding_system(enum coding_system_type type,
634                                                   Lisp_Object name)
635 {
636         Lisp_Coding_System *codesys =
637             alloc_lcrecord_type(Lisp_Coding_System, &lrecord_coding_system);
638
639         zero_lcrecord(codesys);
640         CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys) = Qnil;
641         CODING_SYSTEM_POST_READ_CONVERSION(codesys) = Qnil;
642         CODING_SYSTEM_EOL_TYPE(codesys) = EOL_AUTODETECT;
643         CODING_SYSTEM_EOL_CRLF(codesys) = Qnil;
644         CODING_SYSTEM_EOL_CR(codesys) = Qnil;
645         CODING_SYSTEM_EOL_LF(codesys) = Qnil;
646         CODING_SYSTEM_TYPE(codesys) = type;
647         CODING_SYSTEM_MNEMONIC(codesys) = Qnil;
648 #ifdef MULE
649         if (type == CODESYS_ISO2022) {
650                 int i;
651                 for (i = 0; i < 4; i++)
652                         CODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, i) =
653                             Qnil;
654         } else if (type == CODESYS_CCL) {
655                 CODING_SYSTEM_CCL_DECODE(codesys) = Qnil;
656                 CODING_SYSTEM_CCL_ENCODE(codesys) = Qnil;
657         }
658 #endif                          /* MULE */
659         CODING_SYSTEM_NAME(codesys) = name;
660
661         return codesys;
662 }
663
664 #ifdef MULE
665 /* Given a list of charset conversion specs as specified in a Lisp
666    program, parse it into STORE_HERE. */
667
668 static void
669 parse_charset_conversion_specs(charset_conversion_spec_dynarr * store_here,
670                                Lisp_Object spec_list)
671 {
672         Lisp_Object rest;
673
674         EXTERNAL_LIST_LOOP(rest, spec_list) {
675                 Lisp_Object car = XCAR(rest);
676                 Lisp_Object from, to;
677                 struct charset_conversion_spec spec;
678
679                 if (!CONSP(car) || !CONSP(XCDR(car)) || !NILP(XCDR(XCDR(car))))
680                         signal_simple_error("Invalid charset conversion spec",
681                                             car);
682                 from = Fget_charset(XCAR(car));
683                 to = Fget_charset(XCAR(XCDR(car)));
684                 if (XCHARSET_TYPE(from) != XCHARSET_TYPE(to))
685                         signal_simple_error_2
686                             ("Attempted conversion between different charset types",
687                              from, to);
688                 spec.from_charset = from;
689                 spec.to_charset = to;
690
691                 Dynarr_add(store_here, spec);
692         }
693 }
694
695 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
696    specs, return the equivalent as the Lisp programmer would see it.
697
698    If LOAD_HERE is 0, return Qnil. */
699
700 static Lisp_Object
701 unparse_charset_conversion_specs(charset_conversion_spec_dynarr * load_here)
702 {
703         int i;
704         Lisp_Object result;
705
706         if (!load_here)
707                 return Qnil;
708         for (i = 0, result = Qnil; i < Dynarr_length(load_here); i++) {
709                 struct charset_conversion_spec *ccs = Dynarr_atp(load_here, i);
710                 result =
711                     Fcons(list2(ccs->from_charset, ccs->to_charset), result);
712         }
713
714         return Fnreverse(result);
715 }
716
717 #endif                          /* MULE */
718
719 DEFUN("make-coding-system", Fmake_coding_system, 2, 4, 0,       /*
720 Register symbol NAME as a coding system.
721
722 TYPE describes the conversion method used and should be one of
723
724 nil or 'undecided
725 Automatic conversion.  SXEmacs attempts to detect the coding system
726 used in the file.
727 'no-conversion
728 No conversion.  Use this for binary files and such.  On output,
729 graphic characters that are not in ASCII or Latin-1 will be
730 replaced by a ?. (For a no-conversion-encoded buffer, these
731 characters will only be present if you explicitly insert them.)
732 'shift-jis
733 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
734 'ucs-4
735 ISO 10646 UCS-4 encoding.
736 'utf-8
737 ISO 10646 UTF-8 encoding.
738 'iso2022
739 Any ISO2022-compliant encoding.  Among other things, this includes
740 JIS (the Japanese encoding commonly used for e-mail), EUC (the
741 standard Unix encoding for Japanese and other languages), and
742 Compound Text (the encoding used in X11).  You can specify more
743 specific information about the conversion with the PROPS argument.
744 'big5
745 Big5 (the encoding commonly used for Taiwanese).
746 'ccl
747 The conversion is performed using a user-written pseudo-code
748 program.  CCL (Code Conversion Language) is the name of this
749 pseudo-code.
750 'internal
751 Write out or read in the raw contents of the memory representing
752 the buffer's text.  This is primarily useful for debugging
753 purposes, and is only enabled when SXEmacs has been compiled with
754 DEBUG_SXEMACS defined (via the --debug configure option).
755 WARNING: Reading in a file using 'internal conversion can result
756 in an internal inconsistency in the memory representing a
757 buffer's text, which will produce unpredictable results and may
758 cause SXEmacs to crash.  Under normal circumstances you should
759 never use 'internal conversion.
760
761 DOC-STRING is a string describing the coding system.
762
763 PROPS is a property list, describing the specific nature of the
764 character set.  Recognized properties are:
765
766 'mnemonic
767 String to be displayed in the modeline when this coding system is
768 active.
769
770 'eol-type
771 End-of-line conversion to be used.  It should be one of
772
773 nil
774 Automatically detect the end-of-line type (LF, CRLF,
775 or CR).  Also generate subsidiary coding systems named
776 `NAME-unix', `NAME-dos', and `NAME-mac', that are
777 identical to this coding system but have an EOL-TYPE
778 value of 'lf, 'crlf, and 'cr, respectively.
779 'lf
780 The end of a line is marked externally using ASCII LF.
781 Since this is also the way that SXEmacs represents an
782 end-of-line internally, specifying this option results
783 in no end-of-line conversion.  This is the standard
784 format for Unix text files.
785 'crlf
786 The end of a line is marked externally using ASCII
787 CRLF.  This is the standard format for MS-DOS text
788 files.
789 'cr
790 The end of a line is marked externally using ASCII CR.
791 This is the standard format for Macintosh text files.
792 t
793 Automatically detect the end-of-line type but do not
794 generate subsidiary coding systems.  (This value is
795 converted to nil when stored internally, and
796 `coding-system-property' will return nil.)
797
798 'post-read-conversion
799 Function called after a file has been read in, to perform the
800 decoding.  Called with two arguments, START and END, denoting
801 a region of the current buffer to be decoded.
802
803 'pre-write-conversion
804 Function called before a file is written out, to perform the
805 encoding.  Called with two arguments, START and END, denoting
806 a region of the current buffer to be encoded.
807
808 The following additional properties are recognized if TYPE is 'iso2022:
809
810 'charset-g0
811 'charset-g1
812 'charset-g2
813 'charset-g3
814 The character set initially designated to the G0 - G3 registers.
815 The value should be one of
816
817 -- A charset object (designate that character set)
818 -- nil (do not ever use this register)
819 -- t (no character set is initially designated to
820 the register, but may be later on; this automatically
821 sets the corresponding `force-g*-on-output' property)
822
823 'force-g0-on-output
824 'force-g1-on-output
825 'force-g2-on-output
826 'force-g2-on-output
827 If non-nil, send an explicit designation sequence on output before
828 using the specified register.
829
830 'short
831 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
832 "ESC $ B" on output in place of the full designation sequences
833 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
834
835 'no-ascii-eol
836 If non-nil, don't designate ASCII to G0 at each end of line on output.
837 Setting this to non-nil also suppresses other state-resetting that
838 normally happens at the end of a line.
839
840 'no-ascii-cntl
841 If non-nil, don't designate ASCII to G0 before control chars on output.
842
843 'seven
844 If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
845 environment.
846
847 'lock-shift
848 If non-nil, use locking-shift (SO/SI) instead of single-shift
849 or designation by escape sequence.
850
851 'no-iso6429
852 If non-nil, don't use ISO6429's direction specification.
853
854 'escape-quoted
855 If non-nil, literal control characters that are the same as
856 the beginning of a recognized ISO2022 or ISO6429 escape sequence
857 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
858 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
859 so that they can be properly distinguished from an escape sequence.
860 (Note that doing this results in a non-portable encoding.) This
861 encoding flag is used for byte-compiled files.  Note that ESC
862 is a good choice for a quoting character because there are no
863 escape sequences whose second byte is a character from the Control-0
864 or Control-1 character sets; this is explicitly disallowed by the
865 ISO2022 standard.
866
867 'input-charset-conversion
868 A list of conversion specifications, specifying conversion of
869 characters in one charset to another when decoding is performed.
870 Each specification is a list of two elements: the source charset,
871 and the destination charset.
872
873 'output-charset-conversion
874 A list of conversion specifications, specifying conversion of
875 characters in one charset to another when encoding is performed.
876 The form of each specification is the same as for
877 'input-charset-conversion.
878
879 The following additional properties are recognized (and required)
880 if TYPE is 'ccl:
881
882 'decode
883 CCL program used for decoding (converting to internal format).
884
885 'encode
886 CCL program used for encoding (converting to external format).
887 */
888       (name, type, doc_string, props))
889 {
890         Lisp_Coding_System *codesys;
891         enum coding_system_type ty;
892         int need_to_setup_eol_systems = 1;
893
894         /* Convert type to constant */
895         if (NILP(type) || EQ(type, Qundecided)) {
896                 ty = CODESYS_AUTODETECT;
897         }
898 #ifdef MULE
899         else if (EQ(type, Qshift_jis)) {
900                 ty = CODESYS_SHIFT_JIS;
901         } else if (EQ(type, Qiso2022)) {
902                 ty = CODESYS_ISO2022;
903         } else if (EQ(type, Qbig5)) {
904                 ty = CODESYS_BIG5;
905         } else if (EQ(type, Qucs4)) {
906                 ty = CODESYS_UCS4;
907         } else if (EQ(type, Qutf8)) {
908                 ty = CODESYS_UTF8;
909         } else if (EQ(type, Qccl)) {
910                 ty = CODESYS_CCL;
911         }
912 #endif
913         else if (EQ(type, Qno_conversion)) {
914                 ty = CODESYS_NO_CONVERSION;
915         }
916 #ifdef DEBUG_SXEMACS
917         else if (EQ(type, Qinternal)) {
918                 ty = CODESYS_INTERNAL;
919         }
920 #endif
921         else
922                 signal_simple_error("Invalid coding system type", type);
923
924         CHECK_SYMBOL(name);
925
926         codesys = allocate_coding_system(ty, name);
927
928         if (NILP(doc_string))
929                 doc_string = build_string("");
930         else
931                 CHECK_STRING(doc_string);
932         CODING_SYSTEM_DOC_STRING(codesys) = doc_string;
933
934         {
935                 EXTERNAL_PROPERTY_LIST_LOOP_3(key, value, props) {
936                         if (EQ(key, Qmnemonic)) {
937                                 if (!NILP(value))
938                                         CHECK_STRING(value);
939                                 CODING_SYSTEM_MNEMONIC(codesys) = value;
940                         }
941
942                         else if (EQ(key, Qeol_type)) {
943                                 need_to_setup_eol_systems = NILP(value);
944                                 if (EQ(value, Qt))
945                                         value = Qnil;
946                                 CODING_SYSTEM_EOL_TYPE(codesys) =
947                                     symbol_to_eol_type(value);
948                         }
949
950                         else if (EQ(key, Qpost_read_conversion))
951                                 CODING_SYSTEM_POST_READ_CONVERSION(codesys) =
952                                     value;
953                         else if (EQ(key, Qpre_write_conversion))
954                                 CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys) =
955                                     value;
956 #ifdef MULE
957                         else if (ty == CODESYS_ISO2022) {
958 #define FROB_INITIAL_CHARSET(charset_num) \
959   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
960     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
961
962                                 if (EQ(key, Qcharset_g0))
963                                         FROB_INITIAL_CHARSET(0);
964                                 else if (EQ(key, Qcharset_g1))
965                                         FROB_INITIAL_CHARSET(1);
966                                 else if (EQ(key, Qcharset_g2))
967                                         FROB_INITIAL_CHARSET(2);
968                                 else if (EQ(key, Qcharset_g3))
969                                         FROB_INITIAL_CHARSET(3);
970
971 #define FROB_FORCE_CHARSET(charset_num) \
972   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
973
974                                 else if (EQ(key, Qforce_g0_on_output))
975                                         FROB_FORCE_CHARSET(0);
976                                 else if (EQ(key, Qforce_g1_on_output))
977                                         FROB_FORCE_CHARSET(1);
978                                 else if (EQ(key, Qforce_g2_on_output))
979                                         FROB_FORCE_CHARSET(2);
980                                 else if (EQ(key, Qforce_g3_on_output))
981                                         FROB_FORCE_CHARSET(3);
982
983 #define FROB_BOOLEAN_PROPERTY(prop) \
984   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
985
986                                 else if (EQ(key, Qshort))
987                                         FROB_BOOLEAN_PROPERTY(SHORT);
988                                 else if (EQ(key, Qno_ascii_eol))
989                                         FROB_BOOLEAN_PROPERTY(NO_ASCII_EOL);
990                                 else if (EQ(key, Qno_ascii_cntl))
991                                         FROB_BOOLEAN_PROPERTY(NO_ASCII_CNTL);
992                                 else if (EQ(key, Qseven))
993                                         FROB_BOOLEAN_PROPERTY(SEVEN);
994                                 else if (EQ(key, Qlock_shift))
995                                         FROB_BOOLEAN_PROPERTY(LOCK_SHIFT);
996                                 else if (EQ(key, Qno_iso6429))
997                                         FROB_BOOLEAN_PROPERTY(NO_ISO6429);
998                                 else if (EQ(key, Qescape_quoted))
999                                         FROB_BOOLEAN_PROPERTY(ESCAPE_QUOTED);
1000
1001                                 else if (EQ(key, Qinput_charset_conversion)) {
1002                                         codesys->iso2022.input_conv =
1003                                             Dynarr_new(charset_conversion_spec);
1004                                         parse_charset_conversion_specs(codesys->
1005                                                                        iso2022.
1006                                                                        input_conv,
1007                                                                        value);
1008                                 } else if (EQ(key, Qoutput_charset_conversion)) {
1009                                         codesys->iso2022.output_conv =
1010                                             Dynarr_new(charset_conversion_spec);
1011                                         parse_charset_conversion_specs(codesys->
1012                                                                        iso2022.
1013                                                                        output_conv,
1014                                                                        value);
1015                                 } else
1016                                         signal_simple_error
1017                                             ("Unrecognized property", key);
1018                         } else if (EQ(type, Qccl)) {
1019                                 Lisp_Object sym;
1020                                 struct ccl_program test_ccl;
1021                                 Extbyte *suffix;
1022
1023                                 /* Check key first.  */
1024                                 if (EQ(key, Qdecode))
1025                                         suffix = "-ccl-decode";
1026                                 else if (EQ(key, Qencode))
1027                                         suffix = "-ccl-encode";
1028                                 else
1029                                         signal_simple_error
1030                                             ("Unrecognized property", key);
1031
1032                                 /* If value is vector, register it as a ccl program
1033                                    associated with an newly created symbol for
1034                                    backward compatibility.  */
1035                                 if (VECTORP(value)) {
1036                                         sym =
1037                                             Fintern(concat2
1038                                                     (Fsymbol_name(name),
1039                                                      build_string(suffix)),
1040                                                     Qnil);
1041                                         Fregister_ccl_program(sym, value);
1042                                 } else {
1043                                         CHECK_SYMBOL(value);
1044                                         sym = value;
1045                                 }
1046                                 /* check if the given ccl programs are valid.  */
1047                                 if (setup_ccl_program(&test_ccl, sym) < 0)
1048                                         signal_simple_error
1049                                             ("Invalid CCL program", value);
1050
1051                                 if (EQ(key, Qdecode))
1052                                         CODING_SYSTEM_CCL_DECODE(codesys) = sym;
1053                                 else if (EQ(key, Qencode))
1054                                         CODING_SYSTEM_CCL_ENCODE(codesys) = sym;
1055
1056                         }
1057 #endif                          /* MULE */
1058                         else
1059                                 signal_simple_error("Unrecognized property",
1060                                                     key);
1061                 }
1062         }
1063
1064         if (need_to_setup_eol_systems)
1065                 setup_eol_coding_systems(codesys);
1066
1067         {
1068                 Lisp_Object codesys_obj;
1069                 XSETCODING_SYSTEM(codesys_obj, codesys);
1070                 Fputhash(name, codesys_obj, Vcoding_system_hash_table);
1071                 return codesys_obj;
1072         }
1073 }
1074
1075 DEFUN("copy-coding-system", Fcopy_coding_system, 2, 2, 0,       /*
1076 Copy OLD-CODING-SYSTEM to NEW-NAME.
1077 If NEW-NAME does not name an existing coding system, a new one will
1078 be created.
1079 */
1080       (old_coding_system, new_name))
1081 {
1082         Lisp_Object new_coding_system;
1083         old_coding_system = Fget_coding_system(old_coding_system);
1084         new_coding_system = Ffind_coding_system(new_name);
1085         if (NILP(new_coding_system)) {
1086                 XSETCODING_SYSTEM(new_coding_system,
1087                                   allocate_coding_system
1088                                   (XCODING_SYSTEM_TYPE(old_coding_system),
1089                                    new_name));
1090                 Fputhash(new_name, new_coding_system,
1091                          Vcoding_system_hash_table);
1092         }
1093
1094         {
1095                 Lisp_Coding_System *to = XCODING_SYSTEM(new_coding_system);
1096                 Lisp_Coding_System *from = XCODING_SYSTEM(old_coding_system);
1097                 memcpy(((char *)to) + sizeof(to->header),
1098                        ((char *)from) + sizeof(from->header),
1099                        sizeof(*from) - sizeof(from->header));
1100                 to->name = new_name;
1101         }
1102         return new_coding_system;
1103 }
1104
1105 DEFUN("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0,       /*
1106 Return t if OBJECT names a coding system, and is not a coding system alias.
1107 */
1108       (object))
1109 {
1110         Lisp_Object val = Fgethash(object, Vcoding_system_hash_table, Qnil);
1111         return CODING_SYSTEMP(val) ? Qt : Qnil;
1112 }
1113
1114 DEFUN("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1115  Return t if OBJECT is a coding system alias.
1116 All coding system aliases are created by `define-coding-system-alias'.
1117 */
1118       (object))
1119 {
1120         Lisp_Object val = Fgethash(object, Vcoding_system_hash_table, Qzero);
1121         return SYMBOLP(val) ? Qt : Qnil;
1122 }
1123
1124 DEFUN("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1125 Return the coding-system symbol for which symbol ALIAS is an alias.
1126 */
1127       (alias))
1128 {
1129         Lisp_Object aliasee = Fgethash(alias, Vcoding_system_hash_table, Qnil);
1130         if (SYMBOLP(aliasee)) {
1131                 return aliasee;
1132         } else {
1133                 signal_simple_error(
1134                         "Symbol is not a coding system alias", alias);
1135         }
1136         /* To keep the compiler happy */
1137         return Qnil;
1138 }
1139
1140 static Lisp_Object
1141 append_suffix_to_symbol(Lisp_Object symbol, char *ascii_string)
1142 {
1143         return Fintern(concat2(Fsymbol_name(symbol),
1144                                build_string(ascii_string)), Qnil);
1145 }
1146
1147 /* A maphash function, for removing dangling coding system aliases. */
1148 static int
1149 dangling_coding_system_alias_p(Lisp_Object alias,
1150                                Lisp_Object aliasee, void *dangling_aliases)
1151 {
1152         if (SYMBOLP(aliasee)
1153             && NILP(Fgethash(aliasee, Vcoding_system_hash_table, Qnil))) {
1154                 (*(int *)dangling_aliases)++;
1155                 return 1;
1156         } else {
1157                 return 0;
1158         }
1159 }
1160
1161 DEFUN("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0,       /*
1162 Define symbol ALIAS as an alias for coding system ALIASEE.
1163
1164 You can use this function to redefine an alias that has already been defined,
1165 but you cannot redefine a name which is the canonical name for a coding system.
1166 \(a canonical name of a coding system is what is returned when you call
1167 `coding-system-name' on a coding system).
1168
1169 ALIASEE itself can be an alias, which allows you to define nested aliases.
1170
1171 You are forbidden, however, from creating alias loops or `dangling' aliases.
1172 These will be detected, and an error will be signaled if you attempt to do so.
1173
1174 If ALIASEE is nil, then ALIAS will simply be undefined.
1175
1176 See also `coding-system-alias-p', `coding-system-aliasee',
1177 and `coding-system-canonical-name-p'.
1178 */
1179       (alias, aliasee))
1180 {
1181         Lisp_Object real_coding_system, probe;
1182
1183         CHECK_SYMBOL(alias);
1184
1185         if (!NILP(Fcoding_system_canonical_name_p(alias)))
1186                 signal_simple_error
1187                     ("Symbol is the canonical name of a coding system and cannot be redefined",
1188                      alias);
1189
1190         if (NILP(aliasee)) {
1191                 Lisp_Object subsidiary_unix =
1192                     append_suffix_to_symbol(alias, "-unix");
1193                 Lisp_Object subsidiary_dos =
1194                     append_suffix_to_symbol(alias, "-dos");
1195                 Lisp_Object subsidiary_mac =
1196                     append_suffix_to_symbol(alias, "-mac");
1197
1198                 Fremhash(alias, Vcoding_system_hash_table);
1199
1200                 /* Undefine subsidiary aliases,
1201                    presumably created by a previous call to this function */
1202                 if (!NILP(Fcoding_system_alias_p(subsidiary_unix)) &&
1203                     !NILP(Fcoding_system_alias_p(subsidiary_dos)) &&
1204                     !NILP(Fcoding_system_alias_p(subsidiary_mac))) {
1205                         Fdefine_coding_system_alias(subsidiary_unix, Qnil);
1206                         Fdefine_coding_system_alias(subsidiary_dos, Qnil);
1207                         Fdefine_coding_system_alias(subsidiary_mac, Qnil);
1208                 }
1209
1210                 /* Undefine dangling coding system aliases. */
1211                 {
1212                         int dangling_aliases;
1213
1214                         do {
1215                                 dangling_aliases = 0;
1216                                 elisp_map_remhash
1217                                     (dangling_coding_system_alias_p,
1218                                      Vcoding_system_hash_table,
1219                                      &dangling_aliases);
1220                         } while (dangling_aliases > 0);
1221                 }
1222
1223                 return Qnil;
1224         }
1225
1226         if (CODING_SYSTEMP(aliasee))
1227                 aliasee = XCODING_SYSTEM_NAME(aliasee);
1228
1229         /* Checks that aliasee names a coding-system */
1230         real_coding_system = Fget_coding_system(aliasee);
1231
1232         /* Check for coding system alias loops */
1233         if (EQ(alias, aliasee))
1234               alias_loop:signal_simple_error_2
1235                     ("Attempt to create a coding system alias loop", alias,
1236                      aliasee);
1237
1238         for (probe = aliasee;
1239              SYMBOLP(probe);
1240              probe = Fgethash(probe, Vcoding_system_hash_table, Qzero)) {
1241                 if (EQ(probe, alias))
1242                         goto alias_loop;
1243         }
1244
1245         Fputhash(alias, aliasee, Vcoding_system_hash_table);
1246
1247         /* Set up aliases for subsidiaries.
1248            #### There must be a better way to handle subsidiary coding
1249            #### systems. */
1250         {
1251                 static char *suffixes[] = { "-unix", "-dos", "-mac" };
1252
1253                 for (int i = 0; i < countof(suffixes); i++) {
1254                         Lisp_Object alias_subsidiary =
1255                                 append_suffix_to_symbol(alias, suffixes[i]);
1256                         Lisp_Object aliasee_subsidiary =
1257                                 append_suffix_to_symbol(aliasee, suffixes[i]);
1258
1259                         if (!NILP(Ffind_coding_system(aliasee_subsidiary))) {
1260                                 Fdefine_coding_system_alias(alias_subsidiary,
1261                                                             aliasee_subsidiary);
1262                         }
1263                 }
1264         }
1265         /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1266            but it doesn't look intentional, so I'd rather return something
1267            meaningful or nothing at all. */
1268         return Qnil;
1269 }
1270
1271 static Lisp_Object
1272 subsidiary_coding_system(Lisp_Object coding_system, eol_type_t type)
1273 {
1274         Lisp_Coding_System *cs = XCODING_SYSTEM(coding_system);
1275         Lisp_Object new_coding_system;
1276
1277         if (CODING_SYSTEM_EOL_TYPE(cs) != EOL_AUTODETECT)
1278                 return coding_system;
1279
1280         switch (type) {
1281         case EOL_AUTODETECT:
1282                 return coding_system;
1283         case EOL_LF:
1284                 new_coding_system = CODING_SYSTEM_EOL_LF(cs);
1285                 break;
1286         case EOL_CR:
1287                 new_coding_system = CODING_SYSTEM_EOL_CR(cs);
1288                 break;
1289         case EOL_CRLF:
1290                 new_coding_system = CODING_SYSTEM_EOL_CRLF(cs);
1291                 break;
1292         default:
1293                 abort();
1294                 return Qnil;
1295         }
1296
1297         return NILP(new_coding_system) ? coding_system : new_coding_system;
1298 }
1299
1300 DEFUN("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0,   /*
1301 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1302 */
1303       (coding_system, eol_type))
1304 {
1305         coding_system = Fget_coding_system(coding_system);
1306
1307         return subsidiary_coding_system(coding_system,
1308                                         symbol_to_eol_type(eol_type));
1309 }
1310 \f
1311 /************************************************************************/
1312 /*                         Coding system accessors                      */
1313 /************************************************************************/
1314
1315 DEFUN("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0,   /*
1316 Return the doc string for CODING-SYSTEM.
1317 */
1318       (coding_system))
1319 {
1320         coding_system = Fget_coding_system(coding_system);
1321         return XCODING_SYSTEM_DOC_STRING(coding_system);
1322 }
1323
1324 DEFUN("coding-system-type", Fcoding_system_type, 1, 1, 0,       /*
1325 Return the type of CODING-SYSTEM.
1326 */
1327       (coding_system))
1328 {
1329         Lisp_Object tmp = Fget_coding_system(coding_system);
1330
1331         switch (XCODING_SYSTEM_TYPE(tmp)) {
1332         default:
1333                 abort();
1334                 break;
1335         case CODESYS_AUTODETECT:
1336                 return Qundecided;
1337 #ifdef MULE
1338         case CODESYS_SHIFT_JIS:
1339                 return Qshift_jis;
1340         case CODESYS_ISO2022:
1341                 return Qiso2022;
1342         case CODESYS_BIG5:
1343                 return Qbig5;
1344         case CODESYS_UCS4:
1345                 return Qucs4;
1346         case CODESYS_UTF8:
1347                 return Qutf8;
1348         case CODESYS_CCL:
1349                 return Qccl;
1350 #endif
1351         case CODESYS_NO_CONVERSION:
1352                 return Qno_conversion;
1353 #ifdef DEBUG_SXEMACS
1354         case CODESYS_INTERNAL:
1355                 return Qinternal;
1356 #endif
1357         }
1358 }
1359
1360 #ifdef MULE
1361 static
1362 Lisp_Object coding_system_charset(Lisp_Object coding_system, int gnum)
1363 {
1364         Lisp_Object cs
1365             = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET(coding_system, gnum);
1366
1367         return CHARSETP(cs) ? XCHARSET_NAME(cs) : Qnil;
1368 }
1369
1370 DEFUN("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1371 Return initial charset of CODING-SYSTEM designated to GNUM.
1372 GNUM allows 0 .. 3.
1373 */
1374       (coding_system, gnum))
1375 {
1376         coding_system = Fget_coding_system(coding_system);
1377         CHECK_INT(gnum);
1378
1379         return coding_system_charset(coding_system, XINT(gnum));
1380 }
1381 #endif                          /* MULE */
1382
1383 DEFUN("coding-system-property", Fcoding_system_property, 2, 2, 0,       /*
1384 Return the PROP property of CODING-SYSTEM.
1385 */
1386       (coding_system, prop))
1387 {
1388         int i, ok = 0;
1389         enum coding_system_type type;
1390
1391         coding_system = Fget_coding_system(coding_system);
1392         CHECK_SYMBOL(prop);
1393         type = XCODING_SYSTEM_TYPE(coding_system);
1394
1395         for (i = 0; !ok && i < Dynarr_length(the_codesys_prop_dynarr); i++)
1396                 if (EQ(Dynarr_at(the_codesys_prop_dynarr, i).sym, prop)) {
1397                         ok = 1;
1398                         switch (Dynarr_at(the_codesys_prop_dynarr, i).prop_type) {
1399                         case CODESYS_PROP_ALL_OK:
1400                                 break;
1401 #ifdef MULE
1402                         case CODESYS_PROP_ISO2022:
1403                                 if (type != CODESYS_ISO2022)
1404                                         signal_simple_error
1405                                             ("Property only valid in ISO2022 coding systems",
1406                                              prop);
1407                                 break;
1408
1409                         case CODESYS_PROP_CCL:
1410                                 if (type != CODESYS_CCL)
1411                                         signal_simple_error
1412                                             ("Property only valid in CCL coding systems",
1413                                              prop);
1414                                 break;
1415 #endif                          /* MULE */
1416                         default:
1417                                 abort();
1418                         }
1419                 }
1420
1421         if (!ok)
1422                 signal_simple_error("Unrecognized property", prop);
1423
1424         if (EQ(prop, Qname))
1425                 return XCODING_SYSTEM_NAME(coding_system);
1426         else if (EQ(prop, Qtype))
1427                 return Fcoding_system_type(coding_system);
1428         else if (EQ(prop, Qdoc_string))
1429                 return XCODING_SYSTEM_DOC_STRING(coding_system);
1430         else if (EQ(prop, Qmnemonic))
1431                 return XCODING_SYSTEM_MNEMONIC(coding_system);
1432         else if (EQ(prop, Qeol_type))
1433                 return
1434                     eol_type_to_symbol(XCODING_SYSTEM_EOL_TYPE(coding_system));
1435         else if (EQ(prop, Qeol_lf))
1436                 return XCODING_SYSTEM_EOL_LF(coding_system);
1437         else if (EQ(prop, Qeol_crlf))
1438                 return XCODING_SYSTEM_EOL_CRLF(coding_system);
1439         else if (EQ(prop, Qeol_cr))
1440                 return XCODING_SYSTEM_EOL_CR(coding_system);
1441         else if (EQ(prop, Qpost_read_conversion))
1442                 return XCODING_SYSTEM_POST_READ_CONVERSION(coding_system);
1443         else if (EQ(prop, Qpre_write_conversion))
1444                 return XCODING_SYSTEM_PRE_WRITE_CONVERSION(coding_system);
1445 #ifdef MULE
1446         else if (type == CODESYS_ISO2022) {
1447                 if (EQ(prop, Qcharset_g0))
1448                         return coding_system_charset(coding_system, 0);
1449                 else if (EQ(prop, Qcharset_g1))
1450                         return coding_system_charset(coding_system, 1);
1451                 else if (EQ(prop, Qcharset_g2))
1452                         return coding_system_charset(coding_system, 2);
1453                 else if (EQ(prop, Qcharset_g3))
1454                         return coding_system_charset(coding_system, 3);
1455
1456 #define FORCE_CHARSET(charset_num) \
1457   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1458    (coding_system, charset_num) ? Qt : Qnil)
1459
1460                 else if (EQ(prop, Qforce_g0_on_output))
1461                         return FORCE_CHARSET(0);
1462                 else if (EQ(prop, Qforce_g1_on_output))
1463                         return FORCE_CHARSET(1);
1464                 else if (EQ(prop, Qforce_g2_on_output))
1465                         return FORCE_CHARSET(2);
1466                 else if (EQ(prop, Qforce_g3_on_output))
1467                         return FORCE_CHARSET(3);
1468
1469 #define LISP_BOOLEAN(prop) \
1470   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1471
1472                 else if (EQ(prop, Qshort))
1473                         return LISP_BOOLEAN(SHORT);
1474                 else if (EQ(prop, Qno_ascii_eol))
1475                         return LISP_BOOLEAN(NO_ASCII_EOL);
1476                 else if (EQ(prop, Qno_ascii_cntl))
1477                         return LISP_BOOLEAN(NO_ASCII_CNTL);
1478                 else if (EQ(prop, Qseven))
1479                         return LISP_BOOLEAN(SEVEN);
1480                 else if (EQ(prop, Qlock_shift))
1481                         return LISP_BOOLEAN(LOCK_SHIFT);
1482                 else if (EQ(prop, Qno_iso6429))
1483                         return LISP_BOOLEAN(NO_ISO6429);
1484                 else if (EQ(prop, Qescape_quoted))
1485                         return LISP_BOOLEAN(ESCAPE_QUOTED);
1486
1487                 else if (EQ(prop, Qinput_charset_conversion))
1488                         return
1489                             unparse_charset_conversion_specs
1490                             (XCODING_SYSTEM(coding_system)->iso2022.input_conv);
1491                 else if (EQ(prop, Qoutput_charset_conversion))
1492                         return
1493                             unparse_charset_conversion_specs
1494                             (XCODING_SYSTEM(coding_system)->iso2022.
1495                              output_conv);
1496                 else
1497                         abort();
1498         } else if (type == CODESYS_CCL) {
1499                 if (EQ(prop, Qdecode))
1500                         return XCODING_SYSTEM_CCL_DECODE(coding_system);
1501                 else if (EQ(prop, Qencode))
1502                         return XCODING_SYSTEM_CCL_ENCODE(coding_system);
1503                 else
1504                         abort();
1505         }
1506 #endif                          /* MULE */
1507         else
1508                 abort();
1509
1510         return Qnil;            /* not reached */
1511 }
1512 \f
1513 /************************************************************************/
1514 /*                       Coding category functions                      */
1515 /************************************************************************/
1516
1517 static int decode_coding_category(Lisp_Object symbol)
1518 {
1519         int i;
1520
1521         CHECK_SYMBOL(symbol);
1522         for (i = 0; i < CODING_CATEGORY_LAST; i++)
1523                 if (EQ(coding_category_symbol[i], symbol))
1524                         return i;
1525
1526         signal_simple_error("Unrecognized coding category", symbol);
1527         return 0;               /* not reached */
1528 }
1529
1530 DEFUN("coding-category-list", Fcoding_category_list, 0, 0, 0,   /*
1531 Return a list of all recognized coding categories.
1532 */
1533       ())
1534 {
1535         int i;
1536         Lisp_Object list = Qnil;
1537
1538         for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1539                 list = Fcons(coding_category_symbol[i], list);
1540         return list;
1541 }
1542
1543 DEFUN("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0,   /*
1544 Change the priority order of the coding categories.
1545 LIST should be list of coding categories, in descending order of
1546 priority.  Unspecified coding categories will be lower in priority
1547 than all specified ones, in the same relative order they were in
1548 previously.
1549 */
1550       (list))
1551 {
1552         int category_to_priority[CODING_CATEGORY_LAST];
1553         int i, j;
1554         Lisp_Object rest;
1555
1556         /* First generate a list that maps coding categories to priorities. */
1557
1558         for (i = 0; i < CODING_CATEGORY_LAST; i++)
1559                 category_to_priority[i] = -1;
1560
1561         /* Highest priority comes from the specified list. */
1562         i = 0;
1563         EXTERNAL_LIST_LOOP(rest, list) {
1564                 int cat = decode_coding_category(XCAR(rest));
1565
1566                 if (category_to_priority[cat] >= 0)
1567                         signal_simple_error("Duplicate coding category in list",
1568                                             XCAR(rest));
1569                 category_to_priority[cat] = i++;
1570         }
1571
1572         /* Now go through the existing categories by priority to retrieve
1573            the categories not yet specified and preserve their priority
1574            order. */
1575         for (j = 0; j < CODING_CATEGORY_LAST; j++) {
1576                 int cat = fcd->coding_category_by_priority[j];
1577                 if (category_to_priority[cat] < 0)
1578                         category_to_priority[cat] = i++;
1579         }
1580
1581         /* Now we need to construct the inverse of the mapping we just
1582            constructed. */
1583
1584         for (i = 0; i < CODING_CATEGORY_LAST; i++)
1585                 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1586
1587         /* Phew!  That was confusing. */
1588         return Qnil;
1589 }
1590
1591 DEFUN("coding-priority-list", Fcoding_priority_list, 0, 0, 0,   /*
1592 Return a list of coding categories in descending order of priority.
1593 */
1594       ())
1595 {
1596         int i;
1597         Lisp_Object list = Qnil;
1598
1599         for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1600                 list =
1601                     Fcons(coding_category_symbol
1602                           [fcd->coding_category_by_priority[i]], list);
1603         return list;
1604 }
1605
1606 DEFUN("set-coding-category-system", Fset_coding_category_system, 2, 2, 0,       /*
1607 Change the coding system associated with a coding category.
1608 */
1609       (coding_category, coding_system))
1610 {
1611         int cat = decode_coding_category(coding_category);
1612
1613         coding_system = Fget_coding_system(coding_system);
1614         fcd->coding_category_system[cat] = coding_system;
1615         return Qnil;
1616 }
1617
1618 DEFUN("coding-category-system", Fcoding_category_system, 1, 1, 0,       /*
1619 Return the coding system associated with a coding category.
1620 */
1621       (coding_category))
1622 {
1623         int cat = decode_coding_category(coding_category);
1624         Lisp_Object sys = fcd->coding_category_system[cat];
1625
1626         if (!NILP(sys))
1627                 return XCODING_SYSTEM_NAME(sys);
1628         return Qnil;
1629 }
1630 \f
1631 /************************************************************************/
1632 /*                     Detecting the encoding of data                   */
1633 /************************************************************************/
1634
/* State carried across successive calls to detect_coding_type() while
   the encoding and EOL convention of a byte stream are being guessed. */
struct detection_state {
        /* EOL type found so far, or EOL_AUTODETECT if not yet known. */
        eol_type_t eol_type;
        /* Set by detect_coding_type() once a byte >= 0x80 or a control
           character rejected by acceptable_control_char_p() is seen. */
        int seen_non_ascii;
        /* Combined coding category mask; ~0 while only ASCII was seen. */
        int mask;
#ifdef MULE
        /* Per-detector state.  Each `mask' is set to ~0 by
           detect_coding_type() when the first non-ASCII byte shows up and
           is afterwards replaced by the return value of the matching
           detect_coding_*() routine.  The remaining members are scratch
           data for those routines (defined outside this section). */
        struct {
                int mask;
                int in_second_byte;
        } big5;

        struct {
                int mask;
                int in_second_byte;
        } shift_jis;

        struct {
                int mask;
                int in_byte;
        } ucs4;

        struct {
                int mask;
                int in_byte;
        } utf8;

        struct {
                int mask;
                int initted;
                struct iso2022_decoder iso;
                unsigned int flags;
                int high_byte_count;
                unsigned int saw_single_shift:1;
        } iso2022;
#endif
        /* Bookkeeping for detect_eol_type(). */
        struct {
                /* Nonzero once at least one byte has been examined. */
                int seen_anything;
                /* Nonzero if the previously examined byte was a CR. */
                int just_saw_cr;
        } eol;
};
1674
/* Return 1 if C is a control character that you might reasonably see in
   a text file (and which therefore should not count as evidence of a
   non-ASCII encoding), 0 otherwise. */
static int acceptable_control_char_p(int c)
{
        static const int benign[] = {
                '\r',
                '\n',
                '\t',
                7,              /* bell */
                8,              /* backspace */
                11,             /* vertical tab */
                12,             /* form feed */
                26,             /* MS-DOS C-z junk */
                31              /* '^_' -- for info */
        };
        unsigned int k;

        for (k = 0; k < sizeof(benign) / sizeof(benign[0]); k++) {
                if (c == benign[k]) {
                        return 1;
                }
        }
        return 0;
}
1694
/* Return 1 if MASK has zero or exactly one bit set, 0 otherwise.

   Clearing the lowest set bit with `m & (m - 1)' leaves zero only when
   there was at most one bit to begin with.  The arithmetic is done on
   the unsigned representation so that a mask consisting solely of the
   sign bit does not trigger signed overflow in `mask - 1'. */
static int mask_has_at_most_one_bit_p(int mask)
{
        const unsigned int bits = (unsigned int)mask;

        return (bits & (bits - 1u)) == 0;
}
1701
1702 static eol_type_t
1703 detect_eol_type(struct detection_state *st, const Extbyte * src,
1704                 Lstream_data_count n)
1705 {
1706         while (n--) {
1707                 const unsigned char c = *(const unsigned char*)src++;
1708                 if (c == '\n') {
1709                         if (st->eol.just_saw_cr)
1710                                 return EOL_CRLF;
1711                         else if (st->eol.seen_anything)
1712                                 return EOL_LF;
1713                 } else if (st->eol.just_saw_cr)
1714                         return EOL_CR;
1715                 else if (c == '\r')
1716                         st->eol.just_saw_cr = 1;
1717                 else
1718                         st->eol.just_saw_cr = 0;
1719                 st->eol.seen_anything = 1;
1720         }
1721
1722         return EOL_AUTODETECT;
1723 }
1724
/* Attempt to determine the encoding and EOL type of the given text.
   Before calling this function for the first time, you must initialize
   st->eol_type as appropriate and initialize st->mask to ~0.

   st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
   not yet known.

   st->mask holds the determined coding category mask, or ~0 if only
   ASCII has been seen so far.

   Returns:

   0 == st->eol_type is EOL_AUTODETECT and/or more than one coding
        category is present in st->mask
   1 == definitive answers are here for both st->eol_type and st->mask
*/

1741
1742 static int
1743 detect_coding_type(struct detection_state *st, const Extbyte * src,
1744                    Lstream_data_count n, int just_do_eol)
1745 {
1746         if (st->eol_type == EOL_AUTODETECT)
1747                 st->eol_type = detect_eol_type(st, src, n);
1748
1749         if (just_do_eol)
1750                 return st->eol_type != EOL_AUTODETECT;
1751
1752         if (!st->seen_non_ascii) {
1753                 for (; n; n--, src++) {
1754                         const unsigned char c = *(const unsigned char *)src;
1755                         if ((c < 0x20 && !acceptable_control_char_p(c))
1756                             || c >= 0x80) {
1757                                 st->seen_non_ascii = 1;
1758 #ifdef MULE
1759                                 st->shift_jis.mask = ~0;
1760                                 st->big5.mask = ~0;
1761                                 st->ucs4.mask = ~0;
1762                                 st->utf8.mask = ~0;
1763                                 st->iso2022.mask = ~0;
1764 #endif
1765                                 break;
1766                         }
1767                 }
1768         }
1769
1770         if (!n) {
1771                 return 0;
1772         }
1773 #ifdef MULE
1774         if (!mask_has_at_most_one_bit_p(st->iso2022.mask))
1775                 st->iso2022.mask = detect_coding_iso2022(st, src, n);
1776         if (!mask_has_at_most_one_bit_p(st->shift_jis.mask))
1777                 st->shift_jis.mask = detect_coding_sjis(st, src, n);
1778         if (!mask_has_at_most_one_bit_p(st->big5.mask))
1779                 st->big5.mask = detect_coding_big5(st, src, n);
1780         if (!mask_has_at_most_one_bit_p(st->utf8.mask))
1781                 st->utf8.mask = detect_coding_utf8(st, src, n);
1782         if (!mask_has_at_most_one_bit_p(st->ucs4.mask))
1783                 st->ucs4.mask = detect_coding_ucs4(st, src, n);
1784
1785         st->mask = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1786                 | st->utf8.mask | st->ucs4.mask;
1787 #endif
1788         {
1789                 int retval = mask_has_at_most_one_bit_p(st->mask);
1790                 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1791                 return retval && st->eol_type != EOL_AUTODETECT;
1792         }
1793 }
1794
1795 static Lisp_Object coding_system_from_mask(int mask)
1796 {
1797         if (mask == ~0) {
1798                 /* If the file was entirely or basically ASCII, use the
1799                    default value of `buffer-file-coding-system'. */
1800                 Lisp_Object retval =
1801                     XBUFFER(Vbuffer_defaults)->buffer_file_coding_system;
1802                 if (!NILP(retval)) {
1803                         retval = Ffind_coding_system(retval);
1804                         if (NILP(retval)) {
1805                                 warn_when_safe
1806                                     (Qbad_variable, Qwarning,
1807                                      "Invalid `default-buffer-file-coding-system', set to nil");
1808                                 XBUFFER(Vbuffer_defaults)->
1809                                     buffer_file_coding_system = Qnil;
1810                         }
1811                 }
1812                 if (NILP(retval))
1813                         retval = Fget_coding_system(Qraw_text);
1814                 return retval;
1815         } else {
1816                 int i;
1817                 int cat = -1;
1818 #ifdef MULE
1819                 mask = postprocess_iso2022_mask(mask);
1820 #endif
1821                 /* Look through the coding categories by priority and find
1822                    the first one that is allowed. */
1823                 for (i = 0; i < CODING_CATEGORY_LAST; i++) {
1824                         cat = fcd->coding_category_by_priority[i];
1825                         if (cat<0)
1826                                 continue;
1827                         if ((mask & (1 << cat)) &&
1828                             !NILP(fcd->coding_category_system[cat]))
1829                                 break;
1830                 }
1831                 if (cat >= 0)
1832                         return fcd->coding_category_system[cat];
1833                 else
1834                         return Fget_coding_system(Qraw_text);
1835         }
1836 }
1837
1838 /* Given a seekable read stream and potential coding system and EOL type
1839    as specified, do any autodetection that is called for.  If the
1840    coding system and/or EOL type are not `autodetect', they will be left
1841    alone; but this function will never return an autodetect coding system
1842    or EOL type.
1843
1844    This function does not automatically fetch subsidiary coding systems;
1845    that should be unnecessary with the explicit eol-type argument. */
1846
/* Length of a string literal, not counting its terminating NUL. */
#define LENGTH(string_constant) (sizeof (string_constant) - 1)
/* number of leading lines to check for a coding cookie */
#define LINES_TO_CHECK 2
1850
1851 void
1852 determine_real_coding_system(lstream_t stream, Lisp_Object * codesys_in_out,
1853                              eol_type_t * eol_type_in_out)
1854 {
1855         static const char mime_name_valid_chars[] = 
1856                 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1857                 "abcdefghijklmnopqrstuvwxyz"
1858                 "0123456789"
1859                 "!$%&*+-.^_{|}~";
1860
1861         struct detection_state decst;
1862
1863         if (*eol_type_in_out == EOL_AUTODETECT)
1864                 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE(*codesys_in_out);
1865
1866         xzero(decst);
1867         decst.eol_type = *eol_type_in_out;
1868         decst.mask = ~0;
1869
1870         /* If autodetection is called for, do it now. */
1871         if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT
1872             || *eol_type_in_out == EOL_AUTODETECT) {
1873                 Extbyte buf[4096];
1874                 Lisp_Object coding_system = Qnil;
1875                 Extbyte *p;
1876                 Lstream_data_count nread =
1877                     Lstream_read(stream, buf, sizeof(buf));
1878                 Extbyte *scan_end;
1879                 int lines_checked = 0;
1880
1881                 /* Look for initial "-*-"; mode line prefix */
1882                 for (p = buf, scan_end = buf + nread - LENGTH("-*-coding:?-*-");
1883                      p <= scan_end && lines_checked < LINES_TO_CHECK; p++)
1884                         if (*p == '-' && *(p + 1) == '*' && *(p + 2) == '-') {
1885                                 Extbyte *local_vars_beg = p + 3;
1886                                 /* Look for final "-*-"; mode line suffix */
1887                                 for (p = local_vars_beg, scan_end = buf + nread - LENGTH("-*-");
1888                                      p <= scan_end && lines_checked < LINES_TO_CHECK; p++)
1889                                         if (*p == '-' && *(p + 1) == '*' && *(p + 2) == '-') {
1890                                                 Extbyte *suffix = p;
1891                                                 /* Look for "coding:" */
1892                                                 for (p = local_vars_beg, scan_end = suffix - LENGTH("coding:?");
1893                                                      p <= scan_end; p++) {
1894                                                         if (memcmp("coding:", p, LENGTH("coding:")) != 0)
1895                                                                 continue;
1896                                                         if (p != local_vars_beg && strchr(" \t;", *p) == NULL )
1897                                                                 continue;
1898                                                         Extbyte save;
1899                                                         int n;
1900                                                         p += LENGTH("coding:");
1901                                                         while (*p == ' ' || *p == '\t') {
1902                                                                 p++;
1903                                                         }
1904
1905                                                         /* Get coding system name */
1906                                                         save = *suffix;
1907                                                         *suffix = '\0';
1908                                                         /* Characters valid in a MIME charset name (rfc 1521),
1909                                                            and in a Lisp symbol name. */
1910                                                         n = strspn((char *)p, mime_name_valid_chars);
1911                                                         *suffix = save;
1912                                                         if (n > 0) {
1913                                                                 save = p[n];
1914                                                                 p[n] = '\0';
1915                                                                 coding_system = Ffind_coding_system(intern((char *)p));
1916                                                                 p[n] = save;
1917                                                         }
1918                                                         break;
1919                                                 }
1920                                                 break;
1921                                         }
1922                                 /* #### file must use standard EOLs or we miss 2d line */
1923                                 /* #### not to mention this is broken for UTF-16 DOS files */
1924                                         else if (*p == '\n' || *p == '\r') {
1925                                                 lines_checked++;
1926                                                 /* skip past multibyte (DOS) newline */
1927                                                 if (*p == '\r'
1928                                                     && *(p + 1) == '\n')
1929                                                         p++;
1930                                         }
1931                                 break;
1932                         }
1933                 /* #### file must use standard EOLs or we miss 2d line */
1934                 /* #### not to mention this is broken for UTF-16 DOS files */
1935                         else if (*p == '\n' || *p == '\r') {
1936                                 lines_checked++;
1937                                 /* skip past multibyte (DOS) newline */
1938                                 if (*p == '\r' && *(p + 1) == '\n')
1939                                         p++;
1940                         }
1941
1942                 if (NILP(coding_system)) {
1943                         do {
1944                                 if (detect_coding_type(&decst, buf, nread,
1945                                                        XCODING_SYSTEM_TYPE(*codesys_in_out)
1946                                                        != CODESYS_AUTODETECT))
1947                                         break;
1948                                 nread = Lstream_read(stream, buf, sizeof(buf));
1949                                 if (nread == 0)
1950                                         break;
1951                         }
1952                         while (1);
1953                 } else if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT
1954                            && XCODING_SYSTEM_EOL_TYPE(coding_system) == EOL_AUTODETECT) {
1955                         do {
1956                                 if (detect_coding_type(&decst, buf, nread, 1))
1957                                         break;
1958                                 nread = Lstream_read(stream, buf, sizeof(buf));
1959                                 if (!nread)
1960                                         break;
1961                         }
1962                         while (1);
1963                 }
1964                 *eol_type_in_out = decst.eol_type;
1965                 if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT) {
1966                         if (NILP(coding_system))
1967                                 *codesys_in_out =
1968                                     coding_system_from_mask(decst.mask);
1969                         else
1970                                 *codesys_in_out = coding_system;
1971                 }
1972         }
1973
1974         /* If we absolutely can't determine the EOL type, just assume LF. */
1975         if (*eol_type_in_out == EOL_AUTODETECT)
1976                 *eol_type_in_out = EOL_LF;
1977
1978         Lstream_rewind(stream);
1979 }
1980
1981 DEFUN("detect-coding-region", Fdetect_coding_region, 2, 3, 0,   /*
1982 Detect coding system of the text in the region between START and END.
1983 Return a list of possible coding systems ordered by priority.
1984 If only ASCII characters are found, return 'undecided or one of
1985 its subsidiary coding systems according to a detected end-of-line
1986 type.  Optional arg BUFFER defaults to the current buffer.
1987 */
1988       (start, end, buffer))
1989 {
1990         Lisp_Object val = Qnil;
1991         struct buffer *buf = decode_buffer(buffer, 0);
1992         Bufpos b, e;
1993         Lisp_Object instream, lb_instream;
1994         lstream_t istr, lb_istr;
1995         struct detection_state decst;
1996         struct gcpro gcpro1, gcpro2;
1997
1998         get_buffer_range_char(buf, start, end, &b, &e, 0);
1999         lb_instream = make_lisp_buffer_input_stream(buf, b, e, 0);
2000         lb_istr = XLSTREAM(lb_instream);
2001         instream =
2002             make_encoding_input_stream(lb_istr, Fget_coding_system(Qbinary));
2003         istr = XLSTREAM(instream);
2004         GCPRO2(instream, lb_instream);
2005         xzero(decst);
2006         decst.eol_type = EOL_AUTODETECT;
2007         decst.mask = ~0;
2008         while (1) {
2009                 Extbyte random_buffer[4096];
2010                 Lstream_data_count nread =
2011                     Lstream_read(istr, random_buffer, sizeof(random_buffer));
2012
2013                 if (!nread)
2014                         break;
2015                 if (detect_coding_type(&decst, random_buffer, nread, 0))
2016                         break;
2017         }
2018
2019         if (decst.mask == ~0)
2020                 val = subsidiary_coding_system(Fget_coding_system(Qundecided),
2021                                                decst.eol_type);
2022         else {
2023                 int i;
2024
2025                 val = Qnil;
2026 #ifdef MULE
2027                 decst.mask = postprocess_iso2022_mask(decst.mask);
2028 #endif
2029                 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--) {
2030                         int sys = fcd->coding_category_by_priority[i];
2031                         if (decst.mask & (1 << sys)) {
2032                                 Lisp_Object codesys =
2033                                     fcd->coding_category_system[sys];
2034                                 if (!NILP(codesys))
2035                                         codesys =
2036                                             subsidiary_coding_system(codesys,
2037                                                                      decst.
2038                                                                      eol_type);
2039                                 val = Fcons(codesys, val);
2040                         }
2041                 }
2042         }
2043         Lstream_close(istr);
2044         UNGCPRO;
2045         Lstream_delete(istr);
2046         Lstream_delete(lb_istr);
2047         return val;
2048 }
2049 \f
2050 /************************************************************************/
2051 /*           Converting to internal Mule format ("decoding")            */
2052 /************************************************************************/
2053
2054 /* A decoding stream is a stream used for decoding text (i.e.
2055    converting from some external format to internal format).
2056    The decoding-stream object keeps track of the actual coding
2057    stream, the stream that is at the other end, and data that
2058    needs to be persistent across the lifetime of the stream. */
2059
/* Handle the EOL stuff related to just-read-in character C.
   EOL_TYPE is the EOL type of the coding stream.
   FLAGS is the current value of FLAGS in the coding stream, and may
   be modified by this macro.  (The macro only looks at the
   CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
   bytes are to be written.  You need to also define a local goto
   label "label_continue_loop" that is at the end of the main
   character-reading loop.

   If C is a CR character, then this macro handles it entirely and
   jumps to label_continue_loop:
     - with EOL_CR, a '\n' is added to DST;
     - with EOL_CRLF and no CR pending, the CR is only remembered in
       FLAGS (CODING_STATE_CR);
     - in every other case the CR itself is added to DST.

   Otherwise, if a CR is pending it is added to DST unless C is '\n',
   and the pending flag is cleared.  C itself is never added in this
   case; you should continue processing C normally after this macro.

   The arguments are expanded unparenthesized and more than once, so
   pass simple lvalues/expressions without side effects. */

#define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
do {                                                            \
  if (c == '\r')                                                \
    {                                                           \
      if (eol_type == EOL_CR)                                   \
        Dynarr_add (dst, '\n');                                 \
      else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
        Dynarr_add (dst, c);                                    \
      else                                                      \
        flags |= CODING_STATE_CR;                               \
      goto label_continue_loop;                                 \
    }                                                           \
  else if (flags & CODING_STATE_CR)                             \
    {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
      if (c != '\n')                                            \
        Dynarr_add (dst, '\r');                                 \
      flags &= ~CODING_STATE_CR;                                \
    }                                                           \
} while (0)
2093
/* C should be a binary character in the range 0 - 255; convert
   to internal format and add to Dynarr DST.

   Bytes satisfying BYTE_ASCII_P are added unchanged.  Bytes satisfying
   BYTE_C1_P are added as LEADING_BYTE_CONTROL_1 followed by C + 0x20.
   All remaining bytes are added as LEADING_BYTE_LATIN_ISO8859_1
   followed by C itself. */

#define DECODE_ADD_BINARY_CHAR(c, dst)          \
do {                                            \
  if (BYTE_ASCII_P (c))                         \
    Dynarr_add (dst, c);                        \
  else if (BYTE_C1_P (c))                       \
    {                                           \
      Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
      Dynarr_add (dst, c + 0x20);               \
    }                                           \
  else                                          \
    {                                           \
      Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
      Dynarr_add (dst, c);                      \
    }                                           \
} while (0)
2112
/* If CH is nonzero, add it to the output as a binary character (see
   DECODE_ADD_BINARY_CHAR) and reset CH to 0.  CH must be an lvalue.
   Note that the expansion refers to a variable named `dst', which is
   not a parameter and must be in scope wherever the macro is used. */
#define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
do {                                    \
  if (ch)                               \
    {                                   \
      DECODE_ADD_BINARY_CHAR (ch, dst); \
      ch = 0;                           \
    }                                   \
} while (0)
2121
/* When FLAGS has CODING_STATE_END set, flush whatever is still pending:
   a partially built character in CH is output via
   DECODE_OUTPUT_PARTIAL_CHAR, and a CR remembered in FLAGS
   (CODING_STATE_CR) is added to DST as a literal '\r'.  Does nothing
   if CODING_STATE_END is not set. */
#define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
do {                                    \
  if (flags & CODING_STATE_END)         \
    {                                   \
      DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
      if (flags & CODING_STATE_CR)      \
        Dynarr_add (dst, '\r');         \
    }                                   \
} while (0)
2131
/* Access the decoding-specific data of lstream STREAM, via
   LSTREAM_TYPE_DATA for the `decoding' stream type. */
#define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2133
/* Per-stream data of a `decoding' lstream (see DECODING_STREAM_DATA). */
typedef struct decoding_stream_s *decoding_stream_t;
struct decoding_stream_s {
        /* Coding system that governs the conversion. */
        Lisp_Coding_System *codesys;

        /* Stream that we read the encoded data from or
           write the decoded data to. */
        lstream_t other_end;

        /* If we are reading, then we can return only a fixed amount of
           data, so if the conversion resulted in too much data, we store it
           here for retrieval the next time around. */
        unsigned_char_dynarr *runoff;

        /* FLAGS holds flags indicating the current state of the decoding.
           Some of these flags are dependent on the coding system. */
        unsigned int flags;

        /* CH holds a partially built-up character.  Since we only deal
           with one- and two-byte characters at the moment, we only use
           this to store the first byte of a two-byte character. */
        unsigned int ch;

        /* EOL_TYPE specifies the type of end-of-line conversion that
           currently applies.  We need to keep this separate from the
           EOL type stored in CODESYS because the latter might indicate
           automatic EOL-type detection while the former will always
           indicate a particular EOL type. */
        eol_type_t eol_type;
#ifdef MULE
        /* Additional ISO2022 information.  We define the structure above
           because it's also needed by the detection routines. */
        struct iso2022_decoder iso2022;

        /* Additional information (the state of the running CCL program)
           used by the CCL decoder. */
        struct ccl_program ccl;

        /* counter for UTF-8 or UCS-4 */
        unsigned char counter;
#endif
        /* Encoding/EOL detection state.  NOTE(review): presumably used
           when the stream has to autodetect while decoding -- confirm
           against the reader/writer code. */
        struct detection_state decst;
};
2177
/* Prototypes of the "decoding" lstream methods; the definitions
   follow below. */
static Lstream_data_count
decoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size);
static Lstream_data_count
decoding_writer(lstream_t stream,
                const unsigned char *data, Lstream_data_count size);
static int decoding_rewinder(lstream_t stream);
static int decoding_seekable_p(lstream_t stream);
static int decoding_flusher(lstream_t stream);
static int decoding_closer(lstream_t stream);

static Lisp_Object decoding_marker(Lisp_Object stream);

/* Define the "decoding" lstream implementation (lstream_decoding);
   its per-stream data is sized for a struct decoding_stream_s. */
DEFINE_LSTREAM_IMPLEMENTATION("decoding", lstream_decoding,
                              sizeof(struct decoding_stream_s));
2192
2193 static Lisp_Object
2194 decoding_marker(Lisp_Object stream)
2195 {
2196         lstream_t str = DECODING_STREAM_DATA(XLSTREAM(stream))->other_end;
2197         Lisp_Object str_obj;
2198
2199         /* We do not need to mark the coding systems or charsets stored
2200            within the stream because they are stored in a global list
2201            and automatically marked. */
2202
2203         XSETLSTREAM(str_obj, str);
2204         mark_object(str_obj);
2205         if (str->imp->marker) {
2206                 return str->imp->marker(str_obj);
2207         } else {
2208                 return Qnil;
2209         }
2210 }
2211
2212 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
2213    so we read data from the other end, decode it, and store it into DATA. */
2214
2215 static Lstream_data_count
2216 decoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size)
2217 {
2218         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2219         unsigned char *orig_data = data;
2220         Lstream_data_count read_size;
2221         int error_occurred = 0;
2222
2223         /* We need to interface to mule_decode(), which expects to take some
2224            amount of data and store the result into a Dynarr.  We have
2225            mule_decode() store into str->runoff, and take data from there
2226            as necessary. */
2227
2228         /* We loop until we have enough data, reading chunks from the other
2229            end and decoding it. */
2230         while (1) {
2231                 /* Take data from the runoff if we can.  Make sure to take at
2232                    most SIZE bytes, and delete the data from the runoff. */
2233                 if (Dynarr_length(str->runoff) > 0) {
2234                         Lstream_data_count chunk =
2235                                 min(size,
2236                                     (Lstream_data_count)
2237                                     Dynarr_length(str->runoff));
2238                         memcpy(data, Dynarr_atp(str->runoff, 0), chunk);
2239                         Dynarr_delete_many(str->runoff, 0, chunk);
2240                         data += chunk;
2241                         size -= chunk;
2242                 }
2243
2244                 if (size == 0) {
2245                         /* No more room for data */
2246                         break;
2247                 }
2248
2249                 if (str->flags & CODING_STATE_END) {
2250                         /* This means that on the previous iteration, we hit the
2251                            EOF on the other end.  We loop once more so that
2252                            mule_decode() can output any final stuff it may be
2253                            holding, or any "go back to a sane state" escape
2254                            sequences. (This latter makes sense during
2255                            encoding.) */
2256                         break;
2257                 }
2258
2259                 /* Exhausted the runoff, so get some more.  DATA has at least
2260                    SIZE bytes left of storage in it, so it's OK to read directly
2261                    into it.  (We'll be overwriting above, after we've decoded it
2262                    into the runoff.) */
2263                 read_size = Lstream_read(str->other_end, data, size);
2264                 if (read_size < 0) {
2265                         error_occurred = 1;
2266                         break;
2267                 }
2268                 if (read_size == 0) {
2269                         /* There might be some more end data produced in the
2270                            translation.  See the comment above. */
2271                         str->flags |= CODING_STATE_END;
2272                 }
2273                 mule_decode(stream, (Extbyte *) data, str->runoff, read_size);
2274         }
2275
2276         if (data - orig_data == 0) {
2277                 return error_occurred ? -1 : 0;
2278         } else {
2279                 return data - orig_data;
2280         }
2281 }
2282
2283 static Lstream_data_count
2284 decoding_writer(lstream_t stream, const unsigned char *data,
2285                 Lstream_data_count size)
2286 {
2287         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2288         Lstream_data_count retval;
2289
2290         /* Decode all our data into the runoff, and then attempt to write
2291            it all out to the other end.  Remove whatever chunk we succeeded
2292            in writing. */
2293         mule_decode(stream, (const Extbyte *)data, str->runoff, size);
2294         retval = Lstream_write(str->other_end, Dynarr_atp(str->runoff, 0),
2295                                Dynarr_length(str->runoff));
2296         if (retval > 0) {
2297                 Dynarr_delete_many(str->runoff, 0, retval);
2298         }
2299         /* Do NOT return retval.  The return value indicates how much
2300            of the incoming data was written, not how many bytes were
2301            written. */
2302         return size;
2303 }
2304
2305 static void
2306 reset_decoding_stream(decoding_stream_t str)
2307 {
2308 #ifdef MULE
2309         if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_ISO2022) {
2310                 Lisp_Object coding_system;
2311                 XSETCODING_SYSTEM(coding_system, str->codesys);
2312                 reset_iso2022(coding_system, &str->iso2022);
2313         } else if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_CCL) {
2314                 setup_ccl_program(&str->ccl,
2315                                   CODING_SYSTEM_CCL_DECODE(str->codesys));
2316         }
2317         str->counter = 0;
2318 #endif                          /* MULE */
2319         if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT
2320             || CODING_SYSTEM_EOL_TYPE(str->codesys) == EOL_AUTODETECT) {
2321                 xzero(str->decst);
2322                 str->decst.eol_type = EOL_AUTODETECT;
2323                 str->decst.mask = ~0;
2324         }
2325         str->flags = str->ch = 0;
2326 }
2327
2328 static int
2329 decoding_rewinder(lstream_t stream)
2330 {
2331         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2332         reset_decoding_stream(str);
2333         Dynarr_reset(str->runoff);
2334         return Lstream_rewind(str->other_end);
2335 }
2336
2337 static int
2338 decoding_seekable_p(lstream_t stream)
2339 {
2340         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2341         return Lstream_seekable_p(str->other_end);
2342 }
2343
2344 static int
2345 decoding_flusher(lstream_t stream)
2346 {
2347         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2348         return Lstream_flush(str->other_end);
2349 }
2350
2351 static int
2352 decoding_closer(lstream_t stream)
2353 {
2354         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2355         if (stream->flags & LSTREAM_FL_WRITE) {
2356                 str->flags |= CODING_STATE_END;
2357                 decoding_writer(stream, 0, 0);
2358         }
2359         Dynarr_free(str->runoff);
2360 #ifdef MULE
2361 #ifdef ENABLE_COMPOSITE_CHARS
2362         if (str->iso2022.composite_chars) {
2363                 Dynarr_free(str->iso2022.composite_chars);
2364         }
2365 #endif
2366 #endif
2367         return Lstream_close(str->other_end);
2368 }
2369
2370 Lisp_Object
2371 decoding_stream_coding_system(lstream_t stream)
2372 {
2373         Lisp_Object coding_system;
2374         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2375
2376         XSETCODING_SYSTEM(coding_system, str->codesys);
2377         return subsidiary_coding_system(coding_system, str->eol_type);
2378 }
2379
2380 void
2381 set_decoding_stream_coding_system(lstream_t lstr, Lisp_Object codesys)
2382 {
2383         Lisp_Coding_System *cs = XCODING_SYSTEM(codesys);
2384         decoding_stream_t str = DECODING_STREAM_DATA(lstr);
2385         str->codesys = cs;
2386         if (CODING_SYSTEM_EOL_TYPE(cs) != EOL_AUTODETECT) {
2387                 str->eol_type = CODING_SYSTEM_EOL_TYPE(cs);
2388         }
2389         reset_decoding_stream(str);
2390         return;
2391 }
2392
2393 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2394    stream for writing, no automatic code detection will be performed.
2395    The reason for this is that automatic code detection requires a
2396    seekable input.  Things will also fail if you open a decoding
2397    stream for reading using a non-fully-specified coding system and
2398    a non-seekable input stream. */
2399
2400 static Lisp_Object
2401 make_decoding_stream_1(lstream_t stream, Lisp_Object codesys, const char *mode)
2402 {
2403         lstream_t lstr = Lstream_new(lstream_decoding, mode);
2404         decoding_stream_t str = DECODING_STREAM_DATA(lstr);
2405         Lisp_Object obj;
2406
2407         xzero(*str);
2408         str->other_end = stream;
2409         str->runoff = (unsigned_char_dynarr *) Dynarr_new(unsigned_char);
2410         str->eol_type = EOL_AUTODETECT;
2411         if (!strcmp(mode, "r") && Lstream_seekable_p(stream)) {
2412                 /* We can determine the coding system now. */
2413                 determine_real_coding_system(stream, &codesys, &str->eol_type);
2414         }
2415         set_decoding_stream_coding_system(lstr, codesys);
2416         str->decst.eol_type = str->eol_type;
2417         str->decst.mask = ~0;
2418         XSETLSTREAM(obj, lstr);
2419         return obj;
2420 }
2421
2422 Lisp_Object
2423 make_decoding_input_stream(lstream_t stream, Lisp_Object codesys)
2424 {
2425         return make_decoding_stream_1(stream, codesys, "r");
2426 }
2427
2428 Lisp_Object
2429 make_decoding_output_stream(lstream_t stream, Lisp_Object codesys)
2430 {
2431         return make_decoding_stream_1(stream, codesys, "w");
2432 }
2433
2434 /* Note: the decode_coding_* functions all take the same
2435    arguments as mule_decode(), which is to say some SRC data of
2436    size N, which is to be stored into dynamic array DST.
2437    DECODING is the stream within which the decoding is
2438    taking place, but no data is actually read from or
2439    written to that stream; that is handled in decoding_reader()
2440    or decoding_writer().  This allows the same functions to
2441    be used for both reading and writing. */
2442
2443 static void
2444 mule_decode(lstream_t decoding, const Extbyte * src,
2445             unsigned_char_dynarr * dst, Lstream_data_count n)
2446 {
2447         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
2448
2449         /* If necessary, do encoding-detection now.  We do this when
2450            we're a writing stream or a non-seekable reading stream,
2451            meaning that we can't just process the whole input,
2452            rewind, and start over. */
2453
2454         if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT ||
2455             str->eol_type == EOL_AUTODETECT) {
2456                 Lisp_Object codesys;
2457
2458                 XSETCODING_SYSTEM(codesys, str->codesys);
2459                 detect_coding_type(&str->decst, src, n,
2460                                    CODING_SYSTEM_TYPE(str->codesys) !=
2461                                    CODESYS_AUTODETECT);
2462                 if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT &&
2463                     str->decst.mask != ~0)
2464                         /* #### This is cheesy.  What we really ought to do is
2465                            buffer up a certain amount of data so as to get a
2466                            less random result. */
2467                         codesys = coding_system_from_mask(str->decst.mask);
2468                 str->eol_type = str->decst.eol_type;
2469                 if (XCODING_SYSTEM(codesys) != str->codesys) {
2470                         /* Preserve the CODING_STATE_END flag in case it was set.
2471                            If we erase it, bad things might happen. */
2472                         int was_end = str->flags & CODING_STATE_END;
2473                         set_decoding_stream_coding_system(decoding, codesys);
2474                         if (was_end)
2475                                 str->flags |= CODING_STATE_END;
2476                 }
2477         }
2478
2479         switch (CODING_SYSTEM_TYPE(str->codesys)) {
2480 #ifdef DEBUG_SXEMACS
2481         case CODESYS_INTERNAL:
2482                 Dynarr_add_many(dst, src, n);
2483                 break;
2484 #endif
2485         case CODESYS_AUTODETECT:
2486                 /* If we got this far and still haven't decided on the coding
2487                    system, then do no conversion. */
2488         case CODESYS_NO_CONVERSION:
2489                 decode_coding_no_conversion(decoding, src, dst, n);
2490                 break;
2491 #ifdef MULE
2492         case CODESYS_SHIFT_JIS:
2493                 decode_coding_sjis(decoding, src, dst, n);
2494                 break;
2495         case CODESYS_BIG5:
2496                 decode_coding_big5(decoding, src, dst, n);
2497                 break;
2498         case CODESYS_UCS4:
2499                 decode_coding_ucs4(decoding, src, dst, n);
2500                 break;
2501         case CODESYS_UTF8:
2502                 decode_coding_utf8(decoding, src, dst, n);
2503                 break;
2504         case CODESYS_CCL:
2505                 str->ccl.last_block = str->flags & CODING_STATE_END;
2506                 /* When applying ccl program to stream, MUST NOT set NULL
2507                    pointer to src.  */
2508                 ccl_driver(&str->ccl,
2509                            (src
2510                             ? (const unsigned char *)src
2511                             : (const unsigned char *)""),
2512                            dst, n, 0, CCL_MODE_DECODING);
2513                 break;
2514         case CODESYS_ISO2022:
2515                 decode_coding_iso2022(decoding, src, dst, n);
2516                 break;
2517 #endif                          /* MULE */
2518         default:
2519                 abort();
2520         }
2521 }
2522
2523 DEFUN("decode-coding-region", Fdecode_coding_region, 3, 4, 0,   /*
2524 Decode the text between START and END which is encoded in CODING-SYSTEM.
2525 This is useful if you've read in encoded text from a file without decoding
2526 it (e.g. you read in a JIS-formatted file but used the `binary' or
2527 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2528 Return length of decoded text.
2529 BUFFER defaults to the current buffer if unspecified.
2530 */
2531       (start, end, coding_system, buffer))
2532 {
2533         Bufpos b, e;
2534         struct buffer *buf = decode_buffer(buffer, 0);
2535         Lisp_Object instream, lb_outstream, de_outstream, outstream;
2536         lstream_t istr, ostr;
2537         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2538
2539         get_buffer_range_char(buf, start, end, &b, &e, 0);
2540
2541         barf_if_buffer_read_only(buf, b, e);
2542
2543         coding_system = Fget_coding_system(coding_system);
2544         instream = make_lisp_buffer_input_stream(buf, b, e, 0);
2545         lb_outstream = make_lisp_buffer_output_stream(buf, b, 0);
2546         de_outstream = make_decoding_output_stream(XLSTREAM(lb_outstream),
2547                                                    coding_system);
2548         outstream = make_encoding_output_stream(XLSTREAM(de_outstream),
2549                                                 Fget_coding_system(Qbinary));
2550         istr = XLSTREAM(instream);
2551         ostr = XLSTREAM(outstream);
2552         GCPRO4(instream, lb_outstream, de_outstream, outstream);
2553
2554         /* The chain of streams looks like this:
2555
2556            [BUFFER] <----- send through
2557            ------> [ENCODE AS BINARY]
2558            ------> [DECODE AS SPECIFIED]
2559            ------> [BUFFER]
2560          */
2561
2562         while (1) {
2563                 char tempbuf[1024];     /* some random amount */
2564                 Bufpos newpos, even_newer_pos;
2565                 Bufpos oldpos = lisp_buffer_stream_startpos(istr);
2566                 Lstream_data_count size_in_bytes =
2567                     Lstream_read(istr, tempbuf, sizeof(tempbuf));
2568
2569                 if (!size_in_bytes)
2570                         break;
2571                 newpos = lisp_buffer_stream_startpos(istr);
2572                 Lstream_write(ostr, tempbuf, size_in_bytes);
2573                 even_newer_pos = lisp_buffer_stream_startpos(istr);
2574                 buffer_delete_range(buf, even_newer_pos - (newpos - oldpos),
2575                                     even_newer_pos, 0);
2576         }
2577         Lstream_close(istr);
2578         Lstream_close(ostr);
2579         UNGCPRO;
2580         Lstream_delete(istr);
2581         Lstream_delete(ostr);
2582         Lstream_delete(XLSTREAM(de_outstream));
2583         Lstream_delete(XLSTREAM(lb_outstream));
2584         return Qnil;
2585 }
2586 \f
2587 /************************************************************************/
2588 /*           Converting to an external encoding ("encoding")            */
2589 /************************************************************************/
2590
2591 /* An encoding stream is an output stream.  When you create the
2592    stream, you specify the coding system that governs the encoding
2593    and another stream that the resulting encoded data is to be
2594    sent to, and then start sending data to it. */
2595
/* Access the encoding-specific data of lstream STREAM (an
   LSTREAM_TYPE_DATA lookup for the "encoding" stream type). */
#define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)

/* State that an encoding lstream carries between calls to its reader
   and writer methods. */
typedef struct encoding_stream_s *encoding_stream_t;
struct encoding_stream_s {
        /* Coding system that governs the conversion. */
        Lisp_Coding_System *codesys;

        /* Stream that we read the not-yet-encoded data from or
           write the encoded data to. */
        lstream_t other_end;

        /* If we are reading, then we can return only a fixed amount of
           data, so if the conversion resulted in too much data, we store it
           here for retrieval the next time around.  When writing, encoded
           data is staged here before being passed on to OTHER_END. */
        unsigned_char_dynarr *runoff;

        /* FLAGS holds flags indicating the current state of the encoding.
           Some of these flags are dependent on the coding system.
           CODING_STATE_END is set here once the end of the input has
           been reached. */
        unsigned int flags;

        /* CH holds a partially built-up character.  Since we only deal
           with one- and two-byte characters at the moment, we only use
           this to store the first byte of a two-byte character. */
        unsigned int ch;
#ifdef MULE
        /* Additional information used by the ISO2022 encoder. */
        struct {
                /* CHARSET holds the character sets currently assigned to the G0
                   through G3 registers.  It is initialized from the array
                   INITIAL_CHARSET in CODESYS. */
                Lisp_Object charset[4];

                /* Which registers are currently invoked into the left (GL) and
                   right (GR) halves of the 8-bit encoding space? */
                int register_left, register_right;

                /* Whether we need to explicitly designate the charset in the
                   G? register before using it.  It is initialized from the
                   array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
                unsigned char force_charset_on_output[4];

                /* Other state variables that need to be preserved across
                   invocations. */
                Lisp_Object current_charset;
                int current_half;
                int current_char_boundary;
        } iso2022;

        /* Additional information (the state of the running CCL program)
           used by the CCL encoder. */
        struct ccl_program ccl;
#endif                          /* MULE */
};
2649
/* Prototypes of the "encoding" lstream methods; the definitions
   follow below. */
static Lstream_data_count
encoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size);
static Lstream_data_count
encoding_writer(lstream_t stream,
                const unsigned char *data, Lstream_data_count size);
static int encoding_rewinder(lstream_t stream);
static int encoding_seekable_p(lstream_t stream);
static int encoding_flusher(lstream_t stream);
static int encoding_closer(lstream_t stream);

static Lisp_Object encoding_marker(Lisp_Object stream);

/* Define the "encoding" lstream implementation (lstream_encoding);
   its per-stream data is sized for a struct encoding_stream_s. */
DEFINE_LSTREAM_IMPLEMENTATION("encoding", lstream_encoding,
                              sizeof(struct encoding_stream_s));
2664
2665 static Lisp_Object
2666 encoding_marker(Lisp_Object stream)
2667 {
2668         lstream_t str = ENCODING_STREAM_DATA(XLSTREAM(stream))->other_end;
2669         Lisp_Object str_obj;
2670
2671         /* We do not need to mark the coding systems or charsets stored
2672            within the stream because they are stored in a global list
2673            and automatically marked. */
2674
2675         XSETLSTREAM(str_obj, str);
2676         mark_object(str_obj);
2677         if (str->imp->marker) {
2678                 return str->imp->marker(str_obj);
2679         } else {
2680                 return Qnil;
2681         }
2682 }
2683
2684 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2685    so we read data from the other end, encode it, and store it into DATA. */
2686
2687 static Lstream_data_count
2688 encoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size)
2689 {
2690         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2691         unsigned char *orig_data = data;
2692         Lstream_data_count read_size;
2693         int error_occurred = 0;
2694
2695         /* We need to interface to mule_encode(), which expects to take some
2696            amount of data and store the result into a Dynarr.  We have
2697            mule_encode() store into str->runoff, and take data from there
2698            as necessary. */
2699
2700         /* We loop until we have enough data, reading chunks from the other
2701            end and encoding it. */
2702         while (1) {
2703                 /* Take data from the runoff if we can.  Make sure to take at
2704                    most SIZE bytes, and delete the data from the runoff. */
2705                 if (Dynarr_length(str->runoff) > 0) {
2706                         int chunk = min((int)size, Dynarr_length(str->runoff));
2707                         memcpy(data, Dynarr_atp(str->runoff, 0), chunk);
2708                         Dynarr_delete_many(str->runoff, 0, chunk);
2709                         data += chunk;
2710                         size -= chunk;
2711                 }
2712
2713                 if (size == 0) {
2714                         /* No more room for data */
2715                         break;
2716                 }
2717
2718                 if (str->flags & CODING_STATE_END) {
2719                         /* This means that on the previous iteration, we hit the
2720                            EOF on the other end.  We loop once more so that
2721                            mule_encode() can output any final stuff it may be
2722                            holding, or any "go back to a sane state" escape
2723                            sequences. (This latter makes sense during
2724                            encoding.) */
2725                         break;
2726                 }
2727
2728                 /* Exhausted the runoff, so get some more.  DATA at least SIZE
2729                    bytes left of storage in it, so it's OK to read directly into
2730                    it.  (We'll be overwriting above, after we've encoded it into
2731                    the runoff.) */
2732                 read_size = Lstream_read(str->other_end, data, size);
2733                 if (read_size < 0) {
2734                         error_occurred = 1;
2735                         break;
2736                 }
2737                 if (read_size == 0) {
2738                         /* There might be some more end data produced in the
2739                            translation.  See the comment above. */
2740                         str->flags |= CODING_STATE_END;
2741                 }
2742                 mule_encode(stream, data, str->runoff, read_size);
2743         }
2744
2745         if (data == orig_data) {
2746                 return error_occurred ? -1 : 0;
2747         } else {
2748                 return data - orig_data;
2749         }
2750 }
2751
2752 static Lstream_data_count
2753 encoding_writer(lstream_t stream, const unsigned char *data,
2754                 Lstream_data_count size)
2755 {
2756         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2757         Lstream_data_count retval;
2758
2759         /* Encode all our data into the runoff, and then attempt to write
2760            it all out to the other end.  Remove whatever chunk we succeeded
2761            in writing. */
2762         mule_encode(stream, data, str->runoff, size);
2763         retval = Lstream_write(str->other_end, Dynarr_atp(str->runoff, 0),
2764                                Dynarr_length(str->runoff));
2765         if (retval > 0) {
2766                 Dynarr_delete_many(str->runoff, 0, retval);
2767         }
2768         /* Do NOT return retval.  The return value indicates how much
2769            of the incoming data was written, not how many bytes were
2770            written. */
2771         return size;
2772 }
2773
2774 static void
2775 reset_encoding_stream(encoding_stream_t str)
2776 {
2777 #ifdef MULE
2778         switch (CODING_SYSTEM_TYPE(str->codesys)) {
2779         case CODESYS_ISO2022: {
2780                 int i;
2781
2782                 for (i = 0; i < 4; i++) {
2783                         str->iso2022.charset[i] =
2784                                 CODING_SYSTEM_ISO2022_INITIAL_CHARSET(
2785                                         str->codesys, i);
2786                         str->iso2022.force_charset_on_output[i] =
2787                                 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT(
2788                                         str->codesys, i);
2789                 }
2790                 str->iso2022.register_left = 0;
2791                 str->iso2022.register_right = 1;
2792                 str->iso2022.current_charset = Qnil;
2793                 str->iso2022.current_half = 0;
2794                 str->iso2022.current_char_boundary = 1;
2795                 break;
2796         }
2797         case CODESYS_CCL:
2798                 setup_ccl_program(&str->ccl,
2799                                   CODING_SYSTEM_CCL_ENCODE(str->codesys));
2800                 break;
2801
2802                 /* list the rest of them lot explicitly */
2803         case CODESYS_AUTODETECT:
2804         case CODESYS_SHIFT_JIS:
2805         case CODESYS_BIG5:
2806         case CODESYS_UCS4:
2807         case CODESYS_UTF8:
2808         case CODESYS_NO_CONVERSION:
2809 #ifdef DEBUG_SXEMACS
2810         case CODESYS_INTERNAL:
2811 #endif
2812         default:
2813                 break;
2814         }
2815 #endif                          /* MULE */
2816
2817         str->flags = str->ch = 0;
2818 }
2819
2820 static int
2821 encoding_rewinder(lstream_t stream)
2822 {
2823         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2824         reset_encoding_stream(str);
2825         Dynarr_reset(str->runoff);
2826         return Lstream_rewind(str->other_end);
2827 }
2828
2829 static int
2830 encoding_seekable_p(lstream_t stream)
2831 {
2832         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2833         return Lstream_seekable_p(str->other_end);
2834 }
2835
2836 static int
2837 encoding_flusher(lstream_t stream)
2838 {
2839         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2840         return Lstream_flush(str->other_end);
2841 }
2842
2843 static int
2844 encoding_closer(lstream_t stream)
2845 {
2846         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2847         if (stream->flags & LSTREAM_FL_WRITE) {
2848                 str->flags |= CODING_STATE_END;
2849                 encoding_writer(stream, 0, 0);
2850         }
2851         Dynarr_free(str->runoff);
2852         return Lstream_close(str->other_end);
2853 }
2854
2855 Lisp_Object
2856 encoding_stream_coding_system(lstream_t stream)
2857 {
2858         Lisp_Object coding_system;
2859         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2860
2861         XSETCODING_SYSTEM(coding_system, str->codesys);
2862         return coding_system;
2863 }
2864
2865 void
2866 set_encoding_stream_coding_system(lstream_t lstr, Lisp_Object codesys)
2867 {
2868         Lisp_Coding_System *cs = XCODING_SYSTEM(codesys);
2869         encoding_stream_t str = ENCODING_STREAM_DATA(lstr);
2870         str->codesys = cs;
2871         reset_encoding_stream(str);
2872 }
2873
2874 static Lisp_Object
2875 make_encoding_stream_1(lstream_t stream, Lisp_Object codesys, const char *mode)
2876 {
2877         lstream_t lstr = Lstream_new(lstream_encoding, mode);
2878         encoding_stream_t str = ENCODING_STREAM_DATA(lstr);
2879         Lisp_Object obj;
2880
2881         xzero(*str);
2882         str->runoff = Dynarr_new(unsigned_char);
2883         str->other_end = stream;
2884         set_encoding_stream_coding_system(lstr, codesys);
2885         XSETLSTREAM(obj, lstr);
2886         return obj;
2887 }
2888
2889 Lisp_Object
2890 make_encoding_input_stream(lstream_t stream, Lisp_Object codesys)
2891 {
2892         return make_encoding_stream_1(stream, codesys, "r");
2893 }
2894
2895 Lisp_Object
2896 make_encoding_output_stream(lstream_t stream, Lisp_Object codesys)
2897 {
2898         return make_encoding_stream_1(stream, codesys, "w");
2899 }
2900
2901 /* Convert N bytes of internally-formatted data stored in SRC to an
2902    external format, according to the encoding stream ENCODING.
2903    Store the encoded data into DST. */
2904
2905 static void
2906 mule_encode(lstream_t encoding, const Bufbyte * src,
2907             unsigned_char_dynarr * dst, Lstream_data_count n)
2908 {
2909         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
2910
2911         switch (CODING_SYSTEM_TYPE(str->codesys)) {
2912 #ifdef DEBUG_SXEMACS
2913         case CODESYS_INTERNAL:
2914                 Dynarr_add_many(dst, src, n);
2915                 break;
2916 #endif
2917         case CODESYS_AUTODETECT:
2918                 /* If we got this far and still haven't decided on the coding
2919                    system, then do no conversion. */
2920         case CODESYS_NO_CONVERSION:
2921                 encode_coding_no_conversion(encoding, src, dst, n);
2922                 break;
2923 #ifdef MULE
2924         case CODESYS_SHIFT_JIS:
2925                 encode_coding_sjis(encoding, src, dst, n);
2926                 break;
2927         case CODESYS_BIG5:
2928                 encode_coding_big5(encoding, src, dst, n);
2929                 break;
2930         case CODESYS_UCS4:
2931                 encode_coding_ucs4(encoding, src, dst, n);
2932                 break;
2933         case CODESYS_UTF8:
2934                 encode_coding_utf8(encoding, src, dst, n);
2935                 break;
2936         case CODESYS_CCL:
2937                 str->ccl.last_block = str->flags & CODING_STATE_END;
2938                 /* When applying ccl program to stream, MUST NOT set NULL
2939                    pointer to src.  */
2940                 ccl_driver(&str->ccl, ((src) ? src : (unsigned char *)""),
2941                            dst, n, 0, CCL_MODE_ENCODING);
2942                 break;
2943         case CODESYS_ISO2022:
2944                 encode_coding_iso2022(encoding, src, dst, n);
2945                 break;
2946 #endif                          /* MULE */
2947         default:
2948                 abort();
2949         }
2950 }
2951
2952 DEFUN("encode-coding-region", Fencode_coding_region, 3, 4, 0,   /*
2953 Encode the text between START and END using CODING-SYSTEM.
2954 This will, for example, convert Japanese characters into stuff such as
2955 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
2956 text.  BUFFER defaults to the current buffer if unspecified.
2957 */
2958       (start, end, coding_system, buffer))
2959 {
2960         Bufpos b, e;
2961         struct buffer *buf = decode_buffer(buffer, 0);
2962         Lisp_Object instream, lb_outstream, de_outstream, outstream;
2963         lstream_t istr, ostr;
2964         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2965
2966         get_buffer_range_char(buf, start, end, &b, &e, 0);
2967
2968         barf_if_buffer_read_only(buf, b, e);
2969
2970         coding_system = Fget_coding_system(coding_system);
2971         instream = make_lisp_buffer_input_stream(buf, b, e, 0);
2972         lb_outstream = make_lisp_buffer_output_stream(buf, b, 0);
2973         de_outstream = make_decoding_output_stream(XLSTREAM(lb_outstream),
2974                                                    Fget_coding_system(Qbinary));
2975         outstream = make_encoding_output_stream(XLSTREAM(de_outstream),
2976                                                 coding_system);
2977         istr = XLSTREAM(instream);
2978         ostr = XLSTREAM(outstream);
2979         GCPRO4(instream, outstream, de_outstream, lb_outstream);
2980         /* The chain of streams looks like this:
2981
2982            [BUFFER] <----- send through
2983            ------> [ENCODE AS SPECIFIED]
2984            ------> [DECODE AS BINARY]
2985            ------> [BUFFER]
2986          */
2987         while (1) {
2988                 char tempbuf[1024];     /* some random amount */
2989                 Bufpos newpos, even_newer_pos;
2990                 Bufpos oldpos = lisp_buffer_stream_startpos(istr);
2991                 Lstream_data_count size_in_bytes =
2992                     Lstream_read(istr, tempbuf, sizeof(tempbuf));
2993
2994                 if (!size_in_bytes)
2995                         break;
2996                 newpos = lisp_buffer_stream_startpos(istr);
2997                 Lstream_write(ostr, tempbuf, size_in_bytes);
2998                 even_newer_pos = lisp_buffer_stream_startpos(istr);
2999                 buffer_delete_range(buf, even_newer_pos - (newpos - oldpos),
3000                                     even_newer_pos, 0);
3001         }
3002
3003         {
3004                 Charcount retlen =
3005                     lisp_buffer_stream_startpos(XLSTREAM(instream)) - b;
3006                 Lstream_close(istr);
3007                 Lstream_close(ostr);
3008                 UNGCPRO;
3009                 Lstream_delete(istr);
3010                 Lstream_delete(ostr);
3011                 Lstream_delete(XLSTREAM(de_outstream));
3012                 Lstream_delete(XLSTREAM(lb_outstream));
3013                 return make_int(retlen);
3014         }
3015 }
3016
3017 #ifdef MULE
3018 \f
3019 /************************************************************************/
3020 /*                          Shift-JIS methods                           */
3021 /************************************************************************/
3022
/* Shift-JIS is a coding system encoding three character sets: ASCII, right
   half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
   as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
   encoded by "position-code + 0x80".  A character of JISX0208
   (DIMENSION2_CHARS94 character set) is encoded in two bytes, but the two
   position-codes are divided and shifted so that they fit in the ranges
   below.

   --- CODE RANGE of Shift-JIS ---
   (character set)      (range)
   ASCII                0x00 .. 0x7F
   JISX0201-Kana        0xA1 .. 0xDF
   JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
            (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
   -------------------------------

*/
3040
/* Nonzero if C can be the first byte of a two-byte Shift-JIS character
   (0x81..0x9F or 0xE0..0xEF). */

#define BYTE_SJIS_TWO_BYTE_1_P(c) \
  ((0x81 <= (c) && (c) <= 0x9F) || (0xE0 <= (c) && (c) <= 0xEF))

/* Nonzero if C can be the second byte of a two-byte Shift-JIS character
   (0x40..0x7E or 0x80..0xFC). */

#define BYTE_SJIS_TWO_BYTE_2_P(c) \
  ((0x40 <= (c) && (c) <= 0x7E) || (0x80 <= (c) && (c) <= 0xFC))

/* Nonzero if C is a single-byte katakana code (0xA1..0xDF). */

#define BYTE_SJIS_KATAKANA_P(c) \
  (0xA1 <= (c) && (c) <= 0xDF)
3053
3054 static int
3055 detect_coding_sjis(struct detection_state *st, const Extbyte * src,
3056                    Lstream_data_count n)
3057 {
3058         while (n--) {
3059                 const unsigned char c = *(const unsigned char *)src++;
3060                 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3061                         return 0;
3062                 if (st->shift_jis.in_second_byte) {
3063                         st->shift_jis.in_second_byte = 0;
3064                         if (c < 0x40)
3065                                 return 0;
3066                 } else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3067                         st->shift_jis.in_second_byte = 1;
3068         }
3069         return CODING_CATEGORY_SHIFT_JIS_MASK;
3070 }
3071
3072 /* Convert Shift-JIS data to internal format. */
3073
3074 static void
3075 decode_coding_sjis(lstream_t decoding, const Extbyte * src,
3076                    unsigned_char_dynarr * dst, Lstream_data_count n)
3077 {
3078         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3079         unsigned int flags = str->flags;
3080         unsigned int ch = str->ch;
3081         eol_type_t eol_type = str->eol_type;
3082
3083         while (n--) {
3084                 const unsigned char c = *(const unsigned char *)src++;
3085
3086                 if (ch) {
3087                         /* Previous character was first byte of Shift-JIS Kanji
3088                            char. */
3089                         if (BYTE_SJIS_TWO_BYTE_2_P(c)) {
3090                                 unsigned char e1, e2;
3091
3092                                 Dynarr_add(dst, LEADING_BYTE_JAPANESE_JISX0208);
3093                                 DECODE_SJIS(ch, c, e1, e2);
3094                                 Dynarr_add(dst, e1);
3095                                 Dynarr_add(dst, e2);
3096                         } else {
3097                                 DECODE_ADD_BINARY_CHAR(ch, dst);
3098                                 DECODE_ADD_BINARY_CHAR(c, dst);
3099                         }
3100                         ch = 0;
3101                 } else {
3102                         DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3103                         if (BYTE_SJIS_TWO_BYTE_1_P(c))
3104                                 ch = c;
3105                         else if (BYTE_SJIS_KATAKANA_P(c)) {
3106                                 Dynarr_add(dst, LEADING_BYTE_KATAKANA_JISX0201);
3107                                 Dynarr_add(dst, c);
3108                         } else
3109                                 DECODE_ADD_BINARY_CHAR(c, dst);
3110                 }
3111               label_continue_loop:;
3112         }
3113
3114         DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
3115
3116         str->flags = flags;
3117         str->ch = ch;
3118 }
3119
3120 /* Convert internally-formatted data to Shift-JIS. */
3121
3122 static void
3123 encode_coding_sjis(lstream_t encoding, const Bufbyte * src,
3124                    unsigned_char_dynarr * dst, Lstream_data_count n)
3125 {
3126         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3127         unsigned int flags = str->flags;
3128         unsigned int ch = str->ch;
3129         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3130
3131         while (n--) {
3132                 Bufbyte c = *src++;
3133                 if (c == '\n') {
3134                         if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3135                                 Dynarr_add(dst, '\r');
3136                         if (eol_type != EOL_CR)
3137                                 Dynarr_add(dst, '\n');
3138                         ch = 0;
3139                 } else if (BYTE_ASCII_P(c)) {
3140                         Dynarr_add(dst, c);
3141                         ch = 0;
3142                 } else if (BUFBYTE_LEADING_BYTE_P(c))
3143                         ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
3144                               c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3145                               c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
3146                 else if (ch) {
3147                         if (ch == LEADING_BYTE_KATAKANA_JISX0201) {
3148                                 Dynarr_add(dst, c);
3149                                 ch = 0;
3150                         } else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3151                                    ch == LEADING_BYTE_JAPANESE_JISX0208)
3152                                 ch = c;
3153                         else {
3154                                 /* j1 is bessel j1 function,
3155                                  * so we use something else */
3156                                 /* unsigned char j1, j2; */
3157                                 unsigned char tt1, tt2;
3158
3159                                 ENCODE_SJIS(ch, c, tt1, tt2);
3160                                 Dynarr_add(dst, tt1);
3161                                 Dynarr_add(dst, tt2);
3162                                 ch = 0;
3163                         }
3164                 }
3165         }
3166
3167         str->flags = flags;
3168         str->ch = ch;
3169 }
3170
3171 DEFUN("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3172 Decode a JISX0208 character of Shift-JIS coding-system.
3173 CODE is the character code in Shift-JIS as a cons of type bytes.
3174 Return the corresponding character.
3175 */
3176       (code))
3177 {
3178         unsigned char c1, c2, s1, s2;
3179
3180         CHECK_CONS(code);
3181         CHECK_INT(XCAR(code));
3182         CHECK_INT(XCDR(code));
3183         s1 = XINT(XCAR(code));
3184         s2 = XINT(XCDR(code));
3185         if (BYTE_SJIS_TWO_BYTE_1_P(s1) && BYTE_SJIS_TWO_BYTE_2_P(s2)) {
3186                 DECODE_SJIS(s1, s2, c1, c2);
3187                 return make_char(MAKE_CHAR(Vcharset_japanese_jisx0208,
3188                                            c1 & 0x7F, c2 & 0x7F));
3189         } else
3190                 return Qnil;
3191 }
3192
3193 DEFUN("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3194 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3195 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3196 */
3197       (character))
3198 {
3199         Lisp_Object charset;
3200         int c1, c2, s1, s2;
3201
3202         CHECK_CHAR_COERCE_INT(character);
3203         BREAKUP_CHAR(XCHAR(character), charset, c1, c2);
3204         if (EQ(charset, Vcharset_japanese_jisx0208)) {
3205                 ENCODE_SJIS(c1 | 0x80, c2 | 0x80, s1, s2);
3206                 return Fcons(make_int(s1), make_int(s2));
3207         } else
3208                 return Qnil;
3209 }
3210 \f
3211 /************************************************************************/
3212 /*                            Big5 methods                              */
3213 /************************************************************************/
3214
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set and each of its characters is encoded in two bytes.
3218
3219    --- CODE RANGE of BIG5 ---
3220    (character set)      (range)
3221    ASCII                0x00 .. 0x7F
3222    Big5 (1st byte)      0xA1 .. 0xFE
3223         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3224    --------------------------
3225
3226    Since the number of characters in Big5 is larger than maximum
3227    characters in Emacs' charset (96x96), it can't be handled as one
3228    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
3229    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
3230    contains frequently used characters and the latter contains less
3231    frequently used characters.  */
3232
/* Nonzero if C can be the first byte of a two-byte Big5 character
   (0xA1..0xFE). */

#define BYTE_BIG5_TWO_BYTE_1_P(c) \
  (0xA1 <= (c) && (c) <= 0xFE)

/* Nonzero if C can be the second byte of a two-byte Big5 character
   (0x40..0x7E or 0xA1..0xFE). */

#define BYTE_BIG5_TWO_BYTE_2_P(c) \
  ((0x40 <= (c) && (c) <= 0x7E) || (0xA1 <= (c) && (c) <= 0xFE))

/* Number of Big5 characters that share the same first byte: the sizes
   of the two second-byte ranges added together. */

#define BIG5_SAME_ROW ((0xFF - 0xA1) + (0x7F - 0x40))
3244
3245 /* Code conversion macros.  These are macros because they are used in
3246    inner loops during code conversion.
3247
3248    Note that temporary variables in macros introduce the classic
3249    dynamic-scoping problems with variable names.  We use capital-
3250    lettered variables in the assumption that SXEmacs does not use
3251    capital letters in variables except in a very formalized way
3252    (e.g. Qstring). */
3253
3254 /* Convert Big5 code (b1, b2) into its internal string representation
3255    (lb, c1, c2). */
3256
3257 /* There is a much simpler way to split the Big5 charset into two.
3258    For the moment I'm going to leave the algorithm as-is because it
3259    claims to separate out the most-used characters into a single
3260    charset, which perhaps will lead to optimizations in various
3261    places.
3262
3263    The way the algorithm works is something like this:
3264
3265    Big5 can be viewed as a 94x157 charset, where the row is
3266    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3267    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
3268    the split between low and high column numbers is apparently
3269    meaningless; ascending rows produce less and less frequent chars.
3270    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3271    the first charset, and the upper half (0xC9 .. 0xFE) to the
3272    second.  To do the conversion, we convert the character into
3273    a single number where 0 .. 156 is the first row, 157 .. 313
3274    is the second, etc.  That way, the characters are ordered by
3275    decreasing frequency.  Then we just chop the space in two
3276    and coerce the result into a 94x94 space.
3277    */
3278
/* DECODE_BIG5 (b1, b2, lb, c1, c2): convert the Big5 byte pair (B1, B2)
   into a leading byte LB and two position bytes C1, C2.  First bytes
   below 0xC9 map to LEADING_BYTE_CHINESE_BIG5_1, the rest to
   LEADING_BYTE_CHINESE_BIG5_2.  B1 and B2 are evaluated exactly once;
   LB, C1 and C2 must be assignable.  All arguments are parenthesized so
   that arbitrary expressions may be passed. */
#define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
{                                                                       \
  int B1 = (b1), B2 = (b2);                                             \
  unsigned int I                                                        \
    = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
                                                                        \
  if (B1 < 0xC9)                                                        \
    {                                                                   \
      (lb) = LEADING_BYTE_CHINESE_BIG5_1;                               \
    }                                                                   \
  else                                                                  \
    {                                                                   \
      (lb) = LEADING_BYTE_CHINESE_BIG5_2;                               \
      I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
    }                                                                   \
  (c1) = I / (0xFF - 0xA1) + 0xA1;                                      \
  (c2) = I % (0xFF - 0xA1) + 0xA1;                                      \
} while (0)
3297
3298 /* Convert the internal string representation of a Big5 character
3299    (lb, c1, c2) into Big5 code (b1, b2). */
3300
/* ENCODE_BIG5 (lb, c1, c2, b1, b2): convert a leading byte LB and
   position bytes C1, C2 into the Big5 byte pair (B1, B2).  B1 and B2
   must be assignable.  LB is parenthesized so that an expression such
   as `x & 0xFF' compares as a whole instead of being split by operator
   precedence. */
#define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
{                                                                       \
  unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
                                                                        \
  if ((lb) == LEADING_BYTE_CHINESE_BIG5_2)                              \
    {                                                                   \
      I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
    }                                                                   \
  (b1) = I / BIG5_SAME_ROW + 0xA1;                                      \
  (b2) = I % BIG5_SAME_ROW;                                             \
  (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                    \
} while (0)
3313
3314 static int
3315 detect_coding_big5(struct detection_state *st, const Extbyte * src,
3316                    Lstream_data_count n)
3317 {
3318         while (n--) {
3319                 const unsigned char c = *(const unsigned char *)src++;
3320                 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3321                     (c >= 0x80 && c <= 0xA0))
3322                         return 0;
3323                 if (st->big5.in_second_byte) {
3324                         st->big5.in_second_byte = 0;
3325                         if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3326                                 return 0;
3327                 } else if (c >= 0xA1)
3328                         st->big5.in_second_byte = 1;
3329         }
3330         return CODING_CATEGORY_BIG5_MASK;
3331 }
3332
3333 /* Convert Big5 data to internal format. */
3334
3335 static void
3336 decode_coding_big5(lstream_t decoding, const Extbyte * src,
3337                    unsigned_char_dynarr * dst, Lstream_data_count n)
3338 {
3339         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3340         unsigned int flags = str->flags;
3341         unsigned int ch = str->ch;
3342         eol_type_t eol_type = str->eol_type;
3343
3344         while (n--) {
3345                 const unsigned char c = *(const unsigned char *)src++;
3346                 if (ch) {
3347                         /* Previous character was first byte of Big5 char. */
3348                         if (BYTE_BIG5_TWO_BYTE_2_P(c)) {
3349                                 unsigned char b1, b2, b3;
3350                                 DECODE_BIG5(ch, c, b1, b2, b3);
3351                                 Dynarr_add(dst, b1);
3352                                 Dynarr_add(dst, b2);
3353                                 Dynarr_add(dst, b3);
3354                         } else {
3355                                 DECODE_ADD_BINARY_CHAR(ch, dst);
3356                                 DECODE_ADD_BINARY_CHAR(c, dst);
3357                         }
3358                         ch = 0;
3359                 } else {
3360                         DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3361                         if (BYTE_BIG5_TWO_BYTE_1_P(c))
3362                                 ch = c;
3363                         else
3364                                 DECODE_ADD_BINARY_CHAR(c, dst);
3365                 }
3366               label_continue_loop:;
3367         }
3368
3369         DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
3370
3371         str->flags = flags;
3372         str->ch = ch;
3373 }
3374
3375 /* Convert internally-formatted data to Big5. */
3376
3377 static void
3378 encode_coding_big5(lstream_t encoding, const Bufbyte * src,
3379                    unsigned_char_dynarr * dst, Lstream_data_count n)
3380 {
3381         unsigned char c;
3382         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3383         unsigned int flags = str->flags;
3384         unsigned int ch = str->ch;
3385         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3386
3387         while (n--) {
3388                 c = *src++;
3389                 if (c == '\n') {
3390                         if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3391                                 Dynarr_add(dst, '\r');
3392                         if (eol_type != EOL_CR)
3393                                 Dynarr_add(dst, '\n');
3394                 } else if (BYTE_ASCII_P(c)) {
3395                         /* ASCII. */
3396                         Dynarr_add(dst, c);
3397                 } else if (BUFBYTE_LEADING_BYTE_P(c)) {
3398                         if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3399                             c == LEADING_BYTE_CHINESE_BIG5_2) {
3400                                 /* A recognized leading byte. */
3401                                 ch = c;
3402                                 continue;       /* not done with this character. */
3403                         }
3404                         /* otherwise just ignore this character. */
3405                 } else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3406                            ch == LEADING_BYTE_CHINESE_BIG5_2) {
3407                         /* Previous char was a recognized leading byte. */
3408                         ch = (ch << 8) | c;
3409                         continue;       /* not done with this character. */
3410                 } else if (ch) {
3411                         /* Encountering second byte of a Big5 character. */
3412                         unsigned char b1, b2;
3413
3414                         ENCODE_BIG5(ch >> 8, ch & 0xFF, c, b1, b2);
3415                         Dynarr_add(dst, b1);
3416                         Dynarr_add(dst, b2);
3417                 }
3418
3419                 ch = 0;
3420         }
3421
3422         str->flags = flags;
3423         str->ch = ch;
3424 }
3425
3426 DEFUN("decode-big5-char", Fdecode_big5_char, 1, 1, 0,   /*
3427 Decode a Big5 character CODE of BIG5 coding-system.
3428 CODE is the character code in BIG5, a cons of two integers.
3429 Return the corresponding character.
3430 */
3431       (code))
3432 {
3433         unsigned char c1, c2, b1, b2;
3434
3435         CHECK_CONS(code);
3436         CHECK_INT(XCAR(code));
3437         CHECK_INT(XCDR(code));
3438         b1 = XINT(XCAR(code));
3439         b2 = XINT(XCDR(code));
3440         if (BYTE_BIG5_TWO_BYTE_1_P(b1) && BYTE_BIG5_TWO_BYTE_2_P(b2)) {
3441                 int leading_byte;
3442                 Lisp_Object charset;
3443                 DECODE_BIG5(b1, b2, leading_byte, c1, c2);
3444                 charset = CHARSET_BY_LEADING_BYTE(leading_byte);
3445                 return make_char(MAKE_CHAR(charset, c1 & 0x7F, c2 & 0x7F));
3446         } else
3447                 return Qnil;
3448 }
3449
3450 DEFUN("encode-big5-char", Fencode_big5_char, 1, 1, 0,   /*
3451 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3452 Return the corresponding character code in Big5.
3453 */
3454       (character))
3455 {
3456         Lisp_Object charset;
3457         int c1, c2, b1, b2;
3458
3459         CHECK_CHAR_COERCE_INT(character);
3460         BREAKUP_CHAR(XCHAR(character), charset, c1, c2);
3461         if (EQ(charset, Vcharset_chinese_big5_1) ||
3462             EQ(charset, Vcharset_chinese_big5_2)) {
3463                 ENCODE_BIG5(XCHARSET_LEADING_BYTE(charset), c1 | 0x80,
3464                             c2 | 0x80, b1, b2);
3465                 return Fcons(make_int(b1), make_int(b2));
3466         } else
3467                 return Qnil;
3468 }
3469 \f
3470 /************************************************************************/
3471 /*                           UCS-4 methods                              */
3472 /*                                                                      */
3473 /*  UCS-4 character codes are implemented as nonnegative integers.      */
3474 /*                                                                      */
3475 /************************************************************************/
3476
3477 DEFUN("set-ucs-char", Fset_ucs_char, 2, 2, 0,   /*
3478 Map UCS-4 code CODE to Mule character CHARACTER.
3479
3480 Return T on success, NIL on failure.
3481 */
3482       (code, character))
3483 {
3484         size_t c;
3485
3486         CHECK_CHAR(character);
3487         CHECK_NATNUM(code);
3488         c = XINT(code);
3489
3490         if (c < countof(fcd->ucs_to_mule_table)) {
3491                 fcd->ucs_to_mule_table[c] = character;
3492                 return Qt;
3493         } else
3494                 return Qnil;
3495 }
3496
3497 static Lisp_Object ucs_to_char(unsigned long code)
3498 {
3499         if (code < countof(fcd->ucs_to_mule_table)) {
3500                 return fcd->ucs_to_mule_table[code];
3501         } else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14)) {
3502                 unsigned int c;
3503
3504                 code -= 0xe00000;
3505                 c = code % (94 * 94);
3506                 return make_char
3507                     (MAKE_CHAR(CHARSET_BY_ATTRIBUTES
3508                                (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3509                                 CHARSET_LEFT_TO_RIGHT),
3510                                c / 94 + 33, c % 94 + 33));
3511         } else
3512                 return Qnil;
3513 }
3514
3515 DEFUN("ucs-char", Fucs_char, 1, 1, 0,   /*
3516 Return Mule character corresponding to UCS code CODE (a positive integer).
3517 */
3518       (code))
3519 {
3520         CHECK_NATNUM(code);
3521         return ucs_to_char(XINT(code));
3522 }
3523
3524 DEFUN("set-char-ucs", Fset_char_ucs, 2, 2, 0,   /*
3525 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3526 */
3527       (character, code))
3528 {
3529         /* #### Isn't this gilding the lily?  Fput_char_table checks its args.
3530            Fset_char_ucs is more restrictive on index arg, but should
3531            check code arg in a char_table method. */
3532         CHECK_CHAR(character);
3533         CHECK_NATNUM(code);
3534         return Fput_char_table(character, code, mule_to_ucs_table);
3535 }
3536
3537 DEFUN("char-ucs", Fchar_ucs, 1, 1, 0,   /*
3538 Return the UCS code (a positive integer) corresponding to CHARACTER.
3539 */
3540       (character))
3541 {
3542         return Fget_char_table(character, mule_to_ucs_table);
3543 }
3544
3545 /* Decode a UCS-4 character into a buffer.  If the lookup fails, use
3546    <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3547    is not found, instead.
3548    #### do something more appropriate (use blob?)
3549         Danger, Will Robinson!  Data loss.  Should we signal user? */
3550 static void decode_ucs4(unsigned long ch, unsigned_char_dynarr * dst)
3551 {
3552         Lisp_Object chr = ucs_to_char(ch);
3553
3554         if (!NILP(chr)) {
3555                 Bufbyte work[MAX_EMCHAR_LEN];
3556                 int len;
3557
3558                 ch = XCHAR(chr);
3559                 len = (ch < 128) ?
3560                     simple_set_charptr_emchar(work, ch) :
3561                     non_ascii_set_charptr_emchar(work, ch);
3562                 Dynarr_add_many(dst, work, len);
3563         } else {
3564                 Dynarr_add(dst, LEADING_BYTE_JAPANESE_JISX0208);
3565                 Dynarr_add(dst, 34 + 128);
3566                 Dynarr_add(dst, 46 + 128);
3567         }
3568 }
3569
3570 static unsigned long
3571 mule_char_to_ucs4(Lisp_Object charset, unsigned char h, unsigned char l)
3572 {
3573         Lisp_Object code
3574             = Fget_char_table(make_char(MAKE_CHAR(charset, h & 127, l & 127)),
3575                               mule_to_ucs_table);
3576
3577         if (INTP(code)) {
3578                 return XINT(code);
3579         } else if ((XCHARSET_DIMENSION(charset) == 2) &&
3580                    (XCHARSET_CHARS(charset) == 94)) {
3581                 unsigned char final = XCHARSET_FINAL(charset);
3582
3583                 if (('@' <= final) && (final < 0x7f)) {
3584                         return 0xe00000 + (final - '@') * 94 * 94
3585                             + ((h & 127) - 33) * 94 + (l & 127) - 33;
3586                 } else {
3587                         return '?';
3588                 }
3589         } else {
3590                 return '?';
3591         }
3592 }
3593
3594 static void
3595 encode_ucs4(Lisp_Object charset,
3596             unsigned char h, unsigned char l, unsigned_char_dynarr * dst)
3597 {
3598         unsigned long code = mule_char_to_ucs4(charset, h, l);
3599         Dynarr_add(dst, code >> 24);
3600         Dynarr_add(dst, (code >> 16) & 255);
3601         Dynarr_add(dst, (code >> 8) & 255);
3602         Dynarr_add(dst, code & 255);
3603 }
3604
3605 static int
3606 detect_coding_ucs4(struct detection_state *st, const Extbyte * src,
3607                    Lstream_data_count n)
3608 {
3609         while (n--) {
3610                 const unsigned char c = *(const unsigned char *)src++;
3611                 switch (st->ucs4.in_byte) {
3612                 case 0:
3613                         if (c >= 128)
3614                                 return 0;
3615                         else
3616                                 st->ucs4.in_byte++;
3617                         break;
3618                 case 3:
3619                         st->ucs4.in_byte = 0;
3620                         break;
3621                 default:
3622                         st->ucs4.in_byte++;
3623                 }
3624         }
3625         return CODING_CATEGORY_UCS4_MASK;
3626 }
3627
3628 static void
3629 decode_coding_ucs4(lstream_t decoding, const Extbyte * src,
3630                    unsigned_char_dynarr * dst, Lstream_data_count n)
3631 {
3632         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3633         unsigned int flags = str->flags;
3634         unsigned int ch = str->ch;
3635         unsigned char counter = str->counter;
3636
3637         while (n--) {
3638                 const unsigned char c = *(const unsigned char *)src++;
3639                 switch (counter) {
3640                 case 0:
3641                         ch = c;
3642                         counter = 3;
3643                         break;
3644                 case 1:
3645                         decode_ucs4((ch << 8) | c, dst);
3646                         ch = 0;
3647                         counter = 0;
3648                         break;
3649                 default:
3650                         ch = (ch << 8) | c;
3651                         counter--;
3652                 }
3653         }
3654         if (counter & CODING_STATE_END)
3655                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
3656
3657         str->flags = flags;
3658         str->ch = ch;
3659         str->counter = counter;
3660 }
3661
3662 static void
3663 encode_coding_ucs4(lstream_t encoding, const Bufbyte * src,
3664                    unsigned_char_dynarr * dst, Lstream_data_count n)
3665 {
3666         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3667         unsigned int flags = str->flags;
3668         unsigned int ch = str->ch;
3669         unsigned char char_boundary = str->iso2022.current_char_boundary;
3670         Lisp_Object charset = str->iso2022.current_charset;
3671
3672 #ifdef ENABLE_COMPOSITE_CHARS
3673         /* flags for handling composite chars.  We do a little switcharoo
3674            on the source while we're outputting the composite char. */
3675         unsigned int saved_n = 0;
3676         const unsigned char *saved_src = NULL;
3677         int in_composite = 0;
3678
3679       back_to_square_n:
3680 #endif
3681
3682         while (n--) {
3683                 unsigned char c = *src++;
3684
3685                 if (BYTE_ASCII_P(c)) {  /* Processing ASCII character */
3686                         ch = 0;
3687                         encode_ucs4(Vcharset_ascii, c, 0, dst);
3688                         char_boundary = 1;
3689                 } else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) {   /* Processing Leading Byte */
3690                         ch = 0;
3691                         charset = CHARSET_BY_LEADING_BYTE(c);
3692                         if (LEADING_BYTE_PREFIX_P(c))
3693                                 ch = c;
3694                         char_boundary = 0;
3695                 } else {        /* Processing Non-ASCII character */
3696                         char_boundary = 1;
3697                         if (EQ(charset, Vcharset_control_1)) {
3698                                 encode_ucs4(Vcharset_control_1, c, 0, dst);
3699                         } else {
3700                                 switch (XCHARSET_REP_BYTES(charset)) {
3701                                 case 2:
3702                                         encode_ucs4(charset, c, 0, dst);
3703                                         break;
3704                                 case 3:
3705                                         if (XCHARSET_PRIVATE_P(charset)) {
3706                                                 encode_ucs4(charset, c, 0, dst);
3707                                                 ch = 0;
3708                                         } else if (ch) {
3709 #ifdef ENABLE_COMPOSITE_CHARS
3710                                                 if (EQ
3711                                                     (charset,
3712                                                      Vcharset_composite)) {
3713                                                         if (in_composite) {
3714                                                                 /* #### Bother! We don't know how to
3715                                                                    handle this yet. */
3716                                                                 Dynarr_add(dst,
3717                                                                            '\0');
3718                                                                 Dynarr_add(dst,
3719                                                                            '\0');
3720                                                                 Dynarr_add(dst,
3721                                                                            '\0');
3722                                                                 Dynarr_add(dst,
3723                                                                            '~');
3724                                                         } else {
3725                                                                 Emchar emch =
3726                                                                     MAKE_CHAR
3727                                                                     (Vcharset_composite,
3728                                                                      ch & 0x7F,
3729                                                                      c & 0x7F);
3730                                                                 Lisp_Object lstr
3731                                                                     =
3732                                                                     composite_char_string
3733                                                                     (emch);
3734                                                                 saved_n = n;
3735                                                                 saved_src = src;
3736                                                                 in_composite =
3737                                                                     1;
3738                                                                 src =
3739                                                                     XSTRING_DATA
3740                                                                     (lstr);
3741                                                                 n = XSTRING_LENGTH(lstr);
3742                                                         }
3743                                                 } else
3744 #endif                          /* ENABLE_COMPOSITE_CHARS */
3745                                                 {
3746                                                         encode_ucs4(charset, ch,
3747                                                                     c, dst);
3748                                                 }
3749                                                 ch = 0;
3750                                         } else {
3751                                                 ch = c;
3752                                                 char_boundary = 0;
3753                                         }
3754                                         break;
3755                                 case 4:
3756                                         if (ch) {
3757                                                 encode_ucs4(charset, ch, c,
3758                                                             dst);
3759                                                 ch = 0;
3760                                         } else {
3761                                                 ch = c;
3762                                                 char_boundary = 0;
3763                                         }
3764                                         break;
3765                                 default:
3766                                         abort();
3767                                 }
3768                         }
3769                 }
3770         }
3771
3772 #ifdef ENABLE_COMPOSITE_CHARS
3773         if (in_composite) {
3774                 n = saved_n;
3775                 src = saved_src;
3776                 in_composite = 0;
3777                 goto back_to_square_n;  /* Wheeeeeeeee ..... */
3778         }
3779 #endif                          /* ENABLE_COMPOSITE_CHARS */
3780
3781         str->flags = flags;
3782         str->ch = ch;
3783         str->iso2022.current_char_boundary = char_boundary;
3784         str->iso2022.current_charset = charset;
3785
3786         /* Verbum caro factum est! */
3787 }
3788 \f
3789 /************************************************************************/
3790 /*                           UTF-8 methods                              */
3791 /************************************************************************/
3792
3793 static int
3794 detect_coding_utf8(struct detection_state *st, const Extbyte * src,
3795                    Lstream_data_count n)
3796 {
3797         while (n--) {
3798                 const unsigned char c = *(const unsigned char *)src++;
3799                 switch (st->utf8.in_byte) {
3800                 case 0:
3801                         if (c == ISO_CODE_ESC || c == ISO_CODE_SI
3802                             || c == ISO_CODE_SO)
3803                                 return 0;
3804                         else if (c >= 0xfc)
3805                                 st->utf8.in_byte = 5;
3806                         else if (c >= 0xf8)
3807                                 st->utf8.in_byte = 4;
3808                         else if (c >= 0xf0)
3809                                 st->utf8.in_byte = 3;
3810                         else if (c >= 0xe0)
3811                                 st->utf8.in_byte = 2;
3812                         else if (c >= 0xc0)
3813                                 st->utf8.in_byte = 1;
3814                         else if (c >= 0x80)
3815                                 return 0;
3816                         break;
3817                 default:
3818                         if ((c & 0xc0) != 0x80)
3819                                 return 0;
3820                         else
3821                                 st->utf8.in_byte--;
3822                 }
3823         }
3824         return CODING_CATEGORY_UTF8_MASK;
3825 }
3826
3827 static void
3828 decode_coding_utf8(lstream_t decoding, const Extbyte * src,
3829                    unsigned_char_dynarr * dst, Lstream_data_count n)
3830 {
3831         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3832         unsigned int flags = str->flags;
3833         unsigned int ch = str->ch;
3834         eol_type_t eol_type = str->eol_type;
3835         unsigned char counter = str->counter;
3836
3837         while (n--) {
3838                 const unsigned char c = *(const unsigned char *)src++;
3839                 switch (counter) {
3840                 case 0:
3841                         if (c >= 0xfc) {
3842                                 ch = c & 0x01;
3843                                 counter = 5;
3844                         } else if (c >= 0xf8) {
3845                                 ch = c & 0x03;
3846                                 counter = 4;
3847                         } else if (c >= 0xf0) {
3848                                 ch = c & 0x07;
3849                                 counter = 3;
3850                         } else if (c >= 0xe0) {
3851                                 ch = c & 0x0f;
3852                                 counter = 2;
3853                         } else if (c >= 0xc0) {
3854                                 ch = c & 0x1f;
3855                                 counter = 1;
3856                         } else {
3857                                 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3858                                 decode_ucs4(c, dst);
3859                         }
3860                         break;
3861                 case 1:
3862                         ch = (ch << 6) | (c & 0x3f);
3863                         decode_ucs4(ch, dst);
3864                         ch = 0;
3865                         counter = 0;
3866                         break;
3867                 default:
3868                         ch = (ch << 6) | (c & 0x3f);
3869                         counter--;
3870                 }
3871               label_continue_loop:;
3872         }
3873
3874         if (flags & CODING_STATE_END)
3875                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
3876
3877         str->flags = flags;
3878         str->ch = ch;
3879         str->counter = counter;
3880 }
3881
3882 static void
3883 encode_utf8(Lisp_Object charset,
3884             unsigned char h, unsigned char l, unsigned_char_dynarr * dst)
3885 {
3886         unsigned long code = mule_char_to_ucs4(charset, h, l);
3887         if (code <= 0x7f) {
3888                 Dynarr_add(dst, code);
3889         } else if (code <= 0x7ff) {
3890                 Dynarr_add(dst, (code >> 6) | 0xc0);
3891                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3892         } else if (code <= 0xffff) {
3893                 Dynarr_add(dst, (code >> 12) | 0xe0);
3894                 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3895                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3896         } else if (code <= 0x1fffff) {
3897                 Dynarr_add(dst, (code >> 18) | 0xf0);
3898                 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3899                 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3900                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3901         } else if (code <= 0x3ffffff) {
3902                 Dynarr_add(dst, (code >> 24) | 0xf8);
3903                 Dynarr_add(dst, ((code >> 18) & 0x3f) | 0x80);
3904                 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3905                 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3906                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3907         } else {
3908                 Dynarr_add(dst, (code >> 30) | 0xfc);
3909                 Dynarr_add(dst, ((code >> 24) & 0x3f) | 0x80);
3910                 Dynarr_add(dst, ((code >> 18) & 0x3f) | 0x80);
3911                 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3912                 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3913                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3914         }
3915 }
3916
3917 static void
3918 encode_coding_utf8(lstream_t encoding, const Bufbyte * src,
3919                    unsigned_char_dynarr * dst, Lstream_data_count n)
3920 {
3921         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3922         unsigned int flags = str->flags;
3923         unsigned int ch = str->ch;
3924         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3925         unsigned char char_boundary = str->iso2022.current_char_boundary;
3926         Lisp_Object charset = str->iso2022.current_charset;
3927
3928 #ifdef ENABLE_COMPOSITE_CHARS
3929         /* flags for handling composite chars.  We do a little switcharoo
3930            on the source while we're outputting the composite char. */
3931         unsigned int saved_n = 0;
3932         const unsigned char *saved_src = NULL;
3933         int in_composite = 0;
3934
3935       back_to_square_n:
3936 #endif                          /* ENABLE_COMPOSITE_CHARS */
3937
3938         while (n--) {
3939                 unsigned char c = *src++;
3940
3941                 if (BYTE_ASCII_P(c)) {  /* Processing ASCII character */
3942                         ch = 0;
3943                         if (c == '\n') {
3944                                 if (eol_type != EOL_LF
3945                                     && eol_type != EOL_AUTODETECT)
3946                                         Dynarr_add(dst, '\r');
3947                                 if (eol_type != EOL_CR)
3948                                         Dynarr_add(dst, c);
3949                         } else
3950                                 encode_utf8(Vcharset_ascii, c, 0, dst);
3951                         char_boundary = 1;
3952                 } else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) {   /* Processing Leading Byte */
3953                         ch = 0;
3954                         charset = CHARSET_BY_LEADING_BYTE(c);
3955                         if (LEADING_BYTE_PREFIX_P(c))
3956                                 ch = c;
3957                         char_boundary = 0;
3958                 } else {        /* Processing Non-ASCII character */
3959                         char_boundary = 1;
3960                         if (EQ(charset, Vcharset_control_1)) {
3961                                 encode_utf8(Vcharset_control_1, c, 0, dst);
3962                         } else {
3963                                 switch (XCHARSET_REP_BYTES(charset)) {
3964                                 case 2:
3965                                         encode_utf8(charset, c, 0, dst);
3966                                         break;
3967                                 case 3:
3968                                         if (XCHARSET_PRIVATE_P(charset)) {
3969                                                 encode_utf8(charset, c, 0, dst);
3970                                                 ch = 0;
3971                                         } else if (ch) {
3972 #ifdef ENABLE_COMPOSITE_CHARS
3973                                                 if (EQ
3974                                                     (charset,
3975                                                      Vcharset_composite)) {
3976                                                         if (in_composite) {
3977                                                                 /* #### Bother! We don't know how to
3978                                                                    handle this yet. */
3979                                                                 encode_utf8
3980                                                                     (Vcharset_ascii,
3981                                                                      '~', 0,
3982                                                                      dst);
3983                                                         } else {
3984                                                                 Emchar emch =
3985                                                                     MAKE_CHAR
3986                                                                     (Vcharset_composite,
3987                                                                      ch & 0x7F,
3988                                                                      c & 0x7F);
3989                                                                 Lisp_Object lstr
3990                                                                     =
3991                                                                     composite_char_string
3992                                                                     (emch);
3993                                                                 saved_n = n;
3994                                                                 saved_src = src;
3995                                                                 in_composite =
3996                                                                     1;
3997                                                                 src =
3998                                                                     XSTRING_DATA
3999                                                                     (lstr);
4000                                                                 n = XSTRING_LENGTH(lstr);
4001                                                         }
4002                                                 } else
4003 #endif                          /* ENABLE_COMPOSITE_CHARS */
4004                                                 {
4005                                                         encode_utf8(charset, ch,
4006                                                                     c, dst);
4007                                                 }
4008                                                 ch = 0;
4009                                         } else {
4010                                                 ch = c;
4011                                                 char_boundary = 0;
4012                                         }
4013                                         break;
4014                                 case 4:
4015                                         if (ch) {
4016                                                 encode_utf8(charset, ch, c,
4017                                                             dst);
4018                                                 ch = 0;
4019                                         } else {
4020                                                 ch = c;
4021                                                 char_boundary = 0;
4022                                         }
4023                                         break;
4024                                 default:
4025                                         abort();
4026                                 }
4027                         }
4028                 }
4029         }
4030
4031 #ifdef ENABLE_COMPOSITE_CHARS
4032         if (in_composite) {
4033                 n = saved_n;
4034                 src = saved_src;
4035                 in_composite = 0;
4036                 goto back_to_square_n;  /* Wheeeeeeeee ..... */
4037         }
4038 #endif
4039
4040         str->flags = flags;
4041         str->ch = ch;
4042         str->iso2022.current_char_boundary = char_boundary;
4043         str->iso2022.current_charset = charset;
4044
4045         /* Verbum caro factum est! */
4046 }
4047 \f
4048 /************************************************************************/
4049 /*                           ISO2022 methods                            */
4050 /************************************************************************/
4051
4052 /* The following note describes the coding system ISO2022 briefly.
4053    Since the intention of this note is to help understand the
4054    functions in this file, some parts are NOT ACCURATE or OVERLY
4055    SIMPLIFIED.  For thorough understanding, please refer to the
4056    original document of ISO2022.
4057
4058    ISO2022 provides many mechanisms to encode several character sets
4059    in 7-bit and 8-bit environments.  For 7-bit environments, all text
4060    is encoded using bytes less than 128.  This may make the encoded
4061    text a little bit longer, but the text passes more easily through
4062    several gateways, some of which strip off MSB (Most Significant Bit).
4063
4064    There are two kinds of character sets: control character set and
4065    graphic character set.  The former contains control characters such
4066    as `newline' and `escape' to provide control functions (control
4067    functions are also provided by escape sequences).  The latter
4068    contains graphic characters such as 'A' and '-'.  Emacs recognizes
4069    two control character sets and many graphic character sets.
4070
4071    Graphic character sets are classified into one of the following
4072    four classes, according to the number of bytes (DIMENSION) and
4073    number of characters in one dimension (CHARS) of the set:
4074    - DIMENSION1_CHARS94
4075    - DIMENSION1_CHARS96
4076    - DIMENSION2_CHARS94
4077    - DIMENSION2_CHARS96
4078
4079    In addition, each character set is assigned an identification tag,
4080    unique for each set, called "final character" (denoted as <F>
4081    hereafter).  The <F> of each character set is decided by ECMA(*)
4082    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
4083    (0x30..0x3F are for private use only).
4084
4085    Note (*): ECMA = European Computer Manufacturers Association
4086
4087    Here are examples of graphic character set [NAME(<F>)]:
4088         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4089         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4090         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4091         o DIMENSION2_CHARS96 -- none for the moment
4092
4093    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4094         C0 [0x00..0x1F] -- control character plane 0
4095         GL [0x20..0x7F] -- graphic character plane 0
4096         C1 [0x80..0x9F] -- control character plane 1
4097         GR [0xA0..0xFF] -- graphic character plane 1
4098
4099    A control character set is directly designated and invoked to C0 or
4100    C1 by an escape sequence.  The most common case is that:
4101    - ISO646's  control character set is designated/invoked to C0, and
4102    - ISO6429's control character set is designated/invoked to C1,
4103    and usually these designations/invocations are omitted in encoded
4104    text.  In a 7-bit environment, only C0 can be used, and a control
4105    character for C1 is encoded by an appropriate escape sequence to
4106    fit into the environment.  All control characters for C1 are
4107    defined to have corresponding escape sequences.
4108
4109    A graphic character set is at first designated to one of four
4110    graphic registers (G0 through G3), then these graphic registers are
4111    invoked to GL or GR.  These designations and invocations can be
4112    done independently.  The most common case is that G0 is invoked to
4113    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
4114    these invocations and designations are omitted in encoded text.
4115    In a 7-bit environment, only GL can be used.
4116
4117    When a graphic character set of CHARS94 is invoked to GL, codes
4118    0x20 and 0x7F of the GL area work as control characters SPACE and
4119    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4120    be used.
4121
4122    There are two ways of invocation: locking-shift and single-shift.
4123    With locking-shift, the invocation lasts until the next different
4124    invocation, whereas with single-shift, the invocation affects the
4125    following character only and doesn't affect the locking-shift
4126    state.  Invocations are done by the following control characters or
4127    escape sequences:
4128
4129    ----------------------------------------------------------------------
4130    abbrev  function                  cntrl escape seq   description
4131    ----------------------------------------------------------------------
4132    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
4133    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
4134    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
4135    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
4136    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
4137    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
4138    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
4139    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
4140    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
4141    ----------------------------------------------------------------------
4142    (*) These are not used by any known coding system.
4143
4144    Control characters for these functions are defined by macros
4145    ISO_CODE_XXX in `coding.h'.
4146
4147    Designations are done by the following escape sequences:
4148    ----------------------------------------------------------------------
4149    escape sequence      description
4150    ----------------------------------------------------------------------
4151    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
4152    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
4153    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
4154    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
4155    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
4156    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
4157    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
4158    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
4159    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
4160    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
4161    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
4162    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
4163    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
4164    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
4165    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
4166    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
4167    ----------------------------------------------------------------------
4168
4169    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4170    of dimension 1, chars 94, and final character <F>, etc...
4171
4172    Note (*): Although these designations are not allowed in ISO2022,
4173    Emacs accepts them on decoding, and produces them on encoding
4174    CHARS96 character sets in a coding system which is characterized as
4175    7-bit environment, non-locking-shift, and non-single-shift.
4176
4177    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4178    '(' can be omitted.  We refer to this as "short-form" hereafter.
4179
4180    Now you may notice that there are a lot of ways for encoding the
4181    same multilingual text in ISO2022.  Actually, there exist many
4182    coding systems such as Compound Text (used in X11's inter client
4183    communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4184    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4185    localized platforms), and all of these are variants of ISO2022.
4186
4187    In addition to the above, Emacs handles two more kinds of escape
4188    sequences: ISO6429's direction specification and Emacs' private
4189    sequence for specifying character composition.
4190
4191    ISO6429's direction specification takes the following form:
4192         o CSI ']'      -- end of the current direction
4193         o CSI '0' ']'  -- end of the current direction
4194         o CSI '1' ']'  -- start of left-to-right text
4195         o CSI '2' ']'  -- start of right-to-left text
4196    The control character CSI (0x9B: control sequence introducer) is
4197    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4198
4199    Character composition specification takes the following form:
4200         o ESC '0' -- start character composition
4201         o ESC '1' -- end character composition
4202    Since these are not standard escape sequences of any ISO standard,
4203    their use with these meanings is restricted to Emacs only.  */
4204
4205 static void
4206 reset_iso2022(Lisp_Object coding_system, struct iso2022_decoder *iso)
4207 {
4208         int i;
4209
4210         for (i = 0; i < 4; i++) {
4211                 if (!NILP(coding_system))
4212                         iso->charset[i] =
4213                             XCODING_SYSTEM_ISO2022_INITIAL_CHARSET
4214                             (coding_system, i);
4215                 else
4216                         iso->charset[i] = Qt;
4217                 iso->invalid_designated[i] = 0;
4218         }
4219         iso->esc = ISO_ESC_NOTHING;
4220         iso->esc_bytes_index = 0;
4221         iso->register_left = 0;
4222         iso->register_right = 1;
4223         iso->switched_dir_and_no_valid_charset_yet = 0;
4224         iso->invalid_switch_dir = 0;
4225         iso->output_direction_sequence = 0;
4226         iso->output_literally = 0;
4227 #ifdef ENABLE_COMPOSITE_CHARS
4228         if (iso->composite_chars)
4229                 Dynarr_reset(iso->composite_chars);
4230 #endif
4231 }
4232
4233 static int fit_to_be_escape_quoted(unsigned char c)
4234 {
4235         switch (c) {
4236         case ISO_CODE_ESC:
4237         case ISO_CODE_CSI:
4238         case ISO_CODE_SS2:
4239         case ISO_CODE_SS3:
4240         case ISO_CODE_SO:
4241         case ISO_CODE_SI:
4242                 return 1;
4243
4244         default:
4245                 return 0;
4246         }
4247 }
4248
4249 /* Parse one byte of an ISO2022 escape sequence.
4250    If the result is an invalid escape sequence, return 0 and
4251    do not change anything in STR.  Otherwise, if the result is
4252    an incomplete escape sequence, update ISO2022.ESC and
4253    ISO2022.ESC_BYTES and return -1.  Otherwise, update
4254    all the state variables (but not ISO2022.ESC_BYTES) and
4255    return 1.
4256
4257    If CHECK_INVALID_CHARSETS is non-zero, check for designation
4258    or invocation of an invalid character set and treat that as
4259    an unrecognized escape sequence.
4260
4261    ********************************************************************
4262
4263    #### Strategies for error annotation and coding orthogonalization
4264
4265    We really want to separate out a number of things.  Conceptually,
4266    there is a nested syntax.
4267
4268    At the top level is the ISO 2022 extension syntax, including charset
4269    designation and invocation, and certain auxiliary controls such as the
4270    ISO 6429 direction specification.  These are octet-oriented, with the
4271    single exception (AFAIK) of the "exit Unicode" sequence which uses the
4272    UTF's natural width (1 byte for UTF-7 and UTF-8, 2 bytes for UCS-2 and
4273    UTF-16, and 4 bytes for UCS-4 and UTF-32).  This will be treated as a
4274    (deprecated) special case in Unicode processing.
4275
4276    The middle layer is ISO 2022 character interpretation.  This will depend
4277    on the current state of the ISO 2022 registers, and assembles octets
4278    into the character's internal representation.
4279
4280    The lowest level is translating system control conventions.  At present
4281    this is restricted to newline translation, but one could imagine doing
4282    tab conversion or line wrapping here.  "Escape from Unicode" processing
4283    would be done at this level.
4284
4285    At each level the parser will verify the syntax.  In the case of a
4286    syntax error or warning (such as a redundant escape sequence that affects
4287    no characters), the parser will take some action, typically inserting the
4288    erroneous octets directly into the output and creating an annotation
4289    which can be used by higher level I/O to mark the affected region.
4290
4291    This should make it possible to do something sensible about separating
4292    newline convention processing from character construction, and about
4293    preventing ISO 2022 escape sequences from being recognized
4294    inappropriately.
4295
4296    The basic strategy will be to have octet classification tables, and
4297    switch processing according to the table entry.
4298
4299    It's possible that, by doing the processing with tables of functions or
4300    the like, the parser can be used for both detection and translation. */
4301
4302 static int
4303 parse_iso2022_esc(Lisp_Object codesys, struct iso2022_decoder *iso,
4304                   unsigned char c, unsigned int *flags,
4305                   int check_invalid_charsets)
4306 {
4307         /* (1) If we're at the end of a designation sequence, CS is the
4308            charset being designated and REG is the register to designate
4309            it to.
4310
4311            (2) If we're at the end of a locking-shift sequence, REG is
4312            the register to invoke and HALF (0 == left, 1 == right) is
4313            the half to invoke it into.
4314
4315            (3) If we're at the end of a single-shift sequence, REG is
4316            the register to invoke. */
4317         Lisp_Object cs = Qnil;
4318         int reg, half;
4319
4320         /* NOTE: This code does goto's all over the fucking place.
4321            The reason for this is that we're basically implementing
4322            a state machine here, and hierarchical languages like C
4323            don't really provide a clean way of doing this. */
4324
4325         if (!(*flags & CODING_STATE_ESCAPE))
4326                 /* At beginning of escape sequence; we need to reset our
4327                    escape-state variables. */
4328                 iso->esc = ISO_ESC_NOTHING;
4329
4330         iso->output_literally = 0;
4331         iso->output_direction_sequence = 0;
4332
4333         switch (iso->esc) {
4334         case ISO_ESC_NOTHING:
4335                 iso->esc_bytes_index = 0;
4336                 switch (c) {
4337                 case ISO_CODE_ESC:      /* Start escape sequence */
4338                         *flags |= CODING_STATE_ESCAPE;
4339                         iso->esc = ISO_ESC;
4340                         goto not_done;
4341
4342                 case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
4343                         *flags |= CODING_STATE_ESCAPE;
4344                         iso->esc = ISO_ESC_5_11;
4345                         goto not_done;
4346
4347                 case ISO_CODE_SO:       /* locking shift 1 */
4348                         reg = 1;
4349                         half = 0;
4350                         goto locking_shift;
4351                 case ISO_CODE_SI:       /* locking shift 0 */
4352                         reg = 0;
4353                         half = 0;
4354                         goto locking_shift;
4355
4356                 case ISO_CODE_SS2:      /* single shift */
4357                         reg = 2;
4358                         goto single_shift;
4359                 case ISO_CODE_SS3:      /* single shift */
4360                         reg = 3;
4361                         goto single_shift;
4362
4363                 default:        /* Other control characters */
4364                         return 0;
4365                 }
4366
4367         case ISO_ESC:
4368                 switch (c) {
4369           /**** single shift ****/
4370
4371                 case 'N':       /* single shift 2 */
4372                         reg = 2;
4373                         goto single_shift;
4374                 case 'O':       /* single shift 3 */
4375                         reg = 3;
4376                         goto single_shift;
4377
4378           /**** locking shift ****/
4379
4380                 case '~':       /* locking shift 1 right */
4381                         reg = 1;
4382                         half = 1;
4383                         goto locking_shift;
4384                 case 'n':       /* locking shift 2 */
4385                         reg = 2;
4386                         half = 0;
4387                         goto locking_shift;
4388                 case '}':       /* locking shift 2 right */
4389                         reg = 2;
4390                         half = 1;
4391                         goto locking_shift;
4392                 case 'o':       /* locking shift 3 */
4393                         reg = 3;
4394                         half = 0;
4395                         goto locking_shift;
4396                 case '|':       /* locking shift 3 right */
4397                         reg = 3;
4398                         half = 1;
4399                         goto locking_shift;
4400
4401 #ifdef ENABLE_COMPOSITE_CHARS
4402           /**** composite ****/
4403
4404                 case '0':
4405                         iso->esc = ISO_ESC_START_COMPOSITE;
4406                         *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4407                             CODING_STATE_COMPOSITE;
4408                         return 1;
4409
4410                 case '1':
4411                         iso->esc = ISO_ESC_END_COMPOSITE;
4412                         *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4413                             ~CODING_STATE_COMPOSITE;
4414                         return 1;
4415 #endif                          /* ENABLE_COMPOSITE_CHARS */
4416
4417           /**** directionality ****/
4418
4419                 case '[':
4420                         iso->esc = ISO_ESC_5_11;
4421                         goto not_done;
4422
4423           /**** designation ****/
4424
4425                 case '$':       /* multibyte charset prefix */
4426                         iso->esc = ISO_ESC_2_4;
4427                         goto not_done;
4428
4429                 default:
4430                         if (0x28 <= c && c <= 0x2F) {
4431                                 iso->esc =
4432                                     (enum iso_esc_flag)(c - 0x28 + ISO_ESC_2_8);
4433                                 goto not_done;
4434                         }
4435
4436                         /* This function is called with CODESYS equal to nil when
4437                            doing coding-system detection. */
4438                         if (!NILP(codesys)
4439                             && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
4440                             && fit_to_be_escape_quoted(c)) {
4441                                 iso->esc = ISO_ESC_LITERAL;
4442                                 *flags &= CODING_STATE_ISO2022_LOCK;
4443                                 return 1;
4444                         }
4445
4446                         /* bzzzt! */
4447                         return 0;
4448                 }
4449
4450       /**** directionality ****/
4451
4452         case ISO_ESC_5_11:      /* ISO6429 direction control */
4453                 if (c == ']') {
4454                         *flags &=
4455                             (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4456                         goto directionality;
4457                 }
4458                 if (c == '0')
4459                         iso->esc = ISO_ESC_5_11_0;
4460                 else if (c == '1')
4461                         iso->esc = ISO_ESC_5_11_1;
4462                 else if (c == '2')
4463                         iso->esc = ISO_ESC_5_11_2;
4464                 else
4465                         return 0;
4466                 goto not_done;
4467
4468         case ISO_ESC_5_11_0:
4469                 if (c == ']') {
4470                         *flags &=
4471                             (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4472                         goto directionality;
4473                 }
4474                 return 0;
4475
4476         case ISO_ESC_5_11_1:
4477                 if (c == ']') {
4478                         *flags =
4479                             (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4480                         goto directionality;
4481                 }
4482                 return 0;
4483
4484         case ISO_ESC_5_11_2:
4485                 if (c == ']') {
4486                         *flags =
4487                             (*flags & CODING_STATE_ISO2022_LOCK) |
4488                             CODING_STATE_R2L;
4489                         goto directionality;
4490                 }
4491                 return 0;
4492
4493         directionality:
4494                 iso->esc = ISO_ESC_DIRECTIONALITY;
4495                 /* Various junk here to attempt to preserve the direction
4496                    sequences literally in the text if they would otherwise be
4497                    swallowed due to invalid designations that don't show up as
4498                    actual charset changes in the text. */
4499                 if (iso->invalid_switch_dir) {
4500                         /* We already inserted a direction switch literally into
4501                            the text.  We assume (#### this may not be right)
4502                            that the next direction switch is the one going the
4503                            other way, and we need to output that literally as
4504                            well. */
4505                         iso->output_literally = 1;
4506                         iso->invalid_switch_dir = 0;
4507                 } else {
4508                         int jj;
4509
4510                         /* If we are in the thrall of an invalid designation,
4511                            then stick the directionality sequence literally into
4512                            the output stream so it ends up in the original text
4513                            again. */
4514                         for (jj = 0; jj < 4; jj++)
4515                                 if (iso->invalid_designated[jj])
4516                                         break;
4517                         if (jj < 4) {
4518                                 iso->output_literally = 1;
4519                                 iso->invalid_switch_dir = 1;
4520                         } else
4521                                 /* Indicate that we haven't yet seen a valid
4522                                    designation, so that if a switch-dir is
4523                                    directly followed by an invalid designation,
4524                                    both get inserted literally. */
4525                                 iso->switched_dir_and_no_valid_charset_yet = 1;
4526                 }
4527                 return 1;
4528
4529                 /**** designation ****/
4530
4531         case ISO_ESC_2_4:
4532                 if (0x28 <= c && c <= 0x2F) {
4533                         iso->esc =
4534                             (enum iso_esc_flag)(c - 0x28 + ISO_ESC_2_4_8);
4535                         goto not_done;
4536                 }
4537                 if (0x40 <= c && c <= 0x42) {
4538                         cs = CHARSET_BY_ATTRIBUTES(CHARSET_TYPE_94X94, c,
4539                                                    *flags & CODING_STATE_R2L ?
4540                                                    CHARSET_RIGHT_TO_LEFT :
4541                                                    CHARSET_LEFT_TO_RIGHT);
4542                         reg = 0;
4543                         goto designated;
4544                 }
4545                 return 0;
4546
4547                 /* list the rest */
4548         case ISO_ESC_2_8:
4549         case ISO_ESC_2_9:
4550         case ISO_ESC_2_10:
4551         case ISO_ESC_2_11:
4552         case ISO_ESC_2_12:
4553         case ISO_ESC_2_13:
4554         case ISO_ESC_2_14:
4555         case ISO_ESC_2_15:
4556         case ISO_ESC_2_4_8:
4557         case ISO_ESC_2_4_9:
4558         case ISO_ESC_2_4_10:
4559         case ISO_ESC_2_4_11:
4560         case ISO_ESC_2_4_12:
4561         case ISO_ESC_2_4_13:
4562         case ISO_ESC_2_4_14:
4563         case ISO_ESC_2_4_15:
4564         case ISO_ESC_SINGLE_SHIFT:
4565         case ISO_ESC_LOCKING_SHIFT:
4566         case ISO_ESC_DESIGNATE:
4567         case ISO_ESC_DIRECTIONALITY:
4568         case ISO_ESC_LITERAL:
4569
4570         default: {
4571                 int type = -1;
4572
4573                 if (c < '0' || c > '~')
4574                         return 0;       /* bad final byte */
4575
4576                 if (iso->esc >= ISO_ESC_2_8 && iso->esc <= ISO_ESC_2_15) {
4577                         type = ((iso->esc >= ISO_ESC_2_12) ?
4578                                 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4579                         reg = (iso->esc - ISO_ESC_2_8) & 3;
4580                 } else if (iso->esc >= ISO_ESC_2_4_8 &&
4581                            iso->esc <= ISO_ESC_2_4_15) {
4582                         type = ((iso->esc >= ISO_ESC_2_4_12) ?
4583                                 CHARSET_TYPE_96X96 :
4584                                 CHARSET_TYPE_94X94);
4585                         reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4586                 } else {
4587                         /* Can this ever be reached? -slb */
4588                         abort();
4589                         return 0;
4590                 }
4591
4592                 cs = CHARSET_BY_ATTRIBUTES(type, c,
4593                                            *flags & CODING_STATE_R2L ?
4594                                            CHARSET_RIGHT_TO_LEFT :
4595                                            CHARSET_LEFT_TO_RIGHT);
4596                 goto designated;
4597         }
4598         }
4599
4600       not_done:
4601         iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char)c;
4602         return -1;
4603
4604       single_shift:
4605         if (check_invalid_charsets && !CHARSETP(iso->charset[reg]))
4606                 /* can't invoke something that ain't there. */
4607                 return 0;
4608         iso->esc = ISO_ESC_SINGLE_SHIFT;
4609         *flags &= CODING_STATE_ISO2022_LOCK;
4610         if (reg == 2)
4611                 *flags |= CODING_STATE_SS2;
4612         else
4613                 *flags |= CODING_STATE_SS3;
4614         return 1;
4615
4616       locking_shift:
4617         if (check_invalid_charsets && !CHARSETP(iso->charset[reg]))
4618                 /* can't invoke something that ain't there. */
4619                 return 0;
4620         if (half)
4621                 iso->register_right = reg;
4622         else
4623                 iso->register_left = reg;
4624         *flags &= CODING_STATE_ISO2022_LOCK;
4625         iso->esc = ISO_ESC_LOCKING_SHIFT;
4626         return 1;
4627
4628       designated:
4629         if (NILP(cs) && check_invalid_charsets) {
4630                 iso->invalid_designated[reg] = 1;
4631                 iso->charset[reg] = Vcharset_ascii;
4632                 iso->esc = ISO_ESC_DESIGNATE;
4633                 *flags &= CODING_STATE_ISO2022_LOCK;
4634                 iso->output_literally = 1;
4635                 if (iso->switched_dir_and_no_valid_charset_yet) {
4636                         /* We encountered a switch-direction followed by an
4637                            invalid designation.  Ensure that the switch-direction
4638                            gets outputted; otherwise it will probably get eaten
4639                            when the text is written out again. */
4640                         iso->switched_dir_and_no_valid_charset_yet = 0;
4641                         iso->output_direction_sequence = 1;
4642                         /* And make sure that the switch-dir going the other
4643                            way gets outputted, as well. */
4644                         iso->invalid_switch_dir = 1;
4645                 }
4646                 return 1;
4647         }
4648         /* This function is called with CODESYS equal to nil when
4649            doing coding-system detection. */
4650         if (!NILP(codesys)) {
4651                 charset_conversion_spec_dynarr *dyn =
4652                     XCODING_SYSTEM(codesys)->iso2022.input_conv;
4653
4654                 if (dyn) {
4655                         int i;
4656
4657                         for (i = 0; i < Dynarr_length(dyn); i++) {
4658                                 struct charset_conversion_spec *spec =
4659                                     Dynarr_atp(dyn, i);
4660                                 if (EQ(cs, spec->from_charset))
4661                                         cs = spec->to_charset;
4662                         }
4663                 }
4664         }
4665
4666         iso->charset[reg] = cs;
4667         iso->esc = ISO_ESC_DESIGNATE;
4668         *flags &= CODING_STATE_ISO2022_LOCK;
4669         if (iso->invalid_designated[reg]) {
4670                 iso->invalid_designated[reg] = 0;
4671                 iso->output_literally = 1;
4672         }
4673         if (iso->switched_dir_and_no_valid_charset_yet)
4674                 iso->switched_dir_and_no_valid_charset_yet = 0;
4675         return 1;
4676 }
4677
4678 static int
4679 detect_coding_iso2022(struct detection_state *st, const Extbyte * src,
4680                       Lstream_data_count n)
4681 {
4682         int mask;
4683
4684         /* #### There are serious deficiencies in the recognition mechanism
4685            here.  This needs to be much smarter if it's going to cut it.
4686            The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4687            it should be detected as Latin-1.
4688            All the ISO2022 stuff in this file should be synced up with the
4689            code from FSF Emacs-20.4, in which Mule should be more or less stable.
4690            Perhaps we should wait till R2L works in FSF Emacs? */
4691
4692         if (!st->iso2022.initted) {
4693                 reset_iso2022(Qnil, &st->iso2022.iso);
4694                 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4695                                     CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4696                                     CODING_CATEGORY_ISO_8_1_MASK |
4697                                     CODING_CATEGORY_ISO_8_2_MASK |
4698                                     CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4699                 st->iso2022.flags = 0;
4700                 st->iso2022.high_byte_count = 0;
4701                 st->iso2022.saw_single_shift = 0;
4702                 st->iso2022.initted = 1;
4703         }
4704
4705         mask = st->iso2022.mask;
4706
4707         while (n--) {
4708                 const unsigned char c = *(const unsigned char *)src++;
4709                 if (c >= 0xA0) {
4710                         mask &= ~CODING_CATEGORY_ISO_7_MASK;
4711                         st->iso2022.high_byte_count++;
4712                 } else {
4713                         if (st->iso2022.high_byte_count
4714                             && !st->iso2022.saw_single_shift) {
4715                                 if (st->iso2022.high_byte_count & 1)
4716                                         /* odd number of high bytes; assume not iso-8-2 */
4717                                         mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4718                         }
4719                         st->iso2022.high_byte_count = 0;
4720                         st->iso2022.saw_single_shift = 0;
4721                         if (c > 0x80)
4722                                 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4723                 }
4724                 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4725                     && (BYTE_C0_P(c) || BYTE_C1_P(c))) {        /* control chars */
4726                         switch (c) {
4727                                 /* Allow and ignore control characters that you might
4728                                    reasonably see in a text file */
4729                         case '\r':
4730                         case '\n':
4731                         case '\t':
4732                         case 7: /* bell */
4733                         case 8: /* backspace */
4734                         case 11:        /* vertical tab */
4735                         case 12:        /* form feed */
4736                         case 26:        /* MS-DOS C-z junk */
4737                         case 31:        /* '^_' -- for info */
4738                                 goto label_continue_loop;
4739
4740                         default:
4741                                 break;
4742                         }
4743                 }
4744
4745                 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P(c)
4746                     || BYTE_C1_P(c)) {
4747                         if (parse_iso2022_esc(Qnil, &st->iso2022.iso, c,
4748                                               &st->iso2022.flags, 0)) {
4749                                 switch (st->iso2022.iso.esc) {
4750                                 case ISO_ESC_DESIGNATE:
4751                                         mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4752                                         mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4753                                         break;
4754                                 case ISO_ESC_LOCKING_SHIFT:
4755                                         mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4756                                         goto ran_out_of_chars;
4757                                 case ISO_ESC_SINGLE_SHIFT:
4758                                         mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4759                                         st->iso2022.saw_single_shift = 1;
4760                                         break;
4761
4762                                         /* list the rest */
4763                                 case ISO_ESC_NOTHING:
4764                                 case ISO_ESC:
4765                                 case ISO_ESC_2_4:
4766                                 case ISO_ESC_2_8:
4767                                 case ISO_ESC_2_9:
4768                                 case ISO_ESC_2_10:
4769                                 case ISO_ESC_2_11:
4770                                 case ISO_ESC_2_12:
4771                                 case ISO_ESC_2_13:
4772                                 case ISO_ESC_2_14:
4773                                 case ISO_ESC_2_15:
4774                                 case ISO_ESC_2_4_8:
4775                                 case ISO_ESC_2_4_9:
4776                                 case ISO_ESC_2_4_10:
4777                                 case ISO_ESC_2_4_11:
4778                                 case ISO_ESC_2_4_12:
4779                                 case ISO_ESC_2_4_13:
4780                                 case ISO_ESC_2_4_14:
4781                                 case ISO_ESC_2_4_15:
4782                                 case ISO_ESC_5_11:
4783                                 case ISO_ESC_5_11_0:
4784                                 case ISO_ESC_5_11_1:
4785                                 case ISO_ESC_5_11_2:
4786                                 case ISO_ESC_DIRECTIONALITY:
4787                                 case ISO_ESC_LITERAL:
4788                                 default:
4789                                         break;
4790                                 }
4791                         } else {
4792                                 mask = 0;
4793                                 goto ran_out_of_chars;
4794                         }
4795                 }
4796         label_continue_loop:;
4797         }
4798
4799 ran_out_of_chars:
4800         return mask;
4801 }
4802
4803 static int postprocess_iso2022_mask(int mask)
4804 {
4805         /* #### kind of cheesy */
4806         /* If seven-bit ISO is allowed, then assume that the encoding is
4807            entirely seven-bit and turn off the eight-bit ones. */
4808         if (mask & CODING_CATEGORY_ISO_7_MASK)
4809                 mask &= ~(CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4810                           CODING_CATEGORY_ISO_8_1_MASK |
4811                           CODING_CATEGORY_ISO_8_2_MASK);
4812         return mask;
4813 }
4814
4815 /* If FLAGS is a null pointer or specifies right-to-left motion,
4816    output a switch-dir-to-left-to-right sequence to DST.
4817    Also update FLAGS if it is not a null pointer.
4818    If INTERNAL_P is set, we are outputting in internal format and
4819    need to handle the CSI differently. */
4820
4821 static void
4822 restore_left_to_right_direction(Lisp_Coding_System * codesys,
4823                                 unsigned_char_dynarr * dst,
4824                                 unsigned int *flags, int internal_p)
4825 {
4826         if (!flags || (*flags & CODING_STATE_R2L)) {
4827                 if (CODING_SYSTEM_ISO2022_SEVEN(codesys)) {
4828                         Dynarr_add(dst, ISO_CODE_ESC);
4829                         Dynarr_add(dst, '[');
4830                 } else if (internal_p)
4831                         DECODE_ADD_BINARY_CHAR(ISO_CODE_CSI, dst);
4832                 else
4833                         Dynarr_add(dst, ISO_CODE_CSI);
4834                 Dynarr_add(dst, '0');
4835                 Dynarr_add(dst, ']');
4836                 if (flags)
4837                         *flags &= ~CODING_STATE_R2L;
4838         }
4839 }
4840
4841 /* If FLAGS is a null pointer or specifies a direction different from
4842    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4843    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4844    sequence to DST.  Also update FLAGS if it is not a null pointer.
4845    If INTERNAL_P is set, we are outputting in internal format and
4846    need to handle the CSI differently. */
4847
4848 static void
4849 ensure_correct_direction(int direction, Lisp_Coding_System * codesys,
4850                          unsigned_char_dynarr * dst, unsigned int *flags,
4851                          int internal_p)
4852 {
4853         if ((!flags || (*flags & CODING_STATE_R2L)) &&
4854             direction == CHARSET_LEFT_TO_RIGHT)
4855                 restore_left_to_right_direction(codesys, dst, flags,
4856                                                 internal_p);
4857         else if (!CODING_SYSTEM_ISO2022_NO_ISO6429(codesys)
4858                  && (!flags || !(*flags & CODING_STATE_R2L)) &&
4859                  direction == CHARSET_RIGHT_TO_LEFT) {
4860                 if (CODING_SYSTEM_ISO2022_SEVEN(codesys)) {
4861                         Dynarr_add(dst, ISO_CODE_ESC);
4862                         Dynarr_add(dst, '[');
4863                 } else if (internal_p)
4864                         DECODE_ADD_BINARY_CHAR(ISO_CODE_CSI, dst);
4865                 else
4866                         Dynarr_add(dst, ISO_CODE_CSI);
4867                 Dynarr_add(dst, '2');
4868                 Dynarr_add(dst, ']');
4869                 if (flags)
4870                         *flags |= CODING_STATE_R2L;
4871         }
4872 }
4873
4874 /* Convert ISO2022-format data to internal format. */
4875
4876 static void
4877 decode_coding_iso2022(lstream_t decoding, const Extbyte * src,
4878                       unsigned_char_dynarr * dst, Lstream_data_count n)
4879 {
4880         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
4881         unsigned int flags = str->flags;
4882         unsigned int ch = str->ch;
4883         eol_type_t eol_type = str->eol_type;
4884 #ifdef ENABLE_COMPOSITE_CHARS
4885         unsigned_char_dynarr *real_dst = dst;
4886 #endif
4887         Lisp_Object coding_system;
4888
4889         XSETCODING_SYSTEM(coding_system, str->codesys);
4890
4891 #ifdef ENABLE_COMPOSITE_CHARS
4892         if (flags & CODING_STATE_COMPOSITE)
4893                 dst = str->iso2022.composite_chars;
4894 #endif                          /* ENABLE_COMPOSITE_CHARS */
4895
4896         while (n--) {
4897                 const unsigned char c = *(const unsigned char *)src++;
4898                 if (flags & CODING_STATE_ESCAPE) {
4899                         /* Within ESC sequence */
4900                         int retval = parse_iso2022_esc(
4901                                 coding_system, &str->iso2022, c, &flags, 1);
4902
4903                         if (retval) {
4904                                 switch (str->iso2022.esc) {
4905 #ifdef ENABLE_COMPOSITE_CHARS
4906                                 case ISO_ESC_START_COMPOSITE:
4907                                         if (str->iso2022.composite_chars)
4908                                                 Dynarr_reset(str->iso2022.
4909                                                              composite_chars);
4910                                         else
4911                                                 str->iso2022.composite_chars =
4912                                                     Dynarr_new(unsigned_char);
4913                                         dst = str->iso2022.composite_chars;
4914                                         break;
4915                                 case ISO_ESC_END_COMPOSITE:
4916                                         {
4917                                                 Bufbyte comstr[MAX_EMCHAR_LEN];
4918                                                 Bytecount len;
4919                                                 Emchar emch =
4920                                                     lookup_composite_char
4921                                                     (Dynarr_atp(dst, 0),
4922                                                      Dynarr_length(dst));
4923                                                 dst = real_dst;
4924                                                 len =
4925                                                     set_charptr_emchar(comstr,
4926                                                                        emch);
4927                                                 Dynarr_add_many(dst, comstr,
4928                                                                 len);
4929                                                 break;
4930                                         }
4931 #endif                          /* ENABLE_COMPOSITE_CHARS */
4932
4933                                 case ISO_ESC_LITERAL:
4934                                         DECODE_ADD_BINARY_CHAR(c, dst);
4935                                         break;
4936
4937                                 case ISO_ESC_NOTHING:
4938                                 case ISO_ESC:
4939                                 case ISO_ESC_2_4:
4940                                 case ISO_ESC_2_8:
4941                                 case ISO_ESC_2_9:
4942                                 case ISO_ESC_2_10:
4943                                 case ISO_ESC_2_11:
4944                                 case ISO_ESC_2_12:
4945                                 case ISO_ESC_2_13:
4946                                 case ISO_ESC_2_14:
4947                                 case ISO_ESC_2_15:
4948                                 case ISO_ESC_2_4_8:
4949                                 case ISO_ESC_2_4_9:
4950                                 case ISO_ESC_2_4_10:
4951                                 case ISO_ESC_2_4_11:
4952                                 case ISO_ESC_2_4_12:
4953                                 case ISO_ESC_2_4_13:
4954                                 case ISO_ESC_2_4_14:
4955                                 case ISO_ESC_2_4_15:
4956                                 case ISO_ESC_5_11:
4957                                 case ISO_ESC_5_11_0:
4958                                 case ISO_ESC_5_11_1:
4959                                 case ISO_ESC_5_11_2:
4960                                 case ISO_ESC_SINGLE_SHIFT:
4961                                 case ISO_ESC_LOCKING_SHIFT:
4962                                 case ISO_ESC_DESIGNATE:
4963                                 case ISO_ESC_DIRECTIONALITY:
4964
4965                                 default:
4966                                         /* Everything else handled already */
4967                                         break;
4968                                 }
4969                         }
4970
4971                         /* Attempted error recovery. */
4972                         if (str->iso2022.output_direction_sequence)
4973                                 ensure_correct_direction(flags &
4974                                                          CODING_STATE_R2L ?
4975                                                          CHARSET_RIGHT_TO_LEFT :
4976                                                          CHARSET_LEFT_TO_RIGHT,
4977                                                          str->codesys, dst, 0,
4978                                                          1);
4979                         /* More error recovery. */
4980                         if (!retval || str->iso2022.output_literally) {
4981                                 /* Output the (possibly invalid) sequence */
4982                                 int i;
4983                                 for (i = 0; i < str->iso2022.esc_bytes_index;
4984                                      i++)
4985                                         DECODE_ADD_BINARY_CHAR(str->iso2022.
4986                                                                esc_bytes[i],
4987                                                                dst);
4988                                 flags &= CODING_STATE_ISO2022_LOCK;
4989                                 if (!retval)
4990                                         n++, src--;     /* Repeat the loop with the same character. */
4991                                 else {
4992                                         /* No sense in reprocessing the final byte of the
4993                                            escape sequence; it could mess things up anyway.
4994                                            Just add it now. */
4995                                         DECODE_ADD_BINARY_CHAR(c, dst);
4996                                 }
4997                         }
4998                         ch = 0;
4999                 } else if (BYTE_C0_P(c) || BYTE_C1_P(c)) {      /* Control characters */
5000
5001           /***** Error-handling *****/
5002
5003                         /* If we were in the middle of a character, dump out the
5004                            partial character. */
5005                         DECODE_OUTPUT_PARTIAL_CHAR(ch);
5006
5007                         /* If we just saw a single-shift character, dump it out.
5008                            This may dump out the wrong sort of single-shift character,
5009                            but at least it will give an indication that something went
5010                            wrong. */
5011                         if (flags & CODING_STATE_SS2) {
5012                                 DECODE_ADD_BINARY_CHAR(ISO_CODE_SS2, dst);
5013                                 flags &= ~CODING_STATE_SS2;
5014                         }
5015                         if (flags & CODING_STATE_SS3) {
5016                                 DECODE_ADD_BINARY_CHAR(ISO_CODE_SS3, dst);
5017                                 flags &= ~CODING_STATE_SS3;
5018                         }
5019
5020           /***** Now handle the control characters. *****/
5021
5022                         /* Handle CR/LF */
5023                         DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5024
5025                         flags &= CODING_STATE_ISO2022_LOCK;
5026
5027                         if (!parse_iso2022_esc
5028                             (coding_system, &str->iso2022, c, &flags, 1))
5029                                 DECODE_ADD_BINARY_CHAR(c, dst);
5030                 } else {        /* Graphic characters */
5031                         Lisp_Object charset;
5032                         int lb;
5033                         int reg;
5034
5035                         DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5036
5037                         /* Now determine the charset. */
5038                         reg = ((flags & CODING_STATE_SS2) ? 2
5039                                : (flags & CODING_STATE_SS3) ? 3
5040                                : !BYTE_ASCII_P(c) ? str->iso2022.register_right
5041                                : str->iso2022.register_left);
5042                         charset = str->iso2022.charset[reg];
5043
5044                         /* Error checking: */
5045                         if (!CHARSETP(charset)
5046                             || str->iso2022.invalid_designated[reg]
5047                             ||
5048                             (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5049                              && XCHARSET_CHARS(charset) == 94))
5050                                 /* Mrmph.  We are trying to invoke a register that has no
5051                                    or an invalid charset in it, or trying to add a character
5052                                    outside the range of the charset.  Insert that char literally
5053                                    to preserve it for the output. */
5054                         {
5055                                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5056                                 DECODE_ADD_BINARY_CHAR(c, dst);
5057                         }
5058
5059                         else {
5060                                 /* Things are probably hunky-dorey. */
5061
5062                                 /* Fetch reverse charset, maybe. */
5063                                 if (((flags & CODING_STATE_R2L) &&
5064                                      XCHARSET_DIRECTION(charset) ==
5065                                      CHARSET_LEFT_TO_RIGHT)
5066                                     || (!(flags & CODING_STATE_R2L)
5067                                         && XCHARSET_DIRECTION(charset) ==
5068                                         CHARSET_RIGHT_TO_LEFT)) {
5069                                         Lisp_Object new_charset =
5070                                             XCHARSET_REVERSE_DIRECTION_CHARSET
5071                                             (charset);
5072                                         if (!NILP(new_charset))
5073                                                 charset = new_charset;
5074                                 }
5075
5076                                 lb = XCHARSET_LEADING_BYTE(charset);
5077                                 switch (XCHARSET_REP_BYTES(charset)) {
5078                                 case 1: /* ASCII */
5079                                         DECODE_OUTPUT_PARTIAL_CHAR(ch);
5080                                         Dynarr_add(dst, c & 0x7F);
5081                                         break;
5082
5083                                 case 2: /* one-byte official */
5084                                         DECODE_OUTPUT_PARTIAL_CHAR(ch);
5085                                         Dynarr_add(dst, lb);
5086                                         Dynarr_add(dst, c | 0x80);
5087                                         break;
5088
5089                                 case 3: /* one-byte private or two-byte official */
5090                                         if (XCHARSET_PRIVATE_P(charset)) {
5091                                                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5092                                                 Dynarr_add(dst,
5093                                                            PRE_LEADING_BYTE_PRIVATE_1);
5094                                                 Dynarr_add(dst, lb);
5095                                                 Dynarr_add(dst, c | 0x80);
5096                                         } else {
5097                                                 if (ch) {
5098                                                         Dynarr_add(dst, lb);
5099                                                         Dynarr_add(dst,
5100                                                                    ch | 0x80);
5101                                                         Dynarr_add(dst,
5102                                                                    c | 0x80);
5103                                                         ch = 0;
5104                                                 } else
5105                                                         ch = c;
5106                                         }
5107                                         break;
5108
5109                                 default:        /* two-byte private */
5110                                         if (ch) {
5111                                                 Dynarr_add(dst,
5112                                                            PRE_LEADING_BYTE_PRIVATE_2);
5113                                                 Dynarr_add(dst, lb);
5114                                                 Dynarr_add(dst, ch | 0x80);
5115                                                 Dynarr_add(dst, c | 0x80);
5116                                                 ch = 0;
5117                                         } else
5118                                                 ch = c;
5119                                 }
5120                         }
5121
5122                         if (!ch)
5123                                 flags &= CODING_STATE_ISO2022_LOCK;
5124                 }
5125
5126               label_continue_loop:;
5127         }
5128
5129         if (flags & CODING_STATE_END)
5130                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5131
5132         str->flags = flags;
5133         str->ch = ch;
5134 }
5135
5136 /***** ISO2022 encoder *****/
5137
5138 /* Designate CHARSET into register REG. */
5139
5140 static void
5141 iso2022_designate(Lisp_Object charset, unsigned char reg,
5142                   encoding_stream_t str, unsigned_char_dynarr * dst)
5143 {
5144         static const char inter94[] = "()*+";
5145         static const char inter96[] = ",-./";
5146         unsigned int type;
5147         unsigned char final;
5148         Lisp_Object old_charset = str->iso2022.charset[reg];
5149
5150         str->iso2022.charset[reg] = charset;
5151         if (!CHARSETP(charset))
5152                 /* charset might be an initial nil or t. */
5153                 return;
5154         type = XCHARSET_TYPE(charset);
5155         final = XCHARSET_FINAL(charset);
5156         if (!str->iso2022.force_charset_on_output[reg] &&
5157             CHARSETP(old_charset) &&
5158             XCHARSET_TYPE(old_charset) == type &&
5159             XCHARSET_FINAL(old_charset) == final)
5160                 return;
5161
5162         str->iso2022.force_charset_on_output[reg] = 0;
5163
5164         {
5165                 charset_conversion_spec_dynarr *dyn =
5166                     str->codesys->iso2022.output_conv;
5167
5168                 if (dyn) {
5169                         int i;
5170
5171                         for (i = 0; i < Dynarr_length(dyn); i++) {
5172                                 struct charset_conversion_spec *spec =
5173                                     Dynarr_atp(dyn, i);
5174                                 if (EQ(charset, spec->from_charset))
5175                                         charset = spec->to_charset;
5176                         }
5177                 }
5178         }
5179
5180         Dynarr_add(dst, ISO_CODE_ESC);
5181         switch (type) {
5182         case CHARSET_TYPE_94:
5183                 Dynarr_add(dst, inter94[reg]);
5184                 break;
5185         case CHARSET_TYPE_96:
5186                 Dynarr_add(dst, inter96[reg]);
5187                 break;
5188         case CHARSET_TYPE_94X94:
5189                 Dynarr_add(dst, '$');
5190                 if (reg != 0 || !(CODING_SYSTEM_ISO2022_SHORT(str->codesys))
5191                     || final < '@' || final > 'B')
5192                         Dynarr_add(dst, inter94[reg]);
5193                 break;
5194         case CHARSET_TYPE_96X96:
5195                 Dynarr_add(dst, '$');
5196                 Dynarr_add(dst, inter96[reg]);
5197                 break;
5198         default:
5199                 break;
5200         }
5201         Dynarr_add(dst, final);
5202 }
5203
5204 static void
5205 ensure_normal_shift(encoding_stream_t str, unsigned_char_dynarr * dst)
5206 {
5207         if (str->iso2022.register_left != 0) {
5208                 Dynarr_add(dst, ISO_CODE_SI);
5209                 str->iso2022.register_left = 0;
5210         }
5211 }
5212
5213 static void
5214 ensure_shift_out(encoding_stream_t str, unsigned_char_dynarr * dst)
5215 {
5216         if (str->iso2022.register_left != 1) {
5217                 Dynarr_add(dst, ISO_CODE_SO);
5218                 str->iso2022.register_left = 1;
5219         }
5220 }
5221
5222 /* Convert internally-formatted data to ISO2022 format.

        ENCODING is the encoding stream whose state is used and updated,
        SRC points to N bytes of internal-format text, and the encoded
        bytes are appended to DST.

        flags, ch, the current charset, the current half and the
        character-boundary marker are loaded from the stream data on entry
        and stored back on exit.  ch holds a byte remembered from an
        earlier iteration (a prefix leading byte or the first byte of a
        two-byte character).

        Each source byte is handled by one of three branches: ASCII bytes,
        leading bytes (which select, designate and invoke a charset), and
        the remaining bytes (which carry the character's code points). */
5223
5224 static void
5225 encode_coding_iso2022(lstream_t encoding, const Bufbyte * src,
5226                       unsigned_char_dynarr * dst, Lstream_data_count n)
5227 {
5228         unsigned char charmask, c;
5229         unsigned char char_boundary;
5230         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
5231         unsigned int flags = str->flags;
5232         unsigned int ch = str->ch;
5233         Lisp_Coding_System *codesys = str->codesys;
5234         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
5235         int i;
5236         Lisp_Object charset;
             /* 0 means output bytes are masked with 0x7F, anything else
                means they are masked with 0xFF (see charmask below). */
5237         int half;
5238
5239 #ifdef ENABLE_COMPOSITE_CHARS
5240         /* flags for handling composite chars.  We do a little switcharoo
5241            on the source while we're outputting the composite char. */
             /* NOTE(review): saved_n is unsigned int while n is an
                Lstream_data_count; confirm the narrower type cannot
                truncate the saved count. */
5242         unsigned int saved_n = 0;
5243         const unsigned char *saved_src = NULL;
5244         int in_composite = 0;
5245 #endif                          /* ENABLE_COMPOSITE_CHARS */
5246
5247         char_boundary = str->iso2022.current_char_boundary;
5248         charset = str->iso2022.current_charset;
5249         half = str->iso2022.current_half;
5250
5251 #ifdef ENABLE_COMPOSITE_CHARS
5252       back_to_square_n:
5253 #endif
5254         while (n--) {
5255                 c = *src++;
5256
5257                 if (BYTE_ASCII_P(c)) {  /* Processing ASCII character */
5258                         ch = 0;
5259
5260                         restore_left_to_right_direction(codesys, dst, &flags,
5261                                                         0);
5262
5263                         /* Make sure G0 contains ASCII */
5264                         if ((c > ' ' && c < ISO_CODE_DEL) ||
5265                             !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL(codesys)) {
5266                                 ensure_normal_shift(str, dst);
5267                                 iso2022_designate(Vcharset_ascii, 0, str, dst);
5268                         }
5269
5270                         /* If necessary, restore everything to the default state
5271                            at end-of-line */
5272                         if (c == '\n' &&
5273                             !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL(codesys))) {
5274                                 restore_left_to_right_direction(codesys, dst,
5275                                                                 &flags, 0);
5276
5277                                 ensure_normal_shift(str, dst);
5278
5279                                 for (i = 0; i < 4; i++) {
5280                                         Lisp_Object initial_charset =
5281                                             CODING_SYSTEM_ISO2022_INITIAL_CHARSET
5282                                             (codesys, i);
5283                                         iso2022_designate(initial_charset, i,
5284                                                           str, dst);
5285                                 }
5286                         }
                             /* Write the line terminator: CR is emitted for
                                every eol_type except EOL_LF and
                                EOL_AUTODETECT, LF for every eol_type except
                                EOL_CR. */
5287                         if (c == '\n') {
5288                                 if (eol_type != EOL_LF
5289                                     && eol_type != EOL_AUTODETECT)
5290                                         Dynarr_add(dst, '\r');
5291                                 if (eol_type != EOL_CR)
5292                                         Dynarr_add(dst, c);
5293                         } else {
                                     /* With escape-quoting enabled, bytes accepted
                                        by fit_to_be_escape_quoted() are preceded
                                        by ESC. */
5294                                 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
5295                                     && fit_to_be_escape_quoted(c))
5296                                         Dynarr_add(dst, ISO_CODE_ESC);
5297                                 Dynarr_add(dst, c);
5298                         }
5299                         char_boundary = 1;
5300                 }
5301
5302                 else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) {     /* Processing Leading Byte */
5303                         ch = 0;
5304                         charset = CHARSET_BY_LEADING_BYTE(c);
                             /* A prefix leading byte is only remembered in ch;
                                designation and invocation below happen for
                                non-prefix leading bytes only, and not for the
                                control-1 (or composite) charset. */
5305                         if (LEADING_BYTE_PREFIX_P(c))
5306                                 ch = c;
5307                         else if (!EQ(charset, Vcharset_control_1)
5308 #ifdef ENABLE_COMPOSITE_CHARS
5309                                  && !EQ(charset, Vcharset_composite)
5310 #endif
5311                             ) {
5312                                 int reg;
5313
5314                                 ensure_correct_direction(XCHARSET_DIRECTION
5315                                                          (charset), codesys,
5316                                                          dst, &flags, 0);
5317
5318                                 /* Now determine which register to use. */
5319                                 reg = -1;
                                     /* First choice: the lowest register that
                                        currently holds this charset or has it as
                                        its initial charset. */
5320                                 for (i = 0; i < 4; i++) {
5321                                         if (EQ(charset, str->iso2022.charset[i])
5322                                             || EQ(charset,
5323                                                   CODING_SYSTEM_ISO2022_INITIAL_CHARSET
5324                                                   (codesys, i))) {
5325                                                 reg = i;
5326                                                 break;
5327                                         }
5328                                 }
5329
                                     /* No register matched.  A charset whose
                                        XCHARSET_GRAPHIC is non-zero goes to
                                        register 1 when that register is non-nil
                                        and the coding system is either not
                                        seven-bit or uses locking shift; failing
                                        that to register 2 or 3 if non-nil, and
                                        finally to register 0.  Every other
                                        charset goes to register 0. */
5330                                 if (reg == -1) {
5331                                         if (XCHARSET_GRAPHIC(charset) != 0) {
5332                                                 if (!NILP
5333                                                     (str->iso2022.charset[1])
5334                                                     &&
5335                                                     (!CODING_SYSTEM_ISO2022_SEVEN
5336                                                      (codesys)
5337                                                      ||
5338                                                      CODING_SYSTEM_ISO2022_LOCK_SHIFT
5339                                                      (codesys)))
5340                                                         reg = 1;
5341                                                 else if (!NILP
5342                                                          (str->iso2022.
5343                                                           charset[2]))
5344                                                         reg = 2;
5345                                                 else if (!NILP
5346                                                          (str->iso2022.
5347                                                           charset[3]))
5348                                                         reg = 3;
5349                                                 else
5350                                                         reg = 0;
5351                                         } else
5352                                                 reg = 0;
5353                                 }
5354
5355                                 iso2022_designate(charset, reg, str, dst);
5356
5357                                 /* Now invoke that register. */
5358                                 switch (reg) {
5359                                 case 0:
5360                                         ensure_normal_shift(str, dst);
5361                                         half = 0;
5362                                         break;
5363
                                     /* Register 1: shifted out with SO in seven-bit
                                        mode; otherwise nothing is emitted and the
                                        bytes keep their high bit (half = 1). */
5364                                 case 1:
5365                                         if (CODING_SYSTEM_ISO2022_SEVEN
5366                                             (codesys)) {
5367                                                 ensure_shift_out(str, dst);
5368                                                 half = 0;
5369                                         } else
5370                                                 half = 1;
5371                                         break;
5372
                                     /* Register 2: ESC N in seven-bit mode, the
                                        single ISO_CODE_SS2 byte otherwise. */
5373                                 case 2:
5374                                         if (CODING_SYSTEM_ISO2022_SEVEN
5375                                             (str->codesys)) {
5376                                                 Dynarr_add(dst, ISO_CODE_ESC);
5377                                                 Dynarr_add(dst, 'N');
5378                                                 half = 0;
5379                                         } else {
5380                                                 Dynarr_add(dst, ISO_CODE_SS2);
5381                                                 half = 1;
5382                                         }
5383                                         break;
5384
                                     /* Register 3: ESC O in seven-bit mode, the
                                        single ISO_CODE_SS3 byte otherwise. */
5385                                 case 3:
5386                                         if (CODING_SYSTEM_ISO2022_SEVEN
5387                                             (str->codesys)) {
5388                                                 Dynarr_add(dst, ISO_CODE_ESC);
5389                                                 Dynarr_add(dst, 'O');
5390                                                 half = 0;
5391                                         } else {
5392                                                 Dynarr_add(dst, ISO_CODE_SS3);
5393                                                 half = 1;
5394                                         }
5395                                         break;
5396
5397                                 default:
5398                                         abort();
5399                                 }
5400                         }
5401                         char_boundary = 0;
5402                 } else {        /* Processing Non-ASCII character */
                             /* half == 0 clears the high bit of every byte
                                written below; otherwise bytes are written
                                unchanged. */
5403                         charmask = (half == 0 ? 0x7F : 0xFF);
5404                         char_boundary = 1;
5405                         if (EQ(charset, Vcharset_control_1)) {
5406                                 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
5407                                     && fit_to_be_escape_quoted(c))
5408                                         Dynarr_add(dst, ISO_CODE_ESC);
5409                                 /* you asked for it ... */
5410                                 Dynarr_add(dst, c - 0x20);
5411                         } else {
                                     /* The number of bytes written per character
                                        depends on the charset's representation
                                        length: one for case 2 and for private
                                        charsets in case 3, two otherwise.  For the
                                        two-byte forms the first byte is parked in
                                        ch (and char_boundary cleared) until the
                                        second one arrives. */
5412                                 switch (XCHARSET_REP_BYTES(charset)) {
5413                                 case 2:
5414                                         Dynarr_add(dst, c & charmask);
5415                                         break;
5416                                 case 3:
5417                                         if (XCHARSET_PRIVATE_P(charset)) {
5418                                                 Dynarr_add(dst, c & charmask);
5419                                                 ch = 0;
5420                                         } else if (ch) {
5421 #ifdef ENABLE_COMPOSITE_CHARS
5422                                                 if (EQ
5423                                                     (charset,
5424                                                      Vcharset_composite)) {
5425                                                         if (in_composite) {
5426                                                                 /* #### Bother! We don't know how to
5427                                                                    handle this yet. */
5428                                                                 Dynarr_add(dst,
5429                                                                            '~');
5430                                                         } else {
                                                                     /* Expand the composite
                                                                        character: remember where
                                                                        we were in the caller's
                                                                        buffer, redirect src/n to
                                                                        the composite's string and
                                                                        emit ESC 0. */
5431                                                                 Emchar emch =
5432                                                                     MAKE_CHAR
5433                                                                     (Vcharset_composite,
5434                                                                      ch & 0x7F,
5435                                                                      c & 0x7F);
5436                                                                 Lisp_Object lstr
5437                                                                     =
5438                                                                     composite_char_string
5439                                                                     (emch);
5440                                                                 saved_n = n;
5441                                                                 saved_src = src;
5442                                                                 in_composite =
5443                                                                     1;
5444                                                                 src =
5445                                                                     XSTRING_DATA
5446                                                                     (lstr);
5447                                                                 n = XSTRING_LENGTH(lstr);
5448                                                                 Dynarr_add(dst,
5449                                                                            ISO_CODE_ESC);
5450                                                                 Dynarr_add(dst, '0');   /* start composing */
5451                                                         }
5452                                                 } else
5453 #endif                          /* ENABLE_COMPOSITE_CHARS */
5454                                                 {
5455                                                         Dynarr_add(dst,
5456                                                                    ch &
5457                                                                    charmask);
5458                                                         Dynarr_add(dst,
5459                                                                    c &
5460                                                                    charmask);
5461                                                 }
5462                                                 ch = 0;
5463                                         } else {
5464                                                 ch = c;
5465                                                 char_boundary = 0;
5466                                         }
5467                                         break;
5468                                 case 4:
5469                                         if (ch) {
5470                                                 Dynarr_add(dst, ch & charmask);
5471                                                 Dynarr_add(dst, c & charmask);
5472                                                 ch = 0;
5473                                         } else {
5474                                                 ch = c;
5475                                                 char_boundary = 0;
5476                                         }
5477                                         break;
5478                                 default:
5479                                         abort();
5480                                 }
5481                         }
5482                 }
5483         }
5484
5485 #ifdef ENABLE_COMPOSITE_CHARS
             /* The loop that just ended was running over a composite
                character's expansion: emit ESC 1, restore the caller's
                src/n and go back to the main loop. */
5486         if (in_composite) {
5487                 n = saved_n;
5488                 src = saved_src;
5489                 in_composite = 0;
5490                 Dynarr_add(dst, ISO_CODE_ESC);
5491                 Dynarr_add(dst, '1');   /* end composing */
5492                 goto back_to_square_n;  /* Wheeeeeeeee ..... */
5493         }
5494 #endif                          /* ENABLE_COMPOSITE_CHARS */
5495
             /* When CODING_STATE_END is set and we stopped on a character
                boundary, return to the initial state: left-to-right
                direction, normal shift, and the coding system's initial
                charset in each of the four registers. */
5496         if (char_boundary && flags & CODING_STATE_END) {
5497                 restore_left_to_right_direction(codesys, dst, &flags, 0);
5498                 ensure_normal_shift(str, dst);
5499                 for (i = 0; i < 4; i++) {
5500                         Lisp_Object initial_charset =
5501                             CODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, i);
5502                         iso2022_designate(initial_charset, i, str, dst);
5503                 }
5504         }
5505
             /* Save the running state for the next call. */
5506         str->flags = flags;
5507         str->ch = ch;
5508         str->iso2022.current_char_boundary = char_boundary;
5509         str->iso2022.current_charset = charset;
5510         str->iso2022.current_half = half;
5511
5512         /* Verbum caro factum est! */
5513 }
5514 #endif                          /* MULE */
5515 \f
5516 /************************************************************************/
5517 /*                     No-conversion methods                            */
5518 /************************************************************************/
5519
5520 /* This is used when reading in "binary" files -- i.e. files that may
5521    contain all 256 possible byte values and that are not to be
5522    interpreted as being in any particular encoding.

        Each of the N bytes at SRC is passed through DECODE_HANDLE_EOL_TYPE
        and then appended to DST with DECODE_ADD_BINARY_CHAR.  flags and ch
        are loaded from the decoding stream on entry and stored back on
        exit, after DECODE_HANDLE_END_OF_CONVERSION has run. */
5523 static void
5524 decode_coding_no_conversion(lstream_t decoding, const Extbyte * src,
5525                             unsigned_char_dynarr * dst, Lstream_data_count n)
5526 {
5527         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
5528         unsigned int flags = str->flags;
5529         unsigned int ch = str->ch;
5530         eol_type_t eol_type = str->eol_type;
5531
5532         while (n--) {
                     /* The source byte is read through an unsigned char
                        pointer. */
5533                 const unsigned char c = *(const unsigned char *)src++;
5534
5535                 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5536                 DECODE_ADD_BINARY_CHAR(c, dst);
                     /* NOTE(review): nothing in this function jumps to this
                        label explicitly; presumably DECODE_HANDLE_EOL_TYPE
                        does so to skip the rest of an iteration -- confirm
                        against the macro definition. */
5537         label_continue_loop:;
5538         }
5539
5540         DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
5541
5542         str->flags = flags;
5543         str->ch = ch;
5544 }
5545
5546 static void
5547 encode_coding_no_conversion(lstream_t encoding, const Bufbyte * src,
5548                             unsigned_char_dynarr * dst, Lstream_data_count n)
5549 {
5550         unsigned char c;
5551         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
5552         unsigned int flags = str->flags;
5553         unsigned int ch = str->ch;
5554         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
5555
5556         while (n--) {
5557                 c = *src++;
5558                 if (c == '\n') {
5559                         if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5560                                 Dynarr_add(dst, '\r');
5561                         if (eol_type != EOL_CR)
5562                                 Dynarr_add(dst, '\n');
5563                         ch = 0;
5564                 } else if (BYTE_ASCII_P(c)) {
5565                         assert(ch == 0);
5566                         Dynarr_add(dst, c);
5567                 } else if (BUFBYTE_LEADING_BYTE_P(c)) {
5568                         assert(ch == 0);
5569                         if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5570                             c == LEADING_BYTE_CONTROL_1)
5571                                 ch = c;
5572                         else
5573                                 Dynarr_add(dst, '~');   /* untranslatable character */
5574                 } else {
5575                         if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5576                                 Dynarr_add(dst, c);
5577                         else if (ch == LEADING_BYTE_CONTROL_1) {
5578                                 assert(c < 0xC0);
5579                                 Dynarr_add(dst, c - 0x20);
5580                         }
5581                         /* else it should be the second or third byte of an
5582                            untranslatable character, so ignore it */
5583                         ch = 0;
5584                 }
5585         }
5586
5587         str->flags = flags;
5588         str->ch = ch;
5589 }
5590 \f
5591 /************************************************************************/
5592 /*                             Initialization                           */
5593 /************************************************************************/
5594
/* Registers the symbols and primitives belonging to this file.

   In order: the coding_system lrecord implementation, the
   Qcoding_system_error error symbol, the Lisp primitives (DEFSUBR), the
   property/keyword symbols (defsymbol), and the entries of
   coding_category_symbol[].  Everything inside `#ifdef MULE' is only
   registered in MULE builds; the no-conversion category symbol is
   registered unconditionally. */
5595 void syms_of_file_coding(void)
5596 {
5597         INIT_LRECORD_IMPLEMENTATION(coding_system);
5598
             /* NOTE(review): Qio_error is presumably the parent error of
                coding-system-error -- confirm against DEFERROR_STANDARD. */
5599         DEFERROR_STANDARD(Qcoding_system_error, Qio_error);
5600
             /* Primitives for creating, looking up and aliasing coding systems. */
5601         DEFSUBR(Fcoding_system_p);
5602         DEFSUBR(Ffind_coding_system);
5603         DEFSUBR(Fget_coding_system);
5604         DEFSUBR(Fcoding_system_list);
5605         DEFSUBR(Fcoding_system_name);
5606         DEFSUBR(Fmake_coding_system);
5607         DEFSUBR(Fcopy_coding_system);
5608         DEFSUBR(Fcoding_system_canonical_name_p);
5609         DEFSUBR(Fcoding_system_alias_p);
5610         DEFSUBR(Fcoding_system_aliasee);
5611         DEFSUBR(Fdefine_coding_system_alias);
5612         DEFSUBR(Fsubsidiary_coding_system);
5613
             /* Primitives for querying a coding system's attributes. */
5614         DEFSUBR(Fcoding_system_type);
5615         DEFSUBR(Fcoding_system_doc_string);
5616 #ifdef MULE
5617         DEFSUBR(Fcoding_system_charset);
5618 #endif
5619         DEFSUBR(Fcoding_system_property);
5620
             /* Primitives dealing with coding categories and their priority. */
5621         DEFSUBR(Fcoding_category_list);
5622         DEFSUBR(Fset_coding_priority_list);
5623         DEFSUBR(Fcoding_priority_list);
5624         DEFSUBR(Fset_coding_category_system);
5625         DEFSUBR(Fcoding_category_system);
5626
             /* Region-level detect/decode/encode primitives, plus the
                MULE-only per-character conversion primitives. */
5627         DEFSUBR(Fdetect_coding_region);
5628         DEFSUBR(Fdecode_coding_region);
5629         DEFSUBR(Fencode_coding_region);
5630 #ifdef MULE
5631         DEFSUBR(Fdecode_shift_jis_char);
5632         DEFSUBR(Fencode_shift_jis_char);
5633         DEFSUBR(Fdecode_big5_char);
5634         DEFSUBR(Fencode_big5_char);
5635         DEFSUBR(Fset_ucs_char);
5636         DEFSUBR(Fucs_char);
5637         DEFSUBR(Fset_char_ucs);
5638         DEFSUBR(Fchar_ucs);
5639 #endif                          /* MULE */
             /* Symbols naming the predicate and the coding system types. */
5640         defsymbol(&Qcoding_systemp, "coding-system-p");
5641         defsymbol(&Qno_conversion, "no-conversion");
5642         defsymbol(&Qraw_text, "raw-text");
5643 #ifdef MULE
5644         defsymbol(&Qbig5, "big5");
5645         defsymbol(&Qshift_jis, "shift-jis");
5646         defsymbol(&Qucs4, "ucs-4");
5647         defsymbol(&Qutf8, "utf-8");
5648         defsymbol(&Qccl, "ccl");
5649         defsymbol(&Qiso2022, "iso2022");
5650 #endif                          /* MULE */
             /* Property-name symbols. */
5651         defsymbol(&Qmnemonic, "mnemonic");
5652         defsymbol(&Qeol_type, "eol-type");
5653         defsymbol(&Qpost_read_conversion, "post-read-conversion");
5654         defsymbol(&Qpre_write_conversion, "pre-write-conversion");
5655
             /* End-of-line related symbols. */
5656         defsymbol(&Qcr, "cr");
5657         defsymbol(&Qlf, "lf");
5658         defsymbol(&Qcrlf, "crlf");
5659         defsymbol(&Qeol_cr, "eol-cr");
5660         defsymbol(&Qeol_lf, "eol-lf");
5661         defsymbol(&Qeol_crlf, "eol-crlf");
5662 #ifdef MULE
5663         defsymbol(&Qcharset_g0, "charset-g0");
5664         defsymbol(&Qcharset_g1, "charset-g1");
5665         defsymbol(&Qcharset_g2, "charset-g2");
5666         defsymbol(&Qcharset_g3, "charset-g3");
5667         defsymbol(&Qforce_g0_on_output, "force-g0-on-output");
5668         defsymbol(&Qforce_g1_on_output, "force-g1-on-output");
5669         defsymbol(&Qforce_g2_on_output, "force-g2-on-output");
5670         defsymbol(&Qforce_g3_on_output, "force-g3-on-output");
5671         defsymbol(&Qno_iso6429, "no-iso6429");
5672         defsymbol(&Qinput_charset_conversion, "input-charset-conversion");
5673         defsymbol(&Qoutput_charset_conversion, "output-charset-conversion");
5674
5675         defsymbol(&Qno_ascii_eol, "no-ascii-eol");
5676         defsymbol(&Qno_ascii_cntl, "no-ascii-cntl");
5677         defsymbol(&Qseven, "seven");
5678         defsymbol(&Qlock_shift, "lock-shift");
5679         defsymbol(&Qescape_quoted, "escape-quoted");
5680 #endif                          /* MULE */
5681         defsymbol(&Qencode, "encode");
5682         defsymbol(&Qdecode, "decode");
5683
             /* Fill the table of symbols identifying each coding category. */
5684 #ifdef MULE
5685         defsymbol(&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5686                   "shift-jis");
5687         defsymbol(&coding_category_symbol[CODING_CATEGORY_BIG5], "big5");
5688         defsymbol(&coding_category_symbol[CODING_CATEGORY_UCS4], "ucs-4");
5689         defsymbol(&coding_category_symbol[CODING_CATEGORY_UTF8], "utf-8");
5690         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_7], "iso-7");
5691         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5692                   "iso-8-designate");
5693         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_1], "iso-8-1");
5694         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_2], "iso-8-2");
5695         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5696                   "iso-lock-shift");
5697 #endif                          /* MULE */
5698         defsymbol(&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5699                   "no-conversion");
5700 }
5701
/* Declares the lstream methods provided by the two stream types of this
   file, `decoding' and `encoding'.  Both declare the same seven methods:
   reader, writer, rewinder, seekable_p, flusher, closer and marker.
   NOTE(review): how LSTREAM_HAS_METHOD binds each method to its
   implementation is defined elsewhere -- confirm in the lstream header. */
5702 void lstream_type_create_file_coding(void)
5703 {
             /* Methods of the decoding stream type. */
5704         LSTREAM_HAS_METHOD(decoding, reader);
5705         LSTREAM_HAS_METHOD(decoding, writer);
5706         LSTREAM_HAS_METHOD(decoding, rewinder);
5707         LSTREAM_HAS_METHOD(decoding, seekable_p);
5708         LSTREAM_HAS_METHOD(decoding, flusher);
5709         LSTREAM_HAS_METHOD(decoding, closer);
5710         LSTREAM_HAS_METHOD(decoding, marker);
5711
             /* Methods of the encoding stream type. */
5712         LSTREAM_HAS_METHOD(encoding, reader);
5713         LSTREAM_HAS_METHOD(encoding, writer);
5714         LSTREAM_HAS_METHOD(encoding, rewinder);
5715         LSTREAM_HAS_METHOD(encoding, seekable_p);
5716         LSTREAM_HAS_METHOD(encoding, flusher);
5717         LSTREAM_HAS_METHOD(encoding, closer);
5718         LSTREAM_HAS_METHOD(encoding, marker);
5719 }
5720
/* Sets up the simple variables of this file.

   Allocates the file_coding_dump structure `fcd' and registers it with
   dump_add_root_struct_ptr, resets the per-category tables, provides the
   `file-coding' feature, and defines the Lisp variables below.  Each
   DEFVAR_LISP variable starts out as Qnil; enable-multibyte-characters
   starts out as 1. */
5721 void vars_of_file_coding(void)
5722 {
5723         int i;
5724
5725         fcd = xnew(struct file_coding_dump);
5726         dump_add_root_struct_ptr(&fcd, &fcd_description);
5727
5728         /* Initialize to something reasonable ... */
             /* No coding system is assigned to any category yet, and the
                priority table starts as the identity permutation. */
5729         for (i = 0; i < CODING_CATEGORY_LAST; i++) {
5730                 fcd->coding_category_system[i] = Qnil;
5731                 fcd->coding_category_by_priority[i] = i;
5732         }
5733
5734         Fprovide(intern("file-coding"));
5735
5736         DEFVAR_LISP("keyboard-coding-system", &Vkeyboard_coding_system  /*
5737 Coding system used for TTY keyboard input.
5738 Not used under a windowing system.
5739                                                                          */ );
5740         Vkeyboard_coding_system = Qnil;
5741
5742         DEFVAR_LISP("terminal-coding-system", &Vterminal_coding_system  /*
5743 Coding system used for TTY display output.
5744 Not used under a windowing system.
5745                                                                          */ );
5746         Vterminal_coding_system = Qnil;
5747
5748         DEFVAR_LISP("coding-system-for-read", &Vcoding_system_for_read  /*
5749 Overriding coding system used when reading from a file or process.
5750 You should bind this variable with `let', but do not set it globally.
5751 If this is non-nil, it specifies the coding system that will be used
5752 to decode input on read operations, such as from a file or process.
5753 It overrides `buffer-file-coding-system-for-read',
5754 `insert-file-contents-pre-hook', etc.  Use those variables instead of
5755 this one for permanent changes to the environment.  */ );
5756         Vcoding_system_for_read = Qnil;
5757
5758         DEFVAR_LISP("coding-system-for-write", &Vcoding_system_for_write        /*
5759 Overriding coding system used when writing to a file or process.
5760 You should bind this variable with `let', but do not set it globally.
5761 If this is non-nil, it specifies the coding system that will be used
5762 to encode output for write operations, such as to a file or process.
5763 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
5764 Use those variables instead of this one for permanent changes to the
5765 environment.  */ );
5766         Vcoding_system_for_write = Qnil;
5767
5768         DEFVAR_LISP("file-name-coding-system", &Vfile_name_coding_system        /*
5769 Coding system used to convert pathnames when accessing files.
5770                                                                                  */ );
5771         Vfile_name_coding_system = Qnil;
5772
5773         DEFVAR_BOOL("enable-multibyte-characters", &enable_multibyte_characters /*
5774 Non-nil means the buffer contents are regarded as multi-byte form
5775 of characters, not a binary code.  This affects the display, file I/O,
5776 and behaviors of various editing commands.
5777
5778 Setting this to nil does not do anything.
5779                                                                                  */ );
5780         enable_multibyte_characters = 1;
5781 }
5782
/* Sets up the state of this file that needs Lisp objects to exist.

   Steps, in order:
     1. create and staticpro the coding-system hash table;
     2. create the coding-system property dynarr, register it with
        dump_add_root_struct_ptr and fill it via DEFINE_CODESYS_PROP;
     3. create the `raw-text' and `binary' coding systems, both of type
        no-conversion, and define aliases for them;
     4. store the raw-text coding system as the one for the
        no-conversion category;
     5. (MULE only) reset fcd->ucs_to_mule_table to Qnil and create
        mule_to_ucs_table. */
5783 void complex_vars_of_file_coding(void)
5784 {
5785         staticpro(&Vcoding_system_hash_table);
5786         Vcoding_system_hash_table =
5787             make_lisp_hash_table(50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5788
5789         the_codesys_prop_dynarr = Dynarr_new(codesys_prop);
5790         dump_add_root_struct_ptr(&the_codesys_prop_dynarr,
5791                                  &codesys_prop_dynarr_description);
5792
/* Appends one { sym, prop_type } entry to the_codesys_prop_dynarr. */
5793 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
5794 {                                               \
5795   struct codesys_prop csp;                      \
5796   csp.sym = (Sym);                              \
5797   csp.prop_type = (Prop_Type);                  \
5798   Dynarr_add (the_codesys_prop_dynarr, csp);    \
5799 } while (0)
5800
             /* Properties registered as CODESYS_PROP_ALL_OK. */
5801         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qmnemonic);
5802         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_type);
5803         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_cr);
5804         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_crlf);
5805         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_lf);
5806         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5807         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5808 #ifdef MULE
             /* Properties registered as CODESYS_PROP_ISO2022. */
5809         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g0);
5810         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g1);
5811         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g2);
5812         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g3);
5813         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5814         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5815         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5816         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5817         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qshort);
5818         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_ascii_eol);
5819         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5820         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qseven);
5821         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qlock_shift);
5822         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_iso6429);
5823         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qescape_quoted);
5824         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5825         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5826
             /* Properties registered as CODESYS_PROP_CCL. */
5827         DEFINE_CODESYS_PROP(CODESYS_PROP_CCL, Qencode);
5828         DEFINE_CODESYS_PROP(CODESYS_PROP_CCL, Qdecode);
5829 #endif                          /* MULE */
5830         /* Need to create this here or we're really screwed. */
5831         Fmake_coding_system
5832             (Qraw_text, Qno_conversion,
5833              build_string
5834              ("Raw text, which means it converts only line-break-codes."),
5835              list2(Qmnemonic, build_string("Raw")));
5836
             /* Unlike raw-text, binary is created with an explicit
                eol-type of lf. */
5837         Fmake_coding_system
5838             (Qbinary, Qno_conversion,
5839              build_string("Binary, which means it does not convert anything."),
5840              list4(Qeol_type, Qlf, Qmnemonic, build_string("Binary")));
5841
             /* NOTE(review): presumably the first argument is the new alias
                and the second the existing coding system -- confirm against
                Fdefine_coding_system_alias. */
5842         Fdefine_coding_system_alias(Qno_conversion, Qraw_text);
5843
5844         Fdefine_coding_system_alias(Qfile_name, Qbinary);
5845
5846         Fdefine_coding_system_alias(Qterminal, Qbinary);
5847         Fdefine_coding_system_alias(Qkeyboard, Qbinary);
5848
5849         /* Need this for bootstrapping */
5850         fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5851             Fget_coding_system(Qraw_text);
5852
5853 #ifdef MULE
5854         {
5855                 size_t i;
5856
                     /* Start with every slot of the UCS-to-MULE table empty. */
5857                 for (i = 0; i < countof(fcd->ucs_to_mule_table); i++)
5858                         fcd->ucs_to_mule_table[i] = Qnil;
5859         }
5860         staticpro(&mule_to_ucs_table);
5861         mule_to_ucs_table = Fmake_char_table(Qgeneric);
5862 #endif                          /* MULE */
5863 }