SXEmacs, meet Dbus. Dbus, meet SXEmacs
[sxemacs] / src / mule / file-coding.c
1 /* Code conversion functions.
2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3    Copyright (C) 1995 Sun Microsystems, Inc.
4
5 This file is part of SXEmacs
6
7 SXEmacs is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
11
12 SXEmacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program.  If not, see <http://www.gnu.org/licenses/>. */
19
20
21 /* Synched up with: Mule 2.3.   Not in FSF. */
22
23 /* Rewritten by Ben Wing <ben@xemacs.org>. */
24
25 #include <config.h>
26 #include "lisp.h"
27
28 #include "buffer.h"
29 #include "elhash.h"
30 #include "ui/insdel.h"
31 #include "lstream.h"
32 #include "opaque.h"
33 #ifdef MULE
34 #include "mule-ccl.h"
35 #include "chartab.h"
36 #endif
37 #include "file-coding.h"
38
39 Lisp_Object Qcoding_system_error;
40
41 Lisp_Object Vkeyboard_coding_system;
42 Lisp_Object Vterminal_coding_system;
43 Lisp_Object Vcoding_system_for_read;
44 Lisp_Object Vcoding_system_for_write;
45 Lisp_Object Vfile_name_coding_system;
46
47 /* Table of symbols identifying each coding category. */
48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
49
50 struct file_coding_dump {
51         /* Coding system currently associated with each coding category. */
52         Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
53
54         /* Table of all coding categories in decreasing order of priority.
55            This describes a permutation of the possible coding categories. */
56         int coding_category_by_priority[CODING_CATEGORY_LAST];
57
58 #ifdef MULE
59         Lisp_Object ucs_to_mule_table[65536];
60 #endif
61 } *fcd;
62
63 static const struct lrecord_description fcd_description_1[] = {
64         {XD_LISP_OBJECT_ARRAY,
65          offsetof(struct file_coding_dump, coding_category_system),
66          CODING_CATEGORY_LAST},
67 #ifdef MULE
68         {XD_LISP_OBJECT_ARRAY,
69          offsetof(struct file_coding_dump, ucs_to_mule_table),
70          countof(fcd->ucs_to_mule_table)},
71 #endif
72         {XD_END}
73 };
74
75 static const struct struct_description fcd_description = {
76         sizeof(struct file_coding_dump),
77         fcd_description_1
78 };
79
80 Lisp_Object mule_to_ucs_table;
81
82 Lisp_Object Qcoding_systemp;
83
84 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
85 /* Qinternal in general.c */
86
87 Lisp_Object Qmnemonic, Qeol_type;
88 Lisp_Object Qcr, Qcrlf, Qlf;
89 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
90 Lisp_Object Qpost_read_conversion;
91 Lisp_Object Qpre_write_conversion;
92
93 #ifdef MULE
94 Lisp_Object Qucs4, Qutf8;
95 Lisp_Object Qbig5, Qshift_jis;
96 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
97 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
98 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
99 Lisp_Object Qno_iso6429;
100 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
101 Lisp_Object Qescape_quoted;
102 Lisp_Object Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
103 #endif
104 Lisp_Object Qencode, Qdecode;
105
106 Lisp_Object Vcoding_system_hash_table;
107
108 int enable_multibyte_characters;
109
110 #ifdef MULE
111 /* Additional information used by the ISO2022 decoder and detector. */
112 struct iso2022_decoder {
113         /* CHARSET holds the character sets currently assigned to the G0
114            through G3 variables.  It is initialized from the array
115            INITIAL_CHARSET in CODESYS. */
116         Lisp_Object charset[4];
117
118         /* Which registers are currently invoked into the left (GL) and
119            right (GR) halves of the 8-bit encoding space? */
120         int register_left, register_right;
121
122         /* ISO_ESC holds a value indicating part of an escape sequence
123            that has already been seen. */
124         enum iso_esc_flag esc;
125
126         /* This records the bytes we've seen so far in an escape sequence,
127            in case the sequence is invalid (we spit out the bytes unchanged). */
128         unsigned char esc_bytes[8];
129
130         /* Index for next byte to store in ISO escape sequence. */
131         int esc_bytes_index;
132
133 #ifdef ENABLE_COMPOSITE_CHARS
134         /* Stuff seen so far when composing a string. */
135         unsigned_char_dynarr *composite_chars;
136 #endif
137
138         /* If we saw an invalid designation sequence for a particular
139            register, we flag it here and switch to ASCII.  The next time we
140            see a valid designation for this register, we turn off the flag
141            and do the designation normally, but pretend the sequence was
142            invalid.  The effect of all this is that (most of the time) the
143            escape sequences for both the switch to the unknown charset, and
144            the switch back to the known charset, get inserted literally into
145            the buffer and saved out as such.  The hope is that we can
146            preserve the escape sequences so that the resulting written out
147            file makes sense.  If we don't do any of this, the designation
148            to the invalid charset will be preserved but that switch back
149            to the known charset will probably get eaten because it was
150            the same charset that was already present in the register. */
151         unsigned char invalid_designated[4];
152
153         /* We try to do similar things as above for direction-switching
154            sequences.  If we encountered a direction switch while an
155            invalid designation was present, or an invalid designation
156            just after a direction switch (i.e. no valid designation
157            encountered yet), we insert the direction-switch escape
158            sequence literally into the output stream, and later on
159            insert the corresponding direction-restoring escape sequence
160            literally also. */
161         unsigned int switched_dir_and_no_valid_charset_yet:1;
162         unsigned int invalid_switch_dir:1;
163
164         /* Tells the decoder to output the escape sequence literally
165            even though it was valid.  Used in the games we play to
166            avoid lossage when we encounter invalid designations. */
167         unsigned int output_literally:1;
168         /* We encountered a direction switch followed by an invalid
169            designation.  We didn't output the direction switch
170            literally because we didn't know about the invalid designation;
171            but we have to do so now. */
172         unsigned int output_direction_sequence:1;
173 };
174 #endif                          /* MULE */
175 EXFUN(Fcopy_coding_system, 2);
176 #ifdef MULE
177 struct detection_state;
178 static int detect_coding_sjis(struct detection_state *st,
179                               const Extbyte * src, Lstream_data_count n);
180 static void decode_coding_sjis(lstream_t decoding, const Extbyte * src,
181                                unsigned_char_dynarr * dst,
182                                Lstream_data_count n);
183 static void encode_coding_sjis(lstream_t encoding, const Bufbyte * src,
184                                unsigned_char_dynarr * dst,
185                                Lstream_data_count n);
186 static int detect_coding_big5(struct detection_state *st, const Extbyte * src,
187                               Lstream_data_count n);
188 static void decode_coding_big5(lstream_t decoding, const Extbyte * src,
189                                unsigned_char_dynarr * dst,
190                                Lstream_data_count n);
191 static void encode_coding_big5(lstream_t encoding, const Bufbyte * src,
192                                unsigned_char_dynarr * dst,
193                                Lstream_data_count n);
194 static int detect_coding_ucs4(struct detection_state *st, const Extbyte * src,
195                               Lstream_data_count n);
196 static void decode_coding_ucs4(lstream_t decoding, const Extbyte * src,
197                                unsigned_char_dynarr * dst,
198                                Lstream_data_count n);
199 static void encode_coding_ucs4(lstream_t encoding, const Bufbyte * src,
200                                unsigned_char_dynarr * dst,
201                                Lstream_data_count n);
202 static int detect_coding_utf8(struct detection_state *st, const Extbyte * src,
203                               Lstream_data_count n);
204 static void decode_coding_utf8(lstream_t decoding, const Extbyte * src,
205                                unsigned_char_dynarr * dst,
206                                Lstream_data_count n);
207 static void encode_coding_utf8(lstream_t encoding, const Bufbyte * src,
208                                unsigned_char_dynarr * dst,
209                                Lstream_data_count n);
210 static int postprocess_iso2022_mask(int mask);
211 static void reset_iso2022(Lisp_Object coding_system,
212                           struct iso2022_decoder *iso);
213 static int detect_coding_iso2022(struct detection_state *st,
214                                  const Extbyte * src, Lstream_data_count n);
215 static void decode_coding_iso2022(lstream_t decoding, const Extbyte * src,
216                                   unsigned_char_dynarr * dst,
217                                   Lstream_data_count n);
218 static void encode_coding_iso2022(lstream_t encoding, const Bufbyte * src,
219                                   unsigned_char_dynarr * dst,
220                                   Lstream_data_count n);
221 #endif                          /* MULE */
222 static void decode_coding_no_conversion(lstream_t decoding, const Extbyte * src,
223                                         unsigned_char_dynarr * dst,
224                                         Lstream_data_count n);
225 static void encode_coding_no_conversion(lstream_t encoding, const Bufbyte * src,
226                                         unsigned_char_dynarr * dst,
227                                         Lstream_data_count n);
228 static void mule_decode(lstream_t decoding, const Extbyte * src,
229                         unsigned_char_dynarr * dst, Lstream_data_count n);
230 static void mule_encode(lstream_t encoding, const Bufbyte * src,
231                         unsigned_char_dynarr * dst, Lstream_data_count n);
232
233 typedef struct codesys_prop codesys_prop;
234 struct codesys_prop {
235         Lisp_Object sym;
236         int prop_type;
237 };
238
239 typedef struct {
240         Dynarr_declare(codesys_prop);
241 } codesys_prop_dynarr;
242
243 static const struct lrecord_description codesys_prop_description_1[] = {
244         {XD_LISP_OBJECT, offsetof(codesys_prop, sym)},
245         {XD_END}
246 };
247
248 static const struct struct_description codesys_prop_description = {
249         sizeof(codesys_prop),
250         codesys_prop_description_1
251 };
252
253 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
254         XD_DYNARR_DESC(codesys_prop_dynarr, &codesys_prop_description),
255         {XD_END}
256 };
257
258 static const struct struct_description codesys_prop_dynarr_description = {
259         sizeof(codesys_prop_dynarr),
260         codesys_prop_dynarr_description_1
261 };
262
263 codesys_prop_dynarr *the_codesys_prop_dynarr;
264
265 enum codesys_prop_enum {
266         CODESYS_PROP_ALL_OK,
267         CODESYS_PROP_ISO2022,
268         CODESYS_PROP_CCL
269 };
270 \f
271 /************************************************************************/
272 /*                       Coding system functions                        */
273 /************************************************************************/
274
275 static Lisp_Object mark_coding_system(Lisp_Object);
276 static void print_coding_system(Lisp_Object, Lisp_Object, int);
277 static void finalize_coding_system(void *header, int for_disksave);
278
279 #ifdef MULE
280 static const struct lrecord_description ccs_description_1[] = {
281         {XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset)},
282         {XD_LISP_OBJECT, offsetof(charset_conversion_spec, to_charset)},
283         {XD_END}
284 };
285
286 static const struct struct_description ccs_description = {
287         sizeof(charset_conversion_spec),
288         ccs_description_1
289 };
290
291 static const struct lrecord_description ccsd_description_1[] = {
292         XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
293         {XD_END}
294 };
295
296 static const struct struct_description ccsd_description = {
297         sizeof(charset_conversion_spec_dynarr),
298         ccsd_description_1
299 };
300 #endif
301
302 static const struct lrecord_description coding_system_description[] = {
303         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, name)},
304         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, doc_string)},
305         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, mnemonic)},
306         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, post_read_conversion)},
307         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, pre_write_conversion)},
308         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_lf)},
309         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_crlf)},
310         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_cr)},
311 #ifdef MULE
312         {XD_LISP_OBJECT_ARRAY,
313          offsetof(Lisp_Coding_System, iso2022.initial_charset), 4},
314         {XD_STRUCT_PTR, offsetof(Lisp_Coding_System, iso2022.input_conv), 1,
315          &ccsd_description},
316         {XD_STRUCT_PTR, offsetof(Lisp_Coding_System, iso2022.output_conv), 1,
317          &ccsd_description},
318         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, ccl.decode)},
319         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, ccl.encode)},
320 #endif
321         {XD_END}
322 };
323
324 DEFINE_LRECORD_IMPLEMENTATION("coding-system", coding_system,
325                               mark_coding_system, print_coding_system,
326                               finalize_coding_system,
327                               0, 0, coding_system_description,
328                               Lisp_Coding_System);
329
330 static Lisp_Object mark_coding_system(Lisp_Object obj)
331 {
332         Lisp_Coding_System *codesys = XCODING_SYSTEM(obj);
333
334         mark_object(CODING_SYSTEM_NAME(codesys));
335         mark_object(CODING_SYSTEM_DOC_STRING(codesys));
336         mark_object(CODING_SYSTEM_MNEMONIC(codesys));
337         mark_object(CODING_SYSTEM_EOL_LF(codesys));
338         mark_object(CODING_SYSTEM_EOL_CRLF(codesys));
339         mark_object(CODING_SYSTEM_EOL_CR(codesys));
340
341         switch (CODING_SYSTEM_TYPE(codesys)) {
342 #ifdef MULE
343                 int i;
344         case CODESYS_ISO2022:
345                 for (i = 0; i < 4; i++)
346                         mark_object(CODING_SYSTEM_ISO2022_INITIAL_CHARSET
347                                     (codesys, i));
348                 if (codesys->iso2022.input_conv) {
349                         for (i = 0;
350                              i < Dynarr_length(codesys->iso2022.input_conv);
351                              i++) {
352                                 struct charset_conversion_spec *ccs =
353                                     Dynarr_atp(codesys->iso2022.input_conv, i);
354                                 mark_object(ccs->from_charset);
355                                 mark_object(ccs->to_charset);
356                         }
357                 }
358                 if (codesys->iso2022.output_conv) {
359                         for (i = 0;
360                              i < Dynarr_length(codesys->iso2022.output_conv);
361                              i++) {
362                                 struct charset_conversion_spec *ccs =
363                                     Dynarr_atp(codesys->iso2022.output_conv, i);
364                                 mark_object(ccs->from_charset);
365                                 mark_object(ccs->to_charset);
366                         }
367                 }
368                 break;
369
370         case CODESYS_CCL:
371                 mark_object(CODING_SYSTEM_CCL_DECODE(codesys));
372                 mark_object(CODING_SYSTEM_CCL_ENCODE(codesys));
373                 break;
374
375                 /* list the rest of them lot explicitly */
376         case CODESYS_AUTODETECT:
377         case CODESYS_SHIFT_JIS:
378         case CODESYS_BIG5:
379         case CODESYS_UCS4:
380         case CODESYS_UTF8:
381         case CODESYS_NO_CONVERSION:
382 #ifdef DEBUG_SXEMACS
383         case CODESYS_INTERNAL:
384 #endif
385 #endif                          /* MULE */
386         default:
387                 break;
388         }
389
390         mark_object(CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys));
391         return CODING_SYSTEM_POST_READ_CONVERSION(codesys);
392 }
393
394 static void
395 print_coding_system(Lisp_Object obj, Lisp_Object printcharfun, int escapeflag)
396 {
397         Lisp_Coding_System *c = XCODING_SYSTEM(obj);
398         if (print_readably)
399                 error("printing unreadable object #<coding-system 0x%x>",
400                       c->header.uid);
401
402         write_c_string("#<coding-system ", printcharfun);
403         print_internal(c->name, printcharfun, 1);
404         write_c_string(">", printcharfun);
405 }
406
407 static void finalize_coding_system(void *header, int for_disksave)
408 {
409         Lisp_Coding_System *c = (Lisp_Coding_System *) header;
410         /* Since coding systems never go away, this function is not
411            necessary.  But it would be necessary if we changed things
412            so that coding systems could go away. */
413         if (!for_disksave) {    /* see comment in lstream.c */
414                 switch (CODING_SYSTEM_TYPE(c)) {
415 #ifdef MULE
416                 case CODESYS_ISO2022:
417                         if (c->iso2022.input_conv) {
418                                 Dynarr_free(c->iso2022.input_conv);
419                                 c->iso2022.input_conv = 0;
420                         }
421                         if (c->iso2022.output_conv) {
422                                 Dynarr_free(c->iso2022.output_conv);
423                                 c->iso2022.output_conv = 0;
424                         }
425                         break;
426
427                         /* list the rest of them lot explicitly */
428                 case CODESYS_AUTODETECT:
429                 case CODESYS_SHIFT_JIS:
430                 case CODESYS_BIG5:
431                 case CODESYS_UCS4:
432                 case CODESYS_UTF8:
433                 case CODESYS_CCL:
434                 case CODESYS_NO_CONVERSION:
435 #ifdef DEBUG_SXEMACS
436                 case CODESYS_INTERNAL:
437 #endif
438 #endif                          /* MULE */
439                 default:
440                         break;
441                 }
442         }
443 }
444
445 static eol_type_t symbol_to_eol_type(Lisp_Object symbol)
446 {
447         CHECK_SYMBOL(symbol);
448         if (NILP(symbol))
449                 return EOL_AUTODETECT;
450         if (EQ(symbol, Qlf))
451                 return EOL_LF;
452         if (EQ(symbol, Qcrlf))
453                 return EOL_CRLF;
454         if (EQ(symbol, Qcr))
455                 return EOL_CR;
456
457         signal_simple_error("Unrecognized eol type", symbol);
458         return EOL_AUTODETECT;  /* not reached */
459 }
460
461 static Lisp_Object eol_type_to_symbol(eol_type_t type)
462 {
463         switch (type) {
464         default:
465                 abort();
466                 return Qnil;
467         case EOL_LF:
468                 return Qlf;
469         case EOL_CRLF:
470                 return Qcrlf;
471         case EOL_CR:
472                 return Qcr;
473         case EOL_AUTODETECT:
474                 return Qnil;
475         }
476 }
477
478 static void setup_eol_coding_systems(Lisp_Coding_System * codesys)
479 {
480         Lisp_Object codesys_obj;
481         int len = string_length(XSYMBOL(CODING_SYSTEM_NAME(codesys))->name);
482         char *codesys_name = (char *)alloca(len + 7);
483         int mlen = -1;
484         char *codesys_mnemonic = 0;
485
486         Lisp_Object codesys_name_sym, sub_codesys_obj;
487
488         /* kludge */
489
490         XSETCODING_SYSTEM(codesys_obj, codesys);
491
492         memcpy(codesys_name,
493                string_data(XSYMBOL(CODING_SYSTEM_NAME(codesys))->name), len);
494
495         if (STRINGP(CODING_SYSTEM_MNEMONIC(codesys))) {
496                 mlen = XSTRING_LENGTH(CODING_SYSTEM_MNEMONIC(codesys));
497                 codesys_mnemonic = (char *)alloca(mlen + 7);
498                 memcpy(codesys_mnemonic,
499                        XSTRING_DATA(CODING_SYSTEM_MNEMONIC(codesys)), mlen);
500         }
501 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
502   strcpy (codesys_name + len, "-" op_sys);                                      \
503   if (mlen != -1)                                                               \
504     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
505   codesys_name_sym = intern (codesys_name);                                     \
506   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
507   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
508   if (mlen != -1)                                                               \
509     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
510       build_string (codesys_mnemonic);                                          \
511   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
512 } while (0)
513
514         DEFINE_SUB_CODESYS("unix", "", EOL_LF);
515         DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
516         DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
517 }
518
519 DEFUN("coding-system-p", Fcoding_system_p, 1, 1, 0,     /*
520 Return t if OBJECT is a coding system.
521 A coding system is an object that defines how text containing multiple
522 character sets is encoded into a stream of (typically 8-bit) bytes.
523 The coding system is used to decode the stream into a series of
524 characters (which may be from multiple charsets) when the text is read
525 from a file or process, and is used to encode the text back into the
526 same format when it is written out to a file or process.
527
528 For example, many ISO2022-compliant coding systems (such as Compound
529 Text, which is used for inter-client data under the X Window System)
530 use escape sequences to switch between different charsets -- Japanese
531 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
532 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
533 `make-coding-system' for more information.
534
535 Coding systems are normally identified using a symbol, and the
536 symbol is accepted in place of the actual coding system object whenever
537 a coding system is called for. (This is similar to how faces work.)
538 */
539       (object))
540 {
541         return CODING_SYSTEMP(object) ? Qt : Qnil;
542 }
543
544 DEFUN("find-coding-system", Ffind_coding_system, 1, 1, 0,       /*
545 Retrieve the coding system of the given name.
546
547 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
548 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
549 If there is no such coding system, nil is returned.  Otherwise the
550 associated coding system object is returned.
551 */
552       (coding_system_or_name))
553 {
554         if (NILP(coding_system_or_name))
555                 coding_system_or_name = Qbinary;
556         else if (CODING_SYSTEMP(coding_system_or_name))
557                 return coding_system_or_name;
558         else
559                 CHECK_SYMBOL(coding_system_or_name);
560
561         while (1) {
562                 coding_system_or_name =
563                     Fgethash(coding_system_or_name, Vcoding_system_hash_table,
564                              Qnil);
565
566                 if (CODING_SYSTEMP(coding_system_or_name)
567                     || NILP(coding_system_or_name))
568                         return coding_system_or_name;
569         }
570 }
571
572 DEFUN("get-coding-system", Fget_coding_system, 1, 1, 0, /*
573 Retrieve the coding system of the given name.
574 Same as `find-coding-system' except that if there is no such
575 coding system, an error is signaled instead of returning nil.
576 */
577       (name))
578 {
579         Lisp_Object coding_system = Ffind_coding_system(name);
580
581         if (NILP(coding_system))
582                 signal_simple_error("No such coding system", name);
583         return coding_system;
584 }
585
586 /* We store the coding systems in hash tables with the names as the key and the
587    actual coding system object as the value.  Occasionally we need to use them
588    in a list format.  These routines provide us with that. */
589 struct coding_system_list_closure {
590         Lisp_Object *coding_system_list;
591 };
592
593 static int
594 add_coding_system_to_list_mapper(Lisp_Object key, Lisp_Object value,
595                                  void *coding_system_list_closure)
596 {
597         /* This function can GC */
598         struct coding_system_list_closure *cscl =
599             (struct coding_system_list_closure *)coding_system_list_closure;
600         Lisp_Object *coding_system_list = cscl->coding_system_list;
601
602         *coding_system_list = Fcons(key, *coding_system_list);
603         return 0;
604 }
605
606 DEFUN("coding-system-list", Fcoding_system_list, 0, 0, 0,       /*
607 Return a list of the names of all defined coding systems.
608 */
609       ())
610 {
611         Lisp_Object coding_system_list = Qnil;
612         struct gcpro gcpro1;
613         struct coding_system_list_closure coding_system_list_closure;
614
615         GCPRO1(coding_system_list);
616         coding_system_list_closure.coding_system_list = &coding_system_list;
617         elisp_maphash(add_coding_system_to_list_mapper,
618                       Vcoding_system_hash_table, &coding_system_list_closure);
619         UNGCPRO;
620
621         return coding_system_list;
622 }
623
624 DEFUN("coding-system-name", Fcoding_system_name, 1, 1, 0,       /*
625 Return the name of the given coding system.
626 */
627       (coding_system))
628 {
629         coding_system = Fget_coding_system(coding_system);
630         return XCODING_SYSTEM_NAME(coding_system);
631 }
632
633 static Lisp_Coding_System *allocate_coding_system(enum coding_system_type type,
634                                                   Lisp_Object name)
635 {
636         Lisp_Coding_System *codesys =
637             alloc_lcrecord_type(Lisp_Coding_System, &lrecord_coding_system);
638
639         zero_lcrecord(codesys);
640         CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys) = Qnil;
641         CODING_SYSTEM_POST_READ_CONVERSION(codesys) = Qnil;
642         CODING_SYSTEM_EOL_TYPE(codesys) = EOL_AUTODETECT;
643         CODING_SYSTEM_EOL_CRLF(codesys) = Qnil;
644         CODING_SYSTEM_EOL_CR(codesys) = Qnil;
645         CODING_SYSTEM_EOL_LF(codesys) = Qnil;
646         CODING_SYSTEM_TYPE(codesys) = type;
647         CODING_SYSTEM_MNEMONIC(codesys) = Qnil;
648 #ifdef MULE
649         if (type == CODESYS_ISO2022) {
650                 int i;
651                 for (i = 0; i < 4; i++)
652                         CODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, i) =
653                             Qnil;
654         } else if (type == CODESYS_CCL) {
655                 CODING_SYSTEM_CCL_DECODE(codesys) = Qnil;
656                 CODING_SYSTEM_CCL_ENCODE(codesys) = Qnil;
657         }
658 #endif                          /* MULE */
659         CODING_SYSTEM_NAME(codesys) = name;
660
661         return codesys;
662 }
663
664 #ifdef MULE
665 /* Given a list of charset conversion specs as specified in a Lisp
666    program, parse it into STORE_HERE. */
667
668 static void
669 parse_charset_conversion_specs(charset_conversion_spec_dynarr * store_here,
670                                Lisp_Object spec_list)
671 {
672         Lisp_Object rest;
673
674         EXTERNAL_LIST_LOOP(rest, spec_list) {
675                 Lisp_Object car = XCAR(rest);
676                 Lisp_Object from, to;
677                 struct charset_conversion_spec spec;
678
679                 if (!CONSP(car) || !CONSP(XCDR(car)) || !NILP(XCDR(XCDR(car))))
680                         signal_simple_error("Invalid charset conversion spec",
681                                             car);
682                 from = Fget_charset(XCAR(car));
683                 to = Fget_charset(XCAR(XCDR(car)));
684                 if (XCHARSET_TYPE(from) != XCHARSET_TYPE(to))
685                         signal_simple_error_2
686                             ("Attempted conversion between different charset types",
687                              from, to);
688                 spec.from_charset = from;
689                 spec.to_charset = to;
690
691                 Dynarr_add(store_here, spec);
692         }
693 }
694
695 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
696    specs, return the equivalent as the Lisp programmer would see it.
697
698    If LOAD_HERE is 0, return Qnil. */
699
700 static Lisp_Object
701 unparse_charset_conversion_specs(charset_conversion_spec_dynarr * load_here)
702 {
703         int i;
704         Lisp_Object result;
705
706         if (!load_here)
707                 return Qnil;
708         for (i = 0, result = Qnil; i < Dynarr_length(load_here); i++) {
709                 struct charset_conversion_spec *ccs = Dynarr_atp(load_here, i);
710                 result =
711                     Fcons(list2(ccs->from_charset, ccs->to_charset), result);
712         }
713
714         return Fnreverse(result);
715 }
716
717 #endif                          /* MULE */
718
719 DEFUN("make-coding-system", Fmake_coding_system, 2, 4, 0,       /*
720 Register symbol NAME as a coding system.
721
722 TYPE describes the conversion method used and should be one of
723
724 nil or 'undecided
725 Automatic conversion.  SXEmacs attempts to detect the coding system
726 used in the file.
727 'no-conversion
728 No conversion.  Use this for binary files and such.  On output,
729 graphic characters that are not in ASCII or Latin-1 will be
730 replaced by a ?. (For a no-conversion-encoded buffer, these
731 characters will only be present if you explicitly insert them.)
732 'shift-jis
733 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
734 'ucs-4
735 ISO 10646 UCS-4 encoding.
736 'utf-8
737 ISO 10646 UTF-8 encoding.
738 'iso2022
739 Any ISO2022-compliant encoding.  Among other things, this includes
740 JIS (the Japanese encoding commonly used for e-mail), EUC (the
741 standard Unix encoding for Japanese and other languages), and
742 Compound Text (the encoding used in X11).  You can specify more
743 specific information about the conversion with the PROPS argument.
744 'big5
745 Big5 (the encoding commonly used for Taiwanese).
746 'ccl
747 The conversion is performed using a user-written pseudo-code
748 program.  CCL (Code Conversion Language) is the name of this
749 pseudo-code.
750 'internal
751 Write out or read in the raw contents of the memory representing
752 the buffer's text.  This is primarily useful for debugging
753 purposes, and is only enabled when SXEmacs has been compiled with
754 DEBUG_SXEMACS defined (via the --debug configure option).
755 WARNING: Reading in a file using 'internal conversion can result
756 in an internal inconsistency in the memory representing a
757 buffer's text, which will produce unpredictable results and may
758 cause SXEmacs to crash.  Under normal circumstances you should
759 never use 'internal conversion.
760
761 DOC-STRING is a string describing the coding system.
762
763 PROPS is a property list, describing the specific nature of the
764 character set.  Recognized properties are:
765
766 'mnemonic
767 String to be displayed in the modeline when this coding system is
768 active.
769
770 'eol-type
771 End-of-line conversion to be used.  It should be one of
772
773 nil
774 Automatically detect the end-of-line type (LF, CRLF,
775 or CR).  Also generate subsidiary coding systems named
776 `NAME-unix', `NAME-dos', and `NAME-mac', that are
777 identical to this coding system but have an EOL-TYPE
778 value of 'lf, 'crlf, and 'cr, respectively.
779 'lf
780 The end of a line is marked externally using ASCII LF.
781 Since this is also the way that SXEmacs represents an
782 end-of-line internally, specifying this option results
783 in no end-of-line conversion.  This is the standard
784 format for Unix text files.
785 'crlf
786 The end of a line is marked externally using ASCII
787 CRLF.  This is the standard format for MS-DOS text
788 files.
789 'cr
790 The end of a line is marked externally using ASCII CR.
791 This is the standard format for Macintosh text files.
792 t
793 Automatically detect the end-of-line type but do not
794 generate subsidiary coding systems.  (This value is
795 converted to nil when stored internally, and
796 `coding-system-property' will return nil.)
797
798 'post-read-conversion
799 Function called after a file has been read in, to perform the
800 decoding.  Called with two arguments, START and END, denoting
801 a region of the current buffer to be decoded.
802
803 'pre-write-conversion
804 Function called before a file is written out, to perform the
805 encoding.  Called with two arguments, START and END, denoting
806 a region of the current buffer to be encoded.
807
808 The following additional properties are recognized if TYPE is 'iso2022:
809
810 'charset-g0
811 'charset-g1
812 'charset-g2
813 'charset-g3
814 The character set initially designated to the G0 - G3 registers.
815 The value should be one of
816
817 -- A charset object (designate that character set)
818 -- nil (do not ever use this register)
819 -- t (no character set is initially designated to
820 the register, but may be later on; this automatically
821 sets the corresponding `force-g*-on-output' property)
822
823 'force-g0-on-output
824 'force-g1-on-output
825 'force-g2-on-output
826 'force-g2-on-output
827 If non-nil, send an explicit designation sequence on output before
828 using the specified register.
829
830 'short
831 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
832 "ESC $ B" on output in place of the full designation sequences
833 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
834
835 'no-ascii-eol
836 If non-nil, don't designate ASCII to G0 at each end of line on output.
837 Setting this to non-nil also suppresses other state-resetting that
838 normally happens at the end of a line.
839
840 'no-ascii-cntl
841 If non-nil, don't designate ASCII to G0 before control chars on output.
842
843 'seven
844 If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
845 environment.
846
847 'lock-shift
848 If non-nil, use locking-shift (SO/SI) instead of single-shift
849 or designation by escape sequence.
850
851 'no-iso6429
852 If non-nil, don't use ISO6429's direction specification.
853
854 'escape-quoted
855 If non-nil, literal control characters that are the same as
856 the beginning of a recognized ISO2022 or ISO6429 escape sequence
857 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
858 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
859 so that they can be properly distinguished from an escape sequence.
860 (Note that doing this results in a non-portable encoding.) This
861 encoding flag is used for byte-compiled files.  Note that ESC
862 is a good choice for a quoting character because there are no
863 escape sequences whose second byte is a character from the Control-0
864 or Control-1 character sets; this is explicitly disallowed by the
865 ISO2022 standard.
866
867 'input-charset-conversion
868 A list of conversion specifications, specifying conversion of
869 characters in one charset to another when decoding is performed.
870 Each specification is a list of two elements: the source charset,
871 and the destination charset.
872
873 'output-charset-conversion
874 A list of conversion specifications, specifying conversion of
875 characters in one charset to another when encoding is performed.
876 The form of each specification is the same as for
877 'input-charset-conversion.
878
879 The following additional properties are recognized (and required)
880 if TYPE is 'ccl:
881
882 'decode
883 CCL program used for decoding (converting to internal format).
884
885 'encode
886 CCL program used for encoding (converting to external format).
887 */
888       (name, type, doc_string, props))
889 {
890         Lisp_Coding_System *codesys;
891         enum coding_system_type ty;
892         int need_to_setup_eol_systems = 1;
893
894         /* Convert type to constant */
895         if (NILP(type) || EQ(type, Qundecided)) {
896                 ty = CODESYS_AUTODETECT;
897         }
898 #ifdef MULE
899         else if (EQ(type, Qshift_jis)) {
900                 ty = CODESYS_SHIFT_JIS;
901         } else if (EQ(type, Qiso2022)) {
902                 ty = CODESYS_ISO2022;
903         } else if (EQ(type, Qbig5)) {
904                 ty = CODESYS_BIG5;
905         } else if (EQ(type, Qucs4)) {
906                 ty = CODESYS_UCS4;
907         } else if (EQ(type, Qutf8)) {
908                 ty = CODESYS_UTF8;
909         } else if (EQ(type, Qccl)) {
910                 ty = CODESYS_CCL;
911         }
912 #endif
913         else if (EQ(type, Qno_conversion)) {
914                 ty = CODESYS_NO_CONVERSION;
915         }
916 #ifdef DEBUG_SXEMACS
917         else if (EQ(type, Qinternal)) {
918                 ty = CODESYS_INTERNAL;
919         }
920 #endif
921         else
922                 signal_simple_error("Invalid coding system type", type);
923
924         CHECK_SYMBOL(name);
925
926         codesys = allocate_coding_system(ty, name);
927
928         if (NILP(doc_string))
929                 doc_string = build_string("");
930         else
931                 CHECK_STRING(doc_string);
932         CODING_SYSTEM_DOC_STRING(codesys) = doc_string;
933
934         {
935                 EXTERNAL_PROPERTY_LIST_LOOP_3(key, value, props) {
936                         if (EQ(key, Qmnemonic)) {
937                                 if (!NILP(value))
938                                         CHECK_STRING(value);
939                                 CODING_SYSTEM_MNEMONIC(codesys) = value;
940                         }
941
942                         else if (EQ(key, Qeol_type)) {
943                                 need_to_setup_eol_systems = NILP(value);
944                                 if (EQ(value, Qt))
945                                         value = Qnil;
946                                 CODING_SYSTEM_EOL_TYPE(codesys) =
947                                     symbol_to_eol_type(value);
948                         }
949
950                         else if (EQ(key, Qpost_read_conversion))
951                                 CODING_SYSTEM_POST_READ_CONVERSION(codesys) =
952                                     value;
953                         else if (EQ(key, Qpre_write_conversion))
954                                 CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys) =
955                                     value;
956 #ifdef MULE
957                         else if (ty == CODESYS_ISO2022) {
958 #define FROB_INITIAL_CHARSET(charset_num) \
959   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
960     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
961
962                                 if (EQ(key, Qcharset_g0))
963                                         FROB_INITIAL_CHARSET(0);
964                                 else if (EQ(key, Qcharset_g1))
965                                         FROB_INITIAL_CHARSET(1);
966                                 else if (EQ(key, Qcharset_g2))
967                                         FROB_INITIAL_CHARSET(2);
968                                 else if (EQ(key, Qcharset_g3))
969                                         FROB_INITIAL_CHARSET(3);
970
971 #define FROB_FORCE_CHARSET(charset_num) \
972   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
973
974                                 else if (EQ(key, Qforce_g0_on_output))
975                                         FROB_FORCE_CHARSET(0);
976                                 else if (EQ(key, Qforce_g1_on_output))
977                                         FROB_FORCE_CHARSET(1);
978                                 else if (EQ(key, Qforce_g2_on_output))
979                                         FROB_FORCE_CHARSET(2);
980                                 else if (EQ(key, Qforce_g3_on_output))
981                                         FROB_FORCE_CHARSET(3);
982
983 #define FROB_BOOLEAN_PROPERTY(prop) \
984   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
985
986                                 else if (EQ(key, Qshort))
987                                         FROB_BOOLEAN_PROPERTY(SHORT);
988                                 else if (EQ(key, Qno_ascii_eol))
989                                         FROB_BOOLEAN_PROPERTY(NO_ASCII_EOL);
990                                 else if (EQ(key, Qno_ascii_cntl))
991                                         FROB_BOOLEAN_PROPERTY(NO_ASCII_CNTL);
992                                 else if (EQ(key, Qseven))
993                                         FROB_BOOLEAN_PROPERTY(SEVEN);
994                                 else if (EQ(key, Qlock_shift))
995                                         FROB_BOOLEAN_PROPERTY(LOCK_SHIFT);
996                                 else if (EQ(key, Qno_iso6429))
997                                         FROB_BOOLEAN_PROPERTY(NO_ISO6429);
998                                 else if (EQ(key, Qescape_quoted))
999                                         FROB_BOOLEAN_PROPERTY(ESCAPE_QUOTED);
1000
1001                                 else if (EQ(key, Qinput_charset_conversion)) {
1002                                         codesys->iso2022.input_conv =
1003                                             Dynarr_new(charset_conversion_spec);
1004                                         parse_charset_conversion_specs(codesys->
1005                                                                        iso2022.
1006                                                                        input_conv,
1007                                                                        value);
1008                                 } else if (EQ(key, Qoutput_charset_conversion)) {
1009                                         codesys->iso2022.output_conv =
1010                                             Dynarr_new(charset_conversion_spec);
1011                                         parse_charset_conversion_specs(codesys->
1012                                                                        iso2022.
1013                                                                        output_conv,
1014                                                                        value);
1015                                 } else
1016                                         signal_simple_error
1017                                             ("Unrecognized property", key);
1018                         } else if (EQ(type, Qccl)) {
1019                                 Lisp_Object sym;
1020                                 struct ccl_program test_ccl;
1021                                 Extbyte *suffix;
1022
1023                                 /* Check key first.  */
1024                                 if (EQ(key, Qdecode))
1025                                         suffix = "-ccl-decode";
1026                                 else if (EQ(key, Qencode))
1027                                         suffix = "-ccl-encode";
1028                                 else
1029                                         signal_simple_error
1030                                             ("Unrecognized property", key);
1031
1032                                 /* If value is vector, register it as a ccl program
1033                                    associated with an newly created symbol for
1034                                    backward compatibility.  */
1035                                 if (VECTORP(value)) {
1036                                         sym =
1037                                             Fintern(concat2
1038                                                     (Fsymbol_name(name),
1039                                                      build_string(suffix)),
1040                                                     Qnil);
1041                                         Fregister_ccl_program(sym, value);
1042                                 } else {
1043                                         CHECK_SYMBOL(value);
1044                                         sym = value;
1045                                 }
1046                                 /* check if the given ccl programs are valid.  */
1047                                 if (setup_ccl_program(&test_ccl, sym) < 0)
1048                                         signal_simple_error
1049                                             ("Invalid CCL program", value);
1050
1051                                 if (EQ(key, Qdecode))
1052                                         CODING_SYSTEM_CCL_DECODE(codesys) = sym;
1053                                 else if (EQ(key, Qencode))
1054                                         CODING_SYSTEM_CCL_ENCODE(codesys) = sym;
1055
1056                         }
1057 #endif                          /* MULE */
1058                         else
1059                                 signal_simple_error("Unrecognized property",
1060                                                     key);
1061                 }
1062         }
1063
1064         if (need_to_setup_eol_systems)
1065                 setup_eol_coding_systems(codesys);
1066
1067         {
1068                 Lisp_Object codesys_obj;
1069                 XSETCODING_SYSTEM(codesys_obj, codesys);
1070                 Fputhash(name, codesys_obj, Vcoding_system_hash_table);
1071                 return codesys_obj;
1072         }
1073 }
1074
1075 DEFUN("copy-coding-system", Fcopy_coding_system, 2, 2, 0,       /*
1076 Copy OLD-CODING-SYSTEM to NEW-NAME.
1077 If NEW-NAME does not name an existing coding system, a new one will
1078 be created.
1079 */
1080       (old_coding_system, new_name))
1081 {
1082         Lisp_Object new_coding_system;
1083         old_coding_system = Fget_coding_system(old_coding_system);
1084         new_coding_system = Ffind_coding_system(new_name);
1085         if (NILP(new_coding_system)) {
1086                 XSETCODING_SYSTEM(new_coding_system,
1087                                   allocate_coding_system
1088                                   (XCODING_SYSTEM_TYPE(old_coding_system),
1089                                    new_name));
1090                 Fputhash(new_name, new_coding_system,
1091                          Vcoding_system_hash_table);
1092         }
1093
1094         {
1095                 Lisp_Coding_System *to = XCODING_SYSTEM(new_coding_system);
1096                 Lisp_Coding_System *from = XCODING_SYSTEM(old_coding_system);
1097                 memcpy(((char *)to) + sizeof(to->header),
1098                        ((char *)from) + sizeof(from->header),
1099                        sizeof(*from) - sizeof(from->header));
1100                 to->name = new_name;
1101         }
1102         return new_coding_system;
1103 }
1104
1105 DEFUN("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0,       /*
1106 Return t if OBJECT names a coding system, and is not a coding system alias.
1107 */
1108       (object))
1109 {
1110         Lisp_Object val = Fgethash(object, Vcoding_system_hash_table, Qnil);
1111         return CODING_SYSTEMP(val) ? Qt : Qnil;
1112 }
1113
1114 DEFUN("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1115  Return t if OBJECT is a coding system alias.
1116 All coding system aliases are created by `define-coding-system-alias'.
1117 */
1118       (object))
1119 {
1120         Lisp_Object val = Fgethash(object, Vcoding_system_hash_table, Qzero);
1121         return SYMBOLP(val) ? Qt : Qnil;
1122 }
1123
1124 DEFUN("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1125 Return the coding-system symbol for which symbol ALIAS is an alias.
1126 */
1127       (alias))
1128 {
1129         Lisp_Object aliasee = Fgethash(alias, Vcoding_system_hash_table, Qnil);
1130         if (SYMBOLP(aliasee)) {
1131                 return aliasee;
1132         } else {
1133                 signal_simple_error(
1134                         "Symbol is not a coding system alias", alias);
1135         }
1136         /* To keep the compiler happy */
1137         return Qnil;
1138 }
1139
1140 static Lisp_Object
1141 append_suffix_to_symbol(Lisp_Object symbol, char *ascii_string)
1142 {
1143         return Fintern(concat2(Fsymbol_name(symbol),
1144                                build_string(ascii_string)), Qnil);
1145 }
1146
1147 /* A maphash function, for removing dangling coding system aliases. */
1148 static int
1149 dangling_coding_system_alias_p(Lisp_Object alias,
1150                                Lisp_Object aliasee, void *dangling_aliases)
1151 {
1152         if (SYMBOLP(aliasee)
1153             && NILP(Fgethash(aliasee, Vcoding_system_hash_table, Qnil))) {
1154                 (*(int *)dangling_aliases)++;
1155                 return 1;
1156         } else {
1157                 return 0;
1158         }
1159 }
1160
1161 DEFUN("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0,       /*
1162 Define symbol ALIAS as an alias for coding system ALIASEE.
1163
1164 You can use this function to redefine an alias that has already been defined,
1165 but you cannot redefine a name which is the canonical name for a coding system.
1166 \(a canonical name of a coding system is what is returned when you call
1167 `coding-system-name' on a coding system).
1168
1169 ALIASEE itself can be an alias, which allows you to define nested aliases.
1170
1171 You are forbidden, however, from creating alias loops or `dangling' aliases.
1172 These will be detected, and an error will be signaled if you attempt to do so.
1173
1174 If ALIASEE is nil, then ALIAS will simply be undefined.
1175
1176 See also `coding-system-alias-p', `coding-system-aliasee',
1177 and `coding-system-canonical-name-p'.
1178 */
1179       (alias, aliasee))
1180 {
1181         Lisp_Object real_coding_system, probe;
1182
1183         CHECK_SYMBOL(alias);
1184
1185         if (!NILP(Fcoding_system_canonical_name_p(alias)))
1186                 signal_simple_error
1187                     ("Symbol is the canonical name of a coding system and cannot be redefined",
1188                      alias);
1189
1190         if (NILP(aliasee)) {
1191                 Lisp_Object subsidiary_unix =
1192                     append_suffix_to_symbol(alias, "-unix");
1193                 Lisp_Object subsidiary_dos =
1194                     append_suffix_to_symbol(alias, "-dos");
1195                 Lisp_Object subsidiary_mac =
1196                     append_suffix_to_symbol(alias, "-mac");
1197
1198                 Fremhash(alias, Vcoding_system_hash_table);
1199
1200                 /* Undefine subsidiary aliases,
1201                    presumably created by a previous call to this function */
1202                 if (!NILP(Fcoding_system_alias_p(subsidiary_unix)) &&
1203                     !NILP(Fcoding_system_alias_p(subsidiary_dos)) &&
1204                     !NILP(Fcoding_system_alias_p(subsidiary_mac))) {
1205                         Fdefine_coding_system_alias(subsidiary_unix, Qnil);
1206                         Fdefine_coding_system_alias(subsidiary_dos, Qnil);
1207                         Fdefine_coding_system_alias(subsidiary_mac, Qnil);
1208                 }
1209
1210                 /* Undefine dangling coding system aliases. */
1211                 {
1212                         int dangling_aliases;
1213
1214                         do {
1215                                 dangling_aliases = 0;
1216                                 elisp_map_remhash
1217                                     (dangling_coding_system_alias_p,
1218                                      Vcoding_system_hash_table,
1219                                      &dangling_aliases);
1220                         } while (dangling_aliases > 0);
1221                 }
1222
1223                 return Qnil;
1224         }
1225
1226         if (CODING_SYSTEMP(aliasee))
1227                 aliasee = XCODING_SYSTEM_NAME(aliasee);
1228
1229         /* Checks that aliasee names a coding-system */
1230         real_coding_system = Fget_coding_system(aliasee);
1231
1232         /* Check for coding system alias loops */
1233         if (EQ(alias, aliasee))
1234               alias_loop:signal_simple_error_2
1235                     ("Attempt to create a coding system alias loop", alias,
1236                      aliasee);
1237
1238         for (probe = aliasee;
1239              SYMBOLP(probe);
1240              probe = Fgethash(probe, Vcoding_system_hash_table, Qzero)) {
1241                 if (EQ(probe, alias))
1242                         goto alias_loop;
1243         }
1244
1245         Fputhash(alias, aliasee, Vcoding_system_hash_table);
1246
1247         /* Set up aliases for subsidiaries.
1248            #### There must be a better way to handle subsidiary coding
1249            #### systems. */
1250         {
1251                 static char *suffixes[] = { "-unix", "-dos", "-mac" };
1252
1253                 for (int i = 0; i < countof(suffixes); i++) {
1254                         Lisp_Object alias_subsidiary =
1255                                 append_suffix_to_symbol(alias, suffixes[i]);
1256                         Lisp_Object aliasee_subsidiary =
1257                                 append_suffix_to_symbol(aliasee, suffixes[i]);
1258
1259                         if (!NILP(Ffind_coding_system(aliasee_subsidiary))) {
1260                                 Fdefine_coding_system_alias(alias_subsidiary,
1261                                                             aliasee_subsidiary);
1262                         }
1263                 }
1264         }
1265         /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1266            but it doesn't look intentional, so I'd rather return something
1267            meaningful or nothing at all. */
1268         return Qnil;
1269 }
1270
1271 static Lisp_Object
1272 subsidiary_coding_system(Lisp_Object coding_system, eol_type_t type)
1273 {
1274         Lisp_Coding_System *cs = XCODING_SYSTEM(coding_system);
1275         Lisp_Object new_coding_system;
1276
1277         if (CODING_SYSTEM_EOL_TYPE(cs) != EOL_AUTODETECT)
1278                 return coding_system;
1279
1280         switch (type) {
1281         case EOL_AUTODETECT:
1282                 return coding_system;
1283         case EOL_LF:
1284                 new_coding_system = CODING_SYSTEM_EOL_LF(cs);
1285                 break;
1286         case EOL_CR:
1287                 new_coding_system = CODING_SYSTEM_EOL_CR(cs);
1288                 break;
1289         case EOL_CRLF:
1290                 new_coding_system = CODING_SYSTEM_EOL_CRLF(cs);
1291                 break;
1292         default:
1293                 abort();
1294                 return Qnil;
1295         }
1296
1297         return NILP(new_coding_system) ? coding_system : new_coding_system;
1298 }
1299
1300 DEFUN("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0,   /*
1301 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1302 */
1303       (coding_system, eol_type))
1304 {
1305         coding_system = Fget_coding_system(coding_system);
1306
1307         return subsidiary_coding_system(coding_system,
1308                                         symbol_to_eol_type(eol_type));
1309 }
1310 \f
1311 /************************************************************************/
1312 /*                         Coding system accessors                      */
1313 /************************************************************************/
1314
1315 DEFUN("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0,   /*
1316 Return the doc string for CODING-SYSTEM.
1317 */
1318       (coding_system))
1319 {
1320         coding_system = Fget_coding_system(coding_system);
1321         return XCODING_SYSTEM_DOC_STRING(coding_system);
1322 }
1323
1324 DEFUN("coding-system-type", Fcoding_system_type, 1, 1, 0,       /*
1325 Return the type of CODING-SYSTEM.
1326 */
1327       (coding_system))
1328 {
1329         Lisp_Object tmp = Fget_coding_system(coding_system);
1330
1331         switch (XCODING_SYSTEM_TYPE(tmp)) {
1332         default:
1333                 abort();
1334                 return Qnil;
1335         case CODESYS_AUTODETECT:
1336                 return Qundecided;
1337 #ifdef MULE
1338         case CODESYS_SHIFT_JIS:
1339                 return Qshift_jis;
1340         case CODESYS_ISO2022:
1341                 return Qiso2022;
1342         case CODESYS_BIG5:
1343                 return Qbig5;
1344         case CODESYS_UCS4:
1345                 return Qucs4;
1346         case CODESYS_UTF8:
1347                 return Qutf8;
1348         case CODESYS_CCL:
1349                 return Qccl;
1350 #endif
1351         case CODESYS_NO_CONVERSION:
1352                 return Qno_conversion;
1353 #ifdef DEBUG_SXEMACS
1354         case CODESYS_INTERNAL:
1355                 return Qinternal;
1356 #endif
1357         }
1358 }
1359
1360 #ifdef MULE
1361 static
1362 Lisp_Object coding_system_charset(Lisp_Object coding_system, int gnum)
1363 {
1364         Lisp_Object cs
1365             = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET(coding_system, gnum);
1366
1367         return CHARSETP(cs) ? XCHARSET_NAME(cs) : Qnil;
1368 }
1369
1370 DEFUN("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1371 Return initial charset of CODING-SYSTEM designated to GNUM.
1372 GNUM allows 0 .. 3.
1373 */
1374       (coding_system, gnum))
1375 {
1376         coding_system = Fget_coding_system(coding_system);
1377         CHECK_INT(gnum);
1378
1379         return coding_system_charset(coding_system, XINT(gnum));
1380 }
1381 #endif                          /* MULE */
1382
1383 DEFUN("coding-system-property", Fcoding_system_property, 2, 2, 0,       /*
1384 Return the PROP property of CODING-SYSTEM.
1385 */
1386       (coding_system, prop))
1387 {
1388         int i, ok = 0;
1389         enum coding_system_type type;
1390
1391         coding_system = Fget_coding_system(coding_system);
1392         CHECK_SYMBOL(prop);
1393         type = XCODING_SYSTEM_TYPE(coding_system);
1394
1395         for (i = 0; !ok && i < Dynarr_length(the_codesys_prop_dynarr); i++)
1396                 if (EQ(Dynarr_at(the_codesys_prop_dynarr, i).sym, prop)) {
1397                         ok = 1;
1398                         switch (Dynarr_at(the_codesys_prop_dynarr, i).prop_type) {
1399                         case CODESYS_PROP_ALL_OK:
1400                                 break;
1401 #ifdef MULE
1402                         case CODESYS_PROP_ISO2022:
1403                                 if (type != CODESYS_ISO2022)
1404                                         signal_simple_error
1405                                             ("Property only valid in ISO2022 coding systems",
1406                                              prop);
1407                                 break;
1408
1409                         case CODESYS_PROP_CCL:
1410                                 if (type != CODESYS_CCL)
1411                                         signal_simple_error
1412                                             ("Property only valid in CCL coding systems",
1413                                              prop);
1414                                 break;
1415 #endif                          /* MULE */
1416                         default:
1417                                 abort();
1418                         }
1419                 }
1420
1421         if (!ok)
1422                 signal_simple_error("Unrecognized property", prop);
1423
1424         if (EQ(prop, Qname))
1425                 return XCODING_SYSTEM_NAME(coding_system);
1426         else if (EQ(prop, Qtype))
1427                 return Fcoding_system_type(coding_system);
1428         else if (EQ(prop, Qdoc_string))
1429                 return XCODING_SYSTEM_DOC_STRING(coding_system);
1430         else if (EQ(prop, Qmnemonic))
1431                 return XCODING_SYSTEM_MNEMONIC(coding_system);
1432         else if (EQ(prop, Qeol_type))
1433                 return
1434                     eol_type_to_symbol(XCODING_SYSTEM_EOL_TYPE(coding_system));
1435         else if (EQ(prop, Qeol_lf))
1436                 return XCODING_SYSTEM_EOL_LF(coding_system);
1437         else if (EQ(prop, Qeol_crlf))
1438                 return XCODING_SYSTEM_EOL_CRLF(coding_system);
1439         else if (EQ(prop, Qeol_cr))
1440                 return XCODING_SYSTEM_EOL_CR(coding_system);
1441         else if (EQ(prop, Qpost_read_conversion))
1442                 return XCODING_SYSTEM_POST_READ_CONVERSION(coding_system);
1443         else if (EQ(prop, Qpre_write_conversion))
1444                 return XCODING_SYSTEM_PRE_WRITE_CONVERSION(coding_system);
1445 #ifdef MULE
1446         else if (type == CODESYS_ISO2022) {
1447                 if (EQ(prop, Qcharset_g0))
1448                         return coding_system_charset(coding_system, 0);
1449                 else if (EQ(prop, Qcharset_g1))
1450                         return coding_system_charset(coding_system, 1);
1451                 else if (EQ(prop, Qcharset_g2))
1452                         return coding_system_charset(coding_system, 2);
1453                 else if (EQ(prop, Qcharset_g3))
1454                         return coding_system_charset(coding_system, 3);
1455
1456 #define FORCE_CHARSET(charset_num) \
1457   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1458    (coding_system, charset_num) ? Qt : Qnil)
1459
1460                 else if (EQ(prop, Qforce_g0_on_output))
1461                         return FORCE_CHARSET(0);
1462                 else if (EQ(prop, Qforce_g1_on_output))
1463                         return FORCE_CHARSET(1);
1464                 else if (EQ(prop, Qforce_g2_on_output))
1465                         return FORCE_CHARSET(2);
1466                 else if (EQ(prop, Qforce_g3_on_output))
1467                         return FORCE_CHARSET(3);
1468
1469 #define LISP_BOOLEAN(prop) \
1470   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1471
1472                 else if (EQ(prop, Qshort))
1473                         return LISP_BOOLEAN(SHORT);
1474                 else if (EQ(prop, Qno_ascii_eol))
1475                         return LISP_BOOLEAN(NO_ASCII_EOL);
1476                 else if (EQ(prop, Qno_ascii_cntl))
1477                         return LISP_BOOLEAN(NO_ASCII_CNTL);
1478                 else if (EQ(prop, Qseven))
1479                         return LISP_BOOLEAN(SEVEN);
1480                 else if (EQ(prop, Qlock_shift))
1481                         return LISP_BOOLEAN(LOCK_SHIFT);
1482                 else if (EQ(prop, Qno_iso6429))
1483                         return LISP_BOOLEAN(NO_ISO6429);
1484                 else if (EQ(prop, Qescape_quoted))
1485                         return LISP_BOOLEAN(ESCAPE_QUOTED);
1486
1487                 else if (EQ(prop, Qinput_charset_conversion))
1488                         return
1489                             unparse_charset_conversion_specs
1490                             (XCODING_SYSTEM(coding_system)->iso2022.input_conv);
1491                 else if (EQ(prop, Qoutput_charset_conversion))
1492                         return
1493                             unparse_charset_conversion_specs
1494                             (XCODING_SYSTEM(coding_system)->iso2022.
1495                              output_conv);
1496                 else
1497                         abort();
1498         } else if (type == CODESYS_CCL) {
1499                 if (EQ(prop, Qdecode))
1500                         return XCODING_SYSTEM_CCL_DECODE(coding_system);
1501                 else if (EQ(prop, Qencode))
1502                         return XCODING_SYSTEM_CCL_ENCODE(coding_system);
1503                 else
1504                         abort();
1505         }
1506 #endif                          /* MULE */
1507         else
1508                 abort();
1509
1510         return Qnil;            /* not reached */
1511 }
1512 \f
1513 /************************************************************************/
1514 /*                       Coding category functions                      */
1515 /************************************************************************/
1516
1517 static int decode_coding_category(Lisp_Object symbol)
1518 {
1519         int i;
1520
1521         CHECK_SYMBOL(symbol);
1522         for (i = 0; i < CODING_CATEGORY_LAST; i++)
1523                 if (EQ(coding_category_symbol[i], symbol))
1524                         return i;
1525
1526         signal_simple_error("Unrecognized coding category", symbol);
1527         return 0;               /* not reached */
1528 }
1529
1530 DEFUN("coding-category-list", Fcoding_category_list, 0, 0, 0,   /*
1531 Return a list of all recognized coding categories.
1532 */
1533       ())
1534 {
1535         int i;
1536         Lisp_Object list = Qnil;
1537
1538         for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1539                 list = Fcons(coding_category_symbol[i], list);
1540         return list;
1541 }
1542
1543 DEFUN("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0,   /*
1544 Change the priority order of the coding categories.
1545 LIST should be list of coding categories, in descending order of
1546 priority.  Unspecified coding categories will be lower in priority
1547 than all specified ones, in the same relative order they were in
1548 previously.
1549 */
1550       (list))
1551 {
1552         int category_to_priority[CODING_CATEGORY_LAST];
1553         int i, j;
1554         Lisp_Object rest;
1555
1556         /* First generate a list that maps coding categories to priorities. */
1557
1558         for (i = 0; i < CODING_CATEGORY_LAST; i++)
1559                 category_to_priority[i] = -1;
1560
1561         /* Highest priority comes from the specified list. */
1562         i = 0;
1563         EXTERNAL_LIST_LOOP(rest, list) {
1564                 int cat = decode_coding_category(XCAR(rest));
1565
1566                 if (category_to_priority[cat] >= 0)
1567                         signal_simple_error("Duplicate coding category in list",
1568                                             XCAR(rest));
1569                 category_to_priority[cat] = i++;
1570         }
1571
1572         /* Now go through the existing categories by priority to retrieve
1573            the categories not yet specified and preserve their priority
1574            order. */
1575         for (j = 0; j < CODING_CATEGORY_LAST; j++) {
1576                 int cat = fcd->coding_category_by_priority[j];
1577                 if (category_to_priority[cat] < 0)
1578                         category_to_priority[cat] = i++;
1579         }
1580
1581         /* Now we need to construct the inverse of the mapping we just
1582            constructed. */
1583
1584         for (i = 0; i < CODING_CATEGORY_LAST; i++)
1585                 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1586
1587         /* Phew!  That was confusing. */
1588         return Qnil;
1589 }
1590
1591 DEFUN("coding-priority-list", Fcoding_priority_list, 0, 0, 0,   /*
1592 Return a list of coding categories in descending order of priority.
1593 */
1594       ())
1595 {
1596         int i;
1597         Lisp_Object list = Qnil;
1598
1599         for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1600                 list =
1601                     Fcons(coding_category_symbol
1602                           [fcd->coding_category_by_priority[i]], list);
1603         return list;
1604 }
1605
1606 DEFUN("set-coding-category-system", Fset_coding_category_system, 2, 2, 0,       /*
1607 Change the coding system associated with a coding category.
1608 */
1609       (coding_category, coding_system))
1610 {
1611         int cat = decode_coding_category(coding_category);
1612
1613         coding_system = Fget_coding_system(coding_system);
1614         fcd->coding_category_system[cat] = coding_system;
1615         return Qnil;
1616 }
1617
1618 DEFUN("coding-category-system", Fcoding_category_system, 1, 1, 0,       /*
1619 Return the coding system associated with a coding category.
1620 */
1621       (coding_category))
1622 {
1623         int cat = decode_coding_category(coding_category);
1624         Lisp_Object sys = fcd->coding_category_system[cat];
1625
1626         if (!NILP(sys))
1627                 return XCODING_SYSTEM_NAME(sys);
1628         return Qnil;
1629 }
1630 \f
1631 /************************************************************************/
1632 /*                     Detecting the encoding of data                   */
1633 /************************************************************************/
1634
1635 struct detection_state {
1636         eol_type_t eol_type;
1637         int seen_non_ascii;
1638         int mask;
1639 #ifdef MULE
1640         struct {
1641                 int mask;
1642                 int in_second_byte;
1643         } big5;
1644
1645         struct {
1646                 int mask;
1647                 int in_second_byte;
1648         } shift_jis;
1649
1650         struct {
1651                 int mask;
1652                 int in_byte;
1653         } ucs4;
1654
1655         struct {
1656                 int mask;
1657                 int in_byte;
1658         } utf8;
1659
1660         struct {
1661                 int mask;
1662                 int initted;
1663                 struct iso2022_decoder iso;
1664                 unsigned int flags;
1665                 int high_byte_count;
1666                 unsigned int saw_single_shift:1;
1667         } iso2022;
1668 #endif
1669         struct {
1670                 int seen_anything;
1671                 int just_saw_cr;
1672         } eol;
1673 };
1674
1675 static int acceptable_control_char_p(int c)
1676 {
1677         switch (c) {
1678                 /* Allow and ignore control characters that you might
1679                    reasonably see in a text file */
1680         case '\r':
1681         case '\n':
1682         case '\t':
1683         case 7:         /* bell */
1684         case 8:         /* backspace */
1685         case 11:                /* vertical tab */
1686         case 12:                /* form feed */
1687         case 26:                /* MS-DOS C-z junk */
1688         case 31:                /* '^_' -- for info */
1689                 return 1;
1690         default:
1691                 return 0;
1692         }
1693 }
1694
1695 static int mask_has_at_most_one_bit_p(int mask)
1696 {
1697         /* Perhaps the only thing useful you learn from intensive Microsoft
1698            technical interviews */
1699         return (mask & (mask - 1)) == 0;
1700 }
1701
1702 static eol_type_t
1703 detect_eol_type(struct detection_state *st, const Extbyte * src,
1704                 Lstream_data_count n)
1705 {
1706         while (n--) {
1707                 const unsigned char c = *(const unsigned char*)src++;
1708                 if (c == '\n') {
1709                         if (st->eol.just_saw_cr)
1710                                 return EOL_CRLF;
1711                         else if (st->eol.seen_anything)
1712                                 return EOL_LF;
1713                 } else if (st->eol.just_saw_cr)
1714                         return EOL_CR;
1715                 else if (c == '\r')
1716                         st->eol.just_saw_cr = 1;
1717                 else
1718                         st->eol.just_saw_cr = 0;
1719                 st->eol.seen_anything = 1;
1720         }
1721
1722         return EOL_AUTODETECT;
1723 }
1724
1725 /* Attempt to determine the encoding and EOL type of the given text.
1726    Before calling this function for the first type, you must initialize
1727    st->eol_type as appropriate and initialize st->mask to ~0.
1728
1729    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1730    not yet known.
1731
1732    st->mask holds the determined coding category mask, or ~0 if only
1733    ASCII has been seen so far.
1734
1735    Returns:
1736
1737    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1738         is present in st->mask
1739    1 == definitive answers are here for both st->eol_type and st->mask
1740 */
1741
1742 static int
1743 detect_coding_type(struct detection_state *st, const Extbyte * src,
1744                    Lstream_data_count n, int just_do_eol)
1745 {
1746         if (st->eol_type == EOL_AUTODETECT)
1747                 st->eol_type = detect_eol_type(st, src, n);
1748
1749         if (just_do_eol)
1750                 return st->eol_type != EOL_AUTODETECT;
1751
1752         if (!st->seen_non_ascii) {
1753                 for (; n; n--, src++) {
1754                         const unsigned char c = *(const unsigned char *)src;
1755                         if ((c < 0x20 && !acceptable_control_char_p(c))
1756                             || c >= 0x80) {
1757                                 st->seen_non_ascii = 1;
1758 #ifdef MULE
1759                                 st->shift_jis.mask = ~0;
1760                                 st->big5.mask = ~0;
1761                                 st->ucs4.mask = ~0;
1762                                 st->utf8.mask = ~0;
1763                                 st->iso2022.mask = ~0;
1764 #endif
1765                                 break;
1766                         }
1767                 }
1768         }
1769
1770         if (!n) {
1771                 return 0;
1772         }
1773 #ifdef MULE
1774         if (!mask_has_at_most_one_bit_p(st->iso2022.mask))
1775                 st->iso2022.mask = detect_coding_iso2022(st, src, n);
1776         if (!mask_has_at_most_one_bit_p(st->shift_jis.mask))
1777                 st->shift_jis.mask = detect_coding_sjis(st, src, n);
1778         if (!mask_has_at_most_one_bit_p(st->big5.mask))
1779                 st->big5.mask = detect_coding_big5(st, src, n);
1780         if (!mask_has_at_most_one_bit_p(st->utf8.mask))
1781                 st->utf8.mask = detect_coding_utf8(st, src, n);
1782         if (!mask_has_at_most_one_bit_p(st->ucs4.mask))
1783                 st->ucs4.mask = detect_coding_ucs4(st, src, n);
1784
1785         st->mask = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1786                 | st->utf8.mask | st->ucs4.mask;
1787 #endif
1788         {
1789                 int retval = mask_has_at_most_one_bit_p(st->mask);
1790                 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1791                 return retval && st->eol_type != EOL_AUTODETECT;
1792         }
1793 }
1794
1795 static Lisp_Object coding_system_from_mask(int mask)
1796 {
1797         if (mask == ~0) {
1798                 /* If the file was entirely or basically ASCII, use the
1799                    default value of `buffer-file-coding-system'. */
1800                 Lisp_Object retval =
1801                     XBUFFER(Vbuffer_defaults)->buffer_file_coding_system;
1802                 if (!NILP(retval)) {
1803                         retval = Ffind_coding_system(retval);
1804                         if (NILP(retval)) {
1805                                 warn_when_safe
1806                                     (Qbad_variable, Qwarning,
1807                                      "Invalid `default-buffer-file-coding-system', set to nil");
1808                                 XBUFFER(Vbuffer_defaults)->
1809                                     buffer_file_coding_system = Qnil;
1810                         }
1811                 }
1812                 if (NILP(retval))
1813                         retval = Fget_coding_system(Qraw_text);
1814                 return retval;
1815         } else {
1816                 int i;
1817                 int cat = -1;
1818 #ifdef MULE
1819                 mask = postprocess_iso2022_mask(mask);
1820 #endif
1821                 /* Look through the coding categories by priority and find
1822                    the first one that is allowed. */
1823                 for (i = 0; i < CODING_CATEGORY_LAST; i++) {
1824                         cat = fcd->coding_category_by_priority[i];
1825                         if (cat<0)
1826                                 continue;
1827                         if ((mask & (1 << cat)) &&
1828                             !NILP(fcd->coding_category_system[cat]))
1829                                 break;
1830                 }
1831                 if (cat >= 0)
1832                         return fcd->coding_category_system[cat];
1833                 else
1834                         return Fget_coding_system(Qraw_text);
1835         }
1836 }
1837
1838 /* Given a seekable read stream and potential coding system and EOL type
1839    as specified, do any autodetection that is called for.  If the
1840    coding system and/or EOL type are not `autodetect', they will be left
1841    alone; but this function will never return an autodetect coding system
1842    or EOL type.
1843
1844    This function does not automatically fetch subsidiary coding systems;
1845    that should be unnecessary with the explicit eol-type argument. */
1846
1847 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1848 /* number of leading lines to check for a coding cookie */
1849 #define LINES_TO_CHECK 2
1850
1851
1852 void
1853 autodetect_real_coding_system(lstream_t stream, Lisp_Object * codesys_in_out,
1854                              eol_type_t * eol_type_in_out)
1855 {
1856         static const char mime_name_valid_chars[] = 
1857                 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1858                 "abcdefghijklmnopqrstuvwxyz"
1859                 "0123456789"
1860                 "!$%&*+-.^_{|}~";
1861
1862         struct detection_state decst;
1863
1864         Extbyte buf[4096];
1865         Lisp_Object coding_system = Qnil;
1866         Extbyte *p = NULL,
1867                 *scan_end = NULL;
1868         int lines_checked = 0;
1869         Lstream_data_count nread =
1870                 Lstream_read(stream, buf, sizeof(buf));
1871
1872         if (nread < 0)
1873                 /* Nothing more can be done here */
1874                 return;
1875
1876         xzero(decst);
1877         decst.eol_type = *eol_type_in_out;
1878         decst.mask = ~0;
1879
1880
1881         /* Look for initial "-*-"; mode line prefix */
1882         for (p = buf, scan_end = buf + nread - LENGTH("-*-coding:?-*-");
1883              p <= scan_end && lines_checked < LINES_TO_CHECK; p++) {
1884                 Extbyte *local_vars_beg = p + 3;
1885
1886                 if (*p == '\n' || *p == '\r') {
1887                         /* file must use standard EOLs or we miss 2d
1888                            line not to mention this is broken for
1889                            UTF-16 DOS files */
1890                         lines_checked++;
1891                         /* skip past multibyte (DOS) newline */
1892                         if (*p == '\r' && *(p + 1) == '\n')
1893                                 p++;
1894                         continue;
1895                 }
1896                 if (*p != '-' || *(p + 1) != '*' || *(p + 2) != '-') {
1897                         continue;
1898                 }
1899
1900                 /* Look for final "-*-"; mode line suffix */
1901                 for (p = local_vars_beg, scan_end = buf + nread - LENGTH("-*-");
1902                      p <= scan_end && lines_checked < LINES_TO_CHECK; p++) {
1903                         Extbyte *suffix = p;
1904                         if (*p == '\n' || *p == '\r') {
1905                                 /* file must use standard EOLs or we
1906                                    miss 2d line not to mention this is
1907                                    broken for UTF-16 DOS files */
1908                                 lines_checked++;
1909                                 /* skip past multibyte (DOS) newline */
1910                                 if (*p == '\r'
1911                                     && *(p + 1) == '\n')
1912                                         p++;
1913                                 continue;
1914                         }
1915                         if (*p != '-' || *(p + 1) != '*' || *(p + 2) != '-') {
1916                                 continue;
1917                         }
1918
1919                         /* Look for "coding:" */
1920                         for (p = local_vars_beg, scan_end = suffix - LENGTH("coding:?");
1921                              p <= scan_end; p++) {
1922                                 Extbyte save;
1923                                 int n;
1924
1925                                 if (memcmp("coding:", p, LENGTH("coding:")) != 0) {
1926                                         continue;
1927                                 }
1928                                 if (p != local_vars_beg && strchr(" \t;", *p) == NULL ) {
1929                                         continue;
1930                                 }
1931                                 p += LENGTH("coding:");
1932                                 while (*p == ' ' || *p == '\t') {
1933                                         p++;
1934                                 }
1935                                         
1936                                 /* Get coding system name */
1937                                 save = *suffix;
1938                                 *suffix = '\0';
1939                                 /* Characters valid in a MIME charset
1940                                    name (rfc 1521), and in a Lisp
1941                                    symbol name. */
1942                                 n = strspn((char *)p, mime_name_valid_chars);
1943                                 *suffix = save;
1944                                 if (n > 0) {
1945                                         save = p[n];
1946                                         p[n] = '\0';
1947                                         coding_system = Ffind_coding_system(
1948                                                 intern((char *)p));
1949                                         p[n] = save;
1950                                 }
1951                                 break;
1952                         }
1953                         break;
1954                 }
1955                 break;
1956         }
1957         if (NILP(coding_system)) {
1958                 do {
1959                         if (detect_coding_type(&decst, buf, nread,
1960                                                XCODING_SYSTEM_TYPE(*codesys_in_out)
1961                                                != CODESYS_AUTODETECT))
1962                                 break;
1963                         nread = Lstream_read(stream, buf, sizeof(buf));
1964                         if (nread == 0)
1965                                 break;
1966                 }
1967                 while (1);
1968         } else if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT
1969                    && XCODING_SYSTEM_EOL_TYPE(coding_system) == EOL_AUTODETECT) {
1970                 do {
1971                         if (detect_coding_type(&decst, buf, nread, 1))
1972                                 break;
1973                         nread = Lstream_read(stream, buf, sizeof(buf));
1974                         if (!nread)
1975                                 break;
1976                 }
1977                 while (1);
1978         }
1979         *eol_type_in_out = decst.eol_type;
1980         if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT) {
1981                 if (NILP(coding_system))
1982                         *codesys_in_out =
1983                                 coding_system_from_mask(decst.mask);
1984                 else
1985                         *codesys_in_out = coding_system;
1986         }
1987 }
1988
1989 void
1990 determine_real_coding_system(lstream_t stream, Lisp_Object * codesys_in_out,
1991                              eol_type_t * eol_type_in_out)
1992 {
1993
1994         if (*eol_type_in_out == EOL_AUTODETECT)
1995                 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE(*codesys_in_out);
1996
1997
1998         /* If autodetection is called for, do it now. */
1999         if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT
2000             || *eol_type_in_out == EOL_AUTODETECT) {
2001                 autodetect_real_coding_system(stream, codesys_in_out,
2002                                               eol_type_in_out);
2003         }
2004
2005         /* If we absolutely can't determine the EOL type, just assume LF. */
2006         if (*eol_type_in_out == EOL_AUTODETECT)
2007                 *eol_type_in_out = EOL_LF;
2008
2009         Lstream_rewind(stream);
2010 }
2011
2012 DEFUN("detect-coding-region", Fdetect_coding_region, 2, 3, 0,   /*
2013 Detect coding system of the text in the region between START and END.
2014 Return a list of possible coding systems ordered by priority.
2015 If only ASCII characters are found, return 'undecided or one of
2016 its subsidiary coding systems according to a detected end-of-line
2017 type.  Optional arg BUFFER defaults to the current buffer.
2018 */
2019       (start, end, buffer))
2020 {
2021         Lisp_Object val = Qnil;
2022         struct buffer *buf = decode_buffer(buffer, 0);
2023         Bufpos b, e;
2024         Lisp_Object instream, lb_instream;
2025         lstream_t istr, lb_istr;
2026         struct detection_state decst;
2027         struct gcpro gcpro1, gcpro2;
2028
2029         get_buffer_range_char(buf, start, end, &b, &e, 0);
2030         lb_instream = make_lisp_buffer_input_stream(buf, b, e, 0);
2031         lb_istr = XLSTREAM(lb_instream);
2032         instream =
2033             make_encoding_input_stream(lb_istr, Fget_coding_system(Qbinary));
2034         istr = XLSTREAM(instream);
2035         GCPRO2(instream, lb_instream);
2036         xzero(decst);
2037         decst.eol_type = EOL_AUTODETECT;
2038         decst.mask = ~0;
2039         while (1) {
2040                 Extbyte random_buffer[4096];
2041                 Lstream_data_count nread =
2042                     Lstream_read(istr, random_buffer, sizeof(random_buffer));
2043
2044                 if (!nread)
2045                         break;
2046                 if (detect_coding_type(&decst, random_buffer, nread, 0))
2047                         break;
2048         }
2049
2050         if (decst.mask == ~0)
2051                 val = subsidiary_coding_system(Fget_coding_system(Qundecided),
2052                                                decst.eol_type);
2053         else {
2054                 int i;
2055
2056                 val = Qnil;
2057 #ifdef MULE
2058                 decst.mask = postprocess_iso2022_mask(decst.mask);
2059 #endif
2060                 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--) {
2061                         int sys = fcd->coding_category_by_priority[i];
2062                         if (decst.mask & (1 << sys)) {
2063                                 Lisp_Object codesys =
2064                                     fcd->coding_category_system[sys];
2065                                 if (!NILP(codesys))
2066                                         codesys =
2067                                             subsidiary_coding_system(codesys,
2068                                                                      decst.
2069                                                                      eol_type);
2070                                 val = Fcons(codesys, val);
2071                         }
2072                 }
2073         }
2074         Lstream_close(istr);
2075         UNGCPRO;
2076         Lstream_delete(istr);
2077         Lstream_delete(lb_istr);
2078         return val;
2079 }
2080 \f
2081 /************************************************************************/
2082 /*           Converting to internal Mule format ("decoding")            */
2083 /************************************************************************/
2084
2085 /* A decoding stream is a stream used for decoding text (i.e.
2086    converting from some external format to internal format).
2087    The decoding-stream object keeps track of the actual coding
2088    stream, the stream that is at the other end, and data that
2089    needs to be persistent across the lifetime of the stream. */
2090
2091 /* Handle the EOL stuff related to just-read-in character C.
2092    EOL_TYPE is the EOL type of the coding stream.
2093    FLAGS is the current value of FLAGS in the coding stream, and may
2094    be modified by this macro.  (The macro only looks at the
2095    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
2096    bytes are to be written.  You need to also define a local goto
2097    label "label_continue_loop" that is at the end of the main
2098    character-reading loop.
2099
2100    If C is a CR character, then this macro handles it entirely and
2101    jumps to label_continue_loop.  Otherwise, this macro does not add
2102    anything to DST, and continues normally.  You should continue
2103    processing C normally after this macro. */
2104
2105 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
2106 do {                                                            \
2107   if (c == '\r')                                                \
2108     {                                                           \
2109       if (eol_type == EOL_CR)                                   \
2110         Dynarr_add (dst, '\n');                                 \
2111       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2112         Dynarr_add (dst, c);                                    \
2113       else                                                      \
2114         flags |= CODING_STATE_CR;                               \
2115       goto label_continue_loop;                                 \
2116     }                                                           \
2117   else if (flags & CODING_STATE_CR)                             \
2118     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
2119       if (c != '\n')                                            \
2120         Dynarr_add (dst, '\r');                                 \
2121       flags &= ~CODING_STATE_CR;                                \
2122     }                                                           \
2123 } while (0)
2124
2125 /* C should be a binary character in the range 0 - 255; convert
2126    to internal format and add to Dynarr DST. */
2127
2128 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
2129 do {                                            \
2130   if (BYTE_ASCII_P (c))                         \
2131     Dynarr_add (dst, c);                        \
2132   else if (BYTE_C1_P (c))                       \
2133     {                                           \
2134       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2135       Dynarr_add (dst, c + 0x20);               \
2136     }                                           \
2137   else                                          \
2138     {                                           \
2139       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2140       Dynarr_add (dst, c);                      \
2141     }                                           \
2142 } while (0)
2143
2144 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
2145 do {                                    \
2146   if (ch)                               \
2147     {                                   \
2148       DECODE_ADD_BINARY_CHAR (ch, dst); \
2149       ch = 0;                           \
2150     }                                   \
2151 } while (0)
2152
2153 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2154 do {                                    \
2155   if (flags & CODING_STATE_END)         \
2156     {                                   \
2157       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
2158       if (flags & CODING_STATE_CR)      \
2159         Dynarr_add (dst, '\r');         \
2160     }                                   \
2161 } while (0)
2162
2163 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2164
2165 typedef struct decoding_stream_s *decoding_stream_t;
2166 struct decoding_stream_s {
2167         /* Coding system that governs the conversion. */
2168         Lisp_Coding_System *codesys;
2169
2170         /* Stream that we read the encoded data from or
2171            write the decoded data to. */
2172         lstream_t other_end;
2173
2174         /* If we are reading, then we can return only a fixed amount of
2175            data, so if the conversion resulted in too much data, we store it
2176            here for retrieval the next time around. */
2177         unsigned_char_dynarr *runoff;
2178
2179         /* FLAGS holds flags indicating the current state of the decoding.
2180            Some of these flags are dependent on the coding system. */
2181         unsigned int flags;
2182
2183         /* CH holds a partially built-up character.  Since we only deal
2184            with one- and two-byte characters at the moment, we only use
2185            this to store the first byte of a two-byte character. */
2186         unsigned int ch;
2187
2188         /* EOL_TYPE specifies the type of end-of-line conversion that
2189            currently applies.  We need to keep this separate from the
2190            EOL type stored in CODESYS because the latter might indicate
2191            automatic EOL-type detection while the former will always
2192            indicate a particular EOL type. */
2193         eol_type_t eol_type;
2194 #ifdef MULE
2195         /* Additional ISO2022 information.  We define the structure above
2196            because it's also needed by the detection routines. */
2197         struct iso2022_decoder iso2022;
2198
2199         /* Additional information (the state of the running CCL program)
2200            used by the CCL decoder. */
2201         struct ccl_program ccl;
2202
2203         /* counter for UTF-8 or UCS-4 */
2204         unsigned char counter;
2205 #endif
2206         struct detection_state decst;
2207 };
2208
2209 static Lstream_data_count
2210 decoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size);
2211 static Lstream_data_count
2212 decoding_writer(lstream_t stream,
2213                 const unsigned char *data, Lstream_data_count size);
2214 static int decoding_rewinder(lstream_t stream);
2215 static int decoding_seekable_p(lstream_t stream);
2216 static int decoding_flusher(lstream_t stream);
2217 static int decoding_closer(lstream_t stream);
2218
2219 static Lisp_Object decoding_marker(Lisp_Object stream);
2220
2221 DEFINE_LSTREAM_IMPLEMENTATION("decoding", lstream_decoding,
2222                               sizeof(struct decoding_stream_s));
2223
2224 static Lisp_Object
2225 decoding_marker(Lisp_Object stream)
2226 {
2227         lstream_t str = DECODING_STREAM_DATA(XLSTREAM(stream))->other_end;
2228         Lisp_Object str_obj;
2229
2230         /* We do not need to mark the coding systems or charsets stored
2231            within the stream because they are stored in a global list
2232            and automatically marked. */
2233
2234         XSETLSTREAM(str_obj, str);
2235         mark_object(str_obj);
2236         if (str->imp->marker) {
2237                 return str->imp->marker(str_obj);
2238         } else {
2239                 return Qnil;
2240         }
2241 }
2242
2243 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
2244    so we read data from the other end, decode it, and store it into DATA. */
2245
2246 static Lstream_data_count
2247 decoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size)
2248 {
2249         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2250         unsigned char *orig_data = data;
2251         Lstream_data_count read_size;
2252         int error_occurred = 0;
2253
2254         /* We need to interface to mule_decode(), which expects to take some
2255            amount of data and store the result into a Dynarr.  We have
2256            mule_decode() store into str->runoff, and take data from there
2257            as necessary. */
2258
2259         /* We loop until we have enough data, reading chunks from the other
2260            end and decoding it. */
2261         while (1) {
2262                 /* Take data from the runoff if we can.  Make sure to take at
2263                    most SIZE bytes, and delete the data from the runoff. */
2264                 if (Dynarr_length(str->runoff) > 0) {
2265                         Lstream_data_count chunk =
2266                                 min(size,
2267                                     (Lstream_data_count)
2268                                     Dynarr_length(str->runoff));
2269                         memcpy(data, Dynarr_atp(str->runoff, 0), chunk);
2270                         Dynarr_delete_many(str->runoff, 0, chunk);
2271                         data += chunk;
2272                         size -= chunk;
2273                 }
2274
2275                 if (size == 0) {
2276                         /* No more room for data */
2277                         break;
2278                 }
2279
2280                 if (str->flags & CODING_STATE_END) {
2281                         /* This means that on the previous iteration, we hit the
2282                            EOF on the other end.  We loop once more so that
2283                            mule_decode() can output any final stuff it may be
2284                            holding, or any "go back to a sane state" escape
2285                            sequences. (This latter makes sense during
2286                            encoding.) */
2287                         break;
2288                 }
2289
2290                 /* Exhausted the runoff, so get some more.  DATA has at least
2291                    SIZE bytes left of storage in it, so it's OK to read directly
2292                    into it.  (We'll be overwriting above, after we've decoded it
2293                    into the runoff.) */
2294                 read_size = Lstream_read(str->other_end, data, size);
2295                 if (read_size < 0) {
2296                         error_occurred = 1;
2297                         break;
2298                 }
2299                 if (read_size == 0) {
2300                         /* There might be some more end data produced in the
2301                            translation.  See the comment above. */
2302                         str->flags |= CODING_STATE_END;
2303                 }
2304                 mule_decode(stream, (Extbyte *) data, str->runoff, read_size);
2305         }
2306
2307         if (data - orig_data == 0) {
2308                 return error_occurred ? -1 : 0;
2309         } else {
2310                 return data - orig_data;
2311         }
2312 }
2313
2314 static Lstream_data_count
2315 decoding_writer(lstream_t stream, const unsigned char *data,
2316                 Lstream_data_count size)
2317 {
2318         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2319         Lstream_data_count retval;
2320
2321         /* Decode all our data into the runoff, and then attempt to write
2322            it all out to the other end.  Remove whatever chunk we succeeded
2323            in writing. */
2324         mule_decode(stream, (const Extbyte *)data, str->runoff, size);
2325         retval = Lstream_write(str->other_end, Dynarr_atp(str->runoff, 0),
2326                                Dynarr_length(str->runoff));
2327         if (retval > 0) {
2328                 Dynarr_delete_many(str->runoff, 0, retval);
2329         }
2330         /* Do NOT return retval.  The return value indicates how much
2331            of the incoming data was written, not how many bytes were
2332            written. */
2333         return size;
2334 }
2335
2336 static void
2337 reset_decoding_stream(decoding_stream_t str)
2338 {
2339 #ifdef MULE
2340         if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_ISO2022) {
2341                 Lisp_Object coding_system;
2342                 XSETCODING_SYSTEM(coding_system, str->codesys);
2343                 reset_iso2022(coding_system, &str->iso2022);
2344         } else if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_CCL) {
2345                 setup_ccl_program(&str->ccl,
2346                                   CODING_SYSTEM_CCL_DECODE(str->codesys));
2347         }
2348         str->counter = 0;
2349 #endif                          /* MULE */
2350         if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT
2351             || CODING_SYSTEM_EOL_TYPE(str->codesys) == EOL_AUTODETECT) {
2352                 xzero(str->decst);
2353                 str->decst.eol_type = EOL_AUTODETECT;
2354                 str->decst.mask = ~0;
2355         }
2356         str->flags = str->ch = 0;
2357 }
2358
2359 static int
2360 decoding_rewinder(lstream_t stream)
2361 {
2362         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2363         reset_decoding_stream(str);
2364         Dynarr_reset(str->runoff);
2365         return Lstream_rewind(str->other_end);
2366 }
2367
2368 static int
2369 decoding_seekable_p(lstream_t stream)
2370 {
2371         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2372         return Lstream_seekable_p(str->other_end);
2373 }
2374
2375 static int
2376 decoding_flusher(lstream_t stream)
2377 {
2378         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2379         return Lstream_flush(str->other_end);
2380 }
2381
2382 static int
2383 decoding_closer(lstream_t stream)
2384 {
2385         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2386         if (stream->flags & LSTREAM_FL_WRITE) {
2387                 str->flags |= CODING_STATE_END;
2388                 decoding_writer(stream, 0, 0);
2389         }
2390         Dynarr_free(str->runoff);
2391 #ifdef MULE
2392 #ifdef ENABLE_COMPOSITE_CHARS
2393         if (str->iso2022.composite_chars) {
2394                 Dynarr_free(str->iso2022.composite_chars);
2395         }
2396 #endif
2397 #endif
2398         return Lstream_close(str->other_end);
2399 }
2400
2401 Lisp_Object
2402 decoding_stream_coding_system(lstream_t stream)
2403 {
2404         Lisp_Object coding_system;
2405         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2406
2407         XSETCODING_SYSTEM(coding_system, str->codesys);
2408         return subsidiary_coding_system(coding_system, str->eol_type);
2409 }
2410
2411 void
2412 set_decoding_stream_coding_system(lstream_t lstr, Lisp_Object codesys)
2413 {
2414         Lisp_Coding_System *cs = XCODING_SYSTEM(codesys);
2415         decoding_stream_t str = DECODING_STREAM_DATA(lstr);
2416         str->codesys = cs;
2417         if (CODING_SYSTEM_EOL_TYPE(cs) != EOL_AUTODETECT) {
2418                 str->eol_type = CODING_SYSTEM_EOL_TYPE(cs);
2419         }
2420         reset_decoding_stream(str);
2421         return;
2422 }
2423
2424 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2425    stream for writing, no automatic code detection will be performed.
2426    The reason for this is that automatic code detection requires a
2427    seekable input.  Things will also fail if you open a decoding
2428    stream for reading using a non-fully-specified coding system and
2429    a non-seekable input stream. */
2430
2431 static Lisp_Object
2432 make_decoding_stream_1(lstream_t stream, Lisp_Object codesys, const char *mode)
2433 {
2434         lstream_t lstr = Lstream_new(lstream_decoding, mode);
2435         decoding_stream_t str = DECODING_STREAM_DATA(lstr);
2436         Lisp_Object obj;
2437
2438         xzero(*str);
2439         str->other_end = stream;
2440         str->runoff = (unsigned_char_dynarr *) Dynarr_new(unsigned_char);
2441         str->eol_type = EOL_AUTODETECT;
2442         if (!strcmp(mode, "r") && Lstream_seekable_p(stream)) {
2443                 /* We can determine the coding system now. */
2444                 determine_real_coding_system(stream, &codesys, &str->eol_type);
2445         }
2446         set_decoding_stream_coding_system(lstr, codesys);
2447         str->decst.eol_type = str->eol_type;
2448         str->decst.mask = ~0;
2449         XSETLSTREAM(obj, lstr);
2450         return obj;
2451 }
2452
2453 Lisp_Object
2454 make_decoding_input_stream(lstream_t stream, Lisp_Object codesys)
2455 {
2456         return make_decoding_stream_1(stream, codesys, "r");
2457 }
2458
2459 Lisp_Object
2460 make_decoding_output_stream(lstream_t stream, Lisp_Object codesys)
2461 {
2462         return make_decoding_stream_1(stream, codesys, "w");
2463 }
2464
2465 /* Note: the decode_coding_* functions all take the same
2466    arguments as mule_decode(), which is to say some SRC data of
2467    size N, which is to be stored into dynamic array DST.
2468    DECODING is the stream within which the decoding is
2469    taking place, but no data is actually read from or
2470    written to that stream; that is handled in decoding_reader()
2471    or decoding_writer().  This allows the same functions to
2472    be used for both reading and writing. */
2473
2474 static void
2475 mule_decode(lstream_t decoding, const Extbyte * src,
2476             unsigned_char_dynarr * dst, Lstream_data_count n)
2477 {
2478         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
2479
2480         /* If necessary, do encoding-detection now.  We do this when
2481            we're a writing stream or a non-seekable reading stream,
2482            meaning that we can't just process the whole input,
2483            rewind, and start over. */
2484
2485         if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT ||
2486             str->eol_type == EOL_AUTODETECT) {
2487                 Lisp_Object codesys;
2488
2489                 XSETCODING_SYSTEM(codesys, str->codesys);
2490                 detect_coding_type(&str->decst, src, n,
2491                                    CODING_SYSTEM_TYPE(str->codesys) !=
2492                                    CODESYS_AUTODETECT);
2493                 if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT &&
2494                     str->decst.mask != ~0)
2495                         /* #### This is cheesy.  What we really ought to do is
2496                            buffer up a certain amount of data so as to get a
2497                            less random result. */
2498                         codesys = coding_system_from_mask(str->decst.mask);
2499                 str->eol_type = str->decst.eol_type;
2500                 if (XCODING_SYSTEM(codesys) != str->codesys) {
2501                         /* Preserve the CODING_STATE_END flag in case it was set.
2502                            If we erase it, bad things might happen. */
2503                         int was_end = str->flags & CODING_STATE_END;
2504                         set_decoding_stream_coding_system(decoding, codesys);
2505                         if (was_end)
2506                                 str->flags |= CODING_STATE_END;
2507                 }
2508         }
2509
2510         switch (CODING_SYSTEM_TYPE(str->codesys)) {
2511 #ifdef DEBUG_SXEMACS
2512         case CODESYS_INTERNAL:
2513                 Dynarr_add_many(dst, src, n);
2514                 break;
2515 #endif
2516         case CODESYS_AUTODETECT:
2517                 /* If we got this far and still haven't decided on the coding
2518                    system, then do no conversion. */
2519         case CODESYS_NO_CONVERSION:
2520                 decode_coding_no_conversion(decoding, src, dst, n);
2521                 break;
2522 #ifdef MULE
2523         case CODESYS_SHIFT_JIS:
2524                 decode_coding_sjis(decoding, src, dst, n);
2525                 break;
2526         case CODESYS_BIG5:
2527                 decode_coding_big5(decoding, src, dst, n);
2528                 break;
2529         case CODESYS_UCS4:
2530                 decode_coding_ucs4(decoding, src, dst, n);
2531                 break;
2532         case CODESYS_UTF8:
2533                 decode_coding_utf8(decoding, src, dst, n);
2534                 break;
2535         case CODESYS_CCL:
2536                 str->ccl.last_block = str->flags & CODING_STATE_END;
2537                 /* When applying ccl program to stream, MUST NOT set NULL
2538                    pointer to src.  */
2539                 ccl_driver(&str->ccl,
2540                            (src
2541                             ? (const unsigned char *)src
2542                             : (const unsigned char *)""),
2543                            dst, n, 0, CCL_MODE_DECODING);
2544                 break;
2545         case CODESYS_ISO2022:
2546                 decode_coding_iso2022(decoding, src, dst, n);
2547                 break;
2548 #endif                          /* MULE */
2549         default:
2550                 abort();
2551         }
2552 }
2553
2554 DEFUN("decode-coding-region", Fdecode_coding_region, 3, 4, 0,   /*
2555 Decode the text between START and END which is encoded in CODING-SYSTEM.
2556 This is useful if you've read in encoded text from a file without decoding
2557 it (e.g. you read in a JIS-formatted file but used the `binary' or
2558 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2559 Return length of decoded text.
2560 BUFFER defaults to the current buffer if unspecified.
2561 */
2562       (start, end, coding_system, buffer))
2563 {
2564         Bufpos b, e;
2565         struct buffer *buf = decode_buffer(buffer, 0);
2566         Lisp_Object instream, lb_outstream, de_outstream, outstream;
2567         lstream_t istr, ostr;
2568         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2569
2570         get_buffer_range_char(buf, start, end, &b, &e, 0);
2571
2572         barf_if_buffer_read_only(buf, b, e);
2573
2574         coding_system = Fget_coding_system(coding_system);
2575         instream = make_lisp_buffer_input_stream(buf, b, e, 0);
2576         lb_outstream = make_lisp_buffer_output_stream(buf, b, 0);
2577         de_outstream = make_decoding_output_stream(XLSTREAM(lb_outstream),
2578                                                    coding_system);
2579         outstream = make_encoding_output_stream(XLSTREAM(de_outstream),
2580                                                 Fget_coding_system(Qbinary));
2581         istr = XLSTREAM(instream);
2582         ostr = XLSTREAM(outstream);
2583         GCPRO4(instream, lb_outstream, de_outstream, outstream);
2584
2585         /* The chain of streams looks like this:
2586
2587            [BUFFER] <----- send through
2588            ------> [ENCODE AS BINARY]
2589            ------> [DECODE AS SPECIFIED]
2590            ------> [BUFFER]
2591          */
2592
2593         while (1) {
2594                 char tempbuf[1024];     /* some random amount */
2595                 Bufpos newpos, even_newer_pos;
2596                 Bufpos oldpos = lisp_buffer_stream_startpos(istr);
2597                 Lstream_data_count size_in_bytes =
2598                     Lstream_read(istr, tempbuf, sizeof(tempbuf));
2599
2600                 if (!size_in_bytes)
2601                         break;
2602                 newpos = lisp_buffer_stream_startpos(istr);
2603                 Lstream_write(ostr, tempbuf, size_in_bytes);
2604                 even_newer_pos = lisp_buffer_stream_startpos(istr);
2605                 buffer_delete_range(buf, even_newer_pos - (newpos - oldpos),
2606                                     even_newer_pos, 0);
2607         }
2608         Lstream_close(istr);
2609         Lstream_close(ostr);
2610         UNGCPRO;
2611         Lstream_delete(istr);
2612         Lstream_delete(ostr);
2613         Lstream_delete(XLSTREAM(de_outstream));
2614         Lstream_delete(XLSTREAM(lb_outstream));
2615         return Qnil;
2616 }
2617 \f
2618 /************************************************************************/
2619 /*           Converting to an external encoding ("encoding")            */
2620 /************************************************************************/
2621
2622 /* An encoding stream is an output stream.  When you create the
2623    stream, you specify the coding system that governs the encoding
2624    and another stream that the resulting encoded data is to be
2625    sent to, and then start sending data to it. */
2626
2627 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2628
2629 typedef struct encoding_stream_s *encoding_stream_t;
2630 struct encoding_stream_s {
2631         /* Coding system that governs the conversion. */
2632         Lisp_Coding_System *codesys;
2633
2634         /* Stream that we read the encoded data from or
2635            write the decoded data to. */
2636         lstream_t other_end;
2637
2638         /* If we are reading, then we can return only a fixed amount of
2639            data, so if the conversion resulted in too much data, we store it
2640            here for retrieval the next time around. */
2641         unsigned_char_dynarr *runoff;
2642
2643         /* FLAGS holds flags indicating the current state of the encoding.
2644            Some of these flags are dependent on the coding system. */
2645         unsigned int flags;
2646
2647         /* CH holds a partially built-up character.  Since we only deal
2648            with one- and two-byte characters at the moment, we only use
2649            this to store the first byte of a two-byte character. */
2650         unsigned int ch;
2651 #ifdef MULE
2652         /* Additional information used by the ISO2022 encoder. */
2653         struct {
2654                 /* CHARSET holds the character sets currently assigned to the G0
2655                    through G3 registers.  It is initialized from the array
2656                    INITIAL_CHARSET in CODESYS. */
2657                 Lisp_Object charset[4];
2658
2659                 /* Which registers are currently invoked into the left (GL) and
2660                    right (GR) halves of the 8-bit encoding space? */
2661                 int register_left, register_right;
2662
2663                 /* Whether we need to explicitly designate the charset in the
2664                    G? register before using it.  It is initialized from the
2665                    array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2666                 unsigned char force_charset_on_output[4];
2667
2668                 /* Other state variables that need to be preserved across
2669                    invocations. */
2670                 Lisp_Object current_charset;
2671                 int current_half;
2672                 int current_char_boundary;
2673         } iso2022;
2674
2675         /* Additional information (the state of the running CCL program)
2676            used by the CCL encoder. */
2677         struct ccl_program ccl;
2678 #endif                          /* MULE */
2679 };
2680
2681 static Lstream_data_count
2682 encoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size);
2683 static Lstream_data_count
2684 encoding_writer(lstream_t stream,
2685                 const unsigned char *data, Lstream_data_count size);
2686 static int encoding_rewinder(lstream_t stream);
2687 static int encoding_seekable_p(lstream_t stream);
2688 static int encoding_flusher(lstream_t stream);
2689 static int encoding_closer(lstream_t stream);
2690
2691 static Lisp_Object encoding_marker(Lisp_Object stream);
2692
2693 DEFINE_LSTREAM_IMPLEMENTATION("encoding", lstream_encoding,
2694                               sizeof(struct encoding_stream_s));
2695
2696 static Lisp_Object
2697 encoding_marker(Lisp_Object stream)
2698 {
2699         lstream_t str = ENCODING_STREAM_DATA(XLSTREAM(stream))->other_end;
2700         Lisp_Object str_obj;
2701
2702         /* We do not need to mark the coding systems or charsets stored
2703            within the stream because they are stored in a global list
2704            and automatically marked. */
2705
2706         XSETLSTREAM(str_obj, str);
2707         mark_object(str_obj);
2708         if (str->imp->marker) {
2709                 return str->imp->marker(str_obj);
2710         } else {
2711                 return Qnil;
2712         }
2713 }
2714
2715 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2716    so we read data from the other end, encode it, and store it into DATA. */
2717
2718 static Lstream_data_count
2719 encoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size)
2720 {
2721         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2722         unsigned char *orig_data = data;
2723         Lstream_data_count read_size;
2724         int error_occurred = 0;
2725
2726         /* We need to interface to mule_encode(), which expects to take some
2727            amount of data and store the result into a Dynarr.  We have
2728            mule_encode() store into str->runoff, and take data from there
2729            as necessary. */
2730
2731         /* We loop until we have enough data, reading chunks from the other
2732            end and encoding it. */
2733         while (1) {
2734                 /* Take data from the runoff if we can.  Make sure to take at
2735                    most SIZE bytes, and delete the data from the runoff. */
2736                 if (Dynarr_length(str->runoff) > 0) {
2737                         int chunk = min((int)size, Dynarr_length(str->runoff));
2738                         memcpy(data, Dynarr_atp(str->runoff, 0), chunk);
2739                         Dynarr_delete_many(str->runoff, 0, chunk);
2740                         data += chunk;
2741                         size -= chunk;
2742                 }
2743
2744                 if (size == 0) {
2745                         /* No more room for data */
2746                         break;
2747                 }
2748
2749                 if (str->flags & CODING_STATE_END) {
2750                         /* This means that on the previous iteration, we hit the
2751                            EOF on the other end.  We loop once more so that
2752                            mule_encode() can output any final stuff it may be
2753                            holding, or any "go back to a sane state" escape
2754                            sequences. (This latter makes sense during
2755                            encoding.) */
2756                         break;
2757                 }
2758
2759                 /* Exhausted the runoff, so get some more.  DATA at least SIZE
2760                    bytes left of storage in it, so it's OK to read directly into
2761                    it.  (We'll be overwriting above, after we've encoded it into
2762                    the runoff.) */
2763                 read_size = Lstream_read(str->other_end, data, size);
2764                 if (read_size < 0) {
2765                         error_occurred = 1;
2766                         break;
2767                 }
2768                 if (read_size == 0) {
2769                         /* There might be some more end data produced in the
2770                            translation.  See the comment above. */
2771                         str->flags |= CODING_STATE_END;
2772                 }
2773                 mule_encode(stream, data, str->runoff, read_size);
2774         }
2775
2776         if (data == orig_data) {
2777                 return error_occurred ? -1 : 0;
2778         } else {
2779                 return data - orig_data;
2780         }
2781 }
2782
2783 static Lstream_data_count
2784 encoding_writer(lstream_t stream, const unsigned char *data,
2785                 Lstream_data_count size)
2786 {
2787         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2788         Lstream_data_count retval;
2789
2790         /* Encode all our data into the runoff, and then attempt to write
2791            it all out to the other end.  Remove whatever chunk we succeeded
2792            in writing. */
2793         mule_encode(stream, data, str->runoff, size);
2794         retval = Lstream_write(str->other_end, Dynarr_atp(str->runoff, 0),
2795                                Dynarr_length(str->runoff));
2796         if (retval > 0) {
2797                 Dynarr_delete_many(str->runoff, 0, retval);
2798         }
2799         /* Do NOT return retval.  The return value indicates how much
2800            of the incoming data was written, not how many bytes were
2801            written. */
2802         return size;
2803 }
2804
2805 static void
2806 reset_encoding_stream(encoding_stream_t str)
2807 {
2808 #ifdef MULE
2809         switch (CODING_SYSTEM_TYPE(str->codesys)) {
2810         case CODESYS_ISO2022: {
2811                 int i;
2812
2813                 for (i = 0; i < 4; i++) {
2814                         str->iso2022.charset[i] =
2815                                 CODING_SYSTEM_ISO2022_INITIAL_CHARSET(
2816                                         str->codesys, i);
2817                         str->iso2022.force_charset_on_output[i] =
2818                                 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT(
2819                                         str->codesys, i);
2820                 }
2821                 str->iso2022.register_left = 0;
2822                 str->iso2022.register_right = 1;
2823                 str->iso2022.current_charset = Qnil;
2824                 str->iso2022.current_half = 0;
2825                 str->iso2022.current_char_boundary = 1;
2826                 break;
2827         }
2828         case CODESYS_CCL:
2829                 setup_ccl_program(&str->ccl,
2830                                   CODING_SYSTEM_CCL_ENCODE(str->codesys));
2831                 break;
2832
2833                 /* list the rest of them lot explicitly */
2834         case CODESYS_AUTODETECT:
2835         case CODESYS_SHIFT_JIS:
2836         case CODESYS_BIG5:
2837         case CODESYS_UCS4:
2838         case CODESYS_UTF8:
2839         case CODESYS_NO_CONVERSION:
2840 #ifdef DEBUG_SXEMACS
2841         case CODESYS_INTERNAL:
2842 #endif
2843         default:
2844                 break;
2845         }
2846 #endif                          /* MULE */
2847
2848         str->flags = str->ch = 0;
2849 }
2850
2851 static int
2852 encoding_rewinder(lstream_t stream)
2853 {
2854         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2855         reset_encoding_stream(str);
2856         Dynarr_reset(str->runoff);
2857         return Lstream_rewind(str->other_end);
2858 }
2859
2860 static int
2861 encoding_seekable_p(lstream_t stream)
2862 {
2863         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2864         return Lstream_seekable_p(str->other_end);
2865 }
2866
2867 static int
2868 encoding_flusher(lstream_t stream)
2869 {
2870         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2871         return Lstream_flush(str->other_end);
2872 }
2873
2874 static int
2875 encoding_closer(lstream_t stream)
2876 {
2877         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2878         if (stream->flags & LSTREAM_FL_WRITE) {
2879                 str->flags |= CODING_STATE_END;
2880                 encoding_writer(stream, 0, 0);
2881         }
2882         Dynarr_free(str->runoff);
2883         return Lstream_close(str->other_end);
2884 }
2885
2886 Lisp_Object
2887 encoding_stream_coding_system(lstream_t stream)
2888 {
2889         Lisp_Object coding_system;
2890         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2891
2892         XSETCODING_SYSTEM(coding_system, str->codesys);
2893         return coding_system;
2894 }
2895
2896 void
2897 set_encoding_stream_coding_system(lstream_t lstr, Lisp_Object codesys)
2898 {
2899         Lisp_Coding_System *cs = XCODING_SYSTEM(codesys);
2900         encoding_stream_t str = ENCODING_STREAM_DATA(lstr);
2901         str->codesys = cs;
2902         reset_encoding_stream(str);
2903 }
2904
2905 static Lisp_Object
2906 make_encoding_stream_1(lstream_t stream, Lisp_Object codesys, const char *mode)
2907 {
2908         lstream_t lstr = Lstream_new(lstream_encoding, mode);
2909         encoding_stream_t str = ENCODING_STREAM_DATA(lstr);
2910         Lisp_Object obj;
2911
2912         xzero(*str);
2913         str->runoff = Dynarr_new(unsigned_char);
2914         str->other_end = stream;
2915         set_encoding_stream_coding_system(lstr, codesys);
2916         XSETLSTREAM(obj, lstr);
2917         return obj;
2918 }
2919
2920 Lisp_Object
2921 make_encoding_input_stream(lstream_t stream, Lisp_Object codesys)
2922 {
2923         return make_encoding_stream_1(stream, codesys, "r");
2924 }
2925
2926 Lisp_Object
2927 make_encoding_output_stream(lstream_t stream, Lisp_Object codesys)
2928 {
2929         return make_encoding_stream_1(stream, codesys, "w");
2930 }
2931
2932 /* Convert N bytes of internally-formatted data stored in SRC to an
2933    external format, according to the encoding stream ENCODING.
2934    Store the encoded data into DST. */
2935
2936 static void
2937 mule_encode(lstream_t encoding, const Bufbyte * src,
2938             unsigned_char_dynarr * dst, Lstream_data_count n)
2939 {
2940         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
2941
2942         switch (CODING_SYSTEM_TYPE(str->codesys)) {
2943 #ifdef DEBUG_SXEMACS
2944         case CODESYS_INTERNAL:
2945                 Dynarr_add_many(dst, src, n);
2946                 break;
2947 #endif
2948         case CODESYS_AUTODETECT:
2949                 /* If we got this far and still haven't decided on the coding
2950                    system, then do no conversion. */
2951         case CODESYS_NO_CONVERSION:
2952                 encode_coding_no_conversion(encoding, src, dst, n);
2953                 break;
2954 #ifdef MULE
2955         case CODESYS_SHIFT_JIS:
2956                 encode_coding_sjis(encoding, src, dst, n);
2957                 break;
2958         case CODESYS_BIG5:
2959                 encode_coding_big5(encoding, src, dst, n);
2960                 break;
2961         case CODESYS_UCS4:
2962                 encode_coding_ucs4(encoding, src, dst, n);
2963                 break;
2964         case CODESYS_UTF8:
2965                 encode_coding_utf8(encoding, src, dst, n);
2966                 break;
2967         case CODESYS_CCL:
2968                 str->ccl.last_block = str->flags & CODING_STATE_END;
2969                 /* When applying ccl program to stream, MUST NOT set NULL
2970                    pointer to src.  */
2971                 ccl_driver(&str->ccl, ((src) ? src : (unsigned char *)""),
2972                            dst, n, 0, CCL_MODE_ENCODING);
2973                 break;
2974         case CODESYS_ISO2022:
2975                 encode_coding_iso2022(encoding, src, dst, n);
2976                 break;
2977 #endif                          /* MULE */
2978         default:
2979                 abort();
2980         }
2981 }
2982
2983 DEFUN("encode-coding-region", Fencode_coding_region, 3, 4, 0,   /*
2984 Encode the text between START and END using CODING-SYSTEM.
2985 This will, for example, convert Japanese characters into stuff such as
2986 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
2987 text.  BUFFER defaults to the current buffer if unspecified.
2988 */
2989       (start, end, coding_system, buffer))
2990 {
2991         Bufpos b, e;
2992         struct buffer *buf = decode_buffer(buffer, 0);
2993         Lisp_Object instream, lb_outstream, de_outstream, outstream;
2994         lstream_t istr, ostr;
2995         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2996
2997         get_buffer_range_char(buf, start, end, &b, &e, 0);
2998
2999         barf_if_buffer_read_only(buf, b, e);
3000
3001         coding_system = Fget_coding_system(coding_system);
3002         instream = make_lisp_buffer_input_stream(buf, b, e, 0);
3003         lb_outstream = make_lisp_buffer_output_stream(buf, b, 0);
3004         de_outstream = make_decoding_output_stream(XLSTREAM(lb_outstream),
3005                                                    Fget_coding_system(Qbinary));
3006         outstream = make_encoding_output_stream(XLSTREAM(de_outstream),
3007                                                 coding_system);
3008         istr = XLSTREAM(instream);
3009         ostr = XLSTREAM(outstream);
3010         GCPRO4(instream, outstream, de_outstream, lb_outstream);
3011         /* The chain of streams looks like this:
3012
3013            [BUFFER] <----- send through
3014            ------> [ENCODE AS SPECIFIED]
3015            ------> [DECODE AS BINARY]
3016            ------> [BUFFER]
3017          */
3018         while (1) {
3019                 char tempbuf[1024];     /* some random amount */
3020                 Bufpos newpos, even_newer_pos;
3021                 Bufpos oldpos = lisp_buffer_stream_startpos(istr);
3022                 Lstream_data_count size_in_bytes =
3023                     Lstream_read(istr, tempbuf, sizeof(tempbuf));
3024
3025                 if (!size_in_bytes)
3026                         break;
3027                 newpos = lisp_buffer_stream_startpos(istr);
3028                 Lstream_write(ostr, tempbuf, size_in_bytes);
3029                 even_newer_pos = lisp_buffer_stream_startpos(istr);
3030                 buffer_delete_range(buf, even_newer_pos - (newpos - oldpos),
3031                                     even_newer_pos, 0);
3032         }
3033
3034         {
3035                 Charcount retlen =
3036                     lisp_buffer_stream_startpos(XLSTREAM(instream)) - b;
3037                 Lstream_close(istr);
3038                 Lstream_close(ostr);
3039                 UNGCPRO;
3040                 Lstream_delete(istr);
3041                 Lstream_delete(ostr);
3042                 Lstream_delete(XLSTREAM(de_outstream));
3043                 Lstream_delete(XLSTREAM(lb_outstream));
3044                 return make_int(retlen);
3045         }
3046 }
3047
3048 #ifdef MULE
3049 \f
3050 /************************************************************************/
3051 /*                          Shift-JIS methods                           */
3052 /************************************************************************/
3053
3054 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3055    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3056    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3057    encoded by "position-code + 0x80".  A character of JISX0208
3058    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3059    position-codes are divided and shifted so that it fit in the range
3060    below.
3061
3062    --- CODE RANGE of Shift-JIS ---
3063    (character set)      (range)
3064    ASCII                0x00 .. 0x7F
3065    JISX0201-Kana        0xA0 .. 0xDF
3066    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
3067             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3068    -------------------------------
3069
3070 */
3071
3072 /* Is this the first byte of a Shift-JIS two-byte char? */
3073
3074 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3075   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3076
3077 /* Is this the second byte of a Shift-JIS two-byte char? */
3078
3079 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3080   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3081
3082 #define BYTE_SJIS_KATAKANA_P(c) \
3083   ((c) >= 0xA1 && (c) <= 0xDF)
3084
3085 static int
3086 detect_coding_sjis(struct detection_state *st, const Extbyte * src,
3087                    Lstream_data_count n)
3088 {
3089         while (n--) {
3090                 const unsigned char c = *(const unsigned char *)src++;
3091                 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3092                         return 0;
3093                 if (st->shift_jis.in_second_byte) {
3094                         st->shift_jis.in_second_byte = 0;
3095                         if (c < 0x40)
3096                                 return 0;
3097                 } else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3098                         st->shift_jis.in_second_byte = 1;
3099         }
3100         return CODING_CATEGORY_SHIFT_JIS_MASK;
3101 }
3102
3103 /* Convert Shift-JIS data to internal format. */
3104
3105 static void
3106 decode_coding_sjis(lstream_t decoding, const Extbyte * src,
3107                    unsigned_char_dynarr * dst, Lstream_data_count n)
3108 {
3109         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3110         unsigned int flags = str->flags;
3111         unsigned int ch = str->ch;
3112         eol_type_t eol_type = str->eol_type;
3113
3114         while (n--) {
3115                 const unsigned char c = *(const unsigned char *)src++;
3116
3117                 if (ch) {
3118                         /* Previous character was first byte of Shift-JIS Kanji
3119                            char. */
3120                         if (BYTE_SJIS_TWO_BYTE_2_P(c)) {
3121                                 unsigned char e1, e2;
3122
3123                                 Dynarr_add(dst, LEADING_BYTE_JAPANESE_JISX0208);
3124                                 DECODE_SJIS(ch, c, e1, e2);
3125                                 Dynarr_add(dst, e1);
3126                                 Dynarr_add(dst, e2);
3127                         } else {
3128                                 DECODE_ADD_BINARY_CHAR(ch, dst);
3129                                 DECODE_ADD_BINARY_CHAR(c, dst);
3130                         }
3131                         ch = 0;
3132                 } else {
3133                         DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3134                         if (BYTE_SJIS_TWO_BYTE_1_P(c))
3135                                 ch = c;
3136                         else if (BYTE_SJIS_KATAKANA_P(c)) {
3137                                 Dynarr_add(dst, LEADING_BYTE_KATAKANA_JISX0201);
3138                                 Dynarr_add(dst, c);
3139                         } else
3140                                 DECODE_ADD_BINARY_CHAR(c, dst);
3141                 }
3142               label_continue_loop:;
3143         }
3144
3145         DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
3146
3147         str->flags = flags;
3148         str->ch = ch;
3149 }
3150
3151 /* Convert internally-formatted data to Shift-JIS. */
3152
3153 static void
3154 encode_coding_sjis(lstream_t encoding, const Bufbyte * src,
3155                    unsigned_char_dynarr * dst, Lstream_data_count n)
3156 {
3157         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3158         unsigned int flags = str->flags;
3159         unsigned int ch = str->ch;
3160         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3161
3162         while (n--) {
3163                 Bufbyte c = *src++;
3164                 if (c == '\n') {
3165                         if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3166                                 Dynarr_add(dst, '\r');
3167                         if (eol_type != EOL_CR)
3168                                 Dynarr_add(dst, '\n');
3169                         ch = 0;
3170                 } else if (BYTE_ASCII_P(c)) {
3171                         Dynarr_add(dst, c);
3172                         ch = 0;
3173                 } else if (BUFBYTE_LEADING_BYTE_P(c))
3174                         ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
3175                               c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3176                               c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
3177                 else if (ch) {
3178                         if (ch == LEADING_BYTE_KATAKANA_JISX0201) {
3179                                 Dynarr_add(dst, c);
3180                                 ch = 0;
3181                         } else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3182                                    ch == LEADING_BYTE_JAPANESE_JISX0208)
3183                                 ch = c;
3184                         else {
3185                                 /* j1 is bessel j1 function,
3186                                  * so we use something else */
3187                                 /* unsigned char j1, j2; */
3188                                 unsigned char tt1, tt2;
3189
3190                                 ENCODE_SJIS(ch, c, tt1, tt2);
3191                                 Dynarr_add(dst, tt1);
3192                                 Dynarr_add(dst, tt2);
3193                                 ch = 0;
3194                         }
3195                 }
3196         }
3197
3198         str->flags = flags;
3199         str->ch = ch;
3200 }
3201
3202 DEFUN("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3203 Decode a JISX0208 character of Shift-JIS coding-system.
3204 CODE is the character code in Shift-JIS as a cons of type bytes.
3205 Return the corresponding character.
3206 */
3207       (code))
3208 {
3209         unsigned char c1, c2, s1, s2;
3210
3211         CHECK_CONS(code);
3212         CHECK_INT(XCAR(code));
3213         CHECK_INT(XCDR(code));
3214         s1 = XINT(XCAR(code));
3215         s2 = XINT(XCDR(code));
3216         if (BYTE_SJIS_TWO_BYTE_1_P(s1) && BYTE_SJIS_TWO_BYTE_2_P(s2)) {
3217                 DECODE_SJIS(s1, s2, c1, c2);
3218                 return make_char(MAKE_CHAR(Vcharset_japanese_jisx0208,
3219                                            c1 & 0x7F, c2 & 0x7F));
3220         } else
3221                 return Qnil;
3222 }
3223
3224 DEFUN("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3225 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3226 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3227 */
3228       (character))
3229 {
3230         Lisp_Object charset;
3231         int c1, c2, s1, s2;
3232
3233         CHECK_CHAR_COERCE_INT(character);
3234         BREAKUP_CHAR(XCHAR(character), charset, c1, c2);
3235         if (EQ(charset, Vcharset_japanese_jisx0208)) {
3236                 ENCODE_SJIS(c1 | 0x80, c2 | 0x80, s1, s2);
3237                 return Fcons(make_int(s1), make_int(s2));
3238         } else
3239                 return Qnil;
3240 }
3241 \f
3242 /************************************************************************/
3243 /*                            Big5 methods                              */
3244 /************************************************************************/
3245
3246 /* BIG5 is a coding system encoding two character sets: ASCII and
3247    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3248    character set and is encoded in two-byte.
3249
3250    --- CODE RANGE of BIG5 ---
3251    (character set)      (range)
3252    ASCII                0x00 .. 0x7F
3253    Big5 (1st byte)      0xA1 .. 0xFE
3254         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3255    --------------------------
3256
3257    Since the number of characters in Big5 is larger than maximum
3258    characters in Emacs' charset (96x96), it can't be handled as one
3259    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
3260    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
3261    contains frequently used characters and the latter contains less
3262    frequently used characters.  */
3263
3264 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3265   ((c) >= 0xA1 && (c) <= 0xFE)
3266
3267 /* Is this the second byte of a Shift-JIS two-byte char? */
3268
3269 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3270   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3271
3272 /* Number of Big5 characters which have the same code in 1st byte.  */
3273
3274 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3275
3276 /* Code conversion macros.  These are macros because they are used in
3277    inner loops during code conversion.
3278
3279    Note that temporary variables in macros introduce the classic
3280    dynamic-scoping problems with variable names.  We use capital-
3281    lettered variables in the assumption that SXEmacs does not use
3282    capital letters in variables except in a very formalized way
3283    (e.g. Qstring). */
3284
3285 /* Convert Big5 code (b1, b2) into its internal string representation
3286    (lb, c1, c2). */
3287
3288 /* There is a much simpler way to split the Big5 charset into two.
3289    For the moment I'm going to leave the algorithm as-is because it
3290    claims to separate out the most-used characters into a single
3291    charset, which perhaps will lead to optimizations in various
3292    places.
3293
3294    The way the algorithm works is something like this:
3295
3296    Big5 can be viewed as a 94x157 charset, where the row is
3297    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3298    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
3299    the split between low and high column numbers is apparently
3300    meaningless; ascending rows produce less and less frequent chars.
3301    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3302    the first charset, and the upper half (0xC9 .. 0xFE) to the
3303    second.  To do the conversion, we convert the character into
3304    a single number where 0 .. 156 is the first row, 157 .. 313
3305    is the second, etc.  That way, the characters are ordered by
3306    decreasing frequency.  Then we just chop the space in two
3307    and coerce the result into a 94x94 space.
3308    */
3309
3310 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
3311 {                                                                       \
3312   int B1 = b1, B2 = b2;                                                 \
3313   unsigned int I                                                        \
3314     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
3315                                                                         \
3316   if (B1 < 0xC9)                                                        \
3317     {                                                                   \
3318       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
3319     }                                                                   \
3320   else                                                                  \
3321     {                                                                   \
3322       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
3323       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
3324     }                                                                   \
3325   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
3326   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
3327 } while (0)
3328
3329 /* Convert the internal string representation of a Big5 character
3330    (lb, c1, c2) into Big5 code (b1, b2). */
3331
3332 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
3333 {                                                                       \
3334   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
3335                                                                         \
3336   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
3337     {                                                                   \
3338       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
3339     }                                                                   \
3340   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
3341   b2 = I % BIG5_SAME_ROW;                                               \
3342   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
3343 } while (0)
3344
3345 static int
3346 detect_coding_big5(struct detection_state *st, const Extbyte * src,
3347                    Lstream_data_count n)
3348 {
3349         while (n--) {
3350                 const unsigned char c = *(const unsigned char *)src++;
3351                 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3352                     (c >= 0x80 && c <= 0xA0))
3353                         return 0;
3354                 if (st->big5.in_second_byte) {
3355                         st->big5.in_second_byte = 0;
3356                         if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3357                                 return 0;
3358                 } else if (c >= 0xA1)
3359                         st->big5.in_second_byte = 1;
3360         }
3361         return CODING_CATEGORY_BIG5_MASK;
3362 }
3363
3364 /* Convert Big5 data to internal format. */
3365
3366 static void
3367 decode_coding_big5(lstream_t decoding, const Extbyte * src,
3368                    unsigned_char_dynarr * dst, Lstream_data_count n)
3369 {
3370         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3371         unsigned int flags = str->flags;
3372         unsigned int ch = str->ch;
3373         eol_type_t eol_type = str->eol_type;
3374
3375         while (n--) {
3376                 const unsigned char c = *(const unsigned char *)src++;
3377                 if (ch) {
3378                         /* Previous character was first byte of Big5 char. */
3379                         if (BYTE_BIG5_TWO_BYTE_2_P(c)) {
3380                                 unsigned char b1, b2, b3;
3381                                 DECODE_BIG5(ch, c, b1, b2, b3);
3382                                 Dynarr_add(dst, b1);
3383                                 Dynarr_add(dst, b2);
3384                                 Dynarr_add(dst, b3);
3385                         } else {
3386                                 DECODE_ADD_BINARY_CHAR(ch, dst);
3387                                 DECODE_ADD_BINARY_CHAR(c, dst);
3388                         }
3389                         ch = 0;
3390                 } else {
3391                         DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3392                         if (BYTE_BIG5_TWO_BYTE_1_P(c))
3393                                 ch = c;
3394                         else
3395                                 DECODE_ADD_BINARY_CHAR(c, dst);
3396                 }
3397               label_continue_loop:;
3398         }
3399
3400         DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
3401
3402         str->flags = flags;
3403         str->ch = ch;
3404 }
3405
3406 /* Convert internally-formatted data to Big5. */
3407
3408 static void
3409 encode_coding_big5(lstream_t encoding, const Bufbyte * src,
3410                    unsigned_char_dynarr * dst, Lstream_data_count n)
3411 {
3412         unsigned char c;
3413         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3414         unsigned int flags = str->flags;
3415         unsigned int ch = str->ch;
3416         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3417
3418         while (n--) {
3419                 c = *src++;
3420                 if (c == '\n') {
3421                         if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3422                                 Dynarr_add(dst, '\r');
3423                         if (eol_type != EOL_CR)
3424                                 Dynarr_add(dst, '\n');
3425                 } else if (BYTE_ASCII_P(c)) {
3426                         /* ASCII. */
3427                         Dynarr_add(dst, c);
3428                 } else if (BUFBYTE_LEADING_BYTE_P(c)) {
3429                         if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3430                             c == LEADING_BYTE_CHINESE_BIG5_2) {
3431                                 /* A recognized leading byte. */
3432                                 ch = c;
3433                                 continue;       /* not done with this character. */
3434                         }
3435                         /* otherwise just ignore this character. */
3436                 } else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3437                            ch == LEADING_BYTE_CHINESE_BIG5_2) {
3438                         /* Previous char was a recognized leading byte. */
3439                         ch = (ch << 8) | c;
3440                         continue;       /* not done with this character. */
3441                 } else if (ch) {
3442                         /* Encountering second byte of a Big5 character. */
3443                         unsigned char b1, b2;
3444
3445                         ENCODE_BIG5(ch >> 8, ch & 0xFF, c, b1, b2);
3446                         Dynarr_add(dst, b1);
3447                         Dynarr_add(dst, b2);
3448                 }
3449
3450                 ch = 0;
3451         }
3452
3453         str->flags = flags;
3454         str->ch = ch;
3455 }
3456
3457 DEFUN("decode-big5-char", Fdecode_big5_char, 1, 1, 0,   /*
3458 Decode a Big5 character CODE of BIG5 coding-system.
3459 CODE is the character code in BIG5, a cons of two integers.
3460 Return the corresponding character.
3461 */
3462       (code))
3463 {
3464         unsigned char c1, c2, b1, b2;
3465
3466         CHECK_CONS(code);
3467         CHECK_INT(XCAR(code));
3468         CHECK_INT(XCDR(code));
3469         b1 = XINT(XCAR(code));
3470         b2 = XINT(XCDR(code));
3471         if (BYTE_BIG5_TWO_BYTE_1_P(b1) && BYTE_BIG5_TWO_BYTE_2_P(b2)) {
3472                 int leading_byte;
3473                 Lisp_Object charset;
3474                 DECODE_BIG5(b1, b2, leading_byte, c1, c2);
3475                 charset = CHARSET_BY_LEADING_BYTE(leading_byte);
3476                 return make_char(MAKE_CHAR(charset, c1 & 0x7F, c2 & 0x7F));
3477         } else
3478                 return Qnil;
3479 }
3480
3481 DEFUN("encode-big5-char", Fencode_big5_char, 1, 1, 0,   /*
3482 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3483 Return the corresponding character code in Big5.
3484 */
3485       (character))
3486 {
3487         Lisp_Object charset;
3488         int c1, c2, b1, b2;
3489
3490         CHECK_CHAR_COERCE_INT(character);
3491         BREAKUP_CHAR(XCHAR(character), charset, c1, c2);
3492         if (EQ(charset, Vcharset_chinese_big5_1) ||
3493             EQ(charset, Vcharset_chinese_big5_2)) {
3494                 ENCODE_BIG5(XCHARSET_LEADING_BYTE(charset), c1 | 0x80,
3495                             c2 | 0x80, b1, b2);
3496                 return Fcons(make_int(b1), make_int(b2));
3497         } else
3498                 return Qnil;
3499 }
3500 \f
3501 /************************************************************************/
3502 /*                           UCS-4 methods                              */
3503 /*                                                                      */
3504 /*  UCS-4 character codes are implemented as nonnegative integers.      */
3505 /*                                                                      */
3506 /************************************************************************/
3507
3508 DEFUN("set-ucs-char", Fset_ucs_char, 2, 2, 0,   /*
3509 Map UCS-4 code CODE to Mule character CHARACTER.
3510
3511 Return T on success, NIL on failure.
3512 */
3513       (code, character))
3514 {
3515         size_t c;
3516
3517         CHECK_CHAR(character);
3518         CHECK_NATNUM(code);
3519         c = XINT(code);
3520
3521         if (c < countof(fcd->ucs_to_mule_table)) {
3522                 fcd->ucs_to_mule_table[c] = character;
3523                 return Qt;
3524         } else
3525                 return Qnil;
3526 }
3527
3528 static Lisp_Object ucs_to_char(unsigned long code)
3529 {
3530         if (code < countof(fcd->ucs_to_mule_table)) {
3531                 return fcd->ucs_to_mule_table[code];
3532         } else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14)) {
3533                 unsigned int c;
3534
3535                 code -= 0xe00000;
3536                 c = code % (94 * 94);
3537                 return make_char
3538                     (MAKE_CHAR(CHARSET_BY_ATTRIBUTES
3539                                (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3540                                 CHARSET_LEFT_TO_RIGHT),
3541                                c / 94 + 33, c % 94 + 33));
3542         } else
3543                 return Qnil;
3544 }
3545
3546 DEFUN("ucs-char", Fucs_char, 1, 1, 0,   /*
3547 Return Mule character corresponding to UCS code CODE (a positive integer).
3548 */
3549       (code))
3550 {
3551         CHECK_NATNUM(code);
3552         return ucs_to_char(XINT(code));
3553 }
3554
3555 DEFUN("set-char-ucs", Fset_char_ucs, 2, 2, 0,   /*
3556 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3557 */
3558       (character, code))
3559 {
3560         /* #### Isn't this gilding the lily?  Fput_char_table checks its args.
3561            Fset_char_ucs is more restrictive on index arg, but should
3562            check code arg in a char_table method. */
3563         CHECK_CHAR(character);
3564         CHECK_NATNUM(code);
3565         return Fput_char_table(character, code, mule_to_ucs_table);
3566 }
3567
3568 DEFUN("char-ucs", Fchar_ucs, 1, 1, 0,   /*
3569 Return the UCS code (a positive integer) corresponding to CHARACTER.
3570 */
3571       (character))
3572 {
3573         return Fget_char_table(character, mule_to_ucs_table);
3574 }
3575
3576 /* Decode a UCS-4 character into a buffer.  If the lookup fails, use
3577    <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3578    is not found, instead.
3579    #### do something more appropriate (use blob?)
3580         Danger, Will Robinson!  Data loss.  Should we signal user? */
3581 static void decode_ucs4(unsigned long ch, unsigned_char_dynarr * dst)
3582 {
3583         Lisp_Object chr = ucs_to_char(ch);
3584
3585         if (!NILP(chr)) {
3586                 Bufbyte work[MAX_EMCHAR_LEN];
3587                 int len;
3588
3589                 ch = XCHAR(chr);
3590                 len = (ch < 128) ?
3591                     simple_set_charptr_emchar(work, ch) :
3592                     non_ascii_set_charptr_emchar(work, ch);
3593                 Dynarr_add_many(dst, work, len);
3594         } else {
3595                 Dynarr_add(dst, LEADING_BYTE_JAPANESE_JISX0208);
3596                 Dynarr_add(dst, 34 + 128);
3597                 Dynarr_add(dst, 46 + 128);
3598         }
3599 }
3600
3601 static unsigned long
3602 mule_char_to_ucs4(Lisp_Object charset, unsigned char h, unsigned char l)
3603 {
3604         Lisp_Object code
3605             = Fget_char_table(make_char(MAKE_CHAR(charset, h & 127, l & 127)),
3606                               mule_to_ucs_table);
3607
3608         if (INTP(code)) {
3609                 return XINT(code);
3610         } else if ((XCHARSET_DIMENSION(charset) == 2) &&
3611                    (XCHARSET_CHARS(charset) == 94)) {
3612                 unsigned char final = XCHARSET_FINAL(charset);
3613
3614                 if (('@' <= final) && (final < 0x7f)) {
3615                         return 0xe00000 + (final - '@') * 94 * 94
3616                             + ((h & 127) - 33) * 94 + (l & 127) - 33;
3617                 } else {
3618                         return '?';
3619                 }
3620         } else {
3621                 return '?';
3622         }
3623 }
3624
3625 static void
3626 encode_ucs4(Lisp_Object charset,
3627             unsigned char h, unsigned char l, unsigned_char_dynarr * dst)
3628 {
3629         unsigned long code = mule_char_to_ucs4(charset, h, l);
3630         Dynarr_add(dst, code >> 24);
3631         Dynarr_add(dst, (code >> 16) & 255);
3632         Dynarr_add(dst, (code >> 8) & 255);
3633         Dynarr_add(dst, code & 255);
3634 }
3635
3636 static int
3637 detect_coding_ucs4(struct detection_state *st, const Extbyte * src,
3638                    Lstream_data_count n)
3639 {
3640         while (n--) {
3641                 const unsigned char c = *(const unsigned char *)src++;
3642                 switch (st->ucs4.in_byte) {
3643                 case 0:
3644                         if (c >= 128)
3645                                 return 0;
3646                         else
3647                                 st->ucs4.in_byte++;
3648                         break;
3649                 case 3:
3650                         st->ucs4.in_byte = 0;
3651                         break;
3652                 default:
3653                         st->ucs4.in_byte++;
3654                 }
3655         }
3656         return CODING_CATEGORY_UCS4_MASK;
3657 }
3658
3659 static void
3660 decode_coding_ucs4(lstream_t decoding, const Extbyte * src,
3661                    unsigned_char_dynarr * dst, Lstream_data_count n)
3662 {
3663         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3664         unsigned int flags = str->flags;
3665         unsigned int ch = str->ch;
3666         unsigned char counter = str->counter;
3667
3668         while (n--) {
3669                 const unsigned char c = *(const unsigned char *)src++;
3670                 switch (counter) {
3671                 case 0:
3672                         ch = c;
3673                         counter = 3;
3674                         break;
3675                 case 1:
3676                         decode_ucs4((ch << 8) | c, dst);
3677                         ch = 0;
3678                         counter = 0;
3679                         break;
3680                 default:
3681                         ch = (ch << 8) | c;
3682                         counter--;
3683                 }
3684         }
3685         if (counter & CODING_STATE_END)
3686                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
3687
3688         str->flags = flags;
3689         str->ch = ch;
3690         str->counter = counter;
3691 }
3692
3693 static void
3694 encode_coding_ucs4(lstream_t encoding, const Bufbyte * src,
3695                    unsigned_char_dynarr * dst, Lstream_data_count n)
3696 {
3697         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3698         unsigned int flags = str->flags;
3699         unsigned int ch = str->ch;
3700         unsigned char char_boundary = str->iso2022.current_char_boundary;
3701         Lisp_Object charset = str->iso2022.current_charset;
3702
3703 #ifdef ENABLE_COMPOSITE_CHARS
3704         /* flags for handling composite chars.  We do a little switcharoo
3705            on the source while we're outputting the composite char. */
3706         unsigned int saved_n = 0;
3707         const unsigned char *saved_src = NULL;
3708         int in_composite = 0;
3709
3710       back_to_square_n:
3711 #endif
3712
3713         while (n--) {
3714                 unsigned char c = *src++;
3715
3716                 if (BYTE_ASCII_P(c)) {  /* Processing ASCII character */
3717                         ch = 0;
3718                         encode_ucs4(Vcharset_ascii, c, 0, dst);
3719                         char_boundary = 1;
3720                 } else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) {   /* Processing Leading Byte */
3721                         ch = 0;
3722                         charset = CHARSET_BY_LEADING_BYTE(c);
3723                         if (LEADING_BYTE_PREFIX_P(c))
3724                                 ch = c;
3725                         char_boundary = 0;
3726                 } else {        /* Processing Non-ASCII character */
3727                         char_boundary = 1;
3728                         if (EQ(charset, Vcharset_control_1)) {
3729                                 encode_ucs4(Vcharset_control_1, c, 0, dst);
3730                         } else {
3731                                 switch (XCHARSET_REP_BYTES(charset)) {
3732                                 case 2:
3733                                         encode_ucs4(charset, c, 0, dst);
3734                                         break;
3735                                 case 3:
3736                                         if (XCHARSET_PRIVATE_P(charset)) {
3737                                                 encode_ucs4(charset, c, 0, dst);
3738                                                 ch = 0;
3739                                         } else if (ch) {
3740 #ifdef ENABLE_COMPOSITE_CHARS
3741                                                 if (EQ
3742                                                     (charset,
3743                                                      Vcharset_composite)) {
3744                                                         if (in_composite) {
3745                                                                 /* #### Bother! We don't know how to
3746                                                                    handle this yet. */
3747                                                                 Dynarr_add(dst,
3748                                                                            '\0');
3749                                                                 Dynarr_add(dst,
3750                                                                            '\0');
3751                                                                 Dynarr_add(dst,
3752                                                                            '\0');
3753                                                                 Dynarr_add(dst,
3754                                                                            '~');
3755                                                         } else {
3756                                                                 Emchar emch =
3757                                                                     MAKE_CHAR
3758                                                                     (Vcharset_composite,
3759                                                                      ch & 0x7F,
3760                                                                      c & 0x7F);
3761                                                                 Lisp_Object lstr
3762                                                                     =
3763                                                                     composite_char_string
3764                                                                     (emch);
3765                                                                 saved_n = n;
3766                                                                 saved_src = src;
3767                                                                 in_composite =
3768                                                                     1;
3769                                                                 src =
3770                                                                     XSTRING_DATA
3771                                                                     (lstr);
3772                                                                 n = XSTRING_LENGTH(lstr);
3773                                                         }
3774                                                 } else
3775 #endif                          /* ENABLE_COMPOSITE_CHARS */
3776                                                 {
3777                                                         encode_ucs4(charset, ch,
3778                                                                     c, dst);
3779                                                 }
3780                                                 ch = 0;
3781                                         } else {
3782                                                 ch = c;
3783                                                 char_boundary = 0;
3784                                         }
3785                                         break;
3786                                 case 4:
3787                                         if (ch) {
3788                                                 encode_ucs4(charset, ch, c,
3789                                                             dst);
3790                                                 ch = 0;
3791                                         } else {
3792                                                 ch = c;
3793                                                 char_boundary = 0;
3794                                         }
3795                                         break;
3796                                 default:
3797                                         abort();
3798                                 }
3799                         }
3800                 }
3801         }
3802
3803 #ifdef ENABLE_COMPOSITE_CHARS
3804         if (in_composite) {
3805                 n = saved_n;
3806                 src = saved_src;
3807                 in_composite = 0;
3808                 goto back_to_square_n;  /* Wheeeeeeeee ..... */
3809         }
3810 #endif                          /* ENABLE_COMPOSITE_CHARS */
3811
3812         str->flags = flags;
3813         str->ch = ch;
3814         str->iso2022.current_char_boundary = char_boundary;
3815         str->iso2022.current_charset = charset;
3816
3817         /* Verbum caro factum est! */
3818 }
3819 \f
3820 /************************************************************************/
3821 /*                           UTF-8 methods                              */
3822 /************************************************************************/
3823
3824 static int
3825 detect_coding_utf8(struct detection_state *st, const Extbyte * src,
3826                    Lstream_data_count n)
3827 {
3828         while (n--) {
3829                 const unsigned char c = *(const unsigned char *)src++;
3830                 switch (st->utf8.in_byte) {
3831                 case 0:
3832                         if (c == ISO_CODE_ESC || c == ISO_CODE_SI
3833                             || c == ISO_CODE_SO)
3834                                 return 0;
3835                         else if (c >= 0xfc)
3836                                 st->utf8.in_byte = 5;
3837                         else if (c >= 0xf8)
3838                                 st->utf8.in_byte = 4;
3839                         else if (c >= 0xf0)
3840                                 st->utf8.in_byte = 3;
3841                         else if (c >= 0xe0)
3842                                 st->utf8.in_byte = 2;
3843                         else if (c >= 0xc0)
3844                                 st->utf8.in_byte = 1;
3845                         else if (c >= 0x80)
3846                                 return 0;
3847                         break;
3848                 default:
3849                         if ((c & 0xc0) != 0x80)
3850                                 return 0;
3851                         else
3852                                 st->utf8.in_byte--;
3853                 }
3854         }
3855         return CODING_CATEGORY_UTF8_MASK;
3856 }
3857
3858 static void
3859 decode_coding_utf8(lstream_t decoding, const Extbyte * src,
3860                    unsigned_char_dynarr * dst, Lstream_data_count n)
3861 {
3862         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3863         unsigned int flags = str->flags;
3864         unsigned int ch = str->ch;
3865         eol_type_t eol_type = str->eol_type;
3866         unsigned char counter = str->counter;
3867
3868         while (n--) {
3869                 const unsigned char c = *(const unsigned char *)src++;
3870                 switch (counter) {
3871                 case 0:
3872                         if (c >= 0xfc) {
3873                                 ch = c & 0x01;
3874                                 counter = 5;
3875                         } else if (c >= 0xf8) {
3876                                 ch = c & 0x03;
3877                                 counter = 4;
3878                         } else if (c >= 0xf0) {
3879                                 ch = c & 0x07;
3880                                 counter = 3;
3881                         } else if (c >= 0xe0) {
3882                                 ch = c & 0x0f;
3883                                 counter = 2;
3884                         } else if (c >= 0xc0) {
3885                                 ch = c & 0x1f;
3886                                 counter = 1;
3887                         } else {
3888                                 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3889                                 decode_ucs4(c, dst);
3890                         }
3891                         break;
3892                 case 1:
3893                         ch = (ch << 6) | (c & 0x3f);
3894                         decode_ucs4(ch, dst);
3895                         ch = 0;
3896                         counter = 0;
3897                         break;
3898                 default:
3899                         ch = (ch << 6) | (c & 0x3f);
3900                         counter--;
3901                 }
3902               label_continue_loop:;
3903         }
3904
3905         if (flags & CODING_STATE_END)
3906                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
3907
3908         str->flags = flags;
3909         str->ch = ch;
3910         str->counter = counter;
3911 }
3912
3913 static void
3914 encode_utf8(Lisp_Object charset,
3915             unsigned char h, unsigned char l, unsigned_char_dynarr * dst)
3916 {
3917         unsigned long code = mule_char_to_ucs4(charset, h, l);
3918         if (code <= 0x7f) {
3919                 Dynarr_add(dst, code);
3920         } else if (code <= 0x7ff) {
3921                 Dynarr_add(dst, (code >> 6) | 0xc0);
3922                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3923         } else if (code <= 0xffff) {
3924                 Dynarr_add(dst, (code >> 12) | 0xe0);
3925                 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3926                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3927         } else if (code <= 0x1fffff) {
3928                 Dynarr_add(dst, (code >> 18) | 0xf0);
3929                 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3930                 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3931                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3932         } else if (code <= 0x3ffffff) {
3933                 Dynarr_add(dst, (code >> 24) | 0xf8);
3934                 Dynarr_add(dst, ((code >> 18) & 0x3f) | 0x80);
3935                 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3936                 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3937                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3938         } else {
3939                 Dynarr_add(dst, (code >> 30) | 0xfc);
3940                 Dynarr_add(dst, ((code >> 24) & 0x3f) | 0x80);
3941                 Dynarr_add(dst, ((code >> 18) & 0x3f) | 0x80);
3942                 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3943                 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3944                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3945         }
3946 }
3947
3948 static void
3949 encode_coding_utf8(lstream_t encoding, const Bufbyte * src,
3950                    unsigned_char_dynarr * dst, Lstream_data_count n)
3951 {
3952         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3953         unsigned int flags = str->flags;
3954         unsigned int ch = str->ch;
3955         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3956         unsigned char char_boundary = str->iso2022.current_char_boundary;
3957         Lisp_Object charset = str->iso2022.current_charset;
3958
3959 #ifdef ENABLE_COMPOSITE_CHARS
3960         /* flags for handling composite chars.  We do a little switcharoo
3961            on the source while we're outputting the composite char. */
3962         unsigned int saved_n = 0;
3963         const unsigned char *saved_src = NULL;
3964         int in_composite = 0;
3965
3966       back_to_square_n:
3967 #endif                          /* ENABLE_COMPOSITE_CHARS */
3968
3969         while (n--) {
3970                 unsigned char c = *src++;
3971
3972                 if (BYTE_ASCII_P(c)) {  /* Processing ASCII character */
3973                         ch = 0;
3974                         if (c == '\n') {
3975                                 if (eol_type != EOL_LF
3976                                     && eol_type != EOL_AUTODETECT)
3977                                         Dynarr_add(dst, '\r');
3978                                 if (eol_type != EOL_CR)
3979                                         Dynarr_add(dst, c);
3980                         } else
3981                                 encode_utf8(Vcharset_ascii, c, 0, dst);
3982                         char_boundary = 1;
3983                 } else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) {   /* Processing Leading Byte */
3984                         ch = 0;
3985                         charset = CHARSET_BY_LEADING_BYTE(c);
3986                         if (LEADING_BYTE_PREFIX_P(c))
3987                                 ch = c;
3988                         char_boundary = 0;
3989                 } else {        /* Processing Non-ASCII character */
3990                         char_boundary = 1;
3991                         if (EQ(charset, Vcharset_control_1)) {
3992                                 encode_utf8(Vcharset_control_1, c, 0, dst);
3993                         } else {
3994                                 switch (XCHARSET_REP_BYTES(charset)) {
3995                                 case 2:
3996                                         encode_utf8(charset, c, 0, dst);
3997                                         break;
3998                                 case 3:
3999                                         if (XCHARSET_PRIVATE_P(charset)) {
4000                                                 encode_utf8(charset, c, 0, dst);
4001                                                 ch = 0;
4002                                         } else if (ch) {
4003 #ifdef ENABLE_COMPOSITE_CHARS
4004                                                 if (EQ
4005                                                     (charset,
4006                                                      Vcharset_composite)) {
4007                                                         if (in_composite) {
4008                                                                 /* #### Bother! We don't know how to
4009                                                                    handle this yet. */
4010                                                                 encode_utf8
4011                                                                     (Vcharset_ascii,
4012                                                                      '~', 0,
4013                                                                      dst);
4014                                                         } else {
4015                                                                 Emchar emch =
4016                                                                     MAKE_CHAR
4017                                                                     (Vcharset_composite,
4018                                                                      ch & 0x7F,
4019                                                                      c & 0x7F);
4020                                                                 Lisp_Object lstr
4021                                                                     =
4022                                                                     composite_char_string
4023                                                                     (emch);
4024                                                                 saved_n = n;
4025                                                                 saved_src = src;
4026                                                                 in_composite =
4027                                                                     1;
4028                                                                 src =
4029                                                                     XSTRING_DATA
4030                                                                     (lstr);
4031                                                                 n = XSTRING_LENGTH(lstr);
4032                                                         }
4033                                                 } else
4034 #endif                          /* ENABLE_COMPOSITE_CHARS */
4035                                                 {
4036                                                         encode_utf8(charset, ch,
4037                                                                     c, dst);
4038                                                 }
4039                                                 ch = 0;
4040                                         } else {
4041                                                 ch = c;
4042                                                 char_boundary = 0;
4043                                         }
4044                                         break;
4045                                 case 4:
4046                                         if (ch) {
4047                                                 encode_utf8(charset, ch, c,
4048                                                             dst);
4049                                                 ch = 0;
4050                                         } else {
4051                                                 ch = c;
4052                                                 char_boundary = 0;
4053                                         }
4054                                         break;
4055                                 default:
4056                                         abort();
4057                                 }
4058                         }
4059                 }
4060         }
4061
4062 #ifdef ENABLE_COMPOSITE_CHARS
4063         if (in_composite) {
4064                 n = saved_n;
4065                 src = saved_src;
4066                 in_composite = 0;
4067                 goto back_to_square_n;  /* Wheeeeeeeee ..... */
4068         }
4069 #endif
4070
4071         str->flags = flags;
4072         str->ch = ch;
4073         str->iso2022.current_char_boundary = char_boundary;
4074         str->iso2022.current_charset = charset;
4075
4076         /* Verbum caro factum est! */
4077 }
4078 \f
4079 /************************************************************************/
4080 /*                           ISO2022 methods                            */
4081 /************************************************************************/
4082
4083 /* The following note describes the coding system ISO2022 briefly.
4084    Since the intention of this note is to help understand the
4085    functions in this file, some parts are NOT ACCURATE or OVERLY
4086    SIMPLIFIED.  For thorough understanding, please refer to the
4087    original document of ISO2022.
4088
4089    ISO2022 provides many mechanisms to encode several character sets
4090    in 7-bit and 8-bit environments.  For 7-bit environments, all text
4091    is encoded using bytes less than 128.  This may make the encoded
4092    text a little bit longer, but the text passes more easily through
4093    several gateways, some of which strip off MSB (Most Signigant Bit).
4094
4095    There are two kinds of character sets: control character set and
4096    graphic character set.  The former contains control characters such
4097    as `newline' and `escape' to provide control functions (control
4098    functions are also provided by escape sequences).  The latter
4099    contains graphic characters such as 'A' and '-'.  Emacs recognizes
4100    two control character sets and many graphic character sets.
4101
4102    Graphic character sets are classified into one of the following
4103    four classes, according to the number of bytes (DIMENSION) and
4104    number of characters in one dimension (CHARS) of the set:
4105    - DIMENSION1_CHARS94
4106    - DIMENSION1_CHARS96
4107    - DIMENSION2_CHARS94
4108    - DIMENSION2_CHARS96
4109
4110    In addition, each character set is assigned an identification tag,
4111    unique for each set, called "final character" (denoted as <F>
4112    hereafter).  The <F> of each character set is decided by ECMA(*)
4113    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
4114    (0x30..0x3F are for private use only).
4115
4116    Note (*): ECMA = European Computer Manufacturers Association
4117
4118    Here are examples of graphic character set [NAME(<F>)]:
4119         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4120         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4121         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4122         o DIMENSION2_CHARS96 -- none for the moment
4123
4124    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4125         C0 [0x00..0x1F] -- control character plane 0
4126         GL [0x20..0x7F] -- graphic character plane 0
4127         C1 [0x80..0x9F] -- control character plane 1
4128         GR [0xA0..0xFF] -- graphic character plane 1
4129
4130    A control character set is directly designated and invoked to C0 or
4131    C1 by an escape sequence.  The most common case is that:
4132    - ISO646's  control character set is designated/invoked to C0, and
4133    - ISO6429's control character set is designated/invoked to C1,
4134    and usually these designations/invocations are omitted in encoded
4135    text.  In a 7-bit environment, only C0 can be used, and a control
4136    character for C1 is encoded by an appropriate escape sequence to
4137    fit into the environment.  All control characters for C1 are
4138    defined to have corresponding escape sequences.
4139
4140    A graphic character set is at first designated to one of four
4141    graphic registers (G0 through G3), then these graphic registers are
4142    invoked to GL or GR.  These designations and invocations can be
4143    done independently.  The most common case is that G0 is invoked to
4144    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
4145    these invocations and designations are omitted in encoded text.
4146    In a 7-bit environment, only GL can be used.
4147
4148    When a graphic character set of CHARS94 is invoked to GL, codes
4149    0x20 and 0x7F of the GL area work as control characters SPACE and
4150    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4151    be used.
4152
4153    There are two ways of invocation: locking-shift and single-shift.
4154    With locking-shift, the invocation lasts until the next different
4155    invocation, whereas with single-shift, the invocation affects the
4156    following character only and doesn't affect the locking-shift
4157    state.  Invocations are done by the following control characters or
4158    escape sequences:
4159
4160    ----------------------------------------------------------------------
4161    abbrev  function                  cntrl escape seq   description
4162    ----------------------------------------------------------------------
4163    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
4164    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
4165    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
4166    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
4167    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
4168    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
4169    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
4170    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
4171    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
4172    ----------------------------------------------------------------------
4173    (*) These are not used by any known coding system.
4174
4175    Control characters for these functions are defined by macros
4176    ISO_CODE_XXX in `coding.h'.
4177
4178    Designations are done by the following escape sequences:
4179    ----------------------------------------------------------------------
4180    escape sequence      description
4181    ----------------------------------------------------------------------
4182    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
4183    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
4184    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
4185    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
4186    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
4187    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
4188    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
4189    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
4190    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
4191    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
4192    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
4193    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
4194    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
4195    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
4196    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
4197    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
4198    ----------------------------------------------------------------------
4199
4200    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4201    of dimension 1, chars 94, and final character <F>, etc...
4202
4203    Note (*): Although these designations are not allowed in ISO2022,
4204    Emacs accepts them on decoding, and produces them on encoding
4205    CHARS96 character sets in a coding system which is characterized as
4206    7-bit environment, non-locking-shift, and non-single-shift.
4207
4208    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4209    '(' can be omitted.  We refer to this as "short-form" hereafter.
4210
4211    Now you may notice that there are a lot of ways for encoding the
4212    same multilingual text in ISO2022.  Actually, there exist many
4213    coding systems such as Compound Text (used in X11's inter client
4214    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4215    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4216    localized platforms), and all of these are variants of ISO2022.
4217
4218    In addition to the above, Emacs handles two more kinds of escape
4219    sequences: ISO6429's direction specification and Emacs' private
4220    sequence for specifying character composition.
4221
4222    ISO6429's direction specification takes the following form:
4223         o CSI ']'      -- end of the current direction
4224         o CSI '0' ']'  -- end of the current direction
4225         o CSI '1' ']'  -- start of left-to-right text
4226         o CSI '2' ']'  -- start of right-to-left text
4227    The control character CSI (0x9B: control sequence introducer) is
4228    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4229
4230    Character composition specification takes the following form:
4231         o ESC '0' -- start character composition
4232         o ESC '1' -- end character composition
4233    Since these are not standard escape sequences of any ISO standard,
4234    their use with these meanings is restricted to Emacs only.  */
4235
4236 static void
4237 reset_iso2022(Lisp_Object coding_system, struct iso2022_decoder *iso)
4238 {
4239         int i;
4240
4241         for (i = 0; i < 4; i++) {
4242                 if (!NILP(coding_system))
4243                         iso->charset[i] =
4244                             XCODING_SYSTEM_ISO2022_INITIAL_CHARSET
4245                             (coding_system, i);
4246                 else
4247                         iso->charset[i] = Qt;
4248                 iso->invalid_designated[i] = 0;
4249         }
4250         iso->esc = ISO_ESC_NOTHING;
4251         iso->esc_bytes_index = 0;
4252         iso->register_left = 0;
4253         iso->register_right = 1;
4254         iso->switched_dir_and_no_valid_charset_yet = 0;
4255         iso->invalid_switch_dir = 0;
4256         iso->output_direction_sequence = 0;
4257         iso->output_literally = 0;
4258 #ifdef ENABLE_COMPOSITE_CHARS
4259         if (iso->composite_chars)
4260                 Dynarr_reset(iso->composite_chars);
4261 #endif
4262 }
4263
4264 static int fit_to_be_escape_quoted(unsigned char c)
4265 {
4266         switch (c) {
4267         case ISO_CODE_ESC:
4268         case ISO_CODE_CSI:
4269         case ISO_CODE_SS2:
4270         case ISO_CODE_SS3:
4271         case ISO_CODE_SO:
4272         case ISO_CODE_SI:
4273                 return 1;
4274
4275         default:
4276                 return 0;
4277         }
4278 }
4279
4280 /* Parse one byte of an ISO2022 escape sequence.
4281    If the result is an invalid escape sequence, return 0 and
4282    do not change anything in STR.  Otherwise, if the result is
4283    an incomplete escape sequence, update ISO2022.ESC and
4284    ISO2022.ESC_BYTES and return -1.  Otherwise, update
4285    all the state variables (but not ISO2022.ESC_BYTES) and
4286    return 1.
4287
4288    If CHECK_INVALID_CHARSETS is non-zero, check for designation
4289    or invocation of an invalid character set and treat that as
4290    an unrecognized escape sequence.
4291
4292    ********************************************************************
4293
4294    #### Strategies for error annotation and coding orthogonalization
4295
4296    We really want to separate out a number of things.  Conceptually,
4297    there is a nested syntax.
4298
4299    At the top level is the ISO 2022 extension syntax, including charset
4300    designation and invocation, and certain auxiliary controls such as the
4301    ISO 6429 direction specification.  These are octet-oriented, with the
4302    single exception (AFAIK) of the "exit Unicode" sequence which uses the
4303    UTF's natural width (1 byte for UTF-7 and UTF-8, 2 bytes for UCS-2 and
4304    UTF-16, and 4 bytes for UCS-4 and UTF-32).  This will be treated as a
4305    (deprecated) special case in Unicode processing.
4306
4307    The middle layer is ISO 2022 character interpretation.  This will depend
4308    on the current state of the ISO 2022 registers, and assembles octets
4309    into the character's internal representation.
4310
4311    The lowest level is translating system control conventions.  At present
4312    this is restricted to newline translation, but one could imagine doing
4313    tab conversion or line wrapping here.  "Escape from Unicode" processing
4314    would be done at this level.
4315
4316    At each level the parser will verify the syntax.  In the case of a
4317    syntax error or warning (such as a redundant escape sequence that affects
4318    no characters), the parser will take some action, typically inserting the
4319    erroneous octets directly into the output and creating an annotation
4320    which can be used by higher level I/O to mark the affected region.
4321
4322    This should make it possible to do something sensible about separating
4323    newline convention processing from character construction, and about
4324    preventing ISO 2022 escape sequences from being recognized
4325    inappropriately.
4326
4327    The basic strategy will be to have octet classification tables, and
4328    switch processing according to the table entry.
4329
4330    It's possible that, by doing the processing with tables of functions or
4331    the like, the parser can be used for both detection and translation. */
4332
4333 static int
4334 parse_iso2022_esc(Lisp_Object codesys, struct iso2022_decoder *iso,
4335                   unsigned char c, unsigned int *flags,
4336                   int check_invalid_charsets)
4337 {
4338         /* (1) If we're at the end of a designation sequence, CS is the
4339            charset being designated and REG is the register to designate
4340            it to.
4341
4342            (2) If we're at the end of a locking-shift sequence, REG is
4343            the register to invoke and HALF (0 == left, 1 == right) is
4344            the half to invoke it into.
4345
4346            (3) If we're at the end of a single-shift sequence, REG is
4347            the register to invoke. */
4348         Lisp_Object cs = Qnil;
4349         int reg, half;
4350
4351         /* NOTE: This code does goto's all over the fucking place.
4352            The reason for this is that we're basically implementing
4353            a state machine here, and hierarchical languages like C
4354            don't really provide a clean way of doing this. */
4355
4356         if (!(*flags & CODING_STATE_ESCAPE))
4357                 /* At beginning of escape sequence; we need to reset our
4358                    escape-state variables. */
4359                 iso->esc = ISO_ESC_NOTHING;
4360
4361         iso->output_literally = 0;
4362         iso->output_direction_sequence = 0;
4363
4364         switch (iso->esc) {
4365         case ISO_ESC_NOTHING:
4366                 iso->esc_bytes_index = 0;
4367                 switch (c) {
4368                 case ISO_CODE_ESC:      /* Start escape sequence */
4369                         *flags |= CODING_STATE_ESCAPE;
4370                         iso->esc = ISO_ESC;
4371                         goto not_done;
4372
4373                 case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
4374                         *flags |= CODING_STATE_ESCAPE;
4375                         iso->esc = ISO_ESC_5_11;
4376                         goto not_done;
4377
4378                 case ISO_CODE_SO:       /* locking shift 1 */
4379                         reg = 1;
4380                         half = 0;
4381                         goto locking_shift;
4382                 case ISO_CODE_SI:       /* locking shift 0 */
4383                         reg = 0;
4384                         half = 0;
4385                         goto locking_shift;
4386
4387                 case ISO_CODE_SS2:      /* single shift */
4388                         reg = 2;
4389                         goto single_shift;
4390                 case ISO_CODE_SS3:      /* single shift */
4391                         reg = 3;
4392                         goto single_shift;
4393
4394                 default:        /* Other control characters */
4395                         return 0;
4396                 }
4397
4398         case ISO_ESC:
4399                 switch (c) {
4400           /**** single shift ****/
4401
4402                 case 'N':       /* single shift 2 */
4403                         reg = 2;
4404                         goto single_shift;
4405                 case 'O':       /* single shift 3 */
4406                         reg = 3;
4407                         goto single_shift;
4408
4409           /**** locking shift ****/
4410
4411                 case '~':       /* locking shift 1 right */
4412                         reg = 1;
4413                         half = 1;
4414                         goto locking_shift;
4415                 case 'n':       /* locking shift 2 */
4416                         reg = 2;
4417                         half = 0;
4418                         goto locking_shift;
4419                 case '}':       /* locking shift 2 right */
4420                         reg = 2;
4421                         half = 1;
4422                         goto locking_shift;
4423                 case 'o':       /* locking shift 3 */
4424                         reg = 3;
4425                         half = 0;
4426                         goto locking_shift;
4427                 case '|':       /* locking shift 3 right */
4428                         reg = 3;
4429                         half = 1;
4430                         goto locking_shift;
4431
4432 #ifdef ENABLE_COMPOSITE_CHARS
4433           /**** composite ****/
4434
4435                 case '0':
4436                         iso->esc = ISO_ESC_START_COMPOSITE;
4437                         *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4438                             CODING_STATE_COMPOSITE;
4439                         return 1;
4440
4441                 case '1':
4442                         iso->esc = ISO_ESC_END_COMPOSITE;
4443                         *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4444                             ~CODING_STATE_COMPOSITE;
4445                         return 1;
4446 #endif                          /* ENABLE_COMPOSITE_CHARS */
4447
4448           /**** directionality ****/
4449
4450                 case '[':
4451                         iso->esc = ISO_ESC_5_11;
4452                         goto not_done;
4453
4454           /**** designation ****/
4455
4456                 case '$':       /* multibyte charset prefix */
4457                         iso->esc = ISO_ESC_2_4;
4458                         goto not_done;
4459
4460                 default:
4461                         if (0x28 <= c && c <= 0x2F) {
4462                                 iso->esc =
4463                                     (enum iso_esc_flag)(c - 0x28 + ISO_ESC_2_8);
4464                                 goto not_done;
4465                         }
4466
4467                         /* This function is called with CODESYS equal to nil when
4468                            doing coding-system detection. */
4469                         if (!NILP(codesys)
4470                             && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
4471                             && fit_to_be_escape_quoted(c)) {
4472                                 iso->esc = ISO_ESC_LITERAL;
4473                                 *flags &= CODING_STATE_ISO2022_LOCK;
4474                                 return 1;
4475                         }
4476
4477                         /* bzzzt! */
4478                         return 0;
4479                 }
4480
4481       /**** directionality ****/
4482
4483         case ISO_ESC_5_11:      /* ISO6429 direction control */
4484                 if (c == ']') {
4485                         *flags &=
4486                             (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4487                         goto directionality;
4488                 }
4489                 if (c == '0')
4490                         iso->esc = ISO_ESC_5_11_0;
4491                 else if (c == '1')
4492                         iso->esc = ISO_ESC_5_11_1;
4493                 else if (c == '2')
4494                         iso->esc = ISO_ESC_5_11_2;
4495                 else
4496                         return 0;
4497                 goto not_done;
4498
4499         case ISO_ESC_5_11_0:
4500                 if (c == ']') {
4501                         *flags &=
4502                             (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4503                         goto directionality;
4504                 }
4505                 return 0;
4506
4507         case ISO_ESC_5_11_1:
4508                 if (c == ']') {
4509                         *flags =
4510                             (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4511                         goto directionality;
4512                 }
4513                 return 0;
4514
4515         case ISO_ESC_5_11_2:
4516                 if (c == ']') {
4517                         *flags =
4518                             (*flags & CODING_STATE_ISO2022_LOCK) |
4519                             CODING_STATE_R2L;
4520                         goto directionality;
4521                 }
4522                 return 0;
4523
4524         directionality:
4525                 iso->esc = ISO_ESC_DIRECTIONALITY;
4526                 /* Various junk here to attempt to preserve the direction
4527                    sequences literally in the text if they would otherwise be
4528                    swallowed due to invalid designations that don't show up as
4529                    actual charset changes in the text. */
4530                 if (iso->invalid_switch_dir) {
4531                         /* We already inserted a direction switch literally into
4532                            the text.  We assume (#### this may not be right)
4533                            that the next direction switch is the one going the
4534                            other way, and we need to output that literally as
4535                            well. */
4536                         iso->output_literally = 1;
4537                         iso->invalid_switch_dir = 0;
4538                 } else {
4539                         int jj;
4540
4541                         /* If we are in the thrall of an invalid designation,
4542                            then stick the directionality sequence literally into
4543                            the output stream so it ends up in the original text
4544                            again. */
4545                         for (jj = 0; jj < 4; jj++)
4546                                 if (iso->invalid_designated[jj])
4547                                         break;
4548                         if (jj < 4) {
4549                                 iso->output_literally = 1;
4550                                 iso->invalid_switch_dir = 1;
4551                         } else
4552                                 /* Indicate that we haven't yet seen a valid
4553                                    designation, so that if a switch-dir is
4554                                    directly followed by an invalid designation,
4555                                    both get inserted literally. */
4556                                 iso->switched_dir_and_no_valid_charset_yet = 1;
4557                 }
4558                 return 1;
4559
4560                 /**** designation ****/
4561
4562         case ISO_ESC_2_4:
4563                 if (0x28 <= c && c <= 0x2F) {
4564                         iso->esc =
4565                             (enum iso_esc_flag)(c - 0x28 + ISO_ESC_2_4_8);
4566                         goto not_done;
4567                 }
4568                 if (0x40 <= c && c <= 0x42) {
4569                         cs = CHARSET_BY_ATTRIBUTES(CHARSET_TYPE_94X94, c,
4570                                                    *flags & CODING_STATE_R2L ?
4571                                                    CHARSET_RIGHT_TO_LEFT :
4572                                                    CHARSET_LEFT_TO_RIGHT);
4573                         reg = 0;
4574                         goto designated;
4575                 }
4576                 return 0;
4577
4578                 /* list the rest */
4579         case ISO_ESC_2_8:
4580         case ISO_ESC_2_9:
4581         case ISO_ESC_2_10:
4582         case ISO_ESC_2_11:
4583         case ISO_ESC_2_12:
4584         case ISO_ESC_2_13:
4585         case ISO_ESC_2_14:
4586         case ISO_ESC_2_15:
4587         case ISO_ESC_2_4_8:
4588         case ISO_ESC_2_4_9:
4589         case ISO_ESC_2_4_10:
4590         case ISO_ESC_2_4_11:
4591         case ISO_ESC_2_4_12:
4592         case ISO_ESC_2_4_13:
4593         case ISO_ESC_2_4_14:
4594         case ISO_ESC_2_4_15:
4595         case ISO_ESC_SINGLE_SHIFT:
4596         case ISO_ESC_LOCKING_SHIFT:
4597         case ISO_ESC_DESIGNATE:
4598         case ISO_ESC_DIRECTIONALITY:
4599         case ISO_ESC_LITERAL:
4600
4601         default: {
4602                 int type = -1;
4603
4604                 if (c < '0' || c > '~')
4605                         return 0;       /* bad final byte */
4606
4607                 if (iso->esc >= ISO_ESC_2_8 && iso->esc <= ISO_ESC_2_15) {
4608                         type = ((iso->esc >= ISO_ESC_2_12) ?
4609                                 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4610                         reg = (iso->esc - ISO_ESC_2_8) & 3;
4611                 } else if (iso->esc >= ISO_ESC_2_4_8 &&
4612                            iso->esc <= ISO_ESC_2_4_15) {
4613                         type = ((iso->esc >= ISO_ESC_2_4_12) ?
4614                                 CHARSET_TYPE_96X96 :
4615                                 CHARSET_TYPE_94X94);
4616                         reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4617                 } else {
4618                         /* Can this ever be reached? -slb */
4619                         abort();
4620                         return 0;
4621                 }
4622
4623                 cs = CHARSET_BY_ATTRIBUTES(type, c,
4624                                            *flags & CODING_STATE_R2L ?
4625                                            CHARSET_RIGHT_TO_LEFT :
4626                                            CHARSET_LEFT_TO_RIGHT);
4627                 goto designated;
4628         }
4629         }
4630
4631       not_done:
4632         iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char)c;
4633         return -1;
4634
4635       single_shift:
4636         if (check_invalid_charsets && !CHARSETP(iso->charset[reg]))
4637                 /* can't invoke something that ain't there. */
4638                 return 0;
4639         iso->esc = ISO_ESC_SINGLE_SHIFT;
4640         *flags &= CODING_STATE_ISO2022_LOCK;
4641         if (reg == 2)
4642                 *flags |= CODING_STATE_SS2;
4643         else
4644                 *flags |= CODING_STATE_SS3;
4645         return 1;
4646
4647       locking_shift:
4648         if (check_invalid_charsets && !CHARSETP(iso->charset[reg]))
4649                 /* can't invoke something that ain't there. */
4650                 return 0;
4651         if (half)
4652                 iso->register_right = reg;
4653         else
4654                 iso->register_left = reg;
4655         *flags &= CODING_STATE_ISO2022_LOCK;
4656         iso->esc = ISO_ESC_LOCKING_SHIFT;
4657         return 1;
4658
4659       designated:
4660         if (NILP(cs) && check_invalid_charsets) {
4661                 iso->invalid_designated[reg] = 1;
4662                 iso->charset[reg] = Vcharset_ascii;
4663                 iso->esc = ISO_ESC_DESIGNATE;
4664                 *flags &= CODING_STATE_ISO2022_LOCK;
4665                 iso->output_literally = 1;
4666                 if (iso->switched_dir_and_no_valid_charset_yet) {
4667                         /* We encountered a switch-direction followed by an
4668                            invalid designation.  Ensure that the switch-direction
4669                            gets outputted; otherwise it will probably get eaten
4670                            when the text is written out again. */
4671                         iso->switched_dir_and_no_valid_charset_yet = 0;
4672                         iso->output_direction_sequence = 1;
4673                         /* And make sure that the switch-dir going the other
4674                            way gets outputted, as well. */
4675                         iso->invalid_switch_dir = 1;
4676                 }
4677                 return 1;
4678         }
4679         /* This function is called with CODESYS equal to nil when
4680            doing coding-system detection. */
4681         if (!NILP(codesys)) {
4682                 charset_conversion_spec_dynarr *dyn =
4683                     XCODING_SYSTEM(codesys)->iso2022.input_conv;
4684
4685                 if (dyn) {
4686                         int i;
4687
4688                         for (i = 0; i < Dynarr_length(dyn); i++) {
4689                                 struct charset_conversion_spec *spec =
4690                                     Dynarr_atp(dyn, i);
4691                                 if (EQ(cs, spec->from_charset))
4692                                         cs = spec->to_charset;
4693                         }
4694                 }
4695         }
4696
4697         iso->charset[reg] = cs;
4698         iso->esc = ISO_ESC_DESIGNATE;
4699         *flags &= CODING_STATE_ISO2022_LOCK;
4700         if (iso->invalid_designated[reg]) {
4701                 iso->invalid_designated[reg] = 0;
4702                 iso->output_literally = 1;
4703         }
4704         if (iso->switched_dir_and_no_valid_charset_yet)
4705                 iso->switched_dir_and_no_valid_charset_yet = 0;
4706         return 1;
4707 }
4708
4709 static int
4710 detect_coding_iso2022(struct detection_state *st, const Extbyte * src,
4711                       Lstream_data_count n)
4712 {
4713         int mask;
4714
4715         /* #### There are serious deficiencies in the recognition mechanism
4716            here.  This needs to be much smarter if it's going to cut it.
4717            The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4718            it should be detected as Latin-1.
4719            All the ISO2022 stuff in this file should be synced up with the
4720            code from FSF Emacs-20.4, in which Mule should be more or less stable.
4721            Perhaps we should wait till R2L works in FSF Emacs? */
4722
4723         if (!st->iso2022.initted) {
4724                 reset_iso2022(Qnil, &st->iso2022.iso);
4725                 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4726                                     CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4727                                     CODING_CATEGORY_ISO_8_1_MASK |
4728                                     CODING_CATEGORY_ISO_8_2_MASK |
4729                                     CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4730                 st->iso2022.flags = 0;
4731                 st->iso2022.high_byte_count = 0;
4732                 st->iso2022.saw_single_shift = 0;
4733                 st->iso2022.initted = 1;
4734         }
4735
4736         mask = st->iso2022.mask;
4737
4738         while (n--) {
4739                 const unsigned char c = *(const unsigned char *)src++;
4740                 if (c >= 0xA0) {
4741                         mask &= ~CODING_CATEGORY_ISO_7_MASK;
4742                         st->iso2022.high_byte_count++;
4743                 } else {
4744                         if (st->iso2022.high_byte_count
4745                             && !st->iso2022.saw_single_shift) {
4746                                 if (st->iso2022.high_byte_count & 1)
4747                                         /* odd number of high bytes; assume not iso-8-2 */
4748                                         mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4749                         }
4750                         st->iso2022.high_byte_count = 0;
4751                         st->iso2022.saw_single_shift = 0;
4752                         if (c > 0x80)
4753                                 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4754                 }
4755                 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4756                     && (BYTE_C0_P(c) || BYTE_C1_P(c))) {        /* control chars */
4757                         switch (c) {
4758                                 /* Allow and ignore control characters that you might
4759                                    reasonably see in a text file */
4760                         case '\r':
4761                         case '\n':
4762                         case '\t':
4763                         case 7: /* bell */
4764                         case 8: /* backspace */
4765                         case 11:        /* vertical tab */
4766                         case 12:        /* form feed */
4767                         case 26:        /* MS-DOS C-z junk */
4768                         case 31:        /* '^_' -- for info */
4769                                 goto label_continue_loop;
4770
4771                         default:
4772                                 break;
4773                         }
4774                 }
4775
4776                 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P(c)
4777                     || BYTE_C1_P(c)) {
4778                         if (parse_iso2022_esc(Qnil, &st->iso2022.iso, c,
4779                                               &st->iso2022.flags, 0)) {
4780                                 switch (st->iso2022.iso.esc) {
4781                                 case ISO_ESC_DESIGNATE:
4782                                         mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4783                                         mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4784                                         break;
4785                                 case ISO_ESC_LOCKING_SHIFT:
4786                                         mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4787                                         goto ran_out_of_chars;
4788                                 case ISO_ESC_SINGLE_SHIFT:
4789                                         mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4790                                         st->iso2022.saw_single_shift = 1;
4791                                         break;
4792
4793                                         /* list the rest */
4794                                 case ISO_ESC_NOTHING:
4795                                 case ISO_ESC:
4796                                 case ISO_ESC_2_4:
4797                                 case ISO_ESC_2_8:
4798                                 case ISO_ESC_2_9:
4799                                 case ISO_ESC_2_10:
4800                                 case ISO_ESC_2_11:
4801                                 case ISO_ESC_2_12:
4802                                 case ISO_ESC_2_13:
4803                                 case ISO_ESC_2_14:
4804                                 case ISO_ESC_2_15:
4805                                 case ISO_ESC_2_4_8:
4806                                 case ISO_ESC_2_4_9:
4807                                 case ISO_ESC_2_4_10:
4808                                 case ISO_ESC_2_4_11:
4809                                 case ISO_ESC_2_4_12:
4810                                 case ISO_ESC_2_4_13:
4811                                 case ISO_ESC_2_4_14:
4812                                 case ISO_ESC_2_4_15:
4813                                 case ISO_ESC_5_11:
4814                                 case ISO_ESC_5_11_0:
4815                                 case ISO_ESC_5_11_1:
4816                                 case ISO_ESC_5_11_2:
4817                                 case ISO_ESC_DIRECTIONALITY:
4818                                 case ISO_ESC_LITERAL:
4819                                 default:
4820                                         break;
4821                                 }
4822                         } else {
4823                                 mask = 0;
4824                                 goto ran_out_of_chars;
4825                         }
4826                 }
4827         label_continue_loop:;
4828         }
4829
4830 ran_out_of_chars:
4831         return mask;
4832 }
4833
4834 static int postprocess_iso2022_mask(int mask)
4835 {
4836         /* #### kind of cheesy */
4837         /* If seven-bit ISO is allowed, then assume that the encoding is
4838            entirely seven-bit and turn off the eight-bit ones. */
4839         if (mask & CODING_CATEGORY_ISO_7_MASK)
4840                 mask &= ~(CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4841                           CODING_CATEGORY_ISO_8_1_MASK |
4842                           CODING_CATEGORY_ISO_8_2_MASK);
4843         return mask;
4844 }
4845
4846 /* If FLAGS is a null pointer or specifies right-to-left motion,
4847    output a switch-dir-to-left-to-right sequence to DST.
4848    Also update FLAGS if it is not a null pointer.
4849    If INTERNAL_P is set, we are outputting in internal format and
4850    need to handle the CSI differently. */
4851
4852 static void
4853 restore_left_to_right_direction(Lisp_Coding_System * codesys,
4854                                 unsigned_char_dynarr * dst,
4855                                 unsigned int *flags, int internal_p)
4856 {
4857         if (!flags || (*flags & CODING_STATE_R2L)) {
4858                 if (CODING_SYSTEM_ISO2022_SEVEN(codesys)) {
4859                         Dynarr_add(dst, ISO_CODE_ESC);
4860                         Dynarr_add(dst, '[');
4861                 } else if (internal_p)
4862                         DECODE_ADD_BINARY_CHAR(ISO_CODE_CSI, dst);
4863                 else
4864                         Dynarr_add(dst, ISO_CODE_CSI);
4865                 Dynarr_add(dst, '0');
4866                 Dynarr_add(dst, ']');
4867                 if (flags)
4868                         *flags &= ~CODING_STATE_R2L;
4869         }
4870 }
4871
4872 /* If FLAGS is a null pointer or specifies a direction different from
4873    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4874    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4875    sequence to DST.  Also update FLAGS if it is not a null pointer.
4876    If INTERNAL_P is set, we are outputting in internal format and
4877    need to handle the CSI differently. */
4878
4879 static void
4880 ensure_correct_direction(int direction, Lisp_Coding_System * codesys,
4881                          unsigned_char_dynarr * dst, unsigned int *flags,
4882                          int internal_p)
4883 {
4884         if ((!flags || (*flags & CODING_STATE_R2L)) &&
4885             direction == CHARSET_LEFT_TO_RIGHT)
4886                 restore_left_to_right_direction(codesys, dst, flags,
4887                                                 internal_p);
4888         else if (!CODING_SYSTEM_ISO2022_NO_ISO6429(codesys)
4889                  && (!flags || !(*flags & CODING_STATE_R2L)) &&
4890                  direction == CHARSET_RIGHT_TO_LEFT) {
4891                 if (CODING_SYSTEM_ISO2022_SEVEN(codesys)) {
4892                         Dynarr_add(dst, ISO_CODE_ESC);
4893                         Dynarr_add(dst, '[');
4894                 } else if (internal_p)
4895                         DECODE_ADD_BINARY_CHAR(ISO_CODE_CSI, dst);
4896                 else
4897                         Dynarr_add(dst, ISO_CODE_CSI);
4898                 Dynarr_add(dst, '2');
4899                 Dynarr_add(dst, ']');
4900                 if (flags)
4901                         *flags |= CODING_STATE_R2L;
4902         }
4903 }
4904
4905 /* Convert ISO2022-format data to internal format. */
4906
4907 static void
4908 decode_coding_iso2022(lstream_t decoding, const Extbyte * src,
4909                       unsigned_char_dynarr * dst, Lstream_data_count n)
4910 {
4911         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
4912         unsigned int flags = str->flags;
4913         unsigned int ch = str->ch;
4914         eol_type_t eol_type = str->eol_type;
4915 #ifdef ENABLE_COMPOSITE_CHARS
4916         unsigned_char_dynarr *real_dst = dst;
4917 #endif
4918         Lisp_Object coding_system;
4919
4920         XSETCODING_SYSTEM(coding_system, str->codesys);
4921
4922 #ifdef ENABLE_COMPOSITE_CHARS
4923         if (flags & CODING_STATE_COMPOSITE)
4924                 dst = str->iso2022.composite_chars;
4925 #endif                          /* ENABLE_COMPOSITE_CHARS */
4926
4927         while (n--) {
4928                 const unsigned char c = *(const unsigned char *)src++;
4929                 if (flags & CODING_STATE_ESCAPE) {
4930                         /* Within ESC sequence */
4931                         int retval = parse_iso2022_esc(
4932                                 coding_system, &str->iso2022, c, &flags, 1);
4933
4934                         if (retval) {
4935                                 switch (str->iso2022.esc) {
4936 #ifdef ENABLE_COMPOSITE_CHARS
4937                                 case ISO_ESC_START_COMPOSITE:
4938                                         if (str->iso2022.composite_chars)
4939                                                 Dynarr_reset(str->iso2022.
4940                                                              composite_chars);
4941                                         else
4942                                                 str->iso2022.composite_chars =
4943                                                     Dynarr_new(unsigned_char);
4944                                         dst = str->iso2022.composite_chars;
4945                                         break;
4946                                 case ISO_ESC_END_COMPOSITE:
4947                                         {
4948                                                 Bufbyte comstr[MAX_EMCHAR_LEN];
4949                                                 Bytecount len;
4950                                                 Emchar emch =
4951                                                     lookup_composite_char
4952                                                     (Dynarr_atp(dst, 0),
4953                                                      Dynarr_length(dst));
4954                                                 dst = real_dst;
4955                                                 len =
4956                                                     set_charptr_emchar(comstr,
4957                                                                        emch);
4958                                                 Dynarr_add_many(dst, comstr,
4959                                                                 len);
4960                                                 break;
4961                                         }
4962 #endif                          /* ENABLE_COMPOSITE_CHARS */
4963
4964                                 case ISO_ESC_LITERAL:
4965                                         DECODE_ADD_BINARY_CHAR(c, dst);
4966                                         break;
4967
4968                                 case ISO_ESC_NOTHING:
4969                                 case ISO_ESC:
4970                                 case ISO_ESC_2_4:
4971                                 case ISO_ESC_2_8:
4972                                 case ISO_ESC_2_9:
4973                                 case ISO_ESC_2_10:
4974                                 case ISO_ESC_2_11:
4975                                 case ISO_ESC_2_12:
4976                                 case ISO_ESC_2_13:
4977                                 case ISO_ESC_2_14:
4978                                 case ISO_ESC_2_15:
4979                                 case ISO_ESC_2_4_8:
4980                                 case ISO_ESC_2_4_9:
4981                                 case ISO_ESC_2_4_10:
4982                                 case ISO_ESC_2_4_11:
4983                                 case ISO_ESC_2_4_12:
4984                                 case ISO_ESC_2_4_13:
4985                                 case ISO_ESC_2_4_14:
4986                                 case ISO_ESC_2_4_15:
4987                                 case ISO_ESC_5_11:
4988                                 case ISO_ESC_5_11_0:
4989                                 case ISO_ESC_5_11_1:
4990                                 case ISO_ESC_5_11_2:
4991                                 case ISO_ESC_SINGLE_SHIFT:
4992                                 case ISO_ESC_LOCKING_SHIFT:
4993                                 case ISO_ESC_DESIGNATE:
4994                                 case ISO_ESC_DIRECTIONALITY:
4995
4996                                 default:
4997                                         /* Everything else handled already */
4998                                         break;
4999                                 }
5000                         }
5001
5002                         /* Attempted error recovery. */
5003                         if (str->iso2022.output_direction_sequence)
5004                                 ensure_correct_direction(flags &
5005                                                          CODING_STATE_R2L ?
5006                                                          CHARSET_RIGHT_TO_LEFT :
5007                                                          CHARSET_LEFT_TO_RIGHT,
5008                                                          str->codesys, dst, 0,
5009                                                          1);
5010                         /* More error recovery. */
5011                         if (!retval || str->iso2022.output_literally) {
5012                                 /* Output the (possibly invalid) sequence */
5013                                 int i;
5014                                 for (i = 0; i < str->iso2022.esc_bytes_index;
5015                                      i++)
5016                                         DECODE_ADD_BINARY_CHAR(str->iso2022.
5017                                                                esc_bytes[i],
5018                                                                dst);
5019                                 flags &= CODING_STATE_ISO2022_LOCK;
5020                                 if (!retval)
5021                                         n++, src--;     /* Repeat the loop with the same character. */
5022                                 else {
5023                                         /* No sense in reprocessing the final byte of the
5024                                            escape sequence; it could mess things up anyway.
5025                                            Just add it now. */
5026                                         DECODE_ADD_BINARY_CHAR(c, dst);
5027                                 }
5028                         }
5029                         ch = 0;
5030                 } else if (BYTE_C0_P(c) || BYTE_C1_P(c)) {      /* Control characters */
5031
5032           /***** Error-handling *****/
5033
5034                         /* If we were in the middle of a character, dump out the
5035                            partial character. */
5036                         DECODE_OUTPUT_PARTIAL_CHAR(ch);
5037
5038                         /* If we just saw a single-shift character, dump it out.
5039                            This may dump out the wrong sort of single-shift character,
5040                            but least it will give an indication that something went
5041                            wrong. */
5042                         if (flags & CODING_STATE_SS2) {
5043                                 DECODE_ADD_BINARY_CHAR(ISO_CODE_SS2, dst);
5044                                 flags &= ~CODING_STATE_SS2;
5045                         }
5046                         if (flags & CODING_STATE_SS3) {
5047                                 DECODE_ADD_BINARY_CHAR(ISO_CODE_SS3, dst);
5048                                 flags &= ~CODING_STATE_SS3;
5049                         }
5050
5051           /***** Now handle the control characters. *****/
5052
5053                         /* Handle CR/LF */
5054                         DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5055
5056                         flags &= CODING_STATE_ISO2022_LOCK;
5057
5058                         if (!parse_iso2022_esc
5059                             (coding_system, &str->iso2022, c, &flags, 1))
5060                                 DECODE_ADD_BINARY_CHAR(c, dst);
5061                 } else {        /* Graphic characters */
5062                         Lisp_Object charset;
5063                         int lb;
5064                         int reg;
5065
5066                         DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5067
5068                         /* Now determine the charset. */
5069                         reg = ((flags & CODING_STATE_SS2) ? 2
5070                                : (flags & CODING_STATE_SS3) ? 3
5071                                : !BYTE_ASCII_P(c) ? str->iso2022.register_right
5072                                : str->iso2022.register_left);
5073                         charset = str->iso2022.charset[reg];
5074
5075                         /* Error checking: */
5076                         if (!CHARSETP(charset)
5077                             || str->iso2022.invalid_designated[reg]
5078                             ||
5079                             (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5080                              && XCHARSET_CHARS(charset) == 94))
5081                                 /* Mrmph.  We are trying to invoke a register that has no
5082                                    or an invalid charset in it, or trying to add a character
5083                                    outside the range of the charset.  Insert that char literally
5084                                    to preserve it for the output. */
5085                         {
5086                                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5087                                 DECODE_ADD_BINARY_CHAR(c, dst);
5088                         }
5089
5090                         else {
5091                                 /* Things are probably hunky-dorey. */
5092
5093                                 /* Fetch reverse charset, maybe. */
5094                                 if (((flags & CODING_STATE_R2L) &&
5095                                      XCHARSET_DIRECTION(charset) ==
5096                                      CHARSET_LEFT_TO_RIGHT)
5097                                     || (!(flags & CODING_STATE_R2L)
5098                                         && XCHARSET_DIRECTION(charset) ==
5099                                         CHARSET_RIGHT_TO_LEFT)) {
5100                                         Lisp_Object new_charset =
5101                                             XCHARSET_REVERSE_DIRECTION_CHARSET
5102                                             (charset);
5103                                         if (!NILP(new_charset))
5104                                                 charset = new_charset;
5105                                 }
5106
5107                                 lb = XCHARSET_LEADING_BYTE(charset);
5108                                 switch (XCHARSET_REP_BYTES(charset)) {
5109                                 case 1: /* ASCII */
5110                                         DECODE_OUTPUT_PARTIAL_CHAR(ch);
5111                                         Dynarr_add(dst, c & 0x7F);
5112                                         break;
5113
5114                                 case 2: /* one-byte official */
5115                                         DECODE_OUTPUT_PARTIAL_CHAR(ch);
5116                                         Dynarr_add(dst, lb);
5117                                         Dynarr_add(dst, c | 0x80);
5118                                         break;
5119
5120                                 case 3: /* one-byte private or two-byte official */
5121                                         if (XCHARSET_PRIVATE_P(charset)) {
5122                                                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5123                                                 Dynarr_add(dst,
5124                                                            PRE_LEADING_BYTE_PRIVATE_1);
5125                                                 Dynarr_add(dst, lb);
5126                                                 Dynarr_add(dst, c | 0x80);
5127                                         } else {
5128                                                 if (ch) {
5129                                                         Dynarr_add(dst, lb);
5130                                                         Dynarr_add(dst,
5131                                                                    ch | 0x80);
5132                                                         Dynarr_add(dst,
5133                                                                    c | 0x80);
5134                                                         ch = 0;
5135                                                 } else
5136                                                         ch = c;
5137                                         }
5138                                         break;
5139
5140                                 default:        /* two-byte private */
5141                                         if (ch) {
5142                                                 Dynarr_add(dst,
5143                                                            PRE_LEADING_BYTE_PRIVATE_2);
5144                                                 Dynarr_add(dst, lb);
5145                                                 Dynarr_add(dst, ch | 0x80);
5146                                                 Dynarr_add(dst, c | 0x80);
5147                                                 ch = 0;
5148                                         } else
5149                                                 ch = c;
5150                                 }
5151                         }
5152
5153                         if (!ch)
5154                                 flags &= CODING_STATE_ISO2022_LOCK;
5155                 }
5156
5157               label_continue_loop:;
5158         }
5159
5160         if (flags & CODING_STATE_END)
5161                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5162
5163         str->flags = flags;
5164         str->ch = ch;
5165 }
5166
5167 /***** ISO2022 encoder *****/
5168
5169 /* Designate CHARSET into register REG. */
5170
5171 static void
5172 iso2022_designate(Lisp_Object charset, unsigned char reg,
5173                   encoding_stream_t str, unsigned_char_dynarr * dst)
5174 {
5175         static const char inter94[] = "()*+";
5176         static const char inter96[] = ",-./";
5177         unsigned int type;
5178         unsigned char final;
5179         Lisp_Object old_charset = str->iso2022.charset[reg];
5180
5181         str->iso2022.charset[reg] = charset;
5182         if (!CHARSETP(charset))
5183                 /* charset might be an initial nil or t. */
5184                 return;
5185         type = XCHARSET_TYPE(charset);
5186         final = XCHARSET_FINAL(charset);
5187         if (!str->iso2022.force_charset_on_output[reg] &&
5188             CHARSETP(old_charset) &&
5189             XCHARSET_TYPE(old_charset) == type &&
5190             XCHARSET_FINAL(old_charset) == final)
5191                 return;
5192
5193         str->iso2022.force_charset_on_output[reg] = 0;
5194
5195         {
5196                 charset_conversion_spec_dynarr *dyn =
5197                     str->codesys->iso2022.output_conv;
5198
5199                 if (dyn) {
5200                         int i;
5201
5202                         for (i = 0; i < Dynarr_length(dyn); i++) {
5203                                 struct charset_conversion_spec *spec =
5204                                     Dynarr_atp(dyn, i);
5205                                 if (EQ(charset, spec->from_charset))
5206                                         charset = spec->to_charset;
5207                         }
5208                 }
5209         }
5210
5211         Dynarr_add(dst, ISO_CODE_ESC);
5212         switch (type) {
5213         case CHARSET_TYPE_94:
5214                 Dynarr_add(dst, inter94[reg]);
5215                 break;
5216         case CHARSET_TYPE_96:
5217                 Dynarr_add(dst, inter96[reg]);
5218                 break;
5219         case CHARSET_TYPE_94X94:
5220                 Dynarr_add(dst, '$');
5221                 if (reg != 0 || !(CODING_SYSTEM_ISO2022_SHORT(str->codesys))
5222                     || final < '@' || final > 'B')
5223                         Dynarr_add(dst, inter94[reg]);
5224                 break;
5225         case CHARSET_TYPE_96X96:
5226                 Dynarr_add(dst, '$');
5227                 Dynarr_add(dst, inter96[reg]);
5228                 break;
5229         default:
5230                 break;
5231         }
5232         Dynarr_add(dst, final);
5233 }
5234
5235 static void
5236 ensure_normal_shift(encoding_stream_t str, unsigned_char_dynarr * dst)
5237 {
5238         if (str->iso2022.register_left != 0) {
5239                 Dynarr_add(dst, ISO_CODE_SI);
5240                 str->iso2022.register_left = 0;
5241         }
5242 }
5243
5244 static void
5245 ensure_shift_out(encoding_stream_t str, unsigned_char_dynarr * dst)
5246 {
5247         if (str->iso2022.register_left != 1) {
5248                 Dynarr_add(dst, ISO_CODE_SO);
5249                 str->iso2022.register_left = 1;
5250         }
5251 }
5252
5253 /* Convert internally-formatted data to ISO2022 format. */
5254
5255 static void
5256 encode_coding_iso2022(lstream_t encoding, const Bufbyte * src,
5257                       unsigned_char_dynarr * dst, Lstream_data_count n)
5258 {
5259         unsigned char charmask, c;
5260         unsigned char char_boundary;
5261         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
5262         unsigned int flags = str->flags;
5263         unsigned int ch = str->ch;
5264         Lisp_Coding_System *codesys = str->codesys;
5265         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
5266         int i;
5267         Lisp_Object charset;
5268         int half;
5269
5270 #ifdef ENABLE_COMPOSITE_CHARS
5271         /* flags for handling composite chars.  We do a little switcharoo
5272            on the source while we're outputting the composite char. */
5273         unsigned int saved_n = 0;
5274         const unsigned char *saved_src = NULL;
5275         int in_composite = 0;
5276 #endif                          /* ENABLE_COMPOSITE_CHARS */
5277
5278         char_boundary = str->iso2022.current_char_boundary;
5279         charset = str->iso2022.current_charset;
5280         half = str->iso2022.current_half;
5281
5282 #ifdef ENABLE_COMPOSITE_CHARS
5283       back_to_square_n:
5284 #endif
5285         while (n--) {
5286                 c = *src++;
5287
5288                 if (BYTE_ASCII_P(c)) {  /* Processing ASCII character */
5289                         ch = 0;
5290
5291                         restore_left_to_right_direction(codesys, dst, &flags,
5292                                                         0);
5293
5294                         /* Make sure G0 contains ASCII */
5295                         if ((c > ' ' && c < ISO_CODE_DEL) ||
5296                             !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL(codesys)) {
5297                                 ensure_normal_shift(str, dst);
5298                                 iso2022_designate(Vcharset_ascii, 0, str, dst);
5299                         }
5300
5301                         /* If necessary, restore everything to the default state
5302                            at end-of-line */
5303                         if (c == '\n' &&
5304                             !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL(codesys))) {
5305                                 restore_left_to_right_direction(codesys, dst,
5306                                                                 &flags, 0);
5307
5308                                 ensure_normal_shift(str, dst);
5309
5310                                 for (i = 0; i < 4; i++) {
5311                                         Lisp_Object initial_charset =
5312                                             CODING_SYSTEM_ISO2022_INITIAL_CHARSET
5313                                             (codesys, i);
5314                                         iso2022_designate(initial_charset, i,
5315                                                           str, dst);
5316                                 }
5317                         }
5318                         if (c == '\n') {
5319                                 if (eol_type != EOL_LF
5320                                     && eol_type != EOL_AUTODETECT)
5321                                         Dynarr_add(dst, '\r');
5322                                 if (eol_type != EOL_CR)
5323                                         Dynarr_add(dst, c);
5324                         } else {
5325                                 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
5326                                     && fit_to_be_escape_quoted(c))
5327                                         Dynarr_add(dst, ISO_CODE_ESC);
5328                                 Dynarr_add(dst, c);
5329                         }
5330                         char_boundary = 1;
5331                 }
5332
5333                 else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) {     /* Processing Leading Byte */
5334                         ch = 0;
5335                         charset = CHARSET_BY_LEADING_BYTE(c);
5336                         if (LEADING_BYTE_PREFIX_P(c))
5337                                 ch = c;
5338                         else if (!EQ(charset, Vcharset_control_1)
5339 #ifdef ENABLE_COMPOSITE_CHARS
5340                                  && !EQ(charset, Vcharset_composite)
5341 #endif
5342                             ) {
5343                                 int reg;
5344
5345                                 ensure_correct_direction(XCHARSET_DIRECTION
5346                                                          (charset), codesys,
5347                                                          dst, &flags, 0);
5348
5349                                 /* Now determine which register to use. */
5350                                 reg = -1;
5351                                 for (i = 0; i < 4; i++) {
5352                                         if (EQ(charset, str->iso2022.charset[i])
5353                                             || EQ(charset,
5354                                                   CODING_SYSTEM_ISO2022_INITIAL_CHARSET
5355                                                   (codesys, i))) {
5356                                                 reg = i;
5357                                                 break;
5358                                         }
5359                                 }
5360
5361                                 if (reg == -1) {
5362                                         if (XCHARSET_GRAPHIC(charset) != 0) {
5363                                                 if (!NILP
5364                                                     (str->iso2022.charset[1])
5365                                                     &&
5366                                                     (!CODING_SYSTEM_ISO2022_SEVEN
5367                                                      (codesys)
5368                                                      ||
5369                                                      CODING_SYSTEM_ISO2022_LOCK_SHIFT
5370                                                      (codesys)))
5371                                                         reg = 1;
5372                                                 else if (!NILP
5373                                                          (str->iso2022.
5374                                                           charset[2]))
5375                                                         reg = 2;
5376                                                 else if (!NILP
5377                                                          (str->iso2022.
5378                                                           charset[3]))
5379                                                         reg = 3;
5380                                                 else
5381                                                         reg = 0;
5382                                         } else
5383                                                 reg = 0;
5384                                 }
5385
5386                                 iso2022_designate(charset, reg, str, dst);
5387
5388                                 /* Now invoke that register. */
5389                                 switch (reg) {
5390                                 case 0:
5391                                         ensure_normal_shift(str, dst);
5392                                         half = 0;
5393                                         break;
5394
5395                                 case 1:
5396                                         if (CODING_SYSTEM_ISO2022_SEVEN
5397                                             (codesys)) {
5398                                                 ensure_shift_out(str, dst);
5399                                                 half = 0;
5400                                         } else
5401                                                 half = 1;
5402                                         break;
5403
5404                                 case 2:
5405                                         if (CODING_SYSTEM_ISO2022_SEVEN
5406                                             (str->codesys)) {
5407                                                 Dynarr_add(dst, ISO_CODE_ESC);
5408                                                 Dynarr_add(dst, 'N');
5409                                                 half = 0;
5410                                         } else {
5411                                                 Dynarr_add(dst, ISO_CODE_SS2);
5412                                                 half = 1;
5413                                         }
5414                                         break;
5415
5416                                 case 3:
5417                                         if (CODING_SYSTEM_ISO2022_SEVEN
5418                                             (str->codesys)) {
5419                                                 Dynarr_add(dst, ISO_CODE_ESC);
5420                                                 Dynarr_add(dst, 'O');
5421                                                 half = 0;
5422                                         } else {
5423                                                 Dynarr_add(dst, ISO_CODE_SS3);
5424                                                 half = 1;
5425                                         }
5426                                         break;
5427
5428                                 default:
5429                                         abort();
5430                                 }
5431                         }
5432                         char_boundary = 0;
5433                 } else {        /* Processing Non-ASCII character */
5434                         charmask = (half == 0 ? 0x7F : 0xFF);
5435                         char_boundary = 1;
5436                         if (EQ(charset, Vcharset_control_1)) {
5437                                 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
5438                                     && fit_to_be_escape_quoted(c))
5439                                         Dynarr_add(dst, ISO_CODE_ESC);
5440                                 /* you asked for it ... */
5441                                 Dynarr_add(dst, c - 0x20);
5442                         } else {
5443                                 switch (XCHARSET_REP_BYTES(charset)) {
5444                                 case 2:
5445                                         Dynarr_add(dst, c & charmask);
5446                                         break;
5447                                 case 3:
5448                                         if (XCHARSET_PRIVATE_P(charset)) {
5449                                                 Dynarr_add(dst, c & charmask);
5450                                                 ch = 0;
5451                                         } else if (ch) {
5452 #ifdef ENABLE_COMPOSITE_CHARS
5453                                                 if (EQ
5454                                                     (charset,
5455                                                      Vcharset_composite)) {
5456                                                         if (in_composite) {
5457                                                                 /* #### Bother! We don't know how to
5458                                                                    handle this yet. */
5459                                                                 Dynarr_add(dst,
5460                                                                            '~');
5461                                                         } else {
5462                                                                 Emchar emch =
5463                                                                     MAKE_CHAR
5464                                                                     (Vcharset_composite,
5465                                                                      ch & 0x7F,
5466                                                                      c & 0x7F);
5467                                                                 Lisp_Object lstr
5468                                                                     =
5469                                                                     composite_char_string
5470                                                                     (emch);
5471                                                                 saved_n = n;
5472                                                                 saved_src = src;
5473                                                                 in_composite =
5474                                                                     1;
5475                                                                 src =
5476                                                                     XSTRING_DATA
5477                                                                     (lstr);
5478                                                                 n = XSTRING_LENGTH(lstr);
5479                                                                 Dynarr_add(dst,
5480                                                                            ISO_CODE_ESC);
5481                                                                 Dynarr_add(dst, '0');   /* start composing */
5482                                                         }
5483                                                 } else
5484 #endif                          /* ENABLE_COMPOSITE_CHARS */
5485                                                 {
5486                                                         Dynarr_add(dst,
5487                                                                    ch &
5488                                                                    charmask);
5489                                                         Dynarr_add(dst,
5490                                                                    c &
5491                                                                    charmask);
5492                                                 }
5493                                                 ch = 0;
5494                                         } else {
5495                                                 ch = c;
5496                                                 char_boundary = 0;
5497                                         }
5498                                         break;
5499                                 case 4:
5500                                         if (ch) {
5501                                                 Dynarr_add(dst, ch & charmask);
5502                                                 Dynarr_add(dst, c & charmask);
5503                                                 ch = 0;
5504                                         } else {
5505                                                 ch = c;
5506                                                 char_boundary = 0;
5507                                         }
5508                                         break;
5509                                 default:
5510                                         abort();
5511                                 }
5512                         }
5513                 }
5514         }
5515
5516 #ifdef ENABLE_COMPOSITE_CHARS
5517         if (in_composite) {
5518                 n = saved_n;
5519                 src = saved_src;
5520                 in_composite = 0;
5521                 Dynarr_add(dst, ISO_CODE_ESC);
5522                 Dynarr_add(dst, '1');   /* end composing */
5523                 goto back_to_square_n;  /* Wheeeeeeeee ..... */
5524         }
5525 #endif                          /* ENABLE_COMPOSITE_CHARS */
5526
5527         if (char_boundary && flags & CODING_STATE_END) {
5528                 restore_left_to_right_direction(codesys, dst, &flags, 0);
5529                 ensure_normal_shift(str, dst);
5530                 for (i = 0; i < 4; i++) {
5531                         Lisp_Object initial_charset =
5532                             CODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, i);
5533                         iso2022_designate(initial_charset, i, str, dst);
5534                 }
5535         }
5536
5537         str->flags = flags;
5538         str->ch = ch;
5539         str->iso2022.current_char_boundary = char_boundary;
5540         str->iso2022.current_charset = charset;
5541         str->iso2022.current_half = half;
5542
5543         /* Verbum caro factum est! */
5544 }
5545 #endif                          /* MULE */
5546 \f
5547 /************************************************************************/
5548 /*                     No-conversion methods                            */
5549 /************************************************************************/
5550
5551 /* This is used when reading in "binary" files -- i.e. files that may
5552    contain all 256 possible byte values and that are not to be
5553    interpreted as being in any particular decoding. */
5554 static void
5555 decode_coding_no_conversion(lstream_t decoding, const Extbyte * src,
5556                             unsigned_char_dynarr * dst, Lstream_data_count n)
5557 {
5558         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
5559         unsigned int flags = str->flags;
5560         unsigned int ch = str->ch;
5561         eol_type_t eol_type = str->eol_type;
5562
5563         while (n--) {
5564                 const unsigned char c = *(const unsigned char *)src++;
5565
5566                 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5567                 DECODE_ADD_BINARY_CHAR(c, dst);
5568         label_continue_loop:;
5569         }
5570
5571         DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
5572
5573         str->flags = flags;
5574         str->ch = ch;
5575 }
5576
5577 static void
5578 encode_coding_no_conversion(lstream_t encoding, const Bufbyte * src,
5579                             unsigned_char_dynarr * dst, Lstream_data_count n)
5580 {
5581         unsigned char c;
5582         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
5583         unsigned int flags = str->flags;
5584         unsigned int ch = str->ch;
5585         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
5586
5587         while (n--) {
5588                 c = *src++;
5589                 if (c == '\n') {
5590                         if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5591                                 Dynarr_add(dst, '\r');
5592                         if (eol_type != EOL_CR)
5593                                 Dynarr_add(dst, '\n');
5594                         ch = 0;
5595                 } else if (BYTE_ASCII_P(c)) {
5596                         assert(ch == 0);
5597                         Dynarr_add(dst, c);
5598                 } else if (BUFBYTE_LEADING_BYTE_P(c)) {
5599                         assert(ch == 0);
5600                         if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5601                             c == LEADING_BYTE_CONTROL_1)
5602                                 ch = c;
5603                         else
5604                                 Dynarr_add(dst, '~');   /* untranslatable character */
5605                 } else {
5606                         if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5607                                 Dynarr_add(dst, c);
5608                         else if (ch == LEADING_BYTE_CONTROL_1) {
5609                                 assert(c < 0xC0);
5610                                 Dynarr_add(dst, c - 0x20);
5611                         }
5612                         /* else it should be the second or third byte of an
5613                            untranslatable character, so ignore it */
5614                         ch = 0;
5615                 }
5616         }
5617
5618         str->flags = flags;
5619         str->ch = ch;
5620 }
5621 \f
5622 /************************************************************************/
5623 /*                             Initialization                           */
5624 /************************************************************************/
5625
5626 void syms_of_file_coding(void)
5627 {
5628         INIT_LRECORD_IMPLEMENTATION(coding_system);
5629
5630         DEFERROR_STANDARD(Qcoding_system_error, Qio_error);
5631
5632         DEFSUBR(Fcoding_system_p);
5633         DEFSUBR(Ffind_coding_system);
5634         DEFSUBR(Fget_coding_system);
5635         DEFSUBR(Fcoding_system_list);
5636         DEFSUBR(Fcoding_system_name);
5637         DEFSUBR(Fmake_coding_system);
5638         DEFSUBR(Fcopy_coding_system);
5639         DEFSUBR(Fcoding_system_canonical_name_p);
5640         DEFSUBR(Fcoding_system_alias_p);
5641         DEFSUBR(Fcoding_system_aliasee);
5642         DEFSUBR(Fdefine_coding_system_alias);
5643         DEFSUBR(Fsubsidiary_coding_system);
5644
5645         DEFSUBR(Fcoding_system_type);
5646         DEFSUBR(Fcoding_system_doc_string);
5647 #ifdef MULE
5648         DEFSUBR(Fcoding_system_charset);
5649 #endif
5650         DEFSUBR(Fcoding_system_property);
5651
5652         DEFSUBR(Fcoding_category_list);
5653         DEFSUBR(Fset_coding_priority_list);
5654         DEFSUBR(Fcoding_priority_list);
5655         DEFSUBR(Fset_coding_category_system);
5656         DEFSUBR(Fcoding_category_system);
5657
5658         DEFSUBR(Fdetect_coding_region);
5659         DEFSUBR(Fdecode_coding_region);
5660         DEFSUBR(Fencode_coding_region);
5661 #ifdef MULE
5662         DEFSUBR(Fdecode_shift_jis_char);
5663         DEFSUBR(Fencode_shift_jis_char);
5664         DEFSUBR(Fdecode_big5_char);
5665         DEFSUBR(Fencode_big5_char);
5666         DEFSUBR(Fset_ucs_char);
5667         DEFSUBR(Fucs_char);
5668         DEFSUBR(Fset_char_ucs);
5669         DEFSUBR(Fchar_ucs);
5670 #endif                          /* MULE */
5671         defsymbol(&Qcoding_systemp, "coding-system-p");
5672         defsymbol(&Qno_conversion, "no-conversion");
5673         defsymbol(&Qraw_text, "raw-text");
5674 #ifdef MULE
5675         defsymbol(&Qbig5, "big5");
5676         defsymbol(&Qshift_jis, "shift-jis");
5677         defsymbol(&Qucs4, "ucs-4");
5678         defsymbol(&Qutf8, "utf-8");
5679         defsymbol(&Qccl, "ccl");
5680         defsymbol(&Qiso2022, "iso2022");
5681 #endif                          /* MULE */
5682         defsymbol(&Qmnemonic, "mnemonic");
5683         defsymbol(&Qeol_type, "eol-type");
5684         defsymbol(&Qpost_read_conversion, "post-read-conversion");
5685         defsymbol(&Qpre_write_conversion, "pre-write-conversion");
5686
5687         defsymbol(&Qcr, "cr");
5688         defsymbol(&Qlf, "lf");
5689         defsymbol(&Qcrlf, "crlf");
5690         defsymbol(&Qeol_cr, "eol-cr");
5691         defsymbol(&Qeol_lf, "eol-lf");
5692         defsymbol(&Qeol_crlf, "eol-crlf");
5693 #ifdef MULE
5694         defsymbol(&Qcharset_g0, "charset-g0");
5695         defsymbol(&Qcharset_g1, "charset-g1");
5696         defsymbol(&Qcharset_g2, "charset-g2");
5697         defsymbol(&Qcharset_g3, "charset-g3");
5698         defsymbol(&Qforce_g0_on_output, "force-g0-on-output");
5699         defsymbol(&Qforce_g1_on_output, "force-g1-on-output");
5700         defsymbol(&Qforce_g2_on_output, "force-g2-on-output");
5701         defsymbol(&Qforce_g3_on_output, "force-g3-on-output");
5702         defsymbol(&Qno_iso6429, "no-iso6429");
5703         defsymbol(&Qinput_charset_conversion, "input-charset-conversion");
5704         defsymbol(&Qoutput_charset_conversion, "output-charset-conversion");
5705
5706         defsymbol(&Qno_ascii_eol, "no-ascii-eol");
5707         defsymbol(&Qno_ascii_cntl, "no-ascii-cntl");
5708         defsymbol(&Qseven, "seven");
5709         defsymbol(&Qlock_shift, "lock-shift");
5710         defsymbol(&Qescape_quoted, "escape-quoted");
5711 #endif                          /* MULE */
5712         defsymbol(&Qencode, "encode");
5713         defsymbol(&Qdecode, "decode");
5714
5715 #ifdef MULE
5716         defsymbol(&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5717                   "shift-jis");
5718         defsymbol(&coding_category_symbol[CODING_CATEGORY_BIG5], "big5");
5719         defsymbol(&coding_category_symbol[CODING_CATEGORY_UCS4], "ucs-4");
5720         defsymbol(&coding_category_symbol[CODING_CATEGORY_UTF8], "utf-8");
5721         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_7], "iso-7");
5722         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5723                   "iso-8-designate");
5724         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_1], "iso-8-1");
5725         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_2], "iso-8-2");
5726         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5727                   "iso-lock-shift");
5728 #endif                          /* MULE */
5729         defsymbol(&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5730                   "no-conversion");
5731 }
5732
5733 void lstream_type_create_file_coding(void)
5734 {
5735         LSTREAM_HAS_METHOD(decoding, reader);
5736         LSTREAM_HAS_METHOD(decoding, writer);
5737         LSTREAM_HAS_METHOD(decoding, rewinder);
5738         LSTREAM_HAS_METHOD(decoding, seekable_p);
5739         LSTREAM_HAS_METHOD(decoding, flusher);
5740         LSTREAM_HAS_METHOD(decoding, closer);
5741         LSTREAM_HAS_METHOD(decoding, marker);
5742
5743         LSTREAM_HAS_METHOD(encoding, reader);
5744         LSTREAM_HAS_METHOD(encoding, writer);
5745         LSTREAM_HAS_METHOD(encoding, rewinder);
5746         LSTREAM_HAS_METHOD(encoding, seekable_p);
5747         LSTREAM_HAS_METHOD(encoding, flusher);
5748         LSTREAM_HAS_METHOD(encoding, closer);
5749         LSTREAM_HAS_METHOD(encoding, marker);
5750 }
5751
5752 void vars_of_file_coding(void)
5753 {
5754         int i;
5755
5756         fcd = xnew(struct file_coding_dump);
5757         dump_add_root_struct_ptr(&fcd, &fcd_description);
5758
5759         /* Initialize to something reasonable ... */
5760         for (i = 0; i < CODING_CATEGORY_LAST; i++) {
5761                 fcd->coding_category_system[i] = Qnil;
5762                 fcd->coding_category_by_priority[i] = i;
5763         }
5764
5765         Fprovide(intern("file-coding"));
5766
5767         DEFVAR_LISP("keyboard-coding-system", &Vkeyboard_coding_system  /*
5768 Coding system used for TTY keyboard input.
5769 Not used under a windowing system.
5770                                                                          */ );
5771         Vkeyboard_coding_system = Qnil;
5772
5773         DEFVAR_LISP("terminal-coding-system", &Vterminal_coding_system  /*
5774 Coding system used for TTY display output.
5775 Not used under a windowing system.
5776                                                                          */ );
5777         Vterminal_coding_system = Qnil;
5778
5779         DEFVAR_LISP("coding-system-for-read", &Vcoding_system_for_read  /*
5780 Overriding coding system used when reading from a file or process.
5781 You should bind this variable with `let', but do not set it globally.
5782 If this is non-nil, it specifies the coding system that will be used
5783 to decode input on read operations, such as from a file or process.
5784 It overrides `buffer-file-coding-system-for-read',
5785 `insert-file-contents-pre-hook', etc.  Use those variables instead of
5786 this one for permanent changes to the environment.  */ );
5787         Vcoding_system_for_read = Qnil;
5788
5789         DEFVAR_LISP("coding-system-for-write", &Vcoding_system_for_write        /*
5790 Overriding coding system used when writing to a file or process.
5791 You should bind this variable with `let', but do not set it globally.
5792 If this is non-nil, it specifies the coding system that will be used
5793 to encode output for write operations, such as to a file or process.
5794 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
5795 Use those variables instead of this one for permanent changes to the
5796 environment.  */ );
5797         Vcoding_system_for_write = Qnil;
5798
5799         DEFVAR_LISP("file-name-coding-system", &Vfile_name_coding_system        /*
5800 Coding system used to convert pathnames when accessing files.
5801                                                                                  */ );
5802         Vfile_name_coding_system = Qnil;
5803
5804         DEFVAR_BOOL("enable-multibyte-characters", &enable_multibyte_characters /*
5805 Non-nil means the buffer contents are regarded as multi-byte form
5806 of characters, not a binary code.  This affects the display, file I/O,
5807 and behaviors of various editing commands.
5808
5809 Setting this to nil does not do anything.
5810                                                                                  */ );
5811         enable_multibyte_characters = 1;
5812 }
5813
5814 void complex_vars_of_file_coding(void)
5815 {
5816         staticpro(&Vcoding_system_hash_table);
5817         Vcoding_system_hash_table =
5818             make_lisp_hash_table(50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5819
5820         the_codesys_prop_dynarr = Dynarr_new(codesys_prop);
5821         dump_add_root_struct_ptr(&the_codesys_prop_dynarr,
5822                                  &codesys_prop_dynarr_description);
5823
5824 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
5825 {                                               \
5826   struct codesys_prop csp;                      \
5827   csp.sym = (Sym);                              \
5828   csp.prop_type = (Prop_Type);                  \
5829   Dynarr_add (the_codesys_prop_dynarr, csp);    \
5830 } while (0)
5831
5832         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qmnemonic);
5833         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_type);
5834         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_cr);
5835         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_crlf);
5836         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_lf);
5837         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5838         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5839 #ifdef MULE
5840         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g0);
5841         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g1);
5842         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g2);
5843         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g3);
5844         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5845         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5846         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5847         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5848         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qshort);
5849         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_ascii_eol);
5850         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5851         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qseven);
5852         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qlock_shift);
5853         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_iso6429);
5854         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qescape_quoted);
5855         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5856         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5857
5858         DEFINE_CODESYS_PROP(CODESYS_PROP_CCL, Qencode);
5859         DEFINE_CODESYS_PROP(CODESYS_PROP_CCL, Qdecode);
5860 #endif                          /* MULE */
5861         /* Need to create this here or we're really screwed. */
5862         Fmake_coding_system
5863             (Qraw_text, Qno_conversion,
5864              build_string
5865              ("Raw text, which means it converts only line-break-codes."),
5866              list2(Qmnemonic, build_string("Raw")));
5867
5868         Fmake_coding_system
5869             (Qbinary, Qno_conversion,
5870              build_string("Binary, which means it does not convert anything."),
5871              list4(Qeol_type, Qlf, Qmnemonic, build_string("Binary")));
5872
5873         Fdefine_coding_system_alias(Qno_conversion, Qraw_text);
5874
5875         Fdefine_coding_system_alias(Qfile_name, Qbinary);
5876
5877         Fdefine_coding_system_alias(Qterminal, Qbinary);
5878         Fdefine_coding_system_alias(Qkeyboard, Qbinary);
5879
5880         /* Need this for bootstrapping */
5881         fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5882             Fget_coding_system(Qraw_text);
5883
5884 #ifdef MULE
5885         {
5886                 size_t i;
5887
5888                 for (i = 0; i < countof(fcd->ucs_to_mule_table); i++)
5889                         fcd->ucs_to_mule_table[i] = Qnil;
5890         }
5891         staticpro(&mule_to_ucs_table);
5892         mule_to_ucs_table = Fmake_char_table(Qgeneric);
5893 #endif                          /* MULE */
5894 }