Initial git import
[sxemacs] / src / mule / file-coding.c
1 /* Code conversion functions.
2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3    Copyright (C) 1995 Sun Microsystems, Inc.
4
5 This file is part of SXEmacs
6
7 SXEmacs is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
11
12 SXEmacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program.  If not, see <http://www.gnu.org/licenses/>. */
19
20
21 /* Synched up with: Mule 2.3.   Not in FSF. */
22
23 /* Rewritten by Ben Wing <ben@xemacs.org>. */
24
25 #include <config.h>
26 #include "lisp.h"
27
28 #include "buffer.h"
29 #include "elhash.h"
30 #include "ui/insdel.h"
31 #include "lstream.h"
32 #include "opaque.h"
33 #ifdef MULE
34 #include "mule-ccl.h"
35 #include "chartab.h"
36 #endif
37 #include "file-coding.h"
38
39 Lisp_Object Qcoding_system_error;
40
41 Lisp_Object Vkeyboard_coding_system;
42 Lisp_Object Vterminal_coding_system;
43 Lisp_Object Vcoding_system_for_read;
44 Lisp_Object Vcoding_system_for_write;
45 Lisp_Object Vfile_name_coding_system;
46
47 /* Table of symbols identifying each coding category. */
48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
49
50 struct file_coding_dump {
51         /* Coding system currently associated with each coding category. */
52         Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
53
54         /* Table of all coding categories in decreasing order of priority.
55            This describes a permutation of the possible coding categories. */
56         int coding_category_by_priority[CODING_CATEGORY_LAST];
57
58 #ifdef MULE
59         Lisp_Object ucs_to_mule_table[65536];
60 #endif
61 } *fcd;
62
63 static const struct lrecord_description fcd_description_1[] = {
64         {XD_LISP_OBJECT_ARRAY,
65          offsetof(struct file_coding_dump, coding_category_system),
66          CODING_CATEGORY_LAST},
67 #ifdef MULE
68         {XD_LISP_OBJECT_ARRAY,
69          offsetof(struct file_coding_dump, ucs_to_mule_table),
70          countof(fcd->ucs_to_mule_table)},
71 #endif
72         {XD_END}
73 };
74
75 static const struct struct_description fcd_description = {
76         sizeof(struct file_coding_dump),
77         fcd_description_1
78 };
79
80 Lisp_Object mule_to_ucs_table;
81
82 Lisp_Object Qcoding_systemp;
83
84 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
85 /* Qinternal in general.c */
86
87 Lisp_Object Qmnemonic, Qeol_type;
88 Lisp_Object Qcr, Qcrlf, Qlf;
89 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
90 Lisp_Object Qpost_read_conversion;
91 Lisp_Object Qpre_write_conversion;
92
93 #ifdef MULE
94 Lisp_Object Qucs4, Qutf8;
95 Lisp_Object Qbig5, Qshift_jis;
96 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
97 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
98 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
99 Lisp_Object Qno_iso6429;
100 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
101 Lisp_Object Qescape_quoted;
102 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
103 #endif
104 Lisp_Object Qencode, Qdecode;
105
106 Lisp_Object Vcoding_system_hash_table;
107
108 int enable_multibyte_characters;
109
110 #ifdef MULE
111 /* Additional information used by the ISO2022 decoder and detector. */
112 struct iso2022_decoder {
113         /* CHARSET holds the character sets currently assigned to the G0
114            through G3 variables.  It is initialized from the array
115            INITIAL_CHARSET in CODESYS. */
116         Lisp_Object charset[4];
117
118         /* Which registers are currently invoked into the left (GL) and
119            right (GR) halves of the 8-bit encoding space? */
120         int register_left, register_right;
121
122         /* ISO_ESC holds a value indicating part of an escape sequence
123            that has already been seen. */
124         enum iso_esc_flag esc;
125
126         /* This records the bytes we've seen so far in an escape sequence,
127            in case the sequence is invalid (we spit out the bytes unchanged). */
128         unsigned char esc_bytes[8];
129
130         /* Index for next byte to store in ISO escape sequence. */
131         int esc_bytes_index;
132
133 #ifdef ENABLE_COMPOSITE_CHARS
134         /* Stuff seen so far when composing a string. */
135         unsigned_char_dynarr *composite_chars;
136 #endif
137
138         /* If we saw an invalid designation sequence for a particular
139            register, we flag it here and switch to ASCII.  The next time we
140            see a valid designation for this register, we turn off the flag
141            and do the designation normally, but pretend the sequence was
142            invalid.  The effect of all this is that (most of the time) the
143            escape sequences for both the switch to the unknown charset, and
144            the switch back to the known charset, get inserted literally into
145            the buffer and saved out as such.  The hope is that we can
146            preserve the escape sequences so that the resulting written out
147            file makes sense.  If we don't do any of this, the designation
148            to the invalid charset will be preserved but that switch back
149            to the known charset will probably get eaten because it was
150            the same charset that was already present in the register. */
151         unsigned char invalid_designated[4];
152
153         /* We try to do similar things as above for direction-switching
154            sequences.  If we encountered a direction switch while an
155            invalid designation was present, or an invalid designation
156            just after a direction switch (i.e. no valid designation
157            encountered yet), we insert the direction-switch escape
158            sequence literally into the output stream, and later on
159            insert the corresponding direction-restoring escape sequence
160            literally also. */
161         unsigned int switched_dir_and_no_valid_charset_yet:1;
162         unsigned int invalid_switch_dir:1;
163
164         /* Tells the decoder to output the escape sequence literally
165            even though it was valid.  Used in the games we play to
166            avoid lossage when we encounter invalid designations. */
167         unsigned int output_literally:1;
168         /* We encountered a direction switch followed by an invalid
169            designation.  We didn't output the direction switch
170            literally because we didn't know about the invalid designation;
171            but we have to do so now. */
172         unsigned int output_direction_sequence:1;
173 };
174 #endif                          /* MULE */
175 EXFUN(Fcopy_coding_system, 2);
176 #ifdef MULE
177 struct detection_state;
178 static int detect_coding_sjis(struct detection_state *st,
179                               const Extbyte * src, Lstream_data_count n);
180 static void decode_coding_sjis(lstream_t decoding, const Extbyte * src,
181                                unsigned_char_dynarr * dst,
182                                Lstream_data_count n);
183 static void encode_coding_sjis(lstream_t encoding, const Bufbyte * src,
184                                unsigned_char_dynarr * dst,
185                                Lstream_data_count n);
186 static int detect_coding_big5(struct detection_state *st, const Extbyte * src,
187                               Lstream_data_count n);
188 static void decode_coding_big5(lstream_t decoding, const Extbyte * src,
189                                unsigned_char_dynarr * dst,
190                                Lstream_data_count n);
191 static void encode_coding_big5(lstream_t encoding, const Bufbyte * src,
192                                unsigned_char_dynarr * dst,
193                                Lstream_data_count n);
194 static int detect_coding_ucs4(struct detection_state *st, const Extbyte * src,
195                               Lstream_data_count n);
196 static void decode_coding_ucs4(lstream_t decoding, const Extbyte * src,
197                                unsigned_char_dynarr * dst,
198                                Lstream_data_count n);
199 static void encode_coding_ucs4(lstream_t encoding, const Bufbyte * src,
200                                unsigned_char_dynarr * dst,
201                                Lstream_data_count n);
202 static int detect_coding_utf8(struct detection_state *st, const Extbyte * src,
203                               Lstream_data_count n);
204 static void decode_coding_utf8(lstream_t decoding, const Extbyte * src,
205                                unsigned_char_dynarr * dst,
206                                Lstream_data_count n);
207 static void encode_coding_utf8(lstream_t encoding, const Bufbyte * src,
208                                unsigned_char_dynarr * dst,
209                                Lstream_data_count n);
210 static int postprocess_iso2022_mask(int mask);
211 static void reset_iso2022(Lisp_Object coding_system,
212                           struct iso2022_decoder *iso);
213 static int detect_coding_iso2022(struct detection_state *st,
214                                  const Extbyte * src, Lstream_data_count n);
215 static void decode_coding_iso2022(lstream_t decoding, const Extbyte * src,
216                                   unsigned_char_dynarr * dst,
217                                   Lstream_data_count n);
218 static void encode_coding_iso2022(lstream_t encoding, const Bufbyte * src,
219                                   unsigned_char_dynarr * dst,
220                                   Lstream_data_count n);
221 #endif                          /* MULE */
222 static void decode_coding_no_conversion(lstream_t decoding, const Extbyte * src,
223                                         unsigned_char_dynarr * dst,
224                                         Lstream_data_count n);
225 static void encode_coding_no_conversion(lstream_t encoding, const Bufbyte * src,
226                                         unsigned_char_dynarr * dst,
227                                         Lstream_data_count n);
228 static void mule_decode(lstream_t decoding, const Extbyte * src,
229                         unsigned_char_dynarr * dst, Lstream_data_count n);
230 static void mule_encode(lstream_t encoding, const Bufbyte * src,
231                         unsigned_char_dynarr * dst, Lstream_data_count n);
232
233 typedef struct codesys_prop codesys_prop;
234 struct codesys_prop {
235         Lisp_Object sym;
236         int prop_type;
237 };
238
239 typedef struct {
240         Dynarr_declare(codesys_prop);
241 } codesys_prop_dynarr;
242
243 static const struct lrecord_description codesys_prop_description_1[] = {
244         {XD_LISP_OBJECT, offsetof(codesys_prop, sym)},
245         {XD_END}
246 };
247
248 static const struct struct_description codesys_prop_description = {
249         sizeof(codesys_prop),
250         codesys_prop_description_1
251 };
252
253 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
254         XD_DYNARR_DESC(codesys_prop_dynarr, &codesys_prop_description),
255         {XD_END}
256 };
257
258 static const struct struct_description codesys_prop_dynarr_description = {
259         sizeof(codesys_prop_dynarr),
260         codesys_prop_dynarr_description_1
261 };
262
263 codesys_prop_dynarr *the_codesys_prop_dynarr;
264
265 enum codesys_prop_enum {
266         CODESYS_PROP_ALL_OK,
267         CODESYS_PROP_ISO2022,
268         CODESYS_PROP_CCL
269 };
270 \f
271 /************************************************************************/
272 /*                       Coding system functions                        */
273 /************************************************************************/
274
275 static Lisp_Object mark_coding_system(Lisp_Object);
276 static void print_coding_system(Lisp_Object, Lisp_Object, int);
277 static void finalize_coding_system(void *header, int for_disksave);
278
279 #ifdef MULE
280 static const struct lrecord_description ccs_description_1[] = {
281         {XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset)},
282         {XD_LISP_OBJECT, offsetof(charset_conversion_spec, to_charset)},
283         {XD_END}
284 };
285
286 static const struct struct_description ccs_description = {
287         sizeof(charset_conversion_spec),
288         ccs_description_1
289 };
290
291 static const struct lrecord_description ccsd_description_1[] = {
292         XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
293         {XD_END}
294 };
295
296 static const struct struct_description ccsd_description = {
297         sizeof(charset_conversion_spec_dynarr),
298         ccsd_description_1
299 };
300 #endif
301
302 static const struct lrecord_description coding_system_description[] = {
303         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, name)},
304         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, doc_string)},
305         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, mnemonic)},
306         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, post_read_conversion)},
307         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, pre_write_conversion)},
308         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_lf)},
309         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_crlf)},
310         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_cr)},
311 #ifdef MULE
312         {XD_LISP_OBJECT_ARRAY,
313          offsetof(Lisp_Coding_System, iso2022.initial_charset), 4},
314         {XD_STRUCT_PTR, offsetof(Lisp_Coding_System, iso2022.input_conv), 1,
315          &ccsd_description},
316         {XD_STRUCT_PTR, offsetof(Lisp_Coding_System, iso2022.output_conv), 1,
317          &ccsd_description},
318         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, ccl.decode)},
319         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, ccl.encode)},
320 #endif
321         {XD_END}
322 };
323
324 DEFINE_LRECORD_IMPLEMENTATION("coding-system", coding_system,
325                               mark_coding_system, print_coding_system,
326                               finalize_coding_system,
327                               0, 0, coding_system_description,
328                               Lisp_Coding_System);
329
330 static Lisp_Object mark_coding_system(Lisp_Object obj)
331 {
332         Lisp_Coding_System *codesys = XCODING_SYSTEM(obj);
333
334         mark_object(CODING_SYSTEM_NAME(codesys));
335         mark_object(CODING_SYSTEM_DOC_STRING(codesys));
336         mark_object(CODING_SYSTEM_MNEMONIC(codesys));
337         mark_object(CODING_SYSTEM_EOL_LF(codesys));
338         mark_object(CODING_SYSTEM_EOL_CRLF(codesys));
339         mark_object(CODING_SYSTEM_EOL_CR(codesys));
340
341         switch (CODING_SYSTEM_TYPE(codesys)) {
342 #ifdef MULE
343                 int i;
344         case CODESYS_ISO2022:
345                 for (i = 0; i < 4; i++)
346                         mark_object(CODING_SYSTEM_ISO2022_INITIAL_CHARSET
347                                     (codesys, i));
348                 if (codesys->iso2022.input_conv) {
349                         for (i = 0;
350                              i < Dynarr_length(codesys->iso2022.input_conv);
351                              i++) {
352                                 struct charset_conversion_spec *ccs =
353                                     Dynarr_atp(codesys->iso2022.input_conv, i);
354                                 mark_object(ccs->from_charset);
355                                 mark_object(ccs->to_charset);
356                         }
357                 }
358                 if (codesys->iso2022.output_conv) {
359                         for (i = 0;
360                              i < Dynarr_length(codesys->iso2022.output_conv);
361                              i++) {
362                                 struct charset_conversion_spec *ccs =
363                                     Dynarr_atp(codesys->iso2022.output_conv, i);
364                                 mark_object(ccs->from_charset);
365                                 mark_object(ccs->to_charset);
366                         }
367                 }
368                 break;
369
370         case CODESYS_CCL:
371                 mark_object(CODING_SYSTEM_CCL_DECODE(codesys));
372                 mark_object(CODING_SYSTEM_CCL_ENCODE(codesys));
373                 break;
374
375                 /* list the rest of them lot explicitly */
376         case CODESYS_AUTODETECT:
377         case CODESYS_SHIFT_JIS:
378         case CODESYS_BIG5:
379         case CODESYS_UCS4:
380         case CODESYS_UTF8:
381         case CODESYS_NO_CONVERSION:
382 #ifdef DEBUG_SXEMACS
383         case CODESYS_INTERNAL:
384 #endif
385 #endif                          /* MULE */
386         default:
387                 break;
388         }
389
390         mark_object(CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys));
391         return CODING_SYSTEM_POST_READ_CONVERSION(codesys);
392 }
393
394 static void
395 print_coding_system(Lisp_Object obj, Lisp_Object printcharfun, int escapeflag)
396 {
397         Lisp_Coding_System *c = XCODING_SYSTEM(obj);
398         if (print_readably)
399                 error("printing unreadable object #<coding-system 0x%x>",
400                       c->header.uid);
401
402         write_c_string("#<coding-system ", printcharfun);
403         print_internal(c->name, printcharfun, 1);
404         write_c_string(">", printcharfun);
405 }
406
407 static void finalize_coding_system(void *header, int for_disksave)
408 {
409         Lisp_Coding_System *c = (Lisp_Coding_System *) header;
410         /* Since coding systems never go away, this function is not
411            necessary.  But it would be necessary if we changed things
412            so that coding systems could go away. */
413         if (!for_disksave) {    /* see comment in lstream.c */
414                 switch (CODING_SYSTEM_TYPE(c)) {
415 #ifdef MULE
416                 case CODESYS_ISO2022:
417                         if (c->iso2022.input_conv) {
418                                 Dynarr_free(c->iso2022.input_conv);
419                                 c->iso2022.input_conv = 0;
420                         }
421                         if (c->iso2022.output_conv) {
422                                 Dynarr_free(c->iso2022.output_conv);
423                                 c->iso2022.output_conv = 0;
424                         }
425                         break;
426
427                         /* list the rest of them lot explicitly */
428                 case CODESYS_AUTODETECT:
429                 case CODESYS_SHIFT_JIS:
430                 case CODESYS_BIG5:
431                 case CODESYS_UCS4:
432                 case CODESYS_UTF8:
433                 case CODESYS_CCL:
434                 case CODESYS_NO_CONVERSION:
435 #ifdef DEBUG_SXEMACS
436                 case CODESYS_INTERNAL:
437 #endif
438 #endif                          /* MULE */
439                 default:
440                         break;
441                 }
442         }
443 }
444
445 static eol_type_t symbol_to_eol_type(Lisp_Object symbol)
446 {
447         CHECK_SYMBOL(symbol);
448         if (NILP(symbol))
449                 return EOL_AUTODETECT;
450         if (EQ(symbol, Qlf))
451                 return EOL_LF;
452         if (EQ(symbol, Qcrlf))
453                 return EOL_CRLF;
454         if (EQ(symbol, Qcr))
455                 return EOL_CR;
456
457         signal_simple_error("Unrecognized eol type", symbol);
458         return EOL_AUTODETECT;  /* not reached */
459 }
460
461 static Lisp_Object eol_type_to_symbol(eol_type_t type)
462 {
463         switch (type) {
464         default:
465                 abort();
466         case EOL_LF:
467                 return Qlf;
468         case EOL_CRLF:
469                 return Qcrlf;
470         case EOL_CR:
471                 return Qcr;
472         case EOL_AUTODETECT:
473                 return Qnil;
474         }
475 }
476
477 static void setup_eol_coding_systems(Lisp_Coding_System * codesys)
478 {
479         Lisp_Object codesys_obj;
480         int len = string_length(XSYMBOL(CODING_SYSTEM_NAME(codesys))->name);
481         char *codesys_name = (char *)alloca(len + 7);
482         int mlen = -1;
483         char *codesys_mnemonic = 0;
484
485         Lisp_Object codesys_name_sym, sub_codesys_obj;
486
487         /* kludge */
488
489         XSETCODING_SYSTEM(codesys_obj, codesys);
490
491         memcpy(codesys_name,
492                string_data(XSYMBOL(CODING_SYSTEM_NAME(codesys))->name), len);
493
494         if (STRINGP(CODING_SYSTEM_MNEMONIC(codesys))) {
495                 mlen = XSTRING_LENGTH(CODING_SYSTEM_MNEMONIC(codesys));
496                 codesys_mnemonic = (char *)alloca(mlen + 7);
497                 memcpy(codesys_mnemonic,
498                        XSTRING_DATA(CODING_SYSTEM_MNEMONIC(codesys)), mlen);
499         }
500 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
501   strcpy (codesys_name + len, "-" op_sys);                                      \
502   if (mlen != -1)                                                               \
503     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
504   codesys_name_sym = intern (codesys_name);                                     \
505   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
506   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
507   if (mlen != -1)                                                               \
508     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
509       build_string (codesys_mnemonic);                                          \
510   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
511 } while (0)
512
513         DEFINE_SUB_CODESYS("unix", "", EOL_LF);
514         DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
515         DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
516 }
517
518 DEFUN("coding-system-p", Fcoding_system_p, 1, 1, 0,     /*
519 Return t if OBJECT is a coding system.
520 A coding system is an object that defines how text containing multiple
521 character sets is encoded into a stream of (typically 8-bit) bytes.
522 The coding system is used to decode the stream into a series of
523 characters (which may be from multiple charsets) when the text is read
524 from a file or process, and is used to encode the text back into the
525 same format when it is written out to a file or process.
526
527 For example, many ISO2022-compliant coding systems (such as Compound
528 Text, which is used for inter-client data under the X Window System)
529 use escape sequences to switch between different charsets -- Japanese
530 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
531 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
532 `make-coding-system' for more information.
533
534 Coding systems are normally identified using a symbol, and the
535 symbol is accepted in place of the actual coding system object whenever
536 a coding system is called for. (This is similar to how faces work.)
537 */
538       (object))
539 {
540         return CODING_SYSTEMP(object) ? Qt : Qnil;
541 }
542
543 DEFUN("find-coding-system", Ffind_coding_system, 1, 1, 0,       /*
544 Retrieve the coding system of the given name.
545
546 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
547 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
548 If there is no such coding system, nil is returned.  Otherwise the
549 associated coding system object is returned.
550 */
551       (coding_system_or_name))
552 {
553         if (NILP(coding_system_or_name))
554                 coding_system_or_name = Qbinary;
555         else if (CODING_SYSTEMP(coding_system_or_name))
556                 return coding_system_or_name;
557         else
558                 CHECK_SYMBOL(coding_system_or_name);
559
560         while (1) {
561                 coding_system_or_name =
562                     Fgethash(coding_system_or_name, Vcoding_system_hash_table,
563                              Qnil);
564
565                 if (CODING_SYSTEMP(coding_system_or_name)
566                     || NILP(coding_system_or_name))
567                         return coding_system_or_name;
568         }
569 }
570
571 DEFUN("get-coding-system", Fget_coding_system, 1, 1, 0, /*
572 Retrieve the coding system of the given name.
573 Same as `find-coding-system' except that if there is no such
574 coding system, an error is signaled instead of returning nil.
575 */
576       (name))
577 {
578         Lisp_Object coding_system = Ffind_coding_system(name);
579
580         if (NILP(coding_system))
581                 signal_simple_error("No such coding system", name);
582         return coding_system;
583 }
584
585 /* We store the coding systems in hash tables with the names as the key and the
586    actual coding system object as the value.  Occasionally we need to use them
587    in a list format.  These routines provide us with that. */
588 struct coding_system_list_closure {
589         Lisp_Object *coding_system_list;
590 };
591
592 static int
593 add_coding_system_to_list_mapper(Lisp_Object key, Lisp_Object value,
594                                  void *coding_system_list_closure)
595 {
596         /* This function can GC */
597         struct coding_system_list_closure *cscl =
598             (struct coding_system_list_closure *)coding_system_list_closure;
599         Lisp_Object *coding_system_list = cscl->coding_system_list;
600
601         *coding_system_list = Fcons(key, *coding_system_list);
602         return 0;
603 }
604
605 DEFUN("coding-system-list", Fcoding_system_list, 0, 0, 0,       /*
606 Return a list of the names of all defined coding systems.
607 */
608       ())
609 {
610         Lisp_Object coding_system_list = Qnil;
611         struct gcpro gcpro1;
612         struct coding_system_list_closure coding_system_list_closure;
613
614         GCPRO1(coding_system_list);
615         coding_system_list_closure.coding_system_list = &coding_system_list;
616         elisp_maphash(add_coding_system_to_list_mapper,
617                       Vcoding_system_hash_table, &coding_system_list_closure);
618         UNGCPRO;
619
620         return coding_system_list;
621 }
622
623 DEFUN("coding-system-name", Fcoding_system_name, 1, 1, 0,       /*
624 Return the name of the given coding system.
625 */
626       (coding_system))
627 {
628         coding_system = Fget_coding_system(coding_system);
629         return XCODING_SYSTEM_NAME(coding_system);
630 }
631
632 static Lisp_Coding_System *allocate_coding_system(enum coding_system_type type,
633                                                   Lisp_Object name)
634 {
635         Lisp_Coding_System *codesys =
636             alloc_lcrecord_type(Lisp_Coding_System, &lrecord_coding_system);
637
638         zero_lcrecord(codesys);
639         CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys) = Qnil;
640         CODING_SYSTEM_POST_READ_CONVERSION(codesys) = Qnil;
641         CODING_SYSTEM_EOL_TYPE(codesys) = EOL_AUTODETECT;
642         CODING_SYSTEM_EOL_CRLF(codesys) = Qnil;
643         CODING_SYSTEM_EOL_CR(codesys) = Qnil;
644         CODING_SYSTEM_EOL_LF(codesys) = Qnil;
645         CODING_SYSTEM_TYPE(codesys) = type;
646         CODING_SYSTEM_MNEMONIC(codesys) = Qnil;
647 #ifdef MULE
648         if (type == CODESYS_ISO2022) {
649                 int i;
650                 for (i = 0; i < 4; i++)
651                         CODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, i) =
652                             Qnil;
653         } else if (type == CODESYS_CCL) {
654                 CODING_SYSTEM_CCL_DECODE(codesys) = Qnil;
655                 CODING_SYSTEM_CCL_ENCODE(codesys) = Qnil;
656         }
657 #endif                          /* MULE */
658         CODING_SYSTEM_NAME(codesys) = name;
659
660         return codesys;
661 }
662
663 #ifdef MULE
664 /* Given a list of charset conversion specs as specified in a Lisp
665    program, parse it into STORE_HERE. */
666
667 static void
668 parse_charset_conversion_specs(charset_conversion_spec_dynarr * store_here,
669                                Lisp_Object spec_list)
670 {
671         Lisp_Object rest;
672
673         EXTERNAL_LIST_LOOP(rest, spec_list) {
674                 Lisp_Object car = XCAR(rest);
675                 Lisp_Object from, to;
676                 struct charset_conversion_spec spec;
677
678                 if (!CONSP(car) || !CONSP(XCDR(car)) || !NILP(XCDR(XCDR(car))))
679                         signal_simple_error("Invalid charset conversion spec",
680                                             car);
681                 from = Fget_charset(XCAR(car));
682                 to = Fget_charset(XCAR(XCDR(car)));
683                 if (XCHARSET_TYPE(from) != XCHARSET_TYPE(to))
684                         signal_simple_error_2
685                             ("Attempted conversion between different charset types",
686                              from, to);
687                 spec.from_charset = from;
688                 spec.to_charset = to;
689
690                 Dynarr_add(store_here, spec);
691         }
692 }
693
694 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
695    specs, return the equivalent as the Lisp programmer would see it.
696
697    If LOAD_HERE is 0, return Qnil. */
698
699 static Lisp_Object
700 unparse_charset_conversion_specs(charset_conversion_spec_dynarr * load_here)
701 {
702         int i;
703         Lisp_Object result;
704
705         if (!load_here)
706                 return Qnil;
707         for (i = 0, result = Qnil; i < Dynarr_length(load_here); i++) {
708                 struct charset_conversion_spec *ccs = Dynarr_atp(load_here, i);
709                 result =
710                     Fcons(list2(ccs->from_charset, ccs->to_charset), result);
711         }
712
713         return Fnreverse(result);
714 }
715
716 #endif                          /* MULE */
717
718 DEFUN("make-coding-system", Fmake_coding_system, 2, 4, 0,       /*
719 Register symbol NAME as a coding system.
720
721 TYPE describes the conversion method used and should be one of
722
723 nil or 'undecided
724 Automatic conversion.  SXEmacs attempts to detect the coding system
725 used in the file.
726 'no-conversion
727 No conversion.  Use this for binary files and such.  On output,
728 graphic characters that are not in ASCII or Latin-1 will be
729 replaced by a ?. (For a no-conversion-encoded buffer, these
730 characters will only be present if you explicitly insert them.)
731 'shift-jis
732 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
733 'ucs-4
734 ISO 10646 UCS-4 encoding.
735 'utf-8
736 ISO 10646 UTF-8 encoding.
737 'iso2022
738 Any ISO2022-compliant encoding.  Among other things, this includes
739 JIS (the Japanese encoding commonly used for e-mail), EUC (the
740 standard Unix encoding for Japanese and other languages), and
741 Compound Text (the encoding used in X11).  You can specify more
742 specific information about the conversion with the PROPS argument.
743 'big5
744 Big5 (the encoding commonly used for Taiwanese).
745 'ccl
746 The conversion is performed using a user-written pseudo-code
747 program.  CCL (Code Conversion Language) is the name of this
748 pseudo-code.
749 'internal
750 Write out or read in the raw contents of the memory representing
751 the buffer's text.  This is primarily useful for debugging
752 purposes, and is only enabled when SXEmacs has been compiled with
753 DEBUG_SXEMACS defined (via the --debug configure option).
754 WARNING: Reading in a file using 'internal conversion can result
755 in an internal inconsistency in the memory representing a
756 buffer's text, which will produce unpredictable results and may
757 cause SXEmacs to crash.  Under normal circumstances you should
758 never use 'internal conversion.
759
760 DOC-STRING is a string describing the coding system.
761
762 PROPS is a property list, describing the specific nature of the
763 character set.  Recognized properties are:
764
765 'mnemonic
766 String to be displayed in the modeline when this coding system is
767 active.
768
769 'eol-type
770 End-of-line conversion to be used.  It should be one of
771
772 nil
773 Automatically detect the end-of-line type (LF, CRLF,
774 or CR).  Also generate subsidiary coding systems named
775 `NAME-unix', `NAME-dos', and `NAME-mac', that are
776 identical to this coding system but have an EOL-TYPE
777 value of 'lf, 'crlf, and 'cr, respectively.
778 'lf
779 The end of a line is marked externally using ASCII LF.
780 Since this is also the way that SXEmacs represents an
781 end-of-line internally, specifying this option results
782 in no end-of-line conversion.  This is the standard
783 format for Unix text files.
784 'crlf
785 The end of a line is marked externally using ASCII
786 CRLF.  This is the standard format for MS-DOS text
787 files.
788 'cr
789 The end of a line is marked externally using ASCII CR.
790 This is the standard format for Macintosh text files.
791 t
792 Automatically detect the end-of-line type but do not
793 generate subsidiary coding systems.  (This value is
794 converted to nil when stored internally, and
795 `coding-system-property' will return nil.)
796
797 'post-read-conversion
798 Function called after a file has been read in, to perform the
799 decoding.  Called with two arguments, START and END, denoting
800 a region of the current buffer to be decoded.
801
802 'pre-write-conversion
803 Function called before a file is written out, to perform the
804 encoding.  Called with two arguments, START and END, denoting
805 a region of the current buffer to be encoded.
806
807 The following additional properties are recognized if TYPE is 'iso2022:
808
809 'charset-g0
810 'charset-g1
811 'charset-g2
812 'charset-g3
813 The character set initially designated to the G0 - G3 registers.
814 The value should be one of
815
816 -- A charset object (designate that character set)
817 -- nil (do not ever use this register)
818 -- t (no character set is initially designated to
819 the register, but may be later on; this automatically
820 sets the corresponding `force-g*-on-output' property)
821
822 'force-g0-on-output
823 'force-g1-on-output
824 'force-g2-on-output
825 'force-g2-on-output
826 If non-nil, send an explicit designation sequence on output before
827 using the specified register.
828
829 'short
830 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
831 "ESC $ B" on output in place of the full designation sequences
832 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
833
834 'no-ascii-eol
835 If non-nil, don't designate ASCII to G0 at each end of line on output.
836 Setting this to non-nil also suppresses other state-resetting that
837 normally happens at the end of a line.
838
839 'no-ascii-cntl
840 If non-nil, don't designate ASCII to G0 before control chars on output.
841
842 'seven
843 If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
844 environment.
845
846 'lock-shift
847 If non-nil, use locking-shift (SO/SI) instead of single-shift
848 or designation by escape sequence.
849
850 'no-iso6429
851 If non-nil, don't use ISO6429's direction specification.
852
853 'escape-quoted
854 If non-nil, literal control characters that are the same as
855 the beginning of a recognized ISO2022 or ISO6429 escape sequence
856 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
857 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
858 so that they can be properly distinguished from an escape sequence.
859 (Note that doing this results in a non-portable encoding.) This
860 encoding flag is used for byte-compiled files.  Note that ESC
861 is a good choice for a quoting character because there are no
862 escape sequences whose second byte is a character from the Control-0
863 or Control-1 character sets; this is explicitly disallowed by the
864 ISO2022 standard.
865
866 'input-charset-conversion
867 A list of conversion specifications, specifying conversion of
868 characters in one charset to another when decoding is performed.
869 Each specification is a list of two elements: the source charset,
870 and the destination charset.
871
872 'output-charset-conversion
873 A list of conversion specifications, specifying conversion of
874 characters in one charset to another when encoding is performed.
875 The form of each specification is the same as for
876 'input-charset-conversion.
877
878 The following additional properties are recognized (and required)
879 if TYPE is 'ccl:
880
881 'decode
882 CCL program used for decoding (converting to internal format).
883
884 'encode
885 CCL program used for encoding (converting to external format).
886 */
887       (name, type, doc_string, props))
888 {
889         Lisp_Coding_System *codesys;
890         enum coding_system_type ty;
891         int need_to_setup_eol_systems = 1;
892
893         /* Convert type to constant */
894         if (NILP(type) || EQ(type, Qundecided)) {
895                 ty = CODESYS_AUTODETECT;
896         }
897 #ifdef MULE
898         else if (EQ(type, Qshift_jis)) {
899                 ty = CODESYS_SHIFT_JIS;
900         } else if (EQ(type, Qiso2022)) {
901                 ty = CODESYS_ISO2022;
902         } else if (EQ(type, Qbig5)) {
903                 ty = CODESYS_BIG5;
904         } else if (EQ(type, Qucs4)) {
905                 ty = CODESYS_UCS4;
906         } else if (EQ(type, Qutf8)) {
907                 ty = CODESYS_UTF8;
908         } else if (EQ(type, Qccl)) {
909                 ty = CODESYS_CCL;
910         }
911 #endif
912         else if (EQ(type, Qno_conversion)) {
913                 ty = CODESYS_NO_CONVERSION;
914         }
915 #ifdef DEBUG_SXEMACS
916         else if (EQ(type, Qinternal)) {
917                 ty = CODESYS_INTERNAL;
918         }
919 #endif
920         else
921                 signal_simple_error("Invalid coding system type", type);
922
923         CHECK_SYMBOL(name);
924
925         codesys = allocate_coding_system(ty, name);
926
927         if (NILP(doc_string))
928                 doc_string = build_string("");
929         else
930                 CHECK_STRING(doc_string);
931         CODING_SYSTEM_DOC_STRING(codesys) = doc_string;
932
933         {
934                 EXTERNAL_PROPERTY_LIST_LOOP_3(key, value, props) {
935                         if (EQ(key, Qmnemonic)) {
936                                 if (!NILP(value))
937                                         CHECK_STRING(value);
938                                 CODING_SYSTEM_MNEMONIC(codesys) = value;
939                         }
940
941                         else if (EQ(key, Qeol_type)) {
942                                 need_to_setup_eol_systems = NILP(value);
943                                 if (EQ(value, Qt))
944                                         value = Qnil;
945                                 CODING_SYSTEM_EOL_TYPE(codesys) =
946                                     symbol_to_eol_type(value);
947                         }
948
949                         else if (EQ(key, Qpost_read_conversion))
950                                 CODING_SYSTEM_POST_READ_CONVERSION(codesys) =
951                                     value;
952                         else if (EQ(key, Qpre_write_conversion))
953                                 CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys) =
954                                     value;
955 #ifdef MULE
956                         else if (ty == CODESYS_ISO2022) {
957 #define FROB_INITIAL_CHARSET(charset_num) \
958   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
959     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
960
961                                 if (EQ(key, Qcharset_g0))
962                                         FROB_INITIAL_CHARSET(0);
963                                 else if (EQ(key, Qcharset_g1))
964                                         FROB_INITIAL_CHARSET(1);
965                                 else if (EQ(key, Qcharset_g2))
966                                         FROB_INITIAL_CHARSET(2);
967                                 else if (EQ(key, Qcharset_g3))
968                                         FROB_INITIAL_CHARSET(3);
969
970 #define FROB_FORCE_CHARSET(charset_num) \
971   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
972
973                                 else if (EQ(key, Qforce_g0_on_output))
974                                         FROB_FORCE_CHARSET(0);
975                                 else if (EQ(key, Qforce_g1_on_output))
976                                         FROB_FORCE_CHARSET(1);
977                                 else if (EQ(key, Qforce_g2_on_output))
978                                         FROB_FORCE_CHARSET(2);
979                                 else if (EQ(key, Qforce_g3_on_output))
980                                         FROB_FORCE_CHARSET(3);
981
982 #define FROB_BOOLEAN_PROPERTY(prop) \
983   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
984
985                                 else if (EQ(key, Qshort))
986                                         FROB_BOOLEAN_PROPERTY(SHORT);
987                                 else if (EQ(key, Qno_ascii_eol))
988                                         FROB_BOOLEAN_PROPERTY(NO_ASCII_EOL);
989                                 else if (EQ(key, Qno_ascii_cntl))
990                                         FROB_BOOLEAN_PROPERTY(NO_ASCII_CNTL);
991                                 else if (EQ(key, Qseven))
992                                         FROB_BOOLEAN_PROPERTY(SEVEN);
993                                 else if (EQ(key, Qlock_shift))
994                                         FROB_BOOLEAN_PROPERTY(LOCK_SHIFT);
995                                 else if (EQ(key, Qno_iso6429))
996                                         FROB_BOOLEAN_PROPERTY(NO_ISO6429);
997                                 else if (EQ(key, Qescape_quoted))
998                                         FROB_BOOLEAN_PROPERTY(ESCAPE_QUOTED);
999
1000                                 else if (EQ(key, Qinput_charset_conversion)) {
1001                                         codesys->iso2022.input_conv =
1002                                             Dynarr_new(charset_conversion_spec);
1003                                         parse_charset_conversion_specs(codesys->
1004                                                                        iso2022.
1005                                                                        input_conv,
1006                                                                        value);
1007                                 } else if (EQ(key, Qoutput_charset_conversion)) {
1008                                         codesys->iso2022.output_conv =
1009                                             Dynarr_new(charset_conversion_spec);
1010                                         parse_charset_conversion_specs(codesys->
1011                                                                        iso2022.
1012                                                                        output_conv,
1013                                                                        value);
1014                                 } else
1015                                         signal_simple_error
1016                                             ("Unrecognized property", key);
1017                         } else if (EQ(type, Qccl)) {
1018                                 Lisp_Object sym;
1019                                 struct ccl_program test_ccl;
1020                                 Extbyte *suffix;
1021
1022                                 /* Check key first.  */
1023                                 if (EQ(key, Qdecode))
1024                                         suffix = "-ccl-decode";
1025                                 else if (EQ(key, Qencode))
1026                                         suffix = "-ccl-encode";
1027                                 else
1028                                         signal_simple_error
1029                                             ("Unrecognized property", key);
1030
1031                                 /* If value is vector, register it as a ccl program
1032                                    associated with an newly created symbol for
1033                                    backward compatibility.  */
1034                                 if (VECTORP(value)) {
1035                                         sym =
1036                                             Fintern(concat2
1037                                                     (Fsymbol_name(name),
1038                                                      build_string(suffix)),
1039                                                     Qnil);
1040                                         Fregister_ccl_program(sym, value);
1041                                 } else {
1042                                         CHECK_SYMBOL(value);
1043                                         sym = value;
1044                                 }
1045                                 /* check if the given ccl programs are valid.  */
1046                                 if (setup_ccl_program(&test_ccl, sym) < 0)
1047                                         signal_simple_error
1048                                             ("Invalid CCL program", value);
1049
1050                                 if (EQ(key, Qdecode))
1051                                         CODING_SYSTEM_CCL_DECODE(codesys) = sym;
1052                                 else if (EQ(key, Qencode))
1053                                         CODING_SYSTEM_CCL_ENCODE(codesys) = sym;
1054
1055                         }
1056 #endif                          /* MULE */
1057                         else
1058                                 signal_simple_error("Unrecognized property",
1059                                                     key);
1060                 }
1061         }
1062
1063         if (need_to_setup_eol_systems)
1064                 setup_eol_coding_systems(codesys);
1065
1066         {
1067                 Lisp_Object codesys_obj;
1068                 XSETCODING_SYSTEM(codesys_obj, codesys);
1069                 Fputhash(name, codesys_obj, Vcoding_system_hash_table);
1070                 return codesys_obj;
1071         }
1072 }
1073
1074 DEFUN("copy-coding-system", Fcopy_coding_system, 2, 2, 0,       /*
1075 Copy OLD-CODING-SYSTEM to NEW-NAME.
1076 If NEW-NAME does not name an existing coding system, a new one will
1077 be created.
1078 */
1079       (old_coding_system, new_name))
1080 {
1081         Lisp_Object new_coding_system;
1082         old_coding_system = Fget_coding_system(old_coding_system);
1083         new_coding_system = Ffind_coding_system(new_name);
1084         if (NILP(new_coding_system)) {
1085                 XSETCODING_SYSTEM(new_coding_system,
1086                                   allocate_coding_system
1087                                   (XCODING_SYSTEM_TYPE(old_coding_system),
1088                                    new_name));
1089                 Fputhash(new_name, new_coding_system,
1090                          Vcoding_system_hash_table);
1091         }
1092
1093         {
1094                 Lisp_Coding_System *to = XCODING_SYSTEM(new_coding_system);
1095                 Lisp_Coding_System *from = XCODING_SYSTEM(old_coding_system);
1096                 memcpy(((char *)to) + sizeof(to->header),
1097                        ((char *)from) + sizeof(from->header),
1098                        sizeof(*from) - sizeof(from->header));
1099                 to->name = new_name;
1100         }
1101         return new_coding_system;
1102 }
1103
1104 DEFUN("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0,       /*
1105 Return t if OBJECT names a coding system, and is not a coding system alias.
1106 */
1107       (object))
1108 {
1109         Lisp_Object val = Fgethash(object, Vcoding_system_hash_table, Qnil);
1110         return CODING_SYSTEMP(val) ? Qt : Qnil;
1111 }
1112
1113 DEFUN("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1114  Return t if OBJECT is a coding system alias.
1115 All coding system aliases are created by `define-coding-system-alias'.
1116 */
1117       (object))
1118 {
1119         Lisp_Object val = Fgethash(object, Vcoding_system_hash_table, Qzero);
1120         return SYMBOLP(val) ? Qt : Qnil;
1121 }
1122
1123 DEFUN("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1124 Return the coding-system symbol for which symbol ALIAS is an alias.
1125 */
1126       (alias))
1127 {
1128         Lisp_Object aliasee = Fgethash(alias, Vcoding_system_hash_table, Qnil);
1129         if (SYMBOLP(aliasee)) {
1130                 return aliasee;
1131         } else {
1132                 signal_simple_error(
1133                         "Symbol is not a coding system alias", alias);
1134         }
1135         /* To keep the compiler happy */
1136         return Qnil;
1137 }
1138
1139 static Lisp_Object
1140 append_suffix_to_symbol(Lisp_Object symbol, char *ascii_string)
1141 {
1142         return Fintern(concat2(Fsymbol_name(symbol),
1143                                build_string(ascii_string)), Qnil);
1144 }
1145
1146 /* A maphash function, for removing dangling coding system aliases. */
1147 static int
1148 dangling_coding_system_alias_p(Lisp_Object alias,
1149                                Lisp_Object aliasee, void *dangling_aliases)
1150 {
1151         if (SYMBOLP(aliasee)
1152             && NILP(Fgethash(aliasee, Vcoding_system_hash_table, Qnil))) {
1153                 (*(int *)dangling_aliases)++;
1154                 return 1;
1155         } else {
1156                 return 0;
1157         }
1158 }
1159
1160 DEFUN("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0,       /*
1161 Define symbol ALIAS as an alias for coding system ALIASEE.
1162
1163 You can use this function to redefine an alias that has already been defined,
1164 but you cannot redefine a name which is the canonical name for a coding system.
1165 \(a canonical name of a coding system is what is returned when you call
1166 `coding-system-name' on a coding system).
1167
1168 ALIASEE itself can be an alias, which allows you to define nested aliases.
1169
1170 You are forbidden, however, from creating alias loops or `dangling' aliases.
1171 These will be detected, and an error will be signaled if you attempt to do so.
1172
1173 If ALIASEE is nil, then ALIAS will simply be undefined.
1174
1175 See also `coding-system-alias-p', `coding-system-aliasee',
1176 and `coding-system-canonical-name-p'.
1177 */
1178       (alias, aliasee))
1179 {
1180         Lisp_Object real_coding_system, probe;
1181
1182         CHECK_SYMBOL(alias);
1183
1184         if (!NILP(Fcoding_system_canonical_name_p(alias)))
1185                 signal_simple_error
1186                     ("Symbol is the canonical name of a coding system and cannot be redefined",
1187                      alias);
1188
1189         if (NILP(aliasee)) {
1190                 Lisp_Object subsidiary_unix =
1191                     append_suffix_to_symbol(alias, "-unix");
1192                 Lisp_Object subsidiary_dos =
1193                     append_suffix_to_symbol(alias, "-dos");
1194                 Lisp_Object subsidiary_mac =
1195                     append_suffix_to_symbol(alias, "-mac");
1196
1197                 Fremhash(alias, Vcoding_system_hash_table);
1198
1199                 /* Undefine subsidiary aliases,
1200                    presumably created by a previous call to this function */
1201                 if (!NILP(Fcoding_system_alias_p(subsidiary_unix)) &&
1202                     !NILP(Fcoding_system_alias_p(subsidiary_dos)) &&
1203                     !NILP(Fcoding_system_alias_p(subsidiary_mac))) {
1204                         Fdefine_coding_system_alias(subsidiary_unix, Qnil);
1205                         Fdefine_coding_system_alias(subsidiary_dos, Qnil);
1206                         Fdefine_coding_system_alias(subsidiary_mac, Qnil);
1207                 }
1208
1209                 /* Undefine dangling coding system aliases. */
1210                 {
1211                         int dangling_aliases;
1212
1213                         do {
1214                                 dangling_aliases = 0;
1215                                 elisp_map_remhash
1216                                     (dangling_coding_system_alias_p,
1217                                      Vcoding_system_hash_table,
1218                                      &dangling_aliases);
1219                         } while (dangling_aliases > 0);
1220                 }
1221
1222                 return Qnil;
1223         }
1224
1225         if (CODING_SYSTEMP(aliasee))
1226                 aliasee = XCODING_SYSTEM_NAME(aliasee);
1227
1228         /* Checks that aliasee names a coding-system */
1229         real_coding_system = Fget_coding_system(aliasee);
1230
1231         /* Check for coding system alias loops */
1232         if (EQ(alias, aliasee))
1233               alias_loop:signal_simple_error_2
1234                     ("Attempt to create a coding system alias loop", alias,
1235                      aliasee);
1236
1237         for (probe = aliasee;
1238              SYMBOLP(probe);
1239              probe = Fgethash(probe, Vcoding_system_hash_table, Qzero)) {
1240                 if (EQ(probe, alias))
1241                         goto alias_loop;
1242         }
1243
1244         Fputhash(alias, aliasee, Vcoding_system_hash_table);
1245
1246         /* Set up aliases for subsidiaries.
1247            #### There must be a better way to handle subsidiary coding
1248            #### systems. */
1249         {
1250                 static char *suffixes[] = { "-unix", "-dos", "-mac" };
1251
1252                 for (int i = 0; i < countof(suffixes); i++) {
1253                         Lisp_Object alias_subsidiary =
1254                                 append_suffix_to_symbol(alias, suffixes[i]);
1255                         Lisp_Object aliasee_subsidiary =
1256                                 append_suffix_to_symbol(aliasee, suffixes[i]);
1257
1258                         if (!NILP(Ffind_coding_system(aliasee_subsidiary))) {
1259                                 Fdefine_coding_system_alias(alias_subsidiary,
1260                                                             aliasee_subsidiary);
1261                         }
1262                 }
1263         }
1264         /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1265            but it doesn't look intentional, so I'd rather return something
1266            meaningful or nothing at all. */
1267         return Qnil;
1268 }
1269
1270 static Lisp_Object
1271 subsidiary_coding_system(Lisp_Object coding_system, eol_type_t type)
1272 {
1273         Lisp_Coding_System *cs = XCODING_SYSTEM(coding_system);
1274         Lisp_Object new_coding_system;
1275
1276         if (CODING_SYSTEM_EOL_TYPE(cs) != EOL_AUTODETECT)
1277                 return coding_system;
1278
1279         switch (type) {
1280         case EOL_AUTODETECT:
1281                 return coding_system;
1282         case EOL_LF:
1283                 new_coding_system = CODING_SYSTEM_EOL_LF(cs);
1284                 break;
1285         case EOL_CR:
1286                 new_coding_system = CODING_SYSTEM_EOL_CR(cs);
1287                 break;
1288         case EOL_CRLF:
1289                 new_coding_system = CODING_SYSTEM_EOL_CRLF(cs);
1290                 break;
1291         default:
1292                 abort();
1293                 return Qnil;
1294         }
1295
1296         return NILP(new_coding_system) ? coding_system : new_coding_system;
1297 }
1298
1299 DEFUN("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0,   /*
1300 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1301 */
1302       (coding_system, eol_type))
1303 {
1304         coding_system = Fget_coding_system(coding_system);
1305
1306         return subsidiary_coding_system(coding_system,
1307                                         symbol_to_eol_type(eol_type));
1308 }
1309 \f
1310 /************************************************************************/
1311 /*                         Coding system accessors                      */
1312 /************************************************************************/
1313
1314 DEFUN("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0,   /*
1315 Return the doc string for CODING-SYSTEM.
1316 */
1317       (coding_system))
1318 {
1319         coding_system = Fget_coding_system(coding_system);
1320         return XCODING_SYSTEM_DOC_STRING(coding_system);
1321 }
1322
1323 DEFUN("coding-system-type", Fcoding_system_type, 1, 1, 0,       /*
1324 Return the type of CODING-SYSTEM.
1325 */
1326       (coding_system))
1327 {
1328         Lisp_Object tmp = Fget_coding_system(coding_system);
1329
1330         switch (XCODING_SYSTEM_TYPE(tmp)) {
1331         default:
1332                 abort();
1333         case CODESYS_AUTODETECT:
1334                 return Qundecided;
1335 #ifdef MULE
1336         case CODESYS_SHIFT_JIS:
1337                 return Qshift_jis;
1338         case CODESYS_ISO2022:
1339                 return Qiso2022;
1340         case CODESYS_BIG5:
1341                 return Qbig5;
1342         case CODESYS_UCS4:
1343                 return Qucs4;
1344         case CODESYS_UTF8:
1345                 return Qutf8;
1346         case CODESYS_CCL:
1347                 return Qccl;
1348 #endif
1349         case CODESYS_NO_CONVERSION:
1350                 return Qno_conversion;
1351 #ifdef DEBUG_SXEMACS
1352         case CODESYS_INTERNAL:
1353                 return Qinternal;
1354 #endif
1355         }
1356 }
1357
1358 #ifdef MULE
1359 static
1360 Lisp_Object coding_system_charset(Lisp_Object coding_system, int gnum)
1361 {
1362         Lisp_Object cs
1363             = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET(coding_system, gnum);
1364
1365         return CHARSETP(cs) ? XCHARSET_NAME(cs) : Qnil;
1366 }
1367
1368 DEFUN("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1369 Return initial charset of CODING-SYSTEM designated to GNUM.
1370 GNUM allows 0 .. 3.
1371 */
1372       (coding_system, gnum))
1373 {
1374         coding_system = Fget_coding_system(coding_system);
1375         CHECK_INT(gnum);
1376
1377         return coding_system_charset(coding_system, XINT(gnum));
1378 }
1379 #endif                          /* MULE */
1380
1381 DEFUN("coding-system-property", Fcoding_system_property, 2, 2, 0,       /*
1382 Return the PROP property of CODING-SYSTEM.
1383 */
1384       (coding_system, prop))
1385 {
1386         int i, ok = 0;
1387         enum coding_system_type type;
1388
1389         coding_system = Fget_coding_system(coding_system);
1390         CHECK_SYMBOL(prop);
1391         type = XCODING_SYSTEM_TYPE(coding_system);
1392
1393         for (i = 0; !ok && i < Dynarr_length(the_codesys_prop_dynarr); i++)
1394                 if (EQ(Dynarr_at(the_codesys_prop_dynarr, i).sym, prop)) {
1395                         ok = 1;
1396                         switch (Dynarr_at(the_codesys_prop_dynarr, i).prop_type) {
1397                         case CODESYS_PROP_ALL_OK:
1398                                 break;
1399 #ifdef MULE
1400                         case CODESYS_PROP_ISO2022:
1401                                 if (type != CODESYS_ISO2022)
1402                                         signal_simple_error
1403                                             ("Property only valid in ISO2022 coding systems",
1404                                              prop);
1405                                 break;
1406
1407                         case CODESYS_PROP_CCL:
1408                                 if (type != CODESYS_CCL)
1409                                         signal_simple_error
1410                                             ("Property only valid in CCL coding systems",
1411                                              prop);
1412                                 break;
1413 #endif                          /* MULE */
1414                         default:
1415                                 abort();
1416                         }
1417                 }
1418
1419         if (!ok)
1420                 signal_simple_error("Unrecognized property", prop);
1421
1422         if (EQ(prop, Qname))
1423                 return XCODING_SYSTEM_NAME(coding_system);
1424         else if (EQ(prop, Qtype))
1425                 return Fcoding_system_type(coding_system);
1426         else if (EQ(prop, Qdoc_string))
1427                 return XCODING_SYSTEM_DOC_STRING(coding_system);
1428         else if (EQ(prop, Qmnemonic))
1429                 return XCODING_SYSTEM_MNEMONIC(coding_system);
1430         else if (EQ(prop, Qeol_type))
1431                 return
1432                     eol_type_to_symbol(XCODING_SYSTEM_EOL_TYPE(coding_system));
1433         else if (EQ(prop, Qeol_lf))
1434                 return XCODING_SYSTEM_EOL_LF(coding_system);
1435         else if (EQ(prop, Qeol_crlf))
1436                 return XCODING_SYSTEM_EOL_CRLF(coding_system);
1437         else if (EQ(prop, Qeol_cr))
1438                 return XCODING_SYSTEM_EOL_CR(coding_system);
1439         else if (EQ(prop, Qpost_read_conversion))
1440                 return XCODING_SYSTEM_POST_READ_CONVERSION(coding_system);
1441         else if (EQ(prop, Qpre_write_conversion))
1442                 return XCODING_SYSTEM_PRE_WRITE_CONVERSION(coding_system);
1443 #ifdef MULE
1444         else if (type == CODESYS_ISO2022) {
1445                 if (EQ(prop, Qcharset_g0))
1446                         return coding_system_charset(coding_system, 0);
1447                 else if (EQ(prop, Qcharset_g1))
1448                         return coding_system_charset(coding_system, 1);
1449                 else if (EQ(prop, Qcharset_g2))
1450                         return coding_system_charset(coding_system, 2);
1451                 else if (EQ(prop, Qcharset_g3))
1452                         return coding_system_charset(coding_system, 3);
1453
1454 #define FORCE_CHARSET(charset_num) \
1455   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1456    (coding_system, charset_num) ? Qt : Qnil)
1457
1458                 else if (EQ(prop, Qforce_g0_on_output))
1459                         return FORCE_CHARSET(0);
1460                 else if (EQ(prop, Qforce_g1_on_output))
1461                         return FORCE_CHARSET(1);
1462                 else if (EQ(prop, Qforce_g2_on_output))
1463                         return FORCE_CHARSET(2);
1464                 else if (EQ(prop, Qforce_g3_on_output))
1465                         return FORCE_CHARSET(3);
1466
1467 #define LISP_BOOLEAN(prop) \
1468   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1469
1470                 else if (EQ(prop, Qshort))
1471                         return LISP_BOOLEAN(SHORT);
1472                 else if (EQ(prop, Qno_ascii_eol))
1473                         return LISP_BOOLEAN(NO_ASCII_EOL);
1474                 else if (EQ(prop, Qno_ascii_cntl))
1475                         return LISP_BOOLEAN(NO_ASCII_CNTL);
1476                 else if (EQ(prop, Qseven))
1477                         return LISP_BOOLEAN(SEVEN);
1478                 else if (EQ(prop, Qlock_shift))
1479                         return LISP_BOOLEAN(LOCK_SHIFT);
1480                 else if (EQ(prop, Qno_iso6429))
1481                         return LISP_BOOLEAN(NO_ISO6429);
1482                 else if (EQ(prop, Qescape_quoted))
1483                         return LISP_BOOLEAN(ESCAPE_QUOTED);
1484
1485                 else if (EQ(prop, Qinput_charset_conversion))
1486                         return
1487                             unparse_charset_conversion_specs
1488                             (XCODING_SYSTEM(coding_system)->iso2022.input_conv);
1489                 else if (EQ(prop, Qoutput_charset_conversion))
1490                         return
1491                             unparse_charset_conversion_specs
1492                             (XCODING_SYSTEM(coding_system)->iso2022.
1493                              output_conv);
1494                 else
1495                         abort();
1496         } else if (type == CODESYS_CCL) {
1497                 if (EQ(prop, Qdecode))
1498                         return XCODING_SYSTEM_CCL_DECODE(coding_system);
1499                 else if (EQ(prop, Qencode))
1500                         return XCODING_SYSTEM_CCL_ENCODE(coding_system);
1501                 else
1502                         abort();
1503         }
1504 #endif                          /* MULE */
1505         else
1506                 abort();
1507
1508         return Qnil;            /* not reached */
1509 }
1510 \f
1511 /************************************************************************/
1512 /*                       Coding category functions                      */
1513 /************************************************************************/
1514
1515 static int decode_coding_category(Lisp_Object symbol)
1516 {
1517         int i;
1518
1519         CHECK_SYMBOL(symbol);
1520         for (i = 0; i < CODING_CATEGORY_LAST; i++)
1521                 if (EQ(coding_category_symbol[i], symbol))
1522                         return i;
1523
1524         signal_simple_error("Unrecognized coding category", symbol);
1525         return 0;               /* not reached */
1526 }
1527
1528 DEFUN("coding-category-list", Fcoding_category_list, 0, 0, 0,   /*
1529 Return a list of all recognized coding categories.
1530 */
1531       ())
1532 {
1533         int i;
1534         Lisp_Object list = Qnil;
1535
1536         for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1537                 list = Fcons(coding_category_symbol[i], list);
1538         return list;
1539 }
1540
1541 DEFUN("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0,   /*
1542 Change the priority order of the coding categories.
1543 LIST should be list of coding categories, in descending order of
1544 priority.  Unspecified coding categories will be lower in priority
1545 than all specified ones, in the same relative order they were in
1546 previously.
1547 */
1548       (list))
1549 {
1550         int category_to_priority[CODING_CATEGORY_LAST];
1551         int i, j;
1552         Lisp_Object rest;
1553
1554         /* First generate a list that maps coding categories to priorities. */
1555
1556         for (i = 0; i < CODING_CATEGORY_LAST; i++)
1557                 category_to_priority[i] = -1;
1558
1559         /* Highest priority comes from the specified list. */
1560         i = 0;
1561         EXTERNAL_LIST_LOOP(rest, list) {
1562                 int cat = decode_coding_category(XCAR(rest));
1563
1564                 if (category_to_priority[cat] >= 0)
1565                         signal_simple_error("Duplicate coding category in list",
1566                                             XCAR(rest));
1567                 category_to_priority[cat] = i++;
1568         }
1569
1570         /* Now go through the existing categories by priority to retrieve
1571            the categories not yet specified and preserve their priority
1572            order. */
1573         for (j = 0; j < CODING_CATEGORY_LAST; j++) {
1574                 int cat = fcd->coding_category_by_priority[j];
1575                 if (category_to_priority[cat] < 0)
1576                         category_to_priority[cat] = i++;
1577         }
1578
1579         /* Now we need to construct the inverse of the mapping we just
1580            constructed. */
1581
1582         for (i = 0; i < CODING_CATEGORY_LAST; i++)
1583                 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1584
1585         /* Phew!  That was confusing. */
1586         return Qnil;
1587 }
1588
1589 DEFUN("coding-priority-list", Fcoding_priority_list, 0, 0, 0,   /*
1590 Return a list of coding categories in descending order of priority.
1591 */
1592       ())
1593 {
1594         int i;
1595         Lisp_Object list = Qnil;
1596
1597         for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1598                 list =
1599                     Fcons(coding_category_symbol
1600                           [fcd->coding_category_by_priority[i]], list);
1601         return list;
1602 }
1603
1604 DEFUN("set-coding-category-system", Fset_coding_category_system, 2, 2, 0,       /*
1605 Change the coding system associated with a coding category.
1606 */
1607       (coding_category, coding_system))
1608 {
1609         int cat = decode_coding_category(coding_category);
1610
1611         coding_system = Fget_coding_system(coding_system);
1612         fcd->coding_category_system[cat] = coding_system;
1613         return Qnil;
1614 }
1615
1616 DEFUN("coding-category-system", Fcoding_category_system, 1, 1, 0,       /*
1617 Return the coding system associated with a coding category.
1618 */
1619       (coding_category))
1620 {
1621         int cat = decode_coding_category(coding_category);
1622         Lisp_Object sys = fcd->coding_category_system[cat];
1623
1624         if (!NILP(sys))
1625                 return XCODING_SYSTEM_NAME(sys);
1626         return Qnil;
1627 }
1628 \f
1629 /************************************************************************/
1630 /*                     Detecting the encoding of data                   */
1631 /************************************************************************/
1632
1633 struct detection_state {
1634         eol_type_t eol_type;
1635         int seen_non_ascii;
1636         int mask;
1637 #ifdef MULE
1638         struct {
1639                 int mask;
1640                 int in_second_byte;
1641         } big5;
1642
1643         struct {
1644                 int mask;
1645                 int in_second_byte;
1646         } shift_jis;
1647
1648         struct {
1649                 int mask;
1650                 int in_byte;
1651         } ucs4;
1652
1653         struct {
1654                 int mask;
1655                 int in_byte;
1656         } utf8;
1657
1658         struct {
1659                 int mask;
1660                 int initted;
1661                 struct iso2022_decoder iso;
1662                 unsigned int flags;
1663                 int high_byte_count;
1664                 unsigned int saw_single_shift:1;
1665         } iso2022;
1666 #endif
1667         struct {
1668                 int seen_anything;
1669                 int just_saw_cr;
1670         } eol;
1671 };
1672
1673 static int acceptable_control_char_p(int c)
1674 {
1675         switch (c) {
1676                 /* Allow and ignore control characters that you might
1677                    reasonably see in a text file */
1678         case '\r':
1679         case '\n':
1680         case '\t':
1681         case 7:         /* bell */
1682         case 8:         /* backspace */
1683         case 11:                /* vertical tab */
1684         case 12:                /* form feed */
1685         case 26:                /* MS-DOS C-z junk */
1686         case 31:                /* '^_' -- for info */
1687                 return 1;
1688         default:
1689                 return 0;
1690         }
1691 }
1692
1693 static int mask_has_at_most_one_bit_p(int mask)
1694 {
1695         /* Perhaps the only thing useful you learn from intensive Microsoft
1696            technical interviews */
1697         return (mask & (mask - 1)) == 0;
1698 }
1699
1700 static eol_type_t
1701 detect_eol_type(struct detection_state *st, const Extbyte * src,
1702                 Lstream_data_count n)
1703 {
1704         while (n--) {
1705                 const unsigned char c = *(const unsigned char*)src++;
1706                 if (c == '\n') {
1707                         if (st->eol.just_saw_cr)
1708                                 return EOL_CRLF;
1709                         else if (st->eol.seen_anything)
1710                                 return EOL_LF;
1711                 } else if (st->eol.just_saw_cr)
1712                         return EOL_CR;
1713                 else if (c == '\r')
1714                         st->eol.just_saw_cr = 1;
1715                 else
1716                         st->eol.just_saw_cr = 0;
1717                 st->eol.seen_anything = 1;
1718         }
1719
1720         return EOL_AUTODETECT;
1721 }
1722
1723 /* Attempt to determine the encoding and EOL type of the given text.
1724    Before calling this function for the first type, you must initialize
1725    st->eol_type as appropriate and initialize st->mask to ~0.
1726
1727    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1728    not yet known.
1729
1730    st->mask holds the determined coding category mask, or ~0 if only
1731    ASCII has been seen so far.
1732
1733    Returns:
1734
1735    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1736         is present in st->mask
1737    1 == definitive answers are here for both st->eol_type and st->mask
1738 */
1739
1740 static int
1741 detect_coding_type(struct detection_state *st, const Extbyte * src,
1742                    Lstream_data_count n, int just_do_eol)
1743 {
1744         if (st->eol_type == EOL_AUTODETECT)
1745                 st->eol_type = detect_eol_type(st, src, n);
1746
1747         if (just_do_eol)
1748                 return st->eol_type != EOL_AUTODETECT;
1749
1750         if (!st->seen_non_ascii) {
1751                 for (; n; n--, src++) {
1752                         const unsigned char c = *(const unsigned char *)src;
1753                         if ((c < 0x20 && !acceptable_control_char_p(c))
1754                             || c >= 0x80) {
1755                                 st->seen_non_ascii = 1;
1756 #ifdef MULE
1757                                 st->shift_jis.mask = ~0;
1758                                 st->big5.mask = ~0;
1759                                 st->ucs4.mask = ~0;
1760                                 st->utf8.mask = ~0;
1761                                 st->iso2022.mask = ~0;
1762 #endif
1763                                 break;
1764                         }
1765                 }
1766         }
1767
1768         if (!n) {
1769                 return 0;
1770         }
1771 #ifdef MULE
1772         if (!mask_has_at_most_one_bit_p(st->iso2022.mask))
1773                 st->iso2022.mask = detect_coding_iso2022(st, src, n);
1774         if (!mask_has_at_most_one_bit_p(st->shift_jis.mask))
1775                 st->shift_jis.mask = detect_coding_sjis(st, src, n);
1776         if (!mask_has_at_most_one_bit_p(st->big5.mask))
1777                 st->big5.mask = detect_coding_big5(st, src, n);
1778         if (!mask_has_at_most_one_bit_p(st->utf8.mask))
1779                 st->utf8.mask = detect_coding_utf8(st, src, n);
1780         if (!mask_has_at_most_one_bit_p(st->ucs4.mask))
1781                 st->ucs4.mask = detect_coding_ucs4(st, src, n);
1782
1783         st->mask = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1784                 | st->utf8.mask | st->ucs4.mask;
1785 #endif
1786         {
1787                 int retval = mask_has_at_most_one_bit_p(st->mask);
1788                 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1789                 return retval && st->eol_type != EOL_AUTODETECT;
1790         }
1791 }
1792
1793 static Lisp_Object coding_system_from_mask(int mask)
1794 {
1795         if (mask == ~0) {
1796                 /* If the file was entirely or basically ASCII, use the
1797                    default value of `buffer-file-coding-system'. */
1798                 Lisp_Object retval =
1799                     XBUFFER(Vbuffer_defaults)->buffer_file_coding_system;
1800                 if (!NILP(retval)) {
1801                         retval = Ffind_coding_system(retval);
1802                         if (NILP(retval)) {
1803                                 warn_when_safe
1804                                     (Qbad_variable, Qwarning,
1805                                      "Invalid `default-buffer-file-coding-system', set to nil");
1806                                 XBUFFER(Vbuffer_defaults)->
1807                                     buffer_file_coding_system = Qnil;
1808                         }
1809                 }
1810                 if (NILP(retval))
1811                         retval = Fget_coding_system(Qraw_text);
1812                 return retval;
1813         } else {
1814                 int i;
1815                 int cat = -1;
1816 #ifdef MULE
1817                 mask = postprocess_iso2022_mask(mask);
1818 #endif
1819                 /* Look through the coding categories by priority and find
1820                    the first one that is allowed. */
1821                 for (i = 0; i < CODING_CATEGORY_LAST; i++) {
1822                         cat = fcd->coding_category_by_priority[i];
1823                         if ((mask & (1 << cat)) &&
1824                             !NILP(fcd->coding_category_system[cat]))
1825                                 break;
1826                 }
1827                 if (cat >= 0)
1828                         return fcd->coding_category_system[cat];
1829                 else
1830                         return Fget_coding_system(Qraw_text);
1831         }
1832 }
1833
1834 /* Given a seekable read stream and potential coding system and EOL type
1835    as specified, do any autodetection that is called for.  If the
1836    coding system and/or EOL type are not `autodetect', they will be left
1837    alone; but this function will never return an autodetect coding system
1838    or EOL type.
1839
1840    This function does not automatically fetch subsidiary coding systems;
1841    that should be unnecessary with the explicit eol-type argument. */
1842
1843 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1844 /* number of leading lines to check for a coding cookie */
1845 #define LINES_TO_CHECK 2
1846
1847 void
1848 determine_real_coding_system(lstream_t stream, Lisp_Object * codesys_in_out,
1849                              eol_type_t * eol_type_in_out)
1850 {
1851         struct detection_state decst;
1852
1853         if (*eol_type_in_out == EOL_AUTODETECT)
1854                 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE(*codesys_in_out);
1855
1856         xzero(decst);
1857         decst.eol_type = *eol_type_in_out;
1858         decst.mask = ~0;
1859
1860         /* If autodetection is called for, do it now. */
1861         if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT
1862             || *eol_type_in_out == EOL_AUTODETECT) {
1863                 Extbyte buf[4096];
1864                 Lisp_Object coding_system = Qnil;
1865                 Extbyte *p;
1866                 Lstream_data_count nread =
1867                     Lstream_read(stream, buf, sizeof(buf));
1868                 Extbyte *scan_end;
1869                 int lines_checked = 0;
1870
1871                 /* Look for initial "-*-"; mode line prefix */
1872                 for (p = buf,
1873                      scan_end = buf + nread - LENGTH("-*-coding:?-*-");
1874                      p <= scan_end && lines_checked < LINES_TO_CHECK; p++)
1875                         if (*p == '-' && *(p + 1) == '*' && *(p + 2) == '-') {
1876                                 Extbyte *local_vars_beg = p + 3;
1877                                 /* Look for final "-*-"; mode line suffix */
1878                                 for (p = local_vars_beg,
1879                                      scan_end = buf + nread - LENGTH("-*-");
1880                                      p <= scan_end
1881                                      && lines_checked < LINES_TO_CHECK; p++)
1882                                         if (*p == '-' && *(p + 1) == '*'
1883                                             && *(p + 2) == '-') {
1884                                                 Extbyte *suffix = p;
1885                                                 /* Look for "coding:" */
1886                                                 for (p = local_vars_beg,
1887                                                      scan_end =
1888                                                      suffix -
1889                                                      LENGTH("coding:?");
1890                                                      p <= scan_end; p++)
1891                                                         if (memcmp
1892                                                             ("coding:", p,
1893                                                              LENGTH("coding:"))
1894                                                             == 0
1895                                                             && (p ==
1896                                                                 local_vars_beg
1897                                                                 || (*(p - 1) ==
1898                                                                     ' '
1899                                                                     || *(p -
1900                                                                          1) ==
1901                                                                     '\t'
1902                                                                     || *(p -
1903                                                                          1) ==
1904                                                                     ';'))) {
1905                                                                 Extbyte save;
1906                                                                 int n;
1907                                                                 p += LENGTH
1908                                                                     ("coding:");
1909                                                                 while (*p == ' '
1910                                                                        || *p ==
1911                                                                        '\t')
1912                                                                         p++;
1913
1914                                                                 /* Get coding system name */
1915                                                                 save = *suffix;
1916                                                                 *suffix = '\0';
1917                                                                 /* Characters valid in a MIME charset name (rfc 1521),
1918                                                                    and in a Lisp symbol name. */
1919                                                                 n = strspn((char
1920                                                                             *)p,
1921                                                                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1922                                                                            "abcdefghijklmnopqrstuvwxyz"
1923                                                                            "0123456789"
1924                                                                            "!$%&*+-.^_{|}~");
1925                                                                 *suffix = save;
1926                                                                 if (n > 0) {
1927                                                                         save =
1928                                                                             p
1929                                                                             [n];
1930                                                                         p[n] =
1931                                                                             '\0';
1932                                                                         coding_system
1933                                                                             =
1934                                                                             Ffind_coding_system
1935                                                                             (intern
1936                                                                              ((char *)p));
1937                                                                         p[n] =
1938                                                                             save;
1939                                                                 }
1940                                                                 break;
1941                                                         }
1942                                                 break;
1943                                         }
1944                                 /* #### file must use standard EOLs or we miss 2d line */
1945                                 /* #### not to mention this is broken for UTF-16 DOS files */
1946                                         else if (*p == '\n' || *p == '\r') {
1947                                                 lines_checked++;
1948                                                 /* skip past multibyte (DOS) newline */
1949                                                 if (*p == '\r'
1950                                                     && *(p + 1) == '\n')
1951                                                         p++;
1952                                         }
1953                                 break;
1954                         }
1955                 /* #### file must use standard EOLs or we miss 2d line */
1956                 /* #### not to mention this is broken for UTF-16 DOS files */
1957                         else if (*p == '\n' || *p == '\r') {
1958                                 lines_checked++;
1959                                 /* skip past multibyte (DOS) newline */
1960                                 if (*p == '\r' && *(p + 1) == '\n')
1961                                         p++;
1962                         }
1963
1964                 if (NILP(coding_system))
1965                         do {
1966                                 if (detect_coding_type(&decst, buf, nread,
1967                                                        XCODING_SYSTEM_TYPE
1968                                                        (*codesys_in_out)
1969                                                        != CODESYS_AUTODETECT))
1970                                         break;
1971                                 nread = Lstream_read(stream, buf, sizeof(buf));
1972                                 if (nread == 0)
1973                                         break;
1974                         }
1975                         while (1);
1976
1977                 else if (XCODING_SYSTEM_TYPE(*codesys_in_out) ==
1978                          CODESYS_AUTODETECT
1979                          && XCODING_SYSTEM_EOL_TYPE(coding_system) ==
1980                          EOL_AUTODETECT)
1981                         do {
1982                                 if (detect_coding_type(&decst, buf, nread, 1))
1983                                         break;
1984                                 nread = Lstream_read(stream, buf, sizeof(buf));
1985                                 if (!nread)
1986                                         break;
1987                         }
1988                         while (1);
1989
1990                 *eol_type_in_out = decst.eol_type;
1991                 if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT) {
1992                         if (NILP(coding_system))
1993                                 *codesys_in_out =
1994                                     coding_system_from_mask(decst.mask);
1995                         else
1996                                 *codesys_in_out = coding_system;
1997                 }
1998         }
1999
2000         /* If we absolutely can't determine the EOL type, just assume LF. */
2001         if (*eol_type_in_out == EOL_AUTODETECT)
2002                 *eol_type_in_out = EOL_LF;
2003
2004         Lstream_rewind(stream);
2005 }
2006
2007 DEFUN("detect-coding-region", Fdetect_coding_region, 2, 3, 0,   /*
2008 Detect coding system of the text in the region between START and END.
2009 Return a list of possible coding systems ordered by priority.
2010 If only ASCII characters are found, return 'undecided or one of
2011 its subsidiary coding systems according to a detected end-of-line
2012 type.  Optional arg BUFFER defaults to the current buffer.
2013 */
2014       (start, end, buffer))
2015 {
2016         Lisp_Object val = Qnil;
2017         struct buffer *buf = decode_buffer(buffer, 0);
2018         Bufpos b, e;
2019         Lisp_Object instream, lb_instream;
2020         lstream_t istr, lb_istr;
2021         struct detection_state decst;
2022         struct gcpro gcpro1, gcpro2;
2023
2024         get_buffer_range_char(buf, start, end, &b, &e, 0);
2025         lb_instream = make_lisp_buffer_input_stream(buf, b, e, 0);
2026         lb_istr = XLSTREAM(lb_instream);
2027         instream =
2028             make_encoding_input_stream(lb_istr, Fget_coding_system(Qbinary));
2029         istr = XLSTREAM(instream);
2030         GCPRO2(instream, lb_instream);
2031         xzero(decst);
2032         decst.eol_type = EOL_AUTODETECT;
2033         decst.mask = ~0;
2034         while (1) {
2035                 Extbyte random_buffer[4096];
2036                 Lstream_data_count nread =
2037                     Lstream_read(istr, random_buffer, sizeof(random_buffer));
2038
2039                 if (!nread)
2040                         break;
2041                 if (detect_coding_type(&decst, random_buffer, nread, 0))
2042                         break;
2043         }
2044
2045         if (decst.mask == ~0)
2046                 val = subsidiary_coding_system(Fget_coding_system(Qundecided),
2047                                                decst.eol_type);
2048         else {
2049                 int i;
2050
2051                 val = Qnil;
2052 #ifdef MULE
2053                 decst.mask = postprocess_iso2022_mask(decst.mask);
2054 #endif
2055                 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--) {
2056                         int sys = fcd->coding_category_by_priority[i];
2057                         if (decst.mask & (1 << sys)) {
2058                                 Lisp_Object codesys =
2059                                     fcd->coding_category_system[sys];
2060                                 if (!NILP(codesys))
2061                                         codesys =
2062                                             subsidiary_coding_system(codesys,
2063                                                                      decst.
2064                                                                      eol_type);
2065                                 val = Fcons(codesys, val);
2066                         }
2067                 }
2068         }
2069         Lstream_close(istr);
2070         UNGCPRO;
2071         Lstream_delete(istr);
2072         Lstream_delete(lb_istr);
2073         return val;
2074 }
2075 \f
2076 /************************************************************************/
2077 /*           Converting to internal Mule format ("decoding")            */
2078 /************************************************************************/
2079
2080 /* A decoding stream is a stream used for decoding text (i.e.
2081    converting from some external format to internal format).
2082    The decoding-stream object keeps track of the actual coding
2083    stream, the stream that is at the other end, and data that
2084    needs to be persistent across the lifetime of the stream. */
2085
2086 /* Handle the EOL stuff related to just-read-in character C.
2087    EOL_TYPE is the EOL type of the coding stream.
2088    FLAGS is the current value of FLAGS in the coding stream, and may
2089    be modified by this macro.  (The macro only looks at the
2090    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
2091    bytes are to be written.  You need to also define a local goto
2092    label "label_continue_loop" that is at the end of the main
2093    character-reading loop.
2094
2095    If C is a CR character, then this macro handles it entirely and
2096    jumps to label_continue_loop.  Otherwise, this macro does not add
2097    anything to DST, and continues normally.  You should continue
2098    processing C normally after this macro. */
2099
2100 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
2101 do {                                                            \
2102   if (c == '\r')                                                \
2103     {                                                           \
2104       if (eol_type == EOL_CR)                                   \
2105         Dynarr_add (dst, '\n');                                 \
2106       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2107         Dynarr_add (dst, c);                                    \
2108       else                                                      \
2109         flags |= CODING_STATE_CR;                               \
2110       goto label_continue_loop;                                 \
2111     }                                                           \
2112   else if (flags & CODING_STATE_CR)                             \
2113     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
2114       if (c != '\n')                                            \
2115         Dynarr_add (dst, '\r');                                 \
2116       flags &= ~CODING_STATE_CR;                                \
2117     }                                                           \
2118 } while (0)
2119
2120 /* C should be a binary character in the range 0 - 255; convert
2121    to internal format and add to Dynarr DST. */
2122
2123 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
2124 do {                                            \
2125   if (BYTE_ASCII_P (c))                         \
2126     Dynarr_add (dst, c);                        \
2127   else if (BYTE_C1_P (c))                       \
2128     {                                           \
2129       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2130       Dynarr_add (dst, c + 0x20);               \
2131     }                                           \
2132   else                                          \
2133     {                                           \
2134       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2135       Dynarr_add (dst, c);                      \
2136     }                                           \
2137 } while (0)
2138
2139 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
2140 do {                                    \
2141   if (ch)                               \
2142     {                                   \
2143       DECODE_ADD_BINARY_CHAR (ch, dst); \
2144       ch = 0;                           \
2145     }                                   \
2146 } while (0)
2147
2148 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2149 do {                                    \
2150   if (flags & CODING_STATE_END)         \
2151     {                                   \
2152       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
2153       if (flags & CODING_STATE_CR)      \
2154         Dynarr_add (dst, '\r');         \
2155     }                                   \
2156 } while (0)
2157
2158 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2159
2160 typedef struct decoding_stream_s *decoding_stream_t;
2161 struct decoding_stream_s {
2162         /* Coding system that governs the conversion. */
2163         Lisp_Coding_System *codesys;
2164
2165         /* Stream that we read the encoded data from or
2166            write the decoded data to. */
2167         lstream_t other_end;
2168
2169         /* If we are reading, then we can return only a fixed amount of
2170            data, so if the conversion resulted in too much data, we store it
2171            here for retrieval the next time around. */
2172         unsigned_char_dynarr *runoff;
2173
2174         /* FLAGS holds flags indicating the current state of the decoding.
2175            Some of these flags are dependent on the coding system. */
2176         unsigned int flags;
2177
2178         /* CH holds a partially built-up character.  Since we only deal
2179            with one- and two-byte characters at the moment, we only use
2180            this to store the first byte of a two-byte character. */
2181         unsigned int ch;
2182
2183         /* EOL_TYPE specifies the type of end-of-line conversion that
2184            currently applies.  We need to keep this separate from the
2185            EOL type stored in CODESYS because the latter might indicate
2186            automatic EOL-type detection while the former will always
2187            indicate a particular EOL type. */
2188         eol_type_t eol_type;
2189 #ifdef MULE
2190         /* Additional ISO2022 information.  We define the structure above
2191            because it's also needed by the detection routines. */
2192         struct iso2022_decoder iso2022;
2193
2194         /* Additional information (the state of the running CCL program)
2195            used by the CCL decoder. */
2196         struct ccl_program ccl;
2197
2198         /* counter for UTF-8 or UCS-4 */
2199         unsigned char counter;
2200 #endif
2201         struct detection_state decst;
2202 };
2203
2204 static Lstream_data_count
2205 decoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size);
2206 static Lstream_data_count
2207 decoding_writer(lstream_t stream,
2208                 const unsigned char *data, Lstream_data_count size);
2209 static int decoding_rewinder(lstream_t stream);
2210 static int decoding_seekable_p(lstream_t stream);
2211 static int decoding_flusher(lstream_t stream);
2212 static int decoding_closer(lstream_t stream);
2213
2214 static Lisp_Object decoding_marker(Lisp_Object stream);
2215
2216 DEFINE_LSTREAM_IMPLEMENTATION("decoding", lstream_decoding,
2217                               sizeof(struct decoding_stream_s));
2218
2219 static Lisp_Object
2220 decoding_marker(Lisp_Object stream)
2221 {
2222         lstream_t str = DECODING_STREAM_DATA(XLSTREAM(stream))->other_end;
2223         Lisp_Object str_obj;
2224
2225         /* We do not need to mark the coding systems or charsets stored
2226            within the stream because they are stored in a global list
2227            and automatically marked. */
2228
2229         XSETLSTREAM(str_obj, str);
2230         mark_object(str_obj);
2231         if (str->imp->marker) {
2232                 return str->imp->marker(str_obj);
2233         } else {
2234                 return Qnil;
2235         }
2236 }
2237
2238 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
2239    so we read data from the other end, decode it, and store it into DATA. */
2240
2241 static Lstream_data_count
2242 decoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size)
2243 {
2244         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2245         unsigned char *orig_data = data;
2246         Lstream_data_count read_size;
2247         int error_occurred = 0;
2248
2249         /* We need to interface to mule_decode(), which expects to take some
2250            amount of data and store the result into a Dynarr.  We have
2251            mule_decode() store into str->runoff, and take data from there
2252            as necessary. */
2253
2254         /* We loop until we have enough data, reading chunks from the other
2255            end and decoding it. */
2256         while (1) {
2257                 /* Take data from the runoff if we can.  Make sure to take at
2258                    most SIZE bytes, and delete the data from the runoff. */
2259                 if (Dynarr_length(str->runoff) > 0) {
2260                         Lstream_data_count chunk =
2261                                 min(size,
2262                                     (Lstream_data_count)
2263                                     Dynarr_length(str->runoff));
2264                         memcpy(data, Dynarr_atp(str->runoff, 0), chunk);
2265                         Dynarr_delete_many(str->runoff, 0, chunk);
2266                         data += chunk;
2267                         size -= chunk;
2268                 }
2269
2270                 if (size == 0) {
2271                         /* No more room for data */
2272                         break;
2273                 }
2274
2275                 if (str->flags & CODING_STATE_END) {
2276                         /* This means that on the previous iteration, we hit the
2277                            EOF on the other end.  We loop once more so that
2278                            mule_decode() can output any final stuff it may be
2279                            holding, or any "go back to a sane state" escape
2280                            sequences. (This latter makes sense during
2281                            encoding.) */
2282                         break;
2283                 }
2284
2285                 /* Exhausted the runoff, so get some more.  DATA has at least
2286                    SIZE bytes left of storage in it, so it's OK to read directly
2287                    into it.  (We'll be overwriting above, after we've decoded it
2288                    into the runoff.) */
2289                 read_size = Lstream_read(str->other_end, data, size);
2290                 if (read_size < 0) {
2291                         error_occurred = 1;
2292                         break;
2293                 }
2294                 if (read_size == 0) {
2295                         /* There might be some more end data produced in the
2296                            translation.  See the comment above. */
2297                         str->flags |= CODING_STATE_END;
2298                 }
2299                 mule_decode(stream, (Extbyte *) data, str->runoff, read_size);
2300         }
2301
2302         if (data - orig_data == 0) {
2303                 return error_occurred ? -1 : 0;
2304         } else {
2305                 return data - orig_data;
2306         }
2307 }
2308
2309 static Lstream_data_count
2310 decoding_writer(lstream_t stream, const unsigned char *data,
2311                 Lstream_data_count size)
2312 {
2313         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2314         Lstream_data_count retval;
2315
2316         /* Decode all our data into the runoff, and then attempt to write
2317            it all out to the other end.  Remove whatever chunk we succeeded
2318            in writing. */
2319         mule_decode(stream, (const Extbyte *)data, str->runoff, size);
2320         retval = Lstream_write(str->other_end, Dynarr_atp(str->runoff, 0),
2321                                Dynarr_length(str->runoff));
2322         if (retval > 0) {
2323                 Dynarr_delete_many(str->runoff, 0, retval);
2324         }
2325         /* Do NOT return retval.  The return value indicates how much
2326            of the incoming data was written, not how many bytes were
2327            written. */
2328         return size;
2329 }
2330
2331 static void
2332 reset_decoding_stream(decoding_stream_t str)
2333 {
2334 #ifdef MULE
2335         if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_ISO2022) {
2336                 Lisp_Object coding_system;
2337                 XSETCODING_SYSTEM(coding_system, str->codesys);
2338                 reset_iso2022(coding_system, &str->iso2022);
2339         } else if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_CCL) {
2340                 setup_ccl_program(&str->ccl,
2341                                   CODING_SYSTEM_CCL_DECODE(str->codesys));
2342         }
2343         str->counter = 0;
2344 #endif                          /* MULE */
2345         if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT
2346             || CODING_SYSTEM_EOL_TYPE(str->codesys) == EOL_AUTODETECT) {
2347                 xzero(str->decst);
2348                 str->decst.eol_type = EOL_AUTODETECT;
2349                 str->decst.mask = ~0;
2350         }
2351         str->flags = str->ch = 0;
2352 }
2353
2354 static int
2355 decoding_rewinder(lstream_t stream)
2356 {
2357         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2358         reset_decoding_stream(str);
2359         Dynarr_reset(str->runoff);
2360         return Lstream_rewind(str->other_end);
2361 }
2362
2363 static int
2364 decoding_seekable_p(lstream_t stream)
2365 {
2366         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2367         return Lstream_seekable_p(str->other_end);
2368 }
2369
2370 static int
2371 decoding_flusher(lstream_t stream)
2372 {
2373         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2374         return Lstream_flush(str->other_end);
2375 }
2376
2377 static int
2378 decoding_closer(lstream_t stream)
2379 {
2380         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2381         if (stream->flags & LSTREAM_FL_WRITE) {
2382                 str->flags |= CODING_STATE_END;
2383                 decoding_writer(stream, 0, 0);
2384         }
2385         Dynarr_free(str->runoff);
2386 #ifdef MULE
2387 #ifdef ENABLE_COMPOSITE_CHARS
2388         if (str->iso2022.composite_chars) {
2389                 Dynarr_free(str->iso2022.composite_chars);
2390         }
2391 #endif
2392 #endif
2393         return Lstream_close(str->other_end);
2394 }
2395
2396 Lisp_Object
2397 decoding_stream_coding_system(lstream_t stream)
2398 {
2399         Lisp_Object coding_system;
2400         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2401
2402         XSETCODING_SYSTEM(coding_system, str->codesys);
2403         return subsidiary_coding_system(coding_system, str->eol_type);
2404 }
2405
2406 void
2407 set_decoding_stream_coding_system(lstream_t lstr, Lisp_Object codesys)
2408 {
2409         Lisp_Coding_System *cs = XCODING_SYSTEM(codesys);
2410         decoding_stream_t str = DECODING_STREAM_DATA(lstr);
2411         str->codesys = cs;
2412         if (CODING_SYSTEM_EOL_TYPE(cs) != EOL_AUTODETECT) {
2413                 str->eol_type = CODING_SYSTEM_EOL_TYPE(cs);
2414         }
2415         reset_decoding_stream(str);
2416         return;
2417 }
2418
2419 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2420    stream for writing, no automatic code detection will be performed.
2421    The reason for this is that automatic code detection requires a
2422    seekable input.  Things will also fail if you open a decoding
2423    stream for reading using a non-fully-specified coding system and
2424    a non-seekable input stream. */
2425
2426 static Lisp_Object
2427 make_decoding_stream_1(lstream_t stream, Lisp_Object codesys, const char *mode)
2428 {
2429         lstream_t lstr = Lstream_new(lstream_decoding, mode);
2430         decoding_stream_t str = DECODING_STREAM_DATA(lstr);
2431         Lisp_Object obj;
2432
2433         xzero(*str);
2434         str->other_end = stream;
2435         str->runoff = (unsigned_char_dynarr *) Dynarr_new(unsigned_char);
2436         str->eol_type = EOL_AUTODETECT;
2437         if (!strcmp(mode, "r") && Lstream_seekable_p(stream)) {
2438                 /* We can determine the coding system now. */
2439                 determine_real_coding_system(stream, &codesys, &str->eol_type);
2440         }
2441         set_decoding_stream_coding_system(lstr, codesys);
2442         str->decst.eol_type = str->eol_type;
2443         str->decst.mask = ~0;
2444         XSETLSTREAM(obj, lstr);
2445         return obj;
2446 }
2447
2448 Lisp_Object
2449 make_decoding_input_stream(lstream_t stream, Lisp_Object codesys)
2450 {
2451         return make_decoding_stream_1(stream, codesys, "r");
2452 }
2453
2454 Lisp_Object
2455 make_decoding_output_stream(lstream_t stream, Lisp_Object codesys)
2456 {
2457         return make_decoding_stream_1(stream, codesys, "w");
2458 }
2459
2460 /* Note: the decode_coding_* functions all take the same
2461    arguments as mule_decode(), which is to say some SRC data of
2462    size N, which is to be stored into dynamic array DST.
2463    DECODING is the stream within which the decoding is
2464    taking place, but no data is actually read from or
2465    written to that stream; that is handled in decoding_reader()
2466    or decoding_writer().  This allows the same functions to
2467    be used for both reading and writing. */
2468
2469 static void
2470 mule_decode(lstream_t decoding, const Extbyte * src,
2471             unsigned_char_dynarr * dst, Lstream_data_count n)
2472 {
2473         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
2474
2475         /* If necessary, do encoding-detection now.  We do this when
2476            we're a writing stream or a non-seekable reading stream,
2477            meaning that we can't just process the whole input,
2478            rewind, and start over. */
2479
2480         if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT ||
2481             str->eol_type == EOL_AUTODETECT) {
2482                 Lisp_Object codesys;
2483
2484                 XSETCODING_SYSTEM(codesys, str->codesys);
2485                 detect_coding_type(&str->decst, src, n,
2486                                    CODING_SYSTEM_TYPE(str->codesys) !=
2487                                    CODESYS_AUTODETECT);
2488                 if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT &&
2489                     str->decst.mask != ~0)
2490                         /* #### This is cheesy.  What we really ought to do is
2491                            buffer up a certain amount of data so as to get a
2492                            less random result. */
2493                         codesys = coding_system_from_mask(str->decst.mask);
2494                 str->eol_type = str->decst.eol_type;
2495                 if (XCODING_SYSTEM(codesys) != str->codesys) {
2496                         /* Preserve the CODING_STATE_END flag in case it was set.
2497                            If we erase it, bad things might happen. */
2498                         int was_end = str->flags & CODING_STATE_END;
2499                         set_decoding_stream_coding_system(decoding, codesys);
2500                         if (was_end)
2501                                 str->flags |= CODING_STATE_END;
2502                 }
2503         }
2504
2505         switch (CODING_SYSTEM_TYPE(str->codesys)) {
2506 #ifdef DEBUG_SXEMACS
2507         case CODESYS_INTERNAL:
2508                 Dynarr_add_many(dst, src, n);
2509                 break;
2510 #endif
2511         case CODESYS_AUTODETECT:
2512                 /* If we got this far and still haven't decided on the coding
2513                    system, then do no conversion. */
2514         case CODESYS_NO_CONVERSION:
2515                 decode_coding_no_conversion(decoding, src, dst, n);
2516                 break;
2517 #ifdef MULE
2518         case CODESYS_SHIFT_JIS:
2519                 decode_coding_sjis(decoding, src, dst, n);
2520                 break;
2521         case CODESYS_BIG5:
2522                 decode_coding_big5(decoding, src, dst, n);
2523                 break;
2524         case CODESYS_UCS4:
2525                 decode_coding_ucs4(decoding, src, dst, n);
2526                 break;
2527         case CODESYS_UTF8:
2528                 decode_coding_utf8(decoding, src, dst, n);
2529                 break;
2530         case CODESYS_CCL:
2531                 str->ccl.last_block = str->flags & CODING_STATE_END;
2532                 /* When applying ccl program to stream, MUST NOT set NULL
2533                    pointer to src.  */
2534                 ccl_driver(&str->ccl,
2535                            (src
2536                             ? (const unsigned char *)src
2537                             : (const unsigned char *)""),
2538                            dst, n, 0, CCL_MODE_DECODING);
2539                 break;
2540         case CODESYS_ISO2022:
2541                 decode_coding_iso2022(decoding, src, dst, n);
2542                 break;
2543 #endif                          /* MULE */
2544         default:
2545                 abort();
2546         }
2547 }
2548
2549 DEFUN("decode-coding-region", Fdecode_coding_region, 3, 4, 0,   /*
2550 Decode the text between START and END which is encoded in CODING-SYSTEM.
2551 This is useful if you've read in encoded text from a file without decoding
2552 it (e.g. you read in a JIS-formatted file but used the `binary' or
2553 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2554 Return length of decoded text.
2555 BUFFER defaults to the current buffer if unspecified.
2556 */
2557       (start, end, coding_system, buffer))
2558 {
2559         Bufpos b, e;
2560         struct buffer *buf = decode_buffer(buffer, 0);
2561         Lisp_Object instream, lb_outstream, de_outstream, outstream;
2562         lstream_t istr, ostr;
2563         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2564
2565         get_buffer_range_char(buf, start, end, &b, &e, 0);
2566
2567         barf_if_buffer_read_only(buf, b, e);
2568
2569         coding_system = Fget_coding_system(coding_system);
2570         instream = make_lisp_buffer_input_stream(buf, b, e, 0);
2571         lb_outstream = make_lisp_buffer_output_stream(buf, b, 0);
2572         de_outstream = make_decoding_output_stream(XLSTREAM(lb_outstream),
2573                                                    coding_system);
2574         outstream = make_encoding_output_stream(XLSTREAM(de_outstream),
2575                                                 Fget_coding_system(Qbinary));
2576         istr = XLSTREAM(instream);
2577         ostr = XLSTREAM(outstream);
2578         GCPRO4(instream, lb_outstream, de_outstream, outstream);
2579
2580         /* The chain of streams looks like this:
2581
2582            [BUFFER] <----- send through
2583            ------> [ENCODE AS BINARY]
2584            ------> [DECODE AS SPECIFIED]
2585            ------> [BUFFER]
2586          */
2587
2588         while (1) {
2589                 char tempbuf[1024];     /* some random amount */
2590                 Bufpos newpos, even_newer_pos;
2591                 Bufpos oldpos = lisp_buffer_stream_startpos(istr);
2592                 Lstream_data_count size_in_bytes =
2593                     Lstream_read(istr, tempbuf, sizeof(tempbuf));
2594
2595                 if (!size_in_bytes)
2596                         break;
2597                 newpos = lisp_buffer_stream_startpos(istr);
2598                 Lstream_write(ostr, tempbuf, size_in_bytes);
2599                 even_newer_pos = lisp_buffer_stream_startpos(istr);
2600                 buffer_delete_range(buf, even_newer_pos - (newpos - oldpos),
2601                                     even_newer_pos, 0);
2602         }
2603         Lstream_close(istr);
2604         Lstream_close(ostr);
2605         UNGCPRO;
2606         Lstream_delete(istr);
2607         Lstream_delete(ostr);
2608         Lstream_delete(XLSTREAM(de_outstream));
2609         Lstream_delete(XLSTREAM(lb_outstream));
2610         return Qnil;
2611 }
2612 \f
2613 /************************************************************************/
2614 /*           Converting to an external encoding ("encoding")            */
2615 /************************************************************************/
2616
2617 /* An encoding stream is an output stream.  When you create the
2618    stream, you specify the coding system that governs the encoding
2619    and another stream that the resulting encoded data is to be
2620    sent to, and then start sending data to it. */
2621
2622 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2623
2624 typedef struct encoding_stream_s *encoding_stream_t;
2625 struct encoding_stream_s {
2626         /* Coding system that governs the conversion. */
2627         Lisp_Coding_System *codesys;
2628
2629         /* Stream that we read the encoded data from or
2630            write the decoded data to. */
2631         lstream_t other_end;
2632
2633         /* If we are reading, then we can return only a fixed amount of
2634            data, so if the conversion resulted in too much data, we store it
2635            here for retrieval the next time around. */
2636         unsigned_char_dynarr *runoff;
2637
2638         /* FLAGS holds flags indicating the current state of the encoding.
2639            Some of these flags are dependent on the coding system. */
2640         unsigned int flags;
2641
2642         /* CH holds a partially built-up character.  Since we only deal
2643            with one- and two-byte characters at the moment, we only use
2644            this to store the first byte of a two-byte character. */
2645         unsigned int ch;
2646 #ifdef MULE
2647         /* Additional information used by the ISO2022 encoder. */
2648         struct {
2649                 /* CHARSET holds the character sets currently assigned to the G0
2650                    through G3 registers.  It is initialized from the array
2651                    INITIAL_CHARSET in CODESYS. */
2652                 Lisp_Object charset[4];
2653
2654                 /* Which registers are currently invoked into the left (GL) and
2655                    right (GR) halves of the 8-bit encoding space? */
2656                 int register_left, register_right;
2657
2658                 /* Whether we need to explicitly designate the charset in the
2659                    G? register before using it.  It is initialized from the
2660                    array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2661                 unsigned char force_charset_on_output[4];
2662
2663                 /* Other state variables that need to be preserved across
2664                    invocations. */
2665                 Lisp_Object current_charset;
2666                 int current_half;
2667                 int current_char_boundary;
2668         } iso2022;
2669
2670         /* Additional information (the state of the running CCL program)
2671            used by the CCL encoder. */
2672         struct ccl_program ccl;
2673 #endif                          /* MULE */
2674 };
2675
2676 static Lstream_data_count
2677 encoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size);
2678 static Lstream_data_count
2679 encoding_writer(lstream_t stream,
2680                 const unsigned char *data, Lstream_data_count size);
2681 static int encoding_rewinder(lstream_t stream);
2682 static int encoding_seekable_p(lstream_t stream);
2683 static int encoding_flusher(lstream_t stream);
2684 static int encoding_closer(lstream_t stream);
2685
2686 static Lisp_Object encoding_marker(Lisp_Object stream);
2687
2688 DEFINE_LSTREAM_IMPLEMENTATION("encoding", lstream_encoding,
2689                               sizeof(struct encoding_stream_s));
2690
2691 static Lisp_Object
2692 encoding_marker(Lisp_Object stream)
2693 {
2694         lstream_t str = ENCODING_STREAM_DATA(XLSTREAM(stream))->other_end;
2695         Lisp_Object str_obj;
2696
2697         /* We do not need to mark the coding systems or charsets stored
2698            within the stream because they are stored in a global list
2699            and automatically marked. */
2700
2701         XSETLSTREAM(str_obj, str);
2702         mark_object(str_obj);
2703         if (str->imp->marker) {
2704                 return str->imp->marker(str_obj);
2705         } else {
2706                 return Qnil;
2707         }
2708 }
2709
2710 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2711    so we read data from the other end, encode it, and store it into DATA. */
2712
2713 static Lstream_data_count
2714 encoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size)
2715 {
2716         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2717         unsigned char *orig_data = data;
2718         Lstream_data_count read_size;
2719         int error_occurred = 0;
2720
2721         /* We need to interface to mule_encode(), which expects to take some
2722            amount of data and store the result into a Dynarr.  We have
2723            mule_encode() store into str->runoff, and take data from there
2724            as necessary. */
2725
2726         /* We loop until we have enough data, reading chunks from the other
2727            end and encoding it. */
2728         while (1) {
2729                 /* Take data from the runoff if we can.  Make sure to take at
2730                    most SIZE bytes, and delete the data from the runoff. */
2731                 if (Dynarr_length(str->runoff) > 0) {
2732                         int chunk = min((int)size, Dynarr_length(str->runoff));
2733                         memcpy(data, Dynarr_atp(str->runoff, 0), chunk);
2734                         Dynarr_delete_many(str->runoff, 0, chunk);
2735                         data += chunk;
2736                         size -= chunk;
2737                 }
2738
2739                 if (size == 0) {
2740                         /* No more room for data */
2741                         break;
2742                 }
2743
2744                 if (str->flags & CODING_STATE_END) {
2745                         /* This means that on the previous iteration, we hit the
2746                            EOF on the other end.  We loop once more so that
2747                            mule_encode() can output any final stuff it may be
2748                            holding, or any "go back to a sane state" escape
2749                            sequences. (This latter makes sense during
2750                            encoding.) */
2751                         break;
2752                 }
2753
2754                 /* Exhausted the runoff, so get some more.  DATA at least SIZE
2755                    bytes left of storage in it, so it's OK to read directly into
2756                    it.  (We'll be overwriting above, after we've encoded it into
2757                    the runoff.) */
2758                 read_size = Lstream_read(str->other_end, data, size);
2759                 if (read_size < 0) {
2760                         error_occurred = 1;
2761                         break;
2762                 }
2763                 if (read_size == 0) {
2764                         /* There might be some more end data produced in the
2765                            translation.  See the comment above. */
2766                         str->flags |= CODING_STATE_END;
2767                 }
2768                 mule_encode(stream, data, str->runoff, read_size);
2769         }
2770
2771         if (data == orig_data) {
2772                 return error_occurred ? -1 : 0;
2773         } else {
2774                 return data - orig_data;
2775         }
2776 }
2777
2778 static Lstream_data_count
2779 encoding_writer(lstream_t stream, const unsigned char *data,
2780                 Lstream_data_count size)
2781 {
2782         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2783         Lstream_data_count retval;
2784
2785         /* Encode all our data into the runoff, and then attempt to write
2786            it all out to the other end.  Remove whatever chunk we succeeded
2787            in writing. */
2788         mule_encode(stream, data, str->runoff, size);
2789         retval = Lstream_write(str->other_end, Dynarr_atp(str->runoff, 0),
2790                                Dynarr_length(str->runoff));
2791         if (retval > 0) {
2792                 Dynarr_delete_many(str->runoff, 0, retval);
2793         }
2794         /* Do NOT return retval.  The return value indicates how much
2795            of the incoming data was written, not how many bytes were
2796            written. */
2797         return size;
2798 }
2799
2800 static void
2801 reset_encoding_stream(encoding_stream_t str)
2802 {
2803 #ifdef MULE
2804         switch (CODING_SYSTEM_TYPE(str->codesys)) {
2805         case CODESYS_ISO2022: {
2806                 int i;
2807
2808                 for (i = 0; i < 4; i++) {
2809                         str->iso2022.charset[i] =
2810                                 CODING_SYSTEM_ISO2022_INITIAL_CHARSET(
2811                                         str->codesys, i);
2812                         str->iso2022.force_charset_on_output[i] =
2813                                 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT(
2814                                         str->codesys, i);
2815                 }
2816                 str->iso2022.register_left = 0;
2817                 str->iso2022.register_right = 1;
2818                 str->iso2022.current_charset = Qnil;
2819                 str->iso2022.current_half = 0;
2820                 str->iso2022.current_char_boundary = 1;
2821                 break;
2822         }
2823         case CODESYS_CCL:
2824                 setup_ccl_program(&str->ccl,
2825                                   CODING_SYSTEM_CCL_ENCODE(str->codesys));
2826                 break;
2827
2828                 /* list the rest of them lot explicitly */
2829         case CODESYS_AUTODETECT:
2830         case CODESYS_SHIFT_JIS:
2831         case CODESYS_BIG5:
2832         case CODESYS_UCS4:
2833         case CODESYS_UTF8:
2834         case CODESYS_NO_CONVERSION:
2835 #ifdef DEBUG_SXEMACS
2836         case CODESYS_INTERNAL:
2837 #endif
2838         default:
2839                 break;
2840         }
2841 #endif                          /* MULE */
2842
2843         str->flags = str->ch = 0;
2844 }
2845
2846 static int
2847 encoding_rewinder(lstream_t stream)
2848 {
2849         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2850         reset_encoding_stream(str);
2851         Dynarr_reset(str->runoff);
2852         return Lstream_rewind(str->other_end);
2853 }
2854
2855 static int
2856 encoding_seekable_p(lstream_t stream)
2857 {
2858         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2859         return Lstream_seekable_p(str->other_end);
2860 }
2861
2862 static int
2863 encoding_flusher(lstream_t stream)
2864 {
2865         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2866         return Lstream_flush(str->other_end);
2867 }
2868
2869 static int
2870 encoding_closer(lstream_t stream)
2871 {
2872         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2873         if (stream->flags & LSTREAM_FL_WRITE) {
2874                 str->flags |= CODING_STATE_END;
2875                 encoding_writer(stream, 0, 0);
2876         }
2877         Dynarr_free(str->runoff);
2878         return Lstream_close(str->other_end);
2879 }
2880
2881 Lisp_Object
2882 encoding_stream_coding_system(lstream_t stream)
2883 {
2884         Lisp_Object coding_system;
2885         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2886
2887         XSETCODING_SYSTEM(coding_system, str->codesys);
2888         return coding_system;
2889 }
2890
2891 void
2892 set_encoding_stream_coding_system(lstream_t lstr, Lisp_Object codesys)
2893 {
2894         Lisp_Coding_System *cs = XCODING_SYSTEM(codesys);
2895         encoding_stream_t str = ENCODING_STREAM_DATA(lstr);
2896         str->codesys = cs;
2897         reset_encoding_stream(str);
2898 }
2899
2900 static Lisp_Object
2901 make_encoding_stream_1(lstream_t stream, Lisp_Object codesys, const char *mode)
2902 {
2903         lstream_t lstr = Lstream_new(lstream_encoding, mode);
2904         encoding_stream_t str = ENCODING_STREAM_DATA(lstr);
2905         Lisp_Object obj;
2906
2907         xzero(*str);
2908         str->runoff = Dynarr_new(unsigned_char);
2909         str->other_end = stream;
2910         set_encoding_stream_coding_system(lstr, codesys);
2911         XSETLSTREAM(obj, lstr);
2912         return obj;
2913 }
2914
2915 Lisp_Object
2916 make_encoding_input_stream(lstream_t stream, Lisp_Object codesys)
2917 {
2918         return make_encoding_stream_1(stream, codesys, "r");
2919 }
2920
2921 Lisp_Object
2922 make_encoding_output_stream(lstream_t stream, Lisp_Object codesys)
2923 {
2924         return make_encoding_stream_1(stream, codesys, "w");
2925 }
2926
2927 /* Convert N bytes of internally-formatted data stored in SRC to an
2928    external format, according to the encoding stream ENCODING.
2929    Store the encoded data into DST. */
2930
2931 static void
2932 mule_encode(lstream_t encoding, const Bufbyte * src,
2933             unsigned_char_dynarr * dst, Lstream_data_count n)
2934 {
2935         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
2936
2937         switch (CODING_SYSTEM_TYPE(str->codesys)) {
2938 #ifdef DEBUG_SXEMACS
2939         case CODESYS_INTERNAL:
2940                 Dynarr_add_many(dst, src, n);
2941                 break;
2942 #endif
2943         case CODESYS_AUTODETECT:
2944                 /* If we got this far and still haven't decided on the coding
2945                    system, then do no conversion. */
2946         case CODESYS_NO_CONVERSION:
2947                 encode_coding_no_conversion(encoding, src, dst, n);
2948                 break;
2949 #ifdef MULE
2950         case CODESYS_SHIFT_JIS:
2951                 encode_coding_sjis(encoding, src, dst, n);
2952                 break;
2953         case CODESYS_BIG5:
2954                 encode_coding_big5(encoding, src, dst, n);
2955                 break;
2956         case CODESYS_UCS4:
2957                 encode_coding_ucs4(encoding, src, dst, n);
2958                 break;
2959         case CODESYS_UTF8:
2960                 encode_coding_utf8(encoding, src, dst, n);
2961                 break;
2962         case CODESYS_CCL:
2963                 str->ccl.last_block = str->flags & CODING_STATE_END;
2964                 /* When applying ccl program to stream, MUST NOT set NULL
2965                    pointer to src.  */
2966                 ccl_driver(&str->ccl, ((src) ? src : (unsigned char *)""),
2967                            dst, n, 0, CCL_MODE_ENCODING);
2968                 break;
2969         case CODESYS_ISO2022:
2970                 encode_coding_iso2022(encoding, src, dst, n);
2971                 break;
2972 #endif                          /* MULE */
2973         default:
2974                 abort();
2975         }
2976 }
2977
2978 DEFUN("encode-coding-region", Fencode_coding_region, 3, 4, 0,   /*
2979 Encode the text between START and END using CODING-SYSTEM.
2980 This will, for example, convert Japanese characters into stuff such as
2981 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
2982 text.  BUFFER defaults to the current buffer if unspecified.
2983 */
2984       (start, end, coding_system, buffer))
2985 {
2986         Bufpos b, e;
2987         struct buffer *buf = decode_buffer(buffer, 0);
2988         Lisp_Object instream, lb_outstream, de_outstream, outstream;
2989         lstream_t istr, ostr;
2990         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2991
2992         get_buffer_range_char(buf, start, end, &b, &e, 0);
2993
2994         barf_if_buffer_read_only(buf, b, e);
2995
2996         coding_system = Fget_coding_system(coding_system);
2997         instream = make_lisp_buffer_input_stream(buf, b, e, 0);
2998         lb_outstream = make_lisp_buffer_output_stream(buf, b, 0);
2999         de_outstream = make_decoding_output_stream(XLSTREAM(lb_outstream),
3000                                                    Fget_coding_system(Qbinary));
3001         outstream = make_encoding_output_stream(XLSTREAM(de_outstream),
3002                                                 coding_system);
3003         istr = XLSTREAM(instream);
3004         ostr = XLSTREAM(outstream);
3005         GCPRO4(instream, outstream, de_outstream, lb_outstream);
3006         /* The chain of streams looks like this:
3007
3008            [BUFFER] <----- send through
3009            ------> [ENCODE AS SPECIFIED]
3010            ------> [DECODE AS BINARY]
3011            ------> [BUFFER]
3012          */
3013         while (1) {
3014                 char tempbuf[1024];     /* some random amount */
3015                 Bufpos newpos, even_newer_pos;
3016                 Bufpos oldpos = lisp_buffer_stream_startpos(istr);
3017                 Lstream_data_count size_in_bytes =
3018                     Lstream_read(istr, tempbuf, sizeof(tempbuf));
3019
3020                 if (!size_in_bytes)
3021                         break;
3022                 newpos = lisp_buffer_stream_startpos(istr);
3023                 Lstream_write(ostr, tempbuf, size_in_bytes);
3024                 even_newer_pos = lisp_buffer_stream_startpos(istr);
3025                 buffer_delete_range(buf, even_newer_pos - (newpos - oldpos),
3026                                     even_newer_pos, 0);
3027         }
3028
3029         {
3030                 Charcount retlen =
3031                     lisp_buffer_stream_startpos(XLSTREAM(instream)) - b;
3032                 Lstream_close(istr);
3033                 Lstream_close(ostr);
3034                 UNGCPRO;
3035                 Lstream_delete(istr);
3036                 Lstream_delete(ostr);
3037                 Lstream_delete(XLSTREAM(de_outstream));
3038                 Lstream_delete(XLSTREAM(lb_outstream));
3039                 return make_int(retlen);
3040         }
3041 }
3042
3043 #ifdef MULE
3044 \f
3045 /************************************************************************/
3046 /*                          Shift-JIS methods                           */
3047 /************************************************************************/
3048
3049 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3050    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3051    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3052    encoded by "position-code + 0x80".  A character of JISX0208
3053    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3054    position-codes are divided and shifted so that it fit in the range
3055    below.
3056
3057    --- CODE RANGE of Shift-JIS ---
3058    (character set)      (range)
3059    ASCII                0x00 .. 0x7F
3060    JISX0201-Kana        0xA0 .. 0xDF
3061    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
3062             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3063    -------------------------------
3064
3065 */
3066
3067 /* Is this the first byte of a Shift-JIS two-byte char? */
3068
3069 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3070   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3071
3072 /* Is this the second byte of a Shift-JIS two-byte char? */
3073
3074 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3075   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3076
3077 #define BYTE_SJIS_KATAKANA_P(c) \
3078   ((c) >= 0xA1 && (c) <= 0xDF)
3079
3080 static int
3081 detect_coding_sjis(struct detection_state *st, const Extbyte * src,
3082                    Lstream_data_count n)
3083 {
3084         while (n--) {
3085                 const unsigned char c = *(const unsigned char *)src++;
3086                 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3087                         return 0;
3088                 if (st->shift_jis.in_second_byte) {
3089                         st->shift_jis.in_second_byte = 0;
3090                         if (c < 0x40)
3091                                 return 0;
3092                 } else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3093                         st->shift_jis.in_second_byte = 1;
3094         }
3095         return CODING_CATEGORY_SHIFT_JIS_MASK;
3096 }
3097
3098 /* Convert Shift-JIS data to internal format. */
3099
3100 static void
3101 decode_coding_sjis(lstream_t decoding, const Extbyte * src,
3102                    unsigned_char_dynarr * dst, Lstream_data_count n)
3103 {
3104         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3105         unsigned int flags = str->flags;
3106         unsigned int ch = str->ch;
3107         eol_type_t eol_type = str->eol_type;
3108
3109         while (n--) {
3110                 const unsigned char c = *(const unsigned char *)src++;
3111
3112                 if (ch) {
3113                         /* Previous character was first byte of Shift-JIS Kanji
3114                            char. */
3115                         if (BYTE_SJIS_TWO_BYTE_2_P(c)) {
3116                                 unsigned char e1, e2;
3117
3118                                 Dynarr_add(dst, LEADING_BYTE_JAPANESE_JISX0208);
3119                                 DECODE_SJIS(ch, c, e1, e2);
3120                                 Dynarr_add(dst, e1);
3121                                 Dynarr_add(dst, e2);
3122                         } else {
3123                                 DECODE_ADD_BINARY_CHAR(ch, dst);
3124                                 DECODE_ADD_BINARY_CHAR(c, dst);
3125                         }
3126                         ch = 0;
3127                 } else {
3128                         DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3129                         if (BYTE_SJIS_TWO_BYTE_1_P(c))
3130                                 ch = c;
3131                         else if (BYTE_SJIS_KATAKANA_P(c)) {
3132                                 Dynarr_add(dst, LEADING_BYTE_KATAKANA_JISX0201);
3133                                 Dynarr_add(dst, c);
3134                         } else
3135                                 DECODE_ADD_BINARY_CHAR(c, dst);
3136                 }
3137               label_continue_loop:;
3138         }
3139
3140         DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
3141
3142         str->flags = flags;
3143         str->ch = ch;
3144 }
3145
3146 /* Convert internally-formatted data to Shift-JIS. */
3147
3148 static void
3149 encode_coding_sjis(lstream_t encoding, const Bufbyte * src,
3150                    unsigned_char_dynarr * dst, Lstream_data_count n)
3151 {
3152         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3153         unsigned int flags = str->flags;
3154         unsigned int ch = str->ch;
3155         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3156
3157         while (n--) {
3158                 Bufbyte c = *src++;
3159                 if (c == '\n') {
3160                         if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3161                                 Dynarr_add(dst, '\r');
3162                         if (eol_type != EOL_CR)
3163                                 Dynarr_add(dst, '\n');
3164                         ch = 0;
3165                 } else if (BYTE_ASCII_P(c)) {
3166                         Dynarr_add(dst, c);
3167                         ch = 0;
3168                 } else if (BUFBYTE_LEADING_BYTE_P(c))
3169                         ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
3170                               c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3171                               c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
3172                 else if (ch) {
3173                         if (ch == LEADING_BYTE_KATAKANA_JISX0201) {
3174                                 Dynarr_add(dst, c);
3175                                 ch = 0;
3176                         } else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3177                                    ch == LEADING_BYTE_JAPANESE_JISX0208)
3178                                 ch = c;
3179                         else {
3180                                 /* j1 is bessel j1 function,
3181                                  * so we use something else */
3182                                 /* unsigned char j1, j2; */
3183                                 unsigned char tt1, tt2;
3184
3185                                 ENCODE_SJIS(ch, c, tt1, tt2);
3186                                 Dynarr_add(dst, tt1);
3187                                 Dynarr_add(dst, tt2);
3188                                 ch = 0;
3189                         }
3190                 }
3191         }
3192
3193         str->flags = flags;
3194         str->ch = ch;
3195 }
3196
3197 DEFUN("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3198 Decode a JISX0208 character of Shift-JIS coding-system.
3199 CODE is the character code in Shift-JIS as a cons of type bytes.
3200 Return the corresponding character.
3201 */
3202       (code))
3203 {
3204         unsigned char c1, c2, s1, s2;
3205
3206         CHECK_CONS(code);
3207         CHECK_INT(XCAR(code));
3208         CHECK_INT(XCDR(code));
3209         s1 = XINT(XCAR(code));
3210         s2 = XINT(XCDR(code));
3211         if (BYTE_SJIS_TWO_BYTE_1_P(s1) && BYTE_SJIS_TWO_BYTE_2_P(s2)) {
3212                 DECODE_SJIS(s1, s2, c1, c2);
3213                 return make_char(MAKE_CHAR(Vcharset_japanese_jisx0208,
3214                                            c1 & 0x7F, c2 & 0x7F));
3215         } else
3216                 return Qnil;
3217 }
3218
3219 DEFUN("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3220 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3221 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3222 */
3223       (character))
3224 {
3225         Lisp_Object charset;
3226         int c1, c2, s1, s2;
3227
3228         CHECK_CHAR_COERCE_INT(character);
3229         BREAKUP_CHAR(XCHAR(character), charset, c1, c2);
3230         if (EQ(charset, Vcharset_japanese_jisx0208)) {
3231                 ENCODE_SJIS(c1 | 0x80, c2 | 0x80, s1, s2);
3232                 return Fcons(make_int(s1), make_int(s2));
3233         } else
3234                 return Qnil;
3235 }
3236 \f
3237 /************************************************************************/
3238 /*                            Big5 methods                              */
3239 /************************************************************************/
3240
3241 /* BIG5 is a coding system encoding two character sets: ASCII and
3242    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3243    character set and is encoded in two-byte.
3244
3245    --- CODE RANGE of BIG5 ---
3246    (character set)      (range)
3247    ASCII                0x00 .. 0x7F
3248    Big5 (1st byte)      0xA1 .. 0xFE
3249         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3250    --------------------------
3251
3252    Since the number of characters in Big5 is larger than maximum
3253    characters in Emacs' charset (96x96), it can't be handled as one
3254    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
3255    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
3256    contains frequently used characters and the latter contains less
3257    frequently used characters.  */
3258
3259 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3260   ((c) >= 0xA1 && (c) <= 0xFE)
3261
3262 /* Is this the second byte of a Shift-JIS two-byte char? */
3263
3264 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3265   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3266
3267 /* Number of Big5 characters which have the same code in 1st byte.  */
3268
3269 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3270
3271 /* Code conversion macros.  These are macros because they are used in
3272    inner loops during code conversion.
3273
3274    Note that temporary variables in macros introduce the classic
3275    dynamic-scoping problems with variable names.  We use capital-
3276    lettered variables in the assumption that SXEmacs does not use
3277    capital letters in variables except in a very formalized way
3278    (e.g. Qstring). */
3279
3280 /* Convert Big5 code (b1, b2) into its internal string representation
3281    (lb, c1, c2). */
3282
3283 /* There is a much simpler way to split the Big5 charset into two.
3284    For the moment I'm going to leave the algorithm as-is because it
3285    claims to separate out the most-used characters into a single
3286    charset, which perhaps will lead to optimizations in various
3287    places.
3288
3289    The way the algorithm works is something like this:
3290
3291    Big5 can be viewed as a 94x157 charset, where the row is
3292    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3293    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
3294    the split between low and high column numbers is apparently
3295    meaningless; ascending rows produce less and less frequent chars.
3296    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3297    the first charset, and the upper half (0xC9 .. 0xFE) to the
3298    second.  To do the conversion, we convert the character into
3299    a single number where 0 .. 156 is the first row, 157 .. 313
3300    is the second, etc.  That way, the characters are ordered by
3301    decreasing frequency.  Then we just chop the space in two
3302    and coerce the result into a 94x94 space.
3303    */
3304
3305 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
3306 {                                                                       \
3307   int B1 = b1, B2 = b2;                                                 \
3308   unsigned int I                                                        \
3309     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
3310                                                                         \
3311   if (B1 < 0xC9)                                                        \
3312     {                                                                   \
3313       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
3314     }                                                                   \
3315   else                                                                  \
3316     {                                                                   \
3317       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
3318       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
3319     }                                                                   \
3320   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
3321   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
3322 } while (0)
3323
3324 /* Convert the internal string representation of a Big5 character
3325    (lb, c1, c2) into Big5 code (b1, b2). */
3326
3327 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
3328 {                                                                       \
3329   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
3330                                                                         \
3331   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
3332     {                                                                   \
3333       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
3334     }                                                                   \
3335   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
3336   b2 = I % BIG5_SAME_ROW;                                               \
3337   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
3338 } while (0)
3339
3340 static int
3341 detect_coding_big5(struct detection_state *st, const Extbyte * src,
3342                    Lstream_data_count n)
3343 {
3344         while (n--) {
3345                 const unsigned char c = *(const unsigned char *)src++;
3346                 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3347                     (c >= 0x80 && c <= 0xA0))
3348                         return 0;
3349                 if (st->big5.in_second_byte) {
3350                         st->big5.in_second_byte = 0;
3351                         if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3352                                 return 0;
3353                 } else if (c >= 0xA1)
3354                         st->big5.in_second_byte = 1;
3355         }
3356         return CODING_CATEGORY_BIG5_MASK;
3357 }
3358
3359 /* Convert Big5 data to internal format. */
3360
3361 static void
3362 decode_coding_big5(lstream_t decoding, const Extbyte * src,
3363                    unsigned_char_dynarr * dst, Lstream_data_count n)
3364 {
3365         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3366         unsigned int flags = str->flags;
3367         unsigned int ch = str->ch;
3368         eol_type_t eol_type = str->eol_type;
3369
3370         while (n--) {
3371                 const unsigned char c = *(const unsigned char *)src++;
3372                 if (ch) {
3373                         /* Previous character was first byte of Big5 char. */
3374                         if (BYTE_BIG5_TWO_BYTE_2_P(c)) {
3375                                 unsigned char b1, b2, b3;
3376                                 DECODE_BIG5(ch, c, b1, b2, b3);
3377                                 Dynarr_add(dst, b1);
3378                                 Dynarr_add(dst, b2);
3379                                 Dynarr_add(dst, b3);
3380                         } else {
3381                                 DECODE_ADD_BINARY_CHAR(ch, dst);
3382                                 DECODE_ADD_BINARY_CHAR(c, dst);
3383                         }
3384                         ch = 0;
3385                 } else {
3386                         DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3387                         if (BYTE_BIG5_TWO_BYTE_1_P(c))
3388                                 ch = c;
3389                         else
3390                                 DECODE_ADD_BINARY_CHAR(c, dst);
3391                 }
3392               label_continue_loop:;
3393         }
3394
3395         DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
3396
3397         str->flags = flags;
3398         str->ch = ch;
3399 }
3400
3401 /* Convert internally-formatted data to Big5. */
3402
3403 static void
3404 encode_coding_big5(lstream_t encoding, const Bufbyte * src,
3405                    unsigned_char_dynarr * dst, Lstream_data_count n)
3406 {
3407         unsigned char c;
3408         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3409         unsigned int flags = str->flags;
3410         unsigned int ch = str->ch;
3411         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3412
3413         while (n--) {
3414                 c = *src++;
3415                 if (c == '\n') {
3416                         if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3417                                 Dynarr_add(dst, '\r');
3418                         if (eol_type != EOL_CR)
3419                                 Dynarr_add(dst, '\n');
3420                 } else if (BYTE_ASCII_P(c)) {
3421                         /* ASCII. */
3422                         Dynarr_add(dst, c);
3423                 } else if (BUFBYTE_LEADING_BYTE_P(c)) {
3424                         if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3425                             c == LEADING_BYTE_CHINESE_BIG5_2) {
3426                                 /* A recognized leading byte. */
3427                                 ch = c;
3428                                 continue;       /* not done with this character. */
3429                         }
3430                         /* otherwise just ignore this character. */
3431                 } else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3432                            ch == LEADING_BYTE_CHINESE_BIG5_2) {
3433                         /* Previous char was a recognized leading byte. */
3434                         ch = (ch << 8) | c;
3435                         continue;       /* not done with this character. */
3436                 } else if (ch) {
3437                         /* Encountering second byte of a Big5 character. */
3438                         unsigned char b1, b2;
3439
3440                         ENCODE_BIG5(ch >> 8, ch & 0xFF, c, b1, b2);
3441                         Dynarr_add(dst, b1);
3442                         Dynarr_add(dst, b2);
3443                 }
3444
3445                 ch = 0;
3446         }
3447
3448         str->flags = flags;
3449         str->ch = ch;
3450 }
3451
3452 DEFUN("decode-big5-char", Fdecode_big5_char, 1, 1, 0,   /*
3453 Decode a Big5 character CODE of BIG5 coding-system.
3454 CODE is the character code in BIG5, a cons of two integers.
3455 Return the corresponding character.
3456 */
3457       (code))
3458 {
3459         unsigned char c1, c2, b1, b2;
3460
3461         CHECK_CONS(code);
3462         CHECK_INT(XCAR(code));
3463         CHECK_INT(XCDR(code));
3464         b1 = XINT(XCAR(code));
3465         b2 = XINT(XCDR(code));
3466         if (BYTE_BIG5_TWO_BYTE_1_P(b1) && BYTE_BIG5_TWO_BYTE_2_P(b2)) {
3467                 int leading_byte;
3468                 Lisp_Object charset;
3469                 DECODE_BIG5(b1, b2, leading_byte, c1, c2);
3470                 charset = CHARSET_BY_LEADING_BYTE(leading_byte);
3471                 return make_char(MAKE_CHAR(charset, c1 & 0x7F, c2 & 0x7F));
3472         } else
3473                 return Qnil;
3474 }
3475
3476 DEFUN("encode-big5-char", Fencode_big5_char, 1, 1, 0,   /*
3477 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3478 Return the corresponding character code in Big5.
3479 */
3480       (character))
3481 {
3482         Lisp_Object charset;
3483         int c1, c2, b1, b2;
3484
3485         CHECK_CHAR_COERCE_INT(character);
3486         BREAKUP_CHAR(XCHAR(character), charset, c1, c2);
3487         if (EQ(charset, Vcharset_chinese_big5_1) ||
3488             EQ(charset, Vcharset_chinese_big5_2)) {
3489                 ENCODE_BIG5(XCHARSET_LEADING_BYTE(charset), c1 | 0x80,
3490                             c2 | 0x80, b1, b2);
3491                 return Fcons(make_int(b1), make_int(b2));
3492         } else
3493                 return Qnil;
3494 }
3495 \f
3496 /************************************************************************/
3497 /*                           UCS-4 methods                              */
3498 /*                                                                      */
3499 /*  UCS-4 character codes are implemented as nonnegative integers.      */
3500 /*                                                                      */
3501 /************************************************************************/
3502
3503 DEFUN("set-ucs-char", Fset_ucs_char, 2, 2, 0,   /*
3504 Map UCS-4 code CODE to Mule character CHARACTER.
3505
3506 Return T on success, NIL on failure.
3507 */
3508       (code, character))
3509 {
3510         size_t c;
3511
3512         CHECK_CHAR(character);
3513         CHECK_NATNUM(code);
3514         c = XINT(code);
3515
3516         if (c < countof(fcd->ucs_to_mule_table)) {
3517                 fcd->ucs_to_mule_table[c] = character;
3518                 return Qt;
3519         } else
3520                 return Qnil;
3521 }
3522
3523 static Lisp_Object ucs_to_char(unsigned long code)
3524 {
3525         if (code < countof(fcd->ucs_to_mule_table)) {
3526                 return fcd->ucs_to_mule_table[code];
3527         } else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14)) {
3528                 unsigned int c;
3529
3530                 code -= 0xe00000;
3531                 c = code % (94 * 94);
3532                 return make_char
3533                     (MAKE_CHAR(CHARSET_BY_ATTRIBUTES
3534                                (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3535                                 CHARSET_LEFT_TO_RIGHT),
3536                                c / 94 + 33, c % 94 + 33));
3537         } else
3538                 return Qnil;
3539 }
3540
3541 DEFUN("ucs-char", Fucs_char, 1, 1, 0,   /*
3542 Return Mule character corresponding to UCS code CODE (a positive integer).
3543 */
3544       (code))
3545 {
3546         CHECK_NATNUM(code);
3547         return ucs_to_char(XINT(code));
3548 }
3549
3550 DEFUN("set-char-ucs", Fset_char_ucs, 2, 2, 0,   /*
3551 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3552 */
3553       (character, code))
3554 {
3555         /* #### Isn't this gilding the lily?  Fput_char_table checks its args.
3556            Fset_char_ucs is more restrictive on index arg, but should
3557            check code arg in a char_table method. */
3558         CHECK_CHAR(character);
3559         CHECK_NATNUM(code);
3560         return Fput_char_table(character, code, mule_to_ucs_table);
3561 }
3562
3563 DEFUN("char-ucs", Fchar_ucs, 1, 1, 0,   /*
3564 Return the UCS code (a positive integer) corresponding to CHARACTER.
3565 */
3566       (character))
3567 {
3568         return Fget_char_table(character, mule_to_ucs_table);
3569 }
3570
3571 /* Decode a UCS-4 character into a buffer.  If the lookup fails, use
3572    <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3573    is not found, instead.
3574    #### do something more appropriate (use blob?)
3575         Danger, Will Robinson!  Data loss.  Should we signal user? */
3576 static void decode_ucs4(unsigned long ch, unsigned_char_dynarr * dst)
3577 {
3578         Lisp_Object chr = ucs_to_char(ch);
3579
3580         if (!NILP(chr)) {
3581                 Bufbyte work[MAX_EMCHAR_LEN];
3582                 int len;
3583
3584                 ch = XCHAR(chr);
3585                 len = (ch < 128) ?
3586                     simple_set_charptr_emchar(work, ch) :
3587                     non_ascii_set_charptr_emchar(work, ch);
3588                 Dynarr_add_many(dst, work, len);
3589         } else {
3590                 Dynarr_add(dst, LEADING_BYTE_JAPANESE_JISX0208);
3591                 Dynarr_add(dst, 34 + 128);
3592                 Dynarr_add(dst, 46 + 128);
3593         }
3594 }
3595
3596 static unsigned long
3597 mule_char_to_ucs4(Lisp_Object charset, unsigned char h, unsigned char l)
3598 {
3599         Lisp_Object code
3600             = Fget_char_table(make_char(MAKE_CHAR(charset, h & 127, l & 127)),
3601                               mule_to_ucs_table);
3602
3603         if (INTP(code)) {
3604                 return XINT(code);
3605         } else if ((XCHARSET_DIMENSION(charset) == 2) &&
3606                    (XCHARSET_CHARS(charset) == 94)) {
3607                 unsigned char final = XCHARSET_FINAL(charset);
3608
3609                 if (('@' <= final) && (final < 0x7f)) {
3610                         return 0xe00000 + (final - '@') * 94 * 94
3611                             + ((h & 127) - 33) * 94 + (l & 127) - 33;
3612                 } else {
3613                         return '?';
3614                 }
3615         } else {
3616                 return '?';
3617         }
3618 }
3619
3620 static void
3621 encode_ucs4(Lisp_Object charset,
3622             unsigned char h, unsigned char l, unsigned_char_dynarr * dst)
3623 {
3624         unsigned long code = mule_char_to_ucs4(charset, h, l);
3625         Dynarr_add(dst, code >> 24);
3626         Dynarr_add(dst, (code >> 16) & 255);
3627         Dynarr_add(dst, (code >> 8) & 255);
3628         Dynarr_add(dst, code & 255);
3629 }
3630
3631 static int
3632 detect_coding_ucs4(struct detection_state *st, const Extbyte * src,
3633                    Lstream_data_count n)
3634 {
3635         while (n--) {
3636                 const unsigned char c = *(const unsigned char *)src++;
3637                 switch (st->ucs4.in_byte) {
3638                 case 0:
3639                         if (c >= 128)
3640                                 return 0;
3641                         else
3642                                 st->ucs4.in_byte++;
3643                         break;
3644                 case 3:
3645                         st->ucs4.in_byte = 0;
3646                         break;
3647                 default:
3648                         st->ucs4.in_byte++;
3649                 }
3650         }
3651         return CODING_CATEGORY_UCS4_MASK;
3652 }
3653
3654 static void
3655 decode_coding_ucs4(lstream_t decoding, const Extbyte * src,
3656                    unsigned_char_dynarr * dst, Lstream_data_count n)
3657 {
3658         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3659         unsigned int flags = str->flags;
3660         unsigned int ch = str->ch;
3661         unsigned char counter = str->counter;
3662
3663         while (n--) {
3664                 const unsigned char c = *(const unsigned char *)src++;
3665                 switch (counter) {
3666                 case 0:
3667                         ch = c;
3668                         counter = 3;
3669                         break;
3670                 case 1:
3671                         decode_ucs4((ch << 8) | c, dst);
3672                         ch = 0;
3673                         counter = 0;
3674                         break;
3675                 default:
3676                         ch = (ch << 8) | c;
3677                         counter--;
3678                 }
3679         }
3680         if (counter & CODING_STATE_END)
3681                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
3682
3683         str->flags = flags;
3684         str->ch = ch;
3685         str->counter = counter;
3686 }
3687
3688 static void
3689 encode_coding_ucs4(lstream_t encoding, const Bufbyte * src,
3690                    unsigned_char_dynarr * dst, Lstream_data_count n)
3691 {
3692         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3693         unsigned int flags = str->flags;
3694         unsigned int ch = str->ch;
3695         unsigned char char_boundary = str->iso2022.current_char_boundary;
3696         Lisp_Object charset = str->iso2022.current_charset;
3697
3698 #ifdef ENABLE_COMPOSITE_CHARS
3699         /* flags for handling composite chars.  We do a little switcharoo
3700            on the source while we're outputting the composite char. */
3701         unsigned int saved_n = 0;
3702         const unsigned char *saved_src = NULL;
3703         int in_composite = 0;
3704
3705       back_to_square_n:
3706 #endif
3707
3708         while (n--) {
3709                 unsigned char c = *src++;
3710
3711                 if (BYTE_ASCII_P(c)) {  /* Processing ASCII character */
3712                         ch = 0;
3713                         encode_ucs4(Vcharset_ascii, c, 0, dst);
3714                         char_boundary = 1;
3715                 } else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) {   /* Processing Leading Byte */
3716                         ch = 0;
3717                         charset = CHARSET_BY_LEADING_BYTE(c);
3718                         if (LEADING_BYTE_PREFIX_P(c))
3719                                 ch = c;
3720                         char_boundary = 0;
3721                 } else {        /* Processing Non-ASCII character */
3722                         char_boundary = 1;
3723                         if (EQ(charset, Vcharset_control_1)) {
3724                                 encode_ucs4(Vcharset_control_1, c, 0, dst);
3725                         } else {
3726                                 switch (XCHARSET_REP_BYTES(charset)) {
3727                                 case 2:
3728                                         encode_ucs4(charset, c, 0, dst);
3729                                         break;
3730                                 case 3:
3731                                         if (XCHARSET_PRIVATE_P(charset)) {
3732                                                 encode_ucs4(charset, c, 0, dst);
3733                                                 ch = 0;
3734                                         } else if (ch) {
3735 #ifdef ENABLE_COMPOSITE_CHARS
3736                                                 if (EQ
3737                                                     (charset,
3738                                                      Vcharset_composite)) {
3739                                                         if (in_composite) {
3740                                                                 /* #### Bother! We don't know how to
3741                                                                    handle this yet. */
3742                                                                 Dynarr_add(dst,
3743                                                                            '\0');
3744                                                                 Dynarr_add(dst,
3745                                                                            '\0');
3746                                                                 Dynarr_add(dst,
3747                                                                            '\0');
3748                                                                 Dynarr_add(dst,
3749                                                                            '~');
3750                                                         } else {
3751                                                                 Emchar emch =
3752                                                                     MAKE_CHAR
3753                                                                     (Vcharset_composite,
3754                                                                      ch & 0x7F,
3755                                                                      c & 0x7F);
3756                                                                 Lisp_Object lstr
3757                                                                     =
3758                                                                     composite_char_string
3759                                                                     (emch);
3760                                                                 saved_n = n;
3761                                                                 saved_src = src;
3762                                                                 in_composite =
3763                                                                     1;
3764                                                                 src =
3765                                                                     XSTRING_DATA
3766                                                                     (lstr);
3767                                                                 n = XSTRING_LENGTH(lstr);
3768                                                         }
3769                                                 } else
3770 #endif                          /* ENABLE_COMPOSITE_CHARS */
3771                                                 {
3772                                                         encode_ucs4(charset, ch,
3773                                                                     c, dst);
3774                                                 }
3775                                                 ch = 0;
3776                                         } else {
3777                                                 ch = c;
3778                                                 char_boundary = 0;
3779                                         }
3780                                         break;
3781                                 case 4:
3782                                         if (ch) {
3783                                                 encode_ucs4(charset, ch, c,
3784                                                             dst);
3785                                                 ch = 0;
3786                                         } else {
3787                                                 ch = c;
3788                                                 char_boundary = 0;
3789                                         }
3790                                         break;
3791                                 default:
3792                                         abort();
3793                                 }
3794                         }
3795                 }
3796         }
3797
3798 #ifdef ENABLE_COMPOSITE_CHARS
3799         if (in_composite) {
3800                 n = saved_n;
3801                 src = saved_src;
3802                 in_composite = 0;
3803                 goto back_to_square_n;  /* Wheeeeeeeee ..... */
3804         }
3805 #endif                          /* ENABLE_COMPOSITE_CHARS */
3806
3807         str->flags = flags;
3808         str->ch = ch;
3809         str->iso2022.current_char_boundary = char_boundary;
3810         str->iso2022.current_charset = charset;
3811
3812         /* Verbum caro factum est! */
3813 }
3814 \f
3815 /************************************************************************/
3816 /*                           UTF-8 methods                              */
3817 /************************************************************************/
3818
3819 static int
3820 detect_coding_utf8(struct detection_state *st, const Extbyte * src,
3821                    Lstream_data_count n)
3822 {
3823         while (n--) {
3824                 const unsigned char c = *(const unsigned char *)src++;
3825                 switch (st->utf8.in_byte) {
3826                 case 0:
3827                         if (c == ISO_CODE_ESC || c == ISO_CODE_SI
3828                             || c == ISO_CODE_SO)
3829                                 return 0;
3830                         else if (c >= 0xfc)
3831                                 st->utf8.in_byte = 5;
3832                         else if (c >= 0xf8)
3833                                 st->utf8.in_byte = 4;
3834                         else if (c >= 0xf0)
3835                                 st->utf8.in_byte = 3;
3836                         else if (c >= 0xe0)
3837                                 st->utf8.in_byte = 2;
3838                         else if (c >= 0xc0)
3839                                 st->utf8.in_byte = 1;
3840                         else if (c >= 0x80)
3841                                 return 0;
3842                         break;
3843                 default:
3844                         if ((c & 0xc0) != 0x80)
3845                                 return 0;
3846                         else
3847                                 st->utf8.in_byte--;
3848                 }
3849         }
3850         return CODING_CATEGORY_UTF8_MASK;
3851 }
3852
3853 static void
3854 decode_coding_utf8(lstream_t decoding, const Extbyte * src,
3855                    unsigned_char_dynarr * dst, Lstream_data_count n)
3856 {
3857         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3858         unsigned int flags = str->flags;
3859         unsigned int ch = str->ch;
3860         eol_type_t eol_type = str->eol_type;
3861         unsigned char counter = str->counter;
3862
3863         while (n--) {
3864                 const unsigned char c = *(const unsigned char *)src++;
3865                 switch (counter) {
3866                 case 0:
3867                         if (c >= 0xfc) {
3868                                 ch = c & 0x01;
3869                                 counter = 5;
3870                         } else if (c >= 0xf8) {
3871                                 ch = c & 0x03;
3872                                 counter = 4;
3873                         } else if (c >= 0xf0) {
3874                                 ch = c & 0x07;
3875                                 counter = 3;
3876                         } else if (c >= 0xe0) {
3877                                 ch = c & 0x0f;
3878                                 counter = 2;
3879                         } else if (c >= 0xc0) {
3880                                 ch = c & 0x1f;
3881                                 counter = 1;
3882                         } else {
3883                                 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3884                                 decode_ucs4(c, dst);
3885                         }
3886                         break;
3887                 case 1:
3888                         ch = (ch << 6) | (c & 0x3f);
3889                         decode_ucs4(ch, dst);
3890                         ch = 0;
3891                         counter = 0;
3892                         break;
3893                 default:
3894                         ch = (ch << 6) | (c & 0x3f);
3895                         counter--;
3896                 }
3897               label_continue_loop:;
3898         }
3899
3900         if (flags & CODING_STATE_END)
3901                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
3902
3903         str->flags = flags;
3904         str->ch = ch;
3905         str->counter = counter;
3906 }
3907
3908 static void
3909 encode_utf8(Lisp_Object charset,
3910             unsigned char h, unsigned char l, unsigned_char_dynarr * dst)
3911 {
3912         unsigned long code = mule_char_to_ucs4(charset, h, l);
3913         if (code <= 0x7f) {
3914                 Dynarr_add(dst, code);
3915         } else if (code <= 0x7ff) {
3916                 Dynarr_add(dst, (code >> 6) | 0xc0);
3917                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3918         } else if (code <= 0xffff) {
3919                 Dynarr_add(dst, (code >> 12) | 0xe0);
3920                 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3921                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3922         } else if (code <= 0x1fffff) {
3923                 Dynarr_add(dst, (code >> 18) | 0xf0);
3924                 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3925                 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3926                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3927         } else if (code <= 0x3ffffff) {
3928                 Dynarr_add(dst, (code >> 24) | 0xf8);
3929                 Dynarr_add(dst, ((code >> 18) & 0x3f) | 0x80);
3930                 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3931                 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3932                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3933         } else {
3934                 Dynarr_add(dst, (code >> 30) | 0xfc);
3935                 Dynarr_add(dst, ((code >> 24) & 0x3f) | 0x80);
3936                 Dynarr_add(dst, ((code >> 18) & 0x3f) | 0x80);
3937                 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3938                 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3939                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3940         }
3941 }
3942
3943 static void
3944 encode_coding_utf8(lstream_t encoding, const Bufbyte * src,
3945                    unsigned_char_dynarr * dst, Lstream_data_count n)
3946 {
3947         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3948         unsigned int flags = str->flags;
3949         unsigned int ch = str->ch;
3950         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3951         unsigned char char_boundary = str->iso2022.current_char_boundary;
3952         Lisp_Object charset = str->iso2022.current_charset;
3953
3954 #ifdef ENABLE_COMPOSITE_CHARS
3955         /* flags for handling composite chars.  We do a little switcharoo
3956            on the source while we're outputting the composite char. */
3957         unsigned int saved_n = 0;
3958         const unsigned char *saved_src = NULL;
3959         int in_composite = 0;
3960
3961       back_to_square_n:
3962 #endif                          /* ENABLE_COMPOSITE_CHARS */
3963
3964         while (n--) {
3965                 unsigned char c = *src++;
3966
3967                 if (BYTE_ASCII_P(c)) {  /* Processing ASCII character */
3968                         ch = 0;
3969                         if (c == '\n') {
3970                                 if (eol_type != EOL_LF
3971                                     && eol_type != EOL_AUTODETECT)
3972                                         Dynarr_add(dst, '\r');
3973                                 if (eol_type != EOL_CR)
3974                                         Dynarr_add(dst, c);
3975                         } else
3976                                 encode_utf8(Vcharset_ascii, c, 0, dst);
3977                         char_boundary = 1;
3978                 } else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) {   /* Processing Leading Byte */
3979                         ch = 0;
3980                         charset = CHARSET_BY_LEADING_BYTE(c);
3981                         if (LEADING_BYTE_PREFIX_P(c))
3982                                 ch = c;
3983                         char_boundary = 0;
3984                 } else {        /* Processing Non-ASCII character */
3985                         char_boundary = 1;
3986                         if (EQ(charset, Vcharset_control_1)) {
3987                                 encode_utf8(Vcharset_control_1, c, 0, dst);
3988                         } else {
3989                                 switch (XCHARSET_REP_BYTES(charset)) {
3990                                 case 2:
3991                                         encode_utf8(charset, c, 0, dst);
3992                                         break;
3993                                 case 3:
3994                                         if (XCHARSET_PRIVATE_P(charset)) {
3995                                                 encode_utf8(charset, c, 0, dst);
3996                                                 ch = 0;
3997                                         } else if (ch) {
3998 #ifdef ENABLE_COMPOSITE_CHARS
3999                                                 if (EQ
4000                                                     (charset,
4001                                                      Vcharset_composite)) {
4002                                                         if (in_composite) {
4003                                                                 /* #### Bother! We don't know how to
4004                                                                    handle this yet. */
4005                                                                 encode_utf8
4006                                                                     (Vcharset_ascii,
4007                                                                      '~', 0,
4008                                                                      dst);
4009                                                         } else {
4010                                                                 Emchar emch =
4011                                                                     MAKE_CHAR
4012                                                                     (Vcharset_composite,
4013                                                                      ch & 0x7F,
4014                                                                      c & 0x7F);
4015                                                                 Lisp_Object lstr
4016                                                                     =
4017                                                                     composite_char_string
4018                                                                     (emch);
4019                                                                 saved_n = n;
4020                                                                 saved_src = src;
4021                                                                 in_composite =
4022                                                                     1;
4023                                                                 src =
4024                                                                     XSTRING_DATA
4025                                                                     (lstr);
4026                                                                 n = XSTRING_LENGTH(lstr);
4027                                                         }
4028                                                 } else
4029 #endif                          /* ENABLE_COMPOSITE_CHARS */
4030                                                 {
4031                                                         encode_utf8(charset, ch,
4032                                                                     c, dst);
4033                                                 }
4034                                                 ch = 0;
4035                                         } else {
4036                                                 ch = c;
4037                                                 char_boundary = 0;
4038                                         }
4039                                         break;
4040                                 case 4:
4041                                         if (ch) {
4042                                                 encode_utf8(charset, ch, c,
4043                                                             dst);
4044                                                 ch = 0;
4045                                         } else {
4046                                                 ch = c;
4047                                                 char_boundary = 0;
4048                                         }
4049                                         break;
4050                                 default:
4051                                         abort();
4052                                 }
4053                         }
4054                 }
4055         }
4056
4057 #ifdef ENABLE_COMPOSITE_CHARS
4058         if (in_composite) {
4059                 n = saved_n;
4060                 src = saved_src;
4061                 in_composite = 0;
4062                 goto back_to_square_n;  /* Wheeeeeeeee ..... */
4063         }
4064 #endif
4065
4066         str->flags = flags;
4067         str->ch = ch;
4068         str->iso2022.current_char_boundary = char_boundary;
4069         str->iso2022.current_charset = charset;
4070
4071         /* Verbum caro factum est! */
4072 }
4073 \f
4074 /************************************************************************/
4075 /*                           ISO2022 methods                            */
4076 /************************************************************************/
4077
4078 /* The following note describes the coding system ISO2022 briefly.
4079    Since the intention of this note is to help understand the
4080    functions in this file, some parts are NOT ACCURATE or OVERLY
4081    SIMPLIFIED.  For thorough understanding, please refer to the
4082    original document of ISO2022.
4083
4084    ISO2022 provides many mechanisms to encode several character sets
4085    in 7-bit and 8-bit environments.  For 7-bit environments, all text
4086    is encoded using bytes less than 128.  This may make the encoded
4087    text a little bit longer, but the text passes more easily through
4088    several gateways, some of which strip off MSB (Most Signigant Bit).
4089
4090    There are two kinds of character sets: control character set and
4091    graphic character set.  The former contains control characters such
4092    as `newline' and `escape' to provide control functions (control
4093    functions are also provided by escape sequences).  The latter
4094    contains graphic characters such as 'A' and '-'.  Emacs recognizes
4095    two control character sets and many graphic character sets.
4096
4097    Graphic character sets are classified into one of the following
4098    four classes, according to the number of bytes (DIMENSION) and
4099    number of characters in one dimension (CHARS) of the set:
4100    - DIMENSION1_CHARS94
4101    - DIMENSION1_CHARS96
4102    - DIMENSION2_CHARS94
4103    - DIMENSION2_CHARS96
4104
4105    In addition, each character set is assigned an identification tag,
4106    unique for each set, called "final character" (denoted as <F>
4107    hereafter).  The <F> of each character set is decided by ECMA(*)
4108    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
4109    (0x30..0x3F are for private use only).
4110
4111    Note (*): ECMA = European Computer Manufacturers Association
4112
4113    Here are examples of graphic character set [NAME(<F>)]:
4114         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4115         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4116         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4117         o DIMENSION2_CHARS96 -- none for the moment
4118
4119    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4120         C0 [0x00..0x1F] -- control character plane 0
4121         GL [0x20..0x7F] -- graphic character plane 0
4122         C1 [0x80..0x9F] -- control character plane 1
4123         GR [0xA0..0xFF] -- graphic character plane 1
4124
4125    A control character set is directly designated and invoked to C0 or
4126    C1 by an escape sequence.  The most common case is that:
4127    - ISO646's  control character set is designated/invoked to C0, and
4128    - ISO6429's control character set is designated/invoked to C1,
4129    and usually these designations/invocations are omitted in encoded
4130    text.  In a 7-bit environment, only C0 can be used, and a control
4131    character for C1 is encoded by an appropriate escape sequence to
4132    fit into the environment.  All control characters for C1 are
4133    defined to have corresponding escape sequences.
4134
4135    A graphic character set is at first designated to one of four
4136    graphic registers (G0 through G3), then these graphic registers are
4137    invoked to GL or GR.  These designations and invocations can be
4138    done independently.  The most common case is that G0 is invoked to
4139    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
4140    these invocations and designations are omitted in encoded text.
4141    In a 7-bit environment, only GL can be used.
4142
4143    When a graphic character set of CHARS94 is invoked to GL, codes
4144    0x20 and 0x7F of the GL area work as control characters SPACE and
4145    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4146    be used.
4147
4148    There are two ways of invocation: locking-shift and single-shift.
4149    With locking-shift, the invocation lasts until the next different
4150    invocation, whereas with single-shift, the invocation affects the
4151    following character only and doesn't affect the locking-shift
4152    state.  Invocations are done by the following control characters or
4153    escape sequences:
4154
4155    ----------------------------------------------------------------------
4156    abbrev  function                  cntrl escape seq   description
4157    ----------------------------------------------------------------------
4158    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
4159    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
4160    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
4161    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
4162    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
4163    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
4164    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
4165    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
4166    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
4167    ----------------------------------------------------------------------
4168    (*) These are not used by any known coding system.
4169
4170    Control characters for these functions are defined by macros
4171    ISO_CODE_XXX in `coding.h'.
4172
4173    Designations are done by the following escape sequences:
4174    ----------------------------------------------------------------------
4175    escape sequence      description
4176    ----------------------------------------------------------------------
4177    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
4178    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
4179    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
4180    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
4181    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
4182    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
4183    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
4184    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
4185    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
4186    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
4187    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
4188    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
4189    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
4190    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
4191    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
4192    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
4193    ----------------------------------------------------------------------
4194
4195    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4196    of dimension 1, chars 94, and final character <F>, etc...
4197
4198    Note (*): Although these designations are not allowed in ISO2022,
4199    Emacs accepts them on decoding, and produces them on encoding
4200    CHARS96 character sets in a coding system which is characterized as
4201    7-bit environment, non-locking-shift, and non-single-shift.
4202
4203    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4204    '(' can be omitted.  We refer to this as "short-form" hereafter.
4205
4206    Now you may notice that there are a lot of ways for encoding the
4207    same multilingual text in ISO2022.  Actually, there exist many
4208    coding systems such as Compound Text (used in X11's inter client
4209    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4210    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4211    localized platforms), and all of these are variants of ISO2022.
4212
4213    In addition to the above, Emacs handles two more kinds of escape
4214    sequences: ISO6429's direction specification and Emacs' private
4215    sequence for specifying character composition.
4216
4217    ISO6429's direction specification takes the following form:
4218         o CSI ']'      -- end of the current direction
4219         o CSI '0' ']'  -- end of the current direction
4220         o CSI '1' ']'  -- start of left-to-right text
4221         o CSI '2' ']'  -- start of right-to-left text
4222    The control character CSI (0x9B: control sequence introducer) is
4223    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4224
4225    Character composition specification takes the following form:
4226         o ESC '0' -- start character composition
4227         o ESC '1' -- end character composition
4228    Since these are not standard escape sequences of any ISO standard,
4229    their use with these meanings is restricted to Emacs only.  */
4230
4231 static void
4232 reset_iso2022(Lisp_Object coding_system, struct iso2022_decoder *iso)
4233 {
4234         int i;
4235
4236         for (i = 0; i < 4; i++) {
4237                 if (!NILP(coding_system))
4238                         iso->charset[i] =
4239                             XCODING_SYSTEM_ISO2022_INITIAL_CHARSET
4240                             (coding_system, i);
4241                 else
4242                         iso->charset[i] = Qt;
4243                 iso->invalid_designated[i] = 0;
4244         }
4245         iso->esc = ISO_ESC_NOTHING;
4246         iso->esc_bytes_index = 0;
4247         iso->register_left = 0;
4248         iso->register_right = 1;
4249         iso->switched_dir_and_no_valid_charset_yet = 0;
4250         iso->invalid_switch_dir = 0;
4251         iso->output_direction_sequence = 0;
4252         iso->output_literally = 0;
4253 #ifdef ENABLE_COMPOSITE_CHARS
4254         if (iso->composite_chars)
4255                 Dynarr_reset(iso->composite_chars);
4256 #endif
4257 }
4258
4259 static int fit_to_be_escape_quoted(unsigned char c)
4260 {
4261         switch (c) {
4262         case ISO_CODE_ESC:
4263         case ISO_CODE_CSI:
4264         case ISO_CODE_SS2:
4265         case ISO_CODE_SS3:
4266         case ISO_CODE_SO:
4267         case ISO_CODE_SI:
4268                 return 1;
4269
4270         default:
4271                 return 0;
4272         }
4273 }
4274
4275 /* Parse one byte of an ISO2022 escape sequence.
4276    If the result is an invalid escape sequence, return 0 and
4277    do not change anything in STR.  Otherwise, if the result is
4278    an incomplete escape sequence, update ISO2022.ESC and
4279    ISO2022.ESC_BYTES and return -1.  Otherwise, update
4280    all the state variables (but not ISO2022.ESC_BYTES) and
4281    return 1.
4282
4283    If CHECK_INVALID_CHARSETS is non-zero, check for designation
4284    or invocation of an invalid character set and treat that as
4285    an unrecognized escape sequence.
4286
4287    ********************************************************************
4288
4289    #### Strategies for error annotation and coding orthogonalization
4290
4291    We really want to separate out a number of things.  Conceptually,
4292    there is a nested syntax.
4293
4294    At the top level is the ISO 2022 extension syntax, including charset
4295    designation and invocation, and certain auxiliary controls such as the
4296    ISO 6429 direction specification.  These are octet-oriented, with the
4297    single exception (AFAIK) of the "exit Unicode" sequence which uses the
4298    UTF's natural width (1 byte for UTF-7 and UTF-8, 2 bytes for UCS-2 and
4299    UTF-16, and 4 bytes for UCS-4 and UTF-32).  This will be treated as a
4300    (deprecated) special case in Unicode processing.
4301
4302    The middle layer is ISO 2022 character interpretation.  This will depend
4303    on the current state of the ISO 2022 registers, and assembles octets
4304    into the character's internal representation.
4305
4306    The lowest level is translating system control conventions.  At present
4307    this is restricted to newline translation, but one could imagine doing
4308    tab conversion or line wrapping here.  "Escape from Unicode" processing
4309    would be done at this level.
4310
4311    At each level the parser will verify the syntax.  In the case of a
4312    syntax error or warning (such as a redundant escape sequence that affects
4313    no characters), the parser will take some action, typically inserting the
4314    erroneous octets directly into the output and creating an annotation
4315    which can be used by higher level I/O to mark the affected region.
4316
4317    This should make it possible to do something sensible about separating
4318    newline convention processing from character construction, and about
4319    preventing ISO 2022 escape sequences from being recognized
4320    inappropriately.
4321
4322    The basic strategy will be to have octet classification tables, and
4323    switch processing according to the table entry.
4324
4325    It's possible that, by doing the processing with tables of functions or
4326    the like, the parser can be used for both detection and translation. */
4327
4328 static int
4329 parse_iso2022_esc(Lisp_Object codesys, struct iso2022_decoder *iso,
4330                   unsigned char c, unsigned int *flags,
4331                   int check_invalid_charsets)
4332 {
4333         /* (1) If we're at the end of a designation sequence, CS is the
4334            charset being designated and REG is the register to designate
4335            it to.
4336
4337            (2) If we're at the end of a locking-shift sequence, REG is
4338            the register to invoke and HALF (0 == left, 1 == right) is
4339            the half to invoke it into.
4340
4341            (3) If we're at the end of a single-shift sequence, REG is
4342            the register to invoke. */
4343         Lisp_Object cs = Qnil;
4344         int reg, half;
4345
4346         /* NOTE: This code does goto's all over the fucking place.
4347            The reason for this is that we're basically implementing
4348            a state machine here, and hierarchical languages like C
4349            don't really provide a clean way of doing this. */
4350
4351         if (!(*flags & CODING_STATE_ESCAPE))
4352                 /* At beginning of escape sequence; we need to reset our
4353                    escape-state variables. */
4354                 iso->esc = ISO_ESC_NOTHING;
4355
4356         iso->output_literally = 0;
4357         iso->output_direction_sequence = 0;
4358
4359         switch (iso->esc) {
4360         case ISO_ESC_NOTHING:
4361                 iso->esc_bytes_index = 0;
4362                 switch (c) {
4363                 case ISO_CODE_ESC:      /* Start escape sequence */
4364                         *flags |= CODING_STATE_ESCAPE;
4365                         iso->esc = ISO_ESC;
4366                         goto not_done;
4367
4368                 case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
4369                         *flags |= CODING_STATE_ESCAPE;
4370                         iso->esc = ISO_ESC_5_11;
4371                         goto not_done;
4372
4373                 case ISO_CODE_SO:       /* locking shift 1 */
4374                         reg = 1;
4375                         half = 0;
4376                         goto locking_shift;
4377                 case ISO_CODE_SI:       /* locking shift 0 */
4378                         reg = 0;
4379                         half = 0;
4380                         goto locking_shift;
4381
4382                 case ISO_CODE_SS2:      /* single shift */
4383                         reg = 2;
4384                         goto single_shift;
4385                 case ISO_CODE_SS3:      /* single shift */
4386                         reg = 3;
4387                         goto single_shift;
4388
4389                 default:        /* Other control characters */
4390                         return 0;
4391                 }
4392
4393         case ISO_ESC:
4394                 switch (c) {
4395           /**** single shift ****/
4396
4397                 case 'N':       /* single shift 2 */
4398                         reg = 2;
4399                         goto single_shift;
4400                 case 'O':       /* single shift 3 */
4401                         reg = 3;
4402                         goto single_shift;
4403
4404           /**** locking shift ****/
4405
4406                 case '~':       /* locking shift 1 right */
4407                         reg = 1;
4408                         half = 1;
4409                         goto locking_shift;
4410                 case 'n':       /* locking shift 2 */
4411                         reg = 2;
4412                         half = 0;
4413                         goto locking_shift;
4414                 case '}':       /* locking shift 2 right */
4415                         reg = 2;
4416                         half = 1;
4417                         goto locking_shift;
4418                 case 'o':       /* locking shift 3 */
4419                         reg = 3;
4420                         half = 0;
4421                         goto locking_shift;
4422                 case '|':       /* locking shift 3 right */
4423                         reg = 3;
4424                         half = 1;
4425                         goto locking_shift;
4426
4427 #ifdef ENABLE_COMPOSITE_CHARS
4428           /**** composite ****/
4429
4430                 case '0':
4431                         iso->esc = ISO_ESC_START_COMPOSITE;
4432                         *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4433                             CODING_STATE_COMPOSITE;
4434                         return 1;
4435
4436                 case '1':
4437                         iso->esc = ISO_ESC_END_COMPOSITE;
4438                         *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4439                             ~CODING_STATE_COMPOSITE;
4440                         return 1;
4441 #endif                          /* ENABLE_COMPOSITE_CHARS */
4442
4443           /**** directionality ****/
4444
4445                 case '[':
4446                         iso->esc = ISO_ESC_5_11;
4447                         goto not_done;
4448
4449           /**** designation ****/
4450
4451                 case '$':       /* multibyte charset prefix */
4452                         iso->esc = ISO_ESC_2_4;
4453                         goto not_done;
4454
4455                 default:
4456                         if (0x28 <= c && c <= 0x2F) {
4457                                 iso->esc =
4458                                     (enum iso_esc_flag)(c - 0x28 + ISO_ESC_2_8);
4459                                 goto not_done;
4460                         }
4461
4462                         /* This function is called with CODESYS equal to nil when
4463                            doing coding-system detection. */
4464                         if (!NILP(codesys)
4465                             && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
4466                             && fit_to_be_escape_quoted(c)) {
4467                                 iso->esc = ISO_ESC_LITERAL;
4468                                 *flags &= CODING_STATE_ISO2022_LOCK;
4469                                 return 1;
4470                         }
4471
4472                         /* bzzzt! */
4473                         return 0;
4474                 }
4475
4476       /**** directionality ****/
4477
4478         case ISO_ESC_5_11:      /* ISO6429 direction control */
4479                 if (c == ']') {
4480                         *flags &=
4481                             (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4482                         goto directionality;
4483                 }
4484                 if (c == '0')
4485                         iso->esc = ISO_ESC_5_11_0;
4486                 else if (c == '1')
4487                         iso->esc = ISO_ESC_5_11_1;
4488                 else if (c == '2')
4489                         iso->esc = ISO_ESC_5_11_2;
4490                 else
4491                         return 0;
4492                 goto not_done;
4493
4494         case ISO_ESC_5_11_0:
4495                 if (c == ']') {
4496                         *flags &=
4497                             (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4498                         goto directionality;
4499                 }
4500                 return 0;
4501
4502         case ISO_ESC_5_11_1:
4503                 if (c == ']') {
4504                         *flags =
4505                             (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4506                         goto directionality;
4507                 }
4508                 return 0;
4509
4510         case ISO_ESC_5_11_2:
4511                 if (c == ']') {
4512                         *flags =
4513                             (*flags & CODING_STATE_ISO2022_LOCK) |
4514                             CODING_STATE_R2L;
4515                         goto directionality;
4516                 }
4517                 return 0;
4518
4519         directionality:
4520                 iso->esc = ISO_ESC_DIRECTIONALITY;
4521                 /* Various junk here to attempt to preserve the direction
4522                    sequences literally in the text if they would otherwise be
4523                    swallowed due to invalid designations that don't show up as
4524                    actual charset changes in the text. */
4525                 if (iso->invalid_switch_dir) {
4526                         /* We already inserted a direction switch literally into
4527                            the text.  We assume (#### this may not be right)
4528                            that the next direction switch is the one going the
4529                            other way, and we need to output that literally as
4530                            well. */
4531                         iso->output_literally = 1;
4532                         iso->invalid_switch_dir = 0;
4533                 } else {
4534                         int jj;
4535
4536                         /* If we are in the thrall of an invalid designation,
4537                            then stick the directionality sequence literally into
4538                            the output stream so it ends up in the original text
4539                            again. */
4540                         for (jj = 0; jj < 4; jj++)
4541                                 if (iso->invalid_designated[jj])
4542                                         break;
4543                         if (jj < 4) {
4544                                 iso->output_literally = 1;
4545                                 iso->invalid_switch_dir = 1;
4546                         } else
4547                                 /* Indicate that we haven't yet seen a valid
4548                                    designation, so that if a switch-dir is
4549                                    directly followed by an invalid designation,
4550                                    both get inserted literally. */
4551                                 iso->switched_dir_and_no_valid_charset_yet = 1;
4552                 }
4553                 return 1;
4554
4555                 /**** designation ****/
4556
4557         case ISO_ESC_2_4:
4558                 if (0x28 <= c && c <= 0x2F) {
4559                         iso->esc =
4560                             (enum iso_esc_flag)(c - 0x28 + ISO_ESC_2_4_8);
4561                         goto not_done;
4562                 }
4563                 if (0x40 <= c && c <= 0x42) {
4564                         cs = CHARSET_BY_ATTRIBUTES(CHARSET_TYPE_94X94, c,
4565                                                    *flags & CODING_STATE_R2L ?
4566                                                    CHARSET_RIGHT_TO_LEFT :
4567                                                    CHARSET_LEFT_TO_RIGHT);
4568                         reg = 0;
4569                         goto designated;
4570                 }
4571                 return 0;
4572
4573                 /* list the rest */
4574         case ISO_ESC_2_8:
4575         case ISO_ESC_2_9:
4576         case ISO_ESC_2_10:
4577         case ISO_ESC_2_11:
4578         case ISO_ESC_2_12:
4579         case ISO_ESC_2_13:
4580         case ISO_ESC_2_14:
4581         case ISO_ESC_2_15:
4582         case ISO_ESC_2_4_8:
4583         case ISO_ESC_2_4_9:
4584         case ISO_ESC_2_4_10:
4585         case ISO_ESC_2_4_11:
4586         case ISO_ESC_2_4_12:
4587         case ISO_ESC_2_4_13:
4588         case ISO_ESC_2_4_14:
4589         case ISO_ESC_2_4_15:
4590         case ISO_ESC_SINGLE_SHIFT:
4591         case ISO_ESC_LOCKING_SHIFT:
4592         case ISO_ESC_DESIGNATE:
4593         case ISO_ESC_DIRECTIONALITY:
4594         case ISO_ESC_LITERAL:
4595
4596         default: {
4597                 int type = -1;
4598
4599                 if (c < '0' || c > '~')
4600                         return 0;       /* bad final byte */
4601
4602                 if (iso->esc >= ISO_ESC_2_8 && iso->esc <= ISO_ESC_2_15) {
4603                         type = ((iso->esc >= ISO_ESC_2_12) ?
4604                                 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4605                         reg = (iso->esc - ISO_ESC_2_8) & 3;
4606                 } else if (iso->esc >= ISO_ESC_2_4_8 &&
4607                            iso->esc <= ISO_ESC_2_4_15) {
4608                         type = ((iso->esc >= ISO_ESC_2_4_12) ?
4609                                 CHARSET_TYPE_96X96 :
4610                                 CHARSET_TYPE_94X94);
4611                         reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4612                 } else {
4613                         /* Can this ever be reached? -slb */
4614                         abort();
4615                         return 0;
4616                 }
4617
4618                 cs = CHARSET_BY_ATTRIBUTES(type, c,
4619                                            *flags & CODING_STATE_R2L ?
4620                                            CHARSET_RIGHT_TO_LEFT :
4621                                            CHARSET_LEFT_TO_RIGHT);
4622                 goto designated;
4623         }
4624         }
4625
4626       not_done:
4627         iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char)c;
4628         return -1;
4629
4630       single_shift:
4631         if (check_invalid_charsets && !CHARSETP(iso->charset[reg]))
4632                 /* can't invoke something that ain't there. */
4633                 return 0;
4634         iso->esc = ISO_ESC_SINGLE_SHIFT;
4635         *flags &= CODING_STATE_ISO2022_LOCK;
4636         if (reg == 2)
4637                 *flags |= CODING_STATE_SS2;
4638         else
4639                 *flags |= CODING_STATE_SS3;
4640         return 1;
4641
4642       locking_shift:
4643         if (check_invalid_charsets && !CHARSETP(iso->charset[reg]))
4644                 /* can't invoke something that ain't there. */
4645                 return 0;
4646         if (half)
4647                 iso->register_right = reg;
4648         else
4649                 iso->register_left = reg;
4650         *flags &= CODING_STATE_ISO2022_LOCK;
4651         iso->esc = ISO_ESC_LOCKING_SHIFT;
4652         return 1;
4653
4654       designated:
4655         if (NILP(cs) && check_invalid_charsets) {
4656                 iso->invalid_designated[reg] = 1;
4657                 iso->charset[reg] = Vcharset_ascii;
4658                 iso->esc = ISO_ESC_DESIGNATE;
4659                 *flags &= CODING_STATE_ISO2022_LOCK;
4660                 iso->output_literally = 1;
4661                 if (iso->switched_dir_and_no_valid_charset_yet) {
4662                         /* We encountered a switch-direction followed by an
4663                            invalid designation.  Ensure that the switch-direction
4664                            gets outputted; otherwise it will probably get eaten
4665                            when the text is written out again. */
4666                         iso->switched_dir_and_no_valid_charset_yet = 0;
4667                         iso->output_direction_sequence = 1;
4668                         /* And make sure that the switch-dir going the other
4669                            way gets outputted, as well. */
4670                         iso->invalid_switch_dir = 1;
4671                 }
4672                 return 1;
4673         }
4674         /* This function is called with CODESYS equal to nil when
4675            doing coding-system detection. */
4676         if (!NILP(codesys)) {
4677                 charset_conversion_spec_dynarr *dyn =
4678                     XCODING_SYSTEM(codesys)->iso2022.input_conv;
4679
4680                 if (dyn) {
4681                         int i;
4682
4683                         for (i = 0; i < Dynarr_length(dyn); i++) {
4684                                 struct charset_conversion_spec *spec =
4685                                     Dynarr_atp(dyn, i);
4686                                 if (EQ(cs, spec->from_charset))
4687                                         cs = spec->to_charset;
4688                         }
4689                 }
4690         }
4691
4692         iso->charset[reg] = cs;
4693         iso->esc = ISO_ESC_DESIGNATE;
4694         *flags &= CODING_STATE_ISO2022_LOCK;
4695         if (iso->invalid_designated[reg]) {
4696                 iso->invalid_designated[reg] = 0;
4697                 iso->output_literally = 1;
4698         }
4699         if (iso->switched_dir_and_no_valid_charset_yet)
4700                 iso->switched_dir_and_no_valid_charset_yet = 0;
4701         return 1;
4702 }
4703
4704 static int
4705 detect_coding_iso2022(struct detection_state *st, const Extbyte * src,
4706                       Lstream_data_count n)
4707 {
4708         int mask;
4709
4710         /* #### There are serious deficiencies in the recognition mechanism
4711            here.  This needs to be much smarter if it's going to cut it.
4712            The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4713            it should be detected as Latin-1.
4714            All the ISO2022 stuff in this file should be synced up with the
4715            code from FSF Emacs-20.4, in which Mule should be more or less stable.
4716            Perhaps we should wait till R2L works in FSF Emacs? */
4717
4718         if (!st->iso2022.initted) {
4719                 reset_iso2022(Qnil, &st->iso2022.iso);
4720                 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4721                                     CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4722                                     CODING_CATEGORY_ISO_8_1_MASK |
4723                                     CODING_CATEGORY_ISO_8_2_MASK |
4724                                     CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4725                 st->iso2022.flags = 0;
4726                 st->iso2022.high_byte_count = 0;
4727                 st->iso2022.saw_single_shift = 0;
4728                 st->iso2022.initted = 1;
4729         }
4730
4731         mask = st->iso2022.mask;
4732
4733         while (n--) {
4734                 const unsigned char c = *(const unsigned char *)src++;
4735                 if (c >= 0xA0) {
4736                         mask &= ~CODING_CATEGORY_ISO_7_MASK;
4737                         st->iso2022.high_byte_count++;
4738                 } else {
4739                         if (st->iso2022.high_byte_count
4740                             && !st->iso2022.saw_single_shift) {
4741                                 if (st->iso2022.high_byte_count & 1)
4742                                         /* odd number of high bytes; assume not iso-8-2 */
4743                                         mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4744                         }
4745                         st->iso2022.high_byte_count = 0;
4746                         st->iso2022.saw_single_shift = 0;
4747                         if (c > 0x80)
4748                                 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4749                 }
4750                 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4751                     && (BYTE_C0_P(c) || BYTE_C1_P(c))) {        /* control chars */
4752                         switch (c) {
4753                                 /* Allow and ignore control characters that you might
4754                                    reasonably see in a text file */
4755                         case '\r':
4756                         case '\n':
4757                         case '\t':
4758                         case 7: /* bell */
4759                         case 8: /* backspace */
4760                         case 11:        /* vertical tab */
4761                         case 12:        /* form feed */
4762                         case 26:        /* MS-DOS C-z junk */
4763                         case 31:        /* '^_' -- for info */
4764                                 goto label_continue_loop;
4765
4766                         default:
4767                                 break;
4768                         }
4769                 }
4770
4771                 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P(c)
4772                     || BYTE_C1_P(c)) {
4773                         if (parse_iso2022_esc(Qnil, &st->iso2022.iso, c,
4774                                               &st->iso2022.flags, 0)) {
4775                                 switch (st->iso2022.iso.esc) {
4776                                 case ISO_ESC_DESIGNATE:
4777                                         mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4778                                         mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4779                                         break;
4780                                 case ISO_ESC_LOCKING_SHIFT:
4781                                         mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4782                                         goto ran_out_of_chars;
4783                                 case ISO_ESC_SINGLE_SHIFT:
4784                                         mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4785                                         st->iso2022.saw_single_shift = 1;
4786                                         break;
4787
4788                                         /* list the rest */
4789                                 case ISO_ESC_NOTHING:
4790                                 case ISO_ESC:
4791                                 case ISO_ESC_2_4:
4792                                 case ISO_ESC_2_8:
4793                                 case ISO_ESC_2_9:
4794                                 case ISO_ESC_2_10:
4795                                 case ISO_ESC_2_11:
4796                                 case ISO_ESC_2_12:
4797                                 case ISO_ESC_2_13:
4798                                 case ISO_ESC_2_14:
4799                                 case ISO_ESC_2_15:
4800                                 case ISO_ESC_2_4_8:
4801                                 case ISO_ESC_2_4_9:
4802                                 case ISO_ESC_2_4_10:
4803                                 case ISO_ESC_2_4_11:
4804                                 case ISO_ESC_2_4_12:
4805                                 case ISO_ESC_2_4_13:
4806                                 case ISO_ESC_2_4_14:
4807                                 case ISO_ESC_2_4_15:
4808                                 case ISO_ESC_5_11:
4809                                 case ISO_ESC_5_11_0:
4810                                 case ISO_ESC_5_11_1:
4811                                 case ISO_ESC_5_11_2:
4812                                 case ISO_ESC_DIRECTIONALITY:
4813                                 case ISO_ESC_LITERAL:
4814                                 default:
4815                                         break;
4816                                 }
4817                         } else {
4818                                 mask = 0;
4819                                 goto ran_out_of_chars;
4820                         }
4821                 }
4822         label_continue_loop:;
4823         }
4824
4825 ran_out_of_chars:
4826         return mask;
4827 }
4828
4829 static int postprocess_iso2022_mask(int mask)
4830 {
4831         /* #### kind of cheesy */
4832         /* If seven-bit ISO is allowed, then assume that the encoding is
4833            entirely seven-bit and turn off the eight-bit ones. */
4834         if (mask & CODING_CATEGORY_ISO_7_MASK)
4835                 mask &= ~(CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4836                           CODING_CATEGORY_ISO_8_1_MASK |
4837                           CODING_CATEGORY_ISO_8_2_MASK);
4838         return mask;
4839 }
4840
4841 /* If FLAGS is a null pointer or specifies right-to-left motion,
4842    output a switch-dir-to-left-to-right sequence to DST.
4843    Also update FLAGS if it is not a null pointer.
4844    If INTERNAL_P is set, we are outputting in internal format and
4845    need to handle the CSI differently. */
4846
4847 static void
4848 restore_left_to_right_direction(Lisp_Coding_System * codesys,
4849                                 unsigned_char_dynarr * dst,
4850                                 unsigned int *flags, int internal_p)
4851 {
4852         if (!flags || (*flags & CODING_STATE_R2L)) {
4853                 if (CODING_SYSTEM_ISO2022_SEVEN(codesys)) {
4854                         Dynarr_add(dst, ISO_CODE_ESC);
4855                         Dynarr_add(dst, '[');
4856                 } else if (internal_p)
4857                         DECODE_ADD_BINARY_CHAR(ISO_CODE_CSI, dst);
4858                 else
4859                         Dynarr_add(dst, ISO_CODE_CSI);
4860                 Dynarr_add(dst, '0');
4861                 Dynarr_add(dst, ']');
4862                 if (flags)
4863                         *flags &= ~CODING_STATE_R2L;
4864         }
4865 }
4866
4867 /* If FLAGS is a null pointer or specifies a direction different from
4868    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4869    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4870    sequence to DST.  Also update FLAGS if it is not a null pointer.
4871    If INTERNAL_P is set, we are outputting in internal format and
4872    need to handle the CSI differently. */
4873
4874 static void
4875 ensure_correct_direction(int direction, Lisp_Coding_System * codesys,
4876                          unsigned_char_dynarr * dst, unsigned int *flags,
4877                          int internal_p)
4878 {
4879         if ((!flags || (*flags & CODING_STATE_R2L)) &&
4880             direction == CHARSET_LEFT_TO_RIGHT)
4881                 restore_left_to_right_direction(codesys, dst, flags,
4882                                                 internal_p);
4883         else if (!CODING_SYSTEM_ISO2022_NO_ISO6429(codesys)
4884                  && (!flags || !(*flags & CODING_STATE_R2L)) &&
4885                  direction == CHARSET_RIGHT_TO_LEFT) {
4886                 if (CODING_SYSTEM_ISO2022_SEVEN(codesys)) {
4887                         Dynarr_add(dst, ISO_CODE_ESC);
4888                         Dynarr_add(dst, '[');
4889                 } else if (internal_p)
4890                         DECODE_ADD_BINARY_CHAR(ISO_CODE_CSI, dst);
4891                 else
4892                         Dynarr_add(dst, ISO_CODE_CSI);
4893                 Dynarr_add(dst, '2');
4894                 Dynarr_add(dst, ']');
4895                 if (flags)
4896                         *flags |= CODING_STATE_R2L;
4897         }
4898 }
4899
4900 /* Convert ISO2022-format data to internal format. */
4901
4902 static void
4903 decode_coding_iso2022(lstream_t decoding, const Extbyte * src,
4904                       unsigned_char_dynarr * dst, Lstream_data_count n)
4905 {
4906         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
4907         unsigned int flags = str->flags;
4908         unsigned int ch = str->ch;
4909         eol_type_t eol_type = str->eol_type;
4910 #ifdef ENABLE_COMPOSITE_CHARS
4911         unsigned_char_dynarr *real_dst = dst;
4912 #endif
4913         Lisp_Object coding_system;
4914
4915         XSETCODING_SYSTEM(coding_system, str->codesys);
4916
4917 #ifdef ENABLE_COMPOSITE_CHARS
4918         if (flags & CODING_STATE_COMPOSITE)
4919                 dst = str->iso2022.composite_chars;
4920 #endif                          /* ENABLE_COMPOSITE_CHARS */
4921
4922         while (n--) {
4923                 const unsigned char c = *(const unsigned char *)src++;
4924                 if (flags & CODING_STATE_ESCAPE) {
4925                         /* Within ESC sequence */
4926                         int retval = parse_iso2022_esc(
4927                                 coding_system, &str->iso2022, c, &flags, 1);
4928
4929                         if (retval) {
4930                                 switch (str->iso2022.esc) {
4931 #ifdef ENABLE_COMPOSITE_CHARS
4932                                 case ISO_ESC_START_COMPOSITE:
4933                                         if (str->iso2022.composite_chars)
4934                                                 Dynarr_reset(str->iso2022.
4935                                                              composite_chars);
4936                                         else
4937                                                 str->iso2022.composite_chars =
4938                                                     Dynarr_new(unsigned_char);
4939                                         dst = str->iso2022.composite_chars;
4940                                         break;
4941                                 case ISO_ESC_END_COMPOSITE:
4942                                         {
4943                                                 Bufbyte comstr[MAX_EMCHAR_LEN];
4944                                                 Bytecount len;
4945                                                 Emchar emch =
4946                                                     lookup_composite_char
4947                                                     (Dynarr_atp(dst, 0),
4948                                                      Dynarr_length(dst));
4949                                                 dst = real_dst;
4950                                                 len =
4951                                                     set_charptr_emchar(comstr,
4952                                                                        emch);
4953                                                 Dynarr_add_many(dst, comstr,
4954                                                                 len);
4955                                                 break;
4956                                         }
4957 #endif                          /* ENABLE_COMPOSITE_CHARS */
4958
4959                                 case ISO_ESC_LITERAL:
4960                                         DECODE_ADD_BINARY_CHAR(c, dst);
4961                                         break;
4962
4963                                 case ISO_ESC_NOTHING:
4964                                 case ISO_ESC:
4965                                 case ISO_ESC_2_4:
4966                                 case ISO_ESC_2_8:
4967                                 case ISO_ESC_2_9:
4968                                 case ISO_ESC_2_10:
4969                                 case ISO_ESC_2_11:
4970                                 case ISO_ESC_2_12:
4971                                 case ISO_ESC_2_13:
4972                                 case ISO_ESC_2_14:
4973                                 case ISO_ESC_2_15:
4974                                 case ISO_ESC_2_4_8:
4975                                 case ISO_ESC_2_4_9:
4976                                 case ISO_ESC_2_4_10:
4977                                 case ISO_ESC_2_4_11:
4978                                 case ISO_ESC_2_4_12:
4979                                 case ISO_ESC_2_4_13:
4980                                 case ISO_ESC_2_4_14:
4981                                 case ISO_ESC_2_4_15:
4982                                 case ISO_ESC_5_11:
4983                                 case ISO_ESC_5_11_0:
4984                                 case ISO_ESC_5_11_1:
4985                                 case ISO_ESC_5_11_2:
4986                                 case ISO_ESC_SINGLE_SHIFT:
4987                                 case ISO_ESC_LOCKING_SHIFT:
4988                                 case ISO_ESC_DESIGNATE:
4989                                 case ISO_ESC_DIRECTIONALITY:
4990
4991                                 default:
4992                                         /* Everything else handled already */
4993                                         break;
4994                                 }
4995                         }
4996
4997                         /* Attempted error recovery. */
4998                         if (str->iso2022.output_direction_sequence)
4999                                 ensure_correct_direction(flags &
5000                                                          CODING_STATE_R2L ?
5001                                                          CHARSET_RIGHT_TO_LEFT :
5002                                                          CHARSET_LEFT_TO_RIGHT,
5003                                                          str->codesys, dst, 0,
5004                                                          1);
5005                         /* More error recovery. */
5006                         if (!retval || str->iso2022.output_literally) {
5007                                 /* Output the (possibly invalid) sequence */
5008                                 int i;
5009                                 for (i = 0; i < str->iso2022.esc_bytes_index;
5010                                      i++)
5011                                         DECODE_ADD_BINARY_CHAR(str->iso2022.
5012                                                                esc_bytes[i],
5013                                                                dst);
5014                                 flags &= CODING_STATE_ISO2022_LOCK;
5015                                 if (!retval)
5016                                         n++, src--;     /* Repeat the loop with the same character. */
5017                                 else {
5018                                         /* No sense in reprocessing the final byte of the
5019                                            escape sequence; it could mess things up anyway.
5020                                            Just add it now. */
5021                                         DECODE_ADD_BINARY_CHAR(c, dst);
5022                                 }
5023                         }
5024                         ch = 0;
5025                 } else if (BYTE_C0_P(c) || BYTE_C1_P(c)) {      /* Control characters */
5026
5027           /***** Error-handling *****/
5028
5029                         /* If we were in the middle of a character, dump out the
5030                            partial character. */
5031                         DECODE_OUTPUT_PARTIAL_CHAR(ch);
5032
5033                         /* If we just saw a single-shift character, dump it out.
5034                            This may dump out the wrong sort of single-shift character,
5035                            but least it will give an indication that something went
5036                            wrong. */
5037                         if (flags & CODING_STATE_SS2) {
5038                                 DECODE_ADD_BINARY_CHAR(ISO_CODE_SS2, dst);
5039                                 flags &= ~CODING_STATE_SS2;
5040                         }
5041                         if (flags & CODING_STATE_SS3) {
5042                                 DECODE_ADD_BINARY_CHAR(ISO_CODE_SS3, dst);
5043                                 flags &= ~CODING_STATE_SS3;
5044                         }
5045
5046           /***** Now handle the control characters. *****/
5047
5048                         /* Handle CR/LF */
5049                         DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5050
5051                         flags &= CODING_STATE_ISO2022_LOCK;
5052
5053                         if (!parse_iso2022_esc
5054                             (coding_system, &str->iso2022, c, &flags, 1))
5055                                 DECODE_ADD_BINARY_CHAR(c, dst);
5056                 } else {        /* Graphic characters */
5057                         Lisp_Object charset;
5058                         int lb;
5059                         int reg;
5060
5061                         DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5062
5063                         /* Now determine the charset. */
5064                         reg = ((flags & CODING_STATE_SS2) ? 2
5065                                : (flags & CODING_STATE_SS3) ? 3
5066                                : !BYTE_ASCII_P(c) ? str->iso2022.register_right
5067                                : str->iso2022.register_left);
5068                         charset = str->iso2022.charset[reg];
5069
5070                         /* Error checking: */
5071                         if (!CHARSETP(charset)
5072                             || str->iso2022.invalid_designated[reg]
5073                             ||
5074                             (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5075                              && XCHARSET_CHARS(charset) == 94))
5076                                 /* Mrmph.  We are trying to invoke a register that has no
5077                                    or an invalid charset in it, or trying to add a character
5078                                    outside the range of the charset.  Insert that char literally
5079                                    to preserve it for the output. */
5080                         {
5081                                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5082                                 DECODE_ADD_BINARY_CHAR(c, dst);
5083                         }
5084
5085                         else {
5086                                 /* Things are probably hunky-dorey. */
5087
5088                                 /* Fetch reverse charset, maybe. */
5089                                 if (((flags & CODING_STATE_R2L) &&
5090                                      XCHARSET_DIRECTION(charset) ==
5091                                      CHARSET_LEFT_TO_RIGHT)
5092                                     || (!(flags & CODING_STATE_R2L)
5093                                         && XCHARSET_DIRECTION(charset) ==
5094                                         CHARSET_RIGHT_TO_LEFT)) {
5095                                         Lisp_Object new_charset =
5096                                             XCHARSET_REVERSE_DIRECTION_CHARSET
5097                                             (charset);
5098                                         if (!NILP(new_charset))
5099                                                 charset = new_charset;
5100                                 }
5101
5102                                 lb = XCHARSET_LEADING_BYTE(charset);
5103                                 switch (XCHARSET_REP_BYTES(charset)) {
5104                                 case 1: /* ASCII */
5105                                         DECODE_OUTPUT_PARTIAL_CHAR(ch);
5106                                         Dynarr_add(dst, c & 0x7F);
5107                                         break;
5108
5109                                 case 2: /* one-byte official */
5110                                         DECODE_OUTPUT_PARTIAL_CHAR(ch);
5111                                         Dynarr_add(dst, lb);
5112                                         Dynarr_add(dst, c | 0x80);
5113                                         break;
5114
5115                                 case 3: /* one-byte private or two-byte official */
5116                                         if (XCHARSET_PRIVATE_P(charset)) {
5117                                                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5118                                                 Dynarr_add(dst,
5119                                                            PRE_LEADING_BYTE_PRIVATE_1);
5120                                                 Dynarr_add(dst, lb);
5121                                                 Dynarr_add(dst, c | 0x80);
5122                                         } else {
5123                                                 if (ch) {
5124                                                         Dynarr_add(dst, lb);
5125                                                         Dynarr_add(dst,
5126                                                                    ch | 0x80);
5127                                                         Dynarr_add(dst,
5128                                                                    c | 0x80);
5129                                                         ch = 0;
5130                                                 } else
5131                                                         ch = c;
5132                                         }
5133                                         break;
5134
5135                                 default:        /* two-byte private */
5136                                         if (ch) {
5137                                                 Dynarr_add(dst,
5138                                                            PRE_LEADING_BYTE_PRIVATE_2);
5139                                                 Dynarr_add(dst, lb);
5140                                                 Dynarr_add(dst, ch | 0x80);
5141                                                 Dynarr_add(dst, c | 0x80);
5142                                                 ch = 0;
5143                                         } else
5144                                                 ch = c;
5145                                 }
5146                         }
5147
5148                         if (!ch)
5149                                 flags &= CODING_STATE_ISO2022_LOCK;
5150                 }
5151
5152               label_continue_loop:;
5153         }
5154
5155         if (flags & CODING_STATE_END)
5156                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5157
5158         str->flags = flags;
5159         str->ch = ch;
5160 }
5161
5162 /***** ISO2022 encoder *****/
5163
5164 /* Designate CHARSET into register REG. */
5165
5166 static void
5167 iso2022_designate(Lisp_Object charset, unsigned char reg,
5168                   encoding_stream_t str, unsigned_char_dynarr * dst)
5169 {
5170         static const char inter94[] = "()*+";
5171         static const char inter96[] = ",-./";
5172         unsigned int type;
5173         unsigned char final;
5174         Lisp_Object old_charset = str->iso2022.charset[reg];
5175
5176         str->iso2022.charset[reg] = charset;
5177         if (!CHARSETP(charset))
5178                 /* charset might be an initial nil or t. */
5179                 return;
5180         type = XCHARSET_TYPE(charset);
5181         final = XCHARSET_FINAL(charset);
5182         if (!str->iso2022.force_charset_on_output[reg] &&
5183             CHARSETP(old_charset) &&
5184             XCHARSET_TYPE(old_charset) == type &&
5185             XCHARSET_FINAL(old_charset) == final)
5186                 return;
5187
5188         str->iso2022.force_charset_on_output[reg] = 0;
5189
5190         {
5191                 charset_conversion_spec_dynarr *dyn =
5192                     str->codesys->iso2022.output_conv;
5193
5194                 if (dyn) {
5195                         int i;
5196
5197                         for (i = 0; i < Dynarr_length(dyn); i++) {
5198                                 struct charset_conversion_spec *spec =
5199                                     Dynarr_atp(dyn, i);
5200                                 if (EQ(charset, spec->from_charset))
5201                                         charset = spec->to_charset;
5202                         }
5203                 }
5204         }
5205
5206         Dynarr_add(dst, ISO_CODE_ESC);
5207         switch (type) {
5208         case CHARSET_TYPE_94:
5209                 Dynarr_add(dst, inter94[reg]);
5210                 break;
5211         case CHARSET_TYPE_96:
5212                 Dynarr_add(dst, inter96[reg]);
5213                 break;
5214         case CHARSET_TYPE_94X94:
5215                 Dynarr_add(dst, '$');
5216                 if (reg != 0 || !(CODING_SYSTEM_ISO2022_SHORT(str->codesys))
5217                     || final < '@' || final > 'B')
5218                         Dynarr_add(dst, inter94[reg]);
5219                 break;
5220         case CHARSET_TYPE_96X96:
5221                 Dynarr_add(dst, '$');
5222                 Dynarr_add(dst, inter96[reg]);
5223                 break;
5224         default:
5225                 break;
5226         }
5227         Dynarr_add(dst, final);
5228 }
5229
5230 static void
5231 ensure_normal_shift(encoding_stream_t str, unsigned_char_dynarr * dst)
5232 {
5233         if (str->iso2022.register_left != 0) {
5234                 Dynarr_add(dst, ISO_CODE_SI);
5235                 str->iso2022.register_left = 0;
5236         }
5237 }
5238
5239 static void
5240 ensure_shift_out(encoding_stream_t str, unsigned_char_dynarr * dst)
5241 {
5242         if (str->iso2022.register_left != 1) {
5243                 Dynarr_add(dst, ISO_CODE_SO);
5244                 str->iso2022.register_left = 1;
5245         }
5246 }
5247
5248 /* Convert internally-formatted data to ISO2022 format. */
5249
5250 static void
5251 encode_coding_iso2022(lstream_t encoding, const Bufbyte * src,
5252                       unsigned_char_dynarr * dst, Lstream_data_count n)
5253 {
5254         unsigned char charmask, c;
5255         unsigned char char_boundary;
5256         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
5257         unsigned int flags = str->flags;
5258         unsigned int ch = str->ch;
5259         Lisp_Coding_System *codesys = str->codesys;
5260         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
5261         int i;
5262         Lisp_Object charset;
5263         int half;
5264
5265 #ifdef ENABLE_COMPOSITE_CHARS
5266         /* flags for handling composite chars.  We do a little switcharoo
5267            on the source while we're outputting the composite char. */
5268         unsigned int saved_n = 0;
5269         const unsigned char *saved_src = NULL;
5270         int in_composite = 0;
5271 #endif                          /* ENABLE_COMPOSITE_CHARS */
5272
5273         char_boundary = str->iso2022.current_char_boundary;
5274         charset = str->iso2022.current_charset;
5275         half = str->iso2022.current_half;
5276
5277 #ifdef ENABLE_COMPOSITE_CHARS
5278       back_to_square_n:
5279 #endif
5280         while (n--) {
5281                 c = *src++;
5282
5283                 if (BYTE_ASCII_P(c)) {  /* Processing ASCII character */
5284                         ch = 0;
5285
5286                         restore_left_to_right_direction(codesys, dst, &flags,
5287                                                         0);
5288
5289                         /* Make sure G0 contains ASCII */
5290                         if ((c > ' ' && c < ISO_CODE_DEL) ||
5291                             !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL(codesys)) {
5292                                 ensure_normal_shift(str, dst);
5293                                 iso2022_designate(Vcharset_ascii, 0, str, dst);
5294                         }
5295
5296                         /* If necessary, restore everything to the default state
5297                            at end-of-line */
5298                         if (c == '\n' &&
5299                             !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL(codesys))) {
5300                                 restore_left_to_right_direction(codesys, dst,
5301                                                                 &flags, 0);
5302
5303                                 ensure_normal_shift(str, dst);
5304
5305                                 for (i = 0; i < 4; i++) {
5306                                         Lisp_Object initial_charset =
5307                                             CODING_SYSTEM_ISO2022_INITIAL_CHARSET
5308                                             (codesys, i);
5309                                         iso2022_designate(initial_charset, i,
5310                                                           str, dst);
5311                                 }
5312                         }
5313                         if (c == '\n') {
5314                                 if (eol_type != EOL_LF
5315                                     && eol_type != EOL_AUTODETECT)
5316                                         Dynarr_add(dst, '\r');
5317                                 if (eol_type != EOL_CR)
5318                                         Dynarr_add(dst, c);
5319                         } else {
5320                                 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
5321                                     && fit_to_be_escape_quoted(c))
5322                                         Dynarr_add(dst, ISO_CODE_ESC);
5323                                 Dynarr_add(dst, c);
5324                         }
5325                         char_boundary = 1;
5326                 }
5327
5328                 else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) {     /* Processing Leading Byte */
5329                         ch = 0;
5330                         charset = CHARSET_BY_LEADING_BYTE(c);
5331                         if (LEADING_BYTE_PREFIX_P(c))
5332                                 ch = c;
5333                         else if (!EQ(charset, Vcharset_control_1)
5334 #ifdef ENABLE_COMPOSITE_CHARS
5335                                  && !EQ(charset, Vcharset_composite)
5336 #endif
5337                             ) {
5338                                 int reg;
5339
5340                                 ensure_correct_direction(XCHARSET_DIRECTION
5341                                                          (charset), codesys,
5342                                                          dst, &flags, 0);
5343
5344                                 /* Now determine which register to use. */
5345                                 reg = -1;
5346                                 for (i = 0; i < 4; i++) {
5347                                         if (EQ(charset, str->iso2022.charset[i])
5348                                             || EQ(charset,
5349                                                   CODING_SYSTEM_ISO2022_INITIAL_CHARSET
5350                                                   (codesys, i))) {
5351                                                 reg = i;
5352                                                 break;
5353                                         }
5354                                 }
5355
5356                                 if (reg == -1) {
5357                                         if (XCHARSET_GRAPHIC(charset) != 0) {
5358                                                 if (!NILP
5359                                                     (str->iso2022.charset[1])
5360                                                     &&
5361                                                     (!CODING_SYSTEM_ISO2022_SEVEN
5362                                                      (codesys)
5363                                                      ||
5364                                                      CODING_SYSTEM_ISO2022_LOCK_SHIFT
5365                                                      (codesys)))
5366                                                         reg = 1;
5367                                                 else if (!NILP
5368                                                          (str->iso2022.
5369                                                           charset[2]))
5370                                                         reg = 2;
5371                                                 else if (!NILP
5372                                                          (str->iso2022.
5373                                                           charset[3]))
5374                                                         reg = 3;
5375                                                 else
5376                                                         reg = 0;
5377                                         } else
5378                                                 reg = 0;
5379                                 }
5380
5381                                 iso2022_designate(charset, reg, str, dst);
5382
5383                                 /* Now invoke that register. */
5384                                 switch (reg) {
5385                                 case 0:
5386                                         ensure_normal_shift(str, dst);
5387                                         half = 0;
5388                                         break;
5389
5390                                 case 1:
5391                                         if (CODING_SYSTEM_ISO2022_SEVEN
5392                                             (codesys)) {
5393                                                 ensure_shift_out(str, dst);
5394                                                 half = 0;
5395                                         } else
5396                                                 half = 1;
5397                                         break;
5398
5399                                 case 2:
5400                                         if (CODING_SYSTEM_ISO2022_SEVEN
5401                                             (str->codesys)) {
5402                                                 Dynarr_add(dst, ISO_CODE_ESC);
5403                                                 Dynarr_add(dst, 'N');
5404                                                 half = 0;
5405                                         } else {
5406                                                 Dynarr_add(dst, ISO_CODE_SS2);
5407                                                 half = 1;
5408                                         }
5409                                         break;
5410
5411                                 case 3:
5412                                         if (CODING_SYSTEM_ISO2022_SEVEN
5413                                             (str->codesys)) {
5414                                                 Dynarr_add(dst, ISO_CODE_ESC);
5415                                                 Dynarr_add(dst, 'O');
5416                                                 half = 0;
5417                                         } else {
5418                                                 Dynarr_add(dst, ISO_CODE_SS3);
5419                                                 half = 1;
5420                                         }
5421                                         break;
5422
5423                                 default:
5424                                         abort();
5425                                 }
5426                         }
5427                         char_boundary = 0;
5428                 } else {        /* Processing Non-ASCII character */
5429                         charmask = (half == 0 ? 0x7F : 0xFF);
5430                         char_boundary = 1;
5431                         if (EQ(charset, Vcharset_control_1)) {
5432                                 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
5433                                     && fit_to_be_escape_quoted(c))
5434                                         Dynarr_add(dst, ISO_CODE_ESC);
5435                                 /* you asked for it ... */
5436                                 Dynarr_add(dst, c - 0x20);
5437                         } else {
5438                                 switch (XCHARSET_REP_BYTES(charset)) {
5439                                 case 2:
5440                                         Dynarr_add(dst, c & charmask);
5441                                         break;
5442                                 case 3:
5443                                         if (XCHARSET_PRIVATE_P(charset)) {
5444                                                 Dynarr_add(dst, c & charmask);
5445                                                 ch = 0;
5446                                         } else if (ch) {
5447 #ifdef ENABLE_COMPOSITE_CHARS
5448                                                 if (EQ
5449                                                     (charset,
5450                                                      Vcharset_composite)) {
5451                                                         if (in_composite) {
5452                                                                 /* #### Bother! We don't know how to
5453                                                                    handle this yet. */
5454                                                                 Dynarr_add(dst,
5455                                                                            '~');
5456                                                         } else {
5457                                                                 Emchar emch =
5458                                                                     MAKE_CHAR
5459                                                                     (Vcharset_composite,
5460                                                                      ch & 0x7F,
5461                                                                      c & 0x7F);
5462                                                                 Lisp_Object lstr
5463                                                                     =
5464                                                                     composite_char_string
5465                                                                     (emch);
5466                                                                 saved_n = n;
5467                                                                 saved_src = src;
5468                                                                 in_composite =
5469                                                                     1;
5470                                                                 src =
5471                                                                     XSTRING_DATA
5472                                                                     (lstr);
5473                                                                 n = XSTRING_LENGTH(lstr);
5474                                                                 Dynarr_add(dst,
5475                                                                            ISO_CODE_ESC);
5476                                                                 Dynarr_add(dst, '0');   /* start composing */
5477                                                         }
5478                                                 } else
5479 #endif                          /* ENABLE_COMPOSITE_CHARS */
5480                                                 {
5481                                                         Dynarr_add(dst,
5482                                                                    ch &
5483                                                                    charmask);
5484                                                         Dynarr_add(dst,
5485                                                                    c &
5486                                                                    charmask);
5487                                                 }
5488                                                 ch = 0;
5489                                         } else {
5490                                                 ch = c;
5491                                                 char_boundary = 0;
5492                                         }
5493                                         break;
5494                                 case 4:
5495                                         if (ch) {
5496                                                 Dynarr_add(dst, ch & charmask);
5497                                                 Dynarr_add(dst, c & charmask);
5498                                                 ch = 0;
5499                                         } else {
5500                                                 ch = c;
5501                                                 char_boundary = 0;
5502                                         }
5503                                         break;
5504                                 default:
5505                                         abort();
5506                                 }
5507                         }
5508                 }
5509         }
5510
5511 #ifdef ENABLE_COMPOSITE_CHARS
5512         if (in_composite) {
5513                 n = saved_n;
5514                 src = saved_src;
5515                 in_composite = 0;
5516                 Dynarr_add(dst, ISO_CODE_ESC);
5517                 Dynarr_add(dst, '1');   /* end composing */
5518                 goto back_to_square_n;  /* Wheeeeeeeee ..... */
5519         }
5520 #endif                          /* ENABLE_COMPOSITE_CHARS */
5521
5522         if (char_boundary && flags & CODING_STATE_END) {
5523                 restore_left_to_right_direction(codesys, dst, &flags, 0);
5524                 ensure_normal_shift(str, dst);
5525                 for (i = 0; i < 4; i++) {
5526                         Lisp_Object initial_charset =
5527                             CODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, i);
5528                         iso2022_designate(initial_charset, i, str, dst);
5529                 }
5530         }
5531
5532         str->flags = flags;
5533         str->ch = ch;
5534         str->iso2022.current_char_boundary = char_boundary;
5535         str->iso2022.current_charset = charset;
5536         str->iso2022.current_half = half;
5537
5538         /* Verbum caro factum est! */
5539 }
5540 #endif                          /* MULE */
5541 \f
5542 /************************************************************************/
5543 /*                     No-conversion methods                            */
5544 /************************************************************************/
5545
5546 /* This is used when reading in "binary" files -- i.e. files that may
5547    contain all 256 possible byte values and that are not to be
5548    interpreted as being in any particular decoding. */
5549 static void
5550 decode_coding_no_conversion(lstream_t decoding, const Extbyte * src,
5551                             unsigned_char_dynarr * dst, Lstream_data_count n)
5552 {
5553         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
5554         unsigned int flags = str->flags;
5555         unsigned int ch = str->ch;
5556         eol_type_t eol_type = str->eol_type;
5557
5558         while (n--) {
5559                 const unsigned char c = *(const unsigned char *)src++;
5560
5561                 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5562                 DECODE_ADD_BINARY_CHAR(c, dst);
5563         label_continue_loop:;
5564         }
5565
5566         DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
5567
5568         str->flags = flags;
5569         str->ch = ch;
5570 }
5571
5572 static void
5573 encode_coding_no_conversion(lstream_t encoding, const Bufbyte * src,
5574                             unsigned_char_dynarr * dst, Lstream_data_count n)
5575 {
5576         unsigned char c;
5577         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
5578         unsigned int flags = str->flags;
5579         unsigned int ch = str->ch;
5580         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
5581
5582         while (n--) {
5583                 c = *src++;
5584                 if (c == '\n') {
5585                         if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5586                                 Dynarr_add(dst, '\r');
5587                         if (eol_type != EOL_CR)
5588                                 Dynarr_add(dst, '\n');
5589                         ch = 0;
5590                 } else if (BYTE_ASCII_P(c)) {
5591                         assert(ch == 0);
5592                         Dynarr_add(dst, c);
5593                 } else if (BUFBYTE_LEADING_BYTE_P(c)) {
5594                         assert(ch == 0);
5595                         if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5596                             c == LEADING_BYTE_CONTROL_1)
5597                                 ch = c;
5598                         else
5599                                 Dynarr_add(dst, '~');   /* untranslatable character */
5600                 } else {
5601                         if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5602                                 Dynarr_add(dst, c);
5603                         else if (ch == LEADING_BYTE_CONTROL_1) {
5604                                 assert(c < 0xC0);
5605                                 Dynarr_add(dst, c - 0x20);
5606                         }
5607                         /* else it should be the second or third byte of an
5608                            untranslatable character, so ignore it */
5609                         ch = 0;
5610                 }
5611         }
5612
5613         str->flags = flags;
5614         str->ch = ch;
5615 }
5616 \f
5617 /************************************************************************/
5618 /*                             Initialization                           */
5619 /************************************************************************/
5620
5621 void syms_of_file_coding(void)
5622 {
5623         INIT_LRECORD_IMPLEMENTATION(coding_system);
5624
5625         DEFERROR_STANDARD(Qcoding_system_error, Qio_error);
5626
5627         DEFSUBR(Fcoding_system_p);
5628         DEFSUBR(Ffind_coding_system);
5629         DEFSUBR(Fget_coding_system);
5630         DEFSUBR(Fcoding_system_list);
5631         DEFSUBR(Fcoding_system_name);
5632         DEFSUBR(Fmake_coding_system);
5633         DEFSUBR(Fcopy_coding_system);
5634         DEFSUBR(Fcoding_system_canonical_name_p);
5635         DEFSUBR(Fcoding_system_alias_p);
5636         DEFSUBR(Fcoding_system_aliasee);
5637         DEFSUBR(Fdefine_coding_system_alias);
5638         DEFSUBR(Fsubsidiary_coding_system);
5639
5640         DEFSUBR(Fcoding_system_type);
5641         DEFSUBR(Fcoding_system_doc_string);
5642 #ifdef MULE
5643         DEFSUBR(Fcoding_system_charset);
5644 #endif
5645         DEFSUBR(Fcoding_system_property);
5646
5647         DEFSUBR(Fcoding_category_list);
5648         DEFSUBR(Fset_coding_priority_list);
5649         DEFSUBR(Fcoding_priority_list);
5650         DEFSUBR(Fset_coding_category_system);
5651         DEFSUBR(Fcoding_category_system);
5652
5653         DEFSUBR(Fdetect_coding_region);
5654         DEFSUBR(Fdecode_coding_region);
5655         DEFSUBR(Fencode_coding_region);
5656 #ifdef MULE
5657         DEFSUBR(Fdecode_shift_jis_char);
5658         DEFSUBR(Fencode_shift_jis_char);
5659         DEFSUBR(Fdecode_big5_char);
5660         DEFSUBR(Fencode_big5_char);
5661         DEFSUBR(Fset_ucs_char);
5662         DEFSUBR(Fucs_char);
5663         DEFSUBR(Fset_char_ucs);
5664         DEFSUBR(Fchar_ucs);
5665 #endif                          /* MULE */
5666         defsymbol(&Qcoding_systemp, "coding-system-p");
5667         defsymbol(&Qno_conversion, "no-conversion");
5668         defsymbol(&Qraw_text, "raw-text");
5669 #ifdef MULE
5670         defsymbol(&Qbig5, "big5");
5671         defsymbol(&Qshift_jis, "shift-jis");
5672         defsymbol(&Qucs4, "ucs-4");
5673         defsymbol(&Qutf8, "utf-8");
5674         defsymbol(&Qccl, "ccl");
5675         defsymbol(&Qiso2022, "iso2022");
5676 #endif                          /* MULE */
5677         defsymbol(&Qmnemonic, "mnemonic");
5678         defsymbol(&Qeol_type, "eol-type");
5679         defsymbol(&Qpost_read_conversion, "post-read-conversion");
5680         defsymbol(&Qpre_write_conversion, "pre-write-conversion");
5681
5682         defsymbol(&Qcr, "cr");
5683         defsymbol(&Qlf, "lf");
5684         defsymbol(&Qcrlf, "crlf");
5685         defsymbol(&Qeol_cr, "eol-cr");
5686         defsymbol(&Qeol_lf, "eol-lf");
5687         defsymbol(&Qeol_crlf, "eol-crlf");
5688 #ifdef MULE
5689         defsymbol(&Qcharset_g0, "charset-g0");
5690         defsymbol(&Qcharset_g1, "charset-g1");
5691         defsymbol(&Qcharset_g2, "charset-g2");
5692         defsymbol(&Qcharset_g3, "charset-g3");
5693         defsymbol(&Qforce_g0_on_output, "force-g0-on-output");
5694         defsymbol(&Qforce_g1_on_output, "force-g1-on-output");
5695         defsymbol(&Qforce_g2_on_output, "force-g2-on-output");
5696         defsymbol(&Qforce_g3_on_output, "force-g3-on-output");
5697         defsymbol(&Qno_iso6429, "no-iso6429");
5698         defsymbol(&Qinput_charset_conversion, "input-charset-conversion");
5699         defsymbol(&Qoutput_charset_conversion, "output-charset-conversion");
5700
5701         defsymbol(&Qshort, "short");
5702         defsymbol(&Qno_ascii_eol, "no-ascii-eol");
5703         defsymbol(&Qno_ascii_cntl, "no-ascii-cntl");
5704         defsymbol(&Qseven, "seven");
5705         defsymbol(&Qlock_shift, "lock-shift");
5706         defsymbol(&Qescape_quoted, "escape-quoted");
5707 #endif                          /* MULE */
5708         defsymbol(&Qencode, "encode");
5709         defsymbol(&Qdecode, "decode");
5710
5711 #ifdef MULE
5712         defsymbol(&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5713                   "shift-jis");
5714         defsymbol(&coding_category_symbol[CODING_CATEGORY_BIG5], "big5");
5715         defsymbol(&coding_category_symbol[CODING_CATEGORY_UCS4], "ucs-4");
5716         defsymbol(&coding_category_symbol[CODING_CATEGORY_UTF8], "utf-8");
5717         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_7], "iso-7");
5718         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5719                   "iso-8-designate");
5720         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_1], "iso-8-1");
5721         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_2], "iso-8-2");
5722         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5723                   "iso-lock-shift");
5724 #endif                          /* MULE */
5725         defsymbol(&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5726                   "no-conversion");
5727 }
5728
5729 void lstream_type_create_file_coding(void)
5730 {
5731         LSTREAM_HAS_METHOD(decoding, reader);
5732         LSTREAM_HAS_METHOD(decoding, writer);
5733         LSTREAM_HAS_METHOD(decoding, rewinder);
5734         LSTREAM_HAS_METHOD(decoding, seekable_p);
5735         LSTREAM_HAS_METHOD(decoding, flusher);
5736         LSTREAM_HAS_METHOD(decoding, closer);
5737         LSTREAM_HAS_METHOD(decoding, marker);
5738
5739         LSTREAM_HAS_METHOD(encoding, reader);
5740         LSTREAM_HAS_METHOD(encoding, writer);
5741         LSTREAM_HAS_METHOD(encoding, rewinder);
5742         LSTREAM_HAS_METHOD(encoding, seekable_p);
5743         LSTREAM_HAS_METHOD(encoding, flusher);
5744         LSTREAM_HAS_METHOD(encoding, closer);
5745         LSTREAM_HAS_METHOD(encoding, marker);
5746 }
5747
5748 void vars_of_file_coding(void)
5749 {
5750         int i;
5751
5752         fcd = xnew(struct file_coding_dump);
5753         dump_add_root_struct_ptr(&fcd, &fcd_description);
5754
5755         /* Initialize to something reasonable ... */
5756         for (i = 0; i < CODING_CATEGORY_LAST; i++) {
5757                 fcd->coding_category_system[i] = Qnil;
5758                 fcd->coding_category_by_priority[i] = i;
5759         }
5760
5761         Fprovide(intern("file-coding"));
5762
5763         DEFVAR_LISP("keyboard-coding-system", &Vkeyboard_coding_system  /*
5764 Coding system used for TTY keyboard input.
5765 Not used under a windowing system.
5766                                                                          */ );
5767         Vkeyboard_coding_system = Qnil;
5768
5769         DEFVAR_LISP("terminal-coding-system", &Vterminal_coding_system  /*
5770 Coding system used for TTY display output.
5771 Not used under a windowing system.
5772                                                                          */ );
5773         Vterminal_coding_system = Qnil;
5774
5775         DEFVAR_LISP("coding-system-for-read", &Vcoding_system_for_read  /*
5776 Overriding coding system used when reading from a file or process.
5777 You should bind this variable with `let', but do not set it globally.
5778 If this is non-nil, it specifies the coding system that will be used
5779 to decode input on read operations, such as from a file or process.
5780 It overrides `buffer-file-coding-system-for-read',
5781 `insert-file-contents-pre-hook', etc.  Use those variables instead of
5782 this one for permanent changes to the environment.  */ );
5783         Vcoding_system_for_read = Qnil;
5784
5785         DEFVAR_LISP("coding-system-for-write", &Vcoding_system_for_write        /*
5786 Overriding coding system used when writing to a file or process.
5787 You should bind this variable with `let', but do not set it globally.
5788 If this is non-nil, it specifies the coding system that will be used
5789 to encode output for write operations, such as to a file or process.
5790 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
5791 Use those variables instead of this one for permanent changes to the
5792 environment.  */ );
5793         Vcoding_system_for_write = Qnil;
5794
5795         DEFVAR_LISP("file-name-coding-system", &Vfile_name_coding_system        /*
5796 Coding system used to convert pathnames when accessing files.
5797                                                                                  */ );
5798         Vfile_name_coding_system = Qnil;
5799
5800         DEFVAR_BOOL("enable-multibyte-characters", &enable_multibyte_characters /*
5801 Non-nil means the buffer contents are regarded as multi-byte form
5802 of characters, not a binary code.  This affects the display, file I/O,
5803 and behaviors of various editing commands.
5804
5805 Setting this to nil does not do anything.
5806                                                                                  */ );
5807         enable_multibyte_characters = 1;
5808 }
5809
5810 void complex_vars_of_file_coding(void)
5811 {
5812         staticpro(&Vcoding_system_hash_table);
5813         Vcoding_system_hash_table =
5814             make_lisp_hash_table(50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5815
5816         the_codesys_prop_dynarr = Dynarr_new(codesys_prop);
5817         dump_add_root_struct_ptr(&the_codesys_prop_dynarr,
5818                                  &codesys_prop_dynarr_description);
5819
5820 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
5821 {                                               \
5822   struct codesys_prop csp;                      \
5823   csp.sym = (Sym);                              \
5824   csp.prop_type = (Prop_Type);                  \
5825   Dynarr_add (the_codesys_prop_dynarr, csp);    \
5826 } while (0)
5827
5828         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qmnemonic);
5829         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_type);
5830         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_cr);
5831         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_crlf);
5832         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_lf);
5833         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5834         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5835 #ifdef MULE
5836         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g0);
5837         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g1);
5838         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g2);
5839         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g3);
5840         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5841         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5842         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5843         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5844         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qshort);
5845         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_ascii_eol);
5846         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5847         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qseven);
5848         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qlock_shift);
5849         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_iso6429);
5850         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qescape_quoted);
5851         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5852         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5853
5854         DEFINE_CODESYS_PROP(CODESYS_PROP_CCL, Qencode);
5855         DEFINE_CODESYS_PROP(CODESYS_PROP_CCL, Qdecode);
5856 #endif                          /* MULE */
5857         /* Need to create this here or we're really screwed. */
5858         Fmake_coding_system
5859             (Qraw_text, Qno_conversion,
5860              build_string
5861              ("Raw text, which means it converts only line-break-codes."),
5862              list2(Qmnemonic, build_string("Raw")));
5863
5864         Fmake_coding_system
5865             (Qbinary, Qno_conversion,
5866              build_string("Binary, which means it does not convert anything."),
5867              list4(Qeol_type, Qlf, Qmnemonic, build_string("Binary")));
5868
5869         Fdefine_coding_system_alias(Qno_conversion, Qraw_text);
5870
5871         Fdefine_coding_system_alias(Qfile_name, Qbinary);
5872
5873         Fdefine_coding_system_alias(Qterminal, Qbinary);
5874         Fdefine_coding_system_alias(Qkeyboard, Qbinary);
5875
5876         /* Need this for bootstrapping */
5877         fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5878             Fget_coding_system(Qraw_text);
5879
5880 #ifdef MULE
5881         {
5882                 size_t i;
5883
5884                 for (i = 0; i < countof(fcd->ucs_to_mule_table); i++)
5885                         fcd->ucs_to_mule_table[i] = Qnil;
5886         }
5887         staticpro(&mule_to_ucs_table);
5888         mule_to_ucs_table = Fmake_char_table(Qgeneric);
5889 #endif                          /* MULE */
5890 }