Shut set but unused warnings.
[sxemacs] / src / mule / file-coding.c
1 /* Code conversion functions.
2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3    Copyright (C) 1995 Sun Microsystems, Inc.
4
5 This file is part of SXEmacs
6
7 SXEmacs is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
11
12 SXEmacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program.  If not, see <http://www.gnu.org/licenses/>. */
19
20
21 /* Synched up with: Mule 2.3.   Not in FSF. */
22
23 /* Rewritten by Ben Wing <ben@xemacs.org>. */
24
25 #include <config.h>
26 #include "lisp.h"
27
28 #include "buffer.h"
29 #include "elhash.h"
30 #include "ui/insdel.h"
31 #include "lstream.h"
32 #include "opaque.h"
33 #ifdef MULE
34 #include "mule-ccl.h"
35 #include "chartab.h"
36 #endif
37 #include "file-coding.h"
38
39 Lisp_Object Qcoding_system_error;
40
41 Lisp_Object Vkeyboard_coding_system;
42 Lisp_Object Vterminal_coding_system;
43 Lisp_Object Vcoding_system_for_read;
44 Lisp_Object Vcoding_system_for_write;
45 Lisp_Object Vfile_name_coding_system;
46
47 /* Table of symbols identifying each coding category. */
48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
49
50 struct file_coding_dump {
51         /* Coding system currently associated with each coding category. */
52         Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
53
54         /* Table of all coding categories in decreasing order of priority.
55            This describes a permutation of the possible coding categories. */
56         int coding_category_by_priority[CODING_CATEGORY_LAST];
57
58 #ifdef MULE
59         Lisp_Object ucs_to_mule_table[65536];
60 #endif
61 } *fcd;
62
63 static const struct lrecord_description fcd_description_1[] = {
64         {XD_LISP_OBJECT_ARRAY,
65          offsetof(struct file_coding_dump, coding_category_system),
66          CODING_CATEGORY_LAST},
67 #ifdef MULE
68         {XD_LISP_OBJECT_ARRAY,
69          offsetof(struct file_coding_dump, ucs_to_mule_table),
70          countof(fcd->ucs_to_mule_table)},
71 #endif
72         {XD_END}
73 };
74
75 static const struct struct_description fcd_description = {
76         sizeof(struct file_coding_dump),
77         fcd_description_1
78 };
79
80 Lisp_Object mule_to_ucs_table;
81
82 Lisp_Object Qcoding_systemp;
83
84 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
85 /* Qinternal in general.c */
86
87 Lisp_Object Qmnemonic, Qeol_type;
88 Lisp_Object Qcr, Qcrlf, Qlf;
89 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
90 Lisp_Object Qpost_read_conversion;
91 Lisp_Object Qpre_write_conversion;
92
93 #ifdef MULE
94 Lisp_Object Qucs4, Qutf8;
95 Lisp_Object Qbig5, Qshift_jis;
96 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
97 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
98 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
99 Lisp_Object Qno_iso6429;
100 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
101 Lisp_Object Qescape_quoted;
102 Lisp_Object Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
103 #endif
104 Lisp_Object Qencode, Qdecode;
105
106 Lisp_Object Vcoding_system_hash_table;
107
108 int enable_multibyte_characters;
109
110 #ifdef MULE
111 /* Additional information used by the ISO2022 decoder and detector. */
112 struct iso2022_decoder {
113         /* CHARSET holds the character sets currently assigned to the G0
114            through G3 variables.  It is initialized from the array
115            INITIAL_CHARSET in CODESYS. */
116         Lisp_Object charset[4];
117
118         /* Which registers are currently invoked into the left (GL) and
119            right (GR) halves of the 8-bit encoding space? */
120         int register_left, register_right;
121
122         /* ISO_ESC holds a value indicating part of an escape sequence
123            that has already been seen. */
124         enum iso_esc_flag esc;
125
126         /* This records the bytes we've seen so far in an escape sequence,
127            in case the sequence is invalid (we spit out the bytes unchanged). */
128         unsigned char esc_bytes[8];
129
130         /* Index for next byte to store in ISO escape sequence. */
131         int esc_bytes_index;
132
133 #ifdef ENABLE_COMPOSITE_CHARS
134         /* Stuff seen so far when composing a string. */
135         unsigned_char_dynarr *composite_chars;
136 #endif
137
138         /* If we saw an invalid designation sequence for a particular
139            register, we flag it here and switch to ASCII.  The next time we
140            see a valid designation for this register, we turn off the flag
141            and do the designation normally, but pretend the sequence was
142            invalid.  The effect of all this is that (most of the time) the
143            escape sequences for both the switch to the unknown charset, and
144            the switch back to the known charset, get inserted literally into
145            the buffer and saved out as such.  The hope is that we can
146            preserve the escape sequences so that the resulting written out
147            file makes sense.  If we don't do any of this, the designation
148            to the invalid charset will be preserved but that switch back
149            to the known charset will probably get eaten because it was
150            the same charset that was already present in the register. */
151         unsigned char invalid_designated[4];
152
153         /* We try to do similar things as above for direction-switching
154            sequences.  If we encountered a direction switch while an
155            invalid designation was present, or an invalid designation
156            just after a direction switch (i.e. no valid designation
157            encountered yet), we insert the direction-switch escape
158            sequence literally into the output stream, and later on
159            insert the corresponding direction-restoring escape sequence
160            literally also. */
161         unsigned int switched_dir_and_no_valid_charset_yet:1;
162         unsigned int invalid_switch_dir:1;
163
164         /* Tells the decoder to output the escape sequence literally
165            even though it was valid.  Used in the games we play to
166            avoid lossage when we encounter invalid designations. */
167         unsigned int output_literally:1;
168         /* We encountered a direction switch followed by an invalid
169            designation.  We didn't output the direction switch
170            literally because we didn't know about the invalid designation;
171            but we have to do so now. */
172         unsigned int output_direction_sequence:1;
173 };
174 #endif                          /* MULE */
175 EXFUN(Fcopy_coding_system, 2);
176 #ifdef MULE
177 struct detection_state;
178 static int detect_coding_sjis(struct detection_state *st,
179                               const Extbyte * src, Lstream_data_count n);
180 static void decode_coding_sjis(lstream_t decoding, const Extbyte * src,
181                                unsigned_char_dynarr * dst,
182                                Lstream_data_count n);
183 static void encode_coding_sjis(lstream_t encoding, const Bufbyte * src,
184                                unsigned_char_dynarr * dst,
185                                Lstream_data_count n);
186 static int detect_coding_big5(struct detection_state *st, const Extbyte * src,
187                               Lstream_data_count n);
188 static void decode_coding_big5(lstream_t decoding, const Extbyte * src,
189                                unsigned_char_dynarr * dst,
190                                Lstream_data_count n);
191 static void encode_coding_big5(lstream_t encoding, const Bufbyte * src,
192                                unsigned_char_dynarr * dst,
193                                Lstream_data_count n);
194 static int detect_coding_ucs4(struct detection_state *st, const Extbyte * src,
195                               Lstream_data_count n);
196 static void decode_coding_ucs4(lstream_t decoding, const Extbyte * src,
197                                unsigned_char_dynarr * dst,
198                                Lstream_data_count n);
199 static void encode_coding_ucs4(lstream_t encoding, const Bufbyte * src,
200                                unsigned_char_dynarr * dst,
201                                Lstream_data_count n);
202 static int detect_coding_utf8(struct detection_state *st, const Extbyte * src,
203                               Lstream_data_count n);
204 static void decode_coding_utf8(lstream_t decoding, const Extbyte * src,
205                                unsigned_char_dynarr * dst,
206                                Lstream_data_count n);
207 static void encode_coding_utf8(lstream_t encoding, const Bufbyte * src,
208                                unsigned_char_dynarr * dst,
209                                Lstream_data_count n);
210 static int postprocess_iso2022_mask(int mask);
211 static void reset_iso2022(Lisp_Object coding_system,
212                           struct iso2022_decoder *iso);
213 static int detect_coding_iso2022(struct detection_state *st,
214                                  const Extbyte * src, Lstream_data_count n);
215 static void decode_coding_iso2022(lstream_t decoding, const Extbyte * src,
216                                   unsigned_char_dynarr * dst,
217                                   Lstream_data_count n);
218 static void encode_coding_iso2022(lstream_t encoding, const Bufbyte * src,
219                                   unsigned_char_dynarr * dst,
220                                   Lstream_data_count n);
221 #endif                          /* MULE */
222 static void decode_coding_no_conversion(lstream_t decoding, const Extbyte * src,
223                                         unsigned_char_dynarr * dst,
224                                         Lstream_data_count n);
225 static void encode_coding_no_conversion(lstream_t encoding, const Bufbyte * src,
226                                         unsigned_char_dynarr * dst,
227                                         Lstream_data_count n);
228 static void mule_decode(lstream_t decoding, const Extbyte * src,
229                         unsigned_char_dynarr * dst, Lstream_data_count n);
230 static void mule_encode(lstream_t encoding, const Bufbyte * src,
231                         unsigned_char_dynarr * dst, Lstream_data_count n);
232
233 typedef struct codesys_prop codesys_prop;
234 struct codesys_prop {
235         Lisp_Object sym;
236         int prop_type;
237 };
238
239 typedef struct {
240         Dynarr_declare(codesys_prop);
241 } codesys_prop_dynarr;
242
243 static const struct lrecord_description codesys_prop_description_1[] = {
244         {XD_LISP_OBJECT, offsetof(codesys_prop, sym)},
245         {XD_END}
246 };
247
248 static const struct struct_description codesys_prop_description = {
249         sizeof(codesys_prop),
250         codesys_prop_description_1
251 };
252
253 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
254         XD_DYNARR_DESC(codesys_prop_dynarr, &codesys_prop_description),
255         {XD_END}
256 };
257
258 static const struct struct_description codesys_prop_dynarr_description = {
259         sizeof(codesys_prop_dynarr),
260         codesys_prop_dynarr_description_1
261 };
262
263 codesys_prop_dynarr *the_codesys_prop_dynarr;
264
265 enum codesys_prop_enum {
266         CODESYS_PROP_ALL_OK,
267         CODESYS_PROP_ISO2022,
268         CODESYS_PROP_CCL
269 };
270 \f
271 /************************************************************************/
272 /*                       Coding system functions                        */
273 /************************************************************************/
274
275 static Lisp_Object mark_coding_system(Lisp_Object);
276 static void print_coding_system(Lisp_Object, Lisp_Object, int);
277 static void finalize_coding_system(void *header, int for_disksave);
278
279 #ifdef MULE
280 static const struct lrecord_description ccs_description_1[] = {
281         {XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset)},
282         {XD_LISP_OBJECT, offsetof(charset_conversion_spec, to_charset)},
283         {XD_END}
284 };
285
286 static const struct struct_description ccs_description = {
287         sizeof(charset_conversion_spec),
288         ccs_description_1
289 };
290
291 static const struct lrecord_description ccsd_description_1[] = {
292         XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
293         {XD_END}
294 };
295
296 static const struct struct_description ccsd_description = {
297         sizeof(charset_conversion_spec_dynarr),
298         ccsd_description_1
299 };
300 #endif
301
302 static const struct lrecord_description coding_system_description[] = {
303         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, name)},
304         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, doc_string)},
305         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, mnemonic)},
306         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, post_read_conversion)},
307         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, pre_write_conversion)},
308         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_lf)},
309         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_crlf)},
310         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_cr)},
311 #ifdef MULE
312         {XD_LISP_OBJECT_ARRAY,
313          offsetof(Lisp_Coding_System, iso2022.initial_charset), 4},
314         {XD_STRUCT_PTR, offsetof(Lisp_Coding_System, iso2022.input_conv), 1,
315          &ccsd_description},
316         {XD_STRUCT_PTR, offsetof(Lisp_Coding_System, iso2022.output_conv), 1,
317          &ccsd_description},
318         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, ccl.decode)},
319         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, ccl.encode)},
320 #endif
321         {XD_END}
322 };
323
324 DEFINE_LRECORD_IMPLEMENTATION("coding-system", coding_system,
325                               mark_coding_system, print_coding_system,
326                               finalize_coding_system,
327                               0, 0, coding_system_description,
328                               Lisp_Coding_System);
329
330 static Lisp_Object mark_coding_system(Lisp_Object obj)
331 {
332         Lisp_Coding_System *codesys = XCODING_SYSTEM(obj);
333
334         mark_object(CODING_SYSTEM_NAME(codesys));
335         mark_object(CODING_SYSTEM_DOC_STRING(codesys));
336         mark_object(CODING_SYSTEM_MNEMONIC(codesys));
337         mark_object(CODING_SYSTEM_EOL_LF(codesys));
338         mark_object(CODING_SYSTEM_EOL_CRLF(codesys));
339         mark_object(CODING_SYSTEM_EOL_CR(codesys));
340
341         switch (CODING_SYSTEM_TYPE(codesys)) {
342 #ifdef MULE
343                 int i;
344         case CODESYS_ISO2022:
345                 for (i = 0; i < 4; i++)
346                         mark_object(CODING_SYSTEM_ISO2022_INITIAL_CHARSET
347                                     (codesys, i));
348                 if (codesys->iso2022.input_conv) {
349                         for (i = 0;
350                              i < Dynarr_length(codesys->iso2022.input_conv);
351                              i++) {
352                                 struct charset_conversion_spec *ccs =
353                                     Dynarr_atp(codesys->iso2022.input_conv, i);
354                                 mark_object(ccs->from_charset);
355                                 mark_object(ccs->to_charset);
356                         }
357                 }
358                 if (codesys->iso2022.output_conv) {
359                         for (i = 0;
360                              i < Dynarr_length(codesys->iso2022.output_conv);
361                              i++) {
362                                 struct charset_conversion_spec *ccs =
363                                     Dynarr_atp(codesys->iso2022.output_conv, i);
364                                 mark_object(ccs->from_charset);
365                                 mark_object(ccs->to_charset);
366                         }
367                 }
368                 break;
369
370         case CODESYS_CCL:
371                 mark_object(CODING_SYSTEM_CCL_DECODE(codesys));
372                 mark_object(CODING_SYSTEM_CCL_ENCODE(codesys));
373                 break;
374
375                 /* list the rest of them lot explicitly */
376         case CODESYS_AUTODETECT:
377         case CODESYS_SHIFT_JIS:
378         case CODESYS_BIG5:
379         case CODESYS_UCS4:
380         case CODESYS_UTF8:
381         case CODESYS_NO_CONVERSION:
382 #ifdef DEBUG_SXEMACS
383         case CODESYS_INTERNAL:
384 #endif
385 #endif                          /* MULE */
386         default:
387                 break;
388         }
389
390         mark_object(CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys));
391         return CODING_SYSTEM_POST_READ_CONVERSION(codesys);
392 }
393
394 static void
395 print_coding_system(Lisp_Object obj, Lisp_Object printcharfun, int escapeflag)
396 {
397         Lisp_Coding_System *c = XCODING_SYSTEM(obj);
398         if (print_readably)
399                 error("printing unreadable object #<coding-system 0x%x>",
400                       c->header.uid);
401
402         write_c_string("#<coding-system ", printcharfun);
403         print_internal(c->name, printcharfun, 1);
404         write_c_string(">", printcharfun);
405 }
406
407 static void finalize_coding_system(void *header, int for_disksave)
408 {
409         Lisp_Coding_System *c = (Lisp_Coding_System *) header;
410         /* Since coding systems never go away, this function is not
411            necessary.  But it would be necessary if we changed things
412            so that coding systems could go away. */
413         if (!for_disksave) {    /* see comment in lstream.c */
414                 switch (CODING_SYSTEM_TYPE(c)) {
415 #ifdef MULE
416                 case CODESYS_ISO2022:
417                         if (c->iso2022.input_conv) {
418                                 Dynarr_free(c->iso2022.input_conv);
419                                 c->iso2022.input_conv = 0;
420                         }
421                         if (c->iso2022.output_conv) {
422                                 Dynarr_free(c->iso2022.output_conv);
423                                 c->iso2022.output_conv = 0;
424                         }
425                         break;
426
427                         /* list the rest of them lot explicitly */
428                 case CODESYS_AUTODETECT:
429                 case CODESYS_SHIFT_JIS:
430                 case CODESYS_BIG5:
431                 case CODESYS_UCS4:
432                 case CODESYS_UTF8:
433                 case CODESYS_CCL:
434                 case CODESYS_NO_CONVERSION:
435 #ifdef DEBUG_SXEMACS
436                 case CODESYS_INTERNAL:
437 #endif
438 #endif                          /* MULE */
439                 default:
440                         break;
441                 }
442         }
443 }
444
445 static eol_type_t symbol_to_eol_type(Lisp_Object symbol)
446 {
447         CHECK_SYMBOL(symbol);
448         if (NILP(symbol))
449                 return EOL_AUTODETECT;
450         if (EQ(symbol, Qlf))
451                 return EOL_LF;
452         if (EQ(symbol, Qcrlf))
453                 return EOL_CRLF;
454         if (EQ(symbol, Qcr))
455                 return EOL_CR;
456
457         signal_simple_error("Unrecognized eol type", symbol);
458         return EOL_AUTODETECT;  /* not reached */
459 }
460
461 static Lisp_Object eol_type_to_symbol(eol_type_t type)
462 {
463         switch (type) {
464         default:
465                 abort();
466                 return Qnil;
467         case EOL_LF:
468                 return Qlf;
469         case EOL_CRLF:
470                 return Qcrlf;
471         case EOL_CR:
472                 return Qcr;
473         case EOL_AUTODETECT:
474                 return Qnil;
475         }
476 }
477
478 static void setup_eol_coding_systems(Lisp_Coding_System * codesys)
479 {
480         Lisp_Object codesys_obj;
481         int len = string_length(XSYMBOL(CODING_SYSTEM_NAME(codesys))->name);
482         char *codesys_name = (char *)alloca(len + 7);
483         int mlen = -1;
484         char *codesys_mnemonic = 0;
485
486         Lisp_Object codesys_name_sym, sub_codesys_obj;
487
488         /* kludge */
489
490         XSETCODING_SYSTEM(codesys_obj, codesys);
491
492         memcpy(codesys_name,
493                string_data(XSYMBOL(CODING_SYSTEM_NAME(codesys))->name), len);
494
495         if (STRINGP(CODING_SYSTEM_MNEMONIC(codesys))) {
496                 mlen = XSTRING_LENGTH(CODING_SYSTEM_MNEMONIC(codesys));
497                 codesys_mnemonic = (char *)alloca(mlen + 7);
498                 memcpy(codesys_mnemonic,
499                        XSTRING_DATA(CODING_SYSTEM_MNEMONIC(codesys)), mlen);
500         }
501 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
502   strcpy (codesys_name + len, "-" op_sys);                                      \
503   if (mlen != -1)                                                               \
504     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
505   codesys_name_sym = intern (codesys_name);                                     \
506   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
507   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
508   if (mlen != -1)                                                               \
509     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
510       build_string (codesys_mnemonic);                                          \
511   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
512 } while (0)
513
514         DEFINE_SUB_CODESYS("unix", "", EOL_LF);
515         DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
516         DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
517 }
518
519 DEFUN("coding-system-p", Fcoding_system_p, 1, 1, 0,     /*
520 Return t if OBJECT is a coding system.
521 A coding system is an object that defines how text containing multiple
522 character sets is encoded into a stream of (typically 8-bit) bytes.
523 The coding system is used to decode the stream into a series of
524 characters (which may be from multiple charsets) when the text is read
525 from a file or process, and is used to encode the text back into the
526 same format when it is written out to a file or process.
527
528 For example, many ISO2022-compliant coding systems (such as Compound
529 Text, which is used for inter-client data under the X Window System)
530 use escape sequences to switch between different charsets -- Japanese
531 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
532 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
533 `make-coding-system' for more information.
534
535 Coding systems are normally identified using a symbol, and the
536 symbol is accepted in place of the actual coding system object whenever
537 a coding system is called for. (This is similar to how faces work.)
538 */
539       (object))
540 {
541         return CODING_SYSTEMP(object) ? Qt : Qnil;
542 }
543
544 DEFUN("find-coding-system", Ffind_coding_system, 1, 1, 0,       /*
545 Retrieve the coding system of the given name.
546
547 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
548 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
549 If there is no such coding system, nil is returned.  Otherwise the
550 associated coding system object is returned.
551 */
552       (coding_system_or_name))
553 {
554         if (NILP(coding_system_or_name))
555                 coding_system_or_name = Qbinary;
556         else if (CODING_SYSTEMP(coding_system_or_name))
557                 return coding_system_or_name;
558         else
559                 CHECK_SYMBOL(coding_system_or_name);
560
561         while (1) {
562                 coding_system_or_name =
563                     Fgethash(coding_system_or_name, Vcoding_system_hash_table,
564                              Qnil);
565
566                 if (CODING_SYSTEMP(coding_system_or_name)
567                     || NILP(coding_system_or_name))
568                         return coding_system_or_name;
569         }
570 }
571
572 DEFUN("get-coding-system", Fget_coding_system, 1, 1, 0, /*
573 Retrieve the coding system of the given name.
574 Same as `find-coding-system' except that if there is no such
575 coding system, an error is signaled instead of returning nil.
576 */
577       (name))
578 {
579         Lisp_Object coding_system = Ffind_coding_system(name);
580
581         if (NILP(coding_system))
582                 signal_simple_error("No such coding system", name);
583         return coding_system;
584 }
585
586 /* We store the coding systems in hash tables with the names as the key and the
587    actual coding system object as the value.  Occasionally we need to use them
588    in a list format.  These routines provide us with that. */
589 struct coding_system_list_closure {
590         Lisp_Object *coding_system_list;
591 };
592
593 static int
594 add_coding_system_to_list_mapper(Lisp_Object key, Lisp_Object value,
595                                  void *coding_system_list_closure)
596 {
597         /* This function can GC */
598         struct coding_system_list_closure *cscl =
599             (struct coding_system_list_closure *)coding_system_list_closure;
600         Lisp_Object *coding_system_list = cscl->coding_system_list;
601
602         *coding_system_list = Fcons(key, *coding_system_list);
603         return 0;
604 }
605
606 DEFUN("coding-system-list", Fcoding_system_list, 0, 0, 0,       /*
607 Return a list of the names of all defined coding systems.
608 */
609       ())
610 {
611         Lisp_Object coding_system_list = Qnil;
612         struct gcpro gcpro1;
613         struct coding_system_list_closure coding_system_list_closure;
614
615         GCPRO1(coding_system_list);
616         coding_system_list_closure.coding_system_list = &coding_system_list;
617         elisp_maphash(add_coding_system_to_list_mapper,
618                       Vcoding_system_hash_table, &coding_system_list_closure);
619         UNGCPRO;
620
621         return coding_system_list;
622 }
623
624 DEFUN("coding-system-name", Fcoding_system_name, 1, 1, 0,       /*
625 Return the name of the given coding system.
626 */
627       (coding_system))
628 {
629         coding_system = Fget_coding_system(coding_system);
630         return XCODING_SYSTEM_NAME(coding_system);
631 }
632
633 static Lisp_Coding_System *allocate_coding_system(enum coding_system_type type,
634                                                   Lisp_Object name)
635 {
636         Lisp_Coding_System *codesys =
637             alloc_lcrecord_type(Lisp_Coding_System, &lrecord_coding_system);
638
639         zero_lcrecord(codesys);
640         CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys) = Qnil;
641         CODING_SYSTEM_POST_READ_CONVERSION(codesys) = Qnil;
642         CODING_SYSTEM_EOL_TYPE(codesys) = EOL_AUTODETECT;
643         CODING_SYSTEM_EOL_CRLF(codesys) = Qnil;
644         CODING_SYSTEM_EOL_CR(codesys) = Qnil;
645         CODING_SYSTEM_EOL_LF(codesys) = Qnil;
646         CODING_SYSTEM_TYPE(codesys) = type;
647         CODING_SYSTEM_MNEMONIC(codesys) = Qnil;
648 #ifdef MULE
649         if (type == CODESYS_ISO2022) {
650                 int i;
651                 for (i = 0; i < 4; i++)
652                         CODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, i) =
653                             Qnil;
654         } else if (type == CODESYS_CCL) {
655                 CODING_SYSTEM_CCL_DECODE(codesys) = Qnil;
656                 CODING_SYSTEM_CCL_ENCODE(codesys) = Qnil;
657         }
658 #endif                          /* MULE */
659         CODING_SYSTEM_NAME(codesys) = name;
660
661         return codesys;
662 }
663
664 #ifdef MULE
665 /* Given a list of charset conversion specs as specified in a Lisp
666    program, parse it into STORE_HERE. */
667
668 static void
669 parse_charset_conversion_specs(charset_conversion_spec_dynarr * store_here,
670                                Lisp_Object spec_list)
671 {
672         Lisp_Object rest;
673
674         EXTERNAL_LIST_LOOP(rest, spec_list) {
675                 Lisp_Object car = XCAR(rest);
676                 Lisp_Object from, to;
677                 struct charset_conversion_spec spec;
678
679                 if (!CONSP(car) || !CONSP(XCDR(car)) || !NILP(XCDR(XCDR(car))))
680                         signal_simple_error("Invalid charset conversion spec",
681                                             car);
682                 from = Fget_charset(XCAR(car));
683                 to = Fget_charset(XCAR(XCDR(car)));
684                 if (XCHARSET_TYPE(from) != XCHARSET_TYPE(to))
685                         signal_simple_error_2
686                             ("Attempted conversion between different charset types",
687                              from, to);
688                 spec.from_charset = from;
689                 spec.to_charset = to;
690
691                 Dynarr_add(store_here, spec);
692         }
693 }
694
695 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
696    specs, return the equivalent as the Lisp programmer would see it.
697
698    If LOAD_HERE is 0, return Qnil. */
699
700 static Lisp_Object
701 unparse_charset_conversion_specs(charset_conversion_spec_dynarr * load_here)
702 {
703         int i;
704         Lisp_Object result;
705
706         if (!load_here)
707                 return Qnil;
708         for (i = 0, result = Qnil; i < Dynarr_length(load_here); i++) {
709                 struct charset_conversion_spec *ccs = Dynarr_atp(load_here, i);
710                 result =
711                     Fcons(list2(ccs->from_charset, ccs->to_charset), result);
712         }
713
714         return Fnreverse(result);
715 }
716
717 #endif                          /* MULE */
718
719 DEFUN("make-coding-system", Fmake_coding_system, 2, 4, 0,       /*
720 Register symbol NAME as a coding system.
721
722 TYPE describes the conversion method used and should be one of
723
724 nil or 'undecided
725 Automatic conversion.  SXEmacs attempts to detect the coding system
726 used in the file.
727 'no-conversion
728 No conversion.  Use this for binary files and such.  On output,
729 graphic characters that are not in ASCII or Latin-1 will be
730 replaced by a ?. (For a no-conversion-encoded buffer, these
731 characters will only be present if you explicitly insert them.)
732 'shift-jis
733 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
734 'ucs-4
735 ISO 10646 UCS-4 encoding.
736 'utf-8
737 ISO 10646 UTF-8 encoding.
738 'iso2022
739 Any ISO2022-compliant encoding.  Among other things, this includes
740 JIS (the Japanese encoding commonly used for e-mail), EUC (the
741 standard Unix encoding for Japanese and other languages), and
742 Compound Text (the encoding used in X11).  You can specify more
743 specific information about the conversion with the PROPS argument.
744 'big5
745 Big5 (the encoding commonly used for Taiwanese).
746 'ccl
747 The conversion is performed using a user-written pseudo-code
748 program.  CCL (Code Conversion Language) is the name of this
749 pseudo-code.
750 'internal
751 Write out or read in the raw contents of the memory representing
752 the buffer's text.  This is primarily useful for debugging
753 purposes, and is only enabled when SXEmacs has been compiled with
754 DEBUG_SXEMACS defined (via the --debug configure option).
755 WARNING: Reading in a file using 'internal conversion can result
756 in an internal inconsistency in the memory representing a
757 buffer's text, which will produce unpredictable results and may
758 cause SXEmacs to crash.  Under normal circumstances you should
759 never use 'internal conversion.
760
761 DOC-STRING is a string describing the coding system.
762
763 PROPS is a property list, describing the specific nature of the
764 character set.  Recognized properties are:
765
766 'mnemonic
767 String to be displayed in the modeline when this coding system is
768 active.
769
770 'eol-type
771 End-of-line conversion to be used.  It should be one of
772
773 nil
774 Automatically detect the end-of-line type (LF, CRLF,
775 or CR).  Also generate subsidiary coding systems named
776 `NAME-unix', `NAME-dos', and `NAME-mac', that are
777 identical to this coding system but have an EOL-TYPE
778 value of 'lf, 'crlf, and 'cr, respectively.
779 'lf
780 The end of a line is marked externally using ASCII LF.
781 Since this is also the way that SXEmacs represents an
782 end-of-line internally, specifying this option results
783 in no end-of-line conversion.  This is the standard
784 format for Unix text files.
785 'crlf
786 The end of a line is marked externally using ASCII
787 CRLF.  This is the standard format for MS-DOS text
788 files.
789 'cr
790 The end of a line is marked externally using ASCII CR.
791 This is the standard format for Macintosh text files.
792 t
793 Automatically detect the end-of-line type but do not
794 generate subsidiary coding systems.  (This value is
795 converted to nil when stored internally, and
796 `coding-system-property' will return nil.)
797
798 'post-read-conversion
799 Function called after a file has been read in, to perform the
800 decoding.  Called with two arguments, START and END, denoting
801 a region of the current buffer to be decoded.
802
803 'pre-write-conversion
804 Function called before a file is written out, to perform the
805 encoding.  Called with two arguments, START and END, denoting
806 a region of the current buffer to be encoded.
807
808 The following additional properties are recognized if TYPE is 'iso2022:
809
810 'charset-g0
811 'charset-g1
812 'charset-g2
813 'charset-g3
814 The character set initially designated to the G0 - G3 registers.
815 The value should be one of
816
817 -- A charset object (designate that character set)
818 -- nil (do not ever use this register)
819 -- t (no character set is initially designated to
820 the register, but may be later on; this automatically
821 sets the corresponding `force-g*-on-output' property)
822
823 'force-g0-on-output
824 'force-g1-on-output
825 'force-g2-on-output
826 'force-g2-on-output
827 If non-nil, send an explicit designation sequence on output before
828 using the specified register.
829
830 'short
831 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
832 "ESC $ B" on output in place of the full designation sequences
833 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
834
835 'no-ascii-eol
836 If non-nil, don't designate ASCII to G0 at each end of line on output.
837 Setting this to non-nil also suppresses other state-resetting that
838 normally happens at the end of a line.
839
840 'no-ascii-cntl
841 If non-nil, don't designate ASCII to G0 before control chars on output.
842
843 'seven
844 If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
845 environment.
846
847 'lock-shift
848 If non-nil, use locking-shift (SO/SI) instead of single-shift
849 or designation by escape sequence.
850
851 'no-iso6429
852 If non-nil, don't use ISO6429's direction specification.
853
854 'escape-quoted
855 If non-nil, literal control characters that are the same as
856 the beginning of a recognized ISO2022 or ISO6429 escape sequence
857 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
858 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
859 so that they can be properly distinguished from an escape sequence.
860 (Note that doing this results in a non-portable encoding.) This
861 encoding flag is used for byte-compiled files.  Note that ESC
862 is a good choice for a quoting character because there are no
863 escape sequences whose second byte is a character from the Control-0
864 or Control-1 character sets; this is explicitly disallowed by the
865 ISO2022 standard.
866
867 'input-charset-conversion
868 A list of conversion specifications, specifying conversion of
869 characters in one charset to another when decoding is performed.
870 Each specification is a list of two elements: the source charset,
871 and the destination charset.
872
873 'output-charset-conversion
874 A list of conversion specifications, specifying conversion of
875 characters in one charset to another when encoding is performed.
876 The form of each specification is the same as for
877 'input-charset-conversion.
878
879 The following additional properties are recognized (and required)
880 if TYPE is 'ccl:
881
882 'decode
883 CCL program used for decoding (converting to internal format).
884
885 'encode
886 CCL program used for encoding (converting to external format).
887 */
888       (name, type, doc_string, props))
889 {
890         Lisp_Coding_System *codesys;
891         enum coding_system_type ty;
892         int need_to_setup_eol_systems = 1;
893
894         /* Convert type to constant */
895         if (NILP(type) || EQ(type, Qundecided)) {
896                 ty = CODESYS_AUTODETECT;
897         }
898 #ifdef MULE
899         else if (EQ(type, Qshift_jis)) {
900                 ty = CODESYS_SHIFT_JIS;
901         } else if (EQ(type, Qiso2022)) {
902                 ty = CODESYS_ISO2022;
903         } else if (EQ(type, Qbig5)) {
904                 ty = CODESYS_BIG5;
905         } else if (EQ(type, Qucs4)) {
906                 ty = CODESYS_UCS4;
907         } else if (EQ(type, Qutf8)) {
908                 ty = CODESYS_UTF8;
909         } else if (EQ(type, Qccl)) {
910                 ty = CODESYS_CCL;
911         }
912 #endif
913         else if (EQ(type, Qno_conversion)) {
914                 ty = CODESYS_NO_CONVERSION;
915         }
916 #ifdef DEBUG_SXEMACS
917         else if (EQ(type, Qinternal)) {
918                 ty = CODESYS_INTERNAL;
919         }
920 #endif
921         else
922                 signal_simple_error("Invalid coding system type", type);
923
924         CHECK_SYMBOL(name);
925
926         codesys = allocate_coding_system(ty, name);
927
928         if (NILP(doc_string))
929                 doc_string = build_string("");
930         else
931                 CHECK_STRING(doc_string);
932         CODING_SYSTEM_DOC_STRING(codesys) = doc_string;
933
934         {
935                 EXTERNAL_PROPERTY_LIST_LOOP_3(key, value, props) {
936                         if (EQ(key, Qmnemonic)) {
937                                 if (!NILP(value))
938                                         CHECK_STRING(value);
939                                 CODING_SYSTEM_MNEMONIC(codesys) = value;
940                         }
941
942                         else if (EQ(key, Qeol_type)) {
943                                 need_to_setup_eol_systems = NILP(value);
944                                 if (EQ(value, Qt))
945                                         value = Qnil;
946                                 CODING_SYSTEM_EOL_TYPE(codesys) =
947                                     symbol_to_eol_type(value);
948                         }
949
950                         else if (EQ(key, Qpost_read_conversion))
951                                 CODING_SYSTEM_POST_READ_CONVERSION(codesys) =
952                                     value;
953                         else if (EQ(key, Qpre_write_conversion))
954                                 CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys) =
955                                     value;
956 #ifdef MULE
957                         else if (ty == CODESYS_ISO2022) {
958 #define FROB_INITIAL_CHARSET(charset_num) \
959   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
960     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
961
962                                 if (EQ(key, Qcharset_g0))
963                                         FROB_INITIAL_CHARSET(0);
964                                 else if (EQ(key, Qcharset_g1))
965                                         FROB_INITIAL_CHARSET(1);
966                                 else if (EQ(key, Qcharset_g2))
967                                         FROB_INITIAL_CHARSET(2);
968                                 else if (EQ(key, Qcharset_g3))
969                                         FROB_INITIAL_CHARSET(3);
970
971 #define FROB_FORCE_CHARSET(charset_num) \
972   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
973
974                                 else if (EQ(key, Qforce_g0_on_output))
975                                         FROB_FORCE_CHARSET(0);
976                                 else if (EQ(key, Qforce_g1_on_output))
977                                         FROB_FORCE_CHARSET(1);
978                                 else if (EQ(key, Qforce_g2_on_output))
979                                         FROB_FORCE_CHARSET(2);
980                                 else if (EQ(key, Qforce_g3_on_output))
981                                         FROB_FORCE_CHARSET(3);
982
983 #define FROB_BOOLEAN_PROPERTY(prop) \
984   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
985
986                                 else if (EQ(key, Qshort))
987                                         FROB_BOOLEAN_PROPERTY(SHORT);
988                                 else if (EQ(key, Qno_ascii_eol))
989                                         FROB_BOOLEAN_PROPERTY(NO_ASCII_EOL);
990                                 else if (EQ(key, Qno_ascii_cntl))
991                                         FROB_BOOLEAN_PROPERTY(NO_ASCII_CNTL);
992                                 else if (EQ(key, Qseven))
993                                         FROB_BOOLEAN_PROPERTY(SEVEN);
994                                 else if (EQ(key, Qlock_shift))
995                                         FROB_BOOLEAN_PROPERTY(LOCK_SHIFT);
996                                 else if (EQ(key, Qno_iso6429))
997                                         FROB_BOOLEAN_PROPERTY(NO_ISO6429);
998                                 else if (EQ(key, Qescape_quoted))
999                                         FROB_BOOLEAN_PROPERTY(ESCAPE_QUOTED);
1000
1001                                 else if (EQ(key, Qinput_charset_conversion)) {
1002                                         codesys->iso2022.input_conv =
1003                                             Dynarr_new(charset_conversion_spec);
1004                                         parse_charset_conversion_specs(codesys->
1005                                                                        iso2022.
1006                                                                        input_conv,
1007                                                                        value);
1008                                 } else if (EQ(key, Qoutput_charset_conversion)) {
1009                                         codesys->iso2022.output_conv =
1010                                             Dynarr_new(charset_conversion_spec);
1011                                         parse_charset_conversion_specs(codesys->
1012                                                                        iso2022.
1013                                                                        output_conv,
1014                                                                        value);
1015                                 } else
1016                                         signal_simple_error
1017                                             ("Unrecognized property", key);
1018                         } else if (EQ(type, Qccl)) {
1019                                 Lisp_Object sym;
1020                                 struct ccl_program test_ccl;
1021                                 Extbyte *suffix;
1022
1023                                 /* Check key first.  */
1024                                 if (EQ(key, Qdecode))
1025                                         suffix = "-ccl-decode";
1026                                 else if (EQ(key, Qencode))
1027                                         suffix = "-ccl-encode";
1028                                 else
1029                                         signal_simple_error
1030                                             ("Unrecognized property", key);
1031
1032                                 /* If value is vector, register it as a ccl program
1033                                    associated with an newly created symbol for
1034                                    backward compatibility.  */
1035                                 if (VECTORP(value)) {
1036                                         sym =
1037                                             Fintern(concat2
1038                                                     (Fsymbol_name(name),
1039                                                      build_string(suffix)),
1040                                                     Qnil);
1041                                         Fregister_ccl_program(sym, value);
1042                                 } else {
1043                                         CHECK_SYMBOL(value);
1044                                         sym = value;
1045                                 }
1046                                 /* check if the given ccl programs are valid.  */
1047                                 if (setup_ccl_program(&test_ccl, sym) < 0)
1048                                         signal_simple_error
1049                                             ("Invalid CCL program", value);
1050
1051                                 if (EQ(key, Qdecode))
1052                                         CODING_SYSTEM_CCL_DECODE(codesys) = sym;
1053                                 else if (EQ(key, Qencode))
1054                                         CODING_SYSTEM_CCL_ENCODE(codesys) = sym;
1055
1056                         }
1057 #endif                          /* MULE */
1058                         else
1059                                 signal_simple_error("Unrecognized property",
1060                                                     key);
1061                 }
1062         }
1063
1064         if (need_to_setup_eol_systems)
1065                 setup_eol_coding_systems(codesys);
1066
1067         {
1068                 Lisp_Object codesys_obj;
1069                 XSETCODING_SYSTEM(codesys_obj, codesys);
1070                 Fputhash(name, codesys_obj, Vcoding_system_hash_table);
1071                 return codesys_obj;
1072         }
1073 }
1074
1075 DEFUN("copy-coding-system", Fcopy_coding_system, 2, 2, 0,       /*
1076 Copy OLD-CODING-SYSTEM to NEW-NAME.
1077 If NEW-NAME does not name an existing coding system, a new one will
1078 be created.
1079 */
1080       (old_coding_system, new_name))
1081 {
1082         Lisp_Object new_coding_system;
1083         old_coding_system = Fget_coding_system(old_coding_system);
1084         new_coding_system = Ffind_coding_system(new_name);
1085         if (NILP(new_coding_system)) {
1086                 XSETCODING_SYSTEM(new_coding_system,
1087                                   allocate_coding_system
1088                                   (XCODING_SYSTEM_TYPE(old_coding_system),
1089                                    new_name));
1090                 Fputhash(new_name, new_coding_system,
1091                          Vcoding_system_hash_table);
1092         }
1093
1094         {
1095                 Lisp_Coding_System *to = XCODING_SYSTEM(new_coding_system);
1096                 Lisp_Coding_System *from = XCODING_SYSTEM(old_coding_system);
1097                 memcpy(((char *)to) + sizeof(to->header),
1098                        ((char *)from) + sizeof(from->header),
1099                        sizeof(*from) - sizeof(from->header));
1100                 to->name = new_name;
1101         }
1102         return new_coding_system;
1103 }
1104
1105 DEFUN("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0,       /*
1106 Return t if OBJECT names a coding system, and is not a coding system alias.
1107 */
1108       (object))
1109 {
1110         Lisp_Object val = Fgethash(object, Vcoding_system_hash_table, Qnil);
1111         return CODING_SYSTEMP(val) ? Qt : Qnil;
1112 }
1113
1114 DEFUN("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1115  Return t if OBJECT is a coding system alias.
1116 All coding system aliases are created by `define-coding-system-alias'.
1117 */
1118       (object))
1119 {
1120         Lisp_Object val = Fgethash(object, Vcoding_system_hash_table, Qzero);
1121         return SYMBOLP(val) ? Qt : Qnil;
1122 }
1123
1124 DEFUN("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1125 Return the coding-system symbol for which symbol ALIAS is an alias.
1126 */
1127       (alias))
1128 {
1129         Lisp_Object aliasee = Fgethash(alias, Vcoding_system_hash_table, Qnil);
1130         if (SYMBOLP(aliasee)) {
1131                 return aliasee;
1132         } else {
1133                 signal_simple_error(
1134                         "Symbol is not a coding system alias", alias);
1135         }
1136         /* To keep the compiler happy */
1137         return Qnil;
1138 }
1139
1140 static Lisp_Object
1141 append_suffix_to_symbol(Lisp_Object symbol, char *ascii_string)
1142 {
1143         return Fintern(concat2(Fsymbol_name(symbol),
1144                                build_string(ascii_string)), Qnil);
1145 }
1146
1147 /* A maphash function, for removing dangling coding system aliases. */
1148 static int
1149 dangling_coding_system_alias_p(Lisp_Object alias,
1150                                Lisp_Object aliasee, void *dangling_aliases)
1151 {
1152         if (SYMBOLP(aliasee)
1153             && NILP(Fgethash(aliasee, Vcoding_system_hash_table, Qnil))) {
1154                 (*(int *)dangling_aliases)++;
1155                 return 1;
1156         } else {
1157                 return 0;
1158         }
1159 }
1160
1161 DEFUN("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0,       /*
1162 Define symbol ALIAS as an alias for coding system ALIASEE.
1163
1164 You can use this function to redefine an alias that has already been defined,
1165 but you cannot redefine a name which is the canonical name for a coding system.
1166 \(a canonical name of a coding system is what is returned when you call
1167 `coding-system-name' on a coding system).
1168
1169 ALIASEE itself can be an alias, which allows you to define nested aliases.
1170
1171 You are forbidden, however, from creating alias loops or `dangling' aliases.
1172 These will be detected, and an error will be signaled if you attempt to do so.
1173
1174 If ALIASEE is nil, then ALIAS will simply be undefined.
1175
1176 See also `coding-system-alias-p', `coding-system-aliasee',
1177 and `coding-system-canonical-name-p'.
1178 */
1179       (alias, aliasee))
1180 {
1181         Lisp_Object real_coding_system, probe;
1182
1183         CHECK_SYMBOL(alias);
1184
1185         if (!NILP(Fcoding_system_canonical_name_p(alias)))
1186                 signal_simple_error
1187                     ("Symbol is the canonical name of a coding system and cannot be redefined",
1188                      alias);
1189
1190         if (NILP(aliasee)) {
1191                 Lisp_Object subsidiary_unix =
1192                     append_suffix_to_symbol(alias, "-unix");
1193                 Lisp_Object subsidiary_dos =
1194                     append_suffix_to_symbol(alias, "-dos");
1195                 Lisp_Object subsidiary_mac =
1196                     append_suffix_to_symbol(alias, "-mac");
1197
1198                 Fremhash(alias, Vcoding_system_hash_table);
1199
1200                 /* Undefine subsidiary aliases,
1201                    presumably created by a previous call to this function */
1202                 if (!NILP(Fcoding_system_alias_p(subsidiary_unix)) &&
1203                     !NILP(Fcoding_system_alias_p(subsidiary_dos)) &&
1204                     !NILP(Fcoding_system_alias_p(subsidiary_mac))) {
1205                         Fdefine_coding_system_alias(subsidiary_unix, Qnil);
1206                         Fdefine_coding_system_alias(subsidiary_dos, Qnil);
1207                         Fdefine_coding_system_alias(subsidiary_mac, Qnil);
1208                 }
1209
1210                 /* Undefine dangling coding system aliases. */
1211                 {
1212                         int dangling_aliases;
1213
1214                         do {
1215                                 dangling_aliases = 0;
1216                                 elisp_map_remhash
1217                                     (dangling_coding_system_alias_p,
1218                                      Vcoding_system_hash_table,
1219                                      &dangling_aliases);
1220                         } while (dangling_aliases > 0);
1221                 }
1222
1223                 return Qnil;
1224         }
1225
1226         if (CODING_SYSTEMP(aliasee))
1227                 aliasee = XCODING_SYSTEM_NAME(aliasee);
1228
1229         /* Checks that aliasee names a coding-system */
1230         real_coding_system = Fget_coding_system(aliasee);
1231         SXE_SET_UNUSED(real_coding_system);
1232
1233         /* Check for coding system alias loops */
1234         if (EQ(alias, aliasee))
1235               alias_loop:signal_simple_error_2
1236                     ("Attempt to create a coding system alias loop", alias,
1237                      aliasee);
1238
1239         for (probe = aliasee;
1240              SYMBOLP(probe);
1241              probe = Fgethash(probe, Vcoding_system_hash_table, Qzero)) {
1242                 if (EQ(probe, alias))
1243                         goto alias_loop;
1244         }
1245
1246         Fputhash(alias, aliasee, Vcoding_system_hash_table);
1247
1248         /* Set up aliases for subsidiaries.
1249            #### There must be a better way to handle subsidiary coding
1250            #### systems. */
1251         {
1252                 static char *suffixes[] = { "-unix", "-dos", "-mac" };
1253
1254                 for (int i = 0; i < countof(suffixes); i++) {
1255                         Lisp_Object alias_subsidiary =
1256                                 append_suffix_to_symbol(alias, suffixes[i]);
1257                         Lisp_Object aliasee_subsidiary =
1258                                 append_suffix_to_symbol(aliasee, suffixes[i]);
1259
1260                         if (!NILP(Ffind_coding_system(aliasee_subsidiary))) {
1261                                 Fdefine_coding_system_alias(alias_subsidiary,
1262                                                             aliasee_subsidiary);
1263                         }
1264                 }
1265         }
1266         /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1267            but it doesn't look intentional, so I'd rather return something
1268            meaningful or nothing at all. */
1269         return Qnil;
1270 }
1271
1272 static Lisp_Object
1273 subsidiary_coding_system(Lisp_Object coding_system, eol_type_t type)
1274 {
1275         Lisp_Coding_System *cs = XCODING_SYSTEM(coding_system);
1276         Lisp_Object new_coding_system;
1277
1278         if (CODING_SYSTEM_EOL_TYPE(cs) != EOL_AUTODETECT)
1279                 return coding_system;
1280
1281         switch (type) {
1282         case EOL_AUTODETECT:
1283                 return coding_system;
1284         case EOL_LF:
1285                 new_coding_system = CODING_SYSTEM_EOL_LF(cs);
1286                 break;
1287         case EOL_CR:
1288                 new_coding_system = CODING_SYSTEM_EOL_CR(cs);
1289                 break;
1290         case EOL_CRLF:
1291                 new_coding_system = CODING_SYSTEM_EOL_CRLF(cs);
1292                 break;
1293         default:
1294                 abort();
1295                 return Qnil;
1296         }
1297
1298         return NILP(new_coding_system) ? coding_system : new_coding_system;
1299 }
1300
1301 DEFUN("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0,   /*
1302 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1303 */
1304       (coding_system, eol_type))
1305 {
1306         coding_system = Fget_coding_system(coding_system);
1307
1308         return subsidiary_coding_system(coding_system,
1309                                         symbol_to_eol_type(eol_type));
1310 }
1311 \f
1312 /************************************************************************/
1313 /*                         Coding system accessors                      */
1314 /************************************************************************/
1315
1316 DEFUN("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0,   /*
1317 Return the doc string for CODING-SYSTEM.
1318 */
1319       (coding_system))
1320 {
1321         coding_system = Fget_coding_system(coding_system);
1322         return XCODING_SYSTEM_DOC_STRING(coding_system);
1323 }
1324
1325 DEFUN("coding-system-type", Fcoding_system_type, 1, 1, 0,       /*
1326 Return the type of CODING-SYSTEM.
1327 */
1328       (coding_system))
1329 {
1330         Lisp_Object tmp = Fget_coding_system(coding_system);
1331
1332         switch (XCODING_SYSTEM_TYPE(tmp)) {
1333         default:
1334                 abort();
1335                 return Qnil;
1336         case CODESYS_AUTODETECT:
1337                 return Qundecided;
1338 #ifdef MULE
1339         case CODESYS_SHIFT_JIS:
1340                 return Qshift_jis;
1341         case CODESYS_ISO2022:
1342                 return Qiso2022;
1343         case CODESYS_BIG5:
1344                 return Qbig5;
1345         case CODESYS_UCS4:
1346                 return Qucs4;
1347         case CODESYS_UTF8:
1348                 return Qutf8;
1349         case CODESYS_CCL:
1350                 return Qccl;
1351 #endif
1352         case CODESYS_NO_CONVERSION:
1353                 return Qno_conversion;
1354 #ifdef DEBUG_SXEMACS
1355         case CODESYS_INTERNAL:
1356                 return Qinternal;
1357 #endif
1358         }
1359 }
1360
1361 #ifdef MULE
1362 static
1363 Lisp_Object coding_system_charset(Lisp_Object coding_system, int gnum)
1364 {
1365         Lisp_Object cs
1366             = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET(coding_system, gnum);
1367
1368         return CHARSETP(cs) ? XCHARSET_NAME(cs) : Qnil;
1369 }
1370
1371 DEFUN("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1372 Return initial charset of CODING-SYSTEM designated to GNUM.
1373 GNUM allows 0 .. 3.
1374 */
1375       (coding_system, gnum))
1376 {
1377         coding_system = Fget_coding_system(coding_system);
1378         CHECK_INT(gnum);
1379
1380         return coding_system_charset(coding_system, XINT(gnum));
1381 }
1382 #endif                          /* MULE */
1383
1384 DEFUN("coding-system-property", Fcoding_system_property, 2, 2, 0,       /*
1385 Return the PROP property of CODING-SYSTEM.
1386 */
1387       (coding_system, prop))
1388 {
1389         int i, ok = 0;
1390         enum coding_system_type type;
1391
1392         coding_system = Fget_coding_system(coding_system);
1393         CHECK_SYMBOL(prop);
1394         type = XCODING_SYSTEM_TYPE(coding_system);
1395
1396         for (i = 0; !ok && i < Dynarr_length(the_codesys_prop_dynarr); i++)
1397                 if (EQ(Dynarr_at(the_codesys_prop_dynarr, i).sym, prop)) {
1398                         ok = 1;
1399                         switch (Dynarr_at(the_codesys_prop_dynarr, i).prop_type) {
1400                         case CODESYS_PROP_ALL_OK:
1401                                 break;
1402 #ifdef MULE
1403                         case CODESYS_PROP_ISO2022:
1404                                 if (type != CODESYS_ISO2022)
1405                                         signal_simple_error
1406                                             ("Property only valid in ISO2022 coding systems",
1407                                              prop);
1408                                 break;
1409
1410                         case CODESYS_PROP_CCL:
1411                                 if (type != CODESYS_CCL)
1412                                         signal_simple_error
1413                                             ("Property only valid in CCL coding systems",
1414                                              prop);
1415                                 break;
1416 #endif                          /* MULE */
1417                         default:
1418                                 abort();
1419                         }
1420                 }
1421
1422         if (!ok)
1423                 signal_simple_error("Unrecognized property", prop);
1424
1425         if (EQ(prop, Qname))
1426                 return XCODING_SYSTEM_NAME(coding_system);
1427         else if (EQ(prop, Qtype))
1428                 return Fcoding_system_type(coding_system);
1429         else if (EQ(prop, Qdoc_string))
1430                 return XCODING_SYSTEM_DOC_STRING(coding_system);
1431         else if (EQ(prop, Qmnemonic))
1432                 return XCODING_SYSTEM_MNEMONIC(coding_system);
1433         else if (EQ(prop, Qeol_type))
1434                 return
1435                     eol_type_to_symbol(XCODING_SYSTEM_EOL_TYPE(coding_system));
1436         else if (EQ(prop, Qeol_lf))
1437                 return XCODING_SYSTEM_EOL_LF(coding_system);
1438         else if (EQ(prop, Qeol_crlf))
1439                 return XCODING_SYSTEM_EOL_CRLF(coding_system);
1440         else if (EQ(prop, Qeol_cr))
1441                 return XCODING_SYSTEM_EOL_CR(coding_system);
1442         else if (EQ(prop, Qpost_read_conversion))
1443                 return XCODING_SYSTEM_POST_READ_CONVERSION(coding_system);
1444         else if (EQ(prop, Qpre_write_conversion))
1445                 return XCODING_SYSTEM_PRE_WRITE_CONVERSION(coding_system);
1446 #ifdef MULE
1447         else if (type == CODESYS_ISO2022) {
1448                 if (EQ(prop, Qcharset_g0))
1449                         return coding_system_charset(coding_system, 0);
1450                 else if (EQ(prop, Qcharset_g1))
1451                         return coding_system_charset(coding_system, 1);
1452                 else if (EQ(prop, Qcharset_g2))
1453                         return coding_system_charset(coding_system, 2);
1454                 else if (EQ(prop, Qcharset_g3))
1455                         return coding_system_charset(coding_system, 3);
1456
1457 #define FORCE_CHARSET(charset_num) \
1458   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1459    (coding_system, charset_num) ? Qt : Qnil)
1460
1461                 else if (EQ(prop, Qforce_g0_on_output))
1462                         return FORCE_CHARSET(0);
1463                 else if (EQ(prop, Qforce_g1_on_output))
1464                         return FORCE_CHARSET(1);
1465                 else if (EQ(prop, Qforce_g2_on_output))
1466                         return FORCE_CHARSET(2);
1467                 else if (EQ(prop, Qforce_g3_on_output))
1468                         return FORCE_CHARSET(3);
1469
1470 #define LISP_BOOLEAN(prop) \
1471   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1472
1473                 else if (EQ(prop, Qshort))
1474                         return LISP_BOOLEAN(SHORT);
1475                 else if (EQ(prop, Qno_ascii_eol))
1476                         return LISP_BOOLEAN(NO_ASCII_EOL);
1477                 else if (EQ(prop, Qno_ascii_cntl))
1478                         return LISP_BOOLEAN(NO_ASCII_CNTL);
1479                 else if (EQ(prop, Qseven))
1480                         return LISP_BOOLEAN(SEVEN);
1481                 else if (EQ(prop, Qlock_shift))
1482                         return LISP_BOOLEAN(LOCK_SHIFT);
1483                 else if (EQ(prop, Qno_iso6429))
1484                         return LISP_BOOLEAN(NO_ISO6429);
1485                 else if (EQ(prop, Qescape_quoted))
1486                         return LISP_BOOLEAN(ESCAPE_QUOTED);
1487
1488                 else if (EQ(prop, Qinput_charset_conversion))
1489                         return
1490                             unparse_charset_conversion_specs
1491                             (XCODING_SYSTEM(coding_system)->iso2022.input_conv);
1492                 else if (EQ(prop, Qoutput_charset_conversion))
1493                         return
1494                             unparse_charset_conversion_specs
1495                             (XCODING_SYSTEM(coding_system)->iso2022.
1496                              output_conv);
1497                 else
1498                         abort();
1499         } else if (type == CODESYS_CCL) {
1500                 if (EQ(prop, Qdecode))
1501                         return XCODING_SYSTEM_CCL_DECODE(coding_system);
1502                 else if (EQ(prop, Qencode))
1503                         return XCODING_SYSTEM_CCL_ENCODE(coding_system);
1504                 else
1505                         abort();
1506         }
1507 #endif                          /* MULE */
1508         else
1509                 abort();
1510
1511         return Qnil;            /* not reached */
1512 }
1513 \f
1514 /************************************************************************/
1515 /*                       Coding category functions                      */
1516 /************************************************************************/
1517
1518 static int decode_coding_category(Lisp_Object symbol)
1519 {
1520         int i;
1521
1522         CHECK_SYMBOL(symbol);
1523         for (i = 0; i < CODING_CATEGORY_LAST; i++)
1524                 if (EQ(coding_category_symbol[i], symbol))
1525                         return i;
1526
1527         signal_simple_error("Unrecognized coding category", symbol);
1528         return 0;               /* not reached */
1529 }
1530
1531 DEFUN("coding-category-list", Fcoding_category_list, 0, 0, 0,   /*
1532 Return a list of all recognized coding categories.
1533 */
1534       ())
1535 {
1536         int i;
1537         Lisp_Object list = Qnil;
1538
1539         for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1540                 list = Fcons(coding_category_symbol[i], list);
1541         return list;
1542 }
1543
1544 DEFUN("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0,   /*
1545 Change the priority order of the coding categories.
1546 LIST should be list of coding categories, in descending order of
1547 priority.  Unspecified coding categories will be lower in priority
1548 than all specified ones, in the same relative order they were in
1549 previously.
1550 */
1551       (list))
1552 {
1553         int category_to_priority[CODING_CATEGORY_LAST];
1554         int i, j;
1555         Lisp_Object rest;
1556
1557         /* First generate a list that maps coding categories to priorities. */
1558
1559         for (i = 0; i < CODING_CATEGORY_LAST; i++)
1560                 category_to_priority[i] = -1;
1561
1562         /* Highest priority comes from the specified list. */
1563         i = 0;
1564         EXTERNAL_LIST_LOOP(rest, list) {
1565                 int cat = decode_coding_category(XCAR(rest));
1566
1567                 if (category_to_priority[cat] >= 0)
1568                         signal_simple_error("Duplicate coding category in list",
1569                                             XCAR(rest));
1570                 category_to_priority[cat] = i++;
1571         }
1572
1573         /* Now go through the existing categories by priority to retrieve
1574            the categories not yet specified and preserve their priority
1575            order. */
1576         for (j = 0; j < CODING_CATEGORY_LAST; j++) {
1577                 int cat = fcd->coding_category_by_priority[j];
1578                 if (category_to_priority[cat] < 0)
1579                         category_to_priority[cat] = i++;
1580         }
1581
1582         /* Now we need to construct the inverse of the mapping we just
1583            constructed. */
1584
1585         for (i = 0; i < CODING_CATEGORY_LAST; i++)
1586                 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1587
1588         /* Phew!  That was confusing. */
1589         return Qnil;
1590 }
1591
1592 DEFUN("coding-priority-list", Fcoding_priority_list, 0, 0, 0,   /*
1593 Return a list of coding categories in descending order of priority.
1594 */
1595       ())
1596 {
1597         int i;
1598         Lisp_Object list = Qnil;
1599
1600         for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1601                 list =
1602                     Fcons(coding_category_symbol
1603                           [fcd->coding_category_by_priority[i]], list);
1604         return list;
1605 }
1606
1607 DEFUN("set-coding-category-system", Fset_coding_category_system, 2, 2, 0,       /*
1608 Change the coding system associated with a coding category.
1609 */
1610       (coding_category, coding_system))
1611 {
1612         int cat = decode_coding_category(coding_category);
1613
1614         coding_system = Fget_coding_system(coding_system);
1615         fcd->coding_category_system[cat] = coding_system;
1616         return Qnil;
1617 }
1618
1619 DEFUN("coding-category-system", Fcoding_category_system, 1, 1, 0,       /*
1620 Return the coding system associated with a coding category.
1621 */
1622       (coding_category))
1623 {
1624         int cat = decode_coding_category(coding_category);
1625         Lisp_Object sys = fcd->coding_category_system[cat];
1626
1627         if (!NILP(sys))
1628                 return XCODING_SYSTEM_NAME(sys);
1629         return Qnil;
1630 }
1631 \f
1632 /************************************************************************/
1633 /*                     Detecting the encoding of data                   */
1634 /************************************************************************/
1635
1636 struct detection_state {
1637         eol_type_t eol_type;
1638         int seen_non_ascii;
1639         int mask;
1640 #ifdef MULE
1641         struct {
1642                 int mask;
1643                 int in_second_byte;
1644         } big5;
1645
1646         struct {
1647                 int mask;
1648                 int in_second_byte;
1649         } shift_jis;
1650
1651         struct {
1652                 int mask;
1653                 int in_byte;
1654         } ucs4;
1655
1656         struct {
1657                 int mask;
1658                 int in_byte;
1659         } utf8;
1660
1661         struct {
1662                 int mask;
1663                 int initted;
1664                 struct iso2022_decoder iso;
1665                 unsigned int flags;
1666                 int high_byte_count;
1667                 unsigned int saw_single_shift:1;
1668         } iso2022;
1669 #endif
1670         struct {
1671                 int seen_anything;
1672                 int just_saw_cr;
1673         } eol;
1674 };
1675
1676 static int acceptable_control_char_p(int c)
1677 {
1678         switch (c) {
1679                 /* Allow and ignore control characters that you might
1680                    reasonably see in a text file */
1681         case '\r':
1682         case '\n':
1683         case '\t':
1684         case 7:         /* bell */
1685         case 8:         /* backspace */
1686         case 11:                /* vertical tab */
1687         case 12:                /* form feed */
1688         case 26:                /* MS-DOS C-z junk */
1689         case 31:                /* '^_' -- for info */
1690                 return 1;
1691         default:
1692                 return 0;
1693         }
1694 }
1695
1696 static int mask_has_at_most_one_bit_p(int mask)
1697 {
1698         /* Perhaps the only thing useful you learn from intensive Microsoft
1699            technical interviews */
1700         return (mask & (mask - 1)) == 0;
1701 }
1702
1703 static eol_type_t
1704 detect_eol_type(struct detection_state *st, const Extbyte * src,
1705                 Lstream_data_count n)
1706 {
1707         while (n--) {
1708                 const unsigned char c = *(const unsigned char*)src++;
1709                 if (c == '\n') {
1710                         if (st->eol.just_saw_cr)
1711                                 return EOL_CRLF;
1712                         else if (st->eol.seen_anything)
1713                                 return EOL_LF;
1714                 } else if (st->eol.just_saw_cr)
1715                         return EOL_CR;
1716                 else if (c == '\r')
1717                         st->eol.just_saw_cr = 1;
1718                 else
1719                         st->eol.just_saw_cr = 0;
1720                 st->eol.seen_anything = 1;
1721         }
1722
1723         return EOL_AUTODETECT;
1724 }
1725
1726 /* Attempt to determine the encoding and EOL type of the given text.
1727    Before calling this function for the first type, you must initialize
1728    st->eol_type as appropriate and initialize st->mask to ~0.
1729
1730    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1731    not yet known.
1732
1733    st->mask holds the determined coding category mask, or ~0 if only
1734    ASCII has been seen so far.
1735
1736    Returns:
1737
1738    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1739         is present in st->mask
1740    1 == definitive answers are here for both st->eol_type and st->mask
1741 */
1742
1743 static int
1744 detect_coding_type(struct detection_state *st, const Extbyte * src,
1745                    Lstream_data_count n, int just_do_eol)
1746 {
1747         if (st->eol_type == EOL_AUTODETECT)
1748                 st->eol_type = detect_eol_type(st, src, n);
1749
1750         if (just_do_eol)
1751                 return st->eol_type != EOL_AUTODETECT;
1752
1753         if (!st->seen_non_ascii) {
1754                 for (; n; n--, src++) {
1755                         const unsigned char c = *(const unsigned char *)src;
1756                         if ((c < 0x20 && !acceptable_control_char_p(c))
1757                             || c >= 0x80) {
1758                                 st->seen_non_ascii = 1;
1759 #ifdef MULE
1760                                 st->shift_jis.mask = ~0;
1761                                 st->big5.mask = ~0;
1762                                 st->ucs4.mask = ~0;
1763                                 st->utf8.mask = ~0;
1764                                 st->iso2022.mask = ~0;
1765 #endif
1766                                 break;
1767                         }
1768                 }
1769         }
1770
1771         if (!n) {
1772                 return 0;
1773         }
1774 #ifdef MULE
1775         if (!mask_has_at_most_one_bit_p(st->iso2022.mask))
1776                 st->iso2022.mask = detect_coding_iso2022(st, src, n);
1777         if (!mask_has_at_most_one_bit_p(st->shift_jis.mask))
1778                 st->shift_jis.mask = detect_coding_sjis(st, src, n);
1779         if (!mask_has_at_most_one_bit_p(st->big5.mask))
1780                 st->big5.mask = detect_coding_big5(st, src, n);
1781         if (!mask_has_at_most_one_bit_p(st->utf8.mask))
1782                 st->utf8.mask = detect_coding_utf8(st, src, n);
1783         if (!mask_has_at_most_one_bit_p(st->ucs4.mask))
1784                 st->ucs4.mask = detect_coding_ucs4(st, src, n);
1785
1786         st->mask = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1787                 | st->utf8.mask | st->ucs4.mask;
1788 #endif
1789         {
1790                 int retval = mask_has_at_most_one_bit_p(st->mask);
1791                 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1792                 return retval && st->eol_type != EOL_AUTODETECT;
1793         }
1794 }
1795
1796 static Lisp_Object coding_system_from_mask(int mask)
1797 {
1798         if (mask == ~0) {
1799                 /* If the file was entirely or basically ASCII, use the
1800                    default value of `buffer-file-coding-system'. */
1801                 Lisp_Object retval =
1802                     XBUFFER(Vbuffer_defaults)->buffer_file_coding_system;
1803                 if (!NILP(retval)) {
1804                         retval = Ffind_coding_system(retval);
1805                         if (NILP(retval)) {
1806                                 warn_when_safe
1807                                     (Qbad_variable, Qwarning,
1808                                      "Invalid `default-buffer-file-coding-system', set to nil");
1809                                 XBUFFER(Vbuffer_defaults)->
1810                                     buffer_file_coding_system = Qnil;
1811                         }
1812                 }
1813                 if (NILP(retval))
1814                         retval = Fget_coding_system(Qraw_text);
1815                 return retval;
1816         } else {
1817                 int i;
1818                 int cat = -1;
1819 #ifdef MULE
1820                 mask = postprocess_iso2022_mask(mask);
1821 #endif
1822                 /* Look through the coding categories by priority and find
1823                    the first one that is allowed. */
1824                 for (i = 0; i < CODING_CATEGORY_LAST; i++) {
1825                         cat = fcd->coding_category_by_priority[i];
1826                         if (cat<0)
1827                                 continue;
1828                         if ((mask & (1 << cat)) &&
1829                             !NILP(fcd->coding_category_system[cat]))
1830                                 break;
1831                 }
1832                 if (cat >= 0)
1833                         return fcd->coding_category_system[cat];
1834                 else
1835                         return Fget_coding_system(Qraw_text);
1836         }
1837 }
1838
1839 /* Given a seekable read stream and potential coding system and EOL type
1840    as specified, do any autodetection that is called for.  If the
1841    coding system and/or EOL type are not `autodetect', they will be left
1842    alone; but this function will never return an autodetect coding system
1843    or EOL type.
1844
1845    This function does not automatically fetch subsidiary coding systems;
1846    that should be unnecessary with the explicit eol-type argument. */
1847
1848 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1849 /* number of leading lines to check for a coding cookie */
1850 #define LINES_TO_CHECK 2
1851
1852
1853 void
1854 autodetect_real_coding_system(lstream_t stream, Lisp_Object * codesys_in_out,
1855                              eol_type_t * eol_type_in_out)
1856 {
1857         static const char mime_name_valid_chars[] =
1858                 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1859                 "abcdefghijklmnopqrstuvwxyz"
1860                 "0123456789"
1861                 "!$%&*+-.^_{|}~";
1862
1863         struct detection_state decst;
1864
1865         Extbyte buf[4096];
1866         Lisp_Object coding_system = Qnil;
1867         Extbyte *p = NULL,
1868                 *scan_end = NULL;
1869         int lines_checked = 0;
1870         Lstream_data_count nread =
1871                 Lstream_read(stream, buf, sizeof(buf));
1872
1873         if (nread < 0)
1874                 /* Nothing more can be done here */
1875                 return;
1876
1877         xzero(decst);
1878         decst.eol_type = *eol_type_in_out;
1879         decst.mask = ~0;
1880
1881
1882         /* Look for initial "-*-"; mode line prefix */
1883         for (p = buf, scan_end = buf + nread - LENGTH("-*-coding:?-*-");
1884              p <= scan_end && lines_checked < LINES_TO_CHECK; p++) {
1885                 Extbyte *local_vars_beg = p + 3;
1886
1887                 if (*p == '\n' || *p == '\r') {
1888                         /* file must use standard EOLs or we miss 2d
1889                            line not to mention this is broken for
1890                            UTF-16 DOS files */
1891                         lines_checked++;
1892                         /* skip past multibyte (DOS) newline */
1893                         if (*p == '\r' && *(p + 1) == '\n')
1894                                 p++;
1895                         continue;
1896                 }
1897                 if (*p != '-' || *(p + 1) != '*' || *(p + 2) != '-') {
1898                         continue;
1899                 }
1900
1901                 /* Look for final "-*-"; mode line suffix */
1902                 for (p = local_vars_beg, scan_end = buf + nread - LENGTH("-*-");
1903                      p <= scan_end && lines_checked < LINES_TO_CHECK; p++) {
1904                         Extbyte *suffix = p;
1905                         if (*p == '\n' || *p == '\r') {
1906                                 /* file must use standard EOLs or we
1907                                    miss 2d line not to mention this is
1908                                    broken for UTF-16 DOS files */
1909                                 lines_checked++;
1910                                 /* skip past multibyte (DOS) newline */
1911                                 if (*p == '\r'
1912                                     && *(p + 1) == '\n')
1913                                         p++;
1914                                 continue;
1915                         }
1916                         if (*p != '-' || *(p + 1) != '*' || *(p + 2) != '-') {
1917                                 continue;
1918                         }
1919
1920                         /* Look for "coding:" */
1921                         for (p = local_vars_beg, scan_end = suffix - LENGTH("coding:?");
1922                              p <= scan_end; p++) {
1923                                 Extbyte save;
1924                                 int n;
1925
1926                                 if (memcmp("coding:", p, LENGTH("coding:")) != 0) {
1927                                         continue;
1928                                 }
1929                                 if (p != local_vars_beg && strchr(" \t;", *p) == NULL ) {
1930                                         continue;
1931                                 }
1932                                 p += LENGTH("coding:");
1933                                 while (*p == ' ' || *p == '\t') {
1934                                         p++;
1935                                 }
1936
1937                                 /* Get coding system name */
1938                                 save = *suffix;
1939                                 *suffix = '\0';
1940                                 /* Characters valid in a MIME charset
1941                                    name (rfc 1521), and in a Lisp
1942                                    symbol name. */
1943                                 n = strspn((char *)p, mime_name_valid_chars);
1944                                 *suffix = save;
1945                                 if (n > 0) {
1946                                         save = p[n];
1947                                         p[n] = '\0';
1948                                         coding_system = Ffind_coding_system(
1949                                                 intern((char *)p));
1950                                         p[n] = save;
1951                                 }
1952                                 break;
1953                         }
1954                         /* Try " coding:" */
1955                         if (NILP(coding_system)) {
1956                                 for (p = local_vars_beg, scan_end = suffix - LENGTH(" coding:?");
1957                                      p <= scan_end; p++) {
1958                                         Extbyte save;
1959                                         int n;
1960
1961                                         if (memcmp(" coding:", p, LENGTH(" coding:")) != 0) {
1962                                                 continue;
1963                                         }
1964                                         if (p != local_vars_beg && strchr(" \t;", *p) == NULL ) {
1965                                                 continue;
1966                                         }
1967                                         p += LENGTH(" coding:");
1968                                         while (*p == ' ' || *p == '\t') {
1969                                                 p++;
1970                                         }
1971
1972                                         /* Get coding system name */
1973                                         save = *suffix;
1974                                         *suffix = '\0';
1975                                         /* Characters valid in a MIME charset
1976                                            name (rfc 1521), and in a Lisp
1977                                            symbol name. */
1978                                         n = strspn((char *)p, mime_name_valid_chars);
1979                                         *suffix = save;
1980                                         if (n > 0) {
1981                                                 save = p[n];
1982                                                 p[n] = '\0';
1983                                                 coding_system = Ffind_coding_system(
1984                                                         intern((char *)p));
1985                                                 p[n] = save;
1986                                         }
1987                                         break;
1988                                 }
1989                         }
1990                         break;
1991                 }
1992                 break;
1993         }
1994         if (NILP(coding_system)) {
1995                 do {
1996                         if (detect_coding_type(&decst, buf, nread,
1997                                                XCODING_SYSTEM_TYPE(*codesys_in_out)
1998                                                != CODESYS_AUTODETECT))
1999                                 break;
2000                         nread = Lstream_read(stream, buf, sizeof(buf));
2001                         if (nread == 0)
2002                                 break;
2003                 }
2004                 while (1);
2005         } else if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT
2006                    && XCODING_SYSTEM_EOL_TYPE(coding_system) == EOL_AUTODETECT) {
2007                 do {
2008                         if (detect_coding_type(&decst, buf, nread, 1))
2009                                 break;
2010                         nread = Lstream_read(stream, buf, sizeof(buf));
2011                         if (!nread)
2012                                 break;
2013                 }
2014                 while (1);
2015         }
2016         *eol_type_in_out = decst.eol_type;
2017         if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT) {
2018                 if (NILP(coding_system))
2019                         *codesys_in_out =
2020                                 coding_system_from_mask(decst.mask);
2021                 else
2022                         *codesys_in_out = coding_system;
2023         }
2024 }
2025
2026 void
2027 determine_real_coding_system(lstream_t stream, Lisp_Object * codesys_in_out,
2028                              eol_type_t * eol_type_in_out)
2029 {
2030
2031         if (*eol_type_in_out == EOL_AUTODETECT)
2032                 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE(*codesys_in_out);
2033
2034
2035         /* If autodetection is called for, do it now. */
2036         if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT
2037             || *eol_type_in_out == EOL_AUTODETECT) {
2038                 autodetect_real_coding_system(stream, codesys_in_out,
2039                                               eol_type_in_out);
2040         }
2041
2042         /* If we absolutely can't determine the EOL type, just assume LF. */
2043         if (*eol_type_in_out == EOL_AUTODETECT)
2044                 *eol_type_in_out = EOL_LF;
2045
2046         Lstream_rewind(stream);
2047 }
2048
2049 DEFUN("detect-coding-region", Fdetect_coding_region, 2, 3, 0,   /*
2050 Detect coding system of the text in the region between START and END.
2051 Return a list of possible coding systems ordered by priority.
2052 If only ASCII characters are found, return 'undecided or one of
2053 its subsidiary coding systems according to a detected end-of-line
2054 type.  Optional arg BUFFER defaults to the current buffer.
2055 */
2056       (start, end, buffer))
2057 {
2058         Lisp_Object val = Qnil;
2059         struct buffer *buf = decode_buffer(buffer, 0);
2060         Bufpos b, e;
2061         Lisp_Object instream, lb_instream;
2062         lstream_t istr, lb_istr;
2063         struct detection_state decst;
2064         struct gcpro gcpro1, gcpro2;
2065
2066         get_buffer_range_char(buf, start, end, &b, &e, 0);
2067         lb_instream = make_lisp_buffer_input_stream(buf, b, e, 0);
2068         lb_istr = XLSTREAM(lb_instream);
2069         instream =
2070             make_encoding_input_stream(lb_istr, Fget_coding_system(Qbinary));
2071         istr = XLSTREAM(instream);
2072         GCPRO2(instream, lb_instream);
2073         xzero(decst);
2074         decst.eol_type = EOL_AUTODETECT;
2075         decst.mask = ~0;
2076         while (1) {
2077                 Extbyte random_buffer[4096];
2078                 Lstream_data_count nread =
2079                     Lstream_read(istr, random_buffer, sizeof(random_buffer));
2080
2081                 if (!nread)
2082                         break;
2083                 if (detect_coding_type(&decst, random_buffer, nread, 0))
2084                         break;
2085         }
2086
2087         if (decst.mask == ~0)
2088                 val = subsidiary_coding_system(Fget_coding_system(Qundecided),
2089                                                decst.eol_type);
2090         else {
2091                 int i;
2092
2093                 val = Qnil;
2094 #ifdef MULE
2095                 decst.mask = postprocess_iso2022_mask(decst.mask);
2096 #endif
2097                 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--) {
2098                         int sys = fcd->coding_category_by_priority[i];
2099                         if (decst.mask & (1 << sys)) {
2100                                 Lisp_Object codesys =
2101                                     fcd->coding_category_system[sys];
2102                                 if (!NILP(codesys))
2103                                         codesys =
2104                                             subsidiary_coding_system(codesys,
2105                                                                      decst.
2106                                                                      eol_type);
2107                                 val = Fcons(codesys, val);
2108                         }
2109                 }
2110         }
2111         Lstream_close(istr);
2112         UNGCPRO;
2113         Lstream_delete(istr);
2114         Lstream_delete(lb_istr);
2115         return val;
2116 }
2117 \f
2118 /************************************************************************/
2119 /*           Converting to internal Mule format ("decoding")            */
2120 /************************************************************************/
2121
2122 /* A decoding stream is a stream used for decoding text (i.e.
2123    converting from some external format to internal format).
2124    The decoding-stream object keeps track of the actual coding
2125    stream, the stream that is at the other end, and data that
2126    needs to be persistent across the lifetime of the stream. */
2127
2128 /* Handle the EOL stuff related to just-read-in character C.
2129    EOL_TYPE is the EOL type of the coding stream.
2130    FLAGS is the current value of FLAGS in the coding stream, and may
2131    be modified by this macro.  (The macro only looks at the
2132    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
2133    bytes are to be written.  You need to also define a local goto
2134    label "label_continue_loop" that is at the end of the main
2135    character-reading loop.
2136
2137    If C is a CR character, then this macro handles it entirely and
2138    jumps to label_continue_loop.  Otherwise, this macro does not add
2139    anything to DST, and continues normally.  You should continue
2140    processing C normally after this macro. */
2141
2142 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
2143 do {                                                            \
2144   if (c == '\r')                                                \
2145     {                                                           \
2146       if (eol_type == EOL_CR)                                   \
2147         Dynarr_add (dst, '\n');                                 \
2148       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2149         Dynarr_add (dst, c);                                    \
2150       else                                                      \
2151         flags |= CODING_STATE_CR;                               \
2152       goto label_continue_loop;                                 \
2153     }                                                           \
2154   else if (flags & CODING_STATE_CR)                             \
2155     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
2156       if (c != '\n')                                            \
2157         Dynarr_add (dst, '\r');                                 \
2158       flags &= ~CODING_STATE_CR;                                \
2159     }                                                           \
2160 } while (0)
2161
2162 /* C should be a binary character in the range 0 - 255; convert
2163    to internal format and add to Dynarr DST. */
2164
2165 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
2166 do {                                            \
2167   if (BYTE_ASCII_P (c))                         \
2168     Dynarr_add (dst, c);                        \
2169   else if (BYTE_C1_P (c))                       \
2170     {                                           \
2171       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2172       Dynarr_add (dst, c + 0x20);               \
2173     }                                           \
2174   else                                          \
2175     {                                           \
2176       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2177       Dynarr_add (dst, c);                      \
2178     }                                           \
2179 } while (0)
2180
2181 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
2182 do {                                    \
2183   if (ch)                               \
2184     {                                   \
2185       DECODE_ADD_BINARY_CHAR (ch, dst); \
2186       ch = 0;                           \
2187     }                                   \
2188 } while (0)
2189
2190 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2191 do {                                    \
2192   if (flags & CODING_STATE_END)         \
2193     {                                   \
2194       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
2195       if (flags & CODING_STATE_CR)      \
2196         Dynarr_add (dst, '\r');         \
2197     }                                   \
2198 } while (0)
2199
2200 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2201
2202 typedef struct decoding_stream_s *decoding_stream_t;
2203 struct decoding_stream_s {
2204         /* Coding system that governs the conversion. */
2205         Lisp_Coding_System *codesys;
2206
2207         /* Stream that we read the encoded data from or
2208            write the decoded data to. */
2209         lstream_t other_end;
2210
2211         /* If we are reading, then we can return only a fixed amount of
2212            data, so if the conversion resulted in too much data, we store it
2213            here for retrieval the next time around. */
2214         unsigned_char_dynarr *runoff;
2215
2216         /* FLAGS holds flags indicating the current state of the decoding.
2217            Some of these flags are dependent on the coding system. */
2218         unsigned int flags;
2219
2220         /* CH holds a partially built-up character.  Since we only deal
2221            with one- and two-byte characters at the moment, we only use
2222            this to store the first byte of a two-byte character. */
2223         unsigned int ch;
2224
2225         /* EOL_TYPE specifies the type of end-of-line conversion that
2226            currently applies.  We need to keep this separate from the
2227            EOL type stored in CODESYS because the latter might indicate
2228            automatic EOL-type detection while the former will always
2229            indicate a particular EOL type. */
2230         eol_type_t eol_type;
2231 #ifdef MULE
2232         /* Additional ISO2022 information.  We define the structure above
2233            because it's also needed by the detection routines. */
2234         struct iso2022_decoder iso2022;
2235
2236         /* Additional information (the state of the running CCL program)
2237            used by the CCL decoder. */
2238         struct ccl_program ccl;
2239
2240         /* counter for UTF-8 or UCS-4 */
2241         unsigned char counter;
2242 #endif
2243         struct detection_state decst;
2244 };
2245
2246 static Lstream_data_count
2247 decoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size);
2248 static Lstream_data_count
2249 decoding_writer(lstream_t stream,
2250                 const unsigned char *data, Lstream_data_count size);
2251 static int decoding_rewinder(lstream_t stream);
2252 static int decoding_seekable_p(lstream_t stream);
2253 static int decoding_flusher(lstream_t stream);
2254 static int decoding_closer(lstream_t stream);
2255
2256 static Lisp_Object decoding_marker(Lisp_Object stream);
2257
2258 DEFINE_LSTREAM_IMPLEMENTATION("decoding", lstream_decoding,
2259                               sizeof(struct decoding_stream_s));
2260
2261 static Lisp_Object
2262 decoding_marker(Lisp_Object stream)
2263 {
2264         lstream_t str = DECODING_STREAM_DATA(XLSTREAM(stream))->other_end;
2265         Lisp_Object str_obj;
2266
2267         /* We do not need to mark the coding systems or charsets stored
2268            within the stream because they are stored in a global list
2269            and automatically marked. */
2270
2271         XSETLSTREAM(str_obj, str);
2272         mark_object(str_obj);
2273         if (str->imp->marker) {
2274                 return str->imp->marker(str_obj);
2275         } else {
2276                 return Qnil;
2277         }
2278 }
2279
2280 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
2281    so we read data from the other end, decode it, and store it into DATA. */
2282
2283 static Lstream_data_count
2284 decoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size)
2285 {
2286         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2287         unsigned char *orig_data = data;
2288         Lstream_data_count read_size;
2289         int error_occurred = 0;
2290
2291         /* We need to interface to mule_decode(), which expects to take some
2292            amount of data and store the result into a Dynarr.  We have
2293            mule_decode() store into str->runoff, and take data from there
2294            as necessary. */
2295
2296         /* We loop until we have enough data, reading chunks from the other
2297            end and decoding it. */
2298         while (1) {
2299                 /* Take data from the runoff if we can.  Make sure to take at
2300                    most SIZE bytes, and delete the data from the runoff. */
2301                 if (Dynarr_length(str->runoff) > 0) {
2302                         Lstream_data_count chunk =
2303                                 min(size,
2304                                     (Lstream_data_count)
2305                                     Dynarr_length(str->runoff));
2306                         memcpy(data, Dynarr_atp(str->runoff, 0), chunk);
2307                         Dynarr_delete_many(str->runoff, 0, chunk);
2308                         data += chunk;
2309                         size -= chunk;
2310                 }
2311
2312                 if (size == 0) {
2313                         /* No more room for data */
2314                         break;
2315                 }
2316
2317                 if (str->flags & CODING_STATE_END) {
2318                         /* This means that on the previous iteration, we hit the
2319                            EOF on the other end.  We loop once more so that
2320                            mule_decode() can output any final stuff it may be
2321                            holding, or any "go back to a sane state" escape
2322                            sequences. (This latter makes sense during
2323                            encoding.) */
2324                         break;
2325                 }
2326
2327                 /* Exhausted the runoff, so get some more.  DATA has at least
2328                    SIZE bytes left of storage in it, so it's OK to read directly
2329                    into it.  (We'll be overwriting above, after we've decoded it
2330                    into the runoff.) */
2331                 read_size = Lstream_read(str->other_end, data, size);
2332                 if (read_size < 0) {
2333                         error_occurred = 1;
2334                         break;
2335                 }
2336                 if (read_size == 0) {
2337                         /* There might be some more end data produced in the
2338                            translation.  See the comment above. */
2339                         str->flags |= CODING_STATE_END;
2340                 }
2341                 mule_decode(stream, (Extbyte *) data, str->runoff, read_size);
2342         }
2343
2344         if (data - orig_data == 0) {
2345                 return error_occurred ? -1 : 0;
2346         } else {
2347                 return data - orig_data;
2348         }
2349 }
2350
2351 static Lstream_data_count
2352 decoding_writer(lstream_t stream, const unsigned char *data,
2353                 Lstream_data_count size)
2354 {
2355         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2356         Lstream_data_count retval;
2357
2358         /* Decode all our data into the runoff, and then attempt to write
2359            it all out to the other end.  Remove whatever chunk we succeeded
2360            in writing. */
2361         mule_decode(stream, (const Extbyte *)data, str->runoff, size);
2362         retval = Lstream_write(str->other_end, Dynarr_atp(str->runoff, 0),
2363                                Dynarr_length(str->runoff));
2364         if (retval > 0) {
2365                 Dynarr_delete_many(str->runoff, 0, retval);
2366         }
2367         /* Do NOT return retval.  The return value indicates how much
2368            of the incoming data was written, not how many bytes were
2369            written. */
2370         return size;
2371 }
2372
2373 static void
2374 reset_decoding_stream(decoding_stream_t str)
2375 {
2376 #ifdef MULE
2377         if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_ISO2022) {
2378                 Lisp_Object coding_system;
2379                 XSETCODING_SYSTEM(coding_system, str->codesys);
2380                 reset_iso2022(coding_system, &str->iso2022);
2381         } else if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_CCL) {
2382                 setup_ccl_program(&str->ccl,
2383                                   CODING_SYSTEM_CCL_DECODE(str->codesys));
2384         }
2385         str->counter = 0;
2386 #endif                          /* MULE */
2387         if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT
2388             || CODING_SYSTEM_EOL_TYPE(str->codesys) == EOL_AUTODETECT) {
2389                 xzero(str->decst);
2390                 str->decst.eol_type = EOL_AUTODETECT;
2391                 str->decst.mask = ~0;
2392         }
2393         str->flags = str->ch = 0;
2394 }
2395
2396 static int
2397 decoding_rewinder(lstream_t stream)
2398 {
2399         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2400         reset_decoding_stream(str);
2401         Dynarr_reset(str->runoff);
2402         return Lstream_rewind(str->other_end);
2403 }
2404
2405 static int
2406 decoding_seekable_p(lstream_t stream)
2407 {
2408         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2409         return Lstream_seekable_p(str->other_end);
2410 }
2411
2412 static int
2413 decoding_flusher(lstream_t stream)
2414 {
2415         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2416         return Lstream_flush(str->other_end);
2417 }
2418
2419 static int
2420 decoding_closer(lstream_t stream)
2421 {
2422         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2423         if (stream->flags & LSTREAM_FL_WRITE) {
2424                 str->flags |= CODING_STATE_END;
2425                 decoding_writer(stream, 0, 0);
2426         }
2427         Dynarr_free(str->runoff);
2428 #ifdef MULE
2429 #ifdef ENABLE_COMPOSITE_CHARS
2430         if (str->iso2022.composite_chars) {
2431                 Dynarr_free(str->iso2022.composite_chars);
2432         }
2433 #endif
2434 #endif
2435         return Lstream_close(str->other_end);
2436 }
2437
2438 Lisp_Object
2439 decoding_stream_coding_system(lstream_t stream)
2440 {
2441         Lisp_Object coding_system;
2442         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2443
2444         XSETCODING_SYSTEM(coding_system, str->codesys);
2445         return subsidiary_coding_system(coding_system, str->eol_type);
2446 }
2447
2448 void
2449 set_decoding_stream_coding_system(lstream_t lstr, Lisp_Object codesys)
2450 {
2451         Lisp_Coding_System *cs = XCODING_SYSTEM(codesys);
2452         decoding_stream_t str = DECODING_STREAM_DATA(lstr);
2453         str->codesys = cs;
2454         if (CODING_SYSTEM_EOL_TYPE(cs) != EOL_AUTODETECT) {
2455                 str->eol_type = CODING_SYSTEM_EOL_TYPE(cs);
2456         }
2457         reset_decoding_stream(str);
2458         return;
2459 }
2460
2461 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2462    stream for writing, no automatic code detection will be performed.
2463    The reason for this is that automatic code detection requires a
2464    seekable input.  Things will also fail if you open a decoding
2465    stream for reading using a non-fully-specified coding system and
2466    a non-seekable input stream. */
2467
2468 static Lisp_Object
2469 make_decoding_stream_1(lstream_t stream, Lisp_Object codesys, const char *mode)
2470 {
2471         lstream_t lstr = Lstream_new(lstream_decoding, mode);
2472         decoding_stream_t str = DECODING_STREAM_DATA(lstr);
2473         Lisp_Object obj;
2474
2475         xzero(*str);
2476         str->other_end = stream;
2477         str->runoff = (unsigned_char_dynarr *) Dynarr_new(unsigned_char);
2478         str->eol_type = EOL_AUTODETECT;
2479         if (!strcmp(mode, "r") && Lstream_seekable_p(stream)) {
2480                 /* We can determine the coding system now. */
2481                 determine_real_coding_system(stream, &codesys, &str->eol_type);
2482         }
2483         set_decoding_stream_coding_system(lstr, codesys);
2484         str->decst.eol_type = str->eol_type;
2485         str->decst.mask = ~0;
2486         XSETLSTREAM(obj, lstr);
2487         return obj;
2488 }
2489
2490 Lisp_Object
2491 make_decoding_input_stream(lstream_t stream, Lisp_Object codesys)
2492 {
2493         return make_decoding_stream_1(stream, codesys, "r");
2494 }
2495
2496 Lisp_Object
2497 make_decoding_output_stream(lstream_t stream, Lisp_Object codesys)
2498 {
2499         return make_decoding_stream_1(stream, codesys, "w");
2500 }
2501
2502 /* Note: the decode_coding_* functions all take the same
2503    arguments as mule_decode(), which is to say some SRC data of
2504    size N, which is to be stored into dynamic array DST.
2505    DECODING is the stream within which the decoding is
2506    taking place, but no data is actually read from or
2507    written to that stream; that is handled in decoding_reader()
2508    or decoding_writer().  This allows the same functions to
2509    be used for both reading and writing. */
2510
2511 static void
2512 mule_decode(lstream_t decoding, const Extbyte * src,
2513             unsigned_char_dynarr * dst, Lstream_data_count n)
2514 {
2515         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
2516
2517         /* If necessary, do encoding-detection now.  We do this when
2518            we're a writing stream or a non-seekable reading stream,
2519            meaning that we can't just process the whole input,
2520            rewind, and start over. */
2521
2522         if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT ||
2523             str->eol_type == EOL_AUTODETECT) {
2524                 Lisp_Object codesys;
2525
2526                 XSETCODING_SYSTEM(codesys, str->codesys);
2527                 detect_coding_type(&str->decst, src, n,
2528                                    CODING_SYSTEM_TYPE(str->codesys) !=
2529                                    CODESYS_AUTODETECT);
2530                 if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT &&
2531                     str->decst.mask != ~0)
2532                         /* #### This is cheesy.  What we really ought to do is
2533                            buffer up a certain amount of data so as to get a
2534                            less random result. */
2535                         codesys = coding_system_from_mask(str->decst.mask);
2536                 str->eol_type = str->decst.eol_type;
2537                 if (XCODING_SYSTEM(codesys) != str->codesys) {
2538                         /* Preserve the CODING_STATE_END flag in case it was set.
2539                            If we erase it, bad things might happen. */
2540                         int was_end = str->flags & CODING_STATE_END;
2541                         set_decoding_stream_coding_system(decoding, codesys);
2542                         if (was_end)
2543                                 str->flags |= CODING_STATE_END;
2544                 }
2545         }
2546
2547         switch (CODING_SYSTEM_TYPE(str->codesys)) {
2548 #ifdef DEBUG_SXEMACS
2549         case CODESYS_INTERNAL:
2550                 Dynarr_add_many(dst, src, n);
2551                 break;
2552 #endif
2553         case CODESYS_AUTODETECT:
2554                 /* If we got this far and still haven't decided on the coding
2555                    system, then do no conversion. */
2556         case CODESYS_NO_CONVERSION:
2557                 decode_coding_no_conversion(decoding, src, dst, n);
2558                 break;
2559 #ifdef MULE
2560         case CODESYS_SHIFT_JIS:
2561                 decode_coding_sjis(decoding, src, dst, n);
2562                 break;
2563         case CODESYS_BIG5:
2564                 decode_coding_big5(decoding, src, dst, n);
2565                 break;
2566         case CODESYS_UCS4:
2567                 decode_coding_ucs4(decoding, src, dst, n);
2568                 break;
2569         case CODESYS_UTF8:
2570                 decode_coding_utf8(decoding, src, dst, n);
2571                 break;
2572         case CODESYS_CCL:
2573                 str->ccl.last_block = str->flags & CODING_STATE_END;
2574                 /* When applying ccl program to stream, MUST NOT set NULL
2575                    pointer to src.  */
2576                 ccl_driver(&str->ccl,
2577                            (src
2578                             ? (const unsigned char *)src
2579                             : (const unsigned char *)""),
2580                            dst, n, 0, CCL_MODE_DECODING);
2581                 break;
2582         case CODESYS_ISO2022:
2583                 decode_coding_iso2022(decoding, src, dst, n);
2584                 break;
2585 #endif                          /* MULE */
2586         default:
2587                 abort();
2588         }
2589 }
2590
2591 DEFUN("decode-coding-region", Fdecode_coding_region, 3, 4, 0,   /*
2592 Decode the text between START and END which is encoded in CODING-SYSTEM.
2593 This is useful if you've read in encoded text from a file without decoding
2594 it (e.g. you read in a JIS-formatted file but used the `binary' or
2595 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2596 Return length of decoded text.
2597 BUFFER defaults to the current buffer if unspecified.
2598 */
2599       (start, end, coding_system, buffer))
2600 {
2601         Bufpos b, e;
2602         struct buffer *buf = decode_buffer(buffer, 0);
2603         Lisp_Object instream, lb_outstream, de_outstream, outstream;
2604         lstream_t istr, ostr;
2605         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2606
2607         get_buffer_range_char(buf, start, end, &b, &e, 0);
2608
2609         barf_if_buffer_read_only(buf, b, e);
2610
2611         coding_system = Fget_coding_system(coding_system);
2612         instream = make_lisp_buffer_input_stream(buf, b, e, 0);
2613         lb_outstream = make_lisp_buffer_output_stream(buf, b, 0);
2614         de_outstream = make_decoding_output_stream(XLSTREAM(lb_outstream),
2615                                                    coding_system);
2616         outstream = make_encoding_output_stream(XLSTREAM(de_outstream),
2617                                                 Fget_coding_system(Qbinary));
2618         istr = XLSTREAM(instream);
2619         ostr = XLSTREAM(outstream);
2620         GCPRO4(instream, lb_outstream, de_outstream, outstream);
2621
2622         /* The chain of streams looks like this:
2623
2624            [BUFFER] <----- send through
2625            ------> [ENCODE AS BINARY]
2626            ------> [DECODE AS SPECIFIED]
2627            ------> [BUFFER]
2628          */
2629
2630         while (1) {
2631                 char tempbuf[1024];     /* some random amount */
2632                 Bufpos newpos, even_newer_pos;
2633                 Bufpos oldpos = lisp_buffer_stream_startpos(istr);
2634                 Lstream_data_count size_in_bytes =
2635                     Lstream_read(istr, tempbuf, sizeof(tempbuf));
2636
2637                 if (!size_in_bytes)
2638                         break;
2639                 newpos = lisp_buffer_stream_startpos(istr);
2640                 Lstream_write(ostr, tempbuf, size_in_bytes);
2641                 even_newer_pos = lisp_buffer_stream_startpos(istr);
2642                 buffer_delete_range(buf, even_newer_pos - (newpos - oldpos),
2643                                     even_newer_pos, 0);
2644         }
2645         Lstream_close(istr);
2646         Lstream_close(ostr);
2647         UNGCPRO;
2648         Lstream_delete(istr);
2649         Lstream_delete(ostr);
2650         Lstream_delete(XLSTREAM(de_outstream));
2651         Lstream_delete(XLSTREAM(lb_outstream));
2652         return Qnil;
2653 }
2654 \f
2655 /************************************************************************/
2656 /*           Converting to an external encoding ("encoding")            */
2657 /************************************************************************/
2658
2659 /* An encoding stream is an output stream.  When you create the
2660    stream, you specify the coding system that governs the encoding
2661    and another stream that the resulting encoded data is to be
2662    sent to, and then start sending data to it. */
2663
2664 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2665
2666 typedef struct encoding_stream_s *encoding_stream_t;
2667 struct encoding_stream_s {
2668         /* Coding system that governs the conversion. */
2669         Lisp_Coding_System *codesys;
2670
2671         /* Stream that we read the encoded data from or
2672            write the decoded data to. */
2673         lstream_t other_end;
2674
2675         /* If we are reading, then we can return only a fixed amount of
2676            data, so if the conversion resulted in too much data, we store it
2677            here for retrieval the next time around. */
2678         unsigned_char_dynarr *runoff;
2679
2680         /* FLAGS holds flags indicating the current state of the encoding.
2681            Some of these flags are dependent on the coding system. */
2682         unsigned int flags;
2683
2684         /* CH holds a partially built-up character.  Since we only deal
2685            with one- and two-byte characters at the moment, we only use
2686            this to store the first byte of a two-byte character. */
2687         unsigned int ch;
2688 #ifdef MULE
2689         /* Additional information used by the ISO2022 encoder. */
2690         struct {
2691                 /* CHARSET holds the character sets currently assigned to the G0
2692                    through G3 registers.  It is initialized from the array
2693                    INITIAL_CHARSET in CODESYS. */
2694                 Lisp_Object charset[4];
2695
2696                 /* Which registers are currently invoked into the left (GL) and
2697                    right (GR) halves of the 8-bit encoding space? */
2698                 int register_left, register_right;
2699
2700                 /* Whether we need to explicitly designate the charset in the
2701                    G? register before using it.  It is initialized from the
2702                    array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2703                 unsigned char force_charset_on_output[4];
2704
2705                 /* Other state variables that need to be preserved across
2706                    invocations. */
2707                 Lisp_Object current_charset;
2708                 int current_half;
2709                 int current_char_boundary;
2710         } iso2022;
2711
2712         /* Additional information (the state of the running CCL program)
2713            used by the CCL encoder. */
2714         struct ccl_program ccl;
2715 #endif                          /* MULE */
2716 };
2717
2718 static Lstream_data_count
2719 encoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size);
2720 static Lstream_data_count
2721 encoding_writer(lstream_t stream,
2722                 const unsigned char *data, Lstream_data_count size);
2723 static int encoding_rewinder(lstream_t stream);
2724 static int encoding_seekable_p(lstream_t stream);
2725 static int encoding_flusher(lstream_t stream);
2726 static int encoding_closer(lstream_t stream);
2727
2728 static Lisp_Object encoding_marker(Lisp_Object stream);
2729
2730 DEFINE_LSTREAM_IMPLEMENTATION("encoding", lstream_encoding,
2731                               sizeof(struct encoding_stream_s));
2732
2733 static Lisp_Object
2734 encoding_marker(Lisp_Object stream)
2735 {
2736         lstream_t str = ENCODING_STREAM_DATA(XLSTREAM(stream))->other_end;
2737         Lisp_Object str_obj;
2738
2739         /* We do not need to mark the coding systems or charsets stored
2740            within the stream because they are stored in a global list
2741            and automatically marked. */
2742
2743         XSETLSTREAM(str_obj, str);
2744         mark_object(str_obj);
2745         if (str->imp->marker) {
2746                 return str->imp->marker(str_obj);
2747         } else {
2748                 return Qnil;
2749         }
2750 }
2751
2752 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2753    so we read data from the other end, encode it, and store it into DATA. */
2754
2755 static Lstream_data_count
2756 encoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size)
2757 {
2758         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2759         unsigned char *orig_data = data;
2760         Lstream_data_count read_size;
2761         int error_occurred = 0;
2762
2763         /* We need to interface to mule_encode(), which expects to take some
2764            amount of data and store the result into a Dynarr.  We have
2765            mule_encode() store into str->runoff, and take data from there
2766            as necessary. */
2767
2768         /* We loop until we have enough data, reading chunks from the other
2769            end and encoding it. */
2770         while (1) {
2771                 /* Take data from the runoff if we can.  Make sure to take at
2772                    most SIZE bytes, and delete the data from the runoff. */
2773                 if (Dynarr_length(str->runoff) > 0) {
2774                         int chunk = min((int)size, Dynarr_length(str->runoff));
2775                         memcpy(data, Dynarr_atp(str->runoff, 0), chunk);
2776                         Dynarr_delete_many(str->runoff, 0, chunk);
2777                         data += chunk;
2778                         size -= chunk;
2779                 }
2780
2781                 if (size == 0) {
2782                         /* No more room for data */
2783                         break;
2784                 }
2785
2786                 if (str->flags & CODING_STATE_END) {
2787                         /* This means that on the previous iteration, we hit the
2788                            EOF on the other end.  We loop once more so that
2789                            mule_encode() can output any final stuff it may be
2790                            holding, or any "go back to a sane state" escape
2791                            sequences. (This latter makes sense during
2792                            encoding.) */
2793                         break;
2794                 }
2795
2796                 /* Exhausted the runoff, so get some more.  DATA at least SIZE
2797                    bytes left of storage in it, so it's OK to read directly into
2798                    it.  (We'll be overwriting above, after we've encoded it into
2799                    the runoff.) */
2800                 read_size = Lstream_read(str->other_end, data, size);
2801                 if (read_size < 0) {
2802                         error_occurred = 1;
2803                         break;
2804                 }
2805                 if (read_size == 0) {
2806                         /* There might be some more end data produced in the
2807                            translation.  See the comment above. */
2808                         str->flags |= CODING_STATE_END;
2809                 }
2810                 mule_encode(stream, data, str->runoff, read_size);
2811         }
2812
2813         if (data == orig_data) {
2814                 return error_occurred ? -1 : 0;
2815         } else {
2816                 return data - orig_data;
2817         }
2818 }
2819
2820 static Lstream_data_count
2821 encoding_writer(lstream_t stream, const unsigned char *data,
2822                 Lstream_data_count size)
2823 {
2824         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2825         Lstream_data_count retval;
2826
2827         /* Encode all our data into the runoff, and then attempt to write
2828            it all out to the other end.  Remove whatever chunk we succeeded
2829            in writing. */
2830         mule_encode(stream, data, str->runoff, size);
2831         retval = Lstream_write(str->other_end, Dynarr_atp(str->runoff, 0),
2832                                Dynarr_length(str->runoff));
2833         if (retval > 0) {
2834                 Dynarr_delete_many(str->runoff, 0, retval);
2835         }
2836         /* Do NOT return retval.  The return value indicates how much
2837            of the incoming data was written, not how many bytes were
2838            written. */
2839         return size;
2840 }
2841
2842 static void
2843 reset_encoding_stream(encoding_stream_t str)
2844 {
2845 #ifdef MULE
2846         switch (CODING_SYSTEM_TYPE(str->codesys)) {
2847         case CODESYS_ISO2022: {
2848                 int i;
2849
2850                 for (i = 0; i < 4; i++) {
2851                         str->iso2022.charset[i] =
2852                                 CODING_SYSTEM_ISO2022_INITIAL_CHARSET(
2853                                         str->codesys, i);
2854                         str->iso2022.force_charset_on_output[i] =
2855                                 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT(
2856                                         str->codesys, i);
2857                 }
2858                 str->iso2022.register_left = 0;
2859                 str->iso2022.register_right = 1;
2860                 str->iso2022.current_charset = Qnil;
2861                 str->iso2022.current_half = 0;
2862                 str->iso2022.current_char_boundary = 1;
2863                 break;
2864         }
2865         case CODESYS_CCL:
2866                 setup_ccl_program(&str->ccl,
2867                                   CODING_SYSTEM_CCL_ENCODE(str->codesys));
2868                 break;
2869
2870                 /* list the rest of them lot explicitly */
2871         case CODESYS_AUTODETECT:
2872         case CODESYS_SHIFT_JIS:
2873         case CODESYS_BIG5:
2874         case CODESYS_UCS4:
2875         case CODESYS_UTF8:
2876         case CODESYS_NO_CONVERSION:
2877 #ifdef DEBUG_SXEMACS
2878         case CODESYS_INTERNAL:
2879 #endif
2880         default:
2881                 break;
2882         }
2883 #endif                          /* MULE */
2884
2885         str->flags = str->ch = 0;
2886 }
2887
2888 static int
2889 encoding_rewinder(lstream_t stream)
2890 {
2891         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2892         reset_encoding_stream(str);
2893         Dynarr_reset(str->runoff);
2894         return Lstream_rewind(str->other_end);
2895 }
2896
2897 static int
2898 encoding_seekable_p(lstream_t stream)
2899 {
2900         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2901         return Lstream_seekable_p(str->other_end);
2902 }
2903
2904 static int
2905 encoding_flusher(lstream_t stream)
2906 {
2907         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2908         return Lstream_flush(str->other_end);
2909 }
2910
2911 static int
2912 encoding_closer(lstream_t stream)
2913 {
2914         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2915         if (stream->flags & LSTREAM_FL_WRITE) {
2916                 str->flags |= CODING_STATE_END;
2917                 encoding_writer(stream, 0, 0);
2918         }
2919         Dynarr_free(str->runoff);
2920         return Lstream_close(str->other_end);
2921 }
2922
2923 Lisp_Object
2924 encoding_stream_coding_system(lstream_t stream)
2925 {
2926         Lisp_Object coding_system;
2927         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2928
2929         XSETCODING_SYSTEM(coding_system, str->codesys);
2930         return coding_system;
2931 }
2932
2933 void
2934 set_encoding_stream_coding_system(lstream_t lstr, Lisp_Object codesys)
2935 {
2936         Lisp_Coding_System *cs = XCODING_SYSTEM(codesys);
2937         encoding_stream_t str = ENCODING_STREAM_DATA(lstr);
2938         str->codesys = cs;
2939         reset_encoding_stream(str);
2940 }
2941
2942 static Lisp_Object
2943 make_encoding_stream_1(lstream_t stream, Lisp_Object codesys, const char *mode)
2944 {
2945         lstream_t lstr = Lstream_new(lstream_encoding, mode);
2946         encoding_stream_t str = ENCODING_STREAM_DATA(lstr);
2947         Lisp_Object obj;
2948
2949         xzero(*str);
2950         str->runoff = Dynarr_new(unsigned_char);
2951         str->other_end = stream;
2952         set_encoding_stream_coding_system(lstr, codesys);
2953         XSETLSTREAM(obj, lstr);
2954         return obj;
2955 }
2956
2957 Lisp_Object
2958 make_encoding_input_stream(lstream_t stream, Lisp_Object codesys)
2959 {
2960         return make_encoding_stream_1(stream, codesys, "r");
2961 }
2962
2963 Lisp_Object
2964 make_encoding_output_stream(lstream_t stream, Lisp_Object codesys)
2965 {
2966         return make_encoding_stream_1(stream, codesys, "w");
2967 }
2968
2969 /* Convert N bytes of internally-formatted data stored in SRC to an
2970    external format, according to the encoding stream ENCODING.
2971    Store the encoded data into DST. */
2972
2973 static void
2974 mule_encode(lstream_t encoding, const Bufbyte * src,
2975             unsigned_char_dynarr * dst, Lstream_data_count n)
2976 {
2977         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
2978
2979         switch (CODING_SYSTEM_TYPE(str->codesys)) {
2980 #ifdef DEBUG_SXEMACS
2981         case CODESYS_INTERNAL:
2982                 Dynarr_add_many(dst, src, n);
2983                 break;
2984 #endif
2985         case CODESYS_AUTODETECT:
2986                 /* If we got this far and still haven't decided on the coding
2987                    system, then do no conversion. */
2988         case CODESYS_NO_CONVERSION:
2989                 encode_coding_no_conversion(encoding, src, dst, n);
2990                 break;
2991 #ifdef MULE
2992         case CODESYS_SHIFT_JIS:
2993                 encode_coding_sjis(encoding, src, dst, n);
2994                 break;
2995         case CODESYS_BIG5:
2996                 encode_coding_big5(encoding, src, dst, n);
2997                 break;
2998         case CODESYS_UCS4:
2999                 encode_coding_ucs4(encoding, src, dst, n);
3000                 break;
3001         case CODESYS_UTF8:
3002                 encode_coding_utf8(encoding, src, dst, n);
3003                 break;
3004         case CODESYS_CCL:
3005                 str->ccl.last_block = str->flags & CODING_STATE_END;
3006                 /* When applying ccl program to stream, MUST NOT set NULL
3007                    pointer to src.  */
3008                 ccl_driver(&str->ccl, ((src) ? src : (unsigned char *)""),
3009                            dst, n, 0, CCL_MODE_ENCODING);
3010                 break;
3011         case CODESYS_ISO2022:
3012                 encode_coding_iso2022(encoding, src, dst, n);
3013                 break;
3014 #endif                          /* MULE */
3015         default:
3016                 abort();
3017         }
3018 }
3019
3020 DEFUN("encode-coding-region", Fencode_coding_region, 3, 4, 0,   /*
3021 Encode the text between START and END using CODING-SYSTEM.
3022 This will, for example, convert Japanese characters into stuff such as
3023 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
3024 text.  BUFFER defaults to the current buffer if unspecified.
3025 */
3026       (start, end, coding_system, buffer))
3027 {
3028         Bufpos b, e;
3029         struct buffer *buf = decode_buffer(buffer, 0);
3030         Lisp_Object instream, lb_outstream, de_outstream, outstream;
3031         lstream_t istr, ostr;
3032         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
3033
3034         get_buffer_range_char(buf, start, end, &b, &e, 0);
3035
3036         barf_if_buffer_read_only(buf, b, e);
3037
3038         coding_system = Fget_coding_system(coding_system);
3039         instream = make_lisp_buffer_input_stream(buf, b, e, 0);
3040         lb_outstream = make_lisp_buffer_output_stream(buf, b, 0);
3041         de_outstream = make_decoding_output_stream(XLSTREAM(lb_outstream),
3042                                                    Fget_coding_system(Qbinary));
3043         outstream = make_encoding_output_stream(XLSTREAM(de_outstream),
3044                                                 coding_system);
3045         istr = XLSTREAM(instream);
3046         ostr = XLSTREAM(outstream);
3047         GCPRO4(instream, outstream, de_outstream, lb_outstream);
3048         /* The chain of streams looks like this:
3049
3050            [BUFFER] <----- send through
3051            ------> [ENCODE AS SPECIFIED]
3052            ------> [DECODE AS BINARY]
3053            ------> [BUFFER]
3054          */
3055         while (1) {
3056                 char tempbuf[1024];     /* some random amount */
3057                 Bufpos newpos, even_newer_pos;
3058                 Bufpos oldpos = lisp_buffer_stream_startpos(istr);
3059                 Lstream_data_count size_in_bytes =
3060                     Lstream_read(istr, tempbuf, sizeof(tempbuf));
3061
3062                 if (!size_in_bytes)
3063                         break;
3064                 newpos = lisp_buffer_stream_startpos(istr);
3065                 Lstream_write(ostr, tempbuf, size_in_bytes);
3066                 even_newer_pos = lisp_buffer_stream_startpos(istr);
3067                 buffer_delete_range(buf, even_newer_pos - (newpos - oldpos),
3068                                     even_newer_pos, 0);
3069         }
3070
3071         {
3072                 Charcount retlen =
3073                     lisp_buffer_stream_startpos(XLSTREAM(instream)) - b;
3074                 Lstream_close(istr);
3075                 Lstream_close(ostr);
3076                 UNGCPRO;
3077                 Lstream_delete(istr);
3078                 Lstream_delete(ostr);
3079                 Lstream_delete(XLSTREAM(de_outstream));
3080                 Lstream_delete(XLSTREAM(lb_outstream));
3081                 return make_int(retlen);
3082         }
3083 }
3084
3085 #ifdef MULE
3086 \f
3087 /************************************************************************/
3088 /*                          Shift-JIS methods                           */
3089 /************************************************************************/
3090
3091 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3092    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3093    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3094    encoded by "position-code + 0x80".  A character of JISX0208
3095    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3096    position-codes are divided and shifted so that it fit in the range
3097    below.
3098
3099    --- CODE RANGE of Shift-JIS ---
3100    (character set)      (range)
3101    ASCII                0x00 .. 0x7F
3102    JISX0201-Kana        0xA0 .. 0xDF
3103    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
3104             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3105    -------------------------------
3106
3107 */
3108
3109 /* Is this the first byte of a Shift-JIS two-byte char? */
3110
3111 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3112   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3113
3114 /* Is this the second byte of a Shift-JIS two-byte char? */
3115
3116 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3117   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3118
3119 #define BYTE_SJIS_KATAKANA_P(c) \
3120   ((c) >= 0xA1 && (c) <= 0xDF)
3121
3122 static int
3123 detect_coding_sjis(struct detection_state *st, const Extbyte * src,
3124                    Lstream_data_count n)
3125 {
3126         while (n--) {
3127                 const unsigned char c = *(const unsigned char *)src++;
3128                 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3129                         return 0;
3130                 if (st->shift_jis.in_second_byte) {
3131                         st->shift_jis.in_second_byte = 0;
3132                         if (c < 0x40)
3133                                 return 0;
3134                 } else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3135                         st->shift_jis.in_second_byte = 1;
3136         }
3137         return CODING_CATEGORY_SHIFT_JIS_MASK;
3138 }
3139
3140 /* Convert Shift-JIS data to internal format. */
3141
3142 static void
3143 decode_coding_sjis(lstream_t decoding, const Extbyte * src,
3144                    unsigned_char_dynarr * dst, Lstream_data_count n)
3145 {
3146         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3147         unsigned int flags = str->flags;
3148         unsigned int ch = str->ch;
3149         eol_type_t eol_type = str->eol_type;
3150
3151         while (n--) {
3152                 const unsigned char c = *(const unsigned char *)src++;
3153
3154                 if (ch) {
3155                         /* Previous character was first byte of Shift-JIS Kanji
3156                            char. */
3157                         if (BYTE_SJIS_TWO_BYTE_2_P(c)) {
3158                                 unsigned char e1, e2;
3159
3160                                 Dynarr_add(dst, LEADING_BYTE_JAPANESE_JISX0208);
3161                                 DECODE_SJIS(ch, c, e1, e2);
3162                                 Dynarr_add(dst, e1);
3163                                 Dynarr_add(dst, e2);
3164                         } else {
3165                                 DECODE_ADD_BINARY_CHAR(ch, dst);
3166                                 DECODE_ADD_BINARY_CHAR(c, dst);
3167                         }
3168                         ch = 0;
3169                 } else {
3170                         DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3171                         if (BYTE_SJIS_TWO_BYTE_1_P(c))
3172                                 ch = c;
3173                         else if (BYTE_SJIS_KATAKANA_P(c)) {
3174                                 Dynarr_add(dst, LEADING_BYTE_KATAKANA_JISX0201);
3175                                 Dynarr_add(dst, c);
3176                         } else
3177                                 DECODE_ADD_BINARY_CHAR(c, dst);
3178                 }
3179               label_continue_loop:;
3180         }
3181
3182         DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
3183
3184         str->flags = flags;
3185         str->ch = ch;
3186 }
3187
3188 /* Convert internally-formatted data to Shift-JIS. */
3189
3190 static void
3191 encode_coding_sjis(lstream_t encoding, const Bufbyte * src,
3192                    unsigned_char_dynarr * dst, Lstream_data_count n)
3193 {
3194         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3195         unsigned int flags = str->flags;
3196         unsigned int ch = str->ch;
3197         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3198
3199         while (n--) {
3200                 Bufbyte c = *src++;
3201                 if (c == '\n') {
3202                         if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3203                                 Dynarr_add(dst, '\r');
3204                         if (eol_type != EOL_CR)
3205                                 Dynarr_add(dst, '\n');
3206                         ch = 0;
3207                 } else if (BYTE_ASCII_P(c)) {
3208                         Dynarr_add(dst, c);
3209                         ch = 0;
3210                 } else if (BUFBYTE_LEADING_BYTE_P(c))
3211                         ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
3212                               c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3213                               c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
3214                 else if (ch) {
3215                         if (ch == LEADING_BYTE_KATAKANA_JISX0201) {
3216                                 Dynarr_add(dst, c);
3217                                 ch = 0;
3218                         } else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3219                                    ch == LEADING_BYTE_JAPANESE_JISX0208)
3220                                 ch = c;
3221                         else {
3222                                 /* j1 is bessel j1 function,
3223                                  * so we use something else */
3224                                 /* unsigned char j1, j2; */
3225                                 unsigned char tt1, tt2;
3226
3227                                 ENCODE_SJIS(ch, c, tt1, tt2);
3228                                 Dynarr_add(dst, tt1);
3229                                 Dynarr_add(dst, tt2);
3230                                 ch = 0;
3231                         }
3232                 }
3233         }
3234
3235         str->flags = flags;
3236         str->ch = ch;
3237 }
3238
3239 DEFUN("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3240 Decode a JISX0208 character of Shift-JIS coding-system.
3241 CODE is the character code in Shift-JIS as a cons of type bytes.
3242 Return the corresponding character.
3243 */
3244       (code))
3245 {
3246         unsigned char c1, c2, s1, s2;
3247
3248         CHECK_CONS(code);
3249         CHECK_INT(XCAR(code));
3250         CHECK_INT(XCDR(code));
3251         s1 = XINT(XCAR(code));
3252         s2 = XINT(XCDR(code));
3253         if (BYTE_SJIS_TWO_BYTE_1_P(s1) && BYTE_SJIS_TWO_BYTE_2_P(s2)) {
3254                 DECODE_SJIS(s1, s2, c1, c2);
3255                 return make_char(MAKE_CHAR(Vcharset_japanese_jisx0208,
3256                                            c1 & 0x7F, c2 & 0x7F));
3257         } else
3258                 return Qnil;
3259 }
3260
3261 DEFUN("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3262 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3263 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3264 */
3265       (character))
3266 {
3267         Lisp_Object charset;
3268         int c1, c2, s1, s2;
3269
3270         CHECK_CHAR_COERCE_INT(character);
3271         BREAKUP_CHAR(XCHAR(character), charset, c1, c2);
3272         if (EQ(charset, Vcharset_japanese_jisx0208)) {
3273                 ENCODE_SJIS(c1 | 0x80, c2 | 0x80, s1, s2);
3274                 return Fcons(make_int(s1), make_int(s2));
3275         } else
3276                 return Qnil;
3277 }
3278 \f
3279 /************************************************************************/
3280 /*                            Big5 methods                              */
3281 /************************************************************************/
3282
3283 /* BIG5 is a coding system encoding two character sets: ASCII and
3284    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3285    character set and is encoded in two-byte.
3286
3287    --- CODE RANGE of BIG5 ---
3288    (character set)      (range)
3289    ASCII                0x00 .. 0x7F
3290    Big5 (1st byte)      0xA1 .. 0xFE
3291         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3292    --------------------------
3293
3294    Since the number of characters in Big5 is larger than maximum
3295    characters in Emacs' charset (96x96), it can't be handled as one
3296    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
3297    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
3298    contains frequently used characters and the latter contains less
3299    frequently used characters.  */
3300
3301 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3302   ((c) >= 0xA1 && (c) <= 0xFE)
3303
3304 /* Is this the second byte of a Shift-JIS two-byte char? */
3305
3306 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3307   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3308
3309 /* Number of Big5 characters which have the same code in 1st byte.  */
3310
3311 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3312
3313 /* Code conversion macros.  These are macros because they are used in
3314    inner loops during code conversion.
3315
3316    Note that temporary variables in macros introduce the classic
3317    dynamic-scoping problems with variable names.  We use capital-
3318    lettered variables in the assumption that SXEmacs does not use
3319    capital letters in variables except in a very formalized way
3320    (e.g. Qstring). */
3321
3322 /* Convert Big5 code (b1, b2) into its internal string representation
3323    (lb, c1, c2). */
3324
3325 /* There is a much simpler way to split the Big5 charset into two.
3326    For the moment I'm going to leave the algorithm as-is because it
3327    claims to separate out the most-used characters into a single
3328    charset, which perhaps will lead to optimizations in various
3329    places.
3330
3331    The way the algorithm works is something like this:
3332
3333    Big5 can be viewed as a 94x157 charset, where the row is
3334    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3335    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
3336    the split between low and high column numbers is apparently
3337    meaningless; ascending rows produce less and less frequent chars.
3338    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3339    the first charset, and the upper half (0xC9 .. 0xFE) to the
3340    second.  To do the conversion, we convert the character into
3341    a single number where 0 .. 156 is the first row, 157 .. 313
3342    is the second, etc.  That way, the characters are ordered by
3343    decreasing frequency.  Then we just chop the space in two
3344    and coerce the result into a 94x94 space.
3345    */
3346
3347 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
3348 {                                                                       \
3349   int B1 = b1, B2 = b2;                                                 \
3350   unsigned int I                                                        \
3351     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
3352                                                                         \
3353   if (B1 < 0xC9)                                                        \
3354     {                                                                   \
3355       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
3356     }                                                                   \
3357   else                                                                  \
3358     {                                                                   \
3359       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
3360       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
3361     }                                                                   \
3362   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
3363   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
3364 } while (0)
3365
3366 /* Convert the internal string representation of a Big5 character
3367    (lb, c1, c2) into Big5 code (b1, b2). */
3368
3369 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
3370 {                                                                       \
3371   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
3372                                                                         \
3373   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
3374     {                                                                   \
3375       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
3376     }                                                                   \
3377   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
3378   b2 = I % BIG5_SAME_ROW;                                               \
3379   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
3380 } while (0)
3381
3382 static int
3383 detect_coding_big5(struct detection_state *st, const Extbyte * src,
3384                    Lstream_data_count n)
3385 {
3386         while (n--) {
3387                 const unsigned char c = *(const unsigned char *)src++;
3388                 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3389                     (c >= 0x80 && c <= 0xA0))
3390                         return 0;
3391                 if (st->big5.in_second_byte) {
3392                         st->big5.in_second_byte = 0;
3393                         if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3394                                 return 0;
3395                 } else if (c >= 0xA1)
3396                         st->big5.in_second_byte = 1;
3397         }
3398         return CODING_CATEGORY_BIG5_MASK;
3399 }
3400
3401 /* Convert Big5 data to internal format. */
3402
3403 static void
3404 decode_coding_big5(lstream_t decoding, const Extbyte * src,
3405                    unsigned_char_dynarr * dst, Lstream_data_count n)
3406 {
3407         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3408         unsigned int flags = str->flags;
3409         unsigned int ch = str->ch;
3410         eol_type_t eol_type = str->eol_type;
3411
3412         while (n--) {
3413                 const unsigned char c = *(const unsigned char *)src++;
3414                 if (ch) {
3415                         /* Previous character was first byte of Big5 char. */
3416                         if (BYTE_BIG5_TWO_BYTE_2_P(c)) {
3417                                 unsigned char b1, b2, b3;
3418                                 DECODE_BIG5(ch, c, b1, b2, b3);
3419                                 Dynarr_add(dst, b1);
3420                                 Dynarr_add(dst, b2);
3421                                 Dynarr_add(dst, b3);
3422                         } else {
3423                                 DECODE_ADD_BINARY_CHAR(ch, dst);
3424                                 DECODE_ADD_BINARY_CHAR(c, dst);
3425                         }
3426                         ch = 0;
3427                 } else {
3428                         DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3429                         if (BYTE_BIG5_TWO_BYTE_1_P(c))
3430                                 ch = c;
3431                         else
3432                                 DECODE_ADD_BINARY_CHAR(c, dst);
3433                 }
3434               label_continue_loop:;
3435         }
3436
3437         DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
3438
3439         str->flags = flags;
3440         str->ch = ch;
3441 }
3442
3443 /* Convert internally-formatted data to Big5. */
3444
3445 static void
3446 encode_coding_big5(lstream_t encoding, const Bufbyte * src,
3447                    unsigned_char_dynarr * dst, Lstream_data_count n)
3448 {
3449         unsigned char c;
3450         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3451         unsigned int flags = str->flags;
3452         unsigned int ch = str->ch;
3453         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3454
3455         while (n--) {
3456                 c = *src++;
3457                 if (c == '\n') {
3458                         if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3459                                 Dynarr_add(dst, '\r');
3460                         if (eol_type != EOL_CR)
3461                                 Dynarr_add(dst, '\n');
3462                 } else if (BYTE_ASCII_P(c)) {
3463                         /* ASCII. */
3464                         Dynarr_add(dst, c);
3465                 } else if (BUFBYTE_LEADING_BYTE_P(c)) {
3466                         if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3467                             c == LEADING_BYTE_CHINESE_BIG5_2) {
3468                                 /* A recognized leading byte. */
3469                                 ch = c;
3470                                 continue;       /* not done with this character. */
3471                         }
3472                         /* otherwise just ignore this character. */
3473                 } else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3474                            ch == LEADING_BYTE_CHINESE_BIG5_2) {
3475                         /* Previous char was a recognized leading byte. */
3476                         ch = (ch << 8) | c;
3477                         continue;       /* not done with this character. */
3478                 } else if (ch) {
3479                         /* Encountering second byte of a Big5 character. */
3480                         unsigned char b1, b2;
3481
3482                         ENCODE_BIG5(ch >> 8, ch & 0xFF, c, b1, b2);
3483                         Dynarr_add(dst, b1);
3484                         Dynarr_add(dst, b2);
3485                 }
3486
3487                 ch = 0;
3488         }
3489
3490         str->flags = flags;
3491         str->ch = ch;
3492 }
3493
3494 DEFUN("decode-big5-char", Fdecode_big5_char, 1, 1, 0,   /*
3495 Decode a Big5 character CODE of BIG5 coding-system.
3496 CODE is the character code in BIG5, a cons of two integers.
3497 Return the corresponding character.
3498 */
3499       (code))
3500 {
3501         unsigned char c1, c2, b1, b2;
3502
3503         CHECK_CONS(code);
3504         CHECK_INT(XCAR(code));
3505         CHECK_INT(XCDR(code));
3506         b1 = XINT(XCAR(code));
3507         b2 = XINT(XCDR(code));
3508         if (BYTE_BIG5_TWO_BYTE_1_P(b1) && BYTE_BIG5_TWO_BYTE_2_P(b2)) {
3509                 int leading_byte;
3510                 Lisp_Object charset;
3511                 DECODE_BIG5(b1, b2, leading_byte, c1, c2);
3512                 charset = CHARSET_BY_LEADING_BYTE(leading_byte);
3513                 return make_char(MAKE_CHAR(charset, c1 & 0x7F, c2 & 0x7F));
3514         } else
3515                 return Qnil;
3516 }
3517
3518 DEFUN("encode-big5-char", Fencode_big5_char, 1, 1, 0,   /*
3519 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3520 Return the corresponding character code in Big5.
3521 */
3522       (character))
3523 {
3524         Lisp_Object charset;
3525         int c1, c2, b1, b2;
3526
3527         CHECK_CHAR_COERCE_INT(character);
3528         BREAKUP_CHAR(XCHAR(character), charset, c1, c2);
3529         if (EQ(charset, Vcharset_chinese_big5_1) ||
3530             EQ(charset, Vcharset_chinese_big5_2)) {
3531                 ENCODE_BIG5(XCHARSET_LEADING_BYTE(charset), c1 | 0x80,
3532                             c2 | 0x80, b1, b2);
3533                 return Fcons(make_int(b1), make_int(b2));
3534         } else
3535                 return Qnil;
3536 }
3537 \f
3538 /************************************************************************/
3539 /*                           UCS-4 methods                              */
3540 /*                                                                      */
3541 /*  UCS-4 character codes are implemented as nonnegative integers.      */
3542 /*                                                                      */
3543 /************************************************************************/
3544
3545 DEFUN("set-ucs-char", Fset_ucs_char, 2, 2, 0,   /*
3546 Map UCS-4 code CODE to Mule character CHARACTER.
3547
3548 Return T on success, NIL on failure.
3549 */
3550       (code, character))
3551 {
3552         size_t c;
3553
3554         CHECK_CHAR(character);
3555         CHECK_NATNUM(code);
3556         c = XINT(code);
3557
3558         if (c < countof(fcd->ucs_to_mule_table)) {
3559                 fcd->ucs_to_mule_table[c] = character;
3560                 return Qt;
3561         } else
3562                 return Qnil;
3563 }
3564
3565 static Lisp_Object ucs_to_char(unsigned long code)
3566 {
3567         if (code < countof(fcd->ucs_to_mule_table)) {
3568                 return fcd->ucs_to_mule_table[code];
3569         } else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14)) {
3570                 unsigned int c;
3571
3572                 code -= 0xe00000;
3573                 c = code % (94 * 94);
3574                 return make_char
3575                     (MAKE_CHAR(CHARSET_BY_ATTRIBUTES
3576                                (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3577                                 CHARSET_LEFT_TO_RIGHT),
3578                                c / 94 + 33, c % 94 + 33));
3579         } else
3580                 return Qnil;
3581 }
3582
3583 DEFUN("ucs-char", Fucs_char, 1, 1, 0,   /*
3584 Return Mule character corresponding to UCS code CODE (a positive integer).
3585 */
3586       (code))
3587 {
3588         CHECK_NATNUM(code);
3589         return ucs_to_char(XINT(code));
3590 }
3591
3592 DEFUN("set-char-ucs", Fset_char_ucs, 2, 2, 0,   /*
3593 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3594 */
3595       (character, code))
3596 {
3597         /* #### Isn't this gilding the lily?  Fput_char_table checks its args.
3598            Fset_char_ucs is more restrictive on index arg, but should
3599            check code arg in a char_table method. */
3600         CHECK_CHAR(character);
3601         CHECK_NATNUM(code);
3602         return Fput_char_table(character, code, mule_to_ucs_table);
3603 }
3604
3605 DEFUN("char-ucs", Fchar_ucs, 1, 1, 0,   /*
3606 Return the UCS code (a positive integer) corresponding to CHARACTER.
3607 */
3608       (character))
3609 {
3610         return Fget_char_table(character, mule_to_ucs_table);
3611 }
3612
3613 /* Decode a UCS-4 character into a buffer.  If the lookup fails, use
3614    <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3615    is not found, instead.
3616    #### do something more appropriate (use blob?)
3617         Danger, Will Robinson!  Data loss.  Should we signal user? */
3618 static void decode_ucs4(unsigned long ch, unsigned_char_dynarr * dst)
3619 {
3620         Lisp_Object chr = ucs_to_char(ch);
3621
3622         if (!NILP(chr)) {
3623                 Bufbyte work[MAX_EMCHAR_LEN];
3624                 int len;
3625
3626                 ch = XCHAR(chr);
3627                 len = (ch < 128) ?
3628                     simple_set_charptr_emchar(work, ch) :
3629                     non_ascii_set_charptr_emchar(work, ch);
3630                 Dynarr_add_many(dst, work, len);
3631         } else {
3632                 Dynarr_add(dst, LEADING_BYTE_JAPANESE_JISX0208);
3633                 Dynarr_add(dst, 34 + 128);
3634                 Dynarr_add(dst, 46 + 128);
3635         }
3636 }
3637
3638 static unsigned long
3639 mule_char_to_ucs4(Lisp_Object charset, unsigned char h, unsigned char l)
3640 {
3641         Lisp_Object code
3642             = Fget_char_table(make_char(MAKE_CHAR(charset, h & 127, l & 127)),
3643                               mule_to_ucs_table);
3644
3645         if (INTP(code)) {
3646                 return XINT(code);
3647         } else if ((XCHARSET_DIMENSION(charset) == 2) &&
3648                    (XCHARSET_CHARS(charset) == 94)) {
3649                 unsigned char final = XCHARSET_FINAL(charset);
3650
3651                 if (('@' <= final) && (final < 0x7f)) {
3652                         return 0xe00000 + (final - '@') * 94 * 94
3653                             + ((h & 127) - 33) * 94 + (l & 127) - 33;
3654                 } else {
3655                         return '?';
3656                 }
3657         } else {
3658                 return '?';
3659         }
3660 }
3661
3662 static void
3663 encode_ucs4(Lisp_Object charset,
3664             unsigned char h, unsigned char l, unsigned_char_dynarr * dst)
3665 {
3666         unsigned long code = mule_char_to_ucs4(charset, h, l);
3667         Dynarr_add(dst, code >> 24);
3668         Dynarr_add(dst, (code >> 16) & 255);
3669         Dynarr_add(dst, (code >> 8) & 255);
3670         Dynarr_add(dst, code & 255);
3671 }
3672
3673 static int
3674 detect_coding_ucs4(struct detection_state *st, const Extbyte * src,
3675                    Lstream_data_count n)
3676 {
3677         while (n--) {
3678                 const unsigned char c = *(const unsigned char *)src++;
3679                 switch (st->ucs4.in_byte) {
3680                 case 0:
3681                         if (c >= 128)
3682                                 return 0;
3683                         else
3684                                 st->ucs4.in_byte++;
3685                         break;
3686                 case 3:
3687                         st->ucs4.in_byte = 0;
3688                         break;
3689                 default:
3690                         st->ucs4.in_byte++;
3691                 }
3692         }
3693         return CODING_CATEGORY_UCS4_MASK;
3694 }
3695
3696 static void
3697 decode_coding_ucs4(lstream_t decoding, const Extbyte * src,
3698                    unsigned_char_dynarr * dst, Lstream_data_count n)
3699 {
3700         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3701         unsigned int flags = str->flags;
3702         unsigned int ch = str->ch;
3703         unsigned char counter = str->counter;
3704
3705         while (n--) {
3706                 const unsigned char c = *(const unsigned char *)src++;
3707                 switch (counter) {
3708                 case 0:
3709                         ch = c;
3710                         counter = 3;
3711                         break;
3712                 case 1:
3713                         decode_ucs4((ch << 8) | c, dst);
3714                         ch = 0;
3715                         counter = 0;
3716                         break;
3717                 default:
3718                         ch = (ch << 8) | c;
3719                         counter--;
3720                 }
3721         }
3722         if (counter & CODING_STATE_END)
3723                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
3724
3725         str->flags = flags;
3726         str->ch = ch;
3727         str->counter = counter;
3728 }
3729
3730 static void
3731 encode_coding_ucs4(lstream_t encoding, const Bufbyte * src,
3732                    unsigned_char_dynarr * dst, Lstream_data_count n)
3733 {
3734         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3735         unsigned int flags = str->flags;
3736         unsigned int ch = str->ch;
3737         unsigned char char_boundary = str->iso2022.current_char_boundary;
3738         Lisp_Object charset = str->iso2022.current_charset;
3739
3740 #ifdef ENABLE_COMPOSITE_CHARS
3741         /* flags for handling composite chars.  We do a little switcharoo
3742            on the source while we're outputting the composite char. */
3743         unsigned int saved_n = 0;
3744         const unsigned char *saved_src = NULL;
3745         int in_composite = 0;
3746
3747       back_to_square_n:
3748 #endif
3749
3750         while (n--) {
3751                 unsigned char c = *src++;
3752
3753                 if (BYTE_ASCII_P(c)) {  /* Processing ASCII character */
3754                         ch = 0;
3755                         encode_ucs4(Vcharset_ascii, c, 0, dst);
3756                         char_boundary = 1;
3757                 } else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) {   /* Processing Leading Byte */
3758                         ch = 0;
3759                         charset = CHARSET_BY_LEADING_BYTE(c);
3760                         if (LEADING_BYTE_PREFIX_P(c))
3761                                 ch = c;
3762                         char_boundary = 0;
3763                 } else {        /* Processing Non-ASCII character */
3764                         char_boundary = 1;
3765                         if (EQ(charset, Vcharset_control_1)) {
3766                                 encode_ucs4(Vcharset_control_1, c, 0, dst);
3767                         } else {
3768                                 switch (XCHARSET_REP_BYTES(charset)) {
3769                                 case 2:
3770                                         encode_ucs4(charset, c, 0, dst);
3771                                         break;
3772                                 case 3:
3773                                         if (XCHARSET_PRIVATE_P(charset)) {
3774                                                 encode_ucs4(charset, c, 0, dst);
3775                                                 ch = 0;
3776                                         } else if (ch) {
3777 #ifdef ENABLE_COMPOSITE_CHARS
3778                                                 if (EQ
3779                                                     (charset,
3780                                                      Vcharset_composite)) {
3781                                                         if (in_composite) {
3782                                                                 /* #### Bother! We don't know how to
3783                                                                    handle this yet. */
3784                                                                 Dynarr_add(dst,
3785                                                                            '\0');
3786                                                                 Dynarr_add(dst,
3787                                                                            '\0');
3788                                                                 Dynarr_add(dst,
3789                                                                            '\0');
3790                                                                 Dynarr_add(dst,
3791                                                                            '~');
3792                                                         } else {
3793                                                                 Emchar emch =
3794                                                                     MAKE_CHAR
3795                                                                     (Vcharset_composite,
3796                                                                      ch & 0x7F,
3797                                                                      c & 0x7F);
3798                                                                 Lisp_Object lstr
3799                                                                     =
3800                                                                     composite_char_string
3801                                                                     (emch);
3802                                                                 saved_n = n;
3803                                                                 saved_src = src;
3804                                                                 in_composite =
3805                                                                     1;
3806                                                                 src =
3807                                                                     XSTRING_DATA
3808                                                                     (lstr);
3809                                                                 n = XSTRING_LENGTH(lstr);
3810                                                         }
3811                                                 } else
3812 #endif                          /* ENABLE_COMPOSITE_CHARS */
3813                                                 {
3814                                                         encode_ucs4(charset, ch,
3815                                                                     c, dst);
3816                                                 }
3817                                                 ch = 0;
3818                                         } else {
3819                                                 ch = c;
3820                                                 char_boundary = 0;
3821                                         }
3822                                         break;
3823                                 case 4:
3824                                         if (ch) {
3825                                                 encode_ucs4(charset, ch, c,
3826                                                             dst);
3827                                                 ch = 0;
3828                                         } else {
3829                                                 ch = c;
3830                                                 char_boundary = 0;
3831                                         }
3832                                         break;
3833                                 default:
3834                                         abort();
3835                                 }
3836                         }
3837                 }
3838         }
3839
3840 #ifdef ENABLE_COMPOSITE_CHARS
3841         if (in_composite) {
3842                 n = saved_n;
3843                 src = saved_src;
3844                 in_composite = 0;
3845                 goto back_to_square_n;  /* Wheeeeeeeee ..... */
3846         }
3847 #endif                          /* ENABLE_COMPOSITE_CHARS */
3848
3849         str->flags = flags;
3850         str->ch = ch;
3851         str->iso2022.current_char_boundary = char_boundary;
3852         str->iso2022.current_charset = charset;
3853
3854         /* Verbum caro factum est! */
3855 }
3856 \f
3857 /************************************************************************/
3858 /*                           UTF-8 methods                              */
3859 /************************************************************************/
3860
3861 static int
3862 detect_coding_utf8(struct detection_state *st, const Extbyte * src,
3863                    Lstream_data_count n)
3864 {
3865         while (n--) {
3866                 const unsigned char c = *(const unsigned char *)src++;
3867                 switch (st->utf8.in_byte) {
3868                 case 0:
3869                         if (c == ISO_CODE_ESC || c == ISO_CODE_SI
3870                             || c == ISO_CODE_SO)
3871                                 return 0;
3872                         else if (c >= 0xfc)
3873                                 st->utf8.in_byte = 5;
3874                         else if (c >= 0xf8)
3875                                 st->utf8.in_byte = 4;
3876                         else if (c >= 0xf0)
3877                                 st->utf8.in_byte = 3;
3878                         else if (c >= 0xe0)
3879                                 st->utf8.in_byte = 2;
3880                         else if (c >= 0xc0)
3881                                 st->utf8.in_byte = 1;
3882                         else if (c >= 0x80)
3883                                 return 0;
3884                         break;
3885                 default:
3886                         if ((c & 0xc0) != 0x80)
3887                                 return 0;
3888                         else
3889                                 st->utf8.in_byte--;
3890                 }
3891         }
3892         return CODING_CATEGORY_UTF8_MASK;
3893 }
3894
3895 static void
3896 decode_coding_utf8(lstream_t decoding, const Extbyte * src,
3897                    unsigned_char_dynarr * dst, Lstream_data_count n)
3898 {
3899         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3900         unsigned int flags = str->flags;
3901         unsigned int ch = str->ch;
3902         eol_type_t eol_type = str->eol_type;
3903         unsigned char counter = str->counter;
3904
3905         while (n--) {
3906                 const unsigned char c = *(const unsigned char *)src++;
3907                 switch (counter) {
3908                 case 0:
3909                         if (c >= 0xfc) {
3910                                 ch = c & 0x01;
3911                                 counter = 5;
3912                         } else if (c >= 0xf8) {
3913                                 ch = c & 0x03;
3914                                 counter = 4;
3915                         } else if (c >= 0xf0) {
3916                                 ch = c & 0x07;
3917                                 counter = 3;
3918                         } else if (c >= 0xe0) {
3919                                 ch = c & 0x0f;
3920                                 counter = 2;
3921                         } else if (c >= 0xc0) {
3922                                 ch = c & 0x1f;
3923                                 counter = 1;
3924                         } else {
3925                                 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3926                                 decode_ucs4(c, dst);
3927                         }
3928                         break;
3929                 case 1:
3930                         ch = (ch << 6) | (c & 0x3f);
3931                         decode_ucs4(ch, dst);
3932                         ch = 0;
3933                         counter = 0;
3934                         break;
3935                 default:
3936                         ch = (ch << 6) | (c & 0x3f);
3937                         counter--;
3938                 }
3939               label_continue_loop:;
3940         }
3941
3942         if (flags & CODING_STATE_END)
3943                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
3944
3945         str->flags = flags;
3946         str->ch = ch;
3947         str->counter = counter;
3948 }
3949
3950 static void
3951 encode_utf8(Lisp_Object charset,
3952             unsigned char h, unsigned char l, unsigned_char_dynarr * dst)
3953 {
3954         unsigned long code = mule_char_to_ucs4(charset, h, l);
3955         if (code <= 0x7f) {
3956                 Dynarr_add(dst, code);
3957         } else if (code <= 0x7ff) {
3958                 Dynarr_add(dst, (code >> 6) | 0xc0);
3959                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3960         } else if (code <= 0xffff) {
3961                 Dynarr_add(dst, (code >> 12) | 0xe0);
3962                 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3963                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3964         } else if (code <= 0x1fffff) {
3965                 Dynarr_add(dst, (code >> 18) | 0xf0);
3966                 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3967                 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3968                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3969         } else if (code <= 0x3ffffff) {
3970                 Dynarr_add(dst, (code >> 24) | 0xf8);
3971                 Dynarr_add(dst, ((code >> 18) & 0x3f) | 0x80);
3972                 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3973                 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3974                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3975         } else {
3976                 Dynarr_add(dst, (code >> 30) | 0xfc);
3977                 Dynarr_add(dst, ((code >> 24) & 0x3f) | 0x80);
3978                 Dynarr_add(dst, ((code >> 18) & 0x3f) | 0x80);
3979                 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3980                 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3981                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3982         }
3983 }
3984
3985 static void
3986 encode_coding_utf8(lstream_t encoding, const Bufbyte * src,
3987                    unsigned_char_dynarr * dst, Lstream_data_count n)
3988 {
3989         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3990         unsigned int flags = str->flags;
3991         unsigned int ch = str->ch;
3992         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3993         unsigned char char_boundary = str->iso2022.current_char_boundary;
3994         Lisp_Object charset = str->iso2022.current_charset;
3995
3996 #ifdef ENABLE_COMPOSITE_CHARS
3997         /* flags for handling composite chars.  We do a little switcharoo
3998            on the source while we're outputting the composite char. */
3999         unsigned int saved_n = 0;
4000         const unsigned char *saved_src = NULL;
4001         int in_composite = 0;
4002
4003       back_to_square_n:
4004 #endif                          /* ENABLE_COMPOSITE_CHARS */
4005
4006         while (n--) {
4007                 unsigned char c = *src++;
4008
4009                 if (BYTE_ASCII_P(c)) {  /* Processing ASCII character */
4010                         ch = 0;
4011                         if (c == '\n') {
4012                                 if (eol_type != EOL_LF
4013                                     && eol_type != EOL_AUTODETECT)
4014                                         Dynarr_add(dst, '\r');
4015                                 if (eol_type != EOL_CR)
4016                                         Dynarr_add(dst, c);
4017                         } else
4018                                 encode_utf8(Vcharset_ascii, c, 0, dst);
4019                         char_boundary = 1;
4020                 } else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) {   /* Processing Leading Byte */
4021                         ch = 0;
4022                         charset = CHARSET_BY_LEADING_BYTE(c);
4023                         if (LEADING_BYTE_PREFIX_P(c))
4024                                 ch = c;
4025                         char_boundary = 0;
4026                 } else {        /* Processing Non-ASCII character */
4027                         char_boundary = 1;
4028                         if (EQ(charset, Vcharset_control_1)) {
4029                                 encode_utf8(Vcharset_control_1, c, 0, dst);
4030                         } else {
4031                                 switch (XCHARSET_REP_BYTES(charset)) {
4032                                 case 2:
4033                                         encode_utf8(charset, c, 0, dst);
4034                                         break;
4035                                 case 3:
4036                                         if (XCHARSET_PRIVATE_P(charset)) {
4037                                                 encode_utf8(charset, c, 0, dst);
4038                                                 ch = 0;
4039                                         } else if (ch) {
4040 #ifdef ENABLE_COMPOSITE_CHARS
4041                                                 if (EQ
4042                                                     (charset,
4043                                                      Vcharset_composite)) {
4044                                                         if (in_composite) {
4045                                                                 /* #### Bother! We don't know how to
4046                                                                    handle this yet. */
4047                                                                 encode_utf8
4048                                                                     (Vcharset_ascii,
4049                                                                      '~', 0,
4050                                                                      dst);
4051                                                         } else {
4052                                                                 Emchar emch =
4053                                                                     MAKE_CHAR
4054                                                                     (Vcharset_composite,
4055                                                                      ch & 0x7F,
4056                                                                      c & 0x7F);
4057                                                                 Lisp_Object lstr
4058                                                                     =
4059                                                                     composite_char_string
4060                                                                     (emch);
4061                                                                 saved_n = n;
4062                                                                 saved_src = src;
4063                                                                 in_composite =
4064                                                                     1;
4065                                                                 src =
4066                                                                     XSTRING_DATA
4067                                                                     (lstr);
4068                                                                 n = XSTRING_LENGTH(lstr);
4069                                                         }
4070                                                 } else
4071 #endif                          /* ENABLE_COMPOSITE_CHARS */
4072                                                 {
4073                                                         encode_utf8(charset, ch,
4074                                                                     c, dst);
4075                                                 }
4076                                                 ch = 0;
4077                                         } else {
4078                                                 ch = c;
4079                                                 char_boundary = 0;
4080                                         }
4081                                         break;
4082                                 case 4:
4083                                         if (ch) {
4084                                                 encode_utf8(charset, ch, c,
4085                                                             dst);
4086                                                 ch = 0;
4087                                         } else {
4088                                                 ch = c;
4089                                                 char_boundary = 0;
4090                                         }
4091                                         break;
4092                                 default:
4093                                         abort();
4094                                 }
4095                         }
4096                 }
4097         }
4098
4099 #ifdef ENABLE_COMPOSITE_CHARS
4100         if (in_composite) {
4101                 n = saved_n;
4102                 src = saved_src;
4103                 in_composite = 0;
4104                 goto back_to_square_n;  /* Wheeeeeeeee ..... */
4105         }
4106 #endif
4107
4108         str->flags = flags;
4109         str->ch = ch;
4110         str->iso2022.current_char_boundary = char_boundary;
4111         str->iso2022.current_charset = charset;
4112
4113         /* Verbum caro factum est! */
4114 }
4115 \f
4116 /************************************************************************/
4117 /*                           ISO2022 methods                            */
4118 /************************************************************************/
4119
4120 /* The following note describes the coding system ISO2022 briefly.
4121    Since the intention of this note is to help understand the
4122    functions in this file, some parts are NOT ACCURATE or OVERLY
4123    SIMPLIFIED.  For thorough understanding, please refer to the
4124    original document of ISO2022.
4125
4126    ISO2022 provides many mechanisms to encode several character sets
4127    in 7-bit and 8-bit environments.  For 7-bit environments, all text
4128    is encoded using bytes less than 128.  This may make the encoded
4129    text a little bit longer, but the text passes more easily through
4130    several gateways, some of which strip off MSB (Most Signigant Bit).
4131
4132    There are two kinds of character sets: control character set and
4133    graphic character set.  The former contains control characters such
4134    as `newline' and `escape' to provide control functions (control
4135    functions are also provided by escape sequences).  The latter
4136    contains graphic characters such as 'A' and '-'.  Emacs recognizes
4137    two control character sets and many graphic character sets.
4138
4139    Graphic character sets are classified into one of the following
4140    four classes, according to the number of bytes (DIMENSION) and
4141    number of characters in one dimension (CHARS) of the set:
4142    - DIMENSION1_CHARS94
4143    - DIMENSION1_CHARS96
4144    - DIMENSION2_CHARS94
4145    - DIMENSION2_CHARS96
4146
4147    In addition, each character set is assigned an identification tag,
4148    unique for each set, called "final character" (denoted as <F>
4149    hereafter).  The <F> of each character set is decided by ECMA(*)
4150    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
4151    (0x30..0x3F are for private use only).
4152
4153    Note (*): ECMA = European Computer Manufacturers Association
4154
4155    Here are examples of graphic character set [NAME(<F>)]:
4156         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4157         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4158         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4159         o DIMENSION2_CHARS96 -- none for the moment
4160
4161    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4162         C0 [0x00..0x1F] -- control character plane 0
4163         GL [0x20..0x7F] -- graphic character plane 0
4164         C1 [0x80..0x9F] -- control character plane 1
4165         GR [0xA0..0xFF] -- graphic character plane 1
4166
4167    A control character set is directly designated and invoked to C0 or
4168    C1 by an escape sequence.  The most common case is that:
4169    - ISO646's  control character set is designated/invoked to C0, and
4170    - ISO6429's control character set is designated/invoked to C1,
4171    and usually these designations/invocations are omitted in encoded
4172    text.  In a 7-bit environment, only C0 can be used, and a control
4173    character for C1 is encoded by an appropriate escape sequence to
4174    fit into the environment.  All control characters for C1 are
4175    defined to have corresponding escape sequences.
4176
4177    A graphic character set is at first designated to one of four
4178    graphic registers (G0 through G3), then these graphic registers are
4179    invoked to GL or GR.  These designations and invocations can be
4180    done independently.  The most common case is that G0 is invoked to
4181    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
4182    these invocations and designations are omitted in encoded text.
4183    In a 7-bit environment, only GL can be used.
4184
4185    When a graphic character set of CHARS94 is invoked to GL, codes
4186    0x20 and 0x7F of the GL area work as control characters SPACE and
4187    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4188    be used.
4189
4190    There are two ways of invocation: locking-shift and single-shift.
4191    With locking-shift, the invocation lasts until the next different
4192    invocation, whereas with single-shift, the invocation affects the
4193    following character only and doesn't affect the locking-shift
4194    state.  Invocations are done by the following control characters or
4195    escape sequences:
4196
4197    ----------------------------------------------------------------------
4198    abbrev  function                  cntrl escape seq   description
4199    ----------------------------------------------------------------------
4200    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
4201    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
4202    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
4203    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
4204    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
4205    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
4206    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
4207    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
4208    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
4209    ----------------------------------------------------------------------
4210    (*) These are not used by any known coding system.
4211
4212    Control characters for these functions are defined by macros
4213    ISO_CODE_XXX in `coding.h'.
4214
4215    Designations are done by the following escape sequences:
4216    ----------------------------------------------------------------------
4217    escape sequence      description
4218    ----------------------------------------------------------------------
4219    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
4220    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
4221    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
4222    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
4223    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
4224    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
4225    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
4226    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
4227    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
4228    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
4229    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
4230    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
4231    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
4232    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
4233    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
4234    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
4235    ----------------------------------------------------------------------
4236
4237    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4238    of dimension 1, chars 94, and final character <F>, etc...
4239
4240    Note (*): Although these designations are not allowed in ISO2022,
4241    Emacs accepts them on decoding, and produces them on encoding
4242    CHARS96 character sets in a coding system which is characterized as
4243    7-bit environment, non-locking-shift, and non-single-shift.
4244
4245    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4246    '(' can be omitted.  We refer to this as "short-form" hereafter.
4247
4248    Now you may notice that there are a lot of ways for encoding the
4249    same multilingual text in ISO2022.  Actually, there exist many
4250    coding systems such as Compound Text (used in X11's inter client
4251    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4252    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4253    localized platforms), and all of these are variants of ISO2022.
4254
4255    In addition to the above, Emacs handles two more kinds of escape
4256    sequences: ISO6429's direction specification and Emacs' private
4257    sequence for specifying character composition.
4258
4259    ISO6429's direction specification takes the following form:
4260         o CSI ']'      -- end of the current direction
4261         o CSI '0' ']'  -- end of the current direction
4262         o CSI '1' ']'  -- start of left-to-right text
4263         o CSI '2' ']'  -- start of right-to-left text
4264    The control character CSI (0x9B: control sequence introducer) is
4265    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4266
4267    Character composition specification takes the following form:
4268         o ESC '0' -- start character composition
4269         o ESC '1' -- end character composition
4270    Since these are not standard escape sequences of any ISO standard,
4271    their use with these meanings is restricted to Emacs only.  */
4272
4273 static void
4274 reset_iso2022(Lisp_Object coding_system, struct iso2022_decoder *iso)
4275 {
4276         int i;
4277
4278         for (i = 0; i < 4; i++) {
4279                 if (!NILP(coding_system))
4280                         iso->charset[i] =
4281                             XCODING_SYSTEM_ISO2022_INITIAL_CHARSET
4282                             (coding_system, i);
4283                 else
4284                         iso->charset[i] = Qt;
4285                 iso->invalid_designated[i] = 0;
4286         }
4287         iso->esc = ISO_ESC_NOTHING;
4288         iso->esc_bytes_index = 0;
4289         iso->register_left = 0;
4290         iso->register_right = 1;
4291         iso->switched_dir_and_no_valid_charset_yet = 0;
4292         iso->invalid_switch_dir = 0;
4293         iso->output_direction_sequence = 0;
4294         iso->output_literally = 0;
4295 #ifdef ENABLE_COMPOSITE_CHARS
4296         if (iso->composite_chars)
4297                 Dynarr_reset(iso->composite_chars);
4298 #endif
4299 }
4300
4301 static int fit_to_be_escape_quoted(unsigned char c)
4302 {
4303         switch (c) {
4304         case ISO_CODE_ESC:
4305         case ISO_CODE_CSI:
4306         case ISO_CODE_SS2:
4307         case ISO_CODE_SS3:
4308         case ISO_CODE_SO:
4309         case ISO_CODE_SI:
4310                 return 1;
4311
4312         default:
4313                 return 0;
4314         }
4315 }
4316
4317 /* Parse one byte of an ISO2022 escape sequence.
4318    If the result is an invalid escape sequence, return 0 and
4319    do not change anything in STR.  Otherwise, if the result is
4320    an incomplete escape sequence, update ISO2022.ESC and
4321    ISO2022.ESC_BYTES and return -1.  Otherwise, update
4322    all the state variables (but not ISO2022.ESC_BYTES) and
4323    return 1.
4324
4325    If CHECK_INVALID_CHARSETS is non-zero, check for designation
4326    or invocation of an invalid character set and treat that as
4327    an unrecognized escape sequence.
4328
4329    ********************************************************************
4330
4331    #### Strategies for error annotation and coding orthogonalization
4332
4333    We really want to separate out a number of things.  Conceptually,
4334    there is a nested syntax.
4335
4336    At the top level is the ISO 2022 extension syntax, including charset
4337    designation and invocation, and certain auxiliary controls such as the
4338    ISO 6429 direction specification.  These are octet-oriented, with the
4339    single exception (AFAIK) of the "exit Unicode" sequence which uses the
4340    UTF's natural width (1 byte for UTF-7 and UTF-8, 2 bytes for UCS-2 and
4341    UTF-16, and 4 bytes for UCS-4 and UTF-32).  This will be treated as a
4342    (deprecated) special case in Unicode processing.
4343
4344    The middle layer is ISO 2022 character interpretation.  This will depend
4345    on the current state of the ISO 2022 registers, and assembles octets
4346    into the character's internal representation.
4347
4348    The lowest level is translating system control conventions.  At present
4349    this is restricted to newline translation, but one could imagine doing
4350    tab conversion or line wrapping here.  "Escape from Unicode" processing
4351    would be done at this level.
4352
4353    At each level the parser will verify the syntax.  In the case of a
4354    syntax error or warning (such as a redundant escape sequence that affects
4355    no characters), the parser will take some action, typically inserting the
4356    erroneous octets directly into the output and creating an annotation
4357    which can be used by higher level I/O to mark the affected region.
4358
4359    This should make it possible to do something sensible about separating
4360    newline convention processing from character construction, and about
4361    preventing ISO 2022 escape sequences from being recognized
4362    inappropriately.
4363
4364    The basic strategy will be to have octet classification tables, and
4365    switch processing according to the table entry.
4366
4367    It's possible that, by doing the processing with tables of functions or
4368    the like, the parser can be used for both detection and translation. */
4369
4370 static int
4371 parse_iso2022_esc(Lisp_Object codesys, struct iso2022_decoder *iso,
4372                   unsigned char c, unsigned int *flags,
4373                   int check_invalid_charsets)
4374 {
4375         /* (1) If we're at the end of a designation sequence, CS is the
4376            charset being designated and REG is the register to designate
4377            it to.
4378
4379            (2) If we're at the end of a locking-shift sequence, REG is
4380            the register to invoke and HALF (0 == left, 1 == right) is
4381            the half to invoke it into.
4382
4383            (3) If we're at the end of a single-shift sequence, REG is
4384            the register to invoke. */
4385         Lisp_Object cs = Qnil;
4386         int reg, half;
4387
4388         /* NOTE: This code does goto's all over the fucking place.
4389            The reason for this is that we're basically implementing
4390            a state machine here, and hierarchical languages like C
4391            don't really provide a clean way of doing this. */
4392
4393         if (!(*flags & CODING_STATE_ESCAPE))
4394                 /* At beginning of escape sequence; we need to reset our
4395                    escape-state variables. */
4396                 iso->esc = ISO_ESC_NOTHING;
4397
4398         iso->output_literally = 0;
4399         iso->output_direction_sequence = 0;
4400
4401         switch (iso->esc) {
4402         case ISO_ESC_NOTHING:
4403                 iso->esc_bytes_index = 0;
4404                 switch (c) {
4405                 case ISO_CODE_ESC:      /* Start escape sequence */
4406                         *flags |= CODING_STATE_ESCAPE;
4407                         iso->esc = ISO_ESC;
4408                         goto not_done;
4409
4410                 case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
4411                         *flags |= CODING_STATE_ESCAPE;
4412                         iso->esc = ISO_ESC_5_11;
4413                         goto not_done;
4414
4415                 case ISO_CODE_SO:       /* locking shift 1 */
4416                         reg = 1;
4417                         half = 0;
4418                         goto locking_shift;
4419                 case ISO_CODE_SI:       /* locking shift 0 */
4420                         reg = 0;
4421                         half = 0;
4422                         goto locking_shift;
4423
4424                 case ISO_CODE_SS2:      /* single shift */
4425                         reg = 2;
4426                         goto single_shift;
4427                 case ISO_CODE_SS3:      /* single shift */
4428                         reg = 3;
4429                         goto single_shift;
4430
4431                 default:        /* Other control characters */
4432                         return 0;
4433                 }
4434
4435         case ISO_ESC:
4436                 switch (c) {
4437           /**** single shift ****/
4438
4439                 case 'N':       /* single shift 2 */
4440                         reg = 2;
4441                         goto single_shift;
4442                 case 'O':       /* single shift 3 */
4443                         reg = 3;
4444                         goto single_shift;
4445
4446           /**** locking shift ****/
4447
4448                 case '~':       /* locking shift 1 right */
4449                         reg = 1;
4450                         half = 1;
4451                         goto locking_shift;
4452                 case 'n':       /* locking shift 2 */
4453                         reg = 2;
4454                         half = 0;
4455                         goto locking_shift;
4456                 case '}':       /* locking shift 2 right */
4457                         reg = 2;
4458                         half = 1;
4459                         goto locking_shift;
4460                 case 'o':       /* locking shift 3 */
4461                         reg = 3;
4462                         half = 0;
4463                         goto locking_shift;
4464                 case '|':       /* locking shift 3 right */
4465                         reg = 3;
4466                         half = 1;
4467                         goto locking_shift;
4468
4469 #ifdef ENABLE_COMPOSITE_CHARS
4470           /**** composite ****/
4471
4472                 case '0':
4473                         iso->esc = ISO_ESC_START_COMPOSITE;
4474                         *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4475                             CODING_STATE_COMPOSITE;
4476                         return 1;
4477
4478                 case '1':
4479                         iso->esc = ISO_ESC_END_COMPOSITE;
4480                         *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4481                             ~CODING_STATE_COMPOSITE;
4482                         return 1;
4483 #endif                          /* ENABLE_COMPOSITE_CHARS */
4484
4485           /**** directionality ****/
4486
4487                 case '[':
4488                         iso->esc = ISO_ESC_5_11;
4489                         goto not_done;
4490
4491           /**** designation ****/
4492
4493                 case '$':       /* multibyte charset prefix */
4494                         iso->esc = ISO_ESC_2_4;
4495                         goto not_done;
4496
4497                 default:
4498                         if (0x28 <= c && c <= 0x2F) {
4499                                 iso->esc =
4500                                     (enum iso_esc_flag)(c - 0x28 + ISO_ESC_2_8);
4501                                 goto not_done;
4502                         }
4503
4504                         /* This function is called with CODESYS equal to nil when
4505                            doing coding-system detection. */
4506                         if (!NILP(codesys)
4507                             && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
4508                             && fit_to_be_escape_quoted(c)) {
4509                                 iso->esc = ISO_ESC_LITERAL;
4510                                 *flags &= CODING_STATE_ISO2022_LOCK;
4511                                 return 1;
4512                         }
4513
4514                         /* bzzzt! */
4515                         return 0;
4516                 }
4517
4518       /**** directionality ****/
4519
4520         case ISO_ESC_5_11:      /* ISO6429 direction control */
4521                 if (c == ']') {
4522                         *flags &=
4523                             (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4524                         goto directionality;
4525                 }
4526                 if (c == '0')
4527                         iso->esc = ISO_ESC_5_11_0;
4528                 else if (c == '1')
4529                         iso->esc = ISO_ESC_5_11_1;
4530                 else if (c == '2')
4531                         iso->esc = ISO_ESC_5_11_2;
4532                 else
4533                         return 0;
4534                 goto not_done;
4535
4536         case ISO_ESC_5_11_0:
4537                 if (c == ']') {
4538                         *flags &=
4539                             (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4540                         goto directionality;
4541                 }
4542                 return 0;
4543
4544         case ISO_ESC_5_11_1:
4545                 if (c == ']') {
4546                         *flags =
4547                             (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4548                         goto directionality;
4549                 }
4550                 return 0;
4551
4552         case ISO_ESC_5_11_2:
4553                 if (c == ']') {
4554                         *flags =
4555                             (*flags & CODING_STATE_ISO2022_LOCK) |
4556                             CODING_STATE_R2L;
4557                         goto directionality;
4558                 }
4559                 return 0;
4560
4561         directionality:
4562                 iso->esc = ISO_ESC_DIRECTIONALITY;
4563                 /* Various junk here to attempt to preserve the direction
4564                    sequences literally in the text if they would otherwise be
4565                    swallowed due to invalid designations that don't show up as
4566                    actual charset changes in the text. */
4567                 if (iso->invalid_switch_dir) {
4568                         /* We already inserted a direction switch literally into
4569                            the text.  We assume (#### this may not be right)
4570                            that the next direction switch is the one going the
4571                            other way, and we need to output that literally as
4572                            well. */
4573                         iso->output_literally = 1;
4574                         iso->invalid_switch_dir = 0;
4575                 } else {
4576                         int jj;
4577
4578                         /* If we are in the thrall of an invalid designation,
4579                            then stick the directionality sequence literally into
4580                            the output stream so it ends up in the original text
4581                            again. */
4582                         for (jj = 0; jj < 4; jj++)
4583                                 if (iso->invalid_designated[jj])
4584                                         break;
4585                         if (jj < 4) {
4586                                 iso->output_literally = 1;
4587                                 iso->invalid_switch_dir = 1;
4588                         } else
4589                                 /* Indicate that we haven't yet seen a valid
4590                                    designation, so that if a switch-dir is
4591                                    directly followed by an invalid designation,
4592                                    both get inserted literally. */
4593                                 iso->switched_dir_and_no_valid_charset_yet = 1;
4594                 }
4595                 return 1;
4596
4597                 /**** designation ****/
4598
4599         case ISO_ESC_2_4:
4600                 if (0x28 <= c && c <= 0x2F) {
4601                         iso->esc =
4602                             (enum iso_esc_flag)(c - 0x28 + ISO_ESC_2_4_8);
4603                         goto not_done;
4604                 }
4605                 if (0x40 <= c && c <= 0x42) {
4606                         cs = CHARSET_BY_ATTRIBUTES(CHARSET_TYPE_94X94, c,
4607                                                    *flags & CODING_STATE_R2L ?
4608                                                    CHARSET_RIGHT_TO_LEFT :
4609                                                    CHARSET_LEFT_TO_RIGHT);
4610                         reg = 0;
4611                         goto designated;
4612                 }
4613                 return 0;
4614
4615                 /* list the rest */
4616         case ISO_ESC_2_8:
4617         case ISO_ESC_2_9:
4618         case ISO_ESC_2_10:
4619         case ISO_ESC_2_11:
4620         case ISO_ESC_2_12:
4621         case ISO_ESC_2_13:
4622         case ISO_ESC_2_14:
4623         case ISO_ESC_2_15:
4624         case ISO_ESC_2_4_8:
4625         case ISO_ESC_2_4_9:
4626         case ISO_ESC_2_4_10:
4627         case ISO_ESC_2_4_11:
4628         case ISO_ESC_2_4_12:
4629         case ISO_ESC_2_4_13:
4630         case ISO_ESC_2_4_14:
4631         case ISO_ESC_2_4_15:
4632         case ISO_ESC_SINGLE_SHIFT:
4633         case ISO_ESC_LOCKING_SHIFT:
4634         case ISO_ESC_DESIGNATE:
4635         case ISO_ESC_DIRECTIONALITY:
4636         case ISO_ESC_LITERAL:
4637
4638         default: {
4639                 int type = -1;
4640
4641                 if (c < '0' || c > '~')
4642                         return 0;       /* bad final byte */
4643
4644                 if (iso->esc >= ISO_ESC_2_8 && iso->esc <= ISO_ESC_2_15) {
4645                         type = ((iso->esc >= ISO_ESC_2_12) ?
4646                                 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4647                         reg = (iso->esc - ISO_ESC_2_8) & 3;
4648                 } else if (iso->esc >= ISO_ESC_2_4_8 &&
4649                            iso->esc <= ISO_ESC_2_4_15) {
4650                         type = ((iso->esc >= ISO_ESC_2_4_12) ?
4651                                 CHARSET_TYPE_96X96 :
4652                                 CHARSET_TYPE_94X94);
4653                         reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4654                 } else {
4655                         /* Can this ever be reached? -slb */
4656                         abort();
4657                         return 0;
4658                 }
4659
4660                 cs = CHARSET_BY_ATTRIBUTES(type, c,
4661                                            *flags & CODING_STATE_R2L ?
4662                                            CHARSET_RIGHT_TO_LEFT :
4663                                            CHARSET_LEFT_TO_RIGHT);
4664                 goto designated;
4665         }
4666         }
4667
4668       not_done:
4669         iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char)c;
4670         return -1;
4671
4672       single_shift:
4673         if (check_invalid_charsets && !CHARSETP(iso->charset[reg]))
4674                 /* can't invoke something that ain't there. */
4675                 return 0;
4676         iso->esc = ISO_ESC_SINGLE_SHIFT;
4677         *flags &= CODING_STATE_ISO2022_LOCK;
4678         if (reg == 2)
4679                 *flags |= CODING_STATE_SS2;
4680         else
4681                 *flags |= CODING_STATE_SS3;
4682         return 1;
4683
4684       locking_shift:
4685         if (check_invalid_charsets && !CHARSETP(iso->charset[reg]))
4686                 /* can't invoke something that ain't there. */
4687                 return 0;
4688         if (half)
4689                 iso->register_right = reg;
4690         else
4691                 iso->register_left = reg;
4692         *flags &= CODING_STATE_ISO2022_LOCK;
4693         iso->esc = ISO_ESC_LOCKING_SHIFT;
4694         return 1;
4695
4696       designated:
4697         if (NILP(cs) && check_invalid_charsets) {
4698                 iso->invalid_designated[reg] = 1;
4699                 iso->charset[reg] = Vcharset_ascii;
4700                 iso->esc = ISO_ESC_DESIGNATE;
4701                 *flags &= CODING_STATE_ISO2022_LOCK;
4702                 iso->output_literally = 1;
4703                 if (iso->switched_dir_and_no_valid_charset_yet) {
4704                         /* We encountered a switch-direction followed by an
4705                            invalid designation.  Ensure that the switch-direction
4706                            gets outputted; otherwise it will probably get eaten
4707                            when the text is written out again. */
4708                         iso->switched_dir_and_no_valid_charset_yet = 0;
4709                         iso->output_direction_sequence = 1;
4710                         /* And make sure that the switch-dir going the other
4711                            way gets outputted, as well. */
4712                         iso->invalid_switch_dir = 1;
4713                 }
4714                 return 1;
4715         }
4716         /* This function is called with CODESYS equal to nil when
4717            doing coding-system detection. */
4718         if (!NILP(codesys)) {
4719                 charset_conversion_spec_dynarr *dyn =
4720                     XCODING_SYSTEM(codesys)->iso2022.input_conv;
4721
4722                 if (dyn) {
4723                         int i;
4724
4725                         for (i = 0; i < Dynarr_length(dyn); i++) {
4726                                 struct charset_conversion_spec *spec =
4727                                     Dynarr_atp(dyn, i);
4728                                 if (EQ(cs, spec->from_charset))
4729                                         cs = spec->to_charset;
4730                         }
4731                 }
4732         }
4733
4734         iso->charset[reg] = cs;
4735         iso->esc = ISO_ESC_DESIGNATE;
4736         *flags &= CODING_STATE_ISO2022_LOCK;
4737         if (iso->invalid_designated[reg]) {
4738                 iso->invalid_designated[reg] = 0;
4739                 iso->output_literally = 1;
4740         }
4741         if (iso->switched_dir_and_no_valid_charset_yet)
4742                 iso->switched_dir_and_no_valid_charset_yet = 0;
4743         return 1;
4744 }
4745
4746 static int
4747 detect_coding_iso2022(struct detection_state *st, const Extbyte * src,
4748                       Lstream_data_count n)
4749 {
4750         int mask;
4751
4752         /* #### There are serious deficiencies in the recognition mechanism
4753            here.  This needs to be much smarter if it's going to cut it.
4754            The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4755            it should be detected as Latin-1.
4756            All the ISO2022 stuff in this file should be synced up with the
4757            code from FSF Emacs-20.4, in which Mule should be more or less stable.
4758            Perhaps we should wait till R2L works in FSF Emacs? */
4759
4760         if (!st->iso2022.initted) {
4761                 reset_iso2022(Qnil, &st->iso2022.iso);
4762                 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4763                                     CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4764                                     CODING_CATEGORY_ISO_8_1_MASK |
4765                                     CODING_CATEGORY_ISO_8_2_MASK |
4766                                     CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4767                 st->iso2022.flags = 0;
4768                 st->iso2022.high_byte_count = 0;
4769                 st->iso2022.saw_single_shift = 0;
4770                 st->iso2022.initted = 1;
4771         }
4772
4773         mask = st->iso2022.mask;
4774
4775         while (n--) {
4776                 const unsigned char c = *(const unsigned char *)src++;
4777                 if (c >= 0xA0) {
4778                         mask &= ~CODING_CATEGORY_ISO_7_MASK;
4779                         st->iso2022.high_byte_count++;
4780                 } else {
4781                         if (st->iso2022.high_byte_count
4782                             && !st->iso2022.saw_single_shift) {
4783                                 if (st->iso2022.high_byte_count & 1)
4784                                         /* odd number of high bytes; assume not iso-8-2 */
4785                                         mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4786                         }
4787                         st->iso2022.high_byte_count = 0;
4788                         st->iso2022.saw_single_shift = 0;
4789                         if (c > 0x80)
4790                                 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4791                 }
4792                 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4793                     && (BYTE_C0_P(c) || BYTE_C1_P(c))) {        /* control chars */
4794                         switch (c) {
4795                                 /* Allow and ignore control characters that you might
4796                                    reasonably see in a text file */
4797                         case '\r':
4798                         case '\n':
4799                         case '\t':
4800                         case 7: /* bell */
4801                         case 8: /* backspace */
4802                         case 11:        /* vertical tab */
4803                         case 12:        /* form feed */
4804                         case 26:        /* MS-DOS C-z junk */
4805                         case 31:        /* '^_' -- for info */
4806                                 goto label_continue_loop;
4807
4808                         default:
4809                                 break;
4810                         }
4811                 }
4812
4813                 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P(c)
4814                     || BYTE_C1_P(c)) {
4815                         if (parse_iso2022_esc(Qnil, &st->iso2022.iso, c,
4816                                               &st->iso2022.flags, 0)) {
4817                                 switch (st->iso2022.iso.esc) {
4818                                 case ISO_ESC_DESIGNATE:
4819                                         mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4820                                         mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4821                                         break;
4822                                 case ISO_ESC_LOCKING_SHIFT:
4823                                         mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4824                                         goto ran_out_of_chars;
4825                                 case ISO_ESC_SINGLE_SHIFT:
4826                                         mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4827                                         st->iso2022.saw_single_shift = 1;
4828                                         break;
4829
4830                                         /* list the rest */
4831                                 case ISO_ESC_NOTHING:
4832                                 case ISO_ESC:
4833                                 case ISO_ESC_2_4:
4834                                 case ISO_ESC_2_8:
4835                                 case ISO_ESC_2_9:
4836                                 case ISO_ESC_2_10:
4837                                 case ISO_ESC_2_11:
4838                                 case ISO_ESC_2_12:
4839                                 case ISO_ESC_2_13:
4840                                 case ISO_ESC_2_14:
4841                                 case ISO_ESC_2_15:
4842                                 case ISO_ESC_2_4_8:
4843                                 case ISO_ESC_2_4_9:
4844                                 case ISO_ESC_2_4_10:
4845                                 case ISO_ESC_2_4_11:
4846                                 case ISO_ESC_2_4_12:
4847                                 case ISO_ESC_2_4_13:
4848                                 case ISO_ESC_2_4_14:
4849                                 case ISO_ESC_2_4_15:
4850                                 case ISO_ESC_5_11:
4851                                 case ISO_ESC_5_11_0:
4852                                 case ISO_ESC_5_11_1:
4853                                 case ISO_ESC_5_11_2:
4854                                 case ISO_ESC_DIRECTIONALITY:
4855                                 case ISO_ESC_LITERAL:
4856                                 default:
4857                                         break;
4858                                 }
4859                         } else {
4860                                 mask = 0;
4861                                 goto ran_out_of_chars;
4862                         }
4863                 }
4864         label_continue_loop:;
4865         }
4866
4867 ran_out_of_chars:
4868         return mask;
4869 }
4870
4871 static int postprocess_iso2022_mask(int mask)
4872 {
4873         /* #### kind of cheesy */
4874         /* If seven-bit ISO is allowed, then assume that the encoding is
4875            entirely seven-bit and turn off the eight-bit ones. */
4876         if (mask & CODING_CATEGORY_ISO_7_MASK)
4877                 mask &= ~(CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4878                           CODING_CATEGORY_ISO_8_1_MASK |
4879                           CODING_CATEGORY_ISO_8_2_MASK);
4880         return mask;
4881 }
4882
4883 /* If FLAGS is a null pointer or specifies right-to-left motion,
4884    output a switch-dir-to-left-to-right sequence to DST.
4885    Also update FLAGS if it is not a null pointer.
4886    If INTERNAL_P is set, we are outputting in internal format and
4887    need to handle the CSI differently. */
4888
4889 static void
4890 restore_left_to_right_direction(Lisp_Coding_System * codesys,
4891                                 unsigned_char_dynarr * dst,
4892                                 unsigned int *flags, int internal_p)
4893 {
4894         if (!flags || (*flags & CODING_STATE_R2L)) {
4895                 if (CODING_SYSTEM_ISO2022_SEVEN(codesys)) {
4896                         Dynarr_add(dst, ISO_CODE_ESC);
4897                         Dynarr_add(dst, '[');
4898                 } else if (internal_p)
4899                         DECODE_ADD_BINARY_CHAR(ISO_CODE_CSI, dst);
4900                 else
4901                         Dynarr_add(dst, ISO_CODE_CSI);
4902                 Dynarr_add(dst, '0');
4903                 Dynarr_add(dst, ']');
4904                 if (flags)
4905                         *flags &= ~CODING_STATE_R2L;
4906         }
4907 }
4908
4909 /* If FLAGS is a null pointer or specifies a direction different from
4910    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4911    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4912    sequence to DST.  Also update FLAGS if it is not a null pointer.
4913    If INTERNAL_P is set, we are outputting in internal format and
4914    need to handle the CSI differently. */
4915
4916 static void
4917 ensure_correct_direction(int direction, Lisp_Coding_System * codesys,
4918                          unsigned_char_dynarr * dst, unsigned int *flags,
4919                          int internal_p)
4920 {
4921         if ((!flags || (*flags & CODING_STATE_R2L)) &&
4922             direction == CHARSET_LEFT_TO_RIGHT)
4923                 restore_left_to_right_direction(codesys, dst, flags,
4924                                                 internal_p);
4925         else if (!CODING_SYSTEM_ISO2022_NO_ISO6429(codesys)
4926                  && (!flags || !(*flags & CODING_STATE_R2L)) &&
4927                  direction == CHARSET_RIGHT_TO_LEFT) {
4928                 if (CODING_SYSTEM_ISO2022_SEVEN(codesys)) {
4929                         Dynarr_add(dst, ISO_CODE_ESC);
4930                         Dynarr_add(dst, '[');
4931                 } else if (internal_p)
4932                         DECODE_ADD_BINARY_CHAR(ISO_CODE_CSI, dst);
4933                 else
4934                         Dynarr_add(dst, ISO_CODE_CSI);
4935                 Dynarr_add(dst, '2');
4936                 Dynarr_add(dst, ']');
4937                 if (flags)
4938                         *flags |= CODING_STATE_R2L;
4939         }
4940 }
4941
4942 /* Convert ISO2022-format data to internal format. */
4943
4944 static void
4945 decode_coding_iso2022(lstream_t decoding, const Extbyte * src,
4946                       unsigned_char_dynarr * dst, Lstream_data_count n)
4947 {
4948         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
4949         unsigned int flags = str->flags;
4950         unsigned int ch = str->ch;
4951         eol_type_t eol_type = str->eol_type;
4952 #ifdef ENABLE_COMPOSITE_CHARS
4953         unsigned_char_dynarr *real_dst = dst;
4954 #endif
4955         Lisp_Object coding_system;
4956
4957         XSETCODING_SYSTEM(coding_system, str->codesys);
4958
4959 #ifdef ENABLE_COMPOSITE_CHARS
4960         if (flags & CODING_STATE_COMPOSITE)
4961                 dst = str->iso2022.composite_chars;
4962 #endif                          /* ENABLE_COMPOSITE_CHARS */
4963
4964         while (n--) {
4965                 const unsigned char c = *(const unsigned char *)src++;
4966                 if (flags & CODING_STATE_ESCAPE) {
4967                         /* Within ESC sequence */
4968                         int retval = parse_iso2022_esc(
4969                                 coding_system, &str->iso2022, c, &flags, 1);
4970
4971                         if (retval) {
4972                                 switch (str->iso2022.esc) {
4973 #ifdef ENABLE_COMPOSITE_CHARS
4974                                 case ISO_ESC_START_COMPOSITE:
4975                                         if (str->iso2022.composite_chars)
4976                                                 Dynarr_reset(str->iso2022.
4977                                                              composite_chars);
4978                                         else
4979                                                 str->iso2022.composite_chars =
4980                                                     Dynarr_new(unsigned_char);
4981                                         dst = str->iso2022.composite_chars;
4982                                         break;
4983                                 case ISO_ESC_END_COMPOSITE:
4984                                         {
4985                                                 Bufbyte comstr[MAX_EMCHAR_LEN];
4986                                                 Bytecount len;
4987                                                 Emchar emch =
4988                                                     lookup_composite_char
4989                                                     (Dynarr_atp(dst, 0),
4990                                                      Dynarr_length(dst));
4991                                                 dst = real_dst;
4992                                                 len =
4993                                                     set_charptr_emchar(comstr,
4994                                                                        emch);
4995                                                 Dynarr_add_many(dst, comstr,
4996                                                                 len);
4997                                                 break;
4998                                         }
4999 #endif                          /* ENABLE_COMPOSITE_CHARS */
5000
5001                                 case ISO_ESC_LITERAL:
5002                                         DECODE_ADD_BINARY_CHAR(c, dst);
5003                                         break;
5004
5005                                 case ISO_ESC_NOTHING:
5006                                 case ISO_ESC:
5007                                 case ISO_ESC_2_4:
5008                                 case ISO_ESC_2_8:
5009                                 case ISO_ESC_2_9:
5010                                 case ISO_ESC_2_10:
5011                                 case ISO_ESC_2_11:
5012                                 case ISO_ESC_2_12:
5013                                 case ISO_ESC_2_13:
5014                                 case ISO_ESC_2_14:
5015                                 case ISO_ESC_2_15:
5016                                 case ISO_ESC_2_4_8:
5017                                 case ISO_ESC_2_4_9:
5018                                 case ISO_ESC_2_4_10:
5019                                 case ISO_ESC_2_4_11:
5020                                 case ISO_ESC_2_4_12:
5021                                 case ISO_ESC_2_4_13:
5022                                 case ISO_ESC_2_4_14:
5023                                 case ISO_ESC_2_4_15:
5024                                 case ISO_ESC_5_11:
5025                                 case ISO_ESC_5_11_0:
5026                                 case ISO_ESC_5_11_1:
5027                                 case ISO_ESC_5_11_2:
5028                                 case ISO_ESC_SINGLE_SHIFT:
5029                                 case ISO_ESC_LOCKING_SHIFT:
5030                                 case ISO_ESC_DESIGNATE:
5031                                 case ISO_ESC_DIRECTIONALITY:
5032
5033                                 default:
5034                                         /* Everything else handled already */
5035                                         break;
5036                                 }
5037                         }
5038
5039                         /* Attempted error recovery. */
5040                         if (str->iso2022.output_direction_sequence)
5041                                 ensure_correct_direction(flags &
5042                                                          CODING_STATE_R2L ?
5043                                                          CHARSET_RIGHT_TO_LEFT :
5044                                                          CHARSET_LEFT_TO_RIGHT,
5045                                                          str->codesys, dst, 0,
5046                                                          1);
5047                         /* More error recovery. */
5048                         if (!retval || str->iso2022.output_literally) {
5049                                 /* Output the (possibly invalid) sequence */
5050                                 int i;
5051                                 for (i = 0; i < str->iso2022.esc_bytes_index;
5052                                      i++)
5053                                         DECODE_ADD_BINARY_CHAR(str->iso2022.
5054                                                                esc_bytes[i],
5055                                                                dst);
5056                                 flags &= CODING_STATE_ISO2022_LOCK;
5057                                 if (!retval)
5058                                         n++, src--;     /* Repeat the loop with the same character. */
5059                                 else {
5060                                         /* No sense in reprocessing the final byte of the
5061                                            escape sequence; it could mess things up anyway.
5062                                            Just add it now. */
5063                                         DECODE_ADD_BINARY_CHAR(c, dst);
5064                                 }
5065                         }
5066                         ch = 0;
5067                 } else if (BYTE_C0_P(c) || BYTE_C1_P(c)) {      /* Control characters */
5068
5069           /***** Error-handling *****/
5070
5071                         /* If we were in the middle of a character, dump out the
5072                            partial character. */
5073                         DECODE_OUTPUT_PARTIAL_CHAR(ch);
5074
5075                         /* If we just saw a single-shift character, dump it out.
5076                            This may dump out the wrong sort of single-shift character,
5077                            but least it will give an indication that something went
5078                            wrong. */
5079                         if (flags & CODING_STATE_SS2) {
5080                                 DECODE_ADD_BINARY_CHAR(ISO_CODE_SS2, dst);
5081                                 flags &= ~CODING_STATE_SS2;
5082                         }
5083                         if (flags & CODING_STATE_SS3) {
5084                                 DECODE_ADD_BINARY_CHAR(ISO_CODE_SS3, dst);
5085                                 flags &= ~CODING_STATE_SS3;
5086                         }
5087
5088           /***** Now handle the control characters. *****/
5089
5090                         /* Handle CR/LF */
5091                         DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5092
5093                         flags &= CODING_STATE_ISO2022_LOCK;
5094
5095                         if (!parse_iso2022_esc
5096                             (coding_system, &str->iso2022, c, &flags, 1))
5097                                 DECODE_ADD_BINARY_CHAR(c, dst);
5098                 } else {        /* Graphic characters */
5099                         Lisp_Object charset;
5100                         int lb;
5101                         int reg;
5102
5103                         DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5104
5105                         /* Now determine the charset. */
5106                         reg = ((flags & CODING_STATE_SS2) ? 2
5107                                : (flags & CODING_STATE_SS3) ? 3
5108                                : !BYTE_ASCII_P(c) ? str->iso2022.register_right
5109                                : str->iso2022.register_left);
5110                         charset = str->iso2022.charset[reg];
5111
5112                         /* Error checking: */
5113                         if (!CHARSETP(charset)
5114                             || str->iso2022.invalid_designated[reg]
5115                             ||
5116                             (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5117                              && XCHARSET_CHARS(charset) == 94))
5118                                 /* Mrmph.  We are trying to invoke a register that has no
5119                                    or an invalid charset in it, or trying to add a character
5120                                    outside the range of the charset.  Insert that char literally
5121                                    to preserve it for the output. */
5122                         {
5123                                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5124                                 DECODE_ADD_BINARY_CHAR(c, dst);
5125                         }
5126
5127                         else {
5128                                 /* Things are probably hunky-dorey. */
5129
5130                                 /* Fetch reverse charset, maybe. */
5131                                 if (((flags & CODING_STATE_R2L) &&
5132                                      XCHARSET_DIRECTION(charset) ==
5133                                      CHARSET_LEFT_TO_RIGHT)
5134                                     || (!(flags & CODING_STATE_R2L)
5135                                         && XCHARSET_DIRECTION(charset) ==
5136                                         CHARSET_RIGHT_TO_LEFT)) {
5137                                         Lisp_Object new_charset =
5138                                             XCHARSET_REVERSE_DIRECTION_CHARSET
5139                                             (charset);
5140                                         if (!NILP(new_charset))
5141                                                 charset = new_charset;
5142                                 }
5143
5144                                 lb = XCHARSET_LEADING_BYTE(charset);
5145                                 switch (XCHARSET_REP_BYTES(charset)) {
5146                                 case 1: /* ASCII */
5147                                         DECODE_OUTPUT_PARTIAL_CHAR(ch);
5148                                         Dynarr_add(dst, c & 0x7F);
5149                                         break;
5150
5151                                 case 2: /* one-byte official */
5152                                         DECODE_OUTPUT_PARTIAL_CHAR(ch);
5153                                         Dynarr_add(dst, lb);
5154                                         Dynarr_add(dst, c | 0x80);
5155                                         break;
5156
5157                                 case 3: /* one-byte private or two-byte official */
5158                                         if (XCHARSET_PRIVATE_P(charset)) {
5159                                                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5160                                                 Dynarr_add(dst,
5161                                                            PRE_LEADING_BYTE_PRIVATE_1);
5162                                                 Dynarr_add(dst, lb);
5163                                                 Dynarr_add(dst, c | 0x80);
5164                                         } else {
5165                                                 if (ch) {
5166                                                         Dynarr_add(dst, lb);
5167                                                         Dynarr_add(dst,
5168                                                                    ch | 0x80);
5169                                                         Dynarr_add(dst,
5170                                                                    c | 0x80);
5171                                                         ch = 0;
5172                                                 } else
5173                                                         ch = c;
5174                                         }
5175                                         break;
5176
5177                                 default:        /* two-byte private */
5178                                         if (ch) {
5179                                                 Dynarr_add(dst,
5180                                                            PRE_LEADING_BYTE_PRIVATE_2);
5181                                                 Dynarr_add(dst, lb);
5182                                                 Dynarr_add(dst, ch | 0x80);
5183                                                 Dynarr_add(dst, c | 0x80);
5184                                                 ch = 0;
5185                                         } else
5186                                                 ch = c;
5187                                 }
5188                         }
5189
5190                         if (!ch)
5191                                 flags &= CODING_STATE_ISO2022_LOCK;
5192                 }
5193
5194               label_continue_loop:;
5195         }
5196
5197         if (flags & CODING_STATE_END)
5198                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5199
5200         str->flags = flags;
5201         str->ch = ch;
5202 }
5203
5204 /***** ISO2022 encoder *****/
5205
5206 /* Designate CHARSET into register REG. */
5207
5208 static void
5209 iso2022_designate(Lisp_Object charset, unsigned char reg,
5210                   encoding_stream_t str, unsigned_char_dynarr * dst)
5211 {
5212         static const char inter94[] = "()*+";
5213         static const char inter96[] = ",-./";
5214         unsigned int type;
5215         unsigned char final;
5216         Lisp_Object old_charset = str->iso2022.charset[reg];
5217
5218         str->iso2022.charset[reg] = charset;
5219         if (!CHARSETP(charset))
5220                 /* charset might be an initial nil or t. */
5221                 return;
5222         type = XCHARSET_TYPE(charset);
5223         final = XCHARSET_FINAL(charset);
5224         if (!str->iso2022.force_charset_on_output[reg] &&
5225             CHARSETP(old_charset) &&
5226             XCHARSET_TYPE(old_charset) == type &&
5227             XCHARSET_FINAL(old_charset) == final)
5228                 return;
5229
5230         str->iso2022.force_charset_on_output[reg] = 0;
5231
5232         {
5233                 charset_conversion_spec_dynarr *dyn =
5234                     str->codesys->iso2022.output_conv;
5235
5236                 if (dyn) {
5237                         int i;
5238
5239                         for (i = 0; i < Dynarr_length(dyn); i++) {
5240                                 struct charset_conversion_spec *spec =
5241                                     Dynarr_atp(dyn, i);
5242                                 if (EQ(charset, spec->from_charset))
5243                                         charset = spec->to_charset;
5244                         }
5245                 }
5246         }
5247
5248         Dynarr_add(dst, ISO_CODE_ESC);
5249         switch (type) {
5250         case CHARSET_TYPE_94:
5251                 Dynarr_add(dst, inter94[reg]);
5252                 break;
5253         case CHARSET_TYPE_96:
5254                 Dynarr_add(dst, inter96[reg]);
5255                 break;
5256         case CHARSET_TYPE_94X94:
5257                 Dynarr_add(dst, '$');
5258                 if (reg != 0 || !(CODING_SYSTEM_ISO2022_SHORT(str->codesys))
5259                     || final < '@' || final > 'B')
5260                         Dynarr_add(dst, inter94[reg]);
5261                 break;
5262         case CHARSET_TYPE_96X96:
5263                 Dynarr_add(dst, '$');
5264                 Dynarr_add(dst, inter96[reg]);
5265                 break;
5266         default:
5267                 break;
5268         }
5269         Dynarr_add(dst, final);
5270 }
5271
5272 static void
5273 ensure_normal_shift(encoding_stream_t str, unsigned_char_dynarr * dst)
5274 {
5275         if (str->iso2022.register_left != 0) {
5276                 Dynarr_add(dst, ISO_CODE_SI);
5277                 str->iso2022.register_left = 0;
5278         }
5279 }
5280
5281 static void
5282 ensure_shift_out(encoding_stream_t str, unsigned_char_dynarr * dst)
5283 {
5284         if (str->iso2022.register_left != 1) {
5285                 Dynarr_add(dst, ISO_CODE_SO);
5286                 str->iso2022.register_left = 1;
5287         }
5288 }
5289
5290 /* Convert internally-formatted data to ISO2022 format. */
5291
5292 static void
5293 encode_coding_iso2022(lstream_t encoding, const Bufbyte * src,
5294                       unsigned_char_dynarr * dst, Lstream_data_count n)
5295 {
5296         unsigned char charmask, c;
5297         unsigned char char_boundary;
5298         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
5299         unsigned int flags = str->flags;
5300         unsigned int ch = str->ch;
5301         Lisp_Coding_System *codesys = str->codesys;
5302         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
5303         int i;
5304         Lisp_Object charset;
5305         int half;
5306
5307 #ifdef ENABLE_COMPOSITE_CHARS
5308         /* flags for handling composite chars.  We do a little switcharoo
5309            on the source while we're outputting the composite char. */
5310         unsigned int saved_n = 0;
5311         const unsigned char *saved_src = NULL;
5312         int in_composite = 0;
5313 #endif                          /* ENABLE_COMPOSITE_CHARS */
5314
5315         char_boundary = str->iso2022.current_char_boundary;
5316         charset = str->iso2022.current_charset;
5317         half = str->iso2022.current_half;
5318
5319 #ifdef ENABLE_COMPOSITE_CHARS
5320       back_to_square_n:
5321 #endif
5322         while (n--) {
5323                 c = *src++;
5324
5325                 if (BYTE_ASCII_P(c)) {  /* Processing ASCII character */
5326                         ch = 0;
5327
5328                         restore_left_to_right_direction(codesys, dst, &flags,
5329                                                         0);
5330
5331                         /* Make sure G0 contains ASCII */
5332                         if ((c > ' ' && c < ISO_CODE_DEL) ||
5333                             !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL(codesys)) {
5334                                 ensure_normal_shift(str, dst);
5335                                 iso2022_designate(Vcharset_ascii, 0, str, dst);
5336                         }
5337
5338                         /* If necessary, restore everything to the default state
5339                            at end-of-line */
5340                         if (c == '\n' &&
5341                             !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL(codesys))) {
5342                                 restore_left_to_right_direction(codesys, dst,
5343                                                                 &flags, 0);
5344
5345                                 ensure_normal_shift(str, dst);
5346
5347                                 for (i = 0; i < 4; i++) {
5348                                         Lisp_Object initial_charset =
5349                                             CODING_SYSTEM_ISO2022_INITIAL_CHARSET
5350                                             (codesys, i);
5351                                         iso2022_designate(initial_charset, i,
5352                                                           str, dst);
5353                                 }
5354                         }
5355                         if (c == '\n') {
5356                                 if (eol_type != EOL_LF
5357                                     && eol_type != EOL_AUTODETECT)
5358                                         Dynarr_add(dst, '\r');
5359                                 if (eol_type != EOL_CR)
5360                                         Dynarr_add(dst, c);
5361                         } else {
5362                                 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
5363                                     && fit_to_be_escape_quoted(c))
5364                                         Dynarr_add(dst, ISO_CODE_ESC);
5365                                 Dynarr_add(dst, c);
5366                         }
5367                         char_boundary = 1;
5368                 }
5369
5370                 else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) {     /* Processing Leading Byte */
5371                         ch = 0;
5372                         charset = CHARSET_BY_LEADING_BYTE(c);
5373                         if (LEADING_BYTE_PREFIX_P(c))
5374                                 ch = c;
5375                         else if (!EQ(charset, Vcharset_control_1)
5376 #ifdef ENABLE_COMPOSITE_CHARS
5377                                  && !EQ(charset, Vcharset_composite)
5378 #endif
5379                             ) {
5380                                 int reg;
5381
5382                                 ensure_correct_direction(XCHARSET_DIRECTION
5383                                                          (charset), codesys,
5384                                                          dst, &flags, 0);
5385
5386                                 /* Now determine which register to use. */
5387                                 reg = -1;
5388                                 for (i = 0; i < 4; i++) {
5389                                         if (EQ(charset, str->iso2022.charset[i])
5390                                             || EQ(charset,
5391                                                   CODING_SYSTEM_ISO2022_INITIAL_CHARSET
5392                                                   (codesys, i))) {
5393                                                 reg = i;
5394                                                 break;
5395                                         }
5396                                 }
5397
5398                                 if (reg == -1) {
5399                                         if (XCHARSET_GRAPHIC(charset) != 0) {
5400                                                 if (!NILP
5401                                                     (str->iso2022.charset[1])
5402                                                     &&
5403                                                     (!CODING_SYSTEM_ISO2022_SEVEN
5404                                                      (codesys)
5405                                                      ||
5406                                                      CODING_SYSTEM_ISO2022_LOCK_SHIFT
5407                                                      (codesys)))
5408                                                         reg = 1;
5409                                                 else if (!NILP
5410                                                          (str->iso2022.
5411                                                           charset[2]))
5412                                                         reg = 2;
5413                                                 else if (!NILP
5414                                                          (str->iso2022.
5415                                                           charset[3]))
5416                                                         reg = 3;
5417                                                 else
5418                                                         reg = 0;
5419                                         } else
5420                                                 reg = 0;
5421                                 }
5422
5423                                 iso2022_designate(charset, reg, str, dst);
5424
5425                                 /* Now invoke that register. */
5426                                 switch (reg) {
5427                                 case 0:
5428                                         ensure_normal_shift(str, dst);
5429                                         half = 0;
5430                                         break;
5431
5432                                 case 1:
5433                                         if (CODING_SYSTEM_ISO2022_SEVEN
5434                                             (codesys)) {
5435                                                 ensure_shift_out(str, dst);
5436                                                 half = 0;
5437                                         } else
5438                                                 half = 1;
5439                                         break;
5440
5441                                 case 2:
5442                                         if (CODING_SYSTEM_ISO2022_SEVEN
5443                                             (str->codesys)) {
5444                                                 Dynarr_add(dst, ISO_CODE_ESC);
5445                                                 Dynarr_add(dst, 'N');
5446                                                 half = 0;
5447                                         } else {
5448                                                 Dynarr_add(dst, ISO_CODE_SS2);
5449                                                 half = 1;
5450                                         }
5451                                         break;
5452
5453                                 case 3:
5454                                         if (CODING_SYSTEM_ISO2022_SEVEN
5455                                             (str->codesys)) {
5456                                                 Dynarr_add(dst, ISO_CODE_ESC);
5457                                                 Dynarr_add(dst, 'O');
5458                                                 half = 0;
5459                                         } else {
5460                                                 Dynarr_add(dst, ISO_CODE_SS3);
5461                                                 half = 1;
5462                                         }
5463                                         break;
5464
5465                                 default:
5466                                         abort();
5467                                 }
5468                         }
5469                         char_boundary = 0;
5470                 } else {        /* Processing Non-ASCII character */
5471                         charmask = (half == 0 ? 0x7F : 0xFF);
5472                         char_boundary = 1;
5473                         if (EQ(charset, Vcharset_control_1)) {
5474                                 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
5475                                     && fit_to_be_escape_quoted(c))
5476                                         Dynarr_add(dst, ISO_CODE_ESC);
5477                                 /* you asked for it ... */
5478                                 Dynarr_add(dst, c - 0x20);
5479                         } else {
5480                                 switch (XCHARSET_REP_BYTES(charset)) {
5481                                 case 2:
5482                                         Dynarr_add(dst, c & charmask);
5483                                         break;
5484                                 case 3:
5485                                         if (XCHARSET_PRIVATE_P(charset)) {
5486                                                 Dynarr_add(dst, c & charmask);
5487                                                 ch = 0;
5488                                         } else if (ch) {
5489 #ifdef ENABLE_COMPOSITE_CHARS
5490                                                 if (EQ
5491                                                     (charset,
5492                                                      Vcharset_composite)) {
5493                                                         if (in_composite) {
5494                                                                 /* #### Bother! We don't know how to
5495                                                                    handle this yet. */
5496                                                                 Dynarr_add(dst,
5497                                                                            '~');
5498                                                         } else {
5499                                                                 Emchar emch =
5500                                                                     MAKE_CHAR
5501                                                                     (Vcharset_composite,
5502                                                                      ch & 0x7F,
5503                                                                      c & 0x7F);
5504                                                                 Lisp_Object lstr
5505                                                                     =
5506                                                                     composite_char_string
5507                                                                     (emch);
5508                                                                 saved_n = n;
5509                                                                 saved_src = src;
5510                                                                 in_composite =
5511                                                                     1;
5512                                                                 src =
5513                                                                     XSTRING_DATA
5514                                                                     (lstr);
5515                                                                 n = XSTRING_LENGTH(lstr);
5516                                                                 Dynarr_add(dst,
5517                                                                            ISO_CODE_ESC);
5518                                                                 Dynarr_add(dst, '0');   /* start composing */
5519                                                         }
5520                                                 } else
5521 #endif                          /* ENABLE_COMPOSITE_CHARS */
5522                                                 {
5523                                                         Dynarr_add(dst,
5524                                                                    ch &
5525                                                                    charmask);
5526                                                         Dynarr_add(dst,
5527                                                                    c &
5528                                                                    charmask);
5529                                                 }
5530                                                 ch = 0;
5531                                         } else {
5532                                                 ch = c;
5533                                                 char_boundary = 0;
5534                                         }
5535                                         break;
5536                                 case 4:
5537                                         if (ch) {
5538                                                 Dynarr_add(dst, ch & charmask);
5539                                                 Dynarr_add(dst, c & charmask);
5540                                                 ch = 0;
5541                                         } else {
5542                                                 ch = c;
5543                                                 char_boundary = 0;
5544                                         }
5545                                         break;
5546                                 default:
5547                                         abort();
5548                                 }
5549                         }
5550                 }
5551         }
5552
5553 #ifdef ENABLE_COMPOSITE_CHARS
5554         if (in_composite) {
5555                 n = saved_n;
5556                 src = saved_src;
5557                 in_composite = 0;
5558                 Dynarr_add(dst, ISO_CODE_ESC);
5559                 Dynarr_add(dst, '1');   /* end composing */
5560                 goto back_to_square_n;  /* Wheeeeeeeee ..... */
5561         }
5562 #endif                          /* ENABLE_COMPOSITE_CHARS */
5563
5564         if (char_boundary && flags & CODING_STATE_END) {
5565                 restore_left_to_right_direction(codesys, dst, &flags, 0);
5566                 ensure_normal_shift(str, dst);
5567                 for (i = 0; i < 4; i++) {
5568                         Lisp_Object initial_charset =
5569                             CODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, i);
5570                         iso2022_designate(initial_charset, i, str, dst);
5571                 }
5572         }
5573
5574         str->flags = flags;
5575         str->ch = ch;
5576         str->iso2022.current_char_boundary = char_boundary;
5577         str->iso2022.current_charset = charset;
5578         str->iso2022.current_half = half;
5579
5580         /* Verbum caro factum est! */
5581 }
5582 #endif                          /* MULE */
5583 \f
5584 /************************************************************************/
5585 /*                     No-conversion methods                            */
5586 /************************************************************************/
5587
5588 /* This is used when reading in "binary" files -- i.e. files that may
5589    contain all 256 possible byte values and that are not to be
5590    interpreted as being in any particular decoding. */
5591 static void
5592 decode_coding_no_conversion(lstream_t decoding, const Extbyte * src,
5593                             unsigned_char_dynarr * dst, Lstream_data_count n)
5594 {
5595         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
5596         unsigned int flags = str->flags;
5597         unsigned int ch = str->ch;
5598         eol_type_t eol_type = str->eol_type;
5599
5600         while (n--) {
5601                 const unsigned char c = *(const unsigned char *)src++;
5602
5603                 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5604                 DECODE_ADD_BINARY_CHAR(c, dst);
5605         label_continue_loop:;
5606         }
5607
5608         DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
5609
5610         str->flags = flags;
5611         str->ch = ch;
5612 }
5613
5614 static void
5615 encode_coding_no_conversion(lstream_t encoding, const Bufbyte * src,
5616                             unsigned_char_dynarr * dst, Lstream_data_count n)
5617 {
5618         unsigned char c;
5619         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
5620         unsigned int flags = str->flags;
5621         unsigned int ch = str->ch;
5622         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
5623
5624         while (n--) {
5625                 c = *src++;
5626                 if (c == '\n') {
5627                         if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5628                                 Dynarr_add(dst, '\r');
5629                         if (eol_type != EOL_CR)
5630                                 Dynarr_add(dst, '\n');
5631                         ch = 0;
5632                 } else if (BYTE_ASCII_P(c)) {
5633                         assert(ch == 0);
5634                         Dynarr_add(dst, c);
5635                 } else if (BUFBYTE_LEADING_BYTE_P(c)) {
5636                         assert(ch == 0);
5637                         if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5638                             c == LEADING_BYTE_CONTROL_1)
5639                                 ch = c;
5640                         else
5641                                 Dynarr_add(dst, '~');   /* untranslatable character */
5642                 } else {
5643                         if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5644                                 Dynarr_add(dst, c);
5645                         else if (ch == LEADING_BYTE_CONTROL_1) {
5646                                 assert(c < 0xC0);
5647                                 Dynarr_add(dst, c - 0x20);
5648                         }
5649                         /* else it should be the second or third byte of an
5650                            untranslatable character, so ignore it */
5651                         ch = 0;
5652                 }
5653         }
5654
5655         str->flags = flags;
5656         str->ch = ch;
5657 }
5658 \f
5659 /************************************************************************/
5660 /*                             Initialization                           */
5661 /************************************************************************/
5662
5663 void syms_of_file_coding(void)
5664 {
5665         INIT_LRECORD_IMPLEMENTATION(coding_system);
5666
5667         DEFERROR_STANDARD(Qcoding_system_error, Qio_error);
5668
5669         DEFSUBR(Fcoding_system_p);
5670         DEFSUBR(Ffind_coding_system);
5671         DEFSUBR(Fget_coding_system);
5672         DEFSUBR(Fcoding_system_list);
5673         DEFSUBR(Fcoding_system_name);
5674         DEFSUBR(Fmake_coding_system);
5675         DEFSUBR(Fcopy_coding_system);
5676         DEFSUBR(Fcoding_system_canonical_name_p);
5677         DEFSUBR(Fcoding_system_alias_p);
5678         DEFSUBR(Fcoding_system_aliasee);
5679         DEFSUBR(Fdefine_coding_system_alias);
5680         DEFSUBR(Fsubsidiary_coding_system);
5681
5682         DEFSUBR(Fcoding_system_type);
5683         DEFSUBR(Fcoding_system_doc_string);
5684 #ifdef MULE
5685         DEFSUBR(Fcoding_system_charset);
5686 #endif
5687         DEFSUBR(Fcoding_system_property);
5688
5689         DEFSUBR(Fcoding_category_list);
5690         DEFSUBR(Fset_coding_priority_list);
5691         DEFSUBR(Fcoding_priority_list);
5692         DEFSUBR(Fset_coding_category_system);
5693         DEFSUBR(Fcoding_category_system);
5694
5695         DEFSUBR(Fdetect_coding_region);
5696         DEFSUBR(Fdecode_coding_region);
5697         DEFSUBR(Fencode_coding_region);
5698 #ifdef MULE
5699         DEFSUBR(Fdecode_shift_jis_char);
5700         DEFSUBR(Fencode_shift_jis_char);
5701         DEFSUBR(Fdecode_big5_char);
5702         DEFSUBR(Fencode_big5_char);
5703         DEFSUBR(Fset_ucs_char);
5704         DEFSUBR(Fucs_char);
5705         DEFSUBR(Fset_char_ucs);
5706         DEFSUBR(Fchar_ucs);
5707 #endif                          /* MULE */
5708         defsymbol(&Qcoding_systemp, "coding-system-p");
5709         defsymbol(&Qno_conversion, "no-conversion");
5710         defsymbol(&Qraw_text, "raw-text");
5711 #ifdef MULE
5712         defsymbol(&Qbig5, "big5");
5713         defsymbol(&Qshift_jis, "shift-jis");
5714         defsymbol(&Qucs4, "ucs-4");
5715         defsymbol(&Qutf8, "utf-8");
5716         defsymbol(&Qccl, "ccl");
5717         defsymbol(&Qiso2022, "iso2022");
5718 #endif                          /* MULE */
5719         defsymbol(&Qmnemonic, "mnemonic");
5720         defsymbol(&Qeol_type, "eol-type");
5721         defsymbol(&Qpost_read_conversion, "post-read-conversion");
5722         defsymbol(&Qpre_write_conversion, "pre-write-conversion");
5723
5724         defsymbol(&Qcr, "cr");
5725         defsymbol(&Qlf, "lf");
5726         defsymbol(&Qcrlf, "crlf");
5727         defsymbol(&Qeol_cr, "eol-cr");
5728         defsymbol(&Qeol_lf, "eol-lf");
5729         defsymbol(&Qeol_crlf, "eol-crlf");
5730 #ifdef MULE
5731         defsymbol(&Qcharset_g0, "charset-g0");
5732         defsymbol(&Qcharset_g1, "charset-g1");
5733         defsymbol(&Qcharset_g2, "charset-g2");
5734         defsymbol(&Qcharset_g3, "charset-g3");
5735         defsymbol(&Qforce_g0_on_output, "force-g0-on-output");
5736         defsymbol(&Qforce_g1_on_output, "force-g1-on-output");
5737         defsymbol(&Qforce_g2_on_output, "force-g2-on-output");
5738         defsymbol(&Qforce_g3_on_output, "force-g3-on-output");
5739         defsymbol(&Qno_iso6429, "no-iso6429");
5740         defsymbol(&Qinput_charset_conversion, "input-charset-conversion");
5741         defsymbol(&Qoutput_charset_conversion, "output-charset-conversion");
5742
5743         defsymbol(&Qno_ascii_eol, "no-ascii-eol");
5744         defsymbol(&Qno_ascii_cntl, "no-ascii-cntl");
5745         defsymbol(&Qseven, "seven");
5746         defsymbol(&Qlock_shift, "lock-shift");
5747         defsymbol(&Qescape_quoted, "escape-quoted");
5748 #endif                          /* MULE */
5749         defsymbol(&Qencode, "encode");
5750         defsymbol(&Qdecode, "decode");
5751
5752 #ifdef MULE
5753         defsymbol(&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5754                   "shift-jis");
5755         defsymbol(&coding_category_symbol[CODING_CATEGORY_BIG5], "big5");
5756         defsymbol(&coding_category_symbol[CODING_CATEGORY_UCS4], "ucs-4");
5757         defsymbol(&coding_category_symbol[CODING_CATEGORY_UTF8], "utf-8");
5758         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_7], "iso-7");
5759         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5760                   "iso-8-designate");
5761         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_1], "iso-8-1");
5762         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_2], "iso-8-2");
5763         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5764                   "iso-lock-shift");
5765 #endif                          /* MULE */
5766         defsymbol(&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5767                   "no-conversion");
5768 }
5769
5770 void lstream_type_create_file_coding(void)
5771 {
5772         LSTREAM_HAS_METHOD(decoding, reader);
5773         LSTREAM_HAS_METHOD(decoding, writer);
5774         LSTREAM_HAS_METHOD(decoding, rewinder);
5775         LSTREAM_HAS_METHOD(decoding, seekable_p);
5776         LSTREAM_HAS_METHOD(decoding, flusher);
5777         LSTREAM_HAS_METHOD(decoding, closer);
5778         LSTREAM_HAS_METHOD(decoding, marker);
5779
5780         LSTREAM_HAS_METHOD(encoding, reader);
5781         LSTREAM_HAS_METHOD(encoding, writer);
5782         LSTREAM_HAS_METHOD(encoding, rewinder);
5783         LSTREAM_HAS_METHOD(encoding, seekable_p);
5784         LSTREAM_HAS_METHOD(encoding, flusher);
5785         LSTREAM_HAS_METHOD(encoding, closer);
5786         LSTREAM_HAS_METHOD(encoding, marker);
5787 }
5788
5789 void vars_of_file_coding(void)
5790 {
5791         int i;
5792
5793         fcd = xnew(struct file_coding_dump);
5794         dump_add_root_struct_ptr(&fcd, &fcd_description);
5795
5796         /* Initialize to something reasonable ... */
5797         for (i = 0; i < CODING_CATEGORY_LAST; i++) {
5798                 fcd->coding_category_system[i] = Qnil;
5799                 fcd->coding_category_by_priority[i] = i;
5800         }
5801
5802         Fprovide(intern("file-coding"));
5803
5804         DEFVAR_LISP("keyboard-coding-system", &Vkeyboard_coding_system  /*
5805 Coding system used for TTY keyboard input.
5806 Not used under a windowing system.
5807                                                                          */ );
5808         Vkeyboard_coding_system = Qnil;
5809
5810         DEFVAR_LISP("terminal-coding-system", &Vterminal_coding_system  /*
5811 Coding system used for TTY display output.
5812 Not used under a windowing system.
5813                                                                          */ );
5814         Vterminal_coding_system = Qnil;
5815
5816         DEFVAR_LISP("coding-system-for-read", &Vcoding_system_for_read  /*
5817 Overriding coding system used when reading from a file or process.
5818 You should bind this variable with `let', but do not set it globally.
5819 If this is non-nil, it specifies the coding system that will be used
5820 to decode input on read operations, such as from a file or process.
5821 It overrides `buffer-file-coding-system-for-read',
5822 `insert-file-contents-pre-hook', etc.  Use those variables instead of
5823 this one for permanent changes to the environment.  */ );
5824         Vcoding_system_for_read = Qnil;
5825
5826         DEFVAR_LISP("coding-system-for-write", &Vcoding_system_for_write        /*
5827 Overriding coding system used when writing to a file or process.
5828 You should bind this variable with `let', but do not set it globally.
5829 If this is non-nil, it specifies the coding system that will be used
5830 to encode output for write operations, such as to a file or process.
5831 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
5832 Use those variables instead of this one for permanent changes to the
5833 environment.  */ );
5834         Vcoding_system_for_write = Qnil;
5835
5836         DEFVAR_LISP("file-name-coding-system", &Vfile_name_coding_system        /*
5837 Coding system used to convert pathnames when accessing files.
5838                                                                                  */ );
5839         Vfile_name_coding_system = Qnil;
5840
5841         DEFVAR_BOOL("enable-multibyte-characters", &enable_multibyte_characters /*
5842 Non-nil means the buffer contents are regarded as multi-byte form
5843 of characters, not a binary code.  This affects the display, file I/O,
5844 and behaviors of various editing commands.
5845
5846 Setting this to nil does not do anything.
5847                                                                                  */ );
5848         enable_multibyte_characters = 1;
5849 }
5850
5851 void complex_vars_of_file_coding(void)
5852 {
5853         staticpro(&Vcoding_system_hash_table);
5854         Vcoding_system_hash_table =
5855             make_lisp_hash_table(50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5856
5857         the_codesys_prop_dynarr = Dynarr_new(codesys_prop);
5858         dump_add_root_struct_ptr(&the_codesys_prop_dynarr,
5859                                  &codesys_prop_dynarr_description);
5860
5861 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
5862 {                                               \
5863   struct codesys_prop csp;                      \
5864   csp.sym = (Sym);                              \
5865   csp.prop_type = (Prop_Type);                  \
5866   Dynarr_add (the_codesys_prop_dynarr, csp);    \
5867 } while (0)
5868
5869         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qmnemonic);
5870         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_type);
5871         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_cr);
5872         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_crlf);
5873         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_lf);
5874         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5875         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5876 #ifdef MULE
5877         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g0);
5878         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g1);
5879         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g2);
5880         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g3);
5881         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5882         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5883         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5884         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5885         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qshort);
5886         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_ascii_eol);
5887         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5888         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qseven);
5889         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qlock_shift);
5890         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_iso6429);
5891         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qescape_quoted);
5892         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5893         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5894
5895         DEFINE_CODESYS_PROP(CODESYS_PROP_CCL, Qencode);
5896         DEFINE_CODESYS_PROP(CODESYS_PROP_CCL, Qdecode);
5897 #endif                          /* MULE */
5898         /* Need to create this here or we're really screwed. */
5899         Fmake_coding_system
5900             (Qraw_text, Qno_conversion,
5901              build_string
5902              ("Raw text, which means it converts only line-break-codes."),
5903              list2(Qmnemonic, build_string("Raw")));
5904
5905         Fmake_coding_system
5906             (Qbinary, Qno_conversion,
5907              build_string("Binary, which means it does not convert anything."),
5908              list4(Qeol_type, Qlf, Qmnemonic, build_string("Binary")));
5909
5910         Fdefine_coding_system_alias(Qno_conversion, Qraw_text);
5911
5912         Fdefine_coding_system_alias(Qfile_name, Qbinary);
5913
5914         Fdefine_coding_system_alias(Qterminal, Qbinary);
5915         Fdefine_coding_system_alias(Qkeyboard, Qbinary);
5916
5917         /* Need this for bootstrapping */
5918         fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5919             Fget_coding_system(Qraw_text);
5920
5921 #ifdef MULE
5922         {
5923                 size_t i;
5924
5925                 for (i = 0; i < countof(fcd->ucs_to_mule_table); i++)
5926                         fcd->ucs_to_mule_table[i] = Qnil;
5927         }
5928         staticpro(&mule_to_ucs_table);
5929         mule_to_ucs_table = Fmake_char_table(Qgeneric);
5930 #endif                          /* MULE */
5931 }