Merge branch 'for-steve' into njsf-dbus
[sxemacs] / src / mule / file-coding.c
1 /* Code conversion functions.
2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3    Copyright (C) 1995 Sun Microsystems, Inc.
4
5 This file is part of SXEmacs
6
7 SXEmacs is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
11
12 SXEmacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program.  If not, see <http://www.gnu.org/licenses/>. */
19
20
21 /* Synched up with: Mule 2.3.   Not in FSF. */
22
23 /* Rewritten by Ben Wing <ben@xemacs.org>. */
24
25 #include <config.h>
26 #include "lisp.h"
27
28 #include "buffer.h"
29 #include "elhash.h"
30 #include "ui/insdel.h"
31 #include "lstream.h"
32 #include "opaque.h"
33 #ifdef MULE
34 #include "mule-ccl.h"
35 #include "chartab.h"
36 #endif
37 #include "file-coding.h"
38
39 Lisp_Object Qcoding_system_error;
40
41 Lisp_Object Vkeyboard_coding_system;
42 Lisp_Object Vterminal_coding_system;
43 Lisp_Object Vcoding_system_for_read;
44 Lisp_Object Vcoding_system_for_write;
45 Lisp_Object Vfile_name_coding_system;
46
47 /* Table of symbols identifying each coding category. */
48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
49
50 struct file_coding_dump {
51         /* Coding system currently associated with each coding category. */
52         Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
53
54         /* Table of all coding categories in decreasing order of priority.
55            This describes a permutation of the possible coding categories. */
56         int coding_category_by_priority[CODING_CATEGORY_LAST];
57
58 #ifdef MULE
59         Lisp_Object ucs_to_mule_table[65536];
60 #endif
61 } *fcd;
62
63 static const struct lrecord_description fcd_description_1[] = {
64         {XD_LISP_OBJECT_ARRAY,
65          offsetof(struct file_coding_dump, coding_category_system),
66          CODING_CATEGORY_LAST},
67 #ifdef MULE
68         {XD_LISP_OBJECT_ARRAY,
69          offsetof(struct file_coding_dump, ucs_to_mule_table),
70          countof(fcd->ucs_to_mule_table)},
71 #endif
72         {XD_END}
73 };
74
75 static const struct struct_description fcd_description = {
76         sizeof(struct file_coding_dump),
77         fcd_description_1
78 };
79
80 Lisp_Object mule_to_ucs_table;
81
82 Lisp_Object Qcoding_systemp;
83
84 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
85 /* Qinternal in general.c */
86
87 Lisp_Object Qmnemonic, Qeol_type;
88 Lisp_Object Qcr, Qcrlf, Qlf;
89 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
90 Lisp_Object Qpost_read_conversion;
91 Lisp_Object Qpre_write_conversion;
92
93 #ifdef MULE
94 Lisp_Object Qucs4, Qutf8;
95 Lisp_Object Qbig5, Qshift_jis;
96 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
97 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
98 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
99 Lisp_Object Qno_iso6429;
100 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
101 Lisp_Object Qescape_quoted;
102 Lisp_Object Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
103 #endif
104 Lisp_Object Qencode, Qdecode;
105
106 Lisp_Object Vcoding_system_hash_table;
107
108 int enable_multibyte_characters;
109
110 #ifdef MULE
111 /* Additional information used by the ISO2022 decoder and detector. */
112 struct iso2022_decoder {
113         /* CHARSET holds the character sets currently assigned to the G0
114            through G3 variables.  It is initialized from the array
115            INITIAL_CHARSET in CODESYS. */
116         Lisp_Object charset[4];
117
118         /* Which registers are currently invoked into the left (GL) and
119            right (GR) halves of the 8-bit encoding space? */
120         int register_left, register_right;
121
122         /* ISO_ESC holds a value indicating part of an escape sequence
123            that has already been seen. */
124         enum iso_esc_flag esc;
125
126         /* This records the bytes we've seen so far in an escape sequence,
127            in case the sequence is invalid (we spit out the bytes unchanged). */
128         unsigned char esc_bytes[8];
129
130         /* Index for next byte to store in ISO escape sequence. */
131         int esc_bytes_index;
132
133 #ifdef ENABLE_COMPOSITE_CHARS
134         /* Stuff seen so far when composing a string. */
135         unsigned_char_dynarr *composite_chars;
136 #endif
137
138         /* If we saw an invalid designation sequence for a particular
139            register, we flag it here and switch to ASCII.  The next time we
140            see a valid designation for this register, we turn off the flag
141            and do the designation normally, but pretend the sequence was
142            invalid.  The effect of all this is that (most of the time) the
143            escape sequences for both the switch to the unknown charset, and
144            the switch back to the known charset, get inserted literally into
145            the buffer and saved out as such.  The hope is that we can
146            preserve the escape sequences so that the resulting written out
147            file makes sense.  If we don't do any of this, the designation
148            to the invalid charset will be preserved but that switch back
149            to the known charset will probably get eaten because it was
150            the same charset that was already present in the register. */
151         unsigned char invalid_designated[4];
152
153         /* We try to do similar things as above for direction-switching
154            sequences.  If we encountered a direction switch while an
155            invalid designation was present, or an invalid designation
156            just after a direction switch (i.e. no valid designation
157            encountered yet), we insert the direction-switch escape
158            sequence literally into the output stream, and later on
159            insert the corresponding direction-restoring escape sequence
160            literally also. */
161         unsigned int switched_dir_and_no_valid_charset_yet:1;
162         unsigned int invalid_switch_dir:1;
163
164         /* Tells the decoder to output the escape sequence literally
165            even though it was valid.  Used in the games we play to
166            avoid lossage when we encounter invalid designations. */
167         unsigned int output_literally:1;
168         /* We encountered a direction switch followed by an invalid
169            designation.  We didn't output the direction switch
170            literally because we didn't know about the invalid designation;
171            but we have to do so now. */
172         unsigned int output_direction_sequence:1;
173 };
174 #endif                          /* MULE */
175 EXFUN(Fcopy_coding_system, 2);
176 #ifdef MULE
177 struct detection_state;
178 static int detect_coding_sjis(struct detection_state *st,
179                               const Extbyte * src, Lstream_data_count n);
180 static void decode_coding_sjis(lstream_t decoding, const Extbyte * src,
181                                unsigned_char_dynarr * dst,
182                                Lstream_data_count n);
183 static void encode_coding_sjis(lstream_t encoding, const Bufbyte * src,
184                                unsigned_char_dynarr * dst,
185                                Lstream_data_count n);
186 static int detect_coding_big5(struct detection_state *st, const Extbyte * src,
187                               Lstream_data_count n);
188 static void decode_coding_big5(lstream_t decoding, const Extbyte * src,
189                                unsigned_char_dynarr * dst,
190                                Lstream_data_count n);
191 static void encode_coding_big5(lstream_t encoding, const Bufbyte * src,
192                                unsigned_char_dynarr * dst,
193                                Lstream_data_count n);
194 static int detect_coding_ucs4(struct detection_state *st, const Extbyte * src,
195                               Lstream_data_count n);
196 static void decode_coding_ucs4(lstream_t decoding, const Extbyte * src,
197                                unsigned_char_dynarr * dst,
198                                Lstream_data_count n);
199 static void encode_coding_ucs4(lstream_t encoding, const Bufbyte * src,
200                                unsigned_char_dynarr * dst,
201                                Lstream_data_count n);
202 static int detect_coding_utf8(struct detection_state *st, const Extbyte * src,
203                               Lstream_data_count n);
204 static void decode_coding_utf8(lstream_t decoding, const Extbyte * src,
205                                unsigned_char_dynarr * dst,
206                                Lstream_data_count n);
207 static void encode_coding_utf8(lstream_t encoding, const Bufbyte * src,
208                                unsigned_char_dynarr * dst,
209                                Lstream_data_count n);
210 static int postprocess_iso2022_mask(int mask);
211 static void reset_iso2022(Lisp_Object coding_system,
212                           struct iso2022_decoder *iso);
213 static int detect_coding_iso2022(struct detection_state *st,
214                                  const Extbyte * src, Lstream_data_count n);
215 static void decode_coding_iso2022(lstream_t decoding, const Extbyte * src,
216                                   unsigned_char_dynarr * dst,
217                                   Lstream_data_count n);
218 static void encode_coding_iso2022(lstream_t encoding, const Bufbyte * src,
219                                   unsigned_char_dynarr * dst,
220                                   Lstream_data_count n);
221 #endif                          /* MULE */
222 static void decode_coding_no_conversion(lstream_t decoding, const Extbyte * src,
223                                         unsigned_char_dynarr * dst,
224                                         Lstream_data_count n);
225 static void encode_coding_no_conversion(lstream_t encoding, const Bufbyte * src,
226                                         unsigned_char_dynarr * dst,
227                                         Lstream_data_count n);
228 static void mule_decode(lstream_t decoding, const Extbyte * src,
229                         unsigned_char_dynarr * dst, Lstream_data_count n);
230 static void mule_encode(lstream_t encoding, const Bufbyte * src,
231                         unsigned_char_dynarr * dst, Lstream_data_count n);
232
233 typedef struct codesys_prop codesys_prop;
234 struct codesys_prop {
235         Lisp_Object sym;
236         int prop_type;
237 };
238
239 typedef struct {
240         Dynarr_declare(codesys_prop);
241 } codesys_prop_dynarr;
242
243 static const struct lrecord_description codesys_prop_description_1[] = {
244         {XD_LISP_OBJECT, offsetof(codesys_prop, sym)},
245         {XD_END}
246 };
247
248 static const struct struct_description codesys_prop_description = {
249         sizeof(codesys_prop),
250         codesys_prop_description_1
251 };
252
253 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
254         XD_DYNARR_DESC(codesys_prop_dynarr, &codesys_prop_description),
255         {XD_END}
256 };
257
258 static const struct struct_description codesys_prop_dynarr_description = {
259         sizeof(codesys_prop_dynarr),
260         codesys_prop_dynarr_description_1
261 };
262
263 codesys_prop_dynarr *the_codesys_prop_dynarr;
264
265 enum codesys_prop_enum {
266         CODESYS_PROP_ALL_OK,
267         CODESYS_PROP_ISO2022,
268         CODESYS_PROP_CCL
269 };
270 \f
271 /************************************************************************/
272 /*                       Coding system functions                        */
273 /************************************************************************/
274
275 static Lisp_Object mark_coding_system(Lisp_Object);
276 static void print_coding_system(Lisp_Object, Lisp_Object, int);
277 static void finalize_coding_system(void *header, int for_disksave);
278
279 #ifdef MULE
280 static const struct lrecord_description ccs_description_1[] = {
281         {XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset)},
282         {XD_LISP_OBJECT, offsetof(charset_conversion_spec, to_charset)},
283         {XD_END}
284 };
285
286 static const struct struct_description ccs_description = {
287         sizeof(charset_conversion_spec),
288         ccs_description_1
289 };
290
291 static const struct lrecord_description ccsd_description_1[] = {
292         XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
293         {XD_END}
294 };
295
296 static const struct struct_description ccsd_description = {
297         sizeof(charset_conversion_spec_dynarr),
298         ccsd_description_1
299 };
300 #endif
301
302 static const struct lrecord_description coding_system_description[] = {
303         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, name)},
304         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, doc_string)},
305         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, mnemonic)},
306         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, post_read_conversion)},
307         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, pre_write_conversion)},
308         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_lf)},
309         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_crlf)},
310         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_cr)},
311 #ifdef MULE
312         {XD_LISP_OBJECT_ARRAY,
313          offsetof(Lisp_Coding_System, iso2022.initial_charset), 4},
314         {XD_STRUCT_PTR, offsetof(Lisp_Coding_System, iso2022.input_conv), 1,
315          &ccsd_description},
316         {XD_STRUCT_PTR, offsetof(Lisp_Coding_System, iso2022.output_conv), 1,
317          &ccsd_description},
318         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, ccl.decode)},
319         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, ccl.encode)},
320 #endif
321         {XD_END}
322 };
323
324 DEFINE_LRECORD_IMPLEMENTATION("coding-system", coding_system,
325                               mark_coding_system, print_coding_system,
326                               finalize_coding_system,
327                               0, 0, coding_system_description,
328                               Lisp_Coding_System);
329
330 static Lisp_Object mark_coding_system(Lisp_Object obj)
331 {
332         Lisp_Coding_System *codesys = XCODING_SYSTEM(obj);
333
334         mark_object(CODING_SYSTEM_NAME(codesys));
335         mark_object(CODING_SYSTEM_DOC_STRING(codesys));
336         mark_object(CODING_SYSTEM_MNEMONIC(codesys));
337         mark_object(CODING_SYSTEM_EOL_LF(codesys));
338         mark_object(CODING_SYSTEM_EOL_CRLF(codesys));
339         mark_object(CODING_SYSTEM_EOL_CR(codesys));
340
341         switch (CODING_SYSTEM_TYPE(codesys)) {
342 #ifdef MULE
343                 int i;
344         case CODESYS_ISO2022:
345                 for (i = 0; i < 4; i++)
346                         mark_object(CODING_SYSTEM_ISO2022_INITIAL_CHARSET
347                                     (codesys, i));
348                 if (codesys->iso2022.input_conv) {
349                         for (i = 0;
350                              i < Dynarr_length(codesys->iso2022.input_conv);
351                              i++) {
352                                 struct charset_conversion_spec *ccs =
353                                     Dynarr_atp(codesys->iso2022.input_conv, i);
354                                 mark_object(ccs->from_charset);
355                                 mark_object(ccs->to_charset);
356                         }
357                 }
358                 if (codesys->iso2022.output_conv) {
359                         for (i = 0;
360                              i < Dynarr_length(codesys->iso2022.output_conv);
361                              i++) {
362                                 struct charset_conversion_spec *ccs =
363                                     Dynarr_atp(codesys->iso2022.output_conv, i);
364                                 mark_object(ccs->from_charset);
365                                 mark_object(ccs->to_charset);
366                         }
367                 }
368                 break;
369
370         case CODESYS_CCL:
371                 mark_object(CODING_SYSTEM_CCL_DECODE(codesys));
372                 mark_object(CODING_SYSTEM_CCL_ENCODE(codesys));
373                 break;
374
375                 /* list the rest of them lot explicitly */
376         case CODESYS_AUTODETECT:
377         case CODESYS_SHIFT_JIS:
378         case CODESYS_BIG5:
379         case CODESYS_UCS4:
380         case CODESYS_UTF8:
381         case CODESYS_NO_CONVERSION:
382 #ifdef DEBUG_SXEMACS
383         case CODESYS_INTERNAL:
384 #endif
385 #endif                          /* MULE */
386         default:
387                 break;
388         }
389
390         mark_object(CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys));
391         return CODING_SYSTEM_POST_READ_CONVERSION(codesys);
392 }
393
394 static void
395 print_coding_system(Lisp_Object obj, Lisp_Object printcharfun, int escapeflag)
396 {
397         Lisp_Coding_System *c = XCODING_SYSTEM(obj);
398         if (print_readably)
399                 error("printing unreadable object #<coding-system 0x%x>",
400                       c->header.uid);
401
402         write_c_string("#<coding-system ", printcharfun);
403         print_internal(c->name, printcharfun, 1);
404         write_c_string(">", printcharfun);
405 }
406
407 static void finalize_coding_system(void *header, int for_disksave)
408 {
409         Lisp_Coding_System *c = (Lisp_Coding_System *) header;
410         /* Since coding systems never go away, this function is not
411            necessary.  But it would be necessary if we changed things
412            so that coding systems could go away. */
413         if (!for_disksave) {    /* see comment in lstream.c */
414                 switch (CODING_SYSTEM_TYPE(c)) {
415 #ifdef MULE
416                 case CODESYS_ISO2022:
417                         if (c->iso2022.input_conv) {
418                                 Dynarr_free(c->iso2022.input_conv);
419                                 c->iso2022.input_conv = 0;
420                         }
421                         if (c->iso2022.output_conv) {
422                                 Dynarr_free(c->iso2022.output_conv);
423                                 c->iso2022.output_conv = 0;
424                         }
425                         break;
426
427                         /* list the rest of them lot explicitly */
428                 case CODESYS_AUTODETECT:
429                 case CODESYS_SHIFT_JIS:
430                 case CODESYS_BIG5:
431                 case CODESYS_UCS4:
432                 case CODESYS_UTF8:
433                 case CODESYS_CCL:
434                 case CODESYS_NO_CONVERSION:
435 #ifdef DEBUG_SXEMACS
436                 case CODESYS_INTERNAL:
437 #endif
438 #endif                          /* MULE */
439                 default:
440                         break;
441                 }
442         }
443 }
444
445 static eol_type_t symbol_to_eol_type(Lisp_Object symbol)
446 {
447         CHECK_SYMBOL(symbol);
448         if (NILP(symbol))
449                 return EOL_AUTODETECT;
450         if (EQ(symbol, Qlf))
451                 return EOL_LF;
452         if (EQ(symbol, Qcrlf))
453                 return EOL_CRLF;
454         if (EQ(symbol, Qcr))
455                 return EOL_CR;
456
457         signal_simple_error("Unrecognized eol type", symbol);
458         return EOL_AUTODETECT;  /* not reached */
459 }
460
461 static Lisp_Object eol_type_to_symbol(eol_type_t type)
462 {
463         switch (type) {
464         default:
465                 abort();
466                 break;
467         case EOL_LF:
468                 return Qlf;
469         case EOL_CRLF:
470                 return Qcrlf;
471         case EOL_CR:
472                 return Qcr;
473         case EOL_AUTODETECT:
474                 return Qnil;
475         }
476 }
477
478 static void setup_eol_coding_systems(Lisp_Coding_System * codesys)
479 {
480         Lisp_Object codesys_obj;
481         int len = string_length(XSYMBOL(CODING_SYSTEM_NAME(codesys))->name);
482         char *codesys_name = (char *)alloca(len + 7);
483         int mlen = -1;
484         char *codesys_mnemonic = 0;
485
486         Lisp_Object codesys_name_sym, sub_codesys_obj;
487
488         /* kludge */
489
490         XSETCODING_SYSTEM(codesys_obj, codesys);
491
492         memcpy(codesys_name,
493                string_data(XSYMBOL(CODING_SYSTEM_NAME(codesys))->name), len);
494
495         if (STRINGP(CODING_SYSTEM_MNEMONIC(codesys))) {
496                 mlen = XSTRING_LENGTH(CODING_SYSTEM_MNEMONIC(codesys));
497                 codesys_mnemonic = (char *)alloca(mlen + 7);
498                 memcpy(codesys_mnemonic,
499                        XSTRING_DATA(CODING_SYSTEM_MNEMONIC(codesys)), mlen);
500         }
501 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
502   strcpy (codesys_name + len, "-" op_sys);                                      \
503   if (mlen != -1)                                                               \
504     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
505   codesys_name_sym = intern (codesys_name);                                     \
506   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
507   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
508   if (mlen != -1)                                                               \
509     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
510       build_string (codesys_mnemonic);                                          \
511   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
512 } while (0)
513
514         DEFINE_SUB_CODESYS("unix", "", EOL_LF);
515         DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
516         DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
517 }
518
519 DEFUN("coding-system-p", Fcoding_system_p, 1, 1, 0,     /*
520 Return t if OBJECT is a coding system.
521 A coding system is an object that defines how text containing multiple
522 character sets is encoded into a stream of (typically 8-bit) bytes.
523 The coding system is used to decode the stream into a series of
524 characters (which may be from multiple charsets) when the text is read
525 from a file or process, and is used to encode the text back into the
526 same format when it is written out to a file or process.
527
528 For example, many ISO2022-compliant coding systems (such as Compound
529 Text, which is used for inter-client data under the X Window System)
530 use escape sequences to switch between different charsets -- Japanese
531 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
532 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
533 `make-coding-system' for more information.
534
535 Coding systems are normally identified using a symbol, and the
536 symbol is accepted in place of the actual coding system object whenever
537 a coding system is called for. (This is similar to how faces work.)
538 */
539       (object))
540 {
541         return CODING_SYSTEMP(object) ? Qt : Qnil;
542 }
543
544 DEFUN("find-coding-system", Ffind_coding_system, 1, 1, 0,       /*
545 Retrieve the coding system of the given name.
546
547 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
548 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
549 If there is no such coding system, nil is returned.  Otherwise the
550 associated coding system object is returned.
551 */
552       (coding_system_or_name))
553 {
554         if (NILP(coding_system_or_name))
555                 coding_system_or_name = Qbinary;
556         else if (CODING_SYSTEMP(coding_system_or_name))
557                 return coding_system_or_name;
558         else
559                 CHECK_SYMBOL(coding_system_or_name);
560
561         while (1) {
562                 coding_system_or_name =
563                     Fgethash(coding_system_or_name, Vcoding_system_hash_table,
564                              Qnil);
565
566                 if (CODING_SYSTEMP(coding_system_or_name)
567                     || NILP(coding_system_or_name))
568                         return coding_system_or_name;
569         }
570 }
571
572 DEFUN("get-coding-system", Fget_coding_system, 1, 1, 0, /*
573 Retrieve the coding system of the given name.
574 Same as `find-coding-system' except that if there is no such
575 coding system, an error is signaled instead of returning nil.
576 */
577       (name))
578 {
579         Lisp_Object coding_system = Ffind_coding_system(name);
580
581         if (NILP(coding_system))
582                 signal_simple_error("No such coding system", name);
583         return coding_system;
584 }
585
586 /* We store the coding systems in hash tables with the names as the key and the
587    actual coding system object as the value.  Occasionally we need to use them
588    in a list format.  These routines provide us with that. */
589 struct coding_system_list_closure {
590         Lisp_Object *coding_system_list;
591 };
592
593 static int
594 add_coding_system_to_list_mapper(Lisp_Object key, Lisp_Object value,
595                                  void *coding_system_list_closure)
596 {
597         /* This function can GC */
598         struct coding_system_list_closure *cscl =
599             (struct coding_system_list_closure *)coding_system_list_closure;
600         Lisp_Object *coding_system_list = cscl->coding_system_list;
601
602         *coding_system_list = Fcons(key, *coding_system_list);
603         return 0;
604 }
605
606 DEFUN("coding-system-list", Fcoding_system_list, 0, 0, 0,       /*
607 Return a list of the names of all defined coding systems.
608 */
609       ())
610 {
611         Lisp_Object coding_system_list = Qnil;
612         struct gcpro gcpro1;
613         struct coding_system_list_closure coding_system_list_closure;
614
615         GCPRO1(coding_system_list);
616         coding_system_list_closure.coding_system_list = &coding_system_list;
617         elisp_maphash(add_coding_system_to_list_mapper,
618                       Vcoding_system_hash_table, &coding_system_list_closure);
619         UNGCPRO;
620
621         return coding_system_list;
622 }
623
624 DEFUN("coding-system-name", Fcoding_system_name, 1, 1, 0,       /*
625 Return the name of the given coding system.
626 */
627       (coding_system))
628 {
629         coding_system = Fget_coding_system(coding_system);
630         return XCODING_SYSTEM_NAME(coding_system);
631 }
632
633 static Lisp_Coding_System *allocate_coding_system(enum coding_system_type type,
634                                                   Lisp_Object name)
635 {
636         Lisp_Coding_System *codesys =
637             alloc_lcrecord_type(Lisp_Coding_System, &lrecord_coding_system);
638
639         zero_lcrecord(codesys);
640         CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys) = Qnil;
641         CODING_SYSTEM_POST_READ_CONVERSION(codesys) = Qnil;
642         CODING_SYSTEM_EOL_TYPE(codesys) = EOL_AUTODETECT;
643         CODING_SYSTEM_EOL_CRLF(codesys) = Qnil;
644         CODING_SYSTEM_EOL_CR(codesys) = Qnil;
645         CODING_SYSTEM_EOL_LF(codesys) = Qnil;
646         CODING_SYSTEM_TYPE(codesys) = type;
647         CODING_SYSTEM_MNEMONIC(codesys) = Qnil;
648 #ifdef MULE
649         if (type == CODESYS_ISO2022) {
650                 int i;
651                 for (i = 0; i < 4; i++)
652                         CODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, i) =
653                             Qnil;
654         } else if (type == CODESYS_CCL) {
655                 CODING_SYSTEM_CCL_DECODE(codesys) = Qnil;
656                 CODING_SYSTEM_CCL_ENCODE(codesys) = Qnil;
657         }
658 #endif                          /* MULE */
659         CODING_SYSTEM_NAME(codesys) = name;
660
661         return codesys;
662 }
663
664 #ifdef MULE
665 /* Given a list of charset conversion specs as specified in a Lisp
666    program, parse it into STORE_HERE. */
667
668 static void
669 parse_charset_conversion_specs(charset_conversion_spec_dynarr * store_here,
670                                Lisp_Object spec_list)
671 {
672         Lisp_Object rest;
673
674         EXTERNAL_LIST_LOOP(rest, spec_list) {
675                 Lisp_Object car = XCAR(rest);
676                 Lisp_Object from, to;
677                 struct charset_conversion_spec spec;
678
679                 if (!CONSP(car) || !CONSP(XCDR(car)) || !NILP(XCDR(XCDR(car))))
680                         signal_simple_error("Invalid charset conversion spec",
681                                             car);
682                 from = Fget_charset(XCAR(car));
683                 to = Fget_charset(XCAR(XCDR(car)));
684                 if (XCHARSET_TYPE(from) != XCHARSET_TYPE(to))
685                         signal_simple_error_2
686                             ("Attempted conversion between different charset types",
687                              from, to);
688                 spec.from_charset = from;
689                 spec.to_charset = to;
690
691                 Dynarr_add(store_here, spec);
692         }
693 }
694
695 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
696    specs, return the equivalent as the Lisp programmer would see it.
697
698    If LOAD_HERE is 0, return Qnil. */
699
700 static Lisp_Object
701 unparse_charset_conversion_specs(charset_conversion_spec_dynarr * load_here)
702 {
703         int i;
704         Lisp_Object result;
705
706         if (!load_here)
707                 return Qnil;
708         for (i = 0, result = Qnil; i < Dynarr_length(load_here); i++) {
709                 struct charset_conversion_spec *ccs = Dynarr_atp(load_here, i);
710                 result =
711                     Fcons(list2(ccs->from_charset, ccs->to_charset), result);
712         }
713
714         return Fnreverse(result);
715 }
716
717 #endif                          /* MULE */
718
719 DEFUN("make-coding-system", Fmake_coding_system, 2, 4, 0,       /*
720 Register symbol NAME as a coding system.
721
722 TYPE describes the conversion method used and should be one of
723
724 nil or 'undecided
725 Automatic conversion.  SXEmacs attempts to detect the coding system
726 used in the file.
727 'no-conversion
728 No conversion.  Use this for binary files and such.  On output,
729 graphic characters that are not in ASCII or Latin-1 will be
730 replaced by a ?. (For a no-conversion-encoded buffer, these
731 characters will only be present if you explicitly insert them.)
732 'shift-jis
733 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
734 'ucs-4
735 ISO 10646 UCS-4 encoding.
736 'utf-8
737 ISO 10646 UTF-8 encoding.
738 'iso2022
739 Any ISO2022-compliant encoding.  Among other things, this includes
740 JIS (the Japanese encoding commonly used for e-mail), EUC (the
741 standard Unix encoding for Japanese and other languages), and
742 Compound Text (the encoding used in X11).  You can specify more
743 specific information about the conversion with the PROPS argument.
744 'big5
745 Big5 (the encoding commonly used for Taiwanese).
746 'ccl
747 The conversion is performed using a user-written pseudo-code
748 program.  CCL (Code Conversion Language) is the name of this
749 pseudo-code.
750 'internal
751 Write out or read in the raw contents of the memory representing
752 the buffer's text.  This is primarily useful for debugging
753 purposes, and is only enabled when SXEmacs has been compiled with
754 DEBUG_SXEMACS defined (via the --debug configure option).
755 WARNING: Reading in a file using 'internal conversion can result
756 in an internal inconsistency in the memory representing a
757 buffer's text, which will produce unpredictable results and may
758 cause SXEmacs to crash.  Under normal circumstances you should
759 never use 'internal conversion.
760
761 DOC-STRING is a string describing the coding system.
762
763 PROPS is a property list, describing the specific nature of the
764 character set.  Recognized properties are:
765
766 'mnemonic
767 String to be displayed in the modeline when this coding system is
768 active.
769
770 'eol-type
771 End-of-line conversion to be used.  It should be one of
772
773 nil
774 Automatically detect the end-of-line type (LF, CRLF,
775 or CR).  Also generate subsidiary coding systems named
776 `NAME-unix', `NAME-dos', and `NAME-mac', that are
777 identical to this coding system but have an EOL-TYPE
778 value of 'lf, 'crlf, and 'cr, respectively.
779 'lf
780 The end of a line is marked externally using ASCII LF.
781 Since this is also the way that SXEmacs represents an
782 end-of-line internally, specifying this option results
783 in no end-of-line conversion.  This is the standard
784 format for Unix text files.
785 'crlf
786 The end of a line is marked externally using ASCII
787 CRLF.  This is the standard format for MS-DOS text
788 files.
789 'cr
790 The end of a line is marked externally using ASCII CR.
791 This is the standard format for Macintosh text files.
792 t
793 Automatically detect the end-of-line type but do not
794 generate subsidiary coding systems.  (This value is
795 converted to nil when stored internally, and
796 `coding-system-property' will return nil.)
797
798 'post-read-conversion
799 Function called after a file has been read in, to perform the
800 decoding.  Called with two arguments, START and END, denoting
801 a region of the current buffer to be decoded.
802
803 'pre-write-conversion
804 Function called before a file is written out, to perform the
805 encoding.  Called with two arguments, START and END, denoting
806 a region of the current buffer to be encoded.
807
808 The following additional properties are recognized if TYPE is 'iso2022:
809
810 'charset-g0
811 'charset-g1
812 'charset-g2
813 'charset-g3
814 The character set initially designated to the G0 - G3 registers.
815 The value should be one of
816
817 -- A charset object (designate that character set)
818 -- nil (do not ever use this register)
819 -- t (no character set is initially designated to
820 the register, but may be later on; this automatically
821 sets the corresponding `force-g*-on-output' property)
822
823 'force-g0-on-output
824 'force-g1-on-output
825 'force-g2-on-output
826 'force-g2-on-output
827 If non-nil, send an explicit designation sequence on output before
828 using the specified register.
829
830 'short
831 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
832 "ESC $ B" on output in place of the full designation sequences
833 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
834
835 'no-ascii-eol
836 If non-nil, don't designate ASCII to G0 at each end of line on output.
837 Setting this to non-nil also suppresses other state-resetting that
838 normally happens at the end of a line.
839
840 'no-ascii-cntl
841 If non-nil, don't designate ASCII to G0 before control chars on output.
842
843 'seven
844 If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
845 environment.
846
847 'lock-shift
848 If non-nil, use locking-shift (SO/SI) instead of single-shift
849 or designation by escape sequence.
850
851 'no-iso6429
852 If non-nil, don't use ISO6429's direction specification.
853
854 'escape-quoted
855 If non-nil, literal control characters that are the same as
856 the beginning of a recognized ISO2022 or ISO6429 escape sequence
857 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
858 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
859 so that they can be properly distinguished from an escape sequence.
860 (Note that doing this results in a non-portable encoding.) This
861 encoding flag is used for byte-compiled files.  Note that ESC
862 is a good choice for a quoting character because there are no
863 escape sequences whose second byte is a character from the Control-0
864 or Control-1 character sets; this is explicitly disallowed by the
865 ISO2022 standard.
866
867 'input-charset-conversion
868 A list of conversion specifications, specifying conversion of
869 characters in one charset to another when decoding is performed.
870 Each specification is a list of two elements: the source charset,
871 and the destination charset.
872
873 'output-charset-conversion
874 A list of conversion specifications, specifying conversion of
875 characters in one charset to another when encoding is performed.
876 The form of each specification is the same as for
877 'input-charset-conversion.
878
879 The following additional properties are recognized (and required)
880 if TYPE is 'ccl:
881
882 'decode
883 CCL program used for decoding (converting to internal format).
884
885 'encode
886 CCL program used for encoding (converting to external format).
887 */
888       (name, type, doc_string, props))
889 {
890         Lisp_Coding_System *codesys;
891         enum coding_system_type ty;
892         int need_to_setup_eol_systems = 1;
893
894         /* Convert type to constant */
895         if (NILP(type) || EQ(type, Qundecided)) {
896                 ty = CODESYS_AUTODETECT;
897         }
898 #ifdef MULE
899         else if (EQ(type, Qshift_jis)) {
900                 ty = CODESYS_SHIFT_JIS;
901         } else if (EQ(type, Qiso2022)) {
902                 ty = CODESYS_ISO2022;
903         } else if (EQ(type, Qbig5)) {
904                 ty = CODESYS_BIG5;
905         } else if (EQ(type, Qucs4)) {
906                 ty = CODESYS_UCS4;
907         } else if (EQ(type, Qutf8)) {
908                 ty = CODESYS_UTF8;
909         } else if (EQ(type, Qccl)) {
910                 ty = CODESYS_CCL;
911         }
912 #endif
913         else if (EQ(type, Qno_conversion)) {
914                 ty = CODESYS_NO_CONVERSION;
915         }
916 #ifdef DEBUG_SXEMACS
917         else if (EQ(type, Qinternal)) {
918                 ty = CODESYS_INTERNAL;
919         }
920 #endif
921         else
922                 signal_simple_error("Invalid coding system type", type);
923
924         CHECK_SYMBOL(name);
925
926         codesys = allocate_coding_system(ty, name);
927
928         if (NILP(doc_string))
929                 doc_string = build_string("");
930         else
931                 CHECK_STRING(doc_string);
932         CODING_SYSTEM_DOC_STRING(codesys) = doc_string;
933
934         {
935                 EXTERNAL_PROPERTY_LIST_LOOP_3(key, value, props) {
936                         if (EQ(key, Qmnemonic)) {
937                                 if (!NILP(value))
938                                         CHECK_STRING(value);
939                                 CODING_SYSTEM_MNEMONIC(codesys) = value;
940                         }
941
942                         else if (EQ(key, Qeol_type)) {
943                                 need_to_setup_eol_systems = NILP(value);
944                                 if (EQ(value, Qt))
945                                         value = Qnil;
946                                 CODING_SYSTEM_EOL_TYPE(codesys) =
947                                     symbol_to_eol_type(value);
948                         }
949
950                         else if (EQ(key, Qpost_read_conversion))
951                                 CODING_SYSTEM_POST_READ_CONVERSION(codesys) =
952                                     value;
953                         else if (EQ(key, Qpre_write_conversion))
954                                 CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys) =
955                                     value;
956 #ifdef MULE
957                         else if (ty == CODESYS_ISO2022) {
958 #define FROB_INITIAL_CHARSET(charset_num) \
959   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
960     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
961
962                                 if (EQ(key, Qcharset_g0))
963                                         FROB_INITIAL_CHARSET(0);
964                                 else if (EQ(key, Qcharset_g1))
965                                         FROB_INITIAL_CHARSET(1);
966                                 else if (EQ(key, Qcharset_g2))
967                                         FROB_INITIAL_CHARSET(2);
968                                 else if (EQ(key, Qcharset_g3))
969                                         FROB_INITIAL_CHARSET(3);
970
971 #define FROB_FORCE_CHARSET(charset_num) \
972   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
973
974                                 else if (EQ(key, Qforce_g0_on_output))
975                                         FROB_FORCE_CHARSET(0);
976                                 else if (EQ(key, Qforce_g1_on_output))
977                                         FROB_FORCE_CHARSET(1);
978                                 else if (EQ(key, Qforce_g2_on_output))
979                                         FROB_FORCE_CHARSET(2);
980                                 else if (EQ(key, Qforce_g3_on_output))
981                                         FROB_FORCE_CHARSET(3);
982
983 #define FROB_BOOLEAN_PROPERTY(prop) \
984   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
985
986                                 else if (EQ(key, Qshort))
987                                         FROB_BOOLEAN_PROPERTY(SHORT);
988                                 else if (EQ(key, Qno_ascii_eol))
989                                         FROB_BOOLEAN_PROPERTY(NO_ASCII_EOL);
990                                 else if (EQ(key, Qno_ascii_cntl))
991                                         FROB_BOOLEAN_PROPERTY(NO_ASCII_CNTL);
992                                 else if (EQ(key, Qseven))
993                                         FROB_BOOLEAN_PROPERTY(SEVEN);
994                                 else if (EQ(key, Qlock_shift))
995                                         FROB_BOOLEAN_PROPERTY(LOCK_SHIFT);
996                                 else if (EQ(key, Qno_iso6429))
997                                         FROB_BOOLEAN_PROPERTY(NO_ISO6429);
998                                 else if (EQ(key, Qescape_quoted))
999                                         FROB_BOOLEAN_PROPERTY(ESCAPE_QUOTED);
1000
1001                                 else if (EQ(key, Qinput_charset_conversion)) {
1002                                         codesys->iso2022.input_conv =
1003                                             Dynarr_new(charset_conversion_spec);
1004                                         parse_charset_conversion_specs(codesys->
1005                                                                        iso2022.
1006                                                                        input_conv,
1007                                                                        value);
1008                                 } else if (EQ(key, Qoutput_charset_conversion)) {
1009                                         codesys->iso2022.output_conv =
1010                                             Dynarr_new(charset_conversion_spec);
1011                                         parse_charset_conversion_specs(codesys->
1012                                                                        iso2022.
1013                                                                        output_conv,
1014                                                                        value);
1015                                 } else
1016                                         signal_simple_error
1017                                             ("Unrecognized property", key);
1018                         } else if (EQ(type, Qccl)) {
1019                                 Lisp_Object sym;
1020                                 struct ccl_program test_ccl;
1021                                 Extbyte *suffix;
1022
1023                                 /* Check key first.  */
1024                                 if (EQ(key, Qdecode))
1025                                         suffix = "-ccl-decode";
1026                                 else if (EQ(key, Qencode))
1027                                         suffix = "-ccl-encode";
1028                                 else
1029                                         signal_simple_error
1030                                             ("Unrecognized property", key);
1031
1032                                 /* If value is vector, register it as a ccl program
1033                                    associated with an newly created symbol for
1034                                    backward compatibility.  */
1035                                 if (VECTORP(value)) {
1036                                         sym =
1037                                             Fintern(concat2
1038                                                     (Fsymbol_name(name),
1039                                                      build_string(suffix)),
1040                                                     Qnil);
1041                                         Fregister_ccl_program(sym, value);
1042                                 } else {
1043                                         CHECK_SYMBOL(value);
1044                                         sym = value;
1045                                 }
1046                                 /* check if the given ccl programs are valid.  */
1047                                 if (setup_ccl_program(&test_ccl, sym) < 0)
1048                                         signal_simple_error
1049                                             ("Invalid CCL program", value);
1050
1051                                 if (EQ(key, Qdecode))
1052                                         CODING_SYSTEM_CCL_DECODE(codesys) = sym;
1053                                 else if (EQ(key, Qencode))
1054                                         CODING_SYSTEM_CCL_ENCODE(codesys) = sym;
1055
1056                         }
1057 #endif                          /* MULE */
1058                         else
1059                                 signal_simple_error("Unrecognized property",
1060                                                     key);
1061                 }
1062         }
1063
1064         if (need_to_setup_eol_systems)
1065                 setup_eol_coding_systems(codesys);
1066
1067         {
1068                 Lisp_Object codesys_obj;
1069                 XSETCODING_SYSTEM(codesys_obj, codesys);
1070                 Fputhash(name, codesys_obj, Vcoding_system_hash_table);
1071                 return codesys_obj;
1072         }
1073 }
1074
1075 DEFUN("copy-coding-system", Fcopy_coding_system, 2, 2, 0,       /*
1076 Copy OLD-CODING-SYSTEM to NEW-NAME.
1077 If NEW-NAME does not name an existing coding system, a new one will
1078 be created.
1079 */
1080       (old_coding_system, new_name))
1081 {
1082         Lisp_Object new_coding_system;
1083         old_coding_system = Fget_coding_system(old_coding_system);
1084         new_coding_system = Ffind_coding_system(new_name);
1085         if (NILP(new_coding_system)) {
1086                 XSETCODING_SYSTEM(new_coding_system,
1087                                   allocate_coding_system
1088                                   (XCODING_SYSTEM_TYPE(old_coding_system),
1089                                    new_name));
1090                 Fputhash(new_name, new_coding_system,
1091                          Vcoding_system_hash_table);
1092         }
1093
1094         {
1095                 Lisp_Coding_System *to = XCODING_SYSTEM(new_coding_system);
1096                 Lisp_Coding_System *from = XCODING_SYSTEM(old_coding_system);
1097                 memcpy(((char *)to) + sizeof(to->header),
1098                        ((char *)from) + sizeof(from->header),
1099                        sizeof(*from) - sizeof(from->header));
1100                 to->name = new_name;
1101         }
1102         return new_coding_system;
1103 }
1104
1105 DEFUN("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0,       /*
1106 Return t if OBJECT names a coding system, and is not a coding system alias.
1107 */
1108       (object))
1109 {
1110         Lisp_Object val = Fgethash(object, Vcoding_system_hash_table, Qnil);
1111         return CODING_SYSTEMP(val) ? Qt : Qnil;
1112 }
1113
1114 DEFUN("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1115  Return t if OBJECT is a coding system alias.
1116 All coding system aliases are created by `define-coding-system-alias'.
1117 */
1118       (object))
1119 {
1120         Lisp_Object val = Fgethash(object, Vcoding_system_hash_table, Qzero);
1121         return SYMBOLP(val) ? Qt : Qnil;
1122 }
1123
1124 DEFUN("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1125 Return the coding-system symbol for which symbol ALIAS is an alias.
1126 */
1127       (alias))
1128 {
1129         Lisp_Object aliasee = Fgethash(alias, Vcoding_system_hash_table, Qnil);
1130         if (SYMBOLP(aliasee)) {
1131                 return aliasee;
1132         } else {
1133                 signal_simple_error(
1134                         "Symbol is not a coding system alias", alias);
1135         }
1136         /* To keep the compiler happy */
1137         return Qnil;
1138 }
1139
1140 static Lisp_Object
1141 append_suffix_to_symbol(Lisp_Object symbol, char *ascii_string)
1142 {
1143         return Fintern(concat2(Fsymbol_name(symbol),
1144                                build_string(ascii_string)), Qnil);
1145 }
1146
1147 /* A maphash function, for removing dangling coding system aliases. */
1148 static int
1149 dangling_coding_system_alias_p(Lisp_Object alias,
1150                                Lisp_Object aliasee, void *dangling_aliases)
1151 {
1152         if (SYMBOLP(aliasee)
1153             && NILP(Fgethash(aliasee, Vcoding_system_hash_table, Qnil))) {
1154                 (*(int *)dangling_aliases)++;
1155                 return 1;
1156         } else {
1157                 return 0;
1158         }
1159 }
1160
1161 DEFUN("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0,       /*
1162 Define symbol ALIAS as an alias for coding system ALIASEE.
1163
1164 You can use this function to redefine an alias that has already been defined,
1165 but you cannot redefine a name which is the canonical name for a coding system.
1166 \(a canonical name of a coding system is what is returned when you call
1167 `coding-system-name' on a coding system).
1168
1169 ALIASEE itself can be an alias, which allows you to define nested aliases.
1170
1171 You are forbidden, however, from creating alias loops or `dangling' aliases.
1172 These will be detected, and an error will be signaled if you attempt to do so.
1173
1174 If ALIASEE is nil, then ALIAS will simply be undefined.
1175
1176 See also `coding-system-alias-p', `coding-system-aliasee',
1177 and `coding-system-canonical-name-p'.
1178 */
1179       (alias, aliasee))
1180 {
1181         Lisp_Object real_coding_system, probe;
1182
1183         CHECK_SYMBOL(alias);
1184
1185         if (!NILP(Fcoding_system_canonical_name_p(alias)))
1186                 signal_simple_error
1187                     ("Symbol is the canonical name of a coding system and cannot be redefined",
1188                      alias);
1189
1190         if (NILP(aliasee)) {
1191                 Lisp_Object subsidiary_unix =
1192                     append_suffix_to_symbol(alias, "-unix");
1193                 Lisp_Object subsidiary_dos =
1194                     append_suffix_to_symbol(alias, "-dos");
1195                 Lisp_Object subsidiary_mac =
1196                     append_suffix_to_symbol(alias, "-mac");
1197
1198                 Fremhash(alias, Vcoding_system_hash_table);
1199
1200                 /* Undefine subsidiary aliases,
1201                    presumably created by a previous call to this function */
1202                 if (!NILP(Fcoding_system_alias_p(subsidiary_unix)) &&
1203                     !NILP(Fcoding_system_alias_p(subsidiary_dos)) &&
1204                     !NILP(Fcoding_system_alias_p(subsidiary_mac))) {
1205                         Fdefine_coding_system_alias(subsidiary_unix, Qnil);
1206                         Fdefine_coding_system_alias(subsidiary_dos, Qnil);
1207                         Fdefine_coding_system_alias(subsidiary_mac, Qnil);
1208                 }
1209
1210                 /* Undefine dangling coding system aliases. */
1211                 {
1212                         int dangling_aliases;
1213
1214                         do {
1215                                 dangling_aliases = 0;
1216                                 elisp_map_remhash
1217                                     (dangling_coding_system_alias_p,
1218                                      Vcoding_system_hash_table,
1219                                      &dangling_aliases);
1220                         } while (dangling_aliases > 0);
1221                 }
1222
1223                 return Qnil;
1224         }
1225
1226         if (CODING_SYSTEMP(aliasee))
1227                 aliasee = XCODING_SYSTEM_NAME(aliasee);
1228
1229         /* Checks that aliasee names a coding-system */
1230         real_coding_system = Fget_coding_system(aliasee);
1231
1232         /* Check for coding system alias loops */
1233         if (EQ(alias, aliasee))
1234               alias_loop:signal_simple_error_2
1235                     ("Attempt to create a coding system alias loop", alias,
1236                      aliasee);
1237
1238         for (probe = aliasee;
1239              SYMBOLP(probe);
1240              probe = Fgethash(probe, Vcoding_system_hash_table, Qzero)) {
1241                 if (EQ(probe, alias))
1242                         goto alias_loop;
1243         }
1244
1245         Fputhash(alias, aliasee, Vcoding_system_hash_table);
1246
1247         /* Set up aliases for subsidiaries.
1248            #### There must be a better way to handle subsidiary coding
1249            #### systems. */
1250         {
1251                 static char *suffixes[] = { "-unix", "-dos", "-mac" };
1252
1253                 for (int i = 0; i < countof(suffixes); i++) {
1254                         Lisp_Object alias_subsidiary =
1255                                 append_suffix_to_symbol(alias, suffixes[i]);
1256                         Lisp_Object aliasee_subsidiary =
1257                                 append_suffix_to_symbol(aliasee, suffixes[i]);
1258
1259                         if (!NILP(Ffind_coding_system(aliasee_subsidiary))) {
1260                                 Fdefine_coding_system_alias(alias_subsidiary,
1261                                                             aliasee_subsidiary);
1262                         }
1263                 }
1264         }
1265         /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1266            but it doesn't look intentional, so I'd rather return something
1267            meaningful or nothing at all. */
1268         return Qnil;
1269 }
1270
1271 static Lisp_Object
1272 subsidiary_coding_system(Lisp_Object coding_system, eol_type_t type)
1273 {
1274         Lisp_Coding_System *cs = XCODING_SYSTEM(coding_system);
1275         Lisp_Object new_coding_system;
1276
1277         if (CODING_SYSTEM_EOL_TYPE(cs) != EOL_AUTODETECT)
1278                 return coding_system;
1279
1280         switch (type) {
1281         case EOL_AUTODETECT:
1282                 return coding_system;
1283         case EOL_LF:
1284                 new_coding_system = CODING_SYSTEM_EOL_LF(cs);
1285                 break;
1286         case EOL_CR:
1287                 new_coding_system = CODING_SYSTEM_EOL_CR(cs);
1288                 break;
1289         case EOL_CRLF:
1290                 new_coding_system = CODING_SYSTEM_EOL_CRLF(cs);
1291                 break;
1292         default:
1293                 abort();
1294                 return Qnil;
1295         }
1296
1297         return NILP(new_coding_system) ? coding_system : new_coding_system;
1298 }
1299
1300 DEFUN("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0,   /*
1301 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1302 */
1303       (coding_system, eol_type))
1304 {
1305         coding_system = Fget_coding_system(coding_system);
1306
1307         return subsidiary_coding_system(coding_system,
1308                                         symbol_to_eol_type(eol_type));
1309 }
1310 \f
1311 /************************************************************************/
1312 /*                         Coding system accessors                      */
1313 /************************************************************************/
1314
1315 DEFUN("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0,   /*
1316 Return the doc string for CODING-SYSTEM.
1317 */
1318       (coding_system))
1319 {
1320         coding_system = Fget_coding_system(coding_system);
1321         return XCODING_SYSTEM_DOC_STRING(coding_system);
1322 }
1323
1324 DEFUN("coding-system-type", Fcoding_system_type, 1, 1, 0,       /*
1325 Return the type of CODING-SYSTEM.
1326 */
1327       (coding_system))
1328 {
1329         Lisp_Object tmp = Fget_coding_system(coding_system);
1330
1331         switch (XCODING_SYSTEM_TYPE(tmp)) {
1332         default:
1333                 abort();
1334                 break;
1335         case CODESYS_AUTODETECT:
1336                 return Qundecided;
1337 #ifdef MULE
1338         case CODESYS_SHIFT_JIS:
1339                 return Qshift_jis;
1340         case CODESYS_ISO2022:
1341                 return Qiso2022;
1342         case CODESYS_BIG5:
1343                 return Qbig5;
1344         case CODESYS_UCS4:
1345                 return Qucs4;
1346         case CODESYS_UTF8:
1347                 return Qutf8;
1348         case CODESYS_CCL:
1349                 return Qccl;
1350 #endif
1351         case CODESYS_NO_CONVERSION:
1352                 return Qno_conversion;
1353 #ifdef DEBUG_SXEMACS
1354         case CODESYS_INTERNAL:
1355                 return Qinternal;
1356 #endif
1357         }
1358 }
1359
1360 #ifdef MULE
1361 static
1362 Lisp_Object coding_system_charset(Lisp_Object coding_system, int gnum)
1363 {
1364         Lisp_Object cs
1365             = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET(coding_system, gnum);
1366
1367         return CHARSETP(cs) ? XCHARSET_NAME(cs) : Qnil;
1368 }
1369
1370 DEFUN("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1371 Return initial charset of CODING-SYSTEM designated to GNUM.
1372 GNUM allows 0 .. 3.
1373 */
1374       (coding_system, gnum))
1375 {
1376         coding_system = Fget_coding_system(coding_system);
1377         CHECK_INT(gnum);
1378
1379         return coding_system_charset(coding_system, XINT(gnum));
1380 }
1381 #endif                          /* MULE */
1382
1383 DEFUN("coding-system-property", Fcoding_system_property, 2, 2, 0,       /*
1384 Return the PROP property of CODING-SYSTEM.
1385 */
1386       (coding_system, prop))
1387 {
1388         int i, ok = 0;
1389         enum coding_system_type type;
1390
1391         coding_system = Fget_coding_system(coding_system);
1392         CHECK_SYMBOL(prop);
1393         type = XCODING_SYSTEM_TYPE(coding_system);
1394
1395         for (i = 0; !ok && i < Dynarr_length(the_codesys_prop_dynarr); i++)
1396                 if (EQ(Dynarr_at(the_codesys_prop_dynarr, i).sym, prop)) {
1397                         ok = 1;
1398                         switch (Dynarr_at(the_codesys_prop_dynarr, i).prop_type) {
1399                         case CODESYS_PROP_ALL_OK:
1400                                 break;
1401 #ifdef MULE
1402                         case CODESYS_PROP_ISO2022:
1403                                 if (type != CODESYS_ISO2022)
1404                                         signal_simple_error
1405                                             ("Property only valid in ISO2022 coding systems",
1406                                              prop);
1407                                 break;
1408
1409                         case CODESYS_PROP_CCL:
1410                                 if (type != CODESYS_CCL)
1411                                         signal_simple_error
1412                                             ("Property only valid in CCL coding systems",
1413                                              prop);
1414                                 break;
1415 #endif                          /* MULE */
1416                         default:
1417                                 abort();
1418                         }
1419                 }
1420
1421         if (!ok)
1422                 signal_simple_error("Unrecognized property", prop);
1423
1424         if (EQ(prop, Qname))
1425                 return XCODING_SYSTEM_NAME(coding_system);
1426         else if (EQ(prop, Qtype))
1427                 return Fcoding_system_type(coding_system);
1428         else if (EQ(prop, Qdoc_string))
1429                 return XCODING_SYSTEM_DOC_STRING(coding_system);
1430         else if (EQ(prop, Qmnemonic))
1431                 return XCODING_SYSTEM_MNEMONIC(coding_system);
1432         else if (EQ(prop, Qeol_type))
1433                 return
1434                     eol_type_to_symbol(XCODING_SYSTEM_EOL_TYPE(coding_system));
1435         else if (EQ(prop, Qeol_lf))
1436                 return XCODING_SYSTEM_EOL_LF(coding_system);
1437         else if (EQ(prop, Qeol_crlf))
1438                 return XCODING_SYSTEM_EOL_CRLF(coding_system);
1439         else if (EQ(prop, Qeol_cr))
1440                 return XCODING_SYSTEM_EOL_CR(coding_system);
1441         else if (EQ(prop, Qpost_read_conversion))
1442                 return XCODING_SYSTEM_POST_READ_CONVERSION(coding_system);
1443         else if (EQ(prop, Qpre_write_conversion))
1444                 return XCODING_SYSTEM_PRE_WRITE_CONVERSION(coding_system);
1445 #ifdef MULE
1446         else if (type == CODESYS_ISO2022) {
1447                 if (EQ(prop, Qcharset_g0))
1448                         return coding_system_charset(coding_system, 0);
1449                 else if (EQ(prop, Qcharset_g1))
1450                         return coding_system_charset(coding_system, 1);
1451                 else if (EQ(prop, Qcharset_g2))
1452                         return coding_system_charset(coding_system, 2);
1453                 else if (EQ(prop, Qcharset_g3))
1454                         return coding_system_charset(coding_system, 3);
1455
1456 #define FORCE_CHARSET(charset_num) \
1457   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1458    (coding_system, charset_num) ? Qt : Qnil)
1459
1460                 else if (EQ(prop, Qforce_g0_on_output))
1461                         return FORCE_CHARSET(0);
1462                 else if (EQ(prop, Qforce_g1_on_output))
1463                         return FORCE_CHARSET(1);
1464                 else if (EQ(prop, Qforce_g2_on_output))
1465                         return FORCE_CHARSET(2);
1466                 else if (EQ(prop, Qforce_g3_on_output))
1467                         return FORCE_CHARSET(3);
1468
1469 #define LISP_BOOLEAN(prop) \
1470   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1471
1472                 else if (EQ(prop, Qshort))
1473                         return LISP_BOOLEAN(SHORT);
1474                 else if (EQ(prop, Qno_ascii_eol))
1475                         return LISP_BOOLEAN(NO_ASCII_EOL);
1476                 else if (EQ(prop, Qno_ascii_cntl))
1477                         return LISP_BOOLEAN(NO_ASCII_CNTL);
1478                 else if (EQ(prop, Qseven))
1479                         return LISP_BOOLEAN(SEVEN);
1480                 else if (EQ(prop, Qlock_shift))
1481                         return LISP_BOOLEAN(LOCK_SHIFT);
1482                 else if (EQ(prop, Qno_iso6429))
1483                         return LISP_BOOLEAN(NO_ISO6429);
1484                 else if (EQ(prop, Qescape_quoted))
1485                         return LISP_BOOLEAN(ESCAPE_QUOTED);
1486
1487                 else if (EQ(prop, Qinput_charset_conversion))
1488                         return
1489                             unparse_charset_conversion_specs
1490                             (XCODING_SYSTEM(coding_system)->iso2022.input_conv);
1491                 else if (EQ(prop, Qoutput_charset_conversion))
1492                         return
1493                             unparse_charset_conversion_specs
1494                             (XCODING_SYSTEM(coding_system)->iso2022.
1495                              output_conv);
1496                 else
1497                         abort();
1498         } else if (type == CODESYS_CCL) {
1499                 if (EQ(prop, Qdecode))
1500                         return XCODING_SYSTEM_CCL_DECODE(coding_system);
1501                 else if (EQ(prop, Qencode))
1502                         return XCODING_SYSTEM_CCL_ENCODE(coding_system);
1503                 else
1504                         abort();
1505         }
1506 #endif                          /* MULE */
1507         else
1508                 abort();
1509
1510         return Qnil;            /* not reached */
1511 }
1512 \f
1513 /************************************************************************/
1514 /*                       Coding category functions                      */
1515 /************************************************************************/
1516
1517 static int decode_coding_category(Lisp_Object symbol)
1518 {
1519         int i;
1520
1521         CHECK_SYMBOL(symbol);
1522         for (i = 0; i < CODING_CATEGORY_LAST; i++)
1523                 if (EQ(coding_category_symbol[i], symbol))
1524                         return i;
1525
1526         signal_simple_error("Unrecognized coding category", symbol);
1527         return 0;               /* not reached */
1528 }
1529
1530 DEFUN("coding-category-list", Fcoding_category_list, 0, 0, 0,   /*
1531 Return a list of all recognized coding categories.
1532 */
1533       ())
1534 {
1535         int i;
1536         Lisp_Object list = Qnil;
1537
1538         for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1539                 list = Fcons(coding_category_symbol[i], list);
1540         return list;
1541 }
1542
1543 DEFUN("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0,   /*
1544 Change the priority order of the coding categories.
1545 LIST should be list of coding categories, in descending order of
1546 priority.  Unspecified coding categories will be lower in priority
1547 than all specified ones, in the same relative order they were in
1548 previously.
1549 */
1550       (list))
1551 {
1552         int category_to_priority[CODING_CATEGORY_LAST];
1553         int i, j;
1554         Lisp_Object rest;
1555
1556         /* First generate a list that maps coding categories to priorities. */
1557
1558         for (i = 0; i < CODING_CATEGORY_LAST; i++)
1559                 category_to_priority[i] = -1;
1560
1561         /* Highest priority comes from the specified list. */
1562         i = 0;
1563         EXTERNAL_LIST_LOOP(rest, list) {
1564                 int cat = decode_coding_category(XCAR(rest));
1565
1566                 if (category_to_priority[cat] >= 0)
1567                         signal_simple_error("Duplicate coding category in list",
1568                                             XCAR(rest));
1569                 category_to_priority[cat] = i++;
1570         }
1571
1572         /* Now go through the existing categories by priority to retrieve
1573            the categories not yet specified and preserve their priority
1574            order. */
1575         for (j = 0; j < CODING_CATEGORY_LAST; j++) {
1576                 int cat = fcd->coding_category_by_priority[j];
1577                 if (category_to_priority[cat] < 0)
1578                         category_to_priority[cat] = i++;
1579         }
1580
1581         /* Now we need to construct the inverse of the mapping we just
1582            constructed. */
1583
1584         for (i = 0; i < CODING_CATEGORY_LAST; i++)
1585                 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1586
1587         /* Phew!  That was confusing. */
1588         return Qnil;
1589 }
1590
1591 DEFUN("coding-priority-list", Fcoding_priority_list, 0, 0, 0,   /*
1592 Return a list of coding categories in descending order of priority.
1593 */
1594       ())
1595 {
1596         int i;
1597         Lisp_Object list = Qnil;
1598
1599         for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1600                 list =
1601                     Fcons(coding_category_symbol
1602                           [fcd->coding_category_by_priority[i]], list);
1603         return list;
1604 }
1605
1606 DEFUN("set-coding-category-system", Fset_coding_category_system, 2, 2, 0,       /*
1607 Change the coding system associated with a coding category.
1608 */
1609       (coding_category, coding_system))
1610 {
1611         int cat = decode_coding_category(coding_category);
1612
1613         coding_system = Fget_coding_system(coding_system);
1614         fcd->coding_category_system[cat] = coding_system;
1615         return Qnil;
1616 }
1617
1618 DEFUN("coding-category-system", Fcoding_category_system, 1, 1, 0,       /*
1619 Return the coding system associated with a coding category.
1620 */
1621       (coding_category))
1622 {
1623         int cat = decode_coding_category(coding_category);
1624         Lisp_Object sys = fcd->coding_category_system[cat];
1625
1626         if (!NILP(sys))
1627                 return XCODING_SYSTEM_NAME(sys);
1628         return Qnil;
1629 }
1630 \f
1631 /************************************************************************/
1632 /*                     Detecting the encoding of data                   */
1633 /************************************************************************/
1634
1635 struct detection_state {
1636         eol_type_t eol_type;
1637         int seen_non_ascii;
1638         int mask;
1639 #ifdef MULE
1640         struct {
1641                 int mask;
1642                 int in_second_byte;
1643         } big5;
1644
1645         struct {
1646                 int mask;
1647                 int in_second_byte;
1648         } shift_jis;
1649
1650         struct {
1651                 int mask;
1652                 int in_byte;
1653         } ucs4;
1654
1655         struct {
1656                 int mask;
1657                 int in_byte;
1658         } utf8;
1659
1660         struct {
1661                 int mask;
1662                 int initted;
1663                 struct iso2022_decoder iso;
1664                 unsigned int flags;
1665                 int high_byte_count;
1666                 unsigned int saw_single_shift:1;
1667         } iso2022;
1668 #endif
1669         struct {
1670                 int seen_anything;
1671                 int just_saw_cr;
1672         } eol;
1673 };
1674
1675 static int acceptable_control_char_p(int c)
1676 {
1677         switch (c) {
1678                 /* Allow and ignore control characters that you might
1679                    reasonably see in a text file */
1680         case '\r':
1681         case '\n':
1682         case '\t':
1683         case 7:         /* bell */
1684         case 8:         /* backspace */
1685         case 11:                /* vertical tab */
1686         case 12:                /* form feed */
1687         case 26:                /* MS-DOS C-z junk */
1688         case 31:                /* '^_' -- for info */
1689                 return 1;
1690         default:
1691                 return 0;
1692         }
1693 }
1694
1695 static int mask_has_at_most_one_bit_p(int mask)
1696 {
1697         /* Perhaps the only thing useful you learn from intensive Microsoft
1698            technical interviews */
1699         return (mask & (mask - 1)) == 0;
1700 }
1701
1702 static eol_type_t
1703 detect_eol_type(struct detection_state *st, const Extbyte * src,
1704                 Lstream_data_count n)
1705 {
1706         while (n--) {
1707                 const unsigned char c = *(const unsigned char*)src++;
1708                 if (c == '\n') {
1709                         if (st->eol.just_saw_cr)
1710                                 return EOL_CRLF;
1711                         else if (st->eol.seen_anything)
1712                                 return EOL_LF;
1713                 } else if (st->eol.just_saw_cr)
1714                         return EOL_CR;
1715                 else if (c == '\r')
1716                         st->eol.just_saw_cr = 1;
1717                 else
1718                         st->eol.just_saw_cr = 0;
1719                 st->eol.seen_anything = 1;
1720         }
1721
1722         return EOL_AUTODETECT;
1723 }
1724
1725 /* Attempt to determine the encoding and EOL type of the given text.
1726    Before calling this function for the first type, you must initialize
1727    st->eol_type as appropriate and initialize st->mask to ~0.
1728
1729    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1730    not yet known.
1731
1732    st->mask holds the determined coding category mask, or ~0 if only
1733    ASCII has been seen so far.
1734
1735    Returns:
1736
1737    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1738         is present in st->mask
1739    1 == definitive answers are here for both st->eol_type and st->mask
1740 */
1741
1742 static int
1743 detect_coding_type(struct detection_state *st, const Extbyte * src,
1744                    Lstream_data_count n, int just_do_eol)
1745 {
1746         if (st->eol_type == EOL_AUTODETECT)
1747                 st->eol_type = detect_eol_type(st, src, n);
1748
1749         if (just_do_eol)
1750                 return st->eol_type != EOL_AUTODETECT;
1751
1752         if (!st->seen_non_ascii) {
1753                 for (; n; n--, src++) {
1754                         const unsigned char c = *(const unsigned char *)src;
1755                         if ((c < 0x20 && !acceptable_control_char_p(c))
1756                             || c >= 0x80) {
1757                                 st->seen_non_ascii = 1;
1758 #ifdef MULE
1759                                 st->shift_jis.mask = ~0;
1760                                 st->big5.mask = ~0;
1761                                 st->ucs4.mask = ~0;
1762                                 st->utf8.mask = ~0;
1763                                 st->iso2022.mask = ~0;
1764 #endif
1765                                 break;
1766                         }
1767                 }
1768         }
1769
1770         if (!n) {
1771                 return 0;
1772         }
1773 #ifdef MULE
1774         if (!mask_has_at_most_one_bit_p(st->iso2022.mask))
1775                 st->iso2022.mask = detect_coding_iso2022(st, src, n);
1776         if (!mask_has_at_most_one_bit_p(st->shift_jis.mask))
1777                 st->shift_jis.mask = detect_coding_sjis(st, src, n);
1778         if (!mask_has_at_most_one_bit_p(st->big5.mask))
1779                 st->big5.mask = detect_coding_big5(st, src, n);
1780         if (!mask_has_at_most_one_bit_p(st->utf8.mask))
1781                 st->utf8.mask = detect_coding_utf8(st, src, n);
1782         if (!mask_has_at_most_one_bit_p(st->ucs4.mask))
1783                 st->ucs4.mask = detect_coding_ucs4(st, src, n);
1784
1785         st->mask = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1786                 | st->utf8.mask | st->ucs4.mask;
1787 #endif
1788         {
1789                 int retval = mask_has_at_most_one_bit_p(st->mask);
1790                 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1791                 return retval && st->eol_type != EOL_AUTODETECT;
1792         }
1793 }
1794
1795 static Lisp_Object coding_system_from_mask(int mask)
1796 {
1797         if (mask == ~0) {
1798                 /* If the file was entirely or basically ASCII, use the
1799                    default value of `buffer-file-coding-system'. */
1800                 Lisp_Object retval =
1801                     XBUFFER(Vbuffer_defaults)->buffer_file_coding_system;
1802                 if (!NILP(retval)) {
1803                         retval = Ffind_coding_system(retval);
1804                         if (NILP(retval)) {
1805                                 warn_when_safe
1806                                     (Qbad_variable, Qwarning,
1807                                      "Invalid `default-buffer-file-coding-system', set to nil");
1808                                 XBUFFER(Vbuffer_defaults)->
1809                                     buffer_file_coding_system = Qnil;
1810                         }
1811                 }
1812                 if (NILP(retval))
1813                         retval = Fget_coding_system(Qraw_text);
1814                 return retval;
1815         } else {
1816                 int i;
1817                 int cat = -1;
1818 #ifdef MULE
1819                 mask = postprocess_iso2022_mask(mask);
1820 #endif
1821                 /* Look through the coding categories by priority and find
1822                    the first one that is allowed. */
1823                 for (i = 0; i < CODING_CATEGORY_LAST; i++) {
1824                         cat = fcd->coding_category_by_priority[i];
1825                         if (cat<0)
1826                                 continue;
1827                         if ((mask & (1 << cat)) &&
1828                             !NILP(fcd->coding_category_system[cat]))
1829                                 break;
1830                 }
1831                 if (cat >= 0)
1832                         return fcd->coding_category_system[cat];
1833                 else
1834                         return Fget_coding_system(Qraw_text);
1835         }
1836 }
1837
1838 /* Given a seekable read stream and potential coding system and EOL type
1839    as specified, do any autodetection that is called for.  If the
1840    coding system and/or EOL type are not `autodetect', they will be left
1841    alone; but this function will never return an autodetect coding system
1842    or EOL type.
1843
1844    This function does not automatically fetch subsidiary coding systems;
1845    that should be unnecessary with the explicit eol-type argument. */
1846
1847 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1848 /* number of leading lines to check for a coding cookie */
1849 #define LINES_TO_CHECK 2
1850
1851 void
1852 determine_real_coding_system(lstream_t stream, Lisp_Object * codesys_in_out,
1853                              eol_type_t * eol_type_in_out)
1854 {
1855         struct detection_state decst;
1856
1857         if (*eol_type_in_out == EOL_AUTODETECT)
1858                 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE(*codesys_in_out);
1859
1860         xzero(decst);
1861         decst.eol_type = *eol_type_in_out;
1862         decst.mask = ~0;
1863
1864         /* If autodetection is called for, do it now. */
1865         if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT
1866             || *eol_type_in_out == EOL_AUTODETECT) {
1867                 Extbyte buf[4096];
1868                 Lisp_Object coding_system = Qnil;
1869                 Extbyte *p;
1870                 Lstream_data_count nread =
1871                     Lstream_read(stream, buf, sizeof(buf));
1872                 Extbyte *scan_end;
1873                 int lines_checked = 0;
1874
1875                 /* Look for initial "-*-"; mode line prefix */
1876                 for (p = buf,
1877                      scan_end = buf + nread - LENGTH("-*-coding:?-*-");
1878                      p <= scan_end && lines_checked < LINES_TO_CHECK; p++)
1879                         if (*p == '-' && *(p + 1) == '*' && *(p + 2) == '-') {
1880                                 Extbyte *local_vars_beg = p + 3;
1881                                 /* Look for final "-*-"; mode line suffix */
1882                                 for (p = local_vars_beg,
1883                                      scan_end = buf + nread - LENGTH("-*-");
1884                                      p <= scan_end
1885                                      && lines_checked < LINES_TO_CHECK; p++)
1886                                         if (*p == '-' && *(p + 1) == '*'
1887                                             && *(p + 2) == '-') {
1888                                                 Extbyte *suffix = p;
1889                                                 /* Look for "coding:" */
1890                                                 for (p = local_vars_beg,
1891                                                      scan_end =
1892                                                      suffix -
1893                                                      LENGTH("coding:?");
1894                                                      p <= scan_end; p++)
1895                                                         if (memcmp
1896                                                             ("coding:", p,
1897                                                              LENGTH("coding:"))
1898                                                             == 0
1899                                                             && (p ==
1900                                                                 local_vars_beg
1901                                                                 || (*(p - 1) ==
1902                                                                     ' '
1903                                                                     || *(p -
1904                                                                          1) ==
1905                                                                     '\t'
1906                                                                     || *(p -
1907                                                                          1) ==
1908                                                                     ';'))) {
1909                                                                 Extbyte save;
1910                                                                 int n;
1911                                                                 p += LENGTH
1912                                                                     ("coding:");
1913                                                                 while (*p == ' '
1914                                                                        || *p ==
1915                                                                        '\t')
1916                                                                         p++;
1917
1918                                                                 /* Get coding system name */
1919                                                                 save = *suffix;
1920                                                                 *suffix = '\0';
1921                                                                 /* Characters valid in a MIME charset name (rfc 1521),
1922                                                                    and in a Lisp symbol name. */
1923                                                                 n = strspn((char
1924                                                                             *)p,
1925                                                                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1926                                                                            "abcdefghijklmnopqrstuvwxyz"
1927                                                                            "0123456789"
1928                                                                            "!$%&*+-.^_{|}~");
1929                                                                 *suffix = save;
1930                                                                 if (n > 0) {
1931                                                                         save =
1932                                                                             p
1933                                                                             [n];
1934                                                                         p[n] =
1935                                                                             '\0';
1936                                                                         coding_system
1937                                                                             =
1938                                                                             Ffind_coding_system
1939                                                                             (intern
1940                                                                              ((char *)p));
1941                                                                         p[n] =
1942                                                                             save;
1943                                                                 }
1944                                                                 break;
1945                                                         }
1946                                                 break;
1947                                         }
1948                                 /* #### file must use standard EOLs or we miss 2d line */
1949                                 /* #### not to mention this is broken for UTF-16 DOS files */
1950                                         else if (*p == '\n' || *p == '\r') {
1951                                                 lines_checked++;
1952                                                 /* skip past multibyte (DOS) newline */
1953                                                 if (*p == '\r'
1954                                                     && *(p + 1) == '\n')
1955                                                         p++;
1956                                         }
1957                                 break;
1958                         }
1959                 /* #### file must use standard EOLs or we miss 2d line */
1960                 /* #### not to mention this is broken for UTF-16 DOS files */
1961                         else if (*p == '\n' || *p == '\r') {
1962                                 lines_checked++;
1963                                 /* skip past multibyte (DOS) newline */
1964                                 if (*p == '\r' && *(p + 1) == '\n')
1965                                         p++;
1966                         }
1967
1968                 if (NILP(coding_system))
1969                         do {
1970                                 if (detect_coding_type(&decst, buf, nread,
1971                                                        XCODING_SYSTEM_TYPE
1972                                                        (*codesys_in_out)
1973                                                        != CODESYS_AUTODETECT))
1974                                         break;
1975                                 nread = Lstream_read(stream, buf, sizeof(buf));
1976                                 if (nread == 0)
1977                                         break;
1978                         }
1979                         while (1);
1980
1981                 else if (XCODING_SYSTEM_TYPE(*codesys_in_out) ==
1982                          CODESYS_AUTODETECT
1983                          && XCODING_SYSTEM_EOL_TYPE(coding_system) ==
1984                          EOL_AUTODETECT)
1985                         do {
1986                                 if (detect_coding_type(&decst, buf, nread, 1))
1987                                         break;
1988                                 nread = Lstream_read(stream, buf, sizeof(buf));
1989                                 if (!nread)
1990                                         break;
1991                         }
1992                         while (1);
1993
1994                 *eol_type_in_out = decst.eol_type;
1995                 if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT) {
1996                         if (NILP(coding_system))
1997                                 *codesys_in_out =
1998                                     coding_system_from_mask(decst.mask);
1999                         else
2000                                 *codesys_in_out = coding_system;
2001                 }
2002         }
2003
2004         /* If we absolutely can't determine the EOL type, just assume LF. */
2005         if (*eol_type_in_out == EOL_AUTODETECT)
2006                 *eol_type_in_out = EOL_LF;
2007
2008         Lstream_rewind(stream);
2009 }
2010
2011 DEFUN("detect-coding-region", Fdetect_coding_region, 2, 3, 0,   /*
2012 Detect coding system of the text in the region between START and END.
2013 Return a list of possible coding systems ordered by priority.
2014 If only ASCII characters are found, return 'undecided or one of
2015 its subsidiary coding systems according to a detected end-of-line
2016 type.  Optional arg BUFFER defaults to the current buffer.
2017 */
2018       (start, end, buffer))
2019 {
2020         Lisp_Object val = Qnil;
2021         struct buffer *buf = decode_buffer(buffer, 0);
2022         Bufpos b, e;
2023         Lisp_Object instream, lb_instream;
2024         lstream_t istr, lb_istr;
2025         struct detection_state decst;
2026         struct gcpro gcpro1, gcpro2;
2027
2028         get_buffer_range_char(buf, start, end, &b, &e, 0);
2029         lb_instream = make_lisp_buffer_input_stream(buf, b, e, 0);
2030         lb_istr = XLSTREAM(lb_instream);
2031         instream =
2032             make_encoding_input_stream(lb_istr, Fget_coding_system(Qbinary));
2033         istr = XLSTREAM(instream);
2034         GCPRO2(instream, lb_instream);
2035         xzero(decst);
2036         decst.eol_type = EOL_AUTODETECT;
2037         decst.mask = ~0;
2038         while (1) {
2039                 Extbyte random_buffer[4096];
2040                 Lstream_data_count nread =
2041                     Lstream_read(istr, random_buffer, sizeof(random_buffer));
2042
2043                 if (!nread)
2044                         break;
2045                 if (detect_coding_type(&decst, random_buffer, nread, 0))
2046                         break;
2047         }
2048
2049         if (decst.mask == ~0)
2050                 val = subsidiary_coding_system(Fget_coding_system(Qundecided),
2051                                                decst.eol_type);
2052         else {
2053                 int i;
2054
2055                 val = Qnil;
2056 #ifdef MULE
2057                 decst.mask = postprocess_iso2022_mask(decst.mask);
2058 #endif
2059                 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--) {
2060                         int sys = fcd->coding_category_by_priority[i];
2061                         if (decst.mask & (1 << sys)) {
2062                                 Lisp_Object codesys =
2063                                     fcd->coding_category_system[sys];
2064                                 if (!NILP(codesys))
2065                                         codesys =
2066                                             subsidiary_coding_system(codesys,
2067                                                                      decst.
2068                                                                      eol_type);
2069                                 val = Fcons(codesys, val);
2070                         }
2071                 }
2072         }
2073         Lstream_close(istr);
2074         UNGCPRO;
2075         Lstream_delete(istr);
2076         Lstream_delete(lb_istr);
2077         return val;
2078 }
2079 \f
2080 /************************************************************************/
2081 /*           Converting to internal Mule format ("decoding")            */
2082 /************************************************************************/
2083
2084 /* A decoding stream is a stream used for decoding text (i.e.
2085    converting from some external format to internal format).
2086    The decoding-stream object keeps track of the actual coding
2087    stream, the stream that is at the other end, and data that
2088    needs to be persistent across the lifetime of the stream. */
2089
2090 /* Handle the EOL stuff related to just-read-in character C.
2091    EOL_TYPE is the EOL type of the coding stream.
2092    FLAGS is the current value of FLAGS in the coding stream, and may
2093    be modified by this macro.  (The macro only looks at the
2094    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
2095    bytes are to be written.  You need to also define a local goto
2096    label "label_continue_loop" that is at the end of the main
2097    character-reading loop.
2098
2099    If C is a CR character, then this macro handles it entirely and
2100    jumps to label_continue_loop.  Otherwise, this macro does not add
2101    anything to DST, and continues normally.  You should continue
2102    processing C normally after this macro. */
2103
2104 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
2105 do {                                                            \
2106   if (c == '\r')                                                \
2107     {                                                           \
2108       if (eol_type == EOL_CR)                                   \
2109         Dynarr_add (dst, '\n');                                 \
2110       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2111         Dynarr_add (dst, c);                                    \
2112       else                                                      \
2113         flags |= CODING_STATE_CR;                               \
2114       goto label_continue_loop;                                 \
2115     }                                                           \
2116   else if (flags & CODING_STATE_CR)                             \
2117     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
2118       if (c != '\n')                                            \
2119         Dynarr_add (dst, '\r');                                 \
2120       flags &= ~CODING_STATE_CR;                                \
2121     }                                                           \
2122 } while (0)
2123
2124 /* C should be a binary character in the range 0 - 255; convert
2125    to internal format and add to Dynarr DST. */
2126
2127 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
2128 do {                                            \
2129   if (BYTE_ASCII_P (c))                         \
2130     Dynarr_add (dst, c);                        \
2131   else if (BYTE_C1_P (c))                       \
2132     {                                           \
2133       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2134       Dynarr_add (dst, c + 0x20);               \
2135     }                                           \
2136   else                                          \
2137     {                                           \
2138       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2139       Dynarr_add (dst, c);                      \
2140     }                                           \
2141 } while (0)
2142
2143 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
2144 do {                                    \
2145   if (ch)                               \
2146     {                                   \
2147       DECODE_ADD_BINARY_CHAR (ch, dst); \
2148       ch = 0;                           \
2149     }                                   \
2150 } while (0)
2151
2152 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2153 do {                                    \
2154   if (flags & CODING_STATE_END)         \
2155     {                                   \
2156       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
2157       if (flags & CODING_STATE_CR)      \
2158         Dynarr_add (dst, '\r');         \
2159     }                                   \
2160 } while (0)
2161
2162 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2163
2164 typedef struct decoding_stream_s *decoding_stream_t;
2165 struct decoding_stream_s {
2166         /* Coding system that governs the conversion. */
2167         Lisp_Coding_System *codesys;
2168
2169         /* Stream that we read the encoded data from or
2170            write the decoded data to. */
2171         lstream_t other_end;
2172
2173         /* If we are reading, then we can return only a fixed amount of
2174            data, so if the conversion resulted in too much data, we store it
2175            here for retrieval the next time around. */
2176         unsigned_char_dynarr *runoff;
2177
2178         /* FLAGS holds flags indicating the current state of the decoding.
2179            Some of these flags are dependent on the coding system. */
2180         unsigned int flags;
2181
2182         /* CH holds a partially built-up character.  Since we only deal
2183            with one- and two-byte characters at the moment, we only use
2184            this to store the first byte of a two-byte character. */
2185         unsigned int ch;
2186
2187         /* EOL_TYPE specifies the type of end-of-line conversion that
2188            currently applies.  We need to keep this separate from the
2189            EOL type stored in CODESYS because the latter might indicate
2190            automatic EOL-type detection while the former will always
2191            indicate a particular EOL type. */
2192         eol_type_t eol_type;
2193 #ifdef MULE
2194         /* Additional ISO2022 information.  We define the structure above
2195            because it's also needed by the detection routines. */
2196         struct iso2022_decoder iso2022;
2197
2198         /* Additional information (the state of the running CCL program)
2199            used by the CCL decoder. */
2200         struct ccl_program ccl;
2201
2202         /* counter for UTF-8 or UCS-4 */
2203         unsigned char counter;
2204 #endif
2205         struct detection_state decst;
2206 };
2207
2208 static Lstream_data_count
2209 decoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size);
2210 static Lstream_data_count
2211 decoding_writer(lstream_t stream,
2212                 const unsigned char *data, Lstream_data_count size);
2213 static int decoding_rewinder(lstream_t stream);
2214 static int decoding_seekable_p(lstream_t stream);
2215 static int decoding_flusher(lstream_t stream);
2216 static int decoding_closer(lstream_t stream);
2217
2218 static Lisp_Object decoding_marker(Lisp_Object stream);
2219
2220 DEFINE_LSTREAM_IMPLEMENTATION("decoding", lstream_decoding,
2221                               sizeof(struct decoding_stream_s));
2222
2223 static Lisp_Object
2224 decoding_marker(Lisp_Object stream)
2225 {
2226         lstream_t str = DECODING_STREAM_DATA(XLSTREAM(stream))->other_end;
2227         Lisp_Object str_obj;
2228
2229         /* We do not need to mark the coding systems or charsets stored
2230            within the stream because they are stored in a global list
2231            and automatically marked. */
2232
2233         XSETLSTREAM(str_obj, str);
2234         mark_object(str_obj);
2235         if (str->imp->marker) {
2236                 return str->imp->marker(str_obj);
2237         } else {
2238                 return Qnil;
2239         }
2240 }
2241
2242 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
2243    so we read data from the other end, decode it, and store it into DATA. */
2244
2245 static Lstream_data_count
2246 decoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size)
2247 {
2248         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2249         unsigned char *orig_data = data;
2250         Lstream_data_count read_size;
2251         int error_occurred = 0;
2252
2253         /* We need to interface to mule_decode(), which expects to take some
2254            amount of data and store the result into a Dynarr.  We have
2255            mule_decode() store into str->runoff, and take data from there
2256            as necessary. */
2257
2258         /* We loop until we have enough data, reading chunks from the other
2259            end and decoding it. */
2260         while (1) {
2261                 /* Take data from the runoff if we can.  Make sure to take at
2262                    most SIZE bytes, and delete the data from the runoff. */
2263                 if (Dynarr_length(str->runoff) > 0) {
2264                         Lstream_data_count chunk =
2265                                 min(size,
2266                                     (Lstream_data_count)
2267                                     Dynarr_length(str->runoff));
2268                         memcpy(data, Dynarr_atp(str->runoff, 0), chunk);
2269                         Dynarr_delete_many(str->runoff, 0, chunk);
2270                         data += chunk;
2271                         size -= chunk;
2272                 }
2273
2274                 if (size == 0) {
2275                         /* No more room for data */
2276                         break;
2277                 }
2278
2279                 if (str->flags & CODING_STATE_END) {
2280                         /* This means that on the previous iteration, we hit the
2281                            EOF on the other end.  We loop once more so that
2282                            mule_decode() can output any final stuff it may be
2283                            holding, or any "go back to a sane state" escape
2284                            sequences. (This latter makes sense during
2285                            encoding.) */
2286                         break;
2287                 }
2288
2289                 /* Exhausted the runoff, so get some more.  DATA has at least
2290                    SIZE bytes left of storage in it, so it's OK to read directly
2291                    into it.  (We'll be overwriting above, after we've decoded it
2292                    into the runoff.) */
2293                 read_size = Lstream_read(str->other_end, data, size);
2294                 if (read_size < 0) {
2295                         error_occurred = 1;
2296                         break;
2297                 }
2298                 if (read_size == 0) {
2299                         /* There might be some more end data produced in the
2300                            translation.  See the comment above. */
2301                         str->flags |= CODING_STATE_END;
2302                 }
2303                 mule_decode(stream, (Extbyte *) data, str->runoff, read_size);
2304         }
2305
2306         if (data - orig_data == 0) {
2307                 return error_occurred ? -1 : 0;
2308         } else {
2309                 return data - orig_data;
2310         }
2311 }
2312
2313 static Lstream_data_count
2314 decoding_writer(lstream_t stream, const unsigned char *data,
2315                 Lstream_data_count size)
2316 {
2317         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2318         Lstream_data_count retval;
2319
2320         /* Decode all our data into the runoff, and then attempt to write
2321            it all out to the other end.  Remove whatever chunk we succeeded
2322            in writing. */
2323         mule_decode(stream, (const Extbyte *)data, str->runoff, size);
2324         retval = Lstream_write(str->other_end, Dynarr_atp(str->runoff, 0),
2325                                Dynarr_length(str->runoff));
2326         if (retval > 0) {
2327                 Dynarr_delete_many(str->runoff, 0, retval);
2328         }
2329         /* Do NOT return retval.  The return value indicates how much
2330            of the incoming data was written, not how many bytes were
2331            written. */
2332         return size;
2333 }
2334
2335 static void
2336 reset_decoding_stream(decoding_stream_t str)
2337 {
2338 #ifdef MULE
2339         if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_ISO2022) {
2340                 Lisp_Object coding_system;
2341                 XSETCODING_SYSTEM(coding_system, str->codesys);
2342                 reset_iso2022(coding_system, &str->iso2022);
2343         } else if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_CCL) {
2344                 setup_ccl_program(&str->ccl,
2345                                   CODING_SYSTEM_CCL_DECODE(str->codesys));
2346         }
2347         str->counter = 0;
2348 #endif                          /* MULE */
2349         if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT
2350             || CODING_SYSTEM_EOL_TYPE(str->codesys) == EOL_AUTODETECT) {
2351                 xzero(str->decst);
2352                 str->decst.eol_type = EOL_AUTODETECT;
2353                 str->decst.mask = ~0;
2354         }
2355         str->flags = str->ch = 0;
2356 }
2357
2358 static int
2359 decoding_rewinder(lstream_t stream)
2360 {
2361         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2362         reset_decoding_stream(str);
2363         Dynarr_reset(str->runoff);
2364         return Lstream_rewind(str->other_end);
2365 }
2366
2367 static int
2368 decoding_seekable_p(lstream_t stream)
2369 {
2370         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2371         return Lstream_seekable_p(str->other_end);
2372 }
2373
2374 static int
2375 decoding_flusher(lstream_t stream)
2376 {
2377         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2378         return Lstream_flush(str->other_end);
2379 }
2380
2381 static int
2382 decoding_closer(lstream_t stream)
2383 {
2384         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2385         if (stream->flags & LSTREAM_FL_WRITE) {
2386                 str->flags |= CODING_STATE_END;
2387                 decoding_writer(stream, 0, 0);
2388         }
2389         Dynarr_free(str->runoff);
2390 #ifdef MULE
2391 #ifdef ENABLE_COMPOSITE_CHARS
2392         if (str->iso2022.composite_chars) {
2393                 Dynarr_free(str->iso2022.composite_chars);
2394         }
2395 #endif
2396 #endif
2397         return Lstream_close(str->other_end);
2398 }
2399
2400 Lisp_Object
2401 decoding_stream_coding_system(lstream_t stream)
2402 {
2403         Lisp_Object coding_system;
2404         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2405
2406         XSETCODING_SYSTEM(coding_system, str->codesys);
2407         return subsidiary_coding_system(coding_system, str->eol_type);
2408 }
2409
2410 void
2411 set_decoding_stream_coding_system(lstream_t lstr, Lisp_Object codesys)
2412 {
2413         Lisp_Coding_System *cs = XCODING_SYSTEM(codesys);
2414         decoding_stream_t str = DECODING_STREAM_DATA(lstr);
2415         str->codesys = cs;
2416         if (CODING_SYSTEM_EOL_TYPE(cs) != EOL_AUTODETECT) {
2417                 str->eol_type = CODING_SYSTEM_EOL_TYPE(cs);
2418         }
2419         reset_decoding_stream(str);
2420         return;
2421 }
2422
2423 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2424    stream for writing, no automatic code detection will be performed.
2425    The reason for this is that automatic code detection requires a
2426    seekable input.  Things will also fail if you open a decoding
2427    stream for reading using a non-fully-specified coding system and
2428    a non-seekable input stream. */
2429
2430 static Lisp_Object
2431 make_decoding_stream_1(lstream_t stream, Lisp_Object codesys, const char *mode)
2432 {
2433         lstream_t lstr = Lstream_new(lstream_decoding, mode);
2434         decoding_stream_t str = DECODING_STREAM_DATA(lstr);
2435         Lisp_Object obj;
2436
2437         xzero(*str);
2438         str->other_end = stream;
2439         str->runoff = (unsigned_char_dynarr *) Dynarr_new(unsigned_char);
2440         str->eol_type = EOL_AUTODETECT;
2441         if (!strcmp(mode, "r") && Lstream_seekable_p(stream)) {
2442                 /* We can determine the coding system now. */
2443                 determine_real_coding_system(stream, &codesys, &str->eol_type);
2444         }
2445         set_decoding_stream_coding_system(lstr, codesys);
2446         str->decst.eol_type = str->eol_type;
2447         str->decst.mask = ~0;
2448         XSETLSTREAM(obj, lstr);
2449         return obj;
2450 }
2451
2452 Lisp_Object
2453 make_decoding_input_stream(lstream_t stream, Lisp_Object codesys)
2454 {
2455         return make_decoding_stream_1(stream, codesys, "r");
2456 }
2457
2458 Lisp_Object
2459 make_decoding_output_stream(lstream_t stream, Lisp_Object codesys)
2460 {
2461         return make_decoding_stream_1(stream, codesys, "w");
2462 }
2463
2464 /* Note: the decode_coding_* functions all take the same
2465    arguments as mule_decode(), which is to say some SRC data of
2466    size N, which is to be stored into dynamic array DST.
2467    DECODING is the stream within which the decoding is
2468    taking place, but no data is actually read from or
2469    written to that stream; that is handled in decoding_reader()
2470    or decoding_writer().  This allows the same functions to
2471    be used for both reading and writing. */
2472
2473 static void
2474 mule_decode(lstream_t decoding, const Extbyte * src,
2475             unsigned_char_dynarr * dst, Lstream_data_count n)
2476 {
2477         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
2478
2479         /* If necessary, do encoding-detection now.  We do this when
2480            we're a writing stream or a non-seekable reading stream,
2481            meaning that we can't just process the whole input,
2482            rewind, and start over. */
2483
2484         if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT ||
2485             str->eol_type == EOL_AUTODETECT) {
2486                 Lisp_Object codesys;
2487
2488                 XSETCODING_SYSTEM(codesys, str->codesys);
2489                 detect_coding_type(&str->decst, src, n,
2490                                    CODING_SYSTEM_TYPE(str->codesys) !=
2491                                    CODESYS_AUTODETECT);
2492                 if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT &&
2493                     str->decst.mask != ~0)
2494                         /* #### This is cheesy.  What we really ought to do is
2495                            buffer up a certain amount of data so as to get a
2496                            less random result. */
2497                         codesys = coding_system_from_mask(str->decst.mask);
2498                 str->eol_type = str->decst.eol_type;
2499                 if (XCODING_SYSTEM(codesys) != str->codesys) {
2500                         /* Preserve the CODING_STATE_END flag in case it was set.
2501                            If we erase it, bad things might happen. */
2502                         int was_end = str->flags & CODING_STATE_END;
2503                         set_decoding_stream_coding_system(decoding, codesys);
2504                         if (was_end)
2505                                 str->flags |= CODING_STATE_END;
2506                 }
2507         }
2508
2509         switch (CODING_SYSTEM_TYPE(str->codesys)) {
2510 #ifdef DEBUG_SXEMACS
2511         case CODESYS_INTERNAL:
2512                 Dynarr_add_many(dst, src, n);
2513                 break;
2514 #endif
2515         case CODESYS_AUTODETECT:
2516                 /* If we got this far and still haven't decided on the coding
2517                    system, then do no conversion. */
2518         case CODESYS_NO_CONVERSION:
2519                 decode_coding_no_conversion(decoding, src, dst, n);
2520                 break;
2521 #ifdef MULE
2522         case CODESYS_SHIFT_JIS:
2523                 decode_coding_sjis(decoding, src, dst, n);
2524                 break;
2525         case CODESYS_BIG5:
2526                 decode_coding_big5(decoding, src, dst, n);
2527                 break;
2528         case CODESYS_UCS4:
2529                 decode_coding_ucs4(decoding, src, dst, n);
2530                 break;
2531         case CODESYS_UTF8:
2532                 decode_coding_utf8(decoding, src, dst, n);
2533                 break;
2534         case CODESYS_CCL:
2535                 str->ccl.last_block = str->flags & CODING_STATE_END;
2536                 /* When applying ccl program to stream, MUST NOT set NULL
2537                    pointer to src.  */
2538                 ccl_driver(&str->ccl,
2539                            (src
2540                             ? (const unsigned char *)src
2541                             : (const unsigned char *)""),
2542                            dst, n, 0, CCL_MODE_DECODING);
2543                 break;
2544         case CODESYS_ISO2022:
2545                 decode_coding_iso2022(decoding, src, dst, n);
2546                 break;
2547 #endif                          /* MULE */
2548         default:
2549                 abort();
2550         }
2551 }
2552
2553 DEFUN("decode-coding-region", Fdecode_coding_region, 3, 4, 0,   /*
2554 Decode the text between START and END which is encoded in CODING-SYSTEM.
2555 This is useful if you've read in encoded text from a file without decoding
2556 it (e.g. you read in a JIS-formatted file but used the `binary' or
2557 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2558 Return length of decoded text.
2559 BUFFER defaults to the current buffer if unspecified.
2560 */
2561       (start, end, coding_system, buffer))
2562 {
2563         Bufpos b, e;
2564         struct buffer *buf = decode_buffer(buffer, 0);
2565         Lisp_Object instream, lb_outstream, de_outstream, outstream;
2566         lstream_t istr, ostr;
2567         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2568
2569         get_buffer_range_char(buf, start, end, &b, &e, 0);
2570
2571         barf_if_buffer_read_only(buf, b, e);
2572
2573         coding_system = Fget_coding_system(coding_system);
2574         instream = make_lisp_buffer_input_stream(buf, b, e, 0);
2575         lb_outstream = make_lisp_buffer_output_stream(buf, b, 0);
2576         de_outstream = make_decoding_output_stream(XLSTREAM(lb_outstream),
2577                                                    coding_system);
2578         outstream = make_encoding_output_stream(XLSTREAM(de_outstream),
2579                                                 Fget_coding_system(Qbinary));
2580         istr = XLSTREAM(instream);
2581         ostr = XLSTREAM(outstream);
2582         GCPRO4(instream, lb_outstream, de_outstream, outstream);
2583
2584         /* The chain of streams looks like this:
2585
2586            [BUFFER] <----- send through
2587            ------> [ENCODE AS BINARY]
2588            ------> [DECODE AS SPECIFIED]
2589            ------> [BUFFER]
2590          */
2591
2592         while (1) {
2593                 char tempbuf[1024];     /* some random amount */
2594                 Bufpos newpos, even_newer_pos;
2595                 Bufpos oldpos = lisp_buffer_stream_startpos(istr);
2596                 Lstream_data_count size_in_bytes =
2597                     Lstream_read(istr, tempbuf, sizeof(tempbuf));
2598
2599                 if (!size_in_bytes)
2600                         break;
2601                 newpos = lisp_buffer_stream_startpos(istr);
2602                 Lstream_write(ostr, tempbuf, size_in_bytes);
2603                 even_newer_pos = lisp_buffer_stream_startpos(istr);
2604                 buffer_delete_range(buf, even_newer_pos - (newpos - oldpos),
2605                                     even_newer_pos, 0);
2606         }
2607         Lstream_close(istr);
2608         Lstream_close(ostr);
2609         UNGCPRO;
2610         Lstream_delete(istr);
2611         Lstream_delete(ostr);
2612         Lstream_delete(XLSTREAM(de_outstream));
2613         Lstream_delete(XLSTREAM(lb_outstream));
2614         return Qnil;
2615 }
2616 \f
2617 /************************************************************************/
2618 /*           Converting to an external encoding ("encoding")            */
2619 /************************************************************************/
2620
2621 /* An encoding stream is an output stream.  When you create the
2622    stream, you specify the coding system that governs the encoding
2623    and another stream that the resulting encoded data is to be
2624    sent to, and then start sending data to it. */
2625
2626 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2627
2628 typedef struct encoding_stream_s *encoding_stream_t;
2629 struct encoding_stream_s {
2630         /* Coding system that governs the conversion. */
2631         Lisp_Coding_System *codesys;
2632
2633         /* Stream that we read the encoded data from or
2634            write the decoded data to. */
2635         lstream_t other_end;
2636
2637         /* If we are reading, then we can return only a fixed amount of
2638            data, so if the conversion resulted in too much data, we store it
2639            here for retrieval the next time around. */
2640         unsigned_char_dynarr *runoff;
2641
2642         /* FLAGS holds flags indicating the current state of the encoding.
2643            Some of these flags are dependent on the coding system. */
2644         unsigned int flags;
2645
2646         /* CH holds a partially built-up character.  Since we only deal
2647            with one- and two-byte characters at the moment, we only use
2648            this to store the first byte of a two-byte character. */
2649         unsigned int ch;
2650 #ifdef MULE
2651         /* Additional information used by the ISO2022 encoder. */
2652         struct {
2653                 /* CHARSET holds the character sets currently assigned to the G0
2654                    through G3 registers.  It is initialized from the array
2655                    INITIAL_CHARSET in CODESYS. */
2656                 Lisp_Object charset[4];
2657
2658                 /* Which registers are currently invoked into the left (GL) and
2659                    right (GR) halves of the 8-bit encoding space? */
2660                 int register_left, register_right;
2661
2662                 /* Whether we need to explicitly designate the charset in the
2663                    G? register before using it.  It is initialized from the
2664                    array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2665                 unsigned char force_charset_on_output[4];
2666
2667                 /* Other state variables that need to be preserved across
2668                    invocations. */
2669                 Lisp_Object current_charset;
2670                 int current_half;
2671                 int current_char_boundary;
2672         } iso2022;
2673
2674         /* Additional information (the state of the running CCL program)
2675            used by the CCL encoder. */
2676         struct ccl_program ccl;
2677 #endif                          /* MULE */
2678 };
2679
2680 static Lstream_data_count
2681 encoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size);
2682 static Lstream_data_count
2683 encoding_writer(lstream_t stream,
2684                 const unsigned char *data, Lstream_data_count size);
2685 static int encoding_rewinder(lstream_t stream);
2686 static int encoding_seekable_p(lstream_t stream);
2687 static int encoding_flusher(lstream_t stream);
2688 static int encoding_closer(lstream_t stream);
2689
2690 static Lisp_Object encoding_marker(Lisp_Object stream);
2691
2692 DEFINE_LSTREAM_IMPLEMENTATION("encoding", lstream_encoding,
2693                               sizeof(struct encoding_stream_s));
2694
2695 static Lisp_Object
2696 encoding_marker(Lisp_Object stream)
2697 {
2698         lstream_t str = ENCODING_STREAM_DATA(XLSTREAM(stream))->other_end;
2699         Lisp_Object str_obj;
2700
2701         /* We do not need to mark the coding systems or charsets stored
2702            within the stream because they are stored in a global list
2703            and automatically marked. */
2704
2705         XSETLSTREAM(str_obj, str);
2706         mark_object(str_obj);
2707         if (str->imp->marker) {
2708                 return str->imp->marker(str_obj);
2709         } else {
2710                 return Qnil;
2711         }
2712 }
2713
2714 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2715    so we read data from the other end, encode it, and store it into DATA. */
2716
2717 static Lstream_data_count
2718 encoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size)
2719 {
2720         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2721         unsigned char *orig_data = data;
2722         Lstream_data_count read_size;
2723         int error_occurred = 0;
2724
2725         /* We need to interface to mule_encode(), which expects to take some
2726            amount of data and store the result into a Dynarr.  We have
2727            mule_encode() store into str->runoff, and take data from there
2728            as necessary. */
2729
2730         /* We loop until we have enough data, reading chunks from the other
2731            end and encoding it. */
2732         while (1) {
2733                 /* Take data from the runoff if we can.  Make sure to take at
2734                    most SIZE bytes, and delete the data from the runoff. */
2735                 if (Dynarr_length(str->runoff) > 0) {
2736                         int chunk = min((int)size, Dynarr_length(str->runoff));
2737                         memcpy(data, Dynarr_atp(str->runoff, 0), chunk);
2738                         Dynarr_delete_many(str->runoff, 0, chunk);
2739                         data += chunk;
2740                         size -= chunk;
2741                 }
2742
2743                 if (size == 0) {
2744                         /* No more room for data */
2745                         break;
2746                 }
2747
2748                 if (str->flags & CODING_STATE_END) {
2749                         /* This means that on the previous iteration, we hit the
2750                            EOF on the other end.  We loop once more so that
2751                            mule_encode() can output any final stuff it may be
2752                            holding, or any "go back to a sane state" escape
2753                            sequences. (This latter makes sense during
2754                            encoding.) */
2755                         break;
2756                 }
2757
2758                 /* Exhausted the runoff, so get some more.  DATA at least SIZE
2759                    bytes left of storage in it, so it's OK to read directly into
2760                    it.  (We'll be overwriting above, after we've encoded it into
2761                    the runoff.) */
2762                 read_size = Lstream_read(str->other_end, data, size);
2763                 if (read_size < 0) {
2764                         error_occurred = 1;
2765                         break;
2766                 }
2767                 if (read_size == 0) {
2768                         /* There might be some more end data produced in the
2769                            translation.  See the comment above. */
2770                         str->flags |= CODING_STATE_END;
2771                 }
2772                 mule_encode(stream, data, str->runoff, read_size);
2773         }
2774
2775         if (data == orig_data) {
2776                 return error_occurred ? -1 : 0;
2777         } else {
2778                 return data - orig_data;
2779         }
2780 }
2781
2782 static Lstream_data_count
2783 encoding_writer(lstream_t stream, const unsigned char *data,
2784                 Lstream_data_count size)
2785 {
2786         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2787         Lstream_data_count retval;
2788
2789         /* Encode all our data into the runoff, and then attempt to write
2790            it all out to the other end.  Remove whatever chunk we succeeded
2791            in writing. */
2792         mule_encode(stream, data, str->runoff, size);
2793         retval = Lstream_write(str->other_end, Dynarr_atp(str->runoff, 0),
2794                                Dynarr_length(str->runoff));
2795         if (retval > 0) {
2796                 Dynarr_delete_many(str->runoff, 0, retval);
2797         }
2798         /* Do NOT return retval.  The return value indicates how much
2799            of the incoming data was written, not how many bytes were
2800            written. */
2801         return size;
2802 }
2803
2804 static void
2805 reset_encoding_stream(encoding_stream_t str)
2806 {
2807 #ifdef MULE
2808         switch (CODING_SYSTEM_TYPE(str->codesys)) {
2809         case CODESYS_ISO2022: {
2810                 int i;
2811
2812                 for (i = 0; i < 4; i++) {
2813                         str->iso2022.charset[i] =
2814                                 CODING_SYSTEM_ISO2022_INITIAL_CHARSET(
2815                                         str->codesys, i);
2816                         str->iso2022.force_charset_on_output[i] =
2817                                 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT(
2818                                         str->codesys, i);
2819                 }
2820                 str->iso2022.register_left = 0;
2821                 str->iso2022.register_right = 1;
2822                 str->iso2022.current_charset = Qnil;
2823                 str->iso2022.current_half = 0;
2824                 str->iso2022.current_char_boundary = 1;
2825                 break;
2826         }
2827         case CODESYS_CCL:
2828                 setup_ccl_program(&str->ccl,
2829                                   CODING_SYSTEM_CCL_ENCODE(str->codesys));
2830                 break;
2831
2832                 /* list the rest of them lot explicitly */
2833         case CODESYS_AUTODETECT:
2834         case CODESYS_SHIFT_JIS:
2835         case CODESYS_BIG5:
2836         case CODESYS_UCS4:
2837         case CODESYS_UTF8:
2838         case CODESYS_NO_CONVERSION:
2839 #ifdef DEBUG_SXEMACS
2840         case CODESYS_INTERNAL:
2841 #endif
2842         default:
2843                 break;
2844         }
2845 #endif                          /* MULE */
2846
2847         str->flags = str->ch = 0;
2848 }
2849
2850 static int
2851 encoding_rewinder(lstream_t stream)
2852 {
2853         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2854         reset_encoding_stream(str);
2855         Dynarr_reset(str->runoff);
2856         return Lstream_rewind(str->other_end);
2857 }
2858
2859 static int
2860 encoding_seekable_p(lstream_t stream)
2861 {
2862         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2863         return Lstream_seekable_p(str->other_end);
2864 }
2865
2866 static int
2867 encoding_flusher(lstream_t stream)
2868 {
2869         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2870         return Lstream_flush(str->other_end);
2871 }
2872
2873 static int
2874 encoding_closer(lstream_t stream)
2875 {
2876         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2877         if (stream->flags & LSTREAM_FL_WRITE) {
2878                 str->flags |= CODING_STATE_END;
2879                 encoding_writer(stream, 0, 0);
2880         }
2881         Dynarr_free(str->runoff);
2882         return Lstream_close(str->other_end);
2883 }
2884
2885 Lisp_Object
2886 encoding_stream_coding_system(lstream_t stream)
2887 {
2888         Lisp_Object coding_system;
2889         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2890
2891         XSETCODING_SYSTEM(coding_system, str->codesys);
2892         return coding_system;
2893 }
2894
2895 void
2896 set_encoding_stream_coding_system(lstream_t lstr, Lisp_Object codesys)
2897 {
2898         Lisp_Coding_System *cs = XCODING_SYSTEM(codesys);
2899         encoding_stream_t str = ENCODING_STREAM_DATA(lstr);
2900         str->codesys = cs;
2901         reset_encoding_stream(str);
2902 }
2903
2904 static Lisp_Object
2905 make_encoding_stream_1(lstream_t stream, Lisp_Object codesys, const char *mode)
2906 {
2907         lstream_t lstr = Lstream_new(lstream_encoding, mode);
2908         encoding_stream_t str = ENCODING_STREAM_DATA(lstr);
2909         Lisp_Object obj;
2910
2911         xzero(*str);
2912         str->runoff = Dynarr_new(unsigned_char);
2913         str->other_end = stream;
2914         set_encoding_stream_coding_system(lstr, codesys);
2915         XSETLSTREAM(obj, lstr);
2916         return obj;
2917 }
2918
2919 Lisp_Object
2920 make_encoding_input_stream(lstream_t stream, Lisp_Object codesys)
2921 {
2922         return make_encoding_stream_1(stream, codesys, "r");
2923 }
2924
2925 Lisp_Object
2926 make_encoding_output_stream(lstream_t stream, Lisp_Object codesys)
2927 {
2928         return make_encoding_stream_1(stream, codesys, "w");
2929 }
2930
2931 /* Convert N bytes of internally-formatted data stored in SRC to an
2932    external format, according to the encoding stream ENCODING.
2933    Store the encoded data into DST. */
2934
2935 static void
2936 mule_encode(lstream_t encoding, const Bufbyte * src,
2937             unsigned_char_dynarr * dst, Lstream_data_count n)
2938 {
2939         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
2940
2941         switch (CODING_SYSTEM_TYPE(str->codesys)) {
2942 #ifdef DEBUG_SXEMACS
2943         case CODESYS_INTERNAL:
2944                 Dynarr_add_many(dst, src, n);
2945                 break;
2946 #endif
2947         case CODESYS_AUTODETECT:
2948                 /* If we got this far and still haven't decided on the coding
2949                    system, then do no conversion. */
2950         case CODESYS_NO_CONVERSION:
2951                 encode_coding_no_conversion(encoding, src, dst, n);
2952                 break;
2953 #ifdef MULE
2954         case CODESYS_SHIFT_JIS:
2955                 encode_coding_sjis(encoding, src, dst, n);
2956                 break;
2957         case CODESYS_BIG5:
2958                 encode_coding_big5(encoding, src, dst, n);
2959                 break;
2960         case CODESYS_UCS4:
2961                 encode_coding_ucs4(encoding, src, dst, n);
2962                 break;
2963         case CODESYS_UTF8:
2964                 encode_coding_utf8(encoding, src, dst, n);
2965                 break;
2966         case CODESYS_CCL:
2967                 str->ccl.last_block = str->flags & CODING_STATE_END;
2968                 /* When applying ccl program to stream, MUST NOT set NULL
2969                    pointer to src.  */
2970                 ccl_driver(&str->ccl, ((src) ? src : (unsigned char *)""),
2971                            dst, n, 0, CCL_MODE_ENCODING);
2972                 break;
2973         case CODESYS_ISO2022:
2974                 encode_coding_iso2022(encoding, src, dst, n);
2975                 break;
2976 #endif                          /* MULE */
2977         default:
2978                 abort();
2979         }
2980 }
2981
2982 DEFUN("encode-coding-region", Fencode_coding_region, 3, 4, 0,   /*
2983 Encode the text between START and END using CODING-SYSTEM.
2984 This will, for example, convert Japanese characters into stuff such as
2985 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
2986 text.  BUFFER defaults to the current buffer if unspecified.
2987 */
2988       (start, end, coding_system, buffer))
2989 {
2990         Bufpos b, e;
2991         struct buffer *buf = decode_buffer(buffer, 0);
2992         Lisp_Object instream, lb_outstream, de_outstream, outstream;
2993         lstream_t istr, ostr;
2994         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2995
2996         get_buffer_range_char(buf, start, end, &b, &e, 0);
2997
2998         barf_if_buffer_read_only(buf, b, e);
2999
3000         coding_system = Fget_coding_system(coding_system);
3001         instream = make_lisp_buffer_input_stream(buf, b, e, 0);
3002         lb_outstream = make_lisp_buffer_output_stream(buf, b, 0);
3003         de_outstream = make_decoding_output_stream(XLSTREAM(lb_outstream),
3004                                                    Fget_coding_system(Qbinary));
3005         outstream = make_encoding_output_stream(XLSTREAM(de_outstream),
3006                                                 coding_system);
3007         istr = XLSTREAM(instream);
3008         ostr = XLSTREAM(outstream);
3009         GCPRO4(instream, outstream, de_outstream, lb_outstream);
3010         /* The chain of streams looks like this:
3011
3012            [BUFFER] <----- send through
3013            ------> [ENCODE AS SPECIFIED]
3014            ------> [DECODE AS BINARY]
3015            ------> [BUFFER]
3016          */
3017         while (1) {
3018                 char tempbuf[1024];     /* some random amount */
3019                 Bufpos newpos, even_newer_pos;
3020                 Bufpos oldpos = lisp_buffer_stream_startpos(istr);
3021                 Lstream_data_count size_in_bytes =
3022                     Lstream_read(istr, tempbuf, sizeof(tempbuf));
3023
3024                 if (!size_in_bytes)
3025                         break;
3026                 newpos = lisp_buffer_stream_startpos(istr);
3027                 Lstream_write(ostr, tempbuf, size_in_bytes);
3028                 even_newer_pos = lisp_buffer_stream_startpos(istr);
3029                 buffer_delete_range(buf, even_newer_pos - (newpos - oldpos),
3030                                     even_newer_pos, 0);
3031         }
3032
3033         {
3034                 Charcount retlen =
3035                     lisp_buffer_stream_startpos(XLSTREAM(instream)) - b;
3036                 Lstream_close(istr);
3037                 Lstream_close(ostr);
3038                 UNGCPRO;
3039                 Lstream_delete(istr);
3040                 Lstream_delete(ostr);
3041                 Lstream_delete(XLSTREAM(de_outstream));
3042                 Lstream_delete(XLSTREAM(lb_outstream));
3043                 return make_int(retlen);
3044         }
3045 }
3046
3047 #ifdef MULE
3048 \f
3049 /************************************************************************/
3050 /*                          Shift-JIS methods                           */
3051 /************************************************************************/
3052
3053 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3054    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3055    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3056    encoded by "position-code + 0x80".  A character of JISX0208
3057    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3058    position-codes are divided and shifted so that it fit in the range
3059    below.
3060
3061    --- CODE RANGE of Shift-JIS ---
3062    (character set)      (range)
3063    ASCII                0x00 .. 0x7F
3064    JISX0201-Kana        0xA0 .. 0xDF
3065    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
3066             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3067    -------------------------------
3068
3069 */
3070
3071 /* Is this the first byte of a Shift-JIS two-byte char? */
3072
3073 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3074   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3075
3076 /* Is this the second byte of a Shift-JIS two-byte char? */
3077
3078 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3079   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3080
3081 #define BYTE_SJIS_KATAKANA_P(c) \
3082   ((c) >= 0xA1 && (c) <= 0xDF)
3083
3084 static int
3085 detect_coding_sjis(struct detection_state *st, const Extbyte * src,
3086                    Lstream_data_count n)
3087 {
3088         while (n--) {
3089                 const unsigned char c = *(const unsigned char *)src++;
3090                 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3091                         return 0;
3092                 if (st->shift_jis.in_second_byte) {
3093                         st->shift_jis.in_second_byte = 0;
3094                         if (c < 0x40)
3095                                 return 0;
3096                 } else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3097                         st->shift_jis.in_second_byte = 1;
3098         }
3099         return CODING_CATEGORY_SHIFT_JIS_MASK;
3100 }
3101
3102 /* Convert Shift-JIS data to internal format. */
3103
3104 static void
3105 decode_coding_sjis(lstream_t decoding, const Extbyte * src,
3106                    unsigned_char_dynarr * dst, Lstream_data_count n)
3107 {
3108         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3109         unsigned int flags = str->flags;
3110         unsigned int ch = str->ch;
3111         eol_type_t eol_type = str->eol_type;
3112
3113         while (n--) {
3114                 const unsigned char c = *(const unsigned char *)src++;
3115
3116                 if (ch) {
3117                         /* Previous character was first byte of Shift-JIS Kanji
3118                            char. */
3119                         if (BYTE_SJIS_TWO_BYTE_2_P(c)) {
3120                                 unsigned char e1, e2;
3121
3122                                 Dynarr_add(dst, LEADING_BYTE_JAPANESE_JISX0208);
3123                                 DECODE_SJIS(ch, c, e1, e2);
3124                                 Dynarr_add(dst, e1);
3125                                 Dynarr_add(dst, e2);
3126                         } else {
3127                                 DECODE_ADD_BINARY_CHAR(ch, dst);
3128                                 DECODE_ADD_BINARY_CHAR(c, dst);
3129                         }
3130                         ch = 0;
3131                 } else {
3132                         DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3133                         if (BYTE_SJIS_TWO_BYTE_1_P(c))
3134                                 ch = c;
3135                         else if (BYTE_SJIS_KATAKANA_P(c)) {
3136                                 Dynarr_add(dst, LEADING_BYTE_KATAKANA_JISX0201);
3137                                 Dynarr_add(dst, c);
3138                         } else
3139                                 DECODE_ADD_BINARY_CHAR(c, dst);
3140                 }
3141               label_continue_loop:;
3142         }
3143
3144         DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
3145
3146         str->flags = flags;
3147         str->ch = ch;
3148 }
3149
3150 /* Convert internally-formatted data to Shift-JIS. */
3151
3152 static void
3153 encode_coding_sjis(lstream_t encoding, const Bufbyte * src,
3154                    unsigned_char_dynarr * dst, Lstream_data_count n)
3155 {
3156         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3157         unsigned int flags = str->flags;
3158         unsigned int ch = str->ch;
3159         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3160
3161         while (n--) {
3162                 Bufbyte c = *src++;
3163                 if (c == '\n') {
3164                         if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3165                                 Dynarr_add(dst, '\r');
3166                         if (eol_type != EOL_CR)
3167                                 Dynarr_add(dst, '\n');
3168                         ch = 0;
3169                 } else if (BYTE_ASCII_P(c)) {
3170                         Dynarr_add(dst, c);
3171                         ch = 0;
3172                 } else if (BUFBYTE_LEADING_BYTE_P(c))
3173                         ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
3174                               c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3175                               c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
3176                 else if (ch) {
3177                         if (ch == LEADING_BYTE_KATAKANA_JISX0201) {
3178                                 Dynarr_add(dst, c);
3179                                 ch = 0;
3180                         } else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3181                                    ch == LEADING_BYTE_JAPANESE_JISX0208)
3182                                 ch = c;
3183                         else {
3184                                 /* j1 is bessel j1 function,
3185                                  * so we use something else */
3186                                 /* unsigned char j1, j2; */
3187                                 unsigned char tt1, tt2;
3188
3189                                 ENCODE_SJIS(ch, c, tt1, tt2);
3190                                 Dynarr_add(dst, tt1);
3191                                 Dynarr_add(dst, tt2);
3192                                 ch = 0;
3193                         }
3194                 }
3195         }
3196
3197         str->flags = flags;
3198         str->ch = ch;
3199 }
3200
3201 DEFUN("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3202 Decode a JISX0208 character of Shift-JIS coding-system.
3203 CODE is the character code in Shift-JIS as a cons of type bytes.
3204 Return the corresponding character.
3205 */
3206       (code))
3207 {
3208         unsigned char c1, c2, s1, s2;
3209
3210         CHECK_CONS(code);
3211         CHECK_INT(XCAR(code));
3212         CHECK_INT(XCDR(code));
3213         s1 = XINT(XCAR(code));
3214         s2 = XINT(XCDR(code));
3215         if (BYTE_SJIS_TWO_BYTE_1_P(s1) && BYTE_SJIS_TWO_BYTE_2_P(s2)) {
3216                 DECODE_SJIS(s1, s2, c1, c2);
3217                 return make_char(MAKE_CHAR(Vcharset_japanese_jisx0208,
3218                                            c1 & 0x7F, c2 & 0x7F));
3219         } else
3220                 return Qnil;
3221 }
3222
3223 DEFUN("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3224 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3225 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3226 */
3227       (character))
3228 {
3229         Lisp_Object charset;
3230         int c1, c2, s1, s2;
3231
3232         CHECK_CHAR_COERCE_INT(character);
3233         BREAKUP_CHAR(XCHAR(character), charset, c1, c2);
3234         if (EQ(charset, Vcharset_japanese_jisx0208)) {
3235                 ENCODE_SJIS(c1 | 0x80, c2 | 0x80, s1, s2);
3236                 return Fcons(make_int(s1), make_int(s2));
3237         } else
3238                 return Qnil;
3239 }
3240 \f
3241 /************************************************************************/
3242 /*                            Big5 methods                              */
3243 /************************************************************************/
3244
3245 /* BIG5 is a coding system encoding two character sets: ASCII and
3246    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3247    character set and is encoded in two-byte.
3248
3249    --- CODE RANGE of BIG5 ---
3250    (character set)      (range)
3251    ASCII                0x00 .. 0x7F
3252    Big5 (1st byte)      0xA1 .. 0xFE
3253         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3254    --------------------------
3255
3256    Since the number of characters in Big5 is larger than maximum
3257    characters in Emacs' charset (96x96), it can't be handled as one
3258    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
3259    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
3260    contains frequently used characters and the latter contains less
3261    frequently used characters.  */
3262
3263 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3264   ((c) >= 0xA1 && (c) <= 0xFE)
3265
3266 /* Is this the second byte of a Shift-JIS two-byte char? */
3267
3268 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3269   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3270
3271 /* Number of Big5 characters which have the same code in 1st byte.  */
3272
3273 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3274
3275 /* Code conversion macros.  These are macros because they are used in
3276    inner loops during code conversion.
3277
3278    Note that temporary variables in macros introduce the classic
3279    dynamic-scoping problems with variable names.  We use capital-
3280    lettered variables in the assumption that SXEmacs does not use
3281    capital letters in variables except in a very formalized way
3282    (e.g. Qstring). */
3283
3284 /* Convert Big5 code (b1, b2) into its internal string representation
3285    (lb, c1, c2). */
3286
3287 /* There is a much simpler way to split the Big5 charset into two.
3288    For the moment I'm going to leave the algorithm as-is because it
3289    claims to separate out the most-used characters into a single
3290    charset, which perhaps will lead to optimizations in various
3291    places.
3292
3293    The way the algorithm works is something like this:
3294
3295    Big5 can be viewed as a 94x157 charset, where the row is
3296    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3297    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
3298    the split between low and high column numbers is apparently
3299    meaningless; ascending rows produce less and less frequent chars.
3300    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3301    the first charset, and the upper half (0xC9 .. 0xFE) to the
3302    second.  To do the conversion, we convert the character into
3303    a single number where 0 .. 156 is the first row, 157 .. 313
3304    is the second, etc.  That way, the characters are ordered by
3305    decreasing frequency.  Then we just chop the space in two
3306    and coerce the result into a 94x94 space.
3307    */
3308
3309 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
3310 {                                                                       \
3311   int B1 = b1, B2 = b2;                                                 \
3312   unsigned int I                                                        \
3313     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
3314                                                                         \
3315   if (B1 < 0xC9)                                                        \
3316     {                                                                   \
3317       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
3318     }                                                                   \
3319   else                                                                  \
3320     {                                                                   \
3321       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
3322       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
3323     }                                                                   \
3324   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
3325   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
3326 } while (0)
3327
3328 /* Convert the internal string representation of a Big5 character
3329    (lb, c1, c2) into Big5 code (b1, b2). */
3330
3331 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
3332 {                                                                       \
3333   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
3334                                                                         \
3335   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
3336     {                                                                   \
3337       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
3338     }                                                                   \
3339   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
3340   b2 = I % BIG5_SAME_ROW;                                               \
3341   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
3342 } while (0)
3343
3344 static int
3345 detect_coding_big5(struct detection_state *st, const Extbyte * src,
3346                    Lstream_data_count n)
3347 {
3348         while (n--) {
3349                 const unsigned char c = *(const unsigned char *)src++;
3350                 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3351                     (c >= 0x80 && c <= 0xA0))
3352                         return 0;
3353                 if (st->big5.in_second_byte) {
3354                         st->big5.in_second_byte = 0;
3355                         if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3356                                 return 0;
3357                 } else if (c >= 0xA1)
3358                         st->big5.in_second_byte = 1;
3359         }
3360         return CODING_CATEGORY_BIG5_MASK;
3361 }
3362
3363 /* Convert Big5 data to internal format. */
3364
3365 static void
3366 decode_coding_big5(lstream_t decoding, const Extbyte * src,
3367                    unsigned_char_dynarr * dst, Lstream_data_count n)
3368 {
3369         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3370         unsigned int flags = str->flags;
3371         unsigned int ch = str->ch;
3372         eol_type_t eol_type = str->eol_type;
3373
3374         while (n--) {
3375                 const unsigned char c = *(const unsigned char *)src++;
3376                 if (ch) {
3377                         /* Previous character was first byte of Big5 char. */
3378                         if (BYTE_BIG5_TWO_BYTE_2_P(c)) {
3379                                 unsigned char b1, b2, b3;
3380                                 DECODE_BIG5(ch, c, b1, b2, b3);
3381                                 Dynarr_add(dst, b1);
3382                                 Dynarr_add(dst, b2);
3383                                 Dynarr_add(dst, b3);
3384                         } else {
3385                                 DECODE_ADD_BINARY_CHAR(ch, dst);
3386                                 DECODE_ADD_BINARY_CHAR(c, dst);
3387                         }
3388                         ch = 0;
3389                 } else {
3390                         DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3391                         if (BYTE_BIG5_TWO_BYTE_1_P(c))
3392                                 ch = c;
3393                         else
3394                                 DECODE_ADD_BINARY_CHAR(c, dst);
3395                 }
3396               label_continue_loop:;
3397         }
3398
3399         DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
3400
3401         str->flags = flags;
3402         str->ch = ch;
3403 }
3404
3405 /* Convert internally-formatted data to Big5. */
3406
3407 static void
3408 encode_coding_big5(lstream_t encoding, const Bufbyte * src,
3409                    unsigned_char_dynarr * dst, Lstream_data_count n)
3410 {
3411         unsigned char c;
3412         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3413         unsigned int flags = str->flags;
3414         unsigned int ch = str->ch;
3415         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3416
3417         while (n--) {
3418                 c = *src++;
3419                 if (c == '\n') {
3420                         if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3421                                 Dynarr_add(dst, '\r');
3422                         if (eol_type != EOL_CR)
3423                                 Dynarr_add(dst, '\n');
3424                 } else if (BYTE_ASCII_P(c)) {
3425                         /* ASCII. */
3426                         Dynarr_add(dst, c);
3427                 } else if (BUFBYTE_LEADING_BYTE_P(c)) {
3428                         if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3429                             c == LEADING_BYTE_CHINESE_BIG5_2) {
3430                                 /* A recognized leading byte. */
3431                                 ch = c;
3432                                 continue;       /* not done with this character. */
3433                         }
3434                         /* otherwise just ignore this character. */
3435                 } else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3436                            ch == LEADING_BYTE_CHINESE_BIG5_2) {
3437                         /* Previous char was a recognized leading byte. */
3438                         ch = (ch << 8) | c;
3439                         continue;       /* not done with this character. */
3440                 } else if (ch) {
3441                         /* Encountering second byte of a Big5 character. */
3442                         unsigned char b1, b2;
3443
3444                         ENCODE_BIG5(ch >> 8, ch & 0xFF, c, b1, b2);
3445                         Dynarr_add(dst, b1);
3446                         Dynarr_add(dst, b2);
3447                 }
3448
3449                 ch = 0;
3450         }
3451
3452         str->flags = flags;
3453         str->ch = ch;
3454 }
3455
3456 DEFUN("decode-big5-char", Fdecode_big5_char, 1, 1, 0,   /*
3457 Decode a Big5 character CODE of BIG5 coding-system.
3458 CODE is the character code in BIG5, a cons of two integers.
3459 Return the corresponding character.
3460 */
3461       (code))
3462 {
3463         unsigned char c1, c2, b1, b2;
3464
3465         CHECK_CONS(code);
3466         CHECK_INT(XCAR(code));
3467         CHECK_INT(XCDR(code));
3468         b1 = XINT(XCAR(code));
3469         b2 = XINT(XCDR(code));
3470         if (BYTE_BIG5_TWO_BYTE_1_P(b1) && BYTE_BIG5_TWO_BYTE_2_P(b2)) {
3471                 int leading_byte;
3472                 Lisp_Object charset;
3473                 DECODE_BIG5(b1, b2, leading_byte, c1, c2);
3474                 charset = CHARSET_BY_LEADING_BYTE(leading_byte);
3475                 return make_char(MAKE_CHAR(charset, c1 & 0x7F, c2 & 0x7F));
3476         } else
3477                 return Qnil;
3478 }
3479
3480 DEFUN("encode-big5-char", Fencode_big5_char, 1, 1, 0,   /*
3481 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3482 Return the corresponding character code in Big5.
3483 */
3484       (character))
3485 {
3486         Lisp_Object charset;
3487         int c1, c2, b1, b2;
3488
3489         CHECK_CHAR_COERCE_INT(character);
3490         BREAKUP_CHAR(XCHAR(character), charset, c1, c2);
3491         if (EQ(charset, Vcharset_chinese_big5_1) ||
3492             EQ(charset, Vcharset_chinese_big5_2)) {
3493                 ENCODE_BIG5(XCHARSET_LEADING_BYTE(charset), c1 | 0x80,
3494                             c2 | 0x80, b1, b2);
3495                 return Fcons(make_int(b1), make_int(b2));
3496         } else
3497                 return Qnil;
3498 }
3499 \f
3500 /************************************************************************/
3501 /*                           UCS-4 methods                              */
3502 /*                                                                      */
3503 /*  UCS-4 character codes are implemented as nonnegative integers.      */
3504 /*                                                                      */
3505 /************************************************************************/
3506
3507 DEFUN("set-ucs-char", Fset_ucs_char, 2, 2, 0,   /*
3508 Map UCS-4 code CODE to Mule character CHARACTER.
3509
3510 Return T on success, NIL on failure.
3511 */
3512       (code, character))
3513 {
3514         size_t c;
3515
3516         CHECK_CHAR(character);
3517         CHECK_NATNUM(code);
3518         c = XINT(code);
3519
3520         if (c < countof(fcd->ucs_to_mule_table)) {
3521                 fcd->ucs_to_mule_table[c] = character;
3522                 return Qt;
3523         } else
3524                 return Qnil;
3525 }
3526
3527 static Lisp_Object ucs_to_char(unsigned long code)
3528 {
3529         if (code < countof(fcd->ucs_to_mule_table)) {
3530                 return fcd->ucs_to_mule_table[code];
3531         } else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14)) {
3532                 unsigned int c;
3533
3534                 code -= 0xe00000;
3535                 c = code % (94 * 94);
3536                 return make_char
3537                     (MAKE_CHAR(CHARSET_BY_ATTRIBUTES
3538                                (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3539                                 CHARSET_LEFT_TO_RIGHT),
3540                                c / 94 + 33, c % 94 + 33));
3541         } else
3542                 return Qnil;
3543 }
3544
3545 DEFUN("ucs-char", Fucs_char, 1, 1, 0,   /*
3546 Return Mule character corresponding to UCS code CODE (a positive integer).
3547 */
3548       (code))
3549 {
3550         CHECK_NATNUM(code);
3551         return ucs_to_char(XINT(code));
3552 }
3553
3554 DEFUN("set-char-ucs", Fset_char_ucs, 2, 2, 0,   /*
3555 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3556 */
3557       (character, code))
3558 {
3559         /* #### Isn't this gilding the lily?  Fput_char_table checks its args.
3560            Fset_char_ucs is more restrictive on index arg, but should
3561            check code arg in a char_table method. */
3562         CHECK_CHAR(character);
3563         CHECK_NATNUM(code);
3564         return Fput_char_table(character, code, mule_to_ucs_table);
3565 }
3566
3567 DEFUN("char-ucs", Fchar_ucs, 1, 1, 0,   /*
3568 Return the UCS code (a positive integer) corresponding to CHARACTER.
3569 */
3570       (character))
3571 {
3572         return Fget_char_table(character, mule_to_ucs_table);
3573 }
3574
3575 /* Decode a UCS-4 character into a buffer.  If the lookup fails, use
3576    <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3577    is not found, instead.
3578    #### do something more appropriate (use blob?)
3579         Danger, Will Robinson!  Data loss.  Should we signal user? */
3580 static void decode_ucs4(unsigned long ch, unsigned_char_dynarr * dst)
3581 {
3582         Lisp_Object chr = ucs_to_char(ch);
3583
3584         if (!NILP(chr)) {
3585                 Bufbyte work[MAX_EMCHAR_LEN];
3586                 int len;
3587
3588                 ch = XCHAR(chr);
3589                 len = (ch < 128) ?
3590                     simple_set_charptr_emchar(work, ch) :
3591                     non_ascii_set_charptr_emchar(work, ch);
3592                 Dynarr_add_many(dst, work, len);
3593         } else {
3594                 Dynarr_add(dst, LEADING_BYTE_JAPANESE_JISX0208);
3595                 Dynarr_add(dst, 34 + 128);
3596                 Dynarr_add(dst, 46 + 128);
3597         }
3598 }
3599
3600 static unsigned long
3601 mule_char_to_ucs4(Lisp_Object charset, unsigned char h, unsigned char l)
3602 {
3603         Lisp_Object code
3604             = Fget_char_table(make_char(MAKE_CHAR(charset, h & 127, l & 127)),
3605                               mule_to_ucs_table);
3606
3607         if (INTP(code)) {
3608                 return XINT(code);
3609         } else if ((XCHARSET_DIMENSION(charset) == 2) &&
3610                    (XCHARSET_CHARS(charset) == 94)) {
3611                 unsigned char final = XCHARSET_FINAL(charset);
3612
3613                 if (('@' <= final) && (final < 0x7f)) {
3614                         return 0xe00000 + (final - '@') * 94 * 94
3615                             + ((h & 127) - 33) * 94 + (l & 127) - 33;
3616                 } else {
3617                         return '?';
3618                 }
3619         } else {
3620                 return '?';
3621         }
3622 }
3623
3624 static void
3625 encode_ucs4(Lisp_Object charset,
3626             unsigned char h, unsigned char l, unsigned_char_dynarr * dst)
3627 {
3628         unsigned long code = mule_char_to_ucs4(charset, h, l);
3629         Dynarr_add(dst, code >> 24);
3630         Dynarr_add(dst, (code >> 16) & 255);
3631         Dynarr_add(dst, (code >> 8) & 255);
3632         Dynarr_add(dst, code & 255);
3633 }
3634
3635 static int
3636 detect_coding_ucs4(struct detection_state *st, const Extbyte * src,
3637                    Lstream_data_count n)
3638 {
3639         while (n--) {
3640                 const unsigned char c = *(const unsigned char *)src++;
3641                 switch (st->ucs4.in_byte) {
3642                 case 0:
3643                         if (c >= 128)
3644                                 return 0;
3645                         else
3646                                 st->ucs4.in_byte++;
3647                         break;
3648                 case 3:
3649                         st->ucs4.in_byte = 0;
3650                         break;
3651                 default:
3652                         st->ucs4.in_byte++;
3653                 }
3654         }
3655         return CODING_CATEGORY_UCS4_MASK;
3656 }
3657
3658 static void
3659 decode_coding_ucs4(lstream_t decoding, const Extbyte * src,
3660                    unsigned_char_dynarr * dst, Lstream_data_count n)
3661 {
3662         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3663         unsigned int flags = str->flags;
3664         unsigned int ch = str->ch;
3665         unsigned char counter = str->counter;
3666
3667         while (n--) {
3668                 const unsigned char c = *(const unsigned char *)src++;
3669                 switch (counter) {
3670                 case 0:
3671                         ch = c;
3672                         counter = 3;
3673                         break;
3674                 case 1:
3675                         decode_ucs4((ch << 8) | c, dst);
3676                         ch = 0;
3677                         counter = 0;
3678                         break;
3679                 default:
3680                         ch = (ch << 8) | c;
3681                         counter--;
3682                 }
3683         }
3684         if (counter & CODING_STATE_END)
3685                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
3686
3687         str->flags = flags;
3688         str->ch = ch;
3689         str->counter = counter;
3690 }
3691
3692 static void
3693 encode_coding_ucs4(lstream_t encoding, const Bufbyte * src,
3694                    unsigned_char_dynarr * dst, Lstream_data_count n)
3695 {
3696         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3697         unsigned int flags = str->flags;
3698         unsigned int ch = str->ch;
3699         unsigned char char_boundary = str->iso2022.current_char_boundary;
3700         Lisp_Object charset = str->iso2022.current_charset;
3701
3702 #ifdef ENABLE_COMPOSITE_CHARS
3703         /* flags for handling composite chars.  We do a little switcharoo
3704            on the source while we're outputting the composite char. */
3705         unsigned int saved_n = 0;
3706         const unsigned char *saved_src = NULL;
3707         int in_composite = 0;
3708
3709       back_to_square_n:
3710 #endif
3711
3712         while (n--) {
3713                 unsigned char c = *src++;
3714
3715                 if (BYTE_ASCII_P(c)) {  /* Processing ASCII character */
3716                         ch = 0;
3717                         encode_ucs4(Vcharset_ascii, c, 0, dst);
3718                         char_boundary = 1;
3719                 } else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) {   /* Processing Leading Byte */
3720                         ch = 0;
3721                         charset = CHARSET_BY_LEADING_BYTE(c);
3722                         if (LEADING_BYTE_PREFIX_P(c))
3723                                 ch = c;
3724                         char_boundary = 0;
3725                 } else {        /* Processing Non-ASCII character */
3726                         char_boundary = 1;
3727                         if (EQ(charset, Vcharset_control_1)) {
3728                                 encode_ucs4(Vcharset_control_1, c, 0, dst);
3729                         } else {
3730                                 switch (XCHARSET_REP_BYTES(charset)) {
3731                                 case 2:
3732                                         encode_ucs4(charset, c, 0, dst);
3733                                         break;
3734                                 case 3:
3735                                         if (XCHARSET_PRIVATE_P(charset)) {
3736                                                 encode_ucs4(charset, c, 0, dst);
3737                                                 ch = 0;
3738                                         } else if (ch) {
3739 #ifdef ENABLE_COMPOSITE_CHARS
3740                                                 if (EQ
3741                                                     (charset,
3742                                                      Vcharset_composite)) {
3743                                                         if (in_composite) {
3744                                                                 /* #### Bother! We don't know how to
3745                                                                    handle this yet. */
3746                                                                 Dynarr_add(dst,
3747                                                                            '\0');
3748                                                                 Dynarr_add(dst,
3749                                                                            '\0');
3750                                                                 Dynarr_add(dst,
3751                                                                            '\0');
3752                                                                 Dynarr_add(dst,
3753                                                                            '~');
3754                                                         } else {
3755                                                                 Emchar emch =
3756                                                                     MAKE_CHAR
3757                                                                     (Vcharset_composite,
3758                                                                      ch & 0x7F,
3759                                                                      c & 0x7F);
3760                                                                 Lisp_Object lstr
3761                                                                     =
3762                                                                     composite_char_string
3763                                                                     (emch);
3764                                                                 saved_n = n;
3765                                                                 saved_src = src;
3766                                                                 in_composite =
3767                                                                     1;
3768                                                                 src =
3769                                                                     XSTRING_DATA
3770                                                                     (lstr);
3771                                                                 n = XSTRING_LENGTH(lstr);
3772                                                         }
3773                                                 } else
3774 #endif                          /* ENABLE_COMPOSITE_CHARS */
3775                                                 {
3776                                                         encode_ucs4(charset, ch,
3777                                                                     c, dst);
3778                                                 }
3779                                                 ch = 0;
3780                                         } else {
3781                                                 ch = c;
3782                                                 char_boundary = 0;
3783                                         }
3784                                         break;
3785                                 case 4:
3786                                         if (ch) {
3787                                                 encode_ucs4(charset, ch, c,
3788                                                             dst);
3789                                                 ch = 0;
3790                                         } else {
3791                                                 ch = c;
3792                                                 char_boundary = 0;
3793                                         }
3794                                         break;
3795                                 default:
3796                                         abort();
3797                                 }
3798                         }
3799                 }
3800         }
3801
3802 #ifdef ENABLE_COMPOSITE_CHARS
3803         if (in_composite) {
3804                 n = saved_n;
3805                 src = saved_src;
3806                 in_composite = 0;
3807                 goto back_to_square_n;  /* Wheeeeeeeee ..... */
3808         }
3809 #endif                          /* ENABLE_COMPOSITE_CHARS */
3810
3811         str->flags = flags;
3812         str->ch = ch;
3813         str->iso2022.current_char_boundary = char_boundary;
3814         str->iso2022.current_charset = charset;
3815
3816         /* Verbum caro factum est! */
3817 }
3818 \f
3819 /************************************************************************/
3820 /*                           UTF-8 methods                              */
3821 /************************************************************************/
3822
3823 static int
3824 detect_coding_utf8(struct detection_state *st, const Extbyte * src,
3825                    Lstream_data_count n)
3826 {
3827         while (n--) {
3828                 const unsigned char c = *(const unsigned char *)src++;
3829                 switch (st->utf8.in_byte) {
3830                 case 0:
3831                         if (c == ISO_CODE_ESC || c == ISO_CODE_SI
3832                             || c == ISO_CODE_SO)
3833                                 return 0;
3834                         else if (c >= 0xfc)
3835                                 st->utf8.in_byte = 5;
3836                         else if (c >= 0xf8)
3837                                 st->utf8.in_byte = 4;
3838                         else if (c >= 0xf0)
3839                                 st->utf8.in_byte = 3;
3840                         else if (c >= 0xe0)
3841                                 st->utf8.in_byte = 2;
3842                         else if (c >= 0xc0)
3843                                 st->utf8.in_byte = 1;
3844                         else if (c >= 0x80)
3845                                 return 0;
3846                         break;
3847                 default:
3848                         if ((c & 0xc0) != 0x80)
3849                                 return 0;
3850                         else
3851                                 st->utf8.in_byte--;
3852                 }
3853         }
3854         return CODING_CATEGORY_UTF8_MASK;
3855 }
3856
3857 static void
3858 decode_coding_utf8(lstream_t decoding, const Extbyte * src,
3859                    unsigned_char_dynarr * dst, Lstream_data_count n)
3860 {
3861         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3862         unsigned int flags = str->flags;
3863         unsigned int ch = str->ch;
3864         eol_type_t eol_type = str->eol_type;
3865         unsigned char counter = str->counter;
3866
3867         while (n--) {
3868                 const unsigned char c = *(const unsigned char *)src++;
3869                 switch (counter) {
3870                 case 0:
3871                         if (c >= 0xfc) {
3872                                 ch = c & 0x01;
3873                                 counter = 5;
3874                         } else if (c >= 0xf8) {
3875                                 ch = c & 0x03;
3876                                 counter = 4;
3877                         } else if (c >= 0xf0) {
3878                                 ch = c & 0x07;
3879                                 counter = 3;
3880                         } else if (c >= 0xe0) {
3881                                 ch = c & 0x0f;
3882                                 counter = 2;
3883                         } else if (c >= 0xc0) {
3884                                 ch = c & 0x1f;
3885                                 counter = 1;
3886                         } else {
3887                                 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3888                                 decode_ucs4(c, dst);
3889                         }
3890                         break;
3891                 case 1:
3892                         ch = (ch << 6) | (c & 0x3f);
3893                         decode_ucs4(ch, dst);
3894                         ch = 0;
3895                         counter = 0;
3896                         break;
3897                 default:
3898                         ch = (ch << 6) | (c & 0x3f);
3899                         counter--;
3900                 }
3901               label_continue_loop:;
3902         }
3903
3904         if (flags & CODING_STATE_END)
3905                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
3906
3907         str->flags = flags;
3908         str->ch = ch;
3909         str->counter = counter;
3910 }
3911
3912 static void
3913 encode_utf8(Lisp_Object charset,
3914             unsigned char h, unsigned char l, unsigned_char_dynarr * dst)
3915 {
3916         unsigned long code = mule_char_to_ucs4(charset, h, l);
3917         if (code <= 0x7f) {
3918                 Dynarr_add(dst, code);
3919         } else if (code <= 0x7ff) {
3920                 Dynarr_add(dst, (code >> 6) | 0xc0);
3921                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3922         } else if (code <= 0xffff) {
3923                 Dynarr_add(dst, (code >> 12) | 0xe0);
3924                 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3925                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3926         } else if (code <= 0x1fffff) {
3927                 Dynarr_add(dst, (code >> 18) | 0xf0);
3928                 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3929                 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3930                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3931         } else if (code <= 0x3ffffff) {
3932                 Dynarr_add(dst, (code >> 24) | 0xf8);
3933                 Dynarr_add(dst, ((code >> 18) & 0x3f) | 0x80);
3934                 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3935                 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3936                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3937         } else {
3938                 Dynarr_add(dst, (code >> 30) | 0xfc);
3939                 Dynarr_add(dst, ((code >> 24) & 0x3f) | 0x80);
3940                 Dynarr_add(dst, ((code >> 18) & 0x3f) | 0x80);
3941                 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3942                 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3943                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3944         }
3945 }
3946
3947 static void
3948 encode_coding_utf8(lstream_t encoding, const Bufbyte * src,
3949                    unsigned_char_dynarr * dst, Lstream_data_count n)
3950 {
3951         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3952         unsigned int flags = str->flags;
3953         unsigned int ch = str->ch;
3954         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3955         unsigned char char_boundary = str->iso2022.current_char_boundary;
3956         Lisp_Object charset = str->iso2022.current_charset;
3957
3958 #ifdef ENABLE_COMPOSITE_CHARS
3959         /* flags for handling composite chars.  We do a little switcharoo
3960            on the source while we're outputting the composite char. */
3961         unsigned int saved_n = 0;
3962         const unsigned char *saved_src = NULL;
3963         int in_composite = 0;
3964
3965       back_to_square_n:
3966 #endif                          /* ENABLE_COMPOSITE_CHARS */
3967
3968         while (n--) {
3969                 unsigned char c = *src++;
3970
3971                 if (BYTE_ASCII_P(c)) {  /* Processing ASCII character */
3972                         ch = 0;
3973                         if (c == '\n') {
3974                                 if (eol_type != EOL_LF
3975                                     && eol_type != EOL_AUTODETECT)
3976                                         Dynarr_add(dst, '\r');
3977                                 if (eol_type != EOL_CR)
3978                                         Dynarr_add(dst, c);
3979                         } else
3980                                 encode_utf8(Vcharset_ascii, c, 0, dst);
3981                         char_boundary = 1;
3982                 } else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) {   /* Processing Leading Byte */
3983                         ch = 0;
3984                         charset = CHARSET_BY_LEADING_BYTE(c);
3985                         if (LEADING_BYTE_PREFIX_P(c))
3986                                 ch = c;
3987                         char_boundary = 0;
3988                 } else {        /* Processing Non-ASCII character */
3989                         char_boundary = 1;
3990                         if (EQ(charset, Vcharset_control_1)) {
3991                                 encode_utf8(Vcharset_control_1, c, 0, dst);
3992                         } else {
3993                                 switch (XCHARSET_REP_BYTES(charset)) {
3994                                 case 2:
3995                                         encode_utf8(charset, c, 0, dst);
3996                                         break;
3997                                 case 3:
3998                                         if (XCHARSET_PRIVATE_P(charset)) {
3999                                                 encode_utf8(charset, c, 0, dst);
4000                                                 ch = 0;
4001                                         } else if (ch) {
4002 #ifdef ENABLE_COMPOSITE_CHARS
4003                                                 if (EQ
4004                                                     (charset,
4005                                                      Vcharset_composite)) {
4006                                                         if (in_composite) {
4007                                                                 /* #### Bother! We don't know how to
4008                                                                    handle this yet. */
4009                                                                 encode_utf8
4010                                                                     (Vcharset_ascii,
4011                                                                      '~', 0,
4012                                                                      dst);
4013                                                         } else {
4014                                                                 Emchar emch =
4015                                                                     MAKE_CHAR
4016                                                                     (Vcharset_composite,
4017                                                                      ch & 0x7F,
4018                                                                      c & 0x7F);
4019                                                                 Lisp_Object lstr
4020                                                                     =
4021                                                                     composite_char_string
4022                                                                     (emch);
4023                                                                 saved_n = n;
4024                                                                 saved_src = src;
4025                                                                 in_composite =
4026                                                                     1;
4027                                                                 src =
4028                                                                     XSTRING_DATA
4029                                                                     (lstr);
4030                                                                 n = XSTRING_LENGTH(lstr);
4031                                                         }
4032                                                 } else
4033 #endif                          /* ENABLE_COMPOSITE_CHARS */
4034                                                 {
4035                                                         encode_utf8(charset, ch,
4036                                                                     c, dst);
4037                                                 }
4038                                                 ch = 0;
4039                                         } else {
4040                                                 ch = c;
4041                                                 char_boundary = 0;
4042                                         }
4043                                         break;
4044                                 case 4:
4045                                         if (ch) {
4046                                                 encode_utf8(charset, ch, c,
4047                                                             dst);
4048                                                 ch = 0;
4049                                         } else {
4050                                                 ch = c;
4051                                                 char_boundary = 0;
4052                                         }
4053                                         break;
4054                                 default:
4055                                         abort();
4056                                 }
4057                         }
4058                 }
4059         }
4060
4061 #ifdef ENABLE_COMPOSITE_CHARS
4062         if (in_composite) {
4063                 n = saved_n;
4064                 src = saved_src;
4065                 in_composite = 0;
4066                 goto back_to_square_n;  /* Wheeeeeeeee ..... */
4067         }
4068 #endif
4069
4070         str->flags = flags;
4071         str->ch = ch;
4072         str->iso2022.current_char_boundary = char_boundary;
4073         str->iso2022.current_charset = charset;
4074
4075         /* Verbum caro factum est! */
4076 }
4077 \f
4078 /************************************************************************/
4079 /*                           ISO2022 methods                            */
4080 /************************************************************************/
4081
4082 /* The following note describes the coding system ISO2022 briefly.
4083    Since the intention of this note is to help understand the
4084    functions in this file, some parts are NOT ACCURATE or OVERLY
4085    SIMPLIFIED.  For thorough understanding, please refer to the
4086    original document of ISO2022.
4087
4088    ISO2022 provides many mechanisms to encode several character sets
4089    in 7-bit and 8-bit environments.  For 7-bit environments, all text
4090    is encoded using bytes less than 128.  This may make the encoded
4091    text a little bit longer, but the text passes more easily through
4092    several gateways, some of which strip off MSB (Most Signigant Bit).
4093
4094    There are two kinds of character sets: control character set and
4095    graphic character set.  The former contains control characters such
4096    as `newline' and `escape' to provide control functions (control
4097    functions are also provided by escape sequences).  The latter
4098    contains graphic characters such as 'A' and '-'.  Emacs recognizes
4099    two control character sets and many graphic character sets.
4100
4101    Graphic character sets are classified into one of the following
4102    four classes, according to the number of bytes (DIMENSION) and
4103    number of characters in one dimension (CHARS) of the set:
4104    - DIMENSION1_CHARS94
4105    - DIMENSION1_CHARS96
4106    - DIMENSION2_CHARS94
4107    - DIMENSION2_CHARS96
4108
4109    In addition, each character set is assigned an identification tag,
4110    unique for each set, called "final character" (denoted as <F>
4111    hereafter).  The <F> of each character set is decided by ECMA(*)
4112    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
4113    (0x30..0x3F are for private use only).
4114
4115    Note (*): ECMA = European Computer Manufacturers Association
4116
4117    Here are examples of graphic character set [NAME(<F>)]:
4118         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4119         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4120         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4121         o DIMENSION2_CHARS96 -- none for the moment
4122
4123    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4124         C0 [0x00..0x1F] -- control character plane 0
4125         GL [0x20..0x7F] -- graphic character plane 0
4126         C1 [0x80..0x9F] -- control character plane 1
4127         GR [0xA0..0xFF] -- graphic character plane 1
4128
4129    A control character set is directly designated and invoked to C0 or
4130    C1 by an escape sequence.  The most common case is that:
4131    - ISO646's  control character set is designated/invoked to C0, and
4132    - ISO6429's control character set is designated/invoked to C1,
4133    and usually these designations/invocations are omitted in encoded
4134    text.  In a 7-bit environment, only C0 can be used, and a control
4135    character for C1 is encoded by an appropriate escape sequence to
4136    fit into the environment.  All control characters for C1 are
4137    defined to have corresponding escape sequences.
4138
4139    A graphic character set is at first designated to one of four
4140    graphic registers (G0 through G3), then these graphic registers are
4141    invoked to GL or GR.  These designations and invocations can be
4142    done independently.  The most common case is that G0 is invoked to
4143    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
4144    these invocations and designations are omitted in encoded text.
4145    In a 7-bit environment, only GL can be used.
4146
4147    When a graphic character set of CHARS94 is invoked to GL, codes
4148    0x20 and 0x7F of the GL area work as control characters SPACE and
4149    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4150    be used.
4151
4152    There are two ways of invocation: locking-shift and single-shift.
4153    With locking-shift, the invocation lasts until the next different
4154    invocation, whereas with single-shift, the invocation affects the
4155    following character only and doesn't affect the locking-shift
4156    state.  Invocations are done by the following control characters or
4157    escape sequences:
4158
4159    ----------------------------------------------------------------------
4160    abbrev  function                  cntrl escape seq   description
4161    ----------------------------------------------------------------------
4162    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
4163    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
4164    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
4165    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
4166    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
4167    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
4168    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
4169    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
4170    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
4171    ----------------------------------------------------------------------
4172    (*) These are not used by any known coding system.
4173
4174    Control characters for these functions are defined by macros
4175    ISO_CODE_XXX in `coding.h'.
4176
4177    Designations are done by the following escape sequences:
4178    ----------------------------------------------------------------------
4179    escape sequence      description
4180    ----------------------------------------------------------------------
4181    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
4182    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
4183    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
4184    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
4185    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
4186    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
4187    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
4188    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
4189    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
4190    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
4191    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
4192    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
4193    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
4194    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
4195    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
4196    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
4197    ----------------------------------------------------------------------
4198
4199    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4200    of dimension 1, chars 94, and final character <F>, etc...
4201
4202    Note (*): Although these designations are not allowed in ISO2022,
4203    Emacs accepts them on decoding, and produces them on encoding
4204    CHARS96 character sets in a coding system which is characterized as
4205    7-bit environment, non-locking-shift, and non-single-shift.
4206
4207    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4208    '(' can be omitted.  We refer to this as "short-form" hereafter.
4209
4210    Now you may notice that there are a lot of ways for encoding the
4211    same multilingual text in ISO2022.  Actually, there exist many
4212    coding systems such as Compound Text (used in X11's inter client
4213    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4214    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4215    localized platforms), and all of these are variants of ISO2022.
4216
4217    In addition to the above, Emacs handles two more kinds of escape
4218    sequences: ISO6429's direction specification and Emacs' private
4219    sequence for specifying character composition.
4220
4221    ISO6429's direction specification takes the following form:
4222         o CSI ']'      -- end of the current direction
4223         o CSI '0' ']'  -- end of the current direction
4224         o CSI '1' ']'  -- start of left-to-right text
4225         o CSI '2' ']'  -- start of right-to-left text
4226    The control character CSI (0x9B: control sequence introducer) is
4227    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4228
4229    Character composition specification takes the following form:
4230         o ESC '0' -- start character composition
4231         o ESC '1' -- end character composition
4232    Since these are not standard escape sequences of any ISO standard,
4233    their use with these meanings is restricted to Emacs only.  */
4234
4235 static void
4236 reset_iso2022(Lisp_Object coding_system, struct iso2022_decoder *iso)
4237 {
4238         int i;
4239
4240         for (i = 0; i < 4; i++) {
4241                 if (!NILP(coding_system))
4242                         iso->charset[i] =
4243                             XCODING_SYSTEM_ISO2022_INITIAL_CHARSET
4244                             (coding_system, i);
4245                 else
4246                         iso->charset[i] = Qt;
4247                 iso->invalid_designated[i] = 0;
4248         }
4249         iso->esc = ISO_ESC_NOTHING;
4250         iso->esc_bytes_index = 0;
4251         iso->register_left = 0;
4252         iso->register_right = 1;
4253         iso->switched_dir_and_no_valid_charset_yet = 0;
4254         iso->invalid_switch_dir = 0;
4255         iso->output_direction_sequence = 0;
4256         iso->output_literally = 0;
4257 #ifdef ENABLE_COMPOSITE_CHARS
4258         if (iso->composite_chars)
4259                 Dynarr_reset(iso->composite_chars);
4260 #endif
4261 }
4262
4263 static int fit_to_be_escape_quoted(unsigned char c)
4264 {
4265         switch (c) {
4266         case ISO_CODE_ESC:
4267         case ISO_CODE_CSI:
4268         case ISO_CODE_SS2:
4269         case ISO_CODE_SS3:
4270         case ISO_CODE_SO:
4271         case ISO_CODE_SI:
4272                 return 1;
4273
4274         default:
4275                 return 0;
4276         }
4277 }
4278
4279 /* Parse one byte of an ISO2022 escape sequence.
4280    If the result is an invalid escape sequence, return 0 and
4281    do not change anything in STR.  Otherwise, if the result is
4282    an incomplete escape sequence, update ISO2022.ESC and
4283    ISO2022.ESC_BYTES and return -1.  Otherwise, update
4284    all the state variables (but not ISO2022.ESC_BYTES) and
4285    return 1.
4286
4287    If CHECK_INVALID_CHARSETS is non-zero, check for designation
4288    or invocation of an invalid character set and treat that as
4289    an unrecognized escape sequence.
4290
4291    ********************************************************************
4292
4293    #### Strategies for error annotation and coding orthogonalization
4294
4295    We really want to separate out a number of things.  Conceptually,
4296    there is a nested syntax.
4297
4298    At the top level is the ISO 2022 extension syntax, including charset
4299    designation and invocation, and certain auxiliary controls such as the
4300    ISO 6429 direction specification.  These are octet-oriented, with the
4301    single exception (AFAIK) of the "exit Unicode" sequence which uses the
4302    UTF's natural width (1 byte for UTF-7 and UTF-8, 2 bytes for UCS-2 and
4303    UTF-16, and 4 bytes for UCS-4 and UTF-32).  This will be treated as a
4304    (deprecated) special case in Unicode processing.
4305
4306    The middle layer is ISO 2022 character interpretation.  This will depend
4307    on the current state of the ISO 2022 registers, and assembles octets
4308    into the character's internal representation.
4309
4310    The lowest level is translating system control conventions.  At present
4311    this is restricted to newline translation, but one could imagine doing
4312    tab conversion or line wrapping here.  "Escape from Unicode" processing
4313    would be done at this level.
4314
4315    At each level the parser will verify the syntax.  In the case of a
4316    syntax error or warning (such as a redundant escape sequence that affects
4317    no characters), the parser will take some action, typically inserting the
4318    erroneous octets directly into the output and creating an annotation
4319    which can be used by higher level I/O to mark the affected region.
4320
4321    This should make it possible to do something sensible about separating
4322    newline convention processing from character construction, and about
4323    preventing ISO 2022 escape sequences from being recognized
4324    inappropriately.
4325
4326    The basic strategy will be to have octet classification tables, and
4327    switch processing according to the table entry.
4328
4329    It's possible that, by doing the processing with tables of functions or
4330    the like, the parser can be used for both detection and translation. */
4331
4332 static int
4333 parse_iso2022_esc(Lisp_Object codesys, struct iso2022_decoder *iso,
4334                   unsigned char c, unsigned int *flags,
4335                   int check_invalid_charsets)
4336 {
4337         /* (1) If we're at the end of a designation sequence, CS is the
4338            charset being designated and REG is the register to designate
4339            it to.
4340
4341            (2) If we're at the end of a locking-shift sequence, REG is
4342            the register to invoke and HALF (0 == left, 1 == right) is
4343            the half to invoke it into.
4344
4345            (3) If we're at the end of a single-shift sequence, REG is
4346            the register to invoke. */
4347         Lisp_Object cs = Qnil;
4348         int reg, half;
4349
4350         /* NOTE: This code does goto's all over the fucking place.
4351            The reason for this is that we're basically implementing
4352            a state machine here, and hierarchical languages like C
4353            don't really provide a clean way of doing this. */
4354
4355         if (!(*flags & CODING_STATE_ESCAPE))
4356                 /* At beginning of escape sequence; we need to reset our
4357                    escape-state variables. */
4358                 iso->esc = ISO_ESC_NOTHING;
4359
4360         iso->output_literally = 0;
4361         iso->output_direction_sequence = 0;
4362
4363         switch (iso->esc) {
4364         case ISO_ESC_NOTHING:
4365                 iso->esc_bytes_index = 0;
4366                 switch (c) {
4367                 case ISO_CODE_ESC:      /* Start escape sequence */
4368                         *flags |= CODING_STATE_ESCAPE;
4369                         iso->esc = ISO_ESC;
4370                         goto not_done;
4371
4372                 case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
4373                         *flags |= CODING_STATE_ESCAPE;
4374                         iso->esc = ISO_ESC_5_11;
4375                         goto not_done;
4376
4377                 case ISO_CODE_SO:       /* locking shift 1 */
4378                         reg = 1;
4379                         half = 0;
4380                         goto locking_shift;
4381                 case ISO_CODE_SI:       /* locking shift 0 */
4382                         reg = 0;
4383                         half = 0;
4384                         goto locking_shift;
4385
4386                 case ISO_CODE_SS2:      /* single shift */
4387                         reg = 2;
4388                         goto single_shift;
4389                 case ISO_CODE_SS3:      /* single shift */
4390                         reg = 3;
4391                         goto single_shift;
4392
4393                 default:        /* Other control characters */
4394                         return 0;
4395                 }
4396
4397         case ISO_ESC:
4398                 switch (c) {
4399           /**** single shift ****/
4400
4401                 case 'N':       /* single shift 2 */
4402                         reg = 2;
4403                         goto single_shift;
4404                 case 'O':       /* single shift 3 */
4405                         reg = 3;
4406                         goto single_shift;
4407
4408           /**** locking shift ****/
4409
4410                 case '~':       /* locking shift 1 right */
4411                         reg = 1;
4412                         half = 1;
4413                         goto locking_shift;
4414                 case 'n':       /* locking shift 2 */
4415                         reg = 2;
4416                         half = 0;
4417                         goto locking_shift;
4418                 case '}':       /* locking shift 2 right */
4419                         reg = 2;
4420                         half = 1;
4421                         goto locking_shift;
4422                 case 'o':       /* locking shift 3 */
4423                         reg = 3;
4424                         half = 0;
4425                         goto locking_shift;
4426                 case '|':       /* locking shift 3 right */
4427                         reg = 3;
4428                         half = 1;
4429                         goto locking_shift;
4430
4431 #ifdef ENABLE_COMPOSITE_CHARS
4432           /**** composite ****/
4433
4434                 case '0':
4435                         iso->esc = ISO_ESC_START_COMPOSITE;
4436                         *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4437                             CODING_STATE_COMPOSITE;
4438                         return 1;
4439
4440                 case '1':
4441                         iso->esc = ISO_ESC_END_COMPOSITE;
4442                         *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4443                             ~CODING_STATE_COMPOSITE;
4444                         return 1;
4445 #endif                          /* ENABLE_COMPOSITE_CHARS */
4446
4447           /**** directionality ****/
4448
4449                 case '[':
4450                         iso->esc = ISO_ESC_5_11;
4451                         goto not_done;
4452
4453           /**** designation ****/
4454
4455                 case '$':       /* multibyte charset prefix */
4456                         iso->esc = ISO_ESC_2_4;
4457                         goto not_done;
4458
4459                 default:
4460                         if (0x28 <= c && c <= 0x2F) {
4461                                 iso->esc =
4462                                     (enum iso_esc_flag)(c - 0x28 + ISO_ESC_2_8);
4463                                 goto not_done;
4464                         }
4465
4466                         /* This function is called with CODESYS equal to nil when
4467                            doing coding-system detection. */
4468                         if (!NILP(codesys)
4469                             && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
4470                             && fit_to_be_escape_quoted(c)) {
4471                                 iso->esc = ISO_ESC_LITERAL;
4472                                 *flags &= CODING_STATE_ISO2022_LOCK;
4473                                 return 1;
4474                         }
4475
4476                         /* bzzzt! */
4477                         return 0;
4478                 }
4479
4480       /**** directionality ****/
4481
4482         case ISO_ESC_5_11:      /* ISO6429 direction control */
4483                 if (c == ']') {
4484                         *flags &=
4485                             (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4486                         goto directionality;
4487                 }
4488                 if (c == '0')
4489                         iso->esc = ISO_ESC_5_11_0;
4490                 else if (c == '1')
4491                         iso->esc = ISO_ESC_5_11_1;
4492                 else if (c == '2')
4493                         iso->esc = ISO_ESC_5_11_2;
4494                 else
4495                         return 0;
4496                 goto not_done;
4497
4498         case ISO_ESC_5_11_0:
4499                 if (c == ']') {
4500                         *flags &=
4501                             (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4502                         goto directionality;
4503                 }
4504                 return 0;
4505
4506         case ISO_ESC_5_11_1:
4507                 if (c == ']') {
4508                         *flags =
4509                             (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4510                         goto directionality;
4511                 }
4512                 return 0;
4513
4514         case ISO_ESC_5_11_2:
4515                 if (c == ']') {
4516                         *flags =
4517                             (*flags & CODING_STATE_ISO2022_LOCK) |
4518                             CODING_STATE_R2L;
4519                         goto directionality;
4520                 }
4521                 return 0;
4522
4523         directionality:
4524                 iso->esc = ISO_ESC_DIRECTIONALITY;
4525                 /* Various junk here to attempt to preserve the direction
4526                    sequences literally in the text if they would otherwise be
4527                    swallowed due to invalid designations that don't show up as
4528                    actual charset changes in the text. */
4529                 if (iso->invalid_switch_dir) {
4530                         /* We already inserted a direction switch literally into
4531                            the text.  We assume (#### this may not be right)
4532                            that the next direction switch is the one going the
4533                            other way, and we need to output that literally as
4534                            well. */
4535                         iso->output_literally = 1;
4536                         iso->invalid_switch_dir = 0;
4537                 } else {
4538                         int jj;
4539
4540                         /* If we are in the thrall of an invalid designation,
4541                            then stick the directionality sequence literally into
4542                            the output stream so it ends up in the original text
4543                            again. */
4544                         for (jj = 0; jj < 4; jj++)
4545                                 if (iso->invalid_designated[jj])
4546                                         break;
4547                         if (jj < 4) {
4548                                 iso->output_literally = 1;
4549                                 iso->invalid_switch_dir = 1;
4550                         } else
4551                                 /* Indicate that we haven't yet seen a valid
4552                                    designation, so that if a switch-dir is
4553                                    directly followed by an invalid designation,
4554                                    both get inserted literally. */
4555                                 iso->switched_dir_and_no_valid_charset_yet = 1;
4556                 }
4557                 return 1;
4558
4559                 /**** designation ****/
4560
4561         case ISO_ESC_2_4:
4562                 if (0x28 <= c && c <= 0x2F) {
4563                         iso->esc =
4564                             (enum iso_esc_flag)(c - 0x28 + ISO_ESC_2_4_8);
4565                         goto not_done;
4566                 }
4567                 if (0x40 <= c && c <= 0x42) {
4568                         cs = CHARSET_BY_ATTRIBUTES(CHARSET_TYPE_94X94, c,
4569                                                    *flags & CODING_STATE_R2L ?
4570                                                    CHARSET_RIGHT_TO_LEFT :
4571                                                    CHARSET_LEFT_TO_RIGHT);
4572                         reg = 0;
4573                         goto designated;
4574                 }
4575                 return 0;
4576
4577                 /* list the rest */
4578         case ISO_ESC_2_8:
4579         case ISO_ESC_2_9:
4580         case ISO_ESC_2_10:
4581         case ISO_ESC_2_11:
4582         case ISO_ESC_2_12:
4583         case ISO_ESC_2_13:
4584         case ISO_ESC_2_14:
4585         case ISO_ESC_2_15:
4586         case ISO_ESC_2_4_8:
4587         case ISO_ESC_2_4_9:
4588         case ISO_ESC_2_4_10:
4589         case ISO_ESC_2_4_11:
4590         case ISO_ESC_2_4_12:
4591         case ISO_ESC_2_4_13:
4592         case ISO_ESC_2_4_14:
4593         case ISO_ESC_2_4_15:
4594         case ISO_ESC_SINGLE_SHIFT:
4595         case ISO_ESC_LOCKING_SHIFT:
4596         case ISO_ESC_DESIGNATE:
4597         case ISO_ESC_DIRECTIONALITY:
4598         case ISO_ESC_LITERAL:
4599
4600         default: {
4601                 int type = -1;
4602
4603                 if (c < '0' || c > '~')
4604                         return 0;       /* bad final byte */
4605
4606                 if (iso->esc >= ISO_ESC_2_8 && iso->esc <= ISO_ESC_2_15) {
4607                         type = ((iso->esc >= ISO_ESC_2_12) ?
4608                                 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4609                         reg = (iso->esc - ISO_ESC_2_8) & 3;
4610                 } else if (iso->esc >= ISO_ESC_2_4_8 &&
4611                            iso->esc <= ISO_ESC_2_4_15) {
4612                         type = ((iso->esc >= ISO_ESC_2_4_12) ?
4613                                 CHARSET_TYPE_96X96 :
4614                                 CHARSET_TYPE_94X94);
4615                         reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4616                 } else {
4617                         /* Can this ever be reached? -slb */
4618                         abort();
4619                         return 0;
4620                 }
4621
4622                 cs = CHARSET_BY_ATTRIBUTES(type, c,
4623                                            *flags & CODING_STATE_R2L ?
4624                                            CHARSET_RIGHT_TO_LEFT :
4625                                            CHARSET_LEFT_TO_RIGHT);
4626                 goto designated;
4627         }
4628         }
4629
4630       not_done:
4631         iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char)c;
4632         return -1;
4633
4634       single_shift:
4635         if (check_invalid_charsets && !CHARSETP(iso->charset[reg]))
4636                 /* can't invoke something that ain't there. */
4637                 return 0;
4638         iso->esc = ISO_ESC_SINGLE_SHIFT;
4639         *flags &= CODING_STATE_ISO2022_LOCK;
4640         if (reg == 2)
4641                 *flags |= CODING_STATE_SS2;
4642         else
4643                 *flags |= CODING_STATE_SS3;
4644         return 1;
4645
4646       locking_shift:
4647         if (check_invalid_charsets && !CHARSETP(iso->charset[reg]))
4648                 /* can't invoke something that ain't there. */
4649                 return 0;
4650         if (half)
4651                 iso->register_right = reg;
4652         else
4653                 iso->register_left = reg;
4654         *flags &= CODING_STATE_ISO2022_LOCK;
4655         iso->esc = ISO_ESC_LOCKING_SHIFT;
4656         return 1;
4657
4658       designated:
4659         if (NILP(cs) && check_invalid_charsets) {
4660                 iso->invalid_designated[reg] = 1;
4661                 iso->charset[reg] = Vcharset_ascii;
4662                 iso->esc = ISO_ESC_DESIGNATE;
4663                 *flags &= CODING_STATE_ISO2022_LOCK;
4664                 iso->output_literally = 1;
4665                 if (iso->switched_dir_and_no_valid_charset_yet) {
4666                         /* We encountered a switch-direction followed by an
4667                            invalid designation.  Ensure that the switch-direction
4668                            gets outputted; otherwise it will probably get eaten
4669                            when the text is written out again. */
4670                         iso->switched_dir_and_no_valid_charset_yet = 0;
4671                         iso->output_direction_sequence = 1;
4672                         /* And make sure that the switch-dir going the other
4673                            way gets outputted, as well. */
4674                         iso->invalid_switch_dir = 1;
4675                 }
4676                 return 1;
4677         }
4678         /* This function is called with CODESYS equal to nil when
4679            doing coding-system detection. */
4680         if (!NILP(codesys)) {
4681                 charset_conversion_spec_dynarr *dyn =
4682                     XCODING_SYSTEM(codesys)->iso2022.input_conv;
4683
4684                 if (dyn) {
4685                         int i;
4686
4687                         for (i = 0; i < Dynarr_length(dyn); i++) {
4688                                 struct charset_conversion_spec *spec =
4689                                     Dynarr_atp(dyn, i);
4690                                 if (EQ(cs, spec->from_charset))
4691                                         cs = spec->to_charset;
4692                         }
4693                 }
4694         }
4695
4696         iso->charset[reg] = cs;
4697         iso->esc = ISO_ESC_DESIGNATE;
4698         *flags &= CODING_STATE_ISO2022_LOCK;
4699         if (iso->invalid_designated[reg]) {
4700                 iso->invalid_designated[reg] = 0;
4701                 iso->output_literally = 1;
4702         }
4703         if (iso->switched_dir_and_no_valid_charset_yet)
4704                 iso->switched_dir_and_no_valid_charset_yet = 0;
4705         return 1;
4706 }
4707
4708 static int
4709 detect_coding_iso2022(struct detection_state *st, const Extbyte * src,
4710                       Lstream_data_count n)
4711 {
4712         int mask;
4713
4714         /* #### There are serious deficiencies in the recognition mechanism
4715            here.  This needs to be much smarter if it's going to cut it.
4716            The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4717            it should be detected as Latin-1.
4718            All the ISO2022 stuff in this file should be synced up with the
4719            code from FSF Emacs-20.4, in which Mule should be more or less stable.
4720            Perhaps we should wait till R2L works in FSF Emacs? */
4721
4722         if (!st->iso2022.initted) {
4723                 reset_iso2022(Qnil, &st->iso2022.iso);
4724                 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4725                                     CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4726                                     CODING_CATEGORY_ISO_8_1_MASK |
4727                                     CODING_CATEGORY_ISO_8_2_MASK |
4728                                     CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4729                 st->iso2022.flags = 0;
4730                 st->iso2022.high_byte_count = 0;
4731                 st->iso2022.saw_single_shift = 0;
4732                 st->iso2022.initted = 1;
4733         }
4734
4735         mask = st->iso2022.mask;
4736
4737         while (n--) {
4738                 const unsigned char c = *(const unsigned char *)src++;
4739                 if (c >= 0xA0) {
4740                         mask &= ~CODING_CATEGORY_ISO_7_MASK;
4741                         st->iso2022.high_byte_count++;
4742                 } else {
4743                         if (st->iso2022.high_byte_count
4744                             && !st->iso2022.saw_single_shift) {
4745                                 if (st->iso2022.high_byte_count & 1)
4746                                         /* odd number of high bytes; assume not iso-8-2 */
4747                                         mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4748                         }
4749                         st->iso2022.high_byte_count = 0;
4750                         st->iso2022.saw_single_shift = 0;
4751                         if (c > 0x80)
4752                                 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4753                 }
4754                 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4755                     && (BYTE_C0_P(c) || BYTE_C1_P(c))) {        /* control chars */
4756                         switch (c) {
4757                                 /* Allow and ignore control characters that you might
4758                                    reasonably see in a text file */
4759                         case '\r':
4760                         case '\n':
4761                         case '\t':
4762                         case 7: /* bell */
4763                         case 8: /* backspace */
4764                         case 11:        /* vertical tab */
4765                         case 12:        /* form feed */
4766                         case 26:        /* MS-DOS C-z junk */
4767                         case 31:        /* '^_' -- for info */
4768                                 goto label_continue_loop;
4769
4770                         default:
4771                                 break;
4772                         }
4773                 }
4774
4775                 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P(c)
4776                     || BYTE_C1_P(c)) {
4777                         if (parse_iso2022_esc(Qnil, &st->iso2022.iso, c,
4778                                               &st->iso2022.flags, 0)) {
4779                                 switch (st->iso2022.iso.esc) {
4780                                 case ISO_ESC_DESIGNATE:
4781                                         mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4782                                         mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4783                                         break;
4784                                 case ISO_ESC_LOCKING_SHIFT:
4785                                         mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4786                                         goto ran_out_of_chars;
4787                                 case ISO_ESC_SINGLE_SHIFT:
4788                                         mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4789                                         st->iso2022.saw_single_shift = 1;
4790                                         break;
4791
4792                                         /* list the rest */
4793                                 case ISO_ESC_NOTHING:
4794                                 case ISO_ESC:
4795                                 case ISO_ESC_2_4:
4796                                 case ISO_ESC_2_8:
4797                                 case ISO_ESC_2_9:
4798                                 case ISO_ESC_2_10:
4799                                 case ISO_ESC_2_11:
4800                                 case ISO_ESC_2_12:
4801                                 case ISO_ESC_2_13:
4802                                 case ISO_ESC_2_14:
4803                                 case ISO_ESC_2_15:
4804                                 case ISO_ESC_2_4_8:
4805                                 case ISO_ESC_2_4_9:
4806                                 case ISO_ESC_2_4_10:
4807                                 case ISO_ESC_2_4_11:
4808                                 case ISO_ESC_2_4_12:
4809                                 case ISO_ESC_2_4_13:
4810                                 case ISO_ESC_2_4_14:
4811                                 case ISO_ESC_2_4_15:
4812                                 case ISO_ESC_5_11:
4813                                 case ISO_ESC_5_11_0:
4814                                 case ISO_ESC_5_11_1:
4815                                 case ISO_ESC_5_11_2:
4816                                 case ISO_ESC_DIRECTIONALITY:
4817                                 case ISO_ESC_LITERAL:
4818                                 default:
4819                                         break;
4820                                 }
4821                         } else {
4822                                 mask = 0;
4823                                 goto ran_out_of_chars;
4824                         }
4825                 }
4826         label_continue_loop:;
4827         }
4828
4829 ran_out_of_chars:
4830         return mask;
4831 }
4832
4833 static int postprocess_iso2022_mask(int mask)
4834 {
4835         /* #### kind of cheesy */
4836         /* If seven-bit ISO is allowed, then assume that the encoding is
4837            entirely seven-bit and turn off the eight-bit ones. */
4838         if (mask & CODING_CATEGORY_ISO_7_MASK)
4839                 mask &= ~(CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4840                           CODING_CATEGORY_ISO_8_1_MASK |
4841                           CODING_CATEGORY_ISO_8_2_MASK);
4842         return mask;
4843 }
4844
4845 /* If FLAGS is a null pointer or specifies right-to-left motion,
4846    output a switch-dir-to-left-to-right sequence to DST.
4847    Also update FLAGS if it is not a null pointer.
4848    If INTERNAL_P is set, we are outputting in internal format and
4849    need to handle the CSI differently. */
4850
4851 static void
4852 restore_left_to_right_direction(Lisp_Coding_System * codesys,
4853                                 unsigned_char_dynarr * dst,
4854                                 unsigned int *flags, int internal_p)
4855 {
4856         if (!flags || (*flags & CODING_STATE_R2L)) {
4857                 if (CODING_SYSTEM_ISO2022_SEVEN(codesys)) {
4858                         Dynarr_add(dst, ISO_CODE_ESC);
4859                         Dynarr_add(dst, '[');
4860                 } else if (internal_p)
4861                         DECODE_ADD_BINARY_CHAR(ISO_CODE_CSI, dst);
4862                 else
4863                         Dynarr_add(dst, ISO_CODE_CSI);
4864                 Dynarr_add(dst, '0');
4865                 Dynarr_add(dst, ']');
4866                 if (flags)
4867                         *flags &= ~CODING_STATE_R2L;
4868         }
4869 }
4870
4871 /* If FLAGS is a null pointer or specifies a direction different from
4872    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4873    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4874    sequence to DST.  Also update FLAGS if it is not a null pointer.
4875    If INTERNAL_P is set, we are outputting in internal format and
4876    need to handle the CSI differently. */
4877
4878 static void
4879 ensure_correct_direction(int direction, Lisp_Coding_System * codesys,
4880                          unsigned_char_dynarr * dst, unsigned int *flags,
4881                          int internal_p)
4882 {
4883         if ((!flags || (*flags & CODING_STATE_R2L)) &&
4884             direction == CHARSET_LEFT_TO_RIGHT)
4885                 restore_left_to_right_direction(codesys, dst, flags,
4886                                                 internal_p);
4887         else if (!CODING_SYSTEM_ISO2022_NO_ISO6429(codesys)
4888                  && (!flags || !(*flags & CODING_STATE_R2L)) &&
4889                  direction == CHARSET_RIGHT_TO_LEFT) {
4890                 if (CODING_SYSTEM_ISO2022_SEVEN(codesys)) {
4891                         Dynarr_add(dst, ISO_CODE_ESC);
4892                         Dynarr_add(dst, '[');
4893                 } else if (internal_p)
4894                         DECODE_ADD_BINARY_CHAR(ISO_CODE_CSI, dst);
4895                 else
4896                         Dynarr_add(dst, ISO_CODE_CSI);
4897                 Dynarr_add(dst, '2');
4898                 Dynarr_add(dst, ']');
4899                 if (flags)
4900                         *flags |= CODING_STATE_R2L;
4901         }
4902 }
4903
4904 /* Convert ISO2022-format data to internal format. */
4905
4906 static void
4907 decode_coding_iso2022(lstream_t decoding, const Extbyte * src,
4908                       unsigned_char_dynarr * dst, Lstream_data_count n)
4909 {
4910         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
4911         unsigned int flags = str->flags;
4912         unsigned int ch = str->ch;
4913         eol_type_t eol_type = str->eol_type;
4914 #ifdef ENABLE_COMPOSITE_CHARS
4915         unsigned_char_dynarr *real_dst = dst;
4916 #endif
4917         Lisp_Object coding_system;
4918
4919         XSETCODING_SYSTEM(coding_system, str->codesys);
4920
4921 #ifdef ENABLE_COMPOSITE_CHARS
4922         if (flags & CODING_STATE_COMPOSITE)
4923                 dst = str->iso2022.composite_chars;
4924 #endif                          /* ENABLE_COMPOSITE_CHARS */
4925
4926         while (n--) {
4927                 const unsigned char c = *(const unsigned char *)src++;
4928                 if (flags & CODING_STATE_ESCAPE) {
4929                         /* Within ESC sequence */
4930                         int retval = parse_iso2022_esc(
4931                                 coding_system, &str->iso2022, c, &flags, 1);
4932
4933                         if (retval) {
4934                                 switch (str->iso2022.esc) {
4935 #ifdef ENABLE_COMPOSITE_CHARS
4936                                 case ISO_ESC_START_COMPOSITE:
4937                                         if (str->iso2022.composite_chars)
4938                                                 Dynarr_reset(str->iso2022.
4939                                                              composite_chars);
4940                                         else
4941                                                 str->iso2022.composite_chars =
4942                                                     Dynarr_new(unsigned_char);
4943                                         dst = str->iso2022.composite_chars;
4944                                         break;
4945                                 case ISO_ESC_END_COMPOSITE:
4946                                         {
4947                                                 Bufbyte comstr[MAX_EMCHAR_LEN];
4948                                                 Bytecount len;
4949                                                 Emchar emch =
4950                                                     lookup_composite_char
4951                                                     (Dynarr_atp(dst, 0),
4952                                                      Dynarr_length(dst));
4953                                                 dst = real_dst;
4954                                                 len =
4955                                                     set_charptr_emchar(comstr,
4956                                                                        emch);
4957                                                 Dynarr_add_many(dst, comstr,
4958                                                                 len);
4959                                                 break;
4960                                         }
4961 #endif                          /* ENABLE_COMPOSITE_CHARS */
4962
4963                                 case ISO_ESC_LITERAL:
4964                                         DECODE_ADD_BINARY_CHAR(c, dst);
4965                                         break;
4966
4967                                 case ISO_ESC_NOTHING:
4968                                 case ISO_ESC:
4969                                 case ISO_ESC_2_4:
4970                                 case ISO_ESC_2_8:
4971                                 case ISO_ESC_2_9:
4972                                 case ISO_ESC_2_10:
4973                                 case ISO_ESC_2_11:
4974                                 case ISO_ESC_2_12:
4975                                 case ISO_ESC_2_13:
4976                                 case ISO_ESC_2_14:
4977                                 case ISO_ESC_2_15:
4978                                 case ISO_ESC_2_4_8:
4979                                 case ISO_ESC_2_4_9:
4980                                 case ISO_ESC_2_4_10:
4981                                 case ISO_ESC_2_4_11:
4982                                 case ISO_ESC_2_4_12:
4983                                 case ISO_ESC_2_4_13:
4984                                 case ISO_ESC_2_4_14:
4985                                 case ISO_ESC_2_4_15:
4986                                 case ISO_ESC_5_11:
4987                                 case ISO_ESC_5_11_0:
4988                                 case ISO_ESC_5_11_1:
4989                                 case ISO_ESC_5_11_2:
4990                                 case ISO_ESC_SINGLE_SHIFT:
4991                                 case ISO_ESC_LOCKING_SHIFT:
4992                                 case ISO_ESC_DESIGNATE:
4993                                 case ISO_ESC_DIRECTIONALITY:
4994
4995                                 default:
4996                                         /* Everything else handled already */
4997                                         break;
4998                                 }
4999                         }
5000
5001                         /* Attempted error recovery. */
5002                         if (str->iso2022.output_direction_sequence)
5003                                 ensure_correct_direction(flags &
5004                                                          CODING_STATE_R2L ?
5005                                                          CHARSET_RIGHT_TO_LEFT :
5006                                                          CHARSET_LEFT_TO_RIGHT,
5007                                                          str->codesys, dst, 0,
5008                                                          1);
5009                         /* More error recovery. */
5010                         if (!retval || str->iso2022.output_literally) {
5011                                 /* Output the (possibly invalid) sequence */
5012                                 int i;
5013                                 for (i = 0; i < str->iso2022.esc_bytes_index;
5014                                      i++)
5015                                         DECODE_ADD_BINARY_CHAR(str->iso2022.
5016                                                                esc_bytes[i],
5017                                                                dst);
5018                                 flags &= CODING_STATE_ISO2022_LOCK;
5019                                 if (!retval)
5020                                         n++, src--;     /* Repeat the loop with the same character. */
5021                                 else {
5022                                         /* No sense in reprocessing the final byte of the
5023                                            escape sequence; it could mess things up anyway.
5024                                            Just add it now. */
5025                                         DECODE_ADD_BINARY_CHAR(c, dst);
5026                                 }
5027                         }
5028                         ch = 0;
5029                 } else if (BYTE_C0_P(c) || BYTE_C1_P(c)) {      /* Control characters */
5030
5031           /***** Error-handling *****/
5032
5033                         /* If we were in the middle of a character, dump out the
5034                            partial character. */
5035                         DECODE_OUTPUT_PARTIAL_CHAR(ch);
5036
5037                         /* If we just saw a single-shift character, dump it out.
5038                            This may dump out the wrong sort of single-shift character,
5039                            but least it will give an indication that something went
5040                            wrong. */
5041                         if (flags & CODING_STATE_SS2) {
5042                                 DECODE_ADD_BINARY_CHAR(ISO_CODE_SS2, dst);
5043                                 flags &= ~CODING_STATE_SS2;
5044                         }
5045                         if (flags & CODING_STATE_SS3) {
5046                                 DECODE_ADD_BINARY_CHAR(ISO_CODE_SS3, dst);
5047                                 flags &= ~CODING_STATE_SS3;
5048                         }
5049
5050           /***** Now handle the control characters. *****/
5051
5052                         /* Handle CR/LF */
5053                         DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5054
5055                         flags &= CODING_STATE_ISO2022_LOCK;
5056
5057                         if (!parse_iso2022_esc
5058                             (coding_system, &str->iso2022, c, &flags, 1))
5059                                 DECODE_ADD_BINARY_CHAR(c, dst);
5060                 } else {        /* Graphic characters */
5061                         Lisp_Object charset;
5062                         int lb;
5063                         int reg;
5064
5065                         DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5066
5067                         /* Now determine the charset. */
5068                         reg = ((flags & CODING_STATE_SS2) ? 2
5069                                : (flags & CODING_STATE_SS3) ? 3
5070                                : !BYTE_ASCII_P(c) ? str->iso2022.register_right
5071                                : str->iso2022.register_left);
5072                         charset = str->iso2022.charset[reg];
5073
5074                         /* Error checking: */
5075                         if (!CHARSETP(charset)
5076                             || str->iso2022.invalid_designated[reg]
5077                             ||
5078                             (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5079                              && XCHARSET_CHARS(charset) == 94))
5080                                 /* Mrmph.  We are trying to invoke a register that has no
5081                                    or an invalid charset in it, or trying to add a character
5082                                    outside the range of the charset.  Insert that char literally
5083                                    to preserve it for the output. */
5084                         {
5085                                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5086                                 DECODE_ADD_BINARY_CHAR(c, dst);
5087                         }
5088
5089                         else {
5090                                 /* Things are probably hunky-dorey. */
5091
5092                                 /* Fetch reverse charset, maybe. */
5093                                 if (((flags & CODING_STATE_R2L) &&
5094                                      XCHARSET_DIRECTION(charset) ==
5095                                      CHARSET_LEFT_TO_RIGHT)
5096                                     || (!(flags & CODING_STATE_R2L)
5097                                         && XCHARSET_DIRECTION(charset) ==
5098                                         CHARSET_RIGHT_TO_LEFT)) {
5099                                         Lisp_Object new_charset =
5100                                             XCHARSET_REVERSE_DIRECTION_CHARSET
5101                                             (charset);
5102                                         if (!NILP(new_charset))
5103                                                 charset = new_charset;
5104                                 }
5105
5106                                 lb = XCHARSET_LEADING_BYTE(charset);
5107                                 switch (XCHARSET_REP_BYTES(charset)) {
5108                                 case 1: /* ASCII */
5109                                         DECODE_OUTPUT_PARTIAL_CHAR(ch);
5110                                         Dynarr_add(dst, c & 0x7F);
5111                                         break;
5112
5113                                 case 2: /* one-byte official */
5114                                         DECODE_OUTPUT_PARTIAL_CHAR(ch);
5115                                         Dynarr_add(dst, lb);
5116                                         Dynarr_add(dst, c | 0x80);
5117                                         break;
5118
5119                                 case 3: /* one-byte private or two-byte official */
5120                                         if (XCHARSET_PRIVATE_P(charset)) {
5121                                                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5122                                                 Dynarr_add(dst,
5123                                                            PRE_LEADING_BYTE_PRIVATE_1);
5124                                                 Dynarr_add(dst, lb);
5125                                                 Dynarr_add(dst, c | 0x80);
5126                                         } else {
5127                                                 if (ch) {
5128                                                         Dynarr_add(dst, lb);
5129                                                         Dynarr_add(dst,
5130                                                                    ch | 0x80);
5131                                                         Dynarr_add(dst,
5132                                                                    c | 0x80);
5133                                                         ch = 0;
5134                                                 } else
5135                                                         ch = c;
5136                                         }
5137                                         break;
5138
5139                                 default:        /* two-byte private */
5140                                         if (ch) {
5141                                                 Dynarr_add(dst,
5142                                                            PRE_LEADING_BYTE_PRIVATE_2);
5143                                                 Dynarr_add(dst, lb);
5144                                                 Dynarr_add(dst, ch | 0x80);
5145                                                 Dynarr_add(dst, c | 0x80);
5146                                                 ch = 0;
5147                                         } else
5148                                                 ch = c;
5149                                 }
5150                         }
5151
5152                         if (!ch)
5153                                 flags &= CODING_STATE_ISO2022_LOCK;
5154                 }
5155
5156               label_continue_loop:;
5157         }
5158
5159         if (flags & CODING_STATE_END)
5160                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5161
5162         str->flags = flags;
5163         str->ch = ch;
5164 }
5165
5166 /***** ISO2022 encoder *****/
5167
5168 /* Designate CHARSET into register REG. */
5169
5170 static void
5171 iso2022_designate(Lisp_Object charset, unsigned char reg,
5172                   encoding_stream_t str, unsigned_char_dynarr * dst)
5173 {
5174         static const char inter94[] = "()*+";
5175         static const char inter96[] = ",-./";
5176         unsigned int type;
5177         unsigned char final;
5178         Lisp_Object old_charset = str->iso2022.charset[reg];
5179
5180         str->iso2022.charset[reg] = charset;
5181         if (!CHARSETP(charset))
5182                 /* charset might be an initial nil or t. */
5183                 return;
5184         type = XCHARSET_TYPE(charset);
5185         final = XCHARSET_FINAL(charset);
5186         if (!str->iso2022.force_charset_on_output[reg] &&
5187             CHARSETP(old_charset) &&
5188             XCHARSET_TYPE(old_charset) == type &&
5189             XCHARSET_FINAL(old_charset) == final)
5190                 return;
5191
5192         str->iso2022.force_charset_on_output[reg] = 0;
5193
5194         {
5195                 charset_conversion_spec_dynarr *dyn =
5196                     str->codesys->iso2022.output_conv;
5197
5198                 if (dyn) {
5199                         int i;
5200
5201                         for (i = 0; i < Dynarr_length(dyn); i++) {
5202                                 struct charset_conversion_spec *spec =
5203                                     Dynarr_atp(dyn, i);
5204                                 if (EQ(charset, spec->from_charset))
5205                                         charset = spec->to_charset;
5206                         }
5207                 }
5208         }
5209
5210         Dynarr_add(dst, ISO_CODE_ESC);
5211         switch (type) {
5212         case CHARSET_TYPE_94:
5213                 Dynarr_add(dst, inter94[reg]);
5214                 break;
5215         case CHARSET_TYPE_96:
5216                 Dynarr_add(dst, inter96[reg]);
5217                 break;
5218         case CHARSET_TYPE_94X94:
5219                 Dynarr_add(dst, '$');
5220                 if (reg != 0 || !(CODING_SYSTEM_ISO2022_SHORT(str->codesys))
5221                     || final < '@' || final > 'B')
5222                         Dynarr_add(dst, inter94[reg]);
5223                 break;
5224         case CHARSET_TYPE_96X96:
5225                 Dynarr_add(dst, '$');
5226                 Dynarr_add(dst, inter96[reg]);
5227                 break;
5228         default:
5229                 break;
5230         }
5231         Dynarr_add(dst, final);
5232 }
5233
5234 static void
5235 ensure_normal_shift(encoding_stream_t str, unsigned_char_dynarr * dst)
5236 {
5237         if (str->iso2022.register_left != 0) {
5238                 Dynarr_add(dst, ISO_CODE_SI);
5239                 str->iso2022.register_left = 0;
5240         }
5241 }
5242
5243 static void
5244 ensure_shift_out(encoding_stream_t str, unsigned_char_dynarr * dst)
5245 {
5246         if (str->iso2022.register_left != 1) {
5247                 Dynarr_add(dst, ISO_CODE_SO);
5248                 str->iso2022.register_left = 1;
5249         }
5250 }
5251
5252 /* Convert internally-formatted data to ISO2022 format. */
5253
5254 static void
5255 encode_coding_iso2022(lstream_t encoding, const Bufbyte * src,
5256                       unsigned_char_dynarr * dst, Lstream_data_count n)
5257 {
5258         unsigned char charmask, c;
5259         unsigned char char_boundary;
5260         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
5261         unsigned int flags = str->flags;
5262         unsigned int ch = str->ch;
5263         Lisp_Coding_System *codesys = str->codesys;
5264         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
5265         int i;
5266         Lisp_Object charset;
5267         int half;
5268
5269 #ifdef ENABLE_COMPOSITE_CHARS
5270         /* flags for handling composite chars.  We do a little switcharoo
5271            on the source while we're outputting the composite char. */
5272         unsigned int saved_n = 0;
5273         const unsigned char *saved_src = NULL;
5274         int in_composite = 0;
5275 #endif                          /* ENABLE_COMPOSITE_CHARS */
5276
5277         char_boundary = str->iso2022.current_char_boundary;
5278         charset = str->iso2022.current_charset;
5279         half = str->iso2022.current_half;
5280
5281 #ifdef ENABLE_COMPOSITE_CHARS
5282       back_to_square_n:
5283 #endif
5284         while (n--) {
5285                 c = *src++;
5286
5287                 if (BYTE_ASCII_P(c)) {  /* Processing ASCII character */
5288                         ch = 0;
5289
5290                         restore_left_to_right_direction(codesys, dst, &flags,
5291                                                         0);
5292
5293                         /* Make sure G0 contains ASCII */
5294                         if ((c > ' ' && c < ISO_CODE_DEL) ||
5295                             !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL(codesys)) {
5296                                 ensure_normal_shift(str, dst);
5297                                 iso2022_designate(Vcharset_ascii, 0, str, dst);
5298                         }
5299
5300                         /* If necessary, restore everything to the default state
5301                            at end-of-line */
5302                         if (c == '\n' &&
5303                             !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL(codesys))) {
5304                                 restore_left_to_right_direction(codesys, dst,
5305                                                                 &flags, 0);
5306
5307                                 ensure_normal_shift(str, dst);
5308
5309                                 for (i = 0; i < 4; i++) {
5310                                         Lisp_Object initial_charset =
5311                                             CODING_SYSTEM_ISO2022_INITIAL_CHARSET
5312                                             (codesys, i);
5313                                         iso2022_designate(initial_charset, i,
5314                                                           str, dst);
5315                                 }
5316                         }
5317                         if (c == '\n') {
5318                                 if (eol_type != EOL_LF
5319                                     && eol_type != EOL_AUTODETECT)
5320                                         Dynarr_add(dst, '\r');
5321                                 if (eol_type != EOL_CR)
5322                                         Dynarr_add(dst, c);
5323                         } else {
5324                                 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
5325                                     && fit_to_be_escape_quoted(c))
5326                                         Dynarr_add(dst, ISO_CODE_ESC);
5327                                 Dynarr_add(dst, c);
5328                         }
5329                         char_boundary = 1;
5330                 }
5331
5332                 else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) {     /* Processing Leading Byte */
5333                         ch = 0;
5334                         charset = CHARSET_BY_LEADING_BYTE(c);
5335                         if (LEADING_BYTE_PREFIX_P(c))
5336                                 ch = c;
5337                         else if (!EQ(charset, Vcharset_control_1)
5338 #ifdef ENABLE_COMPOSITE_CHARS
5339                                  && !EQ(charset, Vcharset_composite)
5340 #endif
5341                             ) {
5342                                 int reg;
5343
5344                                 ensure_correct_direction(XCHARSET_DIRECTION
5345                                                          (charset), codesys,
5346                                                          dst, &flags, 0);
5347
5348                                 /* Now determine which register to use. */
5349                                 reg = -1;
5350                                 for (i = 0; i < 4; i++) {
5351                                         if (EQ(charset, str->iso2022.charset[i])
5352                                             || EQ(charset,
5353                                                   CODING_SYSTEM_ISO2022_INITIAL_CHARSET
5354                                                   (codesys, i))) {
5355                                                 reg = i;
5356                                                 break;
5357                                         }
5358                                 }
5359
5360                                 if (reg == -1) {
5361                                         if (XCHARSET_GRAPHIC(charset) != 0) {
5362                                                 if (!NILP
5363                                                     (str->iso2022.charset[1])
5364                                                     &&
5365                                                     (!CODING_SYSTEM_ISO2022_SEVEN
5366                                                      (codesys)
5367                                                      ||
5368                                                      CODING_SYSTEM_ISO2022_LOCK_SHIFT
5369                                                      (codesys)))
5370                                                         reg = 1;
5371                                                 else if (!NILP
5372                                                          (str->iso2022.
5373                                                           charset[2]))
5374                                                         reg = 2;
5375                                                 else if (!NILP
5376                                                          (str->iso2022.
5377                                                           charset[3]))
5378                                                         reg = 3;
5379                                                 else
5380                                                         reg = 0;
5381                                         } else
5382                                                 reg = 0;
5383                                 }
5384
5385                                 iso2022_designate(charset, reg, str, dst);
5386
5387                                 /* Now invoke that register. */
5388                                 switch (reg) {
5389                                 case 0:
5390                                         ensure_normal_shift(str, dst);
5391                                         half = 0;
5392                                         break;
5393
5394                                 case 1:
5395                                         if (CODING_SYSTEM_ISO2022_SEVEN
5396                                             (codesys)) {
5397                                                 ensure_shift_out(str, dst);
5398                                                 half = 0;
5399                                         } else
5400                                                 half = 1;
5401                                         break;
5402
5403                                 case 2:
5404                                         if (CODING_SYSTEM_ISO2022_SEVEN
5405                                             (str->codesys)) {
5406                                                 Dynarr_add(dst, ISO_CODE_ESC);
5407                                                 Dynarr_add(dst, 'N');
5408                                                 half = 0;
5409                                         } else {
5410                                                 Dynarr_add(dst, ISO_CODE_SS2);
5411                                                 half = 1;
5412                                         }
5413                                         break;
5414
5415                                 case 3:
5416                                         if (CODING_SYSTEM_ISO2022_SEVEN
5417                                             (str->codesys)) {
5418                                                 Dynarr_add(dst, ISO_CODE_ESC);
5419                                                 Dynarr_add(dst, 'O');
5420                                                 half = 0;
5421                                         } else {
5422                                                 Dynarr_add(dst, ISO_CODE_SS3);
5423                                                 half = 1;
5424                                         }
5425                                         break;
5426
5427                                 default:
5428                                         abort();
5429                                 }
5430                         }
5431                         char_boundary = 0;
5432                 } else {        /* Processing Non-ASCII character */
5433                         charmask = (half == 0 ? 0x7F : 0xFF);
5434                         char_boundary = 1;
5435                         if (EQ(charset, Vcharset_control_1)) {
5436                                 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
5437                                     && fit_to_be_escape_quoted(c))
5438                                         Dynarr_add(dst, ISO_CODE_ESC);
5439                                 /* you asked for it ... */
5440                                 Dynarr_add(dst, c - 0x20);
5441                         } else {
5442                                 switch (XCHARSET_REP_BYTES(charset)) {
5443                                 case 2:
5444                                         Dynarr_add(dst, c & charmask);
5445                                         break;
5446                                 case 3:
5447                                         if (XCHARSET_PRIVATE_P(charset)) {
5448                                                 Dynarr_add(dst, c & charmask);
5449                                                 ch = 0;
5450                                         } else if (ch) {
5451 #ifdef ENABLE_COMPOSITE_CHARS
5452                                                 if (EQ
5453                                                     (charset,
5454                                                      Vcharset_composite)) {
5455                                                         if (in_composite) {
5456                                                                 /* #### Bother! We don't know how to
5457                                                                    handle this yet. */
5458                                                                 Dynarr_add(dst,
5459                                                                            '~');
5460                                                         } else {
5461                                                                 Emchar emch =
5462                                                                     MAKE_CHAR
5463                                                                     (Vcharset_composite,
5464                                                                      ch & 0x7F,
5465                                                                      c & 0x7F);
5466                                                                 Lisp_Object lstr
5467                                                                     =
5468                                                                     composite_char_string
5469                                                                     (emch);
5470                                                                 saved_n = n;
5471                                                                 saved_src = src;
5472                                                                 in_composite =
5473                                                                     1;
5474                                                                 src =
5475                                                                     XSTRING_DATA
5476                                                                     (lstr);
5477                                                                 n = XSTRING_LENGTH(lstr);
5478                                                                 Dynarr_add(dst,
5479                                                                            ISO_CODE_ESC);
5480                                                                 Dynarr_add(dst, '0');   /* start composing */
5481                                                         }
5482                                                 } else
5483 #endif                          /* ENABLE_COMPOSITE_CHARS */
5484                                                 {
5485                                                         Dynarr_add(dst,
5486                                                                    ch &
5487                                                                    charmask);
5488                                                         Dynarr_add(dst,
5489                                                                    c &
5490                                                                    charmask);
5491                                                 }
5492                                                 ch = 0;
5493                                         } else {
5494                                                 ch = c;
5495                                                 char_boundary = 0;
5496                                         }
5497                                         break;
5498                                 case 4:
5499                                         if (ch) {
5500                                                 Dynarr_add(dst, ch & charmask);
5501                                                 Dynarr_add(dst, c & charmask);
5502                                                 ch = 0;
5503                                         } else {
5504                                                 ch = c;
5505                                                 char_boundary = 0;
5506                                         }
5507                                         break;
5508                                 default:
5509                                         abort();
5510                                 }
5511                         }
5512                 }
5513         }
5514
5515 #ifdef ENABLE_COMPOSITE_CHARS
5516         if (in_composite) {
5517                 n = saved_n;
5518                 src = saved_src;
5519                 in_composite = 0;
5520                 Dynarr_add(dst, ISO_CODE_ESC);
5521                 Dynarr_add(dst, '1');   /* end composing */
5522                 goto back_to_square_n;  /* Wheeeeeeeee ..... */
5523         }
5524 #endif                          /* ENABLE_COMPOSITE_CHARS */
5525
5526         if (char_boundary && flags & CODING_STATE_END) {
5527                 restore_left_to_right_direction(codesys, dst, &flags, 0);
5528                 ensure_normal_shift(str, dst);
5529                 for (i = 0; i < 4; i++) {
5530                         Lisp_Object initial_charset =
5531                             CODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, i);
5532                         iso2022_designate(initial_charset, i, str, dst);
5533                 }
5534         }
5535
5536         str->flags = flags;
5537         str->ch = ch;
5538         str->iso2022.current_char_boundary = char_boundary;
5539         str->iso2022.current_charset = charset;
5540         str->iso2022.current_half = half;
5541
5542         /* Verbum caro factum est! */
5543 }
5544 #endif                          /* MULE */
5545 \f
5546 /************************************************************************/
5547 /*                     No-conversion methods                            */
5548 /************************************************************************/
5549
5550 /* This is used when reading in "binary" files -- i.e. files that may
5551    contain all 256 possible byte values and that are not to be
5552    interpreted as being in any particular decoding. */
5553 static void
5554 decode_coding_no_conversion(lstream_t decoding, const Extbyte * src,
5555                             unsigned_char_dynarr * dst, Lstream_data_count n)
5556 {
5557         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
5558         unsigned int flags = str->flags;
5559         unsigned int ch = str->ch;
5560         eol_type_t eol_type = str->eol_type;
5561
5562         while (n--) {
5563                 const unsigned char c = *(const unsigned char *)src++;
5564
5565                 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5566                 DECODE_ADD_BINARY_CHAR(c, dst);
5567         label_continue_loop:;
5568         }
5569
5570         DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
5571
5572         str->flags = flags;
5573         str->ch = ch;
5574 }
5575
5576 static void
5577 encode_coding_no_conversion(lstream_t encoding, const Bufbyte * src,
5578                             unsigned_char_dynarr * dst, Lstream_data_count n)
5579 {
5580         unsigned char c;
5581         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
5582         unsigned int flags = str->flags;
5583         unsigned int ch = str->ch;
5584         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
5585
5586         while (n--) {
5587                 c = *src++;
5588                 if (c == '\n') {
5589                         if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5590                                 Dynarr_add(dst, '\r');
5591                         if (eol_type != EOL_CR)
5592                                 Dynarr_add(dst, '\n');
5593                         ch = 0;
5594                 } else if (BYTE_ASCII_P(c)) {
5595                         assert(ch == 0);
5596                         Dynarr_add(dst, c);
5597                 } else if (BUFBYTE_LEADING_BYTE_P(c)) {
5598                         assert(ch == 0);
5599                         if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5600                             c == LEADING_BYTE_CONTROL_1)
5601                                 ch = c;
5602                         else
5603                                 Dynarr_add(dst, '~');   /* untranslatable character */
5604                 } else {
5605                         if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5606                                 Dynarr_add(dst, c);
5607                         else if (ch == LEADING_BYTE_CONTROL_1) {
5608                                 assert(c < 0xC0);
5609                                 Dynarr_add(dst, c - 0x20);
5610                         }
5611                         /* else it should be the second or third byte of an
5612                            untranslatable character, so ignore it */
5613                         ch = 0;
5614                 }
5615         }
5616
5617         str->flags = flags;
5618         str->ch = ch;
5619 }
5620 \f
5621 /************************************************************************/
5622 /*                             Initialization                           */
5623 /************************************************************************/
5624
5625 void syms_of_file_coding(void)
5626 {
5627         INIT_LRECORD_IMPLEMENTATION(coding_system);
5628
5629         DEFERROR_STANDARD(Qcoding_system_error, Qio_error);
5630
5631         DEFSUBR(Fcoding_system_p);
5632         DEFSUBR(Ffind_coding_system);
5633         DEFSUBR(Fget_coding_system);
5634         DEFSUBR(Fcoding_system_list);
5635         DEFSUBR(Fcoding_system_name);
5636         DEFSUBR(Fmake_coding_system);
5637         DEFSUBR(Fcopy_coding_system);
5638         DEFSUBR(Fcoding_system_canonical_name_p);
5639         DEFSUBR(Fcoding_system_alias_p);
5640         DEFSUBR(Fcoding_system_aliasee);
5641         DEFSUBR(Fdefine_coding_system_alias);
5642         DEFSUBR(Fsubsidiary_coding_system);
5643
5644         DEFSUBR(Fcoding_system_type);
5645         DEFSUBR(Fcoding_system_doc_string);
5646 #ifdef MULE
5647         DEFSUBR(Fcoding_system_charset);
5648 #endif
5649         DEFSUBR(Fcoding_system_property);
5650
5651         DEFSUBR(Fcoding_category_list);
5652         DEFSUBR(Fset_coding_priority_list);
5653         DEFSUBR(Fcoding_priority_list);
5654         DEFSUBR(Fset_coding_category_system);
5655         DEFSUBR(Fcoding_category_system);
5656
5657         DEFSUBR(Fdetect_coding_region);
5658         DEFSUBR(Fdecode_coding_region);
5659         DEFSUBR(Fencode_coding_region);
5660 #ifdef MULE
5661         DEFSUBR(Fdecode_shift_jis_char);
5662         DEFSUBR(Fencode_shift_jis_char);
5663         DEFSUBR(Fdecode_big5_char);
5664         DEFSUBR(Fencode_big5_char);
5665         DEFSUBR(Fset_ucs_char);
5666         DEFSUBR(Fucs_char);
5667         DEFSUBR(Fset_char_ucs);
5668         DEFSUBR(Fchar_ucs);
5669 #endif                          /* MULE */
5670         defsymbol(&Qcoding_systemp, "coding-system-p");
5671         defsymbol(&Qno_conversion, "no-conversion");
5672         defsymbol(&Qraw_text, "raw-text");
5673 #ifdef MULE
5674         defsymbol(&Qbig5, "big5");
5675         defsymbol(&Qshift_jis, "shift-jis");
5676         defsymbol(&Qucs4, "ucs-4");
5677         defsymbol(&Qutf8, "utf-8");
5678         defsymbol(&Qccl, "ccl");
5679         defsymbol(&Qiso2022, "iso2022");
5680 #endif                          /* MULE */
5681         defsymbol(&Qmnemonic, "mnemonic");
5682         defsymbol(&Qeol_type, "eol-type");
5683         defsymbol(&Qpost_read_conversion, "post-read-conversion");
5684         defsymbol(&Qpre_write_conversion, "pre-write-conversion");
5685
5686         defsymbol(&Qcr, "cr");
5687         defsymbol(&Qlf, "lf");
5688         defsymbol(&Qcrlf, "crlf");
5689         defsymbol(&Qeol_cr, "eol-cr");
5690         defsymbol(&Qeol_lf, "eol-lf");
5691         defsymbol(&Qeol_crlf, "eol-crlf");
5692 #ifdef MULE
5693         defsymbol(&Qcharset_g0, "charset-g0");
5694         defsymbol(&Qcharset_g1, "charset-g1");
5695         defsymbol(&Qcharset_g2, "charset-g2");
5696         defsymbol(&Qcharset_g3, "charset-g3");
5697         defsymbol(&Qforce_g0_on_output, "force-g0-on-output");
5698         defsymbol(&Qforce_g1_on_output, "force-g1-on-output");
5699         defsymbol(&Qforce_g2_on_output, "force-g2-on-output");
5700         defsymbol(&Qforce_g3_on_output, "force-g3-on-output");
5701         defsymbol(&Qno_iso6429, "no-iso6429");
5702         defsymbol(&Qinput_charset_conversion, "input-charset-conversion");
5703         defsymbol(&Qoutput_charset_conversion, "output-charset-conversion");
5704
5705         defsymbol(&Qno_ascii_eol, "no-ascii-eol");
5706         defsymbol(&Qno_ascii_cntl, "no-ascii-cntl");
5707         defsymbol(&Qseven, "seven");
5708         defsymbol(&Qlock_shift, "lock-shift");
5709         defsymbol(&Qescape_quoted, "escape-quoted");
5710 #endif                          /* MULE */
5711         defsymbol(&Qencode, "encode");
5712         defsymbol(&Qdecode, "decode");
5713
5714 #ifdef MULE
5715         defsymbol(&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5716                   "shift-jis");
5717         defsymbol(&coding_category_symbol[CODING_CATEGORY_BIG5], "big5");
5718         defsymbol(&coding_category_symbol[CODING_CATEGORY_UCS4], "ucs-4");
5719         defsymbol(&coding_category_symbol[CODING_CATEGORY_UTF8], "utf-8");
5720         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_7], "iso-7");
5721         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5722                   "iso-8-designate");
5723         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_1], "iso-8-1");
5724         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_2], "iso-8-2");
5725         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5726                   "iso-lock-shift");
5727 #endif                          /* MULE */
5728         defsymbol(&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5729                   "no-conversion");
5730 }
5731
5732 void lstream_type_create_file_coding(void)
5733 {
5734         LSTREAM_HAS_METHOD(decoding, reader);
5735         LSTREAM_HAS_METHOD(decoding, writer);
5736         LSTREAM_HAS_METHOD(decoding, rewinder);
5737         LSTREAM_HAS_METHOD(decoding, seekable_p);
5738         LSTREAM_HAS_METHOD(decoding, flusher);
5739         LSTREAM_HAS_METHOD(decoding, closer);
5740         LSTREAM_HAS_METHOD(decoding, marker);
5741
5742         LSTREAM_HAS_METHOD(encoding, reader);
5743         LSTREAM_HAS_METHOD(encoding, writer);
5744         LSTREAM_HAS_METHOD(encoding, rewinder);
5745         LSTREAM_HAS_METHOD(encoding, seekable_p);
5746         LSTREAM_HAS_METHOD(encoding, flusher);
5747         LSTREAM_HAS_METHOD(encoding, closer);
5748         LSTREAM_HAS_METHOD(encoding, marker);
5749 }
5750
5751 void vars_of_file_coding(void)
5752 {
5753         int i;
5754
5755         fcd = xnew(struct file_coding_dump);
5756         dump_add_root_struct_ptr(&fcd, &fcd_description);
5757
5758         /* Initialize to something reasonable ... */
5759         for (i = 0; i < CODING_CATEGORY_LAST; i++) {
5760                 fcd->coding_category_system[i] = Qnil;
5761                 fcd->coding_category_by_priority[i] = i;
5762         }
5763
5764         Fprovide(intern("file-coding"));
5765
5766         DEFVAR_LISP("keyboard-coding-system", &Vkeyboard_coding_system  /*
5767 Coding system used for TTY keyboard input.
5768 Not used under a windowing system.
5769                                                                          */ );
5770         Vkeyboard_coding_system = Qnil;
5771
5772         DEFVAR_LISP("terminal-coding-system", &Vterminal_coding_system  /*
5773 Coding system used for TTY display output.
5774 Not used under a windowing system.
5775                                                                          */ );
5776         Vterminal_coding_system = Qnil;
5777
5778         DEFVAR_LISP("coding-system-for-read", &Vcoding_system_for_read  /*
5779 Overriding coding system used when reading from a file or process.
5780 You should bind this variable with `let', but do not set it globally.
5781 If this is non-nil, it specifies the coding system that will be used
5782 to decode input on read operations, such as from a file or process.
5783 It overrides `buffer-file-coding-system-for-read',
5784 `insert-file-contents-pre-hook', etc.  Use those variables instead of
5785 this one for permanent changes to the environment.  */ );
5786         Vcoding_system_for_read = Qnil;
5787
5788         DEFVAR_LISP("coding-system-for-write", &Vcoding_system_for_write        /*
5789 Overriding coding system used when writing to a file or process.
5790 You should bind this variable with `let', but do not set it globally.
5791 If this is non-nil, it specifies the coding system that will be used
5792 to encode output for write operations, such as to a file or process.
5793 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
5794 Use those variables instead of this one for permanent changes to the
5795 environment.  */ );
5796         Vcoding_system_for_write = Qnil;
5797
5798         DEFVAR_LISP("file-name-coding-system", &Vfile_name_coding_system        /*
5799 Coding system used to convert pathnames when accessing files.
5800                                                                                  */ );
5801         Vfile_name_coding_system = Qnil;
5802
5803         DEFVAR_BOOL("enable-multibyte-characters", &enable_multibyte_characters /*
5804 Non-nil means the buffer contents are regarded as multi-byte form
5805 of characters, not a binary code.  This affects the display, file I/O,
5806 and behaviors of various editing commands.
5807
5808 Setting this to nil does not do anything.
5809                                                                                  */ );
5810         enable_multibyte_characters = 1;
5811 }
5812
5813 void complex_vars_of_file_coding(void)
5814 {
5815         staticpro(&Vcoding_system_hash_table);
5816         Vcoding_system_hash_table =
5817             make_lisp_hash_table(50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5818
5819         the_codesys_prop_dynarr = Dynarr_new(codesys_prop);
5820         dump_add_root_struct_ptr(&the_codesys_prop_dynarr,
5821                                  &codesys_prop_dynarr_description);
5822
5823 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
5824 {                                               \
5825   struct codesys_prop csp;                      \
5826   csp.sym = (Sym);                              \
5827   csp.prop_type = (Prop_Type);                  \
5828   Dynarr_add (the_codesys_prop_dynarr, csp);    \
5829 } while (0)
5830
5831         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qmnemonic);
5832         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_type);
5833         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_cr);
5834         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_crlf);
5835         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_lf);
5836         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5837         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5838 #ifdef MULE
5839         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g0);
5840         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g1);
5841         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g2);
5842         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g3);
5843         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5844         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5845         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5846         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5847         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qshort);
5848         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_ascii_eol);
5849         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5850         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qseven);
5851         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qlock_shift);
5852         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_iso6429);
5853         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qescape_quoted);
5854         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5855         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5856
5857         DEFINE_CODESYS_PROP(CODESYS_PROP_CCL, Qencode);
5858         DEFINE_CODESYS_PROP(CODESYS_PROP_CCL, Qdecode);
5859 #endif                          /* MULE */
5860         /* Need to create this here or we're really screwed. */
5861         Fmake_coding_system
5862             (Qraw_text, Qno_conversion,
5863              build_string
5864              ("Raw text, which means it converts only line-break-codes."),
5865              list2(Qmnemonic, build_string("Raw")));
5866
5867         Fmake_coding_system
5868             (Qbinary, Qno_conversion,
5869              build_string("Binary, which means it does not convert anything."),
5870              list4(Qeol_type, Qlf, Qmnemonic, build_string("Binary")));
5871
5872         Fdefine_coding_system_alias(Qno_conversion, Qraw_text);
5873
5874         Fdefine_coding_system_alias(Qfile_name, Qbinary);
5875
5876         Fdefine_coding_system_alias(Qterminal, Qbinary);
5877         Fdefine_coding_system_alias(Qkeyboard, Qbinary);
5878
5879         /* Need this for bootstrapping */
5880         fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5881             Fget_coding_system(Qraw_text);
5882
5883 #ifdef MULE
5884         {
5885                 size_t i;
5886
5887                 for (i = 0; i < countof(fcd->ucs_to_mule_table); i++)
5888                         fcd->ucs_to_mule_table[i] = Qnil;
5889         }
5890         staticpro(&mule_to_ucs_table);
5891         mule_to_ucs_table = Fmake_char_table(Qgeneric);
5892 #endif                          /* MULE */
5893 }