Coverity fixes from Nelson
[sxemacs] / src / mule / file-coding.c
1 /* Code conversion functions.
2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3    Copyright (C) 1995 Sun Microsystems, Inc.
4
5 This file is part of SXEmacs
6
7 SXEmacs is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
11
12 SXEmacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program.  If not, see <http://www.gnu.org/licenses/>. */
19
20
21 /* Synched up with: Mule 2.3.   Not in FSF. */
22
23 /* Rewritten by Ben Wing <ben@xemacs.org>. */
24
25 #include <config.h>
26 #include "lisp.h"
27
28 #include "buffer.h"
29 #include "elhash.h"
30 #include "ui/insdel.h"
31 #include "lstream.h"
32 #include "opaque.h"
33 #ifdef MULE
34 #include "mule-ccl.h"
35 #include "chartab.h"
36 #endif
37 #include "file-coding.h"
38
39 Lisp_Object Qcoding_system_error;
40
41 Lisp_Object Vkeyboard_coding_system;
42 Lisp_Object Vterminal_coding_system;
43 Lisp_Object Vcoding_system_for_read;
44 Lisp_Object Vcoding_system_for_write;
45 Lisp_Object Vfile_name_coding_system;
46
47 /* Table of symbols identifying each coding category. */
48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
49
50 struct file_coding_dump {
51         /* Coding system currently associated with each coding category. */
52         Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
53
54         /* Table of all coding categories in decreasing order of priority.
55            This describes a permutation of the possible coding categories. */
56         int coding_category_by_priority[CODING_CATEGORY_LAST];
57
58 #ifdef MULE
59         Lisp_Object ucs_to_mule_table[65536];
60 #endif
61 } *fcd;
62
63 static const struct lrecord_description fcd_description_1[] = {
64         {XD_LISP_OBJECT_ARRAY,
65          offsetof(struct file_coding_dump, coding_category_system),
66          CODING_CATEGORY_LAST},
67 #ifdef MULE
68         {XD_LISP_OBJECT_ARRAY,
69          offsetof(struct file_coding_dump, ucs_to_mule_table),
70          countof(fcd->ucs_to_mule_table)},
71 #endif
72         {XD_END}
73 };
74
75 static const struct struct_description fcd_description = {
76         sizeof(struct file_coding_dump),
77         fcd_description_1
78 };
79
80 Lisp_Object mule_to_ucs_table;
81
82 Lisp_Object Qcoding_systemp;
83
84 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
85 /* Qinternal in general.c */
86
87 Lisp_Object Qmnemonic, Qeol_type;
88 Lisp_Object Qcr, Qcrlf, Qlf;
89 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
90 Lisp_Object Qpost_read_conversion;
91 Lisp_Object Qpre_write_conversion;
92
93 #ifdef MULE
94 Lisp_Object Qucs4, Qutf8;
95 Lisp_Object Qbig5, Qshift_jis;
96 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
97 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
98 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
99 Lisp_Object Qno_iso6429;
100 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
101 Lisp_Object Qescape_quoted;
102 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
103 #endif
104 Lisp_Object Qencode, Qdecode;
105
106 Lisp_Object Vcoding_system_hash_table;
107
108 int enable_multibyte_characters;
109
110 #ifdef MULE
111 /* Additional information used by the ISO2022 decoder and detector. */
112 struct iso2022_decoder {
113         /* CHARSET holds the character sets currently assigned to the G0
114            through G3 variables.  It is initialized from the array
115            INITIAL_CHARSET in CODESYS. */
116         Lisp_Object charset[4];
117
118         /* Which registers are currently invoked into the left (GL) and
119            right (GR) halves of the 8-bit encoding space? */
120         int register_left, register_right;
121
122         /* ISO_ESC holds a value indicating part of an escape sequence
123            that has already been seen. */
124         enum iso_esc_flag esc;
125
126         /* This records the bytes we've seen so far in an escape sequence,
127            in case the sequence is invalid (we spit out the bytes unchanged). */
128         unsigned char esc_bytes[8];
129
130         /* Index for next byte to store in ISO escape sequence. */
131         int esc_bytes_index;
132
133 #ifdef ENABLE_COMPOSITE_CHARS
134         /* Stuff seen so far when composing a string. */
135         unsigned_char_dynarr *composite_chars;
136 #endif
137
138         /* If we saw an invalid designation sequence for a particular
139            register, we flag it here and switch to ASCII.  The next time we
140            see a valid designation for this register, we turn off the flag
141            and do the designation normally, but pretend the sequence was
142            invalid.  The effect of all this is that (most of the time) the
143            escape sequences for both the switch to the unknown charset, and
144            the switch back to the known charset, get inserted literally into
145            the buffer and saved out as such.  The hope is that we can
146            preserve the escape sequences so that the resulting written out
147            file makes sense.  If we don't do any of this, the designation
148            to the invalid charset will be preserved but that switch back
149            to the known charset will probably get eaten because it was
150            the same charset that was already present in the register. */
151         unsigned char invalid_designated[4];
152
153         /* We try to do similar things as above for direction-switching
154            sequences.  If we encountered a direction switch while an
155            invalid designation was present, or an invalid designation
156            just after a direction switch (i.e. no valid designation
157            encountered yet), we insert the direction-switch escape
158            sequence literally into the output stream, and later on
159            insert the corresponding direction-restoring escape sequence
160            literally also. */
161         unsigned int switched_dir_and_no_valid_charset_yet:1;
162         unsigned int invalid_switch_dir:1;
163
164         /* Tells the decoder to output the escape sequence literally
165            even though it was valid.  Used in the games we play to
166            avoid lossage when we encounter invalid designations. */
167         unsigned int output_literally:1;
168         /* We encountered a direction switch followed by an invalid
169            designation.  We didn't output the direction switch
170            literally because we didn't know about the invalid designation;
171            but we have to do so now. */
172         unsigned int output_direction_sequence:1;
173 };
174 #endif                          /* MULE */
175 EXFUN(Fcopy_coding_system, 2);
176 #ifdef MULE
177 struct detection_state;
178 static int detect_coding_sjis(struct detection_state *st,
179                               const Extbyte * src, Lstream_data_count n);
180 static void decode_coding_sjis(lstream_t decoding, const Extbyte * src,
181                                unsigned_char_dynarr * dst,
182                                Lstream_data_count n);
183 static void encode_coding_sjis(lstream_t encoding, const Bufbyte * src,
184                                unsigned_char_dynarr * dst,
185                                Lstream_data_count n);
186 static int detect_coding_big5(struct detection_state *st, const Extbyte * src,
187                               Lstream_data_count n);
188 static void decode_coding_big5(lstream_t decoding, const Extbyte * src,
189                                unsigned_char_dynarr * dst,
190                                Lstream_data_count n);
191 static void encode_coding_big5(lstream_t encoding, const Bufbyte * src,
192                                unsigned_char_dynarr * dst,
193                                Lstream_data_count n);
194 static int detect_coding_ucs4(struct detection_state *st, const Extbyte * src,
195                               Lstream_data_count n);
196 static void decode_coding_ucs4(lstream_t decoding, const Extbyte * src,
197                                unsigned_char_dynarr * dst,
198                                Lstream_data_count n);
199 static void encode_coding_ucs4(lstream_t encoding, const Bufbyte * src,
200                                unsigned_char_dynarr * dst,
201                                Lstream_data_count n);
202 static int detect_coding_utf8(struct detection_state *st, const Extbyte * src,
203                               Lstream_data_count n);
204 static void decode_coding_utf8(lstream_t decoding, const Extbyte * src,
205                                unsigned_char_dynarr * dst,
206                                Lstream_data_count n);
207 static void encode_coding_utf8(lstream_t encoding, const Bufbyte * src,
208                                unsigned_char_dynarr * dst,
209                                Lstream_data_count n);
210 static int postprocess_iso2022_mask(int mask);
211 static void reset_iso2022(Lisp_Object coding_system,
212                           struct iso2022_decoder *iso);
213 static int detect_coding_iso2022(struct detection_state *st,
214                                  const Extbyte * src, Lstream_data_count n);
215 static void decode_coding_iso2022(lstream_t decoding, const Extbyte * src,
216                                   unsigned_char_dynarr * dst,
217                                   Lstream_data_count n);
218 static void encode_coding_iso2022(lstream_t encoding, const Bufbyte * src,
219                                   unsigned_char_dynarr * dst,
220                                   Lstream_data_count n);
221 #endif                          /* MULE */
222 static void decode_coding_no_conversion(lstream_t decoding, const Extbyte * src,
223                                         unsigned_char_dynarr * dst,
224                                         Lstream_data_count n);
225 static void encode_coding_no_conversion(lstream_t encoding, const Bufbyte * src,
226                                         unsigned_char_dynarr * dst,
227                                         Lstream_data_count n);
228 static void mule_decode(lstream_t decoding, const Extbyte * src,
229                         unsigned_char_dynarr * dst, Lstream_data_count n);
230 static void mule_encode(lstream_t encoding, const Bufbyte * src,
231                         unsigned_char_dynarr * dst, Lstream_data_count n);
232
233 typedef struct codesys_prop codesys_prop;
234 struct codesys_prop {
235         Lisp_Object sym;
236         int prop_type;
237 };
238
239 typedef struct {
240         Dynarr_declare(codesys_prop);
241 } codesys_prop_dynarr;
242
243 static const struct lrecord_description codesys_prop_description_1[] = {
244         {XD_LISP_OBJECT, offsetof(codesys_prop, sym)},
245         {XD_END}
246 };
247
248 static const struct struct_description codesys_prop_description = {
249         sizeof(codesys_prop),
250         codesys_prop_description_1
251 };
252
253 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
254         XD_DYNARR_DESC(codesys_prop_dynarr, &codesys_prop_description),
255         {XD_END}
256 };
257
258 static const struct struct_description codesys_prop_dynarr_description = {
259         sizeof(codesys_prop_dynarr),
260         codesys_prop_dynarr_description_1
261 };
262
263 codesys_prop_dynarr *the_codesys_prop_dynarr;
264
265 enum codesys_prop_enum {
266         CODESYS_PROP_ALL_OK,
267         CODESYS_PROP_ISO2022,
268         CODESYS_PROP_CCL
269 };
270 \f
271 /************************************************************************/
272 /*                       Coding system functions                        */
273 /************************************************************************/
274
275 static Lisp_Object mark_coding_system(Lisp_Object);
276 static void print_coding_system(Lisp_Object, Lisp_Object, int);
277 static void finalize_coding_system(void *header, int for_disksave);
278
279 #ifdef MULE
280 static const struct lrecord_description ccs_description_1[] = {
281         {XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset)},
282         {XD_LISP_OBJECT, offsetof(charset_conversion_spec, to_charset)},
283         {XD_END}
284 };
285
286 static const struct struct_description ccs_description = {
287         sizeof(charset_conversion_spec),
288         ccs_description_1
289 };
290
291 static const struct lrecord_description ccsd_description_1[] = {
292         XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
293         {XD_END}
294 };
295
296 static const struct struct_description ccsd_description = {
297         sizeof(charset_conversion_spec_dynarr),
298         ccsd_description_1
299 };
300 #endif
301
302 static const struct lrecord_description coding_system_description[] = {
303         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, name)},
304         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, doc_string)},
305         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, mnemonic)},
306         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, post_read_conversion)},
307         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, pre_write_conversion)},
308         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_lf)},
309         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_crlf)},
310         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_cr)},
311 #ifdef MULE
312         {XD_LISP_OBJECT_ARRAY,
313          offsetof(Lisp_Coding_System, iso2022.initial_charset), 4},
314         {XD_STRUCT_PTR, offsetof(Lisp_Coding_System, iso2022.input_conv), 1,
315          &ccsd_description},
316         {XD_STRUCT_PTR, offsetof(Lisp_Coding_System, iso2022.output_conv), 1,
317          &ccsd_description},
318         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, ccl.decode)},
319         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, ccl.encode)},
320 #endif
321         {XD_END}
322 };
323
324 DEFINE_LRECORD_IMPLEMENTATION("coding-system", coding_system,
325                               mark_coding_system, print_coding_system,
326                               finalize_coding_system,
327                               0, 0, coding_system_description,
328                               Lisp_Coding_System);
329
330 static Lisp_Object mark_coding_system(Lisp_Object obj)
331 {
332         Lisp_Coding_System *codesys = XCODING_SYSTEM(obj);
333
334         mark_object(CODING_SYSTEM_NAME(codesys));
335         mark_object(CODING_SYSTEM_DOC_STRING(codesys));
336         mark_object(CODING_SYSTEM_MNEMONIC(codesys));
337         mark_object(CODING_SYSTEM_EOL_LF(codesys));
338         mark_object(CODING_SYSTEM_EOL_CRLF(codesys));
339         mark_object(CODING_SYSTEM_EOL_CR(codesys));
340
341         switch (CODING_SYSTEM_TYPE(codesys)) {
342 #ifdef MULE
343                 int i;
344         case CODESYS_ISO2022:
345                 for (i = 0; i < 4; i++)
346                         mark_object(CODING_SYSTEM_ISO2022_INITIAL_CHARSET
347                                     (codesys, i));
348                 if (codesys->iso2022.input_conv) {
349                         for (i = 0;
350                              i < Dynarr_length(codesys->iso2022.input_conv);
351                              i++) {
352                                 struct charset_conversion_spec *ccs =
353                                     Dynarr_atp(codesys->iso2022.input_conv, i);
354                                 mark_object(ccs->from_charset);
355                                 mark_object(ccs->to_charset);
356                         }
357                 }
358                 if (codesys->iso2022.output_conv) {
359                         for (i = 0;
360                              i < Dynarr_length(codesys->iso2022.output_conv);
361                              i++) {
362                                 struct charset_conversion_spec *ccs =
363                                     Dynarr_atp(codesys->iso2022.output_conv, i);
364                                 mark_object(ccs->from_charset);
365                                 mark_object(ccs->to_charset);
366                         }
367                 }
368                 break;
369
370         case CODESYS_CCL:
371                 mark_object(CODING_SYSTEM_CCL_DECODE(codesys));
372                 mark_object(CODING_SYSTEM_CCL_ENCODE(codesys));
373                 break;
374
375                 /* list the rest of them lot explicitly */
376         case CODESYS_AUTODETECT:
377         case CODESYS_SHIFT_JIS:
378         case CODESYS_BIG5:
379         case CODESYS_UCS4:
380         case CODESYS_UTF8:
381         case CODESYS_NO_CONVERSION:
382 #ifdef DEBUG_SXEMACS
383         case CODESYS_INTERNAL:
384 #endif
385 #endif                          /* MULE */
386         default:
387                 break;
388         }
389
390         mark_object(CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys));
391         return CODING_SYSTEM_POST_READ_CONVERSION(codesys);
392 }
393
394 static void
395 print_coding_system(Lisp_Object obj, Lisp_Object printcharfun, int escapeflag)
396 {
397         Lisp_Coding_System *c = XCODING_SYSTEM(obj);
398         if (print_readably)
399                 error("printing unreadable object #<coding-system 0x%x>",
400                       c->header.uid);
401
402         write_c_string("#<coding-system ", printcharfun);
403         print_internal(c->name, printcharfun, 1);
404         write_c_string(">", printcharfun);
405 }
406
407 static void finalize_coding_system(void *header, int for_disksave)
408 {
409         Lisp_Coding_System *c = (Lisp_Coding_System *) header;
410         /* Since coding systems never go away, this function is not
411            necessary.  But it would be necessary if we changed things
412            so that coding systems could go away. */
413         if (!for_disksave) {    /* see comment in lstream.c */
414                 switch (CODING_SYSTEM_TYPE(c)) {
415 #ifdef MULE
416                 case CODESYS_ISO2022:
417                         if (c->iso2022.input_conv) {
418                                 Dynarr_free(c->iso2022.input_conv);
419                                 c->iso2022.input_conv = 0;
420                         }
421                         if (c->iso2022.output_conv) {
422                                 Dynarr_free(c->iso2022.output_conv);
423                                 c->iso2022.output_conv = 0;
424                         }
425                         break;
426
427                         /* list the rest of them lot explicitly */
428                 case CODESYS_AUTODETECT:
429                 case CODESYS_SHIFT_JIS:
430                 case CODESYS_BIG5:
431                 case CODESYS_UCS4:
432                 case CODESYS_UTF8:
433                 case CODESYS_CCL:
434                 case CODESYS_NO_CONVERSION:
435 #ifdef DEBUG_SXEMACS
436                 case CODESYS_INTERNAL:
437 #endif
438 #endif                          /* MULE */
439                 default:
440                         break;
441                 }
442         }
443 }
444
445 static eol_type_t symbol_to_eol_type(Lisp_Object symbol)
446 {
447         CHECK_SYMBOL(symbol);
448         if (NILP(symbol))
449                 return EOL_AUTODETECT;
450         if (EQ(symbol, Qlf))
451                 return EOL_LF;
452         if (EQ(symbol, Qcrlf))
453                 return EOL_CRLF;
454         if (EQ(symbol, Qcr))
455                 return EOL_CR;
456
457         signal_simple_error("Unrecognized eol type", symbol);
458         return EOL_AUTODETECT;  /* not reached */
459 }
460
461 static Lisp_Object eol_type_to_symbol(eol_type_t type)
462 {
463         switch (type) {
464         default:
465                 abort();
466         case EOL_LF:
467                 return Qlf;
468         case EOL_CRLF:
469                 return Qcrlf;
470         case EOL_CR:
471                 return Qcr;
472         case EOL_AUTODETECT:
473                 return Qnil;
474         }
475 }
476
477 static void setup_eol_coding_systems(Lisp_Coding_System * codesys)
478 {
479         Lisp_Object codesys_obj;
480         int len = string_length(XSYMBOL(CODING_SYSTEM_NAME(codesys))->name);
481         char *codesys_name = (char *)alloca(len + 7);
482         int mlen = -1;
483         char *codesys_mnemonic = 0;
484
485         Lisp_Object codesys_name_sym, sub_codesys_obj;
486
487         /* kludge */
488
489         XSETCODING_SYSTEM(codesys_obj, codesys);
490
491         memcpy(codesys_name,
492                string_data(XSYMBOL(CODING_SYSTEM_NAME(codesys))->name), len);
493
494         if (STRINGP(CODING_SYSTEM_MNEMONIC(codesys))) {
495                 mlen = XSTRING_LENGTH(CODING_SYSTEM_MNEMONIC(codesys));
496                 codesys_mnemonic = (char *)alloca(mlen + 7);
497                 memcpy(codesys_mnemonic,
498                        XSTRING_DATA(CODING_SYSTEM_MNEMONIC(codesys)), mlen);
499         }
500 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
501   strcpy (codesys_name + len, "-" op_sys);                                      \
502   if (mlen != -1)                                                               \
503     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
504   codesys_name_sym = intern (codesys_name);                                     \
505   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
506   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
507   if (mlen != -1)                                                               \
508     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
509       build_string (codesys_mnemonic);                                          \
510   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
511 } while (0)
512
513         DEFINE_SUB_CODESYS("unix", "", EOL_LF);
514         DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
515         DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
516 }
517
518 DEFUN("coding-system-p", Fcoding_system_p, 1, 1, 0,     /*
519 Return t if OBJECT is a coding system.
520 A coding system is an object that defines how text containing multiple
521 character sets is encoded into a stream of (typically 8-bit) bytes.
522 The coding system is used to decode the stream into a series of
523 characters (which may be from multiple charsets) when the text is read
524 from a file or process, and is used to encode the text back into the
525 same format when it is written out to a file or process.
526
527 For example, many ISO2022-compliant coding systems (such as Compound
528 Text, which is used for inter-client data under the X Window System)
529 use escape sequences to switch between different charsets -- Japanese
530 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
531 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
532 `make-coding-system' for more information.
533
534 Coding systems are normally identified using a symbol, and the
535 symbol is accepted in place of the actual coding system object whenever
536 a coding system is called for. (This is similar to how faces work.)
537 */
538       (object))
539 {
540         return CODING_SYSTEMP(object) ? Qt : Qnil;
541 }
542
543 DEFUN("find-coding-system", Ffind_coding_system, 1, 1, 0,       /*
544 Retrieve the coding system of the given name.
545
546 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
547 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
548 If there is no such coding system, nil is returned.  Otherwise the
549 associated coding system object is returned.
550 */
551       (coding_system_or_name))
552 {
553         if (NILP(coding_system_or_name))
554                 coding_system_or_name = Qbinary;
555         else if (CODING_SYSTEMP(coding_system_or_name))
556                 return coding_system_or_name;
557         else
558                 CHECK_SYMBOL(coding_system_or_name);
559
560         while (1) {
561                 coding_system_or_name =
562                     Fgethash(coding_system_or_name, Vcoding_system_hash_table,
563                              Qnil);
564
565                 if (CODING_SYSTEMP(coding_system_or_name)
566                     || NILP(coding_system_or_name))
567                         return coding_system_or_name;
568         }
569 }
570
571 DEFUN("get-coding-system", Fget_coding_system, 1, 1, 0, /*
572 Retrieve the coding system of the given name.
573 Same as `find-coding-system' except that if there is no such
574 coding system, an error is signaled instead of returning nil.
575 */
576       (name))
577 {
578         Lisp_Object coding_system = Ffind_coding_system(name);
579
580         if (NILP(coding_system))
581                 signal_simple_error("No such coding system", name);
582         return coding_system;
583 }
584
585 /* We store the coding systems in hash tables with the names as the key and the
586    actual coding system object as the value.  Occasionally we need to use them
587    in a list format.  These routines provide us with that. */
588 struct coding_system_list_closure {
589         Lisp_Object *coding_system_list;
590 };
591
592 static int
593 add_coding_system_to_list_mapper(Lisp_Object key, Lisp_Object value,
594                                  void *coding_system_list_closure)
595 {
596         /* This function can GC */
597         struct coding_system_list_closure *cscl =
598             (struct coding_system_list_closure *)coding_system_list_closure;
599         Lisp_Object *coding_system_list = cscl->coding_system_list;
600
601         *coding_system_list = Fcons(key, *coding_system_list);
602         return 0;
603 }
604
605 DEFUN("coding-system-list", Fcoding_system_list, 0, 0, 0,       /*
606 Return a list of the names of all defined coding systems.
607 */
608       ())
609 {
610         Lisp_Object coding_system_list = Qnil;
611         struct gcpro gcpro1;
612         struct coding_system_list_closure coding_system_list_closure;
613
614         GCPRO1(coding_system_list);
615         coding_system_list_closure.coding_system_list = &coding_system_list;
616         elisp_maphash(add_coding_system_to_list_mapper,
617                       Vcoding_system_hash_table, &coding_system_list_closure);
618         UNGCPRO;
619
620         return coding_system_list;
621 }
622
623 DEFUN("coding-system-name", Fcoding_system_name, 1, 1, 0,       /*
624 Return the name of the given coding system.
625 */
626       (coding_system))
627 {
628         coding_system = Fget_coding_system(coding_system);
629         return XCODING_SYSTEM_NAME(coding_system);
630 }
631
632 static Lisp_Coding_System *allocate_coding_system(enum coding_system_type type,
633                                                   Lisp_Object name)
634 {
635         Lisp_Coding_System *codesys =
636             alloc_lcrecord_type(Lisp_Coding_System, &lrecord_coding_system);
637
638         zero_lcrecord(codesys);
639         CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys) = Qnil;
640         CODING_SYSTEM_POST_READ_CONVERSION(codesys) = Qnil;
641         CODING_SYSTEM_EOL_TYPE(codesys) = EOL_AUTODETECT;
642         CODING_SYSTEM_EOL_CRLF(codesys) = Qnil;
643         CODING_SYSTEM_EOL_CR(codesys) = Qnil;
644         CODING_SYSTEM_EOL_LF(codesys) = Qnil;
645         CODING_SYSTEM_TYPE(codesys) = type;
646         CODING_SYSTEM_MNEMONIC(codesys) = Qnil;
647 #ifdef MULE
648         if (type == CODESYS_ISO2022) {
649                 int i;
650                 for (i = 0; i < 4; i++)
651                         CODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, i) =
652                             Qnil;
653         } else if (type == CODESYS_CCL) {
654                 CODING_SYSTEM_CCL_DECODE(codesys) = Qnil;
655                 CODING_SYSTEM_CCL_ENCODE(codesys) = Qnil;
656         }
657 #endif                          /* MULE */
658         CODING_SYSTEM_NAME(codesys) = name;
659
660         return codesys;
661 }
662
663 #ifdef MULE
664 /* Given a list of charset conversion specs as specified in a Lisp
665    program, parse it into STORE_HERE. */
666
667 static void
668 parse_charset_conversion_specs(charset_conversion_spec_dynarr * store_here,
669                                Lisp_Object spec_list)
670 {
671         Lisp_Object rest;
672
673         EXTERNAL_LIST_LOOP(rest, spec_list) {
674                 Lisp_Object car = XCAR(rest);
675                 Lisp_Object from, to;
676                 struct charset_conversion_spec spec;
677
678                 if (!CONSP(car) || !CONSP(XCDR(car)) || !NILP(XCDR(XCDR(car))))
679                         signal_simple_error("Invalid charset conversion spec",
680                                             car);
681                 from = Fget_charset(XCAR(car));
682                 to = Fget_charset(XCAR(XCDR(car)));
683                 if (XCHARSET_TYPE(from) != XCHARSET_TYPE(to))
684                         signal_simple_error_2
685                             ("Attempted conversion between different charset types",
686                              from, to);
687                 spec.from_charset = from;
688                 spec.to_charset = to;
689
690                 Dynarr_add(store_here, spec);
691         }
692 }
693
694 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
695    specs, return the equivalent as the Lisp programmer would see it.
696
697    If LOAD_HERE is 0, return Qnil. */
698
699 static Lisp_Object
700 unparse_charset_conversion_specs(charset_conversion_spec_dynarr * load_here)
701 {
702         int i;
703         Lisp_Object result;
704
705         if (!load_here)
706                 return Qnil;
707         for (i = 0, result = Qnil; i < Dynarr_length(load_here); i++) {
708                 struct charset_conversion_spec *ccs = Dynarr_atp(load_here, i);
709                 result =
710                     Fcons(list2(ccs->from_charset, ccs->to_charset), result);
711         }
712
713         return Fnreverse(result);
714 }
715
716 #endif                          /* MULE */
717
718 DEFUN("make-coding-system", Fmake_coding_system, 2, 4, 0,       /*
719 Register symbol NAME as a coding system.
720
721 TYPE describes the conversion method used and should be one of
722
723 nil or 'undecided
724 Automatic conversion.  SXEmacs attempts to detect the coding system
725 used in the file.
726 'no-conversion
727 No conversion.  Use this for binary files and such.  On output,
728 graphic characters that are not in ASCII or Latin-1 will be
729 replaced by a ?. (For a no-conversion-encoded buffer, these
730 characters will only be present if you explicitly insert them.)
731 'shift-jis
732 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
733 'ucs-4
734 ISO 10646 UCS-4 encoding.
735 'utf-8
736 ISO 10646 UTF-8 encoding.
737 'iso2022
738 Any ISO2022-compliant encoding.  Among other things, this includes
739 JIS (the Japanese encoding commonly used for e-mail), EUC (the
740 standard Unix encoding for Japanese and other languages), and
741 Compound Text (the encoding used in X11).  You can specify more
742 specific information about the conversion with the PROPS argument.
743 'big5
744 Big5 (the encoding commonly used for Taiwanese).
745 'ccl
746 The conversion is performed using a user-written pseudo-code
747 program.  CCL (Code Conversion Language) is the name of this
748 pseudo-code.
749 'internal
750 Write out or read in the raw contents of the memory representing
751 the buffer's text.  This is primarily useful for debugging
752 purposes, and is only enabled when SXEmacs has been compiled with
753 DEBUG_SXEMACS defined (via the --debug configure option).
754 WARNING: Reading in a file using 'internal conversion can result
755 in an internal inconsistency in the memory representing a
756 buffer's text, which will produce unpredictable results and may
757 cause SXEmacs to crash.  Under normal circumstances you should
758 never use 'internal conversion.
759
760 DOC-STRING is a string describing the coding system.
761
762 PROPS is a property list, describing the specific nature of the
763 character set.  Recognized properties are:
764
765 'mnemonic
766 String to be displayed in the modeline when this coding system is
767 active.
768
769 'eol-type
770 End-of-line conversion to be used.  It should be one of
771
772 nil
773 Automatically detect the end-of-line type (LF, CRLF,
774 or CR).  Also generate subsidiary coding systems named
775 `NAME-unix', `NAME-dos', and `NAME-mac', that are
776 identical to this coding system but have an EOL-TYPE
777 value of 'lf, 'crlf, and 'cr, respectively.
778 'lf
779 The end of a line is marked externally using ASCII LF.
780 Since this is also the way that SXEmacs represents an
781 end-of-line internally, specifying this option results
782 in no end-of-line conversion.  This is the standard
783 format for Unix text files.
784 'crlf
785 The end of a line is marked externally using ASCII
786 CRLF.  This is the standard format for MS-DOS text
787 files.
788 'cr
789 The end of a line is marked externally using ASCII CR.
790 This is the standard format for Macintosh text files.
791 t
792 Automatically detect the end-of-line type but do not
793 generate subsidiary coding systems.  (This value is
794 converted to nil when stored internally, and
795 `coding-system-property' will return nil.)
796
797 'post-read-conversion
798 Function called after a file has been read in, to perform the
799 decoding.  Called with two arguments, START and END, denoting
800 a region of the current buffer to be decoded.
801
802 'pre-write-conversion
803 Function called before a file is written out, to perform the
804 encoding.  Called with two arguments, START and END, denoting
805 a region of the current buffer to be encoded.
806
807 The following additional properties are recognized if TYPE is 'iso2022:
808
809 'charset-g0
810 'charset-g1
811 'charset-g2
812 'charset-g3
813 The character set initially designated to the G0 - G3 registers.
814 The value should be one of
815
816 -- A charset object (designate that character set)
817 -- nil (do not ever use this register)
818 -- t (no character set is initially designated to
819 the register, but may be later on; this automatically
820 sets the corresponding `force-g*-on-output' property)
821
822 'force-g0-on-output
823 'force-g1-on-output
824 'force-g2-on-output
825 'force-g2-on-output
826 If non-nil, send an explicit designation sequence on output before
827 using the specified register.
828
829 'short
830 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
831 "ESC $ B" on output in place of the full designation sequences
832 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
833
834 'no-ascii-eol
835 If non-nil, don't designate ASCII to G0 at each end of line on output.
836 Setting this to non-nil also suppresses other state-resetting that
837 normally happens at the end of a line.
838
839 'no-ascii-cntl
840 If non-nil, don't designate ASCII to G0 before control chars on output.
841
842 'seven
843 If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
844 environment.
845
846 'lock-shift
847 If non-nil, use locking-shift (SO/SI) instead of single-shift
848 or designation by escape sequence.
849
850 'no-iso6429
851 If non-nil, don't use ISO6429's direction specification.
852
853 'escape-quoted
854 If non-nil, literal control characters that are the same as
855 the beginning of a recognized ISO2022 or ISO6429 escape sequence
856 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
857 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
858 so that they can be properly distinguished from an escape sequence.
859 (Note that doing this results in a non-portable encoding.) This
860 encoding flag is used for byte-compiled files.  Note that ESC
861 is a good choice for a quoting character because there are no
862 escape sequences whose second byte is a character from the Control-0
863 or Control-1 character sets; this is explicitly disallowed by the
864 ISO2022 standard.
865
866 'input-charset-conversion
867 A list of conversion specifications, specifying conversion of
868 characters in one charset to another when decoding is performed.
869 Each specification is a list of two elements: the source charset,
870 and the destination charset.
871
872 'output-charset-conversion
873 A list of conversion specifications, specifying conversion of
874 characters in one charset to another when encoding is performed.
875 The form of each specification is the same as for
876 'input-charset-conversion.
877
878 The following additional properties are recognized (and required)
879 if TYPE is 'ccl:
880
881 'decode
882 CCL program used for decoding (converting to internal format).
883
884 'encode
885 CCL program used for encoding (converting to external format).
886 */
887       (name, type, doc_string, props))
888 {
889         Lisp_Coding_System *codesys;
890         enum coding_system_type ty;
891         int need_to_setup_eol_systems = 1;
892
893         /* Convert type to constant */
894         if (NILP(type) || EQ(type, Qundecided)) {
895                 ty = CODESYS_AUTODETECT;
896         }
897 #ifdef MULE
898         else if (EQ(type, Qshift_jis)) {
899                 ty = CODESYS_SHIFT_JIS;
900         } else if (EQ(type, Qiso2022)) {
901                 ty = CODESYS_ISO2022;
902         } else if (EQ(type, Qbig5)) {
903                 ty = CODESYS_BIG5;
904         } else if (EQ(type, Qucs4)) {
905                 ty = CODESYS_UCS4;
906         } else if (EQ(type, Qutf8)) {
907                 ty = CODESYS_UTF8;
908         } else if (EQ(type, Qccl)) {
909                 ty = CODESYS_CCL;
910         }
911 #endif
912         else if (EQ(type, Qno_conversion)) {
913                 ty = CODESYS_NO_CONVERSION;
914         }
915 #ifdef DEBUG_SXEMACS
916         else if (EQ(type, Qinternal)) {
917                 ty = CODESYS_INTERNAL;
918         }
919 #endif
920         else
921                 signal_simple_error("Invalid coding system type", type);
922
923         CHECK_SYMBOL(name);
924
925         codesys = allocate_coding_system(ty, name);
926
927         if (NILP(doc_string))
928                 doc_string = build_string("");
929         else
930                 CHECK_STRING(doc_string);
931         CODING_SYSTEM_DOC_STRING(codesys) = doc_string;
932
933         {
934                 EXTERNAL_PROPERTY_LIST_LOOP_3(key, value, props) {
935                         if (EQ(key, Qmnemonic)) {
936                                 if (!NILP(value))
937                                         CHECK_STRING(value);
938                                 CODING_SYSTEM_MNEMONIC(codesys) = value;
939                         }
940
941                         else if (EQ(key, Qeol_type)) {
942                                 need_to_setup_eol_systems = NILP(value);
943                                 if (EQ(value, Qt))
944                                         value = Qnil;
945                                 CODING_SYSTEM_EOL_TYPE(codesys) =
946                                     symbol_to_eol_type(value);
947                         }
948
949                         else if (EQ(key, Qpost_read_conversion))
950                                 CODING_SYSTEM_POST_READ_CONVERSION(codesys) =
951                                     value;
952                         else if (EQ(key, Qpre_write_conversion))
953                                 CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys) =
954                                     value;
955 #ifdef MULE
956                         else if (ty == CODESYS_ISO2022) {
957 #define FROB_INITIAL_CHARSET(charset_num) \
958   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
959     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
960
961                                 if (EQ(key, Qcharset_g0))
962                                         FROB_INITIAL_CHARSET(0);
963                                 else if (EQ(key, Qcharset_g1))
964                                         FROB_INITIAL_CHARSET(1);
965                                 else if (EQ(key, Qcharset_g2))
966                                         FROB_INITIAL_CHARSET(2);
967                                 else if (EQ(key, Qcharset_g3))
968                                         FROB_INITIAL_CHARSET(3);
969
970 #define FROB_FORCE_CHARSET(charset_num) \
971   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
972
973                                 else if (EQ(key, Qforce_g0_on_output))
974                                         FROB_FORCE_CHARSET(0);
975                                 else if (EQ(key, Qforce_g1_on_output))
976                                         FROB_FORCE_CHARSET(1);
977                                 else if (EQ(key, Qforce_g2_on_output))
978                                         FROB_FORCE_CHARSET(2);
979                                 else if (EQ(key, Qforce_g3_on_output))
980                                         FROB_FORCE_CHARSET(3);
981
982 #define FROB_BOOLEAN_PROPERTY(prop) \
983   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
984
985                                 else if (EQ(key, Qshort))
986                                         FROB_BOOLEAN_PROPERTY(SHORT);
987                                 else if (EQ(key, Qno_ascii_eol))
988                                         FROB_BOOLEAN_PROPERTY(NO_ASCII_EOL);
989                                 else if (EQ(key, Qno_ascii_cntl))
990                                         FROB_BOOLEAN_PROPERTY(NO_ASCII_CNTL);
991                                 else if (EQ(key, Qseven))
992                                         FROB_BOOLEAN_PROPERTY(SEVEN);
993                                 else if (EQ(key, Qlock_shift))
994                                         FROB_BOOLEAN_PROPERTY(LOCK_SHIFT);
995                                 else if (EQ(key, Qno_iso6429))
996                                         FROB_BOOLEAN_PROPERTY(NO_ISO6429);
997                                 else if (EQ(key, Qescape_quoted))
998                                         FROB_BOOLEAN_PROPERTY(ESCAPE_QUOTED);
999
1000                                 else if (EQ(key, Qinput_charset_conversion)) {
1001                                         codesys->iso2022.input_conv =
1002                                             Dynarr_new(charset_conversion_spec);
1003                                         parse_charset_conversion_specs(codesys->
1004                                                                        iso2022.
1005                                                                        input_conv,
1006                                                                        value);
1007                                 } else if (EQ(key, Qoutput_charset_conversion)) {
1008                                         codesys->iso2022.output_conv =
1009                                             Dynarr_new(charset_conversion_spec);
1010                                         parse_charset_conversion_specs(codesys->
1011                                                                        iso2022.
1012                                                                        output_conv,
1013                                                                        value);
1014                                 } else
1015                                         signal_simple_error
1016                                             ("Unrecognized property", key);
1017                         } else if (EQ(type, Qccl)) {
1018                                 Lisp_Object sym;
1019                                 struct ccl_program test_ccl;
1020                                 Extbyte *suffix;
1021
1022                                 /* Check key first.  */
1023                                 if (EQ(key, Qdecode))
1024                                         suffix = "-ccl-decode";
1025                                 else if (EQ(key, Qencode))
1026                                         suffix = "-ccl-encode";
1027                                 else
1028                                         signal_simple_error
1029                                             ("Unrecognized property", key);
1030
1031                                 /* If value is vector, register it as a ccl program
1032                                    associated with an newly created symbol for
1033                                    backward compatibility.  */
1034                                 if (VECTORP(value)) {
1035                                         sym =
1036                                             Fintern(concat2
1037                                                     (Fsymbol_name(name),
1038                                                      build_string(suffix)),
1039                                                     Qnil);
1040                                         Fregister_ccl_program(sym, value);
1041                                 } else {
1042                                         CHECK_SYMBOL(value);
1043                                         sym = value;
1044                                 }
1045                                 /* check if the given ccl programs are valid.  */
1046                                 if (setup_ccl_program(&test_ccl, sym) < 0)
1047                                         signal_simple_error
1048                                             ("Invalid CCL program", value);
1049
1050                                 if (EQ(key, Qdecode))
1051                                         CODING_SYSTEM_CCL_DECODE(codesys) = sym;
1052                                 else if (EQ(key, Qencode))
1053                                         CODING_SYSTEM_CCL_ENCODE(codesys) = sym;
1054
1055                         }
1056 #endif                          /* MULE */
1057                         else
1058                                 signal_simple_error("Unrecognized property",
1059                                                     key);
1060                 }
1061         }
1062
1063         if (need_to_setup_eol_systems)
1064                 setup_eol_coding_systems(codesys);
1065
1066         {
1067                 Lisp_Object codesys_obj;
1068                 XSETCODING_SYSTEM(codesys_obj, codesys);
1069                 Fputhash(name, codesys_obj, Vcoding_system_hash_table);
1070                 return codesys_obj;
1071         }
1072 }
1073
1074 DEFUN("copy-coding-system", Fcopy_coding_system, 2, 2, 0,       /*
1075 Copy OLD-CODING-SYSTEM to NEW-NAME.
1076 If NEW-NAME does not name an existing coding system, a new one will
1077 be created.
1078 */
1079       (old_coding_system, new_name))
1080 {
1081         Lisp_Object new_coding_system;
1082         old_coding_system = Fget_coding_system(old_coding_system);
1083         new_coding_system = Ffind_coding_system(new_name);
1084         if (NILP(new_coding_system)) {
1085                 XSETCODING_SYSTEM(new_coding_system,
1086                                   allocate_coding_system
1087                                   (XCODING_SYSTEM_TYPE(old_coding_system),
1088                                    new_name));
1089                 Fputhash(new_name, new_coding_system,
1090                          Vcoding_system_hash_table);
1091         }
1092
1093         {
1094                 Lisp_Coding_System *to = XCODING_SYSTEM(new_coding_system);
1095                 Lisp_Coding_System *from = XCODING_SYSTEM(old_coding_system);
1096                 memcpy(((char *)to) + sizeof(to->header),
1097                        ((char *)from) + sizeof(from->header),
1098                        sizeof(*from) - sizeof(from->header));
1099                 to->name = new_name;
1100         }
1101         return new_coding_system;
1102 }
1103
1104 DEFUN("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0,       /*
1105 Return t if OBJECT names a coding system, and is not a coding system alias.
1106 */
1107       (object))
1108 {
1109         Lisp_Object val = Fgethash(object, Vcoding_system_hash_table, Qnil);
1110         return CODING_SYSTEMP(val) ? Qt : Qnil;
1111 }
1112
1113 DEFUN("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1114  Return t if OBJECT is a coding system alias.
1115 All coding system aliases are created by `define-coding-system-alias'.
1116 */
1117       (object))
1118 {
1119         Lisp_Object val = Fgethash(object, Vcoding_system_hash_table, Qzero);
1120         return SYMBOLP(val) ? Qt : Qnil;
1121 }
1122
1123 DEFUN("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1124 Return the coding-system symbol for which symbol ALIAS is an alias.
1125 */
1126       (alias))
1127 {
1128         Lisp_Object aliasee = Fgethash(alias, Vcoding_system_hash_table, Qnil);
1129         if (SYMBOLP(aliasee)) {
1130                 return aliasee;
1131         } else {
1132                 signal_simple_error(
1133                         "Symbol is not a coding system alias", alias);
1134         }
1135         /* To keep the compiler happy */
1136         return Qnil;
1137 }
1138
1139 static Lisp_Object
1140 append_suffix_to_symbol(Lisp_Object symbol, char *ascii_string)
1141 {
1142         return Fintern(concat2(Fsymbol_name(symbol),
1143                                build_string(ascii_string)), Qnil);
1144 }
1145
1146 /* A maphash function, for removing dangling coding system aliases. */
1147 static int
1148 dangling_coding_system_alias_p(Lisp_Object alias,
1149                                Lisp_Object aliasee, void *dangling_aliases)
1150 {
1151         if (SYMBOLP(aliasee)
1152             && NILP(Fgethash(aliasee, Vcoding_system_hash_table, Qnil))) {
1153                 (*(int *)dangling_aliases)++;
1154                 return 1;
1155         } else {
1156                 return 0;
1157         }
1158 }
1159
1160 DEFUN("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0,       /*
1161 Define symbol ALIAS as an alias for coding system ALIASEE.
1162
1163 You can use this function to redefine an alias that has already been defined,
1164 but you cannot redefine a name which is the canonical name for a coding system.
1165 \(a canonical name of a coding system is what is returned when you call
1166 `coding-system-name' on a coding system).
1167
1168 ALIASEE itself can be an alias, which allows you to define nested aliases.
1169
1170 You are forbidden, however, from creating alias loops or `dangling' aliases.
1171 These will be detected, and an error will be signaled if you attempt to do so.
1172
1173 If ALIASEE is nil, then ALIAS will simply be undefined.
1174
1175 See also `coding-system-alias-p', `coding-system-aliasee',
1176 and `coding-system-canonical-name-p'.
1177 */
1178       (alias, aliasee))
1179 {
1180         Lisp_Object real_coding_system, probe;
1181
1182         CHECK_SYMBOL(alias);
1183
1184         if (!NILP(Fcoding_system_canonical_name_p(alias)))
1185                 signal_simple_error
1186                     ("Symbol is the canonical name of a coding system and cannot be redefined",
1187                      alias);
1188
1189         if (NILP(aliasee)) {
1190                 Lisp_Object subsidiary_unix =
1191                     append_suffix_to_symbol(alias, "-unix");
1192                 Lisp_Object subsidiary_dos =
1193                     append_suffix_to_symbol(alias, "-dos");
1194                 Lisp_Object subsidiary_mac =
1195                     append_suffix_to_symbol(alias, "-mac");
1196
1197                 Fremhash(alias, Vcoding_system_hash_table);
1198
1199                 /* Undefine subsidiary aliases,
1200                    presumably created by a previous call to this function */
1201                 if (!NILP(Fcoding_system_alias_p(subsidiary_unix)) &&
1202                     !NILP(Fcoding_system_alias_p(subsidiary_dos)) &&
1203                     !NILP(Fcoding_system_alias_p(subsidiary_mac))) {
1204                         Fdefine_coding_system_alias(subsidiary_unix, Qnil);
1205                         Fdefine_coding_system_alias(subsidiary_dos, Qnil);
1206                         Fdefine_coding_system_alias(subsidiary_mac, Qnil);
1207                 }
1208
1209                 /* Undefine dangling coding system aliases. */
1210                 {
1211                         int dangling_aliases;
1212
1213                         do {
1214                                 dangling_aliases = 0;
1215                                 elisp_map_remhash
1216                                     (dangling_coding_system_alias_p,
1217                                      Vcoding_system_hash_table,
1218                                      &dangling_aliases);
1219                         } while (dangling_aliases > 0);
1220                 }
1221
1222                 return Qnil;
1223         }
1224
1225         if (CODING_SYSTEMP(aliasee))
1226                 aliasee = XCODING_SYSTEM_NAME(aliasee);
1227
1228         /* Checks that aliasee names a coding-system */
1229         real_coding_system = Fget_coding_system(aliasee);
1230
1231         /* Check for coding system alias loops */
1232         if (EQ(alias, aliasee))
1233               alias_loop:signal_simple_error_2
1234                     ("Attempt to create a coding system alias loop", alias,
1235                      aliasee);
1236
1237         for (probe = aliasee;
1238              SYMBOLP(probe);
1239              probe = Fgethash(probe, Vcoding_system_hash_table, Qzero)) {
1240                 if (EQ(probe, alias))
1241                         goto alias_loop;
1242         }
1243
1244         Fputhash(alias, aliasee, Vcoding_system_hash_table);
1245
1246         /* Set up aliases for subsidiaries.
1247            #### There must be a better way to handle subsidiary coding
1248            #### systems. */
1249         {
1250                 static char *suffixes[] = { "-unix", "-dos", "-mac" };
1251
1252                 for (int i = 0; i < countof(suffixes); i++) {
1253                         Lisp_Object alias_subsidiary =
1254                                 append_suffix_to_symbol(alias, suffixes[i]);
1255                         Lisp_Object aliasee_subsidiary =
1256                                 append_suffix_to_symbol(aliasee, suffixes[i]);
1257
1258                         if (!NILP(Ffind_coding_system(aliasee_subsidiary))) {
1259                                 Fdefine_coding_system_alias(alias_subsidiary,
1260                                                             aliasee_subsidiary);
1261                         }
1262                 }
1263         }
1264         /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1265            but it doesn't look intentional, so I'd rather return something
1266            meaningful or nothing at all. */
1267         return Qnil;
1268 }
1269
1270 static Lisp_Object
1271 subsidiary_coding_system(Lisp_Object coding_system, eol_type_t type)
1272 {
1273         Lisp_Coding_System *cs = XCODING_SYSTEM(coding_system);
1274         Lisp_Object new_coding_system;
1275
1276         if (CODING_SYSTEM_EOL_TYPE(cs) != EOL_AUTODETECT)
1277                 return coding_system;
1278
1279         switch (type) {
1280         case EOL_AUTODETECT:
1281                 return coding_system;
1282         case EOL_LF:
1283                 new_coding_system = CODING_SYSTEM_EOL_LF(cs);
1284                 break;
1285         case EOL_CR:
1286                 new_coding_system = CODING_SYSTEM_EOL_CR(cs);
1287                 break;
1288         case EOL_CRLF:
1289                 new_coding_system = CODING_SYSTEM_EOL_CRLF(cs);
1290                 break;
1291         default:
1292                 abort();
1293                 return Qnil;
1294         }
1295
1296         return NILP(new_coding_system) ? coding_system : new_coding_system;
1297 }
1298
1299 DEFUN("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0,   /*
1300 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1301 */
1302       (coding_system, eol_type))
1303 {
1304         coding_system = Fget_coding_system(coding_system);
1305
1306         return subsidiary_coding_system(coding_system,
1307                                         symbol_to_eol_type(eol_type));
1308 }
1309 \f
1310 /************************************************************************/
1311 /*                         Coding system accessors                      */
1312 /************************************************************************/
1313
1314 DEFUN("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0,   /*
1315 Return the doc string for CODING-SYSTEM.
1316 */
1317       (coding_system))
1318 {
1319         coding_system = Fget_coding_system(coding_system);
1320         return XCODING_SYSTEM_DOC_STRING(coding_system);
1321 }
1322
1323 DEFUN("coding-system-type", Fcoding_system_type, 1, 1, 0,       /*
1324 Return the type of CODING-SYSTEM.
1325 */
1326       (coding_system))
1327 {
1328         Lisp_Object tmp = Fget_coding_system(coding_system);
1329
1330         switch (XCODING_SYSTEM_TYPE(tmp)) {
1331         default:
1332                 abort();
1333         case CODESYS_AUTODETECT:
1334                 return Qundecided;
1335 #ifdef MULE
1336         case CODESYS_SHIFT_JIS:
1337                 return Qshift_jis;
1338         case CODESYS_ISO2022:
1339                 return Qiso2022;
1340         case CODESYS_BIG5:
1341                 return Qbig5;
1342         case CODESYS_UCS4:
1343                 return Qucs4;
1344         case CODESYS_UTF8:
1345                 return Qutf8;
1346         case CODESYS_CCL:
1347                 return Qccl;
1348 #endif
1349         case CODESYS_NO_CONVERSION:
1350                 return Qno_conversion;
1351 #ifdef DEBUG_SXEMACS
1352         case CODESYS_INTERNAL:
1353                 return Qinternal;
1354 #endif
1355         }
1356 }
1357
1358 #ifdef MULE
1359 static
1360 Lisp_Object coding_system_charset(Lisp_Object coding_system, int gnum)
1361 {
1362         Lisp_Object cs
1363             = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET(coding_system, gnum);
1364
1365         return CHARSETP(cs) ? XCHARSET_NAME(cs) : Qnil;
1366 }
1367
1368 DEFUN("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1369 Return initial charset of CODING-SYSTEM designated to GNUM.
1370 GNUM allows 0 .. 3.
1371 */
1372       (coding_system, gnum))
1373 {
1374         coding_system = Fget_coding_system(coding_system);
1375         CHECK_INT(gnum);
1376
1377         return coding_system_charset(coding_system, XINT(gnum));
1378 }
1379 #endif                          /* MULE */
1380
1381 DEFUN("coding-system-property", Fcoding_system_property, 2, 2, 0,       /*
1382 Return the PROP property of CODING-SYSTEM.
1383 */
1384       (coding_system, prop))
1385 {
1386         int i, ok = 0;
1387         enum coding_system_type type;
1388
1389         coding_system = Fget_coding_system(coding_system);
1390         CHECK_SYMBOL(prop);
1391         type = XCODING_SYSTEM_TYPE(coding_system);
1392
1393         for (i = 0; !ok && i < Dynarr_length(the_codesys_prop_dynarr); i++)
1394                 if (EQ(Dynarr_at(the_codesys_prop_dynarr, i).sym, prop)) {
1395                         ok = 1;
1396                         switch (Dynarr_at(the_codesys_prop_dynarr, i).prop_type) {
1397                         case CODESYS_PROP_ALL_OK:
1398                                 break;
1399 #ifdef MULE
1400                         case CODESYS_PROP_ISO2022:
1401                                 if (type != CODESYS_ISO2022)
1402                                         signal_simple_error
1403                                             ("Property only valid in ISO2022 coding systems",
1404                                              prop);
1405                                 break;
1406
1407                         case CODESYS_PROP_CCL:
1408                                 if (type != CODESYS_CCL)
1409                                         signal_simple_error
1410                                             ("Property only valid in CCL coding systems",
1411                                              prop);
1412                                 break;
1413 #endif                          /* MULE */
1414                         default:
1415                                 abort();
1416                         }
1417                 }
1418
1419         if (!ok)
1420                 signal_simple_error("Unrecognized property", prop);
1421
1422         if (EQ(prop, Qname))
1423                 return XCODING_SYSTEM_NAME(coding_system);
1424         else if (EQ(prop, Qtype))
1425                 return Fcoding_system_type(coding_system);
1426         else if (EQ(prop, Qdoc_string))
1427                 return XCODING_SYSTEM_DOC_STRING(coding_system);
1428         else if (EQ(prop, Qmnemonic))
1429                 return XCODING_SYSTEM_MNEMONIC(coding_system);
1430         else if (EQ(prop, Qeol_type))
1431                 return
1432                     eol_type_to_symbol(XCODING_SYSTEM_EOL_TYPE(coding_system));
1433         else if (EQ(prop, Qeol_lf))
1434                 return XCODING_SYSTEM_EOL_LF(coding_system);
1435         else if (EQ(prop, Qeol_crlf))
1436                 return XCODING_SYSTEM_EOL_CRLF(coding_system);
1437         else if (EQ(prop, Qeol_cr))
1438                 return XCODING_SYSTEM_EOL_CR(coding_system);
1439         else if (EQ(prop, Qpost_read_conversion))
1440                 return XCODING_SYSTEM_POST_READ_CONVERSION(coding_system);
1441         else if (EQ(prop, Qpre_write_conversion))
1442                 return XCODING_SYSTEM_PRE_WRITE_CONVERSION(coding_system);
1443 #ifdef MULE
1444         else if (type == CODESYS_ISO2022) {
1445                 if (EQ(prop, Qcharset_g0))
1446                         return coding_system_charset(coding_system, 0);
1447                 else if (EQ(prop, Qcharset_g1))
1448                         return coding_system_charset(coding_system, 1);
1449                 else if (EQ(prop, Qcharset_g2))
1450                         return coding_system_charset(coding_system, 2);
1451                 else if (EQ(prop, Qcharset_g3))
1452                         return coding_system_charset(coding_system, 3);
1453
1454 #define FORCE_CHARSET(charset_num) \
1455   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1456    (coding_system, charset_num) ? Qt : Qnil)
1457
1458                 else if (EQ(prop, Qforce_g0_on_output))
1459                         return FORCE_CHARSET(0);
1460                 else if (EQ(prop, Qforce_g1_on_output))
1461                         return FORCE_CHARSET(1);
1462                 else if (EQ(prop, Qforce_g2_on_output))
1463                         return FORCE_CHARSET(2);
1464                 else if (EQ(prop, Qforce_g3_on_output))
1465                         return FORCE_CHARSET(3);
1466
1467 #define LISP_BOOLEAN(prop) \
1468   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1469
1470                 else if (EQ(prop, Qshort))
1471                         return LISP_BOOLEAN(SHORT);
1472                 else if (EQ(prop, Qno_ascii_eol))
1473                         return LISP_BOOLEAN(NO_ASCII_EOL);
1474                 else if (EQ(prop, Qno_ascii_cntl))
1475                         return LISP_BOOLEAN(NO_ASCII_CNTL);
1476                 else if (EQ(prop, Qseven))
1477                         return LISP_BOOLEAN(SEVEN);
1478                 else if (EQ(prop, Qlock_shift))
1479                         return LISP_BOOLEAN(LOCK_SHIFT);
1480                 else if (EQ(prop, Qno_iso6429))
1481                         return LISP_BOOLEAN(NO_ISO6429);
1482                 else if (EQ(prop, Qescape_quoted))
1483                         return LISP_BOOLEAN(ESCAPE_QUOTED);
1484
1485                 else if (EQ(prop, Qinput_charset_conversion))
1486                         return
1487                             unparse_charset_conversion_specs
1488                             (XCODING_SYSTEM(coding_system)->iso2022.input_conv);
1489                 else if (EQ(prop, Qoutput_charset_conversion))
1490                         return
1491                             unparse_charset_conversion_specs
1492                             (XCODING_SYSTEM(coding_system)->iso2022.
1493                              output_conv);
1494                 else
1495                         abort();
1496         } else if (type == CODESYS_CCL) {
1497                 if (EQ(prop, Qdecode))
1498                         return XCODING_SYSTEM_CCL_DECODE(coding_system);
1499                 else if (EQ(prop, Qencode))
1500                         return XCODING_SYSTEM_CCL_ENCODE(coding_system);
1501                 else
1502                         abort();
1503         }
1504 #endif                          /* MULE */
1505         else
1506                 abort();
1507
1508         return Qnil;            /* not reached */
1509 }
1510 \f
1511 /************************************************************************/
1512 /*                       Coding category functions                      */
1513 /************************************************************************/
1514
1515 static int decode_coding_category(Lisp_Object symbol)
1516 {
1517         int i;
1518
1519         CHECK_SYMBOL(symbol);
1520         for (i = 0; i < CODING_CATEGORY_LAST; i++)
1521                 if (EQ(coding_category_symbol[i], symbol))
1522                         return i;
1523
1524         signal_simple_error("Unrecognized coding category", symbol);
1525         return 0;               /* not reached */
1526 }
1527
1528 DEFUN("coding-category-list", Fcoding_category_list, 0, 0, 0,   /*
1529 Return a list of all recognized coding categories.
1530 */
1531       ())
1532 {
1533         int i;
1534         Lisp_Object list = Qnil;
1535
1536         for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1537                 list = Fcons(coding_category_symbol[i], list);
1538         return list;
1539 }
1540
1541 DEFUN("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0,   /*
1542 Change the priority order of the coding categories.
1543 LIST should be list of coding categories, in descending order of
1544 priority.  Unspecified coding categories will be lower in priority
1545 than all specified ones, in the same relative order they were in
1546 previously.
1547 */
1548       (list))
1549 {
1550         int category_to_priority[CODING_CATEGORY_LAST];
1551         int i, j;
1552         Lisp_Object rest;
1553
1554         /* First generate a list that maps coding categories to priorities. */
1555
1556         for (i = 0; i < CODING_CATEGORY_LAST; i++)
1557                 category_to_priority[i] = -1;
1558
1559         /* Highest priority comes from the specified list. */
1560         i = 0;
1561         EXTERNAL_LIST_LOOP(rest, list) {
1562                 int cat = decode_coding_category(XCAR(rest));
1563
1564                 if (category_to_priority[cat] >= 0)
1565                         signal_simple_error("Duplicate coding category in list",
1566                                             XCAR(rest));
1567                 category_to_priority[cat] = i++;
1568         }
1569
1570         /* Now go through the existing categories by priority to retrieve
1571            the categories not yet specified and preserve their priority
1572            order. */
1573         for (j = 0; j < CODING_CATEGORY_LAST; j++) {
1574                 int cat = fcd->coding_category_by_priority[j];
1575                 if (category_to_priority[cat] < 0)
1576                         category_to_priority[cat] = i++;
1577         }
1578
1579         /* Now we need to construct the inverse of the mapping we just
1580            constructed. */
1581
1582         for (i = 0; i < CODING_CATEGORY_LAST; i++)
1583                 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1584
1585         /* Phew!  That was confusing. */
1586         return Qnil;
1587 }
1588
1589 DEFUN("coding-priority-list", Fcoding_priority_list, 0, 0, 0,   /*
1590 Return a list of coding categories in descending order of priority.
1591 */
1592       ())
1593 {
1594         int i;
1595         Lisp_Object list = Qnil;
1596
1597         for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1598                 list =
1599                     Fcons(coding_category_symbol
1600                           [fcd->coding_category_by_priority[i]], list);
1601         return list;
1602 }
1603
1604 DEFUN("set-coding-category-system", Fset_coding_category_system, 2, 2, 0,       /*
1605 Change the coding system associated with a coding category.
1606 */
1607       (coding_category, coding_system))
1608 {
1609         int cat = decode_coding_category(coding_category);
1610
1611         coding_system = Fget_coding_system(coding_system);
1612         fcd->coding_category_system[cat] = coding_system;
1613         return Qnil;
1614 }
1615
1616 DEFUN("coding-category-system", Fcoding_category_system, 1, 1, 0,       /*
1617 Return the coding system associated with a coding category.
1618 */
1619       (coding_category))
1620 {
1621         int cat = decode_coding_category(coding_category);
1622         Lisp_Object sys = fcd->coding_category_system[cat];
1623
1624         if (!NILP(sys))
1625                 return XCODING_SYSTEM_NAME(sys);
1626         return Qnil;
1627 }
1628 \f
1629 /************************************************************************/
1630 /*                     Detecting the encoding of data                   */
1631 /************************************************************************/
1632
1633 struct detection_state {
1634         eol_type_t eol_type;
1635         int seen_non_ascii;
1636         int mask;
1637 #ifdef MULE
1638         struct {
1639                 int mask;
1640                 int in_second_byte;
1641         } big5;
1642
1643         struct {
1644                 int mask;
1645                 int in_second_byte;
1646         } shift_jis;
1647
1648         struct {
1649                 int mask;
1650                 int in_byte;
1651         } ucs4;
1652
1653         struct {
1654                 int mask;
1655                 int in_byte;
1656         } utf8;
1657
1658         struct {
1659                 int mask;
1660                 int initted;
1661                 struct iso2022_decoder iso;
1662                 unsigned int flags;
1663                 int high_byte_count;
1664                 unsigned int saw_single_shift:1;
1665         } iso2022;
1666 #endif
1667         struct {
1668                 int seen_anything;
1669                 int just_saw_cr;
1670         } eol;
1671 };
1672
1673 static int acceptable_control_char_p(int c)
1674 {
1675         switch (c) {
1676                 /* Allow and ignore control characters that you might
1677                    reasonably see in a text file */
1678         case '\r':
1679         case '\n':
1680         case '\t':
1681         case 7:         /* bell */
1682         case 8:         /* backspace */
1683         case 11:                /* vertical tab */
1684         case 12:                /* form feed */
1685         case 26:                /* MS-DOS C-z junk */
1686         case 31:                /* '^_' -- for info */
1687                 return 1;
1688         default:
1689                 return 0;
1690         }
1691 }
1692
1693 static int mask_has_at_most_one_bit_p(int mask)
1694 {
1695         /* Perhaps the only thing useful you learn from intensive Microsoft
1696            technical interviews */
1697         return (mask & (mask - 1)) == 0;
1698 }
1699
1700 static eol_type_t
1701 detect_eol_type(struct detection_state *st, const Extbyte * src,
1702                 Lstream_data_count n)
1703 {
1704         while (n--) {
1705                 const unsigned char c = *(const unsigned char*)src++;
1706                 if (c == '\n') {
1707                         if (st->eol.just_saw_cr)
1708                                 return EOL_CRLF;
1709                         else if (st->eol.seen_anything)
1710                                 return EOL_LF;
1711                 } else if (st->eol.just_saw_cr)
1712                         return EOL_CR;
1713                 else if (c == '\r')
1714                         st->eol.just_saw_cr = 1;
1715                 else
1716                         st->eol.just_saw_cr = 0;
1717                 st->eol.seen_anything = 1;
1718         }
1719
1720         return EOL_AUTODETECT;
1721 }
1722
1723 /* Attempt to determine the encoding and EOL type of the given text.
1724    Before calling this function for the first type, you must initialize
1725    st->eol_type as appropriate and initialize st->mask to ~0.
1726
1727    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1728    not yet known.
1729
1730    st->mask holds the determined coding category mask, or ~0 if only
1731    ASCII has been seen so far.
1732
1733    Returns:
1734
1735    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1736         is present in st->mask
1737    1 == definitive answers are here for both st->eol_type and st->mask
1738 */
1739
1740 static int
1741 detect_coding_type(struct detection_state *st, const Extbyte * src,
1742                    Lstream_data_count n, int just_do_eol)
1743 {
1744         if (st->eol_type == EOL_AUTODETECT)
1745                 st->eol_type = detect_eol_type(st, src, n);
1746
1747         if (just_do_eol)
1748                 return st->eol_type != EOL_AUTODETECT;
1749
1750         if (!st->seen_non_ascii) {
1751                 for (; n; n--, src++) {
1752                         const unsigned char c = *(const unsigned char *)src;
1753                         if ((c < 0x20 && !acceptable_control_char_p(c))
1754                             || c >= 0x80) {
1755                                 st->seen_non_ascii = 1;
1756 #ifdef MULE
1757                                 st->shift_jis.mask = ~0;
1758                                 st->big5.mask = ~0;
1759                                 st->ucs4.mask = ~0;
1760                                 st->utf8.mask = ~0;
1761                                 st->iso2022.mask = ~0;
1762 #endif
1763                                 break;
1764                         }
1765                 }
1766         }
1767
1768         if (!n) {
1769                 return 0;
1770         }
1771 #ifdef MULE
1772         if (!mask_has_at_most_one_bit_p(st->iso2022.mask))
1773                 st->iso2022.mask = detect_coding_iso2022(st, src, n);
1774         if (!mask_has_at_most_one_bit_p(st->shift_jis.mask))
1775                 st->shift_jis.mask = detect_coding_sjis(st, src, n);
1776         if (!mask_has_at_most_one_bit_p(st->big5.mask))
1777                 st->big5.mask = detect_coding_big5(st, src, n);
1778         if (!mask_has_at_most_one_bit_p(st->utf8.mask))
1779                 st->utf8.mask = detect_coding_utf8(st, src, n);
1780         if (!mask_has_at_most_one_bit_p(st->ucs4.mask))
1781                 st->ucs4.mask = detect_coding_ucs4(st, src, n);
1782
1783         st->mask = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1784                 | st->utf8.mask | st->ucs4.mask;
1785 #endif
1786         {
1787                 int retval = mask_has_at_most_one_bit_p(st->mask);
1788                 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1789                 return retval && st->eol_type != EOL_AUTODETECT;
1790         }
1791 }
1792
1793 static Lisp_Object coding_system_from_mask(int mask)
1794 {
1795         if (mask == ~0) {
1796                 /* If the file was entirely or basically ASCII, use the
1797                    default value of `buffer-file-coding-system'. */
1798                 Lisp_Object retval =
1799                     XBUFFER(Vbuffer_defaults)->buffer_file_coding_system;
1800                 if (!NILP(retval)) {
1801                         retval = Ffind_coding_system(retval);
1802                         if (NILP(retval)) {
1803                                 warn_when_safe
1804                                     (Qbad_variable, Qwarning,
1805                                      "Invalid `default-buffer-file-coding-system', set to nil");
1806                                 XBUFFER(Vbuffer_defaults)->
1807                                     buffer_file_coding_system = Qnil;
1808                         }
1809                 }
1810                 if (NILP(retval))
1811                         retval = Fget_coding_system(Qraw_text);
1812                 return retval;
1813         } else {
1814                 int i;
1815                 int cat = -1;
1816 #ifdef MULE
1817                 mask = postprocess_iso2022_mask(mask);
1818 #endif
1819                 /* Look through the coding categories by priority and find
1820                    the first one that is allowed. */
1821                 for (i = 0; i < CODING_CATEGORY_LAST; i++) {
1822                         cat = fcd->coding_category_by_priority[i];
1823                         if (cat<0)
1824                                 continue;
1825                         if ((mask & (1 << cat)) &&
1826                             !NILP(fcd->coding_category_system[cat]))
1827                                 break;
1828                 }
1829                 if (cat >= 0)
1830                         return fcd->coding_category_system[cat];
1831                 else
1832                         return Fget_coding_system(Qraw_text);
1833         }
1834 }
1835
1836 /* Given a seekable read stream and potential coding system and EOL type
1837    as specified, do any autodetection that is called for.  If the
1838    coding system and/or EOL type are not `autodetect', they will be left
1839    alone; but this function will never return an autodetect coding system
1840    or EOL type.
1841
1842    This function does not automatically fetch subsidiary coding systems;
1843    that should be unnecessary with the explicit eol-type argument. */
1844
1845 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1846 /* number of leading lines to check for a coding cookie */
1847 #define LINES_TO_CHECK 2
1848
1849 void
1850 determine_real_coding_system(lstream_t stream, Lisp_Object * codesys_in_out,
1851                              eol_type_t * eol_type_in_out)
1852 {
1853         struct detection_state decst;
1854
1855         if (*eol_type_in_out == EOL_AUTODETECT)
1856                 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE(*codesys_in_out);
1857
1858         xzero(decst);
1859         decst.eol_type = *eol_type_in_out;
1860         decst.mask = ~0;
1861
1862         /* If autodetection is called for, do it now. */
1863         if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT
1864             || *eol_type_in_out == EOL_AUTODETECT) {
1865                 Extbyte buf[4096];
1866                 Lisp_Object coding_system = Qnil;
1867                 Extbyte *p;
1868                 Lstream_data_count nread =
1869                     Lstream_read(stream, buf, sizeof(buf));
1870                 Extbyte *scan_end;
1871                 int lines_checked = 0;
1872
1873                 /* Look for initial "-*-"; mode line prefix */
1874                 for (p = buf,
1875                      scan_end = buf + nread - LENGTH("-*-coding:?-*-");
1876                      p <= scan_end && lines_checked < LINES_TO_CHECK; p++)
1877                         if (*p == '-' && *(p + 1) == '*' && *(p + 2) == '-') {
1878                                 Extbyte *local_vars_beg = p + 3;
1879                                 /* Look for final "-*-"; mode line suffix */
1880                                 for (p = local_vars_beg,
1881                                      scan_end = buf + nread - LENGTH("-*-");
1882                                      p <= scan_end
1883                                      && lines_checked < LINES_TO_CHECK; p++)
1884                                         if (*p == '-' && *(p + 1) == '*'
1885                                             && *(p + 2) == '-') {
1886                                                 Extbyte *suffix = p;
1887                                                 /* Look for "coding:" */
1888                                                 for (p = local_vars_beg,
1889                                                      scan_end =
1890                                                      suffix -
1891                                                      LENGTH("coding:?");
1892                                                      p <= scan_end; p++)
1893                                                         if (memcmp
1894                                                             ("coding:", p,
1895                                                              LENGTH("coding:"))
1896                                                             == 0
1897                                                             && (p ==
1898                                                                 local_vars_beg
1899                                                                 || (*(p - 1) ==
1900                                                                     ' '
1901                                                                     || *(p -
1902                                                                          1) ==
1903                                                                     '\t'
1904                                                                     || *(p -
1905                                                                          1) ==
1906                                                                     ';'))) {
1907                                                                 Extbyte save;
1908                                                                 int n;
1909                                                                 p += LENGTH
1910                                                                     ("coding:");
1911                                                                 while (*p == ' '
1912                                                                        || *p ==
1913                                                                        '\t')
1914                                                                         p++;
1915
1916                                                                 /* Get coding system name */
1917                                                                 save = *suffix;
1918                                                                 *suffix = '\0';
1919                                                                 /* Characters valid in a MIME charset name (rfc 1521),
1920                                                                    and in a Lisp symbol name. */
1921                                                                 n = strspn((char
1922                                                                             *)p,
1923                                                                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1924                                                                            "abcdefghijklmnopqrstuvwxyz"
1925                                                                            "0123456789"
1926                                                                            "!$%&*+-.^_{|}~");
1927                                                                 *suffix = save;
1928                                                                 if (n > 0) {
1929                                                                         save =
1930                                                                             p
1931                                                                             [n];
1932                                                                         p[n] =
1933                                                                             '\0';
1934                                                                         coding_system
1935                                                                             =
1936                                                                             Ffind_coding_system
1937                                                                             (intern
1938                                                                              ((char *)p));
1939                                                                         p[n] =
1940                                                                             save;
1941                                                                 }
1942                                                                 break;
1943                                                         }
1944                                                 break;
1945                                         }
1946                                 /* #### file must use standard EOLs or we miss 2d line */
1947                                 /* #### not to mention this is broken for UTF-16 DOS files */
1948                                         else if (*p == '\n' || *p == '\r') {
1949                                                 lines_checked++;
1950                                                 /* skip past multibyte (DOS) newline */
1951                                                 if (*p == '\r'
1952                                                     && *(p + 1) == '\n')
1953                                                         p++;
1954                                         }
1955                                 break;
1956                         }
1957                 /* #### file must use standard EOLs or we miss 2d line */
1958                 /* #### not to mention this is broken for UTF-16 DOS files */
1959                         else if (*p == '\n' || *p == '\r') {
1960                                 lines_checked++;
1961                                 /* skip past multibyte (DOS) newline */
1962                                 if (*p == '\r' && *(p + 1) == '\n')
1963                                         p++;
1964                         }
1965
1966                 if (NILP(coding_system))
1967                         do {
1968                                 if (detect_coding_type(&decst, buf, nread,
1969                                                        XCODING_SYSTEM_TYPE
1970                                                        (*codesys_in_out)
1971                                                        != CODESYS_AUTODETECT))
1972                                         break;
1973                                 nread = Lstream_read(stream, buf, sizeof(buf));
1974                                 if (nread == 0)
1975                                         break;
1976                         }
1977                         while (1);
1978
1979                 else if (XCODING_SYSTEM_TYPE(*codesys_in_out) ==
1980                          CODESYS_AUTODETECT
1981                          && XCODING_SYSTEM_EOL_TYPE(coding_system) ==
1982                          EOL_AUTODETECT)
1983                         do {
1984                                 if (detect_coding_type(&decst, buf, nread, 1))
1985                                         break;
1986                                 nread = Lstream_read(stream, buf, sizeof(buf));
1987                                 if (!nread)
1988                                         break;
1989                         }
1990                         while (1);
1991
1992                 *eol_type_in_out = decst.eol_type;
1993                 if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT) {
1994                         if (NILP(coding_system))
1995                                 *codesys_in_out =
1996                                     coding_system_from_mask(decst.mask);
1997                         else
1998                                 *codesys_in_out = coding_system;
1999                 }
2000         }
2001
2002         /* If we absolutely can't determine the EOL type, just assume LF. */
2003         if (*eol_type_in_out == EOL_AUTODETECT)
2004                 *eol_type_in_out = EOL_LF;
2005
2006         Lstream_rewind(stream);
2007 }
2008
2009 DEFUN("detect-coding-region", Fdetect_coding_region, 2, 3, 0,   /*
2010 Detect coding system of the text in the region between START and END.
2011 Return a list of possible coding systems ordered by priority.
2012 If only ASCII characters are found, return 'undecided or one of
2013 its subsidiary coding systems according to a detected end-of-line
2014 type.  Optional arg BUFFER defaults to the current buffer.
2015 */
2016       (start, end, buffer))
2017 {
2018         Lisp_Object val = Qnil;
2019         struct buffer *buf = decode_buffer(buffer, 0);
2020         Bufpos b, e;
2021         Lisp_Object instream, lb_instream;
2022         lstream_t istr, lb_istr;
2023         struct detection_state decst;
2024         struct gcpro gcpro1, gcpro2;
2025
2026         get_buffer_range_char(buf, start, end, &b, &e, 0);
2027         lb_instream = make_lisp_buffer_input_stream(buf, b, e, 0);
2028         lb_istr = XLSTREAM(lb_instream);
2029         instream =
2030             make_encoding_input_stream(lb_istr, Fget_coding_system(Qbinary));
2031         istr = XLSTREAM(instream);
2032         GCPRO2(instream, lb_instream);
2033         xzero(decst);
2034         decst.eol_type = EOL_AUTODETECT;
2035         decst.mask = ~0;
2036         while (1) {
2037                 Extbyte random_buffer[4096];
2038                 Lstream_data_count nread =
2039                     Lstream_read(istr, random_buffer, sizeof(random_buffer));
2040
2041                 if (!nread)
2042                         break;
2043                 if (detect_coding_type(&decst, random_buffer, nread, 0))
2044                         break;
2045         }
2046
2047         if (decst.mask == ~0)
2048                 val = subsidiary_coding_system(Fget_coding_system(Qundecided),
2049                                                decst.eol_type);
2050         else {
2051                 int i;
2052
2053                 val = Qnil;
2054 #ifdef MULE
2055                 decst.mask = postprocess_iso2022_mask(decst.mask);
2056 #endif
2057                 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--) {
2058                         int sys = fcd->coding_category_by_priority[i];
2059                         if (decst.mask & (1 << sys)) {
2060                                 Lisp_Object codesys =
2061                                     fcd->coding_category_system[sys];
2062                                 if (!NILP(codesys))
2063                                         codesys =
2064                                             subsidiary_coding_system(codesys,
2065                                                                      decst.
2066                                                                      eol_type);
2067                                 val = Fcons(codesys, val);
2068                         }
2069                 }
2070         }
2071         Lstream_close(istr);
2072         UNGCPRO;
2073         Lstream_delete(istr);
2074         Lstream_delete(lb_istr);
2075         return val;
2076 }
2077 \f
2078 /************************************************************************/
2079 /*           Converting to internal Mule format ("decoding")            */
2080 /************************************************************************/
2081
2082 /* A decoding stream is a stream used for decoding text (i.e.
2083    converting from some external format to internal format).
2084    The decoding-stream object keeps track of the actual coding
2085    stream, the stream that is at the other end, and data that
2086    needs to be persistent across the lifetime of the stream. */
2087
2088 /* Handle the EOL stuff related to just-read-in character C.
2089    EOL_TYPE is the EOL type of the coding stream.
2090    FLAGS is the current value of FLAGS in the coding stream, and may
2091    be modified by this macro.  (The macro only looks at the
2092    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
2093    bytes are to be written.  You need to also define a local goto
2094    label "label_continue_loop" that is at the end of the main
2095    character-reading loop.
2096
2097    If C is a CR character, then this macro handles it entirely and
2098    jumps to label_continue_loop.  Otherwise, this macro does not add
2099    anything to DST, and continues normally.  You should continue
2100    processing C normally after this macro. */
2101
2102 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
2103 do {                                                            \
2104   if (c == '\r')                                                \
2105     {                                                           \
2106       if (eol_type == EOL_CR)                                   \
2107         Dynarr_add (dst, '\n');                                 \
2108       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2109         Dynarr_add (dst, c);                                    \
2110       else                                                      \
2111         flags |= CODING_STATE_CR;                               \
2112       goto label_continue_loop;                                 \
2113     }                                                           \
2114   else if (flags & CODING_STATE_CR)                             \
2115     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
2116       if (c != '\n')                                            \
2117         Dynarr_add (dst, '\r');                                 \
2118       flags &= ~CODING_STATE_CR;                                \
2119     }                                                           \
2120 } while (0)
2121
2122 /* C should be a binary character in the range 0 - 255; convert
2123    to internal format and add to Dynarr DST. */
2124
2125 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
2126 do {                                            \
2127   if (BYTE_ASCII_P (c))                         \
2128     Dynarr_add (dst, c);                        \
2129   else if (BYTE_C1_P (c))                       \
2130     {                                           \
2131       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2132       Dynarr_add (dst, c + 0x20);               \
2133     }                                           \
2134   else                                          \
2135     {                                           \
2136       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2137       Dynarr_add (dst, c);                      \
2138     }                                           \
2139 } while (0)
2140
2141 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
2142 do {                                    \
2143   if (ch)                               \
2144     {                                   \
2145       DECODE_ADD_BINARY_CHAR (ch, dst); \
2146       ch = 0;                           \
2147     }                                   \
2148 } while (0)
2149
2150 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2151 do {                                    \
2152   if (flags & CODING_STATE_END)         \
2153     {                                   \
2154       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
2155       if (flags & CODING_STATE_CR)      \
2156         Dynarr_add (dst, '\r');         \
2157     }                                   \
2158 } while (0)
2159
2160 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2161
2162 typedef struct decoding_stream_s *decoding_stream_t;
2163 struct decoding_stream_s {
2164         /* Coding system that governs the conversion. */
2165         Lisp_Coding_System *codesys;
2166
2167         /* Stream that we read the encoded data from or
2168            write the decoded data to. */
2169         lstream_t other_end;
2170
2171         /* If we are reading, then we can return only a fixed amount of
2172            data, so if the conversion resulted in too much data, we store it
2173            here for retrieval the next time around. */
2174         unsigned_char_dynarr *runoff;
2175
2176         /* FLAGS holds flags indicating the current state of the decoding.
2177            Some of these flags are dependent on the coding system. */
2178         unsigned int flags;
2179
2180         /* CH holds a partially built-up character.  Since we only deal
2181            with one- and two-byte characters at the moment, we only use
2182            this to store the first byte of a two-byte character. */
2183         unsigned int ch;
2184
2185         /* EOL_TYPE specifies the type of end-of-line conversion that
2186            currently applies.  We need to keep this separate from the
2187            EOL type stored in CODESYS because the latter might indicate
2188            automatic EOL-type detection while the former will always
2189            indicate a particular EOL type. */
2190         eol_type_t eol_type;
2191 #ifdef MULE
2192         /* Additional ISO2022 information.  We define the structure above
2193            because it's also needed by the detection routines. */
2194         struct iso2022_decoder iso2022;
2195
2196         /* Additional information (the state of the running CCL program)
2197            used by the CCL decoder. */
2198         struct ccl_program ccl;
2199
2200         /* counter for UTF-8 or UCS-4 */
2201         unsigned char counter;
2202 #endif
2203         struct detection_state decst;
2204 };
2205
2206 static Lstream_data_count
2207 decoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size);
2208 static Lstream_data_count
2209 decoding_writer(lstream_t stream,
2210                 const unsigned char *data, Lstream_data_count size);
2211 static int decoding_rewinder(lstream_t stream);
2212 static int decoding_seekable_p(lstream_t stream);
2213 static int decoding_flusher(lstream_t stream);
2214 static int decoding_closer(lstream_t stream);
2215
2216 static Lisp_Object decoding_marker(Lisp_Object stream);
2217
2218 DEFINE_LSTREAM_IMPLEMENTATION("decoding", lstream_decoding,
2219                               sizeof(struct decoding_stream_s));
2220
2221 static Lisp_Object
2222 decoding_marker(Lisp_Object stream)
2223 {
2224         lstream_t str = DECODING_STREAM_DATA(XLSTREAM(stream))->other_end;
2225         Lisp_Object str_obj;
2226
2227         /* We do not need to mark the coding systems or charsets stored
2228            within the stream because they are stored in a global list
2229            and automatically marked. */
2230
2231         XSETLSTREAM(str_obj, str);
2232         mark_object(str_obj);
2233         if (str->imp->marker) {
2234                 return str->imp->marker(str_obj);
2235         } else {
2236                 return Qnil;
2237         }
2238 }
2239
2240 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
2241    so we read data from the other end, decode it, and store it into DATA. */
2242
2243 static Lstream_data_count
2244 decoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size)
2245 {
2246         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2247         unsigned char *orig_data = data;
2248         Lstream_data_count read_size;
2249         int error_occurred = 0;
2250
2251         /* We need to interface to mule_decode(), which expects to take some
2252            amount of data and store the result into a Dynarr.  We have
2253            mule_decode() store into str->runoff, and take data from there
2254            as necessary. */
2255
2256         /* We loop until we have enough data, reading chunks from the other
2257            end and decoding it. */
2258         while (1) {
2259                 /* Take data from the runoff if we can.  Make sure to take at
2260                    most SIZE bytes, and delete the data from the runoff. */
2261                 if (Dynarr_length(str->runoff) > 0) {
2262                         Lstream_data_count chunk =
2263                                 min(size,
2264                                     (Lstream_data_count)
2265                                     Dynarr_length(str->runoff));
2266                         memcpy(data, Dynarr_atp(str->runoff, 0), chunk);
2267                         Dynarr_delete_many(str->runoff, 0, chunk);
2268                         data += chunk;
2269                         size -= chunk;
2270                 }
2271
2272                 if (size == 0) {
2273                         /* No more room for data */
2274                         break;
2275                 }
2276
2277                 if (str->flags & CODING_STATE_END) {
2278                         /* This means that on the previous iteration, we hit the
2279                            EOF on the other end.  We loop once more so that
2280                            mule_decode() can output any final stuff it may be
2281                            holding, or any "go back to a sane state" escape
2282                            sequences. (This latter makes sense during
2283                            encoding.) */
2284                         break;
2285                 }
2286
2287                 /* Exhausted the runoff, so get some more.  DATA has at least
2288                    SIZE bytes left of storage in it, so it's OK to read directly
2289                    into it.  (We'll be overwriting above, after we've decoded it
2290                    into the runoff.) */
2291                 read_size = Lstream_read(str->other_end, data, size);
2292                 if (read_size < 0) {
2293                         error_occurred = 1;
2294                         break;
2295                 }
2296                 if (read_size == 0) {
2297                         /* There might be some more end data produced in the
2298                            translation.  See the comment above. */
2299                         str->flags |= CODING_STATE_END;
2300                 }
2301                 mule_decode(stream, (Extbyte *) data, str->runoff, read_size);
2302         }
2303
2304         if (data - orig_data == 0) {
2305                 return error_occurred ? -1 : 0;
2306         } else {
2307                 return data - orig_data;
2308         }
2309 }
2310
2311 static Lstream_data_count
2312 decoding_writer(lstream_t stream, const unsigned char *data,
2313                 Lstream_data_count size)
2314 {
2315         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2316         Lstream_data_count retval;
2317
2318         /* Decode all our data into the runoff, and then attempt to write
2319            it all out to the other end.  Remove whatever chunk we succeeded
2320            in writing. */
2321         mule_decode(stream, (const Extbyte *)data, str->runoff, size);
2322         retval = Lstream_write(str->other_end, Dynarr_atp(str->runoff, 0),
2323                                Dynarr_length(str->runoff));
2324         if (retval > 0) {
2325                 Dynarr_delete_many(str->runoff, 0, retval);
2326         }
2327         /* Do NOT return retval.  The return value indicates how much
2328            of the incoming data was written, not how many bytes were
2329            written. */
2330         return size;
2331 }
2332
2333 static void
2334 reset_decoding_stream(decoding_stream_t str)
2335 {
2336 #ifdef MULE
2337         if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_ISO2022) {
2338                 Lisp_Object coding_system;
2339                 XSETCODING_SYSTEM(coding_system, str->codesys);
2340                 reset_iso2022(coding_system, &str->iso2022);
2341         } else if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_CCL) {
2342                 setup_ccl_program(&str->ccl,
2343                                   CODING_SYSTEM_CCL_DECODE(str->codesys));
2344         }
2345         str->counter = 0;
2346 #endif                          /* MULE */
2347         if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT
2348             || CODING_SYSTEM_EOL_TYPE(str->codesys) == EOL_AUTODETECT) {
2349                 xzero(str->decst);
2350                 str->decst.eol_type = EOL_AUTODETECT;
2351                 str->decst.mask = ~0;
2352         }
2353         str->flags = str->ch = 0;
2354 }
2355
2356 static int
2357 decoding_rewinder(lstream_t stream)
2358 {
2359         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2360         reset_decoding_stream(str);
2361         Dynarr_reset(str->runoff);
2362         return Lstream_rewind(str->other_end);
2363 }
2364
2365 static int
2366 decoding_seekable_p(lstream_t stream)
2367 {
2368         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2369         return Lstream_seekable_p(str->other_end);
2370 }
2371
2372 static int
2373 decoding_flusher(lstream_t stream)
2374 {
2375         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2376         return Lstream_flush(str->other_end);
2377 }
2378
2379 static int
2380 decoding_closer(lstream_t stream)
2381 {
2382         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2383         if (stream->flags & LSTREAM_FL_WRITE) {
2384                 str->flags |= CODING_STATE_END;
2385                 decoding_writer(stream, 0, 0);
2386         }
2387         Dynarr_free(str->runoff);
2388 #ifdef MULE
2389 #ifdef ENABLE_COMPOSITE_CHARS
2390         if (str->iso2022.composite_chars) {
2391                 Dynarr_free(str->iso2022.composite_chars);
2392         }
2393 #endif
2394 #endif
2395         return Lstream_close(str->other_end);
2396 }
2397
2398 Lisp_Object
2399 decoding_stream_coding_system(lstream_t stream)
2400 {
2401         Lisp_Object coding_system;
2402         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2403
2404         XSETCODING_SYSTEM(coding_system, str->codesys);
2405         return subsidiary_coding_system(coding_system, str->eol_type);
2406 }
2407
2408 void
2409 set_decoding_stream_coding_system(lstream_t lstr, Lisp_Object codesys)
2410 {
2411         Lisp_Coding_System *cs = XCODING_SYSTEM(codesys);
2412         decoding_stream_t str = DECODING_STREAM_DATA(lstr);
2413         str->codesys = cs;
2414         if (CODING_SYSTEM_EOL_TYPE(cs) != EOL_AUTODETECT) {
2415                 str->eol_type = CODING_SYSTEM_EOL_TYPE(cs);
2416         }
2417         reset_decoding_stream(str);
2418         return;
2419 }
2420
2421 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2422    stream for writing, no automatic code detection will be performed.
2423    The reason for this is that automatic code detection requires a
2424    seekable input.  Things will also fail if you open a decoding
2425    stream for reading using a non-fully-specified coding system and
2426    a non-seekable input stream. */
2427
2428 static Lisp_Object
2429 make_decoding_stream_1(lstream_t stream, Lisp_Object codesys, const char *mode)
2430 {
2431         lstream_t lstr = Lstream_new(lstream_decoding, mode);
2432         decoding_stream_t str = DECODING_STREAM_DATA(lstr);
2433         Lisp_Object obj;
2434
2435         xzero(*str);
2436         str->other_end = stream;
2437         str->runoff = (unsigned_char_dynarr *) Dynarr_new(unsigned_char);
2438         str->eol_type = EOL_AUTODETECT;
2439         if (!strcmp(mode, "r") && Lstream_seekable_p(stream)) {
2440                 /* We can determine the coding system now. */
2441                 determine_real_coding_system(stream, &codesys, &str->eol_type);
2442         }
2443         set_decoding_stream_coding_system(lstr, codesys);
2444         str->decst.eol_type = str->eol_type;
2445         str->decst.mask = ~0;
2446         XSETLSTREAM(obj, lstr);
2447         return obj;
2448 }
2449
2450 Lisp_Object
2451 make_decoding_input_stream(lstream_t stream, Lisp_Object codesys)
2452 {
2453         return make_decoding_stream_1(stream, codesys, "r");
2454 }
2455
2456 Lisp_Object
2457 make_decoding_output_stream(lstream_t stream, Lisp_Object codesys)
2458 {
2459         return make_decoding_stream_1(stream, codesys, "w");
2460 }
2461
2462 /* Note: the decode_coding_* functions all take the same
2463    arguments as mule_decode(), which is to say some SRC data of
2464    size N, which is to be stored into dynamic array DST.
2465    DECODING is the stream within which the decoding is
2466    taking place, but no data is actually read from or
2467    written to that stream; that is handled in decoding_reader()
2468    or decoding_writer().  This allows the same functions to
2469    be used for both reading and writing. */
2470
2471 static void
2472 mule_decode(lstream_t decoding, const Extbyte * src,
2473             unsigned_char_dynarr * dst, Lstream_data_count n)
2474 {
2475         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
2476
2477         /* If necessary, do encoding-detection now.  We do this when
2478            we're a writing stream or a non-seekable reading stream,
2479            meaning that we can't just process the whole input,
2480            rewind, and start over. */
2481
2482         if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT ||
2483             str->eol_type == EOL_AUTODETECT) {
2484                 Lisp_Object codesys;
2485
2486                 XSETCODING_SYSTEM(codesys, str->codesys);
2487                 detect_coding_type(&str->decst, src, n,
2488                                    CODING_SYSTEM_TYPE(str->codesys) !=
2489                                    CODESYS_AUTODETECT);
2490                 if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT &&
2491                     str->decst.mask != ~0)
2492                         /* #### This is cheesy.  What we really ought to do is
2493                            buffer up a certain amount of data so as to get a
2494                            less random result. */
2495                         codesys = coding_system_from_mask(str->decst.mask);
2496                 str->eol_type = str->decst.eol_type;
2497                 if (XCODING_SYSTEM(codesys) != str->codesys) {
2498                         /* Preserve the CODING_STATE_END flag in case it was set.
2499                            If we erase it, bad things might happen. */
2500                         int was_end = str->flags & CODING_STATE_END;
2501                         set_decoding_stream_coding_system(decoding, codesys);
2502                         if (was_end)
2503                                 str->flags |= CODING_STATE_END;
2504                 }
2505         }
2506
2507         switch (CODING_SYSTEM_TYPE(str->codesys)) {
2508 #ifdef DEBUG_SXEMACS
2509         case CODESYS_INTERNAL:
2510                 Dynarr_add_many(dst, src, n);
2511                 break;
2512 #endif
2513         case CODESYS_AUTODETECT:
2514                 /* If we got this far and still haven't decided on the coding
2515                    system, then do no conversion. */
2516         case CODESYS_NO_CONVERSION:
2517                 decode_coding_no_conversion(decoding, src, dst, n);
2518                 break;
2519 #ifdef MULE
2520         case CODESYS_SHIFT_JIS:
2521                 decode_coding_sjis(decoding, src, dst, n);
2522                 break;
2523         case CODESYS_BIG5:
2524                 decode_coding_big5(decoding, src, dst, n);
2525                 break;
2526         case CODESYS_UCS4:
2527                 decode_coding_ucs4(decoding, src, dst, n);
2528                 break;
2529         case CODESYS_UTF8:
2530                 decode_coding_utf8(decoding, src, dst, n);
2531                 break;
2532         case CODESYS_CCL:
2533                 str->ccl.last_block = str->flags & CODING_STATE_END;
2534                 /* When applying ccl program to stream, MUST NOT set NULL
2535                    pointer to src.  */
2536                 ccl_driver(&str->ccl,
2537                            (src
2538                             ? (const unsigned char *)src
2539                             : (const unsigned char *)""),
2540                            dst, n, 0, CCL_MODE_DECODING);
2541                 break;
2542         case CODESYS_ISO2022:
2543                 decode_coding_iso2022(decoding, src, dst, n);
2544                 break;
2545 #endif                          /* MULE */
2546         default:
2547                 abort();
2548         }
2549 }
2550
2551 DEFUN("decode-coding-region", Fdecode_coding_region, 3, 4, 0,   /*
2552 Decode the text between START and END which is encoded in CODING-SYSTEM.
2553 This is useful if you've read in encoded text from a file without decoding
2554 it (e.g. you read in a JIS-formatted file but used the `binary' or
2555 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2556 Return length of decoded text.
2557 BUFFER defaults to the current buffer if unspecified.
2558 */
2559       (start, end, coding_system, buffer))
2560 {
2561         Bufpos b, e;
2562         struct buffer *buf = decode_buffer(buffer, 0);
2563         Lisp_Object instream, lb_outstream, de_outstream, outstream;
2564         lstream_t istr, ostr;
2565         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2566
2567         get_buffer_range_char(buf, start, end, &b, &e, 0);
2568
2569         barf_if_buffer_read_only(buf, b, e);
2570
2571         coding_system = Fget_coding_system(coding_system);
2572         instream = make_lisp_buffer_input_stream(buf, b, e, 0);
2573         lb_outstream = make_lisp_buffer_output_stream(buf, b, 0);
2574         de_outstream = make_decoding_output_stream(XLSTREAM(lb_outstream),
2575                                                    coding_system);
2576         outstream = make_encoding_output_stream(XLSTREAM(de_outstream),
2577                                                 Fget_coding_system(Qbinary));
2578         istr = XLSTREAM(instream);
2579         ostr = XLSTREAM(outstream);
2580         GCPRO4(instream, lb_outstream, de_outstream, outstream);
2581
2582         /* The chain of streams looks like this:
2583
2584            [BUFFER] <----- send through
2585            ------> [ENCODE AS BINARY]
2586            ------> [DECODE AS SPECIFIED]
2587            ------> [BUFFER]
2588          */
2589
2590         while (1) {
2591                 char tempbuf[1024];     /* some random amount */
2592                 Bufpos newpos, even_newer_pos;
2593                 Bufpos oldpos = lisp_buffer_stream_startpos(istr);
2594                 Lstream_data_count size_in_bytes =
2595                     Lstream_read(istr, tempbuf, sizeof(tempbuf));
2596
2597                 if (!size_in_bytes)
2598                         break;
2599                 newpos = lisp_buffer_stream_startpos(istr);
2600                 Lstream_write(ostr, tempbuf, size_in_bytes);
2601                 even_newer_pos = lisp_buffer_stream_startpos(istr);
2602                 buffer_delete_range(buf, even_newer_pos - (newpos - oldpos),
2603                                     even_newer_pos, 0);
2604         }
2605         Lstream_close(istr);
2606         Lstream_close(ostr);
2607         UNGCPRO;
2608         Lstream_delete(istr);
2609         Lstream_delete(ostr);
2610         Lstream_delete(XLSTREAM(de_outstream));
2611         Lstream_delete(XLSTREAM(lb_outstream));
2612         return Qnil;
2613 }
2614 \f
2615 /************************************************************************/
2616 /*           Converting to an external encoding ("encoding")            */
2617 /************************************************************************/
2618
2619 /* An encoding stream is an output stream.  When you create the
2620    stream, you specify the coding system that governs the encoding
2621    and another stream that the resulting encoded data is to be
2622    sent to, and then start sending data to it. */
2623
2624 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2625
2626 typedef struct encoding_stream_s *encoding_stream_t;
2627 struct encoding_stream_s {
2628         /* Coding system that governs the conversion. */
2629         Lisp_Coding_System *codesys;
2630
2631         /* Stream that we read the encoded data from or
2632            write the decoded data to. */
2633         lstream_t other_end;
2634
2635         /* If we are reading, then we can return only a fixed amount of
2636            data, so if the conversion resulted in too much data, we store it
2637            here for retrieval the next time around. */
2638         unsigned_char_dynarr *runoff;
2639
2640         /* FLAGS holds flags indicating the current state of the encoding.
2641            Some of these flags are dependent on the coding system. */
2642         unsigned int flags;
2643
2644         /* CH holds a partially built-up character.  Since we only deal
2645            with one- and two-byte characters at the moment, we only use
2646            this to store the first byte of a two-byte character. */
2647         unsigned int ch;
2648 #ifdef MULE
2649         /* Additional information used by the ISO2022 encoder. */
2650         struct {
2651                 /* CHARSET holds the character sets currently assigned to the G0
2652                    through G3 registers.  It is initialized from the array
2653                    INITIAL_CHARSET in CODESYS. */
2654                 Lisp_Object charset[4];
2655
2656                 /* Which registers are currently invoked into the left (GL) and
2657                    right (GR) halves of the 8-bit encoding space? */
2658                 int register_left, register_right;
2659
2660                 /* Whether we need to explicitly designate the charset in the
2661                    G? register before using it.  It is initialized from the
2662                    array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2663                 unsigned char force_charset_on_output[4];
2664
2665                 /* Other state variables that need to be preserved across
2666                    invocations. */
2667                 Lisp_Object current_charset;
2668                 int current_half;
2669                 int current_char_boundary;
2670         } iso2022;
2671
2672         /* Additional information (the state of the running CCL program)
2673            used by the CCL encoder. */
2674         struct ccl_program ccl;
2675 #endif                          /* MULE */
2676 };
2677
2678 static Lstream_data_count
2679 encoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size);
2680 static Lstream_data_count
2681 encoding_writer(lstream_t stream,
2682                 const unsigned char *data, Lstream_data_count size);
2683 static int encoding_rewinder(lstream_t stream);
2684 static int encoding_seekable_p(lstream_t stream);
2685 static int encoding_flusher(lstream_t stream);
2686 static int encoding_closer(lstream_t stream);
2687
2688 static Lisp_Object encoding_marker(Lisp_Object stream);
2689
2690 DEFINE_LSTREAM_IMPLEMENTATION("encoding", lstream_encoding,
2691                               sizeof(struct encoding_stream_s));
2692
2693 static Lisp_Object
2694 encoding_marker(Lisp_Object stream)
2695 {
2696         lstream_t str = ENCODING_STREAM_DATA(XLSTREAM(stream))->other_end;
2697         Lisp_Object str_obj;
2698
2699         /* We do not need to mark the coding systems or charsets stored
2700            within the stream because they are stored in a global list
2701            and automatically marked. */
2702
2703         XSETLSTREAM(str_obj, str);
2704         mark_object(str_obj);
2705         if (str->imp->marker) {
2706                 return str->imp->marker(str_obj);
2707         } else {
2708                 return Qnil;
2709         }
2710 }
2711
2712 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2713    so we read data from the other end, encode it, and store it into DATA. */
2714
2715 static Lstream_data_count
2716 encoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size)
2717 {
2718         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2719         unsigned char *orig_data = data;
2720         Lstream_data_count read_size;
2721         int error_occurred = 0;
2722
2723         /* We need to interface to mule_encode(), which expects to take some
2724            amount of data and store the result into a Dynarr.  We have
2725            mule_encode() store into str->runoff, and take data from there
2726            as necessary. */
2727
2728         /* We loop until we have enough data, reading chunks from the other
2729            end and encoding it. */
2730         while (1) {
2731                 /* Take data from the runoff if we can.  Make sure to take at
2732                    most SIZE bytes, and delete the data from the runoff. */
2733                 if (Dynarr_length(str->runoff) > 0) {
2734                         int chunk = min((int)size, Dynarr_length(str->runoff));
2735                         memcpy(data, Dynarr_atp(str->runoff, 0), chunk);
2736                         Dynarr_delete_many(str->runoff, 0, chunk);
2737                         data += chunk;
2738                         size -= chunk;
2739                 }
2740
2741                 if (size == 0) {
2742                         /* No more room for data */
2743                         break;
2744                 }
2745
2746                 if (str->flags & CODING_STATE_END) {
2747                         /* This means that on the previous iteration, we hit the
2748                            EOF on the other end.  We loop once more so that
2749                            mule_encode() can output any final stuff it may be
2750                            holding, or any "go back to a sane state" escape
2751                            sequences. (This latter makes sense during
2752                            encoding.) */
2753                         break;
2754                 }
2755
2756                 /* Exhausted the runoff, so get some more.  DATA at least SIZE
2757                    bytes left of storage in it, so it's OK to read directly into
2758                    it.  (We'll be overwriting above, after we've encoded it into
2759                    the runoff.) */
2760                 read_size = Lstream_read(str->other_end, data, size);
2761                 if (read_size < 0) {
2762                         error_occurred = 1;
2763                         break;
2764                 }
2765                 if (read_size == 0) {
2766                         /* There might be some more end data produced in the
2767                            translation.  See the comment above. */
2768                         str->flags |= CODING_STATE_END;
2769                 }
2770                 mule_encode(stream, data, str->runoff, read_size);
2771         }
2772
2773         if (data == orig_data) {
2774                 return error_occurred ? -1 : 0;
2775         } else {
2776                 return data - orig_data;
2777         }
2778 }
2779
2780 static Lstream_data_count
2781 encoding_writer(lstream_t stream, const unsigned char *data,
2782                 Lstream_data_count size)
2783 {
2784         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2785         Lstream_data_count retval;
2786
2787         /* Encode all our data into the runoff, and then attempt to write
2788            it all out to the other end.  Remove whatever chunk we succeeded
2789            in writing. */
2790         mule_encode(stream, data, str->runoff, size);
2791         retval = Lstream_write(str->other_end, Dynarr_atp(str->runoff, 0),
2792                                Dynarr_length(str->runoff));
2793         if (retval > 0) {
2794                 Dynarr_delete_many(str->runoff, 0, retval);
2795         }
2796         /* Do NOT return retval.  The return value indicates how much
2797            of the incoming data was written, not how many bytes were
2798            written. */
2799         return size;
2800 }
2801
2802 static void
2803 reset_encoding_stream(encoding_stream_t str)
2804 {
2805 #ifdef MULE
2806         switch (CODING_SYSTEM_TYPE(str->codesys)) {
2807         case CODESYS_ISO2022: {
2808                 int i;
2809
2810                 for (i = 0; i < 4; i++) {
2811                         str->iso2022.charset[i] =
2812                                 CODING_SYSTEM_ISO2022_INITIAL_CHARSET(
2813                                         str->codesys, i);
2814                         str->iso2022.force_charset_on_output[i] =
2815                                 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT(
2816                                         str->codesys, i);
2817                 }
2818                 str->iso2022.register_left = 0;
2819                 str->iso2022.register_right = 1;
2820                 str->iso2022.current_charset = Qnil;
2821                 str->iso2022.current_half = 0;
2822                 str->iso2022.current_char_boundary = 1;
2823                 break;
2824         }
2825         case CODESYS_CCL:
2826                 setup_ccl_program(&str->ccl,
2827                                   CODING_SYSTEM_CCL_ENCODE(str->codesys));
2828                 break;
2829
2830                 /* list the rest of them lot explicitly */
2831         case CODESYS_AUTODETECT:
2832         case CODESYS_SHIFT_JIS:
2833         case CODESYS_BIG5:
2834         case CODESYS_UCS4:
2835         case CODESYS_UTF8:
2836         case CODESYS_NO_CONVERSION:
2837 #ifdef DEBUG_SXEMACS
2838         case CODESYS_INTERNAL:
2839 #endif
2840         default:
2841                 break;
2842         }
2843 #endif                          /* MULE */
2844
2845         str->flags = str->ch = 0;
2846 }
2847
2848 static int
2849 encoding_rewinder(lstream_t stream)
2850 {
2851         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2852         reset_encoding_stream(str);
2853         Dynarr_reset(str->runoff);
2854         return Lstream_rewind(str->other_end);
2855 }
2856
2857 static int
2858 encoding_seekable_p(lstream_t stream)
2859 {
2860         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2861         return Lstream_seekable_p(str->other_end);
2862 }
2863
2864 static int
2865 encoding_flusher(lstream_t stream)
2866 {
2867         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2868         return Lstream_flush(str->other_end);
2869 }
2870
2871 static int
2872 encoding_closer(lstream_t stream)
2873 {
2874         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2875         if (stream->flags & LSTREAM_FL_WRITE) {
2876                 str->flags |= CODING_STATE_END;
2877                 encoding_writer(stream, 0, 0);
2878         }
2879         Dynarr_free(str->runoff);
2880         return Lstream_close(str->other_end);
2881 }
2882
2883 Lisp_Object
2884 encoding_stream_coding_system(lstream_t stream)
2885 {
2886         Lisp_Object coding_system;
2887         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2888
2889         XSETCODING_SYSTEM(coding_system, str->codesys);
2890         return coding_system;
2891 }
2892
2893 void
2894 set_encoding_stream_coding_system(lstream_t lstr, Lisp_Object codesys)
2895 {
2896         Lisp_Coding_System *cs = XCODING_SYSTEM(codesys);
2897         encoding_stream_t str = ENCODING_STREAM_DATA(lstr);
2898         str->codesys = cs;
2899         reset_encoding_stream(str);
2900 }
2901
2902 static Lisp_Object
2903 make_encoding_stream_1(lstream_t stream, Lisp_Object codesys, const char *mode)
2904 {
2905         lstream_t lstr = Lstream_new(lstream_encoding, mode);
2906         encoding_stream_t str = ENCODING_STREAM_DATA(lstr);
2907         Lisp_Object obj;
2908
2909         xzero(*str);
2910         str->runoff = Dynarr_new(unsigned_char);
2911         str->other_end = stream;
2912         set_encoding_stream_coding_system(lstr, codesys);
2913         XSETLSTREAM(obj, lstr);
2914         return obj;
2915 }
2916
2917 Lisp_Object
2918 make_encoding_input_stream(lstream_t stream, Lisp_Object codesys)
2919 {
2920         return make_encoding_stream_1(stream, codesys, "r");
2921 }
2922
2923 Lisp_Object
2924 make_encoding_output_stream(lstream_t stream, Lisp_Object codesys)
2925 {
2926         return make_encoding_stream_1(stream, codesys, "w");
2927 }
2928
2929 /* Convert N bytes of internally-formatted data stored in SRC to an
2930    external format, according to the encoding stream ENCODING.
2931    Store the encoded data into DST. */
2932
2933 static void
2934 mule_encode(lstream_t encoding, const Bufbyte * src,
2935             unsigned_char_dynarr * dst, Lstream_data_count n)
2936 {
2937         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
2938
2939         switch (CODING_SYSTEM_TYPE(str->codesys)) {
2940 #ifdef DEBUG_SXEMACS
2941         case CODESYS_INTERNAL:
2942                 Dynarr_add_many(dst, src, n);
2943                 break;
2944 #endif
2945         case CODESYS_AUTODETECT:
2946                 /* If we got this far and still haven't decided on the coding
2947                    system, then do no conversion. */
2948         case CODESYS_NO_CONVERSION:
2949                 encode_coding_no_conversion(encoding, src, dst, n);
2950                 break;
2951 #ifdef MULE
2952         case CODESYS_SHIFT_JIS:
2953                 encode_coding_sjis(encoding, src, dst, n);
2954                 break;
2955         case CODESYS_BIG5:
2956                 encode_coding_big5(encoding, src, dst, n);
2957                 break;
2958         case CODESYS_UCS4:
2959                 encode_coding_ucs4(encoding, src, dst, n);
2960                 break;
2961         case CODESYS_UTF8:
2962                 encode_coding_utf8(encoding, src, dst, n);
2963                 break;
2964         case CODESYS_CCL:
2965                 str->ccl.last_block = str->flags & CODING_STATE_END;
2966                 /* When applying ccl program to stream, MUST NOT set NULL
2967                    pointer to src.  */
2968                 ccl_driver(&str->ccl, ((src) ? src : (unsigned char *)""),
2969                            dst, n, 0, CCL_MODE_ENCODING);
2970                 break;
2971         case CODESYS_ISO2022:
2972                 encode_coding_iso2022(encoding, src, dst, n);
2973                 break;
2974 #endif                          /* MULE */
2975         default:
2976                 abort();
2977         }
2978 }
2979
2980 DEFUN("encode-coding-region", Fencode_coding_region, 3, 4, 0,   /*
2981 Encode the text between START and END using CODING-SYSTEM.
2982 This will, for example, convert Japanese characters into stuff such as
2983 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
2984 text.  BUFFER defaults to the current buffer if unspecified.
2985 */
2986       (start, end, coding_system, buffer))
2987 {
2988         Bufpos b, e;
2989         struct buffer *buf = decode_buffer(buffer, 0);
2990         Lisp_Object instream, lb_outstream, de_outstream, outstream;
2991         lstream_t istr, ostr;
2992         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2993
2994         get_buffer_range_char(buf, start, end, &b, &e, 0);
2995
2996         barf_if_buffer_read_only(buf, b, e);
2997
2998         coding_system = Fget_coding_system(coding_system);
2999         instream = make_lisp_buffer_input_stream(buf, b, e, 0);
3000         lb_outstream = make_lisp_buffer_output_stream(buf, b, 0);
3001         de_outstream = make_decoding_output_stream(XLSTREAM(lb_outstream),
3002                                                    Fget_coding_system(Qbinary));
3003         outstream = make_encoding_output_stream(XLSTREAM(de_outstream),
3004                                                 coding_system);
3005         istr = XLSTREAM(instream);
3006         ostr = XLSTREAM(outstream);
3007         GCPRO4(instream, outstream, de_outstream, lb_outstream);
3008         /* The chain of streams looks like this:
3009
3010            [BUFFER] <----- send through
3011            ------> [ENCODE AS SPECIFIED]
3012            ------> [DECODE AS BINARY]
3013            ------> [BUFFER]
3014          */
3015         while (1) {
3016                 char tempbuf[1024];     /* some random amount */
3017                 Bufpos newpos, even_newer_pos;
3018                 Bufpos oldpos = lisp_buffer_stream_startpos(istr);
3019                 Lstream_data_count size_in_bytes =
3020                     Lstream_read(istr, tempbuf, sizeof(tempbuf));
3021
3022                 if (!size_in_bytes)
3023                         break;
3024                 newpos = lisp_buffer_stream_startpos(istr);
3025                 Lstream_write(ostr, tempbuf, size_in_bytes);
3026                 even_newer_pos = lisp_buffer_stream_startpos(istr);
3027                 buffer_delete_range(buf, even_newer_pos - (newpos - oldpos),
3028                                     even_newer_pos, 0);
3029         }
3030
3031         {
3032                 Charcount retlen =
3033                     lisp_buffer_stream_startpos(XLSTREAM(instream)) - b;
3034                 Lstream_close(istr);
3035                 Lstream_close(ostr);
3036                 UNGCPRO;
3037                 Lstream_delete(istr);
3038                 Lstream_delete(ostr);
3039                 Lstream_delete(XLSTREAM(de_outstream));
3040                 Lstream_delete(XLSTREAM(lb_outstream));
3041                 return make_int(retlen);
3042         }
3043 }
3044
3045 #ifdef MULE
3046 \f
3047 /************************************************************************/
3048 /*                          Shift-JIS methods                           */
3049 /************************************************************************/
3050
3051 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3052    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3053    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3054    encoded by "position-code + 0x80".  A character of JISX0208
3055    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3056    position-codes are divided and shifted so that it fit in the range
3057    below.
3058
3059    --- CODE RANGE of Shift-JIS ---
3060    (character set)      (range)
3061    ASCII                0x00 .. 0x7F
3062    JISX0201-Kana        0xA0 .. 0xDF
3063    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
3064             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3065    -------------------------------
3066
3067 */
3068
3069 /* Is this the first byte of a Shift-JIS two-byte char? */
3070
3071 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3072   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3073
3074 /* Is this the second byte of a Shift-JIS two-byte char? */
3075
3076 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3077   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3078
3079 #define BYTE_SJIS_KATAKANA_P(c) \
3080   ((c) >= 0xA1 && (c) <= 0xDF)
3081
3082 static int
3083 detect_coding_sjis(struct detection_state *st, const Extbyte * src,
3084                    Lstream_data_count n)
3085 {
3086         while (n--) {
3087                 const unsigned char c = *(const unsigned char *)src++;
3088                 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3089                         return 0;
3090                 if (st->shift_jis.in_second_byte) {
3091                         st->shift_jis.in_second_byte = 0;
3092                         if (c < 0x40)
3093                                 return 0;
3094                 } else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3095                         st->shift_jis.in_second_byte = 1;
3096         }
3097         return CODING_CATEGORY_SHIFT_JIS_MASK;
3098 }
3099
3100 /* Convert Shift-JIS data to internal format. */
3101
3102 static void
3103 decode_coding_sjis(lstream_t decoding, const Extbyte * src,
3104                    unsigned_char_dynarr * dst, Lstream_data_count n)
3105 {
3106         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3107         unsigned int flags = str->flags;
3108         unsigned int ch = str->ch;
3109         eol_type_t eol_type = str->eol_type;
3110
3111         while (n--) {
3112                 const unsigned char c = *(const unsigned char *)src++;
3113
3114                 if (ch) {
3115                         /* Previous character was first byte of Shift-JIS Kanji
3116                            char. */
3117                         if (BYTE_SJIS_TWO_BYTE_2_P(c)) {
3118                                 unsigned char e1, e2;
3119
3120                                 Dynarr_add(dst, LEADING_BYTE_JAPANESE_JISX0208);
3121                                 DECODE_SJIS(ch, c, e1, e2);
3122                                 Dynarr_add(dst, e1);
3123                                 Dynarr_add(dst, e2);
3124                         } else {
3125                                 DECODE_ADD_BINARY_CHAR(ch, dst);
3126                                 DECODE_ADD_BINARY_CHAR(c, dst);
3127                         }
3128                         ch = 0;
3129                 } else {
3130                         DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3131                         if (BYTE_SJIS_TWO_BYTE_1_P(c))
3132                                 ch = c;
3133                         else if (BYTE_SJIS_KATAKANA_P(c)) {
3134                                 Dynarr_add(dst, LEADING_BYTE_KATAKANA_JISX0201);
3135                                 Dynarr_add(dst, c);
3136                         } else
3137                                 DECODE_ADD_BINARY_CHAR(c, dst);
3138                 }
3139               label_continue_loop:;
3140         }
3141
3142         DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
3143
3144         str->flags = flags;
3145         str->ch = ch;
3146 }
3147
3148 /* Convert internally-formatted data to Shift-JIS. */
3149
3150 static void
3151 encode_coding_sjis(lstream_t encoding, const Bufbyte * src,
3152                    unsigned_char_dynarr * dst, Lstream_data_count n)
3153 {
3154         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3155         unsigned int flags = str->flags;
3156         unsigned int ch = str->ch;
3157         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3158
3159         while (n--) {
3160                 Bufbyte c = *src++;
3161                 if (c == '\n') {
3162                         if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3163                                 Dynarr_add(dst, '\r');
3164                         if (eol_type != EOL_CR)
3165                                 Dynarr_add(dst, '\n');
3166                         ch = 0;
3167                 } else if (BYTE_ASCII_P(c)) {
3168                         Dynarr_add(dst, c);
3169                         ch = 0;
3170                 } else if (BUFBYTE_LEADING_BYTE_P(c))
3171                         ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
3172                               c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3173                               c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
3174                 else if (ch) {
3175                         if (ch == LEADING_BYTE_KATAKANA_JISX0201) {
3176                                 Dynarr_add(dst, c);
3177                                 ch = 0;
3178                         } else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3179                                    ch == LEADING_BYTE_JAPANESE_JISX0208)
3180                                 ch = c;
3181                         else {
3182                                 /* j1 is bessel j1 function,
3183                                  * so we use something else */
3184                                 /* unsigned char j1, j2; */
3185                                 unsigned char tt1, tt2;
3186
3187                                 ENCODE_SJIS(ch, c, tt1, tt2);
3188                                 Dynarr_add(dst, tt1);
3189                                 Dynarr_add(dst, tt2);
3190                                 ch = 0;
3191                         }
3192                 }
3193         }
3194
3195         str->flags = flags;
3196         str->ch = ch;
3197 }
3198
3199 DEFUN("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3200 Decode a JISX0208 character of Shift-JIS coding-system.
3201 CODE is the character code in Shift-JIS as a cons of type bytes.
3202 Return the corresponding character.
3203 */
3204       (code))
3205 {
3206         unsigned char c1, c2, s1, s2;
3207
3208         CHECK_CONS(code);
3209         CHECK_INT(XCAR(code));
3210         CHECK_INT(XCDR(code));
3211         s1 = XINT(XCAR(code));
3212         s2 = XINT(XCDR(code));
3213         if (BYTE_SJIS_TWO_BYTE_1_P(s1) && BYTE_SJIS_TWO_BYTE_2_P(s2)) {
3214                 DECODE_SJIS(s1, s2, c1, c2);
3215                 return make_char(MAKE_CHAR(Vcharset_japanese_jisx0208,
3216                                            c1 & 0x7F, c2 & 0x7F));
3217         } else
3218                 return Qnil;
3219 }
3220
3221 DEFUN("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3222 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3223 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3224 */
3225       (character))
3226 {
3227         Lisp_Object charset;
3228         int c1, c2, s1, s2;
3229
3230         CHECK_CHAR_COERCE_INT(character);
3231         BREAKUP_CHAR(XCHAR(character), charset, c1, c2);
3232         if (EQ(charset, Vcharset_japanese_jisx0208)) {
3233                 ENCODE_SJIS(c1 | 0x80, c2 | 0x80, s1, s2);
3234                 return Fcons(make_int(s1), make_int(s2));
3235         } else
3236                 return Qnil;
3237 }
3238 \f
3239 /************************************************************************/
3240 /*                            Big5 methods                              */
3241 /************************************************************************/
3242
3243 /* BIG5 is a coding system encoding two character sets: ASCII and
3244    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3245    character set and is encoded in two-byte.
3246
3247    --- CODE RANGE of BIG5 ---
3248    (character set)      (range)
3249    ASCII                0x00 .. 0x7F
3250    Big5 (1st byte)      0xA1 .. 0xFE
3251         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3252    --------------------------
3253
3254    Since the number of characters in Big5 is larger than maximum
3255    characters in Emacs' charset (96x96), it can't be handled as one
3256    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
3257    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
3258    contains frequently used characters and the latter contains less
3259    frequently used characters.  */
3260
3261 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3262   ((c) >= 0xA1 && (c) <= 0xFE)
3263
3264 /* Is this the second byte of a Shift-JIS two-byte char? */
3265
3266 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3267   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3268
3269 /* Number of Big5 characters which have the same code in 1st byte.  */
3270
3271 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3272
3273 /* Code conversion macros.  These are macros because they are used in
3274    inner loops during code conversion.
3275
3276    Note that temporary variables in macros introduce the classic
3277    dynamic-scoping problems with variable names.  We use capital-
3278    lettered variables in the assumption that SXEmacs does not use
3279    capital letters in variables except in a very formalized way
3280    (e.g. Qstring). */
3281
3282 /* Convert Big5 code (b1, b2) into its internal string representation
3283    (lb, c1, c2). */
3284
3285 /* There is a much simpler way to split the Big5 charset into two.
3286    For the moment I'm going to leave the algorithm as-is because it
3287    claims to separate out the most-used characters into a single
3288    charset, which perhaps will lead to optimizations in various
3289    places.
3290
3291    The way the algorithm works is something like this:
3292
3293    Big5 can be viewed as a 94x157 charset, where the row is
3294    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3295    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
3296    the split between low and high column numbers is apparently
3297    meaningless; ascending rows produce less and less frequent chars.
3298    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3299    the first charset, and the upper half (0xC9 .. 0xFE) to the
3300    second.  To do the conversion, we convert the character into
3301    a single number where 0 .. 156 is the first row, 157 .. 313
3302    is the second, etc.  That way, the characters are ordered by
3303    decreasing frequency.  Then we just chop the space in two
3304    and coerce the result into a 94x94 space.
3305    */
3306
3307 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
3308 {                                                                       \
3309   int B1 = b1, B2 = b2;                                                 \
3310   unsigned int I                                                        \
3311     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
3312                                                                         \
3313   if (B1 < 0xC9)                                                        \
3314     {                                                                   \
3315       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
3316     }                                                                   \
3317   else                                                                  \
3318     {                                                                   \
3319       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
3320       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
3321     }                                                                   \
3322   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
3323   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
3324 } while (0)
3325
3326 /* Convert the internal string representation of a Big5 character
3327    (lb, c1, c2) into Big5 code (b1, b2). */
3328
3329 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
3330 {                                                                       \
3331   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
3332                                                                         \
3333   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
3334     {                                                                   \
3335       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
3336     }                                                                   \
3337   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
3338   b2 = I % BIG5_SAME_ROW;                                               \
3339   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
3340 } while (0)
3341
3342 static int
3343 detect_coding_big5(struct detection_state *st, const Extbyte * src,
3344                    Lstream_data_count n)
3345 {
3346         while (n--) {
3347                 const unsigned char c = *(const unsigned char *)src++;
3348                 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3349                     (c >= 0x80 && c <= 0xA0))
3350                         return 0;
3351                 if (st->big5.in_second_byte) {
3352                         st->big5.in_second_byte = 0;
3353                         if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3354                                 return 0;
3355                 } else if (c >= 0xA1)
3356                         st->big5.in_second_byte = 1;
3357         }
3358         return CODING_CATEGORY_BIG5_MASK;
3359 }
3360
3361 /* Convert Big5 data to internal format. */
3362
3363 static void
3364 decode_coding_big5(lstream_t decoding, const Extbyte * src,
3365                    unsigned_char_dynarr * dst, Lstream_data_count n)
3366 {
3367         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3368         unsigned int flags = str->flags;
3369         unsigned int ch = str->ch;
3370         eol_type_t eol_type = str->eol_type;
3371
3372         while (n--) {
3373                 const unsigned char c = *(const unsigned char *)src++;
3374                 if (ch) {
3375                         /* Previous character was first byte of Big5 char. */
3376                         if (BYTE_BIG5_TWO_BYTE_2_P(c)) {
3377                                 unsigned char b1, b2, b3;
3378                                 DECODE_BIG5(ch, c, b1, b2, b3);
3379                                 Dynarr_add(dst, b1);
3380                                 Dynarr_add(dst, b2);
3381                                 Dynarr_add(dst, b3);
3382                         } else {
3383                                 DECODE_ADD_BINARY_CHAR(ch, dst);
3384                                 DECODE_ADD_BINARY_CHAR(c, dst);
3385                         }
3386                         ch = 0;
3387                 } else {
3388                         DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3389                         if (BYTE_BIG5_TWO_BYTE_1_P(c))
3390                                 ch = c;
3391                         else
3392                                 DECODE_ADD_BINARY_CHAR(c, dst);
3393                 }
3394               label_continue_loop:;
3395         }
3396
3397         DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
3398
3399         str->flags = flags;
3400         str->ch = ch;
3401 }
3402
3403 /* Convert internally-formatted data to Big5. */
3404
3405 static void
3406 encode_coding_big5(lstream_t encoding, const Bufbyte * src,
3407                    unsigned_char_dynarr * dst, Lstream_data_count n)
3408 {
3409         unsigned char c;
3410         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3411         unsigned int flags = str->flags;
3412         unsigned int ch = str->ch;
3413         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3414
3415         while (n--) {
3416                 c = *src++;
3417                 if (c == '\n') {
3418                         if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3419                                 Dynarr_add(dst, '\r');
3420                         if (eol_type != EOL_CR)
3421                                 Dynarr_add(dst, '\n');
3422                 } else if (BYTE_ASCII_P(c)) {
3423                         /* ASCII. */
3424                         Dynarr_add(dst, c);
3425                 } else if (BUFBYTE_LEADING_BYTE_P(c)) {
3426                         if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3427                             c == LEADING_BYTE_CHINESE_BIG5_2) {
3428                                 /* A recognized leading byte. */
3429                                 ch = c;
3430                                 continue;       /* not done with this character. */
3431                         }
3432                         /* otherwise just ignore this character. */
3433                 } else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3434                            ch == LEADING_BYTE_CHINESE_BIG5_2) {
3435                         /* Previous char was a recognized leading byte. */
3436                         ch = (ch << 8) | c;
3437                         continue;       /* not done with this character. */
3438                 } else if (ch) {
3439                         /* Encountering second byte of a Big5 character. */
3440                         unsigned char b1, b2;
3441
3442                         ENCODE_BIG5(ch >> 8, ch & 0xFF, c, b1, b2);
3443                         Dynarr_add(dst, b1);
3444                         Dynarr_add(dst, b2);
3445                 }
3446
3447                 ch = 0;
3448         }
3449
3450         str->flags = flags;
3451         str->ch = ch;
3452 }
3453
3454 DEFUN("decode-big5-char", Fdecode_big5_char, 1, 1, 0,   /*
3455 Decode a Big5 character CODE of BIG5 coding-system.
3456 CODE is the character code in BIG5, a cons of two integers.
3457 Return the corresponding character.
3458 */
3459       (code))
3460 {
3461         unsigned char c1, c2, b1, b2;
3462
3463         CHECK_CONS(code);
3464         CHECK_INT(XCAR(code));
3465         CHECK_INT(XCDR(code));
3466         b1 = XINT(XCAR(code));
3467         b2 = XINT(XCDR(code));
3468         if (BYTE_BIG5_TWO_BYTE_1_P(b1) && BYTE_BIG5_TWO_BYTE_2_P(b2)) {
3469                 int leading_byte;
3470                 Lisp_Object charset;
3471                 DECODE_BIG5(b1, b2, leading_byte, c1, c2);
3472                 charset = CHARSET_BY_LEADING_BYTE(leading_byte);
3473                 return make_char(MAKE_CHAR(charset, c1 & 0x7F, c2 & 0x7F));
3474         } else
3475                 return Qnil;
3476 }
3477
3478 DEFUN("encode-big5-char", Fencode_big5_char, 1, 1, 0,   /*
3479 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3480 Return the corresponding character code in Big5.
3481 */
3482       (character))
3483 {
3484         Lisp_Object charset;
3485         int c1, c2, b1, b2;
3486
3487         CHECK_CHAR_COERCE_INT(character);
3488         BREAKUP_CHAR(XCHAR(character), charset, c1, c2);
3489         if (EQ(charset, Vcharset_chinese_big5_1) ||
3490             EQ(charset, Vcharset_chinese_big5_2)) {
3491                 ENCODE_BIG5(XCHARSET_LEADING_BYTE(charset), c1 | 0x80,
3492                             c2 | 0x80, b1, b2);
3493                 return Fcons(make_int(b1), make_int(b2));
3494         } else
3495                 return Qnil;
3496 }
3497 \f
3498 /************************************************************************/
3499 /*                           UCS-4 methods                              */
3500 /*                                                                      */
3501 /*  UCS-4 character codes are implemented as nonnegative integers.      */
3502 /*                                                                      */
3503 /************************************************************************/
3504
3505 DEFUN("set-ucs-char", Fset_ucs_char, 2, 2, 0,   /*
3506 Map UCS-4 code CODE to Mule character CHARACTER.
3507
3508 Return T on success, NIL on failure.
3509 */
3510       (code, character))
3511 {
3512         size_t c;
3513
3514         CHECK_CHAR(character);
3515         CHECK_NATNUM(code);
3516         c = XINT(code);
3517
3518         if (c < countof(fcd->ucs_to_mule_table)) {
3519                 fcd->ucs_to_mule_table[c] = character;
3520                 return Qt;
3521         } else
3522                 return Qnil;
3523 }
3524
3525 static Lisp_Object ucs_to_char(unsigned long code)
3526 {
3527         if (code < countof(fcd->ucs_to_mule_table)) {
3528                 return fcd->ucs_to_mule_table[code];
3529         } else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14)) {
3530                 unsigned int c;
3531
3532                 code -= 0xe00000;
3533                 c = code % (94 * 94);
3534                 return make_char
3535                     (MAKE_CHAR(CHARSET_BY_ATTRIBUTES
3536                                (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3537                                 CHARSET_LEFT_TO_RIGHT),
3538                                c / 94 + 33, c % 94 + 33));
3539         } else
3540                 return Qnil;
3541 }
3542
3543 DEFUN("ucs-char", Fucs_char, 1, 1, 0,   /*
3544 Return Mule character corresponding to UCS code CODE (a positive integer).
3545 */
3546       (code))
3547 {
3548         CHECK_NATNUM(code);
3549         return ucs_to_char(XINT(code));
3550 }
3551
3552 DEFUN("set-char-ucs", Fset_char_ucs, 2, 2, 0,   /*
3553 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3554 */
3555       (character, code))
3556 {
3557         /* #### Isn't this gilding the lily?  Fput_char_table checks its args.
3558            Fset_char_ucs is more restrictive on index arg, but should
3559            check code arg in a char_table method. */
3560         CHECK_CHAR(character);
3561         CHECK_NATNUM(code);
3562         return Fput_char_table(character, code, mule_to_ucs_table);
3563 }
3564
3565 DEFUN("char-ucs", Fchar_ucs, 1, 1, 0,   /*
3566 Return the UCS code (a positive integer) corresponding to CHARACTER.
3567 */
3568       (character))
3569 {
3570         return Fget_char_table(character, mule_to_ucs_table);
3571 }
3572
3573 /* Decode a UCS-4 character into a buffer.  If the lookup fails, use
3574    <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3575    is not found, instead.
3576    #### do something more appropriate (use blob?)
3577         Danger, Will Robinson!  Data loss.  Should we signal user? */
3578 static void decode_ucs4(unsigned long ch, unsigned_char_dynarr * dst)
3579 {
3580         Lisp_Object chr = ucs_to_char(ch);
3581
3582         if (!NILP(chr)) {
3583                 Bufbyte work[MAX_EMCHAR_LEN];
3584                 int len;
3585
3586                 ch = XCHAR(chr);
3587                 len = (ch < 128) ?
3588                     simple_set_charptr_emchar(work, ch) :
3589                     non_ascii_set_charptr_emchar(work, ch);
3590                 Dynarr_add_many(dst, work, len);
3591         } else {
3592                 Dynarr_add(dst, LEADING_BYTE_JAPANESE_JISX0208);
3593                 Dynarr_add(dst, 34 + 128);
3594                 Dynarr_add(dst, 46 + 128);
3595         }
3596 }
3597
3598 static unsigned long
3599 mule_char_to_ucs4(Lisp_Object charset, unsigned char h, unsigned char l)
3600 {
3601         Lisp_Object code
3602             = Fget_char_table(make_char(MAKE_CHAR(charset, h & 127, l & 127)),
3603                               mule_to_ucs_table);
3604
3605         if (INTP(code)) {
3606                 return XINT(code);
3607         } else if ((XCHARSET_DIMENSION(charset) == 2) &&
3608                    (XCHARSET_CHARS(charset) == 94)) {
3609                 unsigned char final = XCHARSET_FINAL(charset);
3610
3611                 if (('@' <= final) && (final < 0x7f)) {
3612                         return 0xe00000 + (final - '@') * 94 * 94
3613                             + ((h & 127) - 33) * 94 + (l & 127) - 33;
3614                 } else {
3615                         return '?';
3616                 }
3617         } else {
3618                 return '?';
3619         }
3620 }
3621
3622 static void
3623 encode_ucs4(Lisp_Object charset,
3624             unsigned char h, unsigned char l, unsigned_char_dynarr * dst)
3625 {
3626         unsigned long code = mule_char_to_ucs4(charset, h, l);
3627         Dynarr_add(dst, code >> 24);
3628         Dynarr_add(dst, (code >> 16) & 255);
3629         Dynarr_add(dst, (code >> 8) & 255);
3630         Dynarr_add(dst, code & 255);
3631 }
3632
3633 static int
3634 detect_coding_ucs4(struct detection_state *st, const Extbyte * src,
3635                    Lstream_data_count n)
3636 {
3637         while (n--) {
3638                 const unsigned char c = *(const unsigned char *)src++;
3639                 switch (st->ucs4.in_byte) {
3640                 case 0:
3641                         if (c >= 128)
3642                                 return 0;
3643                         else
3644                                 st->ucs4.in_byte++;
3645                         break;
3646                 case 3:
3647                         st->ucs4.in_byte = 0;
3648                         break;
3649                 default:
3650                         st->ucs4.in_byte++;
3651                 }
3652         }
3653         return CODING_CATEGORY_UCS4_MASK;
3654 }
3655
3656 static void
3657 decode_coding_ucs4(lstream_t decoding, const Extbyte * src,
3658                    unsigned_char_dynarr * dst, Lstream_data_count n)
3659 {
3660         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3661         unsigned int flags = str->flags;
3662         unsigned int ch = str->ch;
3663         unsigned char counter = str->counter;
3664
3665         while (n--) {
3666                 const unsigned char c = *(const unsigned char *)src++;
3667                 switch (counter) {
3668                 case 0:
3669                         ch = c;
3670                         counter = 3;
3671                         break;
3672                 case 1:
3673                         decode_ucs4((ch << 8) | c, dst);
3674                         ch = 0;
3675                         counter = 0;
3676                         break;
3677                 default:
3678                         ch = (ch << 8) | c;
3679                         counter--;
3680                 }
3681         }
3682         if (counter & CODING_STATE_END)
3683                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
3684
3685         str->flags = flags;
3686         str->ch = ch;
3687         str->counter = counter;
3688 }
3689
3690 static void
3691 encode_coding_ucs4(lstream_t encoding, const Bufbyte * src,
3692                    unsigned_char_dynarr * dst, Lstream_data_count n)
3693 {
3694         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3695         unsigned int flags = str->flags;
3696         unsigned int ch = str->ch;
3697         unsigned char char_boundary = str->iso2022.current_char_boundary;
3698         Lisp_Object charset = str->iso2022.current_charset;
3699
3700 #ifdef ENABLE_COMPOSITE_CHARS
3701         /* flags for handling composite chars.  We do a little switcharoo
3702            on the source while we're outputting the composite char. */
3703         unsigned int saved_n = 0;
3704         const unsigned char *saved_src = NULL;
3705         int in_composite = 0;
3706
3707       back_to_square_n:
3708 #endif
3709
3710         while (n--) {
3711                 unsigned char c = *src++;
3712
3713                 if (BYTE_ASCII_P(c)) {  /* Processing ASCII character */
3714                         ch = 0;
3715                         encode_ucs4(Vcharset_ascii, c, 0, dst);
3716                         char_boundary = 1;
3717                 } else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) {   /* Processing Leading Byte */
3718                         ch = 0;
3719                         charset = CHARSET_BY_LEADING_BYTE(c);
3720                         if (LEADING_BYTE_PREFIX_P(c))
3721                                 ch = c;
3722                         char_boundary = 0;
3723                 } else {        /* Processing Non-ASCII character */
3724                         char_boundary = 1;
3725                         if (EQ(charset, Vcharset_control_1)) {
3726                                 encode_ucs4(Vcharset_control_1, c, 0, dst);
3727                         } else {
3728                                 switch (XCHARSET_REP_BYTES(charset)) {
3729                                 case 2:
3730                                         encode_ucs4(charset, c, 0, dst);
3731                                         break;
3732                                 case 3:
3733                                         if (XCHARSET_PRIVATE_P(charset)) {
3734                                                 encode_ucs4(charset, c, 0, dst);
3735                                                 ch = 0;
3736                                         } else if (ch) {
3737 #ifdef ENABLE_COMPOSITE_CHARS
3738                                                 if (EQ
3739                                                     (charset,
3740                                                      Vcharset_composite)) {
3741                                                         if (in_composite) {
3742                                                                 /* #### Bother! We don't know how to
3743                                                                    handle this yet. */
3744                                                                 Dynarr_add(dst,
3745                                                                            '\0');
3746                                                                 Dynarr_add(dst,
3747                                                                            '\0');
3748                                                                 Dynarr_add(dst,
3749                                                                            '\0');
3750                                                                 Dynarr_add(dst,
3751                                                                            '~');
3752                                                         } else {
3753                                                                 Emchar emch =
3754                                                                     MAKE_CHAR
3755                                                                     (Vcharset_composite,
3756                                                                      ch & 0x7F,
3757                                                                      c & 0x7F);
3758                                                                 Lisp_Object lstr
3759                                                                     =
3760                                                                     composite_char_string
3761                                                                     (emch);
3762                                                                 saved_n = n;
3763                                                                 saved_src = src;
3764                                                                 in_composite =
3765                                                                     1;
3766                                                                 src =
3767                                                                     XSTRING_DATA
3768                                                                     (lstr);
3769                                                                 n = XSTRING_LENGTH(lstr);
3770                                                         }
3771                                                 } else
3772 #endif                          /* ENABLE_COMPOSITE_CHARS */
3773                                                 {
3774                                                         encode_ucs4(charset, ch,
3775                                                                     c, dst);
3776                                                 }
3777                                                 ch = 0;
3778                                         } else {
3779                                                 ch = c;
3780                                                 char_boundary = 0;
3781                                         }
3782                                         break;
3783                                 case 4:
3784                                         if (ch) {
3785                                                 encode_ucs4(charset, ch, c,
3786                                                             dst);
3787                                                 ch = 0;
3788                                         } else {
3789                                                 ch = c;
3790                                                 char_boundary = 0;
3791                                         }
3792                                         break;
3793                                 default:
3794                                         abort();
3795                                 }
3796                         }
3797                 }
3798         }
3799
3800 #ifdef ENABLE_COMPOSITE_CHARS
3801         if (in_composite) {
3802                 n = saved_n;
3803                 src = saved_src;
3804                 in_composite = 0;
3805                 goto back_to_square_n;  /* Wheeeeeeeee ..... */
3806         }
3807 #endif                          /* ENABLE_COMPOSITE_CHARS */
3808
3809         str->flags = flags;
3810         str->ch = ch;
3811         str->iso2022.current_char_boundary = char_boundary;
3812         str->iso2022.current_charset = charset;
3813
3814         /* Verbum caro factum est! */
3815 }
3816 \f
3817 /************************************************************************/
3818 /*                           UTF-8 methods                              */
3819 /************************************************************************/
3820
3821 static int
3822 detect_coding_utf8(struct detection_state *st, const Extbyte * src,
3823                    Lstream_data_count n)
3824 {
3825         while (n--) {
3826                 const unsigned char c = *(const unsigned char *)src++;
3827                 switch (st->utf8.in_byte) {
3828                 case 0:
3829                         if (c == ISO_CODE_ESC || c == ISO_CODE_SI
3830                             || c == ISO_CODE_SO)
3831                                 return 0;
3832                         else if (c >= 0xfc)
3833                                 st->utf8.in_byte = 5;
3834                         else if (c >= 0xf8)
3835                                 st->utf8.in_byte = 4;
3836                         else if (c >= 0xf0)
3837                                 st->utf8.in_byte = 3;
3838                         else if (c >= 0xe0)
3839                                 st->utf8.in_byte = 2;
3840                         else if (c >= 0xc0)
3841                                 st->utf8.in_byte = 1;
3842                         else if (c >= 0x80)
3843                                 return 0;
3844                         break;
3845                 default:
3846                         if ((c & 0xc0) != 0x80)
3847                                 return 0;
3848                         else
3849                                 st->utf8.in_byte--;
3850                 }
3851         }
3852         return CODING_CATEGORY_UTF8_MASK;
3853 }
3854
3855 static void
3856 decode_coding_utf8(lstream_t decoding, const Extbyte * src,
3857                    unsigned_char_dynarr * dst, Lstream_data_count n)
3858 {
3859         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3860         unsigned int flags = str->flags;
3861         unsigned int ch = str->ch;
3862         eol_type_t eol_type = str->eol_type;
3863         unsigned char counter = str->counter;
3864
3865         while (n--) {
3866                 const unsigned char c = *(const unsigned char *)src++;
3867                 switch (counter) {
3868                 case 0:
3869                         if (c >= 0xfc) {
3870                                 ch = c & 0x01;
3871                                 counter = 5;
3872                         } else if (c >= 0xf8) {
3873                                 ch = c & 0x03;
3874                                 counter = 4;
3875                         } else if (c >= 0xf0) {
3876                                 ch = c & 0x07;
3877                                 counter = 3;
3878                         } else if (c >= 0xe0) {
3879                                 ch = c & 0x0f;
3880                                 counter = 2;
3881                         } else if (c >= 0xc0) {
3882                                 ch = c & 0x1f;
3883                                 counter = 1;
3884                         } else {
3885                                 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3886                                 decode_ucs4(c, dst);
3887                         }
3888                         break;
3889                 case 1:
3890                         ch = (ch << 6) | (c & 0x3f);
3891                         decode_ucs4(ch, dst);
3892                         ch = 0;
3893                         counter = 0;
3894                         break;
3895                 default:
3896                         ch = (ch << 6) | (c & 0x3f);
3897                         counter--;
3898                 }
3899               label_continue_loop:;
3900         }
3901
3902         if (flags & CODING_STATE_END)
3903                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
3904
3905         str->flags = flags;
3906         str->ch = ch;
3907         str->counter = counter;
3908 }
3909
3910 static void
3911 encode_utf8(Lisp_Object charset,
3912             unsigned char h, unsigned char l, unsigned_char_dynarr * dst)
3913 {
3914         unsigned long code = mule_char_to_ucs4(charset, h, l);
3915         if (code <= 0x7f) {
3916                 Dynarr_add(dst, code);
3917         } else if (code <= 0x7ff) {
3918                 Dynarr_add(dst, (code >> 6) | 0xc0);
3919                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3920         } else if (code <= 0xffff) {
3921                 Dynarr_add(dst, (code >> 12) | 0xe0);
3922                 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3923                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3924         } else if (code <= 0x1fffff) {
3925                 Dynarr_add(dst, (code >> 18) | 0xf0);
3926                 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3927                 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3928                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3929         } else if (code <= 0x3ffffff) {
3930                 Dynarr_add(dst, (code >> 24) | 0xf8);
3931                 Dynarr_add(dst, ((code >> 18) & 0x3f) | 0x80);
3932                 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3933                 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3934                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3935         } else {
3936                 Dynarr_add(dst, (code >> 30) | 0xfc);
3937                 Dynarr_add(dst, ((code >> 24) & 0x3f) | 0x80);
3938                 Dynarr_add(dst, ((code >> 18) & 0x3f) | 0x80);
3939                 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3940                 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3941                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3942         }
3943 }
3944
3945 static void
3946 encode_coding_utf8(lstream_t encoding, const Bufbyte * src,
3947                    unsigned_char_dynarr * dst, Lstream_data_count n)
3948 {
3949         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3950         unsigned int flags = str->flags;
3951         unsigned int ch = str->ch;
3952         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3953         unsigned char char_boundary = str->iso2022.current_char_boundary;
3954         Lisp_Object charset = str->iso2022.current_charset;
3955
3956 #ifdef ENABLE_COMPOSITE_CHARS
3957         /* flags for handling composite chars.  We do a little switcharoo
3958            on the source while we're outputting the composite char. */
3959         unsigned int saved_n = 0;
3960         const unsigned char *saved_src = NULL;
3961         int in_composite = 0;
3962
3963       back_to_square_n:
3964 #endif                          /* ENABLE_COMPOSITE_CHARS */
3965
3966         while (n--) {
3967                 unsigned char c = *src++;
3968
3969                 if (BYTE_ASCII_P(c)) {  /* Processing ASCII character */
3970                         ch = 0;
3971                         if (c == '\n') {
3972                                 if (eol_type != EOL_LF
3973                                     && eol_type != EOL_AUTODETECT)
3974                                         Dynarr_add(dst, '\r');
3975                                 if (eol_type != EOL_CR)
3976                                         Dynarr_add(dst, c);
3977                         } else
3978                                 encode_utf8(Vcharset_ascii, c, 0, dst);
3979                         char_boundary = 1;
3980                 } else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) {   /* Processing Leading Byte */
3981                         ch = 0;
3982                         charset = CHARSET_BY_LEADING_BYTE(c);
3983                         if (LEADING_BYTE_PREFIX_P(c))
3984                                 ch = c;
3985                         char_boundary = 0;
3986                 } else {        /* Processing Non-ASCII character */
3987                         char_boundary = 1;
3988                         if (EQ(charset, Vcharset_control_1)) {
3989                                 encode_utf8(Vcharset_control_1, c, 0, dst);
3990                         } else {
3991                                 switch (XCHARSET_REP_BYTES(charset)) {
3992                                 case 2:
3993                                         encode_utf8(charset, c, 0, dst);
3994                                         break;
3995                                 case 3:
3996                                         if (XCHARSET_PRIVATE_P(charset)) {
3997                                                 encode_utf8(charset, c, 0, dst);
3998                                                 ch = 0;
3999                                         } else if (ch) {
4000 #ifdef ENABLE_COMPOSITE_CHARS
4001                                                 if (EQ
4002                                                     (charset,
4003                                                      Vcharset_composite)) {
4004                                                         if (in_composite) {
4005                                                                 /* #### Bother! We don't know how to
4006                                                                    handle this yet. */
4007                                                                 encode_utf8
4008                                                                     (Vcharset_ascii,
4009                                                                      '~', 0,
4010                                                                      dst);
4011                                                         } else {
4012                                                                 Emchar emch =
4013                                                                     MAKE_CHAR
4014                                                                     (Vcharset_composite,
4015                                                                      ch & 0x7F,
4016                                                                      c & 0x7F);
4017                                                                 Lisp_Object lstr
4018                                                                     =
4019                                                                     composite_char_string
4020                                                                     (emch);
4021                                                                 saved_n = n;
4022                                                                 saved_src = src;
4023                                                                 in_composite =
4024                                                                     1;
4025                                                                 src =
4026                                                                     XSTRING_DATA
4027                                                                     (lstr);
4028                                                                 n = XSTRING_LENGTH(lstr);
4029                                                         }
4030                                                 } else
4031 #endif                          /* ENABLE_COMPOSITE_CHARS */
4032                                                 {
4033                                                         encode_utf8(charset, ch,
4034                                                                     c, dst);
4035                                                 }
4036                                                 ch = 0;
4037                                         } else {
4038                                                 ch = c;
4039                                                 char_boundary = 0;
4040                                         }
4041                                         break;
4042                                 case 4:
4043                                         if (ch) {
4044                                                 encode_utf8(charset, ch, c,
4045                                                             dst);
4046                                                 ch = 0;
4047                                         } else {
4048                                                 ch = c;
4049                                                 char_boundary = 0;
4050                                         }
4051                                         break;
4052                                 default:
4053                                         abort();
4054                                 }
4055                         }
4056                 }
4057         }
4058
4059 #ifdef ENABLE_COMPOSITE_CHARS
4060         if (in_composite) {
4061                 n = saved_n;
4062                 src = saved_src;
4063                 in_composite = 0;
4064                 goto back_to_square_n;  /* Wheeeeeeeee ..... */
4065         }
4066 #endif
4067
4068         str->flags = flags;
4069         str->ch = ch;
4070         str->iso2022.current_char_boundary = char_boundary;
4071         str->iso2022.current_charset = charset;
4072
4073         /* Verbum caro factum est! */
4074 }
4075 \f
4076 /************************************************************************/
4077 /*                           ISO2022 methods                            */
4078 /************************************************************************/
4079
4080 /* The following note describes the coding system ISO2022 briefly.
4081    Since the intention of this note is to help understand the
4082    functions in this file, some parts are NOT ACCURATE or OVERLY
4083    SIMPLIFIED.  For thorough understanding, please refer to the
4084    original document of ISO2022.
4085
4086    ISO2022 provides many mechanisms to encode several character sets
4087    in 7-bit and 8-bit environments.  For 7-bit environments, all text
4088    is encoded using bytes less than 128.  This may make the encoded
4089    text a little bit longer, but the text passes more easily through
4090    several gateways, some of which strip off MSB (Most Signigant Bit).
4091
4092    There are two kinds of character sets: control character set and
4093    graphic character set.  The former contains control characters such
4094    as `newline' and `escape' to provide control functions (control
4095    functions are also provided by escape sequences).  The latter
4096    contains graphic characters such as 'A' and '-'.  Emacs recognizes
4097    two control character sets and many graphic character sets.
4098
4099    Graphic character sets are classified into one of the following
4100    four classes, according to the number of bytes (DIMENSION) and
4101    number of characters in one dimension (CHARS) of the set:
4102    - DIMENSION1_CHARS94
4103    - DIMENSION1_CHARS96
4104    - DIMENSION2_CHARS94
4105    - DIMENSION2_CHARS96
4106
4107    In addition, each character set is assigned an identification tag,
4108    unique for each set, called "final character" (denoted as <F>
4109    hereafter).  The <F> of each character set is decided by ECMA(*)
4110    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
4111    (0x30..0x3F are for private use only).
4112
4113    Note (*): ECMA = European Computer Manufacturers Association
4114
4115    Here are examples of graphic character set [NAME(<F>)]:
4116         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4117         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4118         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4119         o DIMENSION2_CHARS96 -- none for the moment
4120
4121    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4122         C0 [0x00..0x1F] -- control character plane 0
4123         GL [0x20..0x7F] -- graphic character plane 0
4124         C1 [0x80..0x9F] -- control character plane 1
4125         GR [0xA0..0xFF] -- graphic character plane 1
4126
4127    A control character set is directly designated and invoked to C0 or
4128    C1 by an escape sequence.  The most common case is that:
4129    - ISO646's  control character set is designated/invoked to C0, and
4130    - ISO6429's control character set is designated/invoked to C1,
4131    and usually these designations/invocations are omitted in encoded
4132    text.  In a 7-bit environment, only C0 can be used, and a control
4133    character for C1 is encoded by an appropriate escape sequence to
4134    fit into the environment.  All control characters for C1 are
4135    defined to have corresponding escape sequences.
4136
4137    A graphic character set is at first designated to one of four
4138    graphic registers (G0 through G3), then these graphic registers are
4139    invoked to GL or GR.  These designations and invocations can be
4140    done independently.  The most common case is that G0 is invoked to
4141    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
4142    these invocations and designations are omitted in encoded text.
4143    In a 7-bit environment, only GL can be used.
4144
4145    When a graphic character set of CHARS94 is invoked to GL, codes
4146    0x20 and 0x7F of the GL area work as control characters SPACE and
4147    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4148    be used.
4149
4150    There are two ways of invocation: locking-shift and single-shift.
4151    With locking-shift, the invocation lasts until the next different
4152    invocation, whereas with single-shift, the invocation affects the
4153    following character only and doesn't affect the locking-shift
4154    state.  Invocations are done by the following control characters or
4155    escape sequences:
4156
4157    ----------------------------------------------------------------------
4158    abbrev  function                  cntrl escape seq   description
4159    ----------------------------------------------------------------------
4160    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
4161    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
4162    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
4163    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
4164    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
4165    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
4166    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
4167    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
4168    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
4169    ----------------------------------------------------------------------
4170    (*) These are not used by any known coding system.
4171
4172    Control characters for these functions are defined by macros
4173    ISO_CODE_XXX in `coding.h'.
4174
4175    Designations are done by the following escape sequences:
4176    ----------------------------------------------------------------------
4177    escape sequence      description
4178    ----------------------------------------------------------------------
4179    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
4180    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
4181    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
4182    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
4183    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
4184    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
4185    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
4186    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
4187    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
4188    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
4189    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
4190    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
4191    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
4192    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
4193    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
4194    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
4195    ----------------------------------------------------------------------
4196
4197    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4198    of dimension 1, chars 94, and final character <F>, etc...
4199
4200    Note (*): Although these designations are not allowed in ISO2022,
4201    Emacs accepts them on decoding, and produces them on encoding
4202    CHARS96 character sets in a coding system which is characterized as
4203    7-bit environment, non-locking-shift, and non-single-shift.
4204
4205    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4206    '(' can be omitted.  We refer to this as "short-form" hereafter.
4207
4208    Now you may notice that there are a lot of ways for encoding the
4209    same multilingual text in ISO2022.  Actually, there exist many
4210    coding systems such as Compound Text (used in X11's inter client
4211    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4212    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4213    localized platforms), and all of these are variants of ISO2022.
4214
4215    In addition to the above, Emacs handles two more kinds of escape
4216    sequences: ISO6429's direction specification and Emacs' private
4217    sequence for specifying character composition.
4218
4219    ISO6429's direction specification takes the following form:
4220         o CSI ']'      -- end of the current direction
4221         o CSI '0' ']'  -- end of the current direction
4222         o CSI '1' ']'  -- start of left-to-right text
4223         o CSI '2' ']'  -- start of right-to-left text
4224    The control character CSI (0x9B: control sequence introducer) is
4225    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4226
4227    Character composition specification takes the following form:
4228         o ESC '0' -- start character composition
4229         o ESC '1' -- end character composition
4230    Since these are not standard escape sequences of any ISO standard,
4231    their use with these meanings is restricted to Emacs only.  */
4232
4233 static void
4234 reset_iso2022(Lisp_Object coding_system, struct iso2022_decoder *iso)
4235 {
4236         int i;
4237
4238         for (i = 0; i < 4; i++) {
4239                 if (!NILP(coding_system))
4240                         iso->charset[i] =
4241                             XCODING_SYSTEM_ISO2022_INITIAL_CHARSET
4242                             (coding_system, i);
4243                 else
4244                         iso->charset[i] = Qt;
4245                 iso->invalid_designated[i] = 0;
4246         }
4247         iso->esc = ISO_ESC_NOTHING;
4248         iso->esc_bytes_index = 0;
4249         iso->register_left = 0;
4250         iso->register_right = 1;
4251         iso->switched_dir_and_no_valid_charset_yet = 0;
4252         iso->invalid_switch_dir = 0;
4253         iso->output_direction_sequence = 0;
4254         iso->output_literally = 0;
4255 #ifdef ENABLE_COMPOSITE_CHARS
4256         if (iso->composite_chars)
4257                 Dynarr_reset(iso->composite_chars);
4258 #endif
4259 }
4260
4261 static int fit_to_be_escape_quoted(unsigned char c)
4262 {
4263         switch (c) {
4264         case ISO_CODE_ESC:
4265         case ISO_CODE_CSI:
4266         case ISO_CODE_SS2:
4267         case ISO_CODE_SS3:
4268         case ISO_CODE_SO:
4269         case ISO_CODE_SI:
4270                 return 1;
4271
4272         default:
4273                 return 0;
4274         }
4275 }
4276
4277 /* Parse one byte of an ISO2022 escape sequence.
4278    If the result is an invalid escape sequence, return 0 and
4279    do not change anything in STR.  Otherwise, if the result is
4280    an incomplete escape sequence, update ISO2022.ESC and
4281    ISO2022.ESC_BYTES and return -1.  Otherwise, update
4282    all the state variables (but not ISO2022.ESC_BYTES) and
4283    return 1.
4284
4285    If CHECK_INVALID_CHARSETS is non-zero, check for designation
4286    or invocation of an invalid character set and treat that as
4287    an unrecognized escape sequence.
4288
4289    ********************************************************************
4290
4291    #### Strategies for error annotation and coding orthogonalization
4292
4293    We really want to separate out a number of things.  Conceptually,
4294    there is a nested syntax.
4295
4296    At the top level is the ISO 2022 extension syntax, including charset
4297    designation and invocation, and certain auxiliary controls such as the
4298    ISO 6429 direction specification.  These are octet-oriented, with the
4299    single exception (AFAIK) of the "exit Unicode" sequence which uses the
4300    UTF's natural width (1 byte for UTF-7 and UTF-8, 2 bytes for UCS-2 and
4301    UTF-16, and 4 bytes for UCS-4 and UTF-32).  This will be treated as a
4302    (deprecated) special case in Unicode processing.
4303
4304    The middle layer is ISO 2022 character interpretation.  This will depend
4305    on the current state of the ISO 2022 registers, and assembles octets
4306    into the character's internal representation.
4307
4308    The lowest level is translating system control conventions.  At present
4309    this is restricted to newline translation, but one could imagine doing
4310    tab conversion or line wrapping here.  "Escape from Unicode" processing
4311    would be done at this level.
4312
4313    At each level the parser will verify the syntax.  In the case of a
4314    syntax error or warning (such as a redundant escape sequence that affects
4315    no characters), the parser will take some action, typically inserting the
4316    erroneous octets directly into the output and creating an annotation
4317    which can be used by higher level I/O to mark the affected region.
4318
4319    This should make it possible to do something sensible about separating
4320    newline convention processing from character construction, and about
4321    preventing ISO 2022 escape sequences from being recognized
4322    inappropriately.
4323
4324    The basic strategy will be to have octet classification tables, and
4325    switch processing according to the table entry.
4326
4327    It's possible that, by doing the processing with tables of functions or
4328    the like, the parser can be used for both detection and translation. */
4329
4330 static int
4331 parse_iso2022_esc(Lisp_Object codesys, struct iso2022_decoder *iso,
4332                   unsigned char c, unsigned int *flags,
4333                   int check_invalid_charsets)
4334 {
4335         /* (1) If we're at the end of a designation sequence, CS is the
4336            charset being designated and REG is the register to designate
4337            it to.
4338
4339            (2) If we're at the end of a locking-shift sequence, REG is
4340            the register to invoke and HALF (0 == left, 1 == right) is
4341            the half to invoke it into.
4342
4343            (3) If we're at the end of a single-shift sequence, REG is
4344            the register to invoke. */
4345         Lisp_Object cs = Qnil;
4346         int reg, half;
4347
4348         /* NOTE: This code does goto's all over the fucking place.
4349            The reason for this is that we're basically implementing
4350            a state machine here, and hierarchical languages like C
4351            don't really provide a clean way of doing this. */
4352
4353         if (!(*flags & CODING_STATE_ESCAPE))
4354                 /* At beginning of escape sequence; we need to reset our
4355                    escape-state variables. */
4356                 iso->esc = ISO_ESC_NOTHING;
4357
4358         iso->output_literally = 0;
4359         iso->output_direction_sequence = 0;
4360
4361         switch (iso->esc) {
4362         case ISO_ESC_NOTHING:
4363                 iso->esc_bytes_index = 0;
4364                 switch (c) {
4365                 case ISO_CODE_ESC:      /* Start escape sequence */
4366                         *flags |= CODING_STATE_ESCAPE;
4367                         iso->esc = ISO_ESC;
4368                         goto not_done;
4369
4370                 case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
4371                         *flags |= CODING_STATE_ESCAPE;
4372                         iso->esc = ISO_ESC_5_11;
4373                         goto not_done;
4374
4375                 case ISO_CODE_SO:       /* locking shift 1 */
4376                         reg = 1;
4377                         half = 0;
4378                         goto locking_shift;
4379                 case ISO_CODE_SI:       /* locking shift 0 */
4380                         reg = 0;
4381                         half = 0;
4382                         goto locking_shift;
4383
4384                 case ISO_CODE_SS2:      /* single shift */
4385                         reg = 2;
4386                         goto single_shift;
4387                 case ISO_CODE_SS3:      /* single shift */
4388                         reg = 3;
4389                         goto single_shift;
4390
4391                 default:        /* Other control characters */
4392                         return 0;
4393                 }
4394
4395         case ISO_ESC:
4396                 switch (c) {
4397           /**** single shift ****/
4398
4399                 case 'N':       /* single shift 2 */
4400                         reg = 2;
4401                         goto single_shift;
4402                 case 'O':       /* single shift 3 */
4403                         reg = 3;
4404                         goto single_shift;
4405
4406           /**** locking shift ****/
4407
4408                 case '~':       /* locking shift 1 right */
4409                         reg = 1;
4410                         half = 1;
4411                         goto locking_shift;
4412                 case 'n':       /* locking shift 2 */
4413                         reg = 2;
4414                         half = 0;
4415                         goto locking_shift;
4416                 case '}':       /* locking shift 2 right */
4417                         reg = 2;
4418                         half = 1;
4419                         goto locking_shift;
4420                 case 'o':       /* locking shift 3 */
4421                         reg = 3;
4422                         half = 0;
4423                         goto locking_shift;
4424                 case '|':       /* locking shift 3 right */
4425                         reg = 3;
4426                         half = 1;
4427                         goto locking_shift;
4428
4429 #ifdef ENABLE_COMPOSITE_CHARS
4430           /**** composite ****/
4431
4432                 case '0':
4433                         iso->esc = ISO_ESC_START_COMPOSITE;
4434                         *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4435                             CODING_STATE_COMPOSITE;
4436                         return 1;
4437
4438                 case '1':
4439                         iso->esc = ISO_ESC_END_COMPOSITE;
4440                         *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4441                             ~CODING_STATE_COMPOSITE;
4442                         return 1;
4443 #endif                          /* ENABLE_COMPOSITE_CHARS */
4444
4445           /**** directionality ****/
4446
4447                 case '[':
4448                         iso->esc = ISO_ESC_5_11;
4449                         goto not_done;
4450
4451           /**** designation ****/
4452
4453                 case '$':       /* multibyte charset prefix */
4454                         iso->esc = ISO_ESC_2_4;
4455                         goto not_done;
4456
4457                 default:
4458                         if (0x28 <= c && c <= 0x2F) {
4459                                 iso->esc =
4460                                     (enum iso_esc_flag)(c - 0x28 + ISO_ESC_2_8);
4461                                 goto not_done;
4462                         }
4463
4464                         /* This function is called with CODESYS equal to nil when
4465                            doing coding-system detection. */
4466                         if (!NILP(codesys)
4467                             && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
4468                             && fit_to_be_escape_quoted(c)) {
4469                                 iso->esc = ISO_ESC_LITERAL;
4470                                 *flags &= CODING_STATE_ISO2022_LOCK;
4471                                 return 1;
4472                         }
4473
4474                         /* bzzzt! */
4475                         return 0;
4476                 }
4477
4478       /**** directionality ****/
4479
4480         case ISO_ESC_5_11:      /* ISO6429 direction control */
4481                 if (c == ']') {
4482                         *flags &=
4483                             (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4484                         goto directionality;
4485                 }
4486                 if (c == '0')
4487                         iso->esc = ISO_ESC_5_11_0;
4488                 else if (c == '1')
4489                         iso->esc = ISO_ESC_5_11_1;
4490                 else if (c == '2')
4491                         iso->esc = ISO_ESC_5_11_2;
4492                 else
4493                         return 0;
4494                 goto not_done;
4495
4496         case ISO_ESC_5_11_0:
4497                 if (c == ']') {
4498                         *flags &=
4499                             (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4500                         goto directionality;
4501                 }
4502                 return 0;
4503
4504         case ISO_ESC_5_11_1:
4505                 if (c == ']') {
4506                         *flags =
4507                             (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4508                         goto directionality;
4509                 }
4510                 return 0;
4511
4512         case ISO_ESC_5_11_2:
4513                 if (c == ']') {
4514                         *flags =
4515                             (*flags & CODING_STATE_ISO2022_LOCK) |
4516                             CODING_STATE_R2L;
4517                         goto directionality;
4518                 }
4519                 return 0;
4520
4521         directionality:
4522                 iso->esc = ISO_ESC_DIRECTIONALITY;
4523                 /* Various junk here to attempt to preserve the direction
4524                    sequences literally in the text if they would otherwise be
4525                    swallowed due to invalid designations that don't show up as
4526                    actual charset changes in the text. */
4527                 if (iso->invalid_switch_dir) {
4528                         /* We already inserted a direction switch literally into
4529                            the text.  We assume (#### this may not be right)
4530                            that the next direction switch is the one going the
4531                            other way, and we need to output that literally as
4532                            well. */
4533                         iso->output_literally = 1;
4534                         iso->invalid_switch_dir = 0;
4535                 } else {
4536                         int jj;
4537
4538                         /* If we are in the thrall of an invalid designation,
4539                            then stick the directionality sequence literally into
4540                            the output stream so it ends up in the original text
4541                            again. */
4542                         for (jj = 0; jj < 4; jj++)
4543                                 if (iso->invalid_designated[jj])
4544                                         break;
4545                         if (jj < 4) {
4546                                 iso->output_literally = 1;
4547                                 iso->invalid_switch_dir = 1;
4548                         } else
4549                                 /* Indicate that we haven't yet seen a valid
4550                                    designation, so that if a switch-dir is
4551                                    directly followed by an invalid designation,
4552                                    both get inserted literally. */
4553                                 iso->switched_dir_and_no_valid_charset_yet = 1;
4554                 }
4555                 return 1;
4556
4557                 /**** designation ****/
4558
4559         case ISO_ESC_2_4:
4560                 if (0x28 <= c && c <= 0x2F) {
4561                         iso->esc =
4562                             (enum iso_esc_flag)(c - 0x28 + ISO_ESC_2_4_8);
4563                         goto not_done;
4564                 }
4565                 if (0x40 <= c && c <= 0x42) {
4566                         cs = CHARSET_BY_ATTRIBUTES(CHARSET_TYPE_94X94, c,
4567                                                    *flags & CODING_STATE_R2L ?
4568                                                    CHARSET_RIGHT_TO_LEFT :
4569                                                    CHARSET_LEFT_TO_RIGHT);
4570                         reg = 0;
4571                         goto designated;
4572                 }
4573                 return 0;
4574
4575                 /* list the rest */
4576         case ISO_ESC_2_8:
4577         case ISO_ESC_2_9:
4578         case ISO_ESC_2_10:
4579         case ISO_ESC_2_11:
4580         case ISO_ESC_2_12:
4581         case ISO_ESC_2_13:
4582         case ISO_ESC_2_14:
4583         case ISO_ESC_2_15:
4584         case ISO_ESC_2_4_8:
4585         case ISO_ESC_2_4_9:
4586         case ISO_ESC_2_4_10:
4587         case ISO_ESC_2_4_11:
4588         case ISO_ESC_2_4_12:
4589         case ISO_ESC_2_4_13:
4590         case ISO_ESC_2_4_14:
4591         case ISO_ESC_2_4_15:
4592         case ISO_ESC_SINGLE_SHIFT:
4593         case ISO_ESC_LOCKING_SHIFT:
4594         case ISO_ESC_DESIGNATE:
4595         case ISO_ESC_DIRECTIONALITY:
4596         case ISO_ESC_LITERAL:
4597
4598         default: {
4599                 int type = -1;
4600
4601                 if (c < '0' || c > '~')
4602                         return 0;       /* bad final byte */
4603
4604                 if (iso->esc >= ISO_ESC_2_8 && iso->esc <= ISO_ESC_2_15) {
4605                         type = ((iso->esc >= ISO_ESC_2_12) ?
4606                                 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4607                         reg = (iso->esc - ISO_ESC_2_8) & 3;
4608                 } else if (iso->esc >= ISO_ESC_2_4_8 &&
4609                            iso->esc <= ISO_ESC_2_4_15) {
4610                         type = ((iso->esc >= ISO_ESC_2_4_12) ?
4611                                 CHARSET_TYPE_96X96 :
4612                                 CHARSET_TYPE_94X94);
4613                         reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4614                 } else {
4615                         /* Can this ever be reached? -slb */
4616                         abort();
4617                         return 0;
4618                 }
4619
4620                 cs = CHARSET_BY_ATTRIBUTES(type, c,
4621                                            *flags & CODING_STATE_R2L ?
4622                                            CHARSET_RIGHT_TO_LEFT :
4623                                            CHARSET_LEFT_TO_RIGHT);
4624                 goto designated;
4625         }
4626         }
4627
4628       not_done:
4629         iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char)c;
4630         return -1;
4631
4632       single_shift:
4633         if (check_invalid_charsets && !CHARSETP(iso->charset[reg]))
4634                 /* can't invoke something that ain't there. */
4635                 return 0;
4636         iso->esc = ISO_ESC_SINGLE_SHIFT;
4637         *flags &= CODING_STATE_ISO2022_LOCK;
4638         if (reg == 2)
4639                 *flags |= CODING_STATE_SS2;
4640         else
4641                 *flags |= CODING_STATE_SS3;
4642         return 1;
4643
4644       locking_shift:
4645         if (check_invalid_charsets && !CHARSETP(iso->charset[reg]))
4646                 /* can't invoke something that ain't there. */
4647                 return 0;
4648         if (half)
4649                 iso->register_right = reg;
4650         else
4651                 iso->register_left = reg;
4652         *flags &= CODING_STATE_ISO2022_LOCK;
4653         iso->esc = ISO_ESC_LOCKING_SHIFT;
4654         return 1;
4655
4656       designated:
4657         if (NILP(cs) && check_invalid_charsets) {
4658                 iso->invalid_designated[reg] = 1;
4659                 iso->charset[reg] = Vcharset_ascii;
4660                 iso->esc = ISO_ESC_DESIGNATE;
4661                 *flags &= CODING_STATE_ISO2022_LOCK;
4662                 iso->output_literally = 1;
4663                 if (iso->switched_dir_and_no_valid_charset_yet) {
4664                         /* We encountered a switch-direction followed by an
4665                            invalid designation.  Ensure that the switch-direction
4666                            gets outputted; otherwise it will probably get eaten
4667                            when the text is written out again. */
4668                         iso->switched_dir_and_no_valid_charset_yet = 0;
4669                         iso->output_direction_sequence = 1;
4670                         /* And make sure that the switch-dir going the other
4671                            way gets outputted, as well. */
4672                         iso->invalid_switch_dir = 1;
4673                 }
4674                 return 1;
4675         }
4676         /* This function is called with CODESYS equal to nil when
4677            doing coding-system detection. */
4678         if (!NILP(codesys)) {
4679                 charset_conversion_spec_dynarr *dyn =
4680                     XCODING_SYSTEM(codesys)->iso2022.input_conv;
4681
4682                 if (dyn) {
4683                         int i;
4684
4685                         for (i = 0; i < Dynarr_length(dyn); i++) {
4686                                 struct charset_conversion_spec *spec =
4687                                     Dynarr_atp(dyn, i);
4688                                 if (EQ(cs, spec->from_charset))
4689                                         cs = spec->to_charset;
4690                         }
4691                 }
4692         }
4693
4694         iso->charset[reg] = cs;
4695         iso->esc = ISO_ESC_DESIGNATE;
4696         *flags &= CODING_STATE_ISO2022_LOCK;
4697         if (iso->invalid_designated[reg]) {
4698                 iso->invalid_designated[reg] = 0;
4699                 iso->output_literally = 1;
4700         }
4701         if (iso->switched_dir_and_no_valid_charset_yet)
4702                 iso->switched_dir_and_no_valid_charset_yet = 0;
4703         return 1;
4704 }
4705
4706 static int
4707 detect_coding_iso2022(struct detection_state *st, const Extbyte * src,
4708                       Lstream_data_count n)
4709 {
4710         int mask;
4711
4712         /* #### There are serious deficiencies in the recognition mechanism
4713            here.  This needs to be much smarter if it's going to cut it.
4714            The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4715            it should be detected as Latin-1.
4716            All the ISO2022 stuff in this file should be synced up with the
4717            code from FSF Emacs-20.4, in which Mule should be more or less stable.
4718            Perhaps we should wait till R2L works in FSF Emacs? */
4719
4720         if (!st->iso2022.initted) {
4721                 reset_iso2022(Qnil, &st->iso2022.iso);
4722                 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4723                                     CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4724                                     CODING_CATEGORY_ISO_8_1_MASK |
4725                                     CODING_CATEGORY_ISO_8_2_MASK |
4726                                     CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4727                 st->iso2022.flags = 0;
4728                 st->iso2022.high_byte_count = 0;
4729                 st->iso2022.saw_single_shift = 0;
4730                 st->iso2022.initted = 1;
4731         }
4732
4733         mask = st->iso2022.mask;
4734
4735         while (n--) {
4736                 const unsigned char c = *(const unsigned char *)src++;
4737                 if (c >= 0xA0) {
4738                         mask &= ~CODING_CATEGORY_ISO_7_MASK;
4739                         st->iso2022.high_byte_count++;
4740                 } else {
4741                         if (st->iso2022.high_byte_count
4742                             && !st->iso2022.saw_single_shift) {
4743                                 if (st->iso2022.high_byte_count & 1)
4744                                         /* odd number of high bytes; assume not iso-8-2 */
4745                                         mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4746                         }
4747                         st->iso2022.high_byte_count = 0;
4748                         st->iso2022.saw_single_shift = 0;
4749                         if (c > 0x80)
4750                                 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4751                 }
4752                 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4753                     && (BYTE_C0_P(c) || BYTE_C1_P(c))) {        /* control chars */
4754                         switch (c) {
4755                                 /* Allow and ignore control characters that you might
4756                                    reasonably see in a text file */
4757                         case '\r':
4758                         case '\n':
4759                         case '\t':
4760                         case 7: /* bell */
4761                         case 8: /* backspace */
4762                         case 11:        /* vertical tab */
4763                         case 12:        /* form feed */
4764                         case 26:        /* MS-DOS C-z junk */
4765                         case 31:        /* '^_' -- for info */
4766                                 goto label_continue_loop;
4767
4768                         default:
4769                                 break;
4770                         }
4771                 }
4772
4773                 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P(c)
4774                     || BYTE_C1_P(c)) {
4775                         if (parse_iso2022_esc(Qnil, &st->iso2022.iso, c,
4776                                               &st->iso2022.flags, 0)) {
4777                                 switch (st->iso2022.iso.esc) {
4778                                 case ISO_ESC_DESIGNATE:
4779                                         mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4780                                         mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4781                                         break;
4782                                 case ISO_ESC_LOCKING_SHIFT:
4783                                         mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4784                                         goto ran_out_of_chars;
4785                                 case ISO_ESC_SINGLE_SHIFT:
4786                                         mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4787                                         st->iso2022.saw_single_shift = 1;
4788                                         break;
4789
4790                                         /* list the rest */
4791                                 case ISO_ESC_NOTHING:
4792                                 case ISO_ESC:
4793                                 case ISO_ESC_2_4:
4794                                 case ISO_ESC_2_8:
4795                                 case ISO_ESC_2_9:
4796                                 case ISO_ESC_2_10:
4797                                 case ISO_ESC_2_11:
4798                                 case ISO_ESC_2_12:
4799                                 case ISO_ESC_2_13:
4800                                 case ISO_ESC_2_14:
4801                                 case ISO_ESC_2_15:
4802                                 case ISO_ESC_2_4_8:
4803                                 case ISO_ESC_2_4_9:
4804                                 case ISO_ESC_2_4_10:
4805                                 case ISO_ESC_2_4_11:
4806                                 case ISO_ESC_2_4_12:
4807                                 case ISO_ESC_2_4_13:
4808                                 case ISO_ESC_2_4_14:
4809                                 case ISO_ESC_2_4_15:
4810                                 case ISO_ESC_5_11:
4811                                 case ISO_ESC_5_11_0:
4812                                 case ISO_ESC_5_11_1:
4813                                 case ISO_ESC_5_11_2:
4814                                 case ISO_ESC_DIRECTIONALITY:
4815                                 case ISO_ESC_LITERAL:
4816                                 default:
4817                                         break;
4818                                 }
4819                         } else {
4820                                 mask = 0;
4821                                 goto ran_out_of_chars;
4822                         }
4823                 }
4824         label_continue_loop:;
4825         }
4826
4827 ran_out_of_chars:
4828         return mask;
4829 }
4830
4831 static int postprocess_iso2022_mask(int mask)
4832 {
4833         /* #### kind of cheesy */
4834         /* If seven-bit ISO is allowed, then assume that the encoding is
4835            entirely seven-bit and turn off the eight-bit ones. */
4836         if (mask & CODING_CATEGORY_ISO_7_MASK)
4837                 mask &= ~(CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4838                           CODING_CATEGORY_ISO_8_1_MASK |
4839                           CODING_CATEGORY_ISO_8_2_MASK);
4840         return mask;
4841 }
4842
4843 /* If FLAGS is a null pointer or specifies right-to-left motion,
4844    output a switch-dir-to-left-to-right sequence to DST.
4845    Also update FLAGS if it is not a null pointer.
4846    If INTERNAL_P is set, we are outputting in internal format and
4847    need to handle the CSI differently. */
4848
4849 static void
4850 restore_left_to_right_direction(Lisp_Coding_System * codesys,
4851                                 unsigned_char_dynarr * dst,
4852                                 unsigned int *flags, int internal_p)
4853 {
4854         if (!flags || (*flags & CODING_STATE_R2L)) {
4855                 if (CODING_SYSTEM_ISO2022_SEVEN(codesys)) {
4856                         Dynarr_add(dst, ISO_CODE_ESC);
4857                         Dynarr_add(dst, '[');
4858                 } else if (internal_p)
4859                         DECODE_ADD_BINARY_CHAR(ISO_CODE_CSI, dst);
4860                 else
4861                         Dynarr_add(dst, ISO_CODE_CSI);
4862                 Dynarr_add(dst, '0');
4863                 Dynarr_add(dst, ']');
4864                 if (flags)
4865                         *flags &= ~CODING_STATE_R2L;
4866         }
4867 }
4868
4869 /* If FLAGS is a null pointer or specifies a direction different from
4870    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4871    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4872    sequence to DST.  Also update FLAGS if it is not a null pointer.
4873    If INTERNAL_P is set, we are outputting in internal format and
4874    need to handle the CSI differently. */
4875
4876 static void
4877 ensure_correct_direction(int direction, Lisp_Coding_System * codesys,
4878                          unsigned_char_dynarr * dst, unsigned int *flags,
4879                          int internal_p)
4880 {
4881         if ((!flags || (*flags & CODING_STATE_R2L)) &&
4882             direction == CHARSET_LEFT_TO_RIGHT)
4883                 restore_left_to_right_direction(codesys, dst, flags,
4884                                                 internal_p);
4885         else if (!CODING_SYSTEM_ISO2022_NO_ISO6429(codesys)
4886                  && (!flags || !(*flags & CODING_STATE_R2L)) &&
4887                  direction == CHARSET_RIGHT_TO_LEFT) {
4888                 if (CODING_SYSTEM_ISO2022_SEVEN(codesys)) {
4889                         Dynarr_add(dst, ISO_CODE_ESC);
4890                         Dynarr_add(dst, '[');
4891                 } else if (internal_p)
4892                         DECODE_ADD_BINARY_CHAR(ISO_CODE_CSI, dst);
4893                 else
4894                         Dynarr_add(dst, ISO_CODE_CSI);
4895                 Dynarr_add(dst, '2');
4896                 Dynarr_add(dst, ']');
4897                 if (flags)
4898                         *flags |= CODING_STATE_R2L;
4899         }
4900 }
4901
4902 /* Convert ISO2022-format data to internal format. */
4903
4904 static void
4905 decode_coding_iso2022(lstream_t decoding, const Extbyte * src,
4906                       unsigned_char_dynarr * dst, Lstream_data_count n)
4907 {
4908         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
4909         unsigned int flags = str->flags;
4910         unsigned int ch = str->ch;
4911         eol_type_t eol_type = str->eol_type;
4912 #ifdef ENABLE_COMPOSITE_CHARS
4913         unsigned_char_dynarr *real_dst = dst;
4914 #endif
4915         Lisp_Object coding_system;
4916
4917         XSETCODING_SYSTEM(coding_system, str->codesys);
4918
4919 #ifdef ENABLE_COMPOSITE_CHARS
4920         if (flags & CODING_STATE_COMPOSITE)
4921                 dst = str->iso2022.composite_chars;
4922 #endif                          /* ENABLE_COMPOSITE_CHARS */
4923
4924         while (n--) {
4925                 const unsigned char c = *(const unsigned char *)src++;
4926                 if (flags & CODING_STATE_ESCAPE) {
4927                         /* Within ESC sequence */
4928                         int retval = parse_iso2022_esc(
4929                                 coding_system, &str->iso2022, c, &flags, 1);
4930
4931                         if (retval) {
4932                                 switch (str->iso2022.esc) {
4933 #ifdef ENABLE_COMPOSITE_CHARS
4934                                 case ISO_ESC_START_COMPOSITE:
4935                                         if (str->iso2022.composite_chars)
4936                                                 Dynarr_reset(str->iso2022.
4937                                                              composite_chars);
4938                                         else
4939                                                 str->iso2022.composite_chars =
4940                                                     Dynarr_new(unsigned_char);
4941                                         dst = str->iso2022.composite_chars;
4942                                         break;
4943                                 case ISO_ESC_END_COMPOSITE:
4944                                         {
4945                                                 Bufbyte comstr[MAX_EMCHAR_LEN];
4946                                                 Bytecount len;
4947                                                 Emchar emch =
4948                                                     lookup_composite_char
4949                                                     (Dynarr_atp(dst, 0),
4950                                                      Dynarr_length(dst));
4951                                                 dst = real_dst;
4952                                                 len =
4953                                                     set_charptr_emchar(comstr,
4954                                                                        emch);
4955                                                 Dynarr_add_many(dst, comstr,
4956                                                                 len);
4957                                                 break;
4958                                         }
4959 #endif                          /* ENABLE_COMPOSITE_CHARS */
4960
4961                                 case ISO_ESC_LITERAL:
4962                                         DECODE_ADD_BINARY_CHAR(c, dst);
4963                                         break;
4964
4965                                 case ISO_ESC_NOTHING:
4966                                 case ISO_ESC:
4967                                 case ISO_ESC_2_4:
4968                                 case ISO_ESC_2_8:
4969                                 case ISO_ESC_2_9:
4970                                 case ISO_ESC_2_10:
4971                                 case ISO_ESC_2_11:
4972                                 case ISO_ESC_2_12:
4973                                 case ISO_ESC_2_13:
4974                                 case ISO_ESC_2_14:
4975                                 case ISO_ESC_2_15:
4976                                 case ISO_ESC_2_4_8:
4977                                 case ISO_ESC_2_4_9:
4978                                 case ISO_ESC_2_4_10:
4979                                 case ISO_ESC_2_4_11:
4980                                 case ISO_ESC_2_4_12:
4981                                 case ISO_ESC_2_4_13:
4982                                 case ISO_ESC_2_4_14:
4983                                 case ISO_ESC_2_4_15:
4984                                 case ISO_ESC_5_11:
4985                                 case ISO_ESC_5_11_0:
4986                                 case ISO_ESC_5_11_1:
4987                                 case ISO_ESC_5_11_2:
4988                                 case ISO_ESC_SINGLE_SHIFT:
4989                                 case ISO_ESC_LOCKING_SHIFT:
4990                                 case ISO_ESC_DESIGNATE:
4991                                 case ISO_ESC_DIRECTIONALITY:
4992
4993                                 default:
4994                                         /* Everything else handled already */
4995                                         break;
4996                                 }
4997                         }
4998
4999                         /* Attempted error recovery. */
5000                         if (str->iso2022.output_direction_sequence)
5001                                 ensure_correct_direction(flags &
5002                                                          CODING_STATE_R2L ?
5003                                                          CHARSET_RIGHT_TO_LEFT :
5004                                                          CHARSET_LEFT_TO_RIGHT,
5005                                                          str->codesys, dst, 0,
5006                                                          1);
5007                         /* More error recovery. */
5008                         if (!retval || str->iso2022.output_literally) {
5009                                 /* Output the (possibly invalid) sequence */
5010                                 int i;
5011                                 for (i = 0; i < str->iso2022.esc_bytes_index;
5012                                      i++)
5013                                         DECODE_ADD_BINARY_CHAR(str->iso2022.
5014                                                                esc_bytes[i],
5015                                                                dst);
5016                                 flags &= CODING_STATE_ISO2022_LOCK;
5017                                 if (!retval)
5018                                         n++, src--;     /* Repeat the loop with the same character. */
5019                                 else {
5020                                         /* No sense in reprocessing the final byte of the
5021                                            escape sequence; it could mess things up anyway.
5022                                            Just add it now. */
5023                                         DECODE_ADD_BINARY_CHAR(c, dst);
5024                                 }
5025                         }
5026                         ch = 0;
5027                 } else if (BYTE_C0_P(c) || BYTE_C1_P(c)) {      /* Control characters */
5028
5029           /***** Error-handling *****/
5030
5031                         /* If we were in the middle of a character, dump out the
5032                            partial character. */
5033                         DECODE_OUTPUT_PARTIAL_CHAR(ch);
5034
5035                         /* If we just saw a single-shift character, dump it out.
5036                            This may dump out the wrong sort of single-shift character,
5037                            but least it will give an indication that something went
5038                            wrong. */
5039                         if (flags & CODING_STATE_SS2) {
5040                                 DECODE_ADD_BINARY_CHAR(ISO_CODE_SS2, dst);
5041                                 flags &= ~CODING_STATE_SS2;
5042                         }
5043                         if (flags & CODING_STATE_SS3) {
5044                                 DECODE_ADD_BINARY_CHAR(ISO_CODE_SS3, dst);
5045                                 flags &= ~CODING_STATE_SS3;
5046                         }
5047
5048           /***** Now handle the control characters. *****/
5049
5050                         /* Handle CR/LF */
5051                         DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5052
5053                         flags &= CODING_STATE_ISO2022_LOCK;
5054
5055                         if (!parse_iso2022_esc
5056                             (coding_system, &str->iso2022, c, &flags, 1))
5057                                 DECODE_ADD_BINARY_CHAR(c, dst);
5058                 } else {        /* Graphic characters */
5059                         Lisp_Object charset;
5060                         int lb;
5061                         int reg;
5062
5063                         DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5064
5065                         /* Now determine the charset. */
5066                         reg = ((flags & CODING_STATE_SS2) ? 2
5067                                : (flags & CODING_STATE_SS3) ? 3
5068                                : !BYTE_ASCII_P(c) ? str->iso2022.register_right
5069                                : str->iso2022.register_left);
5070                         charset = str->iso2022.charset[reg];
5071
5072                         /* Error checking: */
5073                         if (!CHARSETP(charset)
5074                             || str->iso2022.invalid_designated[reg]
5075                             ||
5076                             (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5077                              && XCHARSET_CHARS(charset) == 94))
5078                                 /* Mrmph.  We are trying to invoke a register that has no
5079                                    or an invalid charset in it, or trying to add a character
5080                                    outside the range of the charset.  Insert that char literally
5081                                    to preserve it for the output. */
5082                         {
5083                                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5084                                 DECODE_ADD_BINARY_CHAR(c, dst);
5085                         }
5086
5087                         else {
5088                                 /* Things are probably hunky-dorey. */
5089
5090                                 /* Fetch reverse charset, maybe. */
5091                                 if (((flags & CODING_STATE_R2L) &&
5092                                      XCHARSET_DIRECTION(charset) ==
5093                                      CHARSET_LEFT_TO_RIGHT)
5094                                     || (!(flags & CODING_STATE_R2L)
5095                                         && XCHARSET_DIRECTION(charset) ==
5096                                         CHARSET_RIGHT_TO_LEFT)) {
5097                                         Lisp_Object new_charset =
5098                                             XCHARSET_REVERSE_DIRECTION_CHARSET
5099                                             (charset);
5100                                         if (!NILP(new_charset))
5101                                                 charset = new_charset;
5102                                 }
5103
5104                                 lb = XCHARSET_LEADING_BYTE(charset);
5105                                 switch (XCHARSET_REP_BYTES(charset)) {
5106                                 case 1: /* ASCII */
5107                                         DECODE_OUTPUT_PARTIAL_CHAR(ch);
5108                                         Dynarr_add(dst, c & 0x7F);
5109                                         break;
5110
5111                                 case 2: /* one-byte official */
5112                                         DECODE_OUTPUT_PARTIAL_CHAR(ch);
5113                                         Dynarr_add(dst, lb);
5114                                         Dynarr_add(dst, c | 0x80);
5115                                         break;
5116
5117                                 case 3: /* one-byte private or two-byte official */
5118                                         if (XCHARSET_PRIVATE_P(charset)) {
5119                                                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5120                                                 Dynarr_add(dst,
5121                                                            PRE_LEADING_BYTE_PRIVATE_1);
5122                                                 Dynarr_add(dst, lb);
5123                                                 Dynarr_add(dst, c | 0x80);
5124                                         } else {
5125                                                 if (ch) {
5126                                                         Dynarr_add(dst, lb);
5127                                                         Dynarr_add(dst,
5128                                                                    ch | 0x80);
5129                                                         Dynarr_add(dst,
5130                                                                    c | 0x80);
5131                                                         ch = 0;
5132                                                 } else
5133                                                         ch = c;
5134                                         }
5135                                         break;
5136
5137                                 default:        /* two-byte private */
5138                                         if (ch) {
5139                                                 Dynarr_add(dst,
5140                                                            PRE_LEADING_BYTE_PRIVATE_2);
5141                                                 Dynarr_add(dst, lb);
5142                                                 Dynarr_add(dst, ch | 0x80);
5143                                                 Dynarr_add(dst, c | 0x80);
5144                                                 ch = 0;
5145                                         } else
5146                                                 ch = c;
5147                                 }
5148                         }
5149
5150                         if (!ch)
5151                                 flags &= CODING_STATE_ISO2022_LOCK;
5152                 }
5153
5154               label_continue_loop:;
5155         }
5156
5157         if (flags & CODING_STATE_END)
5158                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5159
5160         str->flags = flags;
5161         str->ch = ch;
5162 }
5163
5164 /***** ISO2022 encoder *****/
5165
5166 /* Designate CHARSET into register REG. */
5167
5168 static void
5169 iso2022_designate(Lisp_Object charset, unsigned char reg,
5170                   encoding_stream_t str, unsigned_char_dynarr * dst)
5171 {
5172         static const char inter94[] = "()*+";
5173         static const char inter96[] = ",-./";
5174         unsigned int type;
5175         unsigned char final;
5176         Lisp_Object old_charset = str->iso2022.charset[reg];
5177
5178         str->iso2022.charset[reg] = charset;
5179         if (!CHARSETP(charset))
5180                 /* charset might be an initial nil or t. */
5181                 return;
5182         type = XCHARSET_TYPE(charset);
5183         final = XCHARSET_FINAL(charset);
5184         if (!str->iso2022.force_charset_on_output[reg] &&
5185             CHARSETP(old_charset) &&
5186             XCHARSET_TYPE(old_charset) == type &&
5187             XCHARSET_FINAL(old_charset) == final)
5188                 return;
5189
5190         str->iso2022.force_charset_on_output[reg] = 0;
5191
5192         {
5193                 charset_conversion_spec_dynarr *dyn =
5194                     str->codesys->iso2022.output_conv;
5195
5196                 if (dyn) {
5197                         int i;
5198
5199                         for (i = 0; i < Dynarr_length(dyn); i++) {
5200                                 struct charset_conversion_spec *spec =
5201                                     Dynarr_atp(dyn, i);
5202                                 if (EQ(charset, spec->from_charset))
5203                                         charset = spec->to_charset;
5204                         }
5205                 }
5206         }
5207
5208         Dynarr_add(dst, ISO_CODE_ESC);
5209         switch (type) {
5210         case CHARSET_TYPE_94:
5211                 Dynarr_add(dst, inter94[reg]);
5212                 break;
5213         case CHARSET_TYPE_96:
5214                 Dynarr_add(dst, inter96[reg]);
5215                 break;
5216         case CHARSET_TYPE_94X94:
5217                 Dynarr_add(dst, '$');
5218                 if (reg != 0 || !(CODING_SYSTEM_ISO2022_SHORT(str->codesys))
5219                     || final < '@' || final > 'B')
5220                         Dynarr_add(dst, inter94[reg]);
5221                 break;
5222         case CHARSET_TYPE_96X96:
5223                 Dynarr_add(dst, '$');
5224                 Dynarr_add(dst, inter96[reg]);
5225                 break;
5226         default:
5227                 break;
5228         }
5229         Dynarr_add(dst, final);
5230 }
5231
5232 static void
5233 ensure_normal_shift(encoding_stream_t str, unsigned_char_dynarr * dst)
5234 {
5235         if (str->iso2022.register_left != 0) {
5236                 Dynarr_add(dst, ISO_CODE_SI);
5237                 str->iso2022.register_left = 0;
5238         }
5239 }
5240
5241 static void
5242 ensure_shift_out(encoding_stream_t str, unsigned_char_dynarr * dst)
5243 {
5244         if (str->iso2022.register_left != 1) {
5245                 Dynarr_add(dst, ISO_CODE_SO);
5246                 str->iso2022.register_left = 1;
5247         }
5248 }
5249
5250 /* Convert internally-formatted data to ISO2022 format. */
5251
5252 static void
5253 encode_coding_iso2022(lstream_t encoding, const Bufbyte * src,
5254                       unsigned_char_dynarr * dst, Lstream_data_count n)
5255 {
5256         unsigned char charmask, c;
5257         unsigned char char_boundary;
5258         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
5259         unsigned int flags = str->flags;
5260         unsigned int ch = str->ch;
5261         Lisp_Coding_System *codesys = str->codesys;
5262         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
5263         int i;
5264         Lisp_Object charset;
5265         int half;
5266
5267 #ifdef ENABLE_COMPOSITE_CHARS
5268         /* flags for handling composite chars.  We do a little switcharoo
5269            on the source while we're outputting the composite char. */
5270         unsigned int saved_n = 0;
5271         const unsigned char *saved_src = NULL;
5272         int in_composite = 0;
5273 #endif                          /* ENABLE_COMPOSITE_CHARS */
5274
5275         char_boundary = str->iso2022.current_char_boundary;
5276         charset = str->iso2022.current_charset;
5277         half = str->iso2022.current_half;
5278
5279 #ifdef ENABLE_COMPOSITE_CHARS
5280       back_to_square_n:
5281 #endif
5282         while (n--) {
5283                 c = *src++;
5284
5285                 if (BYTE_ASCII_P(c)) {  /* Processing ASCII character */
5286                         ch = 0;
5287
5288                         restore_left_to_right_direction(codesys, dst, &flags,
5289                                                         0);
5290
5291                         /* Make sure G0 contains ASCII */
5292                         if ((c > ' ' && c < ISO_CODE_DEL) ||
5293                             !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL(codesys)) {
5294                                 ensure_normal_shift(str, dst);
5295                                 iso2022_designate(Vcharset_ascii, 0, str, dst);
5296                         }
5297
5298                         /* If necessary, restore everything to the default state
5299                            at end-of-line */
5300                         if (c == '\n' &&
5301                             !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL(codesys))) {
5302                                 restore_left_to_right_direction(codesys, dst,
5303                                                                 &flags, 0);
5304
5305                                 ensure_normal_shift(str, dst);
5306
5307                                 for (i = 0; i < 4; i++) {
5308                                         Lisp_Object initial_charset =
5309                                             CODING_SYSTEM_ISO2022_INITIAL_CHARSET
5310                                             (codesys, i);
5311                                         iso2022_designate(initial_charset, i,
5312                                                           str, dst);
5313                                 }
5314                         }
5315                         if (c == '\n') {
5316                                 if (eol_type != EOL_LF
5317                                     && eol_type != EOL_AUTODETECT)
5318                                         Dynarr_add(dst, '\r');
5319                                 if (eol_type != EOL_CR)
5320                                         Dynarr_add(dst, c);
5321                         } else {
5322                                 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
5323                                     && fit_to_be_escape_quoted(c))
5324                                         Dynarr_add(dst, ISO_CODE_ESC);
5325                                 Dynarr_add(dst, c);
5326                         }
5327                         char_boundary = 1;
5328                 }
5329
5330                 else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) {     /* Processing Leading Byte */
5331                         ch = 0;
5332                         charset = CHARSET_BY_LEADING_BYTE(c);
5333                         if (LEADING_BYTE_PREFIX_P(c))
5334                                 ch = c;
5335                         else if (!EQ(charset, Vcharset_control_1)
5336 #ifdef ENABLE_COMPOSITE_CHARS
5337                                  && !EQ(charset, Vcharset_composite)
5338 #endif
5339                             ) {
5340                                 int reg;
5341
5342                                 ensure_correct_direction(XCHARSET_DIRECTION
5343                                                          (charset), codesys,
5344                                                          dst, &flags, 0);
5345
5346                                 /* Now determine which register to use. */
5347                                 reg = -1;
5348                                 for (i = 0; i < 4; i++) {
5349                                         if (EQ(charset, str->iso2022.charset[i])
5350                                             || EQ(charset,
5351                                                   CODING_SYSTEM_ISO2022_INITIAL_CHARSET
5352                                                   (codesys, i))) {
5353                                                 reg = i;
5354                                                 break;
5355                                         }
5356                                 }
5357
5358                                 if (reg == -1) {
5359                                         if (XCHARSET_GRAPHIC(charset) != 0) {
5360                                                 if (!NILP
5361                                                     (str->iso2022.charset[1])
5362                                                     &&
5363                                                     (!CODING_SYSTEM_ISO2022_SEVEN
5364                                                      (codesys)
5365                                                      ||
5366                                                      CODING_SYSTEM_ISO2022_LOCK_SHIFT
5367                                                      (codesys)))
5368                                                         reg = 1;
5369                                                 else if (!NILP
5370                                                          (str->iso2022.
5371                                                           charset[2]))
5372                                                         reg = 2;
5373                                                 else if (!NILP
5374                                                          (str->iso2022.
5375                                                           charset[3]))
5376                                                         reg = 3;
5377                                                 else
5378                                                         reg = 0;
5379                                         } else
5380                                                 reg = 0;
5381                                 }
5382
5383                                 iso2022_designate(charset, reg, str, dst);
5384
5385                                 /* Now invoke that register. */
5386                                 switch (reg) {
5387                                 case 0:
5388                                         ensure_normal_shift(str, dst);
5389                                         half = 0;
5390                                         break;
5391
5392                                 case 1:
5393                                         if (CODING_SYSTEM_ISO2022_SEVEN
5394                                             (codesys)) {
5395                                                 ensure_shift_out(str, dst);
5396                                                 half = 0;
5397                                         } else
5398                                                 half = 1;
5399                                         break;
5400
5401                                 case 2:
5402                                         if (CODING_SYSTEM_ISO2022_SEVEN
5403                                             (str->codesys)) {
5404                                                 Dynarr_add(dst, ISO_CODE_ESC);
5405                                                 Dynarr_add(dst, 'N');
5406                                                 half = 0;
5407                                         } else {
5408                                                 Dynarr_add(dst, ISO_CODE_SS2);
5409                                                 half = 1;
5410                                         }
5411                                         break;
5412
5413                                 case 3:
5414                                         if (CODING_SYSTEM_ISO2022_SEVEN
5415                                             (str->codesys)) {
5416                                                 Dynarr_add(dst, ISO_CODE_ESC);
5417                                                 Dynarr_add(dst, 'O');
5418                                                 half = 0;
5419                                         } else {
5420                                                 Dynarr_add(dst, ISO_CODE_SS3);
5421                                                 half = 1;
5422                                         }
5423                                         break;
5424
5425                                 default:
5426                                         abort();
5427                                 }
5428                         }
5429                         char_boundary = 0;
5430                 } else {        /* Processing Non-ASCII character */
5431                         charmask = (half == 0 ? 0x7F : 0xFF);
5432                         char_boundary = 1;
5433                         if (EQ(charset, Vcharset_control_1)) {
5434                                 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
5435                                     && fit_to_be_escape_quoted(c))
5436                                         Dynarr_add(dst, ISO_CODE_ESC);
5437                                 /* you asked for it ... */
5438                                 Dynarr_add(dst, c - 0x20);
5439                         } else {
5440                                 switch (XCHARSET_REP_BYTES(charset)) {
5441                                 case 2:
5442                                         Dynarr_add(dst, c & charmask);
5443                                         break;
5444                                 case 3:
5445                                         if (XCHARSET_PRIVATE_P(charset)) {
5446                                                 Dynarr_add(dst, c & charmask);
5447                                                 ch = 0;
5448                                         } else if (ch) {
5449 #ifdef ENABLE_COMPOSITE_CHARS
5450                                                 if (EQ
5451                                                     (charset,
5452                                                      Vcharset_composite)) {
5453                                                         if (in_composite) {
5454                                                                 /* #### Bother! We don't know how to
5455                                                                    handle this yet. */
5456                                                                 Dynarr_add(dst,
5457                                                                            '~');
5458                                                         } else {
5459                                                                 Emchar emch =
5460                                                                     MAKE_CHAR
5461                                                                     (Vcharset_composite,
5462                                                                      ch & 0x7F,
5463                                                                      c & 0x7F);
5464                                                                 Lisp_Object lstr
5465                                                                     =
5466                                                                     composite_char_string
5467                                                                     (emch);
5468                                                                 saved_n = n;
5469                                                                 saved_src = src;
5470                                                                 in_composite =
5471                                                                     1;
5472                                                                 src =
5473                                                                     XSTRING_DATA
5474                                                                     (lstr);
5475                                                                 n = XSTRING_LENGTH(lstr);
5476                                                                 Dynarr_add(dst,
5477                                                                            ISO_CODE_ESC);
5478                                                                 Dynarr_add(dst, '0');   /* start composing */
5479                                                         }
5480                                                 } else
5481 #endif                          /* ENABLE_COMPOSITE_CHARS */
5482                                                 {
5483                                                         Dynarr_add(dst,
5484                                                                    ch &
5485                                                                    charmask);
5486                                                         Dynarr_add(dst,
5487                                                                    c &
5488                                                                    charmask);
5489                                                 }
5490                                                 ch = 0;
5491                                         } else {
5492                                                 ch = c;
5493                                                 char_boundary = 0;
5494                                         }
5495                                         break;
5496                                 case 4:
5497                                         if (ch) {
5498                                                 Dynarr_add(dst, ch & charmask);
5499                                                 Dynarr_add(dst, c & charmask);
5500                                                 ch = 0;
5501                                         } else {
5502                                                 ch = c;
5503                                                 char_boundary = 0;
5504                                         }
5505                                         break;
5506                                 default:
5507                                         abort();
5508                                 }
5509                         }
5510                 }
5511         }
5512
5513 #ifdef ENABLE_COMPOSITE_CHARS
5514         if (in_composite) {
5515                 n = saved_n;
5516                 src = saved_src;
5517                 in_composite = 0;
5518                 Dynarr_add(dst, ISO_CODE_ESC);
5519                 Dynarr_add(dst, '1');   /* end composing */
5520                 goto back_to_square_n;  /* Wheeeeeeeee ..... */
5521         }
5522 #endif                          /* ENABLE_COMPOSITE_CHARS */
5523
5524         if (char_boundary && flags & CODING_STATE_END) {
5525                 restore_left_to_right_direction(codesys, dst, &flags, 0);
5526                 ensure_normal_shift(str, dst);
5527                 for (i = 0; i < 4; i++) {
5528                         Lisp_Object initial_charset =
5529                             CODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, i);
5530                         iso2022_designate(initial_charset, i, str, dst);
5531                 }
5532         }
5533
5534         str->flags = flags;
5535         str->ch = ch;
5536         str->iso2022.current_char_boundary = char_boundary;
5537         str->iso2022.current_charset = charset;
5538         str->iso2022.current_half = half;
5539
5540         /* Verbum caro factum est! */
5541 }
5542 #endif                          /* MULE */
5543 \f
5544 /************************************************************************/
5545 /*                     No-conversion methods                            */
5546 /************************************************************************/
5547
5548 /* This is used when reading in "binary" files -- i.e. files that may
5549    contain all 256 possible byte values and that are not to be
5550    interpreted as being in any particular decoding. */
5551 static void
5552 decode_coding_no_conversion(lstream_t decoding, const Extbyte * src,
5553                             unsigned_char_dynarr * dst, Lstream_data_count n)
5554 {
5555         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
5556         unsigned int flags = str->flags;
5557         unsigned int ch = str->ch;
5558         eol_type_t eol_type = str->eol_type;
5559
5560         while (n--) {
5561                 const unsigned char c = *(const unsigned char *)src++;
5562
5563                 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5564                 DECODE_ADD_BINARY_CHAR(c, dst);
5565         label_continue_loop:;
5566         }
5567
5568         DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
5569
5570         str->flags = flags;
5571         str->ch = ch;
5572 }
5573
5574 static void
5575 encode_coding_no_conversion(lstream_t encoding, const Bufbyte * src,
5576                             unsigned_char_dynarr * dst, Lstream_data_count n)
5577 {
5578         unsigned char c;
5579         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
5580         unsigned int flags = str->flags;
5581         unsigned int ch = str->ch;
5582         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
5583
5584         while (n--) {
5585                 c = *src++;
5586                 if (c == '\n') {
5587                         if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5588                                 Dynarr_add(dst, '\r');
5589                         if (eol_type != EOL_CR)
5590                                 Dynarr_add(dst, '\n');
5591                         ch = 0;
5592                 } else if (BYTE_ASCII_P(c)) {
5593                         assert(ch == 0);
5594                         Dynarr_add(dst, c);
5595                 } else if (BUFBYTE_LEADING_BYTE_P(c)) {
5596                         assert(ch == 0);
5597                         if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5598                             c == LEADING_BYTE_CONTROL_1)
5599                                 ch = c;
5600                         else
5601                                 Dynarr_add(dst, '~');   /* untranslatable character */
5602                 } else {
5603                         if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5604                                 Dynarr_add(dst, c);
5605                         else if (ch == LEADING_BYTE_CONTROL_1) {
5606                                 assert(c < 0xC0);
5607                                 Dynarr_add(dst, c - 0x20);
5608                         }
5609                         /* else it should be the second or third byte of an
5610                            untranslatable character, so ignore it */
5611                         ch = 0;
5612                 }
5613         }
5614
5615         str->flags = flags;
5616         str->ch = ch;
5617 }
5618 \f
5619 /************************************************************************/
5620 /*                             Initialization                           */
5621 /************************************************************************/
5622
5623 void syms_of_file_coding(void)
5624 {
5625         INIT_LRECORD_IMPLEMENTATION(coding_system);
5626
5627         DEFERROR_STANDARD(Qcoding_system_error, Qio_error);
5628
5629         DEFSUBR(Fcoding_system_p);
5630         DEFSUBR(Ffind_coding_system);
5631         DEFSUBR(Fget_coding_system);
5632         DEFSUBR(Fcoding_system_list);
5633         DEFSUBR(Fcoding_system_name);
5634         DEFSUBR(Fmake_coding_system);
5635         DEFSUBR(Fcopy_coding_system);
5636         DEFSUBR(Fcoding_system_canonical_name_p);
5637         DEFSUBR(Fcoding_system_alias_p);
5638         DEFSUBR(Fcoding_system_aliasee);
5639         DEFSUBR(Fdefine_coding_system_alias);
5640         DEFSUBR(Fsubsidiary_coding_system);
5641
5642         DEFSUBR(Fcoding_system_type);
5643         DEFSUBR(Fcoding_system_doc_string);
5644 #ifdef MULE
5645         DEFSUBR(Fcoding_system_charset);
5646 #endif
5647         DEFSUBR(Fcoding_system_property);
5648
5649         DEFSUBR(Fcoding_category_list);
5650         DEFSUBR(Fset_coding_priority_list);
5651         DEFSUBR(Fcoding_priority_list);
5652         DEFSUBR(Fset_coding_category_system);
5653         DEFSUBR(Fcoding_category_system);
5654
5655         DEFSUBR(Fdetect_coding_region);
5656         DEFSUBR(Fdecode_coding_region);
5657         DEFSUBR(Fencode_coding_region);
5658 #ifdef MULE
5659         DEFSUBR(Fdecode_shift_jis_char);
5660         DEFSUBR(Fencode_shift_jis_char);
5661         DEFSUBR(Fdecode_big5_char);
5662         DEFSUBR(Fencode_big5_char);
5663         DEFSUBR(Fset_ucs_char);
5664         DEFSUBR(Fucs_char);
5665         DEFSUBR(Fset_char_ucs);
5666         DEFSUBR(Fchar_ucs);
5667 #endif                          /* MULE */
5668         defsymbol(&Qcoding_systemp, "coding-system-p");
5669         defsymbol(&Qno_conversion, "no-conversion");
5670         defsymbol(&Qraw_text, "raw-text");
5671 #ifdef MULE
5672         defsymbol(&Qbig5, "big5");
5673         defsymbol(&Qshift_jis, "shift-jis");
5674         defsymbol(&Qucs4, "ucs-4");
5675         defsymbol(&Qutf8, "utf-8");
5676         defsymbol(&Qccl, "ccl");
5677         defsymbol(&Qiso2022, "iso2022");
5678 #endif                          /* MULE */
5679         defsymbol(&Qmnemonic, "mnemonic");
5680         defsymbol(&Qeol_type, "eol-type");
5681         defsymbol(&Qpost_read_conversion, "post-read-conversion");
5682         defsymbol(&Qpre_write_conversion, "pre-write-conversion");
5683
5684         defsymbol(&Qcr, "cr");
5685         defsymbol(&Qlf, "lf");
5686         defsymbol(&Qcrlf, "crlf");
5687         defsymbol(&Qeol_cr, "eol-cr");
5688         defsymbol(&Qeol_lf, "eol-lf");
5689         defsymbol(&Qeol_crlf, "eol-crlf");
5690 #ifdef MULE
5691         defsymbol(&Qcharset_g0, "charset-g0");
5692         defsymbol(&Qcharset_g1, "charset-g1");
5693         defsymbol(&Qcharset_g2, "charset-g2");
5694         defsymbol(&Qcharset_g3, "charset-g3");
5695         defsymbol(&Qforce_g0_on_output, "force-g0-on-output");
5696         defsymbol(&Qforce_g1_on_output, "force-g1-on-output");
5697         defsymbol(&Qforce_g2_on_output, "force-g2-on-output");
5698         defsymbol(&Qforce_g3_on_output, "force-g3-on-output");
5699         defsymbol(&Qno_iso6429, "no-iso6429");
5700         defsymbol(&Qinput_charset_conversion, "input-charset-conversion");
5701         defsymbol(&Qoutput_charset_conversion, "output-charset-conversion");
5702
5703         defsymbol(&Qshort, "short");
5704         defsymbol(&Qno_ascii_eol, "no-ascii-eol");
5705         defsymbol(&Qno_ascii_cntl, "no-ascii-cntl");
5706         defsymbol(&Qseven, "seven");
5707         defsymbol(&Qlock_shift, "lock-shift");
5708         defsymbol(&Qescape_quoted, "escape-quoted");
5709 #endif                          /* MULE */
5710         defsymbol(&Qencode, "encode");
5711         defsymbol(&Qdecode, "decode");
5712
5713 #ifdef MULE
5714         defsymbol(&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5715                   "shift-jis");
5716         defsymbol(&coding_category_symbol[CODING_CATEGORY_BIG5], "big5");
5717         defsymbol(&coding_category_symbol[CODING_CATEGORY_UCS4], "ucs-4");
5718         defsymbol(&coding_category_symbol[CODING_CATEGORY_UTF8], "utf-8");
5719         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_7], "iso-7");
5720         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5721                   "iso-8-designate");
5722         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_1], "iso-8-1");
5723         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_2], "iso-8-2");
5724         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5725                   "iso-lock-shift");
5726 #endif                          /* MULE */
5727         defsymbol(&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5728                   "no-conversion");
5729 }
5730
5731 void lstream_type_create_file_coding(void)
5732 {
5733         LSTREAM_HAS_METHOD(decoding, reader);
5734         LSTREAM_HAS_METHOD(decoding, writer);
5735         LSTREAM_HAS_METHOD(decoding, rewinder);
5736         LSTREAM_HAS_METHOD(decoding, seekable_p);
5737         LSTREAM_HAS_METHOD(decoding, flusher);
5738         LSTREAM_HAS_METHOD(decoding, closer);
5739         LSTREAM_HAS_METHOD(decoding, marker);
5740
5741         LSTREAM_HAS_METHOD(encoding, reader);
5742         LSTREAM_HAS_METHOD(encoding, writer);
5743         LSTREAM_HAS_METHOD(encoding, rewinder);
5744         LSTREAM_HAS_METHOD(encoding, seekable_p);
5745         LSTREAM_HAS_METHOD(encoding, flusher);
5746         LSTREAM_HAS_METHOD(encoding, closer);
5747         LSTREAM_HAS_METHOD(encoding, marker);
5748 }
5749
5750 void vars_of_file_coding(void)
5751 {
5752         int i;
5753
5754         fcd = xnew(struct file_coding_dump);
5755         dump_add_root_struct_ptr(&fcd, &fcd_description);
5756
5757         /* Initialize to something reasonable ... */
5758         for (i = 0; i < CODING_CATEGORY_LAST; i++) {
5759                 fcd->coding_category_system[i] = Qnil;
5760                 fcd->coding_category_by_priority[i] = i;
5761         }
5762
5763         Fprovide(intern("file-coding"));
5764
5765         DEFVAR_LISP("keyboard-coding-system", &Vkeyboard_coding_system  /*
5766 Coding system used for TTY keyboard input.
5767 Not used under a windowing system.
5768                                                                          */ );
5769         Vkeyboard_coding_system = Qnil;
5770
5771         DEFVAR_LISP("terminal-coding-system", &Vterminal_coding_system  /*
5772 Coding system used for TTY display output.
5773 Not used under a windowing system.
5774                                                                          */ );
5775         Vterminal_coding_system = Qnil;
5776
5777         DEFVAR_LISP("coding-system-for-read", &Vcoding_system_for_read  /*
5778 Overriding coding system used when reading from a file or process.
5779 You should bind this variable with `let', but do not set it globally.
5780 If this is non-nil, it specifies the coding system that will be used
5781 to decode input on read operations, such as from a file or process.
5782 It overrides `buffer-file-coding-system-for-read',
5783 `insert-file-contents-pre-hook', etc.  Use those variables instead of
5784 this one for permanent changes to the environment.  */ );
5785         Vcoding_system_for_read = Qnil;
5786
5787         DEFVAR_LISP("coding-system-for-write", &Vcoding_system_for_write        /*
5788 Overriding coding system used when writing to a file or process.
5789 You should bind this variable with `let', but do not set it globally.
5790 If this is non-nil, it specifies the coding system that will be used
5791 to encode output for write operations, such as to a file or process.
5792 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
5793 Use those variables instead of this one for permanent changes to the
5794 environment.  */ );
5795         Vcoding_system_for_write = Qnil;
5796
5797         DEFVAR_LISP("file-name-coding-system", &Vfile_name_coding_system        /*
5798 Coding system used to convert pathnames when accessing files.
5799                                                                                  */ );
5800         Vfile_name_coding_system = Qnil;
5801
5802         DEFVAR_BOOL("enable-multibyte-characters", &enable_multibyte_characters /*
5803 Non-nil means the buffer contents are regarded as multi-byte form
5804 of characters, not a binary code.  This affects the display, file I/O,
5805 and behaviors of various editing commands.
5806
5807 Setting this to nil does not do anything.
5808                                                                                  */ );
5809         enable_multibyte_characters = 1;
5810 }
5811
5812 void complex_vars_of_file_coding(void)
5813 {
5814         staticpro(&Vcoding_system_hash_table);
5815         Vcoding_system_hash_table =
5816             make_lisp_hash_table(50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5817
5818         the_codesys_prop_dynarr = Dynarr_new(codesys_prop);
5819         dump_add_root_struct_ptr(&the_codesys_prop_dynarr,
5820                                  &codesys_prop_dynarr_description);
5821
5822 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
5823 {                                               \
5824   struct codesys_prop csp;                      \
5825   csp.sym = (Sym);                              \
5826   csp.prop_type = (Prop_Type);                  \
5827   Dynarr_add (the_codesys_prop_dynarr, csp);    \
5828 } while (0)
5829
5830         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qmnemonic);
5831         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_type);
5832         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_cr);
5833         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_crlf);
5834         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_lf);
5835         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5836         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5837 #ifdef MULE
5838         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g0);
5839         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g1);
5840         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g2);
5841         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g3);
5842         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5843         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5844         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5845         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5846         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qshort);
5847         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_ascii_eol);
5848         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5849         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qseven);
5850         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qlock_shift);
5851         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_iso6429);
5852         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qescape_quoted);
5853         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5854         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5855
5856         DEFINE_CODESYS_PROP(CODESYS_PROP_CCL, Qencode);
5857         DEFINE_CODESYS_PROP(CODESYS_PROP_CCL, Qdecode);
5858 #endif                          /* MULE */
5859         /* Need to create this here or we're really screwed. */
5860         Fmake_coding_system
5861             (Qraw_text, Qno_conversion,
5862              build_string
5863              ("Raw text, which means it converts only line-break-codes."),
5864              list2(Qmnemonic, build_string("Raw")));
5865
5866         Fmake_coding_system
5867             (Qbinary, Qno_conversion,
5868              build_string("Binary, which means it does not convert anything."),
5869              list4(Qeol_type, Qlf, Qmnemonic, build_string("Binary")));
5870
5871         Fdefine_coding_system_alias(Qno_conversion, Qraw_text);
5872
5873         Fdefine_coding_system_alias(Qfile_name, Qbinary);
5874
5875         Fdefine_coding_system_alias(Qterminal, Qbinary);
5876         Fdefine_coding_system_alias(Qkeyboard, Qbinary);
5877
5878         /* Need this for bootstrapping */
5879         fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5880             Fget_coding_system(Qraw_text);
5881
5882 #ifdef MULE
5883         {
5884                 size_t i;
5885
5886                 for (i = 0; i < countof(fcd->ucs_to_mule_table); i++)
5887                         fcd->ucs_to_mule_table[i] = Qnil;
5888         }
5889         staticpro(&mule_to_ucs_table);
5890         mule_to_ucs_table = Fmake_char_table(Qgeneric);
5891 #endif                          /* MULE */
5892 }