5893ed3f9210c224b4ed938b9b5131fcdcf74184
[sxemacs] / src / mule / file-coding.c
1 /* Code conversion functions.
2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3    Copyright (C) 1995 Sun Microsystems, Inc.
4
5 This file is part of SXEmacs
6
7 SXEmacs is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
11
12 SXEmacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program.  If not, see <http://www.gnu.org/licenses/>. */
19
20
21 /* Synched up with: Mule 2.3.   Not in FSF. */
22
23 /* Rewritten by Ben Wing <ben@xemacs.org>. */
24
25 #include <config.h>
26 #include "lisp.h"
27
28 #include "buffer.h"
29 #include "elhash.h"
30 #include "ui/insdel.h"
31 #include "lstream.h"
32 #include "opaque.h"
33 #ifdef MULE
34 #include "mule-ccl.h"
35 #include "chartab.h"
36 #endif
37 #include "file-coding.h"
38
39 Lisp_Object Qcoding_system_error;
40
41 Lisp_Object Vkeyboard_coding_system;
42 Lisp_Object Vterminal_coding_system;
43 Lisp_Object Vcoding_system_for_read;
44 Lisp_Object Vcoding_system_for_write;
45 Lisp_Object Vfile_name_coding_system;
46
47 /* Table of symbols identifying each coding category. */
48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
49
50 struct file_coding_dump {
51         /* Coding system currently associated with each coding category. */
52         Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
53
54         /* Table of all coding categories in decreasing order of priority.
55            This describes a permutation of the possible coding categories. */
56         int coding_category_by_priority[CODING_CATEGORY_LAST];
57
58 #ifdef MULE
59         Lisp_Object ucs_to_mule_table[65536];
60 #endif
61 } *fcd;
62
63 static const struct lrecord_description fcd_description_1[] = {
64         {XD_LISP_OBJECT_ARRAY,
65          offsetof(struct file_coding_dump, coding_category_system),
66          CODING_CATEGORY_LAST},
67 #ifdef MULE
68         {XD_LISP_OBJECT_ARRAY,
69          offsetof(struct file_coding_dump, ucs_to_mule_table),
70          countof(fcd->ucs_to_mule_table)},
71 #endif
72         {XD_END}
73 };
74
75 static const struct struct_description fcd_description = {
76         sizeof(struct file_coding_dump),
77         fcd_description_1
78 };
79
80 Lisp_Object mule_to_ucs_table;
81
82 Lisp_Object Qcoding_systemp;
83
84 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
85 /* Qinternal in general.c */
86
87 Lisp_Object Qmnemonic, Qeol_type;
88 Lisp_Object Qcr, Qcrlf, Qlf;
89 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
90 Lisp_Object Qpost_read_conversion;
91 Lisp_Object Qpre_write_conversion;
92
93 #ifdef MULE
94 Lisp_Object Qucs4, Qutf8;
95 Lisp_Object Qbig5, Qshift_jis;
96 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
97 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
98 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
99 Lisp_Object Qno_iso6429;
100 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
101 Lisp_Object Qescape_quoted;
102 Lisp_Object Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
103 #endif
104 Lisp_Object Qencode, Qdecode;
105
106 Lisp_Object Vcoding_system_hash_table;
107
108 int enable_multibyte_characters;
109
110 #ifdef MULE
111 /* Additional information used by the ISO2022 decoder and detector. */
112 struct iso2022_decoder {
113         /* CHARSET holds the character sets currently assigned to the G0
114            through G3 variables.  It is initialized from the array
115            INITIAL_CHARSET in CODESYS. */
116         Lisp_Object charset[4];
117
118         /* Which registers are currently invoked into the left (GL) and
119            right (GR) halves of the 8-bit encoding space? */
120         int register_left, register_right;
121
122         /* ISO_ESC holds a value indicating part of an escape sequence
123            that has already been seen. */
124         enum iso_esc_flag esc;
125
126         /* This records the bytes we've seen so far in an escape sequence,
127            in case the sequence is invalid (we spit out the bytes unchanged). */
128         unsigned char esc_bytes[8];
129
130         /* Index for next byte to store in ISO escape sequence. */
131         int esc_bytes_index;
132
133 #ifdef ENABLE_COMPOSITE_CHARS
134         /* Stuff seen so far when composing a string. */
135         unsigned_char_dynarr *composite_chars;
136 #endif
137
138         /* If we saw an invalid designation sequence for a particular
139            register, we flag it here and switch to ASCII.  The next time we
140            see a valid designation for this register, we turn off the flag
141            and do the designation normally, but pretend the sequence was
142            invalid.  The effect of all this is that (most of the time) the
143            escape sequences for both the switch to the unknown charset, and
144            the switch back to the known charset, get inserted literally into
145            the buffer and saved out as such.  The hope is that we can
146            preserve the escape sequences so that the resulting written out
147            file makes sense.  If we don't do any of this, the designation
148            to the invalid charset will be preserved but that switch back
149            to the known charset will probably get eaten because it was
150            the same charset that was already present in the register. */
151         unsigned char invalid_designated[4];
152
153         /* We try to do similar things as above for direction-switching
154            sequences.  If we encountered a direction switch while an
155            invalid designation was present, or an invalid designation
156            just after a direction switch (i.e. no valid designation
157            encountered yet), we insert the direction-switch escape
158            sequence literally into the output stream, and later on
159            insert the corresponding direction-restoring escape sequence
160            literally also. */
161         unsigned int switched_dir_and_no_valid_charset_yet:1;
162         unsigned int invalid_switch_dir:1;
163
164         /* Tells the decoder to output the escape sequence literally
165            even though it was valid.  Used in the games we play to
166            avoid lossage when we encounter invalid designations. */
167         unsigned int output_literally:1;
168         /* We encountered a direction switch followed by an invalid
169            designation.  We didn't output the direction switch
170            literally because we didn't know about the invalid designation;
171            but we have to do so now. */
172         unsigned int output_direction_sequence:1;
173 };
174 #endif                          /* MULE */
175 EXFUN(Fcopy_coding_system, 2);
176 #ifdef MULE
177 struct detection_state;
178 static int detect_coding_sjis(struct detection_state *st,
179                               const Extbyte * src, Lstream_data_count n);
180 static void decode_coding_sjis(lstream_t decoding, const Extbyte * src,
181                                unsigned_char_dynarr * dst,
182                                Lstream_data_count n);
183 static void encode_coding_sjis(lstream_t encoding, const Bufbyte * src,
184                                unsigned_char_dynarr * dst,
185                                Lstream_data_count n);
186 static int detect_coding_big5(struct detection_state *st, const Extbyte * src,
187                               Lstream_data_count n);
188 static void decode_coding_big5(lstream_t decoding, const Extbyte * src,
189                                unsigned_char_dynarr * dst,
190                                Lstream_data_count n);
191 static void encode_coding_big5(lstream_t encoding, const Bufbyte * src,
192                                unsigned_char_dynarr * dst,
193                                Lstream_data_count n);
194 static int detect_coding_ucs4(struct detection_state *st, const Extbyte * src,
195                               Lstream_data_count n);
196 static void decode_coding_ucs4(lstream_t decoding, const Extbyte * src,
197                                unsigned_char_dynarr * dst,
198                                Lstream_data_count n);
199 static void encode_coding_ucs4(lstream_t encoding, const Bufbyte * src,
200                                unsigned_char_dynarr * dst,
201                                Lstream_data_count n);
202 static int detect_coding_utf8(struct detection_state *st, const Extbyte * src,
203                               Lstream_data_count n);
204 static void decode_coding_utf8(lstream_t decoding, const Extbyte * src,
205                                unsigned_char_dynarr * dst,
206                                Lstream_data_count n);
207 static void encode_coding_utf8(lstream_t encoding, const Bufbyte * src,
208                                unsigned_char_dynarr * dst,
209                                Lstream_data_count n);
210 static int postprocess_iso2022_mask(int mask);
211 static void reset_iso2022(Lisp_Object coding_system,
212                           struct iso2022_decoder *iso);
213 static int detect_coding_iso2022(struct detection_state *st,
214                                  const Extbyte * src, Lstream_data_count n);
215 static void decode_coding_iso2022(lstream_t decoding, const Extbyte * src,
216                                   unsigned_char_dynarr * dst,
217                                   Lstream_data_count n);
218 static void encode_coding_iso2022(lstream_t encoding, const Bufbyte * src,
219                                   unsigned_char_dynarr * dst,
220                                   Lstream_data_count n);
221 #endif                          /* MULE */
222 static void decode_coding_no_conversion(lstream_t decoding, const Extbyte * src,
223                                         unsigned_char_dynarr * dst,
224                                         Lstream_data_count n);
225 static void encode_coding_no_conversion(lstream_t encoding, const Bufbyte * src,
226                                         unsigned_char_dynarr * dst,
227                                         Lstream_data_count n);
228 static void mule_decode(lstream_t decoding, const Extbyte * src,
229                         unsigned_char_dynarr * dst, Lstream_data_count n);
230 static void mule_encode(lstream_t encoding, const Bufbyte * src,
231                         unsigned_char_dynarr * dst, Lstream_data_count n);
232
233 typedef struct codesys_prop codesys_prop;
234 struct codesys_prop {
235         Lisp_Object sym;
236         int prop_type;
237 };
238
239 typedef struct {
240         Dynarr_declare(codesys_prop);
241 } codesys_prop_dynarr;
242
243 static const struct lrecord_description codesys_prop_description_1[] = {
244         {XD_LISP_OBJECT, offsetof(codesys_prop, sym)},
245         {XD_END}
246 };
247
248 static const struct struct_description codesys_prop_description = {
249         sizeof(codesys_prop),
250         codesys_prop_description_1
251 };
252
253 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
254         XD_DYNARR_DESC(codesys_prop_dynarr, &codesys_prop_description),
255         {XD_END}
256 };
257
258 static const struct struct_description codesys_prop_dynarr_description = {
259         sizeof(codesys_prop_dynarr),
260         codesys_prop_dynarr_description_1
261 };
262
263 codesys_prop_dynarr *the_codesys_prop_dynarr;
264
265 enum codesys_prop_enum {
266         CODESYS_PROP_ALL_OK,
267         CODESYS_PROP_ISO2022,
268         CODESYS_PROP_CCL
269 };
270 \f
271 /************************************************************************/
272 /*                       Coding system functions                        */
273 /************************************************************************/
274
275 static Lisp_Object mark_coding_system(Lisp_Object);
276 static void print_coding_system(Lisp_Object, Lisp_Object, int);
277 static void finalize_coding_system(void *header, int for_disksave);
278
279 #ifdef MULE
280 static const struct lrecord_description ccs_description_1[] = {
281         {XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset)},
282         {XD_LISP_OBJECT, offsetof(charset_conversion_spec, to_charset)},
283         {XD_END}
284 };
285
286 static const struct struct_description ccs_description = {
287         sizeof(charset_conversion_spec),
288         ccs_description_1
289 };
290
291 static const struct lrecord_description ccsd_description_1[] = {
292         XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
293         {XD_END}
294 };
295
296 static const struct struct_description ccsd_description = {
297         sizeof(charset_conversion_spec_dynarr),
298         ccsd_description_1
299 };
300 #endif
301
302 static const struct lrecord_description coding_system_description[] = {
303         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, name)},
304         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, doc_string)},
305         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, mnemonic)},
306         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, post_read_conversion)},
307         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, pre_write_conversion)},
308         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_lf)},
309         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_crlf)},
310         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_cr)},
311 #ifdef MULE
312         {XD_LISP_OBJECT_ARRAY,
313          offsetof(Lisp_Coding_System, iso2022.initial_charset), 4},
314         {XD_STRUCT_PTR, offsetof(Lisp_Coding_System, iso2022.input_conv), 1,
315          &ccsd_description},
316         {XD_STRUCT_PTR, offsetof(Lisp_Coding_System, iso2022.output_conv), 1,
317          &ccsd_description},
318         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, ccl.decode)},
319         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, ccl.encode)},
320 #endif
321         {XD_END}
322 };
323
324 DEFINE_LRECORD_IMPLEMENTATION("coding-system", coding_system,
325                               mark_coding_system, print_coding_system,
326                               finalize_coding_system,
327                               0, 0, coding_system_description,
328                               Lisp_Coding_System);
329
330 static Lisp_Object mark_coding_system(Lisp_Object obj)
331 {
332         Lisp_Coding_System *codesys = XCODING_SYSTEM(obj);
333
334         mark_object(CODING_SYSTEM_NAME(codesys));
335         mark_object(CODING_SYSTEM_DOC_STRING(codesys));
336         mark_object(CODING_SYSTEM_MNEMONIC(codesys));
337         mark_object(CODING_SYSTEM_EOL_LF(codesys));
338         mark_object(CODING_SYSTEM_EOL_CRLF(codesys));
339         mark_object(CODING_SYSTEM_EOL_CR(codesys));
340
341         switch (CODING_SYSTEM_TYPE(codesys)) {
342 #ifdef MULE
343                 int i;
344         case CODESYS_ISO2022:
345                 for (i = 0; i < 4; i++)
346                         mark_object(CODING_SYSTEM_ISO2022_INITIAL_CHARSET
347                                     (codesys, i));
348                 if (codesys->iso2022.input_conv) {
349                         for (i = 0;
350                              i < Dynarr_length(codesys->iso2022.input_conv);
351                              i++) {
352                                 struct charset_conversion_spec *ccs =
353                                     Dynarr_atp(codesys->iso2022.input_conv, i);
354                                 mark_object(ccs->from_charset);
355                                 mark_object(ccs->to_charset);
356                         }
357                 }
358                 if (codesys->iso2022.output_conv) {
359                         for (i = 0;
360                              i < Dynarr_length(codesys->iso2022.output_conv);
361                              i++) {
362                                 struct charset_conversion_spec *ccs =
363                                     Dynarr_atp(codesys->iso2022.output_conv, i);
364                                 mark_object(ccs->from_charset);
365                                 mark_object(ccs->to_charset);
366                         }
367                 }
368                 break;
369
370         case CODESYS_CCL:
371                 mark_object(CODING_SYSTEM_CCL_DECODE(codesys));
372                 mark_object(CODING_SYSTEM_CCL_ENCODE(codesys));
373                 break;
374
375                 /* list the rest of them lot explicitly */
376         case CODESYS_AUTODETECT:
377         case CODESYS_SHIFT_JIS:
378         case CODESYS_BIG5:
379         case CODESYS_UCS4:
380         case CODESYS_UTF8:
381         case CODESYS_NO_CONVERSION:
382 #ifdef DEBUG_SXEMACS
383         case CODESYS_INTERNAL:
384 #endif
385 #endif                          /* MULE */
386         default:
387                 break;
388         }
389
390         mark_object(CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys));
391         return CODING_SYSTEM_POST_READ_CONVERSION(codesys);
392 }
393
394 static void
395 print_coding_system(Lisp_Object obj, Lisp_Object printcharfun, int escapeflag)
396 {
397         Lisp_Coding_System *c = XCODING_SYSTEM(obj);
398         if (print_readably)
399                 error("printing unreadable object #<coding-system 0x%x>",
400                       c->header.uid);
401
402         write_c_string("#<coding-system ", printcharfun);
403         print_internal(c->name, printcharfun, 1);
404         write_c_string(">", printcharfun);
405 }
406
407 static void finalize_coding_system(void *header, int for_disksave)
408 {
409         Lisp_Coding_System *c = (Lisp_Coding_System *) header;
410         /* Since coding systems never go away, this function is not
411            necessary.  But it would be necessary if we changed things
412            so that coding systems could go away. */
413         if (!for_disksave) {    /* see comment in lstream.c */
414                 switch (CODING_SYSTEM_TYPE(c)) {
415 #ifdef MULE
416                 case CODESYS_ISO2022:
417                         if (c->iso2022.input_conv) {
418                                 Dynarr_free(c->iso2022.input_conv);
419                                 c->iso2022.input_conv = 0;
420                         }
421                         if (c->iso2022.output_conv) {
422                                 Dynarr_free(c->iso2022.output_conv);
423                                 c->iso2022.output_conv = 0;
424                         }
425                         break;
426
427                         /* list the rest of them lot explicitly */
428                 case CODESYS_AUTODETECT:
429                 case CODESYS_SHIFT_JIS:
430                 case CODESYS_BIG5:
431                 case CODESYS_UCS4:
432                 case CODESYS_UTF8:
433                 case CODESYS_CCL:
434                 case CODESYS_NO_CONVERSION:
435 #ifdef DEBUG_SXEMACS
436                 case CODESYS_INTERNAL:
437 #endif
438 #endif                          /* MULE */
439                 default:
440                         break;
441                 }
442         }
443 }
444
445 static eol_type_t symbol_to_eol_type(Lisp_Object symbol)
446 {
447         CHECK_SYMBOL(symbol);
448         if (NILP(symbol))
449                 return EOL_AUTODETECT;
450         if (EQ(symbol, Qlf))
451                 return EOL_LF;
452         if (EQ(symbol, Qcrlf))
453                 return EOL_CRLF;
454         if (EQ(symbol, Qcr))
455                 return EOL_CR;
456
457         signal_simple_error("Unrecognized eol type", symbol);
458         return EOL_AUTODETECT;  /* not reached */
459 }
460
461 static Lisp_Object eol_type_to_symbol(eol_type_t type)
462 {
463         switch (type) {
464         default:
465                 abort();
466                 break;
467         case EOL_LF:
468                 return Qlf;
469         case EOL_CRLF:
470                 return Qcrlf;
471         case EOL_CR:
472                 return Qcr;
473         case EOL_AUTODETECT:
474                 return Qnil;
475         }
476 }
477
478 static void setup_eol_coding_systems(Lisp_Coding_System * codesys)
479 {
480         Lisp_Object codesys_obj;
481         int len = string_length(XSYMBOL(CODING_SYSTEM_NAME(codesys))->name);
482         char *codesys_name = (char *)alloca(len + 7);
483         int mlen = -1;
484         char *codesys_mnemonic = 0;
485
486         Lisp_Object codesys_name_sym, sub_codesys_obj;
487
488         /* kludge */
489
490         XSETCODING_SYSTEM(codesys_obj, codesys);
491
492         memcpy(codesys_name,
493                string_data(XSYMBOL(CODING_SYSTEM_NAME(codesys))->name), len);
494
495         if (STRINGP(CODING_SYSTEM_MNEMONIC(codesys))) {
496                 mlen = XSTRING_LENGTH(CODING_SYSTEM_MNEMONIC(codesys));
497                 codesys_mnemonic = (char *)alloca(mlen + 7);
498                 memcpy(codesys_mnemonic,
499                        XSTRING_DATA(CODING_SYSTEM_MNEMONIC(codesys)), mlen);
500         }
501 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
502   strcpy (codesys_name + len, "-" op_sys);                                      \
503   if (mlen != -1)                                                               \
504     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
505   codesys_name_sym = intern (codesys_name);                                     \
506   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
507   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
508   if (mlen != -1)                                                               \
509     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
510       build_string (codesys_mnemonic);                                          \
511   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
512 } while (0)
513
514         DEFINE_SUB_CODESYS("unix", "", EOL_LF);
515         DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
516         DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
517 }
518
519 DEFUN("coding-system-p", Fcoding_system_p, 1, 1, 0,     /*
520 Return t if OBJECT is a coding system.
521 A coding system is an object that defines how text containing multiple
522 character sets is encoded into a stream of (typically 8-bit) bytes.
523 The coding system is used to decode the stream into a series of
524 characters (which may be from multiple charsets) when the text is read
525 from a file or process, and is used to encode the text back into the
526 same format when it is written out to a file or process.
527
528 For example, many ISO2022-compliant coding systems (such as Compound
529 Text, which is used for inter-client data under the X Window System)
530 use escape sequences to switch between different charsets -- Japanese
531 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
532 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
533 `make-coding-system' for more information.
534
535 Coding systems are normally identified using a symbol, and the
536 symbol is accepted in place of the actual coding system object whenever
537 a coding system is called for. (This is similar to how faces work.)
538 */
539       (object))
540 {
541         return CODING_SYSTEMP(object) ? Qt : Qnil;
542 }
543
544 DEFUN("find-coding-system", Ffind_coding_system, 1, 1, 0,       /*
545 Retrieve the coding system of the given name.
546
547 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
548 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
549 If there is no such coding system, nil is returned.  Otherwise the
550 associated coding system object is returned.
551 */
552       (coding_system_or_name))
553 {
554         if (NILP(coding_system_or_name))
555                 coding_system_or_name = Qbinary;
556         else if (CODING_SYSTEMP(coding_system_or_name))
557                 return coding_system_or_name;
558         else
559                 CHECK_SYMBOL(coding_system_or_name);
560
561         while (1) {
562                 coding_system_or_name =
563                     Fgethash(coding_system_or_name, Vcoding_system_hash_table,
564                              Qnil);
565
566                 if (CODING_SYSTEMP(coding_system_or_name)
567                     || NILP(coding_system_or_name))
568                         return coding_system_or_name;
569         }
570 }
571
572 DEFUN("get-coding-system", Fget_coding_system, 1, 1, 0, /*
573 Retrieve the coding system of the given name.
574 Same as `find-coding-system' except that if there is no such
575 coding system, an error is signaled instead of returning nil.
576 */
577       (name))
578 {
579         Lisp_Object coding_system = Ffind_coding_system(name);
580
581         if (NILP(coding_system))
582                 signal_simple_error("No such coding system", name);
583         return coding_system;
584 }
585
586 /* We store the coding systems in hash tables with the names as the key and the
587    actual coding system object as the value.  Occasionally we need to use them
588    in a list format.  These routines provide us with that. */
589 struct coding_system_list_closure {
590         Lisp_Object *coding_system_list;
591 };
592
593 static int
594 add_coding_system_to_list_mapper(Lisp_Object key, Lisp_Object value,
595                                  void *coding_system_list_closure)
596 {
597         /* This function can GC */
598         struct coding_system_list_closure *cscl =
599             (struct coding_system_list_closure *)coding_system_list_closure;
600         Lisp_Object *coding_system_list = cscl->coding_system_list;
601
602         *coding_system_list = Fcons(key, *coding_system_list);
603         return 0;
604 }
605
606 DEFUN("coding-system-list", Fcoding_system_list, 0, 0, 0,       /*
607 Return a list of the names of all defined coding systems.
608 */
609       ())
610 {
611         Lisp_Object coding_system_list = Qnil;
612         struct gcpro gcpro1;
613         struct coding_system_list_closure coding_system_list_closure;
614
615         GCPRO1(coding_system_list);
616         coding_system_list_closure.coding_system_list = &coding_system_list;
617         elisp_maphash(add_coding_system_to_list_mapper,
618                       Vcoding_system_hash_table, &coding_system_list_closure);
619         UNGCPRO;
620
621         return coding_system_list;
622 }
623
624 DEFUN("coding-system-name", Fcoding_system_name, 1, 1, 0,       /*
625 Return the name of the given coding system.
626 */
627       (coding_system))
628 {
629         coding_system = Fget_coding_system(coding_system);
630         return XCODING_SYSTEM_NAME(coding_system);
631 }
632
633 static Lisp_Coding_System *allocate_coding_system(enum coding_system_type type,
634                                                   Lisp_Object name)
635 {
636         Lisp_Coding_System *codesys =
637             alloc_lcrecord_type(Lisp_Coding_System, &lrecord_coding_system);
638
639         zero_lcrecord(codesys);
640         CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys) = Qnil;
641         CODING_SYSTEM_POST_READ_CONVERSION(codesys) = Qnil;
642         CODING_SYSTEM_EOL_TYPE(codesys) = EOL_AUTODETECT;
643         CODING_SYSTEM_EOL_CRLF(codesys) = Qnil;
644         CODING_SYSTEM_EOL_CR(codesys) = Qnil;
645         CODING_SYSTEM_EOL_LF(codesys) = Qnil;
646         CODING_SYSTEM_TYPE(codesys) = type;
647         CODING_SYSTEM_MNEMONIC(codesys) = Qnil;
648 #ifdef MULE
649         if (type == CODESYS_ISO2022) {
650                 int i;
651                 for (i = 0; i < 4; i++)
652                         CODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, i) =
653                             Qnil;
654         } else if (type == CODESYS_CCL) {
655                 CODING_SYSTEM_CCL_DECODE(codesys) = Qnil;
656                 CODING_SYSTEM_CCL_ENCODE(codesys) = Qnil;
657         }
658 #endif                          /* MULE */
659         CODING_SYSTEM_NAME(codesys) = name;
660
661         return codesys;
662 }
663
664 #ifdef MULE
665 /* Given a list of charset conversion specs as specified in a Lisp
666    program, parse it into STORE_HERE. */
667
668 static void
669 parse_charset_conversion_specs(charset_conversion_spec_dynarr * store_here,
670                                Lisp_Object spec_list)
671 {
672         Lisp_Object rest;
673
674         EXTERNAL_LIST_LOOP(rest, spec_list) {
675                 Lisp_Object car = XCAR(rest);
676                 Lisp_Object from, to;
677                 struct charset_conversion_spec spec;
678
679                 if (!CONSP(car) || !CONSP(XCDR(car)) || !NILP(XCDR(XCDR(car))))
680                         signal_simple_error("Invalid charset conversion spec",
681                                             car);
682                 from = Fget_charset(XCAR(car));
683                 to = Fget_charset(XCAR(XCDR(car)));
684                 if (XCHARSET_TYPE(from) != XCHARSET_TYPE(to))
685                         signal_simple_error_2
686                             ("Attempted conversion between different charset types",
687                              from, to);
688                 spec.from_charset = from;
689                 spec.to_charset = to;
690
691                 Dynarr_add(store_here, spec);
692         }
693 }
694
695 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
696    specs, return the equivalent as the Lisp programmer would see it.
697
698    If LOAD_HERE is 0, return Qnil. */
699
700 static Lisp_Object
701 unparse_charset_conversion_specs(charset_conversion_spec_dynarr * load_here)
702 {
703         int i;
704         Lisp_Object result;
705
706         if (!load_here)
707                 return Qnil;
708         for (i = 0, result = Qnil; i < Dynarr_length(load_here); i++) {
709                 struct charset_conversion_spec *ccs = Dynarr_atp(load_here, i);
710                 result =
711                     Fcons(list2(ccs->from_charset, ccs->to_charset), result);
712         }
713
714         return Fnreverse(result);
715 }
716
717 #endif                          /* MULE */
718
719 DEFUN("make-coding-system", Fmake_coding_system, 2, 4, 0,       /*
720 Register symbol NAME as a coding system.
721
722 TYPE describes the conversion method used and should be one of
723
724 nil or 'undecided
725 Automatic conversion.  SXEmacs attempts to detect the coding system
726 used in the file.
727 'no-conversion
728 No conversion.  Use this for binary files and such.  On output,
729 graphic characters that are not in ASCII or Latin-1 will be
730 replaced by a ?. (For a no-conversion-encoded buffer, these
731 characters will only be present if you explicitly insert them.)
732 'shift-jis
733 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
734 'ucs-4
735 ISO 10646 UCS-4 encoding.
736 'utf-8
737 ISO 10646 UTF-8 encoding.
738 'iso2022
739 Any ISO2022-compliant encoding.  Among other things, this includes
740 JIS (the Japanese encoding commonly used for e-mail), EUC (the
741 standard Unix encoding for Japanese and other languages), and
742 Compound Text (the encoding used in X11).  You can specify more
743 specific information about the conversion with the PROPS argument.
744 'big5
745 Big5 (the encoding commonly used for Taiwanese).
746 'ccl
747 The conversion is performed using a user-written pseudo-code
748 program.  CCL (Code Conversion Language) is the name of this
749 pseudo-code.
750 'internal
751 Write out or read in the raw contents of the memory representing
752 the buffer's text.  This is primarily useful for debugging
753 purposes, and is only enabled when SXEmacs has been compiled with
754 DEBUG_SXEMACS defined (via the --debug configure option).
755 WARNING: Reading in a file using 'internal conversion can result
756 in an internal inconsistency in the memory representing a
757 buffer's text, which will produce unpredictable results and may
758 cause SXEmacs to crash.  Under normal circumstances you should
759 never use 'internal conversion.
760
761 DOC-STRING is a string describing the coding system.
762
763 PROPS is a property list, describing the specific nature of the
764 character set.  Recognized properties are:
765
766 'mnemonic
767 String to be displayed in the modeline when this coding system is
768 active.
769
770 'eol-type
771 End-of-line conversion to be used.  It should be one of
772
773 nil
774 Automatically detect the end-of-line type (LF, CRLF,
775 or CR).  Also generate subsidiary coding systems named
776 `NAME-unix', `NAME-dos', and `NAME-mac', that are
777 identical to this coding system but have an EOL-TYPE
778 value of 'lf, 'crlf, and 'cr, respectively.
779 'lf
780 The end of a line is marked externally using ASCII LF.
781 Since this is also the way that SXEmacs represents an
782 end-of-line internally, specifying this option results
783 in no end-of-line conversion.  This is the standard
784 format for Unix text files.
785 'crlf
786 The end of a line is marked externally using ASCII
787 CRLF.  This is the standard format for MS-DOS text
788 files.
789 'cr
790 The end of a line is marked externally using ASCII CR.
791 This is the standard format for Macintosh text files.
792 t
793 Automatically detect the end-of-line type but do not
794 generate subsidiary coding systems.  (This value is
795 converted to nil when stored internally, and
796 `coding-system-property' will return nil.)
797
798 'post-read-conversion
799 Function called after a file has been read in, to perform the
800 decoding.  Called with two arguments, START and END, denoting
801 a region of the current buffer to be decoded.
802
803 'pre-write-conversion
804 Function called before a file is written out, to perform the
805 encoding.  Called with two arguments, START and END, denoting
806 a region of the current buffer to be encoded.
807
808 The following additional properties are recognized if TYPE is 'iso2022:
809
810 'charset-g0
811 'charset-g1
812 'charset-g2
813 'charset-g3
814 The character set initially designated to the G0 - G3 registers.
815 The value should be one of
816
817 -- A charset object (designate that character set)
818 -- nil (do not ever use this register)
819 -- t (no character set is initially designated to
820 the register, but may be later on; this automatically
821 sets the corresponding `force-g*-on-output' property)
822
823 'force-g0-on-output
824 'force-g1-on-output
825 'force-g2-on-output
826 'force-g2-on-output
827 If non-nil, send an explicit designation sequence on output before
828 using the specified register.
829
830 'short
831 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
832 "ESC $ B" on output in place of the full designation sequences
833 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
834
835 'no-ascii-eol
836 If non-nil, don't designate ASCII to G0 at each end of line on output.
837 Setting this to non-nil also suppresses other state-resetting that
838 normally happens at the end of a line.
839
840 'no-ascii-cntl
841 If non-nil, don't designate ASCII to G0 before control chars on output.
842
843 'seven
844 If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
845 environment.
846
847 'lock-shift
848 If non-nil, use locking-shift (SO/SI) instead of single-shift
849 or designation by escape sequence.
850
851 'no-iso6429
852 If non-nil, don't use ISO6429's direction specification.
853
854 'escape-quoted
855 If non-nil, literal control characters that are the same as
856 the beginning of a recognized ISO2022 or ISO6429 escape sequence
857 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
858 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
859 so that they can be properly distinguished from an escape sequence.
860 (Note that doing this results in a non-portable encoding.) This
861 encoding flag is used for byte-compiled files.  Note that ESC
862 is a good choice for a quoting character because there are no
863 escape sequences whose second byte is a character from the Control-0
864 or Control-1 character sets; this is explicitly disallowed by the
865 ISO2022 standard.
866
867 'input-charset-conversion
868 A list of conversion specifications, specifying conversion of
869 characters in one charset to another when decoding is performed.
870 Each specification is a list of two elements: the source charset,
871 and the destination charset.
872
873 'output-charset-conversion
874 A list of conversion specifications, specifying conversion of
875 characters in one charset to another when encoding is performed.
876 The form of each specification is the same as for
877 'input-charset-conversion.
878
879 The following additional properties are recognized (and required)
880 if TYPE is 'ccl:
881
882 'decode
883 CCL program used for decoding (converting to internal format).
884
885 'encode
886 CCL program used for encoding (converting to external format).
887 */
888       (name, type, doc_string, props))
889 {
890         Lisp_Coding_System *codesys;
891         enum coding_system_type ty;
892         int need_to_setup_eol_systems = 1;
893
894         /* Convert type to constant */
895         if (NILP(type) || EQ(type, Qundecided)) {
896                 ty = CODESYS_AUTODETECT;
897         }
898 #ifdef MULE
899         else if (EQ(type, Qshift_jis)) {
900                 ty = CODESYS_SHIFT_JIS;
901         } else if (EQ(type, Qiso2022)) {
902                 ty = CODESYS_ISO2022;
903         } else if (EQ(type, Qbig5)) {
904                 ty = CODESYS_BIG5;
905         } else if (EQ(type, Qucs4)) {
906                 ty = CODESYS_UCS4;
907         } else if (EQ(type, Qutf8)) {
908                 ty = CODESYS_UTF8;
909         } else if (EQ(type, Qccl)) {
910                 ty = CODESYS_CCL;
911         }
912 #endif
913         else if (EQ(type, Qno_conversion)) {
914                 ty = CODESYS_NO_CONVERSION;
915         }
916 #ifdef DEBUG_SXEMACS
917         else if (EQ(type, Qinternal)) {
918                 ty = CODESYS_INTERNAL;
919         }
920 #endif
921         else
922                 signal_simple_error("Invalid coding system type", type);
923
924         CHECK_SYMBOL(name);
925
926         codesys = allocate_coding_system(ty, name);
927
928         if (NILP(doc_string))
929                 doc_string = build_string("");
930         else
931                 CHECK_STRING(doc_string);
932         CODING_SYSTEM_DOC_STRING(codesys) = doc_string;
933
934         {
935                 EXTERNAL_PROPERTY_LIST_LOOP_3(key, value, props) {
936                         if (EQ(key, Qmnemonic)) {
937                                 if (!NILP(value))
938                                         CHECK_STRING(value);
939                                 CODING_SYSTEM_MNEMONIC(codesys) = value;
940                         }
941
942                         else if (EQ(key, Qeol_type)) {
943                                 need_to_setup_eol_systems = NILP(value);
944                                 if (EQ(value, Qt))
945                                         value = Qnil;
946                                 CODING_SYSTEM_EOL_TYPE(codesys) =
947                                     symbol_to_eol_type(value);
948                         }
949
950                         else if (EQ(key, Qpost_read_conversion))
951                                 CODING_SYSTEM_POST_READ_CONVERSION(codesys) =
952                                     value;
953                         else if (EQ(key, Qpre_write_conversion))
954                                 CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys) =
955                                     value;
956 #ifdef MULE
957                         else if (ty == CODESYS_ISO2022) {
958 #define FROB_INITIAL_CHARSET(charset_num) \
959   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
960     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
961
962                                 if (EQ(key, Qcharset_g0))
963                                         FROB_INITIAL_CHARSET(0);
964                                 else if (EQ(key, Qcharset_g1))
965                                         FROB_INITIAL_CHARSET(1);
966                                 else if (EQ(key, Qcharset_g2))
967                                         FROB_INITIAL_CHARSET(2);
968                                 else if (EQ(key, Qcharset_g3))
969                                         FROB_INITIAL_CHARSET(3);
970
971 #define FROB_FORCE_CHARSET(charset_num) \
972   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
973
974                                 else if (EQ(key, Qforce_g0_on_output))
975                                         FROB_FORCE_CHARSET(0);
976                                 else if (EQ(key, Qforce_g1_on_output))
977                                         FROB_FORCE_CHARSET(1);
978                                 else if (EQ(key, Qforce_g2_on_output))
979                                         FROB_FORCE_CHARSET(2);
980                                 else if (EQ(key, Qforce_g3_on_output))
981                                         FROB_FORCE_CHARSET(3);
982
983 #define FROB_BOOLEAN_PROPERTY(prop) \
984   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
985
986                                 else if (EQ(key, Qshort))
987                                         FROB_BOOLEAN_PROPERTY(SHORT);
988                                 else if (EQ(key, Qno_ascii_eol))
989                                         FROB_BOOLEAN_PROPERTY(NO_ASCII_EOL);
990                                 else if (EQ(key, Qno_ascii_cntl))
991                                         FROB_BOOLEAN_PROPERTY(NO_ASCII_CNTL);
992                                 else if (EQ(key, Qseven))
993                                         FROB_BOOLEAN_PROPERTY(SEVEN);
994                                 else if (EQ(key, Qlock_shift))
995                                         FROB_BOOLEAN_PROPERTY(LOCK_SHIFT);
996                                 else if (EQ(key, Qno_iso6429))
997                                         FROB_BOOLEAN_PROPERTY(NO_ISO6429);
998                                 else if (EQ(key, Qescape_quoted))
999                                         FROB_BOOLEAN_PROPERTY(ESCAPE_QUOTED);
1000
1001                                 else if (EQ(key, Qinput_charset_conversion)) {
1002                                         codesys->iso2022.input_conv =
1003                                             Dynarr_new(charset_conversion_spec);
1004                                         parse_charset_conversion_specs(codesys->
1005                                                                        iso2022.
1006                                                                        input_conv,
1007                                                                        value);
1008                                 } else if (EQ(key, Qoutput_charset_conversion)) {
1009                                         codesys->iso2022.output_conv =
1010                                             Dynarr_new(charset_conversion_spec);
1011                                         parse_charset_conversion_specs(codesys->
1012                                                                        iso2022.
1013                                                                        output_conv,
1014                                                                        value);
1015                                 } else
1016                                         signal_simple_error
1017                                             ("Unrecognized property", key);
1018                         } else if (EQ(type, Qccl)) {
1019                                 Lisp_Object sym;
1020                                 struct ccl_program test_ccl;
1021                                 Extbyte *suffix;
1022
1023                                 /* Check key first.  */
1024                                 if (EQ(key, Qdecode))
1025                                         suffix = "-ccl-decode";
1026                                 else if (EQ(key, Qencode))
1027                                         suffix = "-ccl-encode";
1028                                 else
1029                                         signal_simple_error
1030                                             ("Unrecognized property", key);
1031
1032                                 /* If value is vector, register it as a ccl program
1033                                    associated with an newly created symbol for
1034                                    backward compatibility.  */
1035                                 if (VECTORP(value)) {
1036                                         sym =
1037                                             Fintern(concat2
1038                                                     (Fsymbol_name(name),
1039                                                      build_string(suffix)),
1040                                                     Qnil);
1041                                         Fregister_ccl_program(sym, value);
1042                                 } else {
1043                                         CHECK_SYMBOL(value);
1044                                         sym = value;
1045                                 }
1046                                 /* check if the given ccl programs are valid.  */
1047                                 if (setup_ccl_program(&test_ccl, sym) < 0)
1048                                         signal_simple_error
1049                                             ("Invalid CCL program", value);
1050
1051                                 if (EQ(key, Qdecode))
1052                                         CODING_SYSTEM_CCL_DECODE(codesys) = sym;
1053                                 else if (EQ(key, Qencode))
1054                                         CODING_SYSTEM_CCL_ENCODE(codesys) = sym;
1055
1056                         }
1057 #endif                          /* MULE */
1058                         else
1059                                 signal_simple_error("Unrecognized property",
1060                                                     key);
1061                 }
1062         }
1063
1064         if (need_to_setup_eol_systems)
1065                 setup_eol_coding_systems(codesys);
1066
1067         {
1068                 Lisp_Object codesys_obj;
1069                 XSETCODING_SYSTEM(codesys_obj, codesys);
1070                 Fputhash(name, codesys_obj, Vcoding_system_hash_table);
1071                 return codesys_obj;
1072         }
1073 }
1074
1075 DEFUN("copy-coding-system", Fcopy_coding_system, 2, 2, 0,       /*
1076 Copy OLD-CODING-SYSTEM to NEW-NAME.
1077 If NEW-NAME does not name an existing coding system, a new one will
1078 be created.
1079 */
1080       (old_coding_system, new_name))
1081 {
1082         Lisp_Object new_coding_system;
1083         old_coding_system = Fget_coding_system(old_coding_system);
1084         new_coding_system = Ffind_coding_system(new_name);
1085         if (NILP(new_coding_system)) {
1086                 XSETCODING_SYSTEM(new_coding_system,
1087                                   allocate_coding_system
1088                                   (XCODING_SYSTEM_TYPE(old_coding_system),
1089                                    new_name));
1090                 Fputhash(new_name, new_coding_system,
1091                          Vcoding_system_hash_table);
1092         }
1093
1094         {
1095                 Lisp_Coding_System *to = XCODING_SYSTEM(new_coding_system);
1096                 Lisp_Coding_System *from = XCODING_SYSTEM(old_coding_system);
1097                 memcpy(((char *)to) + sizeof(to->header),
1098                        ((char *)from) + sizeof(from->header),
1099                        sizeof(*from) - sizeof(from->header));
1100                 to->name = new_name;
1101         }
1102         return new_coding_system;
1103 }
1104
1105 DEFUN("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0,       /*
1106 Return t if OBJECT names a coding system, and is not a coding system alias.
1107 */
1108       (object))
1109 {
1110         Lisp_Object val = Fgethash(object, Vcoding_system_hash_table, Qnil);
1111         return CODING_SYSTEMP(val) ? Qt : Qnil;
1112 }
1113
1114 DEFUN("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1115  Return t if OBJECT is a coding system alias.
1116 All coding system aliases are created by `define-coding-system-alias'.
1117 */
1118       (object))
1119 {
1120         Lisp_Object val = Fgethash(object, Vcoding_system_hash_table, Qzero);
1121         return SYMBOLP(val) ? Qt : Qnil;
1122 }
1123
1124 DEFUN("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1125 Return the coding-system symbol for which symbol ALIAS is an alias.
1126 */
1127       (alias))
1128 {
1129         Lisp_Object aliasee = Fgethash(alias, Vcoding_system_hash_table, Qnil);
1130         if (SYMBOLP(aliasee)) {
1131                 return aliasee;
1132         } else {
1133                 signal_simple_error(
1134                         "Symbol is not a coding system alias", alias);
1135         }
1136         /* To keep the compiler happy */
1137         return Qnil;
1138 }
1139
1140 static Lisp_Object
1141 append_suffix_to_symbol(Lisp_Object symbol, char *ascii_string)
1142 {
1143         return Fintern(concat2(Fsymbol_name(symbol),
1144                                build_string(ascii_string)), Qnil);
1145 }
1146
1147 /* A maphash function, for removing dangling coding system aliases. */
1148 static int
1149 dangling_coding_system_alias_p(Lisp_Object alias,
1150                                Lisp_Object aliasee, void *dangling_aliases)
1151 {
1152         if (SYMBOLP(aliasee)
1153             && NILP(Fgethash(aliasee, Vcoding_system_hash_table, Qnil))) {
1154                 (*(int *)dangling_aliases)++;
1155                 return 1;
1156         } else {
1157                 return 0;
1158         }
1159 }
1160
1161 DEFUN("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0,       /*
1162 Define symbol ALIAS as an alias for coding system ALIASEE.
1163
1164 You can use this function to redefine an alias that has already been defined,
1165 but you cannot redefine a name which is the canonical name for a coding system.
1166 \(a canonical name of a coding system is what is returned when you call
1167 `coding-system-name' on a coding system).
1168
1169 ALIASEE itself can be an alias, which allows you to define nested aliases.
1170
1171 You are forbidden, however, from creating alias loops or `dangling' aliases.
1172 These will be detected, and an error will be signaled if you attempt to do so.
1173
1174 If ALIASEE is nil, then ALIAS will simply be undefined.
1175
1176 See also `coding-system-alias-p', `coding-system-aliasee',
1177 and `coding-system-canonical-name-p'.
1178 */
1179       (alias, aliasee))
1180 {
1181         Lisp_Object real_coding_system, probe;
1182
1183         CHECK_SYMBOL(alias);
1184
1185         if (!NILP(Fcoding_system_canonical_name_p(alias)))
1186                 signal_simple_error
1187                     ("Symbol is the canonical name of a coding system and cannot be redefined",
1188                      alias);
1189
1190         if (NILP(aliasee)) {
1191                 Lisp_Object subsidiary_unix =
1192                     append_suffix_to_symbol(alias, "-unix");
1193                 Lisp_Object subsidiary_dos =
1194                     append_suffix_to_symbol(alias, "-dos");
1195                 Lisp_Object subsidiary_mac =
1196                     append_suffix_to_symbol(alias, "-mac");
1197
1198                 Fremhash(alias, Vcoding_system_hash_table);
1199
1200                 /* Undefine subsidiary aliases,
1201                    presumably created by a previous call to this function */
1202                 if (!NILP(Fcoding_system_alias_p(subsidiary_unix)) &&
1203                     !NILP(Fcoding_system_alias_p(subsidiary_dos)) &&
1204                     !NILP(Fcoding_system_alias_p(subsidiary_mac))) {
1205                         Fdefine_coding_system_alias(subsidiary_unix, Qnil);
1206                         Fdefine_coding_system_alias(subsidiary_dos, Qnil);
1207                         Fdefine_coding_system_alias(subsidiary_mac, Qnil);
1208                 }
1209
1210                 /* Undefine dangling coding system aliases. */
1211                 {
1212                         int dangling_aliases;
1213
1214                         do {
1215                                 dangling_aliases = 0;
1216                                 elisp_map_remhash
1217                                     (dangling_coding_system_alias_p,
1218                                      Vcoding_system_hash_table,
1219                                      &dangling_aliases);
1220                         } while (dangling_aliases > 0);
1221                 }
1222
1223                 return Qnil;
1224         }
1225
1226         if (CODING_SYSTEMP(aliasee))
1227                 aliasee = XCODING_SYSTEM_NAME(aliasee);
1228
1229         /* Checks that aliasee names a coding-system */
1230         real_coding_system = Fget_coding_system(aliasee);
1231
1232         /* Check for coding system alias loops */
1233         if (EQ(alias, aliasee))
1234               alias_loop:signal_simple_error_2
1235                     ("Attempt to create a coding system alias loop", alias,
1236                      aliasee);
1237
1238         for (probe = aliasee;
1239              SYMBOLP(probe);
1240              probe = Fgethash(probe, Vcoding_system_hash_table, Qzero)) {
1241                 if (EQ(probe, alias))
1242                         goto alias_loop;
1243         }
1244
1245         Fputhash(alias, aliasee, Vcoding_system_hash_table);
1246
1247         /* Set up aliases for subsidiaries.
1248            #### There must be a better way to handle subsidiary coding
1249            #### systems. */
1250         {
1251                 static char *suffixes[] = { "-unix", "-dos", "-mac" };
1252
1253                 for (int i = 0; i < countof(suffixes); i++) {
1254                         Lisp_Object alias_subsidiary =
1255                                 append_suffix_to_symbol(alias, suffixes[i]);
1256                         Lisp_Object aliasee_subsidiary =
1257                                 append_suffix_to_symbol(aliasee, suffixes[i]);
1258
1259                         if (!NILP(Ffind_coding_system(aliasee_subsidiary))) {
1260                                 Fdefine_coding_system_alias(alias_subsidiary,
1261                                                             aliasee_subsidiary);
1262                         }
1263                 }
1264         }
1265         /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1266            but it doesn't look intentional, so I'd rather return something
1267            meaningful or nothing at all. */
1268         return Qnil;
1269 }
1270
1271 static Lisp_Object
1272 subsidiary_coding_system(Lisp_Object coding_system, eol_type_t type)
1273 {
1274         Lisp_Coding_System *cs = XCODING_SYSTEM(coding_system);
1275         Lisp_Object new_coding_system;
1276
1277         if (CODING_SYSTEM_EOL_TYPE(cs) != EOL_AUTODETECT)
1278                 return coding_system;
1279
1280         switch (type) {
1281         case EOL_AUTODETECT:
1282                 return coding_system;
1283         case EOL_LF:
1284                 new_coding_system = CODING_SYSTEM_EOL_LF(cs);
1285                 break;
1286         case EOL_CR:
1287                 new_coding_system = CODING_SYSTEM_EOL_CR(cs);
1288                 break;
1289         case EOL_CRLF:
1290                 new_coding_system = CODING_SYSTEM_EOL_CRLF(cs);
1291                 break;
1292         default:
1293                 abort();
1294                 return Qnil;
1295         }
1296
1297         return NILP(new_coding_system) ? coding_system : new_coding_system;
1298 }
1299
1300 DEFUN("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0,   /*
1301 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1302 */
1303       (coding_system, eol_type))
1304 {
1305         coding_system = Fget_coding_system(coding_system);
1306
1307         return subsidiary_coding_system(coding_system,
1308                                         symbol_to_eol_type(eol_type));
1309 }
1310 \f
1311 /************************************************************************/
1312 /*                         Coding system accessors                      */
1313 /************************************************************************/
1314
1315 DEFUN("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0,   /*
1316 Return the doc string for CODING-SYSTEM.
1317 */
1318       (coding_system))
1319 {
1320         coding_system = Fget_coding_system(coding_system);
1321         return XCODING_SYSTEM_DOC_STRING(coding_system);
1322 }
1323
1324 DEFUN("coding-system-type", Fcoding_system_type, 1, 1, 0,       /*
1325 Return the type of CODING-SYSTEM.
1326 */
1327       (coding_system))
1328 {
1329         Lisp_Object tmp = Fget_coding_system(coding_system);
1330
1331         switch (XCODING_SYSTEM_TYPE(tmp)) {
1332         default:
1333                 abort();
1334         case CODESYS_AUTODETECT:
1335                 return Qundecided;
1336 #ifdef MULE
1337         case CODESYS_SHIFT_JIS:
1338                 return Qshift_jis;
1339         case CODESYS_ISO2022:
1340                 return Qiso2022;
1341         case CODESYS_BIG5:
1342                 return Qbig5;
1343         case CODESYS_UCS4:
1344                 return Qucs4;
1345         case CODESYS_UTF8:
1346                 return Qutf8;
1347         case CODESYS_CCL:
1348                 return Qccl;
1349 #endif
1350         case CODESYS_NO_CONVERSION:
1351                 return Qno_conversion;
1352 #ifdef DEBUG_SXEMACS
1353         case CODESYS_INTERNAL:
1354                 return Qinternal;
1355 #endif
1356         }
1357 }
1358
1359 #ifdef MULE
1360 static
1361 Lisp_Object coding_system_charset(Lisp_Object coding_system, int gnum)
1362 {
1363         Lisp_Object cs
1364             = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET(coding_system, gnum);
1365
1366         return CHARSETP(cs) ? XCHARSET_NAME(cs) : Qnil;
1367 }
1368
1369 DEFUN("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1370 Return initial charset of CODING-SYSTEM designated to GNUM.
1371 GNUM allows 0 .. 3.
1372 */
1373       (coding_system, gnum))
1374 {
1375         coding_system = Fget_coding_system(coding_system);
1376         CHECK_INT(gnum);
1377
1378         return coding_system_charset(coding_system, XINT(gnum));
1379 }
1380 #endif                          /* MULE */
1381
1382 DEFUN("coding-system-property", Fcoding_system_property, 2, 2, 0,       /*
1383 Return the PROP property of CODING-SYSTEM.
1384 */
1385       (coding_system, prop))
1386 {
1387         int i, ok = 0;
1388         enum coding_system_type type;
1389
1390         coding_system = Fget_coding_system(coding_system);
1391         CHECK_SYMBOL(prop);
1392         type = XCODING_SYSTEM_TYPE(coding_system);
1393
1394         for (i = 0; !ok && i < Dynarr_length(the_codesys_prop_dynarr); i++)
1395                 if (EQ(Dynarr_at(the_codesys_prop_dynarr, i).sym, prop)) {
1396                         ok = 1;
1397                         switch (Dynarr_at(the_codesys_prop_dynarr, i).prop_type) {
1398                         case CODESYS_PROP_ALL_OK:
1399                                 break;
1400 #ifdef MULE
1401                         case CODESYS_PROP_ISO2022:
1402                                 if (type != CODESYS_ISO2022)
1403                                         signal_simple_error
1404                                             ("Property only valid in ISO2022 coding systems",
1405                                              prop);
1406                                 break;
1407
1408                         case CODESYS_PROP_CCL:
1409                                 if (type != CODESYS_CCL)
1410                                         signal_simple_error
1411                                             ("Property only valid in CCL coding systems",
1412                                              prop);
1413                                 break;
1414 #endif                          /* MULE */
1415                         default:
1416                                 abort();
1417                         }
1418                 }
1419
1420         if (!ok)
1421                 signal_simple_error("Unrecognized property", prop);
1422
1423         if (EQ(prop, Qname))
1424                 return XCODING_SYSTEM_NAME(coding_system);
1425         else if (EQ(prop, Qtype))
1426                 return Fcoding_system_type(coding_system);
1427         else if (EQ(prop, Qdoc_string))
1428                 return XCODING_SYSTEM_DOC_STRING(coding_system);
1429         else if (EQ(prop, Qmnemonic))
1430                 return XCODING_SYSTEM_MNEMONIC(coding_system);
1431         else if (EQ(prop, Qeol_type))
1432                 return
1433                     eol_type_to_symbol(XCODING_SYSTEM_EOL_TYPE(coding_system));
1434         else if (EQ(prop, Qeol_lf))
1435                 return XCODING_SYSTEM_EOL_LF(coding_system);
1436         else if (EQ(prop, Qeol_crlf))
1437                 return XCODING_SYSTEM_EOL_CRLF(coding_system);
1438         else if (EQ(prop, Qeol_cr))
1439                 return XCODING_SYSTEM_EOL_CR(coding_system);
1440         else if (EQ(prop, Qpost_read_conversion))
1441                 return XCODING_SYSTEM_POST_READ_CONVERSION(coding_system);
1442         else if (EQ(prop, Qpre_write_conversion))
1443                 return XCODING_SYSTEM_PRE_WRITE_CONVERSION(coding_system);
1444 #ifdef MULE
1445         else if (type == CODESYS_ISO2022) {
1446                 if (EQ(prop, Qcharset_g0))
1447                         return coding_system_charset(coding_system, 0);
1448                 else if (EQ(prop, Qcharset_g1))
1449                         return coding_system_charset(coding_system, 1);
1450                 else if (EQ(prop, Qcharset_g2))
1451                         return coding_system_charset(coding_system, 2);
1452                 else if (EQ(prop, Qcharset_g3))
1453                         return coding_system_charset(coding_system, 3);
1454
1455 #define FORCE_CHARSET(charset_num) \
1456   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1457    (coding_system, charset_num) ? Qt : Qnil)
1458
1459                 else if (EQ(prop, Qforce_g0_on_output))
1460                         return FORCE_CHARSET(0);
1461                 else if (EQ(prop, Qforce_g1_on_output))
1462                         return FORCE_CHARSET(1);
1463                 else if (EQ(prop, Qforce_g2_on_output))
1464                         return FORCE_CHARSET(2);
1465                 else if (EQ(prop, Qforce_g3_on_output))
1466                         return FORCE_CHARSET(3);
1467
1468 #define LISP_BOOLEAN(prop) \
1469   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1470
1471                 else if (EQ(prop, Qshort))
1472                         return LISP_BOOLEAN(SHORT);
1473                 else if (EQ(prop, Qno_ascii_eol))
1474                         return LISP_BOOLEAN(NO_ASCII_EOL);
1475                 else if (EQ(prop, Qno_ascii_cntl))
1476                         return LISP_BOOLEAN(NO_ASCII_CNTL);
1477                 else if (EQ(prop, Qseven))
1478                         return LISP_BOOLEAN(SEVEN);
1479                 else if (EQ(prop, Qlock_shift))
1480                         return LISP_BOOLEAN(LOCK_SHIFT);
1481                 else if (EQ(prop, Qno_iso6429))
1482                         return LISP_BOOLEAN(NO_ISO6429);
1483                 else if (EQ(prop, Qescape_quoted))
1484                         return LISP_BOOLEAN(ESCAPE_QUOTED);
1485
1486                 else if (EQ(prop, Qinput_charset_conversion))
1487                         return
1488                             unparse_charset_conversion_specs
1489                             (XCODING_SYSTEM(coding_system)->iso2022.input_conv);
1490                 else if (EQ(prop, Qoutput_charset_conversion))
1491                         return
1492                             unparse_charset_conversion_specs
1493                             (XCODING_SYSTEM(coding_system)->iso2022.
1494                              output_conv);
1495                 else
1496                         abort();
1497         } else if (type == CODESYS_CCL) {
1498                 if (EQ(prop, Qdecode))
1499                         return XCODING_SYSTEM_CCL_DECODE(coding_system);
1500                 else if (EQ(prop, Qencode))
1501                         return XCODING_SYSTEM_CCL_ENCODE(coding_system);
1502                 else
1503                         abort();
1504         }
1505 #endif                          /* MULE */
1506         else
1507                 abort();
1508
1509         return Qnil;            /* not reached */
1510 }
1511 \f
1512 /************************************************************************/
1513 /*                       Coding category functions                      */
1514 /************************************************************************/
1515
1516 static int decode_coding_category(Lisp_Object symbol)
1517 {
1518         int i;
1519
1520         CHECK_SYMBOL(symbol);
1521         for (i = 0; i < CODING_CATEGORY_LAST; i++)
1522                 if (EQ(coding_category_symbol[i], symbol))
1523                         return i;
1524
1525         signal_simple_error("Unrecognized coding category", symbol);
1526         return 0;               /* not reached */
1527 }
1528
1529 DEFUN("coding-category-list", Fcoding_category_list, 0, 0, 0,   /*
1530 Return a list of all recognized coding categories.
1531 */
1532       ())
1533 {
1534         int i;
1535         Lisp_Object list = Qnil;
1536
1537         for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1538                 list = Fcons(coding_category_symbol[i], list);
1539         return list;
1540 }
1541
1542 DEFUN("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0,   /*
1543 Change the priority order of the coding categories.
1544 LIST should be list of coding categories, in descending order of
1545 priority.  Unspecified coding categories will be lower in priority
1546 than all specified ones, in the same relative order they were in
1547 previously.
1548 */
1549       (list))
1550 {
1551         int category_to_priority[CODING_CATEGORY_LAST];
1552         int i, j;
1553         Lisp_Object rest;
1554
1555         /* First generate a list that maps coding categories to priorities. */
1556
1557         for (i = 0; i < CODING_CATEGORY_LAST; i++)
1558                 category_to_priority[i] = -1;
1559
1560         /* Highest priority comes from the specified list. */
1561         i = 0;
1562         EXTERNAL_LIST_LOOP(rest, list) {
1563                 int cat = decode_coding_category(XCAR(rest));
1564
1565                 if (category_to_priority[cat] >= 0)
1566                         signal_simple_error("Duplicate coding category in list",
1567                                             XCAR(rest));
1568                 category_to_priority[cat] = i++;
1569         }
1570
1571         /* Now go through the existing categories by priority to retrieve
1572            the categories not yet specified and preserve their priority
1573            order. */
1574         for (j = 0; j < CODING_CATEGORY_LAST; j++) {
1575                 int cat = fcd->coding_category_by_priority[j];
1576                 if (category_to_priority[cat] < 0)
1577                         category_to_priority[cat] = i++;
1578         }
1579
1580         /* Now we need to construct the inverse of the mapping we just
1581            constructed. */
1582
1583         for (i = 0; i < CODING_CATEGORY_LAST; i++)
1584                 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1585
1586         /* Phew!  That was confusing. */
1587         return Qnil;
1588 }
1589
1590 DEFUN("coding-priority-list", Fcoding_priority_list, 0, 0, 0,   /*
1591 Return a list of coding categories in descending order of priority.
1592 */
1593       ())
1594 {
1595         int i;
1596         Lisp_Object list = Qnil;
1597
1598         for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1599                 list =
1600                     Fcons(coding_category_symbol
1601                           [fcd->coding_category_by_priority[i]], list);
1602         return list;
1603 }
1604
1605 DEFUN("set-coding-category-system", Fset_coding_category_system, 2, 2, 0,       /*
1606 Change the coding system associated with a coding category.
1607 */
1608       (coding_category, coding_system))
1609 {
1610         int cat = decode_coding_category(coding_category);
1611
1612         coding_system = Fget_coding_system(coding_system);
1613         fcd->coding_category_system[cat] = coding_system;
1614         return Qnil;
1615 }
1616
1617 DEFUN("coding-category-system", Fcoding_category_system, 1, 1, 0,       /*
1618 Return the coding system associated with a coding category.
1619 */
1620       (coding_category))
1621 {
1622         int cat = decode_coding_category(coding_category);
1623         Lisp_Object sys = fcd->coding_category_system[cat];
1624
1625         if (!NILP(sys))
1626                 return XCODING_SYSTEM_NAME(sys);
1627         return Qnil;
1628 }
1629 \f
1630 /************************************************************************/
1631 /*                     Detecting the encoding of data                   */
1632 /************************************************************************/
1633
1634 struct detection_state {
1635         eol_type_t eol_type;
1636         int seen_non_ascii;
1637         int mask;
1638 #ifdef MULE
1639         struct {
1640                 int mask;
1641                 int in_second_byte;
1642         } big5;
1643
1644         struct {
1645                 int mask;
1646                 int in_second_byte;
1647         } shift_jis;
1648
1649         struct {
1650                 int mask;
1651                 int in_byte;
1652         } ucs4;
1653
1654         struct {
1655                 int mask;
1656                 int in_byte;
1657         } utf8;
1658
1659         struct {
1660                 int mask;
1661                 int initted;
1662                 struct iso2022_decoder iso;
1663                 unsigned int flags;
1664                 int high_byte_count;
1665                 unsigned int saw_single_shift:1;
1666         } iso2022;
1667 #endif
1668         struct {
1669                 int seen_anything;
1670                 int just_saw_cr;
1671         } eol;
1672 };
1673
1674 static int acceptable_control_char_p(int c)
1675 {
1676         switch (c) {
1677                 /* Allow and ignore control characters that you might
1678                    reasonably see in a text file */
1679         case '\r':
1680         case '\n':
1681         case '\t':
1682         case 7:         /* bell */
1683         case 8:         /* backspace */
1684         case 11:                /* vertical tab */
1685         case 12:                /* form feed */
1686         case 26:                /* MS-DOS C-z junk */
1687         case 31:                /* '^_' -- for info */
1688                 return 1;
1689         default:
1690                 return 0;
1691         }
1692 }
1693
1694 static int mask_has_at_most_one_bit_p(int mask)
1695 {
1696         /* Perhaps the only thing useful you learn from intensive Microsoft
1697            technical interviews */
1698         return (mask & (mask - 1)) == 0;
1699 }
1700
1701 static eol_type_t
1702 detect_eol_type(struct detection_state *st, const Extbyte * src,
1703                 Lstream_data_count n)
1704 {
1705         while (n--) {
1706                 const unsigned char c = *(const unsigned char*)src++;
1707                 if (c == '\n') {
1708                         if (st->eol.just_saw_cr)
1709                                 return EOL_CRLF;
1710                         else if (st->eol.seen_anything)
1711                                 return EOL_LF;
1712                 } else if (st->eol.just_saw_cr)
1713                         return EOL_CR;
1714                 else if (c == '\r')
1715                         st->eol.just_saw_cr = 1;
1716                 else
1717                         st->eol.just_saw_cr = 0;
1718                 st->eol.seen_anything = 1;
1719         }
1720
1721         return EOL_AUTODETECT;
1722 }
1723
1724 /* Attempt to determine the encoding and EOL type of the given text.
1725    Before calling this function for the first type, you must initialize
1726    st->eol_type as appropriate and initialize st->mask to ~0.
1727
1728    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1729    not yet known.
1730
1731    st->mask holds the determined coding category mask, or ~0 if only
1732    ASCII has been seen so far.
1733
1734    Returns:
1735
1736    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1737         is present in st->mask
1738    1 == definitive answers are here for both st->eol_type and st->mask
1739 */
1740
1741 static int
1742 detect_coding_type(struct detection_state *st, const Extbyte * src,
1743                    Lstream_data_count n, int just_do_eol)
1744 {
1745         if (st->eol_type == EOL_AUTODETECT)
1746                 st->eol_type = detect_eol_type(st, src, n);
1747
1748         if (just_do_eol)
1749                 return st->eol_type != EOL_AUTODETECT;
1750
1751         if (!st->seen_non_ascii) {
1752                 for (; n; n--, src++) {
1753                         const unsigned char c = *(const unsigned char *)src;
1754                         if ((c < 0x20 && !acceptable_control_char_p(c))
1755                             || c >= 0x80) {
1756                                 st->seen_non_ascii = 1;
1757 #ifdef MULE
1758                                 st->shift_jis.mask = ~0;
1759                                 st->big5.mask = ~0;
1760                                 st->ucs4.mask = ~0;
1761                                 st->utf8.mask = ~0;
1762                                 st->iso2022.mask = ~0;
1763 #endif
1764                                 break;
1765                         }
1766                 }
1767         }
1768
1769         if (!n) {
1770                 return 0;
1771         }
1772 #ifdef MULE
1773         if (!mask_has_at_most_one_bit_p(st->iso2022.mask))
1774                 st->iso2022.mask = detect_coding_iso2022(st, src, n);
1775         if (!mask_has_at_most_one_bit_p(st->shift_jis.mask))
1776                 st->shift_jis.mask = detect_coding_sjis(st, src, n);
1777         if (!mask_has_at_most_one_bit_p(st->big5.mask))
1778                 st->big5.mask = detect_coding_big5(st, src, n);
1779         if (!mask_has_at_most_one_bit_p(st->utf8.mask))
1780                 st->utf8.mask = detect_coding_utf8(st, src, n);
1781         if (!mask_has_at_most_one_bit_p(st->ucs4.mask))
1782                 st->ucs4.mask = detect_coding_ucs4(st, src, n);
1783
1784         st->mask = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1785                 | st->utf8.mask | st->ucs4.mask;
1786 #endif
1787         {
1788                 int retval = mask_has_at_most_one_bit_p(st->mask);
1789                 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1790                 return retval && st->eol_type != EOL_AUTODETECT;
1791         }
1792 }
1793
1794 static Lisp_Object coding_system_from_mask(int mask)
1795 {
1796         if (mask == ~0) {
1797                 /* If the file was entirely or basically ASCII, use the
1798                    default value of `buffer-file-coding-system'. */
1799                 Lisp_Object retval =
1800                     XBUFFER(Vbuffer_defaults)->buffer_file_coding_system;
1801                 if (!NILP(retval)) {
1802                         retval = Ffind_coding_system(retval);
1803                         if (NILP(retval)) {
1804                                 warn_when_safe
1805                                     (Qbad_variable, Qwarning,
1806                                      "Invalid `default-buffer-file-coding-system', set to nil");
1807                                 XBUFFER(Vbuffer_defaults)->
1808                                     buffer_file_coding_system = Qnil;
1809                         }
1810                 }
1811                 if (NILP(retval))
1812                         retval = Fget_coding_system(Qraw_text);
1813                 return retval;
1814         } else {
1815                 int i;
1816                 int cat = -1;
1817 #ifdef MULE
1818                 mask = postprocess_iso2022_mask(mask);
1819 #endif
1820                 /* Look through the coding categories by priority and find
1821                    the first one that is allowed. */
1822                 for (i = 0; i < CODING_CATEGORY_LAST; i++) {
1823                         cat = fcd->coding_category_by_priority[i];
1824                         if (cat<0)
1825                                 continue;
1826                         if ((mask & (1 << cat)) &&
1827                             !NILP(fcd->coding_category_system[cat]))
1828                                 break;
1829                 }
1830                 if (cat >= 0)
1831                         return fcd->coding_category_system[cat];
1832                 else
1833                         return Fget_coding_system(Qraw_text);
1834         }
1835 }
1836
1837 /* Given a seekable read stream and potential coding system and EOL type
1838    as specified, do any autodetection that is called for.  If the
1839    coding system and/or EOL type are not `autodetect', they will be left
1840    alone; but this function will never return an autodetect coding system
1841    or EOL type.
1842
1843    This function does not automatically fetch subsidiary coding systems;
1844    that should be unnecessary with the explicit eol-type argument. */
1845
1846 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1847 /* number of leading lines to check for a coding cookie */
1848 #define LINES_TO_CHECK 2
1849
1850 void
1851 determine_real_coding_system(lstream_t stream, Lisp_Object * codesys_in_out,
1852                              eol_type_t * eol_type_in_out)
1853 {
1854         struct detection_state decst;
1855
1856         if (*eol_type_in_out == EOL_AUTODETECT)
1857                 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE(*codesys_in_out);
1858
1859         xzero(decst);
1860         decst.eol_type = *eol_type_in_out;
1861         decst.mask = ~0;
1862
1863         /* If autodetection is called for, do it now. */
1864         if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT
1865             || *eol_type_in_out == EOL_AUTODETECT) {
1866                 Extbyte buf[4096];
1867                 Lisp_Object coding_system = Qnil;
1868                 Extbyte *p;
1869                 Lstream_data_count nread =
1870                     Lstream_read(stream, buf, sizeof(buf));
1871                 Extbyte *scan_end;
1872                 int lines_checked = 0;
1873
1874                 /* Look for initial "-*-"; mode line prefix */
1875                 for (p = buf,
1876                      scan_end = buf + nread - LENGTH("-*-coding:?-*-");
1877                      p <= scan_end && lines_checked < LINES_TO_CHECK; p++)
1878                         if (*p == '-' && *(p + 1) == '*' && *(p + 2) == '-') {
1879                                 Extbyte *local_vars_beg = p + 3;
1880                                 /* Look for final "-*-"; mode line suffix */
1881                                 for (p = local_vars_beg,
1882                                      scan_end = buf + nread - LENGTH("-*-");
1883                                      p <= scan_end
1884                                      && lines_checked < LINES_TO_CHECK; p++)
1885                                         if (*p == '-' && *(p + 1) == '*'
1886                                             && *(p + 2) == '-') {
1887                                                 Extbyte *suffix = p;
1888                                                 /* Look for "coding:" */
1889                                                 for (p = local_vars_beg,
1890                                                      scan_end =
1891                                                      suffix -
1892                                                      LENGTH("coding:?");
1893                                                      p <= scan_end; p++)
1894                                                         if (memcmp
1895                                                             ("coding:", p,
1896                                                              LENGTH("coding:"))
1897                                                             == 0
1898                                                             && (p ==
1899                                                                 local_vars_beg
1900                                                                 || (*(p - 1) ==
1901                                                                     ' '
1902                                                                     || *(p -
1903                                                                          1) ==
1904                                                                     '\t'
1905                                                                     || *(p -
1906                                                                          1) ==
1907                                                                     ';'))) {
1908                                                                 Extbyte save;
1909                                                                 int n;
1910                                                                 p += LENGTH
1911                                                                     ("coding:");
1912                                                                 while (*p == ' '
1913                                                                        || *p ==
1914                                                                        '\t')
1915                                                                         p++;
1916
1917                                                                 /* Get coding system name */
1918                                                                 save = *suffix;
1919                                                                 *suffix = '\0';
1920                                                                 /* Characters valid in a MIME charset name (rfc 1521),
1921                                                                    and in a Lisp symbol name. */
1922                                                                 n = strspn((char
1923                                                                             *)p,
1924                                                                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1925                                                                            "abcdefghijklmnopqrstuvwxyz"
1926                                                                            "0123456789"
1927                                                                            "!$%&*+-.^_{|}~");
1928                                                                 *suffix = save;
1929                                                                 if (n > 0) {
1930                                                                         save =
1931                                                                             p
1932                                                                             [n];
1933                                                                         p[n] =
1934                                                                             '\0';
1935                                                                         coding_system
1936                                                                             =
1937                                                                             Ffind_coding_system
1938                                                                             (intern
1939                                                                              ((char *)p));
1940                                                                         p[n] =
1941                                                                             save;
1942                                                                 }
1943                                                                 break;
1944                                                         }
1945                                                 break;
1946                                         }
1947                                 /* #### file must use standard EOLs or we miss 2d line */
1948                                 /* #### not to mention this is broken for UTF-16 DOS files */
1949                                         else if (*p == '\n' || *p == '\r') {
1950                                                 lines_checked++;
1951                                                 /* skip past multibyte (DOS) newline */
1952                                                 if (*p == '\r'
1953                                                     && *(p + 1) == '\n')
1954                                                         p++;
1955                                         }
1956                                 break;
1957                         }
1958                 /* #### file must use standard EOLs or we miss 2d line */
1959                 /* #### not to mention this is broken for UTF-16 DOS files */
1960                         else if (*p == '\n' || *p == '\r') {
1961                                 lines_checked++;
1962                                 /* skip past multibyte (DOS) newline */
1963                                 if (*p == '\r' && *(p + 1) == '\n')
1964                                         p++;
1965                         }
1966
1967                 if (NILP(coding_system))
1968                         do {
1969                                 if (detect_coding_type(&decst, buf, nread,
1970                                                        XCODING_SYSTEM_TYPE
1971                                                        (*codesys_in_out)
1972                                                        != CODESYS_AUTODETECT))
1973                                         break;
1974                                 nread = Lstream_read(stream, buf, sizeof(buf));
1975                                 if (nread == 0)
1976                                         break;
1977                         }
1978                         while (1);
1979
1980                 else if (XCODING_SYSTEM_TYPE(*codesys_in_out) ==
1981                          CODESYS_AUTODETECT
1982                          && XCODING_SYSTEM_EOL_TYPE(coding_system) ==
1983                          EOL_AUTODETECT)
1984                         do {
1985                                 if (detect_coding_type(&decst, buf, nread, 1))
1986                                         break;
1987                                 nread = Lstream_read(stream, buf, sizeof(buf));
1988                                 if (!nread)
1989                                         break;
1990                         }
1991                         while (1);
1992
1993                 *eol_type_in_out = decst.eol_type;
1994                 if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT) {
1995                         if (NILP(coding_system))
1996                                 *codesys_in_out =
1997                                     coding_system_from_mask(decst.mask);
1998                         else
1999                                 *codesys_in_out = coding_system;
2000                 }
2001         }
2002
2003         /* If we absolutely can't determine the EOL type, just assume LF. */
2004         if (*eol_type_in_out == EOL_AUTODETECT)
2005                 *eol_type_in_out = EOL_LF;
2006
2007         Lstream_rewind(stream);
2008 }
2009
2010 DEFUN("detect-coding-region", Fdetect_coding_region, 2, 3, 0,   /*
2011 Detect coding system of the text in the region between START and END.
2012 Return a list of possible coding systems ordered by priority.
2013 If only ASCII characters are found, return 'undecided or one of
2014 its subsidiary coding systems according to a detected end-of-line
2015 type.  Optional arg BUFFER defaults to the current buffer.
2016 */
2017       (start, end, buffer))
2018 {
2019         Lisp_Object val = Qnil;
2020         struct buffer *buf = decode_buffer(buffer, 0);
2021         Bufpos b, e;
2022         Lisp_Object instream, lb_instream;
2023         lstream_t istr, lb_istr;
2024         struct detection_state decst;
2025         struct gcpro gcpro1, gcpro2;
2026
2027         get_buffer_range_char(buf, start, end, &b, &e, 0);
2028         lb_instream = make_lisp_buffer_input_stream(buf, b, e, 0);
2029         lb_istr = XLSTREAM(lb_instream);
2030         instream =
2031             make_encoding_input_stream(lb_istr, Fget_coding_system(Qbinary));
2032         istr = XLSTREAM(instream);
2033         GCPRO2(instream, lb_instream);
2034         xzero(decst);
2035         decst.eol_type = EOL_AUTODETECT;
2036         decst.mask = ~0;
2037         while (1) {
2038                 Extbyte random_buffer[4096];
2039                 Lstream_data_count nread =
2040                     Lstream_read(istr, random_buffer, sizeof(random_buffer));
2041
2042                 if (!nread)
2043                         break;
2044                 if (detect_coding_type(&decst, random_buffer, nread, 0))
2045                         break;
2046         }
2047
2048         if (decst.mask == ~0)
2049                 val = subsidiary_coding_system(Fget_coding_system(Qundecided),
2050                                                decst.eol_type);
2051         else {
2052                 int i;
2053
2054                 val = Qnil;
2055 #ifdef MULE
2056                 decst.mask = postprocess_iso2022_mask(decst.mask);
2057 #endif
2058                 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--) {
2059                         int sys = fcd->coding_category_by_priority[i];
2060                         if (decst.mask & (1 << sys)) {
2061                                 Lisp_Object codesys =
2062                                     fcd->coding_category_system[sys];
2063                                 if (!NILP(codesys))
2064                                         codesys =
2065                                             subsidiary_coding_system(codesys,
2066                                                                      decst.
2067                                                                      eol_type);
2068                                 val = Fcons(codesys, val);
2069                         }
2070                 }
2071         }
2072         Lstream_close(istr);
2073         UNGCPRO;
2074         Lstream_delete(istr);
2075         Lstream_delete(lb_istr);
2076         return val;
2077 }
2078 \f
2079 /************************************************************************/
2080 /*           Converting to internal Mule format ("decoding")            */
2081 /************************************************************************/
2082
2083 /* A decoding stream is a stream used for decoding text (i.e.
2084    converting from some external format to internal format).
2085    The decoding-stream object keeps track of the actual coding
2086    stream, the stream that is at the other end, and data that
2087    needs to be persistent across the lifetime of the stream. */
2088
2089 /* Handle the EOL stuff related to just-read-in character C.
2090    EOL_TYPE is the EOL type of the coding stream.
2091    FLAGS is the current value of FLAGS in the coding stream, and may
2092    be modified by this macro.  (The macro only looks at the
2093    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
2094    bytes are to be written.  You need to also define a local goto
2095    label "label_continue_loop" that is at the end of the main
2096    character-reading loop.
2097
2098    If C is a CR character, then this macro handles it entirely and
2099    jumps to label_continue_loop.  Otherwise, this macro does not add
2100    anything to DST, and continues normally.  You should continue
2101    processing C normally after this macro. */
2102
2103 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
2104 do {                                                            \
2105   if (c == '\r')                                                \
2106     {                                                           \
2107       if (eol_type == EOL_CR)                                   \
2108         Dynarr_add (dst, '\n');                                 \
2109       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2110         Dynarr_add (dst, c);                                    \
2111       else                                                      \
2112         flags |= CODING_STATE_CR;                               \
2113       goto label_continue_loop;                                 \
2114     }                                                           \
2115   else if (flags & CODING_STATE_CR)                             \
2116     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
2117       if (c != '\n')                                            \
2118         Dynarr_add (dst, '\r');                                 \
2119       flags &= ~CODING_STATE_CR;                                \
2120     }                                                           \
2121 } while (0)
2122
2123 /* C should be a binary character in the range 0 - 255; convert
2124    to internal format and add to Dynarr DST. */
2125
2126 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
2127 do {                                            \
2128   if (BYTE_ASCII_P (c))                         \
2129     Dynarr_add (dst, c);                        \
2130   else if (BYTE_C1_P (c))                       \
2131     {                                           \
2132       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2133       Dynarr_add (dst, c + 0x20);               \
2134     }                                           \
2135   else                                          \
2136     {                                           \
2137       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2138       Dynarr_add (dst, c);                      \
2139     }                                           \
2140 } while (0)
2141
2142 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
2143 do {                                    \
2144   if (ch)                               \
2145     {                                   \
2146       DECODE_ADD_BINARY_CHAR (ch, dst); \
2147       ch = 0;                           \
2148     }                                   \
2149 } while (0)
2150
2151 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2152 do {                                    \
2153   if (flags & CODING_STATE_END)         \
2154     {                                   \
2155       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
2156       if (flags & CODING_STATE_CR)      \
2157         Dynarr_add (dst, '\r');         \
2158     }                                   \
2159 } while (0)
2160
2161 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2162
2163 typedef struct decoding_stream_s *decoding_stream_t;
2164 struct decoding_stream_s {
2165         /* Coding system that governs the conversion. */
2166         Lisp_Coding_System *codesys;
2167
2168         /* Stream that we read the encoded data from or
2169            write the decoded data to. */
2170         lstream_t other_end;
2171
2172         /* If we are reading, then we can return only a fixed amount of
2173            data, so if the conversion resulted in too much data, we store it
2174            here for retrieval the next time around. */
2175         unsigned_char_dynarr *runoff;
2176
2177         /* FLAGS holds flags indicating the current state of the decoding.
2178            Some of these flags are dependent on the coding system. */
2179         unsigned int flags;
2180
2181         /* CH holds a partially built-up character.  Since we only deal
2182            with one- and two-byte characters at the moment, we only use
2183            this to store the first byte of a two-byte character. */
2184         unsigned int ch;
2185
2186         /* EOL_TYPE specifies the type of end-of-line conversion that
2187            currently applies.  We need to keep this separate from the
2188            EOL type stored in CODESYS because the latter might indicate
2189            automatic EOL-type detection while the former will always
2190            indicate a particular EOL type. */
2191         eol_type_t eol_type;
2192 #ifdef MULE
2193         /* Additional ISO2022 information.  We define the structure above
2194            because it's also needed by the detection routines. */
2195         struct iso2022_decoder iso2022;
2196
2197         /* Additional information (the state of the running CCL program)
2198            used by the CCL decoder. */
2199         struct ccl_program ccl;
2200
2201         /* counter for UTF-8 or UCS-4 */
2202         unsigned char counter;
2203 #endif
2204         struct detection_state decst;
2205 };
2206
2207 static Lstream_data_count
2208 decoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size);
2209 static Lstream_data_count
2210 decoding_writer(lstream_t stream,
2211                 const unsigned char *data, Lstream_data_count size);
2212 static int decoding_rewinder(lstream_t stream);
2213 static int decoding_seekable_p(lstream_t stream);
2214 static int decoding_flusher(lstream_t stream);
2215 static int decoding_closer(lstream_t stream);
2216
2217 static Lisp_Object decoding_marker(Lisp_Object stream);
2218
2219 DEFINE_LSTREAM_IMPLEMENTATION("decoding", lstream_decoding,
2220                               sizeof(struct decoding_stream_s));
2221
2222 static Lisp_Object
2223 decoding_marker(Lisp_Object stream)
2224 {
2225         lstream_t str = DECODING_STREAM_DATA(XLSTREAM(stream))->other_end;
2226         Lisp_Object str_obj;
2227
2228         /* We do not need to mark the coding systems or charsets stored
2229            within the stream because they are stored in a global list
2230            and automatically marked. */
2231
2232         XSETLSTREAM(str_obj, str);
2233         mark_object(str_obj);
2234         if (str->imp->marker) {
2235                 return str->imp->marker(str_obj);
2236         } else {
2237                 return Qnil;
2238         }
2239 }
2240
2241 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
2242    so we read data from the other end, decode it, and store it into DATA. */
2243
2244 static Lstream_data_count
2245 decoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size)
2246 {
2247         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2248         unsigned char *orig_data = data;
2249         Lstream_data_count read_size;
2250         int error_occurred = 0;
2251
2252         /* We need to interface to mule_decode(), which expects to take some
2253            amount of data and store the result into a Dynarr.  We have
2254            mule_decode() store into str->runoff, and take data from there
2255            as necessary. */
2256
2257         /* We loop until we have enough data, reading chunks from the other
2258            end and decoding it. */
2259         while (1) {
2260                 /* Take data from the runoff if we can.  Make sure to take at
2261                    most SIZE bytes, and delete the data from the runoff. */
2262                 if (Dynarr_length(str->runoff) > 0) {
2263                         Lstream_data_count chunk =
2264                                 min(size,
2265                                     (Lstream_data_count)
2266                                     Dynarr_length(str->runoff));
2267                         memcpy(data, Dynarr_atp(str->runoff, 0), chunk);
2268                         Dynarr_delete_many(str->runoff, 0, chunk);
2269                         data += chunk;
2270                         size -= chunk;
2271                 }
2272
2273                 if (size == 0) {
2274                         /* No more room for data */
2275                         break;
2276                 }
2277
2278                 if (str->flags & CODING_STATE_END) {
2279                         /* This means that on the previous iteration, we hit the
2280                            EOF on the other end.  We loop once more so that
2281                            mule_decode() can output any final stuff it may be
2282                            holding, or any "go back to a sane state" escape
2283                            sequences. (This latter makes sense during
2284                            encoding.) */
2285                         break;
2286                 }
2287
2288                 /* Exhausted the runoff, so get some more.  DATA has at least
2289                    SIZE bytes left of storage in it, so it's OK to read directly
2290                    into it.  (We'll be overwriting above, after we've decoded it
2291                    into the runoff.) */
2292                 read_size = Lstream_read(str->other_end, data, size);
2293                 if (read_size < 0) {
2294                         error_occurred = 1;
2295                         break;
2296                 }
2297                 if (read_size == 0) {
2298                         /* There might be some more end data produced in the
2299                            translation.  See the comment above. */
2300                         str->flags |= CODING_STATE_END;
2301                 }
2302                 mule_decode(stream, (Extbyte *) data, str->runoff, read_size);
2303         }
2304
2305         if (data - orig_data == 0) {
2306                 return error_occurred ? -1 : 0;
2307         } else {
2308                 return data - orig_data;
2309         }
2310 }
2311
2312 static Lstream_data_count
2313 decoding_writer(lstream_t stream, const unsigned char *data,
2314                 Lstream_data_count size)
2315 {
2316         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2317         Lstream_data_count retval;
2318
2319         /* Decode all our data into the runoff, and then attempt to write
2320            it all out to the other end.  Remove whatever chunk we succeeded
2321            in writing. */
2322         mule_decode(stream, (const Extbyte *)data, str->runoff, size);
2323         retval = Lstream_write(str->other_end, Dynarr_atp(str->runoff, 0),
2324                                Dynarr_length(str->runoff));
2325         if (retval > 0) {
2326                 Dynarr_delete_many(str->runoff, 0, retval);
2327         }
2328         /* Do NOT return retval.  The return value indicates how much
2329            of the incoming data was written, not how many bytes were
2330            written. */
2331         return size;
2332 }
2333
2334 static void
2335 reset_decoding_stream(decoding_stream_t str)
2336 {
2337 #ifdef MULE
2338         if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_ISO2022) {
2339                 Lisp_Object coding_system;
2340                 XSETCODING_SYSTEM(coding_system, str->codesys);
2341                 reset_iso2022(coding_system, &str->iso2022);
2342         } else if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_CCL) {
2343                 setup_ccl_program(&str->ccl,
2344                                   CODING_SYSTEM_CCL_DECODE(str->codesys));
2345         }
2346         str->counter = 0;
2347 #endif                          /* MULE */
2348         if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT
2349             || CODING_SYSTEM_EOL_TYPE(str->codesys) == EOL_AUTODETECT) {
2350                 xzero(str->decst);
2351                 str->decst.eol_type = EOL_AUTODETECT;
2352                 str->decst.mask = ~0;
2353         }
2354         str->flags = str->ch = 0;
2355 }
2356
2357 static int
2358 decoding_rewinder(lstream_t stream)
2359 {
2360         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2361         reset_decoding_stream(str);
2362         Dynarr_reset(str->runoff);
2363         return Lstream_rewind(str->other_end);
2364 }
2365
2366 static int
2367 decoding_seekable_p(lstream_t stream)
2368 {
2369         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2370         return Lstream_seekable_p(str->other_end);
2371 }
2372
2373 static int
2374 decoding_flusher(lstream_t stream)
2375 {
2376         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2377         return Lstream_flush(str->other_end);
2378 }
2379
2380 static int
2381 decoding_closer(lstream_t stream)
2382 {
2383         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2384         if (stream->flags & LSTREAM_FL_WRITE) {
2385                 str->flags |= CODING_STATE_END;
2386                 decoding_writer(stream, 0, 0);
2387         }
2388         Dynarr_free(str->runoff);
2389 #ifdef MULE
2390 #ifdef ENABLE_COMPOSITE_CHARS
2391         if (str->iso2022.composite_chars) {
2392                 Dynarr_free(str->iso2022.composite_chars);
2393         }
2394 #endif
2395 #endif
2396         return Lstream_close(str->other_end);
2397 }
2398
2399 Lisp_Object
2400 decoding_stream_coding_system(lstream_t stream)
2401 {
2402         Lisp_Object coding_system;
2403         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2404
2405         XSETCODING_SYSTEM(coding_system, str->codesys);
2406         return subsidiary_coding_system(coding_system, str->eol_type);
2407 }
2408
2409 void
2410 set_decoding_stream_coding_system(lstream_t lstr, Lisp_Object codesys)
2411 {
2412         Lisp_Coding_System *cs = XCODING_SYSTEM(codesys);
2413         decoding_stream_t str = DECODING_STREAM_DATA(lstr);
2414         str->codesys = cs;
2415         if (CODING_SYSTEM_EOL_TYPE(cs) != EOL_AUTODETECT) {
2416                 str->eol_type = CODING_SYSTEM_EOL_TYPE(cs);
2417         }
2418         reset_decoding_stream(str);
2419         return;
2420 }
2421
2422 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2423    stream for writing, no automatic code detection will be performed.
2424    The reason for this is that automatic code detection requires a
2425    seekable input.  Things will also fail if you open a decoding
2426    stream for reading using a non-fully-specified coding system and
2427    a non-seekable input stream. */
2428
2429 static Lisp_Object
2430 make_decoding_stream_1(lstream_t stream, Lisp_Object codesys, const char *mode)
2431 {
2432         lstream_t lstr = Lstream_new(lstream_decoding, mode);
2433         decoding_stream_t str = DECODING_STREAM_DATA(lstr);
2434         Lisp_Object obj;
2435
2436         xzero(*str);
2437         str->other_end = stream;
2438         str->runoff = (unsigned_char_dynarr *) Dynarr_new(unsigned_char);
2439         str->eol_type = EOL_AUTODETECT;
2440         if (!strcmp(mode, "r") && Lstream_seekable_p(stream)) {
2441                 /* We can determine the coding system now. */
2442                 determine_real_coding_system(stream, &codesys, &str->eol_type);
2443         }
2444         set_decoding_stream_coding_system(lstr, codesys);
2445         str->decst.eol_type = str->eol_type;
2446         str->decst.mask = ~0;
2447         XSETLSTREAM(obj, lstr);
2448         return obj;
2449 }
2450
2451 Lisp_Object
2452 make_decoding_input_stream(lstream_t stream, Lisp_Object codesys)
2453 {
2454         return make_decoding_stream_1(stream, codesys, "r");
2455 }
2456
2457 Lisp_Object
2458 make_decoding_output_stream(lstream_t stream, Lisp_Object codesys)
2459 {
2460         return make_decoding_stream_1(stream, codesys, "w");
2461 }
2462
2463 /* Note: the decode_coding_* functions all take the same
2464    arguments as mule_decode(), which is to say some SRC data of
2465    size N, which is to be stored into dynamic array DST.
2466    DECODING is the stream within which the decoding is
2467    taking place, but no data is actually read from or
2468    written to that stream; that is handled in decoding_reader()
2469    or decoding_writer().  This allows the same functions to
2470    be used for both reading and writing. */
2471
2472 static void
2473 mule_decode(lstream_t decoding, const Extbyte * src,
2474             unsigned_char_dynarr * dst, Lstream_data_count n)
2475 {
2476         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
2477
2478         /* If necessary, do encoding-detection now.  We do this when
2479            we're a writing stream or a non-seekable reading stream,
2480            meaning that we can't just process the whole input,
2481            rewind, and start over. */
2482
2483         if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT ||
2484             str->eol_type == EOL_AUTODETECT) {
2485                 Lisp_Object codesys;
2486
2487                 XSETCODING_SYSTEM(codesys, str->codesys);
2488                 detect_coding_type(&str->decst, src, n,
2489                                    CODING_SYSTEM_TYPE(str->codesys) !=
2490                                    CODESYS_AUTODETECT);
2491                 if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT &&
2492                     str->decst.mask != ~0)
2493                         /* #### This is cheesy.  What we really ought to do is
2494                            buffer up a certain amount of data so as to get a
2495                            less random result. */
2496                         codesys = coding_system_from_mask(str->decst.mask);
2497                 str->eol_type = str->decst.eol_type;
2498                 if (XCODING_SYSTEM(codesys) != str->codesys) {
2499                         /* Preserve the CODING_STATE_END flag in case it was set.
2500                            If we erase it, bad things might happen. */
2501                         int was_end = str->flags & CODING_STATE_END;
2502                         set_decoding_stream_coding_system(decoding, codesys);
2503                         if (was_end)
2504                                 str->flags |= CODING_STATE_END;
2505                 }
2506         }
2507
2508         switch (CODING_SYSTEM_TYPE(str->codesys)) {
2509 #ifdef DEBUG_SXEMACS
2510         case CODESYS_INTERNAL:
2511                 Dynarr_add_many(dst, src, n);
2512                 break;
2513 #endif
2514         case CODESYS_AUTODETECT:
2515                 /* If we got this far and still haven't decided on the coding
2516                    system, then do no conversion. */
2517         case CODESYS_NO_CONVERSION:
2518                 decode_coding_no_conversion(decoding, src, dst, n);
2519                 break;
2520 #ifdef MULE
2521         case CODESYS_SHIFT_JIS:
2522                 decode_coding_sjis(decoding, src, dst, n);
2523                 break;
2524         case CODESYS_BIG5:
2525                 decode_coding_big5(decoding, src, dst, n);
2526                 break;
2527         case CODESYS_UCS4:
2528                 decode_coding_ucs4(decoding, src, dst, n);
2529                 break;
2530         case CODESYS_UTF8:
2531                 decode_coding_utf8(decoding, src, dst, n);
2532                 break;
2533         case CODESYS_CCL:
2534                 str->ccl.last_block = str->flags & CODING_STATE_END;
2535                 /* When applying ccl program to stream, MUST NOT set NULL
2536                    pointer to src.  */
2537                 ccl_driver(&str->ccl,
2538                            (src
2539                             ? (const unsigned char *)src
2540                             : (const unsigned char *)""),
2541                            dst, n, 0, CCL_MODE_DECODING);
2542                 break;
2543         case CODESYS_ISO2022:
2544                 decode_coding_iso2022(decoding, src, dst, n);
2545                 break;
2546 #endif                          /* MULE */
2547         default:
2548                 abort();
2549         }
2550 }
2551
2552 DEFUN("decode-coding-region", Fdecode_coding_region, 3, 4, 0,   /*
2553 Decode the text between START and END which is encoded in CODING-SYSTEM.
2554 This is useful if you've read in encoded text from a file without decoding
2555 it (e.g. you read in a JIS-formatted file but used the `binary' or
2556 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2557 Return length of decoded text.
2558 BUFFER defaults to the current buffer if unspecified.
2559 */
2560       (start, end, coding_system, buffer))
2561 {
2562         Bufpos b, e;
2563         struct buffer *buf = decode_buffer(buffer, 0);
2564         Lisp_Object instream, lb_outstream, de_outstream, outstream;
2565         lstream_t istr, ostr;
2566         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2567
2568         get_buffer_range_char(buf, start, end, &b, &e, 0);
2569
2570         barf_if_buffer_read_only(buf, b, e);
2571
2572         coding_system = Fget_coding_system(coding_system);
2573         instream = make_lisp_buffer_input_stream(buf, b, e, 0);
2574         lb_outstream = make_lisp_buffer_output_stream(buf, b, 0);
2575         de_outstream = make_decoding_output_stream(XLSTREAM(lb_outstream),
2576                                                    coding_system);
2577         outstream = make_encoding_output_stream(XLSTREAM(de_outstream),
2578                                                 Fget_coding_system(Qbinary));
2579         istr = XLSTREAM(instream);
2580         ostr = XLSTREAM(outstream);
2581         GCPRO4(instream, lb_outstream, de_outstream, outstream);
2582
2583         /* The chain of streams looks like this:
2584
2585            [BUFFER] <----- send through
2586            ------> [ENCODE AS BINARY]
2587            ------> [DECODE AS SPECIFIED]
2588            ------> [BUFFER]
2589          */
2590
2591         while (1) {
2592                 char tempbuf[1024];     /* some random amount */
2593                 Bufpos newpos, even_newer_pos;
2594                 Bufpos oldpos = lisp_buffer_stream_startpos(istr);
2595                 Lstream_data_count size_in_bytes =
2596                     Lstream_read(istr, tempbuf, sizeof(tempbuf));
2597
2598                 if (!size_in_bytes)
2599                         break;
2600                 newpos = lisp_buffer_stream_startpos(istr);
2601                 Lstream_write(ostr, tempbuf, size_in_bytes);
2602                 even_newer_pos = lisp_buffer_stream_startpos(istr);
2603                 buffer_delete_range(buf, even_newer_pos - (newpos - oldpos),
2604                                     even_newer_pos, 0);
2605         }
2606         Lstream_close(istr);
2607         Lstream_close(ostr);
2608         UNGCPRO;
2609         Lstream_delete(istr);
2610         Lstream_delete(ostr);
2611         Lstream_delete(XLSTREAM(de_outstream));
2612         Lstream_delete(XLSTREAM(lb_outstream));
2613         return Qnil;
2614 }
2615 \f
2616 /************************************************************************/
2617 /*           Converting to an external encoding ("encoding")            */
2618 /************************************************************************/
2619
2620 /* An encoding stream is an output stream.  When you create the
2621    stream, you specify the coding system that governs the encoding
2622    and another stream that the resulting encoded data is to be
2623    sent to, and then start sending data to it. */
2624
2625 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2626
2627 typedef struct encoding_stream_s *encoding_stream_t;
2628 struct encoding_stream_s {
2629         /* Coding system that governs the conversion. */
2630         Lisp_Coding_System *codesys;
2631
2632         /* Stream that we read the encoded data from or
2633            write the decoded data to. */
2634         lstream_t other_end;
2635
2636         /* If we are reading, then we can return only a fixed amount of
2637            data, so if the conversion resulted in too much data, we store it
2638            here for retrieval the next time around. */
2639         unsigned_char_dynarr *runoff;
2640
2641         /* FLAGS holds flags indicating the current state of the encoding.
2642            Some of these flags are dependent on the coding system. */
2643         unsigned int flags;
2644
2645         /* CH holds a partially built-up character.  Since we only deal
2646            with one- and two-byte characters at the moment, we only use
2647            this to store the first byte of a two-byte character. */
2648         unsigned int ch;
2649 #ifdef MULE
2650         /* Additional information used by the ISO2022 encoder. */
2651         struct {
2652                 /* CHARSET holds the character sets currently assigned to the G0
2653                    through G3 registers.  It is initialized from the array
2654                    INITIAL_CHARSET in CODESYS. */
2655                 Lisp_Object charset[4];
2656
2657                 /* Which registers are currently invoked into the left (GL) and
2658                    right (GR) halves of the 8-bit encoding space? */
2659                 int register_left, register_right;
2660
2661                 /* Whether we need to explicitly designate the charset in the
2662                    G? register before using it.  It is initialized from the
2663                    array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2664                 unsigned char force_charset_on_output[4];
2665
2666                 /* Other state variables that need to be preserved across
2667                    invocations. */
2668                 Lisp_Object current_charset;
2669                 int current_half;
2670                 int current_char_boundary;
2671         } iso2022;
2672
2673         /* Additional information (the state of the running CCL program)
2674            used by the CCL encoder. */
2675         struct ccl_program ccl;
2676 #endif                          /* MULE */
2677 };
2678
2679 static Lstream_data_count
2680 encoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size);
2681 static Lstream_data_count
2682 encoding_writer(lstream_t stream,
2683                 const unsigned char *data, Lstream_data_count size);
2684 static int encoding_rewinder(lstream_t stream);
2685 static int encoding_seekable_p(lstream_t stream);
2686 static int encoding_flusher(lstream_t stream);
2687 static int encoding_closer(lstream_t stream);
2688
2689 static Lisp_Object encoding_marker(Lisp_Object stream);
2690
2691 DEFINE_LSTREAM_IMPLEMENTATION("encoding", lstream_encoding,
2692                               sizeof(struct encoding_stream_s));
2693
2694 static Lisp_Object
2695 encoding_marker(Lisp_Object stream)
2696 {
2697         lstream_t str = ENCODING_STREAM_DATA(XLSTREAM(stream))->other_end;
2698         Lisp_Object str_obj;
2699
2700         /* We do not need to mark the coding systems or charsets stored
2701            within the stream because they are stored in a global list
2702            and automatically marked. */
2703
2704         XSETLSTREAM(str_obj, str);
2705         mark_object(str_obj);
2706         if (str->imp->marker) {
2707                 return str->imp->marker(str_obj);
2708         } else {
2709                 return Qnil;
2710         }
2711 }
2712
2713 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2714    so we read data from the other end, encode it, and store it into DATA. */
2715
2716 static Lstream_data_count
2717 encoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size)
2718 {
2719         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2720         unsigned char *orig_data = data;
2721         Lstream_data_count read_size;
2722         int error_occurred = 0;
2723
2724         /* We need to interface to mule_encode(), which expects to take some
2725            amount of data and store the result into a Dynarr.  We have
2726            mule_encode() store into str->runoff, and take data from there
2727            as necessary. */
2728
2729         /* We loop until we have enough data, reading chunks from the other
2730            end and encoding it. */
2731         while (1) {
2732                 /* Take data from the runoff if we can.  Make sure to take at
2733                    most SIZE bytes, and delete the data from the runoff. */
2734                 if (Dynarr_length(str->runoff) > 0) {
2735                         int chunk = min((int)size, Dynarr_length(str->runoff));
2736                         memcpy(data, Dynarr_atp(str->runoff, 0), chunk);
2737                         Dynarr_delete_many(str->runoff, 0, chunk);
2738                         data += chunk;
2739                         size -= chunk;
2740                 }
2741
2742                 if (size == 0) {
2743                         /* No more room for data */
2744                         break;
2745                 }
2746
2747                 if (str->flags & CODING_STATE_END) {
2748                         /* This means that on the previous iteration, we hit the
2749                            EOF on the other end.  We loop once more so that
2750                            mule_encode() can output any final stuff it may be
2751                            holding, or any "go back to a sane state" escape
2752                            sequences. (This latter makes sense during
2753                            encoding.) */
2754                         break;
2755                 }
2756
2757                 /* Exhausted the runoff, so get some more.  DATA at least SIZE
2758                    bytes left of storage in it, so it's OK to read directly into
2759                    it.  (We'll be overwriting above, after we've encoded it into
2760                    the runoff.) */
2761                 read_size = Lstream_read(str->other_end, data, size);
2762                 if (read_size < 0) {
2763                         error_occurred = 1;
2764                         break;
2765                 }
2766                 if (read_size == 0) {
2767                         /* There might be some more end data produced in the
2768                            translation.  See the comment above. */
2769                         str->flags |= CODING_STATE_END;
2770                 }
2771                 mule_encode(stream, data, str->runoff, read_size);
2772         }
2773
2774         if (data == orig_data) {
2775                 return error_occurred ? -1 : 0;
2776         } else {
2777                 return data - orig_data;
2778         }
2779 }
2780
2781 static Lstream_data_count
2782 encoding_writer(lstream_t stream, const unsigned char *data,
2783                 Lstream_data_count size)
2784 {
2785         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2786         Lstream_data_count retval;
2787
2788         /* Encode all our data into the runoff, and then attempt to write
2789            it all out to the other end.  Remove whatever chunk we succeeded
2790            in writing. */
2791         mule_encode(stream, data, str->runoff, size);
2792         retval = Lstream_write(str->other_end, Dynarr_atp(str->runoff, 0),
2793                                Dynarr_length(str->runoff));
2794         if (retval > 0) {
2795                 Dynarr_delete_many(str->runoff, 0, retval);
2796         }
2797         /* Do NOT return retval.  The return value indicates how much
2798            of the incoming data was written, not how many bytes were
2799            written. */
2800         return size;
2801 }
2802
2803 static void
2804 reset_encoding_stream(encoding_stream_t str)
2805 {
2806 #ifdef MULE
2807         switch (CODING_SYSTEM_TYPE(str->codesys)) {
2808         case CODESYS_ISO2022: {
2809                 int i;
2810
2811                 for (i = 0; i < 4; i++) {
2812                         str->iso2022.charset[i] =
2813                                 CODING_SYSTEM_ISO2022_INITIAL_CHARSET(
2814                                         str->codesys, i);
2815                         str->iso2022.force_charset_on_output[i] =
2816                                 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT(
2817                                         str->codesys, i);
2818                 }
2819                 str->iso2022.register_left = 0;
2820                 str->iso2022.register_right = 1;
2821                 str->iso2022.current_charset = Qnil;
2822                 str->iso2022.current_half = 0;
2823                 str->iso2022.current_char_boundary = 1;
2824                 break;
2825         }
2826         case CODESYS_CCL:
2827                 setup_ccl_program(&str->ccl,
2828                                   CODING_SYSTEM_CCL_ENCODE(str->codesys));
2829                 break;
2830
2831                 /* list the rest of them lot explicitly */
2832         case CODESYS_AUTODETECT:
2833         case CODESYS_SHIFT_JIS:
2834         case CODESYS_BIG5:
2835         case CODESYS_UCS4:
2836         case CODESYS_UTF8:
2837         case CODESYS_NO_CONVERSION:
2838 #ifdef DEBUG_SXEMACS
2839         case CODESYS_INTERNAL:
2840 #endif
2841         default:
2842                 break;
2843         }
2844 #endif                          /* MULE */
2845
2846         str->flags = str->ch = 0;
2847 }
2848
2849 static int
2850 encoding_rewinder(lstream_t stream)
2851 {
2852         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2853         reset_encoding_stream(str);
2854         Dynarr_reset(str->runoff);
2855         return Lstream_rewind(str->other_end);
2856 }
2857
2858 static int
2859 encoding_seekable_p(lstream_t stream)
2860 {
2861         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2862         return Lstream_seekable_p(str->other_end);
2863 }
2864
2865 static int
2866 encoding_flusher(lstream_t stream)
2867 {
2868         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2869         return Lstream_flush(str->other_end);
2870 }
2871
2872 static int
2873 encoding_closer(lstream_t stream)
2874 {
2875         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2876         if (stream->flags & LSTREAM_FL_WRITE) {
2877                 str->flags |= CODING_STATE_END;
2878                 encoding_writer(stream, 0, 0);
2879         }
2880         Dynarr_free(str->runoff);
2881         return Lstream_close(str->other_end);
2882 }
2883
2884 Lisp_Object
2885 encoding_stream_coding_system(lstream_t stream)
2886 {
2887         Lisp_Object coding_system;
2888         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2889
2890         XSETCODING_SYSTEM(coding_system, str->codesys);
2891         return coding_system;
2892 }
2893
2894 void
2895 set_encoding_stream_coding_system(lstream_t lstr, Lisp_Object codesys)
2896 {
2897         Lisp_Coding_System *cs = XCODING_SYSTEM(codesys);
2898         encoding_stream_t str = ENCODING_STREAM_DATA(lstr);
2899         str->codesys = cs;
2900         reset_encoding_stream(str);
2901 }
2902
2903 static Lisp_Object
2904 make_encoding_stream_1(lstream_t stream, Lisp_Object codesys, const char *mode)
2905 {
2906         lstream_t lstr = Lstream_new(lstream_encoding, mode);
2907         encoding_stream_t str = ENCODING_STREAM_DATA(lstr);
2908         Lisp_Object obj;
2909
2910         xzero(*str);
2911         str->runoff = Dynarr_new(unsigned_char);
2912         str->other_end = stream;
2913         set_encoding_stream_coding_system(lstr, codesys);
2914         XSETLSTREAM(obj, lstr);
2915         return obj;
2916 }
2917
2918 Lisp_Object
2919 make_encoding_input_stream(lstream_t stream, Lisp_Object codesys)
2920 {
2921         return make_encoding_stream_1(stream, codesys, "r");
2922 }
2923
2924 Lisp_Object
2925 make_encoding_output_stream(lstream_t stream, Lisp_Object codesys)
2926 {
2927         return make_encoding_stream_1(stream, codesys, "w");
2928 }
2929
2930 /* Convert N bytes of internally-formatted data stored in SRC to an
2931    external format, according to the encoding stream ENCODING.
2932    Store the encoded data into DST. */
2933
2934 static void
2935 mule_encode(lstream_t encoding, const Bufbyte * src,
2936             unsigned_char_dynarr * dst, Lstream_data_count n)
2937 {
2938         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
2939
2940         switch (CODING_SYSTEM_TYPE(str->codesys)) {
2941 #ifdef DEBUG_SXEMACS
2942         case CODESYS_INTERNAL:
2943                 Dynarr_add_many(dst, src, n);
2944                 break;
2945 #endif
2946         case CODESYS_AUTODETECT:
2947                 /* If we got this far and still haven't decided on the coding
2948                    system, then do no conversion. */
2949         case CODESYS_NO_CONVERSION:
2950                 encode_coding_no_conversion(encoding, src, dst, n);
2951                 break;
2952 #ifdef MULE
2953         case CODESYS_SHIFT_JIS:
2954                 encode_coding_sjis(encoding, src, dst, n);
2955                 break;
2956         case CODESYS_BIG5:
2957                 encode_coding_big5(encoding, src, dst, n);
2958                 break;
2959         case CODESYS_UCS4:
2960                 encode_coding_ucs4(encoding, src, dst, n);
2961                 break;
2962         case CODESYS_UTF8:
2963                 encode_coding_utf8(encoding, src, dst, n);
2964                 break;
2965         case CODESYS_CCL:
2966                 str->ccl.last_block = str->flags & CODING_STATE_END;
2967                 /* When applying ccl program to stream, MUST NOT set NULL
2968                    pointer to src.  */
2969                 ccl_driver(&str->ccl, ((src) ? src : (unsigned char *)""),
2970                            dst, n, 0, CCL_MODE_ENCODING);
2971                 break;
2972         case CODESYS_ISO2022:
2973                 encode_coding_iso2022(encoding, src, dst, n);
2974                 break;
2975 #endif                          /* MULE */
2976         default:
2977                 abort();
2978         }
2979 }
2980
2981 DEFUN("encode-coding-region", Fencode_coding_region, 3, 4, 0,   /*
2982 Encode the text between START and END using CODING-SYSTEM.
2983 This will, for example, convert Japanese characters into stuff such as
2984 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
2985 text.  BUFFER defaults to the current buffer if unspecified.
2986 */
2987       (start, end, coding_system, buffer))
2988 {
2989         Bufpos b, e;
2990         struct buffer *buf = decode_buffer(buffer, 0);
2991         Lisp_Object instream, lb_outstream, de_outstream, outstream;
2992         lstream_t istr, ostr;
2993         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2994
2995         get_buffer_range_char(buf, start, end, &b, &e, 0);
2996
2997         barf_if_buffer_read_only(buf, b, e);
2998
2999         coding_system = Fget_coding_system(coding_system);
3000         instream = make_lisp_buffer_input_stream(buf, b, e, 0);
3001         lb_outstream = make_lisp_buffer_output_stream(buf, b, 0);
3002         de_outstream = make_decoding_output_stream(XLSTREAM(lb_outstream),
3003                                                    Fget_coding_system(Qbinary));
3004         outstream = make_encoding_output_stream(XLSTREAM(de_outstream),
3005                                                 coding_system);
3006         istr = XLSTREAM(instream);
3007         ostr = XLSTREAM(outstream);
3008         GCPRO4(instream, outstream, de_outstream, lb_outstream);
3009         /* The chain of streams looks like this:
3010
3011            [BUFFER] <----- send through
3012            ------> [ENCODE AS SPECIFIED]
3013            ------> [DECODE AS BINARY]
3014            ------> [BUFFER]
3015          */
3016         while (1) {
3017                 char tempbuf[1024];     /* some random amount */
3018                 Bufpos newpos, even_newer_pos;
3019                 Bufpos oldpos = lisp_buffer_stream_startpos(istr);
3020                 Lstream_data_count size_in_bytes =
3021                     Lstream_read(istr, tempbuf, sizeof(tempbuf));
3022
3023                 if (!size_in_bytes)
3024                         break;
3025                 newpos = lisp_buffer_stream_startpos(istr);
3026                 Lstream_write(ostr, tempbuf, size_in_bytes);
3027                 even_newer_pos = lisp_buffer_stream_startpos(istr);
3028                 buffer_delete_range(buf, even_newer_pos - (newpos - oldpos),
3029                                     even_newer_pos, 0);
3030         }
3031
3032         {
3033                 Charcount retlen =
3034                     lisp_buffer_stream_startpos(XLSTREAM(instream)) - b;
3035                 Lstream_close(istr);
3036                 Lstream_close(ostr);
3037                 UNGCPRO;
3038                 Lstream_delete(istr);
3039                 Lstream_delete(ostr);
3040                 Lstream_delete(XLSTREAM(de_outstream));
3041                 Lstream_delete(XLSTREAM(lb_outstream));
3042                 return make_int(retlen);
3043         }
3044 }
3045
3046 #ifdef MULE
3047 \f
3048 /************************************************************************/
3049 /*                          Shift-JIS methods                           */
3050 /************************************************************************/
3051
3052 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3053    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3054    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3055    encoded by "position-code + 0x80".  A character of JISX0208
3056    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3057    position-codes are divided and shifted so that it fit in the range
3058    below.
3059
3060    --- CODE RANGE of Shift-JIS ---
3061    (character set)      (range)
3062    ASCII                0x00 .. 0x7F
3063    JISX0201-Kana        0xA0 .. 0xDF
3064    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
3065             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3066    -------------------------------
3067
3068 */
3069
3070 /* Is this the first byte of a Shift-JIS two-byte char? */
3071
3072 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3073   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3074
3075 /* Is this the second byte of a Shift-JIS two-byte char? */
3076
3077 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3078   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3079
3080 #define BYTE_SJIS_KATAKANA_P(c) \
3081   ((c) >= 0xA1 && (c) <= 0xDF)
3082
3083 static int
3084 detect_coding_sjis(struct detection_state *st, const Extbyte * src,
3085                    Lstream_data_count n)
3086 {
3087         while (n--) {
3088                 const unsigned char c = *(const unsigned char *)src++;
3089                 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3090                         return 0;
3091                 if (st->shift_jis.in_second_byte) {
3092                         st->shift_jis.in_second_byte = 0;
3093                         if (c < 0x40)
3094                                 return 0;
3095                 } else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3096                         st->shift_jis.in_second_byte = 1;
3097         }
3098         return CODING_CATEGORY_SHIFT_JIS_MASK;
3099 }
3100
3101 /* Convert Shift-JIS data to internal format. */
3102
3103 static void
3104 decode_coding_sjis(lstream_t decoding, const Extbyte * src,
3105                    unsigned_char_dynarr * dst, Lstream_data_count n)
3106 {
3107         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3108         unsigned int flags = str->flags;
3109         unsigned int ch = str->ch;
3110         eol_type_t eol_type = str->eol_type;
3111
3112         while (n--) {
3113                 const unsigned char c = *(const unsigned char *)src++;
3114
3115                 if (ch) {
3116                         /* Previous character was first byte of Shift-JIS Kanji
3117                            char. */
3118                         if (BYTE_SJIS_TWO_BYTE_2_P(c)) {
3119                                 unsigned char e1, e2;
3120
3121                                 Dynarr_add(dst, LEADING_BYTE_JAPANESE_JISX0208);
3122                                 DECODE_SJIS(ch, c, e1, e2);
3123                                 Dynarr_add(dst, e1);
3124                                 Dynarr_add(dst, e2);
3125                         } else {
3126                                 DECODE_ADD_BINARY_CHAR(ch, dst);
3127                                 DECODE_ADD_BINARY_CHAR(c, dst);
3128                         }
3129                         ch = 0;
3130                 } else {
3131                         DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3132                         if (BYTE_SJIS_TWO_BYTE_1_P(c))
3133                                 ch = c;
3134                         else if (BYTE_SJIS_KATAKANA_P(c)) {
3135                                 Dynarr_add(dst, LEADING_BYTE_KATAKANA_JISX0201);
3136                                 Dynarr_add(dst, c);
3137                         } else
3138                                 DECODE_ADD_BINARY_CHAR(c, dst);
3139                 }
3140               label_continue_loop:;
3141         }
3142
3143         DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
3144
3145         str->flags = flags;
3146         str->ch = ch;
3147 }
3148
3149 /* Convert internally-formatted data to Shift-JIS. */
3150
3151 static void
3152 encode_coding_sjis(lstream_t encoding, const Bufbyte * src,
3153                    unsigned_char_dynarr * dst, Lstream_data_count n)
3154 {
3155         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3156         unsigned int flags = str->flags;
3157         unsigned int ch = str->ch;
3158         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3159
3160         while (n--) {
3161                 Bufbyte c = *src++;
3162                 if (c == '\n') {
3163                         if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3164                                 Dynarr_add(dst, '\r');
3165                         if (eol_type != EOL_CR)
3166                                 Dynarr_add(dst, '\n');
3167                         ch = 0;
3168                 } else if (BYTE_ASCII_P(c)) {
3169                         Dynarr_add(dst, c);
3170                         ch = 0;
3171                 } else if (BUFBYTE_LEADING_BYTE_P(c))
3172                         ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
3173                               c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3174                               c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
3175                 else if (ch) {
3176                         if (ch == LEADING_BYTE_KATAKANA_JISX0201) {
3177                                 Dynarr_add(dst, c);
3178                                 ch = 0;
3179                         } else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3180                                    ch == LEADING_BYTE_JAPANESE_JISX0208)
3181                                 ch = c;
3182                         else {
3183                                 /* j1 is bessel j1 function,
3184                                  * so we use something else */
3185                                 /* unsigned char j1, j2; */
3186                                 unsigned char tt1, tt2;
3187
3188                                 ENCODE_SJIS(ch, c, tt1, tt2);
3189                                 Dynarr_add(dst, tt1);
3190                                 Dynarr_add(dst, tt2);
3191                                 ch = 0;
3192                         }
3193                 }
3194         }
3195
3196         str->flags = flags;
3197         str->ch = ch;
3198 }
3199
3200 DEFUN("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3201 Decode a JISX0208 character of Shift-JIS coding-system.
3202 CODE is the character code in Shift-JIS as a cons of type bytes.
3203 Return the corresponding character.
3204 */
3205       (code))
3206 {
3207         unsigned char c1, c2, s1, s2;
3208
3209         CHECK_CONS(code);
3210         CHECK_INT(XCAR(code));
3211         CHECK_INT(XCDR(code));
3212         s1 = XINT(XCAR(code));
3213         s2 = XINT(XCDR(code));
3214         if (BYTE_SJIS_TWO_BYTE_1_P(s1) && BYTE_SJIS_TWO_BYTE_2_P(s2)) {
3215                 DECODE_SJIS(s1, s2, c1, c2);
3216                 return make_char(MAKE_CHAR(Vcharset_japanese_jisx0208,
3217                                            c1 & 0x7F, c2 & 0x7F));
3218         } else
3219                 return Qnil;
3220 }
3221
3222 DEFUN("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3223 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3224 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3225 */
3226       (character))
3227 {
3228         Lisp_Object charset;
3229         int c1, c2, s1, s2;
3230
3231         CHECK_CHAR_COERCE_INT(character);
3232         BREAKUP_CHAR(XCHAR(character), charset, c1, c2);
3233         if (EQ(charset, Vcharset_japanese_jisx0208)) {
3234                 ENCODE_SJIS(c1 | 0x80, c2 | 0x80, s1, s2);
3235                 return Fcons(make_int(s1), make_int(s2));
3236         } else
3237                 return Qnil;
3238 }
3239 \f
3240 /************************************************************************/
3241 /*                            Big5 methods                              */
3242 /************************************************************************/
3243
3244 /* BIG5 is a coding system encoding two character sets: ASCII and
3245    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3246    character set and is encoded in two-byte.
3247
3248    --- CODE RANGE of BIG5 ---
3249    (character set)      (range)
3250    ASCII                0x00 .. 0x7F
3251    Big5 (1st byte)      0xA1 .. 0xFE
3252         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3253    --------------------------
3254
3255    Since the number of characters in Big5 is larger than maximum
3256    characters in Emacs' charset (96x96), it can't be handled as one
3257    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
3258    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
3259    contains frequently used characters and the latter contains less
3260    frequently used characters.  */
3261
3262 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3263   ((c) >= 0xA1 && (c) <= 0xFE)
3264
3265 /* Is this the second byte of a Shift-JIS two-byte char? */
3266
3267 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3268   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3269
3270 /* Number of Big5 characters which have the same code in 1st byte.  */
3271
3272 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3273
3274 /* Code conversion macros.  These are macros because they are used in
3275    inner loops during code conversion.
3276
3277    Note that temporary variables in macros introduce the classic
3278    dynamic-scoping problems with variable names.  We use capital-
3279    lettered variables in the assumption that SXEmacs does not use
3280    capital letters in variables except in a very formalized way
3281    (e.g. Qstring). */
3282
3283 /* Convert Big5 code (b1, b2) into its internal string representation
3284    (lb, c1, c2). */
3285
3286 /* There is a much simpler way to split the Big5 charset into two.
3287    For the moment I'm going to leave the algorithm as-is because it
3288    claims to separate out the most-used characters into a single
3289    charset, which perhaps will lead to optimizations in various
3290    places.
3291
3292    The way the algorithm works is something like this:
3293
3294    Big5 can be viewed as a 94x157 charset, where the row is
3295    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3296    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
3297    the split between low and high column numbers is apparently
3298    meaningless; ascending rows produce less and less frequent chars.
3299    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3300    the first charset, and the upper half (0xC9 .. 0xFE) to the
3301    second.  To do the conversion, we convert the character into
3302    a single number where 0 .. 156 is the first row, 157 .. 313
3303    is the second, etc.  That way, the characters are ordered by
3304    decreasing frequency.  Then we just chop the space in two
3305    and coerce the result into a 94x94 space.
3306    */
3307
3308 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
3309 {                                                                       \
3310   int B1 = b1, B2 = b2;                                                 \
3311   unsigned int I                                                        \
3312     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
3313                                                                         \
3314   if (B1 < 0xC9)                                                        \
3315     {                                                                   \
3316       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
3317     }                                                                   \
3318   else                                                                  \
3319     {                                                                   \
3320       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
3321       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
3322     }                                                                   \
3323   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
3324   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
3325 } while (0)
3326
3327 /* Convert the internal string representation of a Big5 character
3328    (lb, c1, c2) into Big5 code (b1, b2). */
3329
3330 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
3331 {                                                                       \
3332   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
3333                                                                         \
3334   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
3335     {                                                                   \
3336       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
3337     }                                                                   \
3338   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
3339   b2 = I % BIG5_SAME_ROW;                                               \
3340   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
3341 } while (0)
3342
3343 static int
3344 detect_coding_big5(struct detection_state *st, const Extbyte * src,
3345                    Lstream_data_count n)
3346 {
3347         while (n--) {
3348                 const unsigned char c = *(const unsigned char *)src++;
3349                 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3350                     (c >= 0x80 && c <= 0xA0))
3351                         return 0;
3352                 if (st->big5.in_second_byte) {
3353                         st->big5.in_second_byte = 0;
3354                         if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3355                                 return 0;
3356                 } else if (c >= 0xA1)
3357                         st->big5.in_second_byte = 1;
3358         }
3359         return CODING_CATEGORY_BIG5_MASK;
3360 }
3361
3362 /* Convert Big5 data to internal format. */
3363
3364 static void
3365 decode_coding_big5(lstream_t decoding, const Extbyte * src,
3366                    unsigned_char_dynarr * dst, Lstream_data_count n)
3367 {
3368         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3369         unsigned int flags = str->flags;
3370         unsigned int ch = str->ch;
3371         eol_type_t eol_type = str->eol_type;
3372
3373         while (n--) {
3374                 const unsigned char c = *(const unsigned char *)src++;
3375                 if (ch) {
3376                         /* Previous character was first byte of Big5 char. */
3377                         if (BYTE_BIG5_TWO_BYTE_2_P(c)) {
3378                                 unsigned char b1, b2, b3;
3379                                 DECODE_BIG5(ch, c, b1, b2, b3);
3380                                 Dynarr_add(dst, b1);
3381                                 Dynarr_add(dst, b2);
3382                                 Dynarr_add(dst, b3);
3383                         } else {
3384                                 DECODE_ADD_BINARY_CHAR(ch, dst);
3385                                 DECODE_ADD_BINARY_CHAR(c, dst);
3386                         }
3387                         ch = 0;
3388                 } else {
3389                         DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3390                         if (BYTE_BIG5_TWO_BYTE_1_P(c))
3391                                 ch = c;
3392                         else
3393                                 DECODE_ADD_BINARY_CHAR(c, dst);
3394                 }
3395               label_continue_loop:;
3396         }
3397
3398         DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
3399
3400         str->flags = flags;
3401         str->ch = ch;
3402 }
3403
3404 /* Convert internally-formatted data to Big5. */
3405
3406 static void
3407 encode_coding_big5(lstream_t encoding, const Bufbyte * src,
3408                    unsigned_char_dynarr * dst, Lstream_data_count n)
3409 {
3410         unsigned char c;
3411         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3412         unsigned int flags = str->flags;
3413         unsigned int ch = str->ch;
3414         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3415
3416         while (n--) {
3417                 c = *src++;
3418                 if (c == '\n') {
3419                         if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3420                                 Dynarr_add(dst, '\r');
3421                         if (eol_type != EOL_CR)
3422                                 Dynarr_add(dst, '\n');
3423                 } else if (BYTE_ASCII_P(c)) {
3424                         /* ASCII. */
3425                         Dynarr_add(dst, c);
3426                 } else if (BUFBYTE_LEADING_BYTE_P(c)) {
3427                         if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3428                             c == LEADING_BYTE_CHINESE_BIG5_2) {
3429                                 /* A recognized leading byte. */
3430                                 ch = c;
3431                                 continue;       /* not done with this character. */
3432                         }
3433                         /* otherwise just ignore this character. */
3434                 } else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3435                            ch == LEADING_BYTE_CHINESE_BIG5_2) {
3436                         /* Previous char was a recognized leading byte. */
3437                         ch = (ch << 8) | c;
3438                         continue;       /* not done with this character. */
3439                 } else if (ch) {
3440                         /* Encountering second byte of a Big5 character. */
3441                         unsigned char b1, b2;
3442
3443                         ENCODE_BIG5(ch >> 8, ch & 0xFF, c, b1, b2);
3444                         Dynarr_add(dst, b1);
3445                         Dynarr_add(dst, b2);
3446                 }
3447
3448                 ch = 0;
3449         }
3450
3451         str->flags = flags;
3452         str->ch = ch;
3453 }
3454
3455 DEFUN("decode-big5-char", Fdecode_big5_char, 1, 1, 0,   /*
3456 Decode a Big5 character CODE of BIG5 coding-system.
3457 CODE is the character code in BIG5, a cons of two integers.
3458 Return the corresponding character.
3459 */
3460       (code))
3461 {
3462         unsigned char c1, c2, b1, b2;
3463
3464         CHECK_CONS(code);
3465         CHECK_INT(XCAR(code));
3466         CHECK_INT(XCDR(code));
3467         b1 = XINT(XCAR(code));
3468         b2 = XINT(XCDR(code));
3469         if (BYTE_BIG5_TWO_BYTE_1_P(b1) && BYTE_BIG5_TWO_BYTE_2_P(b2)) {
3470                 int leading_byte;
3471                 Lisp_Object charset;
3472                 DECODE_BIG5(b1, b2, leading_byte, c1, c2);
3473                 charset = CHARSET_BY_LEADING_BYTE(leading_byte);
3474                 return make_char(MAKE_CHAR(charset, c1 & 0x7F, c2 & 0x7F));
3475         } else
3476                 return Qnil;
3477 }
3478
3479 DEFUN("encode-big5-char", Fencode_big5_char, 1, 1, 0,   /*
3480 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3481 Return the corresponding character code in Big5.
3482 */
3483       (character))
3484 {
3485         Lisp_Object charset;
3486         int c1, c2, b1, b2;
3487
3488         CHECK_CHAR_COERCE_INT(character);
3489         BREAKUP_CHAR(XCHAR(character), charset, c1, c2);
3490         if (EQ(charset, Vcharset_chinese_big5_1) ||
3491             EQ(charset, Vcharset_chinese_big5_2)) {
3492                 ENCODE_BIG5(XCHARSET_LEADING_BYTE(charset), c1 | 0x80,
3493                             c2 | 0x80, b1, b2);
3494                 return Fcons(make_int(b1), make_int(b2));
3495         } else
3496                 return Qnil;
3497 }
3498 \f
3499 /************************************************************************/
3500 /*                           UCS-4 methods                              */
3501 /*                                                                      */
3502 /*  UCS-4 character codes are implemented as nonnegative integers.      */
3503 /*                                                                      */
3504 /************************************************************************/
3505
3506 DEFUN("set-ucs-char", Fset_ucs_char, 2, 2, 0,   /*
3507 Map UCS-4 code CODE to Mule character CHARACTER.
3508
3509 Return T on success, NIL on failure.
3510 */
3511       (code, character))
3512 {
3513         size_t c;
3514
3515         CHECK_CHAR(character);
3516         CHECK_NATNUM(code);
3517         c = XINT(code);
3518
3519         if (c < countof(fcd->ucs_to_mule_table)) {
3520                 fcd->ucs_to_mule_table[c] = character;
3521                 return Qt;
3522         } else
3523                 return Qnil;
3524 }
3525
3526 static Lisp_Object ucs_to_char(unsigned long code)
3527 {
3528         if (code < countof(fcd->ucs_to_mule_table)) {
3529                 return fcd->ucs_to_mule_table[code];
3530         } else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14)) {
3531                 unsigned int c;
3532
3533                 code -= 0xe00000;
3534                 c = code % (94 * 94);
3535                 return make_char
3536                     (MAKE_CHAR(CHARSET_BY_ATTRIBUTES
3537                                (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3538                                 CHARSET_LEFT_TO_RIGHT),
3539                                c / 94 + 33, c % 94 + 33));
3540         } else
3541                 return Qnil;
3542 }
3543
3544 DEFUN("ucs-char", Fucs_char, 1, 1, 0,   /*
3545 Return Mule character corresponding to UCS code CODE (a positive integer).
3546 */
3547       (code))
3548 {
3549         CHECK_NATNUM(code);
3550         return ucs_to_char(XINT(code));
3551 }
3552
3553 DEFUN("set-char-ucs", Fset_char_ucs, 2, 2, 0,   /*
3554 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3555 */
3556       (character, code))
3557 {
3558         /* #### Isn't this gilding the lily?  Fput_char_table checks its args.
3559            Fset_char_ucs is more restrictive on index arg, but should
3560            check code arg in a char_table method. */
3561         CHECK_CHAR(character);
3562         CHECK_NATNUM(code);
3563         return Fput_char_table(character, code, mule_to_ucs_table);
3564 }
3565
3566 DEFUN("char-ucs", Fchar_ucs, 1, 1, 0,   /*
3567 Return the UCS code (a positive integer) corresponding to CHARACTER.
3568 */
3569       (character))
3570 {
3571         return Fget_char_table(character, mule_to_ucs_table);
3572 }
3573
3574 /* Decode a UCS-4 character into a buffer.  If the lookup fails, use
3575    <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3576    is not found, instead.
3577    #### do something more appropriate (use blob?)
3578         Danger, Will Robinson!  Data loss.  Should we signal user? */
3579 static void decode_ucs4(unsigned long ch, unsigned_char_dynarr * dst)
3580 {
3581         Lisp_Object chr = ucs_to_char(ch);
3582
3583         if (!NILP(chr)) {
3584                 Bufbyte work[MAX_EMCHAR_LEN];
3585                 int len;
3586
3587                 ch = XCHAR(chr);
3588                 len = (ch < 128) ?
3589                     simple_set_charptr_emchar(work, ch) :
3590                     non_ascii_set_charptr_emchar(work, ch);
3591                 Dynarr_add_many(dst, work, len);
3592         } else {
3593                 Dynarr_add(dst, LEADING_BYTE_JAPANESE_JISX0208);
3594                 Dynarr_add(dst, 34 + 128);
3595                 Dynarr_add(dst, 46 + 128);
3596         }
3597 }
3598
3599 static unsigned long
3600 mule_char_to_ucs4(Lisp_Object charset, unsigned char h, unsigned char l)
3601 {
3602         Lisp_Object code
3603             = Fget_char_table(make_char(MAKE_CHAR(charset, h & 127, l & 127)),
3604                               mule_to_ucs_table);
3605
3606         if (INTP(code)) {
3607                 return XINT(code);
3608         } else if ((XCHARSET_DIMENSION(charset) == 2) &&
3609                    (XCHARSET_CHARS(charset) == 94)) {
3610                 unsigned char final = XCHARSET_FINAL(charset);
3611
3612                 if (('@' <= final) && (final < 0x7f)) {
3613                         return 0xe00000 + (final - '@') * 94 * 94
3614                             + ((h & 127) - 33) * 94 + (l & 127) - 33;
3615                 } else {
3616                         return '?';
3617                 }
3618         } else {
3619                 return '?';
3620         }
3621 }
3622
3623 static void
3624 encode_ucs4(Lisp_Object charset,
3625             unsigned char h, unsigned char l, unsigned_char_dynarr * dst)
3626 {
3627         unsigned long code = mule_char_to_ucs4(charset, h, l);
3628         Dynarr_add(dst, code >> 24);
3629         Dynarr_add(dst, (code >> 16) & 255);
3630         Dynarr_add(dst, (code >> 8) & 255);
3631         Dynarr_add(dst, code & 255);
3632 }
3633
3634 static int
3635 detect_coding_ucs4(struct detection_state *st, const Extbyte * src,
3636                    Lstream_data_count n)
3637 {
3638         while (n--) {
3639                 const unsigned char c = *(const unsigned char *)src++;
3640                 switch (st->ucs4.in_byte) {
3641                 case 0:
3642                         if (c >= 128)
3643                                 return 0;
3644                         else
3645                                 st->ucs4.in_byte++;
3646                         break;
3647                 case 3:
3648                         st->ucs4.in_byte = 0;
3649                         break;
3650                 default:
3651                         st->ucs4.in_byte++;
3652                 }
3653         }
3654         return CODING_CATEGORY_UCS4_MASK;
3655 }
3656
3657 static void
3658 decode_coding_ucs4(lstream_t decoding, const Extbyte * src,
3659                    unsigned_char_dynarr * dst, Lstream_data_count n)
3660 {
3661         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3662         unsigned int flags = str->flags;
3663         unsigned int ch = str->ch;
3664         unsigned char counter = str->counter;
3665
3666         while (n--) {
3667                 const unsigned char c = *(const unsigned char *)src++;
3668                 switch (counter) {
3669                 case 0:
3670                         ch = c;
3671                         counter = 3;
3672                         break;
3673                 case 1:
3674                         decode_ucs4((ch << 8) | c, dst);
3675                         ch = 0;
3676                         counter = 0;
3677                         break;
3678                 default:
3679                         ch = (ch << 8) | c;
3680                         counter--;
3681                 }
3682         }
3683         if (counter & CODING_STATE_END)
3684                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
3685
3686         str->flags = flags;
3687         str->ch = ch;
3688         str->counter = counter;
3689 }
3690
3691 static void
3692 encode_coding_ucs4(lstream_t encoding, const Bufbyte * src,
3693                    unsigned_char_dynarr * dst, Lstream_data_count n)
3694 {
3695         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3696         unsigned int flags = str->flags;
3697         unsigned int ch = str->ch;
3698         unsigned char char_boundary = str->iso2022.current_char_boundary;
3699         Lisp_Object charset = str->iso2022.current_charset;
3700
3701 #ifdef ENABLE_COMPOSITE_CHARS
3702         /* flags for handling composite chars.  We do a little switcharoo
3703            on the source while we're outputting the composite char. */
3704         unsigned int saved_n = 0;
3705         const unsigned char *saved_src = NULL;
3706         int in_composite = 0;
3707
3708       back_to_square_n:
3709 #endif
3710
3711         while (n--) {
3712                 unsigned char c = *src++;
3713
3714                 if (BYTE_ASCII_P(c)) {  /* Processing ASCII character */
3715                         ch = 0;
3716                         encode_ucs4(Vcharset_ascii, c, 0, dst);
3717                         char_boundary = 1;
3718                 } else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) {   /* Processing Leading Byte */
3719                         ch = 0;
3720                         charset = CHARSET_BY_LEADING_BYTE(c);
3721                         if (LEADING_BYTE_PREFIX_P(c))
3722                                 ch = c;
3723                         char_boundary = 0;
3724                 } else {        /* Processing Non-ASCII character */
3725                         char_boundary = 1;
3726                         if (EQ(charset, Vcharset_control_1)) {
3727                                 encode_ucs4(Vcharset_control_1, c, 0, dst);
3728                         } else {
3729                                 switch (XCHARSET_REP_BYTES(charset)) {
3730                                 case 2:
3731                                         encode_ucs4(charset, c, 0, dst);
3732                                         break;
3733                                 case 3:
3734                                         if (XCHARSET_PRIVATE_P(charset)) {
3735                                                 encode_ucs4(charset, c, 0, dst);
3736                                                 ch = 0;
3737                                         } else if (ch) {
3738 #ifdef ENABLE_COMPOSITE_CHARS
3739                                                 if (EQ
3740                                                     (charset,
3741                                                      Vcharset_composite)) {
3742                                                         if (in_composite) {
3743                                                                 /* #### Bother! We don't know how to
3744                                                                    handle this yet. */
3745                                                                 Dynarr_add(dst,
3746                                                                            '\0');
3747                                                                 Dynarr_add(dst,
3748                                                                            '\0');
3749                                                                 Dynarr_add(dst,
3750                                                                            '\0');
3751                                                                 Dynarr_add(dst,
3752                                                                            '~');
3753                                                         } else {
3754                                                                 Emchar emch =
3755                                                                     MAKE_CHAR
3756                                                                     (Vcharset_composite,
3757                                                                      ch & 0x7F,
3758                                                                      c & 0x7F);
3759                                                                 Lisp_Object lstr
3760                                                                     =
3761                                                                     composite_char_string
3762                                                                     (emch);
3763                                                                 saved_n = n;
3764                                                                 saved_src = src;
3765                                                                 in_composite =
3766                                                                     1;
3767                                                                 src =
3768                                                                     XSTRING_DATA
3769                                                                     (lstr);
3770                                                                 n = XSTRING_LENGTH(lstr);
3771                                                         }
3772                                                 } else
3773 #endif                          /* ENABLE_COMPOSITE_CHARS */
3774                                                 {
3775                                                         encode_ucs4(charset, ch,
3776                                                                     c, dst);
3777                                                 }
3778                                                 ch = 0;
3779                                         } else {
3780                                                 ch = c;
3781                                                 char_boundary = 0;
3782                                         }
3783                                         break;
3784                                 case 4:
3785                                         if (ch) {
3786                                                 encode_ucs4(charset, ch, c,
3787                                                             dst);
3788                                                 ch = 0;
3789                                         } else {
3790                                                 ch = c;
3791                                                 char_boundary = 0;
3792                                         }
3793                                         break;
3794                                 default:
3795                                         abort();
3796                                 }
3797                         }
3798                 }
3799         }
3800
3801 #ifdef ENABLE_COMPOSITE_CHARS
3802         if (in_composite) {
3803                 n = saved_n;
3804                 src = saved_src;
3805                 in_composite = 0;
3806                 goto back_to_square_n;  /* Wheeeeeeeee ..... */
3807         }
3808 #endif                          /* ENABLE_COMPOSITE_CHARS */
3809
3810         str->flags = flags;
3811         str->ch = ch;
3812         str->iso2022.current_char_boundary = char_boundary;
3813         str->iso2022.current_charset = charset;
3814
3815         /* Verbum caro factum est! */
3816 }
3817 \f
3818 /************************************************************************/
3819 /*                           UTF-8 methods                              */
3820 /************************************************************************/
3821
3822 static int
3823 detect_coding_utf8(struct detection_state *st, const Extbyte * src,
3824                    Lstream_data_count n)
3825 {
3826         while (n--) {
3827                 const unsigned char c = *(const unsigned char *)src++;
3828                 switch (st->utf8.in_byte) {
3829                 case 0:
3830                         if (c == ISO_CODE_ESC || c == ISO_CODE_SI
3831                             || c == ISO_CODE_SO)
3832                                 return 0;
3833                         else if (c >= 0xfc)
3834                                 st->utf8.in_byte = 5;
3835                         else if (c >= 0xf8)
3836                                 st->utf8.in_byte = 4;
3837                         else if (c >= 0xf0)
3838                                 st->utf8.in_byte = 3;
3839                         else if (c >= 0xe0)
3840                                 st->utf8.in_byte = 2;
3841                         else if (c >= 0xc0)
3842                                 st->utf8.in_byte = 1;
3843                         else if (c >= 0x80)
3844                                 return 0;
3845                         break;
3846                 default:
3847                         if ((c & 0xc0) != 0x80)
3848                                 return 0;
3849                         else
3850                                 st->utf8.in_byte--;
3851                 }
3852         }
3853         return CODING_CATEGORY_UTF8_MASK;
3854 }
3855
3856 static void
3857 decode_coding_utf8(lstream_t decoding, const Extbyte * src,
3858                    unsigned_char_dynarr * dst, Lstream_data_count n)
3859 {
3860         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3861         unsigned int flags = str->flags;
3862         unsigned int ch = str->ch;
3863         eol_type_t eol_type = str->eol_type;
3864         unsigned char counter = str->counter;
3865
3866         while (n--) {
3867                 const unsigned char c = *(const unsigned char *)src++;
3868                 switch (counter) {
3869                 case 0:
3870                         if (c >= 0xfc) {
3871                                 ch = c & 0x01;
3872                                 counter = 5;
3873                         } else if (c >= 0xf8) {
3874                                 ch = c & 0x03;
3875                                 counter = 4;
3876                         } else if (c >= 0xf0) {
3877                                 ch = c & 0x07;
3878                                 counter = 3;
3879                         } else if (c >= 0xe0) {
3880                                 ch = c & 0x0f;
3881                                 counter = 2;
3882                         } else if (c >= 0xc0) {
3883                                 ch = c & 0x1f;
3884                                 counter = 1;
3885                         } else {
3886                                 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3887                                 decode_ucs4(c, dst);
3888                         }
3889                         break;
3890                 case 1:
3891                         ch = (ch << 6) | (c & 0x3f);
3892                         decode_ucs4(ch, dst);
3893                         ch = 0;
3894                         counter = 0;
3895                         break;
3896                 default:
3897                         ch = (ch << 6) | (c & 0x3f);
3898                         counter--;
3899                 }
3900               label_continue_loop:;
3901         }
3902
3903         if (flags & CODING_STATE_END)
3904                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
3905
3906         str->flags = flags;
3907         str->ch = ch;
3908         str->counter = counter;
3909 }
3910
3911 static void
3912 encode_utf8(Lisp_Object charset,
3913             unsigned char h, unsigned char l, unsigned_char_dynarr * dst)
3914 {
3915         unsigned long code = mule_char_to_ucs4(charset, h, l);
3916         if (code <= 0x7f) {
3917                 Dynarr_add(dst, code);
3918         } else if (code <= 0x7ff) {
3919                 Dynarr_add(dst, (code >> 6) | 0xc0);
3920                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3921         } else if (code <= 0xffff) {
3922                 Dynarr_add(dst, (code >> 12) | 0xe0);
3923                 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3924                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3925         } else if (code <= 0x1fffff) {
3926                 Dynarr_add(dst, (code >> 18) | 0xf0);
3927                 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3928                 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3929                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3930         } else if (code <= 0x3ffffff) {
3931                 Dynarr_add(dst, (code >> 24) | 0xf8);
3932                 Dynarr_add(dst, ((code >> 18) & 0x3f) | 0x80);
3933                 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3934                 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3935                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3936         } else {
3937                 Dynarr_add(dst, (code >> 30) | 0xfc);
3938                 Dynarr_add(dst, ((code >> 24) & 0x3f) | 0x80);
3939                 Dynarr_add(dst, ((code >> 18) & 0x3f) | 0x80);
3940                 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3941                 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3942                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3943         }
3944 }
3945
3946 static void
3947 encode_coding_utf8(lstream_t encoding, const Bufbyte * src,
3948                    unsigned_char_dynarr * dst, Lstream_data_count n)
3949 {
3950         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3951         unsigned int flags = str->flags;
3952         unsigned int ch = str->ch;
3953         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3954         unsigned char char_boundary = str->iso2022.current_char_boundary;
3955         Lisp_Object charset = str->iso2022.current_charset;
3956
3957 #ifdef ENABLE_COMPOSITE_CHARS
3958         /* flags for handling composite chars.  We do a little switcharoo
3959            on the source while we're outputting the composite char. */
3960         unsigned int saved_n = 0;
3961         const unsigned char *saved_src = NULL;
3962         int in_composite = 0;
3963
3964       back_to_square_n:
3965 #endif                          /* ENABLE_COMPOSITE_CHARS */
3966
3967         while (n--) {
3968                 unsigned char c = *src++;
3969
3970                 if (BYTE_ASCII_P(c)) {  /* Processing ASCII character */
3971                         ch = 0;
3972                         if (c == '\n') {
3973                                 if (eol_type != EOL_LF
3974                                     && eol_type != EOL_AUTODETECT)
3975                                         Dynarr_add(dst, '\r');
3976                                 if (eol_type != EOL_CR)
3977                                         Dynarr_add(dst, c);
3978                         } else
3979                                 encode_utf8(Vcharset_ascii, c, 0, dst);
3980                         char_boundary = 1;
3981                 } else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) {   /* Processing Leading Byte */
3982                         ch = 0;
3983                         charset = CHARSET_BY_LEADING_BYTE(c);
3984                         if (LEADING_BYTE_PREFIX_P(c))
3985                                 ch = c;
3986                         char_boundary = 0;
3987                 } else {        /* Processing Non-ASCII character */
3988                         char_boundary = 1;
3989                         if (EQ(charset, Vcharset_control_1)) {
3990                                 encode_utf8(Vcharset_control_1, c, 0, dst);
3991                         } else {
3992                                 switch (XCHARSET_REP_BYTES(charset)) {
3993                                 case 2:
3994                                         encode_utf8(charset, c, 0, dst);
3995                                         break;
3996                                 case 3:
3997                                         if (XCHARSET_PRIVATE_P(charset)) {
3998                                                 encode_utf8(charset, c, 0, dst);
3999                                                 ch = 0;
4000                                         } else if (ch) {
4001 #ifdef ENABLE_COMPOSITE_CHARS
4002                                                 if (EQ
4003                                                     (charset,
4004                                                      Vcharset_composite)) {
4005                                                         if (in_composite) {
4006                                                                 /* #### Bother! We don't know how to
4007                                                                    handle this yet. */
4008                                                                 encode_utf8
4009                                                                     (Vcharset_ascii,
4010                                                                      '~', 0,
4011                                                                      dst);
4012                                                         } else {
4013                                                                 Emchar emch =
4014                                                                     MAKE_CHAR
4015                                                                     (Vcharset_composite,
4016                                                                      ch & 0x7F,
4017                                                                      c & 0x7F);
4018                                                                 Lisp_Object lstr
4019                                                                     =
4020                                                                     composite_char_string
4021                                                                     (emch);
4022                                                                 saved_n = n;
4023                                                                 saved_src = src;
4024                                                                 in_composite =
4025                                                                     1;
4026                                                                 src =
4027                                                                     XSTRING_DATA
4028                                                                     (lstr);
4029                                                                 n = XSTRING_LENGTH(lstr);
4030                                                         }
4031                                                 } else
4032 #endif                          /* ENABLE_COMPOSITE_CHARS */
4033                                                 {
4034                                                         encode_utf8(charset, ch,
4035                                                                     c, dst);
4036                                                 }
4037                                                 ch = 0;
4038                                         } else {
4039                                                 ch = c;
4040                                                 char_boundary = 0;
4041                                         }
4042                                         break;
4043                                 case 4:
4044                                         if (ch) {
4045                                                 encode_utf8(charset, ch, c,
4046                                                             dst);
4047                                                 ch = 0;
4048                                         } else {
4049                                                 ch = c;
4050                                                 char_boundary = 0;
4051                                         }
4052                                         break;
4053                                 default:
4054                                         abort();
4055                                 }
4056                         }
4057                 }
4058         }
4059
4060 #ifdef ENABLE_COMPOSITE_CHARS
4061         if (in_composite) {
4062                 n = saved_n;
4063                 src = saved_src;
4064                 in_composite = 0;
4065                 goto back_to_square_n;  /* Wheeeeeeeee ..... */
4066         }
4067 #endif
4068
4069         str->flags = flags;
4070         str->ch = ch;
4071         str->iso2022.current_char_boundary = char_boundary;
4072         str->iso2022.current_charset = charset;
4073
4074         /* Verbum caro factum est! */
4075 }
4076 \f
4077 /************************************************************************/
4078 /*                           ISO2022 methods                            */
4079 /************************************************************************/
4080
4081 /* The following note describes the coding system ISO2022 briefly.
4082    Since the intention of this note is to help understand the
4083    functions in this file, some parts are NOT ACCURATE or OVERLY
4084    SIMPLIFIED.  For thorough understanding, please refer to the
4085    original document of ISO2022.
4086
4087    ISO2022 provides many mechanisms to encode several character sets
4088    in 7-bit and 8-bit environments.  For 7-bit environments, all text
4089    is encoded using bytes less than 128.  This may make the encoded
4090    text a little bit longer, but the text passes more easily through
4091    several gateways, some of which strip off MSB (Most Signigant Bit).
4092
4093    There are two kinds of character sets: control character set and
4094    graphic character set.  The former contains control characters such
4095    as `newline' and `escape' to provide control functions (control
4096    functions are also provided by escape sequences).  The latter
4097    contains graphic characters such as 'A' and '-'.  Emacs recognizes
4098    two control character sets and many graphic character sets.
4099
4100    Graphic character sets are classified into one of the following
4101    four classes, according to the number of bytes (DIMENSION) and
4102    number of characters in one dimension (CHARS) of the set:
4103    - DIMENSION1_CHARS94
4104    - DIMENSION1_CHARS96
4105    - DIMENSION2_CHARS94
4106    - DIMENSION2_CHARS96
4107
4108    In addition, each character set is assigned an identification tag,
4109    unique for each set, called "final character" (denoted as <F>
4110    hereafter).  The <F> of each character set is decided by ECMA(*)
4111    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
4112    (0x30..0x3F are for private use only).
4113
4114    Note (*): ECMA = European Computer Manufacturers Association
4115
4116    Here are examples of graphic character set [NAME(<F>)]:
4117         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4118         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4119         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4120         o DIMENSION2_CHARS96 -- none for the moment
4121
4122    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4123         C0 [0x00..0x1F] -- control character plane 0
4124         GL [0x20..0x7F] -- graphic character plane 0
4125         C1 [0x80..0x9F] -- control character plane 1
4126         GR [0xA0..0xFF] -- graphic character plane 1
4127
4128    A control character set is directly designated and invoked to C0 or
4129    C1 by an escape sequence.  The most common case is that:
4130    - ISO646's  control character set is designated/invoked to C0, and
4131    - ISO6429's control character set is designated/invoked to C1,
4132    and usually these designations/invocations are omitted in encoded
4133    text.  In a 7-bit environment, only C0 can be used, and a control
4134    character for C1 is encoded by an appropriate escape sequence to
4135    fit into the environment.  All control characters for C1 are
4136    defined to have corresponding escape sequences.
4137
4138    A graphic character set is at first designated to one of four
4139    graphic registers (G0 through G3), then these graphic registers are
4140    invoked to GL or GR.  These designations and invocations can be
4141    done independently.  The most common case is that G0 is invoked to
4142    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
4143    these invocations and designations are omitted in encoded text.
4144    In a 7-bit environment, only GL can be used.
4145
4146    When a graphic character set of CHARS94 is invoked to GL, codes
4147    0x20 and 0x7F of the GL area work as control characters SPACE and
4148    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4149    be used.
4150
4151    There are two ways of invocation: locking-shift and single-shift.
4152    With locking-shift, the invocation lasts until the next different
4153    invocation, whereas with single-shift, the invocation affects the
4154    following character only and doesn't affect the locking-shift
4155    state.  Invocations are done by the following control characters or
4156    escape sequences:
4157
4158    ----------------------------------------------------------------------
4159    abbrev  function                  cntrl escape seq   description
4160    ----------------------------------------------------------------------
4161    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
4162    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
4163    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
4164    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
4165    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
4166    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
4167    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
4168    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
4169    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
4170    ----------------------------------------------------------------------
4171    (*) These are not used by any known coding system.
4172
4173    Control characters for these functions are defined by macros
4174    ISO_CODE_XXX in `coding.h'.
4175
4176    Designations are done by the following escape sequences:
4177    ----------------------------------------------------------------------
4178    escape sequence      description
4179    ----------------------------------------------------------------------
4180    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
4181    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
4182    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
4183    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
4184    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
4185    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
4186    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
4187    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
4188    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
4189    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
4190    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
4191    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
4192    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
4193    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
4194    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
4195    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
4196    ----------------------------------------------------------------------
4197
4198    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4199    of dimension 1, chars 94, and final character <F>, etc...
4200
4201    Note (*): Although these designations are not allowed in ISO2022,
4202    Emacs accepts them on decoding, and produces them on encoding
4203    CHARS96 character sets in a coding system which is characterized as
4204    7-bit environment, non-locking-shift, and non-single-shift.
4205
4206    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4207    '(' can be omitted.  We refer to this as "short-form" hereafter.
4208
4209    Now you may notice that there are a lot of ways for encoding the
4210    same multilingual text in ISO2022.  Actually, there exist many
4211    coding systems such as Compound Text (used in X11's inter client
4212    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4213    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4214    localized platforms), and all of these are variants of ISO2022.
4215
4216    In addition to the above, Emacs handles two more kinds of escape
4217    sequences: ISO6429's direction specification and Emacs' private
4218    sequence for specifying character composition.
4219
4220    ISO6429's direction specification takes the following form:
4221         o CSI ']'      -- end of the current direction
4222         o CSI '0' ']'  -- end of the current direction
4223         o CSI '1' ']'  -- start of left-to-right text
4224         o CSI '2' ']'  -- start of right-to-left text
4225    The control character CSI (0x9B: control sequence introducer) is
4226    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4227
4228    Character composition specification takes the following form:
4229         o ESC '0' -- start character composition
4230         o ESC '1' -- end character composition
4231    Since these are not standard escape sequences of any ISO standard,
4232    their use with these meanings is restricted to Emacs only.  */
4233
4234 static void
4235 reset_iso2022(Lisp_Object coding_system, struct iso2022_decoder *iso)
4236 {
4237         int i;
4238
4239         for (i = 0; i < 4; i++) {
4240                 if (!NILP(coding_system))
4241                         iso->charset[i] =
4242                             XCODING_SYSTEM_ISO2022_INITIAL_CHARSET
4243                             (coding_system, i);
4244                 else
4245                         iso->charset[i] = Qt;
4246                 iso->invalid_designated[i] = 0;
4247         }
4248         iso->esc = ISO_ESC_NOTHING;
4249         iso->esc_bytes_index = 0;
4250         iso->register_left = 0;
4251         iso->register_right = 1;
4252         iso->switched_dir_and_no_valid_charset_yet = 0;
4253         iso->invalid_switch_dir = 0;
4254         iso->output_direction_sequence = 0;
4255         iso->output_literally = 0;
4256 #ifdef ENABLE_COMPOSITE_CHARS
4257         if (iso->composite_chars)
4258                 Dynarr_reset(iso->composite_chars);
4259 #endif
4260 }
4261
4262 static int fit_to_be_escape_quoted(unsigned char c)
4263 {
4264         switch (c) {
4265         case ISO_CODE_ESC:
4266         case ISO_CODE_CSI:
4267         case ISO_CODE_SS2:
4268         case ISO_CODE_SS3:
4269         case ISO_CODE_SO:
4270         case ISO_CODE_SI:
4271                 return 1;
4272
4273         default:
4274                 return 0;
4275         }
4276 }
4277
4278 /* Parse one byte of an ISO2022 escape sequence.
4279    If the result is an invalid escape sequence, return 0 and
4280    do not change anything in STR.  Otherwise, if the result is
4281    an incomplete escape sequence, update ISO2022.ESC and
4282    ISO2022.ESC_BYTES and return -1.  Otherwise, update
4283    all the state variables (but not ISO2022.ESC_BYTES) and
4284    return 1.
4285
4286    If CHECK_INVALID_CHARSETS is non-zero, check for designation
4287    or invocation of an invalid character set and treat that as
4288    an unrecognized escape sequence.
4289
4290    ********************************************************************
4291
4292    #### Strategies for error annotation and coding orthogonalization
4293
4294    We really want to separate out a number of things.  Conceptually,
4295    there is a nested syntax.
4296
4297    At the top level is the ISO 2022 extension syntax, including charset
4298    designation and invocation, and certain auxiliary controls such as the
4299    ISO 6429 direction specification.  These are octet-oriented, with the
4300    single exception (AFAIK) of the "exit Unicode" sequence which uses the
4301    UTF's natural width (1 byte for UTF-7 and UTF-8, 2 bytes for UCS-2 and
4302    UTF-16, and 4 bytes for UCS-4 and UTF-32).  This will be treated as a
4303    (deprecated) special case in Unicode processing.
4304
4305    The middle layer is ISO 2022 character interpretation.  This will depend
4306    on the current state of the ISO 2022 registers, and assembles octets
4307    into the character's internal representation.
4308
4309    The lowest level is translating system control conventions.  At present
4310    this is restricted to newline translation, but one could imagine doing
4311    tab conversion or line wrapping here.  "Escape from Unicode" processing
4312    would be done at this level.
4313
4314    At each level the parser will verify the syntax.  In the case of a
4315    syntax error or warning (such as a redundant escape sequence that affects
4316    no characters), the parser will take some action, typically inserting the
4317    erroneous octets directly into the output and creating an annotation
4318    which can be used by higher level I/O to mark the affected region.
4319
4320    This should make it possible to do something sensible about separating
4321    newline convention processing from character construction, and about
4322    preventing ISO 2022 escape sequences from being recognized
4323    inappropriately.
4324
4325    The basic strategy will be to have octet classification tables, and
4326    switch processing according to the table entry.
4327
4328    It's possible that, by doing the processing with tables of functions or
4329    the like, the parser can be used for both detection and translation. */
4330
4331 static int
4332 parse_iso2022_esc(Lisp_Object codesys, struct iso2022_decoder *iso,
4333                   unsigned char c, unsigned int *flags,
4334                   int check_invalid_charsets)
4335 {
4336         /* (1) If we're at the end of a designation sequence, CS is the
4337            charset being designated and REG is the register to designate
4338            it to.
4339
4340            (2) If we're at the end of a locking-shift sequence, REG is
4341            the register to invoke and HALF (0 == left, 1 == right) is
4342            the half to invoke it into.
4343
4344            (3) If we're at the end of a single-shift sequence, REG is
4345            the register to invoke. */
4346         Lisp_Object cs = Qnil;
4347         int reg, half;
4348
4349         /* NOTE: This code does goto's all over the fucking place.
4350            The reason for this is that we're basically implementing
4351            a state machine here, and hierarchical languages like C
4352            don't really provide a clean way of doing this. */
4353
4354         if (!(*flags & CODING_STATE_ESCAPE))
4355                 /* At beginning of escape sequence; we need to reset our
4356                    escape-state variables. */
4357                 iso->esc = ISO_ESC_NOTHING;
4358
4359         iso->output_literally = 0;
4360         iso->output_direction_sequence = 0;
4361
4362         switch (iso->esc) {
4363         case ISO_ESC_NOTHING:
4364                 iso->esc_bytes_index = 0;
4365                 switch (c) {
4366                 case ISO_CODE_ESC:      /* Start escape sequence */
4367                         *flags |= CODING_STATE_ESCAPE;
4368                         iso->esc = ISO_ESC;
4369                         goto not_done;
4370
4371                 case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
4372                         *flags |= CODING_STATE_ESCAPE;
4373                         iso->esc = ISO_ESC_5_11;
4374                         goto not_done;
4375
4376                 case ISO_CODE_SO:       /* locking shift 1 */
4377                         reg = 1;
4378                         half = 0;
4379                         goto locking_shift;
4380                 case ISO_CODE_SI:       /* locking shift 0 */
4381                         reg = 0;
4382                         half = 0;
4383                         goto locking_shift;
4384
4385                 case ISO_CODE_SS2:      /* single shift */
4386                         reg = 2;
4387                         goto single_shift;
4388                 case ISO_CODE_SS3:      /* single shift */
4389                         reg = 3;
4390                         goto single_shift;
4391
4392                 default:        /* Other control characters */
4393                         return 0;
4394                 }
4395
4396         case ISO_ESC:
4397                 switch (c) {
4398           /**** single shift ****/
4399
4400                 case 'N':       /* single shift 2 */
4401                         reg = 2;
4402                         goto single_shift;
4403                 case 'O':       /* single shift 3 */
4404                         reg = 3;
4405                         goto single_shift;
4406
4407           /**** locking shift ****/
4408
4409                 case '~':       /* locking shift 1 right */
4410                         reg = 1;
4411                         half = 1;
4412                         goto locking_shift;
4413                 case 'n':       /* locking shift 2 */
4414                         reg = 2;
4415                         half = 0;
4416                         goto locking_shift;
4417                 case '}':       /* locking shift 2 right */
4418                         reg = 2;
4419                         half = 1;
4420                         goto locking_shift;
4421                 case 'o':       /* locking shift 3 */
4422                         reg = 3;
4423                         half = 0;
4424                         goto locking_shift;
4425                 case '|':       /* locking shift 3 right */
4426                         reg = 3;
4427                         half = 1;
4428                         goto locking_shift;
4429
4430 #ifdef ENABLE_COMPOSITE_CHARS
4431           /**** composite ****/
4432
4433                 case '0':
4434                         iso->esc = ISO_ESC_START_COMPOSITE;
4435                         *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4436                             CODING_STATE_COMPOSITE;
4437                         return 1;
4438
4439                 case '1':
4440                         iso->esc = ISO_ESC_END_COMPOSITE;
4441                         *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4442                             ~CODING_STATE_COMPOSITE;
4443                         return 1;
4444 #endif                          /* ENABLE_COMPOSITE_CHARS */
4445
4446           /**** directionality ****/
4447
4448                 case '[':
4449                         iso->esc = ISO_ESC_5_11;
4450                         goto not_done;
4451
4452           /**** designation ****/
4453
4454                 case '$':       /* multibyte charset prefix */
4455                         iso->esc = ISO_ESC_2_4;
4456                         goto not_done;
4457
4458                 default:
4459                         if (0x28 <= c && c <= 0x2F) {
4460                                 iso->esc =
4461                                     (enum iso_esc_flag)(c - 0x28 + ISO_ESC_2_8);
4462                                 goto not_done;
4463                         }
4464
4465                         /* This function is called with CODESYS equal to nil when
4466                            doing coding-system detection. */
4467                         if (!NILP(codesys)
4468                             && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
4469                             && fit_to_be_escape_quoted(c)) {
4470                                 iso->esc = ISO_ESC_LITERAL;
4471                                 *flags &= CODING_STATE_ISO2022_LOCK;
4472                                 return 1;
4473                         }
4474
4475                         /* bzzzt! */
4476                         return 0;
4477                 }
4478
4479       /**** directionality ****/
4480
4481         case ISO_ESC_5_11:      /* ISO6429 direction control */
4482                 if (c == ']') {
4483                         *flags &=
4484                             (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4485                         goto directionality;
4486                 }
4487                 if (c == '0')
4488                         iso->esc = ISO_ESC_5_11_0;
4489                 else if (c == '1')
4490                         iso->esc = ISO_ESC_5_11_1;
4491                 else if (c == '2')
4492                         iso->esc = ISO_ESC_5_11_2;
4493                 else
4494                         return 0;
4495                 goto not_done;
4496
4497         case ISO_ESC_5_11_0:
4498                 if (c == ']') {
4499                         *flags &=
4500                             (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4501                         goto directionality;
4502                 }
4503                 return 0;
4504
4505         case ISO_ESC_5_11_1:
4506                 if (c == ']') {
4507                         *flags =
4508                             (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4509                         goto directionality;
4510                 }
4511                 return 0;
4512
4513         case ISO_ESC_5_11_2:
4514                 if (c == ']') {
4515                         *flags =
4516                             (*flags & CODING_STATE_ISO2022_LOCK) |
4517                             CODING_STATE_R2L;
4518                         goto directionality;
4519                 }
4520                 return 0;
4521
4522         directionality:
4523                 iso->esc = ISO_ESC_DIRECTIONALITY;
4524                 /* Various junk here to attempt to preserve the direction
4525                    sequences literally in the text if they would otherwise be
4526                    swallowed due to invalid designations that don't show up as
4527                    actual charset changes in the text. */
4528                 if (iso->invalid_switch_dir) {
4529                         /* We already inserted a direction switch literally into
4530                            the text.  We assume (#### this may not be right)
4531                            that the next direction switch is the one going the
4532                            other way, and we need to output that literally as
4533                            well. */
4534                         iso->output_literally = 1;
4535                         iso->invalid_switch_dir = 0;
4536                 } else {
4537                         int jj;
4538
4539                         /* If we are in the thrall of an invalid designation,
4540                            then stick the directionality sequence literally into
4541                            the output stream so it ends up in the original text
4542                            again. */
4543                         for (jj = 0; jj < 4; jj++)
4544                                 if (iso->invalid_designated[jj])
4545                                         break;
4546                         if (jj < 4) {
4547                                 iso->output_literally = 1;
4548                                 iso->invalid_switch_dir = 1;
4549                         } else
4550                                 /* Indicate that we haven't yet seen a valid
4551                                    designation, so that if a switch-dir is
4552                                    directly followed by an invalid designation,
4553                                    both get inserted literally. */
4554                                 iso->switched_dir_and_no_valid_charset_yet = 1;
4555                 }
4556                 return 1;
4557
4558                 /**** designation ****/
4559
4560         case ISO_ESC_2_4:
4561                 if (0x28 <= c && c <= 0x2F) {
4562                         iso->esc =
4563                             (enum iso_esc_flag)(c - 0x28 + ISO_ESC_2_4_8);
4564                         goto not_done;
4565                 }
4566                 if (0x40 <= c && c <= 0x42) {
4567                         cs = CHARSET_BY_ATTRIBUTES(CHARSET_TYPE_94X94, c,
4568                                                    *flags & CODING_STATE_R2L ?
4569                                                    CHARSET_RIGHT_TO_LEFT :
4570                                                    CHARSET_LEFT_TO_RIGHT);
4571                         reg = 0;
4572                         goto designated;
4573                 }
4574                 return 0;
4575
4576                 /* list the rest */
4577         case ISO_ESC_2_8:
4578         case ISO_ESC_2_9:
4579         case ISO_ESC_2_10:
4580         case ISO_ESC_2_11:
4581         case ISO_ESC_2_12:
4582         case ISO_ESC_2_13:
4583         case ISO_ESC_2_14:
4584         case ISO_ESC_2_15:
4585         case ISO_ESC_2_4_8:
4586         case ISO_ESC_2_4_9:
4587         case ISO_ESC_2_4_10:
4588         case ISO_ESC_2_4_11:
4589         case ISO_ESC_2_4_12:
4590         case ISO_ESC_2_4_13:
4591         case ISO_ESC_2_4_14:
4592         case ISO_ESC_2_4_15:
4593         case ISO_ESC_SINGLE_SHIFT:
4594         case ISO_ESC_LOCKING_SHIFT:
4595         case ISO_ESC_DESIGNATE:
4596         case ISO_ESC_DIRECTIONALITY:
4597         case ISO_ESC_LITERAL:
4598
4599         default: {
4600                 int type = -1;
4601
4602                 if (c < '0' || c > '~')
4603                         return 0;       /* bad final byte */
4604
4605                 if (iso->esc >= ISO_ESC_2_8 && iso->esc <= ISO_ESC_2_15) {
4606                         type = ((iso->esc >= ISO_ESC_2_12) ?
4607                                 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4608                         reg = (iso->esc - ISO_ESC_2_8) & 3;
4609                 } else if (iso->esc >= ISO_ESC_2_4_8 &&
4610                            iso->esc <= ISO_ESC_2_4_15) {
4611                         type = ((iso->esc >= ISO_ESC_2_4_12) ?
4612                                 CHARSET_TYPE_96X96 :
4613                                 CHARSET_TYPE_94X94);
4614                         reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4615                 } else {
4616                         /* Can this ever be reached? -slb */
4617                         abort();
4618                         return 0;
4619                 }
4620
4621                 cs = CHARSET_BY_ATTRIBUTES(type, c,
4622                                            *flags & CODING_STATE_R2L ?
4623                                            CHARSET_RIGHT_TO_LEFT :
4624                                            CHARSET_LEFT_TO_RIGHT);
4625                 goto designated;
4626         }
4627         }
4628
4629       not_done:
4630         iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char)c;
4631         return -1;
4632
4633       single_shift:
4634         if (check_invalid_charsets && !CHARSETP(iso->charset[reg]))
4635                 /* can't invoke something that ain't there. */
4636                 return 0;
4637         iso->esc = ISO_ESC_SINGLE_SHIFT;
4638         *flags &= CODING_STATE_ISO2022_LOCK;
4639         if (reg == 2)
4640                 *flags |= CODING_STATE_SS2;
4641         else
4642                 *flags |= CODING_STATE_SS3;
4643         return 1;
4644
4645       locking_shift:
4646         if (check_invalid_charsets && !CHARSETP(iso->charset[reg]))
4647                 /* can't invoke something that ain't there. */
4648                 return 0;
4649         if (half)
4650                 iso->register_right = reg;
4651         else
4652                 iso->register_left = reg;
4653         *flags &= CODING_STATE_ISO2022_LOCK;
4654         iso->esc = ISO_ESC_LOCKING_SHIFT;
4655         return 1;
4656
4657       designated:
4658         if (NILP(cs) && check_invalid_charsets) {
4659                 iso->invalid_designated[reg] = 1;
4660                 iso->charset[reg] = Vcharset_ascii;
4661                 iso->esc = ISO_ESC_DESIGNATE;
4662                 *flags &= CODING_STATE_ISO2022_LOCK;
4663                 iso->output_literally = 1;
4664                 if (iso->switched_dir_and_no_valid_charset_yet) {
4665                         /* We encountered a switch-direction followed by an
4666                            invalid designation.  Ensure that the switch-direction
4667                            gets outputted; otherwise it will probably get eaten
4668                            when the text is written out again. */
4669                         iso->switched_dir_and_no_valid_charset_yet = 0;
4670                         iso->output_direction_sequence = 1;
4671                         /* And make sure that the switch-dir going the other
4672                            way gets outputted, as well. */
4673                         iso->invalid_switch_dir = 1;
4674                 }
4675                 return 1;
4676         }
4677         /* This function is called with CODESYS equal to nil when
4678            doing coding-system detection. */
4679         if (!NILP(codesys)) {
4680                 charset_conversion_spec_dynarr *dyn =
4681                     XCODING_SYSTEM(codesys)->iso2022.input_conv;
4682
4683                 if (dyn) {
4684                         int i;
4685
4686                         for (i = 0; i < Dynarr_length(dyn); i++) {
4687                                 struct charset_conversion_spec *spec =
4688                                     Dynarr_atp(dyn, i);
4689                                 if (EQ(cs, spec->from_charset))
4690                                         cs = spec->to_charset;
4691                         }
4692                 }
4693         }
4694
4695         iso->charset[reg] = cs;
4696         iso->esc = ISO_ESC_DESIGNATE;
4697         *flags &= CODING_STATE_ISO2022_LOCK;
4698         if (iso->invalid_designated[reg]) {
4699                 iso->invalid_designated[reg] = 0;
4700                 iso->output_literally = 1;
4701         }
4702         if (iso->switched_dir_and_no_valid_charset_yet)
4703                 iso->switched_dir_and_no_valid_charset_yet = 0;
4704         return 1;
4705 }
4706
4707 static int
4708 detect_coding_iso2022(struct detection_state *st, const Extbyte * src,
4709                       Lstream_data_count n)
4710 {
4711         int mask;
4712
4713         /* #### There are serious deficiencies in the recognition mechanism
4714            here.  This needs to be much smarter if it's going to cut it.
4715            The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4716            it should be detected as Latin-1.
4717            All the ISO2022 stuff in this file should be synced up with the
4718            code from FSF Emacs-20.4, in which Mule should be more or less stable.
4719            Perhaps we should wait till R2L works in FSF Emacs? */
4720
4721         if (!st->iso2022.initted) {
4722                 reset_iso2022(Qnil, &st->iso2022.iso);
4723                 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4724                                     CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4725                                     CODING_CATEGORY_ISO_8_1_MASK |
4726                                     CODING_CATEGORY_ISO_8_2_MASK |
4727                                     CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4728                 st->iso2022.flags = 0;
4729                 st->iso2022.high_byte_count = 0;
4730                 st->iso2022.saw_single_shift = 0;
4731                 st->iso2022.initted = 1;
4732         }
4733
4734         mask = st->iso2022.mask;
4735
4736         while (n--) {
4737                 const unsigned char c = *(const unsigned char *)src++;
4738                 if (c >= 0xA0) {
4739                         mask &= ~CODING_CATEGORY_ISO_7_MASK;
4740                         st->iso2022.high_byte_count++;
4741                 } else {
4742                         if (st->iso2022.high_byte_count
4743                             && !st->iso2022.saw_single_shift) {
4744                                 if (st->iso2022.high_byte_count & 1)
4745                                         /* odd number of high bytes; assume not iso-8-2 */
4746                                         mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4747                         }
4748                         st->iso2022.high_byte_count = 0;
4749                         st->iso2022.saw_single_shift = 0;
4750                         if (c > 0x80)
4751                                 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4752                 }
4753                 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4754                     && (BYTE_C0_P(c) || BYTE_C1_P(c))) {        /* control chars */
4755                         switch (c) {
4756                                 /* Allow and ignore control characters that you might
4757                                    reasonably see in a text file */
4758                         case '\r':
4759                         case '\n':
4760                         case '\t':
4761                         case 7: /* bell */
4762                         case 8: /* backspace */
4763                         case 11:        /* vertical tab */
4764                         case 12:        /* form feed */
4765                         case 26:        /* MS-DOS C-z junk */
4766                         case 31:        /* '^_' -- for info */
4767                                 goto label_continue_loop;
4768
4769                         default:
4770                                 break;
4771                         }
4772                 }
4773
4774                 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P(c)
4775                     || BYTE_C1_P(c)) {
4776                         if (parse_iso2022_esc(Qnil, &st->iso2022.iso, c,
4777                                               &st->iso2022.flags, 0)) {
4778                                 switch (st->iso2022.iso.esc) {
4779                                 case ISO_ESC_DESIGNATE:
4780                                         mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4781                                         mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4782                                         break;
4783                                 case ISO_ESC_LOCKING_SHIFT:
4784                                         mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4785                                         goto ran_out_of_chars;
4786                                 case ISO_ESC_SINGLE_SHIFT:
4787                                         mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4788                                         st->iso2022.saw_single_shift = 1;
4789                                         break;
4790
4791                                         /* list the rest */
4792                                 case ISO_ESC_NOTHING:
4793                                 case ISO_ESC:
4794                                 case ISO_ESC_2_4:
4795                                 case ISO_ESC_2_8:
4796                                 case ISO_ESC_2_9:
4797                                 case ISO_ESC_2_10:
4798                                 case ISO_ESC_2_11:
4799                                 case ISO_ESC_2_12:
4800                                 case ISO_ESC_2_13:
4801                                 case ISO_ESC_2_14:
4802                                 case ISO_ESC_2_15:
4803                                 case ISO_ESC_2_4_8:
4804                                 case ISO_ESC_2_4_9:
4805                                 case ISO_ESC_2_4_10:
4806                                 case ISO_ESC_2_4_11:
4807                                 case ISO_ESC_2_4_12:
4808                                 case ISO_ESC_2_4_13:
4809                                 case ISO_ESC_2_4_14:
4810                                 case ISO_ESC_2_4_15:
4811                                 case ISO_ESC_5_11:
4812                                 case ISO_ESC_5_11_0:
4813                                 case ISO_ESC_5_11_1:
4814                                 case ISO_ESC_5_11_2:
4815                                 case ISO_ESC_DIRECTIONALITY:
4816                                 case ISO_ESC_LITERAL:
4817                                 default:
4818                                         break;
4819                                 }
4820                         } else {
4821                                 mask = 0;
4822                                 goto ran_out_of_chars;
4823                         }
4824                 }
4825         label_continue_loop:;
4826         }
4827
4828 ran_out_of_chars:
4829         return mask;
4830 }
4831
4832 static int postprocess_iso2022_mask(int mask)
4833 {
4834         /* #### kind of cheesy */
4835         /* If seven-bit ISO is allowed, then assume that the encoding is
4836            entirely seven-bit and turn off the eight-bit ones. */
4837         if (mask & CODING_CATEGORY_ISO_7_MASK)
4838                 mask &= ~(CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4839                           CODING_CATEGORY_ISO_8_1_MASK |
4840                           CODING_CATEGORY_ISO_8_2_MASK);
4841         return mask;
4842 }
4843
4844 /* If FLAGS is a null pointer or specifies right-to-left motion,
4845    output a switch-dir-to-left-to-right sequence to DST.
4846    Also update FLAGS if it is not a null pointer.
4847    If INTERNAL_P is set, we are outputting in internal format and
4848    need to handle the CSI differently. */
4849
4850 static void
4851 restore_left_to_right_direction(Lisp_Coding_System * codesys,
4852                                 unsigned_char_dynarr * dst,
4853                                 unsigned int *flags, int internal_p)
4854 {
4855         if (!flags || (*flags & CODING_STATE_R2L)) {
4856                 if (CODING_SYSTEM_ISO2022_SEVEN(codesys)) {
4857                         Dynarr_add(dst, ISO_CODE_ESC);
4858                         Dynarr_add(dst, '[');
4859                 } else if (internal_p)
4860                         DECODE_ADD_BINARY_CHAR(ISO_CODE_CSI, dst);
4861                 else
4862                         Dynarr_add(dst, ISO_CODE_CSI);
4863                 Dynarr_add(dst, '0');
4864                 Dynarr_add(dst, ']');
4865                 if (flags)
4866                         *flags &= ~CODING_STATE_R2L;
4867         }
4868 }
4869
4870 /* If FLAGS is a null pointer or specifies a direction different from
4871    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4872    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4873    sequence to DST.  Also update FLAGS if it is not a null pointer.
4874    If INTERNAL_P is set, we are outputting in internal format and
4875    need to handle the CSI differently. */
4876
4877 static void
4878 ensure_correct_direction(int direction, Lisp_Coding_System * codesys,
4879                          unsigned_char_dynarr * dst, unsigned int *flags,
4880                          int internal_p)
4881 {
4882         if ((!flags || (*flags & CODING_STATE_R2L)) &&
4883             direction == CHARSET_LEFT_TO_RIGHT)
4884                 restore_left_to_right_direction(codesys, dst, flags,
4885                                                 internal_p);
4886         else if (!CODING_SYSTEM_ISO2022_NO_ISO6429(codesys)
4887                  && (!flags || !(*flags & CODING_STATE_R2L)) &&
4888                  direction == CHARSET_RIGHT_TO_LEFT) {
4889                 if (CODING_SYSTEM_ISO2022_SEVEN(codesys)) {
4890                         Dynarr_add(dst, ISO_CODE_ESC);
4891                         Dynarr_add(dst, '[');
4892                 } else if (internal_p)
4893                         DECODE_ADD_BINARY_CHAR(ISO_CODE_CSI, dst);
4894                 else
4895                         Dynarr_add(dst, ISO_CODE_CSI);
4896                 Dynarr_add(dst, '2');
4897                 Dynarr_add(dst, ']');
4898                 if (flags)
4899                         *flags |= CODING_STATE_R2L;
4900         }
4901 }
4902
4903 /* Convert ISO2022-format data to internal format. */
4904
4905 static void
4906 decode_coding_iso2022(lstream_t decoding, const Extbyte * src,
4907                       unsigned_char_dynarr * dst, Lstream_data_count n)
4908 {
4909         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
4910         unsigned int flags = str->flags;
4911         unsigned int ch = str->ch;
4912         eol_type_t eol_type = str->eol_type;
4913 #ifdef ENABLE_COMPOSITE_CHARS
4914         unsigned_char_dynarr *real_dst = dst;
4915 #endif
4916         Lisp_Object coding_system;
4917
4918         XSETCODING_SYSTEM(coding_system, str->codesys);
4919
4920 #ifdef ENABLE_COMPOSITE_CHARS
4921         if (flags & CODING_STATE_COMPOSITE)
4922                 dst = str->iso2022.composite_chars;
4923 #endif                          /* ENABLE_COMPOSITE_CHARS */
4924
4925         while (n--) {
4926                 const unsigned char c = *(const unsigned char *)src++;
4927                 if (flags & CODING_STATE_ESCAPE) {
4928                         /* Within ESC sequence */
4929                         int retval = parse_iso2022_esc(
4930                                 coding_system, &str->iso2022, c, &flags, 1);
4931
4932                         if (retval) {
4933                                 switch (str->iso2022.esc) {
4934 #ifdef ENABLE_COMPOSITE_CHARS
4935                                 case ISO_ESC_START_COMPOSITE:
4936                                         if (str->iso2022.composite_chars)
4937                                                 Dynarr_reset(str->iso2022.
4938                                                              composite_chars);
4939                                         else
4940                                                 str->iso2022.composite_chars =
4941                                                     Dynarr_new(unsigned_char);
4942                                         dst = str->iso2022.composite_chars;
4943                                         break;
4944                                 case ISO_ESC_END_COMPOSITE:
4945                                         {
4946                                                 Bufbyte comstr[MAX_EMCHAR_LEN];
4947                                                 Bytecount len;
4948                                                 Emchar emch =
4949                                                     lookup_composite_char
4950                                                     (Dynarr_atp(dst, 0),
4951                                                      Dynarr_length(dst));
4952                                                 dst = real_dst;
4953                                                 len =
4954                                                     set_charptr_emchar(comstr,
4955                                                                        emch);
4956                                                 Dynarr_add_many(dst, comstr,
4957                                                                 len);
4958                                                 break;
4959                                         }
4960 #endif                          /* ENABLE_COMPOSITE_CHARS */
4961
4962                                 case ISO_ESC_LITERAL:
4963                                         DECODE_ADD_BINARY_CHAR(c, dst);
4964                                         break;
4965
4966                                 case ISO_ESC_NOTHING:
4967                                 case ISO_ESC:
4968                                 case ISO_ESC_2_4:
4969                                 case ISO_ESC_2_8:
4970                                 case ISO_ESC_2_9:
4971                                 case ISO_ESC_2_10:
4972                                 case ISO_ESC_2_11:
4973                                 case ISO_ESC_2_12:
4974                                 case ISO_ESC_2_13:
4975                                 case ISO_ESC_2_14:
4976                                 case ISO_ESC_2_15:
4977                                 case ISO_ESC_2_4_8:
4978                                 case ISO_ESC_2_4_9:
4979                                 case ISO_ESC_2_4_10:
4980                                 case ISO_ESC_2_4_11:
4981                                 case ISO_ESC_2_4_12:
4982                                 case ISO_ESC_2_4_13:
4983                                 case ISO_ESC_2_4_14:
4984                                 case ISO_ESC_2_4_15:
4985                                 case ISO_ESC_5_11:
4986                                 case ISO_ESC_5_11_0:
4987                                 case ISO_ESC_5_11_1:
4988                                 case ISO_ESC_5_11_2:
4989                                 case ISO_ESC_SINGLE_SHIFT:
4990                                 case ISO_ESC_LOCKING_SHIFT:
4991                                 case ISO_ESC_DESIGNATE:
4992                                 case ISO_ESC_DIRECTIONALITY:
4993
4994                                 default:
4995                                         /* Everything else handled already */
4996                                         break;
4997                                 }
4998                         }
4999
5000                         /* Attempted error recovery. */
5001                         if (str->iso2022.output_direction_sequence)
5002                                 ensure_correct_direction(flags &
5003                                                          CODING_STATE_R2L ?
5004                                                          CHARSET_RIGHT_TO_LEFT :
5005                                                          CHARSET_LEFT_TO_RIGHT,
5006                                                          str->codesys, dst, 0,
5007                                                          1);
5008                         /* More error recovery. */
5009                         if (!retval || str->iso2022.output_literally) {
5010                                 /* Output the (possibly invalid) sequence */
5011                                 int i;
5012                                 for (i = 0; i < str->iso2022.esc_bytes_index;
5013                                      i++)
5014                                         DECODE_ADD_BINARY_CHAR(str->iso2022.
5015                                                                esc_bytes[i],
5016                                                                dst);
5017                                 flags &= CODING_STATE_ISO2022_LOCK;
5018                                 if (!retval)
5019                                         n++, src--;     /* Repeat the loop with the same character. */
5020                                 else {
5021                                         /* No sense in reprocessing the final byte of the
5022                                            escape sequence; it could mess things up anyway.
5023                                            Just add it now. */
5024                                         DECODE_ADD_BINARY_CHAR(c, dst);
5025                                 }
5026                         }
5027                         ch = 0;
5028                 } else if (BYTE_C0_P(c) || BYTE_C1_P(c)) {      /* Control characters */
5029
5030           /***** Error-handling *****/
5031
5032                         /* If we were in the middle of a character, dump out the
5033                            partial character. */
5034                         DECODE_OUTPUT_PARTIAL_CHAR(ch);
5035
5036                         /* If we just saw a single-shift character, dump it out.
5037                            This may dump out the wrong sort of single-shift character,
5038                            but least it will give an indication that something went
5039                            wrong. */
5040                         if (flags & CODING_STATE_SS2) {
5041                                 DECODE_ADD_BINARY_CHAR(ISO_CODE_SS2, dst);
5042                                 flags &= ~CODING_STATE_SS2;
5043                         }
5044                         if (flags & CODING_STATE_SS3) {
5045                                 DECODE_ADD_BINARY_CHAR(ISO_CODE_SS3, dst);
5046                                 flags &= ~CODING_STATE_SS3;
5047                         }
5048
5049           /***** Now handle the control characters. *****/
5050
5051                         /* Handle CR/LF */
5052                         DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5053
5054                         flags &= CODING_STATE_ISO2022_LOCK;
5055
5056                         if (!parse_iso2022_esc
5057                             (coding_system, &str->iso2022, c, &flags, 1))
5058                                 DECODE_ADD_BINARY_CHAR(c, dst);
5059                 } else {        /* Graphic characters */
5060                         Lisp_Object charset;
5061                         int lb;
5062                         int reg;
5063
5064                         DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5065
5066                         /* Now determine the charset. */
5067                         reg = ((flags & CODING_STATE_SS2) ? 2
5068                                : (flags & CODING_STATE_SS3) ? 3
5069                                : !BYTE_ASCII_P(c) ? str->iso2022.register_right
5070                                : str->iso2022.register_left);
5071                         charset = str->iso2022.charset[reg];
5072
5073                         /* Error checking: */
5074                         if (!CHARSETP(charset)
5075                             || str->iso2022.invalid_designated[reg]
5076                             ||
5077                             (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5078                              && XCHARSET_CHARS(charset) == 94))
5079                                 /* Mrmph.  We are trying to invoke a register that has no
5080                                    or an invalid charset in it, or trying to add a character
5081                                    outside the range of the charset.  Insert that char literally
5082                                    to preserve it for the output. */
5083                         {
5084                                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5085                                 DECODE_ADD_BINARY_CHAR(c, dst);
5086                         }
5087
5088                         else {
5089                                 /* Things are probably hunky-dorey. */
5090
5091                                 /* Fetch reverse charset, maybe. */
5092                                 if (((flags & CODING_STATE_R2L) &&
5093                                      XCHARSET_DIRECTION(charset) ==
5094                                      CHARSET_LEFT_TO_RIGHT)
5095                                     || (!(flags & CODING_STATE_R2L)
5096                                         && XCHARSET_DIRECTION(charset) ==
5097                                         CHARSET_RIGHT_TO_LEFT)) {
5098                                         Lisp_Object new_charset =
5099                                             XCHARSET_REVERSE_DIRECTION_CHARSET
5100                                             (charset);
5101                                         if (!NILP(new_charset))
5102                                                 charset = new_charset;
5103                                 }
5104
5105                                 lb = XCHARSET_LEADING_BYTE(charset);
5106                                 switch (XCHARSET_REP_BYTES(charset)) {
5107                                 case 1: /* ASCII */
5108                                         DECODE_OUTPUT_PARTIAL_CHAR(ch);
5109                                         Dynarr_add(dst, c & 0x7F);
5110                                         break;
5111
5112                                 case 2: /* one-byte official */
5113                                         DECODE_OUTPUT_PARTIAL_CHAR(ch);
5114                                         Dynarr_add(dst, lb);
5115                                         Dynarr_add(dst, c | 0x80);
5116                                         break;
5117
5118                                 case 3: /* one-byte private or two-byte official */
5119                                         if (XCHARSET_PRIVATE_P(charset)) {
5120                                                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5121                                                 Dynarr_add(dst,
5122                                                            PRE_LEADING_BYTE_PRIVATE_1);
5123                                                 Dynarr_add(dst, lb);
5124                                                 Dynarr_add(dst, c | 0x80);
5125                                         } else {
5126                                                 if (ch) {
5127                                                         Dynarr_add(dst, lb);
5128                                                         Dynarr_add(dst,
5129                                                                    ch | 0x80);
5130                                                         Dynarr_add(dst,
5131                                                                    c | 0x80);
5132                                                         ch = 0;
5133                                                 } else
5134                                                         ch = c;
5135                                         }
5136                                         break;
5137
5138                                 default:        /* two-byte private */
5139                                         if (ch) {
5140                                                 Dynarr_add(dst,
5141                                                            PRE_LEADING_BYTE_PRIVATE_2);
5142                                                 Dynarr_add(dst, lb);
5143                                                 Dynarr_add(dst, ch | 0x80);
5144                                                 Dynarr_add(dst, c | 0x80);
5145                                                 ch = 0;
5146                                         } else
5147                                                 ch = c;
5148                                 }
5149                         }
5150
5151                         if (!ch)
5152                                 flags &= CODING_STATE_ISO2022_LOCK;
5153                 }
5154
5155               label_continue_loop:;
5156         }
5157
5158         if (flags & CODING_STATE_END)
5159                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5160
5161         str->flags = flags;
5162         str->ch = ch;
5163 }
5164
5165 /***** ISO2022 encoder *****/
5166
5167 /* Designate CHARSET into register REG. */
5168
5169 static void
5170 iso2022_designate(Lisp_Object charset, unsigned char reg,
5171                   encoding_stream_t str, unsigned_char_dynarr * dst)
5172 {
5173         static const char inter94[] = "()*+";
5174         static const char inter96[] = ",-./";
5175         unsigned int type;
5176         unsigned char final;
5177         Lisp_Object old_charset = str->iso2022.charset[reg];
5178
5179         str->iso2022.charset[reg] = charset;
5180         if (!CHARSETP(charset))
5181                 /* charset might be an initial nil or t. */
5182                 return;
5183         type = XCHARSET_TYPE(charset);
5184         final = XCHARSET_FINAL(charset);
5185         if (!str->iso2022.force_charset_on_output[reg] &&
5186             CHARSETP(old_charset) &&
5187             XCHARSET_TYPE(old_charset) == type &&
5188             XCHARSET_FINAL(old_charset) == final)
5189                 return;
5190
5191         str->iso2022.force_charset_on_output[reg] = 0;
5192
5193         {
5194                 charset_conversion_spec_dynarr *dyn =
5195                     str->codesys->iso2022.output_conv;
5196
5197                 if (dyn) {
5198                         int i;
5199
5200                         for (i = 0; i < Dynarr_length(dyn); i++) {
5201                                 struct charset_conversion_spec *spec =
5202                                     Dynarr_atp(dyn, i);
5203                                 if (EQ(charset, spec->from_charset))
5204                                         charset = spec->to_charset;
5205                         }
5206                 }
5207         }
5208
5209         Dynarr_add(dst, ISO_CODE_ESC);
5210         switch (type) {
5211         case CHARSET_TYPE_94:
5212                 Dynarr_add(dst, inter94[reg]);
5213                 break;
5214         case CHARSET_TYPE_96:
5215                 Dynarr_add(dst, inter96[reg]);
5216                 break;
5217         case CHARSET_TYPE_94X94:
5218                 Dynarr_add(dst, '$');
5219                 if (reg != 0 || !(CODING_SYSTEM_ISO2022_SHORT(str->codesys))
5220                     || final < '@' || final > 'B')
5221                         Dynarr_add(dst, inter94[reg]);
5222                 break;
5223         case CHARSET_TYPE_96X96:
5224                 Dynarr_add(dst, '$');
5225                 Dynarr_add(dst, inter96[reg]);
5226                 break;
5227         default:
5228                 break;
5229         }
5230         Dynarr_add(dst, final);
5231 }
5232
5233 static void
5234 ensure_normal_shift(encoding_stream_t str, unsigned_char_dynarr * dst)
5235 {
5236         if (str->iso2022.register_left != 0) {
5237                 Dynarr_add(dst, ISO_CODE_SI);
5238                 str->iso2022.register_left = 0;
5239         }
5240 }
5241
5242 static void
5243 ensure_shift_out(encoding_stream_t str, unsigned_char_dynarr * dst)
5244 {
5245         if (str->iso2022.register_left != 1) {
5246                 Dynarr_add(dst, ISO_CODE_SO);
5247                 str->iso2022.register_left = 1;
5248         }
5249 }
5250
5251 /* Convert internally-formatted data to ISO2022 format. */
5252
5253 static void
5254 encode_coding_iso2022(lstream_t encoding, const Bufbyte * src,
5255                       unsigned_char_dynarr * dst, Lstream_data_count n)
5256 {
5257         unsigned char charmask, c;
5258         unsigned char char_boundary;
5259         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
5260         unsigned int flags = str->flags;
5261         unsigned int ch = str->ch;
5262         Lisp_Coding_System *codesys = str->codesys;
5263         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
5264         int i;
5265         Lisp_Object charset;
5266         int half;
5267
5268 #ifdef ENABLE_COMPOSITE_CHARS
5269         /* flags for handling composite chars.  We do a little switcharoo
5270            on the source while we're outputting the composite char. */
5271         unsigned int saved_n = 0;
5272         const unsigned char *saved_src = NULL;
5273         int in_composite = 0;
5274 #endif                          /* ENABLE_COMPOSITE_CHARS */
5275
5276         char_boundary = str->iso2022.current_char_boundary;
5277         charset = str->iso2022.current_charset;
5278         half = str->iso2022.current_half;
5279
5280 #ifdef ENABLE_COMPOSITE_CHARS
5281       back_to_square_n:
5282 #endif
5283         while (n--) {
5284                 c = *src++;
5285
5286                 if (BYTE_ASCII_P(c)) {  /* Processing ASCII character */
5287                         ch = 0;
5288
5289                         restore_left_to_right_direction(codesys, dst, &flags,
5290                                                         0);
5291
5292                         /* Make sure G0 contains ASCII */
5293                         if ((c > ' ' && c < ISO_CODE_DEL) ||
5294                             !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL(codesys)) {
5295                                 ensure_normal_shift(str, dst);
5296                                 iso2022_designate(Vcharset_ascii, 0, str, dst);
5297                         }
5298
5299                         /* If necessary, restore everything to the default state
5300                            at end-of-line */
5301                         if (c == '\n' &&
5302                             !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL(codesys))) {
5303                                 restore_left_to_right_direction(codesys, dst,
5304                                                                 &flags, 0);
5305
5306                                 ensure_normal_shift(str, dst);
5307
5308                                 for (i = 0; i < 4; i++) {
5309                                         Lisp_Object initial_charset =
5310                                             CODING_SYSTEM_ISO2022_INITIAL_CHARSET
5311                                             (codesys, i);
5312                                         iso2022_designate(initial_charset, i,
5313                                                           str, dst);
5314                                 }
5315                         }
5316                         if (c == '\n') {
5317                                 if (eol_type != EOL_LF
5318                                     && eol_type != EOL_AUTODETECT)
5319                                         Dynarr_add(dst, '\r');
5320                                 if (eol_type != EOL_CR)
5321                                         Dynarr_add(dst, c);
5322                         } else {
5323                                 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
5324                                     && fit_to_be_escape_quoted(c))
5325                                         Dynarr_add(dst, ISO_CODE_ESC);
5326                                 Dynarr_add(dst, c);
5327                         }
5328                         char_boundary = 1;
5329                 }
5330
5331                 else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) {     /* Processing Leading Byte */
5332                         ch = 0;
5333                         charset = CHARSET_BY_LEADING_BYTE(c);
5334                         if (LEADING_BYTE_PREFIX_P(c))
5335                                 ch = c;
5336                         else if (!EQ(charset, Vcharset_control_1)
5337 #ifdef ENABLE_COMPOSITE_CHARS
5338                                  && !EQ(charset, Vcharset_composite)
5339 #endif
5340                             ) {
5341                                 int reg;
5342
5343                                 ensure_correct_direction(XCHARSET_DIRECTION
5344                                                          (charset), codesys,
5345                                                          dst, &flags, 0);
5346
5347                                 /* Now determine which register to use. */
5348                                 reg = -1;
5349                                 for (i = 0; i < 4; i++) {
5350                                         if (EQ(charset, str->iso2022.charset[i])
5351                                             || EQ(charset,
5352                                                   CODING_SYSTEM_ISO2022_INITIAL_CHARSET
5353                                                   (codesys, i))) {
5354                                                 reg = i;
5355                                                 break;
5356                                         }
5357                                 }
5358
5359                                 if (reg == -1) {
5360                                         if (XCHARSET_GRAPHIC(charset) != 0) {
5361                                                 if (!NILP
5362                                                     (str->iso2022.charset[1])
5363                                                     &&
5364                                                     (!CODING_SYSTEM_ISO2022_SEVEN
5365                                                      (codesys)
5366                                                      ||
5367                                                      CODING_SYSTEM_ISO2022_LOCK_SHIFT
5368                                                      (codesys)))
5369                                                         reg = 1;
5370                                                 else if (!NILP
5371                                                          (str->iso2022.
5372                                                           charset[2]))
5373                                                         reg = 2;
5374                                                 else if (!NILP
5375                                                          (str->iso2022.
5376                                                           charset[3]))
5377                                                         reg = 3;
5378                                                 else
5379                                                         reg = 0;
5380                                         } else
5381                                                 reg = 0;
5382                                 }
5383
5384                                 iso2022_designate(charset, reg, str, dst);
5385
5386                                 /* Now invoke that register. */
5387                                 switch (reg) {
5388                                 case 0:
5389                                         ensure_normal_shift(str, dst);
5390                                         half = 0;
5391                                         break;
5392
5393                                 case 1:
5394                                         if (CODING_SYSTEM_ISO2022_SEVEN
5395                                             (codesys)) {
5396                                                 ensure_shift_out(str, dst);
5397                                                 half = 0;
5398                                         } else
5399                                                 half = 1;
5400                                         break;
5401
5402                                 case 2:
5403                                         if (CODING_SYSTEM_ISO2022_SEVEN
5404                                             (str->codesys)) {
5405                                                 Dynarr_add(dst, ISO_CODE_ESC);
5406                                                 Dynarr_add(dst, 'N');
5407                                                 half = 0;
5408                                         } else {
5409                                                 Dynarr_add(dst, ISO_CODE_SS2);
5410                                                 half = 1;
5411                                         }
5412                                         break;
5413
5414                                 case 3:
5415                                         if (CODING_SYSTEM_ISO2022_SEVEN
5416                                             (str->codesys)) {
5417                                                 Dynarr_add(dst, ISO_CODE_ESC);
5418                                                 Dynarr_add(dst, 'O');
5419                                                 half = 0;
5420                                         } else {
5421                                                 Dynarr_add(dst, ISO_CODE_SS3);
5422                                                 half = 1;
5423                                         }
5424                                         break;
5425
5426                                 default:
5427                                         abort();
5428                                 }
5429                         }
5430                         char_boundary = 0;
5431                 } else {        /* Processing Non-ASCII character */
5432                         charmask = (half == 0 ? 0x7F : 0xFF);
5433                         char_boundary = 1;
5434                         if (EQ(charset, Vcharset_control_1)) {
5435                                 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
5436                                     && fit_to_be_escape_quoted(c))
5437                                         Dynarr_add(dst, ISO_CODE_ESC);
5438                                 /* you asked for it ... */
5439                                 Dynarr_add(dst, c - 0x20);
5440                         } else {
5441                                 switch (XCHARSET_REP_BYTES(charset)) {
5442                                 case 2:
5443                                         Dynarr_add(dst, c & charmask);
5444                                         break;
5445                                 case 3:
5446                                         if (XCHARSET_PRIVATE_P(charset)) {
5447                                                 Dynarr_add(dst, c & charmask);
5448                                                 ch = 0;
5449                                         } else if (ch) {
5450 #ifdef ENABLE_COMPOSITE_CHARS
5451                                                 if (EQ
5452                                                     (charset,
5453                                                      Vcharset_composite)) {
5454                                                         if (in_composite) {
5455                                                                 /* #### Bother! We don't know how to
5456                                                                    handle this yet. */
5457                                                                 Dynarr_add(dst,
5458                                                                            '~');
5459                                                         } else {
5460                                                                 Emchar emch =
5461                                                                     MAKE_CHAR
5462                                                                     (Vcharset_composite,
5463                                                                      ch & 0x7F,
5464                                                                      c & 0x7F);
5465                                                                 Lisp_Object lstr
5466                                                                     =
5467                                                                     composite_char_string
5468                                                                     (emch);
5469                                                                 saved_n = n;
5470                                                                 saved_src = src;
5471                                                                 in_composite =
5472                                                                     1;
5473                                                                 src =
5474                                                                     XSTRING_DATA
5475                                                                     (lstr);
5476                                                                 n = XSTRING_LENGTH(lstr);
5477                                                                 Dynarr_add(dst,
5478                                                                            ISO_CODE_ESC);
5479                                                                 Dynarr_add(dst, '0');   /* start composing */
5480                                                         }
5481                                                 } else
5482 #endif                          /* ENABLE_COMPOSITE_CHARS */
5483                                                 {
5484                                                         Dynarr_add(dst,
5485                                                                    ch &
5486                                                                    charmask);
5487                                                         Dynarr_add(dst,
5488                                                                    c &
5489                                                                    charmask);
5490                                                 }
5491                                                 ch = 0;
5492                                         } else {
5493                                                 ch = c;
5494                                                 char_boundary = 0;
5495                                         }
5496                                         break;
5497                                 case 4:
5498                                         if (ch) {
5499                                                 Dynarr_add(dst, ch & charmask);
5500                                                 Dynarr_add(dst, c & charmask);
5501                                                 ch = 0;
5502                                         } else {
5503                                                 ch = c;
5504                                                 char_boundary = 0;
5505                                         }
5506                                         break;
5507                                 default:
5508                                         abort();
5509                                 }
5510                         }
5511                 }
5512         }
5513
5514 #ifdef ENABLE_COMPOSITE_CHARS
5515         if (in_composite) {
5516                 n = saved_n;
5517                 src = saved_src;
5518                 in_composite = 0;
5519                 Dynarr_add(dst, ISO_CODE_ESC);
5520                 Dynarr_add(dst, '1');   /* end composing */
5521                 goto back_to_square_n;  /* Wheeeeeeeee ..... */
5522         }
5523 #endif                          /* ENABLE_COMPOSITE_CHARS */
5524
5525         if (char_boundary && flags & CODING_STATE_END) {
5526                 restore_left_to_right_direction(codesys, dst, &flags, 0);
5527                 ensure_normal_shift(str, dst);
5528                 for (i = 0; i < 4; i++) {
5529                         Lisp_Object initial_charset =
5530                             CODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, i);
5531                         iso2022_designate(initial_charset, i, str, dst);
5532                 }
5533         }
5534
5535         str->flags = flags;
5536         str->ch = ch;
5537         str->iso2022.current_char_boundary = char_boundary;
5538         str->iso2022.current_charset = charset;
5539         str->iso2022.current_half = half;
5540
5541         /* Verbum caro factum est! */
5542 }
5543 #endif                          /* MULE */
5544 \f
5545 /************************************************************************/
5546 /*                     No-conversion methods                            */
5547 /************************************************************************/
5548
5549 /* This is used when reading in "binary" files -- i.e. files that may
5550    contain all 256 possible byte values and that are not to be
5551    interpreted as being in any particular decoding. */
5552 static void
5553 decode_coding_no_conversion(lstream_t decoding, const Extbyte * src,
5554                             unsigned_char_dynarr * dst, Lstream_data_count n)
5555 {
5556         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
5557         unsigned int flags = str->flags;
5558         unsigned int ch = str->ch;
5559         eol_type_t eol_type = str->eol_type;
5560
5561         while (n--) {
5562                 const unsigned char c = *(const unsigned char *)src++;
5563
5564                 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5565                 DECODE_ADD_BINARY_CHAR(c, dst);
5566         label_continue_loop:;
5567         }
5568
5569         DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
5570
5571         str->flags = flags;
5572         str->ch = ch;
5573 }
5574
5575 static void
5576 encode_coding_no_conversion(lstream_t encoding, const Bufbyte * src,
5577                             unsigned_char_dynarr * dst, Lstream_data_count n)
5578 {
5579         unsigned char c;
5580         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
5581         unsigned int flags = str->flags;
5582         unsigned int ch = str->ch;
5583         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
5584
5585         while (n--) {
5586                 c = *src++;
5587                 if (c == '\n') {
5588                         if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5589                                 Dynarr_add(dst, '\r');
5590                         if (eol_type != EOL_CR)
5591                                 Dynarr_add(dst, '\n');
5592                         ch = 0;
5593                 } else if (BYTE_ASCII_P(c)) {
5594                         assert(ch == 0);
5595                         Dynarr_add(dst, c);
5596                 } else if (BUFBYTE_LEADING_BYTE_P(c)) {
5597                         assert(ch == 0);
5598                         if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5599                             c == LEADING_BYTE_CONTROL_1)
5600                                 ch = c;
5601                         else
5602                                 Dynarr_add(dst, '~');   /* untranslatable character */
5603                 } else {
5604                         if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5605                                 Dynarr_add(dst, c);
5606                         else if (ch == LEADING_BYTE_CONTROL_1) {
5607                                 assert(c < 0xC0);
5608                                 Dynarr_add(dst, c - 0x20);
5609                         }
5610                         /* else it should be the second or third byte of an
5611                            untranslatable character, so ignore it */
5612                         ch = 0;
5613                 }
5614         }
5615
5616         str->flags = flags;
5617         str->ch = ch;
5618 }
5619 \f
5620 /************************************************************************/
5621 /*                             Initialization                           */
5622 /************************************************************************/
5623
5624 void syms_of_file_coding(void)
5625 {
5626         INIT_LRECORD_IMPLEMENTATION(coding_system);
5627
5628         DEFERROR_STANDARD(Qcoding_system_error, Qio_error);
5629
5630         DEFSUBR(Fcoding_system_p);
5631         DEFSUBR(Ffind_coding_system);
5632         DEFSUBR(Fget_coding_system);
5633         DEFSUBR(Fcoding_system_list);
5634         DEFSUBR(Fcoding_system_name);
5635         DEFSUBR(Fmake_coding_system);
5636         DEFSUBR(Fcopy_coding_system);
5637         DEFSUBR(Fcoding_system_canonical_name_p);
5638         DEFSUBR(Fcoding_system_alias_p);
5639         DEFSUBR(Fcoding_system_aliasee);
5640         DEFSUBR(Fdefine_coding_system_alias);
5641         DEFSUBR(Fsubsidiary_coding_system);
5642
5643         DEFSUBR(Fcoding_system_type);
5644         DEFSUBR(Fcoding_system_doc_string);
5645 #ifdef MULE
5646         DEFSUBR(Fcoding_system_charset);
5647 #endif
5648         DEFSUBR(Fcoding_system_property);
5649
5650         DEFSUBR(Fcoding_category_list);
5651         DEFSUBR(Fset_coding_priority_list);
5652         DEFSUBR(Fcoding_priority_list);
5653         DEFSUBR(Fset_coding_category_system);
5654         DEFSUBR(Fcoding_category_system);
5655
5656         DEFSUBR(Fdetect_coding_region);
5657         DEFSUBR(Fdecode_coding_region);
5658         DEFSUBR(Fencode_coding_region);
5659 #ifdef MULE
5660         DEFSUBR(Fdecode_shift_jis_char);
5661         DEFSUBR(Fencode_shift_jis_char);
5662         DEFSUBR(Fdecode_big5_char);
5663         DEFSUBR(Fencode_big5_char);
5664         DEFSUBR(Fset_ucs_char);
5665         DEFSUBR(Fucs_char);
5666         DEFSUBR(Fset_char_ucs);
5667         DEFSUBR(Fchar_ucs);
5668 #endif                          /* MULE */
5669         defsymbol(&Qcoding_systemp, "coding-system-p");
5670         defsymbol(&Qno_conversion, "no-conversion");
5671         defsymbol(&Qraw_text, "raw-text");
5672 #ifdef MULE
5673         defsymbol(&Qbig5, "big5");
5674         defsymbol(&Qshift_jis, "shift-jis");
5675         defsymbol(&Qucs4, "ucs-4");
5676         defsymbol(&Qutf8, "utf-8");
5677         defsymbol(&Qccl, "ccl");
5678         defsymbol(&Qiso2022, "iso2022");
5679 #endif                          /* MULE */
5680         defsymbol(&Qmnemonic, "mnemonic");
5681         defsymbol(&Qeol_type, "eol-type");
5682         defsymbol(&Qpost_read_conversion, "post-read-conversion");
5683         defsymbol(&Qpre_write_conversion, "pre-write-conversion");
5684
5685         defsymbol(&Qcr, "cr");
5686         defsymbol(&Qlf, "lf");
5687         defsymbol(&Qcrlf, "crlf");
5688         defsymbol(&Qeol_cr, "eol-cr");
5689         defsymbol(&Qeol_lf, "eol-lf");
5690         defsymbol(&Qeol_crlf, "eol-crlf");
5691 #ifdef MULE
5692         defsymbol(&Qcharset_g0, "charset-g0");
5693         defsymbol(&Qcharset_g1, "charset-g1");
5694         defsymbol(&Qcharset_g2, "charset-g2");
5695         defsymbol(&Qcharset_g3, "charset-g3");
5696         defsymbol(&Qforce_g0_on_output, "force-g0-on-output");
5697         defsymbol(&Qforce_g1_on_output, "force-g1-on-output");
5698         defsymbol(&Qforce_g2_on_output, "force-g2-on-output");
5699         defsymbol(&Qforce_g3_on_output, "force-g3-on-output");
5700         defsymbol(&Qno_iso6429, "no-iso6429");
5701         defsymbol(&Qinput_charset_conversion, "input-charset-conversion");
5702         defsymbol(&Qoutput_charset_conversion, "output-charset-conversion");
5703
5704         defsymbol(&Qno_ascii_eol, "no-ascii-eol");
5705         defsymbol(&Qno_ascii_cntl, "no-ascii-cntl");
5706         defsymbol(&Qseven, "seven");
5707         defsymbol(&Qlock_shift, "lock-shift");
5708         defsymbol(&Qescape_quoted, "escape-quoted");
5709 #endif                          /* MULE */
5710         defsymbol(&Qencode, "encode");
5711         defsymbol(&Qdecode, "decode");
5712
5713 #ifdef MULE
5714         defsymbol(&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5715                   "shift-jis");
5716         defsymbol(&coding_category_symbol[CODING_CATEGORY_BIG5], "big5");
5717         defsymbol(&coding_category_symbol[CODING_CATEGORY_UCS4], "ucs-4");
5718         defsymbol(&coding_category_symbol[CODING_CATEGORY_UTF8], "utf-8");
5719         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_7], "iso-7");
5720         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5721                   "iso-8-designate");
5722         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_1], "iso-8-1");
5723         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_2], "iso-8-2");
5724         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5725                   "iso-lock-shift");
5726 #endif                          /* MULE */
5727         defsymbol(&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5728                   "no-conversion");
5729 }
5730
5731 void lstream_type_create_file_coding(void)
5732 {
5733         LSTREAM_HAS_METHOD(decoding, reader);
5734         LSTREAM_HAS_METHOD(decoding, writer);
5735         LSTREAM_HAS_METHOD(decoding, rewinder);
5736         LSTREAM_HAS_METHOD(decoding, seekable_p);
5737         LSTREAM_HAS_METHOD(decoding, flusher);
5738         LSTREAM_HAS_METHOD(decoding, closer);
5739         LSTREAM_HAS_METHOD(decoding, marker);
5740
5741         LSTREAM_HAS_METHOD(encoding, reader);
5742         LSTREAM_HAS_METHOD(encoding, writer);
5743         LSTREAM_HAS_METHOD(encoding, rewinder);
5744         LSTREAM_HAS_METHOD(encoding, seekable_p);
5745         LSTREAM_HAS_METHOD(encoding, flusher);
5746         LSTREAM_HAS_METHOD(encoding, closer);
5747         LSTREAM_HAS_METHOD(encoding, marker);
5748 }
5749
5750 void vars_of_file_coding(void)
5751 {
5752         int i;
5753
5754         fcd = xnew(struct file_coding_dump);
5755         dump_add_root_struct_ptr(&fcd, &fcd_description);
5756
5757         /* Initialize to something reasonable ... */
5758         for (i = 0; i < CODING_CATEGORY_LAST; i++) {
5759                 fcd->coding_category_system[i] = Qnil;
5760                 fcd->coding_category_by_priority[i] = i;
5761         }
5762
5763         Fprovide(intern("file-coding"));
5764
5765         DEFVAR_LISP("keyboard-coding-system", &Vkeyboard_coding_system  /*
5766 Coding system used for TTY keyboard input.
5767 Not used under a windowing system.
5768                                                                          */ );
5769         Vkeyboard_coding_system = Qnil;
5770
5771         DEFVAR_LISP("terminal-coding-system", &Vterminal_coding_system  /*
5772 Coding system used for TTY display output.
5773 Not used under a windowing system.
5774                                                                          */ );
5775         Vterminal_coding_system = Qnil;
5776
5777         DEFVAR_LISP("coding-system-for-read", &Vcoding_system_for_read  /*
5778 Overriding coding system used when reading from a file or process.
5779 You should bind this variable with `let', but do not set it globally.
5780 If this is non-nil, it specifies the coding system that will be used
5781 to decode input on read operations, such as from a file or process.
5782 It overrides `buffer-file-coding-system-for-read',
5783 `insert-file-contents-pre-hook', etc.  Use those variables instead of
5784 this one for permanent changes to the environment.  */ );
5785         Vcoding_system_for_read = Qnil;
5786
5787         DEFVAR_LISP("coding-system-for-write", &Vcoding_system_for_write        /*
5788 Overriding coding system used when writing to a file or process.
5789 You should bind this variable with `let', but do not set it globally.
5790 If this is non-nil, it specifies the coding system that will be used
5791 to encode output for write operations, such as to a file or process.
5792 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
5793 Use those variables instead of this one for permanent changes to the
5794 environment.  */ );
5795         Vcoding_system_for_write = Qnil;
5796
5797         DEFVAR_LISP("file-name-coding-system", &Vfile_name_coding_system        /*
5798 Coding system used to convert pathnames when accessing files.
5799                                                                                  */ );
5800         Vfile_name_coding_system = Qnil;
5801
5802         DEFVAR_BOOL("enable-multibyte-characters", &enable_multibyte_characters /*
5803 Non-nil means the buffer contents are regarded as multi-byte form
5804 of characters, not a binary code.  This affects the display, file I/O,
5805 and behaviors of various editing commands.
5806
5807 Setting this to nil does not do anything.
5808                                                                                  */ );
5809         enable_multibyte_characters = 1;
5810 }
5811
5812 void complex_vars_of_file_coding(void)
5813 {
5814         staticpro(&Vcoding_system_hash_table);
5815         Vcoding_system_hash_table =
5816             make_lisp_hash_table(50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5817
5818         the_codesys_prop_dynarr = Dynarr_new(codesys_prop);
5819         dump_add_root_struct_ptr(&the_codesys_prop_dynarr,
5820                                  &codesys_prop_dynarr_description);
5821
5822 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
5823 {                                               \
5824   struct codesys_prop csp;                      \
5825   csp.sym = (Sym);                              \
5826   csp.prop_type = (Prop_Type);                  \
5827   Dynarr_add (the_codesys_prop_dynarr, csp);    \
5828 } while (0)
5829
5830         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qmnemonic);
5831         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_type);
5832         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_cr);
5833         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_crlf);
5834         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_lf);
5835         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5836         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5837 #ifdef MULE
5838         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g0);
5839         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g1);
5840         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g2);
5841         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g3);
5842         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5843         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5844         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5845         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5846         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qshort);
5847         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_ascii_eol);
5848         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5849         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qseven);
5850         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qlock_shift);
5851         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_iso6429);
5852         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qescape_quoted);
5853         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5854         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5855
5856         DEFINE_CODESYS_PROP(CODESYS_PROP_CCL, Qencode);
5857         DEFINE_CODESYS_PROP(CODESYS_PROP_CCL, Qdecode);
5858 #endif                          /* MULE */
5859         /* Need to create this here or we're really screwed. */
5860         Fmake_coding_system
5861             (Qraw_text, Qno_conversion,
5862              build_string
5863              ("Raw text, which means it converts only line-break-codes."),
5864              list2(Qmnemonic, build_string("Raw")));
5865
5866         Fmake_coding_system
5867             (Qbinary, Qno_conversion,
5868              build_string("Binary, which means it does not convert anything."),
5869              list4(Qeol_type, Qlf, Qmnemonic, build_string("Binary")));
5870
5871         Fdefine_coding_system_alias(Qno_conversion, Qraw_text);
5872
5873         Fdefine_coding_system_alias(Qfile_name, Qbinary);
5874
5875         Fdefine_coding_system_alias(Qterminal, Qbinary);
5876         Fdefine_coding_system_alias(Qkeyboard, Qbinary);
5877
5878         /* Need this for bootstrapping */
5879         fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5880             Fget_coding_system(Qraw_text);
5881
5882 #ifdef MULE
5883         {
5884                 size_t i;
5885
5886                 for (i = 0; i < countof(fcd->ucs_to_mule_table); i++)
5887                         fcd->ucs_to_mule_table[i] = Qnil;
5888         }
5889         staticpro(&mule_to_ucs_table);
5890         mule_to_ucs_table = Fmake_char_table(Qgeneric);
5891 #endif                          /* MULE */
5892 }