cgit.sxemacs.org Git - sxemacs/blob - src/mule/file-coding.c

   1 /* Code conversion functions.
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3    Copyright (C) 1995 Sun Microsystems, Inc.
   4
   5 This file is part of SXEmacs
   6
   7 SXEmacs is free software: you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation, either version 3 of the License, or
  10 (at your option) any later version.
  11
  12 SXEmacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  19
  20
  21 /* Synched up with: Mule 2.3.   Not in FSF. */
  22
  23 /* Rewritten by Ben Wing <ben@xemacs.org>. */
  24
  25 #include <config.h>
  26 #include "lisp.h"
  27
  28 #include "buffer.h"
  29 #include "elhash.h"
  30 #include "ui/insdel.h"
  31 #include "lstream.h"
  32 #include "opaque.h"
  33 #ifdef MULE
  34 #include "mule-ccl.h"
  35 #include "chartab.h"
  36 #endif
  37 #include "file-coding.h"
  38
  39 Lisp_Object Qcoding_system_error;
  40
  41 Lisp_Object Vkeyboard_coding_system;
  42 Lisp_Object Vterminal_coding_system;
  43 Lisp_Object Vcoding_system_for_read;
  44 Lisp_Object Vcoding_system_for_write;
  45 Lisp_Object Vfile_name_coding_system;
  46
  47 /* Table of symbols identifying each coding category. */
  48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
  49
  50 struct file_coding_dump {
  51         /* Coding system currently associated with each coding category. */
  52         Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
  53
  54         /* Table of all coding categories in decreasing order of priority.
  55            This describes a permutation of the possible coding categories. */
  56         int coding_category_by_priority[CODING_CATEGORY_LAST];
  57
  58 #ifdef MULE
  59         Lisp_Object ucs_to_mule_table[65536];
  60 #endif
  61 } *fcd;
  62
  63 static const struct lrecord_description fcd_description_1[] = {
  64         {XD_LISP_OBJECT_ARRAY,
  65          offsetof(struct file_coding_dump, coding_category_system),
  66          CODING_CATEGORY_LAST},
  67 #ifdef MULE
  68         {XD_LISP_OBJECT_ARRAY,
  69          offsetof(struct file_coding_dump, ucs_to_mule_table),
  70          countof(fcd->ucs_to_mule_table)},
  71 #endif
  72         {XD_END}
  73 };
  74
  75 static const struct struct_description fcd_description = {
  76         sizeof(struct file_coding_dump),
  77         fcd_description_1
  78 };
  79
  80 Lisp_Object mule_to_ucs_table;
  81
  82 Lisp_Object Qcoding_systemp;
  83
  84 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
  85 /* Qinternal in general.c */
  86
  87 Lisp_Object Qmnemonic, Qeol_type;
  88 Lisp_Object Qcr, Qcrlf, Qlf;
  89 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
  90 Lisp_Object Qpost_read_conversion;
  91 Lisp_Object Qpre_write_conversion;
  92
  93 #ifdef MULE
  94 Lisp_Object Qucs4, Qutf8;
  95 Lisp_Object Qbig5, Qshift_jis;
  96 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
  97 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
  98 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
  99 Lisp_Object Qno_iso6429;
 100 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
 101 Lisp_Object Qescape_quoted;
 102 Lisp_Object Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
 103 #endif
 104 Lisp_Object Qencode, Qdecode;
 105
 106 Lisp_Object Vcoding_system_hash_table;
 107
 108 int enable_multibyte_characters;
 109
 110 #ifdef MULE
 111 /* Additional information used by the ISO2022 decoder and detector. */
 112 struct iso2022_decoder {
 113         /* CHARSET holds the character sets currently assigned to the G0
 114            through G3 variables.  It is initialized from the array
 115            INITIAL_CHARSET in CODESYS. */
 116         Lisp_Object charset[4];
 117
 118         /* Which registers are currently invoked into the left (GL) and
 119            right (GR) halves of the 8-bit encoding space? */
 120         int register_left, register_right;
 121
 122         /* ISO_ESC holds a value indicating part of an escape sequence
 123            that has already been seen. */
 124         enum iso_esc_flag esc;
 125
 126         /* This records the bytes we've seen so far in an escape sequence,
 127            in case the sequence is invalid (we spit out the bytes unchanged). */
 128         unsigned char esc_bytes[8];
 129
 130         /* Index for next byte to store in ISO escape sequence. */
 131         int esc_bytes_index;
 132
 133 #ifdef ENABLE_COMPOSITE_CHARS
 134         /* Stuff seen so far when composing a string. */
 135         unsigned_char_dynarr *composite_chars;
 136 #endif
 137
 138         /* If we saw an invalid designation sequence for a particular
 139            register, we flag it here and switch to ASCII.  The next time we
 140            see a valid designation for this register, we turn off the flag
 141            and do the designation normally, but pretend the sequence was
 142            invalid.  The effect of all this is that (most of the time) the
 143            escape sequences for both the switch to the unknown charset, and
 144            the switch back to the known charset, get inserted literally into
 145            the buffer and saved out as such.  The hope is that we can
 146            preserve the escape sequences so that the resulting written out
 147            file makes sense.  If we don't do any of this, the designation
 148            to the invalid charset will be preserved but that switch back
 149            to the known charset will probably get eaten because it was
 150            the same charset that was already present in the register. */
 151         unsigned char invalid_designated[4];
 152
 153         /* We try to do similar things as above for direction-switching
 154            sequences.  If we encountered a direction switch while an
 155            invalid designation was present, or an invalid designation
 156            just after a direction switch (i.e. no valid designation
 157            encountered yet), we insert the direction-switch escape
 158            sequence literally into the output stream, and later on
 159            insert the corresponding direction-restoring escape sequence
 160            literally also. */
 161         unsigned int switched_dir_and_no_valid_charset_yet:1;
 162         unsigned int invalid_switch_dir:1;
 163
 164         /* Tells the decoder to output the escape sequence literally
 165            even though it was valid.  Used in the games we play to
 166            avoid lossage when we encounter invalid designations. */
 167         unsigned int output_literally:1;
 168         /* We encountered a direction switch followed by an invalid
 169            designation.  We didn't output the direction switch
 170            literally because we didn't know about the invalid designation;
 171            but we have to do so now. */
 172         unsigned int output_direction_sequence:1;
 173 };
 174 #endif                          /* MULE */
 175 EXFUN(Fcopy_coding_system, 2);
 176 #ifdef MULE
 177 struct detection_state;
 178 static int detect_coding_sjis(struct detection_state *st,
 179                               const Extbyte * src, Lstream_data_count n);
 180 static void decode_coding_sjis(lstream_t decoding, const Extbyte * src,
 181                                unsigned_char_dynarr * dst,
 182                                Lstream_data_count n);
 183 static void encode_coding_sjis(lstream_t encoding, const Bufbyte * src,
 184                                unsigned_char_dynarr * dst,
 185                                Lstream_data_count n);
 186 static int detect_coding_big5(struct detection_state *st, const Extbyte * src,
 187                               Lstream_data_count n);
 188 static void decode_coding_big5(lstream_t decoding, const Extbyte * src,
 189                                unsigned_char_dynarr * dst,
 190                                Lstream_data_count n);
 191 static void encode_coding_big5(lstream_t encoding, const Bufbyte * src,
 192                                unsigned_char_dynarr * dst,
 193                                Lstream_data_count n);
 194 static int detect_coding_ucs4(struct detection_state *st, const Extbyte * src,
 195                               Lstream_data_count n);
 196 static void decode_coding_ucs4(lstream_t decoding, const Extbyte * src,
 197                                unsigned_char_dynarr * dst,
 198                                Lstream_data_count n);
 199 static void encode_coding_ucs4(lstream_t encoding, const Bufbyte * src,
 200                                unsigned_char_dynarr * dst,
 201                                Lstream_data_count n);
 202 static int detect_coding_utf8(struct detection_state *st, const Extbyte * src,
 203                               Lstream_data_count n);
 204 static void decode_coding_utf8(lstream_t decoding, const Extbyte * src,
 205                                unsigned_char_dynarr * dst,
 206                                Lstream_data_count n);
 207 static void encode_coding_utf8(lstream_t encoding, const Bufbyte * src,
 208                                unsigned_char_dynarr * dst,
 209                                Lstream_data_count n);
 210 static int postprocess_iso2022_mask(int mask);
 211 static void reset_iso2022(Lisp_Object coding_system,
 212                           struct iso2022_decoder *iso);
 213 static int detect_coding_iso2022(struct detection_state *st,
 214                                  const Extbyte * src, Lstream_data_count n);
 215 static void decode_coding_iso2022(lstream_t decoding, const Extbyte * src,
 216                                   unsigned_char_dynarr * dst,
 217                                   Lstream_data_count n);
 218 static void encode_coding_iso2022(lstream_t encoding, const Bufbyte * src,
 219                                   unsigned_char_dynarr * dst,
 220                                   Lstream_data_count n);
 221 #endif                          /* MULE */
 222 static void decode_coding_no_conversion(lstream_t decoding, const Extbyte * src,
 223                                         unsigned_char_dynarr * dst,
 224                                         Lstream_data_count n);
 225 static void encode_coding_no_conversion(lstream_t encoding, const Bufbyte * src,
 226                                         unsigned_char_dynarr * dst,
 227                                         Lstream_data_count n);
 228 static void mule_decode(lstream_t decoding, const Extbyte * src,
 229                         unsigned_char_dynarr * dst, Lstream_data_count n);
 230 static void mule_encode(lstream_t encoding, const Bufbyte * src,
 231                         unsigned_char_dynarr * dst, Lstream_data_count n);
 232
 233 typedef struct codesys_prop codesys_prop;
 234 struct codesys_prop {
 235         Lisp_Object sym;
 236         int prop_type;
 237 };
 238
 239 typedef struct {
 240         Dynarr_declare(codesys_prop);
 241 } codesys_prop_dynarr;
 242
 243 static const struct lrecord_description codesys_prop_description_1[] = {
 244         {XD_LISP_OBJECT, offsetof(codesys_prop, sym)},
 245         {XD_END}
 246 };
 247
 248 static const struct struct_description codesys_prop_description = {
 249         sizeof(codesys_prop),
 250         codesys_prop_description_1
 251 };
 252
 253 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
 254         XD_DYNARR_DESC(codesys_prop_dynarr, &codesys_prop_description),
 255         {XD_END}
 256 };
 257
 258 static const struct struct_description codesys_prop_dynarr_description = {
 259         sizeof(codesys_prop_dynarr),
 260         codesys_prop_dynarr_description_1
 261 };
 262
 263 codesys_prop_dynarr *the_codesys_prop_dynarr;
 264
 265 enum codesys_prop_enum {
 266         CODESYS_PROP_ALL_OK,
 267         CODESYS_PROP_ISO2022,
 268         CODESYS_PROP_CCL
 269 };
 270 \f
 271 /************************************************************************/
 272 /*                       Coding system functions                        */
 273 /************************************************************************/
 274
 275 static Lisp_Object mark_coding_system(Lisp_Object);
 276 static void print_coding_system(Lisp_Object, Lisp_Object, int);
 277 static void finalize_coding_system(void *header, int for_disksave);
 278
 279 #ifdef MULE
 280 static const struct lrecord_description ccs_description_1[] = {
 281         {XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset)},
 282         {XD_LISP_OBJECT, offsetof(charset_conversion_spec, to_charset)},
 283         {XD_END}
 284 };
 285
 286 static const struct struct_description ccs_description = {
 287         sizeof(charset_conversion_spec),
 288         ccs_description_1
 289 };
 290
 291 static const struct lrecord_description ccsd_description_1[] = {
 292         XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
 293         {XD_END}
 294 };
 295
 296 static const struct struct_description ccsd_description = {
 297         sizeof(charset_conversion_spec_dynarr),
 298         ccsd_description_1
 299 };
 300 #endif
 301
 302 static const struct lrecord_description coding_system_description[] = {
 303         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, name)},
 304         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, doc_string)},
 305         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, mnemonic)},
 306         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, post_read_conversion)},
 307         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, pre_write_conversion)},
 308         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_lf)},
 309         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_crlf)},
 310         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, eol_cr)},
 311 #ifdef MULE
 312         {XD_LISP_OBJECT_ARRAY,
 313          offsetof(Lisp_Coding_System, iso2022.initial_charset), 4},
 314         {XD_STRUCT_PTR, offsetof(Lisp_Coding_System, iso2022.input_conv), 1,
 315          &ccsd_description},
 316         {XD_STRUCT_PTR, offsetof(Lisp_Coding_System, iso2022.output_conv), 1,
 317          &ccsd_description},
 318         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, ccl.decode)},
 319         {XD_LISP_OBJECT, offsetof(Lisp_Coding_System, ccl.encode)},
 320 #endif
 321         {XD_END}
 322 };
 323
 324 DEFINE_LRECORD_IMPLEMENTATION("coding-system", coding_system,
 325                               mark_coding_system, print_coding_system,
 326                               finalize_coding_system,
 327                               0, 0, coding_system_description,
 328                               Lisp_Coding_System);
 329
 330 static Lisp_Object mark_coding_system(Lisp_Object obj)
 331 {
 332         Lisp_Coding_System *codesys = XCODING_SYSTEM(obj);
 333
 334         mark_object(CODING_SYSTEM_NAME(codesys));
 335         mark_object(CODING_SYSTEM_DOC_STRING(codesys));
 336         mark_object(CODING_SYSTEM_MNEMONIC(codesys));
 337         mark_object(CODING_SYSTEM_EOL_LF(codesys));
 338         mark_object(CODING_SYSTEM_EOL_CRLF(codesys));
 339         mark_object(CODING_SYSTEM_EOL_CR(codesys));
 340
 341         switch (CODING_SYSTEM_TYPE(codesys)) {
 342 #ifdef MULE
 343                 int i;
 344         case CODESYS_ISO2022:
 345                 for (i = 0; i < 4; i++)
 346                         mark_object(CODING_SYSTEM_ISO2022_INITIAL_CHARSET
 347                                     (codesys, i));
 348                 if (codesys->iso2022.input_conv) {
 349                         for (i = 0;
 350                              i < Dynarr_length(codesys->iso2022.input_conv);
 351                              i++) {
 352                                 struct charset_conversion_spec *ccs =
 353                                     Dynarr_atp(codesys->iso2022.input_conv, i);
 354                                 mark_object(ccs->from_charset);
 355                                 mark_object(ccs->to_charset);
 356                         }
 357                 }
 358                 if (codesys->iso2022.output_conv) {
 359                         for (i = 0;
 360                              i < Dynarr_length(codesys->iso2022.output_conv);
 361                              i++) {
 362                                 struct charset_conversion_spec *ccs =
 363                                     Dynarr_atp(codesys->iso2022.output_conv, i);
 364                                 mark_object(ccs->from_charset);
 365                                 mark_object(ccs->to_charset);
 366                         }
 367                 }
 368                 break;
 369
 370         case CODESYS_CCL:
 371                 mark_object(CODING_SYSTEM_CCL_DECODE(codesys));
 372                 mark_object(CODING_SYSTEM_CCL_ENCODE(codesys));
 373                 break;
 374
 375                 /* list the rest of them lot explicitly */
 376         case CODESYS_AUTODETECT:
 377         case CODESYS_SHIFT_JIS:
 378         case CODESYS_BIG5:
 379         case CODESYS_UCS4:
 380         case CODESYS_UTF8:
 381         case CODESYS_NO_CONVERSION:
 382 #ifdef DEBUG_SXEMACS
 383         case CODESYS_INTERNAL:
 384 #endif
 385 #endif                          /* MULE */
 386         default:
 387                 break;
 388         }
 389
 390         mark_object(CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys));
 391         return CODING_SYSTEM_POST_READ_CONVERSION(codesys);
 392 }
 393
 394 static void
 395 print_coding_system(Lisp_Object obj, Lisp_Object printcharfun, int escapeflag)
 396 {
 397         Lisp_Coding_System *c = XCODING_SYSTEM(obj);
 398         if (print_readably)
 399                 error("printing unreadable object #<coding-system 0x%x>",
 400                       c->header.uid);
 401
 402         write_c_string("#<coding-system ", printcharfun);
 403         print_internal(c->name, printcharfun, 1);
 404         write_c_string(">", printcharfun);
 405 }
 406
 407 static void finalize_coding_system(void *header, int for_disksave)
 408 {
 409         Lisp_Coding_System *c = (Lisp_Coding_System *) header;
 410         /* Since coding systems never go away, this function is not
 411            necessary.  But it would be necessary if we changed things
 412            so that coding systems could go away. */
 413         if (!for_disksave) {    /* see comment in lstream.c */
 414                 switch (CODING_SYSTEM_TYPE(c)) {
 415 #ifdef MULE
 416                 case CODESYS_ISO2022:
 417                         if (c->iso2022.input_conv) {
 418                                 Dynarr_free(c->iso2022.input_conv);
 419                                 c->iso2022.input_conv = 0;
 420                         }
 421                         if (c->iso2022.output_conv) {
 422                                 Dynarr_free(c->iso2022.output_conv);
 423                                 c->iso2022.output_conv = 0;
 424                         }
 425                         break;
 426
 427                         /* list the rest of them lot explicitly */
 428                 case CODESYS_AUTODETECT:
 429                 case CODESYS_SHIFT_JIS:
 430                 case CODESYS_BIG5:
 431                 case CODESYS_UCS4:
 432                 case CODESYS_UTF8:
 433                 case CODESYS_CCL:
 434                 case CODESYS_NO_CONVERSION:
 435 #ifdef DEBUG_SXEMACS
 436                 case CODESYS_INTERNAL:
 437 #endif
 438 #endif                          /* MULE */
 439                 default:
 440                         break;
 441                 }
 442         }
 443 }
 444
 445 static eol_type_t symbol_to_eol_type(Lisp_Object symbol)
 446 {
 447         CHECK_SYMBOL(symbol);
 448         if (NILP(symbol))
 449                 return EOL_AUTODETECT;
 450         if (EQ(symbol, Qlf))
 451                 return EOL_LF;
 452         if (EQ(symbol, Qcrlf))
 453                 return EOL_CRLF;
 454         if (EQ(symbol, Qcr))
 455                 return EOL_CR;
 456
 457         signal_simple_error("Unrecognized eol type", symbol);
 458         return EOL_AUTODETECT;  /* not reached */
 459 }
 460
 461 static Lisp_Object eol_type_to_symbol(eol_type_t type)
 462 {
 463         switch (type) {
 464         default:
 465                 abort();
 466                 break;
 467         case EOL_LF:
 468                 return Qlf;
 469         case EOL_CRLF:
 470                 return Qcrlf;
 471         case EOL_CR:
 472                 return Qcr;
 473         case EOL_AUTODETECT:
 474                 return Qnil;
 475         }
 476 }
 477
 478 static void setup_eol_coding_systems(Lisp_Coding_System * codesys)
 479 {
 480         Lisp_Object codesys_obj;
 481         int len = string_length(XSYMBOL(CODING_SYSTEM_NAME(codesys))->name);
 482         char *codesys_name = (char *)alloca(len + 7);
 483         int mlen = -1;
 484         char *codesys_mnemonic = 0;
 485
 486         Lisp_Object codesys_name_sym, sub_codesys_obj;
 487
 488         /* kludge */
 489
 490         XSETCODING_SYSTEM(codesys_obj, codesys);
 491
 492         memcpy(codesys_name,
 493                string_data(XSYMBOL(CODING_SYSTEM_NAME(codesys))->name), len);
 494
 495         if (STRINGP(CODING_SYSTEM_MNEMONIC(codesys))) {
 496                 mlen = XSTRING_LENGTH(CODING_SYSTEM_MNEMONIC(codesys));
 497                 codesys_mnemonic = (char *)alloca(mlen + 7);
 498                 memcpy(codesys_mnemonic,
 499                        XSTRING_DATA(CODING_SYSTEM_MNEMONIC(codesys)), mlen);
 500         }
 501 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
 502   strcpy (codesys_name + len, "-" op_sys);                                      \
 503   if (mlen != -1)                                                               \
 504     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
 505   codesys_name_sym = intern (codesys_name);                                     \
 506   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
 507   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
 508   if (mlen != -1)                                                               \
 509     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
 510       build_string (codesys_mnemonic);                                          \
 511   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
 512 } while (0)
 513
 514         DEFINE_SUB_CODESYS("unix", "", EOL_LF);
 515         DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
 516         DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
 517 }
 518
 519 DEFUN("coding-system-p", Fcoding_system_p, 1, 1, 0,     /*
 520 Return t if OBJECT is a coding system.
 521 A coding system is an object that defines how text containing multiple
 522 character sets is encoded into a stream of (typically 8-bit) bytes.
 523 The coding system is used to decode the stream into a series of
 524 characters (which may be from multiple charsets) when the text is read
 525 from a file or process, and is used to encode the text back into the
 526 same format when it is written out to a file or process.
 527
 528 For example, many ISO2022-compliant coding systems (such as Compound
 529 Text, which is used for inter-client data under the X Window System)
 530 use escape sequences to switch between different charsets -- Japanese
 531 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
 532 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
 533 `make-coding-system' for more information.
 534
 535 Coding systems are normally identified using a symbol, and the
 536 symbol is accepted in place of the actual coding system object whenever
 537 a coding system is called for. (This is similar to how faces work.)
 538 */
 539       (object))
 540 {
 541         return CODING_SYSTEMP(object) ? Qt : Qnil;
 542 }
 543
 544 DEFUN("find-coding-system", Ffind_coding_system, 1, 1, 0,       /*
 545 Retrieve the coding system of the given name.
 546
 547 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
 548 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
 549 If there is no such coding system, nil is returned.  Otherwise the
 550 associated coding system object is returned.
 551 */
 552       (coding_system_or_name))
 553 {
 554         if (NILP(coding_system_or_name))
 555                 coding_system_or_name = Qbinary;
 556         else if (CODING_SYSTEMP(coding_system_or_name))
 557                 return coding_system_or_name;
 558         else
 559                 CHECK_SYMBOL(coding_system_or_name);
 560
 561         while (1) {
 562                 coding_system_or_name =
 563                     Fgethash(coding_system_or_name, Vcoding_system_hash_table,
 564                              Qnil);
 565
 566                 if (CODING_SYSTEMP(coding_system_or_name)
 567                     || NILP(coding_system_or_name))
 568                         return coding_system_or_name;
 569         }
 570 }
 571
 572 DEFUN("get-coding-system", Fget_coding_system, 1, 1, 0, /*
 573 Retrieve the coding system of the given name.
 574 Same as `find-coding-system' except that if there is no such
 575 coding system, an error is signaled instead of returning nil.
 576 */
 577       (name))
 578 {
 579         Lisp_Object coding_system = Ffind_coding_system(name);
 580
 581         if (NILP(coding_system))
 582                 signal_simple_error("No such coding system", name);
 583         return coding_system;
 584 }
 585
 586 /* We store the coding systems in hash tables with the names as the key and the
 587    actual coding system object as the value.  Occasionally we need to use them
 588    in a list format.  These routines provide us with that. */
 589 struct coding_system_list_closure {
 590         Lisp_Object *coding_system_list;
 591 };
 592
 593 static int
 594 add_coding_system_to_list_mapper(Lisp_Object key, Lisp_Object value,
 595                                  void *coding_system_list_closure)
 596 {
 597         /* This function can GC */
 598         struct coding_system_list_closure *cscl =
 599             (struct coding_system_list_closure *)coding_system_list_closure;
 600         Lisp_Object *coding_system_list = cscl->coding_system_list;
 601
 602         *coding_system_list = Fcons(key, *coding_system_list);
 603         return 0;
 604 }
 605
 606 DEFUN("coding-system-list", Fcoding_system_list, 0, 0, 0,       /*
 607 Return a list of the names of all defined coding systems.
 608 */
 609       ())
 610 {
 611         Lisp_Object coding_system_list = Qnil;
 612         struct gcpro gcpro1;
 613         struct coding_system_list_closure coding_system_list_closure;
 614
 615         GCPRO1(coding_system_list);
 616         coding_system_list_closure.coding_system_list = &coding_system_list;
 617         elisp_maphash(add_coding_system_to_list_mapper,
 618                       Vcoding_system_hash_table, &coding_system_list_closure);
 619         UNGCPRO;
 620
 621         return coding_system_list;
 622 }
 623
 624 DEFUN("coding-system-name", Fcoding_system_name, 1, 1, 0,       /*
 625 Return the name of the given coding system.
 626 */
 627       (coding_system))
 628 {
 629         coding_system = Fget_coding_system(coding_system);
 630         return XCODING_SYSTEM_NAME(coding_system);
 631 }
 632
 633 static Lisp_Coding_System *allocate_coding_system(enum coding_system_type type,
 634                                                   Lisp_Object name)
 635 {
 636         Lisp_Coding_System *codesys =
 637             alloc_lcrecord_type(Lisp_Coding_System, &lrecord_coding_system);
 638
 639         zero_lcrecord(codesys);
 640         CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys) = Qnil;
 641         CODING_SYSTEM_POST_READ_CONVERSION(codesys) = Qnil;
 642         CODING_SYSTEM_EOL_TYPE(codesys) = EOL_AUTODETECT;
 643         CODING_SYSTEM_EOL_CRLF(codesys) = Qnil;
 644         CODING_SYSTEM_EOL_CR(codesys) = Qnil;
 645         CODING_SYSTEM_EOL_LF(codesys) = Qnil;
 646         CODING_SYSTEM_TYPE(codesys) = type;
 647         CODING_SYSTEM_MNEMONIC(codesys) = Qnil;
 648 #ifdef MULE
 649         if (type == CODESYS_ISO2022) {
 650                 int i;
 651                 for (i = 0; i < 4; i++)
 652                         CODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, i) =
 653                             Qnil;
 654         } else if (type == CODESYS_CCL) {
 655                 CODING_SYSTEM_CCL_DECODE(codesys) = Qnil;
 656                 CODING_SYSTEM_CCL_ENCODE(codesys) = Qnil;
 657         }
 658 #endif                          /* MULE */
 659         CODING_SYSTEM_NAME(codesys) = name;
 660
 661         return codesys;
 662 }
 663
 664 #ifdef MULE
 665 /* Given a list of charset conversion specs as specified in a Lisp
 666    program, parse it into STORE_HERE. */
 667
 668 static void
 669 parse_charset_conversion_specs(charset_conversion_spec_dynarr * store_here,
 670                                Lisp_Object spec_list)
 671 {
 672         Lisp_Object rest;
 673
 674         EXTERNAL_LIST_LOOP(rest, spec_list) {
 675                 Lisp_Object car = XCAR(rest);
 676                 Lisp_Object from, to;
 677                 struct charset_conversion_spec spec;
 678
 679                 if (!CONSP(car) || !CONSP(XCDR(car)) || !NILP(XCDR(XCDR(car))))
 680                         signal_simple_error("Invalid charset conversion spec",
 681                                             car);
 682                 from = Fget_charset(XCAR(car));
 683                 to = Fget_charset(XCAR(XCDR(car)));
 684                 if (XCHARSET_TYPE(from) != XCHARSET_TYPE(to))
 685                         signal_simple_error_2
 686                             ("Attempted conversion between different charset types",
 687                              from, to);
 688                 spec.from_charset = from;
 689                 spec.to_charset = to;
 690
 691                 Dynarr_add(store_here, spec);
 692         }
 693 }
 694
 695 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
 696    specs, return the equivalent as the Lisp programmer would see it.
 697
 698    If LOAD_HERE is 0, return Qnil. */
 699
 700 static Lisp_Object
 701 unparse_charset_conversion_specs(charset_conversion_spec_dynarr * load_here)
 702 {
 703         int i;
 704         Lisp_Object result;
 705
 706         if (!load_here)
 707                 return Qnil;
 708         for (i = 0, result = Qnil; i < Dynarr_length(load_here); i++) {
 709                 struct charset_conversion_spec *ccs = Dynarr_atp(load_here, i);
 710                 result =
 711                     Fcons(list2(ccs->from_charset, ccs->to_charset), result);
 712         }
 713
 714         return Fnreverse(result);
 715 }
 716
 717 #endif                          /* MULE */
 718
 719 DEFUN("make-coding-system", Fmake_coding_system, 2, 4, 0,       /*
 720 Register symbol NAME as a coding system.
 721
 722 TYPE describes the conversion method used and should be one of
 723
 724 nil or 'undecided
 725 Automatic conversion.  SXEmacs attempts to detect the coding system
 726 used in the file.
 727 'no-conversion
 728 No conversion.  Use this for binary files and such.  On output,
 729 graphic characters that are not in ASCII or Latin-1 will be
 730 replaced by a ?. (For a no-conversion-encoded buffer, these
 731 characters will only be present if you explicitly insert them.)
 732 'shift-jis
 733 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
 734 'ucs-4
 735 ISO 10646 UCS-4 encoding.
 736 'utf-8
 737 ISO 10646 UTF-8 encoding.
 738 'iso2022
 739 Any ISO2022-compliant encoding.  Among other things, this includes
 740 JIS (the Japanese encoding commonly used for e-mail), EUC (the
 741 standard Unix encoding for Japanese and other languages), and
 742 Compound Text (the encoding used in X11).  You can specify more
 743 specific information about the conversion with the PROPS argument.
 744 'big5
 745 Big5 (the encoding commonly used for Taiwanese).
 746 'ccl
 747 The conversion is performed using a user-written pseudo-code
 748 program.  CCL (Code Conversion Language) is the name of this
 749 pseudo-code.
 750 'internal
 751 Write out or read in the raw contents of the memory representing
 752 the buffer's text.  This is primarily useful for debugging
 753 purposes, and is only enabled when SXEmacs has been compiled with
 754 DEBUG_SXEMACS defined (via the --debug configure option).
 755 WARNING: Reading in a file using 'internal conversion can result
 756 in an internal inconsistency in the memory representing a
 757 buffer's text, which will produce unpredictable results and may
 758 cause SXEmacs to crash.  Under normal circumstances you should
 759 never use 'internal conversion.
 760
 761 DOC-STRING is a string describing the coding system.
 762
 763 PROPS is a property list, describing the specific nature of the
 764 character set.  Recognized properties are:
 765
 766 'mnemonic
 767 String to be displayed in the modeline when this coding system is
 768 active.
 769
 770 'eol-type
 771 End-of-line conversion to be used.  It should be one of
 772
 773 nil
 774 Automatically detect the end-of-line type (LF, CRLF,
 775 or CR).  Also generate subsidiary coding systems named
 776 `NAME-unix', `NAME-dos', and `NAME-mac', that are
 777 identical to this coding system but have an EOL-TYPE
 778 value of 'lf, 'crlf, and 'cr, respectively.
 779 'lf
 780 The end of a line is marked externally using ASCII LF.
 781 Since this is also the way that SXEmacs represents an
 782 end-of-line internally, specifying this option results
 783 in no end-of-line conversion.  This is the standard
 784 format for Unix text files.
 785 'crlf
 786 The end of a line is marked externally using ASCII
 787 CRLF.  This is the standard format for MS-DOS text
 788 files.
 789 'cr
 790 The end of a line is marked externally using ASCII CR.
 791 This is the standard format for Macintosh text files.
 792 t
 793 Automatically detect the end-of-line type but do not
 794 generate subsidiary coding systems.  (This value is
 795 converted to nil when stored internally, and
 796 `coding-system-property' will return nil.)
 797
 798 'post-read-conversion
 799 Function called after a file has been read in, to perform the
 800 decoding.  Called with two arguments, START and END, denoting
 801 a region of the current buffer to be decoded.
 802
 803 'pre-write-conversion
 804 Function called before a file is written out, to perform the
 805 encoding.  Called with two arguments, START and END, denoting
 806 a region of the current buffer to be encoded.
 807
 808 The following additional properties are recognized if TYPE is 'iso2022:
 809
 810 'charset-g0
 811 'charset-g1
 812 'charset-g2
 813 'charset-g3
 814 The character set initially designated to the G0 - G3 registers.
 815 The value should be one of
 816
 817 -- A charset object (designate that character set)
 818 -- nil (do not ever use this register)
 819 -- t (no character set is initially designated to
 820 the register, but may be later on; this automatically
 821 sets the corresponding `force-g*-on-output' property)
 822
 823 'force-g0-on-output
 824 'force-g1-on-output
 825 'force-g2-on-output
 826 'force-g2-on-output
 827 If non-nil, send an explicit designation sequence on output before
 828 using the specified register.
 829
 830 'short
 831 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
 832 "ESC $ B" on output in place of the full designation sequences
 833 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
 834
 835 'no-ascii-eol
 836 If non-nil, don't designate ASCII to G0 at each end of line on output.
 837 Setting this to non-nil also suppresses other state-resetting that
 838 normally happens at the end of a line.
 839
 840 'no-ascii-cntl
 841 If non-nil, don't designate ASCII to G0 before control chars on output.
 842
 843 'seven
 844 If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
 845 environment.
 846
 847 'lock-shift
 848 If non-nil, use locking-shift (SO/SI) instead of single-shift
 849 or designation by escape sequence.
 850
 851 'no-iso6429
 852 If non-nil, don't use ISO6429's direction specification.
 853
 854 'escape-quoted
 855 If non-nil, literal control characters that are the same as
 856 the beginning of a recognized ISO2022 or ISO6429 escape sequence
 857 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
 858 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
 859 so that they can be properly distinguished from an escape sequence.
 860 (Note that doing this results in a non-portable encoding.) This
 861 encoding flag is used for byte-compiled files.  Note that ESC
 862 is a good choice for a quoting character because there are no
 863 escape sequences whose second byte is a character from the Control-0
 864 or Control-1 character sets; this is explicitly disallowed by the
 865 ISO2022 standard.
 866
 867 'input-charset-conversion
 868 A list of conversion specifications, specifying conversion of
 869 characters in one charset to another when decoding is performed.
 870 Each specification is a list of two elements: the source charset,
 871 and the destination charset.
 872
 873 'output-charset-conversion
 874 A list of conversion specifications, specifying conversion of
 875 characters in one charset to another when encoding is performed.
 876 The form of each specification is the same as for
 877 'input-charset-conversion.
 878
 879 The following additional properties are recognized (and required)
 880 if TYPE is 'ccl:
 881
 882 'decode
 883 CCL program used for decoding (converting to internal format).
 884
 885 'encode
 886 CCL program used for encoding (converting to external format).
 887 */
 888       (name, type, doc_string, props))
 889 {
 890         Lisp_Coding_System *codesys;
 891         enum coding_system_type ty;
 892         int need_to_setup_eol_systems = 1;
 893
 894         /* Convert type to constant */
 895         if (NILP(type) || EQ(type, Qundecided)) {
 896                 ty = CODESYS_AUTODETECT;
 897         }
 898 #ifdef MULE
 899         else if (EQ(type, Qshift_jis)) {
 900                 ty = CODESYS_SHIFT_JIS;
 901         } else if (EQ(type, Qiso2022)) {
 902                 ty = CODESYS_ISO2022;
 903         } else if (EQ(type, Qbig5)) {
 904                 ty = CODESYS_BIG5;
 905         } else if (EQ(type, Qucs4)) {
 906                 ty = CODESYS_UCS4;
 907         } else if (EQ(type, Qutf8)) {
 908                 ty = CODESYS_UTF8;
 909         } else if (EQ(type, Qccl)) {
 910                 ty = CODESYS_CCL;
 911         }
 912 #endif
 913         else if (EQ(type, Qno_conversion)) {
 914                 ty = CODESYS_NO_CONVERSION;
 915         }
 916 #ifdef DEBUG_SXEMACS
 917         else if (EQ(type, Qinternal)) {
 918                 ty = CODESYS_INTERNAL;
 919         }
 920 #endif
 921         else
 922                 signal_simple_error("Invalid coding system type", type);
 923
 924         CHECK_SYMBOL(name);
 925
 926         codesys = allocate_coding_system(ty, name);
 927
 928         if (NILP(doc_string))
 929                 doc_string = build_string("");
 930         else
 931                 CHECK_STRING(doc_string);
 932         CODING_SYSTEM_DOC_STRING(codesys) = doc_string;
 933
 934         {
 935                 EXTERNAL_PROPERTY_LIST_LOOP_3(key, value, props) {
 936                         if (EQ(key, Qmnemonic)) {
 937                                 if (!NILP(value))
 938                                         CHECK_STRING(value);
 939                                 CODING_SYSTEM_MNEMONIC(codesys) = value;
 940                         }
 941
 942                         else if (EQ(key, Qeol_type)) {
 943                                 need_to_setup_eol_systems = NILP(value);
 944                                 if (EQ(value, Qt))
 945                                         value = Qnil;
 946                                 CODING_SYSTEM_EOL_TYPE(codesys) =
 947                                     symbol_to_eol_type(value);
 948                         }
 949
 950                         else if (EQ(key, Qpost_read_conversion))
 951                                 CODING_SYSTEM_POST_READ_CONVERSION(codesys) =
 952                                     value;
 953                         else if (EQ(key, Qpre_write_conversion))
 954                                 CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys) =
 955                                     value;
 956 #ifdef MULE
 957                         else if (ty == CODESYS_ISO2022) {
 958 #define FROB_INITIAL_CHARSET(charset_num) \
 959   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
 960     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
 961
 962                                 if (EQ(key, Qcharset_g0))
 963                                         FROB_INITIAL_CHARSET(0);
 964                                 else if (EQ(key, Qcharset_g1))
 965                                         FROB_INITIAL_CHARSET(1);
 966                                 else if (EQ(key, Qcharset_g2))
 967                                         FROB_INITIAL_CHARSET(2);
 968                                 else if (EQ(key, Qcharset_g3))
 969                                         FROB_INITIAL_CHARSET(3);
 970
 971 #define FROB_FORCE_CHARSET(charset_num) \
 972   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
 973
 974                                 else if (EQ(key, Qforce_g0_on_output))
 975                                         FROB_FORCE_CHARSET(0);
 976                                 else if (EQ(key, Qforce_g1_on_output))
 977                                         FROB_FORCE_CHARSET(1);
 978                                 else if (EQ(key, Qforce_g2_on_output))
 979                                         FROB_FORCE_CHARSET(2);
 980                                 else if (EQ(key, Qforce_g3_on_output))
 981                                         FROB_FORCE_CHARSET(3);
 982
 983 #define FROB_BOOLEAN_PROPERTY(prop) \
 984   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
 985
 986                                 else if (EQ(key, Qshort))
 987                                         FROB_BOOLEAN_PROPERTY(SHORT);
 988                                 else if (EQ(key, Qno_ascii_eol))
 989                                         FROB_BOOLEAN_PROPERTY(NO_ASCII_EOL);
 990                                 else if (EQ(key, Qno_ascii_cntl))
 991                                         FROB_BOOLEAN_PROPERTY(NO_ASCII_CNTL);
 992                                 else if (EQ(key, Qseven))
 993                                         FROB_BOOLEAN_PROPERTY(SEVEN);
 994                                 else if (EQ(key, Qlock_shift))
 995                                         FROB_BOOLEAN_PROPERTY(LOCK_SHIFT);
 996                                 else if (EQ(key, Qno_iso6429))
 997                                         FROB_BOOLEAN_PROPERTY(NO_ISO6429);
 998                                 else if (EQ(key, Qescape_quoted))
 999                                         FROB_BOOLEAN_PROPERTY(ESCAPE_QUOTED);
1000
1001                                 else if (EQ(key, Qinput_charset_conversion)) {
1002                                         codesys->iso2022.input_conv =
1003                                             Dynarr_new(charset_conversion_spec);
1004                                         parse_charset_conversion_specs(codesys->
1005                                                                        iso2022.
1006                                                                        input_conv,
1007                                                                        value);
1008                                 } else if (EQ(key, Qoutput_charset_conversion)) {
1009                                         codesys->iso2022.output_conv =
1010                                             Dynarr_new(charset_conversion_spec);
1011                                         parse_charset_conversion_specs(codesys->
1012                                                                        iso2022.
1013                                                                        output_conv,
1014                                                                        value);
1015                                 } else
1016                                         signal_simple_error
1017                                             ("Unrecognized property", key);
1018                         } else if (EQ(type, Qccl)) {
1019                                 Lisp_Object sym;
1020                                 struct ccl_program test_ccl;
1021                                 Extbyte *suffix;
1022
1023                                 /* Check key first.  */
1024                                 if (EQ(key, Qdecode))
1025                                         suffix = "-ccl-decode";
1026                                 else if (EQ(key, Qencode))
1027                                         suffix = "-ccl-encode";
1028                                 else
1029                                         signal_simple_error
1030                                             ("Unrecognized property", key);
1031
1032                                 /* If value is vector, register it as a ccl program
1033                                    associated with an newly created symbol for
1034                                    backward compatibility.  */
1035                                 if (VECTORP(value)) {
1036                                         sym =
1037                                             Fintern(concat2
1038                                                     (Fsymbol_name(name),
1039                                                      build_string(suffix)),
1040                                                     Qnil);
1041                                         Fregister_ccl_program(sym, value);
1042                                 } else {
1043                                         CHECK_SYMBOL(value);
1044                                         sym = value;
1045                                 }
1046                                 /* check if the given ccl programs are valid.  */
1047                                 if (setup_ccl_program(&test_ccl, sym) < 0)
1048                                         signal_simple_error
1049                                             ("Invalid CCL program", value);
1050
1051                                 if (EQ(key, Qdecode))
1052                                         CODING_SYSTEM_CCL_DECODE(codesys) = sym;
1053                                 else if (EQ(key, Qencode))
1054                                         CODING_SYSTEM_CCL_ENCODE(codesys) = sym;
1055
1056                         }
1057 #endif                          /* MULE */
1058                         else
1059                                 signal_simple_error("Unrecognized property",
1060                                                     key);
1061                 }
1062         }
1063
1064         if (need_to_setup_eol_systems)
1065                 setup_eol_coding_systems(codesys);
1066
1067         {
1068                 Lisp_Object codesys_obj;
1069                 XSETCODING_SYSTEM(codesys_obj, codesys);
1070                 Fputhash(name, codesys_obj, Vcoding_system_hash_table);
1071                 return codesys_obj;
1072         }
1073 }
1074
1075 DEFUN("copy-coding-system", Fcopy_coding_system, 2, 2, 0,       /*
1076 Copy OLD-CODING-SYSTEM to NEW-NAME.
1077 If NEW-NAME does not name an existing coding system, a new one will
1078 be created.
1079 */
1080       (old_coding_system, new_name))
1081 {
1082         Lisp_Object new_coding_system;
1083         old_coding_system = Fget_coding_system(old_coding_system);
1084         new_coding_system = Ffind_coding_system(new_name);
1085         if (NILP(new_coding_system)) {
1086                 XSETCODING_SYSTEM(new_coding_system,
1087                                   allocate_coding_system
1088                                   (XCODING_SYSTEM_TYPE(old_coding_system),
1089                                    new_name));
1090                 Fputhash(new_name, new_coding_system,
1091                          Vcoding_system_hash_table);
1092         }
1093
1094         {
1095                 Lisp_Coding_System *to = XCODING_SYSTEM(new_coding_system);
1096                 Lisp_Coding_System *from = XCODING_SYSTEM(old_coding_system);
1097                 memcpy(((char *)to) + sizeof(to->header),
1098                        ((char *)from) + sizeof(from->header),
1099                        sizeof(*from) - sizeof(from->header));
1100                 to->name = new_name;
1101         }
1102         return new_coding_system;
1103 }
1104
1105 DEFUN("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0,       /*
1106 Return t if OBJECT names a coding system, and is not a coding system alias.
1107 */
1108       (object))
1109 {
1110         Lisp_Object val = Fgethash(object, Vcoding_system_hash_table, Qnil);
1111         return CODING_SYSTEMP(val) ? Qt : Qnil;
1112 }
1113
1114 DEFUN("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1115  Return t if OBJECT is a coding system alias.
1116 All coding system aliases are created by `define-coding-system-alias'.
1117 */
1118       (object))
1119 {
1120         Lisp_Object val = Fgethash(object, Vcoding_system_hash_table, Qzero);
1121         return SYMBOLP(val) ? Qt : Qnil;
1122 }
1123
1124 DEFUN("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1125 Return the coding-system symbol for which symbol ALIAS is an alias.
1126 */
1127       (alias))
1128 {
1129         Lisp_Object aliasee = Fgethash(alias, Vcoding_system_hash_table, Qnil);
1130         if (SYMBOLP(aliasee)) {
1131                 return aliasee;
1132         } else {
1133                 signal_simple_error(
1134                         "Symbol is not a coding system alias", alias);
1135         }
1136         /* To keep the compiler happy */
1137         return Qnil;
1138 }
1139
1140 static Lisp_Object
1141 append_suffix_to_symbol(Lisp_Object symbol, char *ascii_string)
1142 {
1143         return Fintern(concat2(Fsymbol_name(symbol),
1144                                build_string(ascii_string)), Qnil);
1145 }
1146
1147 /* A maphash function, for removing dangling coding system aliases. */
1148 static int
1149 dangling_coding_system_alias_p(Lisp_Object alias,
1150                                Lisp_Object aliasee, void *dangling_aliases)
1151 {
1152         if (SYMBOLP(aliasee)
1153             && NILP(Fgethash(aliasee, Vcoding_system_hash_table, Qnil))) {
1154                 (*(int *)dangling_aliases)++;
1155                 return 1;
1156         } else {
1157                 return 0;
1158         }
1159 }
1160
1161 DEFUN("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0,       /*
1162 Define symbol ALIAS as an alias for coding system ALIASEE.
1163
1164 You can use this function to redefine an alias that has already been defined,
1165 but you cannot redefine a name which is the canonical name for a coding system.
1166 \(a canonical name of a coding system is what is returned when you call
1167 `coding-system-name' on a coding system).
1168
1169 ALIASEE itself can be an alias, which allows you to define nested aliases.
1170
1171 You are forbidden, however, from creating alias loops or `dangling' aliases.
1172 These will be detected, and an error will be signaled if you attempt to do so.
1173
1174 If ALIASEE is nil, then ALIAS will simply be undefined.
1175
1176 See also `coding-system-alias-p', `coding-system-aliasee',
1177 and `coding-system-canonical-name-p'.
1178 */
1179       (alias, aliasee))
1180 {
1181         Lisp_Object real_coding_system, probe;
1182
1183         CHECK_SYMBOL(alias);
1184
1185         if (!NILP(Fcoding_system_canonical_name_p(alias)))
1186                 signal_simple_error
1187                     ("Symbol is the canonical name of a coding system and cannot be redefined",
1188                      alias);
1189
1190         if (NILP(aliasee)) {
1191                 Lisp_Object subsidiary_unix =
1192                     append_suffix_to_symbol(alias, "-unix");
1193                 Lisp_Object subsidiary_dos =
1194                     append_suffix_to_symbol(alias, "-dos");
1195                 Lisp_Object subsidiary_mac =
1196                     append_suffix_to_symbol(alias, "-mac");
1197
1198                 Fremhash(alias, Vcoding_system_hash_table);
1199
1200                 /* Undefine subsidiary aliases,
1201                    presumably created by a previous call to this function */
1202                 if (!NILP(Fcoding_system_alias_p(subsidiary_unix)) &&
1203                     !NILP(Fcoding_system_alias_p(subsidiary_dos)) &&
1204                     !NILP(Fcoding_system_alias_p(subsidiary_mac))) {
1205                         Fdefine_coding_system_alias(subsidiary_unix, Qnil);
1206                         Fdefine_coding_system_alias(subsidiary_dos, Qnil);
1207                         Fdefine_coding_system_alias(subsidiary_mac, Qnil);
1208                 }
1209
1210                 /* Undefine dangling coding system aliases. */
1211                 {
1212                         int dangling_aliases;
1213
1214                         do {
1215                                 dangling_aliases = 0;
1216                                 elisp_map_remhash
1217                                     (dangling_coding_system_alias_p,
1218                                      Vcoding_system_hash_table,
1219                                      &dangling_aliases);
1220                         } while (dangling_aliases > 0);
1221                 }
1222
1223                 return Qnil;
1224         }
1225
1226         if (CODING_SYSTEMP(aliasee))
1227                 aliasee = XCODING_SYSTEM_NAME(aliasee);
1228
1229         /* Checks that aliasee names a coding-system */
1230         real_coding_system = Fget_coding_system(aliasee);
1231
1232         /* Check for coding system alias loops */
1233         if (EQ(alias, aliasee))
1234               alias_loop:signal_simple_error_2
1235                     ("Attempt to create a coding system alias loop", alias,
1236                      aliasee);
1237
1238         for (probe = aliasee;
1239              SYMBOLP(probe);
1240              probe = Fgethash(probe, Vcoding_system_hash_table, Qzero)) {
1241                 if (EQ(probe, alias))
1242                         goto alias_loop;
1243         }
1244
1245         Fputhash(alias, aliasee, Vcoding_system_hash_table);
1246
1247         /* Set up aliases for subsidiaries.
1248            #### There must be a better way to handle subsidiary coding
1249            #### systems. */
1250         {
1251                 static char *suffixes[] = { "-unix", "-dos", "-mac" };
1252
1253                 for (int i = 0; i < countof(suffixes); i++) {
1254                         Lisp_Object alias_subsidiary =
1255                                 append_suffix_to_symbol(alias, suffixes[i]);
1256                         Lisp_Object aliasee_subsidiary =
1257                                 append_suffix_to_symbol(aliasee, suffixes[i]);
1258
1259                         if (!NILP(Ffind_coding_system(aliasee_subsidiary))) {
1260                                 Fdefine_coding_system_alias(alias_subsidiary,
1261                                                             aliasee_subsidiary);
1262                         }
1263                 }
1264         }
1265         /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1266            but it doesn't look intentional, so I'd rather return something
1267            meaningful or nothing at all. */
1268         return Qnil;
1269 }
1270
1271 static Lisp_Object
1272 subsidiary_coding_system(Lisp_Object coding_system, eol_type_t type)
1273 {
1274         Lisp_Coding_System *cs = XCODING_SYSTEM(coding_system);
1275         Lisp_Object new_coding_system;
1276
1277         if (CODING_SYSTEM_EOL_TYPE(cs) != EOL_AUTODETECT)
1278                 return coding_system;
1279
1280         switch (type) {
1281         case EOL_AUTODETECT:
1282                 return coding_system;
1283         case EOL_LF:
1284                 new_coding_system = CODING_SYSTEM_EOL_LF(cs);
1285                 break;
1286         case EOL_CR:
1287                 new_coding_system = CODING_SYSTEM_EOL_CR(cs);
1288                 break;
1289         case EOL_CRLF:
1290                 new_coding_system = CODING_SYSTEM_EOL_CRLF(cs);
1291                 break;
1292         default:
1293                 abort();
1294                 return Qnil;
1295         }
1296
1297         return NILP(new_coding_system) ? coding_system : new_coding_system;
1298 }
1299
1300 DEFUN("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0,   /*
1301 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1302 */
1303       (coding_system, eol_type))
1304 {
1305         coding_system = Fget_coding_system(coding_system);
1306
1307         return subsidiary_coding_system(coding_system,
1308                                         symbol_to_eol_type(eol_type));
1309 }
1310 \f
1311 /************************************************************************/
1312 /*                         Coding system accessors                      */
1313 /************************************************************************/
1314
1315 DEFUN("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0,   /*
1316 Return the doc string for CODING-SYSTEM.
1317 */
1318       (coding_system))
1319 {
1320         coding_system = Fget_coding_system(coding_system);
1321         return XCODING_SYSTEM_DOC_STRING(coding_system);
1322 }
1323
1324 DEFUN("coding-system-type", Fcoding_system_type, 1, 1, 0,       /*
1325 Return the type of CODING-SYSTEM.
1326 */
1327       (coding_system))
1328 {
1329         Lisp_Object tmp = Fget_coding_system(coding_system);
1330
1331         switch (XCODING_SYSTEM_TYPE(tmp)) {
1332         default:
1333                 abort();
1334                 break;
1335         case CODESYS_AUTODETECT:
1336                 return Qundecided;
1337 #ifdef MULE
1338         case CODESYS_SHIFT_JIS:
1339                 return Qshift_jis;
1340         case CODESYS_ISO2022:
1341                 return Qiso2022;
1342         case CODESYS_BIG5:
1343                 return Qbig5;
1344         case CODESYS_UCS4:
1345                 return Qucs4;
1346         case CODESYS_UTF8:
1347                 return Qutf8;
1348         case CODESYS_CCL:
1349                 return Qccl;
1350 #endif
1351         case CODESYS_NO_CONVERSION:
1352                 return Qno_conversion;
1353 #ifdef DEBUG_SXEMACS
1354         case CODESYS_INTERNAL:
1355                 return Qinternal;
1356 #endif
1357         }
1358 }
1359
1360 #ifdef MULE
1361 static
1362 Lisp_Object coding_system_charset(Lisp_Object coding_system, int gnum)
1363 {
1364         Lisp_Object cs
1365             = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET(coding_system, gnum);
1366
1367         return CHARSETP(cs) ? XCHARSET_NAME(cs) : Qnil;
1368 }
1369
1370 DEFUN("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1371 Return initial charset of CODING-SYSTEM designated to GNUM.
1372 GNUM allows 0 .. 3.
1373 */
1374       (coding_system, gnum))
1375 {
1376         coding_system = Fget_coding_system(coding_system);
1377         CHECK_INT(gnum);
1378
1379         return coding_system_charset(coding_system, XINT(gnum));
1380 }
1381 #endif                          /* MULE */
1382
1383 DEFUN("coding-system-property", Fcoding_system_property, 2, 2, 0,       /*
1384 Return the PROP property of CODING-SYSTEM.
1385 */
1386       (coding_system, prop))
1387 {
1388         int i, ok = 0;
1389         enum coding_system_type type;
1390
1391         coding_system = Fget_coding_system(coding_system);
1392         CHECK_SYMBOL(prop);
1393         type = XCODING_SYSTEM_TYPE(coding_system);
1394
1395         for (i = 0; !ok && i < Dynarr_length(the_codesys_prop_dynarr); i++)
1396                 if (EQ(Dynarr_at(the_codesys_prop_dynarr, i).sym, prop)) {
1397                         ok = 1;
1398                         switch (Dynarr_at(the_codesys_prop_dynarr, i).prop_type) {
1399                         case CODESYS_PROP_ALL_OK:
1400                                 break;
1401 #ifdef MULE
1402                         case CODESYS_PROP_ISO2022:
1403                                 if (type != CODESYS_ISO2022)
1404                                         signal_simple_error
1405                                             ("Property only valid in ISO2022 coding systems",
1406                                              prop);
1407                                 break;
1408
1409                         case CODESYS_PROP_CCL:
1410                                 if (type != CODESYS_CCL)
1411                                         signal_simple_error
1412                                             ("Property only valid in CCL coding systems",
1413                                              prop);
1414                                 break;
1415 #endif                          /* MULE */
1416                         default:
1417                                 abort();
1418                         }
1419                 }
1420
1421         if (!ok)
1422                 signal_simple_error("Unrecognized property", prop);
1423
1424         if (EQ(prop, Qname))
1425                 return XCODING_SYSTEM_NAME(coding_system);
1426         else if (EQ(prop, Qtype))
1427                 return Fcoding_system_type(coding_system);
1428         else if (EQ(prop, Qdoc_string))
1429                 return XCODING_SYSTEM_DOC_STRING(coding_system);
1430         else if (EQ(prop, Qmnemonic))
1431                 return XCODING_SYSTEM_MNEMONIC(coding_system);
1432         else if (EQ(prop, Qeol_type))
1433                 return
1434                     eol_type_to_symbol(XCODING_SYSTEM_EOL_TYPE(coding_system));
1435         else if (EQ(prop, Qeol_lf))
1436                 return XCODING_SYSTEM_EOL_LF(coding_system);
1437         else if (EQ(prop, Qeol_crlf))
1438                 return XCODING_SYSTEM_EOL_CRLF(coding_system);
1439         else if (EQ(prop, Qeol_cr))
1440                 return XCODING_SYSTEM_EOL_CR(coding_system);
1441         else if (EQ(prop, Qpost_read_conversion))
1442                 return XCODING_SYSTEM_POST_READ_CONVERSION(coding_system);
1443         else if (EQ(prop, Qpre_write_conversion))
1444                 return XCODING_SYSTEM_PRE_WRITE_CONVERSION(coding_system);
1445 #ifdef MULE
1446         else if (type == CODESYS_ISO2022) {
1447                 if (EQ(prop, Qcharset_g0))
1448                         return coding_system_charset(coding_system, 0);
1449                 else if (EQ(prop, Qcharset_g1))
1450                         return coding_system_charset(coding_system, 1);
1451                 else if (EQ(prop, Qcharset_g2))
1452                         return coding_system_charset(coding_system, 2);
1453                 else if (EQ(prop, Qcharset_g3))
1454                         return coding_system_charset(coding_system, 3);
1455
1456 #define FORCE_CHARSET(charset_num) \
1457   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1458    (coding_system, charset_num) ? Qt : Qnil)
1459
1460                 else if (EQ(prop, Qforce_g0_on_output))
1461                         return FORCE_CHARSET(0);
1462                 else if (EQ(prop, Qforce_g1_on_output))
1463                         return FORCE_CHARSET(1);
1464                 else if (EQ(prop, Qforce_g2_on_output))
1465                         return FORCE_CHARSET(2);
1466                 else if (EQ(prop, Qforce_g3_on_output))
1467                         return FORCE_CHARSET(3);
1468
1469 #define LISP_BOOLEAN(prop) \
1470   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1471
1472                 else if (EQ(prop, Qshort))
1473                         return LISP_BOOLEAN(SHORT);
1474                 else if (EQ(prop, Qno_ascii_eol))
1475                         return LISP_BOOLEAN(NO_ASCII_EOL);
1476                 else if (EQ(prop, Qno_ascii_cntl))
1477                         return LISP_BOOLEAN(NO_ASCII_CNTL);
1478                 else if (EQ(prop, Qseven))
1479                         return LISP_BOOLEAN(SEVEN);
1480                 else if (EQ(prop, Qlock_shift))
1481                         return LISP_BOOLEAN(LOCK_SHIFT);
1482                 else if (EQ(prop, Qno_iso6429))
1483                         return LISP_BOOLEAN(NO_ISO6429);
1484                 else if (EQ(prop, Qescape_quoted))
1485                         return LISP_BOOLEAN(ESCAPE_QUOTED);
1486
1487                 else if (EQ(prop, Qinput_charset_conversion))
1488                         return
1489                             unparse_charset_conversion_specs
1490                             (XCODING_SYSTEM(coding_system)->iso2022.input_conv);
1491                 else if (EQ(prop, Qoutput_charset_conversion))
1492                         return
1493                             unparse_charset_conversion_specs
1494                             (XCODING_SYSTEM(coding_system)->iso2022.
1495                              output_conv);
1496                 else
1497                         abort();
1498         } else if (type == CODESYS_CCL) {
1499                 if (EQ(prop, Qdecode))
1500                         return XCODING_SYSTEM_CCL_DECODE(coding_system);
1501                 else if (EQ(prop, Qencode))
1502                         return XCODING_SYSTEM_CCL_ENCODE(coding_system);
1503                 else
1504                         abort();
1505         }
1506 #endif                          /* MULE */
1507         else
1508                 abort();
1509
1510         return Qnil;            /* not reached */
1511 }
1512 \f
1513 /************************************************************************/
1514 /*                       Coding category functions                      */
1515 /************************************************************************/
1516
1517 static int decode_coding_category(Lisp_Object symbol)
1518 {
1519         int i;
1520
1521         CHECK_SYMBOL(symbol);
1522         for (i = 0; i < CODING_CATEGORY_LAST; i++)
1523                 if (EQ(coding_category_symbol[i], symbol))
1524                         return i;
1525
1526         signal_simple_error("Unrecognized coding category", symbol);
1527         return 0;               /* not reached */
1528 }
1529
1530 DEFUN("coding-category-list", Fcoding_category_list, 0, 0, 0,   /*
1531 Return a list of all recognized coding categories.
1532 */
1533       ())
1534 {
1535         int i;
1536         Lisp_Object list = Qnil;
1537
1538         for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1539                 list = Fcons(coding_category_symbol[i], list);
1540         return list;
1541 }
1542
1543 DEFUN("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0,   /*
1544 Change the priority order of the coding categories.
1545 LIST should be list of coding categories, in descending order of
1546 priority.  Unspecified coding categories will be lower in priority
1547 than all specified ones, in the same relative order they were in
1548 previously.
1549 */
1550       (list))
1551 {
1552         int category_to_priority[CODING_CATEGORY_LAST];
1553         int i, j;
1554         Lisp_Object rest;
1555
1556         /* First generate a list that maps coding categories to priorities. */
1557
1558         for (i = 0; i < CODING_CATEGORY_LAST; i++)
1559                 category_to_priority[i] = -1;
1560
1561         /* Highest priority comes from the specified list. */
1562         i = 0;
1563         EXTERNAL_LIST_LOOP(rest, list) {
1564                 int cat = decode_coding_category(XCAR(rest));
1565
1566                 if (category_to_priority[cat] >= 0)
1567                         signal_simple_error("Duplicate coding category in list",
1568                                             XCAR(rest));
1569                 category_to_priority[cat] = i++;
1570         }
1571
1572         /* Now go through the existing categories by priority to retrieve
1573            the categories not yet specified and preserve their priority
1574            order. */
1575         for (j = 0; j < CODING_CATEGORY_LAST; j++) {
1576                 int cat = fcd->coding_category_by_priority[j];
1577                 if (category_to_priority[cat] < 0)
1578                         category_to_priority[cat] = i++;
1579         }
1580
1581         /* Now we need to construct the inverse of the mapping we just
1582            constructed. */
1583
1584         for (i = 0; i < CODING_CATEGORY_LAST; i++)
1585                 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1586
1587         /* Phew!  That was confusing. */
1588         return Qnil;
1589 }
1590
1591 DEFUN("coding-priority-list", Fcoding_priority_list, 0, 0, 0,   /*
1592 Return a list of coding categories in descending order of priority.
1593 */
1594       ())
1595 {
1596         int i;
1597         Lisp_Object list = Qnil;
1598
1599         for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1600                 list =
1601                     Fcons(coding_category_symbol
1602                           [fcd->coding_category_by_priority[i]], list);
1603         return list;
1604 }
1605
1606 DEFUN("set-coding-category-system", Fset_coding_category_system, 2, 2, 0,       /*
1607 Change the coding system associated with a coding category.
1608 */
1609       (coding_category, coding_system))
1610 {
1611         int cat = decode_coding_category(coding_category);
1612
1613         coding_system = Fget_coding_system(coding_system);
1614         fcd->coding_category_system[cat] = coding_system;
1615         return Qnil;
1616 }
1617
1618 DEFUN("coding-category-system", Fcoding_category_system, 1, 1, 0,       /*
1619 Return the coding system associated with a coding category.
1620 */
1621       (coding_category))
1622 {
1623         int cat = decode_coding_category(coding_category);
1624         Lisp_Object sys = fcd->coding_category_system[cat];
1625
1626         if (!NILP(sys))
1627                 return XCODING_SYSTEM_NAME(sys);
1628         return Qnil;
1629 }
1630 \f
1631 /************************************************************************/
1632 /*                     Detecting the encoding of data                   */
1633 /************************************************************************/
1634
1635 struct detection_state {
1636         eol_type_t eol_type;
1637         int seen_non_ascii;
1638         int mask;
1639 #ifdef MULE
1640         struct {
1641                 int mask;
1642                 int in_second_byte;
1643         } big5;
1644
1645         struct {
1646                 int mask;
1647                 int in_second_byte;
1648         } shift_jis;
1649
1650         struct {
1651                 int mask;
1652                 int in_byte;
1653         } ucs4;
1654
1655         struct {
1656                 int mask;
1657                 int in_byte;
1658         } utf8;
1659
1660         struct {
1661                 int mask;
1662                 int initted;
1663                 struct iso2022_decoder iso;
1664                 unsigned int flags;
1665                 int high_byte_count;
1666                 unsigned int saw_single_shift:1;
1667         } iso2022;
1668 #endif
1669         struct {
1670                 int seen_anything;
1671                 int just_saw_cr;
1672         } eol;
1673 };
1674
1675 static int acceptable_control_char_p(int c)
1676 {
1677         switch (c) {
1678                 /* Allow and ignore control characters that you might
1679                    reasonably see in a text file */
1680         case '\r':
1681         case '\n':
1682         case '\t':
1683         case 7:         /* bell */
1684         case 8:         /* backspace */
1685         case 11:                /* vertical tab */
1686         case 12:                /* form feed */
1687         case 26:                /* MS-DOS C-z junk */
1688         case 31:                /* '^_' -- for info */
1689                 return 1;
1690         default:
1691                 return 0;
1692         }
1693 }
1694
1695 static int mask_has_at_most_one_bit_p(int mask)
1696 {
1697         /* Perhaps the only thing useful you learn from intensive Microsoft
1698            technical interviews */
1699         return (mask & (mask - 1)) == 0;
1700 }
1701
1702 static eol_type_t
1703 detect_eol_type(struct detection_state *st, const Extbyte * src,
1704                 Lstream_data_count n)
1705 {
1706         while (n--) {
1707                 const unsigned char c = *(const unsigned char*)src++;
1708                 if (c == '\n') {
1709                         if (st->eol.just_saw_cr)
1710                                 return EOL_CRLF;
1711                         else if (st->eol.seen_anything)
1712                                 return EOL_LF;
1713                 } else if (st->eol.just_saw_cr)
1714                         return EOL_CR;
1715                 else if (c == '\r')
1716                         st->eol.just_saw_cr = 1;
1717                 else
1718                         st->eol.just_saw_cr = 0;
1719                 st->eol.seen_anything = 1;
1720         }
1721
1722         return EOL_AUTODETECT;
1723 }
1724
1725 /* Attempt to determine the encoding and EOL type of the given text.
1726    Before calling this function for the first type, you must initialize
1727    st->eol_type as appropriate and initialize st->mask to ~0.
1728
1729    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1730    not yet known.
1731
1732    st->mask holds the determined coding category mask, or ~0 if only
1733    ASCII has been seen so far.
1734
1735    Returns:
1736
1737    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1738         is present in st->mask
1739    1 == definitive answers are here for both st->eol_type and st->mask
1740 */
1741
1742 static int
1743 detect_coding_type(struct detection_state *st, const Extbyte * src,
1744                    Lstream_data_count n, int just_do_eol)
1745 {
1746         if (st->eol_type == EOL_AUTODETECT)
1747                 st->eol_type = detect_eol_type(st, src, n);
1748
1749         if (just_do_eol)
1750                 return st->eol_type != EOL_AUTODETECT;
1751
1752         if (!st->seen_non_ascii) {
1753                 for (; n; n--, src++) {
1754                         const unsigned char c = *(const unsigned char *)src;
1755                         if ((c < 0x20 && !acceptable_control_char_p(c))
1756                             || c >= 0x80) {
1757                                 st->seen_non_ascii = 1;
1758 #ifdef MULE
1759                                 st->shift_jis.mask = ~0;
1760                                 st->big5.mask = ~0;
1761                                 st->ucs4.mask = ~0;
1762                                 st->utf8.mask = ~0;
1763                                 st->iso2022.mask = ~0;
1764 #endif
1765                                 break;
1766                         }
1767                 }
1768         }
1769
1770         if (!n) {
1771                 return 0;
1772         }
1773 #ifdef MULE
1774         if (!mask_has_at_most_one_bit_p(st->iso2022.mask))
1775                 st->iso2022.mask = detect_coding_iso2022(st, src, n);
1776         if (!mask_has_at_most_one_bit_p(st->shift_jis.mask))
1777                 st->shift_jis.mask = detect_coding_sjis(st, src, n);
1778         if (!mask_has_at_most_one_bit_p(st->big5.mask))
1779                 st->big5.mask = detect_coding_big5(st, src, n);
1780         if (!mask_has_at_most_one_bit_p(st->utf8.mask))
1781                 st->utf8.mask = detect_coding_utf8(st, src, n);
1782         if (!mask_has_at_most_one_bit_p(st->ucs4.mask))
1783                 st->ucs4.mask = detect_coding_ucs4(st, src, n);
1784
1785         st->mask = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1786                 | st->utf8.mask | st->ucs4.mask;
1787 #endif
1788         {
1789                 int retval = mask_has_at_most_one_bit_p(st->mask);
1790                 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1791                 return retval && st->eol_type != EOL_AUTODETECT;
1792         }
1793 }
1794
1795 static Lisp_Object coding_system_from_mask(int mask)
1796 {
1797         if (mask == ~0) {
1798                 /* If the file was entirely or basically ASCII, use the
1799                    default value of `buffer-file-coding-system'. */
1800                 Lisp_Object retval =
1801                     XBUFFER(Vbuffer_defaults)->buffer_file_coding_system;
1802                 if (!NILP(retval)) {
1803                         retval = Ffind_coding_system(retval);
1804                         if (NILP(retval)) {
1805                                 warn_when_safe
1806                                     (Qbad_variable, Qwarning,
1807                                      "Invalid `default-buffer-file-coding-system', set to nil");
1808                                 XBUFFER(Vbuffer_defaults)->
1809                                     buffer_file_coding_system = Qnil;
1810                         }
1811                 }
1812                 if (NILP(retval))
1813                         retval = Fget_coding_system(Qraw_text);
1814                 return retval;
1815         } else {
1816                 int i;
1817                 int cat = -1;
1818 #ifdef MULE
1819                 mask = postprocess_iso2022_mask(mask);
1820 #endif
1821                 /* Look through the coding categories by priority and find
1822                    the first one that is allowed. */
1823                 for (i = 0; i < CODING_CATEGORY_LAST; i++) {
1824                         cat = fcd->coding_category_by_priority[i];
1825                         if (cat<0)
1826                                 continue;
1827                         if ((mask & (1 << cat)) &&
1828                             !NILP(fcd->coding_category_system[cat]))
1829                                 break;
1830                 }
1831                 if (cat >= 0)
1832                         return fcd->coding_category_system[cat];
1833                 else
1834                         return Fget_coding_system(Qraw_text);
1835         }
1836 }
1837
1838 /* Given a seekable read stream and potential coding system and EOL type
1839    as specified, do any autodetection that is called for.  If the
1840    coding system and/or EOL type are not `autodetect', they will be left
1841    alone; but this function will never return an autodetect coding system
1842    or EOL type.
1843
1844    This function does not automatically fetch subsidiary coding systems;
1845    that should be unnecessary with the explicit eol-type argument. */
1846
1847 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1848 /* number of leading lines to check for a coding cookie */
1849 #define LINES_TO_CHECK 2
1850
1851 void
1852 determine_real_coding_system(lstream_t stream, Lisp_Object * codesys_in_out,
1853                              eol_type_t * eol_type_in_out)
1854 {
1855         struct detection_state decst;
1856
1857         if (*eol_type_in_out == EOL_AUTODETECT)
1858                 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE(*codesys_in_out);
1859
1860         xzero(decst);
1861         decst.eol_type = *eol_type_in_out;
1862         decst.mask = ~0;
1863
1864         /* If autodetection is called for, do it now. */
1865         if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT
1866             || *eol_type_in_out == EOL_AUTODETECT) {
1867                 Extbyte buf[4096];
1868                 Lisp_Object coding_system = Qnil;
1869                 Extbyte *p;
1870                 Lstream_data_count nread =
1871                     Lstream_read(stream, buf, sizeof(buf));
1872                 Extbyte *scan_end;
1873                 int lines_checked = 0;
1874
1875                 /* Look for initial "-*-"; mode line prefix */
1876                 for (p = buf,
1877                      scan_end = buf + nread - LENGTH("-*-coding:?-*-");
1878                      p <= scan_end && lines_checked < LINES_TO_CHECK; p++)
1879                         if (*p == '-' && *(p + 1) == '*' && *(p + 2) == '-') {
1880                                 Extbyte *local_vars_beg = p + 3;
1881                                 /* Look for final "-*-"; mode line suffix */
1882                                 for (p = local_vars_beg, scan_end = buf + nread - LENGTH("-*-");
1883                                      p <= scan_end && lines_checked < LINES_TO_CHECK; p++)
1884                                         if (*p == '-' && *(p + 1) == '*' && *(p + 2) == '-') {
1885                                                 Extbyte *suffix = p;
1886                                                 /* Look for "coding:" */
1887                                                 for (p = local_vars_beg, scan_end = suffix - LENGTH("coding:?");
1888                                                      p <= scan_end; p++)
1889                                                         if (memcmp("coding:", p, LENGTH("coding:")) == 0
1890                                                             && (p == local_vars_beg
1891                                                                 || (*(p - 1) == ' '
1892                                                                     || *(p - 1) == '\t'
1893                                                                     || *(p - 1) == ';'))) {
1894                                                                 Extbyte save;
1895                                                                 int n;
1896                                                                 p += LENGTH("coding:");
1897                                                                 while (*p == ' ' || *p == '\t') {
1898                                                                         p++;
1899                                                                 }
1900
1901                                                                 /* Get coding system name */
1902                                                                 save = *suffix;
1903                                                                 *suffix = '\0';
1904                                                                 /* Characters valid in a MIME charset name (rfc 1521),
1905                                                                    and in a Lisp symbol name. */
1906                                                                 n = strspn((char *)p,
1907                                                                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1908                                                                            "abcdefghijklmnopqrstuvwxyz"
1909                                                                            "0123456789"
1910                                                                            "!$%&*+-.^_{|}~");
1911                                                                 *suffix = save;
1912                                                                 if (n > 0) {
1913                                                                         save = p[n];
1914                                                                         p[n] = '\0';
1915                                                                         coding_system = Ffind_coding_system(intern((char *)p));
1916                                                                         p[n] = save;
1917                                                                 }
1918                                                                 break;
1919                                                         }
1920                                                 break;
1921                                         }
1922                                 /* #### file must use standard EOLs or we miss 2d line */
1923                                 /* #### not to mention this is broken for UTF-16 DOS files */
1924                                         else if (*p == '\n' || *p == '\r') {
1925                                                 lines_checked++;
1926                                                 /* skip past multibyte (DOS) newline */
1927                                                 if (*p == '\r'
1928                                                     && *(p + 1) == '\n')
1929                                                         p++;
1930                                         }
1931                                 break;
1932                         }
1933                 /* #### file must use standard EOLs or we miss 2d line */
1934                 /* #### not to mention this is broken for UTF-16 DOS files */
1935                         else if (*p == '\n' || *p == '\r') {
1936                                 lines_checked++;
1937                                 /* skip past multibyte (DOS) newline */
1938                                 if (*p == '\r' && *(p + 1) == '\n')
1939                                         p++;
1940                         }
1941
1942                 if (NILP(coding_system))
1943                         do {
1944                                 if (detect_coding_type(&decst, buf, nread,
1945                                                        XCODING_SYSTEM_TYPE(*codesys_in_out)
1946                                                        != CODESYS_AUTODETECT))
1947                                         break;
1948                                 nread = Lstream_read(stream, buf, sizeof(buf));
1949                                 if (nread == 0)
1950                                         break;
1951                         }
1952                         while (1);
1953
1954                 else if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT
1955                          && XCODING_SYSTEM_EOL_TYPE(coding_system) == EOL_AUTODETECT)
1956                         do {
1957                                 if (detect_coding_type(&decst, buf, nread, 1))
1958                                         break;
1959                                 nread = Lstream_read(stream, buf, sizeof(buf));
1960                                 if (!nread)
1961                                         break;
1962                         }
1963                         while (1);
1964
1965                 *eol_type_in_out = decst.eol_type;
1966                 if (XCODING_SYSTEM_TYPE(*codesys_in_out) == CODESYS_AUTODETECT) {
1967                         if (NILP(coding_system))
1968                                 *codesys_in_out =
1969                                     coding_system_from_mask(decst.mask);
1970                         else
1971                                 *codesys_in_out = coding_system;
1972                 }
1973         }
1974
1975         /* If we absolutely can't determine the EOL type, just assume LF. */
1976         if (*eol_type_in_out == EOL_AUTODETECT)
1977                 *eol_type_in_out = EOL_LF;
1978
1979         Lstream_rewind(stream);
1980 }
1981
1982 DEFUN("detect-coding-region", Fdetect_coding_region, 2, 3, 0,   /*
1983 Detect coding system of the text in the region between START and END.
1984 Return a list of possible coding systems ordered by priority.
1985 If only ASCII characters are found, return 'undecided or one of
1986 its subsidiary coding systems according to a detected end-of-line
1987 type.  Optional arg BUFFER defaults to the current buffer.
1988 */
1989       (start, end, buffer))
1990 {
1991         Lisp_Object val = Qnil;
1992         struct buffer *buf = decode_buffer(buffer, 0);
1993         Bufpos b, e;
1994         Lisp_Object instream, lb_instream;
1995         lstream_t istr, lb_istr;
1996         struct detection_state decst;
1997         struct gcpro gcpro1, gcpro2;
1998
1999         get_buffer_range_char(buf, start, end, &b, &e, 0);
2000         lb_instream = make_lisp_buffer_input_stream(buf, b, e, 0);
2001         lb_istr = XLSTREAM(lb_instream);
2002         instream =
2003             make_encoding_input_stream(lb_istr, Fget_coding_system(Qbinary));
2004         istr = XLSTREAM(instream);
2005         GCPRO2(instream, lb_instream);
2006         xzero(decst);
2007         decst.eol_type = EOL_AUTODETECT;
2008         decst.mask = ~0;
2009         while (1) {
2010                 Extbyte random_buffer[4096];
2011                 Lstream_data_count nread =
2012                     Lstream_read(istr, random_buffer, sizeof(random_buffer));
2013
2014                 if (!nread)
2015                         break;
2016                 if (detect_coding_type(&decst, random_buffer, nread, 0))
2017                         break;
2018         }
2019
2020         if (decst.mask == ~0)
2021                 val = subsidiary_coding_system(Fget_coding_system(Qundecided),
2022                                                decst.eol_type);
2023         else {
2024                 int i;
2025
2026                 val = Qnil;
2027 #ifdef MULE
2028                 decst.mask = postprocess_iso2022_mask(decst.mask);
2029 #endif
2030                 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--) {
2031                         int sys = fcd->coding_category_by_priority[i];
2032                         if (decst.mask & (1 << sys)) {
2033                                 Lisp_Object codesys =
2034                                     fcd->coding_category_system[sys];
2035                                 if (!NILP(codesys))
2036                                         codesys =
2037                                             subsidiary_coding_system(codesys,
2038                                                                      decst.
2039                                                                      eol_type);
2040                                 val = Fcons(codesys, val);
2041                         }
2042                 }
2043         }
2044         Lstream_close(istr);
2045         UNGCPRO;
2046         Lstream_delete(istr);
2047         Lstream_delete(lb_istr);
2048         return val;
2049 }
2050 \f
2051 /************************************************************************/
2052 /*           Converting to internal Mule format ("decoding")            */
2053 /************************************************************************/
2054
2055 /* A decoding stream is a stream used for decoding text (i.e.
2056    converting from some external format to internal format).
2057    The decoding-stream object keeps track of the actual coding
2058    stream, the stream that is at the other end, and data that
2059    needs to be persistent across the lifetime of the stream. */
2060
2061 /* Handle the EOL stuff related to just-read-in character C.
2062    EOL_TYPE is the EOL type of the coding stream.
2063    FLAGS is the current value of FLAGS in the coding stream, and may
2064    be modified by this macro.  (The macro only looks at the
2065    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
2066    bytes are to be written.  You need to also define a local goto
2067    label "label_continue_loop" that is at the end of the main
2068    character-reading loop.
2069
2070    If C is a CR character, then this macro handles it entirely and
2071    jumps to label_continue_loop.  Otherwise, this macro does not add
2072    anything to DST, and continues normally.  You should continue
2073    processing C normally after this macro. */
2074
2075 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
2076 do {                                                            \
2077   if (c == '\r')                                                \
2078     {                                                           \
2079       if (eol_type == EOL_CR)                                   \
2080         Dynarr_add (dst, '\n');                                 \
2081       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2082         Dynarr_add (dst, c);                                    \
2083       else                                                      \
2084         flags |= CODING_STATE_CR;                               \
2085       goto label_continue_loop;                                 \
2086     }                                                           \
2087   else if (flags & CODING_STATE_CR)                             \
2088     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
2089       if (c != '\n')                                            \
2090         Dynarr_add (dst, '\r');                                 \
2091       flags &= ~CODING_STATE_CR;                                \
2092     }                                                           \
2093 } while (0)
2094
2095 /* C should be a binary character in the range 0 - 255; convert
2096    to internal format and add to Dynarr DST. */
2097
2098 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
2099 do {                                            \
2100   if (BYTE_ASCII_P (c))                         \
2101     Dynarr_add (dst, c);                        \
2102   else if (BYTE_C1_P (c))                       \
2103     {                                           \
2104       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2105       Dynarr_add (dst, c + 0x20);               \
2106     }                                           \
2107   else                                          \
2108     {                                           \
2109       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2110       Dynarr_add (dst, c);                      \
2111     }                                           \
2112 } while (0)
2113
2114 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
2115 do {                                    \
2116   if (ch)                               \
2117     {                                   \
2118       DECODE_ADD_BINARY_CHAR (ch, dst); \
2119       ch = 0;                           \
2120     }                                   \
2121 } while (0)
2122
2123 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2124 do {                                    \
2125   if (flags & CODING_STATE_END)         \
2126     {                                   \
2127       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
2128       if (flags & CODING_STATE_CR)      \
2129         Dynarr_add (dst, '\r');         \
2130     }                                   \
2131 } while (0)
2132
2133 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2134
2135 typedef struct decoding_stream_s *decoding_stream_t;
2136 struct decoding_stream_s {
2137         /* Coding system that governs the conversion. */
2138         Lisp_Coding_System *codesys;
2139
2140         /* Stream that we read the encoded data from or
2141            write the decoded data to. */
2142         lstream_t other_end;
2143
2144         /* If we are reading, then we can return only a fixed amount of
2145            data, so if the conversion resulted in too much data, we store it
2146            here for retrieval the next time around. */
2147         unsigned_char_dynarr *runoff;
2148
2149         /* FLAGS holds flags indicating the current state of the decoding.
2150            Some of these flags are dependent on the coding system. */
2151         unsigned int flags;
2152
2153         /* CH holds a partially built-up character.  Since we only deal
2154            with one- and two-byte characters at the moment, we only use
2155            this to store the first byte of a two-byte character. */
2156         unsigned int ch;
2157
2158         /* EOL_TYPE specifies the type of end-of-line conversion that
2159            currently applies.  We need to keep this separate from the
2160            EOL type stored in CODESYS because the latter might indicate
2161            automatic EOL-type detection while the former will always
2162            indicate a particular EOL type. */
2163         eol_type_t eol_type;
2164 #ifdef MULE
2165         /* Additional ISO2022 information.  We define the structure above
2166            because it's also needed by the detection routines. */
2167         struct iso2022_decoder iso2022;
2168
2169         /* Additional information (the state of the running CCL program)
2170            used by the CCL decoder. */
2171         struct ccl_program ccl;
2172
2173         /* counter for UTF-8 or UCS-4 */
2174         unsigned char counter;
2175 #endif
2176         struct detection_state decst;
2177 };
2178
2179 static Lstream_data_count
2180 decoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size);
2181 static Lstream_data_count
2182 decoding_writer(lstream_t stream,
2183                 const unsigned char *data, Lstream_data_count size);
2184 static int decoding_rewinder(lstream_t stream);
2185 static int decoding_seekable_p(lstream_t stream);
2186 static int decoding_flusher(lstream_t stream);
2187 static int decoding_closer(lstream_t stream);
2188
2189 static Lisp_Object decoding_marker(Lisp_Object stream);
2190
2191 DEFINE_LSTREAM_IMPLEMENTATION("decoding", lstream_decoding,
2192                               sizeof(struct decoding_stream_s));
2193
2194 static Lisp_Object
2195 decoding_marker(Lisp_Object stream)
2196 {
2197         lstream_t str = DECODING_STREAM_DATA(XLSTREAM(stream))->other_end;
2198         Lisp_Object str_obj;
2199
2200         /* We do not need to mark the coding systems or charsets stored
2201            within the stream because they are stored in a global list
2202            and automatically marked. */
2203
2204         XSETLSTREAM(str_obj, str);
2205         mark_object(str_obj);
2206         if (str->imp->marker) {
2207                 return str->imp->marker(str_obj);
2208         } else {
2209                 return Qnil;
2210         }
2211 }
2212
2213 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
2214    so we read data from the other end, decode it, and store it into DATA. */
2215
2216 static Lstream_data_count
2217 decoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size)
2218 {
2219         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2220         unsigned char *orig_data = data;
2221         Lstream_data_count read_size;
2222         int error_occurred = 0;
2223
2224         /* We need to interface to mule_decode(), which expects to take some
2225            amount of data and store the result into a Dynarr.  We have
2226            mule_decode() store into str->runoff, and take data from there
2227            as necessary. */
2228
2229         /* We loop until we have enough data, reading chunks from the other
2230            end and decoding it. */
2231         while (1) {
2232                 /* Take data from the runoff if we can.  Make sure to take at
2233                    most SIZE bytes, and delete the data from the runoff. */
2234                 if (Dynarr_length(str->runoff) > 0) {
2235                         Lstream_data_count chunk =
2236                                 min(size,
2237                                     (Lstream_data_count)
2238                                     Dynarr_length(str->runoff));
2239                         memcpy(data, Dynarr_atp(str->runoff, 0), chunk);
2240                         Dynarr_delete_many(str->runoff, 0, chunk);
2241                         data += chunk;
2242                         size -= chunk;
2243                 }
2244
2245                 if (size == 0) {
2246                         /* No more room for data */
2247                         break;
2248                 }
2249
2250                 if (str->flags & CODING_STATE_END) {
2251                         /* This means that on the previous iteration, we hit the
2252                            EOF on the other end.  We loop once more so that
2253                            mule_decode() can output any final stuff it may be
2254                            holding, or any "go back to a sane state" escape
2255                            sequences. (This latter makes sense during
2256                            encoding.) */
2257                         break;
2258                 }
2259
2260                 /* Exhausted the runoff, so get some more.  DATA has at least
2261                    SIZE bytes left of storage in it, so it's OK to read directly
2262                    into it.  (We'll be overwriting above, after we've decoded it
2263                    into the runoff.) */
2264                 read_size = Lstream_read(str->other_end, data, size);
2265                 if (read_size < 0) {
2266                         error_occurred = 1;
2267                         break;
2268                 }
2269                 if (read_size == 0) {
2270                         /* There might be some more end data produced in the
2271                            translation.  See the comment above. */
2272                         str->flags |= CODING_STATE_END;
2273                 }
2274                 mule_decode(stream, (Extbyte *) data, str->runoff, read_size);
2275         }
2276
2277         if (data - orig_data == 0) {
2278                 return error_occurred ? -1 : 0;
2279         } else {
2280                 return data - orig_data;
2281         }
2282 }
2283
2284 static Lstream_data_count
2285 decoding_writer(lstream_t stream, const unsigned char *data,
2286                 Lstream_data_count size)
2287 {
2288         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2289         Lstream_data_count retval;
2290
2291         /* Decode all our data into the runoff, and then attempt to write
2292            it all out to the other end.  Remove whatever chunk we succeeded
2293            in writing. */
2294         mule_decode(stream, (const Extbyte *)data, str->runoff, size);
2295         retval = Lstream_write(str->other_end, Dynarr_atp(str->runoff, 0),
2296                                Dynarr_length(str->runoff));
2297         if (retval > 0) {
2298                 Dynarr_delete_many(str->runoff, 0, retval);
2299         }
2300         /* Do NOT return retval.  The return value indicates how much
2301            of the incoming data was written, not how many bytes were
2302            written. */
2303         return size;
2304 }
2305
2306 static void
2307 reset_decoding_stream(decoding_stream_t str)
2308 {
2309 #ifdef MULE
2310         if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_ISO2022) {
2311                 Lisp_Object coding_system;
2312                 XSETCODING_SYSTEM(coding_system, str->codesys);
2313                 reset_iso2022(coding_system, &str->iso2022);
2314         } else if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_CCL) {
2315                 setup_ccl_program(&str->ccl,
2316                                   CODING_SYSTEM_CCL_DECODE(str->codesys));
2317         }
2318         str->counter = 0;
2319 #endif                          /* MULE */
2320         if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT
2321             || CODING_SYSTEM_EOL_TYPE(str->codesys) == EOL_AUTODETECT) {
2322                 xzero(str->decst);
2323                 str->decst.eol_type = EOL_AUTODETECT;
2324                 str->decst.mask = ~0;
2325         }
2326         str->flags = str->ch = 0;
2327 }
2328
2329 static int
2330 decoding_rewinder(lstream_t stream)
2331 {
2332         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2333         reset_decoding_stream(str);
2334         Dynarr_reset(str->runoff);
2335         return Lstream_rewind(str->other_end);
2336 }
2337
2338 static int
2339 decoding_seekable_p(lstream_t stream)
2340 {
2341         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2342         return Lstream_seekable_p(str->other_end);
2343 }
2344
2345 static int
2346 decoding_flusher(lstream_t stream)
2347 {
2348         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2349         return Lstream_flush(str->other_end);
2350 }
2351
2352 static int
2353 decoding_closer(lstream_t stream)
2354 {
2355         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2356         if (stream->flags & LSTREAM_FL_WRITE) {
2357                 str->flags |= CODING_STATE_END;
2358                 decoding_writer(stream, 0, 0);
2359         }
2360         Dynarr_free(str->runoff);
2361 #ifdef MULE
2362 #ifdef ENABLE_COMPOSITE_CHARS
2363         if (str->iso2022.composite_chars) {
2364                 Dynarr_free(str->iso2022.composite_chars);
2365         }
2366 #endif
2367 #endif
2368         return Lstream_close(str->other_end);
2369 }
2370
2371 Lisp_Object
2372 decoding_stream_coding_system(lstream_t stream)
2373 {
2374         Lisp_Object coding_system;
2375         decoding_stream_t str = DECODING_STREAM_DATA(stream);
2376
2377         XSETCODING_SYSTEM(coding_system, str->codesys);
2378         return subsidiary_coding_system(coding_system, str->eol_type);
2379 }
2380
2381 void
2382 set_decoding_stream_coding_system(lstream_t lstr, Lisp_Object codesys)
2383 {
2384         Lisp_Coding_System *cs = XCODING_SYSTEM(codesys);
2385         decoding_stream_t str = DECODING_STREAM_DATA(lstr);
2386         str->codesys = cs;
2387         if (CODING_SYSTEM_EOL_TYPE(cs) != EOL_AUTODETECT) {
2388                 str->eol_type = CODING_SYSTEM_EOL_TYPE(cs);
2389         }
2390         reset_decoding_stream(str);
2391         return;
2392 }
2393
2394 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2395    stream for writing, no automatic code detection will be performed.
2396    The reason for this is that automatic code detection requires a
2397    seekable input.  Things will also fail if you open a decoding
2398    stream for reading using a non-fully-specified coding system and
2399    a non-seekable input stream. */
2400
2401 static Lisp_Object
2402 make_decoding_stream_1(lstream_t stream, Lisp_Object codesys, const char *mode)
2403 {
2404         lstream_t lstr = Lstream_new(lstream_decoding, mode);
2405         decoding_stream_t str = DECODING_STREAM_DATA(lstr);
2406         Lisp_Object obj;
2407
2408         xzero(*str);
2409         str->other_end = stream;
2410         str->runoff = (unsigned_char_dynarr *) Dynarr_new(unsigned_char);
2411         str->eol_type = EOL_AUTODETECT;
2412         if (!strcmp(mode, "r") && Lstream_seekable_p(stream)) {
2413                 /* We can determine the coding system now. */
2414                 determine_real_coding_system(stream, &codesys, &str->eol_type);
2415         }
2416         set_decoding_stream_coding_system(lstr, codesys);
2417         str->decst.eol_type = str->eol_type;
2418         str->decst.mask = ~0;
2419         XSETLSTREAM(obj, lstr);
2420         return obj;
2421 }
2422
2423 Lisp_Object
2424 make_decoding_input_stream(lstream_t stream, Lisp_Object codesys)
2425 {
2426         return make_decoding_stream_1(stream, codesys, "r");
2427 }
2428
2429 Lisp_Object
2430 make_decoding_output_stream(lstream_t stream, Lisp_Object codesys)
2431 {
2432         return make_decoding_stream_1(stream, codesys, "w");
2433 }
2434
2435 /* Note: the decode_coding_* functions all take the same
2436    arguments as mule_decode(), which is to say some SRC data of
2437    size N, which is to be stored into dynamic array DST.
2438    DECODING is the stream within which the decoding is
2439    taking place, but no data is actually read from or
2440    written to that stream; that is handled in decoding_reader()
2441    or decoding_writer().  This allows the same functions to
2442    be used for both reading and writing. */
2443
2444 static void
2445 mule_decode(lstream_t decoding, const Extbyte * src,
2446             unsigned_char_dynarr * dst, Lstream_data_count n)
2447 {
2448         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
2449
2450         /* If necessary, do encoding-detection now.  We do this when
2451            we're a writing stream or a non-seekable reading stream,
2452            meaning that we can't just process the whole input,
2453            rewind, and start over. */
2454
2455         if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT ||
2456             str->eol_type == EOL_AUTODETECT) {
2457                 Lisp_Object codesys;
2458
2459                 XSETCODING_SYSTEM(codesys, str->codesys);
2460                 detect_coding_type(&str->decst, src, n,
2461                                    CODING_SYSTEM_TYPE(str->codesys) !=
2462                                    CODESYS_AUTODETECT);
2463                 if (CODING_SYSTEM_TYPE(str->codesys) == CODESYS_AUTODETECT &&
2464                     str->decst.mask != ~0)
2465                         /* #### This is cheesy.  What we really ought to do is
2466                            buffer up a certain amount of data so as to get a
2467                            less random result. */
2468                         codesys = coding_system_from_mask(str->decst.mask);
2469                 str->eol_type = str->decst.eol_type;
2470                 if (XCODING_SYSTEM(codesys) != str->codesys) {
2471                         /* Preserve the CODING_STATE_END flag in case it was set.
2472                            If we erase it, bad things might happen. */
2473                         int was_end = str->flags & CODING_STATE_END;
2474                         set_decoding_stream_coding_system(decoding, codesys);
2475                         if (was_end)
2476                                 str->flags |= CODING_STATE_END;
2477                 }
2478         }
2479
2480         switch (CODING_SYSTEM_TYPE(str->codesys)) {
2481 #ifdef DEBUG_SXEMACS
2482         case CODESYS_INTERNAL:
2483                 Dynarr_add_many(dst, src, n);
2484                 break;
2485 #endif
2486         case CODESYS_AUTODETECT:
2487                 /* If we got this far and still haven't decided on the coding
2488                    system, then do no conversion. */
2489         case CODESYS_NO_CONVERSION:
2490                 decode_coding_no_conversion(decoding, src, dst, n);
2491                 break;
2492 #ifdef MULE
2493         case CODESYS_SHIFT_JIS:
2494                 decode_coding_sjis(decoding, src, dst, n);
2495                 break;
2496         case CODESYS_BIG5:
2497                 decode_coding_big5(decoding, src, dst, n);
2498                 break;
2499         case CODESYS_UCS4:
2500                 decode_coding_ucs4(decoding, src, dst, n);
2501                 break;
2502         case CODESYS_UTF8:
2503                 decode_coding_utf8(decoding, src, dst, n);
2504                 break;
2505         case CODESYS_CCL:
2506                 str->ccl.last_block = str->flags & CODING_STATE_END;
2507                 /* When applying ccl program to stream, MUST NOT set NULL
2508                    pointer to src.  */
2509                 ccl_driver(&str->ccl,
2510                            (src
2511                             ? (const unsigned char *)src
2512                             : (const unsigned char *)""),
2513                            dst, n, 0, CCL_MODE_DECODING);
2514                 break;
2515         case CODESYS_ISO2022:
2516                 decode_coding_iso2022(decoding, src, dst, n);
2517                 break;
2518 #endif                          /* MULE */
2519         default:
2520                 abort();
2521         }
2522 }
2523
2524 DEFUN("decode-coding-region", Fdecode_coding_region, 3, 4, 0,   /*
2525 Decode the text between START and END which is encoded in CODING-SYSTEM.
2526 This is useful if you've read in encoded text from a file without decoding
2527 it (e.g. you read in a JIS-formatted file but used the `binary' or
2528 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2529 Return length of decoded text.
2530 BUFFER defaults to the current buffer if unspecified.
2531 */
2532       (start, end, coding_system, buffer))
2533 {
2534         Bufpos b, e;
2535         struct buffer *buf = decode_buffer(buffer, 0);
2536         Lisp_Object instream, lb_outstream, de_outstream, outstream;
2537         lstream_t istr, ostr;
2538         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2539
2540         get_buffer_range_char(buf, start, end, &b, &e, 0);
2541
2542         barf_if_buffer_read_only(buf, b, e);
2543
2544         coding_system = Fget_coding_system(coding_system);
2545         instream = make_lisp_buffer_input_stream(buf, b, e, 0);
2546         lb_outstream = make_lisp_buffer_output_stream(buf, b, 0);
2547         de_outstream = make_decoding_output_stream(XLSTREAM(lb_outstream),
2548                                                    coding_system);
2549         outstream = make_encoding_output_stream(XLSTREAM(de_outstream),
2550                                                 Fget_coding_system(Qbinary));
2551         istr = XLSTREAM(instream);
2552         ostr = XLSTREAM(outstream);
2553         GCPRO4(instream, lb_outstream, de_outstream, outstream);
2554
2555         /* The chain of streams looks like this:
2556
2557            [BUFFER] <----- send through
2558            ------> [ENCODE AS BINARY]
2559            ------> [DECODE AS SPECIFIED]
2560            ------> [BUFFER]
2561          */
2562
2563         while (1) {
2564                 char tempbuf[1024];     /* some random amount */
2565                 Bufpos newpos, even_newer_pos;
2566                 Bufpos oldpos = lisp_buffer_stream_startpos(istr);
2567                 Lstream_data_count size_in_bytes =
2568                     Lstream_read(istr, tempbuf, sizeof(tempbuf));
2569
2570                 if (!size_in_bytes)
2571                         break;
2572                 newpos = lisp_buffer_stream_startpos(istr);
2573                 Lstream_write(ostr, tempbuf, size_in_bytes);
2574                 even_newer_pos = lisp_buffer_stream_startpos(istr);
2575                 buffer_delete_range(buf, even_newer_pos - (newpos - oldpos),
2576                                     even_newer_pos, 0);
2577         }
2578         Lstream_close(istr);
2579         Lstream_close(ostr);
2580         UNGCPRO;
2581         Lstream_delete(istr);
2582         Lstream_delete(ostr);
2583         Lstream_delete(XLSTREAM(de_outstream));
2584         Lstream_delete(XLSTREAM(lb_outstream));
2585         return Qnil;
2586 }
2587 \f
2588 /************************************************************************/
2589 /*           Converting to an external encoding ("encoding")            */
2590 /************************************************************************/
2591
2592 /* An encoding stream is an output stream.  When you create the
2593    stream, you specify the coding system that governs the encoding
2594    and another stream that the resulting encoded data is to be
2595    sent to, and then start sending data to it. */
2596
2597 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2598
2599 typedef struct encoding_stream_s *encoding_stream_t;
2600 struct encoding_stream_s {
2601         /* Coding system that governs the conversion. */
2602         Lisp_Coding_System *codesys;
2603
2604         /* Stream that we read the encoded data from or
2605            write the decoded data to. */
2606         lstream_t other_end;
2607
2608         /* If we are reading, then we can return only a fixed amount of
2609            data, so if the conversion resulted in too much data, we store it
2610            here for retrieval the next time around. */
2611         unsigned_char_dynarr *runoff;
2612
2613         /* FLAGS holds flags indicating the current state of the encoding.
2614            Some of these flags are dependent on the coding system. */
2615         unsigned int flags;
2616
2617         /* CH holds a partially built-up character.  Since we only deal
2618            with one- and two-byte characters at the moment, we only use
2619            this to store the first byte of a two-byte character. */
2620         unsigned int ch;
2621 #ifdef MULE
2622         /* Additional information used by the ISO2022 encoder. */
2623         struct {
2624                 /* CHARSET holds the character sets currently assigned to the G0
2625                    through G3 registers.  It is initialized from the array
2626                    INITIAL_CHARSET in CODESYS. */
2627                 Lisp_Object charset[4];
2628
2629                 /* Which registers are currently invoked into the left (GL) and
2630                    right (GR) halves of the 8-bit encoding space? */
2631                 int register_left, register_right;
2632
2633                 /* Whether we need to explicitly designate the charset in the
2634                    G? register before using it.  It is initialized from the
2635                    array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2636                 unsigned char force_charset_on_output[4];
2637
2638                 /* Other state variables that need to be preserved across
2639                    invocations. */
2640                 Lisp_Object current_charset;
2641                 int current_half;
2642                 int current_char_boundary;
2643         } iso2022;
2644
2645         /* Additional information (the state of the running CCL program)
2646            used by the CCL encoder. */
2647         struct ccl_program ccl;
2648 #endif                          /* MULE */
2649 };
2650
2651 static Lstream_data_count
2652 encoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size);
2653 static Lstream_data_count
2654 encoding_writer(lstream_t stream,
2655                 const unsigned char *data, Lstream_data_count size);
2656 static int encoding_rewinder(lstream_t stream);
2657 static int encoding_seekable_p(lstream_t stream);
2658 static int encoding_flusher(lstream_t stream);
2659 static int encoding_closer(lstream_t stream);
2660
2661 static Lisp_Object encoding_marker(Lisp_Object stream);
2662
2663 DEFINE_LSTREAM_IMPLEMENTATION("encoding", lstream_encoding,
2664                               sizeof(struct encoding_stream_s));
2665
2666 static Lisp_Object
2667 encoding_marker(Lisp_Object stream)
2668 {
2669         lstream_t str = ENCODING_STREAM_DATA(XLSTREAM(stream))->other_end;
2670         Lisp_Object str_obj;
2671
2672         /* We do not need to mark the coding systems or charsets stored
2673            within the stream because they are stored in a global list
2674            and automatically marked. */
2675
2676         XSETLSTREAM(str_obj, str);
2677         mark_object(str_obj);
2678         if (str->imp->marker) {
2679                 return str->imp->marker(str_obj);
2680         } else {
2681                 return Qnil;
2682         }
2683 }
2684
2685 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2686    so we read data from the other end, encode it, and store it into DATA. */
2687
2688 static Lstream_data_count
2689 encoding_reader(lstream_t stream, unsigned char *data, Lstream_data_count size)
2690 {
2691         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2692         unsigned char *orig_data = data;
2693         Lstream_data_count read_size;
2694         int error_occurred = 0;
2695
2696         /* We need to interface to mule_encode(), which expects to take some
2697            amount of data and store the result into a Dynarr.  We have
2698            mule_encode() store into str->runoff, and take data from there
2699            as necessary. */
2700
2701         /* We loop until we have enough data, reading chunks from the other
2702            end and encoding it. */
2703         while (1) {
2704                 /* Take data from the runoff if we can.  Make sure to take at
2705                    most SIZE bytes, and delete the data from the runoff. */
2706                 if (Dynarr_length(str->runoff) > 0) {
2707                         int chunk = min((int)size, Dynarr_length(str->runoff));
2708                         memcpy(data, Dynarr_atp(str->runoff, 0), chunk);
2709                         Dynarr_delete_many(str->runoff, 0, chunk);
2710                         data += chunk;
2711                         size -= chunk;
2712                 }
2713
2714                 if (size == 0) {
2715                         /* No more room for data */
2716                         break;
2717                 }
2718
2719                 if (str->flags & CODING_STATE_END) {
2720                         /* This means that on the previous iteration, we hit the
2721                            EOF on the other end.  We loop once more so that
2722                            mule_encode() can output any final stuff it may be
2723                            holding, or any "go back to a sane state" escape
2724                            sequences. (This latter makes sense during
2725                            encoding.) */
2726                         break;
2727                 }
2728
2729                 /* Exhausted the runoff, so get some more.  DATA at least SIZE
2730                    bytes left of storage in it, so it's OK to read directly into
2731                    it.  (We'll be overwriting above, after we've encoded it into
2732                    the runoff.) */
2733                 read_size = Lstream_read(str->other_end, data, size);
2734                 if (read_size < 0) {
2735                         error_occurred = 1;
2736                         break;
2737                 }
2738                 if (read_size == 0) {
2739                         /* There might be some more end data produced in the
2740                            translation.  See the comment above. */
2741                         str->flags |= CODING_STATE_END;
2742                 }
2743                 mule_encode(stream, data, str->runoff, read_size);
2744         }
2745
2746         if (data == orig_data) {
2747                 return error_occurred ? -1 : 0;
2748         } else {
2749                 return data - orig_data;
2750         }
2751 }
2752
2753 static Lstream_data_count
2754 encoding_writer(lstream_t stream, const unsigned char *data,
2755                 Lstream_data_count size)
2756 {
2757         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2758         Lstream_data_count retval;
2759
2760         /* Encode all our data into the runoff, and then attempt to write
2761            it all out to the other end.  Remove whatever chunk we succeeded
2762            in writing. */
2763         mule_encode(stream, data, str->runoff, size);
2764         retval = Lstream_write(str->other_end, Dynarr_atp(str->runoff, 0),
2765                                Dynarr_length(str->runoff));
2766         if (retval > 0) {
2767                 Dynarr_delete_many(str->runoff, 0, retval);
2768         }
2769         /* Do NOT return retval.  The return value indicates how much
2770            of the incoming data was written, not how many bytes were
2771            written. */
2772         return size;
2773 }
2774
2775 static void
2776 reset_encoding_stream(encoding_stream_t str)
2777 {
2778 #ifdef MULE
2779         switch (CODING_SYSTEM_TYPE(str->codesys)) {
2780         case CODESYS_ISO2022: {
2781                 int i;
2782
2783                 for (i = 0; i < 4; i++) {
2784                         str->iso2022.charset[i] =
2785                                 CODING_SYSTEM_ISO2022_INITIAL_CHARSET(
2786                                         str->codesys, i);
2787                         str->iso2022.force_charset_on_output[i] =
2788                                 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT(
2789                                         str->codesys, i);
2790                 }
2791                 str->iso2022.register_left = 0;
2792                 str->iso2022.register_right = 1;
2793                 str->iso2022.current_charset = Qnil;
2794                 str->iso2022.current_half = 0;
2795                 str->iso2022.current_char_boundary = 1;
2796                 break;
2797         }
2798         case CODESYS_CCL:
2799                 setup_ccl_program(&str->ccl,
2800                                   CODING_SYSTEM_CCL_ENCODE(str->codesys));
2801                 break;
2802
2803                 /* list the rest of them lot explicitly */
2804         case CODESYS_AUTODETECT:
2805         case CODESYS_SHIFT_JIS:
2806         case CODESYS_BIG5:
2807         case CODESYS_UCS4:
2808         case CODESYS_UTF8:
2809         case CODESYS_NO_CONVERSION:
2810 #ifdef DEBUG_SXEMACS
2811         case CODESYS_INTERNAL:
2812 #endif
2813         default:
2814                 break;
2815         }
2816 #endif                          /* MULE */
2817
2818         str->flags = str->ch = 0;
2819 }
2820
2821 static int
2822 encoding_rewinder(lstream_t stream)
2823 {
2824         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2825         reset_encoding_stream(str);
2826         Dynarr_reset(str->runoff);
2827         return Lstream_rewind(str->other_end);
2828 }
2829
2830 static int
2831 encoding_seekable_p(lstream_t stream)
2832 {
2833         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2834         return Lstream_seekable_p(str->other_end);
2835 }
2836
2837 static int
2838 encoding_flusher(lstream_t stream)
2839 {
2840         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2841         return Lstream_flush(str->other_end);
2842 }
2843
2844 static int
2845 encoding_closer(lstream_t stream)
2846 {
2847         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2848         if (stream->flags & LSTREAM_FL_WRITE) {
2849                 str->flags |= CODING_STATE_END;
2850                 encoding_writer(stream, 0, 0);
2851         }
2852         Dynarr_free(str->runoff);
2853         return Lstream_close(str->other_end);
2854 }
2855
2856 Lisp_Object
2857 encoding_stream_coding_system(lstream_t stream)
2858 {
2859         Lisp_Object coding_system;
2860         encoding_stream_t str = ENCODING_STREAM_DATA(stream);
2861
2862         XSETCODING_SYSTEM(coding_system, str->codesys);
2863         return coding_system;
2864 }
2865
2866 void
2867 set_encoding_stream_coding_system(lstream_t lstr, Lisp_Object codesys)
2868 {
2869         Lisp_Coding_System *cs = XCODING_SYSTEM(codesys);
2870         encoding_stream_t str = ENCODING_STREAM_DATA(lstr);
2871         str->codesys = cs;
2872         reset_encoding_stream(str);
2873 }
2874
2875 static Lisp_Object
2876 make_encoding_stream_1(lstream_t stream, Lisp_Object codesys, const char *mode)
2877 {
2878         lstream_t lstr = Lstream_new(lstream_encoding, mode);
2879         encoding_stream_t str = ENCODING_STREAM_DATA(lstr);
2880         Lisp_Object obj;
2881
2882         xzero(*str);
2883         str->runoff = Dynarr_new(unsigned_char);
2884         str->other_end = stream;
2885         set_encoding_stream_coding_system(lstr, codesys);
2886         XSETLSTREAM(obj, lstr);
2887         return obj;
2888 }
2889
2890 Lisp_Object
2891 make_encoding_input_stream(lstream_t stream, Lisp_Object codesys)
2892 {
2893         return make_encoding_stream_1(stream, codesys, "r");
2894 }
2895
2896 Lisp_Object
2897 make_encoding_output_stream(lstream_t stream, Lisp_Object codesys)
2898 {
2899         return make_encoding_stream_1(stream, codesys, "w");
2900 }
2901
2902 /* Convert N bytes of internally-formatted data stored in SRC to an
2903    external format, according to the encoding stream ENCODING.
2904    Store the encoded data into DST. */
2905
2906 static void
2907 mule_encode(lstream_t encoding, const Bufbyte * src,
2908             unsigned_char_dynarr * dst, Lstream_data_count n)
2909 {
2910         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
2911
2912         switch (CODING_SYSTEM_TYPE(str->codesys)) {
2913 #ifdef DEBUG_SXEMACS
2914         case CODESYS_INTERNAL:
2915                 Dynarr_add_many(dst, src, n);
2916                 break;
2917 #endif
2918         case CODESYS_AUTODETECT:
2919                 /* If we got this far and still haven't decided on the coding
2920                    system, then do no conversion. */
2921         case CODESYS_NO_CONVERSION:
2922                 encode_coding_no_conversion(encoding, src, dst, n);
2923                 break;
2924 #ifdef MULE
2925         case CODESYS_SHIFT_JIS:
2926                 encode_coding_sjis(encoding, src, dst, n);
2927                 break;
2928         case CODESYS_BIG5:
2929                 encode_coding_big5(encoding, src, dst, n);
2930                 break;
2931         case CODESYS_UCS4:
2932                 encode_coding_ucs4(encoding, src, dst, n);
2933                 break;
2934         case CODESYS_UTF8:
2935                 encode_coding_utf8(encoding, src, dst, n);
2936                 break;
2937         case CODESYS_CCL:
2938                 str->ccl.last_block = str->flags & CODING_STATE_END;
2939                 /* When applying ccl program to stream, MUST NOT set NULL
2940                    pointer to src.  */
2941                 ccl_driver(&str->ccl, ((src) ? src : (unsigned char *)""),
2942                            dst, n, 0, CCL_MODE_ENCODING);
2943                 break;
2944         case CODESYS_ISO2022:
2945                 encode_coding_iso2022(encoding, src, dst, n);
2946                 break;
2947 #endif                          /* MULE */
2948         default:
2949                 abort();
2950         }
2951 }
2952
2953 DEFUN("encode-coding-region", Fencode_coding_region, 3, 4, 0,   /*
2954 Encode the text between START and END using CODING-SYSTEM.
2955 This will, for example, convert Japanese characters into stuff such as
2956 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
2957 text.  BUFFER defaults to the current buffer if unspecified.
2958 */
2959       (start, end, coding_system, buffer))
2960 {
2961         Bufpos b, e;
2962         struct buffer *buf = decode_buffer(buffer, 0);
2963         Lisp_Object instream, lb_outstream, de_outstream, outstream;
2964         lstream_t istr, ostr;
2965         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2966
2967         get_buffer_range_char(buf, start, end, &b, &e, 0);
2968
2969         barf_if_buffer_read_only(buf, b, e);
2970
2971         coding_system = Fget_coding_system(coding_system);
2972         instream = make_lisp_buffer_input_stream(buf, b, e, 0);
2973         lb_outstream = make_lisp_buffer_output_stream(buf, b, 0);
2974         de_outstream = make_decoding_output_stream(XLSTREAM(lb_outstream),
2975                                                    Fget_coding_system(Qbinary));
2976         outstream = make_encoding_output_stream(XLSTREAM(de_outstream),
2977                                                 coding_system);
2978         istr = XLSTREAM(instream);
2979         ostr = XLSTREAM(outstream);
2980         GCPRO4(instream, outstream, de_outstream, lb_outstream);
2981         /* The chain of streams looks like this:
2982
2983            [BUFFER] <----- send through
2984            ------> [ENCODE AS SPECIFIED]
2985            ------> [DECODE AS BINARY]
2986            ------> [BUFFER]
2987          */
2988         while (1) {
2989                 char tempbuf[1024];     /* some random amount */
2990                 Bufpos newpos, even_newer_pos;
2991                 Bufpos oldpos = lisp_buffer_stream_startpos(istr);
2992                 Lstream_data_count size_in_bytes =
2993                     Lstream_read(istr, tempbuf, sizeof(tempbuf));
2994
2995                 if (!size_in_bytes)
2996                         break;
2997                 newpos = lisp_buffer_stream_startpos(istr);
2998                 Lstream_write(ostr, tempbuf, size_in_bytes);
2999                 even_newer_pos = lisp_buffer_stream_startpos(istr);
3000                 buffer_delete_range(buf, even_newer_pos - (newpos - oldpos),
3001                                     even_newer_pos, 0);
3002         }
3003
3004         {
3005                 Charcount retlen =
3006                     lisp_buffer_stream_startpos(XLSTREAM(instream)) - b;
3007                 Lstream_close(istr);
3008                 Lstream_close(ostr);
3009                 UNGCPRO;
3010                 Lstream_delete(istr);
3011                 Lstream_delete(ostr);
3012                 Lstream_delete(XLSTREAM(de_outstream));
3013                 Lstream_delete(XLSTREAM(lb_outstream));
3014                 return make_int(retlen);
3015         }
3016 }
3017
3018 #ifdef MULE
3019 \f
3020 /************************************************************************/
3021 /*                          Shift-JIS methods                           */
3022 /************************************************************************/
3023
3024 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3025    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3026    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3027    encoded by "position-code + 0x80".  A character of JISX0208
3028    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3029    position-codes are divided and shifted so that it fit in the range
3030    below.
3031
3032    --- CODE RANGE of Shift-JIS ---
3033    (character set)      (range)
3034    ASCII                0x00 .. 0x7F
3035    JISX0201-Kana        0xA0 .. 0xDF
3036    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
3037             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3038    -------------------------------
3039
3040 */
3041
3042 /* Is this the first byte of a Shift-JIS two-byte char? */
3043
3044 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3045   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3046
3047 /* Is this the second byte of a Shift-JIS two-byte char? */
3048
3049 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3050   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3051
3052 #define BYTE_SJIS_KATAKANA_P(c) \
3053   ((c) >= 0xA1 && (c) <= 0xDF)
3054
3055 static int
3056 detect_coding_sjis(struct detection_state *st, const Extbyte * src,
3057                    Lstream_data_count n)
3058 {
3059         while (n--) {
3060                 const unsigned char c = *(const unsigned char *)src++;
3061                 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3062                         return 0;
3063                 if (st->shift_jis.in_second_byte) {
3064                         st->shift_jis.in_second_byte = 0;
3065                         if (c < 0x40)
3066                                 return 0;
3067                 } else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3068                         st->shift_jis.in_second_byte = 1;
3069         }
3070         return CODING_CATEGORY_SHIFT_JIS_MASK;
3071 }
3072
3073 /* Convert Shift-JIS data to internal format. */
3074
3075 static void
3076 decode_coding_sjis(lstream_t decoding, const Extbyte * src,
3077                    unsigned_char_dynarr * dst, Lstream_data_count n)
3078 {
3079         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3080         unsigned int flags = str->flags;
3081         unsigned int ch = str->ch;
3082         eol_type_t eol_type = str->eol_type;
3083
3084         while (n--) {
3085                 const unsigned char c = *(const unsigned char *)src++;
3086
3087                 if (ch) {
3088                         /* Previous character was first byte of Shift-JIS Kanji
3089                            char. */
3090                         if (BYTE_SJIS_TWO_BYTE_2_P(c)) {
3091                                 unsigned char e1, e2;
3092
3093                                 Dynarr_add(dst, LEADING_BYTE_JAPANESE_JISX0208);
3094                                 DECODE_SJIS(ch, c, e1, e2);
3095                                 Dynarr_add(dst, e1);
3096                                 Dynarr_add(dst, e2);
3097                         } else {
3098                                 DECODE_ADD_BINARY_CHAR(ch, dst);
3099                                 DECODE_ADD_BINARY_CHAR(c, dst);
3100                         }
3101                         ch = 0;
3102                 } else {
3103                         DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3104                         if (BYTE_SJIS_TWO_BYTE_1_P(c))
3105                                 ch = c;
3106                         else if (BYTE_SJIS_KATAKANA_P(c)) {
3107                                 Dynarr_add(dst, LEADING_BYTE_KATAKANA_JISX0201);
3108                                 Dynarr_add(dst, c);
3109                         } else
3110                                 DECODE_ADD_BINARY_CHAR(c, dst);
3111                 }
3112               label_continue_loop:;
3113         }
3114
3115         DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
3116
3117         str->flags = flags;
3118         str->ch = ch;
3119 }
3120
3121 /* Convert internally-formatted data to Shift-JIS. */
3122
3123 static void
3124 encode_coding_sjis(lstream_t encoding, const Bufbyte * src,
3125                    unsigned_char_dynarr * dst, Lstream_data_count n)
3126 {
3127         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3128         unsigned int flags = str->flags;
3129         unsigned int ch = str->ch;
3130         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3131
3132         while (n--) {
3133                 Bufbyte c = *src++;
3134                 if (c == '\n') {
3135                         if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3136                                 Dynarr_add(dst, '\r');
3137                         if (eol_type != EOL_CR)
3138                                 Dynarr_add(dst, '\n');
3139                         ch = 0;
3140                 } else if (BYTE_ASCII_P(c)) {
3141                         Dynarr_add(dst, c);
3142                         ch = 0;
3143                 } else if (BUFBYTE_LEADING_BYTE_P(c))
3144                         ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
3145                               c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3146                               c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
3147                 else if (ch) {
3148                         if (ch == LEADING_BYTE_KATAKANA_JISX0201) {
3149                                 Dynarr_add(dst, c);
3150                                 ch = 0;
3151                         } else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3152                                    ch == LEADING_BYTE_JAPANESE_JISX0208)
3153                                 ch = c;
3154                         else {
3155                                 /* j1 is bessel j1 function,
3156                                  * so we use something else */
3157                                 /* unsigned char j1, j2; */
3158                                 unsigned char tt1, tt2;
3159
3160                                 ENCODE_SJIS(ch, c, tt1, tt2);
3161                                 Dynarr_add(dst, tt1);
3162                                 Dynarr_add(dst, tt2);
3163                                 ch = 0;
3164                         }
3165                 }
3166         }
3167
3168         str->flags = flags;
3169         str->ch = ch;
3170 }
3171
3172 DEFUN("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3173 Decode a JISX0208 character of Shift-JIS coding-system.
3174 CODE is the character code in Shift-JIS as a cons of type bytes.
3175 Return the corresponding character.
3176 */
3177       (code))
3178 {
3179         unsigned char c1, c2, s1, s2;
3180
3181         CHECK_CONS(code);
3182         CHECK_INT(XCAR(code));
3183         CHECK_INT(XCDR(code));
3184         s1 = XINT(XCAR(code));
3185         s2 = XINT(XCDR(code));
3186         if (BYTE_SJIS_TWO_BYTE_1_P(s1) && BYTE_SJIS_TWO_BYTE_2_P(s2)) {
3187                 DECODE_SJIS(s1, s2, c1, c2);
3188                 return make_char(MAKE_CHAR(Vcharset_japanese_jisx0208,
3189                                            c1 & 0x7F, c2 & 0x7F));
3190         } else
3191                 return Qnil;
3192 }
3193
3194 DEFUN("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3195 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3196 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3197 */
3198       (character))
3199 {
3200         Lisp_Object charset;
3201         int c1, c2, s1, s2;
3202
3203         CHECK_CHAR_COERCE_INT(character);
3204         BREAKUP_CHAR(XCHAR(character), charset, c1, c2);
3205         if (EQ(charset, Vcharset_japanese_jisx0208)) {
3206                 ENCODE_SJIS(c1 | 0x80, c2 | 0x80, s1, s2);
3207                 return Fcons(make_int(s1), make_int(s2));
3208         } else
3209                 return Qnil;
3210 }
3211 \f
3212 /************************************************************************/
3213 /*                            Big5 methods                              */
3214 /************************************************************************/
3215
3216 /* BIG5 is a coding system encoding two character sets: ASCII and
3217    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3218    character set and is encoded in two-byte.
3219
3220    --- CODE RANGE of BIG5 ---
3221    (character set)      (range)
3222    ASCII                0x00 .. 0x7F
3223    Big5 (1st byte)      0xA1 .. 0xFE
3224         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3225    --------------------------
3226
3227    Since the number of characters in Big5 is larger than maximum
3228    characters in Emacs' charset (96x96), it can't be handled as one
3229    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
3230    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
3231    contains frequently used characters and the latter contains less
3232    frequently used characters.  */
3233
3234 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3235   ((c) >= 0xA1 && (c) <= 0xFE)
3236
3237 /* Is this the second byte of a Shift-JIS two-byte char? */
3238
3239 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3240   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3241
3242 /* Number of Big5 characters which have the same code in 1st byte.  */
3243
3244 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3245
3246 /* Code conversion macros.  These are macros because they are used in
3247    inner loops during code conversion.
3248
3249    Note that temporary variables in macros introduce the classic
3250    dynamic-scoping problems with variable names.  We use capital-
3251    lettered variables in the assumption that SXEmacs does not use
3252    capital letters in variables except in a very formalized way
3253    (e.g. Qstring). */
3254
3255 /* Convert Big5 code (b1, b2) into its internal string representation
3256    (lb, c1, c2). */
3257
3258 /* There is a much simpler way to split the Big5 charset into two.
3259    For the moment I'm going to leave the algorithm as-is because it
3260    claims to separate out the most-used characters into a single
3261    charset, which perhaps will lead to optimizations in various
3262    places.
3263
3264    The way the algorithm works is something like this:
3265
3266    Big5 can be viewed as a 94x157 charset, where the row is
3267    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3268    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
3269    the split between low and high column numbers is apparently
3270    meaningless; ascending rows produce less and less frequent chars.
3271    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3272    the first charset, and the upper half (0xC9 .. 0xFE) to the
3273    second.  To do the conversion, we convert the character into
3274    a single number where 0 .. 156 is the first row, 157 .. 313
3275    is the second, etc.  That way, the characters are ordered by
3276    decreasing frequency.  Then we just chop the space in two
3277    and coerce the result into a 94x94 space.
3278    */
3279
3280 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
3281 {                                                                       \
3282   int B1 = b1, B2 = b2;                                                 \
3283   unsigned int I                                                        \
3284     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
3285                                                                         \
3286   if (B1 < 0xC9)                                                        \
3287     {                                                                   \
3288       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
3289     }                                                                   \
3290   else                                                                  \
3291     {                                                                   \
3292       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
3293       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
3294     }                                                                   \
3295   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
3296   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
3297 } while (0)
3298
3299 /* Convert the internal string representation of a Big5 character
3300    (lb, c1, c2) into Big5 code (b1, b2). */
3301
3302 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
3303 {                                                                       \
3304   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
3305                                                                         \
3306   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
3307     {                                                                   \
3308       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
3309     }                                                                   \
3310   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
3311   b2 = I % BIG5_SAME_ROW;                                               \
3312   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
3313 } while (0)
3314
3315 static int
3316 detect_coding_big5(struct detection_state *st, const Extbyte * src,
3317                    Lstream_data_count n)
3318 {
3319         while (n--) {
3320                 const unsigned char c = *(const unsigned char *)src++;
3321                 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3322                     (c >= 0x80 && c <= 0xA0))
3323                         return 0;
3324                 if (st->big5.in_second_byte) {
3325                         st->big5.in_second_byte = 0;
3326                         if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3327                                 return 0;
3328                 } else if (c >= 0xA1)
3329                         st->big5.in_second_byte = 1;
3330         }
3331         return CODING_CATEGORY_BIG5_MASK;
3332 }
3333
3334 /* Convert Big5 data to internal format. */
3335
3336 static void
3337 decode_coding_big5(lstream_t decoding, const Extbyte * src,
3338                    unsigned_char_dynarr * dst, Lstream_data_count n)
3339 {
3340         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3341         unsigned int flags = str->flags;
3342         unsigned int ch = str->ch;
3343         eol_type_t eol_type = str->eol_type;
3344
3345         while (n--) {
3346                 const unsigned char c = *(const unsigned char *)src++;
3347                 if (ch) {
3348                         /* Previous character was first byte of Big5 char. */
3349                         if (BYTE_BIG5_TWO_BYTE_2_P(c)) {
3350                                 unsigned char b1, b2, b3;
3351                                 DECODE_BIG5(ch, c, b1, b2, b3);
3352                                 Dynarr_add(dst, b1);
3353                                 Dynarr_add(dst, b2);
3354                                 Dynarr_add(dst, b3);
3355                         } else {
3356                                 DECODE_ADD_BINARY_CHAR(ch, dst);
3357                                 DECODE_ADD_BINARY_CHAR(c, dst);
3358                         }
3359                         ch = 0;
3360                 } else {
3361                         DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3362                         if (BYTE_BIG5_TWO_BYTE_1_P(c))
3363                                 ch = c;
3364                         else
3365                                 DECODE_ADD_BINARY_CHAR(c, dst);
3366                 }
3367               label_continue_loop:;
3368         }
3369
3370         DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
3371
3372         str->flags = flags;
3373         str->ch = ch;
3374 }
3375
3376 /* Convert internally-formatted data to Big5. */
3377
3378 static void
3379 encode_coding_big5(lstream_t encoding, const Bufbyte * src,
3380                    unsigned_char_dynarr * dst, Lstream_data_count n)
3381 {
3382         unsigned char c;
3383         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3384         unsigned int flags = str->flags;
3385         unsigned int ch = str->ch;
3386         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3387
3388         while (n--) {
3389                 c = *src++;
3390                 if (c == '\n') {
3391                         if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3392                                 Dynarr_add(dst, '\r');
3393                         if (eol_type != EOL_CR)
3394                                 Dynarr_add(dst, '\n');
3395                 } else if (BYTE_ASCII_P(c)) {
3396                         /* ASCII. */
3397                         Dynarr_add(dst, c);
3398                 } else if (BUFBYTE_LEADING_BYTE_P(c)) {
3399                         if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3400                             c == LEADING_BYTE_CHINESE_BIG5_2) {
3401                                 /* A recognized leading byte. */
3402                                 ch = c;
3403                                 continue;       /* not done with this character. */
3404                         }
3405                         /* otherwise just ignore this character. */
3406                 } else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3407                            ch == LEADING_BYTE_CHINESE_BIG5_2) {
3408                         /* Previous char was a recognized leading byte. */
3409                         ch = (ch << 8) | c;
3410                         continue;       /* not done with this character. */
3411                 } else if (ch) {
3412                         /* Encountering second byte of a Big5 character. */
3413                         unsigned char b1, b2;
3414
3415                         ENCODE_BIG5(ch >> 8, ch & 0xFF, c, b1, b2);
3416                         Dynarr_add(dst, b1);
3417                         Dynarr_add(dst, b2);
3418                 }
3419
3420                 ch = 0;
3421         }
3422
3423         str->flags = flags;
3424         str->ch = ch;
3425 }
3426
3427 DEFUN("decode-big5-char", Fdecode_big5_char, 1, 1, 0,   /*
3428 Decode a Big5 character CODE of BIG5 coding-system.
3429 CODE is the character code in BIG5, a cons of two integers.
3430 Return the corresponding character.
3431 */
3432       (code))
3433 {
3434         unsigned char c1, c2, b1, b2;
3435
3436         CHECK_CONS(code);
3437         CHECK_INT(XCAR(code));
3438         CHECK_INT(XCDR(code));
3439         b1 = XINT(XCAR(code));
3440         b2 = XINT(XCDR(code));
3441         if (BYTE_BIG5_TWO_BYTE_1_P(b1) && BYTE_BIG5_TWO_BYTE_2_P(b2)) {
3442                 int leading_byte;
3443                 Lisp_Object charset;
3444                 DECODE_BIG5(b1, b2, leading_byte, c1, c2);
3445                 charset = CHARSET_BY_LEADING_BYTE(leading_byte);
3446                 return make_char(MAKE_CHAR(charset, c1 & 0x7F, c2 & 0x7F));
3447         } else
3448                 return Qnil;
3449 }
3450
3451 DEFUN("encode-big5-char", Fencode_big5_char, 1, 1, 0,   /*
3452 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3453 Return the corresponding character code in Big5.
3454 */
3455       (character))
3456 {
3457         Lisp_Object charset;
3458         int c1, c2, b1, b2;
3459
3460         CHECK_CHAR_COERCE_INT(character);
3461         BREAKUP_CHAR(XCHAR(character), charset, c1, c2);
3462         if (EQ(charset, Vcharset_chinese_big5_1) ||
3463             EQ(charset, Vcharset_chinese_big5_2)) {
3464                 ENCODE_BIG5(XCHARSET_LEADING_BYTE(charset), c1 | 0x80,
3465                             c2 | 0x80, b1, b2);
3466                 return Fcons(make_int(b1), make_int(b2));
3467         } else
3468                 return Qnil;
3469 }
3470 \f
3471 /************************************************************************/
3472 /*                           UCS-4 methods                              */
3473 /*                                                                      */
3474 /*  UCS-4 character codes are implemented as nonnegative integers.      */
3475 /*                                                                      */
3476 /************************************************************************/
3477
3478 DEFUN("set-ucs-char", Fset_ucs_char, 2, 2, 0,   /*
3479 Map UCS-4 code CODE to Mule character CHARACTER.
3480
3481 Return T on success, NIL on failure.
3482 */
3483       (code, character))
3484 {
3485         size_t c;
3486
3487         CHECK_CHAR(character);
3488         CHECK_NATNUM(code);
3489         c = XINT(code);
3490
3491         if (c < countof(fcd->ucs_to_mule_table)) {
3492                 fcd->ucs_to_mule_table[c] = character;
3493                 return Qt;
3494         } else
3495                 return Qnil;
3496 }
3497
3498 static Lisp_Object ucs_to_char(unsigned long code)
3499 {
3500         if (code < countof(fcd->ucs_to_mule_table)) {
3501                 return fcd->ucs_to_mule_table[code];
3502         } else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14)) {
3503                 unsigned int c;
3504
3505                 code -= 0xe00000;
3506                 c = code % (94 * 94);
3507                 return make_char
3508                     (MAKE_CHAR(CHARSET_BY_ATTRIBUTES
3509                                (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3510                                 CHARSET_LEFT_TO_RIGHT),
3511                                c / 94 + 33, c % 94 + 33));
3512         } else
3513                 return Qnil;
3514 }
3515
3516 DEFUN("ucs-char", Fucs_char, 1, 1, 0,   /*
3517 Return Mule character corresponding to UCS code CODE (a positive integer).
3518 */
3519       (code))
3520 {
3521         CHECK_NATNUM(code);
3522         return ucs_to_char(XINT(code));
3523 }
3524
3525 DEFUN("set-char-ucs", Fset_char_ucs, 2, 2, 0,   /*
3526 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3527 */
3528       (character, code))
3529 {
3530         /* #### Isn't this gilding the lily?  Fput_char_table checks its args.
3531            Fset_char_ucs is more restrictive on index arg, but should
3532            check code arg in a char_table method. */
3533         CHECK_CHAR(character);
3534         CHECK_NATNUM(code);
3535         return Fput_char_table(character, code, mule_to_ucs_table);
3536 }
3537
3538 DEFUN("char-ucs", Fchar_ucs, 1, 1, 0,   /*
3539 Return the UCS code (a positive integer) corresponding to CHARACTER.
3540 */
3541       (character))
3542 {
3543         return Fget_char_table(character, mule_to_ucs_table);
3544 }
3545
3546 /* Decode a UCS-4 character into a buffer.  If the lookup fails, use
3547    <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3548    is not found, instead.
3549    #### do something more appropriate (use blob?)
3550         Danger, Will Robinson!  Data loss.  Should we signal user? */
3551 static void decode_ucs4(unsigned long ch, unsigned_char_dynarr * dst)
3552 {
3553         Lisp_Object chr = ucs_to_char(ch);
3554
3555         if (!NILP(chr)) {
3556                 Bufbyte work[MAX_EMCHAR_LEN];
3557                 int len;
3558
3559                 ch = XCHAR(chr);
3560                 len = (ch < 128) ?
3561                     simple_set_charptr_emchar(work, ch) :
3562                     non_ascii_set_charptr_emchar(work, ch);
3563                 Dynarr_add_many(dst, work, len);
3564         } else {
3565                 Dynarr_add(dst, LEADING_BYTE_JAPANESE_JISX0208);
3566                 Dynarr_add(dst, 34 + 128);
3567                 Dynarr_add(dst, 46 + 128);
3568         }
3569 }
3570
3571 static unsigned long
3572 mule_char_to_ucs4(Lisp_Object charset, unsigned char h, unsigned char l)
3573 {
3574         Lisp_Object code
3575             = Fget_char_table(make_char(MAKE_CHAR(charset, h & 127, l & 127)),
3576                               mule_to_ucs_table);
3577
3578         if (INTP(code)) {
3579                 return XINT(code);
3580         } else if ((XCHARSET_DIMENSION(charset) == 2) &&
3581                    (XCHARSET_CHARS(charset) == 94)) {
3582                 unsigned char final = XCHARSET_FINAL(charset);
3583
3584                 if (('@' <= final) && (final < 0x7f)) {
3585                         return 0xe00000 + (final - '@') * 94 * 94
3586                             + ((h & 127) - 33) * 94 + (l & 127) - 33;
3587                 } else {
3588                         return '?';
3589                 }
3590         } else {
3591                 return '?';
3592         }
3593 }
3594
3595 static void
3596 encode_ucs4(Lisp_Object charset,
3597             unsigned char h, unsigned char l, unsigned_char_dynarr * dst)
3598 {
3599         unsigned long code = mule_char_to_ucs4(charset, h, l);
3600         Dynarr_add(dst, code >> 24);
3601         Dynarr_add(dst, (code >> 16) & 255);
3602         Dynarr_add(dst, (code >> 8) & 255);
3603         Dynarr_add(dst, code & 255);
3604 }
3605
3606 static int
3607 detect_coding_ucs4(struct detection_state *st, const Extbyte * src,
3608                    Lstream_data_count n)
3609 {
3610         while (n--) {
3611                 const unsigned char c = *(const unsigned char *)src++;
3612                 switch (st->ucs4.in_byte) {
3613                 case 0:
3614                         if (c >= 128)
3615                                 return 0;
3616                         else
3617                                 st->ucs4.in_byte++;
3618                         break;
3619                 case 3:
3620                         st->ucs4.in_byte = 0;
3621                         break;
3622                 default:
3623                         st->ucs4.in_byte++;
3624                 }
3625         }
3626         return CODING_CATEGORY_UCS4_MASK;
3627 }
3628
3629 static void
3630 decode_coding_ucs4(lstream_t decoding, const Extbyte * src,
3631                    unsigned_char_dynarr * dst, Lstream_data_count n)
3632 {
3633         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3634         unsigned int flags = str->flags;
3635         unsigned int ch = str->ch;
3636         unsigned char counter = str->counter;
3637
3638         while (n--) {
3639                 const unsigned char c = *(const unsigned char *)src++;
3640                 switch (counter) {
3641                 case 0:
3642                         ch = c;
3643                         counter = 3;
3644                         break;
3645                 case 1:
3646                         decode_ucs4((ch << 8) | c, dst);
3647                         ch = 0;
3648                         counter = 0;
3649                         break;
3650                 default:
3651                         ch = (ch << 8) | c;
3652                         counter--;
3653                 }
3654         }
3655         if (counter & CODING_STATE_END)
3656                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
3657
3658         str->flags = flags;
3659         str->ch = ch;
3660         str->counter = counter;
3661 }
3662
3663 static void
3664 encode_coding_ucs4(lstream_t encoding, const Bufbyte * src,
3665                    unsigned_char_dynarr * dst, Lstream_data_count n)
3666 {
3667         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3668         unsigned int flags = str->flags;
3669         unsigned int ch = str->ch;
3670         unsigned char char_boundary = str->iso2022.current_char_boundary;
3671         Lisp_Object charset = str->iso2022.current_charset;
3672
3673 #ifdef ENABLE_COMPOSITE_CHARS
3674         /* flags for handling composite chars.  We do a little switcharoo
3675            on the source while we're outputting the composite char. */
3676         unsigned int saved_n = 0;
3677         const unsigned char *saved_src = NULL;
3678         int in_composite = 0;
3679
3680       back_to_square_n:
3681 #endif
3682
3683         while (n--) {
3684                 unsigned char c = *src++;
3685
3686                 if (BYTE_ASCII_P(c)) {  /* Processing ASCII character */
3687                         ch = 0;
3688                         encode_ucs4(Vcharset_ascii, c, 0, dst);
3689                         char_boundary = 1;
3690                 } else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) {   /* Processing Leading Byte */
3691                         ch = 0;
3692                         charset = CHARSET_BY_LEADING_BYTE(c);
3693                         if (LEADING_BYTE_PREFIX_P(c))
3694                                 ch = c;
3695                         char_boundary = 0;
3696                 } else {        /* Processing Non-ASCII character */
3697                         char_boundary = 1;
3698                         if (EQ(charset, Vcharset_control_1)) {
3699                                 encode_ucs4(Vcharset_control_1, c, 0, dst);
3700                         } else {
3701                                 switch (XCHARSET_REP_BYTES(charset)) {
3702                                 case 2:
3703                                         encode_ucs4(charset, c, 0, dst);
3704                                         break;
3705                                 case 3:
3706                                         if (XCHARSET_PRIVATE_P(charset)) {
3707                                                 encode_ucs4(charset, c, 0, dst);
3708                                                 ch = 0;
3709                                         } else if (ch) {
3710 #ifdef ENABLE_COMPOSITE_CHARS
3711                                                 if (EQ
3712                                                     (charset,
3713                                                      Vcharset_composite)) {
3714                                                         if (in_composite) {
3715                                                                 /* #### Bother! We don't know how to
3716                                                                    handle this yet. */
3717                                                                 Dynarr_add(dst,
3718                                                                            '\0');
3719                                                                 Dynarr_add(dst,
3720                                                                            '\0');
3721                                                                 Dynarr_add(dst,
3722                                                                            '\0');
3723                                                                 Dynarr_add(dst,
3724                                                                            '~');
3725                                                         } else {
3726                                                                 Emchar emch =
3727                                                                     MAKE_CHAR
3728                                                                     (Vcharset_composite,
3729                                                                      ch & 0x7F,
3730                                                                      c & 0x7F);
3731                                                                 Lisp_Object lstr
3732                                                                     =
3733                                                                     composite_char_string
3734                                                                     (emch);
3735                                                                 saved_n = n;
3736                                                                 saved_src = src;
3737                                                                 in_composite =
3738                                                                     1;
3739                                                                 src =
3740                                                                     XSTRING_DATA
3741                                                                     (lstr);
3742                                                                 n = XSTRING_LENGTH(lstr);
3743                                                         }
3744                                                 } else
3745 #endif                          /* ENABLE_COMPOSITE_CHARS */
3746                                                 {
3747                                                         encode_ucs4(charset, ch,
3748                                                                     c, dst);
3749                                                 }
3750                                                 ch = 0;
3751                                         } else {
3752                                                 ch = c;
3753                                                 char_boundary = 0;
3754                                         }
3755                                         break;
3756                                 case 4:
3757                                         if (ch) {
3758                                                 encode_ucs4(charset, ch, c,
3759                                                             dst);
3760                                                 ch = 0;
3761                                         } else {
3762                                                 ch = c;
3763                                                 char_boundary = 0;
3764                                         }
3765                                         break;
3766                                 default:
3767                                         abort();
3768                                 }
3769                         }
3770                 }
3771         }
3772
3773 #ifdef ENABLE_COMPOSITE_CHARS
3774         if (in_composite) {
3775                 n = saved_n;
3776                 src = saved_src;
3777                 in_composite = 0;
3778                 goto back_to_square_n;  /* Wheeeeeeeee ..... */
3779         }
3780 #endif                          /* ENABLE_COMPOSITE_CHARS */
3781
3782         str->flags = flags;
3783         str->ch = ch;
3784         str->iso2022.current_char_boundary = char_boundary;
3785         str->iso2022.current_charset = charset;
3786
3787         /* Verbum caro factum est! */
3788 }
3789 \f
3790 /************************************************************************/
3791 /*                           UTF-8 methods                              */
3792 /************************************************************************/
3793
3794 static int
3795 detect_coding_utf8(struct detection_state *st, const Extbyte * src,
3796                    Lstream_data_count n)
3797 {
3798         while (n--) {
3799                 const unsigned char c = *(const unsigned char *)src++;
3800                 switch (st->utf8.in_byte) {
3801                 case 0:
3802                         if (c == ISO_CODE_ESC || c == ISO_CODE_SI
3803                             || c == ISO_CODE_SO)
3804                                 return 0;
3805                         else if (c >= 0xfc)
3806                                 st->utf8.in_byte = 5;
3807                         else if (c >= 0xf8)
3808                                 st->utf8.in_byte = 4;
3809                         else if (c >= 0xf0)
3810                                 st->utf8.in_byte = 3;
3811                         else if (c >= 0xe0)
3812                                 st->utf8.in_byte = 2;
3813                         else if (c >= 0xc0)
3814                                 st->utf8.in_byte = 1;
3815                         else if (c >= 0x80)
3816                                 return 0;
3817                         break;
3818                 default:
3819                         if ((c & 0xc0) != 0x80)
3820                                 return 0;
3821                         else
3822                                 st->utf8.in_byte--;
3823                 }
3824         }
3825         return CODING_CATEGORY_UTF8_MASK;
3826 }
3827
3828 static void
3829 decode_coding_utf8(lstream_t decoding, const Extbyte * src,
3830                    unsigned_char_dynarr * dst, Lstream_data_count n)
3831 {
3832         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
3833         unsigned int flags = str->flags;
3834         unsigned int ch = str->ch;
3835         eol_type_t eol_type = str->eol_type;
3836         unsigned char counter = str->counter;
3837
3838         while (n--) {
3839                 const unsigned char c = *(const unsigned char *)src++;
3840                 switch (counter) {
3841                 case 0:
3842                         if (c >= 0xfc) {
3843                                 ch = c & 0x01;
3844                                 counter = 5;
3845                         } else if (c >= 0xf8) {
3846                                 ch = c & 0x03;
3847                                 counter = 4;
3848                         } else if (c >= 0xf0) {
3849                                 ch = c & 0x07;
3850                                 counter = 3;
3851                         } else if (c >= 0xe0) {
3852                                 ch = c & 0x0f;
3853                                 counter = 2;
3854                         } else if (c >= 0xc0) {
3855                                 ch = c & 0x1f;
3856                                 counter = 1;
3857                         } else {
3858                                 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
3859                                 decode_ucs4(c, dst);
3860                         }
3861                         break;
3862                 case 1:
3863                         ch = (ch << 6) | (c & 0x3f);
3864                         decode_ucs4(ch, dst);
3865                         ch = 0;
3866                         counter = 0;
3867                         break;
3868                 default:
3869                         ch = (ch << 6) | (c & 0x3f);
3870                         counter--;
3871                 }
3872               label_continue_loop:;
3873         }
3874
3875         if (flags & CODING_STATE_END)
3876                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
3877
3878         str->flags = flags;
3879         str->ch = ch;
3880         str->counter = counter;
3881 }
3882
3883 static void
3884 encode_utf8(Lisp_Object charset,
3885             unsigned char h, unsigned char l, unsigned_char_dynarr * dst)
3886 {
3887         unsigned long code = mule_char_to_ucs4(charset, h, l);
3888         if (code <= 0x7f) {
3889                 Dynarr_add(dst, code);
3890         } else if (code <= 0x7ff) {
3891                 Dynarr_add(dst, (code >> 6) | 0xc0);
3892                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3893         } else if (code <= 0xffff) {
3894                 Dynarr_add(dst, (code >> 12) | 0xe0);
3895                 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3896                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3897         } else if (code <= 0x1fffff) {
3898                 Dynarr_add(dst, (code >> 18) | 0xf0);
3899                 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3900                 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3901                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3902         } else if (code <= 0x3ffffff) {
3903                 Dynarr_add(dst, (code >> 24) | 0xf8);
3904                 Dynarr_add(dst, ((code >> 18) & 0x3f) | 0x80);
3905                 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3906                 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3907                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3908         } else {
3909                 Dynarr_add(dst, (code >> 30) | 0xfc);
3910                 Dynarr_add(dst, ((code >> 24) & 0x3f) | 0x80);
3911                 Dynarr_add(dst, ((code >> 18) & 0x3f) | 0x80);
3912                 Dynarr_add(dst, ((code >> 12) & 0x3f) | 0x80);
3913                 Dynarr_add(dst, ((code >> 6) & 0x3f) | 0x80);
3914                 Dynarr_add(dst, (code & 0x3f) | 0x80);
3915         }
3916 }
3917
3918 static void
3919 encode_coding_utf8(lstream_t encoding, const Bufbyte * src,
3920                    unsigned_char_dynarr * dst, Lstream_data_count n)
3921 {
3922         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
3923         unsigned int flags = str->flags;
3924         unsigned int ch = str->ch;
3925         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
3926         unsigned char char_boundary = str->iso2022.current_char_boundary;
3927         Lisp_Object charset = str->iso2022.current_charset;
3928
3929 #ifdef ENABLE_COMPOSITE_CHARS
3930         /* flags for handling composite chars.  We do a little switcharoo
3931            on the source while we're outputting the composite char. */
3932         unsigned int saved_n = 0;
3933         const unsigned char *saved_src = NULL;
3934         int in_composite = 0;
3935
3936       back_to_square_n:
3937 #endif                          /* ENABLE_COMPOSITE_CHARS */
3938
3939         while (n--) {
3940                 unsigned char c = *src++;
3941
3942                 if (BYTE_ASCII_P(c)) {  /* Processing ASCII character */
3943                         ch = 0;
3944                         if (c == '\n') {
3945                                 if (eol_type != EOL_LF
3946                                     && eol_type != EOL_AUTODETECT)
3947                                         Dynarr_add(dst, '\r');
3948                                 if (eol_type != EOL_CR)
3949                                         Dynarr_add(dst, c);
3950                         } else
3951                                 encode_utf8(Vcharset_ascii, c, 0, dst);
3952                         char_boundary = 1;
3953                 } else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) {   /* Processing Leading Byte */
3954                         ch = 0;
3955                         charset = CHARSET_BY_LEADING_BYTE(c);
3956                         if (LEADING_BYTE_PREFIX_P(c))
3957                                 ch = c;
3958                         char_boundary = 0;
3959                 } else {        /* Processing Non-ASCII character */
3960                         char_boundary = 1;
3961                         if (EQ(charset, Vcharset_control_1)) {
3962                                 encode_utf8(Vcharset_control_1, c, 0, dst);
3963                         } else {
3964                                 switch (XCHARSET_REP_BYTES(charset)) {
3965                                 case 2:
3966                                         encode_utf8(charset, c, 0, dst);
3967                                         break;
3968                                 case 3:
3969                                         if (XCHARSET_PRIVATE_P(charset)) {
3970                                                 encode_utf8(charset, c, 0, dst);
3971                                                 ch = 0;
3972                                         } else if (ch) {
3973 #ifdef ENABLE_COMPOSITE_CHARS
3974                                                 if (EQ
3975                                                     (charset,
3976                                                      Vcharset_composite)) {
3977                                                         if (in_composite) {
3978                                                                 /* #### Bother! We don't know how to
3979                                                                    handle this yet. */
3980                                                                 encode_utf8
3981                                                                     (Vcharset_ascii,
3982                                                                      '~', 0,
3983                                                                      dst);
3984                                                         } else {
3985                                                                 Emchar emch =
3986                                                                     MAKE_CHAR
3987                                                                     (Vcharset_composite,
3988                                                                      ch & 0x7F,
3989                                                                      c & 0x7F);
3990                                                                 Lisp_Object lstr
3991                                                                     =
3992                                                                     composite_char_string
3993                                                                     (emch);
3994                                                                 saved_n = n;
3995                                                                 saved_src = src;
3996                                                                 in_composite =
3997                                                                     1;
3998                                                                 src =
3999                                                                     XSTRING_DATA
4000                                                                     (lstr);
4001                                                                 n = XSTRING_LENGTH(lstr);
4002                                                         }
4003                                                 } else
4004 #endif                          /* ENABLE_COMPOSITE_CHARS */
4005                                                 {
4006                                                         encode_utf8(charset, ch,
4007                                                                     c, dst);
4008                                                 }
4009                                                 ch = 0;
4010                                         } else {
4011                                                 ch = c;
4012                                                 char_boundary = 0;
4013                                         }
4014                                         break;
4015                                 case 4:
4016                                         if (ch) {
4017                                                 encode_utf8(charset, ch, c,
4018                                                             dst);
4019                                                 ch = 0;
4020                                         } else {
4021                                                 ch = c;
4022                                                 char_boundary = 0;
4023                                         }
4024                                         break;
4025                                 default:
4026                                         abort();
4027                                 }
4028                         }
4029                 }
4030         }
4031
4032 #ifdef ENABLE_COMPOSITE_CHARS
4033         if (in_composite) {
4034                 n = saved_n;
4035                 src = saved_src;
4036                 in_composite = 0;
4037                 goto back_to_square_n;  /* Wheeeeeeeee ..... */
4038         }
4039 #endif
4040
4041         str->flags = flags;
4042         str->ch = ch;
4043         str->iso2022.current_char_boundary = char_boundary;
4044         str->iso2022.current_charset = charset;
4045
4046         /* Verbum caro factum est! */
4047 }
4048 \f
4049 /************************************************************************/
4050 /*                           ISO2022 methods                            */
4051 /************************************************************************/
4052
4053 /* The following note describes the coding system ISO2022 briefly.
4054    Since the intention of this note is to help understand the
4055    functions in this file, some parts are NOT ACCURATE or OVERLY
4056    SIMPLIFIED.  For thorough understanding, please refer to the
4057    original document of ISO2022.
4058
4059    ISO2022 provides many mechanisms to encode several character sets
4060    in 7-bit and 8-bit environments.  For 7-bit environments, all text
4061    is encoded using bytes less than 128.  This may make the encoded
4062    text a little bit longer, but the text passes more easily through
4063    several gateways, some of which strip off MSB (Most Signigant Bit).
4064
4065    There are two kinds of character sets: control character set and
4066    graphic character set.  The former contains control characters such
4067    as `newline' and `escape' to provide control functions (control
4068    functions are also provided by escape sequences).  The latter
4069    contains graphic characters such as 'A' and '-'.  Emacs recognizes
4070    two control character sets and many graphic character sets.
4071
4072    Graphic character sets are classified into one of the following
4073    four classes, according to the number of bytes (DIMENSION) and
4074    number of characters in one dimension (CHARS) of the set:
4075    - DIMENSION1_CHARS94
4076    - DIMENSION1_CHARS96
4077    - DIMENSION2_CHARS94
4078    - DIMENSION2_CHARS96
4079
4080    In addition, each character set is assigned an identification tag,
4081    unique for each set, called "final character" (denoted as <F>
4082    hereafter).  The <F> of each character set is decided by ECMA(*)
4083    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
4084    (0x30..0x3F are for private use only).
4085
4086    Note (*): ECMA = European Computer Manufacturers Association
4087
4088    Here are examples of graphic character set [NAME(<F>)]:
4089         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4090         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4091         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4092         o DIMENSION2_CHARS96 -- none for the moment
4093
4094    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4095         C0 [0x00..0x1F] -- control character plane 0
4096         GL [0x20..0x7F] -- graphic character plane 0
4097         C1 [0x80..0x9F] -- control character plane 1
4098         GR [0xA0..0xFF] -- graphic character plane 1
4099
4100    A control character set is directly designated and invoked to C0 or
4101    C1 by an escape sequence.  The most common case is that:
4102    - ISO646's  control character set is designated/invoked to C0, and
4103    - ISO6429's control character set is designated/invoked to C1,
4104    and usually these designations/invocations are omitted in encoded
4105    text.  In a 7-bit environment, only C0 can be used, and a control
4106    character for C1 is encoded by an appropriate escape sequence to
4107    fit into the environment.  All control characters for C1 are
4108    defined to have corresponding escape sequences.
4109
4110    A graphic character set is at first designated to one of four
4111    graphic registers (G0 through G3), then these graphic registers are
4112    invoked to GL or GR.  These designations and invocations can be
4113    done independently.  The most common case is that G0 is invoked to
4114    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
4115    these invocations and designations are omitted in encoded text.
4116    In a 7-bit environment, only GL can be used.
4117
4118    When a graphic character set of CHARS94 is invoked to GL, codes
4119    0x20 and 0x7F of the GL area work as control characters SPACE and
4120    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4121    be used.
4122
4123    There are two ways of invocation: locking-shift and single-shift.
4124    With locking-shift, the invocation lasts until the next different
4125    invocation, whereas with single-shift, the invocation affects the
4126    following character only and doesn't affect the locking-shift
4127    state.  Invocations are done by the following control characters or
4128    escape sequences:
4129
4130    ----------------------------------------------------------------------
4131    abbrev  function                  cntrl escape seq   description
4132    ----------------------------------------------------------------------
4133    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
4134    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
4135    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
4136    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
4137    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
4138    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
4139    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
4140    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
4141    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
4142    ----------------------------------------------------------------------
4143    (*) These are not used by any known coding system.
4144
4145    Control characters for these functions are defined by macros
4146    ISO_CODE_XXX in `coding.h'.
4147
4148    Designations are done by the following escape sequences:
4149    ----------------------------------------------------------------------
4150    escape sequence      description
4151    ----------------------------------------------------------------------
4152    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
4153    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
4154    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
4155    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
4156    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
4157    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
4158    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
4159    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
4160    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
4161    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
4162    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
4163    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
4164    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
4165    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
4166    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
4167    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
4168    ----------------------------------------------------------------------
4169
4170    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4171    of dimension 1, chars 94, and final character <F>, etc...
4172
4173    Note (*): Although these designations are not allowed in ISO2022,
4174    Emacs accepts them on decoding, and produces them on encoding
4175    CHARS96 character sets in a coding system which is characterized as
4176    7-bit environment, non-locking-shift, and non-single-shift.
4177
4178    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4179    '(' can be omitted.  We refer to this as "short-form" hereafter.
4180
4181    Now you may notice that there are a lot of ways for encoding the
4182    same multilingual text in ISO2022.  Actually, there exist many
4183    coding systems such as Compound Text (used in X11's inter client
4184    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4185    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4186    localized platforms), and all of these are variants of ISO2022.
4187
4188    In addition to the above, Emacs handles two more kinds of escape
4189    sequences: ISO6429's direction specification and Emacs' private
4190    sequence for specifying character composition.
4191
4192    ISO6429's direction specification takes the following form:
4193         o CSI ']'      -- end of the current direction
4194         o CSI '0' ']'  -- end of the current direction
4195         o CSI '1' ']'  -- start of left-to-right text
4196         o CSI '2' ']'  -- start of right-to-left text
4197    The control character CSI (0x9B: control sequence introducer) is
4198    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4199
4200    Character composition specification takes the following form:
4201         o ESC '0' -- start character composition
4202         o ESC '1' -- end character composition
4203    Since these are not standard escape sequences of any ISO standard,
4204    their use with these meanings is restricted to Emacs only.  */
4205
4206 static void
4207 reset_iso2022(Lisp_Object coding_system, struct iso2022_decoder *iso)
4208 {
4209         int i;
4210
4211         for (i = 0; i < 4; i++) {
4212                 if (!NILP(coding_system))
4213                         iso->charset[i] =
4214                             XCODING_SYSTEM_ISO2022_INITIAL_CHARSET
4215                             (coding_system, i);
4216                 else
4217                         iso->charset[i] = Qt;
4218                 iso->invalid_designated[i] = 0;
4219         }
4220         iso->esc = ISO_ESC_NOTHING;
4221         iso->esc_bytes_index = 0;
4222         iso->register_left = 0;
4223         iso->register_right = 1;
4224         iso->switched_dir_and_no_valid_charset_yet = 0;
4225         iso->invalid_switch_dir = 0;
4226         iso->output_direction_sequence = 0;
4227         iso->output_literally = 0;
4228 #ifdef ENABLE_COMPOSITE_CHARS
4229         if (iso->composite_chars)
4230                 Dynarr_reset(iso->composite_chars);
4231 #endif
4232 }
4233
4234 static int fit_to_be_escape_quoted(unsigned char c)
4235 {
4236         switch (c) {
4237         case ISO_CODE_ESC:
4238         case ISO_CODE_CSI:
4239         case ISO_CODE_SS2:
4240         case ISO_CODE_SS3:
4241         case ISO_CODE_SO:
4242         case ISO_CODE_SI:
4243                 return 1;
4244
4245         default:
4246                 return 0;
4247         }
4248 }
4249
4250 /* Parse one byte of an ISO2022 escape sequence.
4251    If the result is an invalid escape sequence, return 0 and
4252    do not change anything in STR.  Otherwise, if the result is
4253    an incomplete escape sequence, update ISO2022.ESC and
4254    ISO2022.ESC_BYTES and return -1.  Otherwise, update
4255    all the state variables (but not ISO2022.ESC_BYTES) and
4256    return 1.
4257
4258    If CHECK_INVALID_CHARSETS is non-zero, check for designation
4259    or invocation of an invalid character set and treat that as
4260    an unrecognized escape sequence.
4261
4262    ********************************************************************
4263
4264    #### Strategies for error annotation and coding orthogonalization
4265
4266    We really want to separate out a number of things.  Conceptually,
4267    there is a nested syntax.
4268
4269    At the top level is the ISO 2022 extension syntax, including charset
4270    designation and invocation, and certain auxiliary controls such as the
4271    ISO 6429 direction specification.  These are octet-oriented, with the
4272    single exception (AFAIK) of the "exit Unicode" sequence which uses the
4273    UTF's natural width (1 byte for UTF-7 and UTF-8, 2 bytes for UCS-2 and
4274    UTF-16, and 4 bytes for UCS-4 and UTF-32).  This will be treated as a
4275    (deprecated) special case in Unicode processing.
4276
4277    The middle layer is ISO 2022 character interpretation.  This will depend
4278    on the current state of the ISO 2022 registers, and assembles octets
4279    into the character's internal representation.
4280
4281    The lowest level is translating system control conventions.  At present
4282    this is restricted to newline translation, but one could imagine doing
4283    tab conversion or line wrapping here.  "Escape from Unicode" processing
4284    would be done at this level.
4285
4286    At each level the parser will verify the syntax.  In the case of a
4287    syntax error or warning (such as a redundant escape sequence that affects
4288    no characters), the parser will take some action, typically inserting the
4289    erroneous octets directly into the output and creating an annotation
4290    which can be used by higher level I/O to mark the affected region.
4291
4292    This should make it possible to do something sensible about separating
4293    newline convention processing from character construction, and about
4294    preventing ISO 2022 escape sequences from being recognized
4295    inappropriately.
4296
4297    The basic strategy will be to have octet classification tables, and
4298    switch processing according to the table entry.
4299
4300    It's possible that, by doing the processing with tables of functions or
4301    the like, the parser can be used for both detection and translation. */
4302
4303 static int
4304 parse_iso2022_esc(Lisp_Object codesys, struct iso2022_decoder *iso,
4305                   unsigned char c, unsigned int *flags,
4306                   int check_invalid_charsets)
4307 {
4308         /* (1) If we're at the end of a designation sequence, CS is the
4309            charset being designated and REG is the register to designate
4310            it to.
4311
4312            (2) If we're at the end of a locking-shift sequence, REG is
4313            the register to invoke and HALF (0 == left, 1 == right) is
4314            the half to invoke it into.
4315
4316            (3) If we're at the end of a single-shift sequence, REG is
4317            the register to invoke. */
4318         Lisp_Object cs = Qnil;
4319         int reg, half;
4320
4321         /* NOTE: This code does goto's all over the fucking place.
4322            The reason for this is that we're basically implementing
4323            a state machine here, and hierarchical languages like C
4324            don't really provide a clean way of doing this. */
4325
4326         if (!(*flags & CODING_STATE_ESCAPE))
4327                 /* At beginning of escape sequence; we need to reset our
4328                    escape-state variables. */
4329                 iso->esc = ISO_ESC_NOTHING;
4330
4331         iso->output_literally = 0;
4332         iso->output_direction_sequence = 0;
4333
4334         switch (iso->esc) {
4335         case ISO_ESC_NOTHING:
4336                 iso->esc_bytes_index = 0;
4337                 switch (c) {
4338                 case ISO_CODE_ESC:      /* Start escape sequence */
4339                         *flags |= CODING_STATE_ESCAPE;
4340                         iso->esc = ISO_ESC;
4341                         goto not_done;
4342
4343                 case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
4344                         *flags |= CODING_STATE_ESCAPE;
4345                         iso->esc = ISO_ESC_5_11;
4346                         goto not_done;
4347
4348                 case ISO_CODE_SO:       /* locking shift 1 */
4349                         reg = 1;
4350                         half = 0;
4351                         goto locking_shift;
4352                 case ISO_CODE_SI:       /* locking shift 0 */
4353                         reg = 0;
4354                         half = 0;
4355                         goto locking_shift;
4356
4357                 case ISO_CODE_SS2:      /* single shift */
4358                         reg = 2;
4359                         goto single_shift;
4360                 case ISO_CODE_SS3:      /* single shift */
4361                         reg = 3;
4362                         goto single_shift;
4363
4364                 default:        /* Other control characters */
4365                         return 0;
4366                 }
4367
4368         case ISO_ESC:
4369                 switch (c) {
4370           /**** single shift ****/
4371
4372                 case 'N':       /* single shift 2 */
4373                         reg = 2;
4374                         goto single_shift;
4375                 case 'O':       /* single shift 3 */
4376                         reg = 3;
4377                         goto single_shift;
4378
4379           /**** locking shift ****/
4380
4381                 case '~':       /* locking shift 1 right */
4382                         reg = 1;
4383                         half = 1;
4384                         goto locking_shift;
4385                 case 'n':       /* locking shift 2 */
4386                         reg = 2;
4387                         half = 0;
4388                         goto locking_shift;
4389                 case '}':       /* locking shift 2 right */
4390                         reg = 2;
4391                         half = 1;
4392                         goto locking_shift;
4393                 case 'o':       /* locking shift 3 */
4394                         reg = 3;
4395                         half = 0;
4396                         goto locking_shift;
4397                 case '|':       /* locking shift 3 right */
4398                         reg = 3;
4399                         half = 1;
4400                         goto locking_shift;
4401
4402 #ifdef ENABLE_COMPOSITE_CHARS
4403           /**** composite ****/
4404
4405                 case '0':
4406                         iso->esc = ISO_ESC_START_COMPOSITE;
4407                         *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4408                             CODING_STATE_COMPOSITE;
4409                         return 1;
4410
4411                 case '1':
4412                         iso->esc = ISO_ESC_END_COMPOSITE;
4413                         *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4414                             ~CODING_STATE_COMPOSITE;
4415                         return 1;
4416 #endif                          /* ENABLE_COMPOSITE_CHARS */
4417
4418           /**** directionality ****/
4419
4420                 case '[':
4421                         iso->esc = ISO_ESC_5_11;
4422                         goto not_done;
4423
4424           /**** designation ****/
4425
4426                 case '$':       /* multibyte charset prefix */
4427                         iso->esc = ISO_ESC_2_4;
4428                         goto not_done;
4429
4430                 default:
4431                         if (0x28 <= c && c <= 0x2F) {
4432                                 iso->esc =
4433                                     (enum iso_esc_flag)(c - 0x28 + ISO_ESC_2_8);
4434                                 goto not_done;
4435                         }
4436
4437                         /* This function is called with CODESYS equal to nil when
4438                            doing coding-system detection. */
4439                         if (!NILP(codesys)
4440                             && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
4441                             && fit_to_be_escape_quoted(c)) {
4442                                 iso->esc = ISO_ESC_LITERAL;
4443                                 *flags &= CODING_STATE_ISO2022_LOCK;
4444                                 return 1;
4445                         }
4446
4447                         /* bzzzt! */
4448                         return 0;
4449                 }
4450
4451       /**** directionality ****/
4452
4453         case ISO_ESC_5_11:      /* ISO6429 direction control */
4454                 if (c == ']') {
4455                         *flags &=
4456                             (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4457                         goto directionality;
4458                 }
4459                 if (c == '0')
4460                         iso->esc = ISO_ESC_5_11_0;
4461                 else if (c == '1')
4462                         iso->esc = ISO_ESC_5_11_1;
4463                 else if (c == '2')
4464                         iso->esc = ISO_ESC_5_11_2;
4465                 else
4466                         return 0;
4467                 goto not_done;
4468
4469         case ISO_ESC_5_11_0:
4470                 if (c == ']') {
4471                         *flags &=
4472                             (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4473                         goto directionality;
4474                 }
4475                 return 0;
4476
4477         case ISO_ESC_5_11_1:
4478                 if (c == ']') {
4479                         *flags =
4480                             (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4481                         goto directionality;
4482                 }
4483                 return 0;
4484
4485         case ISO_ESC_5_11_2:
4486                 if (c == ']') {
4487                         *flags =
4488                             (*flags & CODING_STATE_ISO2022_LOCK) |
4489                             CODING_STATE_R2L;
4490                         goto directionality;
4491                 }
4492                 return 0;
4493
4494         directionality:
4495                 iso->esc = ISO_ESC_DIRECTIONALITY;
4496                 /* Various junk here to attempt to preserve the direction
4497                    sequences literally in the text if they would otherwise be
4498                    swallowed due to invalid designations that don't show up as
4499                    actual charset changes in the text. */
4500                 if (iso->invalid_switch_dir) {
4501                         /* We already inserted a direction switch literally into
4502                            the text.  We assume (#### this may not be right)
4503                            that the next direction switch is the one going the
4504                            other way, and we need to output that literally as
4505                            well. */
4506                         iso->output_literally = 1;
4507                         iso->invalid_switch_dir = 0;
4508                 } else {
4509                         int jj;
4510
4511                         /* If we are in the thrall of an invalid designation,
4512                            then stick the directionality sequence literally into
4513                            the output stream so it ends up in the original text
4514                            again. */
4515                         for (jj = 0; jj < 4; jj++)
4516                                 if (iso->invalid_designated[jj])
4517                                         break;
4518                         if (jj < 4) {
4519                                 iso->output_literally = 1;
4520                                 iso->invalid_switch_dir = 1;
4521                         } else
4522                                 /* Indicate that we haven't yet seen a valid
4523                                    designation, so that if a switch-dir is
4524                                    directly followed by an invalid designation,
4525                                    both get inserted literally. */
4526                                 iso->switched_dir_and_no_valid_charset_yet = 1;
4527                 }
4528                 return 1;
4529
4530                 /**** designation ****/
4531
4532         case ISO_ESC_2_4:
4533                 if (0x28 <= c && c <= 0x2F) {
4534                         iso->esc =
4535                             (enum iso_esc_flag)(c - 0x28 + ISO_ESC_2_4_8);
4536                         goto not_done;
4537                 }
4538                 if (0x40 <= c && c <= 0x42) {
4539                         cs = CHARSET_BY_ATTRIBUTES(CHARSET_TYPE_94X94, c,
4540                                                    *flags & CODING_STATE_R2L ?
4541                                                    CHARSET_RIGHT_TO_LEFT :
4542                                                    CHARSET_LEFT_TO_RIGHT);
4543                         reg = 0;
4544                         goto designated;
4545                 }
4546                 return 0;
4547
4548                 /* list the rest */
4549         case ISO_ESC_2_8:
4550         case ISO_ESC_2_9:
4551         case ISO_ESC_2_10:
4552         case ISO_ESC_2_11:
4553         case ISO_ESC_2_12:
4554         case ISO_ESC_2_13:
4555         case ISO_ESC_2_14:
4556         case ISO_ESC_2_15:
4557         case ISO_ESC_2_4_8:
4558         case ISO_ESC_2_4_9:
4559         case ISO_ESC_2_4_10:
4560         case ISO_ESC_2_4_11:
4561         case ISO_ESC_2_4_12:
4562         case ISO_ESC_2_4_13:
4563         case ISO_ESC_2_4_14:
4564         case ISO_ESC_2_4_15:
4565         case ISO_ESC_SINGLE_SHIFT:
4566         case ISO_ESC_LOCKING_SHIFT:
4567         case ISO_ESC_DESIGNATE:
4568         case ISO_ESC_DIRECTIONALITY:
4569         case ISO_ESC_LITERAL:
4570
4571         default: {
4572                 int type = -1;
4573
4574                 if (c < '0' || c > '~')
4575                         return 0;       /* bad final byte */
4576
4577                 if (iso->esc >= ISO_ESC_2_8 && iso->esc <= ISO_ESC_2_15) {
4578                         type = ((iso->esc >= ISO_ESC_2_12) ?
4579                                 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4580                         reg = (iso->esc - ISO_ESC_2_8) & 3;
4581                 } else if (iso->esc >= ISO_ESC_2_4_8 &&
4582                            iso->esc <= ISO_ESC_2_4_15) {
4583                         type = ((iso->esc >= ISO_ESC_2_4_12) ?
4584                                 CHARSET_TYPE_96X96 :
4585                                 CHARSET_TYPE_94X94);
4586                         reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4587                 } else {
4588                         /* Can this ever be reached? -slb */
4589                         abort();
4590                         return 0;
4591                 }
4592
4593                 cs = CHARSET_BY_ATTRIBUTES(type, c,
4594                                            *flags & CODING_STATE_R2L ?
4595                                            CHARSET_RIGHT_TO_LEFT :
4596                                            CHARSET_LEFT_TO_RIGHT);
4597                 goto designated;
4598         }
4599         }
4600
4601       not_done:
4602         iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char)c;
4603         return -1;
4604
4605       single_shift:
4606         if (check_invalid_charsets && !CHARSETP(iso->charset[reg]))
4607                 /* can't invoke something that ain't there. */
4608                 return 0;
4609         iso->esc = ISO_ESC_SINGLE_SHIFT;
4610         *flags &= CODING_STATE_ISO2022_LOCK;
4611         if (reg == 2)
4612                 *flags |= CODING_STATE_SS2;
4613         else
4614                 *flags |= CODING_STATE_SS3;
4615         return 1;
4616
4617       locking_shift:
4618         if (check_invalid_charsets && !CHARSETP(iso->charset[reg]))
4619                 /* can't invoke something that ain't there. */
4620                 return 0;
4621         if (half)
4622                 iso->register_right = reg;
4623         else
4624                 iso->register_left = reg;
4625         *flags &= CODING_STATE_ISO2022_LOCK;
4626         iso->esc = ISO_ESC_LOCKING_SHIFT;
4627         return 1;
4628
4629       designated:
4630         if (NILP(cs) && check_invalid_charsets) {
4631                 iso->invalid_designated[reg] = 1;
4632                 iso->charset[reg] = Vcharset_ascii;
4633                 iso->esc = ISO_ESC_DESIGNATE;
4634                 *flags &= CODING_STATE_ISO2022_LOCK;
4635                 iso->output_literally = 1;
4636                 if (iso->switched_dir_and_no_valid_charset_yet) {
4637                         /* We encountered a switch-direction followed by an
4638                            invalid designation.  Ensure that the switch-direction
4639                            gets outputted; otherwise it will probably get eaten
4640                            when the text is written out again. */
4641                         iso->switched_dir_and_no_valid_charset_yet = 0;
4642                         iso->output_direction_sequence = 1;
4643                         /* And make sure that the switch-dir going the other
4644                            way gets outputted, as well. */
4645                         iso->invalid_switch_dir = 1;
4646                 }
4647                 return 1;
4648         }
4649         /* This function is called with CODESYS equal to nil when
4650            doing coding-system detection. */
4651         if (!NILP(codesys)) {
4652                 charset_conversion_spec_dynarr *dyn =
4653                     XCODING_SYSTEM(codesys)->iso2022.input_conv;
4654
4655                 if (dyn) {
4656                         int i;
4657
4658                         for (i = 0; i < Dynarr_length(dyn); i++) {
4659                                 struct charset_conversion_spec *spec =
4660                                     Dynarr_atp(dyn, i);
4661                                 if (EQ(cs, spec->from_charset))
4662                                         cs = spec->to_charset;
4663                         }
4664                 }
4665         }
4666
4667         iso->charset[reg] = cs;
4668         iso->esc = ISO_ESC_DESIGNATE;
4669         *flags &= CODING_STATE_ISO2022_LOCK;
4670         if (iso->invalid_designated[reg]) {
4671                 iso->invalid_designated[reg] = 0;
4672                 iso->output_literally = 1;
4673         }
4674         if (iso->switched_dir_and_no_valid_charset_yet)
4675                 iso->switched_dir_and_no_valid_charset_yet = 0;
4676         return 1;
4677 }
4678
4679 static int
4680 detect_coding_iso2022(struct detection_state *st, const Extbyte * src,
4681                       Lstream_data_count n)
4682 {
4683         int mask;
4684
4685         /* #### There are serious deficiencies in the recognition mechanism
4686            here.  This needs to be much smarter if it's going to cut it.
4687            The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4688            it should be detected as Latin-1.
4689            All the ISO2022 stuff in this file should be synced up with the
4690            code from FSF Emacs-20.4, in which Mule should be more or less stable.
4691            Perhaps we should wait till R2L works in FSF Emacs? */
4692
4693         if (!st->iso2022.initted) {
4694                 reset_iso2022(Qnil, &st->iso2022.iso);
4695                 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4696                                     CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4697                                     CODING_CATEGORY_ISO_8_1_MASK |
4698                                     CODING_CATEGORY_ISO_8_2_MASK |
4699                                     CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4700                 st->iso2022.flags = 0;
4701                 st->iso2022.high_byte_count = 0;
4702                 st->iso2022.saw_single_shift = 0;
4703                 st->iso2022.initted = 1;
4704         }
4705
4706         mask = st->iso2022.mask;
4707
4708         while (n--) {
4709                 const unsigned char c = *(const unsigned char *)src++;
4710                 if (c >= 0xA0) {
4711                         mask &= ~CODING_CATEGORY_ISO_7_MASK;
4712                         st->iso2022.high_byte_count++;
4713                 } else {
4714                         if (st->iso2022.high_byte_count
4715                             && !st->iso2022.saw_single_shift) {
4716                                 if (st->iso2022.high_byte_count & 1)
4717                                         /* odd number of high bytes; assume not iso-8-2 */
4718                                         mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4719                         }
4720                         st->iso2022.high_byte_count = 0;
4721                         st->iso2022.saw_single_shift = 0;
4722                         if (c > 0x80)
4723                                 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4724                 }
4725                 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4726                     && (BYTE_C0_P(c) || BYTE_C1_P(c))) {        /* control chars */
4727                         switch (c) {
4728                                 /* Allow and ignore control characters that you might
4729                                    reasonably see in a text file */
4730                         case '\r':
4731                         case '\n':
4732                         case '\t':
4733                         case 7: /* bell */
4734                         case 8: /* backspace */
4735                         case 11:        /* vertical tab */
4736                         case 12:        /* form feed */
4737                         case 26:        /* MS-DOS C-z junk */
4738                         case 31:        /* '^_' -- for info */
4739                                 goto label_continue_loop;
4740
4741                         default:
4742                                 break;
4743                         }
4744                 }
4745
4746                 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P(c)
4747                     || BYTE_C1_P(c)) {
4748                         if (parse_iso2022_esc(Qnil, &st->iso2022.iso, c,
4749                                               &st->iso2022.flags, 0)) {
4750                                 switch (st->iso2022.iso.esc) {
4751                                 case ISO_ESC_DESIGNATE:
4752                                         mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4753                                         mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4754                                         break;
4755                                 case ISO_ESC_LOCKING_SHIFT:
4756                                         mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4757                                         goto ran_out_of_chars;
4758                                 case ISO_ESC_SINGLE_SHIFT:
4759                                         mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4760                                         st->iso2022.saw_single_shift = 1;
4761                                         break;
4762
4763                                         /* list the rest */
4764                                 case ISO_ESC_NOTHING:
4765                                 case ISO_ESC:
4766                                 case ISO_ESC_2_4:
4767                                 case ISO_ESC_2_8:
4768                                 case ISO_ESC_2_9:
4769                                 case ISO_ESC_2_10:
4770                                 case ISO_ESC_2_11:
4771                                 case ISO_ESC_2_12:
4772                                 case ISO_ESC_2_13:
4773                                 case ISO_ESC_2_14:
4774                                 case ISO_ESC_2_15:
4775                                 case ISO_ESC_2_4_8:
4776                                 case ISO_ESC_2_4_9:
4777                                 case ISO_ESC_2_4_10:
4778                                 case ISO_ESC_2_4_11:
4779                                 case ISO_ESC_2_4_12:
4780                                 case ISO_ESC_2_4_13:
4781                                 case ISO_ESC_2_4_14:
4782                                 case ISO_ESC_2_4_15:
4783                                 case ISO_ESC_5_11:
4784                                 case ISO_ESC_5_11_0:
4785                                 case ISO_ESC_5_11_1:
4786                                 case ISO_ESC_5_11_2:
4787                                 case ISO_ESC_DIRECTIONALITY:
4788                                 case ISO_ESC_LITERAL:
4789                                 default:
4790                                         break;
4791                                 }
4792                         } else {
4793                                 mask = 0;
4794                                 goto ran_out_of_chars;
4795                         }
4796                 }
4797         label_continue_loop:;
4798         }
4799
4800 ran_out_of_chars:
4801         return mask;
4802 }
4803
4804 static int postprocess_iso2022_mask(int mask)
4805 {
4806         /* #### kind of cheesy */
4807         /* If seven-bit ISO is allowed, then assume that the encoding is
4808            entirely seven-bit and turn off the eight-bit ones. */
4809         if (mask & CODING_CATEGORY_ISO_7_MASK)
4810                 mask &= ~(CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4811                           CODING_CATEGORY_ISO_8_1_MASK |
4812                           CODING_CATEGORY_ISO_8_2_MASK);
4813         return mask;
4814 }
4815
4816 /* If FLAGS is a null pointer or specifies right-to-left motion,
4817    output a switch-dir-to-left-to-right sequence to DST.
4818    Also update FLAGS if it is not a null pointer.
4819    If INTERNAL_P is set, we are outputting in internal format and
4820    need to handle the CSI differently. */
4821
4822 static void
4823 restore_left_to_right_direction(Lisp_Coding_System * codesys,
4824                                 unsigned_char_dynarr * dst,
4825                                 unsigned int *flags, int internal_p)
4826 {
4827         if (!flags || (*flags & CODING_STATE_R2L)) {
4828                 if (CODING_SYSTEM_ISO2022_SEVEN(codesys)) {
4829                         Dynarr_add(dst, ISO_CODE_ESC);
4830                         Dynarr_add(dst, '[');
4831                 } else if (internal_p)
4832                         DECODE_ADD_BINARY_CHAR(ISO_CODE_CSI, dst);
4833                 else
4834                         Dynarr_add(dst, ISO_CODE_CSI);
4835                 Dynarr_add(dst, '0');
4836                 Dynarr_add(dst, ']');
4837                 if (flags)
4838                         *flags &= ~CODING_STATE_R2L;
4839         }
4840 }
4841
4842 /* If FLAGS is a null pointer or specifies a direction different from
4843    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4844    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4845    sequence to DST.  Also update FLAGS if it is not a null pointer.
4846    If INTERNAL_P is set, we are outputting in internal format and
4847    need to handle the CSI differently. */
4848
4849 static void
4850 ensure_correct_direction(int direction, Lisp_Coding_System * codesys,
4851                          unsigned_char_dynarr * dst, unsigned int *flags,
4852                          int internal_p)
4853 {
4854         if ((!flags || (*flags & CODING_STATE_R2L)) &&
4855             direction == CHARSET_LEFT_TO_RIGHT)
4856                 restore_left_to_right_direction(codesys, dst, flags,
4857                                                 internal_p);
4858         else if (!CODING_SYSTEM_ISO2022_NO_ISO6429(codesys)
4859                  && (!flags || !(*flags & CODING_STATE_R2L)) &&
4860                  direction == CHARSET_RIGHT_TO_LEFT) {
4861                 if (CODING_SYSTEM_ISO2022_SEVEN(codesys)) {
4862                         Dynarr_add(dst, ISO_CODE_ESC);
4863                         Dynarr_add(dst, '[');
4864                 } else if (internal_p)
4865                         DECODE_ADD_BINARY_CHAR(ISO_CODE_CSI, dst);
4866                 else
4867                         Dynarr_add(dst, ISO_CODE_CSI);
4868                 Dynarr_add(dst, '2');
4869                 Dynarr_add(dst, ']');
4870                 if (flags)
4871                         *flags |= CODING_STATE_R2L;
4872         }
4873 }
4874
4875 /* Convert ISO2022-format data to internal format. */
4876
4877 static void
4878 decode_coding_iso2022(lstream_t decoding, const Extbyte * src,
4879                       unsigned_char_dynarr * dst, Lstream_data_count n)
4880 {
4881         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
4882         unsigned int flags = str->flags;
4883         unsigned int ch = str->ch;
4884         eol_type_t eol_type = str->eol_type;
4885 #ifdef ENABLE_COMPOSITE_CHARS
4886         unsigned_char_dynarr *real_dst = dst;
4887 #endif
4888         Lisp_Object coding_system;
4889
4890         XSETCODING_SYSTEM(coding_system, str->codesys);
4891
4892 #ifdef ENABLE_COMPOSITE_CHARS
4893         if (flags & CODING_STATE_COMPOSITE)
4894                 dst = str->iso2022.composite_chars;
4895 #endif                          /* ENABLE_COMPOSITE_CHARS */
4896
4897         while (n--) {
4898                 const unsigned char c = *(const unsigned char *)src++;
4899                 if (flags & CODING_STATE_ESCAPE) {
4900                         /* Within ESC sequence */
4901                         int retval = parse_iso2022_esc(
4902                                 coding_system, &str->iso2022, c, &flags, 1);
4903
4904                         if (retval) {
4905                                 switch (str->iso2022.esc) {
4906 #ifdef ENABLE_COMPOSITE_CHARS
4907                                 case ISO_ESC_START_COMPOSITE:
4908                                         if (str->iso2022.composite_chars)
4909                                                 Dynarr_reset(str->iso2022.
4910                                                              composite_chars);
4911                                         else
4912                                                 str->iso2022.composite_chars =
4913                                                     Dynarr_new(unsigned_char);
4914                                         dst = str->iso2022.composite_chars;
4915                                         break;
4916                                 case ISO_ESC_END_COMPOSITE:
4917                                         {
4918                                                 Bufbyte comstr[MAX_EMCHAR_LEN];
4919                                                 Bytecount len;
4920                                                 Emchar emch =
4921                                                     lookup_composite_char
4922                                                     (Dynarr_atp(dst, 0),
4923                                                      Dynarr_length(dst));
4924                                                 dst = real_dst;
4925                                                 len =
4926                                                     set_charptr_emchar(comstr,
4927                                                                        emch);
4928                                                 Dynarr_add_many(dst, comstr,
4929                                                                 len);
4930                                                 break;
4931                                         }
4932 #endif                          /* ENABLE_COMPOSITE_CHARS */
4933
4934                                 case ISO_ESC_LITERAL:
4935                                         DECODE_ADD_BINARY_CHAR(c, dst);
4936                                         break;
4937
4938                                 case ISO_ESC_NOTHING:
4939                                 case ISO_ESC:
4940                                 case ISO_ESC_2_4:
4941                                 case ISO_ESC_2_8:
4942                                 case ISO_ESC_2_9:
4943                                 case ISO_ESC_2_10:
4944                                 case ISO_ESC_2_11:
4945                                 case ISO_ESC_2_12:
4946                                 case ISO_ESC_2_13:
4947                                 case ISO_ESC_2_14:
4948                                 case ISO_ESC_2_15:
4949                                 case ISO_ESC_2_4_8:
4950                                 case ISO_ESC_2_4_9:
4951                                 case ISO_ESC_2_4_10:
4952                                 case ISO_ESC_2_4_11:
4953                                 case ISO_ESC_2_4_12:
4954                                 case ISO_ESC_2_4_13:
4955                                 case ISO_ESC_2_4_14:
4956                                 case ISO_ESC_2_4_15:
4957                                 case ISO_ESC_5_11:
4958                                 case ISO_ESC_5_11_0:
4959                                 case ISO_ESC_5_11_1:
4960                                 case ISO_ESC_5_11_2:
4961                                 case ISO_ESC_SINGLE_SHIFT:
4962                                 case ISO_ESC_LOCKING_SHIFT:
4963                                 case ISO_ESC_DESIGNATE:
4964                                 case ISO_ESC_DIRECTIONALITY:
4965
4966                                 default:
4967                                         /* Everything else handled already */
4968                                         break;
4969                                 }
4970                         }
4971
4972                         /* Attempted error recovery. */
4973                         if (str->iso2022.output_direction_sequence)
4974                                 ensure_correct_direction(flags &
4975                                                          CODING_STATE_R2L ?
4976                                                          CHARSET_RIGHT_TO_LEFT :
4977                                                          CHARSET_LEFT_TO_RIGHT,
4978                                                          str->codesys, dst, 0,
4979                                                          1);
4980                         /* More error recovery. */
4981                         if (!retval || str->iso2022.output_literally) {
4982                                 /* Output the (possibly invalid) sequence */
4983                                 int i;
4984                                 for (i = 0; i < str->iso2022.esc_bytes_index;
4985                                      i++)
4986                                         DECODE_ADD_BINARY_CHAR(str->iso2022.
4987                                                                esc_bytes[i],
4988                                                                dst);
4989                                 flags &= CODING_STATE_ISO2022_LOCK;
4990                                 if (!retval)
4991                                         n++, src--;     /* Repeat the loop with the same character. */
4992                                 else {
4993                                         /* No sense in reprocessing the final byte of the
4994                                            escape sequence; it could mess things up anyway.
4995                                            Just add it now. */
4996                                         DECODE_ADD_BINARY_CHAR(c, dst);
4997                                 }
4998                         }
4999                         ch = 0;
5000                 } else if (BYTE_C0_P(c) || BYTE_C1_P(c)) {      /* Control characters */
5001
5002           /***** Error-handling *****/
5003
5004                         /* If we were in the middle of a character, dump out the
5005                            partial character. */
5006                         DECODE_OUTPUT_PARTIAL_CHAR(ch);
5007
5008                         /* If we just saw a single-shift character, dump it out.
5009                            This may dump out the wrong sort of single-shift character,
5010                            but least it will give an indication that something went
5011                            wrong. */
5012                         if (flags & CODING_STATE_SS2) {
5013                                 DECODE_ADD_BINARY_CHAR(ISO_CODE_SS2, dst);
5014                                 flags &= ~CODING_STATE_SS2;
5015                         }
5016                         if (flags & CODING_STATE_SS3) {
5017                                 DECODE_ADD_BINARY_CHAR(ISO_CODE_SS3, dst);
5018                                 flags &= ~CODING_STATE_SS3;
5019                         }
5020
5021           /***** Now handle the control characters. *****/
5022
5023                         /* Handle CR/LF */
5024                         DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5025
5026                         flags &= CODING_STATE_ISO2022_LOCK;
5027
5028                         if (!parse_iso2022_esc
5029                             (coding_system, &str->iso2022, c, &flags, 1))
5030                                 DECODE_ADD_BINARY_CHAR(c, dst);
5031                 } else {        /* Graphic characters */
5032                         Lisp_Object charset;
5033                         int lb;
5034                         int reg;
5035
5036                         DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5037
5038                         /* Now determine the charset. */
5039                         reg = ((flags & CODING_STATE_SS2) ? 2
5040                                : (flags & CODING_STATE_SS3) ? 3
5041                                : !BYTE_ASCII_P(c) ? str->iso2022.register_right
5042                                : str->iso2022.register_left);
5043                         charset = str->iso2022.charset[reg];
5044
5045                         /* Error checking: */
5046                         if (!CHARSETP(charset)
5047                             || str->iso2022.invalid_designated[reg]
5048                             ||
5049                             (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5050                              && XCHARSET_CHARS(charset) == 94))
5051                                 /* Mrmph.  We are trying to invoke a register that has no
5052                                    or an invalid charset in it, or trying to add a character
5053                                    outside the range of the charset.  Insert that char literally
5054                                    to preserve it for the output. */
5055                         {
5056                                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5057                                 DECODE_ADD_BINARY_CHAR(c, dst);
5058                         }
5059
5060                         else {
5061                                 /* Things are probably hunky-dorey. */
5062
5063                                 /* Fetch reverse charset, maybe. */
5064                                 if (((flags & CODING_STATE_R2L) &&
5065                                      XCHARSET_DIRECTION(charset) ==
5066                                      CHARSET_LEFT_TO_RIGHT)
5067                                     || (!(flags & CODING_STATE_R2L)
5068                                         && XCHARSET_DIRECTION(charset) ==
5069                                         CHARSET_RIGHT_TO_LEFT)) {
5070                                         Lisp_Object new_charset =
5071                                             XCHARSET_REVERSE_DIRECTION_CHARSET
5072                                             (charset);
5073                                         if (!NILP(new_charset))
5074                                                 charset = new_charset;
5075                                 }
5076
5077                                 lb = XCHARSET_LEADING_BYTE(charset);
5078                                 switch (XCHARSET_REP_BYTES(charset)) {
5079                                 case 1: /* ASCII */
5080                                         DECODE_OUTPUT_PARTIAL_CHAR(ch);
5081                                         Dynarr_add(dst, c & 0x7F);
5082                                         break;
5083
5084                                 case 2: /* one-byte official */
5085                                         DECODE_OUTPUT_PARTIAL_CHAR(ch);
5086                                         Dynarr_add(dst, lb);
5087                                         Dynarr_add(dst, c | 0x80);
5088                                         break;
5089
5090                                 case 3: /* one-byte private or two-byte official */
5091                                         if (XCHARSET_PRIVATE_P(charset)) {
5092                                                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5093                                                 Dynarr_add(dst,
5094                                                            PRE_LEADING_BYTE_PRIVATE_1);
5095                                                 Dynarr_add(dst, lb);
5096                                                 Dynarr_add(dst, c | 0x80);
5097                                         } else {
5098                                                 if (ch) {
5099                                                         Dynarr_add(dst, lb);
5100                                                         Dynarr_add(dst,
5101                                                                    ch | 0x80);
5102                                                         Dynarr_add(dst,
5103                                                                    c | 0x80);
5104                                                         ch = 0;
5105                                                 } else
5106                                                         ch = c;
5107                                         }
5108                                         break;
5109
5110                                 default:        /* two-byte private */
5111                                         if (ch) {
5112                                                 Dynarr_add(dst,
5113                                                            PRE_LEADING_BYTE_PRIVATE_2);
5114                                                 Dynarr_add(dst, lb);
5115                                                 Dynarr_add(dst, ch | 0x80);
5116                                                 Dynarr_add(dst, c | 0x80);
5117                                                 ch = 0;
5118                                         } else
5119                                                 ch = c;
5120                                 }
5121                         }
5122
5123                         if (!ch)
5124                                 flags &= CODING_STATE_ISO2022_LOCK;
5125                 }
5126
5127               label_continue_loop:;
5128         }
5129
5130         if (flags & CODING_STATE_END)
5131                 DECODE_OUTPUT_PARTIAL_CHAR(ch);
5132
5133         str->flags = flags;
5134         str->ch = ch;
5135 }
5136
5137 /***** ISO2022 encoder *****/
5138
5139 /* Designate CHARSET into register REG. */
5140
5141 static void
5142 iso2022_designate(Lisp_Object charset, unsigned char reg,
5143                   encoding_stream_t str, unsigned_char_dynarr * dst)
5144 {
5145         static const char inter94[] = "()*+";
5146         static const char inter96[] = ",-./";
5147         unsigned int type;
5148         unsigned char final;
5149         Lisp_Object old_charset = str->iso2022.charset[reg];
5150
5151         str->iso2022.charset[reg] = charset;
5152         if (!CHARSETP(charset))
5153                 /* charset might be an initial nil or t. */
5154                 return;
5155         type = XCHARSET_TYPE(charset);
5156         final = XCHARSET_FINAL(charset);
5157         if (!str->iso2022.force_charset_on_output[reg] &&
5158             CHARSETP(old_charset) &&
5159             XCHARSET_TYPE(old_charset) == type &&
5160             XCHARSET_FINAL(old_charset) == final)
5161                 return;
5162
5163         str->iso2022.force_charset_on_output[reg] = 0;
5164
5165         {
5166                 charset_conversion_spec_dynarr *dyn =
5167                     str->codesys->iso2022.output_conv;
5168
5169                 if (dyn) {
5170                         int i;
5171
5172                         for (i = 0; i < Dynarr_length(dyn); i++) {
5173                                 struct charset_conversion_spec *spec =
5174                                     Dynarr_atp(dyn, i);
5175                                 if (EQ(charset, spec->from_charset))
5176                                         charset = spec->to_charset;
5177                         }
5178                 }
5179         }
5180
5181         Dynarr_add(dst, ISO_CODE_ESC);
5182         switch (type) {
5183         case CHARSET_TYPE_94:
5184                 Dynarr_add(dst, inter94[reg]);
5185                 break;
5186         case CHARSET_TYPE_96:
5187                 Dynarr_add(dst, inter96[reg]);
5188                 break;
5189         case CHARSET_TYPE_94X94:
5190                 Dynarr_add(dst, '$');
5191                 if (reg != 0 || !(CODING_SYSTEM_ISO2022_SHORT(str->codesys))
5192                     || final < '@' || final > 'B')
5193                         Dynarr_add(dst, inter94[reg]);
5194                 break;
5195         case CHARSET_TYPE_96X96:
5196                 Dynarr_add(dst, '$');
5197                 Dynarr_add(dst, inter96[reg]);
5198                 break;
5199         default:
5200                 break;
5201         }
5202         Dynarr_add(dst, final);
5203 }
5204
5205 static void
5206 ensure_normal_shift(encoding_stream_t str, unsigned_char_dynarr * dst)
5207 {
5208         if (str->iso2022.register_left != 0) {
5209                 Dynarr_add(dst, ISO_CODE_SI);
5210                 str->iso2022.register_left = 0;
5211         }
5212 }
5213
5214 static void
5215 ensure_shift_out(encoding_stream_t str, unsigned_char_dynarr * dst)
5216 {
5217         if (str->iso2022.register_left != 1) {
5218                 Dynarr_add(dst, ISO_CODE_SO);
5219                 str->iso2022.register_left = 1;
5220         }
5221 }
5222
5223 /* Convert internally-formatted data to ISO2022 format. */
5224
5225 static void
5226 encode_coding_iso2022(lstream_t encoding, const Bufbyte * src,
5227                       unsigned_char_dynarr * dst, Lstream_data_count n)
5228 {
5229         unsigned char charmask, c;
5230         unsigned char char_boundary;
5231         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
5232         unsigned int flags = str->flags;
5233         unsigned int ch = str->ch;
5234         Lisp_Coding_System *codesys = str->codesys;
5235         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
5236         int i;
5237         Lisp_Object charset;
5238         int half;
5239
5240 #ifdef ENABLE_COMPOSITE_CHARS
5241         /* flags for handling composite chars.  We do a little switcharoo
5242            on the source while we're outputting the composite char. */
5243         unsigned int saved_n = 0;
5244         const unsigned char *saved_src = NULL;
5245         int in_composite = 0;
5246 #endif                          /* ENABLE_COMPOSITE_CHARS */
5247
5248         char_boundary = str->iso2022.current_char_boundary;
5249         charset = str->iso2022.current_charset;
5250         half = str->iso2022.current_half;
5251
5252 #ifdef ENABLE_COMPOSITE_CHARS
5253       back_to_square_n:
5254 #endif
5255         while (n--) {
5256                 c = *src++;
5257
5258                 if (BYTE_ASCII_P(c)) {  /* Processing ASCII character */
5259                         ch = 0;
5260
5261                         restore_left_to_right_direction(codesys, dst, &flags,
5262                                                         0);
5263
5264                         /* Make sure G0 contains ASCII */
5265                         if ((c > ' ' && c < ISO_CODE_DEL) ||
5266                             !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL(codesys)) {
5267                                 ensure_normal_shift(str, dst);
5268                                 iso2022_designate(Vcharset_ascii, 0, str, dst);
5269                         }
5270
5271                         /* If necessary, restore everything to the default state
5272                            at end-of-line */
5273                         if (c == '\n' &&
5274                             !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL(codesys))) {
5275                                 restore_left_to_right_direction(codesys, dst,
5276                                                                 &flags, 0);
5277
5278                                 ensure_normal_shift(str, dst);
5279
5280                                 for (i = 0; i < 4; i++) {
5281                                         Lisp_Object initial_charset =
5282                                             CODING_SYSTEM_ISO2022_INITIAL_CHARSET
5283                                             (codesys, i);
5284                                         iso2022_designate(initial_charset, i,
5285                                                           str, dst);
5286                                 }
5287                         }
5288                         if (c == '\n') {
5289                                 if (eol_type != EOL_LF
5290                                     && eol_type != EOL_AUTODETECT)
5291                                         Dynarr_add(dst, '\r');
5292                                 if (eol_type != EOL_CR)
5293                                         Dynarr_add(dst, c);
5294                         } else {
5295                                 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
5296                                     && fit_to_be_escape_quoted(c))
5297                                         Dynarr_add(dst, ISO_CODE_ESC);
5298                                 Dynarr_add(dst, c);
5299                         }
5300                         char_boundary = 1;
5301                 }
5302
5303                 else if (BUFBYTE_LEADING_BYTE_P(c) || BUFBYTE_LEADING_BYTE_P(ch)) {     /* Processing Leading Byte */
5304                         ch = 0;
5305                         charset = CHARSET_BY_LEADING_BYTE(c);
5306                         if (LEADING_BYTE_PREFIX_P(c))
5307                                 ch = c;
5308                         else if (!EQ(charset, Vcharset_control_1)
5309 #ifdef ENABLE_COMPOSITE_CHARS
5310                                  && !EQ(charset, Vcharset_composite)
5311 #endif
5312                             ) {
5313                                 int reg;
5314
5315                                 ensure_correct_direction(XCHARSET_DIRECTION
5316                                                          (charset), codesys,
5317                                                          dst, &flags, 0);
5318
5319                                 /* Now determine which register to use. */
5320                                 reg = -1;
5321                                 for (i = 0; i < 4; i++) {
5322                                         if (EQ(charset, str->iso2022.charset[i])
5323                                             || EQ(charset,
5324                                                   CODING_SYSTEM_ISO2022_INITIAL_CHARSET
5325                                                   (codesys, i))) {
5326                                                 reg = i;
5327                                                 break;
5328                                         }
5329                                 }
5330
5331                                 if (reg == -1) {
5332                                         if (XCHARSET_GRAPHIC(charset) != 0) {
5333                                                 if (!NILP
5334                                                     (str->iso2022.charset[1])
5335                                                     &&
5336                                                     (!CODING_SYSTEM_ISO2022_SEVEN
5337                                                      (codesys)
5338                                                      ||
5339                                                      CODING_SYSTEM_ISO2022_LOCK_SHIFT
5340                                                      (codesys)))
5341                                                         reg = 1;
5342                                                 else if (!NILP
5343                                                          (str->iso2022.
5344                                                           charset[2]))
5345                                                         reg = 2;
5346                                                 else if (!NILP
5347                                                          (str->iso2022.
5348                                                           charset[3]))
5349                                                         reg = 3;
5350                                                 else
5351                                                         reg = 0;
5352                                         } else
5353                                                 reg = 0;
5354                                 }
5355
5356                                 iso2022_designate(charset, reg, str, dst);
5357
5358                                 /* Now invoke that register. */
5359                                 switch (reg) {
5360                                 case 0:
5361                                         ensure_normal_shift(str, dst);
5362                                         half = 0;
5363                                         break;
5364
5365                                 case 1:
5366                                         if (CODING_SYSTEM_ISO2022_SEVEN
5367                                             (codesys)) {
5368                                                 ensure_shift_out(str, dst);
5369                                                 half = 0;
5370                                         } else
5371                                                 half = 1;
5372                                         break;
5373
5374                                 case 2:
5375                                         if (CODING_SYSTEM_ISO2022_SEVEN
5376                                             (str->codesys)) {
5377                                                 Dynarr_add(dst, ISO_CODE_ESC);
5378                                                 Dynarr_add(dst, 'N');
5379                                                 half = 0;
5380                                         } else {
5381                                                 Dynarr_add(dst, ISO_CODE_SS2);
5382                                                 half = 1;
5383                                         }
5384                                         break;
5385
5386                                 case 3:
5387                                         if (CODING_SYSTEM_ISO2022_SEVEN
5388                                             (str->codesys)) {
5389                                                 Dynarr_add(dst, ISO_CODE_ESC);
5390                                                 Dynarr_add(dst, 'O');
5391                                                 half = 0;
5392                                         } else {
5393                                                 Dynarr_add(dst, ISO_CODE_SS3);
5394                                                 half = 1;
5395                                         }
5396                                         break;
5397
5398                                 default:
5399                                         abort();
5400                                 }
5401                         }
5402                         char_boundary = 0;
5403                 } else {        /* Processing Non-ASCII character */
5404                         charmask = (half == 0 ? 0x7F : 0xFF);
5405                         char_boundary = 1;
5406                         if (EQ(charset, Vcharset_control_1)) {
5407                                 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)
5408                                     && fit_to_be_escape_quoted(c))
5409                                         Dynarr_add(dst, ISO_CODE_ESC);
5410                                 /* you asked for it ... */
5411                                 Dynarr_add(dst, c - 0x20);
5412                         } else {
5413                                 switch (XCHARSET_REP_BYTES(charset)) {
5414                                 case 2:
5415                                         Dynarr_add(dst, c & charmask);
5416                                         break;
5417                                 case 3:
5418                                         if (XCHARSET_PRIVATE_P(charset)) {
5419                                                 Dynarr_add(dst, c & charmask);
5420                                                 ch = 0;
5421                                         } else if (ch) {
5422 #ifdef ENABLE_COMPOSITE_CHARS
5423                                                 if (EQ
5424                                                     (charset,
5425                                                      Vcharset_composite)) {
5426                                                         if (in_composite) {
5427                                                                 /* #### Bother! We don't know how to
5428                                                                    handle this yet. */
5429                                                                 Dynarr_add(dst,
5430                                                                            '~');
5431                                                         } else {
5432                                                                 Emchar emch =
5433                                                                     MAKE_CHAR
5434                                                                     (Vcharset_composite,
5435                                                                      ch & 0x7F,
5436                                                                      c & 0x7F);
5437                                                                 Lisp_Object lstr
5438                                                                     =
5439                                                                     composite_char_string
5440                                                                     (emch);
5441                                                                 saved_n = n;
5442                                                                 saved_src = src;
5443                                                                 in_composite =
5444                                                                     1;
5445                                                                 src =
5446                                                                     XSTRING_DATA
5447                                                                     (lstr);
5448                                                                 n = XSTRING_LENGTH(lstr);
5449                                                                 Dynarr_add(dst,
5450                                                                            ISO_CODE_ESC);
5451                                                                 Dynarr_add(dst, '0');   /* start composing */
5452                                                         }
5453                                                 } else
5454 #endif                          /* ENABLE_COMPOSITE_CHARS */
5455                                                 {
5456                                                         Dynarr_add(dst,
5457                                                                    ch &
5458                                                                    charmask);
5459                                                         Dynarr_add(dst,
5460                                                                    c &
5461                                                                    charmask);
5462                                                 }
5463                                                 ch = 0;
5464                                         } else {
5465                                                 ch = c;
5466                                                 char_boundary = 0;
5467                                         }
5468                                         break;
5469                                 case 4:
5470                                         if (ch) {
5471                                                 Dynarr_add(dst, ch & charmask);
5472                                                 Dynarr_add(dst, c & charmask);
5473                                                 ch = 0;
5474                                         } else {
5475                                                 ch = c;
5476                                                 char_boundary = 0;
5477                                         }
5478                                         break;
5479                                 default:
5480                                         abort();
5481                                 }
5482                         }
5483                 }
5484         }
5485
5486 #ifdef ENABLE_COMPOSITE_CHARS
5487         if (in_composite) {
5488                 n = saved_n;
5489                 src = saved_src;
5490                 in_composite = 0;
5491                 Dynarr_add(dst, ISO_CODE_ESC);
5492                 Dynarr_add(dst, '1');   /* end composing */
5493                 goto back_to_square_n;  /* Wheeeeeeeee ..... */
5494         }
5495 #endif                          /* ENABLE_COMPOSITE_CHARS */
5496
5497         if (char_boundary && flags & CODING_STATE_END) {
5498                 restore_left_to_right_direction(codesys, dst, &flags, 0);
5499                 ensure_normal_shift(str, dst);
5500                 for (i = 0; i < 4; i++) {
5501                         Lisp_Object initial_charset =
5502                             CODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, i);
5503                         iso2022_designate(initial_charset, i, str, dst);
5504                 }
5505         }
5506
5507         str->flags = flags;
5508         str->ch = ch;
5509         str->iso2022.current_char_boundary = char_boundary;
5510         str->iso2022.current_charset = charset;
5511         str->iso2022.current_half = half;
5512
5513         /* Verbum caro factum est! */
5514 }
5515 #endif                          /* MULE */
5516 \f
5517 /************************************************************************/
5518 /*                     No-conversion methods                            */
5519 /************************************************************************/
5520
5521 /* This is used when reading in "binary" files -- i.e. files that may
5522    contain all 256 possible byte values and that are not to be
5523    interpreted as being in any particular decoding. */
5524 static void
5525 decode_coding_no_conversion(lstream_t decoding, const Extbyte * src,
5526                             unsigned_char_dynarr * dst, Lstream_data_count n)
5527 {
5528         decoding_stream_t str = DECODING_STREAM_DATA(decoding);
5529         unsigned int flags = str->flags;
5530         unsigned int ch = str->ch;
5531         eol_type_t eol_type = str->eol_type;
5532
5533         while (n--) {
5534                 const unsigned char c = *(const unsigned char *)src++;
5535
5536                 DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst);
5537                 DECODE_ADD_BINARY_CHAR(c, dst);
5538         label_continue_loop:;
5539         }
5540
5541         DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst);
5542
5543         str->flags = flags;
5544         str->ch = ch;
5545 }
5546
5547 static void
5548 encode_coding_no_conversion(lstream_t encoding, const Bufbyte * src,
5549                             unsigned_char_dynarr * dst, Lstream_data_count n)
5550 {
5551         unsigned char c;
5552         encoding_stream_t str = ENCODING_STREAM_DATA(encoding);
5553         unsigned int flags = str->flags;
5554         unsigned int ch = str->ch;
5555         eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE(str->codesys);
5556
5557         while (n--) {
5558                 c = *src++;
5559                 if (c == '\n') {
5560                         if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5561                                 Dynarr_add(dst, '\r');
5562                         if (eol_type != EOL_CR)
5563                                 Dynarr_add(dst, '\n');
5564                         ch = 0;
5565                 } else if (BYTE_ASCII_P(c)) {
5566                         assert(ch == 0);
5567                         Dynarr_add(dst, c);
5568                 } else if (BUFBYTE_LEADING_BYTE_P(c)) {
5569                         assert(ch == 0);
5570                         if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5571                             c == LEADING_BYTE_CONTROL_1)
5572                                 ch = c;
5573                         else
5574                                 Dynarr_add(dst, '~');   /* untranslatable character */
5575                 } else {
5576                         if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5577                                 Dynarr_add(dst, c);
5578                         else if (ch == LEADING_BYTE_CONTROL_1) {
5579                                 assert(c < 0xC0);
5580                                 Dynarr_add(dst, c - 0x20);
5581                         }
5582                         /* else it should be the second or third byte of an
5583                            untranslatable character, so ignore it */
5584                         ch = 0;
5585                 }
5586         }
5587
5588         str->flags = flags;
5589         str->ch = ch;
5590 }
5591 \f
5592 /************************************************************************/
5593 /*                             Initialization                           */
5594 /************************************************************************/
5595
5596 void syms_of_file_coding(void)
5597 {
5598         INIT_LRECORD_IMPLEMENTATION(coding_system);
5599
5600         DEFERROR_STANDARD(Qcoding_system_error, Qio_error);
5601
5602         DEFSUBR(Fcoding_system_p);
5603         DEFSUBR(Ffind_coding_system);
5604         DEFSUBR(Fget_coding_system);
5605         DEFSUBR(Fcoding_system_list);
5606         DEFSUBR(Fcoding_system_name);
5607         DEFSUBR(Fmake_coding_system);
5608         DEFSUBR(Fcopy_coding_system);
5609         DEFSUBR(Fcoding_system_canonical_name_p);
5610         DEFSUBR(Fcoding_system_alias_p);
5611         DEFSUBR(Fcoding_system_aliasee);
5612         DEFSUBR(Fdefine_coding_system_alias);
5613         DEFSUBR(Fsubsidiary_coding_system);
5614
5615         DEFSUBR(Fcoding_system_type);
5616         DEFSUBR(Fcoding_system_doc_string);
5617 #ifdef MULE
5618         DEFSUBR(Fcoding_system_charset);
5619 #endif
5620         DEFSUBR(Fcoding_system_property);
5621
5622         DEFSUBR(Fcoding_category_list);
5623         DEFSUBR(Fset_coding_priority_list);
5624         DEFSUBR(Fcoding_priority_list);
5625         DEFSUBR(Fset_coding_category_system);
5626         DEFSUBR(Fcoding_category_system);
5627
5628         DEFSUBR(Fdetect_coding_region);
5629         DEFSUBR(Fdecode_coding_region);
5630         DEFSUBR(Fencode_coding_region);
5631 #ifdef MULE
5632         DEFSUBR(Fdecode_shift_jis_char);
5633         DEFSUBR(Fencode_shift_jis_char);
5634         DEFSUBR(Fdecode_big5_char);
5635         DEFSUBR(Fencode_big5_char);
5636         DEFSUBR(Fset_ucs_char);
5637         DEFSUBR(Fucs_char);
5638         DEFSUBR(Fset_char_ucs);
5639         DEFSUBR(Fchar_ucs);
5640 #endif                          /* MULE */
5641         defsymbol(&Qcoding_systemp, "coding-system-p");
5642         defsymbol(&Qno_conversion, "no-conversion");
5643         defsymbol(&Qraw_text, "raw-text");
5644 #ifdef MULE
5645         defsymbol(&Qbig5, "big5");
5646         defsymbol(&Qshift_jis, "shift-jis");
5647         defsymbol(&Qucs4, "ucs-4");
5648         defsymbol(&Qutf8, "utf-8");
5649         defsymbol(&Qccl, "ccl");
5650         defsymbol(&Qiso2022, "iso2022");
5651 #endif                          /* MULE */
5652         defsymbol(&Qmnemonic, "mnemonic");
5653         defsymbol(&Qeol_type, "eol-type");
5654         defsymbol(&Qpost_read_conversion, "post-read-conversion");
5655         defsymbol(&Qpre_write_conversion, "pre-write-conversion");
5656
5657         defsymbol(&Qcr, "cr");
5658         defsymbol(&Qlf, "lf");
5659         defsymbol(&Qcrlf, "crlf");
5660         defsymbol(&Qeol_cr, "eol-cr");
5661         defsymbol(&Qeol_lf, "eol-lf");
5662         defsymbol(&Qeol_crlf, "eol-crlf");
5663 #ifdef MULE
5664         defsymbol(&Qcharset_g0, "charset-g0");
5665         defsymbol(&Qcharset_g1, "charset-g1");
5666         defsymbol(&Qcharset_g2, "charset-g2");
5667         defsymbol(&Qcharset_g3, "charset-g3");
5668         defsymbol(&Qforce_g0_on_output, "force-g0-on-output");
5669         defsymbol(&Qforce_g1_on_output, "force-g1-on-output");
5670         defsymbol(&Qforce_g2_on_output, "force-g2-on-output");
5671         defsymbol(&Qforce_g3_on_output, "force-g3-on-output");
5672         defsymbol(&Qno_iso6429, "no-iso6429");
5673         defsymbol(&Qinput_charset_conversion, "input-charset-conversion");
5674         defsymbol(&Qoutput_charset_conversion, "output-charset-conversion");
5675
5676         defsymbol(&Qno_ascii_eol, "no-ascii-eol");
5677         defsymbol(&Qno_ascii_cntl, "no-ascii-cntl");
5678         defsymbol(&Qseven, "seven");
5679         defsymbol(&Qlock_shift, "lock-shift");
5680         defsymbol(&Qescape_quoted, "escape-quoted");
5681 #endif                          /* MULE */
5682         defsymbol(&Qencode, "encode");
5683         defsymbol(&Qdecode, "decode");
5684
5685 #ifdef MULE
5686         defsymbol(&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5687                   "shift-jis");
5688         defsymbol(&coding_category_symbol[CODING_CATEGORY_BIG5], "big5");
5689         defsymbol(&coding_category_symbol[CODING_CATEGORY_UCS4], "ucs-4");
5690         defsymbol(&coding_category_symbol[CODING_CATEGORY_UTF8], "utf-8");
5691         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_7], "iso-7");
5692         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5693                   "iso-8-designate");
5694         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_1], "iso-8-1");
5695         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_8_2], "iso-8-2");
5696         defsymbol(&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5697                   "iso-lock-shift");
5698 #endif                          /* MULE */
5699         defsymbol(&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5700                   "no-conversion");
5701 }
5702
5703 void lstream_type_create_file_coding(void)
5704 {
5705         LSTREAM_HAS_METHOD(decoding, reader);
5706         LSTREAM_HAS_METHOD(decoding, writer);
5707         LSTREAM_HAS_METHOD(decoding, rewinder);
5708         LSTREAM_HAS_METHOD(decoding, seekable_p);
5709         LSTREAM_HAS_METHOD(decoding, flusher);
5710         LSTREAM_HAS_METHOD(decoding, closer);
5711         LSTREAM_HAS_METHOD(decoding, marker);
5712
5713         LSTREAM_HAS_METHOD(encoding, reader);
5714         LSTREAM_HAS_METHOD(encoding, writer);
5715         LSTREAM_HAS_METHOD(encoding, rewinder);
5716         LSTREAM_HAS_METHOD(encoding, seekable_p);
5717         LSTREAM_HAS_METHOD(encoding, flusher);
5718         LSTREAM_HAS_METHOD(encoding, closer);
5719         LSTREAM_HAS_METHOD(encoding, marker);
5720 }
5721
5722 void vars_of_file_coding(void)
5723 {
5724         int i;
5725
5726         fcd = xnew(struct file_coding_dump);
5727         dump_add_root_struct_ptr(&fcd, &fcd_description);
5728
5729         /* Initialize to something reasonable ... */
5730         for (i = 0; i < CODING_CATEGORY_LAST; i++) {
5731                 fcd->coding_category_system[i] = Qnil;
5732                 fcd->coding_category_by_priority[i] = i;
5733         }
5734
5735         Fprovide(intern("file-coding"));
5736
5737         DEFVAR_LISP("keyboard-coding-system", &Vkeyboard_coding_system  /*
5738 Coding system used for TTY keyboard input.
5739 Not used under a windowing system.
5740                                                                          */ );
5741         Vkeyboard_coding_system = Qnil;
5742
5743         DEFVAR_LISP("terminal-coding-system", &Vterminal_coding_system  /*
5744 Coding system used for TTY display output.
5745 Not used under a windowing system.
5746                                                                          */ );
5747         Vterminal_coding_system = Qnil;
5748
5749         DEFVAR_LISP("coding-system-for-read", &Vcoding_system_for_read  /*
5750 Overriding coding system used when reading from a file or process.
5751 You should bind this variable with `let', but do not set it globally.
5752 If this is non-nil, it specifies the coding system that will be used
5753 to decode input on read operations, such as from a file or process.
5754 It overrides `buffer-file-coding-system-for-read',
5755 `insert-file-contents-pre-hook', etc.  Use those variables instead of
5756 this one for permanent changes to the environment.  */ );
5757         Vcoding_system_for_read = Qnil;
5758
5759         DEFVAR_LISP("coding-system-for-write", &Vcoding_system_for_write        /*
5760 Overriding coding system used when writing to a file or process.
5761 You should bind this variable with `let', but do not set it globally.
5762 If this is non-nil, it specifies the coding system that will be used
5763 to encode output for write operations, such as to a file or process.
5764 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
5765 Use those variables instead of this one for permanent changes to the
5766 environment.  */ );
5767         Vcoding_system_for_write = Qnil;
5768
5769         DEFVAR_LISP("file-name-coding-system", &Vfile_name_coding_system        /*
5770 Coding system used to convert pathnames when accessing files.
5771                                                                                  */ );
5772         Vfile_name_coding_system = Qnil;
5773
5774         DEFVAR_BOOL("enable-multibyte-characters", &enable_multibyte_characters /*
5775 Non-nil means the buffer contents are regarded as multi-byte form
5776 of characters, not a binary code.  This affects the display, file I/O,
5777 and behaviors of various editing commands.
5778
5779 Setting this to nil does not do anything.
5780                                                                                  */ );
5781         enable_multibyte_characters = 1;
5782 }
5783
5784 void complex_vars_of_file_coding(void)
5785 {
5786         staticpro(&Vcoding_system_hash_table);
5787         Vcoding_system_hash_table =
5788             make_lisp_hash_table(50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5789
5790         the_codesys_prop_dynarr = Dynarr_new(codesys_prop);
5791         dump_add_root_struct_ptr(&the_codesys_prop_dynarr,
5792                                  &codesys_prop_dynarr_description);
5793
5794 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
5795 {                                               \
5796   struct codesys_prop csp;                      \
5797   csp.sym = (Sym);                              \
5798   csp.prop_type = (Prop_Type);                  \
5799   Dynarr_add (the_codesys_prop_dynarr, csp);    \
5800 } while (0)
5801
5802         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qmnemonic);
5803         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_type);
5804         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_cr);
5805         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_crlf);
5806         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qeol_lf);
5807         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5808         DEFINE_CODESYS_PROP(CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5809 #ifdef MULE
5810         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g0);
5811         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g1);
5812         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g2);
5813         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qcharset_g3);
5814         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5815         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5816         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5817         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5818         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qshort);
5819         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_ascii_eol);
5820         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5821         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qseven);
5822         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qlock_shift);
5823         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qno_iso6429);
5824         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qescape_quoted);
5825         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5826         DEFINE_CODESYS_PROP(CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5827
5828         DEFINE_CODESYS_PROP(CODESYS_PROP_CCL, Qencode);
5829         DEFINE_CODESYS_PROP(CODESYS_PROP_CCL, Qdecode);
5830 #endif                          /* MULE */
5831         /* Need to create this here or we're really screwed. */
5832         Fmake_coding_system
5833             (Qraw_text, Qno_conversion,
5834              build_string
5835              ("Raw text, which means it converts only line-break-codes."),
5836              list2(Qmnemonic, build_string("Raw")));
5837
5838         Fmake_coding_system
5839             (Qbinary, Qno_conversion,
5840              build_string("Binary, which means it does not convert anything."),
5841              list4(Qeol_type, Qlf, Qmnemonic, build_string("Binary")));
5842
5843         Fdefine_coding_system_alias(Qno_conversion, Qraw_text);
5844
5845         Fdefine_coding_system_alias(Qfile_name, Qbinary);
5846
5847         Fdefine_coding_system_alias(Qterminal, Qbinary);
5848         Fdefine_coding_system_alias(Qkeyboard, Qbinary);
5849
5850         /* Need this for bootstrapping */
5851         fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5852             Fget_coding_system(Qraw_text);
5853
5854 #ifdef MULE
5855         {
5856                 size_t i;
5857
5858                 for (i = 0; i < countof(fcd->ucs_to_mule_table); i++)
5859                         fcd->ucs_to_mule_table[i] = Qnil;
5860         }
5861         staticpro(&mule_to_ucs_table);
5862         mule_to_ucs_table = Fmake_char_table(Qgeneric);
5863 #endif                          /* MULE */
5864 }