checking in all the old panacean stuff

2016-07-25 15:42:39 -04:00
parent c996cdd81f
commit 8fd9e44ae5
1210 changed files with 220657 additions and 0 deletions
--- a/puttysrc/CHARSET/CHARSET.H
+++ b/puttysrc/CHARSET/CHARSET.H
@@ -0,0 +1,154 @@
+/*
+ * charset.h - header file for general character set conversion
+ * routines.
+ */
+
+#ifndef charset_charset_h
+#define charset_charset_h
+
+#include <stddef.h>
+
+/*
+ * Enumeration that lists all the multibyte or single-byte
+ * character sets known to this library.
+ *
+ * CS_NONE comes first, so it has the value 0; the name and Mac
+ * encoding lookup routines return it when nothing matches. Note
+ * that the functions in this header take and return these
+ * identifiers as plain `int' rather than as charset_t.
+ */
+typedef enum {
+    CS_NONE,			       /* used for reporting errors, etc */
+    CS_ISO8859_1,
+    CS_ISO8859_1_X11,		       /* X font encoding with VT100 glyphs */
+    CS_ISO8859_2,
+    CS_ISO8859_3,
+    CS_ISO8859_4,
+    CS_ISO8859_5,
+    CS_ISO8859_6,
+    CS_ISO8859_7,
+    CS_ISO8859_8,
+    CS_ISO8859_9,
+    CS_ISO8859_10,
+    CS_ISO8859_11,
+    CS_ISO8859_13,
+    CS_ISO8859_14,
+    CS_ISO8859_15,
+    CS_ISO8859_16,
+    CS_CP437,
+    CS_CP850,
+    CS_CP866,
+    CS_CP1250,
+    CS_CP1251,
+    CS_CP1252,
+    CS_CP1253,
+    CS_CP1254,
+    CS_CP1255,
+    CS_CP1256,
+    CS_CP1257,
+    CS_CP1258,
+    CS_KOI8_R,
+    CS_KOI8_U,
+    CS_MAC_ROMAN,
+    CS_MAC_TURKISH,
+    CS_MAC_CROATIAN,
+    CS_MAC_ICELAND,
+    CS_MAC_ROMANIAN,
+    CS_MAC_GREEK,
+    CS_MAC_CYRILLIC,
+    CS_MAC_THAI,
+    CS_MAC_CENTEURO,
+    CS_MAC_SYMBOL,
+    CS_MAC_DINGBATS,
+    CS_MAC_ROMAN_OLD,
+    CS_MAC_CROATIAN_OLD,
+    CS_MAC_ICELAND_OLD,
+    CS_MAC_ROMANIAN_OLD,
+    CS_MAC_GREEK_OLD,
+    CS_MAC_CYRILLIC_OLD,
+    CS_MAC_UKRAINE,
+    CS_MAC_VT100,
+    CS_MAC_VT100_OLD,
+    CS_VISCII,
+    CS_HP_ROMAN8,
+    CS_DEC_MCS,
+    CS_UTF8
+} charset_t;
+
+typedef struct {
+    unsigned long s0;   /* state word carried between conversion calls; fromucs.c starts from 0 when no state is supplied */
+} charset_state;
+
+/*
+ * Routine to convert a MB/SB character set to Unicode.
+ * 
+ * This routine accepts some number of bytes, updates a state
+ * variable, and outputs some number of Unicode characters. There
+ * are no guarantees. You can't even guarantee that at most one
+ * Unicode character will be output per byte you feed in; for
+ * example, suppose you're reading UTF-8, you've seen E1 80, and
+ * then you suddenly see FE. Now you need to output _two_ error
+ * characters - one for the incomplete sequence E1 80, and one for
+ * the completely invalid UTF-8 byte FE.
+ * 
+ * Returns the number of wide characters output; will never output
+ * more than the size of the buffer (as specified on input).
+ * Advances the `input' pointer and decrements `inlen', to indicate
+ * how far along the input string it got.
+ * 
+ * The sequence of `errlen' wide characters pointed to by `errstr'
+ * will be used to indicate a conversion error. If `errstr' is
+ * NULL, `errlen' will be ignored, and the library will choose
+ * something sensible to do on its own. For Unicode, this will be
+ * U+FFFD (REPLACEMENT CHARACTER).
+ */
+
+int charset_to_unicode(char **input, int *inlen, wchar_t *output, int outlen,
+		       int charset, charset_state *state,
+		       const wchar_t *errstr, int errlen);
+
+/*
+ * Routine to convert Unicode to an MB/SB character set.
+ * 
+ * This routine accepts some number of Unicode characters, updates
+ * a state variable, and outputs some number of bytes.
+ * 
+ * Returns the number of bytes output; will never output
+ * more than the size of the buffer (as specified on input), and
+ * will never output a partial MB character. Advances the `input'
+ * pointer and decrements `inlen', to indicate how far along the
+ * input string it got.
+ * 
+ * The sequence of `errlen' characters pointed to by `errstr' will
+ * be used to indicate a conversion error. If `errstr' is NULL,
+ * `errlen' will be ignored, and the library will choose something
+ * sensible to do on its own (which will vary depending on the
+ * output charset).
+ */
+
+int charset_from_unicode(wchar_t **input, int *inlen, char *output, int outlen,
+			 int charset, charset_state *state,
+			 const char *errstr, int errlen);
+
+/*
+ * Convert X11 encoding names to and from our charset identifiers.
+ */
+const char *charset_to_xenc(int charset);
+int charset_from_xenc(const char *name);
+
+/*
+ * Convert MIME encoding names to and from our charset identifiers.
+ */
+const char *charset_to_mimeenc(int charset);
+int charset_from_mimeenc(const char *name);
+
+/*
+ * Convert our own encoding names to and from our charset
+ * identifiers.
+ */
+const char *charset_to_localenc(int charset);
+int charset_from_localenc(const char *name);
+int charset_localenc_nth(int n);
+
+/*
+ * Convert Mac OS script/region/font to our charset identifiers.
+ */
+int charset_from_macenc(int script, int region, int sysvers,
+			const char *fontname);
+
+#endif /* charset_charset_h */
--- a/puttysrc/CHARSET/ENUM.C
+++ b/puttysrc/CHARSET/ENUM.C
@@ -0,0 +1,19 @@
+/*
+ * enum.c - enumerate all charsets defined by the library.
+ * 
+ * This file maintains a list of every other source file which
+ * contains ENUM_CHARSET definitions. It #includes each one with
+ * ENUM_CHARSETS defined, which causes those source files to do
+ * nothing at all except call the ENUM_CHARSET macro on each
+ * charset they define.
+ * 
+ * This file in turn is included from various other places, with
+ * the ENUM_CHARSET macro defined to various different things. This
+ * allows us to have multiple implementations of the master charset
+ * lookup table (a static one and a dynamic one).
+ */
+
+#define ENUM_CHARSETS
+#include "sbcsdat.c"
+#include "utf8.c"
+#undef ENUM_CHARSETS
--- a/puttysrc/CHARSET/FROMUCS.C
+++ b/puttysrc/CHARSET/FROMUCS.C
@@ -0,0 +1,91 @@
+/*
+ * fromucs.c - convert Unicode to other character sets.
+ */
+
+#include "charset.h"
+#include "internal.h"
+
+struct charset_emit_param {
+    char *output;          /* next free byte in the caller's output buffer */
+    int outlen;            /* bytes of space remaining at `output' */
+    const char *errstr;    /* bytes written when ERROR is emitted */
+    int errlen;            /* number of bytes at errstr */
+    int stopped;           /* set to 1 once an emit did not fit */
+};
+
+/*
+ * Emit callback handed to the charset's write function. Appends
+ * either the single byte `output', or the configured error string
+ * if `output' is ERROR, to the buffer described by ctx. A sequence
+ * that does not fit is not written at all; `stopped' is set instead.
+ */
+static void charset_emit(void *ctx, long int output)
+{
+    struct charset_emit_param *ep = ctx;
+    char single;
+    char const *src;
+    int need;
+
+    /* Decide what to copy: one converted byte, or the error string. */
+    if (output != ERROR) {
+	single = output;
+	src = &single;
+	need = 1;
+    } else {
+	src = ep->errstr;
+	need = ep->errlen;
+    }
+
+    /* All or nothing: refuse the whole sequence if it will not fit. */
+    if (ep->outlen < need) {
+	ep->stopped = 1;
+	return;
+    }
+
+    for (; need > 0; need--) {
+	*ep->output++ = *src++;
+	ep->outlen--;
+    }
+}
+
+/*
+ * Convert Unicode to the given charset; see charset.h for the full
+ * contract.
+ *
+ * Each input character is passed to the charset's write function,
+ * which calls charset_emit for every output byte. If any of those
+ * bytes does not fit in `output', we return the length as it was
+ * before that character, and leave *input, *inlen and *state as
+ * they were after the previous character.
+ *
+ * NOTE(review): the result of charset_find_spec is used unchecked,
+ * so passing a known `charset' is presumably the caller's job.
+ */
+int charset_from_unicode(wchar_t **input, int *inlen, char *output, int outlen,
+			 int charset, charset_state *state,
+			 const char *errstr, int errlen)
+{
+    charset_spec const *spec = charset_find_spec(charset);
+    charset_state localstate;
+    struct charset_emit_param param;
+
+    param.output = output;
+    param.outlen = outlen;
+    param.stopped = 0;
+
+    /*
+     * charset_emit will expect a valid errstr, so supply a default
+     * when the caller gave none. The caller's errlen is ignored in
+     * that case, as charset.h promises.
+     */
+    if (errstr) {
+	param.errstr = errstr;
+	param.errlen = errlen;
+    } else {
+	/* *shrug* this is good enough, and consistent across all SBCS... */
+	param.errstr = ".";
+	param.errlen = 1;
+    }
+
+    /*
+     * Convert using a private copy of the state, and copy it back to
+     * the caller only once a character has been written in full.
+     * `state' must keep pointing at the caller's object for that, so
+     * it is deliberately not redirected to localstate.
+     */
+    if (state)
+	localstate = *state;	       /* structure copy */
+    else
+	localstate.s0 = 0;
+
+    while (*inlen > 0) {
+	int lenbefore = param.output - output;
+	spec->write(spec, **input, &localstate, charset_emit, &param);
+	if (param.stopped) {
+	    /*
+	     * The emit function has _tried_ to output some
+	     * characters, but ran up against the end of the
+	     * buffer. Leave immediately, and return what happened
+	     * _before_ attempting to process this character.
+	     */
+	    return lenbefore;
+	}
+	if (state)
+	    *state = localstate;       /* structure copy */
+	(*input)++;
+	(*inlen)--;
+    }
+    return param.output - output;
+}
--- a/puttysrc/CHARSET/INTERNAL.H
+++ b/puttysrc/CHARSET/INTERNAL.H
@@ -0,0 +1,89 @@
+/*
+ * internal.h - internal header stuff for the charset library.
+ */
+
+#ifndef charset_internal_h
+#define charset_internal_h
+
+/* This invariably comes in handy */
+#define lenof(x) ( sizeof((x)) / sizeof(*(x)) )
+
+/* This is an invalid Unicode value used to indicate an error. */
+#define ERROR 0xFFFFL		       /* Unicode value representing error */
+
+typedef struct charset_spec charset_spec;
+typedef struct sbcs_data sbcs_data;
+
+struct charset_spec {
+    int charset;		       /* numeric identifier */
+
+    /*
+     * A function to read the character set and output Unicode
+     * characters. The `emit' function expects to get Unicode chars
+     * passed to it; it should be sent ERROR for any encoding error
+     * on the input. `emitctx' is handed back to `emit' unchanged as
+     * its ctx argument.
+     */
+    void (*read)(charset_spec const *charset, long int input_chr,
+		 charset_state *state,
+		 void (*emit)(void *ctx, long int output), void *emitctx);
+    /*
+     * A function to read Unicode characters and output in this
+     * character set. The `emit' function expects to get byte
+     * values passed to it; it should be sent ERROR for any
+     * non-representable characters on the input.
+     */
+    void (*write)(charset_spec const *charset, long int input_chr,
+		  charset_state *state,
+		  void (*emit)(void *ctx, long int output), void *emitctx);
+    void const *data;   /* charset-specific data; read_sbcs/write_sbcs read it as a struct sbcs_data */
+};
+
+/*
+ * This is the format of `data' used by the SBCS read and write
+ * functions; so it's the format used in all SBCS definitions.
+ */
+struct sbcs_data {
+    /*
+     * This is a simple mapping table converting each SBCS position
+     * to a Unicode code point. Some positions may contain ERROR,
+     * indicating that that byte value is not defined in the SBCS
+     * in question and its occurrence in input is an error.
+     */
+    unsigned long sbcs2ucs[256];
+
+    /*
+     * This lookup table is used to convert Unicode back to the
+     * SBCS. It consists of the valid byte values in the SBCS,
+     * sorted in order of their Unicode translation. So given a
+     * Unicode value U, you can do a binary search on this table
+     * using the above table as a lookup: when testing the Xth
+     * position in this table, you branch according to whether
+     * sbcs2ucs[ucs2sbcs[X]] is less than, greater than, or equal
+     * to U.
+     * 
+     * Note that since there may be fewer than 256 valid byte
+     * values in a particular SBCS, we must supply the length of
+     * this table as well as the contents.
+     */
+    unsigned char ucs2sbcs[256];
+    int nvalid;   /* number of leading entries of ucs2sbcs[] in use; upper bound of write_sbcs's search */
+};
+
+/*
+ * Prototypes for internal library functions.
+ */
+charset_spec const *charset_find_spec(int charset);
+void read_sbcs(charset_spec const *charset, long int input_chr,
+	       charset_state *state,
+	       void (*emit)(void *ctx, long int output), void *emitctx);
+void write_sbcs(charset_spec const *charset, long int input_chr,
+		charset_state *state,
+		void (*emit)(void *ctx, long int output), void *emitctx);
+
+/*
+ * Placate compiler warning about unused parameters, of which we
+ * expect to have some in this library. A cast to void marks the
+ * argument as used without storing to it, so it also works for
+ * const-qualified parameters and does not provoke self-assignment
+ * warnings.
+ */
+#define UNUSEDARG(x) ( (void)(x) )
+
+#endif /* charset_internal_h */
--- a/puttysrc/CHARSET/LOCALENC.C
+++ b/puttysrc/CHARSET/LOCALENC.C
@@ -0,0 +1,125 @@
+/*
+ * localenc.c - translate our internal character set codes to and from
+ * our own set of plausibly legible character-set names. Also
+ * provides a canonical name for each encoding (useful for software
+ * announcing what character set it will be using), and a set of
+ * enumeration functions which return a list of supported
+ * encodings one by one.
+ * 
+ * charset_from_localenc will attempt all other text translations
+ * as well as this table, to maximise the number of different ways
+ * you can select a supported charset.
+ */
+
+#include <ctype.h>
+#include "charset.h"
+#include "internal.h"
+
+static const struct {
+    const char *name;     /* our own name for the charset; compared case-insensitively on lookup */
+    int charset;          /* the CS_* identifier the name stands for */
+    int return_in_enum;   /* enumeration misses some charsets */
+} localencs[] = {
+    { "<UNKNOWN>", CS_NONE, 0 },
+    { "ISO-8859-1", CS_ISO8859_1, 1 },
+    { "ISO-8859-1 with X11 line drawing", CS_ISO8859_1_X11, 0 },
+    { "ISO-8859-2", CS_ISO8859_2, 1 },
+    { "ISO-8859-3", CS_ISO8859_3, 1 },
+    { "ISO-8859-4", CS_ISO8859_4, 1 },
+    { "ISO-8859-5", CS_ISO8859_5, 1 },
+    { "ISO-8859-6", CS_ISO8859_6, 1 },
+    { "ISO-8859-7", CS_ISO8859_7, 1 },
+    { "ISO-8859-8", CS_ISO8859_8, 1 },
+    { "ISO-8859-9", CS_ISO8859_9, 1 },
+    { "ISO-8859-10", CS_ISO8859_10, 1 },
+    { "ISO-8859-11", CS_ISO8859_11, 1 },
+    { "ISO-8859-13", CS_ISO8859_13, 1 },
+    { "ISO-8859-14", CS_ISO8859_14, 1 },
+    { "ISO-8859-15", CS_ISO8859_15, 1 },
+    { "ISO-8859-16", CS_ISO8859_16, 1 },
+    { "CP437", CS_CP437, 1 },
+    { "CP850", CS_CP850, 1 },
+    { "CP866", CS_CP866, 1 },
+    { "CP1250", CS_CP1250, 1 },
+    { "CP1251", CS_CP1251, 1 },
+    { "CP1252", CS_CP1252, 1 },
+    { "CP1253", CS_CP1253, 1 },
+    { "CP1254", CS_CP1254, 1 },
+    { "CP1255", CS_CP1255, 1 },
+    { "CP1256", CS_CP1256, 1 },
+    { "CP1257", CS_CP1257, 1 },
+    { "CP1258", CS_CP1258, 1 },
+    { "KOI8-R", CS_KOI8_R, 1 },
+    { "KOI8-U", CS_KOI8_U, 1 },
+    { "Mac Roman", CS_MAC_ROMAN, 1 },
+    { "Mac Turkish", CS_MAC_TURKISH, 1 },
+    { "Mac Croatian", CS_MAC_CROATIAN, 1 },
+    { "Mac Iceland", CS_MAC_ICELAND, 1 },
+    { "Mac Romanian", CS_MAC_ROMANIAN, 1 },
+    { "Mac Greek", CS_MAC_GREEK, 1 },
+    { "Mac Cyrillic", CS_MAC_CYRILLIC, 1 },
+    { "Mac Thai", CS_MAC_THAI, 1 },
+    { "Mac Centeuro", CS_MAC_CENTEURO, 1 },
+    { "Mac Symbol", CS_MAC_SYMBOL, 1 },
+    { "Mac Dingbats", CS_MAC_DINGBATS, 1 },
+    { "Mac Roman (old)", CS_MAC_ROMAN_OLD, 0 },
+    { "Mac Croatian (old)", CS_MAC_CROATIAN_OLD, 0 },
+    { "Mac Iceland (old)", CS_MAC_ICELAND_OLD, 0 },
+    { "Mac Romanian (old)", CS_MAC_ROMANIAN_OLD, 0 },
+    { "Mac Greek (old)", CS_MAC_GREEK_OLD, 0 },
+    { "Mac Cyrillic (old)", CS_MAC_CYRILLIC_OLD, 0 },
+    { "Mac Ukraine", CS_MAC_UKRAINE, 1 },
+    { "Mac VT100", CS_MAC_VT100, 1 },
+    { "Mac VT100 (old)", CS_MAC_VT100_OLD, 0 },
+    { "VISCII", CS_VISCII, 1 },
+    { "HP ROMAN8", CS_HP_ROMAN8, 1 },
+    { "DEC MCS", CS_DEC_MCS, 1 },
+    { "UTF-8", CS_UTF8, 1 },
+};
+
+/*
+ * Return our own name for the given charset identifier, or NULL if
+ * the identifier does not appear in localencs[].
+ */
+const char *charset_to_localenc(int charset)
+{
+    const int count = (int)lenof(localencs);
+    int idx = 0;
+
+    while (idx < count) {
+	if (localencs[idx].charset == charset)
+	    return localencs[idx].name;
+	idx++;
+    }
+
+    return NULL;		       /* not found */
+}
+
+/*
+ * Translate a charset name to its CS_* identifier. The MIME and X11
+ * name tables are consulted first, then our own names in
+ * localencs[], compared without regard to case. Returns CS_NONE if
+ * nothing matches.
+ *
+ * Each character is converted to unsigned char before being handed
+ * to tolower(): passing a negative plain char (any byte >= 0x80
+ * where char is signed) is undefined behaviour.
+ */
+int charset_from_localenc(const char *name)
+{
+    int i;
+
+    if ( (i = charset_from_mimeenc(name)) != CS_NONE)
+	return i;
+    if ( (i = charset_from_xenc(name)) != CS_NONE)
+	return i;
+
+    for (i = 0; i < (int)lenof(localencs); i++) {
+	const char *p, *q;
+	p = name;
+	q = localencs[i].name;
+	while (*p || *q) {
+	    if (tolower((unsigned char)*p) != tolower((unsigned char)*q))
+		break;
+	    p++; q++;
+	}
+	/* Only a match if both strings ended at the same point. */
+	if (!*p && !*q)
+	    return localencs[i].charset;
+    }
+
+    return CS_NONE;		       /* not found */
+}
+
+/*
+ * Enumeration helper: return the charset of the n-th entry (counting
+ * from zero) among those localencs[] rows flagged return_in_enum,
+ * or CS_NONE once n runs past the last such row.
+ */
+int charset_localenc_nth(int n)
+{
+    int idx;
+
+    for (idx = 0; idx < (int)lenof(localencs); idx++) {
+	if (!localencs[idx].return_in_enum)
+	    continue;		       /* hidden from the enumeration */
+	if (n == 0)
+	    return localencs[idx].charset;
+	n--;
+    }
+
+    return CS_NONE;		       /* end of list */
+}
--- a/puttysrc/CHARSET/MACENC.C
+++ b/puttysrc/CHARSET/MACENC.C
@@ -0,0 +1,169 @@
+/* $Id: macenc.c 4787 2004-11-16 15:27:00Z simon $ */
+/*
+ * Copyright (c) 2003 Ben Harris
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+ * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * macenc.c -- Convert a Mac OS script/region/font combination to our
+ * internal charset code.
+ */
+
+#include <string.h>
+
+#include "charset.h"
+#include "internal.h"
+
+/*
+ * These are defined by Mac OS's <Script.h>, but we'd like to be
+ * independent of that.
+ *
+ * The sm* values are script codes and the ver* values are region
+ * codes; both are matched against the `script' and `region'
+ * arguments of charset_from_macenc. Every name used by the macencs
+ * table, including its disabled sections, is defined here.
+ */
+
+#define smRoman			0
+#define smJapanese		1
+#define smTradChinese		2
+#define smKorean		3
+#define smArabic		4
+#define smHebrew		5
+#define smCyrillic		7
+#define smDevanagari		9
+#define smGurmukhi		10
+#define smGujurati		11
+#define smThai			21
+#define smSimpChinese		25
+#define smTibetan		26
+#define smEthiopic		28
+#define smCentralEuroRoman	29
+
+#define verGreece		20
+#define verIceland		21
+#define verTurkey		24
+#define verYugoCroatian		25
+#define verRomania		39
+#define verFaroeIsl		47
+#define verIran			48
+#define verRussia		49
+#define verIreland		50
+#define verSlovenian		66
+#define verCroatia		68
+#define verBulgaria		72
+#define verScottishGaelic	75
+#define verManxGaelic		76
+#define verBreton		77
+#define verNunavut		78
+#define verWelsh		79
+#define verIrishGaelicScript	81
+
+/*
+ * Lookup table for charset_from_macenc. Rows are tried from the top
+ * and the first one that matches wins, so a row for a specific
+ * region, minimum system version or font must come before the more
+ * general row it refines.
+ *
+ *   script     Mac OS script code (sm*)
+ *   region     Mac OS region code (ver*), or -1 for any region
+ *   sysvermin  lowest system version the row applies to
+ *   fontname   font the row is restricted to, or NULL for any font
+ *   charset    resulting CS_* identifier
+ */
+static const struct {
+    int script;
+    int region;
+    int sysvermin;
+    char const *fontname;
+    int charset;
+} macencs[] = {
+    { smRoman, -1,                   0x850, "VT100", CS_MAC_VT100 },
+    { smRoman, -1,                   0,     "VT100", CS_MAC_VT100_OLD },
+    /*
+     * From here on, this table is largely derived from
+     * <http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/README.TXT>,
+     * with _OLD version added based on the comments in individual
+     * mapping files.
+     */
+    { smRoman, -1,                   0,     "Symbol", CS_MAC_SYMBOL },
+    { smRoman, -1,                   0,     "Zapf Dingbats", CS_MAC_DINGBATS },
+    { smRoman, verTurkey,            0,     NULL,    CS_MAC_TURKISH },
+    { smRoman, verYugoCroatian,      0x850, NULL,    CS_MAC_CROATIAN },
+    { smRoman, verYugoCroatian,      0,     NULL,    CS_MAC_CROATIAN_OLD },
+    { smRoman, verSlovenian,         0x850, NULL,    CS_MAC_CROATIAN },
+    { smRoman, verSlovenian,         0,     NULL,    CS_MAC_CROATIAN_OLD },
+    { smRoman, verCroatia,           0x850, NULL,    CS_MAC_CROATIAN },
+    { smRoman, verCroatia,           0,     NULL,    CS_MAC_CROATIAN_OLD },
+    { smRoman, verIceland,           0x850, NULL,    CS_MAC_ICELAND },
+    { smRoman, verIceland,           0,     NULL,    CS_MAC_ICELAND_OLD },
+    { smRoman, verFaroeIsl,          0x850, NULL,    CS_MAC_ICELAND },
+    { smRoman, verFaroeIsl,          0,     NULL,    CS_MAC_ICELAND_OLD },
+    { smRoman, verRomania,           0x850, NULL,    CS_MAC_ROMANIAN },
+    { smRoman, verRomania,           0,     NULL,    CS_MAC_ROMANIAN_OLD },
+#if 0 /* No mapping table on ftp.unicode.org */
+    { smRoman, verIreland,           0x850, NULL,    CS_MAC_CELTIC },
+    { smRoman, verIreland,           0,     NULL,    CS_MAC_CELTIC_OLD },
+    { smRoman, verScottishGaelic,    0x850, NULL,    CS_MAC_CELTIC },
+    { smRoman, verScottishGaelic,    0,     NULL,    CS_MAC_CELTIC_OLD },
+    { smRoman, verManxGaelic,        0x850, NULL,    CS_MAC_CELTIC },
+    { smRoman, verManxGaelic,        0,     NULL,    CS_MAC_CELTIC_OLD },
+    { smRoman, verBreton,            0x850, NULL,    CS_MAC_CELTIC },
+    { smRoman, verBreton,            0,     NULL,    CS_MAC_CELTIC_OLD },
+    { smRoman, verWelsh,             0x850, NULL,    CS_MAC_CELTIC },
+    { smRoman, verWelsh,             0,     NULL,    CS_MAC_CELTIC_OLD },
+    { smRoman, verIrishGaelicScript, 0x850, NULL,    CS_MAC_GAELIC },
+    { smRoman, verIrishGaelicScript, 0,     NULL,    CS_MAC_GAELIC_OLD },
+#endif
+    { smRoman, verGreece,            0x922, NULL,    CS_MAC_GREEK },
+    { smRoman, verGreece,            0,     NULL,    CS_MAC_GREEK_OLD },
+    { smRoman, -1,                   0x850, NULL,    CS_MAC_ROMAN },
+    { smRoman, -1,                   0,     NULL,    CS_MAC_ROMAN_OLD },
+#if 0 /* Multi-byte encodings, not yet supported */
+    { smJapanese,    -1,             0,     NULL,    CS_MAC_JAPANESE },
+    { smTradChinese, -1,             0,     NULL,    CS_MAC_CHINTRAD },
+    { smKorean,      -1,             0,     NULL,    CS_MAC_KOREAN },
+#endif
+#if 0 /* Bidirectional encodings, not yet supported */
+    { smArabic, verIran,             0,     NULL,    CS_MAC_FARSI },
+    { smArabic, -1,                  0,     NULL,    CS_MAC_ARABIC },
+    { smHebrew, -1,                  0,     NULL,    CS_MAC_HEBREW },
+#endif
+    { smCyrillic, -1,                0x900, NULL,    CS_MAC_CYRILLIC },
+    { smCyrillic, verRussia,         0,     NULL,    CS_MAC_CYRILLIC_OLD },
+    { smCyrillic, verBulgaria,       0,     NULL,    CS_MAC_CYRILLIC_OLD },
+    { smCyrillic, -1,                0,     NULL,    CS_MAC_UKRAINE },
+#if 0 /* Complex Indic scripts, not yet supported */
+    { smDevanagari, -1,              0,     NULL,    CS_MAC_DEVENAGA },
+    { smGurmukhi, -1,                0,     NULL,    CS_MAC_GURMUKHI },
+    { smGujurati, -1,                0,     NULL,    CS_MAC_GUJURATI },
+#endif
+    { smThai,  -1,                   0,     NULL,    CS_MAC_THAI },
+#if 0 /* Multi-byte encoding, not yet supported */
+    { smSimpChinese, -1,             0,     NULL,    CS_MAC_CHINSIMP },
+#endif
+#if 0 /* No mapping table on ftp.unicode.org */
+    { smTibetan, -1,                 0,     NULL,    CS_MAC_TIBETAN },
+    /* Region-specific row first, or the wildcard row would hide it. */
+    { smEthiopic, verNunavut,        0,     NULL,    CS_MAC_INUIT },
+    { smEthiopic, -1,                0,     NULL,    CS_MAC_ETHIOPIC },
+#endif
+    { smCentralEuroRoman, -1,        0,     NULL,    CS_MAC_CENTEURO },
+};
+
+/*
+ * Map a Mac OS script code, region code, system version and
+ * (optional) font name to one of our charset identifiers, using the
+ * first row of macencs[] that matches. Returns CS_NONE when no row
+ * matches. `fontname' may be NULL, in which case only rows that are
+ * not tied to a font can match.
+ */
+int charset_from_macenc(int script, int region, int sysvers,
+			char const *fontname)
+{
+    int idx;
+
+    for (idx = 0; idx < (int)lenof(macencs); idx++) {
+	if (macencs[idx].script != script)
+	    continue;
+	if (macencs[idx].region >= 0 && macencs[idx].region != region)
+	    continue;
+	if (macencs[idx].sysvermin > sysvers)
+	    continue;
+	if (macencs[idx].fontname != NULL &&
+	    (fontname == NULL ||
+	     strcmp(macencs[idx].fontname, fontname) != 0))
+	    continue;
+
+	return macencs[idx].charset;
+    }
+
+    return CS_NONE;
+}
--- a/puttysrc/CHARSET/MIMEENC.C
+++ b/puttysrc/CHARSET/MIMEENC.C
@@ -0,0 +1,214 @@
+/*
+ * mimeenc.c - translate our internal character set codes to and
+ * from MIME standard character-set names.
+ * 
+ */
+
+#include <ctype.h>
+#include "charset.h"
+#include "internal.h"
+
+static const struct {
+    const char *name;   /* MIME charset name or alias */
+    int charset;        /* the CS_* identifier it maps to */
+} mimeencs[] = {
+    /*
+     * These names are taken from
+     * 
+     *   http://www.iana.org/assignments/character-sets
+     * 
+     * Where multiple encoding names map to the same encoding id
+     * (such as the variety of aliases for ISO-8859-1), the first
+     * is considered canonical and will be returned when
+     * translating the id to a string.
+     *
+     * charset_to_mimeenc relies on that ordering: it scans from the
+     * top and returns the first name whose id matches. Lookups by
+     * name (charset_from_mimeenc) compare case-insensitively, so the
+     * capitalisation used here only affects the returned string.
+     */
+    { "ISO-8859-1", CS_ISO8859_1 },
+    { "iso-ir-100", CS_ISO8859_1 },
+    { "ISO_8859-1", CS_ISO8859_1 },
+    { "ISO_8859-1:1987", CS_ISO8859_1 },
+    { "latin1", CS_ISO8859_1 },
+    { "l1", CS_ISO8859_1 },
+    { "IBM819", CS_ISO8859_1 },
+    { "CP819", CS_ISO8859_1 },
+    { "csISOLatin1", CS_ISO8859_1 },
+
+    { "ISO-8859-2", CS_ISO8859_2 },
+    { "ISO_8859-2:1987", CS_ISO8859_2 },
+    { "iso-ir-101", CS_ISO8859_2 },
+    { "ISO_8859-2", CS_ISO8859_2 },
+    { "latin2", CS_ISO8859_2 },
+    { "l2", CS_ISO8859_2 },
+    { "csISOLatin2", CS_ISO8859_2 },
+
+    { "ISO-8859-3", CS_ISO8859_3 },
+    { "ISO_8859-3:1988", CS_ISO8859_3 },
+    { "iso-ir-109", CS_ISO8859_3 },
+    { "ISO_8859-3", CS_ISO8859_3 },
+    { "latin3", CS_ISO8859_3 },
+    { "l3", CS_ISO8859_3 },
+    { "csISOLatin3", CS_ISO8859_3 },
+
+    { "ISO-8859-4", CS_ISO8859_4 },
+    { "ISO_8859-4:1988", CS_ISO8859_4 },
+    { "iso-ir-110", CS_ISO8859_4 },
+    { "ISO_8859-4", CS_ISO8859_4 },
+    { "latin4", CS_ISO8859_4 },
+    { "l4", CS_ISO8859_4 },
+    { "csISOLatin4", CS_ISO8859_4 },
+
+    { "ISO-8859-5", CS_ISO8859_5 },
+    { "ISO_8859-5:1988", CS_ISO8859_5 },
+    { "iso-ir-144", CS_ISO8859_5 },
+    { "ISO_8859-5", CS_ISO8859_5 },
+    { "cyrillic", CS_ISO8859_5 },
+    { "csISOLatinCyrillic", CS_ISO8859_5 },
+
+    { "ISO-8859-6", CS_ISO8859_6 },
+    { "ISO_8859-6:1987", CS_ISO8859_6 },
+    { "iso-ir-127", CS_ISO8859_6 },
+    { "ISO_8859-6", CS_ISO8859_6 },
+    { "ECMA-114", CS_ISO8859_6 },
+    { "ASMO-708", CS_ISO8859_6 },
+    { "arabic", CS_ISO8859_6 },
+    { "csISOLatinArabic", CS_ISO8859_6 },
+
+    { "ISO-8859-7", CS_ISO8859_7 },
+    { "ISO_8859-7:1987", CS_ISO8859_7 },
+    { "iso-ir-126", CS_ISO8859_7 },
+    { "ISO_8859-7", CS_ISO8859_7 },
+    { "ELOT_928", CS_ISO8859_7 },
+    { "ECMA-118", CS_ISO8859_7 },
+    { "greek", CS_ISO8859_7 },
+    { "greek8", CS_ISO8859_7 },
+    { "csISOLatinGreek", CS_ISO8859_7 },
+
+    { "ISO-8859-8", CS_ISO8859_8 },
+    { "ISO_8859-8:1988", CS_ISO8859_8 },
+    { "iso-ir-138", CS_ISO8859_8 },
+    { "ISO_8859-8", CS_ISO8859_8 },
+    { "hebrew", CS_ISO8859_8 },
+    { "csISOLatinHebrew", CS_ISO8859_8 },
+
+    { "ISO-8859-9", CS_ISO8859_9 },
+    { "ISO_8859-9:1989", CS_ISO8859_9 },
+    { "iso-ir-148", CS_ISO8859_9 },
+    { "ISO_8859-9", CS_ISO8859_9 },
+    { "latin5", CS_ISO8859_9 },
+    { "l5", CS_ISO8859_9 },
+    { "csISOLatin5", CS_ISO8859_9 },
+
+    { "ISO-8859-10", CS_ISO8859_10 },
+    { "iso-ir-157", CS_ISO8859_10 },
+    { "l6", CS_ISO8859_10 },
+    { "ISO_8859-10:1992", CS_ISO8859_10 },
+    { "csISOLatin6", CS_ISO8859_10 },
+    { "latin6", CS_ISO8859_10 },
+
+    { "ISO-8859-13", CS_ISO8859_13 },
+
+    { "ISO-8859-14", CS_ISO8859_14 },
+    { "iso-ir-199", CS_ISO8859_14 },
+    { "ISO_8859-14:1998", CS_ISO8859_14 },
+    { "ISO_8859-14", CS_ISO8859_14 },
+    { "latin8", CS_ISO8859_14 },
+    { "iso-celtic", CS_ISO8859_14 },
+    { "l8", CS_ISO8859_14 },
+
+    { "ISO-8859-15", CS_ISO8859_15 },
+    { "ISO_8859-15", CS_ISO8859_15 },
+    { "Latin-9", CS_ISO8859_15 },
+
+    { "ISO-8859-16", CS_ISO8859_16 },
+    { "iso-ir-226", CS_ISO8859_16 },
+    { "ISO_8859-16", CS_ISO8859_16 },
+    { "ISO_8859-16:2001", CS_ISO8859_16 },
+    { "latin10", CS_ISO8859_16 },
+    { "l10", CS_ISO8859_16 },
+
+    { "IBM437", CS_CP437 },
+    { "cp437", CS_CP437 },
+    { "437", CS_CP437 },
+    { "csPC8CodePage437", CS_CP437 },
+
+    { "IBM850", CS_CP850 },
+    { "cp850", CS_CP850 },
+    { "850", CS_CP850 },
+    { "csPC850Multilingual", CS_CP850 },
+
+    { "IBM866", CS_CP866 },
+    { "cp866", CS_CP866 },
+    { "866", CS_CP866 },
+    { "csIBM866", CS_CP866 },
+
+    { "windows-1250", CS_CP1250 },
+
+    { "windows-1251", CS_CP1251 },
+
+    { "windows-1252", CS_CP1252 },
+
+    { "windows-1253", CS_CP1253 },
+
+    { "windows-1254", CS_CP1254 },
+
+    { "windows-1255", CS_CP1255 },
+
+    { "windows-1256", CS_CP1256 },
+
+    { "windows-1257", CS_CP1257 },
+
+    { "windows-1258", CS_CP1258 },
+
+    { "KOI8-R", CS_KOI8_R },
+    { "csKOI8R", CS_KOI8_R },
+
+    { "KOI8-U", CS_KOI8_U },
+
+    { "macintosh", CS_MAC_ROMAN_OLD },
+    { "mac", CS_MAC_ROMAN_OLD },
+    { "csMacintosh", CS_MAC_ROMAN_OLD },
+
+    { "VISCII", CS_VISCII },
+    { "csVISCII", CS_VISCII },
+
+    { "hp-roman8", CS_HP_ROMAN8 },
+    { "roman8", CS_HP_ROMAN8 },
+    { "r8", CS_HP_ROMAN8 },
+    { "csHPRoman8", CS_HP_ROMAN8 },
+
+    { "DEC-MCS", CS_DEC_MCS },
+    { "dec", CS_DEC_MCS },
+    { "csDECMCS", CS_DEC_MCS },
+
+    { "UTF-8", CS_UTF8 },
+};
+
+/*
+ * Return the canonical MIME name for a charset identifier, i.e. the
+ * first mimeencs[] entry carrying that identifier, or NULL if there
+ * is none.
+ */
+const char *charset_to_mimeenc(int charset)
+{
+    const int count = (int)lenof(mimeencs);
+    int idx = 0;
+
+    while (idx < count) {
+	if (mimeencs[idx].charset == charset)
+	    return mimeencs[idx].name;
+	idx++;
+    }
+
+    return NULL;		       /* not found */
+}
+
+/*
+ * Look up a MIME charset name and return the matching CS_*
+ * identifier, or CS_NONE if the name is not in mimeencs[]. The
+ * comparison ignores case.
+ *
+ * Each character is converted to unsigned char before being handed
+ * to tolower(): passing a negative plain char (any byte >= 0x80
+ * where char is signed) is undefined behaviour.
+ */
+int charset_from_mimeenc(const char *name)
+{
+    int i;
+
+    for (i = 0; i < (int)lenof(mimeencs); i++) {
+	const char *p, *q;
+	p = name;
+	q = mimeencs[i].name;
+	while (*p || *q) {
+	    if (tolower((unsigned char)*p) != tolower((unsigned char)*q))
+		break;
+	    p++; q++;
+	}
+	/* Only a match if both strings ended at the same point. */
+	if (!*p && !*q)
+	    return mimeencs[i].charset;
+    }
+
+    return CS_NONE;		       /* not found */
+}
--- a/puttysrc/CHARSET/README
+++ b/puttysrc/CHARSET/README
@@ -0,0 +1,15 @@
+This subdirectory contains a general character-set conversion
+library, used in the Unix port of PuTTY, and available for use in
+other ports if it should happen to be useful.
+
+This is a variant of a library that's currently used in some other
+programs such as Timber and Halibut. At some future date, we would
+like to merge the two libraries, so that all programs use the same
+libcharset.
+
+It is therefore a _strong_ design goal that this library should remain
+perfectly general, and not tied to particulars of PuTTY. It must not
+reference any code outside its own subdirectory; it should not have
+PuTTY-specific helper routines added to it unless they can be
+documented in a general manner which might make them useful in other
+circumstances as well.
--- a/puttysrc/CHARSET/SBCS.C
+++ b/puttysrc/CHARSET/SBCS.C
@@ -0,0 +1,53 @@
+/*
+ * sbcs.c - routines to handle single-byte character sets.
+ */
+
+#include "charset.h"
+#include "internal.h"
+
+/*
+ * The charset_spec for any single-byte character set should
+ * provide read_sbcs() and write_sbcs() as its read and write
+ * functions, and its `data' field should point to a struct
+ * sbcs_data (see internal.h) holding the translation tables.
+ */
+
+/*
+ * Decode one byte of a single-byte character set: look `input_chr'
+ * up in the charset's sbcs2ucs table and emit the result, which is
+ * ERROR for byte values the charset leaves undefined. Values that
+ * cannot index the 256-entry table are reported as ERROR too,
+ * rather than being read from outside it. `state' is not used.
+ */
+void read_sbcs(charset_spec const *charset, long int input_chr,
+	       charset_state *state,
+	       void (*emit)(void *ctx, long int output), void *emitctx)
+{
+    const struct sbcs_data *sd = charset->data;
+
+    UNUSEDARG(state);
+
+    if (input_chr < 0 || input_chr >= (long int)lenof(sd->sbcs2ucs)) {
+	emit(emitctx, ERROR);
+	return;
+    }
+
+    emit(emitctx, sd->sbcs2ucs[input_chr]);
+}
+
+/*
+ * Encode one Unicode character into a single-byte character set.
+ * The byte is found by binary search over ucs2sbcs[], whose entries
+ * are ordered by the Unicode value each byte maps to. Emits the
+ * byte on success, or ERROR if no byte of this charset maps to
+ * `input_chr'. `state' is not used.
+ */
+void write_sbcs(charset_spec const *charset, long int input_chr,
+		charset_state *state,
+		void (*emit)(void *ctx, long int output), void *emitctx)
+{
+    const struct sbcs_data *sd = charset->data;
+    int below = -1;		       /* last index known to be too small */
+    int above = sd->nvalid;	       /* first index known to be too big */
+
+    UNUSEDARG(state);
+
+    /* Narrow the open interval (below, above) until it is empty. */
+    while (above - below > 1) {
+	int mid = (below + above) / 2;
+	int byte = sd->ucs2sbcs[mid];
+
+	if (input_chr == sd->sbcs2ucs[byte]) {
+	    emit(emitctx, byte);
+	    return;
+	}
+	if (input_chr < sd->sbcs2ucs[byte])
+	    above = mid;
+	else
+	    below = mid;
+    }
+
+    emit(emitctx, ERROR);
+}
--- a/puttysrc/CHARSET/SBCS.DAT
+++ b/puttysrc/CHARSET/SBCS.DAT
--- a/puttysrc/CHARSET/SBCSDAT.C
+++ b/puttysrc/CHARSET/SBCSDAT.C
--- a/puttysrc/CHARSET/SBCSGEN.PL
+++ b/puttysrc/CHARSET/SBCSGEN.PL
@@ -0,0 +1,110 @@
+#!/usr/bin/env perl
+
+# This script generates sbcsdat.c (the data for all the SBCSes) from its
+# source form sbcs.dat.
+#
+# Lines of sbcs.dat recognised below:
+#   charset NAME           start collecting values for charset NAME
+#   sortpriority LO-HI N   add N to the sort priority of byte values
+#                          LO..HI (both hexadecimal)
+#   hex values             space-separated Unicode values, XXXX meaning
+#                          "no mapping"; once 256 have been collected
+#                          the charset is written out
+
+# Warnings are switched on here rather than with `-w' on the #! line:
+# Linux hands everything after the interpreter path to it as a single
+# argument, so `env perl -w' would search for a program called "perl -w".
+use warnings;
+
+$infile = "sbcs.dat";
+$outfile = "sbcsdat.c";
+
+open(FOO, "<", $infile) or die "$infile: unable to open for reading: $!\n";
+open(BAR, ">", $outfile) or die "$outfile: unable to open for writing: $!\n";
+select BAR;
+
+print "/*\n";
+print " * sbcsdat.c - data definitions for single-byte character sets.\n";
+print " *\n";
+print " * Generated by sbcsgen.pl from sbcs.dat.\n";
+print " * You should edit those files rather than editing this one.\n";
+print " */\n";
+print "\n";
+print "#ifndef ENUM_CHARSETS\n";
+print "\n";
+print "#include \"charset.h\"\n";
+print "#include \"internal.h\"\n";
+print "\n";
+
+my $charsetname = undef;
+my @vals = ();
+
+my @charsetnames = ();
+my @sortpriority = ();
+
+while (<FOO>) {
+    chomp;
+    if (/^charset (.*)$/) {
+	$charsetname = $1;
+	@vals = ();
+	@sortpriority = map { 0 } 0..255;
+    } elsif (/^sortpriority ([^-]*)-([^-]*) (.*)$/) {
+	for ($i = hex $1; $i <= hex $2; $i++) {
+	    $sortpriority[$i] += $3;
+	}
+    } elsif (/^[0-9a-fA-FX]/) {
+	push @vals, map { $_ eq "XXXX" ? -1 : hex $_ } split / +/, $_;
+	if (scalar @vals > 256) {
+	    die "$infile:$.: charset $charsetname has more than 256 values\n";
+	} elsif (scalar @vals == 256) {
+	    # A full table has been read: emit it and reset for the next one.
+	    &outcharset($charsetname, \@vals, \@sortpriority);
+	    push @charsetnames, $charsetname;
+	    $charsetname = undef;
+	    @vals = ();
+	    @sortpriority = map { 0 } 0..255;
+	}
+    }
+}
+
+print "#else /* ENUM_CHARSETS */\n";
+print "\n";
+
+foreach $i (@charsetnames) {
+    print "ENUM_CHARSET($i)\n";
+}
+
+print "\n";
+print "#endif /* ENUM_CHARSETS */\n";
+
+close FOO;
+# Buffered output is flushed by close, so this is where a failed write
+# (for example a full disk) becomes visible.
+close BAR or die "$outfile: error while writing: $!\n";
+
+# Print the C definitions for one character set to the selected output
+# handle: a `data_NAME' table followed by the `charset_NAME'
+# charset_spec that points at it.
+#
+#   $name          charset identifier, used verbatim in the C names
+#   $vals          ref to the 256 Unicode values, indexed by byte value;
+#                  a negative value marks a byte with no mapping
+#   $sortpriority  ref to the 256 sort priorities, indexed by byte value
+#
+# The second array printed lists byte values in ascending order of
+# their Unicode value. When several bytes share a Unicode value only
+# one is listed: the one with the highest priority, or the lowest byte
+# value if priorities tie.
+sub outcharset($$$) {
+    my ($name, $vals, $sortpriority) = @_;
+    # $uval and $j are lexical so that calling this sub does not
+    # overwrite package globals of the same names.
+    my ($prefix, $i, $j, $uval, @sorted);
+
+    print "static const sbcs_data data_$name = {\n";
+    print "    {\n";
+    $prefix = "    ";
+    @sorted = ();
+    for ($i = 0; $i < 256; $i++) {
+	if ($vals->[$i] < 0) {
+	    printf "%sERROR ", $prefix;
+	} else {
+	    printf "%s0x%04x", $prefix, $vals->[$i];
+	    die "ooh? $i\n" unless defined $sortpriority->[$i];
+	    # [byte value, Unicode value, priority]
+	    push @sorted, [$i, $vals->[$i], 0+$sortpriority->[$i]];
+	}
+	# Eight entries per output line.
+	if ($i % 8 == 7) {
+	    $prefix = ",\n    ";
+	} else {
+	    $prefix = ", ";
+	}
+    }
+    print "\n    },\n    {\n";
+    # Unicode value ascending, then priority descending, then byte
+    # value ascending.
+    @sorted = sort { $a->[1] <=> $b->[1] ||
+		     $b->[2] <=> $a->[2] ||
+		     $a->[0] <=> $b->[0] } @sorted;
+    $prefix = "    ";
+    $uval = -1;
+    for ($i = $j = 0; $i < scalar @sorted; $i++) {
+	next if ($uval == $sorted[$i]->[1]); # low-priority alternative
+	$uval = $sorted[$i]->[1];
+	printf "%s0x%02x", $prefix, $sorted[$i]->[0];
+	if ($j % 8 == 7) {
+	    $prefix = ",\n    ";
+	} else {
+	    $prefix = ", ";
+	}
+	$j++;
+    }
+    # $j is now the number of entries printed in the second array.
+    printf "\n    },\n    %d\n", $j;
+    print "};\n";
+    print "const charset_spec charset_$name = {\n" .
+          "    $name, read_sbcs, write_sbcs, &data_$name\n};\n\n";
+}
--- a/puttysrc/CHARSET/SLOOKUP.C
+++ b/puttysrc/CHARSET/SLOOKUP.C
@@ -0,0 +1,29 @@
+/*
+ * slookup.c - static lookup of character sets.
+ */
+
+#include "charset.h"
+#include "internal.h"
+
+#define ENUM_CHARSET(x) extern charset_spec const charset_##x; /* first pass: declare each spec */
+#include "enum.c" /* presumably one ENUM_CHARSET(x) per charset - confirm in enum.c */
+#undef ENUM_CHARSET
+
+static charset_spec const *const cs_table[] = { /* searched by charset_find_spec() */
+
+#define ENUM_CHARSET(x) &charset_##x, /* second pass: one table entry per spec */
+#include "enum.c"
+#undef ENUM_CHARSET
+
+};
+
+/*
+ * Return the entry of cs_table whose `charset' field equals the
+ * given identifier, or NULL if the table has no such entry. The
+ * table is scanned from the front, so the first match wins.
+ */
+charset_spec const *charset_find_spec(int charset)
+{
+    charset_spec const *const *entry = cs_table;
+    charset_spec const *const *end = cs_table + lenof(cs_table);
+
+    while (entry != end) {
+	if ((*entry)->charset == charset)
+	    return *entry;
+	entry++;
+    }
+
+    return NULL;
+}
--- a/puttysrc/CHARSET/TOUCS.C
+++ b/puttysrc/CHARSET/TOUCS.C
@@ -0,0 +1,89 @@
+/*
+ * toucs.c - convert charsets to Unicode.
+ */
+
+#include "charset.h"
+#include "internal.h"
+
+struct unicode_emit_param {
+    wchar_t *output;		       /* next free slot in the output buffer */
+    int outlen;			       /* number of slots still free at `output' */
+    const wchar_t *errstr;	       /* written in place of ERROR; NULL selects U+FFFD */
+    int errlen;			       /* number of wchar_t in errstr */
+    int stopped;		       /* set to 1 when an emit did not fit */
+};
+
+/*
+ * Emit callback used by charset_to_unicode(); `ctx' is a struct
+ * unicode_emit_param.
+ *
+ * An ordinary value is stored as one wchar_t. ERROR is replaced by
+ * the caller's error string, or by U+FFFD when there is none. The
+ * replacement is written in full or not at all: if it does not fit
+ * in the remaining space, nothing is written and `stopped' is set.
+ */
+static void unicode_emit(void *ctx, long int output)
+{
+    struct unicode_emit_param *sink = (struct unicode_emit_param *)ctx;
+    wchar_t single;
+    wchar_t const *src = &single;
+    int count = 1;
+
+    if (output != ERROR) {
+	single = output;
+    } else if (!sink->errstr) {
+	single = 0xFFFD;	       /* U+FFFD REPLACEMENT CHARACTER */
+    } else {
+	src = sink->errstr;
+	count = sink->errlen;
+    }
+
+    if (sink->outlen < count) {
+	sink->stopped = 1;
+	return;
+    }
+
+    for (; count > 0; count--) {
+	*sink->output++ = *src++;
+	sink->outlen--;
+    }
+}
+
+/*
+ * Convert bytes in character set `charset' to Unicode.
+ *
+ * Bytes are taken from *input one at a time; *input is advanced and
+ * *inlen decremented for each byte that was fully processed. Output
+ * goes to `output', which has room for `outlen' wchar_t. Each ERROR
+ * produced by the decoder is replaced by `errstr' (`errlen' wchar_t
+ * long), or by U+FFFD if `errstr' is NULL.
+ *
+ * `state' may be NULL, in which case decoding starts from a zero
+ * state and the final state is discarded. Otherwise *state is read
+ * on entry and updated after every byte that was fully processed.
+ *
+ * Returns the number of wchar_t written. If the output for some byte
+ * does not fit, conversion stops there: that byte is left
+ * unconsumed, *state is left as it was before it, and the count
+ * returned excludes anything written on its behalf.
+ *
+ * If `charset' is not known to charset_find_spec(), nothing is
+ * consumed or written and 0 is returned.
+ */
+int charset_to_unicode(char **input, int *inlen, wchar_t *output, int outlen,
+		       int charset, charset_state *state,
+		       const wchar_t *errstr, int errlen)
+{
+    charset_spec const *spec = charset_find_spec(charset);
+    charset_state localstate;
+    struct unicode_emit_param param;
+
+    /*
+     * charset_find_spec() returns NULL when it has no table entry
+     * for the charset; there is then no read function to call.
+     */
+    if (!spec)
+	return 0;
+
+    param.output = output;
+    param.outlen = outlen;
+    param.errstr = errstr;
+    param.errlen = errlen;
+    param.stopped = 0;
+
+    if (!state) {
+	localstate.s0 = 0;
+    } else {
+	localstate = *state;	       /* structure copy */
+    }
+
+    while (*inlen > 0) {
+	int lenbefore = param.output - output;
+	spec->read(spec, (unsigned char)**input, &localstate,
+		   unicode_emit, &param);
+	if (param.stopped) {
+	    /*
+	     * The emit function has _tried_ to output some
+	     * characters, but ran up against the end of the
+	     * buffer. Leave immediately, and return what happened
+	     * _before_ attempting to process this character.
+	     */
+	    return lenbefore;
+	}
+	/* Commit the state only once the byte's output is known to fit. */
+	if (state)
+	    *state = localstate;   /* structure copy */
+	(*input)++;
+	(*inlen)--;
+    }
+
+    return param.output - output;
+}
--- a/puttysrc/CHARSET/UTF8.C
+++ b/puttysrc/CHARSET/UTF8.C
@@ -0,0 +1,882 @@
+/*
+ * utf8.c - routines to handle UTF-8.
+ */
+
+#ifndef ENUM_CHARSETS
+
+#include "charset.h"
+#include "internal.h"
+
+void read_utf8(charset_spec const *, long int, charset_state *,
+	       void (*)(void *, long int), void *);   /* decoder, defined below */
+void write_utf8(charset_spec const *, long int,
+		charset_state *, void (*)(void *, long int), void *);   /* encoder, defined below */
+
+/*
+ * UTF-8 has no associated data, so `charset' may be ignored.
+ *
+ * read_utf8() is given one input byte per call in `input_chr'. The
+ * partially decoded character is carried between calls in
+ * state->s0. `emit' is called (with `emitctx' passed through) zero
+ * or more times per byte: with the decoded value when a character
+ * is completed, and with ERROR for each malformed byte, rejected
+ * character or abandoned incomplete sequence.
+ */
+
+void read_utf8(charset_spec const *charset, long int input_chr,
+	       charset_state *state,
+	       void (*emit)(void *ctx, long int output), void *emitctx)
+{
+    UNUSEDARG(charset);
+
+    /*
+     * For reading UTF-8, the `state' word contains:
+     * 
+     *  - in bits 29-31, the number of bytes expected to be in the
+     *    current multibyte character (which we can tell instantly
+     *    from the first byte, of course).
+     * 
+     *  - in bits 26-28, the number of bytes _seen so far_ in the
+     *    current multibyte character.
+     * 
+     *  - in the remainder of the word, the current value of the
+     *    character, which is shifted upwards by 6 bits to
+     *    accommodate each new byte.
+     * 
+     * As required, the state is zero when we are not in the middle
+     * of a multibyte character at all.
+     * 
+     * For example, when reading E9 8D 8B, starting at state=0:
+     * 
+     *  - after E9, the state is 0x64000009
+     *  - after 8D, the state is 0x6800024d
+     *  - after 8B, the state conceptually becomes 0x6c00934b, at
+     *    which point we notice we've got as many characters as we
+     *    were expecting, output U+934B, and reset the state to
+     *    zero.
+     *
+     * Note that the maximum number of bits we might need to store
+     * in the character value field is 25 (U+7FFFFFFF contains 31
+     * bits, but we will never actually store its full value
+     * because when we receive the last 6 bits in the final
+     * continuation byte we will output it and revert the state to
+     * zero). Hence the character value field never collides with
+     * the byte counts.
+     */
+
+    if (input_chr < 0x80) {
+	/*
+	 * Single-byte character. If the state is nonzero before
+	 * coming here, output an error for an incomplete sequence.
+	 * Then output the character.
+	 */
+	if (state->s0 != 0) {
+	    emit(emitctx, ERROR);
+	    state->s0 = 0;
+	}
+	emit(emitctx, input_chr);
+    } else if (input_chr == 0xFE || input_chr == 0xFF) {
+	/*
+	 * FE and FF bytes should _never_ occur in UTF-8. They are
+	 * automatic errors; if the state was nonzero to start
+	 * with, output a further error for an incomplete sequence.
+	 */
+	if (state->s0 != 0) {
+	    emit(emitctx, ERROR);
+	    state->s0 = 0;
+	}
+	emit(emitctx, ERROR);
+    } else if (input_chr >= 0x80 && input_chr < 0xC0) {
+	/*
+	 * Continuation byte. Output an error for an unexpected
+	 * continuation byte, if the state is zero.
+	 */
+	if (state->s0 == 0) {
+	    emit(emitctx, ERROR);
+	} else {
+	    unsigned long charval;
+	    unsigned long topstuff;
+	    int bytes;
+
+	    /*
+	     * Otherwise, accumulate more of the character value.
+	     */
+	    charval = state->s0 & 0x03ffffffL;   /* bits 0-25: value so far */
+	    charval = (charval << 6) | (input_chr & 0x3F);
+
+	    /*
+	     * Check the byte counts; if we have not reached the
+	     * end of the character, update the state and return.
+	     */
+	    topstuff = state->s0 & 0xfc000000L;   /* bits 26-31: the two counts */
+	    topstuff += 0x04000000L;   /* add one to the byte count */
+	    /*
+	     * Shifting left by 3 lines the seen count (bits 26-28) up
+	     * with the expected count (bits 29-31); the XOR is
+	     * nonzero in those bits while the two still differ.
+	     */
+	    if (((topstuff << 3) ^ topstuff) & 0xe0000000L) {
+		state->s0 = topstuff | charval;
+		return;
+	    }
+
+	    /*
+	     * Now we know we've reached the end of the character.
+	     * `charval' is the Unicode value. We should check for
+	     * various invalid things, and then either output
+	     * charval or an error. In all cases we reset the state
+	     * to zero.
+	     */
+	    bytes = topstuff >> 29;   /* expected length of the sequence */
+	    state->s0 = 0;
+
+	    if (charval >= 0xD800 && charval < 0xE000) {
+		/*
+		 * Surrogates (0xD800-0xDFFF) may never be encoded
+		 * in UTF-8. A surrogate pair in Unicode should
+		 * have been encoded as a single UTF-8 character
+		 * occupying more than three bytes.
+		 */
+		emit(emitctx, ERROR);
+	    } else if (charval == 0xFFFE || charval == 0xFFFF) {
+		/*
+		 * U+FFFE and U+FFFF are invalid Unicode characters
+		 * and may never be encoded in UTF-8. (This is one
+		 * reason why U+FFFF is our way of signalling an
+		 * error to our `emit' function :-)
+		 */
+		emit(emitctx, ERROR);
+	    } else if ((charval <= 0x7FL /* && bytes > 1 */) ||
+		       (charval <= 0x7FFL && bytes > 2) ||
+		       (charval <= 0xFFFFL && bytes > 3) ||
+		       (charval <= 0x1FFFFFL && bytes > 4) ||
+		       (charval <= 0x3FFFFFFL && bytes > 5)) {
+		/*
+		 * Overlong sequences are not to be tolerated,
+		 * under any circumstances.
+		 */
+		emit(emitctx, ERROR);
+	    } else {
+		/*
+		 * Oh, all right. We'll let this one off.
+		 */
+		emit(emitctx, charval);
+	    }
+	}
+
+    } else {
+	/*
+	 * Lead byte. First output an error for an incomplete
+	 * sequence, if the state is nonzero.
+	 */
+	if (state->s0 != 0)
+	    emit(emitctx, ERROR);
+
+	/*
+	 * Now deal with the lead byte: work out the number of
+	 * bytes we expect to see in this character, and extract
+	 * the initial bits of it too.
+	 */
+	if (input_chr >= 0xC0 && input_chr < 0xE0) {
+	    state->s0 = 0x44000000L | (input_chr & 0x1F);   /* expect 2, seen 1 */
+	} else if (input_chr >= 0xE0 && input_chr < 0xF0) {
+	    state->s0 = 0x64000000L | (input_chr & 0x0F);   /* expect 3, seen 1 */
+	} else if (input_chr >= 0xF0 && input_chr < 0xF8) {
+	    state->s0 = 0x84000000L | (input_chr & 0x07);   /* expect 4, seen 1 */
+	} else if (input_chr >= 0xF8 && input_chr < 0xFC) {
+	    state->s0 = 0xa4000000L | (input_chr & 0x03);   /* expect 5, seen 1 */
+	} else if (input_chr >= 0xFC && input_chr < 0xFE) {
+	    state->s0 = 0xc4000000L | (input_chr & 0x01);   /* expect 6, seen 1 */
+	}
+    }
+}
+
+/*
+ * UTF-8 is a stateless multi-byte encoding (in the sense that just
+ * after any character has been completed, the state is always the
+ * same); hence when writing it, there is no need to use the
+ * charset_state.
+ *
+ * write_utf8() encodes the single code point `input_chr', calling
+ * `emit' (with `emitctx' passed through) once per output byte, in
+ * order. Sequences of up to six bytes are produced, so values up
+ * to 0x7FFFFFFF can be encoded. ERROR is emitted instead, once, for
+ * a value that must not be encoded: a surrogate, U+FFFE, U+FFFF,
+ * or anything outside 0..0x7FFFFFFF.
+ */
+
+void write_utf8(charset_spec const *charset, long int input_chr,
+		charset_state *state,
+		void (*emit)(void *ctx, long int output), void *emitctx)
+{
+    UNUSEDARG(charset);
+    UNUSEDARG(state);
+
+    /*
+     * Refuse to output any illegal code points. The unsigned
+     * comparison rejects negative values as well as values too
+     * large for six bytes; the former would otherwise be emitted
+     * as a raw "byte" and the latter silently truncated.
+     */
+    if ((unsigned long)input_chr > 0x7FFFFFFFUL ||
+	input_chr == 0xFFFE || input_chr == 0xFFFF ||
+	(input_chr >= 0xD800 && input_chr < 0xE000)) {
+	emit(emitctx, ERROR);
+    } else if (input_chr < 0x80) {     /* one-byte character */
+	emit(emitctx, input_chr);
+    } else if (input_chr < 0x800) {    /* two-byte character */
+	emit(emitctx, 0xC0 | (0x1F & (input_chr >>  6)));
+	emit(emitctx, 0x80 | (0x3F & (input_chr      )));
+    } else if (input_chr < 0x10000) {  /* three-byte character */
+	emit(emitctx, 0xE0 | (0x0F & (input_chr >> 12)));
+	emit(emitctx, 0x80 | (0x3F & (input_chr >>  6)));
+	emit(emitctx, 0x80 | (0x3F & (input_chr      )));
+    } else if (input_chr < 0x200000) { /* four-byte character */
+	emit(emitctx, 0xF0 | (0x07 & (input_chr >> 18)));
+	emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
+	emit(emitctx, 0x80 | (0x3F & (input_chr >>  6)));
+	emit(emitctx, 0x80 | (0x3F & (input_chr      )));
+    } else if (input_chr < 0x4000000) {/* five-byte character */
+	emit(emitctx, 0xF8 | (0x03 & (input_chr >> 24)));
+	emit(emitctx, 0x80 | (0x3F & (input_chr >> 18)));
+	emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
+	emit(emitctx, 0x80 | (0x3F & (input_chr >>  6)));
+	emit(emitctx, 0x80 | (0x3F & (input_chr      )));
+    } else {			       /* six-byte character */
+	emit(emitctx, 0xFC | (0x01 & (input_chr >> 30)));
+	emit(emitctx, 0x80 | (0x3F & (input_chr >> 24)));
+	emit(emitctx, 0x80 | (0x3F & (input_chr >> 18)));
+	emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
+	emit(emitctx, 0x80 | (0x3F & (input_chr >>  6)));
+	emit(emitctx, 0x80 | (0x3F & (input_chr      )));
+    }
+}
+
+#ifdef TESTMODE
+
+#include <stdio.h>
+#include <stdarg.h>
+
+int total_errs = 0;
+
+/*
+ * Emit callback for the tests: `ctx' points at a wchar_t cursor.
+ * The value is stored where the cursor points and the cursor is
+ * moved on by one element. No bounds are checked.
+ */
+void utf8_emit(void *ctx, long output)
+{
+    wchar_t **cursor = (wchar_t **)ctx;
+    wchar_t *slot = *cursor;
+
+    *slot = output;
+    *cursor = slot + 1;
+}
+
+/*
+ * Feed the `inlen' bytes of `input' through read_utf8(), starting
+ * from a zero state, and compare the values it emits with the
+ * variadic list of expected values, which is terminated by -1.
+ * Each discrepancy is reported on stdout, tagged with `line', and
+ * counted in total_errs.
+ *
+ * NOTE(review): the expected values are fetched with
+ * va_arg(ap, long int), while the calls in main() pass plain int
+ * constants such as 0 and -1. Where int and long differ in size
+ * that mismatch is undefined behaviour - confirm, and cast at the
+ * call sites.
+ */
+void utf8_read_test(int line, char *input, int inlen, ...)
+{
+    va_list ap;
+    wchar_t *p, str[512];
+    int i;
+    charset_state state;
+    unsigned long l;
+
+    state.s0 = 0;
+    p = str;
+
+    for (i = 0; i < inlen; i++)
+	read_utf8(NULL, input[i] & 0xFF, &state, utf8_emit, &p);
+
+    va_start(ap, inlen);
+    l = 0;
+    for (i = 0; i < p - str; i++) {
+	l = va_arg(ap, long int);
+	if (l == -1) {
+	    printf("%d: correct string shorter than output\n", line);
+	    total_errs++;
+	    break;
+	}
+	if (l != str[i]) {
+	    /*
+	     * Both values are printed as unsigned long: `l' has that
+	     * type already, and wchar_t need not be the unsigned int
+	     * that a plain %x conversion requires.
+	     */
+	    printf("%d: char %d came out as %08lx, should be %08lx\n",
+		    line, i, (unsigned long)str[i], l);
+	    total_errs++;
+	}
+    }
+    /* Unless the terminator was already hit, it must come next. */
+    if (l != -1) {
+	l = va_arg(ap, long int);
+	if (l != -1) {
+	    printf("%d: correct string longer than output\n", line);
+	    total_errs++;
+	}
+    }
+    va_end(ap);
+}
+
+/*
+ * Feed the `inlen' code points of `input' through write_utf8() and
+ * compare the bytes it emits with the variadic list of expected
+ * values, which is terminated by -1. Each discrepancy is reported
+ * on stdout, tagged with `line', and counted in total_errs.
+ *
+ * NOTE(review): the expected values are fetched with
+ * va_arg(ap, long int), so every one of them - including the -1
+ * terminator - has to be passed as a long; where int and long
+ * differ in size an int argument is undefined behaviour. Confirm
+ * at the call sites.
+ */
+void utf8_write_test(int line, const long *input, int inlen, ...)
+{
+    va_list ap;
+    wchar_t *p, str[512];
+    int i;
+    charset_state state;
+    unsigned long l;
+
+    state.s0 = 0;
+    p = str;
+
+    for (i = 0; i < inlen; i++)
+	write_utf8(NULL, input[i], &state, utf8_emit, &p);
+
+    va_start(ap, inlen);
+    l = 0;
+    for (i = 0; i < p - str; i++) {
+	l = va_arg(ap, long int);
+	if (l == -1) {
+	    printf("%d: correct string shorter than output\n", line);
+	    total_errs++;
+	    break;
+	}
+	if (l != str[i]) {
+	    /*
+	     * Both values are printed as unsigned long: `l' has that
+	     * type already, and wchar_t need not be the unsigned int
+	     * that a plain %x conversion requires.
+	     */
+	    printf("%d: char %d came out as %08lx, should be %08lx\n",
+		    line, i, (unsigned long)str[i], l);
+	    total_errs++;
+	}
+    }
+    /* Unless the terminator was already hit, it must come next. */
+    if (l != -1) {
+	l = va_arg(ap, long int);
+	if (l != -1) {
+	    printf("%d: correct string longer than output\n", line);
+	    total_errs++;
+	}
+    }
+    va_end(ap);
+}
+
+/* Macro to concoct the first three parameters of utf8_read_test:
+ * the source line of the call, the string, and its length. The
+ * length comes from lenof(x), which is defined outside this file;
+ * presumably it is an element count that includes the string
+ * literal's terminating NUL, which would explain the 0 that ends
+ * each expected-value list in main() - confirm against lenof. */
+#define TESTSTR(x) __LINE__, x, lenof(x)
+
+int main(void)
+{
+    printf("read tests beginning\n");
+    utf8_read_test(TESTSTR("\xCE\xBA\xE1\xBD\xB9\xCF\x83\xCE\xBC\xCE\xB5"),
+		   0x000003BA, /* GREEK SMALL LETTER KAPPA */
+		   0x00001F79, /* GREEK SMALL LETTER OMICRON WITH OXIA */
+		   0x000003C3, /* GREEK SMALL LETTER SIGMA */
+		   0x000003BC, /* GREEK SMALL LETTER MU */
+		   0x000003B5, /* GREEK SMALL LETTER EPSILON */
+		   0, -1);
+    utf8_read_test(TESTSTR("\x00"),
+		   0x00000000, /* <control> */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xC2\x80"),
+		   0x00000080, /* <control> */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xE0\xA0\x80"),
+		   0x00000800, /* <no name available> */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xF0\x90\x80\x80"),
+		   0x00010000, /* <no name available> */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xF8\x88\x80\x80\x80"),
+		   0x00200000, /* <no name available> */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xFC\x84\x80\x80\x80\x80"),
+		   0x04000000, /* <no name available> */
+		   0, -1);
+    utf8_read_test(TESTSTR("\x7F"),
+		   0x0000007F, /* <control> */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xDF\xBF"),
+		   0x000007FF, /* <no name available> */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xEF\xBF\xBD"),
+		   0x0000FFFD, /* REPLACEMENT CHARACTER */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xEF\xBF\xBF"),
+		   ERROR,      /* <no name available> (invalid char) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xF7\xBF\xBF\xBF"),
+		   0x001FFFFF, /* <no name available> */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xFB\xBF\xBF\xBF\xBF"),
+		   0x03FFFFFF, /* <no name available> */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xFD\xBF\xBF\xBF\xBF\xBF"),
+		   0x7FFFFFFF, /* <no name available> */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xED\x9F\xBF"),
+		   0x0000D7FF, /* <no name available> */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xEE\x80\x80"),
+		   0x0000E000, /* <Private Use, First> */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xEF\xBF\xBD"),
+		   0x0000FFFD, /* REPLACEMENT CHARACTER */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xF4\x8F\xBF\xBF"),
+		   0x0010FFFF, /* <no name available> */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xF4\x90\x80\x80"),
+		   0x00110000, /* <no name available> */
+		   0, -1);
+    utf8_read_test(TESTSTR("\x80"),
+		   ERROR,      /* (unexpected continuation byte) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xBF"),
+		   ERROR,      /* (unexpected continuation byte) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\x80\xBF"),
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\x80\xBF\x80"),
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\x80\xBF\x80\xBF"),
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80"),
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80\xBF"),
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80\xBF\x80"),
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF"),
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   ERROR,      /* (unexpected continuation byte) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xC0\x20\xC1\x20\xC2\x20\xC3\x20\xC4\x20\xC5\x20\xC6\x20\xC7\x20"),
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xE0\x20\xE1\x20\xE2\x20\xE3\x20\xE4\x20\xE5\x20\xE6\x20\xE7\x20\xE8\x20\xE9\x20\xEA\x20\xEB\x20\xEC\x20\xED\x20\xEE\x20\xEF\x20"),
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xF0\x20\xF1\x20\xF2\x20\xF3\x20\xF4\x20\xF5\x20\xF6\x20\xF7\x20"),
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xF8\x20\xF9\x20\xFA\x20\xFB\x20"),
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xFC\x20\xFD\x20"),
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   ERROR,      /* (incomplete sequence) */
+		   0x00000020, /* SPACE */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xC0"),
+		   ERROR,      /* (incomplete sequence) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xE0\x80"),
+		   ERROR,      /* (incomplete sequence) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xF0\x80\x80"),
+		   ERROR,      /* (incomplete sequence) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xF8\x80\x80\x80"),
+		   ERROR,      /* (incomplete sequence) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80"),
+		   ERROR,      /* (incomplete sequence) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xDF"),
+		   ERROR,      /* (incomplete sequence) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xEF\xBF"),
+		   ERROR,      /* (incomplete sequence) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xF7\xBF\xBF"),
+		   ERROR,      /* (incomplete sequence) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xFB\xBF\xBF\xBF"),
+		   ERROR,      /* (incomplete sequence) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xFD\xBF\xBF\xBF\xBF"),
+		   ERROR,      /* (incomplete sequence) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xC0\xE0\x80\xF0\x80\x80\xF8\x80\x80\x80\xFC\x80\x80\x80\x80\xDF\xEF\xBF\xF7\xBF\xBF\xFB\xBF\xBF\xBF\xFD\xBF\xBF\xBF\xBF"),
+		   ERROR,      /* (incomplete sequence) */
+		   ERROR,      /* (incomplete sequence) */
+		   ERROR,      /* (incomplete sequence) */
+		   ERROR,      /* (incomplete sequence) */
+		   ERROR,      /* (incomplete sequence) */
+		   ERROR,      /* (incomplete sequence) */
+		   ERROR,      /* (incomplete sequence) */
+		   ERROR,      /* (incomplete sequence) */
+		   ERROR,      /* (incomplete sequence) */
+		   ERROR,      /* (incomplete sequence) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xFE"),
+		   ERROR,      /* (invalid UTF-8 byte) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xFF"),
+		   ERROR,      /* (invalid UTF-8 byte) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xFE\xFE\xFF\xFF"),
+		   ERROR,      /* (invalid UTF-8 byte) */
+		   ERROR,      /* (invalid UTF-8 byte) */
+		   ERROR,      /* (invalid UTF-8 byte) */
+		   ERROR,      /* (invalid UTF-8 byte) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xC0\xAF"),
+		   ERROR,      /* SOLIDUS (overlong form of 2F) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xE0\x80\xAF"),
+		   ERROR,      /* SOLIDUS (overlong form of 2F) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xF0\x80\x80\xAF"),
+		   ERROR,      /* SOLIDUS (overlong form of 2F) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xF8\x80\x80\x80\xAF"),
+		   ERROR,      /* SOLIDUS (overlong form of 2F) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80\xAF"),
+		   ERROR,      /* SOLIDUS (overlong form of 2F) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xC1\xBF"),
+		   ERROR,      /* <control> (overlong form of 7F) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xE0\x9F\xBF"),
+		   ERROR,      /* <no name available> (overlong form of DF BF) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xF0\x8F\xBF\xBF"),
+		   ERROR,      /* <no name available> (overlong form of EF BF BF) (invalid char) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xF8\x87\xBF\xBF\xBF"),
+		   ERROR,      /* <no name available> (overlong form of F7 BF BF BF) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xFC\x83\xBF\xBF\xBF\xBF"),
+		   ERROR,      /* <no name available> (overlong form of FB BF BF BF BF) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xC0\x80"),
+		   ERROR,      /* <control> (overlong form of 00) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xE0\x80\x80"),
+		   ERROR,      /* <control> (overlong form of 00) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xF0\x80\x80\x80"),
+		   ERROR,      /* <control> (overlong form of 00) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xF8\x80\x80\x80\x80"),
+		   ERROR,      /* <control> (overlong form of 00) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80\x80"),
+		   ERROR,      /* <control> (overlong form of 00) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xED\xA0\x80"),
+		   ERROR,      /* <Non Private Use High Surrogate, First> (surrogate) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xED\xAD\xBF"),
+		   ERROR,      /* <Non Private Use High Surrogate, Last> (surrogate) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xED\xAE\x80"),
+		   ERROR,      /* <Private Use High Surrogate, First> (surrogate) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xED\xAF\xBF"),
+		   ERROR,      /* <Private Use High Surrogate, Last> (surrogate) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xED\xB0\x80"),
+		   ERROR,      /* <Low Surrogate, First> (surrogate) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xED\xBE\x80"),
+		   ERROR,      /* <no name available> (surrogate) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xED\xBF\xBF"),
+		   ERROR,      /* <Low Surrogate, Last> (surrogate) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xED\xA0\x80\xED\xB0\x80"),
+		   ERROR,      /* <Non Private Use High Surrogate, First> (surrogate) */
+		   ERROR,      /* <Low Surrogate, First> (surrogate) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xED\xA0\x80\xED\xBF\xBF"),
+		   ERROR,      /* <Non Private Use High Surrogate, First> (surrogate) */
+		   ERROR,      /* <Low Surrogate, Last> (surrogate) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xED\xAD\xBF\xED\xB0\x80"),
+		   ERROR,      /* <Non Private Use High Surrogate, Last> (surrogate) */
+		   ERROR,      /* <Low Surrogate, First> (surrogate) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xED\xAD\xBF\xED\xBF\xBF"),
+		   ERROR,      /* <Non Private Use High Surrogate, Last> (surrogate) */
+		   ERROR,      /* <Low Surrogate, Last> (surrogate) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xED\xAE\x80\xED\xB0\x80"),
+		   ERROR,      /* <Private Use High Surrogate, First> (surrogate) */
+		   ERROR,      /* <Low Surrogate, First> (surrogate) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xED\xAE\x80\xED\xBF\xBF"),
+		   ERROR,      /* <Private Use High Surrogate, First> (surrogate) */
+		   ERROR,      /* <Low Surrogate, Last> (surrogate) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xED\xAF\xBF\xED\xB0\x80"),
+		   ERROR,      /* <Private Use High Surrogate, Last> (surrogate) */
+		   ERROR,      /* <Low Surrogate, First> (surrogate) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xED\xAF\xBF\xED\xBF\xBF"),
+		   ERROR,      /* <Private Use High Surrogate, Last> (surrogate) */
+		   ERROR,      /* <Low Surrogate, Last> (surrogate) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xEF\xBF\xBE"),
+		   ERROR,      /* <no name available> (invalid char) */
+		   0, -1);
+    utf8_read_test(TESTSTR("\xEF\xBF\xBF"),
+		   ERROR,      /* <no name available> (invalid char) */
+		   0, -1);
+    printf("read tests completed\n");
+    printf("write tests beginning\n");
+    {
+	const static long str[] =
+	{0x03BAL, 0x1F79L, 0x03C3L, 0x03BCL, 0x03B5L, 0};
+	utf8_write_test(TESTSTR(str),
+			0xCE, 0xBA,
+			0xE1, 0xBD, 0xB9,
+			0xCF, 0x83,
+			0xCE, 0xBC,
+			0xCE, 0xB5,
+			0, -1);
+    }
+    {
+	const static long str[] = {0x0000L, 0};
+	utf8_write_test(TESTSTR(str),
+			0x00,
+			0, -1);
+    }
+    {
+	const static long str[] = {0x0080L, 0};
+	utf8_write_test(TESTSTR(str),
+			0xC2, 0x80,
+			0, -1);
+    }
+    {
+	const static long str[] = {0x0800L, 0};
+	utf8_write_test(TESTSTR(str),
+			0xE0, 0xA0, 0x80,
+			0, -1);
+    }
+    {
+	const static long str[] = {0x00010000L, 0};
+	utf8_write_test(TESTSTR(str),
+			0xF0, 0x90, 0x80, 0x80,
+			0, -1);
+    }
+    {
+	const static long str[] = {0x00200000L, 0};
+	utf8_write_test(TESTSTR(str),
+			0xF8, 0x88, 0x80, 0x80, 0x80,
+			0, -1);
+    }
+    {
+	const static long str[] = {0x04000000L, 0};
+	utf8_write_test(TESTSTR(str),
+			0xFC, 0x84, 0x80, 0x80, 0x80, 0x80,
+			0, -1);
+    }
+    {
+	const static long str[] = {0x007FL, 0};
+	utf8_write_test(TESTSTR(str),
+			0x7F,
+			0, -1);
+    }
+    {
+	const static long str[] = {0x07FFL, 0};
+	utf8_write_test(TESTSTR(str),
+			0xDF, 0xBF,
+			0, -1);
+    }
+    {
+	const static long str[] = {0xFFFDL, 0};
+	utf8_write_test(TESTSTR(str),
+			0xEF, 0xBF, 0xBD,
+			0, -1);
+    }
+    {
+	const static long str[] = {0xFFFFL, 0};
+	utf8_write_test(TESTSTR(str),
+			ERROR,
+			0, -1);
+    }
+    {
+	const static long str[] = {0x001FFFFFL, 0};
+	utf8_write_test(TESTSTR(str),
+			0xF7, 0xBF, 0xBF, 0xBF,
+			0, -1);
+    }
+    {
+	const static long str[] = {0x03FFFFFFL, 0};
+	utf8_write_test(TESTSTR(str),
+			0xFB, 0xBF, 0xBF, 0xBF, 0xBF,
+			0, -1);
+    }
+    {
+	const static long str[] = {0x7FFFFFFFL, 0};
+	utf8_write_test(TESTSTR(str),
+			0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,
+			0, -1);
+    }
+    {
+	const static long str[] = {0xD7FFL, 0};
+	utf8_write_test(TESTSTR(str),
+			0xED, 0x9F, 0xBF,
+			0, -1);
+    }
+    {
+	const static long str[] = {0xD800L, 0};
+	utf8_write_test(TESTSTR(str),
+			ERROR,
+			0, -1);
+    }
+    {
+	const static long str[] = {0xD800L, 0xDC00L, 0};
+	utf8_write_test(TESTSTR(str),
+			ERROR,
+			ERROR,
+			0, -1);
+    }
+    {
+	const static long str[] = {0xDFFFL, 0};
+	utf8_write_test(TESTSTR(str),
+			ERROR,
+			0, -1);
+    }
+    {
+	const static long str[] = {0xE000L, 0};
+	utf8_write_test(TESTSTR(str),
+			0xEE, 0x80, 0x80,
+			0, -1);
+    }
+    printf("write tests completed\n");
+
+    printf("total: %d errors\n", total_errs);
+    return (total_errs != 0);
+}
+#endif /* TESTMODE */
+
+/*
+ * Charset descriptor for UTF-8, tying the CS_UTF8 id to the UTF-8 reader
+ * and writer defined earlier in this file.
+ *
+ * NOTE(review): the initializers are positional; the meaning of each slot
+ * (in particular the trailing NULL) is fixed by the charset_spec
+ * declaration, which lives outside this file - confirm against it.
+ */
+const charset_spec charset_CS_UTF8 = {
+    CS_UTF8,
+    read_utf8,
+    write_utf8,
+    NULL
+};
+
+#else /* ENUM_CHARSETS */
+
+/*
+ * In the ENUM_CHARSETS branch this file contributes nothing except a
+ * single ENUM_CHARSET entry naming CS_UTF8.
+ * NOTE(review): ENUM_CHARSET is not defined in this file; presumably the
+ * including translation unit supplies it - confirm its expansion there.
+ */
+ENUM_CHARSET(CS_UTF8)
+
+#endif /* ENUM_CHARSETS */
--- a/puttysrc/CHARSET/XENC.C
+++ b/puttysrc/CHARSET/XENC.C
@@ -0,0 +1,93 @@
+/*
+ * xenc.c - translate our internal character set codes to and from
+ * X11 character encoding names.
+ * 
+ */
+
+#include <ctype.h>
+#include "charset.h"
+#include "internal.h"
+
+/*
+ * Lookup table pairing X11 font-encoding names with internal charset
+ * ids. It is scanned linearly, front to back, by both charset_to_xenc()
+ * and charset_from_xenc(), so the ORDER of entries is significant: when
+ * one id has several names, the earliest entry is the one reported back
+ * for that id.
+ */
+static const struct {
+    const char *name;                  /* X11 encoding name */
+    int charset;                       /* matching CS_* charset id */
+} xencs[] = {
+    /*
+     * Officially registered encoding names. This list is derived
+     * from the font encodings section of
+     *
+     *   http://ftp.x.org/pub/DOCS/registry
+     *
+     * Where multiple encoding names map to the same encoding id
+     * (such as iso8859-15 and fcd8859-15), the first is considered
+     * canonical and will be returned when translating the id to a
+     * string.
+     */
+    { "iso8859-1", CS_ISO8859_1 },
+    { "iso8859-2", CS_ISO8859_2 },
+    { "iso8859-3", CS_ISO8859_3 },
+    { "iso8859-4", CS_ISO8859_4 },
+    { "iso8859-5", CS_ISO8859_5 },
+    { "iso8859-6", CS_ISO8859_6 },
+    { "iso8859-7", CS_ISO8859_7 },
+    { "iso8859-8", CS_ISO8859_8 },
+    { "iso8859-9", CS_ISO8859_9 },
+    { "iso8859-10", CS_ISO8859_10 },
+    { "iso8859-13", CS_ISO8859_13 },
+    { "iso8859-14", CS_ISO8859_14 },
+    { "iso8859-15", CS_ISO8859_15 },
+    { "fcd8859-15", CS_ISO8859_15 },
+    { "hp-roman8", CS_HP_ROMAN8 },
+    { "koi8-r", CS_KOI8_R },
+    /*
+     * Unofficial encoding names found in the wild.
+     */
+    { "iso8859-16", CS_ISO8859_16 },
+    { "koi8-u", CS_KOI8_U },
+    { "ibm-cp437", CS_CP437 },
+    { "ibm-cp850", CS_CP850 },
+    { "ibm-cp866", CS_CP866 },
+    { "microsoft-cp1250", CS_CP1250 },
+    { "microsoft-cp1251", CS_CP1251 },
+    { "microsoft-cp1252", CS_CP1252 },
+    { "microsoft-cp1253", CS_CP1253 },
+    { "microsoft-cp1254", CS_CP1254 },
+    { "microsoft-cp1255", CS_CP1255 },
+    { "microsoft-cp1256", CS_CP1256 },
+    { "microsoft-cp1257", CS_CP1257 },
+    { "microsoft-cp1258", CS_CP1258 },
+    { "mac-roman", CS_MAC_ROMAN },
+    /* Two spellings of the VISCII name; "viscii1.1-1" comes first. */
+    { "viscii1.1-1", CS_VISCII },
+    { "viscii1-1", CS_VISCII },
+};
+
+/*
+ * Map an internal charset id to its X11 encoding name.
+ *
+ * Walks the xencs table from the front and hands back the name stored in
+ * the first entry whose id equals `charset`, so ids with several names
+ * always yield the earliest one listed. The returned string belongs to
+ * the static table. Yields NULL when no entry carries that id.
+ */
+const char *charset_to_xenc(int charset)
+{
+    const int count = (int)lenof(xencs);
+    int idx = 0;
+
+    while (idx < count) {
+        if (xencs[idx].charset == charset)
+            return xencs[idx].name;
+        idx++;
+    }
+
+    /* Ran off the end of the table: no X11 name for this id. */
+    return NULL;
+}
+
+/*
+ * Map an X11 encoding name to an internal charset id.
+ *
+ * `name` is compared against every entry of the xencs table in order.
+ * The comparison ignores case (via tolower()) and must match the entire
+ * string, not just a prefix. Returns the charset id of the first entry
+ * that matches, or CS_NONE if `name` is NULL or matches nothing.
+ */
+int charset_from_xenc(const char *name)
+{
+    int i;
+
+    /* A missing name cannot match anything; don't dereference it. */
+    if (!name)
+        return CS_NONE;
+
+    for (i = 0; i < (int)lenof(xencs); i++) {
+        const char *p, *q;
+        p = name;
+        q = xencs[i].name;
+        while (*p || *q) {
+            /*
+             * The <ctype.h> functions require an argument representable
+             * as unsigned char (or EOF). Where plain char is signed, a
+             * byte >= 0x80 in the caller's string would be negative and
+             * invoke undefined behaviour, so convert explicitly.
+             */
+            if (tolower((unsigned char)*p) != tolower((unsigned char)*q))
+                break;
+            p++; q++;
+        }
+        /* Only a match if both strings ended at the same point. */
+        if (!*p && !*q)
+            return xencs[i].charset;
+    }
+
+    return CS_NONE;                    /* not found */
+}