checking in all the old panacean stuff

This commit is contained in:
2016-07-25 15:42:39 -04:00
parent c996cdd81f
commit 8fd9e44ae5
1210 changed files with 220657 additions and 0 deletions

154
puttysrc/CHARSET/CHARSET.H Normal file
View File

@@ -0,0 +1,154 @@
/*
* charset.h - header file for general character set conversion
* routines.
*/
#ifndef charset_charset_h
#define charset_charset_h
#include <stddef.h>
/*
* Enumeration that lists all the multibyte or single-byte
* character sets known to this library.
*/
typedef enum {
CS_NONE, /* used for reporting errors, etc */
CS_ISO8859_1,
CS_ISO8859_1_X11, /* X font encoding with VT100 glyphs */
CS_ISO8859_2,
CS_ISO8859_3,
CS_ISO8859_4,
CS_ISO8859_5,
CS_ISO8859_6,
CS_ISO8859_7,
CS_ISO8859_8,
CS_ISO8859_9,
CS_ISO8859_10,
CS_ISO8859_11,
CS_ISO8859_13,
CS_ISO8859_14,
CS_ISO8859_15,
CS_ISO8859_16,
CS_CP437,
CS_CP850,
CS_CP866,
CS_CP1250,
CS_CP1251,
CS_CP1252,
CS_CP1253,
CS_CP1254,
CS_CP1255,
CS_CP1256,
CS_CP1257,
CS_CP1258,
CS_KOI8_R,
CS_KOI8_U,
CS_MAC_ROMAN,
CS_MAC_TURKISH,
CS_MAC_CROATIAN,
CS_MAC_ICELAND,
CS_MAC_ROMANIAN,
CS_MAC_GREEK,
CS_MAC_CYRILLIC,
CS_MAC_THAI,
CS_MAC_CENTEURO,
CS_MAC_SYMBOL,
CS_MAC_DINGBATS,
CS_MAC_ROMAN_OLD,
CS_MAC_CROATIAN_OLD,
CS_MAC_ICELAND_OLD,
CS_MAC_ROMANIAN_OLD,
CS_MAC_GREEK_OLD,
CS_MAC_CYRILLIC_OLD,
CS_MAC_UKRAINE,
CS_MAC_VT100,
CS_MAC_VT100_OLD,
CS_VISCII,
CS_HP_ROMAN8,
CS_DEC_MCS,
CS_UTF8
} charset_t;
/*
 * Conversion state carried between successive calls to the
 * conversion routines below. The converters start from s0 == 0 when
 * the caller passes no state, so a zeroed structure is the initial
 * state.
 */
typedef struct {
unsigned long s0;
} charset_state;
/*
* Routine to convert a MB/SB character set to Unicode.
*
* This routine accepts some number of bytes, updates a state
* variable, and outputs some number of Unicode characters. There
* are no guarantees. You can't even guarantee that at most one
* Unicode character will be output per byte you feed in; for
* example, suppose you're reading UTF-8, you've seen E1 80, and
* then you suddenly see FE. Now you need to output _two_ error
* characters - one for the incomplete sequence E1 80, and one for
* the completely invalid UTF-8 byte FE.
*
* Returns the number of wide characters output; will never output
* more than the size of the buffer (as specified on input).
* Advances the `input' pointer and decrements `inlen', to indicate
* how far along the input string it got.
*
* The sequence of `errlen' wide characters pointed to by `errstr'
* will be used to indicate a conversion error. If `errstr' is
* NULL, `errlen' will be ignored, and the library will choose
* something sensible to do on its own. For Unicode, this will be
* U+FFFD (REPLACEMENT CHARACTER).
*/
int charset_to_unicode(char **input, int *inlen, wchar_t *output, int outlen,
int charset, charset_state *state,
const wchar_t *errstr, int errlen);
/*
* Routine to convert Unicode to an MB/SB character set.
*
* This routine accepts some number of Unicode characters, updates
* a state variable, and outputs some number of bytes.
*
* Returns the number of bytes output; will never output
* more than the size of the buffer (as specified on input), and
* will never output a partial MB character. Advances the `input'
* pointer and decrements `inlen', to indicate how far along the
* input string it got.
*
* The sequence of `errlen' characters pointed to by `errstr' will
* be used to indicate a conversion error. If `errstr' is NULL,
* `errlen' will be ignored, and the library will choose something
* sensible to do on its own (which will vary depending on the
* output charset).
*/
int charset_from_unicode(wchar_t **input, int *inlen, char *output, int outlen,
int charset, charset_state *state,
const char *errstr, int errlen);
/*
* Convert X11 encoding names to and from our charset identifiers.
*/
const char *charset_to_xenc(int charset);
int charset_from_xenc(const char *name);
/*
* Convert MIME encoding names to and from our charset identifiers.
*/
const char *charset_to_mimeenc(int charset);
int charset_from_mimeenc(const char *name);
/*
* Convert our own encoding names to and from our charset
* identifiers.
*/
const char *charset_to_localenc(int charset);
int charset_from_localenc(const char *name);
int charset_localenc_nth(int n);
/*
* Convert Mac OS script/region/font to our charset identifiers.
*/
int charset_from_macenc(int script, int region, int sysvers,
const char *fontname);
#endif /* charset_charset_h */

19
puttysrc/CHARSET/ENUM.C Normal file
View File

@@ -0,0 +1,19 @@
/*
* enum.c - enumerate all charsets defined by the library.
*
* This file maintains a list of every other source file which
* contains ENUM_CHARSET definitions. It #includes each one with
* ENUM_CHARSETS defined, which causes those source files to do
* nothing at all except call the ENUM_CHARSET macro on each
* charset they define.
*
* This file in turn is included from various other places, with
* the ENUM_CHARSET macro defined to various different things. This
* allows us to have multiple implementations of the master charset
* lookup table (a static one and a dynamic one).
*/
#define ENUM_CHARSETS
#include "sbcsdat.c"
#include "utf8.c"
#undef ENUM_CHARSETS

View File

@@ -0,0 +1,91 @@
/*
* fromucs.c - convert Unicode to other character sets.
*/
#include "charset.h"
#include "internal.h"
/*
 * Context handed to charset_emit() through its `ctx' pointer while
 * charset_from_unicode() is running.
 */
struct charset_emit_param {
char *output; /* next free position in the caller's output buffer */
int outlen; /* bytes of space remaining at `output' */
const char *errstr; /* bytes to write when ERROR is emitted */
int errlen; /* number of bytes at `errstr' */
int stopped; /* set to 1 when an emit did not fit in the buffer */
};
static void charset_emit(void *ctx, long int output)
{
struct charset_emit_param *param = (struct charset_emit_param *)ctx;
char outval;
char const *p;
int outlen;
if (output == ERROR) {
p = param->errstr;
outlen = param->errlen;
} else {
outval = output;
p = &outval;
outlen = 1;
}
if (param->outlen >= outlen) {
while (outlen > 0) {
*param->output++ = *p++;
param->outlen--;
outlen--;
}
} else {
param->stopped = 1;
}
}
/*
 * Convert Unicode to the given MB/SB character set.
 *
 * input, inlen   - pointer to the input cursor and to the count of
 *                  wide characters remaining; both are advanced past
 *                  every character that was fully converted.
 * output, outlen - destination buffer and its size in bytes.
 * charset        - one of the charset_t identifiers.
 * state          - optional conversion state. If non-NULL it is read
 *                  on entry and updated after each character that
 *                  was fully converted; if NULL a fresh zero state
 *                  is used.
 * errstr, errlen - bytes to emit for an unrepresentable character.
 *                  If errstr is NULL, errlen is ignored and a single
 *                  "." is used.
 *
 * Returns the number of bytes written to `output'. Returns 0 without
 * consuming any input if `charset' is not known to the library.
 */
int charset_from_unicode(wchar_t **input, int *inlen, char *output, int outlen,
                         int charset, charset_state *state,
                         const char *errstr, int errlen)
{
    charset_spec const *spec = charset_find_spec(charset);
    charset_state localstate;
    struct charset_emit_param param;

    /* Unknown charset: nothing we can convert with. */
    if (!spec)
        return 0;

    param.output = output;
    param.outlen = outlen;
    param.stopped = 0;

    /*
     * charset_emit will expect a valid errstr, so only take the
     * caller's values when a string was actually supplied.
     */
    if (!errstr) {
        /* *shrug* this is good enough, and consistent across all SBCS... */
        param.errstr = ".";
        param.errlen = 1;
    } else {
        param.errstr = errstr;
        param.errlen = errlen;
    }

    /*
     * Work on a private copy of the state so that a character which
     * turns out not to fit leaves the caller's state untouched.
     */
    if (!state) {
        localstate.s0 = 0;
    } else {
        localstate = *state;           /* structure copy */
    }

    while (*inlen > 0) {
        int lenbefore = param.output - output;

        spec->write(spec, **input, &localstate, charset_emit, &param);

        if (param.stopped) {
            /*
             * The emit function has _tried_ to output some
             * characters, but ran up against the end of the
             * buffer. Leave immediately, and return what happened
             * _before_ attempting to process this character.
             */
            return lenbefore;
        }

        /* Character fully converted: commit state and advance. */
        if (state)
            *state = localstate;       /* structure copy */
        (*input)++;
        (*inlen)--;
    }

    return param.output - output;
}

View File

@@ -0,0 +1,89 @@
/*
* internal.h - internal header stuff for the charset library.
*/
#ifndef charset_internal_h
#define charset_internal_h
/* This invariably comes in handy */
#define lenof(x) ( sizeof((x)) / sizeof(*(x)) )
/* This is an invalid Unicode value used to indicate an error. */
#define ERROR 0xFFFFL /* Unicode value representing error */
typedef struct charset_spec charset_spec;
typedef struct sbcs_data sbcs_data;
/*
 * Description of one character set: its identifier, the pair of
 * per-character conversion functions, and whatever private data
 * those functions need.
 */
struct charset_spec {
int charset; /* numeric identifier */
/*
* A function to read the character set and output Unicode
* characters. The `emit' function expects to get Unicode chars
* passed to it; it should be sent ERROR for any encoding error
* on the input.
*/
void (*read)(charset_spec const *charset, long int input_chr,
charset_state *state,
void (*emit)(void *ctx, long int output), void *emitctx);
/*
* A function to read Unicode characters and output in this
* character set. The `emit' function expects to get byte
* values passed to it; it should be sent ERROR for any
* non-representable characters on the input.
*/
void (*write)(charset_spec const *charset, long int input_chr,
charset_state *state,
void (*emit)(void *ctx, long int output), void *emitctx);
/*
 * Private data for `read' and `write'. The SBCS functions
 * interpret it as a pointer to struct sbcs_data.
 */
void const *data;
};
/*
* This is the format of `data' used by the SBCS read and write
* functions; so it's the format used in all SBCS definitions.
*/
struct sbcs_data {
/*
* This is a simple mapping table converting each SBCS position
* to a Unicode code point. Some positions may contain ERROR,
* indicating that that byte value is not defined in the SBCS
* in question and its occurrence in input is an error.
*/
unsigned long sbcs2ucs[256];
/*
* This lookup table is used to convert Unicode back to the
* SBCS. It consists of the valid byte values in the SBCS,
* sorted in order of their Unicode translation. So given a
* Unicode value U, you can do a binary search on this table
* using the above table as a lookup: when testing the Xth
* position in this table, you branch according to whether
* sbcs2ucs[ucs2sbcs[X]] is less than, greater than, or equal
* to U.
*
* Note that since there may be fewer than 256 valid byte
* values in a particular SBCS, we must supply the length of
* this table as well as the contents.
*/
unsigned char ucs2sbcs[256];
/* Number of leading entries of ucs2sbcs[] that are in use. */
int nvalid;
};
/*
* Prototypes for internal library functions.
*/
charset_spec const *charset_find_spec(int charset);
void read_sbcs(charset_spec const *charset, long int input_chr,
charset_state *state,
void (*emit)(void *ctx, long int output), void *emitctx);
void write_sbcs(charset_spec const *charset, long int input_chr,
charset_state *state,
void (*emit)(void *ctx, long int output), void *emitctx);
/*
* Placate compiler warning about unused parameters, of which we
* expect to have some in this library.
*/
#define UNUSEDARG(x) ( (x) = (x) )
#endif /* charset_internal_h */

125
puttysrc/CHARSET/LOCALENC.C Normal file
View File

@@ -0,0 +1,125 @@
/*
* localenc.c - translate our internal character set codes to and from
* our own set of plausibly legible character-set names. Also
* provides a canonical name for each encoding (useful for software
* announcing what character set it will be using), and a set of
* enumeration functions which return a list of supported
* encodings one by one.
*
* charset_from_localenc will attempt all other text translations
* as well as this table, to maximise the number of different ways
* you can select a supported charset.
*/
#include <ctype.h>
#include "charset.h"
#include "internal.h"
/*
 * Table of our own names for each charset identifier.
 *
 * Every entry can be looked up by name (charset_from_localenc) or by
 * identifier (charset_to_localenc); only entries whose
 * return_in_enum flag is nonzero are visited by
 * charset_localenc_nth().
 */
static const struct {
const char *name;
int charset;
int return_in_enum; /* enumeration misses some charsets */
} localencs[] = {
{ "<UNKNOWN>", CS_NONE, 0 },
{ "ISO-8859-1", CS_ISO8859_1, 1 },
{ "ISO-8859-1 with X11 line drawing", CS_ISO8859_1_X11, 0 },
{ "ISO-8859-2", CS_ISO8859_2, 1 },
{ "ISO-8859-3", CS_ISO8859_3, 1 },
{ "ISO-8859-4", CS_ISO8859_4, 1 },
{ "ISO-8859-5", CS_ISO8859_5, 1 },
{ "ISO-8859-6", CS_ISO8859_6, 1 },
{ "ISO-8859-7", CS_ISO8859_7, 1 },
{ "ISO-8859-8", CS_ISO8859_8, 1 },
{ "ISO-8859-9", CS_ISO8859_9, 1 },
{ "ISO-8859-10", CS_ISO8859_10, 1 },
{ "ISO-8859-11", CS_ISO8859_11, 1 },
{ "ISO-8859-13", CS_ISO8859_13, 1 },
{ "ISO-8859-14", CS_ISO8859_14, 1 },
{ "ISO-8859-15", CS_ISO8859_15, 1 },
{ "ISO-8859-16", CS_ISO8859_16, 1 },
{ "CP437", CS_CP437, 1 },
{ "CP850", CS_CP850, 1 },
{ "CP866", CS_CP866, 1 },
{ "CP1250", CS_CP1250, 1 },
{ "CP1251", CS_CP1251, 1 },
{ "CP1252", CS_CP1252, 1 },
{ "CP1253", CS_CP1253, 1 },
{ "CP1254", CS_CP1254, 1 },
{ "CP1255", CS_CP1255, 1 },
{ "CP1256", CS_CP1256, 1 },
{ "CP1257", CS_CP1257, 1 },
{ "CP1258", CS_CP1258, 1 },
{ "KOI8-R", CS_KOI8_R, 1 },
{ "KOI8-U", CS_KOI8_U, 1 },
{ "Mac Roman", CS_MAC_ROMAN, 1 },
{ "Mac Turkish", CS_MAC_TURKISH, 1 },
{ "Mac Croatian", CS_MAC_CROATIAN, 1 },
{ "Mac Iceland", CS_MAC_ICELAND, 1 },
{ "Mac Romanian", CS_MAC_ROMANIAN, 1 },
{ "Mac Greek", CS_MAC_GREEK, 1 },
{ "Mac Cyrillic", CS_MAC_CYRILLIC, 1 },
{ "Mac Thai", CS_MAC_THAI, 1 },
{ "Mac Centeuro", CS_MAC_CENTEURO, 1 },
{ "Mac Symbol", CS_MAC_SYMBOL, 1 },
{ "Mac Dingbats", CS_MAC_DINGBATS, 1 },
/* The "(old)" Mac variants are nameable but left out of the enumeration. */
{ "Mac Roman (old)", CS_MAC_ROMAN_OLD, 0 },
{ "Mac Croatian (old)", CS_MAC_CROATIAN_OLD, 0 },
{ "Mac Iceland (old)", CS_MAC_ICELAND_OLD, 0 },
{ "Mac Romanian (old)", CS_MAC_ROMANIAN_OLD, 0 },
{ "Mac Greek (old)", CS_MAC_GREEK_OLD, 0 },
{ "Mac Cyrillic (old)", CS_MAC_CYRILLIC_OLD, 0 },
{ "Mac Ukraine", CS_MAC_UKRAINE, 1 },
{ "Mac VT100", CS_MAC_VT100, 1 },
{ "Mac VT100 (old)", CS_MAC_VT100_OLD, 0 },
{ "VISCII", CS_VISCII, 1 },
{ "HP ROMAN8", CS_HP_ROMAN8, 1 },
{ "DEC MCS", CS_DEC_MCS, 1 },
{ "UTF-8", CS_UTF8, 1 },
};
/*
 * Return our own name for `charset', or NULL if the identifier does
 * not appear in the localencs table.
 */
const char *charset_to_localenc(int charset)
{
    const int count = (int)lenof(localencs);
    int idx = 0;

    while (idx < count) {
        if (localencs[idx].charset == charset)
            return localencs[idx].name;
        idx++;
    }

    return NULL;                       /* not found */
}
/*
 * Translate a character-set name into a charset identifier.
 *
 * The name is tried first as a MIME name, then as an X11 encoding
 * name, and finally (case-insensitively) against our own table of
 * names. Returns CS_NONE if nothing matches.
 */
int charset_from_localenc(const char *name)
{
    int i;

    if ( (i = charset_from_mimeenc(name)) != CS_NONE)
        return i;
    if ( (i = charset_from_xenc(name)) != CS_NONE)
        return i;

    for (i = 0; i < (int)lenof(localencs); i++) {
        const char *p, *q;

        p = name;
        q = localencs[i].name;
        while (*p || *q) {
            /*
             * The <ctype.h> functions are only defined for values
             * representable as unsigned char (or EOF), so a plain
             * `char' holding a byte >= 0x80 must be converted first.
             */
            if (tolower((unsigned char)*p) != tolower((unsigned char)*q))
                break;
            p++; q++;
        }
        /* Both strings ended together: every character matched. */
        if (!*p && !*q)
            return localencs[i].charset;
    }

    return CS_NONE;                    /* not found */
}
/*
 * Enumerate the charsets that are flagged for enumeration: return
 * the n'th such charset (counting from zero) in table order, or
 * CS_NONE once `n' runs past the end of the list.
 */
int charset_localenc_nth(int n)
{
    int idx;

    for (idx = 0; idx < (int)lenof(localencs); idx++) {
        /* Entries not flagged for enumeration don't count towards n. */
        if (!localencs[idx].return_in_enum)
            continue;
        if (n == 0)
            return localencs[idx].charset;
        n--;
    }

    return CS_NONE;                    /* end of list */
}

169
puttysrc/CHARSET/MACENC.C Normal file
View File

@@ -0,0 +1,169 @@
/* $Id: macenc.c 4787 2004-11-16 15:27:00Z simon $ */
/*
* Copyright (c) 2003 Ben Harris
* All rights reserved.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
* CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/*
* macenc.c -- Convert a Mac OS script/region/font combination to our
* internal charset code.
*/
#include <string.h>
#include "charset.h"
#include "internal.h"
/*
* These are defined by Mac OS's <Script.h>, but we'd like to be
* independent of that.
*/
#define smRoman 0
#define smJapanese 1
#define smTradChinese 2
#define smKorean 3
#define smArabic 4
#define smHebrew 5
#define smCyrillic 7
#define smDevenagari 9
#define smGurmukhi 10
#define smGujurati 11
#define smThai 21
#define smSimpChinese 25
#define smTibetan 26
#define smEthiopic 28
#define smCentralEuroRoman 29
#define verGreece 20
#define verIceland 21
#define verTurkey 24
#define verYugoCroatian 25
#define verRomania 39
#define verFaroeIsl 47
#define verIran 48
#define verRussia 49
#define verSlovenian 66
#define verCroatia 68
#define verBulgaria 72
#define verScottishGaelic 75
#define verManxGaelic 76
#define verBreton 77
#define verNunavut 78
#define verWelsh 79
#define verIrishGaelicScript 81
/*
 * Lookup table for charset_from_macenc(). The table is searched in
 * order and the first matching entry wins, so more specific entries
 * (a particular font name, a particular region, a higher minimum
 * system version) must come before the more general ones they
 * overlap with.
 *
 *  - region < 0 matches any region;
 *  - sysvermin is the lowest system version the entry applies to;
 *  - fontname == NULL matches any font.
 *
 * NOTE(review): several identifiers inside the `#if 0' sections do
 * not match anything defined above (verIreland has no #define;
 * smDevanagari vs. the defined smDevenagari; verNanavut vs. the
 * defined verNunavut), and the CS_MAC_* names they use are not in
 * charset_t. They would need fixing before those sections could be
 * enabled.
 */
static const struct {
int script;
int region;
int sysvermin;
char const *fontname;
int charset;
} macencs[] = {
{ smRoman, -1, 0x850, "VT100", CS_MAC_VT100 },
{ smRoman, -1, 0, "VT100", CS_MAC_VT100_OLD },
/*
* From here on, this table is largely derived from
* <http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/README.TXT>,
* with _OLD version added based on the comments in individual
* mapping files.
*/
{ smRoman, -1, 0, "Symbol", CS_MAC_SYMBOL },
{ smRoman, -1, 0, "Zapf Dingbats", CS_MAC_DINGBATS },
{ smRoman, verTurkey, 0, NULL, CS_MAC_TURKISH },
{ smRoman, verYugoCroatian, 0x850, NULL, CS_MAC_CROATIAN },
{ smRoman, verYugoCroatian, 0, NULL, CS_MAC_CROATIAN_OLD },
{ smRoman, verSlovenian, 0x850, NULL, CS_MAC_CROATIAN },
{ smRoman, verSlovenian, 0, NULL, CS_MAC_CROATIAN_OLD },
{ smRoman, verCroatia, 0x850, NULL, CS_MAC_CROATIAN },
{ smRoman, verCroatia, 0, NULL, CS_MAC_CROATIAN_OLD },
{ smRoman, verIceland, 0x850, NULL, CS_MAC_ICELAND },
{ smRoman, verIceland, 0, NULL, CS_MAC_ICELAND_OLD },
{ smRoman, verFaroeIsl, 0x850, NULL, CS_MAC_ICELAND },
{ smRoman, verFaroeIsl, 0, NULL, CS_MAC_ICELAND_OLD },
{ smRoman, verRomania, 0x850, NULL, CS_MAC_ROMANIAN },
{ smRoman, verRomania, 0, NULL, CS_MAC_ROMANIAN_OLD },
#if 0 /* No mapping table on ftp.unicode.org */
{ smRoman, verIreland, 0x850, NULL, CS_MAC_CELTIC },
{ smRoman, verIreland, 0, NULL, CS_MAC_CELTIC_OLD },
{ smRoman, verScottishGaelic, 0x850, NULL, CS_MAC_CELTIC },
{ smRoman, verScottishGaelic, 0, NULL, CS_MAC_CELTIC_OLD },
{ smRoman, verManxGaelic, 0x850, NULL, CS_MAC_CELTIC },
{ smRoman, verManxGaelic, 0, NULL, CS_MAC_CELTIC_OLD },
{ smRoman, verBreton, 0x850, NULL, CS_MAC_CELTIC },
{ smRoman, verBreton, 0, NULL, CS_MAC_CELTIC_OLD },
{ smRoman, verWelsh, 0x850, NULL, CS_MAC_CELTIC },
{ smRoman, verWelsh, 0, NULL, CS_MAC_CELTIC_OLD },
{ smRoman, verIrishGaelicScript, 0x850, NULL, CS_MAC_GAELIC },
{ smRoman, verIrishGaelicScript, 0, NULL, CS_MAC_GAELIC_OLD },
#endif
{ smRoman, verGreece, 0x922, NULL, CS_MAC_GREEK },
{ smRoman, verGreece, 0, NULL, CS_MAC_GREEK_OLD },
/* Catch-all for the Roman script: any region, any font. */
{ smRoman, -1, 0x850, NULL, CS_MAC_ROMAN },
{ smRoman, -1, 0, NULL, CS_MAC_ROMAN_OLD },
#if 0 /* Multi-byte encodings, not yet supported */
{ smJapanese, -1, 0, NULL, CS_MAC_JAPANESE },
{ smTradChinese, -1, 0, NULL, CS_MAC_CHINTRAD },
{ smKorean, -1, 0, NULL, CS_MAC_KOREAN },
#endif
#if 0 /* Bidirectional encodings, not yet supported */
{ smArabic, verIran, 0, NULL, CS_MAC_FARSI },
{ smArabic, -1, 0, NULL, CS_MAC_ARABIC },
{ smHebrew, -1, 0, NULL, CS_MAC_HEBREW },
#endif
{ smCyrillic, -1, 0x900, NULL, CS_MAC_CYRILLIC },
{ smCyrillic, verRussia, 0, NULL, CS_MAC_CYRILLIC_OLD },
{ smCyrillic, verBulgaria, 0, NULL, CS_MAC_CYRILLIC_OLD },
{ smCyrillic, -1, 0, NULL, CS_MAC_UKRAINE },
#if 0 /* Complex Indic scripts, not yet supported */
{ smDevanagari, -1, 0, NULL, CS_MAC_DEVENAGA },
{ smGurmukhi, -1, 0, NULL, CS_MAC_GURMUKHI },
{ smGujurati, -1, 0, NULL, CS_MAC_GUJURATI },
#endif
{ smThai, -1, 0, NULL, CS_MAC_THAI },
#if 0 /* Multi-byte encoding, not yet supported */
{ smSimpChinese, -1, 0, NULL, CS_MAC_CHINSIMP },
#endif
#if 0 /* No mapping table on ftp.unicode.org */
{ smTibetan, -1, 0, NULL, CS_MAC_TIBETAN },
{ smEthiopic, -1, 0, NULL, CS_MAC_ETHIOPIC },
{ smEthiopic, verNanavut, 0, NULL, CS_MAC_INUIT },
#endif
{ smCentralEuroRoman, -1, 0, NULL, CS_MAC_CENTEURO },
};
/*
 * Map a Mac OS script code, region code, system version and
 * (optional) font name to one of our charset identifiers.
 *
 * The macencs table is scanned in order and the first entry whose
 * every field accepts the arguments is used. `fontname' may be NULL,
 * in which case only entries that do not name a font can match.
 * Returns CS_NONE if no entry matches.
 */
int charset_from_macenc(int script, int region, int sysvers,
                        char const *fontname)
{
    int idx;

    for (idx = 0; idx < (int)lenof(macencs); idx++) {
        if (macencs[idx].script != script)
            continue;

        /* A negative table region is a wildcard. */
        if (macencs[idx].region >= 0 && macencs[idx].region != region)
            continue;

        if (macencs[idx].sysvermin > sysvers)
            continue;

        /* A NULL table font is a wildcard; otherwise it must match. */
        if (macencs[idx].fontname != NULL) {
            if (fontname == NULL)
                continue;
            if (strcmp(macencs[idx].fontname, fontname) != 0)
                continue;
        }

        return macencs[idx].charset;
    }

    return CS_NONE;
}

214
puttysrc/CHARSET/MIMEENC.C Normal file
View File

@@ -0,0 +1,214 @@
/*
* mimeenc.c - translate our internal character set codes to and
* from MIME standard character-set names.
*
*/
#include <ctype.h>
#include "charset.h"
#include "internal.h"
/*
 * Table mapping MIME charset names (and their aliases) to our
 * charset identifiers. Searched linearly by both lookup functions
 * below, so the order of entries for a given identifier matters.
 */
static const struct {
const char *name;
int charset;
} mimeencs[] = {
/*
* These names are taken from
*
* http://www.iana.org/assignments/character-sets
*
* Where multiple encoding names map to the same encoding id
* (such as the variety of aliases for ISO-8859-1), the first
* is considered canonical and will be returned when
* translating the id to a string.
*/
{ "ISO-8859-1", CS_ISO8859_1 },
{ "iso-ir-100", CS_ISO8859_1 },
{ "ISO_8859-1", CS_ISO8859_1 },
{ "ISO_8859-1:1987", CS_ISO8859_1 },
{ "latin1", CS_ISO8859_1 },
{ "l1", CS_ISO8859_1 },
{ "IBM819", CS_ISO8859_1 },
{ "CP819", CS_ISO8859_1 },
{ "csISOLatin1", CS_ISO8859_1 },
{ "ISO-8859-2", CS_ISO8859_2 },
{ "ISO_8859-2:1987", CS_ISO8859_2 },
{ "iso-ir-101", CS_ISO8859_2 },
{ "ISO_8859-2", CS_ISO8859_2 },
{ "latin2", CS_ISO8859_2 },
{ "l2", CS_ISO8859_2 },
{ "csISOLatin2", CS_ISO8859_2 },
{ "ISO-8859-3", CS_ISO8859_3 },
{ "ISO_8859-3:1988", CS_ISO8859_3 },
{ "iso-ir-109", CS_ISO8859_3 },
{ "ISO_8859-3", CS_ISO8859_3 },
{ "latin3", CS_ISO8859_3 },
{ "l3", CS_ISO8859_3 },
{ "csISOLatin3", CS_ISO8859_3 },
{ "ISO-8859-4", CS_ISO8859_4 },
{ "ISO_8859-4:1988", CS_ISO8859_4 },
{ "iso-ir-110", CS_ISO8859_4 },
{ "ISO_8859-4", CS_ISO8859_4 },
{ "latin4", CS_ISO8859_4 },
{ "l4", CS_ISO8859_4 },
{ "csISOLatin4", CS_ISO8859_4 },
{ "ISO-8859-5", CS_ISO8859_5 },
{ "ISO_8859-5:1988", CS_ISO8859_5 },
{ "iso-ir-144", CS_ISO8859_5 },
{ "ISO_8859-5", CS_ISO8859_5 },
{ "cyrillic", CS_ISO8859_5 },
{ "csISOLatinCyrillic", CS_ISO8859_5 },
{ "ISO-8859-6", CS_ISO8859_6 },
{ "ISO_8859-6:1987", CS_ISO8859_6 },
{ "iso-ir-127", CS_ISO8859_6 },
{ "ISO_8859-6", CS_ISO8859_6 },
{ "ECMA-114", CS_ISO8859_6 },
{ "ASMO-708", CS_ISO8859_6 },
{ "arabic", CS_ISO8859_6 },
{ "csISOLatinArabic", CS_ISO8859_6 },
{ "ISO-8859-7", CS_ISO8859_7 },
{ "ISO_8859-7:1987", CS_ISO8859_7 },
{ "iso-ir-126", CS_ISO8859_7 },
{ "ISO_8859-7", CS_ISO8859_7 },
{ "ELOT_928", CS_ISO8859_7 },
{ "ECMA-118", CS_ISO8859_7 },
{ "greek", CS_ISO8859_7 },
{ "greek8", CS_ISO8859_7 },
{ "csISOLatinGreek", CS_ISO8859_7 },
{ "ISO-8859-8", CS_ISO8859_8 },
{ "ISO_8859-8:1988", CS_ISO8859_8 },
{ "iso-ir-138", CS_ISO8859_8 },
{ "ISO_8859-8", CS_ISO8859_8 },
{ "hebrew", CS_ISO8859_8 },
{ "csISOLatinHebrew", CS_ISO8859_8 },
{ "ISO-8859-9", CS_ISO8859_9 },
{ "ISO_8859-9:1989", CS_ISO8859_9 },
{ "iso-ir-148", CS_ISO8859_9 },
{ "ISO_8859-9", CS_ISO8859_9 },
{ "latin5", CS_ISO8859_9 },
{ "l5", CS_ISO8859_9 },
{ "csISOLatin5", CS_ISO8859_9 },
{ "ISO-8859-10", CS_ISO8859_10 },
{ "iso-ir-157", CS_ISO8859_10 },
{ "l6", CS_ISO8859_10 },
{ "ISO_8859-10:1992", CS_ISO8859_10 },
{ "csISOLatin6", CS_ISO8859_10 },
{ "latin6", CS_ISO8859_10 },
{ "ISO-8859-13", CS_ISO8859_13 },
{ "ISO-8859-14", CS_ISO8859_14 },
{ "iso-ir-199", CS_ISO8859_14 },
{ "ISO_8859-14:1998", CS_ISO8859_14 },
{ "ISO_8859-14", CS_ISO8859_14 },
{ "latin8", CS_ISO8859_14 },
{ "iso-celtic", CS_ISO8859_14 },
{ "l8", CS_ISO8859_14 },
{ "ISO-8859-15", CS_ISO8859_15 },
{ "ISO_8859-15", CS_ISO8859_15 },
{ "Latin-9", CS_ISO8859_15 },
{ "ISO-8859-16", CS_ISO8859_16 },
{ "iso-ir-226", CS_ISO8859_16 },
{ "ISO_8859-16", CS_ISO8859_16 },
{ "ISO_8859-16:2001", CS_ISO8859_16 },
{ "latin10", CS_ISO8859_16 },
{ "l10", CS_ISO8859_16 },
{ "IBM437", CS_CP437 },
{ "cp437", CS_CP437 },
{ "437", CS_CP437 },
{ "csPC8CodePage437", CS_CP437 },
{ "IBM850", CS_CP850 },
{ "cp850", CS_CP850 },
{ "850", CS_CP850 },
{ "csPC850Multilingual", CS_CP850 },
{ "IBM866", CS_CP866 },
{ "cp866", CS_CP866 },
{ "866", CS_CP866 },
{ "csIBM866", CS_CP866 },
{ "windows-1250", CS_CP1250 },
{ "windows-1251", CS_CP1251 },
{ "windows-1252", CS_CP1252 },
{ "windows-1253", CS_CP1253 },
{ "windows-1254", CS_CP1254 },
{ "windows-1255", CS_CP1255 },
{ "windows-1256", CS_CP1256 },
{ "windows-1257", CS_CP1257 },
{ "windows-1258", CS_CP1258 },
{ "KOI8-R", CS_KOI8_R },
{ "csKOI8R", CS_KOI8_R },
{ "KOI8-U", CS_KOI8_U },
/* Note: the MIME "macintosh" names map to the _OLD Mac Roman id. */
{ "macintosh", CS_MAC_ROMAN_OLD },
{ "mac", CS_MAC_ROMAN_OLD },
{ "csMacintosh", CS_MAC_ROMAN_OLD },
{ "VISCII", CS_VISCII },
{ "csVISCII", CS_VISCII },
{ "hp-roman8", CS_HP_ROMAN8 },
{ "roman8", CS_HP_ROMAN8 },
{ "r8", CS_HP_ROMAN8 },
{ "csHPRoman8", CS_HP_ROMAN8 },
{ "DEC-MCS", CS_DEC_MCS },
{ "dec", CS_DEC_MCS },
{ "csDECMCS", CS_DEC_MCS },
{ "UTF-8", CS_UTF8 },
};
/*
 * Return the canonical MIME name for `charset': the first name in
 * the mimeencs table that maps to it. Returns NULL if the charset
 * has no entry in the table.
 */
const char *charset_to_mimeenc(int charset)
{
    const int count = (int)lenof(mimeencs);
    int idx = 0;

    while (idx < count) {
        if (mimeencs[idx].charset == charset)
            return mimeencs[idx].name;
        idx++;
    }

    return NULL;                       /* not found */
}
/*
 * Translate a MIME charset name (or any of the aliases listed in
 * the mimeencs table) into a charset identifier. The comparison is
 * case-insensitive. Returns CS_NONE if the name is not in the table.
 */
int charset_from_mimeenc(const char *name)
{
    int i;

    for (i = 0; i < (int)lenof(mimeencs); i++) {
        const char *p, *q;

        p = name;
        q = mimeencs[i].name;
        while (*p || *q) {
            /*
             * The <ctype.h> functions are only defined for values
             * representable as unsigned char (or EOF), so a plain
             * `char' holding a byte >= 0x80 must be converted first.
             */
            if (tolower((unsigned char)*p) != tolower((unsigned char)*q))
                break;
            p++; q++;
        }
        /* Both strings ended together: every character matched. */
        if (!*p && !*q)
            return mimeencs[i].charset;
    }

    return CS_NONE;                    /* not found */
}

15
puttysrc/CHARSET/README Normal file
View File

@@ -0,0 +1,15 @@
This subdirectory contains a general character-set conversion
library, used in the Unix port of PuTTY, and available for use in
other ports if it should happen to be useful.
This is a variant of a library that's currently used in some other
programs such as Timber and Halibut. At some future date, we would
like to merge the two libraries, so that all programs use the same
libcharset.
It is therefore a _strong_ design goal that this library should remain
perfectly general, and not tied to particulars of PuTTY. It must not
reference any code outside its own subdirectory; it should not have
PuTTY-specific helper routines added to it unless they can be
documented in a general manner which might make them useful in other
circumstances as well.

53
puttysrc/CHARSET/SBCS.C Normal file
View File

@@ -0,0 +1,53 @@
/*
* sbcs.c - routines to handle single-byte character sets.
*/
#include "charset.h"
#include "internal.h"
/*
* The charset_spec for any single-byte character set should
* provide read_sbcs() and write_sbcs() as its read and write
* functions, and its `data' field should point to a struct
* sbcs_data holding the translation tables.
*/
/*
 * Read function for single-byte character sets.
 *
 * Looks `input_chr' up in the charset's sbcs2ucs table and emits the
 * resulting Unicode value (which is ERROR for byte values the
 * charset does not define). A value outside the range of the table
 * cannot be a valid byte, so it is reported as ERROR rather than
 * being used as an index. `state' is not used.
 */
void read_sbcs(charset_spec const *charset, long int input_chr,
               charset_state *state,
               void (*emit)(void *ctx, long int output), void *emitctx)
{
    const struct sbcs_data *sd = charset->data;

    UNUSEDARG(state);

    if (input_chr < 0 || input_chr >= (long int)lenof(sd->sbcs2ucs)) {
        emit(emitctx, ERROR);
        return;
    }

    emit(emitctx, sd->sbcs2ucs[input_chr]);
}
/*
 * Write function for single-byte character sets.
 *
 * Finds the byte whose Unicode translation equals `input_chr' and
 * emits it; emits ERROR if the charset has no such byte. `state' is
 * not used.
 */
void write_sbcs(charset_spec const *charset, long int input_chr,
                charset_state *state,
                void (*emit)(void *ctx, long int output), void *emitctx)
{
    const struct sbcs_data *sd = charset->data;
    /* The table holds unsigned long, so compare in that type. */
    const unsigned long wanted = (unsigned long)input_chr;
    int lo = 0;                        /* first index still in play */
    int hi = sd->nvalid;               /* one past the last such index */

    UNUSEDARG(state);

    /*
     * Binary search over ucs2sbcs, which lists the valid bytes in
     * order of their Unicode value; sbcs2ucs supplies the sort key.
     */
    while (lo < hi) {
        int mid = (lo + hi - 1) / 2;
        int byte = sd->ucs2sbcs[mid];
        unsigned long ucs = sd->sbcs2ucs[byte];

        if (wanted == ucs) {
            emit(emitctx, byte);
            return;
        }

        if (wanted < ucs)
            hi = mid;
        else
            lo = mid + 1;
    }

    emit(emitctx, ERROR);
}

1117
puttysrc/CHARSET/SBCS.DAT Normal file

File diff suppressed because it is too large Load Diff

4018
puttysrc/CHARSET/SBCSDAT.C Normal file

File diff suppressed because it is too large Load Diff

110
puttysrc/CHARSET/SBCSGEN.PL Normal file
View File

@@ -0,0 +1,110 @@
#!/usr/bin/env perl
# This script generates sbcsdat.c (the data for all the SBCSes) from its
# source form sbcs.dat.
#
# Warnings are enabled with the pragma below rather than with `-w' on
# the #! line: many kernels hand everything after the interpreter path
# to it as one single argument, so env would be asked to run a program
# literally called "perl -w".
use warnings;
$infile = "sbcs.dat";
$outfile = "sbcsdat.c";
# Fail loudly instead of silently producing an empty or partial file.
open(FOO, "<", $infile) or die "$infile: open: $!\n";
open(BAR, ">", $outfile) or die "$outfile: open: $!\n";
# Everything printed from here on goes to the output file.
select BAR;
print "/*\n";
print " * sbcsdat.c - data definitions for single-byte character sets.\n";
print " *\n";
print " * Generated by sbcsgen.pl from sbcs.dat.\n";
print " * You should edit those files rather than editing this one.\n";
print " */\n";
print "\n";
# The data definitions are only compiled when ENUM_CHARSETS is not
# defined; the ENUM_CHARSET() list is emitted in the #else branch below.
print "#ifndef ENUM_CHARSETS\n";
print "\n";
print "#include \"charset.h\"\n";
print "#include \"internal.h\"\n";
print "\n";
# Parser state for the charset currently being read.
my $charsetname = undef;
my @vals = ();
my @charsetnames = ();
my @sortpriority = ();
while (<FOO>) {
chomp;
if (/^charset (.*)$/) {
# "charset NAME" starts a new charset definition.
$charsetname = $1;
@vals = ();
@sortpriority = map { 0 } 0..255;
} elsif (/^sortpriority ([^-]*)-([^-]*) (.*)$/) {
# "sortpriority LO-HI N" adds N to the priority of every byte
# position in the (hex) range LO..HI.
for ($i = hex $1; $i <= hex $2; $i++) {
$sortpriority[$i] += $3;
}
} elsif (/^[0-9a-fA-FX]/) {
# A data line: space-separated hex code points, with XXXX marking
# an undefined position (stored as -1).
push @vals, map { $_ eq "XXXX" ? -1 : hex $_ } split / +/, $_;
if (scalar @vals > 256) {
die "$infile:$.: charset $charsetname has more than 256 values\n";
} elsif (scalar @vals == 256) {
# All 256 positions collected: emit the charset and reset.
&outcharset($charsetname, \@vals, \@sortpriority);
push @charsetnames, $charsetname;
$charsetname = undef;
@vals = ();
@sortpriority = map { 0 } 0..255;
}
}
}
print "#else /* ENUM_CHARSETS */\n";
print "\n";
# One ENUM_CHARSET() line per charset, in the order they were read.
foreach $i (@charsetnames) {
print "ENUM_CHARSET($i)\n";
}
print "\n";
print "#endif /* ENUM_CHARSETS */\n";
# outcharset(NAME, \@vals, \@sortpriority)
#
# Print the C definitions for one charset: a `data_NAME' sbcs_data
# initialiser (forward table, reverse table, count of reverse-table
# entries) followed by the `charset_NAME' charset_spec that refers to
# it. @vals holds the 256 code points (-1 for undefined positions);
# @sortpriority holds the per-position priority used to choose between
# byte values that share a code point.
sub outcharset($$$) {
my ($name, $vals, $sortpriority) = @_;
my ($prefix, $i, @sorted);
print "static const sbcs_data data_$name = {\n";
print " {\n";
$prefix = " ";
@sorted = ();
# Forward table: one entry per byte value, eight per output line.
# Defined positions are also collected as [byte, codepoint, priority]
# for the reverse table.
for ($i = 0; $i < 256; $i++) {
if ($vals->[$i] < 0) {
printf "%sERROR ", $prefix;
} else {
printf "%s0x%04x", $prefix, $vals->[$i];
die "ooh? $i\n" unless defined $sortpriority->[$i];
push @sorted, [$i, $vals->[$i], 0+$sortpriority->[$i]];
}
if ($i % 8 == 7) {
$prefix = ",\n ";
} else {
$prefix = ", ";
}
}
print "\n },\n {\n";
# Order by code point ascending; among bytes sharing a code point,
# highest priority first, then lowest byte value first.
@sorted = sort { ($a->[1] == $b->[1] ?
$b->[2] <=> $a->[2] :
$a->[1] <=> $b->[1]) ||
$a->[0] <=> $b->[0] } @sorted;
$prefix = " ";
$uval = -1;
# Reverse table: keep only the first (preferred) byte for each code
# point; $j counts the entries actually written.
for ($i = $j = 0; $i < scalar @sorted; $i++) {
next if ($uval == $sorted[$i]->[1]); # low-priority alternative
$uval = $sorted[$i]->[1];
printf "%s0x%02x", $prefix, $sorted[$i]->[0];
if ($j % 8 == 7) {
$prefix = ",\n ";
} else {
$prefix = ", ";
}
$j++;
}
printf "\n },\n %d\n", $j;
print "};\n";
print "const charset_spec charset_$name = {\n" .
" $name, read_sbcs, write_sbcs, &data_$name\n};\n\n";
}

View File

@@ -0,0 +1,29 @@
/*
* slookup.c - static lookup of character sets.
*/
#include "charset.h"
#include "internal.h"
/*
 * First pass over enum.c: declare the charset_spec object named by
 * each ENUM_CHARSET() line.
 */
#define ENUM_CHARSET(x) extern charset_spec const charset_##x;
#include "enum.c"
#undef ENUM_CHARSET
/*
 * Second pass over enum.c: build the table of pointers to those
 * objects, which charset_find_spec() searches.
 */
static charset_spec const *const cs_table[] = {
#define ENUM_CHARSET(x) &charset_##x,
#include "enum.c"
#undef ENUM_CHARSET
};
charset_spec const *charset_find_spec(int charset)
{
int i;
for (i = 0; i < (int)lenof(cs_table); i++)
if (cs_table[i]->charset == charset)
return cs_table[i];
return NULL;
}

89
puttysrc/CHARSET/TOUCS.C Normal file
View File

@@ -0,0 +1,89 @@
/*
* toucs.c - convert charsets to Unicode.
*/
#include "charset.h"
#include "internal.h"
/*
 * Context handed to unicode_emit() through its `ctx' pointer while
 * charset_to_unicode() is running.
 */
struct unicode_emit_param {
wchar_t *output; /* next free position in the caller's output buffer */
int outlen; /* wide characters of space remaining at `output' */
const wchar_t *errstr; /* written when ERROR is emitted; may be NULL */
int errlen; /* number of wide characters at `errstr' */
int stopped; /* set to 1 when an emit did not fit in the buffer */
};
static void unicode_emit(void *ctx, long int output)
{
struct unicode_emit_param *param = (struct unicode_emit_param *)ctx;
wchar_t outval;
wchar_t const *p;
int outlen;
if (output == ERROR) {
if (param->errstr) {
p = param->errstr;
outlen = param->errlen;
} else {
outval = 0xFFFD; /* U+FFFD REPLACEMENT CHARACTER */
p = &outval;
outlen = 1;
}
} else {
outval = output;
p = &outval;
outlen = 1;
}
if (param->outlen >= outlen) {
while (outlen > 0) {
*param->output++ = *p++;
param->outlen--;
outlen--;
}
} else {
param->stopped = 1;
}
}
/*
 * Convert text in the given MB/SB character set to Unicode.
 *
 * input, inlen   - pointer to the input cursor and to the count of
 *                  bytes remaining; both are advanced past every
 *                  byte that was fully processed.
 * output, outlen - destination buffer and its size in wide
 *                  characters.
 * charset        - one of the charset_t identifiers.
 * state          - optional conversion state. If non-NULL it is read
 *                  on entry and updated after each byte that was
 *                  fully processed; if NULL a fresh zero state is
 *                  used.
 * errstr, errlen - wide characters to emit for an encoding error.
 *                  If errstr is NULL, U+FFFD is emitted instead.
 *
 * Returns the number of wide characters written to `output'. Returns
 * 0 without consuming any input if `charset' is not known to the
 * library.
 */
int charset_to_unicode(char **input, int *inlen, wchar_t *output, int outlen,
                       int charset, charset_state *state,
                       const wchar_t *errstr, int errlen)
{
    charset_spec const *spec = charset_find_spec(charset);
    charset_state localstate;
    struct unicode_emit_param param;

    /* Unknown charset: nothing we can convert with. */
    if (!spec)
        return 0;

    param.output = output;
    param.outlen = outlen;
    param.errstr = errstr;
    param.errlen = errlen;
    param.stopped = 0;

    /*
     * Work on a private copy of the state so that a byte whose
     * output turns out not to fit leaves the caller's state
     * untouched.
     */
    if (!state) {
        localstate.s0 = 0;
    } else {
        localstate = *state;           /* structure copy */
    }

    while (*inlen > 0) {
        int lenbefore = param.output - output;

        /* Go via unsigned char so bytes >= 0x80 are not negative. */
        spec->read(spec, (unsigned char)**input, &localstate,
                   unicode_emit, &param);

        if (param.stopped) {
            /*
             * The emit function has _tried_ to output some
             * characters, but ran up against the end of the
             * buffer. Leave immediately, and return what happened
             * _before_ attempting to process this character.
             */
            return lenbefore;
        }

        /* Byte fully processed: commit state and advance. */
        if (state)
            *state = localstate;       /* structure copy */
        (*input)++;
        (*inlen)--;
    }

    return param.output - output;
}

882
puttysrc/CHARSET/UTF8.C Normal file
View File

@@ -0,0 +1,882 @@
/*
* utf8.c - routines to handle UTF-8.
*/
#ifndef ENUM_CHARSETS
#include "charset.h"
#include "internal.h"
void read_utf8(charset_spec const *, long int, charset_state *,
void (*)(void *, long int), void *);
void write_utf8(charset_spec const *, long int,
charset_state *, void (*)(void *, long int), void *);
/*
 * Decode one byte of UTF-8.
 *
 * `input_chr' is the next input byte and `state' carries the
 * partially assembled character between calls. Each completed
 * code point, and ERROR for each malformed sequence, is handed to
 * `emit' together with `emitctx'. UTF-8 has no associated data,
 * so `charset' is ignored.
 *
 * Layout of state->s0:
 *
 *  - bits 29-31: the number of bytes expected in the current
 *    multibyte character (known as soon as the lead byte is seen).
 *
 *  - bits 26-28: the number of bytes seen so far in the current
 *    multibyte character.
 *
 *  - the remaining low bits: the character value accumulated so
 *    far, shifted up by 6 bits for each continuation byte.
 *
 * The state is zero whenever we are not in the middle of a
 * multibyte character.
 *
 * Example, reading E9 8D 8B from state 0:
 *
 *  - after E9 the state is 0x64000009
 *  - after 8D the state is 0x6800024d
 *  - after 8B the state would conceptually be 0x6c00934b; at that
 *    point both counts agree, so U+934B is output and the state
 *    goes back to zero.
 *
 * The value field never needs more than 25 bits: U+7FFFFFFF has
 * 31 bits, but the final 6 arrive with the last continuation
 * byte, at which point the character is output rather than
 * stored. So the value never collides with the byte counts.
 */
void read_utf8(charset_spec const *charset, long int input_chr,
               charset_state *state,
               void (*emit)(void *ctx, long int output), void *emitctx)
{
    unsigned long value;       /* code point assembled so far */
    unsigned long counts;      /* the two byte-count fields of s0 */
    int nbytes;                /* encoded length of the finished char */
    int invalid;

    UNUSEDARG(charset);

    /*
     * Bytes that can never be part of a multibyte sequence: plain
     * single-byte characters, and FE/FF which are never legal in
     * UTF-8. Either kind cuts short any sequence in progress,
     * which costs one error of its own.
     */
    if (input_chr < 0x80 || input_chr == 0xFE || input_chr == 0xFF) {
        if (state->s0 != 0) {
            emit(emitctx, ERROR);
            state->s0 = 0;
        }
        if (input_chr < 0x80)
            emit(emitctx, input_chr);
        else
            emit(emitctx, ERROR);
        return;
    }

    /*
     * Lead byte. A sequence still in progress is incomplete, so
     * report it; then record the expected length, a seen-count
     * of one, and the payload bits of the lead byte.
     */
    if (input_chr >= 0xC0) {
        if (state->s0 != 0)
            emit(emitctx, ERROR);

        if (input_chr < 0xE0)
            state->s0 = 0x44000000L | (input_chr & 0x1F);
        else if (input_chr < 0xF0)
            state->s0 = 0x64000000L | (input_chr & 0x0F);
        else if (input_chr < 0xF8)
            state->s0 = 0x84000000L | (input_chr & 0x07);
        else if (input_chr < 0xFC)
            state->s0 = 0xa4000000L | (input_chr & 0x03);
        else if (input_chr < 0xFE)
            state->s0 = 0xc4000000L | (input_chr & 0x01);
        return;
    }

    /*
     * Everything else is a continuation byte (80-BF). With no
     * sequence in progress it is simply unexpected.
     */
    if (state->s0 == 0) {
        emit(emitctx, ERROR);
        return;
    }

    /* Fold the six payload bits into the accumulated value. */
    value = ((state->s0 & 0x03ffffffL) << 6) | (input_chr & 0x3F);

    /* Bump the seen-count and compare it with the expected count. */
    counts = (state->s0 & 0xfc000000L) + 0x04000000L;
    if (((counts << 3) ^ counts) & 0xe0000000L) {
        /* Counts differ: more bytes to come. Save and wait. */
        state->s0 = counts | value;
        return;
    }

    /*
     * The character is complete. Whatever we decide about its
     * validity, the state goes back to zero.
     */
    nbytes = counts >> 29;
    state->s0 = 0;

    invalid =
        /*
         * Surrogates (D800-DFFF) may never be encoded in UTF-8; a
         * surrogate pair should have been a single character of
         * more than three bytes.
         */
        (value >= 0xD800 && value < 0xE000) ||
        /* U+FFFE and U+FFFF are not valid Unicode characters. */
        value == 0xFFFE || value == 0xFFFF ||
        /*
         * Overlong forms: the value would have fitted in a
         * shorter encoding. Anything up to 7F arriving by this
         * route is overlong by definition.
         */
        value <= 0x7FL ||
        (value <= 0x7FFL && nbytes > 2) ||
        (value <= 0xFFFFL && nbytes > 3) ||
        (value <= 0x1FFFFFL && nbytes > 4) ||
        (value <= 0x3FFFFFFL && nbytes > 5);

    if (invalid)
        emit(emitctx, ERROR);
    else
        emit(emitctx, value);
}
/*
 * Encode one code point as UTF-8, passing each output byte to
 * `emit' with `emitctx'.
 *
 * UTF-8 is stateless in the sense that the state is always the
 * same just after a character has been completed, so `state' is
 * not needed when writing; `charset' is unused as well.
 *
 * Surrogates (D800-DFFF), U+FFFE and U+FFFF are refused: a single
 * ERROR is emitted instead of any bytes.
 */
void write_utf8(charset_spec const *charset, long int input_chr,
                charset_state *state,
                void (*emit)(void *ctx, long int output), void *emitctx)
{
    long int lead;     /* first byte of a multibyte sequence */
    int trail;         /* number of continuation bytes still to send */

    UNUSEDARG(charset);
    UNUSEDARG(state);

    /* Refuse to output any illegal code points. */
    if ((input_chr >= 0xD800 && input_chr < 0xE000) ||
        input_chr == 0xFFFE || input_chr == 0xFFFF) {
        emit(emitctx, ERROR);
        return;
    }

    /* One-byte characters go out unchanged. */
    if (input_chr < 0x80) {
        emit(emitctx, input_chr);
        return;
    }

    /*
     * Choose the sequence length from the size of the value, and
     * build the lead byte from the length marker plus the top
     * payload bits.
     */
    if (input_chr < 0x800) {                   /* two bytes */
        trail = 1;
        lead = 0xC0 | (0x1F & (input_chr >> 6));
    } else if (input_chr < 0x10000) {          /* three bytes */
        trail = 2;
        lead = 0xE0 | (0x0F & (input_chr >> 12));
    } else if (input_chr < 0x200000) {         /* four bytes */
        trail = 3;
        lead = 0xF0 | (0x07 & (input_chr >> 18));
    } else if (input_chr < 0x4000000) {        /* five bytes */
        trail = 4;
        lead = 0xF8 | (0x03 & (input_chr >> 24));
    } else {                                   /* six bytes */
        trail = 5;
        lead = 0xFC | (0x01 & (input_chr >> 30));
    }
    emit(emitctx, lead);

    /* Continuation bytes, six payload bits each, high bits first. */
    while (trail-- > 0)
        emit(emitctx, 0x80 | (0x3F & (input_chr >> (6 * trail))));
}
#ifdef TESTMODE
#include <stdio.h>
#include <stdarg.h>
int total_errs = 0;
/*
 * Emit callback for the self-tests. `ctx' points at a wchar_t
 * write cursor; the value is stored there and the cursor is moved
 * on by one. No bounds checking is done.
 */
void utf8_emit(void *ctx, long output)
{
    wchar_t **cursor = ctx;
    wchar_t *slot = *cursor;

    *slot = output;
    *cursor = slot + 1;
}
/*
 * Run one decoder test: feed `inlen' bytes of `input' through
 * read_utf8() from a zero state and compare everything it emits
 * with the expected values given as variadic arguments.
 *
 * The expected values are fetched as `long int', so that is the
 * type they must be passed as, and the list is terminated by -1.
 * `line' is only used to label diagnostics. Each mismatch is
 * printed and counted in total_errs.
 */
void utf8_read_test(int line, char *input, int inlen, ...)
{
    va_list ap;
    wchar_t *p, str[512];
    int i;
    charset_state state;
    unsigned long l;

    /* Decode the whole input into str[]. */
    state.s0 = 0;
    p = str;
    for (i = 0; i < inlen; i++)
        read_utf8(NULL, input[i] & 0xFF, &state, utf8_emit, &p);

    /* Walk the output and the expected list side by side. */
    va_start(ap, inlen);
    l = 0;
    for (i = 0; i < p - str; i++) {
        l = va_arg(ap, long int);
        if (l == (unsigned long)-1) {
            printf("%d: correct string shorter than output\n", line);
            total_errs++;
            break;
        }
        if (l != (unsigned long)str[i]) {
            /*
             * Both values are printed as unsigned long; the
             * conversion and the argument type must agree.
             */
            printf("%d: char %d came out as %08lx, should be %08lx\n",
                   line, i, (unsigned long)str[i], l);
            total_errs++;
        }
    }

    /*
     * If the terminator was not reached above, the next expected
     * value must be the terminator; otherwise the expected list
     * has more entries than the decoder produced.
     */
    if (l != (unsigned long)-1) {
        l = va_arg(ap, long int);
        if (l != (unsigned long)-1) {
            printf("%d: correct string longer than output\n", line);
            total_errs++;
        }
    }
    va_end(ap);
}
/*
 * Run one encoder test: feed `inlen' code points from `input'
 * through write_utf8() and compare every emitted value with the
 * expected values given as variadic arguments.
 *
 * The expected values are fetched as `long int', so that is the
 * type they must be passed as, and the list is terminated by -1.
 * `line' is only used to label diagnostics. Each mismatch is
 * printed and counted in total_errs.
 */
void utf8_write_test(int line, const long *input, int inlen, ...)
{
    va_list ap;
    wchar_t *p, str[512];
    int i;
    charset_state state;
    unsigned long l;

    /* Encode the whole input into str[], one emitted value per slot. */
    state.s0 = 0;
    p = str;
    for (i = 0; i < inlen; i++)
        write_utf8(NULL, input[i], &state, utf8_emit, &p);

    /* Walk the output and the expected list side by side. */
    va_start(ap, inlen);
    l = 0;
    for (i = 0; i < p - str; i++) {
        l = va_arg(ap, long int);
        if (l == (unsigned long)-1) {
            printf("%d: correct string shorter than output\n", line);
            total_errs++;
            break;
        }
        if (l != (unsigned long)str[i]) {
            /*
             * Both values are printed as unsigned long; the
             * conversion and the argument type must agree.
             */
            printf("%d: char %d came out as %08lx, should be %08lx\n",
                   line, i, (unsigned long)str[i], l);
            total_errs++;
        }
    }

    /*
     * If the terminator was not reached above, the next expected
     * value must be the terminator; otherwise the expected list
     * has more entries than the encoder produced.
     */
    if (l != (unsigned long)-1) {
        l = va_arg(ap, long int);
        if (l != (unsigned long)-1) {
            printf("%d: correct string longer than output\n", line);
            total_errs++;
        }
    }
    va_end(ap);
}
/*
 * Macro to concoct the first three parameters of utf8_read_test
 * or utf8_write_test from a single array expression: the source
 * line number (used to label failures), the array itself, and its
 * length as given by lenof().
 *
 * NOTE(review): lenof() is defined elsewhere; presumably it is the
 * element count of the array, which for a string literal includes
 * the terminating NUL. That would match the extra 0 that precedes
 * the -1 terminator in every expected-value list - confirm.
 */
#define TESTSTR(x) __LINE__, x, lenof(x)
int main(void)
{
printf("read tests beginning\n");
utf8_read_test(TESTSTR("\xCE\xBA\xE1\xBD\xB9\xCF\x83\xCE\xBC\xCE\xB5"),
0x000003BA, /* GREEK SMALL LETTER KAPPA */
0x00001F79, /* GREEK SMALL LETTER OMICRON WITH OXIA */
0x000003C3, /* GREEK SMALL LETTER SIGMA */
0x000003BC, /* GREEK SMALL LETTER MU */
0x000003B5, /* GREEK SMALL LETTER EPSILON */
0, -1);
utf8_read_test(TESTSTR("\x00"),
0x00000000, /* <control> */
0, -1);
utf8_read_test(TESTSTR("\xC2\x80"),
0x00000080, /* <control> */
0, -1);
utf8_read_test(TESTSTR("\xE0\xA0\x80"),
0x00000800, /* <no name available> */
0, -1);
utf8_read_test(TESTSTR("\xF0\x90\x80\x80"),
0x00010000, /* <no name available> */
0, -1);
utf8_read_test(TESTSTR("\xF8\x88\x80\x80\x80"),
0x00200000, /* <no name available> */
0, -1);
utf8_read_test(TESTSTR("\xFC\x84\x80\x80\x80\x80"),
0x04000000, /* <no name available> */
0, -1);
utf8_read_test(TESTSTR("\x7F"),
0x0000007F, /* <control> */
0, -1);
utf8_read_test(TESTSTR("\xDF\xBF"),
0x000007FF, /* <no name available> */
0, -1);
utf8_read_test(TESTSTR("\xEF\xBF\xBD"),
0x0000FFFD, /* REPLACEMENT CHARACTER */
0, -1);
utf8_read_test(TESTSTR("\xEF\xBF\xBF"),
ERROR, /* <no name available> (invalid char) */
0, -1);
utf8_read_test(TESTSTR("\xF7\xBF\xBF\xBF"),
0x001FFFFF, /* <no name available> */
0, -1);
utf8_read_test(TESTSTR("\xFB\xBF\xBF\xBF\xBF"),
0x03FFFFFF, /* <no name available> */
0, -1);
utf8_read_test(TESTSTR("\xFD\xBF\xBF\xBF\xBF\xBF"),
0x7FFFFFFF, /* <no name available> */
0, -1);
utf8_read_test(TESTSTR("\xED\x9F\xBF"),
0x0000D7FF, /* <no name available> */
0, -1);
utf8_read_test(TESTSTR("\xEE\x80\x80"),
0x0000E000, /* <Private Use, First> */
0, -1);
utf8_read_test(TESTSTR("\xEF\xBF\xBD"),
0x0000FFFD, /* REPLACEMENT CHARACTER */
0, -1);
utf8_read_test(TESTSTR("\xF4\x8F\xBF\xBF"),
0x0010FFFF, /* <no name available> */
0, -1);
utf8_read_test(TESTSTR("\xF4\x90\x80\x80"),
0x00110000, /* <no name available> */
0, -1);
utf8_read_test(TESTSTR("\x80"),
ERROR, /* (unexpected continuation byte) */
0, -1);
utf8_read_test(TESTSTR("\xBF"),
ERROR, /* (unexpected continuation byte) */
0, -1);
utf8_read_test(TESTSTR("\x80\xBF"),
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
0, -1);
utf8_read_test(TESTSTR("\x80\xBF\x80"),
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
0, -1);
utf8_read_test(TESTSTR("\x80\xBF\x80\xBF"),
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
0, -1);
utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80"),
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
0, -1);
utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80\xBF"),
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
0, -1);
utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80\xBF\x80"),
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
0, -1);
utf8_read_test(TESTSTR("\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF"),
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
ERROR, /* (unexpected continuation byte) */
0, -1);
utf8_read_test(TESTSTR("\xC0\x20\xC1\x20\xC2\x20\xC3\x20\xC4\x20\xC5\x20\xC6\x20\xC7\x20"),
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
0, -1);
utf8_read_test(TESTSTR("\xE0\x20\xE1\x20\xE2\x20\xE3\x20\xE4\x20\xE5\x20\xE6\x20\xE7\x20\xE8\x20\xE9\x20\xEA\x20\xEB\x20\xEC\x20\xED\x20\xEE\x20\xEF\x20"),
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
0, -1);
utf8_read_test(TESTSTR("\xF0\x20\xF1\x20\xF2\x20\xF3\x20\xF4\x20\xF5\x20\xF6\x20\xF7\x20"),
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
0, -1);
utf8_read_test(TESTSTR("\xF8\x20\xF9\x20\xFA\x20\xFB\x20"),
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
0, -1);
utf8_read_test(TESTSTR("\xFC\x20\xFD\x20"),
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
ERROR, /* (incomplete sequence) */
0x00000020, /* SPACE */
0, -1);
utf8_read_test(TESTSTR("\xC0"),
ERROR, /* (incomplete sequence) */
0, -1);
utf8_read_test(TESTSTR("\xE0\x80"),
ERROR, /* (incomplete sequence) */
0, -1);
utf8_read_test(TESTSTR("\xF0\x80\x80"),
ERROR, /* (incomplete sequence) */
0, -1);
utf8_read_test(TESTSTR("\xF8\x80\x80\x80"),
ERROR, /* (incomplete sequence) */
0, -1);
utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80"),
ERROR, /* (incomplete sequence) */
0, -1);
utf8_read_test(TESTSTR("\xDF"),
ERROR, /* (incomplete sequence) */
0, -1);
utf8_read_test(TESTSTR("\xEF\xBF"),
ERROR, /* (incomplete sequence) */
0, -1);
utf8_read_test(TESTSTR("\xF7\xBF\xBF"),
ERROR, /* (incomplete sequence) */
0, -1);
utf8_read_test(TESTSTR("\xFB\xBF\xBF\xBF"),
ERROR, /* (incomplete sequence) */
0, -1);
utf8_read_test(TESTSTR("\xFD\xBF\xBF\xBF\xBF"),
ERROR, /* (incomplete sequence) */
0, -1);
utf8_read_test(TESTSTR("\xC0\xE0\x80\xF0\x80\x80\xF8\x80\x80\x80\xFC\x80\x80\x80\x80\xDF\xEF\xBF\xF7\xBF\xBF\xFB\xBF\xBF\xBF\xFD\xBF\xBF\xBF\xBF"),
ERROR, /* (incomplete sequence) */
ERROR, /* (incomplete sequence) */
ERROR, /* (incomplete sequence) */
ERROR, /* (incomplete sequence) */
ERROR, /* (incomplete sequence) */
ERROR, /* (incomplete sequence) */
ERROR, /* (incomplete sequence) */
ERROR, /* (incomplete sequence) */
ERROR, /* (incomplete sequence) */
ERROR, /* (incomplete sequence) */
0, -1);
utf8_read_test(TESTSTR("\xFE"),
ERROR, /* (invalid UTF-8 byte) */
0, -1);
utf8_read_test(TESTSTR("\xFF"),
ERROR, /* (invalid UTF-8 byte) */
0, -1);
utf8_read_test(TESTSTR("\xFE\xFE\xFF\xFF"),
ERROR, /* (invalid UTF-8 byte) */
ERROR, /* (invalid UTF-8 byte) */
ERROR, /* (invalid UTF-8 byte) */
ERROR, /* (invalid UTF-8 byte) */
0, -1);
utf8_read_test(TESTSTR("\xC0\xAF"),
ERROR, /* SOLIDUS (overlong form of 2F) */
0, -1);
utf8_read_test(TESTSTR("\xE0\x80\xAF"),
ERROR, /* SOLIDUS (overlong form of 2F) */
0, -1);
utf8_read_test(TESTSTR("\xF0\x80\x80\xAF"),
ERROR, /* SOLIDUS (overlong form of 2F) */
0, -1);
utf8_read_test(TESTSTR("\xF8\x80\x80\x80\xAF"),
ERROR, /* SOLIDUS (overlong form of 2F) */
0, -1);
utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80\xAF"),
ERROR, /* SOLIDUS (overlong form of 2F) */
0, -1);
utf8_read_test(TESTSTR("\xC1\xBF"),
ERROR, /* <control> (overlong form of 7F) */
0, -1);
utf8_read_test(TESTSTR("\xE0\x9F\xBF"),
ERROR, /* <no name available> (overlong form of DF BF) */
0, -1);
utf8_read_test(TESTSTR("\xF0\x8F\xBF\xBF"),
ERROR, /* <no name available> (overlong form of EF BF BF) (invalid char) */
0, -1);
utf8_read_test(TESTSTR("\xF8\x87\xBF\xBF\xBF"),
ERROR, /* <no name available> (overlong form of F7 BF BF BF) */
0, -1);
utf8_read_test(TESTSTR("\xFC\x83\xBF\xBF\xBF\xBF"),
ERROR, /* <no name available> (overlong form of FB BF BF BF BF) */
0, -1);
utf8_read_test(TESTSTR("\xC0\x80"),
ERROR, /* <control> (overlong form of 00) */
0, -1);
utf8_read_test(TESTSTR("\xE0\x80\x80"),
ERROR, /* <control> (overlong form of 00) */
0, -1);
utf8_read_test(TESTSTR("\xF0\x80\x80\x80"),
ERROR, /* <control> (overlong form of 00) */
0, -1);
utf8_read_test(TESTSTR("\xF8\x80\x80\x80\x80"),
ERROR, /* <control> (overlong form of 00) */
0, -1);
utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80\x80"),
ERROR, /* <control> (overlong form of 00) */
0, -1);
utf8_read_test(TESTSTR("\xED\xA0\x80"),
ERROR, /* <Non Private Use High Surrogate, First> (surrogate) */
0, -1);
utf8_read_test(TESTSTR("\xED\xAD\xBF"),
ERROR, /* <Non Private Use High Surrogate, Last> (surrogate) */
0, -1);
utf8_read_test(TESTSTR("\xED\xAE\x80"),
ERROR, /* <Private Use High Surrogate, First> (surrogate) */
0, -1);
utf8_read_test(TESTSTR("\xED\xAF\xBF"),
ERROR, /* <Private Use High Surrogate, Last> (surrogate) */
0, -1);
utf8_read_test(TESTSTR("\xED\xB0\x80"),
ERROR, /* <Low Surrogate, First> (surrogate) */
0, -1);
utf8_read_test(TESTSTR("\xED\xBE\x80"),
ERROR, /* <no name available> (surrogate) */
0, -1);
utf8_read_test(TESTSTR("\xED\xBF\xBF"),
ERROR, /* <Low Surrogate, Last> (surrogate) */
0, -1);
utf8_read_test(TESTSTR("\xED\xA0\x80\xED\xB0\x80"),
ERROR, /* <Non Private Use High Surrogate, First> (surrogate) */
ERROR, /* <Low Surrogate, First> (surrogate) */
0, -1);
utf8_read_test(TESTSTR("\xED\xA0\x80\xED\xBF\xBF"),
ERROR, /* <Non Private Use High Surrogate, First> (surrogate) */
ERROR, /* <Low Surrogate, Last> (surrogate) */
0, -1);
utf8_read_test(TESTSTR("\xED\xAD\xBF\xED\xB0\x80"),
ERROR, /* <Non Private Use High Surrogate, Last> (surrogate) */
ERROR, /* <Low Surrogate, First> (surrogate) */
0, -1);
utf8_read_test(TESTSTR("\xED\xAD\xBF\xED\xBF\xBF"),
ERROR, /* <Non Private Use High Surrogate, Last> (surrogate) */
ERROR, /* <Low Surrogate, Last> (surrogate) */
0, -1);
utf8_read_test(TESTSTR("\xED\xAE\x80\xED\xB0\x80"),
ERROR, /* <Private Use High Surrogate, First> (surrogate) */
ERROR, /* <Low Surrogate, First> (surrogate) */
0, -1);
utf8_read_test(TESTSTR("\xED\xAE\x80\xED\xBF\xBF"),
ERROR, /* <Private Use High Surrogate, First> (surrogate) */
ERROR, /* <Low Surrogate, Last> (surrogate) */
0, -1);
utf8_read_test(TESTSTR("\xED\xAF\xBF\xED\xB0\x80"),
ERROR, /* <Private Use High Surrogate, Last> (surrogate) */
ERROR, /* <Low Surrogate, First> (surrogate) */
0, -1);
utf8_read_test(TESTSTR("\xED\xAF\xBF\xED\xBF\xBF"),
ERROR, /* <Private Use High Surrogate, Last> (surrogate) */
ERROR, /* <Low Surrogate, Last> (surrogate) */
0, -1);
utf8_read_test(TESTSTR("\xEF\xBF\xBE"),
ERROR, /* <no name available> (invalid char) */
0, -1);
utf8_read_test(TESTSTR("\xEF\xBF\xBF"),
ERROR, /* <no name available> (invalid char) */
0, -1);
printf("read tests completed\n");
printf("write tests beginning\n");
{
const static long str[] =
{0x03BAL, 0x1F79L, 0x03C3L, 0x03BCL, 0x03B5L, 0};
utf8_write_test(TESTSTR(str),
0xCE, 0xBA,
0xE1, 0xBD, 0xB9,
0xCF, 0x83,
0xCE, 0xBC,
0xCE, 0xB5,
0, -1);
}
{
const static long str[] = {0x0000L, 0};
utf8_write_test(TESTSTR(str),
0x00,
0, -1);
}
{
const static long str[] = {0x0080L, 0};
utf8_write_test(TESTSTR(str),
0xC2, 0x80,
0, -1);
}
{
const static long str[] = {0x0800L, 0};
utf8_write_test(TESTSTR(str),
0xE0, 0xA0, 0x80,
0, -1);
}
{
const static long str[] = {0x00010000L, 0};
utf8_write_test(TESTSTR(str),
0xF0, 0x90, 0x80, 0x80,
0, -1);
}
{
const static long str[] = {0x00200000L, 0};
utf8_write_test(TESTSTR(str),
0xF8, 0x88, 0x80, 0x80, 0x80,
0, -1);
}
{
const static long str[] = {0x04000000L, 0};
utf8_write_test(TESTSTR(str),
0xFC, 0x84, 0x80, 0x80, 0x80, 0x80,
0, -1);
}
{
const static long str[] = {0x007FL, 0};
utf8_write_test(TESTSTR(str),
0x7F,
0, -1);
}
{
const static long str[] = {0x07FFL, 0};
utf8_write_test(TESTSTR(str),
0xDF, 0xBF,
0, -1);
}
{
const static long str[] = {0xFFFDL, 0};
utf8_write_test(TESTSTR(str),
0xEF, 0xBF, 0xBD,
0, -1);
}
{
const static long str[] = {0xFFFFL, 0};
utf8_write_test(TESTSTR(str),
ERROR,
0, -1);
}
{
const static long str[] = {0x001FFFFFL, 0};
utf8_write_test(TESTSTR(str),
0xF7, 0xBF, 0xBF, 0xBF,
0, -1);
}
{
const static long str[] = {0x03FFFFFFL, 0};
utf8_write_test(TESTSTR(str),
0xFB, 0xBF, 0xBF, 0xBF, 0xBF,
0, -1);
}
{
const static long str[] = {0x7FFFFFFFL, 0};
utf8_write_test(TESTSTR(str),
0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,
0, -1);
}
{
const static long str[] = {0xD7FFL, 0};
utf8_write_test(TESTSTR(str),
0xED, 0x9F, 0xBF,
0, -1);
}
{
const static long str[] = {0xD800L, 0};
utf8_write_test(TESTSTR(str),
ERROR,
0, -1);
}
{
const static long str[] = {0xD800L, 0xDC00L, 0};
utf8_write_test(TESTSTR(str),
ERROR,
ERROR,
0, -1);
}
{
const static long str[] = {0xDFFFL, 0};
utf8_write_test(TESTSTR(str),
ERROR,
0, -1);
}
{
const static long str[] = {0xE000L, 0};
utf8_write_test(TESTSTR(str),
0xEE, 0x80, 0x80,
0, -1);
}
printf("write tests completed\n");
printf("total: %d errors\n", total_errs);
return (total_errs != 0);
}
#endif /* TESTMODE */
/*
 * The charset descriptor for UTF-8: its charset id followed by the
 * read and write routines defined in this file.
 *
 * NOTE(review): the final NULL is presumably the per-charset data
 * pointer, unused because UTF-8 has no associated data - confirm
 * against the charset_spec declaration.
 */
const charset_spec charset_CS_UTF8 = {
CS_UTF8, read_utf8, write_utf8, NULL
};
#else /* ENUM_CHARSETS */
ENUM_CHARSET(CS_UTF8)
#endif /* ENUM_CHARSETS */

93
puttysrc/CHARSET/XENC.C Normal file
View File

@@ -0,0 +1,93 @@
/*
* xenc.c - translate our internal character set codes to and from
* X11 character encoding names.
*
*/
#include <ctype.h>
#include "charset.h"
#include "internal.h"
/*
 * Table pairing X11 font encoding names with our internal charset
 * ids. Both lookup functions in this file scan it from the top
 * and stop at the first match, so the order of entries matters.
 */
static const struct {
const char *name;
int charset;
} xencs[] = {
/*
 * Officially registered encoding names. This list is derived
 * from the font encodings section of
 *
 * http://ftp.x.org/pub/DOCS/registry
 *
 * Where multiple encoding names map to the same encoding id
 * (such as iso8859-15 and fcd8859-15), the first is considered
 * canonical and will be returned when translating the id to a
 * string.
 */
{ "iso8859-1", CS_ISO8859_1 },
{ "iso8859-2", CS_ISO8859_2 },
{ "iso8859-3", CS_ISO8859_3 },
{ "iso8859-4", CS_ISO8859_4 },
{ "iso8859-5", CS_ISO8859_5 },
{ "iso8859-6", CS_ISO8859_6 },
{ "iso8859-7", CS_ISO8859_7 },
{ "iso8859-8", CS_ISO8859_8 },
{ "iso8859-9", CS_ISO8859_9 },
{ "iso8859-10", CS_ISO8859_10 },
{ "iso8859-13", CS_ISO8859_13 },
{ "iso8859-14", CS_ISO8859_14 },
{ "iso8859-15", CS_ISO8859_15 },
{ "fcd8859-15", CS_ISO8859_15 },
{ "hp-roman8", CS_HP_ROMAN8 },
{ "koi8-r", CS_KOI8_R },
/*
 * Unofficial encoding names found in the wild.
 */
{ "iso8859-16", CS_ISO8859_16 },
{ "koi8-u", CS_KOI8_U },
{ "ibm-cp437", CS_CP437 },
{ "ibm-cp850", CS_CP850 },
{ "ibm-cp866", CS_CP866 },
{ "microsoft-cp1250", CS_CP1250 },
{ "microsoft-cp1251", CS_CP1251 },
{ "microsoft-cp1252", CS_CP1252 },
{ "microsoft-cp1253", CS_CP1253 },
{ "microsoft-cp1254", CS_CP1254 },
{ "microsoft-cp1255", CS_CP1255 },
{ "microsoft-cp1256", CS_CP1256 },
{ "microsoft-cp1257", CS_CP1257 },
{ "microsoft-cp1258", CS_CP1258 },
{ "mac-roman", CS_MAC_ROMAN },
/* Two spellings for VISCII; the first is the one reported back. */
{ "viscii1.1-1", CS_VISCII },
{ "viscii1-1", CS_VISCII },
};
/*
 * Translate an internal charset id into its X11 encoding name.
 *
 * Returns the name from the first xencs[] entry with that id,
 * which is the canonical name when several entries share an id,
 * or NULL if the id has no entry.
 */
const char *charset_to_xenc(int charset)
{
    int idx = 0;

    while (idx < (int)lenof(xencs)) {
        if (xencs[idx].charset == charset)
            return xencs[idx].name;
        idx++;
    }

    return NULL;                       /* not found */
}
/*
 * Translate an X11 encoding name into an internal charset id.
 *
 * `name' is compared case-insensitively, over its whole length,
 * with each xencs[] entry in turn. Returns the id of the first
 * matching entry, or CS_NONE if nothing matches.
 */
int charset_from_xenc(const char *name)
{
    int i;

    for (i = 0; i < (int)lenof(xencs); i++) {
        const char *p = name;
        const char *q = xencs[i].name;

        /* Advance while the two strings agree, ignoring case. */
        while (*p || *q) {
            /*
             * tolower() is only defined for arguments that are
             * EOF or representable as unsigned char. Plain char
             * may be signed, so go through unsigned char to keep
             * bytes above 0x7F in `name' well defined.
             */
            if (tolower((unsigned char)*p) != tolower((unsigned char)*q))
                break;
            p++; q++;
        }

        /* A match only if both strings ended together. */
        if (!*p && !*q)
            return xencs[i].charset;
    }

    return CS_NONE;                    /* not found */
}