checking in all the old panacean stuff
This commit is contained in:
154
puttysrc/CHARSET/CHARSET.H
Normal file
154
puttysrc/CHARSET/CHARSET.H
Normal file
@@ -0,0 +1,154 @@
|
||||
/*
|
||||
* charset.h - header file for general character set conversion
|
||||
* routines.
|
||||
*/
|
||||
|
||||
#ifndef charset_charset_h
|
||||
#define charset_charset_h
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
/*
|
||||
* Enumeration that lists all the multibyte or single-byte
|
||||
* character sets known to this library.
|
||||
*/
|
||||
typedef enum {
    /*
     * Values are assigned implicitly, in declaration order, starting
     * from CS_NONE == 0.
     */
    CS_NONE = 0,                       /* used for reporting errors, etc */

    /* ISO 8859 family */
    CS_ISO8859_1,
    CS_ISO8859_1_X11,                  /* X font encoding with VT100 glyphs */
    CS_ISO8859_2,
    CS_ISO8859_3,
    CS_ISO8859_4,
    CS_ISO8859_5,
    CS_ISO8859_6,
    CS_ISO8859_7,
    CS_ISO8859_8,
    CS_ISO8859_9,
    CS_ISO8859_10,
    CS_ISO8859_11,
    CS_ISO8859_13,
    CS_ISO8859_14,
    CS_ISO8859_15,
    CS_ISO8859_16,

    /* Numbered code pages */
    CS_CP437,
    CS_CP850,
    CS_CP866,
    CS_CP1250,
    CS_CP1251,
    CS_CP1252,
    CS_CP1253,
    CS_CP1254,
    CS_CP1255,
    CS_CP1256,
    CS_CP1257,
    CS_CP1258,

    /* KOI8 */
    CS_KOI8_R,
    CS_KOI8_U,

    /* Mac OS encodings */
    CS_MAC_ROMAN,
    CS_MAC_TURKISH,
    CS_MAC_CROATIAN,
    CS_MAC_ICELAND,
    CS_MAC_ROMANIAN,
    CS_MAC_GREEK,
    CS_MAC_CYRILLIC,
    CS_MAC_THAI,
    CS_MAC_CENTEURO,
    CS_MAC_SYMBOL,
    CS_MAC_DINGBATS,
    CS_MAC_ROMAN_OLD,
    CS_MAC_CROATIAN_OLD,
    CS_MAC_ICELAND_OLD,
    CS_MAC_ROMANIAN_OLD,
    CS_MAC_GREEK_OLD,
    CS_MAC_CYRILLIC_OLD,
    CS_MAC_UKRAINE,
    CS_MAC_VT100,
    CS_MAC_VT100_OLD,

    /* Everything else */
    CS_VISCII,
    CS_HP_ROMAN8,
    CS_DEC_MCS,
    CS_UTF8
} charset_t;
|
||||
|
||||
/*
 * State carried by the conversion routines from one call to the next.
 * charset_from_unicode() starts from s0 == 0 when the caller passes no
 * state of its own.
 */
typedef struct {
    unsigned long s0;                  /* the single word of converter state */
} charset_state;
|
||||
|
||||
/*
|
||||
* Routine to convert a MB/SB character set to Unicode.
|
||||
*
|
||||
* This routine accepts some number of bytes, updates a state
|
||||
* variable, and outputs some number of Unicode characters. There
|
||||
* are no guarantees. You can't even guarantee that at most one
|
||||
* Unicode character will be output per byte you feed in; for
|
||||
* example, suppose you're reading UTF-8, you've seen E1 80, and
|
||||
* then you suddenly see FE. Now you need to output _two_ error
|
||||
* characters - one for the incomplete sequence E1 80, and one for
|
||||
* the completely invalid UTF-8 byte FE.
|
||||
*
|
||||
* Returns the number of wide characters output; will never output
|
||||
* more than the size of the buffer (as specified on input).
|
||||
* Advances the `input' pointer and decrements `inlen', to indicate
|
||||
* how far along the input string it got.
|
||||
*
|
||||
* The sequence of `errlen' wide characters pointed to by `errstr'
|
||||
* will be used to indicate a conversion error. If `errstr' is
|
||||
* NULL, `errlen' will be ignored, and the library will choose
|
||||
* something sensible to do on its own. For Unicode, this will be
|
||||
* U+FFFD (REPLACEMENT CHARACTER).
|
||||
*/
|
||||
|
||||
int charset_to_unicode(char **input, int *inlen, wchar_t *output, int outlen,
|
||||
int charset, charset_state *state,
|
||||
const wchar_t *errstr, int errlen);
|
||||
|
||||
/*
|
||||
* Routine to convert Unicode to an MB/SB character set.
|
||||
*
|
||||
* This routine accepts some number of Unicode characters, updates
|
||||
* a state variable, and outputs some number of bytes.
|
||||
*
|
||||
* Returns the number of bytes output; will never output
|
||||
* more than the size of the buffer (as specified on input), and
|
||||
* will never output a partial MB character. Advances the `input'
|
||||
* pointer and decrements `inlen', to indicate how far along the
|
||||
* input string it got.
|
||||
*
|
||||
* The sequence of `errlen' characters pointed to by `errstr' will
|
||||
* be used to indicate a conversion error. If `errstr' is NULL,
|
||||
* `errlen' will be ignored, and the library will choose something
|
||||
* sensible to do on its own (which will vary depending on the
|
||||
* output charset).
|
||||
*/
|
||||
|
||||
int charset_from_unicode(wchar_t **input, int *inlen, char *output, int outlen,
|
||||
int charset, charset_state *state,
|
||||
const char *errstr, int errlen);
|
||||
|
||||
/*
|
||||
* Convert X11 encoding names to and from our charset identifiers.
|
||||
*/
|
||||
const char *charset_to_xenc(int charset);
|
||||
int charset_from_xenc(const char *name);
|
||||
|
||||
/*
|
||||
* Convert MIME encoding names to and from our charset identifiers.
|
||||
*/
|
||||
const char *charset_to_mimeenc(int charset);
|
||||
int charset_from_mimeenc(const char *name);
|
||||
|
||||
/*
|
||||
* Convert our own encoding names to and from our charset
|
||||
* identifiers.
|
||||
*/
|
||||
const char *charset_to_localenc(int charset);
|
||||
int charset_from_localenc(const char *name);
|
||||
int charset_localenc_nth(int n);
|
||||
|
||||
/*
|
||||
* Convert Mac OS script/region/font to our charset identifiers.
|
||||
*/
|
||||
int charset_from_macenc(int script, int region, int sysvers,
|
||||
const char *fontname);
|
||||
|
||||
#endif /* charset_charset_h */
|
||||
19
puttysrc/CHARSET/ENUM.C
Normal file
19
puttysrc/CHARSET/ENUM.C
Normal file
@@ -0,0 +1,19 @@
|
||||
/*
|
||||
* enum.c - enumerate all charsets defined by the library.
|
||||
*
|
||||
* This file maintains a list of every other source file which
|
||||
* contains ENUM_CHARSET definitions. It #includes each one with
|
||||
* ENUM_CHARSETS defined, which causes those source files to do
|
||||
* nothing at all except call the ENUM_CHARSET macro on each
|
||||
* charset they define.
|
||||
*
|
||||
* This file in turn is included from various other places, with
|
||||
* the ENUM_CHARSET macro defined to various different things. This
|
||||
* allows us to have multiple implementations of the master charset
|
||||
* lookup table (a static one and a dynamic one).
|
||||
*/
|
||||
|
||||
#define ENUM_CHARSETS
|
||||
#include "sbcsdat.c"
|
||||
#include "utf8.c"
|
||||
#undef ENUM_CHARSETS
|
||||
91
puttysrc/CHARSET/FROMUCS.C
Normal file
91
puttysrc/CHARSET/FROMUCS.C
Normal file
@@ -0,0 +1,91 @@
|
||||
/*
|
||||
* fromucs.c - convert Unicode to other character sets.
|
||||
*/
|
||||
|
||||
#include "charset.h"
|
||||
#include "internal.h"
|
||||
|
||||
/*
 * Output-side context handed to charset_emit() through its `ctx'
 * pointer.
 */
struct charset_emit_param {
    char *output;                      /* next byte of the output buffer to fill */
    int outlen;                        /* bytes still available at `output' */
    const char *errstr;                /* bytes written when ERROR is emitted */
    int errlen;                        /* number of bytes at `errstr' */
    int stopped;                       /* set to 1 when an emit did not fit */
};
|
||||
|
||||
static void charset_emit(void *ctx, long int output)
|
||||
{
|
||||
struct charset_emit_param *param = (struct charset_emit_param *)ctx;
|
||||
char outval;
|
||||
char const *p;
|
||||
int outlen;
|
||||
|
||||
if (output == ERROR) {
|
||||
p = param->errstr;
|
||||
outlen = param->errlen;
|
||||
} else {
|
||||
outval = output;
|
||||
p = &outval;
|
||||
outlen = 1;
|
||||
}
|
||||
|
||||
if (param->outlen >= outlen) {
|
||||
while (outlen > 0) {
|
||||
*param->output++ = *p++;
|
||||
param->outlen--;
|
||||
outlen--;
|
||||
}
|
||||
} else {
|
||||
param->stopped = 1;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Convert wide characters at *input into the byte encoding `charset'.
 *
 * Consumes characters one at a time until *inlen reaches zero or the
 * output buffer cannot hold the bytes for the next character.  *input
 * is advanced and *inlen decremented for every character consumed.
 * Returns the number of bytes written to `output', never more than
 * `outlen'.
 *
 * `errstr'/`errlen' give the bytes to write for a character the
 * charset reports as ERROR; if `errstr' is NULL, `errlen' is ignored
 * and a single "." is used.
 *
 * `state' may be NULL.  Otherwise it is read on entry and updated
 * only after a character has been emitted in full, so a character
 * that did not fit leaves it untouched.
 *
 * NOTE(review): the result of charset_find_spec() is used unchecked;
 * confirm that it cannot return NULL for the `charset' values callers
 * pass in.
 */
int charset_from_unicode(wchar_t **input, int *inlen, char *output, int outlen,
                         int charset, charset_state *state,
                         const char *errstr, int errlen)
{
    charset_spec const *spec = charset_find_spec(charset);
    charset_state localstate;
    struct charset_emit_param param;

    param.output = output;
    param.outlen = outlen;
    param.stopped = 0;

    /*
     * charset_emit will expect a valid errstr.
     */
    if (errstr) {
        param.errstr = errstr;
        param.errlen = errlen;
    } else {
        /* *shrug* this is good enough, and consistent across all SBCS... */
        param.errstr = ".";
        param.errlen = 1;
    }

    /*
     * Work on a private copy of the state.  `state' itself must keep
     * pointing at the caller's object (or stay NULL) so that the
     * write-back in the loop below reaches the caller.
     */
    if (state) {
        localstate = *state;           /* structure copy */
    } else {
        localstate.s0 = 0;
    }

    while (*inlen > 0) {
        int lenbefore = param.output - output;

        spec->write(spec, **input, &localstate, charset_emit, &param);
        if (param.stopped) {
            /*
             * The emit function has _tried_ to output some
             * characters, but ran up against the end of the
             * buffer. Leave immediately, and return what happened
             * _before_ attempting to process this character.
             */
            return lenbefore;
        }
        if (state)
            *state = localstate;       /* structure copy */
        (*input)++;
        (*inlen)--;
    }
    return param.output - output;
}
|
||||
89
puttysrc/CHARSET/INTERNAL.H
Normal file
89
puttysrc/CHARSET/INTERNAL.H
Normal file
@@ -0,0 +1,89 @@
|
||||
/*
|
||||
* internal.h - internal header stuff for the charset library.
|
||||
*/
|
||||
|
||||
#ifndef charset_internal_h
|
||||
#define charset_internal_h
|
||||
|
||||
/* This invariably comes in handy */
|
||||
/* Number of elements in the array `x': its total size divided by the
 * size of one element.  Only meaningful for true arrays, not pointers. */
#define lenof(x) ( sizeof((x)) / sizeof(*(x)) )
|
||||
|
||||
/* This is an invalid Unicode value used to indicate an error. */
|
||||
/* Passed to `emit' callbacks in place of a character; a long constant,
 * matching the `long int' character arguments of those callbacks. */
#define ERROR 0xFFFFL /* Unicode value representing error */
|
||||
|
||||
typedef struct charset_spec charset_spec;
|
||||
typedef struct sbcs_data sbcs_data;
|
||||
|
||||
struct charset_spec {
|
||||
int charset; /* numeric identifier */
|
||||
|
||||
/*
|
||||
* A function to read the character set and output Unicode
|
||||
* characters. The `emit' function expects to get Unicode chars
|
||||
* passed to it; it should be sent ERROR for any encoding error
|
||||
* on the input.
|
||||
*/
|
||||
void (*read)(charset_spec const *charset, long int input_chr,
|
||||
charset_state *state,
|
||||
void (*emit)(void *ctx, long int output), void *emitctx);
|
||||
/*
|
||||
* A function to read Unicode characters and output in this
|
||||
* character set. The `emit' function expects to get byte
|
||||
* values passed to it; it should be sent ERROR for any
|
||||
* non-representable characters on the input.
|
||||
*/
|
||||
void (*write)(charset_spec const *charset, long int input_chr,
|
||||
charset_state *state,
|
||||
void (*emit)(void *ctx, long int output), void *emitctx);
|
||||
void const *data;
|
||||
};
|
||||
|
||||
/*
|
||||
* This is the format of `data' used by the SBCS read and write
|
||||
* functions; so it's the format used in all SBCS definitions.
|
||||
*/
|
||||
struct sbcs_data {
    /*
     * Forward table: the Unicode code point for each of the 256 byte
     * values.  An entry of ERROR marks a byte value that the SBCS in
     * question does not define, so meeting it in input is an error.
     */
    unsigned long sbcs2ucs[256];

    /*
     * Reverse table: the valid byte values of the SBCS, ordered by
     * the Unicode value each one translates to.  To map a Unicode
     * value U back to a byte, binary-search this table, comparing
     * sbcs2ucs[ucs2sbcs[X]] with U at each probed position X and
     * branching on less than / greater than / equal.
     *
     * An SBCS may define fewer than 256 byte values, so `nvalid'
     * records how many leading entries of ucs2sbcs are in use.
     */
    unsigned char ucs2sbcs[256];
    int nvalid;
};
|
||||
|
||||
/*
|
||||
* Prototypes for internal library functions.
|
||||
*/
|
||||
charset_spec const *charset_find_spec(int charset);
|
||||
void read_sbcs(charset_spec const *charset, long int input_chr,
|
||||
charset_state *state,
|
||||
void (*emit)(void *ctx, long int output), void *emitctx);
|
||||
void write_sbcs(charset_spec const *charset, long int input_chr,
|
||||
charset_state *state,
|
||||
void (*emit)(void *ctx, long int output), void *emitctx);
|
||||
|
||||
/*
|
||||
* Placate compiler warning about unused parameters, of which we
|
||||
* expect to have some in this library.
|
||||
*/
|
||||
/* Mention `x' in a void expression: no effect at run time, and unlike
 * a self-assignment it also compiles for const-qualified arguments. */
#define UNUSEDARG(x) ( (void)(x) )
|
||||
|
||||
#endif /* charset_internal_h */
|
||||
125
puttysrc/CHARSET/LOCALENC.C
Normal file
125
puttysrc/CHARSET/LOCALENC.C
Normal file
@@ -0,0 +1,125 @@
|
||||
/*
|
||||
* localenc.c - translate our internal character set codes to and from
|
||||
* our own set of plausibly legible character-set names. Also
|
||||
* provides a canonical name for each encoding (useful for software
|
||||
* announcing what character set it will be using), and a set of
|
||||
* enumeration functions which return a list of supported
|
||||
* encodings one by one.
|
||||
*
|
||||
* charset_from_localenc will attempt all other text translations
|
||||
* as well as this table, to maximise the number of different ways
|
||||
* you can select a supported charset.
|
||||
*/
|
||||
|
||||
#include <ctype.h>
|
||||
#include "charset.h"
|
||||
#include "internal.h"
|
||||
|
||||
static const struct {
|
||||
const char *name;
|
||||
int charset;
|
||||
int return_in_enum; /* enumeration misses some charsets */
|
||||
} localencs[] = {
|
||||
{ "<UNKNOWN>", CS_NONE, 0 },
|
||||
{ "ISO-8859-1", CS_ISO8859_1, 1 },
|
||||
{ "ISO-8859-1 with X11 line drawing", CS_ISO8859_1_X11, 0 },
|
||||
{ "ISO-8859-2", CS_ISO8859_2, 1 },
|
||||
{ "ISO-8859-3", CS_ISO8859_3, 1 },
|
||||
{ "ISO-8859-4", CS_ISO8859_4, 1 },
|
||||
{ "ISO-8859-5", CS_ISO8859_5, 1 },
|
||||
{ "ISO-8859-6", CS_ISO8859_6, 1 },
|
||||
{ "ISO-8859-7", CS_ISO8859_7, 1 },
|
||||
{ "ISO-8859-8", CS_ISO8859_8, 1 },
|
||||
{ "ISO-8859-9", CS_ISO8859_9, 1 },
|
||||
{ "ISO-8859-10", CS_ISO8859_10, 1 },
|
||||
{ "ISO-8859-11", CS_ISO8859_11, 1 },
|
||||
{ "ISO-8859-13", CS_ISO8859_13, 1 },
|
||||
{ "ISO-8859-14", CS_ISO8859_14, 1 },
|
||||
{ "ISO-8859-15", CS_ISO8859_15, 1 },
|
||||
{ "ISO-8859-16", CS_ISO8859_16, 1 },
|
||||
{ "CP437", CS_CP437, 1 },
|
||||
{ "CP850", CS_CP850, 1 },
|
||||
{ "CP866", CS_CP866, 1 },
|
||||
{ "CP1250", CS_CP1250, 1 },
|
||||
{ "CP1251", CS_CP1251, 1 },
|
||||
{ "CP1252", CS_CP1252, 1 },
|
||||
{ "CP1253", CS_CP1253, 1 },
|
||||
{ "CP1254", CS_CP1254, 1 },
|
||||
{ "CP1255", CS_CP1255, 1 },
|
||||
{ "CP1256", CS_CP1256, 1 },
|
||||
{ "CP1257", CS_CP1257, 1 },
|
||||
{ "CP1258", CS_CP1258, 1 },
|
||||
{ "KOI8-R", CS_KOI8_R, 1 },
|
||||
{ "KOI8-U", CS_KOI8_U, 1 },
|
||||
{ "Mac Roman", CS_MAC_ROMAN, 1 },
|
||||
{ "Mac Turkish", CS_MAC_TURKISH, 1 },
|
||||
{ "Mac Croatian", CS_MAC_CROATIAN, 1 },
|
||||
{ "Mac Iceland", CS_MAC_ICELAND, 1 },
|
||||
{ "Mac Romanian", CS_MAC_ROMANIAN, 1 },
|
||||
{ "Mac Greek", CS_MAC_GREEK, 1 },
|
||||
{ "Mac Cyrillic", CS_MAC_CYRILLIC, 1 },
|
||||
{ "Mac Thai", CS_MAC_THAI, 1 },
|
||||
{ "Mac Centeuro", CS_MAC_CENTEURO, 1 },
|
||||
{ "Mac Symbol", CS_MAC_SYMBOL, 1 },
|
||||
{ "Mac Dingbats", CS_MAC_DINGBATS, 1 },
|
||||
{ "Mac Roman (old)", CS_MAC_ROMAN_OLD, 0 },
|
||||
{ "Mac Croatian (old)", CS_MAC_CROATIAN_OLD, 0 },
|
||||
{ "Mac Iceland (old)", CS_MAC_ICELAND_OLD, 0 },
|
||||
{ "Mac Romanian (old)", CS_MAC_ROMANIAN_OLD, 0 },
|
||||
{ "Mac Greek (old)", CS_MAC_GREEK_OLD, 0 },
|
||||
{ "Mac Cyrillic (old)", CS_MAC_CYRILLIC_OLD, 0 },
|
||||
{ "Mac Ukraine", CS_MAC_UKRAINE, 1 },
|
||||
{ "Mac VT100", CS_MAC_VT100, 1 },
|
||||
{ "Mac VT100 (old)", CS_MAC_VT100_OLD, 0 },
|
||||
{ "VISCII", CS_VISCII, 1 },
|
||||
{ "HP ROMAN8", CS_HP_ROMAN8, 1 },
|
||||
{ "DEC MCS", CS_DEC_MCS, 1 },
|
||||
{ "UTF-8", CS_UTF8, 1 },
|
||||
};
|
||||
|
||||
const char *charset_to_localenc(int charset)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < (int)lenof(localencs); i++)
|
||||
if (charset == localencs[i].charset)
|
||||
return localencs[i].name;
|
||||
|
||||
return NULL; /* not found */
|
||||
}
|
||||
|
||||
int charset_from_localenc(const char *name)
|
||||
{
|
||||
int i;
|
||||
|
||||
if ( (i = charset_from_mimeenc(name)) != CS_NONE)
|
||||
return i;
|
||||
if ( (i = charset_from_xenc(name)) != CS_NONE)
|
||||
return i;
|
||||
|
||||
for (i = 0; i < (int)lenof(localencs); i++) {
|
||||
const char *p, *q;
|
||||
p = name;
|
||||
q = localencs[i].name;
|
||||
while (*p || *q) {
|
||||
if (tolower(*p) != tolower(*q))
|
||||
break;
|
||||
p++; q++;
|
||||
}
|
||||
if (!*p && !*q)
|
||||
return localencs[i].charset;
|
||||
}
|
||||
|
||||
return CS_NONE; /* not found */
|
||||
}
|
||||
|
||||
int charset_localenc_nth(int n)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < (int)lenof(localencs); i++)
|
||||
if (localencs[i].return_in_enum && !n--)
|
||||
return localencs[i].charset;
|
||||
|
||||
return CS_NONE; /* end of list */
|
||||
}
|
||||
169
puttysrc/CHARSET/MACENC.C
Normal file
169
puttysrc/CHARSET/MACENC.C
Normal file
@@ -0,0 +1,169 @@
|
||||
/* $Id: macenc.c 4787 2004-11-16 15:27:00Z simon $ */
|
||||
/*
|
||||
* Copyright (c) 2003 Ben Harris
|
||||
* All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
|
||||
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
|
||||
* CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
/*
|
||||
* macenc.c -- Convert a Mac OS script/region/font combination to our
|
||||
* internal charset code.
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include "charset.h"
|
||||
#include "internal.h"
|
||||
|
||||
/*
|
||||
* These are defined by Mac OS's <Script.h>, but we'd like to be
|
||||
* independent of that.
|
||||
*/
|
||||
|
||||
/* Script codes (matched against the `script' column of macencs). */
#define smRoman 0
#define smJapanese 1
#define smTradChinese 2
#define smKorean 3
#define smArabic 4
#define smHebrew 5
#define smCyrillic 7
#define smDevanagari 9                 /* spelling now matches its use in macencs */
#define smGurmukhi 10
#define smGujurati 11
#define smThai 21
#define smSimpChinese 25
#define smTibetan 26
#define smEthiopic 28
#define smCentralEuroRoman 29

/* Region codes (matched against the `region' column of macencs). */
#define verGreece 20
#define verIceland 21
#define verTurkey 24
#define verYugoCroatian 25
#define verRomania 39
#define verFaroeIsl 47
#define verIran 48
#define verRussia 49
#define verSlovenian 66
#define verCroatia 68
#define verBulgaria 72
#define verScottishGaelic 75
#define verManxGaelic 76
#define verBreton 77
#define verNunavut 78
#define verWelsh 79
#define verIrishGaelicScript 81
|
||||
|
||||
static const struct {
|
||||
int script;
|
||||
int region;
|
||||
int sysvermin;
|
||||
char const *fontname;
|
||||
int charset;
|
||||
} macencs[] = {
|
||||
{ smRoman, -1, 0x850, "VT100", CS_MAC_VT100 },
|
||||
{ smRoman, -1, 0, "VT100", CS_MAC_VT100_OLD },
|
||||
/*
|
||||
* From here on, this table is largely derived from
|
||||
* <http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/README.TXT>,
|
||||
* with _OLD version added based on the comments in individual
|
||||
* mapping files.
|
||||
*/
|
||||
{ smRoman, -1, 0, "Symbol", CS_MAC_SYMBOL },
|
||||
{ smRoman, -1, 0, "Zapf Dingbats", CS_MAC_DINGBATS },
|
||||
{ smRoman, verTurkey, 0, NULL, CS_MAC_TURKISH },
|
||||
{ smRoman, verYugoCroatian, 0x850, NULL, CS_MAC_CROATIAN },
|
||||
{ smRoman, verYugoCroatian, 0, NULL, CS_MAC_CROATIAN_OLD },
|
||||
{ smRoman, verSlovenian, 0x850, NULL, CS_MAC_CROATIAN },
|
||||
{ smRoman, verSlovenian, 0, NULL, CS_MAC_CROATIAN_OLD },
|
||||
{ smRoman, verCroatia, 0x850, NULL, CS_MAC_CROATIAN },
|
||||
{ smRoman, verCroatia, 0, NULL, CS_MAC_CROATIAN_OLD },
|
||||
{ smRoman, verIceland, 0x850, NULL, CS_MAC_ICELAND },
|
||||
{ smRoman, verIceland, 0, NULL, CS_MAC_ICELAND_OLD },
|
||||
{ smRoman, verFaroeIsl, 0x850, NULL, CS_MAC_ICELAND },
|
||||
{ smRoman, verFaroeIsl, 0, NULL, CS_MAC_ICELAND_OLD },
|
||||
{ smRoman, verRomania, 0x850, NULL, CS_MAC_ROMANIAN },
|
||||
{ smRoman, verRomania, 0, NULL, CS_MAC_ROMANIAN_OLD },
|
||||
#if 0 /* No mapping table on ftp.unicode.org */
|
||||
{ smRoman, verIreland, 0x850, NULL, CS_MAC_CELTIC },
|
||||
{ smRoman, verIreland, 0, NULL, CS_MAC_CELTIC_OLD },
|
||||
{ smRoman, verScottishGaelic, 0x850, NULL, CS_MAC_CELTIC },
|
||||
{ smRoman, verScottishGaelic, 0, NULL, CS_MAC_CELTIC_OLD },
|
||||
{ smRoman, verManxGaelic, 0x850, NULL, CS_MAC_CELTIC },
|
||||
{ smRoman, verManxGaelic, 0, NULL, CS_MAC_CELTIC_OLD },
|
||||
{ smRoman, verBreton, 0x850, NULL, CS_MAC_CELTIC },
|
||||
{ smRoman, verBreton, 0, NULL, CS_MAC_CELTIC_OLD },
|
||||
{ smRoman, verWelsh, 0x850, NULL, CS_MAC_CELTIC },
|
||||
{ smRoman, verWelsh, 0, NULL, CS_MAC_CELTIC_OLD },
|
||||
{ smRoman, verIrishGaelicScript, 0x850, NULL, CS_MAC_GAELIC },
|
||||
{ smRoman, verIrishGaelicScript, 0, NULL, CS_MAC_GAELIC_OLD },
|
||||
#endif
|
||||
{ smRoman, verGreece, 0x922, NULL, CS_MAC_GREEK },
|
||||
{ smRoman, verGreece, 0, NULL, CS_MAC_GREEK_OLD },
|
||||
{ smRoman, -1, 0x850, NULL, CS_MAC_ROMAN },
|
||||
{ smRoman, -1, 0, NULL, CS_MAC_ROMAN_OLD },
|
||||
#if 0 /* Multi-byte encodings, not yet supported */
|
||||
{ smJapanese, -1, 0, NULL, CS_MAC_JAPANESE },
|
||||
{ smTradChinese, -1, 0, NULL, CS_MAC_CHINTRAD },
|
||||
{ smKorean, -1, 0, NULL, CS_MAC_KOREAN },
|
||||
#endif
|
||||
#if 0 /* Bidirectional encodings, not yet supported */
|
||||
{ smArabic, verIran, 0, NULL, CS_MAC_FARSI },
|
||||
{ smArabic, -1, 0, NULL, CS_MAC_ARABIC },
|
||||
{ smHebrew, -1, 0, NULL, CS_MAC_HEBREW },
|
||||
#endif
|
||||
{ smCyrillic, -1, 0x900, NULL, CS_MAC_CYRILLIC },
|
||||
{ smCyrillic, verRussia, 0, NULL, CS_MAC_CYRILLIC_OLD },
|
||||
{ smCyrillic, verBulgaria, 0, NULL, CS_MAC_CYRILLIC_OLD },
|
||||
{ smCyrillic, -1, 0, NULL, CS_MAC_UKRAINE },
|
||||
#if 0 /* Complex Indic scripts, not yet supported */
|
||||
{ smDevanagari, -1, 0, NULL, CS_MAC_DEVENAGA },
|
||||
{ smGurmukhi, -1, 0, NULL, CS_MAC_GURMUKHI },
|
||||
{ smGujurati, -1, 0, NULL, CS_MAC_GUJURATI },
|
||||
#endif
|
||||
{ smThai, -1, 0, NULL, CS_MAC_THAI },
|
||||
#if 0 /* Multi-byte encoding, not yet supported */
|
||||
{ smSimpChinese, -1, 0, NULL, CS_MAC_CHINSIMP },
|
||||
#endif
|
||||
#if 0 /* No mapping table on ftp.unicode.org */
|
||||
{ smTibetan, -1, 0, NULL, CS_MAC_TIBETAN },
|
||||
{ smEthiopic, -1, 0, NULL, CS_MAC_ETHIOPIC },
|
||||
{ smEthiopic, verNanavut, 0, NULL, CS_MAC_INUIT },
|
||||
#endif
|
||||
{ smCentralEuroRoman, -1, 0, NULL, CS_MAC_CENTEURO },
|
||||
};
|
||||
|
||||
int charset_from_macenc(int script, int region, int sysvers,
|
||||
char const *fontname)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < (int)lenof(macencs); i++)
|
||||
if ((macencs[i].script == script) &&
|
||||
(macencs[i].region < 0 || macencs[i].region == region) &&
|
||||
(macencs[i].sysvermin <= sysvers) &&
|
||||
(macencs[i].fontname == NULL ||
|
||||
(fontname != NULL && strcmp(macencs[i].fontname, fontname) == 0)))
|
||||
return macencs[i].charset;
|
||||
|
||||
return CS_NONE;
|
||||
}
|
||||
214
puttysrc/CHARSET/MIMEENC.C
Normal file
214
puttysrc/CHARSET/MIMEENC.C
Normal file
@@ -0,0 +1,214 @@
|
||||
/*
|
||||
* mimeenc.c - translate our internal character set codes to and
|
||||
* from MIME standard character-set names.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <ctype.h>
|
||||
#include "charset.h"
|
||||
#include "internal.h"
|
||||
|
||||
static const struct {
|
||||
const char *name;
|
||||
int charset;
|
||||
} mimeencs[] = {
|
||||
/*
|
||||
* These names are taken from
|
||||
*
|
||||
* http://www.iana.org/assignments/character-sets
|
||||
*
|
||||
* Where multiple encoding names map to the same encoding id
|
||||
* (such as the variety of aliases for ISO-8859-1), the first
|
||||
* is considered canonical and will be returned when
|
||||
* translating the id to a string.
|
||||
*/
|
||||
{ "ISO-8859-1", CS_ISO8859_1 },
|
||||
{ "iso-ir-100", CS_ISO8859_1 },
|
||||
{ "ISO_8859-1", CS_ISO8859_1 },
|
||||
{ "ISO_8859-1:1987", CS_ISO8859_1 },
|
||||
{ "latin1", CS_ISO8859_1 },
|
||||
{ "l1", CS_ISO8859_1 },
|
||||
{ "IBM819", CS_ISO8859_1 },
|
||||
{ "CP819", CS_ISO8859_1 },
|
||||
{ "csISOLatin1", CS_ISO8859_1 },
|
||||
|
||||
{ "ISO-8859-2", CS_ISO8859_2 },
|
||||
{ "ISO_8859-2:1987", CS_ISO8859_2 },
|
||||
{ "iso-ir-101", CS_ISO8859_2 },
|
||||
{ "ISO_8859-2", CS_ISO8859_2 },
|
||||
{ "latin2", CS_ISO8859_2 },
|
||||
{ "l2", CS_ISO8859_2 },
|
||||
{ "csISOLatin2", CS_ISO8859_2 },
|
||||
|
||||
{ "ISO-8859-3", CS_ISO8859_3 },
|
||||
{ "ISO_8859-3:1988", CS_ISO8859_3 },
|
||||
{ "iso-ir-109", CS_ISO8859_3 },
|
||||
{ "ISO_8859-3", CS_ISO8859_3 },
|
||||
{ "latin3", CS_ISO8859_3 },
|
||||
{ "l3", CS_ISO8859_3 },
|
||||
{ "csISOLatin3", CS_ISO8859_3 },
|
||||
|
||||
{ "ISO-8859-4", CS_ISO8859_4 },
|
||||
{ "ISO_8859-4:1988", CS_ISO8859_4 },
|
||||
{ "iso-ir-110", CS_ISO8859_4 },
|
||||
{ "ISO_8859-4", CS_ISO8859_4 },
|
||||
{ "latin4", CS_ISO8859_4 },
|
||||
{ "l4", CS_ISO8859_4 },
|
||||
{ "csISOLatin4", CS_ISO8859_4 },
|
||||
|
||||
{ "ISO-8859-5", CS_ISO8859_5 },
|
||||
{ "ISO_8859-5:1988", CS_ISO8859_5 },
|
||||
{ "iso-ir-144", CS_ISO8859_5 },
|
||||
{ "ISO_8859-5", CS_ISO8859_5 },
|
||||
{ "cyrillic", CS_ISO8859_5 },
|
||||
{ "csISOLatinCyrillic", CS_ISO8859_5 },
|
||||
|
||||
{ "ISO-8859-6", CS_ISO8859_6 },
|
||||
{ "ISO_8859-6:1987", CS_ISO8859_6 },
|
||||
{ "iso-ir-127", CS_ISO8859_6 },
|
||||
{ "ISO_8859-6", CS_ISO8859_6 },
|
||||
{ "ECMA-114", CS_ISO8859_6 },
|
||||
{ "ASMO-708", CS_ISO8859_6 },
|
||||
{ "arabic", CS_ISO8859_6 },
|
||||
{ "csISOLatinArabic", CS_ISO8859_6 },
|
||||
|
||||
{ "ISO-8859-7", CS_ISO8859_7 },
|
||||
{ "ISO_8859-7:1987", CS_ISO8859_7 },
|
||||
{ "iso-ir-126", CS_ISO8859_7 },
|
||||
{ "ISO_8859-7", CS_ISO8859_7 },
|
||||
{ "ELOT_928", CS_ISO8859_7 },
|
||||
{ "ECMA-118", CS_ISO8859_7 },
|
||||
{ "greek", CS_ISO8859_7 },
|
||||
{ "greek8", CS_ISO8859_7 },
|
||||
{ "csISOLatinGreek", CS_ISO8859_7 },
|
||||
|
||||
{ "ISO-8859-8", CS_ISO8859_8 },
|
||||
{ "ISO_8859-8:1988", CS_ISO8859_8 },
|
||||
{ "iso-ir-138", CS_ISO8859_8 },
|
||||
{ "ISO_8859-8", CS_ISO8859_8 },
|
||||
{ "hebrew", CS_ISO8859_8 },
|
||||
{ "csISOLatinHebrew", CS_ISO8859_8 },
|
||||
|
||||
{ "ISO-8859-9", CS_ISO8859_9 },
|
||||
{ "ISO_8859-9:1989", CS_ISO8859_9 },
|
||||
{ "iso-ir-148", CS_ISO8859_9 },
|
||||
{ "ISO_8859-9", CS_ISO8859_9 },
|
||||
{ "latin5", CS_ISO8859_9 },
|
||||
{ "l5", CS_ISO8859_9 },
|
||||
{ "csISOLatin5", CS_ISO8859_9 },
|
||||
|
||||
{ "ISO-8859-10", CS_ISO8859_10 },
|
||||
{ "iso-ir-157", CS_ISO8859_10 },
|
||||
{ "l6", CS_ISO8859_10 },
|
||||
{ "ISO_8859-10:1992", CS_ISO8859_10 },
|
||||
{ "csISOLatin6", CS_ISO8859_10 },
|
||||
{ "latin6", CS_ISO8859_10 },
|
||||
|
||||
{ "ISO-8859-13", CS_ISO8859_13 },
|
||||
|
||||
{ "ISO-8859-14", CS_ISO8859_14 },
|
||||
{ "iso-ir-199", CS_ISO8859_14 },
|
||||
{ "ISO_8859-14:1998", CS_ISO8859_14 },
|
||||
{ "ISO_8859-14", CS_ISO8859_14 },
|
||||
{ "latin8", CS_ISO8859_14 },
|
||||
{ "iso-celtic", CS_ISO8859_14 },
|
||||
{ "l8", CS_ISO8859_14 },
|
||||
|
||||
{ "ISO-8859-15", CS_ISO8859_15 },
|
||||
{ "ISO_8859-15", CS_ISO8859_15 },
|
||||
{ "Latin-9", CS_ISO8859_15 },
|
||||
|
||||
{ "ISO-8859-16", CS_ISO8859_16 },
|
||||
{ "iso-ir-226", CS_ISO8859_16 },
|
||||
{ "ISO_8859-16", CS_ISO8859_16 },
|
||||
{ "ISO_8859-16:2001", CS_ISO8859_16 },
|
||||
{ "latin10", CS_ISO8859_16 },
|
||||
{ "l10", CS_ISO8859_16 },
|
||||
|
||||
{ "IBM437", CS_CP437 },
|
||||
{ "cp437", CS_CP437 },
|
||||
{ "437", CS_CP437 },
|
||||
{ "csPC8CodePage437", CS_CP437 },
|
||||
|
||||
{ "IBM850", CS_CP850 },
|
||||
{ "cp850", CS_CP850 },
|
||||
{ "850", CS_CP850 },
|
||||
{ "csPC850Multilingual", CS_CP850 },
|
||||
|
||||
{ "IBM866", CS_CP866 },
|
||||
{ "cp866", CS_CP866 },
|
||||
{ "866", CS_CP866 },
|
||||
{ "csIBM866", CS_CP866 },
|
||||
|
||||
{ "windows-1250", CS_CP1250 },
|
||||
|
||||
{ "windows-1251", CS_CP1251 },
|
||||
|
||||
{ "windows-1252", CS_CP1252 },
|
||||
|
||||
{ "windows-1253", CS_CP1253 },
|
||||
|
||||
{ "windows-1254", CS_CP1254 },
|
||||
|
||||
{ "windows-1255", CS_CP1255 },
|
||||
|
||||
{ "windows-1256", CS_CP1256 },
|
||||
|
||||
{ "windows-1257", CS_CP1257 },
|
||||
|
||||
{ "windows-1258", CS_CP1258 },
|
||||
|
||||
{ "KOI8-R", CS_KOI8_R },
|
||||
{ "csKOI8R", CS_KOI8_R },
|
||||
|
||||
{ "KOI8-U", CS_KOI8_U },
|
||||
|
||||
{ "macintosh", CS_MAC_ROMAN_OLD },
|
||||
{ "mac", CS_MAC_ROMAN_OLD },
|
||||
{ "csMacintosh", CS_MAC_ROMAN_OLD },
|
||||
|
||||
{ "VISCII", CS_VISCII },
|
||||
{ "csVISCII", CS_VISCII },
|
||||
|
||||
{ "hp-roman8", CS_HP_ROMAN8 },
|
||||
{ "roman8", CS_HP_ROMAN8 },
|
||||
{ "r8", CS_HP_ROMAN8 },
|
||||
{ "csHPRoman8", CS_HP_ROMAN8 },
|
||||
|
||||
{ "DEC-MCS", CS_DEC_MCS },
|
||||
{ "dec", CS_DEC_MCS },
|
||||
{ "csDECMCS", CS_DEC_MCS },
|
||||
|
||||
{ "UTF-8", CS_UTF8 },
|
||||
};
|
||||
|
||||
const char *charset_to_mimeenc(int charset)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < (int)lenof(mimeencs); i++)
|
||||
if (charset == mimeencs[i].charset)
|
||||
return mimeencs[i].name;
|
||||
|
||||
return NULL; /* not found */
|
||||
}
|
||||
|
||||
int charset_from_mimeenc(const char *name)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < (int)lenof(mimeencs); i++) {
|
||||
const char *p, *q;
|
||||
p = name;
|
||||
q = mimeencs[i].name;
|
||||
while (*p || *q) {
|
||||
if (tolower(*p) != tolower(*q))
|
||||
break;
|
||||
p++; q++;
|
||||
}
|
||||
if (!*p && !*q)
|
||||
return mimeencs[i].charset;
|
||||
}
|
||||
|
||||
return CS_NONE; /* not found */
|
||||
}
|
||||
15
puttysrc/CHARSET/README
Normal file
15
puttysrc/CHARSET/README
Normal file
@@ -0,0 +1,15 @@
|
||||
This subdirectory contains a general character-set conversion
|
||||
library, used in the Unix port of PuTTY, and available for use in
|
||||
other ports if it should happen to be useful.
|
||||
|
||||
This is a variant of a library that's currently used in some other
|
||||
programs such as Timber and Halibut. At some future date, we would
|
||||
like to merge the two libraries, so that all programs use the same
|
||||
libcharset.
|
||||
|
||||
It is therefore a _strong_ design goal that this library should remain
|
||||
perfectly general, and not tied to particulars of PuTTY. It must not
|
||||
reference any code outside its own subdirectory; it should not have
|
||||
PuTTY-specific helper routines added to it unless they can be
|
||||
documented in a general manner which might make them useful in other
|
||||
circumstances as well.
|
||||
53
puttysrc/CHARSET/SBCS.C
Normal file
53
puttysrc/CHARSET/SBCS.C
Normal file
@@ -0,0 +1,53 @@
|
||||
/*
|
||||
* sbcs.c - routines to handle single-byte character sets.
|
||||
*/
|
||||
|
||||
#include "charset.h"
|
||||
#include "internal.h"
|
||||
|
||||
/*
|
||||
* The charset_spec for any single-byte character set should
|
||||
* provide read_sbcs() as its read function, and its `data' field
|
||||
* should point to a `struct sbcs_data' holding the 256 entries
|
||||
* of the translation table.
|
||||
*/
|
||||
|
||||
void read_sbcs(charset_spec const *charset, long int input_chr,
|
||||
charset_state *state,
|
||||
void (*emit)(void *ctx, long int output), void *emitctx)
|
||||
{
|
||||
const struct sbcs_data *sd = charset->data;
|
||||
|
||||
UNUSEDARG(state);
|
||||
|
||||
emit(emitctx, sd->sbcs2ucs[input_chr]);
|
||||
}
|
||||
|
||||
void write_sbcs(charset_spec const *charset, long int input_chr,
|
||||
charset_state *state,
|
||||
void (*emit)(void *ctx, long int output), void *emitctx)
|
||||
{
|
||||
const struct sbcs_data *sd = charset->data;
|
||||
int i, j, k, c;
|
||||
|
||||
UNUSEDARG(state);
|
||||
|
||||
/*
|
||||
* Binary-search in the ucs2sbcs table.
|
||||
*/
|
||||
i = -1;
|
||||
j = sd->nvalid;
|
||||
while (i+1 < j) {
|
||||
k = (i+j)/2;
|
||||
c = sd->ucs2sbcs[k];
|
||||
if (input_chr < sd->sbcs2ucs[c])
|
||||
j = k;
|
||||
else if (input_chr > sd->sbcs2ucs[c])
|
||||
i = k;
|
||||
else {
|
||||
emit(emitctx, c);
|
||||
return;
|
||||
}
|
||||
}
|
||||
emit(emitctx, ERROR);
|
||||
}
|
||||
1117
puttysrc/CHARSET/SBCS.DAT
Normal file
1117
puttysrc/CHARSET/SBCS.DAT
Normal file
File diff suppressed because it is too large
Load Diff
4018
puttysrc/CHARSET/SBCSDAT.C
Normal file
4018
puttysrc/CHARSET/SBCSDAT.C
Normal file
File diff suppressed because it is too large
Load Diff
110
puttysrc/CHARSET/SBCSGEN.PL
Normal file
110
puttysrc/CHARSET/SBCSGEN.PL
Normal file
@@ -0,0 +1,110 @@
|
||||
#!/usr/bin/env perl

# This script generates sbcsdat.c (the data for all the SBCSes) from its
# source form sbcs.dat.
#
# Input lines recognised by the parser below:
#   charset NAME            - begin a new character set
#   sortpriority LO-HI N    - add N to the priority of bytes LO..HI (hex)
#   HHHH HHHH ...           - table entries in hex; XXXX marks an
#                             undefined byte

# Warnings are enabled here instead of with "-w" on the #! line: many
# kernels pass everything after the interpreter path to env as a single
# argument ("perl -w"), which env then fails to find.
use warnings;

$infile = "sbcs.dat";
$outfile = "sbcsdat.c";

# Fail loudly if either file cannot be opened, rather than silently
# producing an empty or missing sbcsdat.c.
open FOO, "<", $infile or die "$0: unable to open $infile: $!\n";
open BAR, ">", $outfile or die "$0: unable to open $outfile: $!\n";
select BAR;

print "/*\n";
print " * sbcsdat.c - data definitions for single-byte character sets.\n";
print " *\n";
print " * Generated by sbcsgen.pl from sbcs.dat.\n";
print " * You should edit those files rather than editing this one.\n";
print " */\n";
print "\n";
print "#ifndef ENUM_CHARSETS\n";
print "\n";
print "#include \"charset.h\"\n";
print "#include \"internal.h\"\n";
print "\n";

my $charsetname = undef;
my @vals = ();

my @charsetnames = ();
my @sortpriority = ();

while (<FOO>) {
    chomp;
    if (/^charset (.*)$/) {
        # Start of a new character set: reset the accumulated state.
        $charsetname = $1;
        @vals = ();
        @sortpriority = map { 0 } 0..255;
    } elsif (/^sortpriority ([^-]*)-([^-]*) (.*)$/) {
        for ($i = hex $1; $i <= hex $2; $i++) {
            $sortpriority[$i] += $3;
        }
    } elsif (/^[0-9a-fA-FX]/) {
        # A row of table entries; XXXX becomes -1 (no mapping).
        push @vals, map { $_ eq "XXXX" ? -1 : hex $_ } split / +/, $_;
        if (scalar @vals > 256) {
            die "$infile:$.: charset $charsetname has more than 256 values\n";
        } elsif (scalar @vals == 256) {
            # Table complete: write it out and get ready for the next.
            &outcharset($charsetname, \@vals, \@sortpriority);
            push @charsetnames, $charsetname;
            $charsetname = undef;
            @vals = ();
            @sortpriority = map { 0 } 0..255;
        }
    }
}

print "#else /* ENUM_CHARSETS */\n";
print "\n";

foreach $i (@charsetnames) {
    print "ENUM_CHARSET($i)\n";
}

print "\n";
print "#endif /* ENUM_CHARSETS */\n";

# Buffered write errors (e.g. a full disk) only surface at close time.
close BAR or die "$0: error writing $outfile: $!\n";
close FOO;
|
||||
|
||||
# Print the C definitions for one character set to the currently
# selected output handle.
#   $name         - charset name, used to form data_NAME and charset_NAME
#   $vals         - ref to 256 Unicode values, -1 for an undefined byte
#   $sortpriority - ref to 256 priorities; when several bytes map to the
#                   same Unicode value, the highest priority one is the
#                   one listed in the reverse table
# Note that $uval and $j are not declared with `my' and are therefore
# package globals.
sub outcharset($$$) {
    my ($name, $vals, $sortpriority) = @_;
    my ($prefix, $i, @sorted);

    print "static const sbcs_data data_$name = {\n";
    print " {\n";

    # Forward table: one entry per byte, eight to a row. Each defined
    # byte is also remembered as [byte, unicode, priority].
    $prefix = " ";
    @sorted = ();
    for ($i = 0; $i < 256; $i++) {
        my $ucs = $vals->[$i];
        if ($ucs >= 0) {
            printf "%s0x%04x", $prefix, $ucs;
            die "ooh? $i\n" unless defined $sortpriority->[$i];
            push @sorted, [$i, $ucs, 0+$sortpriority->[$i]];
        } else {
            printf "%sERROR ", $prefix;
        }
        $prefix = ($i % 8 == 7) ? ",\n " : ", ";
    }
    print "\n },\n {\n";

    # Reverse table: order by Unicode value, then by descending
    # priority, then by byte value.
    @sorted = sort {
        $a->[1] <=> $b->[1] ||
        $b->[2] <=> $a->[2] ||
        $a->[0] <=> $b->[0]
    } @sorted;

    $prefix = " ";
    $uval = -1;
    $j = 0;
    foreach my $entry (@sorted) {
        # Only the first (highest-priority) byte for each Unicode
        # value is listed; later duplicates are skipped.
        next if ($uval == $entry->[1]);
        $uval = $entry->[1];
        printf "%s0x%02x", $prefix, $entry->[0];
        $prefix = ($j % 8 == 7) ? ",\n " : ", ";
        $j++;
    }

    # $j is now the number of entries written to the reverse table.
    printf "\n },\n %d\n", $j;
    print "};\n";
    print "const charset_spec charset_$name = {\n" .
          " $name, read_sbcs, write_sbcs, &data_$name\n};\n\n";
}
|
||||
29
puttysrc/CHARSET/SLOOKUP.C
Normal file
29
puttysrc/CHARSET/SLOOKUP.C
Normal file
@@ -0,0 +1,29 @@
|
||||
/*
|
||||
* slookup.c - static lookup of character sets.
|
||||
*/
|
||||
|
||||
#include "charset.h"
|
||||
#include "internal.h"
|
||||
|
||||
/*
 * "enum.c" is included twice with different definitions of
 * ENUM_CHARSET; it is expected to expand to one ENUM_CHARSET(name)
 * invocation per available character set.
 *
 * First pass: declare every charset_<name> specification.
 */
#define ENUM_CHARSET(x) extern const charset_spec charset_##x;
#include "enum.c"
#undef ENUM_CHARSET

/*
 * Second pass: collect the addresses of those specifications into
 * the table searched by charset_find_spec().
 */
static const charset_spec *const cs_table[] = {
#define ENUM_CHARSET(x) &charset_##x,
#include "enum.c"
#undef ENUM_CHARSET
};
|
||||
|
||||
charset_spec const *charset_find_spec(int charset)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < (int)lenof(cs_table); i++)
|
||||
if (cs_table[i]->charset == charset)
|
||||
return cs_table[i];
|
||||
|
||||
return NULL;
|
||||
}
|
||||
89
puttysrc/CHARSET/TOUCS.C
Normal file
89
puttysrc/CHARSET/TOUCS.C
Normal file
@@ -0,0 +1,89 @@
|
||||
/*
|
||||
* toucs.c - convert charsets to Unicode.
|
||||
*/
|
||||
|
||||
#include "charset.h"
|
||||
#include "internal.h"
|
||||
|
||||
/*
 * Context handed to unicode_emit() through its `ctx' pointer.
 */
struct unicode_emit_param {
    wchar_t *output;                   /* next free slot in the output buffer */
    int outlen;                        /* slots still available at `output' */
    wchar_t const *errstr;             /* written in place of ERROR; NULL
                                        * selects U+FFFD instead */
    int errlen;                        /* number of wchar_t in errstr */
    int stopped;                       /* set once an emission did not fit */
};
|
||||
|
||||
static void unicode_emit(void *ctx, long int output)
|
||||
{
|
||||
struct unicode_emit_param *param = (struct unicode_emit_param *)ctx;
|
||||
wchar_t outval;
|
||||
wchar_t const *p;
|
||||
int outlen;
|
||||
|
||||
if (output == ERROR) {
|
||||
if (param->errstr) {
|
||||
p = param->errstr;
|
||||
outlen = param->errlen;
|
||||
} else {
|
||||
outval = 0xFFFD; /* U+FFFD REPLACEMENT CHARACTER */
|
||||
p = &outval;
|
||||
outlen = 1;
|
||||
}
|
||||
} else {
|
||||
outval = output;
|
||||
p = &outval;
|
||||
outlen = 1;
|
||||
}
|
||||
|
||||
if (param->outlen >= outlen) {
|
||||
while (outlen > 0) {
|
||||
*param->output++ = *p++;
|
||||
param->outlen--;
|
||||
outlen--;
|
||||
}
|
||||
} else {
|
||||
param->stopped = 1;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Convert bytes in the given character set to wide characters.
 *
 * Bytes are taken one at a time from *input; after each byte that is
 * fully processed, *input is advanced and *inlen decremented, and
 * the conversion state is stored back through `state' (if it is not
 * NULL). Conversion ends when the input is exhausted or when the
 * output for a byte would not fit in the `outlen' slots at `output';
 * in the latter case that byte is left unconsumed.
 *
 * `errstr'/`errlen' give the wide-character sequence to write for
 * unconvertible input; if errstr is NULL, U+FFFD is written.
 *
 * Returns the number of wide characters written for the consumed
 * input.
 */
int charset_to_unicode(char **input, int *inlen, wchar_t *output, int outlen,
                       int charset, charset_state *state,
                       const wchar_t *errstr, int errlen)
{
    /*
     * NOTE(review): charset_find_spec() returns NULL for an unknown
     * charset, and spec is dereferenced below without a check -
     * confirm that callers only pass identifiers present in the
     * table.
     */
    charset_spec const *spec = charset_find_spec(charset);
    charset_state localstate;
    struct unicode_emit_param param;

    param.output = output;
    param.outlen = outlen;
    param.errstr = errstr;
    param.errlen = errlen;
    param.stopped = 0;

    /* Work on a copy so the caller's state only sees committed bytes. */
    if (!state) {
        localstate.s0 = 0;
    } else {
        localstate = *state;           /* structure copy */
    }

    while (*inlen > 0) {
        int lenbefore = param.output - output;

        spec->read(spec, (unsigned char)**input, &localstate,
                   unicode_emit, &param);
        if (param.stopped) {
            /*
             * The emit function has _tried_ to output some
             * characters, but ran up against the end of the
             * buffer. Leave immediately, and return what happened
             * _before_ attempting to process this character.
             */
            return lenbefore;
        }

        /* The byte was handled completely: commit state and advance. */
        if (state)
            *state = localstate;       /* structure copy */
        (*input)++;
        (*inlen)--;
    }

    return param.output - output;
}
|
||||
882
puttysrc/CHARSET/UTF8.C
Normal file
882
puttysrc/CHARSET/UTF8.C
Normal file
@@ -0,0 +1,882 @@
|
||||
/*
|
||||
* utf8.c - routines to handle UTF-8.
|
||||
*/
|
||||
|
||||
#ifndef ENUM_CHARSETS
|
||||
|
||||
#include "charset.h"
|
||||
#include "internal.h"
|
||||
|
||||
void read_utf8(charset_spec const *, long int, charset_state *,
|
||||
void (*)(void *, long int), void *);
|
||||
void write_utf8(charset_spec const *, long int,
|
||||
charset_state *, void (*)(void *, long int), void *);
|
||||
|
||||
/*
|
||||
* UTF-8 has no associated data, so `charset' may be ignored.
|
||||
*/
|
||||
|
||||
void read_utf8(charset_spec const *charset, long int input_chr,
|
||||
charset_state *state,
|
||||
void (*emit)(void *ctx, long int output), void *emitctx)
|
||||
{
|
||||
UNUSEDARG(charset);
|
||||
|
||||
/*
|
||||
* For reading UTF-8, the `state' word contains:
|
||||
*
|
||||
* - in bits 29-31, the number of bytes expected to be in the
|
||||
* current multibyte character (which we can tell instantly
|
||||
* from the first byte, of course).
|
||||
*
|
||||
* - in bits 26-28, the number of bytes _seen so far_ in the
|
||||
* current multibyte character.
|
||||
*
|
||||
* - in the remainder of the word, the current value of the
|
||||
* character, which is shifted upwards by 6 bits to
|
||||
* accommodate each new byte.
|
||||
*
|
||||
* As required, the state is zero when we are not in the middle
|
||||
* of a multibyte character at all.
|
||||
*
|
||||
* For example, when reading E9 8D 8B, starting at state=0:
|
||||
*
|
||||
* - after E9, the state is 0x64000009
|
||||
* - after 8D, the state is 0x6800024d
|
||||
* - after 8B, the state conceptually becomes 0x6c00934b, at
|
||||
* which point we notice we've got as many characters as we
|
||||
* were expecting, output U+934B, and reset the state to
|
||||
* zero.
|
||||
*
|
||||
* Note that the maximum number of bits we might need to store
|
||||
* in the character value field is 25 (U+7FFFFFFF contains 31
|
||||
* bits, but we will never actually store its full value
|
||||
* because when we receive the last 6 bits in the final
|
||||
* continuation byte we will output it and revert the state to
|
||||
* zero). Hence the character value field never collides with
|
||||
* the byte counts.
|
||||
*/
|
||||
|
||||
if (input_chr < 0x80) {
|
||||
/*
|
||||
* Single-byte character. If the state is nonzero before
|
||||
* coming here, output an error for an incomplete sequence.
|
||||
* Then output the character.
|
||||
*/
|
||||
if (state->s0 != 0) {
|
||||
emit(emitctx, ERROR);
|
||||
state->s0 = 0;
|
||||
}
|
||||
emit(emitctx, input_chr);
|
||||
} else if (input_chr == 0xFE || input_chr == 0xFF) {
|
||||
/*
|
||||
* FE and FF bytes should _never_ occur in UTF-8. They are
|
||||
* automatic errors; if the state was nonzero to start
|
||||
* with, output a further error for an incomplete sequence.
|
||||
*/
|
||||
if (state->s0 != 0) {
|
||||
emit(emitctx, ERROR);
|
||||
state->s0 = 0;
|
||||
}
|
||||
emit(emitctx, ERROR);
|
||||
} else if (input_chr >= 0x80 && input_chr < 0xC0) {
|
||||
/*
|
||||
* Continuation byte. Output an error for an unexpected
|
||||
* continuation byte, if the state is zero.
|
||||
*/
|
||||
if (state->s0 == 0) {
|
||||
emit(emitctx, ERROR);
|
||||
} else {
|
||||
unsigned long charval;
|
||||
unsigned long topstuff;
|
||||
int bytes;
|
||||
|
||||
/*
|
||||
* Otherwise, accumulate more of the character value.
|
||||
*/
|
||||
charval = state->s0 & 0x03ffffffL;
|
||||
charval = (charval << 6) | (input_chr & 0x3F);
|
||||
|
||||
/*
|
||||
* Check the byte counts; if we have not reached the
|
||||
* end of the character, update the state and return.
|
||||
*/
|
||||
topstuff = state->s0 & 0xfc000000L;
|
||||
topstuff += 0x04000000L; /* add one to the byte count */
|
||||
if (((topstuff << 3) ^ topstuff) & 0xe0000000L) {
|
||||
state->s0 = topstuff | charval;
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Now we know we've reached the end of the character.
|
||||
* `charval' is the Unicode value. We should check for
|
||||
* various invalid things, and then either output
|
||||
* charval or an error. In all cases we reset the state
|
||||
* to zero.
|
||||
*/
|
||||
bytes = topstuff >> 29;
|
||||
state->s0 = 0;
|
||||
|
||||
if (charval >= 0xD800 && charval < 0xE000) {
|
||||
/*
|
||||
* Surrogates (0xD800-0xDFFF) may never be encoded
|
||||
* in UTF-8. A surrogate pair in Unicode should
|
||||
* have been encoded as a single UTF-8 character
|
||||
* occupying more than three bytes.
|
||||
*/
|
||||
emit(emitctx, ERROR);
|
||||
} else if (charval == 0xFFFE || charval == 0xFFFF) {
|
||||
/*
|
||||
* U+FFFE and U+FFFF are invalid Unicode characters
|
||||
* and may never be encoded in UTF-8. (This is one
|
||||
* reason why U+FFFF is our way of signalling an
|
||||
* error to our `emit' function :-)
|
||||
*/
|
||||
emit(emitctx, ERROR);
|
||||
} else if ((charval <= 0x7FL /* && bytes > 1 */) ||
|
||||
(charval <= 0x7FFL && bytes > 2) ||
|
||||
(charval <= 0xFFFFL && bytes > 3) ||
|
||||
(charval <= 0x1FFFFFL && bytes > 4) ||
|
||||
(charval <= 0x3FFFFFFL && bytes > 5)) {
|
||||
/*
|
||||
* Overlong sequences are not to be tolerated,
|
||||
* under any circumstances.
|
||||
*/
|
||||
emit(emitctx, ERROR);
|
||||
} else {
|
||||
/*
|
||||
* Oh, all right. We'll let this one off.
|
||||
*/
|
||||
emit(emitctx, charval);
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
/*
|
||||
* Lead byte. First output an error for an incomplete
|
||||
* sequence, if the state is nonzero.
|
||||
*/
|
||||
if (state->s0 != 0)
|
||||
emit(emitctx, ERROR);
|
||||
|
||||
/*
|
||||
* Now deal with the lead byte: work out the number of
|
||||
* bytes we expect to see in this character, and extract
|
||||
* the initial bits of it too.
|
||||
*/
|
||||
if (input_chr >= 0xC0 && input_chr < 0xE0) {
|
||||
state->s0 = 0x44000000L | (input_chr & 0x1F);
|
||||
} else if (input_chr >= 0xE0 && input_chr < 0xF0) {
|
||||
state->s0 = 0x64000000L | (input_chr & 0x0F);
|
||||
} else if (input_chr >= 0xF0 && input_chr < 0xF8) {
|
||||
state->s0 = 0x84000000L | (input_chr & 0x07);
|
||||
} else if (input_chr >= 0xF8 && input_chr < 0xFC) {
|
||||
state->s0 = 0xa4000000L | (input_chr & 0x03);
|
||||
} else if (input_chr >= 0xFC && input_chr < 0xFE) {
|
||||
state->s0 = 0xc4000000L | (input_chr & 0x01);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* UTF-8 is a stateless multi-byte encoding (in the sense that just
|
||||
* after any character has been completed, the state is always the
|
||||
* same); hence when writing it, there is no need to use the
|
||||
* charset_state.
|
||||
*/
|
||||
|
||||
void write_utf8(charset_spec const *charset, long int input_chr,
|
||||
charset_state *state,
|
||||
void (*emit)(void *ctx, long int output), void *emitctx)
|
||||
{
|
||||
UNUSEDARG(charset);
|
||||
UNUSEDARG(state);
|
||||
|
||||
/*
|
||||
* Refuse to output any illegal code points.
|
||||
*/
|
||||
if (input_chr == 0xFFFE || input_chr == 0xFFFF ||
|
||||
(input_chr >= 0xD800 && input_chr < 0xE000)) {
|
||||
emit(emitctx, ERROR);
|
||||
} else if (input_chr < 0x80) { /* one-byte character */
|
||||
emit(emitctx, input_chr);
|
||||
} else if (input_chr < 0x800) { /* two-byte character */
|
||||
emit(emitctx, 0xC0 | (0x1F & (input_chr >> 6)));
|
||||
emit(emitctx, 0x80 | (0x3F & (input_chr )));
|
||||
} else if (input_chr < 0x10000) { /* three-byte character */
|
||||
emit(emitctx, 0xE0 | (0x0F & (input_chr >> 12)));
|
||||
emit(emitctx, 0x80 | (0x3F & (input_chr >> 6)));
|
||||
emit(emitctx, 0x80 | (0x3F & (input_chr )));
|
||||
} else if (input_chr < 0x200000) { /* four-byte character */
|
||||
emit(emitctx, 0xF0 | (0x07 & (input_chr >> 18)));
|
||||
emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
|
||||
emit(emitctx, 0x80 | (0x3F & (input_chr >> 6)));
|
||||
emit(emitctx, 0x80 | (0x3F & (input_chr )));
|
||||
} else if (input_chr < 0x4000000) {/* five-byte character */
|
||||
emit(emitctx, 0xF8 | (0x03 & (input_chr >> 24)));
|
||||
emit(emitctx, 0x80 | (0x3F & (input_chr >> 18)));
|
||||
emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
|
||||
emit(emitctx, 0x80 | (0x3F & (input_chr >> 6)));
|
||||
emit(emitctx, 0x80 | (0x3F & (input_chr )));
|
||||
} else { /* six-byte character */
|
||||
emit(emitctx, 0xFC | (0x01 & (input_chr >> 30)));
|
||||
emit(emitctx, 0x80 | (0x3F & (input_chr >> 24)));
|
||||
emit(emitctx, 0x80 | (0x3F & (input_chr >> 18)));
|
||||
emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
|
||||
emit(emitctx, 0x80 | (0x3F & (input_chr >> 6)));
|
||||
emit(emitctx, 0x80 | (0x3F & (input_chr )));
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef TESTMODE
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdarg.h>
|
||||
|
||||
int total_errs = 0;
|
||||
|
||||
/*
 * Emit callback for the self-tests: `ctx' points to a wchar_t write
 * cursor; store the value at the cursor and advance it by one.
 */
void utf8_emit(void *ctx, long output)
{
    wchar_t **cursor = (wchar_t **)ctx;

    **cursor = output;
    (*cursor)++;
}
|
||||
|
||||
void utf8_read_test(int line, char *input, int inlen, ...)
|
||||
{
|
||||
va_list ap;
|
||||
wchar_t *p, str[512];
|
||||
int i;
|
||||
charset_state state;
|
||||
unsigned long l;
|
||||
|
||||
state.s0 = 0;
|
||||
p = str;
|
||||
|
||||
for (i = 0; i < inlen; i++)
|
||||
read_utf8(NULL, input[i] & 0xFF, &state, utf8_emit, &p);
|
||||
|
||||
va_start(ap, inlen);
|
||||
l = 0;
|
||||
for (i = 0; i < p - str; i++) {
|
||||
l = va_arg(ap, long int);
|
||||
if (l == -1) {
|
||||
printf("%d: correct string shorter than output\n", line);
|
||||
total_errs++;
|
||||
break;
|
||||
}
|
||||
if (l != str[i]) {
|
||||
printf("%d: char %d came out as %08x, should be %08x\n",
|
||||
line, i, str[i], l);
|
||||
total_errs++;
|
||||
}
|
||||
}
|
||||
if (l != -1) {
|
||||
l = va_arg(ap, long int);
|
||||
if (l != -1) {
|
||||
printf("%d: correct string longer than output\n", line);
|
||||
total_errs++;
|
||||
}
|
||||
}
|
||||
va_end(ap);
|
||||
}
|
||||
|
||||
void utf8_write_test(int line, const long *input, int inlen, ...)
|
||||
{
|
||||
va_list ap;
|
||||
wchar_t *p, str[512];
|
||||
int i;
|
||||
charset_state state;
|
||||
unsigned long l;
|
||||
|
||||
state.s0 = 0;
|
||||
p = str;
|
||||
|
||||
for (i = 0; i < inlen; i++)
|
||||
write_utf8(NULL, input[i], &state, utf8_emit, &p);
|
||||
|
||||
va_start(ap, inlen);
|
||||
l = 0;
|
||||
for (i = 0; i < p - str; i++) {
|
||||
l = va_arg(ap, long int);
|
||||
if (l == -1) {
|
||||
printf("%d: correct string shorter than output\n", line);
|
||||
total_errs++;
|
||||
break;
|
||||
}
|
||||
if (l != str[i]) {
|
||||
printf("%d: char %d came out as %08x, should be %08x\n",
|
||||
line, i, str[i], l);
|
||||
total_errs++;
|
||||
}
|
||||
}
|
||||
if (l != -1) {
|
||||
l = va_arg(ap, long int);
|
||||
if (l != -1) {
|
||||
printf("%d: correct string longer than output\n", line);
|
||||
total_errs++;
|
||||
}
|
||||
}
|
||||
va_end(ap);
|
||||
}
|
||||
|
||||
/* Macro to concoct the first three parameters of utf8_read_test. */
|
||||
#define TESTSTR(x) __LINE__, x, lenof(x)
|
||||
|
||||
int main(void)
|
||||
{
|
||||
printf("read tests beginning\n");
|
||||
utf8_read_test(TESTSTR("\xCE\xBA\xE1\xBD\xB9\xCF\x83\xCE\xBC\xCE\xB5"),
|
||||
0x000003BA, /* GREEK SMALL LETTER KAPPA */
|
||||
0x00001F79, /* GREEK SMALL LETTER OMICRON WITH OXIA */
|
||||
0x000003C3, /* GREEK SMALL LETTER SIGMA */
|
||||
0x000003BC, /* GREEK SMALL LETTER MU */
|
||||
0x000003B5, /* GREEK SMALL LETTER EPSILON */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\x00"),
|
||||
0x00000000, /* <control> */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xC2\x80"),
|
||||
0x00000080, /* <control> */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xE0\xA0\x80"),
|
||||
0x00000800, /* <no name available> */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xF0\x90\x80\x80"),
|
||||
0x00010000, /* <no name available> */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xF8\x88\x80\x80\x80"),
|
||||
0x00200000, /* <no name available> */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xFC\x84\x80\x80\x80\x80"),
|
||||
0x04000000, /* <no name available> */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\x7F"),
|
||||
0x0000007F, /* <control> */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xDF\xBF"),
|
||||
0x000007FF, /* <no name available> */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xEF\xBF\xBD"),
|
||||
0x0000FFFD, /* REPLACEMENT CHARACTER */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xEF\xBF\xBF"),
|
||||
ERROR, /* <no name available> (invalid char) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xF7\xBF\xBF\xBF"),
|
||||
0x001FFFFF, /* <no name available> */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xFB\xBF\xBF\xBF\xBF"),
|
||||
0x03FFFFFF, /* <no name available> */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xFD\xBF\xBF\xBF\xBF\xBF"),
|
||||
0x7FFFFFFF, /* <no name available> */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xED\x9F\xBF"),
|
||||
0x0000D7FF, /* <no name available> */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xEE\x80\x80"),
|
||||
0x0000E000, /* <Private Use, First> */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xEF\xBF\xBD"),
|
||||
0x0000FFFD, /* REPLACEMENT CHARACTER */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xF4\x8F\xBF\xBF"),
|
||||
0x0010FFFF, /* <no name available> */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xF4\x90\x80\x80"),
|
||||
0x00110000, /* <no name available> */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\x80"),
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xBF"),
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\x80\xBF"),
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\x80\xBF\x80"),
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\x80\xBF\x80\xBF"),
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80"),
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80\xBF"),
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80\xBF\x80"),
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF"),
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
ERROR, /* (unexpected continuation byte) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xC0\x20\xC1\x20\xC2\x20\xC3\x20\xC4\x20\xC5\x20\xC6\x20\xC7\x20"),
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xE0\x20\xE1\x20\xE2\x20\xE3\x20\xE4\x20\xE5\x20\xE6\x20\xE7\x20\xE8\x20\xE9\x20\xEA\x20\xEB\x20\xEC\x20\xED\x20\xEE\x20\xEF\x20"),
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xF0\x20\xF1\x20\xF2\x20\xF3\x20\xF4\x20\xF5\x20\xF6\x20\xF7\x20"),
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xF8\x20\xF9\x20\xFA\x20\xFB\x20"),
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xFC\x20\xFD\x20"),
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0x00000020, /* SPACE */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xC0"),
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xE0\x80"),
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xF0\x80\x80"),
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xF8\x80\x80\x80"),
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80"),
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xDF"),
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xEF\xBF"),
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xF7\xBF\xBF"),
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xFB\xBF\xBF\xBF"),
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xFD\xBF\xBF\xBF\xBF"),
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xC0\xE0\x80\xF0\x80\x80\xF8\x80\x80\x80\xFC\x80\x80\x80\x80\xDF\xEF\xBF\xF7\xBF\xBF\xFB\xBF\xBF\xBF\xFD\xBF\xBF\xBF\xBF"),
|
||||
ERROR, /* (incomplete sequence) */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
ERROR, /* (incomplete sequence) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xFE"),
|
||||
ERROR, /* (invalid UTF-8 byte) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xFF"),
|
||||
ERROR, /* (invalid UTF-8 byte) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xFE\xFE\xFF\xFF"),
|
||||
ERROR, /* (invalid UTF-8 byte) */
|
||||
ERROR, /* (invalid UTF-8 byte) */
|
||||
ERROR, /* (invalid UTF-8 byte) */
|
||||
ERROR, /* (invalid UTF-8 byte) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xC0\xAF"),
|
||||
ERROR, /* SOLIDUS (overlong form of 2F) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xE0\x80\xAF"),
|
||||
ERROR, /* SOLIDUS (overlong form of 2F) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xF0\x80\x80\xAF"),
|
||||
ERROR, /* SOLIDUS (overlong form of 2F) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xF8\x80\x80\x80\xAF"),
|
||||
ERROR, /* SOLIDUS (overlong form of 2F) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80\xAF"),
|
||||
ERROR, /* SOLIDUS (overlong form of 2F) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xC1\xBF"),
|
||||
ERROR, /* <control> (overlong form of 7F) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xE0\x9F\xBF"),
|
||||
ERROR, /* <no name available> (overlong form of DF BF) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xF0\x8F\xBF\xBF"),
|
||||
ERROR, /* <no name available> (overlong form of EF BF BF) (invalid char) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xF8\x87\xBF\xBF\xBF"),
|
||||
ERROR, /* <no name available> (overlong form of F7 BF BF BF) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xFC\x83\xBF\xBF\xBF\xBF"),
|
||||
ERROR, /* <no name available> (overlong form of FB BF BF BF BF) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xC0\x80"),
|
||||
ERROR, /* <control> (overlong form of 00) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xE0\x80\x80"),
|
||||
ERROR, /* <control> (overlong form of 00) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xF0\x80\x80\x80"),
|
||||
ERROR, /* <control> (overlong form of 00) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xF8\x80\x80\x80\x80"),
|
||||
ERROR, /* <control> (overlong form of 00) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80\x80"),
|
||||
ERROR, /* <control> (overlong form of 00) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xED\xA0\x80"),
|
||||
ERROR, /* <Non Private Use High Surrogate, First> (surrogate) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xED\xAD\xBF"),
|
||||
ERROR, /* <Non Private Use High Surrogate, Last> (surrogate) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xED\xAE\x80"),
|
||||
ERROR, /* <Private Use High Surrogate, First> (surrogate) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xED\xAF\xBF"),
|
||||
ERROR, /* <Private Use High Surrogate, Last> (surrogate) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xED\xB0\x80"),
|
||||
ERROR, /* <Low Surrogate, First> (surrogate) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xED\xBE\x80"),
|
||||
ERROR, /* <no name available> (surrogate) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xED\xBF\xBF"),
|
||||
ERROR, /* <Low Surrogate, Last> (surrogate) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xED\xA0\x80\xED\xB0\x80"),
|
||||
ERROR, /* <Non Private Use High Surrogate, First> (surrogate) */
|
||||
ERROR, /* <Low Surrogate, First> (surrogate) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xED\xA0\x80\xED\xBF\xBF"),
|
||||
ERROR, /* <Non Private Use High Surrogate, First> (surrogate) */
|
||||
ERROR, /* <Low Surrogate, Last> (surrogate) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xED\xAD\xBF\xED\xB0\x80"),
|
||||
ERROR, /* <Non Private Use High Surrogate, Last> (surrogate) */
|
||||
ERROR, /* <Low Surrogate, First> (surrogate) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xED\xAD\xBF\xED\xBF\xBF"),
|
||||
ERROR, /* <Non Private Use High Surrogate, Last> (surrogate) */
|
||||
ERROR, /* <Low Surrogate, Last> (surrogate) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xED\xAE\x80\xED\xB0\x80"),
|
||||
ERROR, /* <Private Use High Surrogate, First> (surrogate) */
|
||||
ERROR, /* <Low Surrogate, First> (surrogate) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xED\xAE\x80\xED\xBF\xBF"),
|
||||
ERROR, /* <Private Use High Surrogate, First> (surrogate) */
|
||||
ERROR, /* <Low Surrogate, Last> (surrogate) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xED\xAF\xBF\xED\xB0\x80"),
|
||||
ERROR, /* <Private Use High Surrogate, Last> (surrogate) */
|
||||
ERROR, /* <Low Surrogate, First> (surrogate) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xED\xAF\xBF\xED\xBF\xBF"),
|
||||
ERROR, /* <Private Use High Surrogate, Last> (surrogate) */
|
||||
ERROR, /* <Low Surrogate, Last> (surrogate) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xEF\xBF\xBE"),
|
||||
ERROR, /* <no name available> (invalid char) */
|
||||
0, -1);
|
||||
utf8_read_test(TESTSTR("\xEF\xBF\xBF"),
|
||||
ERROR, /* <no name available> (invalid char) */
|
||||
0, -1);
|
||||
printf("read tests completed\n");
|
||||
printf("write tests beginning\n");
|
||||
{
|
||||
const static long str[] =
|
||||
{0x03BAL, 0x1F79L, 0x03C3L, 0x03BCL, 0x03B5L, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
0xCE, 0xBA,
|
||||
0xE1, 0xBD, 0xB9,
|
||||
0xCF, 0x83,
|
||||
0xCE, 0xBC,
|
||||
0xCE, 0xB5,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0x0000L, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
0x00,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0x0080L, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
0xC2, 0x80,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0x0800L, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
0xE0, 0xA0, 0x80,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0x00010000L, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
0xF0, 0x90, 0x80, 0x80,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0x00200000L, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
0xF8, 0x88, 0x80, 0x80, 0x80,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0x04000000L, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
0xFC, 0x84, 0x80, 0x80, 0x80, 0x80,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0x007FL, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
0x7F,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0x07FFL, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
0xDF, 0xBF,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0xFFFDL, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
0xEF, 0xBF, 0xBD,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0xFFFFL, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
ERROR,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0x001FFFFFL, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
0xF7, 0xBF, 0xBF, 0xBF,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0x03FFFFFFL, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
0xFB, 0xBF, 0xBF, 0xBF, 0xBF,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0x7FFFFFFFL, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0xD7FFL, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
0xED, 0x9F, 0xBF,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0xD800L, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
ERROR,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0xD800L, 0xDC00L, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
ERROR,
|
||||
ERROR,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0xDFFFL, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
ERROR,
|
||||
0, -1);
|
||||
}
|
||||
{
|
||||
const static long str[] = {0xE000L, 0};
|
||||
utf8_write_test(TESTSTR(str),
|
||||
0xEE, 0x80, 0x80,
|
||||
0, -1);
|
||||
}
|
||||
printf("write tests completed\n");
|
||||
|
||||
printf("total: %d errors\n", total_errs);
|
||||
return (total_errs != 0);
|
||||
}
|
||||
#endif /* TESTMODE */
|
||||
|
||||
const charset_spec charset_CS_UTF8 = {
|
||||
CS_UTF8, read_utf8, write_utf8, NULL
|
||||
};
|
||||
|
||||
#else /* ENUM_CHARSETS */
|
||||
|
||||
ENUM_CHARSET(CS_UTF8)
|
||||
|
||||
#endif /* ENUM_CHARSETS */
|
||||
93
puttysrc/CHARSET/XENC.C
Normal file
93
puttysrc/CHARSET/XENC.C
Normal file
@@ -0,0 +1,93 @@
|
||||
/*
|
||||
* xenc.c - translate our internal character set codes to and from
|
||||
* X11 character encoding names.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <ctype.h>
|
||||
#include "charset.h"
|
||||
#include "internal.h"
|
||||
|
||||
static const struct {
|
||||
const char *name;
|
||||
int charset;
|
||||
} xencs[] = {
|
||||
/*
|
||||
* Officially registered encoding names. This list is derived
|
||||
* from the font encodings section of
|
||||
*
|
||||
* http://ftp.x.org/pub/DOCS/registry
|
||||
*
|
||||
* Where multiple encoding names map to the same encoding id
|
||||
* (such as iso8859-15 and fcd8859-15), the first is considered
|
||||
* canonical and will be returned when translating the id to a
|
||||
* string.
|
||||
*/
|
||||
{ "iso8859-1", CS_ISO8859_1 },
|
||||
{ "iso8859-2", CS_ISO8859_2 },
|
||||
{ "iso8859-3", CS_ISO8859_3 },
|
||||
{ "iso8859-4", CS_ISO8859_4 },
|
||||
{ "iso8859-5", CS_ISO8859_5 },
|
||||
{ "iso8859-6", CS_ISO8859_6 },
|
||||
{ "iso8859-7", CS_ISO8859_7 },
|
||||
{ "iso8859-8", CS_ISO8859_8 },
|
||||
{ "iso8859-9", CS_ISO8859_9 },
|
||||
{ "iso8859-10", CS_ISO8859_10 },
|
||||
{ "iso8859-13", CS_ISO8859_13 },
|
||||
{ "iso8859-14", CS_ISO8859_14 },
|
||||
{ "iso8859-15", CS_ISO8859_15 },
|
||||
{ "fcd8859-15", CS_ISO8859_15 },
|
||||
{ "hp-roman8", CS_HP_ROMAN8 },
|
||||
{ "koi8-r", CS_KOI8_R },
|
||||
/*
|
||||
* Unofficial encoding names found in the wild.
|
||||
*/
|
||||
{ "iso8859-16", CS_ISO8859_16 },
|
||||
{ "koi8-u", CS_KOI8_U },
|
||||
{ "ibm-cp437", CS_CP437 },
|
||||
{ "ibm-cp850", CS_CP850 },
|
||||
{ "ibm-cp866", CS_CP866 },
|
||||
{ "microsoft-cp1250", CS_CP1250 },
|
||||
{ "microsoft-cp1251", CS_CP1251 },
|
||||
{ "microsoft-cp1252", CS_CP1252 },
|
||||
{ "microsoft-cp1253", CS_CP1253 },
|
||||
{ "microsoft-cp1254", CS_CP1254 },
|
||||
{ "microsoft-cp1255", CS_CP1255 },
|
||||
{ "microsoft-cp1256", CS_CP1256 },
|
||||
{ "microsoft-cp1257", CS_CP1257 },
|
||||
{ "microsoft-cp1258", CS_CP1258 },
|
||||
{ "mac-roman", CS_MAC_ROMAN },
|
||||
{ "viscii1.1-1", CS_VISCII },
|
||||
{ "viscii1-1", CS_VISCII },
|
||||
};
|
||||
|
||||
const char *charset_to_xenc(int charset)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < (int)lenof(xencs); i++)
|
||||
if (charset == xencs[i].charset)
|
||||
return xencs[i].name;
|
||||
|
||||
return NULL; /* not found */
|
||||
}
|
||||
|
||||
int charset_from_xenc(const char *name)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < (int)lenof(xencs); i++) {
|
||||
const char *p, *q;
|
||||
p = name;
|
||||
q = xencs[i].name;
|
||||
while (*p || *q) {
|
||||
if (tolower(*p) != tolower(*q))
|
||||
break;
|
||||
p++; q++;
|
||||
}
|
||||
if (!*p && !*q)
|
||||
return xencs[i].charset;
|
||||
}
|
||||
|
||||
return CS_NONE; /* not found */
|
||||
}
|
||||
Reference in New Issue
Block a user