/*
 * _codecs_iso2022.c: Codecs collection for ISO-2022 encodings.
 *
 * Written by Hye-Shik Chang <perky@FreeBSD.org>
 */
#define USING_IMPORTED_MAPS | |
#define USING_BINARY_PAIR_SEARCH | |
#define EXTERN_JISX0213_PAIR | |
#define EMULATE_JISX0213_2000_ENCODE_INVALID MAP_UNMAPPABLE | |
#define EMULATE_JISX0213_2000_DECODE_INVALID MAP_UNMAPPABLE | |
#include "cjkcodecs.h" | |
#include "alg_jisx0201.h" | |
#include "emu_jisx0213_2000.h" | |
#include "mappings_jisx0213_pair.h" | |
/* STATE

   state->c[0-3]

    00000000
    ||^^^^^|
    |+-----+----  G0-3 Character Set
    +-----------  Is G0-3 double byte?

   state->c[4]

    00000000
          ||
          |+----  Locked-Shift?
          +-----  ESC Throughout
*/
/* Control bytes the codec reacts to. */
#define ESC                         0x1B
#define SO                          0x0E
#define SI                          0x0F
#define LF                          0x0A

/* The escape-sequence scanner gives up once it has looked at this many
 * byte positions without finding a final byte. */
#define MAX_ESCSEQLEN               16

/* Bit OR-ed into a charset mark to flag a double-byte charset. */
#define CHARSET_DBCS                0x80

/* Single-byte charsets: the mark is the final byte of the designation. */
#define CHARSET_ISO8859_1           'A'
#define CHARSET_ASCII               'B'
#define CHARSET_ISO8859_7           'F'
#define CHARSET_JISX0201_K          'I'
#define CHARSET_JISX0201_R          'J'

/* Double-byte charsets: final byte combined with CHARSET_DBCS. */
#define CHARSET_JISX0208_O          ('@'|CHARSET_DBCS)
#define CHARSET_GB2312              ('A'|CHARSET_DBCS)
#define CHARSET_JISX0208            ('B'|CHARSET_DBCS)
#define CHARSET_KSX1001             ('C'|CHARSET_DBCS)
#define CHARSET_JISX0212            ('D'|CHARSET_DBCS)
#define CHARSET_GB2312_8565         ('E'|CHARSET_DBCS)
#define CHARSET_CNS11643_1          ('G'|CHARSET_DBCS)
#define CHARSET_CNS11643_2          ('H'|CHARSET_DBCS)
#define CHARSET_JISX0213_2000_1     ('O'|CHARSET_DBCS)
#define CHARSET_JISX0213_2          ('P'|CHARSET_DBCS)
#define CHARSET_JISX0213_2004_1     ('Q'|CHARSET_DBCS)

/* Strip the double-byte flag to recover the final byte of a mark. */
#define ESCMARK(mark)               ((mark) & 0x7f)

/* True for bytes that terminate an escape sequence ('@' and 'A'..'Z'). */
#define IS_ESCEND(c)    (((c) >= 'A' && (c) <= 'Z') || (c) == '@')

/* True when c2, the byte following ESC, starts a sequence this codec
 * handles.
 * this is not a complete list of ISO-2022 escape sequence headers.
 * but, it's enough to implement CJK instances of iso-2022. */
#define IS_ISO2022ESC(c2)                               \
    ((c2) == '(' || (c2) == ')' || (c2) == '$' ||       \
     (c2) == '.' || (c2) == '&')

/* Sentinel results of the per-charset encoder/decoder callbacks. */
#define MAP_UNMAPPABLE              0xFFFF
#define MAP_MULTIPLE_AVAIL          0xFFFE /* for JIS X 0213 */
/* Bits kept in state->c[4]. */
#define F_SHIFTED               0x01
#define F_ESCTHROUGHOUT         0x02

/* Accessors for the charset held in G0..G3 (state->c[0..3]).  They expect
 * a variable named `state` in scope.  The setters already end in ';', so
 * call sites in this file do not add one. */
#define STATE_SETG(dn, v)       ((state)->c[dn]) = (v);
#define STATE_GETG(dn)          ((state)->c[dn])

#define STATE_G0                STATE_GETG(0)
#define STATE_G1                STATE_GETG(1)
#define STATE_G2                STATE_GETG(2)
#define STATE_G3                STATE_GETG(3)

#define STATE_SETG0(v)          STATE_SETG(0, v)
#define STATE_SETG1(v)          STATE_SETG(1, v)
#define STATE_SETG2(v)          STATE_SETG(2, v)
#define STATE_SETG3(v)          STATE_SETG(3, v)

/* Flag helpers for state->c[4]; the mutating ones also end in ';'. */
#define STATE_SETFLAG(f)        ((state)->c[4]) |= (f);
#define STATE_GETFLAG(f)        ((state)->c[4] & (f))
#define STATE_CLEARFLAG(f)      ((state)->c[4]) &= ~(f);
#define STATE_CLEARFLAGS()      ((state)->c[4]) = 0;

/* View of the opaque `config` pointer as this codec's configuration. */
#define ISO2022_CONFIG          ((const struct iso2022_config *)config)
#define CONFIG_ISSET(flag)      (ISO2022_CONFIG->flags & (flag))
#define CONFIG_DESIGNATIONS     (ISO2022_CONFIG->designations)

/* iso2022_config.flags */
#define NO_SHIFT                0x01
#define USE_G2                  0x02
#define USE_JISX0208_EXT        0x04
/*-*- internal data structures -*-*/
typedef int (*iso2022_init_func)(void); | |
typedef ucs4_t (*iso2022_decode_func)(const unsigned char *data); | |
typedef DBCHAR (*iso2022_encode_func)(const ucs4_t *data, Py_ssize_t *length); | |
struct iso2022_designation { | |
unsigned char mark; | |
unsigned char plane; | |
unsigned char width; | |
iso2022_init_func initializer; | |
iso2022_decode_func decoder; | |
iso2022_encode_func encoder; | |
}; | |
/* Per-variant configuration handed to the codec as its `config`. */
struct iso2022_config {
    /* bitwise OR of NO_SHIFT, USE_G2 and USE_JISX0208_EXT */
    int flags;
    /* non-ascii desigs */
    const struct iso2022_designation *designations;
};
/*-*- iso-2022 codec implementation -*-*/
CODEC_INIT(iso2022) | |
{ | |
const struct iso2022_designation *desig = CONFIG_DESIGNATIONS; | |
for (desig = CONFIG_DESIGNATIONS; desig->mark; desig++) | |
if (desig->initializer != NULL && desig->initializer() != 0) | |
return -1; | |
return 0; | |
} | |
ENCODER_INIT(iso2022) | |
{ | |
STATE_CLEARFLAGS() | |
STATE_SETG0(CHARSET_ASCII) | |
STATE_SETG1(CHARSET_ASCII) | |
return 0; | |
} | |
ENCODER_RESET(iso2022) | |
{ | |
if (STATE_GETFLAG(F_SHIFTED)) { | |
WRITE1(SI) | |
NEXT_OUT(1) | |
STATE_CLEARFLAG(F_SHIFTED) | |
} | |
if (STATE_G0 != CHARSET_ASCII) { | |
WRITE3(ESC, '(', 'B') | |
NEXT_OUT(3) | |
STATE_SETG0(CHARSET_ASCII) | |
} | |
return 0; | |
} | |
ENCODER(iso2022) | |
{ | |
while (inleft > 0) { | |
const struct iso2022_designation *dsg; | |
DBCHAR encoded; | |
ucs4_t c = **inbuf; | |
Py_ssize_t insize; | |
if (c < 0x80) { | |
if (STATE_G0 != CHARSET_ASCII) { | |
WRITE3(ESC, '(', 'B') | |
STATE_SETG0(CHARSET_ASCII) | |
NEXT_OUT(3) | |
} | |
if (STATE_GETFLAG(F_SHIFTED)) { | |
WRITE1(SI) | |
STATE_CLEARFLAG(F_SHIFTED) | |
NEXT_OUT(1) | |
} | |
WRITE1((unsigned char)c) | |
NEXT(1, 1) | |
continue; | |
} | |
DECODE_SURROGATE(c) | |
insize = GET_INSIZE(c); | |
encoded = MAP_UNMAPPABLE; | |
for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) { | |
Py_ssize_t length = 1; | |
encoded = dsg->encoder(&c, &length); | |
if (encoded == MAP_MULTIPLE_AVAIL) { | |
/* this implementation won't work for pair | |
* of non-bmp characters. */ | |
if (inleft < 2) { | |
if (!(flags & MBENC_FLUSH)) | |
return MBERR_TOOFEW; | |
length = -1; | |
} | |
else | |
length = 2; | |
#if Py_UNICODE_SIZE == 2 | |
if (length == 2) { | |
ucs4_t u4in[2]; | |
u4in[0] = (ucs4_t)IN1; | |
u4in[1] = (ucs4_t)IN2; | |
encoded = dsg->encoder(u4in, &length); | |
} else | |
encoded = dsg->encoder(&c, &length); | |
#else | |
encoded = dsg->encoder(&c, &length); | |
#endif | |
if (encoded != MAP_UNMAPPABLE) { | |
insize = length; | |
break; | |
} | |
} | |
else if (encoded != MAP_UNMAPPABLE) | |
break; | |
} | |
if (!dsg->mark) | |
return 1; | |
assert(dsg->width == 1 || dsg->width == 2); | |
switch (dsg->plane) { | |
case 0: /* G0 */ | |
if (STATE_GETFLAG(F_SHIFTED)) { | |
WRITE1(SI) | |
STATE_CLEARFLAG(F_SHIFTED) | |
NEXT_OUT(1) | |
} | |
if (STATE_G0 != dsg->mark) { | |
if (dsg->width == 1) { | |
WRITE3(ESC, '(', ESCMARK(dsg->mark)) | |
STATE_SETG0(dsg->mark) | |
NEXT_OUT(3) | |
} | |
else if (dsg->mark == CHARSET_JISX0208) { | |
WRITE3(ESC, '$', ESCMARK(dsg->mark)) | |
STATE_SETG0(dsg->mark) | |
NEXT_OUT(3) | |
} | |
else { | |
WRITE4(ESC, '$', '(', | |
ESCMARK(dsg->mark)) | |
STATE_SETG0(dsg->mark) | |
NEXT_OUT(4) | |
} | |
} | |
break; | |
case 1: /* G1 */ | |
if (STATE_G1 != dsg->mark) { | |
if (dsg->width == 1) { | |
WRITE3(ESC, ')', ESCMARK(dsg->mark)) | |
STATE_SETG1(dsg->mark) | |
NEXT_OUT(3) | |
} | |
else { | |
WRITE4(ESC, '$', ')', | |
ESCMARK(dsg->mark)) | |
STATE_SETG1(dsg->mark) | |
NEXT_OUT(4) | |
} | |
} | |
if (!STATE_GETFLAG(F_SHIFTED)) { | |
WRITE1(SO) | |
STATE_SETFLAG(F_SHIFTED) | |
NEXT_OUT(1) | |
} | |
break; | |
default: /* G2 and G3 is not supported: no encoding in | |
* CJKCodecs are using them yet */ | |
return MBERR_INTERNAL; | |
} | |
if (dsg->width == 1) { | |
WRITE1((unsigned char)encoded) | |
NEXT_OUT(1) | |
} | |
else { | |
WRITE2(encoded >> 8, encoded & 0xff) | |
NEXT_OUT(2) | |
} | |
NEXT_IN(insize) | |
} | |
return 0; | |
} | |
DECODER_INIT(iso2022) | |
{ | |
STATE_CLEARFLAGS() | |
STATE_SETG0(CHARSET_ASCII) | |
STATE_SETG1(CHARSET_ASCII) | |
STATE_SETG2(CHARSET_ASCII) | |
return 0; | |
} | |
DECODER_RESET(iso2022) | |
{ | |
STATE_SETG0(CHARSET_ASCII) | |
STATE_CLEARFLAG(F_SHIFTED) | |
return 0; | |
} | |
static Py_ssize_t | |
iso2022processesc(const void *config, MultibyteCodec_State *state, | |
const unsigned char **inbuf, Py_ssize_t *inleft) | |
{ | |
unsigned char charset, designation; | |
Py_ssize_t i, esclen; | |
for (i = 1;i < MAX_ESCSEQLEN;i++) { | |
if (i >= *inleft) | |
return MBERR_TOOFEW; | |
if (IS_ESCEND((*inbuf)[i])) { | |
esclen = i + 1; | |
break; | |
} | |
else if (CONFIG_ISSET(USE_JISX0208_EXT) && i+1 < *inleft && | |
(*inbuf)[i] == '&' && (*inbuf)[i+1] == '@') | |
i += 2; | |
} | |
if (i >= MAX_ESCSEQLEN) | |
return 1; /* unterminated escape sequence */ | |
switch (esclen) { | |
case 3: | |
if (IN2 == '$') { | |
charset = IN3 | CHARSET_DBCS; | |
designation = 0; | |
} | |
else { | |
charset = IN3; | |
if (IN2 == '(') designation = 0; | |
else if (IN2 == ')') designation = 1; | |
else if (CONFIG_ISSET(USE_G2) && IN2 == '.') | |
designation = 2; | |
else return 3; | |
} | |
break; | |
case 4: | |
if (IN2 != '$') | |
return 4; | |
charset = IN4 | CHARSET_DBCS; | |
if (IN3 == '(') designation = 0; | |
else if (IN3 == ')') designation = 1; | |
else return 4; | |
break; | |
case 6: /* designation with prefix */ | |
if (CONFIG_ISSET(USE_JISX0208_EXT) && | |
(*inbuf)[3] == ESC && (*inbuf)[4] == '$' && | |
(*inbuf)[5] == 'B') { | |
charset = 'B' | CHARSET_DBCS; | |
designation = 0; | |
} | |
else | |
return 6; | |
break; | |
default: | |
return esclen; | |
} | |
/* raise error when the charset is not designated for this encoding */ | |
if (charset != CHARSET_ASCII) { | |
const struct iso2022_designation *dsg; | |
for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) | |
if (dsg->mark == charset) | |
break; | |
if (!dsg->mark) | |
return esclen; | |
} | |
STATE_SETG(designation, charset) | |
*inleft -= esclen; | |
(*inbuf) += esclen; | |
return 0; | |
} | |
/* Map the ISO-8859-7 byte `c` to its Unicode value and store it in
 * `assi`.  Expands to an if / else-if chain with no final else, so the
 * caller appends its own `else` for bytes that have no mapping.
 *
 * The two hex constants are bitmaps of which positions are valid.  The
 * shift count can reach 31, so the bit tests use unsigned long: shifting
 * a signed 1L into the sign bit is undefined where long is 32 bits. */
#define ISO8859_7_DECODE(c, assi)                                       \
    if ((c) < 0xa0) (assi) = (c);                                       \
    else if ((c) < 0xc0 && (0x288f3bc9UL & (1UL << ((c)-0xa0))))        \
        (assi) = (c);                                                   \
    else if ((c) >= 0xb4 && (c) <= 0xfe && ((c) >= 0xd4 ||              \
             (0xbffffd77UL & (1UL << ((c)-0xb4)))))                     \
        (assi) = 0x02d0 + (c);                                          \
    else if ((c) == 0xa1) (assi) = 0x2018;                              \
    else if ((c) == 0xa2) (assi) = 0x2019;                              \
    else if ((c) == 0xaf) (assi) = 0x2015;
static Py_ssize_t | |
iso2022processg2(const void *config, MultibyteCodec_State *state, | |
const unsigned char **inbuf, Py_ssize_t *inleft, | |
Py_UNICODE **outbuf, Py_ssize_t *outleft) | |
{ | |
/* not written to use encoder, decoder functions because only few | |
* encodings use G2 designations in CJKCodecs */ | |
if (STATE_G2 == CHARSET_ISO8859_1) { | |
if (IN3 < 0x80) | |
OUT1(IN3 + 0x80) | |
else | |
return 3; | |
} | |
else if (STATE_G2 == CHARSET_ISO8859_7) { | |
ISO8859_7_DECODE(IN3 ^ 0x80, **outbuf) | |
else return 3; | |
} | |
else if (STATE_G2 == CHARSET_ASCII) { | |
if (IN3 & 0x80) return 3; | |
else **outbuf = IN3; | |
} | |
else | |
return MBERR_INTERNAL; | |
(*inbuf) += 3; | |
*inleft -= 3; | |
(*outbuf) += 1; | |
*outleft -= 1; | |
return 0; | |
} | |
DECODER(iso2022) | |
{ | |
const struct iso2022_designation *dsgcache = NULL; | |
while (inleft > 0) { | |
unsigned char c = IN1; | |
Py_ssize_t err; | |
if (STATE_GETFLAG(F_ESCTHROUGHOUT)) { | |
/* ESC throughout mode: | |
* for non-iso2022 escape sequences */ | |
WRITE1(c) /* assume as ISO-8859-1 */ | |
NEXT(1, 1) | |
if (IS_ESCEND(c)) { | |
STATE_CLEARFLAG(F_ESCTHROUGHOUT) | |
} | |
continue; | |
} | |
switch (c) { | |
case ESC: | |
REQUIRE_INBUF(2) | |
if (IS_ISO2022ESC(IN2)) { | |
err = iso2022processesc(config, state, | |
inbuf, &inleft); | |
if (err != 0) | |
return err; | |
} | |
else if (CONFIG_ISSET(USE_G2) && IN2 == 'N') {/* SS2 */ | |
REQUIRE_INBUF(3) | |
err = iso2022processg2(config, state, | |
inbuf, &inleft, outbuf, &outleft); | |
if (err != 0) | |
return err; | |
} | |
else { | |
WRITE1(ESC) | |
STATE_SETFLAG(F_ESCTHROUGHOUT) | |
NEXT(1, 1) | |
} | |
break; | |
case SI: | |
if (CONFIG_ISSET(NO_SHIFT)) | |
goto bypass; | |
STATE_CLEARFLAG(F_SHIFTED) | |
NEXT_IN(1) | |
break; | |
case SO: | |
if (CONFIG_ISSET(NO_SHIFT)) | |
goto bypass; | |
STATE_SETFLAG(F_SHIFTED) | |
NEXT_IN(1) | |
break; | |
case LF: | |
STATE_CLEARFLAG(F_SHIFTED) | |
WRITE1(LF) | |
NEXT(1, 1) | |
break; | |
default: | |
if (c < 0x20) /* C0 */ | |
goto bypass; | |
else if (c >= 0x80) | |
return 1; | |
else { | |
const struct iso2022_designation *dsg; | |
unsigned char charset; | |
ucs4_t decoded; | |
if (STATE_GETFLAG(F_SHIFTED)) | |
charset = STATE_G1; | |
else | |
charset = STATE_G0; | |
if (charset == CHARSET_ASCII) { | |
bypass: WRITE1(c) | |
NEXT(1, 1) | |
break; | |
} | |
if (dsgcache != NULL && | |
dsgcache->mark == charset) | |
dsg = dsgcache; | |
else { | |
for (dsg = CONFIG_DESIGNATIONS; | |
dsg->mark != charset | |
#ifdef Py_DEBUG | |
&& dsg->mark != '\0' | |
#endif | |
;dsg++) | |
/* noop */; | |
assert(dsg->mark != '\0'); | |
dsgcache = dsg; | |
} | |
REQUIRE_INBUF(dsg->width) | |
decoded = dsg->decoder(*inbuf); | |
if (decoded == MAP_UNMAPPABLE) | |
return dsg->width; | |
if (decoded < 0x10000) { | |
WRITE1(decoded) | |
NEXT_OUT(1) | |
} | |
else if (decoded < 0x30000) { | |
WRITEUCS4(decoded) | |
} | |
else { /* JIS X 0213 pairs */ | |
WRITE2(decoded >> 16, decoded & 0xffff) | |
NEXT_OUT(2) | |
} | |
NEXT_IN(dsg->width) | |
} | |
break; | |
} | |
} | |
return 0; | |
} | |
/*-*- mapping table holders -*-*/
#define ENCMAP(enc) static const encode_map *enc##_encmap = NULL; | |
#define DECMAP(enc) static const decode_map *enc##_decmap = NULL; | |
/* kr */ | |
ENCMAP(cp949) | |
DECMAP(ksx1001) | |
/* jp */ | |
ENCMAP(jisxcommon) | |
DECMAP(jisx0208) | |
DECMAP(jisx0212) | |
ENCMAP(jisx0213_bmp) | |
DECMAP(jisx0213_1_bmp) | |
DECMAP(jisx0213_2_bmp) | |
ENCMAP(jisx0213_emp) | |
DECMAP(jisx0213_1_emp) | |
DECMAP(jisx0213_2_emp) | |
/* cn */ | |
ENCMAP(gbcommon) | |
DECMAP(gb2312) | |
/* tw */ | |
/*-*- mapping access functions -*-*/
static int | |
ksx1001_init(void) | |
{ | |
static int initialized = 0; | |
if (!initialized && ( | |
IMPORT_MAP(kr, cp949, &cp949_encmap, NULL) || | |
IMPORT_MAP(kr, ksx1001, NULL, &ksx1001_decmap))) | |
return -1; | |
initialized = 1; | |
return 0; | |
} | |
static ucs4_t | |
ksx1001_decoder(const unsigned char *data) | |
{ | |
ucs4_t u; | |
TRYMAP_DEC(ksx1001, u, data[0], data[1]) | |
return u; | |
else | |
return MAP_UNMAPPABLE; | |
} | |
static DBCHAR | |
ksx1001_encoder(const ucs4_t *data, Py_ssize_t *length) | |
{ | |
DBCHAR coded; | |
assert(*length == 1); | |
if (*data < 0x10000) { | |
TRYMAP_ENC(cp949, coded, *data) | |
if (!(coded & 0x8000)) | |
return coded; | |
} | |
return MAP_UNMAPPABLE; | |
} | |
static int | |
jisx0208_init(void) | |
{ | |
static int initialized = 0; | |
if (!initialized && ( | |
IMPORT_MAP(jp, jisxcommon, &jisxcommon_encmap, NULL) || | |
IMPORT_MAP(jp, jisx0208, NULL, &jisx0208_decmap))) | |
return -1; | |
initialized = 1; | |
return 0; | |
} | |
static ucs4_t | |
jisx0208_decoder(const unsigned char *data) | |
{ | |
ucs4_t u; | |
if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */ | |
return 0xff3c; | |
else TRYMAP_DEC(jisx0208, u, data[0], data[1]) | |
return u; | |
else | |
return MAP_UNMAPPABLE; | |
} | |
static DBCHAR | |
jisx0208_encoder(const ucs4_t *data, Py_ssize_t *length) | |
{ | |
DBCHAR coded; | |
assert(*length == 1); | |
if (*data < 0x10000) { | |
if (*data == 0xff3c) /* F/W REVERSE SOLIDUS */ | |
return 0x2140; | |
else TRYMAP_ENC(jisxcommon, coded, *data) { | |
if (!(coded & 0x8000)) | |
return coded; | |
} | |
} | |
return MAP_UNMAPPABLE; | |
} | |
static int | |
jisx0212_init(void) | |
{ | |
static int initialized = 0; | |
if (!initialized && ( | |
IMPORT_MAP(jp, jisxcommon, &jisxcommon_encmap, NULL) || | |
IMPORT_MAP(jp, jisx0212, NULL, &jisx0212_decmap))) | |
return -1; | |
initialized = 1; | |
return 0; | |
} | |
static ucs4_t | |
jisx0212_decoder(const unsigned char *data) | |
{ | |
ucs4_t u; | |
TRYMAP_DEC(jisx0212, u, data[0], data[1]) | |
return u; | |
else | |
return MAP_UNMAPPABLE; | |
} | |
static DBCHAR | |
jisx0212_encoder(const ucs4_t *data, Py_ssize_t *length) | |
{ | |
DBCHAR coded; | |
assert(*length == 1); | |
if (*data < 0x10000) { | |
TRYMAP_ENC(jisxcommon, coded, *data) { | |
if (coded & 0x8000) | |
return coded & 0x7fff; | |
} | |
} | |
return MAP_UNMAPPABLE; | |
} | |
static int | |
jisx0213_init(void) | |
{ | |
static int initialized = 0; | |
if (!initialized && ( | |
jisx0208_init() || | |
IMPORT_MAP(jp, jisx0213_bmp, | |
&jisx0213_bmp_encmap, NULL) || | |
IMPORT_MAP(jp, jisx0213_1_bmp, | |
NULL, &jisx0213_1_bmp_decmap) || | |
IMPORT_MAP(jp, jisx0213_2_bmp, | |
NULL, &jisx0213_2_bmp_decmap) || | |
IMPORT_MAP(jp, jisx0213_emp, | |
&jisx0213_emp_encmap, NULL) || | |
IMPORT_MAP(jp, jisx0213_1_emp, | |
NULL, &jisx0213_1_emp_decmap) || | |
IMPORT_MAP(jp, jisx0213_2_emp, | |
NULL, &jisx0213_2_emp_decmap) || | |
IMPORT_MAP(jp, jisx0213_pair, &jisx0213_pair_encmap, | |
&jisx0213_pair_decmap))) | |
return -1; | |
initialized = 1; | |
return 0; | |
} | |
#define config ((void *)2000) | |
static ucs4_t | |
jisx0213_2000_1_decoder(const unsigned char *data) | |
{ | |
ucs4_t u; | |
EMULATE_JISX0213_2000_DECODE_PLANE1(u, data[0], data[1]) | |
else if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */ | |
return 0xff3c; | |
else TRYMAP_DEC(jisx0208, u, data[0], data[1]); | |
else TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1]); | |
else TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1]) | |
u |= 0x20000; | |
else TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]); | |
else | |
return MAP_UNMAPPABLE; | |
return u; | |
} | |
static ucs4_t | |
jisx0213_2000_2_decoder(const unsigned char *data) | |
{ | |
ucs4_t u; | |
EMULATE_JISX0213_2000_DECODE_PLANE2(u, data[0], data[1]) | |
TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1]); | |
else TRYMAP_DEC(jisx0213_2_emp, u, data[0], data[1]) | |
u |= 0x20000; | |
else | |
return MAP_UNMAPPABLE; | |
return u; | |
} | |
#undef config | |
static ucs4_t | |
jisx0213_2004_1_decoder(const unsigned char *data) | |
{ | |
ucs4_t u; | |
if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */ | |
return 0xff3c; | |
else TRYMAP_DEC(jisx0208, u, data[0], data[1]); | |
else TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1]); | |
else TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1]) | |
u |= 0x20000; | |
else TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]); | |
else | |
return MAP_UNMAPPABLE; | |
return u; | |
} | |
static ucs4_t | |
jisx0213_2004_2_decoder(const unsigned char *data) | |
{ | |
ucs4_t u; | |
TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1]); | |
else TRYMAP_DEC(jisx0213_2_emp, u, data[0], data[1]) | |
u |= 0x20000; | |
else | |
return MAP_UNMAPPABLE; | |
return u; | |
} | |
static DBCHAR | |
jisx0213_encoder(const ucs4_t *data, Py_ssize_t *length, void *config) | |
{ | |
DBCHAR coded; | |
switch (*length) { | |
case 1: /* first character */ | |
if (*data >= 0x10000) { | |
if ((*data) >> 16 == 0x20000 >> 16) { | |
EMULATE_JISX0213_2000_ENCODE_EMP(coded, *data) | |
else TRYMAP_ENC(jisx0213_emp, coded, | |
(*data) & 0xffff) | |
return coded; | |
} | |
return MAP_UNMAPPABLE; | |
} | |
EMULATE_JISX0213_2000_ENCODE_BMP(coded, *data) | |
else TRYMAP_ENC(jisx0213_bmp, coded, *data) { | |
if (coded == MULTIC) | |
return MAP_MULTIPLE_AVAIL; | |
} | |
else TRYMAP_ENC(jisxcommon, coded, *data) { | |
if (coded & 0x8000) | |
return MAP_UNMAPPABLE; | |
} | |
else | |
return MAP_UNMAPPABLE; | |
return coded; | |
case 2: /* second character of unicode pair */ | |
coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1], | |
jisx0213_pair_encmap, JISX0213_ENCPAIRS); | |
if (coded == DBCINV) { | |
*length = 1; | |
coded = find_pairencmap((ucs2_t)data[0], 0, | |
jisx0213_pair_encmap, JISX0213_ENCPAIRS); | |
if (coded == DBCINV) | |
return MAP_UNMAPPABLE; | |
} | |
else | |
return coded; | |
case -1: /* flush unterminated */ | |
*length = 1; | |
coded = find_pairencmap((ucs2_t)data[0], 0, | |
jisx0213_pair_encmap, JISX0213_ENCPAIRS); | |
if (coded == DBCINV) | |
return MAP_UNMAPPABLE; | |
else | |
return coded; | |
default: | |
return MAP_UNMAPPABLE; | |
} | |
} | |
static DBCHAR | |
jisx0213_2000_1_encoder(const ucs4_t *data, Py_ssize_t *length) | |
{ | |
DBCHAR coded = jisx0213_encoder(data, length, (void *)2000); | |
if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL) | |
return coded; | |
else if (coded & 0x8000) | |
return MAP_UNMAPPABLE; | |
else | |
return coded; | |
} | |
static DBCHAR | |
jisx0213_2000_1_encoder_paironly(const ucs4_t *data, Py_ssize_t *length) | |
{ | |
DBCHAR coded; | |
Py_ssize_t ilength = *length; | |
coded = jisx0213_encoder(data, length, (void *)2000); | |
switch (ilength) { | |
case 1: | |
if (coded == MAP_MULTIPLE_AVAIL) | |
return MAP_MULTIPLE_AVAIL; | |
else | |
return MAP_UNMAPPABLE; | |
case 2: | |
if (*length != 2) | |
return MAP_UNMAPPABLE; | |
else | |
return coded; | |
default: | |
return MAP_UNMAPPABLE; | |
} | |
} | |
static DBCHAR | |
jisx0213_2000_2_encoder(const ucs4_t *data, Py_ssize_t *length) | |
{ | |
DBCHAR coded = jisx0213_encoder(data, length, (void *)2000); | |
if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL) | |
return coded; | |
else if (coded & 0x8000) | |
return coded & 0x7fff; | |
else | |
return MAP_UNMAPPABLE; | |
} | |
static DBCHAR | |
jisx0213_2004_1_encoder(const ucs4_t *data, Py_ssize_t *length) | |
{ | |
DBCHAR coded = jisx0213_encoder(data, length, NULL); | |
if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL) | |
return coded; | |
else if (coded & 0x8000) | |
return MAP_UNMAPPABLE; | |
else | |
return coded; | |
} | |
static DBCHAR | |
jisx0213_2004_1_encoder_paironly(const ucs4_t *data, Py_ssize_t *length) | |
{ | |
DBCHAR coded; | |
Py_ssize_t ilength = *length; | |
coded = jisx0213_encoder(data, length, NULL); | |
switch (ilength) { | |
case 1: | |
if (coded == MAP_MULTIPLE_AVAIL) | |
return MAP_MULTIPLE_AVAIL; | |
else | |
return MAP_UNMAPPABLE; | |
case 2: | |
if (*length != 2) | |
return MAP_UNMAPPABLE; | |
else | |
return coded; | |
default: | |
return MAP_UNMAPPABLE; | |
} | |
} | |
static DBCHAR | |
jisx0213_2004_2_encoder(const ucs4_t *data, Py_ssize_t *length) | |
{ | |
DBCHAR coded = jisx0213_encoder(data, length, NULL); | |
if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL) | |
return coded; | |
else if (coded & 0x8000) | |
return coded & 0x7fff; | |
else | |
return MAP_UNMAPPABLE; | |
} | |
static ucs4_t | |
jisx0201_r_decoder(const unsigned char *data) | |
{ | |
ucs4_t u; | |
JISX0201_R_DECODE(*data, u) | |
else return MAP_UNMAPPABLE; | |
return u; | |
} | |
static DBCHAR | |
jisx0201_r_encoder(const ucs4_t *data, Py_ssize_t *length) | |
{ | |
DBCHAR coded; | |
JISX0201_R_ENCODE(*data, coded) | |
else return MAP_UNMAPPABLE; | |
return coded; | |
} | |
static ucs4_t | |
jisx0201_k_decoder(const unsigned char *data) | |
{ | |
ucs4_t u; | |
JISX0201_K_DECODE(*data ^ 0x80, u) | |
else return MAP_UNMAPPABLE; | |
return u; | |
} | |
static DBCHAR | |
jisx0201_k_encoder(const ucs4_t *data, Py_ssize_t *length) | |
{ | |
DBCHAR coded; | |
JISX0201_K_ENCODE(*data, coded) | |
else return MAP_UNMAPPABLE; | |
return coded - 0x80; | |
} | |
static int | |
gb2312_init(void) | |
{ | |
static int initialized = 0; | |
if (!initialized && ( | |
IMPORT_MAP(cn, gbcommon, &gbcommon_encmap, NULL) || | |
IMPORT_MAP(cn, gb2312, NULL, &gb2312_decmap))) | |
return -1; | |
initialized = 1; | |
return 0; | |
} | |
static ucs4_t | |
gb2312_decoder(const unsigned char *data) | |
{ | |
ucs4_t u; | |
TRYMAP_DEC(gb2312, u, data[0], data[1]) | |
return u; | |
else | |
return MAP_UNMAPPABLE; | |
} | |
static DBCHAR | |
gb2312_encoder(const ucs4_t *data, Py_ssize_t *length) | |
{ | |
DBCHAR coded; | |
assert(*length == 1); | |
if (*data < 0x10000) { | |
TRYMAP_ENC(gbcommon, coded, *data) { | |
if (!(coded & 0x8000)) | |
return coded; | |
} | |
} | |
return MAP_UNMAPPABLE; | |
} | |
static ucs4_t | |
dummy_decoder(const unsigned char *data) | |
{ | |
return MAP_UNMAPPABLE; | |
} | |
static DBCHAR | |
dummy_encoder(const ucs4_t *data, Py_ssize_t *length) | |
{ | |
return MAP_UNMAPPABLE; | |
} | |
/*-*- registry tables -*-*/
/* Initialisers for struct iso2022_designation, in field order:
 *   { mark, plane, width, initializer, decoder, encoder } */

#define REGISTRY_KSX1001_G0                                     \
    { CHARSET_KSX1001, 0, 2,                                    \
      ksx1001_init,                                             \
      ksx1001_decoder, ksx1001_encoder }
#define REGISTRY_KSX1001_G1                                     \
    { CHARSET_KSX1001, 1, 2,                                    \
      ksx1001_init,                                             \
      ksx1001_decoder, ksx1001_encoder }

#define REGISTRY_JISX0201_R                                     \
    { CHARSET_JISX0201_R, 0, 1,                                 \
      NULL,                                                     \
      jisx0201_r_decoder, jisx0201_r_encoder }
#define REGISTRY_JISX0201_K                                     \
    { CHARSET_JISX0201_K, 0, 1,                                 \
      NULL,                                                     \
      jisx0201_k_decoder, jisx0201_k_encoder }

#define REGISTRY_JISX0208                                       \
    { CHARSET_JISX0208, 0, 2,                                   \
      jisx0208_init,                                            \
      jisx0208_decoder, jisx0208_encoder }
#define REGISTRY_JISX0208_O                                     \
    { CHARSET_JISX0208_O, 0, 2,                                 \
      jisx0208_init,                                            \
      jisx0208_decoder, jisx0208_encoder }
#define REGISTRY_JISX0212                                       \
    { CHARSET_JISX0212, 0, 2,                                   \
      jisx0212_init,                                            \
      jisx0212_decoder, jisx0212_encoder }

#define REGISTRY_JISX0213_2000_1                                \
    { CHARSET_JISX0213_2000_1, 0, 2,                            \
      jisx0213_init,                                            \
      jisx0213_2000_1_decoder,                                  \
      jisx0213_2000_1_encoder }
#define REGISTRY_JISX0213_2000_1_PAIRONLY                       \
    { CHARSET_JISX0213_2000_1, 0, 2,                            \
      jisx0213_init,                                            \
      jisx0213_2000_1_decoder,                                  \
      jisx0213_2000_1_encoder_paironly }
#define REGISTRY_JISX0213_2000_2                                \
    { CHARSET_JISX0213_2, 0, 2,                                 \
      jisx0213_init,                                            \
      jisx0213_2000_2_decoder,                                  \
      jisx0213_2000_2_encoder }
#define REGISTRY_JISX0213_2004_1                                \
    { CHARSET_JISX0213_2004_1, 0, 2,                            \
      jisx0213_init,                                            \
      jisx0213_2004_1_decoder,                                  \
      jisx0213_2004_1_encoder }
#define REGISTRY_JISX0213_2004_1_PAIRONLY                       \
    { CHARSET_JISX0213_2004_1, 0, 2,                            \
      jisx0213_init,                                            \
      jisx0213_2004_1_decoder,                                  \
      jisx0213_2004_1_encoder_paironly }
#define REGISTRY_JISX0213_2004_2                                \
    { CHARSET_JISX0213_2, 0, 2,                                 \
      jisx0213_init,                                            \
      jisx0213_2004_2_decoder,                                  \
      jisx0213_2004_2_encoder }

#define REGISTRY_GB2312                                         \
    { CHARSET_GB2312, 0, 2,                                     \
      gb2312_init,                                              \
      gb2312_decoder, gb2312_encoder }

/* The cns11643_* functions named below are not defined in this file and
 * no designation table in it uses these two macros. */
#define REGISTRY_CNS11643_1                                     \
    { CHARSET_CNS11643_1, 1, 2,                                 \
      cns11643_init,                                            \
      cns11643_1_decoder, cns11643_1_encoder }
#define REGISTRY_CNS11643_2                                     \
    { CHARSET_CNS11643_2, 2, 2,                                 \
      cns11643_init,                                            \
      cns11643_2_decoder, cns11643_2_encoder }

/* G2-only charsets: registered so their designation is accepted; the
 * callbacks never map anything. */
#define REGISTRY_ISO8859_1                                      \
    { CHARSET_ISO8859_1, 2, 1,                                  \
      NULL, dummy_decoder, dummy_encoder }
#define REGISTRY_ISO8859_7                                      \
    { CHARSET_ISO8859_7, 2, 1,                                  \
      NULL, dummy_decoder, dummy_encoder }

/* Table terminator: mark == 0. */
#define REGISTRY_SENTINEL   { 0, }

/* Define iso2022_<var>_config from the flags `attrs` and the already
 * declared iso2022_<var>_designations table. */
#define CONFIGDEF(var, attrs)                                           \
    static const struct iso2022_config iso2022_##var##_config = {       \
        attrs, iso2022_##var##_designations                             \
    };
static const struct iso2022_designation iso2022_kr_designations[] = { | |
REGISTRY_KSX1001_G1, REGISTRY_SENTINEL | |
}; | |
CONFIGDEF(kr, 0) | |
static const struct iso2022_designation iso2022_jp_designations[] = { | |
REGISTRY_JISX0208, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O, | |
REGISTRY_SENTINEL | |
}; | |
CONFIGDEF(jp, NO_SHIFT | USE_JISX0208_EXT) | |
static const struct iso2022_designation iso2022_jp_1_designations[] = { | |
REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R, | |
REGISTRY_JISX0208_O, REGISTRY_SENTINEL | |
}; | |
CONFIGDEF(jp_1, NO_SHIFT | USE_JISX0208_EXT) | |
static const struct iso2022_designation iso2022_jp_2_designations[] = { | |
REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_KSX1001_G0, | |
REGISTRY_GB2312, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O, | |
REGISTRY_ISO8859_1, REGISTRY_ISO8859_7, REGISTRY_SENTINEL | |
}; | |
CONFIGDEF(jp_2, NO_SHIFT | USE_G2 | USE_JISX0208_EXT) | |
static const struct iso2022_designation iso2022_jp_2004_designations[] = { | |
REGISTRY_JISX0213_2004_1_PAIRONLY, REGISTRY_JISX0208, | |
REGISTRY_JISX0213_2004_1, REGISTRY_JISX0213_2004_2, REGISTRY_SENTINEL | |
}; | |
CONFIGDEF(jp_2004, NO_SHIFT | USE_JISX0208_EXT) | |
static const struct iso2022_designation iso2022_jp_3_designations[] = { | |
REGISTRY_JISX0213_2000_1_PAIRONLY, REGISTRY_JISX0208, | |
REGISTRY_JISX0213_2000_1, REGISTRY_JISX0213_2000_2, REGISTRY_SENTINEL | |
}; | |
CONFIGDEF(jp_3, NO_SHIFT | USE_JISX0208_EXT) | |
static const struct iso2022_designation iso2022_jp_ext_designations[] = { | |
REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R, | |
REGISTRY_JISX0201_K, REGISTRY_JISX0208_O, REGISTRY_SENTINEL | |
}; | |
CONFIGDEF(jp_ext, NO_SHIFT | USE_JISX0208_EXT) | |
BEGIN_MAPPINGS_LIST | |
/* no mapping table here */ | |
END_MAPPINGS_LIST | |
#define ISO2022_CODEC(variation) { \ | |
"iso2022_" #variation, \ | |
&iso2022_##variation##_config, \ | |
iso2022_codec_init, \ | |
_STATEFUL_METHODS(iso2022) \ | |
}, | |
BEGIN_CODECS_LIST | |
ISO2022_CODEC(kr) | |
ISO2022_CODEC(jp) | |
ISO2022_CODEC(jp_1) | |
ISO2022_CODEC(jp_2) | |
ISO2022_CODEC(jp_2004) | |
ISO2022_CODEC(jp_3) | |
ISO2022_CODEC(jp_ext) | |
END_CODECS_LIST | |
I_AM_A_MODULE_FOR(iso2022) |