// UTFConvert.cpp | |
#include "StdAfx.h" | |
#include "MyTypes.h" | |
#include "UTFConvert.h" | |
#ifdef _WIN32 | |
#define _WCHART_IS_16BIT 1 | |
#endif | |
/*
  _UTF8_START(n) - is the base value of the start byte (head), if there are (n) additional bytes after the start byte

  n : _UTF8_START(n) : Bits of code point

  0 : 0x80 :    : unused
  1 : 0xC0 : 11 :
  2 : 0xE0 : 16 : Basic Multilingual Plane
  3 : 0xF0 : 21 : Unicode space
  4 : 0xF8 : 26 :
  5 : 0xFC : 31 : UCS-4
  6 : 0xFE : 36 : We can use it, if we want to encode any 32-bit value
  7 : 0xFF :
*/
/*
  _UTF8_START(n): base value of a head byte that is followed by (n) tail bytes.
  The top (n + 1) bits are set and the remaining low bits are zero:
  0xC0, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE for n = 1..6.
*/
#define _UTF8_START(n) (0x100 - (1 << (7 - (n))))

/*
  _UTF8_HEAD_PARSE2(n): one branch of the head-byte classifier.
  It relies on two lvalues that must exist in the scope of the expansion:
    c        - the head byte being classified;
    numBytes - receives the number of tail bytes.
  If (c) is below the base value of the next longer form, the branch stores (n)
  in numBytes and subtracts the marker bits from (c), so that only the payload
  bits of the head byte remain.
  The expansion is deliberately a bare "if (...) { ... }" so that branches can
  be chained with "else".
*/
#define _UTF8_HEAD_PARSE2(n) if (c < _UTF8_START((n) + 1)) { numBytes = (n); c -= _UTF8_START(n); }

/*
  _UTF8_HEAD_PARSE: the classifier chain for 1..5 tail bytes.
  The caller is expected to have handled (c < 0xC0) already and must append its
  own final "else" branch; that branch is taken for head bytes 0xFE and 0xFF,
  which match none of the branches below.
  The last branch has no line continuation, so the macro can not swallow
  whatever line follows its definition.
*/
#define _UTF8_HEAD_PARSE \
         _UTF8_HEAD_PARSE2(1) \
    else _UTF8_HEAD_PARSE2(2) \
    else _UTF8_HEAD_PARSE2(3) \
    else _UTF8_HEAD_PARSE2(4) \
    else _UTF8_HEAD_PARSE2(5)
    // else _UTF8_HEAD_PARSE2(6)
bool CheckUTF8(const char *src, bool allowReduced) throw() | |
{ | |
for (;;) | |
{ | |
Byte c = *src++; | |
if (c == 0) | |
return true; | |
if (c < 0x80) | |
continue; | |
if (c < 0xC0) // (c < 0xC0 + 2) // if we support only optimal encoding chars | |
return false; | |
unsigned numBytes; | |
_UTF8_HEAD_PARSE | |
else | |
return false; | |
UInt32 val = c; | |
do | |
{ | |
Byte c2 = *src++; | |
if (c2 < 0x80 || c2 >= 0xC0) | |
return allowReduced && c2 == 0; | |
val <<= 6; | |
val |= (c2 - 0x80); | |
} | |
while (--numBytes); | |
if (val >= 0x110000) | |
return false; | |
} | |
} | |
#define _ERROR_UTF8 \ | |
{ if (dest) dest[destPos] = (wchar_t)0xFFFD; destPos++; ok = false; continue; } | |
static bool Utf8_To_Utf16(wchar_t *dest, size_t *destLen, const char *src, const char *srcLim) throw() | |
{ | |
size_t destPos = 0; | |
bool ok = true; | |
for (;;) | |
{ | |
Byte c; | |
if (src == srcLim) | |
{ | |
*destLen = destPos; | |
return ok; | |
} | |
c = *src++; | |
if (c < 0x80) | |
{ | |
if (dest) | |
dest[destPos] = (wchar_t)c; | |
destPos++; | |
continue; | |
} | |
if (c < 0xC0) | |
_ERROR_UTF8 | |
unsigned numBytes; | |
_UTF8_HEAD_PARSE | |
else | |
_ERROR_UTF8 | |
UInt32 val = c; | |
do | |
{ | |
Byte c2; | |
if (src == srcLim) | |
break; | |
c2 = *src; | |
if (c2 < 0x80 || c2 >= 0xC0) | |
break; | |
src++; | |
val <<= 6; | |
val |= (c2 - 0x80); | |
} | |
while (--numBytes); | |
if (numBytes != 0) | |
_ERROR_UTF8 | |
if (val < 0x10000) | |
{ | |
if (dest) | |
dest[destPos] = (wchar_t)val; | |
destPos++; | |
} | |
else | |
{ | |
val -= 0x10000; | |
if (val >= 0x100000) | |
_ERROR_UTF8 | |
if (dest) | |
{ | |
dest[destPos + 0] = (wchar_t)(0xD800 + (val >> 10)); | |
dest[destPos + 1] = (wchar_t)(0xDC00 + (val & 0x3FF)); | |
} | |
destPos += 2; | |
} | |
} | |
} | |
/*
  _UTF8_RANGE(n): the smallest value that does not fit into a sequence with (n)
  tail bytes, i.e. 1 << (5 * n + 6):
  0x800, 0x10000, 0x200000, 0x4000000, 0x80000000 for n = 1..5.
*/
#define _UTF8_RANGE(n) (((UInt32)1) << ((n) * 5 + 6))

/*
  _UTF8_HEAD(n, val): head byte of the (n + 1)-byte encoding of (val).
  _UTF8_CHAR(n, val): tail byte that carries bits [6 * n, 6 * n + 5] of (val).
  Both arguments are fully parenthesized, so (val) may be any expression.
*/
#define _UTF8_HEAD(n, val) ((char)(_UTF8_START(n) + ((val) >> (6 * (n)))))
#define _UTF8_CHAR(n, val) ((char)(0x80 + (((val) >> (6 * (n))) & 0x3F)))
static size_t Utf16_To_Utf8_Calc(const wchar_t *src, const wchar_t *srcLim) | |
{ | |
size_t size = srcLim - src; | |
for (;;) | |
{ | |
if (src == srcLim) | |
return size; | |
UInt32 val = *src++; | |
if (val < 0x80) | |
continue; | |
if (val < _UTF8_RANGE(1)) | |
{ | |
size++; | |
continue; | |
} | |
if (val >= 0xD800 && val < 0xDC00 && src != srcLim) | |
{ | |
UInt32 c2 = *src; | |
if (c2 >= 0xDC00 && c2 < 0xE000) | |
{ | |
src++; | |
size += 2; | |
continue; | |
} | |
} | |
#ifdef _WCHART_IS_16BIT | |
size += 2; | |
#else | |
if (val < _UTF8_RANGE(2)) size += 2; | |
else if (val < _UTF8_RANGE(3)) size += 3; | |
else if (val < _UTF8_RANGE(4)) size += 4; | |
else if (val < _UTF8_RANGE(5)) size += 5; | |
else size += 6; | |
#endif | |
} | |
} | |
static char *Utf16_To_Utf8(char *dest, const wchar_t *src, const wchar_t *srcLim) | |
{ | |
for (;;) | |
{ | |
if (src == srcLim) | |
return dest; | |
UInt32 val = *src++; | |
if (val < 0x80) | |
{ | |
*dest++ = (char)val; | |
continue; | |
} | |
if (val < _UTF8_RANGE(1)) | |
{ | |
dest[0] = _UTF8_HEAD(1, val); | |
dest[1] = _UTF8_CHAR(0, val); | |
dest += 2; | |
continue; | |
} | |
if (val >= 0xD800 && val < 0xDC00 && src != srcLim) | |
{ | |
UInt32 c2 = *src; | |
if (c2 >= 0xDC00 && c2 < 0xE000) | |
{ | |
src++; | |
val = (((val - 0xD800) << 10) | (c2 - 0xDC00)) + 0x10000; | |
dest[0] = _UTF8_HEAD(3, val); | |
dest[1] = _UTF8_CHAR(2, val); | |
dest[2] = _UTF8_CHAR(1, val); | |
dest[3] = _UTF8_CHAR(0, val); | |
dest += 4; | |
continue; | |
} | |
} | |
#ifndef _WCHART_IS_16BIT | |
if (val < _UTF8_RANGE(2)) | |
#endif | |
{ | |
dest[0] = _UTF8_HEAD(2, val); | |
dest[1] = _UTF8_CHAR(1, val); | |
dest[2] = _UTF8_CHAR(0, val); | |
dest += 3; | |
continue; | |
} | |
#ifndef _WCHART_IS_16BIT | |
UInt32 b; | |
unsigned numBits; | |
if (val < _UTF8_RANGE(3)) { numBits = 6 * 3; b = _UTF8_HEAD(3, val); } | |
else if (val < _UTF8_RANGE(4)) { numBits = 6 * 4; b = _UTF8_HEAD(4, val); } | |
else if (val < _UTF8_RANGE(5)) { numBits = 6 * 5; b = _UTF8_HEAD(5, val); } | |
else { numBits = 6 * 6; b = _UTF8_START(6); } | |
*dest++ = (Byte)b; | |
do | |
{ | |
numBits -= 6; | |
*dest++ = (char)(0x80 + ((val >> numBits) & 0x3F)); | |
} | |
while (numBits != 0); | |
#endif | |
} | |
} | |
bool ConvertUTF8ToUnicode(const AString &src, UString &dest) | |
{ | |
dest.Empty(); | |
size_t destLen = 0; | |
Utf8_To_Utf16(NULL, &destLen, src, src.Ptr(src.Len())); | |
bool res = Utf8_To_Utf16(dest.GetBuf((unsigned)destLen), &destLen, src, src.Ptr(src.Len())); | |
dest.ReleaseBuf_SetEnd((unsigned)destLen); | |
return res; | |
} | |
void ConvertUnicodeToUTF8(const UString &src, AString &dest) | |
{ | |
dest.Empty(); | |
size_t destLen = Utf16_To_Utf8_Calc(src, src.Ptr(src.Len())); | |
Utf16_To_Utf8(dest.GetBuf((unsigned)destLen), src, src.Ptr(src.Len())); | |
dest.ReleaseBuf_SetEnd((unsigned)destLen); | |
} |