/****************************************************************************
**
** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
** All rights reserved.
** Contact: Nokia Corporation (qt-info@nokia.com)
**
** This file is part of the QtCore module of the Qt Toolkit.
**
** $QT_BEGIN_LICENSE:LGPL$
** GNU Lesser General Public License Usage
** This file may be used under the terms of the GNU Lesser General Public
** License version 2.1 as published by the Free Software Foundation and
** appearing in the file LICENSE.LGPL included in the packaging of this
** file. Please review the following information to ensure the GNU Lesser
** General Public License version 2.1 requirements will be met:
** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
**
** In addition, as a special exception, Nokia gives you certain additional
** rights. These rights are described in the Nokia Qt LGPL Exception
** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
**
** GNU General Public License Usage
** Alternatively, this file may be used under the terms of the GNU General
** Public License version 3.0 as published by the Free Software Foundation
** and appearing in the file LICENSE.GPL included in the packaging of this
** file. Please review the following information to ensure the GNU General
** Public License version 3.0 requirements will be met:
** http://www.gnu.org/copyleft/gpl.html.
**
** Other Usage
** Alternatively, this file may be used in accordance with the terms and
** conditions contained in a signed written agreement between you and Nokia.
**
**
**
**
**
** $QT_END_LICENSE$
**
****************************************************************************/
#include "qutfcodec_p.h" | |
#include "qlist.h" | |
#include "qendian.h" | |
#include "qchar.h" | |
QT_BEGIN_NAMESPACE | |
// Indices into QTextCodec::ConverterState::state_data as used by the
// UTF-16 and UTF-32 decoders in this file.
enum {
    Endian = 0, // byte order in effect, stored as a DataEndianness value
    Data = 1    // bytes of a code unit that was split across two calls
};
/*
    Returns true if \a ucs4 is a Unicode "non-character": a code point that
    may be used internally but is not allowed in text interchange.
*/
static inline bool isUnicodeNonCharacter(uint ucs4)
{
    // The non-characters are the last two code points of each Unicode
    // plane (U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, etc.) as well as the 32 code
    // points from U+FDD0 to U+FDEF (inclusive).
    //
    // The range test relies on unsigned wrap-around: anything below U+FDD0
    // wraps to a very large value, so one comparison checks both bounds.
    return (ucs4 & 0xfffe) == 0xfffe
            || (ucs4 - 0xfdd0U) < 32;
}
/*
    Encodes the \a len UTF-16 code units starting at \a uc as UTF-8.

    Lone surrogates and Unicode non-characters are replaced by '?', or by a
    NUL byte when \a state has the ConvertInvalidToNull flag set.

    When \a state is given:
    \list
    \o a UTF-8 BOM (EF BB BF) is written unless IgnoreHeader is set;
    \o a high surrogate left over from the previous call is read from
       state_data[0], and one that ends this chunk is stored there
       (remainingChars is set to 1 in that case, 0 otherwise);
    \o invalidChars is increased by the number of replaced code units and
       IgnoreHeader is set on return.
    \endlist

    Without \a state nothing can be carried over, so a high surrogate that
    ends the input is replaced as well.
*/
QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state)
{
    uchar replacement = '?';
    // A UTF-16 code unit expands to at most 3 bytes; a surrogate pair takes
    // 4 bytes for 2 units, which is still within that bound.
    int rlen = 3*len;
    int surrogate_high = -1;
    if (state) {
        if (state->flags & QTextCodec::ConvertInvalidToNull)
            replacement = 0;
        if (!(state->flags & QTextCodec::IgnoreHeader))
            rlen += 3;
        if (state->remainingChars) {
            surrogate_high = state->state_data[0];
            // The carried-over surrogate is not counted in len, yet the
            // first unit of this chunk may now produce 4 bytes (either the
            // completed pair, or a replacement followed by 3 bytes).
            rlen += 1;
        }
    }

    QByteArray rstr;
    rstr.resize(rlen);
    uchar *cursor = reinterpret_cast<uchar *>(rstr.data());
    const QChar *ch = uc;
    int invalid = 0;
    if (state && !(state->flags & QTextCodec::IgnoreHeader)) {
        *cursor++ = 0xef;
        *cursor++ = 0xbb;
        *cursor++ = 0xbf;
    }

    const QChar *end = ch + len;
    while (ch < end) {
        uint u = ch->unicode();
        if (surrogate_high >= 0) {
            if (u >= 0xdc00 && u < 0xe000) {
                u = (surrogate_high - 0xd800)*0x400 + (u - 0xdc00) + 0x10000;
                surrogate_high = -1;
            } else {
                // high surrogate without low: replace the dangling
                // surrogate only. *ch is deliberately not consumed, so it
                // is encoded on its own by the next iteration.
                *cursor++ = replacement;
                ++invalid;
                surrogate_high = -1;
                continue;
            }
        } else if (u >= 0xdc00 && u < 0xe000) {
            // low surrogate without high
            *cursor++ = replacement;
            ++ch;
            ++invalid;
            continue;
        } else if (u >= 0xd800 && u < 0xdc00) {
            // remember the high surrogate and wait for its partner
            surrogate_high = u;
            ++ch;
            continue;
        }

        if (u < 0x80) {
            *cursor++ = (uchar)u;
        } else {
            if (u < 0x0800) {
                *cursor++ = 0xc0 | ((uchar) (u >> 6));
            } else {
                // is it one of the Unicode non-characters?
                if (isUnicodeNonCharacter(u)) {
                    *cursor++ = replacement;
                    ++ch;
                    ++invalid;
                    continue;
                }

                if (u > 0xffff) {
                    *cursor++ = 0xf0 | ((uchar) (u >> 18));
                    *cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f);
                } else {
                    *cursor++ = 0xe0 | (((uchar) (u >> 12)) & 0x3f);
                }
                *cursor++ = 0x80 | (((uchar) (u >> 6)) & 0x3f);
            }
            // every multi-byte form ends with the low six bits
            *cursor++ = 0x80 | ((uchar) (u&0x3f));
        }
        ++ch;
    }

    if (!state && surrogate_high >= 0) {
        // No state to carry the surrogate into a later call: replace it
        // instead of silently dropping it.
        *cursor++ = replacement;
        ++invalid;
    }

    rstr.resize(cursor - (const uchar*)rstr.constData());
    if (state) {
        state->invalidChars += invalid;
        state->flags |= QTextCodec::IgnoreHeader;
        state->remainingChars = 0;
        if (surrogate_high >= 0) {
            state->remainingChars = 1;
            state->state_data[0] = surrogate_high;
        }
    }
    return rstr;
}
/*
    Decodes \a len bytes of UTF-8 starting at \a chars into a QString.

    Malformed input (stray or missing continuation bytes, overlong forms,
    encoded surrogates, non-characters, values of 0x110000 and above) is
    turned into QChar::ReplacementCharacter, or QChar::Null when \a state
    has ConvertInvalidToNull set. A BOM is skipped only while no other
    character has been seen and IgnoreHeader is not set.

    With \a state, an incomplete trailing sequence is saved in
    remainingChars, state_data[0] (partial code point) and state_data[1]
    (smallest value allowed for that sequence length) and resumed on the
    next call. Without \a state, every byte of an incomplete trailing
    sequence is replaced.
*/
QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state)
{
    bool bomHandled = false;
    ushort replacement = QChar::ReplacementCharacter;
    int pending = 0;        // continuation bytes still expected
    int seqStart = -1;      // index of the lead byte of the open sequence
    uint codePoint = 0;     // value accumulated so far
    uint lowerBound = 0;    // smaller results are overlong encodings

    if (state) {
        if (state->flags & QTextCodec::IgnoreHeader)
            bomHandled = true;
        if (state->flags & QTextCodec::ConvertInvalidToNull)
            replacement = QChar::Null;
        pending = state->remainingChars;
        if (pending) {
            codePoint = state->state_data[0];
            lowerBound = state->state_data[1];
        }
    }

    // Fast path: strip a byte order mark that is fully contained in the input.
    const bool leadingBom = !bomHandled && len > 3
            && (uchar)chars[0] == 0xef && (uchar)chars[1] == 0xbb && (uchar)chars[2] == 0xbf;
    if (leadingBom) {
        chars += 3;
        len -= 3;
        bomHandled = true;
    }

    QString result(pending + len + 1, Qt::Uninitialized); // worst case
    ushort *out = (ushort *)result.unicode();
    int badCount = 0;

    for (int i = 0; i < len; ++i) {
        const uchar byte = chars[i];

        if (!pending) {
            // Outside of a sequence: classify the byte.
            if (byte < 128) {
                *out++ = ushort(byte);
                bomHandled = true;
            } else if ((byte & 0xe0) == 0xc0) {
                codePoint = byte & 0x1f;
                pending = 1;
                seqStart = i;
                lowerBound = 0x80;
                bomHandled = true;
            } else if ((byte & 0xf0) == 0xe0) {
                // A three-byte sequence may still turn out to be the BOM,
                // so bomHandled is left untouched here.
                codePoint = byte & 0x0f;
                pending = 2;
                seqStart = i;
                lowerBound = 0x800;
            } else if ((byte & 0xf8) == 0xf0) {
                codePoint = byte & 0x07;
                pending = 3;
                seqStart = i;
                lowerBound = 0x10000;
                bomHandled = true;
            } else {
                // neither ASCII nor a valid lead byte
                *out++ = replacement;
                ++badCount;
                bomHandled = true;
            }
            continue;
        }

        if ((byte & 0xc0) != 0x80) {
            // Sequence cut short: replace it and rescan from the byte that
            // follows its lead byte (the loop increment adds the one).
            i = seqStart;
            *out++ = replacement;
            ++badCount;
            pending = 0;
            bomHandled = true;
            continue;
        }

        codePoint = (codePoint << 6) | (byte & 0x3f);
        --pending;
        if (pending)
            continue;

        // Sequence complete: validate and emit.
        const bool nonCharacter = isUnicodeNonCharacter(codePoint);
        if (!bomHandled && codePoint == 0xfeff) {
            // the UTF-8 BOM composes into U+FEFF; just skip it
        } else if (!nonCharacter && codePoint > 0xffff && codePoint < 0x110000) {
            // needs a surrogate pair
            Q_ASSERT((out - (ushort*)result.unicode()) + 2 < result.length());
            *out++ = QChar::highSurrogate(codePoint);
            *out++ = QChar::lowSurrogate(codePoint);
        } else if ((codePoint < lowerBound) || (codePoint >= 0xd800 && codePoint <= 0xdfff)
                   || nonCharacter || codePoint >= 0x110000) {
            // overlong sequence, UTF-16 surrogate, non-character or out of range
            *out++ = replacement;
            ++badCount;
        } else {
            *out++ = codePoint;
        }
        bomHandled = true;
    }

    if (!state && pending > 0) {
        // unterminated sequence and nowhere to save it: one replacement
        // per byte from its lead byte to the end of the input
        for (int left = len - seqStart; left > 0; --left) {
            *out++ = replacement;
            ++badCount;
        }
    }

    result.truncate(out - (ushort *)result.unicode());

    if (state) {
        state->invalidChars += badCount;
        state->remainingChars = pending;
        if (bomHandled)
            state->flags |= QTextCodec::IgnoreHeader;
        state->state_data[0] = pending ? codePoint : 0;
        state->state_data[1] = pending ? lowerBound : 0;
    }
    return result;
}
/*
    Serialises the \a len UTF-16 code units at \a uc into bytes.

    \a e selects the byte order; DetectEndianness means the byte order of
    the host. A byte order mark is written first unless \a state is given
    and already has IgnoreHeader set. On return \a state (if any) has
    IgnoreHeader set and remainingChars cleared.
*/
QByteArray QUtf16::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state, DataEndianness e)
{
    const bool withBom = !state || !(state->flags & QTextCodec::IgnoreHeader);
    const bool bigEndian = (e == DetectEndianness)
            ? (QSysInfo::ByteOrder == QSysInfo::BigEndian)
            : (e == BigEndianness);

    QByteArray encoded;
    encoded.resize(2*len + (withBom ? 2 : 0));
    char *out = encoded.data();

    if (withBom) {
        const QChar bom(QChar::ByteOrderMark);
        *out++ = bigEndian ? bom.row() : bom.cell();
        *out++ = bigEndian ? bom.cell() : bom.row();
    }

    // row() is the high byte and cell() the low byte of each code unit
    for (const QChar *in = uc, *stop = uc + len; in < stop; ++in) {
        *out++ = bigEndian ? in->row() : in->cell();
        *out++ = bigEndian ? in->cell() : in->row();
    }

    if (state) {
        state->remainingChars = 0;
        state->flags |= QTextCodec::IgnoreHeader;
    }
    return encoded;
}
/*
    Decodes \a len bytes of UTF-16 at \a chars into a QString.

    With \a e == DetectEndianness the first code unit decides the byte
    order: a BOM (in either order) selects it and is dropped; anything else
    is taken to be in host byte order and kept. With an explicit byte order
    a leading BOM is dropped. Only the first code unit is treated this way.

    \a state, if given, carries the IgnoreHeader flag, the byte order in
    state_data[Endian], and the first byte of a code unit split across
    calls in state_data[Data] (remainingChars == 1).
*/
QString QUtf16::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state, DataEndianness e)
{
    DataEndianness endian = e;
    bool havePending = false;   // first byte of a code unit already read
    uchar pendingByte = 0;
    bool headerdone = false;

    if (state) {
        headerdone = state->flags & QTextCodec::IgnoreHeader;
        if (endian == DetectEndianness)
            endian = (DataEndianness)state->state_data[Endian];
        if (state->remainingChars) {
            havePending = true;
            pendingByte = state->state_data[Data];
        }
    }
    if (headerdone && endian == DetectEndianness)
        endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;

    QString result(len, Qt::Uninitialized); // worst case
    QChar *out = (QChar *)result.unicode();

    for (int left = len; left; --left) {
        const uchar byte = *chars++;

        if (!havePending) {
            pendingByte = byte;
            havePending = true;
            continue;
        }
        havePending = false;

        // While the byte order is still undetermined the unit is assembled
        // as big endian.
        QChar unit;
        if (endian == LittleEndianness) {
            unit.setRow(byte);
            unit.setCell(pendingByte);
        } else {
            unit.setRow(pendingByte);
            unit.setCell(byte);
        }

        if (headerdone) {
            *out++ = unit;
            continue;
        }
        headerdone = true;

        if (endian != DetectEndianness) {
            // explicit byte order: only swallow a leading BOM
            if (unit != QChar::ByteOrderMark)
                *out++ = unit;
            continue;
        }
        if (unit == QChar::ByteOrderSwapped) {
            endian = LittleEndianness;
            continue;
        }
        if (unit == QChar::ByteOrderMark) {
            endian = BigEndianness;
            continue;
        }

        // No BOM: assume host byte order and keep the character.
        if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
            endian = BigEndianness;
        } else {
            endian = LittleEndianness;
            unit = QChar((unit.unicode() >> 8) | ((unit.unicode() & 0xff) << 8));
        }
        *out++ = unit;
    }

    result.truncate(out - result.unicode());

    if (state) {
        if (headerdone)
            state->flags |= QTextCodec::IgnoreHeader;
        state->state_data[Endian] = endian;
        state->remainingChars = havePending ? 1 : 0;
        state->state_data[Data] = havePending ? pendingByte : 0;
    }
    return result;
}
/*
    Encodes the \a len UTF-16 code units at \a uc as UTF-32.

    \a e selects the byte order; DetectEndianness means the byte order of
    the host. A byte order mark is written first unless \a state is given
    and already has IgnoreHeader set. A high surrogate immediately followed
    by a low surrogate is combined into one code point; any other surrogate
    is written out unchanged as its own 32-bit value.

    On return \a state (if any) has IgnoreHeader set and remainingChars
    cleared.
*/
QByteArray QUtf32::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state, DataEndianness e)
{
    DataEndianness endian = e;
    if (e == DetectEndianness)
        endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
    const bool bigEndian = (endian == BigEndianness);
    const bool writeBom = !state || !(state->flags & QTextCodec::IgnoreHeader);

    // Upper bound only: every surrogate pair takes 4 bytes instead of 8.
    int length = 4*len;
    if (writeBom)
        length += 4;

    QByteArray d(length, Qt::Uninitialized);
    char *data = d.data();
    if (writeBom) {
        if (bigEndian) {
            data[0] = 0;
            data[1] = 0;
            data[2] = (char)0xfe;
            data[3] = (char)0xff;
        } else {
            data[0] = (char)0xff;
            data[1] = (char)0xfe;
            data[2] = 0;
            data[3] = 0;
        }
        data += 4;
    }

    for (int i = 0; i < len; ++i) {
        uint cp = uc[i].unicode();
        // Only merge a well-formed pair; otherwise the following character
        // would be consumed into a bogus code point.
        if (uc[i].isHighSurrogate() && i < len - 1 && uc[i + 1].isLowSurrogate())
            cp = QChar::surrogateToUcs4(cp, uc[++i].unicode());

        if (bigEndian) {
            *(data++) = char(cp >> 24);
            *(data++) = char((cp >> 16) & 0xff);
            *(data++) = char((cp >> 8) & 0xff);
            *(data++) = char(cp & 0xff);
        } else {
            *(data++) = char(cp & 0xff);
            *(data++) = char((cp >> 8) & 0xff);
            *(data++) = char((cp >> 16) & 0xff);
            *(data++) = char(cp >> 24);
        }
    }

    // Drop the part of the buffer that was reserved but never written, so
    // that no uninitialised bytes are returned.
    d.resize(int(data - d.constData()));

    if (state) {
        state->remainingChars = 0;
        state->flags |= QTextCodec::IgnoreHeader;
    }
    return d;
}
/*
    Decodes \a len bytes of UTF-32 at \a chars into a QString.

    With \a e == DetectEndianness the first complete 4-byte unit decides
    the byte order: a BOM (in either order) selects it and is dropped;
    anything else is taken to be in host byte order and decoded normally.
    With an explicit byte order a leading BOM is dropped. Only the first
    unit is treated this way; later U+FEFF values are kept.

    Values above U+10FFFF cannot be represented in UTF-16 and are replaced
    by QChar::ReplacementCharacter (QChar::Null if \a state has
    ConvertInvalidToNull set).

    \a state, if given, carries the IgnoreHeader flag, the byte order in
    state_data[Endian], the bytes of a unit split across calls starting at
    state_data[Data] with their count in remainingChars, and receives the
    number of replaced values in invalidChars.
*/
QString QUtf32::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state, DataEndianness e)
{
    DataEndianness endian = e;
    uchar tuple[4];
    int num = 0;
    bool headerdone = false;
    ushort replacement = QChar::ReplacementCharacter;
    int invalid = 0;
    if (state) {
        headerdone = state->flags & QTextCodec::IgnoreHeader;
        if (state->flags & QTextCodec::ConvertInvalidToNull)
            replacement = QChar::Null;
        if (endian == DetectEndianness)
            endian = (DataEndianness)state->state_data[Endian];
        num = state->remainingChars;
        memcpy(tuple, &state->state_data[Data], 4);
    }
    if (headerdone && endian == DetectEndianness)
        endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;

    QString result;
    result.resize((num + len) >> 2 << 1); // worst case: a surrogate pair per 4 bytes
    QChar *qch = (QChar *)result.unicode();

    const char *end = chars + len;
    while (chars < end) {
        tuple[num++] = *chars++;
        if (num < 4)
            continue;
        num = 0;

        if (!headerdone) {
            // Header handling applies to the very first unit only;
            // otherwise every U+FEFF in the stream would be swallowed.
            headerdone = true;
            if (endian == DetectEndianness) {
                if (tuple[0] == 0xff && tuple[1] == 0xfe && tuple[2] == 0 && tuple[3] == 0) {
                    endian = LittleEndianness;
                    continue;
                }
                if (tuple[0] == 0 && tuple[1] == 0 && tuple[2] == 0xfe && tuple[3] == 0xff) {
                    endian = BigEndianness;
                    continue;
                }
                // no BOM: assume host byte order and decode this unit
                endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
            } else if (((endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple)) == QChar::ByteOrderMark) {
                // explicit byte order: drop the leading BOM
                continue;
            }
        }

        const uint code = (endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple);
        if (code > 0x10ffff) {
            // outside the Unicode code space
            *qch++ = replacement;
            ++invalid;
        } else if (code >= 0x10000) {
            *qch++ = QChar::highSurrogate(code);
            *qch++ = QChar::lowSurrogate(code);
        } else {
            *qch++ = code;
        }
    }
    result.truncate(qch - result.unicode());

    if (state) {
        state->invalidChars += invalid;
        if (headerdone)
            state->flags |= QTextCodec::IgnoreHeader;
        state->state_data[Endian] = endian;
        state->remainingChars = num;
        memcpy(&state->state_data[Data], tuple, 4);
    }
    return result;
}
#ifndef QT_NO_TEXTCODEC | |
QUtf8Codec::~QUtf8Codec() | |
{ | |
} | |
QByteArray QUtf8Codec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const | |
{ | |
return QUtf8::convertFromUnicode(uc, len, state); | |
} | |
void QUtf8Codec::convertToUnicode(QString *target, const char *chars, int len, ConverterState *state) const | |
{ | |
*target += QUtf8::convertToUnicode(chars, len, state); | |
} | |
QString QUtf8Codec::convertToUnicode(const char *chars, int len, ConverterState *state) const | |
{ | |
return QUtf8::convertToUnicode(chars, len, state); | |
} | |
QByteArray QUtf8Codec::name() const | |
{ | |
return "UTF-8"; | |
} | |
int QUtf8Codec::mibEnum() const | |
{ | |
return 106; | |
} | |
QUtf16Codec::~QUtf16Codec() | |
{ | |
} | |
QByteArray QUtf16Codec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const | |
{ | |
return QUtf16::convertFromUnicode(uc, len, state, e); | |
} | |
QString QUtf16Codec::convertToUnicode(const char *chars, int len, ConverterState *state) const | |
{ | |
return QUtf16::convertToUnicode(chars, len, state, e); | |
} | |
int QUtf16Codec::mibEnum() const | |
{ | |
return 1015; | |
} | |
QByteArray QUtf16Codec::name() const | |
{ | |
return "UTF-16"; | |
} | |
QList<QByteArray> QUtf16Codec::aliases() const | |
{ | |
return QList<QByteArray>(); | |
} | |
int QUtf16BECodec::mibEnum() const | |
{ | |
return 1013; | |
} | |
QByteArray QUtf16BECodec::name() const | |
{ | |
return "UTF-16BE"; | |
} | |
QList<QByteArray> QUtf16BECodec::aliases() const | |
{ | |
QList<QByteArray> list; | |
return list; | |
} | |
int QUtf16LECodec::mibEnum() const | |
{ | |
return 1014; | |
} | |
QByteArray QUtf16LECodec::name() const | |
{ | |
return "UTF-16LE"; | |
} | |
QList<QByteArray> QUtf16LECodec::aliases() const | |
{ | |
QList<QByteArray> list; | |
return list; | |
} | |
QUtf32Codec::~QUtf32Codec() | |
{ | |
} | |
QByteArray QUtf32Codec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const | |
{ | |
return QUtf32::convertFromUnicode(uc, len, state, e); | |
} | |
QString QUtf32Codec::convertToUnicode(const char *chars, int len, ConverterState *state) const | |
{ | |
return QUtf32::convertToUnicode(chars, len, state, e); | |
} | |
int QUtf32Codec::mibEnum() const | |
{ | |
return 1017; | |
} | |
QByteArray QUtf32Codec::name() const | |
{ | |
return "UTF-32"; | |
} | |
QList<QByteArray> QUtf32Codec::aliases() const | |
{ | |
QList<QByteArray> list; | |
return list; | |
} | |
int QUtf32BECodec::mibEnum() const | |
{ | |
return 1018; | |
} | |
QByteArray QUtf32BECodec::name() const | |
{ | |
return "UTF-32BE"; | |
} | |
QList<QByteArray> QUtf32BECodec::aliases() const | |
{ | |
QList<QByteArray> list; | |
return list; | |
} | |
int QUtf32LECodec::mibEnum() const | |
{ | |
return 1019; | |
} | |
QByteArray QUtf32LECodec::name() const | |
{ | |
return "UTF-32LE"; | |
} | |
QList<QByteArray> QUtf32LECodec::aliases() const | |
{ | |
QList<QByteArray> list; | |
return list; | |
} | |
#endif //QT_NO_TEXTCODEC | |
QT_END_NAMESPACE |