| /**************************************************************************** |
| ** |
| ** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies). |
| ** All rights reserved. |
| ** Contact: Nokia Corporation (qt-info@nokia.com) |
| ** |
| ** This file is part of the QtCore module of the Qt Toolkit. |
| ** |
| ** $QT_BEGIN_LICENSE:LGPL$ |
| ** GNU Lesser General Public License Usage |
| ** This file may be used under the terms of the GNU Lesser General Public |
| ** License version 2.1 as published by the Free Software Foundation and |
| ** appearing in the file LICENSE.LGPL included in the packaging of this |
| ** file. Please review the following information to ensure the GNU Lesser |
| ** General Public License version 2.1 requirements will be met: |
| ** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. |
| ** |
| ** In addition, as a special exception, Nokia gives you certain additional |
| ** rights. These rights are described in the Nokia Qt LGPL Exception |
| ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. |
| ** |
| ** GNU General Public License Usage |
| ** Alternatively, this file may be used under the terms of the GNU General |
| ** Public License version 3.0 as published by the Free Software Foundation |
| ** and appearing in the file LICENSE.GPL included in the packaging of this |
| ** file. Please review the following information to ensure the GNU General |
| ** Public License version 3.0 requirements will be met: |
| ** http://www.gnu.org/copyleft/gpl.html. |
| ** |
| ** Other Usage |
| ** Alternatively, this file may be used in accordance with the terms and |
| ** conditions contained in a signed written agreement between you and Nokia. |
| ** |
| ** |
| ** |
| ** |
| ** |
| ** $QT_END_LICENSE$ |
| ** |
| ****************************************************************************/ |
| |
| #include "qutfcodec_p.h" |
| #include "qlist.h" |
| #include "qendian.h" |
| #include "qchar.h" |
| |
| QT_BEGIN_NAMESPACE |
| |
// Slots of QTextCodec::ConverterState::state_data used by the UTF-16 and
// UTF-32 decoders in this file: state_data[Endian] keeps the byte order
// between calls, state_data[Data] keeps the bytes of an incomplete code unit.
enum {
    Endian = 0,
    Data = 1
};
| |
/*!
    \internal

    Returns true if \a ucs4 is a Unicode non-character, false otherwise.
*/
static inline bool isUnicodeNonCharacter(uint ucs4)
{
    // Unicode has a couple of "non-characters" that one can use internally,
    // but are not allowed to be used for text interchange.
    //
    // Those are the last two entries of each Unicode Plane (U+FFFE, U+FFFF,
    // U+1FFFE, U+1FFFF, etc.) as well as the entries between U+FDD0 and
    // U+FDEF (inclusive)

    // U+FDD0..U+FDEF is a block of 32 code points. The unsigned subtraction
    // wraps around for values below U+FDD0, so a single comparison covers
    // both ends of the range.
    return (ucs4 & 0xfffe) == 0xfffe
            || (ucs4 - 0xfdd0U) < 32;
}
| |
| QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state) |
| { |
| uchar replacement = '?'; |
| int rlen = 3*len; |
| int surrogate_high = -1; |
| if (state) { |
| if (state->flags & QTextCodec::ConvertInvalidToNull) |
| replacement = 0; |
| if (!(state->flags & QTextCodec::IgnoreHeader)) |
| rlen += 3; |
| if (state->remainingChars) |
| surrogate_high = state->state_data[0]; |
| } |
| |
| QByteArray rstr; |
| rstr.resize(rlen); |
| uchar* cursor = (uchar*)rstr.data(); |
| const QChar *ch = uc; |
| int invalid = 0; |
| if (state && !(state->flags & QTextCodec::IgnoreHeader)) { |
| *cursor++ = 0xef; |
| *cursor++ = 0xbb; |
| *cursor++ = 0xbf; |
| } |
| |
| const QChar *end = ch + len; |
| while (ch < end) { |
| uint u = ch->unicode(); |
| if (surrogate_high >= 0) { |
| if (u >= 0xdc00 && u < 0xe000) { |
| u = (surrogate_high - 0xd800)*0x400 + (u - 0xdc00) + 0x10000; |
| surrogate_high = -1; |
| } else { |
| // high surrogate without low |
| *cursor = replacement; |
| ++ch; |
| ++invalid; |
| surrogate_high = -1; |
| continue; |
| } |
| } else if (u >= 0xdc00 && u < 0xe000) { |
| // low surrogate without high |
| *cursor = replacement; |
| ++ch; |
| ++invalid; |
| continue; |
| } else if (u >= 0xd800 && u < 0xdc00) { |
| surrogate_high = u; |
| ++ch; |
| continue; |
| } |
| |
| if (u < 0x80) { |
| *cursor++ = (uchar)u; |
| } else { |
| if (u < 0x0800) { |
| *cursor++ = 0xc0 | ((uchar) (u >> 6)); |
| } else { |
| // is it one of the Unicode non-characters? |
| if (isUnicodeNonCharacter(u)) { |
| *cursor++ = replacement; |
| ++ch; |
| ++invalid; |
| continue; |
| } |
| |
| if (u > 0xffff) { |
| *cursor++ = 0xf0 | ((uchar) (u >> 18)); |
| *cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f); |
| } else { |
| *cursor++ = 0xe0 | (((uchar) (u >> 12)) & 0x3f); |
| } |
| *cursor++ = 0x80 | (((uchar) (u >> 6)) & 0x3f); |
| } |
| *cursor++ = 0x80 | ((uchar) (u&0x3f)); |
| } |
| ++ch; |
| } |
| |
| rstr.resize(cursor - (const uchar*)rstr.constData()); |
| if (state) { |
| state->invalidChars += invalid; |
| state->flags |= QTextCodec::IgnoreHeader; |
| state->remainingChars = 0; |
| if (surrogate_high >= 0) { |
| state->remainingChars = 1; |
| state->state_data[0] = surrogate_high; |
| } |
| } |
| return rstr; |
| } |
| |
| QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state) |
| { |
| bool headerdone = false; |
| ushort replacement = QChar::ReplacementCharacter; |
| int need = 0; |
| int error = -1; |
| uint uc = 0; |
| uint min_uc = 0; |
| if (state) { |
| if (state->flags & QTextCodec::IgnoreHeader) |
| headerdone = true; |
| if (state->flags & QTextCodec::ConvertInvalidToNull) |
| replacement = QChar::Null; |
| need = state->remainingChars; |
| if (need) { |
| uc = state->state_data[0]; |
| min_uc = state->state_data[1]; |
| } |
| } |
| if (!headerdone && len > 3 |
| && (uchar)chars[0] == 0xef && (uchar)chars[1] == 0xbb && (uchar)chars[2] == 0xbf) { |
| // starts with a byte order mark |
| chars += 3; |
| len -= 3; |
| headerdone = true; |
| } |
| |
| QString result(need + len + 1, Qt::Uninitialized); // worst case |
| ushort *qch = (ushort *)result.unicode(); |
| uchar ch; |
| int invalid = 0; |
| |
| for (int i = 0; i < len; ++i) { |
| ch = chars[i]; |
| if (need) { |
| if ((ch&0xc0) == 0x80) { |
| uc = (uc << 6) | (ch & 0x3f); |
| --need; |
| if (!need) { |
| // utf-8 bom composes into 0xfeff code point |
| bool nonCharacter; |
| if (!headerdone && uc == 0xfeff) { |
| // don't do anything, just skip the BOM |
| } else if (!(nonCharacter = isUnicodeNonCharacter(uc)) && uc > 0xffff && uc < 0x110000) { |
| // surrogate pair |
| Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length()); |
| *qch++ = QChar::highSurrogate(uc); |
| *qch++ = QChar::lowSurrogate(uc); |
| } else if ((uc < min_uc) || (uc >= 0xd800 && uc <= 0xdfff) || nonCharacter || uc >= 0x110000) { |
| // error: overlong sequence, UTF16 surrogate or non-character |
| *qch++ = replacement; |
| ++invalid; |
| } else { |
| *qch++ = uc; |
| } |
| headerdone = true; |
| } |
| } else { |
| // error |
| i = error; |
| *qch++ = replacement; |
| ++invalid; |
| need = 0; |
| headerdone = true; |
| } |
| } else { |
| if (ch < 128) { |
| *qch++ = ushort(ch); |
| headerdone = true; |
| } else if ((ch & 0xe0) == 0xc0) { |
| uc = ch & 0x1f; |
| need = 1; |
| error = i; |
| min_uc = 0x80; |
| headerdone = true; |
| } else if ((ch & 0xf0) == 0xe0) { |
| uc = ch & 0x0f; |
| need = 2; |
| error = i; |
| min_uc = 0x800; |
| } else if ((ch&0xf8) == 0xf0) { |
| uc = ch & 0x07; |
| need = 3; |
| error = i; |
| min_uc = 0x10000; |
| headerdone = true; |
| } else { |
| // error |
| *qch++ = replacement; |
| ++invalid; |
| headerdone = true; |
| } |
| } |
| } |
| if (!state && need > 0) { |
| // unterminated UTF sequence |
| for (int i = error; i < len; ++i) { |
| *qch++ = replacement; |
| ++invalid; |
| } |
| } |
| result.truncate(qch - (ushort *)result.unicode()); |
| if (state) { |
| state->invalidChars += invalid; |
| state->remainingChars = need; |
| if (headerdone) |
| state->flags |= QTextCodec::IgnoreHeader; |
| state->state_data[0] = need ? uc : 0; |
| state->state_data[1] = need ? min_uc : 0; |
| } |
| return result; |
| } |
| |
| QByteArray QUtf16::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state, DataEndianness e) |
| { |
| DataEndianness endian = e; |
| int length = 2*len; |
| if (!state || (!(state->flags & QTextCodec::IgnoreHeader))) { |
| length += 2; |
| } |
| if (e == DetectEndianness) { |
| endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness; |
| } |
| |
| QByteArray d; |
| d.resize(length); |
| char *data = d.data(); |
| if (!state || !(state->flags & QTextCodec::IgnoreHeader)) { |
| QChar bom(QChar::ByteOrderMark); |
| if (endian == BigEndianness) { |
| data[0] = bom.row(); |
| data[1] = bom.cell(); |
| } else { |
| data[0] = bom.cell(); |
| data[1] = bom.row(); |
| } |
| data += 2; |
| } |
| if (endian == BigEndianness) { |
| for (int i = 0; i < len; ++i) { |
| *(data++) = uc[i].row(); |
| *(data++) = uc[i].cell(); |
| } |
| } else { |
| for (int i = 0; i < len; ++i) { |
| *(data++) = uc[i].cell(); |
| *(data++) = uc[i].row(); |
| } |
| } |
| |
| if (state) { |
| state->remainingChars = 0; |
| state->flags |= QTextCodec::IgnoreHeader; |
| } |
| return d; |
| } |
| |
| QString QUtf16::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state, DataEndianness e) |
| { |
| DataEndianness endian = e; |
| bool half = false; |
| uchar buf = 0; |
| bool headerdone = false; |
| if (state) { |
| headerdone = state->flags & QTextCodec::IgnoreHeader; |
| if (endian == DetectEndianness) |
| endian = (DataEndianness)state->state_data[Endian]; |
| if (state->remainingChars) { |
| half = true; |
| buf = state->state_data[Data]; |
| } |
| } |
| if (headerdone && endian == DetectEndianness) |
| endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness; |
| |
| QString result(len, Qt::Uninitialized); // worst case |
| QChar *qch = (QChar *)result.unicode(); |
| while (len--) { |
| if (half) { |
| QChar ch; |
| if (endian == LittleEndianness) { |
| ch.setRow(*chars++); |
| ch.setCell(buf); |
| } else { |
| ch.setRow(buf); |
| ch.setCell(*chars++); |
| } |
| if (!headerdone) { |
| headerdone = true; |
| if (endian == DetectEndianness) { |
| if (ch == QChar::ByteOrderSwapped) { |
| endian = LittleEndianness; |
| } else if (ch == QChar::ByteOrderMark) { |
| endian = BigEndianness; |
| } else { |
| if (QSysInfo::ByteOrder == QSysInfo::BigEndian) { |
| endian = BigEndianness; |
| } else { |
| endian = LittleEndianness; |
| ch = QChar((ch.unicode() >> 8) | ((ch.unicode() & 0xff) << 8)); |
| } |
| *qch++ = ch; |
| } |
| } else if (ch != QChar::ByteOrderMark) { |
| *qch++ = ch; |
| } |
| } else { |
| *qch++ = ch; |
| } |
| half = false; |
| } else { |
| buf = *chars++; |
| half = true; |
| } |
| } |
| result.truncate(qch - result.unicode()); |
| |
| if (state) { |
| if (headerdone) |
| state->flags |= QTextCodec::IgnoreHeader; |
| state->state_data[Endian] = endian; |
| if (half) { |
| state->remainingChars = 1; |
| state->state_data[Data] = buf; |
| } else { |
| state->remainingChars = 0; |
| state->state_data[Data] = 0; |
| } |
| } |
| return result; |
| } |
| |
| QByteArray QUtf32::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state, DataEndianness e) |
| { |
| DataEndianness endian = e; |
| int length = 4*len; |
| if (!state || (!(state->flags & QTextCodec::IgnoreHeader))) { |
| length += 4; |
| } |
| if (e == DetectEndianness) { |
| endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness; |
| } |
| |
| QByteArray d(length, Qt::Uninitialized); |
| char *data = d.data(); |
| if (!state || !(state->flags & QTextCodec::IgnoreHeader)) { |
| if (endian == BigEndianness) { |
| data[0] = 0; |
| data[1] = 0; |
| data[2] = (char)0xfe; |
| data[3] = (char)0xff; |
| } else { |
| data[0] = (char)0xff; |
| data[1] = (char)0xfe; |
| data[2] = 0; |
| data[3] = 0; |
| } |
| data += 4; |
| } |
| if (endian == BigEndianness) { |
| for (int i = 0; i < len; ++i) { |
| uint cp = uc[i].unicode(); |
| if (uc[i].isHighSurrogate() && i < len - 1) |
| cp = QChar::surrogateToUcs4(cp, uc[++i].unicode()); |
| *(data++) = cp >> 24; |
| *(data++) = (cp >> 16) & 0xff; |
| *(data++) = (cp >> 8) & 0xff; |
| *(data++) = cp & 0xff; |
| } |
| } else { |
| for (int i = 0; i < len; ++i) { |
| uint cp = uc[i].unicode(); |
| if (uc[i].isHighSurrogate() && i < len - 1) |
| cp = QChar::surrogateToUcs4(cp, uc[++i].unicode()); |
| *(data++) = cp & 0xff; |
| *(data++) = (cp >> 8) & 0xff; |
| *(data++) = (cp >> 16) & 0xff; |
| *(data++) = cp >> 24; |
| } |
| } |
| |
| if (state) { |
| state->remainingChars = 0; |
| state->flags |= QTextCodec::IgnoreHeader; |
| } |
| return d; |
| } |
| |
| QString QUtf32::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state, DataEndianness e) |
| { |
| DataEndianness endian = e; |
| uchar tuple[4]; |
| int num = 0; |
| bool headerdone = false; |
| if (state) { |
| headerdone = state->flags & QTextCodec::IgnoreHeader; |
| if (endian == DetectEndianness) { |
| endian = (DataEndianness)state->state_data[Endian]; |
| } |
| num = state->remainingChars; |
| memcpy(tuple, &state->state_data[Data], 4); |
| } |
| if (headerdone && endian == DetectEndianness) |
| endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness; |
| |
| QString result; |
| result.resize((num + len) >> 2 << 1); // worst case |
| QChar *qch = (QChar *)result.unicode(); |
| |
| const char *end = chars + len; |
| while (chars < end) { |
| tuple[num++] = *chars++; |
| if (num == 4) { |
| if (!headerdone) { |
| if (endian == DetectEndianness) { |
| if (endian == DetectEndianness) { |
| if (tuple[0] == 0xff && tuple[1] == 0xfe && tuple[2] == 0 && tuple[3] == 0 && endian != BigEndianness) { |
| endian = LittleEndianness; |
| num = 0; |
| continue; |
| } else if (tuple[0] == 0 && tuple[1] == 0 && tuple[2] == 0xfe && tuple[3] == 0xff && endian != LittleEndianness) { |
| endian = BigEndianness; |
| num = 0; |
| continue; |
| } else if (QSysInfo::ByteOrder == QSysInfo::BigEndian) { |
| endian = BigEndianness; |
| } else { |
| endian = LittleEndianness; |
| } |
| } |
| } else if (((endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple)) == QChar::ByteOrderMark) { |
| num = 0; |
| continue; |
| } |
| } |
| uint code = (endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple); |
| if (code >= 0x10000) { |
| *qch++ = QChar::highSurrogate(code); |
| *qch++ = QChar::lowSurrogate(code); |
| } else { |
| *qch++ = code; |
| } |
| num = 0; |
| } |
| } |
| result.truncate(qch - result.unicode()); |
| |
| if (state) { |
| if (headerdone) |
| state->flags |= QTextCodec::IgnoreHeader; |
| state->state_data[Endian] = endian; |
| state->remainingChars = num; |
| memcpy(&state->state_data[Data], tuple, 4); |
| } |
| return result; |
| } |
| |
| |
| #ifndef QT_NO_TEXTCODEC |
| |
| QUtf8Codec::~QUtf8Codec() |
| { |
| } |
| |
| QByteArray QUtf8Codec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const |
| { |
| return QUtf8::convertFromUnicode(uc, len, state); |
| } |
| |
| void QUtf8Codec::convertToUnicode(QString *target, const char *chars, int len, ConverterState *state) const |
| { |
| *target += QUtf8::convertToUnicode(chars, len, state); |
| } |
| |
| QString QUtf8Codec::convertToUnicode(const char *chars, int len, ConverterState *state) const |
| { |
| return QUtf8::convertToUnicode(chars, len, state); |
| } |
| |
| QByteArray QUtf8Codec::name() const |
| { |
| return "UTF-8"; |
| } |
| |
| int QUtf8Codec::mibEnum() const |
| { |
| return 106; |
| } |
| |
| QUtf16Codec::~QUtf16Codec() |
| { |
| } |
| |
| QByteArray QUtf16Codec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const |
| { |
| return QUtf16::convertFromUnicode(uc, len, state, e); |
| } |
| |
| QString QUtf16Codec::convertToUnicode(const char *chars, int len, ConverterState *state) const |
| { |
| return QUtf16::convertToUnicode(chars, len, state, e); |
| } |
| |
| int QUtf16Codec::mibEnum() const |
| { |
| return 1015; |
| } |
| |
| QByteArray QUtf16Codec::name() const |
| { |
| return "UTF-16"; |
| } |
| |
| QList<QByteArray> QUtf16Codec::aliases() const |
| { |
| return QList<QByteArray>(); |
| } |
| |
| int QUtf16BECodec::mibEnum() const |
| { |
| return 1013; |
| } |
| |
| QByteArray QUtf16BECodec::name() const |
| { |
| return "UTF-16BE"; |
| } |
| |
| QList<QByteArray> QUtf16BECodec::aliases() const |
| { |
| QList<QByteArray> list; |
| return list; |
| } |
| |
| int QUtf16LECodec::mibEnum() const |
| { |
| return 1014; |
| } |
| |
| QByteArray QUtf16LECodec::name() const |
| { |
| return "UTF-16LE"; |
| } |
| |
| QList<QByteArray> QUtf16LECodec::aliases() const |
| { |
| QList<QByteArray> list; |
| return list; |
| } |
| |
| QUtf32Codec::~QUtf32Codec() |
| { |
| } |
| |
| QByteArray QUtf32Codec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const |
| { |
| return QUtf32::convertFromUnicode(uc, len, state, e); |
| } |
| |
| QString QUtf32Codec::convertToUnicode(const char *chars, int len, ConverterState *state) const |
| { |
| return QUtf32::convertToUnicode(chars, len, state, e); |
| } |
| |
| int QUtf32Codec::mibEnum() const |
| { |
| return 1017; |
| } |
| |
| QByteArray QUtf32Codec::name() const |
| { |
| return "UTF-32"; |
| } |
| |
| QList<QByteArray> QUtf32Codec::aliases() const |
| { |
| QList<QByteArray> list; |
| return list; |
| } |
| |
| int QUtf32BECodec::mibEnum() const |
| { |
| return 1018; |
| } |
| |
| QByteArray QUtf32BECodec::name() const |
| { |
| return "UTF-32BE"; |
| } |
| |
| QList<QByteArray> QUtf32BECodec::aliases() const |
| { |
| QList<QByteArray> list; |
| return list; |
| } |
| |
| int QUtf32LECodec::mibEnum() const |
| { |
| return 1019; |
| } |
| |
| QByteArray QUtf32LECodec::name() const |
| { |
| return "UTF-32LE"; |
| } |
| |
| QList<QByteArray> QUtf32LECodec::aliases() const |
| { |
| QList<QByteArray> list; |
| return list; |
| } |
| |
| #endif //QT_NO_TEXTCODEC |
| |
| QT_END_NAMESPACE |