blob: bec8e9af2a69f11e3697cb8593b3621374354923 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* $Id: XMLRecognizer.cpp 568078 2007-08-21 11:43:25Z amassari $
*/
// ---------------------------------------------------------------------------
// Includes
// ---------------------------------------------------------------------------
#include <xercesc/framework/XMLRecognizer.hpp>
#include <xercesc/util/RuntimeException.hpp>
#include <xercesc/util/XMLString.hpp>
XERCES_CPP_NAMESPACE_BEGIN
// ---------------------------------------------------------------------------
// Local data
//
// gEncodingNameMap
// This array maps the Encodings enum values to their canonical names.
// Be sure to keep this in sync with that enum!
// ---------------------------------------------------------------------------
static const XMLCh* gEncodingNameMap[XMLRecognizer::Encodings_Count] =
{
    // One canonical name per slot. XMLRecognizer::nameForEncoding() indexes
    // this table directly with the encoding value, so the order matters.
    XMLUni::fgEBCDICEncodingString,
    XMLUni::fgUCS4BEncodingString,
    XMLUni::fgUCS4LEncodingString,
    XMLUni::fgUSASCIIEncodingString,
    XMLUni::fgUTF8EncodingString,
    XMLUni::fgUTF16BEncodingString,
    XMLUni::fgUTF16LEncodingString,
    XMLUni::fgXMLChEncodingString
};
// ---------------------------------------------------------------------------
// XMLRecognizer: Public, const static data
//
// fgXXXPre
// fgXXXPreLen
// The byte sequence prefixes for all of the encodings that we can
// auto sense, together with the length of each sequence. The UTF-8
// byte order mark and its length are defined here as well.
// ---------------------------------------------------------------------------
// The six characters '<', '?', 'x', 'm', 'l', ' ' as single-byte US-ASCII
// code points. Kept as hex values (not character literals) so the bytes are
// the same regardless of the compiler's execution character set.
const char XMLRecognizer::fgASCIIPre[] =
{
    0x3C, 0x3F, 0x78, 0x6D, 0x6C, 0x20
};
const unsigned int XMLRecognizer::fgASCIIPreLen = 6;

// The byte prefix used by basicEncodingProbe() to recognize EBCDIC input.
const XMLByte XMLRecognizer::fgEBCDICPre[] =
{
    0x4C, 0x6F, 0xA7, 0x94, 0x93, 0x40
};
const unsigned int XMLRecognizer::fgEBCDICPreLen = 6;

// The same six code points as two-byte units, high byte first ...
const XMLByte XMLRecognizer::fgUTF16BPre[] =
{
    0x00, 0x3C,
    0x00, 0x3F,
    0x00, 0x78,
    0x00, 0x6D,
    0x00, 0x6C,
    0x00, 0x20
};

// ... and low byte first.
const XMLByte XMLRecognizer::fgUTF16LPre[] =
{
    0x3C, 0x00,
    0x3F, 0x00,
    0x78, 0x00,
    0x6D, 0x00,
    0x6C, 0x00,
    0x20, 0x00
};

// Both UTF-16 prefixes have the same length in bytes.
const unsigned int XMLRecognizer::fgUTF16PreLen = 12;
// The '<?xml ' code points as four-byte units, most significant byte first.
// One code unit per row.
const XMLByte XMLRecognizer::fgUCS4BPre[] =
{
    0x00, 0x00, 0x00, 0x3C,
    0x00, 0x00, 0x00, 0x3F,
    0x00, 0x00, 0x00, 0x78,
    0x00, 0x00, 0x00, 0x6D,
    0x00, 0x00, 0x00, 0x6C,
    0x00, 0x00, 0x00, 0x20
};

// The same code points as four-byte units, least significant byte first.
const XMLByte XMLRecognizer::fgUCS4LPre[] =
{
    0x3C, 0x00, 0x00, 0x00,
    0x3F, 0x00, 0x00, 0x00,
    0x78, 0x00, 0x00, 0x00,
    0x6D, 0x00, 0x00, 0x00,
    0x6C, 0x00, 0x00, 0x00,
    0x20, 0x00, 0x00, 0x00
};

// Both UCS-4 prefixes have the same length in bytes.
const unsigned int XMLRecognizer::fgUCS4PreLen = 24;
// The three-byte UTF-8 byte order mark (EF BB BF). The values exceed the
// range of a signed char, hence the explicit conversions.
const char XMLRecognizer::fgUTF8BOM[] =
{
    static_cast<char>(0xEF),
    static_cast<char>(0xBB),
    static_cast<char>(0xBF)
};
const unsigned int XMLRecognizer::fgUTF8BOMLen = 3;
// ---------------------------------------------------------------------------
// XMLRecognizer: Encoding recognition methods
// ---------------------------------------------------------------------------
//
//  Makes a best-effort guess at the encoding of an XML entity by looking at
//  its leading raw bytes: first the plain '<?xml ' prefix, then byte order
//  marks, then the '<?xml ' prefix as it appears in the wider encodings and
//  in EBCDIC.
//
//  rawBuffer       The leading raw bytes of the entity. At most the first
//                  rawByteCount bytes are examined.
//  rawByteCount    The number of bytes available in rawBuffer.
//
//  Returns the recognized encoding. UTF_8 is returned as the fallback
//  whenever nothing more specific is recognized, including when fewer than
//  two bytes are available.
//
XMLRecognizer::Encodings
XMLRecognizer::basicEncodingProbe(  const   XMLByte* const  rawBuffer
                                    , const unsigned int    rawByteCount)
{
    //
    //  As an optimization to check the 90% case, check first for the ASCII
    //  sequence '<?xml', which means its either US-ASCII, UTF-8, or some
    //  other encoding that we don't do manually but which happens to share
    //  the US-ASCII code points for these characters. So just return UTF-8
    //  to get us through the first line.
    //
    if (rawByteCount >= fgASCIIPreLen)
    {
        if (!memcmp(rawBuffer, fgASCIIPre, fgASCIIPreLen))
            return UTF_8;
    }

    //
    //  If the count of raw bytes is less than 2, it cannot be anything
    //  we understand, so return UTF-8 as a fallback.
    //
    if (rawByteCount < 2)
        return UTF_8;

    //
    //  We have only two or three bytes, so all we can check for is a
    //  UTF-16 BOM. That is quick to check and enough to identify two
    //  major encodings.
    //
    if (rawByteCount < 4)
    {
        if ((rawBuffer[0] == 0xFE) && (rawBuffer[1] == 0xFF))
            return UTF_16B;
        else if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE))
            return UTF_16L;
        else
            return UTF_8;
    }

    /***
     * F.1 Detection Without External Encoding Information
     *
     * Because each XML entity not accompanied by external encoding information and
     * not in UTF-8 or UTF-16 encoding must begin with an XML encoding declaration,
     * in which the first characters must be '<?xml', any conforming processor can detect,
     * after two to four octets of input, which of the following cases apply.
     *
     * In reading this list, it may help to know that in UCS-4, '<' is "#x0000003C" and
     * '?' is "#x0000003F", and the Byte Order Mark required of UTF-16 data streams is
     * "#xFEFF". The notation ## is used to denote any byte value except that two consecutive
     * ##s cannot be both 00.
     *
     * With a Byte Order Mark:
     *
     *   00 00 FE FF    UCS-4, big-endian machine (1234 order)
     *   FF FE 00 00    UCS-4, little-endian machine (4321 order)
     *   00 00 FF FE    UCS-4, unusual octet order (2143)
     *   FE FF 00 00    UCS-4, unusual octet order (3412)
     *   FE FF ## ##    UTF-16, big-endian
     *   FF FE ## ##    UTF-16, little-endian
     *   EF BB BF       UTF-8
     *
     ***/

    //
    //  We have at least four bytes, so we can check all BOM
    //  for UCS-4BE, UCS-4LE, UTF-16BE and UTF-16LE as well.
    //
    //  The UCS-4 tests must come before the UTF-16 ones: FF FE 00 00 would
    //  otherwise be taken for a UTF-16LE BOM.
    //
    if ((rawBuffer[0] == 0x00) && (rawBuffer[1] == 0x00) && (rawBuffer[2] == 0xFE) && (rawBuffer[3] == 0xFF))
        return UCS_4B;
    else if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE) && (rawBuffer[2] == 0x00) && (rawBuffer[3] == 0x00))
        return UCS_4L;
    else if ((rawBuffer[0] == 0xFE) && (rawBuffer[1] == 0xFF))
        return UTF_16B;
    else if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE))
        return UTF_16L;

    //
    //  No BOM. Check for the '<?xml ' prefix as it appears in the UCS-4 and
    //  UTF-16 encodings. All four of those prefixes start with either 0x00
    //  or 0x3C, so the first byte is used to skip the comparisons cheaply.
    //  Each comparison is guarded by its own length check so that we never
    //  read past the end of the raw buffer.
    //
    if ((rawBuffer[0] == 0x00) || (rawBuffer[0] == 0x3C))
    {
        if (rawByteCount >= fgUCS4PreLen && !memcmp(rawBuffer, fgUCS4BPre, fgUCS4PreLen))
            return UCS_4B;
        else if (rawByteCount >= fgUCS4PreLen && !memcmp(rawBuffer, fgUCS4LPre, fgUCS4PreLen))
            return UCS_4L;
        else if (rawByteCount >= fgUTF16PreLen && !memcmp(rawBuffer, fgUTF16BPre, fgUTF16PreLen))
            return UTF_16B;
        else if (rawByteCount >= fgUTF16PreLen && !memcmp(rawBuffer, fgUTF16LPre, fgUTF16PreLen))
            return UTF_16L;
    }

    //
    //  See if we have enough bytes to possibly match the EBCDIC prefix.
    //  If so, try it. Only fgEBCDICPreLen bytes are compared, so a buffer
    //  of exactly that length is enough (same test as for the ASCII prefix
    //  above).
    //
    if (rawByteCount >= fgEBCDICPreLen)
    {
        if (!memcmp(rawBuffer, fgEBCDICPre, fgEBCDICPreLen))
            return EBCDIC;
    }

    //
    //  Does not seem to be anything we know, so go with UTF-8 to get at
    //  least through the first line and see what it really is.
    //
    return UTF_8;
}
//
//  Maps an encoding name onto one of the encodings that are handled
//  intrinsically.
//
//  encName     The encoding name to look up. It is compared as is against
//              the known names, so the caller is expected to have
//              uppercased it already.
//
//  Returns the matching Encodings value, or OtherEncoding if the name is
//  not one of the recognized variations.
//
//  !!NOTE: EBCDIC is deliberately not handled here because we don't handle
//          that one ourselves. It is allowed to fall into 'other'.
//
XMLRecognizer::Encodings
XMLRecognizer::encodingForName(const XMLCh* const encName)
{
    // The internal XMLCh encoding: try pointer identity first, then content.
    if (encName == XMLUni::fgXMLChEncodingString
    ||  XMLString::compareString(encName, XMLUni::fgXMLChEncodingString) == 0)
    {
        return XMLRecognizer::XERCES_XMLCH;
    }

    // UTF-8 and its alternate spelling
    if (XMLString::compareString(encName, XMLUni::fgUTF8EncodingString) == 0
    ||  XMLString::compareString(encName, XMLUni::fgUTF8EncodingString2) == 0)
    {
        return XMLRecognizer::UTF_8;
    }

    // The US-ASCII aliases
    if (XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString) == 0
    ||  XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString2) == 0
    ||  XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString3) == 0
    ||  XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString4) == 0)
    {
        return XMLRecognizer::US_ASCII;
    }

    // UTF-16 with an explicit byte order
    if (XMLString::compareString(encName, XMLUni::fgUTF16LEncodingString) == 0
    ||  XMLString::compareString(encName, XMLUni::fgUTF16LEncodingString2) == 0)
    {
        return XMLRecognizer::UTF_16L;
    }

    if (XMLString::compareString(encName, XMLUni::fgUTF16BEncodingString) == 0
    ||  XMLString::compareString(encName, XMLUni::fgUTF16BEncodingString2) == 0)
    {
        return XMLRecognizer::UTF_16B;
    }

    // UTF-16 without a byte order: use the byte order the build was
    // configured for.
    if (XMLString::compareString(encName, XMLUni::fgUTF16EncodingString) == 0)
    {
#if defined(ENDIANMODE_LITTLE)
        return XMLRecognizer::UTF_16L;
#elif defined(ENDIANMODE_BIG)
        return XMLRecognizer::UTF_16B;
#else
        return XMLRecognizer::OtherEncoding;
#endif
    }

    // UCS-4 with an explicit byte order
    if (XMLString::compareString(encName, XMLUni::fgUCS4LEncodingString) == 0
    ||  XMLString::compareString(encName, XMLUni::fgUCS4LEncodingString2) == 0)
    {
        return XMLRecognizer::UCS_4L;
    }

    if (XMLString::compareString(encName, XMLUni::fgUCS4BEncodingString) == 0
    ||  XMLString::compareString(encName, XMLUni::fgUCS4BEncodingString2) == 0)
    {
        return XMLRecognizer::UCS_4B;
    }

    // UCS-4 without a byte order: again, use the configured byte order.
    if (XMLString::compareString(encName, XMLUni::fgUCS4EncodingString) == 0)
    {
#if defined(ENDIANMODE_LITTLE)
        return XMLRecognizer::UCS_4L;
#elif defined(ENDIANMODE_BIG)
        return XMLRecognizer::UCS_4B;
#else
        return XMLRecognizer::OtherEncoding;
#endif
    }

    // Return 'other' since we don't recognize it
    return XMLRecognizer::OtherEncoding;
}
//
//  Returns the canonical name stored in gEncodingNameMap for the given
//  encoding value.
//
//  theEncoding     The encoding whose name is wanted.
//  manager         The memory manager handed to the exception macro.
//
//  Throws a RuntimeException (XMLRec_UnknownEncoding) if theEncoding is
//  at or beyond Encodings_Count, i.e. has no slot in the name table.
//
const XMLCh*
XMLRecognizer::nameForEncoding(const XMLRecognizer::Encodings theEncoding
                               , MemoryManager* const manager)
{
    const bool hasTableSlot = (theEncoding < Encodings_Count);
    if (!hasTableSlot)
    {
        ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::XMLRec_UnknownEncoding, manager);
    }

    return gEncodingNameMap[theEncoding];
}
XERCES_CPP_NAMESPACE_END