| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /** |
| * $Id: XMLRecognizer.cpp 568078 2007-08-21 11:43:25Z amassari $ |
| */ |
| |
| |
| // --------------------------------------------------------------------------- |
| // Includes |
| // --------------------------------------------------------------------------- |
| #include <xercesc/framework/XMLRecognizer.hpp> |
| #include <xercesc/util/RuntimeException.hpp> |
| #include <xercesc/util/XMLString.hpp> |
| |
| XERCES_CPP_NAMESPACE_BEGIN |
| |
| // --------------------------------------------------------------------------- |
| // Local data |
| // |
| // gEncodingNameMap |
| // This array maps the Encodings enum values to their canonical names. |
| // Be sure to keep this in sync with that enum! |
| // --------------------------------------------------------------------------- |
| static const XMLCh* gEncodingNameMap[XMLRecognizer::Encodings_Count] = |
| { |
| XMLUni::fgEBCDICEncodingString |
| , XMLUni::fgUCS4BEncodingString |
| , XMLUni::fgUCS4LEncodingString |
| , XMLUni::fgUSASCIIEncodingString |
| , XMLUni::fgUTF8EncodingString |
| , XMLUni::fgUTF16BEncodingString |
| , XMLUni::fgUTF16LEncodingString |
| , XMLUni::fgXMLChEncodingString |
| }; |
| |
| |
| |
| // --------------------------------------------------------------------------- |
| // XMLRecognizer: Public, const static data |
| // |
| // fgXXXPre |
| // fgXXXPreLen |
| // The byte sequence prefixes for all of the encodings that we can |
| // auto sense. Also included is the length of each sequence. |
| // --------------------------------------------------------------------------- |
| const char XMLRecognizer::fgASCIIPre[] = { 0x3C, 0x3F, 0x78, 0x6D, 0x6C, 0x20 }; |
| const unsigned int XMLRecognizer::fgASCIIPreLen = 6; |
| const XMLByte XMLRecognizer::fgEBCDICPre[] = { 0x4C, 0x6F, 0xA7, 0x94, 0x93, 0x40 }; |
| const unsigned int XMLRecognizer::fgEBCDICPreLen = 6; |
| const XMLByte XMLRecognizer::fgUTF16BPre[] = { 0x00, 0x3C, 0x00, 0x3F, 0x00, 0x78, 0x00, 0x6D, 0x00, 0x6C, 0x00, 0x20 }; |
| const XMLByte XMLRecognizer::fgUTF16LPre[] = { 0x3C, 0x00, 0x3F, 0x00, 0x78, 0x00, 0x6D, 0x00, 0x6C, 0x00, 0x20, 0x00 }; |
| const unsigned int XMLRecognizer::fgUTF16PreLen = 12; |
| const XMLByte XMLRecognizer::fgUCS4BPre[] = |
| { |
| 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x3F |
| , 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D |
| , 0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00, 0x20 |
| }; |
| const XMLByte XMLRecognizer::fgUCS4LPre[] = |
| { |
| 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00 |
| , 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00 |
| , 0x6C, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00 |
| }; |
| const unsigned int XMLRecognizer::fgUCS4PreLen = 24; |
| |
| const char XMLRecognizer::fgUTF8BOM[] = {(char)0xEF, (char)0xBB, (char)0xBF}; |
| const unsigned int XMLRecognizer::fgUTF8BOMLen = 3; |
| |
| // --------------------------------------------------------------------------- |
| // XMLRecognizer: Encoding recognition methods |
| // --------------------------------------------------------------------------- |
| XMLRecognizer::Encodings |
| XMLRecognizer::basicEncodingProbe( const XMLByte* const rawBuffer |
| , const unsigned int rawByteCount) |
| { |
| // |
| // As an optimization to check the 90% case, check first for the ASCII |
| // sequence '<?xml', which means its either US-ASCII, UTF-8, or some |
| // other encoding that we don't do manually but which happens to share |
| // the US-ASCII code points for these characters. So just return UTF-8 |
| // to get us through the first line. |
| // |
| if (rawByteCount >= fgASCIIPreLen) |
| { |
| if (!memcmp(rawBuffer, fgASCIIPre, fgASCIIPreLen)) |
| return UTF_8; |
| } |
| |
| // |
| // If the count of raw bytes is less than 2, it cannot be anything |
| // we understand, so return UTF-8 as a fallback. |
| // |
| if (rawByteCount < 2) |
| return UTF_8; |
| |
| // |
| // We have two to four bytes, so lets check for a UTF-16 BOM. That |
| // is quick to check and enough to identify two major encodings. |
| // |
| |
| if (rawByteCount < 4) |
| { |
| if ((rawBuffer[0] == 0xFE) && (rawBuffer[1] == 0xFF)) |
| return UTF_16B; |
| else if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE)) |
| return UTF_16L; |
| else |
| return UTF_8; |
| } |
| |
| /*** |
| * F.1 Detection Without External Encoding Information |
| * |
| * Because each XML entity not accompanied by external encoding information and |
| * not in UTF-8 or UTF-16 encoding must begin with an XML encoding declaration, |
| * in which the first characters must be '<?xml', any conforming processor can detect, |
| * after two to four octets of input, which of the following cases apply. |
| * |
| * In reading this list, it may help to know that in UCS-4, '<' is "#x0000003C" and |
| * '?' is "#x0000003F", and the Byte Order Mark required of UTF-16 data streams is |
| * "#xFEFF". The notation ## is used to denote any byte value except that two consecutive |
| * ##s cannot be both 00. |
| * |
| * With a Byte Order Mark: |
| * |
| * 00 00 FE FF UCS-4, big-endian machine (1234 order) |
| * FF FE 00 00 UCS-4, little-endian machine (4321 order) |
| * 00 00 FF FE UCS-4, unusual octet order (2143) |
| * FE FF 00 00 UCS-4, unusual octet order (3412) |
| * FE FF ## ## UTF-16, big-endian |
| * FF FE ## ## UTF-16, little-endian |
| * EF BB BF UTF-8 |
| * |
| ***/ |
| |
| // |
| // We have at least four bytes, so we can check all BOM |
| // for UCS-4BE, UCS-4LE, UTF-16BE and UTF-16LE as well. |
| // |
| if ((rawBuffer[0] == 0x00) && (rawBuffer[1] == 0x00) && (rawBuffer[2] == 0xFE) && (rawBuffer[3] == 0xFF)) |
| return UCS_4B; |
| else if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE) && (rawBuffer[2] == 0x00) && (rawBuffer[3] == 0x00)) |
| return UCS_4L; |
| else if ((rawBuffer[0] == 0xFE) && (rawBuffer[1] == 0xFF)) |
| return UTF_16B; |
| else if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE)) |
| return UTF_16L; |
| |
| // |
| // We have at least 4 bytes. So lets check the 4 byte sequences that |
| // indicate other UTF-16 and UCS encodings. |
| // |
| if ((rawBuffer[0] == 0x00) || (rawBuffer[0] == 0x3C)) |
| { |
| if (rawByteCount >= fgUCS4PreLen && !memcmp(rawBuffer, fgUCS4BPre, fgUCS4PreLen)) |
| return UCS_4B; |
| else if (rawByteCount >= fgUCS4PreLen && !memcmp(rawBuffer, fgUCS4LPre, fgUCS4PreLen)) |
| return UCS_4L; |
| else if (rawByteCount >= fgUTF16PreLen && !memcmp(rawBuffer, fgUTF16BPre, fgUTF16PreLen)) |
| return UTF_16B; |
| else if (rawByteCount >= fgUTF16PreLen && !memcmp(rawBuffer, fgUTF16LPre, fgUTF16PreLen)) |
| return UTF_16L; |
| } |
| |
| // |
| // See if we have enough bytes to possibly match the EBCDIC prefix. |
| // If so, try it. |
| // |
| if (rawByteCount > fgEBCDICPreLen) |
| { |
| if (!memcmp(rawBuffer, fgEBCDICPre, fgEBCDICPreLen)) |
| return EBCDIC; |
| } |
| |
| // |
| // Does not seem to be anything we know, so go with UTF-8 to get at |
| // least through the first line and see what it really is. |
| // |
| return UTF_8; |
| } |
| |
| |
| XMLRecognizer::Encodings |
| XMLRecognizer::encodingForName(const XMLCh* const encName) |
| { |
| // |
| // Compare the passed string, assume input string is already uppercased, |
| // to the variations that we recognize. |
| // |
| // !!NOTE: Note that we don't handle EBCDIC here because we don't handle |
| // that one ourselves. It is allowed to fall into 'other'. |
| // |
| if (encName == XMLUni::fgXMLChEncodingString || |
| !XMLString::compareString(encName, XMLUni::fgXMLChEncodingString)) |
| { |
| return XMLRecognizer::XERCES_XMLCH; |
| } |
| else if (!XMLString::compareString(encName, XMLUni::fgUTF8EncodingString) |
| || !XMLString::compareString(encName, XMLUni::fgUTF8EncodingString2)) |
| { |
| return XMLRecognizer::UTF_8; |
| } |
| else if (!XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString) |
| || !XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString2) |
| || !XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString3) |
| || !XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString4)) |
| { |
| return XMLRecognizer::US_ASCII; |
| } |
| else if (!XMLString::compareString(encName, XMLUni::fgUTF16LEncodingString) |
| || !XMLString::compareString(encName, XMLUni::fgUTF16LEncodingString2)) |
| { |
| return XMLRecognizer::UTF_16L; |
| } |
| else if (!XMLString::compareString(encName, XMLUni::fgUTF16BEncodingString) |
| || !XMLString::compareString(encName, XMLUni::fgUTF16BEncodingString2)) |
| { |
| return XMLRecognizer::UTF_16B; |
| } |
| else if (!XMLString::compareString(encName, XMLUni::fgUTF16EncodingString)) |
| { |
| #if defined(ENDIANMODE_LITTLE) |
| return XMLRecognizer::UTF_16L; |
| #elif defined(ENDIANMODE_BIG) |
| return XMLRecognizer::UTF_16B; |
| #endif |
| } |
| else if (!XMLString::compareString(encName, XMLUni::fgUCS4LEncodingString) |
| || !XMLString::compareString(encName, XMLUni::fgUCS4LEncodingString2)) |
| { |
| return XMLRecognizer::UCS_4L; |
| } |
| else if (!XMLString::compareString(encName, XMLUni::fgUCS4BEncodingString) |
| || !XMLString::compareString(encName, XMLUni::fgUCS4BEncodingString2)) |
| { |
| return XMLRecognizer::UCS_4B; |
| } |
| else if (!XMLString::compareString(encName, XMLUni::fgUCS4EncodingString)) |
| { |
| #if defined(ENDIANMODE_LITTLE) |
| return XMLRecognizer::UCS_4L; |
| #elif defined(ENDIANMODE_BIG) |
| return XMLRecognizer::UCS_4B; |
| #endif |
| } |
| |
| // Return 'other' since we don't recognizer it |
| return XMLRecognizer::OtherEncoding; |
| } |
| |
| |
| const XMLCh* |
| XMLRecognizer::nameForEncoding(const XMLRecognizer::Encodings theEncoding |
| , MemoryManager* const manager) |
| { |
| if (theEncoding >= Encodings_Count) |
| ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::XMLRec_UnknownEncoding, manager); |
| |
| return gEncodingNameMap[theEncoding]; |
| } |
| |
| XERCES_CPP_NAMESPACE_END |