blob: 8bebcfbbc3310ace4fd1a9693f6e8d2eb996b85d [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Id: XMLReader.cpp 568078 2007-08-21 11:43:25Z amassari $
*/
// ---------------------------------------------------------------------------
// Includes
// ---------------------------------------------------------------------------
#include <xercesc/internal/XMLReader.hpp>
#include <xercesc/util/BitOps.hpp>
#include <xercesc/util/BinInputStream.hpp>
#include <xercesc/util/PlatformUtils.hpp>
#include <xercesc/util/RuntimeException.hpp>
#include <xercesc/util/TransService.hpp>
#include <xercesc/util/XMLEBCDICTranscoder.hpp>
#include <xercesc/util/XMLString.hpp>
#include <xercesc/util/Janitor.hpp>
XERCES_CPP_NAMESPACE_BEGIN
// ---------------------------------------------------------------------------
// XMLReader: Query Methods
// ---------------------------------------------------------------------------
//
//  Reports whether every one of the 'count' characters starting at 'toCheck'
//  carries the whitespace flag in the character classification table. The
//  scan stops at the first character that does not. A zero-length range
//  yields true.
//
bool XMLReader::isAllSpaces(const   XMLCh* const    toCheck
                            , const unsigned int    count) const
{
    for (unsigned int index = 0; index < count; index++)
    {
        const bool isSpace = (fgCharCharsTable[toCheck[index]] & gWhitespaceCharMask) != 0;
        if (!isSpace)
            return false;
    }

    // Nothing but whitespace (or nothing at all) was seen
    return true;
}
//
//  Reports whether at least one of the 'count' characters starting at
//  'toCheck' carries the whitespace flag in the character classification
//  table. The scan stops at the first whitespace found. A zero-length range
//  yields false.
//
bool XMLReader::containsWhiteSpace(const   XMLCh* const    toCheck
                                   , const unsigned int    count) const
{
    for (unsigned int index = 0; index < count; index++)
    {
        const bool isSpace = (fgCharCharsTable[toCheck[index]] & gWhitespaceCharMask) != 0;
        if (isSpace)
            return true;
    }

    // No whitespace anywhere in the range
    return false;
}
//
//  Public id character check. This is not on a hot path, so it simply
//  forwards to the XMLChar utility class that matches the XML version this
//  reader is currently set to.
//
bool XMLReader::isPublicIdChar(const XMLCh toCheck) const
{
    return (fXMLVersion == XMLV1_1) ? XMLChar1_1::isPublicIdChar(toCheck)
                                    : XMLChar1_0::isPublicIdChar(toCheck);
}
// ---------------------------------------------------------------------------
// XMLReader: Constructors and Destructor
// ---------------------------------------------------------------------------
//
// Constructs a reader whose encoding is not forced (fForcedEncoding is
// false). The encoding family is probed from the first raw bytes and no
// transcoder is created here.
//
// pubId, sysId    - replicated into fPublicId / fSystemId via 'manager'
// streamToAdopt   - stored in fStream; the destructor deletes it
// from, type, source, throwAtEnd, calculateSrcOfs
//                 - copied verbatim into the corresponding members
// version         - handed to setXMLVersion()
// manager         - memory manager used for every allocation of this reader
//
// NOTE(review): anything that throws from this body (including the debug-only
// check below) skips the destructor, so strings replicated up to that point
// are not released - confirm whether callers compensate.
//
XMLReader::XMLReader(const XMLCh* const pubId
, const XMLCh* const sysId
, BinInputStream* const streamToAdopt
, const RefFrom from
, const Types type
, const Sources source
, const bool throwAtEnd
, const bool calculateSrcOfs
, const XMLVersion version
, MemoryManager* const manager) :
fCharIndex(0)
, fCharsAvail(0)
, fCurCol(1)
, fCurLine(1)
, fEncodingStr(0)
, fForcedEncoding(false)
, fNoMore(false)
, fPublicId(XMLString::replicate(pubId, manager))
, fRawBufIndex(0)
, fRawBytesAvail(0)
, fReaderNum(0xFFFFFFFF)
, fRefFrom(from)
, fSentTrailingSpace(false)
, fSource(source)
, fSrcOfsBase(0)
, fSrcOfsSupported(false)
, fCalculateSrcOfs(calculateSrcOfs)
, fSystemId(XMLString::replicate(sysId, manager))
, fStream(streamToAdopt)
, fSwapped(false)
, fThrowAtEnd(throwAtEnd)
, fTranscoder(0)
, fType(type)
, fMemoryManager(manager)
{
// fEncoding is not in the initializer list; it is assigned below once the
// raw bytes have been probed.
setXMLVersion(version);
// Do an initial load of raw bytes
refreshRawBuffer();
// Ask the transcoding service if it supports src offset info
fSrcOfsSupported = XMLPlatformUtils::fgTransService->supportsSrcOfs();
//
// Use the recognizer class to get a basic sense of what family of
// encodings this file is in. We'll start off with a reader of that
// type, and update it later if needed when we read the XMLDecl line.
//
fEncoding = XMLRecognizer::basicEncodingProbe(fRawByteBuf, fRawBytesAvail);
#if defined(XERCES_DEBUG)
// Debug builds only: reject a probe result outside the known enum range
if ((fEncoding < XMLRecognizer::Encodings_Min)
|| (fEncoding > XMLRecognizer::Encodings_Max))
{
ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Reader_BadAutoEncoding, fMemoryManager);
}
#endif
// Keep our own copy of the name that corresponds to the probed encoding
fEncodingStr = XMLString::replicate(XMLRecognizer::nameForEncoding(fEncoding, fMemoryManager), fMemoryManager);
// Check whether the fSwapped flag should be set or not
checkForSwapped();
//
// This will check to see if the first line is an XMLDecl and, if
// so, decode that first line manually one character at a time. This
// leaves enough characters in the buffer that the high level code
// can get through the Decl and call us back with the real encoding.
//
doInitDecode();
//
// NOTE: We won't create a transcoder until we either get a call to
// setEncoding() or we get a call to refreshCharBuffer() and no
// transcoder has been set yet.
//
}
//
//  Constructs a reader whose encoding is forced by name (fForcedEncoding is
//  true). A transcoder for that encoding is created immediately.
//
//  pubId, sysId    - replicated into fPublicId / fSystemId via 'manager'
//  streamToAdopt   - stored in fStream; the destructor deletes it
//  encodingStr     - encoding name; replicated and upper-cased (ASCII) before
//                    being mapped to one of the recognizer's enum values
//  from, type, source, throwAtEnd, calculateSrcOfs
//                  - copied verbatim into the corresponding members
//  version         - handed to setXMLVersion()
//  manager         - memory manager used for every allocation of this reader
//
//  Throws TranscodingException (Trans_CantCreateCvtrFor) when no transcoder
//  can be created for the requested encoding.
//
XMLReader::XMLReader(const  XMLCh* const            pubId
                    , const XMLCh* const            sysId
                    ,       BinInputStream* const   streamToAdopt
                    , const XMLCh* const            encodingStr
                    , const RefFrom                 from
                    , const Types                   type
                    , const Sources                 source
                    , const bool                    throwAtEnd
                    , const bool                    calculateSrcOfs
                    , const XMLVersion              version
                    ,       MemoryManager* const    manager) :

    fCharIndex(0)
    , fCharsAvail(0)
    , fCurCol(1)
    , fCurLine(1)
    , fEncoding(XMLRecognizer::UTF_8)
    , fEncodingStr(0)
    , fForcedEncoding(true)
    , fNoMore(false)
    , fPublicId(XMLString::replicate(pubId, manager))
    , fRawBufIndex(0)
    , fRawBytesAvail(0)
    , fReaderNum(0xFFFFFFFF)
    , fRefFrom(from)
    , fSentTrailingSpace(false)
    , fSource(source)
    , fSrcOfsBase(0)
    , fSrcOfsSupported(false)
    , fCalculateSrcOfs(calculateSrcOfs)
    , fSystemId(XMLString::replicate(sysId, manager))
    , fStream(streamToAdopt)
    , fSwapped(false)
    , fThrowAtEnd(throwAtEnd)
    , fTranscoder(0)
    , fType(type)
    , fMemoryManager(manager)
{
    setXMLVersion(version);

    // Do an initial load of raw bytes
    refreshRawBuffer();

    // Copy the encoding string to our member
    fEncodingStr = XMLString::replicate(encodingStr, fMemoryManager);
    XMLString::upperCaseASCII(fEncodingStr);

    // Ask the transcoding service if it supports src offset info
    fSrcOfsSupported = XMLPlatformUtils::fgTransService->supportsSrcOfs();

    //
    //  Map the passed encoding name to one of our enums. If it does not
    //  match one of the intrinsic encodings, it will come back 'other',
    //  which tells us to create a transcoder based reader.
    //
    fEncoding = XMLRecognizer::encodingForName(fEncodingStr);

    // Test for the presence of a BOM and step over it in the raw buffer
    switch(fEncoding)
    {
        case XMLRecognizer::UCS_4B :
        case XMLRecognizer::UCS_4L :
        {
            //
            //  Both byte-order variants must sit behind the length guard.
            //  Because && binds tighter than ||, the two BOM patterns are
            //  grouped explicitly so the guard applies to each of them.
            //
            if (fRawBytesAvail > 4 &&
                (((fRawByteBuf[0] == 0x00) && (fRawByteBuf[1] == 0x00) && (fRawByteBuf[2] == 0xFE) && (fRawByteBuf[3] == 0xFF)) ||
                 ((fRawByteBuf[0] == 0xFF) && (fRawByteBuf[1] == 0xFE) && (fRawByteBuf[2] == 0x00) && (fRawByteBuf[3] == 0x00))))
            {
                fRawBufIndex += 4;
            }
            break;
        }
        case XMLRecognizer::UTF_8 :
        {
            // Look at the raw buffer as short chars
            const char* asChars = (const char*)fRawByteBuf;
            if (fRawBytesAvail > XMLRecognizer::fgUTF8BOMLen &&
                XMLString::compareNString(  asChars
                                            , XMLRecognizer::fgUTF8BOM
                                            , XMLRecognizer::fgUTF8BOMLen) == 0)
            {
                fRawBufIndex += XMLRecognizer::fgUTF8BOMLen;
            }
            break;
        }
        case XMLRecognizer::UTF_16B :
        case XMLRecognizer::UTF_16L :
        {
            // Not enough bytes to hold a marker
            if (fRawBytesAvail < 2)
                break;

            const UTF16Ch* asUTF16 = (const UTF16Ch*)&fRawByteBuf[fRawBufIndex];
            if ((*asUTF16 == chUnicodeMarker) || (*asUTF16 == chSwappedUnicodeMarker))
            {
                fRawBufIndex += sizeof(UTF16Ch);
            }
            break;
        }
    }

    // Check whether the fSwapped flag should be set or not
    checkForSwapped();

    //
    //  Create a transcoder for the encoding. Since the encoding has been
    //  forced, this will be the one we will use, period.
    //
    XMLTransService::Codes failReason;
    if (fEncoding == XMLRecognizer::OtherEncoding)
    {
        //
        //  fEncodingStr not pre-recognized, use it
        //  directly for transcoder
        //
        fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
        (
            fEncodingStr
            , failReason
            , kCharBufSize
            , fMemoryManager
        );
    }
    else
    {
        //
        //  Use the recognized fEncoding to create the transcoder
        //
        fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
        (
            fEncoding
            , failReason
            , kCharBufSize
            , fMemoryManager
        );
    }

    if (!fTranscoder)
    {
        //
        //  The destructor does not run for an object whose constructor
        //  throws, so the strings replicated above would leak. fEncodingStr
        //  is still needed to build the exception, so the cleanup is done
        //  while the exception is in flight and then it is rethrown.
        //
        //  NOTE(review): fStream is deliberately left alone here; confirm
        //  who owns the adopted stream when construction fails.
        //
        try
        {
            ThrowXMLwithMemMgr1
            (
                TranscodingException
                , XMLExcepts::Trans_CantCreateCvtrFor
                , fEncodingStr
                , fMemoryManager
            );
        }
        catch (...)
        {
            fMemoryManager->deallocate(fEncodingStr);
            fMemoryManager->deallocate(fPublicId);
            fMemoryManager->deallocate(fSystemId);
            throw;
        }
    }

    //
    //  Note that, unlike the auto-sensing constructor, we do not do an
    //  initial decode of the first line. We take the caller's word that the
    //  encoding is correct and just assume that the first bulk decode
    //  (kicked off by the first get of a character) will work.
    //
    //  So we do here the slipping in of the leading space if required.
    //
    if ((fType == Type_PE) && (fRefFrom == RefFrom_NonLiteral))
    {
        // This represents no data from the source
        fCharSizeBuf[fCharsAvail] = 0;
        fCharOfsBuf[fCharsAvail] = 0;
        fCharBuf[fCharsAvail++] = chSpace;
    }
}
//
//  Constructs a reader whose encoding is forced by enum value
//  (fForcedEncoding is true). A transcoder for that encoding is created
//  immediately.
//
//  pubId, sysId    - replicated into fPublicId / fSystemId via 'manager'
//  streamToAdopt   - stored in fStream; the destructor deletes it
//  encodingEnum    - encoding to use; its name is looked up and stored in
//                    fEncodingStr
//  from, type, source, throwAtEnd, calculateSrcOfs
//                  - copied verbatim into the corresponding members
//  version         - handed to setXMLVersion()
//  manager         - memory manager used for every allocation of this reader
//
//  Throws TranscodingException (Trans_CantCreateCvtrFor) when no transcoder
//  can be created for the requested encoding.
//
XMLReader::XMLReader(const  XMLCh* const            pubId
                    , const XMLCh* const            sysId
                    ,       BinInputStream* const   streamToAdopt
                    , XMLRecognizer::Encodings      encodingEnum
                    , const RefFrom                 from
                    , const Types                   type
                    , const Sources                 source
                    , const bool                    throwAtEnd
                    , const bool                    calculateSrcOfs
                    , const XMLVersion              version
                    ,       MemoryManager* const    manager) :

    fCharIndex(0)
    , fCharsAvail(0)
    , fCurCol(1)
    , fCurLine(1)
    , fEncoding(XMLRecognizer::UTF_8)
    , fEncodingStr(0)
    , fForcedEncoding(true)
    , fNoMore(false)
    , fPublicId(XMLString::replicate(pubId, manager))
    , fRawBufIndex(0)
    , fRawBytesAvail(0)
    , fReaderNum(0xFFFFFFFF)
    , fRefFrom(from)
    , fSentTrailingSpace(false)
    , fSource(source)
    , fSrcOfsBase(0)
    , fSrcOfsSupported(false)
    , fCalculateSrcOfs(calculateSrcOfs)
    , fSystemId(XMLString::replicate(sysId, manager))
    , fStream(streamToAdopt)
    , fSwapped(false)
    , fThrowAtEnd(throwAtEnd)
    , fTranscoder(0)
    , fType(type)
    , fMemoryManager(manager)
{
    setXMLVersion(version);

    // Do an initial load of raw bytes
    refreshRawBuffer();

    // Ask the transcoding service if it supports src offset info
    fSrcOfsSupported = XMLPlatformUtils::fgTransService->supportsSrcOfs();

    //
    //  Use the passed encoding code and keep a copy of its name
    //
    fEncoding = encodingEnum;
    fEncodingStr = XMLString::replicate(XMLRecognizer::nameForEncoding(fEncoding, fMemoryManager), fMemoryManager);

    // Check whether the fSwapped flag should be set or not
    checkForSwapped();

    //
    //  Create a transcoder for the encoding. Since the encoding has been
    //  forced, this will be the one we will use, period.
    //
    XMLTransService::Codes failReason;
    fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
    (
        fEncoding
        , failReason
        , kCharBufSize
        , fMemoryManager
    );

    if (!fTranscoder)
    {
        //
        //  The destructor does not run for an object whose constructor
        //  throws, so the strings replicated above would leak. fEncodingStr
        //  is still needed to build the exception, so the cleanup is done
        //  while the exception is in flight and then it is rethrown.
        //
        //  NOTE(review): fStream is deliberately left alone here; confirm
        //  who owns the adopted stream when construction fails.
        //
        try
        {
            ThrowXMLwithMemMgr1
            (
                TranscodingException
                , XMLExcepts::Trans_CantCreateCvtrFor
                , fEncodingStr
                , fMemoryManager
            );
        }
        catch (...)
        {
            fMemoryManager->deallocate(fEncodingStr);
            fMemoryManager->deallocate(fPublicId);
            fMemoryManager->deallocate(fSystemId);
            throw;
        }
    }

    //
    //  Note that, unlike the auto-sensing constructor, we do not do an
    //  initial decode of the first line. We take the caller's word that the
    //  encoding is correct and just assume that the first bulk decode
    //  (kicked off by the first get of a character) will work.
    //
    //  So we do here the slipping in of the leading space if required.
    //
    if ((fType == Type_PE) && (fRefFrom == RefFrom_NonLiteral))
    {
        // This represents no data from the source
        fCharSizeBuf[fCharsAvail] = 0;
        fCharOfsBuf[fCharsAvail] = 0;
        fCharBuf[fCharsAvail++] = chSpace;
    }
}
//
//  Releases everything the reader owns: the three strings go back to the
//  memory manager, then the adopted stream and the transcoder are deleted.
//
XMLReader::~XMLReader()
{
    XMLCh* const ownedStrings[] = { fEncodingStr, fPublicId, fSystemId };
    for (unsigned int index = 0; index < 3; index++)
        fMemoryManager->deallocate(ownedStrings[index]);

    delete fStream;
    delete fTranscoder;
}
// ---------------------------------------------------------------------------
// XMLReader: Character buffer management methods
// ---------------------------------------------------------------------------
//
//  Returns the source offset of the current read position: the base offset
//  plus whatever has been consumed from the current character buffer.
//
//  Throws RuntimeException (Reader_SrcOfsNotSupported) unless source offsets
//  are both supported and being calculated.
//
unsigned int XMLReader::getSrcOffset() const
{
    if (!(fSrcOfsSupported && fCalculateSrcOfs))
        ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Reader_SrcOfsNotSupported, fMemoryManager);

    // Nothing taken from this buffer yet: the base offset is the answer
    unsigned int eatenInBuf = 0;
    if (fCharIndex != 0)
    {
        if (fCharIndex < fCharsAvail)
        {
            // Still inside the buffer, the per-char offset table has it
            eatenInBuf = fCharOfsBuf[fCharIndex];
        }
        else
        {
            // At the end of the buffer: offset of the last char plus its size
            eatenInBuf = fCharOfsBuf[fCharIndex-1] + fCharSizeBuf[fCharIndex-1];
        }
    }
    return fSrcOfsBase + eatenInBuf;
}
//
//  Refills the character buffer. Any unread characters are moved to the
//  front, more characters are transcoded in behind them, and the read index
//  is reset to zero. Returns true if the buffer holds at least one character
//  afterwards, false otherwise (in which case fNoMore is set).
//
//  Throws RuntimeException (Reader_EncodingStrRequired) if a transcoder has
//  to be created but the encoding was sensed as EBCDIC, and
//  TranscodingException (Trans_CantCreateCvtrFor) if the transcoder cannot
//  be created.
//
bool XMLReader::refreshCharBuffer()
{
    // If the no more flag is set, then don't bother doing anything
    if (fNoMore)
        return false;

    // Number of chars still unread in the buffer
    const unsigned int spareChars = fCharsAvail - fCharIndex;

    // A completely full buffer needs no refill
    if (spareChars == kCharBufSize)
        return true;

    //
    //  No transcoder yet means no encoding="" string was ever seen and the
    //  encoding was not forced, so create one now from the current encoding
    //  name. It will not change from here on.
    //
    //  If EBCDIC was autosensed, a missing encoding string is an error,
    //  since we cannot know which EBCDIC variant this is.
    //
    if (!fTranscoder)
    {
        if (fEncoding == XMLRecognizer::EBCDIC)
            ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Reader_EncodingStrRequired, fMemoryManager);

        // Ask the transcoding service to make us a transcoder
        XMLTransService::Codes failReason;
        fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
        (
            fEncodingStr
            , failReason
            , kCharBufSize
            , fMemoryManager
        );

        if (!fTranscoder)
        {
            ThrowXMLwithMemMgr1
            (
                TranscodingException
                , XMLExcepts::Trans_CantCreateCvtrFor
                , fEncodingStr
                , fMemoryManager
            );
        }
    }

    // Fold the source bytes of the chars already consumed into the base offset
    if (fCalculateSrcOfs)
    {
        for (unsigned int eatenInd = 0; eatenInd < fCharIndex; eatenInd++)
            fSrcOfsBase += fCharSizeBuf[eatenInd];
    }

    //
    //  Slide any unread chars, and their sizes, down to the bottom of the
    //  buffers. The write cursor ends up just past the last one moved.
    //
    unsigned int writeInd = 0;
    unsigned int readInd = fCharIndex;
    while (readInd < fCharsAvail)
    {
        fCharBuf[writeInd] = fCharBuf[readInd];
        fCharSizeBuf[writeInd] = fCharSizeBuf[readInd];
        writeInd++;
        readInd++;
    }

    //
    //  Transcode more chars in behind the ones we kept, and count the kept
    //  ones back in.
    //
    fCharsAvail = xcodeMoreChars
    (
        &fCharBuf[writeInd]
        , &fCharSizeBuf[writeInd]
        , kCharBufSize - spareChars
    ) + spareChars;

    // Reading starts again from the 0th char
    fCharIndex = 0;

    if (!fCharsAvail)
    {
        //
        //  Out of data. A PE reader that is not being expanded inside a
        //  literal hands out one trailing space first; the flag makes sure
        //  that happens only once.
        //
        if ((fType == Type_PE)
        &&  (fRefFrom == RefFrom_NonLiteral)
        &&  !fSentTrailingSpace)
        {
            fCharBuf[0] = chSpace;
            fCharsAvail = 1;
            fSentTrailingSpace = true;
        }
    }

    //
    //  Still nothing? Remember that, so later calls can return at once
    //  without checking again.
    //
    if (!fCharsAvail)
        fNoMore = true;

    // Rebuild the per-char offsets from the per-char sizes
    if (fCalculateSrcOfs)
    {
        fCharOfsBuf[0] = 0;
        for (unsigned int ofsInd = 1; ofsInd < fCharsAvail; ++ofsInd)
            fCharOfsBuf[ofsInd] = fCharOfsBuf[ofsInd-1] + fCharSizeBuf[ofsInd-1];
    }

    return (fCharsAvail != 0);
}
// ---------------------------------------------------------------------------
// XMLReader: Scanning methods
// ---------------------------------------------------------------------------
//
// Scans a name (or, when 'token' is true, a name token) at the current
// position and appends the accepted characters to 'toFill'.
//
// toFill - buffer the accepted characters are appended to
// token  - when false the first character must pass the first-name-char
//          check; when true that check is skipped
//
// Returns false if no characters could be loaded or the first-character
// check fails; otherwise returns whether 'toFill' is non-empty on exit. The
// column counter is advanced by the number of characters consumed.
//
bool XMLReader::getName(XMLBuffer& toFill, const bool token)
{
// Ok, first lets see if we have chars in the buffer. If not, then lets
// reload.
if (fCharIndex == fCharsAvail)
{
if (!refreshCharBuffer())
return false;
}
// Start of the run of accepted chars that has not been copied out yet
unsigned int charIndex_start = fCharIndex;
// Check the first char for being a first name char and give up right away
// if it is not. We only do this if its a name and not a name token that
// they want.
if (!token)
{
// For XML 1.1 a code unit in 0xD800-0xDB7F is accepted together with the
// following unit, provided that unit lies in 0xDC00-0xDFFF
if (fXMLVersion == XMLV1_1 && ((fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F))) {
// make sure one more char is in the buffer, the transcoder
// should put only a complete surrogate pair into the buffer
// NOTE(review): this is only an assert; where asserts are compiled out the
// read of fCharBuf[fCharIndex+1] below is not bounds-checked - confirm the
// transcoder really guarantees complete pairs.
assert(fCharIndex+1 < fCharsAvail);
if ((fCharBuf[fCharIndex+1] < 0xDC00) || (fCharBuf[fCharIndex+1] > 0xDFFF))
return false;
// Looks ok, so lets eat it
fCharIndex += 2;
}
else {
if (!isFirstNameChar(fCharBuf[fCharIndex]))
return false;
// Looks ok, so lets eat it
fCharIndex ++;
}
}
// And now we loop until we run out of data in this reader or we hit
// a non-name char.
while (true)
{
if (fXMLVersion == XMLV1_1)
{
while (fCharIndex < fCharsAvail)
{
// Check the current char and take it if its a name char. Else
// break out.
if ( (fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F) )
{
// make sure one more char is in the buffer, the transcoder
// should put only a complete surrogate pair into the buffer
assert(fCharIndex+1 < fCharsAvail);
if ( (fCharBuf[fCharIndex+1] < 0xDC00) ||
(fCharBuf[fCharIndex+1] > 0xDFFF) )
break;
fCharIndex += 2;
}
else
{
if (!isNameChar(fCharBuf[fCharIndex]))
break;
fCharIndex++;
}
}
}
else // XMLV1_0
{
// XML 1.0: every code unit is tested on its own with isNameChar()
while (fCharIndex < fCharsAvail)
{
if (!isNameChar(fCharBuf[fCharIndex]))
break;
fCharIndex++;
}
}
// we have to copy the accepted character(s), and update column
if (fCharIndex != charIndex_start)
{
fCurCol += fCharIndex - charIndex_start;
toFill.append(&fCharBuf[charIndex_start], fCharIndex - charIndex_start);
}
// If there is still something in the buffer we stopped on a non-name
// char, so we are done. Otherwise the buffer ran dry mid-name: try to
// reload, and stop if nothing more comes back.
if ((fCharIndex < fCharsAvail) ||
!refreshCharBuffer())
break;
// The reload reset the buffer, so the next run starts at the new index
charIndex_start = fCharIndex;
}
return !toFill.isEmpty();
}
//
// Scans a QName (NCName, optionally followed by ':' and a second NCName) at
// the current position and appends it, colon included, to 'toFill'.
//
// toFill        - buffer the accepted characters are appended to
// colonPosition - set to -1 on entry, then to the length of 'toFill' at the
//                 moment the colon is appended, if a colon is seen
//
// Returns false if an NCName does not start with a valid first character,
// if a second colon is met, or if the input ends where an NCName has to
// start; otherwise returns whether 'toFill' is non-empty on exit.
//
bool XMLReader::getQName(XMLBuffer& toFill, int* colonPosition)
{
unsigned int charIndex_start;
// True whenever the next char to be read has to start an NCName
bool checkNextCharacterForFirstNCName = true;
// We are only looking for two iterations (i.e. 'NCName':'NCName').
// We will stop when we finished scanning for a QName (i.e. either a second
// colon or an invalid char).
*colonPosition = -1;
for (;;) {
// Ok, first lets see if we have chars in the buffer. If not, then lets
// reload.
if (fCharIndex == fCharsAvail) {
if (!refreshCharBuffer()) {
break;
}
}
// Start of the run of accepted chars that has not been copied out yet
charIndex_start = fCharIndex;
if (checkNextCharacterForFirstNCName) {
checkNextCharacterForFirstNCName = false;
// Check the first char of this NCName for being a valid first char and
// give up right away if it is not.
if (fXMLVersion == XMLV1_1
&& ((fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F))) {
// make sure one more char is in the buffer, the transcoder
// should put only a complete surrogate pair into the buffer
// NOTE(review): this is only an assert; where asserts are compiled out the
// read of fCharBuf[fCharIndex+1] below is not bounds-checked - confirm the
// transcoder really guarantees complete pairs.
assert(fCharIndex+1 < fCharsAvail);
if ((fCharBuf[fCharIndex+1] < 0xDC00) || (fCharBuf[fCharIndex+1] > 0xDFFF))
return false;
// Looks ok, so lets eat it
fCharIndex += 2;
}
else {
if (!isFirstNCNameChar(fCharBuf[fCharIndex])) {
return false;
}
// Looks ok, so lets eat it
fCharIndex++;
}
}
while (fCharIndex < fCharsAvail) {
// Check the current char and take it if its a name char. Else
// break out.
if ( (fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F) )
{
// make sure one more char is in the buffer, the transcoder
// should put only a complete surrogate pair into the buffer
assert(fCharIndex+1 < fCharsAvail);
// A unit in this range ends the name under XML 1.0, and also under 1.1
// when the following unit is outside 0xDC00-0xDFFF
if ( (fXMLVersion == XMLV1_0) ||
(fCharBuf[fCharIndex+1] < 0xDC00) ||
(fCharBuf[fCharIndex+1] > 0xDFFF) ) {
break;
}
fCharIndex += 2;
continue;
}
if (!isNCNameChar(fCharBuf[fCharIndex])) {
break;
}
fCharIndex++;
}
// we have to copy the accepted character(s), and update column
if (fCharIndex != charIndex_start)
{
fCurCol += fCharIndex - charIndex_start;
toFill.append(&fCharBuf[charIndex_start], fCharIndex - charIndex_start);
}
// If chars are left in the buffer we stopped on a non-NCName char. Anything
// but a colon ends the QName; a first colon is consumed and starts the
// second NCName; a second colon is an error. If the buffer is exhausted we
// loop around, where the reload at the top decides whether to go on.
if (fCharIndex < fCharsAvail) {
if (fCharBuf[fCharIndex] != chColon) {
break;
}
if (*colonPosition != -1) {
return false;
}
*colonPosition = toFill.getLen();
toFill.append(chColon);
fCharIndex++;
fCurCol++;
checkNextCharacterForFirstNCName = true;
}
}
// Leaving the loop while an NCName start is still pending means the input
// ended before any NCName began, or right after the colon
if (checkNextCharacterForFirstNCName) {
return false;
}
return !toFill.isEmpty();
}
//
//  Collects consecutive whitespace characters into 'toFill', maintaining
//  line and column information as it goes. Returns true if the scan stopped
//  on a non-whitespace character, false if the reader ran out of characters
//  first.
//
bool XMLReader::getSpaces(XMLBuffer& toFill)
{
    do
    {
        // Work through whatever is currently in the buffer
        while (fCharIndex < fCharsAvail)
        {
            XMLCh nextCh = fCharBuf[fCharIndex];

            // A non-whitespace char ends the run and is left unread
            if (!isWhitespace(nextCh))
                return true;

            // Whitespace, so consume it
            fCharIndex++;

            //
            //  The char is one of x20, x9, xD or xA. Masking with
            //  (chCR|chLF) & ~(0x9|0x20), i.e. binary 000110, gives a
            //  non-zero result only for xD and xA:
            //
            //      100000  x20
            //      001001  x9
            //      001010  chLF
            //      001101  chCR
            //
            //  Those two go through the end-of-line handling, which receives
            //  the char by reference; the other two just move the column.
            //
            if ( nextCh & (chCR|chLF) & ~(0x9|0x20) )
                handleEOL(nextCh, false);
            else
                fCurCol++;

            // Hand the char over to the caller's buffer
            toFill.append(nextCh);
        }

        //
        //  Buffer exhausted: reload and keep going for as long as the
        //  reload yields something.
        //
    } while (refreshCharBuffer());

    // The reader ran dry before a non-whitespace char showed up
    return false;
}
//
//  Collects characters into 'toFill' until a whitespace character or
//  'toCheck' is met, maintaining line and column information as it goes.
//  The terminating character is left unread. Returns true if such a
//  terminator was found, false if the reader ran out of characters first.
//
bool XMLReader::getUpToCharOrWS(XMLBuffer& toFill, const XMLCh toCheck)
{
    do
    {
        // Work through whatever is currently in the buffer
        while (fCharIndex < fCharsAvail)
        {
            XMLCh nextCh = fCharBuf[fCharIndex];

            // Whitespace or the target char stops the scan
            if (isWhitespace(nextCh) || (nextCh == toCheck))
                return true;

            // Anything else is consumed
            fCharIndex++;

            //
            //  The char is not x20, x9, xD or xA, so the only line ends it
            //  could be are chNEL (x85) and chLineSeparator (x2028):
            //
            //      0010000000101000  chLineSeparator
            //      0000000010000101  chNEL
            //      ----------------
            //      1101111101010010  == ~(chNEL|chLineSeparator)
            //
            //  Any bit outside those two patterns rules both out, and only
            //  the column moves. Otherwise the char might be one of them and
            //  is passed to the end-of-line handling.
            //
            if ( nextCh & (XMLCh) ~(chNEL|chLineSeparator) )
                fCurCol++;
            else
                handleEOL(nextCh, false);

            // Hand the char over to the caller's buffer
            toFill.append(nextCh);
        }

        //
        //  Buffer exhausted: reload and keep going for as long as the
        //  reload yields something.
        //
    } while (refreshCharBuffer());

    // The whole reader was consumed without meeting a terminator
    return false;
}
//
//  If the next character is a single or double quote, consumes it, stores
//  it in 'chGotten' and returns true. Otherwise nothing is consumed,
//  'chGotten' is left untouched and false is returned. Also returns false
//  when no character can be loaded.
//
bool XMLReader::skipIfQuote(XMLCh& chGotten)
{
    // Reload an exhausted buffer; give up if nothing comes back
    if ((fCharIndex == fCharsAvail) && !refreshCharBuffer())
        return false;

    const XMLCh nextCh = fCharBuf[fCharIndex];
    if ((nextCh != chDoubleQuote) && (nextCh != chSingleQuote))
        return false;

    // It is a quote: report it and step over it
    chGotten = nextCh;
    fCharIndex++;
    fCurCol++;
    return true;
}
bool XMLReader::skipSpaces(bool& skippedSomething, bool inDecl)
{
// Remember the current line and column
XMLSSize_t orgLine = fCurLine;
XMLSSize_t orgCol = fCurCol;
// We enter a loop where we skip over spaces until we hit the end of
// this reader or a non-space value. The return indicates whether we
// hit the non-space (true) or the end (false).
while (true)
{
// Loop through the current chars in the buffer
while (fCharIndex < fCharsAvail)
{
// See if its a white space char. If so, then process it. Else
// we've hit a non-space and need to return.
if (isWhitespace(fCharBuf[fCharIndex]))
{
// Get the current char out of the buffer and eat it
XMLCh curCh = fCharBuf[fCharIndex++];
//
// 'curCh' is a whitespace(x20|x9|xD|xA), so we only can have
// end-of-line combinations with a leading chCR(xD) or chLF(xA)
//
// 100000 x20
// 001001 x9
// 001010 chLF
// 001101 chCR
// -----------
// 000110 == (chCR|chLF) & ~(0x9|0x20)
//
// if the result of the logical-& operation is
// true : 'curCh' must be xA or xD
// false : 'curCh' must be x20 or x9
//
if ( ( curCh & (chCR|chLF) & ~(0x9|0x20) ) == 0 )
{
fCurCol++;
} else
{
handleEOL(curCh, inDecl);
}
}
else
{
skippedSomething = (orgLine != fCurLine) || (orgCol != fCurCol);
return true;
}
}
// We've eaten up the current buffer, so lets try to reload it. If
// we don't get anything new, then break out. If we do, then we go
// back to the top to keep getting spaces.
if (!refreshCharBuffer())
break;
}
// We never hit any non-space and ate up the whole reader
skippedSomething = (orgLine != fCurLine) || (orgCol != fCurCol);
return false;
}
//
//  If the next character equals 'toSkip', consumes it, advances the column
//  and returns true. Otherwise nothing is consumed and false is returned.
//  Also returns false when no character can be loaded.
//
bool XMLReader::skippedChar(const XMLCh toSkip)
{
    // Reload an exhausted buffer; give up if nothing comes back
    if ((fCharIndex == fCharsAvail) && !refreshCharBuffer())
        return false;

    // Not the char we were asked for, leave it where it is
    if (fCharBuf[fCharIndex] != toSkip)
        return false;

    fCharIndex++;
    fCurCol++;
    return true;
}
//
//  If the next character is whitespace, consumes it, updates line and
//  column information and returns true. Otherwise nothing is consumed and
//  false is returned. Also returns false when no character can be loaded.
//
bool XMLReader::skippedSpace()
{
    //
    //  If the buffer is empty, then try to reload it. If we still get
    //  nothing, then return false.
    //
    if (fCharIndex == fCharsAvail)
    {
        if (!refreshCharBuffer())
            return false;
    }

    //
    //  Deliberately a non-const copy: the end-of-line handling takes the
    //  char by mutable reference, and writing through a reference obtained
    //  by casting away the constness of a const object is undefined
    //  behaviour.
    //
    XMLCh curCh = fCharBuf[fCharIndex];
    if (!isWhitespace(curCh))
        return false;

    // Eat the character
    fCharIndex++;

    //
    //  'curCh' is a whitespace(x20|x9|xD|xA), so we only can have
    //  end-of-line combinations with a leading chCR(xD) or chLF(xA)
    //
    //  100000 x20
    //  001001 x9
    //  001010 chLF
    //  001101 chCR
    //  -----------
    //  000110 == (chCR|chLF) & ~(0x9|0x20)
    //
    //  if the result of the logical-& operation is
    //  true  : 'curCh' must be xA or xD
    //  false : 'curCh' must be x20 or x9
    //
    if ( ( curCh & (chCR|chLF) & ~(0x9|0x20) ) == 0 )
    {
        fCurCol++;
    } else
    {
        handleEOL(curCh, false);
    }
    return true;
}
//
//  If the upcoming characters match the null-terminated string 'toSkip',
//  consumes them, advances the column by the string length and returns
//  true. Returns false on a mismatch or when the reader cannot supply
//  enough characters.
//
//  NOTE(review): on the piecewise path below (string longer than the current
//  buffer content) a mismatch found after the first piece leaves the already
//  matched pieces consumed - confirm callers tolerate that.
//
bool XMLReader::skippedString(const XMLCh* const toSkip)
{
    // Get the length of the string to skip
    const unsigned int srcLen = XMLString::stringLen(toSkip);
    unsigned int charsLeft = charsLeftInBuffer();

    if (srcLen <= fCharsAvail) {
        //
        //  See if the current reader has enough chars to test against this
        //  string. If not, then ask it to reload its buffer. If that does not
        //  get us enough, then it cannot match.
        //
        //  NOTE: This works because strings never have to cross a reader! And
        //  a string to skip will never have a new line in it, so we will never
        //  miss adjusting the current line.
        //
        while (charsLeft < srcLen)
        {
            refreshCharBuffer();
            unsigned int t = charsLeftInBuffer();
            if (t == charsLeft)   // if the refreshCharBuf() did not add anything new
                return false;     // give up and return.
            charsLeft = t;
        }

        //
        //  Ok, now we know that the current reader has enough chars in its
        //  buffer. So we can do a quick and dirty comparison straight to its
        //  buffer with no requirement to unget if it fails.
        //
        if (memcmp(&fCharBuf[fCharIndex], toSkip, srcLen*sizeof(XMLCh)))
            return false;

        //
        //  And get the character buffer index back right by just adding the
        //  source len to it.
        //
        fCharIndex += srcLen;
    }
    else {
        // The string is longer than what the buffer holds: match it in pieces
        if (charsLeft == 0) {
            refreshCharBuffer();
            charsLeft = charsLeftInBuffer();
            if (charsLeft == 0)
                return false; // error situation
        }

        //
        //  The reload above can deliver more chars than the string is long.
        //  Never compare (or consume) more than the string itself, else we
        //  would read past the end of 'toSkip' and the remaining length
        //  computed below would wrap around.
        //
        if (charsLeft > srcLen)
            charsLeft = srcLen;

        if (memcmp(&fCharBuf[fCharIndex], toSkip, charsLeft*sizeof(XMLCh)))
            return false;
        fCharIndex += charsLeft;

        // 'offset' is how much of toSkip has been matched so far
        unsigned int offset = charsLeft;
        unsigned int remainingLen = srcLen - charsLeft;
        while (remainingLen > 0) {
            refreshCharBuffer();
            charsLeft = charsLeftInBuffer();
            if (charsLeft == 0)
                return false; // error situation
            if (charsLeft > remainingLen)
                charsLeft = remainingLen;
            if (memcmp(&fCharBuf[fCharIndex], toSkip+offset, charsLeft*sizeof(XMLCh)))
                return false;
            offset += charsLeft;
            remainingLen -= charsLeft;
            fCharIndex += charsLeft;
        }
    }

    // Add the source length to the current column to get it back right
    fCurCol += srcLen;
    return true;
}
//
//  Reports whether the upcoming characters match the null-terminated string
//  'toPeek'. This is the look-ahead counterpart of skippedString(): the read
//  index and the column are not advanced on a match, although the buffer may
//  be reloaded to bring enough characters into view.
//
bool XMLReader::peekString(const XMLCh* const toPeek)
{
    // Number of chars that have to be available for a comparison
    const unsigned int wantedLen = XMLString::stringLen(toPeek);

    //
    //  Keep reloading until the buffer holds enough chars to compare
    //  against. A reload that adds nothing means the reader cannot supply
    //  them, so there can be no match.
    //
    //  NOTE: This works because strings never have to cross a reader! And
    //  the string will never have a new line in it, so we will never miss
    //  adjusting the current line.
    //
    for (unsigned int availLen = charsLeftInBuffer(); availLen < wantedLen; )
    {
        refreshCharBuffer();
        const unsigned int nowAvail = charsLeftInBuffer();
        if (nowAvail == availLen)
            return false;
        availLen = nowAvail;
    }

    //
    //  Enough chars are in the buffer now, so a straight memory comparison
    //  against the buffer settles it.
    //
    return memcmp(&fCharBuf[fCharIndex], toPeek, wantedLen*sizeof(XMLCh)) == 0;
}
// ---------------------------------------------------------------------------
// XMLReader: Setter methods (most are inlined)
// ---------------------------------------------------------------------------
bool XMLReader::setEncoding(const XMLCh* const newEncoding)
{
//
// If the encoding was forced, then we ignore the new value and just
// return with success. If it was forced, then we are to use that
// encoding without question. Note that, if we are forced, we created
// a transcoder up front so there is no need to do one here in that
// case.
//
if (fForcedEncoding)
return true;
//
// upperCase the newEncoding first for better performance
//
XMLCh* inputEncoding = XMLString::replicate(newEncoding, fMemoryManager);
XMLString::upperCaseASCII(inputEncoding);
XMLRecognizer::Encodings newBaseEncoding;
//
// Check for non-endian specific UTF-16 or UCS-4. If so, and if we
// are already in one of the endian versions of those encodings,
// then just keep it and go on. Otherwise, its not valid.
//
if (!XMLString::compareString(inputEncoding, XMLUni::fgUTF16EncodingString)
|| !XMLString::compareString(inputEncoding, XMLUni::fgUTF16EncodingString2)
|| !XMLString::compareString(inputEncoding, XMLUni::fgUTF16EncodingString3)
|| !XMLString::compareString(inputEncoding, XMLUni::fgUTF16EncodingString4)
|| !XMLString::compareString(inputEncoding, XMLUni::fgUTF16EncodingString5)
|| !XMLString::compareString(inputEncoding, XMLUni::fgUTF16EncodingString6)
|| !XMLString::compareString(inputEncoding, XMLUni::fgUTF16EncodingString7))
{
fMemoryManager->deallocate(inputEncoding);
if ((fEncoding != XMLRecognizer::UTF_16L)
&& (fEncoding != XMLRecognizer::UTF_16B))
{
return false;
}
// Override with the original endian specific encoding
newBaseEncoding = fEncoding;
if (fEncoding == XMLRecognizer::UTF_16L) {
fMemoryManager->deallocate(fEncodingStr);
fEncodingStr = 0;
fEncodingStr = XMLString::replicate(XMLUni::fgUTF16LEncodingString, fMemoryManager);
}
else {
fMemoryManager->deallocate(fEncodingStr);
fEncodingStr = 0;
fEncodingStr = XMLString::replicate(XMLUni::fgUTF16BEncodingString, fMemoryManager);
}
}
else if (!XMLString::compareString(inputEncoding, XMLUni::fgUCS4EncodingString)
|| !XMLString::compareString(inputEncoding, XMLUni::fgUCS4EncodingString2)
|| !XMLString::compareString(inputEncoding, XMLUni::fgUCS4EncodingString3)
|| !XMLString::compareString(inputEncoding, XMLUni::fgUCS4EncodingString4))
{
fMemoryManager->deallocate(inputEncoding);
if ((fEncoding != XMLRecognizer::UCS_4L)
&& (fEncoding != XMLRecognizer::UCS_4B))
{
return false;
}
// Override with the original endian specific encoding
newBaseEncoding = fEncoding;
if (fEncoding == XMLRecognizer::UCS_4L) {
fMemoryManager->deallocate(fEncodingStr);
fEncodingStr = 0;
fEncodingStr = XMLString::replicate(XMLUni::fgUCS4LEncodingString, fMemoryManager);
}
else {
fMemoryManager->deallocate(fEncodingStr);
fEncodingStr = 0;
fEncodingStr = XMLString::replicate(XMLUni::fgUCS4BEncodingString, fMemoryManager);
}
}
else
{
//
// Try to map the string to one of our standard encodings. If its not
// one of them, then it has to be one of the non-intrinsic encodings,
// in which case we have to delete our intrinsic encoder and create a
// new one.
//
newBaseEncoding = XMLRecognizer::encodingForName(inputEncoding);
//
// If it does not come back as one of the auto-sensed encodings, then we
// have to possibly replace it and at least check a few things.
//
if (newBaseEncoding == XMLRecognizer::OtherEncoding)
{
//
// We already know it's none of those non-endian special cases,
// so just replicate the new name and use it directly to create the transcoder
//
fMemoryManager->deallocate(fEncodingStr);
fEncodingStr = inputEncoding;
XMLTransService::Codes failReason;
fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
(
fEncodingStr
, failReason
, kCharBufSize
, fMemoryManager
);
}
else
{
// Store the new encoding string since it is just an intrinsic
fMemoryManager->deallocate(fEncodingStr);
fEncodingStr = inputEncoding;
}
}
if (!fTranscoder) {
//
// Now we can create a transcoder using the recognized fEncoding. We
// might get back a transcoder for an intrinsically supported encoding,
// or we might get one from the underlying transcoding service.
//
XMLTransService::Codes failReason;
fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
(
newBaseEncoding
, failReason
, kCharBufSize
, fMemoryManager
);
if (!fTranscoder)
ThrowXMLwithMemMgr1(TranscodingException, XMLExcepts::Trans_CantCreateCvtrFor, fEncodingStr, fMemoryManager);
}
// Update the base encoding member with the new base encoding found
fEncoding = newBaseEncoding;
// Looks ok to us
return true;
}
// ---------------------------------------------------------------------------
// XMLReader: Private helper methods
// ---------------------------------------------------------------------------
//
//  Recomputes fSwapped from the current fEncoding: the flag is set when the
//  encoding is one of the endian specific UTF-16/UCS-4 flavours whose byte
//  order is the opposite of the one selected at build time through
//  ENDIANMODE_LITTLE / ENDIANMODE_BIG. For every other encoding, or when
//  neither macro is defined, the flag is cleared.
//
void XMLReader::checkForSwapped()
{
#if defined(ENDIANMODE_LITTLE)
    // Little endian host: only the big endian encodings need swapping
    fSwapped = (fEncoding == XMLRecognizer::UTF_16B)
            || (fEncoding == XMLRecognizer::UCS_4B);
#elif defined(ENDIANMODE_BIG)
    // Big endian host: only the little endian encodings need swapping
    fSwapped = (fEncoding == XMLRecognizer::UTF_16L)
            || (fEncoding == XMLRecognizer::UCS_4L);
#else
    // Unknown host byte order: assume not swapped
    fSwapped = false;
#endif
}
//
// This is called from the constructor when the encoding is not forced.
// We assume that the encoding has been auto-sensed at this point and that
// fSwapped is set correctly.
//
// In the case of UCS-4 and EBCDIC, we don't have to check for a decl.
// The fact that we got here, means that there is one, because that's the
// only way we can autosense those.
//
void XMLReader::doInitDecode()
{
switch(fEncoding)
{
case XMLRecognizer::UCS_4B :
case XMLRecognizer::UCS_4L :
{
// Remove bom if any
if (((fRawByteBuf[0] == 0x00) && (fRawByteBuf[1] == 0x00) && (fRawByteBuf[2] == 0xFE) && (fRawByteBuf[3] == 0xFF)) ||
((fRawByteBuf[0] == 0xFF) && (fRawByteBuf[1] == 0xFE) && (fRawByteBuf[2] == 0x00) && (fRawByteBuf[3] == 0x00)) )
{
for (unsigned int i = 0; i < fRawBytesAvail; i++)
fRawByteBuf[i] = fRawByteBuf[i+4];
fRawBytesAvail -=4;
}
// Look at the raw buffer as UCS4 chars
const UCS4Ch* asUCS = (const UCS4Ch*)fRawByteBuf;
while (fRawBufIndex < fRawBytesAvail)
{
// Get out the current 4 byte value and inc our raw buf index
UCS4Ch curVal = *asUCS++;
fRawBufIndex += sizeof(UCS4Ch);
// Swap if that is required for this machine
if (fSwapped)
curVal = BitOps::swapBytes(curVal);
// Make sure its at least semi legal. If not, undo and throw
if (curVal > 0xFFFF)
{
fCharsAvail = 0;
fRawBufIndex = 0;
fMemoryManager->deallocate(fPublicId);
fMemoryManager->deallocate(fEncodingStr);
ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
ThrowXMLwithMemMgr1
(
TranscodingException
, XMLExcepts::Reader_CouldNotDecodeFirstLine
, fSystemId
, fMemoryManager
);
}
// Convert the value to an XML char and store it
fCharSizeBuf[fCharsAvail] = 4;
fCharBuf[fCharsAvail++] = XMLCh(curVal);
// Break out on the > character
if (curVal == chCloseAngle)
break;
}
break;
}
case XMLRecognizer::UTF_8 :
{
// If there's a utf-8 BOM (0xEF 0xBB 0xBF), skip past it.
// Don't move to char buf - no one wants to see it.
// Note: this causes any encoding= declaration to override
// the BOM's attempt to say that the encoding is utf-8.
// Look at the raw buffer as short chars
const char* asChars = (const char*)fRawByteBuf;
if (fRawBytesAvail > XMLRecognizer::fgUTF8BOMLen &&
XMLString::compareNString( asChars
, XMLRecognizer::fgUTF8BOM
, XMLRecognizer::fgUTF8BOMLen) == 0)
{
fRawBufIndex += XMLRecognizer::fgUTF8BOMLen;
asChars += XMLRecognizer::fgUTF8BOMLen;
}
//
// First check that there are enough bytes to even see the
// decl indentifier. If not, get out now with no action since
// there is no decl.
//
if (fRawBytesAvail < XMLRecognizer::fgASCIIPreLen)
break;
// Check for the opening sequence. If not, then no decl
if (XMLString::compareNString( asChars
, XMLRecognizer::fgASCIIPre
, XMLRecognizer::fgASCIIPreLen))
{
break;
}
while (fRawBufIndex < fRawBytesAvail)
{
const char curCh = *asChars++;
fRawBufIndex++;
// Looks ok, so store it
fCharSizeBuf[fCharsAvail] = 1;
fCharBuf[fCharsAvail++] = XMLCh(curCh);
// Break out on a > character
if (curCh == chCloseAngle)
break;
//
// A char greater than 0x7F is not allowed in this case. If
// so, undo and throw.
//
if (curCh & 0x80)
{
fCharsAvail = 0;
fRawBufIndex = 0;
fMemoryManager->deallocate(fPublicId);
fMemoryManager->deallocate(fEncodingStr);
ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
ThrowXMLwithMemMgr1
(
TranscodingException
, XMLExcepts::Reader_CouldNotDecodeFirstLine
, fSystemId
, fMemoryManager
);
}
}
break;
}
case XMLRecognizer::UTF_16B :
case XMLRecognizer::UTF_16L :
{
//
// If there is a decl here, we just truncate back the characters
// as we go. No surrogate creation would be allowed here in legal
// XML, so we consider it a transoding error if we find one.
//
if (fRawBytesAvail < 2)
break;
unsigned int postBOMIndex = 0;
const UTF16Ch* asUTF16 = (const UTF16Ch*)&fRawByteBuf[fRawBufIndex];
if ((*asUTF16 == chUnicodeMarker) || (*asUTF16 == chSwappedUnicodeMarker))
{
fRawBufIndex += sizeof(UTF16Ch);
asUTF16++;
postBOMIndex = fRawBufIndex;
}
// First check that there are enough raw bytes for there to even
// be a decl indentifier. If not, then nothing to do.
//
if (fRawBytesAvail - fRawBufIndex < XMLRecognizer::fgUTF16PreLen)
{
fRawBufIndex = postBOMIndex;
break;
}
//
// See we get a match on the prefix. If not, then reset and
// break out.
//
if (fEncoding == XMLRecognizer::UTF_16B)
{
if (memcmp(asUTF16, XMLRecognizer::fgUTF16BPre, XMLRecognizer::fgUTF16PreLen))
{
fRawBufIndex = postBOMIndex;
break;
}
}
else
{
if (memcmp(asUTF16, XMLRecognizer::fgUTF16LPre, XMLRecognizer::fgUTF16PreLen))
{
fRawBufIndex = postBOMIndex;
break;
}
}
while (fRawBufIndex < fRawBytesAvail)
{
// Get out the current 2 byte value
UTF16Ch curVal = *asUTF16++;
fRawBufIndex += sizeof(UTF16Ch);
// Swap if that is required for this machine
if (fSwapped)
curVal = BitOps::swapBytes(curVal);
//
// Store it and bump the target index, implicitly converting
// if UTF16Ch and XMLCh are not the same size.
//
fCharSizeBuf[fCharsAvail] = 2;
fCharBuf[fCharsAvail++] = curVal;
// Break out on a > char
if (curVal == chCloseAngle)
break;
}
break;
}
case XMLRecognizer::EBCDIC :
{
//
// We use special support in the intrinsic EBCDIC-US transcoder
// to go through one char at a time.
//
const XMLByte* srcPtr = fRawByteBuf;
while (1)
{
// Transcode one char from the source
const XMLCh chCur = XMLEBCDICTranscoder::xlatThisOne(*srcPtr++);
fRawBufIndex++;
//
// And put it into the character buffer. This stuff has to
// look like it was normally transcoded.
//
fCharSizeBuf[fCharsAvail] = 1;
fCharBuf[fCharsAvail++] = chCur;
// If its a > char, then break out
if (chCur == chCloseAngle)
break;
// Watch for using up all input and get out
if (fRawBufIndex == fRawBytesAvail)
break;
}
break;
}
default :
// It should never be anything else here
fMemoryManager->deallocate(fPublicId);
fMemoryManager->deallocate(fEncodingStr);
fMemoryManager->deallocate(fSystemId);
ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Reader_BadAutoEncoding, fMemoryManager);
break;
}
//
// Ok, by the time we get here, if its a legal XML file we have eaten
// the XML/TextDecl. So, if we are a PE and are being referenced from
// outside a literal, then we need to throw in an arbitrary space that
// is required by XML.
//
if ((fType == Type_PE) && (fRefFrom == RefFrom_NonLiteral))
fCharBuf[fCharsAvail++] = chSpace;
// Calculate fCharOfsBuf buffer using the elements from fCharBufSize
if (fCalculateSrcOfs)
{
fCharOfsBuf[0] = 0;
for (unsigned int index = 1; index < fCharsAvail; ++index) {
fCharOfsBuf[index] = fCharOfsBuf[index-1]+fCharSizeBuf[index-1];
}
}
}
//
// This method is called internally when we run out of bytes in the raw
// buffer. We just read as many bytes as we can into the raw buffer again
// and store the number of bytes we got.
//
void XMLReader::refreshRawBuffer()
{
//
// If there are any bytes left, move them down to the start. There
// should only ever be (max bytes per char - 1) at the most.
//
const unsigned int bytesLeft = fRawBytesAvail - fRawBufIndex;
// Move the existing ones down
for (unsigned int index = 0; index < bytesLeft; index++)
fRawByteBuf[index] = fRawByteBuf[fRawBufIndex + index];
//
// And then read into the buffer past the existing bytes. Add back in
// that many to the bytes read, and subtract that many from the bytes
// requested.
//
fRawBytesAvail = fStream->readBytes
(
&fRawByteBuf[bytesLeft], kRawBufSize - bytesLeft
) + bytesLeft;
//
// We need to reset the buffer index back to the start in all cases,
// since any trailing data was copied down to the start.
//
fRawBufIndex = 0;
}
//
//  Called internally when the transcoded character buffer has been used
//  up. Up to maxChars more characters are transcoded from the raw byte
//  buffer into bufToFill, with the number of raw bytes behind each of them
//  stored in charSizes. The raw buffer is refilled first when it is running
//  low. Returns the number of characters produced, which is zero when no
//  more raw input is available.
//
unsigned int
XMLReader::xcodeMoreChars(          XMLCh* const            bufToFill
                            ,       unsigned char* const    charSizes
                            , const unsigned int            maxChars)
{
    // Nothing at all in the raw buffer means we are done
    if (fRawBytesAvail == 0)
        return 0;

    //
    //  Top up the raw buffer when it is getting low. We cannot wait for it
    //  to hit exactly zero, because a multi-byte encoding may leave the
    //  leading bytes of a character behind which cannot be transcoded
    //  until the rest of that character has been read.
    //
    if (fRawBytesAvail - fRawBufIndex < 100)
    {
        refreshRawBuffer();

        // Still nothing there, so there is nothing to transcode
        if (fRawBytesAvail == 0)
            return 0;
    }

    // Hand the unconsumed part of the raw buffer to the transcoder
    const XMLByte* const srcStart = fRawByteBuf + fRawBufIndex;
    const unsigned int   srcCount = fRawBytesAvail - fRawBufIndex;
    unsigned int         consumed = 0;
    const unsigned int   produced = fTranscoder->transcodeFrom
    (
        srcStart
        , srcCount
        , bufToFill
        , maxChars
        , consumed
        , charSizes
    );

    // Step over the raw bytes the transcoder reported as eaten
    fRawBufIndex += consumed;

    return produced;
}
/***
*
* XML1.1
*
* 2.11 End-of-Line Handling
*
* XML parsed entities are often stored in computer files which, for editing
* convenience, are organized into lines. These lines are typically separated
* by some combination of the characters CARRIAGE RETURN (#xD) and LINE FEED (#xA).
*
* To simplify the tasks of applications, the XML processor MUST behave as if
* it normalized all line breaks in external parsed entities (including the document
* entity) on input, before parsing, by translating all of the following to a single
* #xA character:
*
* 1. the two-character sequence #xD #xA
* 2. the two-character sequence #xD #x85
* 3. the single character #x85
* 4. the single character #x2028
* 5. any #xD character that is not immediately followed by #xA or #x85.
*
*
***/
//
//  Applies the end-of-line rules quoted above to the character that was
//  just read and keeps fCurLine/fCurCol up to date. curCh is rewritten to
//  chLF when a line break of an external source gets normalized. inDecl
//  tells us whether we are inside an XML/Text decl, where NEL and LSEP are
//  an error for XML 1.1 (TranscodingException, Reader_NelLsepinDecl).
//
void XMLReader::handleEOL(XMLCh& curCh, bool inDecl)
{
    // A plain #xA is already in normalized form, just start a new line
    if (curCh == chLF)
    {
        fCurLine++;
        fCurCol = 1;
        return;
    }

    //  1. the two-character sequence #xD #xA
    //  2. the two-character sequence #xD #x85
    //  5. any #xD character that is not immediately followed by #xA or #x85.
    if (curCh == chCR)
    {
        fCurLine++;
        fCurCol = 1;

        // Nothing to convert unless the text comes from an external source
        if (fSource != Source_External)
            return;

        //
        //  Peek at the next char, loading more chars if the buffer is used
        //  up, and swallow it if it belongs to this line break.
        //
        const bool haveNext = (fCharIndex < fCharsAvail) || refreshCharBuffer();
        if (haveNext)
        {
            const XMLCh following = fCharBuf[fCharIndex];
            if ((following == chLF) || (fNEL && (following == chNEL)))
                fCharIndex++;
        }

        curCh = chLF;
        return;
    }

    // Any char other than NEL or LSEP simply moves us one column along
    if ((curCh != chNEL) && (curCh != chLineSeparator))
    {
        fCurCol++;
        return;
    }

    //  3. the single character #x85
    //  4. the single character #x2028
    if (inDecl && (fXMLVersion == XMLV1_1))
    {
        /***
         * XML1.1
         *
         * 2.11 End-of-Line Handling
         *  ...
         *   The characters #x85 and #x2028 cannot be reliably recognized and translated
         *   until an entity's encoding declaration (if present) has been read.
         *   Therefore, it is a fatal error to use them within the XML declaration or
         *   text declaration.
         *
         ***/
        ThrowXMLwithMemMgr1
        (
            TranscodingException
            , XMLExcepts::Reader_NelLsepinDecl
            , fSystemId
            , fMemoryManager
        );
    }

    //
    //  They only count as a line break when NEL handling is on and the
    //  text comes from an external source. Otherwise the char and the
    //  position are left alone.
    //
    if (fNEL && (fSource == Source_External))
    {
        fCurLine++;
        fCurCol = 1;
        curCh = chLF;
    }
}
XERCES_CPP_NAMESPACE_END