blob: 2c061d584ba9e04c0e4e1d6c842de0555570d94e [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Id: XMLScanner.cpp 568078 2007-08-21 11:43:25Z amassari $
*/
// ---------------------------------------------------------------------------
// Includes
// ---------------------------------------------------------------------------
#include <xercesc/internal/XMLScanner.hpp>
#include <xercesc/internal/ValidationContextImpl.hpp>
#include <xercesc/util/Janitor.hpp>
#include <xercesc/util/Mutexes.hpp>
#include <xercesc/util/RuntimeException.hpp>
#include <xercesc/util/UnexpectedEOFException.hpp>
#include <xercesc/util/XMLMsgLoader.hpp>
#include <xercesc/util/XMLRegisterCleanup.hpp>
#include <xercesc/util/XMLInitializer.hpp>
#include <xercesc/framework/LocalFileInputSource.hpp>
#include <xercesc/framework/URLInputSource.hpp>
#include <xercesc/framework/XMLDocumentHandler.hpp>
#include <xercesc/framework/XMLEntityHandler.hpp>
#include <xercesc/framework/XMLPScanToken.hpp>
#include <xercesc/framework/XMLValidator.hpp>
#include <xercesc/internal/EndOfEntityException.hpp>
#include <xercesc/validators/DTD/DocTypeHandler.hpp>
#include <xercesc/validators/common/GrammarResolver.hpp>
#include <xercesc/util/OutOfMemoryException.hpp>
#include <xercesc/util/XMLResourceIdentifier.hpp>
XERCES_CPP_NAMESPACE_BEGIN
// ---------------------------------------------------------------------------
// Local static data
// ---------------------------------------------------------------------------
static XMLUInt32 gScannerId;
static bool sRegistered = false;
static XMLMutex* sScannerMutex = 0;
static XMLRegisterCleanup scannerMutexCleanup;
static XMLMsgLoader* gMsgLoader = 0;
static XMLRegisterCleanup cleanupMsgLoader;
// ---------------------------------------------------------------------------
// Local, static functions
// ---------------------------------------------------------------------------
// Cleanup for the message loader
void XMLScanner::reinitMsgLoader()
{
delete gMsgLoader;
gMsgLoader = 0;
}
// Cleanup for the scanner mutex
void XMLScanner::reinitScannerMutex()
{
delete sScannerMutex;
sScannerMutex = 0;
sRegistered = false;
}
//
// We need to fault in this mutex. But, since its used for synchronization
// itself, we have to do this the low level way using a compare and swap.
//
static XMLMutex& gScannerMutex()
{
if (!sRegistered)
{
XMLMutexLock lockInit(XMLPlatformUtils::fgAtomicMutex);
if (!sRegistered)
{
sScannerMutex = new XMLMutex(XMLPlatformUtils::fgMemoryManager);
scannerMutexCleanup.registerCleanup(XMLScanner::reinitScannerMutex);
sRegistered = true;
}
}
return *sScannerMutex;
}
static XMLMsgLoader& gScannerMsgLoader()
{
if (!gMsgLoader)
{
XMLMutexLock lockInit(&gScannerMutex());
// If we haven't loaded our message yet, then do that
if (!gMsgLoader)
{
gMsgLoader = XMLPlatformUtils::loadMsgSet(XMLUni::fgXMLErrDomain);
if (!gMsgLoader)
XMLPlatformUtils::panic(PanicHandler::Panic_CantLoadMsgDomain);
// Register this object to be cleaned up at termination
cleanupMsgLoader.registerCleanup(XMLScanner::reinitMsgLoader);
}
}
return *gMsgLoader;
}
void XMLInitializer::initializeScannerMsgLoader()
{
gMsgLoader = XMLPlatformUtils::loadMsgSet(XMLUni::fgXMLErrDomain);
// Register this object to be cleaned up at termination
if (gMsgLoader) {
cleanupMsgLoader.registerCleanup(XMLScanner::reinitMsgLoader);
}
sScannerMutex = new XMLMutex(XMLPlatformUtils::fgMemoryManager);
if (sScannerMutex) {
scannerMutexCleanup.registerCleanup(XMLScanner::reinitScannerMutex);
sRegistered = true;
}
}
typedef JanitorMemFunCall<XMLScanner> CleanupType;
typedef JanitorMemFunCall<ReaderMgr> ReaderMgrResetType;
// ---------------------------------------------------------------------------
// XMLScanner: Constructors and Destructor
// ---------------------------------------------------------------------------
XMLScanner::XMLScanner(XMLValidator* const valToAdopt,
GrammarResolver* const grammarResolver,
MemoryManager* const manager)
: fBufferSize(1024 * 1024)
, fStandardUriConformant(false)
, fCalculateSrcOfs(false)
, fDoNamespaces(false)
, fExitOnFirstFatal(true)
, fValidationConstraintFatal(false)
, fInException(false)
, fStandalone(false)
, fHasNoDTD(true)
, fValidate(false)
, fValidatorFromUser(false)
, fDoSchema(false)
, fSchemaFullChecking(false)
, fIdentityConstraintChecking(true)
, fToCacheGrammar(false)
, fUseCachedGrammar(false)
, fLoadExternalDTD(true)
, fNormalizeData(true)
, fGenerateSyntheticAnnotations(false)
, fValidateAnnotations(false)
, fIgnoreCachedDTD(false)
, fIgnoreAnnotations(false)
, fDisableDefaultEntityResolution(false)
, fSkipDTDValidation(false)
, fErrorCount(0)
, fEntityExpansionLimit(0)
, fEntityExpansionCount(0)
, fEmptyNamespaceId(0)
, fUnknownNamespaceId(0)
, fXMLNamespaceId(0)
, fXMLNSNamespaceId(0)
, fSchemaNamespaceId(0)
, fUIntPool(0)
, fUIntPoolRow(0)
, fUIntPoolCol(0)
, fUIntPoolRowTotal(2)
, fScannerId(0)
, fSequenceId(0)
, fAttrList(0)
, fAttrDupChkRegistry(0)
, fDocHandler(0)
, fDocTypeHandler(0)
, fEntityHandler(0)
, fErrorReporter(0)
, fErrorHandler(0)
, fPSVIHandler(0)
, fValidationContext(0)
, fEntityDeclPoolRetrieved(false)
, fReaderMgr(manager)
, fValidator(valToAdopt)
, fValScheme(Val_Never)
, fGrammarResolver(grammarResolver)
, fGrammarPoolMemoryManager(grammarResolver->getGrammarPoolMemoryManager())
, fGrammar(0)
, fRootGrammar(0)
, fURIStringPool(0)
, fRootElemName(0)
, fExternalSchemaLocation(0)
, fExternalNoNamespaceSchemaLocation(0)
, fSecurityManager(0)
, fXMLVersion(XMLReader::XMLV1_0)
, fMemoryManager(manager)
, fBufMgr(manager)
, fAttNameBuf(1023, manager)
, fAttValueBuf(1023, manager)
, fCDataBuf(1023, manager)
, fQNameBuf(1023, manager)
, fPrefixBuf(1023, manager)
, fURIBuf(1023, manager)
, fWSNormalizeBuf(1023, manager)
, fElemStack(manager)
{
CleanupType cleanup(this, &XMLScanner::cleanUp);
try
{
commonInit();
}
catch(const OutOfMemoryException&)
{
// Don't cleanup when out of memory, since executing the
// code can cause problems.
cleanup.release();
throw;
}
cleanup.release();
}
XMLScanner::XMLScanner( XMLDocumentHandler* const docHandler
, DocTypeHandler* const docTypeHandler
, XMLEntityHandler* const entityHandler
, XMLErrorReporter* const errHandler
, XMLValidator* const valToAdopt
, GrammarResolver* const grammarResolver
, MemoryManager* const manager)
: fBufferSize(1024 * 1024)
, fStandardUriConformant(false)
, fCalculateSrcOfs(false)
, fDoNamespaces(false)
, fExitOnFirstFatal(true)
, fValidationConstraintFatal(false)
, fInException(false)
, fStandalone(false)
, fHasNoDTD(true)
, fValidate(false)
, fValidatorFromUser(false)
, fDoSchema(false)
, fSchemaFullChecking(false)
, fIdentityConstraintChecking(true)
, fToCacheGrammar(false)
, fUseCachedGrammar(false)
, fLoadExternalDTD(true)
, fNormalizeData(true)
, fGenerateSyntheticAnnotations(false)
, fValidateAnnotations(false)
, fIgnoreCachedDTD(false)
, fIgnoreAnnotations(false)
, fDisableDefaultEntityResolution(false)
, fSkipDTDValidation(false)
, fErrorCount(0)
, fEntityExpansionLimit(0)
, fEntityExpansionCount(0)
, fEmptyNamespaceId(0)
, fUnknownNamespaceId(0)
, fXMLNamespaceId(0)
, fXMLNSNamespaceId(0)
, fSchemaNamespaceId(0)
, fUIntPool(0)
, fUIntPoolRow(0)
, fUIntPoolCol(0)
, fUIntPoolRowTotal(2)
, fScannerId(0)
, fSequenceId(0)
, fAttrList(0)
, fAttrDupChkRegistry(0)
, fDocHandler(docHandler)
, fDocTypeHandler(docTypeHandler)
, fEntityHandler(entityHandler)
, fErrorReporter(errHandler)
, fErrorHandler(0)
, fPSVIHandler(0)
, fValidationContext(0)
, fEntityDeclPoolRetrieved(false)
, fReaderMgr(manager)
, fValidator(valToAdopt)
, fValScheme(Val_Never)
, fGrammarResolver(grammarResolver)
, fGrammarPoolMemoryManager(grammarResolver->getGrammarPoolMemoryManager())
, fGrammar(0)
, fRootGrammar(0)
, fURIStringPool(0)
, fRootElemName(0)
, fExternalSchemaLocation(0)
, fExternalNoNamespaceSchemaLocation(0)
, fSecurityManager(0)
, fXMLVersion(XMLReader::XMLV1_0)
, fMemoryManager(manager)
, fBufMgr(manager)
, fAttNameBuf(1023, manager)
, fAttValueBuf(1023, manager)
, fCDataBuf(1023, manager)
, fQNameBuf(1023, manager)
, fPrefixBuf(1023, manager)
, fURIBuf(1023, manager)
, fWSNormalizeBuf(1023, manager)
, fElemStack(manager)
{
CleanupType cleanup(this, &XMLScanner::cleanUp);
try
{
commonInit();
}
catch(const OutOfMemoryException&)
{
// Don't cleanup when out of memory, since executing the
// code can cause problems.
cleanup.release();
throw;
}
cleanup.release();
}
XMLScanner::~XMLScanner()
{
cleanUp();
}
void XMLScanner::setValidator(XMLValidator* const valToAdopt)
{
if (fValidatorFromUser)
delete fValidator;
fValidator = valToAdopt;
fValidatorFromUser = true;
initValidator(fValidator);
}
// ---------------------------------------------------------------------------
// XMLScanner: Main entry point to scan a document
// ---------------------------------------------------------------------------
void XMLScanner::scanDocument( const XMLCh* const systemId)
{
// First we try to parse it as a URL. If that fails, we assume its
// a file and try it that way.
InputSource* srcToUse = 0;
try
{
// Create a temporary URL. Since this is the primary document,
// it has to be fully qualified. If not, then assume we are just
// mistaking a file for a URL.
XMLURL tmpURL(fMemoryManager);
if (XMLURL::parse(systemId, tmpURL)) {
if (tmpURL.isRelative()) {
if (!fStandardUriConformant)
srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager);
else {
// since this is the top of the try/catch, cannot call ThrowXMLwithMemMgr
// emit the error directly
MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_NoProtocolPresent, fMemoryManager);
fInException = true;
emitError
(
XMLErrs::XMLException_Fatal
, e.getCode()
, e.getType()
, e.getMessage()
);
return;
}
}
else
{
if (fStandardUriConformant && tmpURL.hasInvalidChar()) {
MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL, fMemoryManager);
fInException = true;
emitError
(
XMLErrs::XMLException_Fatal
, e.getCode()
, e.getType()
, e.getMessage()
);
return;
}
srcToUse = new (fMemoryManager) URLInputSource(tmpURL, fMemoryManager);
}
}
else {
if (!fStandardUriConformant)
srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager);
else {
// since this is the top of the try/catch, cannot call ThrowXMLwithMemMgr
// emit the error directly
// lazy bypass ... since all MalformedURLException are fatal, no need to check the type
MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL, fMemoryManager);
fInException = true;
emitError
(
XMLErrs::XMLException_Fatal
, e.getCode()
, e.getType()
, e.getMessage()
);
return;
}
}
}
catch(const XMLException& excToCatch)
{
// For any other XMLException,
// emit the error and catch any user exception thrown from here.
fInException = true;
if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
emitError
(
XMLErrs::XMLException_Warning
, excToCatch.getCode()
, excToCatch.getType()
, excToCatch.getMessage()
);
else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
emitError
(
XMLErrs::XMLException_Fatal
, excToCatch.getCode()
, excToCatch.getType()
, excToCatch.getMessage()
);
else
emitError
(
XMLErrs::XMLException_Error
, excToCatch.getCode()
, excToCatch.getType()
, excToCatch.getMessage()
);
return;
}
Janitor<InputSource> janSrc(srcToUse);
scanDocument(*srcToUse);
}
void XMLScanner::scanDocument( const char* const systemId)
{
// We just delegate this to the XMLCh version after transcoding
XMLCh* tmpBuf = XMLString::transcode(systemId, fMemoryManager);
ArrayJanitor<XMLCh> janBuf(tmpBuf, fMemoryManager);
scanDocument(tmpBuf);
}
// This method begins a progressive parse. It scans through the prolog and
// returns a token to be used on subsequent scanNext() calls. If the return
// value is true, then the token is legal and ready for further use. If it
// returns false, then the scan of the prolog failed and the token is not
// going to work on subsequent scanNext() calls.
bool XMLScanner::scanFirst( const XMLCh* const systemId
, XMLPScanToken& toFill)
{
// First we try to parse it as a URL. If that fails, we assume its
// a file and try it that way.
InputSource* srcToUse = 0;
try
{
// Create a temporary URL. Since this is the primary document,
// it has to be fully qualified. If not, then assume we are just
// mistaking a file for a URL.
XMLURL tmpURL(fMemoryManager);
if (XMLURL::parse(systemId, tmpURL)) {
if (tmpURL.isRelative()) {
if (!fStandardUriConformant)
srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager);
else {
// since this is the top of the try/catch, cannot call ThrowXMLwithMemMgr
// emit the error directly
MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_NoProtocolPresent, fMemoryManager);
fInException = true;
emitError
(
XMLErrs::XMLException_Fatal
, e.getCode()
, e.getType()
, e.getMessage()
);
return false;
}
}
else
{
if (fStandardUriConformant && tmpURL.hasInvalidChar()) {
MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL, fMemoryManager);
fInException = true;
emitError
(
XMLErrs::XMLException_Fatal
, e.getCode()
, e.getType()
, e.getMessage()
);
return false;
}
srcToUse = new (fMemoryManager) URLInputSource(tmpURL, fMemoryManager);
}
}
else {
if (!fStandardUriConformant)
srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager);
else {
// since this is the top of the try/catch, cannot call ThrowXMLwithMemMgr
// emit the error directly
// lazy bypass ... since all MalformedURLException are fatal, no need to check the type
MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL);
fInException = true;
emitError
(
XMLErrs::XMLException_Fatal
, e.getCode()
, e.getType()
, e.getMessage()
);
return false;
}
}
}
catch(const XMLException& excToCatch)
{
// For any other XMLException,
// emit the error and catch any user exception thrown from here.
fInException = true;
if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
emitError
(
XMLErrs::XMLException_Warning
, excToCatch.getCode()
, excToCatch.getType()
, excToCatch.getMessage()
);
else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
emitError
(
XMLErrs::XMLException_Fatal
, excToCatch.getCode()
, excToCatch.getType()
, excToCatch.getMessage()
);
else
emitError
(
XMLErrs::XMLException_Error
, excToCatch.getCode()
, excToCatch.getType()
, excToCatch.getMessage()
);
return false;
}
Janitor<InputSource> janSrc(srcToUse);
return scanFirst(*srcToUse, toFill);
}
bool XMLScanner::scanFirst( const char* const systemId
, XMLPScanToken& toFill)
{
// We just delegate this to the XMLCh version after transcoding
XMLCh* tmpBuf = XMLString::transcode(systemId, fMemoryManager);
ArrayJanitor<XMLCh> janBuf(tmpBuf, fMemoryManager);
return scanFirst(tmpBuf, toFill);
}
bool XMLScanner::scanFirst( const InputSource& src
, XMLPScanToken& toFill)
{
// Bump up the sequence id for this new scan cycle. This will invalidate
// any previous tokens we've returned.
fSequenceId++;
ReaderMgrResetType resetReaderMgr(&fReaderMgr, &ReaderMgr::reset);
// Reset the scanner and its plugged in stuff for a new run. This
// resets all the data structures, creates the initial reader and
// pushes it on the stack, and sets up the base document path
scanReset(src);
// If we have a document handler, then call the start document
if (fDocHandler)
fDocHandler->startDocument();
try
{
// Scan the prolog part, which is everything before the root element
// including the DTD subsets. This is all that is done on the scan
// first.
scanProlog();
// If we got to the end of input, then its not a valid XML file.
// Else, go on to scan the content.
if (fReaderMgr.atEOF())
{
emitError(XMLErrs::EmptyMainEntity);
}
}
// NOTE:
//
// In all of the error processing below, the emitError() call MUST come
// before the flush of the reader mgr, or it will fail because it tries
// to find out the position in the XML source of the error.
catch(const XMLErrs::Codes)
{
// This is a 'first failure' exception so return failure
return false;
}
catch(const XMLValid::Codes)
{
// This is a 'first fatal error' type exit, return failure
return false;
}
catch(const XMLException& excToCatch)
{
// Emit the error and catch any user exception thrown from here. Make
// sure in all cases we flush the reader manager.
fInException = true;
try
{
if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
emitError
(
XMLErrs::XMLException_Warning
, excToCatch.getCode()
, excToCatch.getType()
, excToCatch.getMessage()
);
else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
emitError
(
XMLErrs::XMLException_Fatal
, excToCatch.getCode()
, excToCatch.getType()
, excToCatch.getMessage()
);
else
emitError
(
XMLErrs::XMLException_Error
, excToCatch.getCode()
, excToCatch.getType()
, excToCatch.getMessage()
);
}
catch(const OutOfMemoryException&)
{
// This is a special case for out-of-memory
// conditions, because resetting the ReaderMgr
// can be problematic.
resetReaderMgr.release();
throw;
}
return false;
}
catch(const OutOfMemoryException&)
{
// This is a special case for out-of-memory
// conditions, because resetting the ReaderMgr
// can be problematic.
resetReaderMgr.release();
throw;
}
// Fill in the caller's token to make it legal and return success
toFill.set(fScannerId, fSequenceId);
// Release the object that will reset the ReaderMgr, since there's
// more to scan.
resetReaderMgr.release();
return true;
}
void XMLScanner::scanReset(XMLPScanToken& token)
{
// Make sure this token is still legal
if (!isLegalToken(token))
ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Scan_BadPScanToken, fMemoryManager);
// Reset the reader manager
fReaderMgr.reset();
// And invalidate any tokens by bumping our sequence number
fSequenceId++;
// Reset our error count
fErrorCount = 0;
}
void XMLScanner::setParseSettings(XMLScanner* const refScanner)
{
setDocHandler(refScanner->getDocHandler());
setDocTypeHandler(refScanner->getDocTypeHandler());
setErrorHandler(refScanner->getErrorHandler());
setErrorReporter(refScanner->getErrorReporter());
setEntityHandler(refScanner->getEntityHandler());
setDoNamespaces(refScanner->getDoNamespaces());
setDoSchema(refScanner->getDoSchema());
setCalculateSrcOfs(refScanner->getCalculateSrcOfs());
setStandardUriConformant(refScanner->getStandardUriConformant());
setExitOnFirstFatal(refScanner->getExitOnFirstFatal());
setValidationConstraintFatal(refScanner->getValidationConstraintFatal());
setIdentityConstraintChecking(refScanner->getIdentityConstraintChecking());
setValidationSchemaFullChecking(refScanner->getValidationSchemaFullChecking());
cacheGrammarFromParse(refScanner->isCachingGrammarFromParse());
useCachedGrammarInParse(refScanner->isUsingCachedGrammarInParse());
setLoadExternalDTD(refScanner->getLoadExternalDTD());
setNormalizeData(refScanner->getNormalizeData());
setExternalSchemaLocation(refScanner->getExternalSchemaLocation());
setExternalNoNamespaceSchemaLocation(refScanner->getExternalNoNamespaceSchemaLocation());
setValidationScheme(refScanner->getValidationScheme());
setSecurityManager(refScanner->getSecurityManager());
setPSVIHandler(refScanner->getPSVIHandler());
}
// ---------------------------------------------------------------------------
// XMLScanner: Private helper methods.
// ---------------------------------------------------------------------------
// This method handles the common initialization, to avoid having to do
// it redundantly in multiple constructors.
void XMLScanner::commonInit()
{
// We have to do a little init that involves statics, so we have to
// use the mutex to protect it.
{
XMLMutexLock lockInit(&gScannerMutex());
// And assign ourselves the next available scanner id
fScannerId = ++gScannerId;
}
// Create the attribute list, which is used to store attribute values
// during start tag processing. Give it a reasonable initial size that
// will serve for most folks, though it will grow as required.
fAttrList = new (fMemoryManager) RefVectorOf<XMLAttr>(32, true, fMemoryManager);
// Create the id ref list. This is used to enforce XML 1.0 ID ref
// semantics, i.e. all id refs must refer to elements that exist
fValidationContext = new (fMemoryManager) ValidationContextImpl(fMemoryManager);
fValidationContext->setElemStack(&fElemStack);
// Create the GrammarResolver
//fGrammarResolver = new GrammarResolver();
// create initial, 64-element, fUIntPool
fUIntPool = (unsigned int **)fMemoryManager->allocate(sizeof(unsigned int *) *fUIntPoolRowTotal);
memset(fUIntPool, 0, sizeof(unsigned int *) * fUIntPoolRowTotal);
fUIntPool[0] = (unsigned int *)fMemoryManager->allocate(sizeof(unsigned int) << 6);
memset(fUIntPool[0], 0, sizeof(unsigned int) << 6);
// Register self as handler for XMLBufferFull events on the CDATA buffer
fCDataBuf.setFullHandler(this, fBufferSize);
if (fValidator) {
fValidatorFromUser = true;
initValidator(fValidator);
}
}
void XMLScanner::cleanUp()
{
delete fAttrList;
delete fAttrDupChkRegistry;
delete fValidationContext;
fMemoryManager->deallocate(fRootElemName);//delete [] fRootElemName;
fMemoryManager->deallocate(fExternalSchemaLocation);//delete [] fExternalSchemaLocation;
fMemoryManager->deallocate(fExternalNoNamespaceSchemaLocation);//delete [] fExternalNoNamespaceSchemaLocation;
// delete fUIntPool
if (fUIntPool)
{
for (unsigned int i=0; i<=fUIntPoolRow; i++)
{
fMemoryManager->deallocate(fUIntPool[i]);
}
fMemoryManager->deallocate(fUIntPool);
}
}
void XMLScanner::initValidator(XMLValidator* theValidator) {
// Tell the validator about the stuff it needs to know in order to
// do its work.
theValidator->setScannerInfo(this, &fReaderMgr, &fBufMgr);
theValidator->setErrorReporter(fErrorReporter);
}
// ---------------------------------------------------------------------------
// XMLScanner: Error emitting methods
// ---------------------------------------------------------------------------
// These methods are called whenever the scanner wants to emit an error.
// It handles getting the message loaded, doing token replacement, etc...
// and then calling the error handler, if its installed.
bool XMLScanner::emitErrorWillThrowException(const XMLErrs::Codes toEmit)
{
if (XMLErrs::isFatal(toEmit) && fExitOnFirstFatal && !fInException)
return true;
return false;
}
void XMLScanner::emitError(const XMLErrs::Codes toEmit)
{
// Bump the error count if it is not a warning
if (XMLErrs::errorType(toEmit) != XMLErrorReporter::ErrType_Warning)
incrementErrorCount();
if (fErrorReporter)
{
// Load the message into a local for display
const unsigned int msgSize = 1023;
XMLCh errText[msgSize + 1];
if (!gScannerMsgLoader().loadMsg(toEmit, errText, msgSize))
{
// <TBD> Probably should load a default msg here
}
// Create a LastExtEntityInfo structure and get the reader manager
// to fill it in for us. This will give us the information about
// the last reader on the stack that was an external entity of some
// sort (i.e. it will ignore internal entities.
ReaderMgr::LastExtEntityInfo lastInfo;
fReaderMgr.getLastExtEntityInfo(lastInfo);
fErrorReporter->error
(
toEmit
, XMLUni::fgXMLErrDomain
, XMLErrs::errorType(toEmit)
, errText
, lastInfo.systemId
, lastInfo.publicId
, lastInfo.lineNumber
, lastInfo.colNumber
);
}
// Bail out if its fatal an we are to give up on the first fatal error
if (emitErrorWillThrowException(toEmit))
throw toEmit;
}
void XMLScanner::emitError( const XMLErrs::Codes toEmit
, const XMLCh* const text1
, const XMLCh* const text2
, const XMLCh* const text3
, const XMLCh* const text4)
{
// Bump the error count if it is not a warning
if (XMLErrs::errorType(toEmit) != XMLErrorReporter::ErrType_Warning)
incrementErrorCount();
if (fErrorReporter)
{
// Load the message into alocal and replace any tokens found in
// the text.
const unsigned int maxChars = 2047;
XMLCh errText[maxChars + 1];
if (!gScannerMsgLoader().loadMsg(toEmit, errText, maxChars, text1, text2, text3, text4, fMemoryManager))
{
// <TBD> Should probably load a default message here
}
// Create a LastExtEntityInfo structure and get the reader manager
// to fill it in for us. This will give us the information about
// the last reader on the stack that was an external entity of some
// sort (i.e. it will ignore internal entities.
ReaderMgr::LastExtEntityInfo lastInfo;
fReaderMgr.getLastExtEntityInfo(lastInfo);
fErrorReporter->error
(
toEmit
, XMLUni::fgXMLErrDomain
, XMLErrs::errorType(toEmit)
, errText
, lastInfo.systemId
, lastInfo.publicId
, lastInfo.lineNumber
, lastInfo.colNumber
);
}
// Bail out if its fatal an we are to give up on the first fatal error
if (emitErrorWillThrowException(toEmit))
throw toEmit;
}
void XMLScanner::emitError( const XMLErrs::Codes toEmit
, const char* const text1
, const char* const text2
, const char* const text3
, const char* const text4)
{
// Bump the error count if it is not a warning
if (XMLErrs::errorType(toEmit) != XMLErrorReporter::ErrType_Warning)
incrementErrorCount();
if (fErrorReporter)
{
// Load the message into alocal and replace any tokens found in
// the text.
const unsigned int maxChars = 2047;
XMLCh errText[maxChars + 1];
if (!gScannerMsgLoader().loadMsg(toEmit, errText, maxChars, text1, text2, text3, text4, fMemoryManager))
{
// <TBD> Should probably load a default message here
}
// Create a LastExtEntityInfo structure and get the reader manager
// to fill it in for us. This will give us the information about
// the last reader on the stack that was an external entity of some
// sort (i.e. it will ignore internal entities.
ReaderMgr::LastExtEntityInfo lastInfo;
fReaderMgr.getLastExtEntityInfo(lastInfo);
fErrorReporter->error
(
toEmit
, XMLUni::fgXMLErrDomain
, XMLErrs::errorType(toEmit)
, errText
, lastInfo.systemId
, lastInfo.publicId
, lastInfo.lineNumber
, lastInfo.colNumber
);
}
// Bail out if its fatal an we are to give up on the first fatal error
if (emitErrorWillThrowException(toEmit))
throw toEmit;
}
void XMLScanner::emitError( const XMLErrs::Codes toEmit
, const XMLExcepts::Codes originalExceptCode
, const XMLCh* const text1
, const XMLCh* const text2
, const XMLCh* const text3
, const XMLCh* const text4)
{
// Bump the error count if it is not a warning
if (XMLErrs::errorType(toEmit) != XMLErrorReporter::ErrType_Warning)
incrementErrorCount();
if (fErrorReporter)
{
// Load the message into alocal and replace any tokens found in
// the text.
const unsigned int maxChars = 2047;
XMLCh errText[maxChars + 1];
if (!gScannerMsgLoader().loadMsg(toEmit, errText, maxChars, text1, text2, text3, text4, fMemoryManager))
{
// <TBD> Should probably load a default message here
}
// Create a LastExtEntityInfo structure and get the reader manager
// to fill it in for us. This will give us the information about
// the last reader on the stack that was an external entity of some
// sort (i.e. it will ignore internal entities.
ReaderMgr::LastExtEntityInfo lastInfo;
fReaderMgr.getLastExtEntityInfo(lastInfo);
fErrorReporter->error
(
originalExceptCode
, XMLUni::fgExceptDomain //fgXMLErrDomain
, XMLErrs::errorType(toEmit)
, errText
, lastInfo.systemId
, lastInfo.publicId
, lastInfo.lineNumber
, lastInfo.colNumber
);
}
// Bail out if its fatal an we are to give up on the first fatal error
if (emitErrorWillThrowException(toEmit))
throw toEmit;
}
// ---------------------------------------------------------------------------
// XMLScanner: Getter methods
// ---------------------------------------------------------------------------
// This method allows the caller to query the current location of the scanner.
// It will return the sys/public ids of the current entity, and the line/col
// position within it.
//
// NOTE: This API returns the location with the last external file. So if its
// currently scanning an entity, the position returned will be the end of
// the entity reference in the file that had the reference.
//
/*bool
XMLScanner::getLastExtLocation( XMLCh* const sysIdToFill
, const unsigned int maxSysIdChars
, XMLCh* const pubIdToFill
, const unsigned int maxPubIdChars
, XMLSSize_t& lineToFill
, XMLSSize_t& colToFill) const
{
// Create a local info object and get it filled in by the reader manager
ReaderMgr::LastExtEntityInfo lastInfo;
fReaderMgr.getLastExtEntityInfo(lastInfo);
// Fill in the line and column number
lineToFill = lastInfo.lineNumber;
colToFill = lastInfo.colNumber;
// And copy over as much of the ids as will fit
sysIdToFill[0] = 0;
if (lastInfo.systemId)
{
if (XMLString::stringLen(lastInfo.systemId) > maxSysIdChars)
return false;
XMLString::copyString(sysIdToFill, lastInfo.systemId);
}
pubIdToFill[0] = 0;
if (lastInfo.publicId)
{
if (XMLString::stringLen(lastInfo.publicId) > maxPubIdChars)
return false;
XMLString::copyString(pubIdToFill, lastInfo.publicId);
}
return true;
}*/
// ---------------------------------------------------------------------------
// XMLScanner: Private scanning methods
// ---------------------------------------------------------------------------
// This method is called after the end of the root element, to handle
// any miscellaneous stuff hanging around.
void XMLScanner::scanMiscellaneous()
{
// Get a buffer for this work
XMLBufBid bbCData(&fBufMgr);
while (true)
{
try
{
const XMLCh nextCh = fReaderMgr.peekNextChar();
// Watch for end of file and break out
if (!nextCh)
break;
if (nextCh == chOpenAngle)
{
if (checkXMLDecl(true))
{
// Can't have an XML decl here
emitError(XMLErrs::NotValidAfterContent);
fReaderMgr.skipPastChar(chCloseAngle);
}
else if (fReaderMgr.skippedString(XMLUni::fgPIString))
{
scanPI();
}
else if (fReaderMgr.skippedString(XMLUni::fgCommentString))
{
scanComment();
}
else
{
// This can't be possible, so just give up
emitError(XMLErrs::ExpectedCommentOrPI);
fReaderMgr.skipPastChar(chCloseAngle);
}
}
else if (fReaderMgr.getCurrentReader()->isWhitespace(nextCh))
{
// If we have a doc handler, then gather up the spaces and
// call back. Otherwise, just skip over whitespace.
if (fDocHandler)
{
fReaderMgr.getSpaces(bbCData.getBuffer());
fDocHandler->ignorableWhitespace
(
bbCData.getRawBuffer()
, bbCData.getLen()
, false
);
}
else
{
fReaderMgr.skipPastSpaces();
}
}
else
{
emitError(XMLErrs::ExpectedCommentOrPI);
fReaderMgr.skipPastChar(chCloseAngle);
}
}
catch(const EndOfEntityException&)
{
// Some entity leaked out of the content part of the document. Issue
// a warning and keep going.
emitError(XMLErrs::EntityPropogated);
}
}
}
// Scans a PI and calls the appropriate callbacks. At entry we have just
// scanned the <? part, and need to now start on the PI target name.
void XMLScanner::scanPI()
{
const XMLCh* namePtr = 0;
const XMLCh* targetPtr = 0;
// If there are any spaces here, then warn about it. If we aren't in
// 'first error' mode, then we'll come back and can easily pick up
// again by just skipping them.
if (fReaderMgr.lookingAtSpace())
{
emitError(XMLErrs::PINameExpected);
fReaderMgr.skipPastSpaces();
}
// Get a buffer for the PI name and scan it in
XMLBufBid bbName(&fBufMgr);
if (!fReaderMgr.getName(bbName.getBuffer()))
{
emitError(XMLErrs::PINameExpected);
fReaderMgr.skipPastChar(chCloseAngle);
return;
}
// Point the name pointer at the raw data
namePtr = bbName.getRawBuffer();
// See if it is some form of 'xml' and emit a warning
//if (!XMLString::compareIString(namePtr, XMLUni::fgXMLString))
if (bbName.getLen() == 3 &&
(((namePtr[0] == chLatin_x) || (namePtr[0] == chLatin_X)) &&
((namePtr[1] == chLatin_m) || (namePtr[1] == chLatin_M)) &&
((namePtr[2] == chLatin_l) || (namePtr[2] == chLatin_L))))
emitError(XMLErrs::NoPIStartsWithXML);
// If namespaces are enabled, then no colons allowed
if (fDoNamespaces)
{
if (XMLString::indexOf(namePtr, chColon) != -1)
emitError(XMLErrs::ColonNotLegalWithNS);
}
// If we don't hit a space next, then the PI has no target. If we do
// then get out the target. Get a buffer for it as well
XMLBufBid bbTarget(&fBufMgr);
if (fReaderMgr.skippedSpace())
{
// Skip any leading spaces
fReaderMgr.skipPastSpaces();
bool gotLeadingSurrogate = false;
// It does have a target, so lets move on to deal with that.
while (1)
{
const XMLCh nextCh = fReaderMgr.getNextChar();
// Watch for an end of file, which is always bad here
if (!nextCh)
{
emitError(XMLErrs::UnterminatedPI);
ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
}
// Watch for potential terminating character
if (nextCh == chQuestion)
{
// It must be followed by '>' to be a termination of the target
if (fReaderMgr.skippedChar(chCloseAngle))
break;
}
// Check for correct surrogate pairs
if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
{
if (gotLeadingSurrogate)
emitError(XMLErrs::Expected2ndSurrogateChar);
else
gotLeadingSurrogate = true;
}
else
{
if (gotLeadingSurrogate)
{
if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))
emitError(XMLErrs::Expected2ndSurrogateChar);
}
// Its got to at least be a valid XML character
else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh)) {
XMLCh tmpBuf[9];
XMLString::binToText
(
nextCh
, tmpBuf
, 8
, 16
, fMemoryManager
);
emitError(XMLErrs::InvalidCharacter, tmpBuf);
}
gotLeadingSurrogate = false;
}
bbTarget.append(nextCh);
}
}
else
{
// No target, but make sure its terminated ok
if (!fReaderMgr.skippedChar(chQuestion))
{
emitError(XMLErrs::UnterminatedPI);
fReaderMgr.skipPastChar(chCloseAngle);
return;
}
if (!fReaderMgr.skippedChar(chCloseAngle))
{
emitError(XMLErrs::UnterminatedPI);
fReaderMgr.skipPastChar(chCloseAngle);
return;
}
}
// Point the target pointer at the raw data
targetPtr = bbTarget.getRawBuffer();
// If we have a handler, then call it
if (fDocHandler)
{
fDocHandler->docPI
(
namePtr
, targetPtr
);
}
//mark PI is seen within the current element
if (! fElemStack.isEmpty())
fElemStack.setCommentOrPISeen();
}
// Scans all the input from the start of the file to the root element.
// There does not have to be anything in the prolog necessarily, but usually
// there is at least an XMLDecl.
//
// On exit from here we are either at the end of the file or about to read
// the opening < of the root element.
void XMLScanner::scanProlog()
{
bool sawDocTypeDecl = false;
// Get a buffer for whitespace processing
XMLBufBid bbCData(&fBufMgr);
// Loop through the prolog. If there is no content, this could go all
// the way to the end of the file.
try
{
while (true)
{
const XMLCh nextCh = fReaderMgr.peekNextChar();
if (nextCh == chOpenAngle)
{
// Ok, it could be the xml decl, a comment, the doc type line,
// or the start of the root element.
if (checkXMLDecl(true))
{
// There shall be at lease --ONE-- space in between
// the tag '<?xml' and the VersionInfo.
//
// If we are not at line 1, col 6, then the decl was not
// the first text, so its invalid.
const XMLReader* curReader = fReaderMgr.getCurrentReader();
if ((curReader->getLineNumber() != 1)
|| (curReader->getColumnNumber() != 7))
{
emitError(XMLErrs::XMLDeclMustBeFirst);
}
scanXMLDecl(Decl_XML);
}
else if (fReaderMgr.skippedString(XMLUni::fgPIString))
{
scanPI();
}
else if (fReaderMgr.skippedString(XMLUni::fgCommentString))
{
scanComment();
}
else if (fReaderMgr.skippedString(XMLUni::fgDocTypeString))
{
if (sawDocTypeDecl) {
emitError(XMLErrs::DuplicateDocTypeDecl);
}
scanDocTypeDecl();
sawDocTypeDecl = true;
// if reusing grammar, this has been validated already in first scan
// skip for performance
if (fValidate && !fGrammar->getValidated()) {
// validate the DTD scan so far
fValidator->preContentValidation(fUseCachedGrammar, true);
}
}
else
{
// Assume its the start of the root element
return;
}
}
else if (fReaderMgr.getCurrentReader()->isWhitespace(nextCh))
{
// If we have a document handler then gather up the
// whitespace and call back. Otherwise just skip over spaces.
if (fDocHandler)
{
fReaderMgr.getSpaces(bbCData.getBuffer());
fDocHandler->ignorableWhitespace
(
bbCData.getRawBuffer()
, bbCData.getLen()
, false
);
}
else
{
fReaderMgr.skipPastSpaces();
}
}
else
{
emitError(XMLErrs::InvalidDocumentStructure);
// Watch for end of file and break out
if (!nextCh)
break;
else
fReaderMgr.skipPastChar(chCloseAngle);
}
}
}
catch(const EndOfEntityException&)
{
// We should never get an end of entity here. They should only
// occur within the doc type scanning method, and not leak out to
// here.
emitError
(
XMLErrs::UnexpectedEOE
, "in prolog"
);
}
}
// Scans the <?xml .... ?> line. This stuff is all sequential so we don't
// do any state machine loop here. We just bull straight through it. It ends
// past the closing bracket. If there is a document handler, then its called
// on the XMLDecl callback.
//
// On entry, the <?xml has been scanned, and we pick it up from there.
//
// NOTE: In order to provide good recovery from bad XML here, we try to be
// very flexible. No matter what order the stuff is in, we'll keep going
// though we'll issue errors.
//
// The parameter tells us which type of decl we should expect, Text or XML.
// [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
// [77] TextDecl::= '<?xml' VersionInfo? EncodingDecl S? '?>'
void XMLScanner::scanXMLDecl(const DeclTypes type)
{
// Get us some buffers to use
XMLBufBid bbVersion(&fBufMgr);
XMLBufBid bbEncoding(&fBufMgr);
XMLBufBid bbStand(&fBufMgr);
XMLBufBid bbDummy(&fBufMgr);
XMLBufBid bbName(&fBufMgr);
// We use this little enum and array to keep up with what we found
// and what order we found them in. This lets us get them free form
// without too much overhead, but still know that they were in the
// wrong order.
enum Strings
{
VersionString
, EncodingString
, StandaloneString
, UnknownString
, StringCount
};
int flags[StringCount] = { -1, -1, -1, -1 };
// Also set up a list of buffers in the right order so that we know
// where to put stuff.
XMLBuffer* buffers[StringCount] ;
buffers[0] = &bbVersion.getBuffer();
buffers[1] = &bbEncoding.getBuffer();
buffers[2] = &bbStand.getBuffer();
buffers[3] = &bbDummy.getBuffer();
int curCount = 0;
Strings curString;
XMLBuffer& nameBuf = bbName.getBuffer();
while (true)
{
// Skip any spaces
const unsigned int spaceCount = fReaderMgr.skipPastSpaces(true);
// If we are looking at a question mark, then break out
if (fReaderMgr.lookingAtChar(chQuestion))
break;
// If this is not the first string, then we require the spaces
if (!spaceCount && curCount)
emitError(XMLErrs::ExpectedWhitespace);
// Get characters up to the next whitespace or equal's sign.
if (!scanUpToWSOr(nameBuf, chEqual))
emitError(XMLErrs::ExpectedDeclString);
// See if it matches any of our expected strings
if (XMLString::equals(nameBuf.getRawBuffer(), XMLUni::fgVersionString))
curString = VersionString;
else if (XMLString::equals(nameBuf.getRawBuffer(), XMLUni::fgEncodingString))
curString = EncodingString;
else if (XMLString::equals(nameBuf.getRawBuffer(), XMLUni::fgStandaloneString))
curString = StandaloneString;
else
curString = UnknownString;
// If its an unknown string, then give that error. Else check to
// see if this one has been done already and give that error.
if (curString == UnknownString)
emitError(XMLErrs::ExpectedDeclString, nameBuf.getRawBuffer());
else if (flags[curString] != -1)
emitError(XMLErrs::DeclStringRep, nameBuf.getRawBuffer());
else if (flags[curString] == -1)
flags[curString] = ++curCount;
// Scan for an equal's sign. If we don't find it, issue an error
// but keep trying to go on.
if (!scanEq(true))
emitError(XMLErrs::ExpectedEqSign);
// Get a quote string into the buffer for the string that we are
// currently working on.
if (!getQuotedString(*buffers[curString]))
{
emitError(XMLErrs::ExpectedQuotedString);
fReaderMgr.skipPastChar(chCloseAngle);
return;
}
// And validate the value according which one it was
const XMLCh* rawValue = buffers[curString]->getRawBuffer();
if (curString == VersionString)
{
if (XMLString::equals(rawValue, XMLUni::fgVersion1_1)) {
if (type == Decl_XML) {
fXMLVersion = XMLReader::XMLV1_1;
fReaderMgr.setXMLVersion(XMLReader::XMLV1_1);
}
else {
if (fXMLVersion != XMLReader::XMLV1_1)
emitError(XMLErrs::UnsupportedXMLVersion, rawValue);
}
}
else if (XMLString::equals(rawValue, XMLUni::fgVersion1_0)) {
if (type == Decl_XML) {
fXMLVersion = XMLReader::XMLV1_0;
fReaderMgr.setXMLVersion(XMLReader::XMLV1_0);
}
}
else
emitError(XMLErrs::UnsupportedXMLVersion, rawValue);
}
else if (curString == EncodingString)
{
if (!XMLString::isValidEncName(rawValue))
emitError(XMLErrs::BadXMLEncoding, rawValue);
}
else if (curString == StandaloneString)
{
if (XMLString::equals(rawValue, XMLUni::fgYesString))
fStandalone = true;
else if (XMLString::equals(rawValue, XMLUni::fgNoString))
fStandalone = false;
else
{
emitError(XMLErrs::BadStandalone);
//if (!XMLString::compareIString(rawValue, XMLUni::fgYesString))
//else if (!XMLString::compareIString(rawValue, XMLUni::fgNoString))
if (buffers[curString]->getLen() == 3 &&
(((rawValue[0] == chLatin_y) || (rawValue[0] == chLatin_Y)) &&
((rawValue[1] == chLatin_e) || (rawValue[1] == chLatin_E)) &&
((rawValue[2] == chLatin_s) || (rawValue[2] == chLatin_S))))
fStandalone = true;
else if (buffers[curString]->getLen() == 2 &&
(((rawValue[0] == chLatin_n) || (rawValue[0] == chLatin_N)) &&
((rawValue[1] == chLatin_o) || (rawValue[1] == chLatin_O))))
fStandalone = false;
}
}
}
// Make sure that the strings present are in order. We don't care about
// which ones are present at this point, just that any there are in the
// right order.
int curTop = 0;
for (int index = VersionString; index < StandaloneString; index++)
{
if (flags[index] != -1)
{
if (flags[index] != curTop + 1)
{
emitError(XMLErrs::DeclStringsInWrongOrder);
break;
}
curTop = flags[index];
}
}
// If its an XML decl, the version must be present.
// If its a Text decl, then encoding must be present AND standalone must not be present.
if ((type == Decl_XML) && (flags[VersionString] == -1))
emitError(XMLErrs::XMLVersionRequired);
else if (type == Decl_Text) {
if (flags[StandaloneString] != -1)
emitError(XMLErrs::StandaloneNotLegal);
if (flags[EncodingString] == -1)
emitError(XMLErrs::EncodingRequired);
}
if (!fReaderMgr.skippedChar(chQuestion))
{
emitError(XMLErrs::UnterminatedXMLDecl);
fReaderMgr.skipPastChar(chCloseAngle);
}
else if (!fReaderMgr.skippedChar(chCloseAngle))
{
emitError(XMLErrs::UnterminatedXMLDecl);
fReaderMgr.skipPastChar(chCloseAngle);
}
// Do this before we possibly update the reader with the
// actual encoding string. Otherwise, we will pass the wrong thing
// for the last parameter!
const XMLCh* actualEnc = fReaderMgr.getCurrentEncodingStr();
// Ok, we've now seen the real encoding string, if there was one, so
// lets call back on the current reader and tell it what the real
// encoding string was. If it fails, that's because it represents some
// sort of contradiction with the autosensed format, and it keeps the
// original encoding.
//
// NOTE: This can fail for a number of reasons, such as a bogus encoding
// name or because its in flagrant contradiction of the auto-sensed
// format.
if (flags[EncodingString] != -1)
{
if (!fReaderMgr.getCurrentReader()->setEncoding(bbEncoding.getRawBuffer()))
emitError(XMLErrs::ContradictoryEncoding, bbEncoding.getRawBuffer());
else
actualEnc = bbEncoding.getRawBuffer();
}
// If we have a document handler then call the XML Decl callback.
if (type == Decl_XML)
{
if (fDocHandler)
fDocHandler->XMLDecl
(
bbVersion.getRawBuffer()
, bbEncoding.getRawBuffer()
, bbStand.getRawBuffer()
, actualEnc
);
}
else if (type == Decl_Text)
{
if (fDocTypeHandler)
fDocTypeHandler->TextDecl
(
bbVersion.getRawBuffer()
, bbEncoding.getRawBuffer()
);
}
}
const XMLCh* XMLScanner::getURIText(const unsigned int uriId) const
{
if (fURIStringPool->exists(uriId)) {
// Look up the URI in the string pool and return its id
const XMLCh* value = fURIStringPool->getValueForId(uriId);
if (!value)
return XMLUni::fgZeroLenString;
return value;
}
else
return XMLUni::fgZeroLenString;
}
bool XMLScanner::getURIText( const unsigned int uriId
, XMLBuffer& uriBufToFill) const
{
if (fURIStringPool->exists(uriId)) {
// Look up the URI in the string pool and return its id
const XMLCh* value = fURIStringPool->getValueForId(uriId);
if (!value)
return false;
uriBufToFill.set(value);
return true;
}
else
return false;
}
bool XMLScanner::checkXMLDecl(bool startWithAngle) {
// [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
// [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
//
// [3] S ::= (#x20 | #x9 | #xD | #xA)+
if (startWithAngle) {
if (fReaderMgr.peekString(XMLUni::fgXMLDeclString)) {
if (fReaderMgr.skippedString(XMLUni::fgXMLDeclStringSpace)
|| fReaderMgr.skippedString(XMLUni::fgXMLDeclStringHTab)
|| fReaderMgr.skippedString(XMLUni::fgXMLDeclStringLF)
|| fReaderMgr.skippedString(XMLUni::fgXMLDeclStringCR))
{
return true;
}
}
else if (fReaderMgr.skippedString(XMLUni::fgXMLDeclStringSpaceU)
|| fReaderMgr.skippedString(XMLUni::fgXMLDeclStringHTabU)
|| fReaderMgr.skippedString(XMLUni::fgXMLDeclStringLFU)
|| fReaderMgr.skippedString(XMLUni::fgXMLDeclStringCRU))
{
// Just in case, check for upper case. If found, issue
// an error, but keep going.
emitError(XMLErrs::XMLDeclMustBeLowerCase);
return true;
}
}
else {
if (fReaderMgr.peekString(XMLUni::fgXMLString)) {
if (fReaderMgr.skippedString(XMLUni::fgXMLStringSpace)
|| fReaderMgr.skippedString(XMLUni::fgXMLStringHTab)
|| fReaderMgr.skippedString(XMLUni::fgXMLStringLF)
|| fReaderMgr.skippedString(XMLUni::fgXMLStringCR))
{
return true;
}
}
else if (fReaderMgr.skippedString(XMLUni::fgXMLStringSpaceU)
|| fReaderMgr.skippedString(XMLUni::fgXMLStringHTabU)
|| fReaderMgr.skippedString(XMLUni::fgXMLStringLFU)
|| fReaderMgr.skippedString(XMLUni::fgXMLStringCRU))
{
// Just in case, check for upper case. If found, issue
// an error, but keep going.
emitError(XMLErrs::XMLDeclMustBeLowerCase);
return true;
}
}
return false;
}
// ---------------------------------------------------------------------------
// XMLScanner: Grammar preparsing
// ---------------------------------------------------------------------------
Grammar* XMLScanner::loadGrammar(const XMLCh* const systemId
, const short grammarType
, const bool toCache)
{
InputSource* srcToUse = 0;
if (fEntityHandler){
ReaderMgr::LastExtEntityInfo lastInfo;
fReaderMgr.getLastExtEntityInfo(lastInfo);
XMLResourceIdentifier resourceIdentifier(XMLResourceIdentifier::ExternalEntity,
systemId, 0, XMLUni::fgZeroLenString, lastInfo.systemId,
&fReaderMgr);
srcToUse = fEntityHandler->resolveEntity(&resourceIdentifier);
}
// First we try to parse it as a URL. If that fails, we assume its
// a file and try it that way.
if (!srcToUse) {
if (fDisableDefaultEntityResolution)
return 0;
try
{
// Create a temporary URL. Since this is the primary document,
// it has to be fully qualified. If not, then assume we are just
// mistaking a file for a URL.
XMLURL tmpURL(fMemoryManager);
if (XMLURL::parse(systemId, tmpURL)) {
if (tmpURL.isRelative())
{
if (!fStandardUriConformant)
srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager);
else {
// since this is the top of the try/catch, cannot call ThrowXMLwithMemMgr
// emit the error directly
MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_NoProtocolPresent, fMemoryManager);
fInException = true;
emitError
(
XMLErrs::XMLException_Fatal
, e.getCode()
, e.getType()
, e.getMessage()
);
return 0;
}
}
else
{
if (fStandardUriConformant && tmpURL.hasInvalidChar()) {
MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL, fMemoryManager);
fInException = true;
emitError
(
XMLErrs::XMLException_Fatal
, e.getCode()
, e.getType()
, e.getMessage()
);
return 0;
}
srcToUse = new (fMemoryManager) URLInputSource(tmpURL, fMemoryManager);
}
}
else
{
if (!fStandardUriConformant)
srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager);
else {
// since this is the top of the try/catch, cannot call ThrowXMLwithMemMgr
// emit the error directly
// lazy bypass ... since all MalformedURLException are fatal, no need to check the type
MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL);
fInException = true;
emitError
(
XMLErrs::XMLException_Fatal
, e.getCode()
, e.getType()
, e.getMessage()
);
return 0;
}
}
}
catch(const XMLException& excToCatch)
{
// For any other XMLException,
// emit the error and catch any user exception thrown from here.
fInException = true;
if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
emitError
(
XMLErrs::XMLException_Warning
, excToCatch.getCode()
, excToCatch.getType()
, excToCatch.getMessage()
);
else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
emitError
(
XMLErrs::XMLException_Fatal
, excToCatch.getCode()
, excToCatch.getType()
, excToCatch.getMessage()
);
else
emitError
(
XMLErrs::XMLException_Error
, excToCatch.getCode()
, excToCatch.getType()
, excToCatch.getMessage()
);
return 0;
}
}
Janitor<InputSource> janSrc(srcToUse);
return loadGrammar(*srcToUse, grammarType, toCache);
}
Grammar* XMLScanner::loadGrammar(const char* const systemId
, const short grammarType
, const bool toCache)
{
// We just delegate this to the XMLCh version after transcoding
XMLCh* tmpBuf = XMLString::transcode(systemId, fMemoryManager);
ArrayJanitor<XMLCh> janBuf(tmpBuf, fMemoryManager);
return loadGrammar(tmpBuf, grammarType, toCache);
}
// ---------------------------------------------------------------------------
// XMLScanner: Setter methods
// ---------------------------------------------------------------------------
void XMLScanner::setURIStringPool(XMLStringPool* const stringPool)
{
fURIStringPool = stringPool;
fEmptyNamespaceId = fURIStringPool->addOrFind(XMLUni::fgZeroLenString);
fUnknownNamespaceId = fURIStringPool->addOrFind(XMLUni::fgUnknownURIName);
fXMLNamespaceId = fURIStringPool->addOrFind(XMLUni::fgXMLURIName);
fXMLNSNamespaceId = fURIStringPool->addOrFind(XMLUni::fgXMLNSURIName);
}
// ---------------------------------------------------------------------------
// XMLScanner: Private helper methods
// ---------------------------------------------------------------------------
/***
* In reusing grammars (cacheing grammar from parse, or use cached grammar), internal
* dtd is allowed conditionally.
*
* In the case of cacheing grammar from parse, it is NOT allowed.
*
* In the case of use cached grammar,
* if external dtd is present and it is parsed before, then it is not allowed,
* otherwise it is allowed.
*
***/
void XMLScanner::checkInternalDTD(bool hasExtSubset
,const XMLCh* const sysId
,const XMLCh* const pubId)
{
if (fToCacheGrammar)
ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Val_CantHaveIntSS, fMemoryManager);
if (fUseCachedGrammar && hasExtSubset && !fIgnoreCachedDTD)
{
InputSource* sysIdSrc = resolveSystemId(sysId, pubId);
if (sysIdSrc) {
Janitor<InputSource> janSysIdSrc(sysIdSrc);
Grammar* grammar = fGrammarResolver->getGrammar(sysIdSrc->getSystemId());
if (grammar && grammar->getGrammarType() == Grammar::DTDGrammarType)
{
ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Val_CantHaveIntSS, fMemoryManager);
}
}
}
}
// This method is called after the content scan to insure that all the
// ID/IDREF attributes match up (i.e. that all IDREFs refer to IDs.) This is
// an XML 1.0 rule, so we can do here in the core.
void XMLScanner::checkIDRefs()
{
// Iterate the id ref list. If we find any entries here which are used
// but not declared, then that's an error.
RefHashTableOfEnumerator<XMLRefInfo> refEnum(fValidationContext->getIdRefList(), false, fMemoryManager);
while (refEnum.hasMoreElements())
{
// Get a ref to the current element
const XMLRefInfo& curRef = refEnum.nextElement();
// If its used but not declared, then its an error
if (!curRef.getDeclared() && curRef.getUsed() && fValidate)
fValidator->emitError(XMLValid::IDNotDeclared, curRef.getRefName());
}
}
// This just does a simple check that the passed progressive scan token is
// legal for this scanner.
bool XMLScanner::isLegalToken(const XMLPScanToken& toCheck)
{
return ((fScannerId == toCheck.fScannerId)
&& (fSequenceId == toCheck.fSequenceId));
}
// This method will handle figuring out what the next top level token is
// in the input stream. It will return an enumerated value that indicates
// what it believes the next XML level token must be. It will eat as many
// chars are required to figure out what is next.
XMLScanner::XMLTokens XMLScanner::senseNextToken(unsigned int& orgReader)
{
// Get the next character and use it to guesstimate what the next token
// is going to be. We turn on end of entity exceptions when we do this
// in order to catch the scenario where the current entity ended at
// the > of some markup.
XMLCh nextCh;
// avoid setting up the ThrowEOEJanitor if we know that we have data in the current reader
if(fReaderMgr.getCurrentReader() && fReaderMgr.getCurrentReader()->charsLeftInBuffer()>0)
nextCh = fReaderMgr.peekNextChar();
else
{
ThrowEOEJanitor janMgr(&fReaderMgr, true);
nextCh = fReaderMgr.peekNextChar();
}
// Check for special chars. Start with the most
// obvious end of file, which should be legal here at top level.
if (!nextCh)
return Token_EOF;
// If it's not a '<' we must be in content.
//
// This includes entity references '&' of some sort. These must
// be character data because that's the only place a reference can
// occur in content.
if (nextCh != chOpenAngle)
return Token_CharData;
// Ok it had to have been a '<' character. So get it out of the reader
// and store the reader number where we saw it, passing it back to the
// caller.
fReaderMgr.getNextChar();
orgReader = fReaderMgr.getCurrentReaderNum();
// Ok, so lets go through the things that it could be at this point which
// are all some form of markup.
nextCh = fReaderMgr.peekNextChar();
if (nextCh == chForwardSlash)
{
fReaderMgr.getNextChar();
return Token_EndTag;
}
else if (nextCh == chBang)
{
static const XMLCh gCDATAStr[] =
{
chBang, chOpenSquare, chLatin_C, chLatin_D, chLatin_A
, chLatin_T, chLatin_A, chNull
};
static const XMLCh gCommentString[] =
{
chBang, chDash, chDash, chNull
};
if (fReaderMgr.skippedString(gCDATAStr))
return Token_CData;
if (fReaderMgr.skippedString(gCommentString))
return Token_Comment;
emitError(XMLErrs::ExpectedCommentOrCDATA);
return Token_Unknown;
}
else if (nextCh == chQuestion)
{
// It must be a PI
fReaderMgr.getNextChar();
return Token_PI;
}
// Assume its an element name, so return with a start tag token. If it
// turns out not to be, then it will fail when it cannot get a valid tag.
return Token_StartTag;
}
// ---------------------------------------------------------------------------
// XMLScanner: Private parsing methods
// ---------------------------------------------------------------------------
// This guy just scans out a single or double quoted string of characters.
// It does not pass any judgement on the contents and assumes that it is
// illegal to have another quote of the same kind inside the string's
// contents.
//
// NOTE: This is for simple stuff like the strings in the XMLDecl which
// cannot have any entities inside them. So this guy does not handle any
// end of entity stuff.
bool XMLScanner::getQuotedString(XMLBuffer& toFill)
{
// Reset the target buffer
toFill.reset();
// Get the next char which must be a single or double quote
XMLCh quoteCh;
if (!fReaderMgr.skipIfQuote(quoteCh))
return false;
while (true)
{
// Get another char
const XMLCh nextCh = fReaderMgr.getNextChar();
// See if it matches the starting quote char
if (nextCh == quoteCh)
break;
// We should never get either an end of file null char here. If we
// do, just fail. It will be handled more gracefully in the higher
// level code that called us.
if (!nextCh)
return false;
// Else add it to the buffer
toFill.append(nextCh);
}
return true;
}
// This method scans a character reference and returns the character that
// was refered to. It assumes that we've already scanned the &# characters
// that prefix the numeric code.
bool XMLScanner::scanCharRef(XMLCh& toFill, XMLCh& second)
{
bool gotOne = false;
unsigned int value = 0;
// Set the radix. Its supposed to be a lower case x if hex. But, in
// order to recover well, we check for an upper and put out an error
// for that.
unsigned int radix = 10;
if (fReaderMgr.skippedChar(chLatin_x))
{
radix = 16;
}
else if (fReaderMgr.skippedChar(chLatin_X))
{
emitError(XMLErrs::HexRadixMustBeLowerCase);
radix = 16;
}
while (true)
{
const XMLCh nextCh = fReaderMgr.peekNextChar();
// Watch for EOF
if (!nextCh)
ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
// Break out on the terminating semicolon
if (nextCh == chSemiColon)
{
fReaderMgr.getNextChar();
break;
}
// Convert this char to a binary value, or bail out if its not
// one.
unsigned int nextVal;
if ((nextCh >= chDigit_0) && (nextCh <= chDigit_9))
nextVal = (unsigned int)(nextCh - chDigit_0);
else if ((nextCh >= chLatin_A) && (nextCh <= chLatin_F))
nextVal= (unsigned int)(10 + (nextCh - chLatin_A));
else if ((nextCh >= chLatin_a) && (nextCh <= chLatin_f))
nextVal = (unsigned int)(10 + (nextCh - chLatin_a));
else
{
// Return a zero
toFill = 0;
// If we got at least a sigit, then do an unterminated ref error.
// Else, do an expected a numerical ref thing.
if (gotOne)
emitError(XMLErrs::UnterminatedCharRef);
else
emitError(XMLErrs::ExpectedNumericalCharRef);
// Return failure
return false;
}
// Make sure its valid for the radix. If not, then just eat the
// digit and go on after issueing an error. Else, update the
// running value with this new digit.
if (nextVal >= radix)
{
XMLCh tmpStr[2];
tmpStr[0] = nextCh;
tmpStr[1] = chNull;
emitError(XMLErrs::BadDigitForRadix, tmpStr);
}
else
{
value = (value * radix) + nextVal;
// Guard against overflow.
if (value > 0x10FFFF) {
// Character reference was not in the valid range
emitError(XMLErrs::InvalidCharacterRef);
return false;
}
}
// Indicate that we got at least one good digit
gotOne = true;
// And eat the last char
fReaderMgr.getNextChar();
}
// Return the char (or chars)
// And check if the character expanded is valid or not
if (value >= 0x10000 && value <= 0x10FFFF)
{
value -= 0x10000;
toFill = XMLCh((value >> 10) + 0xD800);
second = XMLCh((value & 0x3FF) + 0xDC00);
}
else if (value <= 0xFFFD)
{
toFill = XMLCh(value);
second = 0;
if (!fReaderMgr.getCurrentReader()->isXMLChar(toFill) && !fReaderMgr.getCurrentReader()->isControlChar(toFill)) {
// Character reference was not in the valid range
emitError(XMLErrs::InvalidCharacterRef);
return false;
}
}
else {
// Character reference was not in the valid range
emitError(XMLErrs::InvalidCharacterRef);
return false;
}
return true;
}
// We get here after the '<!--' part of the comment. We scan past the
// terminating '-->' It will calls the appropriate handler with the comment
// text, if one is provided. A comment can be in either the document or
// the DTD, so the fInDocument flag is used to know which handler to send
// it to.
void XMLScanner::scanComment()
{
enum States
{
InText
, OneDash
, TwoDashes
};
// Get a buffer for this
XMLBufBid bbComment(&fBufMgr);
// Get the comment text into a temp buffer. Be sure to use temp buffer
// two here, since its to be used for stuff that is potentially longer
// than just a name.
States curState = InText;
bool gotLeadingSurrogate = false;
while (true)
{
// Get the next character
const XMLCh nextCh = fReaderMgr.getNextChar();
// Watch for an end of file
if (!nextCh)
{
emitError(XMLErrs::UnterminatedComment);
ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
}
// Check for correct surrogate pairs
if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
{
if (gotLeadingSurrogate)
emitError(XMLErrs::Expected2ndSurrogateChar);
else
gotLeadingSurrogate = true;
}
else
{
if (gotLeadingSurrogate)
{
if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))
emitError(XMLErrs::Expected2ndSurrogateChar);
}
// Its got to at least be a valid XML character
else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh)) {
XMLCh tmpBuf[9];
XMLString::binToText
(
nextCh
, tmpBuf
, 8
, 16
, fMemoryManager
);
emitError(XMLErrs::InvalidCharacter, tmpBuf);
}
gotLeadingSurrogate = false;
}
if (curState == InText)
{
// If its a dash, go to OneDash state. Otherwise take as text
if (nextCh == chDash)
curState = OneDash;
else
bbComment.append(nextCh);
}
else if (curState == OneDash)
{
// If its another dash, then we change to the two dashes states.
// Otherwise, we have to put in the deficit dash and the new
// character and go back to InText.
if (nextCh == chDash)
{
curState = TwoDashes;
}
else
{
bbComment.append(chDash);
bbComment.append(nextCh);
curState = InText;
}
}
else if (curState == TwoDashes)
{
// The next character must be the closing bracket
if (nextCh != chCloseAngle)
{
emitError(XMLErrs::IllegalSequenceInComment);
fReaderMgr.skipPastChar(chCloseAngle);
return;
}
break;
}
}
// If we have an available handler, call back with the comment.
if (fDocHandler)
{
fDocHandler->docComment
(
bbComment.getRawBuffer()
);
}
//mark comment is seen within the current element
if (! fElemStack.isEmpty())
fElemStack.setCommentOrPISeen();
}
// Most equal signs can have white space around them, so this little guy
// just makes the calling code cleaner by eating whitespace.
bool XMLScanner::scanEq(bool inDecl)
{
fReaderMgr.skipPastSpaces(inDecl);
if (fReaderMgr.skippedChar(chEqual))
{
fReaderMgr.skipPastSpaces(inDecl);
return true;
}
return false;
}
unsigned int
XMLScanner::scanUpToWSOr(XMLBuffer& toFill, const XMLCh chEndChar)
{
fReaderMgr.getUpToCharOrWS(toFill, chEndChar);
return toFill.getLen();
}
unsigned int *XMLScanner::getNewUIntPtr()
{
// this method hands back a new pointer initialized to 0
unsigned int *retVal;
if(fUIntPoolCol < 64)
{
retVal = fUIntPool[fUIntPoolRow]+fUIntPoolCol;
fUIntPoolCol++;
return retVal;
}
// time to grow the pool...
if(fUIntPoolRow+1 == fUIntPoolRowTotal)
{
// and time to add some space for new rows:
fUIntPoolRowTotal <<= 1;
unsigned int **newArray = (unsigned int **)fMemoryManager->allocate(sizeof(unsigned int *) * fUIntPoolRowTotal );
memcpy(newArray, fUIntPool, (fUIntPoolRow+1) * sizeof(unsigned int *));
fMemoryManager->deallocate(fUIntPool);
fUIntPool = newArray;
// need to 0 out new elements we won't need:
for (unsigned int i=fUIntPoolRow+2; i<fUIntPoolRowTotal; i++)
fUIntPool[i] = 0;
}
// now to add a new row; we just made sure we have space
fUIntPoolRow++;
fUIntPool[fUIntPoolRow] = (unsigned int *)fMemoryManager->allocate(sizeof(unsigned int) << 6);
memset(fUIntPool[fUIntPoolRow], 0, sizeof(unsigned int) << 6);
// point to next element
fUIntPoolCol = 1;
return fUIntPool[fUIntPoolRow];
}
void XMLScanner::resetUIntPool()
{
// to reuse the unsigned int pool--and the hashtables that use it--
// simply reinitialize everything to 0's
for(unsigned int i = 0; i<= fUIntPoolRow; i++)
memset(fUIntPool[i], 0, sizeof(unsigned int) << 6);
}
void XMLScanner::recreateUIntPool()
{
// this allows a bloated unsigned int pool to be dispensed with
// first, delete old fUIntPool
for (unsigned int i=0; i<=fUIntPoolRow; i++)
{
fMemoryManager->deallocate(fUIntPool[i]);
}
fMemoryManager->deallocate(fUIntPool);
fUIntPoolRow = fUIntPoolCol = 0;
fUIntPoolRowTotal = 2;
fUIntPool = (unsigned int **)fMemoryManager->allocate(sizeof(unsigned int *) * fUIntPoolRowTotal);
fUIntPool[0] = (unsigned int *)fMemoryManager->allocate(sizeof(unsigned int) << 6);
memset(fUIntPool[fUIntPoolRow], 0, sizeof(unsigned int) << 6);
fUIntPool[1] = 0;
}
XERCES_CPP_NAMESPACE_END