// libwbxml/src/wbxml_parser.cpp - platform/packages/apps/IM - Git at Google

 /*
  * Copyright (C) 2008 Esmertec AG.
  * Copyright (C) 2008 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #include <stdio.h>
 #include <stdlib.h>
 #include <setjmp.h>
 #include <assert.h>
 #include "wbxml_parser.h"
 #include "csp13_data.h"
 #ifdef SUPPORT_SYNCML
 #include "syncml_data.h"
 #endif

 // NOTE(review): bsearch() is declared by hand for Android builds, presumably
 // because the platform <stdlib.h> did not declare it - confirm before removing.
 #ifdef PLATFORM_ANDROID
 extern "C" void *bsearch(const void *key, const void *base0, size_t nmemb,
         size_t size, int (*compar)(const void *, const void *));
 #endif

 /* Element count of a true array (not a pointer).  The argument is fully
  * parenthesized so that expression arguments expand as intended. */
 #define ARRAY_SIZE(a)   (sizeof(a) / sizeof((a)[0]))

 //#define WBXML_DEBUG 1

 /* Major TODO items:
    - Attribute value tokens (not used by IMPS CSP)
    - EXT_* except EXT_T_0 (not used by IMPS CSP)
    - PI (not used by IMPS CSP)
    - cleanups

    Other TODO:
    - Support more public ID? Only IMPS is supported now.
    - Support other charsets than UTF-8
  */

 static int compareTokenData(const void * t1, const void * t2)
 {
     return ((TokenData *)t1)->token - ((TokenData *)t2)->token;
 }

 static int compareAttrData(const void * t1, const void * t2)
 {
     return ((AttrData *)t1)->token - ((AttrData *)t2)->token;
 }

 /*
  * Tells whether a content byte begins a new element.  SWITCH_PAGE counts as
  * a tag start; any other byte qualifies when its low six bits fall in the
  * range [TOKEN_LITERAL, TOKEN_EXT_I_0).
  */
 static bool isTagStart(int token)
 {
     if (token != TOKEN_SWITCH_PAGE) {
         const int identity = token & 0x3f;
         return !(identity < TOKEN_LITERAL || identity >= TOKEN_EXT_I_0);
     }
     return true;
 }

 /*
  * Tells whether a byte begins a new attribute: either a value in
  * [TOKEN_LITERAL, TOKEN_EXT_I_0) or one strictly between TOKEN_LITERAL_C
  * and 0x80.
  */
 static bool isAttrStart(int token)
 {
     if (token >= TOKEN_LITERAL && token < TOKEN_EXT_I_0) {
         return true;
     }
     return token > TOKEN_LITERAL_C && token < 0x80;
 }

 /*
  * Creates a parser.  transportEncoding is the charset used when the
  * document header does not name one (see the header handling in parse()).
  */
 WbxmlParser::WbxmlParser(uint32_t transportEncoding)
     : mTransportEncoding(transportEncoding)
 {
     // Start from the state that reset() establishes.
     reset();
 }

 /* Nothing to release explicitly here. */
 WbxmlParser::~WbxmlParser() {}

 /*
  * Returns the parser to its initial state: the content handler, buffered
  * input, open-element stack and string table are dropped, code page 0 is
  * selected for tags and attributes, the public id and last error are
  * cleared, and a document header is expected next.
  */
 void WbxmlParser::reset(void)
 {
     // Parsing progress.
     mState = EXPECT_HEADER;
     mLastError = ERROR_NO_ERROR;
     mPublicId = 0;
     mCurrTagPage = 0;
     mCurrAttrPage = 0;

     // Per-document tables.
     mStringTable.clear();
     mStartElemStack.clear();

     // Input buffering.
     mIsDataEnd = false;
     mDataOffset = 0;
     mLastChunk.clear();
     mExternalChunkLen = 0;
     mExternalChunk = NULL;

     // The handler is forgotten too; callers must install it again.
     mContentHandler = NULL;
 }

 /*
  * Installs the object that receives parse events.  NULL is accepted: every
  * callback site in parse() checks the handler before using it.
  */
 void WbxmlParser::setContentHandler(WbxmlContentHandler * handler)
 {
     this->mContentHandler = handler;
 }

 /**
  * Feeds one chunk of WBXML input to the parser and advances the state
  * machine as far as the buffered bytes allow.
  *
  * The low-level readers report failures by longjmp()ing back to the setjmp()
  * below with a ParserError code.  Running out of input while `end` is false
  * is not an error: the read position is rewound to the start of the current
  * state step, the unconsumed bytes are saved, and the next call retries that
  * step with more data.
  *
  * @param data     chunk to parse; NULL is rejected with ERROR_INVALID_DATA
  * @param dataLen  number of bytes in data
  * @param end      true if no more data follows this chunk
  * @return WBXML_STATUS_OK once the outermost element has been closed, or
  *         when more input is needed and `end` is false; WBXML_STATUS_ERROR
  *         otherwise, with the cause stored in mLastError.
  */
 int WbxmlParser::parse(const char * data, uint32_t dataLen, bool end)
 {
     if (data == NULL) {
         mLastError = ERROR_INVALID_DATA;
         return WBXML_STATUS_ERROR;
     }

     // All temporary C++ variables must be declared before setjmp to make
     // sure they get properly destructed after longjmp.
     vector<Attribute> attribs;
     Attribute attrib;
     string tagName;
     string characters;
     string opaque;

 #ifdef WBXML_DEBUG
     printf("\nparse dataLen %d; end %d; readPos %d; availData %d\n",
         dataLen, end, getReadPos(), availDataSize());
 #endif
     appendData(data, dataLen, end);
     // volatile: written after setjmp() and read again after a longjmp().
     volatile int readPos = getReadPos();
     int setjmpRet;
     switch (setjmpRet = setjmp(mJmpbuf)) {
         case 0:
             // Direct return from setjmp: fall into the state machine.
             break;

         case ERROR_NEED_MORE_DATA:
             if (!mIsDataEnd) {
 #ifdef WBXML_DEBUG
                 printf("\nneed more data: readPos %d\n", readPos);
 #endif
                 // Rewind to the start of the interrupted step and keep the
                 // unconsumed bytes for the next call.
                 setReadPos(readPos);
                 saveRemainingData();
                 return WBXML_STATUS_OK;
             } else {
 #ifdef WBXML_DEBUG
                 printf("wbxml parser error: unexpected data end\n");
 #endif
                 mLastError = ERROR_NEED_MORE_DATA;
                 return WBXML_STATUS_ERROR;
             }
             break;

         case ERROR_UNSUPPORTED_PUBID:
         case ERROR_UNSUPPORTED_CHARSET:
         case ERROR_INVALID_STRING_TABLE:
         case ERROR_INVALID_STRING_TABLE_REFERENCE:
         case ERROR_INVALID_EXT_TOKEN:
         case ERROR_INVALID_MBUINT:
         case ERROR_INVALID_ENTITY:
         case ERROR_UNRECOGNIZED_TAG:
         case ERROR_UNRECOGNIZED_ATTR:
         case ERROR_MISSING_ATTR:
         case ERROR_MISSING_TOKEN_END:
 #ifdef WBXML_DEBUG
             printf("wbxml parser error %d\n", setjmpRet);
 #endif
             mLastError = ParserError(setjmpRet);
             return WBXML_STATUS_ERROR;
             break;

         case ERROR_NOT_SUPPORTED_YET:
             printf("wbxml parser error: Not implemented feature.\n");
             mLastError = ParserError(setjmpRet);
             return WBXML_STATUS_ERROR;
             break;

         default:
             printf("wbxml parser error: Impossible execution path.\n");
             mLastError = ParserError(setjmpRet);
             return WBXML_STATUS_ERROR;
             break;
     }

     for (;;) {
         // save readPos for error recovery
         readPos = getReadPos();

         switch (mState) {
             case EXPECT_HEADER:
                 mDocVersion = readByte();

                 mPublicId = readMbuint32();
                 if (mPublicId != 0) {
                     if (!selectTokenMapping(mPublicId)) {
 #ifdef WBXML_DEBUG
                         printf("wbxml parser error: unsupported public id \n");
 #endif
                         longjmp(mJmpbuf, ERROR_UNSUPPORTED_PUBID);
                     }
                 } else {
                     // A zero public id is followed by the string-table
                     // offset of the id literal.  It is stored negated until
                     // the string table has been read.
                     uint32_t pubIdOffset = readMbuint32();
                     if (pubIdOffset > 0x7fffffffu) {
                         // The negated value would not stay non-positive, and
                         // the document would then be parsed with no token
                         // mapping selected.
                         longjmp(mJmpbuf, ERROR_UNSUPPORTED_PUBID);
                     }
                     mPublicId = -static_cast<int>(pubIdOffset);
                 }
                 mCharset = readMbuint32();
                 if (!mCharset) {
                     // No charset in the document: use the transport's, and
                     // UTF-8 if that is unknown as well.
                     mCharset = mTransportEncoding;
                     if (!mCharset) {
                         mCharset = CHARSET_UTF8;
                     }
                 }
                 // TODO: support more charsets other than UTF-8
                 if (mCharset != CHARSET_UTF8) {
 #ifdef WBXML_DEBUG
                     printf("wbxml parser error: unsupported charset\n");
 #endif
                     longjmp(mJmpbuf, ERROR_UNSUPPORTED_CHARSET);
                 }

                 // now advance to next state
                 if (mContentHandler) {
                     mContentHandler->handlePublicId(mPublicId);
                 }
                 mState = EXPECT_STRING_TABLE;
                 break;

             case EXPECT_STRING_TABLE:
             {
                 uint32_t len = readMbuint32();
                 if (availDataSize() < len) {
                     longjmp(mJmpbuf, ERROR_NEED_MORE_DATA);
                 }
                 mStringTable.clear();
                 // TODO: optimize this
                 while (len--) {
                     mStringTable += readByte();
                 }
                 if (mStringTable.size()) {
                     if (mStringTable[mStringTable.size() - 1] != 0) {
                         // must have an ending \0
                         //TODO:the byte array returned by SCTS does not contain '\0' at the
                         //end,should this be fixed accordingly?
 #ifdef WBXML_DEBUG
                         printf("wbxml parser error: invalid string table\n");
 #endif
                         longjmp(mJmpbuf, ERROR_INVALID_STRING_TABLE);
                     }
                 }
                 mState = EXPECT_BODY_START;
                 if (mPublicId <= 0) {
                     // The public id is a string-table literal.  Validate the
                     // offset first: it comes straight from the input and
                     // must not point past the table's terminating NUL.
                     const uint32_t pubIdOffset =
                             0u - static_cast<uint32_t>(mPublicId);
                     if (pubIdOffset > mStringTable.size()) {
                         longjmp(mJmpbuf, ERROR_UNSUPPORTED_PUBID);
                     }
                     const char * s = mStringTable.c_str() + pubIdOffset;
 #ifdef SUPPORT_SYNCML
                     if (strcmp(s, "-//SYNCML//DTD SyncML 1.2//EN") == 0) {
                         mPublicId = PUBLICID_SYNCML_1_2;
                     } else if (strcmp(s, "-//SYNCML//DTD SyncML 1.1//EN") == 0) {
                         mPublicId = PUBLICID_SYNCML_1_1;
                     } else if (strcmp(s, "-//SYNCML//DTD SyncML 1.0//EN") == 0) {
                         mPublicId = PUBLICID_SYNCML_1_0;
                     }
 #endif
                     if ((mPublicId <= 0) || !selectTokenMapping(mPublicId)) {
                         longjmp(mJmpbuf, ERROR_UNSUPPORTED_PUBID);
                     }
                 }
                 break;
             }

             case EXPECT_BODY_START:
                 //TODO: handle possible PIs
                 mState = EXPECT_ELEMENT_START;
                 break;

             case EXPECT_ELEMENT_START:
             {
                 int stag = readByte();
                 const char * name;
                 if ((stag & 0x3f) == TOKEN_LITERAL) {
                     // Tag name is given as a string-table reference.
                     name = resolveStrTableRef();
                 } else {
                     if (stag == TOKEN_SWITCH_PAGE) {
                         mCurrTagPage = readByte();
                         stag = readByte();
                     }
                     name = lookupTagName(stag);
                 }
                 if (name == NULL) {
 #ifdef WBXML_DEBUG
                     printf("wbxml parser error: unrecognized tag\n");
 #endif
                     longjmp(mJmpbuf, ERROR_UNRECOGNIZED_TAG);
                 }
                 attribs.clear();
                 if (stag & 0x80) {
                     // followed by 1 or more attributes
                     while (peekByte() != TOKEN_END) {
                         readAttribute(&attrib);
                         attribs.push_back(attrib);
                     }
                     if (!attribs.size()) {
 #ifdef WBXML_DEBUG
                         printf("wbxml parser error: missing attributes\n");
 #endif
                         longjmp(mJmpbuf, ERROR_MISSING_ATTR);
                     }
                     // TOKEN_END
                     readByte();
                 }
                 if (mContentHandler) {
                     mContentHandler->startElement(name, attribs);
                 }
                 // Bit 0x40 set: the element has content; otherwise it is
                 // closed immediately.
                 if (stag & 0x40) {
                     mState = EXPECT_CONTENT;
                 } else {
                     mState = ELEMENT_END;
                 }
                 tagName = name;
                 mStartElemStack.push_back(name);
                 break;
             }

             case EXPECT_CONTENT:
             {
                 // NOTE(review): `characters` is local to this call, so text
                 // accumulated here is not carried over if parse() returns
                 // for more data before it is flushed - confirm whether
                 // callers can hit this.
                 int byte = peekByte();
                 if (byte == TOKEN_SWITCH_PAGE) {
                     readByte();
                     mCurrTagPage = readByte();
                     byte = peekByte();
                 }
                 if (isTagStart(byte) || byte == TOKEN_END) {
                     // Text run is over: flush what has been collected.
                     if (characters.size() && mContentHandler) {
                         mContentHandler->characters(characters.c_str(), characters.size());
                         characters.clear();
                     }
                     if (byte == TOKEN_END) {
                         mState = EXPECT_ELEMENT_END;
                     } else {
                         mState = EXPECT_ELEMENT_START;
                     }
                 } else {
                     // TODO: handle extension and pi
                     switch (byte) {
                         case TOKEN_ENTITY:
                         case TOKEN_STR_I:
                         case TOKEN_STR_T:
                             readString(characters);
                             break;

                         case TOKEN_EXT_T_0:
                         {
                             readByte();
                             uint32_t valueToken = readMbuint32();
                             if (mPublicId == PUBLICID_IMPS_1_1
                                     || mPublicId == PUBLICID_IMPS_1_2
                                     || mPublicId == PUBLICID_IMPS_1_3) {
                                 // IMPS: map the extension token to its text.
                                 TokenData t = {valueToken, NULL};
                                 const TokenData * res = (TokenData *)bsearch(&t,
                                         csp13ExtValueTokens, ARRAY_SIZE(csp13ExtValueTokens),
                                         sizeof(csp13ExtValueTokens[0]), compareTokenData);
                                 if (res) {
                                     characters.append(res->tagName);
                                 } else {
                                     longjmp(mJmpbuf, ERROR_INVALID_EXT_TOKEN);
                                 }
                             } else {
                                 printf ("Token 0x%x\n", byte);
                                 longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET);
                             }
                             break;
                         }

                         case TOKEN_OPAQUE:
                         {
                             readByte();
                             uint32_t opaqueDataLen = readMbuint32();
                             opaque.clear();
                             while (opaqueDataLen--) {
                                 opaque += (char)readByte();
                             }
                             if (mContentHandler) {
                                 mContentHandler->opaque(opaque.c_str(), opaque.size());
                             }
                             break;
                         }

                         default:
                             printf ("Token 0x%x\n", byte);
                             longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET);
                             break;
                     }
                 }
                 break;
             }

             case EXPECT_ELEMENT_END:
                 if (readByte() != TOKEN_END) {
 #ifdef WBXML_DEBUG
                     printf("wbxml parser error: TOKEN_END expected\n");
 #endif
                     longjmp(mJmpbuf, ERROR_MISSING_TOKEN_END);
                 }
                 mState = ELEMENT_END;
                 break;

             case ELEMENT_END:
                 assert(!mStartElemStack.empty());

                 // Close the innermost open element.
                 tagName = mStartElemStack.back();
                 mStartElemStack.pop_back();
                 if (mContentHandler) {
                     mContentHandler->endElement(tagName.c_str());
                 }
                 if (mStartElemStack.empty()) {
                     mState = EXPECT_BODY_END;
                 } else {
                     mState = EXPECT_CONTENT;
                 }
                 break;

             case EXPECT_BODY_END:
                 // TODO: handle possible PIs

                 // we're done
                 return WBXML_STATUS_OK;
                 break;
         }
     }
 }

 /*
  * Registers the caller's chunk as the current external input.  Only the
  * pointer and length are recorded here - nothing is copied; whatever is
  * left unconsumed is copied by saveRemainingData() before parse() returns.
  * `end` records whether this is the last chunk.
  */
 void WbxmlParser::appendData(const char * data, uint32_t len, bool end)
 {
     mIsDataEnd = end;
     mExternalChunkLen = len;
     mExternalChunk = data;
 }

 /*
  * Copies the not-yet-consumed input into mLastChunk and detaches the
  * external chunk.  If the read position is already inside the external
  * chunk, only its unread tail is kept and the offset restarts at 0;
  * otherwise the whole external chunk is appended to the saved data.
  */
 void WbxmlParser::saveRemainingData()
 {
     const bool pastSavedData = mDataOffset > mLastChunk.size();
     if (!pastSavedData) {
         mLastChunk.append(mExternalChunk, mExternalChunkLen);
     } else {
         uint32_t consumed = mDataOffset - mLastChunk.size();
         assert(consumed <= mExternalChunkLen);
         const char * unread = mExternalChunk + consumed;
         mLastChunk.assign(unread, mExternalChunkLen - consumed);
         mDataOffset = 0;
     }
     mExternalChunkLen = 0;
     mExternalChunk = NULL;
 }

 /*
  * Consumes and returns the next input byte (0..255), taken from the saved
  * chunk first and then from the external chunk.  When no byte is available
  * it does not return: it longjmp()s with ERROR_NEED_MORE_DATA.
  */
 int WbxmlParser::readByte()
 {
     unsigned char value;
     if (mDataOffset < mLastChunk.size()) {
         value = (unsigned char)mLastChunk[mDataOffset];
     } else {
         uint32_t extIndex = mDataOffset - mLastChunk.size();
         if (extIndex >= mExternalChunkLen) {
             longjmp(mJmpbuf, ERROR_NEED_MORE_DATA);
         }
         value = (unsigned char)mExternalChunk[extIndex];
     }
 #ifdef WBXML_DEBUG
     printf ("rb 0x%x; ", value);
 #endif
     mDataOffset++;
     return value;
 }

 /*
  * Returns the next input byte (0..255) without consuming it.  When no byte
  * is available it does not return: it longjmp()s with ERROR_NEED_MORE_DATA.
  */
 int WbxmlParser::peekByte()
 {
     if (mDataOffset >= mLastChunk.size()) {
         // Saved data exhausted: look into the external chunk.
         uint32_t extIndex = mDataOffset - mLastChunk.size();
         if (extIndex >= mExternalChunkLen) {
             longjmp(mJmpbuf, ERROR_NEED_MORE_DATA);
         }
         return (unsigned char)mExternalChunk[extIndex];
     }
     return (unsigned char)mLastChunk[mDataOffset];
 }

 /*
  * Reads a multi-byte unsigned integer: each byte contributes its low seven
  * bits, most significant group first, and a set high bit means another
  * byte follows.  longjmp()s with ERROR_INVALID_MBUINT if the value would
  * not fit in 32 bits.
  */
 uint32_t WbxmlParser::readMbuint32()
 {
     uint32_t result = 0;
     for (;;) {
         // Shifting by 7 would lose bits once anything above bit 24 is set.
         if ((result >> 25) != 0) {
             longjmp(mJmpbuf, ERROR_INVALID_MBUINT);
         }
         const uint32_t octet = readByte();
         result = (result << 7) | (octet & 0x7f);
         if (!(octet & 0x80)) {
             return result;
         }
     }
 }

 /**
  * Consumes one STR_I, STR_T or ENTITY token and *appends* its text to str.
  *  - STR_I:  inline bytes up to (not including) the terminating 0.
  *  - STR_T:  the NUL-terminated string at a string-table offset.
  *  - ENTITY: a character code, appended as its UTF-8 encoding.
  * Any other token is reported with ERROR_NOT_SUPPORTED_YET; a character
  * code above 0x10ffff with ERROR_INVALID_ENTITY.
  */
 void WbxmlParser::readString(string & str)
 {
     const int token = readByte();

     if (token == TOKEN_STR_I) {
         //TODO: assuming UTF-8
         for (int c = readByte(); c != 0; c = readByte()) {
             str += (char)c;
         }
     } else if (token == TOKEN_STR_T) {
         const char * tableStr = resolveStrTableRef();
         str.append(tableStr, strlen(tableStr));
     } else if (token == TOKEN_ENTITY) {
         //TODO: assuming UTF-8 for now.
         const uint32_t code = readMbuint32();
         if (code > 0x10ffff) {
             // not a valid UCS-4 character
             longjmp(mJmpbuf, ERROR_INVALID_ENTITY);
         }
         if (code <= 0x7f) {
             // one byte
             str += (char)code;
         } else if (code <= 0x7ff) {
             // two bytes
             str += (char)((code >> 6) | 0xc0);
             str += (char)((code & 0x3f) | 0x80);
         } else if (code <= 0xffff) {
             // three bytes
             str += (char)((code >> 12) | 0xe0);
             str += (char)(((code >> 6) & 0x3f) | 0x80);
             str += (char)((code & 0x3f) | 0x80);
         } else {
             // four bytes: 010000 - 10FFFF
             str += (char)((code >> 18) | 0xf0);
             str += (char)(((code >> 12) & 0x3f) | 0x80);
             str += (char)(((code >> 6) & 0x3f) | 0x80);
             str += (char)((code & 0x3f) | 0x80);
         }
     } else {
         // callers only get here after peeking one of the three tokens
         printf ("Unknown token 0x%02x\n", token);
         longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET);
     }
 }

 /*
  * Reads a string-table offset from the input and returns a pointer to the
  * string stored there.  An offset at or beyond the end of the table is
  * reported with ERROR_INVALID_STRING_TABLE_REFERENCE.
  */
 const char * WbxmlParser::resolveStrTableRef(void)
 {
     const uint32_t ref = readMbuint32();
     const bool inRange = ref < mStringTable.size();
     if (!inRange) {
         longjmp(mJmpbuf, ERROR_INVALID_STRING_TABLE_REFERENCE);
     }
     return mStringTable.c_str() + ref;
 }

 /*
  * Selects the tag and attribute code-page tables for a document type.
  * Returns true if publicId is a supported id, false otherwise (in which
  * case the current tables are left untouched).
  */
 bool WbxmlParser::selectTokenMapping(int publicId)
 {
     switch (publicId) {
         case PUBLICID_IMPS_1_1:
         case PUBLICID_IMPS_1_2:
         case PUBLICID_IMPS_1_3:
             // All IMPS versions share the CSP 1.3 tables.
             mNumTagPages = ARRAY_SIZE(csp13TagPages);
             mTagPages = csp13TagPages;
             mNumAttrPages = ARRAY_SIZE(csp13AttrPages);
             mAttrPages = csp13AttrPages;
             return true;

 #ifdef SUPPORT_SYNCML
         case PUBLICID_SYNCML_1_0:
         case PUBLICID_SYNCML_1_1:
         case PUBLICID_SYNCML_1_2:
         case PUBLICID_SYNCML_METINF_1_2:
             // No attribute tables for SyncML.
             mNumTagPages = ARRAY_SIZE(syncmlTagPages);
             mTagPages = syncmlTagPages;
             mNumAttrPages = 0;
             mAttrPages = NULL;
             return true;

         case PUBLICID_SYNCML_DEVINF_1_2:
             mNumTagPages = ARRAY_SIZE(syncmlDevInfTagPages);
             mTagPages = syncmlDevInfTagPages;
             mNumAttrPages = 0;
             mAttrPages = NULL;
             return true;
 #endif
         default:
             break;
     }
     return false;
 }

 const char * WbxmlParser::lookupTagName(int tag) const
 {
     tag = tag & 0x3f;

     // TODO: optimize this
     if (mCurrTagPage >= mNumTagPages) {
         return NULL;
     }
     const TagCodePage * page = &mTagPages[mCurrTagPage];
     if (page == NULL) {
         return NULL;
     }

     TokenData t = {tag, NULL};
     const TokenData * res = (TokenData *)bsearch(&t, page->tags, page->numTokens,
             sizeof(TokenData), compareTokenData);
     if (res) {
         return res->tagName;
     }

     return NULL;
 }

 const char * WbxmlParser::lookupAttrName(int token, const char **prefix) const
 {
     // TODO: optimize this
     if (mCurrAttrPage >= mNumAttrPages) {
         return NULL;
     }
     const AttrCodePage * page = &mAttrPages[mCurrAttrPage];
     if (page == NULL) {
         return NULL;
     }

     AttrData t = {token, NULL, NULL};
     const AttrData * res = (AttrData *)bsearch(&t, page->attrs, page->numTokens,
             sizeof(AttrData), compareAttrData);
     if (res) {
         if (prefix) {
             *prefix = res->attrValuePrefix;
         }
         return res->attrName;
     }

     return NULL;
 }

 /*
  * Reads one attribute (start token plus its value tokens) into *attrib.
  * The name comes from a string-table reference (LITERAL) or from the
  * current attribute code page, optionally preceded by a SWITCH_PAGE.  The
  * value starts with the prefix stored with the start token, if any, and
  * is extended by every following string/entity token.  Reading stops,
  * without consuming it, at the next attribute start or END token.
  * Unknown names raise ERROR_UNRECOGNIZED_ATTR; other value tokens raise
  * ERROR_NOT_SUPPORTED_YET.
  */
 void WbxmlParser::readAttribute(Attribute * attrib)
 {
     // attribute start: attrib start token, LITERAL or END
     const char * prefix = NULL;
     const char * attrName;
     int startToken = readByte();

     if (startToken != TOKEN_LITERAL) {
         if (startToken == TOKEN_SWITCH_PAGE) {
             mCurrAttrPage = readByte();
             startToken = readByte();
         }
         attrName = lookupAttrName(startToken, &prefix);
     } else {
         attrName = resolveStrTableRef();
     }
     if (attrName == NULL) {
         longjmp(mJmpbuf, ERROR_UNRECOGNIZED_ATTR);
     }
     attrib->name = attrName;
     attrib->value = (prefix != NULL) ? prefix : "";

     // now attribute value: zero or more value, string, entity or extension
     // tokens.  An attribute start token, a LITERAL token or the END token
     // indicates the end of the value.
     for (int next = peekByte();
             !(isAttrStart(next) || next == TOKEN_END);
             next = peekByte()) {
         switch (next) {
             case TOKEN_STR_I:
             case TOKEN_STR_T:
             case TOKEN_ENTITY:
                 readString(attrib->value);
                 break;

             case TOKEN_EXT_0:
             case TOKEN_EXT_1:
             case TOKEN_EXT_2:
             case TOKEN_EXT_I_0:
             case TOKEN_EXT_I_1:
             case TOKEN_EXT_I_2:
                 //TODO: document type specific
                 printf ("Unsupported Token 0x%x\n", next);
                 longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET);
                 break;

             default:
                 //TODO
                 printf ("Unknown Token 0x%x\n", next);
                 longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET);
                 break;
         }
     }
 }
	/*
	* Copyright (C) 2008 Esmertec AG.
	* Copyright (C) 2008 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#include <stdio.h>
	#include <stdlib.h>
	#include <setjmp.h>
	#include <assert.h>
	#include "wbxml_parser.h"
	#include "csp13_data.h"
	#ifdef SUPPORT_SYNCML
	#include "syncml_data.h"
	#endif

	// NOTE(review): bsearch() is declared by hand for Android builds, presumably
	// because the platform <stdlib.h> did not declare it - confirm before removing.
	#ifdef PLATFORM_ANDROID
	extern "C" void *bsearch(const void *key, const void *base0, size_t nmemb,
	        size_t size, int (*compar)(const void *, const void *));
	#endif

	/* Element count of a true array (not a pointer).  The argument is fully
	 * parenthesized so that expression arguments expand as intended. */
	#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

	//#define WBXML_DEBUG 1

	/* Major TODO items:
	- Attribute value tokens (not used by IMPS CSP)
	- EXT_* except EXT_T_0 (not used by IMPS CSP)
	- PI (not used by IMPS CSP)
	- cleanups

	Other TODO:
	- Support more public ID? Only IMPS is supported now.
	- Support other charsets than UTF-8
	*/

	static int compareTokenData(const void * t1, const void * t2)
	{
	return ((TokenData )t1)->token - ((TokenData )t2)->token;
	}

	static int compareAttrData(const void * t1, const void * t2)
	{
	return ((AttrData )t1)->token - ((AttrData )t2)->token;
	}

	/*
	 * Tells whether a content byte begins a new element.  SWITCH_PAGE counts as
	 * a tag start; any other byte qualifies when its low six bits fall in the
	 * range [TOKEN_LITERAL, TOKEN_EXT_I_0).
	 */
	static bool isTagStart(int token)
	{
	    if (token != TOKEN_SWITCH_PAGE) {
	        const int identity = token & 0x3f;
	        return !(identity < TOKEN_LITERAL || identity >= TOKEN_EXT_I_0);
	    }
	    return true;
	}

	/*
	 * Tells whether a byte begins a new attribute: either a value in
	 * [TOKEN_LITERAL, TOKEN_EXT_I_0) or one strictly between TOKEN_LITERAL_C
	 * and 0x80.
	 */
	static bool isAttrStart(int token)
	{
	    return (token >= TOKEN_LITERAL && token < TOKEN_EXT_I_0) ||
	        (token > TOKEN_LITERAL_C && token < 0x80);
	}

	/*
	 * Creates a parser.  transportEncoding is the charset used when the
	 * document header does not name one (see the header handling in parse()).
	 */
	WbxmlParser::WbxmlParser(uint32_t transportEncoding)
	    : mTransportEncoding(transportEncoding)
	{
	    // Start from the state that reset() establishes.
	    reset();
	}

	/* Nothing to release explicitly here. */
	WbxmlParser::~WbxmlParser() {}

	/*
	 * Returns the parser to its initial state: the content handler, buffered
	 * input, open-element stack and string table are dropped, code page 0 is
	 * selected for tags and attributes, the public id and last error are
	 * cleared, and a document header is expected next.
	 */
	void WbxmlParser::reset(void)
	{
	    // Parsing progress.
	    mState = EXPECT_HEADER;
	    mLastError = ERROR_NO_ERROR;
	    mPublicId = 0;
	    mCurrTagPage = 0;
	    mCurrAttrPage = 0;

	    // Per-document tables.
	    mStringTable.clear();
	    mStartElemStack.clear();

	    // Input buffering.
	    mIsDataEnd = false;
	    mDataOffset = 0;
	    mLastChunk.clear();
	    mExternalChunkLen = 0;
	    mExternalChunk = NULL;

	    // The handler is forgotten too; callers must install it again.
	    mContentHandler = NULL;
	}

	/*
	 * Installs the object that receives parse events.  NULL is accepted: every
	 * callback site in parse() checks the handler before using it.
	 */
	void WbxmlParser::setContentHandler(WbxmlContentHandler * handler)
	{
	    this->mContentHandler = handler;
	}

	/**
	 * Feeds one chunk of WBXML input to the parser and advances the state
	 * machine as far as the buffered bytes allow.
	 *
	 * The low-level readers report failures by longjmp()ing back to the setjmp()
	 * below with a ParserError code.  Running out of input while `end` is false
	 * is not an error: the read position is rewound to the start of the current
	 * state step, the unconsumed bytes are saved, and the next call retries that
	 * step with more data.
	 *
	 * @param data     chunk to parse; NULL is rejected with ERROR_INVALID_DATA
	 * @param dataLen  number of bytes in data
	 * @param end      true if no more data follows this chunk
	 * @return WBXML_STATUS_OK once the outermost element has been closed, or
	 *         when more input is needed and `end` is false; WBXML_STATUS_ERROR
	 *         otherwise, with the cause stored in mLastError.
	 */
	int WbxmlParser::parse(const char * data, uint32_t dataLen, bool end)
	{
	    if (data == NULL) {
	        mLastError = ERROR_INVALID_DATA;
	        return WBXML_STATUS_ERROR;
	    }

	    // All temporary C++ variables must be declared before setjmp to make
	    // sure they get properly destructed after longjmp.
	    vector<Attribute> attribs;
	    Attribute attrib;
	    string tagName;
	    string characters;
	    string opaque;

	#ifdef WBXML_DEBUG
	    printf("\nparse dataLen %d; end %d; readPos %d; availData %d\n",
	        dataLen, end, getReadPos(), availDataSize());
	#endif
	    appendData(data, dataLen, end);
	    // volatile: written after setjmp() and read again after a longjmp().
	    volatile int readPos = getReadPos();
	    int setjmpRet;
	    switch (setjmpRet = setjmp(mJmpbuf)) {
	        case 0:
	            // Direct return from setjmp: fall into the state machine.
	            break;

	        case ERROR_NEED_MORE_DATA:
	            if (!mIsDataEnd) {
	#ifdef WBXML_DEBUG
	                printf("\nneed more data: readPos %d\n", readPos);
	#endif
	                // Rewind to the start of the interrupted step and keep the
	                // unconsumed bytes for the next call.
	                setReadPos(readPos);
	                saveRemainingData();
	                return WBXML_STATUS_OK;
	            } else {
	#ifdef WBXML_DEBUG
	                printf("wbxml parser error: unexpected data end\n");
	#endif
	                mLastError = ERROR_NEED_MORE_DATA;
	                return WBXML_STATUS_ERROR;
	            }
	            break;

	        case ERROR_UNSUPPORTED_PUBID:
	        case ERROR_UNSUPPORTED_CHARSET:
	        case ERROR_INVALID_STRING_TABLE:
	        case ERROR_INVALID_STRING_TABLE_REFERENCE:
	        case ERROR_INVALID_EXT_TOKEN:
	        case ERROR_INVALID_MBUINT:
	        case ERROR_INVALID_ENTITY:
	        case ERROR_UNRECOGNIZED_TAG:
	        case ERROR_UNRECOGNIZED_ATTR:
	        case ERROR_MISSING_ATTR:
	        case ERROR_MISSING_TOKEN_END:
	#ifdef WBXML_DEBUG
	            printf("wbxml parser error %d\n", setjmpRet);
	#endif
	            mLastError = ParserError(setjmpRet);
	            return WBXML_STATUS_ERROR;
	            break;

	        case ERROR_NOT_SUPPORTED_YET:
	            printf("wbxml parser error: Not implemented feature.\n");
	            mLastError = ParserError(setjmpRet);
	            return WBXML_STATUS_ERROR;
	            break;

	        default:
	            printf("wbxml parser error: Impossible execution path.\n");
	            mLastError = ParserError(setjmpRet);
	            return WBXML_STATUS_ERROR;
	            break;
	    }

	    for (;;) {
	        // save readPos for error recovery
	        readPos = getReadPos();

	        switch (mState) {
	            case EXPECT_HEADER:
	                mDocVersion = readByte();

	                mPublicId = readMbuint32();
	                if (mPublicId != 0) {
	                    if (!selectTokenMapping(mPublicId)) {
	#ifdef WBXML_DEBUG
	                        printf("wbxml parser error: unsupported public id \n");
	#endif
	                        longjmp(mJmpbuf, ERROR_UNSUPPORTED_PUBID);
	                    }
	                } else {
	                    // A zero public id is followed by the string-table
	                    // offset of the id literal.  It is stored negated until
	                    // the string table has been read.
	                    uint32_t pubIdOffset = readMbuint32();
	                    if (pubIdOffset > 0x7fffffffu) {
	                        // The negated value would not stay non-positive, and
	                        // the document would then be parsed with no token
	                        // mapping selected.
	                        longjmp(mJmpbuf, ERROR_UNSUPPORTED_PUBID);
	                    }
	                    mPublicId = -static_cast<int>(pubIdOffset);
	                }
	                mCharset = readMbuint32();
	                if (!mCharset) {
	                    // No charset in the document: use the transport's, and
	                    // UTF-8 if that is unknown as well.
	                    mCharset = mTransportEncoding;
	                    if (!mCharset) {
	                        mCharset = CHARSET_UTF8;
	                    }
	                }
	                // TODO: support more charsets other than UTF-8
	                if (mCharset != CHARSET_UTF8) {
	#ifdef WBXML_DEBUG
	                    printf("wbxml parser error: unsupported charset\n");
	#endif
	                    longjmp(mJmpbuf, ERROR_UNSUPPORTED_CHARSET);
	                }

	                // now advance to next state
	                if (mContentHandler) {
	                    mContentHandler->handlePublicId(mPublicId);
	                }
	                mState = EXPECT_STRING_TABLE;
	                break;

	            case EXPECT_STRING_TABLE:
	            {
	                uint32_t len = readMbuint32();
	                if (availDataSize() < len) {
	                    longjmp(mJmpbuf, ERROR_NEED_MORE_DATA);
	                }
	                mStringTable.clear();
	                // TODO: optimize this
	                while (len--) {
	                    mStringTable += readByte();
	                }
	                if (mStringTable.size()) {
	                    if (mStringTable[mStringTable.size() - 1] != 0) {
	                        // must have an ending \0
	                        //TODO:the byte array returned by SCTS does not contain '\0' at the
	                        //end,should this be fixed accordingly?
	#ifdef WBXML_DEBUG
	                        printf("wbxml parser error: invalid string table\n");
	#endif
	                        longjmp(mJmpbuf, ERROR_INVALID_STRING_TABLE);
	                    }
	                }
	                mState = EXPECT_BODY_START;
	                if (mPublicId <= 0) {
	                    // The public id is a string-table literal.  Validate the
	                    // offset first: it comes straight from the input and
	                    // must not point past the table's terminating NUL.
	                    const uint32_t pubIdOffset =
	                            0u - static_cast<uint32_t>(mPublicId);
	                    if (pubIdOffset > mStringTable.size()) {
	                        longjmp(mJmpbuf, ERROR_UNSUPPORTED_PUBID);
	                    }
	                    const char * s = mStringTable.c_str() + pubIdOffset;
	#ifdef SUPPORT_SYNCML
	                    if (strcmp(s, "-//SYNCML//DTD SyncML 1.2//EN") == 0) {
	                        mPublicId = PUBLICID_SYNCML_1_2;
	                    } else if (strcmp(s, "-//SYNCML//DTD SyncML 1.1//EN") == 0) {
	                        mPublicId = PUBLICID_SYNCML_1_1;
	                    } else if (strcmp(s, "-//SYNCML//DTD SyncML 1.0//EN") == 0) {
	                        mPublicId = PUBLICID_SYNCML_1_0;
	                    }
	#endif
	                    if ((mPublicId <= 0) || !selectTokenMapping(mPublicId)) {
	                        longjmp(mJmpbuf, ERROR_UNSUPPORTED_PUBID);
	                    }
	                }
	                break;
	            }

	            case EXPECT_BODY_START:
	                //TODO: handle possible PIs
	                mState = EXPECT_ELEMENT_START;
	                break;

	            case EXPECT_ELEMENT_START:
	            {
	                int stag = readByte();
	                const char * name;
	                if ((stag & 0x3f) == TOKEN_LITERAL) {
	                    // Tag name is given as a string-table reference.
	                    name = resolveStrTableRef();
	                } else {
	                    if (stag == TOKEN_SWITCH_PAGE) {
	                        mCurrTagPage = readByte();
	                        stag = readByte();
	                    }
	                    name = lookupTagName(stag);
	                }
	                if (name == NULL) {
	#ifdef WBXML_DEBUG
	                    printf("wbxml parser error: unrecognized tag\n");
	#endif
	                    longjmp(mJmpbuf, ERROR_UNRECOGNIZED_TAG);
	                }
	                attribs.clear();
	                if (stag & 0x80) {
	                    // followed by 1 or more attributes
	                    while (peekByte() != TOKEN_END) {
	                        readAttribute(&attrib);
	                        attribs.push_back(attrib);
	                    }
	                    if (!attribs.size()) {
	#ifdef WBXML_DEBUG
	                        printf("wbxml parser error: missing attributes\n");
	#endif
	                        longjmp(mJmpbuf, ERROR_MISSING_ATTR);
	                    }
	                    // TOKEN_END
	                    readByte();
	                }
	                if (mContentHandler) {
	                    mContentHandler->startElement(name, attribs);
	                }
	                // Bit 0x40 set: the element has content; otherwise it is
	                // closed immediately.
	                if (stag & 0x40) {
	                    mState = EXPECT_CONTENT;
	                } else {
	                    mState = ELEMENT_END;
	                }
	                tagName = name;
	                mStartElemStack.push_back(name);
	                break;
	            }

	            case EXPECT_CONTENT:
	            {
	                // NOTE(review): `characters` is local to this call, so text
	                // accumulated here is not carried over if parse() returns
	                // for more data before it is flushed - confirm whether
	                // callers can hit this.
	                int byte = peekByte();
	                if (byte == TOKEN_SWITCH_PAGE) {
	                    readByte();
	                    mCurrTagPage = readByte();
	                    byte = peekByte();
	                }
	                if (isTagStart(byte) || byte == TOKEN_END) {
	                    // Text run is over: flush what has been collected.
	                    if (characters.size() && mContentHandler) {
	                        mContentHandler->characters(characters.c_str(), characters.size());
	                        characters.clear();
	                    }
	                    if (byte == TOKEN_END) {
	                        mState = EXPECT_ELEMENT_END;
	                    } else {
	                        mState = EXPECT_ELEMENT_START;
	                    }
	                } else {
	                    // TODO: handle extension and pi
	                    switch (byte) {
	                        case TOKEN_ENTITY:
	                        case TOKEN_STR_I:
	                        case TOKEN_STR_T:
	                            readString(characters);
	                            break;

	                        case TOKEN_EXT_T_0:
	                        {
	                            readByte();
	                            uint32_t valueToken = readMbuint32();
	                            if (mPublicId == PUBLICID_IMPS_1_1
	                                    || mPublicId == PUBLICID_IMPS_1_2
	                                    || mPublicId == PUBLICID_IMPS_1_3) {
	                                // IMPS: map the extension token to its text.
	                                TokenData t = {valueToken, NULL};
	                                const TokenData * res = (TokenData *)bsearch(&t,
	                                        csp13ExtValueTokens, ARRAY_SIZE(csp13ExtValueTokens),
	                                        sizeof(csp13ExtValueTokens[0]), compareTokenData);
	                                if (res) {
	                                    characters.append(res->tagName);
	                                } else {
	                                    longjmp(mJmpbuf, ERROR_INVALID_EXT_TOKEN);
	                                }
	                            } else {
	                                printf ("Token 0x%x\n", byte);
	                                longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET);
	                            }
	                            break;
	                        }

	                        case TOKEN_OPAQUE:
	                        {
	                            readByte();
	                            uint32_t opaqueDataLen = readMbuint32();
	                            opaque.clear();
	                            while (opaqueDataLen--) {
	                                opaque += (char)readByte();
	                            }
	                            if (mContentHandler) {
	                                mContentHandler->opaque(opaque.c_str(), opaque.size());
	                            }
	                            break;
	                        }

	                        default:
	                            printf ("Token 0x%x\n", byte);
	                            longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET);
	                            break;
	                    }
	                }
	                break;
	            }

	            case EXPECT_ELEMENT_END:
	                if (readByte() != TOKEN_END) {
	#ifdef WBXML_DEBUG
	                    printf("wbxml parser error: TOKEN_END expected\n");
	#endif
	                    longjmp(mJmpbuf, ERROR_MISSING_TOKEN_END);
	                }
	                mState = ELEMENT_END;
	                break;

	            case ELEMENT_END:
	                assert(!mStartElemStack.empty());

	                // Close the innermost open element.
	                tagName = mStartElemStack.back();
	                mStartElemStack.pop_back();
	                if (mContentHandler) {
	                    mContentHandler->endElement(tagName.c_str());
	                }
	                if (mStartElemStack.empty()) {
	                    mState = EXPECT_BODY_END;
	                } else {
	                    mState = EXPECT_CONTENT;
	                }
	                break;

	            case EXPECT_BODY_END:
	                // TODO: handle possible PIs

	                // we're done
	                return WBXML_STATUS_OK;
	                break;
	        }
	    }
	}

	/*
	 * Hands the parser the next chunk of input without copying it: only the
	 * caller's pointer and length are recorded here, together with the flag
	 * passed in as 'end'.
	 * Per the original note, parse() works directly on this external chunk
	 * and saveRemainingData() stores what is left before parse() returns.
	 */
	void WbxmlParser::appendData(const char * data, uint32_t len, bool end)
	{
	    mIsDataEnd = end;
	    mExternalChunkLen = len;
	    mExternalChunk = data;
	}

	/*
	 * Copies the not-yet-consumed input into mLastChunk and forgets the
	 * external chunk.
	 *
	 * mDataOffset indexes the concatenation of mLastChunk followed by the
	 * external chunk. When the offset is still within (or exactly at the end
	 * of) mLastChunk, the whole external chunk is appended and the offset is
	 * left alone. When it already points into the external chunk, only the
	 * part from that position on is kept and the offset restarts at 0.
	 */
	void WbxmlParser::saveRemainingData()
	{
	    const bool cursorInExternalChunk = mDataOffset > mLastChunk.size();

	    if (!cursorInExternalChunk) {
	        // Nothing of the external chunk has been consumed yet.
	        mLastChunk.append(mExternalChunk, mExternalChunkLen);
	    } else {
	        // Position of the cursor relative to the start of the external chunk.
	        uint32_t offsetToExtChunk = mDataOffset - mLastChunk.size();
	        assert(offsetToExtChunk <= mExternalChunkLen);
	        const char * unread = mExternalChunk + offsetToExtChunk;
	        mLastChunk.assign(unread, mExternalChunkLen - offsetToExtChunk);
	        mDataOffset = 0;
	    }

	    // The external buffer must not be touched after this point.
	    mExternalChunkLen = 0;
	    mExternalChunk = NULL;
	}

	/*
	 * Consumes and returns the next input byte (0..255).
	 *
	 * Bytes are taken from mLastChunk first and then from the external chunk;
	 * mDataOffset is the cursor over both, in that order. If neither holds a
	 * byte at the cursor, control leaves via longjmp(ERROR_NEED_MORE_DATA)
	 * and the cursor is not advanced.
	 */
	int WbxmlParser::readByte()
	{
	    unsigned char value;

	    if (mDataOffset >= mLastChunk.size()) {
	        // Cursor is past the saved data: index into the external chunk.
	        const uint32_t extIndex = mDataOffset - mLastChunk.size();
	        if (!(extIndex < mExternalChunkLen)) {
	            longjmp(mJmpbuf, ERROR_NEED_MORE_DATA);
	        }
	        value = (unsigned char)mExternalChunk[extIndex];
	    } else {
	        value = (unsigned char)mLastChunk[mDataOffset];
	    }

	    mDataOffset++;
	#ifdef WBXML_DEBUG
	    printf ("rb 0x%x; ", value);
	#endif
	    return value;
	}

	/*
	 * Returns the next input byte (0..255) without consuming it.
	 *
	 * Looks in mLastChunk first, then in the external chunk, using the same
	 * cursor as readByte(). Leaves via longjmp(ERROR_NEED_MORE_DATA) when no
	 * byte is available at the cursor.
	 */
	int WbxmlParser::peekByte()
	{
	    if (mDataOffset < mLastChunk.size()) {
	        return (unsigned char)mLastChunk[mDataOffset];
	    }

	    // Cursor is past the saved data: index into the external chunk.
	    const uint32_t extIndex = mDataOffset - mLastChunk.size();
	    if (extIndex >= mExternalChunkLen) {
	        longjmp(mJmpbuf, ERROR_NEED_MORE_DATA);
	    }
	    return (unsigned char)mExternalChunk[extIndex];
	}

	/*
	 * Reads a multi-byte unsigned integer from the input.
	 *
	 * Each byte contributes its low 7 bits, most significant group first;
	 * a set 0x80 bit means another byte follows.
	 *
	 * Leaves via longjmp(ERROR_INVALID_MBUINT) when the encoding does not fit
	 * in 32 bits. readByte() may itself longjmp with ERROR_NEED_MORE_DATA
	 * when the input runs out in the middle of the number.
	 */
	uint32_t WbxmlParser::readMbuint32()
	{
	    uint32_t value = 0;
	    uint32_t byte;
	    do {
	        // The next step shifts left by 7, so any bit at position 25 or
	        // above would be pushed out of the 32-bit result.
	        if ((value >> 25) != 0) {
	            // would go overflow. not a valid uint32.
	            longjmp(mJmpbuf, ERROR_INVALID_MBUINT);
	        }
	        byte = readByte();
	        value = (value << 7) | (byte & 0x7f);
	    } while (byte & 0x80);
	    return value;
	}

	/**
	 * Read STR_I | STR_T | ENTITY and append to str.
	 * Yes this looks ugly...
	 *
	 * The leading token byte is consumed here and selects the form:
	 *  - TOKEN_STR_I:  inline bytes up to (not including) a 0 terminator.
	 *  - TOKEN_ENTITY: a multi-byte integer code point, appended as UTF-8.
	 *  - TOKEN_STR_T:  a string table reference, appended up to its
	 *                  terminating 0.
	 *
	 * Leaves via longjmp with ERROR_INVALID_ENTITY for a code point above
	 * 0x10ffff and with ERROR_NOT_SUPPORTED_YET for any other leading token.
	 * The helpers it calls may longjmp with their own error codes.
	 */
	void WbxmlParser::readString(string & str)
	{
	    int byte = readByte();
	    switch (byte) {
	        case TOKEN_STR_I:
	            //TODO: assuming UTF-8
	            while ((byte = readByte()) != 0) {
	                str += (char)byte;
	            }
	            break;

	        case TOKEN_ENTITY:
	        {
	            uint32_t ch = readMbuint32();
	            //TODO: assuming UTF-8 for now.
	            if (ch <= 0x7f) {
	                // 1 byte: 0xxxxxxx
	                str += (char)ch;
	            } else if (ch <= 0x7ff) {
	                // 2 bytes: 110xxxxx 10xxxxxx
	                str += (char)((ch >> 6) | 0xc0);
	                str += (char)((ch & 0x3f) | 0x80);
	            } else if (ch <= 0xffff) {
	                // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
	                str += (char)((ch >> 12) | 0xe0);
	                str += (char)(((ch >> 6) & 0x3f) | 0x80);
	                str += (char)((ch & 0x3f) | 0x80);
	            } else if (ch <= 0x10ffff) {
	                // 010000 - 10FFFF
	                // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	                str += (char)((ch >> 18) | 0xf0);
	                str += (char)(((ch >> 12) & 0x3f) | 0x80);
	                str += (char)(((ch >> 6) & 0x3f) | 0x80);
	                str += (char)((ch & 0x3f) | 0x80);
	            } else {
	                // not a valid UCS-4 character
	                longjmp(mJmpbuf, ERROR_INVALID_ENTITY);
	            }
	            break;
	        }

	        case TOKEN_STR_T:
	        {
	            // Appends up to the first 0 byte at or after the referenced
	            // offset in the string table.
	            const char * s = resolveStrTableRef();
	            str.append(s);
	            break;
	        }

	        default:
	            // impossible
	            printf ("Unknown token 0x%02x\n", byte);
	            longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET);
	            break;
	    }
	}

	/*
	 * Reads a multi-byte integer offset from the input and returns a pointer
	 * to that position inside mStringTable's character buffer.
	 *
	 * An offset at or beyond the table size leaves via
	 * longjmp(ERROR_INVALID_STRING_TABLE_REFERENCE).
	 */
	const char * WbxmlParser::resolveStrTableRef(void)
	{
	    const uint32_t tableOffset = readMbuint32();
	    if (tableOffset < mStringTable.size()) {
	        return mStringTable.c_str() + tableOffset;
	    }
	    longjmp(mJmpbuf, ERROR_INVALID_STRING_TABLE_REFERENCE);
	}

	/*
	 * Installs the tag and attribute code page tables that belong to the
	 * given document public id.
	 *
	 * Returns true when the id is one of the handled ones and the tables
	 * were set, false otherwise (the tables are then left untouched).
	 * The SyncML ids are only handled when SUPPORT_SYNCML is defined; they
	 * have no attribute pages.
	 */
	bool WbxmlParser::selectTokenMapping(int publicId)
	{
	    switch (publicId) {
	        case PUBLICID_IMPS_1_1:
	        case PUBLICID_IMPS_1_2:
	        case PUBLICID_IMPS_1_3:
	            // All handled IMPS versions share the CSP 1.3 tables.
	            mAttrPages = csp13AttrPages;
	            mNumAttrPages = ARRAY_SIZE(csp13AttrPages);
	            mTagPages = csp13TagPages;
	            mNumTagPages = ARRAY_SIZE(csp13TagPages);
	            return true;

	#ifdef SUPPORT_SYNCML
	        case PUBLICID_SYNCML_1_0:
	        case PUBLICID_SYNCML_1_1:
	        case PUBLICID_SYNCML_1_2:
	        case PUBLICID_SYNCML_METINF_1_2:
	            mAttrPages = NULL;
	            mNumAttrPages = 0;
	            mTagPages = syncmlTagPages;
	            mNumTagPages = ARRAY_SIZE(syncmlTagPages);
	            return true;

	        case PUBLICID_SYNCML_DEVINF_1_2:
	            mAttrPages = NULL;
	            mNumAttrPages = 0;
	            mTagPages = syncmlDevInfTagPages;
	            mNumTagPages = ARRAY_SIZE(syncmlDevInfTagPages);
	            return true;
	#endif

	        default:
	            break;
	    }
	    return false;
	}

	const char * WbxmlParser::lookupTagName(int tag) const
	{
	tag = tag & 0x3f;

	// TODO: optimize this
	if (mCurrTagPage >= mNumTagPages) {
	return NULL;
	}
	const TagCodePage * page = &mTagPages[mCurrTagPage];
	if (page == NULL) {
	return NULL;
	}

	TokenData t = {tag, NULL};
	const TokenData * res = (TokenData *)bsearch(&t, page->tags, page->numTokens,
	sizeof(TokenData), compareTokenData);
	if (res) {
	return res->tagName;
	}

	return NULL;
	}

	const char * WbxmlParser::lookupAttrName(int token, const char **prefix) const
	{
	// TODO: optimize this
	if (mCurrAttrPage >= mNumAttrPages) {
	return NULL;
	}
	const AttrCodePage * page = &mAttrPages[mCurrAttrPage];
	if (page == NULL) {
	return NULL;
	}

	AttrData t = {token, NULL, NULL};
	const AttrData * res = (AttrData *)bsearch(&t, page->attrs, page->numTokens,
	sizeof(AttrData), compareAttrData);
	if (res) {
	if (prefix) {
	*prefix = res->attrValuePrefix;
	}
	return res->attrName;
	}

	return NULL;
	}

	/*
	 * Reads one attribute (name plus value) from the input into *attrib.
	 *
	 * Name: a TOKEN_LITERAL start is followed by a string table reference;
	 * any other start token is looked up in the current attribute code page,
	 * after an optional TOKEN_SWITCH_PAGE + page index. A token that maps to
	 * no name leaves via longjmp(ERROR_UNRECOGNIZED_ATTR).
	 *
	 * Value: starts as the value prefix supplied by the code page entry (or
	 * empty), then every following string, entity or string table token is
	 * appended until the next attribute start or TOKEN_END is seen. That
	 * terminating token is only peeked, not consumed. Extension tokens and
	 * any other value token leave via longjmp(ERROR_NOT_SUPPORTED_YET).
	 */
	void WbxmlParser::readAttribute(Attribute * attrib)
	{
	    // attribute start: attrib start token, LITERAL or END
	    int attrStart = readByte();
	    const char * name;
	    const char * valuePrefix = NULL;

	    if (attrStart == TOKEN_LITERAL) {
	        name = resolveStrTableRef();
	    } else {
	        if (attrStart == TOKEN_SWITCH_PAGE) {
	            // The page switch is followed by the page index and then the
	            // real attribute start token.
	            mCurrAttrPage = readByte();
	            attrStart = readByte();
	        }
	        name = lookupAttrName(attrStart, &valuePrefix);
	    }
	    if (name == NULL) {
	        longjmp(mJmpbuf, ERROR_UNRECOGNIZED_ATTR);
	    }
	    attrib->name = name;
	    attrib->value = "";
	    if (valuePrefix != NULL) {
	        attrib->value = valuePrefix;
	    }

	    // now attribute value: zero or more value, string, entity or extension tokens
	    for (;;) {
	        int valueToken = peekByte();
	        if (isAttrStart(valueToken) || valueToken == TOKEN_END) {
	            // An attribute start token, a LITERAL token or the END token
	            // indicates the end of an attribute value.
	            return;
	        }
	        switch (valueToken) {
	            case TOKEN_ENTITY:
	            case TOKEN_STR_I:
	            case TOKEN_STR_T:
	                // readString() consumes the token byte itself.
	                readString(attrib->value);
	                break;

	            case TOKEN_EXT_I_0:
	            case TOKEN_EXT_I_1:
	            case TOKEN_EXT_I_2:
	            case TOKEN_EXT_0:
	            case TOKEN_EXT_1:
	            case TOKEN_EXT_2:
	                //TODO: document type specific
	                printf ("Unsupported Token 0x%x\n", valueToken);
	                longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET);
	                break;

	            default:
	                //TODO
	                printf ("Unknown Token 0x%x\n", valueToken);
	                longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET);
	                break;
	        }
	    }
	}