blob: 091611e73d94af0d9a8646f214022f8a33fda45b [file] [log] [blame]
/*
* Copyright (C) 2008 Esmertec AG.
* Copyright (C) 2008 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <stdio.h>
#include <stdlib.h>
#include <setjmp.h>
#include <assert.h>
#include "wbxml_parser.h"
#include "csp13_data.h"
#ifdef SUPPORT_SYNCML
#include "syncml_data.h"
#endif
#ifdef PLATFORM_ANDROID
extern "C" void *bsearch(const void *key, const void *base0, size_t nmemb,
size_t size, int (*compar)(const void *, const void *));
#endif
#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0]))
//#define WBXML_DEBUG 1
/* Major TODO items:
- Attribute value tokens (not used by IMPS CSP)
- EXT_* except EXT_T_0 (not used by IMPS CSP)
- PI (not used by IMPS CSP)
- cleanups
Other TODO:
- Support more public IDs? Only IMPS (plus SyncML when SUPPORT_SYNCML is defined) is supported now.
- Support charsets other than UTF-8
*/
static int compareTokenData(const void * t1, const void * t2)
{
return ((TokenData *)t1)->token - ((TokenData *)t2)->token;
}
static int compareAttrData(const void * t1, const void * t2)
{
return ((AttrData *)t1)->token - ((AttrData *)t2)->token;
}
/**
 * Tells whether a token seen in element content begins a new element.
 *
 * A code-page switch always counts as a tag start. Otherwise only the low six
 * bits are examined (parse() treats 0x80 and 0x40 of a tag token as the
 * "has attributes" and "has content" flags).
 *
 * @param token the token byte to classify
 * @return true if the token starts a tag
 */
static bool isTagStart(int token)
{
    if (token != TOKEN_SWITCH_PAGE) {
        const int tagId = token & 0x3f;
        return TOKEN_LITERAL <= tagId && tagId < TOKEN_EXT_I_0;
    }
    return true;
}
/**
 * Tells whether a token seen inside an attribute list begins a new attribute
 * (an attribute start token or LITERAL) rather than continuing a value.
 *
 * @param token the token byte to classify
 * @return true if the token lies in [TOKEN_LITERAL, TOKEN_EXT_I_0) or in
 *         (TOKEN_LITERAL_C, 0x80)
 */
static bool isAttrStart(int token)
{
    const bool inLowerRange = TOKEN_LITERAL <= token && token < TOKEN_EXT_I_0;
    const bool inUpperRange = TOKEN_LITERAL_C < token && token < 0x80;
    return inLowerRange || inUpperRange;
}
/**
 * Creates a parser in its initial state.
 *
 * @param transportEncoding charset that parse() falls back to when the
 *                          document header carries a charset of 0
 */
WbxmlParser::WbxmlParser(uint32_t transportEncoding)
    : mTransportEncoding(transportEncoding)
{
    // Every other piece of parsing state is set up by reset().
    reset();
}
/** Destructor; the body performs no explicit cleanup. */
WbxmlParser::~WbxmlParser()
{
}
/**
 * Puts the parser back into its initial state so that a new document can be
 * parsed. Note that this also detaches the current content handler.
 */
void WbxmlParser::reset(void)
{
    // Parser state machine and error status.
    mState = EXPECT_HEADER;
    mLastError = ERROR_NO_ERROR;

    // Document-level information.
    mPublicId = 0;
    mCurrTagPage = 0;
    mCurrAttrPage = 0;
    mStringTable.clear();
    mStartElemStack.clear();

    // Input buffering.
    mExternalChunk = NULL;
    mExternalChunkLen = 0;
    mLastChunk.clear();
    mDataOffset = 0;
    mIsDataEnd = false;

    // Event sink.
    mContentHandler = NULL;
}
/**
 * Installs the handler that receives parse events.
 *
 * @param handler the handler to notify; the parser checks it for NULL before
 *                every callback, so NULL disables notifications
 */
void WbxmlParser::setContentHandler(WbxmlContentHandler * handler)
{
    this->mContentHandler = handler;
}
/**
 * Parses the next chunk of a WBXML document and reports what it finds to the
 * content handler, if one is set.
 *
 * The parser is a resumable state machine. The low-level readers longjmp()
 * back to the setjmp() below when they run out of input or meet malformed
 * data. When input runs out and more is expected, the read position is
 * rewound to the start of the state-machine step that was interrupted and the
 * unread bytes are saved for the next call.
 *
 * @param data    chunk of WBXML data; must not be NULL
 * @param dataLen number of bytes in data
 * @param end     true when this is the last chunk of the document
 * @return WBXML_STATUS_OK when the document body has been parsed completely,
 *         or when more data is needed and end is false; WBXML_STATUS_ERROR
 *         otherwise, with the reason stored in mLastError
 */
int WbxmlParser::parse(const char * data, uint32_t dataLen, bool end)
{
    if (data == NULL) {
        mLastError = ERROR_INVALID_DATA;
        return WBXML_STATUS_ERROR;
    }
    // All temporary C++ variables must be declared before setjmp to make
    // sure they get properly destructed after longjmp.
    vector<Attribute> attribs;
    Attribute attrib;
    string tagName;
    string characters;
    string opaque;
#ifdef WBXML_DEBUG
    printf("\nparse dataLen %d; end %d; readPos %d; availData %d\n",
            dataLen, end, getReadPos(), availDataSize());
#endif
    appendData(data, dataLen, end);
    // volatile: written after setjmp() and read again after longjmp().
    volatile int readPos = getReadPos();
    int setjmpRet;
    switch (setjmpRet = setjmp(mJmpbuf)) {
        case 0:
            // Direct return from setjmp(): start (or resume) parsing.
            break;
        case ERROR_NEED_MORE_DATA:
            if (!mIsDataEnd) {
#ifdef WBXML_DEBUG
                printf("\nneed more data: readPos %d\n", readPos);
#endif
                // Rewind to the start of the interrupted step and keep the
                // unread bytes for the next parse() call.
                setReadPos(readPos);
                saveRemainingData();
                return WBXML_STATUS_OK;
            } else {
#ifdef WBXML_DEBUG
                printf("wbxml parser error: unexpected data end\n");
#endif
                mLastError = ERROR_NEED_MORE_DATA;
                return WBXML_STATUS_ERROR;
            }
            break;
        case ERROR_UNSUPPORTED_PUBID:
        case ERROR_UNSUPPORTED_CHARSET:
        case ERROR_INVALID_STRING_TABLE:
        case ERROR_INVALID_STRING_TABLE_REFERENCE:
        case ERROR_INVALID_EXT_TOKEN:
        case ERROR_INVALID_MBUINT:
        case ERROR_INVALID_ENTITY:
        case ERROR_UNRECOGNIZED_TAG:
        case ERROR_UNRECOGNIZED_ATTR:
        case ERROR_MISSING_ATTR:
        case ERROR_MISSING_TOKEN_END:
#ifdef WBXML_DEBUG
            printf("wbxml parser error %d\n", setjmpRet);
#endif
            mLastError = ParserError(setjmpRet);
            return WBXML_STATUS_ERROR;
            break;
        case ERROR_NOT_SUPPORTED_YET:
            printf("wbxml parser error: Not implemented feature.\n");
            mLastError = ParserError(setjmpRet);
            return WBXML_STATUS_ERROR;
            break;
        default:
            printf("wbxml parser error: Impossible execution path.\n");
            mLastError = ParserError(setjmpRet);
            return WBXML_STATUS_ERROR;
            break;
    }
    for (;;) {
        // save readPos for error recovery
        readPos = getReadPos();
        switch (mState) {
            case EXPECT_HEADER:
                mDocVersion = readByte();
                mPublicId = readMbuint32();
                if (mPublicId != 0) {
                    if (!selectTokenMapping(mPublicId)) {
#ifdef WBXML_DEBUG
                        printf("wbxml parser error: unsupported public id \n");
#endif
                        longjmp(mJmpbuf, ERROR_UNSUPPORTED_PUBID);
                    }
                } else {
                    // The public id is given as a string table index. It is
                    // stored negated until the string table has been read.
                    const uint32_t publicIdIndex = readMbuint32();
                    if (publicIdIndex > 0x7fffffffU) {
                        // Negating such an index would wrap around and could
                        // yield a bogus positive public id, which would skip
                        // the token mapping selection below.
                        longjmp(mJmpbuf, ERROR_UNSUPPORTED_PUBID);
                    }
                    mPublicId = -static_cast<int>(publicIdIndex);
                }
                mCharset = readMbuint32();
                if (!mCharset) {
                    // Charset not given in the document: fall back to the
                    // transport encoding, then to UTF-8.
                    mCharset = mTransportEncoding;
                    if (!mCharset) {
                        mCharset = CHARSET_UTF8;
                    }
                }
                // TODO: support more charsets other than UTF-8
                if (mCharset != CHARSET_UTF8) {
#ifdef WBXML_DEBUG
                    printf("wbxml parser error: unsupported charset\n");
#endif
                    longjmp(mJmpbuf, ERROR_UNSUPPORTED_CHARSET);
                }
                // now advance to next state
                if (mContentHandler) {
                    mContentHandler->handlePublicId(mPublicId);
                }
                mState = EXPECT_STRING_TABLE;
                break;
            case EXPECT_STRING_TABLE:
            {
                uint32_t len = readMbuint32();
                // Wait until the whole table is available before copying it.
                if (availDataSize() < len) {
                    longjmp(mJmpbuf, ERROR_NEED_MORE_DATA);
                }
                mStringTable.clear();
                // TODO: optimize this
                while (len--) {
                    mStringTable += readByte();
                }
                if (mStringTable.size()) {
                    if (mStringTable[mStringTable.size() - 1] != 0) {
                        // must have an ending \0
                        //TODO:the byte array returned by SCTS does not contain '\0' at the
                        //end,should this be fixed accordingly?
#ifdef WBXML_DEBUG
                        printf("wbxml parser error: invalid string table\n");
#endif
                        longjmp(mJmpbuf, ERROR_INVALID_STRING_TABLE);
                    }
                }
                mState = EXPECT_BODY_START;
                if (mPublicId <= 0) {
                    // The public id was given as a string table reference.
#ifdef SUPPORT_SYNCML
                    const uint32_t publicIdOffset = static_cast<uint32_t>(-mPublicId);
                    // Only dereference the offset when it lies inside the
                    // table; the table is known to be NUL-terminated here, so
                    // strcmp() cannot run past its end. An out-of-range offset
                    // is reported as an unsupported public id below.
                    if (publicIdOffset < mStringTable.size()) {
                        const char * s = mStringTable.c_str() + publicIdOffset;
                        if (strcmp(s, "-//SYNCML//DTD SyncML 1.2//EN") == 0) {
                            mPublicId = PUBLICID_SYNCML_1_2;
                        } else if (strcmp(s, "-//SYNCML//DTD SyncML 1.1//EN") == 0) {
                            mPublicId = PUBLICID_SYNCML_1_1;
                        } else if (strcmp(s, "-//SYNCML//DTD SyncML 1.0//EN") == 0) {
                            mPublicId = PUBLICID_SYNCML_1_0;
                        }
                    }
#endif
                    if ((mPublicId <= 0) || !selectTokenMapping(mPublicId)) {
                        longjmp(mJmpbuf, ERROR_UNSUPPORTED_PUBID);
                    }
                }
                break;
            }
            case EXPECT_BODY_START:
                //TODO: handle possible PIs
                mState = EXPECT_ELEMENT_START;
                break;
            case EXPECT_ELEMENT_START:
            {
                int stag = readByte();
                const char * name;
                if ((stag & 0x3f) == TOKEN_LITERAL) {
                    // Tag name is stored in the string table.
                    name = resolveStrTableRef();
                } else {
                    if (stag == TOKEN_SWITCH_PAGE) {
                        mCurrTagPage = readByte();
                        stag = readByte();
                    }
                    name = lookupTagName(stag);
                }
                if (name == NULL) {
#ifdef WBXML_DEBUG
                    printf("wbxml parser error: unrecognized tag\n");
#endif
                    longjmp(mJmpbuf, ERROR_UNRECOGNIZED_TAG);
                }
                attribs.clear();
                if (stag & 0x80) {
                    // followed by 1 or more attributes
                    while (peekByte() != TOKEN_END) {
                        readAttribute(&attrib);
                        attribs.push_back(attrib);
                    }
                    if (!attribs.size()) {
#ifdef WBXML_DEBUG
                        printf("wbxml parser error: missing attributes\n");
#endif
                        longjmp(mJmpbuf, ERROR_MISSING_ATTR);
                    }
                    // TOKEN_END
                    readByte();
                }
                if (mContentHandler) {
                    mContentHandler->startElement(name, attribs);
                }
                // Bit 0x40 of the tag token says whether content follows;
                // without it the element ends immediately.
                if (stag & 0x40) {
                    mState = EXPECT_CONTENT;
                } else {
                    mState = ELEMENT_END;
                }
                tagName = name;
                mStartElemStack.push_back(name);
                break;
            }
            case EXPECT_CONTENT:
            {
                // NOTE(review): `characters` is local to this call. If the
                // chunk ends after text has been accumulated but before the
                // next tag/END token is seen, that pending text appears to be
                // dropped when parse() returns to wait for more data --
                // confirm and address separately.
                int byte = peekByte();
                if (byte == TOKEN_SWITCH_PAGE) {
                    readByte();
                    mCurrTagPage = readByte();
                    byte = peekByte();
                }
                if (isTagStart(byte) || byte == TOKEN_END) {
                    // End of a text run: deliver what has been accumulated.
                    if (characters.size() && mContentHandler) {
                        mContentHandler->characters(characters.c_str(), characters.size());
                        characters.clear();
                    }
                    if (byte == TOKEN_END) {
                        mState = EXPECT_ELEMENT_END;
                    } else {
                        mState = EXPECT_ELEMENT_START;
                    }
                } else {
                    // TODO: handle extension and pi
                    switch (byte) {
                        case TOKEN_ENTITY:
                        case TOKEN_STR_I:
                        case TOKEN_STR_T:
                            readString(characters);
                            break;
                        case TOKEN_EXT_T_0:
                        {
                            readByte();
                            uint32_t valueToken = readMbuint32();
                            if (mPublicId == PUBLICID_IMPS_1_1
                                    || mPublicId == PUBLICID_IMPS_1_2
                                    || mPublicId == PUBLICID_IMPS_1_3) {
                                // IMPS extension tokens stand for predefined
                                // strings listed in csp13ExtValueTokens.
                                TokenData t = {valueToken, NULL};
                                const TokenData * res = (TokenData *)bsearch(&t,
                                        csp13ExtValueTokens, ARRAY_SIZE(csp13ExtValueTokens),
                                        sizeof(csp13ExtValueTokens[0]), compareTokenData);
                                if (res) {
                                    characters.append(res->tagName);
                                } else {
                                    longjmp(mJmpbuf, ERROR_INVALID_EXT_TOKEN);
                                }
                            } else {
                                printf ("Token 0x%x\n", byte);
                                longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET);
                            }
                            break;
                        }
                        case TOKEN_OPAQUE:
                        {
                            readByte();
                            uint32_t opaqueDataLen = readMbuint32();
                            opaque.clear();
                            while (opaqueDataLen--) {
                                opaque += (char)readByte();
                            }
                            if (mContentHandler) {
                                mContentHandler->opaque(opaque.c_str(), opaque.size());
                            }
                            break;
                        }
                        default:
                            printf ("Token 0x%x\n", byte);
                            longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET);
                            break;
                    }
                }
                break;
            }
            case EXPECT_ELEMENT_END:
                if (readByte() != TOKEN_END) {
#ifdef WBXML_DEBUG
                    printf("wbxml parser error: TOKEN_END expected\n");
#endif
                    longjmp(mJmpbuf, ERROR_MISSING_TOKEN_END);
                }
                mState = ELEMENT_END;
                break;
            case ELEMENT_END:
                // Close the innermost open element.
                assert(!mStartElemStack.empty());
                tagName = mStartElemStack.back();
                mStartElemStack.pop_back();
                if (mContentHandler) {
                    mContentHandler->endElement(tagName.c_str());
                }
                if (mStartElemStack.empty()) {
                    mState = EXPECT_BODY_END;
                } else {
                    mState = EXPECT_CONTENT;
                }
                break;
            case EXPECT_BODY_END:
                // TODO: handle possible PIs
                // we're done
                return WBXML_STATUS_OK;
                break;
        }
    }
}
/*
 * Registers the caller's buffer as the chunk for the current parse() call.
 * The data is not copied here: whatever is left unread is copied by
 * saveRemainingData() before parse() returns.
 */
void WbxmlParser::appendData(const char * data, uint32_t len, bool end)
{
    mIsDataEnd = end;
    mExternalChunkLen = len;
    mExternalChunk = data;
}
/**
 * Copies the data of the external chunk that still has to be parsed into
 * mLastChunk and forgets the external chunk.
 */
void WbxmlParser::saveRemainingData()
{
    const bool readPastLastChunk = mDataOffset > mLastChunk.size();
    if (!readPastLastChunk) {
        // The read position is still within (or right at the end of) the
        // previously saved data: keep it and add the whole external chunk.
        mLastChunk.append(mExternalChunk, mExternalChunkLen);
    } else {
        // Part of the external chunk has been consumed already: keep only
        // its unread tail and restart the offset from the beginning.
        uint32_t consumedFromExt = mDataOffset - mLastChunk.size();
        assert(consumedFromExt <= mExternalChunkLen);
        const char * unread = mExternalChunk + consumedFromExt;
        mLastChunk.assign(unread, mExternalChunkLen - consumedFromExt);
        mDataOffset = 0;
    }
    mExternalChunkLen = 0;
    mExternalChunk = NULL;
}
/**
 * Consumes the next input byte, taking it from the saved data first and from
 * the external chunk afterwards.
 *
 * Does not return normally when no byte is available: it longjmp()s to
 * mJmpbuf with ERROR_NEED_MORE_DATA instead.
 *
 * @return the byte read, in the range 0..255
 */
int WbxmlParser::readByte()
{
    unsigned char value = 0;
    if (mDataOffset >= mLastChunk.size()) {
        uint32_t extPos = mDataOffset - mLastChunk.size();
        if (extPos >= mExternalChunkLen) {
            longjmp(mJmpbuf, ERROR_NEED_MORE_DATA);
        }
        value = (unsigned char)mExternalChunk[extPos];
    } else {
        value = (unsigned char)mLastChunk[mDataOffset];
    }
    mDataOffset++;
#ifdef WBXML_DEBUG
    printf ("rb 0x%x; ", value);
#endif
    return value;
}
/**
 * Returns the next input byte without consuming it.
 *
 * Does not return normally when no byte is available: it longjmp()s to
 * mJmpbuf with ERROR_NEED_MORE_DATA instead.
 *
 * @return the next byte, in the range 0..255
 */
int WbxmlParser::peekByte()
{
    if (mDataOffset >= mLastChunk.size()) {
        // The saved data is exhausted; look into the external chunk.
        uint32_t extPos = mDataOffset - mLastChunk.size();
        if (extPos >= mExternalChunkLen) {
            longjmp(mJmpbuf, ERROR_NEED_MORE_DATA);
        }
        return (unsigned char)mExternalChunk[extPos];
    }
    return (unsigned char)mLastChunk[mDataOffset];
}
/**
 * Reads a multi-byte unsigned integer: each byte contributes its low seven
 * bits, most significant group first, and a set high bit (0x80) means that
 * another byte follows.
 *
 * longjmp()s to mJmpbuf with ERROR_INVALID_MBUINT when the value would not
 * fit in 32 bits.
 *
 * @return the decoded value
 */
uint32_t WbxmlParser::readMbuint32()
{
    uint32_t result = 0;
    for (;;) {
        // Shifting in seven more bits must not push any set bit out.
        if ((result >> 25) != 0) {
            longjmp(mJmpbuf, ERROR_INVALID_MBUINT);
        }
        const uint32_t octet = readByte();
        result = (result << 7) | (octet & 0x7f);
        if (!(octet & 0x80)) {
            return result;
        }
    }
}
/**
 * Consumes one STR_I, STR_T or ENTITY item and *appends* its text to str.
 *
 * - STR_I:  inline string terminated by a zero byte.
 * - STR_T:  reference to a string in the string table.
 * - ENTITY: character code, appended in its UTF-8 encoding.
 *
 * Any other token is reported through longjmp() with ERROR_NOT_SUPPORTED_YET;
 * a character code above 0x10ffff is reported with ERROR_INVALID_ENTITY.
 *
 * @param str string that receives the decoded text
 */
void WbxmlParser::readString(string & str)
{
    const int token = readByte();

    if (token == TOKEN_STR_I) {
        //TODO: assuming UTF-8
        for (int c = readByte(); c != 0; c = readByte()) {
            str += (char)c;
        }
        return;
    }

    if (token == TOKEN_STR_T) {
        str.append(resolveStrTableRef());
        return;
    }

    if (token != TOKEN_ENTITY) {
        // impossible
        printf ("Unknown token 0x%02x\n", token);
        longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET);
    }

    //TODO: assuming UTF-8 for now.
    const uint32_t ch = readMbuint32();
    if (ch > 0x10ffff) {
        // not a valid UCS-4 character
        longjmp(mJmpbuf, ERROR_INVALID_ENTITY);
    }
    if (ch <= 0x7f) {
        // one byte
        str += (char)ch;
    } else if (ch <= 0x7ff) {
        // two bytes
        str += (char)((ch >> 6) | 0xc0);
        str += (char)((ch & 0x3f) | 0x80);
    } else if (ch <= 0xffff) {
        // three bytes
        str += (char)((ch >> 12) | 0xe0);
        str += (char)(((ch >> 6) & 0x3f) | 0x80);
        str += (char)((ch & 0x3f) | 0x80);
    } else {
        // four bytes: 010000 - 10FFFF
        str += (char)((ch >> 18) | 0xf0);
        str += (char)(((ch >> 12) & 0x3f) | 0x80);
        str += (char)(((ch >> 6) & 0x3f) | 0x80);
        str += (char)((ch & 0x3f) | 0x80);
    }
}
/**
 * Reads a string table offset from the input and resolves it.
 *
 * longjmp()s to mJmpbuf with ERROR_INVALID_STRING_TABLE_REFERENCE when the
 * offset does not lie inside the string table.
 *
 * @return pointer into mStringTable at the given offset
 */
const char * WbxmlParser::resolveStrTableRef(void)
{
    const uint32_t offset = readMbuint32();
    if (offset < mStringTable.size()) {
        return mStringTable.c_str() + offset;
    }
    longjmp(mJmpbuf, ERROR_INVALID_STRING_TABLE_REFERENCE);
    return NULL; // not reached
}
/**
 * Selects the tag and attribute code pages that belong to a public id.
 *
 * @param publicId public identifier of the document type
 * @return true if the public id is supported and the code pages were set,
 *         false otherwise (the current code pages are left untouched)
 */
bool WbxmlParser::selectTokenMapping(int publicId)
{
    switch (publicId) {
        case PUBLICID_IMPS_1_1:
        case PUBLICID_IMPS_1_2:
        case PUBLICID_IMPS_1_3:
            // All IMPS versions share the CSP 1.3 tables.
            mTagPages = csp13TagPages;
            mNumTagPages = ARRAY_SIZE(csp13TagPages);
            mAttrPages = csp13AttrPages;
            mNumAttrPages = ARRAY_SIZE(csp13AttrPages);
            return true;
#ifdef SUPPORT_SYNCML
        case PUBLICID_SYNCML_1_0:
        case PUBLICID_SYNCML_1_1:
        case PUBLICID_SYNCML_1_2:
        case PUBLICID_SYNCML_METINF_1_2:
            // No attribute pages are set up for SyncML.
            mTagPages = syncmlTagPages;
            mNumTagPages = ARRAY_SIZE(syncmlTagPages);
            mAttrPages = NULL;
            mNumAttrPages = 0;
            return true;
        case PUBLICID_SYNCML_DEVINF_1_2:
            mTagPages = syncmlDevInfTagPages;
            mNumTagPages = ARRAY_SIZE(syncmlDevInfTagPages);
            mAttrPages = NULL;
            mNumAttrPages = 0;
            return true;
#endif
        default:
            break;
    }
    return false;
}
const char * WbxmlParser::lookupTagName(int tag) const
{
tag = tag & 0x3f;
// TODO: optimize this
if (mCurrTagPage >= mNumTagPages) {
return NULL;
}
const TagCodePage * page = &mTagPages[mCurrTagPage];
if (page == NULL) {
return NULL;
}
TokenData t = {tag, NULL};
const TokenData * res = (TokenData *)bsearch(&t, page->tags, page->numTokens,
sizeof(TokenData), compareTokenData);
if (res) {
return res->tagName;
}
return NULL;
}
const char * WbxmlParser::lookupAttrName(int token, const char **prefix) const
{
// TODO: optimize this
if (mCurrAttrPage >= mNumAttrPages) {
return NULL;
}
const AttrCodePage * page = &mAttrPages[mCurrAttrPage];
if (page == NULL) {
return NULL;
}
AttrData t = {token, NULL, NULL};
const AttrData * res = (AttrData *)bsearch(&t, page->attrs, page->numTokens,
sizeof(AttrData), compareAttrData);
if (res) {
if (prefix) {
*prefix = res->attrValuePrefix;
}
return res->attrName;
}
return NULL;
}
/**
 * Reads one attribute (its start token followed by its value tokens) from the
 * input.
 *
 * The value begins with the prefix implied by the start token, if any, and is
 * extended by every string, string table reference or entity that follows.
 * Reading stops, without consuming it, at the next attribute start token,
 * LITERAL or END.
 *
 * longjmp()s to mJmpbuf with ERROR_UNRECOGNIZED_ATTR for an unknown start
 * token and with ERROR_NOT_SUPPORTED_YET for value tokens that are not
 * handled.
 *
 * @param attrib receives the attribute name and value
 */
void WbxmlParser::readAttribute(Attribute * attrib)
{
    // attribute start: attrib start token, LITERAL or END
    const char * attrName = NULL;
    const char * prefix = NULL;
    int startToken = readByte();
    if (startToken != TOKEN_LITERAL) {
        if (startToken == TOKEN_SWITCH_PAGE) {
            mCurrAttrPage = readByte();
            startToken = readByte();
        }
        attrName = lookupAttrName(startToken, &prefix);
    } else {
        // The attribute name is stored in the string table.
        attrName = resolveStrTableRef();
    }
    if (attrName == NULL) {
        longjmp(mJmpbuf, ERROR_UNRECOGNIZED_ATTR);
    }

    attrib->name = attrName;
    if (prefix != NULL) {
        attrib->value = prefix;
    } else {
        attrib->value = "";
    }

    // now attribute value: zero or more value, string, entity or extension tokens
    for (;;) {
        const int next = peekByte();
        if (next == TOKEN_END || isAttrStart(next)) {
            // An attribute start token, a LITERAL token or the END token
            // indicates the end of an attribute value.
            return;
        }
        switch (next) {
            case TOKEN_STR_I:
            case TOKEN_STR_T:
            case TOKEN_ENTITY:
                readString(attrib->value);
                break;
            case TOKEN_EXT_0:
            case TOKEN_EXT_1:
            case TOKEN_EXT_2:
            case TOKEN_EXT_I_0:
            case TOKEN_EXT_I_1:
            case TOKEN_EXT_I_2:
                //TODO: document type specific
                printf ("Unsupported Token 0x%x\n", next);
                longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET);
                break;
            default:
                //TODO
                printf ("Unknown Token 0x%x\n", next);
                longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET);
                break;
        }
    }
}