var assert = require('assert'),
    fs = require('fs'),
    path = require('path'),
    Tokenizer = require('../../lib/tokenization/tokenizer');
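
//Tokenizes the given HTML starting from the given initial state and returns the emitted tokens
//in html5lib test suite format (adjacent character tokens are concatenated).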
function tokenize(html, initialState, lastStartTag) {
    var tokenizer = new Tokenizer(html),
        nextToken = null,
        out = [];

    tokenizer.state = initialState;

    if (lastStartTag)
        tokenizer.lastStartTagName = lastStartTag;

    do {
        nextToken = tokenizer.getNextToken();

        //NOTE: append current token to the output sequence in html5lib test suite compatible format
        switch (nextToken.type) {
            case Tokenizer.CHARACTER_TOKEN:
            case Tokenizer.NULL_CHARACTER_TOKEN:
            case Tokenizer.WHITESPACE_CHARACTER_TOKEN:
                out.push(['Character', nextToken.chars]);
                break;

            case Tokenizer.START_TAG_TOKEN:
                var reformatedAttrs = {};

                nextToken.attrs.forEach(function (attr) {
                    reformatedAttrs[attr.name] = attr.value;
                });

                var startTagEntry = [
                    'StartTag',
                    nextToken.tagName,
                    reformatedAttrs
                ];

                if (nextToken.selfClosing)
                    startTagEntry.push(true);

                out.push(startTagEntry);
                break;

            case Tokenizer.END_TAG_TOKEN:
                out.push(['EndTag', nextToken.tagName]);
                break;

            case Tokenizer.COMMENT_TOKEN:
                out.push(['Comment', nextToken.data]);
                break;

            case Tokenizer.DOCTYPE_TOKEN:
                out.push([
                    'DOCTYPE',
                    nextToken.name,
                    nextToken.publicId,
                    nextToken.systemId,
                    !nextToken.forceQuirks
                ]);
                break;
        }
    } while (nextToken.type !== Tokenizer.EOF_TOKEN);

    return concatCharacterTokens(out);
}
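
//Converts '\uXXXX' escape sequences in double-escaped test data to the corresponding characters.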
function unicodeUnescape(str) {
    return str.replace(/\\u([\d\w]{4})/gi, function (match, chCodeStr) {
        return String.fromCharCode(parseInt(chCodeStr, 16));
    });
}
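
//Unescapes the input string and the expected output entries of a double-escaped test description.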
function unescapeDescrIO(testDescr) {
    testDescr.input = unicodeUnescape(testDescr.input);

    testDescr.output.forEach(function (tokenEntry) {
        if (tokenEntry === 'ParseError')
            return;

        //NOTE: unescape token tagName (for StartTag and EndTag tokens), comment data (for Comment token),
        //character token data (for Character token).
        tokenEntry[1] = unicodeUnescape(tokenEntry[1]);

        //NOTE: unescape token attributes (if we have them). They are stored as an object in the
        //third entry, so both attribute names and values need unescaping.
        if (tokenEntry.length > 2) {
            Object.keys(tokenEntry[2]).forEach(function (attrName) {
                var attrVal = tokenEntry[2][attrName];

                delete tokenEntry[2][attrName];
                tokenEntry[2][unicodeUnescape(attrName)] = unicodeUnescape(attrVal);
            });
        }
    });
}
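
//Merges adjacent 'Character' entries, since the html5lib expected output represents consecutive
//character tokens as a single concatenated entry.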
function concatCharacterTokens(tokenEntries) {
    var result = [];

    tokenEntries.forEach(function (tokenEntry) {
        if (tokenEntry[0] === 'Character') {
            var lastEntry = result[result.length - 1];

            if (lastEntry && lastEntry[0] === 'Character') {
                lastEntry[1] += tokenEntry[1];
                return;
            }
        }

        result.push(tokenEntry);
    });

    return result;
}
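
//Maps an html5lib state name (e.g. 'RCDATA state') to the corresponding tokenizer constant name
//(e.g. 'RCDATA_STATE').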
function getTokenizerSuitableStateName(testDataStateName) {
    return testDataStateName.toUpperCase().replace(/\s/g, '_');
}
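
//Loads all html5lib test set files from test/data/tokenization and builds one test case per
//(description, initial state) pair. 'ParseError' entries are dropped from the expected output,
//since only the emitted tokens are compared.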
function loadTests() {
    var dataDirPath = path.join(__dirname, '../data/tokenization'),
        testSetFileNames = fs.readdirSync(dataDirPath),
        testIdx = 0,
        tests = [];

    testSetFileNames.forEach(function (fileName) {
        var filePath = path.join(dataDirPath, fileName),
            testSetJson = fs.readFileSync(filePath).toString(),
            testSet = JSON.parse(testSetJson),
            testDescrs = testSet.tests,
            setName = fileName.replace('.test', '');

        testDescrs.forEach(function (descr) {
            if (!descr.initialStates)
                descr.initialStates = ['Data state'];

            if (descr.doubleEscaped)
                unescapeDescrIO(descr);

            var expected = [];

            descr.output.forEach(function (tokenEntry) {
                if (tokenEntry !== 'ParseError')
                    expected.push(tokenEntry);
            });

            descr.initialStates.forEach(function (initialState) {
                tests.push({
                    idx: ++testIdx,
                    setName: setName,
                    name: descr.description,
                    input: descr.input,
                    expected: concatCharacterTokens(expected),
                    initialState: getTokenizerSuitableStateName(initialState),
                    lastStartTag: descr.lastStartTag
                });
            });
        });
    });

    return tests;
}
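
//Builds a unique, descriptive name for a loaded test case.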
function getFullTestName(test) {
    return ['Tokenizer - ' + test.idx, '.', test.setName, ' - ', test.name,
            ' - Initial state: ', test.initialState].join('');
}
//Here we go..
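//NOTE: each loaded html5lib test case is registered as a separate exported test function.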
loadTests().forEach(function (test) {
    exports[getFullTestName(test)] = function () {
        var out = tokenize(test.input, test.initialState, test.lastStartTag);

        assert.deepEqual(out, test.expected);
    };
});
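
//The locationInfo option should attach a source location to every emitted token. Each test case
//builds its input from chunks so that every token is expected to map back onto exactly one chunk
//of the original HTML.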
exports['Options - locationInfo'] = function () {
    var testCases = [
        {
            initialMode: Tokenizer.MODE.DATA,
            lastStartTagName: '',
            htmlChunks: [
                '\r\n', '<!DOCTYPE html>', '\n',
                '<!-- Test -->', '\n',
                '<head>',
                '\n ', '<meta charset="utf-8">', '<title>', ' ', 'node.js', '\u0000', '</title>', '\n',
                '</head>', '\n',
                '<body id="front">', '\n',
                '<div id="intro">',
                '\n ', '<p>',
                '\n ', 'Node.js', ' ', 'is', ' ', 'a',
                '\n ', 'platform', ' ', 'built', ' ', 'on',
                '\n ', '<a href="http://code.google.com/p/v8/">',
                '\n ', 'Chrome\'s', ' ', 'JavaScript', ' ', 'runtime',
                '\n ', '</a>', '\n',
                '</div>',
                '<body>'
            ]
        },
        {
            initialMode: Tokenizer.MODE.RCDATA,
            lastStartTagName: 'title',
            htmlChunks: [
                '<div>Test',
                ' \n ', 'hey', ' ', 'ya!', '</title>', '<!--Yo-->'
            ]
        },
        {
            initialMode: Tokenizer.MODE.RAWTEXT,
            lastStartTagName: 'style',
            htmlChunks: [
                '.header{', ' \n ', 'color:red;', '\n', '}', '</style>', 'Some', ' ', 'text'
            ]
        },
        {
            initialMode: Tokenizer.MODE.SCRIPT_DATA,
            lastStartTagName: 'script',
            htmlChunks: [
                'var', ' ', 'a=c', ' ', '-', ' ', 'd;', '\n', 'a<--d;', '</script>', '<div>'
            ]
        },
        {
            initialMode: Tokenizer.MODE.PLAINTEXT,
            lastStartTagName: 'plaintext',
            htmlChunks: [
                'Text', ' \n', 'Test</plaintext><div>'
            ]
        }
    ];
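
    //NOTE: verify that the reported start/end offsets of every token map back to exactly the
    //chunk of input HTML that produced it.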
    testCases.forEach(function (testCase) {
        var html = testCase.htmlChunks.join(''),
            tokenizer = new Tokenizer(html, {locationInfo: true});

        tokenizer.state = testCase.initialMode;
        tokenizer.lastStartTagName = testCase.lastStartTagName;

        for (var token = tokenizer.getNextToken(), i = 0; token.type !== Tokenizer.EOF_TOKEN;) {
            var chunk = html.substring(token.location.start, token.location.end);

            assert.strictEqual(chunk, testCase.htmlChunks[i]);

            token = tokenizer.getNextToken();
            i++;
        }
    });
};