src/dc_lex.c - platform/external/bc - Git at Google

 /*
  * *****************************************************************************
  *
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 2018-2021 Gavin D. Howard and contributors.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  * * Redistributions of source code must retain the above copyright notice, this
  *   list of conditions and the following disclaimer.
  *
  * * Redistributions in binary form must reproduce the above copyright notice,
  *   this list of conditions and the following disclaimer in the documentation
  *   and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
  * *****************************************************************************
  *
  * The lexer for dc.
  *
  */

 #if DC_ENABLED

 #include <ctype.h>

 #include <dc.h>
 #include <vm.h>

 /**
  * Tests the character at the lexer's current index against BC_LEX_NUM_CHAR.
  * @param l  The lexer.
  * @return   False if that character satisfies
  *           BC_LEX_NUM_CHAR(c, false, false); true otherwise.
  */
 bool dc_lex_negCommand(BcLex *l) {

 	const char cur = l->buf[l->i];

 	if (BC_LEX_NUM_CHAR(cur, false, false)) return false;

 	return true;
 }

 /**
  * Processes a dc command that needs a register. This is where the
  * extended-register extension is implemented.
  *
  * The character examined is the one at index l->i - 1, i.e., the character
  * that dc_lex_token() consumed just before calling this function. On return,
  * the register name is stored in l->str and, in the single-character case,
  * l->t is set to BC_LEX_NAME.
  * @param l  The lexer.
  */
 static void dc_lex_register(BcLex *l) {

 	// The character that was consumed right before this call.
 	const char prev = l->buf[l->i - 1];

 	// If extended register is enabled and the character is whitespace...
 	// The cast is required: passing a negative char to a <ctype.h> function
 	// is undefined behavior.
 	if (DC_X && isspace((unsigned char) prev)) {

 		char c;

 		// Eat the whitespace.
 		bc_lex_whitespace(l);
 		c = l->buf[l->i];

 		// Check for a letter or underscore. Same cast as above, for the same
 		// reason.
 		if (BC_ERR(!isalpha((unsigned char) c) && c != '_'))
 			bc_lex_verr(l, BC_ERR_PARSE_CHAR, c);

 		// Parse a normal identifier.
 		l->i += 1;
 		bc_lex_name(l);
 	}
 	else {

 		// I don't allow newlines because newlines are used for controlling when
 		// execution happens, and allowing newlines would just be complex.
 		if (BC_ERR(prev == '\n'))
 			bc_lex_verr(l, BC_ERR_PARSE_CHAR, prev);

 		// Set the lexer string (the single character plus a NUL terminator)
 		// and the token.
 		bc_vec_popAll(&l->str);
 		bc_vec_pushByte(&l->str, (uchar) prev);
 		bc_vec_pushByte(&l->str, '\0');
 		l->t = BC_LEX_NAME;
 	}
 }

 /**
  * Lexes a dc string, starting at l->i. dc strings nest, so the scan tracks
  * bracket depth until the opening bracket is balanced; a backslash causes the
  * following character to be taken without affecting the depth. The contents
  * are collected into l->str (NUL-terminated), l->i is moved to where the scan
  * stopped, and l->line is advanced by the number of newlines seen.
  *
  * If the buffer ends before the brackets balance and the lexer is reading
  * stdin, another line is requested and the scan restarts from l->i.
  * @param l  The lexer.
  */
 static void dc_lex_string(BcLex *l) {

 	size_t level, newlines, pos;
 	char ch;
 	bool refilled;

 	// Mark the token as a string and start from an empty string.
 	l->t = BC_LEX_STR;
 	bc_vec_popAll(&l->str);

 	do {

 		// Every pass rescans from l->i, so all of the counters start over.
 		level = 1;
 		newlines = 0;
 		refilled = false;

 		assert(!l->is_stdin || l->buf == vm.buffer.v);

 		pos = l->i;

 		// Keep consuming until the NUL byte is hit or the brackets balance.
 		while ((ch = l->buf[pos]) != '\0' && level != 0) {

 			if (ch == '\\') {

 				// Take the character after the backslash as-is. If it is
 				// the NUL byte, stop with pos pointing at it.
 				pos += 1;
 				ch = l->buf[pos];
 				if (ch == '\0') break;
 			}
 			else if (ch == '[') level += 1;
 			else if (ch == ']') level -= 1;

 			// Count newlines so the lexer's line number can be adjusted.
 			if (ch == '\n') newlines += 1;

 			// The closing bracket that balances the string is not stored.
 			if (level != 0) bc_vec_push(&l->str, &ch);

 			pos += 1;
 		}

 		// Ran out of data while still unbalanced: try to get more from stdin
 		// and, on success, throw away what was collected before rescanning.
 		if (BC_ERR(ch == '\0' && level != 0)) {
 			if (!vm.eof && l->is_stdin) refilled = bc_lex_readLine(l);
 			if (refilled) bc_vec_popAll(&l->str);
 		}

 	} while (refilled && level != 0);

 	// Still unbalanced with nothing more to read; that is an error.
 	if (BC_ERR(ch == '\0' && level != 0)) {
 		l->i = pos;
 		bc_lex_err(l, BC_ERR_PARSE_STRING);
 	}

 	bc_vec_pushByte(&l->str, '\0');

 	l->i = pos;
 	l->line += newlines;
 }

 /**
  * Lexes the next dc token. This is the dc implementation of BcLexNext.
  *
  * One character is consumed up front. If the previous token (l->last) is one
  * of the entries in dc_lex_regs, that character is handed to
  * dc_lex_register(). Otherwise, the character is looked up in dc_lex_tokens,
  * and anything that table marks as BC_LEX_INVALID is handled by the switch.
  * @param l  The lexer.
  */
 void dc_lex_token(BcLex *l) {

 	size_t idx;
 	char next;
 	char c = l->buf[l->i++];

 	// A command that takes a register must be followed by a register, so if
 	// the last token was such a command, lex the register and finish.
 	idx = 0;
 	while (idx < dc_lex_regs_len) {

 		if (l->last == dc_lex_regs[idx]) {
 			dc_lex_register(l);
 			return;
 		}

 		idx += 1;
 	}

 	// Characters that map directly onto a single token are resolved with a
 	// table lookup. The token is stored even when the lookup is invalid.
 	if (c >= '"' && c <= '~') {
 		l->t = dc_lex_tokens[(c - '"')];
 		if (l->t != BC_LEX_INVALID) return;
 	}

 	// Everything else needs more than a table lookup.
 	switch (c) {

 		case '\0': case '\n': case '\t': case '\v':
 		case '\f': case '\r': case ' ':
 		{
 			bc_lex_commonTokens(l, c);
 			break;
 		}

 		// There is no standalone ! command, so the exclamation point must be
 		// followed by one of =, <, or >.
 		case '!':
 		{
 			next = l->buf[l->i];

 			switch (next) {
 				case '=': l->t = BC_LEX_OP_REL_NE; break;
 				case '<': l->t = BC_LEX_OP_REL_LE; break;
 				case '>': l->t = BC_LEX_OP_REL_GE; break;
 				default: bc_lex_invalidChar(l, c); break;
 			}

 			l->i += 1;

 			break;
 		}

 		case '#':
 		{
 			bc_lex_lineComment(l);
 			break;
 		}

 		case '.':
 		{
 			next = l->buf[l->i];

 			// A dot is only accepted as the start of a number, which requires
 			// a number character right after it. A dot followed by anything
 			// else is reported as an invalid character.
 			if (BC_NO_ERR(BC_LEX_NUM_CHAR(next, true, false)))
 				bc_lex_number(l, c);
 			else bc_lex_invalidChar(l, c);

 			break;
 		}

 		case '0': case '1': case '2': case '3': case '4':
 		case '5': case '6': case '7': case '8': case '9':
 		case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 		{
 			bc_lex_number(l, c);
 			break;
 		}

 		case '[':
 		{
 			dc_lex_string(l);
 			break;
 		}

 		default:
 		{
 			bc_lex_invalidChar(l, c);
 			break;
 		}
 	}
 }
 #endif // DC_ENABLED
	/*
	* *****************************************************************************
	*
	* SPDX-License-Identifier: BSD-2-Clause
	*
	* Copyright (c) 2018-2021 Gavin D. Howard and contributors.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	*
	* * Redistributions of source code must retain the above copyright notice, this
	* list of conditions and the following disclaimer.
	*
	* * Redistributions in binary form must reproduce the above copyright notice,
	* this list of conditions and the following disclaimer in the documentation
	* and/or other materials provided with the distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
	* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	* POSSIBILITY OF SUCH DAMAGE.
	*
	* *****************************************************************************
	*
	* The lexer for dc.
	*
	*/

	#if DC_ENABLED

	#include <ctype.h>

	#include <dc.h>
	#include <vm.h>

	/**
	 * Tests the character at the lexer's current index against BC_LEX_NUM_CHAR.
	 * @param l  The lexer.
	 * @return   False if that character satisfies
	 *           BC_LEX_NUM_CHAR(c, false, false); true otherwise.
	 */
	bool dc_lex_negCommand(BcLex *l) {

		const char cur = l->buf[l->i];

		if (BC_LEX_NUM_CHAR(cur, false, false)) return false;

		return true;
	}

	/**
	 * Processes a dc command that needs a register. This is where the
	 * extended-register extension is implemented.
	 *
	 * The character examined is the one at index l->i - 1, i.e., the character
	 * that dc_lex_token() consumed just before calling this function. On
	 * return, the register name is stored in l->str and, in the
	 * single-character case, l->t is set to BC_LEX_NAME.
	 * @param l  The lexer.
	 */
	static void dc_lex_register(BcLex *l) {

		// The character that was consumed right before this call.
		const char prev = l->buf[l->i - 1];

		// If extended register is enabled and the character is whitespace...
		// The cast is required: passing a negative char to a <ctype.h>
		// function is undefined behavior.
		if (DC_X && isspace((unsigned char) prev)) {

			char c;

			// Eat the whitespace.
			bc_lex_whitespace(l);
			c = l->buf[l->i];

			// Check for a letter or underscore. Same cast as above, for the
			// same reason.
			if (BC_ERR(!isalpha((unsigned char) c) && c != '_'))
				bc_lex_verr(l, BC_ERR_PARSE_CHAR, c);

			// Parse a normal identifier.
			l->i += 1;
			bc_lex_name(l);
		}
		else {

			// I don't allow newlines because newlines are used for controlling
			// when execution happens, and allowing newlines would just be
			// complex.
			if (BC_ERR(prev == '\n'))
				bc_lex_verr(l, BC_ERR_PARSE_CHAR, prev);

			// Set the lexer string (the single character plus a NUL
			// terminator) and the token.
			bc_vec_popAll(&l->str);
			bc_vec_pushByte(&l->str, (uchar) prev);
			bc_vec_pushByte(&l->str, '\0');
			l->t = BC_LEX_NAME;
		}
	}

	/**
	 * Parses a dc string. Since dc's strings need to check for balanced
	 * brackets, we can't just parse bc and dc strings with different start and
	 * end characters. Oh, and dc strings need to check for escaped brackets.
	 *
	 * The contents are collected into l->str (NUL-terminated), l->i is moved
	 * to where the scan stopped, and l->line is advanced by the number of
	 * newlines seen. If the buffer ends before the brackets balance and the
	 * lexer is reading stdin, another line is requested and the scan restarts
	 * from l->i.
	 * @param l  The lexer.
	 */
	static void dc_lex_string(BcLex *l) {

		size_t depth, nls, i;
		char c;
		bool got_more;

		// Set the token and clear the string.
		l->t = BC_LEX_STR;
		bc_vec_popAll(&l->str);

		do {

			// Every pass rescans from l->i, so the counters start over.
			depth = 1;
			nls = 0;
			got_more = false;

			assert(!l->is_stdin || l->buf == vm.buffer.v);

			// This is the meat. As long as we don't run into the NUL byte, and
			// we have "depth", which means we haven't completely balanced
			// brackets yet, we continue eating the string.
			for (i = l->i; (c = l->buf[i]) && depth; ++i) {

				// Check for escaped brackets and set the depths as
				// appropriate. The character after a backslash is taken as-is
				// and does not change the depth.
				if (c == '\\') {
					c = l->buf[++i];
					if (!c) break;
				}
				else {
					depth += (c == '[');
					depth -= (c == ']');
				}

				// We want to adjust the line in the lexer as necessary.
				nls += (c == '\n');

				// The closing bracket that balances the string is not stored.
				if (depth) bc_vec_push(&l->str, &c);
			}

			// Ran out of data while still unbalanced: try to get more from
			// stdin and, on success, discard what was collected so far.
			if (BC_ERR(c == '\0' && depth)) {
				if (!vm.eof && l->is_stdin) got_more = bc_lex_readLine(l);
				if (got_more) bc_vec_popAll(&l->str);
			}

		} while (got_more && depth);

		// Obviously, if we didn't balance, that's an error.
		if (BC_ERR(c == '\0' && depth)) {
			l->i = i;
			bc_lex_err(l, BC_ERR_PARSE_STRING);
		}

		bc_vec_pushByte(&l->str, '\0');

		l->i = i;
		l->line += nls;
	}

	/**
	 * Lexes the next dc token. This is the dc implementation of BcLexNext.
	 *
	 * One character is consumed up front. If the previous token (l->last) is
	 * one of the entries in dc_lex_regs, that character is handed to
	 * dc_lex_register(). Otherwise, the character is looked up in
	 * dc_lex_tokens, and anything that table marks as BC_LEX_INVALID is
	 * handled by the switch.
	 * @param l  The lexer.
	 */
	void dc_lex_token(BcLex *l) {

		size_t idx;
		char next;
		char c = l->buf[l->i++];

		// A command that takes a register must be followed by a register, so
		// if the last token was such a command, lex the register and finish.
		idx = 0;
		while (idx < dc_lex_regs_len) {

			if (l->last == dc_lex_regs[idx]) {
				dc_lex_register(l);
				return;
			}

			idx += 1;
		}

		// Characters that map directly onto a single token are resolved with
		// a table lookup. The token is stored even when the lookup is invalid.
		if (c >= '"' && c <= '~') {
			l->t = dc_lex_tokens[(c - '"')];
			if (l->t != BC_LEX_INVALID) return;
		}

		// Everything else needs more than a table lookup.
		switch (c) {

			case '\0': case '\n': case '\t': case '\v':
			case '\f': case '\r': case ' ':
			{
				bc_lex_commonTokens(l, c);
				break;
			}

			// There is no standalone ! command, so the exclamation point must
			// be followed by one of =, <, or >.
			case '!':
			{
				next = l->buf[l->i];

				switch (next) {
					case '=': l->t = BC_LEX_OP_REL_NE; break;
					case '<': l->t = BC_LEX_OP_REL_LE; break;
					case '>': l->t = BC_LEX_OP_REL_GE; break;
					default: bc_lex_invalidChar(l, c); break;
				}

				l->i += 1;

				break;
			}

			case '#':
			{
				bc_lex_lineComment(l);
				break;
			}

			case '.':
			{
				next = l->buf[l->i];

				// A dot is only accepted as the start of a number, which
				// requires a number character right after it. A dot followed
				// by anything else is reported as an invalid character.
				if (BC_NO_ERR(BC_LEX_NUM_CHAR(next, true, false)))
					bc_lex_number(l, c);
				else bc_lex_invalidChar(l, c);

				break;
			}

			case '0': case '1': case '2': case '3': case '4':
			case '5': case '6': case '7': case '8': case '9':
			case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
			{
				bc_lex_number(l, c);
				break;
			}

			case '[':
			{
				dc_lex_string(l);
				break;
			}

			default:
			{
				bc_lex_invalidChar(l, c);
				break;
			}
		}
	}
	#endif // DC_ENABLED