An implementation of Unix dc and POSIX bc with GNU and BSD extensions. Finished, but well-maintained.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
bc/src/bc_parse.c

2598 lines
68 KiB

/*
* *****************************************************************************
*
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2018-2021 Gavin D. Howard and contributors.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* *****************************************************************************
*
* The parser for bc.
*
*/
#if BC_ENABLED
#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <setjmp.h>
#include <bc.h>
#include <num.h>
#include <vm.h>
// Before you embark on trying to understand this code, have you read the
// Development manual (manuals/development.md) and the comment in include/bc.h
// yet? No? Do that first. I'm serious.
//
// The reason is because this file holds the most sensitive and finicky code in
// the entire codebase. Even getting history to work on Windows was nothing
// compared to this. This is where dreams go to die, where dragons live, and
// from which Ken Thompson himself would flee.
static void
bc_parse_else(BcParse* p);
static void
bc_parse_stmt(BcParse* p);
static BcParseStatus
bc_parse_expr_err(BcParse* p, uint8_t flags, BcParseNext next);
static void
bc_parse_expr_status(BcParse* p, uint8_t flags, BcParseNext next);
/**
* Returns true if an instruction could only have come from a "leaf" expression.
* For more on what leaf expressions are, read the comment for BC_PARSE_LEAF().
* @param t The instruction to test.
* @return True if the instruction is a from a leaf expression.
*/
static bool
bc_parse_inst_isLeaf(BcInst t)
{
return (t >= BC_INST_NUM && t <= BC_INST_LEADING_ZERO) ||
#if BC_ENABLE_EXTRA_MATH
t == BC_INST_TRUNC ||
#endif // BC_ENABLE_EXTRA_MATH
t <= BC_INST_DEC;
}
/**
* Returns true if the *previous* token was a delimiter. A delimiter is anything
* that can legally end a statement. In bc's case, it could be a newline, a
* semicolon, and a brace in certain cases.
* @param p The parser.
* @return True if the token is a legal delimiter.
*/
static bool
bc_parse_isDelimiter(const BcParse* p)
{
BcLexType t = p->l.t;
bool good;
// If it's an obvious delimiter, say so.
if (BC_PARSE_DELIMITER(t)) return true;
good = false;
// If the current token is a keyword, then...beware. That means that we need
// to check for a "dangling" else, where there was no brace-delimited block
// on the previous if.
if (t == BC_LEX_KW_ELSE)
{
size_t i;
uint16_t *fptr = NULL, flags = BC_PARSE_FLAG_ELSE;
// As long as going up the stack is valid for a dangling else, keep on.
for (i = 0; i < p->flags.len && BC_PARSE_BLOCK_STMT(flags); ++i)
{
fptr = bc_vec_item_rev(&p->flags, i);
flags = *fptr;
3 years ago
// If we need a brace and don't have one, then we don't have a
// delimiter.
if ((flags & BC_PARSE_FLAG_BRACE) && p->l.last != BC_LEX_RBRACE)
{
return false;
}
}
// Oh, and we had also better have an if statement somewhere.
good = ((flags & BC_PARSE_FLAG_IF) != 0);
}
else if (t == BC_LEX_RBRACE)
{
size_t i;
// Since we have a brace, we need to just check if a brace was needed.
for (i = 0; !good && i < p->flags.len; ++i)
{
uint16_t* fptr = bc_vec_item_rev(&p->flags, i);
good = (((*fptr) & BC_PARSE_FLAG_BRACE) != 0);
}
}
return good;
}
/**
* Returns true if we are in top level of a function body. The POSIX grammar
* is defined such that anything is allowed after a function body, so we must
* use this function to detect that case when ending a function body.
* @param p The parser.
* @return True if we are in the top level of parsing a function body.
*/
static bool
bc_parse_TopFunc(const BcParse* p)
{
bool good = p->flags.len == 2;
uint16_t val = BC_PARSE_FLAG_BRACE | BC_PARSE_FLAG_FUNC_INNER;
val |= BC_PARSE_FLAG_FUNC;
return good && BC_PARSE_TOP_FLAG(p) == val;
}
/**
* Sets a previously defined exit label. What are labels? See the bc Parsing
* section of the Development manual (manuals/development.md).
* @param p The parser.
*/
static void
bc_parse_setLabel(BcParse* p)
{
BcFunc* func = p->func;
BcInstPtr* ip = bc_vec_top(&p->exits);
size_t* label;
assert(func == bc_vec_item(&p->prog->fns, p->fidx));
// Set the preallocated label to the correct index.
label = bc_vec_item(&func->labels, ip->idx);
*label = func->code.len;
// Now, we don't need the exit label; it is done.
bc_vec_pop(&p->exits);
}
/**
* Creates a label and sets it to idx. If this is an exit label, then idx is
* actually invalid, but it doesn't matter because it will be fixed by
* bc_parse_setLabel() later.
* @param p The parser.
* @param idx The index of the label.
*/
static void
bc_parse_createLabel(BcParse* p, size_t idx)
{
bc_vec_push(&p->func->labels, &idx);
}
/**
* Creates a conditional label. Unlike an exit label, this label is set at
* creation time because it comes *before* the code that will target it.
* @param p The parser.
* @param idx The index of the label.
*/
static void
bc_parse_createCondLabel(BcParse* p, size_t idx)
{
bc_parse_createLabel(p, p->func->code.len);
bc_vec_push(&p->conds, &idx);
}
/*
* Creates an exit label to be filled in later by bc_parse_setLabel(). Also, why
* create a label to be filled in later? Because exit labels are meant to be
* targeted by code that comes *before* the label. Since we have to parse that
* code first, and don't know how long it will be, we need to just make sure to
* reserve a slot to be filled in later when we know.
*
* By the way, this uses BcInstPtr because it was convenient. The field idx
* holds the index, and the field func holds the loop boolean.
*
* @param p The parser.
* @param idx The index of the label's position.
* @param loop True if the exit label is for a loop or not.
*/
static void
bc_parse_createExitLabel(BcParse* p, size_t idx, bool loop)
{
BcInstPtr ip;
assert(p->func == bc_vec_item(&p->prog->fns, p->fidx));
ip.func = loop;
ip.idx = idx;
ip.len = 0;
bc_vec_push(&p->exits, &ip);
bc_parse_createLabel(p, SIZE_MAX);
}
/**
* Pops the correct operators off of the operator stack based on the current
* operator. This is because of the Shunting-Yard algorithm. Lower prec means
* higher precedence.
* @param p The parser.
* @param type The operator.
* @param start The previous start of the operator stack. For more
* information, see the bc Parsing section of the Development
* manual (manuals/development.md).
* @param nexprs A pointer to the current number of expressions that have not
* been consumed yet. This is an IN and OUT parameter.
*/
static void
bc_parse_operator(BcParse* p, BcLexType type, size_t start, size_t* nexprs)
{
BcLexType t;
4 years ago
uchar l, r = BC_PARSE_OP_PREC(type);
uchar left = BC_PARSE_OP_LEFT(type);
// While we haven't hit the stop point yet.
while (p->ops.len > start)
{
// Get the top operator.
t = BC_PARSE_TOP_OP(p);
// If it's a right paren, we have reached the end of whatever expression
// this is no matter what.
if (t == BC_LEX_LPAREN) break;
// Break for precedence. Precedence operates differently on left and
// right associativity, by the way. A left associative operator that
// matches the current precedence should take priority, but a right
// associative operator should not.
4 years ago
l = BC_PARSE_OP_PREC(t);
if (l >= r && (l != r || !left)) break;
// Do the housekeeping. In particular, make sure to note that one
// expression was consumed. (Two were, but another was added.)
bc_parse_push(p, BC_PARSE_TOKEN_INST(t));
bc_vec_pop(&p->ops);
*nexprs -= !BC_PARSE_OP_PREFIX(t);
}
bc_vec_push(&p->ops, &type);
}
/**
* Parses a right paren. In the Shunting-Yard algorithm, it needs to be put on
* the operator stack. But before that, it needs to consume whatever operators
* there are until it hits a left paren.
* @param p The parser.
* @param nexprs A pointer to the current number of expressions that have not
* been consumed yet. This is an IN and OUT parameter.
*/
static void
bc_parse_rightParen(BcParse* p, size_t* nexprs)
{
BcLexType top;
// Consume operators until a left paren.
while ((top = BC_PARSE_TOP_OP(p)) != BC_LEX_LPAREN)
{
bc_parse_push(p, BC_PARSE_TOKEN_INST(top));
bc_vec_pop(&p->ops);
*nexprs -= !BC_PARSE_OP_PREFIX(top);
}
// We need to pop the left paren as well.
bc_vec_pop(&p->ops);
// Oh, and we also want the next token.
bc_lex_next(&p->l);
}
/**
* Parses function arguments.
* @param p The parser.
* @param flags Flags restricting what kind of expressions the arguments can
* be.
*/
static void
bc_parse_args(BcParse* p, uint8_t flags)
{
bool comma = false;
size_t nargs;
bc_lex_next(&p->l);
// Print and comparison operators not allowed. Well, comparison operators
// only for POSIX. But we do allow arrays, and we *must* get a value.
flags &= ~(BC_PARSE_PRINT | BC_PARSE_REL);
flags |= (BC_PARSE_ARRAY | BC_PARSE_NEEDVAL);
// Count the arguments and parse them.
for (nargs = 0; p->l.t != BC_LEX_RPAREN; ++nargs)
{
bc_parse_expr_status(p, flags, bc_parse_next_arg);
comma = (p->l.t == BC_LEX_COMMA);
if (comma) bc_lex_next(&p->l);
}
// An ending comma is FAIL.
if (BC_ERR(comma)) bc_parse_err(p, BC_ERR_PARSE_TOKEN);
// Now do the call with the number of arguments.
bc_parse_push(p, BC_INST_CALL);
bc_parse_pushIndex(p, nargs);
}
/**
* Parses a function call.
* @param p The parser.
* @param flags Flags restricting what kind of expressions the arguments can
* be.
*/
static void
bc_parse_call(BcParse* p, const char* name, uint8_t flags)
{
size_t idx;
bc_parse_args(p, flags);
// We just assert this because bc_parse_args() should
// ensure that the next token is what it should be.
assert(p->l.t == BC_LEX_RPAREN);
// We cannot use bc_program_insertFunc() here
// because it will overwrite an existing function.
idx = bc_map_index(&p->prog->fn_map, name);
// The function does not exist yet. Create a space for it. If the user does
// not define it, it's a *runtime* error, not a parse error.
if (idx == BC_VEC_INVALID_IDX)
{
idx = bc_program_insertFunc(p->prog, name);
2 years ago
assert(idx != BC_VEC_INVALID_IDX);
// Make sure that this pointer was not invalidated.
p->func = bc_vec_item(&p->prog->fns, p->fidx);
}
// The function exists, so set the right function index.
else idx = ((BcId*) bc_vec_item(&p->prog->fn_map, idx))->idx;
bc_parse_pushIndex(p, idx);
// Make sure to get the next token.
bc_lex_next(&p->l);
}
/**
* Parses a name/identifier-based expression. It could be a variable, an array
* element, an array itself (for function arguments), a function call, etc.
*
*/
static void
bc_parse_name(BcParse* p, BcInst* type, bool* can_assign, uint8_t flags)
{
char* name;
BC_SIG_ASSERT_LOCKED;
// We want a copy of the name since the lexer might overwrite its copy.
name = bc_vm_strdup(p->l.str.v);
BC_SETJMP_LOCKED(vm, err);
// We need the next token to see if it's just a variable or something more.
bc_lex_next(&p->l);
// Array element or array.
if (p->l.t == BC_LEX_LBRACKET)
{
bc_lex_next(&p->l);
// Array only. This has to be a function parameter.
if (p->l.t == BC_LEX_RBRACKET)
{
// Error if arrays are not allowed.
if (BC_ERR(!(flags & BC_PARSE_ARRAY)))
{
bc_parse_err(p, BC_ERR_PARSE_EXPR);
}
*type = BC_INST_ARRAY;
*can_assign = false;
}
else
{
// If we are here, we have an array element. We need to set the
// expression parsing flags.
uint8_t flags2 = (flags & ~(BC_PARSE_PRINT | BC_PARSE_REL)) |
BC_PARSE_NEEDVAL;
bc_parse_expr_status(p, flags2, bc_parse_next_elem);
// The next token *must* be a right bracket.
if (BC_ERR(p->l.t != BC_LEX_RBRACKET))
{
bc_parse_err(p, BC_ERR_PARSE_TOKEN);
}
*type = BC_INST_ARRAY_ELEM;
*can_assign = true;
}
// Make sure to get the next token.
bc_lex_next(&p->l);
// Push the instruction and the name of the identifier.
bc_parse_push(p, *type);
bc_parse_pushName(p, name, false);
}
else if (p->l.t == BC_LEX_LPAREN)
{
// We are parsing a function call; error if not allowed.
if (BC_ERR(flags & BC_PARSE_NOCALL))
{
bc_parse_err(p, BC_ERR_PARSE_TOKEN);
}
*type = BC_INST_CALL;
*can_assign = false;
bc_parse_call(p, name, flags);
}
else
{
// Just a variable.
*type = BC_INST_VAR;
*can_assign = true;
4 years ago
bc_parse_push(p, BC_INST_VAR);
bc_parse_pushName(p, name, true);
}
err:
// Need to make sure to unallocate the name.
free(name);
BC_LONGJMP_CONT(vm);
BC_SIG_MAYLOCK;
}
/**
* Parses a builtin function that takes no arguments. This includes read(),
* rand(), maxibase(), maxobase(), maxscale(), and maxrand().
* @param p The parser.
* @param inst The instruction corresponding to the builtin.
*/
static void
bc_parse_noArgBuiltin(BcParse* p, BcInst inst)
{
// Must have a left paren.
bc_lex_next(&p->l);
if (BC_ERR(p->l.t != BC_LEX_LPAREN)) bc_parse_err(p, BC_ERR_PARSE_TOKEN);
// Must have a right paren.
bc_lex_next(&p->l);
if ((p->l.t != BC_LEX_RPAREN)) bc_parse_err(p, BC_ERR_PARSE_TOKEN);
bc_parse_push(p, inst);
bc_lex_next(&p->l);
}
/**
* Parses a builtin function that takes 1 argument. This includes length(),
* sqrt(), abs(), scale(), and irand().
* @param p The parser.
* @param type The lex token.
* @param flags The expression parsing flags for parsing the argument.
* @param prev An out parameter; the previous instruction pointer.
*/
static void
bc_parse_builtin(BcParse* p, BcLexType type, uint8_t flags, BcInst* prev)
{
// Must have a left paren.
bc_lex_next(&p->l);
if (BC_ERR(p->l.t != BC_LEX_LPAREN)) bc_parse_err(p, BC_ERR_PARSE_TOKEN);
bc_lex_next(&p->l);
// Change the flags as needed for parsing the argument.
flags &= ~(BC_PARSE_PRINT | BC_PARSE_REL);
flags |= BC_PARSE_NEEDVAL;
// Since length can take arrays, we need to specially add that flag.
3 years ago
if (type == BC_LEX_KW_LENGTH) flags |= BC_PARSE_ARRAY;
bc_parse_expr_status(p, flags, bc_parse_next_rel);
// Must have a right paren.
if (BC_ERR(p->l.t != BC_LEX_RPAREN)) bc_parse_err(p, BC_ERR_PARSE_TOKEN);
// Adjust previous based on the token and push it.
3 years ago
*prev = type - BC_LEX_KW_LENGTH + BC_INST_LENGTH;
bc_parse_push(p, *prev);
bc_lex_next(&p->l);
}
/**
* Parses a builtin function that takes 3 arguments. This includes modexp() and
* divmod().
*/
static void
bc_parse_builtin3(BcParse* p, BcLexType type, uint8_t flags, BcInst* prev)
{
assert(type == BC_LEX_KW_MODEXP || type == BC_LEX_KW_DIVMOD);
// Must have a left paren.
bc_lex_next(&p->l);
if (BC_ERR(p->l.t != BC_LEX_LPAREN)) bc_parse_err(p, BC_ERR_PARSE_TOKEN);
bc_lex_next(&p->l);
// Change the flags as needed for parsing the argument.
flags &= ~(BC_PARSE_PRINT | BC_PARSE_REL);
flags |= BC_PARSE_NEEDVAL;
bc_parse_expr_status(p, flags, bc_parse_next_builtin);
// Must have a comma.
if (BC_ERR(p->l.t != BC_LEX_COMMA)) bc_parse_err(p, BC_ERR_PARSE_TOKEN);
bc_lex_next(&p->l);
bc_parse_expr_status(p, flags, bc_parse_next_builtin);
// Must have a comma.
if (BC_ERR(p->l.t != BC_LEX_COMMA)) bc_parse_err(p, BC_ERR_PARSE_TOKEN);
bc_lex_next(&p->l);
// If it is a divmod, parse an array name. Otherwise, just parse another
// expression.
if (type == BC_LEX_KW_DIVMOD)
{
// Must have a name.
if (BC_ERR(p->l.t != BC_LEX_NAME)) bc_parse_err(p, BC_ERR_PARSE_TOKEN);
// This is safe because the next token should not overwrite the name.
bc_lex_next(&p->l);
// Must have a left bracket.
if (BC_ERR(p->l.t != BC_LEX_LBRACKET))
{
bc_parse_err(p, BC_ERR_PARSE_TOKEN);
}
// This is safe because the next token should not overwrite the name.
bc_lex_next(&p->l);
// Must have a right bracket.
if (BC_ERR(p->l.t != BC_LEX_RBRACKET))
{
bc_parse_err(p, BC_ERR_PARSE_TOKEN);
}
// This is safe because the next token should not overwrite the name.
bc_lex_next(&p->l);
}
else bc_parse_expr_status(p, flags, bc_parse_next_rel);
// Must have a right paren.
if (BC_ERR(p->l.t != BC_LEX_RPAREN)) bc_parse_err(p, BC_ERR_PARSE_TOKEN);
// Adjust previous based on the token and push it.
*prev = type - BC_LEX_KW_MODEXP + BC_INST_MODEXP;
bc_parse_push(p, *prev);
// If we have divmod, we need to assign the modulus to the array element, so
// we need to push the instructions for doing so.
if (type == BC_LEX_KW_DIVMOD)
{
// The zeroth element.
bc_parse_push(p, BC_INST_ZERO);
bc_parse_push(p, BC_INST_ARRAY_ELEM);
// Push the array.
bc_parse_pushName(p, p->l.str.v, false);
// Swap them and assign. After this, the top item on the stack should
// be the quotient.
bc_parse_push(p, BC_INST_SWAP);
bc_parse_push(p, BC_INST_ASSIGN_NO_VAL);
}
bc_lex_next(&p->l);
}
/**
* Parses the scale keyword. This is special because scale can be a value or a
* builtin function.
* @param p The parser.
* @param type An out parameter; the instruction for the parse.
* @param can_assign An out parameter; whether the expression can be assigned
* to.
* @param flags The expression parsing flags for parsing a scale() arg.
*/
static void
bc_parse_scale(BcParse* p, BcInst* type, bool* can_assign, uint8_t flags)
{
bc_lex_next(&p->l);
// Without the left paren, it's just the keyword.
if (p->l.t != BC_LEX_LPAREN)
{
// Set, push, and return.
*type = BC_INST_SCALE;
*can_assign = true;
bc_parse_push(p, BC_INST_SCALE);
return;
}
// Handle the scale function.
*type = BC_INST_SCALE_FUNC;
*can_assign = false;
// Once again, adjust the flags.
flags &= ~(BC_PARSE_PRINT | BC_PARSE_REL);
flags |= BC_PARSE_NEEDVAL;
bc_lex_next(&p->l);
bc_parse_expr_status(p, flags, bc_parse_next_rel);
// Must have a right paren.
if (BC_ERR(p->l.t != BC_LEX_RPAREN)) bc_parse_err(p, BC_ERR_PARSE_TOKEN);
bc_parse_push(p, BC_INST_SCALE_FUNC);
bc_lex_next(&p->l);
}
/**
* Parses and increment or decrement operator. This is a bit complex.
* @param p The parser.
* @param prev An out parameter; the previous instruction pointer.
* @param can_assign An out parameter; whether the expression can be assigned
* to.
* @param nexs An in/out parameter; the number of expressions in the
* parse tree that are not used.
* @param flags The expression parsing flags for parsing a scale() arg.
*/
static void
bc_parse_incdec(BcParse* p, BcInst* prev, bool* can_assign, size_t* nexs,
uint8_t flags)
{
BcLexType type;
uchar inst;
BcInst etype = *prev;
BcLexType last = p->l.last;
3 years ago
assert(prev != NULL && can_assign != NULL);
// If we can't assign to the previous token, then we have an error.
4 years ago
if (BC_ERR(last == BC_LEX_OP_INC || last == BC_LEX_OP_DEC ||
last == BC_LEX_RPAREN))
{
bc_parse_err(p, BC_ERR_PARSE_ASSIGN);
4 years ago
}
// Is the previous instruction for a variable?
if (BC_PARSE_INST_VAR(etype))
{
// If so, this is a postfix operator.
if (!*can_assign) bc_parse_err(p, BC_ERR_PARSE_ASSIGN);
// Only postfix uses BC_INST_INC and BC_INST_DEC.
*prev = inst = BC_INST_INC + (p->l.t != BC_LEX_OP_INC);
bc_parse_push(p, inst);
bc_lex_next(&p->l);
*can_assign = false;
}
else
{
// This is a prefix operator. In that case, we just convert it to
// an assignment instruction.
*prev = inst = BC_INST_ASSIGN_PLUS + (p->l.t != BC_LEX_OP_INC);
bc_lex_next(&p->l);
type = p->l.t;
// Because we parse the next part of the expression