You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
382 lines
8.4 KiB
382 lines
8.4 KiB
/* |
|
* ***************************************************************************** |
|
* |
|
* SPDX-License-Identifier: BSD-2-Clause |
|
* |
|
* Copyright (c) 2018-2021 Gavin D. Howard and contributors. |
|
* |
|
* Redistribution and use in source and binary forms, with or without |
|
* modification, are permitted provided that the following conditions are met: |
|
* |
|
* * Redistributions of source code must retain the above copyright notice, this |
|
* list of conditions and the following disclaimer. |
|
* |
|
* * Redistributions in binary form must reproduce the above copyright notice, |
|
* this list of conditions and the following disclaimer in the documentation |
|
* and/or other materials provided with the distribution. |
|
* |
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
|
* POSSIBILITY OF SUCH DAMAGE. |
|
* |
|
* ***************************************************************************** |
|
* |
|
* Common code for the lexers. |
|
* |
|
*/ |
|
|
|
#include <assert.h> |
|
#include <ctype.h> |
|
#include <stdbool.h> |
|
#include <string.h> |
|
|
|
#include <lex.h> |
|
#include <vm.h> |
|
#include <bc.h> |
|
|
|
void |
|
bc_lex_invalidChar(BcLex* l, char c) |
|
{ |
|
l->t = BC_LEX_INVALID; |
|
bc_lex_verr(l, BC_ERR_PARSE_CHAR, c); |
|
} |
|
|
|
void |
|
bc_lex_lineComment(BcLex* l) |
|
{ |
|
l->t = BC_LEX_WHITESPACE; |
|
while (l->i < l->len && l->buf[l->i] != '\n') |
|
{ |
|
l->i += 1; |
|
} |
|
} |
|
|
|
void |
|
bc_lex_comment(BcLex* l) |
|
{ |
|
size_t i, nlines = 0; |
|
const char* buf; |
|
bool end = false, got_more; |
|
char c; |
|
|
|
l->i += 1; |
|
l->t = BC_LEX_WHITESPACE; |
|
|
|
// This loop is complex because it might need to request more data from |
|
// stdin if the comment is not ended. This loop is taken until the comment |
|
// is finished or we have EOF. |
|
do |
|
{ |
|
buf = l->buf; |
|
got_more = false; |
|
|
|
// If we are in stdin mode, the buffer must be the one used for stdin. |
|
assert(!vm->is_stdin || buf == vm->buffer.v); |
|
|
|
// Find the end of the comment. |
|
for (i = l->i; !end; i += !end) |
|
{ |
|
// While we don't have an asterisk, eat, but increment nlines. |
|
for (; (c = buf[i]) && c != '*'; ++i) |
|
{ |
|
nlines += (c == '\n'); |
|
} |
|
|
|
// If this is true, we need to request more data. |
|
if (BC_ERR(!c || buf[i + 1] == '\0')) |
|
{ |
|
// Read more, if possible. |
|
if (!vm->eof && (l->is_stdin || l->is_exprs)) |
|
{ |
|
got_more = bc_lex_readLine(l); |
|
} |
|
|
|
break; |
|
} |
|
|
|
// If this turns true, we found the end. Yay! |
|
end = (buf[i + 1] == '/'); |
|
} |
|
} |
|
while (got_more && !end); |
|
|
|
// If we didn't find the end, barf. |
|
if (!end) |
|
{ |
|
l->i = i; |
|
bc_lex_err(l, BC_ERR_PARSE_COMMENT); |
|
} |
|
|
|
l->i = i + 2; |
|
l->line += nlines; |
|
} |
|
|
|
void |
|
bc_lex_whitespace(BcLex* l) |
|
{ |
|
char c; |
|
|
|
l->t = BC_LEX_WHITESPACE; |
|
|
|
// Eat. We don't eat newlines because they can be special. |
|
for (c = l->buf[l->i]; c != '\n' && isspace(c); c = l->buf[++l->i]) |
|
{ |
|
continue; |
|
} |
|
} |
|
|
|
void |
|
bc_lex_commonTokens(BcLex* l, char c) |
|
{ |
|
if (!c) l->t = BC_LEX_EOF; |
|
else if (c == '\n') l->t = BC_LEX_NLINE; |
|
else bc_lex_whitespace(l); |
|
} |
|
|
|
/** |
|
* Parses a number. |
|
* @param l The lexer. |
|
* @param start The start character. |
|
* @param int_only Whether this function should only look for an integer. This |
|
* is used to implement the exponent of scientific notation. |
|
*/ |
|
static size_t |
|
bc_lex_num(BcLex* l, char start, bool int_only) |
|
{ |
|
const char* buf = l->buf + l->i; |
|
size_t i; |
|
char c; |
|
bool last_pt, pt = (start == '.'); |
|
|
|
// This loop looks complex. It is not. It is asking if the character is not |
|
// a nul byte and it if it a valid num character based on what we have found |
|
// thus far, or whether it is a backslash followed by a newline. I can do |
|
// i+1 on the buffer because the buffer must have a nul byte. |
|
for (i = 0; (c = buf[i]) && (BC_LEX_NUM_CHAR(c, pt, int_only) || |
|
(c == '\\' && buf[i + 1] == '\n')); |
|
++i) |
|
{ |
|
// I don't need to test that the next character is a newline because |
|
// the loop condition above ensures that. |
|
if (c == '\\') |
|
{ |
|
i += 2; |
|
|
|
// Make sure to eat whitespace at the beginning of the line. |
|
while (isspace(buf[i]) && buf[i] != '\n') |
|
{ |
|
i += 1; |
|
} |
|
|
|
c = buf[i]; |
|
|
|
// If the next character is not a number character, bail. |
|
if (!BC_LEX_NUM_CHAR(c, pt, int_only)) break; |
|
} |
|
|
|
// Did we find the radix point? |
|
last_pt = (c == '.'); |
|
|
|
// If we did, and we already have one, then break because it's not part |
|
// of this number. |
|
if (pt && last_pt) break; |
|
|
|
// Set whether we have found a radix point. |
|
pt = pt || last_pt; |
|
|
|
bc_vec_push(&l->str, &c); |
|
} |
|
|
|
return i; |
|
} |
|
|
|
void |
|
bc_lex_number(BcLex* l, char start) |
|
{ |
|
l->t = BC_LEX_NUMBER; |
|
|
|
// Make sure the string is clear. |
|
bc_vec_popAll(&l->str); |
|
bc_vec_push(&l->str, &start); |
|
|
|
// Parse the number. |
|
l->i += bc_lex_num(l, start, false); |
|
|
|
#if BC_ENABLE_EXTRA_MATH |
|
{ |
|
char c = l->buf[l->i]; |
|
|
|
// Do we have a number in scientific notation? |
|
if (c == 'e') |
|
{ |
|
#if BC_ENABLED |
|
// Barf for POSIX. |
|
if (BC_IS_POSIX) bc_lex_err(l, BC_ERR_POSIX_EXP_NUM); |
|
#endif // BC_ENABLED |
|
|
|
// Push the e. |
|
bc_vec_push(&l->str, &c); |
|
l->i += 1; |
|
c = l->buf[l->i]; |
|
|
|
// Check for negative specifically because bc_lex_num() does not. |
|
if (c == BC_LEX_NEG_CHAR) |
|
{ |
|
bc_vec_push(&l->str, &c); |
|
l->i += 1; |
|
c = l->buf[l->i]; |
|
} |
|
|
|
// We must have a number character, so barf if not. |
|
if (BC_ERR(!BC_LEX_NUM_CHAR(c, false, true))) |
|
{ |
|
bc_lex_verr(l, BC_ERR_PARSE_CHAR, c); |
|
} |
|
|
|
// Parse the exponent. |
|
l->i += bc_lex_num(l, 0, true); |
|
} |
|
} |
|
#endif // BC_ENABLE_EXTRA_MATH |
|
|
|
bc_vec_pushByte(&l->str, '\0'); |
|
} |
|
|
|
void |
|
bc_lex_name(BcLex* l) |
|
{ |
|
size_t i = 0; |
|
const char* buf = l->buf + l->i - 1; |
|
char c = buf[i]; |
|
|
|
l->t = BC_LEX_NAME; |
|
|
|
// Should be obvious. It's looking for valid characters. |
|
while ((c >= 'a' && c <= 'z') || isdigit(c) || c == '_') |
|
{ |
|
c = buf[++i]; |
|
} |
|
|
|
// Set the string to the identifier. |
|
bc_vec_string(&l->str, i, buf); |
|
|
|
// Increment the index. We minus 1 because it has already been incremented. |
|
l->i += i - 1; |
|
} |
|
|
|
void |
|
bc_lex_init(BcLex* l) |
|
{ |
|
BC_SIG_ASSERT_LOCKED; |
|
assert(l != NULL); |
|
bc_vec_init(&l->str, sizeof(char), BC_DTOR_NONE); |
|
} |
|
|
|
void |
|
bc_lex_free(BcLex* l) |
|
{ |
|
BC_SIG_ASSERT_LOCKED; |
|
assert(l != NULL); |
|
bc_vec_free(&l->str); |
|
} |
|
|
|
void |
|
bc_lex_file(BcLex* l, const char* file) |
|
{ |
|
assert(l != NULL && file != NULL); |
|
l->line = 1; |
|
vm->file = file; |
|
} |
|
|
|
void |
|
bc_lex_next(BcLex* l) |
|
{ |
|
BC_SIG_ASSERT_LOCKED; |
|
|
|
assert(l != NULL); |
|
|
|
l->last = l->t; |
|
|
|
// If this wasn't here, the line number would be off. |
|
l->line += (l->i != 0 && l->buf[l->i - 1] == '\n'); |
|
|
|
// If the last token was EOF, someone called this one too many times. |
|
if (BC_ERR(l->last == BC_LEX_EOF)) bc_lex_err(l, BC_ERR_PARSE_EOF); |
|
|
|
l->t = BC_LEX_EOF; |
|
|
|
// We are done if this is true. |
|
if (l->i == l->len) return; |
|
|
|
// Loop until failure or we don't have whitespace. This |
|
// is so the parser doesn't get inundated with whitespace. |
|
do |
|
{ |
|
vm->next(l); |
|
} |
|
while (l->t == BC_LEX_WHITESPACE); |
|
} |
|
|
|
/** |
|
* Updates the buffer and len so that they are not invalidated when the stdin |
|
* buffer grows. |
|
* @param l The lexer. |
|
* @param text The text. |
|
* @param len The length of the text. |
|
*/ |
|
static void |
|
bc_lex_fixText(BcLex* l, const char* text, size_t len) |
|
{ |
|
l->buf = text; |
|
l->len = len; |
|
} |
|
|
|
bool |
|
bc_lex_readLine(BcLex* l) |
|
{ |
|
bool good; |
|
|
|
// These are reversed because they should be already locked, but |
|
// bc_vm_readLine() needs them to be unlocked. |
|
BC_SIG_UNLOCK; |
|
|
|
// Make sure we read from the appropriate place. |
|
if (l->is_stdin) good = bc_vm_readLine(false); |
|
else |
|
{ |
|
assert(l->is_exprs); |
|
good = bc_vm_readBuf(false); |
|
} |
|
|
|
BC_SIG_LOCK; |
|
|
|
bc_lex_fixText(l, vm->buffer.v, vm->buffer.len - 1); |
|
|
|
return good; |
|
} |
|
|
|
void |
|
bc_lex_text(BcLex* l, const char* text, bool is_stdin, bool is_exprs) |
|
{ |
|
BC_SIG_ASSERT_LOCKED; |
|
|
|
assert(l != NULL && text != NULL); |
|
|
|
bc_lex_fixText(l, text, strlen(text)); |
|
l->i = 0; |
|
l->t = l->last = BC_LEX_INVALID; |
|
l->is_stdin = is_stdin; |
|
l->is_exprs = is_exprs; |
|
|
|
assert(!l->is_stdin || !l->is_exprs); |
|
|
|
bc_lex_next(l); |
|
}
|
|
|