Improve the UTF-8 handling

This removes some inefficiency, which will be important in these
hotspots.

Signed-off-by: Gavin Howard <gavin@yzena.com>
master
Gavin D. Howard 4 months ago
parent 6d4387e9fe
commit 06e112af66
Signed by: gavin
GPG Key ID: F890265DD80E4E90

@ -51,6 +51,7 @@
set(YC_LANG_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/char.c"
"${CMAKE_CURRENT_SOURCE_DIR}/file.c"
"${CMAKE_CURRENT_SOURCE_DIR}/lex.c"
"${CMAKE_CURRENT_SOURCE_DIR}/parse.c"

@ -0,0 +1,94 @@
/**
* ***** BEGIN LICENSE BLOCK *****
*
* Copyright 2017-2023 Yzena, LLC
*
* Licensed under the Yzena Viral User License, Version 0.1 (the "Yzena Viral
* User License" or "YVUL"), the GNU Affero General Public License (the "GNU
* AGPL"), Version 3.0, and the Server Side Public License (the "SSPL"),
* Version 1. You may not use this file except in compliance with all of those
* licenses.
*
* You may obtain a copy of the Yzena Viral User License at
*
* https://yzena.com/yzena-viral-user-license/
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the Yzena Viral User License is distributed under the
* following disclaimer:
*
* As far as the law allows, this software comes as is, without any
* warranty or condition, and no contributor will be liable to anyone for
* any damages related to this software or this license, under any kind of
* legal claim.
*
* You may obtain a copy of the GNU Affero General Public License at
*
* https://www.gnu.org/licenses/agpl-3.0.html
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the GNU Affero General Public License is distributed under
* the following disclaimer:
*
* This software is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero
* General Public License for more details.
*
* You may obtain a copy of the Server Side Public License at
*
* https://www.mongodb.com/licensing/server-side-public-license
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the Server Side Public License is distributed under the
* following disclaimer:
*
* This software is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Server
* Side Public License for more details.
*
* ****** END LICENSE BLOCK ******
*
* *****************************************************************
*
* ******* BEGIN FILE DESCRIPTION *******
*
* Code for lexer characters. This is here because the length is stored
* specially.
*
* ******** END FILE DESCRIPTION ********
*/
#include "lang.h"
#include "../concurrency/strucon.h"
#include "../util/utf8.h"
#include <string.h>
/**
 * Resets a lexer character by setting every byte of the struct to zero.
 * @param c  The character to clear.
 */
void
y_lang_char_zero(y_lang_Char* c)
{
	y_call_dbg();

	// Clear the whole object in one call so that no member keeps a stale
	// value from a previously lexed character.
	// NOLINTNEXTLINE
	memset(c, 0, sizeof(*c));

	y_return_void_dbg;
}
/**
 * Fills in the codepoint of a lexer character from the bytes and the byte
 * length that are already stored in it.
 * @param c  The character whose bytes and len fields are read and whose
 *           codepoint field is written.
 * @return   The status reported by y_utf8_codepoint_len().
 */
y_Status
y_lang_char_initCodepoint(y_lang_Char* c)
{
	y_Status status;

	y_call_dbg();

	// The decoder is handed the stored length directly, so it must already
	// be a legal UTF-8 sequence length.
	y_assert(c->len >= 1 && c->len <= 4,
	         "Len is not a valid length for a Unicode codepoint");

	// Decode straight into the character's own codepoint field.
	status = y_utf8_codepoint_len(c->bytes, c->len, &c->c);

	y_return_dbg(status);
}

@ -94,8 +94,7 @@ y_lang_file_nextChar(y_lang_LexFile* f)
}
else
{
f->nextc.c = 0;
f->nextc.len = 0;
y_lang_char_zero(&f->nextc);
s = y_STATUS_SUCCESS;
}
@ -104,7 +103,7 @@ y_lang_file_nextChar(y_lang_LexFile* f)
{
y_uint i;
f->nextc.len = (y_uchar) y_utf8_len(f->nextc.bytes[0]);
f->nextc.len = y_utf8_len(f->nextc.bytes[0]);
// If this is true, we should have the replacement character. Just add
// an error for an invalid character.
@ -154,6 +153,8 @@ y_lang_file_nextChar(y_lang_LexFile* f)
break;
}
}
s = y_lang_char_initCodepoint(&f->nextc);
}
y_return_dbg(s);

@ -337,6 +337,9 @@ typedef struct y_lang_TokenSemantics
} y_lang_TokenSemantics;
/**
* A struct to hold everything about a Unicode character that was lexed.
*/
typedef struct y_lang_Char
{
/// The actual codepoint.
@ -350,6 +353,40 @@ typedef struct y_lang_Char
} y_lang_Char;
/**
 * Zeros a character. Every byte of the struct is cleared, so all of its
 * fields read as zero afterwards.
 * @param c  The character to zero.
 * @pre      @a c must not be NULL.
 */
void
y_lang_char_zero(y_lang_Char* c) y_allnonnull y_inline;
/**
 * Initializes the codepoint of a char from its byte array and its length.
 * The bytes and the length must already be stored in @a c; they are decoded
 * as UTF-8 and the resulting codepoint is stored back into @a c.
 * @param c  The character to initialize.
 * @return   An error code, if any, as reported by the UTF-8 decoder.
 * @pre      @a c must not be NULL.
 * @pre      The length stored in @a c must be 1, 2, 3, or 4 (asserted in the
 *           implementation).
 */
y_Status
y_lang_char_initCodepoint(y_lang_Char* c) y_allnonnull y_nodiscard y_inline;
/**
 * Returns the byte length of the char.
 * NOTE(review): presumably this is the number of bytes in the character's
 * UTF-8 encoding; the definition is not visible here, so confirm.
 * @param c  The character.
 * @return   The byte length of the char.
 * @pre      @a c must not be NULL.
 */
y_u32
y_lang_char_len(y_lang_Char* c) y_allnonnull y_nodiscard y_inline;
/**
 * Returns the codepoint of the char.
 * NOTE(review): presumably only meaningful after the codepoint has been
 * initialized, e.g. by y_lang_char_initCodepoint(); confirm against the
 * definition.
 * @param c  The character.
 * @return   The codepoint of the char.
 * @pre      @a c must not be NULL.
 */
y_u32
y_lang_char_codepoint(y_lang_Char* c) y_allnonnull y_nodiscard y_inline;
/**
* A token.
*

@ -69,36 +69,24 @@
// This is the lookup table of lengths. The last entry is specifically for
// invalid characters that will return U+FFFD, the replacement character.
static const y_uint y_utf8_lens[32] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
2, 2, 2, 3, 3, 4, 0,
static const y_u32 y_utf8_lens[32] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 0,
};
y_Status
y_utf8_codepoint(const char* s, y_usize len, y_u32* cp, y_usize* rlen)
y_utf8_codepoint_len(const y_uchar* s, y_u32 len, y_u32* cp)
{
y_Status status;
y_uchar byte;
y_call_dbg();
if (!len)
{
*cp = 0;
*rlen = 1;
y_return_dbg(y_STATUS_SUCCESS);
}
byte = (y_uchar) s[0];
*rlen = (y_usize) y_utf8_len(byte);
y_assert(*rlen <= 4, "Length is not 0, 1, 2, 3, or 4");
y_assert(len >= 1 && len <= 4, "Length is not 1, 2, 3, or 4");
// This is literally the UTF-8 decoding algorithm. Look that up if you don't
// understand this.
switch (*rlen)
switch (len)
{
case 0:
{
@ -109,7 +97,7 @@ y_utf8_codepoint(const char* s, y_usize len, y_u32* cp, y_usize* rlen)
case 1:
{
*cp = byte;
*cp = (y_uchar) s[0];
break;
}
@ -165,6 +153,30 @@ y_utf8_codepoint(const char* s, y_usize len, y_u32* cp, y_usize* rlen)
y_return_dbg(y_STATUS_SUCCESS);
}
/**
 * Decodes the UTF-8 sequence at the start of a string into a codepoint.
 *
 * An empty input (@a len of 0) is reported as codepoint 0 with a returned
 * length of 1 and a success status. Otherwise the sequence length is derived
 * from the first byte with y_utf8_len(), stored in @a rlen, and the decoding
 * itself is delegated to y_utf8_codepoint_len().
 *
 * NOTE(review): beyond the zero check, @a len is not compared against the
 * derived sequence length in this function; confirm that callers guarantee
 * the buffer holds that many bytes.
 * NOTE(review): the length lookup table has a 0 entry for invalid lead
 * bytes; confirm that y_utf8_codepoint_len() is meant to receive a length
 * of 0 in that case.
 *
 * @param s     The string to decode from.
 * @param len   The length of the string.
 * @param cp    An out parameter for the codepoint.
 * @param rlen  An out parameter for the byte length of the sequence.
 * @return      The status of the decode.
 */
y_Status
y_utf8_codepoint(const char* s, y_usize len, y_u32* cp, y_usize* rlen)
{
	y_Status result;
	y_uchar lead;

	y_call_dbg();

	// Nothing to decode: hand back a zero codepoint.
	if (len == 0)
	{
		*cp = 0;
		*rlen = 1;
		y_return_dbg(y_STATUS_SUCCESS);
	}

	// The first byte alone determines how long the sequence is.
	lead = (y_uchar) s[0];
	*rlen = (y_usize) y_utf8_len(lead);

	result = y_utf8_codepoint_len((const y_uchar*) s, (y_u32) *rlen, cp);

	y_return_dbg(result);
}
y_usize
y_utf8_codepointToUtf8(y_u32 cp, char* s, y_usize len)
{
@ -233,7 +245,7 @@ y_utf8_codepointToUtf8(y_u32 cp, char* s, y_usize len)
y_return_dbg(rlen);
}
y_uint
y_u32
y_utf8_len(y_uchar c)
{
return y_utf8_lens[c >> (CHAR_BIT - 5)];

@ -88,7 +88,7 @@ extern "C"
#endif // YC_CLANG
/**
* Converts UTF-8 to a Unicode code point.
* Converts a UTF-8 codepoint to a Unicode code point.
* @param s The string.
* @param len The length of the string.
* @param cp An out parameter for the codepoint.
@ -103,6 +103,19 @@ y_Status
y_utf8_codepoint(const char* s, y_usize len, y_u32* cp,
y_usize* rlen) y_allnonnull y_nodiscard;
/**
 * Decodes a single UTF-8 encoded sequence, whose byte length is already
 * known, into a Unicode code point. Unlike y_utf8_codepoint(), the length is
 * supplied by the caller.
 * @param s    The bytes of the encoded sequence.
 * @param len  The length of the codepoint, in bytes. The implementation
 *             asserts that this is 1, 2, 3, or 4.
 * @param cp   An out parameter for the codepoint.
 * @return     An error code, if the bytes in the codepoint are not valid
 *             UTF-8.
 * @pre        @a s must not be NULL.
 * @pre        @a cp must not be NULL.
 */
y_Status
y_utf8_codepoint_len(const y_uchar* s, y_u32 len,
y_u32* cp) y_allnonnull y_nodiscard;
/**
* Converts a Unicode codepoint to UTF-8.
* @param cp The codepoint.
@ -121,7 +134,7 @@ y_utf8_codepointToUtf8(y_u32 cp, char* s, y_usize len) y_allnonnull y_nodiscard;
* @param c The first character of a UTF-8 codepoint.
* @return The length of the codepoint.
*/
y_uint
y_u32
y_utf8_len(y_uchar c) y_nodiscard y_const;
/**

Loading…
Cancel
Save