diff options
| author | dam <dam@gudinoff> | 2024-04-07 23:23:06 +0100 |
|---|---|---|
| committer | dam <dam@gudinoff> | 2024-04-07 23:23:06 +0100 |
| commit | 353d8b1145db12ffc42e4f6c2148a848ef8bba84 (patch) | |
| tree | afbf51e66a9e7eaba04b6659b224187bc7a95c75 /modules/UTF8.jai | |
| parent | a3cf506defb4a01759db6f920a363e23de63e984 (diff) | |
| download | task-time-tracker-353d8b1145db12ffc42e4f6c2148a848ef8bba84.tar.zst task-time-tracker-353d8b1145db12ffc42e4f6c2148a848ef8bba84.zip | |
Preparing for UTF8 support on read_input_line.
Diffstat (limited to 'modules/UTF8.jai')
| -rw-r--r-- | modules/UTF8.jai | 71 |
1 files changed, 71 insertions, 0 deletions
// Contents of modules/UTF8.jai as added by the patch
// `diff --git a/modules/UTF8.jai b/modules/UTF8.jai` (new file, index 0000000..fac1326).

// Returns true when `byte` is a UTF8 continuation byte, i.e. it has the bit
// pattern 10XX XXXX:
//   (byte & 1100 0000) == 1000 0000
is_utf8_continuation_byte :: inline (byte: u8) -> bool {
    return (byte & 0xC0) == 0x80;
}

// Returns the total number of bytes (initial byte included) of the UTF8
// sequence announced by the initial byte `byte`:
//   (byte & 1110 0000) == 1100 0000 -> 1 initial + 1 continuation byte
//   (byte & 1111 0000) == 1110 0000 -> 1 initial + 2 continuation bytes
//   (byte & 1111 1000) == 1111 0000 -> 1 initial + 3 continuation bytes
// Any other byte (ASCII, continuation bytes, other bit patterns) yields 1.
count_utf8_bytes :: inline (byte: u8) -> int {
    if (byte & 0xE0) == 0xC0 return 1+1;
    if (byte & 0xF0) == 0xE0 return 1+2;
    if (byte & 0xF8) == 0xF0 return 1+3;
    return 1;
}

// Truncates the string to at most `length` bytes, or fewer when cutting at
// `length` would split a multi-byte UTF8 character (the incomplete character
// is dropped entirely).
// Truncation is done in place by zeroing the tail of the string's buffer;
// `str.count` itself is not modified.
// A `length` larger than `str.count` is treated as `str.count`, and a negative
// `length` is treated as 0.
// Returns the length of the truncated string, or -1 when `str.data` is null.
truncate_string :: (str: string, length: int) -> length: int {
    if str.data == null then return -1;

    data  := str.data;
    count := str.count;

    // Work on a local copy clamped to [0, count]. Without the lower bound a
    // negative length would make the memset below start before the buffer.
    new_length := length;
    if new_length > count then new_length = count;
    if new_length < 0     then new_length = 0;

    // Walk back over the continuation bytes that end at the cut position.
    // Afterwards `idx - 1` is the index of the byte that should start the
    // last kept character (when idx > 0).
    idx := new_length;
    while idx > 0 && is_utf8_continuation_byte(data[idx - 1]) {
        idx -= 1;
    }
    continuation_bytes := new_length - idx;

    if idx == 0 {
        // Either nothing is kept, or everything kept is continuation bytes,
        // which makes it an invalid UTF8 string: keep nothing.
        new_length = 0;
    } else {
        // The last character is complete only when its start byte announces
        // exactly the number of continuation bytes that were kept.
        start_byte := data[idx - 1];
        is_complete := (continuation_bytes == 0 && (start_byte & 0x80) == 0x00)
                    || (continuation_bytes == 1 && (start_byte & 0xE0) == 0xC0)
                    || (continuation_bytes == 2 && (start_byte & 0xF0) == 0xE0)
                    || (continuation_bytes == 3 && (start_byte & 0xF8) == 0xF0);

        if !is_complete {
            new_length -= (continuation_bytes + 1); // Remove start byte too, hence '+ 1'.
        }
    }

    memset(data + new_length, 0, count - new_length);
    return new_length;
}

// Returns true when the string is empty or consists only of blank bytes:
// NUL, horizontal tab, line feed, vertical tab, form feed, carriage return
// and space.
is_empty_string :: (str: string) -> bool {
    for 0..str.count-1 {
        if str[it] == {
            case #char "\0";    #through;
            case #char "\t";    #through; // horizontal tab
            case #char "\n";    #through; // line feed
            case #char "\x0B";  #through; // vertical tabulation
            case #char "\x0C";  #through; // form feed
            case #char "\r";    #through; // carriage return
            case #char " ";
                continue;
            case;
                return false;
        }
    }
    return true;
}
