From 65227c476071914b82720084a2a775b44e4de885 Mon Sep 17 00:00:00 2001 From: dam Date: Mon, 6 May 2024 12:50:22 +0100 Subject: Cleanup UTF helper. --- modules/UTF8.jai | 60 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 31 insertions(+), 29 deletions(-) (limited to 'modules/UTF8.jai') diff --git a/modules/UTF8.jai b/modules/UTF8.jai index eba4585..b583809 100644 --- a/modules/UTF8.jai +++ b/modules/UTF8.jai @@ -1,25 +1,29 @@ -// BBBB BBBB & 1100 0000 == 10XX XXXX -> is continuation byte -// TODO Maybe rename to: is_continuation_byte -is_utf8_continuation_byte :: inline (byte: u8) -> bool { +// Helper procedures for working with UTF-8 strings. +// https://en.wikipedia.org/wiki/UTF-8 + +// Returns true if the argument is a continuation byte. +is_continuation_byte :: inline (byte: u8) -> bool { + // BBBB BBBB & 1100 0000 == 10XX XXXX -> is continuation byte return (byte & 0xC0) == 0x80; } -// BBBB BBBB & 1110 0000 == 110X XXXX -> 1 initial + 1 continuation byte -// BBBB BBBB & 1111 0000 == 1110 XXXX -> 1 initial + 2 continuation byte -// BBBB BBBB & 1111 1000 == 1111 0XXX -> 1 initial + 3 continuation byte -// TODO Maybe rename to: count_character_bytes -count_utf8_bytes :: inline (byte: u8) -> int { - if (byte & 0xE0) == 0xC0 return 1+1; - if (byte & 0xF0) == 0xE0 return 1+2; - if (byte & 0xF8) == 0xF0 return 1+3; +// Given a leading_byte, returns the number of bytes in the character. +count_character_bytes :: inline (leading_byte: u8) -> int { + // BBBB BBBB & 1110 0000 == 110X XXXX -> 1 initial + 1 continuation byte + if (leading_byte & 0xE0) == 0xC0 return 1+1; + + // BBBB BBBB & 1111 0000 == 1110 XXXX -> 1 initial + 2 continuation byte + if (leading_byte & 0xF0) == 0xE0 return 1+2; + + // BBBB BBBB & 1111 1000 == 1111 0XXX -> 1 initial + 3 continuation byte + if (leading_byte & 0xF8) == 0xF0 return 1+3; + return 1; } -// Truncates the string to the length provided or shorter, in case of UTF8 strings that require so. 
-// Truncation is done by zeroing the tail of the string in place. -// Returns length of truncated string. -// TODO Maybe rename to: truncate -truncate_string :: (str: string, length: int) -> length: int { +// Truncates the string to the provided length (or shorter, when the UTF8 contents require it) and zeroes the discarded bytes. +// Returns the length of the truncated string, or -1 if the string has no data. +truncate :: (str: *string, length: int) -> length: int { if str.data == null then return -1; if str.count < length then length = str.count; @@ -29,7 +33,7 @@ truncate_string :: (str: string, length: int) -> length: int { // Find index of first continuation byte. idx := length; - while (idx > 0 && is_utf8_continuation_byte(data[idx - 1])) { + while (idx > 0 && is_continuation_byte(data[idx - 1])) { idx -= 1; } continuation_bytes := length - idx; @@ -50,13 +54,12 @@ truncate_string :: (str: string, length: int) -> length: int { } memset(data + length, 0, count - length); - // str.count = length; TODO We should be doing this... + str.count = length; return length; } // Returns true when the string is empty or consists of space characters. -// TODO Maybe rename to: is_empty -is_empty_string :: (str: string) -> bool { +is_empty :: (str: string) -> bool { for 0..str.count-1 { if str[it] == { case #char "\0"; #through; @@ -74,21 +77,21 @@ is_empty_string :: (str: string) -> bool { return true; } -// Counts number of characters in string. +// Counts the number of characters. count_characters :: (str: string) -> int { characters := 0; idx := 0; while idx < str.count { - idx += count_utf8_bytes(str[idx]); + idx += count_character_bytes(str[idx]); characters += 1; } return characters; } -// Delete character. +// Deletes a character by its index and moves the tail data to take its place. 
delete_character :: (str: *string, character_idx: int) { - buffer_idx := map_character_to_buffer_idx(str.*, character_idx); - bytes_to_delete := count_utf8_bytes(str.data[buffer_idx]); + buffer_idx := get_byte_idx(str.*, character_idx); + bytes_to_delete := count_character_bytes(str.data[buffer_idx]); for buffer_idx..str.count-1-bytes_to_delete { str.data[it] = str.data[it+bytes_to_delete]; @@ -100,9 +103,8 @@ delete_character :: (str: *string, character_idx: int) { str.count -= bytes_to_delete; } -// Get character index. -// TODO Maybe rename to: map_character_to_byte_idx or get_character_byte_idx -map_character_to_buffer_idx :: (str: string, character_idx: int) -> buffer_idx: int, success: bool { +// Searches for the given character index and returns its byte index in the string. +get_byte_idx :: (str: string, character_idx: int) -> buffer_idx: int, success: bool { if character_idx < 0 then return -1, false; if character_idx > str.count then return -2, false; if character_idx == 0 then return 0, true; @@ -110,7 +112,7 @@ map_character_to_buffer_idx :: (str: string, character_idx: int) -> buffer_idx: buff_idx := 0; char_idx := 0; while buff_idx < str.count && char_idx != character_idx { - buff_idx += count_utf8_bytes(str[buff_idx]); + buff_idx += count_character_bytes(str[buff_idx]); char_idx += 1; } return buff_idx, char_idx == character_idx; -- cgit v1.2.3