From 65227c476071914b82720084a2a775b44e4de885 Mon Sep 17 00:00:00 2001
From: dam <dam@gudinoff>
Date: Mon, 6 May 2024 12:50:22 +0100
Subject: Cleanup UTF helper.

---
 modules/UTF8.jai | 60 +++++++++++++++++++++++++++++---------------------------
 1 file changed, 31 insertions(+), 29 deletions(-)

(limited to 'modules/UTF8.jai')

diff --git a/modules/UTF8.jai b/modules/UTF8.jai
index eba4585..b583809 100644
--- a/modules/UTF8.jai
+++ b/modules/UTF8.jai
@@ -1,25 +1,29 @@
-// BBBB BBBB & 1100 0000 == 10XX XXXX -> is continuation byte
-// TODO Maybe rename to: is_continuation_byte
-is_utf8_continuation_byte :: inline (byte: u8) -> bool {
+// Helper procedures for working with UTF-8 encoded strings.
+// https://en.wikipedia.org/wiki/UTF-8
+
+// Returns true if argument is a continuation byte.
+is_continuation_byte :: inline (byte: u8) -> bool {
+    // BBBB BBBB & 1100 0000 == 10XX XXXX -> is continuation byte
    return (byte & 0xC0) == 0x80;
 }
 
-// BBBB BBBB & 1110 0000 == 110X XXXX -> 1 initial + 1 continuation byte
-// BBBB BBBB & 1111 0000 == 1110 XXXX -> 1 initial + 2 continuation byte
-// BBBB BBBB & 1111 1000 == 1111 0XXX -> 1 initial + 3 continuation byte
-// TODO Maybe rename to: count_character_bytes
-count_utf8_bytes :: inline (byte: u8) -> int {
-    if (byte & 0xE0) == 0xC0 return 1+1;
-    if (byte & 0xF0) == 0xE0 return 1+2;
-    if (byte & 0xF8) == 0xF0 return 1+3;
+// Given a leading byte, returns the total number of bytes in the character (1 to 4).
+count_character_bytes :: inline (leading_byte: u8) -> int {
+    // BBBB BBBB & 1110 0000 == 110X XXXX -> 1 initial + 1 continuation byte
+    if (leading_byte & 0xE0) == 0xC0 return 1+1;
+    
+    // BBBB BBBB & 1111 0000 == 1110 XXXX -> 1 initial + 2 continuation bytes
+    if (leading_byte & 0xF0) == 0xE0 return 1+2;
+    
+    // BBBB BBBB & 1111 1000 == 1111 0XXX -> 1 initial + 3 continuation bytes
+    if (leading_byte & 0xF8) == 0xF0 return 1+3;
+    
     return 1;
 }
 
-// Truncates the string to the length provided or shorter, in case of UTF8 strings that require so.
-// Truncation is done by zeroing the tail of the string in place.
-// Returns length of truncated string.
-// TODO Maybe rename to: truncate
-truncate_string :: (str: string, length: int) -> length: int {
+// Truncates the string to the provided length (or shorter, when a UTF-8 character would otherwise be split) and zeroes the discarded bytes.
+// Returns the length of the truncated string, or -1 if the string has no data.
+truncate :: (str: *string, length: int) -> length: int {
     if str.data == null     then return -1;
     
     if str.count < length   then length = str.count;
@@ -29,7 +33,7 @@ truncate_string :: (str: string, length: int) -> length: int {
 
     // Find index of first continuation byte.
     idx := length;
-    while (idx > 0 && is_utf8_continuation_byte(data[idx - 1])) {
+    while (idx > 0 && is_continuation_byte(data[idx - 1])) {
         idx -= 1;
     }
     continuation_bytes := length - idx;
@@ -50,13 +54,12 @@ truncate_string :: (str: string, length: int) -> length: int {
     }
 
     memset(data + length, 0, count - length);
-    // str.count = length;  TODO We should be doing this...
+    str.count = length;
     return length;
 }
 
 // Returns true when the string is empty or consists of space characters.
-// TODO Maybe rename to: is_empty
-is_empty_string :: (str: string) -> bool {
+is_empty :: (str: string) -> bool {
     for 0..str.count-1 {
         if str[it] == {
             case #char "\0"; #through;
@@ -74,21 +77,21 @@ is_empty_string :: (str: string) -> bool {
     return true;
 }
 
-// Counts number of characters in string.
+// Counts the number of characters.
 count_characters :: (str: string) -> int {
     characters := 0;
     idx := 0;
     while idx < str.count {
-        idx += count_utf8_bytes(str[idx]);
+        idx += count_character_bytes(str[idx]);
         characters += 1;
     }
     return characters;
 }
 
-// Delete character.
+// Deletes the character at the given character index and moves the tail data to take its place.
 delete_character :: (str: *string, character_idx: int) {
-    buffer_idx := map_character_to_buffer_idx(str.*, character_idx);
-    bytes_to_delete := count_utf8_bytes(str.data[buffer_idx]);
+    buffer_idx := get_byte_idx(str.*, character_idx);
+    bytes_to_delete := count_character_bytes(str.data[buffer_idx]);
     
     for buffer_idx..str.count-1-bytes_to_delete {
         str.data[it] = str.data[it+bytes_to_delete];
@@ -100,9 +103,8 @@ delete_character :: (str: *string, character_idx: int) {
     str.count -= bytes_to_delete;
 }
 
-// Get character index.
-// TODO Maybe rename to: map_character_to_byte_idx or get_character_byte_idx
-map_character_to_buffer_idx :: (str: string, character_idx: int) -> buffer_idx: int, success: bool {
+// Searches for the given character index and returns its byte index in the string.
+get_byte_idx :: (str: string, character_idx: int) -> buffer_idx: int, success: bool {
     if character_idx < 0            then return -1, false;
     if character_idx > str.count    then return -2, false;
     if character_idx == 0           then return 0, true;
@@ -110,7 +112,7 @@ map_character_to_buffer_idx :: (str: string, character_idx: int) -> buffer_idx:
     buff_idx := 0;
     char_idx := 0;
     while buff_idx < str.count && char_idx != character_idx {
-        buff_idx += count_utf8_bytes(str[buff_idx]);
+        buff_idx += count_character_bytes(str[buff_idx]);
         char_idx += 1;
     }
     return buff_idx, char_idx == character_idx;
-- 
cgit v1.2.3