Preparing for UTF8 support on read_input_line.

author: dam <dam@gudinoff> 2024-04-07 23:23:06 +0100
committer: dam <dam@gudinoff> 2024-04-07 23:23:06 +0100
commit: 353d8b1145db12ffc42e4f6c2148a848ef8bba84 (patch)
tree: afbf51e66a9e7eaba04b6659b224187bc7a95c75 /modules/UTF8.jai
parent: a3cf506defb4a01759db6f920a363e23de63e984 (diff)
download: task-time-tracker-353d8b1145db12ffc42e4f6c2148a848ef8bba84.tar.zst
task-time-tracker-353d8b1145db12ffc42e4f6c2148a848ef8bba84.zip
1 files changed, 71 insertions, 0 deletions
diff --git a/modules/UTF8.jai b/modules/UTF8.jai
new file mode 100644
index 0000000..fac1326
--- /dev/null
+++ b/modules/UTF8.jai
@@ -0,0 +1,71 @@
+// A UTF8 continuation byte has the bit pattern 10XX XXXX, i.e. its two most
+// significant bits are exactly '10'.
+// Returns true when 'byte' matches that pattern.
+is_utf8_continuation_byte :: inline (byte: u8) -> bool {
+    top_two_bits := byte >> 6;
+    return top_two_bits == 0x02;
+}
+
+// Reports how many bytes the UTF8 sequence introduced by 'byte' occupies,
+// judging only by the high bits of that byte:
+//   110X XXXX -> 2 bytes (1 initial + 1 continuation byte)
+//   1110 XXXX -> 3 bytes (1 initial + 2 continuation bytes)
+//   1111 0XXX -> 4 bytes (1 initial + 3 continuation bytes)
+// Any other bit pattern yields 1.
+count_utf8_bytes :: inline (byte: u8) -> int {
+    if (byte >> 5) == 0x06  return 2;
+    if (byte >> 4) == 0x0E  return 3;
+    if (byte >> 3) == 0x1E  return 4;
+    return 1;
+}
+
+// Truncates the string to at most 'length' bytes, or to fewer bytes when cutting
+// at 'length' would leave an incomplete UTF8 character at the end.
+// Truncation is done in place by zeroing the tail of the string's buffer; the
+// string's 'count' is not modified, so callers must use the returned length.
+//
+// str:    string whose buffer is truncated (its bytes are overwritten).
+// length: maximum number of bytes to keep. Values larger than str.count are
+//         clamped to str.count and negative values are clamped to 0.
+//
+// Returns the length of the truncated string, or -1 when str.data is null.
+truncate_string :: (str: string, length: int) -> length: int {
+    if str.data == null     then return -1;
+
+    data  := str.data;
+    count := str.count;
+
+    // Work on a local copy of the requested length (the argument itself is left
+    // untouched) and clamp it to [0, count]. Without the lower bound a negative
+    // length would make the memset below write before the start of the buffer.
+    new_length := length;
+    if new_length > count   then new_length = count;
+    if new_length < 0       then new_length = 0;
+
+    // Walk backwards over the continuation bytes that end the kept range.
+    // Afterwards 'idx - 1' is the byte that should start the last character.
+    idx := new_length;
+    while (idx > 0 && is_utf8_continuation_byte(data[idx - 1])) {
+        idx -= 1;
+    }
+    continuation_bytes := new_length - idx;
+
+    // If the kept range consists only of continuation bytes, it's an invalid UTF8 string.
+    if (idx == 0 && continuation_bytes > 0) {
+        new_length = 0;
+    }
+    // If the cut splits a character, or its start byte is not a valid match for
+    // the continuation bytes found, remove the whole incomplete character.
+    else if (idx > 0 // kept range is not empty
+        // start byte does not match the number of continuation bytes that follow it
+        && !(continuation_bytes == 0 && (data[idx - 1] & 0x80) == 0x00)
+        && !(continuation_bytes == 1 && (data[idx - 1] & 0xE0) == 0xC0)
+        && !(continuation_bytes == 2 && (data[idx - 1] & 0xF0) == 0xE0)
+        && !(continuation_bytes == 3 && (data[idx - 1] & 0xF8) == 0xF0)
+    ) {
+        new_length -= (continuation_bytes + 1); // Remove start byte too, hence '+ 1'.
+    }
+
+    // Zero everything from the new end of the string up to the original count.
+    memset(data + new_length, 0, count - new_length);
+    return new_length;
+}
+
+// Reports whether the string holds nothing but blank characters: NUL,
+// horizontal tab, line feed, vertical tab, form feed, carriage return or space.
+// A string with no characters at all also counts as empty.
+is_empty_string :: (str: string) -> bool {
+    for index: 0..str.count-1 {
+        c := str[index];
+        is_blank := c == #char " "
+                 || c == #char "\0"
+                 || c == #char "\t"      // horizontal tab
+                 || c == #char "\n"      // line feed
+                 || c == #char "\x0B"    // vertical tabulation
+                 || c == #char "\x0C"    // form feed
+                 || c == #char "\r";     // carriage return
+        // The first non-blank character settles the answer.
+        if !is_blank  return false;
+    }
+    return true;
+}
author	dam <dam@gudinoff>	2024-04-07 23:23:06 +0100
committer	dam <dam@gudinoff>	2024-04-07 23:23:06 +0100
commit	353d8b1145db12ffc42e4f6c2148a848ef8bba84 (patch)
tree	afbf51e66a9e7eaba04b6659b224187bc7a95c75 /modules/UTF8.jai
parent	a3cf506defb4a01759db6f920a363e23de63e984 (diff)
download	task-time-tracker-353d8b1145db12ffc42e4f6c2148a848ef8bba84.tar.zst task-time-tracker-353d8b1145db12ffc42e4f6c2148a848ef8bba84.zip