2 files changed, 311 insertions, 0 deletions
diff --git a/UTF8/module.jai b/UTF8/module.jai
new file mode 100644
index 0000000..5e6fd65
--- /dev/null
+++ b/UTF8/module.jai
@@ -0,0 +1,149 @@
+// Helper procedures for working with UTF8-encoded strings.
+// https://en.wikipedia.org/wiki/UTF-8
+
+// Reports whether `byte` carries the 10XX XXXX marker of a UTF8 continuation byte.
+is_continuation_byte :: inline (byte: u8) -> bool {
+    // Shifting out the six payload bits leaves only the two marker bits, which must be 1 0.
+    return (byte >> 6) == 0x02;
+}
+
+// Maps the first byte of an encoded character to the total number of bytes it occupies.
+// Any byte that matches no multi-byte leading pattern (ASCII, continuation bytes, ...) gives 1.
+count_character_bytes :: inline (character_leading_byte: u8) -> int {
+    // Each pattern is recognised by shifting the payload bits away and comparing the marker.
+    lead := character_leading_byte;
+    // 1111 0XXX: leading byte plus three continuation bytes.
+    if (lead >> 3) == 0x1E return 4;
+    // 1110 XXXX: leading byte plus two continuation bytes.
+    if (lead >> 4) == 0x0E return 3;
+    // 110X XXXX: leading byte plus one continuation byte.
+    if (lead >> 5) == 0x06 return 2;
+    return 1;
+}
+
+// Returns a prefix of `str` (sharing str.data) of at most `length` bytes that never ends
+// in the middle of a multi-byte UTF8 character: a character that does not fit is dropped.
+// A negative `length` is treated as 0. Returns "" when str.data is null, and the whole
+// string when it is shorter than `length`.
+truncate :: (str: string, length: int) -> string {
+    if str.data == null     then return "";
+    if str.count < length   then return .{str.count, str.data};
+    // Without this guard a negative length would come back as a negative string count.
+    if length <= 0          then return .{0, str.data};
+
+    data         := str.data;
+    result_count := length; // Adjusted below when the last character is incomplete.
+    // Step back over the continuation bytes that end the requested prefix.
+    idx := length;
+    while (idx > 0 && is_continuation_byte(data[idx - 1])) {
+        idx -= 1;
+    }
+    continuation_bytes := length - idx;
+
+    // The prefix holds nothing but continuation bytes: invalid UTF8, keep nothing.
+    if (idx == 0 && continuation_bytes > 0) {
+        result_count = 0;
+    }
+    // Otherwise keep the last character only if data[idx - 1] announces exactly the continuation bytes present.
+    else if (idx > 0
+        && !(continuation_bytes == 0 && (data[idx - 1] & 0x80) == 0x00)
+        && !(continuation_bytes == 1 && (data[idx - 1] & 0xE0) == 0xC0)
+        && !(continuation_bytes == 2 && (data[idx - 1] & 0xF0) == 0xE0)
+        && !(continuation_bytes == 3 && (data[idx - 1] & 0xF8) == 0xF0)
+    ) {
+        result_count -= (continuation_bytes + 1); // Drop the leading byte too, hence '+ 1'.
+    }
+    return .{result_count, str.data};
+}
+
+// Returns true when the string has no bytes, or when every byte is NUL, a space, or one
+// of the control characters from horizontal tab (0x09) through carriage return (0x0D).
+is_empty :: (str: string) -> bool {
+    idx := 0;
+    while idx < str.count {
+        byte := str[idx];
+        idx += 1;
+        // "\t", "\n", "\x0B" (vertical tab), "\x0C" (form feed) and "\r" have consecutive codes.
+        is_control_blank := byte >= #char "\t" && byte <= #char "\r";
+        is_blank         := is_control_blank || byte == #char " " || byte == #char "\0";
+
+        // The first byte outside the blank set settles the answer.
+        if !is_blank return false;
+    }
+
+    // Reached only when no such byte exists, which includes the zero-length string.
+    return true;
+}
+
+// Counts characters by hopping from leading byte to leading byte; each hop is one character.
+// When `is_null_terminated` is true (resolved at compile time) counting also stops at the
+// first zero byte; otherwise a zero byte is counted like any other one-byte character.
+count_characters :: (str: string, $is_null_terminated := false) -> int {
+    total  := 0;
+    cursor := 0;
+
+    while cursor < str.count {
+        lead := str[cursor];
+        // Only the null-terminated variant contains the terminator check.
+        #if is_null_terminated {
+            if lead == 0 break;
+        }
+
+        cursor += count_character_bytes(lead);
+        total  += 1;
+    }
+
+    return total;
+}
+
+// Deletes the character at `character_idx` (counted in characters, not bytes), shifts the
+// tail of the string down over it and zeroes the bytes freed at the end.
+// Returns false, leaving `str` untouched, when no character has that index.
+delete_character :: (str: *string, character_idx: int) -> success := true {
+    buffer_idx := get_byte_index(str.*, character_idx);
+    if buffer_idx < 0 return false;
+
+    // A leading byte at the end of a cut-off string announces more bytes than remain;
+    // clamping keeps the loops below inside the buffer and str.count from going negative.
+    bytes_to_delete := count_character_bytes(str.data[buffer_idx]);
+    remaining_bytes := str.count - buffer_idx;
+    if bytes_to_delete > remaining_bytes then bytes_to_delete = remaining_bytes;
+
+    for buffer_idx..str.count-1-bytes_to_delete { str.data[it] = str.data[it+bytes_to_delete]; }
+    for str.count-bytes_to_delete..str.count-1  { str.data[it] = 0; }
+    str.count -= bytes_to_delete;
+    return;
+}
+
+// Returns the byte offset at which character number `character_index` starts, or (-1, false) if there is none.
+get_byte_index :: (str: string, character_index: int) -> buffer_index: int, success := true {
+    byte_offset       := 0;
+    characters_passed := 0;
+    while byte_offset < str.count && characters_passed != character_index {
+        byte_offset       += count_character_bytes(str[byte_offset]);
+        characters_passed += 1;
+    }
+    if byte_offset < str.count return byte_offset; // Stopped on the requested character.
+    return -1, false;
+}
+
+// Scans the string for UTF8 framing errors: a continuation byte with no character open,
+// a non-continuation byte inside a character, or a string that ends inside a character.
+// Returns (true, -1) when none is found, otherwise false and the index of the offending
+// byte (str.count when the string ends too early). Byte values are not range-checked.
+is_valid :: (str: string) -> is_valid := true, error_index: int = -1 {
+    remaining_bytes := 0; // Continuation bytes still owed to the current character.
+    for idx: 0..str.count-1 {
+        is_continuation := is_continuation_byte(str[idx]);
+
+        // The kind of byte must match whether a character is currently open.
+        if is_continuation != (remaining_bytes > 0) then return false, idx;
+
+        if is_continuation  remaining_bytes -= 1;
+        else                remaining_bytes  = count_character_bytes(str[idx]) - 1;
+    }
+
+    // A string cut off in the middle of its last character must not pass as valid.
+    if remaining_bytes > 0 then return false, str.count;
+    return;
+}
diff --git a/UTF8/tests.jai b/UTF8/tests.jai
new file mode 100644
index 0000000..b7e3579
--- /dev/null
+++ b/UTF8/tests.jai
@@ -0,0 +1,162 @@
+#import "Basic";
+#import "String";
+#import "UTF8";
+
+main :: () {
+    // Banner first; every later section prints its own header before its assertions run.
+    write_strings("#=======================#\n", "# Basic tests           #\n");
+
+    // Fixtures shared by several sections.
+    EURO_ONLY :: "0€1";         // 1 byte, 3 bytes, 1 byte.
+    MIXED     :: "0£€𐍈1";       // 1, 2, 3, 4 and 1 bytes: 11 bytes in total.
+    CUTTABLE  :: "123€DELETE";  // '€' occupies bytes 3..5, 'D' is byte 6.
+
+    // Scratch values reassigned by the sections that need them.
+    text:  string;
+    ok:    bool;
+    value: int;
+
+    // is_continuation_byte: only the two trailing bytes of '€' qualify.
+    continuation_flags := bool.[
+        false,  // '0'
+        false,  // first byte of '€'
+        true,   // second byte of '€'
+        true,   // third byte of '€'
+        false   // '1'
+    ];
+    for continuation_flags { assert(is_continuation_byte(EURO_ONLY[it_index]) == it); }
+
+    write_strings("# count_character_bytes #\n");
+
+    // Expected result for each byte of MIXED; bytes that do not start a character give 1.
+    byte_lengths := int.[
+        1,           // '0'
+        2, 1,        // '£'
+        3, 1, 1,     // '€'
+        4, 1, 1, 1,  // '𐍈'
+        1            // '1'
+    ];
+    for byte_lengths { assert(count_character_bytes(MIXED[it_index]) == it); }
+
+    write_strings("# truncate              #\n");
+
+    // Entry N is the expected result of truncating MIXED to N bytes.
+    truncated := string.[
+        "",                          // 0
+        "0", "0",                    // 1, 2:  '£' is complete from 3 bytes on
+        "0£", "0£", "0£",            // 3..5:  '€' is complete from 6 bytes on
+        "0£€", "0£€", "0£€", "0£€",  // 6..9:  '𐍈' is complete from 10 bytes on
+        "0£€𐍈",                      // 10
+        "0£€𐍈1", "0£€𐍈1"             // 11 and 12 (12 is longer than the string)
+    ];
+    for truncated {
+        assert(compare(truncate(MIXED, it_index), it) == 0);
+    }
+
+    write_strings("# is_empty              #\n");
+
+    // Each entry appends one more blank byte to the previous entry.
+    blanks := string.[
+        "",
+        "\0",
+        "\0\t",
+        "\0\t\n",
+        "\0\t\n\x0B",
+        "\0\t\n\x0B\x0C",
+        "\0\t\n\x0B\x0C\r",
+        "\0\t\n\x0B\x0C\r "
+    ];
+    for blanks {
+        assert(is_empty(it));
+    }
+    // A visible character anywhere makes the string non-empty.
+    assert(!is_empty("\0\t\n\x0B\x0C\r ."));
+    assert(!is_empty("| B A Z € N G A  |"));
+
+    write_strings("# delete_character      #\n");
+
+    // Nothing can be deleted from an empty string.
+    text = copy_string("",, temporary_allocator);
+    assert(!delete_character(*text, 0));
+
+    // Indices outside the string are rejected.
+    text = copy_string("12£45€78𐍈",, temporary_allocator);
+    assert(!delete_character(*text, -1));
+    assert(!delete_character(*text, 99999));
+
+    // The deletions are cumulative: each index refers to the string left by the previous one.
+    delete_at := int.[7, 2, 4, 3];
+    left_over := string.["12£45€7𐍈", "1245€7𐍈", "12457𐍈", "1247𐍈"];
+    for delete_at {
+        assert(delete_character(*text, it));
+        assert(compare(text, left_over[it_index]) == 0);
+    }
+
+    write_strings("# get_byte_index        #\n");
+
+    text = copy_string("12£45€78𐍈X",, temporary_allocator);
+    // An empty string has no character 0.
+    value, ok = get_byte_index("", 0);
+    assert(value == -1 && ok == false, "(%, %)", value, ok);
+
+    // Negative indices and indices past the last character are not found.
+    absent := int.[-1, -99999, 99999];
+    for absent {
+        value, ok = get_byte_index(text, it);
+        assert(value == -1 && ok == false, "(%, %)", value, ok);
+    }
+
+    // Entry N is the byte offset at which character N starts.
+    offsets := int.[
+        0,   // '1'
+        1,   // '2'
+        2,   // '£' (2 bytes)
+        4,   // '4'
+        5,   // '5'
+        6,   // '€' (3 bytes)
+        9,   // '7'
+        10,  // '8'
+        11,  // '𐍈' (4 bytes)
+        15   // 'X'
+    ];
+    for offsets {
+        value, ok = get_byte_index(text, it_index);
+        assert(value == it && ok == true, "(%, %)", value, ok);
+    }
+
+    write_strings("# count_characters      #\n");
+
+    // Entry N holds exactly N characters.
+    growing := string.["", "0", "0£", "0£€", "0£€𐍈", "0£€𐍈1"];
+    for growing {
+        assert(count_characters(it) == it_index);
+    }
+
+    // With 'D' replaced by a zero byte: 10 characters overall, 4 before the terminator.
+    text = copy_string(CUTTABLE,, temporary_allocator);
+    text[6] = 0;
+    assert(count_characters(text) == 10);
+    assert(count_characters(text, true) == 4);
+
+    write_strings("# is_valid              #\n");
+
+    assert(is_valid(""));
+
+    // A zero byte in place of the ASCII 'D' leaves the encoding intact.
+    text = copy_string(CUTTABLE,, temporary_allocator);
+    text[6] = 0;
+    ok, value = is_valid(text);
+    assert(ok == true && value == -1, "(%, %)", ok, value);
+
+    // Zeroing the first, second or third byte of '€' must be reported at index 4, 4 and 5.
+    zeroed_byte := int.[3, 4, 5];
+    error_index := int.[4, 4, 5];
+    for zeroed_byte {
+        text = copy_string(CUTTABLE,, temporary_allocator);
+        text[it] = 0;
+        ok, value = is_valid(text);
+        assert(ok == false && value == error_index[it_index], "(%, %)", ok, value);
+    }
+
+    write_strings("  No errors found.\n");
+}