// UTF-8 helpers operating on raw string bytes.

// BBBB BBBB & 1100 0000 == 10XX XXXX -> is continuation byte
// TODO Maybe rename to: is_continuation_byte
is_utf8_continuation_byte :: inline (byte: u8) -> bool {
    return (byte & 0xC0) == 0x80;
}

// Returns the number of bytes announced by the lead byte of a UTF-8 sequence.
// BBBB BBBB & 1110 0000 == 110X XXXX -> 1 initial + 1 continuation byte
// BBBB BBBB & 1111 0000 == 1110 XXXX -> 1 initial + 2 continuation bytes
// BBBB BBBB & 1111 1000 == 1111 0XXX -> 1 initial + 3 continuation bytes
// Any other byte (ASCII, a stray continuation byte, an invalid lead byte) counts as 1.
// TODO Maybe rename to: count_character_bytes
count_utf8_bytes :: inline (byte: u8) -> int {
    if (byte & 0xE0) == 0xC0 return 1+1;
    if (byte & 0xF0) == 0xE0 return 1+2;
    if (byte & 0xF8) == 0xF0 return 1+3;
    return 1;
}

// Truncates the string to the length provided or shorter, in case of UTF8 strings that require so.
// Truncation is done by zeroing the tail of the string in place.
// Returns length of truncated string, or -1 when str.data is null.
// A requested length outside [0, str.count] is clamped to that range, so a negative
// length zeroes the whole buffer instead of writing before its start.
// TODO Maybe rename to: truncate
truncate_string :: (str: string, length: int) -> length: int {
    if str.data == null then return -1;

    data  := str.data;
    count := str.count;

    // Work on a local copy of the requested length, clamped to the buffer.
    new_length := length;
    if new_length > count then new_length = count;
    if new_length < 0     then new_length = 0;

    // Walk back over the continuation bytes that end the kept prefix.
    // Afterwards data[idx - 1] (if any) is the byte that starts the last character.
    idx := new_length;
    while idx > 0 && is_utf8_continuation_byte(data[idx - 1]) {
        idx -= 1;
    }
    continuation_bytes := new_length - idx;

    if idx == 0 {
        // Either nothing is kept, or the kept prefix consists only of continuation
        // bytes, which is an invalid UTF8 string: keep nothing.
        new_length = 0;
    } else {
        // The last character is complete when the number of continuation bytes
        // matches what its start byte announces.
        lead := data[idx - 1];
        complete := (continuation_bytes == 0 && (lead & 0x80) == 0x00)
                 || (continuation_bytes == 1 && (lead & 0xE0) == 0xC0)
                 || (continuation_bytes == 2 && (lead & 0xF0) == 0xE0)
                 || (continuation_bytes == 3 && (lead & 0xF8) == 0xF0);

        // If length truncates some continuation bytes, remove the incomplete UTF8 character.
        if !complete {
            new_length -= (continuation_bytes + 1); // Remove start byte, hence '+ 1'.
        }
    }

    memset(data + new_length, 0, count - new_length);
    // str.count = new_length; TODO We should be doing this... (str is received by value here,
    // so the caller has to use the returned length.)
    return new_length;
}

// Returns true when the string is empty or consists only of NUL and whitespace bytes.
// TODO Maybe rename to: is_empty
is_empty_string :: (str: string) -> bool {
    for 0..str.count-1 {
        if str[it] == {
            case #char "\0";   #through;
            case #char "\t";   #through; // horizontal tab
            case #char "\n";   #through; // line feed
            case #char "\x0B"; #through; // vertical tabulation
            case #char "\x0C"; #through; // form feed
            case #char "\r";   #through; // carriage return
            case #char " ";
                continue;
            case;
                return false;
        }
    }
    return true;
}

// Counts number of characters in string.
// Each step advances by the byte count announced by the current byte, so a sequence
// cut short at the end of the string still counts as one character.
count_characters :: (str: string) -> int {
    characters := 0;
    idx := 0;
    while idx < str.count {
        idx += count_utf8_bytes(str[idx]);
        characters += 1;
    }
    return characters;
}

// Deletes the character at character_idx in place: the following bytes are shifted
// down, the freed tail is zeroed and str.count is reduced.
// Does nothing when str or its data is null, or when character_idx does not address
// a character inside the string.
delete_character :: (str: *string, character_idx: int) {
    if str == null || str.data == null then return;

    buffer_idx, success := map_character_to_buffer_idx(str.*, character_idx);
    // A failed lookup yields a negative index, and a one-past-the-end character
    // index maps to str.count; neither addresses a byte we may read.
    if !success || buffer_idx >= str.count then return;

    bytes_to_delete := count_utf8_bytes(str.data[buffer_idx]);
    // A truncated final sequence announces more bytes than are left; never delete
    // more than what actually follows buffer_idx.
    remaining := str.count - buffer_idx;
    if bytes_to_delete > remaining then bytes_to_delete = remaining;

    new_count := str.count - bytes_to_delete;
    for buffer_idx..new_count-1 {
        str.data[it] = str.data[it + bytes_to_delete];
    }
    for new_count..str.count-1 {
        str.data[it] = 0;
    }
    str.count = new_count;
}

// Maps a character index to the index of its first byte in the buffer.
// Returns -1, false for a negative character_idx and -2, false when character_idx
// exceeds the byte count. Otherwise returns the byte index reached and whether
// character_idx was actually reached; the byte index never exceeds str.count.
// TODO Maybe rename to: map_character_to_byte_idx or get_character_byte_idx
map_character_to_buffer_idx :: (str: string, character_idx: int) -> buffer_idx: int, success: bool {
    if character_idx < 0         then return -1, false;
    if character_idx > str.count then return -2, false;
    if character_idx == 0        then return 0, true;

    buff_idx := 0;
    char_idx := 0;
    while buff_idx < str.count && char_idx != character_idx {
        buff_idx += count_utf8_bytes(str[buff_idx]);
        char_idx += 1;
    }

    // A final sequence cut short by the end of the string would step past the buffer.
    if buff_idx > str.count then buff_idx = str.count;

    return buff_idx, char_idx == character_idx;
}