// BBBB BBBB & 1100 0000 == 10XX XXXX -> is continuation byte
is_utf8_continuation_byte :: inline (byte: u8) -> bool {
    return (byte & 0xC0) == 0x80;
}

// Returns the total length (initial byte + continuation bytes) of the UTF8 sequence
// introduced by `byte`:
// BBBB BBBB & 1110 0000 == 110X XXXX -> 1 initial + 1 continuation byte
// BBBB BBBB & 1111 0000 == 1110 XXXX -> 1 initial + 2 continuation bytes
// BBBB BBBB & 1111 1000 == 1111 0XXX -> 1 initial + 3 continuation bytes
// Every other byte value (including continuation bytes) yields 1.
count_utf8_bytes :: inline (byte: u8) -> int {
    if (byte & 0xE0) == 0xC0 return 1+1;
    if (byte & 0xF0) == 0xE0 return 1+2;
    if (byte & 0xF8) == 0xF0 return 1+3;
    return 1;
}

// Truncates the string to the length provided or shorter, in case of UTF8 strings that require so
// (a character that would be cut in the middle is removed entirely).
// Truncation is done by zeroing the tail of the string in place; this procedure never writes
// str.count, so callers should use the returned length.
// A length larger than str.count is treated as str.count, a negative length as 0.
// Returns length of truncated string, or -1 when str.data is null.
truncate_string :: (str: string, length: int) -> length: int {
    if str.data == null then return -1;

    data  := str.data;
    count := str.count;

    // Work on a local copy clamped to [0, count]. Without the lower bound a negative
    // length would make the memset below start before the beginning of the buffer.
    new_length := length;
    if new_length > count then new_length = count;
    if new_length < 0     then new_length = 0;

    // Walk backwards over the continuation bytes that end the kept prefix;
    // afterwards data[idx - 1] (if any) is the byte that starts the last character.
    idx := new_length;
    while idx > 0 && is_utf8_continuation_byte(data[idx - 1]) {
        idx -= 1;
    }
    continuation_bytes := new_length - idx;

    if idx == 0 {
        // Either the prefix is empty already, or it consists of continuation bytes only,
        // which is an invalid UTF8 string: keep nothing in both cases.
        new_length = 0;
    } else {
        // The last character is complete when the number of continuation bytes found
        // matches what its start byte announces.
        start_byte := data[idx - 1];
        complete := (continuation_bytes == 0 && (start_byte & 0x80) == 0x00)
                 || (continuation_bytes == 1 && (start_byte & 0xE0) == 0xC0)
                 || (continuation_bytes == 2 && (start_byte & 0xF0) == 0xE0)
                 || (continuation_bytes == 3 && (start_byte & 0xF8) == 0xF0);

        // Remove the incomplete UTF8 character, including its start byte, hence '+ 1'.
        if !complete then new_length -= (continuation_bytes + 1);
    }

    memset(data + new_length, 0, count - new_length);
    return new_length;
}

// Returns true when the string is empty or consists of space characters.
// The bytes treated as space characters are NUL, horizontal tab, line feed,
// vertical tabulation, form feed, carriage return and the space itself.
is_empty_string :: (str: string) -> bool {
    for index: 0..str.count-1 {
        c := str[index];

        is_blank := c == #char "\0"
                 || c == #char "\t"    // horizontal tab
                 || c == #char "\n"    // line feed
                 || c == #char "\x0B"  // vertical tabulation
                 || c == #char "\x0C"  // form feed
                 || c == #char "\r"    // carriage return
                 || c == #char " ";

        // The first byte that is not blank decides the answer.
        if !is_blank return false;
    }

    // Zero-length strings never enter the loop and end up here as well.
    return true;
}