diff options
Diffstat (limited to 'modules/UTF8/module.jai')
| -rw-r--r-- | modules/UTF8/module.jai | 29 |
1 files changed, 25 insertions, 4 deletions
diff --git a/modules/UTF8/module.jai b/modules/UTF8/module.jai
index 72d3d75..5e6fd65 100644
--- a/modules/UTF8/module.jai
+++ b/modules/UTF8/module.jai
@@ -8,15 +8,15 @@ is_continuation_byte :: inline (byte: u8) -> bool {
 }
 
 // Given a leading_byte, returns the number of bytes on the character.
-count_character_bytes :: inline (leading_byte: u8) -> int {
+count_character_bytes :: inline (character_leading_byte: u8) -> int {
     // BBBB BBBB & 1110 0000 == 110X XXXX -> 1 initial + 1 continuation byte
-    if (leading_byte & 0xE0) == 0xC0 return 1+1;
+    if (character_leading_byte & 0xE0) == 0xC0 return 1+1;
 
     // BBBB BBBB & 1111 0000 == 1110 XXXX -> 1 initial + 2 continuation byte
-    if (leading_byte & 0xF0) == 0xE0 return 1+2;
+    if (character_leading_byte & 0xF0) == 0xE0 return 1+2;
 
     // BBBB BBBB & 1111 1000 == 1111 0XXX -> 1 initial + 3 continuation byte
-    if (leading_byte & 0xF8) == 0xF0 return 1+3;
+    if (character_leading_byte & 0xF8) == 0xF0 return 1+3;
 
     return 1;
 }
@@ -126,3 +126,24 @@ get_byte_index :: (str: string, character_index: int) -> buffer_index: int, succ
     }
     return -1, false;
 }
+
+// Scans the string for UTF8 encoding errors. On failure, error_index is the offending byte,
+// or str.count when the last character is cut short by the end of the string.
+is_valid :: (str: string) -> is_valid := true, error_index: int = -1 {
+    idx := 0;
+    remaining_bytes := 0; // Continuation bytes still owed by the current character.
+    while idx < str.count {
+        defer idx += 1;
+        is_continuation := is_continuation_byte(str[idx]);
+
+        // Stray continuation byte, or a new character starting before the previous one is complete.
+        if (is_continuation && remaining_bytes == 0) || (!is_continuation && remaining_bytes > 0) then return false, idx;
+
+        if is_continuation then remaining_bytes -= 1;
+        else                    remaining_bytes = count_character_bytes(str[idx]) - 1;
+    }
+
+    // The string must not end in the middle of a multi-byte character.
+    if remaining_bytes > 0 then return false, str.count;
+    return;
+}
