aboutsummaryrefslogtreecommitdiff
path: root/modules/UTF8/module.jai
diff options
context:
space:
mode:
Diffstat (limited to 'modules/UTF8/module.jai')
-rw-r--r--modules/UTF8/module.jai29
1 files changed, 25 insertions, 4 deletions
diff --git a/modules/UTF8/module.jai b/modules/UTF8/module.jai
index 72d3d75..5e6fd65 100644
--- a/modules/UTF8/module.jai
+++ b/modules/UTF8/module.jai
@@ -8,15 +8,15 @@ is_continuation_byte :: inline (byte: u8) -> bool {
}
// Given the leading byte of a character, returns the total number of bytes in that character.
-count_character_bytes :: inline (leading_byte: u8) -> int {
+count_character_bytes :: inline (character_leading_byte: u8) -> int {
// byte & 1110 0000 == 1100 0000 (pattern 110X XXXX) -> 1 initial + 1 continuation byte
- if (leading_byte & 0xE0) == 0xC0 return 1+1;
+ if (character_leading_byte & 0xE0) == 0xC0 return 1+1;
// byte & 1111 0000 == 1110 0000 (pattern 1110 XXXX) -> 1 initial + 2 continuation bytes
- if (leading_byte & 0xF0) == 0xE0 return 1+2;
+ if (character_leading_byte & 0xF0) == 0xE0 return 1+2;
// byte & 1111 1000 == 1111 0000 (pattern 1111 0XXX) -> 1 initial + 3 continuation bytes
- if (leading_byte & 0xF8) == 0xF0 return 1+3;
+ if (character_leading_byte & 0xF8) == 0xF0 return 1+3;
return 1; // Any byte matching none of the patterns above is counted as a single-byte character.
}
@@ -126,3 +126,24 @@ get_byte_index :: (str: string, character_index: int) -> buffer_index: int, succ
}
return -1, false;
}
+
+// Scans the string for structural UTF8 encoding errors.
+//
+// Checks that every leading byte is followed by exactly the number of
+// continuation bytes that count_character_bytes() reports for it:
+//   - a continuation byte with no character in progress is an error,
+//   - a non-continuation byte in the middle of a character is an error,
+//   - a string that ends in the middle of a character is an error.
+// It does not check for overlong encodings, surrogates or out-of-range code points.
+//
+// Returns:
+//   is_valid    - true when no error was found.
+//   error_index - -1 when valid. Otherwise the byte index of the offending byte; for a
+//                 string truncated mid-character, the index of that character's leading byte.
+is_valid :: (str: string) -> is_valid := true, error_index: int = -1 {
+    idx := 0;
+    remaining_bytes := 0;   // Continuation bytes still expected for the current character.
+    character_start := -1;  // Byte index of the leading byte of the current character.
+    while idx < str.count {
+        defer idx += 1;
+
+        is_continuation := is_continuation_byte(str[idx]);
+
+        if (is_continuation && remaining_bytes == 0) || (!is_continuation && remaining_bytes > 0) then return false, idx;
+
+        if is_continuation {
+            remaining_bytes -= 1;
+            continue;
+        }
+
+        character_start = idx;
+        remaining_bytes = count_character_bytes(str[idx]) - 1;
+    }
+
+    // The input ended while continuation bytes were still expected: the last character is truncated.
+    if remaining_bytes > 0 return false, character_start;
+
+    return;
+}