blob: fac1326c986a92328a8e8b94762ff096a775c39e (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
|
// Reports whether `byte` is a UTF-8 continuation byte.
// Continuation bytes have the bit pattern 10XX XXXX, so only the two
// most significant bits are inspected (mask 1100 0000).
is_utf8_continuation_byte :: inline (byte: u8) -> bool {
    top_two_bits := byte & 0xC0;
    return top_two_bits == 0x80;
}
// Returns the total number of bytes (initial byte included) of the UTF-8
// sequence announced by the initial byte `byte`:
//   110X XXXX -> 2 bytes (1 initial + 1 continuation)
//   1110 XXXX -> 3 bytes (1 initial + 2 continuation)
//   1111 0XXX -> 4 bytes (1 initial + 3 continuation)
// Every other byte value yields 1.
count_utf8_bytes :: inline (byte: u8) -> int {
    starts_two_byte_sequence   := (byte & 0xE0) == 0xC0;
    starts_three_byte_sequence := (byte & 0xF0) == 0xE0;
    starts_four_byte_sequence  := (byte & 0xF8) == 0xF0;

    // The three patterns are mutually exclusive, so at most one flag is set.
    if starts_four_byte_sequence  return 4;
    if starts_three_byte_sequence return 3;
    if starts_two_byte_sequence   return 2;
    return 1;
}
// Truncates the string to at most `length` bytes, or fewer when cutting at
// `length` would leave an incomplete UTF-8 character at the end.
// Truncation is done in place by zeroing the tail of the string's buffer; the
// caller's `count` is not modified, so use the returned length afterwards.
//
// `length` is clamped to the range [0, str.count]. A negative request therefore
// clears the whole string instead of writing before the start of the buffer.
//
// Returns the length of the truncated string, or -1 when `str.data` is null.
truncate_string :: (str: string, length: int) -> length: int {
    if str.data == null then return -1;

    data  := str.data;
    count := str.count;

    // Work on a clamped local copy rather than assigning to the argument.
    new_length := length;
    if new_length > count then new_length = count;
    if new_length < 0     then new_length = 0;

    // Walk backwards over the trailing continuation bytes of the kept prefix.
    // Afterwards data[idx - 1] (if any) is the byte that should start the
    // last character.
    idx := new_length;
    while idx > 0 && is_utf8_continuation_byte(data[idx - 1]) {
        idx -= 1;
    }
    continuation_bytes := new_length - idx;

    if idx == 0 {
        // Either nothing is kept, or the kept prefix consists solely of
        // continuation bytes, which is invalid UTF-8: keep nothing.
        new_length = 0;
    } else {
        // The last character is complete only when its start byte announces
        // exactly the number of continuation bytes that follow it.
        start_byte := data[idx - 1];
        is_complete := (continuation_bytes == 0 && (start_byte & 0x80) == 0x00)
                    || (continuation_bytes == 1 && (start_byte & 0xE0) == 0xC0)
                    || (continuation_bytes == 2 && (start_byte & 0xF0) == 0xE0)
                    || (continuation_bytes == 3 && (start_byte & 0xF8) == 0xF0);

        if !is_complete {
            // Drop the incomplete character: its continuation bytes plus the
            // start byte, hence '+ 1'.
            new_length -= (continuation_bytes + 1);
        }
    }

    memset(data + new_length, 0, count - new_length);
    return new_length;
}
// Returns true when the string has no bytes at all, or when every byte is
// either a NUL or one of the ASCII whitespace characters listed below.
is_empty_string :: (str: string) -> bool {
    index := 0;
    while index < str.count {
        c := str[index];

        is_blank := c == #char "\0"
                 || c == #char "\t"    // horizontal tab
                 || c == #char "\n"    // line feed
                 || c == #char "\x0B"  // vertical tabulation
                 || c == #char "\x0C"  // form feed
                 || c == #char "\r"    // carriage return
                 || c == #char " ";

        // The first non-blank byte settles the answer.
        if !is_blank return false;

        index += 1;
    }
    return true;
}
|