aboutsummaryrefslogtreecommitdiff
path: root/modules/UTF8.jai
blob: eba4585f596f3c6e587ff929b732a777965ed0d2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
// A UTF-8 continuation byte has the bit pattern 10XX XXXX, i.e. its two
// most significant bits are exactly '10'.
// Returns true when 'byte' matches that pattern.
// TODO Maybe rename to: is_continuation_byte
is_utf8_continuation_byte :: inline (byte: u8) -> bool {
    top_two_bits := byte >> 6;
    return top_two_bits == 0b10;
}

// Given the first byte of a character, returns the total number of bytes
// (initial byte included) that the byte announces:
//   110X XXXX -> 2 bytes (1 initial + 1 continuation)
//   1110 XXXX -> 3 bytes (1 initial + 2 continuation)
//   1111 0XXX -> 4 bytes (1 initial + 3 continuation)
// Every other byte value (ASCII, continuation bytes, anything else) yields 1.
// TODO Maybe rename to: count_character_bytes
count_utf8_bytes :: inline (byte: u8) -> int {
    total := 1;
    if      (byte >> 5) == 0b110    total = 2;
    else if (byte >> 4) == 0b1110   total = 3;
    else if (byte >> 3) == 0b11110  total = 4;
    return total;
}

// Truncates the string to at most 'length' bytes, or fewer when cutting at
// 'length' would leave an incomplete UTF8 character at the end.
// Truncation is done by zeroing the tail of the string in place; 'str' is
// passed by value, so the caller's 'count' is NOT updated.
//
// 'length' is clamped to the range [0, str.count]; a negative request
// therefore zeroes the whole buffer instead of writing before its start.
//
// Returns the length of the truncated string, or -1 when str.data is null.
// TODO Maybe rename to: truncate
truncate_string :: (str: string, length: int) -> length: int {
    if str.data == null  return -1;

    data  := str.data;
    count := str.count;

    // Work on a local copy of the requested length, clamped to the buffer.
    new_length := length;
    if new_length > count  new_length = count;
    if new_length < 0      new_length = 0;

    // Walk backwards over the continuation bytes that end the kept prefix.
    // Afterwards 'idx - 1' is the byte in front of them (if any).
    idx := new_length;
    while idx > 0 && is_utf8_continuation_byte(data[idx - 1]) {
        idx -= 1;
    }
    continuation_bytes := new_length - idx;

    if idx == 0 {
        // The kept prefix consists only of continuation bytes: invalid UTF8,
        // nothing can be kept.
        if continuation_bytes > 0  new_length = 0;
    } else {
        // The last character is complete only when its first byte announces
        // exactly the number of continuation bytes that follow it.
        first_byte := data[idx - 1];
        complete := (continuation_bytes == 0 && (first_byte & 0x80) == 0x00)
                 || (continuation_bytes == 1 && (first_byte & 0xE0) == 0xC0)
                 || (continuation_bytes == 2 && (first_byte & 0xF0) == 0xE0)
                 || (continuation_bytes == 3 && (first_byte & 0xF8) == 0xF0);

        // Drop the incomplete character: its continuation bytes plus the
        // first byte, hence '+ 1'.
        if !complete  new_length -= (continuation_bytes + 1);
    }

    memset(data + new_length, 0, count - new_length);
    // str.count = new_length;  TODO We should be doing this...
    return new_length;
}

// Returns true when the string has no bytes at all, or when every byte is
// one of: NUL, horizontal tab, line feed, vertical tab, form feed,
// carriage return or space. Returns false at the first other byte.
// TODO Maybe rename to: is_empty
is_empty_string :: (str: string) -> bool {
    pos := 0;
    while pos < str.count {
        c := str[pos];
        blank := c == #char "\0"
              || c == #char "\t"      // horizontal tab
              || c == #char "\n"      // line feed
              || c == #char "\x0B"    // vertical tabulation
              || c == #char "\x0C"    // form feed
              || c == #char "\r"      // carriage return
              || c == #char " ";
        if !blank  return false;
        pos += 1;
    }
    return true;
}

// Counts the characters in the string by hopping from one first byte to the
// next, using count_utf8_bytes() to get the size of each hop.
// Returns 0 for an empty string.
count_characters :: (str: string) -> int {
    total  := 0;
    cursor := 0;
    while true {
        if cursor >= str.count  break;
        step := count_utf8_bytes(str[cursor]);
        total  += 1;
        cursor += step;
    }
    return total;
}

// Deletes the character at 'character_idx' (counted in characters, not
// bytes) from the string in place: the bytes after it are shifted down,
// the freed tail is zeroed and str.count is reduced.
//
// Does nothing when 'str' is null, when 'character_idx' cannot be mapped to
// a byte inside the string (negative, past the end, or empty string).
// When the character's first byte announces more bytes than remain in the
// string, only the remaining bytes are deleted.
delete_character :: (str: *string, character_idx: int) {
    if str == null  return;

    buffer_idx, found := map_character_to_buffer_idx(str.*, character_idx);
    if !found  return;
    // A successful mapping may still point one past the end (e.g. index 0 of
    // an empty string); there is no byte to delete in that case.
    if buffer_idx < 0 || buffer_idx >= str.count  return;

    bytes_to_delete := count_utf8_bytes(str.data[buffer_idx]);

    // Never delete more bytes than are left after buffer_idx, otherwise the
    // loops below would index before the buffer and count would go negative.
    remaining := str.count - buffer_idx;
    if bytes_to_delete > remaining  bytes_to_delete = remaining;

    new_count := str.count - bytes_to_delete;

    // Shift the tail down over the deleted character.
    for buffer_idx..new_count-1 {
        str.data[it] = str.data[it + bytes_to_delete];
    }
    // Zero the bytes that are no longer part of the string.
    for new_count..str.count-1 {
        str.data[it] = 0;
    }

    str.count = new_count;
}

// Maps a character index to the index of that character's first byte.
//
// Returns:
//   buffer_idx  byte index of the character; -1 when 'character_idx' is
//               negative, -2 when it is larger than str.count. Never larger
//               than str.count, even when the string ends in a character
//               whose first byte announces more bytes than are present.
//   success     true when 'character_idx' characters were stepped over.
//               Note that buffer_idx may then equal str.count (position one
//               past the last byte), so check it before dereferencing.
// TODO Maybe rename to: map_character_to_byte_idx or get_character_byte_idx
map_character_to_buffer_idx :: (str: string, character_idx: int) -> buffer_idx: int, success: bool {
    if character_idx < 0            then return -1, false;
    // A string cannot hold more characters than bytes, so this is a cheap
    // early-out before scanning.
    if character_idx > str.count    then return -2, false;
    if character_idx == 0           then return 0, true;

    buff_idx := 0;
    char_idx := 0;
    while buff_idx < str.count && char_idx != character_idx {
        buff_idx += count_utf8_bytes(str[buff_idx]);
        char_idx += 1;
    }

    // The last step can overshoot when the final character is truncated;
    // keep the result inside [0, str.count].
    if buff_idx > str.count  buff_idx = str.count;

    return buff_idx, char_idx == character_idx;
}