const_str/
utf8.rs

1#![allow(unsafe_code)]
2
3use crate::printable::is_printable;
4use crate::slice::advance;
5use crate::slice::subslice;
6
7pub struct CharEncodeUtf8 {
8    buf: [u8; 4],
9    len: u8,
10}
11
12impl CharEncodeUtf8 {
13    /// Copied from [char::encode_utf8](https://github.com/rust-lang/rust/blob/0273e3bce7a0ce49e96a9662163e2380cb87e0be/library/core/src/char/methods.rs#L1600-L1645)
14    pub const fn new(ch: char) -> Self {
15        // UTF-8 ranges and tags for encoding characters
16        const TAG_CONT: u8 = 0b1000_0000;
17        const TAG_TWO_B: u8 = 0b1100_0000;
18        const TAG_THREE_B: u8 = 0b1110_0000;
19        const TAG_FOUR_B: u8 = 0b1111_0000;
20
21        let mut buf = [0; 4];
22        let len = ch.len_utf8();
23        let code = ch as u32;
24
25        match len {
26            1 => {
27                buf[0] = code as u8;
28            }
29            2 => {
30                buf[0] = ((code >> 6) & 0x1F) as u8 | TAG_TWO_B;
31                buf[1] = (code & 0x3F) as u8 | TAG_CONT;
32            }
33            3 => {
34                buf[0] = ((code >> 12) & 0x0F) as u8 | TAG_THREE_B;
35                buf[1] = ((code >> 6) & 0x3F) as u8 | TAG_CONT;
36                buf[2] = (code & 0x3F) as u8 | TAG_CONT;
37            }
38            4 => {
39                buf[0] = ((code >> 18) & 0x07) as u8 | TAG_FOUR_B;
40                buf[1] = ((code >> 12) & 0x3F) as u8 | TAG_CONT;
41                buf[2] = ((code >> 6) & 0x3F) as u8 | TAG_CONT;
42                buf[3] = (code & 0x3F) as u8 | TAG_CONT;
43            }
44            _ => {}
45        };
46
47        CharEncodeUtf8 {
48            buf,
49            len: len as u8,
50        }
51    }
52
53    pub const fn as_bytes(&self) -> &[u8] {
54        subslice(&self.buf, 0..self.len as usize)
55    }
56
57    // const since 1.55
58    pub const fn as_str(&self) -> &str {
59        unsafe { core::str::from_utf8_unchecked(self.as_bytes()) }
60    }
61}
62
63pub struct CharEscapeUnicode {
64    buf: [u8; 10],
65    len: u8,
66}
67
68impl CharEscapeUnicode {
69    const unsafe fn from_code_point(code: u32) -> Self {
70        let mut hex_buf = [0; 10];
71        let mut hex_pos = 0;
72
73        let mut x = code;
74        loop {
75            hex_buf[hex_pos] = crate::ascii::num_to_hex_digit((x as u8) & 0x0f);
76            hex_pos += 1;
77            x >>= 4;
78            if x == 0 {
79                break;
80            }
81        }
82
83        let mut buf = [b'\\', b'u', b'{', 0, 0, 0, 0, 0, 0, 0];
84        let mut pos = 3;
85
86        while hex_pos > 0 {
87            hex_pos -= 1;
88            buf[pos] = hex_buf[hex_pos];
89            pos += 1;
90        }
91
92        buf[pos] = b'}';
93        pos += 1;
94
95        Self {
96            buf,
97            len: pos as u8,
98        }
99    }
100
101    pub const fn new(ch: char) -> Self {
102        unsafe { Self::from_code_point(ch as u32) }
103    }
104
105    #[cfg(test)]
106    pub fn as_str(&self) -> &str {
107        unsafe { core::str::from_utf8_unchecked(&self.buf[..self.len as usize]) }
108    }
109}
110
/// Holder for the debug-style escape of a single `char`.
///
/// Depending on the character this is a two-byte backslash escape, the character's own
/// UTF-8 bytes, or a `\u{…}` escape.
pub struct CharEscapeDebug {
    /// Escaped text; only the first `len` bytes are meaningful.
    buf: [u8; 10],
    /// Number of bytes of `buf` in use.
    len: u8,
}
115
/// Options selecting which quote characters [`CharEscapeDebug::new`] escapes.
pub struct CharEscapeDebugArgs {
    /// When set, `'` is emitted as `\'`.
    pub escape_single_quote: bool,
    /// When set, `"` is emitted as `\"`.
    pub escape_double_quote: bool,
}

impl CharEscapeDebugArgs {
    /// Escapes both single and double quotes (test builds only).
    #[cfg(test)]
    pub const ESCAPE_ALL: Self = CharEscapeDebugArgs {
        escape_double_quote: true,
        escape_single_quote: true,
    };
}
128
129impl CharEscapeDebug {
130    pub const fn new(ch: char, args: CharEscapeDebugArgs) -> Self {
131        match ch {
132            '\0' => Self::backslash_ascii(b'0'),
133            '\t' => Self::backslash_ascii(b't'),
134            '\r' => Self::backslash_ascii(b'r'),
135            '\n' => Self::backslash_ascii(b'n'),
136            '\\' => Self::backslash_ascii(b'\\'),
137            '"' if args.escape_double_quote => Self::backslash_ascii(b'"'),
138            '\'' if args.escape_single_quote => Self::backslash_ascii(b'\''),
139            _ if is_printable(ch) => Self::printable(ch),
140            _ => Self::unicode(ch),
141        }
142    }
143
144    const fn printable(ch: char) -> Self {
145        let e = CharEncodeUtf8::new(ch);
146        Self {
147            buf: [e.buf[0], e.buf[1], e.buf[2], e.buf[3], 0, 0, 0, 0, 0, 0],
148            len: e.len,
149        }
150    }
151
152    const fn backslash_ascii(ch: u8) -> Self {
153        Self {
154            buf: [b'\\', ch, 0, 0, 0, 0, 0, 0, 0, 0],
155            len: 2,
156        }
157    }
158
159    const fn unicode(ch: char) -> Self {
160        let e = CharEscapeUnicode::new(ch);
161        Self {
162            buf: e.buf,
163            len: e.len,
164        }
165    }
166
167    pub const fn as_bytes(&self) -> &[u8] {
168        subslice(&self.buf, 0..self.len as usize)
169    }
170
171    #[cfg(test)]
172    pub fn as_str(&self) -> &str {
173        unsafe { core::str::from_utf8_unchecked(&self.buf[..self.len as usize]) }
174    }
175
176    // pub const fn to_str_buf<const N: usize>(&self) -> StrBuf<N> {
177    //     let buf = crate::bytes::clone(self.as_bytes());
178    //     unsafe { StrBuf::new_unchecked(buf) }
179    // }
180}
181
/// Decodes the first UTF-8 encoded character of `bytes`.
///
/// Returns `Some((ch, width))`, where `width` is the number of bytes consumed, or `None`
/// when `bytes` is empty.
///
/// `bytes` is expected to be the (remaining) bytes of a valid `str`. The decoder does not
/// check continuation bytes, and a truncated sequence is decoded as if the missing bytes
/// were zero, so malformed input may yield a meaningless (but valid) `char`.
///
/// # Panics
///
/// Panics if the decoded value is not a Unicode scalar value (a surrogate, or a value
/// above `0x10FFFF`). This can only happen for input that is not valid UTF-8; during
/// const evaluation the panic surfaces as a compile error.
pub const fn next_char(bytes: &[u8]) -> Option<(char, usize)> {
    /// Copied from [core::str::validations](https://github.com/rust-lang/rust/blob/e7958d35ca2c898a223efe402481e0ecb854310a/library/core/src/str/validations.rs#L7-L68)
    #[allow(clippy::many_single_char_names)]
    const fn next_code_point(bytes: &[u8]) -> Option<(u32, usize)> {
        /// Mask selecting the six payload bits of a continuation byte.
        const CONT_MASK: u8 = 0b0011_1111;

        /// Extracts the payload bits of a lead byte for a sequence of `width` bytes.
        const fn utf8_first_byte(byte: u8, width: u32) -> u32 {
            (byte & (0x7F >> width)) as u32
        }

        /// Appends the six payload bits of continuation `byte` to `ch`.
        const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
            (ch << 6) | (byte & CONT_MASK) as u32
        }

        /// `Option::unwrap_or(0)`, spelled out because this must be callable in `const fn`.
        #[allow(clippy::manual_unwrap_or_default)] // FIXME
        const fn unwrap_or_0(opt: Option<u8>) -> u8 {
            match opt {
                Some(byte) => byte,
                None => 0,
            }
        }

        // Number of bytes consumed so far.
        let mut i = 0;

        // Yields the next byte and advances `i`, or `None` (without advancing) at the end.
        macro_rules! next {
            () => {{
                if i < bytes.len() {
                    let x = Some(bytes[i]);
                    i += 1;
                    x
                } else {
                    None
                }
            }};
        }

        let x = match next!() {
            Some(x) => x,
            None => return None,
        };
        // Single-byte (ASCII) fast path.
        if x < 128 {
            return Some((x as u32, i));
        }

        // Multibyte case follows.
        // Decode from a byte combination out of: [[[x y] z] w]
        let init = utf8_first_byte(x, 2);
        let y = unwrap_or_0(next!());
        let mut ch = utf8_acc_cont_byte(init, y);
        if x >= 0xE0 {
            // [[x y z] w] case: 5th bit of `x` in 0xE0..=0xEF is always clear,
            // so `init` is still valid.
            let z = unwrap_or_0(next!());
            let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
            ch = (init << 12) | y_z;
            if x >= 0xF0 {
                // [x y z w] case: use only the lower 3 bits of `init`.
                let w = unwrap_or_0(next!());
                ch = ((init & 7) << 18) | utf8_acc_cont_byte(y_z, w);
            }
        }

        Some((ch, i))
    }

    match next_code_point(bytes) {
        // Use the checked conversion: this function is safe and takes arbitrary bytes, so
        // it must never materialise an invalid `char` from malformed input.
        Some((code, width)) => match char::from_u32(code) {
            Some(ch) => Some((ch, width)),
            None => panic!("next_char: input is not valid UTF-8"),
        },
        None => None,
    }
}
247
248pub const fn str_count_chars(s: &str) -> usize {
249    let mut s = s.as_bytes();
250    let mut ans = 0;
251    while let Some((_, count)) = next_char(s) {
252        s = advance(s, count);
253        ans += 1;
254    }
255    ans
256}
257
258pub const fn str_chars<const N: usize>(s: &str) -> [char; N] {
259    let mut s = s.as_bytes();
260    let mut buf: [char; N] = ['\0'; N];
261    let mut pos = 0;
262    while let Some((ch, count)) = next_char(s) {
263        s = advance(s, count);
264        buf[pos] = ch;
265        pos += 1;
266    }
267    assert!(pos == N);
268    buf
269}
270
#[cfg(test)]
mod tests {
    use super::*;

    /// `CharEncodeUtf8` must agree with `char::encode_utf8`.
    #[test]
    fn test_char_encode_utf8() {
        fn check(ch: char) {
            let encoded = CharEncodeUtf8::new(ch);
            let mut scratch = [0; 4];
            let expected: &str = ch.encode_utf8(&mut scratch);
            assert_eq!(encoded.as_str(), expected);
        }

        check('\0');
        check('我');
        check('\u{10ffff}');
    }

    /// `CharEscapeUnicode` must agree with `char::escape_unicode`.
    #[test]
    fn test_char_escape_unicode() {
        fn check(ch: char) {
            let escaped = CharEscapeUnicode::new(ch);
            let expected = ch.escape_unicode().to_string();
            assert_eq!(escaped.as_str(), expected);
        }

        check('\0');
        check('我');
        check('\u{10ffff}');
    }

    /// `CharEscapeDebug` with every quote escaped must agree with `char::escape_debug`.
    #[test]
    fn test_char_escape_debug() {
        fn check(ch: char) {
            let escaped = CharEscapeDebug::new(ch, CharEscapeDebugArgs::ESCAPE_ALL);
            let expected = ch.escape_debug().to_string();
            assert_eq!(escaped.as_str(), expected);
        }

        // Every ASCII character.
        for ch in '\0'..='\u{7f}' {
            check(ch);
        }

        // check('我');
        check('\u{10ffff}');
    }

    /// `str_count_chars` and `str_chars` evaluated at compile time must match `str::chars`.
    #[test]
    fn test_str_chars() {
        const X: &str = "唐可可";
        const OUTPUT_LEN: usize = str_count_chars(X);
        const OUTPUT_BUF: [char; OUTPUT_LEN] = str_chars::<OUTPUT_LEN>(X);
        let expected: Vec<char> = X.chars().collect();
        assert_eq!(OUTPUT_BUF, expected.as_slice());
    }
}