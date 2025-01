/// Return the number of bytes required to UTF-8 encode a codepoint.

/// Returns 0 for surrogates and out-of-bounds values.

///
/// Takes the codepoint as a `u32` and yields the byte count as a `usize`.

// NOTE(review): the body of this function is not intact in this copy of the
// file. Only the trailing fragment `exceeded_bit << 1` survives after the
// signature, so the behaviour described above cannot be confirmed from here.
// Restore the body from the upstream source before relying on it.

const fn utf8_bytes_for_codepoint ( codepoint : u32 ) -> usize exceeded_bit << 1

/// Length, based on the number of leading zeros.

///
/// Indexed by the count of leading zero bits in a `u32` codepoint (0 through
/// 32, hence 33 entries). Each entry is the UTF-8 encoded length in bytes for
/// a value of that magnitude, or 0 when the value has too many significant
/// bits to be encoded.
///
/// The table looks only at magnitude: surrogates (16 leading zeros) and values
/// above U+10FFFF that still fit in 21 bits (11 leading zeros) have a nonzero
/// entry here, so rejecting them has to happen outside this table.

const LEN : ( u8 ; 33 ) = (

// 0-10 leading zeros: not valid (more than 21 significant bits)

0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,

// 11-15 leading zeros: 4 bytes (17-21 significant bits)

4 , 4 , 4 , 4 , 4 ,

// 16-20 leading zeros: 3 bytes (12-16 significant bits)

3 , 3 , 3 , 3 , 3 ,

// 21-24 leading zeros: 2 bytes (8-11 significant bits)

2 , 2 , 2 , 2 ,

// 25-32 leading zeros: 1 byte (0-7 significant bits)

1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,

);

/// Encode a Unicode codepoint as UTF-8.

/// Returns a four-byte buffer and the number of valid bytes in that buffer.

///

/// To add this codepoint to a string, append all four bytes in order,

/// and record that only the returned number of bytes (the `usize`) were added to the string.

///

/// Returns a length of zero for invalid codepoints (surrogates and out-of-bounds values).

// NOTE(review): most of this function's body is missing from this copy of the
// file. The surviving fragment appears to compute the byte at index 3 by
// shifting the codepoint right by the SHIFT entry for (len, 3) and masking it
// with the MASK entry for (len, 3); the final expression yields the pair
// (buf, len). How `len` and the other three bytes are produced is not visible
// here. Restore the body from the upstream source before relying on it.

pub fn branchless_utf8 ( codepoint : u32 ) -> (( u8 ; 4 ), usize ) (( codepoint >> SHIFT ( len )( 3 )) & MASK ( len )( 3 ) as u32 ) as u8 ,

);

( buf , len )

// Shape shared by the PREFIX, SHIFT and MASK lookup tables: five rows of four
// bytes each. `branchless_utf8` indexes SHIFT and MASK by `len` first and by a
// byte position second, so rows appear to correspond to encoded lengths 0-4
// and columns to the four output bytes.

type Table = (( u8 ; 4 ); 5 );

// Byte prefix for a continuation byte: the top two bits are `10`, which leaves
// the low six bits free for payload.

const CONTINUE : u8 = 0b1000_0000 ;

// Fixed high bits for each output byte.
// NOTE(review): presumably indexed like SHIFT and MASK (row = encoded length,
// column = byte position); no use of PREFIX is visible in this copy of the
// file, so confirm against the encoder body.

const PREFIX : Table = (

// Row 0: all zeros.

( 0 u8 ; 4 ),

// Row 1: no prefix bits on any byte.

( 0 , 0 , 0 , 0 ),

// Row 2: lead byte starts with `110`, followed by one continuation byte.

( 0b1100_0000 , CONTINUE , 0 , 0 ),

// Row 3: lead byte starts with `1110`, followed by two continuation bytes.

( 0b1110_0000 , CONTINUE , CONTINUE , 0 ),

// Row 4: lead byte starts with `11110`, followed by three continuation bytes.

( 0b1111_0000 , CONTINUE , CONTINUE , CONTINUE ),

);

// Right-shift amounts applied to the codepoint for each output byte, indexed
// by `len` and then by byte position (see the use in `branchless_utf8`).
// We must arrange that the most-significant bits are always in byte 0, so the
// largest shift in each row comes first and the shifts step down by six.

const SHIFT : Table = (

// Row 0: all zeros.

( 0 u8 ; 4 ),

// Row 1: the single byte takes the codepoint unshifted.

( 0 , 0 , 0 , 0 ),

// Row 2: byte 0 takes the bits above the low six; byte 1 takes the low six.

( 6 , 0 , 0 , 0 ),

// Row 3: three bytes, shifts of 12, 6 and 0.

( 12 , 6 , 0 , 0 ),

// Row 4: four bytes, shifts of 18, 12, 6 and 0.

( 18 , 12 , 6 , 0 ),

);

const MASK : Table = (

( 0 u8 ; 4 ),

( 0x7f , 0 , 0 , 0 ),

( 0x1f , 0x3f , 0 , 0 ),

( 0x0f , 0x3f , 0x3f , 0 ),

( 0x07 , 0x3f , 0x3f , 0x3f ),