2015-05-28 05:13:51 +00:00
|
|
|
use std::char::from_u32;
|
|
|
|
|
2016-07-11 02:11:21 +00:00
|
|
|
/// Reads a potentially multi-byte UTF-8 codepoint.
///
/// `first` is the byte that was already pulled from the stream. `next` is
/// called once for every additional byte announced by `first`.
///
/// # Errors
///
/// Returns an error message (and never panics) if:
///
/// * `first` is a continuation byte or not a usable lead byte,
/// * `next` returns `None` before the sequence is complete,
/// * one of the following bytes is not a continuation byte,
/// * the sequence is longer than 4 bytes or is an overlong encoding,
/// * the decoded value is not a Unicode scalar value (a surrogate, or
///   anything above U+10FFFF).
#[allow(dead_code)]
pub fn read_char<F>(first: u8, mut next: F) -> Result<char, String>
where
    F: FnMut() -> Option<u8>,
{
    if first < 0x80 {
        return Ok(first as char);
    }

    // Number of leading 1s determines the number of bytes we'll have to read
    let n_bytes = match (!first).leading_zeros() {
        n @ 2..=6 => n as usize,
        1 => return Err("First byte is continuation byte.".to_string()),
        7..=8 => return Err("WTF is this byte??".to_string()),
        // `first >= 0x80` here, so `!first` always has at least one leading 0.
        _ => unreachable!(),
    };

    // First, get the data - only the few last bits.
    // A lead byte with `n_bytes` leading 1s (and the 0 that follows them)
    // carries `7 - n_bytes` payload bits, hence the shifted mask.
    let mut res = u32::from(first & (0x7Fu8 >> n_bytes));

    // We already have one byte, now read the others.
    for _ in 1..n_bytes {
        let byte = next().ok_or_else(|| "Missing UTF-8 byte".to_string())?;
        if byte & 0xC0 != 0x80 {
            return Err(format!(
                "Found non-continuation byte after leading: {}",
                byte
            ));
        }
        // We have 6 fresh new bits to read, make room.
        // 0x3F is 00111111, so we keep the last 6 bits.
        res = (res << 6) | u32::from(byte & 0x3F);
    }

    // Smallest value that legitimately needs `n_bytes` bytes. Anything below
    // it is an overlong encoding, which UTF-8 forbids. The whole sequence has
    // been consumed at this point, so the caller's stream stays in sync.
    let min_value = match n_bytes {
        2 => 0x80,
        3 => 0x800,
        4 => 0x1_0000,
        _ => {
            return Err(format!(
                "Invalid UTF-8: {}-byte sequences are not allowed",
                n_bytes
            ))
        }
    };
    if res < min_value {
        return Err(format!(
            "Invalid UTF-8: overlong encoding of U+{:04X}",
            res
        ));
    }

    // from_u32 rejects surrogates and values above U+10FFFF; report those as
    // errors instead of panicking on untrusted input.
    from_u32(res).ok_or_else(|| format!("Invalid UTF-8: U+{:04X} is not a valid char", res))
}
|
|
|
|
|
|
|
|
// Builds a byte whose `n` least-significant bits are set, all others clear.
//
// Each bit position below `n` is OR-ed into the accumulator in turn.
// Shifting a `u8` by 8 or more overflows, so `n` is expected to be at most 8.
#[allow(dead_code)]
fn make_mask(n: usize) -> u8 {
    (0..n).fold(0u8, |mask, bit| mask | (1u8 << bit))
}
|