// unicode_reader/codepoints.rs

// Copyright (c) 2016-2021 William R. Fraser
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use std::error::Error;
use std::fmt;
use std::io;
use std::str;

use smallvec::SmallVec;

/// Wraps a byte-oriented reader and yields the UTF-8 data one code point at a time.
///
/// Bytes that cannot be decoded are reported as an `io::Error` wrapping a `BadUtf8Error`:
/// `ErrorKind::InvalidData` for malformed data found mid-stream, and `ErrorKind::UnexpectedEof`
/// when the stream ends with leftover bytes that do not form a code point.
pub struct CodePoints<R: Iterator<Item = io::Result<u8>>> {
    /// The underlying source of bytes.
    input: R,
    /// Bytes already read from `input` that have not yet been returned as a `char` or reported
    /// as bad. Inline capacity is 4, the longest UTF-8 encoding of a single code point.
    buffer: SmallVec<[u8; 4]>,
}

impl<R: Iterator<Item = io::Result<u8>>> Iterator for CodePoints<R> {
    /// The type of the elements being iterated over: a `io::Result` with one Unicode code point
    /// (as a `char`), or any I/O error raised by the underlying reader, or any error encountered
    /// while trying to parse the byte stream as UTF-8.
    type Item = io::Result<char>;

    /// Get the next Unicode code point from the stream. Any malformed UTF-8 data will be returned
    /// as an `io::Error` with `ErrorKind::InvalidData`, including if the stream reaches EOF before
    /// a complete code point is read (which is returned as `ErrorKind::UnexpectedEof`). Any I/O
    /// error raised by the underlying stream will be returned as well.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            if !self.buffer.is_empty() {
                // See if we have a valid codepoint.
                match str::from_utf8(&self.buffer) {
                    Ok(s) => {
                        let mut chars = s.chars();
                        let c = chars.next().unwrap();
                        if c.len_utf8() < self.buffer.len() {
                            self.buffer = SmallVec::from_slice(&self.buffer[c.len_utf8()..]);
                        } else {
                            self.buffer.clear();
                        }
                        return Some(Ok(c));
                    }
                    Err(e) => {
                        if self.buffer.len() - e.valid_up_to() >= 4 {
                            // If we have 4 bytes that still don't make up a valid code point, then
                            // we have garbage.
                            // Remove leading bytes until either the buffer is empty, or we have a
                            // valid code point.
                            let mut split_point = 1;
                            let mut badbytes = vec![];
                            loop {
                                let (bad, rest) = self.buffer.split_at(split_point);
                                if rest.is_empty() || str::from_utf8(rest).is_ok() {
                                    badbytes.extend_from_slice(bad);
                                    self.buffer = SmallVec::from_slice(rest);
                                    break;
                                }
                                split_point += 1;
                            }

                            // Raise the error. If we still have data in the buffer, it will be
                            // returned on the next loop.
                            return Some(Err(io::Error::new(io::ErrorKind::InvalidData,
                                                           BadUtf8Error { bytes: badbytes })));
                        }
                        // else: We probably have a partial code point. Keep reading bytes to find
                        // out.
                    }
                }
            }
            match self.input.next() {
                Some(Ok(byte)) => {
                    self.buffer.push(byte);
                }
                None => {
                    if self.buffer.is_empty() {
                        return None;
                    } else {
                        // Invalid utf-8 at end of stream.
                        let bytes = self.buffer.to_vec();
                        self.buffer = SmallVec::new();
                        return Some(Err(io::Error::new(io::ErrorKind::UnexpectedEof,
                                                       BadUtf8Error { bytes })));
                    }
                }
                Some(Err(e)) => {
                    return Some(Err(e));
                }
            }
        }
    }
}

impl<R> From<R> for CodePoints<R>
where
    R: Iterator<Item = io::Result<u8>>,
{
    /// Wraps the byte iterator `input`, starting with an empty pending-byte buffer.
    fn from(input: R) -> Self {
        let buffer = SmallVec::new();
        Self { input, buffer }
    }
}

/// An error raised when parsing a UTF-8 byte stream fails.
///
/// It is carried as the inner error of the `io::Error` values produced while decoding.
/// The type can be cloned and compared, so callers and tests can check exactly which bytes were
/// rejected.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BadUtf8Error {
    /// The bytes that could not be parsed as a code point.
    pub bytes: Vec<u8>,
}

impl Error for BadUtf8Error {
    /// Returns the fixed string `"BadUtf8Error"`.
    ///
    /// NOTE(review): `Error::description` is deprecated in the standard library in favour of
    /// the `Display` implementation. The override is left in place because removing it would
    /// change what callers of `description()` get back.
    fn description(&self) -> &str {
        "BadUtf8Error"
    }
}

impl fmt::Display for BadUtf8Error {
    /// Writes `Bad UTF-8: ` followed by the `Debug` rendering of the rejected bytes, e.g.
    /// `Bad UTF-8: [255, 254]`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let rejected = &self.bytes;
        f.write_fmt(format_args!("Bad UTF-8: {:?}", rejected))
    }
}