unicode_reader/
lib.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
// Copyright (c) 2016-2021 William R. Fraser
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! This crate provides adaptors which wrap byte-oriented readers and yield the UTF-8 data as
//! Unicode code points or grapheme clusters.
//!
//! Unlike other Unicode parsers which work on strings (e.g.
//! [unicode_segmentation](https://crates.io/crates/unicode_segmentation) upon which this is built),
//! this crate works on streams and doesn't require reading the entire data into memory. Instead it
//! yields the graphemes or code points as it reads them.
//!
//! # Example
//!
//! ```rust
//! extern crate unicode_reader;
//! use unicode_reader::{CodePoints, Graphemes};
//!
//! use std::io::Cursor;
//!
//! fn main() {
//!     let input = Cursor::new("He\u{302}\u{320}llo");
//!     let mut graphemes = Graphemes::from(input);
//!     assert_eq!("H",                 graphemes.next().unwrap().unwrap());
//!     assert_eq!("e\u{302}\u{320}",   graphemes.next().unwrap().unwrap()); // note 3 characters
//!     assert_eq!("l",                 graphemes.next().unwrap().unwrap());
//!     assert_eq!("l",                 graphemes.next().unwrap().unwrap());
//!     assert_eq!("o",                 graphemes.next().unwrap().unwrap());
//!     assert!(graphemes.next().is_none());
//!
//!     let greek_bytes = vec![0xCE, 0xA7, 0xCE, 0xB1, 0xCE, 0xAF, 0xCF, 0x81, 0xCE, 0xB5,
//!                            0xCF, 0x84, 0xCE, 0xB5];
//!     let mut codepoints = CodePoints::from(Cursor::new(greek_bytes));
//!     assert_eq!(vec!['Χ', 'α', 'ί', 'ρ', 'ε', 'τ', 'ε'],
//!                 codepoints.map(|r| r.unwrap())
//!                           .collect::<Vec<char>>());
//! }
//! ```
//!
//! [Repository](https://github.com/wfraser/rust_unicode_reader)
//!
//! [Documentation](https://docs.rs/unicode_reader)

#![deny(missing_docs, rust_2018_idioms)]

mod codepoints;
mod graphemes;

pub use codepoints::CodePoints;
pub use codepoints::BadUtf8Error;
pub use graphemes::Graphemes;

use std::io::{Bytes, Read};

/// Any byte-oriented reader can be turned into `CodePoints`: the reader is first adapted into
/// its byte iterator (`Read::bytes`), which is then wrapped.
impl<R: Read> From<R> for CodePoints<Bytes<R>> {
    fn from(input: R) -> Self {
        // `Read::bytes` yields `io::Result<u8>` items, one per byte read from `input`.
        let byte_stream: Bytes<R> = input.bytes();
        CodePoints::from(byte_stream)
    }
}

/// Any byte-oriented reader can be turned into `Graphemes`. The reader is first wrapped in
/// `CodePoints`, which serves as the internal layer that `Graphemes` consumes.
impl<R: Read> From<R> for Graphemes<CodePoints<Bytes<R>>> {
    fn from(input: R) -> Self {
        // Build the code point layer over the reader, then stack the grapheme layer on top.
        let codepoints: CodePoints<Bytes<R>> = CodePoints::from(input);
        Graphemes::from(codepoints)
    }
}

#[cfg(test)]
mod test {
    use super::*;
    use std::io::{self, Cursor};

    /// "Zalgo text": the word "ZALGO" where every letter carries a ridiculous number of
    /// combining marks. Each letter together with its marks must be read back as one (very
    /// long) grapheme, so the whole input is expected to yield exactly 5 graphemes.
    #[test]
    fn test_zalgo() {
        let zalgo = "Z\u{0364}\u{0364}\u{033F}\u{034C}\u{0313}\u{0300}\u{0350}\u{0352}\u{030F}\u{0309}\u{0364}\u{0369}\u{0310}\u{0343}\u{0367}\u{034C}\u{0343}\u{0344}\u{035F}\u{032E}\u{0324}\u{032A}\u{033C}\u{032D}\u{031F}\u{0359}\u{032A}\u{0329}\u{0323}\u{0320}\u{032B}\u{0326}\u{0323}\u{0347}\u{0326}\u{0354}\u{0331}A\u{0344}\u{0364}\u{0308}\u{036A}\u{036B}\u{0334}\u{0335}\u{0337}\u{035E}\u{0316}\u{0339}\u{0356}\u{0318}\u{0326}\u{0348}\u{033A}\u{031E}\u{032C}\u{0356}\u{0329}\u{0354}\u{0318}\u{032A}L\u{0312}\u{0342}\u{0357}\u{033E}\u{0343}\u{031A}\u{0301}\u{0346}\u{0334}\u{0328}\u{031C}\u{0329}\u{0349}\u{0318}\u{0349}\u{0359}\u{0329}\u{032A}\u{0355}\u{0359}\u{0332}G\u{0309}\u{0314}\u{030F}\u{036B}\u{030D}\u{036E}\u{030D}\u{0303}\u{036C}\u{030D}\u{0328}\u{031D}\u{0355}\u{035A}\u{0330}\u{0332}\u{032D}O\u{0350}\u{033F}\u{0308}\u{033F}\u{036D}\u{031A}\u{0304}\u{0350}\u{0344}\u{034B}\u{031B}\u{0322}\u{035D}\u{035C}\u{0336}\u{032A}\u{0317}\u{032C}\u{0347}\u{0316}\u{034D}\u{0323}\u{0330}\u{031E}\u{0354}\u{034E}\u{0323}\u{0326}\u{0317}";

        // Reduce every grapheme to (first_codepoint, num_bytes) so the expectation stays short.
        let summary = Graphemes::from(Cursor::new(zalgo))
            .map(|item| {
                let grapheme = item.unwrap();
                (grapheme.chars().next().unwrap(), grapheme.len())
            })
            .collect::<Vec<_>>();

        assert_eq!(vec![('Z', 75), ('A', 47), ('L', 43), ('G', 35), ('O', 59)],
                   summary);
    }

    /// Takes the next item from `reader` and asserts that it is an `io::Error` of the given
    /// `kind` whose inner error is a `BadUtf8Error` holding exactly `bad_bytes`.
    ///
    /// Panics if the iterator is exhausted, if the item is `Ok`, if the error has no inner
    /// payload, or if that payload is not a `BadUtf8Error`.
    fn assert_badutf8err<T>(
        reader: &mut dyn Iterator<Item = io::Result<T>>,
        kind: io::ErrorKind,
        bad_bytes: Vec<u8>,
    ) {
        let result = reader.next().unwrap();
        assert!(result.is_err());

        // `T` is not required to be `Debug`, so go through `err()` rather than `unwrap_err()`.
        let error: io::Error = result.err().unwrap();
        assert_eq!(kind, error.kind());

        // Dig the custom payload out of the io::Error and check which bytes it blames.
        let payload = error.into_inner().unwrap();
        let utf8_error: Box<BadUtf8Error> = payload.downcast().unwrap();
        assert_eq!(bad_bytes, utf8_error.bytes);
    }

    /// An invalid sequence in the middle of the stream is reported as `InvalidData` with the
    /// offending bytes, after which reading resumes with the following valid data.
    #[test]
    fn test_utf8_error() {
        // The 2nd byte of the 3-byte set (0x28) is invalid UTF-8.
        let input = Cursor::new(b"a\xe2\x28\xa1bc");
        let mut codepoints = CodePoints::from(input);

        // The leading 'a' is fine.
        assert_eq!('a', codepoints.next().unwrap().unwrap());

        // The next read must fail to parse a code point.
        assert_badutf8err(&mut codepoints,
                          io::ErrorKind::InvalidData,
                          vec![0xe2, 0x28, 0xa1]);

        // The reader should recover and carry on after the bad bytes.
        for expected in "bc".chars() {
            assert_eq!(expected, codepoints.next().unwrap().unwrap());
        }
        assert!(codepoints.next().is_none());
    }

    /// A stream that stops in the middle of a multi-byte sequence reports `UnexpectedEof`
    /// with the bytes read so far, and then ends.
    #[test]
    fn test_error_at_end() {
        // The input ends in an incomplete 3-byte UTF-8 sequence.
        let truncated = Cursor::new(b"a\xe2\x80");
        let mut codepoints = CodePoints::from(truncated);

        assert_eq!('a', codepoints.next().unwrap().unwrap());

        // Parsing the next code point must fail because the data ran out.
        assert_badutf8err(&mut codepoints,
                          io::ErrorKind::UnexpectedEof,
                          vec![0xe2, 0x80]);

        // With the bad bytes consumed, the only thing left to report is end of stream.
        assert!(codepoints.next().is_none());
    }

    /// Bad UTF-8 in the middle of a grapheme stream: the grapheme pending before the bad bytes
    /// is delivered first, then the error, then the remaining valid graphemes.
    #[test]
    fn test_grapheme_bad_utf8() {
        let input = Cursor::new(b"ab\xe2\x28\xa1cd");
        let mut graphemes = Graphemes::from(input);

        assert_eq!("a", graphemes.next().unwrap().unwrap());

        // Graphemes runs into the error while completing "b"; it is expected to hold the
        // error back and hand out its pending buffer first.
        assert_eq!("b", graphemes.next().unwrap().unwrap());

        // Only now should the error surface.
        assert_badutf8err(&mut graphemes,
                          io::ErrorKind::InvalidData,
                          vec![0xe2, 0x28, 0xa1]);

        // After the error it should recover and return the rest of the valid data.
        for expected in ["c", "d"].iter() {
            assert_eq!(*expected, graphemes.next().unwrap().unwrap());
        }
        assert!(graphemes.next().is_none());
    }

    /// Bad UTF-8 at the very start of a grapheme stream: with nothing pending, the error is
    /// the first item, and the valid graphemes after it are still read.
    #[test]
    fn test_grapheme_bad_utf8_at_start() {
        let input = Cursor::new(b"\xe2\x28\xa1ab");
        let mut graphemes = Graphemes::from(input);

        // The first item must be the error.
        assert_badutf8err(&mut graphemes,
                          io::ErrorKind::InvalidData,
                          vec![0xe2, 0x28, 0xa1]);

        // The rest of the input must still be read without trouble.
        for expected in ["a", "b"].iter() {
            assert_eq!(*expected, graphemes.next().unwrap().unwrap());
        }
        assert!(graphemes.next().is_none());
    }

    /// A stray continuation byte (0x80) at the start is reported on its own as `InvalidData`;
    /// the seven single-byte characters that follow are then read normally.
    #[test]
    fn test_notoria() {
        let bad: [u8; 8] = [0x80, 1, 2, 3, 4, 5, 6, 7];
        let mut codepoints = CodePoints::from(&bad[..]);

        assert_badutf8err(&mut codepoints,
            io::ErrorKind::InvalidData,
            vec![0x80]);

        // Bytes 1 through 7 must each come back as the code point with the same value.
        for i in 1u8..=7 {
            let expected = char::from(i);
            let found = codepoints.next();
            if !matches!(found, Some(Ok(c)) if c == expected) {
                panic!("at {}, expected '{:?}', found: {:?}", i, expected, found);
            }
        }
        assert!(codepoints.next().is_none());
    }
}