unicode_reader/
graphemes.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
// Copyright (c) 2016-2019 William R. Fraser
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use unicode_segmentation::UnicodeSegmentation;
use std::io;
use std::mem;

/// Adapter over a stream of `char`s that regroups them into Unicode grapheme clusters, handing
/// back one cluster per iteration.
pub struct Graphemes<R>
where
    R: Iterator<Item = io::Result<char>>,
{
    // Underlying source of code points (or I/O errors).
    input: R,
    // Code points read so far that have not yet been handed out as a complete cluster.
    buffer: String,
    // An error from `input` that arrived while `buffer` was non-empty; it is reported on the
    // call after the buffered text has been returned.
    pending_error: Option<io::Error>,
}

impl<R: Iterator<Item = io::Result<char>>> Iterator for Graphemes<R> {
    /// The type of the elements being iterated over: an `io::Result` holding one Unicode grapheme
    /// cluster, or any I/O error raised by the underlying reader.
    type Item = io::Result<String>;

    /// Get the next grapheme cluster from the stream.
    ///
    /// Because grapheme clusters are of indeterminate length, this has to keep reading the
    /// underlying reader until the *next* cluster starts (or the input ends or fails) before it
    /// can return a grapheme.
    ///
    /// # Errors
    ///
    /// An error from the underlying reader is passed through. If it arrives while a partial
    /// cluster is buffered, the buffered text is returned first as `Ok` and the error is
    /// returned by the following call.
    fn next(&mut self) -> Option<Self::Item> {
        // An error deferred by the previous call takes priority over reading more input.
        if let Some(err) = self.pending_error.take() {
            return Some(Err(err));
        }
        loop {
            match self.input.next() {
                Some(Ok(codepoint)) => self.buffer.push(codepoint),
                None => {
                    // End of input: whatever is still buffered is the final grapheme.
                    if self.buffer.is_empty() {
                        return None;
                    }
                    return Some(Ok(mem::replace(&mut self.buffer, String::new())));
                }
                Some(Err(e)) => {
                    if self.buffer.is_empty() {
                        return Some(Err(e));
                    }
                    // If the buffer is non-empty, consider the grapheme done and return it,
                    // but save the error and raise it next time around.
                    self.pending_error = Some(e);
                    return Some(Ok(mem::replace(&mut self.buffer, String::new())));
                }
            }

            // We need at least the start of a second grapheme in the buffer before we know where
            // the first one ends, because otherwise there could be additional combining marks
            // ahead. Only the byte offset is kept, so the borrow of `self.buffer` ends here.
            let second_start = self.buffer.grapheme_indices(true).nth(1).map(|(pos, _)| pos);
            if let Some(second_pos) = second_start {
                // `second_pos` is a grapheme boundary and therefore a `char` boundary, so the
                // safe `split_off` cannot panic. It leaves the first grapheme in `self.buffer`
                // and hands back the tail, which then becomes the new buffer.
                let rest = self.buffer.split_off(second_pos);
                return Some(Ok(mem::replace(&mut self.buffer, rest)));
            }
            // Otherwise, keep reading.
        }
    }
}

impl<R: Iterator<Item = io::Result<char>>> From<R> for Graphemes<R> {
    fn from(input: R) -> Graphemes<R> {
        Graphemes {
            input,
            buffer: String::new(),
            pending_error: None,
        }
    }
}