[−][src]Trait unicode_segmentation::UnicodeSegmentation
Methods for segmenting strings according to Unicode Standard Annex #29.
Required Methods
fn graphemes<'a>(&'a self, is_extended: bool) -> Graphemes<'a>
Returns an iterator over the grapheme clusters of self.
If is_extended is true, the iterator is over the extended grapheme clusters; otherwise, the iterator is over the legacy grapheme clusters.
UAX#29 recommends extended grapheme cluster boundaries for general processing.
Examples
let gr1 = UnicodeSegmentation::graphemes("a\u{310}e\u{301}o\u{308}\u{332}", true) .collect::<Vec<&str>>(); let b: &[_] = &["a\u{310}", "e\u{301}", "o\u{308}\u{332}"]; assert_eq!(&gr1[..], b); let gr2 = UnicodeSegmentation::graphemes("a\r\nb🇷🇺🇸🇹", true).collect::<Vec<&str>>(); let b: &[_] = &["a", "\r\n", "b", "🇷🇺", "🇸🇹"]; assert_eq!(&gr2[..], b);
fn grapheme_indices<'a>(&'a self, is_extended: bool) -> GraphemeIndices<'a>
Returns an iterator over the grapheme clusters of self and their byte offsets. See graphemes() for more information.
Examples
let gr_inds = UnicodeSegmentation::grapheme_indices("a̐éö̲\r\n", true) .collect::<Vec<(usize, &str)>>(); let b: &[_] = &[(0, "a̐"), (3, "é"), (6, "ö̲"), (11, "\r\n")]; assert_eq!(&gr_inds[..], b);
fn unicode_words<'a>(&'a self) -> UnicodeWords<'a>
Returns an iterator over the words of self, separated on UAX#29 word boundaries.
Here, "words" are just those substrings which, after splitting on UAX#29 word boundaries, contain any alphanumeric characters. That is, the substring must contain at least one character with the Alphabetic property, or with General_Category=Number.
Example
let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right?"; let uw1 = uws.unicode_words().collect::<Vec<&str>>(); let b: &[_] = &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"]; assert_eq!(&uw1[..], b);
fn split_word_bounds<'a>(&'a self) -> UWordBounds<'a>
Returns an iterator over substrings of self separated on UAX#29 word boundaries.
The concatenation of the substrings returned by this function is just the original string.
Example
let swu1 = "The quick (\"brown\")  fox".split_word_bounds().collect::<Vec<&str>>(); let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"]; assert_eq!(&swu1[..], b);
fn split_word_bound_indices<'a>(&'a self) -> UWordBoundIndices<'a>
Returns an iterator over substrings of self, split on UAX#29 word boundaries, and their offsets. See split_word_bounds() for more information.
Example
let swi1 = "Brr, it's 29.3°F!".split_word_bound_indices().collect::<Vec<(usize, &str)>>(); let b: &[_] = &[(0, "Brr"), (3, ","), (4, " "), (5, "it's"), (9, " "), (10, "29.3"), (14, "°"), (16, "F"), (17, "!")]; assert_eq!(&swi1[..], b);
Implementors
impl UnicodeSegmentation for str
[src]
impl UnicodeSegmentation for str
ⓘImportant traits for Graphemes<'a>fn graphemes(&self, is_extended: bool) -> Graphemes
[src]
fn graphemes(&self, is_extended: bool) -> Graphemes
ⓘImportant traits for GraphemeIndices<'a>fn grapheme_indices(&self, is_extended: bool) -> GraphemeIndices
[src]
fn grapheme_indices(&self, is_extended: bool) -> GraphemeIndices
ⓘImportant traits for UnicodeWords<'a>fn unicode_words(&self) -> UnicodeWords
[src]
fn unicode_words(&self) -> UnicodeWords
ⓘImportant traits for UWordBounds<'a>fn split_word_bounds(&self) -> UWordBounds
[src]
fn split_word_bounds(&self) -> UWordBounds
ⓘImportant traits for UWordBoundIndices<'a>fn split_word_bound_indices(&self) -> UWordBoundIndices
[src]
fn split_word_bound_indices(&self) -> UWordBoundIndices