Use byte indices instead of char indices for text runs

Replace character indices with UTF-8 byte offsets throughout the code dealing with text shaping and breaking. This eliminates a lot of complexity when converting from one to the other, and interoperates better with the rest of the Rust ecosystem.
2025-10-04 10:39:16 +01:00 · 2016-04-27 11:22:02 -07:00 · 2016-04-27 11:22:02 -07:00 · 659305fe0a
commit 659305fe0a
parent dba878dfb2
15 changed files with 259 additions and 437 deletions
--- a/components/layout/text.rs
+++ b/components/layout/text.rs
@ -12,7 +12,7 @@ use fragment::{ScannedTextFragmentInfo, SELECTED, SpecificFragmentInfo, Unscanne
 use gfx::font::{DISABLE_KERNING_SHAPING_FLAG, FontMetrics, IGNORE_LIGATURES_SHAPING_FLAG};
 use gfx::font::{RTL_FLAG, RunMetrics, ShapingFlags, ShapingOptions};
 use gfx::font_context::FontContext;
-use gfx::text::glyph::CharIndex;
+use gfx::text::glyph::ByteIndex;
 use gfx::text::text_run::TextRun;
 use gfx::text::util::{self, CompressionMode};
 use inline::{FIRST_FRAGMENT_OF_ELEMENT, InlineFragments, LAST_FRAGMENT_OF_ELEMENT};
@ -174,7 +174,7 @@ impl TextRunScanner {

            for (fragment_index, in_fragment) in self.clump.iter().enumerate() {
                debug!("  flushing {:?}", in_fragment);
-                let mut mapping = RunMapping::new(&run_info_list[..], &run_info, fragment_index);
+                let mut mapping = RunMapping::new(&run_info_list[..], fragment_index);
                let text;
                let selection;
                match in_fragment.specific {
@ -188,13 +188,13 @@ impl TextRunScanner {
                    Some(range) if range.is_empty() => {
                        // `range` is the range within the current fragment. To get the range
                        // within the text run, offset it by the length of the preceding fragments.
-                        Some(range.begin() + CharIndex(run_info.character_length as isize))
+                        Some(range.begin() + ByteIndex(run_info.text.len() as isize))
                    }
                    _ => None
                };

                let (mut start_position, mut end_position) = (0, 0);
-                for (char_index, character) in text.chars().enumerate() {
+                for (byte_index, character) in text.char_indices() {
                    // Search for the first font in this font group that contains a glyph for this
                    // character.
                    let mut font_index = 0;
@ -226,7 +226,7 @@ impl TextRunScanner {
                    }

                    let selected = match selection {
-                        Some(range) => range.contains(CharIndex(char_index as isize)),
+                        Some(range) => range.contains(ByteIndex(byte_index as isize)),
                        None => false
                    };

@ -251,7 +251,6 @@ impl TextRunScanner {
                                run_info = RunInfo::new();
                            }
                            mapping = RunMapping::new(&run_info_list[..],
-                                                      &run_info,
                                                      fragment_index);
                        }
                        run_info.font_index = font_index;
@ -343,11 +342,14 @@ impl TextRunScanner {
                let mut mapping = mappings.next().unwrap();
                let scanned_run = runs[mapping.text_run_index].clone();

+                let mut byte_range = Range::new(ByteIndex(mapping.byte_range.begin() as isize),
+                                                ByteIndex(mapping.byte_range.length() as isize));
+
                let requires_line_break_afterward_if_wrapping_on_newlines =
                    !mapping.byte_range.is_empty() &&
                    scanned_run.run.text.char_at_reverse(mapping.byte_range.end()) == '\n';
                if requires_line_break_afterward_if_wrapping_on_newlines {
-                    mapping.char_range.extend_by(CharIndex(-1));
+                    byte_range.extend_by(ByteIndex(-1)); // Trim the '\n'
                }

                let text_size = old_fragment.border_box.size;
@ -368,12 +370,12 @@ impl TextRunScanner {

                let mut new_text_fragment_info = box ScannedTextFragmentInfo::new(
                    scanned_run.run,
-                    mapping.char_range,
+                    byte_range,
                    text_size,
                    insertion_point,
                    flags);

-                let new_metrics = new_text_fragment_info.run.metrics_for_range(&mapping.char_range);
+                let new_metrics = new_text_fragment_info.run.metrics_for_range(&byte_range);
                let writing_mode = old_fragment.style.writing_mode;
                let bounding_box_size = bounding_box_for_run_metrics(&new_metrics, writing_mode);
                new_text_fragment_info.content_size = bounding_box_size;
@ -490,7 +492,7 @@ fn split_first_fragment_at_newline_if_necessary(fragments: &mut LinkedList<Fragm
                unscanned_text_fragment_info.text[..(position + 1)].to_owned();
            unscanned_text_fragment_info.text =
                unscanned_text_fragment_info.text[(position + 1)..].to_owned().into_boxed_str();
-            let offset = CharIndex(string_before.char_indices().count() as isize);
+            let offset = ByteIndex(string_before.len() as isize);
            match unscanned_text_fragment_info.selection {
                Some(ref mut selection) if selection.begin() >= offset => {
                    // Selection is entirely in the second fragment.
@ -500,7 +502,7 @@ fn split_first_fragment_at_newline_if_necessary(fragments: &mut LinkedList<Fragm
                Some(ref mut selection) if selection.end() > offset => {
                    // Selection is split across two fragments.
                    selection_before = Some(Range::new(selection.begin(), offset));
-                    *selection = Range::new(CharIndex(0), selection.end() - offset);
+                    *selection = Range::new(ByteIndex(0), selection.end() - offset);
                }
                _ => {
                    // Selection is entirely in the first fragment.
@ -523,11 +525,9 @@ struct RunInfo {
    /// The text that will go in this text run.
    text: String,
    /// The insertion point in this text run, if applicable.
-    insertion_point: Option<CharIndex>,
+    insertion_point: Option<ByteIndex>,
    /// The index of the applicable font in the font group.
    font_index: usize,
-    /// A cached copy of the number of Unicode characters in the text run.
-    character_length: usize,
    /// The bidirection embedding level of this text run.
    bidi_level: u8,
    /// The Unicode script property of this text run.
@ -540,7 +540,6 @@ impl RunInfo {
            text: String::new(),
            insertion_point: None,
            font_index: 0,
-            character_length: 0,
            bidi_level: 0,
            script: Script::Common,
        }
@ -552,9 +551,9 @@ impl RunInfo {
    ///   of this text run.
    fn flush(mut self,
             list: &mut Vec<RunInfo>,
-             insertion_point: &mut Option<CharIndex>) {
+             insertion_point: &mut Option<ByteIndex>) {
        if let Some(idx) = *insertion_point {
-            let char_len = CharIndex(self.character_length as isize);
+            let char_len = ByteIndex(self.text.len() as isize);
            if idx <= char_len {
                // The insertion point is in this text run.
                self.insertion_point = insertion_point.take()
@ -571,8 +570,6 @@ impl RunInfo {
 /// for it.
 #[derive(Copy, Clone, Debug)]
 struct RunMapping {
-    /// The range of characters within the text fragment.
-    char_range: Range<CharIndex>,
    /// The range of byte indices within the text fragment.
    byte_range: Range<usize>,
    /// The index of the unscanned text fragment that this mapping corresponds to.
@ -585,13 +582,10 @@ struct RunMapping {

 impl RunMapping {
    /// Given the current set of text runs, creates a run mapping for the next fragment.
-    /// `run_info_list` describes the set of runs we've seen already, and `current_run_info`
-    /// describes the run we just finished processing.
-    fn new(run_info_list: &[RunInfo], current_run_info: &RunInfo, fragment_index: usize)
+    /// `run_info_list` describes the set of runs we've seen already.
+    fn new(run_info_list: &[RunInfo], fragment_index: usize)
           -> RunMapping {
        RunMapping {
-            char_range: Range::new(CharIndex(current_run_info.character_length as isize),
-                                   CharIndex(0)),
            byte_range: Range::new(0, 0),
            old_fragment_index: fragment_index,
            text_run_index: run_info_list.len(),
@ -620,26 +614,21 @@ impl RunMapping {
        // Account for `text-transform`. (Confusingly, this is not handled in "text
        // transformation" above, but we follow Gecko in the naming.)
        let is_first_run = *start_position == 0;
-        let character_count = apply_style_transform_if_necessary(&mut run_info.text,
-                                                                 old_byte_length,
-                                                                 text_transform,
-                                                                 *last_whitespace,
-                                                                 is_first_run);
-
-        run_info.character_length = run_info.character_length + character_count;
+        apply_style_transform_if_necessary(&mut run_info.text, old_byte_length, text_transform,
+                                           *last_whitespace, is_first_run);
        *start_position = end_position;

+        let new_byte_length = run_info.text.len();
+        let is_empty = new_byte_length == old_byte_length;
+
        // Don't save mappings that contain only discarded characters.
        // (But keep ones that contained no characters to begin with, since they might have been
        // generated by an empty flow to draw its borders/padding/insertion point.)
-        let is_empty = character_count == 0;
        if is_empty && !was_empty {
            return;
        }

-        let new_byte_length = run_info.text.len();
        self.byte_range = Range::new(old_byte_length, new_byte_length - old_byte_length);
-        self.char_range.extend_by(CharIndex(character_count as isize));
        mappings.push(self)
    }

@ -648,10 +637,10 @@ impl RunMapping {
    /// NOTE: We treat the range as inclusive at both ends, since the insertion point can lie
    /// before the first character *or* after the last character, and should be drawn even if the
    /// text is empty.
-    fn contains_insertion_point(&self, insertion_point: Option<CharIndex>) -> bool {
-        match insertion_point {
+    fn contains_insertion_point(&self, insertion_point: Option<ByteIndex>) -> bool {
+        match insertion_point.map(ByteIndex::to_usize) {
            None => false,
-            Some(idx) => self.char_range.begin() <= idx && idx <= self.char_range.end()
+            Some(idx) => self.byte_range.begin() <= idx && idx <= self.byte_range.end()
        }
    }
 }
@ -666,39 +655,29 @@ fn apply_style_transform_if_necessary(string: &mut String,
                                      first_character_position: usize,
                                      text_transform: text_transform::T,
                                      last_whitespace: bool,
-                                      is_first_run: bool)
-                                      -> usize {
+                                      is_first_run: bool) {
    match text_transform {
-        text_transform::T::none => string[first_character_position..].chars().count(),
+        text_transform::T::none => {}
        text_transform::T::uppercase => {
            let original = string[first_character_position..].to_owned();
            string.truncate(first_character_position);
-            let mut count = 0;
            for ch in original.chars().flat_map(|ch| ch.to_uppercase()) {
                string.push(ch);
-                count += 1;
            }
-            count
        }
        text_transform::T::lowercase => {
            let original = string[first_character_position..].to_owned();
            string.truncate(first_character_position);
-            let mut count = 0;
            for ch in original.chars().flat_map(|ch| ch.to_lowercase()) {
                string.push(ch);
-                count += 1;
            }
-            count
        }
        text_transform::T::capitalize => {
            let original = string[first_character_position..].to_owned();
            string.truncate(first_character_position);

            let mut capitalize_next_letter = is_first_run || last_whitespace;
-            let mut count = 0;
            for character in original.chars() {
-                count += 1;
-
                // FIXME(#4311, pcwalton): Should be the CSS/Unicode notion of a *typographic
                // letter unit*, not an *alphabetic* character:
                //
@ -716,8 +695,6 @@ fn apply_style_transform_if_necessary(string: &mut String,
                    capitalize_next_letter = true
                }
            }
-
-            count
        }
    }
 }
@ -725,7 +702,7 @@ fn apply_style_transform_if_necessary(string: &mut String,
 #[derive(Clone)]
 struct ScannedTextRun {
    run: Arc<TextRun>,
-    insertion_point: Option<CharIndex>,
+    insertion_point: Option<ByteIndex>,
 }

 /// Can a character with script `b` continue a text run with script `a`?