Auto merge of #17235 - bzbarsky:bigger-sharing-cache, r=bholley

Increase the size of the style sharing cache to 31


---
- [X] `./mach build -d` does not report any errors
- [X] `./mach test-tidy` does not report any errors
- [X] These changes fix https://bugzilla.mozilla.org/show_bug.cgi?id=1369621

- [ ] There are tests for these changes OR
- [ ] These changes do not require tests because _____



Merged by bors-servo on 2017-06-08 12:22:38 -07:00, committed by GitHub
commit 612f2c1c2a
3 changed files with 36 additions and 20 deletions


```diff
@@ -26,7 +26,6 @@ use context::TraversalStatistics;
 use dom::{OpaqueNode, SendNode, TElement, TNode};
 use rayon;
 use scoped_tls::ScopedTLS;
-use sharing::STYLE_SHARING_CANDIDATE_CACHE_SIZE;
 use smallvec::SmallVec;
 use std::borrow::Borrow;
 use std::mem;
@@ -36,21 +35,10 @@ use traversal::{DomTraversal, PerLevelTraversalData, PreTraverseToken};
 /// The maximum number of child nodes that we will process as a single unit.
 ///
 /// Larger values will increase style sharing cache hits and general DOM locality
-/// at the expense of decreased opportunities for parallelism. The style sharing
-/// cache can hold 8 entries, but not all styles are shareable, so we set this
-/// value to 16. These values have not been measured and could potentially be
-/// tuned.
+/// at the expense of decreased opportunities for parallelism. This value has not
+/// been measured and could potentially be tuned.
 pub const WORK_UNIT_MAX: usize = 16;
 
-/// Verify that the style sharing cache size doesn't change. If it does, we should
-/// reconsider the above. We do this, rather than defining WORK_UNIT_MAX in terms
-/// of STYLE_SHARING_CANDIDATE_CACHE_SIZE, so that altering the latter doesn't
-/// have surprising effects on the parallelism characteristics of the style system.
-#[allow(dead_code)]
-fn static_assert() {
-    unsafe { mem::transmute::<_, [u32; STYLE_SHARING_CANDIDATE_CACHE_SIZE]>([1; 8]); }
-}
-
 /// A list of node pointers.
 ///
 /// Note that the inline storage doesn't need to be sized to WORK_UNIT_MAX, but
@@ -125,6 +113,19 @@ pub fn traverse_dom<E, D>(traversal: &D,
     }
 }
 
+/// A callback to create our thread local context. This needs to be
+/// out of line so we don't allocate stack space for the entire struct
+/// in the caller.
+#[inline(never)]
+fn create_thread_local_context<'scope, E, D>(
+    traversal: &'scope D,
+    slot: &mut Option<D::ThreadLocalContext>)
+    where E: TElement + 'scope,
+          D: DomTraversal<E>
+{
+    *slot = Some(traversal.create_thread_local_context())
+}
+
 /// A parallel top-down DOM traversal.
 ///
 /// This algorithm traverses the DOM in a breadth-first, top-down manner. The
@@ -153,7 +154,8 @@ fn top_down_dom<'a, 'scope, E, D>(nodes: &'a [SendNode<E::ConcreteNode>],
 {
     // Scope the borrow of the TLS so that the borrow is dropped before
     // a potential recursive call when we pass TailCall.
-    let mut tlc = tls.ensure(|| traversal.create_thread_local_context());
+    let mut tlc = tls.ensure(
+        |slot: &mut Option<D::ThreadLocalContext>| create_thread_local_context(traversal, slot));
 
     for n in nodes {
         // If the last node we processed produced children, spawn them off
```
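For context, `WORK_UNIT_MAX` above caps how many child nodes the parallel traversal hands to a worker as one unit, trading DOM locality and style-sharing cache hits against available parallelism. The sketch below is purely illustrative and is not Servo's traversal code (`split_into_work_units` is a hypothetical helper); it only shows what capping work units at 16 looks like.

```rust
/// Illustrative only: split newly discovered children into work units of at
/// most WORK_UNIT_MAX nodes each. Bigger units improve per-unit locality and
/// style-sharing cache hits; more, smaller units expose more parallelism.
const WORK_UNIT_MAX: usize = 16;

fn split_into_work_units<T: Clone>(children: &[T]) -> Vec<Vec<T>> {
    children
        .chunks(WORK_UNIT_MAX)          // at most WORK_UNIT_MAX nodes per unit
        .map(|chunk| chunk.to_vec())    // each unit would be handed to a worker
        .collect()
}

fn main() {
    let nodes: Vec<u32> = (0..40).collect();
    let units = split_into_work_units(&nodes);
    // 40 children split into units of 16, 16 and 8.
    assert_eq!(units.iter().map(|u| u.len()).collect::<Vec<_>>(), vec![16, 16, 8]);
}
```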


```diff
@@ -9,6 +9,7 @@
 use rayon;
 use std::cell::{Ref, RefCell, RefMut};
+use std::ops::DerefMut;
 
 /// A scoped TLS set, that is alive during the `'scope` lifetime.
 ///
@@ -52,11 +53,16 @@ impl<'scope, T: Send> ScopedTLS<'scope, T> {
     }
 
     /// Ensure that the current data this thread owns is initialized, or
-    /// initialize it using `f`.
-    pub fn ensure<F: FnOnce() -> T>(&self, f: F) -> RefMut<T> {
+    /// initialize it using `f`. We want ensure() to be fast and inline, and we
+    /// want to inline the memmove that initializes the Option<T>. But we don't
+    /// want to inline space for the entire large T struct in our stack frame.
+    /// That's why we hand `f` a mutable borrow to write to instead of just
+    /// having it return a T.
+    #[inline(always)]
+    pub fn ensure<F: FnOnce(&mut Option<T>)>(&self, f: F) -> RefMut<T> {
         let mut opt = self.borrow_mut();
         if opt.is_none() {
-            *opt = Some(f());
+            f(opt.deref_mut());
         }
 
         RefMut::map(opt, |x| x.as_mut().unwrap())
```
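The new `ensure` signature is what lets the thread-local context be built out of line: instead of returning a `T`, the callback writes into a `&mut Option<T>` slot, so the caller's stack frame never needs room for the whole struct. Below is a self-contained sketch of that pattern under assumed names (`BigContext`, `create_context`, and the free-standing `ensure` are stand-ins, not Servo's actual types).

```rust
use std::cell::{RefCell, RefMut};

// Stand-in for a large per-thread context; the real thread-local context in
// Servo is big, which is the motivation for initializing it out of line.
struct BigContext {
    scratch: [u8; 4096],
}

// Out-of-line initializer: with #[inline(never)], the temporary BigContext
// lives in this function's frame and is moved into the slot, rather than
// being reserved in every caller of ensure().
#[inline(never)]
fn create_context(slot: &mut Option<BigContext>) {
    *slot = Some(BigContext { scratch: [0; 4096] });
}

// A minimal ensure() in the spirit of ScopedTLS::ensure: the callback fills
// the Option in place instead of returning the value.
fn ensure<T>(cell: &RefCell<Option<T>>, f: impl FnOnce(&mut Option<T>)) -> RefMut<T> {
    let mut opt = cell.borrow_mut();
    if opt.is_none() {
        f(&mut *opt);
    }
    RefMut::map(opt, |x| x.as_mut().unwrap())
}

fn main() {
    let slot: RefCell<Option<BigContext>> = RefCell::new(None);
    let ctx = ensure(&slot, |s| create_context(s));
    assert_eq!(ctx.scratch.len(), 4096);
}
```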


```diff
@@ -84,8 +84,16 @@ use stylist::{ApplicableDeclarationBlock, Stylist};
 mod checks;
 
 /// The amount of nodes that the style sharing candidate cache should hold at
-/// most.
-pub const STYLE_SHARING_CANDIDATE_CACHE_SIZE: usize = 8;
+/// most. We'd somewhat like 32, but ArrayDeque only implements certain backing
+/// store sizes. A cache size of 32 would mean a backing store of 33, but
+/// that's not an implemented size: we can do 32 or 40.
+///
+/// The cache size was chosen by measuring style sharing and resulting
+/// performance on a few pages; sizes up to about 32 were giving good sharing
+/// improvements (e.g. 3x fewer styles having to be resolved than at size 8) and
+/// slight performance improvements. Sizes larger than 32 haven't really been
+/// tested.
+pub const STYLE_SHARING_CANDIDATE_CACHE_SIZE: usize = 31;
 
 /// Controls whether the style sharing cache is used.
 #[derive(Clone, Copy, PartialEq)]
```
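On the 31-versus-32 point: a common way to implement a fixed-capacity ring buffer keeps one backing slot permanently unused so that "empty" and "full" can be told apart, which would explain why a 31-entry cache needs a 32-slot backing store. The sketch below illustrates that invariant generically; it is not the `arraydeque` crate's implementation, and `RingCache` is a made-up type.

```rust
// Why a 32-slot backing store yields a 31-entry cache: a classic ring buffer
// that distinguishes "empty" (head == tail) from "full" ((tail + 1) % N == head)
// must leave one slot permanently unused.
const BACKING_STORE: usize = 32;

struct RingCache {
    slots: [Option<u32>; BACKING_STORE],
    head: usize, // index of the oldest entry
    tail: usize, // index one past the newest entry
}

impl RingCache {
    fn new() -> Self {
        RingCache { slots: [None; BACKING_STORE], head: 0, tail: 0 }
    }

    fn is_full(&self) -> bool {
        (self.tail + 1) % BACKING_STORE == self.head
    }

    /// Insert a candidate, evicting the oldest one once the cache is full.
    fn push(&mut self, value: u32) {
        if self.is_full() {
            self.head = (self.head + 1) % BACKING_STORE; // evict oldest
        }
        self.slots[self.tail] = Some(value);
        self.tail = (self.tail + 1) % BACKING_STORE;
    }

    fn len(&self) -> usize {
        (self.tail + BACKING_STORE - self.head) % BACKING_STORE
    }
}

fn main() {
    let mut cache = RingCache::new();
    for i in 0..100 {
        cache.push(i);
    }
    // 32 slots in the backing store, but at most 31 live entries.
    assert_eq!(cache.len(), BACKING_STORE - 1);
}
```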