diff --git a/components/style/parallel.rs b/components/style/parallel.rs
index 859548d3c8b..0af6eed469c 100644
--- a/components/style/parallel.rs
+++ b/components/style/parallel.rs
@@ -26,7 +26,6 @@ use context::TraversalStatistics;
 use dom::{OpaqueNode, SendNode, TElement, TNode};
 use rayon;
 use scoped_tls::ScopedTLS;
-use sharing::STYLE_SHARING_CANDIDATE_CACHE_SIZE;
 use smallvec::SmallVec;
 use std::borrow::Borrow;
 use std::mem;
@@ -36,21 +35,10 @@ use traversal::{DomTraversal, PerLevelTraversalData, PreTraverseToken};
 /// The maximum number of child nodes that we will process as a single unit.
 ///
 /// Larger values will increase style sharing cache hits and general DOM locality
-/// at the expense of decreased opportunities for parallelism. The style sharing
-/// cache can hold 8 entries, but not all styles are shareable, so we set this
-/// value to 16. These values have not been measured and could potentially be
-/// tuned.
+/// at the expense of decreased opportunities for parallelism. This value has not
+/// been measured and could potentially be tuned.
 pub const WORK_UNIT_MAX: usize = 16;
 
-/// Verify that the style sharing cache size doesn't change. If it does, we should
-/// reconsider the above. We do this, rather than defining WORK_UNIT_MAX in terms
-/// of STYLE_SHARING_CANDIDATE_CACHE_SIZE, so that altering the latter doesn't
-/// have surprising effects on the parallelism characteristics of the style system.
-#[allow(dead_code)]
-fn static_assert() {
-    unsafe { mem::transmute::<_, [u32; STYLE_SHARING_CANDIDATE_CACHE_SIZE]>([1; 8]); }
-}
-
 /// A list of node pointers.
 ///
 /// Note that the inline storage doesn't need to be sized to WORK_UNIT_MAX, but
@@ -125,6 +113,19 @@
     }
 }
 
+/// A callback to create our thread local context. This needs to be
+/// out of line so we don't allocate stack space for the entire struct
+/// in the caller.
+#[inline(never)]
+fn create_thread_local_context<'scope, E, D>(
+    traversal: &'scope D,
+    slot: &mut Option<D::ThreadLocalContext>)
+    where E: TElement + 'scope,
+          D: DomTraversal<E>
+{
+    *slot = Some(traversal.create_thread_local_context())
+}
+
 /// A parallel top-down DOM traversal.
 ///
 /// This algorithm traverses the DOM in a breadth-first, top-down manner. The
@@ -153,7 +154,8 @@ fn top_down_dom<'a, 'scope, E, D>(nodes: &'a [SendNode<E::ConcreteNode>],
 {
     // Scope the borrow of the TLS so that the borrow is dropped before
     // a potential recursive call when we pass TailCall.
-    let mut tlc = tls.ensure(|| traversal.create_thread_local_context());
+    let mut tlc = tls.ensure(
+        |slot: &mut Option<D::ThreadLocalContext>| create_thread_local_context(traversal, slot));
 
     for n in nodes {
         // If the last node we processed produced children, spawn them off
diff --git a/components/style/scoped_tls.rs b/components/style/scoped_tls.rs
index 78537bce5f9..b1ccb4c7179 100644
--- a/components/style/scoped_tls.rs
+++ b/components/style/scoped_tls.rs
@@ -9,6 +9,7 @@
 
 use rayon;
 use std::cell::{Ref, RefCell, RefMut};
+use std::ops::DerefMut;
 
 /// A scoped TLS set, that is alive during the `'scope` lifetime.
 ///
@@ -52,11 +53,16 @@ impl<'scope, T: Send> ScopedTLS<'scope, T> {
     }
 
     /// Ensure that the current data this thread owns is initialized, or
-    /// initialize it using `f`.
-    pub fn ensure<F: FnOnce() -> T>(&self, f: F) -> RefMut<T> {
+    /// initialize it using `f`. We want ensure() to be fast and inline, and we
+    /// want to inline the memmove that initializes the Option<T>. But we don't
+    /// want to inline space for the entire large T struct in our stack frame.
+    /// That's why we hand `f` a mutable borrow to write to instead of just
+    /// having it return a T.
+    #[inline(always)]
+    pub fn ensure<F: FnOnce(&mut Option<T>)>(&self, f: F) -> RefMut<T> {
         let mut opt = self.borrow_mut();
         if opt.is_none() {
-            *opt = Some(f());
+            f(opt.deref_mut());
         }
 
         RefMut::map(opt, |x| x.as_mut().unwrap())
diff --git a/components/style/sharing/mod.rs b/components/style/sharing/mod.rs
index 621c76794e0..40e4181407d 100644
--- a/components/style/sharing/mod.rs
+++ b/components/style/sharing/mod.rs
@@ -84,8 +84,16 @@ use stylist::{ApplicableDeclarationBlock, Stylist};
 mod checks;
 
 /// The amount of nodes that the style sharing candidate cache should hold at
-/// most.
-pub const STYLE_SHARING_CANDIDATE_CACHE_SIZE: usize = 8;
+/// most. We'd somewhat like 32, but ArrayDeque only implements certain backing
+/// store sizes. A cache size of 32 would mean a backing store of 33, but
+/// that's not an implemented size: we can do 32 or 40.
+///
+/// The cache size was chosen by measuring style sharing and resulting
+/// performance on a few pages; sizes up to about 32 were giving good sharing
+/// improvements (e.g. 3x fewer styles having to be resolved than at size 8) and
+/// slight performance improvements. Sizes larger than 32 haven't really been
+/// tested.
+pub const STYLE_SHARING_CANDIDATE_CACHE_SIZE: usize = 31;
 
 /// Controls whether the style sharing cache is used.
 #[derive(Clone, Copy, PartialEq)]