diff --git a/components/style/parallel.rs b/components/style/parallel.rs index 358eeffd52c..3703559cd98 100644 --- a/components/style/parallel.rs +++ b/components/style/parallel.rs @@ -34,9 +34,16 @@ use traversal::{DomTraversal, PerLevelTraversalData, PreTraverseToken}; /// The maximum number of child nodes that we will process as a single unit. /// -/// Larger values will increase style sharing cache hits and general DOM locality -/// at the expense of decreased opportunities for parallelism. This value has not -/// been measured and could potentially be tuned. +/// Larger values will increase style sharing cache hits and general DOM +/// locality at the expense of decreased opportunities for parallelism. There +/// are some measurements in +/// https://bugzilla.mozilla.org/show_bug.cgi?id=1385982#c11 and comments 12 +/// and 13 that investigate some slightly different values for the work unit +/// size. If the size is significantly increased, make sure to adjust the +/// condition for kicking off a new work unit in top_down_dom, because +/// otherwise we're likely to end up doing too much work serially. For +/// example, the condition there could become some fraction of WORK_UNIT_MAX +/// instead of WORK_UNIT_MAX. pub const WORK_UNIT_MAX: usize = 16; /// A set of nodes, sized to the work unit. This gets copied when sent to other @@ -137,7 +144,8 @@ fn create_thread_local_context<'scope, E, D>( /// * Never process a child before its parent (since child style depends on /// parent style). If this were to happen, the styling algorithm would panic. /// * Prioritize discovering nodes as quickly as possible to maximize -/// opportunities for parallelism. +/// opportunities for parallelism. But this needs to be weighed against +/// styling cousins on a single thread to improve sharing. /// * Style all the children of a given node (i.e. all sibling nodes) on /// a single thread (with an upper bound to handle nodes with an /// abnormally large number of children). This is important because we use @@ -173,10 +181,10 @@ fn top_down_dom<'a, 'scope, E, D>(nodes: &'a [SendNode], }; for n in nodes { - // If the last node we processed produced children, spawn them off - // into a work item. We do this at the beginning of the loop (rather - // than at the end) so that we can traverse the children of the last - // sibling directly on this thread without a spawn call. + // If the last node we processed produced children, we may want to + // spawn them off into a work item. We do this at the beginning of + // the loop (rather than at the end) so that we can traverse our + // last bits of work directly on this thread without a spawn call. // // This has the important effect of removing the allocation and // context-switching overhead of the parallel traversal for perfectly @@ -184,8 +192,33 @@ fn top_down_dom<'a, 'scope, E, D>(nodes: &'a [SendNode], // // // - // Which are not at all uncommon. - if !discovered_child_nodes.is_empty() { + // which are not at all uncommon. + // + // There's a tension here between spawning off a work item as soon + // as discovered_child_nodes is nonempty and waiting until we have a + // full work item to do so. The former optimizes for speed of + // discovery (we'll start discovering the kids of the things in + // "nodes" ASAP). The latter gives us better sharing (e.g. we can + // share between cousins much better, because we don't hand them off + // as separate work items, which are likely to end up on separate + // threads) and gives us a chance to just handle everything on this + // thread for small DOM subtrees, as in the linear example above. + // + // There are performance and "number of ComputedValues" + // measurements for various testcases in + // https://bugzilla.mozilla.org/show_bug.cgi?id=1385982#c10 and + // following. + // + // The worst case behavior for waiting until we have a full work + // item is a deep tree which has WORK_UNIT_MAX "linear" branches, + // hence WORK_UNIT_MAX elements at each level. Such a tree would + // end up getting processed entirely sequentially, because we would + // process each level one at a time as a single work unit, whether + // via our end-of-loop tail call or not. If we kicked off a + // traversal as soon as we discovered kids, we would instead + // process such a tree more or less with a thread-per-branch, + // multiplexed across our actual threadpool. + if discovered_child_nodes.len() >= WORK_UNIT_MAX { let mut traversal_data_copy = traversal_data.clone(); traversal_data_copy.current_dom_depth += 1; traverse_nodes(&*discovered_child_nodes, @@ -213,9 +246,9 @@ fn top_down_dom<'a, 'scope, E, D>(nodes: &'a [SendNode], } } - // Handle the children of the last element in this work unit. If any exist, - // we can process them (or at least one work unit's worth of them) directly - // on this thread by passing TailCall. + // Handle whatever elements we have queued up but not kicked off traversals + // for yet. If any exist, we can process them (or at least one work unit's + // worth of them) directly on this thread by passing TailCall. if !discovered_child_nodes.is_empty() { traversal_data.current_dom_depth += 1; traverse_nodes(&discovered_child_nodes,