diff --git a/components/style/parallel.rs b/components/style/parallel.rs
index 859548d3c8b..0af6eed469c 100644
--- a/components/style/parallel.rs
+++ b/components/style/parallel.rs
@@ -26,7 +26,6 @@ use context::TraversalStatistics;
 use dom::{OpaqueNode, SendNode, TElement, TNode};
 use rayon;
 use scoped_tls::ScopedTLS;
-use sharing::STYLE_SHARING_CANDIDATE_CACHE_SIZE;
 use smallvec::SmallVec;
 use std::borrow::Borrow;
 use std::mem;
@@ -36,21 +35,10 @@ use traversal::{DomTraversal, PerLevelTraversalData, PreTraverseToken};
 /// The maximum number of child nodes that we will process as a single unit.
 ///
 /// Larger values will increase style sharing cache hits and general DOM locality
-/// at the expense of decreased opportunities for parallelism. The style sharing
-/// cache can hold 8 entries, but not all styles are shareable, so we set this
-/// value to 16. These values have not been measured and could potentially be
-/// tuned.
+/// at the expense of decreased opportunities for parallelism. This value has not
+/// been measured and could potentially be tuned.
 pub const WORK_UNIT_MAX: usize = 16;
 
-/// Verify that the style sharing cache size doesn't change. If it does, we should
-/// reconsider the above. We do this, rather than defining WORK_UNIT_MAX in terms
-/// of STYLE_SHARING_CANDIDATE_CACHE_SIZE, so that altering the latter doesn't
-/// have surprising effects on the parallelism characteristics of the style system.
-#[allow(dead_code)]
-fn static_assert() {
-    unsafe { mem::transmute::<_, [u32; STYLE_SHARING_CANDIDATE_CACHE_SIZE]>([1; 8]); }
-}
-
 /// A list of node pointers.
 ///
 /// Note that the inline storage doesn't need to be sized to WORK_UNIT_MAX, but
@@ -125,6 +113,19 @@
     }
 }
 
+/// A callback to create our thread local context. This needs to be
+/// out of line so we don't allocate stack space for the entire struct
+/// in the caller.
+#[inline(never)]
+fn create_thread_local_context<'scope, E, D>(
+    traversal: &'scope D,
+    slot: &mut Option<D::ThreadLocalContext>)
+    where E: TElement + 'scope,
+          D: DomTraversal<E>
+{
+    *slot = Some(traversal.create_thread_local_context())
+}
+
 /// A parallel top-down DOM traversal.
 ///
 /// This algorithm traverses the DOM in a breadth-first, top-down manner. The
@@ -153,7 +154,8 @@ fn top_down_dom<'a, 'scope, E, D>(nodes: &'a [SendNode<E::ConcreteNode>],
 {
     // Scope the borrow of the TLS so that the borrow is dropped before
     // a potential recursive call when we pass TailCall.
-    let mut tlc = tls.ensure(|| traversal.create_thread_local_context());
+    let mut tlc = tls.ensure(
+        |slot: &mut Option<D::ThreadLocalContext>| create_thread_local_context(traversal, slot));
 
     for n in nodes {
         // If the last node we processed produced children, spawn them off
diff --git a/components/style/scoped_tls.rs b/components/style/scoped_tls.rs
index 78537bce5f9..b1ccb4c7179 100644
--- a/components/style/scoped_tls.rs
+++ b/components/style/scoped_tls.rs
@@ -9,6 +9,7 @@
 
 use rayon;
 use std::cell::{Ref, RefCell, RefMut};
+use std::ops::DerefMut;
 
 /// A scoped TLS set, that is alive during the `'scope` lifetime.
 ///
@@ -52,11 +53,16 @@ impl<'scope, T: Send> ScopedTLS<'scope, T> {
     }
 
     /// Ensure that the current data this thread owns is initialized, or
-    /// initialize it using `f`.
-    pub fn ensure<F: FnOnce() -> T>(&self, f: F) -> RefMut<T> {
+    /// initialize it using `f`. We want ensure() to be fast and inline, and we
+    /// want to inline the memmove that initializes the Option<T>. But we don't
+    /// want to inline space for the entire large T struct in our stack frame.
+    /// That's why we hand `f` a mutable borrow to write to instead of just
+    /// having it return a T.
+    #[inline(always)]
+    pub fn ensure<F: FnOnce(&mut Option<T>)>(&self, f: F) -> RefMut<T> {
         let mut opt = self.borrow_mut();
         if opt.is_none() {
-            *opt = Some(f());
+            f(opt.deref_mut());
         }
 
         RefMut::map(opt, |x| x.as_mut().unwrap())
diff --git a/components/style/sharing/mod.rs b/components/style/sharing/mod.rs
index 621c76794e0..40e4181407d 100644
--- a/components/style/sharing/mod.rs
+++ b/components/style/sharing/mod.rs
@@ -84,8 +84,16 @@ use stylist::{ApplicableDeclarationBlock, Stylist};
 mod checks;
 
 /// The amount of nodes that the style sharing candidate cache should hold at
-/// most.
-pub const STYLE_SHARING_CANDIDATE_CACHE_SIZE: usize = 8;
+/// most. We'd somewhat like 32, but ArrayDeque only implements certain backing
+/// store sizes. A cache size of 32 would mean a backing store of 33, but
+/// that's not an implemented size: we can do 32 or 40.
+///
+/// The cache size was chosen by measuring style sharing and resulting
+/// performance on a few pages; sizes up to about 32 were giving good sharing
+/// improvements (e.g. 3x fewer styles having to be resolved than at size 8) and
+/// slight performance improvements. Sizes larger than 32 haven't really been
+/// tested.
+pub const STYLE_SHARING_CANDIDATE_CACHE_SIZE: usize = 31;
 
 /// Controls whether the style sharing cache is used.
 #[derive(Clone, Copy, PartialEq)]