From 2eec1e69ea69e2507ebab88b40f22737e26e4674 Mon Sep 17 00:00:00 2001 From: yvt Date: Sun, 20 Jun 2021 00:04:20 +0900 Subject: [PATCH 1/3] refactor(bhm): reduce item visibility --- .../background_hang_monitor.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/components/background_hang_monitor/background_hang_monitor.rs b/components/background_hang_monitor/background_hang_monitor.rs index b97ec3bceec..fb51df811a4 100644 --- a/components/background_hang_monitor/background_hang_monitor.rs +++ b/components/background_hang_monitor/background_hang_monitor.rs @@ -103,7 +103,7 @@ impl BackgroundHangMonitorClone for HangMonitorRegister { } /// Messages sent from monitored components to the monitor. -pub enum MonitoredComponentMsg { +enum MonitoredComponentMsg { /// Register component for monitoring, Register( Box, @@ -123,7 +123,7 @@ pub enum MonitoredComponentMsg { /// A wrapper around a sender to the monitor, /// which will send the Id of the monitored component along with each message, /// and keep track of whether the monitor is still listening on the other end. -pub struct BackgroundHangMonitorChan { +struct BackgroundHangMonitorChan { sender: Sender<(MonitoredComponentId, MonitoredComponentMsg)>, component_id: MonitoredComponentId, disconnected: Cell, @@ -131,7 +131,7 @@ pub struct BackgroundHangMonitorChan { } impl BackgroundHangMonitorChan { - pub fn new( + fn new( sender: Sender<(MonitoredComponentId, MonitoredComponentMsg)>, component_id: MonitoredComponentId, monitoring_enabled: bool, @@ -144,7 +144,7 @@ impl BackgroundHangMonitorChan { } } - pub fn send(&self, msg: MonitoredComponentMsg) { + fn send(&self, msg: MonitoredComponentMsg) { if self.disconnected.get() { return; } @@ -188,7 +188,7 @@ struct MonitoredComponent { struct Sample(MonitoredComponentId, Instant, NativeStack); -pub struct BackgroundHangMonitorWorker { +struct BackgroundHangMonitorWorker { component_names: HashMap, monitored_components: HashMap, constellation_chan: IpcSender, @@ -204,7 +204,7 @@ pub struct BackgroundHangMonitorWorker { } impl BackgroundHangMonitorWorker { - pub fn new( + fn new( constellation_chan: IpcSender, control_port: IpcReceiver, port: Receiver<(MonitoredComponentId, MonitoredComponentMsg)>, @@ -268,7 +268,7 @@ impl BackgroundHangMonitorWorker { .send(HangMonitorAlert::Profile(bytes)); } - pub fn run(&mut self) -> bool { + fn run(&mut self) -> bool { let tick = if let Some(duration) = self.sampling_duration { let duration = duration .checked_sub(Instant::now() - self.last_sample) From 18c79cafac8c93a4521d1f9eeae646068dbaa36f Mon Sep 17 00:00:00 2001 From: yvt Date: Sun, 20 Jun 2021 10:44:40 +0900 Subject: [PATCH 2/3] fix(bhm): deliver exit signal reliably when component registration and signal submission coincide > There's a race condition between the reception of > `BackgroundHangMonitorControlMsg::Exit` and `MonitoredComponentMsg:: > Register`. When the worker receives `Exit`, it stops receiving > messages, and any remaining messages (including the > `MonitoredComponentMsg::Register` we sent) in the channel are dropped. > Wrapping `exit_signal` with this RAII wrapper ensures the exit signal > is delivered even in such cases. This should (hopefully) eliminate the intermittent hang-ups in the test case `test_hang_monitoring_exit_signal` for good. --- .../background_hang_monitor.rs | 56 +++++++++-- .../tests/hang_monitor_tests.rs | 93 ++++++++++++++----- 2 files changed, 119 insertions(+), 30 deletions(-) diff --git a/components/background_hang_monitor/background_hang_monitor.rs b/components/background_hang_monitor/background_hang_monitor.rs index fb51df811a4..7d06f61f633 100644 --- a/components/background_hang_monitor/background_hang_monitor.rs +++ b/components/background_hang_monitor/background_hang_monitor.rs @@ -44,6 +44,11 @@ impl HangMonitorRegister { while monitor.run() { // Monitoring until all senders have been dropped... } + + // Suppress `signal_to_exit` + for (_, mut component) in monitor.monitored_components.drain() { + component.exit_signal.release(); + } }) .expect("Couldn't start BHM worker."); Box::new(HangMonitorRegister { @@ -85,6 +90,15 @@ impl BackgroundHangMonitorRegister for HangMonitorRegister { #[cfg(any(target_os = "android", target_arch = "arm", target_arch = "aarch64"))] let sampler = crate::sampler::DummySampler::new(); + // There's a race condition between the reception of + // `BackgroundHangMonitorControlMsg::Exit` and + // `MonitoredComponentMsg::Register`. When the worker receives `Exit`, + // it stops receiving messages, and any remaining messages (including + // the `MonitoredComponentMsg::Register` we sent) in the channel are + // dropped. Wrapping `exit_signal` with this RAII wrapper ensures the + // exit signal is delivered even in such cases. + let exit_signal = SignalToExitOnDrop(exit_signal); + bhm_chan.send(MonitoredComponentMsg::Register( sampler, thread::current().name().map(str::to_owned), @@ -110,7 +124,7 @@ enum MonitoredComponentMsg { Option, Duration, Duration, - Option>, + SignalToExitOnDrop, ), /// Unregister component for monitoring. Unregister, @@ -174,6 +188,33 @@ impl BackgroundHangMonitor for BackgroundHangMonitorChan { } } +/// Wraps [`BackgroundHangMonitorExitSignal`] and calls `signal_to_exit` when +/// dropped. +struct SignalToExitOnDrop(Option>); + +impl SignalToExitOnDrop { + /// Call `BackgroundHangMonitorExitSignal::signal_to_exit` now. + fn signal_to_exit(&mut self) { + if let Some(signal) = self.0.take() { + signal.signal_to_exit(); + } + } + + /// Disassociate `BackgroundHangMonitorExitSignal` from itself, preventing + /// `BackgroundHangMonitorExitSignal::signal_to_exit` from being called in + /// the future. + fn release(&mut self) { + self.0 = None; + } +} + +impl Drop for SignalToExitOnDrop { + #[inline] + fn drop(&mut self) { + self.signal_to_exit(); + } +} + struct MonitoredComponent { sampler: Box, last_activity: Instant, @@ -183,7 +224,7 @@ struct MonitoredComponent { sent_transient_alert: bool, sent_permanent_alert: bool, is_waiting: bool, - exit_signal: Option>, + exit_signal: SignalToExitOnDrop, } struct Sample(MonitoredComponentId, Instant, NativeStack); @@ -306,10 +347,8 @@ impl BackgroundHangMonitorWorker { return true; }, Ok(BackgroundHangMonitorControlMsg::Exit(sender)) => { - for component in self.monitored_components.values() { - if let Some(signal) = component.exit_signal.as_ref() { - signal.signal_to_exit(); - } + for component in self.monitored_components.values_mut() { + component.exit_signal.signal_to_exit(); } // Confirm exit with to the constellation. @@ -379,10 +418,13 @@ impl BackgroundHangMonitorWorker { ); }, (component_id, MonitoredComponentMsg::Unregister) => { - let _ = self + let (_, mut component) = self .monitored_components .remove_entry(&component_id) .expect("Received Unregister for an unknown component"); + + // Prevent `signal_to_exit` from being called + component.exit_signal.release(); }, (component_id, MonitoredComponentMsg::NotifyActivity(annotation)) => { let component = self diff --git a/components/background_hang_monitor/tests/hang_monitor_tests.rs b/components/background_hang_monitor/tests/hang_monitor_tests.rs index 23b32dd1bbb..b92344039bd 100644 --- a/components/background_hang_monitor/tests/hang_monitor_tests.rs +++ b/components/background_hang_monitor/tests/hang_monitor_tests.rs @@ -164,10 +164,45 @@ fn test_hang_monitoring_unregister() { assert!(background_hang_monitor_receiver.try_recv().is_err()); } +// Perform two certain steps in `test_hang_monitoring_exit_signal_inner` in +// different orders to check for the race condition that +// caused and +// . #[test] -// https://github.com/servo/servo/issues/28270 -#[cfg(not(any(target_os = "windows", target_os = "macos")))] -fn test_hang_monitoring_exit_signal() { +fn test_hang_monitoring_exit_signal1() { + test_hang_monitoring_exit_signal_inner(|e1, e2| { + e1(); + thread::sleep(Duration::from_millis(100)); + e2(); + }); +} + +#[test] +fn test_hang_monitoring_exit_signal2() { + test_hang_monitoring_exit_signal_inner(|e1, e2| { + e1(); + e2(); + }); +} + +#[test] +fn test_hang_monitoring_exit_signal3() { + test_hang_monitoring_exit_signal_inner(|e1, e2| { + e2(); + e1(); + }); +} + +#[test] +fn test_hang_monitoring_exit_signal4() { + test_hang_monitoring_exit_signal_inner(|e1, e2| { + e2(); + thread::sleep(Duration::from_millis(100)); + e1(); + }); +} + +fn test_hang_monitoring_exit_signal_inner(op_order: fn(&mut dyn FnMut(), &mut dyn FnMut())) { let _lock = SERIAL.lock().unwrap(); let (background_hang_monitor_ipc_sender, _background_hang_monitor_receiver) = @@ -185,9 +220,9 @@ fn test_hang_monitoring_exit_signal() { } let closing = Arc::new(AtomicBool::new(false)); - let signal = BHMExitSignal { + let mut signal = Some(Box::new(BHMExitSignal { closing: closing.clone(), - }; + })); // Init a worker, without active monitoring. let background_hang_monitor_register = HangMonitorRegister::init( @@ -195,26 +230,38 @@ fn test_hang_monitoring_exit_signal() { control_receiver, false, ); - let _background_hang_monitor = background_hang_monitor_register.register_component( - MonitoredComponentId(TEST_PIPELINE_ID, MonitoredComponentType::Script), - Duration::from_millis(10), - Duration::from_millis(1000), - Some(Box::new(signal)), + + let mut background_hang_monitor = None; + let (exit_sender, exit_receiver) = ipc::channel().expect("Failed to create IPC channel!"); + let mut exit_sender = Some(exit_sender); + + // `op_order` determines the order in which these two closures are + // executed. + op_order( + &mut || { + // Register a component. + background_hang_monitor = Some(background_hang_monitor_register.register_component( + MonitoredComponentId(TEST_PIPELINE_ID, MonitoredComponentType::Script), + Duration::from_millis(10), + Duration::from_millis(1000), + Some(signal.take().unwrap()), + )); + }, + &mut || { + // Send the exit message. + control_sender + .send(BackgroundHangMonitorControlMsg::Exit( + exit_sender.take().unwrap(), + )) + .unwrap(); + }, ); - let (exit_sender, exit_receiver) = ipc::channel().expect("Failed to create IPC channel!"); + // Assert we receive a confirmation back. + assert!(exit_receiver.recv().is_ok()); - // Send the exit message. - if control_sender - .send(BackgroundHangMonitorControlMsg::Exit(exit_sender)) - .is_ok() - { - // Assert we receive a confirmation back. - assert!(exit_receiver.recv().is_ok()); - - // Assert we get the exit signal. - while !closing.load(Ordering::SeqCst) { - thread::sleep(Duration::from_millis(10)); - } + // Assert we get the exit signal. + while !closing.load(Ordering::SeqCst) { + thread::sleep(Duration::from_millis(10)); } } From 602c02edd29029ab16e988e4bb5e2cddc1c1da5b Mon Sep 17 00:00:00 2001 From: yvt Date: Sun, 20 Jun 2021 23:29:16 +0900 Subject: [PATCH 3/3] fix(bhm): deliver exit signal reliably (for real) > However, as it turns out, `crossbeam-channel`'s channels don't drop > remaining messages until all associated senders *and* receivers are > dropped. This means the exit signal won't be delivered as long as > there's at least one `HangMonitorRegister` or > `BackgroundHangMonitorChan` maintaining a copy of the sender. To work > around this and guarantee a rapid delivery of the exit signal, the > sender is wrapped in `Arc`, and only the worker thread maintains a > strong reference, thus ensuring both the sender and receiver are > dropped as soon as the worker thread exits. --- .../background_hang_monitor.rs | 129 +++++++++++++++--- 1 file changed, 107 insertions(+), 22 deletions(-) diff --git a/components/background_hang_monitor/background_hang_monitor.rs b/components/background_hang_monitor/background_hang_monitor.rs index 7d06f61f633..ab1e4e5940c 100644 --- a/components/background_hang_monitor/background_hang_monitor.rs +++ b/components/background_hang_monitor/background_hang_monitor.rs @@ -16,12 +16,14 @@ use msg::constellation_msg::{ }; use std::cell::Cell; use std::collections::{HashMap, VecDeque}; +use std::sync::{Arc, Weak}; use std::thread; use std::time::{Duration, Instant}; #[derive(Clone)] pub struct HangMonitorRegister { - sender: Sender<(MonitoredComponentId, MonitoredComponentMsg)>, + sender: Weak>, + tether: Sender, monitoring_enabled: bool, } @@ -32,27 +34,37 @@ impl HangMonitorRegister { control_port: IpcReceiver, monitoring_enabled: bool, ) -> Box { + // Create a channel to pass messages of type `MonitoredComponentMsg`. + // See the discussion in `::register_component` for why we wrap + // the sender with `Arc` and why `HangMonitorRegister` only maintains + // a weak reference to it. let (sender, port) = unbounded(); + let sender = Arc::new(sender); + let sender_weak = Arc::downgrade(&sender); + + // Create a "tether" channel, whose sole purpose is to keep the worker + // thread alive. The worker thread will terminates when all copies of + // `tether` are dropped. + let (tether, tether_port) = unbounded(); + let _ = thread::Builder::new() .spawn(move || { let mut monitor = BackgroundHangMonitorWorker::new( constellation_chan, control_port, - port, + (sender, port), + tether_port, monitoring_enabled, ); while monitor.run() { // Monitoring until all senders have been dropped... } - - // Suppress `signal_to_exit` - for (_, mut component) in monitor.monitored_components.drain() { - component.exit_signal.release(); - } }) .expect("Couldn't start BHM worker."); Box::new(HangMonitorRegister { - sender, + sender: sender_weak, + tether, monitoring_enabled, }) } @@ -71,6 +83,7 @@ impl BackgroundHangMonitorRegister for HangMonitorRegister { ) -> Box { let bhm_chan = BackgroundHangMonitorChan::new( self.sender.clone(), + self.tether.clone(), component_id, self.monitoring_enabled, ); @@ -90,21 +103,54 @@ impl BackgroundHangMonitorRegister for HangMonitorRegister { #[cfg(any(target_os = "android", target_arch = "arm", target_arch = "aarch64"))] let sampler = crate::sampler::DummySampler::new(); - // There's a race condition between the reception of - // `BackgroundHangMonitorControlMsg::Exit` and - // `MonitoredComponentMsg::Register`. When the worker receives `Exit`, - // it stops receiving messages, and any remaining messages (including - // the `MonitoredComponentMsg::Register` we sent) in the channel are - // dropped. Wrapping `exit_signal` with this RAII wrapper ensures the - // exit signal is delivered even in such cases. + // When a component is registered, and there's an exit request that + // reached BHM, we want an exit signal to be delivered to the + // component's exit signal handler eventually. However, there's a race + // condition between the reception of `BackgroundHangMonitorControlMsg:: + // Exit` and `MonitoredComponentMsg::Register` that needs to handled + // carefully. When the worker receives an `Exit` message, it stops + // processing messages, and any further `Register` messages sent to the + // worker thread are ignored. If the submissions of `Exit` and + // `Register` messages are far apart enough, the channel is closed by + // the time the client attempts to send a `Register` message, and + // therefore the client can figure out by `Sender::send`'s return value + // that it must deliver an exit signal. However, if these message + // submissions are close enough, the `Register` message is still sent, + // but the worker thread might exit before it sees the message, leaving + // the message unprocessed and the exit signal unsent. + // + // To fix this, we wrap the exit signal handler in an RAII wrapper of + // type `SignalToExitOnDrop` to automatically send a signal when it's + // dropped. This way, we can make sure the exit signal is sent even if + // the message couldn't reach the worker thread and be processed. + // + // However, as it turns out, `crossbeam-channel`'s channels don't drop + // remaining messages until all associated senders *and* receivers are + // dropped. This means the exit signal won't be delivered as long as + // there's at least one `HangMonitorRegister` or + // `BackgroundHangMonitorChan` maintaining a copy of the sender. To work + // around this and guarantee a rapid delivery of the exit signal, the + // sender is wrapped in `Arc`, and only the worker thread maintains a + // strong reference, thus ensuring both the sender and receiver are + // dropped as soon as the worker thread exits. let exit_signal = SignalToExitOnDrop(exit_signal); + // If the tether is dropped after this call, the worker thread might + // exit before processing the `Register` message because there's no + // implicit ordering guarantee between two channels. If this happens, + // an exit signal will be sent despite we haven't received a + // corresponding exit request. To enforce the correct ordering and + // prevent a false exit signal from being sent, we include a copy of + // `self.tether` in the `Register` message. + let tether = self.tether.clone(); + bhm_chan.send(MonitoredComponentMsg::Register( sampler, thread::current().name().map(str::to_owned), transient_hang_timeout, permanent_hang_timeout, exit_signal, + tether, )); Box::new(bhm_chan) } @@ -125,6 +171,7 @@ enum MonitoredComponentMsg { Duration, Duration, SignalToExitOnDrop, + Sender, ), /// Unregister component for monitoring. Unregister, @@ -134,11 +181,15 @@ enum MonitoredComponentMsg { NotifyWait, } +/// Stable equivalent to the `!` type +enum Never {} + /// A wrapper around a sender to the monitor, /// which will send the Id of the monitored component along with each message, /// and keep track of whether the monitor is still listening on the other end. struct BackgroundHangMonitorChan { - sender: Sender<(MonitoredComponentId, MonitoredComponentMsg)>, + sender: Weak>, + _tether: Sender, component_id: MonitoredComponentId, disconnected: Cell, monitoring_enabled: bool, @@ -146,12 +197,14 @@ struct BackgroundHangMonitorChan { impl BackgroundHangMonitorChan { fn new( - sender: Sender<(MonitoredComponentId, MonitoredComponentMsg)>, + sender: Weak>, + tether: Sender, component_id: MonitoredComponentId, monitoring_enabled: bool, ) -> Self { BackgroundHangMonitorChan { sender, + _tether: tether, component_id: component_id, disconnected: Default::default(), monitoring_enabled, @@ -162,7 +215,17 @@ impl BackgroundHangMonitorChan { if self.disconnected.get() { return; } - if let Err(_) = self.sender.send((self.component_id.clone(), msg)) { + + // The worker thread owns both the receiver *and* the only strong + // reference to the sender. An `upgrade` failure means the latter is + // gone, and a `send` failure means the former is gone. They are dropped + // simultaneously, but we might observe an intermediate state. + if self + .sender + .upgrade() + .and_then(|sender| sender.send((self.component_id.clone(), msg)).ok()) + .is_none() + { warn!("BackgroundHangMonitor has gone away"); self.disconnected.set(true); } @@ -234,6 +297,8 @@ struct BackgroundHangMonitorWorker { monitored_components: HashMap, constellation_chan: IpcSender, port: Receiver<(MonitoredComponentId, MonitoredComponentMsg)>, + _port_sender: Arc>, + tether_port: Receiver, control_port: Receiver, sampling_duration: Option, sampling_max_duration: Option, @@ -248,7 +313,11 @@ impl BackgroundHangMonitorWorker { fn new( constellation_chan: IpcSender, control_port: IpcReceiver, - port: Receiver<(MonitoredComponentId, MonitoredComponentMsg)>, + (port_sender, port): ( + Arc>, + Receiver<(MonitoredComponentId, MonitoredComponentMsg)>, + ), + tether_port: Receiver, monitoring_enabled: bool, ) -> Self { let control_port = ROUTER.route_ipc_receiver_to_new_crossbeam_receiver(control_port); @@ -257,6 +326,8 @@ impl BackgroundHangMonitorWorker { monitored_components: Default::default(), constellation_chan, port, + _port_sender: port_sender, + tether_port, control_port, sampling_duration: None, sampling_max_duration: None, @@ -325,11 +396,24 @@ impl BackgroundHangMonitorWorker { let received = select! { recv(self.port) -> event => { + // Since we own the `Arc>`, the channel never + // gets disconnected. + Some(event.unwrap()) + }, + recv(self.tether_port) -> event => { + // This arm can only reached by a tether disconnection match event { - Ok(msg) => Some(msg), - // Our sender has been dropped, quit. - Err(_) => return false, + Ok(x) => match x {} + Err(_) => {} } + + // All associated `HangMonitorRegister` and + // `BackgroundHangMonitorChan` have been dropped. Suppress + // `signal_to_exit` and exit the BHM. + for component in self.monitored_components.values_mut() { + component.exit_signal.release(); + } + return false; }, recv(self.control_port) -> event => { match event { @@ -394,6 +478,7 @@ impl BackgroundHangMonitorWorker { transient_hang_timeout, permanent_hang_timeout, exit_signal, + _tether, ), ) => { let component = MonitoredComponent {