introduce a background-hang-monitor:

Mac-Os implementation of a thread sampler,
Linux and Windows skeleton implementations.
This commit is contained in:
Gregory Terzian 2018-09-11 15:49:47 +08:00
parent 7c65505df3
commit 4eb785cdc0
23 changed files with 1134 additions and 11 deletions

View file

@ -0,0 +1,27 @@
[package]
name = "background_hang_monitor"
version = "0.0.1"
authors = ["The Servo Project Developers"]
license = "MPL-2.0"
publish = false
edition = "2018"
[lib]
name = "background_hang_monitor"
path = "lib.rs"
test = false
doctest = false
[dependencies]
backtrace = "0.3"
bitflags = "1.0"
ipc-channel = "0.11"
lazy_static = "1"
libc = "0.2"
log = "0.4"
msg = {path = "../msg"}
serde = "1.0.60"
crossbeam-channel = "0.3"
[target.'cfg(target_os = "macos")'.dependencies]
mach = "0.2.3"

View file

@ -0,0 +1,260 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
use crate::sampler::Sampler;
use crossbeam_channel::{after, unbounded, Receiver, Sender};
use ipc_channel::ipc::IpcSender;
use msg::constellation_msg::MonitoredComponentId;
use msg::constellation_msg::{
BackgroundHangMonitor, BackgroundHangMonitorClone, BackgroundHangMonitorRegister,
};
use msg::constellation_msg::{HangAlert, HangAnnotation};
use std::cell::Cell;
use std::collections::HashMap;
use std::thread;
use std::time::{Duration, Instant};
#[derive(Clone)]
pub struct HangMonitorRegister {
sender: Sender<(MonitoredComponentId, MonitoredComponentMsg)>,
}
impl HangMonitorRegister {
/// Start a new hang monitor worker, and return a handle to register components for monitoring.
pub fn init(constellation_chan: IpcSender<HangAlert>) -> Box<BackgroundHangMonitorRegister> {
let (sender, port) = unbounded();
let _ = thread::Builder::new().spawn(move || {
let mut monitor = { BackgroundHangMonitorWorker::new(constellation_chan, port) };
while monitor.run() {
// Monitoring until all senders have been dropped...
}
});
Box::new(HangMonitorRegister { sender })
}
}
impl BackgroundHangMonitorRegister for HangMonitorRegister {
/// Register a component for monitoring.
/// Returns a dedicated wrapper around a sender
/// to be used for communication with the hang monitor worker.
fn register_component(
&self,
component_id: MonitoredComponentId,
transient_hang_timeout: Duration,
permanent_hang_timeout: Duration,
) -> Box<BackgroundHangMonitor> {
let bhm_chan = BackgroundHangMonitorChan::new(self.sender.clone(), component_id);
#[cfg(target_os = "windows")]
let sampler = crate::sampler_windows::WindowsSampler::new();
#[cfg(target_os = "macos")]
let sampler = crate::sampler_mac::MacOsSampler::new();
#[cfg(any(target_os = "android", target_os = "linux"))]
let sampler = crate::sampler_linux::LinuxSampler::new();
bhm_chan.send(MonitoredComponentMsg::Register(
sampler,
transient_hang_timeout,
permanent_hang_timeout,
));
Box::new(bhm_chan)
}
}
impl BackgroundHangMonitorClone for HangMonitorRegister {
fn clone_box(&self) -> Box<BackgroundHangMonitorRegister> {
Box::new(self.clone())
}
}
/// Messages sent from monitored components to the monitor.
pub enum MonitoredComponentMsg {
/// Register component for monitoring,
Register(Box<Sampler>, Duration, Duration),
/// Notify start of new activity for a given component,
NotifyActivity(HangAnnotation),
/// Notify start of waiting for a new task to come-in for processing.
NotifyWait,
}
/// A wrapper around a sender to the monitor,
/// which will send the Id of the monitored component along with each message,
/// and keep track of whether the monitor is still listening on the other end.
pub struct BackgroundHangMonitorChan {
sender: Sender<(MonitoredComponentId, MonitoredComponentMsg)>,
component_id: MonitoredComponentId,
disconnected: Cell<bool>,
}
impl BackgroundHangMonitorChan {
pub fn new(
sender: Sender<(MonitoredComponentId, MonitoredComponentMsg)>,
component_id: MonitoredComponentId,
) -> Self {
BackgroundHangMonitorChan {
sender,
component_id: component_id,
disconnected: Default::default(),
}
}
pub fn send(&self, msg: MonitoredComponentMsg) {
if self.disconnected.get() {
return;
}
if let Err(_) = self.sender.send((self.component_id.clone(), msg)) {
warn!("BackgroundHangMonitor has gone away");
self.disconnected.set(true);
}
}
}
impl BackgroundHangMonitor for BackgroundHangMonitorChan {
fn notify_activity(&self, annotation: HangAnnotation) {
let msg = MonitoredComponentMsg::NotifyActivity(annotation);
self.send(msg);
}
fn notify_wait(&self) {
let msg = MonitoredComponentMsg::NotifyWait;
self.send(msg);
}
}
struct MonitoredComponent {
sampler: Box<Sampler>,
last_activity: Instant,
last_annotation: Option<HangAnnotation>,
transient_hang_timeout: Duration,
permanent_hang_timeout: Duration,
sent_transient_alert: bool,
sent_permanent_alert: bool,
is_waiting: bool,
}
pub struct BackgroundHangMonitorWorker {
monitored_components: HashMap<MonitoredComponentId, MonitoredComponent>,
constellation_chan: IpcSender<HangAlert>,
port: Receiver<(MonitoredComponentId, MonitoredComponentMsg)>,
}
impl BackgroundHangMonitorWorker {
pub fn new(
constellation_chan: IpcSender<HangAlert>,
port: Receiver<(MonitoredComponentId, MonitoredComponentMsg)>,
) -> Self {
Self {
monitored_components: Default::default(),
constellation_chan,
port,
}
}
pub fn run(&mut self) -> bool {
let received = select! {
recv(self.port) -> event => {
match event {
Ok(msg) => Some(msg),
// Our sender has been dropped, quit.
Err(_) => return false,
}
},
recv(after(Duration::from_millis(100))) -> _ => None,
};
if let Some(msg) = received {
self.handle_msg(msg);
while let Ok(another_msg) = self.port.try_recv() {
// Handle any other incoming messages,
// before performing a hang checkpoint.
self.handle_msg(another_msg);
}
}
self.perform_a_hang_monitor_checkpoint();
true
}
fn handle_msg(&mut self, msg: (MonitoredComponentId, MonitoredComponentMsg)) {
match msg {
(
component_id,
MonitoredComponentMsg::Register(
sampler,
transient_hang_timeout,
permanent_hang_timeout,
),
) => {
let component = MonitoredComponent {
sampler,
last_activity: Instant::now(),
last_annotation: None,
transient_hang_timeout,
permanent_hang_timeout,
sent_transient_alert: false,
sent_permanent_alert: false,
is_waiting: true,
};
assert!(
self.monitored_components
.insert(component_id, component)
.is_none(),
"This component was already registered for monitoring."
);
},
(component_id, MonitoredComponentMsg::NotifyActivity(annotation)) => {
let component = self
.monitored_components
.get_mut(&component_id)
.expect("Received NotifyActivity for an unknown component");
component.last_activity = Instant::now();
component.last_annotation = Some(annotation);
component.sent_transient_alert = false;
component.sent_permanent_alert = false;
component.is_waiting = false;
},
(component_id, MonitoredComponentMsg::NotifyWait) => {
let component = self
.monitored_components
.get_mut(&component_id)
.expect("Received NotifyWait for an unknown component");
component.last_activity = Instant::now();
component.sent_transient_alert = false;
component.sent_permanent_alert = false;
component.is_waiting = true;
},
}
}
fn perform_a_hang_monitor_checkpoint(&mut self) {
for (component_id, monitored) in self.monitored_components.iter_mut() {
if monitored.is_waiting {
continue;
}
let last_annotation = monitored.last_annotation.unwrap();
if monitored.last_activity.elapsed() > monitored.permanent_hang_timeout {
if monitored.sent_permanent_alert {
continue;
}
let profile = match monitored.sampler.suspend_and_sample_thread() {
Ok(native_stack) => Some(native_stack.to_hangprofile()),
Err(()) => None,
};
let _ = self.constellation_chan.send(HangAlert::Permanent(
component_id.clone(),
last_annotation,
profile,
));
monitored.sent_permanent_alert = true;
continue;
}
if monitored.last_activity.elapsed() > monitored.transient_hang_timeout {
if monitored.sent_transient_alert {
continue;
}
let _ = self
.constellation_chan
.send(HangAlert::Transient(component_id.clone(), last_annotation));
monitored.sent_transient_alert = true;
}
}
}
}

View file

@ -0,0 +1,21 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#![deny(unsafe_code)]
#[macro_use]
extern crate crossbeam_channel;
#[macro_use]
extern crate log;
pub mod background_hang_monitor;
mod sampler;
#[cfg(any(target_os = "android", target_os = "linux"))]
mod sampler_linux;
#[cfg(target_os = "macos")]
mod sampler_mac;
#[cfg(target_os = "windows")]
mod sampler_windows;
pub use self::background_hang_monitor::*;

View file

@ -0,0 +1,83 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
use backtrace;
use msg::constellation_msg::{HangProfile, HangProfileSymbol};
use std::ptr;
const MAX_NATIVE_FRAMES: usize = 1024;
pub trait Sampler: Send {
fn suspend_and_sample_thread(&self) -> Result<NativeStack, ()>;
}
// Several types in this file are currently not used in a Linux or Windows build.
#[allow(dead_code)]
pub type Address = *const libc::uint8_t;
/// The registers used for stack unwinding
#[allow(dead_code)]
pub struct Registers {
/// Instruction pointer.
pub instruction_ptr: Address,
/// Stack pointer.
pub stack_ptr: Address,
/// Frame pointer.
pub frame_ptr: Address,
}
pub struct NativeStack {
instruction_ptrs: [*mut std::ffi::c_void; MAX_NATIVE_FRAMES],
stack_ptrs: [*mut std::ffi::c_void; MAX_NATIVE_FRAMES],
count: usize,
}
impl NativeStack {
pub fn new() -> Self {
NativeStack {
instruction_ptrs: [ptr::null_mut(); MAX_NATIVE_FRAMES],
stack_ptrs: [ptr::null_mut(); MAX_NATIVE_FRAMES],
count: 0,
}
}
pub fn process_register(
&mut self,
instruction_ptr: *mut std::ffi::c_void,
stack_ptr: *mut std::ffi::c_void,
) -> Result<(), ()> {
if !(self.count < MAX_NATIVE_FRAMES) {
return Err(());
}
self.instruction_ptrs[self.count] = instruction_ptr;
self.stack_ptrs[self.count] = stack_ptr;
self.count = self.count + 1;
Ok(())
}
pub fn to_hangprofile(&self) -> HangProfile {
let mut profile = HangProfile {
backtrace: Vec::new(),
};
for ip in self.instruction_ptrs.iter().rev() {
if ip.is_null() {
continue;
}
backtrace::resolve(*ip, |symbol| {
// TODO: use the demangled or C++ demangled symbols if available.
let name = symbol
.name()
.map(|n| String::from_utf8_lossy(&n.as_bytes()).to_string());
let filename = symbol.filename().map(|n| n.to_string_lossy().to_string());
let lineno = symbol.lineno();
profile.backtrace.push(HangProfileSymbol {
name,
filename,
lineno,
});
});
}
profile
}
}

View file

@ -0,0 +1,35 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
use crate::sampler::{NativeStack, Sampler};
use libc;
type MonitoredThreadId = libc::pthread_t;
#[allow(dead_code)]
pub struct LinuxSampler {
thread_id: MonitoredThreadId,
}
impl LinuxSampler {
#[allow(unsafe_code, dead_code)]
pub fn new() -> Box<Sampler> {
let thread_id = unsafe { libc::pthread_self() };
Box::new(LinuxSampler { thread_id })
}
}
impl Sampler for LinuxSampler {
#[allow(unsafe_code)]
fn suspend_and_sample_thread(&self) -> Result<NativeStack, ()> {
// Warning: The "critical section" begins here.
// In the critical section:
// we must not do any dynamic memory allocation,
// nor try to acquire any lock
// or any other unshareable resource.
// NOTE: End of "critical section".
Err(())
}
}

View file

@ -0,0 +1,118 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
use crate::sampler::{Address, NativeStack, Registers, Sampler};
use libc;
use mach;
use std::panic;
use std::process;
type MonitoredThreadId = mach::mach_types::thread_act_t;
pub struct MacOsSampler {
thread_id: MonitoredThreadId,
}
impl MacOsSampler {
#[allow(unsafe_code)]
pub fn new() -> Box<Sampler> {
let thread_id = unsafe { mach::mach_init::mach_thread_self() };
Box::new(MacOsSampler { thread_id })
}
}
impl Sampler for MacOsSampler {
#[allow(unsafe_code)]
fn suspend_and_sample_thread(&self) -> Result<NativeStack, ()> {
// Warning: The "critical section" begins here.
// In the critical section:
// we must not do any dynamic memory allocation,
// nor try to acquire any lock
// or any other unshareable resource.
let current_hook = panic::take_hook();
panic::set_hook(Box::new(|_| {
// Avoiding any allocation or locking as part of standard panicking.
process::abort();
}));
let native_stack = unsafe {
if let Err(()) = suspend_thread(self.thread_id) {
panic::set_hook(current_hook);
return Err(());
};
let native_stack = match get_registers(self.thread_id) {
Ok(regs) => Ok(frame_pointer_stack_walk(regs)),
Err(()) => Err(()),
};
if let Err(()) = resume_thread(self.thread_id) {
process::abort();
}
native_stack
};
panic::set_hook(current_hook);
// NOTE: End of "critical section".
native_stack
}
}
fn check_kern_return(kret: mach::kern_return::kern_return_t) -> Result<(), ()> {
if kret != mach::kern_return::KERN_SUCCESS {
return Err(());
}
Ok(())
}
#[allow(unsafe_code)]
unsafe fn suspend_thread(thread_id: MonitoredThreadId) -> Result<(), ()> {
check_kern_return(mach::thread_act::thread_suspend(thread_id))
}
#[allow(unsafe_code)]
unsafe fn get_registers(thread_id: MonitoredThreadId) -> Result<Registers, ()> {
let mut state = mach::structs::x86_thread_state64_t::new();
let mut state_count = mach::structs::x86_thread_state64_t::count();
let kret = mach::thread_act::thread_get_state(
thread_id,
mach::thread_status::x86_THREAD_STATE64,
(&mut state) as *mut _ as *mut _,
&mut state_count,
);
check_kern_return(kret)?;
Ok(Registers {
instruction_ptr: state.__rip as Address,
stack_ptr: state.__rsp as Address,
frame_ptr: state.__rbp as Address,
})
}
#[allow(unsafe_code)]
unsafe fn resume_thread(thread_id: MonitoredThreadId) -> Result<(), ()> {
check_kern_return(mach::thread_act::thread_resume(thread_id))
}
#[allow(unsafe_code)]
unsafe fn frame_pointer_stack_walk(regs: Registers) -> NativeStack {
// Note: this function will only work with code build with:
// --dev,
// or --with-frame-pointer.
let stackaddr = libc::pthread_get_stackaddr_np(libc::pthread_self());
let mut native_stack = NativeStack::new();
let pc = regs.instruction_ptr as *mut std::ffi::c_void;
let stack = regs.stack_ptr as *mut std::ffi::c_void;
let _ = native_stack.process_register(pc, stack);
let mut current = regs.frame_ptr as *mut *mut std::ffi::c_void;
while !current.is_null() {
if (current as usize) < stackaddr as usize {
break;
}
let next = *current as *mut *mut std::ffi::c_void;
let pc = current.add(1);
let stack = current.add(2);
if let Err(()) = native_stack.process_register(*pc, *stack) {
break;
}
current = next;
}
native_stack
}

View file

@ -0,0 +1,40 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
use crate::sampler::{NativeStack, Sampler};
type MonitoredThreadId = usize; // TODO: use winapi
#[allow(dead_code)]
pub struct WindowsSampler {
thread_id: MonitoredThreadId,
}
impl WindowsSampler {
#[allow(unsafe_code, dead_code)]
pub fn new() -> Box<Sampler> {
let thread_id = 0; // TODO: use winapi::um::processthreadsapi::GetThreadId
Box::new(WindowsSampler { thread_id })
}
}
impl Sampler for WindowsSampler {
fn suspend_and_sample_thread(&self) -> Result<NativeStack, ()> {
// Warning: The "critical section" begins here.
// In the critical section:
// we must not do any dynamic memory allocation,
// nor try to acquire any lock
// or any other unshareable resource.
// TODO:
// 1: use winapi::um::processthreadsapi::SuspendThread
// 2: use winapi::um::processthreadsapi::GetThreadContext
// 3: populate registers using the context, see
// https://dxr.mozilla.org/mozilla-central/source/tools/profiler/core/platform-win32.cpp#129
// 4: use winapi::um::processthreadsapi::ResumeThread
// NOTE: End of "critical section".
Err(())
}
}

View file

@ -0,0 +1,107 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
use background_hang_monitor::HangMonitorRegister;
use ipc_channel::ipc;
use msg::constellation_msg::ScriptHangAnnotation;
use msg::constellation_msg::TEST_PIPELINE_ID;
use msg::constellation_msg::{HangAlert, HangAnnotation};
use msg::constellation_msg::{MonitoredComponentId, MonitoredComponentType};
use std::thread;
use std::time::Duration;
#[test]
fn test_hang_monitoring() {
let (background_hang_monitor_ipc_sender, background_hang_monitor_receiver) =
ipc::channel().expect("ipc channel failure");
let background_hang_monitor_register =
HangMonitorRegister::init(background_hang_monitor_ipc_sender.clone());
let background_hang_monitor = background_hang_monitor_register.register_component(
MonitoredComponentId(TEST_PIPELINE_ID, MonitoredComponentType::Script),
Duration::from_millis(10),
Duration::from_millis(1000),
);
// Start an activity.
let hang_annotation = HangAnnotation::Script(ScriptHangAnnotation::AttachLayout);
background_hang_monitor.notify_activity(hang_annotation);
// Sleep until the "transient" timeout has been reached.
thread::sleep(Duration::from_millis(10));
// Check for a transient hang alert.
match background_hang_monitor_receiver.recv().unwrap() {
HangAlert::Transient(component_id, _annotation) => {
let expected = MonitoredComponentId(TEST_PIPELINE_ID, MonitoredComponentType::Script);
assert_eq!(expected, component_id);
},
HangAlert::Permanent(..) => unreachable!(),
}
// Sleep until the "permanent" timeout has been reached.
thread::sleep(Duration::from_millis(1000));
// Check for a permanent hang alert.
match background_hang_monitor_receiver.recv().unwrap() {
HangAlert::Permanent(component_id, _annotation, _profile) => {
let expected = MonitoredComponentId(TEST_PIPELINE_ID, MonitoredComponentType::Script);
assert_eq!(expected, component_id);
},
HangAlert::Transient(..) => unreachable!(),
}
// Now the component is not hanging anymore.
background_hang_monitor.notify_activity(hang_annotation);
assert!(background_hang_monitor_receiver.try_recv().is_err());
// Sleep for a while.
thread::sleep(Duration::from_millis(10));
// Check for a transient hang alert.
match background_hang_monitor_receiver.recv().unwrap() {
HangAlert::Transient(component_id, _annotation) => {
let expected = MonitoredComponentId(TEST_PIPELINE_ID, MonitoredComponentType::Script);
assert_eq!(expected, component_id);
},
HangAlert::Permanent(..) => unreachable!(),
}
// Now the component is waiting for a new task.
background_hang_monitor.notify_wait();
// Sleep for a while.
thread::sleep(Duration::from_millis(100));
// The component is still waiting, but not hanging.
assert!(background_hang_monitor_receiver.try_recv().is_err());
// New task handling starts.
background_hang_monitor.notify_activity(hang_annotation);
// Sleep for a while.
thread::sleep(Duration::from_millis(10));
// We're getting new hang alerts for the latest task.
match background_hang_monitor_receiver.recv().unwrap() {
HangAlert::Transient(component_id, _annotation) => {
let expected = MonitoredComponentId(TEST_PIPELINE_ID, MonitoredComponentType::Script);
assert_eq!(expected, component_id);
},
HangAlert::Permanent(..) => unreachable!(),
}
// No new alert yet
assert!(background_hang_monitor_receiver.try_recv().is_err());
// Shut-down the hang monitor
drop(background_hang_monitor_register);
drop(background_hang_monitor);
// Sleep until the "max-timeout" has been reached.
thread::sleep(Duration::from_millis(1000));
// Still no new alerts because the hang monitor has shut-down already.
assert!(background_hang_monitor_receiver.try_recv().is_err());
}