mirror of
https://github.com/servo/servo.git
synced 2025-06-06 16:45:39 +00:00
424 lines
12 KiB
Rust
424 lines
12 KiB
Rust
/* This Source Code Form is subject to the terms of the Mozilla Public
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
|
|
|
//! Counting and non-counting Bloom filters tuned for use as ancestor filters
|
|
//! for selector matching.
|
|
|
|
use fnv::FnvHasher;
|
|
use std::fmt::{self, Debug};
|
|
use std::hash::{Hash, Hasher};
|
|
|
|
// The top 8 bits of the 32-bit hash value are not used by the bloom filter.
|
|
// Consumers may rely on this to pack hashes more efficiently.
|
|
pub const BLOOM_HASH_MASK: u32 = 0x00ffffff;
|
|
const KEY_SIZE: usize = 12;
|
|
|
|
const ARRAY_SIZE: usize = 1 << KEY_SIZE;
|
|
const KEY_MASK: u32 = (1 << KEY_SIZE) - 1;
|
|
|
|
/// A counting Bloom filter with 8-bit counters.
|
|
pub type BloomFilter = CountingBloomFilter<BloomStorageU8>;
|
|
|
|
/// A non-counting Bloom filter.
|
|
///
|
|
/// Effectively a counting Bloom filter with 1-bit counters.
|
|
pub type NonCountingBloomFilter = CountingBloomFilter<BloomStorageBool>;
|
|
|
|
/// A counting Bloom filter with parameterized storage to handle
|
|
/// counters of different sizes. For now we assume that having two hash
|
|
/// functions is enough, but we may revisit that decision later.
|
|
///
|
|
/// The filter uses an array with 2**KeySize entries.
|
|
///
|
|
/// Assuming a well-distributed hash function, a Bloom filter with
|
|
/// array size M containing N elements and
|
|
/// using k hash function has expected false positive rate exactly
|
|
///
|
|
/// $ (1 - (1 - 1/M)^{kN})^k $
|
|
///
|
|
/// because each array slot has a
|
|
///
|
|
/// $ (1 - 1/M)^{kN} $
|
|
///
|
|
/// chance of being 0, and the expected false positive rate is the
|
|
/// probability that all of the k hash functions will hit a nonzero
|
|
/// slot.
|
|
///
|
|
/// For reasonable assumptions (M large, kN large, which should both
|
|
/// hold if we're worried about false positives) about M and kN this
|
|
/// becomes approximately
|
|
///
|
|
/// $$ (1 - \exp(-kN/M))^k $$
|
|
///
|
|
/// For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
|
|
/// or in other words
|
|
///
|
|
/// $$ N/M = -0.5 * \ln(1 - \sqrt(r)) $$
|
|
///
|
|
/// where r is the false positive rate. This can be used to compute
|
|
/// the desired KeySize for a given load N and false positive rate r.
|
|
///
|
|
/// If N/M is assumed small, then the false positive rate can
|
|
/// further be approximated as 4*N^2/M^2. So increasing KeySize by
|
|
/// 1, which doubles M, reduces the false positive rate by about a
|
|
/// factor of 4, and a false positive rate of 1% corresponds to
|
|
/// about M/N == 20.
|
|
///
|
|
/// What this means in practice is that for a few hundred keys using a
|
|
/// KeySize of 12 gives false positive rates on the order of 0.25-4%.
|
|
///
|
|
/// Similarly, using a KeySize of 10 would lead to a 4% false
|
|
/// positive rate for N == 100 and to quite bad false positive
|
|
/// rates for larger N.
|
|
#[derive(Clone)]
|
|
pub struct CountingBloomFilter<S> where S: BloomStorage {
|
|
storage: S,
|
|
}
|
|
|
|
impl<S> CountingBloomFilter<S> where S: BloomStorage {
|
|
/// Creates a new bloom filter.
|
|
#[inline]
|
|
pub fn new() -> Self {
|
|
CountingBloomFilter {
|
|
storage: Default::default(),
|
|
}
|
|
}
|
|
|
|
#[inline]
|
|
pub fn clear(&mut self) {
|
|
self.storage = Default::default();
|
|
}
|
|
|
|
// Slow linear accessor to make sure the bloom filter is zeroed. This should
|
|
// never be used in release builds.
|
|
#[cfg(debug_assertions)]
|
|
pub fn is_zeroed(&self) -> bool {
|
|
self.storage.is_zeroed()
|
|
}
|
|
|
|
#[cfg(not(debug_assertions))]
|
|
pub fn is_zeroed(&self) -> bool {
|
|
unreachable!()
|
|
}
|
|
|
|
#[inline]
|
|
pub fn insert_hash(&mut self, hash: u32) {
|
|
self.storage.adjust_first_slot(hash, true);
|
|
self.storage.adjust_second_slot(hash, true);
|
|
}
|
|
|
|
/// Inserts an item into the bloom filter.
|
|
#[inline]
|
|
pub fn insert<T: Hash>(&mut self, elem: &T) {
|
|
self.insert_hash(hash(elem))
|
|
}
|
|
|
|
#[inline]
|
|
pub fn remove_hash(&mut self, hash: u32) {
|
|
self.storage.adjust_first_slot(hash, false);
|
|
self.storage.adjust_second_slot(hash, false);
|
|
}
|
|
|
|
/// Removes an item from the bloom filter.
|
|
#[inline]
|
|
pub fn remove<T: Hash>(&mut self, elem: &T) {
|
|
self.remove_hash(hash(elem))
|
|
}
|
|
|
|
#[inline]
|
|
pub fn might_contain_hash(&self, hash: u32) -> bool {
|
|
!self.storage.first_slot_is_empty(hash) &&
|
|
!self.storage.second_slot_is_empty(hash)
|
|
}
|
|
|
|
/// Check whether the filter might contain an item. This can
|
|
/// sometimes return true even if the item is not in the filter,
|
|
/// but will never return false for items that are actually in the
|
|
/// filter.
|
|
#[inline]
|
|
pub fn might_contain<T: Hash>(&self, elem: &T) -> bool {
|
|
self.might_contain_hash(hash(elem))
|
|
}
|
|
}
|
|
|
|
impl<S> Debug for CountingBloomFilter<S> where S: BloomStorage {
|
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
|
let mut slots_used = 0;
|
|
for i in 0..ARRAY_SIZE {
|
|
if !self.storage.slot_is_empty(i) {
|
|
slots_used += 1;
|
|
}
|
|
}
|
|
write!(f, "BloomFilter({}/{})", slots_used, ARRAY_SIZE)
|
|
}
|
|
}
|
|
|
|
pub trait BloomStorage : Clone + Default {
|
|
fn slot_is_empty(&self, index: usize) -> bool;
|
|
fn adjust_slot(&mut self, index: usize, increment: bool);
|
|
fn is_zeroed(&self) -> bool;
|
|
|
|
#[inline]
|
|
fn first_slot_is_empty(&self, hash: u32) -> bool {
|
|
self.slot_is_empty(Self::first_slot_index(hash))
|
|
}
|
|
|
|
#[inline]
|
|
fn second_slot_is_empty(&self, hash: u32) -> bool {
|
|
self.slot_is_empty(Self::second_slot_index(hash))
|
|
}
|
|
|
|
#[inline]
|
|
fn adjust_first_slot(&mut self, hash: u32, increment: bool) {
|
|
self.adjust_slot(Self::first_slot_index(hash), increment)
|
|
}
|
|
|
|
#[inline]
|
|
fn adjust_second_slot(&mut self, hash: u32, increment: bool) {
|
|
self.adjust_slot(Self::second_slot_index(hash), increment)
|
|
}
|
|
|
|
#[inline]
|
|
fn first_slot_index(hash: u32) -> usize {
|
|
hash1(hash) as usize
|
|
}
|
|
|
|
#[inline]
|
|
fn second_slot_index(hash: u32) -> usize {
|
|
hash2(hash) as usize
|
|
}
|
|
}
|
|
|
|
/// Storage class for a CountingBloomFilter that has 8-bit counters.
|
|
pub struct BloomStorageU8 {
|
|
counters: [u8; ARRAY_SIZE],
|
|
}
|
|
|
|
impl BloomStorage for BloomStorageU8 {
|
|
#[inline]
|
|
fn adjust_slot(&mut self, index: usize, increment: bool) {
|
|
let slot = &mut self.counters[index];
|
|
if *slot != 0xff { // full
|
|
if increment {
|
|
*slot += 1;
|
|
} else {
|
|
*slot -= 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
#[inline]
|
|
fn slot_is_empty(&self, index: usize) -> bool {
|
|
self.counters[index] == 0
|
|
}
|
|
|
|
#[inline]
|
|
fn is_zeroed(&self) -> bool {
|
|
self.counters.iter().all(|x| *x == 0)
|
|
}
|
|
}
|
|
|
|
impl Default for BloomStorageU8 {
|
|
fn default() -> Self {
|
|
BloomStorageU8 {
|
|
counters: [0; ARRAY_SIZE],
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Clone for BloomStorageU8 {
|
|
fn clone(&self) -> Self {
|
|
BloomStorageU8 {
|
|
counters: self.counters,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Storage class for a CountingBloomFilter that has 1-bit counters.
|
|
pub struct BloomStorageBool {
|
|
counters: [u8; ARRAY_SIZE / 8],
|
|
}
|
|
|
|
impl BloomStorage for BloomStorageBool {
|
|
#[inline]
|
|
fn adjust_slot(&mut self, index: usize, increment: bool) {
|
|
let bit = 1 << (index % 8);
|
|
let byte = &mut self.counters[index / 8];
|
|
|
|
// Since we have only one bit for storage, decrementing it
|
|
// should never do anything. Assert against an accidental
|
|
// decrementing of a bit that was never set.
|
|
assert!(increment || (*byte & bit) != 0,
|
|
"should not decrement if slot is already false");
|
|
|
|
if increment {
|
|
*byte |= bit;
|
|
}
|
|
}
|
|
|
|
#[inline]
|
|
fn slot_is_empty(&self, index: usize) -> bool {
|
|
let bit = 1 << (index % 8);
|
|
(self.counters[index / 8] & bit) == 0
|
|
}
|
|
|
|
#[inline]
|
|
fn is_zeroed(&self) -> bool {
|
|
self.counters.iter().all(|x| *x == 0)
|
|
}
|
|
}
|
|
|
|
impl Default for BloomStorageBool {
|
|
fn default() -> Self {
|
|
BloomStorageBool {
|
|
counters: [0; ARRAY_SIZE / 8],
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Clone for BloomStorageBool {
|
|
fn clone(&self) -> Self {
|
|
BloomStorageBool {
|
|
counters: self.counters,
|
|
}
|
|
}
|
|
}
|
|
|
|
fn hash<T: Hash>(elem: &T) -> u32 {
|
|
let mut hasher = FnvHasher::default();
|
|
elem.hash(&mut hasher);
|
|
let hash: u64 = hasher.finish();
|
|
(hash >> 32) as u32 ^ (hash as u32)
|
|
}
|
|
|
|
#[inline]
|
|
fn hash1(hash: u32) -> u32 {
|
|
hash & KEY_MASK
|
|
}
|
|
|
|
#[inline]
|
|
fn hash2(hash: u32) -> u32 {
|
|
(hash >> KEY_SIZE) & KEY_MASK
|
|
}
|
|
|
|
#[test]
|
|
fn create_and_insert_some_stuff() {
|
|
use std::mem::transmute;
|
|
|
|
let mut bf = BloomFilter::new();
|
|
|
|
// Statically assert that ARRAY_SIZE is a multiple of 8, which
|
|
// BloomStorageBool relies on.
|
|
unsafe {
|
|
transmute::<[u8; ARRAY_SIZE % 8], [u8; 0]>([]);
|
|
}
|
|
|
|
for i in 0_usize .. 1000 {
|
|
bf.insert(&i);
|
|
}
|
|
|
|
for i in 0_usize .. 1000 {
|
|
assert!(bf.might_contain(&i));
|
|
}
|
|
|
|
let false_positives =
|
|
(1001_usize .. 2000).filter(|i| bf.might_contain(i)).count();
|
|
|
|
assert!(false_positives < 150, "{} is not < 150", false_positives); // 15%.
|
|
|
|
for i in 0_usize .. 100 {
|
|
bf.remove(&i);
|
|
}
|
|
|
|
for i in 100_usize .. 1000 {
|
|
assert!(bf.might_contain(&i));
|
|
}
|
|
|
|
let false_positives = (0_usize .. 100).filter(|i| bf.might_contain(i)).count();
|
|
|
|
assert!(false_positives < 20, "{} is not < 20", false_positives); // 20%.
|
|
|
|
bf.clear();
|
|
|
|
for i in 0_usize .. 2000 {
|
|
assert!(!bf.might_contain(&i));
|
|
}
|
|
}
|
|
|
|
#[cfg(feature = "bench")]
|
|
#[cfg(test)]
|
|
mod bench {
|
|
extern crate test;
|
|
use super::BloomFilter;
|
|
|
|
#[derive(Default)]
|
|
struct HashGenerator(u32);
|
|
|
|
impl HashGenerator {
|
|
fn next(&mut self) -> u32 {
|
|
// Each hash is split into two twelve-bit segments, which are used
|
|
// as an index into an array. We increment each by 64 so that we
|
|
// hit the next cache line, and then another 1 so that our wrapping
|
|
// behavior leads us to different entries each time.
|
|
//
|
|
// Trying to simulate cold caches is rather difficult with the cargo
|
|
// benchmarking setup, so it may all be moot depending on the number
|
|
// of iterations that end up being run. But we might as well.
|
|
self.0 += (65) + (65 << super::KEY_SIZE);
|
|
self.0
|
|
}
|
|
}
|
|
|
|
#[bench]
|
|
fn create_insert_1000_remove_100_lookup_100(b: &mut test::Bencher) {
|
|
b.iter(|| {
|
|
let mut gen1 = HashGenerator::default();
|
|
let mut gen2 = HashGenerator::default();
|
|
let mut bf = BloomFilter::new();
|
|
for _ in 0_usize .. 1000 {
|
|
bf.insert_hash(gen1.next());
|
|
}
|
|
for _ in 0_usize .. 100 {
|
|
bf.remove_hash(gen2.next());
|
|
}
|
|
for _ in 100_usize .. 200 {
|
|
test::black_box(bf.might_contain_hash(gen2.next()));
|
|
}
|
|
});
|
|
}
|
|
|
|
#[bench]
|
|
fn might_contain_10(b: &mut test::Bencher) {
|
|
let bf = BloomFilter::new();
|
|
let mut gen = HashGenerator::default();
|
|
b.iter(|| for _ in 0..10 {
|
|
test::black_box(bf.might_contain_hash(gen.next()));
|
|
});
|
|
}
|
|
|
|
#[bench]
|
|
fn clear(b: &mut test::Bencher) {
|
|
let mut bf = Box::new(BloomFilter::new());
|
|
b.iter(|| test::black_box(&mut bf).clear());
|
|
}
|
|
|
|
#[bench]
|
|
fn insert_10(b: &mut test::Bencher) {
|
|
let mut bf = BloomFilter::new();
|
|
let mut gen = HashGenerator::default();
|
|
b.iter(|| for _ in 0..10 {
|
|
test::black_box(bf.insert_hash(gen.next()));
|
|
});
|
|
}
|
|
|
|
#[bench]
|
|
fn remove_10(b: &mut test::Bencher) {
|
|
let mut bf = BloomFilter::new();
|
|
let mut gen = HashGenerator::default();
|
|
// Note: this will underflow, and that's ok.
|
|
b.iter(|| for _ in 0..10 {
|
|
bf.remove_hash(gen.next())
|
|
});
|
|
}
|
|
}
|