diff --git a/components/net/cookie.rs b/components/net/cookie.rs index 558d8e49a8c..28972d95806 100644 --- a/components/net/cookie.rs +++ b/components/net/cookie.rs @@ -7,7 +7,7 @@ use cookie_rs; use net_traits::CookieSource; -use pub_domains::is_pub_domain; +use net_traits::pub_domains::is_pub_domain; use std::borrow::ToOwned; use std::net::{Ipv4Addr, Ipv6Addr}; use time::{Tm, now, at, Duration}; diff --git a/components/net/lib.rs b/components/net/lib.rs index 120dacb27d2..822c2105c6e 100644 --- a/components/net/lib.rs +++ b/components/net/lib.rs @@ -63,7 +63,6 @@ pub mod hsts; pub mod http_loader; pub mod image_cache_thread; pub mod mime_classifier; -pub mod pub_domains; pub mod resource_thread; pub mod storage_thread; pub mod websocket_loader; diff --git a/components/net/pub_domains.rs b/components/net/pub_domains.rs deleted file mode 100644 index 6a9555d4bb1..00000000000 --- a/components/net/pub_domains.rs +++ /dev/null @@ -1,73 +0,0 @@ -/* This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ - -//! Implementation of public domain matching. -//! -//! The list is a file located on the `resources` folder and loaded once on first need. -//! -//! The list can be updated with `./mach update-pub-domains` from this source: -//! https://publicsuffix.org/list/ -//! -//! This implementation is not strictly following the specification of the list. Wildcards are not -//! restricted to appear only in the leftmost position, but the current list has no such cases so -//! we don't need to make the code more complex for it. The `mach` update command makes sure that -//! those cases are not present. - -use std::collections::HashSet; -use std::str::from_utf8; -use std::sync::Arc; -use util::resource_files::read_resource_file; - -lazy_static! { - static ref PUB_DOMAINS: Arc> = load_pub_domains(); -} - -fn load_pub_domains() -> Arc> { - let content = read_resource_file("public_domains.txt") - .expect("Could not find public suffix list file"); - let domains = from_utf8(&content) - .expect("Could not read suffix list file") - .lines() - .filter_map(|i| { - let domain = i.trim(); - if domain == "" { return None }; - if domain.starts_with("//") { return None }; - Some(domain.to_owned()) - }); - - Arc::new(domains.collect()) -} - -/// Match the given domain against a static list of known public domains -pub fn is_pub_domain(domain: &str) -> bool { - let domain = domain.trim_left_matches("."); - - // Start by looking for a plain match - if PUB_DOMAINS.contains(&domain.to_string()) { - return true - } - - // Then look for a wildcard match - // To make things simpler, just look for the same domain with its leftmost part replaced by a - // wildcard. - match domain.find(".") { - None => { - // This is a domain with only one part, so there is no need to search for wildcards or - // exceptions - return false - } - Some(position) => { - let wildcard_domain = "*".to_string() + domain.split_at(position).1; - if PUB_DOMAINS.contains(&wildcard_domain) { - // We have a wildcard match, search for an eventual exception - let exception_domain = "!".to_string() + domain; - return ! PUB_DOMAINS.contains(&exception_domain) - } else { - // No wildcard match -> this is not a public domain - return false - } - } - } -} - diff --git a/components/net_traits/lib.rs b/components/net_traits/lib.rs index 2cbc198e0f7..6317a7af1d3 100644 --- a/components/net_traits/lib.rs +++ b/components/net_traits/lib.rs @@ -56,6 +56,7 @@ pub mod filemanager_thread; pub mod hosts; pub mod image_cache_thread; pub mod net_error_list; +pub mod pub_domains; pub mod request; pub mod response; pub mod storage_thread; diff --git a/components/net_traits/pub_domains.rs b/components/net_traits/pub_domains.rs new file mode 100644 index 00000000000..f40815e100e --- /dev/null +++ b/components/net_traits/pub_domains.rs @@ -0,0 +1,140 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +//! Implementation of public domain matching. +//! +//! The list is a file located on the `resources` folder and loaded once on first need. +//! +//! The list can be updated with `./mach update-pub-domains` from this source: +//! https://publicsuffix.org/list/ +//! +//! This implementation is not strictly following the specification of the list. Wildcards are not +//! restricted to appear only in the leftmost position, but the current list has no such cases so +//! we don't need to make the code more complex for it. The `mach` update command makes sure that +//! those cases are not present. + +use std::collections::HashSet; +use std::iter::FromIterator; +use std::str::from_utf8; +use util::resource_files::read_resource_file; + +#[derive(Clone,Debug)] +pub struct PubDomainRules { + rules: HashSet, + wildcards: HashSet, + exceptions: HashSet, +} + +lazy_static! { + static ref PUB_DOMAINS: PubDomainRules = load_pub_domains(); +} + +impl<'a> FromIterator<&'a str> for PubDomainRules { + fn from_iter(iter: T) -> Self where T: IntoIterator { + let mut result = PubDomainRules::new(); + for item in iter { + if item.starts_with("!") { + result.exceptions.insert(String::from(&item[1..])); + } else if item.starts_with("*.") { + result.wildcards.insert(String::from(&item[2..])); + } else { + result.rules.insert(String::from(item)); + } + } + result + } +} + +impl PubDomainRules { + pub fn new() -> PubDomainRules { + PubDomainRules { + rules: HashSet::new(), + wildcards: HashSet::new(), + exceptions: HashSet::new(), + } + } + pub fn parse(content: &str) -> PubDomainRules { + content.lines() + .map(str::trim) + .filter(|s| !s.is_empty()) + .filter(|s| !s.starts_with("//")) + .collect() + } + fn suffix_pair<'a>(&self, domain: &'a str) -> (&'a str, &'a str) { + let domain = domain.trim_left_matches("."); + let mut suffix = domain; + let mut prev_suffix = domain; + for (index, _) in domain.match_indices(".") { + let next_suffix = &domain[index + 1..]; + if self.exceptions.contains(suffix) { + return (next_suffix, suffix); + } else if self.wildcards.contains(next_suffix) { + return (suffix, prev_suffix); + } else if self.rules.contains(suffix) { + return (suffix, prev_suffix); + } else { + prev_suffix = suffix; + suffix = next_suffix; + } + } + return (suffix, prev_suffix); + } + pub fn public_suffix<'a>(&self, domain: &'a str) -> &'a str { + let (public, _) = self.suffix_pair(domain); + public + } + pub fn registrable_suffix<'a>(&self, domain: &'a str) -> &'a str { + let (_, registrable) = self.suffix_pair(domain); + registrable + } + pub fn is_public_suffix(&self, domain: &str) -> bool { + // Speeded-up version of + // domain != "" && + // self.public_suffix(domain) == domain. + let domain = domain.trim_left_matches("."); + match domain.find(".") { + None => !domain.is_empty(), + Some(index) => !self.exceptions.contains(domain) && + self.wildcards.contains(&domain[index + 1..]) || + self.rules.contains(domain), + } + } + pub fn is_registrable_suffix(&self, domain: &str) -> bool { + // Speeded-up version of + // self.public_suffix(domain) != domain && + // self.registrable_suffix(domain) == domain. + let domain = domain.trim_left_matches("."); + match domain.find(".") { + None => false, + Some(index) => self.exceptions.contains(domain) || + !self.wildcards.contains(&domain[index + 1..]) && + !self.rules.contains(domain) && + self.is_public_suffix(&domain[index + 1..]), + } + } +} + +fn load_pub_domains() -> PubDomainRules { + let content = read_resource_file("public_domains.txt") + .expect("Could not find public suffix list file"); + let content = from_utf8(&content) + .expect("Could not read public suffix list file"); + PubDomainRules::parse(content) +} + +pub fn pub_suffix(domain: &str) -> &str { + PUB_DOMAINS.public_suffix(domain) +} + +pub fn reg_suffix(domain: &str) -> &str { + PUB_DOMAINS.registrable_suffix(domain) +} + +pub fn is_pub_domain(domain: &str) -> bool { + PUB_DOMAINS.is_public_suffix(domain) +} + +pub fn is_reg_domain(domain: &str) -> bool { + PUB_DOMAINS.is_registrable_suffix(domain) +} diff --git a/tests/unit/net/lib.rs b/tests/unit/net/lib.rs index 1acdb9bb6ae..f7bdec62382 100644 --- a/tests/unit/net/lib.rs +++ b/tests/unit/net/lib.rs @@ -28,7 +28,6 @@ extern crate util; #[cfg(test)] mod file_loader; #[cfg(test)] mod fetch; #[cfg(test)] mod mime_classifier; -#[cfg(test)] mod pub_domains; #[cfg(test)] mod resource_thread; #[cfg(test)] mod hsts; #[cfg(test)] mod http_loader; diff --git a/tests/unit/net/pub_domains.rs b/tests/unit/net/pub_domains.rs deleted file mode 100644 index 60f2aa2b2e9..00000000000 --- a/tests/unit/net/pub_domains.rs +++ /dev/null @@ -1,38 +0,0 @@ -/* This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ - -use net::pub_domains::is_pub_domain; - -#[test] -fn test_is_pub_domain_plain() { - assert!(is_pub_domain("com")); - assert!(is_pub_domain(".org")); - assert!(is_pub_domain("za.org")); - assert!(is_pub_domain("xn--od0alg.hk")); - assert!(is_pub_domain("xn--krdsherad-m8a.no")); -} - -#[test] -fn test_is_pub_domain_wildcard() { - assert!(is_pub_domain("hello.bd")); - assert!(is_pub_domain("world.jm")); - assert!(is_pub_domain("toto.kobe.jp")); -} - -#[test] -fn test_is_pub_domain_exception() { - assert_eq!(is_pub_domain("www.ck"), false); - assert_eq!(is_pub_domain("city.kawasaki.jp"), false); - assert_eq!(is_pub_domain("city.nagoya.jp"), false); - assert_eq!(is_pub_domain("teledata.mz"), false); -} - -#[test] -fn test_is_pub_domain_not() { - assert_eq!(is_pub_domain(".servo.org"), false); - assert_eq!(is_pub_domain("www.mozilla.org"), false); - assert_eq!(is_pub_domain("publicsuffix.org"), false); - assert_eq!(is_pub_domain("hello.world.jm"), false); - assert_eq!(is_pub_domain("toto.toto.kobe.jp"), false); -} diff --git a/tests/unit/net_traits/lib.rs b/tests/unit/net_traits/lib.rs index 0ea8e4cfe78..3f722e5fc63 100644 --- a/tests/unit/net_traits/lib.rs +++ b/tests/unit/net_traits/lib.rs @@ -5,6 +5,7 @@ extern crate net_traits; #[cfg(test)] mod image; +#[cfg(test)] mod pub_domains; #[test] fn test_trim_http_whitespace() { diff --git a/tests/unit/net_traits/pub_domains.rs b/tests/unit/net_traits/pub_domains.rs new file mode 100644 index 00000000000..d18ecb06e0c --- /dev/null +++ b/tests/unit/net_traits/pub_domains.rs @@ -0,0 +1,119 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +use net_traits::pub_domains::{is_pub_domain, is_reg_domain, pub_suffix, reg_suffix}; + +// These tests may need to be updated if the PSL changes. + +#[test] +fn test_is_pub_domain_plain() { + assert!(is_pub_domain("com")); + assert!(is_pub_domain(".org")); + assert!(is_pub_domain("za.org")); + assert!(is_pub_domain("xn--od0alg.hk")); + assert!(is_pub_domain("xn--krdsherad-m8a.no")); +} + +#[test] +fn test_is_pub_domain_wildcard() { + assert!(is_pub_domain("hello.bd")); + assert!(is_pub_domain("world.jm")); + assert!(is_pub_domain("toto.kobe.jp")); +} + +#[test] +fn test_is_pub_domain_exception() { + assert_eq!(is_pub_domain("www.ck"), false); + assert_eq!(is_pub_domain("city.kawasaki.jp"), false); + assert_eq!(is_pub_domain("city.nagoya.jp"), false); + assert_eq!(is_pub_domain("teledata.mz"), false); +} + +#[test] +fn test_is_pub_domain_not() { + assert_eq!(is_pub_domain(""), false); + assert_eq!(is_pub_domain("."), false); + assert_eq!(is_pub_domain("..."), false); + assert_eq!(is_pub_domain(".servo.org"), false); + assert_eq!(is_pub_domain("www.mozilla.org"), false); + assert_eq!(is_pub_domain("publicsuffix.org"), false); + assert_eq!(is_pub_domain("hello.world.jm"), false); + assert_eq!(is_pub_domain("toto.toto.kobe.jp"), false); +} + +#[test] +fn test_is_pub_domain() { + assert!(!is_pub_domain("city.yokohama.jp")); + assert!(!is_pub_domain("foo.bar.baz.yokohama.jp")); + assert!(!is_pub_domain("foo.bar.city.yokohama.jp")); + assert!(!is_pub_domain("foo.bar.com")); + assert!(!is_pub_domain("foo.bar.tokyo.jp")); + assert!(!is_pub_domain("foo.bar.yokohama.jp")); + assert!(!is_pub_domain("foo.city.yokohama.jp")); + assert!(!is_pub_domain("foo.com")); + assert!(!is_pub_domain("foo.tokyo.jp")); + assert!(!is_pub_domain("yokohama.jp")); + assert!(is_pub_domain("com")); + assert!(is_pub_domain("foo.yokohama.jp")); + assert!(is_pub_domain("jp")); + assert!(is_pub_domain("tokyo.jp")); +} + +#[test] +fn test_is_reg_domain() { + assert!(!is_reg_domain("com")); + assert!(!is_reg_domain("foo.bar.baz.yokohama.jp")); + assert!(!is_reg_domain("foo.bar.com")); + assert!(!is_reg_domain("foo.bar.tokyo.jp")); + assert!(!is_reg_domain("foo.city.yokohama.jp")); + assert!(!is_reg_domain("foo.yokohama.jp")); + assert!(!is_reg_domain("jp")); + assert!(!is_reg_domain("tokyo.jp")); + assert!(is_reg_domain("city.yokohama.jp")); + assert!(is_reg_domain("foo.bar.yokohama.jp")); + assert!(is_reg_domain("foo.com")); + assert!(is_reg_domain("foo.tokyo.jp")); + assert!(is_reg_domain("yokohama.jp")); +} + +#[test] +fn test_pub_suffix() { + assert_eq!(pub_suffix("city.yokohama.jp"), "yokohama.jp"); + assert_eq!(pub_suffix("com"), "com"); + assert_eq!(pub_suffix("foo.bar.baz.yokohama.jp"), "baz.yokohama.jp"); + assert_eq!(pub_suffix("foo.bar.com"), "com"); + assert_eq!(pub_suffix("foo.bar.tokyo.jp"), "tokyo.jp"); + assert_eq!(pub_suffix("foo.bar.yokohama.jp"), "bar.yokohama.jp"); + assert_eq!(pub_suffix("foo.city.yokohama.jp"), "yokohama.jp"); + assert_eq!(pub_suffix("foo.com"), "com"); + assert_eq!(pub_suffix("foo.tokyo.jp"), "tokyo.jp"); + assert_eq!(pub_suffix("foo.yokohama.jp"), "foo.yokohama.jp"); + assert_eq!(pub_suffix("jp"), "jp"); + assert_eq!(pub_suffix("tokyo.jp"), "tokyo.jp"); + assert_eq!(pub_suffix("yokohama.jp"), "jp"); +} + +#[test] +fn test_reg_suffix() { + assert_eq!(reg_suffix("city.yokohama.jp"), "city.yokohama.jp"); + assert_eq!(reg_suffix("com"), "com"); + assert_eq!(reg_suffix("foo.bar.baz.yokohama.jp"), "bar.baz.yokohama.jp"); + assert_eq!(reg_suffix("foo.bar.com"), "bar.com"); + assert_eq!(reg_suffix("foo.bar.tokyo.jp"), "bar.tokyo.jp"); + assert_eq!(reg_suffix("foo.bar.yokohama.jp"), "foo.bar.yokohama.jp"); + assert_eq!(reg_suffix("foo.city.yokohama.jp"), "city.yokohama.jp"); + assert_eq!(reg_suffix("foo.com"), "foo.com"); + assert_eq!(reg_suffix("foo.tokyo.jp"), "foo.tokyo.jp"); + assert_eq!(reg_suffix("foo.yokohama.jp"), "foo.yokohama.jp"); + assert_eq!(reg_suffix("jp"), "jp"); + assert_eq!(reg_suffix("tokyo.jp"), "tokyo.jp"); + assert_eq!(reg_suffix("yokohama.jp"), "yokohama.jp"); +} + +#[test] +fn test_weirdness() { + // These are weird results, but AFAICT they are spec-compliant. + assert!(pub_suffix("city.yokohama.jp") != pub_suffix(pub_suffix("city.yokohama.jp"))); + assert!(!is_pub_domain(pub_suffix("city.yokohama.jp"))); +}