base: Remove ucd dependency (#32424)

Remove the `ucd` dependency which has not been updated in 8 years. In
addition, replace it with a generated UnicodeBlock enum which reflects
the modern Unicode standard. This is generated via a Python script which
is included in the repository. The generation is not part of the build
process, because the Unicode database is hosted on the web and it does
not change the frequently.

This is done instead of bringing in the more up-to-date `unicode_blocks`
dependency. `unicode_blocks` defines each block as constant, which means
that they cannot be used in match statements -- which we do in Servo.

Co-authored-by: Lauryn Menard <lauryn.menard@gmail.com>
This commit is contained in:
Martin Robinson 2024-06-03 19:10:01 +02:00 committed by GitHub
parent 48ab8d8847
commit f8985c5521
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
18 changed files with 834 additions and 100 deletions

View file

@ -0,0 +1,64 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/. */
pub use crate::unicode_block::{UnicodeBlock, UnicodeBlockMethod};
pub fn is_bidi_control(c: char) -> bool {
matches!(c, '\u{202A}'..='\u{202E}' | '\u{2066}'..='\u{2069}' | '\u{200E}' | '\u{200F}' | '\u{061C}')
}
pub fn unicode_plane(codepoint: char) -> u32 {
(codepoint as u32) >> 16
}
pub fn is_cjk(codepoint: char) -> bool {
if let Some(block) = codepoint.block() {
match block {
UnicodeBlock::CJKRadicalsSupplement |
UnicodeBlock::KangxiRadicals |
UnicodeBlock::IdeographicDescriptionCharacters |
UnicodeBlock::CJKSymbolsandPunctuation |
UnicodeBlock::Hiragana |
UnicodeBlock::Katakana |
UnicodeBlock::Bopomofo |
UnicodeBlock::HangulCompatibilityJamo |
UnicodeBlock::Kanbun |
UnicodeBlock::BopomofoExtended |
UnicodeBlock::CJKStrokes |
UnicodeBlock::KatakanaPhoneticExtensions |
UnicodeBlock::EnclosedCJKLettersandMonths |
UnicodeBlock::CJKCompatibility |
UnicodeBlock::CJKUnifiedIdeographsExtensionA |
UnicodeBlock::YijingHexagramSymbols |
UnicodeBlock::CJKUnifiedIdeographs |
UnicodeBlock::CJKCompatibilityIdeographs |
UnicodeBlock::CJKCompatibilityForms |
UnicodeBlock::HalfwidthandFullwidthForms => return true,
_ => {},
}
}
// https://en.wikipedia.org/wiki/Plane_(Unicode)#Supplementary_Ideographic_Plane
// https://en.wikipedia.org/wiki/Plane_(Unicode)#Tertiary_Ideographic_Plane
unicode_plane(codepoint) == 2 || unicode_plane(codepoint) == 3
}
#[test]
fn test_is_cjk() {
// Test characters from different CJK blocks
assert_eq!(is_cjk(''), true);
assert_eq!(is_cjk('㐀'), true);
assert_eq!(is_cjk('あ'), true);
assert_eq!(is_cjk('ア'), true);
assert_eq!(is_cjk('㆒'), true);
assert_eq!(is_cjk('ㆣ'), true);
assert_eq!(is_cjk('龥'), true);
assert_eq!(is_cjk('𰾑'), true);
assert_eq!(is_cjk('𰻝'), true);
// Test characters from outside CJK blocks
assert_eq!(is_cjk('a'), false);
assert_eq!(is_cjk('🙂'), false);
assert_eq!(is_cjk('©'), false);
}