From 68e49e5cd8bd93f28d2ae4ad087f84d3925b8ccb Mon Sep 17 00:00:00 2001 From: Colin Rofls Date: Wed, 14 Feb 2024 12:13:01 -0500 Subject: [PATCH 1/6] [fontbe] Add properties module and initial helpers - a method to iterate the scripts + extensions for a codepoint - ability to classifty glyphs into groups based on bidi class or & script --- fontbe/Cargo.toml | 4 ++ fontbe/src/features.rs | 2 + fontbe/src/features/properties.rs | 114 ++++++++++++++++++++++++++++++ 3 files changed, 120 insertions(+) create mode 100644 fontbe/src/features/properties.rs diff --git a/fontbe/Cargo.toml b/fontbe/Cargo.toml index e7e4a4701..18f0c6285 100644 --- a/fontbe/Cargo.toml +++ b/fontbe/Cargo.toml @@ -14,6 +14,10 @@ categories = ["text-processing", "parsing", "graphics"] fontdrasil = { version = "0.0.1", path = "../fontdrasil" } fontir = { version = "0.0.1", path = "../fontir" } fea-rs = { version = "0.18.0", path = "../fea-rs", features = ["serde"] } +#icu_properties = "1.4" +icu_properties = { version = "1.4", path = "../../extern/icu4x/components/properties" } +#tinystr = "0.7.5" +tinystr = {version = "0.7.5", path = "../../extern/icu4x/utils/tinystr" } serde.workspace = true bincode.workspace = true diff --git a/fontbe/src/features.rs b/fontbe/src/features.rs index 5c762e457..bdd108983 100644 --- a/fontbe/src/features.rs +++ b/fontbe/src/features.rs @@ -41,6 +41,8 @@ use crate::{ mod kern; mod marks; +mod properties; + pub use kern::{create_gather_ir_kerning_work, create_kern_segment_work, create_kerns_work}; pub use marks::create_mark_work; diff --git a/fontbe/src/features/properties.rs b/fontbe/src/features/properties.rs new file mode 100644 index 000000000..e469d1de6 --- /dev/null +++ b/fontbe/src/features/properties.rs @@ -0,0 +1,114 @@ +use std::{ + collections::{HashMap, HashSet}, + hash::Hash, + sync::Arc, +}; + +use fontir::ir::Glyph; +use icu_properties::{script::ScriptWithExtensionsBorrowed, BidiClass, Script}; +use write_fonts::{ + read::{tables::gsub::Gsub, ReadError}, + types::{GlyphId, Tag}, +}; + +static SCRIPT_DATA: ScriptWithExtensionsBorrowed<'static> = + icu_properties::script::script_with_extensions(); + +/// The type used by icu4x for script names +pub type UnicodeShortName = tinystr::TinyAsciiStr<4>; + +/// Iterate over the unicode scripts + extensions for the provided codepoint +/// +/// This returns the scripts as shortnames, because that's what python does. +/// It would probably make more sense for us to use the Script type defined by +/// icu4x, but I want to get a more direct port working first. +pub(crate) fn unicode_script_extensions( + c: u32, +) -> impl Iterator + 'static { + let lookup = Script::enum_to_short_name_mapper(); + SCRIPT_DATA + .get_script_extensions_val(c) + .iter() + .map(move |script| { + lookup + .get(script) + // if we get a script it is by definition a 4-char ascii string, + // so this unwrap should never fail + .expect("names should be available for all defined scripts") + }) +} + +fn unicode_bidi_type(c: u32) -> BidiClass { + icu_properties::maps::bidi_class().get32(c) +} + +// equivalent to the 'classify' method in ufo2ft: +// +fn classify( + glyphs: &[(Arc, GlyphId)], + mut props_fn: F, + gsub: Option<&Gsub>, +) -> Result>, ReadError> +where + T: Hash + Eq, + I: Iterator, + F: FnMut(u32) -> I, +{ + let mut sets = HashMap::new(); + let mut neutral_glyphs = HashSet::new(); + for (gid, unicode_value) in glyphs.iter().flat_map(|(glyph, gid)| { + glyph + .codepoints + .iter() + .copied() + .map(|codepoint| (*gid, codepoint)) + }) { + let mut has_props = false; + for prop in props_fn(unicode_value) { + sets.entry(prop).or_insert(HashSet::new()).insert(gid); + has_props = true; + } + if !has_props { + neutral_glyphs.insert(gid); + } + } + + if let Some(gsub) = gsub.as_ref() { + neutral_glyphs = gsub.closure_glyphs(neutral_glyphs)?; + for glyphs in sets.values_mut() { + let temp = glyphs + .union(&neutral_glyphs) + .copied() + .collect::>(); + let temp = gsub.closure_glyphs(temp)?; + glyphs.extend(temp.difference(&neutral_glyphs).copied()) + } + } + Ok(sets) +} + +/// Returns a map of gids their scripts +pub(crate) fn scripts_by_glyph( + glyphs: &[(Arc, GlyphId)], + gsub: Option<&Gsub>, +) -> Result>, ReadError> { + let mut result = HashMap::with_capacity(glyphs.len()); + for (script, glyphs) in classify(glyphs, |cp| unicode_script_extensions(cp), gsub)? { + for glyph in glyphs { + result.entry(glyph).or_insert(HashSet::new()).insert(script); + } + } + Ok(result) +} + +/// A map of bidi class to glyphs in that class. +pub(crate) fn glyphs_by_bidi_class( + glyphs: &[(Arc, GlyphId)], + gsub: Option<&Gsub>, +) -> Result>, ReadError> { + classify( + glyphs, + |codepoint| Some(unicode_bidi_type(codepoint)).into_iter(), + gsub, + ) +} From e1c52348e112ab51800caddd34008cfbeeb2fb98 Mon Sep 17 00:00:00 2001 From: Colin Rofls Date: Tue, 27 Feb 2024 18:21:32 -0600 Subject: [PATCH 2/6] [fontbe] Implement Ot script -> Unicode script mapping The opentype script tags have a different format from the short names used by unicode. --- fontbe/src/features/properties.rs | 114 ++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/fontbe/src/features/properties.rs b/fontbe/src/features/properties.rs index e469d1de6..883959e6d 100644 --- a/fontbe/src/features/properties.rs +++ b/fontbe/src/features/properties.rs @@ -112,3 +112,117 @@ pub(crate) fn glyphs_by_bidi_class( gsub, ) } +static SCRIPT_ALIASES: &[(Tag, Tag)] = &[(Tag::new(b"jamo"), Tag::new(b"hang"))]; + +//TODO: python maps 'math' to 'Zmth' but 'Mathematical Notation' +//is not a variant on icu4x's Script? But this is included in +//some datafiles so might be an oversight? But it's actually not listed +//in the official list of scripts, so idk? +static SCRIPT_EXCEPTIONS: &[(Tag, Script)] = &[]; + +// I don't know what's going on here, just copying OTTags.py +static NEW_SCRIPTS: &[(Tag, Script)] = &[ + (Tag::new(b"bng2"), Script::Bengali), + (Tag::new(b"dev2"), Script::Devanagari), + (Tag::new(b"gjr2"), Script::Gujarati), + (Tag::new(b"gur2"), Script::Gurmukhi), + (Tag::new(b"knd2"), Script::Kannada), + (Tag::new(b"mlm2"), Script::Malayalam), + (Tag::new(b"mym2"), Script::Myanmar), + (Tag::new(b"ory2"), Script::Oriya), + (Tag::new(b"tel2"), Script::Telugu), + (Tag::new(b"tml2"), Script::Tamil), +]; + +// a little helper trait to handle binary searching an array of 2-tuples where +// the first item is a key and the second a value +trait BinarySearchExact { + fn binary_search_exact(&self, needle: &T) -> Option; +} + +impl BinarySearchExact for &[(T, U)] { + fn binary_search_exact(&self, needle: &T) -> Option { + self.binary_search_by(|probe| probe.0.cmp(&needle)) + .ok() + .map(|idx| &self[idx].1) + .cloned() + } +} + +/// Takes an OpenType script tag and returns a unicode script identifier +/// +/// +pub(crate) fn ot_tag_to_script(script_tag: Tag) -> Option { + const DFLT: Tag = Tag::new(b"DFLT"); + if script_tag == DFLT { + return None; + } + + let tag = SCRIPT_ALIASES + .binary_search_exact(&script_tag) + .unwrap_or(script_tag); + + if let Some(exception) = SCRIPT_EXCEPTIONS + .binary_search_exact(&tag) + .or_else(|| NEW_SCRIPTS.binary_search_exact(&tag)) + { + let mapping = Script::enum_to_short_name_mapper(); + return mapping.get(exception); + } + + // finally, algorithmic conversion + Some(ot_tag_to_unicode_short_name(tag)) +} + +// first char is uppercased; any trailing spaces are replaced with last non-space letter +fn ot_tag_to_unicode_short_name(tag: Tag) -> UnicodeShortName { + const SPACE: u8 = b' '; + + let tag_bytes = tag.into_bytes(); + let mut out = [b'\0'; 4]; + out[0] = tag_bytes[0].to_ascii_uppercase(); + let mut last_non_space = tag_bytes[1]; + for i in 1..=3 { + if tag_bytes[i] != SPACE { + out[i] = tag_bytes[i]; + last_non_space = tag_bytes[i]; + } else { + out[i] = last_non_space; + } + } + + UnicodeShortName::try_from_raw(out).expect("cannot fail, as tag cannot have leading nul byte") +} + +#[cfg(test)] +mod tests { + use super::*; + + /// we want to binary search these, so let's enforce that they are sorted, + /// to avoid future headaches + #[test] + fn const_arrays_are_sorted() { + fn get_original_and_sorted_items( + items: &[(T, U)], + ) -> (Vec, Vec) { + let originals = items.iter().map(|(a, _)| a.clone()).collect::>(); + let mut sorted = originals.clone(); + sorted.sort(); + (originals, sorted) + } + + let (actual, expected) = get_original_and_sorted_items(SCRIPT_ALIASES); + assert_eq!(actual, expected); + let (actual, expected) = get_original_and_sorted_items(SCRIPT_EXCEPTIONS); + assert_eq!(actual, expected); + let (actual, expected) = get_original_and_sorted_items(NEW_SCRIPTS); + assert_eq!(actual, expected); + } + + #[test] + fn raw_tag_conversion() { + assert_eq!(ot_tag_to_unicode_short_name(Tag::new(b"deva")), "Deva"); + assert_eq!(ot_tag_to_unicode_short_name(Tag::new(b"yi ")), "Yiii"); + assert_eq!(ot_tag_to_unicode_short_name(Tag::new(b"nko ")), "Nkoo"); + } +} From 624249243e5f596fe3866f7b77393c0c7c26ffc4 Mon Sep 17 00:00:00 2001 From: Colin Rofls Date: Tue, 12 Mar 2024 15:48:24 -0400 Subject: [PATCH 3/6] [kerning] Split kern rules by script & directionality This is very closely based on the code in the KernFeatureWriter, in ufo2ft. With this patch we very nearly match fontmake's output for kerning in oswald, with a few lingering differences. It feels like there is room for polish here, but I also think it's worth checkpointing here, for further iteration. --- Cargo.toml | 4 +- fea-rs/src/common/glyph_class.rs | 8 +- fea-rs/src/compile/lookups.rs | 10 +- fea-rs/src/compile/lookups/gpos_builders.rs | 11 + fea-rs/src/compile/metrics.rs | 10 + fea-rs/src/token_tree/typed.rs | 12 +- fontbe/Cargo.toml | 6 +- fontbe/src/features.rs | 17 +- fontbe/src/features/kern.rs | 673 +++++++++++++++++++- fontbe/src/features/properties.rs | 279 +++++++- fontbe/src/orchestration.rs | 114 +++- fontc/src/workload.rs | 3 + 12 files changed, 1088 insertions(+), 59 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 0d932aea5..40855cc9a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,7 +35,6 @@ temp-env = "0.3.3" rstest = "0.18.2" [workspace] - resolver = "2" members = [ @@ -52,3 +51,6 @@ members = [ "layout-normalizer", ] +[patch.crates-io] +icu_properties = { version = "1.4", git = "https://github.com/unicode-org/icu4x.git", rev = "728eb44" } +tinystr = { version = "0.7.5", git = "https://github.com/unicode-org/icu4x.git", rev = "728eb44" } diff --git a/fea-rs/src/common/glyph_class.rs b/fea-rs/src/common/glyph_class.rs index ed8b19fa1..3ceef3f12 100644 --- a/fea-rs/src/common/glyph_class.rs +++ b/fea-rs/src/common/glyph_class.rs @@ -65,9 +65,15 @@ impl GlyphSet { self.0.iter().copied() } - pub(crate) fn len(&self) -> usize { + /// The number of glyphs in the set + pub fn len(&self) -> usize { self.0.len() } + + /// Returns `true` if the set contains no glyphs + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } } impl std::iter::FromIterator for GlyphClass { diff --git a/fea-rs/src/compile/lookups.rs b/fea-rs/src/compile/lookups.rs index e5f029f45..26a487b23 100644 --- a/fea-rs/src/compile/lookups.rs +++ b/fea-rs/src/compile/lookups.rs @@ -8,6 +8,7 @@ mod helpers; use std::{ collections::{BTreeMap, HashMap}, convert::TryInto, + fmt::Debug, }; use smol_str::SmolStr; @@ -182,7 +183,8 @@ pub(crate) struct LookupFlagInfo { } /// A feature associated with a particular script and language. -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct FeatureKey { pub(crate) feature: Tag, pub(crate) language: Tag, @@ -1171,6 +1173,12 @@ impl FeatureKey { } } +impl Debug for FeatureKey { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}: {}/{}", self.feature, self.script, self.language) + } +} + fn is_gpos_rule(kind: Kind) -> bool { matches!( kind, diff --git a/fea-rs/src/compile/lookups/gpos_builders.rs b/fea-rs/src/compile/lookups/gpos_builders.rs index f57fa0a58..bb65bac46 100644 --- a/fea-rs/src/compile/lookups/gpos_builders.rs +++ b/fea-rs/src/compile/lookups/gpos_builders.rs @@ -196,6 +196,17 @@ impl PairPosBuilder { self.pairs.0.is_empty() && self.classes.0.is_empty() } + /// The number of rules in the builder + pub fn len(&self) -> usize { + self.pairs.0.values().map(|vals| vals.len()).sum::() + + self + .classes + .0 + .values() + .flat_map(|x| x.iter().map(|y| y.items.values().len())) + .sum::() + } + /// Insert a new kerning pair pub fn insert_pair( &mut self, diff --git a/fea-rs/src/compile/metrics.rs b/fea-rs/src/compile/metrics.rs index 5f79c8874..b161f2faf 100644 --- a/fea-rs/src/compile/metrics.rs +++ b/fea-rs/src/compile/metrics.rs @@ -61,6 +61,16 @@ impl ValueRecord { Default::default() } + /// Duplicates the x-advance value to x-placement, required for RTL rules. + /// + /// This is only necessary when a record was originally created without + /// knowledge of the writing direction, and then later needs to be modified. + pub fn make_rtl_compatible(&mut self) { + if self.x_placement.is_none() { + self.x_placement = self.x_advance.clone(); + } + } + // these methods just match the existing builder methods on `ValueRecord` /// Builder style method to set the default x_placement value pub fn with_x_placement(mut self, val: i16) -> Self { diff --git a/fea-rs/src/token_tree/typed.rs b/fea-rs/src/token_tree/typed.rs index 7be379d9e..38923f0d9 100644 --- a/fea-rs/src/token_tree/typed.rs +++ b/fea-rs/src/token_tree/typed.rs @@ -420,17 +420,20 @@ impl ContextualRuleNode for Gsub8 {} impl ContextualRuleNode for IgnoreRule {} impl Root { - pub(crate) fn statements(&self) -> impl Iterator { + /// Iterate over all top-level statements + pub fn statements(&self) -> impl Iterator { self.iter().filter(|t| !t.kind().is_trivia()) } } impl LanguageSystem { - pub(crate) fn script(&self) -> Tag { + /// The script tag + pub fn script(&self) -> Tag { self.inner.iter_children().find_map(Tag::cast).unwrap() } - pub(crate) fn language(&self) -> Tag { + /// The language tag + pub fn language(&self) -> Tag { self.inner .iter_children() .skip_while(|t| t.kind() != Kind::Tag) @@ -451,7 +454,8 @@ impl Tag { self.inner.text.parse() } - pub(crate) fn to_raw(&self) -> write_fonts::types::Tag { + /// Convert this AST tag into a raw `Tag` + pub fn to_raw(&self) -> write_fonts::types::Tag { self.parse().expect("tag is exactly 4 bytes") } } diff --git a/fontbe/Cargo.toml b/fontbe/Cargo.toml index 18f0c6285..48db4926d 100644 --- a/fontbe/Cargo.toml +++ b/fontbe/Cargo.toml @@ -14,10 +14,8 @@ categories = ["text-processing", "parsing", "graphics"] fontdrasil = { version = "0.0.1", path = "../fontdrasil" } fontir = { version = "0.0.1", path = "../fontir" } fea-rs = { version = "0.18.0", path = "../fea-rs", features = ["serde"] } -#icu_properties = "1.4" -icu_properties = { version = "1.4", path = "../../extern/icu4x/components/properties" } -#tinystr = "0.7.5" -tinystr = {version = "0.7.5", path = "../../extern/icu4x/utils/tinystr" } +icu_properties = "1.4" +tinystr = {version = "0.7.5", features = ["serde"] } serde.workspace = true bincode.workspace = true diff --git a/fontbe/src/features.rs b/fontbe/src/features.rs index bdd108983..8a674b512 100644 --- a/fontbe/src/features.rs +++ b/fontbe/src/features.rs @@ -213,12 +213,19 @@ impl<'a> FeatureWriter<'a> { if self.kerning.is_empty() { return Ok(()); } - let pairpos_subtables = self.kerning.lookups.clone(); + // convert the lookups into lookup ids + let lookup_ids = self + .kerning + .lookups + .iter() + .map(|lookups| builder.add_lookup(LookupFlag::empty(), None, lookups.to_owned())) + .collect::>(); - // now we have a builder for the pairpos subtables, so we can make - // a lookup: - let lookups = vec![builder.add_lookup(LookupFlag::empty(), None, pairpos_subtables)]; - builder.add_to_default_language_systems(Tag::new(b"kern"), &lookups); + for (feature, ids) in &self.kerning.features { + // get the generated lookup ids based on the stored lookup indices + let ids = ids.iter().map(|idx| lookup_ids[*idx]).collect(); + builder.add_feature(*feature, ids); + } { self.timing diff --git a/fontbe/src/features/kern.rs b/fontbe/src/features/kern.rs index 1077c1d15..bc4ca0fa8 100644 --- a/fontbe/src/features/kern.rs +++ b/fontbe/src/features/kern.rs @@ -1,10 +1,17 @@ //! Generates an [FeaRsKerns] datastructure to be fed to fea-rs -use std::collections::{BTreeMap, HashMap, HashSet}; +use std::{ + borrow::Cow, + collections::{BTreeMap, BTreeSet, HashMap, HashSet}, +}; use fea_rs::{ - compile::{PairPosBuilder, ValueRecord as ValueRecordBuilder}, - GlyphSet, + compile::{ + FeatureKey, NopFeatureProvider, NopVariationInfo, PairPosBuilder, + ValueRecord as ValueRecordBuilder, + }, + typed::{AstNode, LanguageSystem}, + GlyphMap, GlyphSet, Opts, ParseTree, }; use fontdrasil::{ coords::NormalizedLocation, @@ -15,22 +22,38 @@ use fontir::{ ir::{KernGroup, KernPair, KernParticipant, KerningGroups, KerningInstance}, orchestration::WorkId as FeWorkId, }; +use icu_properties::BidiClass; use log::debug; use ordered_float::OrderedFloat; -use write_fonts::types::GlyphId; +use write_fonts::{ + read::{tables::gsub::Gsub, FontRead, ReadError}, + tables::gdef::GlyphClassDef, + types::{GlyphId, Tag}, +}; use crate::{ error::Error, - features::resolve_variable_metric, + features::{ + properties::{ScriptDirection, UnicodeShortName, COMMON_SCRIPT, INHERITED_SCRIPT}, + resolve_variable_metric, + }, orchestration::{ AllKerningPairs, AnyWorkId, BeWork, Context, FeaRsKerns, KernAdjustments, KernFragment, PairPosEntry, WorkId, }, }; +use super::properties::CharMap; + /// On Linux it took ~0.01 ms per loop, try to get enough to make fan out worthwhile /// based on empirical testing const KERNS_PER_BLOCK: usize = 2048; +const KERN: Tag = Tag::new(b"kern"); +// we don't currently compile this feature, but we will, and it is referenced +// in places because our impl is based on fonttools. +const DIST: Tag = Tag::new(b"dist"); +const DFLT_SCRIPT: Tag = Tag::new(b"DFLT"); +const DFLT_LANG: Tag = Tag::new(b"dflt"); /// Accumulation of all the kerning from IR #[derive(Debug)] @@ -359,31 +382,645 @@ impl Work for KerningGatherWork { fn exec(&self, context: &Context) -> Result<(), Error> { debug!("Gather be kerning"); let arc_fragments = context.kern_fragments.all(); + let ast = context.fea_ast.get(); + let glyph_order = context.ir.glyph_order.get(); + let glyph_map = glyph_order.iter().map(|g| g.clone().into_inner()).collect(); let mut fragments: Vec<_> = arc_fragments .iter() .map(|(_, fragment)| fragment.as_ref()) .collect(); fragments.sort_by_key(|fragment| fragment.segment); - let mut builder = PairPosBuilder::default(); + let glyphs_and_gids = glyph_order + .iter() + .enumerate() + .map(|(i, glyphname)| { + ( + context.ir.glyphs.get(&FeWorkId::Glyph(glyphname.clone())), + GlyphId::new(i as u16), + ) + }) + .collect::>(); + let lookups = self.finalize_kerning(&fragments, &ast.ast, &glyph_map, &glyphs_and_gids)?; + context.fea_rs_kerns.set(lookups); + Ok(()) + } +} + +impl KerningGatherWork { + //. take the kerning fragments and generate the kerning lookups. + /// + // This includes much of the logic from the ufo2ft KernFeatureWriter + fn finalize_kerning( + &self, + fragments: &[&KernFragment], + ast: &ParseTree, + glyph_map: &GlyphMap, + glyphs: &impl CharMap, + ) -> Result { + // ignore diagnostics, they'll get logged during actual GSUB compilation + let (compilation, _) = fea_rs::compile::compile::( + ast, + glyph_map, + None, + None, + Opts::new().compile_gpos(false), + ) + .map_err(|err| { + Error::FeaCompileError(fea_rs::compile::error::CompilerError::CompilationFail(err)) + })?; + + let gsub = compilation + .gsub + .as_ref() + .map(write_fonts::dump_table) + .transpose() + .expect("if this doesn't compile we will already panic when we try to add it to the context"); + let gsub = gsub + .as_ref() + .map(|data| write_fonts::read::tables::gsub::Gsub::read(data.as_slice().into())) + .transpose()?; + + let gdef = compilation.gdef_classes; - let mut entries = 0; - for ppe in fragments + // serves as a standin for cmap in the fonttools impl + + let pairs = fragments .iter() - .flat_map(|fragment| fragment.kerns.iter()) - .cloned() + .flat_map(|frag| frag.kerns.iter()) + .collect::>(); + + let known_scripts = guess_font_scripts(ast, glyphs); + let split_ctx = KernSplitContext::new(glyphs, &known_scripts, gsub, gdef)?; + + let lookups = split_ctx.make_lookups(&pairs); + let (lookups, features) = self.assign_lookups_to_scripts(lookups, ast, KERN); + Ok(FeaRsKerns { lookups, features }) + } + + /// returns a vec of lookups (as a vec of subtables), along with a map of features -> lookups + /// (by order in the first vec) + fn assign_lookups_to_scripts( + &self, + lookups: HashMap, Vec>, + ast: &ParseTree, + // one of 'kern' or 'dist' + current_feature: Tag, + ) -> (Vec>, HashMap>) { + let dflt_langs = vec![DFLT_LANG]; + + let is_kern_feature = current_feature == KERN; + assert!(is_kern_feature || current_feature == DIST); + // this is based on the _registerLookups fn, and this is where we should start typing + let mut lookups_by_script = HashMap::new(); + let mut ordered_lookups = Vec::new(); + + let fea_langs_by_script: HashMap<_, _> = get_script_language_systems(ast) + .into_values() + .flat_map(|x| x.into_iter()) + .collect(); + + // in python this part happens earlier, as part of splitKerning. + for (scripts, lookups) in lookups { + let idx = ordered_lookups.len(); + ordered_lookups.push(lookups); + for script in scripts { + lookups_by_script + .entry(script) + .or_insert(Vec::new()) + .push(idx); + } + } + + // now implement the logic from _registerLookups in KernFeatureWriter: + // + let mut default_lookups = Vec::new(); + if let Some(common_lookups) = lookups_by_script + .get(&COMMON_SCRIPT) + .filter(|_| is_kern_feature) { - ppe.add_to(&mut builder); - entries += 1; + log::debug!("found {} default lookups", common_lookups.len()); + default_lookups.extend(common_lookups.iter().copied()); } - debug!("{entries} be kerns gathered"); - let mut kerns = FeaRsKerns::default(); - if !builder.is_empty() { - kerns.lookups = vec![builder]; + //inDesign bugfix: + let dist_enabled_scripts = super::properties::dist_feature_enabled_scripts(); + let (mut ltr_lookups, mut rtl_lookups) = (Vec::new(), Vec::new()); + for (script, lookups) in lookups_by_script + .iter() + .filter(|(script, _)| !dist_enabled_scripts.contains(script)) + { + match ScriptDirection::for_script(script) { + ScriptDirection::LeftToRight => ltr_lookups.extend(lookups.iter().copied()), + ScriptDirection::RightToLeft => rtl_lookups.extend(lookups.iter().copied()), + ScriptDirection::Auto => (), + } } - context.fea_rs_kerns.set(kerns); - Ok(()) + // if we have any LTR lookups, we add those to defaults, otherwise add RTL lookups + if !ltr_lookups.is_empty() { + default_lookups.extend(ltr_lookups); + } else { + default_lookups.extend(rtl_lookups); + } + default_lookups.sort_unstable(); + + let mut features = HashMap::new(); + if !default_lookups.is_empty() { + let languages = fea_langs_by_script.get(&DFLT_SCRIPT).unwrap_or(&dflt_langs); + log::debug!("DFLT languages: {languages:?}"); + for lang in languages { + features.insert( + FeatureKey::new(KERN, *lang, DFLT_SCRIPT), + default_lookups.clone(), + ); + } + } + + let common_lookups = lookups_by_script.remove(&COMMON_SCRIPT); + let inherited_lookups = lookups_by_script.remove(&INHERITED_SCRIPT); + let dflt_lookups = match (common_lookups, inherited_lookups) { + (Some(mut a), Some(b)) => { + a.extend(b); + a.sort_unstable(); + a.dedup(); + a + } + (Some(a), None) | (None, Some(a)) => a, + (None, None) => Vec::new(), + }; + + for (script, mut lookups) in lookups_by_script { + lookups.extend(dflt_lookups.iter().copied()); + + for tag in super::properties::script_to_ot_tags(&script) { + let languages = fea_langs_by_script.get(&tag).unwrap_or(&dflt_langs); + for lang in languages { + log::info!("added {} lookups for {tag}/{lang}", lookups.len()); + features.insert(FeatureKey::new(KERN, *lang, tag), lookups.clone()); + } + } + } + + debug_ordered_lookups(&features, &ordered_lookups); + (ordered_lookups, features) + } +} + +fn debug_ordered_lookups( + features: &HashMap>, + lookups: &[Vec], +) { + for (i, subtables) in lookups.iter().enumerate() { + let total_rules = subtables.iter().map(|x| x.len()).sum::(); + log::debug!("lookup {i}, {total_rules} rules"); } + + let mut feature_keys = features.keys().collect::>(); + feature_keys.sort(); + for feature in feature_keys { + let indices = features.get(feature).unwrap(); + log::debug!("feature {feature:?}, lookups {indices:?}"); + } +} + +/// All the state needed for splitting kern pairs by script & writing direction +struct KernSplitContext { + gdef: Option>, + glyph_scripts: HashMap>, + bidi_glyphs: HashMap>, + opts: KernSplitOptions, + dflt_scripts: HashSet, + common_scripts: HashSet, +} + +// unused for now, but included so that our impl can more closely follow fonttools +struct KernSplitOptions { + ignore_marks: bool, +} + +impl Default for KernSplitOptions { + fn default() -> Self { + Self { ignore_marks: true } + } +} + +impl KernSplitContext { + fn new( + glyphs: &impl CharMap, + known_scripts: &HashSet, + gsub: Option, + gdef: Option>, + ) -> Result { + let glyph_scripts = + super::properties::scripts_by_glyph(glyphs, known_scripts, gsub.as_ref())?; + let bidi_glyphs = super::properties::glyphs_by_bidi_class(glyphs, gsub.as_ref())?; + + Ok(Self { + gdef, + glyph_scripts, + bidi_glyphs, + opts: KernSplitOptions::default(), + dflt_scripts: HashSet::from([COMMON_SCRIPT, INHERITED_SCRIPT]), + common_scripts: HashSet::from([COMMON_SCRIPT]), + }) + } + + fn mark_glyph_ids(&self) -> HashSet { + self.gdef + .as_ref() + .map(|classes| { + classes + .iter() + .filter_map(|(gid, cls)| (*cls == GlyphClassDef::Mark).then_some(*gid)) + .collect() + }) + .unwrap_or_default() + } + + // + fn make_lookups( + &self, + pairs: &[&PairPosEntry], + ) -> HashMap, Vec> { + if !self.opts.ignore_marks { + let pairs = pairs.iter().map(|x| Cow::Borrowed(*x)).collect::>(); + return self.make_split_script_kern_lookups(&pairs, false); + } + + let (base_pairs, mark_pairs) = self.split_base_and_mark_pairs(pairs); + let mut result = HashMap::new(); + if !base_pairs.is_empty() { + result = self.make_split_script_kern_lookups(&base_pairs, false); + } + if !mark_pairs.is_empty() { + for (scripts, lookups) in self.make_split_script_kern_lookups(&mark_pairs, true) { + result.entry(scripts).or_default().extend(lookups); + } + } + result + } + + /// Split these pairpos rules into lookups based on script. + /// + /// Returns a map of scripts: `[lookup]`, where 'scripts' is a set of scripts + /// referencing the lookups. + /// + /// Although this method always/only returns a single lookup per unique set + /// of scripts, we wrap it in a vec because that is what we need at the + /// call site. + // + fn make_split_script_kern_lookups( + &self, + pairs: &[Cow], + //TODO: handle marks + _are_marks: bool, + ) -> HashMap, Vec> { + let mut lookups_by_script = HashMap::new(); + let kerning_per_script = self.split_kerns(pairs); + let mut bidi_buf = HashSet::new(); // we can reuse this for each pair + for (scripts, pairs) in kerning_per_script { + let mut lookup = PairPosBuilder::default(); + for mut pair in pairs { + bidi_buf.clear(); + for (direction, glyphs) in &self.bidi_glyphs { + if !pair.glyphs_are_disjoint(glyphs) { + bidi_buf.insert(*direction); + } + } + if bidi_buf.contains(&BidiClass::LeftToRight) + && bidi_buf.contains(&BidiClass::RightToLeft) + { + log::warn!( + "skipping kern pair with ambigous direction: {}", + pair.display_glyphs() + ); + continue; + } + + let script_direction = ScriptDirection::for_script(scripts.first().unwrap()); + assert!(scripts + .iter() + .all(|x| ScriptDirection::for_script(x) == script_direction)); + let script_is_rtl = matches!(script_direction, ScriptDirection::RightToLeft); + let pair_is_rtl = script_is_rtl && !bidi_buf.contains(&BidiClass::LeftToRight); + if pair_is_rtl { + pair.make_rtl_compatible(); + } + pair.add_to(&mut lookup); + } + lookups_by_script.insert(scripts, vec![lookup]); + } + lookups_by_script + } + + // + fn split_kerns( + &self, + pairs: &[Cow], + ) -> HashMap, Vec> { + let mut kerning_per_script = HashMap::new(); + for pair in pairs { + for (scripts, pair) in self.partition_by_script(pair) { + kerning_per_script + .entry(scripts) + .or_insert(Vec::new()) + .push(pair); + } + } + + kerning_per_script = merge_scripts(kerning_per_script); + for scripts in kerning_per_script.keys().filter(|x| x.len() > 1) { + log::info!("merging kerning lookups for {scripts:?}"); + } + + kerning_per_script + } + + // + fn partition_by_script<'b>( + &self, + pair: &'b PairPosEntry, + ) -> impl Iterator, PairPosEntry)> + 'b { + //TODO: we could definitely make a reusable context and save all this + //reallocation + let mut resolved_scripts = HashMap::new(); + let mut side1_directions = HashMap::new(); + let mut side2_directions = HashMap::new(); + for glyph in pair.first_glyphs() { + let mut scripts = self.glyph_scripts.get(&glyph).unwrap_or(&self.dflt_scripts); + if !scripts.is_disjoint(&self.dflt_scripts) { + // if has any dflt script, just treat it as common + scripts = &self.common_scripts; + } + resolved_scripts.insert(glyph, scripts.to_owned()); + for direction in scripts.iter().map(ScriptDirection::for_script) { + side1_directions + .entry(direction) + .or_insert(HashSet::new()) + .insert(glyph); + } + } + + for glyph in pair.second_glyphs() { + let mut scripts = self.glyph_scripts.get(&glyph).unwrap_or(&self.dflt_scripts); + if !scripts.is_disjoint(&self.dflt_scripts) { + scripts = &self.common_scripts; + } + resolved_scripts.insert(glyph, scripts.to_owned()); + for direction in scripts.iter().map(ScriptDirection::for_script) { + side2_directions + .entry(direction) + .or_insert(HashSet::new()) + .insert(glyph); + } + } + + let mut product = side1_directions.into_iter().flat_map(move |s1d| { + side2_directions + .clone() + .into_iter() + .map(move |s2d| (s1d.clone(), s2d)) + }); + + std::iter::from_fn(move || loop { + // we need to loop here so that we can skip some items + + let ((side1_dir, side1_glyphs), (side2_dir, side2_glyphs)) = product.next()?; + + let side1: GlyphSet = side1_glyphs.iter().copied().collect(); + let side1_scripts = side1 + .iter() + .flat_map(|gid| resolved_scripts.get(&gid).unwrap().iter().copied()) + .collect::>(); + let side2: GlyphSet = side2_glyphs.iter().copied().collect(); + let side2_scripts = side2 + .iter() + .flat_map(|gid| resolved_scripts.get(&gid).unwrap().iter().copied()) + .collect::>(); + + if !side1_dir.plays_nicely_with(&side2_dir) { + log::warn!("skipping kerning pair {side1:?}/{side2:?} with mixed direction {side1_dir:?}/{side2_dir:?}"); + continue; + } + + let mut scripts = side1_scripts + .iter() + .copied() + .chain(side2_scripts.iter().copied()) + .collect::>(); + if ![side1_scripts, side2_scripts] + .iter() + .all(|x| x.contains(&COMMON_SCRIPT)) + { + scripts.remove(&COMMON_SCRIPT); + } + return Some((scripts, pair.with_new_glyphs(side1, side2))); + }) + } + + /// returns vecs of base and mark pairs. + /// + /// Where possible the input items will be reused, but if a given entry contains + /// mixed items we will need to allocate new entries. + /// + /// see + /// + fn split_base_and_mark_pairs<'b>( + &self, + pairs: &[&'b PairPosEntry], + ) -> (Vec>, Vec>) { + enum GlyphSetContent { + Empty, + BasesOnly, + MarksOnly, + Mixed, + } + + // little helper to tell us what's in a glyphset + fn classify_glyphset_contents( + glyph_set: &GlyphSet, + marks: &HashSet, + ) -> GlyphSetContent { + glyph_set.iter().fold(GlyphSetContent::Empty, |val, gid| { + match (val, marks.contains(&gid)) { + (GlyphSetContent::Empty, true) => GlyphSetContent::MarksOnly, + (GlyphSetContent::Empty, false) => GlyphSetContent::BasesOnly, + (GlyphSetContent::MarksOnly, true) => GlyphSetContent::MarksOnly, + (GlyphSetContent::BasesOnly, false) => GlyphSetContent::BasesOnly, + _ => GlyphSetContent::Mixed, + } + }) + } + + /// split a glyphset into (bases, marks) + fn split_glyphs(glyphs: &GlyphSet, marks: &HashSet) -> (GlyphSet, GlyphSet) { + let (x, y): (Vec<_>, Vec<_>) = glyphs.iter().partition(|gid| !marks.contains(gid)); + (x.into(), y.into()) + } + + let marks = self.mark_glyph_ids(); + if marks.is_empty() { + return ( + pairs.iter().map(|x| Cow::Borrowed(*x)).collect(), + Vec::new(), + ); + } + + let (mut base_pairs, mut mark_pairs) = (Vec::new(), Vec::new()); + + for pair in pairs { + match pair { + PairPosEntry::Pair(side1, _, side2, _) + if !marks.contains(side1) && !marks.contains(side2) => + { + base_pairs.push(Cow::Borrowed(*pair)) + } + PairPosEntry::Pair(..) => mark_pairs.push(Cow::Borrowed(*pair)), + + // handle the case where all are marks or bases, first: + PairPosEntry::Class(side1, val1, side2, val2) => { + let side1_cls = classify_glyphset_contents(side1, &marks); + let side2_cls = classify_glyphset_contents(side2, &marks); + + match (side1_cls, side2_cls) { + (GlyphSetContent::Empty, _) | (_, GlyphSetContent::Empty) => continue, + (GlyphSetContent::BasesOnly, GlyphSetContent::BasesOnly) => { + base_pairs.push(Cow::Borrowed(*pair)); + continue; + } + (GlyphSetContent::BasesOnly, GlyphSetContent::MarksOnly) + | (GlyphSetContent::MarksOnly, GlyphSetContent::BasesOnly) + | (GlyphSetContent::MarksOnly, GlyphSetContent::MarksOnly) => { + mark_pairs.push(Cow::Borrowed(*pair)); + continue; + } + _ => { + let (side1_bases, side1_marks) = split_glyphs(side1, &marks); + let (side2_bases, side2_marks) = split_glyphs(side2, &marks); + + if !side1_bases.is_empty() && !side2_bases.is_empty() { + base_pairs.push(Cow::Owned(PairPosEntry::Class( + side1_bases.clone(), + val1.clone(), + side2_bases.clone(), + val2.clone(), + ))); + } + // these various combos all go in the marks group + for (side1, side2) in [ + (&side1_bases, &side2_marks), + (&side1_marks, &side2_bases), + (&side1_marks, &side2_marks), + ] { + if !side1.is_empty() && !side2.is_empty() { + base_pairs.push(Cow::Owned(PairPosEntry::Class( + side1.clone(), + val1.clone(), + side2.clone(), + val2.clone(), + ))); + } + } + } + } + } + } + } + (base_pairs, mark_pairs) + } +} + +// +fn guess_font_scripts(ast: &ParseTree, glyphs: &impl CharMap) -> HashSet { + let mut scripts: HashSet<_> = glyphs + .iter_glyphs() + .filter_map(|(_, codepoint)| { + let mut scripts = super::properties::unicode_script_extensions(codepoint); + // only if a codepoint has a single script do know it is supported + match (scripts.next(), scripts.next()) { + (Some(script), None) => Some(script), + _ => None, + } + }) + .collect(); + + // add scripts explicitly defined in fea + scripts.extend(get_script_language_systems(ast).keys().cloned()); + scripts +} + +// +/// returns a map of unicode script names to (ot_script, `[ot_lang]`) +fn get_script_language_systems(ast: &ParseTree) -> HashMap)>> { + let mut languages_by_script = HashMap::new(); + for langsys in ast + .typed_root() + .statements() + .filter_map(LanguageSystem::cast) + { + languages_by_script + .entry(langsys.script().to_raw()) + .or_insert(Vec::new()) + .push(langsys.language().to_raw()) + } + + let mut unic_script_to_languages = HashMap::new(); + for (ot_script, langs) in languages_by_script { + let Some(unicode_script) = super::properties::ot_tag_to_script(ot_script) else { + log::info!("no unicode script for OT script tag {ot_script}"); + continue; + }; + unic_script_to_languages + .entry(unicode_script) + .or_insert(Vec::new()) + .push((ot_script, langs)); + } + + unic_script_to_languages +} + +// +fn merge_scripts( + kerning_per_script: HashMap, Vec>, +) -> HashMap, Vec> { + let mut sets = kerning_per_script.keys().cloned().collect::>(); + let mut buf = Vec::with_capacity(sets.len()); + let mut did_merge = true; + + while did_merge { + did_merge = false; + while let Some(mut common) = sets.pop() { + sets.retain(|scripts| { + if scripts.is_disjoint(&common) { + true + } else { + common.extend(scripts.iter().copied()); + did_merge = true; + false + } + }); + buf.push(common); + } + // all items from sets were moved into buf; now move them back + std::mem::swap(&mut sets, &mut buf); + assert!(buf.is_empty()); + } + + let mut result = sets + .iter() + .map(|set| (set.clone(), Vec::new())) + .collect::>(); + + for (scripts, pairs) in kerning_per_script { + let merged_script = sets + .iter() + .find(|merged| !merged.is_disjoint(&scripts)) + .unwrap(); + result.get_mut(merged_script).unwrap().extend(pairs); + } + + // sort all the pairs; fonttools does that after returning but here works? + result.values_mut().for_each(|pairs| pairs.sort_unstable()); + result } diff --git a/fontbe/src/features/properties.rs b/fontbe/src/features/properties.rs index 883959e6d..3dbc9346f 100644 --- a/fontbe/src/features/properties.rs +++ b/fontbe/src/features/properties.rs @@ -1,3 +1,4 @@ +//! Properties and constants related to unicode data use std::{ collections::{HashMap, HashSet}, hash::Hash, @@ -11,12 +12,75 @@ use write_fonts::{ types::{GlyphId, Tag}, }; +// SAFETY: we can easily verify that neither of these strings contains a nul byte. +// (this is the only way we can declare these as constants) +pub const COMMON_SCRIPT: UnicodeShortName = + unsafe { UnicodeShortName::from_bytes_unchecked(*b"Zyyy") }; +pub const INHERITED_SCRIPT: UnicodeShortName = + unsafe { UnicodeShortName::from_bytes_unchecked(*b"Zinh") }; + static SCRIPT_DATA: ScriptWithExtensionsBorrowed<'static> = icu_properties::script::script_with_extensions(); /// The type used by icu4x for script names pub type UnicodeShortName = tinystr::TinyAsciiStr<4>; +/// The writing direction of a script +#[derive(Clone, Debug, Copy, Hash, PartialEq, Eq)] +pub enum ScriptDirection { + /// any direction, for the 'common' script + Auto, + LeftToRight, + RightToLeft, +} + +/// A trait for mapping glyph ids to unicode values +/// +/// This lets us write functions that don't need to know the concrete types we're +/// using, which is an implementation detail. It also makes it easier for us to +/// write tests. +pub trait CharMap { + /// Iterate over all the defined (gid, unicode value) pairs. + /// + /// Note that a single glyph may appear multiple times, with different + /// unicode values. + fn iter_glyphs(&self) -> impl Iterator; +} + +impl CharMap for Vec<(Arc, GlyphId)> { + fn iter_glyphs(&self) -> impl Iterator { + self.iter() + .flat_map(|(glyph, gid)| glyph.codepoints.iter().map(|uv| (*gid, *uv))) + } +} + +impl ScriptDirection { + /// Returns the writing direction for the provided script + pub(crate) fn for_script(script: &UnicodeShortName) -> Self { + match script.as_str() { + "Zyyy" => ScriptDirection::Auto, + "Arab" | "Hebr" | "Syrc" | "Thaa" | "Cprt" | "Khar" | "Phnx" | "Nkoo" | "Lydi" + | "Avst" | "Armi" | "Phli" | "Prti" | "Sarb" | "Orkh" | "Samr" | "Mand" | "Merc" + | "Mero" | "Mani" | "Mend" | "Nbat" | "Narb" | "Palm" | "Phlp" | "Hatr" | "Hung" + | "Adlm" | "Rohg" | "Sogo" | "Sogd" | "Elym" | "Chrs" | "Yezi" | "Ougr" => { + ScriptDirection::RightToLeft + } + _ => ScriptDirection::LeftToRight, + } + } + + /// true if either side is auto, or both sides are equal + pub(crate) fn plays_nicely_with(&self, other: &ScriptDirection) -> bool { + matches!( + (self, other), + (ScriptDirection::Auto, _) + | (_, ScriptDirection::Auto) + | (ScriptDirection::LeftToRight, ScriptDirection::LeftToRight) + | (ScriptDirection::RightToLeft, ScriptDirection::RightToLeft) + ) + } +} + /// Iterate over the unicode scripts + extensions for the provided codepoint /// /// This returns the scripts as shortnames, because that's what python does. @@ -38,14 +102,22 @@ pub(crate) fn unicode_script_extensions( }) } -fn unicode_bidi_type(c: u32) -> BidiClass { - icu_properties::maps::bidi_class().get32(c) +// +/// returns none for neutral characters +fn unicode_bidi_type(c: u32) -> Option { + match icu_properties::maps::bidi_class().get32(c) { + BidiClass::RightToLeft | BidiClass::ArabicLetter => Some(BidiClass::RightToLeft), + BidiClass::LeftToRight | BidiClass::ArabicNumber | BidiClass::EuropeanNumber => { + Some(BidiClass::LeftToRight) + } + _ => None, + } } // equivalent to the 'classify' method in ufo2ft: // -fn classify( - glyphs: &[(Arc, GlyphId)], +fn classify( + char_map: &CM, mut props_fn: F, gsub: Option<&Gsub>, ) -> Result>, ReadError> @@ -53,16 +125,11 @@ where T: Hash + Eq, I: Iterator, F: FnMut(u32) -> I, + CM: CharMap, { let mut sets = HashMap::new(); let mut neutral_glyphs = HashSet::new(); - for (gid, unicode_value) in glyphs.iter().flat_map(|(glyph, gid)| { - glyph - .codepoints - .iter() - .copied() - .map(|codepoint| (*gid, codepoint)) - }) { + for (gid, unicode_value) in char_map.iter_glyphs() { let mut has_props = false; for prop in props_fn(unicode_value) { sets.entry(prop).or_insert(HashSet::new()).insert(gid); @@ -87,13 +154,33 @@ where Ok(sets) } -/// Returns a map of gids their scripts +/// Returns a map of gids to their scripts pub(crate) fn scripts_by_glyph( - glyphs: &[(Arc, GlyphId)], + glyphs: &impl CharMap, + known_scripts: &HashSet, gsub: Option<&Gsub>, ) -> Result>, ReadError> { - let mut result = HashMap::with_capacity(glyphs.len()); - for (script, glyphs) in classify(glyphs, |cp| unicode_script_extensions(cp), gsub)? { + let mut result = HashMap::new(); + for (script, glyphs) in classify( + glyphs, + |cp| { + // we need to write this in such a way as to return a single concrete type; + // this is basically two branches, one or the other option is always `None` + let common = known_scripts.is_empty().then_some(COMMON_SCRIPT); + let other_branch = if known_scripts.is_empty() { + None + } else { + Some(unicode_script_extensions(cp).filter(|script| { + *script == COMMON_SCRIPT + || *script == INHERITED_SCRIPT + || known_scripts.contains(script) + })) + }; + + common.into_iter().chain(other_branch.into_iter().flatten()) + }, + gsub, + )? { for glyph in glyphs { result.entry(glyph).or_insert(HashSet::new()).insert(script); } @@ -103,22 +190,37 @@ pub(crate) fn scripts_by_glyph( /// A map of bidi class to glyphs in that class. pub(crate) fn glyphs_by_bidi_class( - glyphs: &[(Arc, GlyphId)], + glyphs: &impl CharMap, gsub: Option<&Gsub>, ) -> Result>, ReadError> { classify( glyphs, - |codepoint| Some(unicode_bidi_type(codepoint)).into_iter(), + |codepoint| unicode_bidi_type(codepoint).into_iter(), gsub, ) } + +const DFLT_SCRIPT: Tag = Tag::new(b"DFLT"); + static SCRIPT_ALIASES: &[(Tag, Tag)] = &[(Tag::new(b"jamo"), Tag::new(b"hang"))]; +static SCRIPT_EXCEPTIONS: &[(&str, Tag)] = &[ + ("Hira", Tag::new(b"kana")), + ("Hrkt", Tag::new(b"kana")), + ("Laoo", Tag::new(b"lao ")), + ("Nkoo", Tag::new(b"nko ")), + ("Vaii", Tag::new(b"vai ")), + ("Yiii", Tag::new(b"yi ")), + ("Zinh", DFLT_SCRIPT), + ("Zmth", Tag::new(b"math")), + ("Zyyy", DFLT_SCRIPT), + ("Zzzz", DFLT_SCRIPT), +]; //TODO: python maps 'math' to 'Zmth' but 'Mathematical Notation' //is not a variant on icu4x's Script? But this is included in //some datafiles so might be an oversight? But it's actually not listed //in the official list of scripts, so idk? -static SCRIPT_EXCEPTIONS: &[(Tag, Script)] = &[]; +static SCRIPT_EXCEPTIONS_REVERSED: &[(Tag, Script)] = &[]; // I don't know what's going on here, just copying OTTags.py static NEW_SCRIPTS: &[(Tag, Script)] = &[ @@ -134,6 +236,122 @@ static NEW_SCRIPTS: &[(Tag, Script)] = &[ (Tag::new(b"tml2"), Script::Tamil), ]; +// I don't know what's going on here, just copying OTTags.py +static NEW_SCRIPT_TAGS: &[(&str, Tag)] = &[ + ("Beng", Tag::new(b"bng2")), + ("Deva", Tag::new(b"dev2")), + ("Gujr", Tag::new(b"gjr2")), + ("Guru", Tag::new(b"gur2")), + ("Kana", Tag::new(b"knd2")), + ("Mlym", Tag::new(b"mlm2")), + ("Mymr", Tag::new(b"mym2")), + ("Orya", Tag::new(b"ory2")), + ("Taml", Tag::new(b"tml2")), + ("Telu", Tag::new(b"tel2")), +]; + +static INDIC_SCRIPTS: &[&str] = &[ + "Beng", // Bengali + "Deva", // Devanagari + "Gujr", // Gujarati + "Guru", // Gurmukhi + "Knda", // Kannada + "Mlym", // Malayalam + "Orya", // Oriya + "Sinh", // Sinhala + "Taml", // Tamil + "Telu", // Telugu +]; + +static USE_SCRIPTS: &[&str] = &[ + // Correct as at Unicode 15.0 + "Adlm", // Adlam + "Ahom", // Ahom + "Bali", // Balinese + "Batk", // Batak + "Brah", // Brahmi + "Bugi", // Buginese + "Buhd", // Buhid + "Cakm", // Chakma + "Cham", // Cham + "Chrs", // Chorasmian + "Cpmn", // Cypro Minoan + "Diak", // Dives Akuru + "Dogr", // Dogra + "Dupl", // Duployan + "Egyp", // Egyptian Hieroglyphs + "Elym", // Elymaic + "Gong", // Gunjala Gondi + "Gonm", // Masaram Gondi + "Gran", // Grantha + "Hano", // Hanunoo + "Hmng", // Pahawh Hmong + "Hmnp", // Nyiakeng Puachue Hmong + "Java", // Javanese + "Kali", // Kayah Li + "Kawi", // Kawi + "Khar", // Kharosthi + "Khoj", // Khojki + "Kits", // Khitan Small Script + "Kthi", // Kaithi + "Lana", // Tai Tham + "Lepc", // Lepcha + "Limb", // Limbu + "Mahj", // Mahajani + "Maka", // Makasar + "Mand", // Mandaic + "Mani", // Manichaean + "Marc", // Marchen + "Medf", // Medefaidrin + "Modi", // Modi + "Mong", // Mongolian + "Mtei", // Meetei Mayek + "Mult", // Multani + "Nagm", // Nag Mundari + "Nand", // Nandinagari + "Newa", // Newa + "Nhks", // Bhaiksuki + "Nko ", // Nko + "Ougr", // Old Uyghur + "Phag", // Phags Pa + "Phlp", // Psalter Pahlavi + "Plrd", // Miao + "Rjng", // Rejang + "Rohg", // Hanifi Rohingya + "Saur", // Saurashtra + "Shrd", // Sharada + "Sidd", // Siddham + "Sind", // Khudawadi + "Sogd", // Sogdian + "Sogo", // Old Sogdian + "Soyo", // Soyombo + "Sund", // Sundanese + "Sylo", // Syloti Nagri + "Tagb", // Tagbanwa + "Takr", // Takri + "Tale", // Tai Le + "Tavt", // Tai Viet + "Tfng", // Tifinagh + "Tglg", // Tagalog + "Tibt", // Tibetan + "Tirh", // Tirhuta + "Tnsa", // Tangsa + "Toto", // Toto + "Vith", // Vithkuqi + "Wcho", // Wancho + "Yezi", // Yezidi + "Zanb", // Zanabazar Square +]; + +pub(crate) fn dist_feature_enabled_scripts() -> HashSet { + INDIC_SCRIPTS + .iter() + .chain(USE_SCRIPTS) + .chain(["Khmr", "Mymr"].iter()) + .map(|s| UnicodeShortName::from_str(s).unwrap()) + .collect() +} + // a little helper trait to handle binary searching an array of 2-tuples where // the first item is a key and the second a value trait BinarySearchExact { @@ -142,7 +360,7 @@ trait BinarySearchExact { impl BinarySearchExact for &[(T, U)] { fn binary_search_exact(&self, needle: &T) -> Option { - self.binary_search_by(|probe| probe.0.cmp(&needle)) + self.binary_search_by(|probe| probe.0.cmp(needle)) .ok() .map(|idx| &self[idx].1) .cloned() @@ -162,7 +380,7 @@ pub(crate) fn ot_tag_to_script(script_tag: Tag) -> Option { .binary_search_exact(&script_tag) .unwrap_or(script_tag); - if let Some(exception) = SCRIPT_EXCEPTIONS + if let Some(exception) = SCRIPT_EXCEPTIONS_REVERSED .binary_search_exact(&tag) .or_else(|| NEW_SCRIPTS.binary_search_exact(&tag)) { @@ -194,6 +412,21 @@ fn ot_tag_to_unicode_short_name(tag: Tag) -> UnicodeShortName { UnicodeShortName::try_from_raw(out).expect("cannot fail, as tag cannot have leading nul byte") } +/// a script can correspond to one or two tags, because +pub(crate) fn script_to_ot_tags(script: &UnicodeShortName) -> impl Iterator { + let mut out = [None, None]; + if let Some(tag) = SCRIPT_EXCEPTIONS.binary_search_exact(&script.as_str()) { + out[0] = Some(tag); + } else if Script::name_to_enum_mapper().get_strict(script).is_none() { + out[0] = Some(DFLT_SCRIPT); + } else { + out[0] = NEW_SCRIPT_TAGS.binary_search_exact(&script.as_str()); + out[1] = Some(Tag::new(script.to_owned().to_ascii_lowercase().all_bytes())); + } + + out.into_iter().flatten() +} + #[cfg(test)] mod tests { use super::*; @@ -213,10 +446,14 @@ mod tests { let (actual, expected) = get_original_and_sorted_items(SCRIPT_ALIASES); assert_eq!(actual, expected); - let (actual, expected) = get_original_and_sorted_items(SCRIPT_EXCEPTIONS); + let (actual, expected) = get_original_and_sorted_items(SCRIPT_EXCEPTIONS_REVERSED); assert_eq!(actual, expected); let (actual, expected) = get_original_and_sorted_items(NEW_SCRIPTS); assert_eq!(actual, expected); + let (actual, expected) = get_original_and_sorted_items(NEW_SCRIPT_TAGS); + assert_eq!(actual, expected); + let (actual, expected) = get_original_and_sorted_items(SCRIPT_EXCEPTIONS); + assert_eq!(actual, expected); } #[test] diff --git a/fontbe/src/orchestration.rs b/fontbe/src/orchestration.rs index 6d52cec71..da189842f 100644 --- a/fontbe/src/orchestration.rs +++ b/fontbe/src/orchestration.rs @@ -1,7 +1,8 @@ //! Helps coordinate the graph execution for BE use std::{ - collections::BTreeMap, + collections::{BTreeMap, HashMap, HashSet}, + fmt::Display, fs::File, io::{self, BufReader, BufWriter, Read, Write}, path::{Path, PathBuf}, @@ -10,7 +11,8 @@ use std::{ use fea_rs::{ compile::{ - MarkToBaseBuilder, MarkToMarkBuilder, PairPosBuilder, ValueRecord as ValueRecordBuilder, + FeatureKey, MarkToBaseBuilder, MarkToMarkBuilder, PairPosBuilder, + ValueRecord as ValueRecordBuilder, }, GlyphMap, GlyphSet, ParseTree, }; @@ -372,7 +374,10 @@ impl Persistable for FeaRsMarks { /// The aggregation of all [KernFragment]s. #[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq)] pub struct FeaRsKerns { - pub lookups: Vec, + /// ordered! + pub lookups: Vec>, + /// each value is a set of lookups, referenced by their order in array above + pub features: HashMap>, } /// The abstract syntax tree of any user FEA. @@ -437,13 +442,25 @@ impl Persistable for AllKerningPairs { } } -#[derive(Clone, Serialize, Deserialize, PartialEq)] +#[derive(Clone, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Ord)] pub enum PairPosEntry { Pair(GlyphId, ValueRecordBuilder, GlyphId, ValueRecordBuilder), Class(GlyphSet, ValueRecordBuilder, GlyphSet, ValueRecordBuilder), } impl PairPosEntry { + /// if a rule is right-to-left, we need to set both x-advance AND x-position + /// + /// see + /// for further details. The tl;dr is that the lookup itself does not have + /// any knowledge of writing direction. + pub(crate) fn make_rtl_compatible(&mut self) { + let record = match self { + PairPosEntry::Pair(_, record, _, _) => record, + PairPosEntry::Class(_, record, _, _) => record, + }; + record.make_rtl_compatible(); + } pub(crate) fn add_to(self, builder: &mut PairPosBuilder) { match self { PairPosEntry::Pair(gid0, rec0, gid1, rec1) => { @@ -454,6 +471,95 @@ impl PairPosEntry { } } } + + /// Returns true if this entry has no glyphs in common with the provided set + pub(crate) fn glyphs_are_disjoint(&self, glyphs: &HashSet) -> bool { + self.first_glyphs() + .chain(self.second_glyphs()) + .all(|gid| !glyphs.contains(&gid)) + } + + /// a helper used when splitting kerns based on script direction + pub(crate) fn with_new_glyphs(&self, side1: GlyphSet, side2: GlyphSet) -> Self { + let (val1, val2) = match self { + PairPosEntry::Pair(_, val1, _, val2) => (val1, val2), + PairPosEntry::Class(_, val1, _, val2) => (val1, val2), + }; + + if side1.len() == 1 && side2.len() == 1 { + Self::Pair( + side1.iter().next().unwrap(), + val1.clone(), + side2.iter().next().unwrap(), + val2.clone(), + ) + } else { + Self::Class(side1, val1.clone(), side2, val2.clone()) + } + } + + pub(crate) fn first_glyphs(&self) -> impl Iterator + '_ { + let (first, second) = match self { + PairPosEntry::Pair(gid0, ..) => (Some(*gid0), None), + PairPosEntry::Class(set0, ..) => (None, Some(set0)), + }; + + first + .into_iter() + .chain(second.into_iter().flat_map(|set| set.iter())) + } + + pub(crate) fn second_glyphs(&self) -> impl Iterator + '_ { + let (first, second) = match self { + PairPosEntry::Pair(_, _, gid1, _) => (Some(*gid1), None), + PairPosEntry::Class(_, _, set1, _) => (None, Some(set1)), + }; + + first + .into_iter() + .chain(second.into_iter().flat_map(|set| set.iter())) + } + + /// A helper for pretty-printing the rule's glyph or glyphs + pub(crate) fn display_glyphs(&self) -> impl Display + '_ { + enum DisplayGlyphs<'a> { + Pair(GlyphId, GlyphId), + Class(&'a GlyphSet, &'a GlyphSet), + } + + impl Display for DisplayGlyphs<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + DisplayGlyphs::Pair(one, two) => write!(f, "{one}/{two}"), + DisplayGlyphs::Class(one, two) => { + let mut first = true; + for gid in one.iter() { + if !first { + write!(f, ", ")?; + } + write!(f, "{gid}")?; + first = false; + } + write!(f, "/")?; + first = true; + for gid in two.iter() { + if !first { + write!(f, ", ")?; + } + write!(f, "{gid}")?; + first = false; + } + Ok(()) + } + } + } + } + + match self { + PairPosEntry::Pair(one, _, two, _) => DisplayGlyphs::Pair(*one, *two), + PairPosEntry::Class(one, _, two, _) => DisplayGlyphs::Class(one, two), + } + } } /// A chunk of kerning that needs to be fed into a [PairPosBuilder] diff --git a/fontc/src/workload.rs b/fontc/src/workload.rs index 7696d97b7..069edd2c7 100644 --- a/fontc/src/workload.rs +++ b/fontc/src/workload.rs @@ -318,6 +318,9 @@ impl<'a> Workload<'a> { .expect("Gather BE Kerning has to be pending") .read_access = AccessBuilder::::new() .variant(BeWorkIdentifier::KernFragment(0)) + .variant(BeWorkIdentifier::FeaturesAst) + .variant(FeWorkIdentifier::Glyph(GlyphName::NOTDEF)) + .variant(FeWorkIdentifier::StaticMetadata) .build() .into(); } From ce7122e9b830a0072e4f5c33f022e7528a0f1e7c Mon Sep 17 00:00:00 2001 From: Colin Rofls Date: Tue, 12 Mar 2024 18:09:35 -0400 Subject: [PATCH 4/6] [kerning] Add a simple test case for split-by-script --- fontbe/src/features/kern.rs | 85 ++++++++++++++++++++++++++++++++++--- 1 file changed, 79 insertions(+), 6 deletions(-) diff --git a/fontbe/src/features/kern.rs b/fontbe/src/features/kern.rs index bc4ca0fa8..2eadf7d92 100644 --- a/fontbe/src/features/kern.rs +++ b/fontbe/src/features/kern.rs @@ -933,7 +933,15 @@ impl KernSplitContext { // fn guess_font_scripts(ast: &ParseTree, glyphs: &impl CharMap) -> HashSet { - let mut scripts: HashSet<_> = glyphs + let mut scripts = scripts_for_chars(glyphs); + // add scripts explicitly defined in fea + scripts.extend(get_script_language_systems(ast).keys().cloned()); + scripts +} + +/// return the set of scripts (based on unicode data) that use this set of glyphs +fn scripts_for_chars(glyphs: &impl CharMap) -> HashSet { + glyphs .iter_glyphs() .filter_map(|(_, codepoint)| { let mut scripts = super::properties::unicode_script_extensions(codepoint); @@ -943,11 +951,7 @@ fn guess_font_scripts(ast: &ParseTree, glyphs: &impl CharMap) -> HashSet None, } }) - .collect(); - - // add scripts explicitly defined in fea - scripts.extend(get_script_language_systems(ast).keys().cloned()); - scripts + .collect() } // @@ -1024,3 +1028,72 @@ fn merge_scripts( result.values_mut().for_each(|pairs| pairs.sort_unstable()); result } + +#[cfg(test)] +mod tests { + use super::*; + + struct MockCharMap(HashMap); + + impl CharMap for MockCharMap { + fn iter_glyphs(&self) -> impl Iterator { + self.0.iter().map(|(uv, gid)| (*gid, *uv as u32)) + } + } + + impl MockCharMap { + fn make_rule(&self, left: char, right: char, val: i16) -> PairPosEntry { + let left = self.0.get(&left).unwrap(); + let right = self.0.get(&right).unwrap(); + PairPosEntry::Pair( + *left, + ValueRecordBuilder::new().with_x_advance(val), + *right, + ValueRecordBuilder::new(), + ) + } + } + + impl FromIterator for MockCharMap { + fn from_iter>(iter: T) -> Self { + Self( + iter.into_iter() + .enumerate() + .map(|(i, c)| (c, GlyphId::new(i as u16 + 1))) + .collect(), + ) + } + } + + #[test] + fn split_latin_and_cyrillic() { + const A_CY: char = 'а'; + const BE_CY: char = 'б'; + let charmap: MockCharMap = ['a', 'b', A_CY, BE_CY].into_iter().collect(); + let known_scripts = scripts_for_chars(&charmap); + let pairs = [('a', 'b', 5i16), ('a', 'a', 7), (A_CY, BE_CY, 12)] + .into_iter() + .map(|(a, b, val)| charmap.make_rule(a, b, val)) + .collect::>(); + let ctx = KernSplitContext::new(&charmap, &known_scripts, None, None).unwrap(); + + let pairs_ref = pairs.iter().collect::>(); + + let result = ctx.make_lookups(&pairs_ref); + assert_eq!(result.len(), 2); + + let cyrillic: BTreeSet<_> = [UnicodeShortName::from_str("Cyrl").unwrap()] + .into_iter() + .collect(); + + let latn: BTreeSet<_> = [UnicodeShortName::from_str("Latn").unwrap()] + .into_iter() + .collect(); + + let cyr_rules = result.get(&cyrillic).unwrap(); + assert_eq!(cyr_rules.iter().map(|x| x.len()).sum::(), 1); + + let latn_rules = result.get(&latn).unwrap(); + assert_eq!(latn_rules.iter().map(|x| x.len()).sum::(), 2); + } +} From d4160af720c3f02f5bca5d44e059ebb62f516643 Mon Sep 17 00:00:00 2001 From: Colin Rofls Date: Wed, 13 Mar 2024 17:04:43 -0400 Subject: [PATCH 5/6] [kerning] Support math script --- fontbe/src/features/properties.rs | 34 +++++++++++++++---------------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/fontbe/src/features/properties.rs b/fontbe/src/features/properties.rs index 3dbc9346f..db06611ec 100644 --- a/fontbe/src/features/properties.rs +++ b/fontbe/src/features/properties.rs @@ -216,24 +216,23 @@ static SCRIPT_EXCEPTIONS: &[(&str, Tag)] = &[ ("Zyyy", DFLT_SCRIPT), ("Zzzz", DFLT_SCRIPT), ]; -//TODO: python maps 'math' to 'Zmth' but 'Mathematical Notation' -//is not a variant on icu4x's Script? But this is included in -//some datafiles so might be an oversight? But it's actually not listed -//in the official list of scripts, so idk? -static SCRIPT_EXCEPTIONS_REVERSED: &[(Tag, Script)] = &[]; + +// 'math' is used as a script in opentype features: +// +static SCRIPT_EXCEPTIONS_REVERSED: &[(Tag, &str)] = &[(Tag::new(b"math"), "Zmth")]; // I don't know what's going on here, just copying OTTags.py -static NEW_SCRIPTS: &[(Tag, Script)] = &[ - (Tag::new(b"bng2"), Script::Bengali), - (Tag::new(b"dev2"), Script::Devanagari), - (Tag::new(b"gjr2"), Script::Gujarati), - (Tag::new(b"gur2"), Script::Gurmukhi), - (Tag::new(b"knd2"), Script::Kannada), - (Tag::new(b"mlm2"), Script::Malayalam), - (Tag::new(b"mym2"), Script::Myanmar), - (Tag::new(b"ory2"), Script::Oriya), - (Tag::new(b"tel2"), Script::Telugu), - (Tag::new(b"tml2"), Script::Tamil), +static NEW_SCRIPTS: &[(Tag, &str)] = &[ + (Tag::new(b"bng2"), "Beng"), + (Tag::new(b"dev2"), "Deva"), + (Tag::new(b"gjr2"), "Gujr"), + (Tag::new(b"gur2"), "Guru"), + (Tag::new(b"knd2"), "Knda"), + (Tag::new(b"mlm2"), "Mlym"), + (Tag::new(b"mym2"), "Mymr"), + (Tag::new(b"ory2"), "Orya"), + (Tag::new(b"tel2"), "Telu"), + (Tag::new(b"tml2"), "Taml"), ]; // I don't know what's going on here, just copying OTTags.py @@ -384,8 +383,7 @@ pub(crate) fn ot_tag_to_script(script_tag: Tag) -> Option { .binary_search_exact(&tag) .or_else(|| NEW_SCRIPTS.binary_search_exact(&tag)) { - let mapping = Script::enum_to_short_name_mapper(); - return mapping.get(exception); + return Some(UnicodeShortName::from_str(exception).unwrap()); } // finally, algorithmic conversion From e1e743d078c879c17bf1455351fca0af71cb7892 Mon Sep 17 00:00:00 2001 From: Colin Rofls Date: Thu, 14 Mar 2024 12:26:52 -0400 Subject: [PATCH 6/6] [kerning] cleanup and code review --- Cargo.toml | 2 + fontbe/src/features.rs | 1 + fontbe/src/features/kern.rs | 43 ++++---- fontbe/src/features/ot_tags.rs | 176 ++++++++++++++++++++++++++++++ fontbe/src/features/properties.rs | 152 ++------------------------ fontbe/src/orchestration.rs | 4 +- 6 files changed, 212 insertions(+), 166 deletions(-) create mode 100644 fontbe/src/features/ot_tags.rs diff --git a/Cargo.toml b/Cargo.toml index 40855cc9a..3942aea41 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -51,6 +51,8 @@ members = [ "layout-normalizer", ] +# remove this when icu_properties 1.5 is released: +# https://github.com/unicode-org/icu4x/issues?q=is%3Aopen+is%3Aissue+milestone%3A%221.5+Blocking+⟨P1⟩%22 [patch.crates-io] icu_properties = { version = "1.4", git = "https://github.com/unicode-org/icu4x.git", rev = "728eb44" } tinystr = { version = "0.7.5", git = "https://github.com/unicode-org/icu4x.git", rev = "728eb44" } diff --git a/fontbe/src/features.rs b/fontbe/src/features.rs index 8a674b512..7a41dda97 100644 --- a/fontbe/src/features.rs +++ b/fontbe/src/features.rs @@ -41,6 +41,7 @@ use crate::{ mod kern; mod marks; +mod ot_tags; mod properties; pub use kern::{create_gather_ir_kerning_work, create_kern_segment_work, create_kerns_work}; diff --git a/fontbe/src/features/kern.rs b/fontbe/src/features/kern.rs index 2eadf7d92..d58f44536 100644 --- a/fontbe/src/features/kern.rs +++ b/fontbe/src/features/kern.rs @@ -443,8 +443,6 @@ impl KerningGatherWork { let gdef = compilation.gdef_classes; - // serves as a standin for cmap in the fonttools impl - let pairs = fragments .iter() .flat_map(|frag| frag.kerns.iter()) @@ -460,22 +458,24 @@ impl KerningGatherWork { /// returns a vec of lookups (as a vec of subtables), along with a map of features -> lookups /// (by order in the first vec) + /// + /// this based on + /// fn assign_lookups_to_scripts( &self, - lookups: HashMap, Vec>, + lookups: BTreeMap, Vec>, ast: &ParseTree, // one of 'kern' or 'dist' current_feature: Tag, - ) -> (Vec>, HashMap>) { + ) -> (Vec>, BTreeMap>) { let dflt_langs = vec![DFLT_LANG]; let is_kern_feature = current_feature == KERN; assert!(is_kern_feature || current_feature == DIST); - // this is based on the _registerLookups fn, and this is where we should start typing - let mut lookups_by_script = HashMap::new(); + let mut lookups_by_script = BTreeMap::new(); let mut ordered_lookups = Vec::new(); - let fea_langs_by_script: HashMap<_, _> = get_script_language_systems(ast) + let fea_langs_by_script: BTreeMap<_, _> = get_script_language_systems(ast) .into_values() .flat_map(|x| x.into_iter()) .collect(); @@ -492,8 +492,6 @@ impl KerningGatherWork { } } - // now implement the logic from _registerLookups in KernFeatureWriter: - // let mut default_lookups = Vec::new(); if let Some(common_lookups) = lookups_by_script .get(&COMMON_SCRIPT) @@ -504,6 +502,7 @@ impl KerningGatherWork { } //inDesign bugfix: + // let dist_enabled_scripts = super::properties::dist_feature_enabled_scripts(); let (mut ltr_lookups, mut rtl_lookups) = (Vec::new(), Vec::new()); for (script, lookups) in lookups_by_script @@ -525,10 +524,9 @@ impl KerningGatherWork { } default_lookups.sort_unstable(); - let mut features = HashMap::new(); + let mut features = BTreeMap::new(); if !default_lookups.is_empty() { let languages = fea_langs_by_script.get(&DFLT_SCRIPT).unwrap_or(&dflt_langs); - log::debug!("DFLT languages: {languages:?}"); for lang in languages { features.insert( FeatureKey::new(KERN, *lang, DFLT_SCRIPT), @@ -552,11 +550,12 @@ impl KerningGatherWork { for (script, mut lookups) in lookups_by_script { lookups.extend(dflt_lookups.iter().copied()); + lookups.sort_unstable(); + lookups.dedup(); for tag in super::properties::script_to_ot_tags(&script) { let languages = fea_langs_by_script.get(&tag).unwrap_or(&dflt_langs); for lang in languages { - log::info!("added {} lookups for {tag}/{lang}", lookups.len()); features.insert(FeatureKey::new(KERN, *lang, tag), lookups.clone()); } } @@ -568,19 +567,19 @@ impl KerningGatherWork { } fn debug_ordered_lookups( - features: &HashMap>, + features: &BTreeMap>, lookups: &[Vec], ) { for (i, subtables) in lookups.iter().enumerate() { let total_rules = subtables.iter().map(|x| x.len()).sum::(); - log::debug!("lookup {i}, {total_rules} rules"); + log::trace!("lookup {i}, {total_rules} rules"); } let mut feature_keys = features.keys().collect::>(); feature_keys.sort(); for feature in feature_keys { let indices = features.get(feature).unwrap(); - log::debug!("feature {feature:?}, lookups {indices:?}"); + log::trace!("feature {feature:?}, lookups {indices:?}"); } } @@ -642,14 +641,14 @@ impl KernSplitContext { fn make_lookups( &self, pairs: &[&PairPosEntry], - ) -> HashMap, Vec> { + ) -> BTreeMap, Vec> { if !self.opts.ignore_marks { let pairs = pairs.iter().map(|x| Cow::Borrowed(*x)).collect::>(); return self.make_split_script_kern_lookups(&pairs, false); } let (base_pairs, mark_pairs) = self.split_base_and_mark_pairs(pairs); - let mut result = HashMap::new(); + let mut result = BTreeMap::new(); if !base_pairs.is_empty() { result = self.make_split_script_kern_lookups(&base_pairs, false); } @@ -675,8 +674,8 @@ impl KernSplitContext { pairs: &[Cow], //TODO: handle marks _are_marks: bool, - ) -> HashMap, Vec> { - let mut lookups_by_script = HashMap::new(); + ) -> BTreeMap, Vec> { + let mut lookups_by_script = BTreeMap::new(); let kerning_per_script = self.split_kerns(pairs); let mut bidi_buf = HashSet::new(); // we can reuse this for each pair for (scripts, pairs) in kerning_per_script { @@ -731,7 +730,7 @@ impl KernSplitContext { kerning_per_script = merge_scripts(kerning_per_script); for scripts in kerning_per_script.keys().filter(|x| x.len() > 1) { - log::info!("merging kerning lookups for {scripts:?}"); + log::debug!("merged kerning lookups for {scripts:?}"); } kerning_per_script @@ -972,7 +971,9 @@ fn get_script_language_systems(ast: &ParseTree) -> HashMap + +use write_fonts::types::Tag; + +pub(crate) const DFLT_SCRIPT: Tag = Tag::new(b"DFLT"); + +pub(crate) static SCRIPT_ALIASES: &[(Tag, Tag)] = &[(Tag::new(b"jamo"), Tag::new(b"hang"))]; + +pub(crate) static SCRIPT_EXCEPTIONS: &[(&str, Tag)] = &[ + ("Hira", Tag::new(b"kana")), + ("Hrkt", Tag::new(b"kana")), + ("Laoo", Tag::new(b"lao ")), + ("Nkoo", Tag::new(b"nko ")), + ("Vaii", Tag::new(b"vai ")), + ("Yiii", Tag::new(b"yi ")), + ("Zinh", DFLT_SCRIPT), + ("Zmth", Tag::new(b"math")), + ("Zyyy", DFLT_SCRIPT), + ("Zzzz", DFLT_SCRIPT), +]; + +// 'math' is used as a script in opentype features: +// +pub(crate) static SCRIPT_EXCEPTIONS_REVERSED: &[(Tag, &str)] = &[(Tag::new(b"math"), "Zmth")]; + +pub(crate) static NEW_SCRIPTS: &[(Tag, &str)] = &[ + (Tag::new(b"bng2"), "Beng"), + (Tag::new(b"dev2"), "Deva"), + (Tag::new(b"gjr2"), "Gujr"), + (Tag::new(b"gur2"), "Guru"), + (Tag::new(b"knd2"), "Knda"), + (Tag::new(b"mlm2"), "Mlym"), + (Tag::new(b"mym2"), "Mymr"), + (Tag::new(b"ory2"), "Orya"), + (Tag::new(b"tel2"), "Telu"), + (Tag::new(b"tml2"), "Taml"), +]; + +pub(crate) static NEW_SCRIPT_TAGS: &[(&str, Tag)] = &[ + ("Beng", Tag::new(b"bng2")), + ("Deva", Tag::new(b"dev2")), + ("Gujr", Tag::new(b"gjr2")), + ("Guru", Tag::new(b"gur2")), + ("Kana", Tag::new(b"knd2")), + ("Mlym", Tag::new(b"mlm2")), + ("Mymr", Tag::new(b"mym2")), + ("Orya", Tag::new(b"ory2")), + ("Taml", Tag::new(b"tml2")), + ("Telu", Tag::new(b"tel2")), +]; + +pub(crate) static INDIC_SCRIPTS: &[&str] = &[ + "Beng", // Bengali + "Deva", // Devanagari + "Gujr", // Gujarati + "Guru", // Gurmukhi + "Knda", // Kannada + "Mlym", // Malayalam + "Orya", // Oriya + "Sinh", // Sinhala + "Taml", // Tamil + "Telu", // Telugu +]; + +pub(crate) static USE_SCRIPTS: &[&str] = &[ + // Correct as at Unicode 15.0 + "Adlm", // Adlam + "Ahom", // Ahom + "Bali", // Balinese + "Batk", // Batak + "Brah", // Brahmi + "Bugi", // Buginese + "Buhd", // Buhid + "Cakm", // Chakma + "Cham", // Cham + "Chrs", // Chorasmian + "Cpmn", // Cypro Minoan + "Diak", // Dives Akuru + "Dogr", // Dogra + "Dupl", // Duployan + "Egyp", // Egyptian Hieroglyphs + "Elym", // Elymaic + "Gong", // Gunjala Gondi + "Gonm", // Masaram Gondi + "Gran", // Grantha + "Hano", // Hanunoo + "Hmng", // Pahawh Hmong + "Hmnp", // Nyiakeng Puachue Hmong + "Java", // Javanese + "Kali", // Kayah Li + "Kawi", // Kawi + "Khar", // Kharosthi + "Khoj", // Khojki + "Kits", // Khitan Small Script + "Kthi", // Kaithi + "Lana", // Tai Tham + "Lepc", // Lepcha + "Limb", // Limbu + "Mahj", // Mahajani + "Maka", // Makasar + "Mand", // Mandaic + "Mani", // Manichaean + "Marc", // Marchen + "Medf", // Medefaidrin + "Modi", // Modi + "Mong", // Mongolian + "Mtei", // Meetei Mayek + "Mult", // Multani + "Nagm", // Nag Mundari + "Nand", // Nandinagari + "Newa", // Newa + "Nhks", // Bhaiksuki + "Nko ", // Nko + "Ougr", // Old Uyghur + "Phag", // Phags Pa + "Phlp", // Psalter Pahlavi + "Plrd", // Miao + "Rjng", // Rejang + "Rohg", // Hanifi Rohingya + "Saur", // Saurashtra + "Shrd", // Sharada + "Sidd", // Siddham + "Sind", // Khudawadi + "Sogd", // Sogdian + "Sogo", // Old Sogdian + "Soyo", // Soyombo + "Sund", // Sundanese + "Sylo", // Syloti Nagri + "Tagb", // Tagbanwa + "Takr", // Takri + "Tale", // Tai Le + "Tavt", // Tai Viet + "Tfng", // Tifinagh + "Tglg", // Tagalog + "Tibt", // Tibetan + "Tirh", // Tirhuta + "Tnsa", // Tangsa + "Toto", // Toto + "Vith", // Vithkuqi + "Wcho", // Wancho + "Yezi", // Yezidi + "Zanb", // Zanabazar Square +]; + +#[cfg(test)] +mod tests { + use super::*; + + /// we want to binary search these, so let's enforce that they are sorted, + /// to avoid future headaches + #[test] + fn const_arrays_are_sorted() { + fn get_original_and_sorted_items( + items: &[(T, U)], + ) -> (Vec, Vec) { + let originals = items.iter().map(|(a, _)| a.clone()).collect::>(); + let mut sorted = originals.clone(); + sorted.sort(); + (originals, sorted) + } + + let (actual, expected) = get_original_and_sorted_items(SCRIPT_ALIASES); + assert_eq!(actual, expected); + let (actual, expected) = get_original_and_sorted_items(SCRIPT_EXCEPTIONS_REVERSED); + assert_eq!(actual, expected); + let (actual, expected) = get_original_and_sorted_items(NEW_SCRIPTS); + assert_eq!(actual, expected); + let (actual, expected) = get_original_and_sorted_items(NEW_SCRIPT_TAGS); + assert_eq!(actual, expected); + let (actual, expected) = get_original_and_sorted_items(SCRIPT_EXCEPTIONS); + assert_eq!(actual, expected); + } +} diff --git a/fontbe/src/features/properties.rs b/fontbe/src/features/properties.rs index db06611ec..a9bf7b2c6 100644 --- a/fontbe/src/features/properties.rs +++ b/fontbe/src/features/properties.rs @@ -12,8 +12,13 @@ use write_fonts::{ types::{GlyphId, Tag}, }; -// SAFETY: we can easily verify that neither of these strings contains a nul byte. +use crate::features::ot_tags::{NEW_SCRIPTS, SCRIPT_ALIASES, SCRIPT_EXCEPTIONS_REVERSED}; + +use super::ot_tags::{DFLT_SCRIPT, INDIC_SCRIPTS, NEW_SCRIPT_TAGS, SCRIPT_EXCEPTIONS, USE_SCRIPTS}; + +// SAFETY: we can visually verify that these inputs contain only non-null ASCII bytes // (this is the only way we can declare these as constants) +// TODO: remove this if https://github.com/unicode-org/icu4x/pull/4691 is merged/released pub const COMMON_SCRIPT: UnicodeShortName = unsafe { UnicodeShortName::from_bytes_unchecked(*b"Zyyy") }; pub const INHERITED_SCRIPT: UnicodeShortName = @@ -56,8 +61,11 @@ impl CharMap for Vec<(Arc, GlyphId)> { impl ScriptDirection { /// Returns the writing direction for the provided script + // pub(crate) fn for_script(script: &UnicodeShortName) -> Self { match script.as_str() { + // this list from + // "Zyyy" => ScriptDirection::Auto, "Arab" | "Hebr" | "Syrc" | "Thaa" | "Cprt" | "Khar" | "Phnx" | "Nkoo" | "Lydi" | "Avst" | "Armi" | "Phli" | "Prti" | "Sarb" | "Orkh" | "Samr" | "Mand" | "Merc" @@ -200,148 +208,6 @@ pub(crate) fn glyphs_by_bidi_class( ) } -const DFLT_SCRIPT: Tag = Tag::new(b"DFLT"); - -static SCRIPT_ALIASES: &[(Tag, Tag)] = &[(Tag::new(b"jamo"), Tag::new(b"hang"))]; - -static SCRIPT_EXCEPTIONS: &[(&str, Tag)] = &[ - ("Hira", Tag::new(b"kana")), - ("Hrkt", Tag::new(b"kana")), - ("Laoo", Tag::new(b"lao ")), - ("Nkoo", Tag::new(b"nko ")), - ("Vaii", Tag::new(b"vai ")), - ("Yiii", Tag::new(b"yi ")), - ("Zinh", DFLT_SCRIPT), - ("Zmth", Tag::new(b"math")), - ("Zyyy", DFLT_SCRIPT), - ("Zzzz", DFLT_SCRIPT), -]; - -// 'math' is used as a script in opentype features: -// -static SCRIPT_EXCEPTIONS_REVERSED: &[(Tag, &str)] = &[(Tag::new(b"math"), "Zmth")]; - -// I don't know what's going on here, just copying OTTags.py -static NEW_SCRIPTS: &[(Tag, &str)] = &[ - (Tag::new(b"bng2"), "Beng"), - (Tag::new(b"dev2"), "Deva"), - (Tag::new(b"gjr2"), "Gujr"), - (Tag::new(b"gur2"), "Guru"), - (Tag::new(b"knd2"), "Knda"), - (Tag::new(b"mlm2"), "Mlym"), - (Tag::new(b"mym2"), "Mymr"), - (Tag::new(b"ory2"), "Orya"), - (Tag::new(b"tel2"), "Telu"), - (Tag::new(b"tml2"), "Taml"), -]; - -// I don't know what's going on here, just copying OTTags.py -static NEW_SCRIPT_TAGS: &[(&str, Tag)] = &[ - ("Beng", Tag::new(b"bng2")), - ("Deva", Tag::new(b"dev2")), - ("Gujr", Tag::new(b"gjr2")), - ("Guru", Tag::new(b"gur2")), - ("Kana", Tag::new(b"knd2")), - ("Mlym", Tag::new(b"mlm2")), - ("Mymr", Tag::new(b"mym2")), - ("Orya", Tag::new(b"ory2")), - ("Taml", Tag::new(b"tml2")), - ("Telu", Tag::new(b"tel2")), -]; - -static INDIC_SCRIPTS: &[&str] = &[ - "Beng", // Bengali - "Deva", // Devanagari - "Gujr", // Gujarati - "Guru", // Gurmukhi - "Knda", // Kannada - "Mlym", // Malayalam - "Orya", // Oriya - "Sinh", // Sinhala - "Taml", // Tamil - "Telu", // Telugu -]; - -static USE_SCRIPTS: &[&str] = &[ - // Correct as at Unicode 15.0 - "Adlm", // Adlam - "Ahom", // Ahom - "Bali", // Balinese - "Batk", // Batak - "Brah", // Brahmi - "Bugi", // Buginese - "Buhd", // Buhid - "Cakm", // Chakma - "Cham", // Cham - "Chrs", // Chorasmian - "Cpmn", // Cypro Minoan - "Diak", // Dives Akuru - "Dogr", // Dogra - "Dupl", // Duployan - "Egyp", // Egyptian Hieroglyphs - "Elym", // Elymaic - "Gong", // Gunjala Gondi - "Gonm", // Masaram Gondi - "Gran", // Grantha - "Hano", // Hanunoo - "Hmng", // Pahawh Hmong - "Hmnp", // Nyiakeng Puachue Hmong - "Java", // Javanese - "Kali", // Kayah Li - "Kawi", // Kawi - "Khar", // Kharosthi - "Khoj", // Khojki - "Kits", // Khitan Small Script - "Kthi", // Kaithi - "Lana", // Tai Tham - "Lepc", // Lepcha - "Limb", // Limbu - "Mahj", // Mahajani - "Maka", // Makasar - "Mand", // Mandaic - "Mani", // Manichaean - "Marc", // Marchen - "Medf", // Medefaidrin - "Modi", // Modi - "Mong", // Mongolian - "Mtei", // Meetei Mayek - "Mult", // Multani - "Nagm", // Nag Mundari - "Nand", // Nandinagari - "Newa", // Newa - "Nhks", // Bhaiksuki - "Nko ", // Nko - "Ougr", // Old Uyghur - "Phag", // Phags Pa - "Phlp", // Psalter Pahlavi - "Plrd", // Miao - "Rjng", // Rejang - "Rohg", // Hanifi Rohingya - "Saur", // Saurashtra - "Shrd", // Sharada - "Sidd", // Siddham - "Sind", // Khudawadi - "Sogd", // Sogdian - "Sogo", // Old Sogdian - "Soyo", // Soyombo - "Sund", // Sundanese - "Sylo", // Syloti Nagri - "Tagb", // Tagbanwa - "Takr", // Takri - "Tale", // Tai Le - "Tavt", // Tai Viet - "Tfng", // Tifinagh - "Tglg", // Tagalog - "Tibt", // Tibetan - "Tirh", // Tirhuta - "Tnsa", // Tangsa - "Toto", // Toto - "Vith", // Vithkuqi - "Wcho", // Wancho - "Yezi", // Yezidi - "Zanb", // Zanabazar Square -]; - pub(crate) fn dist_feature_enabled_scripts() -> HashSet { INDIC_SCRIPTS .iter() diff --git a/fontbe/src/orchestration.rs b/fontbe/src/orchestration.rs index da189842f..a92d2d441 100644 --- a/fontbe/src/orchestration.rs +++ b/fontbe/src/orchestration.rs @@ -1,7 +1,7 @@ //! Helps coordinate the graph execution for BE use std::{ - collections::{BTreeMap, HashMap, HashSet}, + collections::{BTreeMap, HashSet}, fmt::Display, fs::File, io::{self, BufReader, BufWriter, Read, Write}, @@ -377,7 +377,7 @@ pub struct FeaRsKerns { /// ordered! pub lookups: Vec>, /// each value is a set of lookups, referenced by their order in array above - pub features: HashMap>, + pub features: BTreeMap>, } /// The abstract syntax tree of any user FEA.