Skip to content

Commit

Permalink
[natural_translit] Add English letters Latin inventory and English la…
Browse files Browse the repository at this point in the history
…nguage params.

PiperOrigin-RevId: 720611268
  • Loading branch information
isingoo authored and copybara-github committed Jan 28, 2025
1 parent 018a0d5 commit b44e3c6
Show file tree
Hide file tree
Showing 11 changed files with 3,542 additions and 85 deletions.
10 changes: 10 additions & 0 deletions nisaba/scripts/natural_translit/language_params/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,16 @@ py_library(
],
)

# Library target for the English language parameters module (en.py).
# It depends on the x_mul phonology inventory, the grapheme library and the
# latn script inventory, which are the three modules en.py imports.
py_library(
name = "en",
srcs = ["en.py"],
deps = [
"//nisaba/scripts/natural_translit/phonology/inventories:x_mul",
"//nisaba/scripts/natural_translit/script:grapheme",
"//nisaba/scripts/natural_translit/script/inventories:latn",
],
)

py_library(
name = "gu",
srcs = ["gu.py"],
Expand Down
230 changes: 230 additions & 0 deletions nisaba/scripts/natural_translit/language_params/en.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
# Copyright 2024 Nisaba Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Language parameters for English."""

from nisaba.scripts.natural_translit.phonology.inventories import x_mul
from nisaba.scripts.natural_translit.script import grapheme as g
from nisaba.scripts.natural_translit.script.inventories import latn as l


def _grapheme_inventory() -> g.Grapheme.Inventory:
  """Creates the Latin-script grapheme inventory for English ('en').

  The inventory is assembled in these steps:
    1. The 26 lowercase Latin letters are imported together with their
       uppercase counterparts as lower/upper case feature pairs.
    2. A 'letter' supplement is made from all upper and lower graphemes.
    3. Every lowercase letter gets its descriptive features from the union of
       the symbols it commonly maps to.
    4. Every uppercase letter takes the descriptives of its lowercase letter.
    5. 'vowel' and 'consonant' supplements are made (`y` belongs to both) and
       the atomics are synced with all supplements.

  Returns:
    The English grapheme inventory.
  """
  latin = l.GRAPHEMES
  phon = x_mul.INVENTORY
  features = g.Grapheme.GR_FEATURES
  inv = g.Grapheme.Inventory(features.script.latn, 'en')

  # Import a..z with their uppercase counterparts as case feature pairs.
  lowercase = [getattr(latin, name) for name in 'abcdefghijklmnopqrstuvwxyz']
  case_pairs = [(letter, letter.upper) for letter in lowercase]
  inv.import_as_feature_pairs(
      features.case.lower, features.case.upper, *case_pairs
  )
  inv.make_iterable_suppl('letter', *inv.upper, *inv.lower)

  # Descriptive features from common one-to-one grapheme-phoneme mappings.
  # Many-to-many values for substrings, universal mappings such as vowel
  # reduction, and common phonological operations such as palatalization will
  # be matched through g2p/g2g alignables and/or built-in phonological rules.
  #
  # Initial values retrieved on 2025-01-24 from:
  # https://en.wikipedia.org/wiki/English_orthography#Spelling-to-sound_correspondences
  #
  # The entries are applied in the order listed. `y` must stay after `i`
  # because it takes the descriptives that `i` has at that point.
  mappings = [
      # Vowels.
      # Technically all durations are {short, long} through union of lax and
      # tense/heavy phonemes, but it's left as {any} for now.
      (
          # lax: man, tense: mane, heavy: mar, heavy-r: mare
          inv.a,
          (
              phon.ae,  # lax nucleus
              phon.e,  # tense nucleus
              phon.iy,  # tense glide
              phon.aw,  # heavy nucleus
              phon.eh,  # heavy-r nucleus
              phon.ec,  # heavy-r glide
          ),
      ),
      (
          # lax: met, tense: meet, heavy: her, heavy-r: here
          inv.e,
          (
              phon.eh,  # lax nucleus
              phon.i,  # tense nucleus
              phon.ex,  # heavy nucleus
              phon.iy,  # heavy-r nucleus
              phon.ec,  # heavy-r glide
          ),
      ),
      (
          # lax: win, tense: wine, heavy: fir, heavy-r: fire
          inv.i,
          (
              phon.iy,  # lax nucleus; tense, heavy-r glide
              phon.a,  # tense, heavy-r nucleus
              phon.ex,  # heavy nucleus
              phon.ec,  # heavy-r second glide
          ),
      ),
      (
          # lax: mop, tense: mope, heavy: for, heavy-r: fore
          inv.o,
          (
              phon.ow,  # lax nucleus
              phon.o,  # tense nucleus
              phon.uv,  # tense glide
              phon.oh,  # heavy, heavy-r nucleus
          ),
      ),
      (
          # lax: hug, push, tense: huge, heavy: cur, heavy-r: cure
          inv.u,
          (
              phon.ah,  # lax nucleus
              phon.uv,  # lax, heavy-r nucleus
              phon.u,  # tense nucleus
              phon.ec,  # heavy-r glide
          ),
      ),
      # Consonants.
      (inv.b, (phon.b,)),
      (
          inv.c,
          (
              phon.s,  # city
              phon.k,  # cat
          ),
      ),
      (inv.d, (phon.d,)),  # dog
      (inv.f, (phon.f,)),  # fine
      (
          inv.g,
          (
              phon.g,  # get
              phon.d,  # gin stop
              phon.zh,  # gin fricative
          ),
      ),
      (inv.h, (phon.h,)),  # honey
      (
          inv.j,
          (
              phon.d,  # jump stop
              phon.zh,  # jump fricative
              phon.y,  # hallelujah
              phon.h,  # jalapeno
          ),
      ),
      (inv.k, (phon.k,)),  # key
      (inv.l, (phon.l,)),  # line
      (inv.m, (phon.m,)),  # mine
      (inv.n, (phon.n,)),  # name
      (inv.p, (phon.p,)),  # pill
      (inv.q, (phon.k,)),  # quick
      (inv.r, (phon.r,)),  # red
      (
          inv.s,
          (
              phon.s,  # saw
              phon.z,  # prison
              phon.sh,  # sugar
              phon.zh,  # vision
          ),
      ),
      (
          inv.t,
          (
              phon.t,  # ten, righteous stop
              phon.sh,  # ration, righteous fricative
              phon.zh,  # equation
          ),
      ),
      (inv.v, (phon.v,)),  # vine
      (inv.w, (phon.w,)),  # water
      (
          inv.x,
          (
              phon.k,  # box, anxious, luxurious(gb) stop
              phon.s,  # box fricative
              phon.g,  # anxiety, luxurious(us) stop
              phon.z,  # anxiety fricative
              phon.zh,  # luxurious fricative
              phon.sh,  # anxious fricative
          ),
      ),
      (
          inv.y,
          (
              phon.y,  # yes
              inv.i,  # flynn, fry, fyrd, pyre
          ),
      ),
      (
          inv.z,
          (
              phon.z,  # zoo
              phon.zh,  # seizure
              phon.t,  # schizophrenia stop
              phon.s,  # schizophrenia fricative
          ),
      ),
  ]
  for grapheme, sources in mappings:
    grapheme.update_descriptives_from_symbol(*sources)

  # Uppercase letters take the descriptives of their lowercase counterparts.
  for capital in inv.upper:
    capital.update_descriptives_from_symbol(capital.lower)

  # `y` is listed both as a vowel and as a consonant.
  vowels = [inv.a, inv.e, inv.i, inv.o, inv.u, inv.y]
  consonants = [getattr(inv, name) for name in 'bcdfghjklmnpqrstvwxyz']
  upper_vowels = [vowel.upper for vowel in vowels]
  upper_consonants = [consonant.upper for consonant in consonants]
  inv.make_iterable_suppl('vowel', *vowels, *upper_vowels)
  inv.make_iterable_suppl('consonant', *consonants, *upper_consonants)
  inv.sync_atomics(
      [inv.upper, inv.lower, inv.letter, inv.vowel, inv.consonant]
  )
  return inv


# English grapheme inventory, built once when this module is imported.
GRAPHEMES = _grapheme_inventory()
45 changes: 27 additions & 18 deletions nisaba/scripts/natural_translit/phonology/phonological_symbol.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,23 @@ def __init__(
ft.Feature.Profile(self.PH_DESCRIPTIVE_FEATURES, 'new')
)

def descriptives(self) -> ft.Feature.Profile:
  """Returns the phonology descriptive feature profile of this symbol."""
  descriptive_profile = self.features.phonology_descriptive
  return descriptive_profile

def update_descriptives(
    self, *features: ft.Feature.ITERABLE
) -> 'PhonologicalSymbol':
  """Updates this symbol's descriptive profile with the given features.

  Args:
    *features: Features passed on to the update of the descriptive profile.

  Returns:
    This symbol, so that calls can be chained.
  """
  descriptive_profile = self.descriptives()
  descriptive_profile.update(*features)
  return self

def update_descriptives_from_symbol(
    self, *symbols: 'PhonologicalSymbol'
) -> 'PhonologicalSymbol':
  """Updates the descriptives with those of each of the given symbols.

  Args:
    *symbols: Symbols whose descriptive profiles are collected and passed to
      `update_descriptives`.

  Returns:
    This symbol, so that calls can be chained.
  """
  source_profiles = [symbol.descriptives() for symbol in symbols]
  self.update_descriptives(*source_profiles)
  return self

class Inventory(sym.Symbol.Inventory):
"""Phonological symbol inventory."""

Expand All @@ -59,7 +76,7 @@ def or_from_suppl(self, suppl: ty.IterableThing) -> bool:
return self.atomics.add_suppl(exp.Or(*suppl, alias=suppl.alias))

def sync_atomics(
self, or_suppls: ty.ListOrNothing = ty.UNSPECIFIED
self, update_ors_from_suppls: ty.ListOrNothing = ty.UNSPECIFIED
) -> 'PhonologicalSymbol.Inventory':
"""Syncs the atomic inventory with the symbol inventory.
Expand All @@ -68,25 +85,24 @@ def sync_atomics(
Or supplements in atomics to include all members of the given supplements
in the list. For example, if an `inventory.vowel` iterable and a
corresponding `inventory.atomics.vowel` Or were initiated as `[a, e, i]`
and `(a | e | i)`, and later `[o, u]` was added to `inventory.vowel`,
this function will update `inventory.atomics.vowel` to
and `(a | e | i)` respectively, and later `[o, u]` was added to
`inventory.vowel`, this function will update `inventory.atomics.vowel` to
`(a | e | i | o | u)`.
Updates the atomic inventory with the supplements.
Args:
or_suppls: Optional list iterable supplements. If specified, the
corresponding Or supplement of the given supplements will be updated to
include all symbols.
update_ors_from_suppls: Optional list of iterable supplements. When
specified,
- if there's no corresponding Or in the atomics, a new one is created.
- if there is a corresponding Or, it's updated to include all symbols
in the given supplement.
Returns:
The inventory.
"""
for atomic in self.atomics:
for profile in atomic.features:
profile.update(atomic.symbol.features.get(profile.inventory))
for suppl in ty.type_check(or_suppls, []):
for suppl in ty.type_check(update_ors_from_suppls, []):
if suppl.alias not in self.atomics.suppl_aliases:
self.or_from_suppl(suppl)
self.atomics.get(suppl.alias).add(*suppl)
Expand Down Expand Up @@ -130,7 +146,7 @@ def copy(
language: str = '',
alias: str = '',
ipa: str = '',
) -> 'Phon':
) -> 'Phon':
"""Creates a copy of the Phon."""
return Phon(
language=language if language else self.language,
Expand All @@ -141,13 +157,6 @@ def copy(
features=self.features.copy(),
)

def update_descriptives(
    self, *features: ft.Feature.ITERABLE
) -> 'Phon':
  """Updates the phonology descriptive profile of the Phon.

  Args:
    *features: Features passed on to the update of the descriptive profile.

  Returns:
    This Phon, so that calls can be chained.
  """
  descriptive_profile = self.features.phonology_descriptive
  descriptive_profile.update(*features)
  return self

class Inventory(PhonologicalSymbol.Inventory):
"""Phon inventory."""

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def _test_inventory() -> po.Phon.Inventory:
ph_inv.make_iterable_suppl('close_like', ph_inv.e)
ph_inv.or_from_suppl(ph_inv.close_like)
ph_inv.close_like.add(ph_inv.i)
return ph_inv.sync_atomics(or_suppls=[ph_inv.vowel, ph_inv.close_like])
return ph_inv.sync_atomics([ph_inv.vowel, ph_inv.close_like])


_TEST = _test_inventory()
Expand Down
4 changes: 1 addition & 3 deletions nisaba/scripts/natural_translit/script/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,9 @@ py_library(
name = "grapheme",
srcs = ["grapheme.py"],
deps = [
"//nisaba/scripts/natural_translit/phonology:descriptive_features",
"//nisaba/scripts/natural_translit/utils:expression",
"//nisaba/scripts/natural_translit/phonology:phonological_symbol",
"//nisaba/scripts/natural_translit/utils:feature",
"//nisaba/scripts/natural_translit/utils:inventory",
"//nisaba/scripts/natural_translit/utils:symbol",
"//nisaba/scripts/natural_translit/utils:type_op",
requirement("pycountry"),
],
Expand Down
Loading

0 comments on commit b44e3c6

Please sign in to comment.