Skip to content

Commit

Permalink
Compositor refactoration with hard copy support. (#99)
Browse files Browse the repository at this point in the history
  • Loading branch information
ShikiSuen authored Mar 10, 2023
1 parent a7599a2 commit 1d095da
Show file tree
Hide file tree
Showing 9 changed files with 286 additions and 238 deletions.
88 changes: 50 additions & 38 deletions Sources/Megrez/1_Compositor.swift
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,26 @@ public extension Megrez {
self.separator = separator
}

/// Creates a copy of the given compositor.
/// - Remark: Because Node is not a struct, it cannot be truly duplicated when a Compositor
///   is copied. As a result, changes made to the Nodes inside the copied Compositor would be
///   reflected in the original Compositor. This can cause unexpected confusion in some
///   situations, hence this dedicated copy initializer.
/// - Parameter target: The compositor to copy from.
public init(from target: Compositor) {
  self.cursor = target.cursor
  self.marker = target.marker
  self.separator = target.separator
  // Walked nodes are rebuilt element by element through each node's `copy`.
  self.walkedNodes = target.walkedNodes.map { $0.copy }
  self.keys = target.keys
  // Spans are rebuilt element by element through each span's `hardCopy`.
  self.spans = target.spans.map { $0.hardCopy }
  self.langModel = target.langModel
}

/// A hard copy of this compositor, produced by `init(from:)`.
/// - Remark: Because Node is not a struct, it cannot be truly duplicated when a Compositor
///   is copied. As a result, changes made to the Nodes inside the copied Compositor would be
///   reflected in the original Compositor. This can cause unexpected confusion in some
///   situations, hence the dedicated copy initializer used here.
public var hardCopy: Compositor {
  Compositor(from: self)
}

/// 重置包括游標在內的各項參數,且清空各種由組字器生成的內部資料。
///
/// 將已經被插入的索引鍵陣列與幅位單元陣列(包括其內的節點)全部清空。
Expand Down Expand Up @@ -167,21 +187,19 @@ public extension Megrez {
public var dumpDOT: String {
// C# StringBuilder 與 Swift NSMutableString 能提供爆發性的效能。
let strOutput: NSMutableString = .init(string: "digraph {\ngraph [ rankdir=LR ];\nBOS;\n")
for (p, span) in spans.enumerated() {
for ni in 0 ... (span.maxLength) {
guard let np = span.nodeOf(length: ni) else { continue }
if p == 0 {
strOutput.append("BOS -> \(np.value);\n")
}
spans.enumerated().forEach { p, span in
(0 ... span.maxLength).forEach { ni in
guard let np = span[ni] else { return }
if p == 0 { strOutput.append("BOS -> \(np.value);\n") }
strOutput.append("\(np.value);\n")
if (p + ni) < spans.count {
let destinationSpan = spans[p + ni]
for q in 0 ... (destinationSpan.maxLength) {
guard let dn = destinationSpan.nodeOf(length: q) else { continue }
(0 ... destinationSpan.maxLength).forEach { q in
guard let dn = destinationSpan[q] else { return }
strOutput.append(np.value + " -> " + dn.value + ";\n")
}
}
guard (p + ni) == spans.count else { continue }
guard (p + ni) == spans.count else { return }
strOutput.append(np.value + " -> EOS;\n")
}
}
Expand All @@ -198,11 +216,11 @@ extension Megrez.Compositor {
/// - Parameters:
/// - location: 給定的幅位座標。
/// - action: 指定是擴張還是縮減一個幅位。
mutating func resizeGrid(at location: Int, do action: ResizeBehavior) {
private mutating func resizeGrid(at location: Int, do action: ResizeBehavior) {
let location = max(min(location, spans.count), 0) // 防呆
switch action {
case .expand:
spans.insert(SpanUnit(), at: location)
spans.insert(.init(), at: location)
if [0, spans.count].contains(location) { return }
case .shrink:
if spans.count == location { return }
Expand Down Expand Up @@ -248,60 +266,54 @@ extension Megrez.Compositor {
let affectedLength = Megrez.Compositor.maxSpanLength - 1
let begin = max(0, location - affectedLength)
guard location >= begin else { return }
for i in begin ..< location {
spans[i].dropNodesOfOrBeyond(length: location - i + 1)
(begin ..< location).forEach { delta in
((location - delta + 1) ... Self.maxSpanLength).forEach { theLength in
spans[delta][theLength] = nil
}
}
}

/// Fetches the entries of the key array that lie in the given range.
/// - Parameter range: The half-open index range to read from `keys`.
/// - Returns: The `description` of every key in the range, or an empty array when the
///   range's lower bound is negative or its upper bound exceeds `keys.count`.
func getJoinedKeyArray(range: Range<Int>) -> [String] {
private func getJoinedKeyArray(range: Range<Int>) -> [String] {
// `contains` cannot be used for the check below; it would require at least macOS 13 Ventura.
guard range.upperBound <= keys.count, range.lowerBound >= 0 else { return [] }
return keys[range].map(\.description)
}

/// Fetches the node at the given location that has the given span length and key array.
/// - Parameters:
///   - location: The cursor position. It is clamped into the valid index range of `spans`.
///   - length: The span length of the wanted node.
///   - keyArray: The key array the node is required to match.
/// - Returns: The matching node, or nil when there are no spans, when the span at that
///   location has no node of the given length, or when the node's key array differs.
func getNode(at location: Int, length: Int, keyArray: [String]) -> Node? {
  // With no spans, the clamp below would still yield index 0 and the subscript would trap.
  guard !spans.isEmpty else { return nil }
  let location = max(min(location, spans.count - 1), 0) // Foolproofing: clamp into bounds.
  guard let node = spans[location].nodeOf(length: length) else { return nil }
  return keyArray == node.keyArray ? node : nil
}

/// 根據當前狀況更新整個組字器的節點文脈。
/// - Parameter updateExisting: 是否根據目前的語言模型的資料狀態來對既有節點更新其內部的單元圖陣列資料。
/// 該特性可以用於「在選字窗內屏蔽了某個詞之後,立刻生效」這樣的軟體功能需求的實現。
/// - Returns: 新增或影響了多少個節點。如果返回「0」則表示可能發生了錯誤。
@discardableResult public mutating func update(updateExisting: Bool = false) -> Int {
let maxSpanLength = Megrez.Compositor.maxSpanLength
let range = max(0, cursor - maxSpanLength) ..< min(cursor + maxSpanLength, keys.count)
let rangeOfPositions = max(0, cursor - maxSpanLength) ..< min(cursor + maxSpanLength, keys.count)
var nodesChanged = 0
for position in range {
for theLength in 1 ... min(maxSpanLength, range.upperBound - position) {
let joinedKeyArray = getJoinedKeyArray(range: position ..< (position + theLength))
if let theNode = getNode(at: position, length: theLength, keyArray: joinedKeyArray) {
if !updateExisting { continue }
rangeOfPositions.forEach { position in
let rangeOfLengths = 1 ... min(maxSpanLength, rangeOfPositions.upperBound - position)
rangeOfLengths.forEach { theLength in
guard position + theLength <= keys.count, position >= 0 else { return }
let joinedKeyArray = keys[position ..< (position + theLength)].map(\.description)

if let theNode = spans[position][theLength] {
if !updateExisting { return }
let unigrams = langModel.unigramsFor(keyArray: joinedKeyArray)
// 自動銷毀無效的節點。
if unigrams.isEmpty {
if theNode.keyArray.count == 1 { continue }
spans[position].nullify(node: theNode)
if theNode.keyArray.count == 1 { return }
spans[position][theNode.spanLength] = nil
} else {
theNode.syncingUnigrams(from: unigrams)
}
nodesChanged += 1
continue
return
}
let unigrams = langModel.unigramsFor(keyArray: joinedKeyArray)
guard !unigrams.isEmpty else { continue }
spans[position].append(
node: .init(keyArray: joinedKeyArray, spanLength: theLength, unigrams: unigrams)
guard !unigrams.isEmpty else { return }
// 這裡原本用 SpanUnit.addNode 來完成的,但直接當作辭典來互動的話也沒差。
spans[position][theLength] = .init(
keyArray: joinedKeyArray, spanLength: theLength, unigrams: unigrams
)
nodesChanged += 1
}
Expand Down
33 changes: 14 additions & 19 deletions Sources/Megrez/2_Walker.swift
Original file line number Diff line number Diff line change
Expand Up @@ -13,53 +13,48 @@ public extension Megrez.Compositor {
/// 對於 `G = (V, E)`,該算法的運行次數為 `O(|V|+|E|)`,其中 `G` 是一個有向無環圖。
/// 這意味著,即使軌格很大,也可以用很少的算力就可以爬軌。
/// - Returns: 爬軌結果+該過程是否順利執行。
@discardableResult mutating func walk() -> (walkedNode: [Node], succeeded: Bool) {
var result = [Node]()
@discardableResult mutating func walk() -> (walkedNodes: [Megrez.Node], succeeded: Bool) {
var result = [Megrez.Node]()
defer { walkedNodes = result }
guard !spans.isEmpty else { return (result, true) }

var vertexSpans = [[Vertex]]()
for _ in spans {
spans.forEach { _ in
vertexSpans.append(.init())
}

for (i, span) in spans.enumerated() {
for j in 1 ... max(span.maxLength, 1) {
if let theNode = span.nodeOf(length: j) {
vertexSpans[i].append(.init(node: theNode))
}
spans.enumerated().forEach { i, span in
(1 ... max(span.maxLength, 1)).forEach { j in
guard let theNode = span[j] else { return }
vertexSpans[i].append(.init(node: theNode))
}
}

let terminal = Vertex(node: .init(keyArray: ["_TERMINAL_"]))
var root = Vertex(node: .init(keyArray: ["_ROOT_"]))

for (i, vertexSpan) in vertexSpans.enumerated() {
for vertex in vertexSpan {
vertexSpans.enumerated().forEach { i, vertexSpan in
vertexSpan.forEach { vertex in
let nextVertexPosition = i + vertex.node.spanLength
if nextVertexPosition == vertexSpans.count {
vertex.edges.append(terminal)
continue
}
for nextVertex in vertexSpans[nextVertexPosition] {
vertex.edges.append(nextVertex)
return
}
vertexSpans[nextVertexPosition].forEach { vertex.edges.append($0) }
}
}

root.distance = 0
root.edges.append(contentsOf: vertexSpans[0])

var ordered = topologicalSort(root: &root)
for (j, neta) in ordered.reversed().enumerated() {
for (k, _) in neta.edges.enumerated() {
relax(u: neta, v: &neta.edges[k])
}
ordered.reversed().enumerated().forEach { j, neta in
neta.edges.indices.forEach { relax(u: neta, v: &neta.edges[$0]) }
ordered[j] = neta
}

var iterated = terminal
var walked = [Node]()
var walked = [Megrez.Node]()
var totalLengthOfKeys = 0

while let itPrev = iterated.prev {
Expand Down
50 changes: 33 additions & 17 deletions Sources/Megrez/3_KeyValuePaired.swift
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import Foundation

public extension Megrez.Compositor {
public extension Megrez {
/// 鍵值配對,乃索引鍵陣列與讀音的配對單元。
struct KeyValuePaired: Equatable, Hashable, Comparable, CustomStringConvertible {
/// 索引鍵陣列。一般情況下用來放置讀音等可以用來作為索引的內容。
Expand All @@ -18,6 +18,8 @@ public extension Megrez.Compositor {
public var isValid: Bool { !keyArray.joined().isEmpty && !value.isEmpty }
/// 將當前鍵值列印成一個字串,但如果該鍵值配對為空的話則僅列印「()」。
public var toNGramKey: String { !isValid ? "()" : "(" + joinedKey() + "," + value + ")" }
/// 通用陣列表達形式。
public var tupletExpression: (keyArray: [String], value: String) { (keyArray, value) }

/// 初期化一組鍵值配對。
/// - Parameters:
Expand All @@ -28,6 +30,13 @@ public extension Megrez.Compositor {
self.value = value.isEmpty ? "N/A" : value
}

/// Initializes a key-value pair from its tuple expression.
/// - Parameter tupletExpression: The `(keyArray, value)` tuple to build from. An empty key
///   array is replaced with `["N/A"]`, and an empty value is replaced with `"N/A"`.
public init(_ tupletExpression: (keyArray: [String], value: String)) {
  let (givenKeys, givenValue) = tupletExpression
  self.keyArray = givenKeys.isEmpty ? ["N/A"] : givenKeys
  self.value = givenValue.isEmpty ? "N/A" : givenValue
}

/// 初期化一組鍵值配對。
/// - Parameters:
/// - key: 索引鍵。一般情況下用來放置讀音等可以用來作為索引的內容。
Expand Down Expand Up @@ -72,7 +81,9 @@ public extension Megrez.Compositor {
|| (lhs.keyArray.count == rhs.keyArray.count && lhs.value >= rhs.value)
}
}
}

public extension Megrez.Compositor {
/// 規定候選字陣列內容的獲取範圍類型:
/// - all: 不只包含其它兩類結果,還允許游標穿插候選字。
/// - beginAt: 僅獲取從當前游標位置開始的節點內的候選字。
Expand All @@ -84,26 +95,25 @@ public extension Megrez.Compositor {
/// 話,那麼這裡會用到 location - 1、以免去在呼叫該函式後再處理的麻煩。
/// - Parameter location: 游標位置。
/// - Returns: 候選字音配對陣列。
func fetchCandidates(at location: Int, filter: CandidateFetchFilter = .all) -> [KeyValuePaired] {
var result = [KeyValuePaired]()
func fetchCandidates(at location: Int, filter: CandidateFetchFilter = .all) -> [Megrez.KeyValuePaired] {
var result = [Megrez.KeyValuePaired]()
guard !keys.isEmpty else { return result }
let location = max(min(location, keys.count - 1), 0) // 防呆
let anchors: [NodeAnchor] = fetchOverlappingNodes(at: location).stableSorted {
// 按照讀音的長度(幅位長度)來給節點排序。
$0.spanLength > $1.spanLength
}
let keyAtCursor = keys[location]
for theNode in anchors.map(\.node) {
if theNode.keyArray.isEmpty { continue }
for gram in theNode.unigrams {
anchors.map(\.node).filter(\.keyArray.isEmpty.negative).forEach { theNode in
theNode.unigrams.forEach { gram in
switch filter {
case .all:
// 得加上這道篩選,所以會出現很多無效結果
if !theNode.keyArray.contains(keyAtCursor) { continue }
// 得加上這道篩選,不然會出現很多無效結果
if !theNode.keyArray.contains(keyAtCursor) { return }
case .beginAt:
if theNode.keyArray[0] != keyAtCursor { continue }
if theNode.keyArray[0] != keyAtCursor { return }
case .endAt:
if theNode.keyArray.reversed()[0] != keyAtCursor { continue }
if theNode.keyArray.reversed()[0] != keyAtCursor { return }
}
result.append(.init(keyArray: theNode.keyArray, value: gram.value))
}
Expand All @@ -120,7 +130,7 @@ public extension Megrez.Compositor {
/// - overrideType: 指定覆寫行為。
/// - Returns: 該操作是否成功執行。
@discardableResult func overrideCandidate(
_ candidate: KeyValuePaired, at location: Int, overrideType: Node.OverrideType = .withHighScore
_ candidate: Megrez.KeyValuePaired, at location: Int, overrideType: Megrez.Node.OverrideType = .withHighScore
)
-> Bool
{
Expand All @@ -137,7 +147,7 @@ public extension Megrez.Compositor {
/// - Returns: 該操作是否成功執行。
@discardableResult func overrideCandidateLiteral(
_ candidate: String,
at location: Int, overrideType: Node.OverrideType = .withHighScore
at location: Int, overrideType: Megrez.Node.OverrideType = .withHighScore
) -> Bool {
overrideCandidateAgainst(keyArray: nil, at: location, value: candidate, type: overrideType)
}
Expand All @@ -151,7 +161,7 @@ public extension Megrez.Compositor {
/// - value: 資料值。
/// - type: 指定覆寫行為。
/// - Returns: 該操作是否成功執行。
internal func overrideCandidateAgainst(keyArray: [String]?, at location: Int, value: String, type: Node.OverrideType)
internal func overrideCandidateAgainst(keyArray: [String]?, at location: Int, value: String, type: Megrez.Node.OverrideType)
-> Bool
{
let location = max(min(location, keys.count), 0) // 防呆
Expand All @@ -166,18 +176,18 @@ public extension Megrez.Compositor {

guard let overridden = overridden else { return false } // 啥也不覆寫。

for i in overridden.spanIndex ..< min(spans.count, overridden.spanIndex + overridden.node.spanLength) {
(overridden.spanIndex ..< min(spans.count, overridden.spanIndex + overridden.node.spanLength)).forEach { i in
/// 咱們還得弱化所有在相同的幅位座標的節點的複寫權重。舉例說之前爬軌的結果是「A BC」
/// 且 A 與 BC 都是被覆寫的結果,然後使用者現在在與 A 相同的幅位座標位置
/// 選了「DEF」,那麼 BC 的覆寫狀態就有必要重設(但 A 不用重設)。
arrOverlappedNodes = fetchOverlappingNodes(at: i)
for anchor in arrOverlappedNodes {
if anchor.node == overridden.node { continue }
arrOverlappedNodes.forEach { anchor in
if anchor.node == overridden.node { return }
if !overridden.node.joinedKey(by: "\t").contains(anchor.node.joinedKey(by: "\t"))
|| !overridden.node.value.contains(anchor.node.value)
{
anchor.node.reset()
continue
return
}
anchor.node.overridingScore /= 4
}
Expand Down Expand Up @@ -208,3 +218,9 @@ private extension Sequence {
.map(\.element)
}
}

// MARK: - Bool Extension (Private)

extension Bool {
  /// The logical negation of this value.
  ///
  /// Having negation as a property allows it to appear inside key-path expressions,
  /// e.g. `\.keyArray.isEmpty.negative`.
  var negative: Bool {
    self == false
  }
}
Loading

0 comments on commit 1d095da

Please sign in to comment.