// Command scel2txt converts every Sogou cell dictionary (*.scel) found in
// ./scel into a tab-separated "word<TAB>pinyin" list under ./out, and merges
// all of them into a single Rime dictionary file.
//
// Provenance: patch series "add golang version" / "reformat" by Le Wang,
// 2023-04-07.
package main

import (
	"bytes"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"io/ioutil"
	"os"
	"path/filepath"
	"strings"
	"unicode/utf16"
)

const (
	// pyTableOffset is where the pinyin table entries start; the four bytes
	// at 0x1540 that precede them are skipped.
	pyTableOffset = 0x1540 + 4

	// extSkip is the number of bytes skipped after every word
	// (ext_len plus ext, 12 bytes in total).
	extSkip = 12

	// dictFileHeader is the preamble of the merged dictionary. Its single %s
	// verb receives the list of source dictionary names.
	dictFileHeader = `# Rime dictionary
# encoding: utf-8
#
# Sogou Pinyin Dict - 搜狗细胞词库
#
# https://pinyin.sogou.com/dict/
#
# 包括:
#
%s
#

---
name: luna_pinyin.sogou
version: "1.0"
sort: by_weight
use_preset_vocabulary: true
...
`
)

// errUnsupportedDict is reported when the type byte at offset 4 is not one of
// the two values this tool knows a word-section offset for.
var errUnsupportedDict = errors.New("不支持的文件类型(无法获取汉语词组的偏移量)")

// exitOn prints msg followed by err and terminates the process with status 1
// when err is non-nil; it does nothing otherwise.
func exitOn(msg string, err error) {
	if err != nil {
		fmt.Println(msg, err)
		os.Exit(1)
	}
}

// readUtf16Str decodes length bytes of little-endian UTF-16 from b.
//
// When offset is non-negative the reader is first positioned at that absolute
// offset; a negative offset means "continue from the current position".
// If fewer than length bytes are available only the complete 16-bit units
// that could be read are decoded, so a truncated input never yields
// zero-filled padding. A non-positive length returns the empty string.
func readUtf16Str(b *bytes.Reader, offset int64, length int) string {
	if offset >= 0 {
		if _, err := b.Seek(offset, io.SeekStart); err != nil {
			return ""
		}
	}
	if length <= 0 {
		return ""
	}

	data := make([]byte, length)
	// A short read is tolerated on purpose: n tells us how much is usable.
	n, _ := io.ReadFull(b, data)
	data = data[:n-n%2]

	u16 := make([]uint16, len(data)/2)
	for i := range u16 {
		u16[i] = binary.LittleEndian.Uint16(data[2*i:])
	}
	return string(utf16.Decode(u16))
}

// readUint16OK reads one little-endian uint16 from b. The boolean is false
// when fewer than two bytes remain.
func readUint16OK(b *bytes.Reader) (uint16, bool) {
	var raw [2]byte
	if _, err := io.ReadFull(b, raw[:]); err != nil {
		return 0, false
	}
	return binary.LittleEndian.Uint16(raw[:]), true
}

// readUint16 reads one little-endian uint16 from b and returns 0 when the
// reader does not hold two more bytes.
func readUint16(b *bytes.Reader) uint16 {
	v, _ := readUint16OK(b)
	return v
}

// findHzOffset inspects the type byte at offset 4 and returns the offset of
// the word section: 0x2628 for 0x44 and 0x26c4 for 0x45. Any other value, or
// an input shorter than five bytes, yields errUnsupportedDict.
func findHzOffset(b *bytes.Reader) (int64, error) {
	if _, err := b.Seek(4, io.SeekStart); err != nil {
		return 0, err
	}
	mask, err := b.ReadByte()
	if err != nil {
		return 0, errUnsupportedDict
	}
	switch mask {
	case 0x44:
		return 0x2628, nil
	case 0x45:
		return 0x26c4, nil
	default:
		return 0, errUnsupportedDict
	}
}

// getHzOffset returns the offset of the word section of the dictionary in b.
// It prints a message and exits with status 1 for unsupported files.
func getHzOffset(b *bytes.Reader) int64 {
	offset, err := findHzOffset(b)
	if err != nil {
		fmt.Println(err)
		os.Exit(1)
	}
	return offset
}

// getDictMeta returns the title, category, description and samples fields of
// the dictionary header. Each field occupies a fixed-size window, so the
// value is cut at the first NUL and the padding behind it is dropped.
func getDictMeta(b *bytes.Reader) (string, string, string, string) {
	field := func(start, end int64) string {
		s := readUtf16Str(b, start, int(end-start))
		if i := strings.IndexByte(s, 0); i >= 0 {
			s = s[:i]
		}
		return s
	}

	title := field(0x130, 0x338)
	category := field(0x338, 0x540)
	desc := field(0x540, 0xd40)
	samples := field(0xd40, 0x1540)
	return title, category, desc, samples
}

// getPyMap reads the pinyin table and returns a map from pinyin index to
// pinyin string. The first occurrence of an index wins.
//
// Reading stops after the entry "zuo", or as soon as the input is exhausted
// or an entry is truncated, so a damaged file cannot make the loop spin
// forever.
func getPyMap(b *bytes.Reader) map[uint16]string {
	pyMap := make(map[uint16]string)
	if _, err := b.Seek(pyTableOffset, io.SeekStart); err != nil {
		return pyMap
	}

	for b.Len() > 0 {
		pyIdx, ok := readUint16OK(b)
		if !ok {
			break
		}
		pyLen, ok := readUint16OK(b)
		if !ok || b.Len() < int(pyLen) {
			break
		}
		pyStr := readUtf16Str(b, -1, int(pyLen))

		if _, seen := pyMap[pyIdx]; !seen {
			pyMap[pyIdx] = pyStr
		}
		if pyStr == "zuo" {
			break
		}
	}
	return pyMap
}

// getRecords parses the word section that starts at hzOffset and returns one
// "word<TAB>pinyin" string per word, where pinyin is the space-joined list of
// syllables looked up in pyMap.
//
// Parsing ends when the read position reaches fileSize or the reader is
// exhausted. It also ends early, returning what was collected so far, when a
// pinyin index is missing from pyMap or an entry is truncated.
func getRecords(b *bytes.Reader, fileSize int64, hzOffset int64, pyMap map[uint16]string) []string {
	if _, err := b.Seek(hzOffset, io.SeekStart); err != nil {
		return nil
	}

	var records []string
	for b.Len() > 0 && b.Size()-int64(b.Len()) < fileSize {
		wordCount, ok := readUint16OK(b)
		if !ok {
			break
		}
		pyBytes, ok := readUint16OK(b)
		if !ok {
			break
		}

		// The pinyin length is given in bytes; every index takes two.
		pySet := make([]string, int(pyBytes/2))
		for i := range pySet {
			pyIdx, ok := readUint16OK(b)
			if !ok {
				return records
			}
			py, known := pyMap[pyIdx]
			if !known {
				return records
			}
			pySet[i] = py
		}
		pyStr := strings.Join(pySet, " ")

		for i := 0; i < int(wordCount); i++ {
			wordLen, ok := readUint16OK(b)
			if !ok || b.Len() < int(wordLen) {
				return records
			}
			wordStr := readUtf16Str(b, -1, int(wordLen))

			// Skip ext_len and ext, 12 bytes in total. Seeking past the
			// end is harmless: the loop condition then sees an empty reader.
			if _, err := b.Seek(extSkip, io.SeekCurrent); err != nil {
				return records
			}
			records = append(records, wordStr+"\t"+pyStr)
		}
	}
	return records
}

// parseSogouCellDict extracts all "word<TAB>pinyin" records from the raw
// bytes of a cell dictionary. It returns errUnsupportedDict when the type
// byte is not recognised.
func parseSogouCellDict(data []byte) ([]string, error) {
	b := bytes.NewReader(data)

	hzOffset, err := findHzOffset(b)
	if err != nil {
		return nil, err
	}
	pyMap := getPyMap(b)
	return getRecords(b, int64(len(data)), hzOffset, pyMap), nil
}

// getWordsFromSogouCellDict reads the file fname and returns its records.
// It prints a message and exits with status 1 when the file cannot be read or
// has an unsupported type.
func getWordsFromSogouCellDict(fname string) []string {
	data, err := ioutil.ReadFile(fname)
	exitOn("Error reading file:", err)

	words, err := parseSogouCellDict(data)
	if err != nil {
		fmt.Println(err)
		os.Exit(1)
	}
	return words
}

// save writes records to f, one per line and without a trailing newline, and
// returns a copy of the records that were written. It prints a message and
// exits with status 1 when the write fails.
func save(records []string, f *os.File) []string {
	written := make([]string, len(records))
	copy(written, records)

	if _, err := f.WriteString(strings.Join(written, "\n")); err != nil {
		fmt.Println("Error writing to file:", err)
		os.Exit(1)
	}
	return written
}

func main() {
	scelFiles, err := filepath.Glob("./scel/*.scel")
	exitOn("Error listing scel files:", err)

	dictFile := "luna_pinyin.sogou.dict.yaml"

	sogouDictNameList := make([]string, len(scelFiles))
	for i, scelFile := range scelFiles {
		sogouDictNameList[i] = fmt.Sprintf("# * %s", strings.TrimSuffix(filepath.Base(scelFile), ".scel"))
	}
	dictFileContent := []string{fmt.Sprintf(dictFileHeader, strings.Join(sogouDictNameList, "\n"))}

	outDir := "./out"
	exitOn("Error creating directory:", os.MkdirAll(outDir, os.ModePerm))

	for _, scelFile := range scelFiles {
		records := getWordsFromSogouCellDict(scelFile)
		fmt.Printf("%s: %d 个词\n", scelFile, len(records))

		// Swap the extension only; a ".scel" elsewhere in the name is kept.
		outName := strings.TrimSuffix(filepath.Base(scelFile), ".scel") + ".txt"
		f, err := os.Create(filepath.Join(outDir, outName))
		exitOn("Error creating file:", err)

		dictFileContent = append(dictFileContent, save(records, f)...)
		// Close right away instead of deferring inside the loop, so at most
		// one output file is open at a time and flush errors are noticed.
		exitOn("Error writing to file:", f.Close())

		fmt.Println(strings.Repeat("-", 80))
	}

	fmt.Printf("合并后 %s: %d 个词\n", dictFile, len(dictFileContent)-1)

	fDict, err := os.Create(filepath.Join(outDir, dictFile))
	exitOn("Error creating file:", err)

	_, err = fDict.WriteString(strings.Join(dictFileContent, "\n"))
	exitOn("Error writing to file:", err)
	exitOn("Error writing to file:", fDict.Close())
}