Skip to content

Commit

Permalink
Enforce order for results with same confidence
Browse files: browse the repository at this point in the history
  • Loading branch information
xWTF committed Jan 30, 2023
1 parent b7413ea commit 77849dd
Show file tree
Hide file tree
Showing 9 changed files with 46 additions and 19 deletions.
3 changes: 2 additions & 1 deletion 2022.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@ type recognizer2022 struct {
escapes [][]byte
}

// Match builds the recognizerOutput for this recognizer: Charset is r.charset,
// Confidence is the value returned by r.matchConfidence(input.input), and order
// is the caller-supplied order, stored unchanged.
//
// Diff view: the first signature line is the form being replaced; the second
// adds the order parameter.
func (r *recognizer2022) Match(input *recognizerInput) (output recognizerOutput) {
func (r *recognizer2022) Match(input *recognizerInput, order int) (output recognizerOutput) {
return recognizerOutput{
Charset: r.charset,
Confidence: r.matchConfidence(input.input),
order: order,
}
}

Expand Down
25 changes: 15 additions & 10 deletions detector.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ type Result struct {
Language string
// Confidence of the Result. Scale from 1 to 100. The bigger, the more confident.
Confidence int

// used for sorting internally
order int
}

// Detector implements charset detection.
Expand Down Expand Up @@ -87,13 +90,13 @@ var (
func (d *Detector) DetectBest(b []byte) (r *Result, err error) {
input := newRecognizerInput(b, d.stripTag)
outputChan := make(chan recognizerOutput)
for _, r := range d.recognizers {
go matchHelper(r, input, outputChan)
for i, r := range d.recognizers {
go matchHelper(r, input, outputChan, i)
}
var output Result
for i := 0; i < len(d.recognizers); i++ {
o := <-outputChan
if output.Confidence < o.Confidence {
if output.Confidence < o.Confidence || (output.Confidence == o.Confidence && o.order < output.order) {
output = Result(o)
}
}
Expand All @@ -107,8 +110,8 @@ func (d *Detector) DetectBest(b []byte) (r *Result, err error) {
func (d *Detector) DetectAll(b []byte) ([]Result, error) {
input := newRecognizerInput(b, d.stripTag)
outputChan := make(chan recognizerOutput)
for _, r := range d.recognizers {
go matchHelper(r, input, outputChan)
for i, r := range d.recognizers {
go matchHelper(r, input, outputChan, i)
}
outputs := make(recognizerOutputs, 0, len(d.recognizers))
for i := 0; i < len(d.recognizers); i++ {
Expand Down Expand Up @@ -136,12 +139,14 @@ func (d *Detector) DetectAll(b []byte) ([]Result, error) {
return dedupOutputs, nil
}

// matchHelper calls r.Match and sends the single resulting recognizerOutput on
// outputChan. DetectBest and DetectAll start it with `go`, once per entry of
// d.recognizers, passing that entry's slice index as order.
//
// Diff view: the first signature/body pair is the form being replaced; the
// second pair forwards order to Match.
func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput) {
outputChan <- r.Match(input)
func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput, order int) {
outputChan <- r.Match(input, order)
}

// recognizerOutputs is a slice of recognizerOutput carrying Len, Less and Swap
// methods, the method set of sort.Interface.
// NOTE(review): presumably sorted inside DetectAll; that call is not visible
// in this hunk — confirm.
type recognizerOutputs []recognizerOutput

// Diff view: the next three one-line methods are the forms being replaced.
// The old Less compared Confidence only, so equal-confidence elements had no
// defined relative order.
func (r recognizerOutputs) Len() int { return len(r) }
func (r recognizerOutputs) Less(i, j int) bool { return r[i].Confidence > r[j].Confidence }
func (r recognizerOutputs) Swap(i, j int) { r[i], r[j] = r[j], r[i] }
// Len returns the number of outputs.
func (r recognizerOutputs) Len() int { return len(r) }
// Less puts the higher Confidence first; when Confidence is equal, the element
// with the smaller order comes first.
func (r recognizerOutputs) Less(i, j int) bool {
return r[i].Confidence > r[j].Confidence || (r[i].Confidence == r[j].Confidence && r[i].order < r[j].order)
}
// Swap exchanges the elements at indexes i and j.
func (r recognizerOutputs) Swap(i, j int) { r[i], r[j] = r[j], r[i] }
15 changes: 14 additions & 1 deletion detector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@ package chardet_test

import (
"bytes"
"github.com/gogs/chardet"
"io"
"os"
"path/filepath"
"testing"

"github.com/gogs/chardet"
)

func TestDetector(t *testing.T) {
Expand Down Expand Up @@ -58,6 +59,18 @@ func TestDetector(t *testing.T) {
t.Errorf("Expected language %s, actual %s", d.Language, result.Language)
}
}

// "ノエル" Shift JIS encoded
result, err := textDetector.DetectAll([]byte("\x83m\x83G\x83\x8b"))
if err != nil {
t.Fatal(err)
}
if len(result) != 3 {
t.Errorf("Expected 3 results, actual %d", len(result))
}
if result[0].Charset != "Shift_JIS" || result[1].Charset != "GB18030" || result[2].Charset != "Big5" {
t.Errorf("Result order is wrong: %v", result)
}
}

func BenchmarkDetectBest(b *testing.B) {
Expand Down
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
module github.com/gogs/chardet

go 1.19
3 changes: 2 additions & 1 deletion multi_byte.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,12 @@ type charDecoder interface {
DecodeOneChar([]byte) (c uint16, remain []byte, err error)
}

// Match builds the recognizerOutput for this recognizer: Charset and Language
// come from r.charset and r.language, Confidence is the value returned by
// r.matchConfidence(input), and order is the caller-supplied order, stored
// unchanged.
//
// Diff view: the first signature line is the form being replaced; the second
// adds the order parameter.
func (r *recognizerMultiByte) Match(input *recognizerInput) (output recognizerOutput) {
func (r *recognizerMultiByte) Match(input *recognizerInput, order int) (output recognizerOutput) {
return recognizerOutput{
Charset: r.charset,
Language: r.language,
Confidence: r.matchConfidence(input),
order: order,
}
}

Expand Down
2 changes: 1 addition & 1 deletion recognizer.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package chardet

// recognizer is the interface the detector calls on each entry of
// d.recognizers. Match examines the given input and returns a
// recognizerOutput; the int argument is the order value that the
// implementations shown in this commit copy into the output's order field.
//
// Diff view: the first Match line is the form being replaced; the second adds
// the int parameter.
type recognizer interface {
Match(*recognizerInput) recognizerOutput
Match(*recognizerInput, int) recognizerOutput
}

// recognizerOutput is a defined type whose underlying type is Result's, so a
// value converts directly with Result(o), as DetectBest does.
type recognizerOutput Result
Expand Down
3 changes: 2 additions & 1 deletion single_byte.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ type recognizerSingleByte struct {
ngram *[64]uint32
}

func (r *recognizerSingleByte) Match(input *recognizerInput) recognizerOutput {
func (r *recognizerSingleByte) Match(input *recognizerInput, order int) recognizerOutput {
var charset string = r.charset
if input.hasC1Bytes && len(r.hasC1ByteCharset) > 0 {
charset = r.hasC1ByteCharset
Expand All @@ -18,6 +18,7 @@ func (r *recognizerSingleByte) Match(input *recognizerInput) recognizerOutput {
Charset: charset,
Language: r.language,
Confidence: r.parseNgram(input.input),
order: order,
}
}

Expand Down
9 changes: 6 additions & 3 deletions unicode.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,10 @@ func newRecognizer_utf16be() *recognizerUtf16be {
return &recognizerUtf16be{}
}

func (*recognizerUtf16be) Match(input *recognizerInput) (output recognizerOutput) {
func (*recognizerUtf16be) Match(input *recognizerInput, order int) (output recognizerOutput) {
output = recognizerOutput{
Charset: "UTF-16BE",
order: order,
}
if bytes.HasPrefix(input.raw, utf16beBom) {
output.Confidence = 100
Expand All @@ -35,9 +36,10 @@ func newRecognizer_utf16le() *recognizerUtf16le {
return &recognizerUtf16le{}
}

func (*recognizerUtf16le) Match(input *recognizerInput) (output recognizerOutput) {
func (*recognizerUtf16le) Match(input *recognizerInput, order int) (output recognizerOutput) {
output = recognizerOutput{
Charset: "UTF-16LE",
order: order,
}
if bytes.HasPrefix(input.raw, utf16leBom) && !bytes.HasPrefix(input.raw, utf32leBom) {
output.Confidence = 100
Expand Down Expand Up @@ -75,9 +77,10 @@ func newRecognizer_utf32le() *recognizerUtf32 {
}
}

func (r *recognizerUtf32) Match(input *recognizerInput) (output recognizerOutput) {
func (r *recognizerUtf32) Match(input *recognizerInput, order int) (output recognizerOutput) {
output = recognizerOutput{
Charset: r.name,
order: order,
}
hasBom := bytes.HasPrefix(input.raw, r.bom)
var numValid, numInvalid uint32
Expand Down
3 changes: 2 additions & 1 deletion utf8.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,10 @@ func newRecognizer_utf8() *recognizerUtf8 {
return &recognizerUtf8{}
}

func (*recognizerUtf8) Match(input *recognizerInput) (output recognizerOutput) {
func (*recognizerUtf8) Match(input *recognizerInput, order int) (output recognizerOutput) {
output = recognizerOutput{
Charset: "UTF-8",
order: order,
}
hasBom := bytes.HasPrefix(input.raw, utf8Bom)
inputLen := len(input.raw)
Expand Down

0 comments on commit 77849dd

Please sign in to comment.