Fix chardet test and add ordering option (#11621)
* Fix chardet test and add ordering option
  Signed-off-by: Andrew Thornton <art27@cantab.net>
* minor fixes
  Signed-off-by: Andrew Thornton <art27@cantab.net>
* remove log
  Signed-off-by: Andrew Thornton <art27@cantab.net>
* remove log2
  Signed-off-by: Andrew Thornton <art27@cantab.net>
* only iterate through top results
  Signed-off-by: Andrew Thornton <art27@cantab.net>
* Update docs/content/doc/advanced/config-cheat-sheet.en-us.md
* slight restructure of for loop
  Signed-off-by: Andrew Thornton <art27@cantab.net>
Co-authored-by: techknowlogick <techknowlogick@gitea.io>
This commit is contained in:
parent
fe2cacf5ea
commit
a1ad188326
5 changed files with 117 additions and 6 deletions
|
@ -7,6 +7,7 @@ package charset
|
|||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
|
||||
"code.gitea.io/gitea/modules/log"
|
||||
|
@ -137,16 +138,42 @@ func DetectEncoding(content []byte) (string, error) {
|
|||
} else {
|
||||
detectContent = content
|
||||
}
|
||||
result, err := textDetector.DetectBest(detectContent)
|
||||
|
||||
// Now we can't use DetectBest or just results[0] because the result isn't stable - so we need a tie break
|
||||
results, err := textDetector.DetectAll(detectContent)
|
||||
if err != nil {
|
||||
if err == chardet.NotDetectedError && len(setting.Repository.AnsiCharset) > 0 {
|
||||
log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset)
|
||||
return setting.Repository.AnsiCharset, nil
|
||||
}
|
||||
return "", err
|
||||
}
|
||||
|
||||
topConfidence := results[0].Confidence
|
||||
topResult := results[0]
|
||||
priority, has := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(topResult.Charset))]
|
||||
for _, result := range results {
|
||||
// As results are sorted in confidence order - if we have a different confidence
|
||||
// we know it's less than the current confidence and can break out of the loop early
|
||||
if result.Confidence != topConfidence {
|
||||
break
|
||||
}
|
||||
|
||||
// Otherwise check if this result is earlier in the DetectedCharsetOrder than our current top guess
|
||||
resultPriority, resultHas := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(result.Charset))]
|
||||
if resultHas && (!has || resultPriority < priority) {
|
||||
topResult = result
|
||||
priority = resultPriority
|
||||
has = true
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: to properly decouple this function the fallback ANSI charset should be passed as an argument
|
||||
if result.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 {
|
||||
if topResult.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 {
|
||||
log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset)
|
||||
return setting.Repository.AnsiCharset, err
|
||||
}
|
||||
|
||||
log.Debug("Detected encoding: %s", result.Charset)
|
||||
return result.Charset, err
|
||||
log.Debug("Detected encoding: %s", topResult.Charset)
|
||||
return topResult.Charset, err
|
||||
}
|
||||
|
|
|
@ -230,7 +230,11 @@ func TestDetectEncoding(t *testing.T) {
|
|||
// we accept either.
|
||||
assert.Contains(t, encoding, "ISO-8859")
|
||||
|
||||
old := setting.Repository.AnsiCharset
|
||||
setting.Repository.AnsiCharset = "placeholder"
|
||||
defer func() {
|
||||
setting.Repository.AnsiCharset = old
|
||||
}()
|
||||
testSuccess(b, "placeholder")
|
||||
|
||||
// invalid bytes
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue