Add option to disable ambiguous unicode characters detection (#28454)

* Close #24483
* Close #28123
* Close #23682
* Close #23149

(maybe more)
This commit is contained in:
wxiaoguang 2023-12-17 22:38:54 +08:00 committed by GitHub
parent 408a484224
commit 20929edc99
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
17 changed files with 113 additions and 149 deletions

View file

@ -8,11 +8,12 @@
package charset
import (
"bufio"
"html/template"
"io"
"strings"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/translation"
)
@ -20,20 +21,18 @@ import (
const RuneNBSP = 0xa0
// EscapeControlHTML escapes the unicode control sequences in a provided html document
func EscapeControlHTML(text string, locale translation.Locale, allowed ...rune) (escaped *EscapeStatus, output string) {
func EscapeControlHTML(html template.HTML, locale translation.Locale, allowed ...rune) (escaped *EscapeStatus, output template.HTML) {
sb := &strings.Builder{}
outputStream := &HTMLStreamerWriter{Writer: sb}
streamer := NewEscapeStreamer(locale, outputStream, allowed...).(*escapeStreamer)
if err := StreamHTML(strings.NewReader(text), streamer); err != nil {
streamer.escaped.HasError = true
log.Error("Error whilst escaping: %v", err)
}
return streamer.escaped, sb.String()
escaped, _ = EscapeControlReader(strings.NewReader(string(html)), sb, locale, allowed...) // err has been handled in EscapeControlReader
return escaped, template.HTML(sb.String())
}
// EscapeControlReaders escapes the unicode control sequences in a provided reader of HTML content and writer in a locale and returns the findings as an EscapeStatus and the escaped []byte
// EscapeControlReader escapes the unicode control sequences in a provided reader of HTML content and writer in a locale and returns the findings as an EscapeStatus
func EscapeControlReader(reader io.Reader, writer io.Writer, locale translation.Locale, allowed ...rune) (escaped *EscapeStatus, err error) {
if !setting.UI.AmbiguousUnicodeDetection {
_, err = io.Copy(writer, reader)
return &EscapeStatus{}, err
}
outputStream := &HTMLStreamerWriter{Writer: writer}
streamer := NewEscapeStreamer(locale, outputStream, allowed...).(*escapeStreamer)
@ -43,41 +42,3 @@ func EscapeControlReader(reader io.Reader, writer io.Writer, locale translation.
}
return streamer.escaped, err
}
// EscapeControlStringReader escapes the unicode control sequences in a provided reader of string content and writer in a locale and returns the findings as an EscapeStatus and the escaped []byte. HTML line breaks are not inserted after every newline by this method.
func EscapeControlStringReader(reader io.Reader, writer io.Writer, locale translation.Locale, allowed ...rune) (escaped *EscapeStatus, err error) {
bufRd := bufio.NewReader(reader)
outputStream := &HTMLStreamerWriter{Writer: writer}
streamer := NewEscapeStreamer(locale, outputStream, allowed...).(*escapeStreamer)
for {
line, rdErr := bufRd.ReadString('\n')
if len(line) > 0 {
if err := streamer.Text(line); err != nil {
streamer.escaped.HasError = true
log.Error("Error whilst escaping: %v", err)
return streamer.escaped, err
}
}
if rdErr != nil {
if rdErr != io.EOF {
err = rdErr
}
break
}
}
return streamer.escaped, err
}
// EscapeControlString escapes the unicode control sequences in a provided string and returns the findings as an EscapeStatus and the escaped string
func EscapeControlString(text string, locale translation.Locale, allowed ...rune) (escaped *EscapeStatus, output string) {
sb := &strings.Builder{}
outputStream := &HTMLStreamerWriter{Writer: sb}
streamer := NewEscapeStreamer(locale, outputStream, allowed...).(*escapeStreamer)
if err := streamer.Text(text); err != nil {
streamer.escaped.HasError = true
log.Error("Error whilst escaping: %v", err)
}
return streamer.escaped, sb.String()
}

View file

@ -64,7 +64,7 @@ func (e *escapeStreamer) Text(data string) error {
until, next = nextIdxs[0]+pos, nextIdxs[1]+pos
}
// from pos until until we know that the runes are not \r\t\n or even ' '
// from pos until we know that the runes are not \r\t\n or even ' '
runes := make([]rune, 0, next-until)
positions := make([]int, 0, next-until+1)

View file

@ -4,11 +4,14 @@
package charset
import (
"reflect"
"strings"
"testing"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/test"
"code.gitea.io/gitea/modules/translation"
"github.com/stretchr/testify/assert"
)
type escapeControlTest struct {
@ -132,22 +135,8 @@ then resh (ר), and finally heh (ה) (which should appear leftmost).`,
},
}
func TestEscapeControlString(t *testing.T) {
for _, tt := range escapeControlTests {
t.Run(tt.name, func(t *testing.T) {
status, result := EscapeControlString(tt.text, &translation.MockLocale{})
if !reflect.DeepEqual(*status, tt.status) {
t.Errorf("EscapeControlString() status = %v, wanted= %v", status, tt.status)
}
if result != tt.result {
t.Errorf("EscapeControlString()\nresult= %v,\nwanted= %v", result, tt.result)
}
})
}
}
func TestEscapeControlReader(t *testing.T) {
// lets add some control characters to the tests
// add some control characters to the tests
tests := make([]escapeControlTest, 0, len(escapeControlTests)*3)
copy(tests, escapeControlTests)
@ -169,29 +158,20 @@ func TestEscapeControlReader(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
input := strings.NewReader(tt.text)
output := &strings.Builder{}
status, err := EscapeControlReader(input, output, &translation.MockLocale{})
result := output.String()
if err != nil {
t.Errorf("EscapeControlReader(): err = %v", err)
}
if !reflect.DeepEqual(*status, tt.status) {
t.Errorf("EscapeControlReader() status = %v, wanted= %v", status, tt.status)
}
if result != tt.result {
t.Errorf("EscapeControlReader()\nresult= %v,\nwanted= %v", result, tt.result)
}
status, err := EscapeControlReader(strings.NewReader(tt.text), output, &translation.MockLocale{})
assert.NoError(t, err)
assert.Equal(t, tt.status, *status)
assert.Equal(t, tt.result, output.String())
})
}
}
func TestEscapeControlReader_panic(t *testing.T) {
bs := make([]byte, 0, 20479)
bs = append(bs, 'A')
for i := 0; i < 6826; i++ {
bs = append(bs, []byte("—")...)
}
_, _ = EscapeControlString(string(bs), &translation.MockLocale{})
func TestSettingAmbiguousUnicodeDetection(t *testing.T) {
defer test.MockVariableValue(&setting.UI.AmbiguousUnicodeDetection, true)()
_, out := EscapeControlHTML("a test", &translation.MockLocale{})
assert.EqualValues(t, `a<span class="escaped-code-point" data-escaped="[U+00A0]"><span class="char"> </span></span>test`, out)
setting.UI.AmbiguousUnicodeDetection = false
_, out = EscapeControlHTML("a test", &translation.MockLocale{})
assert.EqualValues(t, `a test`, out)
}