Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improve-hard-line-break #130

Merged
merged 6 commits into from
Dec 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 44 additions & 5 deletions internal/textutils/consecutive_newlines.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,51 @@ package textutils

import (
"unicode/utf8"

"github.com/JohannesKaufmann/html-to-markdown/v2/marker"
)

// TrimConsecutiveNewlines returns a copy of input in which every run of
// newlines is capped at two.
//
// Spaces (only the ' ' character) are held back until the next rune is known:
//   - before a kept newline they are written out, so trailing spaces at the
//     end of a line survive;
//   - before a dropped newline (the third or later in a run) they are
//     discarded together with that newline;
//   - before any other rune, or at the end of input, they are written out.
//
// Spaces do not interrupt a run of newlines; only a rune that is neither a
// newline nor a space resets the run. All other bytes, including tabs and
// invalid UTF-8, are copied through unchanged.
func TrimConsecutiveNewlines(input []byte) []byte {
	var (
		out     []byte
		pending []byte // spaces seen since the last byte written to out
		run     int    // newlines seen since the last ordinary rune
	)

	for pos := 0; pos < len(input); {
		r, width := utf8.DecodeRune(input[pos:])
		chunk := input[pos : pos+width]
		pos += width

		switch r {
		case '\n':
			run++
			if run <= 2 {
				// Within the limit: keep the newline and the spaces before it.
				out = append(out, pending...)
				out = append(out, '\n')
			}
			// Kept or dropped, the held-back spaces are consumed here.
			pending = pending[:0]
		case ' ':
			pending = append(pending, chunk...)
		default:
			// Ordinary content ends the current newline run.
			run = 0
			out = append(out, pending...)
			out = append(out, chunk...)
			pending = pending[:0]
		}
	}

	// Whatever spaces are still held back belong at the very end.
	return append(out, pending...)
}

/*
func TrimConsecutiveNewlines(source []byte) []byte {
// Some performance optimizations:
// - If no replacement was done, we return the original slice and dont allocate.
// - If no replacement was done, we return the original slice and don't allocate.
// - We batch appends

var ret []byte
Expand All @@ -22,7 +60,7 @@ func TrimConsecutiveNewlines(source []byte) []byte {
r, size := utf8.DecodeRune(source[i:])
_ = size

isNewline := r == '\n' || r == marker.MarkerLineBreak
isNewline := r == '\n' // || r == marker.MarkerLineBreak
if isNewline {
count += 1
}
Expand Down Expand Up @@ -82,9 +120,10 @@ func TrimConsecutiveNewlines(source []byte) []byte {
}

if ret == nil {
// Huray, we did not do any allocations with make()
// Hurray, we did not do any allocations with make()
// and instead just return the original slice.
return source
}
return ret
}
*/
190 changes: 82 additions & 108 deletions internal/textutils/consecutive_newlines_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,107 +6,69 @@ import (
)

func TestTrimConsecutiveNewlines(t *testing.T) {
runs := []struct {
desc string
input []byte
expected []byte
tests := []struct {
name string
input string
expected string
}{
{
desc: "empty",
input: []byte(""),
expected: []byte(""),
},
{
desc: "not needed",
input: []byte("normal text"),
expected: []byte("normal text"),
},
{
desc: "also not needed",
input: []byte("normal\n\ntext"),
expected: []byte("normal\n\ntext"),
},
{"empty string", "", ""},
{"single char", "a", "a"},
{"simple text", "hello", "hello"},
{"normal text without newlines", "hello this is a normal text", "hello this is a normal text"},

{
desc: "just two newlines",
input: []byte("\n\n"),
expected: []byte("\n\n"),
},
{
desc: "just three newlines",
input: []byte("\n\n\n"),
expected: []byte("\n\n"),
},
{
desc: "just four newlines",
input: []byte("\n\n\n\n"),
expected: []byte("\n\n"),
},
// Single newline cases
{"single newline", "a\nb", "a\nb"},
{"single newline with spaces", "a \nb", "a \nb"},
{"spaces after newline", "a\n b", "a\n b"},

{
desc: "newlines before",
input: []byte("\n\n\ntext"),
expected: []byte("\n\ntext"),
},
{
desc: "newlines after",
input: []byte("text\n\n\n"),
expected: []byte("text\n\n"),
},
{
desc: "newlines before and after",
input: []byte("\n\n\ntext\n\n\n"),
expected: []byte("\n\ntext\n\n"),
},
{
desc: "newlines between",
input: []byte("before\n\n\nafter"),
expected: []byte("before\n\nafter"),
},
{
desc: "newlines between multiple times",
input: []byte("1\n\n\n2\n\n\n3"),
expected: []byte("1\n\n2\n\n3"),
},
// Double newline cases
{"double newline", "a\n\nb", "a\n\nb"},
{"double newline with spaces", "a \n\nb", "a \n\nb"},
{"spaces between newlines", "a\n \nb", "a\n \nb"},
{"spaces after double newline", "a\n\n b", "a\n\n b"},

{
desc: "not needed the first time",
input: []byte("abc\n\nabc\n\n\nabc"),
expected: []byte("abc\n\nabc\n\nabc"),
},
{
desc: "not needed the second time",
input: []byte("abc\n\n\nabc\n\nabc"),
expected: []byte("abc\n\nabc\n\nabc"),
},
// Triple+ newline cases
{"triple newline", "a\n\n\nb", "a\n\nb"},
{"quad newline", "a\n\n\n\nb", "a\n\nb"},
{"triple newline with spaces", "a \n\n\nb", "a \n\nb"},

{
desc: "with special characters",
input: []byte("äöü\n\n\näöü"),
expected: []byte("äöü\n\näöü"),
},
{
desc: "space at end",
input: []byte("a\n\n\nb "),
expected: []byte("a\n\nb "),
},
{
desc: "one newline at end",
input: []byte("a\n\n\nb\n"),
expected: []byte("a\n\nb\n"),
},
{
desc: "two newlines at end",
input: []byte("a\n\n\nb\n\n"),
expected: []byte("a\n\nb\n\n"),
},
// Multiple segment cases
{"multiple segments", "a\n\nb\n\nc", "a\n\nb\n\nc"},
{"multiple segments with spaces", "a \n\nb \n\nc", "a \n\nb \n\nc"},

// Spaces at end of line
{"hard-line-break followed by text", "a \nb", "a \nb"},
{"hard-line-break followed by newline", "a \n\nb", "a \n\nb"},

// Edge cases
{"only newlines", "\n\n\n", "\n\n"},
{"only spaces", " ", " "},

{"leading and trailing newlines", "\n\n\ntext\n\n\n", "\n\ntext\n\n"},
{"newlines and spaces", " \n \n \n \n ", " \n \n "},

{"leading spaces", " a", " a"},
{"leading newline 1", "\na", "\na"},
{"leading newline 2", "\n\na", "\n\na"},
{"leading newline 3", "\n\n\na", "\n\na"},

{"trailing spaces", "a ", "a "},
{"trailing newline 1", "a\n", "a\n"},
{"trailing newlines 2", "a\n\n", "a\n\n"},
{"trailing newlines 3", "a\n\n\n", "a\n\n"},

// UTF-8 cases
{"german special chars", "äöü\n\n\näöü", "äöü\n\näöü"},
{"utf8 chars", "🌟\n\n\n🌟\n\n\n🌟", "🌟\n\n🌟\n\n🌟"},
}

for _, run := range runs {
t.Run(run.desc, func(t *testing.T) {
output := TrimConsecutiveNewlines(run.input)
if !bytes.Equal(output, run.expected) {
t.Errorf("expected %q but got %q", string(run.expected), string(output))
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := string(TrimConsecutiveNewlines([]byte(tt.input)))
if got != tt.expected {
t.Errorf("\ninput: %q\nexpected: %q\ngot: %q",
tt.input, tt.expected, got,
)
}
})
}
Expand All @@ -115,31 +77,43 @@ func TestTrimConsecutiveNewlines(t *testing.T) {
func TestTrimConsecutiveNewlines_Allocs(t *testing.T) {
const N = 1000

avg := testing.AllocsPerRun(N, func() {
input := []byte("abc")
output := TrimConsecutiveNewlines(input)
_ = output
})
if avg != 0 {
t.Errorf("with no newlines there should be no allocations but got %f", avg)
}
var avg float64
/*
avg = testing.AllocsPerRun(N, func() {
input := []byte("abc")
output := TrimConsecutiveNewlines(input)
_ = output
})
if avg != 0 {
t.Errorf("with no newlines there should be no allocations but got %f", avg)
}

avg = testing.AllocsPerRun(N, func() {
input := []byte("abc\n\nabc")
output := TrimConsecutiveNewlines(input)
_ = output
})
if avg != 0 {
t.Errorf("with only two newlines there should be no allocations but got %f", avg)
}
*/

avg = testing.AllocsPerRun(N, func() {
input := []byte("abc\n\nabc")
input := []byte("abc\n\n\nabc")
output := TrimConsecutiveNewlines(input)
_ = output
})
if avg != 0 {
t.Errorf("with only two newlines there should be no allocations but got %f", avg)
if avg != 1 {
t.Errorf("with three newlines there should be 1 allocation but got %f", avg)
}

avg = testing.AllocsPerRun(N, func() {
input := []byte("abc\n\n\nabc")
input := []byte("abc\n\n\n\n\n\nabc\n\n\n\n\n\nabc\n\n\n\n\n\nabc\n\n\n\n\n\nabc\n\n\n\n\n\nabc")
output := TrimConsecutiveNewlines(input)
_ = output
})
if avg != 1 {
t.Errorf("with trhee newlines there should be 1 allocation but got %f", avg)
if avg != 3 {
t.Errorf("with many newlines there should be 3 allocation but got %f", avg)
}
}

Expand Down
35 changes: 0 additions & 35 deletions internal/textutils/escape_multiline.go
Original file line number Diff line number Diff line change
@@ -1,39 +1,5 @@
package textutils

import (
"bytes"

"github.com/JohannesKaufmann/html-to-markdown/v2/marker"
)

var newline = []byte{'\n'}
var escape = []byte{'\\'}

// EscapeMultiLine normalizes multi-line content line by line.
//
// The content is trimmed of surrounding whitespace and passed through
// TrimConsecutiveNewlines. If nothing remains, it is returned as is.
// Otherwise it is split on '\n' and marker.MarkerLineBreak, each resulting
// line is trimmed, a line that ends up empty is replaced by a single
// backslash, and the lines are joined back together with '\n'.
func EscapeMultiLine(content []byte) []byte {
	content = TrimConsecutiveNewlines(bytes.TrimSpace(content))
	if len(content) == 0 {
		return content
	}

	isLineSeparator := func(r rune) bool {
		return r == '\n' || r == marker.MarkerLineBreak
	}

	lines := marker.SplitFunc(content, isLineSeparator)
	for idx, line := range lines {
		trimmed := bytes.TrimSpace(line)
		if len(trimmed) == 0 {
			// An empty line is represented by a lone backslash.
			trimmed = escape
		}
		lines[idx] = trimmed
	}

	return bytes.Join(lines, newline)
}

/*
// TODO: use this optimized function again after integrating the marker.MarkerLineBreak changes

// EscapeMultiLine deals with multiline content inside a link or a heading.
func EscapeMultiLine(content []byte) []byte {
content = TrimConsecutiveNewlines(content)
Expand Down Expand Up @@ -74,4 +40,3 @@ func EscapeMultiLine(content []byte) []byte {

return newContent
}
*/
4 changes: 3 additions & 1 deletion internal/textutils/escape_multiline_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ import (
"testing"
)

var newline = []byte{'\n'}
var escape = []byte{'\\'}

func EscapeMultiLine_Old(content []byte) []byte {
content = bytes.TrimSpace(content)
content = TrimConsecutiveNewlines(content)
Expand Down Expand Up @@ -115,7 +118,6 @@ line4`,
t.Errorf("expected '%s' but got '%s'", test.Expected, string(output))
}
})

})

}
Expand Down
6 changes: 2 additions & 4 deletions internal/textutils/surrounding_spaces.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,18 +41,16 @@ package textutils
import (
"bytes"
"unicode"

"github.com/JohannesKaufmann/html-to-markdown/v2/marker"
)

func SurroundingSpaces(content []byte) ([]byte, []byte, []byte) {
rightTrimmed := bytes.TrimRightFunc(content, func(r rune) bool {
return unicode.IsSpace(r) || r == marker.MarkerLineBreak
return unicode.IsSpace(r)
})
rightExtra := content[len(rightTrimmed):]

trimmed := bytes.TrimLeftFunc(rightTrimmed, func(r rune) bool {
return unicode.IsSpace(r) || r == marker.MarkerLineBreak
return unicode.IsSpace(r)
})
leftExtra := content[0 : len(rightTrimmed)-len(trimmed)]

Expand Down
Loading
Loading