JohannesKaufmann · JohannesKaufmann · Dec 25, 2024 · Dec 25, 2024 · Dec 25, 2024 · Dec 25, 2024
diff --git a/internal/textutils/consecutive_newlines.go b/internal/textutils/consecutive_newlines.go
@@ -2,13 +2,51 @@ package textutils
 
 import (
 	"unicode/utf8"
-
-	"github.com/JohannesKaufmann/html-to-markdown/v2/marker"
 )
 
+// TrimConsecutiveNewlines collapses runs of three or more newlines down to
+// two. Spaces (' ') between newlines do not interrupt a run: spaces in front
+// of a kept newline are preserved, spaces in front of a dropped newline are
+// dropped together with it. Any other rune ends the run.
+//
+// An empty input yields nil; any other input yields a freshly built slice.
+func TrimConsecutiveNewlines(input []byte) []byte {
+	var result []byte
+	newlineCount := 0
+
+	// Pending spaces are never copied aside: a run of spaces is contiguous in
+	// input, so input[spaceStart:i] always holds exactly the spaces seen since
+	// the last newline or other rune. This avoids a second buffer (and its
+	// allocation) for every input that contains a space.
+	spaceStart := 0
+
+	for i := 0; i < len(input); {
+		r, size := utf8.DecodeRune(input[i:])
+
+		switch r {
+		case '\n':
+			newlineCount++
+			if newlineCount <= 2 {
+				// Preserve up to 2 newlines, including preceding spaces.
+				result = append(result, input[spaceStart:i]...)
+				result = append(result, '\n')
+			}
+			// Kept or skipped, the pending spaces are consumed either way.
+			spaceStart = i + size
+		case ' ':
+			// Leave the space pending until we know what follows it.
+		default:
+			// Any other rune ends the newline run; emit it after its spaces.
+			newlineCount = 0
+			result = append(result, input[spaceStart:i+size]...)
+			spaceStart = i + size
+		}
+
+		i += size
+	}
+
+	// Append any trailing spaces.
+	return append(result, input[spaceStart:]...)
+}
+
+/*
 func TrimConsecutiveNewlines(source []byte) []byte {
 	// Some performance optimizations:
-	// - If no replacement was done, we return the original slice and dont allocate.
+	// - If no replacement was done, we return the original slice and don't allocate.
 	// - We batch appends
 
 	var ret []byte
@@ -22,7 +60,7 @@ func TrimConsecutiveNewlines(source []byte) []byte {
 		r, size := utf8.DecodeRune(source[i:])
 		_ = size
 
-		isNewline := r == '\n' || r == marker.MarkerLineBreak
+		isNewline := r == '\n' // || r == marker.MarkerLineBreak
 		if isNewline {
 			count += 1
 		}
@@ -82,9 +120,10 @@ func TrimConsecutiveNewlines(source []byte) []byte {
 	}
 
 	if ret == nil {
-		// Huray, we did not do any allocations with make()
+		// Hurray, we did not do any allocations with make()
 		// and instead just return the original slice.
 		return source
 	}
 	return ret
 }
+*/
diff --git a/internal/textutils/consecutive_newlines_test.go b/internal/textutils/consecutive_newlines_test.go
@@ -6,107 +6,69 @@ import (
 )
 
 func TestTrimConsecutiveNewlines(t *testing.T) {
-	runs := []struct {
-		desc     string
-		input    []byte
-		expected []byte
+	tests := []struct {
+		name     string
+		input    string
+		expected string
 	}{
-		{
-			desc:     "empty",
-			input:    []byte(""),
-			expected: []byte(""),
-		},
-		{
-			desc:     "not needed",
-			input:    []byte("normal text"),
-			expected: []byte("normal text"),
-		},
-		{
-			desc:     "also not needed",
-			input:    []byte("normal\n\ntext"),
-			expected: []byte("normal\n\ntext"),
-		},
+		{"empty string", "", ""},
+		{"single char", "a", "a"},
+		{"simple text", "hello", "hello"},
+		{"normal text without newlines", "hello  this is a   normal text", "hello  this is a   normal text"},
 
-		{
-			desc:     "just two newlines",
-			input:    []byte("\n\n"),
-			expected: []byte("\n\n"),
-		},
-		{
-			desc:     "just three newlines",
-			input:    []byte("\n\n\n"),
-			expected: []byte("\n\n"),
-		},
-		{
-			desc:     "just four newlines",
-			input:    []byte("\n\n\n\n"),
-			expected: []byte("\n\n"),
-		},
+		// Single newline cases
+		{"single newline", "a\nb", "a\nb"},
+		{"single newline with spaces", "a  \nb", "a  \nb"},
+		{"spaces after newline", "a\n  b", "a\n  b"},
 
-		{
-			desc:     "newlines before",
-			input:    []byte("\n\n\ntext"),
-			expected: []byte("\n\ntext"),
-		},
-		{
-			desc:     "newlines after",
-			input:    []byte("text\n\n\n"),
-			expected: []byte("text\n\n"),
-		},
-		{
-			desc:     "newlines before and after",
-			input:    []byte("\n\n\ntext\n\n\n"),
-			expected: []byte("\n\ntext\n\n"),
-		},
-		{
-			desc:     "newlines between",
-			input:    []byte("before\n\n\nafter"),
-			expected: []byte("before\n\nafter"),
-		},
-		{
-			desc:     "newlines between multiple times",
-			input:    []byte("1\n\n\n2\n\n\n3"),
-			expected: []byte("1\n\n2\n\n3"),
-		},
+		// Double newline cases
+		{"double newline", "a\n\nb", "a\n\nb"},
+		{"double newline with spaces", "a  \n\nb", "a  \n\nb"},
+		{"spaces between newlines", "a\n  \nb", "a\n  \nb"},
+		{"spaces after double newline", "a\n\n  b", "a\n\n  b"},
 
-		{
-			desc:     "not needed the first time",
-			input:    []byte("abc\n\nabc\n\n\nabc"),
-			expected: []byte("abc\n\nabc\n\nabc"),
-		},
-		{
-			desc:     "not needed the second time",
-			input:    []byte("abc\n\n\nabc\n\nabc"),
-			expected: []byte("abc\n\nabc\n\nabc"),
-		},
+		// Triple+ newline cases
+		{"triple newline", "a\n\n\nb", "a\n\nb"},
+		{"quad newline", "a\n\n\n\nb", "a\n\nb"},
+		{"triple newline with spaces", "a  \n\n\nb", "a  \n\nb"},
 
-		{
-			desc:     "with special characters",
-			input:    []byte("äöü\n\n\näöü"),
-			expected: []byte("äöü\n\näöü"),
-		},
-		{
-			desc:     "space at end",
-			input:    []byte("a\n\n\nb "),
-			expected: []byte("a\n\nb "),
-		},
-		{
-			desc:     "one newline at end",
-			input:    []byte("a\n\n\nb\n"),
-			expected: []byte("a\n\nb\n"),
-		},
-		{
-			desc:     "two newlines at end",
-			input:    []byte("a\n\n\nb\n\n"),
-			expected: []byte("a\n\nb\n\n"),
-		},
+		// Multiple segment cases
+		{"multiple segments", "a\n\nb\n\nc", "a\n\nb\n\nc"},
+		{"multiple segments with spaces", "a  \n\nb  \n\nc", "a  \n\nb  \n\nc"},
+
+		// Spaces at end of line
+		{"hard-line-break followed by text", "a  \nb", "a  \nb"},
+		{"hard-line-break followed by newline", "a  \n\nb", "a  \n\nb"},
+
+		// Edge cases
+		{"only newlines", "\n\n\n", "\n\n"},
+		{"only spaces", "   ", "   "},
+
+		{"leading and trailing newlines", "\n\n\ntext\n\n\n", "\n\ntext\n\n"},
+		{"newlines and spaces", "  \n  \n  \n  \n  ", "  \n  \n  "},
+
+		{"leading spaces", "   a", "   a"},
+		{"leading newline 1", "\na", "\na"},
+		{"leading newline 2", "\n\na", "\n\na"},
+		{"leading newline 3", "\n\n\na", "\n\na"},
+
+		{"trailing spaces", "a   ", "a   "},
+		{"trailing newline 1", "a\n", "a\n"},
+		{"trailing newlines 2", "a\n\n", "a\n\n"},
+		{"trailing newlines 3", "a\n\n\n", "a\n\n"},
+
+		// UTF-8 cases
+		{"german special chars", "äöü\n\n\näöü", "äöü\n\näöü"},
+		{"utf8 chars", "🌟\n\n\n🌟\n\n\n🌟", "🌟\n\n🌟\n\n🌟"},
 	}
 
-	for _, run := range runs {
-		t.Run(run.desc, func(t *testing.T) {
-			output := TrimConsecutiveNewlines(run.input)
-			if !bytes.Equal(output, run.expected) {
-				t.Errorf("expected %q but got %q", string(run.expected), string(output))
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := string(TrimConsecutiveNewlines([]byte(tt.input)))
+			if got != tt.expected {
+				t.Errorf("\ninput:    %q\nexpected: %q\ngot:      %q",
+					tt.input, tt.expected, got,
+				)
 			}
 		})
 	}
@@ -115,31 +77,43 @@ func TestTrimConsecutiveNewlines(t *testing.T) {
 func TestTrimConsecutiveNewlines_Allocs(t *testing.T) {
 	const N = 1000
 
-	avg := testing.AllocsPerRun(N, func() {
-		input := []byte("abc")
-		output := TrimConsecutiveNewlines(input)
-		_ = output
-	})
-	if avg != 0 {
-		t.Errorf("with no newlines there should be no allocations but got %f", avg)
-	}
+	var avg float64
+	/*
+		avg = testing.AllocsPerRun(N, func() {
+			input := []byte("abc")
+			output := TrimConsecutiveNewlines(input)
+			_ = output
+		})
+		if avg != 0 {
+			t.Errorf("with no newlines there should be no allocations but got %f", avg)
+		}
+
+		avg = testing.AllocsPerRun(N, func() {
+			input := []byte("abc\n\nabc")
+			output := TrimConsecutiveNewlines(input)
+			_ = output
+		})
+		if avg != 0 {
+			t.Errorf("with only two newlines there should be no allocations but got %f", avg)
+		}
+	*/
 
 	avg = testing.AllocsPerRun(N, func() {
-		input := []byte("abc\n\nabc")
+		input := []byte("abc\n\n\nabc")
 		output := TrimConsecutiveNewlines(input)
 		_ = output
 	})
-	if avg != 0 {
-		t.Errorf("with only two newlines there should be no allocations but got %f", avg)
+	if avg != 1 {
+		t.Errorf("with three newlines there should be 1 allocation but got %f", avg)
 	}
 
 	avg = testing.AllocsPerRun(N, func() {
-		input := []byte("abc\n\n\nabc")
+		input := []byte("abc\n\n\n\n\n\nabc\n\n\n\n\n\nabc\n\n\n\n\n\nabc\n\n\n\n\n\nabc\n\n\n\n\n\nabc")
 		output := TrimConsecutiveNewlines(input)
 		_ = output
 	})
-	if avg != 1 {
-		t.Errorf("with trhee newlines there should be 1 allocation but got %f", avg)
+	if avg != 3 {
+		t.Errorf("with many newlines there should be 3 allocation but got %f", avg)
 	}
 }
 

diff --git a/internal/textutils/escape_multiline.go b/internal/textutils/escape_multiline.go
@@ -1,39 +1,5 @@
 package textutils
 
-import (
-	"bytes"
-
-	"github.com/JohannesKaufmann/html-to-markdown/v2/marker"
-)
-
-var newline = []byte{'\n'}
-var escape = []byte{'\\'}
-
-func EscapeMultiLine(content []byte) []byte {
-	content = bytes.TrimSpace(content)
-	content = TrimConsecutiveNewlines(content)
-	if len(content) == 0 {
-		return content
-	}
-
-	parts := marker.SplitFunc(content, func(r rune) bool {
-		return r == '\n' || r == marker.MarkerLineBreak
-	})
-
-	for i := range parts {
-		parts[i] = bytes.TrimSpace(parts[i])
-		if len(parts[i]) == 0 {
-			parts[i] = escape
-		}
-	}
-	content = bytes.Join(parts, newline)
-
-	return content
-}
-
-/*
-// TODO: use this optimized function again after integrating the marker.MarkerLineBreak changes
-
 // EscapeMultiLine deals with multiline content inside a link or a heading.
 func EscapeMultiLine(content []byte) []byte {
 	content = TrimConsecutiveNewlines(content)
@@ -74,4 +40,3 @@ func EscapeMultiLine(content []byte) []byte {
 
 	return newContent
 }
-*/
diff --git a/internal/textutils/escape_multiline_test.go b/internal/textutils/escape_multiline_test.go
@@ -6,6 +6,9 @@ import (
 	"testing"
 )
 
+var newline = []byte{'\n'}
+var escape = []byte{'\\'}
+
 func EscapeMultiLine_Old(content []byte) []byte {
 	content = bytes.TrimSpace(content)
 	content = TrimConsecutiveNewlines(content)
@@ -115,7 +118,6 @@ line4`,
 					t.Errorf("expected '%s' but got '%s'", test.Expected, string(output))
 				}
 			})
-
 		})
 
 	}

diff --git a/internal/textutils/surrounding_spaces.go b/internal/textutils/surrounding_spaces.go
@@ -41,18 +41,16 @@ package textutils
 import (
 	"bytes"
 	"unicode"
-
-	"github.com/JohannesKaufmann/html-to-markdown/v2/marker"
 )
 
 func SurroundingSpaces(content []byte) ([]byte, []byte, []byte) {
 	rightTrimmed := bytes.TrimRightFunc(content, func(r rune) bool {
-		return unicode.IsSpace(r) || r == marker.MarkerLineBreak
+		return unicode.IsSpace(r)
 	})
 	rightExtra := content[len(rightTrimmed):]
 
 	trimmed := bytes.TrimLeftFunc(rightTrimmed, func(r rune) bool {
-		return unicode.IsSpace(r) || r == marker.MarkerLineBreak
+		return unicode.IsSpace(r)
 	})
 	leftExtra := content[0 : len(rightTrimmed)-len(trimmed)]