Skip to content

Commit

Permalink
Refactor HtmlContentParser
Browse files Browse the repository at this point in the history
Uses `Ksoup#clean` API to clean up the content and extract text and lead image
  • Loading branch information
msasikanth committed Aug 9, 2024
1 parent 7dfa02b commit 8e4e107
Show file tree
Hide file tree
Showing 5 changed files with 91 additions and 36 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ internal object AtomContentParser : ContentParser() {

val htmlContent = HtmlContentParser.parse(htmlContent = rawContent)
if (image.isNullOrBlank() && htmlContent != null) {
image = htmlContent.imageUrl
image = htmlContent.leadImage
}

content = htmlContent?.content?.ifBlank { rawContent.trim() } ?: rawContent.trim()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,36 +17,41 @@ package dev.sasikanth.rss.reader.core.network.parser

import co.touchlab.crashkios.bugsnag.BugsnagKotlin
import com.fleeksoft.ksoup.Ksoup
import com.fleeksoft.ksoup.safety.Safelist
import io.ktor.utils.io.charsets.MalformedInputException

internal object HtmlContentParser {

// Tag names whose text is appended to the extracted content while walking the parsed document.
private val allowedContentTags = setOf("p", "span", "em", "u", "b", "i", "strong")
// Element and attribute names used when building the safelist and when scanning the cleaned
// document for the body, its images and their source URLs.
private const val TAG_BODY = "body"
private const val TAG_IMG = "img"
private const val TAG_FIGCAPTION = "figcaption"
private const val ATTR_SRC = "src"

fun parse(htmlContent: String): HtmlContent? {
// Safelist passed to Ksoup.clean: it permits only <figcaption> and <img> elements, and on <img>
// only the src attribute.
private val allowedContentTags =
Safelist().addTags(TAG_FIGCAPTION, TAG_IMG).addAttributes(TAG_IMG, ATTR_SRC)
/**
 * Matches URLs that end in `.gif`, optionally followed by a query string, ignoring case
 * (e.g. `a.gif`, `a.GIF?w=100`).
 *
 * Kotlin regex patterns are not wrapped in JavaScript-style `/.../i` delimiters: those slashes
 * and the trailing `i` would be matched as literal characters, and an escaped `$` would match a
 * literal dollar sign instead of the end of input. Case-insensitivity is therefore requested via
 * [RegexOption.IGNORE_CASE], and `$` is left as the end-of-input anchor.
 */
private val gifRegex by lazy { Regex("\\.gif(\\?.*)?\$", RegexOption.IGNORE_CASE) }

fun parse(htmlContent: String): Result? {
if (htmlContent.isBlank()) return null

return try {
val document = Ksoup.parse(htmlContent)

val imageUrl =
document
.getElementsByTag("img")
.firstOrNull { it.hasAttr("src") && !it.attr("src").endsWith(".gif") }
?.attr("src")

val contentStringBuilder = StringBuilder()
document.getAllElements().forEach { element ->
if (allowedContentTags.contains(element.tagName())) {
contentStringBuilder.append(element.text().cleanWhitespaces())
}
val cleanedHtml = Ksoup.clean(htmlContent, allowedContentTags)
val document = Ksoup.parse(cleanedHtml)
val body = document.getElementsByTag(TAG_BODY).first() ?: return null
val elements = body.children()

if (element.tagName() == "p" || element.tagName() == "br") {
contentStringBuilder.appendLine()
val leadImage =
elements.firstNotNullOfOrNull {
val imageUrl = it.attr(ATTR_SRC)
if (it.tagName() == TAG_IMG && !gifRegex.containsMatchIn(imageUrl)) {
imageUrl.removeSurrounding("\"")
} else {
null
}
}
}
val content = body.ownText()

HtmlContent(imageUrl = imageUrl, content = contentStringBuilder.toString())
Result(leadImage = leadImage, content = content)
} catch (e: Exception) {
null
} catch (e: MalformedInputException) {
Expand All @@ -55,18 +60,5 @@ internal object HtmlContentParser {
}
}

/**
 * Trims the receiver, then puts back a single space on each side that originally started or
 * ended with a whitespace character.
 *
 * An empty or whitespace-only receiver yields an empty string.
 */
private fun String.cleanWhitespaces(): String {
  val core = trim()
  if (core.isBlank()) return core

  // The receiver has at least one non-whitespace character here, so first()/last() are safe.
  val prefix = if (first().isWhitespace()) " " else ""
  val suffix = if (last().isWhitespace()) " " else ""
  return prefix + core + suffix
}

/** Parsed output: the URL of the selected image, if any, and the extracted text content. */
data class HtmlContent(val imageUrl: String?, val content: String)
/** Parsed output: the URL of the lead image, if one was found, and the extracted text content. */
data class Result(val leadImage: String?, val content: String)
}
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ internal object RDFContentParser : ContentParser() {

val htmlContent = HtmlContentParser.parse(htmlContent = rawContent)
if (image.isNullOrBlank() && htmlContent != null) {
image = htmlContent.imageUrl
image = htmlContent.leadImage
}

description = htmlContent?.content?.ifBlank { rawContent.trim() } ?: rawContent.trim()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ internal object RSSContentParser : ContentParser() {

val htmlContent = HtmlContentParser.parse(htmlContent = rawContent)
if (image.isNullOrBlank() && htmlContent != null) {
image = htmlContent.imageUrl
image = htmlContent.leadImage
}

description = htmlContent?.content?.ifBlank { rawContent.trim() } ?: rawContent.trim()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/*
* Copyright 2024 Sasikanth Miriyampalli
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package dev.sasikanth.rss.reader.core.network.parser

import kotlin.test.Test
import kotlin.test.assertEquals
import kotlin.test.assertNull

/**
 * Exercises [HtmlContentParser.parse] with an HTML article snippet and with plain text, checking
 * the lead image and the text content it returns.
 */
class HtmlContentParserTest {

  @Test
  fun parsingLeadImageAndContentFromHtmlShouldWorkCorrectly() {
    // given
    val expectedLeadImage =
      "https://cdn.vox-cdn.com/thumbor/LJt9a0BM9fnTyZtP68Ba1Mr1YDY=/150x0:1770x1080/1310x873/cdn.vox-cdn.com/uploads/chorus_image/image/73510530/ss_c5781b8f9a8181e6c989869b86d0b455ccca344a.0.jpg"
    val expectedContent =
      "If you haven’t played Doom or Doom II for a while — or ever — a new re-release that Bethesda surprise-dropped (sorta) on Thursday might be the perfect excuse to jump in to the classic games. The re-release, which combines both games into one package called Doom + Doom II and is a free update for anyone who already owns Doom (1993) or Doom II, offers a long list of great new features — including a brand new single-player episode and online, cross-platform deathmatch multiplayer. With Doom + Doom II, you’ll have access to both of those two games as well as extra single-player content like John Romero’s Sigil episode released in 2019 and Legacy of Rust, which is a new Doom episode created by “individuals from id Software, Nightdive Studios... Continue reading…"

    // when
    val parsed = HtmlContentParser.parse(TEST_HTML)

    // then
    assertEquals(expected = expectedLeadImage, actual = parsed?.leadImage)
    assertEquals(expected = expectedContent, actual = parsed?.content)
  }

  @Test
  fun parsingContentFromTextShouldWorkCorrectly() {
    // given
    val plainText = "This is a normal text"

    // when
    val parsed = HtmlContentParser.parse(plainText)

    // then
    assertNull(parsed?.leadImage)
    assertEquals(expected = "This is a normal text", actual = parsed?.content)
  }

  companion object {
    // Article snippet with a leading <figure> (image + caption) followed by three paragraphs.
    // The raw string's lines are deliberately left unindented so its contents stay unchanged.
    private const val TEST_HTML =
      """
<figure>
<img alt="A screenshot from DOOM + DOOM II." src="https://cdn.vox-cdn.com/thumbor/LJt9a0BM9fnTyZtP68Ba1Mr1YDY=/150x0:1770x1080/1310x873/cdn.vox-cdn.com/uploads/chorus_image/image/73510530/ss_c5781b8f9a8181e6c989869b86d0b455ccca344a.0.jpg"/>
<figcaption>Image: Bethesda</figcaption>
</figure>
<p id="2Z0e9a">If you haven’t played <em>Doom</em> or <em>Doom II</em> for a while — or ever — a new re-release that Bethesda <a href="https://slayersclub.bethesda.net/en-US/article/doom-doomii-release-notes?linkId=100000279162898">surprise-dropped</a> (<a href="https://x.com/Wario64/status/1821578978462699748">sorta</a>) on Thursday might be the perfect excuse to jump in to the classic games. The re-release, which combines both games into one package called <em>Doom + Doom II</em> and is a free update for anyone who already owns <em>Doom (1993)</em> or <em>Doom II</em>, offers a long list of great new features — including a brand new single-player episode and online, cross-platform deathmatch multiplayer.</p>
<p id="Pm12nB">With <em>Doom + Doom II, </em>you’ll have access to both of those two games as well as extra single-player content like John Romero’s <em>Sigil</em> episode <a href="https://romero.com/sigil">released in 2019</a> and <em>Legacy of Rust</em>, which is a new <em>Doom</em> episode created by “individuals from id Software, Nightdive Studios...</p>
<p><a href="https://www.theverge.com/2024/8/8/24216379/doom-doom-ii-definitive-re-release">Continue reading&hellip;</a> </p>
"""
  }
}

0 comments on commit 8e4e107

Please sign in to comment.