feat: first implementation of Naver news crawling logic
hun-ca committed Dec 15, 2024
1 parent eec1b29 commit 03e9901
Showing 8 changed files with 250 additions and 0 deletions.
7 changes: 7 additions & 0 deletions domain-generator/build.gradle.kts
@@ -7,4 +7,11 @@ tasks.getByName("jar") {
}

dependencies {
    implementation(project(":web"))

    /** jsoup - HTML parser */
    implementation("org.jsoup:jsoup:1.15.3")

    /** JSON <-> class serializer */
    implementation("org.jetbrains.kotlinx:kotlinx-serialization-json:1.6.0") // TODO: remove once the logic moves to DB storage
}
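
Note: `@Serializable` classes also need the kotlinx-serialization compiler plugin, which is not visible in this diff and may already be applied at the root project. If it is not, a minimal sketch of the assumed plugin block would be (the version shown is an assumption, chosen to pair with kotlinx-serialization-json 1.6.0):

// build.gradle.kts — assumed plugin setup; skip if already applied at the root
plugins {
    kotlin("plugin.serialization") version "1.9.0"
}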
46 changes: 46 additions & 0 deletions domain-generator/src/main/kotlin/com/few/domain/generator/controller/CrawlerController.kt
@@ -0,0 +1,46 @@
package com.few.domain.generator.controller

import com.few.domain.generator.controller.response.ExecuteCrawlerResponse
import com.few.domain.generator.usecase.ExecuteCrawlerUseCase
import com.few.domain.generator.usecase.dto.ExecuteCrawlerUseCaseIn
import org.springframework.http.HttpStatus
import org.springframework.http.MediaType
import org.springframework.validation.annotation.Validated
import org.springframework.web.bind.annotation.GetMapping
import org.springframework.web.bind.annotation.RequestMapping
import org.springframework.web.bind.annotation.RequestParam
import org.springframework.web.bind.annotation.RestController
import web.ApiResponse
import web.ApiResponseGenerator

@Validated
@RestController
@RequestMapping(value = ["/api/v2/crawlers"], produces = [MediaType.APPLICATION_JSON_VALUE])
class CrawlerController(
    private val executeCrawlerUseCase: ExecuteCrawlerUseCase,
) {

    /**
     * Looks up crawled data that has not been posted yet.
     * If crawled-but-unposted data exists, responds with its identifiers;
     * if there is none, runs the crawler.
     */
    @GetMapping
    fun executeCrawler(
        /**
         * 100 (politics), 101 (economy), 105 (IT/science)
         */
        @RequestParam(
            required = false,
            defaultValue = "0"
        ) sid: Int,
    ): ApiResponse<ApiResponse.SuccessBody<ExecuteCrawlerResponse>> {
        val useCaseOut = executeCrawlerUseCase.execute(ExecuteCrawlerUseCaseIn(sid))

        return ApiResponseGenerator.success(
            ExecuteCrawlerResponse(useCaseOut.sid, useCaseOut.crawlingIds),
            HttpStatus.OK
        )
    }
}
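
For reference, a minimal sketch of calling the new endpoint, assuming the service runs at http://localhost:8080 (host, port, and the wrapper payload shape in the comment are assumptions, not part of this commit):

import java.net.URI
import java.net.http.HttpClient
import java.net.http.HttpRequest
import java.net.http.HttpResponse

fun main() {
    val client = HttpClient.newHttpClient()
    val request = HttpRequest.newBuilder()
        .uri(URI.create("http://localhost:8080/api/v2/crawlers?sid=105")) // sid 105 = IT/science
        .GET()
        .build()

    // Assumed payload shape, based on ExecuteCrawlerResponse:
    // {"data": {"sid": 105, "crawlingIds": ["<uuid>"]}}
    val response = client.send(request, HttpResponse.BodyHandlers.ofString())
    println(response.body())
}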
6 changes: 6 additions & 0 deletions domain-generator/src/main/kotlin/com/few/domain/generator/controller/response/ExecuteCrawlerResponse.kt
@@ -0,0 +1,6 @@
package com.few.domain.generator.controller.response

data class ExecuteCrawlerResponse(
    val sid: Int,
    val crawlingIds: List<String>,
)
121 changes: 121 additions & 0 deletions domain-generator/src/main/kotlin/com/few/domain/generator/crawler/NaverNewsCrawler.kt
@@ -0,0 +1,121 @@
package com.few.domain.generator.crawler

import io.github.oshai.kotlinlogging.KotlinLogging
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import org.springframework.stereotype.Component
import java.util.regex.Pattern
import java.io.File
import java.time.LocalDateTime
import java.time.format.DateTimeFormatter
import kotlinx.serialization.*
import kotlinx.serialization.json.*

@Component
class NaverNewsCrawler(
    private val maxPages: Int = 100,
    private val maxLinks: Int = 100,
) {
    private val log = KotlinLogging.logger {}
    private val newsLinkPattern =
        Pattern.compile("https://n\\.news\\.naver\\.com/mnews/article/\\d+/\\d+$")
    private val headers =
        mapOf("User-Agent" to "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36")

    private fun getSoup(url: String): Document {
        val connection = Jsoup.connect(url)
        headers.forEach { (key, value) ->
            connection.header(key, value)
        }
        return connection.get()
    }

    private fun makeUrl(sid: Int, page: Int) =
        "https://news.naver.com/main/main.naver?mode=LSD&mid=shm&sid1=$sid#&date=%2000:00:00&page=$page"

    fun getNaverNewsUrls(sid: Int): List<String> {
        log.info { "Collecting news links for section $sid." }
        val allLinks = mutableSetOf<String>()

        for (page in 1..maxPages) {
            val url = makeUrl(sid, page)
            val soup = getSoup(url)

            // Keep only hrefs that match the article URL pattern
            val links = soup.select("a[href]").mapNotNull { element ->
                val href = element.attr("href")
                if (newsLinkPattern.matcher(href).matches()) href else null
            }

            allLinks.addAll(links)

            if (allLinks.size >= maxLinks) {
                break
            }

            Thread.sleep(500) // 0.5-second delay between pages
        }

        return allLinks.take(maxLinks)
    }

    fun getNewsContent(url: String): NewsModel? {
        log.info { "Fetching news content: $url" }
        val soup: Document = getSoup(url)

        val title = soup.selectFirst("#title_area > span")
        val date =
            soup.selectFirst("#ct > div.media_end_head.go_trans > div.media_end_head_info.nv_notrans > div.media_end_head_info_datestamp > div:nth-child(1) > span")
        val content = soup.selectFirst("#dic_area")
        val linkElement =
            soup.selectFirst("#ct > div.media_end_head.go_trans > div.media_end_head_info.nv_notrans > div.media_end_head_info_datestamp > a.media_end_head_origin_link")
        val originalLink = linkElement?.attr("href")

        // TODO: store the raw HTML in the DB instead of a local file
        File("soup_content.txt").writeText(soup.outerHtml(), Charsets.UTF_8)

        if (title == null || date == null || content == null) {
            return null
        }

        val dateStr = date.text().trim()
        val dateParts = dateStr.split(" ")

        // Naver renders timestamps like "2024.12.15. 오후 2:30" (오전 = AM, 오후 = PM),
        // so convert the 12-hour clock to 24-hour before parsing.
        val dateTime: LocalDateTime = if (dateParts.size == 3) {
            val dateOnly = dateParts[0]
            val amPm = dateParts[1]
            val time = dateParts[2]

            val (hour, minute) = time.split(":").map { it.toInt() }
            val adjustedHour = when {
                amPm == "오후" && hour != 12 -> hour + 12
                amPm == "오전" && hour == 12 -> 0
                else -> hour
            }

            val dateTimeStr = "$dateOnly ${"%02d".format(adjustedHour)}:${"%02d".format(minute)}"
            LocalDateTime.parse(dateTimeStr, DateTimeFormatter.ofPattern("yyyy.MM.dd. HH:mm"))
        } else {
            // Fallback for timestamps already in 24-hour form
            LocalDateTime.parse(dateStr, DateTimeFormatter.ofPattern("yyyy.MM.dd. HH:mm"))
        }

        return NewsModel(
            title = title.text().trim(),
            content = content.text().trim(),
            date = dateTime,
            link = url,
            originalLink = originalLink
        )
    }

    fun saveContentAsJson(content: List<NewsModel>) {
        // Serialize the articles to JSON
        val jsonContent = Json {
            prettyPrint = true
            encodeDefaults = true
        }.encodeToString(content)

        // TODO: persist to the DB
        File("crawled_news.json").writeText(jsonContent, Charsets.UTF_8)
    }
}
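
A minimal sketch of driving the crawler by hand, outside of Spring (the parameter values are arbitrary; in the application NaverNewsCrawler is injected as a @Component):

fun main() {
    val crawler = NaverNewsCrawler(maxPages = 2, maxLinks = 5)
    val urls = crawler.getNaverNewsUrls(sid = 105) // 105 = IT/science
    val articles = urls.mapNotNull { crawler.getNewsContent(it) }
    crawler.saveContentAsJson(articles) // writes crawled_news.json until DB storage lands
}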
13 changes: 13 additions & 0 deletions domain-generator/src/main/kotlin/com/few/domain/generator/crawler/NewsModel.kt
@@ -0,0 +1,13 @@
package com.few.domain.generator.crawler

import kotlinx.serialization.KSerializer
import kotlinx.serialization.Serializable
import kotlinx.serialization.descriptors.PrimitiveKind
import kotlinx.serialization.descriptors.PrimitiveSerialDescriptor
import kotlinx.serialization.encoding.Decoder
import kotlinx.serialization.encoding.Encoder
import java.time.LocalDateTime

// kotlinx.serialization has no built-in serializer for java.time.LocalDateTime, so encode it as an ISO-8601 string.
object LocalDateTimeSerializer : KSerializer<LocalDateTime> {
    override val descriptor = PrimitiveSerialDescriptor("LocalDateTime", PrimitiveKind.STRING)
    override fun serialize(encoder: Encoder, value: LocalDateTime) = encoder.encodeString(value.toString())
    override fun deserialize(decoder: Decoder): LocalDateTime = LocalDateTime.parse(decoder.decodeString())
}

@Serializable
data class NewsModel(
    val title: String,
    val content: String,
    @Serializable(with = LocalDateTimeSerializer::class)
    val date: LocalDateTime,
    val link: String,
    val originalLink: String?,
)
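
A quick sketch of round-tripping NewsModel through kotlinx.serialization using the string-based LocalDateTime serializer above (the sample values are made up):

import kotlinx.serialization.decodeFromString
import kotlinx.serialization.encodeToString
import kotlinx.serialization.json.Json
import java.time.LocalDateTime

fun main() {
    val model = NewsModel(
        title = "Sample title",
        content = "Sample body",
        date = LocalDateTime.of(2024, 12, 15, 14, 30),
        link = "https://n.news.naver.com/mnews/article/001/0000000001",
        originalLink = null,
    )
    val json = Json { prettyPrint = true }.encodeToString(model)
    println(json) // "date" is emitted as "2024-12-15T14:30"
    check(Json.decodeFromString<NewsModel>(json) == model)
}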
46 changes: 46 additions & 0 deletions domain-generator/src/main/kotlin/com/few/domain/generator/usecase/ExecuteCrawlerUseCase.kt
@@ -0,0 +1,46 @@
package com.few.domain.generator.usecase

import com.few.domain.generator.crawler.NaverNewsCrawler
import com.few.domain.generator.crawler.NewsModel
import com.few.domain.generator.usecase.dto.ExecuteCrawlerUseCaseIn
import com.few.domain.generator.usecase.dto.ExecuteCrawlerUseCaseOut
import io.github.oshai.kotlinlogging.KotlinLogging
import org.springframework.stereotype.Component
import java.util.*

@Component
class ExecuteCrawlerUseCase(
    private val naverNewsCrawler: NaverNewsCrawler
) {
    private val log = KotlinLogging.logger {}

    // TODO: @Transactional
    fun execute(useCaseIn: ExecuteCrawlerUseCaseIn): ExecuteCrawlerUseCaseOut {
        /**
         * TODO: check the DB for crawled data that has not been posted yet.
         * If such data exists, look it up and return it;
         * if not, start crawling.
         */

        log.info { "Crawling started" }
        val newsUrls = naverNewsCrawler.getNaverNewsUrls(useCaseIn.sid)

        val results = mutableListOf<NewsModel>()
        for ((i, url) in newsUrls.withIndex()) {
            val newsData = naverNewsCrawler.getNewsContent(url)
            if (newsData != null) {
                results.add(newsData)
            }
            log.info { "Processed news ${i + 1}/${newsUrls.size}" }
            Thread.sleep(1000) // 1-second delay between articles
        }

        naverNewsCrawler.saveContentAsJson(results)
        log.info { "Crawling finished" }

        return ExecuteCrawlerUseCaseOut(
            useCaseIn.sid,
            listOf(UUID.randomUUID().toString()), // TODO: return the crawl's unique IDs once DB persistence is in place
        )
    }
}
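
And a minimal sketch of exercising the use case with manual wiring (in the application Spring injects both beans; this is only to illustrate the flow):

fun main() {
    val useCase = ExecuteCrawlerUseCase(NaverNewsCrawler(maxPages = 1, maxLinks = 3))
    val out = useCase.execute(ExecuteCrawlerUseCaseIn(sid = 100)) // 100 = politics
    println("sid=${out.sid}, crawlingIds=${out.crawlingIds}")
}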
5 changes: 5 additions & 0 deletions domain-generator/src/main/kotlin/com/few/domain/generator/usecase/dto/ExecuteCrawlerUseCaseIn.kt
@@ -0,0 +1,5 @@
package com.few.domain.generator.usecase.dto

data class ExecuteCrawlerUseCaseIn(
    val sid: Int,
)
6 changes: 6 additions & 0 deletions domain-generator/src/main/kotlin/com/few/domain/generator/usecase/dto/ExecuteCrawlerUseCaseOut.kt
@@ -0,0 +1,6 @@
package com.few.domain.generator.usecase.dto

data class ExecuteCrawlerUseCaseOut(
    val sid: Int,
    val crawlingIds: List<String>,
)
