-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
250 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
46 changes: 46 additions & 0 deletions
46
domain-generator/src/main/kotlin/com/few/domain/generator/controller/CrawlerController.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
package com.few.domain.generator.controller | ||
|
||
import com.few.domain.generator.controller.response.ExecuteCrawlerResponse | ||
import com.few.domain.generator.usecase.ExecuteCrawlerUseCase | ||
import com.few.domain.generator.usecase.dto.ExecuteCrawlerUseCaseIn | ||
import org.springframework.http.HttpStatus | ||
import org.springframework.http.MediaType | ||
import org.springframework.validation.annotation.Validated | ||
import org.springframework.web.bind.annotation.GetMapping | ||
import org.springframework.web.bind.annotation.RequestMapping | ||
import org.springframework.web.bind.annotation.RequestParam | ||
import org.springframework.web.bind.annotation.RestController | ||
import web.ApiResponse | ||
import web.ApiResponseGenerator | ||
|
||
/**
 * REST entry point for the news crawler, mounted at `/api/v2/crawlers`.
 *
 * All work is delegated to [ExecuteCrawlerUseCase]; this class only converts
 * between HTTP parameters/responses and the use case's in/out DTOs.
 */
@Validated
@RestController
@RequestMapping(value = ["/api/v2/crawlers"], produces = [MediaType.APPLICATION_JSON_VALUE])
class CrawlerController(
    private val executeCrawlerUseCase: ExecuteCrawlerUseCase,
) {

    /**
     * Looks up crawled data that has not been posted yet.
     *
     * Intended contract (from the original author's note): when crawled-but-unposted
     * data exists, its identifiers are returned; when none exists, a crawl is performed.
     *
     * @param sid news section code, passed through to the use case unchanged.
     *   The original note lists 100 (politics), 10 (economy) and 105 (IT/science).
     *   NOTE(review): "10" for economy is presumably a typo for 101 — confirm.
     *   Defaults to 0 when the query parameter is omitted.
     * @return a success envelope (HTTP 200) wrapping the section code and crawl ids
     *   reported by the use case.
     */
    @GetMapping
    fun executeCrawler(
        @RequestParam(required = false, defaultValue = "0") sid: Int,
    ): ApiResponse<ApiResponse.SuccessBody<ExecuteCrawlerResponse>> {
        val request = ExecuteCrawlerUseCaseIn(sid)
        val result = executeCrawlerUseCase.execute(request)
        val body = ExecuteCrawlerResponse(result.sid, result.crawlingIds)

        return ApiResponseGenerator.success(body, HttpStatus.OK)
    }
}
6 changes: 6 additions & 0 deletions
6
...or/src/main/kotlin/com/few/domain/generator/controller/response/ExecuteCrawlerResponse.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
package com.few.domain.generator.controller.response | ||
|
||
/**
 * Response body of the crawler endpoint.
 *
 * @property sid news section code the crawl was executed for.
 * @property crawlingId identifiers of the crawl results (a list, despite the singular name).
 */
data class ExecuteCrawlerResponse(
    val sid: Int,
    val crawlingId: List<String>,
)
121 changes: 121 additions & 0 deletions
121
domain-generator/src/main/kotlin/com/few/domain/generator/crawler/NaverNewsCrawler.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
package com.few.domain.generator.crawler | ||
|
||
import io.github.oshai.kotlinlogging.KotlinLogging | ||
import org.jsoup.Jsoup | ||
import org.jsoup.nodes.Document | ||
import org.springframework.stereotype.Component | ||
import java.util.regex.Pattern | ||
import java.io.File | ||
import java.time.LocalDateTime | ||
import java.time.format.DateTimeFormatter | ||
import kotlinx.serialization.* | ||
import kotlinx.serialization.json.* | ||
|
||
/**
 * Crawls Naver News: collects article links from a section's listing pages and
 * extracts title, body, timestamp and original link from each article page.
 *
 * @param maxPages upper bound on the number of listing pages fetched per section.
 * @param maxLinks upper bound on the number of article links returned per section.
 */
@Component
class NaverNewsCrawler(
    private val maxPages: Int = 100,
    private val maxLinks: Int = 100,
) {
    private val log = KotlinLogging.logger {}

    // Compiled once; the original re-compiled this pattern for every listing page.
    private val newsLinkPattern: Pattern =
        Pattern.compile("https://n\\.news\\.naver\\.com/mnews/article/\\d+/\\d+$")

    // 24-hour timestamp layout that article dates are normalized to before parsing.
    private val articleDateFormatter: DateTimeFormatter = DateTimeFormatter.ofPattern("yyyy.MM.dd. HH:mm")

    // Reused across calls instead of building a new Json configuration per save.
    private val prettyJson = Json {
        prettyPrint = true
        encodeDefaults = true
    }

    private val headers =
        mapOf("User-Agent" to "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36")

    /** Fetches [url] with the configured request headers and returns the parsed document. */
    private fun getSoup(url: String): Document {
        val connection = Jsoup.connect(url)
        headers.forEach { (key, value) ->
            connection.header(key, value)
        }
        return connection.get()
    }

    /**
     * Builds the listing URL for section [sid] and page [page].
     *
     * NOTE(review): everything after `#` is a URL fragment, which is not part of the
     * request target sent to the server, so `page` most likely has no effect and every
     * page number returns the same listing. The URL is left unchanged here because the
     * correct server-side paging parameters cannot be confirmed from this file.
     */
    private fun makeUrl(sid: Int, page: Int) =
        "https://news.naver.com/main/main.naver?mode=LSD&mid=shm&sid1=$sid#&date=%2000:00:00&page=$page"

    /**
     * Collects distinct article URLs for section [sid].
     *
     * Listing pages are fetched in order (with a 0.5 s pause between requests) until
     * [maxLinks] links were found, [maxPages] pages were visited, or a page contributes
     * no link that was not already seen. The last condition prevents re-downloading an
     * identical listing up to [maxPages] times.
     *
     * @return at most [maxLinks] article URLs in first-seen order.
     */
    fun getNaverNewsUrls(sid: Int): List<String> {
        log.info { "$sid 분야의 뉴스 링크를 수집합니다." }
        val allLinks = mutableSetOf<String>()

        for (page in 1..maxPages) {
            val soup = getSoup(makeUrl(sid, page))

            val links = soup.select("a[href]")
                .map { it.attr("href") }
                .filter { newsLinkPattern.matcher(it).matches() }

            // addAll reports whether the set changed, i.e. whether this page was useful.
            val foundNewLinks = allLinks.addAll(links)

            if (allLinks.size >= maxLinks || !foundNewLinks) {
                break
            }

            Thread.sleep(500) // 0.5 seconds delay
        }

        return allLinks.take(maxLinks)
    }

    /**
     * Downloads the article at [url] and extracts its fields.
     *
     * @return the parsed article, or `null` when the title, date or body element is
     *   missing or the date text cannot be interpreted.
     */
    fun getNewsContent(url: String): NewsModel? {
        log.info { "뉴스 내용을 가져오는 중: $url" }
        val soup: Document = getSoup(url)

        val title = soup.selectFirst("#title_area > span")
        val date =
            soup.selectFirst("#ct > div.media_end_head.go_trans > div.media_end_head_info.nv_notrans > div.media_end_head_info_datestamp > div:nth-child(1) > span")
        val content = soup.selectFirst("#dic_area")
        val linkElement =
            soup.selectFirst("#ct > div.media_end_head.go_trans > div.media_end_head_info.nv_notrans > div.media_end_head_info_datestamp > a.media_end_head_origin_link")
        val originalLink = linkElement?.attr("href")

        // TODO: store the raw document in the DB instead of a local file
        File("soup_content.txt").writeText(soup.outerHtml(), Charsets.UTF_8)

        if (title == null || date == null || content == null) {
            return null
        }

        val dateStr = date.text().trim()
        val dateTime = parseArticleDate(dateStr)
        if (dateTime == null) {
            // An unexpected timestamp layout skips this article rather than failing the caller.
            log.warn { "날짜 형식을 해석할 수 없어 건너뜁니다: '$dateStr' ($url)" }
            return null
        }

        return NewsModel(
            title = title.text().trim(),
            content = content.text().trim(),
            date = dateTime,
            link = url,
            originalLink = originalLink
        )
    }

    /**
     * Parses an article timestamp.
     *
     * Three space-separated parts are treated as `<date> <오전|오후> <h:mm>` and converted
     * to 24-hour time first; any other shape is parsed directly as `yyyy.MM.dd. HH:mm`.
     *
     * @return the parsed timestamp, or `null` when [dateStr] does not fit either layout.
     */
    private fun parseArticleDate(dateStr: String): LocalDateTime? {
        val parts = dateStr.split(" ")

        val normalized = if (parts.size == 3) {
            val (dateOnly, amPm, time) = parts
            val clock = time.split(":")
            val hour = clock.getOrNull(0)?.toIntOrNull() ?: return null
            val minute = clock.getOrNull(1)?.toIntOrNull() ?: return null

            val adjustedHour = when {
                amPm == "오후" && hour != 12 -> hour + 12
                amPm == "오전" && hour == 12 -> 0
                else -> hour
            }

            // padStart keeps ASCII digits regardless of the JVM's default locale.
            val hh = adjustedHour.toString().padStart(2, '0')
            val mm = minute.toString().padStart(2, '0')
            "$dateOnly $hh:$mm"
        } else {
            dateStr
        }

        return try {
            LocalDateTime.parse(normalized, articleDateFormatter)
        } catch (e: java.time.DateTimeException) {
            null
        }
    }

    /** Serializes [content] as pretty-printed JSON and writes it to `crawled_news.json` (UTF-8). */
    fun saveContentAsJson(content: List<NewsModel>) {
        val jsonContent = prettyJson.encodeToString(content)

        // TODO: persist to the DB
        File("crawled_news.json").writeText(jsonContent, Charsets.UTF_8)
    }
}
13 changes: 13 additions & 0 deletions
13
domain-generator/src/main/kotlin/com/few/domain/generator/crawler/NewsModel.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
package com.few.domain.generator.crawler | ||
|
||
import kotlinx.serialization.Serializable | ||
import java.time.LocalDateTime | ||
|
||
/**
 * Serializes [LocalDateTime] as its ISO-8601 string form (the output of `toString()`).
 *
 * kotlinx.serialization ships no serializer for `java.time` types, so [NewsModel]
 * must name one explicitly for its `date` property.
 */
object LocalDateTimeAsStringSerializer : kotlinx.serialization.KSerializer<LocalDateTime> {
    override val descriptor: kotlinx.serialization.descriptors.SerialDescriptor =
        kotlinx.serialization.descriptors.PrimitiveSerialDescriptor(
            "java.time.LocalDateTime",
            kotlinx.serialization.descriptors.PrimitiveKind.STRING,
        )

    override fun serialize(encoder: kotlinx.serialization.encoding.Encoder, value: LocalDateTime) {
        encoder.encodeString(value.toString())
    }

    override fun deserialize(decoder: kotlinx.serialization.encoding.Decoder): LocalDateTime =
        LocalDateTime.parse(decoder.decodeString())
}

/**
 * One crawled news article.
 *
 * @property title article headline.
 * @property content article body text.
 * @property date publication timestamp; written to JSON as an ISO-8601 string.
 * @property link URL the article was crawled from.
 * @property originalLink link to the publisher's original article, when the page exposes one.
 */
@Serializable
data class NewsModel(
    val title: String,
    val content: String,
    @Serializable(with = LocalDateTimeAsStringSerializer::class)
    val date: LocalDateTime,
    val link: String,
    val originalLink: String?,
)
46 changes: 46 additions & 0 deletions
46
domain-generator/src/main/kotlin/com/few/domain/generator/usecase/ExecuteCrawlerUseCase.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
package com.few.domain.generator.usecase | ||
|
||
import com.few.domain.generator.crawler.NaverNewsCrawler | ||
import com.few.domain.generator.crawler.NewsModel | ||
import com.few.domain.generator.usecase.dto.ExecuteCrawlerUseCaseIn | ||
import com.few.domain.generator.usecase.dto.ExecuteCrawlerUseCaseOut | ||
import io.github.oshai.kotlinlogging.KotlinLogging | ||
import org.springframework.stereotype.Component | ||
import java.util.* | ||
|
||
/**
 * Runs a crawl for one news section: collects article URLs, downloads each article,
 * and hands the parsed results to the crawler for persistence.
 */
@Component
class ExecuteCrawlerUseCase(
    private val naverNewsCrawler: NaverNewsCrawler
) {
    private val log = KotlinLogging.logger {}

    /**
     * Crawls the section given by [useCaseIn] and stores every article that could be parsed.
     *
     * An article whose download fails with an I/O error is logged and skipped, so one
     * bad URL no longer discards the articles already collected.
     *
     * @return the requested section code together with a placeholder crawl id
     *   (a random UUID until results are stored in the DB).
     */
    // TODO: @Transactional
    fun execute(useCaseIn: ExecuteCrawlerUseCaseIn): ExecuteCrawlerUseCaseOut {
        /*
         * TODO: check the DB for crawled data that has not been posted yet;
         *  return it when present, otherwise start crawling.
         */

        log.info { "크롤링이 시작" }
        val newsUrls = naverNewsCrawler.getNaverNewsUrls(useCaseIn.sid)

        val results = mutableListOf<NewsModel>()
        for ((i, url) in newsUrls.withIndex()) {
            try {
                naverNewsCrawler.getNewsContent(url)?.let(results::add)
            } catch (e: java.io.IOException) {
                log.warn(e) { "뉴스 수집 실패, 건너뜁니다: $url" }
            }
            log.info { "뉴스 ${i + 1}/${newsUrls.size} 처리 완료" }

            // Throttle between requests only; no need to wait after the final article.
            if (i < newsUrls.lastIndex) {
                Thread.sleep(1000) // 1 second delay
            }
        }

        naverNewsCrawler.saveContentAsJson(results)
        log.info { "크롤링이 완료" }

        return ExecuteCrawlerUseCaseOut(
            useCaseIn.sid,
            listOf(UUID.randomUUID().toString()), // TODO: return the real crawl id once results are stored in the DB
        )
    }
}
5 changes: 5 additions & 0 deletions
5
...generator/src/main/kotlin/com/few/domain/generator/usecase/dto/ExecuteCrawlerUseCaseIn.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
package com.few.domain.generator.usecase.dto | ||
|
||
/**
 * Input of the crawler use case.
 *
 * @property sid news section code to crawl.
 */
data class ExecuteCrawlerUseCaseIn(
    val sid: Int,
)
6 changes: 6 additions & 0 deletions
6
...enerator/src/main/kotlin/com/few/domain/generator/usecase/dto/ExecuteCrawlerUseCaseOut.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
package com.few.domain.generator.usecase.dto | ||
|
||
/**
 * Output of the crawler use case.
 *
 * @property sid news section code that was crawled.
 * @property crawlingIds identifiers of the crawl results.
 */
data class ExecuteCrawlerUseCaseOut(
    val sid: Int,
    val crawlingIds: List<String>,
)