Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix DMM scraping issues #355

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions build.gradle
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
// Gradle plugins applied to this build.
// NOTE(review): this span is a diff hunk whose +/- markers were lost. Each plugin id
// appears twice: the first four version lines (0.0.9, 5.2.0, 5.1.2, 3.0) look like the
// removed side and the last four (0.0.10, 6.1.0, 5.13.0, 3.3) like the added side of the
// change. Presumably only the newer versions are meant to remain; confirm against the PR.
plugins {
id 'application'
id 'org.openjfx.javafxplugin' version '0.0.9'
id 'com.github.johnrengelman.shadow' version '5.2.0'
id "com.diffplug.spotless" version "5.1.2"
id "org.sonarqube" version "3.0"
id 'org.openjfx.javafxplugin' version '0.0.10'
id 'com.github.johnrengelman.shadow' version '6.1.0'
id "com.diffplug.spotless" version "5.13.0"
id "org.sonarqube" version "3.3"
}

apply plugin: 'java'
Expand Down Expand Up @@ -43,7 +43,7 @@ sourceSets {
}
}
dependencies {
compile (
implementation (
'commons-io:commons-io:2.7',
'org.apache.commons:commons-lang3:3.11',
'commons-cli:commons-cli:1.4',
Expand All @@ -66,7 +66,7 @@ dependencies {
runtimeOnly "org.openjfx:javafx-graphics:$javafx.version:win"
runtimeOnly "org.openjfx:javafx-graphics:$javafx.version:linux"
runtimeOnly "org.openjfx:javafx-graphics:$javafx.version:mac"
testCompile('junit:junit:4.+')
testImplementation('junit:junit:4.+')
}

jar {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ public List<ScraperGroupName> getScraperGroupNames() {

private boolean firstWordOfFileIsID = false;

// Timeout value handed to Jsoup's timeout(...) when scrapeActors fetches an actress page.
// NOTE(review): the two declarations below are the removed (13000) and added (900000)
// sides of a single changed line whose diff markers were lost; only one can exist in the
// real file. Presumably the unit is milliseconds, which would make the new value 15
// minutes instead of 13 seconds. Confirm that a wait this long is intended.
public static final int CONNECTION_TIMEOUT_VALUE = 13000;
public static final int CONNECTION_TIMEOUT_VALUE = 900000;

protected File scrapedMovieFile;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,20 +105,25 @@ public DmmParsingProfile(Document document, boolean doGoogleTranslation) {

/**
 * Scrapes the movie title from the page held in {@code document}.
 *
 * NOTE(review): this span is a diff hunk whose +/- markers were lost, so every changed
 * statement appears more than once: first the removed line, which reads the
 * {@code content} attribute of the {@code [property=og:title]} element, then the added
 * lines, which keep the old statement as a comment and read the text of
 * {@code table#w tr td#mu h1#title} instead. Presumably only the added lines are meant
 * to remain; confirm against the PR.
 *
 * NOTE(review): the result of {@code first()} is dereferenced without a null check;
 * presumably it is null when the selector matches nothing, so a page-layout change
 * would surface here as a NullPointerException. Verify against the jsoup docs.
 *
 * @return a {@code Title} built from the selected element; when
 *         {@code doGoogleTranslation} is set the string is first passed through
 *         {@code TranslateString.translateStringJapaneseToEnglish}
 */
@Override
public Title scrapeTitle() {
Element titleElement = document.select("[property=og:title]").first();
//Element titleElement = document.select("[property=og:title]").first();
Element titleElement = document.select("table#w tr td#mu h1#title").first();
// run a google translate on the japanese title
if (doGoogleTranslation) {
return new Title(TranslateString.translateStringJapaneseToEnglish(titleElement.attr("content").toString()));
//return new Title(TranslateString.translateStringJapaneseToEnglish(titleElement.attr("content").toString()));
return new Title(TranslateString.translateStringJapaneseToEnglish(titleElement.text()));
} else {
return new Title(titleElement.attr("content").toString());
//return new Title(titleElement.attr("content").toString());
return new Title(titleElement.text());
}
}

/**
 * Scrapes the untranslated title from the page held in {@code document}; unlike
 * {@code scrapeTitle}, no translation call is made here.
 *
 * NOTE(review): this span is a diff hunk whose +/- markers were lost. The first
 * {@code titleElement} declaration and the first {@code return} (both based on the
 * {@code [property=og:title]} element's {@code content} attribute) look like the removed
 * side; the commented-out copies plus the {@code table#w tr td#mu h1#title} /
 * {@code text()} statements look like the added side. Presumably only the added lines
 * are meant to remain; confirm against the PR.
 *
 * NOTE(review): the result of {@code first()} is dereferenced without a null check;
 * presumably it is null when the selector matches nothing. Verify against the jsoup docs.
 *
 * @return an {@code OriginalTitle} built from the selected element
 */
@Override
public OriginalTitle scrapeOriginalTitle() {
Element titleElement = document.select("[property=og:title]").first();
//Element titleElement = document.select("[property=og:title]").first();
Element titleElement = document.select("table#w tr td#mu h1#title").first();
// leave the original title as the japanese title
return new OriginalTitle(titleElement.attr("content").toString());
//return new OriginalTitle(titleElement.attr("content").toString());
return new OriginalTitle(titleElement.text());
}

@Override
Expand Down Expand Up @@ -193,12 +198,15 @@ public Plot scrapePlot() {
Element plotElement = document.select("p.mg-b20").first();
if (plotElement == null || document.baseUri().contains("/digital/video") || document.baseUri().contains("/digital/nikkatsu")) {
//video rental mode if it didnt find a match using above method
plotElement = document.select("tbody .mg-b20.lh4").first();
//plotElement = document.select("tbody .mg-b20.lh4").first();
plotElement = document.select("div.mg-b20.lh4").first();
}
if (doGoogleTranslation) {
return new Plot(TranslateString.translateStringJapaneseToEnglish(plotElement.text()));
//return new Plot(TranslateString.translateStringJapaneseToEnglish(plotElement.text()));
return new Plot(TranslateString.translateStringJapaneseToEnglish(plotElement.childNode(0).attr("text")));
} else
return new Plot(plotElement.text());
//return new Plot(plotElement.text());
return new Plot(plotElement.childNode(0).attr("text"));
}

@Override
Expand Down Expand Up @@ -394,7 +402,7 @@ public static String fixUpIDFormatting(String idElementText) {

@Override
public ArrayList<Genre> scrapeGenres() {
Elements genreElements = document.select("table.mg-b12 tr td a[href*=article=keyword/id=]");
Elements genreElements = document.select("table.mg-b20 tr td a[href*=list/?keyword=]");
ArrayList<Genre> genres = new ArrayList<>(genreElements.size());
for (Element genreElement : genreElements) {
// get the link so we can examine the id and do some sanity cleanup
Expand Down Expand Up @@ -527,17 +535,23 @@ private boolean acceptGenreID(String genreID) {
@Override
public ArrayList<Actor> scrapeActors() {
// scrape all the actress IDs
Elements actressIDElements = document.select("span#performer a[href*=article=actress/id=]");
//Elements actressIDElements = document.select("span#performer a[href*=article=actress/id=]");
Elements actressIDElements = document.select("span#performer a[href*=list/?actress=]");
ArrayList<Actor> actorList = new ArrayList<>(actressIDElements.size());
for (Element actressIDLink : actressIDElements) {
String actressIDHref = actressIDLink.attr("abs:href");
String actressNameKanji = actressIDLink.text();
String actressID = actressIDHref.substring(actressIDHref.indexOf("id=") + 3, actressIDHref.length() - 1);
//String actressID = actressIDHref.substring(actressIDHref.indexOf("id=") + 3, actressIDHref.length() - 1);
String actressID = actressIDHref.substring(actressIDHref.indexOf("actress=") + 8, actressIDHref.length());
String actressPageURL = "https://actress.dmm.co.jp/-/detail/=/actress_id=" + actressID + "/";
try {
Document actressPage = Jsoup.connect(actressPageURL).timeout(SiteParsingProfile.CONNECTION_TIMEOUT_VALUE).get();
Element actressNameElement = actressPage.select("td.t1 h1").first();
Element actressThumbnailElement = actressPage.select("tr.area-av30.top td img").first();
Map<String, String> cookies = new HashMap<String, String>();
cookies.put("age_check_done", "1");
Document actressPage = Jsoup.connect(actressPageURL).cookies(cookies).userAgent("Mozilla").ignoreHttpErrors(true).timeout(SiteParsingProfile.CONNECTION_TIMEOUT_VALUE).get();
//Element actressNameElement = actressPage.select("td.t1 h1").first();
Element actressNameElement = actressPage.select("span.c-tx-actressName__ruby").first();
//Element actressThumbnailElement = actressPage.select("tr.area-av30.top td img").first();
Element actressThumbnailElement = actressPage.select("span.p-section-profile__image img").first();
String actressThumbnailPath = actressThumbnailElement.attr("abs:src");
//Sometimes the translation service from google gives us weird engrish instead of a name, so let's compare it to the thumbnail file name for the image as a sanity check
//if the names aren't close enough, we'll use the thumbnail name
Expand All @@ -554,7 +568,8 @@ public ArrayList<Actor> scrapeActors() {
// hiragana form of it.
// The hiragana form of it is between a '(' and a ')' (These are
// not parens but some japanese version of parens)
String actressNameHiragana = actressNameElement.text().substring(actressNameElement.text().indexOf('(') + 1, actressNameElement.text().indexOf(')'));
//String actressNameHiragana = actressNameElement.text().substring(actressNameElement.text().indexOf('(') + 1, actressNameElement.text().indexOf(')'));
String actressNameHiragana = actressNameElement.text();
// maybe we know in advance the translation system will be junk,
// so we check our manual override of people we know it will get
// the name wrong on
Expand Down Expand Up @@ -620,7 +635,8 @@ public ArrayList<Actor> scrapeActors() {
@Override
public ArrayList<Director> scrapeDirectors() {
ArrayList<Director> directors = new ArrayList<>();
Element directorElement = document.select("table.mg-b20 tr td a[href*=article=director/id=]").first();
//Element directorElement = document.select("table.mg-b20 tr td a[href*=article=director/id=]").first();
Element directorElement = document.select("table.mg-b20 tr td a[href*=list/?director=]").first();
if (directorElement != null && directorElement.hasText()) {
if (doGoogleTranslation)
directors.add(new Director(TranslateString.translateStringJapaneseToEnglish(directorElement.text()), null));
Expand All @@ -647,6 +663,11 @@ public String createSearchString(File file) {
scrapedMovieFile = file;
String fileNameNoExtension = findIDTagFromFile(file, isFirstWordOfFileIsID());
//System.out.println("fileNameNoExtension in DMM: " + fileNameNoExtension);
Pattern patternID = Pattern.compile("(h?_?[0-9]*[a-z]+[0-9]+)");
Matcher matcher = patternID.matcher(fileNameNoExtension);
while (matcher.find()) {
fileNameNoExtension = matcher.group(1);
}
URLCodec codec = new URLCodec();
try {
String fileNameURLEncoded = codec.encode(fileNameNoExtension);
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/moviescraper/doctord/model/Movie.java
Original file line number Diff line number Diff line change
Expand Up @@ -777,7 +777,7 @@ public static Movie scrapeMovie(File movieFile, SiteParsingProfile siteToParseFr
//loop through search results and see if URL happens to contain ID number in the URL. This will improve accuracy!
for (int i = 0; i < searchResults.length; i++) {
String urltoMatch = searchResults[i].getUrlPath().toLowerCase();
String idFromMovieFileToMatch = idFromMovieFile.toLowerCase().replaceAll("-", "");
String idFromMovieFileToMatch = idFromMovieFile.toLowerCase().replaceAll("-", "").replaceAll("hhb","").replaceAll("mhb","").replaceAll("hmb","").replaceAll("mmb","").replaceAll("dm","").replaceAll("sm","");
//System.out.println("Comparing " + searchResults[i].toLowerCase() + " to " + idFromMovieFile.toLowerCase().replaceAll("-", ""));
if (urltoMatch.contains(idFromMovieFileToMatch)) {
//let's do some fuzzy logic searching to try to get the "best" match in case we got some that are pretty close
Expand Down