diff --git a/.gitignore b/.gitignore index 417122f..7bface9 100644 --- a/.gitignore +++ b/.gitignore @@ -19,4 +19,6 @@ project/plugins/project/ # IntelliJ IDEA specific /.idea -sansa-ml-parent_2.11.iml +*.iml + +scalastyle-output.xml diff --git a/.travis.yml b/.travis.yml index 2e15138..682ef63 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,10 @@ -language: java +language: scala sudo: false cache: directories: - - $HOME/.m2 \ No newline at end of file + - $HOME/.m2 +scala: + - 2.11.11 +script: + - mvn scalastyle:check + - mvn test \ No newline at end of file diff --git a/pom.xml b/pom.xml index e240109..4a4c58b 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ 4.0.0 net.sansa-stack sansa-ml-parent_2.11 - 0.4.0 + 0.5.0 pom ML API - Parent RDF/OWL Machine Learning Library for Big Data @@ -20,8 +20,8 @@ - GNU GENERAL PUBLIC LICENSE, Version 3 - http://www.gnu.org/licenses/gpl-3.0.txt + Apache License 2.0 + http://www.apache.org/licenses/LICENSE-2.0.html repo @@ -65,10 +65,11 @@ UTF-8 2.11.11 2.11 - 2.3.1 - 1.5.0 - 3.7.0 - 0.4.0 + 2.4.0 + 1.7.0 + 3.9.0 + 0.5.0 + 0.4.1 ${project.basedir}/scalastyle-config.xml @@ -85,14 +86,14 @@ net.sansa-stack sansa-rdf-spark_${scala.binary.version} - ${sansa.version} + ${sansa.rdf.version} net.sansa-stack sansa-owl-spark_${scala.binary.version} - ${sansa.version} + ${sansa.owl.version} @@ -157,13 +158,7 @@ org.scalatest scalatest_${scala.binary.version} - 3.0.3 - test - - - com.holdenkarau - spark-testing-base_${scala.binary.version} - 2.3.0_0.9.0 + 2.2.6 test @@ -173,6 +168,7 @@ scala-logging_${scala.binary.version} 3.5.0 + com.github.scopt @@ -180,11 +176,25 @@ 3.5.0 - - com.google.guava - guava - 19.0 + com.holdenkarau + spark-testing-base_${scala.binary.version} + 2.3.0_0.9.0 + test + + + + org.glassfish.jersey + jersey-bom + 2.26-b03 + pom + import + + + + org.apache.commons + commons-compress + 1.18 @@ -193,11 +203,44 @@ + + + org.apache.maven.plugins + maven-source-plugin + 3.0.1 + + + attach-sources + verify + + jar-no-fork + + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + 2.10.4 + + false + + + + attach-javadocs + + jar + + + + + net.alchim31.maven scala-maven-plugin - 3.2.1 + 3.3.1 @@ -209,13 +252,18 @@ -dependencyfile ${project.build.directory}/.scala_dependencies + -Xmax-classfile-name + 128 + + -Xss2048K + ${scala.version} - incremental + @@ -229,6 +277,48 @@ + + com.amashchenko.maven.plugin + gitflow-maven-plugin + 1.8.0 + + + v + + + + + + org.apache.maven.plugins + maven-gpg-plugin + 1.6 + + + sign-artifacts + verify + + sign + + + AKSW + ${gpg.keyname} + + + + + + + org.sonatype.plugins + nexus-staging-maven-plugin + 1.6.8 + true + + ossrh + https://oss.sonatype.org/ + true + + + org.apache.maven.plugins @@ -263,7 +353,7 @@ 1.0.0 false - false + true true false ${project.basedir}/src/main/scala @@ -287,65 +377,49 @@ - + + + root-dir + + + ${project.basedir}/../../scalastyle-config.xml + + + + ${project.basedir}/../scalastyle-config.xml + + release - - - ossrh - https://oss.sonatype.org/service/local/staging/deploy/maven2/ - - + + + performRelease + true + + + - net.alchim31.maven - scala-maven-plugin - 3.2.2 - - - - compile - testCompile - - - ${scala.version} - incremental - true - - -unchecked - -deprecation - -feature - -dependencyfile - ${project.build.directory}/.scala_dependencies - - - - - - attach-javadocs - - doc-jar - - - - + org.apache.maven.plugins + maven-gpg-plugin - org.apache.maven.plugins - maven-source-plugin - - - verify - attach-sources - - jar - - - + org.sonatype.plugins + nexus-staging-maven-plugin 
+ + + + + doclint-java8-disable + + [1.8,) + + + org.apache.maven.plugins maven-javadoc-plugin @@ -355,55 +429,18 @@ jar - - - - - - org.apache.maven.plugins - maven-gpg-plugin - 1.6 - - - sign-artifacts - verify - - sign - - AKSW - ${gpg.keyname} + false - - - - org.sonatype.plugins - nexus-staging-maven-plugin - 1.6.7 - true - ossrh - https://oss.sonatype.org/ - true + -Xdoclint:none - - - root-dir - - - ${project.basedir}/../../scalastyle-config.xml - - - - ${project.basedir}/../scalastyle-config.xml - - diff --git a/sansa-ml-common/pom.xml b/sansa-ml-common/pom.xml index 1d4ead0..57bae30 100644 --- a/sansa-ml-common/pom.xml +++ b/sansa-ml-common/pom.xml @@ -5,7 +5,7 @@ sansa-ml-parent_2.11 net.sansa-stack - 0.4.0 + 0.5.0 sansa-ml-common_2.11 ML API - Common @@ -18,6 +18,20 @@ scala-library + + + net.sf.extjwnl + extjwnl + 1.9.4 + + + + + net.sf.extjwnl + extjwnl-data-wn31-map + 1.0 + + junit diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/nlp/wordnet/WordNet.scala b/sansa-ml-common/src/main/scala/net/sansa_stack/ml/common/nlp/wordnet/WordNet.scala similarity index 81% rename from sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/nlp/wordnet/WordNet.scala rename to sansa-ml-common/src/main/scala/net/sansa_stack/ml/common/nlp/wordnet/WordNet.scala index 93a4a44..546ff00 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/nlp/wordnet/WordNet.scala +++ b/sansa-ml-common/src/main/scala/net/sansa_stack/ml/common/nlp/wordnet/WordNet.scala @@ -10,14 +10,15 @@ * and ws4j * and nltk project */ -package net.sansa_stack.ml.spark.nlp.wordnet +package net.sansa_stack.ml.common.nlp.wordnet import java.io.Serializable -import scala.collection.JavaConversions._ + +import net.sf.extjwnl.data.{PointerType, PointerUtils, Word} +import net.sf.extjwnl.dictionary.Dictionary +import scala.collection.JavaConverters._ import scala.collection.breakOut import scala.collection.mutable.ArrayBuffer -import net.sf.extjwnl.dictionary.Dictionary -import net.sf.extjwnl.data.{PointerType, PointerUtils, Word} /** * WordNet singleton to initialize WordNet dataset @@ -33,8 +34,11 @@ object WordNet { */ class WordNet extends Serializable { + var maxDepth = 0 + /** * Returns an instance of the WordNet dictionary used in the package + * * @return */ def getDict: Dictionary = WordNet.dict @@ -46,10 +50,9 @@ class WordNet extends Serializable { * @return : List[Synset] */ def getSynsets(lemma: String): List[Synset] = - net.sf.extjwnl.data.POS.getAllPOS + net.sf.extjwnl.data.POS.getAllPOS.asScala .flatMap(pos => getSynsets(lemma, pos))(breakOut) - /** * Returns a Synset given a String * Returns empty list if the lemma did not exist in the WordNet @@ -63,7 +66,8 @@ class WordNet extends Serializable { val indexWord = WordNet.dict.getIndexWord(pos, lemma) var result = List.empty[Synset] if (indexWord != null) { - result = List(indexWord.getSenses()(sid)) + val result_scala = indexWord.getSenses().asScala + result = List(result_scala(sid)) } result } @@ -79,7 +83,7 @@ class WordNet extends Serializable { def getSynsets(lemma: String, pos: POS): List[Synset] = { val iword = WordNet.dict.getIndexWord(pos, lemma) if (iword == null) List.empty[Synset] - else iword.getSenses.toList + else iword.getSenses.asScala.toList } /** @@ -89,7 +93,7 @@ class WordNet extends Serializable { * @return : List[String] */ def lemmaNames(synset: Synset): List[String] = - synset.getWords.map(_.getLemma)(breakOut) + synset.getWords.asScala.map(_.getLemma)(breakOut) /** * Input is a synset @@ -172,7 +176,7 @@ class 
WordNet extends Serializable { * @return : List[Synset] */ def relatedSynsets(synset: Synset, ptr: PointerType): List[Synset] = - synset.getPointers(ptr).map(ptr => ptr.getTarget.asInstanceOf[Synset])(breakOut) + synset.getPointers(ptr).asScala.map(ptr => ptr.getTarget.asInstanceOf[Synset])(breakOut) /** * Returns list of all hypernyms of a synset @@ -180,12 +184,12 @@ class WordNet extends Serializable { * @param synset :Synset * @return : List[Synset] */ - def allHypernyms(synset: Synset): List[List[Synset]] = + def getAllHypernyms(synset: Synset): List[List[Synset]] = PointerUtils .getHypernymTree(synset) .toList - .map(ptnl => ptnl - .map(ptn => ptn.getSynset) + .asScala.map(ptnl => ptnl + .asScala.map(ptn => ptn.getSynset) .toList)(breakOut) /** @@ -195,7 +199,7 @@ class WordNet extends Serializable { * @return : List[Synset] */ def rootHypernyms(synset: Synset): List[Synset] = - allHypernyms(synset) + getAllHypernyms(synset) .map(hp => hp.reverse.head).distinct /** @@ -206,8 +210,8 @@ class WordNet extends Serializable { * @return : List[Synset] */ def lowestCommonHypernym(synset1: Synset, synset2: Synset): List[Synset] = { - val paths1 = allHypernyms(synset1) - val paths2 = allHypernyms(synset2) + val paths1 = getAllHypernyms(synset1) + val paths2 = getAllHypernyms(synset2) lch(paths1, paths2) } @@ -219,7 +223,7 @@ class WordNet extends Serializable { * @return : Integer */ def shortestHypernymPathLength(synset1: Synset, hypernym: Synset): Int = { - val paths1 = allHypernyms(synset1) + val paths1 = getAllHypernyms(synset1) val path = ArrayBuffer[(Synset, Int)]() val matchedPath = paths1.zipWithIndex.filter { case (s, i) => s.contains(hypernym) } @@ -249,15 +253,30 @@ class WordNet extends Serializable { } /** - * Returns the depth of a synset - * Since there can be several paths to root, the minimum lenth is considered + * Returns the length of the shortest hypernym path from this + * synset to the root + * Since there can be several paths to root, the minimum length is considered * * @param synset : Synset * @return : Integer */ - def depth(synset: Synset): Int = { - val lens = allHypernyms(synset) - if (lens.isEmpty) -1 else lens.map(_.size).min - 1 + def minDepth(synset: Synset): Int = { + val lists = getAllHypernyms(synset) + if (lists.isEmpty) -1 else lists.map(_.size).min - 1 + } + + + + /** + * Returns the length of the longest hypernym path from this + * synset to the root + * Since there can be several paths to root, the minimum length is considered + * @param synset : Synset + * @return : Integer + */ + def maxDepth(synset: Synset): Int = { + val lists = getAllHypernyms(synset) + if (lists.isEmpty) -1 else lists.map(_.size).max - 1 } /** @@ -278,6 +297,6 @@ class WordNet extends Serializable { */ def relatedLemmas(word: Word, ptr: PointerType): List[Word] = word.getPointers(ptr) - .map(ptr => ptr.getTarget.asInstanceOf[Word])(breakOut) + .asScala.map(ptr => ptr.getTarget.asInstanceOf[Word])(breakOut) -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/nlp/wordnet/WordNetSimilarity.scala b/sansa-ml-common/src/main/scala/net/sansa_stack/ml/common/nlp/wordnet/WordNetSimilarity.scala similarity index 67% rename from sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/nlp/wordnet/WordNetSimilarity.scala rename to sansa-ml-common/src/main/scala/net/sansa_stack/ml/common/nlp/wordnet/WordNetSimilarity.scala index 047061c..86cc701 100644 --- 
a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/nlp/wordnet/WordNetSimilarity.scala +++ b/sansa-ml-common/src/main/scala/net/sansa_stack/ml/common/nlp/wordnet/WordNetSimilarity.scala @@ -6,29 +6,28 @@ * Inspired from: * WordNet::Similarity of Ted Peterson * and ws4j - * and ntlk project + * and nltk project */ -package net.sansa_stack.ml.spark.nlp.wordnet - +package net.sansa_stack.ml.common.nlp.wordnet object WordNetSimilarity extends WordNet { /** - * Wu & Palmer (1994) method of measuring semantic relatedness based on node counting. - * given two synsets, synset1 and synset2 returns the similarity score - * - * @param synset1 :Synset - * @param synset2 :Synset - * @return score :Double - */ + * Wu & Palmer (1994) method of measuring semantic relatedness based on node counting. + * given two synsets, synset1 and synset2 returns the similarity score + * + * @param synset1 :Synset + * @param synset2 :Synset + * @return score :Double + */ def wupSimilarity(synset1: Synset, synset2: Synset): Double = { val min = 0.0 if (synset1 == null || synset2 == null) throw new IllegalArgumentException("arg 1 or 2 was null...") val lcs = lowestCommonHypernym(synset1, synset2) if (lcs.isEmpty) return min - val depth = this.depth(lcs.head) + val depth = this.maxDepth(lcs.head) val depth1 = shortestHypernymPathLength(synset1, lcs.head) + depth val depth2 = shortestHypernymPathLength(synset2, lcs.head) + depth var score = 0.0 @@ -37,13 +36,13 @@ object WordNetSimilarity extends WordNet { } /** - * Returns the distance similarity of two synsets using the shortest path linking the two synsets (if - * one exists) - * - * @param synset1 : Synset - * @param synset2 : Synset - * @return : Double - */ + * Returns the distance similarity of two synsets using the shortest path linking the two synsets (if + * one exists) + * + * @param synset1 : Synset + * @param synset2 : Synset + * @return : Double + */ def shortestPathSim(synset1: Synset, synset2: Synset): Double = { if (synset1 == null || synset2 == null) throw new IllegalArgumentException("arg 1 or 2 was null...") @@ -56,5 +55,4 @@ object WordNetSimilarity extends WordNet { else score = 1.toDouble / distance score } - - } +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/nlp/wordnet/package.scala b/sansa-ml-common/src/main/scala/net/sansa_stack/ml/common/nlp/wordnet/package.scala similarity index 89% rename from sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/nlp/wordnet/package.scala rename to sansa-ml-common/src/main/scala/net/sansa_stack/ml/common/nlp/wordnet/package.scala index a1440d1..a567761 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/nlp/wordnet/package.scala +++ b/sansa-ml-common/src/main/scala/net/sansa_stack/ml/common/nlp/wordnet/package.scala @@ -1,6 +1,7 @@ -package net.sansa_stack.ml.spark.nlp +package net.sansa_stack.ml.common.nlp import java.io.Serializable + import net.sf.extjwnl.data.POS package object wordnet extends Serializable { @@ -13,4 +14,3 @@ package object wordnet extends Serializable { val Adjective = POS.ADJECTIVE val Adjverb = POS.ADVERB } - diff --git a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/nlp/wordnet/DistanceWordNetSimilarityMeasureTests.scala b/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/DistanceWordNetSimilarityMeasureTests.scala similarity index 80% rename from sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/nlp/wordnet/DistanceWordNetSimilarityMeasureTests.scala rename to 
sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/DistanceWordNetSimilarityMeasureTests.scala index a2f4058..80abc36 100644 --- a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/nlp/wordnet/DistanceWordNetSimilarityMeasureTests.scala +++ b/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/DistanceWordNetSimilarityMeasureTests.scala @@ -1,10 +1,9 @@ -package net.sansa_stack.ml.spark.nlp.wordnet +package net.sansa_stack.ml.common.nlp.wordnet -import com.holdenkarau.spark.testing.DataFrameSuiteBase -import org.scalatest.FunSuite import net.sf.extjwnl.data._ +import org.scalatest.FunSuite -class DistanceWordNetSimilarityMeasureTests extends FunSuite with DataFrameSuiteBase { +class DistanceWordNetSimilarityMeasureTests extends FunSuite { test("shortest path similarity between dog and cat synset should result in value 0.3") { try { diff --git a/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/TestAllHypernims.scala b/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/TestAllHypernims.scala new file mode 100644 index 0000000..1531ced --- /dev/null +++ b/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/TestAllHypernims.scala @@ -0,0 +1,27 @@ +package net.sansa_stack.ml.common.nlp.wordnet + +import net.sf.extjwnl.data._ +import org.scalatest.FunSuite + +class TestAllHypernims extends FunSuite { + + test("Tests getting all hypernyms of the the first synset in the word cat") { + try { + val wn = new WordNet + val dict = wn.getDict + // getting a synset by a word and index + + val cat = wn.getSynset("cat", POS.NOUN, 1).head + + val getAllHypers = wn.getAllHypernyms(cat) + + assert(getAllHypers != null) + } + catch { + case e: ExceptionInInitializerError => println("The WordNet dictionary is not installed, please check the readme for instructions to enable it.") + } + + } +} + + diff --git a/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/TestGetMaxDepth.scala b/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/TestGetMaxDepth.scala new file mode 100644 index 0000000..6e67049 --- /dev/null +++ b/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/TestGetMaxDepth.scala @@ -0,0 +1,25 @@ +package net.sansa_stack.ml.common.nlp.wordnet + +import net.sf.extjwnl.data._ +import org.scalatest.FunSuite + +class TestGetMaxDepth extends FunSuite { + + test("Test the function that gets the maximum depth of dataset graph ") { + + try { + val wn = new WordNet + val dict = wn.getDict + + val thing1 = wn.getSynset("thing", POS.NOUN, 1).head + val dog = wn.getSynset("dog", POS.NOUN, 1).head + + + val dogD = wn.maxDepth(dog) + val dogD2 = wn.minDepth(dog) + assert(dogD != 0) + } catch { + case e: ExceptionInInitializerError => println("The WordNet dictionary is not installed, please check the readme for instructions to enable it.") + } + } +} diff --git a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/nlp/wordnet/TestGetSynsets.scala b/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/TestGetSynsets.scala similarity index 73% rename from sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/nlp/wordnet/TestGetSynsets.scala rename to sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/TestGetSynsets.scala index c5d5eb5..607d585 100644 --- a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/nlp/wordnet/TestGetSynsets.scala +++ 
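For orientation, a minimal usage sketch of the WordNet helpers exercised by the new tests above. This is illustrative only (the object name is made up, not part of the patch) and assumes the extjwnl and extjwnl-data-wn31-map artifacts declared in sansa-ml-common/pom.xml are on the classpath so the dictionary can be initialised:

import net.sansa_stack.ml.common.nlp.wordnet.{WordNet, WordNetSimilarity}
import net.sf.extjwnl.data.POS

object WordNetUsageSketch {
  def main(args: Array[String]): Unit = {
    val wn = new WordNet

    // first noun synset for each lemma, looked up the same way as in the tests above
    val dog = wn.getSynset("dog", POS.NOUN, 1).head
    val cat = wn.getSynset("cat", POS.NOUN, 1).head

    // hypernym utilities renamed/added in this patch
    val paths = wn.getAllHypernyms(dog)   // all hypernym paths up to the root
    val minD  = wn.minDepth(dog)          // length of the shortest hypernym path
    val maxD  = wn.maxDepth(dog)          // length of the longest hypernym path

    // similarity measures defined on top of the hypernym structure
    val wup     = WordNetSimilarity.wupSimilarity(dog, cat)
    val pathSim = WordNetSimilarity.shortestPathSim(dog, cat)

    println(s"paths=${paths.size} minDepth=$minD maxDepth=$maxD wup=$wup path=$pathSim")
  }
}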
b/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/TestGetSynsets.scala @@ -1,10 +1,9 @@ -package net.sansa_stack.ml.spark.nlp.wordnet +package net.sansa_stack.ml.common.nlp.wordnet -import com.holdenkarau.spark.testing.DataFrameSuiteBase -import org.scalatest.FunSuite import net.sf.extjwnl.data._ +import org.scalatest.FunSuite -class TestGetSynsets extends FunSuite with DataFrameSuiteBase { +class TestGetSynsets extends FunSuite { test("If The WordNet dictionary is correctly installed synsets must not be null ") { @@ -13,10 +12,9 @@ class TestGetSynsets extends FunSuite with DataFrameSuiteBase { val dict = wn.getDict // getting a synset by a word and index - val thing1 = wn.getSynset("thing", POS.NOUN, 1) + val thing1 = wn.getSynset("thing", POS.NOUN, 1).head // getting a list of synsets by a word - val thing2 = wn.getSynsets("thing", POS.NOUN).head assert(thing1 != null) diff --git a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/nlp/wordnet/WUPWordNetSimilarityMeasuresTests.scala b/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/WUPWordNetSimilarityMeasuresTests.scala similarity index 70% rename from sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/nlp/wordnet/WUPWordNetSimilarityMeasuresTests.scala rename to sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/WUPWordNetSimilarityMeasuresTests.scala index fa7913d..92a63e2 100644 --- a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/nlp/wordnet/WUPWordNetSimilarityMeasuresTests.scala +++ b/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/WUPWordNetSimilarityMeasuresTests.scala @@ -1,13 +1,12 @@ -package net.sansa_stack.ml.spark.nlp.wordnet +package net.sansa_stack.ml.common.nlp.wordnet -import com.holdenkarau.spark.testing.DataFrameSuiteBase -import org.scalatest.FunSuite -//import net.didion.jwnl.data._ import net.sf.extjwnl.data._ +import org.scalatest.FunSuite +// import net.didion.jwnl.data._ -class WUPWordNetSimilarityMeasuresTests extends FunSuite with DataFrameSuiteBase { +class WUPWordNetSimilarityMeasuresTests extends FunSuite { - test("wwup similarity between dog and cat synset should result in value 0.3") { + test(" WUP similarity between dog and cat synset should result in value 0.3") { try { val wn = new WordNet val dict = wn.getDict diff --git a/sansa-ml-flink/pom.xml b/sansa-ml-flink/pom.xml index 9a4c33e..5b245b4 100644 --- a/sansa-ml-flink/pom.xml +++ b/sansa-ml-flink/pom.xml @@ -5,7 +5,7 @@ sansa-ml-parent_2.11 net.sansa-stack - 0.4.0 + 0.5.0 sansa-ml-flink_2.11 ML API - Apache Flink diff --git a/sansa-ml-spark/pom.xml b/sansa-ml-spark/pom.xml index 662236f..55cade1 100644 --- a/sansa-ml-spark/pom.xml +++ b/sansa-ml-spark/pom.xml @@ -5,7 +5,7 @@ sansa-ml-parent_2.11 net.sansa-stack - 0.4.0 + 0.5.0 sansa-ml-spark_2.11 ML API - Apache Spark @@ -13,6 +13,7 @@ 2.8.3 + 1.1.3 @@ -29,7 +30,6 @@ net.sansa-stack sansa-rdf-spark_${scala.binary.version} - net.sansa-stack @@ -70,9 +70,9 @@ - net.jpountz.lz4 - lz4 - 1.3.0 + net.jpountz.lz4 + lz4 + 1.3.0 @@ -105,37 +105,59 @@ pom ${jena.version} + + - com.github.scopt - scopt_${scala.binary.version} - 3.5.0 + com.intel.analytics.bigdl + bigdl-SPARK_2.2 + 0.3.0 - - net.sf.extjwnl - extjwnl - 1.9.4 + org.json + json + + + + + com.github.haifengl + smile-core + 1.5.0 + + + com.github.haifengl + smile-netlib + 1.5.0 + + + org.json4s + json4s-native_${scala.binary.version} + 3.6.2 + - + - net.sf.extjwnl - extjwnl-data-wn31-map - 1.0 + org.datasyslab + geospark + ${geospark.version} + 
provided - + - com.intel.analytics.bigdl - bigdl-SPARK_2.2 - 0.3.0 + com.vividsolutions + jts + 1.13 + + - org.json - json + org.datasyslab + geospark-sql_2.3 + ${geospark.version} @@ -167,12 +189,6 @@ com.github.scopt scopt_${scala.binary.version} - - - org.springframework - spring - 2.5.6.SEC03 - @@ -186,6 +202,19 @@ scala-maven-plugin + + org.apache.maven.plugins + maven-shade-plugin + 3.0.0 + + + package + + shade + + + + maven-compiler-plugin @@ -431,6 +460,13 @@ true + + + dbscan-on-spark-repo + Repo for DBSCAN on Spark + http://dl.bintray.com/irvingc/maven + + maven.aksw.internal AKSW Release Repository diff --git a/sansa-ml-spark/src/main/resources/application.properties b/sansa-ml-spark/src/main/resources/application.properties new file mode 100644 index 0000000..eed2c90 --- /dev/null +++ b/sansa-ml-spark/src/main/resources/application.properties @@ -0,0 +1,54 @@ +# spark configuration +sansa.spark.master=local[*] +sansa.spark.serializer=org.apache.spark.serializer.KryoSerializer +sansa.spark.executor.memory=15g +sansa.spark.driver.memory=15g +sansa.spark.driver.maxResultSize=15g +sansa.spark.app.name=SANSA_Clustering + +# clusterig profile +sansa.clustering.profile=results/profile.txt + +# pic clustering configuration +sansa.clustering.pic.result=results/pic_clusters.json +sansa.clustering.pic.matrix=results/pic_matrix.json +sansa.clustering.pic.number_clusters=10 +sansa.clustering.pic.iterations=5 + +# ont hot km clustering configuration +sansa.clustering.km.onehot.result=results/oneHot_kmeans_clusters.json +sansa.clustering.km.onehot.matrix=results/oneHotMatrix.json +sansa.clustering.km.onehot.number_clusters=10 +sansa.clustering.km.onehot.iterations=5 + +# mds km clustering configuration +sansa.clustering.km.mds.result=results/mds_kmeans_clusters.json +sansa.clustering.km.mds.matrix=results/mds_coordinates.json +sansa.clustering.km.mds.dimension=2 +sansa.clustering.km.mds.number_clusters=10 +sansa.clustering.km.mds.iterations=5 + +# word2vec km clustering configuration +sansa.clustering.km.word2vec.result=results/word2vec_kmeans_clusters.json +sansa.clustering.km.word2vec.matrix=results/word2Vec.json +sansa.clustering.km.word2vec.number_clusters=10 +sansa.clustering.km.word2vec.iterations=5 + +# dataset configuration +#sansa.data.input=data/merged_tomtom_yelp/ +#sansa.data.input=data/tomtom_pois_austria_v0.3.nt +sansa.data.input=src/main/resources/Cluster/input.nt +sansa.data.termValueUri=http://example.org/def#termValue +sansa.data.termPrefix=http://example.org/id/term/ +sansa.data.typePOI=http://example.org/def#POI +sansa.data.coordinatesPredicate=http://www.opengis.net/ont/geosparql#asWKT +sansa.data.categoryPOI=http://example.org/def#category +sansa.data.poiPrefix=http://example.org/id/poi/ + + +# sansa and yelp file merge +sansa.merge.input=src/main/resources/Cluster/input.nt +#yelp.sansa.merged_file=data/tomtom_yelp.nt +yelp.data.input=src/main/resources/Cluster/categories.nt +yelp.data.categoryPOI=http://example.org/hasYelpCategory +yelp.data.rating=http://example.org/hasRating diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/ClassMembership.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/ClassMembership.scala index 48c57f9..caf62cf 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/ClassMembership.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/ClassMembership.scala @@ -1,25 +1,20 @@ package net.sansa_stack.ml.spark.classification import 
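The clustering configuration file added above is consumed at runtime, but the loader itself is not part of this diff; the following is only one plausible way to read the sansa.spark.* and sansa.data.* keys (the object name and the exact wiring are assumptions, not the project's own code):

import java.util.Properties
import org.apache.spark.sql.SparkSession

object ClusteringConfigSketch {
  def main(args: Array[String]): Unit = {
    // load src/main/resources/application.properties from the classpath
    val props = new Properties()
    val in = getClass.getResourceAsStream("/application.properties")
    require(in != null, "application.properties not found on the classpath")
    try props.load(in) finally in.close()

    // wire the sansa.spark.* keys into a SparkSession
    val spark = SparkSession.builder()
      .master(props.getProperty("sansa.spark.master", "local[*]"))
      .appName(props.getProperty("sansa.spark.app.name", "SANSA_Clustering"))
      .config("spark.serializer", props.getProperty("sansa.spark.serializer"))
      .getOrCreate()

    println("clustering input: " + props.getProperty("sansa.data.input"))
    spark.stop()
  }
}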
java.io.PrintStream -import java.util.ArrayList -import java.util.HashSet -import java.util.Set -import scala.util.Random -import collection.JavaConverters._ +import java.util.{ ArrayList, HashSet, Set } + import scala.collection +import scala.util.Random -import org.semanticweb.owlapi.model.OWLClassExpression -import org.semanticweb.owlapi.model.OWLIndividual -import org.semanticweb.owlapi.model.OWLNamedIndividual +import collection.JavaConverters._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SparkSession +import org.semanticweb.owlapi.model.{ OWLClassExpression, OWLIndividual, OWLNamedIndividual } import net.sansa_stack.ml.spark.classification -import net.sansa_stack.ml.spark.classification.TDTInducer.TDTInducer -import net.sansa_stack.ml.spark.classification.KB.KB import net.sansa_stack.ml.spark.classification.ConceptsGenerator._ - -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.SparkSession -import org.apache.spark.{ SparkConf, SparkContext } +import net.sansa_stack.ml.spark.classification.KB.KB +import net.sansa_stack.ml.spark.classification.TDTInducer.TDTInducer object ClassMembership { @@ -51,11 +46,11 @@ object ClassMembership { println() println(nFolds + "-fold BOOTSTRAP Experiment on ontology: ") - //val classifierClass: Class[_] = ClassLoader.getSystemClassLoader.loadClass(className) + // val classifierClass: Class[_] = ClassLoader.getSystemClassLoader.loadClass(className) val nOfConcepts: Int = if (testConcepts != null) testConcepts.size else 1 - //var Generator: Random = new Random() - //val ntestExs: Array[Int] = Array.ofDim[Int](nFolds) + // var Generator: Random = new Random() + // val ntestExs: Array[Int] = Array.ofDim[Int](nFolds) // main loop on the folds for (f <- 0 until nFolds) { @@ -71,10 +66,10 @@ object ClassMembership { testRDD.foreach(println(_)) val classifier: TDTInducer = new TDTInducer(k, nOfConcepts, spark) - //val classifier: TDTInducer = new TDTInducer(k, kb.Concepts.count().toInt, spark) - /*val cl: TDTInducer = (classifierClass.getConstructor(classOf[KB], classOf[Int])) - .newInstance(kb, nOfConcepts).asInstanceOf[TDTInducer]*/ - //ntestExs(f) = testRDD.count.toInt + // val classifier: TDTInducer = new TDTInducer(k, kb.Concepts.count().toInt, spark) + /* val cl: TDTInducer = (classifierClass.getConstructor(classOf[KB], classOf[Int])) + .newInstance(kb, nOfConcepts).asInstanceOf[TDTInducer] */ + // ntestExs(f) = testRDD.count.toInt // training phase: using all examples but only those in the f-th partition println("\nTraining is starting...") @@ -85,44 +80,6 @@ object ClassMembership { val labels: Array[Array[Int]] = classifier.test(f, testRDD, testConcepts) } // for loop - } //bootstrap function - } //class + } // bootstrap function + } // class } - - - -// for (i<- 0 until allExamples.count.toInt) -// trainRDD.add(allExamples.takeSample(true, 1)(0)) - -// val trainingExsSet: Set[Integer] = new HashSet[Integer]() -// var trainRDD = spark.sparkContext.parallelize(trainingExsSet.asScala.toSeq) -// -// val testingExsSet: Set[Integer] = new HashSet[Integer]() -// var testRDD = spark.sparkContext.parallelize(testingExsSet.asScala.toSeq) -// -// var rand1 = new ArrayList[Integer] -// for (r <- 0 until allExamples.count.toInt) -// rand1.add(Generator.nextInt(allExamples.count.toInt)) -// -// var newRDD = spark.sparkContext.parallelize(rand1.asScala) -// trainRDD.union(newRDD) -// //trainingExsSet.add(Generator.nextInt(allExamples.count.toInt)) -// -// var r = 0 to allExamples.count.toInt -// var rand2 = 
spark.sparkContext.parallelize(r) -// -// if (!trainRDD.collect().contains(rand2)) -// testRDD.union(rand2.asInstanceOf[RDD[Integer]]) - - /*for (r <- 0 until allExamples.count.toInt){ - if (!trainRDD.collect().contains(r)) - testRDD.union(r) - }*/ - - - /*var trainingExs: Array[Integer] = Array.ofDim[Integer](0) - var testExs: Array[Integer] = Array.ofDim[Integer](0) - - trainingExs = trainingExsSet.toArray(trainingExs) - testExs = testingExsSet.toArray(testExs)*/ - \ No newline at end of file diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/ConceptsGenerator.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/ConceptsGenerator.scala index 1acdc06..b3d36ea 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/ConceptsGenerator.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/ConceptsGenerator.scala @@ -1,25 +1,23 @@ package net.sansa_stack.ml.spark.classification import java.util.HashSet + import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession -import org.apache.spark.{SparkConf, SparkContext} import org.semanticweb.HermiT.Reasoner -import org.semanticweb.owlapi.model.OWLClassExpression -import org.semanticweb.owlapi.model.OWLDataFactory -import org.semanticweb.owlapi.model.OWLIndividual -import org.semanticweb.owlapi.model.OWLNamedIndividual +import org.semanticweb.owlapi.model.{ OWLClassExpression, OWLDataFactory, OWLIndividual, OWLNamedIndividual } + import net.sansa_stack.ml.spark.classification.KB.KB -object ConceptsGenerator{ +object ConceptsGenerator { class ConceptsGenerator(protected var kb: KB) { protected var reasoner: Reasoner = kb.getReasoner protected var dataFactory: OWLDataFactory = kb.getDataFactory protected var allExamples: RDD[OWLIndividual] = kb.getIndividuals - + def generateQueryConcepts(numConceptsToGenerate: Int, sc: SparkSession): Array[OWLClassExpression] = { - + println("\nConcepts Generation\n-----------\n") val queryConcept: Array[OWLClassExpression] = Array.ofDim[OWLClassExpression](numConceptsToGenerate) val minOfSubConcepts: Int = 2 @@ -29,7 +27,7 @@ object ConceptsGenerator{ var j: Int = 0 var nextConcept: OWLClassExpression = null var complPartialConcept: OWLClassExpression = null - var nEx : Int = allExamples.count().toInt + var nEx: Int = allExamples.count().toInt // cycle to build numConceptsToGenerate new query concepts i = 0 while (i < numConceptsToGenerate) { @@ -37,56 +35,50 @@ object ConceptsGenerator{ numOfSubConcepts = minOfSubConcepts + KB.generator.nextInt(maxOfSubConcepts - minOfSubConcepts) var numPosInst: Int = 0 var numNegInst: Int = 0 - + // build a single new query OWLClassExpression adding conjuncts or disjuncts do { - - //take the first subConcept for builiding the query OWLClassExpression + + // take the first subConcept for builiding the query OWLClassExpression partialConcept = kb.getRandomConcept - //println("partial concept" + partialConcept) + // println("partial concept" + partialConcept) j = 1 - + while (j < numOfSubConcepts) { val newConcepts: HashSet[OWLClassExpression] = new HashSet[OWLClassExpression]() newConcepts.add(partialConcept) - + nextConcept = kb.getRandomConcept newConcepts.add(nextConcept) - + partialConcept = - if (KB.generator.nextInt(4) == 0) + if (KB.generator.nextInt(4) == 0) { dataFactory.getOWLObjectIntersectionOf(newConcepts) - else dataFactory.getOWLObjectUnionOf(newConcepts) - j+=1 + } else dataFactory.getOWLObjectUnionOf(newConcepts) + j += 1 } // for j - + 
println() complPartialConcept = dataFactory.getOWLObjectComplementOf(partialConcept) - //println("\n", complPartialConcept) + // println("\n", complPartialConcept) numPosInst = reasoner.getInstances(partialConcept, false).entities().count().toInt numNegInst = reasoner.getInstances(complPartialConcept, false).entities().count().toInt - + println(partialConcept) - println ("\n pos: " + numPosInst + ", neg: " + numNegInst + ", und: " + (nEx - numNegInst - numPosInst)) + println("\n pos: " + numPosInst + ", neg: " + numNegInst + ", und: " + (nEx - numNegInst - numPosInst)) println() - } while ((numPosInst < 20) || (numNegInst >3)) - // ((numPosInst < 10) || (numNegInst > 10)) - // (numPosInst * numNegInst == 0) - //add the newly built OWLClassExpression to the list of all required query concepts + } while ((numPosInst < 20) || (numNegInst > 3)) + // ((numPosInst < 10) || (numNegInst > 10)) + // (numPosInst * numNegInst == 0) + // add the newly built OWLClassExpression to the list of all required query concepts queryConcept(i) = partialConcept - println("Query " + (i+1) + " found\n\n") - i+=1 + println("Query " + (i + 1) + " found\n\n") + i += 1 } - - queryConcept + + queryConcept } - + } } - - - /*println("pos:%d (%3.1f)\t\t neg:%d (%3.1f)\t\t und:%d (%3.1f)\n " + numPosInst + numPosInst * 100.0 / nExs, - numNegInst, numNegInst * 100.0 / nExs, - (nExs - numNegInst - numPosInst), - (nExs - numNegInst - numPosInst) * 100.0 / nExs)*/ diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/DLTree.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/DLTree.scala index 47b6338..1144421 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/DLTree.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/DLTree.scala @@ -1,15 +1,14 @@ package net.sansa_stack.ml.spark.classification -import java.util.ArrayList -import java.util.List +import java.util.{ArrayList, List} import collection.JavaConverters._ - -import net.sansa_stack.ml.spark.classification._ -import org.semanticweb.owlapi.model.OWLClassExpression -import org.apache.spark.{ SparkConf, SparkContext } import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession +import org.semanticweb.owlapi.model.OWLClassExpression + +import net.sansa_stack.ml.spark.classification._ + /* * Class for basic functions of DL trees @@ -99,8 +98,7 @@ class DLTree { * function to get the number of nodes */ - /* - def getNodi(sc: SparkSession): Double = { + /* def getNodi(sc: SparkSession): Double = { // visit in to make the count val lista: ArrayList[DLNode] = new ArrayList[DLNode]() @@ -145,10 +143,6 @@ class DLTree { } num } - - - - def getComplexityMeasure(sc: SparkSession) : Double = getNodi(sc)*/ + def getComplexityMeasure(sc: SparkSession) : Double = getNodi(sc) */ } - diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/KB.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/KB.scala index 4cdd1d9..c2ee415 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/KB.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/KB.scala @@ -5,29 +5,29 @@ import java.net.URI import java.util.{ ArrayList, List, Random } import java.util.stream.{ Collectors, IntStream, Stream } -import scala.collection.JavaConversions._ -import collection.JavaConverters._ import scala.collection.{ Iterator, Map } +import scala.collection.JavaConverters._ import 
scala.collection.immutable.{ HashMap, Set } +import collection.JavaConverters._ +import net.sansa_stack.owl.spark.rdd.OWLAxiomsRDD +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SparkSession +import org.semanticweb.HermiT.{ Configuration, Reasoner, ReasonerFactory } import org.semanticweb.owlapi.apibinding.OWLManager import org.semanticweb.owlapi.model._ -import org.semanticweb.owlapi.util.SimpleIRIMapper import org.semanticweb.owlapi.reasoner.{ OWLReasoner, OWLReasonerFactory } import org.semanticweb.owlapi.reasoner.structural.StructuralReasonerFactory +import org.semanticweb.owlapi.util.SimpleIRIMapper -import org.semanticweb.HermiT.{ Configuration, Reasoner, ReasonerFactory } -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.SparkSession -import net.sansa_stack.owl.spark.rdd.OWLAxiomsRDD object KB { val d: Double = 0.3 var generator: Random = new Random(2) - /* - * The class to define the Knowledgebase elements - */ + /** + * The class to define the Knowledgebase elements + */ class KB(var UrlOwlFile: String, rdd: OWLAxiomsRDD, sparkSession: SparkSession) { @@ -82,7 +82,7 @@ object KB { val Concepts2: RDD[OWLClass] = rdd.flatMap { case axiom: HasClassesInSignature => axiom.classesInSignature().iterator().asScala - case _ => null + case _ => null }.filter(_ != null).distinct() Concepts = Concepts2 @@ -96,7 +96,7 @@ object KB { val Roles2: RDD[OWLObjectProperty] = rdd.map { case axiom: HasProperty[OWLObjectProperty] => axiom.getProperty - case _ => null + case _ => null }.filter(_ != null).distinct() Roles = Roles2 @@ -110,7 +110,7 @@ object KB { val Properties2: RDD[OWLDataProperty] = rdd.flatMap { case axiom: HasDataPropertiesInSignature => axiom.dataPropertiesInSignature().iterator().asScala - case _ => null + case _ => null }.filter(_ != null).distinct() Properties = Properties2 @@ -124,7 +124,7 @@ object KB { val Examples2: RDD[OWLNamedIndividual] = rdd.flatMap { case axiom: HasIndividualsInSignature => axiom.individualsInSignature().collect(Collectors.toSet()).asScala - case _ => null + case _ => null }.filter(_ != null).distinct() Examples = Examples2.asInstanceOf[RDD[OWLIndividual]] @@ -163,10 +163,12 @@ object KB { p = p + 1 } else { if (!flag) { - if (r.isEntailed(getDataFactory.getOWLClassAssertionAxiom(negTestConcepts(c), ind))) + if (r.isEntailed(getDataFactory.getOWLClassAssertionAxiom(negTestConcepts(c), ind))) { classifications(c)(e) = -1 - } else + } + } else { classifications(c)(e) = -1 + } n = n + 1 } @@ -231,7 +233,7 @@ object KB { def getReasoner(): Reasoner = hermit - //def getURL(): String = urlOwlFile + // def getURL(): String = urlOwlFile def getRandomProperty(numQueryProperty: Int): Array[Int] = { @@ -277,11 +279,12 @@ object KB { val role: OWLObjectProperty = Roles.takeSample(true, 1)(0) newConcept = - if (KB.generator.nextDouble() < 0.5) + if (KB.generator.nextDouble() < 0.5) { dataFactory.getOWLObjectAllValuesFrom(role, newConceptBase) - else dataFactory.getOWLObjectSomeValuesFrom(role, newConceptBase) - } else + } else dataFactory.getOWLObjectSomeValuesFrom(role, newConceptBase) + } else { newConcept = dataFactory.getOWLObjectComplementOf(newConceptBase) + } } } } while (!reasoner.isSatisfiable(newConcept)) @@ -301,12 +304,13 @@ object KB { val role: OWLObjectProperty = Roles.takeSample(true, 1)(0) newConcept = - if (KB.generator.nextDouble() < d) + if (KB.generator.nextDouble() < d) { dataFactory.getOWLObjectAllValuesFrom(role, newConceptBase) - else dataFactory.getOWLObjectSomeValuesFrom(role, newConceptBase) + } else 
dataFactory.getOWLObjectSomeValuesFrom(role, newConceptBase) } - } else + } else { newConcept = dataFactory.getOWLObjectComplementOf(newConcept) + } } while (!reasoner.isSatisfiable(newConcept)) diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/PerformanceMetrics.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/PerformanceMetrics.scala index 9226568..dd9387f 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/PerformanceMetrics.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/PerformanceMetrics.scala @@ -1,5 +1,4 @@ package net.sansa_stack.ml.spark.classification object PerformanceMetrics { - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/RefinementOperator.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/RefinementOperator.scala index ea1b1bd..2407956 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/RefinementOperator.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/RefinementOperator.scala @@ -6,11 +6,12 @@ import java.util.stream.{ Collectors, Stream } import scala.collection.JavaConverters._ import scala.util.Random +import org.apache.spark.rdd.RDD import org.semanticweb.owlapi.model._ import org.semanticweb.owlapi.search.EntitySearcher + import net.sansa_stack.ml.spark.classification._ import net.sansa_stack.ml.spark.classification.KB.KB -import org.apache.spark.rdd.RDD object RefinementOperator { val d: Double = 0.5 @@ -25,9 +26,9 @@ class RefinementOperator(var kb: KB) { private var Properties: RDD[OWLDataProperty] = kb.getDataProperties private var dataFactory: OWLDataFactory = kb.getDataFactory - /* - * Function to generate subsumed random concepts - */ + /** + * Function to generate subsumed random concepts + */ def getSubsumedRandomConcept(currentConcept: OWLClassExpression): OWLClassExpression = { val generator: Random = new Random() diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TDTClassifiers.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TDTClassifiers.scala index 136e1af..9d5722f 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TDTClassifiers.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TDTClassifiers.scala @@ -1,28 +1,24 @@ package net.sansa_stack.ml.spark.classification -import java.util.ArrayList -import java.util.HashSet -import java.util.Iterator -import java.util.List -import collection.JavaConverters._ +import java.util.{ ArrayList, HashSet, Iterator, List } + import scala.util.control.Breaks._ +import collection.JavaConverters._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SparkSession import org.semanticweb.owlapi.model.OWLClassExpression import org.semanticweb.owlapi.model.OWLDataFactory +import org.semanticweb.owlapi.model.OWLEquivalentClassesAxiom import org.semanticweb.owlapi.model.OWLIndividual -import org.semanticweb.owlapi.model.OWLObjectProperty import org.semanticweb.owlapi.model.OWLObjectAllValuesFrom -import org.semanticweb.owlapi.model.OWLObjectSomeValuesFrom import org.semanticweb.owlapi.model.OWLObjectIntersectionOf -import org.semanticweb.owlapi.model.OWLEquivalentClassesAxiom -//import org.semanticweb.owlapi.model.IRI +import org.semanticweb.owlapi.model.OWLObjectProperty +import 
org.semanticweb.owlapi.model.OWLObjectSomeValuesFrom -import net.sansa_stack.ml.spark.classification -import net.sansa_stack.ml.spark.classification.KB.KB import net.sansa_stack.ml.spark.classification._ +import net.sansa_stack.ml.spark.classification.KB.KB -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.SparkSession /* * Terminological Decision Tree Classifier @@ -30,7 +26,8 @@ import org.apache.spark.sql.SparkSession object TDTClassifiers { - /* L for the left branch and R for the right branch + /** + * L for the left branch and R for the right branch * P, N, U for postive, negative and unlabeled respectively */ @@ -40,7 +37,7 @@ object TDTClassifiers { val PR: Int = 3 val NR: Int = 4 val UR: Int = 5 - + class TDTClassifiers(var k: KB, var sc: SparkSession) { /** @@ -57,12 +54,13 @@ object TDTClassifiers { * @return */ - def induceDLTree(father: OWLClassExpression, - posExs: RDD[String], negExs: RDD[String], undExs: RDD[String], - nRefs: Int, prPos: Double, prNeg: Double): DLTree = { + def induceDLTree( + father: OWLClassExpression, + posExs: RDD[String], negExs: RDD[String], undExs: RDD[String], + nRefs: Int, prPos: Double, prNeg: Double): DLTree = { val THRESHOLD: Double = 0.05 - val tree: DLTree = new DLTree() + val tree: DLTree = new DLTree() if (posExs.count.toInt == 0 && negExs.count.toInt == 0) // There is no examples if (prPos >= prNeg) { // prior majority of positives @@ -80,12 +78,10 @@ object TDTClassifiers { val total = numPos + numNeg var perPos: Double = 0 var perNeg: Double = 0 - if (total !=0){ + if (total != 0) { perPos = numPos / total perNeg = numNeg / total - } - else - return tree + } else return tree println("\nnew per Pos: " + perPos) println("new per Neg: " + perNeg) @@ -94,14 +90,13 @@ object TDTClassifiers { tree.setRoot(k.getDataFactory().getOWLThing) // set positive leaf println("-----\nPostive leaf (prior2)") return tree - } - else if (perPos == 0 && perNeg > THRESHOLD) { // no positive + } else if (perPos == 0 && perNeg > THRESHOLD) { // no positive tree.setRoot(k.getDataFactory().getOWLNothing); // set negative leaf println("-----\nNegative leaf (prior2)\n"); return tree } - // else (a non-leaf node) ... 
+ // else (a non-leaf node) // generate set of concepts val Con: RDD[OWLClassExpression] = generateRefs(k, father, nRefs, posExs, negExs) @@ -109,56 +104,53 @@ object TDTClassifiers { // select best partitioning node concept val bestConcept: OWLClassExpression = selectBestConcept(k, Con, posExs, negExs, undExs, prPos, prNeg) - - if (bestConcept != null){ - - val sNode = split(k, bestConcept, posExs, negExs, undExs) - - // set the root concept - tree.setRoot(bestConcept.getNNF) - - // sNode._1._1 = PosEL, sNode._2._1 = NegEL, sNode._3._1 = undEL - // sNode._1._2 = PosER, sNode._2._2 = NegER, sNode._3._2 = undER - - - // build subtrees - + + if (bestConcept != null) { + + val sNode = split(k, bestConcept, posExs, negExs, undExs) + + // set the root concept + tree.setRoot(bestConcept.getNNF) + + // sNode._1._1 = PosEL, sNode._2._1 = NegEL, sNode._3._1 = undEL + // sNode._1._2 = PosER, sNode._2._2 = NegER, sNode._3._2 = undER + + // build subtrees + println("\nStart Positive tree \n----------") tree.setPosTree(induceDLTree(bestConcept, sNode._1._1, sNode._2._1, sNode._3._1, nRefs, prPos, prNeg)) - + println("\nStart Negative tree \n----------") tree.setNegTree(induceDLTree(bestConcept.getComplementNNF, sNode._1._2, sNode._2._2, sNode._3._2, nRefs, prPos, prNeg)) - return tree - } - else - return null + } else return null } - + /** * recursive down through the tree model * @param ind * @param tree * @return */ - def classify(ind: OWLIndividual, tree: DLTree): Int = { + def classify(ind: OWLIndividual, tree: DLTree): Int = { val rootClass: OWLClassExpression = tree.getRoot println("\nrootClass " + rootClass) - + val negRootClass: OWLClassExpression = k.getDataFactory.getOWLObjectComplementOf(rootClass) println("negRootClass " + negRootClass) - + if (rootClass.equals(k.getDataFactory.getOWLThing)) return +1 if (rootClass.equals(k.getDataFactory.getOWLNothing)) return -1 var r1: Int = 0 var r2: Int = 0 - if (k.getReasoner.isEntailed(k.getDataFactory.getOWLClassAssertionAxiom(rootClass, ind))) + if (k.getReasoner.isEntailed(k.getDataFactory.getOWLClassAssertionAxiom(rootClass, ind))) { r1 = classify(ind, tree.getPosSubTree) - else if (k.getReasoner.isEntailed(k.getDataFactory.getOWLClassAssertionAxiom(negRootClass, ind))) + } else if (k.getReasoner.isEntailed(k.getDataFactory.getOWLClassAssertionAxiom(negRootClass, ind))) { r2 = classify(ind, tree.getNegSubTree) + } var cP: Int = 0 var cn: Int = 0 @@ -169,7 +161,7 @@ object TDTClassifiers { if (missingVForTDT) { cP += classify(ind, tree.getPosSubTree) cn -= classify(ind, tree.getNegSubTree) - + if (cP > (-1 * cn)) return +1 else if (cP < (-1 * cn)) return -1 else return 0 @@ -178,380 +170,367 @@ object TDTClassifiers { else if ((r1 != 0)) r1 else r2 } - - - /** - * @param know - * @param concept - * @param dim - * @param posExs - * @param negExs - * @return - */ - private def generateRefs(know: KB, concept: OWLClassExpression, dim: Int, posExs: RDD[String], - negExs: RDD[String]): RDD[OWLClassExpression] = { - - println("\nGenerating node concepts: \n ") - var rConcepts: Array[OWLClassExpression] = Array.ofDim[OWLClassExpression](dim) - var newConcept: OWLClassExpression = null - var refinement: OWLClassExpression = null - var emptyIntersection: Boolean = false - - //val conceptExp = concept.nestedClassExpressions.iterator().asScala.toArray - val C = concept.asConjunctSet() - val ConceptExp = concept.asConjunctSet().iterator().asScala.toSeq - //println("\nconcept set " + C ) - - for (c <- 0 until dim) { - - do { - emptyIntersection = false //true - 
val Concepts: HashSet[OWLClassExpression] = new HashSet[OWLClassExpression]() - - if (concept.equals(know.getDataFactory().getOWLThing)) - refinement = new RefinementOperator(know).getRandomConcept(know) - else - refinement = new RefinementOperator(know).getSubsumedRandomConcept(concept) - - /* val con: OWLEquivalentClassesAxiom = know.dataFactory.getOWLEquivalentClassesAxiom(concept) + + /** + * @param know + * @param concept + * @param dim + * @param posExs + * @param negExs + * @return + */ + private def generateRefs(know: KB, concept: OWLClassExpression, dim: Int, posExs: RDD[String], + negExs: RDD[String]): RDD[OWLClassExpression] = { + + println("\nGenerating node concepts: \n ") + var rConcepts: Array[OWLClassExpression] = Array.ofDim[OWLClassExpression](dim) + var newConcept: OWLClassExpression = null + var refinement: OWLClassExpression = null + var emptyIntersection: Boolean = false + + // val conceptExp = concept.nestedClassExpressions.iterator().asScala.toArray + val C = concept.asConjunctSet() + val ConceptExp = concept.asConjunctSet().iterator().asScala.toSeq + // println("\nconcept set " + C ) + + for (c <- 0 until dim) { + + do { + emptyIntersection = false // true + val Concepts: HashSet[OWLClassExpression] = new HashSet[OWLClassExpression]() + + if (concept.equals(know.getDataFactory().getOWLThing)) { + refinement = new RefinementOperator(know).getRandomConcept(know) + } else { + refinement = new RefinementOperator(know).getSubsumedRandomConcept(concept) + } + + /* val con: OWLEquivalentClassesAxiom = know.dataFactory.getOWLEquivalentClassesAxiom(concept) val conExp: Array[OWLClassExpression] = con.classExpressions.iterator().asScala.toArray println("Concept Expressions = " ) - conExp.foreach(println(_))*/ - - val refInstance: Boolean = refinement.isInstanceOf[OWLObjectAllValuesFrom] - breakable{ - - for (i <- ConceptExp) - { - if (i.isInstanceOf[OWLObjectSomeValuesFrom]){ - val y: OWLObjectSomeValuesFrom = i.asInstanceOf[OWLObjectSomeValuesFrom] - val conprop: OWLObjectProperty = y.getProperty.getNamedProperty - val confiller : OWLClassExpression = y.getFiller - /*println("============================") + conExp.foreach(println(_)) */ + + val refInstance: Boolean = refinement.isInstanceOf[OWLObjectAllValuesFrom] + breakable { + + for (i <- ConceptExp) { + if (i.isInstanceOf[OWLObjectSomeValuesFrom]) { + val y: OWLObjectSomeValuesFrom = i.asInstanceOf[OWLObjectSomeValuesFrom] + val conprop: OWLObjectProperty = y.getProperty.getNamedProperty + val confiller: OWLClassExpression = y.getFiller + /* println("============================") println("concept property = " + conprop) - println("concept filler = " + confiller)*/ - - if (refInstance){ - val x : OWLObjectAllValuesFrom = refinement.asInstanceOf[OWLObjectAllValuesFrom] - val rprop: OWLObjectProperty = x.getProperty.getNamedProperty - val rfiller: OWLClassExpression = x.getFiller - // println("refienment property = " + rprop) - //println("refienment filler = " + rfiller) - if (conprop == rprop) break - + println("concept filler = " + confiller) */ + + if (refInstance) { + val x: OWLObjectAllValuesFrom = refinement.asInstanceOf[OWLObjectAllValuesFrom] + val rprop: OWLObjectProperty = x.getProperty.getNamedProperty + val rfiller: OWLClassExpression = x.getFiller + // println("refienment property = " + rprop) + // println("refienment filler = " + rfiller) + if (conprop == rprop) break + + } + } + } + if ((!(ConceptExp.contains(refinement)))) { + Concepts.add(concept) + Concepts.add(refinement) + newConcept = 
know.getDataFactory.getOWLObjectIntersectionOf(Concepts) + if (newConcept != null) { + emptyIntersection = !know.getReasoner.isSatisfiable(newConcept) } } } - if ((!(ConceptExp.contains(refinement)))) - { - Concepts.add(concept) - Concepts.add(refinement) - newConcept = know.getDataFactory.getOWLObjectIntersectionOf(Concepts) - if (newConcept != null) - emptyIntersection = !know.getReasoner.isSatisfiable(newConcept) - } - } - - - } while (emptyIntersection ) - - rConcepts(c) = - if (newConcept != null) newConcept - else concept - + + } while (emptyIntersection) + + rConcepts(c) = + if (newConcept != null) newConcept + else concept + + } + var Refs: RDD[OWLClassExpression] = sc.sparkContext.parallelize(rConcepts) + var nRef = Refs.distinct().count.toInt + println("\nNo. of generated concepts: " + nRef) + Refs.distinct() } - var Refs: RDD[OWLClassExpression] = sc.sparkContext.parallelize(rConcepts) - var nRef = Refs.distinct().count.toInt - println("\nNo. of generated concepts: " + nRef) - Refs.distinct() - } - - //val iterator: Iterator[OWLIndividual] = know.getReasoner().getInstances(newConcept, false).entities().iterator().asInstanceOf[Iterator[OWLIndividual]] - //val nextInd : OWLIndividual = iterator.next() - - /** - * Selecting the best in a list (RDD) of refinements - * @param know - * @param concepts - * @param posExs - * @param negExs - * @param undExs - * @param prPos - * @param prNeg - * @return - */ - def selectBestConcept(know: KB, - concepts: RDD[OWLClassExpression], - posExs: RDD[String], - negExs: RDD[String], - undExs: RDD[String], - prPos: Double, prNeg: Double): OWLClassExpression = { + // val iterator: Iterator[OWLIndividual] = know.getReasoner().getInstances(newConcept, false).entities().iterator().asInstanceOf[Iterator[OWLIndividual]] + // val nextInd : OWLIndividual = iterator.next() - var bestConceptIndex: Int = 0 + /** + * Selecting the best in a list (RDD) of refinements + * @param know + * @param concepts + * @param posExs + * @param negExs + * @param undExs + * @param prPos + * @param prNeg + * @return + */ - println("\nThe First concept is: " + concepts.first()) - var counts: Array[Int] = getSplitCounts(know, concepts.first(), posExs, negExs, undExs) + def selectBestConcept( + know: KB, + concepts: RDD[OWLClassExpression], + posExs: RDD[String], + negExs: RDD[String], + undExs: RDD[String], + prPos: Double, prNeg: Double): OWLClassExpression = { + + var bestConceptIndex: Int = 0 - println("\nPL:" +counts(0) +",\t NL:" + counts(1) + ",\t UL:" + counts(2) + ",\tPR:" + counts(3) + + println("\nThe First concept is: " + concepts.first()) + var counts: Array[Int] = getSplitCounts(know, concepts.first(), posExs, negExs, undExs) + + println("\nPL:" + counts(0) + ",\t NL:" + counts(1) + ",\t UL:" + counts(2) + ",\tPR:" + counts(3) + ",\tNR:" + counts(4) + ",\tUR:" + counts(5)) - //var bestGain: Double = gain(counts, prPos, prNeg) - var bestGain: Double = gain(counts) - println("\nCurrent gain: "+ bestGain) + // var bestGain: Double = gain(counts, prPos, prNeg) + var bestGain: Double = gain(counts) + println("\nCurrent gain: " + bestGain) - for (c <- 1 until concepts.count.toInt) { + for (c <- 1 until concepts.count.toInt) { - var nConcept = concepts.take(concepts.count.toInt).apply(c) - println("\nConcept " + (c+1) +" is: " + nConcept) + var nConcept = concepts.take(concepts.count.toInt).apply(c) + println("\nConcept " + (c + 1) + " is: " + nConcept) - counts = getSplitCounts(know, nConcept, posExs, negExs, undExs) - println("\nPL:" +counts(0) +",\t NL:" + counts(1) + 
",\t UL:" + counts(2) + ",\tPR:" + counts(3) + - ",\tNR:" + counts(4) + ",\tUR:" + counts(5)) + counts = getSplitCounts(know, nConcept, posExs, negExs, undExs) + println("\nPL:" + counts(0) + ",\t NL:" + counts(1) + ",\t UL:" + counts(2) + ",\tPR:" + counts(3) + + ",\tNR:" + counts(4) + ",\tUR:" + counts(5)) - //var thisGain: Double = gain(counts, prPos, prNeg) - var thisGain: Double = gain(counts) - println("\nCurrent gain: " + thisGain) + // var thisGain: Double = gain(counts, prPos, prNeg) + var thisGain: Double = gain(counts) + println("\nCurrent gain: " + thisGain) - if (thisGain > bestGain) { - bestConceptIndex = c - bestGain = thisGain + if (thisGain > bestGain) { + bestConceptIndex = c + bestGain = thisGain + } + } + + val nCpt = concepts.take(concepts.count.toInt).apply(bestConceptIndex) + + if (bestGain == 0.0) { + null + // val parts = nCpt.nestedClassExpressions.iterator().asScala.toList + // val ref = parts.last + // val x = parts.filterNot(elem => elem == ref) + // println("refienment removed: ") + // x.foreach(println(_)) + // var y: ArrayList[OWLClassExpression] = new ArrayList() + // var i = 0 + // while (i< x.size) + // { + // val z = x.get(i) + // y.add(z) + // i = i+1 + // } + // + // nCpt + } else { + println("\n --------\nBest gain: " + bestGain + " \t Split index: " + bestConceptIndex) + println("\nPL:" + counts(0) + ",\t NL:" + counts(1) + ",\t UL:" + counts(2) + ",\tPR:" + counts(3) + + ",\tNR:" + counts(4) + ",\tUR:" + counts(5)) + + println("\n Best concept is: " + nCpt) + nCpt } } - - val nCpt = concepts.take(concepts.count.toInt).apply(bestConceptIndex) - - if (bestGain == 0.0) { - null -// val parts = nCpt.nestedClassExpressions.iterator().asScala.toList -// val ref = parts.last -// val x = parts.filterNot(elem => elem == ref) -// println("refienment removed: ") -// x.foreach(println(_)) -// var y: ArrayList[OWLClassExpression] = new ArrayList() -// var i = 0 -// while (i< x.size) -// { -// val z = x.get(i) -// y.add(z) -// i = i+1 -// } -// -// nCpt - } - else { - println("\n --------\nBest gain: " + bestGain + " \t Split index: " + bestConceptIndex) - println("\nPL:" +counts(0) +",\t NL:" + counts(1) + ",\t UL:" + counts(2) + ",\tPR:" + counts(3) + - ",\tNR:" + counts(4) + ",\tUR:" + counts(5)) - - println("\n Best concept is: " + nCpt) - nCpt - } - } - /** - * @param counts - * @return The calculated Gain - */ + /** + * @param counts + * @return The calculated Gain + */ - /* - * Function to calculate the gain - */ - - private def gain(counts: Array[Int]): Double = { - - var gain: Double = 0.0 - val totalL: Double = counts(PL) + counts(NL) + 0.001 - val totalR: Double = counts(PR) + counts(NR) + 0.001 - val total: Double = totalL + totalR - val pPL: Double = counts(PL) / totalL - val pPR: Double = counts(PR) / totalR - val pNL: Double = counts(NL) / totalL - val pNR: Double = counts(NR) / totalR - - if (Math.abs(pPL + pPR) != 0 && Math.abs(pNL + pNR) != 0 ) - { - gain = (totalL / total) * (totalR / total) * + /** + * Function to calculate the gain + */ + private def gain(counts: Array[Int]): Double = { + + var gain: Double = 0.0 + val totalL: Double = counts(PL) + counts(NL) + 0.001 + val totalR: Double = counts(PR) + counts(NR) + 0.001 + val total: Double = totalL + totalR + val pPL: Double = counts(PL) / totalL + val pPR: Double = counts(PR) / totalR + val pNL: Double = counts(NL) / totalL + val pNR: Double = counts(NR) / totalR + + if (Math.abs(pPL + pPR) != 0 && Math.abs(pNL + pNR) != 0) { + gain = (totalL / total) * (totalR / total) * 
Math.pow(Math.abs(pPL - pPR) / Math.abs(pPL + pPR) + Math.abs(pNL - pNR) / Math.abs(pNL + pNR), 2) + } + gain + } - - gain - - } - - - /** - * @param counts - * @param prPos - * @param prNeg - * @return The calculated Gain - */ - /* - * Function to calculate the gain based on gini index - */ + /** + * @param counts + * @param prPos + * @param prNeg + * @return The calculated Gain + */ + + /** + * Function to calculate the gain based on gini index + */ + + /* def gain(counts: Array[Int], prPos: Double, prNeg: Double): Double = { - /* def gain(counts: Array[Int], prPos: Double, prNeg: Double): Double = { - val Trsize: Double = counts(0) + counts(1) val Flsize: Double = counts(3) + counts(4) val Usize: Double = counts(2) + counts(5)// + counts(6) + counts(7) - + val size: Double = Trsize + Flsize + Usize - - val startImpurity : Double = gini(counts(0) + counts(3), counts(1) + counts(4), prPos, prNeg) - + val startImpurity : Double = gini(counts(0) + counts(3), counts(1) + counts(4), prPos, prNeg) val TrImpurity = gini(counts(0), counts(1), prPos, prNeg) val FlImpurity = gini(counts(3), counts(4), prPos, prNeg) val UImpurity = gini(counts(2) , counts(5), prPos, prNeg) //counts(2)+ counts(6), counts(5) + counts(7) - + val Gainval = startImpurity - (Trsize/size)*TrImpurity - (Flsize/size)*FlImpurity - -(Usize/size)*UImpurity - + Gainval } - + def gini(nPos: Double, nNeg: Double, prPos: Double, prNeg: Double): Double = { - + val estimatProp : Int = 3 val total: Double = nPos + nNeg - + val p1 : Double = (nPos*estimatProp*prPos)/(total+estimatProp) val p2: Double = (nNeg*estimatProp*prNeg)/(total+estimatProp) - + val ginival = 1.0-p1*p1-p2*p2 ginival - }*/ - - - - /** - * @param know - * @param concept - * @param posExs - * @param negExs - * @param undExs - * @return - */ + } */ - private def getSplitCounts(know: KB, - concept: OWLClassExpression, - posExs: RDD[String], - negExs: RDD[String], - undExs: RDD[String]): Array[Int] = { - - val counts: Array[Int] = Array.ofDim[Int](6) - - val Pos = splitGroup(know, concept, posExs) - val Neg = splitGroup(know, concept, negExs) - val Und = splitGroup(know, concept, undExs) - - counts(PL) = Pos._1.count.toInt - counts(NL) = Neg._1.count.toInt - counts(UL) = Und._1.count.toInt - counts(PR) = Pos._2.count.toInt - counts(NR) = Neg._2.count.toInt - counts(UR) = Und._2.count.toInt - - counts - } + /** + * @param know + * @param concept + * @param posExs + * @param negExs + * @param undExs + * @return + */ - /** - * @param know - * @param concept - * @param nodeExamples - * @param leftExs - * @param rightExs - */ - private def splitGroup(know: KB, - concept: OWLClassExpression, - nodeExamples: RDD[String]): (RDD[String], RDD[String]) = { + private def getSplitCounts( + know: KB, + concept: OWLClassExpression, + posExs: RDD[String], + negExs: RDD[String], + undExs: RDD[String]): Array[Int] = { + + val counts: Array[Int] = Array.ofDim[Int](6) - /*println("\nNode examples: \n ----------") - nodeExamples.take(nodeExamples.count.toInt).foreach(println(_))*/ + val Pos = splitGroup(know, concept, posExs) + val Neg = splitGroup(know, concept, negExs) + val Und = splitGroup(know, concept, undExs) - val negConcept: OWLClassExpression = know.getDataFactory.getOWLObjectComplementOf(concept) - - var Left = new ArrayList[String]() - var Right = new ArrayList[String]() + counts(PL) = Pos._1.count.toInt + counts(NL) = Neg._1.count.toInt + counts(UL) = Und._1.count.toInt + counts(PR) = Pos._2.count.toInt + counts(NR) = Neg._2.count.toInt + counts(UR) = Und._2.count.toInt - for 
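The purity-gain criterion above weights the left/right partitions by their size and squares the normalised difference in positive and negative proportions between the two sides. A minimal standalone sketch of the same formula, assuming the PL, NL, UL, PR, NR, UR index order shown in the println statements above; the example counts in main are made up for illustration:

object GainSketch {
  // Same split criterion as gain(counts) in TDTClassifiers (sketch, not the original code).
  def gain(counts: Array[Int]): Double = {
    val totalL = counts(0) + counts(1) + 0.001 // positives + negatives covered by the concept
    val totalR = counts(3) + counts(4) + 0.001 // positives + negatives covered by its complement
    val total = totalL + totalR
    val (pPL, pNL) = (counts(0) / totalL, counts(1) / totalL)
    val (pPR, pNR) = (counts(3) / totalR, counts(4) / totalR)
    if (math.abs(pPL + pPR) == 0 || math.abs(pNL + pNR) == 0) 0.0
    else (totalL / total) * (totalR / total) *
      math.pow(math.abs(pPL - pPR) / math.abs(pPL + pPR) +
               math.abs(pNL - pNR) / math.abs(pNL + pNR), 2)
  }

  def main(args: Array[String]): Unit = {
    println(gain(Array(4, 0, 0, 0, 3, 0))) // ~0.98: the refinement cleanly separates positives from negatives
    println(gain(Array(2, 2, 0, 2, 1, 0))) // ~0.03: the refinement leaves the examples mixed
  }
}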
(e <- 0 until nodeExamples.count.toInt) { + counts + } + + /** + * @param know + * @param concept + * @param nodeExamples + * @param leftExs + * @param rightExs + */ + private def splitGroup( + know: KB, + concept: OWLClassExpression, + nodeExamples: RDD[String]): (RDD[String], RDD[String]) = { + + /* println("\nNode examples: \n ----------") + nodeExamples.take(nodeExamples.count.toInt).foreach(println(_)) */ - val nodeEx = nodeExamples.take(e + 1).apply(e) - val nodeInd = know.getDataFactory().getOWLNamedIndividual(nodeEx).asInstanceOf[OWLIndividual] + val negConcept: OWLClassExpression = know.getDataFactory.getOWLObjectComplementOf(concept) - if (know.getReasoner().isEntailed(know.getDataFactory.getOWLClassAssertionAxiom(concept, nodeInd))) { + var Left = new ArrayList[String]() + var Right = new ArrayList[String]() + + for (e <- 0 until nodeExamples.count.toInt) { + + val nodeEx = nodeExamples.take(e + 1).apply(e) + val nodeInd = know.getDataFactory().getOWLNamedIndividual(nodeEx).asInstanceOf[OWLIndividual] + + if (know.getReasoner().isEntailed(know.getDataFactory.getOWLClassAssertionAxiom(concept, nodeInd))) { Left.add(nodeEx) - - } else if (know.getReasoner().isEntailed(know.getDataFactory.getOWLClassAssertionAxiom(negConcept, nodeInd))) { + + } else if (know.getReasoner().isEntailed(know.getDataFactory.getOWLClassAssertionAxiom(negConcept, nodeInd))) { Right.add(nodeEx) - - } else { + + } else { Left.add(nodeEx) Right.add(nodeEx) + } } - } - val leftRDD = sc.sparkContext.parallelize(Left.asScala) - val rightRDD = sc.sparkContext.parallelize(Right.asScala) + val leftRDD = sc.sparkContext.parallelize(Left.asScala) + val rightRDD = sc.sparkContext.parallelize(Right.asScala) - /*println("\nleft ex: ") + /* println("\nleft ex: ") leftRDD.take(20).foreach(println(_)) println("\nright ex: ") - rightRDD.take(20).foreach(println(_))*/ - - (leftRDD, rightRDD) - - - //val propName: RDD[String] = know.getIndividuals().map( ind => ind.asOWLNamedIndividual().getIRI.getShortForm) - // println("\n nodeEx = " + nodeEx ) - //val Filtered = know.getIndividuals().filter(_ == nodeInd) + rightRDD.take(20).foreach(println(_)) */ + + (leftRDD, rightRDD) + + // val propName: RDD[String] = know.getIndividuals().map( ind => ind.asOWLNamedIndividual().getIRI.getShortForm) + // println("\n nodeEx = " + nodeEx ) + // val Filtered = know.getIndividuals().filter(_ == nodeInd) // println("\n filtered = " ) // Filtered.take(10).foreach(println(_)) - //val exIndex = ex.lookup(e) + // val exIndex = ex.lookup(e) // println("the element: ") - //exInd.take(1).foreach(println(_)) - //val ind = know.getDataFactory().getOWLNamedIndividual(IRI.create(nodeEx)).asInstanceOf[OWLIndividual] - //println("newexample " + ind ) + // exInd.take(1).foreach(println(_)) + // val ind = know.getDataFactory().getOWLNamedIndividual(IRI.create(nodeEx)).asInstanceOf[OWLIndividual] + // println("newexample " + ind ) - //val x = know.getIndividuals().take(nodeExamples.count.toInt).apply(e) - //val x = know.getIndividuals().filter( _ == neew).first() + // val x = know.getIndividuals().take(nodeExamples.count.toInt).apply(e) + // val x = know.getIndividuals().filter( _ == neew).first() - //x.take(20).foreach(println(_)) + // x.take(20).foreach(println(_)) - //val r =know.getReasoner().isEntailed(know.getDataFactory.getOWLClassAssertionAxiom(concept, ind)) - //println("\n r = " + r) + // val r =know.getReasoner().isEntailed(know.getDataFactory.getOWLClassAssertionAxiom(concept, ind)) + // println("\n r = " + r) // val l 
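splitGroup above routes each example three ways: to the left partition when the concept is entailed for it, to the right when its complement is entailed, and to both partitions when the reasoner can decide neither (the uncertain case). A minimal sketch of that routing rule with the two entailment checks abstracted as predicates; the names are illustrative and not part of the original API:

// Sketch of the three-way routing in splitGroup: concept entailed -> left only,
// complement entailed -> right only, undecided -> both sides.
def threeWaySplit[A](examples: Seq[A],
                     inConcept: A => Boolean,
                     inComplement: A => Boolean): (Seq[A], Seq[A]) = {
  val routed = examples.map { e =>
    if (inConcept(e)) (Seq(e), Seq.empty[A])
    else if (inComplement(e)) (Seq.empty[A], Seq(e))
    else (Seq(e), Seq(e))
  }
  (routed.flatMap(_._1), routed.flatMap(_._2))
}

// threeWaySplit(Seq("a", "b", "c"), Set("a"), Set("b")) == (Seq("a", "c"), Seq("b", "c"))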
=know.getReasoner().isEntailed(know.getDataFactory.getOWLClassAssertionAxiom(negConcept, ind)) - //println("\n l = " + l) - } - - /** - * @param know - * @param concept - * @param posExs - * @param negExs - * @param undExs - */ - - private def split(know: KB, - concept: OWLClassExpression, - posExs: RDD[String], negExs: RDD[String], undExs: RDD[String]): - ((RDD[String], RDD[String]), (RDD[String], RDD[String]), (RDD[String], RDD[String])) = { + // println("\n l = " + l) + } - val Pos = splitGroup(know, concept, posExs) - val Neg = splitGroup(know, concept, negExs) - val Und = splitGroup(know, concept, undExs) - - (Pos, Neg, Und) - } + /** + * @param know + * @param concept + * @param posExs + * @param negExs + * @param undExs + */ - }//class + private def split( + know: KB, + concept: OWLClassExpression, + posExs: RDD[String], negExs: RDD[String], undExs: RDD[String]): ((RDD[String], RDD[String]), (RDD[String], RDD[String]), (RDD[String], RDD[String])) = { - + val Pos = splitGroup(know, concept, posExs) + val Neg = splitGroup(know, concept, negExs) + val Und = splitGroup(know, concept, undExs) + (Pos, Neg, Und) + } + } // class /** * Selecting the best in a list (RDD) of refinements using Entropy calculations @@ -567,7 +546,7 @@ object TDTClassifiers { * @return */ - /* def selectBestConceptEntropy(know: KB, concepts: RDD[OWLClassExpression], + /* def selectBestConceptEntropy(know: KB, concepts: RDD[OWLClassExpression], posExs: RDD[String], negExs: RDD[String], undExs: RDD[String], @@ -607,13 +586,13 @@ object TDTClassifiers { val nCpt = n.lookup(bestConceptIndex).asInstanceOf[OWLClassExpression] println("\n %s\n\n", nCpt) nCpt - }*/ + } */ - /* + /** * Function to calculate the Entropy value */ - /* def Entropy(counts: Array[Int], prPos: Double, prNeg: Double, sizPos: Int, sizNeg: Int): Double = { + /* def Entropy(counts: Array[Int], prPos: Double, prNeg: Double, sizPos: Int, sizNeg: Int): Double = { val nP = counts(0) + counts(1) val nN = counts(3) + counts(4) val nU = counts(2) + counts(5) + counts(6) + counts(7) @@ -654,6 +633,6 @@ object TDTClassifiers { - (2 - p1 - p2) * (p1 * Math.log(p1) - p2 * Math.log(p2))) EntropyValue - }*/ + } */ -}//object +}// object diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TDTInducer.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TDTInducer.scala index e3cec25..a1873be 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TDTInducer.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TDTInducer.scala @@ -1,21 +1,15 @@ package net.sansa_stack.ml.spark.classification import java.io.PrintStream -import java.util.ArrayList -import java.util.List -import java.util.Arrays -import java.util.HashSet -import collection.JavaConverters._ -import scala.collection +import java.util.{ ArrayList, Arrays, HashSet, List } -import org.semanticweb.owlapi.model.OWLClassExpression -import org.semanticweb.owlapi.model.OWLIndividual -import org.semanticweb.owlapi.model.OWLNamedIndividual -import org.semanticweb.HermiT.Reasoner +import scala.collection +import collection.JavaConverters._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession -import org.apache.spark.{SparkConf, SparkContext} +import org.semanticweb.HermiT.Reasoner +import org.semanticweb.owlapi.model.{ OWLClassExpression, OWLIndividual, OWLNamedIndividual } import net.sansa_stack.ml.spark.classification._ import net.sansa_stack.ml.spark.classification.KB.KB @@ 
-26,151 +20,147 @@ import net.sansa_stack.ml.spark.classification.TDTClassifiers.TDTClassifiers */ object TDTInducer { - var stream: PrintStream = _ - -class TDTInducer(var kb: KB, var nConcepts: Int, var sc: SparkSession) { + var stream: PrintStream = _ -//for each query concept induce an ensemble - var trees: Array[DLTree] = new Array[DLTree](nConcepts) + class TDTInducer(var kb: KB, var nConcepts: Int, var sc: SparkSession) { - var cl: TDTClassifiers = new TDTClassifiers(kb, sc) + // for each query concept induce an ensemble + var trees: Array[DLTree] = new Array[DLTree](nConcepts) - - - /* - * Function for training the algorithm - */ - def training(results: Array[Array[Int]], trainingExs: RDD[OWLIndividual], - testConcepts: Array[OWLClassExpression], - negTestConcepts: Array[OWLClassExpression]): Unit = { - - val op: RefinementOperator = new RefinementOperator(kb) - val reasoner: Reasoner = kb.getReasoner - val allExamples: RDD[OWLIndividual] = kb.getIndividuals - - //val trainingExsSet: HashSet[Integer] = new HashSet[Integer](Arrays.asList(trainingExs: _*)) - - val length: Int = if (testConcepts != null) testConcepts.size else 1 - - for (c <- 0 until length) { - - println("\n--- Query Concept # " + (c+1)) - - // These instances should be divided into negative instances, positive and uncertain - // split._1 = posExs, split._2 = negExs, split._3 = undExs - val split = splitting(trainingExs, results, c) - - var prPos: Double = split._1.count.toDouble / (trainingExs.count.toInt) - var prNeg: Double = split._2.count.toDouble / (trainingExs.count.toInt) - println("Training set composition: " + split._1.count() + " - " + split._2.count() + " - " + split._3.count()) - - val Sum: Double = prPos + prNeg - if (Sum == 0) { - prPos = 0.5 - prNeg = 0.5 - } else { - prPos = prPos / Sum - prNeg = prNeg / Sum - } - println("\nNew learning problem prepared "+ (c+1)) - println("Learning a tree ") - trees(c) = cl.induceDLTree(kb.getDataFactory.getOWLThing, split._1, split._2, split._3, 50, prPos, prNeg) + var cl: TDTClassifiers = new TDTClassifiers(kb, sc) + + /** + * Function for training the algorithm + */ + def training(results: Array[Array[Int]], trainingExs: RDD[OWLIndividual], + testConcepts: Array[OWLClassExpression], + negTestConcepts: Array[OWLClassExpression]): Unit = { + + val op: RefinementOperator = new RefinementOperator(kb) + val reasoner: Reasoner = kb.getReasoner + val allExamples: RDD[OWLIndividual] = kb.getIndividuals + // val trainingExsSet: HashSet[Integer] = new HashSet[Integer](Arrays.asList(trainingExs: _*)) + + val length: Int = if (testConcepts != null) testConcepts.size else 1 + + for (c <- 0 until length) { + + println("\n--- Query Concept # " + (c + 1)) + + // These instances should be divided into negative instances, positive and uncertain + // split._1 = posExs, split._2 = negExs, split._3 = undExs + val split = splitting(trainingExs, results, c) + + var prPos: Double = split._1.count.toDouble / (trainingExs.count.toInt) + var prNeg: Double = split._2.count.toDouble / (trainingExs.count.toInt) + println("Training set composition: " + split._1.count() + " - " + split._2.count() + " - " + split._3.count()) + + val Sum: Double = prPos + prNeg + if (Sum == 0) { + prPos = 0.5 + prNeg = 0.5 + } else { + prPos = prPos / Sum + prNeg = prNeg / Sum + } + println("\nNew learning problem prepared " + (c + 1)) + println("Learning a tree ") + trees(c) = cl.induceDLTree(kb.getDataFactory.getOWLThing, split._1, split._2, split._3, 50, prPos, prNeg) + + } } - } - - /* + + /* * Function for 
testing the algorithm */ - def test (f: Int, testExs: RDD[OWLIndividual], testConcepts: Array[OWLClassExpression]): Array[Array[Int]] = { - - // classifier answers for each example and for each concept - val labels: Array[Array[Int]] = Array.ofDim[Int](testExs.count.toInt, nConcepts) - - for (t <- 0 until testExs.count.toInt) { - val indTestEx = testExs.take(t+1).apply(t) - println("\n\nFold #" + (f+1)) - println(" ---\n Classifying Example " + (t+1) + "/" + testExs.count.toInt + " [" + indTestEx + "] ") - - //labels(t) = Array.ofDim[Int](nConcepts) - - - for (i <- 0 until nConcepts - 1) { - labels(t)(i) = cl.classify(indTestEx, trees(i)) + def test(f: Int, testExs: RDD[OWLIndividual], testConcepts: Array[OWLClassExpression]): Array[Array[Int]] = { + + // classifier answers for each example and for each concept + val labels: Array[Array[Int]] = Array.ofDim[Int](testExs.count.toInt, nConcepts) + + for (t <- 0 until testExs.count.toInt) { + val indTestEx = testExs.take(t + 1).apply(t) + println("\n\nFold #" + (f + 1)) + println(" ---\n Classifying Example " + (t + 1) + "/" + testExs.count.toInt + " [" + indTestEx + "] ") + + // labels(t) = Array.ofDim[Int](nConcepts) + + for (i <- 0 until nConcepts - 1) { + labels(t)(i) = cl.classify(indTestEx, trees(i)) + } } + labels } - labels - } - /* + /* * Function for splitting the training examples into positive, negative and undefined examples */ - - def splitting(trainingExs: RDD[OWLIndividual], classifications: Array[Array[Int]], c: Int): (RDD[String],RDD[String],RDD[String]) = { - - var BINARYCLASSIFICATION : Boolean = false -// var classRDD = sc.sparkContext.parallelize(classifications,2) -// var pos = classRDD.filter(_ == +1) - - var pos = new ArrayList[String]() - var neg = new ArrayList[String]() - var und = new ArrayList[String]() - var TExs = trainingExs.zipWithIndex() - - for (i <-0 until trainingExs.count.toInt){ - - val trainValue = trainingExs.take(i+1).apply(i) - //var trainIndex = TExs.lookup(trainValue) - //println("\nvalue : " + trainValue) - val trainIndex = trainingExs.take(trainingExs.count.toInt).indexOf(trainValue) - // println("index : " + trainIndex) - -/* var p = trainingExs.filter{ exs => + + def splitting(trainingExs: RDD[OWLIndividual], classifications: Array[Array[Int]], c: Int): (RDD[String], RDD[String], RDD[String]) = { + + var BINARYCLASSIFICATION: Boolean = false + // var classRDD = sc.sparkContext.parallelize(classifications,2) + // var pos = classRDD.filter(_ == +1) + + var pos = new ArrayList[String]() + var neg = new ArrayList[String]() + var und = new ArrayList[String]() + var TExs = trainingExs.zipWithIndex() + + for (i <- 0 until trainingExs.count.toInt) { + + val trainValue = trainingExs.take(i + 1).apply(i) + // var trainIndex = TExs.lookup(trainValue) + // println("\nvalue : " + trainValue) + val trainIndex = trainingExs.take(trainingExs.count.toInt).indexOf(trainValue) + // println("index : " + trainIndex) + + /* var p = trainingExs.filter{ exs => val v = exs.toString() - - }*/ - - if (trainIndex != -1){ - val value = trainValue.toString() - if (classifications(c)(trainIndex) == +1) + + } */ + + if (trainIndex != -1) { + val value = trainValue.toString() + if (classifications(c)(trainIndex) == +1) { pos.add(value) - else if (!BINARYCLASSIFICATION) { - if (classifications(c)(trainIndex) == -1) + } else if (!BINARYCLASSIFICATION) { + if (classifications(c)(trainIndex) == -1) { + neg.add(value) + } else { + und.add(value) + } + } else { neg.add(value) - else - und.add(value) + } + } } - else - neg.add(value) 
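The splitting step above assigns each training individual to the positive, negative, or uncertain set according to the classification matrix: +1 means positive, -1 negative, anything else uncertain, and when binary classification is forced every non-positive label collapses to negative. A compact, illustrative restatement of that labelling rule (hypothetical helper, not part of the original code):

// Labelling rule used by splitting(...): sketch only.
def label(classification: Int, binary: Boolean): String =
  classification match {
    case 1                 => "positive"
    case -1 if !binary     => "negative"
    case _ if !binary      => "uncertain"
    case _                 => "negative" // binary mode: everything non-positive is negative
  }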
- } - } - var posExs = sc.sparkContext.parallelize(pos.asScala) - var negExs = sc.sparkContext.parallelize(neg.asScala) - var undExs = sc.sparkContext.parallelize(und.asScala) - - (posExs, negExs, undExs) - } -// val TList : List[Integer]= new ArrayList[Integer] -// var T = sc.sparkContext.parallelize(TList.asScala) -// -// var TExs = trainingExs.zipWithIndex() -// for (e <- 0 until trainingExs.count.toInt) { -// -// var index = TExs.lookup(e) -// T.union(index) - //val Train = sc.sparkContext.parallelize(T.asScala) - - /*if (classifications(c)(TExs.lookup(e)) == +1) posExs.union(T) + var posExs = sc.sparkContext.parallelize(pos.asScala) + var negExs = sc.sparkContext.parallelize(neg.asScala) + var undExs = sc.sparkContext.parallelize(und.asScala) + + (posExs, negExs, undExs) + } + // val TList : List[Integer]= new ArrayList[Integer] + // var T = sc.sparkContext.parallelize(TList.asScala) + // + // var TExs = trainingExs.zipWithIndex() + // for (e <- 0 until trainingExs.count.toInt) { + // + // var index = TExs.lookup(e) + // T.union(index) + // val Train = sc.sparkContext.parallelize(T.asScala) + + /* if (classifications(c)(TExs.lookup(e)) == +1) posExs.union(T) else if (!BINARYCLASSIFICATION) { if (classifications(c)(TExs.lookup(e)) == -1) negExs.union(T) else undExs.union(T) - } else negExs.union(T)*/ - //} + } else negExs.union(T) */ + // } - - - /* def getComplexityValues(sc: SparkSession): Array[Double] = { + /* def getComplexityValues(sc: SparkSession): Array[Double] = { // a measure to express the model complexity (e.g. the number of nodes in a tree) val complexityValue: Array[Double] = Array.ofDim[Double](trees.length) @@ -179,7 +169,6 @@ class TDTInducer(var kb: KB, var nConcepts: Int, var sc: SparkSession) { complexityValue(i) = current } complexityValue - }*/ - + } */ } -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TermDecisionTrees.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TermDecisionTrees.scala index 71f23f7..2a6dbcb 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TermDecisionTrees.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TermDecisionTrees.scala @@ -1,30 +1,30 @@ -package net.sansa_stack.ml.spark.classification +package net.sansa_stack.ml.spark.classification import java.util.ArrayList -import scala.reflect.runtime.universe._ + import scala.collection.JavaConverters._ +import scala.reflect.runtime.universe._ + +import org.apache.log4j.{ Level, Logger } +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SparkSession import org.semanticweb.owlapi.model.OWLClassExpression import org.semanticweb.owlapi.model.OWLIndividual +import scopt.OptionParser -import net.sansa_stack.ml.spark.classification.KB.KB import net.sansa_stack.ml.spark.classification.ClassMembership.ClassMembership +import net.sansa_stack.ml.spark.classification.KB.KB import net.sansa_stack.ml.spark.classification.TDTClassifiers.TDTClassifiers - import net.sansa_stack.owl.spark.rdd.FunctionalSyntaxOWLAxiomsRDDBuilder import net.sansa_stack.owl.spark.rdd.OWLAxiomsRDD -import scopt.OptionParser -import org.apache.log4j.{Level, Logger} -import org.apache.spark.sql.SparkSession -import org.apache.spark.rdd.RDD - object TermDecisionTrees { - /* - * The main file to call Terminological Decision Trees for Classification - */ + /** + * The main file to call Terminological Decision Trees for Classification + */ - def main(args: 
Array[String]) = { + def main(args: Array[String]): Unit = { val input = "src/main/resources/Classification/trains.owl" @@ -38,61 +38,59 @@ object TermDecisionTrees { .config("spark.kryo.registrator", "net.sansa_stack.ml.spark.classification.Registrator") .appName("Termnological Decision Tree") .getOrCreate() - - //Call owl axion builder to read the classes and object properties and print - - val rdd : OWLAxiomsRDD = FunctionalSyntaxOWLAxiomsRDDBuilder.build(sparkSession, input) - + + // Call owl axion builder to read the classes and object properties and print + + val rdd: OWLAxiomsRDD = FunctionalSyntaxOWLAxiomsRDDBuilder.build(sparkSession, input) + val kb: KB = new KB(input, rdd, sparkSession) var ClassM = new ClassMembership(kb, sparkSession) val ClassName = TDTInducer.toString() ClassM.bootstrap(10, ClassName, sparkSession) - //val c : TDTInducer = new TDTInducer(kb, kb.Concepts.count().toInt, sparkSession) - -// var PosExamples = sparkSession.sparkContext.parallelize(Array("http://example.com/foo#east1", -// "http://example.com/foo#east2", -// "http://example.com/foo#east3", -// "http://example.com/foo#east4", -// "http://example.com/foo#east5")) -// -// var NegExamples = sparkSession.sparkContext.parallelize(Array("http://example.com/foo#west6", -// "http://example.com/foo#west7", -// "http://example.com/foo#west8", -// "http://example.com/foo#west9", -// "http://example.com/foo#west10")) -// -// var UndExamples = sparkSession.sparkContext.parallelize(new ArrayList[String]().asScala) -// -// val numPos: Double = PosExamples.count -// val numNeg: Double = NegExamples.count -// val perPos: Double = numPos / (numPos + numNeg) -// val perNeg: Double = numNeg / (numPos + numNeg) -// -// println("\nLearning problem: \n --------------------\n") -// println("No. of Positive examples: " + PosExamples.count) -// println("No. of Negative examples: " + NegExamples.count) -// println("No. 
of Undefined examples: " + UndExamples.count) -// println("\nper Pos: " + perPos) -// println("per Neg: " + perNeg) -// -// val nGeneratedRef: Int = 50 -// -// val c : TDTClassifiers = new TDTClassifiers (kb, sparkSession) -// val tree : DLTree = c.induceDLTree(kb.getDataFactory.getOWLThing, PosExamples, NegExamples, UndExamples, nGeneratedRef, perPos, perNeg) -// -// val Root: OWLClassExpression = tree.getRoot() -// println("\nRoot of the tree is: " + Root) - - /*val possubtree = tree.getPosSubTree().toString() - println("possubtree: " + possubtree)*/ - - //val ind = kb.getDataFactory().getOWLNamedIndividual("http://example.com/foo#east2") - //val classification : Int = c.classify(ind, tree) - //println("\nclassification of east2 is " + classification) - - sparkSession.stop + // val c : TDTInducer = new TDTInducer(kb, kb.Concepts.count().toInt, sparkSession) - } + // var PosExamples = sparkSession.sparkContext.parallelize(Array("http://example.com/foo#east1", + // "http://example.com/foo#east2", + // "http://example.com/foo#east3", + // "http://example.com/foo#east4", + // "http://example.com/foo#east5")) + // + // var NegExamples = sparkSession.sparkContext.parallelize(Array("http://example.com/foo#west6", + // "http://example.com/foo#west7", + // "http://example.com/foo#west8", + // "http://example.com/foo#west9", + // "http://example.com/foo#west10")) + // + // var UndExamples = sparkSession.sparkContext.parallelize(new ArrayList[String]().asScala) + // + // val numPos: Double = PosExamples.count + // val numNeg: Double = NegExamples.count + // val perPos: Double = numPos / (numPos + numNeg) + // val perNeg: Double = numNeg / (numPos + numNeg) + // + // println("\nLearning problem: \n --------------------\n") + // println("No. of Positive examples: " + PosExamples.count) + // println("No. of Negative examples: " + NegExamples.count) + // println("No. 
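The commented-out example above derives the class priors the same way TDTInducer.training does: per-class proportions over the training set, renormalised to sum to one, with a uniform 0.5/0.5 fallback when no labelled examples are present. A small sketch of that prior computation (assumed helper, for illustration only):

// Prior computation as in training(): proportions, renormalised, 0.5/0.5 fallback.
def classPriors(numPos: Long, numNeg: Long, total: Long): (Double, Double) = {
  if (total == 0) (0.5, 0.5)
  else {
    val (prPos, prNeg) = (numPos.toDouble / total, numNeg.toDouble / total)
    val sum = prPos + prNeg
    if (sum == 0) (0.5, 0.5) else (prPos / sum, prNeg / sum)
  }
}

// classPriors(8, 2, 10) == (0.8, 0.2); classPriors(0, 0, 10) == (0.5, 0.5)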
of Undefined examples: " + UndExamples.count) + // println("\nper Pos: " + perPos) + // println("per Neg: " + perNeg) + // + // val nGeneratedRef: Int = 50 + // + // val c : TDTClassifiers = new TDTClassifiers (kb, sparkSession) + // val tree : DLTree = c.induceDLTree(kb.getDataFactory.getOWLThing, PosExamples, NegExamples, UndExamples, nGeneratedRef, perPos, perNeg) + // + // val Root: OWLClassExpression = tree.getRoot() + // println("\nRoot of the tree is: " + Root) + + /* val possubtree = tree.getPosSubTree().toString() + println("possubtree: " + possubtree) */ + + // val ind = kb.getDataFactory().getOWLNamedIndividual("http://example.com/foo#east2") + // val classification : Int = c.classify(ind, tree) + // println("\nclassification of east2 is " + classification) -} \ No newline at end of file + sparkSession.stop + } +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/Utils.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/Utils.scala index bd0f03e..6d9a41a 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/Utils.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/Utils.scala @@ -1,12 +1,13 @@ package net.sansa_stack.ml.spark.classification -import org.apache.spark.serializer.{ KryoRegistrator => SparkKryoRegistrator } import com.esotericsoftware.kryo.Kryo +import org.apache.spark.serializer.{ KryoRegistrator => SparkKryoRegistrator } import org.semanticweb.owlapi.model.OWLClass -import net.sansa_stack.ml.spark.classification.KB.KB import org.semanticweb.owlapi.reasoner.structural.StructuralReasoner -/* +import net.sansa_stack.ml.spark.classification.KB.KB + +/** * Class for serialization by the Kryo serializer. */ class Registrator extends SparkKryoRegistrator { @@ -17,4 +18,4 @@ class Registrator extends SparkKryoRegistrator { kryo.register(classOf[StructuralReasoner]) kryo.register(classOf[net.sansa_stack.ml.spark.classification.KB.KB]) } -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/BorderFlow.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/BorderFlow.scala similarity index 89% rename from sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/BorderFlow.scala rename to sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/BorderFlow.scala index 4434e6f..d010777 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/BorderFlow.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/BorderFlow.scala @@ -1,38 +1,32 @@ -package net.sansa_stack.ml.spark.clustering +import java.io._ +import java.io.{ ByteArrayInputStream, FileNotFoundException, FileReader, IOException, StringWriter } +import java.lang.{ Long => JLong } -import org.apache.spark.rdd.RDD -import org.apache.spark.graphx.{ Graph, EdgeDirection } import scala.math.BigDecimal -import org.apache.spark.sql.SparkSession import scala.reflect.runtime.universe._ -import scopt.OptionParser -import org.apache.log4j.{ Level, Logger } -import org.apache.spark.mllib.util.MLUtils -import java.io.{ FileReader, FileNotFoundException, IOException } -import org.apache.spark.mllib.linalg.Vectors -import java.lang.{ Long => JLong } -import java.lang.{ Long => JLong } -import breeze.linalg.{ squaredDistance, DenseVector, Vector } -import org.apache.spark.sql.SparkSession import scala.util.control.Breaks._ + +import breeze.linalg.{ 
squaredDistance, DenseVector, Vector } import org.apache.jena.datatypes.{ RDFDatatype, TypeMapper } -import org.apache.jena.graph.{ Node => JenaNode, Triple => JenaTriple, _ } -import org.apache.jena.riot.writer.NTriplesWriter +import org.apache.jena.graph.{ Node => JenaNode, Node_ANY, Node_Blank, Node_Literal, Node_URI, Triple => JenaTriple, _ } import org.apache.jena.riot.{ Lang, RDFDataMgr } -import org.apache.jena.graph.{ Node_ANY, Node_Blank, Node_Literal, Node_URI, Node => JenaNode, Triple => JenaTriple } +import org.apache.jena.riot.writer.NTriplesWriter +import org.apache.jena.util._ import org.apache.jena.vocabulary.RDF -import java.io.ByteArrayInputStream -import org.apache.spark.rdd.PairRDDFunctions +import org.apache.log4j.{ Level, Logger } import org.apache.spark.SparkContext._ import org.apache.spark.graphx._ -import org.apache.jena.util._ -import java.io.StringWriter -import java.io._ -import org.apache.spark.graphx.Graph +import org.apache.spark.graphx.{ EdgeDirection, Graph } +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.util.MLUtils +import org.apache.spark.rdd.PairRDDFunctions +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SparkSession +import scopt.OptionParser object BorderFlow { - def apply(spark: SparkSession, graph: Graph[String, String], output: String, outputevlsoft: String, outputevlhard: String) = { + def apply(spark: SparkSession, graph: Graph[String, String], output: String, outputevlsoft: String, outputevlhard: String): Unit = { /** * undirected graph : orient =0 @@ -51,9 +45,9 @@ object BorderFlow { graphXinBorderFlow(orient, selectYourSimilarity) } - /* - * Computes different similarities function for a given graph @graph. - */ + /** + * Computes different similarities function for a given graph @graph. + */ def graphXinBorderFlow(e: Int, f: Int): List[List[Long]] = { val edge = graph.edges.collect() @@ -197,7 +191,7 @@ object BorderFlow { f3 } - //computing f(X,V) for Heuristics BorderFlow + // computing f(X,V) for Heuristics BorderFlow def fOmega(x: List[Long], v: Long): Double = { var numberFlow = 0 @@ -223,9 +217,7 @@ object BorderFlow { var jaccardBV = 0.0 if (b.size == 0) return 0.0 for (i <- 0 until b.length) yield { - - jaccardBV = jaccardBV.+(findingSimilarity(b(i), v).abs) - + jaccardBV = jaccardBV. + (findingSimilarity(b(i), v).abs) } var jaccardVXV = 0.0 @@ -233,38 +225,13 @@ object BorderFlow { for (i <- 0 until VX.length) yield { if (VX(i) != v) { - jaccardVXV = jaccardVXV.+(findingSimilarity(VX(i), v).abs) + jaccardVXV = jaccardVXV. + (findingSimilarity(VX(i), v).abs) } } (jaccardVXV / jaccardBV) - /* - * without similarity - val nv = neighborSort.lookup(v).distinct.head.toSet - val nvX = nv.intersect(X.toSet) - val nvx = nvX.toList.diff(x).size - - - for(k <- 0 until x.length) yield{ - if(x.length>0){ - - val xk = x(k) - val bX = neighborSort.lookup(xk).distinct.head.toSet - val bxX = bX.intersect(X.toSet) - - if(bxX.toList.diff(x).size > 0 && bxX.toList.diff(x).contains(v)) { - numberFlow = numberFlow + 1 - } - - } - - } - - ( 1/(numberFlow.toDouble/ nvx.toDouble)) - * - */ } @@ -325,7 +292,7 @@ object BorderFlow { for (i <- 0 until b.length) yield { for (j <- 0 until x.length) yield { if (b(i) != x(j)) { - jaccardX = jaccardX.+(findingSimilarity(b(i), x(j)).abs) + jaccardX = jaccardX. 
+ (findingSimilarity(b(i), x(j)).abs) } } @@ -334,7 +301,7 @@ object BorderFlow { for (i <- 0 until b.length) yield { for (j <- 0 until n.length) yield { - jaccardN = jaccardN.+(findingSimilarity(b(i), n(j)).abs) + jaccardN = jaccardN. + (findingSimilarity(b(i), n(j)).abs) } } @@ -367,7 +334,7 @@ object BorderFlow { for (i <- 0 until n.length) yield { if (n(i) != u) { - jaccardNU = jaccardNU.+(findingSimilarity(u, n(i)).abs) + jaccardNU = jaccardNU. + (findingSimilarity(u, n(i)).abs) } @@ -377,15 +344,14 @@ object BorderFlow { val nu = neighborSort.lookup(u).distinct.head.toSet val nuX = nu.intersect(X.toSet).toList ( (nuX.intersect(listOfN(x))).size.toDouble) - */ jaccardNU } - /* - * Use Heuristics method for producing clusters. - */ + /** + * Use Heuristics method for producing clusters. + */ def heuristicsCluster(a: List[Long]): List[Long] = { var nj = 0.0 @@ -436,9 +402,9 @@ object BorderFlow { } - /* - * Use Non-Heuristics(normal) method for producing clusters. - */ + /** + * Use Non-Heuristics(normal) method for producing clusters. + */ def nonHeuristicsCluster(a: List[Long], d: List[Long]): List[Long] = { var nj: List[Long] = List() @@ -529,18 +495,18 @@ object BorderFlow { } - /* - * Input for heuristics heuristicsCluster(element) . - * Input for nonHeuristics nonHeuristicsCluster(element,List()) . - */ + /** + * Input for heuristics heuristicsCluster(element) . + * Input for nonHeuristics nonHeuristicsCluster(element,List()) . + */ def makeClusters(a: Long): List[Long] = { var clusters: List[Long] = List() clusters = nonHeuristicsCluster(List(a), List()) - // if(b == 1){ - // clusters = heuristicsCluster(List(a))} + // if(b == 1) { + // clusters = heuristicsCluster(List(a)) } (clusters) @@ -558,9 +524,9 @@ object BorderFlow { bigList = bigList.map(_.distinct) - /* - * Sillouhette Evaluation soft - */ + /** + * Sillouhette Evaluation soft + */ def avgAsoft(c: List[Long], d: Long): Double = { var sumA = 0.0 @@ -585,6 +551,7 @@ object BorderFlow { sumB / sizeC } + def SIsoft(a: Double, b: Double): Double = { var s = 0.0 if (a > b) { @@ -632,9 +599,9 @@ object BorderFlow { val evaluateSoft = AiBiSoft(bigList, X) - /* - * Apply Hardening - */ + /** + * Apply Hardening + */ def subset(c: List[List[Long]]): List[List[Long]] = { var C = c @@ -698,7 +665,7 @@ object BorderFlow { for (i <- 0 until c.length) yield { if (c(i) != v) { - omega = omega.+(findingSimilarity(v, c(i)).abs) + omega = omega. 
+ (findingSimilarity(v, c(i)).abs) } @@ -708,7 +675,6 @@ object BorderFlow { val nu = neighborSort.lookup(u).distinct.head.toSet val nuX = nu.intersect(X.toSet).toList ( (nuX.intersect(listOfN(x))).size.toDouble) - */ omega @@ -741,6 +707,7 @@ object BorderFlow { } C } + def nul(c: List[List[Long]]): List[List[Long]] = { var C = c var newCluster: List[List[Long]] = List() @@ -755,9 +722,9 @@ object BorderFlow { bigList = reassignment(bigList, X) bigList = nul(bigList) - /* - * Sillouhette Evaluation Hard - */ + /** + * Sillouhette Evaluation Hard + */ def avgA(c: List[Long], d: Long): Double = { var sumA = 0.0 @@ -782,6 +749,7 @@ object BorderFlow { sumB / sizeC } + def SI(a: Double, b: Double): Double = { var s = 0.0 if (a > b) { @@ -838,14 +806,14 @@ object BorderFlow { val evaluateStringRDDS = spark.sparkContext.parallelize(evaluateStringS) evaluateStringRDDS.saveAsTextFile(outputevlsoft) - //println(s"averagesoft: $avsoft\n") + // println(s"averagesoft: $avsoft\n") bigList } - /* - * convert to RDF - */ + /** + * convert to RDF + */ def makerdf(a: List[Long]): List[String] = { var listuri: List[String] = List() @@ -857,13 +825,12 @@ object BorderFlow { } listuri - } val rdf = clusterRdd.map(x => makerdf(x)) val rdfRDD = spark.sparkContext.parallelize(rdf) rdfRDD.saveAsTextFile(output) - } } + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/DBSCAN.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/DBSCAN.scala new file mode 100644 index 0000000..3c0dbfd --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/DBSCAN.scala @@ -0,0 +1,260 @@ +package net.sansa_stack.ml.spark.clustering.algorithms + +/* +* DBSCAN Distributed Edition in Spark & Scala. +* +* Authors: Panagiotis Kalampokis, Dr. 
Dimitris Skoutas +* */ + +import com.vividsolutions.jts.geom.{Coordinate, Envelope, GeometryFactory, Point} +import org.apache.spark.api.java.JavaRDD +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.rdd.RDD +import org.apache.spark.sql._ +import org.apache.spark.storage.StorageLevel._ +import org.datasyslab.geospark.enums.GridType +import org.datasyslab.geospark.spatialPartitioning.SpatialPartitioner +import org.datasyslab.geospark.spatialRDD.PointRDD +import scala.collection.mutable.{ArrayBuffer, HashMap} + +import net.sansa_stack.ml.spark.clustering.datatypes.DbPOI +import net.sansa_stack.ml.spark.clustering.datatypes.POI +import net.sansa_stack.ml.spark.clustering.utils.DBCLusterer + + +class DBSCAN() extends Serializable { + + private var clusterRDD: RDD[DbPOI] = null + private var mergingClusterNameVecBD: Broadcast[Vector[Set[String]]] = null + private var boundaryPoisToKeepHMBD : Broadcast[HashMap[String, String]] = null + private var spatialPartitionerBD : Broadcast[SpatialPartitioner] = null + + private def areIntersectingSets(set1: Set[String], set2: Set[String]): Boolean = { + set1.exists(s1 => set2.exists(s2 => s1 == s2) ) + } + + private def insertSetIntoVec(vec: Vector[Set[String]], xSet: Set[String]): Vector[Set[String]] = { + + var tmpVec = Vector[Set[String]]() + var unionSet = Set[String]() ++ xSet + + for(set_i <- vec) { + if (areIntersectingSets(set_i, unionSet)) + { + unionSet = unionSet ++ set_i + } + else + { + tmpVec = tmpVec :+ set_i + } + } + unionSet +: tmpVec + } + + protected def getExpandedEnvelopeFromPoint(p: Point, epsilon: Double): Envelope = { + val env = p.getEnvelopeInternal + env.expandBy(epsilon) + + env + } + /* + * Performs DBSCAN and Returns the clusters. + * */ + def dbclusters(pointRDD_0: RDD[Point], eps: Double, minPts: Int, spark: SparkSession) : RDD[(String, Array[(String, DbPOI)])] = { + + val pointRDD_1 = new JavaRDD[Point](pointRDD_0) + val pointRDD = new PointRDD(pointRDD_1) + + pointRDD.analyze() + + // Perform Spatial Partitioning with QuadTree + pointRDD.spatialPartitioning(GridType.QUADTREE, 16) + + // val boundaryEnvelopes = pointRDD.getPartitioner.getGrids + // writeBoundaryEnvsToFile(pointRDD, outputFile + "_Envelopes_only.txt", geometryFactory) + this.spatialPartitionerBD = spark.sparkContext.broadcast(pointRDD.getPartitioner) + + // RDD[partitionID, dbpoi] + val flatMappedRDD = pointRDD.spatialPartitionedRDD + .rdd + .mapPartitions{ + pointIter => { + val geometryFactory = new GeometryFactory() + pointIter.flatMap{ + point => { + // Get expanded by eps Envelope From Point. + val pointEnv = getExpandedEnvelopeFromPoint(point, eps) + + // Given a Geometry, it Returns a List of Partitions it overlaps. + val pIDListTuple = this.spatialPartitionerBD.value.placeObject(geometryFactory.toGeometry(pointEnv)) + + // ArrayBuffer[PIDs] + val arrBuff = ArrayBuffer[Int]() + + while (pIDListTuple.hasNext) { + val (pID, envP) = pIDListTuple.next() + arrBuff.append(pID.intValue()) + } + + // Is Boundary Point? 
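// Note: a point whose eps-expanded envelope overlaps more than one grid cell collects
// several partition ids in arrBuff; it is then emitted once per overlapping partition and
// flagged as a boundary point. Local DBSCAN runs independently inside each partition, and
// these duplicated boundary copies are what later allow clusters that straddle a partition
// border to be stitched together via mergingClusterNameVec / boundaryPoisToKeepHM.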
+ val isBoundaryP = (arrBuff.size > 1) + arrBuff.map{ + pID => { + val poi = DbPOI(point.getUserData.asInstanceOf[String], point.getX, point.getY) + if (isBoundaryP) { + poi.isBoundary = true + } + + (pID, poi) + } + } + } + } + } + } + // RDD[(pID, ArrayBuffer[DBPOI])] + val partitionRDD = flatMappedRDD.aggregateByKey(ArrayBuffer[DbPOI]())( + // SeqOp + (zArrBuffDBPoi, poi) => zArrBuffDBPoi += poi, + + // CombOp + (zArrBuffDBPoi1, zArrBuffDBPoi2) => zArrBuffDBPoi1 ++= zArrBuffDBPoi2 + ) + + + // RDD[dbpoi] + this.clusterRDD = partitionRDD.flatMap{ + case (pID, poiArrBuff) => + // New DBSCAN CLusterer For Each Partition-Envelope + val dbclusterer = DBCLusterer(eps, minPts) + + // Perform DBSCAN in each partition and return a List of Clusters: ArrayBuffer[ArrayBuffer[DBPOI]] + val clusters = dbclusterer.clusterPois(poiArrBuff) + + var i = 0 + for (cluster <- clusters) { + for (poi <- cluster) { + poi.clusterName = pID + "p" + i + } + + i = i + 1 + } + clusters.flatten + } + .persist(MEMORY_AND_DISK) + + + // Take all Boundary Pois. + // RDD[poiID, dbpoi] + val boundaryPoiRDD = this.clusterRDD.filter(_.isBoundary).map(poi => (poi.poiId, poi) ) + + + // RDD[poiID, (List[pID&cID], isDense?)] Set[pID&cID], isDensePoi? + val bPoiRDD = boundaryPoiRDD.aggregateByKey( (Set[String](), false) )( + // SeqOp + (zTuple, poi) => (zTuple._1 + poi.clusterName, zTuple._2 | poi.isDense ), + + // CombOp + (tuple1, tuple2) => (tuple1._1 ++ tuple2._1, tuple1._2 | tuple2._2) + ) + + + // Vector[Set[pID&cIID]], HashMap[poiID ,pID&cID] Vector[Set[pID&cIID]] , HashMap[poiID , pID&cID] + val (mergingClusterNameVec, boundaryPoisToKeepHM) = bPoiRDD.aggregate( ( Vector[Set[String]](), HashMap[String, String]() ))( + // SeqOp + (zTuple, xTuple) => { + + val (vec, zHashMap) = zTuple + + // [poiID, (Set[pID&cID], isDense?)] + val (poiID, (pIDcIDSet, isDense)) = xTuple + + if(isDense) { + (insertSetIntoVec(vec, pIDcIDSet), zHashMap) + } + else { + (vec, zHashMap += ((poiID, pIDcIDSet.head)) ) + } + }, + + // CombOp + (zTuple1, zTuple2) => { + val (vec1, hashMap1) = zTuple1 + val (vec2, hashMap2) = zTuple2 + val vec3 = vec2.foldLeft(vec1)((zVec, xSet) => insertSetIntoVec(zVec, xSet)) + + (vec3, hashMap1 ++= hashMap2) + } + ) + + + // Broadcast commonNames and PoisToKeep + this.mergingClusterNameVecBD = spark.sparkContext.broadcast(mergingClusterNameVec) + this.boundaryPoisToKeepHMBD = spark.sparkContext.broadcast(boundaryPoisToKeepHM) + val preFinalClusterRDD = this.clusterRDD.mapPartitions{ + poiIter => { + + val commonNameMap = this.mergingClusterNameVecBD.value.flatMap{ + nameSet => { + val commonName = nameSet.toSeq.sortWith(_ < _).mkString("c") + nameSet.map(_ -> commonName) + } + }.toMap + + poiIter.flatMap{ + poi => { + var poiIDcIDName = poi.clusterName + commonNameMap.get(poi.clusterName) match { + case Some(commonName) => poiIDcIDName = commonName + case None => () + } + var keepPoi = true + this.boundaryPoisToKeepHMBD.value.get(poi.poiId) match { + case Some(pIDcIDwhoKeepsPoi) => + if (poi.clusterName != pIDcIDwhoKeepsPoi) { + keepPoi = false + } + case None => () + } + + poi.clusterName = poiIDcIDName + if(keepPoi) { + Seq((poi.clusterName, poi)) + } + else { + Seq() + } + } + } + } + } + + + // RDD[clusterName, HashMap[poiID, poi]] + val dbclusterRDD = preFinalClusterRDD.aggregateByKey(HashMap[String, DbPOI]())( + // SeqOp + (zPoiHM, poi) => zPoiHM += ((poi.poiId, poi)), + + // CombOp + (hm1, hm2) => hm1 ++= hm2 + ) + // RDD[(String, Array[POI])] + // dbclusterRDD.foreach(println) + val k = 
dbclusterRDD.mapValues(_.toArray) + k + } + /* + * This method should be called after + * finishing using this class(e.g: writing results, or printing stats). + * */ + def clear(): Unit = { + this.clusterRDD.unpersist(true) + this.boundaryPoisToKeepHMBD.destroy() + this.mergingClusterNameVecBD.destroy() + this.spatialPartitionerBD.destroy() + } + +} + + + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/Distances.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/Distances.scala new file mode 100644 index 0000000..234fda8 --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/Distances.scala @@ -0,0 +1,17 @@ +package net.sansa_stack.ml.spark.clustering.algorithms + +class Distances { + + /** + * Jaccard Similarity Coefficient between two sets of categories corresponding to two pois + * + * @param x set of categories + * @param y set of categories + */ + def jaccardSimilarity(x: Set[String], y: Set[String]): Double = { + val union_l = x.union(y).toList.length.toDouble + val intersect_l = x.intersect(y).toList.length.doubleValue() + intersect_l / (union_l) + } +} + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/Encoder.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/Encoder.scala new file mode 100644 index 0000000..2624b8f --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/Encoder.scala @@ -0,0 +1,121 @@ +package net.sansa_stack.ml.spark.clustering.algorithms + +import org.apache.spark.ml.feature.{ VectorAssembler, Word2Vec } +import org.apache.spark.rdd._ +import org.apache.spark.sql._ +import org.apache.spark.sql.types._ + + +class Encoder { + + /** + * One hot encoding categorical data + * + * @param poiCategories, category ids with corresponding category values + * @param spark + * @return one hot encoded DataFrame for each poi + */ + def oneHotEncoding(poiCategories: RDD[(Long, Set[String])], spark: SparkSession): (DataFrame, Array[Array[Int]]) = { + // create a set to contain all categories + var set = scala.collection.mutable.Set[String]() + // put all categories to set + poiCategories.collect().foreach(x => x._2.foreach(y => set += y)) + // create columns base on the length of set + val numPOIS = poiCategories.count().toInt // Array.ofDim only accept Int + val categoryArray = set.toArray + val oneHotMatrix = Array.ofDim[Int](numPOIS, categoryArray.length + 1) // one column keep poi id + // initialize distance matrix, collect first needed + var i = 0 + poiCategories.collect().foreach(x => + { + oneHotMatrix(i)(0) = x._1.toInt + for (j <- 1 until categoryArray.length + 1) { + oneHotMatrix(i)(j) = 0 + } + x._2.foreach(y => + { // encode corresponding category value to 1 + oneHotMatrix(i)(categoryArray.indexOf(y) + 1) = 1 + }) + i += 1 + }) + // vector keep all StructField + val fields = Array.ofDim[StructField](categoryArray.length + 1) + val featureColumns = Array.ofDim[String](categoryArray.length + 1) + // keep other columns with integer type + for (i <- 0 until categoryArray.length + 1) { + fields(i) = StructField(i.toString, IntegerType, true) + featureColumns(i) = i.toString + } + val schema = new StructType(fields) + val oneHotEncodedRDD = spark.sparkContext.parallelize(oneHotMatrix).map(x => Row.fromSeq(x.toList)) + val oneHotEncodedDF = spark.createDataFrame(oneHotEncodedRDD, schema) + // set up 'features' column + val assemblerFeatures = 
new VectorAssembler().setInputCols(featureColumns.slice(1, featureColumns.length)).setOutputCol("features") + val transformedDf = assemblerFeatures.transform(oneHotEncodedDF) + (transformedDf, oneHotMatrix) + } + + /** + * word2Vec encoding + * + * @param poiCategories category ids with corresponding category values + * @param spark + * @return word2Vec encoded categories for each poi in DataFrame + */ + def wordVectorEncoder(poiCategories: RDD[(Long, Set[String])], spark: SparkSession): (DataFrame, RDD[(Int, Array[Double])]) = { + val word2vec = new Word2Vec().setInputCol("inputCol").setMinCount(1) + val schema = StructType(StructField("inputCol", ArrayType(StringType, true), true) :: Nil) + val df = spark.createDataFrame(poiCategories.map(f => Row(f._2.map(x => x.toString).toArray)), schema) + val wordVectorsRDD = word2vec.fit(df).getVectors.select("word", "vector").rdd + val vectors = wordVectorsRDD.map(f => (f.getString(0), f.getAs[org.apache.spark.ml.linalg.DenseVector](1))) + val categoryVectors = vectors.collectAsMap() + val poiCategoryVectors = poiCategories.map(f => (f._1, f._2.map(x => categoryVectors.get(x).head.toArray))) + val poiVector = poiCategoryVectors.map(f => (f._1, f._2.size, f._2.toArray.toList.transpose.map(_.sum).toArray)) + val leng = poiVector.take(1)(0)._2 + val poiAvgVector = poiVector.map(x => (x._1.toInt, x._3.map(y => y / x._2))) + val fields = Array.ofDim[StructField](leng + 1) + val featureColumns = Array.ofDim[String](leng + 1) + // keep other columns with integer type + fields(0) = StructField("id", IntegerType, true) + featureColumns(0) = "id" + for (i <- 1 until leng + 1) { + fields(i) = StructField(i.toString, DoubleType, true) + featureColumns(i) = i.toString + } + val schema2 = new StructType(fields) + val poiAvgVectorDF = spark.createDataFrame(poiAvgVector.map(x => Row.fromSeq(x._1 +: x._2)), schema2) + val assemblerFeatures = new VectorAssembler().setInputCols(featureColumns.slice(1, featureColumns.length)).setOutputCol("features") + val transformedDf = assemblerFeatures.transform(poiAvgVectorDF) + (transformedDf, poiAvgVector) + } + + /** + * multiple dimensional encoding + * + * @param distancePairs distance between pair of pois + * @param numPOIS number of pois + * @param dimension mapped coordinate dimension + * @param spark + * @return encoded coordinates for each poi in DataFrame + */ + def mdsEncoding(distancePairs: RDD[(Long, Long, Double)], numPOIS: Int, dimension: Int, spark: SparkSession): (DataFrame, Array[(Long, Array[Double])]) = { + val poi2Coordinates = new MultiDS().multiDimensionScaling(distancePairs, numPOIS, dimension) + val poi2Coordinates2 = poi2Coordinates.map(x => x._1.toInt :: x._2.toList) + // create schema + val fields = Array.ofDim[StructField](dimension + 1) + val featureColumns = Array.ofDim[String](dimension + 1) + fields(0) = StructField("id", IntegerType, true) + featureColumns(0) = "id" + for (i <- 1 until dimension + 1) { + fields(i) = StructField(i.toString, DoubleType, true) + featureColumns(i) = i.toString + } + val schema = new StructType(fields) + val coordinatesRDD = spark.sparkContext.parallelize(poi2Coordinates2.toSeq).map(x => Row.fromSeq(x)) + val coordinatesDF = spark.createDataFrame(coordinatesRDD, schema) + val assembler = new VectorAssembler().setInputCols(featureColumns.slice(1, featureColumns.length)).setOutputCol("features") + val featureData = assembler.transform(coordinatesDF) + (featureData, poi2Coordinates) + } +} + diff --git 
a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/FirstHardeninginBorderFlow.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/FirstHardeninginBorderFlow.scala similarity index 84% rename from sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/FirstHardeninginBorderFlow.scala rename to sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/FirstHardeninginBorderFlow.scala index f22f1f9..eee7c7b 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/FirstHardeninginBorderFlow.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/FirstHardeninginBorderFlow.scala @@ -1,31 +1,34 @@ -package net.sansa_stack.ml.spark.clustering +package net.sansa_stack.ml.spark.clustering.algorithms + +import java.io._ +import java.io.{ ByteArrayInputStream, FileNotFoundException, FileReader, IOException, StringWriter } +import java.lang.{ Long => JLong } +import java.net.URI -import org.apache.spark.rdd.RDD -import org.apache.spark.graphx.{ Graph, EdgeDirection } import scala.math.BigDecimal -import org.apache.spark.sql.SparkSession import scala.reflect.runtime.universe._ -import scopt.OptionParser +import scala.util.control.Breaks._ + +import breeze.linalg.{ squaredDistance, DenseVector, Vector } +import org.apache.jena.graph.Node import org.apache.log4j.{ Level, Logger } -import org.apache.spark.mllib.util.MLUtils -import java.io.{ FileReader, FileNotFoundException, IOException } +import org.apache.spark.graphx._ +import org.apache.spark.graphx.{ EdgeDirection, Graph } import org.apache.spark.mllib.linalg.Vectors -import java.lang.{ Long => JLong } -import breeze.linalg.{ squaredDistance, DenseVector, Vector } -import scala.util.control.Breaks._ -import java.io.ByteArrayInputStream +import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.PairRDDFunctions -import java.io.StringWriter -import java.io._ -import java.net.URI -import org.apache.spark.graphx._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SparkSession + + + object FirstHardeninginBorderFlow { - def apply(spark: SparkSession, graph: Graph[String, String], output: String, outputeval: String) = { + def apply(spark: SparkSession, graph: Graph[Node, Node], output: String, outputeval: String): Unit = { /** - * + * * Jaccard similarity measure : selectYourSimilarity = 0 * Batet similarity measure : selectYourSimilarity = 1 * Rodríguez and Egenhofer similarity measure : selectYourSimilarity = 2 @@ -39,9 +42,9 @@ object FirstHardeninginBorderFlow { graphXinBorderFlow(selectYourSimilarity) } - /* - * Computes different similarities function for a given graph @graph. - */ + /** + * Computes different similarities function for a given graph @graph. 
+ */ def graphXinBorderFlow(f: Int): List[List[Long]] = { val edge = graph.edges @@ -55,7 +58,8 @@ object FirstHardeninginBorderFlow { val x = f._1 x }) - + println("hard") + sort.foreach(println) var X = sort.collect() neighborSort.unpersist() @@ -63,9 +67,9 @@ object FirstHardeninginBorderFlow { val neighborcollect = neighbor.collect() val verticescollect = graph.vertices.collect() - /* - * finding neighbors for node a - */ + /** + * finding neighbors for node a + */ def findneighbors(a: VertexId): Array[VertexId] = { var b: Array[VertexId] = Array() @@ -80,15 +84,15 @@ object FirstHardeninginBorderFlow { b } - /* - * Computing logarithm based 2 - */ + /** + * Computing logarithm based 2 + */ val LOG2 = math.log(2) val log2 = { x: Double => math.log(x) / LOG2 } - /* - * Difference between two set of vertices, used in different similarity measures - */ + /** + * Difference between two set of vertices, used in different similarity measures + */ def difference(a: Array[VertexId], b: Array[VertexId]): Double = { if (a.length == 0) { return 0.0 } @@ -97,9 +101,9 @@ object FirstHardeninginBorderFlow { differ.size.toDouble } - /* - * Intersection of two set of vertices, used in different similarity measures - */ + /** + * Intersection of two set of vertices, used in different similarity measures + */ def intersection(a: Array[VertexId], b: Array[VertexId]): Double = { if ((a.length == 0) || (b.length == 0)) { return 0.0 } val rst = a.intersect(b) @@ -107,9 +111,9 @@ object FirstHardeninginBorderFlow { rst.size.toDouble } - /* - * Union of two set of vertices, used in different similarity measures - */ + /** + * Union of two set of vertices, used in different similarity measures + */ def union(a: Array[VertexId], b: Array[VertexId]): Double = { val rst = a.union(b) @@ -117,17 +121,17 @@ object FirstHardeninginBorderFlow { rst.size.toDouble } - /* - * similarity measures - */ + /** + * similarity measures + */ def selectSimilarity(a: Array[VertexId], b: Array[VertexId], c: Int): Double = { var s = 0.0 if (c == 0) { - /* - * Jaccard similarity measure - */ + /** + * Jaccard similarity measure + */ val sim = intersection(a, b) / union(a, b).toDouble if (sim == 0.0) { s = (1 / vertex) } @@ -137,9 +141,9 @@ object FirstHardeninginBorderFlow { if (c == 1) { - /* - * Rodríguez and Egenhofer similarity measure - */ + /** + * Rodríguez and Egenhofer similarity measure + */ var g = 0.8 @@ -149,9 +153,9 @@ object FirstHardeninginBorderFlow { } if (c == 2) { - /* - * The Ratio model similarity - */ + /** + * The Ratio model similarity + */ var alph = 0.5 var beth = 0.5 @@ -162,15 +166,14 @@ object FirstHardeninginBorderFlow { } if (c == 3) { - /* - * Batet similarity measure - */ + /** + * Batet similarity measure + */ val cal = 1 + ((difference(a, b) + difference(b, a)) / (difference(a, b) + difference(b, a) + intersection(a, b))).abs val sim = log2(cal.toDouble) if (sim == 0.0) { s = (1 / vertex) } else { s = sim } - } s } @@ -222,7 +225,7 @@ object FirstHardeninginBorderFlow { val sortsim = sumsimilarity(X) - //println(s"sortsim: $sortsim\n") + // println(s"sortsim: $sortsim\n") var node = sortsim.map(f => { f._1 @@ -232,7 +235,7 @@ object FirstHardeninginBorderFlow { neighbor.unpersist() - //computing F(X) for BorderFlow + // computing F(X) for BorderFlow def fX(x: List[Long]): Double = { var jaccardX = 0.0 @@ -276,13 +279,13 @@ object FirstHardeninginBorderFlow { b.map(bi => { x.map(xj => { - if (bi.!=(xj)) { jaccardX = jaccardX.+(findingSimilarity(bi, xj).abs) } + if (bi.!=(xj)) { jaccardX = jaccardX. 
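selectSimilarity above switches between several neighbour-set measures (Jaccard, Rodríguez and Egenhofer, the Ratio model, Batet), each falling back to a small non-zero score (1 / vertex in the code) when two vertices share no neighbours. A minimal standalone sketch of the Jaccard case (c == 0); the helper name and fallback parameter are illustrative:

// Neighbour-set Jaccard similarity: |N(a) ∩ N(b)| / |N(a) ∪ N(b)|, with a small
// fallback score for fully dissimilar pairs (sketch of the c == 0 branch above).
def jaccardNeighbours(a: Set[Long], b: Set[Long], fallback: Double): Double = {
  val union = a.union(b)
  val sim = if (union.isEmpty) 0.0 else a.intersect(b).size.toDouble / union.size
  if (sim == 0.0) fallback else sim
}

// jaccardNeighbours(Set(1L, 2L, 3L), Set(2L, 3L, 4L), 0.01) == 0.5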
+ (findingSimilarity(bi, xj).abs) } }) }) b.map(bi => { n.map(nj => { - jaccardN = jaccardN.+(findingSimilarity(bi, nj).abs) + jaccardN = jaccardN. + (findingSimilarity(bi, nj).abs) }) }) @@ -309,16 +312,16 @@ object FirstHardeninginBorderFlow { val n = listOfN(x) var jaccardNU = 0.0 n.map(ni => { - if (ni.!=(u)) { jaccardNU = jaccardNU.+(findingSimilarity(u, ni).abs) } + if (ni.!=(u)) { jaccardNU = jaccardNU. + (findingSimilarity(u, ni).abs) } }) jaccardNU } - /* - * Use Non-Heuristics(normal) method for producing clusters. - */ + /** + * Use Non-Heuristics(normal) method for producing clusters. + */ def nonHeuristicsCluster(a: List[Long], d: List[Long]): List[Long] = { var nj: List[Long] = List() @@ -399,22 +402,20 @@ object FirstHardeninginBorderFlow { } - /* - * - * Input for nonHeuristics nonHeuristicsCluster(element,List()) . - */ + /** + * Input for nonHeuristics nonHeuristicsCluster(element,List()) . + */ def makerdf(a: List[Long]): List[String] = { var listuri: List[String] = List() val b: List[VertexId] = a for (i <- 0 until b.length) { verticescollect.map(v => { - if (b(i) == v._1) listuri = listuri.::(v._2) + if (b(i) == v._1) listuri = listuri.::(v._2.toString()) }) } listuri - } def makeClusters(a: Long): List[Long] = { @@ -453,13 +454,13 @@ object FirstHardeninginBorderFlow { } while (node.size > 0) neighborSort.unpersist() - //println(s"RDF Cluster assignments: $rdfcluster\n") + // println(s"RDF Cluster assignments: $rdfcluster\n") val rdfRDD = spark.sparkContext.parallelize(rdfcluster) rdfRDD.saveAsTextFile(output) - /* - * Sillouhette Evaluation - */ + /** + * Sillouhette Evaluation + */ def avgA(c: List[Long], d: Long): Double = { var sumA = 0.0 @@ -530,7 +531,7 @@ object FirstHardeninginBorderFlow { val evaluate = AiBi(bigList, nnode) val av = evaluate.sum / evaluate.size - //println(s"average: $av\n") + // println(s"average: $av\n") val evaluateString: List[String] = List(av.toString()) val evaluateStringRDD = spark.sparkContext.parallelize(evaluateString) @@ -540,8 +541,6 @@ object FirstHardeninginBorderFlow { } val rdf = clusterRdd() - //println(s"RDF Cluster assignments: $rdf\n") - + // println(s"RDF Cluster assignments: $rdf\n") } - } diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/Kmeans.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/Kmeans.scala new file mode 100644 index 0000000..a90f876 --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/Kmeans.scala @@ -0,0 +1,28 @@ +package net.sansa_stack.ml.spark.clustering.algorithms + +import org.apache.spark.ml.clustering.KMeans +import org.apache.spark.sql._ +import org.apache.spark.sql.SparkSession + +class Kmeans { + + /** + * K-means clustering based on given Dataframe + * + * @param numClusters + * @param df + * @param spark + * @return cluster id and corresponding pois in cluster + */ + def kmClustering(numClusters: Int, maxIter: Int, df: DataFrame, spark: SparkSession): Map[Int, Array[Long]] = { + val km = new KMeans().setK(numClusters).setMaxIter(maxIter).setSeed(1L).setFeaturesCol("features").setPredictionCol("prediction") + val model = km.fit(df) + val transformedDataFrame = model.transform(df) + import spark.implicits._ + // get (cluster_id, poi_id) + val clusterIdPoi = transformedDataFrame.map(f => (f.getInt(f.size - 1), f.getInt(0).toLong)).rdd.groupByKey() + val clustersMDSKM = clusterIdPoi.map(x => (x._1, x._2.toArray)).collectAsMap().toMap + clustersMDSKM + } +} + diff --git 
a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/MultiDS.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/MultiDS.scala new file mode 100644 index 0000000..a9cc699 --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/MultiDS.scala @@ -0,0 +1,50 @@ +package net.sansa_stack.ml.spark.clustering.algorithms + +import org.apache.spark.rdd._ +import smile.mds.MDS + +class MultiDS { + + /** + * Multi-dimensional scaling + * Generate n dimensional coordinates based on input similarity matrix + * + * @param distancePairs distance between pair of poi + * @param numPOIS number of poi + * @param dimension dimension of generated coordinates + * @return poi id and coordinates in given dimension + */ + def multiDimensionScaling(distancePairs: RDD[(Long, Long, Double)], numPOIS: Int, dimension: Int): Array[(Long, Array[Double])] = { + // vector keep recorded poi + var vector = Array.ofDim[Long](numPOIS) + // positive symmetric distance matrix + var distanceMatrix = Array.ofDim[Double](numPOIS, numPOIS) + // initialize distance matrix + for (i <- 0 until numPOIS) { + vector(i) = 0 + for (j <- 0 until numPOIS) { + distanceMatrix(i)(j) = 0.0 + } + } + var i = 0 + distancePairs.collect().foreach(x => { + if (!vector.contains(x._1)) { // if there is no record for this poi + vector(i) = x._1 + i += 1 + } + if (!vector.contains(x._2)) { // if there is no record for this poi + vector(i) = x._2 + i += 1 + } + val i1 = vector.indexOf(x._1) // get the index as x-y axis for matrix + val i2 = vector.indexOf(x._2) // get the index as x-y axis for matrix + distanceMatrix(i1)(i2) = x._3 + distanceMatrix(i2)(i1) = x._3 + }) + // create coordinates + val mds = new MDS(distanceMatrix, dimension, true) + mds.getCoordinates.zip(vector).map(x => (x._2, x._1)) + } +} + + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/PIC.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/PIC.scala new file mode 100644 index 0000000..3688318 --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/PIC.scala @@ -0,0 +1,34 @@ +package net.sansa_stack.ml.spark.clustering.algorithms + +import org.apache.spark.graphx.Edge +import org.apache.spark.graphx.Graph +import org.apache.spark.mllib.clustering.PowerIterationClustering +import org.apache.spark.rdd._ +import org.apache.spark.sql._ + + +class PIC { + + /* + * Power Iteration clustering algorithm from Spark standard library + * */ + def picSparkML(pairwisePOISimilarity: RDD[(Long, Long, Double)], numCentroids: Int, numIterations: Int, sparkSession: SparkSession): Map[Int, Array[Long]] = { + val model = new PowerIterationClustering().setK(numCentroids).setMaxIterations(numIterations).setInitializationMode("degree").run(pairwisePOISimilarity) + val clusters = model.assignments.collect().groupBy(_.cluster).mapValues(_.map(_.id)) + clusters + } +/* + * Power Iteration using implementation from SANSA + * */ + def picSANSA(pairwisePOISimilarity: RDD[(Long, Long, Double)], numCentroids: Int, numIterations: Int, sparkSession: SparkSession) { + val verticeS = pairwisePOISimilarity.map(f => f._1) + val verticeD = pairwisePOISimilarity.map(f => f._2) + val indexedMap = verticeS.union(verticeD).distinct().zipWithIndex() + val vertices = indexedMap.map(f => (f._2, f._1)) + val edges = pairwisePOISimilarity.map(f => Edge(f._1, f._2, f._3)) // from similarity to int + val similarityGraph = 
Graph(vertices, edges) + // val model = new RDFGraphPICClustering(sparkSession, similarityGraph, numCentroids, numIterations) + } +} + + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/RDFByModularityClustering.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/RDFByModularityClustering.scala similarity index 88% rename from sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/RDFByModularityClustering.scala rename to sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/RDFByModularityClustering.scala index 55e508c..47a498e 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/RDFByModularityClustering.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/RDFByModularityClustering.scala @@ -1,19 +1,21 @@ -package net.sansa_stack.ml.spark.clustering +package net.sansa_stack.ml.spark.clustering.algorithms + +import java.io.StringWriter + +import scala.util.control.Breaks._ import org.apache.log4j.{ Level, Logger } +import org.apache.spark.{ SparkConf, SparkContext } import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD -import org.apache.spark.{ SparkConf, SparkContext } -import scala.util.control.Breaks._ -import java.io.StringWriter /** * Created by hpetzka on 09.11.2016. */ object RDFByModularityClustering { - def apply(sc: SparkContext, numIterations: Int, graphFile: String, outputFile: String) = { + def apply(sc: SparkContext, numIterations: Int, graphFile: String, outputFile: String): Unit = { // DEFAULT INPUT // val (numIterations, graphFile) = (100 , "C:/Users/hpetzka/IdeaProjects/Clustering_in_Spark/Graphs/testRDF.txt") @@ -39,7 +41,7 @@ object RDFByModularityClustering { If weight edges come in, one probably needa a map as follows val adjacencyMatrix: Array[Array[Int]] = Array.ofDim[Int](numVertices, numVertices) var adjacencies: Map[(String, String), Int] = Map[(String, String),Int]() - for (x <- edgesRDD.collect()){ + for (x <- edgesRDD.collect()) { // TODO add the weights here if they exist if(x(0) < x(1)) adjacencies += ( (x(0),x(1)) -> 1 ) else adjacencies += ( (x(1),x(0)) -> 1 ) @@ -117,11 +119,12 @@ object RDFByModularityClustering { } - def iterationStepClusteringRDFByModularity(numEdges: Long, - edgesBC: Broadcast[Array[(String, String)]], - vertexDegreesBC: Broadcast[Map[String, Int]], - clusterMapRDD: RDD[List[String]], - sc: SparkContext): (RDD[List[String]], Boolean) = { + def iterationStepClusteringRDFByModularity( + numEdges: Long, + edgesBC: Broadcast[Array[(String, String)]], + vertexDegreesBC: Broadcast[Map[String, Int]], + clusterMapRDD: RDD[List[String]], + sc: SparkContext): (RDD[List[String]], Boolean) = { // Start iteration // The following RDD contains distinct pairs of clusters for which there is an edge between them @@ -172,11 +175,12 @@ object RDFByModularityClustering { // The function that computes delta Q for the merge of two clusters - def deltaQ(numEdges: Long, - vertexDegreesBC: Broadcast[Map[String, Int]], - edgesBC: Broadcast[Array[(String, String)]], - clusterI: List[String], - clusterJ: List[String]): Double = { + def deltaQ( + numEdges: Long, + vertexDegreesBC: Broadcast[Map[String, Int]], + edgesBC: Broadcast[Array[(String, String)]], + clusterI: List[String], + clusterJ: List[String]): Double = { val clusterPairs: List[(String, String)] = clusterI.flatMap(x => clusterJ.map(y => (x, y))) @@ -189,12 +193,9 @@ object RDFByModularityClustering { 1.0 / numEdges * 
summand.fold(0.0)((a: Double, b: Double) => a - b) } - def WriteToFile[T](rdd: RDD[T], file: String, coalesce: (Boolean, Int) = (false, 0)) = + def WriteToFile[T](rdd: RDD[T], file: String, coalesce: (Boolean, Int) = (false, 0)): Unit = coalesce._1 match { - case true => rdd.coalesce(coalesce._2).saveAsTextFile(file) + case true => rdd.coalesce(coalesce._2).saveAsTextFile(file) case false => rdd.saveAsTextFile(file) } - } - - diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/RDFGraphPowerIterationClustering.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/RDFGraphPowerIterationClustering.scala similarity index 87% rename from sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/RDFGraphPowerIterationClustering.scala rename to sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/RDFGraphPowerIterationClustering.scala index 15a3234..e49c871 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/RDFGraphPowerIterationClustering.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/RDFGraphPowerIterationClustering.scala @@ -1,46 +1,36 @@ -package net.sansa_stack.ml.spark.clustering +package net.sansa_stack.ml.spark.clustering.algorithms -import scala.reflect.runtime.universe._ -import scopt.OptionParser -import org.apache.log4j.{ Level, Logger } -import org.apache.spark.mllib.util.MLUtils -import java.io.{ FileReader, FileNotFoundException, IOException } -import org.apache.spark.mllib.linalg.Vectors -import java.lang.{ Long => JLong } +import java.io._ +import java.io.{ ByteArrayInputStream, FileNotFoundException, FileReader, IOException, StringWriter } import java.lang.{ Long => JLong } +import java.net.URI + +import scala.collection.mutable +import scala.math.BigDecimal +import scala.reflect.runtime.universe._ + import breeze.linalg.{ squaredDistance, DenseVector, Vector } -import org.apache.spark.sql.SparkSession -import org.apache.spark.graphx.GraphLoader +import org.apache.commons.math3.util.MathUtils import org.apache.jena.datatypes.{ RDFDatatype, TypeMapper } -import org.apache.jena.graph.{ Node => JenaNode, Triple => JenaTriple, _ } -import org.apache.jena.riot.writer.NTriplesWriter +import org.apache.jena.graph.{ Node => JenaNode, Node_ANY, Node_Blank, Node_Literal, Node_URI, Triple => JenaTriple, _ } import org.apache.jena.riot.{ Lang, RDFDataMgr } -import org.apache.jena.graph.{ Node_ANY, Node_Blank, Node_Literal, Node_URI, Node => JenaNode, Triple => JenaTriple } +import org.apache.jena.riot.writer.NTriplesWriter import org.apache.jena.vocabulary.RDF -import java.io.ByteArrayInputStream -import org.apache.spark.rdd.PairRDDFunctions +import org.apache.log4j.{ Level, Logger } import org.apache.spark.SparkContext._ import org.apache.spark.graphx._ +import org.apache.spark.graphx.{ EdgeDirection, Graph, GraphLoader } +import org.apache.spark.mllib.clustering.{ PowerIterationClustering, PowerIterationClusteringModel } +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.util.MLUtils +import org.apache.spark.rdd.PairRDDFunctions import org.apache.spark.rdd.RDD -import java.io.StringWriter -import java.io._ -import org.apache.spark.rdd.RDD -import org.apache.spark.mllib.clustering.{ PowerIterationClusteringModel, PowerIterationClustering } -import org.apache.spark.graphx.{ Graph, EdgeDirection } -import scala.math.BigDecimal -import org.apache.commons.math3.util.MathUtils import 
org.apache.spark.sql.SparkSession -import org.apache.spark.graphx._ -import java.net.URI import org.apache.spark.storage.StorageLevel -import org.apache.spark.graphx._ -import scala.collection.mutable object RDFGraphPowerIterationClustering { - def apply(spark: SparkSession, graph: Graph[String, String], output: String, k: Int = 2, maxIterations: Int = 5) = { - - + def apply(spark: SparkSession, graph: Graph[String, String], output: String, k: Int = 2, maxIterations: Int = 5): RDD[(Int, String)] = { def clusterRdd(): RDD[(Int, String)] = { SimilaritesInPIC() @@ -48,16 +38,16 @@ object RDFGraphPowerIterationClustering { def SimilaritesInPIC(): RDD[(Int, String)] = { - /* - * Collect all the edges of the graph - */ + /** + * Collect all the edges of the graph + */ val edge = graph.edges val nodes = graph.vertices - /* - * Collect distinct vertices of the graph - * - */ + /** + * Collect distinct vertices of the graph + * + */ val node = nodes.map(e => (e._1)) @@ -94,9 +84,9 @@ object RDFGraphPowerIterationClustering { def model = pic.run(weightedGraph) - /* - * Cluster the graph data into two classes using PowerIterationClustering - */ + /** + * Cluster the graph data into two classes using PowerIterationClustering + */ def run() = model val modelAssignments = model.assignments diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/SilviaClustering.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/SilviaClustering.scala similarity index 91% rename from sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/SilviaClustering.scala rename to sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/SilviaClustering.scala index d370a29..5aa3313 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/SilviaClustering.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/SilviaClustering.scala @@ -1,36 +1,31 @@ -package net.sansa_stack.ml.spark.clustering +package net.sansa_stack.ml.spark.clustering.algorithms + +import java.io._ +import java.io.{ ByteArrayInputStream, FileNotFoundException, FileReader, IOException, StringWriter } +import java.lang.{ Long => JLong } +import java.net.URI -import org.apache.spark.rdd.RDD -import org.apache.spark.graphx.{ Graph, EdgeDirection } import scala.math.BigDecimal -import org.apache.spark.sql.SparkSession import scala.reflect.runtime.universe._ -import scopt.OptionParser -import org.apache.log4j.{ Level, Logger } -import org.apache.spark.mllib.util.MLUtils -import java.io.{ FileReader, FileNotFoundException, IOException } -import org.apache.spark.mllib.linalg.Vectors -import java.lang.{ Long => JLong } -import java.lang.{ Long => JLong } -import breeze.linalg.{ squaredDistance, DenseVector, Vector } -import org.apache.spark.sql.SparkSession -import org.apache.spark.graphx.GraphLoader import scala.util.control.Breaks._ + +import breeze.linalg.{ squaredDistance, DenseVector, Vector } +import net.sansa_stack.rdf.spark.model.graph._ +import org.apache.jena.graph.{ Node, Triple } import org.apache.jena.riot.{ Lang, RDFDataMgr } -import java.io.ByteArrayInputStream -import org.apache.spark.rdd.PairRDDFunctions +import org.apache.log4j.{ Level, Logger } import org.apache.spark.SparkContext._ import org.apache.spark.graphx._ -import java.io.StringWriter -import java.io._ -import org.apache.jena.graph.{ Node, Triple } -import org.apache.jena.riot.Lang -import net.sansa_stack.rdf.spark.model.graph._ -import java.net.URI 
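Both the new PIC helper class and RDFGraphPowerIterationClustering above delegate the actual clustering to Spark MLlib's PowerIterationClustering over an RDD of (srcId, dstId, similarity) triples. Below is a condensed, self-contained sketch of that call, assuming a local SparkSession and toy similarity values; the object name and all numbers are illustrative, and only the MLlib API usage mirrors the code in this patch.

import org.apache.spark.mllib.clustering.PowerIterationClustering
import org.apache.spark.sql.SparkSession

object PICSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("pic-sketch").getOrCreate()
    // (srcId, dstId, similarity): symmetric, non-negative weights
    val similarities = spark.sparkContext.parallelize(Seq(
      (0L, 1L, 0.9), (1L, 2L, 0.8), (2L, 3L, 0.1), (3L, 4L, 0.7), (4L, 5L, 0.9)))
    val model = new PowerIterationClustering()
      .setK(2)
      .setMaxIterations(10)
      .setInitializationMode("degree")
      .run(similarities)
    // group point ids by assigned cluster id, as picSparkML does
    val clusters = model.assignments.collect().groupBy(_.cluster).mapValues(_.map(_.id))
    clusters.foreach { case (c, ids) => println(s"cluster $c: ${ids.mkString(", ")}") }
    spark.stop()
  }
}

The "degree" initialization matches the setting used in picSparkML; "random" is the other initialization mode MLlib documents.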
+import org.apache.spark.graphx.{ EdgeDirection, Graph, GraphLoader } +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.util.MLUtils +import org.apache.spark.rdd.PairRDDFunctions +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SparkSession object SilviaClustering { - def apply(spark: SparkSession, graph: Graph[String, String], output: String, outputeval: String) = { + def apply(spark: SparkSession, graph: Graph[String, String], output: String, outputeval: String): Unit = { Logger.getRootLogger.setLevel(Level.WARN) @@ -40,7 +35,7 @@ object SilviaClustering { * * Jaccard similarity measure : selectYourSimilarity = 0 * Batet similarity measure : selectYourSimilarity = 1 - * Rodríguez and Egenhofer similarity measure : selectYourSimilarity = 2 + * Rodriguez and Egenhofer similarity measure : selectYourSimilarity = 2 * The Contrast model similarity : selectYourSimilarity = 3 * The Ratio model similarity : selectYourSimilarity = 4 */ @@ -48,12 +43,13 @@ object SilviaClustering { val selectYourSimilarity = 0 def clusterRdd(): RDD[List[String]] = { + val a = graph.triplets graphXinBorderFlow(graph, orient, selectYourSimilarity) } - /* - * Computes different similarities function for a given graph @graph. - */ + /** + * Computes different similarities function for a given graph @graph. + */ def graphXinBorderFlow(graph: Graph[String, String], e: Int, f: Int): RDD[List[String]] = { val edge = graph.edges.collect() @@ -77,9 +73,9 @@ object SilviaClustering { val LOG2 = math.log(2) val log2 = { x: Double => math.log(x) / LOG2 } - /* - * Difference between two set of vertices, used in different similarity measures - */ + /** + * Difference between two set of vertices, used in different similarity measures + */ def difference(a: Long, b: Long): Double = { val ansec = neighbor.lookup(a).distinct.head.toSet val ansec1 = neighbor.lookup(b).distinct.head.toSet @@ -90,9 +86,9 @@ object SilviaClustering { differ.size.toDouble } - /* - * Intersection of two set of vertices, used in different similarity measures - */ + /** + * Intersection of two set of vertices, used in different similarity measures + */ def intersection(a: Long, b: Long): Double = { val inters = neighbor.lookup(a).distinct.head.toList val inters1 = neighbor.lookup(b).distinct.head.toList @@ -106,9 +102,9 @@ object SilviaClustering { rst.size.toDouble } - /* - * Union of two set of vertices, used in different similarity measures - */ + /** + * Union of two set of vertices, used in different similarity measures + */ def union(a: Long, b: Long): Double = { val uni = neighbor.lookup(a).distinct.head.toList val uni1 = neighbor.lookup(b).distinct.head.toList @@ -124,9 +120,9 @@ object SilviaClustering { var s = 0.0 if (c == 0) { - /* - * Jaccard similarity measure - */ + /** + * Jaccard similarity measure + */ val sim = intersection(a, b) / union(a, b).toDouble @@ -136,9 +132,9 @@ object SilviaClustering { if (c == 1) { - /* - * Rodríguez and Egenhofer similarity measure - */ + /** + * Rodríguez and Egenhofer similarity measure + */ var g = 0.8 @@ -148,9 +144,10 @@ object SilviaClustering { } if (c == 2) { - /* - * The Ratio model similarity - */ + + /** + * The Ratio model similarity + */ var alph = 0.5 var beth = 0.5 @@ -161,9 +158,9 @@ object SilviaClustering { } if (c == 3) { - /* - * Batet similarity measure - */ + /** + * Batet similarity measure + */ val cal = 1 + ((difference(a, b) + difference(b, a)) / (difference(a, b) + difference(b, a) + intersection(a, b))).abs val sim = log2(cal.toDouble) @@ 
-518,11 +515,8 @@ object SilviaClustering { result } - val cRdd = clusterRdd() - - cRdd.saveAsTextFile(output) - + val zipwithindex = cRdd.zipWithIndex().map(f => (f._2, f._1)) + zipwithindex.saveAsTextFile(output) } - } diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/AppConfig.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/AppConfig.scala new file mode 100644 index 0000000..b5a11c5 --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/AppConfig.scala @@ -0,0 +1,30 @@ +package net.sansa_stack.ml.spark.clustering.datatypes + +case class Spark(master: String, + spark_serializer: String, + spark_executor_memory: String, + spark_driver_memory: String, + spark_driver_maxResultSize: String, + app_name: String) + +case class Clustering(profile: String, + pic: String, + oneHotKM: String, + mdsKM: String, + word2VecKM: String, + picDistanceMatrix: String, + mdsCoordinates: String, + oneHotMatrix: String, + word2Vec: String) + +case class Datasets(input: String, + termValueUri: String, + termPrefix: String, + typePOI: String, + coordinatesPredicate: String, + categoryPOI: String, + poiPrefix: String) + +case class AppConfig(dataset: Datasets, clustering: Clustering, spark: Spark) + + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Categories.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Categories.scala new file mode 100644 index 0000000..bf693e1 --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Categories.scala @@ -0,0 +1,7 @@ +package net.sansa_stack.ml.spark.clustering.datatypes + +/** + * @param categories a set of category values + */ +case class Categories(categories: scala.collection.mutable.Set[String]) + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Cluster.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Cluster.scala new file mode 100644 index 0000000..d342e65 --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Cluster.scala @@ -0,0 +1,10 @@ +package net.sansa_stack.ml.spark.clustering.datatypes + +/** + * a cluster + * + * @param cluster_id id of cluster + * @param poi_in_cluster an array of pois in cluster + */ +case class Cluster(cluster_id: Int, poi_in_cluster: Array[Poi]) + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Clusters.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Clusters.scala new file mode 100644 index 0000000..3646c9b --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Clusters.scala @@ -0,0 +1,9 @@ +package net.sansa_stack.ml.spark.clustering.datatypes + +/** + * @param numOfClusters number of clusters + * @param clusterSizes size of each cluster + * @param clusters a list of cluster + */ +case class Clusters(numOfClusters: Int, clusterSizes: Array[Int], clusters: List[Cluster]) + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/CoordinatePOI.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/CoordinatePOI.scala new file mode 100644 index 0000000..b722a73 --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/CoordinatePOI.scala @@ -0,0 +1,10 @@ +package 
net.sansa_stack.ml.spark.clustering.datatypes + +/** + * a coordinate + * + * @param longitude + * @param latitude + */ +case class CoordinatePOI(longitude: Double, latitude: Double) + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/DbPOI.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/DbPOI.scala new file mode 100644 index 0000000..67aa807 --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/DbPOI.scala @@ -0,0 +1,14 @@ +package net.sansa_stack.ml.spark.clustering.datatypes + +import net.sansa_stack.ml.spark.clustering.datatypes.DbStatusEnum._ + +case class DbPOI(val poiId: String, + val lon: Double, + val lat: Double) { + + var dbstatus = UNDEFINED + var isDense = false + var isBoundary = false + var clusterName = "" +} + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/DbStatusEnum.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/DbStatusEnum.scala new file mode 100644 index 0000000..cceca32 --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/DbStatusEnum.scala @@ -0,0 +1,7 @@ +package net.sansa_stack.ml.spark.clustering.datatypes + +object DbStatusEnum extends Enumeration { + + type DBSTATUS = Value + val UNDEFINED, NOISE, PARTOFCLUSTER = Value +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Distance.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Distance.scala new file mode 100644 index 0000000..efa7596 --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Distance.scala @@ -0,0 +1,9 @@ +package net.sansa_stack.ml.spark.clustering.datatypes + +/** + * @param poi1 + * @param poi2 + * @param distance distance between poi1 and poi2 + */ +case class Distance(poi1: Long, poi2: Long, distance: Double) + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/DistanceMatrix.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/DistanceMatrix.scala new file mode 100644 index 0000000..27ff94d --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/DistanceMatrix.scala @@ -0,0 +1,4 @@ +package net.sansa_stack.ml.spark.clustering.datatypes + +case class DistanceMatrix(distances: List[Distance]) + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/MdsCoordinate.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/MdsCoordinate.scala new file mode 100644 index 0000000..87aadfa --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/MdsCoordinate.scala @@ -0,0 +1,4 @@ +package net.sansa_stack.ml.spark.clustering.datatypes + +case class MdsCoordinate (poiID: Long, coordinate: Array[Double]) + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/MdsCoordinates.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/MdsCoordinates.scala new file mode 100644 index 0000000..5318cc2 --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/MdsCoordinates.scala @@ -0,0 +1,4 @@ +package net.sansa_stack.ml.spark.clustering.datatypes + +case class MdsCoordinates(coordinates: Array[MdsCoordinate]) + diff --git 
a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/POI.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/POI.scala new file mode 100644 index 0000000..0c3bba4 --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/POI.scala @@ -0,0 +1,14 @@ +package net.sansa_stack.ml.spark.clustering.datatypes + +import com.vividsolutions.jts.geom.{Coordinate, GeometryFactory} + +class POI( + id: String, + name: String, + val x : Double, + val y : Double, + keywords: List[String], + score: Double, + geometryFactory: GeometryFactory + ) extends SpatialObject(id, name, keywords, score, geometryFactory.createPoint(new Coordinate(x, y))) + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Poi.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Poi.scala new file mode 100644 index 0000000..2e15ac9 --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Poi.scala @@ -0,0 +1,11 @@ +package net.sansa_stack.ml.spark.clustering.datatypes + +/** + * Poi object representing a point of interest + * + * @param poi_id, id of poi + * @param coordinate, coordinate of poi + * @param categories, categories of poi + */ +case class Poi(poi_id: Long, coordinate: CoordinatePOI, categories: Categories, review: Double) + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/SpatialObject.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/SpatialObject.scala new file mode 100644 index 0000000..2388fb1 --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/SpatialObject.scala @@ -0,0 +1,23 @@ +package net.sansa_stack.ml.spark.clustering.datatypes + +import com.vividsolutions.jts.geom.Geometry +import scala.collection.mutable.HashMap + +class SpatialObject( + var id: String, + var name: String, + var keywords: List[String], + var score: Double, + var geometry: Geometry + ) extends Ordered[SpatialObject]{ + + var attributes = HashMap[Object, Object]() + + // @Override + override def compare(o: SpatialObject ): Int = { + if (this.score > o.score) -1 + else if (this.score == o.score) 0 + else 1 + } +} + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/Common.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/Common.scala new file mode 100644 index 0000000..0044bda --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/Common.scala @@ -0,0 +1,71 @@ +package net.sansa_stack.ml.spark.clustering.utils + +import java.io.PrintWriter + +import org.apache.jena.graph.{ NodeFactory, Triple} +import org.apache.spark.SparkContext +import org.apache.spark.rdd.RDD +import org.json4s.DefaultFormats +import org.json4s.jackson.Serialization + +import net.sansa_stack.ml.spark.clustering.datatypes.{Cluster, Clusters, Poi} + +object Common { + val prefixID = "http://example.org/id/poi/" + val prefixCategory = "http://example.org/hasCategory" + val prefixCoordinate = "http://example.org/id/hasCoordinate/" + + + /** + * create a pair RDD and join with another pair RDD + * + * @param sparkContext + * @param ids an array with poi id + * @param pairs + * @return an array of poi + */ + def join(sparkContext: SparkContext, ids: Array[Long], pairs: RDD[(Long, Poi)]): Array[Poi] = { + val idsPair = sparkContext.parallelize(ids).map(x => (x, x)) + 
idsPair.join(pairs).map(x => x._2._2).collect() + } + + /** + * serialize clustering results to file + * + * @param sparkContext + * @param clusters clustering results + * @param pois pois object + * @return + */ + def writeClusteringResult(sparkContext: SparkContext, clusters: Map[Int, Array[Long]], pois: RDD[Poi], fileWriter: PrintWriter): Unit = { + val assignments = clusters.toList.sortBy { case (k, v) => v.length } + val poisKeyPair = pois.keyBy(f => f.poi_id).persist() + val clustersPois = Clusters(assignments.size, assignments.map(_._2.length).toArray, assignments.map(f => Cluster(f._1, join(sparkContext, f._2, poisKeyPair)))) + implicit val formats = DefaultFormats + Serialization.writePretty(clustersPois, fileWriter) + } + /** + * serialize clustering results to .nt file + */ + def seralizeToNT(sparkContext: SparkContext, clusters: Map[Int, Array[Long]], pois: RDD[Poi]): Unit = { + val assignments = clusters.toList.sortBy { case (k, v) => v.length } + val poisKeyPair = pois.keyBy(f => f.poi_id).persist() + val newAssignment = assignments.map(f => (f._1, sparkContext.parallelize(f._2).map(x => (x, x)).join(poisKeyPair).map(x => ( x._2._2.poi_id, x._2._2.categories, x._2._2.coordinate)).collect())) + val newAssignmentRDD = sparkContext.parallelize(newAssignment) + println(newAssignmentRDD.count()) + val newAssignmentRDDTriple = newAssignmentRDD.map(cluster => (cluster._1, cluster._2.flatMap(poi => + {List(new Triple(NodeFactory.createURI(prefixID + poi._1.toString), + NodeFactory.createURI(prefixCategory), + NodeFactory.createLiteral(poi._2.categories.mkString(","))), + new Triple(NodeFactory.createURI(prefixID + poi._1.toString), + NodeFactory.createURI(prefixCoordinate), + NodeFactory.createLiteral((poi._3.latitude, poi._3.longitude).toString())) + )} + ).toList) + ) + newAssignmentRDDTriple.saveAsTextFile("results/triples") + } + +} + + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/DBCLusterer.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/DBCLusterer.scala new file mode 100644 index 0000000..1609d60 --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/DBCLusterer.scala @@ -0,0 +1,78 @@ +package net.sansa_stack.ml.spark.clustering.utils + +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer + +import net.sansa_stack.ml.spark.clustering.datatypes.DbPOI +import net.sansa_stack.ml.spark.clustering.datatypes.DbStatusEnum._ + +case class DBCLusterer(val eps: Double, val minPts: Int) { + + def clusterPois(poiArrBuff: ArrayBuffer[DbPOI]): ArrayBuffer[ArrayBuffer[DbPOI]] = { + + val clusterArrBuff = ArrayBuffer[ArrayBuffer[DbPOI]]() + val grid = Grid(poiArrBuff, eps) + + for{ + dbpoi <- poiArrBuff + + if(dbpoi.dbstatus == UNDEFINED) + }{ + + val neighbourArrBuff = grid.getNeighbours(dbpoi) + + if(neighbourArrBuff.size < minPts) + { + dbpoi.dbstatus = NOISE + } + else + { + clusterArrBuff.append(findCluster(dbpoi, neighbourArrBuff, grid)) + } + } + + clusterArrBuff + } + + + def findCluster(dbpoi: DbPOI, neighbourArrBuff: ArrayBuffer[DbPOI], grid: Grid): ArrayBuffer[DbPOI] = { + + dbpoi.dbstatus = PARTOFCLUSTER + dbpoi.isDense = true + + val cluster = ArrayBuffer[DbPOI]() + cluster.append(dbpoi) + + val neighbourQueue = mutable.Queue[DbPOI]() ++ neighbourArrBuff + + while(neighbourQueue.nonEmpty) { + val poi = neighbourQueue.dequeue() + poi.dbstatus match { + case UNDEFINED => + poi.dbstatus = PARTOFCLUSTER + val poi_i_neighbours = grid.getNeighbours(poi) 
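For reference, the grid-based DBSCAN variant added here (DBCLusterer together with the Grid helper further down in this patch) can be exercised on its own. A minimal sketch follows; the object name, the sample coordinates, and the eps/minPts values are invented for illustration, while DbPOI and DBCLusterer come from the files in this diff.

import scala.collection.mutable.ArrayBuffer

import net.sansa_stack.ml.spark.clustering.datatypes.DbPOI
import net.sansa_stack.ml.spark.clustering.utils.DBCLusterer

object DBClusterSketch {
  def main(args: Array[String]): Unit = {
    // two tight groups of three points each plus one far-away outlier (made-up lon/lat values)
    val pois = ArrayBuffer(
      DbPOI("a", 16.370, 48.200), DbPOI("b", 16.371, 48.201), DbPOI("c", 16.372, 48.199),
      DbPOI("d", 16.500, 48.300), DbPOI("e", 16.501, 48.301), DbPOI("f", 16.502, 48.299),
      DbPOI("g", 17.000, 47.000))
    // eps is in raw coordinate units here; a core point needs at least minPts neighbours
    val clusters = DBCLusterer(eps = 0.01, minPts = 2).clusterPois(pois)
    clusters.zipWithIndex.foreach { case (members, i) =>
      println(s"cluster $i: ${members.map(_.poiId).mkString(", ")}")
    }
    // the outlier is never appended to a cluster and keeps dbstatus == NOISE
  }
}

Noise handling follows the usual DBSCAN rule: a point first marked NOISE can still be absorbed into a later cluster as a non-dense border point, which is exactly the NOISE branch of findCluster.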
+ if(poi_i_neighbours.size >= minPts) + { + poi.isDense = true + neighbourQueue ++= poi_i_neighbours + } + else + { + poi.isDense = false + } + cluster.append(poi) + case NOISE => + poi.dbstatus = PARTOFCLUSTER + poi.isDense = false + cluster.append(poi) + case _ => () + } + } + + cluster + } + +} + + + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/DataFiltering.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/DataFiltering.scala new file mode 100644 index 0000000..742a90a --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/DataFiltering.scala @@ -0,0 +1,63 @@ +package net.sansa_stack.ml.spark.clustering.utils + +import org.apache.jena.graph.Triple +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SparkSession +import scala.collection.mutable.ArrayBuffer + +import net.sansa_stack.ml.spark.clustering.datatypes.AppConfig +import net.sansa_stack.rdf.spark.io.NTripleReader + +class DataFiltering(val spark: SparkSession, val conf: AppConfig) extends Serializable { + + val dataRDD: RDD[Triple] = NTripleReader.load(spark, conf.dataset.input).persist() + + /** + * Generate triples with related to poi in poiArray, method name not JavaBean format because of side effect with Unit return result + * @param poiArray id of pois in Vienna + * @param dataRDD RDD containing triples + * @param spark SparkSession + * @return + */ + def get_triples(poiArray: Array[Long], dataRDD: RDD[Triple], spark: SparkSession) : (RDD[Triple], RDD[Triple]) = { + // create an array of subjects related with each poi + val subjects = ArrayBuffer[String]() + for (i <- 0 until poiArray.length - 1) { + subjects ++= createSubjects(poiArray(i)) + } + // RDD[Triple] => RDD[(subject, Triple)] + val dataRDDPair = dataRDD.map(f => (f.getSubject.getURI, f)).persist() + // create RDD[(subject, subject)] from Array[subjects] + val subjectsRDD = spark.sparkContext.parallelize(subjects.toSet.toList).map(f => (f, f)).persist() + // get RDD[Triples] with subject in Array[subjects] + val viennaTriples = subjectsRDD.join(dataRDDPair).map(f => f._2._2).persist() + // find filtered Triples with prediction category, and get their object => RDD[Object] + val viennaCatgoriesObjects = viennaTriples.filter(f => f.getPredicate.getURI.equals("http://example.org/def#category")).map(f => f.getObject.getURI).distinct().persist() + // RDD[Object] => RDD[(Object, Object)] + val viennaPoiCategoriesRDD = viennaCatgoriesObjects.map(f => (f, f)).persist() + // RDD[(Object, Object)] => RDD[Triples], where Object is Subject in Triples + val viennaCategoryTriples = viennaPoiCategoriesRDD.join(dataRDDPair).map(f => f._2._2) + // RDD[Triples] => RDD[(Key, Triple)], where key=subject+predicate+object, because there are some duplicated triples in the tomtom data + val temp = viennaCategoryTriples.map(f => (f.getSubject.getURI + f.getPredicate.getURI + f.getObject.toString(), f)).persist() + // remove duplicated triples + val categoryTriples = temp.reduceByKey((v1, v2) => v1).map(f => f._2).persist() + (viennaTriples, categoryTriples) + } + + /** + * @param poiID id of a poi + * @return an array of subject in RDF triples with related to this poi + */ + def createSubjects(poiID: Long): ArrayBuffer[String] = { + val subjects = ArrayBuffer[String]() + val id = "http://example.org/id/poi/".concat(poiID.toString) + subjects.+=(id) + subjects.+=(id.concat("/address")) + subjects.+=(id.concat("/phone")) + subjects.+=(id.concat("/geometry")) + 
subjects.+=(id.concat("/name")) + subjects.+=(id.concat("/accuracy_info")) + subjects.+=(id.concat("/brandname")) + subjects + } +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/DataProcessing.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/DataProcessing.scala new file mode 100644 index 0000000..89c892f --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/DataProcessing.scala @@ -0,0 +1,178 @@ +package net.sansa_stack.ml.spark.clustering.utils + +import java.io.{File, FilenameFilter} + +import com.typesafe.config.Config +import org.apache.jena.graph.Triple +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SparkSession + +import net.sansa_stack.ml.spark.clustering.datatypes.{Categories, CoordinatePOI, Poi} +import net.sansa_stack.rdf.spark.io.NTripleReader + + + +/** + * load TomTom dataset + * @param spark SparkSession + * @param conf Configuration + */ +class DataProcessing(val spark: SparkSession, val conf: Config) extends Serializable { + + // val dataRDD: RDD[Triple] = NTripleReader.load(spark, conf.getString("sansa.data.input")).persist() + val dataRDD: RDD[Triple] = loadNTriple(conf.getString("sansa.data.input")) + + // var poiCoordinates: RDD[(Long, Coordinate)] = this.getPOICoordinates(16.192851, 16.593533, 48.104194, 48.316388).sample(withReplacement = false, fraction = 0.01, seed = 0) + var poiCoordinates: RDD[(Long, CoordinatePOI)] = this.getPOICoordinates + var poiFlatCategoryId: RDD[(Long, Long)] = this.getPOIFlatCategoryId + var poiCategoryId: RDD[(Long, Set[Long])] = this.getCategoryId(poiCoordinates, poiFlatCategoryId).persist() + var poiCategoryValueSet: RDD[(Long, Categories)] = this.getCategoryValues // (category_id, Categories) + var poiCategories: RDD[(Long, Categories)] = this.getPOICategories(poiCoordinates, poiFlatCategoryId, poiCategoryValueSet) // (poi_id, Categories) + val poiYelpCategories: RDD[(Long, (Categories, Double))] = this.getYelpCategories(dataRDD).sample(withReplacement = false, fraction = 0.1, seed = 0) + var pois: RDD[Poi] = { if (!poiYelpCategories.isEmpty()) { + // val poiAllCategories: RDD[(Long, Categories, Double)] = poiCategories.join(poiYelpCategories).map(x => (x._1, (Categories(x._2._1.categories++x._2._2._1.categories), x._2._2._2)) + val poiAllCategories: RDD[(Long, (Categories, Double))] = poiYelpCategories.join(poiCategories).map(x => (x._1, (Categories(x._2._1._1.categories++x._2._2.categories), x._2._1._2))) + poiCoordinates.join(poiAllCategories).map(x => Poi(x._1, x._2._1, x._2._2._1, x._2._2._2)).persist() + } else { + println("--------pois--------------") + poiCoordinates.join(poiCategories).map(x => Poi(x._1, x._2._1, x._2._2, 0.0)).persist() + }} + + def loadNTriple(tripleFilePath: String): RDD[Triple] = { + val tripleFile = new File(tripleFilePath) + if(tripleFile.isDirectory) { + val files = tripleFile.listFiles(new FilenameFilter() { + def accept(tripleFile: File, name: String): Boolean = { + !(name.toString.contains("SUCCESS") || name.toLowerCase.endsWith(".crc")) + } + }) + var i = 0 + var triple_0 = NTripleReader.load(spark, files(0).getAbsolutePath) + for(file <- files) { + if (i!=0) { + triple_0 = triple_0.union(NTripleReader.load(spark, file.getAbsolutePath)) + } + i+=1 + } + triple_0 + } + else { + NTripleReader.load(spark, tripleFile.getAbsolutePath) + } + } + + + /** + * @param poiCoordinates super set of poi with coordinates + * @param lo_min min longitude + * @param lo_max max longitude + * 
@param la_min min latitude + * @param la_max max latitude + * @return pois within certain coordinates + */ + def filterCoordinates(poiCoordinates: RDD[(Long, CoordinatePOI)], lo_min: Double, lo_max: Double, la_min: Double, la_max: Double): RDD[(Long, CoordinatePOI)] = { + poiCoordinates.filter(x => (x._2.longitude >= lo_min && x._2.longitude <= lo_max) + && (x._2.latitude >= la_min && x._2.latitude <= la_max)) + } + + /** + * get coordinate for all poi + */ + def getPOICoordinates: RDD[(Long, CoordinatePOI)] = { + // get the coordinates of pois + val pattern = "POINT(.+ .+)".r + val poiCoordinatesString = dataRDD.filter(x => x.getPredicate.toString().equalsIgnoreCase(conf.getString("sansa.data.coordinatesPredicate"))) + .map(x => (x.getSubject.toString().replace(conf.getString("sansa.data.poiPrefix"), "").replace("/geometry", "").toLong, + pattern.findFirstIn(x.getObject.toString()).head.replace("POINT", "") + .replace("^^http://www.opengis.net/ont/geosparql#wktLiteral", "").replaceAll("^\"|\"$", ""))) + // transform to Coordinate object + poiCoordinatesString.mapValues(x => { + val coordinates = x.replace("(", "").replace(")", "").split(" ") + CoordinatePOI(coordinates(0).toDouble, coordinates(1).toDouble) + }) + } + + /** + * load data filter on geo-coordinates + * @param lo_min min longitude + * @param lo_max max longitude + * @param la_min min latitude + * @param la_max max latitude + */ + def getPOICoordinates(lo_min: Double, lo_max: Double, la_min: Double, la_max: Double): RDD[(Long, CoordinatePOI)] = { + this.filterCoordinates(poiCoordinates = this.getPOICoordinates, lo_min = lo_min, lo_max = lo_max, la_min = la_min, la_max = la_max) + } + + /** + * + * @return (poi, category_id) + */ + def getPOIFlatCategoryId: RDD[(Long, Long)] = { + val poiFlatCategories = dataRDD.filter(x => x.getPredicate.toString().equalsIgnoreCase(conf.getString("sansa.data.categoryPOI"))) + poiFlatCategories.map(x => ( + x.getSubject.toString().replace(conf.getString("sansa.data.poiPrefix"), "").toLong, + x.getObject.toString().replace(conf.getString("sansa.data.termPrefix"), "").toLong) + ) + } + + /** + * get (poi_unique, Categories) + * @param poiCoordinates (poi_unique, Coordinate) + * @param poiFlatCategoryId (poi, category_id) + * @param poiCategoryValueSet (category_id, Categories) + * @return (poi, Categories) + */ + def getPOICategories(poiCoordinates: RDD[(Long, CoordinatePOI)], poiFlatCategoryId: RDD[(Long, Long)], poiCategoryValueSet: RDD[(Long, Categories)]): RDD[(Long, Categories)] = { + // from (poi, category_id) map-> (category_id, poi) join-> (category_id, (poi, Categories)) map-> (poi, Categories) groupByKey-> (poi_unique, Iterable(Categories)) + val poiCategorySets = poiFlatCategoryId.map(f => (f._2, f._1)).join(poiCategoryValueSet).map(f => (f._2._1, f._2._2)).groupByKey() + // from (poi_unique, Iterable(Categories)) join-> (poi_unique, (Coordinate, Iterable(Categories))) map-> (poi_unique, Categories) + poiCoordinates.join(poiCategorySets).map(x => (x._1, Categories(collection.mutable.Set(x._2._2.flatMap(_.categories).toList: _*)))) + } + + /** + * get (category_id, Categories) + * @return RDD with category values for category id + */ + def getCategoryValues: RDD[(Long, Categories)] = { + // get category id(s) + val categoryTriples = dataRDD.filter(x => x.getPredicate.toString().equalsIgnoreCase(conf.getString("sansa.data.termValueUri"))) + // get category id and it's corresponding values + val categoriesIdValues = categoryTriples.map(x => ( + 
x.getSubject.toString().replace(conf.getString("sansa.data.termPrefix"), "").toLong, + x.getObject.toString().replaceAll("\"", ""))) + // group by id and put all values of category to a set + categoriesIdValues.groupByKey().map(x => (x._1, Categories(scala.collection.mutable.Set(x._2.toList: _*)))) + } + + /** + * get (poi_unique, poi_category_id_set) + * @param poiCoordinates (poi_unique, Coordinate) + * @param poiFlatCategoryId (poi, category_id) + */ + def getCategoryId(poiCoordinates: RDD[(Long, CoordinatePOI)], poiFlatCategoryId: RDD[(Long, Long)]): RDD[(Long, Set[Long])] = { + poiCoordinates.join(poiFlatCategoryId.groupByKey()) + .map(x => (x._1, x._2._2.toSet)) + } + + + def getYelpCategories(mergedRDD: RDD[Triple]): RDD[(Long, (Categories, Double))] = { + val yelpPOICategory = mergedRDD.filter(triple => triple.getPredicate.toString.equalsIgnoreCase(conf.getString("yelp.data.categoryPOI"))) + println(conf.getString("yelp.data.rating")) + val yelpPOIRating = mergedRDD.filter(triple => triple.getPredicate.toString.contains(conf.getString("yelp.data.rating"))) + println("category") + println(yelpPOICategory.count()) + println("rating") + println(yelpPOIRating.count()) + val yelpPOICategoryMapped = yelpPOICategory.map(triple => ( + triple.getSubject.toString().replace(conf.getString("sansa.data.poiPrefix"), "").toLong, + triple.getObject.toString() + )) + val yelpPOIRatingMapped = yelpPOIRating.map(triple => ( + triple.getSubject.toString().replace(conf.getString("sansa.data.poiPrefix"), "").toLong, + triple.getObject.getLiteralValue.toString.toDouble + )) + yelpPOICategoryMapped.groupByKey().join(yelpPOIRatingMapped).map(x => (x._1, (Categories(scala.collection.mutable.Set(x._2._1.toList: _*)), x._2._2))) + } +} + + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/Grid.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/Grid.scala new file mode 100644 index 0000000..74d5a9c --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/Grid.scala @@ -0,0 +1,55 @@ +package net.sansa_stack.ml.spark.clustering.utils + +import scala.collection.mutable.ArrayBuffer +import scala.collection.mutable.HashMap + +import net.sansa_stack.ml.spark.clustering.datatypes.DbPOI + +case class Grid(val poiArrBuf: ArrayBuffer[DbPOI], val eps: Double) { + + val startX = poiArrBuf.head.lon + val startY = poiArrBuf.head.lat + val gridCell = HashMap[(Int, Int), ArrayBuffer[DbPOI]]() + + init() + + private def init(): Unit = { + var i = 0 + var j = 0 + for(dbpoi <- poiArrBuf) { + i = math.floor( (dbpoi.lon - startX) / eps).toInt + j = math.floor( (dbpoi.lat - startY) / eps).toInt + + gridCell.get((i, j)) match { + case Some(cellArrBuff) => cellArrBuff.append(dbpoi) + case None => gridCell += ( ((i, j), ArrayBuffer(dbpoi)) ) + } + } + } + + + def getNeighbours(dbpoi: DbPOI): ArrayBuffer[DbPOI] = { + + val neighbourArrBuff = ArrayBuffer[DbPOI]() + + val celli = math.floor( (dbpoi.lon - startX) / eps).toInt + val cellj = math.floor( (dbpoi.lat - startY) / eps).toInt + for{ + i <- (celli - 1) to (celli + 1) + j <- (cellj - 1) to (cellj + 1) + }{ + gridCell.get((i, j)) match { + case Some(cellArrBuff) => neighbourArrBuff ++= cellArrBuff + case None => () + } + } + + neighbourArrBuff.filter{ + p => (math.abs(p.lon - dbpoi.lon) <= eps) && (math.abs(p.lat - dbpoi.lat) <= eps) && p.poiId != dbpoi.poiId + } + + } + +} + + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastGraphKernel.scala 
b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastGraphKernel.scala index a49c2b3..a7ef9b4 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastGraphKernel.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastGraphKernel.scala @@ -1,18 +1,18 @@ package net.sansa_stack.ml.spark.kernel +import org.apache.jena.graph.Triple import org.apache.spark.ml.feature.{ CountVectorizer, CountVectorizerModel, StringIndexer } import org.apache.spark.mllib.linalg.SparseVector import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD -import org.apache.spark.sql.functions._ import org.apache.spark.sql.{ DataFrame, SparkSession } -import org.apache.jena.graph.Triple +import org.apache.spark.sql.functions._ class RDFFastGraphKernel( @transient val sparkSession: SparkSession, - val tripleRDD: RDD[Triple], - val predicateToPredict: String) extends Serializable { + val tripleRDD: RDD[Triple], + val predicateToPredict: String) extends Serializable { import sparkSession.implicits._ @@ -84,8 +84,8 @@ class RDFFastGraphKernel( object RDFFastGraphKernel { def apply( - sparkSession: SparkSession, - tripleRDD: RDD[Triple], + sparkSession: SparkSession, + tripleRDD: RDD[Triple], predicateToPredict: String): RDFFastGraphKernel = { new RDFFastGraphKernel(sparkSession, tripleRDD, predicateToPredict) diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernel.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernel.scala index 1ea7080..a3f64fa 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernel.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernel.scala @@ -1,13 +1,14 @@ package net.sansa_stack.ml.spark.kernel -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{ DataFrame, SparkSession } -import org.apache.spark.sql.functions._ +import org.apache.jena.graph.Triple import org.apache.spark.ml.feature.{ CountVectorizer, CountVectorizerModel } import org.apache.spark.mllib.linalg.SparseVector -import org.apache.spark.mllib.util.MLUtils import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.jena.graph.Triple +import org.apache.spark.mllib.util.MLUtils +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{ DataFrame, SparkSession } +import org.apache.spark.sql.functions._ + object Uri2Index { /* @@ -81,9 +82,9 @@ object Uri2Index { class RDFFastTreeGraphKernel( @transient val sparkSession: SparkSession, - val tripleRDD: RDD[Triple], - val instanceDF: DataFrame, - val maxDepth: Int) extends Serializable { + val tripleRDD: RDD[Triple], + val instanceDF: DataFrame, + val maxDepth: Int) extends Serializable { /* * Construct Triples DataFrame and Instances DataFrame * Also, Get/Set Index for each URI and Literal @@ -168,9 +169,9 @@ object RDFFastTreeGraphKernel { def apply( sparkSession: SparkSession, - tripleRDD: RDD[Triple], - instanceDF: DataFrame, - maxDepth: Int): RDFFastTreeGraphKernel = { + tripleRDD: RDD[Triple], + instanceDF: DataFrame, + maxDepth: Int): RDFFastTreeGraphKernel = { new RDFFastTreeGraphKernel(sparkSession, tripleRDD, instanceDF, maxDepth) } diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernelApp.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernelApp.scala index b3ef264..6ecc25d 
100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernelApp.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernelApp.scala @@ -46,7 +46,7 @@ object RDFFastTreeGraphKernelApp { } def experimentAffiliationPrediction(sparkSession: SparkSession, depth: Int, iteration: Int): Unit = { - //val input = "src/main/resources/kernel/aifb-fixed_complete4.nt" + // val input = "src/main/resources/kernel/aifb-fixed_complete4.nt" val input = "src/main/resources/kernel/aifb-fixed_no_schema4.nt" val t0 = System.nanoTime @@ -137,7 +137,7 @@ object RDFFastTreeGraphKernelApp { tripleRDD.filter(_.getPredicate.getURI == "http://data.bgs.ac.uk/ref/Lexicon/hasTheme") .foreach(f => Uri2Index.setInstanceAndLabel(f.getSubject.toString, f.getObject.toString)) - val filteredTripleRDD=tripleRDD + val filteredTripleRDD = tripleRDD .filter(_.getPredicate.getURI != "http://data.bgs.ac.uk/ref/Lexicon/hasTheme") val instanceDF = Uri2Index.getInstanceLabelsDF(sparkSession) diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernelUtil.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernelUtil.scala index f4bb7f0..a8f2ec7 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernelUtil.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernelUtil.scala @@ -2,31 +2,31 @@ package net.sansa_stack.ml.spark.kernel import org.apache.jena.graph import org.apache.spark.ml.feature.StringIndexer -import org.apache.spark.mllib.classification.{LogisticRegressionModel, LogisticRegressionWithLBFGS} +import org.apache.spark.mllib.classification.{ LogisticRegressionModel, LogisticRegressionWithLBFGS } import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{ DataFrame, SparkSession } import org.apache.spark.sql.functions._ -import org.apache.spark.sql.{DataFrame, SparkSession} - object RDFFastTreeGraphKernelUtil { - def triplesToDF(sparkSession: SparkSession, - triples: RDD[graph.Triple], - subjectColName:String = "subject", - predicateColName:String = "predicate", - objectColName:String ="object" - ): DataFrame = { + def triplesToDF( + sparkSession: SparkSession, + triples: RDD[graph.Triple], + subjectColName: String = "subject", + predicateColName: String = "predicate", + objectColName: String = "object"): DataFrame = { import sparkSession.implicits._ - triples.map(f => (f.getSubject.toString,f.getPredicate.toString,f.getObject.toString)) + triples.map(f => (f.getSubject.toString, f.getPredicate.toString, f.getObject.toString)) .toDF(subjectColName, predicateColName, objectColName) } - def getInstanceAndLabelDF( filteredTripleDF: DataFrame, - subjectColName:String = "subject", - objectColName:String ="object" ): DataFrame = { + def getInstanceAndLabelDF( + filteredTripleDF: DataFrame, + subjectColName: String = "subject", + objectColName: String = "object"): DataFrame = { /* root |-- instance: string (nullable = true) @@ -47,7 +47,7 @@ object RDFFastTreeGraphKernelUtil { indexedDF } - def predictLogisticRegressionMLLIB(data: RDD[LabeledPoint], numClasses : Int = 2, maxIteration: Int = 5): Unit = { + def predictLogisticRegressionMLLIB(data: RDD[LabeledPoint], numClasses: Int = 2, maxIteration: Int = 5): Unit = { val t0 = System.nanoTime data.cache() @@ -61,7 +61,7 @@ object 
RDFFastTreeGraphKernelUtil { val validation = splits(1) val model = new LogisticRegressionWithLBFGS().setNumClasses(numClasses).run(training) - val predictions = validation.map{ point => + val predictions = validation.map { point => val prediction = model.predict(point.features) (point.label, prediction) } @@ -73,15 +73,14 @@ object RDFFastTreeGraphKernelUtil { var sumOfAccuracy = 0.0 - for ( seed <- 1 to maxIteration ) { + for (seed <- 1 to maxIteration) { val (model, accuracy) = trainAndValidate(data, seed) -// println(accuracy) + // println(accuracy) sumOfAccuracy += accuracy } val t2 = System.nanoTime - // score the model on test data. println("Average Accuracy: " + sumOfAccuracy / maxIteration) diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernel_v2.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernel_v2.scala index 5f9aa81..6c24a7f 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernel_v2.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernel_v2.scala @@ -1,18 +1,18 @@ package net.sansa_stack.ml.spark.kernel -import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel, StringIndexer} +import org.apache.spark.ml.feature.{ CountVectorizer, CountVectorizerModel, StringIndexer } import org.apache.spark.mllib.linalg.SparseVector import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{ DataFrame, SparkSession } import org.apache.spark.sql.functions._ -import org.apache.spark.sql.{DataFrame, SparkSession} -class RDFFastTreeGraphKernel_v2 (@transient val sparkSession: SparkSession, - val tripleDF: DataFrame, - val instanceDF: DataFrame, - val maxDepth: Int - ) extends Serializable { +class RDFFastTreeGraphKernel_v2( + @transient val sparkSession: SparkSession, + val tripleDF: DataFrame, + val instanceDF: DataFrame, + val maxDepth: Int) extends Serializable { def computeFeatures(): DataFrame = { /* @@ -46,7 +46,6 @@ class RDFFastTreeGraphKernel_v2 (@transient val sparkSession: SparkSession, intermediateDF.createOrReplaceTempView("df") } - // Indexing on path val indexer = new StringIndexer() .setInputCol("path") @@ -59,12 +58,10 @@ class RDFFastTreeGraphKernel_v2 (@transient val sparkSession: SparkSession, .agg(collect_list("pathIndex") as "paths") .toDF("instance", "label", "paths") - // CountVectorize the aggregated paths val cvModel: CountVectorizerModel = new CountVectorizer().setInputCol("paths").setOutputCol("features").fit(aggDF) val dataML = cvModel.transform(aggDF) - dataML } @@ -97,11 +94,11 @@ class RDFFastTreeGraphKernel_v2 (@transient val sparkSession: SparkSession, object RDFFastTreeGraphKernel_v2 { - def apply(sparkSession: SparkSession, - tripleDF: DataFrame, - instanceDF: DataFrame, - maxDepth: Int - ): RDFFastTreeGraphKernel_v2 = { + def apply( + sparkSession: SparkSession, + tripleDF: DataFrame, + instanceDF: DataFrame, + maxDepth: Int): RDFFastTreeGraphKernel_v2 = { new RDFFastTreeGraphKernel_v2(sparkSession, tripleDF, instanceDF, maxDepth) } diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/Bootstrapping.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/Bootstrapping.scala index 02bbfa9..7350c53 100644 --- 
a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/Bootstrapping.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/Bootstrapping.scala @@ -1,5 +1,8 @@ package net.sansa_stack.ml.spark.kge.linkprediction.crossvalidation +import net.sansa_stack.rdf.spark.kge.triples.IntegerTriples +import org.apache.spark.sql._ + /** * Bootstrapping * ------------- @@ -8,18 +11,12 @@ package net.sansa_stack.ml.spark.kge.linkprediction.crossvalidation * * Created by lpfgarcia */ - -import org.apache.spark.sql._ - -import net.sansa_stack.rdf.spark.kge.triples.IntegerTriples - class Bootstrapping(data: Dataset[IntegerTriples]) - extends CrossValidation[Dataset[IntegerTriples]] { + extends CrossValidation[Dataset[IntegerTriples]] { - def crossValidation() = { + def crossValidation(): (Dataset[IntegerTriples], Dataset[IntegerTriples]) = { val train = data.sample(true, 1) val test = data.except(train) (train, test) } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/CrossValidation.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/CrossValidation.scala index 791d0b1..b55c36a 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/CrossValidation.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/CrossValidation.scala @@ -13,4 +13,4 @@ trait CrossValidation[T] { def crossValidation: (T, T) -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/Holdout.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/Holdout.scala index f25bc84..1cbf42d 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/Holdout.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/Holdout.scala @@ -1,5 +1,8 @@ package net.sansa_stack.ml.spark.kge.linkprediction.crossvalidation +import net.sansa_stack.rdf.spark.kge.triples.IntegerTriples +import org.apache.spark.sql._ + /** * Hould Out * --------- @@ -8,22 +11,17 @@ package net.sansa_stack.ml.spark.kge.linkprediction.crossvalidation * * Created by lpfgarcia */ - -import org.apache.spark.sql._ - -import net.sansa_stack.rdf.spark.kge.triples.IntegerTriples - case class rateException(info: String) extends Exception class Holdout(data: Dataset[IntegerTriples], rate: Float) extends CrossValidation[Dataset[IntegerTriples]] { - if (rate < 0 || rate >= 1) + if (rate < 0 || rate >= 1) { throw new rateException("Rate value should be higher than 0 and lower than 1") + } - def crossValidation() = { + def crossValidation(): (Dataset[IntegerTriples], Dataset[IntegerTriples]) = { val train = data.sample(false, rate) val test = data.except(train) (train, test) } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/kFold.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/kFold.scala index 97021e2..eed57a2 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/kFold.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/kFold.scala @@ -1,5 +1,8 
@@ package net.sansa_stack.ml.spark.kge.linkprediction.crossvalidation +import net.sansa_stack.rdf.spark.kge.triples.IntegerTriples +import org.apache.spark.sql._ + /** * k-fold Cross Validation * ----------------------- @@ -9,26 +12,23 @@ package net.sansa_stack.ml.spark.kge.linkprediction.crossvalidation * Created by lpfgarcia */ -import org.apache.spark.sql._ - -import net.sansa_stack.rdf.spark.kge.triples.IntegerTriples - case class kException(info: String) extends Exception case class withIndex(Subject: Int, Predicate: Int, Object: Int, k: Int) class kFold(data: Dataset[IntegerTriples], k: Int, sk: SparkSession) - extends CrossValidation[Seq[Dataset[IntegerTriples]]] { + extends CrossValidation[Seq[Dataset[IntegerTriples]]] { import sk.implicits._ - if (k > 1 && k <= 10) + if (k <= 1 || k > 10) { throw new kException("The k value should be higher than 1 and lower or equal to 10") + } val id = (1 to data.count().toInt / k).flatMap(List.fill(k)(_)) val fold = sk.sparkContext.parallelize(id, data.rdd.getNumPartitions) - def crossValidation() = { + def crossValidation(): (IndexedSeq[Dataset[IntegerTriples]], IndexedSeq[Dataset[IntegerTriples]]) = { val df = sk.createDataFrame(data.rdd.zip(fold).map { r => withIndex(r._1.Subject, r._1.Predicate, r._1.Object, r._2) @@ -45,4 +45,4 @@ class kFold(data: Dataset[IntegerTriples], k: Int, sk: SparkSession) (train, test) } -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/evaluate/Evaluate.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/evaluate/Evaluate.scala index 1fda916..0d092e8 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/evaluate/Evaluate.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/evaluate/Evaluate.scala @@ -9,9 +9,8 @@ package net.sansa_stack.ml.spark.kge.linkprediction.evaluate object Evaluate { - def meanRank(left: Array[Float], right: Array[Float]) { + def meanRank(left: Array[Float], right: Array[Float]): (Float, Float) = { (left.sum / left.length, right.sum / right.length) } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/DistMult.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/DistMult.scala index 51fcfe5..da3f6ac 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/DistMult.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/DistMult.scala @@ -1,5 +1,11 @@ package net.sansa_stack.ml.spark.kge.linkprediction.models +import com.intel.analytics.bigdl.optim.Adam +import com.intel.analytics.bigdl.tensor.Tensor +import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric.NumericFloat +import net.sansa_stack.rdf.spark.kge.triples.{ IntegerTriples, StringTriples } +import org.apache.spark.sql._ + /** * DistMult: diagonal bilinear model * --------------------------------- @@ -9,24 +15,15 @@ package net.sansa_stack.ml.spark.kge.linkprediction.models * * Created by lpfgarcia on 20/11/2017.
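The three splitters in this package all implement the small CrossValidation trait, so choosing a protocol is a matter of instantiating one of them. A usage sketch, assuming a Dataset[IntegerTriples] named triples has already been loaded (the 0.8 rate and k = 5 are arbitrary values, not defaults):

import net.sansa_stack.ml.spark.kge.linkprediction.crossvalidation.{ Bootstrapping, Holdout, kFold }
import net.sansa_stack.rdf.spark.kge.triples.IntegerTriples
import org.apache.spark.sql.{ Dataset, SparkSession }

def splitExamples(spark: SparkSession, triples: Dataset[IntegerTriples]): Unit = {
  // Hold out 20% of the triples for testing.
  val (train, test) = new Holdout(triples, 0.8f).crossValidation()

  // Sample with replacement; everything not drawn becomes the test set.
  val (bootTrain, bootTest) = new Bootstrapping(triples).crossValidation()

  // Five folds; each position in the two sequences is one train/test pair.
  val (folds, heldOut) = new kFold(triples, 5, spark).crossValidation()

  println(s"holdout: train=${train.count()}, test=${test.count()}, folds=${folds.size}")
}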
*/ - -import org.apache.spark.sql._ - -import com.intel.analytics.bigdl.optim.Adam -import com.intel.analytics.bigdl.tensor.Tensor -import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric.NumericFloat - -import net.sansa_stack.rdf.spark.kge.triples.{StringTriples,IntegerTriples} - class DistMult(train: Dataset[IntegerTriples], ne: Int, nr: Int, batch: Int, k: Int, sk: SparkSession) - extends Models(ne: Int, nr: Int, batch: Int, k: Int, sk: SparkSession) { + extends Models(ne: Int, nr: Int, batch: Int, k: Int, sk: SparkSession) { val epochs = 100 val rate = 0.01f var opt = new Adam(learningRate = rate) - def dist(data: Dataset[IntegerTriples]) = { + def dist(data: Dataset[IntegerTriples]): Float = { val aux = data.collect().map { i => e(i.Subject) * r(i.Predicate) * e(i.Object) }.reduce((a, b) => a + b) @@ -34,7 +31,7 @@ class DistMult(train: Dataset[IntegerTriples], ne: Int, nr: Int, batch: Int, k: L2(aux) } - def run() = { + def run(): Unit = { for (i <- 1 to epochs) { @@ -53,5 +50,4 @@ class DistMult(train: Dataset[IntegerTriples], ne: Int, nr: Int, batch: Int, k: } } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/Models.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/Models.scala index 7720f30..1bfb3ad 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/Models.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/Models.scala @@ -1,23 +1,21 @@ package net.sansa_stack.ml.spark.kge.linkprediction.models -/** - * Model Abstract Class - * -------------------- - * - * Created by lpfgarcia on 14/11/2017. - */ - import scala.math._ import scala.util._ -import org.apache.spark.sql._ - import com.intel.analytics.bigdl.nn.Power import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric.NumericFloat +import net.sansa_stack.rdf.spark.kge.triples.{ IntegerTriples, StringTriples } +import org.apache.spark.sql._ -import net.sansa_stack.rdf.spark.kge.triples.{StringTriples,IntegerTriples} +/** + * Model Abstract Class + * -------------------- + * + * Created by lpfgarcia on 14/11/2017. 
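The dist method above is the DistMult scoring function: a triple (s, p, o) is scored by the component-wise product of the subject embedding, the diagonal relation vector and the object embedding, summed over the embedding dimension. A plain-Scala sketch of that score on Array[Float] embeddings, kept independent of the BigDL Tensor API used above:

// DistMult score: sum_i e_s(i) * w_p(i) * e_o(i); a larger value means a more plausible triple.
def distMultScore(eS: Array[Float], wP: Array[Float], eO: Array[Float]): Float = {
  require(eS.length == wP.length && wP.length == eO.length, "embeddings must share the dimension k")
  var score = 0.0f
  for (i <- eS.indices) score += eS(i) * wP(i) * eO(i)
  score
}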
+ */ abstract class Models(ne: Int, nr: Int, batch: Int, k: Int, sk: SparkSession) { val Ne = ne @@ -26,11 +24,11 @@ abstract class Models(ne: Int, nr: Int, batch: Int, k: Int, sk: SparkSession) { var e = initialize(ne) var r = normalize(initialize(nr)) - def initialize(size: Int) = { + def initialize(size: Int): Tensor[Float] = { Tensor(size, k).rand(-6 / sqrt(k), 6 / sqrt(k)) } - def normalize(data: Tensor[Float]) = { + def normalize(data: Tensor[Float]): Tensor[Float] = { data / data.abs().sum() } @@ -38,7 +36,7 @@ abstract class Models(ne: Int, nr: Int, batch: Int, k: Int, sk: SparkSession) { val seed = new Random(System.currentTimeMillis()) - def tuple(aux: IntegerTriples) = { + def tuple(aux: IntegerTriples): IntegerTriples = { if (seed.nextBoolean()) { IntegerTriples(seed.nextInt(Ne) + 1, aux.Predicate, aux.Object) } else { @@ -46,20 +44,20 @@ abstract class Models(ne: Int, nr: Int, batch: Int, k: Int, sk: SparkSession) { } } - def negative(data: Dataset[IntegerTriples]) = { + def negative(data: Dataset[IntegerTriples]): Dataset[IntegerTriples] = { data.map(i => tuple(i)) } - def subset(data: Dataset[IntegerTriples]) = { + def subset(data: Dataset[IntegerTriples]): Dataset[IntegerTriples] = { data.sample(false, 2 * (batch.toDouble / data.count().toDouble)).limit(batch) } - def L1(vec: Tensor[Float]) = { + def L1(vec: Tensor[Float]): Float = { vec.abs().sum() } - def L2(vec: Tensor[Float]) = { + def L2(vec: Tensor[Float]): Float = { vec.pow(2).sqrt().sum() } -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/TransE.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/TransE.scala index f5fb0db..43a4205 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/TransE.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/TransE.scala @@ -1,5 +1,14 @@ package net.sansa_stack.ml.spark.kge.linkprediction.models +import scala.math._ + +import com.intel.analytics.bigdl.optim.Adam +import com.intel.analytics.bigdl.tensor.Tensor +import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric.NumericFloat +import net.sansa_stack.rdf.spark.kge.triples.{ IntegerTriples, StringTriples } +import org.apache.spark.sql._ + + /** * TransE embedding model * ---------------------- @@ -9,19 +18,8 @@ package net.sansa_stack.ml.spark.kge.linkprediction.models * * Created by lpfgarcia on 14/11/2017. 
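The Models base class above supplies what every embedding model here reuses: uniform initialisation in [-6/sqrt(k), 6/sqrt(k)] and negative sampling that corrupts either the subject or the object of a training triple (the tuple and negative methods). A stand-alone sketch of the corruption step in plain Scala, working on raw (s, p, o) entity and relation ids rather than the IntegerTriples case class:

import scala.util.Random

// Replace one side of an (s, p, o) triple with a random entity id in 1..numEntities,
// yielding a negative example for margin-based training.
def corrupt(s: Int, p: Int, o: Int, numEntities: Int, rnd: Random): (Int, Int, Int) =
  if (rnd.nextBoolean()) (rnd.nextInt(numEntities) + 1, p, o)
  else (s, p, rnd.nextInt(numEntities) + 1)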
*/ - -import scala.math._ - -import org.apache.spark.sql._ - -import com.intel.analytics.bigdl.optim.Adam -import com.intel.analytics.bigdl.tensor.Tensor -import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric.NumericFloat - -import net.sansa_stack.rdf.spark.kge.triples.{StringTriples,IntegerTriples} - class TransE(train: Dataset[IntegerTriples], ne: Int, nr: Int, batch: Int, k: Int, margin: Float, L: String, sk: SparkSession) - extends Models(ne: Int, nr: Int, batch: Int, k: Int, sk: SparkSession) { + extends Models(ne: Int, nr: Int, batch: Int, k: Int, sk: SparkSession) { val epochs = 1000 val rate = 0.01f @@ -30,12 +28,12 @@ class TransE(train: Dataset[IntegerTriples], ne: Int, nr: Int, batch: Int, k: In val myL = L match { case "L2" => L2 _ - case _ => L1 _ + case _ => L1 _ } import sk.implicits._ - def dist(data: Dataset[IntegerTriples]) = { + def dist(data: Dataset[IntegerTriples]): Float = { val aux = data.collect().map { i => e(i.Subject) + r(i.Predicate) - e(i.Object) @@ -44,11 +42,11 @@ class TransE(train: Dataset[IntegerTriples], ne: Int, nr: Int, batch: Int, k: In myL(aux) } - def dist(row: IntegerTriples) = { + def dist(row: IntegerTriples): Tensor[Float] = { e(row.Subject) + r(row.Predicate) - e(row.Object) } - def run() = { + def run(): Unit = { for (i <- 1 to epochs) { @@ -70,5 +68,4 @@ class TransE(train: Dataset[IntegerTriples], ne: Int, nr: Int, batch: Int, k: In } } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/prediction/Predict.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/prediction/Predict.scala index 29cee10..e1c8227 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/prediction/Predict.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/prediction/Predict.scala @@ -1,29 +1,27 @@ package net.sansa_stack.ml.spark.kge.linkprediction.prediction +import net.sansa_stack.rdf.spark.kge.triples.{ IntegerTriples, StringTriples } +import org.apache.spark.sql._ + /** * Predict Abstract Class * ---------------------- * * Created by lpfgarcia on 14/11/2017. 
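TransE, as wired up above, scores a triple by the distance between e(s) + r(p) and e(o), using either the L1 or the L2 norm, and training pushes each positive triple to score at least margin better than its corrupted counterpart. The score and the standard margin-based ranking loss, as a plain-Scala sketch:

// ||e_s + r_p - e_o||_1, the TransE dissimilarity; smaller means more plausible.
def transEDistL1(eS: Array[Float], rP: Array[Float], eO: Array[Float]): Float = {
  var d = 0.0f
  for (i <- eS.indices) d += math.abs(eS(i) + rP(i) - eO(i))
  d
}

// Loss for one (positive, negative) pair: max(0, margin + d(pos) - d(neg)).
def marginLoss(posDist: Float, negDist: Float, margin: Float): Float =
  math.max(0.0f, margin + posDist - negDist)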
*/ - -import org.apache.spark.sql._ - -import net.sansa_stack.rdf.spark.kge.triples.{StringTriples,IntegerTriples} - abstract class Evaluate(test: Dataset[IntegerTriples]) { - def left(row: IntegerTriples, i: Int) = { + def left(row: IntegerTriples, i: Int): IntegerTriples = { IntegerTriples(i, row.Predicate, row.Object) } - def right(row: IntegerTriples, i: Int) = { + def right(row: IntegerTriples, i: Int): IntegerTriples = { IntegerTriples(row.Subject, row.Predicate, i) } def rank(row: IntegerTriples, spo: String): Integer - def ranking() = { + def ranking(): (Seq[Integer], Seq[Integer]) = { var l, r = Seq[Integer]() @@ -35,7 +33,7 @@ abstract class Evaluate(test: Dataset[IntegerTriples]) { (l, r) } - def rawHits10() = { + def rawHits10(): (Seq[Boolean], Seq[Boolean]) = { var l, r = Seq[Boolean]() @@ -46,5 +44,4 @@ abstract class Evaluate(test: Dataset[IntegerTriples]) { (l, r) } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/prediction/PredictTransE.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/prediction/PredictTransE.scala index 263c19d..a7c60dc 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/prediction/PredictTransE.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/prediction/PredictTransE.scala @@ -1,28 +1,26 @@ package net.sansa_stack.ml.spark.kge.linkprediction.prediction +import org.apache.spark.sql._ + +import net.sansa_stack.ml.spark.kge.linkprediction.models.TransE +import net.sansa_stack.rdf.spark.kge.triples.{ IntegerTriples, StringTriples } + /** * Predict TransE Class * -------------------- * * Created by lpfgarcia on 14/11/2017. */ - -import org.apache.spark.sql._ - -import net.sansa_stack.ml.spark.kge.linkprediction.models.TransE - -import net.sansa_stack.rdf.spark.kge.triples.{StringTriples,IntegerTriples} - class PredictTransE(model: TransE, test: Dataset[IntegerTriples]) extends Evaluate(test: Dataset[IntegerTriples]) { - def rank(row: IntegerTriples, spo: String) = { + def rank(row: IntegerTriples, spo: String): Integer = { var x = Seq[Float]() val y = model.myL(model.dist(row)) val cor = spo match { case "l" => left _ - case _ => right _ + case _ => right _ } x = y +: x @@ -33,4 +31,4 @@ class PredictTransE(model: TransE, test: Dataset[IntegerTriples]) extends Evalua x.sorted.indexOf(y) } -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/run/TransERun.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/run/TransERun.scala index 2b238a4..45e34b3 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/run/TransERun.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/run/TransERun.scala @@ -4,18 +4,14 @@ package net.sansa_stack.ml.spark.kge.linkprediction.run * Created by lpfgarcia on 14/11/2017. 
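Once rank assigns every test triple its position among the corrupted candidates (the sorted indexOf in PredictTransE above), the usual link-prediction summaries are the mean rank and Hits@10. A small sketch over a collected sequence of ranks, assuming 0-based positions as produced by indexOf:

// Mean rank over the test set; lower is better.
def meanRank(ranks: Seq[Int]): Double =
  if (ranks.isEmpty) 0.0 else ranks.map(_.toDouble).sum / ranks.size

// Fraction of test triples ranked among the top ten candidates.
def hitsAt10(ranks: Seq[Int]): Double =
  if (ranks.isEmpty) 0.0 else ranks.count(_ < 10).toDouble / ranks.size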
*/ +import org.apache.log4j.{ Level, Logger } import org.apache.spark.sql._ -import org.apache.log4j.Logger -import org.apache.log4j.Level - -import net.sansa_stack.rdf.spark.kge.convertor.ByIndex -import net.sansa_stack.rdf.spark.kge.triples._ - -import net.sansa_stack.ml.spark.kge.linkprediction.crossvalidation.Holdout -import net.sansa_stack.ml.spark.kge.linkprediction.crossvalidation.{kFold,Bootstrapping,Holdout} +import net.sansa_stack.ml.spark.kge.linkprediction.crossvalidation.{ kFold, Bootstrapping, Holdout } import net.sansa_stack.ml.spark.kge.linkprediction.models.TransE import net.sansa_stack.ml.spark.kge.linkprediction.prediction.PredictTransE +import net.sansa_stack.rdf.spark.kge.convertor.ByIndex +import net.sansa_stack.rdf.spark.kge.triples._ object TransERun { @@ -25,7 +21,7 @@ object TransERun { val spark = SparkSession.builder.master("local") .appName("kge").getOrCreate - def main(args: Array[String]) = { + def main(args: Array[String]): Unit = { val table = new Triples("/home/lpfgarcia/Desktop/SANSA-ML/data/train.txt", "\t", false, false, spark) @@ -37,18 +33,15 @@ object TransERun { val (train, test) = new Holdout(data.triples, 0.6f).crossValidation() - println("Trinamento:") println(train.show()) println("Teste:") println(test.show()) - //var model = new TransE(train, data.e.length, data.r.length, 100, 20, 1, "L1", spark) - //model.run() - - //val predict = new PredictTransE(model, test).ranking() - //println(predict) + // var model = new TransE(train, data.e.length, data.r.length, 100, 20, 1, "L1", spark) + // model.run() + // val predict = new PredictTransE(model, test).ranking() + // println(predict) } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/run/TriplesRun.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/run/TriplesRun.scala index 763c443..73cc080 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/run/TriplesRun.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/run/TriplesRun.scala @@ -1,16 +1,13 @@ package net.sansa_stack.ml.spark.kge.linkprediction.run -import scala.util.Random - -import net.sansa_stack.rdf.spark.kge.triples._ import net.sansa_stack.rdf.spark.kge.convertor.ByIndex -import org.apache.spark.sql._ +import net.sansa_stack.rdf.spark.kge.triples._ import org.apache.log4j.{ Level, Logger } -import org.springframework.util.StopWatch +import org.apache.spark.sql._ -object runTesting extends App { +object TriplesRun extends App { - def printType[T](x: T): Unit = { println(x.getClass.toString()) } + def printType[T](x: T): Unit = { println(x.getClass.toString) } Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) @@ -26,27 +23,26 @@ object runTesting extends App { println("<<< STARTING >>>") - var watch: StopWatch = new StopWatch() + var startTime = System.currentTimeMillis() - watch.start() + startTime = System.currentTimeMillis() val trp = new Triples("/home/hamed/workspace/TransE/DataSets/FB15k/freebase_mtr100_mte100-train.txt", "\t", false, false, spark) - watch.stop() - println("Readin triples done in " + watch.getTotalTimeSeconds + " seconds") + println("Reading triples done in " + (System.currentTimeMillis() - startTime) + " seconds") - watch.start() + startTime = System.currentTimeMillis() var num: Long = trp.triples.count() - watch.stop() - println("\n\n No triples = " + num.toString() + " - Done in " + watch.getTotalTimeSeconds 
+ " seconds.") - watch.start() + println("\n\n No triples = " + num.toString + " - Done in " + (System.currentTimeMillis() - startTime) + " seconds.") + + startTime = System.currentTimeMillis() num = trp.getEntities().length - watch.stop() - println("\n\n No Entities = " + num.toString() + " - Done in " + watch.getTotalTimeSeconds + " seconds.") - watch.start() + println("\n\n No Entities = " + num.toString + " - Done in " + (System.currentTimeMillis() - startTime) + " seconds.") + + startTime = System.currentTimeMillis() num = trp.getRelations().length - watch.stop() - println("\n\n No Predicates = " + num.toString() + " - Done in " + watch.getTotalTimeSeconds + " seconds.") + + println("\n\n No Predicates = " + num.toString + " - Done in " + (System.currentTimeMillis() - startTime) + " seconds.") // trp.getAllDistinctEntities().take(10).foreach(println) // println("\n \n No entities = ",trp.getAllDistinctEntities().count() ) // println("\n \n No predicates = ",trp.getAllDistinctPredicates().count() ) @@ -79,10 +75,10 @@ object runTesting extends App { sample1.show() - //val r3 = conv.getTriplesByIndex(sample1) - //r3.printSchema() - //r3.show + // val r3 = conv.getTriplesByIndex(sample1) + // r3.printSchema() + // r3.show - //val r4 = conv.getTriplesByString(r3) - //println("<<< DONE >>>") + // val r4 = conv.getTriplesByString(r3) + // println("<<< DONE >>>") } diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/AbstractRDFGraph.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/AbstractRDFGraph.scala index f5a54b5..a608b50 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/AbstractRDFGraph.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/AbstractRDFGraph.scala @@ -3,6 +3,7 @@ package net.sansa_stack.ml.spark.mining.amieSpark import org.apache.jena.graph.Triple import org.apache.spark.rdd.RDD import org.apache.spark.sql.{ DataFrame, SparkSession } + import net.sansa_stack.ml.spark.mining.amieSpark._ /** diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/DfLoader.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/DfLoader.scala index 1e1674f..888b54c 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/DfLoader.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/DfLoader.scala @@ -1,7 +1,5 @@ package net.sansa_stack.ml.spark.mining.amieSpark -import net.sansa_stack.ml.spark.mining.amieSpark._ - import org.apache.spark.SparkContext import org.apache.spark.sql._ import org.apache.spark.sql.types._ @@ -18,9 +16,9 @@ object DfLoader { val startTime = System.currentTimeMillis() import sqlContext.implicits._ - /* var y = StructType(StructField("sub", StringType,false):: + /* var y = StructType(StructField("sub", StringType,false):: StructField("rel", StringType, false):: - StructField("ob", StringType, false):: Nil)*/ + StructField("ob", StringType, false):: Nil) */ val triples = sc.textFile(path, minPartitions) diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/EmptyRDFGraphDataFrame.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/EmptyRDFGraphDataFrame.scala index 44421de..dd51b98 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/EmptyRDFGraphDataFrame.scala +++ 
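The StopWatch replacement above times each phase with System.currentTimeMillis; since the difference is in milliseconds, it has to be divided by 1000 if it is reported as seconds. A small reusable helper in the same spirit (the name timed is illustrative, not part of the module):

// Run a block, print the elapsed wall-clock time in seconds, and return the block's result.
def timed[T](label: String)(block: => T): T = {
  val start = System.currentTimeMillis()
  val result = block
  println(s"$label done in ${(System.currentTimeMillis() - start) / 1000.0} seconds")
  result
}

// e.g. val numTriples = timed("Counting triples") { trp.triples.count() }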
b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/EmptyRDFGraphDataFrame.scala @@ -1,7 +1,7 @@ package net.sansa_stack.ml.spark.mining.amieSpark -import org.apache.spark.sql.types.{ StringType, StructField, StructType } import org.apache.spark.sql.{ DataFrame, Row, SQLContext } +import org.apache.spark.sql.types.{ StringType, StructField, StructType } /** * @author Lorenz Buehmann @@ -26,4 +26,4 @@ object EmptyRDFGraphDataFrame { triplesDataFrame } -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/KBObject.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/KBObject.scala index b7d8088..442a589 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/KBObject.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/KBObject.scala @@ -1,25 +1,23 @@ package net.sansa_stack.ml.spark.mining.amieSpark -import org.apache.spark.SparkContext -import org.apache.spark.sql.{ DataFrame, SQLContext } +import java.io.File import scala.collection.mutable.{ ArrayBuffer, Map } -//import net.sansa_stack.ml.spark.dissect.inference.utils._ - -import java.io.File - -import net.sansa_stack.ml.spark.mining.amieSpark.Rules.RuleContainer +import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{ DataFrame, SQLContext } import org.apache.spark.sql.functions.udf +import net.sansa_stack.ml.spark.mining.amieSpark.Rules.RuleContainer + object KBObject { case class Atom(rdf: RDFTriple) class KB() extends Serializable { var kbSrc: String = "" - var kbGraph: RDFGraph = null - var dfTable: DataFrame = null + var kbGraph: RDFGraph = _ + var dfTable: DataFrame = _ var dfMap: Map[String, DataFrame] = Map() @@ -62,7 +60,7 @@ object KBObject { } str = str.replace(" ", "_").replace("?", "_") - return str + str } def calcName(whole: ArrayBuffer[RDFTriple]): String = { @@ -83,11 +81,11 @@ object KBObject { } else { countMap += (w._3 -> 1) } - if (!(numberMap.contains(w._1))) { + if (!numberMap.contains(w._1)) { numberMap += (w._1 -> counter) counter += 1 } - if (!(numberMap.contains(w._3))) { + if (!numberMap.contains(w._3)) { numberMap += (w._3 -> counter) counter += 1 } @@ -113,28 +111,28 @@ object KBObject { out += a + "_" + wh._2 + "_" + b + "_" } out = out.stripSuffix("_") - return out + out } def getRngSize(rel: String): Double = { - return this.predicate2object2subject.get(rel).get.size + this.predicate2object2subject.get(rel).get.size } def setKbSrc(x: String) { this.kbSrc = x } - def getKbSrc(): String = { + def getKbSrc: String = { - return this.kbSrc + this.kbSrc } - def getKbGraph(): RDFGraph = { - return this.kbGraph + def getKbGraph: RDFGraph = { + this.kbGraph } - //TODO: think about Graph representation + // TODO: think about Graph representation def setKbGraph(x: RDFGraph) { this.kbGraph = x val graph = x.triples.collect @@ -163,7 +161,7 @@ object KBObject { } - return out + out } /** @@ -178,7 +176,7 @@ object KBObject { val subject = tp.subject val relation = tp.predicate val o = tp.`object` - //filling the to to to maps + // filling the to to to maps if (!(add(subject, relation, o, this.subject2predicate2object))) { add(relation, o, subject, this.predicate2object2subject) add(o, subject, relation, this.object2subject2predicate) @@ -187,7 +185,7 @@ object KBObject { add(subject, o, relation, this.subject2object2predicate) } - //filling the sizes + // filling the sizes if 
(this.subjectSize.get(subject).isEmpty) { this.subjectSize += (subject -> 1) } else { @@ -212,7 +210,7 @@ object KBObject { this.objectSize += (o -> obSize) } - //filling the overlaps + // filling the overlaps if (this.subject2subjectOverlap.get(relation).isEmpty) { subject2subjectOverlap += (relation -> Map()) @@ -242,18 +240,18 @@ object KBObject { return 0 } - return this.relationSize.get(rel).get + this.relationSize.get(rel).get } - /*TO DO + /* TODO * Functionality * bulidOverlapTable * */ def relationsSize(): Int = { - return this.relationSize.size + this.relationSize.size } /** @@ -264,7 +262,7 @@ object KBObject { var x = this.subjectSize.size var y = this.objectSize.size - return (x + y) + (x + y) } /** @@ -283,24 +281,24 @@ object KBObject { val objects2 = predicate2object2subject.get(r2).get.keys.toSet if (!r1.equals(r2)) { - var ssoverlap: Int = computeOverlap(subjects1, subjects2); - subject2subjectOverlap.get(r1).get.put(r2, ssoverlap); - subject2subjectOverlap.get(r2).get.put(r1, ssoverlap); + var ssoverlap: Int = computeOverlap(subjects1, subjects2) + subject2subjectOverlap.get(r1).get.put(r2, ssoverlap) + subject2subjectOverlap.get(r2).get.put(r1, ssoverlap) } else { - subject2subjectOverlap.get(r1).get.put(r1, subjects2.size); + subject2subjectOverlap.get(r1).get.put(r1, subjects2.size) } - var soverlap1: Int = computeOverlap(subjects1, objects2); - subject2objectOverlap.get(r1).get.put(r2, soverlap1); - var soverlap2: Int = computeOverlap(subjects2, objects1); - subject2objectOverlap.get(r2).get.put(r1, soverlap2); + var soverlap1: Int = computeOverlap(subjects1, objects2) + subject2objectOverlap.get(r1).get.put(r2, soverlap1) + var soverlap2: Int = computeOverlap(subjects2, objects1) + subject2objectOverlap.get(r2).get.put(r1, soverlap2) if (!r1.equals(r2)) { - var oooverlap: Int = computeOverlap(objects1, objects2); - object2objectOverlap.get(r1).get.put(r2, oooverlap); - object2objectOverlap.get(r2).get.put(r1, oooverlap); + var oooverlap: Int = computeOverlap(objects1, objects2) + object2objectOverlap.get(r1).get.put(r2, oooverlap) + object2objectOverlap.get(r2).get.put(r1, oooverlap) } else { - object2objectOverlap.get(r1).get.put(r1, objects2.size); + object2objectOverlap.get(r1).get.put(r1, objects2.size) } } } @@ -316,11 +314,12 @@ object KBObject { def computeOverlap(s1: Set[String], s2: Set[String]): Int = { var overlap: Int = 0 for (r <- s1) { - if (s2.contains(r)) + if (s2.contains(r)) { overlap += 1 + } } - return overlap + overlap } // --------------------------------------------------------------------------- @@ -334,13 +333,13 @@ object KBObject { * */ def functionality(relation: String): Double = { - /*if (relation.equals(EQUALSbs)) { - return 1.0;*/ + /* if (relation.equals(EQUALSbs)) { + return 1.0; */ if (this.predicate2subject2object.get(relation).isEmpty) { return 0.0 } var a: Double = this.predicate2subject2object.get(relation).get.size var b: Double = this.relationSize.get(relation).get - return (a / b) + (a / b) } @@ -351,12 +350,12 @@ object KBObject { * */ def inverseFunctionality(relation: String): Double = { - /*if (relation.equals(EQUALSbs)) { - return 1.0; - } */ + /* if (relation.equals(EQUALSbs)) { + return 1.0 + } */ var a: Double = this.predicate2object2subject.get(relation).get.size var b: Double = this.relationSize.get(relation).get - return (a / b) + (a / b) } @@ -368,7 +367,7 @@ object KBObject { * @author AMIE+ Team */ def isFunctional(relation: String): Boolean = { - return functionality(relation) >= 
inverseFunctionality(relation); + functionality(relation) >= inverseFunctionality(relation) } /** @@ -381,10 +380,11 @@ object KBObject { * */ def functionality(relation: String, inversed: Boolean): Double = { - if (inversed) - return inverseFunctionality(relation); - else - return functionality(relation); + if (inversed) { + inverseFunctionality(relation) + } else { + functionality(relation) + } } /** @@ -396,10 +396,11 @@ object KBObject { * */ def inverseFunctionality(relation: String, inversed: Boolean): Double = { - if (inversed) - return functionality(relation); - else - return inverseFunctionality(relation); + if (inversed) { + functionality(relation) + } else { + inverseFunctionality(relation) + } } /** @@ -408,28 +409,29 @@ object KBObject { * length of maplist is the number of instantiations of a rule * * @param triplesCard rule as an ArrayBuffer of RDFTriples, triplesCard(0) - * is the head of the rule + * is the head of the rule * @param sc spark context * */ - //---------------------------------------------------------------- + // ---------------------------------------------------------------- // Statistics - //---------------------------------------------------------------- + // ---------------------------------------------------------------- def overlap(relation1: String, relation2: String, overlap: Int): Double = { overlap match { - case SUBJECT2SUBJECT => if ((!(subject2subjectOverlap.get(relation1).isEmpty)) && (!(subject2subjectOverlap.get(relation1).get.get(relation2).isEmpty))) { return subject2subjectOverlap.get(relation1).get.get(relation2).get } - else return 0.0 + case SUBJECT2SUBJECT => + if (subject2subjectOverlap.get(relation1).isDefined && (!(subject2subjectOverlap.get(relation1).get.get(relation2).isEmpty))) { + subject2subjectOverlap.get(relation1).get.get(relation2).get + } else 0.0 case SUBJECT2OBJECT => - - if ((!(subject2objectOverlap.get(relation1).isEmpty)) && (!(subject2objectOverlap.get(relation1).get.get(relation2).isEmpty))) { return subject2objectOverlap.get(relation1).get.get(relation2).get } - else return 0.0 + if ((!(subject2objectOverlap.get(relation1).isEmpty)) && (!(subject2objectOverlap.get(relation1).get.get(relation2).isEmpty))) { + subject2objectOverlap.get(relation1).get.get(relation2).get + } else 0.0 case OBJECT2OBJECT => - - if ((!(object2objectOverlap.get(relation1).isEmpty)) && (!(object2objectOverlap.get(relation1).get.get(relation2).isEmpty))) { return object2objectOverlap.get(relation1).get.get(relation2).get } - else return 0.0 - + if ((!(object2objectOverlap.get(relation1).isEmpty)) && (!(object2objectOverlap.get(relation1).get.get(relation2).isEmpty))) { + object2objectOverlap.get(relation1).get.get(relation2).get + } else 0.0 } } @@ -445,16 +447,16 @@ object KBObject { def relationColumnSize(rel: String, elem: String): Int = { elem match { case "subject" => - return predicate2subject2object.get(rel).get.size + predicate2subject2object.get(rel).get.size case "object" => - return predicate2object2subject.get(rel).get.size + predicate2object2subject.get(rel).get.size } } - //TODO: better than cardinality + // TODO: better than cardinality def bindingExists(triplesCard: ArrayBuffer[RDFTriple]): Boolean = { val k = this.kbGraph @@ -478,7 +480,7 @@ object KBObject { var minSize = this.relationSize.get(triplesCard(0).predicate).get var index = 0 - for (i <- 1 to triplesCard.length - 1) { + for (i <- 1 until triplesCard.length) { if (this.relationSize.get(triplesCard(i).predicate).get < minSize) { minSize = 
this.relationSize.get(triplesCard(i).predicate).get min = triplesCard(i) @@ -500,7 +502,7 @@ object KBObject { x = k.find(None, Some(min.predicate), None).collect } - //x.foreach(println) + // x.foreach(println) triplesCard.remove(index) for (i <- x) { @@ -530,16 +532,16 @@ object KBObject { if (test) { if ((a.startsWith("?")) && ((j._1 == a) && (!(atestLeft)))) { - temp += new RDFTriple(i._1, j._2, j._3) + temp += RDFTriple(i._1, j._2, j._3) } else if ((a.startsWith("?")) && ((j._3 == a) && (!(atestRight)))) { - temp += new RDFTriple(j._1, j._2, i._1) + temp += RDFTriple(j._1, j._2, i._1) } else if ((b.startsWith("?")) && ((j._3 == b) && (!(btestRight)))) { - temp += new RDFTriple(j._1, j._2, i._3) + temp += RDFTriple(j._1, j._2, i._3) } else if ((b.startsWith("?")) && ((j._1 == b) && (!(btestLeft)))) { - temp += new RDFTriple(i._3, j._2, j._3) + temp += RDFTriple(i._3, j._2, j._3) } else if ((b.startsWith("?")) && (((j._3 == b) && (btestRight)) || ((j._1 == b) && (btestLeft)))) { exploreFurther = false } else if ((a.startsWith("?")) && (((j._1 == a) && (atestLeft)) || ((j._3 == a) && (atestRight)))) { @@ -564,12 +566,12 @@ object KBObject { } - return false + false } - def varCount(tpAr: ArrayBuffer[RDFTriple]): ArrayBuffer[Tuple2[String, String]] = { + def varCount(tpAr: ArrayBuffer[RDFTriple]): ArrayBuffer[(String, String)] = { - var out2: ArrayBuffer[Tuple2[String, String]] = new ArrayBuffer + var out2: ArrayBuffer[(String, String)] = new ArrayBuffer for (i <- tpAr) { if (!(out2.contains(Tuple2(i.subject, i.predicate)))) { @@ -582,9 +584,9 @@ object KBObject { } - return out2 + out2 } - def countProjectionQueriesDF(posit: Int, id: Int, operator: String, minHC: Double, tpAr: ArrayBuffer[RDFTriple], RXY: ArrayBuffer[Tuple2[String, String]], sc: SparkContext, sqlContext: SQLContext): DataFrame = + def countProjectionQueriesDF(posit: Int, id: Int, operator: String, minHC: Double, tpAr: ArrayBuffer[RDFTriple], RXY: ArrayBuffer[(String, String)], sc: SparkContext, sqlContext: SQLContext): DataFrame = { val threshold = minHC * this.relationSize.get(tpAr(0).predicate).get @@ -643,13 +645,13 @@ object KBObject { } - return whole + whole } def cardinalityQueries(id: Int, tpArDF: DataFrame, wholeAr: ArrayBuffer[RDFTriple], sc: SparkContext, sqlContext: SQLContext): DataFrame = { val DF = this.dfTable - var tpMap: Map[String, ArrayBuffer[Tuple2[Int, String]]] = Map() + var tpMap: Map[String, ArrayBuffer[(Int, String)]] = Map() DF.registerTempTable("table") tpArDF.registerTempTable("tpArTable") @@ -659,10 +661,10 @@ object KBObject { var v = sqlContext.sql("SELECT * FROM tpArTable JOIN newColumn") var varAr: ArrayBuffer[String] = new ArrayBuffer - var checkMap: Map[Int, Tuple2[String, String]] = Map() + var checkMap: Map[Int, (String, String)] = Map() var checkSQLSELECT = "SELECT " - for (i <- 0 to wholeAr.length - 1) { + for (i <- wholeAr.indices) { var a = wholeAr(i).subject var b = wholeAr(i)._3 @@ -693,7 +695,7 @@ object KBObject { var cloneTpAr = wholeAr.clone() - var removedMap: Map[String, ArrayBuffer[Tuple2[Int, String]]] = Map() + var removedMap: Map[String, ArrayBuffer[(Int, String)]] = Map() varAr = varAr.distinct var checkSQLWHERE = "WHERE " @@ -732,7 +734,7 @@ object KBObject { } checkSQLWHERE = checkSQLWHERE.stripSuffix(" AND ") - var seq: Seq[String] = Seq((wholeAr.last.toString() + " " + id.toString())) + var seq: Seq[String] = Seq((wholeAr.last.toString() + " " + id.toString)) import sqlContext.implicits._ var key: DataFrame = seq.toDF("key") @@ -745,7 +747,7 @@ object KBObject { 
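For reference, the functionality and inverseFunctionality methods reworked earlier in this file compute the AMIE measures: the fraction of a relation's facts that have a distinct subject, respectively a distinct object. A stand-alone sketch over an in-memory list of (subject, predicate, object) facts:

// functionality(r)        = #distinct subjects of r / #facts of r
// inverseFunctionality(r) = #distinct objects of r  / #facts of r
def functionality(facts: Seq[(String, String, String)], relation: String): Double = {
  val ofRel = facts.filter(_._2 == relation)
  if (ofRel.isEmpty) 0.0 else ofRel.map(_._1).distinct.size.toDouble / ofRel.size
}

def inverseFunctionality(facts: Seq[(String, String, String)], relation: String): Double = {
  val ofRel = facts.filter(_._2 == relation)
  if (ofRel.isEmpty) 0.0 else ofRel.map(_._3).distinct.size.toDouble / ofRel.size
}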
key.registerTempTable("keyTable") var out = sqlContext.sql(checkSQLSELECT + ", keyTable.key FROM lastTable JOIN keyTable") - return out + out } @@ -754,18 +756,19 @@ object KBObject { */ def cardinality(tpAr: ArrayBuffer[RDFTriple], sc: SparkContext, sqlContext: SQLContext): DataFrame = { + println(s"computing cardinality for ${tpAr.mkString(",")} ...") var name = calcName(tpAr) if (dfMap.contains(name)) { - return dfMap.get(name).get + dfMap.get(name).get } else { val DF = this.dfTable - var tpMap: Map[String, ArrayBuffer[Tuple2[Int, String]]] = Map() + var tpMap: Map[String, ArrayBuffer[(Int, String)]] = Map() DF.registerTempTable("table") var v = sqlContext.sql("SELECT rdf AS tp0 FROM table WHERE rdf.predicate = '" + tpAr(0).predicate + "'") - for (k <- 1 to tpAr.length - 1) { + for (k <- 1 until tpAr.length) { var w = sqlContext.sql("SELECT rdf AS tp" + k + " FROM table WHERE rdf.predicate = '" + tpAr(k).predicate + "'") w.registerTempTable("newColumn") @@ -773,7 +776,7 @@ object KBObject { tempO.registerTempTable("previous") var sqlString = "" - for (re <- 0 to k - 1) { + for (re <- 0 until k) { sqlString += "previous.tp" + re + ", " } @@ -782,10 +785,10 @@ object KBObject { } var varAr: ArrayBuffer[String] = new ArrayBuffer - var checkMap: Map[Int, Tuple2[String, String]] = Map() + var checkMap: Map[Int, (String, String)] = Map() var checkSQLSELECT = "SELECT " - for (i <- 0 to tpAr.length - 1) { + for (i <- tpAr.indices) { var a = tpAr(i).subject var b = tpAr(i)._3 @@ -816,7 +819,7 @@ object KBObject { var cloneTpAr = tpAr.clone() - var removedMap: Map[String, ArrayBuffer[Tuple2[Int, String]]] = Map() + var removedMap: Map[String, ArrayBuffer[(Int, String)]] = Map() varAr = varAr.distinct var checkSQLWHERE = "WHERE " @@ -856,8 +859,9 @@ object KBObject { checkSQLWHERE = checkSQLWHERE.stripSuffix(" AND ") v.registerTempTable("t") + println(checkSQLSELECT + " FROM t " + checkSQLWHERE) var out = sqlContext.sql(checkSQLSELECT + " FROM t " + checkSQLWHERE) - return out + out } } @@ -869,7 +873,7 @@ object KBObject { var go = false var outCount: Double = 0.0 var tpsString = calcName(tpAr) - for (i <- 1 to tpAr.length - 1) { + for (i <- 1 until tpAr.length) { if ((tpAr(i)._1 == "?a") || (tpAr(i)._3 == "?a")) { go = true } @@ -879,7 +883,7 @@ object KBObject { return outCount } - if (go) { + if ( go ) { var card = dfMap.get(tpsString).get @@ -905,33 +909,23 @@ object KBObject { h.registerTempTable("subjects") out = sqlContext.sql("SELECT twoLengthT.tp0 FROM twoLengthT JOIN subjects ON twoLengthT.tp0." 
+ abString + "=subjects.sub") - /* - if ((tpAr(0).predicate == "directed")&&(tpAr(1).predicate== "produced")&&(tpAr(1).subject== "?a")&&(tpAr(1)._3== "?b")){ - h.show(800, false) - - var fjgf = sqlContext.sql("SELECT ") - } - - - */ - } outCount = out.count() } - return outCount + outCount } def negatveExampleBuilder(subjects: DataFrame, wholeAr: ArrayBuffer[RDFTriple], sc: SparkContext, sqlContext: SQLContext): DataFrame = { val DF = this.dfTable - var tpMap: Map[String, ArrayBuffer[Tuple2[Int, String]]] = Map() + var tpMap: Map[String, ArrayBuffer[(Int, String)]] = Map() DF.registerTempTable("table") var wholeTPARBackup = wholeAr.clone() wholeAr.remove(0) var complete = sqlContext.sql("SELECT rdf AS tp" + 0 + " FROM table WHERE rdf.predicate = '" + (wholeAr(0)).predicate + "'") - for (i <- 1 to wholeAr.length - 1) { + for (i <- 1 until wholeAr.length) { var w = sqlContext.sql("SELECT rdf AS tp" + i + " FROM table WHERE rdf.predicate = '" + (wholeAr(i)).predicate + "'") w.registerTempTable("newColumn") @@ -940,12 +934,12 @@ object KBObject { } var varAr: ArrayBuffer[String] = new ArrayBuffer - var checkMap: Map[Int, Tuple2[String, String]] = Map() + var checkMap: Map[Int, (String, String)] = Map() var checkSQLSELECT = "SELECT " var abString = ("", "") - for (i <- 0 to wholeAr.length - 1) { + for (i <- wholeAr.indices) { var a = wholeAr(i).subject var b = wholeAr(i)._3 @@ -984,7 +978,7 @@ object KBObject { var cloneTpAr = wholeAr.clone() - var removedMap: Map[String, ArrayBuffer[Tuple2[Int, String]]] = Map() + var removedMap: Map[String, ArrayBuffer[(Int, String)]] = Map() varAr = varAr.distinct var checkSQLWHERE = "WHERE " @@ -1032,11 +1026,11 @@ object KBObject { var out = sqlContext.sql(checkSQLSELECT + " FROM lastTable JOIN keyTable ON lastTable." + abString._2 + "." 
+ abString._1 + "=keyTable.sub") - return out + out } - //TODO: solve with DataFrames + // TODO: solve with DataFrames def cardPlusnegativeExamplesLength(triplesCard: ArrayBuffer[RDFTriple], sc: SparkContext): Double = { val k = this.kbGraph @@ -1058,7 +1052,7 @@ object KBObject { } - /**initializing maplist with head of the rule*/ + /** initializing maplist with head of the rule */ for (ii <- arbuf(0).collect()) { mapList += Map(triplesCard(0).subject -> ii._1, triplesCard(0).`object` -> ii._3) @@ -1066,9 +1060,9 @@ object KBObject { var temp = mapList.clone() - for (tripleCount <- 1 to triplesCard.length - 1) { + for (tripleCount <- 1 until triplesCard.length) { - val rdd1 = sc.parallelize(mapList.toSeq) + val rdd1 = sc.parallelize(mapList) val rdd2 = arbuf(tripleCount) val comb = rdd1.cartesian(rdd2) // cartesian() to get every possible combination @@ -1084,17 +1078,17 @@ object KBObject { for (i <- combinations) { var ltrip = i._2 - var elem1 = ltrip._1 //subject from combination + var elem1 = ltrip._1 // subject from combination var elem2 = ltrip._3 var trip1 = triplesCard(tripleCount)._1 // subject from Rule var trip2 = triplesCard(tripleCount)._3 - /**checking map for placeholder for the subject*/ + /** checking map for placeholder for the subject */ if (!(i._1.contains(trip1))) { i._1 += (trip1 -> elem1) } - /**checking map for placeholder for the object*/ + /** checking map for placeholder for the object */ if (!(i._1.contains(trip2))) { i._1 += (trip2 -> elem2) } @@ -1106,9 +1100,9 @@ object KBObject { } } - var rightOnes = sc.parallelize(mapList.toSeq).map(y => y.get(triplesCard(0).subject).get).distinct.collect + var rightOnes = sc.parallelize(mapList).map(y => y.get(triplesCard(0).subject).get).distinct.collect - var as = sc.parallelize(temp.toSeq).map { + var as = sc.parallelize(temp).map { x => (x.get(triplesCard(0).subject).get, 1) @@ -1116,19 +1110,17 @@ object KBObject { var out: Double = 0.0 for (i <- as) { - if (rightOnes.contains(i._1)) + if (rightOnes.contains(i._1)) { out += (i._2 - 1) - + } } - - return ((mapList.length) + out) - + ((mapList.length) + out) } def addDanglingAtom(c: Int, id: Int, minHC: Double, rule: RuleContainer, sc: SparkContext, sqlContext: SQLContext): DataFrame = { val tpAr = rule.getRule() - var RXY: ArrayBuffer[Tuple2[String, String]] = new ArrayBuffer + var RXY: ArrayBuffer[(String, String)] = new ArrayBuffer val notC = rule.notClosed() @@ -1148,13 +1140,13 @@ object KBObject { var x = this.countProjectionQueriesDF(c, id, "OD", minHC, tpAr, RXY, sc, sqlContext) - return x + x } def addClosingAtom(c: Int, id: Int, minHC: Double, rule: RuleContainer, sc: SparkContext, sqlContext: SQLContext): DataFrame = { val tpAr = rule.getRule() - var RXY: ArrayBuffer[Tuple2[String, String]] = new ArrayBuffer + var RXY: ArrayBuffer[(String, String)] = new ArrayBuffer val notC = rule.notClosed() @@ -1190,9 +1182,8 @@ object KBObject { } var x = this.countProjectionQueriesDF(c, id, "OC", minHC, tpAr, RXY, sc, sqlContext) - return x + x } } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/MineRules.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/MineRules.scala index 5f15515..193a760 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/MineRules.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/MineRules.scala @@ -3,23 +3,23 @@ package net.sansa_stack.ml.spark.mining.amieSpark import 
java.io.File import java.net.URI -import net.sansa_stack.ml.spark.mining.amieSpark.KBObject.KB -import net.sansa_stack.ml.spark.mining.amieSpark.Rules.RuleContainer -import org.apache.spark.SparkContext -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{ DataFrame, SQLContext, SparkSession, _ } - import scala.collection.mutable.{ ArrayBuffer, Map } import scala.util.Try import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.fs.Path +import org.apache.spark.SparkContext +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{ DataFrame, SparkSession, SQLContext, _ } import net.sansa_stack.ml.spark.mining.amieSpark.DfLoader.Atom +import net.sansa_stack.ml.spark.mining.amieSpark.KBObject.KB +import net.sansa_stack.ml.spark.mining.amieSpark.Rules.RuleContainer + object MineRules { /** - * Algorithm that mines the Rules. + * Algorithm that mines the Rules. * * @param kb object knowledge base that was created in main * @param minHC threshold on head coverage @@ -55,11 +55,11 @@ object MineRules { } else { countMap += (w._3 -> 1) } - if (!(numberMap.contains(w._1))) { + if (!numberMap.contains(w._1)) { numberMap += (w._1 -> counter) counter += 1 } - if (!(numberMap.contains(w._3))) { + if (!numberMap.contains(w._3)) { numberMap += (w._3 -> counter) counter += 1 } @@ -85,15 +85,16 @@ object MineRules { out += a + "_" + wh._2 + "_" + b + "_" } out = out.stripSuffix("_") - return out + out } def ruleMining(sc: SparkContext, sqlContext: SQLContext): ArrayBuffer[RuleContainer] = { - var predicates = kb.getKbGraph().triples.map { x => x.predicate + var predicates = kb.getKbGraph.triples.map { x => x.predicate }.distinct var z = predicates.collect() + println(s"#predicates:$z.length") /** * q is a queue with one atom rules @@ -119,7 +120,7 @@ object MineRules { var out: ArrayBuffer[RuleContainer] = new ArrayBuffer var dublicate: ArrayBuffer[String] = ArrayBuffer("") - for (i <- 0 to this.maxLen - 1) { + for (i <- 0 until this.maxLen) { if ((i > 0) && (dataFrameRuleParts != null)) { var temp = q.clone @@ -147,14 +148,14 @@ object MineRules { var dubCheck = fstTp - for (i <- 1 to newTpArr.length - 1) { + for (i <- 1 until newTpArr.length) { var temp = newTpArr(i).toString dubCheck += sortedNewTpArr(i).toString if (temp == fstTp) { counter += 1 } } - if ((counter < newTpArr.length) && (!(dublicate.contains(dubCheck)))) { + if ((counter < newTpArr.length) && (!dublicate.contains(dubCheck))) { dublicate += dubCheck newRuleC.setRule(minConf, n1._2, parent, newTpArr, sortedNewTpArr, kb, sc, sqlContext) q += newRuleC @@ -162,12 +163,12 @@ object MineRules { } - } else if ((i > 0) && ((dataFrameRuleParts == null) || (dataFrameRuleParts.isEmpty()))) { + } else if ((i > 0) && ((dataFrameRuleParts == null) || dataFrameRuleParts.isEmpty())) { q = new ArrayBuffer } - if ((!q.isEmpty)) { - for (j <- 0 to q.length - 1) { + if (q.nonEmpty) { + for (j <- q.indices) { val r: RuleContainer = q(j) @@ -180,11 +181,11 @@ object MineRules { if (acceptedForOutput(outMap, r, minConf, kb, sc, sqlContext)) { out += r - if (!(outMap.contains(tp(0).predicate))) { + if (!outMap.contains(tp(0).predicate)) { outMap += (tp(0).predicate -> ArrayBuffer((tp, r))) } else { var temp: ArrayBuffer[(ArrayBuffer[RDFTriple], RuleContainer)] = outMap.remove(tp(0).predicate).get - temp += new Tuple2(tp, r) + temp += Tuple2(tp, r) outMap += (tp(0).predicate -> temp) } @@ -195,7 +196,7 @@ object MineRules { if (r.getRule().length < maxLen) { dataFrameRuleParts = refine(i, j, r, dataFrameRuleParts, sc, sqlContext) - //TODO: 
Dublicate check + // TODO: Dublicate check } @@ -204,7 +205,7 @@ object MineRules { } - return out + out } /** @@ -219,14 +220,14 @@ object MineRules { var out: DataFrame = null var OUT: RDD[(RDFTriple, Int, Int)] = dataFrameRuleParts - //var count2:RDD[(String, Int)] = null + // var count2:RDD[(String, Int)] = null var path = new File("test_table/") var temp = 0 val tpAr = r.getRule() var stringSELECT = "" - for (tp <- 0 to tpAr.length - 1) { + for (tp <- tpAr.indices) { stringSELECT += "tp" + tp + ", " @@ -239,7 +240,7 @@ object MineRules { var a = kb.addDanglingAtom(c, id, minHC, r, sc, sqlContext) z = Try(a.first()) - if ((!(z.isFailure)) && (z.isSuccess)) { + if ((!z.isFailure) && z.isSuccess) { out = a @@ -251,7 +252,7 @@ object MineRules { var t = Try(b.first) - if ((!(t.isFailure)) && (t.isSuccess) && (temp == 0)) { + if ((!t.isFailure) && t.isSuccess && (temp == 0)) { if (out == null) { out = b @@ -265,12 +266,12 @@ object MineRules { var count: RDD[(String, Int)] = null var o: RDD[(RDFTriple, Int, Int)] = null - if (((!(t.isFailure)) && (t.isSuccess)) || ((z != null) && (!(z.isFailure)) && (z.isSuccess))) { - count = out.rdd.map(x => (x(r.getRule().length + 1).toString(), 1)).reduceByKey(_ + _) + if (((!t.isFailure) && t.isSuccess) || ((z != null) && (!z.isFailure) && z.isSuccess)) { + count = out.rdd.map(x => (x(r.getRule().length + 1).toString, 1)).reduceByKey(_ + _) o = count.map(q => (q._1.split("\\s+"), q._2)).map { token => Tuple3(RDFTriple(token._1(0), token._1(1), token._1(2)), token._2, token._1(3).toInt) - }.filter(n1 => (n1._2 >= (kb.getRngSize(n1._1.predicate) * minHC))) + }.filter(n1 => n1._2 >= (kb.getRngSize(n1._1.predicate) * minHC)) if (OUT == null) { OUT = o @@ -280,7 +281,7 @@ object MineRules { } - return OUT + OUT } @@ -294,14 +295,14 @@ object MineRules { */ def acceptedForOutput(outMap: Map[String, ArrayBuffer[(ArrayBuffer[RDFTriple], RuleContainer)]], r: RuleContainer, minConf: Double, k: KB, sc: SparkContext, sqlContext: SQLContext): Boolean = { - //if ((!(r.closed())) || (r.getPcaConfidence(k, sc, sqlContext) < minConf)) { - if ((!(r.closed())) || (r.getPcaConfidence() < minConf)) { + // if ((!(r.closed())) || (r.getPcaConfidence(k, sc, sqlContext) < minConf)) { + if ((!r.closed()) || (r.getPcaConfidence() < minConf)) { return false } var parents: ArrayBuffer[RuleContainer] = r.parentsOfRule(outMap, sc) - if (r.getRule.length > 2) { + if (r.getRule().length > 2) { for (rp <- parents) { if (r.getPcaConfidence() <= rp.getPcaConfidence()) { return false @@ -310,14 +311,14 @@ object MineRules { } } - return true + true } def sort(tp: ArrayBuffer[RDFTriple]): ArrayBuffer[RDFTriple] = { var out = ArrayBuffer(tp(0)) - var temp = new ArrayBuffer[Tuple2[String, RDFTriple]] + var temp = new ArrayBuffer[(String, RDFTriple)] - for (i <- 1 to tp.length - 1) { + for (i <- 1 until tp.length) { var tempString: String = tp(i).predicate + tp(i).subject + tp(i).`object` temp += Tuple2(tempString, tp(i)) @@ -327,63 +328,8 @@ object MineRules { out += t._2 } - return out + out } } - - def main(args: Array[String]) = { - val know = new KB() - - val sparkSession = SparkSession.builder - - .master("local[*]") - .appName("AMIESpark example") - - .getOrCreate() - - if (args.length < 2) { - System.err.println( - "Usage: Triple reader ") - System.exit(1) - } - - val input = args(0) - val outputPath: String = args(1) - val hdfsPath: String = outputPath + "/" - - val sc = sparkSession.sparkContext - val sqlContext = new org.apache.spark.sql.SQLContext(sc) - - 
know.sethdfsPath(hdfsPath) - know.setKbSrc(input) - - know.setKbGraph(RDFGraphLoader.loadFromFile(know.getKbSrc(), sc, 2)) - know.setDFTable(DfLoader.loadFromFileDF(know.getKbSrc, sc, sqlContext, 2)) - - val algo = new Algorithm(know, 0.01, 3, 0.1, hdfsPath) - - var output = algo.ruleMining(sc, sqlContext) - - var outString = output.map { x => - var rdfTrp = x.getRule() - var temp = "" - for (i <- 0 to rdfTrp.length - 1) { - if (i == 0) { - temp = rdfTrp(i) + " <= " - } else { - temp += rdfTrp(i) + " \u2227 " - } - } - temp = temp.stripSuffix(" \u2227 ") - temp - }.toSeq - var rddOut = sc.parallelize(outString) - - rddOut.saveAsTextFile(outputPath + "/testOut") - - sc.stop - - } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFGraph.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFGraph.scala index beaf08c..c0f8936 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFGraph.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFGraph.scala @@ -1,8 +1,9 @@ package net.sansa_stack.ml.spark.mining.amieSpark import org.apache.spark.rdd.RDD -import org.apache.spark.sql.types.{ StringType, StructField, StructType } import org.apache.spark.sql.{ DataFrame, Row, SQLContext } +import org.apache.spark.sql.types.{ StringType, StructField, StructType } + import net.sansa_stack.ml.spark.mining.amieSpark._ /** @@ -40,7 +41,7 @@ case class RDFGraph(triples: RDD[RDFTriple]) { /** * Persist the triples RDD with the default storage level (`MEMORY_ONLY`). */ - def cache() = { + def cache(): RDFGraph = { triples.cache() this } @@ -49,7 +50,7 @@ case class RDFGraph(triples: RDD[RDFTriple]) { * Return the number of triples. * @return the number of triples */ - def size() = { + def size(): Long = { triples.count() } diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFGraphDataFrame.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFGraphDataFrame.scala index a773da2..aa9f6d0 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFGraphDataFrame.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFGraphDataFrame.scala @@ -1,8 +1,11 @@ package net.sansa_stack.ml.spark.mining.amieSpark import org.apache.jena.graph.Triple +import org.apache.spark.rdd.RDD import org.apache.spark.sql.{ DataFrame, SparkSession } + import net.sansa_stack.ml.spark.mining.amieSpark._ + /** * A data structure that comprises a set of triples. 
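With the main method dropped from MineRules above, the mining pipeline is driven by the caller. A hedged sketch of that flow, following the removed driver: the thresholds correspond to minHC (head coverage, support divided by the size of the head relation), maxLen and minConf (PCA confidence); the input and output paths are placeholders.

import org.apache.spark.sql.SparkSession
import net.sansa_stack.ml.spark.mining.amieSpark.{ DfLoader, RDFGraphLoader }
import net.sansa_stack.ml.spark.mining.amieSpark.KBObject.KB
import net.sansa_stack.ml.spark.mining.amieSpark.MineRules.Algorithm

val spark = SparkSession.builder.master("local[*]").appName("AMIE rule mining").getOrCreate()
val sc = spark.sparkContext
val sqlContext = new org.apache.spark.sql.SQLContext(sc)

val know = new KB()
know.sethdfsPath("output/")    // placeholder output directory
know.setKbSrc("data/input.nt") // placeholder input file
know.setKbGraph(RDFGraphLoader.loadFromFile(know.getKbSrc, sc, 2))
know.setDFTable(DfLoader.loadFromFileDF(know.getKbSrc, sc, sqlContext, 2))

// minHC = 0.01, maxLen = 3, minConf = 0.1, the values used by the removed driver
val rules = new Algorithm(know, 0.01, 3, 0.1, "output/").ruleMining(sc, sqlContext)
rules.map { r =>
  val tps = r.getRule()
  tps.head + " <= " + tps.tail.mkString(" \u2227 ")
}.foreach(println)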
* @@ -65,7 +68,7 @@ class RDFGraphDataFrame(triples: DataFrame) extends AbstractRDFGraph[DataFrame, this } - def distinct() = { + def distinct(): RDFGraphDataFrame = { new RDFGraphDataFrame(triples.distinct()) } @@ -74,11 +77,11 @@ class RDFGraphDataFrame(triples: DataFrame) extends AbstractRDFGraph[DataFrame, * * @return the number of triples */ - def size() = { + def size(): Long = { triples.count() } def toDataFrame(sparkSession: SparkSession): DataFrame = triples - def toRDD() = triples.rdd.map(row => RDFTriple(row.getString(0), row.getString(1), row.getString(2))) + def toRDD(): RDD[RDFTriple] = triples.rdd.map(row => RDFTriple(row.getString(0), row.getString(1), row.getString(2))) } diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFGraphLoader.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFGraphLoader.scala index 7dc355f..81278c1 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFGraphLoader.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFGraphLoader.scala @@ -4,6 +4,7 @@ import org.apache.spark.SparkContext import org.apache.spark.sql._ import org.apache.spark.sql.types._ import org.slf4j.LoggerFactory + import net.sansa_stack.ml.spark.mining.amieSpark._ /** @@ -14,7 +15,7 @@ import net.sansa_stack.ml.spark.mining.amieSpark._ */ object RDFGraphLoader { - //private val logger = com.typesafe.scalalogging.slf4j.Logger(LoggerFactory.getLogger(this.getClass.getName)) + // private val logger = com.typesafe.scalalogging.slf4j.Logger(LoggerFactory.getLogger(this.getClass.getName)) private val logger = LoggerFactory.getLogger(this.getClass.getName) def loadFromFile(path: String, sc: SparkContext, minPartitions: Int = 2): RDFGraph = { diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFGraphNative.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFGraphNative.scala index 7bcf965..f979825 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFGraphNative.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFGraphNative.scala @@ -2,45 +2,44 @@ package net.sansa_stack.ml.spark.mining.amieSpark import org.apache.jena.graph.Triple import org.apache.spark.rdd.RDD -import org.apache.spark.sql.types.{StringType, StructField, StructType} -import org.apache.spark.sql.{DataFrame, Row, SparkSession} +import org.apache.spark.sql.{ DataFrame, Row, SparkSession } +import org.apache.spark.sql.types.{ StringType, StructField, StructType } + import net.sansa_stack.ml.spark.mining.amieSpark._ /** - * A data structure that comprises a set of triples. - * - * @author Lorenz Buehmann - * - */ -class RDFGraphNative(val triples: RDD[RDFTriple]) extends AbstractRDFGraph[RDD[RDFTriple], RDFGraphNative](triples){ + * A data structure that comprises a set of triples. + * + * @author Lorenz Buehmann + * + */ +class RDFGraphNative(val triples: RDD[RDFTriple]) extends AbstractRDFGraph[RDD[RDFTriple], RDFGraphNative](triples) { /** - * Returns an RDD of triples that match with the given input. 
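The Option-based find above is a small triple-pattern matcher: None acts as a wildcard, Some(value) as an exact filter on that position. A usage sketch, assuming an RDD[RDFTriple] is already at hand (the predicate IRI is only an example):

import org.apache.spark.rdd.RDD
import net.sansa_stack.ml.spark.mining.amieSpark.{ RDFGraphNative, RDFTriple }

// All triples with the given predicate, regardless of subject and object.
def findByPredicate(triples: RDD[RDFTriple], predicate: String): RDD[RDFTriple] =
  new RDFGraphNative(triples).find(None, Some(predicate), None)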
- * - * @param s the subject - * @param p the predicate - * @param o the object - * @return RDD of triples - */ - def find (s: Option[String] = None, p: Option[String] = None, o: Option[String] = None): RDD[RDFTriple]= { - triples.filter(t => - (s == None || t.subject == s.get) && - (p == None || t.predicate == p.get) && - (o == None || t.`object` == o.get) - ) + * Returns an RDD of triples that match with the given input. + * + * @param s the subject + * @param p the predicate + * @param o the object + * @return RDD of triples + */ + def find(s: Option[String] = None, p: Option[String] = None, o: Option[String] = None): RDD[RDFTriple] = { + triples.filter(t => + (s == None || t.subject == s.get) && + (p == None || t.predicate == p.get) && + (o == None || t.`object` == o.get)) } /** - * Returns an RDD of triples that match with the given input. - * - * @return RDD of triples - */ + * Returns an RDD of triples that match with the given input. + * + * @return RDD of triples + */ def find(triple: Triple): RDD[RDFTriple] = { find( if (triple.getSubject.isVariable) None else Option(triple.getSubject.toString), if (triple.getPredicate.isVariable) None else Option(triple.getPredicate.toString), - if (triple.getObject.isVariable) None else Option(triple.getObject.toString) - ) + if (triple.getObject.isVariable) None else Option(triple.getObject.toString)) } def union(graph: RDFGraphNative): RDFGraphNative = { @@ -52,20 +51,20 @@ class RDFGraphNative(val triples: RDD[RDFTriple]) extends AbstractRDFGraph[RDD[R this } - def distinct() = { + def distinct(): RDFGraphNative = { new RDFGraphNative(triples.distinct()) } /** - * Return the number of triples. - * - * @return the number of triples - */ - def size() = { + * Return the number of triples. + * + * @return the number of triples + */ + def size(): Long = { triples.count() } - def toRDD() = triples + def toRDD(): RDD[RDFTriple] = triples def toDataFrame(sparkSession: SparkSession): DataFrame = { // convert RDD to DataFrame diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFTriple.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFTriple.scala index e70cc27..c5753a4 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFTriple.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFTriple.scala @@ -1,6 +1,5 @@ - - package net.sansa_stack.ml.spark.mining.amieSpark + /** * An RDF triple. 
* @@ -11,9 +10,9 @@ case class RDFTriple(subject: String, predicate: String, `object`: String) exten override def _2: String = predicate override def _3: String = `object` - def s = subject - def p = predicate - def o = `object` + def s: String = subject + def p: String = predicate + def o: String = `object` - override def toString = subject + " " + predicate + " " + `object` + override def toString: String = subject + " " + predicate + " " + `object` } diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/Rules.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/Rules.scala index 9ee008f..c2d0b28 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/Rules.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/Rules.scala @@ -1,19 +1,15 @@ package net.sansa_stack.ml.spark.mining.amieSpark -import net.sansa_stack.ml.spark.mining.amieSpark._ - -import org.apache.spark.SparkConf -import org.apache.spark.SparkContext +import scala.collection.mutable.{ ArrayBuffer, Map } -import org.apache.spark.sql.DataFrame +import KBObject.KB +import org.apache.spark.{ SparkConf, SparkContext } +import org.apache.spark.rdd.RDD import org.apache.spark.sql._ +import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types._ -import org.apache.spark.rdd.RDD - -import scala.collection.mutable.ArrayBuffer -import KBObject.KB -import scala.collection.mutable.Map +import net.sansa_stack.ml.spark.mining.amieSpark._ object Rules { @@ -49,7 +45,7 @@ object Rules { return this.sortedRule } - /**initializes rule, support, bodySize and sizeHead*/ + /** initializes rule, support, bodySize and sizeHead */ def initRule(x: ArrayBuffer[RDFTriple], k: KB, sc: SparkContext, sqlContext: SQLContext) { this.rule = x @@ -78,7 +74,7 @@ object Rules { this.rule = tp } - /**returns ArrayBuffer with every triplePattern of the body as a RDFTriple*/ + /** returns ArrayBuffer with every triplePattern of the body as a RDFTriple */ def hc(): Double = { if (this.bodySize < 1) { @@ -134,15 +130,17 @@ object Rules { */ def calcSupport(k: KB, sc: SparkContext, sqlContext: SQLContext) { + println(s"computing support for rule $rule ...") if (this.rule.length > 1) { val mapList = k.cardinality(this.rule, sc, sqlContext) this.support = mapList.count() + println(s"#support($rule):$support") } } - /**returns the length of the body*/ + /** returns the length of the body */ def bodyLength(): Int = { var x = this.rule.length - 1 @@ -168,7 +166,7 @@ object Rules { maptp += (x._1 -> counter) } - /**checking map for placeholder for the object*/ + /** checking map for placeholder for the object */ if (!(maptp.contains(x._3))) { maptp += (x._3 -> 1) } else { @@ -262,29 +260,30 @@ object Rules { } def parentsOfRule(outMap: Map[String, ArrayBuffer[(ArrayBuffer[RDFTriple], RuleContainer)]], sc: SparkContext): ArrayBuffer[RuleContainer] = { - // TODO: create new rules with body in alphabetical order + // TODO: create new rules with body in alphabetical order var parents = ArrayBuffer(this.parent) val r = this.sortedRule.clone - if (outMap.get(r(0).predicate) == None) { return parents } - var rel = outMap.get(r(0).predicate).get + if (outMap.get(r(0).predicate) == None) { + return parents + } + var rel = outMap.get(r(0).predicate).get var tp: ArrayBuffer[RDFTriple] = new ArrayBuffer - var filtered = rel.filter(x => (x._1.length == r.length - 1)) - /* - for (f <- filtered){ + /* + for (f <- filtered) { var bool = true - for (ff <- f._1){ - 
if (!(r.contains(ff))){ + for (ff <- f._1) { + if (!(r.contains(ff))) { bool = false } } - if (bool){ + if (bool) { parents += f._2 } - }*/ + } */ for (l <- 1 to r.length - 1) { if (!(filtered.isEmpty)) { @@ -319,7 +318,7 @@ object Rules { maptp.put(x._1, (maptp.get(x._1).get + 1)).get } - /**checking map for placeholder for the object*/ + /** checking map for placeholder for the object */ if (!(maptp.contains(x._3))) { maptp += (x._3 -> 1) @@ -359,7 +358,7 @@ object Rules { maptp.put(x._1, (maptp.get(x._1).get + 1)).get } - /**checking map for placeholder for the object*/ + /** checking map for placeholder for the object */ if (!(maptp.contains(x._3))) { varArBuff += x._3 @@ -398,13 +397,11 @@ object Rules { var out2: ArrayBuffer[(ArrayBuffer[RDFTriple], RuleContainer)] = new ArrayBuffer var out: (ArrayBuffer[RuleContainer], ArrayBuffer[(ArrayBuffer[RDFTriple], RuleContainer)]) = (out1, out2) - if (triples.length <= 1) { - return out - } - var triplesCardcombis: ArrayBuffer[ArrayBuffer[RDFTriple]] = new ArrayBuffer + if (triples.length <= 1) return out - //var rdd =sc.parallelize(arbuf.toSeq) - //out ++= rdd.filter(x => (sameRule(triples, x._1))).map(y => y._2).collect + var triplesCardcombis: ArrayBuffer[ArrayBuffer[RDFTriple]] = new ArrayBuffer + // var rdd =sc.parallelize(arbuf.toSeq) + // out ++= rdd.filter(x => (sameRule(triples, x._1))).map(y => y._2).collect for (x <- arbuf) { if (sameRule(triples, x._1)) { @@ -415,13 +412,13 @@ object Rules { /* var rdd = sc.parallelize(arbuf.toSeq) var pq = rdd.map{ x=> - if (sameRule(triples, x._1)){ - ("out1", x._2) - + if (sameRule(triples, x._1)) { + ("out1", x._2) + } else ("out2",x) }.groupByKey() - * + * */ out = (out1, out2) @@ -492,8 +489,6 @@ object Rules { return out } - //end - + // end } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/SQLSchema.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/SQLSchema.scala index 9a108d9..11808e9 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/SQLSchema.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/SQLSchema.scala @@ -1,6 +1,5 @@ - - package net.sansa_stack.ml.spark.mining.amieSpark + /** * The SQL schema used for RDF triples in a Dataframe. 
* @@ -8,12 +7,12 @@ package net.sansa_stack.ml.spark.mining.amieSpark */ object SQLSchema { - def triplesTable = "TRIPLES" + def triplesTable: String = "TRIPLES" - def subjectCol = "subject" + def subjectCol: String = "subject" - def predicateCol = "predicate" + def predicateCol: String = "predicate" - def objectCol = "object" + def objectCol: String = "object" } diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/amieExample.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/amieExample.scala deleted file mode 100644 index 4f91dad..0000000 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/amieExample.scala +++ /dev/null @@ -1,87 +0,0 @@ -package net.sansa_stack.ml.spark.mining.amieSpark - -import org.apache.spark.SparkContext -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, SQLContext, SparkSession, _} -import net.sansa_stack.ml.spark.mining.amieSpark.KBObject.KB -import net.sansa_stack.ml.spark.mining.amieSpark.MineRules.Algorithm - -import org.apache.hadoop.fs.FileSystem -import org.apache.hadoop.fs.Path -import java.net.URI - - -import java.io.File - -object amieExample { - - def main(args: Array[String]) = { - - - - - - val know = new KB() - - val sparkSession = SparkSession.builder - - .master("spark://172.18.160.16:3077") - .appName("SPARK Reasoning") - .config("spark.sql.warehouse.dir", "file:///data/home/MohamedMami/spark-2.1.0-bin-hadoop2.7/bin/spark-warehouse") - - - .getOrCreate() - - - val hdfsPath:String = args(0) - - val outputPath =hdfsPath - val inputFile = hdfsPath + args(1) - - - - - val sc = sparkSession.sparkContext - - val sqlContext = new org.apache.spark.sql.SQLContext(sc) - - - know.sethdfsPath(hdfsPath) - know.setKbSrc(inputFile) - - know.setKbGraph(RDFGraphLoader.loadFromFile(know.getKbSrc(), sc, 2)) - know.setDFTable(DfLoader.loadFromFileDF(know.getKbSrc, sc, sqlContext, 2) ) - - - - - - val algo = new Algorithm (know, 0.01, 3, 0.1, hdfsPath) - - - var erg = algo.ruleMining(sc, sqlContext) - var outString = erg.map { x => - var rdfTrp = x.getRule() - var temp = "" - for (i <- 0 to rdfTrp.length - 1) { - if (i == 0) { - temp = rdfTrp(i) + " <= " - } else { - temp += rdfTrp(i) + " \u2227 " - } - } - temp = temp.stripSuffix(" \u2227 ") - temp - }.toSeq - - outString.foreach(println) - var rddOut = sc.parallelize(outString).repartition(1) - - rddOut.saveAsTextFile(outputPath + "testOut") - - sc.stop - - -} - -} \ No newline at end of file diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/AnomalWithDataframeCrossJoin.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/AnomalWithDataframeCrossJoin.scala index 12fab07..75c6a2f 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/AnomalWithDataframeCrossJoin.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/AnomalWithDataframeCrossJoin.scala @@ -1,23 +1,18 @@ package net.sansa_stack.ml.spark.outliers.anomalydetection -import org.apache.jena.graph.Node -import org.apache.spark.rdd.RDD -import org.apache.spark.RangePartitioner -import org.apache.jena.graph.Triple -import org.apache.spark.sql.SparkSession -import org.apache.spark.HashPartitioner import scala.collection.mutable import scala.collection.mutable.HashSet -import org.apache.jena.graph.NodeFactory + +import org.apache.commons.math3.stat.descriptive._ +import org.apache.jena.graph.{ Node, 
NodeFactory, Triple } +import org.apache.spark.{ HashPartitioner, RangePartitioner } +import org.apache.spark.ml.feature.{ MinHashLSH, _ } +import org.apache.spark.ml.linalg._ +import org.apache.spark.rdd.{ RDD, _ } import org.apache.spark.sql._ +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.functions.{ col, udf } import org.apache.spark.sql.types._ -import org.apache.spark.rdd._ -import org.apache.spark.ml.feature.MinHashLSH -import org.apache.spark.sql.functions.udf -import org.apache.spark.ml.feature._ -import org.apache.spark.ml.linalg._ -import org.apache.spark.sql.functions.col -import org.apache.commons.math3.stat.descriptive._ import org.apache.spark.storage.StorageLevel /* Dataframe CrossJoin works well for smaller datasets(for e.g. 3.6GB) @@ -30,34 +25,34 @@ import org.apache.spark.storage.StorageLevel */ class AnomalWithDataframeCrossJoin(nTriplesRDD: RDD[Triple], objList: List[String], - triplesType: List[String], JSimThreshold: Double, - listSuperType: List[String], sparkSession: SparkSession, hypernym: String, numPartition: Int) extends Serializable { + triplesType: List[String], JSimThreshold: Double, + listSuperType: List[String], sparkSession: SparkSession, hypernym: String, numPartition: Int) extends Serializable { def run(): RDD[(Set[(String, String, Object)])] = { - // get all the triples whose objects are literal - //these literals also contains xsd:date as well as xsd:langstring + // get all the triples whose objects are literal + // these literals also contains xsd:date as well as xsd:langstring val getObjectLiteral = getObjectList() - //remove the literal which has ^^xsd:date or xsd:langstring(only considering numerical) + // remove the literal which has ^^xsd:date or xsd:langstring(only considering numerical) val removedLangString = getObjectLiteral.filter(f => searchedge(f.getObject.toString(), objList)) - //the predicate wikipageId,wikiPageRevisionID are not important for outliers + // the predicate wikipageId,wikiPageRevisionID are not important for outliers val removewiki = removedLangString.filter(f => (!f.getPredicate.toString().contains("wikiPageID")) && (!f.getPredicate.toString().contains("wikiPageRevisionID"))) - //checking object has only numerical data only + // checking object has only numerical data only val triplesWithNumericLiteral = triplesWithNumericLit(removewiki) - - //Pair rdd with key as subject and calue as triple with numerical literal - val mapSubWithTriples = propClustering(triplesWithNumericLiteral) //.persist - //get triples of hypernym + // Pair rdd with key as subject and calue as triple with numerical literal + val mapSubWithTriples = propClustering(triplesWithNumericLiteral) // .persist + + // get triples of hypernym val getHypernymTriples = getHyp() - //filter Dbpedia's rdf type and join with hyernym + // filter Dbpedia's rdf type and join with hyernym val rdfTypeDBwiki = rdfType(getHypernymTriples) - //joining those subjects only who has rdf:ytpe/hypernym and numerical literal + // joining those subjects only who has rdf:ytpe/hypernym and numerical literal val rdfTypeWithSubject = mapSubWithTriples.join(rdfTypeDBwiki) val mapSubjectwithType = rdfTypeWithSubject.map(f => (f._1, f._2._2)) @@ -68,13 +63,13 @@ class AnomalWithDataframeCrossJoin(nTriplesRDD: RDD[Triple], objList: List[Strin } - //filter triples with hypernm + // filter triples with hypernm def getHyp(): RDD[Triple] = nTriplesRDD.filter(f => f.getPredicate.toString().equals(hypernym)) - //filtering triples with literal at object position + // 
filtering triples with literal at object position def getObjectList(): RDD[Triple] = nTriplesRDD.filter(f => f.getObject.isLiteral()) - //filtering only numeric literals + // filtering only numeric literals def triplesWithNumericLit(objLit: RDD[Triple]): RDD[Triple] = objLit.filter(f => isNumeric(f.getObject.toString())) def isNumeric(x: String): Boolean = @@ -83,21 +78,17 @@ class AnomalWithDataframeCrossJoin(nTriplesRDD: RDD[Triple], objList: List[Strin val c = x.indexOf('^') val subject = x.substring(1, c - 1) - if (isAllDigits(subject)) - true - else - false - } else - false + if (isAllDigits(subject)) true + else false + } else false } def isAllDigits(x: String): Boolean = { var found = false for (ch <- x) { - if (ch.isDigit || ch == '.') + if (ch.isDigit || ch == '.') { found = true - else if (ch.isLetter) { - + } else if (ch.isLetter) { found = false } } @@ -110,20 +101,19 @@ class AnomalWithDataframeCrossJoin(nTriplesRDD: RDD[Triple], objList: List[Strin val c = x.indexOf('^') val subject = x.substring(c + 2) y.contains(subject) - } else - false + } else false } def rdfType(getHypernym: RDD[Triple]): RDD[(String, HashSet[String])] = { - //filter triples with predicate as rdf:type + // filter triples with predicate as rdf:type val triplesWithRDFType = nTriplesRDD.filter(_.getPredicate.toString() == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type") val triplesWithDBpedia = triplesWithRDFType.filter(f => searchType(f.getObject.toString(), triplesType)) val subWithType1 = triplesWithDBpedia.map(f => // ... - (getLocalName1(f.getSubject), (getLocalName1(f.getObject)))) //.reduceByKey(_ ++ _) //.partitionBy(new HashPartitioner(8)).persist() + (getLocalName1(f.getSubject), (getLocalName1(f.getObject)))) // .reduceByKey(_ ++ _) //.partitionBy(new HashPartitioner(8)).persist() val initialSet1 = mutable.HashSet.empty[String] val addToSet1 = (s: mutable.HashSet[String], v: String) => s += v @@ -131,7 +121,7 @@ class AnomalWithDataframeCrossJoin(nTriplesRDD: RDD[Triple], objList: List[Strin val uniqueByKey1 = subWithType1.aggregateByKey(initialSet1)(addToSet1, mergePartitionSets1) val hyper1 = getHypernym.map(f => - (getLocalName1(f.getSubject), (getLocalName1(f.getObject) + ("hypernym")))) //.partitionBy(new HashPartitioner(8)).persist + (getLocalName1(f.getSubject), (getLocalName1(f.getObject) + ("hypernym")))) // .partitionBy(new HashPartitioner(8)).persist val initialSet = mutable.HashSet.empty[String] val addToSet = (s: mutable.HashSet[String], v: String) => s += v @@ -157,31 +147,31 @@ class AnomalWithDataframeCrossJoin(nTriplesRDD: RDD[Triple], objList: List[Strin def searchType(x: String, y: List[String]): Boolean = { if (y.exists(x.contains)) { true - } else - false + } else false } - def jSimilarity(TriplesWithNumericLiteral: RDD[Triple], - rdfTypeDBwiki: RDD[(String, HashSet[String])], mapSubWithTriples: RDD[(String, mutable.Set[(String, String, Object)])]): RDD[(Set[(String, String, Object)])] = { + def jSimilarity( + TriplesWithNumericLiteral: RDD[Triple], + rdfTypeDBwiki: RDD[(String, HashSet[String])], mapSubWithTriples: RDD[(String, mutable.Set[(String, String, Object)])]): RDD[(Set[(String, String, Object)])] = { nTriplesRDD.unpersist() import sparkSession.implicits._ - //KV pair with subject as key and rdf type/hypernym as value + // KV pair with subject as key and rdf type/hypernym as value val hashtoseq = rdfTypeDBwiki.map(f => (f._1, f._2.toSeq)) val part = new RangePartitioner(30, hashtoseq) val partitioned = hashtoseq.partitionBy(part).persist() - //converting the rdd 
to dataframe + // converting the rdd to dataframe val dfA = partitioned.toDF("id1", "value1") val dfB = partitioned.toDF("id2", "value2") - //crossJoin of the rdd + // crossJoin of the rdd val joindfA = dfA.crossJoin(dfB) - //registering Jaccard similarity function in udf + // registering Jaccard similarity function in udf val myUDF = udf(sim _) - //papplying jaccard similarity function to each row + // papplying jaccard similarity function to each row val newDF = joindfA.withColumn("Jsim", myUDF(joindfA("value1"), joindfA("value2"))).select("id1", "id2", "Jsim").filter($"Jsim" > 0.6) - //converting df to rdd - val x1 = newDF.rdd + // converting df to rdd + val x1 = newDF.rdd .map(row => { val id = row.getString(0) val value = row.getString(1) @@ -193,13 +183,13 @@ class AnomalWithDataframeCrossJoin(nTriplesRDD: RDD[Triple], objList: List[Strin val mergePartitionSets3 = (p1: mutable.Set[String], p2: mutable.Set[String]) => p1 ++= p2 val uniqueByKey3 = x1.aggregateByKey(initialSet3)(addToSet3, mergePartitionSets3) - //create cohort of subjects + // create cohort of subjects val SubKV = uniqueByKey3.map(f => ((f._1, (f._2 += (f._1)).toSet))) val partitioner = new HashPartitioner(80) val mapSubWithTriplesPart = mapSubWithTriples.partitionBy(partitioner).persist(StorageLevel.MEMORY_AND_DISK) // --heap size error on local mode when not unpersisted with persist - //join cohort of subjects with KV value of mapSubWithTriples - val ys = SubKV.partitionBy(partitioner).persist(StorageLevel.MEMORY_AND_DISK) + // join cohort of subjects with KV value of mapSubWithTriples + val ys = SubKV.partitionBy(partitioner).persist(StorageLevel.MEMORY_AND_DISK) val g = ys.join(mapSubWithTriples) val clusterOfSubjects = g.map({ @@ -229,8 +219,7 @@ class AnomalWithDataframeCrossJoin(nTriplesRDD: RDD[Triple], objList: List[Strin def isContains(a: List[Node], b: List[Node]): Boolean = { if (a.forall(b.contains) || b.forall(a.contains)) { true - } else - false + } else false } def removeSupType(a: RDD[((String, HashSet[String]), (String, HashSet[String]))]): RDD[((String, HashSet[String]), (String, HashSet[String]))] = { @@ -251,13 +240,12 @@ class AnomalWithDataframeCrossJoin(nTriplesRDD: RDD[Triple], objList: List[Strin }) clusterOfProp - } else - a + } else a } def propClustering(triplesWithNumericLiteral: RDD[Triple]): RDD[(String, mutable.Set[(String, String, Object)])] = { val subMap = triplesWithNumericLiteral.map(f => (getLocalName1(f.getSubject), - (getLocalName1(f.getSubject), getLocalName1(f.getPredicate), getNumber(f.getObject.toString())))) //.partitionBy(new HashPartitioner(8)) //make a function instead of using + (getLocalName1(f.getSubject), getLocalName1(f.getPredicate), getNumber(f.getObject.toString())))) // .partitionBy(new HashPartitioner(8)) //make a function instead of using val initialSet = mutable.Set.empty[(String, String, Object)] val addToSet = (s: mutable.Set[(String, String, Object)], v: (String, String, Object)) => s += v @@ -281,9 +269,9 @@ class AnomalWithDataframeCrossJoin(nTriplesRDD: RDD[Triple], objList: List[Strin var dbtype2: Seq[String] = null val hyper1 = seq1.filter(p => p.contains("hypernym")) val hyper2 = seq2.filter(p => p.contains("hypernym")) - //case of usa and India + // case of usa and India - //USA= hypernym/states and India :- hypernym//Country + // USA= hypernym/states and India :- hypernym//Country if (hyper1 == hyper2 && !hyper1.isEmpty && !hyper2.isEmpty) { jSimilarity = 1.0 @@ -291,12 +279,14 @@ class AnomalWithDataframeCrossJoin(nTriplesRDD: RDD[Triple], 
objList: List[Strin } else { if (seq1.contains("hypernym")) { dbtype1 = seq1.dropRight(1) - } else + } else { dbtype1 = seq1 + } if (seq2.contains("hypernym")) { dbtype2 = seq2.dropRight(1) - } else + } else { dbtype2 = seq2 + } val intersect_cnt = dbtype1.toSet.intersect(dbtype2.toSet).size @@ -308,7 +298,7 @@ class AnomalWithDataframeCrossJoin(nTriplesRDD: RDD[Triple], objList: List[Strin } def iqr2(cluster: Seq[(String, String, Object)], anomalyListLimit: Int): Seq[(String, String, Object)] = { - //create sample data + // create sample data val listofData = cluster.map(b => (b._3.toString()).toDouble).toArray @@ -318,13 +308,13 @@ class AnomalWithDataframeCrossJoin(nTriplesRDD: RDD[Triple], objList: List[Strin genericArrayOps(c).foreach(v => arrMean.addValue(v)) // Get first and third quartiles and then calc IQR val Q1 = arrMean.getPercentile(25) - //println("Q1="+Q1) + // println("Q1="+Q1) val Q3 = arrMean.getPercentile(75) - //println("Q3="+Q3) + // println("Q3="+Q3) val IQR = Q3 - Q1 - //println("IQR="+IQR) + // println("IQR="+IQR) val lowerRange = Q1 - 1.5 * IQR - //println("lowerRange="+lowerRange) + // println("lowerRange="+lowerRange) val upperRange = Q3 + 1.5 * IQR // println("upperRange="+upperRange) val yse = c.filter(p => (p < lowerRange || p > upperRange)) @@ -335,15 +325,13 @@ class AnomalWithDataframeCrossJoin(nTriplesRDD: RDD[Triple], objList: List[Strin } def search(a: Double, b: Array[Double]): Boolean = { - if (b.contains(a)) - true - else - false - + if (b.contains(a)) true + else false } } object AnomalWithDataframeCrossJoin { def apply(nTriplesRDD: RDD[Triple], objList: List[String], triplesType: List[String], - JSimThreshold: Double, listSuperType: List[String], sparkSession: SparkSession, hypernym: String, numPartition: Int) = new AnomalWithDataframeCrossJoin(nTriplesRDD, objList, triplesType, + JSimThreshold: Double, listSuperType: List[String], sparkSession: SparkSession, + hypernym: String, numPartition: Int): AnomalWithDataframeCrossJoin = new AnomalWithDataframeCrossJoin(nTriplesRDD, objList, triplesType, JSimThreshold, listSuperType, sparkSession, hypernym, numPartition) } diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/AnomalyDetectionWithCountVetcorizerModel.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/AnomalyDetectionWithCountVetcorizerModel.scala index 56e98da..89507a8 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/AnomalyDetectionWithCountVetcorizerModel.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/AnomalyDetectionWithCountVetcorizerModel.scala @@ -1,22 +1,18 @@ package net.sansa_stack.ml.spark.outliers.anomalydetection -import org.apache.jena.graph.Node -import org.apache.spark.rdd.RDD -import org.apache.jena.graph.Triple -import org.apache.spark.sql.SparkSession -import org.apache.spark.HashPartitioner import scala.collection.mutable import scala.collection.mutable.HashSet -import org.apache.jena.graph.NodeFactory -import org.apache.spark.sql._ -import org.apache.spark.sql.types._ -import org.apache.spark.rdd._ -import org.apache.spark.ml.feature.MinHashLSH -import org.apache.spark.sql.functions.udf -import org.apache.spark.ml.feature._ -import org.apache.spark.ml.linalg._ -import org.apache.spark.sql.functions.col + import org.apache.commons.math3.stat.descriptive._ +import org.apache.jena.graph.{ Node, NodeFactory, Triple } +import org.apache.spark.{ HashPartitioner, 
RangePartitioner } +import org.apache.spark.ml.feature.{ MinHashLSH, _ } +import org.apache.spark.ml.linalg._ +import org.apache.spark.rdd.{ RDD, _ } +import org.apache.spark.sql.{ SparkSession, _ } +import org.apache.spark.sql.functions.{ col, udf } +import org.apache.spark.sql.types._ +import org.apache.spark.storage.StorageLevel /* * @@ -27,39 +23,39 @@ import org.apache.commons.math3.stat.descriptive._ */ class AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList: List[String], - triplesType: List[String], JSimThreshold: Double, - listSuperType: List[String], sparkSession: SparkSession, hypernym: String, numPartition: Int) extends Serializable { + triplesType: List[String], JSimThreshold: Double, + listSuperType: List[String], sparkSession: SparkSession, hypernym: String, numPartition: Int) extends Serializable { def run(): RDD[(Set[(String, String, Object)])] = { - // get all the triples whose objects are literal - //these literals also contains xsd:date as well as xsd:langstring + // get all the triples whose objects are literal + // these literals also contains xsd:date as well as xsd:langstring val getObjectLiteral = getObjectList() - //remove the literal which has ^^xsd:date or xsd:langstring(only considering numerical) + // remove the literal which has ^^xsd:date or xsd:langstring(only considering numerical) val removedLangString = getObjectLiteral.filter(f => searchedge(f.getObject.toString(), objList)) val removewiki = removedLangString.filter(f => (!f.getPredicate.toString().contains("wikiPageID")) && (!f.getPredicate.toString().contains("wikiPageRevisionID"))) - //checking still object has only numerical data only + // checking still object has only numerical data only val triplesWithNumericLiteral = triplesWithNumericLit(removewiki) - val mapSubWithTriples = propClustering(triplesWithNumericLiteral) //.persist + val mapSubWithTriples = propClustering(triplesWithNumericLiteral) // .persist - //get triples of hypernym + // get triples of hypernym val getHypernymTriples = getHyp() - - //filter rdf type having object value dbpedia and join with hyernym + + // filter rdf type having object value dbpedia and join with hyernym // val rdfTypeDBwiki = rdfType(getHypernym) //.partitionBy(new HashPartitioner(2)).persist() val rdfTypeDBwiki = rdfType(getHypernymTriples) - - //joining those subjects only who has rdf:ytpe and numerical literal + + // joining those subjects only who has rdf:ytpe and numerical literal val rdfTypeWithSubject = mapSubWithTriples.join(rdfTypeDBwiki) - + val mapSubjectwithType = rdfTypeWithSubject.map(f => (f._1, f._2._2)) - + // val propwithSub = propwithsubject(triplesWithNumericLiteral) - //cluster subjects on the basis of rdf type + // cluster subjects on the basis of rdf type val jacardSimilarity = jSimilarity(triplesWithNumericLiteral, mapSubjectwithType, mapSubWithTriples) jacardSimilarity @@ -79,20 +75,17 @@ class AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList val c = x.indexOf('^') val subject = x.substring(1, c - 1) - if (isAllDigits(subject)) - true - else - false - } else - false + if (isAllDigits(subject)) true + else false + } else false } def isAllDigits(x: String): Boolean = { var found = false for (ch <- x) { - if (ch.isDigit || ch == '.') + if (ch.isDigit || ch == '.') { found = true - else if (ch.isLetter) { + } else if (ch.isLetter) { found = false } @@ -106,20 +99,19 @@ class AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList val c = x.indexOf('^') val subject = 
x.substring(c + 2) y.contains(subject) - } else - false + } else false } def rdfType(getHypernym: RDD[Triple]): RDD[(String, HashSet[String])] = { - //filter triples with predicate as rdf:type + // filter triples with predicate as rdf:type val triplesWithRDFType = nTriplesRDD.filter(_.getPredicate.toString() == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type") val triplesWithDBpedia = triplesWithRDFType.filter(f => searchType(f.getObject.toString(), triplesType)) val subWithType1 = triplesWithDBpedia.map(f => // ... - (getLocalName1(f.getSubject), (getLocalName1(f.getObject)))) //.reduceByKey(_ ++ _) //.partitionBy(new HashPartitioner(8)).persist() + (getLocalName1(f.getSubject), (getLocalName1(f.getObject)))) // .reduceByKey(_ ++ _) //.partitionBy(new HashPartitioner(8)).persist() val initialSet1 = mutable.HashSet.empty[String] val addToSet1 = (s: mutable.HashSet[String], v: String) => s += v @@ -127,7 +119,7 @@ class AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList val uniqueByKey1 = subWithType1.aggregateByKey(initialSet1)(addToSet1, mergePartitionSets1) val hyper1 = getHypernym.map(f => - (getLocalName1(f.getSubject), (getLocalName1(f.getObject) + ("hypernym")))) //.partitionBy(new HashPartitioner(8)).persist + (getLocalName1(f.getSubject), (getLocalName1(f.getObject) + ("hypernym")))) // .partitionBy(new HashPartitioner(8)).persist val initialSet = mutable.HashSet.empty[String] val addToSet = (s: mutable.HashSet[String], v: String) => s += v @@ -153,11 +145,11 @@ class AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList def searchType(x: String, y: List[String]): Boolean = { if (y.exists(x.contains)) { true - } else - false + } else false } - def jSimilarity(TriplesWithNumericLiteral: RDD[Triple], - rdfTypeDBwiki: RDD[(String, HashSet[String])], mapSubWithTriples: RDD[(String, mutable.Set[(String, String, Object)])]): RDD[(Set[(String, String, Object)])] = { + def jSimilarity( + TriplesWithNumericLiteral: RDD[Triple], + rdfTypeDBwiki: RDD[(String, HashSet[String])], mapSubWithTriples: RDD[(String, mutable.Set[(String, String, Object)])]): RDD[(Set[(String, String, Object)])] = { nTriplesRDD.unpersist() import sparkSession.implicits._ @@ -172,10 +164,10 @@ class AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList .setMinDF(1) .fit(dfA) - val kt = cvModel.transform(dfA) //.filter(isNoneZeroVector(col("features"))) - + val kt = cvModel.transform(dfA) // .filter(isNoneZeroVector(col("features"))) + val mh = new MinHashLSH() - .setNumHashTables(3) //tested with 100 on out4.nt file ..result in /home/rajjat/Desktop/recent_dataset/output_removed_boolean_udf.txt + .setNumHashTables(3) // tested with 100 on out4.nt file ..result in /home/rajjat/Desktop/recent_dataset/output_removed_boolean_udf.txt .setInputCol("features") .setOutputCol("hashes") @@ -183,13 +175,13 @@ class AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList // val model1 = mh.fit(featurizedData) val dffilter = model.approxSimilarityJoin(kt, kt, 0.45) - val opiu = dffilter.filter($"datasetA.id".isNotNull).filter($"datasetB.id".isNotNull) .filter(($"datasetA.id" =!= $"datasetB.id")) - .select(col("datasetA.id").alias("id1"), + .select( + col("datasetA.id").alias("id1"), col("datasetB.id").alias("id2")) - val x1 = opiu.rdd //maimum time taken by this rdd + val x1 = opiu.rdd // maimum time taken by this rdd .map(row => { val id = row.getString(0) val value = row.getString(1) @@ -200,16 +192,16 @@ class 
AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList val addToSet3 = (s: mutable.Set[String], v: String) => s += v val mergePartitionSets3 = (p1: mutable.Set[String], p2: mutable.Set[String]) => p1 ++= p2 val uniqueByKey3 = x1.aggregateByKey(initialSet3)(addToSet3, mergePartitionSets3) - + x1.unpersist() - + val k = uniqueByKey3.map(f => ((f._2 += (f._1)).toSet)).map(a => (a, a)) .aggregateByKey(Set[String]())((x, y) => y, (x, y) => x) .keys.distinct() val abc = k.repartition(50).persist() val simSubjectCart = abc.cartesian(abc).filter(f => f._1.intersect(f._2).size > 0) - + partitionedy.unpersist() // joined.unpersist() val subsetMembers = simSubjectCart.filter { case (set1, set2) => (set2.subsetOf(set1)) && (set1 -- set2).nonEmpty } @@ -217,8 +209,8 @@ class AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList val superset1 = abc.subtract(sdf) val ys = superset1.flatMap(f => (f.map(g => (g, f)))) - - val g=ys.join(mapSubWithTriples) + + val g = ys.join(mapSubWithTriples) val clusterOfSubjects = g.map({ case (s, (iter, iter1)) => ((iter).toSet, iter1) @@ -240,15 +232,14 @@ class AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList val clusterOfProp = propDistinct.map({ case (a, (iter1)) => (iter1.filter(f => f._2.equals(a))) }) - + clusterOfProp } def isContains(a: List[Node], b: List[Node]): Boolean = { if (a.forall(b.contains) || b.forall(a.contains)) { true - } else - false + } else false } def removeSupType(a: RDD[((String, HashSet[String]), (String, HashSet[String]))]): RDD[((String, HashSet[String]), (String, HashSet[String]))] = { @@ -269,13 +260,12 @@ class AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList }) clusterOfProp - } else - a + } else a } def propClustering(triplesWithNumericLiteral: RDD[Triple]): RDD[(String, mutable.Set[(String, String, Object)])] = { val subMap = triplesWithNumericLiteral.map(f => (getLocalName1(f.getSubject), - (getLocalName1(f.getSubject), getLocalName1(f.getPredicate), getNumber(f.getObject.toString())))) //.partitionBy(new HashPartitioner(8)) //make a function instead of using + (getLocalName1(f.getSubject), getLocalName1(f.getPredicate), getNumber(f.getObject.toString())))) // .partitionBy(new HashPartitioner(8)) //make a function instead of using val initialSet = mutable.Set.empty[(String, String, Object)] val addToSet = (s: mutable.Set[(String, String, Object)], v: (String, String, Object)) => s += v @@ -299,9 +289,9 @@ class AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList var dbtype2: HashSet[String] = null val hyper1 = seq1.filter(p => p.contains("hypernym")) val hyper2 = seq2.filter(p => p.contains("hypernym")) - //case of usa and India + // case of usa and India - //USA= hypernym/states and India :- hypernym//Country + // USA= hypernym/states and India :- hypernym//Country if (hyper1 == hyper2 && !hyper1.isEmpty && !hyper2.isEmpty) { jSimilarity = 1.0 @@ -309,12 +299,14 @@ class AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList } else { if (seq1.contains("hypernym")) { dbtype1 = seq1.dropRight(1) - } else + } else { dbtype1 = seq1 + } if (seq2.contains("hypernym")) { dbtype2 = seq2.dropRight(1) - } else + } else { dbtype2 = seq2 + } val intersect_cnt = dbtype1.toSet.intersect(dbtype2.toSet).size @@ -327,7 +319,7 @@ class AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList def iqr1(cluster: Seq[(String, String, Object)], anomalyListLimit: Int): Dataset[Row] = { - //create sample 
data + // create sample data var result: Dataset[Row] = null // var _partitionData: RDD[String] = _ @@ -337,7 +329,7 @@ class AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList val listofData = cluster.map(b => (b._3.toString()).toDouble).toList val k = sparkSession.sparkContext.makeRDD(listofData) - //create sample data + // create sample data // println("sampleData=" + listofData) val c = listofData.sorted @@ -351,9 +343,10 @@ class AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList val dfWithoutSchema = sparkSession.createDataFrame(KVcluster).toDF("id", "outliers") // calculate quantiles and IQR - val quantiles = df.stat.approxQuantile("value", + val quantiles = df.stat.approxQuantile( + "value", Array(0.25, 0.75), 0.0) - //quantiles.foreach(println) + // quantiles.foreach(println) val Q1 = quantiles(0) @@ -383,52 +376,48 @@ class AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList result } result - // + // // // result.show() // result.where(result.col("outliers").isNotNull) } def iqr2(cluster: Seq[(String, String, Object)], anomalyListLimit: Int): Seq[(String, String, Object)] = { - //create sample data + // create sample data val listofData = cluster.map(b => (b._3.toString()).toDouble).toArray - - val c = listofData.sorted - + val arrMean = new DescriptiveStatistics() genericArrayOps(c).foreach(v => arrMean.addValue(v)) // Get first and third quartiles and then calc IQR val Q1 = arrMean.getPercentile(25) - //println("Q1="+Q1) + // println("Q1="+Q1) val Q3 = arrMean.getPercentile(75) - //println("Q3="+Q3) + // println("Q3="+Q3) val IQR = Q3 - Q1 - //println("IQR="+IQR) + // println("IQR="+IQR) val lowerRange = Q1 - 1.5 * IQR - //println("lowerRange="+lowerRange) + // println("lowerRange="+lowerRange) val upperRange = Q3 + 1.5 * IQR // println("upperRange="+upperRange) val yse = c.filter(p => (p < lowerRange || p > upperRange)) - val xde = cluster.filter(f => search(f._3.toString().toDouble, yse)) - + xde } def search(a: Double, b: Array[Double]): Boolean = { - if (b.contains(a)) - true - else - false + if (b.contains(a)) true + else false } } object AnomalyDetectionWithCountVetcorizerModel { def apply(nTriplesRDD: RDD[Triple], objList: List[String], triplesType: List[String], - JSimThreshold: Double, listSuperType: List[String], sparkSession: SparkSession, hypernym: String, numPartition: Int) = new AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD, objList, triplesType, + JSimThreshold: Double, listSuperType: List[String], sparkSession: SparkSession, + hypernym: String, numPartition: Int): AnomalyDetectionWithCountVetcorizerModel = new AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD, objList, triplesType, JSimThreshold, listSuperType, sparkSession, hypernym, numPartition) } diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/AnomalyWithHashingTF.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/AnomalyWithHashingTF.scala index dc0db93..99a9305 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/AnomalyWithHashingTF.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/AnomalyWithHashingTF.scala @@ -1,24 +1,19 @@ package net.sansa_stack.ml.spark.outliers.anomalydetection -import org.apache.jena.graph.Node -import org.apache.spark.rdd.RDD -import org.apache.spark.RangePartitioner -import org.apache.jena.graph.Triple -import 
org.apache.spark.sql.SparkSession -import org.apache.spark.HashPartitioner import scala.collection.mutable import scala.collection.mutable.HashSet -import org.apache.jena.graph.NodeFactory -import org.apache.spark.sql._ -import org.apache.spark.sql.types._ -import org.apache.spark.rdd._ -import org.apache.spark.ml.feature.MinHashLSH -import org.apache.spark.sql.functions.udf -import org.apache.spark.ml.feature._ -import org.apache.spark.ml.linalg._ -import org.apache.spark.sql.functions.col + import org.apache.commons.math3.stat.descriptive._ +import org.apache.jena.graph.{ Node, NodeFactory, Triple } +import org.apache.spark.{ HashPartitioner, RangePartitioner } +import org.apache.spark.ml.feature.{ MinHashLSH, _ } +import org.apache.spark.ml.linalg._ +import org.apache.spark.rdd.{ RDD, _ } +import org.apache.spark.sql.{ SparkSession, _ } +import org.apache.spark.sql.functions.{ col, udf } +import org.apache.spark.sql.types._ import org.apache.spark.storage.StorageLevel + /* * * AnomalyDetection - Anomaly detection of numerical data @@ -32,34 +27,34 @@ class AnomalyWithHashingTF(nTriplesRDD: RDD[Triple], objList: List[String], listSuperType: List[String], sparkSession: SparkSession, hypernym: String, numPartition: Int) extends Serializable { def run(): RDD[(Set[(String, String, Object)])] = { - // get all the triples whose objects are literal - //these literals also contains xsd:date as well as xsd:langstring + // get all the triples whose objects are literal + // these literals also contains xsd:date as well as xsd:langstring val getObjectLiteral = getObjectList() - //remove the literal which has ^^xsd:date or xsd:langstring(only considering numerical) + // remove the literal which has ^^xsd:date or xsd:langstring(only considering numerical) val removedLangString = getObjectLiteral.filter(f => searchedge(f.getObject.toString(), objList)) val removewiki = removedLangString.filter(f => (!f.getPredicate.toString().contains("wikiPageID")) && (!f.getPredicate.toString().contains("wikiPageRevisionID"))) - //checking still object has only numerical data only + // checking still object has only numerical data only val triplesWithNumericLiteral = triplesWithNumericLit(removewiki) - val mapSubWithTriples = propClustering(triplesWithNumericLiteral) //.partitionBy(new HashPartitioner(40)).persist() + val mapSubWithTriples = propClustering(triplesWithNumericLiteral) // .partitionBy(new HashPartitioner(40)).persist() - //get triples of hypernym + // get triples of hypernym val getHypernymTriples = getHyp() - //filter rdf type having object value dbpedia and join with hyernym + // filter rdf type having object value dbpedia and join with hyernym val rdfTypeDBwiki = rdfType(getHypernymTriples) - //joining those subjects only who has rdf:ytpe and numerical literal + // joining those subjects only who has rdf:ytpe and numerical literal val rdfTypeWithSubject = mapSubWithTriples.join(rdfTypeDBwiki) val mapSubjectwithType = rdfTypeWithSubject.map(f => (f._1, f._2._2)) val propwithSub = propwithsubject(triplesWithNumericLiteral) - //cluster subjects on the basis of rdf type + // cluster subjects on the basis of rdf type val jacardSimilarity = jSimilarity(triplesWithNumericLiteral, propwithSub, mapSubjectwithType, mapSubWithTriples) jacardSimilarity @@ -79,25 +74,20 @@ class AnomalyWithHashingTF(nTriplesRDD: RDD[Triple], objList: List[String], val c = x.indexOf('^') val subject = x.substring(1, c - 1) - if (isAllDigits(subject)) - true - else - false - } else - false + if (isAllDigits(subject)) true + else 
false + } else false } def isAllDigits(x: String): Boolean = { var found = false for (ch <- x) { - if (ch.isDigit || ch == '.') + if (ch.isDigit || ch == '.') { found = true - else if (ch.isLetter) { - + } else if (ch.isLetter) { found = false } } - found } @@ -106,20 +96,19 @@ class AnomalyWithHashingTF(nTriplesRDD: RDD[Triple], objList: List[String], val c = x.indexOf('^') val subject = x.substring(c + 2) y.contains(subject) - } else - false + } else false } def rdfType(getHypernym: RDD[Triple]): RDD[(String, HashSet[String])] = { - //filter triples with predicate as rdf:type + // filter triples with predicate as rdf:type val triplesWithRDFType = nTriplesRDD.filter(_.getPredicate.toString() == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type") val triplesWithDBpedia = triplesWithRDFType.filter(f => searchType(f.getObject.toString(), triplesType)) val subWithType1 = triplesWithDBpedia.map(f => // ... - (getLocalName1(f.getSubject), (getLocalName1(f.getObject)))) //.reduceByKey(_ ++ _) //.partitionBy(new HashPartitioner(8)).persist() + (getLocalName1(f.getSubject), (getLocalName1(f.getObject)))) // .reduceByKey(_ ++ _) //.partitionBy(new HashPartitioner(8)).persist() val initialSet1 = mutable.HashSet.empty[String] val addToSet1 = (s: mutable.HashSet[String], v: String) => s += v @@ -127,7 +116,7 @@ class AnomalyWithHashingTF(nTriplesRDD: RDD[Triple], objList: List[String], val uniqueByKey1 = subWithType1.aggregateByKey(initialSet1)(addToSet1, mergePartitionSets1) val hyper1 = getHypernym.map(f => - (getLocalName1(f.getSubject), (getLocalName1(f.getObject) + ("hypernym")))) //.partitionBy(new HashPartitioner(8)).persist + (getLocalName1(f.getSubject), (getLocalName1(f.getObject) + ("hypernym")))) // .partitionBy(new HashPartitioner(8)).persist val initialSet = mutable.HashSet.empty[String] val addToSet = (s: mutable.HashSet[String], v: String) => s += v @@ -153,8 +142,7 @@ class AnomalyWithHashingTF(nTriplesRDD: RDD[Triple], objList: List[String], def searchType(x: String, y: List[String]): Boolean = { if (y.exists(x.contains)) { true - } else - false + } else false } def jSimilarity(TriplesWithNumericLiteral: RDD[Triple], xse: RDD[(String, String)], rdfTypeDBwiki: RDD[(String, HashSet[String])], mapSubWithTriples: RDD[(String, mutable.Set[(String, String, Object)])]): RDD[(Set[(String, String, Object)])] = { @@ -171,24 +159,24 @@ class AnomalyWithHashingTF(nTriplesRDD: RDD[Triple], objList: List[String], val hashingTF = new HashingTF() .setInputCol("values").setOutputCol("features").setNumFeatures(1048576) - val featurizedData = hashingTF.transform(dropDup) - + val featurizedData = hashingTF.transform(dropDup) + val mh = new MinHashLSH() - .setNumHashTables(3) + .setNumHashTables(3) .setInputCol("features") .setOutputCol("hashes") val model = mh.fit(featurizedData) val dffilter = model.approxSimilarityJoin(featurizedData, featurizedData, 0.45) - println("dffilter") - + val opiu = dffilter.filter($"datasetA.id".isNotNull).filter($"datasetB.id".isNotNull) .filter(($"datasetA.id" =!= $"datasetB.id")) - .select(col("datasetA.id").alias("id1"), - col("datasetB.id").alias("id2")) //heap space error due to persist + .select( + col("datasetA.id").alias("id1"), + col("datasetB.id").alias("id2")) // heap space error due to persist - val x1 = opiu.repartition(400).persist(StorageLevel.MEMORY_AND_DISK) + val x1 = opiu.repartition(400).persist(StorageLevel.MEMORY_AND_DISK) val x1Map = x1.rdd.map(row => { val id = row.getString(0) val value = row.getString(1) @@ -200,14 +188,12 @@ class 
AnomalyWithHashingTF(nTriplesRDD: RDD[Triple], objList: List[String], val mergePartitionSets3 = (p1: mutable.Set[String], p2: mutable.Set[String]) => p1 ++= p2 val uniqueByKey3 = x1Map.aggregateByKey(initialSet3)(addToSet3, mergePartitionSets3) - - val k = uniqueByKey3.map(f => ((f._1, (f._2 += (f._1)).toSet))) - + val partitioner = new HashPartitioner(500) - val mapSubWithTriplesPart = mapSubWithTriples.partitionBy(partitioner).persist(StorageLevel.MEMORY_AND_DISK) + val mapSubWithTriplesPart = mapSubWithTriples.partitionBy(partitioner).persist(StorageLevel.MEMORY_AND_DISK) - val ys = k.partitionBy(partitioner).persist(StorageLevel.MEMORY_AND_DISK) + val ys = k.partitionBy(partitioner).persist(StorageLevel.MEMORY_AND_DISK) val joinSimSubTriples2 = ys.join(mapSubWithTriplesPart) val clusterOfSubjects = joinSimSubTriples2.map({ @@ -233,21 +219,20 @@ class AnomalyWithHashingTF(nTriplesRDD: RDD[Triple], objList: List[String], mapSubWithTriplesPart.unpersist() ys.unpersist() - + clusterOfProp } def isContains(a: List[Node], b: List[Node]): Boolean = { if (a.forall(b.contains) || b.forall(a.contains)) { true - } else - false + } else false } def propClustering(triplesWithNumericLiteral: RDD[Triple]): RDD[(String, mutable.Set[(String, String, Object)])] = { val subMap = triplesWithNumericLiteral.map(f => (getLocalName1(f.getSubject), - (getLocalName1(f.getSubject), getLocalName1(f.getPredicate), getNumber(f.getObject.toString())))) //.partitionBy(new HashPartitioner(8)) //make a function instead of using + (getLocalName1(f.getSubject), getLocalName1(f.getPredicate), getNumber(f.getObject.toString())))) // .partitionBy(new HashPartitioner(8)) //make a function instead of using val initialSet = mutable.Set.empty[(String, String, Object)] val addToSet = (s: mutable.Set[(String, String, Object)], v: (String, String, Object)) => s += v @@ -266,7 +251,7 @@ class AnomalyWithHashingTF(nTriplesRDD: RDD[Triple], objList: List[String], } def iqr2(cluster: Seq[(String, String, Object)], anomalyListLimit: Int): Seq[(String, String, Object)] = { - //create sample data + // create sample data val listofData = cluster.map(b => (b._3.toString()).toDouble).toArray @@ -274,36 +259,33 @@ class AnomalyWithHashingTF(nTriplesRDD: RDD[Triple], objList: List[String], val arrMean = new DescriptiveStatistics() genericArrayOps(c).foreach(v => arrMean.addValue(v)) - + val Q1 = arrMean.getPercentile(25) val Q3 = arrMean.getPercentile(75) - + val IQR = Q3 - Q1 - + val lowerRange = Q1 - 1.5 * IQR - + val upperRange = Q3 + 1.5 * IQR - + val yse = c.filter(p => (p < lowerRange || p > upperRange)) val xde = cluster.filter(f => search(f._3.toString().toDouble, yse)) xde - + } def search(a: Double, b: Array[Double]): Boolean = { - if (b.contains(a)) - true - else - false - + if (b.contains(a)) true + else false } } object AnomalyWithHashingTF { def apply(nTriplesRDD: RDD[Triple], objList: List[String], triplesType: List[String], - JSimThreshold: Double, listSuperType: List[String], sparkSession: SparkSession, hypernym: String, numPartition: Int) = new AnomalyWithHashingTF(nTriplesRDD, objList, triplesType, + JSimThreshold: Double, listSuperType: List[String], sparkSession: SparkSession, hypernym: String, numPartition: Int): AnomalyWithHashingTF = new AnomalyWithHashingTF(nTriplesRDD, objList, triplesType, JSimThreshold, listSuperType, sparkSession, hypernym, numPartition) } diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/readme.md 
b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/README.md similarity index 100% rename from sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/readme.md rename to sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/README.md diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/CharactersFeatures.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/CharactersFeatures.scala index e2b6641..056f6cd 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/CharactersFeatures.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/CharactersFeatures.scala @@ -1,7 +1,8 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection +import java.util.regex.{ Matcher, Pattern } + import org.apache.spark.ml.linalg.{ Vector, Vectors } -import java.util.regex.{ Pattern, Matcher } class CharactersFeatures extends Serializable { @@ -10,7 +11,6 @@ class CharactersFeatures extends Serializable { val rounded: Double = Math.round(va * 10000).toDouble / 10000 rounded - } def Vector_Characters_Feature(StrValue: String): Array[Double] = { @@ -19,148 +19,145 @@ class CharactersFeatures extends Serializable { var RatioValues = new Array[Double](25) // Index is Important here val characterFeature_OBJ = new CharactersFeatures() - //1.Double result Value for uppercase Ration + // 1.Double result Value for uppercase Ration val uppercase = characterFeature_OBJ.UppercaseRation_Character(StrValue) if (!uppercase.isNaN()) { RatioValues(0) = RoundDouble(uppercase) } - //2.Double result Value for lowerCase Ratio + // 2.Double result Value for lowerCase Ratio val lowerCase = characterFeature_OBJ.LowercaseRation_Character(StrValue) if (!lowerCase.isNaN()) { RatioValues(1) = RoundDouble(lowerCase) } - //3.Double result Value for Alphanumeric Ratio + // 3.Double result Value for Alphanumeric Ratio val Alphanumeric = characterFeature_OBJ.AlphanumericRation_Character(StrValue) if (!Alphanumeric.isNaN()) { RatioValues(2) = RoundDouble(Alphanumeric) } - //4.Double result Value for ASCII Ratio + // 4.Double result Value for ASCII Ratio val ASCII = characterFeature_OBJ.ASCIIRation_Character(StrValue) if (!ASCII.isNaN()) { RatioValues(3) = RoundDouble(ASCII) } - //5.Double result Value for Bracket Ratio + // 5.Double result Value for Bracket Ratio val Bracket = characterFeature_OBJ.BracketRation_Character(StrValue) if (!Bracket.isNaN()) { RatioValues(4) = RoundDouble(Bracket) } - //6.Double result Value for Digits Ratio + // 6.Double result Value for Digits Ratio val Digits = characterFeature_OBJ.DigitsRation_Character(StrValue) if (!Digits.isNaN()) { RatioValues(5) = RoundDouble(Digits) } - //7.Double result Value for Latin Ratio + // 7.Double result Value for Latin Ratio val Latin = characterFeature_OBJ.Latin_Character(StrValue) if (!Latin.isNaN()) { RatioValues(6) = RoundDouble(Latin) } - //8.Double result Value for WhiteSpace Ratio + // 8.Double result Value for WhiteSpace Ratio val WhiteSpace = characterFeature_OBJ.WhiteSpace_Character(StrValue) if (!WhiteSpace.isNaN()) { RatioValues(7) = RoundDouble(WhiteSpace) } - //9.Double result Value for punc Ratio + // 9.Double result Value for punc Ratio val punc = characterFeature_OBJ.Punct_Character(StrValue) if (!punc.isNaN()) { RatioValues(8) = RoundDouble(punc) } - //10. 
Integer to Double result Value for LongCharacterSequence (1 integer) + // 10. Integer to Double result Value for LongCharacterSequence (1 integer) val LongCharacterSequence = characterFeature_OBJ.Longcharactersequence_Character(StrValue) if (!LongCharacterSequence.isNaN()) { RatioValues(9) = LongCharacterSequence } - //11.Double result Value for ArabicCharacter + // 11.Double result Value for ArabicCharacter val ArabicCharacter = characterFeature_OBJ.ArabicRation_Character(StrValue) if (!ArabicCharacter.isNaN()) { RatioValues(10) = RoundDouble(ArabicCharacter) } - //12.Double result Value for Bengali + // 12.Double result Value for Bengali val Bengali = characterFeature_OBJ.BengaliRation_Character(StrValue) if (!Bengali.isNaN()) { RatioValues(11) = RoundDouble(Bengali) } - //13.Double result Value for Brahmi + // 13.Double result Value for Brahmi val Brahmi = characterFeature_OBJ.BrahmiRation_Character(StrValue) if (!Brahmi.isNaN()) { RatioValues(12) = RoundDouble(Brahmi) - } - //14.Double result Value for Cyrillic + // 14.Double result Value for Cyrillic val Cyrillic = characterFeature_OBJ.CyrillicRation_Character(StrValue) if (!Cyrillic.isNaN()) { RatioValues(13) = RoundDouble(Cyrillic) - } - //15.Double result Value for Han + // 15.Double result Value for Han val Han = characterFeature_OBJ.HanRatio_Character(StrValue) if (!Han.isNaN()) { RatioValues(14) = RoundDouble(Han) - } - //16.Double result Value for Malysia + // 16.Double result Value for Malysia val Malysia = characterFeature_OBJ.MalaysRatio_Character(StrValue) if (!Malysia.isNaN()) { RatioValues(15) = RoundDouble(Malysia) } - //17.Double result Value for Tami + // 17.Double result Value for Tami val Tami = characterFeature_OBJ.TamilRatio_Character(StrValue) if (!Tami.isNaN()) { RatioValues(16) = RoundDouble(Tami) } - //18.Double result Value for Telugu + // 18.Double result Value for Telugu val Telugu = characterFeature_OBJ.TeluguRatio_Character(StrValue) if (!Telugu.isNaN()) { RatioValues(17) = RoundDouble(Telugu) } - //19.Double result Value for Symbol + // 19.Double result Value for Symbol val Symbol = characterFeature_OBJ.Symbol_Character(StrValue) if (!Symbol.isNaN()) { RatioValues(18) = RoundDouble(Symbol) } - //20. Double Alphabets Ration: + // 20. Double Alphabets Ration: val Alphabets = characterFeature_OBJ.AlphaBetsRation_Character(StrValue) if (!Alphabets.isNaN()) { RatioValues(19) = RoundDouble(Alphabets) } - //21. Double AVisible character Ratio: + // 21. Double AVisible character Ratio: val Visible = characterFeature_OBJ.VisibleRation_Character(StrValue) if (!Visible.isNaN()) { RatioValues(20) = RoundDouble(Visible) } - //22. Double Printable character Ratio: + // 22. Double Printable character Ratio: val Printable = characterFeature_OBJ.PrintableRation_Character(StrValue) if (!Printable.isNaN()) { RatioValues(21) = RoundDouble(Printable) } - //23.Double Blank character Ratio: + // 23.Double Blank character Ratio: val Blank = characterFeature_OBJ.BlankRation_Character(StrValue) if (!Blank.isNaN()) { RatioValues(22) = RoundDouble(Blank) } - //24.Double A control character: + // 24.Double A control character: val Control = characterFeature_OBJ.ControlRation_Character(StrValue) if (!Control.isNaN()) { RatioValues(23) = RoundDouble(Control) } - - //25. Double A hexadecimal digit : + // 25. 
Double A hexadecimal digit : val hexadecimal = characterFeature_OBJ.HexaRation_Character(StrValue) if (!hexadecimal.isNaN()) { RatioValues(24) = RoundDouble(hexadecimal) } + // val FacilityOBJ = new FacilitiesClass() // val vector_Values = FacilityOBJ.ToVector(RatioValues) @@ -176,7 +173,8 @@ class CharactersFeatures extends Serializable { } charRatio } - //1.Uppercase Ratio: + + // 1.Uppercase Ratio: def UppercaseRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{javaUpperCase}") val result: Double = characterRatio(str, pattern) @@ -187,51 +185,51 @@ class CharactersFeatures extends Serializable { val result: Double = characterRatio(str, pattern) result } - //3.Alphanumeric + // 3.Alphanumeric def AlphanumericRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{Alnum}") val result: Double = characterRatio(str, pattern) result } - //4.ASCII + // 4.ASCII def ASCIIRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{ASCII}") val result: Double = characterRatio(str, pattern) result } - //5.Bracket + // 5.Bracket def BracketRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\(|\\)|\\}|\\{|\\[|\\]") val result: Double = characterRatio(str, pattern) result } - //6.Digits + // 6.Digits def DigitsRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\d") val result: Double = characterRatio(str, pattern) result } - //7.Latin + // 7.Latin def Latin_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{IsLatin}") val result: Double = characterRatio(str, pattern) result } - //8.WhiteSpace + // 8.WhiteSpace def WhiteSpace_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\s") val result: Double = characterRatio(str, pattern) result } - //9.Punct + // 9.Punct def Punct_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{Punct}") val result: Double = characterRatio(str, pattern) result } - //10.Long character sequence: + // 10.Long character sequence: def Longcharactersequence_Character(str: String): Double = { var text: String = str var maxlength: Integer = null @@ -265,96 +263,96 @@ class CharactersFeatures extends Serializable { } - //11.ARabic Ratio: + // 11.ARabic Ratio: def ArabicRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{IsArabic}") val result: Double = characterRatio(str, pattern) result } - //12. Bengali Ratio + // 12. 
Bengali Ratio def BengaliRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{IsBengali}") val result: Double = characterRatio(str, pattern) result } - //13.Brahmi Ratio + // 13.Brahmi Ratio def BrahmiRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{IsBrahmi}") val result: Double = characterRatio(str, pattern) result } - //14.Cyrillic Ratio + // 14.Cyrillic Ratio def CyrillicRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{IsCyrillic}") val result: Double = characterRatio(str, pattern) result } - //15.HanRatio + // 15.HanRatio def HanRatio_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{IsHan}") val result: Double = characterRatio(str, pattern) result } - //16.Malaysian Ratio: + // 16.Malaysian Ratio: def MalaysRatio_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{IsMalayalam}") val result: Double = characterRatio(str, pattern) result } - //17.Tamil Ratio: + // 17.Tamil Ratio: def TamilRatio_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{IsTamil}") val result: Double = characterRatio(str, pattern) result } - //18.Telugu Ration: + // 18.Telugu Ration: def TeluguRatio_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{IsTelugu}") val result: Double = characterRatio(str, pattern) result } - //19.Symbols Ratio : + // 19.Symbols Ratio : def Symbol_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("[#$%&@+-_+*/]*") val result: Double = characterRatio(str, pattern) result } - //20.Alphabets Ratio : + // 20.Alphabets Ratio : def AlphaBetsRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{Alpha}") val result: Double = characterRatio(str, pattern) result } - //21.A visible character Ratio: + // 21.A visible character Ratio: def VisibleRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{Graph}") val result: Double = characterRatio(str, pattern) result } - //22.A printable character + // 22.A printable character def PrintableRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{Print}") val result: Double = characterRatio(str, pattern) result } - //23.A Black(it is different from White space) character Ratio + // 23.A Black(it is different from White space) character Ratio def BlankRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{Blank}") val result: Double = characterRatio(str, pattern) result } - //24.Control character Ratio + // 24.Control character Ratio def ControlRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{Cntrl}") val result: Double = characterRatio(str, pattern) result } - //25.HexaDecimal character Ratio + // 25.HexaDecimal character Ratio def HexaRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{XDigit}") val result: Double = characterRatio(str, pattern) @@ -362,4 +360,4 @@ class CharactersFeatures extends Serializable { } // Character features: ------ End calculation the Ratio for character: -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/Classifiers.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/Classifiers.scala index 3549c50..2aa2ea3 100644 --- 
a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/Classifiers.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/Classifiers.scala @@ -1,33 +1,27 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection -import org.apache.spark.{ SparkContext, RangePartitioner } -import org.apache.spark.rdd.RDD -import org.apache.spark.sql._ -import org.apache.spark.sql.types.{ DoubleType, StringType, IntegerType, StructField, StructType } -import org.apache.spark.ml.linalg.{ Vector, Vectors } -import org.apache.spark.ml.classification.{ GBTClassificationModel, GBTClassifier } -import org.apache.spark.ml.classification.DecisionTreeClassificationModel -import org.apache.spark.ml.classification.DecisionTreeClassifier -import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator -import org.apache.spark.ml.classification.LogisticRegression -import org.apache.spark.ml.classification.MultilayerPerceptronClassifier +import java.io.{ File, IOException } +import java.text.SimpleDateFormat +import java.util.{ Calendar, Date } + import scala.collection.mutable -import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics -import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator -import org.apache.spark.ml.feature.{ IndexToString, StringIndexer, VectorIndexer } -import org.apache.spark.ml.classification.{ RandomForestClassificationModel, RandomForestClassifier } + +import org.apache.commons.io.FileUtils +import org.apache.spark.{ RangePartitioner, SparkContext } import org.apache.spark.ml.Pipeline -import org.apache.commons.io.FileUtils; -import java.io.File; -import java.io.IOException; -import java.util.Calendar -import java.text.SimpleDateFormat -import java.util.Date -import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD} +import org.apache.spark.ml.classification.{ DecisionTreeClassificationModel, DecisionTreeClassifier, GBTClassificationModel, GBTClassifier, LogisticRegression, MultilayerPerceptronClassifier, RandomForestClassificationModel, RandomForestClassifier } +import org.apache.spark.ml.evaluation.{ BinaryClassificationEvaluator, MulticlassClassificationEvaluator } +import org.apache.spark.ml.feature.{ IndexToString, StringIndexer, VectorIndexer } +import org.apache.spark.ml.linalg.{ Vector, Vectors } +import org.apache.spark.mllib.classification.{ SVMModel, SVMWithSGD } +import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics +import org.apache.spark.rdd.RDD +import org.apache.spark.sql._ +import org.apache.spark.sql.types.{ DoubleType, IntegerType, StringType, StructField, StructType } class Classifiers extends Serializable { - //1.ok ----- + // 1.ok ----- def RandomForestClassifer(DF_Training: DataFrame, DF_Testing: DataFrame, sc: SparkContext): String = { val sqlContext = new org.apache.spark.sql.SQLContext(sc) @@ -48,7 +42,8 @@ class Classifiers extends Serializable { // val Array(DF_Testing) = DF_Testing//.randomSplit(Array(0.100)) // Train a RandomForest model. - val rf = new RandomForestClassifier().setImpurity("gini").setMaxDepth(3).setNumTrees(20).setFeatureSubsetStrategy("auto").setSeed(5043).setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures") //.setNumTrees(20) + val rf = new RandomForestClassifier().setImpurity("gini").setMaxDepth(3).setNumTrees(20).setFeatureSubsetStrategy("auto").setSeed(5043) + .setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures") // .setNumTrees(20) // Convert indexed labels back to original labels. 
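For readers following RandomForestClassifer, the fragment below restates the pipeline shape it builds (StringIndexer, VectorIndexer, RandomForestClassifier, IndexToString) as a self-contained sketch. `training` and `testing` are hypothetical DataFrames with "label" and "features" columns rather than the project's DF_Training/DF_Testing, and the VectorIndexer settings are assumptions since that part of the method is outside this hunk.

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.feature.{ IndexToString, StringIndexer, VectorIndexer }
import org.apache.spark.sql.DataFrame

// Minimal sketch of the pipeline shape used above.
def randomForestSketch(training: DataFrame, testing: DataFrame): DataFrame = {
  val labelIndexer = new StringIndexer()
    .setInputCol("label").setOutputCol("indexedLabel").fit(training)
  val featureIndexer = new VectorIndexer()
    .setInputCol("features").setOutputCol("indexedFeatures").setMaxCategories(4).fit(training)
  val rf = new RandomForestClassifier()
    .setImpurity("gini").setMaxDepth(3).setNumTrees(20)
    .setFeatureSubsetStrategy("auto").setSeed(5043)
    .setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures")
  val labelConverter = new IndexToString()
    .setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)
  val pipeline = new Pipeline().setStages(Array(labelIndexer, featureIndexer, rf, labelConverter))
  pipeline.fit(training).transform(testing)
}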
val labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels) @@ -66,7 +61,7 @@ class Classifiers extends Serializable { val finlaPrediction = predictions.select("Rid", "features", "FinalROLLBACK_REVERTED", "predictedLabel") predictions.show() - //Case1 : BinaryClassificationEvaluator:OK ------------------------------------------------------ + // Case1 : BinaryClassificationEvaluator:OK ------------------------------------------------------ val binaryClassificationEvaluator = new BinaryClassificationEvaluator().setLabelCol("indexedLabel").setRawPredictionCol("rawPrediction") var results1 = 0.0 def printlnMetricCAse1(metricName: String): Double = { @@ -79,7 +74,7 @@ class Classifiers extends Serializable { val PR = printlnMetricCAse1("areaUnderPR") // Case 2: MulticlassClassificationEvaluator:OK ----------------------------------------------------- - //Select (prediction, true label) and compute test error. + // Select (prediction, true label) and compute test error. val MulticlassClassificationEvaluator = new MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction") var results2 = 0.0 @@ -93,10 +88,10 @@ class Classifiers extends Serializable { val Recall = printlnMetricCase2("weightedRecall") val finalResult = "ROC=" + ROC.toString() + "|" + "PR=" + PR.toString() + "|" + "accuracy=" + accuracy.toString() + "|" + "Precision=" + Precision.toString() + "|" + "Recall=" + Recall.toString() - finalResult + finalResult } - //2.ok------ + // 2.ok------ def DecisionTreeClassifier(DF_Training: DataFrame, DF_Testing: DataFrame, sc: SparkContext): String = { val sqlContext = new org.apache.spark.sql.SQLContext(sc) @@ -135,40 +130,38 @@ class Classifiers extends Serializable { val predictions = modelxx.transform(TestingData) // Select example rows to display. - //val finlaPrediction = predictions.select("Rid", "features", "FinalROLLBACK_REVERTED", "predictedLabel") + // val finlaPrediction = predictions.select("Rid", "features", "FinalROLLBACK_REVERTED", "predictedLabel") - //Case1 : BinaryClassificationEvaluator:---------------------------------------------------------- + // Case1 : BinaryClassificationEvaluator:---------------------------------------------------------- val binaryClassificationEvaluator = new BinaryClassificationEvaluator().setLabelCol("indexedLabel").setRawPredictionCol("rawPrediction") - - var result1=0.0 + + var result1 = 0.0 def printlnMetricCAse1(metricName: String): Double = { - result1 =binaryClassificationEvaluator.setMetricName(metricName).evaluate(predictions) - println(metricName + " = " +result1 ) - + result1 = binaryClassificationEvaluator.setMetricName(metricName).evaluate(predictions) + println(metricName + " = " + result1) + result1 } val ROC = printlnMetricCAse1("areaUnderROC") val PR = printlnMetricCAse1("areaUnderPR") - //Case 2: MulticlassClassificationEvaluator:----------------------------------------------------- - //Select (prediction, true label) and compute test error. + // Case 2: MulticlassClassificationEvaluator:----------------------------------------------------- + // Select (prediction, true label) and compute test error. 
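The two evaluation passes composed above (areaUnderROC/areaUnderPR from BinaryClassificationEvaluator, accuracy/weightedPrecision/weightedRecall from MulticlassClassificationEvaluator) can be condensed into one helper; `predictions` is a hypothetical DataFrame produced by a fitted pipeline, with "indexedLabel", "rawPrediction" and "prediction" columns.

import org.apache.spark.ml.evaluation.{ BinaryClassificationEvaluator, MulticlassClassificationEvaluator }
import org.apache.spark.sql.DataFrame

// Sketch of the metric string assembled by the classifier methods above.
def evaluateSketch(predictions: DataFrame): String = {
  val binary = new BinaryClassificationEvaluator()
    .setLabelCol("indexedLabel").setRawPredictionCol("rawPrediction")
  val multi = new MulticlassClassificationEvaluator()
    .setLabelCol("indexedLabel").setPredictionCol("prediction")

  val roc       = binary.setMetricName("areaUnderROC").evaluate(predictions)
  val pr        = binary.setMetricName("areaUnderPR").evaluate(predictions)
  val accuracy  = multi.setMetricName("accuracy").evaluate(predictions)
  val precision = multi.setMetricName("weightedPrecision").evaluate(predictions)
  val recall    = multi.setMetricName("weightedRecall").evaluate(predictions)

  s"ROC=$roc|PR=$pr|accuracy=$accuracy|Precision=$precision|Recall=$recall"
}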
val MulticlassClassificationEvaluator = new MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction") - var result2=0.0 + var result2 = 0.0 def printlnMetricCase2(metricName: String): Double = { - result2=MulticlassClassificationEvaluator.setMetricName(metricName).evaluate(predictions) + result2 = MulticlassClassificationEvaluator.setMetricName(metricName).evaluate(predictions) println(metricName + " = " + result2) result2 } - val accuracy = printlnMetricCase2("accuracy") + val accuracy = printlnMetricCase2("accuracy") val Precision = printlnMetricCase2("weightedPrecision") val Recall = printlnMetricCase2("weightedRecall") - - val finalResult = "ROC=" + ROC.toString() + "|" + "PR=" + PR.toString() + "|" + "accuracy=" + accuracy.toString() + "|" + "Precision=" + Precision.toString() + "|" + "Recall=" + Recall.toString() + val finalResult = "ROC=" + ROC.toString() + "|" + "PR=" + PR.toString() + "|" + "accuracy=" + accuracy.toString() + "|" + "Precision=" + Precision.toString() + "|" + "Recall=" + Recall.toString() finalResult - - + } // 3.Ok -------- @@ -210,7 +203,7 @@ class Classifiers extends Serializable { predictions.show() - //Case1 : BinaryClassificationEvaluator:---------------------------------------------------------- + // Case1 : BinaryClassificationEvaluator:---------------------------------------------------------- val binaryClassificationEvaluator = new BinaryClassificationEvaluator().setLabelCol("indexedLabel").setRawPredictionCol("rawPrediction") var results1 = 0.0 def printlnMetricCase1(metricName: String): Double = { @@ -222,13 +215,13 @@ class Classifiers extends Serializable { val ROC = printlnMetricCase1("areaUnderROC") val PR = printlnMetricCase1("areaUnderPR") - //Case 2: MulticlassClassificationEvaluator:----------------------------------------------------- - //Select (prediction, true label) and compute test error. + // Case 2: MulticlassClassificationEvaluator:----------------------------------------------------- + // Select (prediction, true label) and compute test error. val MulticlassClassificationEvaluator = new MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction") - var result2=0.0 + var result2 = 0.0 def printlnMetricCase2(metricName: String): Double = { - - result2=MulticlassClassificationEvaluator.setMetricName(metricName).evaluate(predictions) + + result2 = MulticlassClassificationEvaluator.setMetricName(metricName).evaluate(predictions) println(metricName + " = " + result2) result2 } @@ -236,13 +229,12 @@ class Classifiers extends Serializable { val Precision = printlnMetricCase2("weightedPrecision") val Recall = printlnMetricCase2("weightedRecall") - - val finalResult = "ROC=" + ROC.toString() + "|" + "PR=" + PR.toString() + "|" + "accuracy=" + accuracy.toString() + "|" + "Precision=" + Precision.toString() + "|" + "Recall=" + Recall.toString() + val finalResult = "ROC=" + ROC.toString() + "|" + "PR=" + PR.toString() + "|" + "accuracy=" + accuracy.toString() + "|" + "Precision=" + Precision.toString() + "|" + "Recall=" + Recall.toString() + + finalResult - finalResult - } - //4. OK----- + // 4. OK----- def GradientBoostedTree(DF_Training: DataFrame, DF_Testing: DataFrame, sc: SparkContext): String = { val sqlContext = new org.apache.spark.sql.SQLContext(sc) @@ -265,7 +257,7 @@ class Classifiers extends Serializable { // val Array(trainingData, testData) = Data.randomSplit(Array(0.7, 0.3)) // Train a DecisionTree model. 
- val gbt = new GBTClassifier().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures") //.setMaxIter(10) + val gbt = new GBTClassifier().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures") // .setMaxIter(10) // Convert indexed labels back to original labels. val labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels) @@ -281,7 +273,7 @@ class Classifiers extends Serializable { // Select example rows to display. - //Case1 : BinaryClassificationEvaluator:---------------------------------------------------------- + // Case1 : BinaryClassificationEvaluator:---------------------------------------------------------- var predictionsRDD = predictions.select("prediction", "FinalROLLBACK_REVERTED").rdd var predictionAndLabels = predictionsRDD.map { row => (row.get(0).asInstanceOf[Double], row.get(1).asInstanceOf[Double]) } @@ -290,32 +282,31 @@ class Classifiers extends Serializable { println("Area under ROC = " + metrics.areaUnderROC()) println("Area under PR = " + metrics.areaUnderPR()) - val ROC =metrics.areaUnderROC() - val PR= metrics.areaUnderPR() - - - //Case 2: MulticlassClassificationEvaluator:----------------------------------------------------- - //Select (prediction, true label) and compute test error. + val ROC = metrics.areaUnderROC() + val PR = metrics.areaUnderPR() + + // Case 2: MulticlassClassificationEvaluator:----------------------------------------------------- + // Select (prediction, true label) and compute test error. val MulticlassClassificationEvaluator = new MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction") - var result2=0.0 + var result2 = 0.0 def printlnMetric(metricName: String): Double = { - - result2= MulticlassClassificationEvaluator.setMetricName(metricName).evaluate(predictions) - println(metricName + " = " +result2) + + result2 = MulticlassClassificationEvaluator.setMetricName(metricName).evaluate(predictions) + println(metricName + " = " + result2) result2 } val accuracy = printlnMetric("accuracy") val Precision = printlnMetric("weightedPrecision") val Recall = printlnMetric("weightedRecall") - + val finalResult = "ROC=" + ROC.toString() + "|" + "PR=" + PR.toString() + "|" + "accuracy=" + accuracy.toString() + "|" + "Precision=" + Precision.toString() + "|" + "Recall=" + Recall.toString() - finalResult + finalResult } - //5.Ok------------ + // 5.Ok------------ def MultilayerPerceptronClassifier(DF_Training: DataFrame, DF_Testing: DataFrame, sc: SparkContext): String = { val sqlContext = new org.apache.spark.sql.SQLContext(sc) @@ -352,7 +343,7 @@ class Classifiers extends Serializable { // predictions.show() - //Case1 : BinaryClassificationEvaluator:---------------------------------------------------------- + // Case1 : BinaryClassificationEvaluator:---------------------------------------------------------- var predictionsDF = predictions.select("prediction", "label") var predictionsRDD = predictions.select("prediction", "label").rdd var predictionAndLabels = predictionsRDD.map { row => (row.get(0).asInstanceOf[Double], row.get(1).asInstanceOf[Double]) } @@ -361,13 +352,10 @@ class Classifiers extends Serializable { println("Area under ROC = " + metrics.areaUnderROC()) println("Area under PR = " + metrics.areaUnderPR()) - - val ROC =metrics.areaUnderROC() - val PR= metrics.areaUnderPR() - - - - //Case 2: MulticlassClassificationEvaluator:----------------------------------------------------- + val ROC = metrics.areaUnderROC() + val PR = 
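GradientBoostedTree switches to mllib's BinaryClassificationMetrics over an RDD of (prediction, label) pairs instead of the DataFrame evaluator; a compact sketch of that path follows, with assumed column names ("label" standing in for FinalROLLBACK_REVERTED).

import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.sql.DataFrame

// Sketch: pull (prediction, label) pairs out of the predictions DataFrame
// and compute the two area metrics with mllib.
def areaMetricsSketch(predictions: DataFrame): (Double, Double) = {
  val scoreAndLabels = predictions.select("prediction", "label").rdd
    .map(row => (row.getDouble(0), row.getDouble(1)))
  val metrics = new BinaryClassificationMetrics(scoreAndLabels)
  (metrics.areaUnderROC(), metrics.areaUnderPR())
}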
metrics.areaUnderPR() + + // Case 2: MulticlassClassificationEvaluator:----------------------------------------------------- val accuracyevaluator = new MulticlassClassificationEvaluator().setMetricName("accuracy") val weightedPrecisionevaluator = new MulticlassClassificationEvaluator().setMetricName("weightedPrecision") val weightedRecallevaluator = new MulticlassClassificationEvaluator().setMetricName("weightedRecall") @@ -375,22 +363,14 @@ class Classifiers extends Serializable { println("Accuracy = " + accuracyevaluator.evaluate(predictionsDF)) println("weightedPrecision = " + weightedPrecisionevaluator.evaluate(predictionsDF)) println("weightedRecall = " + weightedRecallevaluator.evaluate(predictionsDF)) - - + val accuracy = accuracyevaluator.evaluate(predictionsDF) val Precision = weightedPrecisionevaluator.evaluate(predictionsDF) val Recall = weightedRecallevaluator.evaluate(predictionsDF) - - - val finalResult = "ROC=" + ROC.toString() + "|" + "PR=" + PR.toString() + "|" + "accuracy=" + accuracy.toString() + "|" + "Precision=" + Precision.toString() + "|" + "Recall=" + Recall.toString() - finalResult - - + val finalResult = "ROC=" + ROC.toString() + "|" + "PR=" + PR.toString() + "|" + "accuracy=" + accuracy.toString() + "|" + "Precision=" + Precision.toString() + "|" + "Recall=" + Recall.toString() + finalResult } - - - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/CommentProcessor.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/CommentProcessor.scala index 0f0ecc3..834cd7f 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/CommentProcessor.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/CommentProcessor.scala @@ -1,6 +1,7 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection import java.util.regex.{ Matcher, Pattern } + import org.slf4j.{ Logger, LoggerFactory } class CommentProcessor extends Serializable { @@ -97,7 +98,7 @@ class CommentProcessor extends Serializable { actions } - //Ok: helper for Revision Features: extract Action- subaction from comment: + // Ok: helper for Revision Features: extract Action- subaction from comment: def Extract_ActionsOfNormalComment(comment: String): String = { var result: Boolean = false @@ -108,7 +109,7 @@ class CommentProcessor extends Serializable { var Param = "" var parameters: Array[String] = Array.ofDim[String](0) var asterisk_Start = 0 // == /* - var asterisk_End = 0 //== */ + var asterisk_End = 0 // == */ var colon = 0 if (comment != null) { val check_asterisk_Start = comment.contains("/*") @@ -182,7 +183,7 @@ class CommentProcessor extends Serializable { var suffixComment = "" var asterisk_Start = 0 // == /* - var asterisk_End = 0 //== */ + var asterisk_End = 0 // == */ var colon = 0 if (comment != null) { @@ -246,7 +247,7 @@ class CommentProcessor extends Serializable { var Param = "" var parameters: Array[String] = Array.ofDim[String](0) var asterisk_Start = 0 // == /* - var asterisk_End = 0 //== */ + var asterisk_End = 0 // == */ var colon = 0 if (comment != null) { val check_asterisk_Start = comment.contains("/*") @@ -324,7 +325,7 @@ class CommentProcessor extends Serializable { var Param = "" var parameters: Array[String] = Array.ofDim[String](0) var asterisk_Start = 0 // == /* - var asterisk_End = 0 //== */ + var asterisk_End = 0 // == */ var colon = 0 if (comment != null) { val check_asterisk_Start = 
comment.contains("/*") @@ -403,7 +404,7 @@ class CommentProcessor extends Serializable { var Param = "" var parameters: Array[String] = Array.ofDim[String](0) var asterisk_Start = 0 // == /* - var asterisk_End = 0 //== */ + var asterisk_End = 0 // == */ var colon = 0 if (comment != null) { val check_asterisk_Start = comment.contains("/*") @@ -569,7 +570,7 @@ class CommentProcessor extends Serializable { } else { - //do not thing + // do not thing } @@ -584,7 +585,7 @@ class CommentProcessor extends Serializable { } - //"Thecommentis" + result_Str + "&&&" + "Ac1:" + Action1 + "&&&" + "Ac2 :" + Action2 + "&&&" + "SF:" + suffixComment + // "Thecommentis" + result_Str + "&&&" + "Ac1:" + Action1 + "&&&" + "Ac2 :" + Action2 + "&&&" + "SF:" + suffixComment def isRollback(comment: String): Boolean = { var result: Boolean = false if (comment != null) { @@ -594,8 +595,8 @@ class CommentProcessor extends Serializable { logger.debug("Robust but not precise rollback match (result = " + result + ") : " + tmp) } } - //result = tmp.startsWith("Reverted"); - //result = tmp.startsWith("Reverted"); + // result = tmp.startsWith("Reverted"); + // result = tmp.startsWith("Reverted"); result } @@ -613,8 +614,8 @@ class CommentProcessor extends Serializable { } } } - //result = (tmp.startsWith("Undid") || tmp.startsWith("Undo")) ; - //result = (tmp.startsWith("Undid") || tmp.startsWith("Undo")) ; + // result = (tmp.startsWith("Undid") || tmp.startsWith("Undo")) ; + // result = (tmp.startsWith("Undid") || tmp.startsWith("Undo")) ; result } @@ -632,8 +633,8 @@ class CommentProcessor extends Serializable { } } } - //result = (tmp.startsWith("Restored") || tmp.startsWith("Restore")); - //result = (tmp.startsWith("Restored") || tmp.startsWith("Restore")); + // result = (tmp.startsWith("Restored") || tmp.startsWith("Restore")); + // result = (tmp.startsWith("Restored") || tmp.startsWith("Restore")); result } @@ -693,7 +694,7 @@ class CommentProcessor extends Serializable { } def getUndoneRevisionId(comment: String): Long = { - var result: Long = 0l + var result: Long = 0L val matcher: Matcher = ROBUST_UNDO_PATTERN.matcher(comment) if (matcher.matches()) { val str: String = matcher.group(2) @@ -705,7 +706,7 @@ class CommentProcessor extends Serializable { } def getRestoredRevisionId(comment: String): Long = { - var result: Long = 0l + var result: Long = 0L val matcher: Matcher = ROBUST_RESTORE_PATTERN.matcher(comment) if (matcher.matches()) { val str: String = matcher.group(1) @@ -869,4 +870,4 @@ class CommentProcessor extends Serializable { def getItemValue(): String = itemValue -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/FacilitiesClass.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/FacilitiesClass.scala index a0902aa..333ec38 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/FacilitiesClass.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/FacilitiesClass.scala @@ -1,9 +1,9 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection +import org.apache.spark.ml.linalg.{ Vector, Vectors } import org.apache.spark.rdd.RDD import org.apache.spark.sql._ -import org.apache.spark.sql.types.{ DoubleType, StringType, IntegerType, StructField, StructType } -import org.apache.spark.ml.linalg.{ Vector, Vectors } +import org.apache.spark.sql.types.{ DoubleType, IntegerType, StringType, StructField, StructType 
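getUndoneRevisionId and getRestoredRevisionId share one idiom: match the whole comment, parse a numeric capture group, and fall back to 0L. The pattern below is a simplified stand-in for illustration, not the class's ROBUST_UNDO_PATTERN.

import java.util.regex.Pattern

// Simplified stand-in pattern; the real one is defined elsewhere in the class.
val undoSketchPattern = Pattern.compile(".*Undid revision (\\d+).*")

// Sketch of the group-extraction idiom used above.
def undoneRevisionIdSketch(comment: String): Long = {
  val m = undoSketchPattern.matcher(comment)
  if (m.matches()) {
    try m.group(1).toLong catch { case _: NumberFormatException => 0L }
  } else 0L
}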
} class FacilitiesClass extends Serializable { @@ -18,68 +18,68 @@ class FacilitiesClass extends Serializable { namesList } - //ok --- Used for DF Triples + // ok --- Used for DF Triples def RDD_TO_DFR_RDFXML(rdd: RDD[String], sqlContext: org.apache.spark.sql.SQLContext): DataFrame = { - //Create an Encoded Schema in a String Format: + // Create an Encoded Schema in a String Format: val schemaString = "Subject Predicate Object" - //Generate schema: - val schema = StructType(schemaString.split(" ").map(fieldName ⇒ StructField(fieldName, StringType, true))) - //Apply Transformation for Reading Data from Text File - val rowRDD = rdd.map(_.split(" ")).map(e ⇒ Row(e(0), e(1), e(2))) - //Apply RowRDD in Row Data based on Schema: + // Generate schema: + val schema = StructType(schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, true))) + // Apply Transformation for Reading Data from Text File + val rowRDD = rdd.map(_.split(" ")).map(e => Row(e(0), e(1), e(2))) + // Apply RowRDD in Row Data based on Schema: val RDFTRIPLE = sqlContext.createDataFrame(rowRDD, schema) - //Store DataFrame Data into Table + // Store DataFrame Data into Table RDFTRIPLE.registerTempTable("SPO") - //Select Query on DataFrame + // Select Query on DataFrame val dfr = sqlContext.sql("SELECT * FROM SPO") dfr.show() dfr } - //ok --- Used for DF Triples + // ok --- Used for DF Triples def RDD_TO_DFR_TRIX(rdd: RDD[String], sqlContext: org.apache.spark.sql.SQLContext): DataFrame = { - //Create an Encoded Schema in a String Format: + // Create an Encoded Schema in a String Format: val schemaString = "Subject Predicate Object" - //Generate schema: - val schema = StructType(schemaString.split(" ").map(fieldName ⇒ StructField(fieldName, StringType, true))) - //Apply Transformation for Reading Data from Text File - val rowRDD = rdd.map(_.split("><")).map(e ⇒ Row(e(0), e(1), e(2))) - //Apply RowRDD in Row Data based on Schema: + // Generate schema: + val schema = StructType(schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, true))) + // Apply Transformation for Reading Data from Text File + val rowRDD = rdd.map(_.split("><")).map(e => Row(e(0), e(1), e(2))) + // Apply RowRDD in Row Data based on Schema: val RDFTRIPLE = sqlContext.createDataFrame(rowRDD, schema) - //Store DataFrame Data into Table + // Store DataFrame Data into Table RDFTRIPLE.registerTempTable("SPO") - //Select Query on DataFrame + // Select Query on DataFrame val dfr = sqlContext.sql("SELECT * FROM SPO") dfr.show() dfr } - //ok --- Used for DF Triples + // ok --- Used for DF Triples def RDD_TO_DFR_JTriple(rdd: RDD[String], sqlContext: org.apache.spark.sql.SQLContext): DataFrame = { - //Create an Encoded Schema in a String Format: + // Create an Encoded Schema in a String Format: val schemaString = "Subject Predicate Object" - //Generate schema: - val schema = StructType(schemaString.split(" ").map(fieldName ⇒ StructField(fieldName, StringType, true))) - //Apply Transformation for Reading Data from Text File - val rowRDD = rdd.map(_.split(",")).map(e ⇒ Row(e(0), e(1), e(2))) - //Apply RowRDD in Row Data based on Schema: + // Generate schema: + val schema = StructType(schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, true))) + // Apply Transformation for Reading Data from Text File + val rowRDD = rdd.map(_.split(",")).map(e => Row(e(0), e(1), e(2))) + // Apply RowRDD in Row Data based on Schema: val RDFTRIPLE = sqlContext.createDataFrame(rowRDD, schema) - //Store DataFrame Data into Table + // 
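RDD_TO_DFR_RDFXML, RDD_TO_DFR_TRIX and RDD_TO_DFR_JTriple differ only in the separator used to split each line into Subject/Predicate/Object; a single parameterised sketch of that recipe, without the temp-table/SQL round trip:

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{ DataFrame, Row, SQLContext }
import org.apache.spark.sql.types.{ StringType, StructField, StructType }

// Sketch: build the three-column triple DataFrame for any of the separators
// used above (" ", "><" or ",").
def triplesToDF(rdd: RDD[String], sep: String, sqlContext: SQLContext): DataFrame = {
  val schema = StructType("Subject Predicate Object".split(" ")
    .map(name => StructField(name, StringType, nullable = true)))
  val rows = rdd.map(_.split(sep)).map(e => Row(e(0), e(1), e(2)))
  sqlContext.createDataFrame(rows, schema)
}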
Store DataFrame Data into Table RDFTRIPLE.registerTempTable("SPO") - //Select Query on DataFrame + // Select Query on DataFrame val dfr = sqlContext.sql("SELECT * FROM SPO") dfr.show() dfr } + def RoundDouble(va: Double): Double = { val rounded: Double = Math.round(va * 10000).toDouble / 10000 rounded - } def stringToInt(str: String): Integer = { @@ -139,7 +139,5 @@ class FacilitiesClass extends Serializable { } tem.trim() - } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ItemFeatures.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ItemFeatures.scala index b4fc8c1..2992634 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ItemFeatures.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ItemFeatures.scala @@ -1,10 +1,10 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection -import java.util.regex.{ Pattern, Matcher } +import java.util.regex.{ Matcher, Pattern } class ItemFeatures extends Serializable { - //1. + // 1. def Get_NumberOfLabels(str: String): Double = { // from Label Tag @@ -15,11 +15,9 @@ class ItemFeatures extends Serializable { while (matcher.find()) { count += 1; count - 1 } count - - count } - //2. + // 2. def Get_NumberOfDescription(str: String): Double = { // from description tag @@ -30,11 +28,9 @@ class ItemFeatures extends Serializable { while (matcher.find()) { count += 1; count - 1 } count - - count } - //3. + // 3. def Get_NumberOfAliases(str: String): Double = { // from Aliases Tag @@ -45,11 +41,9 @@ class ItemFeatures extends Serializable { while (matcher.find()) { count += 1; count - 1 } count - - count } - //4. + // 4. def Get_NumberOfClaim(str: String): Double = { // from claim tag @@ -60,10 +54,8 @@ class ItemFeatures extends Serializable { while (matcher.find()) { count += 1; count - 1 } count - - count } - //5. + // 5. def Get_NumberOfSiteLinks(str: String): Double = { // from Sitelink tag @@ -74,10 +66,8 @@ class ItemFeatures extends Serializable { while (matcher.find()) { count += 1; count - 1 } count - - count } - //6. + // 6. def Get_NumberOfstatements(str: String): Double = { // from claims tag @@ -88,10 +78,8 @@ class ItemFeatures extends Serializable { while (matcher.find()) { count += 1; count - 1 } count - - count } - //7. + // 7. def Get_NumberOfReferences(str: String): Double = { @@ -107,7 +95,7 @@ class ItemFeatures extends Serializable { count } - //8. + // 8. def Get_NumberOfQualifier(str: String): Double = { // from claims tag @@ -118,10 +106,8 @@ class ItemFeatures extends Serializable { while (matcher.find()) { count += 1; count - 1 } count - - count } - //9. + // 9. def Get_NumberOfQualifier_Order(str: String): Double = { // from claims tag val input: String = str @@ -131,12 +117,9 @@ class ItemFeatures extends Serializable { while (matcher.find()) { count += 1; count - 1 } count - - count } - //10. + // 10. 
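Every Get_NumberOf* method in ItemFeatures counts occurrences of one JSON key ("labels", "descriptions", "aliases", ...) in the revision text; the generic counter below captures that idiom, with the key passed in as a parameter for illustration.

import java.util.regex.Pattern

// Sketch of the counting idiom shared by the Get_NumberOf* methods:
// count occurrences of a JSON key such as "labels": in the revision text.
def countKeySketch(json: String, key: String): Double = {
  val pattern = Pattern.compile("\"" + key + "\"" + ":")
  val matcher = pattern.matcher(json)
  var count = 0
  while (matcher.find()) count += 1
  count.toDouble
}

// e.g. countKeySketch(revisionJson, "labels") plays the role of Get_NumberOfLabels.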
def Get_NumberOfBadges(str: String): Double = { - // from Sitelink tag val input: String = str val pattern: Pattern = Pattern.compile(""""badges"""" + ":") @@ -145,8 +128,6 @@ class ItemFeatures extends Serializable { while (matcher.find()) { count += 1; count - 1 } count - - count } -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/Main.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/Main.scala index 02f0bdd..5fa21d8 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/Main.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/Main.scala @@ -1,8 +1,7 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection -import org.apache.spark.{ SparkConf, SparkContext } +import org.apache.spark.{ RangePartitioner, SparkConf, SparkContext } import org.apache.spark.sql._ -import org.apache.spark.{ SparkContext, RangePartitioner } object Main { @@ -19,7 +18,7 @@ object Main { if (num == "1") { Start.Start_RDF_Parser_Appraoch(sc) - } // Distributed Standard Parser and Vandalism Detection : + } // Distributed Standard Parser and Vandalism Detection: else if (num == "2") { val Training_Data = Start.Training_Start_StandardXMLParser_VD(sc) @@ -27,22 +26,21 @@ object Main { val OBJClassifiers = new Classifiers() - //1.Random Forest Classifer: + // 1.Random Forest Classifer: val RandomForestClassifer_Values = OBJClassifiers.RandomForestClassifer(Training_Data, Testing_Data, sc) - //2.DecisionTreeClassifier + // 2.DecisionTreeClassifier val DecisionTreeClassifier_values = OBJClassifiers.DecisionTreeClassifier(Training_Data, Testing_Data, sc) // 3.LogisticRegrision val LogisticRegrision_values = OBJClassifiers.LogisticRegrision(Training_Data, Testing_Data, sc) - //4.GradientBoostedTree + // 4.GradientBoostedTree val GradientBoostedTree_values = OBJClassifiers.GradientBoostedTree(Training_Data, Testing_Data, sc) - //5.MultilayerPerceptronClassifier + // 5.MultilayerPerceptronClassifier val MultilayerPerceptronClassifier_values = OBJClassifiers.MultilayerPerceptronClassifier(Training_Data, Testing_Data, sc) - println(RandomForestClassifer_Values) println(DecisionTreeClassifier_values) println(LogisticRegrision_values) @@ -52,4 +50,4 @@ object Main { } } -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseJTriple.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseJTriple.scala index 122e297..395e53b 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseJTriple.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseJTriple.scala @@ -1,24 +1,25 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection -import org.apache.spark.SparkContext + +import java.io.ByteArrayInputStream +import java.util.ArrayList +import java.util.regex.Pattern + import org.apache.hadoop.mapred.JobConf -import org.apache.spark.rdd.RDD import org.apache.jena.graph.Triple import org.apache.jena.rdf.model.ModelFactory -import java.util.ArrayList -import java.util.regex.Pattern -import java.io.ByteArrayInputStream - -class ParseJTriple extends Serializable{ - - - def Start_JTriple_Parser(jobConf_Record: JobConf, sc: SparkContext): RDD[String] = { +import org.apache.spark.SparkContext +import 
org.apache.spark.rdd.RDD + +class ParseJTriple extends Serializable { + + def Start_JTriple_Parser(jobConf_Record: JobConf, sc: SparkContext): RDD[String] = { jobConf_Record.set("stream.recordreader.class", "org.apache.hadoop.streaming.StreamXmlRecordReader") jobConf_Record.set("stream.recordreader.begin", """"s":""") // start Tag jobConf_Record.set("stream.recordreader.end", "}") // End Tag org.apache.hadoop.mapred.FileInputFormat.addInputPaths(jobConf_Record, "hdfs://localhost:9000/mydata/xxx.json") // input path from Hadoop - //------------JTriple Record + // ------------JTriple Record // read data and save in RDD as block- JTriple Record val JTriple_Dataset_Record = sc.hadoopRDD(jobConf_Record, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) // println("HelloRecords" + " " + JTriple_Dataset_Record.count) @@ -29,14 +30,10 @@ class ParseJTriple extends Serializable{ val RevisioninOneString = JTriple_Dataset_Record_AsstringBlock.map(line => New_abendRevision(line)).distinct().cache() RevisioninOneString } - def New_abendRevision(str: String): String = { + def New_abendRevision(str: String): String = { val s1 = str.replaceAll("[\r\n]+", " "); - val s2 = s1.replaceAll("[.\\s]","").trim() - + val s2 = s1.replaceAll("[.\\s]", "").trim() s2 } - - - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseNormalXML.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseNormalXML.scala index 5b70361..cea1e38 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseNormalXML.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseNormalXML.scala @@ -1,19 +1,20 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection -import org.apache.spark.SparkContext -import org.apache.hadoop.mapred.JobConf -import org.apache.spark.rdd.RDD import java.math.BigInteger +import java.net.InetAddress import java.util.ArrayList -import org.apache.commons.lang3.ArrayUtils import java.util.regex.{ Matcher, Pattern } -import java.net.InetAddress + +import org.apache.commons.lang3.ArrayUtils +import org.apache.hadoop.mapred.JobConf +import org.apache.spark.SparkContext +import org.apache.spark.rdd.RDD class ParseNormalXML extends Serializable { def Training_DB_NormalXML_Parser_Input1(sc: SparkContext): RDD[String] = { - //Streaming records:==================================================================Input Files + // Streaming records:==================================================================Input Files val jobConf = new JobConf() jobConf.set("stream.recordreader.class", "org.apache.hadoop.streaming.StreamXmlRecordReader") jobConf.set("stream.recordreader.begin", "") // start Tag @@ -21,10 +22,10 @@ class ParseNormalXML extends Serializable { org.apache.hadoop.mapred.FileInputFormat.addInputPaths(jobConf, "hdfs://localhost:9000/mydata/sample.xml") // input path from Hadoop // read data and save in RDD as block - val wikiData = sc.hadoopRDD(jobConf, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) //.distinct() + val wikiData = sc.hadoopRDD(jobConf, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) // .distinct() println(wikiData.count) val RevisionTagewikidata = 
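Start_JTriple_Parser and the ParseNormalXML readers all rely on Hadoop's StreamXmlRecordReader to cut the input into blocks between a begin and an end marker; a parameterised sketch of that setup follows, with the HDFS path left as a placeholder argument rather than the hard-coded paths used above.

import org.apache.hadoop.mapred.JobConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

// Sketch: read record blocks delimited by `begin`/`end` tags and return them as strings.
def streamRecords(sc: SparkContext, begin: String, end: String, path: String): RDD[String] = {
  val jobConf = new JobConf()
  jobConf.set("stream.recordreader.class", "org.apache.hadoop.streaming.StreamXmlRecordReader")
  jobConf.set("stream.recordreader.begin", begin)
  jobConf.set("stream.recordreader.end", end)
  org.apache.hadoop.mapred.FileInputFormat.addInputPaths(jobConf, path)
  sc.hadoopRDD(jobConf,
    classOf[org.apache.hadoop.streaming.StreamInputFormat],
    classOf[org.apache.hadoop.io.Text],
    classOf[org.apache.hadoop.io.Text])
    .map { case (key, _) => key.toString }
}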
wikiData.map { case (x, y) => (x.toString()) } - //println(RevisionTagewikidata.count) + // println(RevisionTagewikidata.count) // ABend the revision in one line string val RevisioninOneString = RevisionTagewikidata.map(line => New_abendRevision(line)).cache() @@ -37,7 +38,7 @@ class ParseNormalXML extends Serializable { } def Training_DB_NormalXML_Parser_Input2(sc: SparkContext): RDD[String] = { - //Streaming records:==================================================================Input Files + // Streaming records:==================================================================Input Files val jobConf = new JobConf() jobConf.set("stream.recordreader.class", "org.apache.hadoop.streaming.StreamXmlRecordReader") jobConf.set("stream.recordreader.begin", "") // start Tag @@ -45,10 +46,10 @@ class ParseNormalXML extends Serializable { org.apache.hadoop.mapred.FileInputFormat.addInputPaths(jobConf, "hdfs://localhost:9000/mydata/2.xml") // input path from Hadoop // read data and save in RDD as block - val wikiData = sc.hadoopRDD(jobConf, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) //.distinct() + val wikiData = sc.hadoopRDD(jobConf, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) // .distinct() println(wikiData.count) val RevisionTagewikidata = wikiData.map { case (x, y) => (x.toString()) } - //println(RevisionTagewikidata.count) + // println(RevisionTagewikidata.count) // ABend the revision in one line string val RevisioninOneString = RevisionTagewikidata.map(line => New_abendRevision(line)).cache() @@ -61,7 +62,7 @@ class ParseNormalXML extends Serializable { } def Training_DB_NormalXML_Parser_Input3(sc: SparkContext): RDD[String] = { - //Streaming records:==================================================================Input Files + // Streaming records:==================================================================Input Files val jobConf = new JobConf() jobConf.set("stream.recordreader.class", "org.apache.hadoop.streaming.StreamXmlRecordReader") jobConf.set("stream.recordreader.begin", "") // start Tag @@ -69,10 +70,10 @@ class ParseNormalXML extends Serializable { org.apache.hadoop.mapred.FileInputFormat.addInputPaths(jobConf, "hdfs://localhost:9000/mydata/3.xml") // input path from Hadoop // read data and save in RDD as block - val wikiData = sc.hadoopRDD(jobConf, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) //.distinct() + val wikiData = sc.hadoopRDD(jobConf, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) // .distinct() println(wikiData.count) val RevisionTagewikidata = wikiData.map { case (x, y) => (x.toString()) } - //println(RevisionTagewikidata.count) + // println(RevisionTagewikidata.count) // ABend the revision in one line string val RevisioninOneString = RevisionTagewikidata.map(line => New_abendRevision(line)).cache() // println("TotalCount" + " " + RevisioninOneString.count) @@ -83,9 +84,9 @@ class ParseNormalXML extends Serializable { } - def Testing_DB_NormalXML_Parser(sc: SparkContext): RDD[String] = { + def Testing_DB_NormalXML_Parser(sc: SparkContext): RDD[String] = { - //Streaming records:==================================================================Input Files + // Streaming 
records:==================================================================Input Files val jobConf = new JobConf() jobConf.set("stream.recordreader.class", "org.apache.hadoop.streaming.StreamXmlRecordReader") jobConf.set("stream.recordreader.begin", "") // start Tag @@ -93,10 +94,10 @@ class ParseNormalXML extends Serializable { org.apache.hadoop.mapred.FileInputFormat.addInputPaths(jobConf, "hdfs://localhost:9000/mydata/3.xml") // input path from Hadoop // read data and save in RDD as block - val wikiData = sc.hadoopRDD(jobConf, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) //.distinct() + val wikiData = sc.hadoopRDD(jobConf, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) // .distinct() println(wikiData.count) val RevisionTagewikidata = wikiData.map { case (x, y) => (x.toString()) } - //println(RevisionTagewikidata.count) + // println(RevisionTagewikidata.count) // ABend the revision in one line string val RevisioninOneString = RevisionTagewikidata.map(line => New_abendRevision(line)).cache() // println("TotalCount" + " " + RevisioninOneString.count) @@ -107,11 +108,6 @@ class ParseNormalXML extends Serializable { } - - - - - // make the revision as one string def New_abendRevision(str: String): String = { @@ -125,13 +121,13 @@ class ParseNormalXML extends Serializable { // Ok: used on the Top def New_Build_Revision_map(obj: String): String = { var Store_Record_String = "" - //Json Revision : + // Json Revision : val JsonStr = Get_Json_Revision(obj) val Standered_JsonStr = Standared_Get_Json_Revision(obj) // for full string Jason with all formating for parsing by spark val Json_Standered = Standered_JsonStr.get(0).toString() // for full string Jason with all formating for parsing by spark val Json = JsonStr.get(0).toString() - //0.Id Revision + // 0.Id Revision val IdRevision = Get_ID_Revision(obj) if (IdRevision != "") { val ID = IdRevision.toString().trim() @@ -141,7 +137,7 @@ class ParseNormalXML extends Serializable { // else { // Store_Record_String = "0" // } - //1. Item Title : + // 1. Item Title : val ItemTitle: ArrayList[String] = Get_Item_Title_FromJson(Json) if (ItemTitle.size() > 0) { val groupItemTilte = ItemTitle.get(0).toString() @@ -164,8 +160,8 @@ class ParseNormalXML extends Serializable { } } - //=============Start:======= extract information from the json string - //2.Comments : + // =============Start:======= extract information from the json string + // 2.Comments : val commentarray = Get_Comment(obj) val comment = commentarray.get(0) if (comment.nonEmpty) { @@ -174,7 +170,7 @@ class ParseNormalXML extends Serializable { Store_Record_String = Store_Record_String.trim() + "NNLL" + "NA" } - //3.Parent ID : + // 3.Parent ID : val ParentIDStr = Get_ParentID(obj) if (ParentIDStr.nonEmpty) { @@ -185,7 +181,7 @@ class ParseNormalXML extends Serializable { Store_Record_String = Store_Record_String + "NNLL" + "0" } - //4.Timestamp: + // 4.Timestamp: val TimeStamparray = Get_TIMEStamp(obj) val TimeSta = TimeStamparray.get(0) if (TimeSta.nonEmpty) { @@ -194,41 +190,41 @@ class ParseNormalXML extends Serializable { Store_Record_String = Store_Record_String + "NNLL" + "NA" } - //5. Contributor Data( IP ): + // 5. 
Contributor Data( IP ): val Contributstr = Get_Contributor_IP(obj) - //val ContributorSta = Contributorarray.get(0) + // val ContributorSta = Contributorarray.get(0) if (Contributstr != "0") { Store_Record_String = Store_Record_String + "NNLL" + Contributstr.trim() } else { Store_Record_String = Store_Record_String + "NNLL" + "0" } - //6. Contributor ID : + // 6. Contributor ID : val Contributor_IDStr = Get_Contributor_ID(obj) - //val Contributor_IDSta = Contributor_IDarray.get(0) + // val Contributor_IDSta = Contributor_IDarray.get(0) if (Contributor_IDStr != "0") { Store_Record_String = Store_Record_String + "NNLL" + Contributor_IDStr.trim() } else { Store_Record_String = Store_Record_String + "NNLL" + "0" } - //7. Contributor Name : + // 7. Contributor Name : val Contributor_NameStr = Get_Contributor_Name(obj) - //val Contributor_IDSta = Contributor_IDarray.get(0) + // val Contributor_IDSta = Contributor_IDarray.get(0) if (Contributor_NameStr != "NA") { Store_Record_String = Store_Record_String + "NNLL" + Contributor_NameStr.trim() } else { Store_Record_String = Store_Record_String + "NNLL" + "NA" } - //8. Full Json Tag for Parsing: + // 8. Full Json Tag for Parsing: if (Json_Standered.nonEmpty) { Store_Record_String = Store_Record_String + "NNLL" + Json_Standered.trim() } else { Store_Record_String = Store_Record_String + "NNLL" + "NA" } - //9. Model : + // 9. Model : val modelstr = Get_Model(obj) if (modelstr.nonEmpty) { @@ -236,14 +232,14 @@ class ParseNormalXML extends Serializable { } else { Store_Record_String = Store_Record_String + "NNLL" + "NA" } - //10.Format: + // 10.Format: val Formatstr = Get_Format(obj) if (Formatstr.nonEmpty) { Store_Record_String = Store_Record_String + "NNLL" + Formatstr.trim() } else { Store_Record_String = Store_Record_String + "NNLL" + "NA" } - //11.SHA1 : + // 11.SHA1 : val SHAstr = Get_SHA1(obj) if (SHAstr.nonEmpty) { Store_Record_String = Store_Record_String + "NNLL" + SHAstr.trim() @@ -290,8 +286,8 @@ class ParseNormalXML extends Serializable { } - //********************** - // if (str.contains("")){ + // ********************** + // if (str.contains("")) { // // val inputID: CharSequence = str // val pattStr_id: String = "[0-9]+" @@ -306,7 +302,7 @@ class ParseNormalXML extends Serializable { // } // } // - // else if (str.contains("")){ + // else if (str.contains("")) { // // val inputID: CharSequence = str // val pattStr_id: String = "[0-9]+" @@ -327,7 +323,7 @@ class ParseNormalXML extends Serializable { tem } - //Extract TimeStampe value from Tag: + // Extract TimeStampe value from Tag: def Get_TIMEStamp(str: String): ArrayList[String] = { val TimeStamp: ArrayList[String] = new ArrayList[String]() @@ -382,7 +378,7 @@ class ParseNormalXML extends Serializable { } - //extract Item Title from Json string + // extract Item Title from Json string def Get_Item_Title_FromJson(str: String): ArrayList[String] = { val Item_Title_FromJason: ArrayList[String] = new ArrayList[String]() @@ -634,5 +630,4 @@ class ParseNormalXML extends Serializable { } temp } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseRDFXML.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseRDFXML.scala index 3f83897..2add40c 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseRDFXML.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseRDFXML.scala @@ -1,13 
+1,14 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection -import org.apache.spark.SparkContext +import java.io.ByteArrayInputStream +import java.util.ArrayList +import java.util.regex.Pattern + import org.apache.hadoop.mapred.JobConf -import org.apache.spark.rdd.RDD import org.apache.jena.graph.Triple import org.apache.jena.rdf.model.ModelFactory -import java.util.ArrayList -import java.util.regex.Pattern -import java.io.ByteArrayInputStream +import org.apache.spark.SparkContext +import org.apache.spark.rdd.RDD class ParseRDFXML extends Serializable { @@ -24,7 +25,7 @@ class ParseRDFXML extends Serializable { org.apache.hadoop.mapred.FileInputFormat.addInputPaths(jobConf_Record, "hdfs://localhost:9000/mydata/Germany.rdf") // input path from Hadoop org.apache.hadoop.mapred.FileInputFormat.addInputPaths(jobConf_Prefixes, "hdfs://localhost:9000/mydata/Germany.rdf") // input path from Hadoop - //------------ RDF XML Record + // ------------ RDF XML Record // read data and save in RDD as block- RDFXML Record val RDFXML_Dataset_Record = sc.hadoopRDD(jobConf_Record, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) // println("HelloRecords" + " " + RDFXML_Dataset_Record.count) @@ -34,14 +35,14 @@ class ParseRDFXML extends Serializable { println("HelloRecords" + " " + RDFXML_Dataset_Record_AsstringBlock.count) // RDFXML_Dataset_Record_AsstringBlock.foreach(println) - //-------------RDF XML Prefixes + // -------------RDF XML Prefixes // read data and save in RDD as block- RDFXML Prefixes val RDFXML_Dataset_Prefixes = sc.hadoopRDD(jobConf_Prefixes, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) println("HelloPrefixes" + " " + RDFXML_Dataset_Prefixes.count) // RDFXML_Dataset_Prefixes.foreach(println) // Convert the block- RDFXML Prefixes to String DataType var RDFXML_Dataset_AsstringPrefixes_WithoutDist = RDFXML_Dataset_Prefixes.map { case (x, y) => (x.toString()) } - val RDFXML_Dataset_AsstringPrefixes=RDFXML_Dataset_AsstringPrefixes_WithoutDist.distinct() + val RDFXML_Dataset_AsstringPrefixes = RDFXML_Dataset_AsstringPrefixes_WithoutDist.distinct() println("HelloPrefixes" + " " + RDFXML_Dataset_AsstringPrefixes.count) // RDFXML_Dataset_AsstringPrefixes.foreach(println) val pref = RDFXML_Dataset_AsstringPrefixes.reduce((a, b) => a + "\n" + b) @@ -88,5 +89,4 @@ class ParseRDFXML extends Serializable { val str = Arraylistval.get(0).toString() str } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseTRIX.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseTRIX.scala index 3bd8364..f3a4201 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseTRIX.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseTRIX.scala @@ -1,12 +1,14 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection -import org.apache.spark.SparkContext + +import java.io.ByteArrayInputStream +import java.util.ArrayList +import java.util.regex.Pattern + import org.apache.hadoop.mapred.JobConf -import org.apache.spark.rdd.RDD import org.apache.jena.graph.Triple import org.apache.jena.rdf.model.ModelFactory -import java.util.ArrayList -import java.util.regex.Pattern -import java.io.ByteArrayInputStream +import org.apache.spark.SparkContext 
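start_RDFXML_Parser collects the file's prefix declarations separately and reduces them into one header before parsing each record with Jena (per the imports above); the exact parsing helper is outside this hunk, so the sketch below only illustrates the Jena step for an already assembled, well-formed RDF/XML string.

import java.io.ByteArrayInputStream
import org.apache.jena.rdf.model.ModelFactory
import scala.collection.JavaConverters._

// Illustrative only: parse a complete RDF/XML document string into triples with Jena.
def parseRdfXmlSketch(rdfXml: String): Seq[String] = {
  val model = ModelFactory.createDefaultModel()
  model.read(new ByteArrayInputStream(rdfXml.getBytes("UTF-8")), null)
  model.listStatements().asScala.map(_.asTriple().toString).toSeq
}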
+import org.apache.spark.rdd.RDD class ParseTRIX extends Serializable { @@ -18,7 +20,7 @@ class ParseTRIX extends Serializable { org.apache.hadoop.mapred.FileInputFormat.addInputPaths(jobConf_Record, "hdfs://localhost:9000/mydata/xx.trix") // input path from Hadoop - //------------TRIX Record + // ------------TRIX Record // read data and save in RDD as block- TRIX Record val TRIX_Dataset_Record = sc.hadoopRDD(jobConf_Record, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) // println("HelloRecords" + " " + TRIX_Dataset_Record.count) @@ -43,11 +45,9 @@ class ParseTRIX extends Serializable { s4 } - // This function for TRIX case. def arrayListTOstring(Arraylistval: ArrayList[Triple]): String = { val str = Arraylistval.get(0).toString() str } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/RevisionFeatures.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/RevisionFeatures.scala index 7dc3c19..ccbd2b4 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/RevisionFeatures.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/RevisionFeatures.scala @@ -1,4 +1,5 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection + import java.util.regex.{ Matcher, Pattern } class RevisionFeatures extends Serializable { @@ -53,7 +54,7 @@ class RevisionFeatures extends Serializable { } - // if (result_isNonLatin==true){ // is matched + // if (result_isNonLatin==true) { // is matched // // Final_Result=false // @@ -123,4 +124,4 @@ class RevisionFeatures extends Serializable { } -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/SentencesFeatures.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/SentencesFeatures.scala index 5490ec1..62c0432 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/SentencesFeatures.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/SentencesFeatures.scala @@ -13,7 +13,7 @@ class SentencesFeatures extends Serializable { } - //1.comment tail Lenght Action subaction param+ tail + // 1.comment tail Lenght Action subaction param+ tail def CommentTailLenght(Full_Comment_Str: String): Integer = { val parsedCommment_OBJ = new CommentProcessor() val commentTail_Str = parsedCommment_OBJ.Extract_CommentTail(Full_Comment_Str) @@ -23,9 +23,9 @@ class SentencesFeatures extends Serializable { } // similarity between the comment ( suffix of the comment = Tail ) where the comment is normal comment /* .........*/ or /* ......... // e.g This comment includes wb...sitelink - //1-we have to be sure the comment is normal comment take the form /* ........./* - //2-Next step: we check the Action part if it includes a sitelink word or not. - //3-we compare the suffix in this case to site link with pay attention to the same language. + // 1-we have to be sure the comment is normal comment take the form /* ........./* + // 2-Next step: we check the Action part if it includes a sitelink word or not. + // 3-we compare the suffix in this case to site link with pay attention to the same language. // we check the type of Normal comment if it contains Aliases . 
def extract_CommentAliases_LanguageType(Full_Comment_Str: String): String = { @@ -185,5 +185,4 @@ class SentencesFeatures extends Serializable { langeType.trim() } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/StatementFeatures.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/StatementFeatures.scala index 33b1b5a..31d1158 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/StatementFeatures.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/StatementFeatures.scala @@ -42,5 +42,4 @@ class StatementFeatures extends Serializable { } result } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/VandalismDetection.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/VandalismDetection.scala index 065adb1..5937817 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/VandalismDetection.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/VandalismDetection.scala @@ -1,31 +1,28 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection -import org.apache.spark.{ SparkContext, RangePartitioner } -import org.apache.spark.sql._ -import org.apache.spark.sql.expressions.Window -import org.apache.hadoop.mapred.JobConf import java.util.Scanner -import org.json.JSONObject + import org.apache.commons.lang3.StringUtils -import org.apache.spark.sql.functions.{ concat, lit } -import org.apache.spark.ml.feature.{ Word2Vec, Word2VecModel } +import org.apache.hadoop.mapred.JobConf +import org.apache.spark.{ RangePartitioner, SparkContext } import org.apache.spark.ml.Pipeline -import org.apache.spark.ml.feature.VectorAssembler +import org.apache.spark.ml.feature.{ VectorAssembler, Word2Vec, Word2VecModel } import org.apache.spark.ml.linalg.Vector +import org.apache.spark.sql._ +import org.apache.spark.sql.expressions.Window +import org.apache.spark.sql.functions.{ concat, lit } +import org.json.JSONObject class VandalismDetection extends Serializable { - - - // Function 1 : Distributed RDF Parser Approach def Start_RDF_Parser_Appraoch(sc: SparkContext): Unit = { - + val sqlContext = new org.apache.spark.sql.SQLContext(sc) import sqlContext.implicits._ import org.apache.spark.sql.functions._ // for UDF import org.apache.spark.sql.types._ - + println("*********************************************************************") println("Distributed RDF Parser Model") println("Please Enter 1 for JTriple and 2 for TRIX process and 3 for RDFXML:") @@ -41,12 +38,11 @@ class VandalismDetection extends Serializable { val DRF_Builder_JTripleOBJ = new FacilitiesClass() val RDD_JTriple = JTriple_Parser_OBJ.Start_JTriple_Parser(jobConf, sc) RDD_JTriple.foreach(println) - //----------------------------DF for RDF TRIX ------------------------------------------ + // ----------------------------DF for RDF TRIX ------------------------------------------ // Create SQLContext Object: val sqlContext = new org.apache.spark.sql.SQLContext(sc) val DFR_JTriple = DRF_Builder_JTripleOBJ.RDD_TO_DFR_JTriple(RDD_JTriple, sqlContext) DFR_JTriple.show() - } else if (num == "2") { @@ -57,12 +53,11 @@ class VandalismDetection extends Serializable { val DRF_Builder_RDFTRIX_OBJ = new FacilitiesClass() val RDD_TRIX = 
TRIX_Parser_OBJ.Start_TriX_Parser(jobConf, sc) RDD_TRIX.foreach(println) - //----------------------------DF for RDF TRIX ------------------------------------------ + // ----------------------------DF for RDF TRIX ------------------------------------------ // Create SQLContext Object: val sqlContext = new org.apache.spark.sql.SQLContext(sc) val DFR_TRIX = DRF_Builder_RDFTRIX_OBJ.RDD_TO_DFR_TRIX(RDD_TRIX, sqlContext) DFR_TRIX.show() - } else if (num == "3") { println("RDF XML .........!!!!!!") @@ -76,1917 +71,1272 @@ class VandalismDetection extends Serializable { val RDD_RDFXML = RDFXML_Parser_OBJ.start_RDFXML_Parser(jobConf_Record, jobConf_Prefixes, sc) RDD_RDFXML.foreach(println) - //----------------------------DF for RDF XML ------------------------------------------ + // ----------------------------DF for RDF XML ------------------------------------------ // Create SQLContext Object: val sqlContext = new org.apache.spark.sql.SQLContext(sc) val DFR_RDF_XML = DRF_Builder_RDFXML_OBJ.RDD_TO_DFR_RDFXML(RDD_RDFXML, sqlContext) DFR_RDF_XML.show() } - - sc.stop() + + sc.stop() } - - //*********************************************************************************************************************************************** - // Function 2:Training XML and Vandalism Detection + + // ********************************************************************************* + // Function 2:Training XML and Vandalism Detection def Training_Start_StandardXMLParser_VD(sc: SparkContext): DataFrame = { val sqlContext = new org.apache.spark.sql.SQLContext(sc) import sqlContext.implicits._ import org.apache.spark.sql.functions._ // for UDF import org.apache.spark.sql.types._ - // Streaming records: - val jobConf = new JobConf() - val NormalXML_Parser_OBJ = new ParseNormalXML() - val RDD_OBJ = new ParseNormalXML() - - val Training_RDD_All_Record1 = RDD_OBJ.Training_DB_NormalXML_Parser_Input1(sc) - val Training_RDD_All_Record2 = RDD_OBJ.Training_DB_NormalXML_Parser_Input2(sc) - val Training_RDD_All_Record3 = RDD_OBJ.Training_DB_NormalXML_Parser_Input3(sc) - //RDD_All_Record1.foreach(println) - //RDD_All_Record2.foreach(println) - // RDD_All_Record3.foreach(println) - - val Training_RDD_All_Record = Training_RDD_All_Record1.union(Training_RDD_All_Record2).union(Training_RDD_All_Record3).distinct().cache() - - //println(RDD_All_Record.count()) - println(Training_RDD_All_Record.count()) - - // ======= Json part : - //Json RDD : Each record has its Revision iD: - val JsonRDD = Training_RDD_All_Record.map(_.split("NNLL")).map(v => replacing_with_Quoto(v(0), v(8))).cache() - //JsonRDD.foreach(println) - //println(JsonRDD.count()) - - // Data set - val Ds_Json = sqlContext.jsonRDD(JsonRDD).select("key", "id", "labels", "descriptions", "aliases", "claims", "sitelinks").cache() - //Ds_Json.show() - // println(Ds_Json.count()) - - // ======= Tags part : // Contributor IP here is in Decimal format not IP format and It is converted in ParseNormalXml stage - val TagsRDD = Training_RDD_All_Record.map(_.split("NNLL")).map(x => (x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9), x(10), x(11))).cache() - val DF_Tags = TagsRDD.toDF("Rid", "Itemid", "comment", "pid", "time", "contributorIP", "contributorID", "contributorName", "JsonText", "model", "format", "sha").cache() - // DF_Tags.show() - // println(DF_Tags.count()) - - //======== Join Json part with Tag Part:============================ - //Joining to have full data - val DF_First_DF_Result_Join_Tags_and_Json = DF_Tags.as("T1").join(Ds_Json.as("T2"), $"T1.Rid" === 
$"T2.key", "leftouter").select("Rid", "itemid", "comment", "pid", "time", "contributorIP", "contributorID", "contributorName", "JsonText", "labels", "descriptions", "aliases", "claims", "sitelinks", "model", "format", "sha") //.orderBy("Rid", "Itemid") - DF_First_DF_Result_Join_Tags_and_Json.registerTempTable("Data1") - val dfr_DATA_JsonTages1 = sqlContext.sql("select * from Data1 order by itemid ,Rid ").cache() - - val colNames = Seq("Rid2", "itemid2", "comment2", "pid2", "time2", "contributorIP2", "contributorID2", "contributorName2", "JsonText2", "labels2", "descriptions2", "aliases2", "claims2", "sitelinks2", "model2", "format2", "sha2") - val DF_Second = DF_First_DF_Result_Join_Tags_and_Json.toDF(colNames: _*) //.distinct() - DF_Second.registerTempTable("Data2") - - //===================================================================Parent // Previous Revision============================================================================================================== - //val DF_Joined = result1.as("df1").join(result2.as("df2"), col("itemid") === col("itemid2") && col("index1") === col("index2") + 1, "leftouter").select("Rid", "itemid", "comment", "pid", "time", "contributorIP", "contributorID", "contributorName", "JsonText", "labels", "descriptions", "aliases", "claims", "sitelinks", "model", "format", "sha", "Rid2", "itemid2", "comment2", "pid2", "time2", "contributorIP2", "contributorID2", "contributorName2", "JsonText2", "labels2", "descriptions2", "aliases2", "claims2", "sitelinks2", "model2", "format2", "sha2") - //.select("itemid", "Rid","pid","time","itemid2","Rid2","pid2","time2") - - //Joining based on Parent Id to get the previous cases: ParentID - val DF_Joined = DF_First_DF_Result_Join_Tags_and_Json.as("df1").join(DF_Second.as("df2"), $"df1.pid" === $"df2.Rid2", "leftouter").distinct() - - val RDD_After_JoinDF = DF_Joined.rdd.distinct() - val x = RDD_After_JoinDF.map(row => (row(0).toString().toInt, row)).cache() - val part = new RangePartitioner(4, x) - val partitioned = x.partitionBy(part).persist() // persist is important for this case and obligatory. 
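The block above pairs every revision with its parent revision by renaming the joined DataFrame's columns with a "2" suffix (DF_Second) and left-outer joining on pid === Rid2. A minimal sketch of that self-join pattern, reduced to three columns of toy data and written against the Spark 2.x SparkSession API rather than the SQLContext used here:

// Toy reproduction of the "current revision + parent revision" pairing above.
import org.apache.spark.sql.SparkSession

object ParentRevisionJoinSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("parent-join").getOrCreate()
    import spark.implicits._

    // (Rid, pid, comment): pid points to the parent revision id, 0 = no parent
    val revisions = Seq(
      (101L, 0L,   "create item"),
      (102L, 101L, "add label"),
      (103L, 102L, "add sitelink")
    ).toDF("Rid", "pid", "comment")

    // rename the right-hand side so both sides of the self-join stay distinguishable,
    // just as DF_Second does with the "...2" column names
    val parents = revisions.toDF("Rid2", "pid2", "comment2")

    // the left outer join keeps first revisions whose pid matches no Rid2
    val withParent = revisions.as("cur")
      .join(parents.as("par"), $"cur.pid" === $"par.Rid2", "leftouter")

    withParent.select("Rid", "comment", "Rid2", "comment2").show()
    spark.stop()
  }
}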
- //partitioned.foreach(println) - // - // //=====================================================All Features Based on Categories of Features Data Type :================================================================================== - // - val Result_all_Features = partitioned.map { case (x, y) => (x.toString() + "," + All_Features(y).toString()) } // we convert the Pair RDD to String one LineRDD to be able to make DF based on "," - //Result_all_Features.foreach(println) - // println("nayef" + Result_all_Features.count()) - - // Conver the RDD of All Features to DataFrame: - - val schema = StructType( - - //0 - StructField("Rid", IntegerType, false) :: - - // Character Features : - /* 1*/ StructField("C1uppercaseratio", DoubleType, false) :: /*2 */ StructField("C2lowercaseratio", DoubleType, false) :: /*3*/ StructField("C3alphanumericratio", DoubleType, false) :: - /*4*/ StructField("C4asciiratio", DoubleType, false) :: /*5*/ StructField("C5bracketratio", DoubleType, false) :: /*6*/ StructField("C6digitalratio", DoubleType, false) :: - /*7*/ StructField("C7latinratio", DoubleType, false) :: /*8*/ StructField("C8whitespaceratio", DoubleType, false) :: /* 9*/ StructField("C9puncratio", DoubleType, false) :: - /*10*/ StructField("C10longcharacterseq", DoubleType, false) :: /*11*/ StructField("C11arabicratio", DoubleType, false) :: /*12*/ StructField("C12bengaliratio", DoubleType, false) :: - /*13 */ StructField("C13brahmiratio", DoubleType, false) :: /*14*/ StructField("C14cyrilinratio", DoubleType, false) :: /*15*/ StructField("C15hanratio", DoubleType, false) :: - /*16*/ StructField("c16malysiaratio", DoubleType, false) :: /*17*/ StructField("C17tamiratio", DoubleType, false) :: /*18*/ StructField("C18telugratio", DoubleType, false) :: - /*19 */ StructField("C19symbolratio", DoubleType, false) :: /*20 */ StructField("C20alpharatio", DoubleType, false) :: /*21*/ StructField("C21visibleratio", DoubleType, false) :: - /*22*/ StructField("C22printableratio", DoubleType, false) :: /*23*/ StructField("C23blankratio", DoubleType, false) :: /*24 */ StructField("C24controlratio", DoubleType, false) :: - /* 25 */ StructField("C25hexaratio", DoubleType, false) :: - - //word Features: - /*26*/ StructField("W1languagewordratio", DoubleType, false) :: /*27 Boolean */ StructField("W2Iscontainlanguageword", DoubleType, false) :: /*28*/ StructField("W3lowercaseratio", DoubleType, false) :: - /*29 Integer */ StructField("W4longestword", IntegerType, false) :: /*30 Boolean */ StructField("W5IscontainURL", DoubleType, false) :: /*31*/ StructField("W6badwordratio", DoubleType, false) :: - /*32*/ StructField("W7uppercaseratio", DoubleType, false) :: /*33*/ StructField("W8banwordratio", DoubleType, false) :: /*34 Boolean */ StructField("W9FemalFirstName", DoubleType, false) :: - /*35 Boolean */ StructField("W10MaleFirstName", DoubleType, false) :: /*36 Boolean */ StructField("W11IscontainBadword", DoubleType, false) :: /*37 Boolean*/ StructField("W12IsContainBanword", DoubleType, false) :: - /*38 integer */ StructField("W13NumberSharewords", DoubleType, false) :: /*39 Integer */ StructField("W14NumberSharewordswithoutStopwords", DoubleType, false) :: - /*40*/ StructField("W15PortionQid", DoubleType, false) :: /*41*/ StructField("W16PortionLnags", DoubleType, false) :: /*42*/ StructField("W17PortionLinks", DoubleType, false) :: - - // - // // Sentences Features: - /*43*/ StructField("S1CommentTailLength", DoubleType, false) :: /*44*/ StructField("S2SimikaritySitelinkandLabel", DoubleType, false) :: /*45*/ 
StructField("S3SimilarityLabelandSitelink", DoubleType, false) :: /*46*/ StructField("S4SimilarityCommentComment", DoubleType, false) :: - // - // // Statements Features : - /*47*/ StructField("SS1Property", StringType, false) :: /*48*/ StructField("SS2DataValue", StringType, false) :: /*49*/ StructField("SS3ItemValue", StringType, false) :: - // - // - // //User Features : - /*50 Boolean*/ StructField("U1IsPrivileged", DoubleType, false) :: /*51 Boolean*/ StructField("U2IsBotUser", DoubleType, false) :: /*52 Boolean*/ StructField("U3IsBotuserWithFlaguser", DoubleType, false) :: - /*53 Boolean*/ StructField("U4IsProperty", DoubleType, false) :: /*54 Boolean*/ StructField("U5IsTranslator", DoubleType, false) :: /*55 Boolean*/ StructField("U6IsRegister", DoubleType, false) :: - /*56*/ StructField("U7IPValue", DoubleType, false) :: /*57*/ StructField("U8UserID", IntegerType, false) :: /*58*/ StructField("U9HasBirthDate", DoubleType, false) :: /*59*/ StructField("U10HasDeathDate", DoubleType, false) :: - - //Items Features : - - /*60*/ StructField("I1NumberLabels", DoubleType, false) :: /*61*/ StructField("I2NumberDescription", DoubleType, false) :: /*62*/ StructField("I3NumberAliases", DoubleType, false) :: /*63*/ StructField("I4NumberClaims", DoubleType, false) :: - /*64*/ StructField("I5NumberSitelinks", DoubleType, false) :: /*65*/ StructField("I6NumberStatement", DoubleType, false) :: /*66*/ StructField("I7NumberReferences", DoubleType, false) :: /*67*/ StructField("I8NumberQualifier", DoubleType, false) :: - /*68*/ StructField("I9NumberQualifierOrder", DoubleType, false) :: /*69*/ StructField("I10NumberBadges", DoubleType, false) :: /*70*/ StructField("I11ItemTitle", StringType, false) :: - - // Revision Features: - /*71*/ StructField("R1languageRevision", StringType, false) :: /*72*/ StructField("R2RevisionLanguageLocal", StringType, false) :: /*73*/ StructField("R3IslatainLanguage", DoubleType, false) :: - /*74*/ StructField("R4JsonLength", DoubleType, false) :: /*75*/ StructField("R5RevisionAction", StringType, false) :: /*76*/ StructField("R6PrevReviAction", StringType, false) :: - /*77*/ StructField("R7RevisionAccountChange", DoubleType, false) :: /*78*/ StructField("R8ParRevision", StringType, false) :: /*79*/ StructField("R9RevisionTime", StringType, false) :: - /*80*/ StructField("R10RevisionSize", DoubleType, false) :: /*81*/ StructField("R11ContentType", StringType, false) :: /*82*/ StructField("R12BytesIncrease", DoubleType, false) :: - /*83*/ StructField("R13TimeSinceLastRevi", DoubleType, false) :: /*84*/ StructField("R14CommentLength", DoubleType, false) :: /*85*/ StructField("R15RevisionSubaction", StringType, false) :: - /*86*/ StructField("R16PrevReviSubaction", StringType, false) :: - - Nil) - - val rowRDD = Result_all_Features.map(line => line.split(",")).map(e ⇒ Row(e(0).toInt // character feature column - , e(1).toDouble, e(2).toDouble, e(3).toDouble, e(4).toDouble, e(5).toDouble, e(6).toDouble, e(7).toDouble, e(8).toDouble, e(9).toDouble, RoundDouble(e(10).toDouble), - e(11).toDouble, e(12).toDouble, e(13).toDouble, e(14).toDouble, e(15).toDouble, e(16).toDouble, e(17).toDouble, e(18).toDouble, e(19).toDouble, e(20).toDouble, e(21).toDouble, e(22).toDouble, e(23).toDouble, e(24).toDouble, e(25).toDouble //Word Feature column - , e(26).toDouble, e(27).toDouble, e(28).toDouble, e(29).toDouble.toInt, e(30).toDouble, e(31).toDouble, e(32).toDouble, e(33).toDouble, e(34).toDouble, e(35).toDouble, e(36).toDouble, e(37).toDouble, RoundDouble(e(38).toDouble), 
RoundDouble(e(39).toDouble), e(40).toDouble, e(41).toDouble, e(42).toDouble // Sentences Features column: - , RoundDouble(e(43).toDouble), e(44).toDouble, e(45).toDouble, e(46).toDouble //Statement Features Column: - , e(47), e(48), e(49) // User Features Column: - , e(50).toDouble, e(51).toDouble, e(52).toDouble, e(53).toDouble, e(54).toDouble, e(55).toDouble, e(56).toDouble, e(57).toDouble.toInt, e(58).toDouble, e(59).toDouble //Item Features column: - , e(60).toDouble, e(61).toDouble, e(62).toDouble, e(63).toDouble, e(64).toDouble, e(65).toDouble, e(66).toDouble, e(67).toDouble, e(68).toDouble, e(69).toDouble, "Q" + e(70).toDouble.toInt.toString() //Revision Features Column: - , e(71), e(72), e(73).toDouble, e(74).toDouble, e(75), e(76), e(77).toDouble, e(78), e(79), e(80).toDouble, e(81), e(82).toDouble, e(83).toDouble, e(84).toDouble, e(85), e(86))) - - //a.User Frequency: - //number of revisions a user has contributed - //val resu= DF_Tags.groupBy("contributorID").agg(count("Rid")) - DF_Tags.registerTempTable("TagesTable") - val ContributorFreq_for_Each_Revision_DF = sqlContext.sql("select contributorID as CIDUSER1, count(Rid) as NumberofRevisionsUserContributed from TagesTable where contributorID !='0' group by contributorID ") //.drop("CIDUSER1") - //ContributorFreq_for_Each_Revision_DF.show() - - //b.Cumulated : Number of a unique Item a user has contributed. - val CumulatedNumberof_uniqueItemsForUser_DF = sqlContext.sql("select contributorID as CIDUSER2, COUNT(DISTINCT itemid) as NumberofUniqueItemsUseredit from TagesTable where contributorID !='0' group by contributorID") //.drop("CIDUSER2") - //CumulatedNumberof_uniqueItemsForUser_DF.show() - - //1.Item Frequency: - // number of revisions an Item has - val ItemFrequ_DF = sqlContext.sql("select itemid, count(Rid) as NumberRevisionItemHas from TagesTable group by itemid") - // ItemFrequ_DF.show() - - //2. Cumulate number of unique users have edited the Item : Did not consider the users IP. Contributor is an IP or Name. we consider name - val CumulatedNumberof_UniqueUserForItem_DF = sqlContext.sql("select itemid, COUNT(DISTINCT contributorID) as NumberUniqUserEditItem from TagesTable where contributorID !='0' group by itemid") - //CumulatedNumberof_UniqueUserForItem_DF.show() - - //3. 
freq each Item : - val Fre_Item_DF = sqlContext.sql("select itemid, COUNT(itemid) as FreqItem from TagesTable group by itemid") - // Fre_Item_DF.show() - - //***************************************************************************************************************************************** - // This is Main DataFrame: - val BeforeJoin_All_Features = sqlContext.createDataFrame(rowRDD, schema) - //BeforeJoin_All_Features.show() - - //********************************** User feature Join - - // Join1 for add The first User Feature : number of revisions a user has contributed - val AfterJoinUser1_All_Features = BeforeJoin_All_Features.as("T1").join(ContributorFreq_for_Each_Revision_DF.as("T2"), $"T1.U8UserID" === $"T2.CIDUSER1", "leftouter").drop("CIDUSER1") - //AfterJoinUser1_All_Features.show() - - // Join2 for add The second User Feature - val AfterJoinUser2_All_Features = AfterJoinUser1_All_Features.as("T1").join(CumulatedNumberof_uniqueItemsForUser_DF.as("T2"), $"T1.U8UserID" === $"T2.CIDUSER2", "leftouter").drop("CIDUSER2") - //AfterJoinUser2_All_Features.show() - - //********************************** Item Feature Join - // Join3 for add The First Item Feature :number of revisions an Item has - val AfterJoinItem3_All_Features = AfterJoinUser2_All_Features.as("T1").join(ItemFrequ_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") - // AfterJoinItem3_All_Features.show() - - // Join4 for add The Second Item Feature - val AfterJoinItem4_All_Features = AfterJoinItem3_All_Features.as("T1").join(CumulatedNumberof_UniqueUserForItem_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") - // AfterJoinItem4_All_Features.show() - - // Join5 for add The Third Item Feature - val AfterJoinItem5_All_Features = AfterJoinItem4_All_Features.as("T1").join(Fre_Item_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") - //2 AfterJoinItem5_All_Features.show() - - //******************************** - - //*Geografical information Feature from Meta File - //REVISION_ID|REVISION_SESSION_ID|USER_COUNTRY_CODE|USER_CONTINENT_CODE|USER_TIME_ZONE|USER_REGION_CODE|USER_CITY_NAME|USER_COUNTY_NAME|REVISION_TAGS - val df_GeoInf = sqlContext.read - .format("com.databricks.spark.csv") - .option("header", "true") // Use first line of all files as header - .option("inferSchema", "true") // Automatically infer data types - .load("hdfs://localhost:9000/mydata/Meta.csv").select("REVISION_ID", "REVISION_SESSION_ID", "USER_COUNTRY_CODE", "USER_CONTINENT_CODE", "USER_TIME_ZONE", "USER_REGION_CODE", "USER_CITY_NAME", "USER_COUNTY_NAME", "REVISION_TAGS") - // df_GeoInf.show() - - val df_Truth = sqlContext.read - .format("com.databricks.spark.csv") - .option("header", "true") // Use first line of all files as header - .option("inferSchema", "true") // Automatically infer data types - .load("hdfs://localhost:9000/mydata/truth.csv").select("REVISION_ID", "ROLLBACK_REVERTED", "UNDO_RESTORE_REVERTED") - // df_GeoInf.show() - - val AfterJoinGeoInfo_All_Features = AfterJoinItem5_All_Features.as("T1").join(df_GeoInf.as("T2"), $"T1.Rid" === $"T2.REVISION_ID", "leftouter").drop("REVISION_ID").cache() - // AfterJoinGeoInfo_All_Features.show() - - val Final_All_Features = AfterJoinGeoInfo_All_Features.as("T1").join(df_Truth.as("T2"), $"T1.Rid" === $"T2.REVISION_ID", "leftouter").drop("REVISION_ID").cache() - //Final_All_Features.show() - - // Pre- process Data 
============================================================================================================================================================ - - // For String Column, We fill the Null values by "NA": - - var Fill_Missing_Final_All_Features = Final_All_Features.na.fill("NA", Seq("USER_COUNTRY_CODE", "USER_CONTINENT_CODE", "USER_TIME_ZONE", "USER_REGION_CODE", "USER_CITY_NAME", "USER_COUNTY_NAME", "REVISION_TAGS")).cache() - - // For Integer Frequency Column, We fill the Null values by 0: - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.na.fill(0, Seq("FreqItem", "NumberUniqUserEditItem", "NumberRevisionItemHas", "NumberofUniqueItemsUseredit", "NumberofRevisionsUserContributed", "REVISION_SESSION_ID")).cache() - //Fill_Missing_Final_All_Features.show() - - val BoolToDoubleUDF = udf { (BoolAsString: String) => if (BoolAsString == "T") 1.0 else 0.0 } - val IntegerToDouble = udf { (IntegerRevisionSessionID: Integer) => IntegerRevisionSessionID.toDouble } - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalROLLBACK_REVERTED", BoolToDoubleUDF(col("ROLLBACK_REVERTED"))) - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalUNDO_RESTORE_REVERTED", BoolToDoubleUDF(col("UNDO_RESTORE_REVERTED"))) - - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalREVISION_SESSION_ID", IntegerToDouble(col("REVISION_SESSION_ID"))) - - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberofRevisionsUserContributed", IntegerToDouble(col("NumberofRevisionsUserContributed"))) - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberofUniqueItemsUseredit", IntegerToDouble(col("NumberofUniqueItemsUseredit"))) - - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberRevisionItemHas", IntegerToDouble(col("NumberRevisionItemHas"))) - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberUniqUserEditItem", IntegerToDouble(col("NumberUniqUserEditItem"))) - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalFreqItem", IntegerToDouble(col("FreqItem"))) - - //===========================================================================Caharacter Features : Double , Integer Features ==================================================================================== - //Double Ratio: For Ratio Double column, Fill -1 value by Median:Character Features + Ratio of Word Features : - var Samples = Fill_Missing_Final_All_Features.sample(false, 0.001).cache() //.where($"S2SimikaritySitelinkandLabel">0.0 || $"S3SimilarityLabelandSitelink">0.0 || $"S4SimilarityCommentComment">0.0) - Samples.registerTempTable("df") - - val Query = "select " + - "percentile_approx(C1uppercaseratio, 0.5) as meadian1" + "," + "percentile_approx(C2lowercaseratio, 0.5) as median2" + " ," + - "percentile_approx(C3alphanumericratio, 0.5) as median3" + "," + "percentile_approx(C4asciiratio, 0.5) as median4" + "," + - "percentile_approx(C5bracketratio, 0.5) as median5" + "," + "percentile_approx(C6digitalratio, 0.5) as median6" + "," + - "percentile_approx(C7latinratio, 0.5) as median7" + "," + "percentile_approx(C8whitespaceratio, 0.5) as median8" + "," + - "percentile_approx(C9puncratio, 0.5) as median9" + "," + "percentile_approx(C11arabicratio, 0.5) as median11" + "," + - "percentile_approx(C12bengaliratio, 0.5) as median12" + "," + 
"percentile_approx(C13brahmiratio, 0.5) as median13" + "," + - "percentile_approx(C14cyrilinratio, 0.5) as median14" + "," + "percentile_approx(C15hanratio, 0.5) as median15" + "," + - "percentile_approx(c16malysiaratio, 0.5) as median16" + "," + - "percentile_approx(C17tamiratio, 0.5) as median17" + "," + "percentile_approx(C18telugratio, 0.5) as median18" + "," + - "percentile_approx(C19symbolratio, 0.5) as median19" + "," + "percentile_approx(C20alpharatio, 0.5) as median20" + "," + - "percentile_approx(C21visibleratio, 0.5) as median21" + "," + "percentile_approx(C22printableratio, 0.5) as median22" + "," + - "percentile_approx(C23blankratio, 0.5) as median23" + "," + "percentile_approx(C24controlratio, 0.5) as median24" + "," + - "percentile_approx(C25hexaratio, 0.5) as median25" ++ "," + "percentile_approx(W1languagewordratio, 0.5) as median26" + "," + - "percentile_approx(W3lowercaseratio, 0.5) as median27" + "," + "percentile_approx(W6badwordratio, 0.5) as median28" + "," + - "percentile_approx(W7uppercaseratio, 0.5) as median27" + "," + "percentile_approx(W8banwordratio, 0.5) as median27" + " from df" - - val medianValues = sqlContext.sql(Query).rdd - val Median = medianValues.first() - - // Median : - // Character Ratio Features: UDF - val lkpUDF1 = udf { (i: Double) => if (i == 0) Median(0).toString().toDouble else i } - val lkpUDF2 = udf { (i: Double) => if (i == 0) Median(1).toString().toDouble else i } - val lkpUDF3 = udf { (i: Double) => if (i == 0) Median(2).toString().toDouble else i } - val lkpUDF4 = udf { (i: Double) => if (i == 0) Median(3).toString().toDouble else i } - val lkpUDF5 = udf { (i: Double) => if (i == 0) Median(4).toString().toDouble else i } - val lkpUDF6 = udf { (i: Double) => if (i == 0) Median(5).toString().toDouble else i } - val lkpUDF7 = udf { (i: Double) => if (i == 0) Median(6).toString().toDouble else i } - val lkpUDF8 = udf { (i: Double) => if (i == 0) Median(7).toString().toDouble else i } - val lkpUDF9 = udf { (i: Double) => if (i == 0) Median(8).toString().toDouble else i } - - val lkpUDF11 = udf { (i: Double) => if (i == 0) Median(9).toString().toDouble else i } - val lkpUDF12 = udf { (i: Double) => if (i == 0) Median(10).toString().toDouble else i } - val lkpUDF13 = udf { (i: Double) => if (i == 0) Median(11).toString().toDouble else i } - val lkpUDF14 = udf { (i: Double) => if (i == 0) Median(12).toString().toDouble else i } - val lkpUDF15 = udf { (i: Double) => if (i == 0) Median(13).toString().toDouble else i } - val lkpUDF16 = udf { (i: Double) => if (i == 0) Median(14).toString().toDouble else i } - val lkpUDF17 = udf { (i: Double) => if (i == 0) Median(15).toString().toDouble else i } - val lkpUDF18 = udf { (i: Double) => if (i == 0) Median(16).toString().toDouble else i } - val lkpUDF19 = udf { (i: Double) => if (i == 0) Median(17).toString().toDouble else i } - val lkpUDF20 = udf { (i: Double) => if (i == 0) Median(18).toString().toDouble else i } - val lkpUDF21 = udf { (i: Double) => if (i == 0) Median(19).toString().toDouble else i } - val lkpUDF22 = udf { (i: Double) => if (i == 0) Median(20).toString().toDouble else i } - val lkpUDF23 = udf { (i: Double) => if (i == 0) Median(21).toString().toDouble else i } - val lkpUDF24 = udf { (i: Double) => if (i == 0) Median(22).toString().toDouble else i } - val lkpUDF25 = udf { (i: Double) => if (i == 0) Median(23).toString().toDouble else i } - - val df1 = Fill_Missing_Final_All_Features.withColumn("FinalC1uppercaseratio", lkpUDF1(col("C1uppercaseratio"))) 
//.drop("C1uppercaseratio").cache() - val df2 = df1.withColumn("FinalC2lowercaseratio", lkpUDF2(col("C2lowercaseratio"))) //.drop("C2lowercaseratio").cache() - //df1.unpersist() - val df3 = df2.withColumn("FinalC3alphanumericratio", lkpUDF3(col("C3alphanumericratio"))) //.drop("C3alphanumericratio").cache() - //df2.unpersist() - val df4 = df3.withColumn("FinalC4asciiratio", lkpUDF4(col("C4asciiratio"))) //.drop("C4asciiratio").cache() - //df3.unpersist() - val df5 = df4.withColumn("FinalC5bracketratio", lkpUDF5(col("C5bracketratio"))) //.drop("C5bracketratio").cache() - //df4.unpersist() - val df6 = df5.withColumn("FinalC6digitalratio", lkpUDF6(col("C6digitalratio"))) //.drop("C6digitalratio").cache() - //df5.unpersist() - val df7 = df6.withColumn("FinalC7latinratio", lkpUDF7(col("C7latinratio"))) //.drop("C7latinratio").cache() - //df6.unpersist() - val df8 = df7.withColumn("FinalC8whitespaceratio", lkpUDF8(col("C8whitespaceratio"))) //.drop("C8whitespaceratio").cache() - //df7.unpersist() - val df9 = df8.withColumn("FinalC9puncratio", lkpUDF9(col("C9puncratio"))) //.drop("C9puncratio").cache() - - // Mean : - // character integer values : - val Mean_C10longcharacterseq = Samples.agg(mean("C10longcharacterseq")).head() - val C10_Mean = Mean_C10longcharacterseq.getDouble(0) - val lkpUDFC10 = udf { (i: Double) => if (i == 0) C10_Mean else i } - val df10 = df9.withColumn("FinalC10longcharacterseq", lkpUDFC10(col("C10longcharacterseq"))) - - //Median - val df11 = df10.withColumn("FinalC11arabicratio", lkpUDF11(col("C11arabicratio"))) //.drop("C11arabicratio").cache() - // df9.unpersist() - val df12 = df11.withColumn("FinalC12bengaliratio", lkpUDF12(col("C12bengaliratio"))) //.drop("C12bengaliratio").cache() - //df11.unpersist() - val df13 = df12.withColumn("FinalC13brahmiratio", lkpUDF13(col("C13brahmiratio"))) //.drop("C13brahmiratio").cache() - // df12.unpersist() - val df14 = df13.withColumn("FinalC14cyrilinratio", lkpUDF14(col("C14cyrilinratio"))) //.drop("C14cyrilinratio").cache() - // df13.unpersist() - val df15 = df14.withColumn("FinalC15hanratio", lkpUDF15(col("C15hanratio"))) //.drop("C15hanratio").cache() - // df14.unpersist() - val df16 = df15.withColumn("Finalc16malysiaratio", lkpUDF16(col("c16malysiaratio"))) //.drop("c16malysiaratio").cache() - //df15.unpersist() - val df17 = df16.withColumn("FinalC17tamiratio", lkpUDF17(col("C17tamiratio"))) //.drop("C17tamiratio").cache() - //df16.unpersist() - val df18 = df17.withColumn("FinalC18telugratio", lkpUDF18(col("C18telugratio"))) //.drop("C18telugratio").cache() - //df17.unpersist() - val df19 = df18.withColumn("FinalC19symbolratio", lkpUDF19(col("C19symbolratio"))) //.drop("C19symbolratio").cache() - //df18.unpersist() - val df20 = df19.withColumn("FinalC20alpharatio", lkpUDF20(col("C20alpharatio"))) //.drop("C20alpharatio").cache() - // df19.unpersist() - val df21 = df20.withColumn("FinalC21visibleratio", lkpUDF21(col("C21visibleratio"))) //.drop("C21visibleratio").cache() - // df20.unpersist() - val df22 = df21.withColumn("FinalC22printableratio", lkpUDF22(col("C22printableratio"))) //.drop("C22printableratio").cache() - //df21.unpersist() - val df23 = df22.withColumn("FinalC23blankratio", lkpUDF23(col("C23blankratio"))) //.drop("C23blankratio").cache() - // df22.unpersist() - val df24 = df23.withColumn("FinalC24controlratio", lkpUDF24(col("C24controlratio"))) //.drop("C24controlratio").cache() - //df23.unpersist() - val df25 = df24.withColumn("FinalC25hexaratio", lkpUDF25(col("C25hexaratio"))) //.drop("C25hexaratio").cache() - - 
//************************************************End Character Features **************************************************************************************** - - //************************************************Start Word Features **************************************************************************************** - - // Word Ratio Features : UDF - val lkpUDFW1 = udf { (i: Double) => if (i == 0) Median(24).toString().toDouble else i } - val lkpUDFW3 = udf { (i: Double) => if (i == 0) Median(25).toString().toDouble else i } - val lkpUDFW6 = udf { (i: Double) => if (i == 0) Median(26).toString().toDouble else i } - val lkpUDFW7 = udf { (i: Double) => if (i == 0) Median(27).toString().toDouble else i } - val lkpUDFW8 = udf { (i: Double) => if (i == 0) Median(28).toString().toDouble else i } - - //1. - val df26 = df25.withColumn("FinalW1languagewordratio", lkpUDFW1(col("W1languagewordratio"))) //.drop("W1languagewordratio").cache() - - //2.Boolean(Double) IsContainLanguageWord - - //3. - val df27 = df26.withColumn("FinalW3lowercaseratio", lkpUDFW3(col("W3lowercaseratio"))) //.drop("W3lowercaseratio").cache() - // df26.unpersist() - - //4. Integer " Mean: - val Mean_W4longestword = Samples.agg(mean("W4longestword")).head() - val W4_Mean = Mean_W4longestword.getDouble(0) - val lkpUDFW4 = udf { (i: Double) => if (i == 0) W4_Mean else i } - val df28 = df27.withColumn("FinalW4longestword", lkpUDFW4(col("W4longestword"))) - - //5. Boolean (Double ) W5IscontainURL - //6. - val df29 = df28.withColumn("FinalW6badwordratio", lkpUDFW6(col("W6badwordratio"))) //.drop("W6badwordratio").cache() - - //7. - val df30 = df29.withColumn("FinalW7uppercaseratio", lkpUDFW7(col("W7uppercaseratio"))) //.drop("W7uppercaseratio").cache() - - //8. - val df31 = df30.withColumn("FinalW8banwordratio", lkpUDFW8(col("W8banwordratio"))) //.drop("W8banwordratio").cache() - - //9.FemalFirst Boolean(Double) - //10.Male First Boolean(Double) - //11.ContainBadWord Boolean(Double) - //12ContainBanWord Boolean(Double) - - //13. Integer(Double): - val Mean_W13W13NumberSharewords = Samples.agg(mean("W13NumberSharewords")).head() - val W13_Mean = Mean_W13W13NumberSharewords.getDouble(0) - val lkpUDFW13 = udf { (i: Double) => if (i == 0) W13_Mean else i } - val df32 = df31.withColumn("FinalW13NumberSharewords", lkpUDFW13(col("W13NumberSharewords"))) - - //14. Integer (Double): - val Mean_W14NumberSharewordswithoutStopwords = Samples.agg(mean("W14NumberSharewordswithoutStopwords")).head() - val W14_Mean = Mean_W14NumberSharewordswithoutStopwords.getDouble(0) - val lkpUDFW14 = udf { (i: Double) => if (i == 0) W14_Mean else i } - val df33 = df32.withColumn("FinalW14NumberSharewordswithoutStopwords", lkpUDFW14(col("W14NumberSharewordswithoutStopwords"))) - - // 15. Double (Not ratio): - val Mean_W15PortionQid = Samples.agg(mean("W15PortionQid")).head() - val W15_Mean = Mean_W15PortionQid.getDouble(0) - val lkpUDFW15 = udf { (i: Double) => if (i == 0) W15_Mean else i } - val df34 = df33.withColumn("FinalW15PortionQid", lkpUDFW15(col("W15PortionQid"))) - - //16. 
Double(Not Ratio): - val Mean_W16PortionLnags = Samples.agg(mean("W16PortionLnags")).head() - val W16_Mean = Mean_W16PortionLnags.getDouble(0) - val lkpUDFW16 = udf { (i: Double) => if (i == 0) W16_Mean else i } - val df35 = df34.withColumn("FinalW16PortionLnags", lkpUDFW16(col("W16PortionLnags"))) - - //17.Double(Not ratio): - val Mean_W17PortionLinks = Samples.agg(mean("W17PortionLinks")).head() - val W17_Mean = Mean_W17PortionLinks.getDouble(0) - val lkpUDFW17 = udf { (i: Double) => if (i == 0) W17_Mean else i } - val df36 = df35.withColumn("FinalW17PortionLinks", lkpUDFW17(col("W17PortionLinks"))) - - //************************************************End Word Features **************************************************************************************** - - //************************************************Start Sentences Features **************************************************************************************** - // 1. Integer(Double) - val Mean_S1CommentTailLength = Samples.agg(mean("S1CommentTailLength")).head() - val S1_Mean = RoundDouble(Mean_S1CommentTailLength.getDouble(0)) - val lkpUDFS1 = udf { (i: Double) => if (i == 0) S1_Mean else i } - val df37 = df36.withColumn("FinalS1CommentTailLength", lkpUDFS1(col("S1CommentTailLength"))) - - //2. Double but Not ratio values : - val Mean_S2SimikaritySitelinkandLabel = Samples.agg(mean("S2SimikaritySitelinkandLabel")).head() - val S2_Mean = RoundDouble(Mean_S2SimikaritySitelinkandLabel.getDouble(0)) - val lkpUDFS2 = udf { (i: Double) => if (i == 0) S2_Mean else i } - val df39 = df37.withColumn("FinalS2SimikaritySitelinkandLabel", lkpUDFS2(col("S2SimikaritySitelinkandLabel"))) - - //3. Double but Not ratio values : - val Mean_S3SimilarityLabelandSitelink = Samples.agg(mean("S3SimilarityLabelandSitelink")).head() - val S3_Mean = RoundDouble(Mean_S3SimilarityLabelandSitelink.getDouble(0)) - val lkpUDFS3 = udf { (i: Double) => if (i == 0.0) S3_Mean else i } - val df40 = df39.withColumn("FinalS3SimilarityLabelandSitelink", lkpUDFS3(col("S3SimilarityLabelandSitelink"))) - - //4. Double but Not ratio values : - val Mean_S4SimilarityCommentComment = Samples.agg(mean("S4SimilarityCommentComment")).head() - val S4_Mean = RoundDouble(Mean_S4SimilarityCommentComment.getDouble(0)) - val lkpUDFS4 = udf { (i: Double) => if (i == 0.0) S4_Mean else i } - val df41 = df40.withColumn("FinalS4SimilarityCommentComment", lkpUDFS4(col("S4SimilarityCommentComment"))) - - //df41.show() - //************************************************End Sentences Features **************************************************************************************** - //*********************************************** Start Statement Features **************************************************************************************** - //1. String - //2. String - //3. String - //************************************************End Statement Features **************************************************************************************** - //*********************************************** Start User Features **************************************************************************************** - - //1.Boolean(Double) - //2.Boolean(Double) - //3.Boolean(Double) - //4.Boolean(Double) - //5.Boolean(Double) - //6.Boolean(Double) - //7. (Double) IP No need to fill Missing Data - //8. 
(Double) ID No need to fill Missing Data - //9.Boolean(Double) - //10.Boolean(Double) - - //*********************************************** End User Features **************************************************************************************** - //*********************************************** Start Item Features **************************************************************************************** - //1. Integer (Double) No need to fill missing values - //2. Integer (Double) No need to fill missing values - //3. Integer (Double) No need to fill missing values - //4. Integer (Double) No need to fill missing values - //5. Integer (Double) No need to fill missing values - //6. Integer (Double) No need to fill missing values - //7. Integer (Double) No need to fill missing values - //8. Integer (Double) No need to fill missing values - //9. Integer (Double) No need to fill missing values - //10. Integer (Double) No need to fill missing values - //11. String - //*********************************************** End Item Features **************************************************************************************** - //*********************************************** Start Revision Features **************************************************************************************** - //1.String - //2.String - //3.Boolean (Double) - //4.Integer(Double) - //5.String - //6.String - //7. Boolean(Double) - //8. String - //9.String - //10. Integer (Double) - //11.String - //12. integer(Double) - //13. Long(Double) - //14. integer (Double) - //15.String - //16.String - //*********************************************** End Revision Features **************************************************************************************** - //*********************************************** Meta Data , Truth Data and Frequnces **************************************************************************************** - //Meta - // 1.Revision Session :Integer (Converted to Double) - //2. 
User Country Code - //3.User Continent Code - //4.User Time Size - //5.User Region Code - //6.User-city Name - //7.User Country Name - //8.RevisionTags - + // Streaming records: + val jobConf = new JobConf() + val NormalXML_Parser_OBJ = new ParseNormalXML() + val RDD_OBJ = new ParseNormalXML() + + val Training_RDD_All_Record1 = RDD_OBJ.Training_DB_NormalXML_Parser_Input1(sc) + val Training_RDD_All_Record2 = RDD_OBJ.Training_DB_NormalXML_Parser_Input2(sc) + val Training_RDD_All_Record3 = RDD_OBJ.Training_DB_NormalXML_Parser_Input3(sc) + // RDD_All_Record1.foreach(println) + // RDD_All_Record2.foreach(println) + // RDD_All_Record3.foreach(println) + + val Training_RDD_All_Record = Training_RDD_All_Record1.union(Training_RDD_All_Record2).union(Training_RDD_All_Record3).distinct().cache() + + // println(RDD_All_Record.count()) + println(Training_RDD_All_Record.count()) + + // ======= Json part : + // Json RDD : Each record has its Revision iD: + val JsonRDD = Training_RDD_All_Record.map(_.split("NNLL")).map(v => replacing_with_Quoto(v(0), v(8))).cache() + // JsonRDD.foreach(println) + // println(JsonRDD.count()) + + // Data set + val Ds_Json = sqlContext.jsonRDD(JsonRDD).select("key", "id", "labels", "descriptions", "aliases", "claims", "sitelinks").cache() + // Ds_Json.show() + // println(Ds_Json.count()) + + // ======= Tags part : // Contributor IP here is in Decimal format not IP format and It is converted in ParseNormalXml stage + val TagsRDD = Training_RDD_All_Record.map(_.split("NNLL")).map(x => (x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9), x(10), x(11))).cache() + val DF_Tags = TagsRDD.toDF("Rid", "Itemid", "comment", "pid", "time", "contributorIP", + "contributorID", "contributorName", "JsonText", "model", "format", "sha").cache() + // DF_Tags.show() + // println(DF_Tags.count()) + + // ======== Join Json part with Tag Part:============================ + // Joining to have full data + val DF_First_DF_Result_Join_Tags_and_Json = DF_Tags.as("T1").join(Ds_Json.as("T2"), $"T1.Rid" === $"T2.key", "leftouter") + .select("Rid", "itemid", "comment", "pid", "time", "contributorIP", + "contributorID", "contributorName", "JsonText", "labels", "descriptions", + "aliases", "claims", "sitelinks", "model", "format", "sha") // .orderBy("Rid", "Itemid") + DF_First_DF_Result_Join_Tags_and_Json.registerTempTable("Data1") + val dfr_DATA_JsonTages1 = sqlContext.sql("select * from Data1 order by itemid ,Rid ").cache() + + val colNames = Seq("Rid2", "itemid2", "comment2", "pid2", "time2", "contributorIP2", + "contributorID2", "contributorName2", "JsonText2", "labels2", "descriptions2", + "aliases2", "claims2", "sitelinks2", "model2", "format2", "sha2") + val DF_Second = DF_First_DF_Result_Join_Tags_and_Json.toDF(colNames: _*) // .distinct() + DF_Second.registerTempTable("Data2") + + // ===================================================================Parent // Previous Revision============================================================================================================== + + // Joining based on Parent Id to get the previous cases: ParentID + val DF_Joined = DF_First_DF_Result_Join_Tags_and_Json.as("df1").join(DF_Second.as("df2"), $"df1.pid" === $"df2.Rid2", "leftouter").distinct() + + val RDD_After_JoinDF = DF_Joined.rdd.distinct() + val x = RDD_After_JoinDF.map(row => (row(0).toString().toInt, row)).cache() + val part = new RangePartitioner(4, x) + val partitioned = x.partitionBy(part).persist() // persist is important for this case and obligatory. 
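The keyed pairing and RangePartitioner step above can be reduced to the sketch below (toy key/value pairs and a local master, not the production configuration); it also spells out why the persist() on the line above matters: the partitioner samples the RDD to choose key ranges, so without caching the upstream lineage would be recomputed for the sampling pass and again for the shuffle.

// Toy version of keying rows by revision id and range-partitioning them.
import org.apache.spark.{RangePartitioner, SparkConf, SparkContext}

object RangePartitionSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("range-partition"))

    // key each record by its revision id, as done with row(0).toString.toInt above
    val byRid = sc.parallelize(Seq(104 -> "rev-104", 101 -> "rev-101", 103 -> "rev-103", 102 -> "rev-102")).cache()

    // RangePartitioner samples the RDD to pick key ranges; the RDD is evaluated
    // here and again by partitionBy, so caching it avoids recomputation
    val partitioner = new RangePartitioner(2, byRid)
    val partitioned = byRid.partitionBy(partitioner).persist()

    // keys now land in contiguous ranges per partition
    partitioned.mapPartitionsWithIndex { (idx, it) => it.map(kv => s"partition $idx -> $kv") }
      .collect().foreach(println)

    sc.stop()
  }
}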
+ // partitioned.foreach(println) + // + // //=====================================================All Features Based on Categories of Features Data Type :================================================================================== + // + val Result_all_Features = partitioned.map { case (x, y) => (x.toString() + "," + All_Features(y).toString()) } // we convert the Pair RDD to String one LineRDD to be able to make DF based on "," + // Result_all_Features.foreach(println) + // println("nayef" + Result_all_Features.count()) + + // Conver the RDD of All Features to DataFrame: + + val schema = StructType( + + // 0 + StructField("Rid", IntegerType, false) :: + + // Character Features : + /* 1 */ StructField("C1uppercaseratio", DoubleType, false) :: /* 2 */ StructField("C2lowercaseratio", DoubleType, false) :: /* 3 */ StructField("C3alphanumericratio", DoubleType, false) :: + /* 4 */ StructField("C4asciiratio", DoubleType, false) :: /* 5 */ StructField("C5bracketratio", DoubleType, false) :: /* 6 */ StructField("C6digitalratio", DoubleType, false) :: + /* 7 */ StructField("C7latinratio", DoubleType, false) :: /* 8 */ StructField("C8whitespaceratio", DoubleType, false) :: /* 9 */ StructField("C9puncratio", DoubleType, false) :: + /* 10 */ StructField("C10longcharacterseq", DoubleType, false) :: /* 11 */ StructField("C11arabicratio", DoubleType, false) :: /* 12 */ StructField("C12bengaliratio", DoubleType, false) :: + /* 13 */ StructField("C13brahmiratio", DoubleType, false) :: /* 14 */ StructField("C14cyrilinratio", DoubleType, false) :: /* 15 */ StructField("C15hanratio", DoubleType, false) :: + /* 16 */ StructField("c16malysiaratio", DoubleType, false) :: /* 17 */ StructField("C17tamiratio", DoubleType, false) :: /* 18 */ StructField("C18telugratio", DoubleType, false) :: + /* 19 */ StructField("C19symbolratio", DoubleType, false) :: /* 20 */ StructField("C20alpharatio", DoubleType, false) :: /* 21 */ StructField("C21visibleratio", DoubleType, false) :: + /* 22 */ StructField("C22printableratio", DoubleType, false) :: /* 23 */ StructField("C23blankratio", DoubleType, false) :: /* 24 */ StructField("C24controlratio", DoubleType, false) :: + /* 25 */ StructField("C25hexaratio", DoubleType, false) :: + + // word Features: + /* 26 */ StructField("W1languagewordratio", DoubleType, false) :: /* 27 Boolean */ StructField("W2Iscontainlanguageword", DoubleType, false) :: /* 28 */ StructField("W3lowercaseratio", DoubleType, false) :: + /* 29 Integer */ StructField("W4longestword", IntegerType, false) :: /* 30 Boolean */ StructField("W5IscontainURL", DoubleType, false) :: /* 31 */ StructField("W6badwordratio", DoubleType, false) :: + /* 32 */ StructField("W7uppercaseratio", DoubleType, false) :: /* 33 */ StructField("W8banwordratio", DoubleType, false) :: /* 34 Boolean */ StructField("W9FemalFirstName", DoubleType, false) :: + /* 35 Boolean */ StructField("W10MaleFirstName", DoubleType, false) :: /* 36 Boolean */ StructField("W11IscontainBadword", DoubleType, false) :: + /* 37 Boolean */ StructField("W12IsContainBanword", DoubleType, false) :: + /* 38 integer */ StructField("W13NumberSharewords", DoubleType, false) :: /* 39 Integer */ StructField("W14NumberSharewordswithoutStopwords", DoubleType, false) :: + /* 40 */ StructField("W15PortionQid", DoubleType, false) :: /* 41 */ StructField("W16PortionLnags", DoubleType, false) :: /* 42 */ StructField("W17PortionLinks", DoubleType, false) :: + + // + // // Sentences Features: + /* 43 */ StructField("S1CommentTailLength", DoubleType, false) :: /* 44 */ 
StructField("S2SimikaritySitelinkandLabel", DoubleType, false) :: + /* 45 */ StructField("S3SimilarityLabelandSitelink", DoubleType, false) :: /* 46 */ StructField("S4SimilarityCommentComment", DoubleType, false) :: + // + // // Statements Features : + /* 47 */ StructField("SS1Property", StringType, false) :: /* 48 */ StructField("SS2DataValue", StringType, false) :: /* 49 */ StructField("SS3ItemValue", StringType, false) :: + // + // + // // User Features : + /* 50 Boolean */ StructField("U1IsPrivileged", DoubleType, false) :: /* 51 Boolean */ StructField("U2IsBotUser", DoubleType, false) :: /* 52 Boolean */ StructField("U3IsBotuserWithFlaguser", DoubleType, false) :: + /* 53 Boolean */ StructField("U4IsProperty", DoubleType, false) :: /* 54 Boolean */ StructField("U5IsTranslator", DoubleType, false) :: /* 55 Boolean */ StructField("U6IsRegister", DoubleType, false) :: + /* 56 */ StructField("U7IPValue", DoubleType, false) :: /* 57 */ StructField("U8UserID", IntegerType, false) :: /* 58 */ StructField("U9HasBirthDate", DoubleType, false) :: + /* 59 */ StructField("U10HasDeathDate", DoubleType, false) :: + + // Items Features : + + /* 60 */ StructField("I1NumberLabels", DoubleType, false) :: /* 61 */ StructField("I2NumberDescription", DoubleType, false) :: /* 62 */ StructField("I3NumberAliases", DoubleType, false) :: + /* 63 */ StructField("I4NumberClaims", DoubleType, false) :: + /* 64 */ StructField("I5NumberSitelinks", DoubleType, false) :: /* 65 */ StructField("I6NumberStatement", DoubleType, false) :: /* 66 */ StructField("I7NumberReferences", DoubleType, false) :: + /* 67 */ StructField("I8NumberQualifier", DoubleType, false) :: + /* 68 */ StructField("I9NumberQualifierOrder", DoubleType, false) :: /* 69 */ StructField("I10NumberBadges", DoubleType, false) :: /* 70 */ StructField("I11ItemTitle", StringType, false) :: + + // Revision Features: + /* 71 */ StructField("R1languageRevision", StringType, false) :: /* 72 */ StructField("R2RevisionLanguageLocal", StringType, false) :: /* 73 */ StructField("R3IslatainLanguage", DoubleType, false) :: + /* 74 */ StructField("R4JsonLength", DoubleType, false) :: /* 75 */ StructField("R5RevisionAction", StringType, false) :: /* 76 */ StructField("R6PrevReviAction", StringType, false) :: + /* 77 */ StructField("R7RevisionAccountChange", DoubleType, false) :: /* 78 */ StructField("R8ParRevision", StringType, false) :: /* 79 */ StructField("R9RevisionTime", StringType, false) :: + /* 80 */ StructField("R10RevisionSize", DoubleType, false) :: /* 81 */ StructField("R11ContentType", StringType, false) :: /* 82 */ StructField("R12BytesIncrease", DoubleType, false) :: + /* 83 */ StructField("R13TimeSinceLastRevi", DoubleType, false) :: /* 84 */ StructField("R14CommentLength", DoubleType, false) :: /* 85 */ StructField("R15RevisionSubaction", StringType, false) :: + /* 86 */ StructField("R16PrevReviSubaction", StringType, false) :: + + Nil) + + val rowRDD = Result_all_Features.map(line => line.split(",")).map(e ⇒ Row(e(0).toInt // character feature column + , e(1).toDouble, e(2).toDouble, e(3).toDouble, e(4).toDouble, e(5).toDouble, e(6).toDouble, e(7).toDouble, e(8).toDouble, e(9).toDouble, RoundDouble(e(10).toDouble), e(11).toDouble, e(12).toDouble // + , e(13).toDouble, e(14).toDouble, e(15).toDouble, e(16).toDouble, e(17).toDouble, e(18).toDouble, e(19).toDouble, e(20).toDouble, e(21).toDouble, e(22).toDouble // + , e(23).toDouble, e(24).toDouble, e(25).toDouble // Word Feature column + , e(26).toDouble, e(27).toDouble, e(28).toDouble, 
e(29).toDouble.toInt, e(30).toDouble, e(31).toDouble, e(32).toDouble, e(33).toDouble, e(34).toDouble, e(35).toDouble, e(36).toDouble, e(37).toDouble // + , RoundDouble(e(38).toDouble), RoundDouble(e(39).toDouble), e(40).toDouble, e(41).toDouble, e(42).toDouble // Sentences Features column: + , RoundDouble(e(43).toDouble), e(44).toDouble, e(45).toDouble, e(46).toDouble // Statement Features Column: + , e(47), e(48), e(49) // User Features Column: + , e(50).toDouble, e(51).toDouble, e(52).toDouble, e(53).toDouble, e(54).toDouble, e(55).toDouble, e(56).toDouble, e(57).toDouble.toInt, e(58).toDouble, e(59).toDouble // Item Features column: + , e(60).toDouble, e(61).toDouble, e(62).toDouble, e(63).toDouble, e(64).toDouble, e(65).toDouble, e(66).toDouble, e(67).toDouble, e(68).toDouble // + , e(69).toDouble, "Q" + e(70).toDouble.toInt.toString() // Revision Features Column: + , e(71), e(72), e(73).toDouble, e(74).toDouble, e(75), e(76), e(77).toDouble, e(78), e(79), e(80).toDouble, e(81), e(82).toDouble, e(83).toDouble, e(84).toDouble, e(85), e(86))) + + // a.User Frequency: + // number of revisions a user has contributed + // val resu= DF_Tags.groupBy("contributorID").agg(count("Rid")) + DF_Tags.registerTempTable("TagesTable") + val ContributorFreq_for_Each_Revision_DF = sqlContext + .sql("select contributorID as CIDUSER1, count(Rid) as NumberofRevisionsUserContributed from TagesTable where contributorID !='0' group by contributorID ") // .drop("CIDUSER1") + // ContributorFreq_for_Each_Revision_DF.show() + + // b.Cumulated : Number of a unique Item a user has contributed. + val CumulatedNumberof_uniqueItemsForUser_DF = sqlContext + .sql("select contributorID as CIDUSER2, COUNT(DISTINCT itemid) as NumberofUniqueItemsUseredit from TagesTable where contributorID !='0' group by contributorID") // .drop("CIDUSER2") + // CumulatedNumberof_uniqueItemsForUser_DF.show() + + // 1.Item Frequency: + // number of revisions an Item has + val ItemFrequ_DF = sqlContext.sql("select itemid, count(Rid) as NumberRevisionItemHas from TagesTable group by itemid") + // ItemFrequ_DF.show() + + // 2. Cumulate number of unique users have edited the Item : Did not consider the users IP. Contributor is an IP or Name. we consider name + val CumulatedNumberof_UniqueUserForItem_DF = sqlContext.sql("select itemid, COUNT(DISTINCT contributorID) as NumberUniqUserEditItem from TagesTable where contributorID !='0' group by itemid") + // CumulatedNumberof_UniqueUserForItem_DF.show() + + // 3. 
freq each Item : + val Fre_Item_DF = sqlContext.sql("select itemid, COUNT(itemid) as FreqItem from TagesTable group by itemid") + // Fre_Item_DF.show() + + // ***************************************************************************************************************************************** + // This is Main DataFrame: + val BeforeJoin_All_Features = sqlContext.createDataFrame(rowRDD, schema) + // BeforeJoin_All_Features.show() + + // ********************************** User feature Join + + // Join1 for add The first User Feature : number of revisions a user has contributed + val AfterJoinUser1_All_Features = BeforeJoin_All_Features.as("T1").join(ContributorFreq_for_Each_Revision_DF.as("T2"), $"T1.U8UserID" === $"T2.CIDUSER1", "leftouter").drop("CIDUSER1") + // AfterJoinUser1_All_Features.show() + + // Join2 for add The second User Feature + val AfterJoinUser2_All_Features = AfterJoinUser1_All_Features.as("T1").join(CumulatedNumberof_uniqueItemsForUser_DF.as("T2"), $"T1.U8UserID" === $"T2.CIDUSER2", "leftouter").drop("CIDUSER2") + // AfterJoinUser2_All_Features.show() + + // ********************************** Item Feature Join + // Join3 for add The First Item Feature :number of revisions an Item has + val AfterJoinItem3_All_Features = AfterJoinUser2_All_Features.as("T1").join(ItemFrequ_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") + // AfterJoinItem3_All_Features.show() + + // Join4 for add The Second Item Feature + val AfterJoinItem4_All_Features = AfterJoinItem3_All_Features.as("T1").join(CumulatedNumberof_UniqueUserForItem_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") + // AfterJoinItem4_All_Features.show() + + // Join5 for add The Third Item Feature + val AfterJoinItem5_All_Features = AfterJoinItem4_All_Features.as("T1").join(Fre_Item_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") + // 2 AfterJoinItem5_All_Features.show() + + // ******************************** + + // *Geografical information Feature from Meta File + // REVISION_ID|REVISION_SESSION_ID|USER_COUNTRY_CODE|USER_CONTINENT_CODE|USER_TIME_ZONE|USER_REGION_CODE|USER_CITY_NAME|USER_COUNTY_NAME|REVISION_TAGS + val df_GeoInf = sqlContext.read + .format("com.databricks.spark.csv") + .option("header", "true") // Use first line of all files as header + .option("inferSchema", "true") // Automatically infer data types + .load("hdfs://localhost:9000/mydata/Meta.csv").select("REVISION_ID", "REVISION_SESSION_ID", "USER_COUNTRY_CODE", "USER_CONTINENT_CODE", "USER_TIME_ZONE", + "USER_REGION_CODE", "USER_CITY_NAME", "USER_COUNTY_NAME", "REVISION_TAGS") + // df_GeoInf.show() + + val df_Truth = sqlContext.read + .format("com.databricks.spark.csv") + .option("header", "true") // Use first line of all files as header + .option("inferSchema", "true") // Automatically infer data types + .load("hdfs://localhost:9000/mydata/truth.csv").select("REVISION_ID", "ROLLBACK_REVERTED", "UNDO_RESTORE_REVERTED") + // df_GeoInf.show() + + val AfterJoinGeoInfo_All_Features = AfterJoinItem5_All_Features.as("T1").join(df_GeoInf.as("T2"), $"T1.Rid" === $"T2.REVISION_ID", "leftouter").drop("REVISION_ID").cache() + // AfterJoinGeoInfo_All_Features.show() + + val Final_All_Features = AfterJoinGeoInfo_All_Features.as("T1").join(df_Truth.as("T2"), $"T1.Rid" === $"T2.REVISION_ID", "leftouter").drop("REVISION_ID").cache() + // Final_All_Features.show() + + // Pre- process Data 
============================================================================================================================================================ + + // For String Column, We fill the Null values by "NA": + + var Fill_Missing_Final_All_Features = Final_All_Features.na.fill("NA", Seq("USER_COUNTRY_CODE", "USER_CONTINENT_CODE", "USER_TIME_ZONE", + "USER_REGION_CODE", "USER_CITY_NAME", "USER_COUNTY_NAME", "REVISION_TAGS")).cache() + + // For Integer Frequency Column, We fill the Null values by 0: + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.na.fill(0, Seq("FreqItem", "NumberUniqUserEditItem", "NumberRevisionItemHas", "NumberofUniqueItemsUseredit", + "NumberofRevisionsUserContributed", "REVISION_SESSION_ID")).cache() + // Fill_Missing_Final_All_Features.show() + + val BoolToDoubleUDF = udf { (BoolAsString: String) => if (BoolAsString == "T") 1.0 else 0.0 } + val IntegerToDouble = udf { (IntegerRevisionSessionID: Integer) => IntegerRevisionSessionID.toDouble } + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalROLLBACK_REVERTED", BoolToDoubleUDF(col("ROLLBACK_REVERTED"))) + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalUNDO_RESTORE_REVERTED", BoolToDoubleUDF(col("UNDO_RESTORE_REVERTED"))) + + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalREVISION_SESSION_ID", IntegerToDouble(col("REVISION_SESSION_ID"))) + + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberofRevisionsUserContributed", IntegerToDouble(col("NumberofRevisionsUserContributed"))) + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberofUniqueItemsUseredit", IntegerToDouble(col("NumberofUniqueItemsUseredit"))) + + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberRevisionItemHas", IntegerToDouble(col("NumberRevisionItemHas"))) + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberUniqUserEditItem", IntegerToDouble(col("NumberUniqUserEditItem"))) + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalFreqItem", IntegerToDouble(col("FreqItem"))) + + // ===========================================================================Caharacter Features : Double , Integer Features ======================================================== + // Double Ratio: For Ratio Double column, Fill -1 value by Median:Character Features + Ratio of Word Features : + var Samples = Fill_Missing_Final_All_Features.sample(false, 0.001).cache() // .where($"S2SimikaritySitelinkandLabel">0.0 || $"S3SimilarityLabelandSitelink">0.0 || $"S4SimilarityCommentComment">0.0) + Samples.registerTempTable("df") + + val Query = "select " + + "percentile_approx(C1uppercaseratio, 0.5) as meadian1" + "," + "percentile_approx(C2lowercaseratio, 0.5) as median2" + " ," + + "percentile_approx(C3alphanumericratio, 0.5) as median3" + "," + "percentile_approx(C4asciiratio, 0.5) as median4" + "," + + "percentile_approx(C5bracketratio, 0.5) as median5" + "," + "percentile_approx(C6digitalratio, 0.5) as median6" + "," + + "percentile_approx(C7latinratio, 0.5) as median7" + "," + "percentile_approx(C8whitespaceratio, 0.5) as median8" + "," + + "percentile_approx(C9puncratio, 0.5) as median9" + "," + "percentile_approx(C11arabicratio, 0.5) as median11" + "," + + "percentile_approx(C12bengaliratio, 0.5) as median12" + "," + "percentile_approx(C13brahmiratio, 0.5) as median13" 
+ "," + + "percentile_approx(C14cyrilinratio, 0.5) as median14" + "," + "percentile_approx(C15hanratio, 0.5) as median15" + "," + + "percentile_approx(c16malysiaratio, 0.5) as median16" + "," + + "percentile_approx(C17tamiratio, 0.5) as median17" + "," + "percentile_approx(C18telugratio, 0.5) as median18" + "," + + "percentile_approx(C19symbolratio, 0.5) as median19" + "," + "percentile_approx(C20alpharatio, 0.5) as median20" + "," + + "percentile_approx(C21visibleratio, 0.5) as median21" + "," + "percentile_approx(C22printableratio, 0.5) as median22" + "," + + "percentile_approx(C23blankratio, 0.5) as median23" + "," + "percentile_approx(C24controlratio, 0.5) as median24" + "," + + "percentile_approx(C25hexaratio, 0.5) as median25" ++ "," + "percentile_approx(W1languagewordratio, 0.5) as median26" + "," + + "percentile_approx(W3lowercaseratio, 0.5) as median27" + "," + "percentile_approx(W6badwordratio, 0.5) as median28" + "," + + "percentile_approx(W7uppercaseratio, 0.5) as median27" + "," + "percentile_approx(W8banwordratio, 0.5) as median27" + " from df" + + val medianValues = sqlContext.sql(Query).rdd + val Median = medianValues.first() + + // Median : + // Character Ratio Features: UDF + val lkpUDF1 = udf { (i: Double) => if (i == 0) Median(0).toString().toDouble else i } + val lkpUDF2 = udf { (i: Double) => if (i == 0) Median(1).toString().toDouble else i } + val lkpUDF3 = udf { (i: Double) => if (i == 0) Median(2).toString().toDouble else i } + val lkpUDF4 = udf { (i: Double) => if (i == 0) Median(3).toString().toDouble else i } + val lkpUDF5 = udf { (i: Double) => if (i == 0) Median(4).toString().toDouble else i } + val lkpUDF6 = udf { (i: Double) => if (i == 0) Median(5).toString().toDouble else i } + val lkpUDF7 = udf { (i: Double) => if (i == 0) Median(6).toString().toDouble else i } + val lkpUDF8 = udf { (i: Double) => if (i == 0) Median(7).toString().toDouble else i } + val lkpUDF9 = udf { (i: Double) => if (i == 0) Median(8).toString().toDouble else i } + + val lkpUDF11 = udf { (i: Double) => if (i == 0) Median(9).toString().toDouble else i } + val lkpUDF12 = udf { (i: Double) => if (i == 0) Median(10).toString().toDouble else i } + val lkpUDF13 = udf { (i: Double) => if (i == 0) Median(11).toString().toDouble else i } + val lkpUDF14 = udf { (i: Double) => if (i == 0) Median(12).toString().toDouble else i } + val lkpUDF15 = udf { (i: Double) => if (i == 0) Median(13).toString().toDouble else i } + val lkpUDF16 = udf { (i: Double) => if (i == 0) Median(14).toString().toDouble else i } + val lkpUDF17 = udf { (i: Double) => if (i == 0) Median(15).toString().toDouble else i } + val lkpUDF18 = udf { (i: Double) => if (i == 0) Median(16).toString().toDouble else i } + val lkpUDF19 = udf { (i: Double) => if (i == 0) Median(17).toString().toDouble else i } + val lkpUDF20 = udf { (i: Double) => if (i == 0) Median(18).toString().toDouble else i } + val lkpUDF21 = udf { (i: Double) => if (i == 0) Median(19).toString().toDouble else i } + val lkpUDF22 = udf { (i: Double) => if (i == 0) Median(20).toString().toDouble else i } + val lkpUDF23 = udf { (i: Double) => if (i == 0) Median(21).toString().toDouble else i } + val lkpUDF24 = udf { (i: Double) => if (i == 0) Median(22).toString().toDouble else i } + val lkpUDF25 = udf { (i: Double) => if (i == 0) Median(23).toString().toDouble else i } + + val df1 = Fill_Missing_Final_All_Features.withColumn("FinalC1uppercaseratio", lkpUDF1(col("C1uppercaseratio"))) // .drop("C1uppercaseratio").cache() + val df2 = 
df1.withColumn("FinalC2lowercaseratio", lkpUDF2(col("C2lowercaseratio"))) // .drop("C2lowercaseratio").cache() + // df1.unpersist() + val df3 = df2.withColumn("FinalC3alphanumericratio", lkpUDF3(col("C3alphanumericratio"))) // .drop("C3alphanumericratio").cache() + // df2.unpersist() + val df4 = df3.withColumn("FinalC4asciiratio", lkpUDF4(col("C4asciiratio"))) // .drop("C4asciiratio").cache() + // df3.unpersist() + val df5 = df4.withColumn("FinalC5bracketratio", lkpUDF5(col("C5bracketratio"))) // .drop("C5bracketratio").cache() + // df4.unpersist() + val df6 = df5.withColumn("FinalC6digitalratio", lkpUDF6(col("C6digitalratio"))) // .drop("C6digitalratio").cache() + // df5.unpersist() + val df7 = df6.withColumn("FinalC7latinratio", lkpUDF7(col("C7latinratio"))) // .drop("C7latinratio").cache() + // df6.unpersist() + val df8 = df7.withColumn("FinalC8whitespaceratio", lkpUDF8(col("C8whitespaceratio"))) // .drop("C8whitespaceratio").cache() + // df7.unpersist() + val df9 = df8.withColumn("FinalC9puncratio", lkpUDF9(col("C9puncratio"))) // .drop("C9puncratio").cache() + + // Mean : + // character integer values : + val Mean_C10longcharacterseq = Samples.agg(mean("C10longcharacterseq")).head() + val C10_Mean = Mean_C10longcharacterseq.getDouble(0) + val lkpUDFC10 = udf { (i: Double) => if (i == 0) C10_Mean else i } + val df10 = df9.withColumn("FinalC10longcharacterseq", lkpUDFC10(col("C10longcharacterseq"))) + + // Median + val df11 = df10.withColumn("FinalC11arabicratio", lkpUDF11(col("C11arabicratio"))) // .drop("C11arabicratio").cache() + // df9.unpersist() + val df12 = df11.withColumn("FinalC12bengaliratio", lkpUDF12(col("C12bengaliratio"))) // .drop("C12bengaliratio").cache() + // df11.unpersist() + val df13 = df12.withColumn("FinalC13brahmiratio", lkpUDF13(col("C13brahmiratio"))) // .drop("C13brahmiratio").cache() + // df12.unpersist() + val df14 = df13.withColumn("FinalC14cyrilinratio", lkpUDF14(col("C14cyrilinratio"))) // .drop("C14cyrilinratio").cache() + // df13.unpersist() + val df15 = df14.withColumn("FinalC15hanratio", lkpUDF15(col("C15hanratio"))) // .drop("C15hanratio").cache() + // df14.unpersist() + val df16 = df15.withColumn("Finalc16malysiaratio", lkpUDF16(col("c16malysiaratio"))) // .drop("c16malysiaratio").cache() + // df15.unpersist() + val df17 = df16.withColumn("FinalC17tamiratio", lkpUDF17(col("C17tamiratio"))) // .drop("C17tamiratio").cache() + // df16.unpersist() + val df18 = df17.withColumn("FinalC18telugratio", lkpUDF18(col("C18telugratio"))) // .drop("C18telugratio").cache() + // df17.unpersist() + val df19 = df18.withColumn("FinalC19symbolratio", lkpUDF19(col("C19symbolratio"))) // .drop("C19symbolratio").cache() + // df18.unpersist() + val df20 = df19.withColumn("FinalC20alpharatio", lkpUDF20(col("C20alpharatio"))) // .drop("C20alpharatio").cache() + // df19.unpersist() + val df21 = df20.withColumn("FinalC21visibleratio", lkpUDF21(col("C21visibleratio"))) // .drop("C21visibleratio").cache() + // df20.unpersist() + val df22 = df21.withColumn("FinalC22printableratio", lkpUDF22(col("C22printableratio"))) // .drop("C22printableratio").cache() + // df21.unpersist() + val df23 = df22.withColumn("FinalC23blankratio", lkpUDF23(col("C23blankratio"))) // .drop("C23blankratio").cache() + // df22.unpersist() + val df24 = df23.withColumn("FinalC24controlratio", lkpUDF24(col("C24controlratio"))) // .drop("C24controlratio").cache() + // df23.unpersist() + val df25 = df24.withColumn("FinalC25hexaratio", lkpUDF25(col("C25hexaratio"))) // .drop("C25hexaratio").cache() + + // 
************************************************End Character Features **************************************************************************************** + + // ************************************************Start Word Features **************************************************************************************** + + // Word Ratio Features : UDF + val lkpUDFW1 = udf { (i: Double) => if (i == 0) Median(24).toString().toDouble else i } + val lkpUDFW3 = udf { (i: Double) => if (i == 0) Median(25).toString().toDouble else i } + val lkpUDFW6 = udf { (i: Double) => if (i == 0) Median(26).toString().toDouble else i } + val lkpUDFW7 = udf { (i: Double) => if (i == 0) Median(27).toString().toDouble else i } + val lkpUDFW8 = udf { (i: Double) => if (i == 0) Median(28).toString().toDouble else i } + + // 1. + val df26 = df25.withColumn("FinalW1languagewordratio", lkpUDFW1(col("W1languagewordratio"))) // .drop("W1languagewordratio").cache() + + // 2.Boolean(Double) IsContainLanguageWord + + // 3. + val df27 = df26.withColumn("FinalW3lowercaseratio", lkpUDFW3(col("W3lowercaseratio"))) // .drop("W3lowercaseratio").cache() + // df26.unpersist() + + // 4. Integer " Mean: + val Mean_W4longestword = Samples.agg(mean("W4longestword")).head() + val W4_Mean = Mean_W4longestword.getDouble(0) + val lkpUDFW4 = udf { (i: Double) => if (i == 0) W4_Mean else i } + val df28 = df27.withColumn("FinalW4longestword", lkpUDFW4(col("W4longestword"))) + + // 5. Boolean (Double ) W5IscontainURL + // 6. + val df29 = df28.withColumn("FinalW6badwordratio", lkpUDFW6(col("W6badwordratio"))) // .drop("W6badwordratio").cache() + + // 7. + val df30 = df29.withColumn("FinalW7uppercaseratio", lkpUDFW7(col("W7uppercaseratio"))) // .drop("W7uppercaseratio").cache() + + // 8. + val df31 = df30.withColumn("FinalW8banwordratio", lkpUDFW8(col("W8banwordratio"))) // .drop("W8banwordratio").cache() + + // 9.FemalFirst Boolean(Double) + // 10.Male First Boolean(Double) + // 11.ContainBadWord Boolean(Double) + // 12ContainBanWord Boolean(Double) + + // 13. Integer(Double): + val Mean_W13W13NumberSharewords = Samples.agg(mean("W13NumberSharewords")).head() + val W13_Mean = Mean_W13W13NumberSharewords.getDouble(0) + val lkpUDFW13 = udf { (i: Double) => if (i == 0) W13_Mean else i } + val df32 = df31.withColumn("FinalW13NumberSharewords", lkpUDFW13(col("W13NumberSharewords"))) + + // 14. Integer (Double): + val Mean_W14NumberSharewordswithoutStopwords = Samples.agg(mean("W14NumberSharewordswithoutStopwords")).head() + val W14_Mean = Mean_W14NumberSharewordswithoutStopwords.getDouble(0) + val lkpUDFW14 = udf { (i: Double) => if (i == 0) W14_Mean else i } + val df33 = df32.withColumn("FinalW14NumberSharewordswithoutStopwords", lkpUDFW14(col("W14NumberSharewordswithoutStopwords"))) + + // 15. Double (Not ratio): + val Mean_W15PortionQid = Samples.agg(mean("W15PortionQid")).head() + val W15_Mean = Mean_W15PortionQid.getDouble(0) + val lkpUDFW15 = udf { (i: Double) => if (i == 0) W15_Mean else i } + val df34 = df33.withColumn("FinalW15PortionQid", lkpUDFW15(col("W15PortionQid"))) + + // 16. 
Double(Not Ratio): + val Mean_W16PortionLnags = Samples.agg(mean("W16PortionLnags")).head() + val W16_Mean = Mean_W16PortionLnags.getDouble(0) + val lkpUDFW16 = udf { (i: Double) => if (i == 0) W16_Mean else i } + val df35 = df34.withColumn("FinalW16PortionLnags", lkpUDFW16(col("W16PortionLnags"))) + + // 17.Double(Not ratio): + val Mean_W17PortionLinks = Samples.agg(mean("W17PortionLinks")).head() + val W17_Mean = Mean_W17PortionLinks.getDouble(0) + val lkpUDFW17 = udf { (i: Double) => if (i == 0) W17_Mean else i } + val df36 = df35.withColumn("FinalW17PortionLinks", lkpUDFW17(col("W17PortionLinks"))) + + // ************************************************End Word Features **************************************************************************************** + + // ************************************************Start Sentences Features **************************************************************************************** + // 1. Integer(Double) + val Mean_S1CommentTailLength = Samples.agg(mean("S1CommentTailLength")).head() + val S1_Mean = RoundDouble(Mean_S1CommentTailLength.getDouble(0)) + val lkpUDFS1 = udf { (i: Double) => if (i == 0) S1_Mean else i } + val df37 = df36.withColumn("FinalS1CommentTailLength", lkpUDFS1(col("S1CommentTailLength"))) + + // 2. Double but Not ratio values : + val Mean_S2SimikaritySitelinkandLabel = Samples.agg(mean("S2SimikaritySitelinkandLabel")).head() + val S2_Mean = RoundDouble(Mean_S2SimikaritySitelinkandLabel.getDouble(0)) + val lkpUDFS2 = udf { (i: Double) => if (i == 0) S2_Mean else i } + val df39 = df37.withColumn("FinalS2SimikaritySitelinkandLabel", lkpUDFS2(col("S2SimikaritySitelinkandLabel"))) + + // 3. Double but Not ratio values : + val Mean_S3SimilarityLabelandSitelink = Samples.agg(mean("S3SimilarityLabelandSitelink")).head() + val S3_Mean = RoundDouble(Mean_S3SimilarityLabelandSitelink.getDouble(0)) + val lkpUDFS3 = udf { (i: Double) => if (i == 0.0) S3_Mean else i } + val df40 = df39.withColumn("FinalS3SimilarityLabelandSitelink", lkpUDFS3(col("S3SimilarityLabelandSitelink"))) + + // 4. Double but Not ratio values : + val Mean_S4SimilarityCommentComment = Samples.agg(mean("S4SimilarityCommentComment")).head() + val S4_Mean = RoundDouble(Mean_S4SimilarityCommentComment.getDouble(0)) + val lkpUDFS4 = udf { (i: Double) => if (i == 0.0) S4_Mean else i } + val df41 = df40.withColumn("FinalS4SimilarityCommentComment", lkpUDFS4(col("S4SimilarityCommentComment"))) + + // df41.show() + // ************************************************End Sentences Features **************************************************************************************** + // *********************************************** Start Statement Features **************************************************************************************** + // 1. String + // 2. String + // 3. String + // ************************************************End Statement Features **************************************************************************************** + // *********************************************** Start User Features **************************************************************************************** + + // 1.Boolean(Double) + // 2.Boolean(Double) + // 3.Boolean(Double) + // 4.Boolean(Double) + // 5.Boolean(Double) + // 6.Boolean(Double) + // 7. (Double) IP No need to fill Missing Data + // 8. 
(Double) ID No need to fill Missing Data + // 9.Boolean(Double) + // 10.Boolean(Double) + + // *********************************************** End User Features **************************************************************************************** + // *********************************************** Start Item Features **************************************************************************************** + // 1. Integer (Double) No need to fill missing values + // 2. Integer (Double) No need to fill missing values + // 3. Integer (Double) No need to fill missing values + // 4. Integer (Double) No need to fill missing values + // 5. Integer (Double) No need to fill missing values + // 6. Integer (Double) No need to fill missing values + // 7. Integer (Double) No need to fill missing values + // 8. Integer (Double) No need to fill missing values + // 9. Integer (Double) No need to fill missing values + // 10. Integer (Double) No need to fill missing values + // 11. String + // *********************************************** End Item Features **************************************************************************************** + // *********************************************** Start Revision Features **************************************************************************************** + // 1.String + // 2.String + // 3.Boolean (Double) + // 4.Integer(Double) + // 5.String + // 6.String + // 7. Boolean(Double) + // 8. String + // 9.String + // 10. Integer (Double) + // 11.String + // 12. integer(Double) + // 13. Long(Double) + // 14. integer (Double) + // 15.String + // 16.String + // *********************************************** End Revision Features **************************************************************************************** + // *********************************************** Meta Data , Truth Data and Frequnces **************************************************************************************** + // Meta + // 1.Revision Session :Integer (Converted to Double) + // 2. 
User Country Code + // 3.User Continent Code + // 4.User Time Size + // 5.User Region Code + // 6.User-city Name + // 7.User Country Name + // 8.RevisionTags + + // Truth: + // 1.Undo + + // Freq : + + // 1.5 features + + // Roll Boolean :Boolean (Double) + // Undo :Boolean (Double) + + // *********************************************** End Revision Features **************************************************************************************** + + // ===========================================================================String Features==================================================================================== + + val df42 = df41.withColumn( + // statement String features: + "StringFeatures", concat($"SS1Property", lit(";"), $"SS2DataValue", lit(";"), $"SS3ItemValue", lit(";"), $"I11ItemTitle", + // Revision String Features: + lit(";"), $"R1languageRevision", + lit(";"), $"R2RevisionLanguageLocal", + lit(";"), $"R5RevisionAction", + lit(";"), $"R6PrevReviAction", + lit(";"), $"R8ParRevision", + lit(";"), $"R9RevisionTime", + lit(";"), $"R11ContentType", + lit(";"), $"R15RevisionSubaction", + lit(";"), $"R16PrevReviSubaction", + + lit(";"), $"USER_COUNTRY_CODE", + lit(";"), $"USER_CONTINENT_CODE", + lit(";"), $"USER_TIME_ZONE", + lit(";"), $"USER_REGION_CODE", + lit(";"), $"USER_CITY_NAME", + lit(";"), $"USER_COUNTY_NAME", + lit(";"), $"REVISION_TAGS")) + + val toArray = udf((record: String) => record.split(";").map(_.toString())) + val test1 = df42.withColumn("StringFeatures", toArray(col("StringFeatures"))) + // test1.show() + // test1.printSchema() + + val word2Vec = new Word2Vec().setInputCol("StringFeatures").setOutputCol("result").setVectorSize(20).setMinCount(0) + val model = word2Vec.fit(test1) + val result = model.transform(test1) // .rdd + + // result.show() + + val Todense = udf((b: Vector) => b.toDense) + val test_new2 = result.withColumn("result", Todense(col("result"))) + + val assembler = new VectorAssembler().setInputCols(Array( + "result", + + // character + "FinalC1uppercaseratio", "FinalC2lowercaseratio", "FinalC3alphanumericratio", "FinalC4asciiratio", "FinalC5bracketratio", "FinalC6digitalratio", + "FinalC7latinratio", "FinalC8whitespaceratio", "FinalC9puncratio", "FinalC10longcharacterseq", "FinalC11arabicratio", "FinalC12bengaliratio", + "FinalC13brahmiratio", "FinalC14cyrilinratio", "FinalC15hanratio", "Finalc16malysiaratio", "FinalC17tamiratio", "FinalC18telugratio", + "FinalC19symbolratio", "FinalC20alpharatio", "FinalC21visibleratio", "FinalC22printableratio", "FinalC23blankratio", "FinalC24controlratio", "FinalC25hexaratio", + + // Words + "FinalW1languagewordratio", "W2Iscontainlanguageword", "FinalW3lowercaseratio", "FinalW4longestword", "W5IscontainURL", "FinalW6badwordratio", + "FinalW7uppercaseratio", "FinalW8banwordratio", "W9FemalFirstName", "W10MaleFirstName", "W11IscontainBadword", "W12IsContainBanword", + "FinalW13NumberSharewords", "FinalW14NumberSharewordswithoutStopwords", "FinalW15PortionQid", "FinalW16PortionLnags", "FinalW17PortionLinks", + + // Sentences : + "FinalS1CommentTailLength", "FinalS2SimikaritySitelinkandLabel", "FinalS3SimilarityLabelandSitelink", "FinalS4SimilarityCommentComment", + + // User : + "U1IsPrivileged", "U2IsBotUser", "U3IsBotuserWithFlaguser", "U4IsProperty", "U5IsTranslator", "U6IsRegister", "U7IPValue", "U8UserID", + "U9HasBirthDate", "U10HasDeathDate", + + // Item: + + "I1NumberLabels", "I2NumberDescription", "I3NumberAliases", "I4NumberClaims", "I5NumberSitelinks", "I6NumberStatement", + 
"I7NumberReferences", "I8NumberQualifier", "I9NumberQualifierOrder", "I10NumberBadges", + + // Revision: + "R3IslatainLanguage", "R4JsonLength", "R7RevisionAccountChange", "R10RevisionSize", "R12BytesIncrease", + "R13TimeSinceLastRevi", "R14CommentLength", + + // Meta , truth , Freq + // meta : + "FinalREVISION_SESSION_ID", // Truth: - //1.Undo - - // Freq : - - //1.5 features - - // Roll Boolean :Boolean (Double) - // Undo :Boolean (Double) - - //*********************************************** End Revision Features **************************************************************************************** - - //===========================================================================String Features==================================================================================== - - val df42 = df41.withColumn( - //statement String features: - "StringFeatures", concat($"SS1Property", lit(";"), $"SS2DataValue", lit(";"), $"SS3ItemValue", lit(";"), $"I11ItemTitle", - //Revision String Features: - lit(";"), $"R1languageRevision", - lit(";"), $"R2RevisionLanguageLocal", - lit(";"), $"R5RevisionAction", - lit(";"), $"R6PrevReviAction", - lit(";"), $"R8ParRevision", - lit(";"), $"R9RevisionTime", - lit(";"), $"R11ContentType", - lit(";"), $"R15RevisionSubaction", - lit(";"), $"R16PrevReviSubaction", - - lit(";"), $"USER_COUNTRY_CODE", - lit(";"), $"USER_CONTINENT_CODE", - lit(";"), $"USER_TIME_ZONE", - lit(";"), $"USER_REGION_CODE", - lit(";"), $"USER_CITY_NAME", - lit(";"), $"USER_COUNTY_NAME", - lit(";"), $"REVISION_TAGS")) - - val toArray = udf((record: String) => record.split(";").map(_.toString())) - val test1 = df42.withColumn("StringFeatures", toArray(col("StringFeatures"))) - // test1.show() - // test1.printSchema() - - val word2Vec = new Word2Vec().setInputCol("StringFeatures").setOutputCol("result").setVectorSize(20).setMinCount(0) - val model = word2Vec.fit(test1) - val result = model.transform(test1) //.rdd - - // result.show() - - val Todense = udf((b: Vector) => b.toDense) - val test_new2 = result.withColumn("result", Todense(col("result"))) - - val assembler = new VectorAssembler().setInputCols(Array( - "result", - - // character - "FinalC1uppercaseratio", "FinalC2lowercaseratio", "FinalC3alphanumericratio", "FinalC4asciiratio", "FinalC5bracketratio", "FinalC6digitalratio", - "FinalC7latinratio", "FinalC8whitespaceratio", "FinalC9puncratio", "FinalC10longcharacterseq", "FinalC11arabicratio", "FinalC12bengaliratio", - "FinalC13brahmiratio", "FinalC14cyrilinratio", "FinalC15hanratio", "Finalc16malysiaratio", "FinalC17tamiratio", "FinalC18telugratio", - "FinalC19symbolratio", "FinalC20alpharatio", "FinalC21visibleratio", "FinalC22printableratio", "FinalC23blankratio", "FinalC24controlratio", "FinalC25hexaratio", - - // Words - "FinalW1languagewordratio", "W2Iscontainlanguageword", "FinalW3lowercaseratio", "FinalW4longestword", "W5IscontainURL", "FinalW6badwordratio", - "FinalW7uppercaseratio", "FinalW8banwordratio", "W9FemalFirstName", "W10MaleFirstName", "W11IscontainBadword", "W12IsContainBanword", - "FinalW13NumberSharewords", "FinalW14NumberSharewordswithoutStopwords", "FinalW15PortionQid", "FinalW16PortionLnags", "FinalW17PortionLinks", - - //Sentences : - "FinalS1CommentTailLength", "FinalS2SimikaritySitelinkandLabel", "FinalS3SimilarityLabelandSitelink", "FinalS4SimilarityCommentComment", - - // User : - "U1IsPrivileged", "U2IsBotUser", "U3IsBotuserWithFlaguser", "U4IsProperty", "U5IsTranslator", "U6IsRegister", "U7IPValue", "U8UserID", - "U9HasBirthDate", "U10HasDeathDate", 
+ "FinalUNDO_RESTORE_REVERTED", - //Item: + // Freq: + "FinalNumberofRevisionsUserContributed", + "FinalNumberofUniqueItemsUseredit", "FinalNumberRevisionItemHas", "FinalNumberUniqUserEditItem", "FinalFreqItem")).setOutputCol("features") + val Training_Data = assembler.transform(test_new2) - "I1NumberLabels", "I2NumberDescription", "I3NumberAliases", "I4NumberClaims", "I5NumberSitelinks", "I6NumberStatement", - "I7NumberReferences", "I8NumberQualifier", "I9NumberQualifierOrder", "I10NumberBadges", - - //Revision: - "R3IslatainLanguage", "R4JsonLength", "R7RevisionAccountChange", "R10RevisionSize", "R12BytesIncrease", - "R13TimeSinceLastRevi", "R14CommentLength", - - // Meta , truth , Freq - // meta : - "FinalREVISION_SESSION_ID", - // Truth: - "FinalUNDO_RESTORE_REVERTED", - - //Freq: - "FinalNumberofRevisionsUserContributed", - "FinalNumberofUniqueItemsUseredit", "FinalNumberRevisionItemHas", "FinalNumberUniqUserEditItem", "FinalFreqItem")).setOutputCol("features") - val Training_Data = assembler.transform(test_new2) - - // Prepare the data for classification: + // Prepare the data for classification: // NewData.registerTempTable("DB") // val Training_Data = sqlContext.sql("select Rid, features, FinalROLLBACK_REVERTED from DB") - //val Data = sqlContext.sql("select Rid, features, FinalROLLBACK_REVERTED as label from DB") // for logistic regrision + // val Data = sqlContext.sql("select Rid, features, FinalROLLBACK_REVERTED as label from DB") // for logistic regrision - //Data.show() + // Data.show() - //val TestClassifiers = new Classifiers() -// - // TestClassifiers.RandomForestClassifer(Data, sqlContext) -// // TestClassifiers.DecisionTreeClassifier(Data, sqlContext) -// // TestClassifiers.LogisticRegrision(Data, sqlContext) -// // TestClassifiers.GradientBoostedTree(Data, sqlContext) -// // TestClassifiers.MultilayerPerceptronClassifier(Data, sqlContext) + // val TestClassifiers = new Classifiers() + // + // TestClassifiers.RandomForestClassifer(Data, sqlContext) + // // TestClassifiers.DecisionTreeClassifier(Data, sqlContext) + // // TestClassifiers.LogisticRegrision(Data, sqlContext) + // // TestClassifiers.GradientBoostedTree(Data, sqlContext) + // // TestClassifiers.MultilayerPerceptronClassifier(Data, sqlContext) Training_Data - - + } - - //*********************************************************************************************************************************************** - // Function 3:Testing XML and Vandalism Detection + // *********************************************************************************************************************************************** + // Function 3:Testing XML and Vandalism Detection def Testing_Start_StandardXMLParser_VD(sc: SparkContext): DataFrame = { val sqlContext = new org.apache.spark.sql.SQLContext(sc) import sqlContext.implicits._ import org.apache.spark.sql.functions._ // for UDF import org.apache.spark.sql.types._ - // Streaming records: - val jobConf = new JobConf() - val NormalXML_Parser_OBJ = new ParseNormalXML() - val RDD_OBJ = new ParseNormalXML() - - val Testing_RDD_All_Record = RDD_OBJ.Testing_DB_NormalXML_Parser(sc).cache() - - - // ======= Json part : - //Json RDD : Each record has its Revision iD: - val JsonRDD = Testing_RDD_All_Record.map(_.split("NNLL")).map(v => replacing_with_Quoto(v(0), v(8))).cache() - //JsonRDD.foreach(println) - //println(JsonRDD.count()) - - // Data set - val Ds_Json = sqlContext.jsonRDD(JsonRDD).select("key", "id", "labels", "descriptions", "aliases", "claims", "sitelinks").cache() - 
//Ds_Json.show() - // println(Ds_Json.count()) - - // ======= Tags part : // Contributor IP here is in Decimal format not IP format and It is converted in ParseNormalXml stage - val TagsRDD = Testing_RDD_All_Record.map(_.split("NNLL")).map(x => (x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9), x(10), x(11))).cache() - val DF_Tags = TagsRDD.toDF("Rid", "Itemid", "comment", "pid", "time", "contributorIP", "contributorID", "contributorName", "JsonText", "model", "format", "sha").cache() - // DF_Tags.show() - // println(DF_Tags.count()) - - //======== Join Json part with Tag Part:============================ - //Joining to have full data - val DF_First_DF_Result_Join_Tags_and_Json = DF_Tags.as("T1").join(Ds_Json.as("T2"), $"T1.Rid" === $"T2.key", "leftouter").select("Rid", "itemid", "comment", "pid", "time", "contributorIP", "contributorID", "contributorName", "JsonText", "labels", "descriptions", "aliases", "claims", "sitelinks", "model", "format", "sha") //.orderBy("Rid", "Itemid") - DF_First_DF_Result_Join_Tags_and_Json.registerTempTable("Data1") - val dfr_DATA_JsonTages1 = sqlContext.sql("select * from Data1 order by itemid ,Rid ").cache() - - val colNames = Seq("Rid2", "itemid2", "comment2", "pid2", "time2", "contributorIP2", "contributorID2", "contributorName2", "JsonText2", "labels2", "descriptions2", "aliases2", "claims2", "sitelinks2", "model2", "format2", "sha2") - val DF_Second = DF_First_DF_Result_Join_Tags_and_Json.toDF(colNames: _*) //.distinct() - DF_Second.registerTempTable("Data2") - - //===================================================================Parent // Previous Revision============================================================================================================== - //val DF_Joined = result1.as("df1").join(result2.as("df2"), col("itemid") === col("itemid2") && col("index1") === col("index2") + 1, "leftouter").select("Rid", "itemid", "comment", "pid", "time", "contributorIP", "contributorID", "contributorName", "JsonText", "labels", "descriptions", "aliases", "claims", "sitelinks", "model", "format", "sha", "Rid2", "itemid2", "comment2", "pid2", "time2", "contributorIP2", "contributorID2", "contributorName2", "JsonText2", "labels2", "descriptions2", "aliases2", "claims2", "sitelinks2", "model2", "format2", "sha2") - //.select("itemid", "Rid","pid","time","itemid2","Rid2","pid2","time2") - - //Joining based on Parent Id to get the previous cases: ParentID - val DF_Joined = DF_First_DF_Result_Join_Tags_and_Json.as("df1").join(DF_Second.as("df2"), $"df1.pid" === $"df2.Rid2", "leftouter").distinct() - - val RDD_After_JoinDF = DF_Joined.rdd.distinct() - val x = RDD_After_JoinDF.map(row => (row(0).toString().toInt, row)).cache() - val part = new RangePartitioner(4, x) - val partitioned = x.partitionBy(part).persist() // persist is important for this case and obligatory. 
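(Sketch, not part of the patch.) Both the removed lines above and their reformatted counterparts key the joined revision rows by revision id and push them through a RangePartitioner before feature extraction, so that each partition holds a contiguous range of ids and the shuffled layout is kept in memory for the passes that follow. A small sketch of that keying-and-partitioning step on a toy pair RDD; the object and value names are illustrative only.

import org.apache.spark.{SparkConf, SparkContext, RangePartitioner}
import org.apache.spark.storage.StorageLevel

object RangePartitionSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("RangePartitionSketch"))

    // Toy stand-in for the joined revision rows: (revision id, payload).
    val rows = sc.parallelize(Seq((42, "rev-42"), (7, "rev-7"), (100, "rev-100"), (3, "rev-3")))

    // Spread the keys over 4 range partitions; persisting keeps the shuffled layout around
    // for the repeated feature-extraction passes that follow.
    val part = new RangePartitioner(4, rows)
    val partitioned = rows.partitionBy(part).persist(StorageLevel.MEMORY_ONLY)

    // Rows per partition, just to show the effect of the range partitioning.
    println(partitioned.glom().map(_.length).collect().mkString(","))
    sc.stop()
  }
}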
- //partitioned.foreach(println) - // - // //=====================================================All Features Based on Categories of Features Data Type :================================================================================== - // - val Result_all_Features = partitioned.map { case (x, y) => (x.toString() + "," + All_Features(y).toString()) } // we convert the Pair RDD to String one LineRDD to be able to make DF based on "," - //Result_all_Features.foreach(println) - // println("nayef" + Result_all_Features.count()) - - // Conver the RDD of All Features to DataFrame: - - val schema = StructType( - - //0 - StructField("Rid", IntegerType, false) :: - - // Character Features : - /* 1*/ StructField("C1uppercaseratio", DoubleType, false) :: /*2 */ StructField("C2lowercaseratio", DoubleType, false) :: /*3*/ StructField("C3alphanumericratio", DoubleType, false) :: - /*4*/ StructField("C4asciiratio", DoubleType, false) :: /*5*/ StructField("C5bracketratio", DoubleType, false) :: /*6*/ StructField("C6digitalratio", DoubleType, false) :: - /*7*/ StructField("C7latinratio", DoubleType, false) :: /*8*/ StructField("C8whitespaceratio", DoubleType, false) :: /* 9*/ StructField("C9puncratio", DoubleType, false) :: - /*10*/ StructField("C10longcharacterseq", DoubleType, false) :: /*11*/ StructField("C11arabicratio", DoubleType, false) :: /*12*/ StructField("C12bengaliratio", DoubleType, false) :: - /*13 */ StructField("C13brahmiratio", DoubleType, false) :: /*14*/ StructField("C14cyrilinratio", DoubleType, false) :: /*15*/ StructField("C15hanratio", DoubleType, false) :: - /*16*/ StructField("c16malysiaratio", DoubleType, false) :: /*17*/ StructField("C17tamiratio", DoubleType, false) :: /*18*/ StructField("C18telugratio", DoubleType, false) :: - /*19 */ StructField("C19symbolratio", DoubleType, false) :: /*20 */ StructField("C20alpharatio", DoubleType, false) :: /*21*/ StructField("C21visibleratio", DoubleType, false) :: - /*22*/ StructField("C22printableratio", DoubleType, false) :: /*23*/ StructField("C23blankratio", DoubleType, false) :: /*24 */ StructField("C24controlratio", DoubleType, false) :: - /* 25 */ StructField("C25hexaratio", DoubleType, false) :: - - //word Features: - /*26*/ StructField("W1languagewordratio", DoubleType, false) :: /*27 Boolean */ StructField("W2Iscontainlanguageword", DoubleType, false) :: /*28*/ StructField("W3lowercaseratio", DoubleType, false) :: - /*29 Integer */ StructField("W4longestword", IntegerType, false) :: /*30 Boolean */ StructField("W5IscontainURL", DoubleType, false) :: /*31*/ StructField("W6badwordratio", DoubleType, false) :: - /*32*/ StructField("W7uppercaseratio", DoubleType, false) :: /*33*/ StructField("W8banwordratio", DoubleType, false) :: /*34 Boolean */ StructField("W9FemalFirstName", DoubleType, false) :: - /*35 Boolean */ StructField("W10MaleFirstName", DoubleType, false) :: /*36 Boolean */ StructField("W11IscontainBadword", DoubleType, false) :: /*37 Boolean*/ StructField("W12IsContainBanword", DoubleType, false) :: - /*38 integer */ StructField("W13NumberSharewords", DoubleType, false) :: /*39 Integer */ StructField("W14NumberSharewordswithoutStopwords", DoubleType, false) :: - /*40*/ StructField("W15PortionQid", DoubleType, false) :: /*41*/ StructField("W16PortionLnags", DoubleType, false) :: /*42*/ StructField("W17PortionLinks", DoubleType, false) :: - - // - // // Sentences Features: - /*43*/ StructField("S1CommentTailLength", DoubleType, false) :: /*44*/ StructField("S2SimikaritySitelinkandLabel", DoubleType, false) :: /*45*/ 
StructField("S3SimilarityLabelandSitelink", DoubleType, false) :: /*46*/ StructField("S4SimilarityCommentComment", DoubleType, false) :: - // - // // Statements Features : - /*47*/ StructField("SS1Property", StringType, false) :: /*48*/ StructField("SS2DataValue", StringType, false) :: /*49*/ StructField("SS3ItemValue", StringType, false) :: - // - // - // //User Features : - /*50 Boolean*/ StructField("U1IsPrivileged", DoubleType, false) :: /*51 Boolean*/ StructField("U2IsBotUser", DoubleType, false) :: /*52 Boolean*/ StructField("U3IsBotuserWithFlaguser", DoubleType, false) :: - /*53 Boolean*/ StructField("U4IsProperty", DoubleType, false) :: /*54 Boolean*/ StructField("U5IsTranslator", DoubleType, false) :: /*55 Boolean*/ StructField("U6IsRegister", DoubleType, false) :: - /*56*/ StructField("U7IPValue", DoubleType, false) :: /*57*/ StructField("U8UserID", IntegerType, false) :: /*58*/ StructField("U9HasBirthDate", DoubleType, false) :: /*59*/ StructField("U10HasDeathDate", DoubleType, false) :: - - //Items Features : - - /*60*/ StructField("I1NumberLabels", DoubleType, false) :: /*61*/ StructField("I2NumberDescription", DoubleType, false) :: /*62*/ StructField("I3NumberAliases", DoubleType, false) :: /*63*/ StructField("I4NumberClaims", DoubleType, false) :: - /*64*/ StructField("I5NumberSitelinks", DoubleType, false) :: /*65*/ StructField("I6NumberStatement", DoubleType, false) :: /*66*/ StructField("I7NumberReferences", DoubleType, false) :: /*67*/ StructField("I8NumberQualifier", DoubleType, false) :: - /*68*/ StructField("I9NumberQualifierOrder", DoubleType, false) :: /*69*/ StructField("I10NumberBadges", DoubleType, false) :: /*70*/ StructField("I11ItemTitle", StringType, false) :: - - // Revision Features: - /*71*/ StructField("R1languageRevision", StringType, false) :: /*72*/ StructField("R2RevisionLanguageLocal", StringType, false) :: /*73*/ StructField("R3IslatainLanguage", DoubleType, false) :: - /*74*/ StructField("R4JsonLength", DoubleType, false) :: /*75*/ StructField("R5RevisionAction", StringType, false) :: /*76*/ StructField("R6PrevReviAction", StringType, false) :: - /*77*/ StructField("R7RevisionAccountChange", DoubleType, false) :: /*78*/ StructField("R8ParRevision", StringType, false) :: /*79*/ StructField("R9RevisionTime", StringType, false) :: - /*80*/ StructField("R10RevisionSize", DoubleType, false) :: /*81*/ StructField("R11ContentType", StringType, false) :: /*82*/ StructField("R12BytesIncrease", DoubleType, false) :: - /*83*/ StructField("R13TimeSinceLastRevi", DoubleType, false) :: /*84*/ StructField("R14CommentLength", DoubleType, false) :: /*85*/ StructField("R15RevisionSubaction", StringType, false) :: - /*86*/ StructField("R16PrevReviSubaction", StringType, false) :: - - Nil) - - val rowRDD = Result_all_Features.map(line => line.split(",")).map(e ⇒ Row(e(0).toInt // character feature column - , e(1).toDouble, e(2).toDouble, e(3).toDouble, e(4).toDouble, e(5).toDouble, e(6).toDouble, e(7).toDouble, e(8).toDouble, e(9).toDouble, RoundDouble(e(10).toDouble), - e(11).toDouble, e(12).toDouble, e(13).toDouble, e(14).toDouble, e(15).toDouble, e(16).toDouble, e(17).toDouble, e(18).toDouble, e(19).toDouble, e(20).toDouble, e(21).toDouble, e(22).toDouble, e(23).toDouble, e(24).toDouble, e(25).toDouble //Word Feature column - , e(26).toDouble, e(27).toDouble, e(28).toDouble, e(29).toDouble.toInt, e(30).toDouble, e(31).toDouble, e(32).toDouble, e(33).toDouble, e(34).toDouble, e(35).toDouble, e(36).toDouble, e(37).toDouble, RoundDouble(e(38).toDouble), 
RoundDouble(e(39).toDouble), e(40).toDouble, e(41).toDouble, e(42).toDouble // Sentences Features column: - , RoundDouble(e(43).toDouble), e(44).toDouble, e(45).toDouble, e(46).toDouble //Statement Features Column: - , e(47), e(48), e(49) // User Features Column: - , e(50).toDouble, e(51).toDouble, e(52).toDouble, e(53).toDouble, e(54).toDouble, e(55).toDouble, e(56).toDouble, e(57).toDouble.toInt, e(58).toDouble, e(59).toDouble //Item Features column: - , e(60).toDouble, e(61).toDouble, e(62).toDouble, e(63).toDouble, e(64).toDouble, e(65).toDouble, e(66).toDouble, e(67).toDouble, e(68).toDouble, e(69).toDouble, "Q" + e(70).toDouble.toInt.toString() //Revision Features Column: - , e(71), e(72), e(73).toDouble, e(74).toDouble, e(75), e(76), e(77).toDouble, e(78), e(79), e(80).toDouble, e(81), e(82).toDouble, e(83).toDouble, e(84).toDouble, e(85), e(86))) - - //a.User Frequency: - //number of revisions a user has contributed - //val resu= DF_Tags.groupBy("contributorID").agg(count("Rid")) - DF_Tags.registerTempTable("TagesTable") - val ContributorFreq_for_Each_Revision_DF = sqlContext.sql("select contributorID as CIDUSER1, count(Rid) as NumberofRevisionsUserContributed from TagesTable where contributorID !='0' group by contributorID ") //.drop("CIDUSER1") - //ContributorFreq_for_Each_Revision_DF.show() - - //b.Cumulated : Number of a unique Item a user has contributed. - val CumulatedNumberof_uniqueItemsForUser_DF = sqlContext.sql("select contributorID as CIDUSER2, COUNT(DISTINCT itemid) as NumberofUniqueItemsUseredit from TagesTable where contributorID !='0' group by contributorID") //.drop("CIDUSER2") - //CumulatedNumberof_uniqueItemsForUser_DF.show() - - //1.Item Frequency: - // number of revisions an Item has - val ItemFrequ_DF = sqlContext.sql("select itemid, count(Rid) as NumberRevisionItemHas from TagesTable group by itemid") - // ItemFrequ_DF.show() - - //2. Cumulate number of unique users have edited the Item : Did not consider the users IP. Contributor is an IP or Name. we consider name - val CumulatedNumberof_UniqueUserForItem_DF = sqlContext.sql("select itemid, COUNT(DISTINCT contributorID) as NumberUniqUserEditItem from TagesTable where contributorID !='0' group by itemid") - //CumulatedNumberof_UniqueUserForItem_DF.show() - - //3. 
freq each Item : - val Fre_Item_DF = sqlContext.sql("select itemid, COUNT(itemid) as FreqItem from TagesTable group by itemid") - // Fre_Item_DF.show() - - //***************************************************************************************************************************************** - // This is Main DataFrame: - val BeforeJoin_All_Features = sqlContext.createDataFrame(rowRDD, schema) - //BeforeJoin_All_Features.show() - - //********************************** User feature Join - - // Join1 for add The first User Feature : number of revisions a user has contributed - val AfterJoinUser1_All_Features = BeforeJoin_All_Features.as("T1").join(ContributorFreq_for_Each_Revision_DF.as("T2"), $"T1.U8UserID" === $"T2.CIDUSER1", "leftouter").drop("CIDUSER1") - //AfterJoinUser1_All_Features.show() - - // Join2 for add The second User Feature - val AfterJoinUser2_All_Features = AfterJoinUser1_All_Features.as("T1").join(CumulatedNumberof_uniqueItemsForUser_DF.as("T2"), $"T1.U8UserID" === $"T2.CIDUSER2", "leftouter").drop("CIDUSER2") - //AfterJoinUser2_All_Features.show() - - //********************************** Item Feature Join - // Join3 for add The First Item Feature :number of revisions an Item has - val AfterJoinItem3_All_Features = AfterJoinUser2_All_Features.as("T1").join(ItemFrequ_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") - // AfterJoinItem3_All_Features.show() - - // Join4 for add The Second Item Feature - val AfterJoinItem4_All_Features = AfterJoinItem3_All_Features.as("T1").join(CumulatedNumberof_UniqueUserForItem_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") - // AfterJoinItem4_All_Features.show() - - // Join5 for add The Third Item Feature - val AfterJoinItem5_All_Features = AfterJoinItem4_All_Features.as("T1").join(Fre_Item_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") - //2 AfterJoinItem5_All_Features.show() - - //******************************** - - //*Geografical information Feature from Meta File - //REVISION_ID|REVISION_SESSION_ID|USER_COUNTRY_CODE|USER_CONTINENT_CODE|USER_TIME_ZONE|USER_REGION_CODE|USER_CITY_NAME|USER_COUNTY_NAME|REVISION_TAGS - val df_GeoInf = sqlContext.read - .format("com.databricks.spark.csv") - .option("header", "true") // Use first line of all files as header - .option("inferSchema", "true") // Automatically infer data types - .load("hdfs://localhost:9000/mydata/Meta.csv").select("REVISION_ID", "REVISION_SESSION_ID", "USER_COUNTRY_CODE", "USER_CONTINENT_CODE", "USER_TIME_ZONE", "USER_REGION_CODE", "USER_CITY_NAME", "USER_COUNTY_NAME", "REVISION_TAGS") - // df_GeoInf.show() - - val df_Truth = sqlContext.read - .format("com.databricks.spark.csv") - .option("header", "true") // Use first line of all files as header - .option("inferSchema", "true") // Automatically infer data types - .load("hdfs://localhost:9000/mydata/truth.csv").select("REVISION_ID", "ROLLBACK_REVERTED", "UNDO_RESTORE_REVERTED") - // df_GeoInf.show() - - val AfterJoinGeoInfo_All_Features = AfterJoinItem5_All_Features.as("T1").join(df_GeoInf.as("T2"), $"T1.Rid" === $"T2.REVISION_ID", "leftouter").drop("REVISION_ID").cache() - // AfterJoinGeoInfo_All_Features.show() - - val Final_All_Features = AfterJoinGeoInfo_All_Features.as("T1").join(df_Truth.as("T2"), $"T1.Rid" === $"T2.REVISION_ID", "leftouter").drop("REVISION_ID").cache() - //Final_All_Features.show() - - // Pre- process Data 
============================================================================================================================================================ - - // For String Column, We fill the Null values by "NA": - - var Fill_Missing_Final_All_Features = Final_All_Features.na.fill("NA", Seq("USER_COUNTRY_CODE", "USER_CONTINENT_CODE", "USER_TIME_ZONE", "USER_REGION_CODE", "USER_CITY_NAME", "USER_COUNTY_NAME", "REVISION_TAGS")).cache() - - // For Integer Frequency Column, We fill the Null values by 0: - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.na.fill(0, Seq("FreqItem", "NumberUniqUserEditItem", "NumberRevisionItemHas", "NumberofUniqueItemsUseredit", "NumberofRevisionsUserContributed", "REVISION_SESSION_ID")).cache() - //Fill_Missing_Final_All_Features.show() - - val BoolToDoubleUDF = udf { (BoolAsString: String) => if (BoolAsString == "T") 1.0 else 0.0 } - val IntegerToDouble = udf { (IntegerRevisionSessionID: Integer) => IntegerRevisionSessionID.toDouble } - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalROLLBACK_REVERTED", BoolToDoubleUDF(col("ROLLBACK_REVERTED"))) - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalUNDO_RESTORE_REVERTED", BoolToDoubleUDF(col("UNDO_RESTORE_REVERTED"))) - - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalREVISION_SESSION_ID", IntegerToDouble(col("REVISION_SESSION_ID"))) - - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberofRevisionsUserContributed", IntegerToDouble(col("NumberofRevisionsUserContributed"))) - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberofUniqueItemsUseredit", IntegerToDouble(col("NumberofUniqueItemsUseredit"))) - - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberRevisionItemHas", IntegerToDouble(col("NumberRevisionItemHas"))) - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberUniqUserEditItem", IntegerToDouble(col("NumberUniqUserEditItem"))) - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalFreqItem", IntegerToDouble(col("FreqItem"))) - - //===========================================================================Caharacter Features : Double , Integer Features ==================================================================================== - //Double Ratio: For Ratio Double column, Fill -1 value by Median:Character Features + Ratio of Word Features : - var Samples = Fill_Missing_Final_All_Features.sample(false, 0.001).cache() //.where($"S2SimikaritySitelinkandLabel">0.0 || $"S3SimilarityLabelandSitelink">0.0 || $"S4SimilarityCommentComment">0.0) - Samples.registerTempTable("df") - - val Query = "select " + - "percentile_approx(C1uppercaseratio, 0.5) as meadian1" + "," + "percentile_approx(C2lowercaseratio, 0.5) as median2" + " ," + - "percentile_approx(C3alphanumericratio, 0.5) as median3" + "," + "percentile_approx(C4asciiratio, 0.5) as median4" + "," + - "percentile_approx(C5bracketratio, 0.5) as median5" + "," + "percentile_approx(C6digitalratio, 0.5) as median6" + "," + - "percentile_approx(C7latinratio, 0.5) as median7" + "," + "percentile_approx(C8whitespaceratio, 0.5) as median8" + "," + - "percentile_approx(C9puncratio, 0.5) as median9" + "," + "percentile_approx(C11arabicratio, 0.5) as median11" + "," + - "percentile_approx(C12bengaliratio, 0.5) as median12" + "," + 
"percentile_approx(C13brahmiratio, 0.5) as median13" + "," + - "percentile_approx(C14cyrilinratio, 0.5) as median14" + "," + "percentile_approx(C15hanratio, 0.5) as median15" + "," + - "percentile_approx(c16malysiaratio, 0.5) as median16" + "," + - "percentile_approx(C17tamiratio, 0.5) as median17" + "," + "percentile_approx(C18telugratio, 0.5) as median18" + "," + - "percentile_approx(C19symbolratio, 0.5) as median19" + "," + "percentile_approx(C20alpharatio, 0.5) as median20" + "," + - "percentile_approx(C21visibleratio, 0.5) as median21" + "," + "percentile_approx(C22printableratio, 0.5) as median22" + "," + - "percentile_approx(C23blankratio, 0.5) as median23" + "," + "percentile_approx(C24controlratio, 0.5) as median24" + "," + - "percentile_approx(C25hexaratio, 0.5) as median25" ++ "," + "percentile_approx(W1languagewordratio, 0.5) as median26" + "," + - "percentile_approx(W3lowercaseratio, 0.5) as median27" + "," + "percentile_approx(W6badwordratio, 0.5) as median28" + "," + - "percentile_approx(W7uppercaseratio, 0.5) as median27" + "," + "percentile_approx(W8banwordratio, 0.5) as median27" + " from df" - - val medianValues = sqlContext.sql(Query).rdd - val Median = medianValues.first() - - // Median : - // Character Ratio Features: UDF - val lkpUDF1 = udf { (i: Double) => if (i == 0) Median(0).toString().toDouble else i } - val lkpUDF2 = udf { (i: Double) => if (i == 0) Median(1).toString().toDouble else i } - val lkpUDF3 = udf { (i: Double) => if (i == 0) Median(2).toString().toDouble else i } - val lkpUDF4 = udf { (i: Double) => if (i == 0) Median(3).toString().toDouble else i } - val lkpUDF5 = udf { (i: Double) => if (i == 0) Median(4).toString().toDouble else i } - val lkpUDF6 = udf { (i: Double) => if (i == 0) Median(5).toString().toDouble else i } - val lkpUDF7 = udf { (i: Double) => if (i == 0) Median(6).toString().toDouble else i } - val lkpUDF8 = udf { (i: Double) => if (i == 0) Median(7).toString().toDouble else i } - val lkpUDF9 = udf { (i: Double) => if (i == 0) Median(8).toString().toDouble else i } - - val lkpUDF11 = udf { (i: Double) => if (i == 0) Median(9).toString().toDouble else i } - val lkpUDF12 = udf { (i: Double) => if (i == 0) Median(10).toString().toDouble else i } - val lkpUDF13 = udf { (i: Double) => if (i == 0) Median(11).toString().toDouble else i } - val lkpUDF14 = udf { (i: Double) => if (i == 0) Median(12).toString().toDouble else i } - val lkpUDF15 = udf { (i: Double) => if (i == 0) Median(13).toString().toDouble else i } - val lkpUDF16 = udf { (i: Double) => if (i == 0) Median(14).toString().toDouble else i } - val lkpUDF17 = udf { (i: Double) => if (i == 0) Median(15).toString().toDouble else i } - val lkpUDF18 = udf { (i: Double) => if (i == 0) Median(16).toString().toDouble else i } - val lkpUDF19 = udf { (i: Double) => if (i == 0) Median(17).toString().toDouble else i } - val lkpUDF20 = udf { (i: Double) => if (i == 0) Median(18).toString().toDouble else i } - val lkpUDF21 = udf { (i: Double) => if (i == 0) Median(19).toString().toDouble else i } - val lkpUDF22 = udf { (i: Double) => if (i == 0) Median(20).toString().toDouble else i } - val lkpUDF23 = udf { (i: Double) => if (i == 0) Median(21).toString().toDouble else i } - val lkpUDF24 = udf { (i: Double) => if (i == 0) Median(22).toString().toDouble else i } - val lkpUDF25 = udf { (i: Double) => if (i == 0) Median(23).toString().toDouble else i } - - val df1 = Fill_Missing_Final_All_Features.withColumn("FinalC1uppercaseratio", lkpUDF1(col("C1uppercaseratio"))) 
//.drop("C1uppercaseratio").cache() - val df2 = df1.withColumn("FinalC2lowercaseratio", lkpUDF2(col("C2lowercaseratio"))) //.drop("C2lowercaseratio").cache() - //df1.unpersist() - val df3 = df2.withColumn("FinalC3alphanumericratio", lkpUDF3(col("C3alphanumericratio"))) //.drop("C3alphanumericratio").cache() - //df2.unpersist() - val df4 = df3.withColumn("FinalC4asciiratio", lkpUDF4(col("C4asciiratio"))) //.drop("C4asciiratio").cache() - //df3.unpersist() - val df5 = df4.withColumn("FinalC5bracketratio", lkpUDF5(col("C5bracketratio"))) //.drop("C5bracketratio").cache() - //df4.unpersist() - val df6 = df5.withColumn("FinalC6digitalratio", lkpUDF6(col("C6digitalratio"))) //.drop("C6digitalratio").cache() - //df5.unpersist() - val df7 = df6.withColumn("FinalC7latinratio", lkpUDF7(col("C7latinratio"))) //.drop("C7latinratio").cache() - //df6.unpersist() - val df8 = df7.withColumn("FinalC8whitespaceratio", lkpUDF8(col("C8whitespaceratio"))) //.drop("C8whitespaceratio").cache() - //df7.unpersist() - val df9 = df8.withColumn("FinalC9puncratio", lkpUDF9(col("C9puncratio"))) //.drop("C9puncratio").cache() - - // Mean : - // character integer values : - val Mean_C10longcharacterseq = Samples.agg(mean("C10longcharacterseq")).head() - val C10_Mean = Mean_C10longcharacterseq.getDouble(0) - val lkpUDFC10 = udf { (i: Double) => if (i == 0) C10_Mean else i } - val df10 = df9.withColumn("FinalC10longcharacterseq", lkpUDFC10(col("C10longcharacterseq"))) - - //Median - val df11 = df10.withColumn("FinalC11arabicratio", lkpUDF11(col("C11arabicratio"))) //.drop("C11arabicratio").cache() - // df9.unpersist() - val df12 = df11.withColumn("FinalC12bengaliratio", lkpUDF12(col("C12bengaliratio"))) //.drop("C12bengaliratio").cache() - //df11.unpersist() - val df13 = df12.withColumn("FinalC13brahmiratio", lkpUDF13(col("C13brahmiratio"))) //.drop("C13brahmiratio").cache() - // df12.unpersist() - val df14 = df13.withColumn("FinalC14cyrilinratio", lkpUDF14(col("C14cyrilinratio"))) //.drop("C14cyrilinratio").cache() - // df13.unpersist() - val df15 = df14.withColumn("FinalC15hanratio", lkpUDF15(col("C15hanratio"))) //.drop("C15hanratio").cache() - // df14.unpersist() - val df16 = df15.withColumn("Finalc16malysiaratio", lkpUDF16(col("c16malysiaratio"))) //.drop("c16malysiaratio").cache() - //df15.unpersist() - val df17 = df16.withColumn("FinalC17tamiratio", lkpUDF17(col("C17tamiratio"))) //.drop("C17tamiratio").cache() - //df16.unpersist() - val df18 = df17.withColumn("FinalC18telugratio", lkpUDF18(col("C18telugratio"))) //.drop("C18telugratio").cache() - //df17.unpersist() - val df19 = df18.withColumn("FinalC19symbolratio", lkpUDF19(col("C19symbolratio"))) //.drop("C19symbolratio").cache() - //df18.unpersist() - val df20 = df19.withColumn("FinalC20alpharatio", lkpUDF20(col("C20alpharatio"))) //.drop("C20alpharatio").cache() - // df19.unpersist() - val df21 = df20.withColumn("FinalC21visibleratio", lkpUDF21(col("C21visibleratio"))) //.drop("C21visibleratio").cache() - // df20.unpersist() - val df22 = df21.withColumn("FinalC22printableratio", lkpUDF22(col("C22printableratio"))) //.drop("C22printableratio").cache() - //df21.unpersist() - val df23 = df22.withColumn("FinalC23blankratio", lkpUDF23(col("C23blankratio"))) //.drop("C23blankratio").cache() - // df22.unpersist() - val df24 = df23.withColumn("FinalC24controlratio", lkpUDF24(col("C24controlratio"))) //.drop("C24controlratio").cache() - //df23.unpersist() - val df25 = df24.withColumn("FinalC25hexaratio", lkpUDF25(col("C25hexaratio"))) //.drop("C25hexaratio").cache() - - 
//************************************************End Character Features **************************************************************************************** - - //************************************************Start Word Features **************************************************************************************** - - // Word Ratio Features : UDF - val lkpUDFW1 = udf { (i: Double) => if (i == 0) Median(24).toString().toDouble else i } - val lkpUDFW3 = udf { (i: Double) => if (i == 0) Median(25).toString().toDouble else i } - val lkpUDFW6 = udf { (i: Double) => if (i == 0) Median(26).toString().toDouble else i } - val lkpUDFW7 = udf { (i: Double) => if (i == 0) Median(27).toString().toDouble else i } - val lkpUDFW8 = udf { (i: Double) => if (i == 0) Median(28).toString().toDouble else i } - - //1. - val df26 = df25.withColumn("FinalW1languagewordratio", lkpUDFW1(col("W1languagewordratio"))) //.drop("W1languagewordratio").cache() - - //2.Boolean(Double) IsContainLanguageWord - - //3. - val df27 = df26.withColumn("FinalW3lowercaseratio", lkpUDFW3(col("W3lowercaseratio"))) //.drop("W3lowercaseratio").cache() - // df26.unpersist() - - //4. Integer " Mean: - val Mean_W4longestword = Samples.agg(mean("W4longestword")).head() - val W4_Mean = Mean_W4longestword.getDouble(0) - val lkpUDFW4 = udf { (i: Double) => if (i == 0) W4_Mean else i } - val df28 = df27.withColumn("FinalW4longestword", lkpUDFW4(col("W4longestword"))) - - //5. Boolean (Double ) W5IscontainURL - //6. - val df29 = df28.withColumn("FinalW6badwordratio", lkpUDFW6(col("W6badwordratio"))) //.drop("W6badwordratio").cache() - - //7. - val df30 = df29.withColumn("FinalW7uppercaseratio", lkpUDFW7(col("W7uppercaseratio"))) //.drop("W7uppercaseratio").cache() - - //8. - val df31 = df30.withColumn("FinalW8banwordratio", lkpUDFW8(col("W8banwordratio"))) //.drop("W8banwordratio").cache() - - //9.FemalFirst Boolean(Double) - //10.Male First Boolean(Double) - //11.ContainBadWord Boolean(Double) - //12ContainBanWord Boolean(Double) - - //13. Integer(Double): - val Mean_W13W13NumberSharewords = Samples.agg(mean("W13NumberSharewords")).head() - val W13_Mean = Mean_W13W13NumberSharewords.getDouble(0) - val lkpUDFW13 = udf { (i: Double) => if (i == 0) W13_Mean else i } - val df32 = df31.withColumn("FinalW13NumberSharewords", lkpUDFW13(col("W13NumberSharewords"))) - - //14. Integer (Double): - val Mean_W14NumberSharewordswithoutStopwords = Samples.agg(mean("W14NumberSharewordswithoutStopwords")).head() - val W14_Mean = Mean_W14NumberSharewordswithoutStopwords.getDouble(0) - val lkpUDFW14 = udf { (i: Double) => if (i == 0) W14_Mean else i } - val df33 = df32.withColumn("FinalW14NumberSharewordswithoutStopwords", lkpUDFW14(col("W14NumberSharewordswithoutStopwords"))) - - // 15. Double (Not ratio): - val Mean_W15PortionQid = Samples.agg(mean("W15PortionQid")).head() - val W15_Mean = Mean_W15PortionQid.getDouble(0) - val lkpUDFW15 = udf { (i: Double) => if (i == 0) W15_Mean else i } - val df34 = df33.withColumn("FinalW15PortionQid", lkpUDFW15(col("W15PortionQid"))) - - //16. 
Double(Not Ratio): - val Mean_W16PortionLnags = Samples.agg(mean("W16PortionLnags")).head() - val W16_Mean = Mean_W16PortionLnags.getDouble(0) - val lkpUDFW16 = udf { (i: Double) => if (i == 0) W16_Mean else i } - val df35 = df34.withColumn("FinalW16PortionLnags", lkpUDFW16(col("W16PortionLnags"))) - - //17.Double(Not ratio): - val Mean_W17PortionLinks = Samples.agg(mean("W17PortionLinks")).head() - val W17_Mean = Mean_W17PortionLinks.getDouble(0) - val lkpUDFW17 = udf { (i: Double) => if (i == 0) W17_Mean else i } - val df36 = df35.withColumn("FinalW17PortionLinks", lkpUDFW17(col("W17PortionLinks"))) - - //************************************************End Word Features **************************************************************************************** - - //************************************************Start Sentences Features **************************************************************************************** - // 1. Integer(Double) - val Mean_S1CommentTailLength = Samples.agg(mean("S1CommentTailLength")).head() - val S1_Mean = RoundDouble(Mean_S1CommentTailLength.getDouble(0)) - val lkpUDFS1 = udf { (i: Double) => if (i == 0) S1_Mean else i } - val df37 = df36.withColumn("FinalS1CommentTailLength", lkpUDFS1(col("S1CommentTailLength"))) - - //2. Double but Not ratio values : - val Mean_S2SimikaritySitelinkandLabel = Samples.agg(mean("S2SimikaritySitelinkandLabel")).head() - val S2_Mean = RoundDouble(Mean_S2SimikaritySitelinkandLabel.getDouble(0)) - val lkpUDFS2 = udf { (i: Double) => if (i == 0) S2_Mean else i } - val df39 = df37.withColumn("FinalS2SimikaritySitelinkandLabel", lkpUDFS2(col("S2SimikaritySitelinkandLabel"))) - - //3. Double but Not ratio values : - val Mean_S3SimilarityLabelandSitelink = Samples.agg(mean("S3SimilarityLabelandSitelink")).head() - val S3_Mean = RoundDouble(Mean_S3SimilarityLabelandSitelink.getDouble(0)) - val lkpUDFS3 = udf { (i: Double) => if (i == 0.0) S3_Mean else i } - val df40 = df39.withColumn("FinalS3SimilarityLabelandSitelink", lkpUDFS3(col("S3SimilarityLabelandSitelink"))) - - //4. Double but Not ratio values : - val Mean_S4SimilarityCommentComment = Samples.agg(mean("S4SimilarityCommentComment")).head() - val S4_Mean = RoundDouble(Mean_S4SimilarityCommentComment.getDouble(0)) - val lkpUDFS4 = udf { (i: Double) => if (i == 0.0) S4_Mean else i } - val df41 = df40.withColumn("FinalS4SimilarityCommentComment", lkpUDFS4(col("S4SimilarityCommentComment"))) - - //df41.show() - //************************************************End Sentences Features **************************************************************************************** - //*********************************************** Start Statement Features **************************************************************************************** - //1. String - //2. String - //3. String - //************************************************End Statement Features **************************************************************************************** - //*********************************************** Start User Features **************************************************************************************** - - //1.Boolean(Double) - //2.Boolean(Double) - //3.Boolean(Double) - //4.Boolean(Double) - //5.Boolean(Double) - //6.Boolean(Double) - //7. (Double) IP No need to fill Missing Data - //8. 
(Double) ID No need to fill Missing Data - //9.Boolean(Double) - //10.Boolean(Double) - - //*********************************************** End User Features **************************************************************************************** - //*********************************************** Start Item Features **************************************************************************************** - //1. Integer (Double) No need to fill missing values - //2. Integer (Double) No need to fill missing values - //3. Integer (Double) No need to fill missing values - //4. Integer (Double) No need to fill missing values - //5. Integer (Double) No need to fill missing values - //6. Integer (Double) No need to fill missing values - //7. Integer (Double) No need to fill missing values - //8. Integer (Double) No need to fill missing values - //9. Integer (Double) No need to fill missing values - //10. Integer (Double) No need to fill missing values - //11. String - //*********************************************** End Item Features **************************************************************************************** - //*********************************************** Start Revision Features **************************************************************************************** - //1.String - //2.String - //3.Boolean (Double) - //4.Integer(Double) - //5.String - //6.String - //7. Boolean(Double) - //8. String - //9.String - //10. Integer (Double) - //11.String - //12. integer(Double) - //13. Long(Double) - //14. integer (Double) - //15.String - //16.String - //*********************************************** End Revision Features **************************************************************************************** - //*********************************************** Meta Data , Truth Data and Frequnces **************************************************************************************** - //Meta - // 1.Revision Session :Integer (Converted to Double) - //2. 
User Country Code - //3.User Continent Code - //4.User Time Size - //5.User Region Code - //6.User-city Name - //7.User Country Name - //8.RevisionTags - + // Streaming records: + val jobConf = new JobConf() + val NormalXML_Parser_OBJ = new ParseNormalXML() + val RDD_OBJ = new ParseNormalXML() + + val Testing_RDD_All_Record = RDD_OBJ.Testing_DB_NormalXML_Parser(sc).cache() + + // ======= Json part : + // Json RDD : Each record has its Revision iD: + val JsonRDD = Testing_RDD_All_Record.map(_.split("NNLL")).map(v => replacing_with_Quoto(v(0), v(8))).cache() + // JsonRDD.foreach(println) + // println(JsonRDD.count()) + + // Data set + val Ds_Json = sqlContext.jsonRDD(JsonRDD).select("key", "id", "labels", "descriptions", "aliases", "claims", "sitelinks").cache() + // Ds_Json.show() + // println(Ds_Json.count()) + + // ======= Tags part : // Contributor IP here is in Decimal format not IP format and It is converted in ParseNormalXml stage + val TagsRDD = Testing_RDD_All_Record.map(_.split("NNLL")).map(x => (x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9), x(10), x(11))).cache() + val DF_Tags = TagsRDD.toDF("Rid", "Itemid", "comment", "pid", "time", "contributorIP", + "contributorID", "contributorName", "JsonText", "model", "format", "sha").cache() + // DF_Tags.show() + // println(DF_Tags.count()) + + // ======== Join Json part with Tag Part:============================ + // Joining to have full data + val DF_First_DF_Result_Join_Tags_and_Json = DF_Tags.as("T1").join(Ds_Json.as("T2"), $"T1.Rid" === $"T2.key", "leftouter").select("Rid", "itemid", "comment", "pid", "time", + "contributorIP", "contributorID", "contributorName", "JsonText", "labels", "descriptions", + "aliases", "claims", "sitelinks", "model", "format", "sha") // .orderBy("Rid", "Itemid") + DF_First_DF_Result_Join_Tags_and_Json.registerTempTable("Data1") + val dfr_DATA_JsonTages1 = sqlContext.sql("select * from Data1 order by itemid ,Rid ").cache() + + val colNames = Seq("Rid2", "itemid2", "comment2", "pid2", "time2", "contributorIP2", "contributorID2", "contributorName2", "JsonText2", "labels2", "descriptions2", + "aliases2", "claims2", "sitelinks2", "model2", "format2", "sha2") + val DF_Second = DF_First_DF_Result_Join_Tags_and_Json.toDF(colNames: _*) // .distinct() + DF_Second.registerTempTable("Data2") + + // ===================================================================Parent // Previous Revision============================================================================================================== + + // Joining based on Parent Id to get the previous cases: ParentID + val DF_Joined = DF_First_DF_Result_Join_Tags_and_Json.as("df1").join(DF_Second.as("df2"), $"df1.pid" === $"df2.Rid2", "leftouter").distinct() + + val RDD_After_JoinDF = DF_Joined.rdd.distinct() + val x = RDD_After_JoinDF.map(row => (row(0).toString().toInt, row)).cache() + val part = new RangePartitioner(4, x) + val partitioned = x.partitionBy(part).persist() // persist is important for this case and obligatory. 
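// Editor's sketch (not part of this patch): the range-partitioning step above, reduced to a
// self-contained toy example. Hypothetical (revisionId, payload) pairs stand in for the joined
// revision Rows; RangePartitioner (org.apache.spark, already in scope for the code above) samples
// the keyed RDD to choose partition bounds, partitionBy shuffles once, and persist() keeps the
// per-partition feature extraction below from recomputing the upstream join on every action.
val keyedToy = sc.parallelize(Seq((42, "revA"), (7, "revB"), (1300, "revC"), (5, "revD")))
val toyPartitioner = new RangePartitioner(4, keyedToy) // 4 partitions, bounds chosen by sampling the keys
val toyPartitioned = keyedToy.partitionBy(toyPartitioner).persist()
toyPartitioned.foreachPartition(it => println(it.mkString(", "))) // inspect how keys land in sorted ranges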
+ // partitioned.foreach(println) + // + // //=====================================================All Features Based on Categories of Features Data Type :================================================================================== + // + val Result_all_Features = partitioned.map { case (x, y) => (x.toString() + "," + All_Features(y).toString()) } // we convert the Pair RDD to String one LineRDD to be able to make DF based on "," + // Result_all_Features.foreach(println) + // println("nayef" + Result_all_Features.count()) + + // Conver the RDD of All Features to DataFrame: + + val schema = StructType( + + // 0 + StructField("Rid", IntegerType, false) :: + + // Character Features : + /* 1 */ StructField("C1uppercaseratio", DoubleType, false) :: /* 2 */ StructField("C2lowercaseratio", DoubleType, false) :: /* 3 */ StructField("C3alphanumericratio", DoubleType, false) :: + /* 4 */ StructField("C4asciiratio", DoubleType, false) :: /* 5 */ StructField("C5bracketratio", DoubleType, false) :: /* 6 */ StructField("C6digitalratio", DoubleType, false) :: + /* 7 */ StructField("C7latinratio", DoubleType, false) :: /* 8 */ StructField("C8whitespaceratio", DoubleType, false) :: /* 9 */ StructField("C9puncratio", DoubleType, false) :: + /* 10 */ StructField("C10longcharacterseq", DoubleType, false) :: /* 11 */ StructField("C11arabicratio", DoubleType, false) :: /* 12 */ StructField("C12bengaliratio", DoubleType, false) :: + /* 13 */ StructField("C13brahmiratio", DoubleType, false) :: /* 14 */ StructField("C14cyrilinratio", DoubleType, false) :: /* 15 */ StructField("C15hanratio", DoubleType, false) :: + /* 16 */ StructField("c16malysiaratio", DoubleType, false) :: /* 17 */ StructField("C17tamiratio", DoubleType, false) :: /* 18 */ StructField("C18telugratio", DoubleType, false) :: + /* 19 */ StructField("C19symbolratio", DoubleType, false) :: /* 20 */ StructField("C20alpharatio", DoubleType, false) :: /* 21 */ StructField("C21visibleratio", DoubleType, false) :: + /* 22 */ StructField("C22printableratio", DoubleType, false) :: /* 23 */ StructField("C23blankratio", DoubleType, false) :: /* 24 */ StructField("C24controlratio", DoubleType, false) :: + /* 25 */ StructField("C25hexaratio", DoubleType, false) :: + + // word Features: + /* 26 */ StructField("W1languagewordratio", DoubleType, false) :: /* 27 Boolean */ StructField("W2Iscontainlanguageword", DoubleType, false) :: /* 28 */ StructField("W3lowercaseratio", DoubleType, false) :: + /* 29 Integer */ StructField("W4longestword", IntegerType, false) :: /* 30 Boolean */ StructField("W5IscontainURL", DoubleType, false) :: /* 31 */ StructField("W6badwordratio", DoubleType, false) :: + /* 32 */ StructField("W7uppercaseratio", DoubleType, false) :: /* 33 */ StructField("W8banwordratio", DoubleType, false) :: /* 34 Boolean */ StructField("W9FemalFirstName", DoubleType, false) :: + /* 35 Boolean */ StructField("W10MaleFirstName", DoubleType, false) :: /* 36 Boolean */ StructField("W11IscontainBadword", DoubleType, false) :: + /* 37 Boolean */ StructField("W12IsContainBanword", DoubleType, false) :: + /* 38 integer */ StructField("W13NumberSharewords", DoubleType, false) :: /* 39 Integer */ StructField("W14NumberSharewordswithoutStopwords", DoubleType, false) :: + /* 40 */ StructField("W15PortionQid", DoubleType, false) :: /* 41 */ StructField("W16PortionLnags", DoubleType, false) :: /* 42 */ StructField("W17PortionLinks", DoubleType, false) :: + + // + // // Sentences Features: + /* 43 */ StructField("S1CommentTailLength", DoubleType, false) :: /* 44 */ 
StructField("S2SimikaritySitelinkandLabel", DoubleType, false) :: + /* 45 */ StructField("S3SimilarityLabelandSitelink", DoubleType, false) :: + /* 46 */ StructField("S4SimilarityCommentComment", DoubleType, false) :: + // + // // Statements Features : + /* 47 */ StructField("SS1Property", StringType, false) :: /* 48 */ StructField("SS2DataValue", StringType, false) :: /* 49 */ StructField("SS3ItemValue", StringType, false) :: + // + // + // //User Features : + /* 50 Boolean */ StructField("U1IsPrivileged", DoubleType, false) :: /* 51 Boolean */ StructField("U2IsBotUser", DoubleType, false) :: /* 52 Boolean */ StructField("U3IsBotuserWithFlaguser", DoubleType, false) :: + /* 53 Boolean */ StructField("U4IsProperty", DoubleType, false) :: /* 54 Boolean */ StructField("U5IsTranslator", DoubleType, false) :: /* 55 Boolean */ StructField("U6IsRegister", DoubleType, false) :: + /* 56 */ StructField("U7IPValue", DoubleType, false) :: /* 57 */ StructField("U8UserID", IntegerType, false) :: /* 58 */ StructField("U9HasBirthDate", DoubleType, false) :: + /* 59 */ StructField("U10HasDeathDate", DoubleType, false) :: + + // Items Features : + + /* 60 */ StructField("I1NumberLabels", DoubleType, false) :: /* 61 */ StructField("I2NumberDescription", DoubleType, false) :: /* 62 */ StructField("I3NumberAliases", DoubleType, false) :: + /* 63 */ StructField("I4NumberClaims", DoubleType, false) :: + /* 64 */ StructField("I5NumberSitelinks", DoubleType, false) :: /* 65 */ StructField("I6NumberStatement", DoubleType, false) :: /* 66 */ StructField("I7NumberReferences", DoubleType, false) :: + /* 67 */ StructField("I8NumberQualifier", DoubleType, false) :: + /* 68 */ StructField("I9NumberQualifierOrder", DoubleType, false) :: /* 69 */ StructField("I10NumberBadges", DoubleType, false) :: /* 70 */ StructField("I11ItemTitle", StringType, false) :: + + // Revision Features: + /* 71 */ StructField("R1languageRevision", StringType, false) :: /* 72 */ StructField("R2RevisionLanguageLocal", StringType, false) :: /* 73 */ StructField("R3IslatainLanguage", DoubleType, false) :: + /* 74 */ StructField("R4JsonLength", DoubleType, false) :: /* 75 */ StructField("R5RevisionAction", StringType, false) :: /* 76 */ StructField("R6PrevReviAction", StringType, false) :: + /* 77 */ StructField("R7RevisionAccountChange", DoubleType, false) :: /* 78 */ StructField("R8ParRevision", StringType, false) :: /* 79 */ StructField("R9RevisionTime", StringType, false) :: + /* 80 */ StructField("R10RevisionSize", DoubleType, false) :: /* 81 */ StructField("R11ContentType", StringType, false) :: /* 82 */ StructField("R12BytesIncrease", DoubleType, false) :: + /* 83 */ StructField("R13TimeSinceLastRevi", DoubleType, false) :: /* 84 */ StructField("R14CommentLength", DoubleType, false) :: /* 85 */ StructField("R15RevisionSubaction", StringType, false) :: + /* 86 */ StructField("R16PrevReviSubaction", StringType, false) :: + + Nil) + + val rowRDD = Result_all_Features.map(line => line.split(",")).map(e ⇒ Row(e(0).toInt // character feature column + , e(1).toDouble, e(2).toDouble, e(3).toDouble, e(4).toDouble, e(5).toDouble, e(6).toDouble, e(7).toDouble, e(8).toDouble, e(9).toDouble, RoundDouble(e(10).toDouble), e(11).toDouble, e(12).toDouble, e(13).toDouble // + , e(14).toDouble, e(15).toDouble, e(16).toDouble, e(17).toDouble, e(18).toDouble, e(19).toDouble, e(20).toDouble, e(21).toDouble, e(22).toDouble, e(23).toDouble, e(24).toDouble, e(25).toDouble // Word Feature column + , e(26).toDouble, e(27).toDouble, e(28).toDouble, 
e(29).toDouble.toInt, e(30).toDouble, e(31).toDouble, e(32).toDouble, e(33).toDouble, e(34).toDouble, e(35).toDouble, e(36).toDouble, e(37).toDouble // + , RoundDouble(e(38).toDouble), RoundDouble(e(39).toDouble), e(40).toDouble, e(41).toDouble, e(42).toDouble // Sentences Features column: + , RoundDouble(e(43).toDouble), e(44).toDouble, e(45).toDouble, e(46).toDouble // Statement Features Column: + , e(47), e(48), e(49) // User Features Column: + , e(50).toDouble, e(51).toDouble, e(52).toDouble, e(53).toDouble, e(54).toDouble, e(55).toDouble, e(56).toDouble, e(57).toDouble.toInt, e(58).toDouble, e(59).toDouble // Item Features column: + , e(60).toDouble, e(61).toDouble, e(62).toDouble, e(63).toDouble, e(64).toDouble, e(65).toDouble, e(66).toDouble, e(67).toDouble // + , e(68).toDouble, e(69).toDouble, "Q" + e(70).toDouble.toInt.toString() // Revision Features Column: + , e(71), e(72), e(73).toDouble, e(74).toDouble, e(75), e(76), e(77).toDouble, e(78), e(79), e(80).toDouble, e(81), e(82).toDouble, e(83).toDouble, e(84).toDouble, e(85), e(86))) + + // a.User Frequency: + // number of revisions a user has contributed + // val resu= DF_Tags.groupBy("contributorID").agg(count("Rid")) + DF_Tags.registerTempTable("TagesTable") + val ContributorFreq_for_Each_Revision_DF = sqlContext + .sql("select contributorID as CIDUSER1, count(Rid) as NumberofRevisionsUserContributed from TagesTable where contributorID !='0' group by contributorID ") // .drop("CIDUSER1") + // ContributorFreq_for_Each_Revision_DF.show() + + // b.Cumulated : Number of a unique Item a user has contributed. + val CumulatedNumberof_uniqueItemsForUser_DF = sqlContext + .sql("select contributorID as CIDUSER2, COUNT(DISTINCT itemid) as NumberofUniqueItemsUseredit from TagesTable where contributorID !='0' group by contributorID") // .drop("CIDUSER2") + // CumulatedNumberof_uniqueItemsForUser_DF.show() + + // 1.Item Frequency: + // number of revisions an Item has + val ItemFrequ_DF = sqlContext + .sql("select itemid, count(Rid) as NumberRevisionItemHas from TagesTable group by itemid") + // ItemFrequ_DF.show() + + // 2. Cumulate number of unique users have edited the Item : Did not consider the users IP. Contributor is an IP or Name. we consider name + val CumulatedNumberof_UniqueUserForItem_DF = sqlContext.sql("select itemid, COUNT(DISTINCT contributorID) as NumberUniqUserEditItem from TagesTable where contributorID !='0' group by itemid") + // CumulatedNumberof_UniqueUserForItem_DF.show() + + // 3. 
freq each Item : + val Fre_Item_DF = sqlContext.sql("select itemid, COUNT(itemid) as FreqItem from TagesTable group by itemid") + // Fre_Item_DF.show() + + // ***************************************************************************************************************************************** + // This is Main DataFrame: + val BeforeJoin_All_Features = sqlContext.createDataFrame(rowRDD, schema) + // BeforeJoin_All_Features.show() + + // ********************************** User feature Join + + // Join1 for add The first User Feature : number of revisions a user has contributed + val AfterJoinUser1_All_Features = BeforeJoin_All_Features.as("T1").join(ContributorFreq_for_Each_Revision_DF.as("T2"), $"T1.U8UserID" === $"T2.CIDUSER1", "leftouter").drop("CIDUSER1") + // AfterJoinUser1_All_Features.show() + + // Join2 for add The second User Feature + val AfterJoinUser2_All_Features = AfterJoinUser1_All_Features.as("T1").join(CumulatedNumberof_uniqueItemsForUser_DF.as("T2"), $"T1.U8UserID" === $"T2.CIDUSER2", "leftouter").drop("CIDUSER2") + // AfterJoinUser2_All_Features.show() + + // ********************************** Item Feature Join + // Join3 for add The First Item Feature :number of revisions an Item has + val AfterJoinItem3_All_Features = AfterJoinUser2_All_Features.as("T1").join(ItemFrequ_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") + // AfterJoinItem3_All_Features.show() + + // Join4 for add The Second Item Feature + val AfterJoinItem4_All_Features = AfterJoinItem3_All_Features.as("T1").join(CumulatedNumberof_UniqueUserForItem_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") + // AfterJoinItem4_All_Features.show() + + // Join5 for add The Third Item Feature + val AfterJoinItem5_All_Features = AfterJoinItem4_All_Features.as("T1").join(Fre_Item_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") + // 2 AfterJoinItem5_All_Features.show() + + // ******************************** + + // *Geografical information Feature from Meta File + // REVISION_ID|REVISION_SESSION_ID|USER_COUNTRY_CODE|USER_CONTINENT_CODE|USER_TIME_ZONE|USER_REGION_CODE|USER_CITY_NAME|USER_COUNTY_NAME|REVISION_TAGS + val df_GeoInf = sqlContext.read + .format("com.databricks.spark.csv") + .option("header", "true") // Use first line of all files as header + .option("inferSchema", "true") // Automatically infer data types + .load("hdfs://localhost:9000/mydata/Meta.csv").select("REVISION_ID", "REVISION_SESSION_ID", "USER_COUNTRY_CODE", "USER_CONTINENT_CODE", "USER_TIME_ZONE", + "USER_REGION_CODE", "USER_CITY_NAME", "USER_COUNTY_NAME", "REVISION_TAGS") + // df_GeoInf.show() + + val df_Truth = sqlContext.read + .format("com.databricks.spark.csv") + .option("header", "true") // Use first line of all files as header + .option("inferSchema", "true") // Automatically infer data types + .load("hdfs://localhost:9000/mydata/truth.csv").select("REVISION_ID", "ROLLBACK_REVERTED", "UNDO_RESTORE_REVERTED") + // df_GeoInf.show() + + val AfterJoinGeoInfo_All_Features = AfterJoinItem5_All_Features.as("T1").join(df_GeoInf.as("T2"), $"T1.Rid" === $"T2.REVISION_ID", "leftouter").drop("REVISION_ID").cache() + // AfterJoinGeoInfo_All_Features.show() + + val Final_All_Features = AfterJoinGeoInfo_All_Features.as("T1").join(df_Truth.as("T2"), $"T1.Rid" === $"T2.REVISION_ID", "leftouter").drop("REVISION_ID").cache() + // Final_All_Features.show() + + // Pre- process Data 
============================================================================================================================================================ + + // For String Column, We fill the Null values by "NA": + + var Fill_Missing_Final_All_Features = Final_All_Features.na.fill("NA", Seq("USER_COUNTRY_CODE", "USER_CONTINENT_CODE", + "USER_TIME_ZONE", "USER_REGION_CODE", "USER_CITY_NAME", "USER_COUNTY_NAME", "REVISION_TAGS")).cache() + + // For Integer Frequency Column, We fill the Null values by 0: + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.na.fill(0, Seq("FreqItem", "NumberUniqUserEditItem", "NumberRevisionItemHas", + "NumberofUniqueItemsUseredit", "NumberofRevisionsUserContributed", "REVISION_SESSION_ID")).cache() + // Fill_Missing_Final_All_Features.show() + + val BoolToDoubleUDF = udf { (BoolAsString: String) => if (BoolAsString == "T") 1.0 else 0.0 } + val IntegerToDouble = udf { (IntegerRevisionSessionID: Integer) => IntegerRevisionSessionID.toDouble } + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalROLLBACK_REVERTED", BoolToDoubleUDF(col("ROLLBACK_REVERTED"))) + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalUNDO_RESTORE_REVERTED", BoolToDoubleUDF(col("UNDO_RESTORE_REVERTED"))) + + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalREVISION_SESSION_ID", IntegerToDouble(col("REVISION_SESSION_ID"))) + + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberofRevisionsUserContributed", IntegerToDouble(col("NumberofRevisionsUserContributed"))) + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberofUniqueItemsUseredit", IntegerToDouble(col("NumberofUniqueItemsUseredit"))) + + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberRevisionItemHas", IntegerToDouble(col("NumberRevisionItemHas"))) + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberUniqUserEditItem", IntegerToDouble(col("NumberUniqUserEditItem"))) + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalFreqItem", IntegerToDouble(col("FreqItem"))) + + // ===========================================================================Caharacter Features : Double , Integer Features ==================================================================================== + // Double Ratio: For Ratio Double column, Fill -1 value by Median:Character Features + Ratio of Word Features : + var Samples = Fill_Missing_Final_All_Features.sample(false, 0.001).cache() // .where($"S2SimikaritySitelinkandLabel">0.0 || $"S3SimilarityLabelandSitelink">0.0 || $"S4SimilarityCommentComment">0.0) + Samples.registerTempTable("df") + + val Query = "select " + + "percentile_approx(C1uppercaseratio, 0.5) as meadian1" + "," + "percentile_approx(C2lowercaseratio, 0.5) as median2" + " ," + + "percentile_approx(C3alphanumericratio, 0.5) as median3" + "," + "percentile_approx(C4asciiratio, 0.5) as median4" + "," + + "percentile_approx(C5bracketratio, 0.5) as median5" + "," + "percentile_approx(C6digitalratio, 0.5) as median6" + "," + + "percentile_approx(C7latinratio, 0.5) as median7" + "," + "percentile_approx(C8whitespaceratio, 0.5) as median8" + "," + + "percentile_approx(C9puncratio, 0.5) as median9" + "," + "percentile_approx(C11arabicratio, 0.5) as median11" + "," + + "percentile_approx(C12bengaliratio, 0.5) as median12" + "," + 
"percentile_approx(C13brahmiratio, 0.5) as median13" + "," + + "percentile_approx(C14cyrilinratio, 0.5) as median14" + "," + "percentile_approx(C15hanratio, 0.5) as median15" + "," + + "percentile_approx(c16malysiaratio, 0.5) as median16" + "," + + "percentile_approx(C17tamiratio, 0.5) as median17" + "," + "percentile_approx(C18telugratio, 0.5) as median18" + "," + + "percentile_approx(C19symbolratio, 0.5) as median19" + "," + "percentile_approx(C20alpharatio, 0.5) as median20" + "," + + "percentile_approx(C21visibleratio, 0.5) as median21" + "," + "percentile_approx(C22printableratio, 0.5) as median22" + "," + + "percentile_approx(C23blankratio, 0.5) as median23" + "," + "percentile_approx(C24controlratio, 0.5) as median24" + "," + + "percentile_approx(C25hexaratio, 0.5) as median25" ++ "," + "percentile_approx(W1languagewordratio, 0.5) as median26" + "," + + "percentile_approx(W3lowercaseratio, 0.5) as median27" + "," + "percentile_approx(W6badwordratio, 0.5) as median28" + "," + + "percentile_approx(W7uppercaseratio, 0.5) as median27" + "," + "percentile_approx(W8banwordratio, 0.5) as median27" + " from df" + + val medianValues = sqlContext.sql(Query).rdd + val Median = medianValues.first() + + // Median : + // Character Ratio Features: UDF + val lkpUDF1 = udf { (i: Double) => if (i == 0) Median(0).toString().toDouble else i } + val lkpUDF2 = udf { (i: Double) => if (i == 0) Median(1).toString().toDouble else i } + val lkpUDF3 = udf { (i: Double) => if (i == 0) Median(2).toString().toDouble else i } + val lkpUDF4 = udf { (i: Double) => if (i == 0) Median(3).toString().toDouble else i } + val lkpUDF5 = udf { (i: Double) => if (i == 0) Median(4).toString().toDouble else i } + val lkpUDF6 = udf { (i: Double) => if (i == 0) Median(5).toString().toDouble else i } + val lkpUDF7 = udf { (i: Double) => if (i == 0) Median(6).toString().toDouble else i } + val lkpUDF8 = udf { (i: Double) => if (i == 0) Median(7).toString().toDouble else i } + val lkpUDF9 = udf { (i: Double) => if (i == 0) Median(8).toString().toDouble else i } + + val lkpUDF11 = udf { (i: Double) => if (i == 0) Median(9).toString().toDouble else i } + val lkpUDF12 = udf { (i: Double) => if (i == 0) Median(10).toString().toDouble else i } + val lkpUDF13 = udf { (i: Double) => if (i == 0) Median(11).toString().toDouble else i } + val lkpUDF14 = udf { (i: Double) => if (i == 0) Median(12).toString().toDouble else i } + val lkpUDF15 = udf { (i: Double) => if (i == 0) Median(13).toString().toDouble else i } + val lkpUDF16 = udf { (i: Double) => if (i == 0) Median(14).toString().toDouble else i } + val lkpUDF17 = udf { (i: Double) => if (i == 0) Median(15).toString().toDouble else i } + val lkpUDF18 = udf { (i: Double) => if (i == 0) Median(16).toString().toDouble else i } + val lkpUDF19 = udf { (i: Double) => if (i == 0) Median(17).toString().toDouble else i } + val lkpUDF20 = udf { (i: Double) => if (i == 0) Median(18).toString().toDouble else i } + val lkpUDF21 = udf { (i: Double) => if (i == 0) Median(19).toString().toDouble else i } + val lkpUDF22 = udf { (i: Double) => if (i == 0) Median(20).toString().toDouble else i } + val lkpUDF23 = udf { (i: Double) => if (i == 0) Median(21).toString().toDouble else i } + val lkpUDF24 = udf { (i: Double) => if (i == 0) Median(22).toString().toDouble else i } + val lkpUDF25 = udf { (i: Double) => if (i == 0) Median(23).toString().toDouble else i } + + val df1 = Fill_Missing_Final_All_Features.withColumn("FinalC1uppercaseratio", lkpUDF1(col("C1uppercaseratio"))) // 
.drop("C1uppercaseratio").cache() + val df2 = df1.withColumn("FinalC2lowercaseratio", lkpUDF2(col("C2lowercaseratio"))) // .drop("C2lowercaseratio").cache() + // df1.unpersist() + val df3 = df2.withColumn("FinalC3alphanumericratio", lkpUDF3(col("C3alphanumericratio"))) // .drop("C3alphanumericratio").cache() + // df2.unpersist() + val df4 = df3.withColumn("FinalC4asciiratio", lkpUDF4(col("C4asciiratio"))) // .drop("C4asciiratio").cache() + // df3.unpersist() + val df5 = df4.withColumn("FinalC5bracketratio", lkpUDF5(col("C5bracketratio"))) // .drop("C5bracketratio").cache() + // df4.unpersist() + val df6 = df5.withColumn("FinalC6digitalratio", lkpUDF6(col("C6digitalratio"))) // .drop("C6digitalratio").cache() + // df5.unpersist() + val df7 = df6.withColumn("FinalC7latinratio", lkpUDF7(col("C7latinratio"))) // .drop("C7latinratio").cache() + // df6.unpersist() + val df8 = df7.withColumn("FinalC8whitespaceratio", lkpUDF8(col("C8whitespaceratio"))) // .drop("C8whitespaceratio").cache() + // df7.unpersist() + val df9 = df8.withColumn("FinalC9puncratio", lkpUDF9(col("C9puncratio"))) // .drop("C9puncratio").cache() + + // Mean : + // character integer values : + val Mean_C10longcharacterseq = Samples.agg(mean("C10longcharacterseq")).head() + val C10_Mean = Mean_C10longcharacterseq.getDouble(0) + val lkpUDFC10 = udf { (i: Double) => if (i == 0) C10_Mean else i } + val df10 = df9.withColumn("FinalC10longcharacterseq", lkpUDFC10(col("C10longcharacterseq"))) + + // Median + val df11 = df10.withColumn("FinalC11arabicratio", lkpUDF11(col("C11arabicratio"))) // .drop("C11arabicratio").cache() + // df9.unpersist() + val df12 = df11.withColumn("FinalC12bengaliratio", lkpUDF12(col("C12bengaliratio"))) // .drop("C12bengaliratio").cache() + // df11.unpersist() + val df13 = df12.withColumn("FinalC13brahmiratio", lkpUDF13(col("C13brahmiratio"))) // .drop("C13brahmiratio").cache() + // df12.unpersist() + val df14 = df13.withColumn("FinalC14cyrilinratio", lkpUDF14(col("C14cyrilinratio"))) // .drop("C14cyrilinratio").cache() + // df13.unpersist() + val df15 = df14.withColumn("FinalC15hanratio", lkpUDF15(col("C15hanratio"))) // .drop("C15hanratio").cache() + // df14.unpersist() + val df16 = df15.withColumn("Finalc16malysiaratio", lkpUDF16(col("c16malysiaratio"))) // .drop("c16malysiaratio").cache() + // df15.unpersist() + val df17 = df16.withColumn("FinalC17tamiratio", lkpUDF17(col("C17tamiratio"))) // .drop("C17tamiratio").cache() + // df16.unpersist() + val df18 = df17.withColumn("FinalC18telugratio", lkpUDF18(col("C18telugratio"))) // .drop("C18telugratio").cache() + // df17.unpersist() + val df19 = df18.withColumn("FinalC19symbolratio", lkpUDF19(col("C19symbolratio"))) // .drop("C19symbolratio").cache() + // df18.unpersist() + val df20 = df19.withColumn("FinalC20alpharatio", lkpUDF20(col("C20alpharatio"))) // .drop("C20alpharatio").cache() + // df19.unpersist() + val df21 = df20.withColumn("FinalC21visibleratio", lkpUDF21(col("C21visibleratio"))) // .drop("C21visibleratio").cache() + // df20.unpersist() + val df22 = df21.withColumn("FinalC22printableratio", lkpUDF22(col("C22printableratio"))) // .drop("C22printableratio").cache() + // df21.unpersist() + val df23 = df22.withColumn("FinalC23blankratio", lkpUDF23(col("C23blankratio"))) // .drop("C23blankratio").cache() + // df22.unpersist() + val df24 = df23.withColumn("FinalC24controlratio", lkpUDF24(col("C24controlratio"))) // .drop("C24controlratio").cache() + // df23.unpersist() + val df25 = df24.withColumn("FinalC25hexaratio", lkpUDF25(col("C25hexaratio"))) 
// .drop("C25hexaratio").cache() + + // ************************************************End Character Features **************************************************************************************** + + // ************************************************Start Word Features **************************************************************************************** + + // Word Ratio Features : UDF + val lkpUDFW1 = udf { (i: Double) => if (i == 0) Median(24).toString().toDouble else i } + val lkpUDFW3 = udf { (i: Double) => if (i == 0) Median(25).toString().toDouble else i } + val lkpUDFW6 = udf { (i: Double) => if (i == 0) Median(26).toString().toDouble else i } + val lkpUDFW7 = udf { (i: Double) => if (i == 0) Median(27).toString().toDouble else i } + val lkpUDFW8 = udf { (i: Double) => if (i == 0) Median(28).toString().toDouble else i } + + // 1. + val df26 = df25.withColumn("FinalW1languagewordratio", lkpUDFW1(col("W1languagewordratio"))) // .drop("W1languagewordratio").cache() + + // 2.Boolean(Double) IsContainLanguageWord + + // 3. + val df27 = df26.withColumn("FinalW3lowercaseratio", lkpUDFW3(col("W3lowercaseratio"))) // .drop("W3lowercaseratio").cache() + // df26.unpersist() + + // 4. Integer " Mean: + val Mean_W4longestword = Samples.agg(mean("W4longestword")).head() + val W4_Mean = Mean_W4longestword.getDouble(0) + val lkpUDFW4 = udf { (i: Double) => if (i == 0) W4_Mean else i } + val df28 = df27.withColumn("FinalW4longestword", lkpUDFW4(col("W4longestword"))) + + // 5. Boolean (Double ) W5IscontainURL + // 6. + val df29 = df28.withColumn("FinalW6badwordratio", lkpUDFW6(col("W6badwordratio"))) // .drop("W6badwordratio").cache() + + // 7. + val df30 = df29.withColumn("FinalW7uppercaseratio", lkpUDFW7(col("W7uppercaseratio"))) // .drop("W7uppercaseratio").cache() + + // 8. + val df31 = df30.withColumn("FinalW8banwordratio", lkpUDFW8(col("W8banwordratio"))) // .drop("W8banwordratio").cache() + + // 9.FemalFirst Boolean(Double) + // 10.Male First Boolean(Double) + // 11.ContainBadWord Boolean(Double) + // 12ContainBanWord Boolean(Double) + + // 13. Integer(Double): + val Mean_W13W13NumberSharewords = Samples.agg(mean("W13NumberSharewords")).head() + val W13_Mean = Mean_W13W13NumberSharewords.getDouble(0) + val lkpUDFW13 = udf { (i: Double) => if (i == 0) W13_Mean else i } + val df32 = df31.withColumn("FinalW13NumberSharewords", lkpUDFW13(col("W13NumberSharewords"))) + + // 14. Integer (Double): + val Mean_W14NumberSharewordswithoutStopwords = Samples.agg(mean("W14NumberSharewordswithoutStopwords")).head() + val W14_Mean = Mean_W14NumberSharewordswithoutStopwords.getDouble(0) + val lkpUDFW14 = udf { (i: Double) => if (i == 0) W14_Mean else i } + val df33 = df32.withColumn("FinalW14NumberSharewordswithoutStopwords", lkpUDFW14(col("W14NumberSharewordswithoutStopwords"))) + + // 15. Double (Not ratio): + val Mean_W15PortionQid = Samples.agg(mean("W15PortionQid")).head() + val W15_Mean = Mean_W15PortionQid.getDouble(0) + val lkpUDFW15 = udf { (i: Double) => if (i == 0) W15_Mean else i } + val df34 = df33.withColumn("FinalW15PortionQid", lkpUDFW15(col("W15PortionQid"))) + + // 16. 
Double(Not Ratio): + val Mean_W16PortionLnags = Samples.agg(mean("W16PortionLnags")).head() + val W16_Mean = Mean_W16PortionLnags.getDouble(0) + val lkpUDFW16 = udf { (i: Double) => if (i == 0) W16_Mean else i } + val df35 = df34.withColumn("FinalW16PortionLnags", lkpUDFW16(col("W16PortionLnags"))) + + // 17.Double(Not ratio): + val Mean_W17PortionLinks = Samples.agg(mean("W17PortionLinks")).head() + val W17_Mean = Mean_W17PortionLinks.getDouble(0) + val lkpUDFW17 = udf { (i: Double) => if (i == 0) W17_Mean else i } + val df36 = df35.withColumn("FinalW17PortionLinks", lkpUDFW17(col("W17PortionLinks"))) + + // ************************************************End Word Features **************************************************************************************** + + // ************************************************Start Sentences Features **************************************************************************************** + // 1. Integer(Double) + val Mean_S1CommentTailLength = Samples.agg(mean("S1CommentTailLength")).head() + val S1_Mean = RoundDouble(Mean_S1CommentTailLength.getDouble(0)) + val lkpUDFS1 = udf { (i: Double) => if (i == 0) S1_Mean else i } + val df37 = df36.withColumn("FinalS1CommentTailLength", lkpUDFS1(col("S1CommentTailLength"))) + + // 2. Double but Not ratio values : + val Mean_S2SimikaritySitelinkandLabel = Samples.agg(mean("S2SimikaritySitelinkandLabel")).head() + val S2_Mean = RoundDouble(Mean_S2SimikaritySitelinkandLabel.getDouble(0)) + val lkpUDFS2 = udf { (i: Double) => if (i == 0) S2_Mean else i } + val df39 = df37.withColumn("FinalS2SimikaritySitelinkandLabel", lkpUDFS2(col("S2SimikaritySitelinkandLabel"))) + + // 3. Double but Not ratio values : + val Mean_S3SimilarityLabelandSitelink = Samples.agg(mean("S3SimilarityLabelandSitelink")).head() + val S3_Mean = RoundDouble(Mean_S3SimilarityLabelandSitelink.getDouble(0)) + val lkpUDFS3 = udf { (i: Double) => if (i == 0.0) S3_Mean else i } + val df40 = df39.withColumn("FinalS3SimilarityLabelandSitelink", lkpUDFS3(col("S3SimilarityLabelandSitelink"))) + + // 4. Double but Not ratio values : + val Mean_S4SimilarityCommentComment = Samples.agg(mean("S4SimilarityCommentComment")).head() + val S4_Mean = RoundDouble(Mean_S4SimilarityCommentComment.getDouble(0)) + val lkpUDFS4 = udf { (i: Double) => if (i == 0.0) S4_Mean else i } + val df41 = df40.withColumn("FinalS4SimilarityCommentComment", lkpUDFS4(col("S4SimilarityCommentComment"))) + + // df41.show() + // ************************************************End Sentences Features **************************************************************************************** + // *********************************************** Start Statement Features **************************************************************************************** + // 1. String + // 2. String + // 3. String + // ************************************************End Statement Features **************************************************************************************** + // *********************************************** Start User Features **************************************************************************************** + + // 1.Boolean(Double) + // 2.Boolean(Double) + // 3.Boolean(Double) + // 4.Boolean(Double) + // 5.Boolean(Double) + // 6.Boolean(Double) + // 7. (Double) IP No need to fill Missing Data + // 8. 
(Double) ID No need to fill Missing Data + // 9.Boolean(Double) + // 10.Boolean(Double) + + // *********************************************** End User Features **************************************************************************************** + // *********************************************** Start Item Features **************************************************************************************** + // 1. Integer (Double) No need to fill missing values + // 2. Integer (Double) No need to fill missing values + // 3. Integer (Double) No need to fill missing values + // 4. Integer (Double) No need to fill missing values + // 5. Integer (Double) No need to fill missing values + // 6. Integer (Double) No need to fill missing values + // 7. Integer (Double) No need to fill missing values + // 8. Integer (Double) No need to fill missing values + // 9. Integer (Double) No need to fill missing values + // 10. Integer (Double) No need to fill missing values + // 11. String + // *********************************************** End Item Features **************************************************************************************** + // *********************************************** Start Revision Features **************************************************************************************** + // 1.String + // 2.String + // 3.Boolean (Double) + // 4.Integer(Double) + // 5.String + // 6.String + // 7. Boolean(Double) + // 8. String + // 9.String + // 10. Integer (Double) + // 11.String + // 12. integer(Double) + // 13. Long(Double) + // 14. integer (Double) + // 15.String + // 16.String + // *********************************************** End Revision Features **************************************************************************************** + // *********************************************** Meta Data , Truth Data and Frequnces **************************************************************************************** + // Meta + // 1.Revision Session :Integer (Converted to Double) + // 2. 
User Country Code + // 3.User Continent Code + // 4.User Time Size + // 5.User Region Code + // 6.User-city Name + // 7.User Country Name + // 8.RevisionTags + + // Truth: + // 1.Undo + + // Freq : + + // 1.5 features + + // Roll Boolean :Boolean (Double) + // Undo :Boolean (Double) + + // *********************************************** End Revision Features **************************************************************************************** + + // ===========================================================================String Features==================================================================================== + + val df42 = df41.withColumn( + // statement String features: + "StringFeatures", concat($"SS1Property", lit(";"), $"SS2DataValue", lit(";"), $"SS3ItemValue", lit(";"), $"I11ItemTitle", + // Revision String Features: + lit(";"), $"R1languageRevision", + lit(";"), $"R2RevisionLanguageLocal", + lit(";"), $"R5RevisionAction", + lit(";"), $"R6PrevReviAction", + lit(";"), $"R8ParRevision", + lit(";"), $"R9RevisionTime", + lit(";"), $"R11ContentType", + lit(";"), $"R15RevisionSubaction", + lit(";"), $"R16PrevReviSubaction", + + lit(";"), $"USER_COUNTRY_CODE", + lit(";"), $"USER_CONTINENT_CODE", + lit(";"), $"USER_TIME_ZONE", + lit(";"), $"USER_REGION_CODE", + lit(";"), $"USER_CITY_NAME", + lit(";"), $"USER_COUNTY_NAME", + lit(";"), $"REVISION_TAGS")) + + val toArray = udf((record: String) => record.split(";").map(_.toString())) + val test1 = df42.withColumn("StringFeatures", toArray(col("StringFeatures"))) + // test1.show() + // test1.printSchema() + + val word2Vec = new Word2Vec().setInputCol("StringFeatures").setOutputCol("result").setVectorSize(20).setMinCount(0) + val model = word2Vec.fit(test1) + val result = model.transform(test1) // .rdd + + // result.show() + + val Todense = udf((b: Vector) => b.toDense) + val test_new2 = result.withColumn("result", Todense(col("result"))) + + val assembler = new VectorAssembler().setInputCols(Array( + "result", + + // character + "FinalC1uppercaseratio", "FinalC2lowercaseratio", "FinalC3alphanumericratio", "FinalC4asciiratio", "FinalC5bracketratio", "FinalC6digitalratio", + "FinalC7latinratio", "FinalC8whitespaceratio", "FinalC9puncratio", "FinalC10longcharacterseq", "FinalC11arabicratio", "FinalC12bengaliratio", + "FinalC13brahmiratio", "FinalC14cyrilinratio", "FinalC15hanratio", "Finalc16malysiaratio", "FinalC17tamiratio", "FinalC18telugratio", + "FinalC19symbolratio", "FinalC20alpharatio", "FinalC21visibleratio", "FinalC22printableratio", "FinalC23blankratio", "FinalC24controlratio", "FinalC25hexaratio", + + // Words + "FinalW1languagewordratio", "W2Iscontainlanguageword", "FinalW3lowercaseratio", "FinalW4longestword", "W5IscontainURL", "FinalW6badwordratio", + "FinalW7uppercaseratio", "FinalW8banwordratio", "W9FemalFirstName", "W10MaleFirstName", "W11IscontainBadword", "W12IsContainBanword", + "FinalW13NumberSharewords", "FinalW14NumberSharewordswithoutStopwords", "FinalW15PortionQid", "FinalW16PortionLnags", "FinalW17PortionLinks", + + // Sentences : + "FinalS1CommentTailLength", "FinalS2SimikaritySitelinkandLabel", "FinalS3SimilarityLabelandSitelink", "FinalS4SimilarityCommentComment", + + // User : + "U1IsPrivileged", "U2IsBotUser", "U3IsBotuserWithFlaguser", "U4IsProperty", "U5IsTranslator", "U6IsRegister", "U7IPValue", "U8UserID", + "U9HasBirthDate", "U10HasDeathDate", + + // Item: + + "I1NumberLabels", "I2NumberDescription", "I3NumberAliases", "I4NumberClaims", "I5NumberSitelinks", "I6NumberStatement", + 
"I7NumberReferences", "I8NumberQualifier", "I9NumberQualifierOrder", "I10NumberBadges", + + // Revision: + "R3IslatainLanguage", "R4JsonLength", "R7RevisionAccountChange", "R10RevisionSize", "R12BytesIncrease", + "R13TimeSinceLastRevi", "R14CommentLength", + + // Meta , truth , Freq + // meta : + "FinalREVISION_SESSION_ID", // Truth: - //1.Undo - - // Freq : - - //1.5 features - - // Roll Boolean :Boolean (Double) - // Undo :Boolean (Double) - - //*********************************************** End Revision Features **************************************************************************************** - - //===========================================================================String Features==================================================================================== - - val df42 = df41.withColumn( - //statement String features: - "StringFeatures", concat($"SS1Property", lit(";"), $"SS2DataValue", lit(";"), $"SS3ItemValue", lit(";"), $"I11ItemTitle", - //Revision String Features: - lit(";"), $"R1languageRevision", - lit(";"), $"R2RevisionLanguageLocal", - lit(";"), $"R5RevisionAction", - lit(";"), $"R6PrevReviAction", - lit(";"), $"R8ParRevision", - lit(";"), $"R9RevisionTime", - lit(";"), $"R11ContentType", - lit(";"), $"R15RevisionSubaction", - lit(";"), $"R16PrevReviSubaction", - - lit(";"), $"USER_COUNTRY_CODE", - lit(";"), $"USER_CONTINENT_CODE", - lit(";"), $"USER_TIME_ZONE", - lit(";"), $"USER_REGION_CODE", - lit(";"), $"USER_CITY_NAME", - lit(";"), $"USER_COUNTY_NAME", - lit(";"), $"REVISION_TAGS")) + "FinalUNDO_RESTORE_REVERTED", - val toArray = udf((record: String) => record.split(";").map(_.toString())) - val test1 = df42.withColumn("StringFeatures", toArray(col("StringFeatures"))) - // test1.show() - // test1.printSchema() + // Freq: + "FinalNumberofRevisionsUserContributed", + "FinalNumberofUniqueItemsUseredit", "FinalNumberRevisionItemHas", "FinalNumberUniqUserEditItem", "FinalFreqItem")).setOutputCol("features") + val Testing_Data = assembler.transform(test_new2) - val word2Vec = new Word2Vec().setInputCol("StringFeatures").setOutputCol("result").setVectorSize(20).setMinCount(0) - val model = word2Vec.fit(test1) - val result = model.transform(test1) //.rdd - - // result.show() - - val Todense = udf((b: Vector) => b.toDense) - val test_new2 = result.withColumn("result", Todense(col("result"))) - - val assembler = new VectorAssembler().setInputCols(Array( - "result", - - // character - "FinalC1uppercaseratio", "FinalC2lowercaseratio", "FinalC3alphanumericratio", "FinalC4asciiratio", "FinalC5bracketratio", "FinalC6digitalratio", - "FinalC7latinratio", "FinalC8whitespaceratio", "FinalC9puncratio", "FinalC10longcharacterseq", "FinalC11arabicratio", "FinalC12bengaliratio", - "FinalC13brahmiratio", "FinalC14cyrilinratio", "FinalC15hanratio", "Finalc16malysiaratio", "FinalC17tamiratio", "FinalC18telugratio", - "FinalC19symbolratio", "FinalC20alpharatio", "FinalC21visibleratio", "FinalC22printableratio", "FinalC23blankratio", "FinalC24controlratio", "FinalC25hexaratio", - - // Words - "FinalW1languagewordratio", "W2Iscontainlanguageword", "FinalW3lowercaseratio", "FinalW4longestword", "W5IscontainURL", "FinalW6badwordratio", - "FinalW7uppercaseratio", "FinalW8banwordratio", "W9FemalFirstName", "W10MaleFirstName", "W11IscontainBadword", "W12IsContainBanword", - "FinalW13NumberSharewords", "FinalW14NumberSharewordswithoutStopwords", "FinalW15PortionQid", "FinalW16PortionLnags", "FinalW17PortionLinks", - - //Sentences : - "FinalS1CommentTailLength", 
"FinalS2SimikaritySitelinkandLabel", "FinalS3SimilarityLabelandSitelink", "FinalS4SimilarityCommentComment", - - // User : - "U1IsPrivileged", "U2IsBotUser", "U3IsBotuserWithFlaguser", "U4IsProperty", "U5IsTranslator", "U6IsRegister", "U7IPValue", "U8UserID", - "U9HasBirthDate", "U10HasDeathDate", - - //Item: - - "I1NumberLabels", "I2NumberDescription", "I3NumberAliases", "I4NumberClaims", "I5NumberSitelinks", "I6NumberStatement", - "I7NumberReferences", "I8NumberQualifier", "I9NumberQualifierOrder", "I10NumberBadges", - - //Revision: - "R3IslatainLanguage", "R4JsonLength", "R7RevisionAccountChange", "R10RevisionSize", "R12BytesIncrease", - "R13TimeSinceLastRevi", "R14CommentLength", - - // Meta , truth , Freq - // meta : - "FinalREVISION_SESSION_ID", - // Truth: - "FinalUNDO_RESTORE_REVERTED", - - //Freq: - "FinalNumberofRevisionsUserContributed", - "FinalNumberofUniqueItemsUseredit", "FinalNumberRevisionItemHas", "FinalNumberUniqUserEditItem", "FinalFreqItem")).setOutputCol("features") - val Testing_Data = assembler.transform(test_new2) - - // Prepare the data for classification: + // Prepare the data for classification: // NewData.registerTempTable("DB") // val Training_Data = sqlContext.sql("select Rid, features, FinalROLLBACK_REVERTED from DB") - //val Data = sqlContext.sql("select Rid, features, FinalROLLBACK_REVERTED as label from DB") // for logistic regrision + // val Data = sqlContext.sql("select Rid, features, FinalROLLBACK_REVERTED as label from DB") // for logistic regrision - //Data.show() + // Data.show() // val TestClassifiers = new Classifiers() -// - // TestClassifiers.RandomForestClassifer(Testing_Data, sqlContext) -// // TestClassifiers.DecisionTreeClassifier(Data, sqlContext) -// // TestClassifiers.LogisticRegrision(Data, sqlContext) -// // TestClassifiers.GradientBoostedTree(Data, sqlContext) -// // TestClassifiers.MultilayerPerceptronClassifier(Data, sqlContext) + // + // TestClassifiers.RandomForestClassifer(Testing_Data, sqlContext) + // // TestClassifiers.DecisionTreeClassifier(Data, sqlContext) + // // TestClassifiers.LogisticRegrision(Data, sqlContext) + // // TestClassifiers.GradientBoostedTree(Data, sqlContext) + // // TestClassifiers.MultilayerPerceptronClassifier(Data, sqlContext) Testing_Data - - - } - - - - - def Triger(sc: SparkContext): Unit = { - -// val sqlContext = new org.apache.spark.sql.SQLContext(sc) -// import sqlContext.implicits._ -// import org.apache.spark.sql.functions._ // for UDF -// import org.apache.spark.sql.types._ -// -// //******************************************************************************************************************************* -// println("Please Enter 0 for JTriple and 1 for TRIX process and 2 for RDFXML process and 3 for NormalXML:") -// val num = scala.io.StdIn.readLine() -// -// if (num == "0") { -// println("JTriple.........!!!!!!") -// // Streaming records:RDFJtriple file : -// val jobConf = new JobConf() -// -// val JTriple_Parser_OBJ = new ParseJTriple() -// val DRF_Builder_JTripleOBJ = new FacilitiesClass() -// val RDD_JTriple = JTriple_Parser_OBJ.Start_JTriple_Parser(jobConf, sc) -// RDD_JTriple.foreach(println) -// //----------------------------DF for RDF TRIX ------------------------------------------ -// // Create SQLContext Object: -// val sqlContext = new org.apache.spark.sql.SQLContext(sc) -// val DFR_JTriple = DRF_Builder_JTripleOBJ.RDD_TO_DFR_JTriple(RDD_JTriple, sqlContext) -// DFR_JTriple.show() -// -// } - -// if (num == "1") { -// -// println("TRIX.........!!!!!!") -// // Streaming 
records:RDFTRIX file : -// val jobConf = new JobConf() -// -// val TRIX_Parser_OBJ = new ParseTRIX() -// val DRF_Builder_RDFTRIX_OBJ = new FacilitiesClass() -// -// val RDD_TRIX = TRIX_Parser_OBJ.Start_TriX_Parser(jobConf, sc) -// RDD_TRIX.foreach(println) -// -// //----------------------------DF for RDF TRIX ------------------------------------------ -// // Create SQLContext Object: -// val sqlContext = new org.apache.spark.sql.SQLContext(sc) -// val DFR_TRIX = DRF_Builder_RDFTRIX_OBJ.RDD_TO_DFR_TRIX(RDD_TRIX, sqlContext) -// DFR_TRIX.show() -// -// } //RDF XML file :********************************************************************************************************* -// else if (num == "2") { -// println("RDF XML .........!!!!!!") -// // Streaming records:RDFXML file : -// val jobConf_Record = new JobConf() -// val jobConf_Prefixes = new JobConf() -// -// val RDFXML_Parser_OBJ = new ParseRDFXML() -// val DRF_Builder_RDFXML_OBJ = new FacilitiesClass() -// -// val RDD_RDFXML = RDFXML_Parser_OBJ.start_RDFXML_Parser(jobConf_Record, jobConf_Prefixes, sc) -// RDD_RDFXML.foreach(println) -// -// //----------------------------DF for RDF XML ------------------------------------------ -// // Create SQLContext Object: -// val sqlContext = new org.apache.spark.sql.SQLContext(sc) -// val DFR_RDF_XML = DRF_Builder_RDFXML_OBJ.RDD_TO_DFR_RDFXML(RDD_RDFXML, sqlContext) -// DFR_RDF_XML.show() -// // -// // NOrmal XML Example WikiData: *************************************************************************************************** -// } else if (num == "3") { - // Streaming records: -// val jobConf = new JobConf() -// val NormalXML_Parser_OBJ = new ParseNormalXML() -// val RDD_OBJ = new ParseNormalXML() -// val RDD_All_Record1 = RDD_OBJ.Training_DB_NormalXML_Parser_Input1(sc) -// val RDD_All_Record2 = RDD_OBJ.Training_DB_NormalXML_Parser_Input2(sc) -// val RDD_All_Record3 = RDD_OBJ.Training_DB_NormalXML_Parser_Input3(sc) -// //RDD_All_Record1.foreach(println) -// //RDD_All_Record2.foreach(println) -// // RDD_All_Record3.foreach(println) -// -// val RDD_All_Record = RDD_All_Record1.union(RDD_All_Record2).union(RDD_All_Record3).distinct().cache() -// -// //println(RDD_All_Record.count()) -// // println(RDD_All_Record.count()) -// -// // ======= Json part : -// //Json RDD : Each record has its Revision iD: -// val JsonRDD = RDD_All_Record.map(_.split("NNLL")).map(v => replacing_with_Quoto(v(0), v(8))).cache() -// //JsonRDD.foreach(println) -// //println(JsonRDD.count()) -// -// // Data set -// val Ds_Json = sqlContext.jsonRDD(JsonRDD).select("key", "id", "labels", "descriptions", "aliases", "claims", "sitelinks").cache() -// //Ds_Json.show() -// // println(Ds_Json.count()) -// -// // ======= Tags part : // Contributor IP here is in Decimal format not IP format and It is converted in ParseNormalXml stage -// val TagsRDD = RDD_All_Record.map(_.split("NNLL")).map(x => (x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9), x(10), x(11))).cache() -// val DF_Tags = TagsRDD.toDF("Rid", "Itemid", "comment", "pid", "time", "contributorIP", "contributorID", "contributorName", "JsonText", "model", "format", "sha").cache() -// // DF_Tags.show() -// // println(DF_Tags.count()) -// -// //======== Join Json part with Tag Part:============================ -// //Joining to have full data -// val DF_First_DF_Result_Join_Tags_and_Json = DF_Tags.as("T1").join(Ds_Json.as("T2"), $"T1.Rid" === $"T2.key", "leftouter").select("Rid", "itemid", "comment", "pid", "time", "contributorIP", "contributorID", "contributorName", 
"JsonText", "labels", "descriptions", "aliases", "claims", "sitelinks", "model", "format", "sha") //.orderBy("Rid", "Itemid") -// DF_First_DF_Result_Join_Tags_and_Json.registerTempTable("Data1") -// val dfr_DATA_JsonTages1 = sqlContext.sql("select * from Data1 order by itemid ,Rid ").cache() -// -// val colNames = Seq("Rid2", "itemid2", "comment2", "pid2", "time2", "contributorIP2", "contributorID2", "contributorName2", "JsonText2", "labels2", "descriptions2", "aliases2", "claims2", "sitelinks2", "model2", "format2", "sha2") -// val DF_Second = DF_First_DF_Result_Join_Tags_and_Json.toDF(colNames: _*) //.distinct() -// DF_Second.registerTempTable("Data2") -// -// //===================================================================Parent // Previous Revision============================================================================================================== -// //val DF_Joined = result1.as("df1").join(result2.as("df2"), col("itemid") === col("itemid2") && col("index1") === col("index2") + 1, "leftouter").select("Rid", "itemid", "comment", "pid", "time", "contributorIP", "contributorID", "contributorName", "JsonText", "labels", "descriptions", "aliases", "claims", "sitelinks", "model", "format", "sha", "Rid2", "itemid2", "comment2", "pid2", "time2", "contributorIP2", "contributorID2", "contributorName2", "JsonText2", "labels2", "descriptions2", "aliases2", "claims2", "sitelinks2", "model2", "format2", "sha2") -// //.select("itemid", "Rid","pid","time","itemid2","Rid2","pid2","time2") -// -// //Joining based on Parent Id to get the previous cases: ParentID -// val DF_Joined = DF_First_DF_Result_Join_Tags_and_Json.as("df1").join(DF_Second.as("df2"), $"df1.pid" === $"df2.Rid2", "leftouter").distinct() -// -// val RDD_After_JoinDF = DF_Joined.rdd.distinct() -// val x = RDD_After_JoinDF.map(row => (row(0).toString().toInt, row)).cache() -// val part = new RangePartitioner(4, x) -// val partitioned = x.partitionBy(part).persist() // persist is important for this case and obligatory. 
-// //partitioned.foreach(println) -// // -// // //=====================================================All Features Based on Categories of Features Data Type :================================================================================== -// // -// val Result_all_Features = partitioned.map { case (x, y) => (x.toString() + "," + All_Features(y).toString()) } // we convert the Pair RDD to String one LineRDD to be able to make DF based on "," -// //Result_all_Features.foreach(println) -// // println("nayef" + Result_all_Features.count()) -// -// // Conver the RDD of All Features to DataFrame: -// -// val schema = StructType( -// -// //0 -// StructField("Rid", IntegerType, false) :: -// -// // Character Features : -// /* 1*/ StructField("C1uppercaseratio", DoubleType, false) :: /*2 */ StructField("C2lowercaseratio", DoubleType, false) :: /*3*/ StructField("C3alphanumericratio", DoubleType, false) :: -// /*4*/ StructField("C4asciiratio", DoubleType, false) :: /*5*/ StructField("C5bracketratio", DoubleType, false) :: /*6*/ StructField("C6digitalratio", DoubleType, false) :: -// /*7*/ StructField("C7latinratio", DoubleType, false) :: /*8*/ StructField("C8whitespaceratio", DoubleType, false) :: /* 9*/ StructField("C9puncratio", DoubleType, false) :: -// /*10*/ StructField("C10longcharacterseq", DoubleType, false) :: /*11*/ StructField("C11arabicratio", DoubleType, false) :: /*12*/ StructField("C12bengaliratio", DoubleType, false) :: -// /*13 */ StructField("C13brahmiratio", DoubleType, false) :: /*14*/ StructField("C14cyrilinratio", DoubleType, false) :: /*15*/ StructField("C15hanratio", DoubleType, false) :: -// /*16*/ StructField("c16malysiaratio", DoubleType, false) :: /*17*/ StructField("C17tamiratio", DoubleType, false) :: /*18*/ StructField("C18telugratio", DoubleType, false) :: -// /*19 */ StructField("C19symbolratio", DoubleType, false) :: /*20 */ StructField("C20alpharatio", DoubleType, false) :: /*21*/ StructField("C21visibleratio", DoubleType, false) :: -// /*22*/ StructField("C22printableratio", DoubleType, false) :: /*23*/ StructField("C23blankratio", DoubleType, false) :: /*24 */ StructField("C24controlratio", DoubleType, false) :: -// /* 25 */ StructField("C25hexaratio", DoubleType, false) :: -// -// //word Features: -// /*26*/ StructField("W1languagewordratio", DoubleType, false) :: /*27 Boolean */ StructField("W2Iscontainlanguageword", DoubleType, false) :: /*28*/ StructField("W3lowercaseratio", DoubleType, false) :: -// /*29 Integer */ StructField("W4longestword", IntegerType, false) :: /*30 Boolean */ StructField("W5IscontainURL", DoubleType, false) :: /*31*/ StructField("W6badwordratio", DoubleType, false) :: -// /*32*/ StructField("W7uppercaseratio", DoubleType, false) :: /*33*/ StructField("W8banwordratio", DoubleType, false) :: /*34 Boolean */ StructField("W9FemalFirstName", DoubleType, false) :: -// /*35 Boolean */ StructField("W10MaleFirstName", DoubleType, false) :: /*36 Boolean */ StructField("W11IscontainBadword", DoubleType, false) :: /*37 Boolean*/ StructField("W12IsContainBanword", DoubleType, false) :: -// /*38 integer */ StructField("W13NumberSharewords", DoubleType, false) :: /*39 Integer */ StructField("W14NumberSharewordswithoutStopwords", DoubleType, false) :: -// /*40*/ StructField("W15PortionQid", DoubleType, false) :: /*41*/ StructField("W16PortionLnags", DoubleType, false) :: /*42*/ StructField("W17PortionLinks", DoubleType, false) :: -// -// // -// // // Sentences Features: -// /*43*/ StructField("S1CommentTailLength", DoubleType, false) :: /*44*/ 
StructField("S2SimikaritySitelinkandLabel", DoubleType, false) :: /*45*/ StructField("S3SimilarityLabelandSitelink", DoubleType, false) :: /*46*/ StructField("S4SimilarityCommentComment", DoubleType, false) :: -// // -// // // Statements Features : -// /*47*/ StructField("SS1Property", StringType, false) :: /*48*/ StructField("SS2DataValue", StringType, false) :: /*49*/ StructField("SS3ItemValue", StringType, false) :: -// // -// // -// // //User Features : -// /*50 Boolean*/ StructField("U1IsPrivileged", DoubleType, false) :: /*51 Boolean*/ StructField("U2IsBotUser", DoubleType, false) :: /*52 Boolean*/ StructField("U3IsBotuserWithFlaguser", DoubleType, false) :: -// /*53 Boolean*/ StructField("U4IsProperty", DoubleType, false) :: /*54 Boolean*/ StructField("U5IsTranslator", DoubleType, false) :: /*55 Boolean*/ StructField("U6IsRegister", DoubleType, false) :: -// /*56*/ StructField("U7IPValue", DoubleType, false) :: /*57*/ StructField("U8UserID", IntegerType, false) :: /*58*/ StructField("U9HasBirthDate", DoubleType, false) :: /*59*/ StructField("U10HasDeathDate", DoubleType, false) :: -// -// //Items Features : -// -// /*60*/ StructField("I1NumberLabels", DoubleType, false) :: /*61*/ StructField("I2NumberDescription", DoubleType, false) :: /*62*/ StructField("I3NumberAliases", DoubleType, false) :: /*63*/ StructField("I4NumberClaims", DoubleType, false) :: -// /*64*/ StructField("I5NumberSitelinks", DoubleType, false) :: /*65*/ StructField("I6NumberStatement", DoubleType, false) :: /*66*/ StructField("I7NumberReferences", DoubleType, false) :: /*67*/ StructField("I8NumberQualifier", DoubleType, false) :: -// /*68*/ StructField("I9NumberQualifierOrder", DoubleType, false) :: /*69*/ StructField("I10NumberBadges", DoubleType, false) :: /*70*/ StructField("I11ItemTitle", StringType, false) :: -// -// // Revision Features: -// /*71*/ StructField("R1languageRevision", StringType, false) :: /*72*/ StructField("R2RevisionLanguageLocal", StringType, false) :: /*73*/ StructField("R3IslatainLanguage", DoubleType, false) :: -// /*74*/ StructField("R4JsonLength", DoubleType, false) :: /*75*/ StructField("R5RevisionAction", StringType, false) :: /*76*/ StructField("R6PrevReviAction", StringType, false) :: -// /*77*/ StructField("R7RevisionAccountChange", DoubleType, false) :: /*78*/ StructField("R8ParRevision", StringType, false) :: /*79*/ StructField("R9RevisionTime", StringType, false) :: -// /*80*/ StructField("R10RevisionSize", DoubleType, false) :: /*81*/ StructField("R11ContentType", StringType, false) :: /*82*/ StructField("R12BytesIncrease", DoubleType, false) :: -// /*83*/ StructField("R13TimeSinceLastRevi", DoubleType, false) :: /*84*/ StructField("R14CommentLength", DoubleType, false) :: /*85*/ StructField("R15RevisionSubaction", StringType, false) :: -// /*86*/ StructField("R16PrevReviSubaction", StringType, false) :: -// -// Nil) -// -// val rowRDD = Result_all_Features.map(line => line.split(",")).map(e ⇒ Row(e(0).toInt // character feature column -// , e(1).toDouble, e(2).toDouble, e(3).toDouble, e(4).toDouble, e(5).toDouble, e(6).toDouble, e(7).toDouble, e(8).toDouble, e(9).toDouble, RoundDouble(e(10).toDouble), -// e(11).toDouble, e(12).toDouble, e(13).toDouble, e(14).toDouble, e(15).toDouble, e(16).toDouble, e(17).toDouble, e(18).toDouble, e(19).toDouble, e(20).toDouble, e(21).toDouble, e(22).toDouble, e(23).toDouble, e(24).toDouble, e(25).toDouble //Word Feature column -// , e(26).toDouble, e(27).toDouble, e(28).toDouble, e(29).toDouble.toInt, e(30).toDouble, e(31).toDouble, 
e(32).toDouble, e(33).toDouble, e(34).toDouble, e(35).toDouble, e(36).toDouble, e(37).toDouble, RoundDouble(e(38).toDouble), RoundDouble(e(39).toDouble), e(40).toDouble, e(41).toDouble, e(42).toDouble // Sentences Features column: -// , RoundDouble(e(43).toDouble), e(44).toDouble, e(45).toDouble, e(46).toDouble //Statement Features Column: -// , e(47), e(48), e(49) // User Features Column: -// , e(50).toDouble, e(51).toDouble, e(52).toDouble, e(53).toDouble, e(54).toDouble, e(55).toDouble, e(56).toDouble, e(57).toDouble.toInt, e(58).toDouble, e(59).toDouble //Item Features column: -// , e(60).toDouble, e(61).toDouble, e(62).toDouble, e(63).toDouble, e(64).toDouble, e(65).toDouble, e(66).toDouble, e(67).toDouble, e(68).toDouble, e(69).toDouble, "Q" + e(70).toDouble.toInt.toString() //Revision Features Column: -// , e(71), e(72), e(73).toDouble, e(74).toDouble, e(75), e(76), e(77).toDouble, e(78), e(79), e(80).toDouble, e(81), e(82).toDouble, e(83).toDouble, e(84).toDouble, e(85), e(86))) -// -// //a.User Frequency: -// //number of revisions a user has contributed -// //val resu= DF_Tags.groupBy("contributorID").agg(count("Rid")) -// DF_Tags.registerTempTable("TagesTable") -// val ContributorFreq_for_Each_Revision_DF = sqlContext.sql("select contributorID as CIDUSER1, count(Rid) as NumberofRevisionsUserContributed from TagesTable where contributorID !='0' group by contributorID ") //.drop("CIDUSER1") -// //ContributorFreq_for_Each_Revision_DF.show() -// -// //b.Cumulated : Number of a unique Item a user has contributed. -// val CumulatedNumberof_uniqueItemsForUser_DF = sqlContext.sql("select contributorID as CIDUSER2, COUNT(DISTINCT itemid) as NumberofUniqueItemsUseredit from TagesTable where contributorID !='0' group by contributorID") //.drop("CIDUSER2") -// //CumulatedNumberof_uniqueItemsForUser_DF.show() -// -// //1.Item Frequency: -// // number of revisions an Item has -// val ItemFrequ_DF = sqlContext.sql("select itemid, count(Rid) as NumberRevisionItemHas from TagesTable group by itemid") -// // ItemFrequ_DF.show() -// -// //2. Cumulate number of unique users have edited the Item : Did not consider the users IP. Contributor is an IP or Name. we consider name -// val CumulatedNumberof_UniqueUserForItem_DF = sqlContext.sql("select itemid, COUNT(DISTINCT contributorID) as NumberUniqUserEditItem from TagesTable where contributorID !='0' group by itemid") -// //CumulatedNumberof_UniqueUserForItem_DF.show() -// -// //3. 
freq each Item : -// val Fre_Item_DF = sqlContext.sql("select itemid, COUNT(itemid) as FreqItem from TagesTable group by itemid") -// // Fre_Item_DF.show() -// -// //***************************************************************************************************************************************** -// // This is Main DataFrame: -// val BeforeJoin_All_Features = sqlContext.createDataFrame(rowRDD, schema) -// //BeforeJoin_All_Features.show() -// -// //********************************** User feature Join -// -// // Join1 for add The first User Feature : number of revisions a user has contributed -// val AfterJoinUser1_All_Features = BeforeJoin_All_Features.as("T1").join(ContributorFreq_for_Each_Revision_DF.as("T2"), $"T1.U8UserID" === $"T2.CIDUSER1", "leftouter").drop("CIDUSER1") -// //AfterJoinUser1_All_Features.show() -// -// // Join2 for add The second User Feature -// val AfterJoinUser2_All_Features = AfterJoinUser1_All_Features.as("T1").join(CumulatedNumberof_uniqueItemsForUser_DF.as("T2"), $"T1.U8UserID" === $"T2.CIDUSER2", "leftouter").drop("CIDUSER2") -// //AfterJoinUser2_All_Features.show() -// -// //********************************** Item Feature Join -// // Join3 for add The First Item Feature :number of revisions an Item has -// val AfterJoinItem3_All_Features = AfterJoinUser2_All_Features.as("T1").join(ItemFrequ_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") -// // AfterJoinItem3_All_Features.show() -// -// // Join4 for add The Second Item Feature -// val AfterJoinItem4_All_Features = AfterJoinItem3_All_Features.as("T1").join(CumulatedNumberof_UniqueUserForItem_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") -// // AfterJoinItem4_All_Features.show() -// -// // Join5 for add The Third Item Feature -// val AfterJoinItem5_All_Features = AfterJoinItem4_All_Features.as("T1").join(Fre_Item_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") -// //2 AfterJoinItem5_All_Features.show() -// -// //******************************** -// -// //*Geografical information Feature from Meta File -// //REVISION_ID|REVISION_SESSION_ID|USER_COUNTRY_CODE|USER_CONTINENT_CODE|USER_TIME_ZONE|USER_REGION_CODE|USER_CITY_NAME|USER_COUNTY_NAME|REVISION_TAGS -// val df_GeoInf = sqlContext.read -// .format("com.databricks.spark.csv") -// .option("header", "true") // Use first line of all files as header -// .option("inferSchema", "true") // Automatically infer data types -// .load("hdfs://localhost:9000/mydata/Meta.csv").select("REVISION_ID", "REVISION_SESSION_ID", "USER_COUNTRY_CODE", "USER_CONTINENT_CODE", "USER_TIME_ZONE", "USER_REGION_CODE", "USER_CITY_NAME", "USER_COUNTY_NAME", "REVISION_TAGS") -// // df_GeoInf.show() -// -// val df_Truth = sqlContext.read -// .format("com.databricks.spark.csv") -// .option("header", "true") // Use first line of all files as header -// .option("inferSchema", "true") // Automatically infer data types -// .load("hdfs://localhost:9000/mydata/truth.csv").select("REVISION_ID", "ROLLBACK_REVERTED", "UNDO_RESTORE_REVERTED") -// // df_GeoInf.show() -// -// val AfterJoinGeoInfo_All_Features = AfterJoinItem5_All_Features.as("T1").join(df_GeoInf.as("T2"), $"T1.Rid" === $"T2.REVISION_ID", "leftouter").drop("REVISION_ID").cache() -// // AfterJoinGeoInfo_All_Features.show() -// -// val Final_All_Features = AfterJoinGeoInfo_All_Features.as("T1").join(df_Truth.as("T2"), $"T1.Rid" === $"T2.REVISION_ID", "leftouter").drop("REVISION_ID").cache() -// //Final_All_Features.show() -// -// // Pre- process 
Data ============================================================================================================================================================ -// -// // For String Column, We fill the Null values by "NA": -// -// var Fill_Missing_Final_All_Features = Final_All_Features.na.fill("NA", Seq("USER_COUNTRY_CODE", "USER_CONTINENT_CODE", "USER_TIME_ZONE", "USER_REGION_CODE", "USER_CITY_NAME", "USER_COUNTY_NAME", "REVISION_TAGS")).cache() -// -// // For Integer Frequency Column, We fill the Null values by 0: -// Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.na.fill(0, Seq("FreqItem", "NumberUniqUserEditItem", "NumberRevisionItemHas", "NumberofUniqueItemsUseredit", "NumberofRevisionsUserContributed", "REVISION_SESSION_ID")).cache() -// //Fill_Missing_Final_All_Features.show() -// -// val BoolToDoubleUDF = udf { (BoolAsString: String) => if (BoolAsString == "T") 1.0 else 0.0 } -// val IntegerToDouble = udf { (IntegerRevisionSessionID: Integer) => IntegerRevisionSessionID.toDouble } -// Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalROLLBACK_REVERTED", BoolToDoubleUDF(col("ROLLBACK_REVERTED"))) -// Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalUNDO_RESTORE_REVERTED", BoolToDoubleUDF(col("UNDO_RESTORE_REVERTED"))) -// -// Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalREVISION_SESSION_ID", IntegerToDouble(col("REVISION_SESSION_ID"))) -// -// Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberofRevisionsUserContributed", IntegerToDouble(col("NumberofRevisionsUserContributed"))) -// Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberofUniqueItemsUseredit", IntegerToDouble(col("NumberofUniqueItemsUseredit"))) -// -// Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberRevisionItemHas", IntegerToDouble(col("NumberRevisionItemHas"))) -// Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberUniqUserEditItem", IntegerToDouble(col("NumberUniqUserEditItem"))) -// Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalFreqItem", IntegerToDouble(col("FreqItem"))) -// -// //===========================================================================Caharacter Features : Double , Integer Features ==================================================================================== -// //Double Ratio: For Ratio Double column, Fill -1 value by Median:Character Features + Ratio of Word Features : -// var Samples = Fill_Missing_Final_All_Features.sample(false, 0.001).cache() //.where($"S2SimikaritySitelinkandLabel">0.0 || $"S3SimilarityLabelandSitelink">0.0 || $"S4SimilarityCommentComment">0.0) -// Samples.registerTempTable("df") -// -// val Query = "select " + -// "percentile_approx(C1uppercaseratio, 0.5) as meadian1" + "," + "percentile_approx(C2lowercaseratio, 0.5) as median2" + " ," + -// "percentile_approx(C3alphanumericratio, 0.5) as median3" + "," + "percentile_approx(C4asciiratio, 0.5) as median4" + "," + -// "percentile_approx(C5bracketratio, 0.5) as median5" + "," + "percentile_approx(C6digitalratio, 0.5) as median6" + "," + -// "percentile_approx(C7latinratio, 0.5) as median7" + "," + "percentile_approx(C8whitespaceratio, 0.5) as median8" + "," + -// "percentile_approx(C9puncratio, 0.5) as median9" + "," + "percentile_approx(C11arabicratio, 0.5) as median11" + "," + -// 
"percentile_approx(C12bengaliratio, 0.5) as median12" + "," + "percentile_approx(C13brahmiratio, 0.5) as median13" + "," + -// "percentile_approx(C14cyrilinratio, 0.5) as median14" + "," + "percentile_approx(C15hanratio, 0.5) as median15" + "," + -// "percentile_approx(c16malysiaratio, 0.5) as median16" + "," + -// "percentile_approx(C17tamiratio, 0.5) as median17" + "," + "percentile_approx(C18telugratio, 0.5) as median18" + "," + -// "percentile_approx(C19symbolratio, 0.5) as median19" + "," + "percentile_approx(C20alpharatio, 0.5) as median20" + "," + -// "percentile_approx(C21visibleratio, 0.5) as median21" + "," + "percentile_approx(C22printableratio, 0.5) as median22" + "," + -// "percentile_approx(C23blankratio, 0.5) as median23" + "," + "percentile_approx(C24controlratio, 0.5) as median24" + "," + -// "percentile_approx(C25hexaratio, 0.5) as median25" ++ "," + "percentile_approx(W1languagewordratio, 0.5) as median26" + "," + -// "percentile_approx(W3lowercaseratio, 0.5) as median27" + "," + "percentile_approx(W6badwordratio, 0.5) as median28" + "," + -// "percentile_approx(W7uppercaseratio, 0.5) as median27" + "," + "percentile_approx(W8banwordratio, 0.5) as median27" + " from df" -// -// val medianValues = sqlContext.sql(Query).rdd -// val Median = medianValues.first() -// -// // Median : -// // Character Ratio Features: UDF -// val lkpUDF1 = udf { (i: Double) => if (i == 0) Median(0).toString().toDouble else i } -// val lkpUDF2 = udf { (i: Double) => if (i == 0) Median(1).toString().toDouble else i } -// val lkpUDF3 = udf { (i: Double) => if (i == 0) Median(2).toString().toDouble else i } -// val lkpUDF4 = udf { (i: Double) => if (i == 0) Median(3).toString().toDouble else i } -// val lkpUDF5 = udf { (i: Double) => if (i == 0) Median(4).toString().toDouble else i } -// val lkpUDF6 = udf { (i: Double) => if (i == 0) Median(5).toString().toDouble else i } -// val lkpUDF7 = udf { (i: Double) => if (i == 0) Median(6).toString().toDouble else i } -// val lkpUDF8 = udf { (i: Double) => if (i == 0) Median(7).toString().toDouble else i } -// val lkpUDF9 = udf { (i: Double) => if (i == 0) Median(8).toString().toDouble else i } -// -// val lkpUDF11 = udf { (i: Double) => if (i == 0) Median(9).toString().toDouble else i } -// val lkpUDF12 = udf { (i: Double) => if (i == 0) Median(10).toString().toDouble else i } -// val lkpUDF13 = udf { (i: Double) => if (i == 0) Median(11).toString().toDouble else i } -// val lkpUDF14 = udf { (i: Double) => if (i == 0) Median(12).toString().toDouble else i } -// val lkpUDF15 = udf { (i: Double) => if (i == 0) Median(13).toString().toDouble else i } -// val lkpUDF16 = udf { (i: Double) => if (i == 0) Median(14).toString().toDouble else i } -// val lkpUDF17 = udf { (i: Double) => if (i == 0) Median(15).toString().toDouble else i } -// val lkpUDF18 = udf { (i: Double) => if (i == 0) Median(16).toString().toDouble else i } -// val lkpUDF19 = udf { (i: Double) => if (i == 0) Median(17).toString().toDouble else i } -// val lkpUDF20 = udf { (i: Double) => if (i == 0) Median(18).toString().toDouble else i } -// val lkpUDF21 = udf { (i: Double) => if (i == 0) Median(19).toString().toDouble else i } -// val lkpUDF22 = udf { (i: Double) => if (i == 0) Median(20).toString().toDouble else i } -// val lkpUDF23 = udf { (i: Double) => if (i == 0) Median(21).toString().toDouble else i } -// val lkpUDF24 = udf { (i: Double) => if (i == 0) Median(22).toString().toDouble else i } -// val lkpUDF25 = udf { (i: Double) => if (i == 0) Median(23).toString().toDouble else i } -// 
-// val df1 = Fill_Missing_Final_All_Features.withColumn("FinalC1uppercaseratio", lkpUDF1(col("C1uppercaseratio"))) //.drop("C1uppercaseratio").cache() -// val df2 = df1.withColumn("FinalC2lowercaseratio", lkpUDF2(col("C2lowercaseratio"))) //.drop("C2lowercaseratio").cache() -// //df1.unpersist() -// val df3 = df2.withColumn("FinalC3alphanumericratio", lkpUDF3(col("C3alphanumericratio"))) //.drop("C3alphanumericratio").cache() -// //df2.unpersist() -// val df4 = df3.withColumn("FinalC4asciiratio", lkpUDF4(col("C4asciiratio"))) //.drop("C4asciiratio").cache() -// //df3.unpersist() -// val df5 = df4.withColumn("FinalC5bracketratio", lkpUDF5(col("C5bracketratio"))) //.drop("C5bracketratio").cache() -// //df4.unpersist() -// val df6 = df5.withColumn("FinalC6digitalratio", lkpUDF6(col("C6digitalratio"))) //.drop("C6digitalratio").cache() -// //df5.unpersist() -// val df7 = df6.withColumn("FinalC7latinratio", lkpUDF7(col("C7latinratio"))) //.drop("C7latinratio").cache() -// //df6.unpersist() -// val df8 = df7.withColumn("FinalC8whitespaceratio", lkpUDF8(col("C8whitespaceratio"))) //.drop("C8whitespaceratio").cache() -// //df7.unpersist() -// val df9 = df8.withColumn("FinalC9puncratio", lkpUDF9(col("C9puncratio"))) //.drop("C9puncratio").cache() -// -// // Mean : -// // character integer values : -// val Mean_C10longcharacterseq = Samples.agg(mean("C10longcharacterseq")).head() -// val C10_Mean = Mean_C10longcharacterseq.getDouble(0) -// val lkpUDFC10 = udf { (i: Double) => if (i == 0) C10_Mean else i } -// val df10 = df9.withColumn("FinalC10longcharacterseq", lkpUDFC10(col("C10longcharacterseq"))) -// -// //Median -// val df11 = df10.withColumn("FinalC11arabicratio", lkpUDF11(col("C11arabicratio"))) //.drop("C11arabicratio").cache() -// // df9.unpersist() -// val df12 = df11.withColumn("FinalC12bengaliratio", lkpUDF12(col("C12bengaliratio"))) //.drop("C12bengaliratio").cache() -// //df11.unpersist() -// val df13 = df12.withColumn("FinalC13brahmiratio", lkpUDF13(col("C13brahmiratio"))) //.drop("C13brahmiratio").cache() -// // df12.unpersist() -// val df14 = df13.withColumn("FinalC14cyrilinratio", lkpUDF14(col("C14cyrilinratio"))) //.drop("C14cyrilinratio").cache() -// // df13.unpersist() -// val df15 = df14.withColumn("FinalC15hanratio", lkpUDF15(col("C15hanratio"))) //.drop("C15hanratio").cache() -// // df14.unpersist() -// val df16 = df15.withColumn("Finalc16malysiaratio", lkpUDF16(col("c16malysiaratio"))) //.drop("c16malysiaratio").cache() -// //df15.unpersist() -// val df17 = df16.withColumn("FinalC17tamiratio", lkpUDF17(col("C17tamiratio"))) //.drop("C17tamiratio").cache() -// //df16.unpersist() -// val df18 = df17.withColumn("FinalC18telugratio", lkpUDF18(col("C18telugratio"))) //.drop("C18telugratio").cache() -// //df17.unpersist() -// val df19 = df18.withColumn("FinalC19symbolratio", lkpUDF19(col("C19symbolratio"))) //.drop("C19symbolratio").cache() -// //df18.unpersist() -// val df20 = df19.withColumn("FinalC20alpharatio", lkpUDF20(col("C20alpharatio"))) //.drop("C20alpharatio").cache() -// // df19.unpersist() -// val df21 = df20.withColumn("FinalC21visibleratio", lkpUDF21(col("C21visibleratio"))) //.drop("C21visibleratio").cache() -// // df20.unpersist() -// val df22 = df21.withColumn("FinalC22printableratio", lkpUDF22(col("C22printableratio"))) //.drop("C22printableratio").cache() -// //df21.unpersist() -// val df23 = df22.withColumn("FinalC23blankratio", lkpUDF23(col("C23blankratio"))) //.drop("C23blankratio").cache() -// // df22.unpersist() -// val df24 = 
df23.withColumn("FinalC24controlratio", lkpUDF24(col("C24controlratio"))) //.drop("C24controlratio").cache() -// //df23.unpersist() -// val df25 = df24.withColumn("FinalC25hexaratio", lkpUDF25(col("C25hexaratio"))) //.drop("C25hexaratio").cache() -// -// //************************************************End Character Features **************************************************************************************** -// -// //************************************************Start Word Features **************************************************************************************** -// -// // Word Ratio Features : UDF -// val lkpUDFW1 = udf { (i: Double) => if (i == 0) Median(24).toString().toDouble else i } -// val lkpUDFW3 = udf { (i: Double) => if (i == 0) Median(25).toString().toDouble else i } -// val lkpUDFW6 = udf { (i: Double) => if (i == 0) Median(26).toString().toDouble else i } -// val lkpUDFW7 = udf { (i: Double) => if (i == 0) Median(27).toString().toDouble else i } -// val lkpUDFW8 = udf { (i: Double) => if (i == 0) Median(28).toString().toDouble else i } -// -// //1. -// val df26 = df25.withColumn("FinalW1languagewordratio", lkpUDFW1(col("W1languagewordratio"))) //.drop("W1languagewordratio").cache() -// -// //2.Boolean(Double) IsContainLanguageWord -// -// //3. -// val df27 = df26.withColumn("FinalW3lowercaseratio", lkpUDFW3(col("W3lowercaseratio"))) //.drop("W3lowercaseratio").cache() -// // df26.unpersist() -// -// //4. Integer " Mean: -// val Mean_W4longestword = Samples.agg(mean("W4longestword")).head() -// val W4_Mean = Mean_W4longestword.getDouble(0) -// val lkpUDFW4 = udf { (i: Double) => if (i == 0) W4_Mean else i } -// val df28 = df27.withColumn("FinalW4longestword", lkpUDFW4(col("W4longestword"))) -// -// //5. Boolean (Double ) W5IscontainURL -// //6. -// val df29 = df28.withColumn("FinalW6badwordratio", lkpUDFW6(col("W6badwordratio"))) //.drop("W6badwordratio").cache() -// -// //7. -// val df30 = df29.withColumn("FinalW7uppercaseratio", lkpUDFW7(col("W7uppercaseratio"))) //.drop("W7uppercaseratio").cache() -// -// //8. -// val df31 = df30.withColumn("FinalW8banwordratio", lkpUDFW8(col("W8banwordratio"))) //.drop("W8banwordratio").cache() -// -// //9.FemalFirst Boolean(Double) -// //10.Male First Boolean(Double) -// //11.ContainBadWord Boolean(Double) -// //12ContainBanWord Boolean(Double) -// -// //13. Integer(Double): -// val Mean_W13W13NumberSharewords = Samples.agg(mean("W13NumberSharewords")).head() -// val W13_Mean = Mean_W13W13NumberSharewords.getDouble(0) -// val lkpUDFW13 = udf { (i: Double) => if (i == 0) W13_Mean else i } -// val df32 = df31.withColumn("FinalW13NumberSharewords", lkpUDFW13(col("W13NumberSharewords"))) -// -// //14. Integer (Double): -// val Mean_W14NumberSharewordswithoutStopwords = Samples.agg(mean("W14NumberSharewordswithoutStopwords")).head() -// val W14_Mean = Mean_W14NumberSharewordswithoutStopwords.getDouble(0) -// val lkpUDFW14 = udf { (i: Double) => if (i == 0) W14_Mean else i } -// val df33 = df32.withColumn("FinalW14NumberSharewordswithoutStopwords", lkpUDFW14(col("W14NumberSharewordswithoutStopwords"))) -// -// // 15. Double (Not ratio): -// val Mean_W15PortionQid = Samples.agg(mean("W15PortionQid")).head() -// val W15_Mean = Mean_W15PortionQid.getDouble(0) -// val lkpUDFW15 = udf { (i: Double) => if (i == 0) W15_Mean else i } -// val df34 = df33.withColumn("FinalW15PortionQid", lkpUDFW15(col("W15PortionQid"))) -// -// //16. 
Double(Not Ratio): -// val Mean_W16PortionLnags = Samples.agg(mean("W16PortionLnags")).head() -// val W16_Mean = Mean_W16PortionLnags.getDouble(0) -// val lkpUDFW16 = udf { (i: Double) => if (i == 0) W16_Mean else i } -// val df35 = df34.withColumn("FinalW16PortionLnags", lkpUDFW16(col("W16PortionLnags"))) -// -// //17.Double(Not ratio): -// val Mean_W17PortionLinks = Samples.agg(mean("W17PortionLinks")).head() -// val W17_Mean = Mean_W17PortionLinks.getDouble(0) -// val lkpUDFW17 = udf { (i: Double) => if (i == 0) W17_Mean else i } -// val df36 = df35.withColumn("FinalW17PortionLinks", lkpUDFW17(col("W17PortionLinks"))) -// -// //************************************************End Word Features **************************************************************************************** -// -// //************************************************Start Sentences Features **************************************************************************************** -// // 1. Integer(Double) -// val Mean_S1CommentTailLength = Samples.agg(mean("S1CommentTailLength")).head() -// val S1_Mean = RoundDouble(Mean_S1CommentTailLength.getDouble(0)) -// val lkpUDFS1 = udf { (i: Double) => if (i == 0) S1_Mean else i } -// val df37 = df36.withColumn("FinalS1CommentTailLength", lkpUDFS1(col("S1CommentTailLength"))) -// -// //2. Double but Not ratio values : -// val Mean_S2SimikaritySitelinkandLabel = Samples.agg(mean("S2SimikaritySitelinkandLabel")).head() -// val S2_Mean = RoundDouble(Mean_S2SimikaritySitelinkandLabel.getDouble(0)) -// val lkpUDFS2 = udf { (i: Double) => if (i == 0) S2_Mean else i } -// val df39 = df37.withColumn("FinalS2SimikaritySitelinkandLabel", lkpUDFS2(col("S2SimikaritySitelinkandLabel"))) -// -// //3. Double but Not ratio values : -// val Mean_S3SimilarityLabelandSitelink = Samples.agg(mean("S3SimilarityLabelandSitelink")).head() -// val S3_Mean = RoundDouble(Mean_S3SimilarityLabelandSitelink.getDouble(0)) -// val lkpUDFS3 = udf { (i: Double) => if (i == 0.0) S3_Mean else i } -// val df40 = df39.withColumn("FinalS3SimilarityLabelandSitelink", lkpUDFS3(col("S3SimilarityLabelandSitelink"))) -// -// //4. Double but Not ratio values : -// val Mean_S4SimilarityCommentComment = Samples.agg(mean("S4SimilarityCommentComment")).head() -// val S4_Mean = RoundDouble(Mean_S4SimilarityCommentComment.getDouble(0)) -// val lkpUDFS4 = udf { (i: Double) => if (i == 0.0) S4_Mean else i } -// val df41 = df40.withColumn("FinalS4SimilarityCommentComment", lkpUDFS4(col("S4SimilarityCommentComment"))) -// -// //df41.show() -// //************************************************End Sentences Features **************************************************************************************** -// //*********************************************** Start Statement Features **************************************************************************************** -// //1. String -// //2. String -// //3. String -// //************************************************End Statement Features **************************************************************************************** -// //*********************************************** Start User Features **************************************************************************************** -// -// //1.Boolean(Double) -// //2.Boolean(Double) -// //3.Boolean(Double) -// //4.Boolean(Double) -// //5.Boolean(Double) -// //6.Boolean(Double) -// //7. (Double) IP No need to fill Missing Data -// //8. 
(Double) ID No need to fill Missing Data -// //9.Boolean(Double) -// //10.Boolean(Double) -// -// //*********************************************** End User Features **************************************************************************************** -// //*********************************************** Start Item Features **************************************************************************************** -// //1. Integer (Double) No need to fill missing values -// //2. Integer (Double) No need to fill missing values -// //3. Integer (Double) No need to fill missing values -// //4. Integer (Double) No need to fill missing values -// //5. Integer (Double) No need to fill missing values -// //6. Integer (Double) No need to fill missing values -// //7. Integer (Double) No need to fill missing values -// //8. Integer (Double) No need to fill missing values -// //9. Integer (Double) No need to fill missing values -// //10. Integer (Double) No need to fill missing values -// //11. String -// //*********************************************** End Item Features **************************************************************************************** -// //*********************************************** Start Revision Features **************************************************************************************** -// //1.String -// //2.String -// //3.Boolean (Double) -// //4.Integer(Double) -// //5.String -// //6.String -// //7. Boolean(Double) -// //8. String -// //9.String -// //10. Integer (Double) -// //11.String -// //12. integer(Double) -// //13. Long(Double) -// //14. integer (Double) -// //15.String -// //16.String -// //*********************************************** End Revision Features **************************************************************************************** -// //*********************************************** Meta Data , Truth Data and Frequnces **************************************************************************************** -// //Meta -// // 1.Revision Session :Integer (Converted to Double) -// //2. 
User Country Code -// //3.User Continent Code -// //4.User Time Size -// //5.User Region Code -// //6.User-city Name -// //7.User Country Name -// //8.RevisionTags -// -// // Truth: -// //1.Undo -// -// // Freq : -// -// //1.5 features -// -// // Roll Boolean :Boolean (Double) -// // Undo :Boolean (Double) -// -// //*********************************************** End Revision Features **************************************************************************************** -// -// //===========================================================================String Features==================================================================================== -// -// val df42 = df41.withColumn( -// //statement String features: -// "StringFeatures", concat($"SS1Property", lit(";"), $"SS2DataValue", lit(";"), $"SS3ItemValue", lit(";"), $"I11ItemTitle", -// //Revision String Features: -// lit(";"), $"R1languageRevision", -// lit(";"), $"R2RevisionLanguageLocal", -// lit(";"), $"R5RevisionAction", -// lit(";"), $"R6PrevReviAction", -// lit(";"), $"R8ParRevision", -// lit(";"), $"R9RevisionTime", -// lit(";"), $"R11ContentType", -// lit(";"), $"R15RevisionSubaction", -// lit(";"), $"R16PrevReviSubaction", -// -// lit(";"), $"USER_COUNTRY_CODE", -// lit(";"), $"USER_CONTINENT_CODE", -// lit(";"), $"USER_TIME_ZONE", -// lit(";"), $"USER_REGION_CODE", -// lit(";"), $"USER_CITY_NAME", -// lit(";"), $"USER_COUNTY_NAME", -// lit(";"), $"REVISION_TAGS")) -// -// val toArray = udf((record: String) => record.split(";").map(_.toString())) -// val test1 = df42.withColumn("StringFeatures", toArray(col("StringFeatures"))) -// // test1.show() -// // test1.printSchema() -// -// val word2Vec = new Word2Vec().setInputCol("StringFeatures").setOutputCol("result").setVectorSize(20).setMinCount(0) -// val model = word2Vec.fit(test1) -// val result = model.transform(test1) //.rdd -// -// // result.show() -// -// val Todense = udf((b: Vector) => b.toDense) -// val test_new2 = result.withColumn("result", Todense(col("result"))) -// -// val assembler = new VectorAssembler().setInputCols(Array( -// "result", -// -// // character -// "FinalC1uppercaseratio", "FinalC2lowercaseratio", "FinalC3alphanumericratio", "FinalC4asciiratio", "FinalC5bracketratio", "FinalC6digitalratio", -// "FinalC7latinratio", "FinalC8whitespaceratio", "FinalC9puncratio", "FinalC10longcharacterseq", "FinalC11arabicratio", "FinalC12bengaliratio", -// "FinalC13brahmiratio", "FinalC14cyrilinratio", "FinalC15hanratio", "Finalc16malysiaratio", "FinalC17tamiratio", "FinalC18telugratio", -// "FinalC19symbolratio", "FinalC20alpharatio", "FinalC21visibleratio", "FinalC22printableratio", "FinalC23blankratio", "FinalC24controlratio", "FinalC25hexaratio", -// -// // Words -// "FinalW1languagewordratio", "W2Iscontainlanguageword", "FinalW3lowercaseratio", "FinalW4longestword", "W5IscontainURL", "FinalW6badwordratio", -// "FinalW7uppercaseratio", "FinalW8banwordratio", "W9FemalFirstName", "W10MaleFirstName", "W11IscontainBadword", "W12IsContainBanword", -// "FinalW13NumberSharewords", "FinalW14NumberSharewordswithoutStopwords", "FinalW15PortionQid", "FinalW16PortionLnags", "FinalW17PortionLinks", -// -// //Sentences : -// "FinalS1CommentTailLength", "FinalS2SimikaritySitelinkandLabel", "FinalS3SimilarityLabelandSitelink", "FinalS4SimilarityCommentComment", -// -// // User : -// "U1IsPrivileged", "U2IsBotUser", "U3IsBotuserWithFlaguser", "U4IsProperty", "U5IsTranslator", "U6IsRegister", "U7IPValue", "U8UserID", -// "U9HasBirthDate", "U10HasDeathDate", -// -// //Item: 
-// -// "I1NumberLabels", "I2NumberDescription", "I3NumberAliases", "I4NumberClaims", "I5NumberSitelinks", "I6NumberStatement", -// "I7NumberReferences", "I8NumberQualifier", "I9NumberQualifierOrder", "I10NumberBadges", -// -// //Revision: -// "R3IslatainLanguage", "R4JsonLength", "R7RevisionAccountChange", "R10RevisionSize", "R12BytesIncrease", -// "R13TimeSinceLastRevi", "R14CommentLength", -// -// // Meta , truth , Freq -// // meta : -// "FinalREVISION_SESSION_ID", -// // Truth: -// "FinalUNDO_RESTORE_REVERTED", -// -// //Freq: -// "FinalNumberofRevisionsUserContributed", -// "FinalNumberofUniqueItemsUseredit", "FinalNumberRevisionItemHas", "FinalNumberUniqUserEditItem", "FinalFreqItem")).setOutputCol("features") -// val NewData = assembler.transform(test_new2) -// -// // Prepare the data for classification: -// NewData.registerTempTable("DB") -// val Data = sqlContext.sql("select Rid, features, FinalROLLBACK_REVERTED from DB") -// // val Data = sqlContext.sql("select Rid, features, FinalROLLBACK_REVERTED as label from DB") // for logistic regrision -// -// //Data.show() -// -// val TestClassifiers = new Classifiers() -// -// // TestClassifiers.RandomForestClassifer(Data, sqlContext) -// // TestClassifiers.DecisionTreeClassifier(Data, sqlContext) -// // TestClassifiers.LogisticRegrision(Data, sqlContext) -// // TestClassifiers.GradientBoostedTree(Data, sqlContext) -// // TestClassifiers.MultilayerPerceptronClassifier(Data, sqlContext) -// -// } + } - //=========================================================================================================================================== - //=================================================Functions Part============================================================================= + + // =========================================================================================================================================== + // =================================================Functions Part============================================================================= def Ration(va: Double, median: Double): Double = { @@ -2007,11 +1357,11 @@ class VandalismDetection extends Serializable { def All_Features(row: Row): String = { var temp = "" - //all characters + // all characters val character_Str_String = Character_Features(row) temp = character_Str_String - //all Words + // all Words val Words_Str_String = Words_Features(row) temp = temp + "," + Words_Str_String @@ -2023,15 +1373,15 @@ class VandalismDetection extends Serializable { val Statement_Str_String = Statement_Features(row) temp = temp + "," + Statement_Str_String - //User Features - there are 3 Joins in last stage when we have Data Frame + // User Features - there are 3 Joins in last stage when we have Data Frame val User_Str_String = User_Features_Normal(row) temp = temp + "," + User_Str_String - //Item Features - there are 3 Joins in last stage when we have Data Frame + // Item Features - there are 3 Joins in last stage when we have Data Frame val Item_Str_String = Item_Features(row) temp = temp + "," + Item_Str_String - //Revision Features + // Revision Features val Revision_Str_String = Revision_Features(row) temp = temp + "," + Revision_Str_String @@ -2043,13 +1393,13 @@ class VandalismDetection extends Serializable { def Character_Features(row: Row): String = { var str_results = "" - //1. Row from partitioned Pair RDD: + // 1. Row from partitioned Pair RDD: var new_Back_Row = Row() - //2. Revision ID current operation: + // 2. 
Revision ID current operation: var RevisionID = row(0) - //3. row(2) = represent the Comment: + // 3. row(2) = represent the Comment: var CommentRecord_AsString = row(2).toString() - //4. extract comment tail from the Normal comment-Depending on the paperes, we apply character feature extraction on comment Tail + // 4. extract comment tail from the Normal comment-Depending on the paperes, we apply character feature extraction on comment Tail val CommentObj = new CommentProcessor() val Temp_commentTail = CommentObj.Extract_CommentTail(CommentRecord_AsString) @@ -2060,8 +1410,8 @@ class VandalismDetection extends Serializable { val FacilityOBJ = new FacilitiesClass() var Str_vector_Values = FacilityOBJ.ArrayToString(vectorElements) str_results = Str_vector_Values - //CharacterFeatures = Vector_AsArrayElements - //new_Back_Row = Row(vectorElements) + // CharacterFeatures = Vector_AsArrayElements + // new_Back_Row = Row(vectorElements) } else { @@ -2095,11 +1445,11 @@ class VandalismDetection extends Serializable { val FacilityOBJ = new FacilitiesClass() var Str_vector_Values = FacilityOBJ.ArrayToString(RatioValues) str_results = Str_vector_Values - //new_Back_Row = Row(vector_Values) + // new_Back_Row = Row(vector_Values) } // CharacterFeatures - //new_Back_Row + // new_Back_Row str_results.trim() } @@ -2107,13 +1457,13 @@ class VandalismDetection extends Serializable { def Words_Features(row: Row): String = { var str_results = "" - //Row from partitioned Pair RDD: + // Row from partitioned Pair RDD: var new_Back_Row = Row() - //Revision ID current operation: + // Revision ID current operation: var RevisionID = row(0) - //row(2) = represent the Comment: + // row(2) = represent the Comment: var CommentRecord_AsString = row(2).toString() - //Extract comment tail from the Normal comment-Depending on the paperes, we apply character feature extraction on comment Tail + // Extract comment tail from the Normal comment-Depending on the paperes, we apply character feature extraction on comment Tail val CommentObj = new CommentProcessor() val Temp_commentTail = CommentObj.Extract_CommentTail(CommentRecord_AsString) var tempQids = 0.0 @@ -2146,9 +1496,9 @@ class VandalismDetection extends Serializable { temLinks = porportion_links } else { - var porortion_Qids = tempQids //=0.0 - var porportion_Lang = temlangs //=0.0 - var porportion_links = temLinks //=0.0 + var porortion_Qids = tempQids // =0.0 + var porportion_Lang = temlangs // =0.0 + var porportion_links = temLinks // =0.0 } @@ -2164,11 +1514,11 @@ class VandalismDetection extends Serializable { var Prev_commentTail = CommentObj.Extract_CommentTail(prevComment.toString()) if (Prev_commentTail != "") { - //11.Feature Current_Previous_CommentTial_NumberSharingWords: + // 11.Feature Current_Previous_CommentTial_NumberSharingWords: val NumberSharingWords = WordsOBJ.Current_Previous_CommentTial_NumberSharingWords(Temp_commentTail, Prev_commentTail) ArrayElements(12) = NumberSharingWords.toDouble - //12.Feature Current_Previous_CommentTial_NumberSharingWords without Stopword: + // 12.Feature Current_Previous_CommentTial_NumberSharingWords without Stopword: val NumberSharingWordsWithoutStopwords = WordsOBJ.Current_Previous_CommentTial_NumberSharingWords_WithoutStopWords(Temp_commentTail, Prev_commentTail) ArrayElements(13) = NumberSharingWordsWithoutStopwords.toDouble @@ -2218,8 +1568,8 @@ class VandalismDetection extends Serializable { str_results = Str_vector_Values } - //new_Back_Row - //Word_Features + // new_Back_Row + // Word_Features str_results } 
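Aside for readers of this patch: Character_Features and Words_Features above each fill a fixed-size Array[Double] (booleans encoded as 1.0/0.0, integers cast to Double) and serialize it to a single comma-separated string via FacilitiesClass.ArrayToString, which the removed rowRDD mapping later split back into typed columns. A tiny sketch of that round trip (featuresToCsv and the toy values are illustrative stand-ins, not the project's API):

object FeatureStringSketch {
  // Join a fixed-size feature array into one CSV field per revision
  def featuresToCsv(values: Array[Double]): String = values.mkString(",")

  def main(args: Array[String]): Unit = {
    val features = new Array[Double](3)
    features(0) = 0.75 // e.g. a lowercase-word ratio
    features(1) = 1.0  // e.g. "contains language word" encoded as a Double
    features(2) = 12.0 // e.g. longest-word length cast to Double

    val line = featuresToCsv(features)
    println(line) // 0.75,1.0,12.0

    // Downstream the string is split and re-typed, as the removed rowRDD mapping did
    val parsed = line.split(",").map(_.toDouble)
    parsed.foreach(println)
  }
}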
@@ -2227,16 +1577,16 @@ class VandalismDetection extends Serializable { def Sentences_Features(row: Row): String = { var str_results = "" - //This will be used to save values in vector + // This will be used to save values in vector var DoubleValues = new Array[Double](4) - //1. Row from partitioned Pair RDD: + // 1. Row from partitioned Pair RDD: var new_Back_Row = Row() - //2. Revision ID current operation: + // 2. Revision ID current operation: var RevisionID = row(0) - //3. row(2) = represent the Full Comment: + // 3. row(2) = represent the Full Comment: var CommentRecord_AsString = row(2).toString() - //4. extract comment tail from the Normal comment-Depending on the paperes, we apply character feature extraction on comment Tail + // 4. extract comment tail from the Normal comment-Depending on the paperes, we apply character feature extraction on comment Tail val CommentObj = new CommentProcessor() val Temp_commentTail = CommentObj.Extract_CommentTail(CommentRecord_AsString) @@ -2249,14 +1599,14 @@ class VandalismDetection extends Serializable { DoubleValues(0) = comment_Tail_Length // Feature 2 similarity between comment contain Sitelink and label : - //Check the language in comment that contain sitelinkword: -------------------- + // Check the language in comment that contain sitelinkword: -------------------- val Sitelink_inCommentObj = new SentencesFeatures() if (CommentRecord_AsString.contains("sitelink")) { // start 1 loop - //1. First step : get the language from comment + // 1. First step : get the language from comment val languagesitelink_from_Comment = Sitelink_inCommentObj.extract_CommentSiteLink_LanguageType(CommentRecord_AsString).trim() - //2. second step: get the Label tage from json table : + // 2. second step: get the Label tage from json table : if (row(9).toString() != "[]") { // start 2 loop // if (row(8).toString() != "") { val jsonStr = "\"\"\"" + row(9).toString() + "\"\"\"" // row(9) is the label record @@ -2271,7 +1621,7 @@ class VandalismDetection extends Serializable { DoubleValues(1) = 0.0 } - } // endd 2 loop + } // endd 2 loop else { DoubleValues(1) = 0.0 @@ -2285,12 +1635,12 @@ class VandalismDetection extends Serializable { } // Feature 3 similarity between comment contain label word and sitelink - //Check the language in comment that contain Label word:----------------------- + // Check the language in comment that contain Label word:----------------------- val Label_inCommentObj = new SentencesFeatures() if (CommentRecord_AsString.contains("label")) { - //1. First step : get the language from comment + // 1. First step : get the language from comment val languageLabel_from_Comment = Label_inCommentObj.extract_CommentLabel_LanguageType(CommentRecord_AsString).trim() - //2. second step: get the site link tage from json table : + // 2. second step: get the site link tage from json table : if (row(13).toString() != "[]") { // start 2 loop val jsonStr = "\"\"\"" + row(13).toString() + "\"\"\"" // row(13) is the sitelink record val jsonObj: JSONObject = new JSONObject(row(13).toString()) @@ -2351,7 +1701,7 @@ class VandalismDetection extends Serializable { } - //new_Back_Row + // new_Back_Row str_results } @@ -2359,7 +1709,7 @@ class VandalismDetection extends Serializable { // statement Features : def Statement_Features(row: Row): String = { var full_Str_Result = "" - //1. row(2) = represent the Comment: + // 1. 
row(2) = represent the Comment: var fullcomment = row(2).toString() val StatementOBJ = new StatementFeatures() @@ -2400,9 +1750,9 @@ class VandalismDetection extends Serializable { var str_results = "" var DoubleValues = new Array[Double](10) // you should change the index when add more element feature - //Row from partitioned Pair RDD: + // Row from partitioned Pair RDD: var new_Back_Row = Row() - //row(7) = represent the Contributor name: + // row(7) = represent the Contributor name: var full_comment = row(2).toString() var contributor_Name = row(7).toString() var contributor_ID = row(6).toString() @@ -2411,7 +1761,7 @@ class VandalismDetection extends Serializable { val useFeatureOBJ = new UserFeatures() - //1. Is privileged : There are 5 cases : if one of these cases is true that mean it is privileged else it is not privileged user + // 1. Is privileged : There are 5 cases : if one of these cases is true that mean it is privileged else it is not privileged user var flag_case1 = useFeatureOBJ.CheckName_isGlobalSysopUser(contributor_Name) var flag_case2 = useFeatureOBJ.CheckName_isGlobalRollBackerUser(contributor_Name) var flag_case3 = useFeatureOBJ.CheckName_isGlobalStewarUser(contributor_Name) @@ -2427,7 +1777,7 @@ class VandalismDetection extends Serializable { DoubleValues(0) = 0.0 } - //2. is BotUser : There are 3 cases : + // 2. is BotUser : There are 3 cases : var flag_case1_1 = useFeatureOBJ.CheckName_isLocalBotUser(contributor_Name) var flag_case2_2 = useFeatureOBJ.CheckName_isGlobalbotUser(contributor_Name) var flag_case3_3 = useFeatureOBJ.CheckName_isExtensionBotUser(contributor_Name) @@ -2441,7 +1791,7 @@ class VandalismDetection extends Serializable { DoubleValues(1) = 0.0 } - //3. is Bot User without BotflagUser : There is 1 case : + // 3. is Bot User without BotflagUser : There is 1 case : var flag_BUWBF = useFeatureOBJ.CheckName_isBotUserWithoutBotFlagUser(contributor_Name) if (flag_BUWBF == true) { @@ -2452,7 +1802,7 @@ class VandalismDetection extends Serializable { } - //4. is Property creator : + // 4. is Property creator : var flagCreator = useFeatureOBJ.CheckName_isPropertyCreator(contributor_Name) if (flagCreator == true) { @@ -2463,7 +1813,7 @@ class VandalismDetection extends Serializable { } - //5. is translator : + // 5. is translator : var flagTranslator = useFeatureOBJ.CheckName_isTranslator(contributor_Name) if (flagTranslator == true) { DoubleValues(4) = 1.0 @@ -2471,7 +1821,7 @@ class VandalismDetection extends Serializable { DoubleValues(4) = 0.0 } - //6. is register user: + // 6. is register user: var flagRegistered = useFeatureOBJ.IsRegisteroUser(contributor_Name) if (flagRegistered == true) { DoubleValues(5) = 1.0 @@ -2490,13 +1840,13 @@ class VandalismDetection extends Serializable { } - //7. IP as a long value + // 7. IP as a long value if (contributor_IP != "0") { DoubleValues(6) = contributor_IP.toDouble } else { DoubleValues(6) = 0.0 } - //8. ID + // 8. ID if (contributor_ID != "0") { DoubleValues(7) = contributor_ID.toDouble @@ -2504,7 +1854,7 @@ class VandalismDetection extends Serializable { DoubleValues(7) = 0.0 } - //9- 10 BitrthDate - DeatDate: + // 9- 10 BitrthDate - DeatDate: var DateObj = new UserFeatures() var BirthDate = DateObj.IsBirthDate(full_comment) @@ -2540,11 +1890,11 @@ class VandalismDetection extends Serializable { var str_results = "" var DoubleValues = new Array[Double](11) - //Row from partitioned Pair RDD: + // Row from partitioned Pair RDD: var new_Back_Row = Row() var ItemOBJ = new ItemFeatures() - //1. 
Feature depending on Label: + // 1. Feature depending on Label: var NumberOfLabel = 0.0 var Label_String = row(9).toString() if (Label_String != "[]") { @@ -2554,7 +1904,7 @@ class VandalismDetection extends Serializable { NumberOfLabel = 0.0 DoubleValues(0) = NumberOfLabel } - //2. Feature depending on Description: + // 2. Feature depending on Description: var Description_String = row(10).toString() var NumberOfDescription = 0.0 if (Description_String != "[]") { @@ -2566,7 +1916,7 @@ class VandalismDetection extends Serializable { DoubleValues(1) = NumberOfDescription } - //3. Feature depending on Aliases: + // 3. Feature depending on Aliases: var Aliases_String = row(11).toString() var NumberOfAliases = 0.0 if (Aliases_String != "[]") { @@ -2578,7 +1928,7 @@ class VandalismDetection extends Serializable { DoubleValues(2) = NumberOfAliases } - //4. Feature depending on Claims : + // 4. Feature depending on Claims : var Claims_String = row(12).toString() var NumberOfClaims = 0.0 if (Claims_String != "[]") { @@ -2590,7 +1940,7 @@ class VandalismDetection extends Serializable { DoubleValues(3) = NumberOfClaims } - //5. Feature depending on SiteLink + // 5. Feature depending on SiteLink var SiteLink_String = row(13).toString() var NumberOfSitelink = 0.0 if (SiteLink_String != "[]") { @@ -2603,7 +1953,7 @@ class VandalismDetection extends Serializable { } - //6. Feature depending on Claims - statements : + // 6. Feature depending on Claims - statements : var statement_String = row(12).toString() // from claim var NumberOfstatement = 0.0 if (statement_String != "[]") { @@ -2616,7 +1966,7 @@ class VandalismDetection extends Serializable { } - //7. Feature depending on Claims - References : + // 7. Feature depending on Claims - References : var References_String = row(12).toString() // from claim var NumberOfReferences = 0.0 if (References_String != "[]") { @@ -2628,7 +1978,7 @@ class VandalismDetection extends Serializable { DoubleValues(6) = NumberOfReferences } - //8. Feature depending on claim + // 8. Feature depending on claim var Qualifier_String = row(12).toString() // from claim var NumberOfQualifier = 0.0 if (Qualifier_String != "[]") { @@ -2641,7 +1991,7 @@ class VandalismDetection extends Serializable { } - //9. Features depending on claim + // 9. Features depending on claim var Qualifier_String_order = row(12).toString() // from claim var NumberOfQualifier_order = 0.0 if (Qualifier_String_order != "[]") { @@ -2654,7 +2004,7 @@ class VandalismDetection extends Serializable { } - //10. Feature depending on Site link + // 10. Feature depending on Site link var BadgesString = row(13).toString() // from claim var NumberOfBadges = 0.0 if (BadgesString != "[]") { @@ -2667,7 +2017,7 @@ class VandalismDetection extends Serializable { } - //11. Item Title (instead of Item ID) + // 11. Item Title (instead of Item ID) var Item_Id_Title = row(1).toString().replace("Q", "") var Item = Item_Id_Title.trim().toDouble DoubleValues(10) = Item @@ -2688,17 +2038,17 @@ class VandalismDetection extends Serializable { def Revision_Features(row: Row): String = { - //var DoubleValues = new Array[Double](6) + // var DoubleValues = new Array[Double](6) var full_Str_Result = "" - //1. Row from partitioned Pair RDD: + // 1. Row from partitioned Pair RDD: var new_Back_Row = Row() - //2. Revision ID current operation: + // 2. Revision ID current operation: var RevisionID = row(0) - //3. row(2) = represent the Comment: + // 3. 
row(2) = represent the Comment: var fullcomment = row(2).toString() // DoubleValues(0) = length - //1. Revision Language :--------------------------------------------------------------------------------- + // 1. Revision Language :--------------------------------------------------------------------------------- var comment_for_Language = row(2).toString() val CommentLanguageOBJ = new RevisionFeatures() @@ -2709,7 +2059,7 @@ class VandalismDetection extends Serializable { full_Str_Result = "NA".trim() } - //2. Revision Language local:---------------------------------------------------------------------------- + // 2. Revision Language local:---------------------------------------------------------------------------- if (language != "NA") { if (language.contains("-")) { // E.g.Revision ID = 10850 sample1 var LocalLangArray: Array[String] = language.split("-", 2) @@ -2724,7 +2074,7 @@ class VandalismDetection extends Serializable { full_Str_Result = full_Str_Result + "," + "NA" } - //3. Is it Latin Language or Not:------------------------------------------------------------------------- + // 3. Is it Latin Language or Not:------------------------------------------------------------------------- val revisionFeatureOBJ = new RevisionFeatures() val flagLatin = revisionFeatureOBJ.Check_ContainLanguageLatin_NonLatin(language) @@ -2737,26 +2087,26 @@ class VandalismDetection extends Serializable { full_Str_Result = full_Str_Result + "," + "0.0" } - //4. Json Length : be care full to RDD where the json before parsed-------------------------------------- + // 4. Json Length : be care full to RDD where the json before parsed-------------------------------------- // var Jason_Text = row(8).toString() - //replacing_with_Quoto for cleaning the Json tag from extr tags such as ... + // replacing_with_Quoto for cleaning the Json tag from extr tags such as ... var Jason_Text = replacing_with_Quoto(row(0).toString(), row(8).toString()) var Json_Length = Jason_Text.length() full_Str_Result = full_Str_Result + "," + Json_Length.toString() - //5. Revision Action -:----------------------------------------------------------------------- + // 5. Revision Action -:----------------------------------------------------------------------- val CommentProcessOBJ1 = new CommentProcessor() val actions1 = CommentProcessOBJ1.Extract_Actions_FromComments(fullcomment) var ActionsArray1: Array[String] = actions1.split("_", 2) var action1 = ActionsArray1(0).toString() - //var SubAction = ActionsArray(1) + // var SubAction = ActionsArray(1) full_Str_Result = full_Str_Result + "," + action1.trim() - //full_Str_Result = full_Str_Result + "," + SubAction.trim() + // full_Str_Result = full_Str_Result + "," + SubAction.trim() - //6. Revision Prev-Action :------------------------------------------------------------------------------- + // 6. 
Revision Prev-Action :------------------------------------------------------------------------------- if (row(19) != null) { var Prev_fullcomment1 = row(19).toString() val Prev_CommentProcessOBJ1 = new CommentProcessor() @@ -2765,7 +2115,7 @@ class VandalismDetection extends Serializable { var Prev_action1 = ActionsArray1(0).trim() // var Prev_SubAction = ActionsArray(1).trim() full_Str_Result = full_Str_Result + "," + Prev_action1.trim() - //full_Str_Result = full_Str_Result + "," + Prev_SubAction.trim() + // full_Str_Result = full_Str_Result + "," + Prev_SubAction.trim() // println(row(16).toString()) } else { @@ -2798,13 +2148,14 @@ class VandalismDetection extends Serializable { var RevisionParent = row(3).toString() full_Str_Result = full_Str_Result + "," + RevisionParent.toString().trim() - //9. Revision Time Stamp------------------------------------------------------------------------------------------------ + // 9. Revision Time Stamp------------------------------------------------------------------------------------------------ var RevisionTimeZone = row(4).toString() full_Str_Result = full_Str_Result + "," + RevisionTimeZone - //10. Revision Size:------------------------------------------------------------------------------------------------ + // 10. Revision Size:------------------------------------------------------------------------------------------------ - var RevisionBody = row(0).toString() + row(2).toString() + row(3).toString() + row(4).toString() + row(8).toString() + row(14).toString() + row(15).toString() + row(16).toString() + var RevisionBody = row(0).toString() + row(2).toString() + row(3).toString() + row(4).toString() + + row(8).toString() + row(14).toString() + row(15).toString() + row(16).toString() if (row(5).toString() != "0") { RevisionBody = RevisionBody + row(5).toString() @@ -2816,7 +2167,7 @@ class VandalismDetection extends Serializable { } - //11. ContentType: take Action1 as input : -------------------------------------------------------------- + // 11. 
ContentType: take Action1 as input : -------------------------------------------------------------- val CommentProcessOBJ_New = new CommentProcessor() val actions_New = CommentProcessOBJ_New.Extract_Actions_FromComments(fullcomment) @@ -2833,7 +2184,8 @@ class VandalismDetection extends Serializable { var PreviRevision = "" // For Current Revision - CurrentRevision = row(0).toString() + row(2).toString() + row(3).toString() + row(4).toString() + row(8).toString() + row(14).toString() + row(15).toString() + row(16).toString() + CurrentRevision = row(0).toString() + row(2).toString() + row(3).toString() + row(4).toString() + + row(8).toString() + row(14).toString() + row(15).toString() + row(16).toString() if (row(5).toString() != "0") { CurrentRevision = CurrentRevision.trim() + row(5).toString() } else { @@ -2843,10 +2195,12 @@ class VandalismDetection extends Serializable { // For Previous Revision : if (row(17) != null && row(19) != null && row(20) != null && row(21) != null && row(25) != null && row(31) != null && row(32) != null && row(33) != null) { if (row(22) != null && row(22).toString() != "0") { - var PreviRevision = row(17).toString() + row(19).toString() + row(20).toString() + row(21).toString() + row(25).toString() + row(31).toString() + row(32).toString() + row(33).toString() + row(22).toString() + var PreviRevision = row(17).toString() + row(19).toString() + row(20).toString() + row(21).toString() + + row(25).toString() + row(31).toString() + row(32).toString() + row(33).toString() + row(22).toString() } else if (row(23) != null && row(24) != null) { - var PreviRevision = row(17).toString() + row(19).toString() + row(20).toString() + row(21).toString() + row(25).toString() + row(31).toString() + row(32).toString() + row(33).toString() + row(23).toString() + row(24).toString() + var PreviRevision = row(17).toString() + row(19).toString() + row(20).toString() + row(21).toString() + + row(25).toString() + row(31).toString() + row(32).toString() + row(33).toString() + row(23).toString() + row(24).toString() } else { PreviRevision = null @@ -2868,7 +2222,7 @@ class VandalismDetection extends Serializable { } - //13. Time since last Revision: ---------------------------------------------------------------------- + // 13. Time since last Revision: ---------------------------------------------------------------------- if (row(21) != null) { @@ -2886,11 +2240,11 @@ class VandalismDetection extends Serializable { } - //14. Comment Length:--------------------------------------- + // 14. Comment Length:--------------------------------------- var lengthcomment = fullcomment.length().toString() full_Str_Result = full_Str_Result + "," + lengthcomment - //15. Revision SubAction: + // 15. 
Revision SubAction: val CommentProcessOBJ2 = new CommentProcessor() val actions2 = CommentProcessOBJ2.Extract_Actions_FromComments(fullcomment) @@ -2898,7 +2252,7 @@ class VandalismDetection extends Serializable { var SubAction2 = ActionsArray2(1) full_Str_Result = full_Str_Result + "," + SubAction2.trim() - //16.Prev_revision SubAction: + // 16.Prev_revision SubAction: if (row(19) != null) { var Prev_fullcomment2 = row(19).toString() val Prev_CommentProcessOBJ2 = new CommentProcessor() @@ -2921,7 +2275,7 @@ class VandalismDetection extends Serializable { } - //======================== + // ======================== def RoundDouble(va: Double): Double = { @@ -2984,4 +2338,4 @@ class VandalismDetection extends Serializable { } -}// endl class ------- +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/WordsFeatures.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/WordsFeatures.scala index 1cf0ee1..bc3ca45 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/WordsFeatures.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/WordsFeatures.scala @@ -1,7 +1,8 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection -import java.util.regex.{ Pattern, Matcher } -import java.util.{ List, Arrays, ArrayList } +import java.util.{ ArrayList, Arrays, List } +import java.util.regex.{ Matcher, Pattern } + import org.apache.commons.lang3.StringUtils class WordsFeatures extends Serializable { @@ -15,53 +16,53 @@ class WordsFeatures extends Serializable { def Vector_Words_Feature(StrValue: String): Array[Double] = { var RatioValues = new Array[Double](17) val WordsFeature_OBJ = new WordsFeatures() - //1. Double for LanguageWord Ratio - ok + // 1. Double for LanguageWord Ratio - ok val LanguageWord = LanguageWordRatio_Character(StrValue) if (!LanguageWord.isNaN()) { RatioValues(0) = RoundDouble(LanguageWord) } - //2. Boolean --> Double for Contain language word - ok (1 Boolean) + // 2. Boolean --> Double for Contain language word - ok (1 Boolean) val IsContainLanguageWord = ContainLanguageWord(StrValue) if (IsContainLanguageWord == true) { RatioValues(1) = 1.0 } else if (IsContainLanguageWord == false) { RatioValues(1) = 0.0 } - //3.Double for LowerCaseWord Ratio - ok + // 3.Double for LowerCaseWord Ratio - ok val LowerCaseWord = LowercaseWordRation(StrValue) if (!LowerCaseWord.isNaN()) { RatioValues(2) = RoundDouble(LowerCaseWord) } - //4.Integer --> to Double for LongestWord - ok (1 Integer) + // 4.Integer --> to Double for LongestWord - ok (1 Integer) val LongWord = LongestWord(StrValue) if (LongWord != null) { val castedValue = LongWord.toDouble RatioValues(3) = castedValue } - //5.Boolean --> Double for word Contain URL -ok(2 boolean) + // 5.Boolean --> Double for word Contain URL -ok(2 boolean) val IsWordContainURL = ContainURLWord(StrValue) if (IsWordContainURL == true) { RatioValues(4) = 1.0 } else if (IsWordContainURL == false) { RatioValues(4) = 0.0 } - //6.Double for Bad Word Ratio - ok + // 6.Double for Bad Word Ratio - ok val BadWord = BadWordRation(StrValue) if (!BadWord.isNaN()) { RatioValues(5) = RoundDouble(BadWord) } - //7. Double for UppercaseWord Ratio -ok + // 7. 
Double for UppercaseWord Ratio -ok val UpperCaseWord = UppercaseWordRation(StrValue) if (!UpperCaseWord.isNaN()) { RatioValues(6) = RoundDouble(UpperCaseWord) } - //8.Double for Ban Word Ratio - ok + // 8.Double for Ban Word Ratio - ok val BanWord = BanWordRation(StrValue) if (!BanWord.isNaN()) { RatioValues(7) = RoundDouble(BanWord) } - //9.Boolean Femal FirstName (3 Boolean ) + // 9.Boolean Femal FirstName (3 Boolean ) val IsFemalFirstName = FemaleName_word(StrValue) if (IsFemalFirstName == true) { @@ -70,7 +71,7 @@ class WordsFeatures extends Serializable { RatioValues(8) = 0.0 } - //10. Boolean Male FirstName (4 Boolean) + // 10. Boolean Male FirstName (4 Boolean) val IsMaleFirstName = MaleName_word(StrValue) if (IsMaleFirstName == true) { RatioValues(9) = 1.0 @@ -78,7 +79,7 @@ class WordsFeatures extends Serializable { RatioValues(9) = 0.0 } - //11. Boolean containBadWord_word (5 Boolean ) + // 11. Boolean containBadWord_word (5 Boolean ) val IsContainBad_Word = containBadWord_word(StrValue) if (IsContainBad_Word == true) { @@ -87,7 +88,7 @@ class WordsFeatures extends Serializable { RatioValues(10) = 0.0 } - //12. Boolean containBanWord_word (6 Boolean) + // 12. Boolean containBanWord_word (6 Boolean) val IsContainBan_Word = BanBuilderWordlist_word(StrValue) if (IsContainBan_Word == true) { @@ -124,16 +125,69 @@ class WordsFeatures extends Serializable { } - //1.Language Words Ratio : - val regex_LanguageWordRatio: String = "(a(frikaa?ns|lbanian?|lemanha|ng(lais|ol)|ra?b(e?|[ei]c|ian?|isc?h)|rmenian?|ssamese|azeri|z[e\\u0259]rba(ijani?|ycan(ca)?|yjan)|\\u043d\\u0433\\u043b\\u0438\\u0439\\u0441\\u043a\\u0438\\u0439)|b(ahasa( (indonesia|jawa|malaysia|melayu))?|angla|as(k|qu)e|[aeo]ng[ao]?li|elarusian?|okm\\u00e5l|osanski|ra[sz]il(ian?)?|ritish( kannada)?|ulgarian?)|c(ebuano|hina|hinese( simplified)?|zech|roat([eo]|ian?)|atal[a\\u00e0]n?|\\u0440\\u043f\\u0441\\u043a\\u0438|antonese)|[c\\u010d](esky|e[s\\u0161]tina)\r\n|d(an(isc?h|sk)|e?uts?ch)|e(esti|ll[hi]nika|ng(els|le(ski|za)|lisc?h)|spa(g?[n\\u00f1]h?i?ol|nisc?h)|speranto|stonian|usk[ae]ra)|f(ilipino|innish|ran[c\\u00e7](ais|e|ez[ao])|ren[cs]h|arsi|rancese)|g(al(ego|ician)|uja?rati|ree(ce|k)|eorgian|erman[ay]?|ilaki)|h(ayeren|ebrew|indi|rvatski|ungar(y|ian))|i(celandic|ndian?|ndonesian?|ngl[e\\u00ea]se?|ngilizce|tali(ano?|en(isch)?))|ja(pan(ese)?|vanese)|k(a(nn?ada|zakh)|hmer|o(rean?|sova)|urd[i\\u00ee])|l(at(in[ao]?|vi(an?|e[s\\u0161]u))|ietuvi[u\\u0173]|ithuanian?)|m(a[ck]edon(ian?|ski)|agyar|alay(alam?|sian?)?|altese|andarin|arathi|elayu|ontenegro|ongol(ian?)|yanmar)|n(e(d|th)erlands?|epali|orw(ay|egian)|orsk( bokm[a\\u00e5]l)?|ynorsk)|o(landese|dia)|p(ashto|ersi?an?|ol(n?isc?h|ski)|or?tugu?[e\\u00ea]se?(( d[eo])? 
brasil(eiro)?| ?\\(brasil\\))?|unjabi)|r(om[a\\u00e2i]ni?[a\\u0103]n?|um(ano|\\u00e4nisch)|ussi([ao]n?|sch))|s(anskrit|erbian|imple english|inha?la|lov(ak(ian?)?|en\\u0161?[c\\u010d]ina|en(e|ij?an?)|uomi)|erbisch|pagnolo?|panisc?h|rbeska|rpski|venska|c?wedisc?h|hqip)|t(a(galog|mil)|elugu|hai(land)?|i[e\\u1ebf]ng vi[e\\u1ec7]t|[u\\u00fc]rk([c\\u00e7]e|isc?h|i\\u015f|ey))|u(rdu|zbek)|v(alencia(no?)?|ietnamese)|welsh|(\\u0430\\u043d\\u0433\\u043b\\u0438\\u0438\\u0441|[k\\u043a]\\u0430\\u043b\\u043c\\u044b\\u043a\\u0441|[k\\u043a]\\u0430\\u0437\\u0430\\u0445\\u0441|\\u043d\\u0435\\u043c\\u0435\\u0446|[p\\u0440]\\u0443\\u0441\\u0441|[y\\u0443]\\u0437\\u0431\\u0435\\u043a\\u0441)\\u043a\\u0438\\u0439( \\u044f\\u0437\\u044b\\u043a)??|\\u05e2\\u05d1\\u05e8\\u05d9\\u05ea|[k\\u043a\\u049b](\\u0430\\u0437\\u0430[\\u043a\\u049b]\\u0448\\u0430|\\u044b\\u0440\\u0433\\u044b\\u0437\\u0447\\u0430|\\u0438\\u0440\\u0438\\u043b\\u043b)|\\u0443\\u043a\\u0440\\u0430\\u0457\\u043d\\u0441\\u044c\\u043a(\\u0430|\\u043e\\u044e)|\\u0431(\\u0435\\u043b\\u0430\\u0440\\u0443\\u0441\\u043a\\u0430\\u044f|\\u044a\\u043b\\u0433\\u0430\\u0440\\u0441\\u043a\\u0438( \\u0435\\u0437\\u0438\\u043a)?)|\\u03b5\\u03bb\\u03bb[\\u03b7\\u03b9]\\u03bd\\u03b9\\u03ba(\\u03ac|\\u03b1)|\\u10e5\\u10d0\\u10e0\\u10d7\\u10e3\\u10da\\u10d8|\\u0939\\u093f\\u0928\\u094d\\u0926\\u0940|\\u0e44\\u0e17\\u0e22|[m\\u043c]\\u043e\\u043d\\u0433\\u043e\\u043b(\\u0438\\u0430)?|([c\\u0441]\\u0440\\u043f|[m\\u043c]\\u0430\\u043a\\u0435\\u0434\\u043e\\u043d)\\u0441\\u043a\\u0438|\\u0627\\u0644\\u0639\\u0631\\u0628\\u064a\\u0629|\\u65e5\\u672c\\u8a9e|\\ud55c\\uad6d(\\ub9d0|\\uc5b4)|\\u200c\\u0939\\u093f\\u0928\\u0926\\u093c\\u093f|\\u09ac\\u09be\\u0982\\u09b2\\u09be|\\u0a2a\\u0a70\\u0a1c\\u0a3e\\u0a2c\\u0a40|\\u092e\\u0930\\u093e\\u0920\\u0940|\\u0c95\\u0ca8\\u0ccd\\u0ca8\\u0ca1|\\u0627\\u064f\\u0631\\u062f\\u064f\\u0648|\\u0ba4\\u0bae\\u0bbf\\u0bb4\\u0bcd|\\u0c24\\u0c46\\u0c32\\u0c41\\u0c17\\u0c41|\\u0a97\\u0ac1\\u0a9c\\u0ab0\\u0abe\\u0aa4\\u0ac0|\\u0641\\u0627\\u0631\\u0633\\u06cc|\\u067e\\u0627\\u0631\\u0633\\u06cc|\\u0d2e\\u0d32\\u0d2f\\u0d3e\\u0d33\\u0d02|\\u067e\\u069a\\u062a\\u0648|\\u1019\\u103c\\u1014\\u103a\\u1019\\u102c\\u1018\\u102c\\u101e\\u102c|\\u4e2d\\u6587(\\u7b80\\u4f53|\\u7e41\\u9ad4)?|\\u4e2d\\u6587\\uff08(\\u7b80\\u4f53?|\\u7e41\\u9ad4)\\uff09|\\u7b80\\u4f53|\\u7e41\\u9ad4)"; + // 1.Language Words Ratio : + val regex_LanguageWordRatio: String = "(a(frikaa?ns|lbanian?|lemanha|ng(lais|ol)|ra?b(e?|[ei]c|ian?|isc?h)" + + "|rmenian?|ssamese|azeri|z[e\\u0259]rba(ijani?|ycan(ca)?|yjan)|\\u043d\\u0433\\u043b\\u0438\\u0439\\u0441\\u043a\\u0438\\u0439)" + + "|b(ahasa( (indonesia|jawa|malaysia|melayu))?|angla|as(k|qu)e|[aeo]ng[ao]?li|elarusian?|okm\\u00e5l|osanski" + + "|ra[sz]il(ian?)?|ritish( kannada)?|ulgarian?)|c(ebuano|hina|hinese( simplified)?|zech|roat([eo]|ian?)" + + "|atal[a\\u00e0]n?|\\u0440\\u043f\\u0441\\u043a\\u0438|antonese)|[c\\u010d](esky|e[s\\u0161]tina)\r\n" + + "|d(an(isc?h|sk)|e?uts?ch)|e(esti|ll[hi]nika|ng(els|le(ski|za)|lisc?h)|spa(g?[n\\u00f1]h?i?ol|nisc?h)" + + "|speranto|stonian|usk[ae]ra)|f(ilipino|innish|ran[c\\u00e7](ais|e|ez[ao])|ren[cs]h|arsi|rancese)" + + "|g(al(ego|ician)|uja?rati|ree(ce|k)|eorgian|erman[ay]?|ilaki)|h(ayeren|ebrew|indi|rvatski|ungar(y|ian))" + + "|i(celandic|ndian?|ndonesian?|ngl[e\\u00ea]se?|ngilizce|tali(ano?|en(isch)?))|ja(pan(ese)?|vanese)" + + "|k(a(nn?ada|zakh)|hmer|o(rean?|sova)|urd[i\\u00ee])|l(at(in[ao]?|vi(an?|e[s\\u0161]u))|ietuvi[u\\u0173]" + + 
"|ithuanian?)|m(a[ck]edon(ian?|ski)|agyar|alay(alam?|sian?)?|altese|andarin|arathi|elayu|ontenegro|ongol(ian?)" + + "|yanmar)|n(e(d|th)erlands?|epali|orw(ay|egian)|orsk( bokm[a\\u00e5]l)?|ynorsk)|o(landese|dia)|p(ashto" + + "|ersi?an?|ol(n?isc?h|ski)|or?tugu?[e\\u00ea]se?(( d[eo])? brasil(eiro)?| ?\\(brasil\\))?|unjabi)|r(om[a\\u00e2i]ni?[a\\u0103]n?|um(ano" + + "|\\u00e4nisch)|ussi([ao]n?|sch))|s(anskrit|erbian|imple english|inha?la|lov(ak(ian?)?|en\\u0161?[c\\u010d]ina|en(e|ij?an?)|uomi)" + + "|erbisch|pagnolo?|panisc?h|rbeska|rpski|venska|c?wedisc?h|hqip)|t(a(galog|mil)|elugu|hai(land)?|i[e\\u1ebf]ngv i[e\\u1ec7]t" + + "|[u\\u00fc]rk([c\\u00e7]e|isc?h|i\\u015f|ey))|u(rdu|zbek)|v(alencia(no?)?|ietnamese)|welsh" + + "|(\\u0430\\u043d\\u0433\\u043b\\u0438\\u0438\\u0441|[k\\u043a]\\u0430\\u043b\\u043c\\u044b\\u043a\\u0441" + + "|[k\\u043a]\\u0430\\u0437\\u0430\\u0445\\u0441|\\u043d\\u0435\\u043c\\u0435\\u0446|[p\\u0440]\\u0443\\u0441\\u0441" + + "|[y\\u0443]\\u0437\\u0431\\u0435\\u043a\\u0441)\\u043a\\u0438\\u0439( \\u044f\\u0437\\u044b\\u043a)??|\\u05e2\\u05d1\\u05e8\\u05d9\\u05ea" + + "|[k\\u043a\\u049b](\\u0430\\u0437\\u0430[\\u043a\\u049b]\\u0448\\u0430|\\u044b\\u0440\\u0433\\u044b\\u0437\\u0447\\u0430|\\u0438\\u0440\\u0438\\u043b\\u043b)" + + "|\\u0443\\u043a\\u0440\\u0430\\u0457\\u043d\\u0441\\u044c\\u043a(\\u0430|\\u043e\\u044e)|\\u0431(\\u0435\\u043b\\u0430\\u0440\\u0443\\u0441\\u043a\\u0430\\u044f" + + "|\\u044a\\u043b\\u0433\\u0430\\u0440\\u0441\\u043a\\u0438( \\u0435\\u0437\\u0438\\u043a)?)|\\u03b5\\u03bb\\u03bb[\\u03b7\\u03b9]\\u03bd\\u03b9\\u03ba(\\u03ac|\\u03b1)" + + "|\\u10e5\\u10d0\\u10e0\\u10d7\\u10e3\\u10da\\u10d8|\\u0939\\u093f\\u0928\\u094d\\u0926\\u0940|\\u0e44\\u0e17\\u0e22|[m\\u043c]\\u043e\\u043d\\u0433\\u043e\\u043b(\\u0438\\u0430)?" + + "|([c\\u0441]\\u0440\\u043f|[m\\u043c]\\u0430\\u043a\\u0435\\u0434\\u043e\\u043d)\\u0441\\u043a\\u0438|\\u0627\\u0644\\u0639\\u0631\\u0628\\u064a\\u0629|\\u65e5\\u672c\\u8a9e" + + "|\\ud55c\\uad6d(\\ub9d0|\\uc5b4)|\\u200c\\u0939\\u093f\\u0928\\u0926\\u093c\\u093f|\\u09ac\\u09be\\u0982\\u09b2\\u09be|\\u0a2a\\u0a70\\u0a1c\\u0a3e\\u0a2c\\u0a40" + + "|\\u092e\\u0930\\u093e\\u0920\\u0940|\\u0c95\\u0ca8\\u0ccd\\u0ca8\\u0ca1|\\u0627\\u064f\\u0631\\u062f\\u064f\\u0648|\\u0ba4\\u0bae\\u0bbf\\u0bb4\\u0bcd" + + "|\\u0c24\\u0c46\\u0c32\\u0c41\\u0c17\\u0c41|\\u0a97\\u0ac1\\u0a9c\\u0ab0\\u0abe\\u0aa4\\u0ac0|\\u0641\\u0627\\u0631\\u0633\\u06cc|\\u067e\\u0627\\u0631\\u0633\\u06cc" + + "|\\u0d2e\\u0d32\\u0d2f\\u0d3e\\u0d33\\u0d02|\\u067e\\u069a\\u062a\\u0648|\\u1019\\u103c\\u1014\\u103a\\u1019\\u102c\\u1018\\u102c\\u101e\\u102c|\\u4e2d\\u6587(\\u7b80\\u4f53" + + "|\\u7e41\\u9ad4)?|\\u4e2d\\u6587\\uff08(\\u7b80\\u4f53?|\\u7e41\\u9ad4)\\uff09|\\u7b80\\u4f53|\\u7e41\\u9ad4)" + val pattern_LanguageWordRatio: Pattern = Pattern.compile(regex_LanguageWordRatio); def LanguageWordRatio_Character(str: String): Double = { val result: Double = WordRatio(str, pattern_LanguageWordRatio) result } - //2. 
Contain language word : - val regex_ContainLanguageWord: String = "(^|\\n)([ei]n )??(a(frikaa?ns|lbanian?|lemanha|ng(lais|ol)|ra?b(e?|[ei]c|ian?|isc?h)|rmenian?|ssamese|azeri|z[e\\u0259]rba(ijani?|ycan(ca)?|yjan)|\\u043d\\u0433\\u043b\\u0438\\u0439\\u0441\\u043a\\u0438\\u0439)|b(ahasa( (indonesia|jawa|malaysia|melayu))?|angla|as(k|qu)e|[aeo]ng[ao]?li|elarusian?|okm\\u00e5l|osanski|ra[sz]il(ian?)?|ritish( kannada)?|ulgarian?)|c(ebuano|hina|hinese( simplified)?|zech|roat([eo]|ian?)|atal[a\\u00e0]n?|\\u0440\\u043f\\u0441\\u043a\\u0438|antonese)|[c\\u010d](esky|e[s\\u0161]tina)\r\n|d(an(isc?h|sk)|e?uts?ch)|e(esti|ll[hi]nika|ng(els|le(ski|za)|lisc?h)|spa(g?[n\\u00f1]h?i?ol|nisc?h)|speranto|stonian|usk[ae]ra)|f(ilipino|innish|ran[c\\u00e7](ais|e|ez[ao])|ren[cs]h|arsi|rancese)|g(al(ego|ician)|uja?rati|ree(ce|k)|eorgian|erman[ay]?|ilaki)|h(ayeren|ebrew|indi|rvatski|ungar(y|ian))|i(celandic|ndian?|ndonesian?|ngl[e\\u00ea]se?|ngilizce|tali(ano?|en(isch)?))|ja(pan(ese)?|vanese)|k(a(nn?ada|zakh)|hmer|o(rean?|sova)|urd[i\\u00ee])|l(at(in[ao]?|vi(an?|e[s\\u0161]u))|ietuvi[u\\u0173]|ithuanian?)|m(a[ck]edon(ian?|ski)|agyar|alay(alam?|sian?)?|altese|andarin|arathi|elayu|ontenegro|ongol(ian?)|yanmar)|n(e(d|th)erlands?|epali|orw(ay|egian)|orsk( bokm[a\\u00e5]l)?|ynorsk)|o(landese|dia)|p(ashto|ersi?an?|ol(n?isc?h|ski)|or?tugu?[e\\u00ea]se?(( d[eo])? brasil(eiro)?| ?\\(brasil\\))?|unjabi)|r(om[a\\u00e2i]ni?[a\\u0103]n?|um(ano|\\u00e4nisch)|ussi([ao]n?|sch))|s(anskrit|erbian|imple english|inha?la|lov(ak(ian?)?|en\\u0161?[c\\u010d]ina|en(e|ij?an?)|uomi)|erbisch|pagnolo?|panisc?h|rbeska|rpski|venska|c?wedisc?h|hqip)|t(a(galog|mil)|elugu|hai(land)?|i[e\\u1ebf]ng vi[e\\u1ec7]t|[u\\u00fc]rk([c\\u00e7]e|isc?h|i\\u015f|ey))|u(rdu|zbek)|v(alencia(no?)?|ietnamese)|welsh|(\\u0430\\u043d\\u0433\\u043b\\u0438\\u0438\\u0441|[k\\u043a]\\u0430\\u043b\\u043c\\u044b\\u043a\\u0441|[k\\u043a]\\u0430\\u0437\\u0430\\u0445\\u0441|\\u043d\\u0435\\u043c\\u0435\\u0446|[p\\u0440]\\u0443\\u0441\\u0441|[y\\u0443]\\u0437\\u0431\\u0435\\u043a\\u0441)\\u043a\\u0438\\u0439( \\u044f\\u0437\\u044b\\u043a)??|\\u05e2\\u05d1\\u05e8\\u05d9\\u05ea|[k\\u043a\\u049b](\\u0430\\u0437\\u0430[\\u043a\\u049b]\\u0448\\u0430|\\u044b\\u0440\\u0433\\u044b\\u0437\\u0447\\u0430|\\u0438\\u0440\\u0438\\u043b\\u043b)|\\u0443\\u043a\\u0440\\u0430\\u0457\\u043d\\u0441\\u044c\\u043a(\\u0430|\\u043e\\u044e)|\\u0431(\\u0435\\u043b\\u0430\\u0440\\u0443\\u0441\\u043a\\u0430\\u044f|\\u044a\\u043b\\u0433\\u0430\\u0440\\u0441\\u043a\\u0438( 
\\u0435\\u0437\\u0438\\u043a)?)|\\u03b5\\u03bb\\u03bb[\\u03b7\\u03b9]\\u03bd\\u03b9\\u03ba(\\u03ac|\\u03b1)|\\u10e5\\u10d0\\u10e0\\u10d7\\u10e3\\u10da\\u10d8|\\u0939\\u093f\\u0928\\u094d\\u0926\\u0940|\\u0e44\\u0e17\\u0e22|[m\\u043c]\\u043e\\u043d\\u0433\\u043e\\u043b(\\u0438\\u0430)?|([c\\u0441]\\u0440\\u043f|[m\\u043c]\\u0430\\u043a\\u0435\\u0434\\u043e\\u043d)\\u0441\\u043a\\u0438|\\u0627\\u0644\\u0639\\u0631\\u0628\\u064a\\u0629|\\u65e5\\u672c\\u8a9e|\\ud55c\\uad6d(\\ub9d0|\\uc5b4)|\\u200c\\u0939\\u093f\\u0928\\u0926\\u093c\\u093f|\\u09ac\\u09be\\u0982\\u09b2\\u09be|\\u0a2a\\u0a70\\u0a1c\\u0a3e\\u0a2c\\u0a40|\\u092e\\u0930\\u093e\\u0920\\u0940|\\u0c95\\u0ca8\\u0ccd\\u0ca8\\u0ca1|\\u0627\\u064f\\u0631\\u062f\\u064f\\u0648|\\u0ba4\\u0bae\\u0bbf\\u0bb4\\u0bcd|\\u0c24\\u0c46\\u0c32\\u0c41\\u0c17\\u0c41|\\u0a97\\u0ac1\\u0a9c\\u0ab0\\u0abe\\u0aa4\\u0ac0|\\u0641\\u0627\\u0631\\u0633\\u06cc|\\u067e\\u0627\\u0631\\u0633\\u06cc|\\u0d2e\\u0d32\\u0d2f\\u0d3e\\u0d33\\u0d02|\\u067e\\u069a\\u062a\\u0648|\\u1019\\u103c\\u1014\\u103a\\u1019\\u102c\\u1018\\u102c\\u101e\\u102c|\\u4e2d\\u6587(\\u7b80\\u4f53|\\u7e41\\u9ad4)?|\\u4e2d\\u6587\\uff08(\\u7b80\\u4f53?|\\u7e41\\u9ad4)\\uff09|\\u7b80\\u4f53|\\u7e41\\u9ad4)( language)??($|\\n)"; + // 2. Contain language word : + val regex_ContainLanguageWord: String = "(^|\\n)([ei]n )??(a(frikaa?ns|lbanian?|lemanha|ng(lais|ol)|ra?b(e?|[ei]c|ian?|isc?h)" + + "|rmenian?|ssamese|azeri|z[e\\u0259]rba(ijani?|ycan(ca)?|yjan)|\\u043d\\u0433\\u043b\\u0438\\u0439\\u0441\\u043a\\u0438\\u0439)" + + "|b(ahasa( (indonesia|jawa|malaysia|melayu))?|angla|as(k|qu)e|[aeo]ng[ao]?li|elarusian?|okm\\u00e5l|osanski|ra[sz]il(ian?)?" + + "|ritish( kannada)?|ulgarian?)|c(ebuano|hina|hinese( simplified)?|zech|roat([eo]|ian?)|atal[a\\u00e0]n?|\\u0440\\u043f\\u0441\\u043a\\u0438|antonese)" + + "|[c\\u010d](esky|e[s\\u0161]tina)\r\n|d(an(isc?h|sk)|e?uts?ch)|e(esti|ll[hi]nika|ng(els|le(ski|za)|lisc?h)|spa(g?[n\\u00f1]h?i?ol|nisc?h)" + + "|speranto|stonian|usk[ae]ra)|f(ilipino|innish|ran[c\\u00e7](ais|e|ez[ao])|ren[cs]h|arsi|rancese)|g(al(ego|ician)|uja?rati|ree(ce|k)" + + "|eorgian|erman[ay]?|ilaki)|h(ayeren|ebrew|indi|rvatski|ungar(y|ian))|i(celandic|ndian?|ndonesian?|ngl[e\\u00ea]se?|ngilizce|tali(ano?|en(isch)?))" + + "|ja(pan(ese)?|vanese)|k(a(nn?ada|zakh)|hmer|o(rean?|sova)|urd[i\\u00ee])|l(at(in[ao]?|vi(an?|e[s\\u0161]u))|ietuvi[u\\u0173]|ithuanian?)|m(a[ck]edon(ian?" + + "|ski)|agyar|alay(alam?|sian?)?|altese|andarin|arathi|elayu|ontenegro|ongol(ian?)|yanmar)|n(e(d|th)erlands?|epali|orw(ay|egian)|orsk( bokm[a\\u00e5]l)?|ynorsk)|o(landese|dia)" + + "|p(ashto|ersi?an?|ol(n?isc?h|ski)|or?tugu?[e\\u00ea]se?(( d[eo])? 
brasil(eiro)?| ?\\(brasil\\))?|unjabi)|r(om[a\\u00e2i]ni?[a\\u0103]n?|um(ano|\\u00e4nisch)" + + "|ussi([ao]n?|sch))|s(anskrit|erbian|imple english|inha?la|lov(ak(ian?)?|en\\u0161?[c\\u010d]ina|en(e|ij?an?)|uomi)|erbisch|pagnolo?|panisc?h|rbeska|rpski|venska|c?wedisc?h|hqip)" + + "|t(a(galog|mil)|elugu|hai(land)?|i[e\\u1ebf]ng vi[e\\u1ec7]t|[u\\u00fc]rk([c\\u00e7]e|isc?h|i\\u015f|ey))|u(rdu|zbek)|v(alencia(no?)?|ietnamese)" + + "|welsh|(\\u0430\\u043d\\u0433\\u043b\\u0438\\u0438\\u0441|[k\\u043a]\\u0430\\u043b\\u043c\\u044b\\u043a\\u0441|[k\\u043a]\\u0430\\u0437\\u0430\\u0445\\u0441|\\u043d\\u0435\\u043c\\u0435\\u0446" + + "|[p\\u0440]\\u0443\\u0441\\u0441|[y\\u0443]\\u0437\\u0431\\u0435\\u043a\\u0441)\\u043a\\u0438\\u0439( \\u044f\\u0437\\u044b\\u043a)??|\\u05e2\\u05d1\\u05e8\\u05d9\\u05ea" + + "|[k\\u043a\\u049b](\\u0430\\u0437\\u0430[\\u043a\\u049b]\\u0448\\u0430|\\u044b\\u0440\\u0433\\u044b\\u0437\\u0447\\u0430|\\u0438\\u0440\\u0438\\u043b\\u043b)" + + "|\\u0443\\u043a\\u0440\\u0430\\u0457\\u043d\\u0441\\u044c\\u043a(\\u0430|\\u043e\\u044e)|\\u0431(\\u0435\\u043b\\u0430\\u0440\\u0443\\u0441\\u043a\\u0430\\u044f" + + "|\\u044a\\u043b\\u0433\\u0430\\u0440\\u0441\\u043a\\u0438( \\u0435\\u0437\\u0438\\u043a)?)|\\u03b5\\u03bb\\u03bb[\\u03b7\\u03b9]\\u03bd\\u03b9\\u03ba(\\u03ac|\\u03b1)" + + "|\\u10e5\\u10d0\\u10e0\\u10d7\\u10e3\\u10da\\u10d8|\\u0939\\u093f\\u0928\\u094d\\u0926\\u0940|\\u0e44\\u0e17\\u0e22|[m\\u043c]\\u043e\\u043d\\u0433\\u043e\\u043b(\\u0438\\u0430)?|([c\\u0441]\\u0440\\u043f" + + "|[m\\u043c]\\u0430\\u043a\\u0435\\u0434\\u043e\\u043d)\\u0441\\u043a\\u0438|\\u0627\\u0644\\u0639\\u0631\\u0628\\u064a\\u0629|\\u65e5\\u672c\\u8a9e|\\ud55c\\uad6d(\\ub9d0" + + "|\\uc5b4)|\\u200c\\u0939\\u093f\\u0928\\u0926\\u093c\\u093f|\\u09ac\\u09be\\u0982\\u09b2\\u09be|\\u0a2a\\u0a70\\u0a1c\\u0a3e\\u0a2c\\u0a40|\\u092e\\u0930\\u093e\\u0920\\u0940" + + "|\\u0c95\\u0ca8\\u0ccd\\u0ca8\\u0ca1|\\u0627\\u064f\\u0631\\u062f\\u064f\\u0648|\\u0ba4\\u0bae\\u0bbf\\u0bb4\\u0bcd|\\u0c24\\u0c46\\u0c32\\u0c41\\u0c17\\u0c41" + + "|\\u0a97\\u0ac1\\u0a9c\\u0ab0\\u0abe\\u0aa4\\u0ac0|\\u0641\\u0627\\u0631\\u0633\\u06cc|\\u067e\\u0627\\u0631\\u0633\\u06cc|\\u0d2e\\u0d32\\u0d2f\\u0d3e\\u0d33\\u0d02" + + "|\\u067e\\u069a\\u062a\\u0648|\\u1019\\u103c\\u1014\\u103a\\u1019\\u102c\\u1018\\u102c\\u101e\\u102c|\\u4e2d\\u6587(\\u7b80\\u4f53|\\u7e41\\u9ad4)?|\\u4e2d\\u6587\\uff08(\\u7b80\\u4f53?" + + "|\\u7e41\\u9ad4)\\uff09|\\u7b80\\u4f53|\\u7e41\\u9ad4)( language)??($|\\n)" + val pattern_ContainLanguageWord: Pattern = Pattern.compile(regex_ContainLanguageWord); val matcher_ContainLanguageWord: Matcher = pattern_ContainLanguageWord.matcher(""); def ContainLanguageWord(str: String): Boolean = { @@ -149,20 +203,20 @@ class WordsFeatures extends Serializable { result } - //3. Upper case word Ratio: + // 3. Upper case word Ratio: def UppercaseWordRation(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{Lu}.*") val result: Double = WordRatio(str, pattern) result } - //4. Lower case word Ratio: + // 4. 
Lower case word Ratio: def LowercaseWordRation(str: String): Double = { val pattern: Pattern = Pattern.compile("[\\p{L}&&[^\\p{Lu}]].*") val result: Double = WordRatio(str, pattern) result } - //5.word Contain URL : + // 5.word Contain URL : val pattern_WordContainURL: Pattern = Pattern.compile("\\b(https?:\\/\\/|www\\.)\\S{10}.*", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE | Pattern.DOTALL | Pattern.CANON_EQ) val matcher_WordContainURL: Matcher = pattern_WordContainURL.matcher(""); @@ -179,7 +233,7 @@ class WordsFeatures extends Serializable { result } - //6. Longest Word + // 6. Longest Word val pattern_longestWord: Pattern = Pattern.compile("\\p{IsAlphabetic}+", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE | Pattern.DOTALL | Pattern.CANON_EQ); val matcher_longestWord: Matcher = pattern_WordContainURL.matcher(""); @@ -203,7 +257,7 @@ class WordsFeatures extends Serializable { max } - //7. Bad Word : It is Ok + // 7. Bad Word : It is Ok val luisVonAhnWordlist: Array[String] = Array("abbo", "abo", "abortion", "abuse", "addict", "addicts", "adult", "africa", @@ -465,7 +519,7 @@ class WordsFeatures extends Serializable { } - //8. Contain Bad Word:It is ok + // 8. Contain Bad Word:It is ok val tokens_containbadword: List[String] = new ArrayList[String](Arrays.asList(luisVonAhnWordlist: _*)) val patternString_containBadword: String = ".*\\b(" + StringUtils.join(tokens_containbadword, "|") + ")\\b.*" val pattern_containBadword: Pattern = Pattern.compile(patternString, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE | Pattern.DOTALL | Pattern.CANON_EQ) @@ -481,7 +535,7 @@ class WordsFeatures extends Serializable { results } - //9.Ban Builder Word:It is OK + // 9.Ban Builder Word:It is OK val BanBuilderWordlist: Array[String] = Array("$#!+", "$1ut", "$h1t", "$hit", "$lut", "'ho", "'hobag", "a$$", "anal", "anus", "ass", "assmunch", "b1tch", "ballsack", "bastard", "beaner", @@ -629,7 +683,7 @@ class WordsFeatures extends Serializable { results } - //10 Ban word Ratio: + // 10 Ban word Ratio: val tokens_ban: List[String] = new ArrayList[String](Arrays.asList(BanBuilderWordlist: _*)) val patternString_ban: String = StringUtils.join(tokens_ban, "|") val pattern_banWord: Pattern = Pattern.compile(patternString_ban) @@ -645,8 +699,34 @@ class WordsFeatures extends Serializable { } - //11.Contain language word:It is ok - val regex_containLanguageWord: String = ".*(a(frikaa?ns|lbanian?|lemanha|ng(lais|ol)|ra?b(e?|[ei]c|ian?|isc?h)|rmenian?|ssamese|azeri|z[e\\u0259]rba(ijani?|ycan(ca)?|yjan)|\\u043d\\u0433\\u043b\\u0438\\u0439\\u0441\\u043a\\u0438\\u0439)|b(ahasa( (indonesia|jawa|malaysia|melayu))?|angla|as(k|qu)e|[aeo]ng[ao]?li|elarusian?|okm\\u00e5l|osanski|ra[sz]il(ian?)?|ritish( kannada)?|ulgarian?)|c(ebuano|hina|hinese( 
simplified)?|zech|roat([eo]|ian?)|atal[a\\u00e0]n?|\\u0440\\u043f\\u0441\\u043a\\u0438|antonese)|[c\\u010d](esky|e[s\\u0161]tina)\r\n|d(an(isc?h|sk)|e?uts?ch)|e(esti|ll[hi]nika|ng(els|le(ski|za)|lisc?h)|spa(g?[n\\u00f1]h?i?ol|nisc?h)|speranto|stonian|usk[ae]ra)|f(ilipino|innish|ran[c\\u00e7](ais|e|ez[ao])|ren[cs]h|arsi|rancese)|g(al(ego|ician)|uja?rati|ree(ce|k)|eorgian|erman[ay]?|ilaki)|h(ayeren|ebrew|indi|rvatski|ungar(y|ian))|i(celandic|ndian?|ndonesian?|ngl[e\\u00ea]se?|ngilizce|tali(ano?|en(isch)?))|ja(pan(ese)?|vanese)|k(a(nn?ada|zakh)|hmer|o(rean?|sova)|urd[i\\u00ee])|l(at(in[ao]?|vi(an?|e[s\\u0161]u))|ietuvi[u\\u0173]|ithuanian?)|m(a[ck]edon(ian?|ski)|agyar|alay(alam?|sian?)?|altese|andarin|arathi|elayu|ontenegro|ongol(ian?)|yanmar)|n(e(d|th)erlands?|epali|orw(ay|egian)|orsk( bokm[a\\u00e5]l)?|ynorsk)|o(landese|dia)|p(ashto|ersi?an?|ol(n?isc?h|ski)|or?tugu?[e\\u00ea]se?(( d[eo])? brasil(eiro)?| ?\\(brasil\\))?|unjabi)|r(om[a\\u00e2i]ni?[a\\u0103]n?|um(ano|\\u00e4nisch)|ussi([ao]n?|sch))|s(anskrit|erbian|imple english|inha?la|lov(ak(ian?)?|en\\u0161?[c\\u010d]ina|en(e|ij?an?)|uomi)|erbisch|pagnolo?|panisc?h|rbeska|rpski|venska|c?wedisc?h|hqip)|t(a(galog|mil)|elugu|hai(land)?|i[e\\u1ebf]ng vi[e\\u1ec7]t|[u\\u00fc]rk([c\\u00e7]e|isc?h|i\\u015f|ey))|u(rdu|zbek)|v(alencia(no?)?|ietnamese)|welsh|(\\u0430\\u043d\\u0433\\u043b\\u0438\\u0438\\u0441|[k\\u043a]\\u0430\\u043b\\u043c\\u044b\\u043a\\u0441|[k\\u043a]\\u0430\\u0437\\u0430\\u0445\\u0441|\\u043d\\u0435\\u043c\\u0435\\u0446|[p\\u0440]\\u0443\\u0441\\u0441|[y\\u0443]\\u0437\\u0431\\u0435\\u043a\\u0441)\\u043a\\u0438\\u0439( \\u044f\\u0437\\u044b\\u043a)??|\\u05e2\\u05d1\\u05e8\\u05d9\\u05ea|[k\\u043a\\u049b](\\u0430\\u0437\\u0430[\\u043a\\u049b]\\u0448\\u0430|\\u044b\\u0440\\u0433\\u044b\\u0437\\u0447\\u0430|\\u0438\\u0440\\u0438\\u043b\\u043b)|\\u0443\\u043a\\u0440\\u0430\\u0457\\u043d\\u0441\\u044c\\u043a(\\u0430|\\u043e\\u044e)|\\u0431(\\u0435\\u043b\\u0430\\u0440\\u0443\\u0441\\u043a\\u0430\\u044f|\\u044a\\u043b\\u0433\\u0430\\u0440\\u0441\\u043a\\u0438( \\u0435\\u0437\\u0438\\u043a)?)|\\u03b5\\u03bb\\u03bb[\\u03b7\\u03b9]\\u03bd\\u03b9\\u03ba(\\u03ac|\\u03b1)|\\u10e5\\u10d0\\u10e0\\u10d7\\u10e3\\u10da\\u10d8|\\u0939\\u093f\\u0928\\u094d\\u0926\\u0940|\\u0e44\\u0e17\\u0e22|[m\\u043c]\\u043e\\u043d\\u0433\\u043e\\u043b(\\u0438\\u0430)?|([c\\u0441]\\u0440\\u043f|[m\\u043c]\\u0430\\u043a\\u0435\\u0434\\u043e\\u043d)\\u0441\\u043a\\u0438|\\u0627\\u0644\\u0639\\u0631\\u0628\\u064a\\u0629|\\u65e5\\u672c\\u8a9e|\\ud55c\\uad6d(\\ub9d0|\\uc5b4)|\\u200c\\u0939\\u093f\\u0928\\u0926\\u093c\\u093f|\\u09ac\\u09be\\u0982\\u09b2\\u09be|\\u0a2a\\u0a70\\u0a1c\\u0a3e\\u0a2c\\u0a40|\\u092e\\u0930\\u093e\\u0920\\u0940|\\u0c95\\u0ca8\\u0ccd\\u0ca8\\u0ca1|\\u0627\\u064f\\u0631\\u062f\\u064f\\u0648|\\u0ba4\\u0bae\\u0bbf\\u0bb4\\u0bcd|\\u0c24\\u0c46\\u0c32\\u0c41\\u0c17\\u0c41|\\u0a97\\u0ac1\\u0a9c\\u0ab0\\u0abe\\u0aa4\\u0ac0|\\u0641\\u0627\\u0631\\u0633\\u06cc|\\u067e\\u0627\\u0631\\u0633\\u06cc|\\u0d2e\\u0d32\\u0d2f\\u0d3e\\u0d33\\u0d02|\\u067e\\u069a\\u062a\\u0648|\\u1019\\u103c\\u1014\\u103a\\u1019\\u102c\\u1018\\u102c\\u101e\\u102c|\\u4e2d\\u6587(\\u7b80\\u4f53|\\u7e41\\u9ad4)?|\\u4e2d\\u6587\\uff08(\\u7b80\\u4f53?|\\u7e41\\u9ad4)\\uff09|\\u7b80\\u4f53|\\u7e41\\u9ad4).*"; + // 11.Contain language word:It is ok + val regex_containLanguageWord: String = ".*(a(frikaa?ns|lbanian?|lemanha|ng(lais|ol)|ra?b(e?|[ei]c|ian?|isc?h)|rmenian?" 
+ + "|ssamese|azeri|z[e\\u0259]rba(ijani?|ycan(ca)?|yjan)|\\u043d\\u0433\\u043b\\u0438\\u0439\\u0441\\u043a\\u0438\\u0439)" + + "|b(ahasa( (indonesia|jawa|malaysia|melayu))?|angla|as(k|qu)e|[aeo]ng[ao]?li|elarusian?|okm\\u00e5l|osanski|ra[sz]il(ian?)?" + + "|ritish( kannada)?|ulgarian?)|c(ebuano|hina|hinese( simplified)?|zech|roat([eo]|ian?)|atal[a\\u00e0]n?|\\u0440\\u043f\\u0441\\u043a\\u0438|antonese)" + + "|[c\\u010d](esky|e[s\\u0161]tina)\r\n|d(an(isc?h|sk)|e?uts?ch)|e(esti|ll[hi]nika|ng(els|le(ski|za)|lisc?h)|spa(g?[n\\u00f1]h?i?ol|nisc?h)|speranto|stonian|usk[ae]ra)" + + "|f(ilipino|innish|ran[c\\u00e7](ais|e|ez[ao])|ren[cs]h|arsi|rancese)|g(al(ego|ician)|uja?rati|ree(ce|k)|eorgian|erman[ay]?|ilaki)|h(ayeren|ebrew|indi|rvatski|ungar(y|ian))" + + "|i(celandic|ndian?|ndonesian?|ngl[e\\u00ea]se?|ngilizce|tali(ano?|en(isch)?))|ja(pan(ese)?|vanese)|k(a(nn?ada|zakh)|hmer|o(rean?|sova)|urd[i\\u00ee])" + + "|l(at(in[ao]?|vi(an?|e[s\\u0161]u))|ietuvi[u\\u0173]|ithuanian?)|m(a[ck]edon(ian?|ski)|agyar|alay(alam?|sian?)?|altese|andarin|arathi|elayu|ontenegro" + + "|ongol(ian?)|yanmar)|n(e(d|th)erlands?|epali|orw(ay|egian)|orsk( bokm[a\\u00e5]l)?|ynorsk)|o(landese|dia)|p(ashto|ersi?an?|ol(n?isc?h|ski)|or?tugu?[e\\u00ea]se?(( d[eo])? brasil(eiro)?" + + "| ?\\(brasil\\))?|unjabi)|r(om[a\\u00e2i]ni?[a\\u0103]n?|um(ano|\\u00e4nisch)|ussi([ao]n?|sch))|s(anskrit|erbian|imple english|inha?la|lov(ak(ian?)?|en\\u0161?[c\\u010d]ina|en(e|ij?an?)|uomi)|erbisch|pagnolo?" + + "|panisc?h|rbeska|rpski|venska|c?wedisc?h|hqip)|t(a(galog|mil)|elugu|hai(land)?|i[e\\u1ebf]ng vi[e\\u1ec7]t|[u\\u00fc]rk([c\\u00e7]e|isc?h|i\\u015f|ey))|u(rdu|zbek)|v(alencia(no?)?|ietnamese)|welsh" + + "|(\\u0430\\u043d\\u0433\\u043b\\u0438\\u0438\\u0441|[k\\u043a]\\u0430\\u043b\\u043c\\u044b\\u043a\\u0441|[k\\u043a]\\u0430\\u0437\\u0430\\u0445\\u0441|\\u043d\\u0435\\u043c\\u0435\\u0446" + + "|[p\\u0440]\\u0443\\u0441\\u0441|[y\\u0443]\\u0437\\u0431\\u0435\\u043a\\u0441)\\u043a\\u0438\\u0439( \\u044f\\u0437\\u044b\\u043a)??|\\u05e2\\u05d1\\u05e8\\u05d9\\u05ea" + + "|[k\\u043a\\u049b](\\u0430\\u0437\\u0430[\\u043a\\u049b]\\u0448\\u0430|\\u044b\\u0440\\u0433\\u044b\\u0437\\u0447\\u0430|\\u0438\\u0440\\u0438\\u043b\\u043b)" + + "|\\u0443\\u043a\\u0440\\u0430\\u0457\\u043d\\u0441\\u044c\\u043a(\\u0430|\\u043e\\u044e)|\\u0431(\\u0435\\u043b\\u0430\\u0440\\u0443\\u0441\\u043a\\u0430\\u044f" + + "|\\u044a\\u043b\\u0433\\u0430\\u0440\\u0441\\u043a\\u0438( \\u0435\\u0437\\u0438\\u043a)?)|\\u03b5\\u03bb\\u03bb[\\u03b7\\u03b9]\\u03bd\\u03b9\\u03ba(\\u03ac|\\u03b1)" + + "|\\u10e5\\u10d0\\u10e0\\u10d7\\u10e3\\u10da\\u10d8" + + "|\\u0939\\u093f\\u0928\\u094d\\u0926\\u0940|\\u0e44\\u0e17\\u0e22|[m\\u043c]\\u043e\\u043d\\u0433\\u043e\\u043b(\\u0438\\u0430)?|([c\\u0441]\\u0440\\u043f" + + "|[m\\u043c]\\u0430\\u043a\\u0435\\u0434\\u043e\\u043d)\\u0441\\u043a\\u0438" + + "|\\u0627\\u0644\\u0639\\u0631\\u0628\\u064a\\u0629|\\u65e5\\u672c\\u8a9e|\\ud55c\\uad6d(\\ub9d0|\\uc5b4)" + + "|\\u200c\\u0939\\u093f\\u0928\\u0926\\u093c\\u093f|\\u09ac\\u09be\\u0982\\u09b2\\u09be|\\u0a2a\\u0a70\\u0a1c\\u0a3e\\u0a2c\\u0a40" + + "|\\u092e\\u0930\\u093e\\u0920\\u0940|\\u0c95\\u0ca8\\u0ccd\\u0ca8\\u0ca1|\\u0627\\u064f\\u0631\\u062f\\u064f\\u0648" + + "|\\u0ba4\\u0bae\\u0bbf\\u0bb4\\u0bcd|\\u0c24\\u0c46\\u0c32\\u0c41\\u0c17\\u0c41|\\u0a97\\u0ac1\\u0a9c\\u0ab0\\u0abe\\u0aa4\\u0ac0" + + "|\\u0641\\u0627\\u0631\\u0633\\u06cc|\\u067e\\u0627\\u0631\\u0633\\u06cc|\\u0d2e\\u0d32\\u0d2f\\u0d3e\\u0d33\\u0d02" + + 
"|\\u067e\\u069a\\u062a\\u0648|\\u1019\\u103c\\u1014\\u103a\\u1019\\u102c\\u1018\\u102c\\u101e\\u102c|\\u4e2d\\u6587(\\u7b80\\u4f53" + + "|\\u7e41\\u9ad4)?|\\u4e2d\\u6587\\uff08(\\u7b80\\u4f53?|\\u7e41\\u9ad4)\\uff09|\\u7b80\\u4f53|\\u7e41\\u9ad4).*" + val pattern_forContainLanguageWord: Pattern = Pattern.compile(regex_containLanguageWord); val matcher_containLanguageWord: Matcher = pattern_forContainLanguageWord.matcher(""); def containLanguageBadWord_word(str: String): Boolean = { @@ -660,7 +740,7 @@ class WordsFeatures extends Serializable { results } - //12. Male Names: It is ok + // 12. Male Names: It is ok val MaleNames: Array[String] = Array("AARON", "ADAM", "ADRIAN", "ALAN", "ALBERT", "ALBERTO", "ALEX", "ALEXANDER", "ALFRED", "ALFREDO", "ALLAN", "ALLEN", "ALVIN", "ANDRE", "ANDREW", "ANDY", @@ -725,7 +805,7 @@ class WordsFeatures extends Serializable { } - //13. Female Names: It is ok + // 13. Female Names: It is ok val FemaleNames: Array[String] = Array("AGNES", "ALICE", "ALICIA", "ALLISON", "ALMA", "AMANDA", "AMBER", "AMY", "ANA", "ANDREA", "ANGELA", "ANITA", "ANN", "ANNA", "ANNE", "ANNETTE", @@ -934,10 +1014,8 @@ class WordsFeatures extends Serializable { } } - results } - def GetNumberofLinks(str: String): Double = { val input: String = str @@ -948,7 +1026,44 @@ class WordsFeatures extends Serializable { count } - val RegexStr = "(a(frikaa?ns|lbanian?|lemanha|ng(lais|ol)|ra?b(e?|" + "[ei]c|ian?|isc?h)|rmenian?|ssamese|azeri|z[eə]rba" + "(ijani?|ycan(ca)?|yjan)|нглийский)|b(ahasa( (indonesia|" + "jawa|malaysia|melayu))?|angla|as(k|qu)e|[aeo]ng[ao]?li|" + "elarusian?|okmål|osanski|ra[sz]il(ian?)?|ritish( " + "kannada)?|ulgarian?)|c(ebuano|hina|hinese( simplified)?" + "|zech|roat([eo]|ian?)|atal[aà]n?|рпски|antonese)|[cč]" + "(esky|e[sš]tina)|d(an(isc?h|sk)|e?uts?ch)|e(esti|ll[hi]" + "nika|ng(els|le(ski|za)|lisc?h)|spa(g?[nñ]h?i?ol|nisc?h)" + "|speranto|stonian|usk[ae]ra)|f(ilipino|innish|ran[cç]" + "(ais|e|ez[ao])|ren[cs]h|arsi|rancese)|g(al(ego|ician)|" + "uja?rati|ree(ce|k)|eorgian|erman[ay]?|ilaki)|h(ayeren|" + "ebrew|indi|rvatski|ungar(y|ian))|i(celandic|ndian?|" + "ndonesian?|ngl[eê]se?|ngilizce|tali(ano?|en(isch)?))|" + "ja(pan(ese)?|vanese)|k(a(nn?ada|zakh)|hmer|o(rean?|" + "sova)|urd[iî])|l(at(in[ao]?|vi(an?|e[sš]u))|ietuvi[uų]" + "|ithuanian?)|m(a[ck]edon(ian?|ski)|agyar|alay(alam?|" + "sian?)?|altese|andarin|arathi|elayu|ontenegro|ongol" + "(ian?)|yanmar)|n(e(d|th)erlands?|epali|orw(ay|egian)|" + "orsk( bokm[aå]l)?|ynorsk)|o(landese|dia)|p(ashto|" + "ersi?an?|ol(n?isc?h|ski)|or?tugu?[eê]se?(( d[eo])?" + "brasil(eiro)?| ?\\(brasil\\))?|unjabi)|r(om[aâi]ni?[aă]n?" 
+ "|um(ano|änisch)|ussi([ao]n?|sch))|s(anskrit|erbian|" + "imple english|inha?la|lov(ak(ian?)?|enš?[cč]ina|" + "en(e|ij?an?)|uomi)|erbisch|pagnolo?|panisc?h|rbeska|" + "rpski|venska|c?wedisc?h|hqip)|t(a(galog|mil)|elugu|" + "hai(land)?|i[eế]ng vi[eệ]t|[uü]rk([cç]e|isc?h|iş|ey))|" + "u(rdu|zbek)|v(alencia(no?)?|ietnamese)|welsh|(англиис|" + "[kк]алмыкс|[kк]азахс|немец|[pр]усс|[yу]збекс|" + "татарс)кий( язык)??|עברית|[kкқ](аза[кқ]ша|ыргызча|" + "ирилл)|українськ(а|ою)|б(еларуская|" + "ългарски( език)?)|ελλ[ηι]" + "νικ(ά|α)|ქართული|हिन्दी|ไทย|[mм]онгол(иа)?|([cс]рп|" + "[mм]акедон)ски|العربية|日本語|한국(말|어)|‌हिनद़ि| " + " বাংলা|ਪੰਜਾਬੀ|मराठी|ಕನ್ನಡ|اُردُو|தமிழ்|తెలుగు|ગુજરાતી|" + "فارسی|پارسی|മലയാളം|پښتو|မြန်မာဘာသာ|中文(简体|繁體)?|" + "中文((简体?|繁體))|简体|繁體)"; + val RegexStr = "(a(frikaa?ns|lbanian?|lemanha|ng(lais|ol)|ra?b(e?|" + + "[ei]c|ian?|isc?h)|rmenian?|ssamese|azeri|z[eə]rba" + + "(ijani?|ycan(ca)?|yjan)|нглийский)|b(ahasa( (indonesia|" + + "jawa|malaysia|melayu))?|angla|as(k|qu)e|[aeo]ng[ao]?li|" + + "elarusian?|okmål|osanski|ra[sz]il(ian?)?|ritish( " + + "kannada)?|ulgarian?)|c(ebuano|hina|hinese( simplified)?" + + "|zech|roat([eo]|ian?)|atal[aà]n?|рпски|antonese)|[cč]" + + "(esky|e[sš]tina)|d(an(isc?h|sk)|e?uts?ch)|e(esti|ll[hi]" + + "nika|ng(els|le(ski|za)|lisc?h)|spa(g?[nñ]h?i?ol|nisc?h)" + + "|speranto|stonian|usk[ae]ra)|f(ilipino|innish|ran[cç]" + + "(ais|e|ez[ao])|ren[cs]h|arsi|rancese)|g(al(ego|ician)|" + + "uja?rati|ree(ce|k)|eorgian|erman[ay]?|ilaki)|h(ayeren|" + + "ebrew|indi|rvatski|ungar(y|ian))|i(celandic|ndian?|" + + "ndonesian?|ngl[eê]se?|ngilizce|tali(ano?|en(isch)?))|" + + "ja(pan(ese)?|vanese)|k(a(nn?ada|zakh)|hmer|o(rean?|" + + "sova)|urd[iî])|l(at(in[ao]?|vi(an?|e[sš]u))|ietuvi[uų]" + + "|ithuanian?)|m(a[ck]edon(ian?|ski)|agyar|alay(alam?|" + + "sian?)?|altese|andarin|arathi|elayu|ontenegro|ongol" + + "(ian?)|yanmar)|n(e(d|th)erlands?|epali|orw(ay|egian)|" + + "orsk( bokm[aå]l)?|ynorsk)|o(landese|dia)|p(ashto|" + + "ersi?an?|ol(n?isc?h|ski)|or?tugu?[eê]se?(( d[eo])?" + + "brasil(eiro)?| ?\\(brasil\\))?|unjabi)|r(om[aâi]ni?[aă]n?" 
+ + "|um(ano|änisch)|ussi([ao]n?|sch))|s(anskrit|erbian|" + + "imple english|inha?la|lov(ak(ian?)?|enš?[cč]ina|" + + "en(e|ij?an?)|uomi)|erbisch|pagnolo?|panisc?h|rbeska|" + + "rpski|venska|c?wedisc?h|hqip)|t(a(galog|mil)|elugu|" + + "hai(land)?|i[eế]ng vi[eệ]t|[uü]rk([cç]e|isc?h|iş|ey))|" + + "u(rdu|zbek)|v(alencia(no?)?|ietnamese)|welsh|(англиис|" + + "[kк]алмыкс|[kк]азахс|немец|[pр]усс|[yу]збекс|" + + "татарс)кий( язык)??|עברית|[kкқ](аза[кқ]ша|ыргызча|" + + "ирилл)|українськ(а|ою)|б(еларуская|" + + "ългарски( език)?)|ελλ[ηι]" + + "νικ(ά|α)|ქართული|हिन्दी|ไทย|[mм]онгол(иа)?|([cс]рп|" + + "[mм]акедон)ски|العربية|日本語|한국(말|어)|‌हिनद़ि| " + + " বাংলা|ਪੰਜਾਬੀ|मराठी|ಕನ್ನಡ|اُردُو|தமிழ்|తెలుగు|ગુજરાતી|" + + "فارسی|پارسی|മലയാളം|پښتو|မြန်မာဘာသာ|中文(简体|繁體)?|" + + "中文((简体?|繁體))|简体|繁體)" + def GetNumberofLanguageword(str: String): Double = { val input: String = str val patternRegex: Pattern = Pattern.compile(RegexStr) @@ -971,5 +1086,4 @@ class WordsFeatures extends Serializable { result.toFloat } // Words features: ------ End calculation the Ratio for Words: - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/DistancesTest.scala b/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/DistancesTest.scala new file mode 100644 index 0000000..a755476 --- /dev/null +++ b/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/DistancesTest.scala @@ -0,0 +1,13 @@ +package net.sansa_stack.ml.spark.clustering.algorithms + +import org.scalatest.FunSuite + +class DistancesTest extends FunSuite { + val testData1 = Set("a", "b") + val testData2 = Set("b", "c", "d") + + test("Distances.jaccardSimilarity") { + assert(new Distances().jaccardSimilarity(testData1, testData2) === 0.25) + } +} + diff --git a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/EncoderTest.scala b/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/EncoderTest.scala new file mode 100644 index 0000000..8d90fc5 --- /dev/null +++ b/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/EncoderTest.scala @@ -0,0 +1,32 @@ +package net.sansa_stack.ml.spark.clustering.algorithms + +import com.holdenkarau.spark.testing.DataFrameSuiteBase +import org.apache.spark.ml.linalg.DenseVector +import org.apache.spark.rdd.RDD +import org.scalatest.FunSuite + +class EncoderTest extends FunSuite with DataFrameSuiteBase { + test("Encoder.mdsEncoding") { + val mdsTestData = List((1.toLong, 2.toLong, 0.5), (1.toLong, 3.toLong, 1.0), (2.toLong, 3.toLong, 1.0)) + val mdsTestDataRDD: RDD[(Long, Long, Double)] = spark.sparkContext.parallelize(mdsTestData) + /* val oneHotTestData = List((1.toLong, Set("a", "b")), (2.toLong, Set("b", "c")), (3.toLong, Set("b", "d"))) + val oneHotTestDataRDD: RDD[(Long, Set[String])] = spark.sparkContext.parallelize(oneHotTestData) +*/ + val (mdsEncodedDF, mdsEncoded) = new Encoder().mdsEncoding(mdsTestDataRDD, 3, 2, spark) + assert(mdsEncodedDF.head().getAs[DenseVector](mdsEncodedDF.head().length - 1).size === 2) // (x, y) coordinate + } + + /* test("Encoder.oneHotEncoding") { + val (oneHotEncodedDF, oneHotEncoded) = new Encoder().oneHotEncoding(oneHotTestDataRDD, spark) + assert(oneHotEncodedDF.head().getAs[DenseVector](oneHotEncodedDF.head().length-1).size === 4) // encoded vector + } + + test("Encoder.wordVectorEncoder") { + val (word2VecEncodedDF, word2VecEncoded) = new Encoder().wordVectorEncoder(oneHotTestDataRDD, spark) + 
assert(word2VecEncodedDF.head().getAs[DenseVector](word2VecEncodedDF.head().length-1).size >= 1) // vector size for poi should be larger than equal to 1 + } +*/ +} + + + diff --git a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/KmeansTest.scala b/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/KmeansTest.scala new file mode 100644 index 0000000..a94ad7b --- /dev/null +++ b/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/KmeansTest.scala @@ -0,0 +1,17 @@ +package net.sansa_stack.ml.spark.clustering.algorithms + +import com.holdenkarau.spark.testing.DataFrameSuiteBase +import org.apache.spark.rdd.RDD +import org.scalatest.FunSuite + +class KmeansTest extends FunSuite with DataFrameSuiteBase { + test("Kmeans.mdsEncoding") { + val mdsTestData = spark.sparkContext.parallelize(List((1.toLong, 2.toLong, 0.5), (1.toLong, 3.toLong, 1.0), (2.toLong, 3.toLong, 1.0))) + mdsTestData.foreach(println) + // val mdsTestDataRDD: RDD[(Long, Long, Double)] = sc.parallelize(mdsTestData) + val (mdsEncodedDF, mdsEncoded) = new Encoder().mdsEncoding(mdsTestData, 3, 2, spark) + val km_result = new Kmeans().kmClustering(2, 2, mdsEncodedDF, spark) + assert(km_result.size === 2) // 2 clusters + } +} + diff --git a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/MultiDSTest.scala b/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/MultiDSTest.scala new file mode 100644 index 0000000..1ecca91 --- /dev/null +++ b/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/MultiDSTest.scala @@ -0,0 +1,14 @@ +package net.sansa_stack.ml.spark.clustering.algorithms + +import com.holdenkarau.spark.testing.DataFrameSuiteBase +import org.apache.spark.rdd.RDD +import org.scalatest.FunSuite + +class MultiDSTest extends FunSuite with DataFrameSuiteBase { + test("multiDS.multiDimensionScaling") { + val testData = List((1.toLong, 2.toLong, 0.5), (1.toLong, 3.toLong, 1.0), (2.toLong, 3.toLong, 1.0)) + val testDataRDD: RDD[(Long, Long, Double)] = spark.sparkContext.parallelize(testData) + val coordinates = new MultiDS().multiDimensionScaling(testDataRDD, 3, 2) + assert(coordinates.length === 3 && coordinates.head._2.length === 2) + } +} diff --git a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/PICTest.scala b/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/PICTest.scala new file mode 100644 index 0000000..dec944c --- /dev/null +++ b/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/PICTest.scala @@ -0,0 +1,15 @@ +package net.sansa_stack.ml.spark.clustering.algorithms + +import com.holdenkarau.spark.testing.DataFrameSuiteBase +import org.apache.spark.rdd.RDD +import org.scalatest.FunSuite + +class PICTest extends FunSuite with DataFrameSuiteBase { + test("PIC.picSparkML") { + val testData = List((1.toLong, 2.toLong, 1.0), (1.toLong, 3.toLong, 0.0), (2.toLong, 3.toLong, 0.0)) + val testDataRDD: RDD[(Long, Long, Double)] = spark.sparkContext.parallelize(testData) + val clusters = new PIC().picSparkML(testDataRDD, 2, 1, sparkSession = spark) + assert(clusters.size === 2) + } +} + diff --git a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/RdfPicTest.scala b/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/RdfPicTest.scala similarity index 92% rename from sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/RdfPicTest.scala rename to 
sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/RdfPicTest.scala index 9b1b1be..3c2ffdf 100644 --- a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/RdfPicTest.scala +++ b/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/RdfPicTest.scala @@ -1,4 +1,4 @@ -package net.sansa_stack.ml.spark.clustering +package net.sansa_stack.ml.spark.clustering.algorithms import com.holdenkarau.spark.testing.DataFrameSuiteBase import net.sansa_stack.rdf.spark.io._ diff --git a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/kernel/RDFFastGraphKernelTests.scala b/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/kernel/RDFFastGraphKernelTests.scala index 0025d09..994c517 100644 --- a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/kernel/RDFFastGraphKernelTests.scala +++ b/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/kernel/RDFFastGraphKernelTests.scala @@ -31,5 +31,4 @@ class RDFFastGraphKernelTests extends FunSuite with DataFrameSuiteBase { assert(true) } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/anomalydetection/AnomalyDetectionTests.scala b/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/AnomalyDetectionTests.scala similarity index 97% rename from sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/anomalydetection/AnomalyDetectionTests.scala rename to sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/AnomalyDetectionTests.scala index b5696d0..475d555 100644 --- a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/anomalydetection/AnomalyDetectionTests.scala +++ b/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/AnomalyDetectionTests.scala @@ -1,11 +1,11 @@ -package net.sansa_stack.ml.spark.anomalydetection +package net.sansa_stack.ml.spark.outliers.anomalydetection import com.holdenkarau.spark.testing.DataFrameSuiteBase -import net.sansa_stack.ml.spark.outliers.anomalydetection._ import org.apache.jena.riot.Lang import org.apache.spark.rdd.RDD import org.scalatest.FunSuite +import net.sansa_stack.ml.spark.outliers.anomalydetection._ class AnomalyDetectionTests extends FunSuite with DataFrameSuiteBase { import net.sansa_stack.rdf.spark.io._ @@ -34,9 +34,6 @@ class AnomalyDetectionTests extends FunSuite with DataFrameSuiteBase { val hypernym = "http://purl.org/linguistics/gold/hypernym" test("performing anomaly detection using HashingTF method should result in size 36") { - - - val triples = spark.rdf(Lang.NTRIPLES)(path) triples.repartition(125).persist @@ -52,7 +49,6 @@ class AnomalyDetectionTests extends FunSuite with DataFrameSuiteBase { test("performing anomaly detection using CountVetcorizerModel method should result in size 15") { - val triples = spark.rdf(Lang.NTRIPLES)(path) triples.repartition(125).persist @@ -80,3 +76,5 @@ class AnomalyDetectionTests extends FunSuite with DataFrameSuiteBase { } } + + diff --git a/scalastyle-config.xml b/scalastyle-config.xml index 6728712..d1cf125 100644 --- a/scalastyle-config.xml +++ b/scalastyle-config.xml @@ -116,7 +116,7 @@ This file is divided into 3 sections: - +
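The word-level features reworked in WordsFeatures.scala follow one encoding convention throughout: each ratio check is rounded into a Double slot of a fixed-size array, each boolean check is stored as 1.0 or 0.0, and a NaN result leaves the slot at its 0.0 default. A minimal sketch of that convention, assuming a toy two-slot vector and a placeholder bad-word list (WordFeatureSketch, uppercaseWordRatio and badWords are illustrative names, not part of the patch; the patched Vector_Words_Feature fills 17 slots from the regex-based checks shown above):

object WordFeatureSketch {
  // Placeholder list; the patch uses the much larger luisVonAhnWordlist / BanBuilderWordlist arrays.
  private val badWords = Set("badword1", "badword2")

  // Ratio of words that start with an upper-case letter; NaN for empty input so callers can skip the slot.
  def uppercaseWordRatio(text: String): Double = {
    val words = text.split("\\s+").filter(_.nonEmpty)
    if (words.isEmpty) Double.NaN else words.count(_.head.isUpper).toDouble / words.length
  }

  def features(text: String): Array[Double] = {
    val vec = new Array[Double](2)        // the patched code uses 17 slots
    val upper = uppercaseWordRatio(text)
    if (!upper.isNaN) {                   // NaN keeps the 0.0 default, as in the patched code
      vec(0) = BigDecimal(upper).setScale(3, BigDecimal.RoundingMode.HALF_UP).toDouble
    }
    val hasBadWord = text.split("\\s+").exists(w => badWords.contains(w.toLowerCase))
    vec(1) = if (hasBadWord) 1.0 else 0.0 // boolean feature encoded as 1.0 / 0.0
    vec
  }
}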
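The new DistancesTest pins the expected behaviour with a worked example: for A = {a, b} and B = {b, c, d}, the Jaccard similarity is |A ∩ B| / |A ∪ B| = 1 / 4 = 0.25. A standalone sketch of that computation (the method under test is the project's Distances.jaccardSimilarity; the jaccard helper below only illustrates the formula, including an assumed 0.0 result for two empty sets):

def jaccard[A](x: Set[A], y: Set[A]): Double =
  if (x.isEmpty && y.isEmpty) 0.0
  else x.intersect(y).size.toDouble / x.union(y).size

jaccard(Set("a", "b"), Set("b", "c", "d"))  // one shared element out of four in the union = 0.25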
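PICTest drives the new PIC wrapper with (srcId, dstId, similarity) triples and expects two clusters back. For orientation, a minimal sketch of the plain Spark MLlib call, under the assumption that the wrapper's picSparkML delegates to PowerIterationClustering; the PicSketch object and its sample graph (two tight pairs joined by one weak edge) are illustrative and not taken from the patch:

import org.apache.spark.mllib.clustering.PowerIterationClustering
import org.apache.spark.sql.SparkSession

object PicSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("pic-sketch").getOrCreate()

    // (srcId, dstId, similarity) triples describing an undirected affinity graph.
    val similarities = spark.sparkContext.parallelize(Seq(
      (0L, 1L, 1.0), (2L, 3L, 1.0), (1L, 2L, 0.1)))

    val model = new PowerIterationClustering()
      .setK(2)                 // two clusters, as the test asserts
      .setMaxIterations(10)
      .run(similarities)

    model.assignments.collect().foreach(a => println(s"vertex ${a.id} -> cluster ${a.cluster}"))
    spark.stop()
  }
}

The new test suites get their SparkSession from spark-testing-base's DataFrameSuiteBase instead of building one as above, which keeps each suite down to the test data and the assertion.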