diff --git a/.gitignore b/.gitignore
index 417122f..7bface9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,4 +19,6 @@ project/plugins/project/
# IntelliJ IDEA specific
/.idea
-sansa-ml-parent_2.11.iml
+*.iml
+
+scalastyle-output.xml
diff --git a/.travis.yml b/.travis.yml
index 2e15138..682ef63 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,5 +1,10 @@
-language: java
+language: scala
sudo: false
cache:
directories:
- - $HOME/.m2
\ No newline at end of file
+ - $HOME/.m2
+scala:
+ - 2.11.11
+script:
+ - mvn scalastyle:check
+ - mvn test
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
index e240109..4a4c58b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -4,7 +4,7 @@
4.0.0
net.sansa-stack
sansa-ml-parent_2.11
- 0.4.0
+ 0.5.0
pom
ML API - Parent
RDF/OWL Machine Learning Library for Big Data
@@ -20,8 +20,8 @@
- GNU GENERAL PUBLIC LICENSE, Version 3
- http://www.gnu.org/licenses/gpl-3.0.txt
+ Apache License 2.0
+ http://www.apache.org/licenses/LICENSE-2.0.html
repo
@@ -65,10 +65,11 @@
UTF-8
2.11.11
2.11
- 2.3.1
- 1.5.0
- 3.7.0
- 0.4.0
+ 2.4.0
+ 1.7.0
+ 3.9.0
+ 0.5.0
+ 0.4.1
${project.basedir}/scalastyle-config.xml
@@ -85,14 +86,14 @@
net.sansa-stack
sansa-rdf-spark_${scala.binary.version}
- ${sansa.version}
+ ${sansa.rdf.version}
net.sansa-stack
sansa-owl-spark_${scala.binary.version}
- ${sansa.version}
+ ${sansa.owl.version}
@@ -157,13 +158,7 @@
org.scalatest
scalatest_${scala.binary.version}
- 3.0.3
- test
-
-
- com.holdenkarau
- spark-testing-base_${scala.binary.version}
- 2.3.0_0.9.0
+ 2.2.6
test
@@ -173,6 +168,7 @@
scala-logging_${scala.binary.version}
3.5.0
+
com.github.scopt
@@ -180,11 +176,25 @@
3.5.0
-
- com.google.guava
- guava
- 19.0
+ com.holdenkarau
+ spark-testing-base_${scala.binary.version}
+ 2.3.0_0.9.0
+ test
+
+
+
+ org.glassfish.jersey
+ jersey-bom
+ 2.26-b03
+ pom
+ import
+
+
+
+ org.apache.commons
+ commons-compress
+ 1.18
@@ -193,11 +203,44 @@
+
+
+ org.apache.maven.plugins
+ maven-source-plugin
+ 3.0.1
+
+
+ attach-sources
+ verify
+
+ jar-no-fork
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-javadoc-plugin
+ 2.10.4
+
+ false
+
+
+
+ attach-javadocs
+
+ jar
+
+
+
+
+
net.alchim31.maven
scala-maven-plugin
- 3.2.1
+ 3.3.1
@@ -209,13 +252,18 @@
-dependencyfile
${project.build.directory}/.scala_dependencies
+ -Xmax-classfile-name
+ 128
+
+ -Xss2048K
+
${scala.version}
- incremental
+
@@ -229,6 +277,48 @@
+
+ com.amashchenko.maven.plugin
+ gitflow-maven-plugin
+ 1.8.0
+
+
+ v
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-gpg-plugin
+ 1.6
+
+
+ sign-artifacts
+ verify
+
+ sign
+
+
+ AKSW
+ ${gpg.keyname}
+
+
+
+
+
+
+ org.sonatype.plugins
+ nexus-staging-maven-plugin
+ 1.6.8
+ true
+
+ ossrh
+ https://oss.sonatype.org/
+ true
+
+
+
org.apache.maven.plugins
@@ -263,7 +353,7 @@
1.0.0
false
- false
+ true
true
false
${project.basedir}/src/main/scala
@@ -287,65 +377,49 @@
-
+
+
+ root-dir
+
+
+ ${project.basedir}/../../scalastyle-config.xml
+
+
+
+ ${project.basedir}/../scalastyle-config.xml
+
+
release
-
-
- ossrh
- https://oss.sonatype.org/service/local/staging/deploy/maven2/
-
-
+
+
+ performRelease
+ true
+
+
+
- net.alchim31.maven
- scala-maven-plugin
- 3.2.2
-
-
-
- compile
- testCompile
-
-
- ${scala.version}
- incremental
- true
-
- -unchecked
- -deprecation
- -feature
- -dependencyfile
- ${project.build.directory}/.scala_dependencies
-
-
-
-
-
- attach-javadocs
-
- doc-jar
-
-
-
-
+ org.apache.maven.plugins
+ maven-gpg-plugin
- org.apache.maven.plugins
- maven-source-plugin
-
-
- verify
- attach-sources
-
- jar
-
-
-
+ org.sonatype.plugins
+ nexus-staging-maven-plugin
+
+
+
+
+ doclint-java8-disable
+
+ [1.8,)
+
+
+
org.apache.maven.plugins
maven-javadoc-plugin
@@ -355,55 +429,18 @@
jar
-
-
-
-
-
- org.apache.maven.plugins
- maven-gpg-plugin
- 1.6
-
-
- sign-artifacts
- verify
-
- sign
-
- AKSW
- ${gpg.keyname}
+ false
-
-
-
- org.sonatype.plugins
- nexus-staging-maven-plugin
- 1.6.7
- true
- ossrh
- https://oss.sonatype.org/
- true
+ -Xdoclint:none
-
-
- root-dir
-
-
- ${project.basedir}/../../scalastyle-config.xml
-
-
-
- ${project.basedir}/../scalastyle-config.xml
-
-
diff --git a/sansa-ml-common/pom.xml b/sansa-ml-common/pom.xml
index 1d4ead0..57bae30 100644
--- a/sansa-ml-common/pom.xml
+++ b/sansa-ml-common/pom.xml
@@ -5,7 +5,7 @@
sansa-ml-parent_2.11
net.sansa-stack
- 0.4.0
+ 0.5.0
sansa-ml-common_2.11
ML API - Common
@@ -18,6 +18,20 @@
scala-library
+
+
+ net.sf.extjwnl
+ extjwnl
+ 1.9.4
+
+
+
+
+ net.sf.extjwnl
+ extjwnl-data-wn31-map
+ 1.0
+
+
junit
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/nlp/wordnet/WordNet.scala b/sansa-ml-common/src/main/scala/net/sansa_stack/ml/common/nlp/wordnet/WordNet.scala
similarity index 81%
rename from sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/nlp/wordnet/WordNet.scala
rename to sansa-ml-common/src/main/scala/net/sansa_stack/ml/common/nlp/wordnet/WordNet.scala
index 93a4a44..546ff00 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/nlp/wordnet/WordNet.scala
+++ b/sansa-ml-common/src/main/scala/net/sansa_stack/ml/common/nlp/wordnet/WordNet.scala
@@ -10,14 +10,15 @@
* and ws4j
* and nltk project
*/
-package net.sansa_stack.ml.spark.nlp.wordnet
+package net.sansa_stack.ml.common.nlp.wordnet
import java.io.Serializable
-import scala.collection.JavaConversions._
+
+import net.sf.extjwnl.data.{PointerType, PointerUtils, Word}
+import net.sf.extjwnl.dictionary.Dictionary
+import scala.collection.JavaConverters._
import scala.collection.breakOut
import scala.collection.mutable.ArrayBuffer
-import net.sf.extjwnl.dictionary.Dictionary
-import net.sf.extjwnl.data.{PointerType, PointerUtils, Word}
/**
* WordNet singleton to initialize WordNet dataset
@@ -33,8 +34,11 @@ object WordNet {
*/
class WordNet extends Serializable {
+ var maxDepth = 0
+
/**
* Returns an instance of the WordNet dictionary used in the package
+ *
* @return
*/
def getDict: Dictionary = WordNet.dict
@@ -46,10 +50,9 @@ class WordNet extends Serializable {
* @return : List[Synset]
*/
def getSynsets(lemma: String): List[Synset] =
- net.sf.extjwnl.data.POS.getAllPOS
+ net.sf.extjwnl.data.POS.getAllPOS.asScala
.flatMap(pos => getSynsets(lemma, pos))(breakOut)
-
/**
* Returns a Synset given a String
* Returns empty list if the lemma did not exist in the WordNet
@@ -63,7 +66,8 @@ class WordNet extends Serializable {
val indexWord = WordNet.dict.getIndexWord(pos, lemma)
var result = List.empty[Synset]
if (indexWord != null) {
- result = List(indexWord.getSenses()(sid))
+ val result_scala = indexWord.getSenses().asScala
+ result = List(result_scala(sid))
}
result
}
@@ -79,7 +83,7 @@ class WordNet extends Serializable {
def getSynsets(lemma: String, pos: POS): List[Synset] = {
val iword = WordNet.dict.getIndexWord(pos, lemma)
if (iword == null) List.empty[Synset]
- else iword.getSenses.toList
+ else iword.getSenses.asScala.toList
}
/**
@@ -89,7 +93,7 @@ class WordNet extends Serializable {
* @return : List[String]
*/
def lemmaNames(synset: Synset): List[String] =
- synset.getWords.map(_.getLemma)(breakOut)
+ synset.getWords.asScala.map(_.getLemma)(breakOut)
/**
* Input is a synset
@@ -172,7 +176,7 @@ class WordNet extends Serializable {
* @return : List[Synset]
*/
def relatedSynsets(synset: Synset, ptr: PointerType): List[Synset] =
- synset.getPointers(ptr).map(ptr => ptr.getTarget.asInstanceOf[Synset])(breakOut)
+ synset.getPointers(ptr).asScala.map(ptr => ptr.getTarget.asInstanceOf[Synset])(breakOut)
/**
* Returns list of all hypernyms of a synset
@@ -180,12 +184,12 @@ class WordNet extends Serializable {
* @param synset :Synset
* @return : List[Synset]
*/
- def allHypernyms(synset: Synset): List[List[Synset]] =
+ def getAllHypernyms(synset: Synset): List[List[Synset]] =
PointerUtils
.getHypernymTree(synset)
.toList
- .map(ptnl => ptnl
- .map(ptn => ptn.getSynset)
+ .asScala.map(ptnl => ptnl
+ .asScala.map(ptn => ptn.getSynset)
.toList)(breakOut)
/**
@@ -195,7 +199,7 @@ class WordNet extends Serializable {
* @return : List[Synset]
*/
def rootHypernyms(synset: Synset): List[Synset] =
- allHypernyms(synset)
+ getAllHypernyms(synset)
.map(hp => hp.reverse.head).distinct
/**
@@ -206,8 +210,8 @@ class WordNet extends Serializable {
* @return : List[Synset]
*/
def lowestCommonHypernym(synset1: Synset, synset2: Synset): List[Synset] = {
- val paths1 = allHypernyms(synset1)
- val paths2 = allHypernyms(synset2)
+ val paths1 = getAllHypernyms(synset1)
+ val paths2 = getAllHypernyms(synset2)
lch(paths1, paths2)
}
@@ -219,7 +223,7 @@ class WordNet extends Serializable {
* @return : Integer
*/
def shortestHypernymPathLength(synset1: Synset, hypernym: Synset): Int = {
- val paths1 = allHypernyms(synset1)
+ val paths1 = getAllHypernyms(synset1)
val path = ArrayBuffer[(Synset, Int)]()
val matchedPath = paths1.zipWithIndex.filter { case (s, i) => s.contains(hypernym) }
@@ -249,15 +253,30 @@ class WordNet extends Serializable {
}
/**
- * Returns the depth of a synset
- * Since there can be several paths to root, the minimum lenth is considered
+ * Returns the length of the shortest hypernym path from this
+ * synset to the root
+ * Since there can be several paths to root, the minimum length is considered
*
* @param synset : Synset
* @return : Integer
*/
- def depth(synset: Synset): Int = {
- val lens = allHypernyms(synset)
- if (lens.isEmpty) -1 else lens.map(_.size).min - 1
+ def minDepth(synset: Synset): Int = {
+ val lists = getAllHypernyms(synset)
+ if (lists.isEmpty) -1 else lists.map(_.size).min - 1
+ }
+
+
+
+ /**
+ * Returns the length of the longest hypernym path from this
+ * synset to the root
+ * Since there can be several paths to root, the maximum length is considered
+ * @param synset : Synset
+ * @return : Integer
+ */
+ def maxDepth(synset: Synset): Int = {
+ val lists = getAllHypernyms(synset)
+ if (lists.isEmpty) -1 else lists.map(_.size).max - 1
}
/**
@@ -278,6 +297,6 @@ class WordNet extends Serializable {
*/
def relatedLemmas(word: Word, ptr: PointerType): List[Word] =
word.getPointers(ptr)
- .map(ptr => ptr.getTarget.asInstanceOf[Word])(breakOut)
+ .asScala.map(ptr => ptr.getTarget.asInstanceOf[Word])(breakOut)
-}
\ No newline at end of file
+}
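After this rename, the WordNet helper lives under net.sansa_stack.ml.common.nlp.wordnet and exposes getSynset/getSynsets together with the new getAllHypernyms, minDepth and maxDepth methods. A minimal usage sketch, mirroring the test classes further down in this patch and assuming an installed extJWNL WordNet dictionary (the object name WordNetDepthExample is illustrative only, not part of the patch):

import net.sansa_stack.ml.common.nlp.wordnet._
import net.sf.extjwnl.data.POS

object WordNetDepthExample {
  def main(args: Array[String]): Unit = {
    val wn = new WordNet                              // relies on the WordNet.dict singleton
    val dog = wn.getSynset("dog", POS.NOUN, 1).head   // first noun synset of "dog"
    val paths = wn.getAllHypernyms(dog)               // all hypernym paths up to the root
    println(s"hypernym paths to root: ${paths.size}")
    println(s"minDepth: ${wn.minDepth(dog)}, maxDepth: ${wn.maxDepth(dog)}")
  }
}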
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/nlp/wordnet/WordNetSimilarity.scala b/sansa-ml-common/src/main/scala/net/sansa_stack/ml/common/nlp/wordnet/WordNetSimilarity.scala
similarity index 67%
rename from sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/nlp/wordnet/WordNetSimilarity.scala
rename to sansa-ml-common/src/main/scala/net/sansa_stack/ml/common/nlp/wordnet/WordNetSimilarity.scala
index 047061c..86cc701 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/nlp/wordnet/WordNetSimilarity.scala
+++ b/sansa-ml-common/src/main/scala/net/sansa_stack/ml/common/nlp/wordnet/WordNetSimilarity.scala
@@ -6,29 +6,28 @@
* Inspired from:
* WordNet::Similarity of Ted Peterson
* and ws4j
- * and ntlk project
+ * and nltk project
*/
-package net.sansa_stack.ml.spark.nlp.wordnet
-
+package net.sansa_stack.ml.common.nlp.wordnet
object WordNetSimilarity extends WordNet {
/**
- * Wu & Palmer (1994) method of measuring semantic relatedness based on node counting.
- * given two synsets, synset1 and synset2 returns the similarity score
- *
- * @param synset1 :Synset
- * @param synset2 :Synset
- * @return score :Double
- */
+ * Wu & Palmer (1994) method of measuring semantic relatedness based on node counting.
+ * given two synsets, synset1 and synset2 returns the similarity score
+ *
+ * @param synset1 :Synset
+ * @param synset2 :Synset
+ * @return score :Double
+ */
def wupSimilarity(synset1: Synset, synset2: Synset): Double = {
val min = 0.0
if (synset1 == null || synset2 == null) throw new IllegalArgumentException("arg 1 or 2 was null...")
val lcs = lowestCommonHypernym(synset1, synset2)
if (lcs.isEmpty) return min
- val depth = this.depth(lcs.head)
+ val depth = this.maxDepth(lcs.head)
val depth1 = shortestHypernymPathLength(synset1, lcs.head) + depth
val depth2 = shortestHypernymPathLength(synset2, lcs.head) + depth
var score = 0.0
@@ -37,13 +36,13 @@ object WordNetSimilarity extends WordNet {
}
/**
- * Returns the distance similarity of two synsets using the shortest path linking the two synsets (if
- * one exists)
- *
- * @param synset1 : Synset
- * @param synset2 : Synset
- * @return : Double
- */
+ * Returns the distance similarity of two synsets using the shortest path linking the two synsets (if
+ * one exists)
+ *
+ * @param synset1 : Synset
+ * @param synset2 : Synset
+ * @return : Double
+ */
def shortestPathSim(synset1: Synset, synset2: Synset): Double = {
if (synset1 == null || synset2 == null) throw new IllegalArgumentException("arg 1 or 2 was null...")
@@ -56,5 +55,4 @@ object WordNetSimilarity extends WordNet {
else score = 1.toDouble / distance
score
}
-
- }
+}
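The similarity measures sit on the WordNetSimilarity object, which extends WordNet, so synsets can be looked up and compared directly. A minimal sketch, again assuming the extJWNL dictionary is available (SimilarityExample is an illustrative name, not part of the patch):

import net.sansa_stack.ml.common.nlp.wordnet._
import net.sf.extjwnl.data.POS

object SimilarityExample {
  def main(args: Array[String]): Unit = {
    val dog = WordNetSimilarity.getSynset("dog", POS.NOUN, 1).head
    val cat = WordNetSimilarity.getSynset("cat", POS.NOUN, 1).head
    // Wu & Palmer (node-counting) similarity and shortest-path similarity
    println(WordNetSimilarity.wupSimilarity(dog, cat))
    println(WordNetSimilarity.shortestPathSim(dog, cat))
  }
}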
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/nlp/wordnet/package.scala b/sansa-ml-common/src/main/scala/net/sansa_stack/ml/common/nlp/wordnet/package.scala
similarity index 89%
rename from sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/nlp/wordnet/package.scala
rename to sansa-ml-common/src/main/scala/net/sansa_stack/ml/common/nlp/wordnet/package.scala
index a1440d1..a567761 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/nlp/wordnet/package.scala
+++ b/sansa-ml-common/src/main/scala/net/sansa_stack/ml/common/nlp/wordnet/package.scala
@@ -1,6 +1,7 @@
-package net.sansa_stack.ml.spark.nlp
+package net.sansa_stack.ml.common.nlp
import java.io.Serializable
+
import net.sf.extjwnl.data.POS
package object wordnet extends Serializable {
@@ -13,4 +14,3 @@ package object wordnet extends Serializable {
val Adjective = POS.ADJECTIVE
val Adjverb = POS.ADVERB
}
-
diff --git a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/nlp/wordnet/DistanceWordNetSimilarityMeasureTests.scala b/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/DistanceWordNetSimilarityMeasureTests.scala
similarity index 80%
rename from sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/nlp/wordnet/DistanceWordNetSimilarityMeasureTests.scala
rename to sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/DistanceWordNetSimilarityMeasureTests.scala
index a2f4058..80abc36 100644
--- a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/nlp/wordnet/DistanceWordNetSimilarityMeasureTests.scala
+++ b/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/DistanceWordNetSimilarityMeasureTests.scala
@@ -1,10 +1,9 @@
-package net.sansa_stack.ml.spark.nlp.wordnet
+package net.sansa_stack.ml.common.nlp.wordnet
-import com.holdenkarau.spark.testing.DataFrameSuiteBase
-import org.scalatest.FunSuite
import net.sf.extjwnl.data._
+import org.scalatest.FunSuite
-class DistanceWordNetSimilarityMeasureTests extends FunSuite with DataFrameSuiteBase {
+class DistanceWordNetSimilarityMeasureTests extends FunSuite {
test("shortest path similarity between dog and cat synset should result in value 0.3") {
try {
diff --git a/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/TestAllHypernims.scala b/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/TestAllHypernims.scala
new file mode 100644
index 0000000..1531ced
--- /dev/null
+++ b/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/TestAllHypernims.scala
@@ -0,0 +1,27 @@
+package net.sansa_stack.ml.common.nlp.wordnet
+
+import net.sf.extjwnl.data._
+import org.scalatest.FunSuite
+
+class TestAllHypernims extends FunSuite {
+
+ test("Tests getting all hypernyms of the the first synset in the word cat") {
+ try {
+ val wn = new WordNet
+ val dict = wn.getDict
+ // getting a synset by a word and index
+
+ val cat = wn.getSynset("cat", POS.NOUN, 1).head
+
+ val getAllHypers = wn.getAllHypernyms(cat)
+
+ assert(getAllHypers != null)
+ }
+ catch {
+ case e: ExceptionInInitializerError => println("The WordNet dictionary is not installed; please check the README for instructions to enable it.")
+ }
+
+ }
+}
+
+
diff --git a/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/TestGetMaxDepth.scala b/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/TestGetMaxDepth.scala
new file mode 100644
index 0000000..6e67049
--- /dev/null
+++ b/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/TestGetMaxDepth.scala
@@ -0,0 +1,25 @@
+package net.sansa_stack.ml.common.nlp.wordnet
+
+import net.sf.extjwnl.data._
+import org.scalatest.FunSuite
+
+class TestGetMaxDepth extends FunSuite {
+
+ test("Test the function that gets the maximum depth of dataset graph ") {
+
+ try {
+ val wn = new WordNet
+ val dict = wn.getDict
+
+ val thing1 = wn.getSynset("thing", POS.NOUN, 1).head
+ val dog = wn.getSynset("dog", POS.NOUN, 1).head
+
+
+ val dogD = wn.maxDepth(dog)
+ val dogD2 = wn.minDepth(dog)
+ assert(dogD != 0)
+ } catch {
+ case e: ExceptionInInitializerError => println("The WordNet dictionary is not installed; please check the README for instructions to enable it.")
+ }
+ }
+}
diff --git a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/nlp/wordnet/TestGetSynsets.scala b/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/TestGetSynsets.scala
similarity index 73%
rename from sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/nlp/wordnet/TestGetSynsets.scala
rename to sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/TestGetSynsets.scala
index c5d5eb5..607d585 100644
--- a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/nlp/wordnet/TestGetSynsets.scala
+++ b/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/TestGetSynsets.scala
@@ -1,10 +1,9 @@
-package net.sansa_stack.ml.spark.nlp.wordnet
+package net.sansa_stack.ml.common.nlp.wordnet
-import com.holdenkarau.spark.testing.DataFrameSuiteBase
-import org.scalatest.FunSuite
import net.sf.extjwnl.data._
+import org.scalatest.FunSuite
-class TestGetSynsets extends FunSuite with DataFrameSuiteBase {
+class TestGetSynsets extends FunSuite {
test("If The WordNet dictionary is correctly installed synsets must not be null ") {
@@ -13,10 +12,9 @@ class TestGetSynsets extends FunSuite with DataFrameSuiteBase {
val dict = wn.getDict
// getting a synset by a word and index
- val thing1 = wn.getSynset("thing", POS.NOUN, 1)
+ val thing1 = wn.getSynset("thing", POS.NOUN, 1).head
// getting a list of synsets by a word
-
val thing2 = wn.getSynsets("thing", POS.NOUN).head
assert(thing1 != null)
diff --git a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/nlp/wordnet/WUPWordNetSimilarityMeasuresTests.scala b/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/WUPWordNetSimilarityMeasuresTests.scala
similarity index 70%
rename from sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/nlp/wordnet/WUPWordNetSimilarityMeasuresTests.scala
rename to sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/WUPWordNetSimilarityMeasuresTests.scala
index fa7913d..92a63e2 100644
--- a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/nlp/wordnet/WUPWordNetSimilarityMeasuresTests.scala
+++ b/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/WUPWordNetSimilarityMeasuresTests.scala
@@ -1,13 +1,12 @@
-package net.sansa_stack.ml.spark.nlp.wordnet
+package net.sansa_stack.ml.common.nlp.wordnet
-import com.holdenkarau.spark.testing.DataFrameSuiteBase
-import org.scalatest.FunSuite
-//import net.didion.jwnl.data._
import net.sf.extjwnl.data._
+import org.scalatest.FunSuite
+// import net.didion.jwnl.data._
-class WUPWordNetSimilarityMeasuresTests extends FunSuite with DataFrameSuiteBase {
+class WUPWordNetSimilarityMeasuresTests extends FunSuite {
- test("wwup similarity between dog and cat synset should result in value 0.3") {
+ test(" WUP similarity between dog and cat synset should result in value 0.3") {
try {
val wn = new WordNet
val dict = wn.getDict
diff --git a/sansa-ml-flink/pom.xml b/sansa-ml-flink/pom.xml
index 9a4c33e..5b245b4 100644
--- a/sansa-ml-flink/pom.xml
+++ b/sansa-ml-flink/pom.xml
@@ -5,7 +5,7 @@
sansa-ml-parent_2.11
net.sansa-stack
- 0.4.0
+ 0.5.0
sansa-ml-flink_2.11
ML API - Apache Flink
diff --git a/sansa-ml-spark/pom.xml b/sansa-ml-spark/pom.xml
index 662236f..55cade1 100644
--- a/sansa-ml-spark/pom.xml
+++ b/sansa-ml-spark/pom.xml
@@ -5,7 +5,7 @@
sansa-ml-parent_2.11
net.sansa-stack
- 0.4.0
+ 0.5.0
sansa-ml-spark_2.11
ML API - Apache Spark
@@ -13,6 +13,7 @@
2.8.3
+ 1.1.3
@@ -29,7 +30,6 @@
net.sansa-stack
sansa-rdf-spark_${scala.binary.version}
-
net.sansa-stack
@@ -70,9 +70,9 @@
- net.jpountz.lz4
- lz4
- 1.3.0
+ net.jpountz.lz4
+ lz4
+ 1.3.0
@@ -105,37 +105,59 @@
pom
${jena.version}
+
+
- com.github.scopt
- scopt_${scala.binary.version}
- 3.5.0
+ com.intel.analytics.bigdl
+ bigdl-SPARK_2.2
+ 0.3.0
-
- net.sf.extjwnl
- extjwnl
- 1.9.4
+ org.json
+ json
+
+
+
+
+ com.github.haifengl
+ smile-core
+ 1.5.0
+
+
+ com.github.haifengl
+ smile-netlib
+ 1.5.0
+
+
+ org.json4s
+ json4s-native_${scala.binary.version}
+ 3.6.2
+
-
+
- net.sf.extjwnl
- extjwnl-data-wn31-map
- 1.0
+ org.datasyslab
+ geospark
+ ${geospark.version}
+ provided
-
+
- com.intel.analytics.bigdl
- bigdl-SPARK_2.2
- 0.3.0
+ com.vividsolutions
+ jts
+ 1.13
+
+
- org.json
- json
+ org.datasyslab
+ geospark-sql_2.3
+ ${geospark.version}
@@ -167,12 +189,6 @@
com.github.scopt
scopt_${scala.binary.version}
-
-
- org.springframework
- spring
- 2.5.6.SEC03
-
@@ -186,6 +202,19 @@
scala-maven-plugin
+
+ org.apache.maven.plugins
+ maven-shade-plugin
+ 3.0.0
+
+
+ package
+
+ shade
+
+
+
+
maven-compiler-plugin
@@ -431,6 +460,13 @@
true
+
+
+ dbscan-on-spark-repo
+ Repo for DBSCAN on Spark
+ http://dl.bintray.com/irvingc/maven
+
+
maven.aksw.internal
AKSW Release Repository
diff --git a/sansa-ml-spark/src/main/resources/application.properties b/sansa-ml-spark/src/main/resources/application.properties
new file mode 100644
index 0000000..eed2c90
--- /dev/null
+++ b/sansa-ml-spark/src/main/resources/application.properties
@@ -0,0 +1,54 @@
+# spark configuration
+sansa.spark.master=local[*]
+sansa.spark.serializer=org.apache.spark.serializer.KryoSerializer
+sansa.spark.executor.memory=15g
+sansa.spark.driver.memory=15g
+sansa.spark.driver.maxResultSize=15g
+sansa.spark.app.name=SANSA_Clustering
+
+# clustering profile
+sansa.clustering.profile=results/profile.txt
+
+# pic clustering configuration
+sansa.clustering.pic.result=results/pic_clusters.json
+sansa.clustering.pic.matrix=results/pic_matrix.json
+sansa.clustering.pic.number_clusters=10
+sansa.clustering.pic.iterations=5
+
+# one hot km clustering configuration
+sansa.clustering.km.onehot.result=results/oneHot_kmeans_clusters.json
+sansa.clustering.km.onehot.matrix=results/oneHotMatrix.json
+sansa.clustering.km.onehot.number_clusters=10
+sansa.clustering.km.onehot.iterations=5
+
+# mds km clustering configuration
+sansa.clustering.km.mds.result=results/mds_kmeans_clusters.json
+sansa.clustering.km.mds.matrix=results/mds_coordinates.json
+sansa.clustering.km.mds.dimension=2
+sansa.clustering.km.mds.number_clusters=10
+sansa.clustering.km.mds.iterations=5
+
+# word2vec km clustering configuration
+sansa.clustering.km.word2vec.result=results/word2vec_kmeans_clusters.json
+sansa.clustering.km.word2vec.matrix=results/word2Vec.json
+sansa.clustering.km.word2vec.number_clusters=10
+sansa.clustering.km.word2vec.iterations=5
+
+# dataset configuration
+#sansa.data.input=data/merged_tomtom_yelp/
+#sansa.data.input=data/tomtom_pois_austria_v0.3.nt
+sansa.data.input=src/main/resources/Cluster/input.nt
+sansa.data.termValueUri=http://example.org/def#termValue
+sansa.data.termPrefix=http://example.org/id/term/
+sansa.data.typePOI=http://example.org/def#POI
+sansa.data.coordinatesPredicate=http://www.opengis.net/ont/geosparql#asWKT
+sansa.data.categoryPOI=http://example.org/def#category
+sansa.data.poiPrefix=http://example.org/id/poi/
+
+
+# sansa and yelp file merge
+sansa.merge.input=src/main/resources/Cluster/input.nt
+#yelp.sansa.merged_file=data/tomtom_yelp.nt
+yelp.data.input=src/main/resources/Cluster/categories.nt
+yelp.data.categoryPOI=http://example.org/hasYelpCategory
+yelp.data.rating=http://example.org/hasRating
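The new application.properties uses plain Java property syntax, so a job can read it with java.util.Properties; the loader below is only a hedged sketch of that idea (ClusteringConfig and its fields are assumptions, not part of this patch):

import java.util.Properties

object ClusteringConfig {
  // Load application.properties from the classpath (src/main/resources).
  private val props = new Properties()
  props.load(getClass.getResourceAsStream("/application.properties"))

  val sparkMaster: String = props.getProperty("sansa.spark.master")
  val inputPath: String   = props.getProperty("sansa.data.input")
  val picClusters: Int    = props.getProperty("sansa.clustering.pic.number_clusters").toInt
  val picIterations: Int  = props.getProperty("sansa.clustering.pic.iterations").toInt
}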
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/ClassMembership.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/ClassMembership.scala
index 48c57f9..caf62cf 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/ClassMembership.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/ClassMembership.scala
@@ -1,25 +1,20 @@
package net.sansa_stack.ml.spark.classification
import java.io.PrintStream
-import java.util.ArrayList
-import java.util.HashSet
-import java.util.Set
-import scala.util.Random
-import collection.JavaConverters._
+import java.util.{ ArrayList, HashSet, Set }
+
import scala.collection
+import scala.util.Random
-import org.semanticweb.owlapi.model.OWLClassExpression
-import org.semanticweb.owlapi.model.OWLIndividual
-import org.semanticweb.owlapi.model.OWLNamedIndividual
+import collection.JavaConverters._
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.SparkSession
+import org.semanticweb.owlapi.model.{ OWLClassExpression, OWLIndividual, OWLNamedIndividual }
import net.sansa_stack.ml.spark.classification
-import net.sansa_stack.ml.spark.classification.TDTInducer.TDTInducer
-import net.sansa_stack.ml.spark.classification.KB.KB
import net.sansa_stack.ml.spark.classification.ConceptsGenerator._
-
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.SparkSession
-import org.apache.spark.{ SparkConf, SparkContext }
+import net.sansa_stack.ml.spark.classification.KB.KB
+import net.sansa_stack.ml.spark.classification.TDTInducer.TDTInducer
object ClassMembership {
@@ -51,11 +46,11 @@ object ClassMembership {
println()
println(nFolds + "-fold BOOTSTRAP Experiment on ontology: ")
- //val classifierClass: Class[_] = ClassLoader.getSystemClassLoader.loadClass(className)
+ // val classifierClass: Class[_] = ClassLoader.getSystemClassLoader.loadClass(className)
val nOfConcepts: Int = if (testConcepts != null) testConcepts.size else 1
- //var Generator: Random = new Random()
- //val ntestExs: Array[Int] = Array.ofDim[Int](nFolds)
+ // var Generator: Random = new Random()
+ // val ntestExs: Array[Int] = Array.ofDim[Int](nFolds)
// main loop on the folds
for (f <- 0 until nFolds) {
@@ -71,10 +66,10 @@ object ClassMembership {
testRDD.foreach(println(_))
val classifier: TDTInducer = new TDTInducer(k, nOfConcepts, spark)
- //val classifier: TDTInducer = new TDTInducer(k, kb.Concepts.count().toInt, spark)
- /*val cl: TDTInducer = (classifierClass.getConstructor(classOf[KB], classOf[Int]))
- .newInstance(kb, nOfConcepts).asInstanceOf[TDTInducer]*/
- //ntestExs(f) = testRDD.count.toInt
+ // val classifier: TDTInducer = new TDTInducer(k, kb.Concepts.count().toInt, spark)
+ /* val cl: TDTInducer = (classifierClass.getConstructor(classOf[KB], classOf[Int]))
+ .newInstance(kb, nOfConcepts).asInstanceOf[TDTInducer] */
+ // ntestExs(f) = testRDD.count.toInt
// training phase: using all examples but only those in the f-th partition
println("\nTraining is starting...")
@@ -85,44 +80,6 @@ object ClassMembership {
val labels: Array[Array[Int]] = classifier.test(f, testRDD, testConcepts)
} // for loop
- } //bootstrap function
- } //class
+ } // bootstrap function
+ } // class
}
-
-
-
-// for (i<- 0 until allExamples.count.toInt)
-// trainRDD.add(allExamples.takeSample(true, 1)(0))
-
-// val trainingExsSet: Set[Integer] = new HashSet[Integer]()
-// var trainRDD = spark.sparkContext.parallelize(trainingExsSet.asScala.toSeq)
-//
-// val testingExsSet: Set[Integer] = new HashSet[Integer]()
-// var testRDD = spark.sparkContext.parallelize(testingExsSet.asScala.toSeq)
-//
-// var rand1 = new ArrayList[Integer]
-// for (r <- 0 until allExamples.count.toInt)
-// rand1.add(Generator.nextInt(allExamples.count.toInt))
-//
-// var newRDD = spark.sparkContext.parallelize(rand1.asScala)
-// trainRDD.union(newRDD)
-// //trainingExsSet.add(Generator.nextInt(allExamples.count.toInt))
-//
-// var r = 0 to allExamples.count.toInt
-// var rand2 = spark.sparkContext.parallelize(r)
-//
-// if (!trainRDD.collect().contains(rand2))
-// testRDD.union(rand2.asInstanceOf[RDD[Integer]])
-
- /*for (r <- 0 until allExamples.count.toInt){
- if (!trainRDD.collect().contains(r))
- testRDD.union(r)
- }*/
-
-
- /*var trainingExs: Array[Integer] = Array.ofDim[Integer](0)
- var testExs: Array[Integer] = Array.ofDim[Integer](0)
-
- trainingExs = trainingExsSet.toArray(trainingExs)
- testExs = testingExsSet.toArray(testExs)*/
-
\ No newline at end of file
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/ConceptsGenerator.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/ConceptsGenerator.scala
index 1acdc06..b3d36ea 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/ConceptsGenerator.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/ConceptsGenerator.scala
@@ -1,25 +1,23 @@
package net.sansa_stack.ml.spark.classification
import java.util.HashSet
+
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
-import org.apache.spark.{SparkConf, SparkContext}
import org.semanticweb.HermiT.Reasoner
-import org.semanticweb.owlapi.model.OWLClassExpression
-import org.semanticweb.owlapi.model.OWLDataFactory
-import org.semanticweb.owlapi.model.OWLIndividual
-import org.semanticweb.owlapi.model.OWLNamedIndividual
+import org.semanticweb.owlapi.model.{ OWLClassExpression, OWLDataFactory, OWLIndividual, OWLNamedIndividual }
+
import net.sansa_stack.ml.spark.classification.KB.KB
-object ConceptsGenerator{
+object ConceptsGenerator {
class ConceptsGenerator(protected var kb: KB) {
protected var reasoner: Reasoner = kb.getReasoner
protected var dataFactory: OWLDataFactory = kb.getDataFactory
protected var allExamples: RDD[OWLIndividual] = kb.getIndividuals
-
+
def generateQueryConcepts(numConceptsToGenerate: Int, sc: SparkSession): Array[OWLClassExpression] = {
-
+
println("\nConcepts Generation\n-----------\n")
val queryConcept: Array[OWLClassExpression] = Array.ofDim[OWLClassExpression](numConceptsToGenerate)
val minOfSubConcepts: Int = 2
@@ -29,7 +27,7 @@ object ConceptsGenerator{
var j: Int = 0
var nextConcept: OWLClassExpression = null
var complPartialConcept: OWLClassExpression = null
- var nEx : Int = allExamples.count().toInt
+ var nEx: Int = allExamples.count().toInt
// cycle to build numConceptsToGenerate new query concepts
i = 0
while (i < numConceptsToGenerate) {
@@ -37,56 +35,50 @@ object ConceptsGenerator{
numOfSubConcepts = minOfSubConcepts + KB.generator.nextInt(maxOfSubConcepts - minOfSubConcepts)
var numPosInst: Int = 0
var numNegInst: Int = 0
-
+
// build a single new query OWLClassExpression adding conjuncts or disjuncts
do {
-
- //take the first subConcept for builiding the query OWLClassExpression
+
+ // take the first subConcept for building the query OWLClassExpression
partialConcept = kb.getRandomConcept
- //println("partial concept" + partialConcept)
+ // println("partial concept" + partialConcept)
j = 1
-
+
while (j < numOfSubConcepts) {
val newConcepts: HashSet[OWLClassExpression] = new HashSet[OWLClassExpression]()
newConcepts.add(partialConcept)
-
+
nextConcept = kb.getRandomConcept
newConcepts.add(nextConcept)
-
+
partialConcept =
- if (KB.generator.nextInt(4) == 0)
+ if (KB.generator.nextInt(4) == 0) {
dataFactory.getOWLObjectIntersectionOf(newConcepts)
- else dataFactory.getOWLObjectUnionOf(newConcepts)
- j+=1
+ } else dataFactory.getOWLObjectUnionOf(newConcepts)
+ j += 1
} // for j
-
+
println()
complPartialConcept = dataFactory.getOWLObjectComplementOf(partialConcept)
- //println("\n", complPartialConcept)
+ // println("\n", complPartialConcept)
numPosInst = reasoner.getInstances(partialConcept, false).entities().count().toInt
numNegInst = reasoner.getInstances(complPartialConcept, false).entities().count().toInt
-
+
println(partialConcept)
- println ("\n pos: " + numPosInst + ", neg: " + numNegInst + ", und: " + (nEx - numNegInst - numPosInst))
+ println("\n pos: " + numPosInst + ", neg: " + numNegInst + ", und: " + (nEx - numNegInst - numPosInst))
println()
- } while ((numPosInst < 20) || (numNegInst >3))
- // ((numPosInst < 10) || (numNegInst > 10))
- // (numPosInst * numNegInst == 0)
- //add the newly built OWLClassExpression to the list of all required query concepts
+ } while ((numPosInst < 20) || (numNegInst > 3))
+ // ((numPosInst < 10) || (numNegInst > 10))
+ // (numPosInst * numNegInst == 0)
+ // add the newly built OWLClassExpression to the list of all required query concepts
queryConcept(i) = partialConcept
- println("Query " + (i+1) + " found\n\n")
- i+=1
+ println("Query " + (i + 1) + " found\n\n")
+ i += 1
}
-
- queryConcept
+
+ queryConcept
}
-
+
}
}
-
-
- /*println("pos:%d (%3.1f)\t\t neg:%d (%3.1f)\t\t und:%d (%3.1f)\n " + numPosInst + numPosInst * 100.0 / nExs,
- numNegInst, numNegInst * 100.0 / nExs,
- (nExs - numNegInst - numPosInst),
- (nExs - numNegInst - numPosInst) * 100.0 / nExs)*/
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/DLTree.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/DLTree.scala
index 47b6338..1144421 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/DLTree.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/DLTree.scala
@@ -1,15 +1,14 @@
package net.sansa_stack.ml.spark.classification
-import java.util.ArrayList
-import java.util.List
+import java.util.{ArrayList, List}
import collection.JavaConverters._
-
-import net.sansa_stack.ml.spark.classification._
-import org.semanticweb.owlapi.model.OWLClassExpression
-import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
+import org.semanticweb.owlapi.model.OWLClassExpression
+
+import net.sansa_stack.ml.spark.classification._
+
/*
* Class for basic functions of DL trees
@@ -99,8 +98,7 @@ class DLTree {
* function to get the number of nodes
*/
- /*
- def getNodi(sc: SparkSession): Double = {
+ /* def getNodi(sc: SparkSession): Double = {
// visit in to make the count
val lista: ArrayList[DLNode] = new ArrayList[DLNode]()
@@ -145,10 +143,6 @@ class DLTree {
}
num
}
-
-
-
- def getComplexityMeasure(sc: SparkSession) : Double = getNodi(sc)*/
+ def getComplexityMeasure(sc: SparkSession) : Double = getNodi(sc) */
}
-
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/KB.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/KB.scala
index 4cdd1d9..c2ee415 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/KB.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/KB.scala
@@ -5,29 +5,29 @@ import java.net.URI
import java.util.{ ArrayList, List, Random }
import java.util.stream.{ Collectors, IntStream, Stream }
-import scala.collection.JavaConversions._
-import collection.JavaConverters._
import scala.collection.{ Iterator, Map }
+import scala.collection.JavaConverters._
import scala.collection.immutable.{ HashMap, Set }
+import collection.JavaConverters._
+import net.sansa_stack.owl.spark.rdd.OWLAxiomsRDD
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.SparkSession
+import org.semanticweb.HermiT.{ Configuration, Reasoner, ReasonerFactory }
import org.semanticweb.owlapi.apibinding.OWLManager
import org.semanticweb.owlapi.model._
-import org.semanticweb.owlapi.util.SimpleIRIMapper
import org.semanticweb.owlapi.reasoner.{ OWLReasoner, OWLReasonerFactory }
import org.semanticweb.owlapi.reasoner.structural.StructuralReasonerFactory
+import org.semanticweb.owlapi.util.SimpleIRIMapper
-import org.semanticweb.HermiT.{ Configuration, Reasoner, ReasonerFactory }
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.SparkSession
-import net.sansa_stack.owl.spark.rdd.OWLAxiomsRDD
object KB {
val d: Double = 0.3
var generator: Random = new Random(2)
- /*
- * The class to define the Knowledgebase elements
- */
+ /**
+ * The class to define the Knowledgebase elements
+ */
class KB(var UrlOwlFile: String, rdd: OWLAxiomsRDD, sparkSession: SparkSession) {
@@ -82,7 +82,7 @@ object KB {
val Concepts2: RDD[OWLClass] = rdd.flatMap {
case axiom: HasClassesInSignature => axiom.classesInSignature().iterator().asScala
- case _ => null
+ case _ => null
}.filter(_ != null).distinct()
Concepts = Concepts2
@@ -96,7 +96,7 @@ object KB {
val Roles2: RDD[OWLObjectProperty] = rdd.map {
case axiom: HasProperty[OWLObjectProperty] => axiom.getProperty
- case _ => null
+ case _ => null
}.filter(_ != null).distinct()
Roles = Roles2
@@ -110,7 +110,7 @@ object KB {
val Properties2: RDD[OWLDataProperty] = rdd.flatMap {
case axiom: HasDataPropertiesInSignature => axiom.dataPropertiesInSignature().iterator().asScala
- case _ => null
+ case _ => null
}.filter(_ != null).distinct()
Properties = Properties2
@@ -124,7 +124,7 @@ object KB {
val Examples2: RDD[OWLNamedIndividual] = rdd.flatMap {
case axiom: HasIndividualsInSignature => axiom.individualsInSignature().collect(Collectors.toSet()).asScala
- case _ => null
+ case _ => null
}.filter(_ != null).distinct()
Examples = Examples2.asInstanceOf[RDD[OWLIndividual]]
@@ -163,10 +163,12 @@ object KB {
p = p + 1
} else {
if (!flag) {
- if (r.isEntailed(getDataFactory.getOWLClassAssertionAxiom(negTestConcepts(c), ind)))
+ if (r.isEntailed(getDataFactory.getOWLClassAssertionAxiom(negTestConcepts(c), ind))) {
classifications(c)(e) = -1
- } else
+ }
+ } else {
classifications(c)(e) = -1
+ }
n = n + 1
}
@@ -231,7 +233,7 @@ object KB {
def getReasoner(): Reasoner = hermit
- //def getURL(): String = urlOwlFile
+ // def getURL(): String = urlOwlFile
def getRandomProperty(numQueryProperty: Int): Array[Int] = {
@@ -277,11 +279,12 @@ object KB {
val role: OWLObjectProperty = Roles.takeSample(true, 1)(0)
newConcept =
- if (KB.generator.nextDouble() < 0.5)
+ if (KB.generator.nextDouble() < 0.5) {
dataFactory.getOWLObjectAllValuesFrom(role, newConceptBase)
- else dataFactory.getOWLObjectSomeValuesFrom(role, newConceptBase)
- } else
+ } else dataFactory.getOWLObjectSomeValuesFrom(role, newConceptBase)
+ } else {
newConcept = dataFactory.getOWLObjectComplementOf(newConceptBase)
+ }
}
}
} while (!reasoner.isSatisfiable(newConcept))
@@ -301,12 +304,13 @@ object KB {
val role: OWLObjectProperty = Roles.takeSample(true, 1)(0)
newConcept =
- if (KB.generator.nextDouble() < d)
+ if (KB.generator.nextDouble() < d) {
dataFactory.getOWLObjectAllValuesFrom(role, newConceptBase)
- else dataFactory.getOWLObjectSomeValuesFrom(role, newConceptBase)
+ } else dataFactory.getOWLObjectSomeValuesFrom(role, newConceptBase)
}
- } else
+ } else {
newConcept = dataFactory.getOWLObjectComplementOf(newConcept)
+ }
} while (!reasoner.isSatisfiable(newConcept))
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/PerformanceMetrics.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/PerformanceMetrics.scala
index 9226568..dd9387f 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/PerformanceMetrics.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/PerformanceMetrics.scala
@@ -1,5 +1,4 @@
package net.sansa_stack.ml.spark.classification
object PerformanceMetrics {
-
-}
\ No newline at end of file
+}
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/RefinementOperator.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/RefinementOperator.scala
index ea1b1bd..2407956 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/RefinementOperator.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/RefinementOperator.scala
@@ -6,11 +6,12 @@ import java.util.stream.{ Collectors, Stream }
import scala.collection.JavaConverters._
import scala.util.Random
+import org.apache.spark.rdd.RDD
import org.semanticweb.owlapi.model._
import org.semanticweb.owlapi.search.EntitySearcher
+
import net.sansa_stack.ml.spark.classification._
import net.sansa_stack.ml.spark.classification.KB.KB
-import org.apache.spark.rdd.RDD
object RefinementOperator {
val d: Double = 0.5
@@ -25,9 +26,9 @@ class RefinementOperator(var kb: KB) {
private var Properties: RDD[OWLDataProperty] = kb.getDataProperties
private var dataFactory: OWLDataFactory = kb.getDataFactory
- /*
- * Function to generate subsumed random concepts
- */
+ /**
+ * Function to generate subsumed random concepts
+ */
def getSubsumedRandomConcept(currentConcept: OWLClassExpression): OWLClassExpression = {
val generator: Random = new Random()
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TDTClassifiers.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TDTClassifiers.scala
index 136e1af..9d5722f 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TDTClassifiers.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TDTClassifiers.scala
@@ -1,28 +1,24 @@
package net.sansa_stack.ml.spark.classification
-import java.util.ArrayList
-import java.util.HashSet
-import java.util.Iterator
-import java.util.List
-import collection.JavaConverters._
+import java.util.{ ArrayList, HashSet, Iterator, List }
+
import scala.util.control.Breaks._
+import collection.JavaConverters._
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.SparkSession
import org.semanticweb.owlapi.model.OWLClassExpression
import org.semanticweb.owlapi.model.OWLDataFactory
+import org.semanticweb.owlapi.model.OWLEquivalentClassesAxiom
import org.semanticweb.owlapi.model.OWLIndividual
-import org.semanticweb.owlapi.model.OWLObjectProperty
import org.semanticweb.owlapi.model.OWLObjectAllValuesFrom
-import org.semanticweb.owlapi.model.OWLObjectSomeValuesFrom
import org.semanticweb.owlapi.model.OWLObjectIntersectionOf
-import org.semanticweb.owlapi.model.OWLEquivalentClassesAxiom
-//import org.semanticweb.owlapi.model.IRI
+import org.semanticweb.owlapi.model.OWLObjectProperty
+import org.semanticweb.owlapi.model.OWLObjectSomeValuesFrom
-import net.sansa_stack.ml.spark.classification
-import net.sansa_stack.ml.spark.classification.KB.KB
import net.sansa_stack.ml.spark.classification._
+import net.sansa_stack.ml.spark.classification.KB.KB
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.SparkSession
/*
* Terminological Decision Tree Classifier
@@ -30,7 +26,8 @@ import org.apache.spark.sql.SparkSession
object TDTClassifiers {
- /* L for the left branch and R for the right branch
+ /**
+ * L for the left branch and R for the right branch
* P, N, U for postive, negative and unlabeled respectively
*/
@@ -40,7 +37,7 @@ object TDTClassifiers {
val PR: Int = 3
val NR: Int = 4
val UR: Int = 5
-
+
class TDTClassifiers(var k: KB, var sc: SparkSession) {
/**
@@ -57,12 +54,13 @@ object TDTClassifiers {
* @return
*/
- def induceDLTree(father: OWLClassExpression,
- posExs: RDD[String], negExs: RDD[String], undExs: RDD[String],
- nRefs: Int, prPos: Double, prNeg: Double): DLTree = {
+ def induceDLTree(
+ father: OWLClassExpression,
+ posExs: RDD[String], negExs: RDD[String], undExs: RDD[String],
+ nRefs: Int, prPos: Double, prNeg: Double): DLTree = {
val THRESHOLD: Double = 0.05
- val tree: DLTree = new DLTree()
+ val tree: DLTree = new DLTree()
if (posExs.count.toInt == 0 && negExs.count.toInt == 0) // There is no examples
if (prPos >= prNeg) { // prior majority of positives
@@ -80,12 +78,10 @@ object TDTClassifiers {
val total = numPos + numNeg
var perPos: Double = 0
var perNeg: Double = 0
- if (total !=0){
+ if (total != 0) {
perPos = numPos / total
perNeg = numNeg / total
- }
- else
- return tree
+ } else return tree
println("\nnew per Pos: " + perPos)
println("new per Neg: " + perNeg)
@@ -94,14 +90,13 @@ object TDTClassifiers {
tree.setRoot(k.getDataFactory().getOWLThing) // set positive leaf
println("-----\nPostive leaf (prior2)")
return tree
- }
- else if (perPos == 0 && perNeg > THRESHOLD) { // no positive
+ } else if (perPos == 0 && perNeg > THRESHOLD) { // no positive
tree.setRoot(k.getDataFactory().getOWLNothing); // set negative leaf
println("-----\nNegative leaf (prior2)\n");
return tree
}
- // else (a non-leaf node) ...
+ // else (a non-leaf node)
// generate set of concepts
val Con: RDD[OWLClassExpression] = generateRefs(k, father, nRefs, posExs, negExs)
@@ -109,56 +104,53 @@ object TDTClassifiers {
// select best partitioning node concept
val bestConcept: OWLClassExpression = selectBestConcept(k, Con, posExs, negExs, undExs, prPos, prNeg)
-
- if (bestConcept != null){
-
- val sNode = split(k, bestConcept, posExs, negExs, undExs)
-
- // set the root concept
- tree.setRoot(bestConcept.getNNF)
-
- // sNode._1._1 = PosEL, sNode._2._1 = NegEL, sNode._3._1 = undEL
- // sNode._1._2 = PosER, sNode._2._2 = NegER, sNode._3._2 = undER
-
-
- // build subtrees
-
+
+ if (bestConcept != null) {
+
+ val sNode = split(k, bestConcept, posExs, negExs, undExs)
+
+ // set the root concept
+ tree.setRoot(bestConcept.getNNF)
+
+ // sNode._1._1 = PosEL, sNode._2._1 = NegEL, sNode._3._1 = undEL
+ // sNode._1._2 = PosER, sNode._2._2 = NegER, sNode._3._2 = undER
+
+ // build subtrees
+
println("\nStart Positive tree \n----------")
tree.setPosTree(induceDLTree(bestConcept, sNode._1._1, sNode._2._1, sNode._3._1, nRefs, prPos, prNeg))
-
+
println("\nStart Negative tree \n----------")
tree.setNegTree(induceDLTree(bestConcept.getComplementNNF, sNode._1._2, sNode._2._2, sNode._3._2, nRefs, prPos, prNeg))
-
return tree
- }
- else
- return null
+ } else return null
}
-
+
/**
* recursive down through the tree model
* @param ind
* @param tree
* @return
*/
- def classify(ind: OWLIndividual, tree: DLTree): Int = {
+ def classify(ind: OWLIndividual, tree: DLTree): Int = {
val rootClass: OWLClassExpression = tree.getRoot
println("\nrootClass " + rootClass)
-
+
val negRootClass: OWLClassExpression = k.getDataFactory.getOWLObjectComplementOf(rootClass)
println("negRootClass " + negRootClass)
-
+
if (rootClass.equals(k.getDataFactory.getOWLThing)) return +1
if (rootClass.equals(k.getDataFactory.getOWLNothing)) return -1
var r1: Int = 0
var r2: Int = 0
- if (k.getReasoner.isEntailed(k.getDataFactory.getOWLClassAssertionAxiom(rootClass, ind)))
+ if (k.getReasoner.isEntailed(k.getDataFactory.getOWLClassAssertionAxiom(rootClass, ind))) {
r1 = classify(ind, tree.getPosSubTree)
- else if (k.getReasoner.isEntailed(k.getDataFactory.getOWLClassAssertionAxiom(negRootClass, ind)))
+ } else if (k.getReasoner.isEntailed(k.getDataFactory.getOWLClassAssertionAxiom(negRootClass, ind))) {
r2 = classify(ind, tree.getNegSubTree)
+ }
var cP: Int = 0
var cn: Int = 0
@@ -169,7 +161,7 @@ object TDTClassifiers {
if (missingVForTDT) {
cP += classify(ind, tree.getPosSubTree)
cn -= classify(ind, tree.getNegSubTree)
-
+
if (cP > (-1 * cn)) return +1
else if (cP < (-1 * cn)) return -1
else return 0
@@ -178,380 +170,367 @@ object TDTClassifiers {
else if ((r1 != 0)) r1
else r2
}
-
-
- /**
- * @param know
- * @param concept
- * @param dim
- * @param posExs
- * @param negExs
- * @return
- */
- private def generateRefs(know: KB, concept: OWLClassExpression, dim: Int, posExs: RDD[String],
- negExs: RDD[String]): RDD[OWLClassExpression] = {
-
- println("\nGenerating node concepts: \n ")
- var rConcepts: Array[OWLClassExpression] = Array.ofDim[OWLClassExpression](dim)
- var newConcept: OWLClassExpression = null
- var refinement: OWLClassExpression = null
- var emptyIntersection: Boolean = false
-
- //val conceptExp = concept.nestedClassExpressions.iterator().asScala.toArray
- val C = concept.asConjunctSet()
- val ConceptExp = concept.asConjunctSet().iterator().asScala.toSeq
- //println("\nconcept set " + C )
-
- for (c <- 0 until dim) {
-
- do {
- emptyIntersection = false //true
- val Concepts: HashSet[OWLClassExpression] = new HashSet[OWLClassExpression]()
-
- if (concept.equals(know.getDataFactory().getOWLThing))
- refinement = new RefinementOperator(know).getRandomConcept(know)
- else
- refinement = new RefinementOperator(know).getSubsumedRandomConcept(concept)
-
- /* val con: OWLEquivalentClassesAxiom = know.dataFactory.getOWLEquivalentClassesAxiom(concept)
+
+ /**
+ * @param know
+ * @param concept
+ * @param dim
+ * @param posExs
+ * @param negExs
+ * @return
+ */
+ private def generateRefs(know: KB, concept: OWLClassExpression, dim: Int, posExs: RDD[String],
+ negExs: RDD[String]): RDD[OWLClassExpression] = {
+
+ println("\nGenerating node concepts: \n ")
+ var rConcepts: Array[OWLClassExpression] = Array.ofDim[OWLClassExpression](dim)
+ var newConcept: OWLClassExpression = null
+ var refinement: OWLClassExpression = null
+ var emptyIntersection: Boolean = false
+
+ // val conceptExp = concept.nestedClassExpressions.iterator().asScala.toArray
+ val C = concept.asConjunctSet()
+ val ConceptExp = concept.asConjunctSet().iterator().asScala.toSeq
+ // println("\nconcept set " + C )
+
+ for (c <- 0 until dim) {
+
+ do {
+ emptyIntersection = false // true
+ val Concepts: HashSet[OWLClassExpression] = new HashSet[OWLClassExpression]()
+
+ if (concept.equals(know.getDataFactory().getOWLThing)) {
+ refinement = new RefinementOperator(know).getRandomConcept(know)
+ } else {
+ refinement = new RefinementOperator(know).getSubsumedRandomConcept(concept)
+ }
+
+ /* val con: OWLEquivalentClassesAxiom = know.dataFactory.getOWLEquivalentClassesAxiom(concept)
val conExp: Array[OWLClassExpression] = con.classExpressions.iterator().asScala.toArray
println("Concept Expressions = " )
- conExp.foreach(println(_))*/
-
- val refInstance: Boolean = refinement.isInstanceOf[OWLObjectAllValuesFrom]
- breakable{
-
- for (i <- ConceptExp)
- {
- if (i.isInstanceOf[OWLObjectSomeValuesFrom]){
- val y: OWLObjectSomeValuesFrom = i.asInstanceOf[OWLObjectSomeValuesFrom]
- val conprop: OWLObjectProperty = y.getProperty.getNamedProperty
- val confiller : OWLClassExpression = y.getFiller
- /*println("============================")
+ conExp.foreach(println(_)) */
+
+ val refInstance: Boolean = refinement.isInstanceOf[OWLObjectAllValuesFrom]
+ breakable {
+
+ for (i <- ConceptExp) {
+ if (i.isInstanceOf[OWLObjectSomeValuesFrom]) {
+ val y: OWLObjectSomeValuesFrom = i.asInstanceOf[OWLObjectSomeValuesFrom]
+ val conprop: OWLObjectProperty = y.getProperty.getNamedProperty
+ val confiller: OWLClassExpression = y.getFiller
+ /* println("============================")
println("concept property = " + conprop)
- println("concept filler = " + confiller)*/
-
- if (refInstance){
- val x : OWLObjectAllValuesFrom = refinement.asInstanceOf[OWLObjectAllValuesFrom]
- val rprop: OWLObjectProperty = x.getProperty.getNamedProperty
- val rfiller: OWLClassExpression = x.getFiller
- // println("refienment property = " + rprop)
- //println("refienment filler = " + rfiller)
- if (conprop == rprop) break
-
+ println("concept filler = " + confiller) */
+
+ if (refInstance) {
+ val x: OWLObjectAllValuesFrom = refinement.asInstanceOf[OWLObjectAllValuesFrom]
+ val rprop: OWLObjectProperty = x.getProperty.getNamedProperty
+ val rfiller: OWLClassExpression = x.getFiller
+ // println("refienment property = " + rprop)
+ // println("refienment filler = " + rfiller)
+ if (conprop == rprop) break
+
+ }
+ }
+ }
+ if ((!(ConceptExp.contains(refinement)))) {
+ Concepts.add(concept)
+ Concepts.add(refinement)
+ newConcept = know.getDataFactory.getOWLObjectIntersectionOf(Concepts)
+ if (newConcept != null) {
+ emptyIntersection = !know.getReasoner.isSatisfiable(newConcept)
}
}
}
- if ((!(ConceptExp.contains(refinement))))
- {
- Concepts.add(concept)
- Concepts.add(refinement)
- newConcept = know.getDataFactory.getOWLObjectIntersectionOf(Concepts)
- if (newConcept != null)
- emptyIntersection = !know.getReasoner.isSatisfiable(newConcept)
- }
- }
-
-
- } while (emptyIntersection )
-
- rConcepts(c) =
- if (newConcept != null) newConcept
- else concept
-
+
+ } while (emptyIntersection)
+
+ rConcepts(c) =
+ if (newConcept != null) newConcept
+ else concept
+
+ }
+ var Refs: RDD[OWLClassExpression] = sc.sparkContext.parallelize(rConcepts)
+ var nRef = Refs.distinct().count.toInt
+ println("\nNo. of generated concepts: " + nRef)
+ Refs.distinct()
}
- var Refs: RDD[OWLClassExpression] = sc.sparkContext.parallelize(rConcepts)
- var nRef = Refs.distinct().count.toInt
- println("\nNo. of generated concepts: " + nRef)
- Refs.distinct()
- }
-
- //val iterator: Iterator[OWLIndividual] = know.getReasoner().getInstances(newConcept, false).entities().iterator().asInstanceOf[Iterator[OWLIndividual]]
- //val nextInd : OWLIndividual = iterator.next()
-
- /**
- * Selecting the best in a list (RDD) of refinements
- * @param know
- * @param concepts
- * @param posExs
- * @param negExs
- * @param undExs
- * @param prPos
- * @param prNeg
- * @return
- */
- def selectBestConcept(know: KB,
- concepts: RDD[OWLClassExpression],
- posExs: RDD[String],
- negExs: RDD[String],
- undExs: RDD[String],
- prPos: Double, prNeg: Double): OWLClassExpression = {
+ // val iterator: Iterator[OWLIndividual] = know.getReasoner().getInstances(newConcept, false).entities().iterator().asInstanceOf[Iterator[OWLIndividual]]
+ // val nextInd : OWLIndividual = iterator.next()
- var bestConceptIndex: Int = 0
+ /**
+ * Selecting the best in a list (RDD) of refinements
+ * @param know
+ * @param concepts
+ * @param posExs
+ * @param negExs
+ * @param undExs
+ * @param prPos
+ * @param prNeg
+ * @return
+ */
- println("\nThe First concept is: " + concepts.first())
- var counts: Array[Int] = getSplitCounts(know, concepts.first(), posExs, negExs, undExs)
+ def selectBestConcept(
+ know: KB,
+ concepts: RDD[OWLClassExpression],
+ posExs: RDD[String],
+ negExs: RDD[String],
+ undExs: RDD[String],
+ prPos: Double, prNeg: Double): OWLClassExpression = {
+
+ var bestConceptIndex: Int = 0
- println("\nPL:" +counts(0) +",\t NL:" + counts(1) + ",\t UL:" + counts(2) + ",\tPR:" + counts(3) +
+ println("\nThe First concept is: " + concepts.first())
+ var counts: Array[Int] = getSplitCounts(know, concepts.first(), posExs, negExs, undExs)
+
+ println("\nPL:" + counts(0) + ",\t NL:" + counts(1) + ",\t UL:" + counts(2) + ",\tPR:" + counts(3) +
",\tNR:" + counts(4) + ",\tUR:" + counts(5))
- //var bestGain: Double = gain(counts, prPos, prNeg)
- var bestGain: Double = gain(counts)
- println("\nCurrent gain: "+ bestGain)
+ // var bestGain: Double = gain(counts, prPos, prNeg)
+ var bestGain: Double = gain(counts)
+ println("\nCurrent gain: " + bestGain)
- for (c <- 1 until concepts.count.toInt) {
+ for (c <- 1 until concepts.count.toInt) {
- var nConcept = concepts.take(concepts.count.toInt).apply(c)
- println("\nConcept " + (c+1) +" is: " + nConcept)
+ var nConcept = concepts.take(concepts.count.toInt).apply(c)
+ println("\nConcept " + (c + 1) + " is: " + nConcept)
- counts = getSplitCounts(know, nConcept, posExs, negExs, undExs)
- println("\nPL:" +counts(0) +",\t NL:" + counts(1) + ",\t UL:" + counts(2) + ",\tPR:" + counts(3) +
- ",\tNR:" + counts(4) + ",\tUR:" + counts(5))
+ counts = getSplitCounts(know, nConcept, posExs, negExs, undExs)
+ println("\nPL:" + counts(0) + ",\t NL:" + counts(1) + ",\t UL:" + counts(2) + ",\tPR:" + counts(3) +
+ ",\tNR:" + counts(4) + ",\tUR:" + counts(5))
- //var thisGain: Double = gain(counts, prPos, prNeg)
- var thisGain: Double = gain(counts)
- println("\nCurrent gain: " + thisGain)
+ // var thisGain: Double = gain(counts, prPos, prNeg)
+ var thisGain: Double = gain(counts)
+ println("\nCurrent gain: " + thisGain)
- if (thisGain > bestGain) {
- bestConceptIndex = c
- bestGain = thisGain
+ if (thisGain > bestGain) {
+ bestConceptIndex = c
+ bestGain = thisGain
+ }
+ }
+
+ val nCpt = concepts.take(concepts.count.toInt).apply(bestConceptIndex)
+
+ if (bestGain == 0.0) {
+ null
+ // val parts = nCpt.nestedClassExpressions.iterator().asScala.toList
+ // val ref = parts.last
+ // val x = parts.filterNot(elem => elem == ref)
+ // println("refienment removed: ")
+ // x.foreach(println(_))
+ // var y: ArrayList[OWLClassExpression] = new ArrayList()
+ // var i = 0
+ // while (i< x.size)
+ // {
+ // val z = x.get(i)
+ // y.add(z)
+ // i = i+1
+ // }
+ //
+ // nCpt
+ } else {
+ println("\n --------\nBest gain: " + bestGain + " \t Split index: " + bestConceptIndex)
+ println("\nPL:" + counts(0) + ",\t NL:" + counts(1) + ",\t UL:" + counts(2) + ",\tPR:" + counts(3) +
+ ",\tNR:" + counts(4) + ",\tUR:" + counts(5))
+
+ println("\n Best concept is: " + nCpt)
+ nCpt
}
}
-
- val nCpt = concepts.take(concepts.count.toInt).apply(bestConceptIndex)
-
- if (bestGain == 0.0) {
- null
-// val parts = nCpt.nestedClassExpressions.iterator().asScala.toList
-// val ref = parts.last
-// val x = parts.filterNot(elem => elem == ref)
-// println("refienment removed: ")
-// x.foreach(println(_))
-// var y: ArrayList[OWLClassExpression] = new ArrayList()
-// var i = 0
-// while (i< x.size)
-// {
-// val z = x.get(i)
-// y.add(z)
-// i = i+1
-// }
-//
-// nCpt
- }
- else {
- println("\n --------\nBest gain: " + bestGain + " \t Split index: " + bestConceptIndex)
- println("\nPL:" +counts(0) +",\t NL:" + counts(1) + ",\t UL:" + counts(2) + ",\tPR:" + counts(3) +
- ",\tNR:" + counts(4) + ",\tUR:" + counts(5))
-
- println("\n Best concept is: " + nCpt)
- nCpt
- }
- }
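A minimal sketch (not part of the patch): the loop above re-materialises the candidate RDD on every iteration through concepts.take(concepts.count.toInt). Assuming the same gain and getSplitCounts helpers, the highest-gain candidate could equivalently be picked after a single collect:

    // collect the (typically small) list of candidate refinements once on the driver
    val candidates: Array[OWLClassExpression] = concepts.collect()
    // keep the candidate whose split maximises the gain (the method above additionally
    // returns null when the best gain is 0.0)
    val best = candidates.maxBy(c => gain(getSplitCounts(know, c, posExs, negExs, undExs)))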
- /**
- * @param counts
- * @return The calculated Gain
- */
+ /**
+ * @param counts the six split counters (PL, NL, UL, PR, NR, UR)
+ * @return the calculated gain
+ */
- /*
- * Function to calculate the gain
- */
-
- private def gain(counts: Array[Int]): Double = {
-
- var gain: Double = 0.0
- val totalL: Double = counts(PL) + counts(NL) + 0.001
- val totalR: Double = counts(PR) + counts(NR) + 0.001
- val total: Double = totalL + totalR
- val pPL: Double = counts(PL) / totalL
- val pPR: Double = counts(PR) / totalR
- val pNL: Double = counts(NL) / totalL
- val pNR: Double = counts(NR) / totalR
-
- if (Math.abs(pPL + pPR) != 0 && Math.abs(pNL + pNR) != 0 )
- {
- gain = (totalL / total) * (totalR / total) *
+ /**
+ * Function to calculate the gain
+ */
+ private def gain(counts: Array[Int]): Double = {
+
+ var gain: Double = 0.0
+ val totalL: Double = counts(PL) + counts(NL) + 0.001
+ val totalR: Double = counts(PR) + counts(NR) + 0.001
+ val total: Double = totalL + totalR
+ val pPL: Double = counts(PL) / totalL
+ val pPR: Double = counts(PR) / totalR
+ val pNL: Double = counts(NL) / totalL
+ val pNR: Double = counts(NR) / totalR
+
+ if (Math.abs(pPL + pPR) != 0 && Math.abs(pNL + pNR) != 0) {
+ gain = (totalL / total) * (totalR / total) *
Math.pow(Math.abs(pPL - pPR) / Math.abs(pPL + pPR) + Math.abs(pNL - pNR) / Math.abs(pNL + pNR), 2)
+ }
+ gain
+
}
-
- gain
-
- }
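A worked example of the split measure above (illustrative only, with counts ordered PL, NL, UL, PR, NR, UR as the log output suggests):

    // a split sending all 8 positives left and all 4 negatives right:
    //   counts = Array(8, 0, 0, 0, 4, 0)
    //   totalL = 8.001, totalR = 4.001, pPL ≈ 1, pNR ≈ 1, pPR = pNL = 0
    //   gain  ≈ (8.001 / 12.002) * (4.001 / 12.002) * (1 + 1)^2 ≈ 0.89
    // a split that leaves every example on one side yields a value close to zero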
-
-
- /**
- * @param counts
- * @param prPos
- * @param prNeg
- * @return The calculated Gain
- */
- /*
- * Function to calculate the gain based on gini index
- */
+ /**
+ * @param counts
+ * @param prPos
+ * @param prNeg
+ * @return The calculated Gain
+ */
+
+ /**
+ * Function to calculate the gain based on gini index
+ */
+
+ /* def gain(counts: Array[Int], prPos: Double, prNeg: Double): Double = {
- /* def gain(counts: Array[Int], prPos: Double, prNeg: Double): Double = {
-
val Trsize: Double = counts(0) + counts(1)
val Flsize: Double = counts(3) + counts(4)
val Usize: Double = counts(2) + counts(5)// + counts(6) + counts(7)
-
+
val size: Double = Trsize + Flsize + Usize
-
- val startImpurity : Double = gini(counts(0) + counts(3), counts(1) + counts(4), prPos, prNeg)
-
+ val startImpurity : Double = gini(counts(0) + counts(3), counts(1) + counts(4), prPos, prNeg)
val TrImpurity = gini(counts(0), counts(1), prPos, prNeg)
val FlImpurity = gini(counts(3), counts(4), prPos, prNeg)
val UImpurity = gini(counts(2) , counts(5), prPos, prNeg) //counts(2)+ counts(6), counts(5) + counts(7)
-
+
val Gainval = startImpurity - (Trsize/size)*TrImpurity - (Flsize/size)*FlImpurity - -(Usize/size)*UImpurity
-
+
Gainval
}
-
+
def gini(nPos: Double, nNeg: Double, prPos: Double, prNeg: Double): Double = {
-
+
val estimatProp : Int = 3
val total: Double = nPos + nNeg
-
+
val p1 : Double = (nPos*estimatProp*prPos)/(total+estimatProp)
val p2: Double = (nNeg*estimatProp*prNeg)/(total+estimatProp)
-
+
val ginival = 1.0-p1*p1-p2*p2
ginival
- }*/
-
-
-
- /**
- * @param know
- * @param concept
- * @param posExs
- * @param negExs
- * @param undExs
- * @return
- */
+ } */
- private def getSplitCounts(know: KB,
- concept: OWLClassExpression,
- posExs: RDD[String],
- negExs: RDD[String],
- undExs: RDD[String]): Array[Int] = {
-
- val counts: Array[Int] = Array.ofDim[Int](6)
-
- val Pos = splitGroup(know, concept, posExs)
- val Neg = splitGroup(know, concept, negExs)
- val Und = splitGroup(know, concept, undExs)
-
- counts(PL) = Pos._1.count.toInt
- counts(NL) = Neg._1.count.toInt
- counts(UL) = Und._1.count.toInt
- counts(PR) = Pos._2.count.toInt
- counts(NR) = Neg._2.count.toInt
- counts(UR) = Und._2.count.toInt
-
- counts
- }
+ /**
+ * Computes the split counts for a candidate concept
+ * @param know the knowledge base
+ * @param concept the splitting concept
+ * @param posExs positive examples
+ * @param negExs negative examples
+ * @param undExs undefined examples
+ * @return the six counters PL, NL, UL, PR, NR, UR
+ */
- /**
- * @param know
- * @param concept
- * @param nodeExamples
- * @param leftExs
- * @param rightExs
- */
- private def splitGroup(know: KB,
- concept: OWLClassExpression,
- nodeExamples: RDD[String]): (RDD[String], RDD[String]) = {
+ private def getSplitCounts(
+ know: KB,
+ concept: OWLClassExpression,
+ posExs: RDD[String],
+ negExs: RDD[String],
+ undExs: RDD[String]): Array[Int] = {
+
+ val counts: Array[Int] = Array.ofDim[Int](6)
- /*println("\nNode examples: \n ----------")
- nodeExamples.take(nodeExamples.count.toInt).foreach(println(_))*/
+ val Pos = splitGroup(know, concept, posExs)
+ val Neg = splitGroup(know, concept, negExs)
+ val Und = splitGroup(know, concept, undExs)
- val negConcept: OWLClassExpression = know.getDataFactory.getOWLObjectComplementOf(concept)
-
- var Left = new ArrayList[String]()
- var Right = new ArrayList[String]()
+ counts(PL) = Pos._1.count.toInt
+ counts(NL) = Neg._1.count.toInt
+ counts(UL) = Und._1.count.toInt
+ counts(PR) = Pos._2.count.toInt
+ counts(NR) = Neg._2.count.toInt
+ counts(UR) = Und._2.count.toInt
- for (e <- 0 until nodeExamples.count.toInt) {
+ counts
+ }
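A note on the layout assumed here (matching the log output above), where the six counters are indexed by the PL, NL, UL, PR, NR, UR constants:

    // counts = Array(posLeft, negLeft, undLeft, posRight, negRight, undRight)
    // "left"  = examples entailed to be instances of the concept
    // "right" = examples entailed to be instances of its complement
    // examples for which neither is entailed are counted on both sides (see splitGroup below)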
+
+ /**
+ * Splits the examples attached to a node according to a concept
+ * @param know the knowledge base
+ * @param concept the splitting concept
+ * @param nodeExamples the examples attached to the current node
+ * @return a (left, right) pair: examples entailed as instances of the concept go left,
+ * examples entailed as instances of its complement go right, undecided examples go to both
+ */
+ private def splitGroup(
+ know: KB,
+ concept: OWLClassExpression,
+ nodeExamples: RDD[String]): (RDD[String], RDD[String]) = {
+
+ /* println("\nNode examples: \n ----------")
+ nodeExamples.take(nodeExamples.count.toInt).foreach(println(_)) */
- val nodeEx = nodeExamples.take(e + 1).apply(e)
- val nodeInd = know.getDataFactory().getOWLNamedIndividual(nodeEx).asInstanceOf[OWLIndividual]
+ val negConcept: OWLClassExpression = know.getDataFactory.getOWLObjectComplementOf(concept)
- if (know.getReasoner().isEntailed(know.getDataFactory.getOWLClassAssertionAxiom(concept, nodeInd))) {
+ var Left = new ArrayList[String]()
+ var Right = new ArrayList[String]()
+
+ for (e <- 0 until nodeExamples.count.toInt) {
+
+ val nodeEx = nodeExamples.take(e + 1).apply(e)
+ val nodeInd = know.getDataFactory().getOWLNamedIndividual(nodeEx).asInstanceOf[OWLIndividual]
+
+ if (know.getReasoner().isEntailed(know.getDataFactory.getOWLClassAssertionAxiom(concept, nodeInd))) {
Left.add(nodeEx)
-
- } else if (know.getReasoner().isEntailed(know.getDataFactory.getOWLClassAssertionAxiom(negConcept, nodeInd))) {
+
+ } else if (know.getReasoner().isEntailed(know.getDataFactory.getOWLClassAssertionAxiom(negConcept, nodeInd))) {
Right.add(nodeEx)
-
- } else {
+
+ } else {
Left.add(nodeEx)
Right.add(nodeEx)
+ }
}
- }
- val leftRDD = sc.sparkContext.parallelize(Left.asScala)
- val rightRDD = sc.sparkContext.parallelize(Right.asScala)
+ val leftRDD = sc.sparkContext.parallelize(Left.asScala)
+ val rightRDD = sc.sparkContext.parallelize(Right.asScala)
- /*println("\nleft ex: ")
+ /* println("\nleft ex: ")
leftRDD.take(20).foreach(println(_))
println("\nright ex: ")
- rightRDD.take(20).foreach(println(_))*/
-
- (leftRDD, rightRDD)
-
-
- //val propName: RDD[String] = know.getIndividuals().map( ind => ind.asOWLNamedIndividual().getIRI.getShortForm)
- // println("\n nodeEx = " + nodeEx )
- //val Filtered = know.getIndividuals().filter(_ == nodeInd)
+ rightRDD.take(20).foreach(println(_)) */
+
+ (leftRDD, rightRDD)
+
+ // val propName: RDD[String] = know.getIndividuals().map( ind => ind.asOWLNamedIndividual().getIRI.getShortForm)
+ // println("\n nodeEx = " + nodeEx )
+ // val Filtered = know.getIndividuals().filter(_ == nodeInd)
// println("\n filtered = " )
// Filtered.take(10).foreach(println(_))
- //val exIndex = ex.lookup(e)
+ // val exIndex = ex.lookup(e)
// println("the element: ")
- //exInd.take(1).foreach(println(_))
- //val ind = know.getDataFactory().getOWLNamedIndividual(IRI.create(nodeEx)).asInstanceOf[OWLIndividual]
- //println("newexample " + ind )
+ // exInd.take(1).foreach(println(_))
+ // val ind = know.getDataFactory().getOWLNamedIndividual(IRI.create(nodeEx)).asInstanceOf[OWLIndividual]
+ // println("newexample " + ind )
- //val x = know.getIndividuals().take(nodeExamples.count.toInt).apply(e)
- //val x = know.getIndividuals().filter( _ == neew).first()
+ // val x = know.getIndividuals().take(nodeExamples.count.toInt).apply(e)
+ // val x = know.getIndividuals().filter( _ == neew).first()
- //x.take(20).foreach(println(_))
+ // x.take(20).foreach(println(_))
- //val r =know.getReasoner().isEntailed(know.getDataFactory.getOWLClassAssertionAxiom(concept, ind))
- //println("\n r = " + r)
+ // val r =know.getReasoner().isEntailed(know.getDataFactory.getOWLClassAssertionAxiom(concept, ind))
+ // println("\n r = " + r)
// val l =know.getReasoner().isEntailed(know.getDataFactory.getOWLClassAssertionAxiom(negConcept, ind))
- //println("\n l = " + l)
- }
-
- /**
- * @param know
- * @param concept
- * @param posExs
- * @param negExs
- * @param undExs
- */
-
- private def split(know: KB,
- concept: OWLClassExpression,
- posExs: RDD[String], negExs: RDD[String], undExs: RDD[String]):
- ((RDD[String], RDD[String]), (RDD[String], RDD[String]), (RDD[String], RDD[String])) = {
+ // println("\n l = " + l)
+ }
- val Pos = splitGroup(know, concept, posExs)
- val Neg = splitGroup(know, concept, negExs)
- val Und = splitGroup(know, concept, undExs)
-
- (Pos, Neg, Und)
- }
+ /**
+ * Splits the positive, negative and undefined examples with respect to a concept
+ * @param know the knowledge base
+ * @param concept the splitting concept
+ * @param posExs positive examples
+ * @param negExs negative examples
+ * @param undExs undefined examples
+ * @return the three (left, right) pairs produced by splitGroup
+ */
- }//class
+ private def split(
+ know: KB,
+ concept: OWLClassExpression,
+ posExs: RDD[String], negExs: RDD[String], undExs: RDD[String]): ((RDD[String], RDD[String]), (RDD[String], RDD[String]), (RDD[String], RDD[String])) = {
-
+ val Pos = splitGroup(know, concept, posExs)
+ val Neg = splitGroup(know, concept, negExs)
+ val Und = splitGroup(know, concept, undExs)
+ (Pos, Neg, Und)
+ }
+ } // class
/**
* Selecting the best in a list (RDD) of refinements using Entropy calculations
@@ -567,7 +546,7 @@ object TDTClassifiers {
* @return
*/
- /* def selectBestConceptEntropy(know: KB, concepts: RDD[OWLClassExpression],
+ /* def selectBestConceptEntropy(know: KB, concepts: RDD[OWLClassExpression],
posExs: RDD[String],
negExs: RDD[String],
undExs: RDD[String],
@@ -607,13 +586,13 @@ object TDTClassifiers {
val nCpt = n.lookup(bestConceptIndex).asInstanceOf[OWLClassExpression]
println("\n %s\n\n", nCpt)
nCpt
- }*/
+ } */
- /*
+ /**
* Function to calculate the Entropy value
*/
- /* def Entropy(counts: Array[Int], prPos: Double, prNeg: Double, sizPos: Int, sizNeg: Int): Double = {
+ /* def Entropy(counts: Array[Int], prPos: Double, prNeg: Double, sizPos: Int, sizNeg: Int): Double = {
val nP = counts(0) + counts(1)
val nN = counts(3) + counts(4)
val nU = counts(2) + counts(5) + counts(6) + counts(7)
@@ -654,6 +633,6 @@ object TDTClassifiers {
- (2 - p1 - p2) * (p1 * Math.log(p1) - p2 * Math.log(p2)))
EntropyValue
- }*/
+ } */
-}//object
+} // object
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TDTInducer.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TDTInducer.scala
index e3cec25..a1873be 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TDTInducer.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TDTInducer.scala
@@ -1,21 +1,15 @@
package net.sansa_stack.ml.spark.classification
import java.io.PrintStream
-import java.util.ArrayList
-import java.util.List
-import java.util.Arrays
-import java.util.HashSet
-import collection.JavaConverters._
-import scala.collection
+import java.util.{ ArrayList, Arrays, HashSet, List }
-import org.semanticweb.owlapi.model.OWLClassExpression
-import org.semanticweb.owlapi.model.OWLIndividual
-import org.semanticweb.owlapi.model.OWLNamedIndividual
-import org.semanticweb.HermiT.Reasoner
+import scala.collection
+import collection.JavaConverters._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
-import org.apache.spark.{SparkConf, SparkContext}
+import org.semanticweb.HermiT.Reasoner
+import org.semanticweb.owlapi.model.{ OWLClassExpression, OWLIndividual, OWLNamedIndividual }
import net.sansa_stack.ml.spark.classification._
import net.sansa_stack.ml.spark.classification.KB.KB
@@ -26,151 +20,147 @@ import net.sansa_stack.ml.spark.classification.TDTClassifiers.TDTClassifiers
*/
object TDTInducer {
- var stream: PrintStream = _
-
-class TDTInducer(var kb: KB, var nConcepts: Int, var sc: SparkSession) {
+ var stream: PrintStream = _
-//for each query concept induce an ensemble
- var trees: Array[DLTree] = new Array[DLTree](nConcepts)
+ class TDTInducer(var kb: KB, var nConcepts: Int, var sc: SparkSession) {
- var cl: TDTClassifiers = new TDTClassifiers(kb, sc)
+ // for each query concept induce an ensemble
+ var trees: Array[DLTree] = new Array[DLTree](nConcepts)
-
-
- /*
- * Function for training the algorithm
- */
- def training(results: Array[Array[Int]], trainingExs: RDD[OWLIndividual],
- testConcepts: Array[OWLClassExpression],
- negTestConcepts: Array[OWLClassExpression]): Unit = {
-
- val op: RefinementOperator = new RefinementOperator(kb)
- val reasoner: Reasoner = kb.getReasoner
- val allExamples: RDD[OWLIndividual] = kb.getIndividuals
-
- //val trainingExsSet: HashSet[Integer] = new HashSet[Integer](Arrays.asList(trainingExs: _*))
-
- val length: Int = if (testConcepts != null) testConcepts.size else 1
-
- for (c <- 0 until length) {
-
- println("\n--- Query Concept # " + (c+1))
-
- // These instances should be divided into negative instances, positive and uncertain
- // split._1 = posExs, split._2 = negExs, split._3 = undExs
- val split = splitting(trainingExs, results, c)
-
- var prPos: Double = split._1.count.toDouble / (trainingExs.count.toInt)
- var prNeg: Double = split._2.count.toDouble / (trainingExs.count.toInt)
- println("Training set composition: " + split._1.count() + " - " + split._2.count() + " - " + split._3.count())
-
- val Sum: Double = prPos + prNeg
- if (Sum == 0) {
- prPos = 0.5
- prNeg = 0.5
- } else {
- prPos = prPos / Sum
- prNeg = prNeg / Sum
- }
- println("\nNew learning problem prepared "+ (c+1))
- println("Learning a tree ")
- trees(c) = cl.induceDLTree(kb.getDataFactory.getOWLThing, split._1, split._2, split._3, 50, prPos, prNeg)
+ var cl: TDTClassifiers = new TDTClassifiers(kb, sc)
+
+ /**
+ * Function for training the algorithm
+ */
+ def training(results: Array[Array[Int]], trainingExs: RDD[OWLIndividual],
+ testConcepts: Array[OWLClassExpression],
+ negTestConcepts: Array[OWLClassExpression]): Unit = {
+
+ val op: RefinementOperator = new RefinementOperator(kb)
+ val reasoner: Reasoner = kb.getReasoner
+ val allExamples: RDD[OWLIndividual] = kb.getIndividuals
+ // val trainingExsSet: HashSet[Integer] = new HashSet[Integer](Arrays.asList(trainingExs: _*))
+
+ val length: Int = if (testConcepts != null) testConcepts.size else 1
+
+ for (c <- 0 until length) {
+
+ println("\n--- Query Concept # " + (c + 1))
+
+ // These instances should be divided into negative instances, positive and uncertain
+ // split._1 = posExs, split._2 = negExs, split._3 = undExs
+ val split = splitting(trainingExs, results, c)
+
+ var prPos: Double = split._1.count.toDouble / (trainingExs.count.toInt)
+ var prNeg: Double = split._2.count.toDouble / (trainingExs.count.toInt)
+ println("Training set composition: " + split._1.count() + " - " + split._2.count() + " - " + split._3.count())
+
+ val Sum: Double = prPos + prNeg
+ if (Sum == 0) {
+ prPos = 0.5
+ prNeg = 0.5
+ } else {
+ prPos = prPos / Sum
+ prNeg = prNeg / Sum
+ }
+ println("\nNew learning problem prepared " + (c + 1))
+ println("Learning a tree ")
+ trees(c) = cl.induceDLTree(kb.getDataFactory.getOWLThing, split._1, split._2, split._3, 50, prPos, prNeg)
+
+ }
}
- }
-
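A small numeric illustration (not part of the patch) of the prior computation above: with 6 positive, 3 negative and 1 undefined example out of 10 training examples,

    // prPos = 6.0 / 10 = 0.6, prNeg = 3.0 / 10 = 0.3, Sum = 0.9
    // after normalisation: prPos ≈ 0.667, prNeg ≈ 0.333, so the priors passed to
    // induceDLTree always sum to 1 (or default to 0.5 / 0.5 when both counts are zero)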
- /*
+
+ /*
* Function for testing the algorithm
*/
- def test (f: Int, testExs: RDD[OWLIndividual], testConcepts: Array[OWLClassExpression]): Array[Array[Int]] = {
-
- // classifier answers for each example and for each concept
- val labels: Array[Array[Int]] = Array.ofDim[Int](testExs.count.toInt, nConcepts)
-
- for (t <- 0 until testExs.count.toInt) {
- val indTestEx = testExs.take(t+1).apply(t)
- println("\n\nFold #" + (f+1))
- println(" ---\n Classifying Example " + (t+1) + "/" + testExs.count.toInt + " [" + indTestEx + "] ")
-
- //labels(t) = Array.ofDim[Int](nConcepts)
-
-
- for (i <- 0 until nConcepts - 1) {
- labels(t)(i) = cl.classify(indTestEx, trees(i))
+ def test(f: Int, testExs: RDD[OWLIndividual], testConcepts: Array[OWLClassExpression]): Array[Array[Int]] = {
+
+ // classifier answers for each example and for each concept
+ val labels: Array[Array[Int]] = Array.ofDim[Int](testExs.count.toInt, nConcepts)
+
+ for (t <- 0 until testExs.count.toInt) {
+ val indTestEx = testExs.take(t + 1).apply(t)
+ println("\n\nFold #" + (f + 1))
+ println(" ---\n Classifying Example " + (t + 1) + "/" + testExs.count.toInt + " [" + indTestEx + "] ")
+
+ // labels(t) = Array.ofDim[Int](nConcepts)
+
+ for (i <- 0 until nConcepts - 1) {
+ labels(t)(i) = cl.classify(indTestEx, trees(i))
+ }
}
+ labels
}
- labels
- }
- /*
+ /*
* Function for splitting the training examples into positive, negative and undefined examples
*/
-
- def splitting(trainingExs: RDD[OWLIndividual], classifications: Array[Array[Int]], c: Int): (RDD[String],RDD[String],RDD[String]) = {
-
- var BINARYCLASSIFICATION : Boolean = false
-// var classRDD = sc.sparkContext.parallelize(classifications,2)
-// var pos = classRDD.filter(_ == +1)
-
- var pos = new ArrayList[String]()
- var neg = new ArrayList[String]()
- var und = new ArrayList[String]()
- var TExs = trainingExs.zipWithIndex()
-
- for (i <-0 until trainingExs.count.toInt){
-
- val trainValue = trainingExs.take(i+1).apply(i)
- //var trainIndex = TExs.lookup(trainValue)
- //println("\nvalue : " + trainValue)
- val trainIndex = trainingExs.take(trainingExs.count.toInt).indexOf(trainValue)
- // println("index : " + trainIndex)
-
-/* var p = trainingExs.filter{ exs =>
+
+ def splitting(trainingExs: RDD[OWLIndividual], classifications: Array[Array[Int]], c: Int): (RDD[String], RDD[String], RDD[String]) = {
+
+ var BINARYCLASSIFICATION: Boolean = false
+ // var classRDD = sc.sparkContext.parallelize(classifications,2)
+ // var pos = classRDD.filter(_ == +1)
+
+ var pos = new ArrayList[String]()
+ var neg = new ArrayList[String]()
+ var und = new ArrayList[String]()
+ var TExs = trainingExs.zipWithIndex()
+
+ for (i <- 0 until trainingExs.count.toInt) {
+
+ val trainValue = trainingExs.take(i + 1).apply(i)
+ // var trainIndex = TExs.lookup(trainValue)
+ // println("\nvalue : " + trainValue)
+ val trainIndex = trainingExs.take(trainingExs.count.toInt).indexOf(trainValue)
+ // println("index : " + trainIndex)
+
+ /* var p = trainingExs.filter{ exs =>
val v = exs.toString()
-
- }*/
-
- if (trainIndex != -1){
- val value = trainValue.toString()
- if (classifications(c)(trainIndex) == +1)
+
+ } */
+
+ if (trainIndex != -1) {
+ val value = trainValue.toString()
+ if (classifications(c)(trainIndex) == +1) {
pos.add(value)
- else if (!BINARYCLASSIFICATION) {
- if (classifications(c)(trainIndex) == -1)
+ } else if (!BINARYCLASSIFICATION) {
+ if (classifications(c)(trainIndex) == -1) {
+ neg.add(value)
+ } else {
+ und.add(value)
+ }
+ } else {
neg.add(value)
- else
- und.add(value)
+ }
+ }
}
- else
- neg.add(value)
- }
- }
- var posExs = sc.sparkContext.parallelize(pos.asScala)
- var negExs = sc.sparkContext.parallelize(neg.asScala)
- var undExs = sc.sparkContext.parallelize(und.asScala)
-
- (posExs, negExs, undExs)
- }
-// val TList : List[Integer]= new ArrayList[Integer]
-// var T = sc.sparkContext.parallelize(TList.asScala)
-//
-// var TExs = trainingExs.zipWithIndex()
-// for (e <- 0 until trainingExs.count.toInt) {
-//
-// var index = TExs.lookup(e)
-// T.union(index)
- //val Train = sc.sparkContext.parallelize(T.asScala)
-
- /*if (classifications(c)(TExs.lookup(e)) == +1) posExs.union(T)
+ var posExs = sc.sparkContext.parallelize(pos.asScala)
+ var negExs = sc.sparkContext.parallelize(neg.asScala)
+ var undExs = sc.sparkContext.parallelize(und.asScala)
+
+ (posExs, negExs, undExs)
+ }
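Illustrative only, assuming four distinct training individuals so that indexOf returns their positions: with classifications(c) = Array(+1, -1, 0, +1) and BINARYCLASSIFICATION = false,

    // the individuals at indices 0 and 3 end up in posExs,
    // the individual at index 1 in negExs,
    // and the individual at index 2 in undExs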
+ // val TList : List[Integer]= new ArrayList[Integer]
+ // var T = sc.sparkContext.parallelize(TList.asScala)
+ //
+ // var TExs = trainingExs.zipWithIndex()
+ // for (e <- 0 until trainingExs.count.toInt) {
+ //
+ // var index = TExs.lookup(e)
+ // T.union(index)
+ // val Train = sc.sparkContext.parallelize(T.asScala)
+
+ /* if (classifications(c)(TExs.lookup(e)) == +1) posExs.union(T)
else if (!BINARYCLASSIFICATION) {
if (classifications(c)(TExs.lookup(e)) == -1)
negExs.union(T)
else undExs.union(T)
- } else negExs.union(T)*/
- //}
+ } else negExs.union(T) */
+ // }
-
-
- /* def getComplexityValues(sc: SparkSession): Array[Double] = {
+ /* def getComplexityValues(sc: SparkSession): Array[Double] = {
// a measure to express the model complexity (e.g. the number of nodes in a tree)
val complexityValue: Array[Double] = Array.ofDim[Double](trees.length)
@@ -179,7 +169,6 @@ class TDTInducer(var kb: KB, var nConcepts: Int, var sc: SparkSession) {
complexityValue(i) = current
}
complexityValue
- }*/
-
+ } */
}
-}
\ No newline at end of file
+}
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TermDecisionTrees.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TermDecisionTrees.scala
index 71f23f7..2a6dbcb 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TermDecisionTrees.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TermDecisionTrees.scala
@@ -1,30 +1,30 @@
-package net.sansa_stack.ml.spark.classification
+package net.sansa_stack.ml.spark.classification
import java.util.ArrayList
-import scala.reflect.runtime.universe._
+
import scala.collection.JavaConverters._
+import scala.reflect.runtime.universe._
+
+import org.apache.log4j.{ Level, Logger }
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.SparkSession
import org.semanticweb.owlapi.model.OWLClassExpression
import org.semanticweb.owlapi.model.OWLIndividual
+import scopt.OptionParser
-import net.sansa_stack.ml.spark.classification.KB.KB
import net.sansa_stack.ml.spark.classification.ClassMembership.ClassMembership
+import net.sansa_stack.ml.spark.classification.KB.KB
import net.sansa_stack.ml.spark.classification.TDTClassifiers.TDTClassifiers
-
import net.sansa_stack.owl.spark.rdd.FunctionalSyntaxOWLAxiomsRDDBuilder
import net.sansa_stack.owl.spark.rdd.OWLAxiomsRDD
-import scopt.OptionParser
-import org.apache.log4j.{Level, Logger}
-import org.apache.spark.sql.SparkSession
-import org.apache.spark.rdd.RDD
-
object TermDecisionTrees {
- /*
- * The main file to call Terminological Decision Trees for Classification
- */
+ /**
+ * The main file to call Terminological Decision Trees for Classification
+ */
- def main(args: Array[String]) = {
+ def main(args: Array[String]): Unit = {
val input = "src/main/resources/Classification/trains.owl"
@@ -38,61 +38,59 @@ object TermDecisionTrees {
.config("spark.kryo.registrator", "net.sansa_stack.ml.spark.classification.Registrator")
.appName("Termnological Decision Tree")
.getOrCreate()
-
- //Call owl axion builder to read the classes and object properties and print
-
- val rdd : OWLAxiomsRDD = FunctionalSyntaxOWLAxiomsRDDBuilder.build(sparkSession, input)
-
+
+ // Call the OWL axiom builder to read the classes and object properties and print
+
+ val rdd: OWLAxiomsRDD = FunctionalSyntaxOWLAxiomsRDDBuilder.build(sparkSession, input)
+
val kb: KB = new KB(input, rdd, sparkSession)
var ClassM = new ClassMembership(kb, sparkSession)
val ClassName = TDTInducer.toString()
ClassM.bootstrap(10, ClassName, sparkSession)
- //val c : TDTInducer = new TDTInducer(kb, kb.Concepts.count().toInt, sparkSession)
-
-// var PosExamples = sparkSession.sparkContext.parallelize(Array("http://example.com/foo#east1",
-// "http://example.com/foo#east2",
-// "http://example.com/foo#east3",
-// "http://example.com/foo#east4",
-// "http://example.com/foo#east5"))
-//
-// var NegExamples = sparkSession.sparkContext.parallelize(Array("http://example.com/foo#west6",
-// "http://example.com/foo#west7",
-// "http://example.com/foo#west8",
-// "http://example.com/foo#west9",
-// "http://example.com/foo#west10"))
-//
-// var UndExamples = sparkSession.sparkContext.parallelize(new ArrayList[String]().asScala)
-//
-// val numPos: Double = PosExamples.count
-// val numNeg: Double = NegExamples.count
-// val perPos: Double = numPos / (numPos + numNeg)
-// val perNeg: Double = numNeg / (numPos + numNeg)
-//
-// println("\nLearning problem: \n --------------------\n")
-// println("No. of Positive examples: " + PosExamples.count)
-// println("No. of Negative examples: " + NegExamples.count)
-// println("No. of Undefined examples: " + UndExamples.count)
-// println("\nper Pos: " + perPos)
-// println("per Neg: " + perNeg)
-//
-// val nGeneratedRef: Int = 50
-//
-// val c : TDTClassifiers = new TDTClassifiers (kb, sparkSession)
-// val tree : DLTree = c.induceDLTree(kb.getDataFactory.getOWLThing, PosExamples, NegExamples, UndExamples, nGeneratedRef, perPos, perNeg)
-//
-// val Root: OWLClassExpression = tree.getRoot()
-// println("\nRoot of the tree is: " + Root)
-
- /*val possubtree = tree.getPosSubTree().toString()
- println("possubtree: " + possubtree)*/
-
- //val ind = kb.getDataFactory().getOWLNamedIndividual("http://example.com/foo#east2")
- //val classification : Int = c.classify(ind, tree)
- //println("\nclassification of east2 is " + classification)
-
- sparkSession.stop
+ // val c : TDTInducer = new TDTInducer(kb, kb.Concepts.count().toInt, sparkSession)
- }
+ // var PosExamples = sparkSession.sparkContext.parallelize(Array("http://example.com/foo#east1",
+ // "http://example.com/foo#east2",
+ // "http://example.com/foo#east3",
+ // "http://example.com/foo#east4",
+ // "http://example.com/foo#east5"))
+ //
+ // var NegExamples = sparkSession.sparkContext.parallelize(Array("http://example.com/foo#west6",
+ // "http://example.com/foo#west7",
+ // "http://example.com/foo#west8",
+ // "http://example.com/foo#west9",
+ // "http://example.com/foo#west10"))
+ //
+ // var UndExamples = sparkSession.sparkContext.parallelize(new ArrayList[String]().asScala)
+ //
+ // val numPos: Double = PosExamples.count
+ // val numNeg: Double = NegExamples.count
+ // val perPos: Double = numPos / (numPos + numNeg)
+ // val perNeg: Double = numNeg / (numPos + numNeg)
+ //
+ // println("\nLearning problem: \n --------------------\n")
+ // println("No. of Positive examples: " + PosExamples.count)
+ // println("No. of Negative examples: " + NegExamples.count)
+ // println("No. of Undefined examples: " + UndExamples.count)
+ // println("\nper Pos: " + perPos)
+ // println("per Neg: " + perNeg)
+ //
+ // val nGeneratedRef: Int = 50
+ //
+ // val c : TDTClassifiers = new TDTClassifiers (kb, sparkSession)
+ // val tree : DLTree = c.induceDLTree(kb.getDataFactory.getOWLThing, PosExamples, NegExamples, UndExamples, nGeneratedRef, perPos, perNeg)
+ //
+ // val Root: OWLClassExpression = tree.getRoot()
+ // println("\nRoot of the tree is: " + Root)
+
+ /* val possubtree = tree.getPosSubTree().toString()
+ println("possubtree: " + possubtree) */
+
+ // val ind = kb.getDataFactory().getOWLNamedIndividual("http://example.com/foo#east2")
+ // val classification : Int = c.classify(ind, tree)
+ // println("\nclassification of east2 is " + classification)
-}
\ No newline at end of file
+ sparkSession.stop
+ }
+}
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/Utils.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/Utils.scala
index bd0f03e..6d9a41a 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/Utils.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/Utils.scala
@@ -1,12 +1,13 @@
package net.sansa_stack.ml.spark.classification
-import org.apache.spark.serializer.{ KryoRegistrator => SparkKryoRegistrator }
import com.esotericsoftware.kryo.Kryo
+import org.apache.spark.serializer.{ KryoRegistrator => SparkKryoRegistrator }
import org.semanticweb.owlapi.model.OWLClass
-import net.sansa_stack.ml.spark.classification.KB.KB
import org.semanticweb.owlapi.reasoner.structural.StructuralReasoner
-/*
+import net.sansa_stack.ml.spark.classification.KB.KB
+
+/**
* Class for serialization by the Kryo serializer.
*/
class Registrator extends SparkKryoRegistrator {
@@ -17,4 +18,4 @@ class Registrator extends SparkKryoRegistrator {
kryo.register(classOf[StructuralReasoner])
kryo.register(classOf[net.sansa_stack.ml.spark.classification.KB.KB])
}
-}
\ No newline at end of file
+}
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/BorderFlow.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/BorderFlow.scala
similarity index 89%
rename from sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/BorderFlow.scala
rename to sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/BorderFlow.scala
index 4434e6f..d010777 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/BorderFlow.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/BorderFlow.scala
@@ -1,38 +1,32 @@
-package net.sansa_stack.ml.spark.clustering
+package net.sansa_stack.ml.spark.clustering.algorithms
+
+import java.io._
+import java.io.{ ByteArrayInputStream, FileNotFoundException, FileReader, IOException, StringWriter }
+import java.lang.{ Long => JLong }
-import org.apache.spark.rdd.RDD
-import org.apache.spark.graphx.{ Graph, EdgeDirection }
import scala.math.BigDecimal
-import org.apache.spark.sql.SparkSession
import scala.reflect.runtime.universe._
-import scopt.OptionParser
-import org.apache.log4j.{ Level, Logger }
-import org.apache.spark.mllib.util.MLUtils
-import java.io.{ FileReader, FileNotFoundException, IOException }
-import org.apache.spark.mllib.linalg.Vectors
-import java.lang.{ Long => JLong }
-import java.lang.{ Long => JLong }
-import breeze.linalg.{ squaredDistance, DenseVector, Vector }
-import org.apache.spark.sql.SparkSession
import scala.util.control.Breaks._
+
+import breeze.linalg.{ squaredDistance, DenseVector, Vector }
import org.apache.jena.datatypes.{ RDFDatatype, TypeMapper }
-import org.apache.jena.graph.{ Node => JenaNode, Triple => JenaTriple, _ }
-import org.apache.jena.riot.writer.NTriplesWriter
+import org.apache.jena.graph.{ Node => JenaNode, Node_ANY, Node_Blank, Node_Literal, Node_URI, Triple => JenaTriple, _ }
import org.apache.jena.riot.{ Lang, RDFDataMgr }
-import org.apache.jena.graph.{ Node_ANY, Node_Blank, Node_Literal, Node_URI, Node => JenaNode, Triple => JenaTriple }
+import org.apache.jena.riot.writer.NTriplesWriter
+import org.apache.jena.util._
import org.apache.jena.vocabulary.RDF
-import java.io.ByteArrayInputStream
-import org.apache.spark.rdd.PairRDDFunctions
+import org.apache.log4j.{ Level, Logger }
import org.apache.spark.SparkContext._
import org.apache.spark.graphx._
-import org.apache.jena.util._
-import java.io.StringWriter
-import java.io._
-import org.apache.spark.graphx.Graph
+import org.apache.spark.graphx.{ EdgeDirection, Graph }
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.util.MLUtils
+import org.apache.spark.rdd.PairRDDFunctions
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.SparkSession
+import scopt.OptionParser
object BorderFlow {
- def apply(spark: SparkSession, graph: Graph[String, String], output: String, outputevlsoft: String, outputevlhard: String) = {
+ def apply(spark: SparkSession, graph: Graph[String, String], output: String, outputevlsoft: String, outputevlhard: String): Unit = {
/**
* undirected graph : orient =0
@@ -51,9 +45,9 @@ object BorderFlow {
graphXinBorderFlow(orient, selectYourSimilarity)
}
- /*
- * Computes different similarities function for a given graph @graph.
- */
+ /**
+ * Computes different similarity functions for a given graph @graph.
+ */
def graphXinBorderFlow(e: Int, f: Int): List[List[Long]] = {
val edge = graph.edges.collect()
@@ -197,7 +191,7 @@ object BorderFlow {
f3
}
- //computing f(X,V) for Heuristics BorderFlow
+ // computing f(X,V) for Heuristics BorderFlow
def fOmega(x: List[Long], v: Long): Double = {
var numberFlow = 0
@@ -223,9 +217,7 @@ object BorderFlow {
var jaccardBV = 0.0
if (b.size == 0) return 0.0
for (i <- 0 until b.length) yield {
-
- jaccardBV = jaccardBV.+(findingSimilarity(b(i), v).abs)
-
+ jaccardBV = jaccardBV. + (findingSimilarity(b(i), v).abs)
}
var jaccardVXV = 0.0
@@ -233,38 +225,13 @@ object BorderFlow {
for (i <- 0 until VX.length) yield {
if (VX(i) != v) {
- jaccardVXV = jaccardVXV.+(findingSimilarity(VX(i), v).abs)
+ jaccardVXV = jaccardVXV. + (findingSimilarity(VX(i), v).abs)
}
}
(jaccardVXV / jaccardBV)
- /*
- * without similarity
- val nv = neighborSort.lookup(v).distinct.head.toSet
- val nvX = nv.intersect(X.toSet)
- val nvx = nvX.toList.diff(x).size
-
-
- for(k <- 0 until x.length) yield{
- if(x.length>0){
-
- val xk = x(k)
- val bX = neighborSort.lookup(xk).distinct.head.toSet
- val bxX = bX.intersect(X.toSet)
-
- if(bxX.toList.diff(x).size > 0 && bxX.toList.diff(x).contains(v)) {
- numberFlow = numberFlow + 1
- }
-
- }
-
- }
-
- ( 1/(numberFlow.toDouble/ nvx.toDouble))
- *
- */
}
@@ -325,7 +292,7 @@ object BorderFlow {
for (i <- 0 until b.length) yield {
for (j <- 0 until x.length) yield {
if (b(i) != x(j)) {
- jaccardX = jaccardX.+(findingSimilarity(b(i), x(j)).abs)
+ jaccardX = jaccardX. + (findingSimilarity(b(i), x(j)).abs)
}
}
@@ -334,7 +301,7 @@ object BorderFlow {
for (i <- 0 until b.length) yield {
for (j <- 0 until n.length) yield {
- jaccardN = jaccardN.+(findingSimilarity(b(i), n(j)).abs)
+ jaccardN = jaccardN. + (findingSimilarity(b(i), n(j)).abs)
}
}
@@ -367,7 +334,7 @@ object BorderFlow {
for (i <- 0 until n.length) yield {
if (n(i) != u) {
- jaccardNU = jaccardNU.+(findingSimilarity(u, n(i)).abs)
+ jaccardNU = jaccardNU. + (findingSimilarity(u, n(i)).abs)
}
@@ -377,15 +344,14 @@ object BorderFlow {
val nu = neighborSort.lookup(u).distinct.head.toSet
val nuX = nu.intersect(X.toSet).toList
( (nuX.intersect(listOfN(x))).size.toDouble)
-
*/
jaccardNU
}
- /*
- * Use Heuristics method for producing clusters.
- */
+ /**
+ * Uses the heuristic method for producing clusters.
+ */
def heuristicsCluster(a: List[Long]): List[Long] = {
var nj = 0.0
@@ -436,9 +402,9 @@ object BorderFlow {
}
- /*
- * Use Non-Heuristics(normal) method for producing clusters.
- */
+ /**
+ * Uses the non-heuristic (normal) method for producing clusters.
+ */
def nonHeuristicsCluster(a: List[Long], d: List[Long]): List[Long] = {
var nj: List[Long] = List()
@@ -529,18 +495,18 @@ object BorderFlow {
}
- /*
- * Input for heuristics heuristicsCluster(element) .
- * Input for nonHeuristics nonHeuristicsCluster(element,List()) .
- */
+ /**
+ * Input for the heuristic variant: heuristicsCluster(element).
+ * Input for the non-heuristic variant: nonHeuristicsCluster(element, List()).
+ */
def makeClusters(a: Long): List[Long] = {
var clusters: List[Long] = List()
clusters = nonHeuristicsCluster(List(a), List())
- // if(b == 1){
- // clusters = heuristicsCluster(List(a))}
+ // if(b == 1) {
+ // clusters = heuristicsCluster(List(a)) }
(clusters)
@@ -558,9 +524,9 @@ object BorderFlow {
bigList = bigList.map(_.distinct)
- /*
- * Sillouhette Evaluation soft
- */
+ /**
+ * Silhouette Evaluation soft
+ */
def avgAsoft(c: List[Long], d: Long): Double = {
var sumA = 0.0
@@ -585,6 +551,7 @@ object BorderFlow {
sumB / sizeC
}
+
def SIsoft(a: Double, b: Double): Double = {
var s = 0.0
if (a > b) {
@@ -632,9 +599,9 @@ object BorderFlow {
val evaluateSoft = AiBiSoft(bigList, X)
- /*
- * Apply Hardening
- */
+ /**
+ * Apply Hardening
+ */
def subset(c: List[List[Long]]): List[List[Long]] = {
var C = c
@@ -698,7 +665,7 @@ object BorderFlow {
for (i <- 0 until c.length) yield {
if (c(i) != v) {
- omega = omega.+(findingSimilarity(v, c(i)).abs)
+ omega = omega. + (findingSimilarity(v, c(i)).abs)
}
@@ -708,7 +675,6 @@ object BorderFlow {
val nu = neighborSort.lookup(u).distinct.head.toSet
val nuX = nu.intersect(X.toSet).toList
( (nuX.intersect(listOfN(x))).size.toDouble)
-
*/
omega
@@ -741,6 +707,7 @@ object BorderFlow {
}
C
}
+
def nul(c: List[List[Long]]): List[List[Long]] = {
var C = c
var newCluster: List[List[Long]] = List()
@@ -755,9 +722,9 @@ object BorderFlow {
bigList = reassignment(bigList, X)
bigList = nul(bigList)
- /*
- * Sillouhette Evaluation Hard
- */
+ /**
+ * Silhouette Evaluation Hard
+ */
def avgA(c: List[Long], d: Long): Double = {
var sumA = 0.0
@@ -782,6 +749,7 @@ object BorderFlow {
sumB / sizeC
}
+
def SI(a: Double, b: Double): Double = {
var s = 0.0
if (a > b) {
@@ -838,14 +806,14 @@ object BorderFlow {
val evaluateStringRDDS = spark.sparkContext.parallelize(evaluateStringS)
evaluateStringRDDS.saveAsTextFile(outputevlsoft)
- //println(s"averagesoft: $avsoft\n")
+ // println(s"averagesoft: $avsoft\n")
bigList
}
- /*
- * convert to RDF
- */
+ /**
+ * convert to RDF
+ */
def makerdf(a: List[Long]): List[String] = {
var listuri: List[String] = List()
@@ -857,13 +825,12 @@ object BorderFlow {
}
listuri
-
}
val rdf = clusterRdd.map(x => makerdf(x))
val rdfRDD = spark.sparkContext.parallelize(rdf)
rdfRDD.saveAsTextFile(output)
-
}
}
+
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/DBSCAN.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/DBSCAN.scala
new file mode 100644
index 0000000..3c0dbfd
--- /dev/null
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/DBSCAN.scala
@@ -0,0 +1,260 @@
+package net.sansa_stack.ml.spark.clustering.algorithms
+
+/*
+* DBSCAN Distributed Edition in Spark & Scala.
+*
+* Authors: Panagiotis Kalampokis, Dr. Dimitris Skoutas
+*/
+
+import com.vividsolutions.jts.geom.{Coordinate, Envelope, GeometryFactory, Point}
+import org.apache.spark.api.java.JavaRDD
+import org.apache.spark.broadcast.Broadcast
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql._
+import org.apache.spark.storage.StorageLevel._
+import org.datasyslab.geospark.enums.GridType
+import org.datasyslab.geospark.spatialPartitioning.SpatialPartitioner
+import org.datasyslab.geospark.spatialRDD.PointRDD
+import scala.collection.mutable.{ArrayBuffer, HashMap}
+
+import net.sansa_stack.ml.spark.clustering.datatypes.DbPOI
+import net.sansa_stack.ml.spark.clustering.datatypes.POI
+import net.sansa_stack.ml.spark.clustering.utils.DBCLusterer
+
+
+class DBSCAN() extends Serializable {
+
+ private var clusterRDD: RDD[DbPOI] = null
+ private var mergingClusterNameVecBD: Broadcast[Vector[Set[String]]] = null
+ private var boundaryPoisToKeepHMBD : Broadcast[HashMap[String, String]] = null
+ private var spatialPartitionerBD : Broadcast[SpatialPartitioner] = null
+
+ private def areIntersectingSets(set1: Set[String], set2: Set[String]): Boolean = {
+ set1.exists(s1 => set2.exists(s2 => s1 == s2) )
+ }
+
+ private def insertSetIntoVec(vec: Vector[Set[String]], xSet: Set[String]): Vector[Set[String]] = {
+
+ var tmpVec = Vector[Set[String]]()
+ var unionSet = Set[String]() ++ xSet
+
+ for(set_i <- vec) {
+ if (areIntersectingSets(set_i, unionSet))
+ {
+ unionSet = unionSet ++ set_i
+ }
+ else
+ {
+ tmpVec = tmpVec :+ set_i
+ }
+ }
+ unionSet +: tmpVec
+ }
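Illustrative only: insertSetIntoVec merges every existing set that shares a cluster name with the incoming one, which is how partition-local clusters linked by a dense boundary point get unified:

    // insertSetIntoVec(Vector(Set("0p1"), Set("2p0")), Set("0p1", "1p3"))
    //   == Vector(Set("0p1", "1p3"), Set("2p0"))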
+
+ protected def getExpandedEnvelopeFromPoint(p: Point, epsilon: Double): Envelope = {
+ val env = p.getEnvelopeInternal
+ env.expandBy(epsilon)
+
+ env
+ }
+ /*
+ * Performs DBSCAN and returns the clusters.
+ */
+ def dbclusters(pointRDD_0: RDD[Point], eps: Double, minPts: Int, spark: SparkSession) : RDD[(String, Array[(String, DbPOI)])] = {
+
+ val pointRDD_1 = new JavaRDD[Point](pointRDD_0)
+ val pointRDD = new PointRDD(pointRDD_1)
+
+ pointRDD.analyze()
+
+ // Perform Spatial Partitioning with QuadTree
+ pointRDD.spatialPartitioning(GridType.QUADTREE, 16)
+
+ // val boundaryEnvelopes = pointRDD.getPartitioner.getGrids
+ // writeBoundaryEnvsToFile(pointRDD, outputFile + "_Envelopes_only.txt", geometryFactory)
+ this.spatialPartitionerBD = spark.sparkContext.broadcast(pointRDD.getPartitioner)
+
+ // RDD[partitionID, dbpoi]
+ val flatMappedRDD = pointRDD.spatialPartitionedRDD
+ .rdd
+ .mapPartitions{
+ pointIter => {
+ val geometryFactory = new GeometryFactory()
+ pointIter.flatMap{
+ point => {
+ // Get expanded by eps Envelope From Point.
+ val pointEnv = getExpandedEnvelopeFromPoint(point, eps)
+
+ // Given a Geometry, it Returns a List of Partitions it overlaps.
+ val pIDListTuple = this.spatialPartitionerBD.value.placeObject(geometryFactory.toGeometry(pointEnv))
+
+ // ArrayBuffer[PIDs]
+ val arrBuff = ArrayBuffer[Int]()
+
+ while (pIDListTuple.hasNext) {
+ val (pID, envP) = pIDListTuple.next()
+ arrBuff.append(pID.intValue())
+ }
+
+ // Is Boundary Point?
+ val isBoundaryP = (arrBuff.size > 1)
+ arrBuff.map{
+ pID => {
+ val poi = DbPOI(point.getUserData.asInstanceOf[String], point.getX, point.getY)
+ if (isBoundaryP) {
+ poi.isBoundary = true
+ }
+
+ (pID, poi)
+ }
+ }
+ }
+ }
+ }
+ }
+ // RDD[(pID, ArrayBuffer[DBPOI])]
+ val partitionRDD = flatMappedRDD.aggregateByKey(ArrayBuffer[DbPOI]())(
+ // SeqOp
+ (zArrBuffDBPoi, poi) => zArrBuffDBPoi += poi,
+
+ // CombOp
+ (zArrBuffDBPoi1, zArrBuffDBPoi2) => zArrBuffDBPoi1 ++= zArrBuffDBPoi2
+ )
+
+
+ // RDD[dbpoi]
+ this.clusterRDD = partitionRDD.flatMap{
+ case (pID, poiArrBuff) =>
+ // New DBSCAN CLusterer For Each Partition-Envelope
+ val dbclusterer = DBCLusterer(eps, minPts)
+
+ // Perform DBSCAN in each partition and return a List of Clusters: ArrayBuffer[ArrayBuffer[DBPOI]]
+ val clusters = dbclusterer.clusterPois(poiArrBuff)
+
+ var i = 0
+ for (cluster <- clusters) {
+ for (poi <- cluster) {
+ poi.clusterName = pID + "p" + i
+ }
+
+ i = i + 1
+ }
+ clusters.flatten
+ }
+ .persist(MEMORY_AND_DISK)
+
+
+ // Take all Boundary Pois.
+ // RDD[poiID, dbpoi]
+ val boundaryPoiRDD = this.clusterRDD.filter(_.isBoundary).map(poi => (poi.poiId, poi) )
+
+
+ // RDD[poiID, (Set[pID&cID], isDensePoi?)]
+ val bPoiRDD = boundaryPoiRDD.aggregateByKey( (Set[String](), false) )(
+ // SeqOp
+ (zTuple, poi) => (zTuple._1 + poi.clusterName, zTuple._2 | poi.isDense ),
+
+ // CombOp
+ (tuple1, tuple2) => (tuple1._1 ++ tuple2._1, tuple1._2 | tuple2._2)
+ )
+
+
+ // (Vector[Set[pID&cID]], HashMap[poiID, pID&cID])
+ val (mergingClusterNameVec, boundaryPoisToKeepHM) = bPoiRDD.aggregate( ( Vector[Set[String]](), HashMap[String, String]() ))(
+ // SeqOp
+ (zTuple, xTuple) => {
+
+ val (vec, zHashMap) = zTuple
+
+ // [poiID, (Set[pID&cID], isDense?)]
+ val (poiID, (pIDcIDSet, isDense)) = xTuple
+
+ if(isDense) {
+ (insertSetIntoVec(vec, pIDcIDSet), zHashMap)
+ }
+ else {
+ (vec, zHashMap += ((poiID, pIDcIDSet.head)) )
+ }
+ },
+
+ // CombOp
+ (zTuple1, zTuple2) => {
+ val (vec1, hashMap1) = zTuple1
+ val (vec2, hashMap2) = zTuple2
+ val vec3 = vec2.foldLeft(vec1)((zVec, xSet) => insertSetIntoVec(zVec, xSet))
+
+ (vec3, hashMap1 ++= hashMap2)
+ }
+ )
+
+
+ // Broadcast commonNames and PoisToKeep
+ this.mergingClusterNameVecBD = spark.sparkContext.broadcast(mergingClusterNameVec)
+ this.boundaryPoisToKeepHMBD = spark.sparkContext.broadcast(boundaryPoisToKeepHM)
+ val preFinalClusterRDD = this.clusterRDD.mapPartitions{
+ poiIter => {
+
+ val commonNameMap = this.mergingClusterNameVecBD.value.flatMap{
+ nameSet => {
+ val commonName = nameSet.toSeq.sortWith(_ < _).mkString("c")
+ nameSet.map(_ -> commonName)
+ }
+ }.toMap
+
+ poiIter.flatMap{
+ poi => {
+ var poiIDcIDName = poi.clusterName
+ commonNameMap.get(poi.clusterName) match {
+ case Some(commonName) => poiIDcIDName = commonName
+ case None => ()
+ }
+ var keepPoi = true
+ this.boundaryPoisToKeepHMBD.value.get(poi.poiId) match {
+ case Some(pIDcIDwhoKeepsPoi) =>
+ if (poi.clusterName != pIDcIDwhoKeepsPoi) {
+ keepPoi = false
+ }
+ case None => ()
+ }
+
+ poi.clusterName = poiIDcIDName
+ if(keepPoi) {
+ Seq((poi.clusterName, poi))
+ }
+ else {
+ Seq()
+ }
+ }
+ }
+ }
+ }
+
+
+ // RDD[clusterName, HashMap[poiID, poi]]
+ val dbclusterRDD = preFinalClusterRDD.aggregateByKey(HashMap[String, DbPOI]())(
+ // SeqOp
+ (zPoiHM, poi) => zPoiHM += ((poi.poiId, poi)),
+
+ // CombOp
+ (hm1, hm2) => hm1 ++= hm2
+ )
+ // RDD[(String, Array[POI])]
+ // dbclusterRDD.foreach(println)
+ val k = dbclusterRDD.mapValues(_.toArray)
+ k
+ }
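A note on the naming convention used above (illustrative values): partition-local clusters are named pID + "p" + i, and clusters linked through a dense boundary point are merged under the sorted, "c"-joined name:

    // Set("2p3", "0p1").toSeq.sortWith(_ < _).mkString("c")  == "0p1c2p3"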
+ /*
+ * This method should be called after finishing using this class
+ * (e.g. after writing results or printing stats).
+ */
+ def clear(): Unit = {
+ this.clusterRDD.unpersist(true)
+ this.boundaryPoisToKeepHMBD.destroy()
+ this.mergingClusterNameVecBD.destroy()
+ this.spatialPartitionerBD.destroy()
+ }
+
+}
+
+
+
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/Distances.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/Distances.scala
new file mode 100644
index 0000000..234fda8
--- /dev/null
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/Distances.scala
@@ -0,0 +1,17 @@
+package net.sansa_stack.ml.spark.clustering.algorithms
+
+class Distances {
+
+ /**
+ * Jaccard Similarity Coefficient between two sets of categories corresponding to two pois
+ *
+ * @param x set of categories
+ * @param y set of categories
+ * @return the Jaccard coefficient, i.e. |x intersect y| / |x union y|, a value in [0, 1]
+ */
+ def jaccardSimilarity(x: Set[String], y: Set[String]): Double = {
+ val union_l = x.union(y).toList.length.toDouble
+ val intersect_l = x.intersect(y).toList.length.doubleValue()
+ intersect_l / (union_l)
+ }
+}
+
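A quick usage example (illustrative values):

    // new Distances().jaccardSimilarity(Set("cafe", "bar"), Set("bar", "restaurant"))
    //   == 1.0 / 3.0  (intersection {bar}, union {cafe, bar, restaurant})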
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/Encoder.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/Encoder.scala
new file mode 100644
index 0000000..2624b8f
--- /dev/null
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/Encoder.scala
@@ -0,0 +1,121 @@
+package net.sansa_stack.ml.spark.clustering.algorithms
+
+import org.apache.spark.ml.feature.{ VectorAssembler, Word2Vec }
+import org.apache.spark.rdd._
+import org.apache.spark.sql._
+import org.apache.spark.sql.types._
+
+
+class Encoder {
+
+ /**
+ * One hot encoding categorical data
+ *
+ * @param poiCategories poi ids with their corresponding sets of category values
+ * @param spark the Spark session
+ * @return the one-hot encoded DataFrame (with a "features" column) and the raw 0/1 matrix per poi
+ */
+ def oneHotEncoding(poiCategories: RDD[(Long, Set[String])], spark: SparkSession): (DataFrame, Array[Array[Int]]) = {
+ // create a set to contain all categories
+ var set = scala.collection.mutable.Set[String]()
+ // put all categories to set
+ poiCategories.collect().foreach(x => x._2.foreach(y => set += y))
+ // create columns base on the length of set
+ val numPOIS = poiCategories.count().toInt // Array.ofDim only accept Int
+ val categoryArray = set.toArray
+ val oneHotMatrix = Array.ofDim[Int](numPOIS, categoryArray.length + 1) // one column keep poi id
+ // initialize distance matrix, collect first needed
+ var i = 0
+ poiCategories.collect().foreach(x =>
+ {
+ oneHotMatrix(i)(0) = x._1.toInt
+ for (j <- 1 until categoryArray.length + 1) {
+ oneHotMatrix(i)(j) = 0
+ }
+ x._2.foreach(y =>
+ { // encode corresponding category value to 1
+ oneHotMatrix(i)(categoryArray.indexOf(y) + 1) = 1
+ })
+ i += 1
+ })
+ // vector keep all StructField
+ val fields = Array.ofDim[StructField](categoryArray.length + 1)
+ val featureColumns = Array.ofDim[String](categoryArray.length + 1)
+ // keep other columns with integer type
+ for (i <- 0 until categoryArray.length + 1) {
+ fields(i) = StructField(i.toString, IntegerType, true)
+ featureColumns(i) = i.toString
+ }
+ val schema = new StructType(fields)
+ val oneHotEncodedRDD = spark.sparkContext.parallelize(oneHotMatrix).map(x => Row.fromSeq(x.toList))
+ val oneHotEncodedDF = spark.createDataFrame(oneHotEncodedRDD, schema)
+ // set up 'features' column
+ val assemblerFeatures = new VectorAssembler().setInputCols(featureColumns.slice(1, featureColumns.length)).setOutputCol("features")
+ val transformedDf = assemblerFeatures.transform(oneHotEncodedDF)
+ (transformedDf, oneHotMatrix)
+ }
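Illustrative only (the column order depends on the set iteration): for poiCategories containing (1L, Set("cafe")) and (2L, Set("cafe", "bar")), with category columns ordered as (cafe, bar), the matrix holds one row per poi with the id in column 0:

    // Array(1, 1, 0)   poi 1: cafe only
    // Array(2, 1, 1)   poi 2: cafe and bar
    // the VectorAssembler then packs the non-id columns into the "features" vector column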
+
+ /**
+ * word2Vec encoding
+ *
+ * @param poiCategories category ids with corresponding category values
+ * @param spark
+ * @return word2Vec encoded categories for each poi in DataFrame
+ */
+ def wordVectorEncoder(poiCategories: RDD[(Long, Set[String])], spark: SparkSession): (DataFrame, RDD[(Int, Array[Double])]) = {
+ val word2vec = new Word2Vec().setInputCol("inputCol").setMinCount(1)
+ val schema = StructType(StructField("inputCol", ArrayType(StringType, true), true) :: Nil)
+ val df = spark.createDataFrame(poiCategories.map(f => Row(f._2.map(x => x.toString).toArray)), schema)
+ val wordVectorsRDD = word2vec.fit(df).getVectors.select("word", "vector").rdd
+ val vectors = wordVectorsRDD.map(f => (f.getString(0), f.getAs[org.apache.spark.ml.linalg.DenseVector](1)))
+ val categoryVectors = vectors.collectAsMap()
+ val poiCategoryVectors = poiCategories.map(f => (f._1, f._2.map(x => categoryVectors.get(x).head.toArray)))
+ val poiVector = poiCategoryVectors.map(f => (f._1, f._2.size, f._2.toArray.toList.transpose.map(_.sum).toArray))
+ val leng = poiVector.take(1)(0)._2
+ val poiAvgVector = poiVector.map(x => (x._1.toInt, x._3.map(y => y / x._2)))
+ val fields = Array.ofDim[StructField](leng + 1)
+ val featureColumns = Array.ofDim[String](leng + 1)
+ // keep other columns with integer type
+ fields(0) = StructField("id", IntegerType, true)
+ featureColumns(0) = "id"
+ for (i <- 1 until leng + 1) {
+ fields(i) = StructField(i.toString, DoubleType, true)
+ featureColumns(i) = i.toString
+ }
+ val schema2 = new StructType(fields)
+ val poiAvgVectorDF = spark.createDataFrame(poiAvgVector.map(x => Row.fromSeq(x._1 +: x._2)), schema2)
+ val assemblerFeatures = new VectorAssembler().setInputCols(featureColumns.slice(1, featureColumns.length)).setOutputCol("features")
+ val transformedDf = assemblerFeatures.transform(poiAvgVectorDF)
+ (transformedDf, poiAvgVector)
+ }
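Illustrative only, with hypothetical 3-dimensional Word2Vec vectors: each poi is represented by the element-wise average of its category vectors, e.g.

    // v("cafe") = [0.2, 0.4, 0.0], v("bar") = [0.0, 0.2, 0.6]
    // a poi tagged {cafe, bar}  ->  [0.1, 0.3, 0.3]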
+
+ /**
+ * multiple dimensional encoding
+ *
+ * @param distancePairs distance between pair of pois
+ * @param numPOIS number of pois
+ * @param dimension mapped coordinate dimension
+ * @param spark
+ * @return encoded coordinates for each poi in DataFrame
+ */
+ def mdsEncoding(distancePairs: RDD[(Long, Long, Double)], numPOIS: Int, dimension: Int, spark: SparkSession): (DataFrame, Array[(Long, Array[Double])]) = {
+ val poi2Coordinates = new MultiDS().multiDimensionScaling(distancePairs, numPOIS, dimension)
+ val poi2Coordinates2 = poi2Coordinates.map(x => x._1.toInt :: x._2.toList)
+ // create schema
+ val fields = Array.ofDim[StructField](dimension + 1)
+ val featureColumns = Array.ofDim[String](dimension + 1)
+ fields(0) = StructField("id", IntegerType, true)
+ featureColumns(0) = "id"
+ for (i <- 1 until dimension + 1) {
+ fields(i) = StructField(i.toString, DoubleType, true)
+ featureColumns(i) = i.toString
+ }
+ val schema = new StructType(fields)
+ val coordinatesRDD = spark.sparkContext.parallelize(poi2Coordinates2.toSeq).map(x => Row.fromSeq(x))
+ val coordinatesDF = spark.createDataFrame(coordinatesRDD, schema)
+ val assembler = new VectorAssembler().setInputCols(featureColumns.slice(1, featureColumns.length)).setOutputCol("features")
+ val featureData = assembler.transform(coordinatesDF)
+ (featureData, poi2Coordinates)
+ }
+}
+
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/FirstHardeninginBorderFlow.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/FirstHardeninginBorderFlow.scala
similarity index 84%
rename from sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/FirstHardeninginBorderFlow.scala
rename to sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/FirstHardeninginBorderFlow.scala
index f22f1f9..eee7c7b 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/FirstHardeninginBorderFlow.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/FirstHardeninginBorderFlow.scala
@@ -1,31 +1,34 @@
-package net.sansa_stack.ml.spark.clustering
+package net.sansa_stack.ml.spark.clustering.algorithms
+
+import java.io._
+import java.io.{ ByteArrayInputStream, FileNotFoundException, FileReader, IOException, StringWriter }
+import java.lang.{ Long => JLong }
+import java.net.URI
-import org.apache.spark.rdd.RDD
-import org.apache.spark.graphx.{ Graph, EdgeDirection }
import scala.math.BigDecimal
-import org.apache.spark.sql.SparkSession
import scala.reflect.runtime.universe._
-import scopt.OptionParser
+import scala.util.control.Breaks._
+
+import breeze.linalg.{ squaredDistance, DenseVector, Vector }
+import org.apache.jena.graph.Node
import org.apache.log4j.{ Level, Logger }
-import org.apache.spark.mllib.util.MLUtils
-import java.io.{ FileReader, FileNotFoundException, IOException }
+import org.apache.spark.graphx._
+import org.apache.spark.graphx.{ EdgeDirection, Graph }
import org.apache.spark.mllib.linalg.Vectors
-import java.lang.{ Long => JLong }
-import breeze.linalg.{ squaredDistance, DenseVector, Vector }
-import scala.util.control.Breaks._
-import java.io.ByteArrayInputStream
+import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.PairRDDFunctions
-import java.io.StringWriter
-import java.io._
-import java.net.URI
-import org.apache.spark.graphx._
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.SparkSession
+
+
+
object FirstHardeninginBorderFlow {
- def apply(spark: SparkSession, graph: Graph[String, String], output: String, outputeval: String) = {
+ def apply(spark: SparkSession, graph: Graph[Node, Node], output: String, outputeval: String): Unit = {
/**
- *
+ *
* Jaccard similarity measure : selectYourSimilarity = 0
* Batet similarity measure : selectYourSimilarity = 1
* Rodríguez and Egenhofer similarity measure : selectYourSimilarity = 2
@@ -39,9 +42,9 @@ object FirstHardeninginBorderFlow {
graphXinBorderFlow(selectYourSimilarity)
}
- /*
- * Computes different similarities function for a given graph @graph.
- */
+ /**
+ * Computes different similarity functions for a given graph @graph.
+ */
def graphXinBorderFlow(f: Int): List[List[Long]] = {
val edge = graph.edges
@@ -55,7 +58,8 @@ object FirstHardeninginBorderFlow {
val x = f._1
x
})
-
+ println("hard")
+ sort.foreach(println)
var X = sort.collect()
neighborSort.unpersist()
@@ -63,9 +67,9 @@ object FirstHardeninginBorderFlow {
val neighborcollect = neighbor.collect()
val verticescollect = graph.vertices.collect()
- /*
- * finding neighbors for node a
- */
+ /**
+ * Finds the neighbors of node a
+ */
def findneighbors(a: VertexId): Array[VertexId] = {
var b: Array[VertexId] = Array()
@@ -80,15 +84,15 @@ object FirstHardeninginBorderFlow {
b
}
- /*
- * Computing logarithm based 2
- */
+ /**
+ * Computes the base-2 logarithm
+ */
val LOG2 = math.log(2)
val log2 = { x: Double => math.log(x) / LOG2 }
- /*
- * Difference between two set of vertices, used in different similarity measures
- */
+ /**
+ * Difference between two sets of vertices, used in the different similarity measures
+ */
def difference(a: Array[VertexId], b: Array[VertexId]): Double = {
if (a.length == 0) { return 0.0 }
@@ -97,9 +101,9 @@ object FirstHardeninginBorderFlow {
differ.size.toDouble
}
- /*
- * Intersection of two set of vertices, used in different similarity measures
- */
+ /**
+ * Intersection of two sets of vertices, used in the different similarity measures
+ */
def intersection(a: Array[VertexId], b: Array[VertexId]): Double = {
if ((a.length == 0) || (b.length == 0)) { return 0.0 }
val rst = a.intersect(b)
@@ -107,9 +111,9 @@ object FirstHardeninginBorderFlow {
rst.size.toDouble
}
- /*
- * Union of two set of vertices, used in different similarity measures
- */
+ /**
+ * Union of two sets of vertices, used in the different similarity measures
+ */
def union(a: Array[VertexId], b: Array[VertexId]): Double = {
val rst = a.union(b)
@@ -117,17 +121,17 @@ object FirstHardeninginBorderFlow {
rst.size.toDouble
}
- /*
- * similarity measures
- */
+ /**
+ * similarity measures
+ */
def selectSimilarity(a: Array[VertexId], b: Array[VertexId], c: Int): Double = {
var s = 0.0
if (c == 0) {
- /*
- * Jaccard similarity measure
- */
+ /**
+ * Jaccard similarity measure
+ */
val sim = intersection(a, b) / union(a, b).toDouble
if (sim == 0.0) { s = (1 / vertex) }
@@ -137,9 +141,9 @@ object FirstHardeninginBorderFlow {
if (c == 1) {
- /*
- * Rodríguez and Egenhofer similarity measure
- */
+ /**
+ * Rodríguez and Egenhofer similarity measure
+ */
var g = 0.8
@@ -149,9 +153,9 @@ object FirstHardeninginBorderFlow {
}
if (c == 2) {
- /*
- * The Ratio model similarity
- */
+ /**
+ * The Ratio model similarity
+ */
var alph = 0.5
var beth = 0.5
@@ -162,15 +166,14 @@ object FirstHardeninginBorderFlow {
}
if (c == 3) {
- /*
- * Batet similarity measure
- */
+ /**
+ * Batet similarity measure
+ */
val cal = 1 + ((difference(a, b) + difference(b, a)) / (difference(a, b) + difference(b, a) + intersection(a, b))).abs
val sim = log2(cal.toDouble)
if (sim == 0.0) { s = (1 / vertex) }
else { s = sim }
-
}
s
}
@@ -222,7 +225,7 @@ object FirstHardeninginBorderFlow {
val sortsim = sumsimilarity(X)
- //println(s"sortsim: $sortsim\n")
+ // println(s"sortsim: $sortsim\n")
var node = sortsim.map(f => {
f._1
@@ -232,7 +235,7 @@ object FirstHardeninginBorderFlow {
neighbor.unpersist()
- //computing F(X) for BorderFlow
+ // computing F(X) for BorderFlow
def fX(x: List[Long]): Double = {
var jaccardX = 0.0
@@ -276,13 +279,13 @@ object FirstHardeninginBorderFlow {
b.map(bi => {
x.map(xj => {
- if (bi.!=(xj)) { jaccardX = jaccardX.+(findingSimilarity(bi, xj).abs) }
+ if (bi.!=(xj)) { jaccardX = jaccardX. + (findingSimilarity(bi, xj).abs) }
})
})
b.map(bi => {
n.map(nj => {
- jaccardN = jaccardN.+(findingSimilarity(bi, nj).abs)
+ jaccardN = jaccardN. + (findingSimilarity(bi, nj).abs)
})
})
@@ -309,16 +312,16 @@ object FirstHardeninginBorderFlow {
val n = listOfN(x)
var jaccardNU = 0.0
n.map(ni => {
- if (ni.!=(u)) { jaccardNU = jaccardNU.+(findingSimilarity(u, ni).abs) }
+ if (ni.!=(u)) { jaccardNU = jaccardNU. + (findingSimilarity(u, ni).abs) }
})
jaccardNU
}
- /*
- * Use Non-Heuristics(normal) method for producing clusters.
- */
+ /**
+ * Uses the non-heuristic (normal) method for producing clusters.
+ */
def nonHeuristicsCluster(a: List[Long], d: List[Long]): List[Long] = {
var nj: List[Long] = List()
@@ -399,22 +402,20 @@ object FirstHardeninginBorderFlow {
}
- /*
- *
- * Input for nonHeuristics nonHeuristicsCluster(element,List()) .
- */
+ /**
+ * Input for nonHeuristicsCluster(element, List()).
+ */
def makerdf(a: List[Long]): List[String] = {
var listuri: List[String] = List()
val b: List[VertexId] = a
for (i <- 0 until b.length) {
verticescollect.map(v => {
- if (b(i) == v._1) listuri = listuri.::(v._2)
+ if (b(i) == v._1) listuri = listuri.::(v._2.toString())
})
}
listuri
-
}
def makeClusters(a: Long): List[Long] = {
@@ -453,13 +454,13 @@ object FirstHardeninginBorderFlow {
} while (node.size > 0)
neighborSort.unpersist()
- //println(s"RDF Cluster assignments: $rdfcluster\n")
+ // println(s"RDF Cluster assignments: $rdfcluster\n")
val rdfRDD = spark.sparkContext.parallelize(rdfcluster)
rdfRDD.saveAsTextFile(output)
- /*
- * Sillouhette Evaluation
- */
+ /**
+ * Silhouette evaluation
+ */
def avgA(c: List[Long], d: Long): Double = {
var sumA = 0.0
@@ -530,7 +531,7 @@ object FirstHardeninginBorderFlow {
val evaluate = AiBi(bigList, nnode)
val av = evaluate.sum / evaluate.size
- //println(s"average: $av\n")
+ // println(s"average: $av\n")
val evaluateString: List[String] = List(av.toString())
val evaluateStringRDD = spark.sparkContext.parallelize(evaluateString)
@@ -540,8 +541,6 @@ object FirstHardeninginBorderFlow {
}
val rdf = clusterRdd()
- //println(s"RDF Cluster assignments: $rdf\n")
-
+ // println(s"RDF Cluster assignments: $rdf\n")
}
-
}
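The similarity branches above (Jaccard, Rodríguez/Egenhofer, Ratio model, Batet) all operate on the neighbor sets returned by `findneighbors`. For reference, a minimal standalone sketch of the Jaccard case only, with hypothetical neighbor arrays standing in for `findneighbors(a)` / `findneighbors(b)`:

```scala
import org.apache.spark.graphx.VertexId

// Jaccard similarity over neighbor sets: |A ∩ B| / |A ∪ B|
def jaccard(a: Array[VertexId], b: Array[VertexId]): Double =
  if (a.isEmpty || b.isEmpty) 0.0
  else a.intersect(b).distinct.length.toDouble / a.union(b).distinct.length.toDouble

val neighborsOfA: Array[VertexId] = Array(1L, 2L, 3L)
val neighborsOfB: Array[VertexId] = Array(2L, 3L, 4L)
println(jaccard(neighborsOfA, neighborsOfB)) // 2 shared of 4 distinct vertices = 0.5
```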
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/Kmeans.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/Kmeans.scala
new file mode 100644
index 0000000..a90f876
--- /dev/null
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/Kmeans.scala
@@ -0,0 +1,28 @@
+package net.sansa_stack.ml.spark.clustering.algorithms
+
+import org.apache.spark.ml.clustering.KMeans
+import org.apache.spark.sql._
+import org.apache.spark.sql.SparkSession
+
+class Kmeans {
+
+ /**
+ * K-means clustering based on given Dataframe
+ *
+ * @param numClusters number of clusters
+ * @param maxIter maximum number of iterations
+ * @param df input DataFrame with a "features" vector column
+ * @param spark SparkSession
+ * @return cluster id and corresponding pois in cluster
+ */
+ def kmClustering(numClusters: Int, maxIter: Int, df: DataFrame, spark: SparkSession): Map[Int, Array[Long]] = {
+ val km = new KMeans().setK(numClusters).setMaxIter(maxIter).setSeed(1L).setFeaturesCol("features").setPredictionCol("prediction")
+ val model = km.fit(df)
+ val transformedDataFrame = model.transform(df)
+ import spark.implicits._
+ // get (cluster_id, poi_id)
+ val clusterIdPoi = transformedDataFrame.map(f => (f.getInt(f.size - 1), f.getInt(0).toLong)).rdd.groupByKey()
+ val clustersMDSKM = clusterIdPoi.map(x => (x._1, x._2.toArray)).collectAsMap().toMap
+ clustersMDSKM
+ }
+}
+
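A minimal usage sketch of the new `Kmeans` wrapper, run spark-shell style with a local `SparkSession`; the toy DataFrame assumes (as `kmClustering` does when reading the transformed rows) that the first column is an integer poi id and that `features` holds the ML vectors:

```scala
import net.sansa_stack.ml.spark.clustering.algorithms.Kmeans
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("KmeansSketch").master("local[*]").getOrCreate()
import spark.implicits._

// first column: integer poi id; "features": the vectors K-means runs on
val df = Seq(
  (1, Vectors.dense(0.0, 0.0)),
  (2, Vectors.dense(0.1, 0.1)),
  (3, Vectors.dense(9.0, 9.0)),
  (4, Vectors.dense(9.1, 9.2))
).toDF("poi_id", "features")

// two clusters are expected here: {1, 2} and {3, 4}
val clusters: Map[Int, Array[Long]] =
  new Kmeans().kmClustering(numClusters = 2, maxIter = 10, df = df, spark = spark)
clusters.foreach { case (id, members) => println(s"cluster $id -> ${members.mkString(", ")}") }
```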
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/MultiDS.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/MultiDS.scala
new file mode 100644
index 0000000..a9cc699
--- /dev/null
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/MultiDS.scala
@@ -0,0 +1,50 @@
+package net.sansa_stack.ml.spark.clustering.algorithms
+
+import org.apache.spark.rdd._
+import smile.mds.MDS
+
+class MultiDS {
+
+ /**
+ * Multi-dimensional scaling (MDS)
+ * Generates n-dimensional coordinates from the input distance matrix
+ *
+ * @param distancePairs distances between pairs of pois
+ * @param numPOIS number of pois
+ * @param dimension dimension of the generated coordinates
+ * @return poi id and its coordinates in the given dimension
+ */
+ def multiDimensionScaling(distancePairs: RDD[(Long, Long, Double)], numPOIS: Int, dimension: Int): Array[(Long, Array[Double])] = {
+ // vector keeps track of the pois recorded so far
+ var vector = Array.ofDim[Long](numPOIS)
+ // positive symmetric distance matrix
+ var distanceMatrix = Array.ofDim[Double](numPOIS, numPOIS)
+ // initialize distance matrix
+ for (i <- 0 until numPOIS) {
+ vector(i) = 0
+ for (j <- 0 until numPOIS) {
+ distanceMatrix(i)(j) = 0.0
+ }
+ }
+ var i = 0
+ distancePairs.collect().foreach(x => {
+ if (!vector.contains(x._1)) { // if there is no record for this poi
+ vector(i) = x._1
+ i += 1
+ }
+ if (!vector.contains(x._2)) { // if there is no record for this poi
+ vector(i) = x._2
+ i += 1
+ }
+ val i1 = vector.indexOf(x._1) // get the index as x-y axis for matrix
+ val i2 = vector.indexOf(x._2) // get the index as x-y axis for matrix
+ distanceMatrix(i1)(i2) = x._3
+ distanceMatrix(i2)(i1) = x._3
+ })
+ // create coordinates
+ val mds = new MDS(distanceMatrix, dimension, true)
+ mds.getCoordinates.zip(vector).map(x => (x._2, x._1))
+ }
+}
+
+
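A small sketch of how `MultiDS` might be driven, using a hand-made distance RDD for three hypothetical pois; it assumes the `smile` MDS dependency resolved by the build above and a local `SparkSession`:

```scala
import net.sansa_stack.ml.spark.clustering.algorithms.MultiDS
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("MdsSketch").master("local[*]").getOrCreate()

// symmetric pairwise distances between three pois; unspecified pairs stay 0.0
val distances = spark.sparkContext.parallelize(Seq(
  (1L, 2L, 0.5),
  (1L, 3L, 2.0),
  (2L, 3L, 1.8)
))

// embed the pois into 2-dimensional coordinates
val coords = new MultiDS().multiDimensionScaling(distances, numPOIS = 3, dimension = 2)
coords.foreach { case (poi, xy) => println(s"poi $poi -> ${xy.mkString("[", ", ", "]")}") }
```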
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/PIC.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/PIC.scala
new file mode 100644
index 0000000..3688318
--- /dev/null
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/PIC.scala
@@ -0,0 +1,34 @@
+package net.sansa_stack.ml.spark.clustering.algorithms
+
+import org.apache.spark.graphx.Edge
+import org.apache.spark.graphx.Graph
+import org.apache.spark.mllib.clustering.PowerIterationClustering
+import org.apache.spark.rdd._
+import org.apache.spark.sql._
+
+
+class PIC {
+
+ /**
+ * Power Iteration clustering algorithm from the Spark standard library
+ */
+ def picSparkML(pairwisePOISimilarity: RDD[(Long, Long, Double)], numCentroids: Int, numIterations: Int, sparkSession: SparkSession): Map[Int, Array[Long]] = {
+ val model = new PowerIterationClustering().setK(numCentroids).setMaxIterations(numIterations).setInitializationMode("degree").run(pairwisePOISimilarity)
+ val clusters = model.assignments.collect().groupBy(_.cluster).mapValues(_.map(_.id))
+ clusters
+ }
+ /**
+ * Power Iteration clustering using the implementation from SANSA
+ */
+ def picSANSA(pairwisePOISimilarity: RDD[(Long, Long, Double)], numCentroids: Int, numIterations: Int, sparkSession: SparkSession): Unit = {
+ val verticeS = pairwisePOISimilarity.map(f => f._1)
+ val verticeD = pairwisePOISimilarity.map(f => f._2)
+ val indexedMap = verticeS.union(verticeD).distinct().zipWithIndex()
+ val vertices = indexedMap.map(f => (f._2, f._1))
+ val edges = pairwisePOISimilarity.map(f => Edge(f._1, f._2, f._3)) // build weighted edges from the similarity triples
+ val similarityGraph = Graph(vertices, edges)
+ // val model = new RDFGraphPICClustering(sparkSession, similarityGraph, numCentroids, numIterations)
+ }
+}
+
+
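A usage sketch for `picSparkML` on a tiny hand-built similarity RDD (hypothetical values); two dense groups joined by one weak bridge should come back as two clusters:

```scala
import net.sansa_stack.ml.spark.clustering.algorithms.PIC
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("PicSketch").master("local[*]").getOrCreate()

// (src, dst, similarity) triples: two dense groups plus one weak bridge
val similarities = spark.sparkContext.parallelize(Seq(
  (0L, 1L, 0.9), (1L, 2L, 0.9), (2L, 0L, 0.9),
  (3L, 4L, 0.8), (4L, 5L, 0.8), (5L, 3L, 0.8),
  (0L, 3L, 0.01)
))

val clusters: Map[Int, Array[Long]] =
  new PIC().picSparkML(similarities, numCentroids = 2, numIterations = 20, sparkSession = spark)
clusters.foreach { case (c, ids) => println(s"cluster $c -> ${ids.sorted.mkString(", ")}") }
```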
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/RDFByModularityClustering.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/RDFByModularityClustering.scala
similarity index 88%
rename from sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/RDFByModularityClustering.scala
rename to sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/RDFByModularityClustering.scala
index 55e508c..47a498e 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/RDFByModularityClustering.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/RDFByModularityClustering.scala
@@ -1,19 +1,21 @@
-package net.sansa_stack.ml.spark.clustering
+package net.sansa_stack.ml.spark.clustering.algorithms
+
+import java.io.StringWriter
+
+import scala.util.control.Breaks._
import org.apache.log4j.{ Level, Logger }
+import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
-import org.apache.spark.{ SparkConf, SparkContext }
-import scala.util.control.Breaks._
-import java.io.StringWriter
/**
* Created by hpetzka on 09.11.2016.
*/
object RDFByModularityClustering {
- def apply(sc: SparkContext, numIterations: Int, graphFile: String, outputFile: String) = {
+ def apply(sc: SparkContext, numIterations: Int, graphFile: String, outputFile: String): Unit = {
// DEFAULT INPUT
// val (numIterations, graphFile) = (100 , "C:/Users/hpetzka/IdeaProjects/Clustering_in_Spark/Graphs/testRDF.txt")
@@ -39,7 +41,7 @@ object RDFByModularityClustering {
If weighted edges come in, one probably needs a map as follows
val adjacencyMatrix: Array[Array[Int]] = Array.ofDim[Int](numVertices, numVertices)
var adjacencies: Map[(String, String), Int] = Map[(String, String),Int]()
- for (x <- edgesRDD.collect()){
+ for (x <- edgesRDD.collect()) {
// TODO add the weights here if they exist
if(x(0) < x(1)) adjacencies += ( (x(0),x(1)) -> 1 )
else adjacencies += ( (x(1),x(0)) -> 1 )
@@ -117,11 +119,12 @@ object RDFByModularityClustering {
}
- def iterationStepClusteringRDFByModularity(numEdges: Long,
- edgesBC: Broadcast[Array[(String, String)]],
- vertexDegreesBC: Broadcast[Map[String, Int]],
- clusterMapRDD: RDD[List[String]],
- sc: SparkContext): (RDD[List[String]], Boolean) = {
+ def iterationStepClusteringRDFByModularity(
+ numEdges: Long,
+ edgesBC: Broadcast[Array[(String, String)]],
+ vertexDegreesBC: Broadcast[Map[String, Int]],
+ clusterMapRDD: RDD[List[String]],
+ sc: SparkContext): (RDD[List[String]], Boolean) = {
// Start iteration
// The following RDD contains distinct pairs of clusters for which there is an edge between them
@@ -172,11 +175,12 @@ object RDFByModularityClustering {
// The function that computes delta Q for the merge of two clusters
- def deltaQ(numEdges: Long,
- vertexDegreesBC: Broadcast[Map[String, Int]],
- edgesBC: Broadcast[Array[(String, String)]],
- clusterI: List[String],
- clusterJ: List[String]): Double = {
+ def deltaQ(
+ numEdges: Long,
+ vertexDegreesBC: Broadcast[Map[String, Int]],
+ edgesBC: Broadcast[Array[(String, String)]],
+ clusterI: List[String],
+ clusterJ: List[String]): Double = {
val clusterPairs: List[(String, String)] = clusterI.flatMap(x => clusterJ.map(y => (x, y)))
@@ -189,12 +193,9 @@ object RDFByModularityClustering {
1.0 / numEdges * summand.fold(0.0)((a: Double, b: Double) => a - b)
}
- def WriteToFile[T](rdd: RDD[T], file: String, coalesce: (Boolean, Int) = (false, 0)) =
+ def WriteToFile[T](rdd: RDD[T], file: String, coalesce: (Boolean, Int) = (false, 0)): Unit =
coalesce._1 match {
- case true => rdd.coalesce(coalesce._2).saveAsTextFile(file)
+ case true => rdd.coalesce(coalesce._2).saveAsTextFile(file)
case false => rdd.saveAsTextFile(file)
}
-
}
-
-
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/RDFGraphPowerIterationClustering.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/RDFGraphPowerIterationClustering.scala
similarity index 87%
rename from sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/RDFGraphPowerIterationClustering.scala
rename to sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/RDFGraphPowerIterationClustering.scala
index 15a3234..e49c871 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/RDFGraphPowerIterationClustering.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/RDFGraphPowerIterationClustering.scala
@@ -1,46 +1,36 @@
-package net.sansa_stack.ml.spark.clustering
+package net.sansa_stack.ml.spark.clustering.algorithms
-import scala.reflect.runtime.universe._
-import scopt.OptionParser
-import org.apache.log4j.{ Level, Logger }
-import org.apache.spark.mllib.util.MLUtils
-import java.io.{ FileReader, FileNotFoundException, IOException }
-import org.apache.spark.mllib.linalg.Vectors
-import java.lang.{ Long => JLong }
+import java.io._
+import java.io.{ ByteArrayInputStream, FileNotFoundException, FileReader, IOException, StringWriter }
import java.lang.{ Long => JLong }
+import java.net.URI
+
+import scala.collection.mutable
+import scala.math.BigDecimal
+import scala.reflect.runtime.universe._
+
import breeze.linalg.{ squaredDistance, DenseVector, Vector }
-import org.apache.spark.sql.SparkSession
-import org.apache.spark.graphx.GraphLoader
+import org.apache.commons.math3.util.MathUtils
import org.apache.jena.datatypes.{ RDFDatatype, TypeMapper }
-import org.apache.jena.graph.{ Node => JenaNode, Triple => JenaTriple, _ }
-import org.apache.jena.riot.writer.NTriplesWriter
+import org.apache.jena.graph.{ Node => JenaNode, Node_ANY, Node_Blank, Node_Literal, Node_URI, Triple => JenaTriple, _ }
import org.apache.jena.riot.{ Lang, RDFDataMgr }
-import org.apache.jena.graph.{ Node_ANY, Node_Blank, Node_Literal, Node_URI, Node => JenaNode, Triple => JenaTriple }
+import org.apache.jena.riot.writer.NTriplesWriter
import org.apache.jena.vocabulary.RDF
-import java.io.ByteArrayInputStream
-import org.apache.spark.rdd.PairRDDFunctions
+import org.apache.log4j.{ Level, Logger }
import org.apache.spark.SparkContext._
import org.apache.spark.graphx._
+import org.apache.spark.graphx.{ EdgeDirection, Graph, GraphLoader }
+import org.apache.spark.mllib.clustering.{ PowerIterationClustering, PowerIterationClusteringModel }
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.util.MLUtils
+import org.apache.spark.rdd.PairRDDFunctions
import org.apache.spark.rdd.RDD
-import java.io.StringWriter
-import java.io._
-import org.apache.spark.rdd.RDD
-import org.apache.spark.mllib.clustering.{ PowerIterationClusteringModel, PowerIterationClustering }
-import org.apache.spark.graphx.{ Graph, EdgeDirection }
-import scala.math.BigDecimal
-import org.apache.commons.math3.util.MathUtils
import org.apache.spark.sql.SparkSession
-import org.apache.spark.graphx._
-import java.net.URI
import org.apache.spark.storage.StorageLevel
-import org.apache.spark.graphx._
-import scala.collection.mutable
object RDFGraphPowerIterationClustering {
- def apply(spark: SparkSession, graph: Graph[String, String], output: String, k: Int = 2, maxIterations: Int = 5) = {
-
-
+ def apply(spark: SparkSession, graph: Graph[String, String], output: String, k: Int = 2, maxIterations: Int = 5): RDD[(Int, String)] = {
def clusterRdd(): RDD[(Int, String)] = {
SimilaritesInPIC()
@@ -48,16 +38,16 @@ object RDFGraphPowerIterationClustering {
def SimilaritesInPIC(): RDD[(Int, String)] = {
- /*
- * Collect all the edges of the graph
- */
+ /**
+ * Collect all the edges of the graph
+ */
val edge = graph.edges
val nodes = graph.vertices
- /*
- * Collect distinct vertices of the graph
- *
- */
+ /**
+ * Collect distinct vertices of the graph
+ *
+ */
val node = nodes.map(e => (e._1))
@@ -94,9 +84,9 @@ object RDFGraphPowerIterationClustering {
def model = pic.run(weightedGraph)
- /*
- * Cluster the graph data into two classes using PowerIterationClustering
- */
+ /**
+ * Cluster the graph data into two classes using PowerIterationClustering
+ */
def run() = model
val modelAssignments = model.assignments
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/SilviaClustering.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/SilviaClustering.scala
similarity index 91%
rename from sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/SilviaClustering.scala
rename to sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/SilviaClustering.scala
index d370a29..5aa3313 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/SilviaClustering.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/SilviaClustering.scala
@@ -1,36 +1,31 @@
-package net.sansa_stack.ml.spark.clustering
+package net.sansa_stack.ml.spark.clustering.algorithms
+
+import java.io._
+import java.io.{ ByteArrayInputStream, FileNotFoundException, FileReader, IOException, StringWriter }
+import java.lang.{ Long => JLong }
+import java.net.URI
-import org.apache.spark.rdd.RDD
-import org.apache.spark.graphx.{ Graph, EdgeDirection }
import scala.math.BigDecimal
-import org.apache.spark.sql.SparkSession
import scala.reflect.runtime.universe._
-import scopt.OptionParser
-import org.apache.log4j.{ Level, Logger }
-import org.apache.spark.mllib.util.MLUtils
-import java.io.{ FileReader, FileNotFoundException, IOException }
-import org.apache.spark.mllib.linalg.Vectors
-import java.lang.{ Long => JLong }
-import java.lang.{ Long => JLong }
-import breeze.linalg.{ squaredDistance, DenseVector, Vector }
-import org.apache.spark.sql.SparkSession
-import org.apache.spark.graphx.GraphLoader
import scala.util.control.Breaks._
+
+import breeze.linalg.{ squaredDistance, DenseVector, Vector }
+import net.sansa_stack.rdf.spark.model.graph._
+import org.apache.jena.graph.{ Node, Triple }
import org.apache.jena.riot.{ Lang, RDFDataMgr }
-import java.io.ByteArrayInputStream
-import org.apache.spark.rdd.PairRDDFunctions
+import org.apache.log4j.{ Level, Logger }
import org.apache.spark.SparkContext._
import org.apache.spark.graphx._
-import java.io.StringWriter
-import java.io._
-import org.apache.jena.graph.{ Node, Triple }
-import org.apache.jena.riot.Lang
-import net.sansa_stack.rdf.spark.model.graph._
-import java.net.URI
+import org.apache.spark.graphx.{ EdgeDirection, Graph, GraphLoader }
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.util.MLUtils
+import org.apache.spark.rdd.PairRDDFunctions
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.SparkSession
object SilviaClustering {
- def apply(spark: SparkSession, graph: Graph[String, String], output: String, outputeval: String) = {
+ def apply(spark: SparkSession, graph: Graph[String, String], output: String, outputeval: String): Unit = {
Logger.getRootLogger.setLevel(Level.WARN)
@@ -40,7 +35,7 @@ object SilviaClustering {
*
* Jaccard similarity measure : selectYourSimilarity = 0
* Batet similarity measure : selectYourSimilarity = 1
- * Rodríguez and Egenhofer similarity measure : selectYourSimilarity = 2
+ * Rodriguez and Egenhofer similarity measure : selectYourSimilarity = 2
* The Contrast model similarity : selectYourSimilarity = 3
* The Ratio model similarity : selectYourSimilarity = 4
*/
@@ -48,12 +43,13 @@ object SilviaClustering {
val selectYourSimilarity = 0
def clusterRdd(): RDD[List[String]] = {
+ val a = graph.triplets
graphXinBorderFlow(graph, orient, selectYourSimilarity)
}
- /*
- * Computes different similarities function for a given graph @graph.
- */
+ /**
+ * Computes the different similarity functions for the given graph @graph.
+ */
def graphXinBorderFlow(graph: Graph[String, String], e: Int, f: Int): RDD[List[String]] = {
val edge = graph.edges.collect()
@@ -77,9 +73,9 @@ object SilviaClustering {
val LOG2 = math.log(2)
val log2 = { x: Double => math.log(x) / LOG2 }
- /*
- * Difference between two set of vertices, used in different similarity measures
- */
+ /**
+ * Difference between two sets of vertices, used in different similarity measures
+ */
def difference(a: Long, b: Long): Double = {
val ansec = neighbor.lookup(a).distinct.head.toSet
val ansec1 = neighbor.lookup(b).distinct.head.toSet
@@ -90,9 +86,9 @@ object SilviaClustering {
differ.size.toDouble
}
- /*
- * Intersection of two set of vertices, used in different similarity measures
- */
+ /**
+ * Intersection of two sets of vertices, used in different similarity measures
+ */
def intersection(a: Long, b: Long): Double = {
val inters = neighbor.lookup(a).distinct.head.toList
val inters1 = neighbor.lookup(b).distinct.head.toList
@@ -106,9 +102,9 @@ object SilviaClustering {
rst.size.toDouble
}
- /*
- * Union of two set of vertices, used in different similarity measures
- */
+ /**
+ * Union of two sets of vertices, used in different similarity measures
+ */
def union(a: Long, b: Long): Double = {
val uni = neighbor.lookup(a).distinct.head.toList
val uni1 = neighbor.lookup(b).distinct.head.toList
@@ -124,9 +120,9 @@ object SilviaClustering {
var s = 0.0
if (c == 0) {
- /*
- * Jaccard similarity measure
- */
+ /**
+ * Jaccard similarity measure
+ */
val sim = intersection(a, b) / union(a, b).toDouble
@@ -136,9 +132,9 @@ object SilviaClustering {
if (c == 1) {
- /*
- * Rodríguez and Egenhofer similarity measure
- */
+ /**
+ * Rodríguez and Egenhofer similarity measure
+ */
var g = 0.8
@@ -148,9 +144,10 @@ object SilviaClustering {
}
if (c == 2) {
- /*
- * The Ratio model similarity
- */
+
+ /**
+ * The Ratio model similarity
+ */
var alph = 0.5
var beth = 0.5
@@ -161,9 +158,9 @@ object SilviaClustering {
}
if (c == 3) {
- /*
- * Batet similarity measure
- */
+ /**
+ * Batet similarity measure
+ */
val cal = 1 + ((difference(a, b) + difference(b, a)) / (difference(a, b) + difference(b, a) + intersection(a, b))).abs
val sim = log2(cal.toDouble)
@@ -518,11 +515,8 @@ object SilviaClustering {
result
}
-
val cRdd = clusterRdd()
-
- cRdd.saveAsTextFile(output)
-
+ val zipwithindex = cRdd.zipWithIndex().map(f => (f._2, f._1))
+ zipwithindex.saveAsTextFile(output)
}
-
}
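The changed save path above writes `(index, cluster)` pairs instead of bare clusters. A quick sketch of the resulting line format, assuming an existing `spark` session (e.g. spark-shell):

```scala
// mirror the new SilviaClustering output: zip each cluster with its index, then swap
val clusters = spark.sparkContext.parallelize(Seq(List("uriA", "uriB"), List("uriC")))
clusters.zipWithIndex().map(f => (f._2, f._1)).collect().foreach(println)
// prints:
// (0,List(uriA, uriB))
// (1,List(uriC))
```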
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/AppConfig.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/AppConfig.scala
new file mode 100644
index 0000000..b5a11c5
--- /dev/null
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/AppConfig.scala
@@ -0,0 +1,30 @@
+package net.sansa_stack.ml.spark.clustering.datatypes
+
+case class Spark(master: String,
+ spark_serializer: String,
+ spark_executor_memory: String,
+ spark_driver_memory: String,
+ spark_driver_maxResultSize: String,
+ app_name: String)
+
+case class Clustering(profile: String,
+ pic: String,
+ oneHotKM: String,
+ mdsKM: String,
+ word2VecKM: String,
+ picDistanceMatrix: String,
+ mdsCoordinates: String,
+ oneHotMatrix: String,
+ word2Vec: String)
+
+case class Datasets(input: String,
+ termValueUri: String,
+ termPrefix: String,
+ typePOI: String,
+ coordinatesPredicate: String,
+ categoryPOI: String,
+ poiPrefix: String)
+
+case class AppConfig(dataset: Datasets, clustering: Clustering, spark: Spark)
+
+
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Categories.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Categories.scala
new file mode 100644
index 0000000..bf693e1
--- /dev/null
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Categories.scala
@@ -0,0 +1,7 @@
+package net.sansa_stack.ml.spark.clustering.datatypes
+
+/**
+ * @param categories a set of category values
+ */
+case class Categories(categories: scala.collection.mutable.Set[String])
+
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Cluster.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Cluster.scala
new file mode 100644
index 0000000..d342e65
--- /dev/null
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Cluster.scala
@@ -0,0 +1,10 @@
+package net.sansa_stack.ml.spark.clustering.datatypes
+
+/**
+ * a cluster
+ *
+ * @param cluster_id id of cluster
+ * @param poi_in_cluster an array of pois in cluster
+ */
+case class Cluster(cluster_id: Int, poi_in_cluster: Array[Poi])
+
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Clusters.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Clusters.scala
new file mode 100644
index 0000000..3646c9b
--- /dev/null
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Clusters.scala
@@ -0,0 +1,9 @@
+package net.sansa_stack.ml.spark.clustering.datatypes
+
+/**
+ * @param numOfClusters number of clusters
+ * @param clusterSizes size of each cluster
+ * @param clusters a list of clusters
+ */
+case class Clusters(numOfClusters: Int, clusterSizes: Array[Int], clusters: List[Cluster])
+
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/CoordinatePOI.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/CoordinatePOI.scala
new file mode 100644
index 0000000..b722a73
--- /dev/null
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/CoordinatePOI.scala
@@ -0,0 +1,10 @@
+package net.sansa_stack.ml.spark.clustering.datatypes
+
+/**
+ * a coordinate
+ *
+ * @param longitude
+ * @param latitude
+ */
+case class CoordinatePOI(longitude: Double, latitude: Double)
+
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/DbPOI.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/DbPOI.scala
new file mode 100644
index 0000000..67aa807
--- /dev/null
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/DbPOI.scala
@@ -0,0 +1,14 @@
+package net.sansa_stack.ml.spark.clustering.datatypes
+
+import net.sansa_stack.ml.spark.clustering.datatypes.DbStatusEnum._
+
+case class DbPOI(val poiId: String,
+ val lon: Double,
+ val lat: Double) {
+
+ var dbstatus = UNDEFINED
+ var isDense = false
+ var isBoundary = false
+ var clusterName = ""
+}
+
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/DbStatusEnum.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/DbStatusEnum.scala
new file mode 100644
index 0000000..cceca32
--- /dev/null
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/DbStatusEnum.scala
@@ -0,0 +1,7 @@
+package net.sansa_stack.ml.spark.clustering.datatypes
+
+object DbStatusEnum extends Enumeration {
+
+ type DBSTATUS = Value
+ val UNDEFINED, NOISE, PARTOFCLUSTER = Value
+}
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Distance.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Distance.scala
new file mode 100644
index 0000000..efa7596
--- /dev/null
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Distance.scala
@@ -0,0 +1,9 @@
+package net.sansa_stack.ml.spark.clustering.datatypes
+
+/**
+ * @param poi1
+ * @param poi2
+ * @param distance distance between poi1 and poi2
+ */
+case class Distance(poi1: Long, poi2: Long, distance: Double)
+
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/DistanceMatrix.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/DistanceMatrix.scala
new file mode 100644
index 0000000..27ff94d
--- /dev/null
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/DistanceMatrix.scala
@@ -0,0 +1,4 @@
+package net.sansa_stack.ml.spark.clustering.datatypes
+
+case class DistanceMatrix(distances: List[Distance])
+
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/MdsCoordinate.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/MdsCoordinate.scala
new file mode 100644
index 0000000..87aadfa
--- /dev/null
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/MdsCoordinate.scala
@@ -0,0 +1,4 @@
+package net.sansa_stack.ml.spark.clustering.datatypes
+
+case class MdsCoordinate (poiID: Long, coordinate: Array[Double])
+
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/MdsCoordinates.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/MdsCoordinates.scala
new file mode 100644
index 0000000..5318cc2
--- /dev/null
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/MdsCoordinates.scala
@@ -0,0 +1,4 @@
+package net.sansa_stack.ml.spark.clustering.datatypes
+
+case class MdsCoordinates(coordinates: Array[MdsCoordinate])
+
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/POI.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/POI.scala
new file mode 100644
index 0000000..0c3bba4
--- /dev/null
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/POI.scala
@@ -0,0 +1,14 @@
+package net.sansa_stack.ml.spark.clustering.datatypes
+
+import com.vividsolutions.jts.geom.{Coordinate, GeometryFactory}
+
+class POI(
+ id: String,
+ name: String,
+ val x : Double,
+ val y : Double,
+ keywords: List[String],
+ score: Double,
+ geometryFactory: GeometryFactory
+ ) extends SpatialObject(id, name, keywords, score, geometryFactory.createPoint(new Coordinate(x, y)))
+
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Poi.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Poi.scala
new file mode 100644
index 0000000..2e15ac9
--- /dev/null
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Poi.scala
@@ -0,0 +1,11 @@
+package net.sansa_stack.ml.spark.clustering.datatypes
+
+/**
+ * Poi object representing a point of interest
+ *
+ * @param poi_id id of poi
+ * @param coordinate coordinate of poi
+ * @param categories categories of poi
+ * @param review review score of poi
+ */
+case class Poi(poi_id: Long, coordinate: CoordinatePOI, categories: Categories, review: Double)
+
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/SpatialObject.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/SpatialObject.scala
new file mode 100644
index 0000000..2388fb1
--- /dev/null
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/SpatialObject.scala
@@ -0,0 +1,23 @@
+package net.sansa_stack.ml.spark.clustering.datatypes
+
+import com.vividsolutions.jts.geom.Geometry
+import scala.collection.mutable.HashMap
+
+class SpatialObject(
+ var id: String,
+ var name: String,
+ var keywords: List[String],
+ var score: Double,
+ var geometry: Geometry
+ ) extends Ordered[SpatialObject]{
+
+ var attributes = HashMap[Object, Object]()
+
+ // @Override
+ override def compare(o: SpatialObject ): Int = {
+ if (this.score > o.score) -1
+ else if (this.score == o.score) 0
+ else 1
+ }
+}
+
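A small sketch of the ordering defined by `compare`: higher-scoring objects sort first. The ids, names, scores and coordinates are made up; `GeometryFactory` comes from the JTS dependency already used by `POI`:

```scala
import com.vividsolutions.jts.geom.{ Coordinate, GeometryFactory }
import net.sansa_stack.ml.spark.clustering.datatypes.SpatialObject

val gf = new GeometryFactory()
val a = new SpatialObject("a", "cafe", List("coffee"), 0.3, gf.createPoint(new Coordinate(16.37, 48.21)))
val b = new SpatialObject("b", "museum", List("art"), 0.9, gf.createPoint(new Coordinate(16.36, 48.20)))

// compare() returns -1 for the higher score, so sorting puts the best-scored object first
println(List(a, b).sorted.map(_.id)) // List(b, a)
```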
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/Common.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/Common.scala
new file mode 100644
index 0000000..0044bda
--- /dev/null
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/Common.scala
@@ -0,0 +1,71 @@
+package net.sansa_stack.ml.spark.clustering.utils
+
+import java.io.PrintWriter
+
+import org.apache.jena.graph.{ NodeFactory, Triple}
+import org.apache.spark.SparkContext
+import org.apache.spark.rdd.RDD
+import org.json4s.DefaultFormats
+import org.json4s.jackson.Serialization
+
+import net.sansa_stack.ml.spark.clustering.datatypes.{Cluster, Clusters, Poi}
+
+object Common {
+ val prefixID = "http://example.org/id/poi/"
+ val prefixCategory = "http://example.org/hasCategory"
+ val prefixCoordinate = "http://example.org/id/hasCoordinate/"
+
+
+ /**
+ * create a pair RDD and join with another pair RDD
+ *
+ * @param sparkContext
+ * @param ids an array of poi ids
+ * @param pairs pair RDD keyed by poi id
+ * @return an array of poi
+ */
+ def join(sparkContext: SparkContext, ids: Array[Long], pairs: RDD[(Long, Poi)]): Array[Poi] = {
+ val idsPair = sparkContext.parallelize(ids).map(x => (x, x))
+ idsPair.join(pairs).map(x => x._2._2).collect()
+ }
+
+ /**
+ * serialize clustering results to file
+ *
+ * @param sparkContext
+ * @param clusters clustering results
+ * @param pois RDD of poi objects
+ * @param fileWriter writer for the serialized JSON output
+ */
+ def writeClusteringResult(sparkContext: SparkContext, clusters: Map[Int, Array[Long]], pois: RDD[Poi], fileWriter: PrintWriter): Unit = {
+ val assignments = clusters.toList.sortBy { case (k, v) => v.length }
+ val poisKeyPair = pois.keyBy(f => f.poi_id).persist()
+ val clustersPois = Clusters(assignments.size, assignments.map(_._2.length).toArray, assignments.map(f => Cluster(f._1, join(sparkContext, f._2, poisKeyPair))))
+ implicit val formats = DefaultFormats
+ Serialization.writePretty(clustersPois, fileWriter)
+ }
+ /**
+ * serialize clustering results to .nt file
+ */
+ def seralizeToNT(sparkContext: SparkContext, clusters: Map[Int, Array[Long]], pois: RDD[Poi]): Unit = {
+ val assignments = clusters.toList.sortBy { case (k, v) => v.length }
+ val poisKeyPair = pois.keyBy(f => f.poi_id).persist()
+ val newAssignment = assignments.map(f => (f._1, sparkContext.parallelize(f._2).map(x => (x, x)).join(poisKeyPair).map(x => ( x._2._2.poi_id, x._2._2.categories, x._2._2.coordinate)).collect()))
+ val newAssignmentRDD = sparkContext.parallelize(newAssignment)
+ println(newAssignmentRDD.count())
+ val newAssignmentRDDTriple = newAssignmentRDD.map(cluster => (cluster._1, cluster._2.flatMap(poi =>
+ {List(new Triple(NodeFactory.createURI(prefixID + poi._1.toString),
+ NodeFactory.createURI(prefixCategory),
+ NodeFactory.createLiteral(poi._2.categories.mkString(","))),
+ new Triple(NodeFactory.createURI(prefixID + poi._1.toString),
+ NodeFactory.createURI(prefixCoordinate),
+ NodeFactory.createLiteral((poi._3.latitude, poi._3.longitude).toString()))
+ )}
+ ).toList)
+ )
+ newAssignmentRDDTriple.saveAsTextFile("results/triples")
+ }
+
+}
+
+
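A sketch of how `writeClusteringResult` could be called end to end, with three hypothetical pois and a hand-made assignment into two clusters; the output file name is arbitrary and the pretty-printed JSON comes from the json4s serialization inside the method:

```scala
import java.io.PrintWriter

import scala.collection.mutable

import net.sansa_stack.ml.spark.clustering.datatypes.{ Categories, CoordinatePOI, Poi }
import net.sansa_stack.ml.spark.clustering.utils.Common
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("CommonSketch").master("local[*]").getOrCreate()

// three hypothetical pois and a hand-made assignment into two clusters
val pois = spark.sparkContext.parallelize(Seq(
  Poi(1L, CoordinatePOI(16.37, 48.21), Categories(mutable.Set("cafe")), 4.5),
  Poi(2L, CoordinatePOI(16.38, 48.20), Categories(mutable.Set("museum")), 4.0),
  Poi(3L, CoordinatePOI(16.40, 48.25), Categories(mutable.Set("park")), 3.5)
))
val clusters: Map[Int, Array[Long]] = Map(0 -> Array(1L, 2L), 1 -> Array(3L))

// writes the Clusters structure as pretty-printed JSON
val writer = new PrintWriter("clusters.json")
Common.writeClusteringResult(spark.sparkContext, clusters, pois, writer)
writer.close()
```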
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/DBCLusterer.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/DBCLusterer.scala
new file mode 100644
index 0000000..1609d60
--- /dev/null
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/DBCLusterer.scala
@@ -0,0 +1,78 @@
+package net.sansa_stack.ml.spark.clustering.utils
+
+import scala.collection.mutable
+import scala.collection.mutable.ArrayBuffer
+
+import net.sansa_stack.ml.spark.clustering.datatypes.DbPOI
+import net.sansa_stack.ml.spark.clustering.datatypes.DbStatusEnum._
+
+case class DBCLusterer(val eps: Double, val minPts: Int) {
+
+ def clusterPois(poiArrBuff: ArrayBuffer[DbPOI]): ArrayBuffer[ArrayBuffer[DbPOI]] = {
+
+ val clusterArrBuff = ArrayBuffer[ArrayBuffer[DbPOI]]()
+ val grid = Grid(poiArrBuff, eps)
+
+ for{
+ dbpoi <- poiArrBuff
+
+ if(dbpoi.dbstatus == UNDEFINED)
+ }{
+
+ val neighbourArrBuff = grid.getNeighbours(dbpoi)
+
+ if(neighbourArrBuff.size < minPts)
+ {
+ dbpoi.dbstatus = NOISE
+ }
+ else
+ {
+ clusterArrBuff.append(findCluster(dbpoi, neighbourArrBuff, grid))
+ }
+ }
+
+ clusterArrBuff
+ }
+
+
+ def findCluster(dbpoi: DbPOI, neighbourArrBuff: ArrayBuffer[DbPOI], grid: Grid): ArrayBuffer[DbPOI] = {
+
+ dbpoi.dbstatus = PARTOFCLUSTER
+ dbpoi.isDense = true
+
+ val cluster = ArrayBuffer[DbPOI]()
+ cluster.append(dbpoi)
+
+ val neighbourQueue = mutable.Queue[DbPOI]() ++ neighbourArrBuff
+
+ while(neighbourQueue.nonEmpty) {
+ val poi = neighbourQueue.dequeue()
+ poi.dbstatus match {
+ case UNDEFINED =>
+ poi.dbstatus = PARTOFCLUSTER
+ val poi_i_neighbours = grid.getNeighbours(poi)
+ if(poi_i_neighbours.size >= minPts)
+ {
+ poi.isDense = true
+ neighbourQueue ++= poi_i_neighbours
+ }
+ else
+ {
+ poi.isDense = false
+ }
+ cluster.append(poi)
+ case NOISE =>
+ poi.dbstatus = PARTOFCLUSTER
+ poi.isDense = false
+ cluster.append(poi)
+ case _ => ()
+ }
+ }
+
+ cluster
+ }
+
+}
+
+
+
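A usage sketch of the grid-backed, DBSCAN-style clusterer on a handful of hand-picked coordinates; `eps` and `minPts` are illustrative values only:

```scala
import scala.collection.mutable.ArrayBuffer

import net.sansa_stack.ml.spark.clustering.datatypes.DbPOI
import net.sansa_stack.ml.spark.clustering.utils.DBCLusterer

val pois = ArrayBuffer(
  DbPOI("a", 16.370, 48.210),
  DbPOI("b", 16.371, 48.211),
  DbPOI("c", 16.372, 48.210),
  DbPOI("d", 16.500, 48.300) // far away from the others, expected to stay noise
)

// eps is in coordinate units, minPts is the density threshold
val clusters = DBCLusterer(eps = 0.005, minPts = 2).clusterPois(pois)
clusters.zipWithIndex.foreach { case (c, i) => println(s"cluster $i -> ${c.map(_.poiId).mkString(", ")}") }
```

With these values, a/b/c form one cluster and d remains noise.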
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/DataFiltering.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/DataFiltering.scala
new file mode 100644
index 0000000..742a90a
--- /dev/null
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/DataFiltering.scala
@@ -0,0 +1,63 @@
+package net.sansa_stack.ml.spark.clustering.utils
+
+import org.apache.jena.graph.Triple
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.SparkSession
+import scala.collection.mutable.ArrayBuffer
+
+import net.sansa_stack.ml.spark.clustering.datatypes.AppConfig
+import net.sansa_stack.rdf.spark.io.NTripleReader
+
+class DataFiltering(val spark: SparkSession, val conf: AppConfig) extends Serializable {
+
+ val dataRDD: RDD[Triple] = NTripleReader.load(spark, conf.dataset.input).persist()
+
+ /**
+ * Generates the triples related to the pois in poiArray; note that the method name does not follow the usual camelCase convention
+ * @param poiArray id of pois in Vienna
+ * @param dataRDD RDD containing triples
+ * @param spark SparkSession
+ * @return
+ */
+ def get_triples(poiArray: Array[Long], dataRDD: RDD[Triple], spark: SparkSession) : (RDD[Triple], RDD[Triple]) = {
+ // create an array of subjects related with each poi
+ val subjects = ArrayBuffer[String]()
+ for (i <- poiArray.indices) { // iterate over all pois, including the last one
+ subjects ++= createSubjects(poiArray(i))
+ }
+ // RDD[Triple] => RDD[(subject, Triple)]
+ val dataRDDPair = dataRDD.map(f => (f.getSubject.getURI, f)).persist()
+ // create RDD[(subject, subject)] from Array[subjects]
+ val subjectsRDD = spark.sparkContext.parallelize(subjects.toSet.toList).map(f => (f, f)).persist()
+ // get RDD[Triples] with subject in Array[subjects]
+ val viennaTriples = subjectsRDD.join(dataRDDPair).map(f => f._2._2).persist()
+ // find the filtered triples with the category predicate and collect their objects => RDD[Object]
+ val viennaCategoriesObjects = viennaTriples.filter(f => f.getPredicate.getURI.equals("http://example.org/def#category")).map(f => f.getObject.getURI).distinct().persist()
+ // RDD[Object] => RDD[(Object, Object)]
+ val viennaPoiCategoriesRDD = viennaCategoriesObjects.map(f => (f, f)).persist()
+ // RDD[(Object, Object)] => RDD[Triples], where Object is Subject in Triples
+ val viennaCategoryTriples = viennaPoiCategoriesRDD.join(dataRDDPair).map(f => f._2._2)
+ // RDD[Triples] => RDD[(Key, Triple)], where key=subject+predicate+object, because there are some duplicated triples in the TomTom data
+ val temp = viennaCategoryTriples.map(f => (f.getSubject.getURI + f.getPredicate.getURI + f.getObject.toString(), f)).persist()
+ // remove duplicated triples
+ val categoryTriples = temp.reduceByKey((v1, v2) => v1).map(f => f._2).persist()
+ (viennaTriples, categoryTriples)
+ }
+
+ /**
+ * @param poiID id of a poi
+ * @return an array of subject in RDF triples with related to this poi
+ */
+ def createSubjects(poiID: Long): ArrayBuffer[String] = {
+ val subjects = ArrayBuffer[String]()
+ val id = "http://example.org/id/poi/".concat(poiID.toString)
+ subjects.+=(id)
+ subjects.+=(id.concat("/address"))
+ subjects.+=(id.concat("/phone"))
+ subjects.+=(id.concat("/geometry"))
+ subjects.+=(id.concat("/name"))
+ subjects.+=(id.concat("/accuracy_info"))
+ subjects.+=(id.concat("/brandname"))
+ subjects
+ }
+}
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/DataProcessing.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/DataProcessing.scala
new file mode 100644
index 0000000..89c892f
--- /dev/null
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/DataProcessing.scala
@@ -0,0 +1,178 @@
+package net.sansa_stack.ml.spark.clustering.utils
+
+import java.io.{File, FilenameFilter}
+
+import com.typesafe.config.Config
+import org.apache.jena.graph.Triple
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.SparkSession
+
+import net.sansa_stack.ml.spark.clustering.datatypes.{Categories, CoordinatePOI, Poi}
+import net.sansa_stack.rdf.spark.io.NTripleReader
+
+
+
+/**
+ * load TomTom dataset
+ * @param spark SparkSession
+ * @param conf Configuration
+ */
+class DataProcessing(val spark: SparkSession, val conf: Config) extends Serializable {
+
+ // val dataRDD: RDD[Triple] = NTripleReader.load(spark, conf.getString("sansa.data.input")).persist()
+ val dataRDD: RDD[Triple] = loadNTriple(conf.getString("sansa.data.input"))
+
+ // var poiCoordinates: RDD[(Long, Coordinate)] = this.getPOICoordinates(16.192851, 16.593533, 48.104194, 48.316388).sample(withReplacement = false, fraction = 0.01, seed = 0)
+ var poiCoordinates: RDD[(Long, CoordinatePOI)] = this.getPOICoordinates
+ var poiFlatCategoryId: RDD[(Long, Long)] = this.getPOIFlatCategoryId
+ var poiCategoryId: RDD[(Long, Set[Long])] = this.getCategoryId(poiCoordinates, poiFlatCategoryId).persist()
+ var poiCategoryValueSet: RDD[(Long, Categories)] = this.getCategoryValues // (category_id, Categories)
+ var poiCategories: RDD[(Long, Categories)] = this.getPOICategories(poiCoordinates, poiFlatCategoryId, poiCategoryValueSet) // (poi_id, Categories)
+ val poiYelpCategories: RDD[(Long, (Categories, Double))] = this.getYelpCategories(dataRDD).sample(withReplacement = false, fraction = 0.1, seed = 0)
+ var pois: RDD[Poi] = { if (!poiYelpCategories.isEmpty()) {
+ // val poiAllCategories: RDD[(Long, Categories, Double)] = poiCategories.join(poiYelpCategories).map(x => (x._1, (Categories(x._2._1.categories++x._2._2._1.categories), x._2._2._2))
+ val poiAllCategories: RDD[(Long, (Categories, Double))] = poiYelpCategories.join(poiCategories).map(x => (x._1, (Categories(x._2._1._1.categories++x._2._2.categories), x._2._1._2)))
+ poiCoordinates.join(poiAllCategories).map(x => Poi(x._1, x._2._1, x._2._2._1, x._2._2._2)).persist()
+ } else {
+ println("--------pois--------------")
+ poiCoordinates.join(poiCategories).map(x => Poi(x._1, x._2._1, x._2._2, 0.0)).persist()
+ }}
+
+ def loadNTriple(tripleFilePath: String): RDD[Triple] = {
+ val tripleFile = new File(tripleFilePath)
+ if(tripleFile.isDirectory) {
+ val files = tripleFile.listFiles(new FilenameFilter() {
+ def accept(tripleFile: File, name: String): Boolean = {
+ !(name.toString.contains("SUCCESS") || name.toLowerCase.endsWith(".crc"))
+ }
+ })
+ var i = 0
+ var triple_0 = NTripleReader.load(spark, files(0).getAbsolutePath)
+ for(file <- files) {
+ if (i!=0) {
+ triple_0 = triple_0.union(NTripleReader.load(spark, file.getAbsolutePath))
+ }
+ i+=1
+ }
+ triple_0
+ }
+ else {
+ NTripleReader.load(spark, tripleFile.getAbsolutePath)
+ }
+ }
+
+
+ /**
+ * @param poiCoordinates superset of pois with coordinates
+ * @param lo_min min longitude
+ * @param lo_max max longitude
+ * @param la_min min latitude
+ * @param la_max max latitude
+ * @return pois within certain coordinates
+ */
+ def filterCoordinates(poiCoordinates: RDD[(Long, CoordinatePOI)], lo_min: Double, lo_max: Double, la_min: Double, la_max: Double): RDD[(Long, CoordinatePOI)] = {
+ poiCoordinates.filter(x => (x._2.longitude >= lo_min && x._2.longitude <= lo_max)
+ && (x._2.latitude >= la_min && x._2.latitude <= la_max))
+ }
+
+ /**
+ * get coordinate for all poi
+ */
+ def getPOICoordinates: RDD[(Long, CoordinatePOI)] = {
+ // get the coordinates of pois
+ val pattern = "POINT(.+ .+)".r
+ val poiCoordinatesString = dataRDD.filter(x => x.getPredicate.toString().equalsIgnoreCase(conf.getString("sansa.data.coordinatesPredicate")))
+ .map(x => (x.getSubject.toString().replace(conf.getString("sansa.data.poiPrefix"), "").replace("/geometry", "").toLong,
+ pattern.findFirstIn(x.getObject.toString()).head.replace("POINT", "")
+ .replace("^^http://www.opengis.net/ont/geosparql#wktLiteral", "").replaceAll("^\"|\"$", "")))
+ // transform to Coordinate object
+ poiCoordinatesString.mapValues(x => {
+ val coordinates = x.replace("(", "").replace(")", "").split(" ")
+ CoordinatePOI(coordinates(0).toDouble, coordinates(1).toDouble)
+ })
+ }
+
+ /**
+ * load data filter on geo-coordinates
+ * @param lo_min min longitude
+ * @param lo_max max longitude
+ * @param la_min min latitude
+ * @param la_max max latitude
+ */
+ def getPOICoordinates(lo_min: Double, lo_max: Double, la_min: Double, la_max: Double): RDD[(Long, CoordinatePOI)] = {
+ this.filterCoordinates(poiCoordinates = this.getPOICoordinates, lo_min = lo_min, lo_max = lo_max, la_min = la_min, la_max = la_max)
+ }
+
+ /**
+ *
+ * @return (poi, category_id)
+ */
+ def getPOIFlatCategoryId: RDD[(Long, Long)] = {
+ val poiFlatCategories = dataRDD.filter(x => x.getPredicate.toString().equalsIgnoreCase(conf.getString("sansa.data.categoryPOI")))
+ poiFlatCategories.map(x => (
+ x.getSubject.toString().replace(conf.getString("sansa.data.poiPrefix"), "").toLong,
+ x.getObject.toString().replace(conf.getString("sansa.data.termPrefix"), "").toLong)
+ )
+ }
+
+ /**
+ * get (poi_unique, Categories)
+ * @param poiCoordinates (poi_unique, Coordinate)
+ * @param poiFlatCategoryId (poi, category_id)
+ * @param poiCategoryValueSet (category_id, Categories)
+ * @return (poi, Categories)
+ */
+ def getPOICategories(poiCoordinates: RDD[(Long, CoordinatePOI)], poiFlatCategoryId: RDD[(Long, Long)], poiCategoryValueSet: RDD[(Long, Categories)]): RDD[(Long, Categories)] = {
+ // from (poi, category_id) map-> (category_id, poi) join-> (category_id, (poi, Categories)) map-> (poi, Categories) groupByKey-> (poi_unique, Iterable(Categories))
+ val poiCategorySets = poiFlatCategoryId.map(f => (f._2, f._1)).join(poiCategoryValueSet).map(f => (f._2._1, f._2._2)).groupByKey()
+ // from (poi_unique, Iterable(Categories)) join-> (poi_unique, (Coordinate, Iterable(Categories))) map-> (poi_unique, Categories)
+ poiCoordinates.join(poiCategorySets).map(x => (x._1, Categories(collection.mutable.Set(x._2._2.flatMap(_.categories).toList: _*))))
+ }
+
+ /**
+ * get (category_id, Categories)
+ * @return RDD with category values for category id
+ */
+ def getCategoryValues: RDD[(Long, Categories)] = {
+ // get category id(s)
+ val categoryTriples = dataRDD.filter(x => x.getPredicate.toString().equalsIgnoreCase(conf.getString("sansa.data.termValueUri")))
+ // get category id and it's corresponding values
+ val categoriesIdValues = categoryTriples.map(x => (
+ x.getSubject.toString().replace(conf.getString("sansa.data.termPrefix"), "").toLong,
+ x.getObject.toString().replaceAll("\"", "")))
+ // group by id and put all values of category to a set
+ categoriesIdValues.groupByKey().map(x => (x._1, Categories(scala.collection.mutable.Set(x._2.toList: _*))))
+ }
+
+ /**
+ * get (poi_unique, poi_category_id_set)
+ * @param poiCoordinates (poi_unique, Coordinate)
+ * @param poiFlatCategoryId (poi, category_id)
+ */
+ def getCategoryId(poiCoordinates: RDD[(Long, CoordinatePOI)], poiFlatCategoryId: RDD[(Long, Long)]): RDD[(Long, Set[Long])] = {
+ poiCoordinates.join(poiFlatCategoryId.groupByKey())
+ .map(x => (x._1, x._2._2.toSet))
+ }
+
+
+ def getYelpCategories(mergedRDD: RDD[Triple]): RDD[(Long, (Categories, Double))] = {
+ val yelpPOICategory = mergedRDD.filter(triple => triple.getPredicate.toString.equalsIgnoreCase(conf.getString("yelp.data.categoryPOI")))
+ println(conf.getString("yelp.data.rating"))
+ val yelpPOIRating = mergedRDD.filter(triple => triple.getPredicate.toString.contains(conf.getString("yelp.data.rating")))
+ println("category")
+ println(yelpPOICategory.count())
+ println("rating")
+ println(yelpPOIRating.count())
+ val yelpPOICategoryMapped = yelpPOICategory.map(triple => (
+ triple.getSubject.toString().replace(conf.getString("sansa.data.poiPrefix"), "").toLong,
+ triple.getObject.toString()
+ ))
+ val yelpPOIRatingMapped = yelpPOIRating.map(triple => (
+ triple.getSubject.toString().replace(conf.getString("sansa.data.poiPrefix"), "").toLong,
+ triple.getObject.getLiteralValue.toString.toDouble
+ ))
+ yelpPOICategoryMapped.groupByKey().join(yelpPOIRatingMapped).map(x => (x._1, (Categories(scala.collection.mutable.Set(x._2._1.toList: _*)), x._2._2)))
+ }
+}
+
+
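The coordinate extraction in `getPOICoordinates` boils down to a regex over the WKT literal. A standalone sketch of that parsing step, using a made-up literal in the same shape as the TomTom geometry objects:

```scala
import net.sansa_stack.ml.spark.clustering.datatypes.CoordinatePOI

// made-up WKT literal in the same shape as the TomTom geometry objects
val literal = "\"POINT(16.3725 48.2083)\"^^http://www.opengis.net/ont/geosparql#wktLiteral"

val pattern = "POINT(.+ .+)".r
val cleaned = pattern.findFirstIn(literal).head
  .replace("POINT", "")
  .replace("^^http://www.opengis.net/ont/geosparql#wktLiteral", "")
  .replaceAll("^\"|\"$", "")

// WKT stores longitude first, latitude second
val Array(lon, lat) = cleaned.replace("(", "").replace(")", "").split(" ")
println(CoordinatePOI(lon.toDouble, lat.toDouble)) // CoordinatePOI(16.3725,48.2083)
```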
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/Grid.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/Grid.scala
new file mode 100644
index 0000000..74d5a9c
--- /dev/null
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/Grid.scala
@@ -0,0 +1,55 @@
+package net.sansa_stack.ml.spark.clustering.utils
+
+import scala.collection.mutable.ArrayBuffer
+import scala.collection.mutable.HashMap
+
+import net.sansa_stack.ml.spark.clustering.datatypes.DbPOI
+
+case class Grid(val poiArrBuf: ArrayBuffer[DbPOI], val eps: Double) {
+
+ val startX = poiArrBuf.head.lon
+ val startY = poiArrBuf.head.lat
+ val gridCell = HashMap[(Int, Int), ArrayBuffer[DbPOI]]()
+
+ init()
+
+ private def init(): Unit = {
+ var i = 0
+ var j = 0
+ for(dbpoi <- poiArrBuf) {
+ i = math.floor( (dbpoi.lon - startX) / eps).toInt
+ j = math.floor( (dbpoi.lat - startY) / eps).toInt
+
+ gridCell.get((i, j)) match {
+ case Some(cellArrBuff) => cellArrBuff.append(dbpoi)
+ case None => gridCell += ( ((i, j), ArrayBuffer(dbpoi)) )
+ }
+ }
+ }
+
+
+ def getNeighbours(dbpoi: DbPOI): ArrayBuffer[DbPOI] = {
+
+ val neighbourArrBuff = ArrayBuffer[DbPOI]()
+
+ val celli = math.floor( (dbpoi.lon - startX) / eps).toInt
+ val cellj = math.floor( (dbpoi.lat - startY) / eps).toInt
+ for{
+ i <- (celli - 1) to (celli + 1)
+ j <- (cellj - 1) to (cellj + 1)
+ }{
+ gridCell.get((i, j)) match {
+ case Some(cellArrBuff) => neighbourArrBuff ++= cellArrBuff
+ case None => ()
+ }
+ }
+
+ neighbourArrBuff.filter{
+ p => (math.abs(p.lon - dbpoi.lon) <= eps) && (math.abs(p.lat - dbpoi.lat) <= eps) && p.poiId != dbpoi.poiId
+ }
+
+ }
+
+}
+
+
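A sketch of the neighbor lookup: `Grid` buckets points into eps-sized cells, so `getNeighbours` only has to scan the 3x3 block of cells around a point. The coordinates and `eps` below are illustrative:

```scala
import scala.collection.mutable.ArrayBuffer

import net.sansa_stack.ml.spark.clustering.datatypes.DbPOI
import net.sansa_stack.ml.spark.clustering.utils.Grid

val eps = 0.005
val pois = ArrayBuffer(
  DbPOI("a", 16.370, 48.210),
  DbPOI("b", 16.372, 48.212),
  DbPOI("c", 16.420, 48.260) // outside the eps box around "a"
)

val grid = Grid(pois, eps)
// only points within |dlon| <= eps and |dlat| <= eps of "a" (and not "a" itself) are returned
println(grid.getNeighbours(pois.head).map(_.poiId)) // ArrayBuffer(b)
```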
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastGraphKernel.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastGraphKernel.scala
index a49c2b3..a7ef9b4 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastGraphKernel.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastGraphKernel.scala
@@ -1,18 +1,18 @@
package net.sansa_stack.ml.spark.kernel
+import org.apache.jena.graph.Triple
import org.apache.spark.ml.feature.{ CountVectorizer, CountVectorizerModel, StringIndexer }
import org.apache.spark.mllib.linalg.SparseVector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.functions._
import org.apache.spark.sql.{ DataFrame, SparkSession }
-import org.apache.jena.graph.Triple
+import org.apache.spark.sql.functions._
class RDFFastGraphKernel(
@transient val sparkSession: SparkSession,
- val tripleRDD: RDD[Triple],
- val predicateToPredict: String) extends Serializable {
+ val tripleRDD: RDD[Triple],
+ val predicateToPredict: String) extends Serializable {
import sparkSession.implicits._
@@ -84,8 +84,8 @@ class RDFFastGraphKernel(
object RDFFastGraphKernel {
def apply(
- sparkSession: SparkSession,
- tripleRDD: RDD[Triple],
+ sparkSession: SparkSession,
+ tripleRDD: RDD[Triple],
predicateToPredict: String): RDFFastGraphKernel = {
new RDFFastGraphKernel(sparkSession, tripleRDD, predicateToPredict)
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernel.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernel.scala
index 1ea7080..a3f64fa 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernel.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernel.scala
@@ -1,13 +1,14 @@
package net.sansa_stack.ml.spark.kernel
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{ DataFrame, SparkSession }
-import org.apache.spark.sql.functions._
+import org.apache.jena.graph.Triple
import org.apache.spark.ml.feature.{ CountVectorizer, CountVectorizerModel }
import org.apache.spark.mllib.linalg.SparseVector
-import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.jena.graph.Triple
+import org.apache.spark.mllib.util.MLUtils
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.{ DataFrame, SparkSession }
+import org.apache.spark.sql.functions._
+
object Uri2Index {
/*
@@ -81,9 +82,9 @@ object Uri2Index {
class RDFFastTreeGraphKernel(
@transient val sparkSession: SparkSession,
- val tripleRDD: RDD[Triple],
- val instanceDF: DataFrame,
- val maxDepth: Int) extends Serializable {
+ val tripleRDD: RDD[Triple],
+ val instanceDF: DataFrame,
+ val maxDepth: Int) extends Serializable {
/*
* Construct Triples DataFrame and Instances DataFrame
* Also, Get/Set Index for each URI and Literal
@@ -168,9 +169,9 @@ object RDFFastTreeGraphKernel {
def apply(
sparkSession: SparkSession,
- tripleRDD: RDD[Triple],
- instanceDF: DataFrame,
- maxDepth: Int): RDFFastTreeGraphKernel = {
+ tripleRDD: RDD[Triple],
+ instanceDF: DataFrame,
+ maxDepth: Int): RDFFastTreeGraphKernel = {
new RDFFastTreeGraphKernel(sparkSession, tripleRDD, instanceDF, maxDepth)
}
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernelApp.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernelApp.scala
index b3ef264..6ecc25d 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernelApp.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernelApp.scala
@@ -46,7 +46,7 @@ object RDFFastTreeGraphKernelApp {
}
def experimentAffiliationPrediction(sparkSession: SparkSession, depth: Int, iteration: Int): Unit = {
- //val input = "src/main/resources/kernel/aifb-fixed_complete4.nt"
+ // val input = "src/main/resources/kernel/aifb-fixed_complete4.nt"
val input = "src/main/resources/kernel/aifb-fixed_no_schema4.nt"
val t0 = System.nanoTime
@@ -137,7 +137,7 @@ object RDFFastTreeGraphKernelApp {
tripleRDD.filter(_.getPredicate.getURI == "http://data.bgs.ac.uk/ref/Lexicon/hasTheme")
.foreach(f => Uri2Index.setInstanceAndLabel(f.getSubject.toString, f.getObject.toString))
- val filteredTripleRDD=tripleRDD
+ val filteredTripleRDD = tripleRDD
.filter(_.getPredicate.getURI != "http://data.bgs.ac.uk/ref/Lexicon/hasTheme")
val instanceDF = Uri2Index.getInstanceLabelsDF(sparkSession)
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernelUtil.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernelUtil.scala
index f4bb7f0..a8f2ec7 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernelUtil.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernelUtil.scala
@@ -2,31 +2,31 @@ package net.sansa_stack.ml.spark.kernel
import org.apache.jena.graph
import org.apache.spark.ml.feature.StringIndexer
-import org.apache.spark.mllib.classification.{LogisticRegressionModel, LogisticRegressionWithLBFGS}
+import org.apache.spark.mllib.classification.{ LogisticRegressionModel, LogisticRegressionWithLBFGS }
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.{ DataFrame, SparkSession }
import org.apache.spark.sql.functions._
-import org.apache.spark.sql.{DataFrame, SparkSession}
-
object RDFFastTreeGraphKernelUtil {
- def triplesToDF(sparkSession: SparkSession,
- triples: RDD[graph.Triple],
- subjectColName:String = "subject",
- predicateColName:String = "predicate",
- objectColName:String ="object"
- ): DataFrame = {
+ def triplesToDF(
+ sparkSession: SparkSession,
+ triples: RDD[graph.Triple],
+ subjectColName: String = "subject",
+ predicateColName: String = "predicate",
+ objectColName: String = "object"): DataFrame = {
import sparkSession.implicits._
- triples.map(f => (f.getSubject.toString,f.getPredicate.toString,f.getObject.toString))
+ triples.map(f => (f.getSubject.toString, f.getPredicate.toString, f.getObject.toString))
.toDF(subjectColName, predicateColName, objectColName)
}
- def getInstanceAndLabelDF( filteredTripleDF: DataFrame,
- subjectColName:String = "subject",
- objectColName:String ="object" ): DataFrame = {
+ def getInstanceAndLabelDF(
+ filteredTripleDF: DataFrame,
+ subjectColName: String = "subject",
+ objectColName: String = "object"): DataFrame = {
/*
root
|-- instance: string (nullable = true)
@@ -47,7 +47,7 @@ object RDFFastTreeGraphKernelUtil {
indexedDF
}
- def predictLogisticRegressionMLLIB(data: RDD[LabeledPoint], numClasses : Int = 2, maxIteration: Int = 5): Unit = {
+ def predictLogisticRegressionMLLIB(data: RDD[LabeledPoint], numClasses: Int = 2, maxIteration: Int = 5): Unit = {
val t0 = System.nanoTime
data.cache()
@@ -61,7 +61,7 @@ object RDFFastTreeGraphKernelUtil {
val validation = splits(1)
val model = new LogisticRegressionWithLBFGS().setNumClasses(numClasses).run(training)
- val predictions = validation.map{ point =>
+ val predictions = validation.map { point =>
val prediction = model.predict(point.features)
(point.label, prediction)
}
@@ -73,15 +73,14 @@ object RDFFastTreeGraphKernelUtil {
var sumOfAccuracy = 0.0
- for ( seed <- 1 to maxIteration ) {
+ for (seed <- 1 to maxIteration) {
val (model, accuracy) = trainAndValidate(data, seed)
-// println(accuracy)
+ // println(accuracy)
sumOfAccuracy += accuracy
}
val t2 = System.nanoTime
-
// score the model on test data.
println("Average Accuracy: " + sumOfAccuracy / maxIteration)
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernel_v2.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernel_v2.scala
index 5f9aa81..6c24a7f 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernel_v2.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernel_v2.scala
@@ -1,18 +1,18 @@
package net.sansa_stack.ml.spark.kernel
-import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel, StringIndexer}
+import org.apache.spark.ml.feature.{ CountVectorizer, CountVectorizerModel, StringIndexer }
import org.apache.spark.mllib.linalg.SparseVector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.{ DataFrame, SparkSession }
import org.apache.spark.sql.functions._
-import org.apache.spark.sql.{DataFrame, SparkSession}
-class RDFFastTreeGraphKernel_v2 (@transient val sparkSession: SparkSession,
- val tripleDF: DataFrame,
- val instanceDF: DataFrame,
- val maxDepth: Int
- ) extends Serializable {
+class RDFFastTreeGraphKernel_v2(
+ @transient val sparkSession: SparkSession,
+ val tripleDF: DataFrame,
+ val instanceDF: DataFrame,
+ val maxDepth: Int) extends Serializable {
def computeFeatures(): DataFrame = {
/*
@@ -46,7 +46,6 @@ class RDFFastTreeGraphKernel_v2 (@transient val sparkSession: SparkSession,
intermediateDF.createOrReplaceTempView("df")
}
-
// Indexing on path
val indexer = new StringIndexer()
.setInputCol("path")
@@ -59,12 +58,10 @@ class RDFFastTreeGraphKernel_v2 (@transient val sparkSession: SparkSession,
.agg(collect_list("pathIndex") as "paths")
.toDF("instance", "label", "paths")
-
// CountVectorize the aggregated paths
val cvModel: CountVectorizerModel = new CountVectorizer().setInputCol("paths").setOutputCol("features").fit(aggDF)
val dataML = cvModel.transform(aggDF)
-
dataML
}
@@ -97,11 +94,11 @@ class RDFFastTreeGraphKernel_v2 (@transient val sparkSession: SparkSession,
object RDFFastTreeGraphKernel_v2 {
- def apply(sparkSession: SparkSession,
- tripleDF: DataFrame,
- instanceDF: DataFrame,
- maxDepth: Int
- ): RDFFastTreeGraphKernel_v2 = {
+ def apply(
+ sparkSession: SparkSession,
+ tripleDF: DataFrame,
+ instanceDF: DataFrame,
+ maxDepth: Int): RDFFastTreeGraphKernel_v2 = {
new RDFFastTreeGraphKernel_v2(sparkSession, tripleDF, instanceDF, maxDepth)
}
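
computeFeatures follows the index-aggregate-vectorize pattern shown above. A standalone sketch of the same pattern, assuming an input DataFrame `pathDF(instance, label, path)` (the kernel builds this internally from the triple and instance DataFrames):

import org.apache.spark.ml.feature.CountVectorizer
import org.apache.spark.sql.functions.collect_list

// Collect the paths of each instance and turn them into a sparse bag-of-paths vector.
val aggDF = pathDF
  .groupBy("instance", "label")
  .agg(collect_list("path") as "paths")

val cvModel = new CountVectorizer().setInputCol("paths").setOutputCol("features").fit(aggDF)
val dataML = cvModel.transform(aggDF)
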
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/Bootstrapping.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/Bootstrapping.scala
index 02bbfa9..7350c53 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/Bootstrapping.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/Bootstrapping.scala
@@ -1,5 +1,8 @@
package net.sansa_stack.ml.spark.kge.linkprediction.crossvalidation
+import net.sansa_stack.rdf.spark.kge.triples.IntegerTriples
+import org.apache.spark.sql._
+
/**
* Bootstrapping
* -------------
@@ -8,18 +11,12 @@ package net.sansa_stack.ml.spark.kge.linkprediction.crossvalidation
*
* Created by lpfgarcia
*/
-
-import org.apache.spark.sql._
-
-import net.sansa_stack.rdf.spark.kge.triples.IntegerTriples
-
class Bootstrapping(data: Dataset[IntegerTriples])
- extends CrossValidation[Dataset[IntegerTriples]] {
+ extends CrossValidation[Dataset[IntegerTriples]] {
- def crossValidation() = {
+ def crossValidation(): (Dataset[IntegerTriples], Dataset[IntegerTriples]) = {
val train = data.sample(true, 1)
val test = data.except(train)
(train, test)
}
-
-}
\ No newline at end of file
+}
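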
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/CrossValidation.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/CrossValidation.scala
index 791d0b1..b55c36a 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/CrossValidation.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/CrossValidation.scala
@@ -13,4 +13,4 @@ trait CrossValidation[T] {
def crossValidation: (T, T)
-}
\ No newline at end of file
+}
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/Holdout.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/Holdout.scala
index f25bc84..1cbf42d 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/Holdout.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/Holdout.scala
@@ -1,5 +1,8 @@
package net.sansa_stack.ml.spark.kge.linkprediction.crossvalidation
+import net.sansa_stack.rdf.spark.kge.triples.IntegerTriples
+import org.apache.spark.sql._
+
/**
* Hold Out
* ---------
@@ -8,22 +11,17 @@ package net.sansa_stack.ml.spark.kge.linkprediction.crossvalidation
*
* Created by lpfgarcia
*/
-
-import org.apache.spark.sql._
-
-import net.sansa_stack.rdf.spark.kge.triples.IntegerTriples
-
case class rateException(info: String) extends Exception
class Holdout(data: Dataset[IntegerTriples], rate: Float) extends CrossValidation[Dataset[IntegerTriples]] {
- if (rate < 0 || rate >= 1)
+ if (rate < 0 || rate >= 1) {
throw new rateException("Rate value should be higher than 0 and lower than 1")
+ }
- def crossValidation() = {
+ def crossValidation(): (Dataset[IntegerTriples], Dataset[IntegerTriples]) = {
val train = data.sample(false, rate)
val test = data.except(train)
(train, test)
}
-
-}
\ No newline at end of file
+}
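
Usage mirrors the call in TransERun: a roughly 60/40 train/test split (sample() is approximate), assuming a `data: Dataset[IntegerTriples]` in scope:

import net.sansa_stack.ml.spark.kge.linkprediction.crossvalidation.Holdout

val (train, test) = new Holdout(data, 0.6f).crossValidation()
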
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/kFold.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/kFold.scala
index 97021e2..eed57a2 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/kFold.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/kFold.scala
@@ -1,5 +1,8 @@
package net.sansa_stack.ml.spark.kge.linkprediction.crossvalidation
+import net.sansa_stack.rdf.spark.kge.triples.IntegerTriples
+import org.apache.spark.sql._
+
/**
* k-fold Cross Validation
* -----------------------
@@ -9,26 +12,23 @@ package net.sansa_stack.ml.spark.kge.linkprediction.crossvalidation
* Created by lpfgarcia
*/
-import org.apache.spark.sql._
-
-import net.sansa_stack.rdf.spark.kge.triples.IntegerTriples
-
case class kException(info: String) extends Exception
case class withIndex(Subject: Int, Predicate: Int, Object: Int, k: Int)
class kFold(data: Dataset[IntegerTriples], k: Int, sk: SparkSession)
- extends CrossValidation[Seq[Dataset[IntegerTriples]]] {
+ extends CrossValidation[Seq[Dataset[IntegerTriples]]] {
import sk.implicits._
- if (k > 1 && k <= 10)
+ if (k <= 1 || k > 10) {
throw new kException("The k value should be higher than 1 and lower or equal to 10")
+ }
val id = (1 to data.count().toInt / k).flatMap(List.fill(k)(_))
val fold = sk.sparkContext.parallelize(id, data.rdd.getNumPartitions)
- def crossValidation() = {
+ def crossValidation(): (IndexedSeq[Dataset[IntegerTriples]], IndexedSeq[Dataset[IntegerTriples]]) = {
val df = sk.createDataFrame(data.rdd.zip(fold).map { r =>
withIndex(r._1.Subject, r._1.Predicate, r._1.Object, r._2)
@@ -45,4 +45,4 @@ class kFold(data: Dataset[IntegerTriples], k: Int, sk: SparkSession)
(train, test)
}
-}
\ No newline at end of file
+}
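
A usage sketch for a 5-fold split, assuming `data: Dataset[IntegerTriples]` and a SparkSession `spark`; per the guard, k must lie between 2 and 10:

import net.sansa_stack.ml.spark.kge.linkprediction.crossvalidation.kFold

val (trainFolds, testFolds) = new kFold(data, 5, spark).crossValidation()
trainFolds.zip(testFolds).foreach { case (train, test) =>
  // fit a model on `train` and evaluate it on `test` for each fold
}
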
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/evaluate/Evaluate.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/evaluate/Evaluate.scala
index 1fda916..0d092e8 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/evaluate/Evaluate.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/evaluate/Evaluate.scala
@@ -9,9 +9,8 @@ package net.sansa_stack.ml.spark.kge.linkprediction.evaluate
object Evaluate {
- def meanRank(left: Array[Float], right: Array[Float]) {
+ def meanRank(left: Array[Float], right: Array[Float]): (Float, Float) = {
(left.sum / left.length,
right.sum / right.length)
}
-
-}
\ No newline at end of file
+}
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/DistMult.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/DistMult.scala
index 51fcfe5..da3f6ac 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/DistMult.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/DistMult.scala
@@ -1,5 +1,11 @@
package net.sansa_stack.ml.spark.kge.linkprediction.models
+import com.intel.analytics.bigdl.optim.Adam
+import com.intel.analytics.bigdl.tensor.Tensor
+import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric.NumericFloat
+import net.sansa_stack.rdf.spark.kge.triples.{ IntegerTriples, StringTriples }
+import org.apache.spark.sql._
+
/**
* DistMult: diagonal bilinear model
* ---------------------------------
@@ -9,24 +15,15 @@ package net.sansa_stack.ml.spark.kge.linkprediction.models
*
* Created by lpfgarcia on 20/11/2017.
*/
-
-import org.apache.spark.sql._
-
-import com.intel.analytics.bigdl.optim.Adam
-import com.intel.analytics.bigdl.tensor.Tensor
-import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric.NumericFloat
-
-import net.sansa_stack.rdf.spark.kge.triples.{StringTriples,IntegerTriples}
-
class DistMult(train: Dataset[IntegerTriples], ne: Int, nr: Int, batch: Int, k: Int, sk: SparkSession)
- extends Models(ne: Int, nr: Int, batch: Int, k: Int, sk: SparkSession) {
+ extends Models(ne: Int, nr: Int, batch: Int, k: Int, sk: SparkSession) {
val epochs = 100
val rate = 0.01f
var opt = new Adam(learningRate = rate)
- def dist(data: Dataset[IntegerTriples]) = {
+ def dist(data: Dataset[IntegerTriples]): Float = {
val aux = data.collect().map { i =>
e(i.Subject) * r(i.Predicate) * e(i.Object)
}.reduce((a, b) => a + b)
@@ -34,7 +31,7 @@ class DistMult(train: Dataset[IntegerTriples], ne: Int, nr: Int, batch: Int, k:
L2(aux)
}
- def run() = {
+ def run(): Unit = {
for (i <- 1 to epochs) {
@@ -53,5 +50,4 @@ class DistMult(train: Dataset[IntegerTriples], ne: Int, nr: Int, batch: Int, k:
}
}
-
-}
\ No newline at end of file
+}
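
A construction sketch based on the constructor shown above; the entity/relation counts, batch size 100 and embedding dimension 20 are placeholder assumptions taken from the TransE runner:

import net.sansa_stack.ml.spark.kge.linkprediction.models.DistMult

// (train, #entities, #relations, batch size, embedding dimension k, SparkSession)
val model = new DistMult(train, numEntities, numRelations, 100, 20, spark)
model.run()
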
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/Models.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/Models.scala
index 7720f30..1bfb3ad 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/Models.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/Models.scala
@@ -1,23 +1,21 @@
package net.sansa_stack.ml.spark.kge.linkprediction.models
-/**
- * Model Abstract Class
- * --------------------
- *
- * Created by lpfgarcia on 14/11/2017.
- */
-
import scala.math._
import scala.util._
-import org.apache.spark.sql._
-
import com.intel.analytics.bigdl.nn.Power
import com.intel.analytics.bigdl.tensor.Tensor
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric.NumericFloat
+import net.sansa_stack.rdf.spark.kge.triples.{ IntegerTriples, StringTriples }
+import org.apache.spark.sql._
-import net.sansa_stack.rdf.spark.kge.triples.{StringTriples,IntegerTriples}
+/**
+ * Model Abstract Class
+ * --------------------
+ *
+ * Created by lpfgarcia on 14/11/2017.
+ */
abstract class Models(ne: Int, nr: Int, batch: Int, k: Int, sk: SparkSession) {
val Ne = ne
@@ -26,11 +24,11 @@ abstract class Models(ne: Int, nr: Int, batch: Int, k: Int, sk: SparkSession) {
var e = initialize(ne)
var r = normalize(initialize(nr))
- def initialize(size: Int) = {
+ def initialize(size: Int): Tensor[Float] = {
Tensor(size, k).rand(-6 / sqrt(k), 6 / sqrt(k))
}
- def normalize(data: Tensor[Float]) = {
+ def normalize(data: Tensor[Float]): Tensor[Float] = {
data / data.abs().sum()
}
@@ -38,7 +36,7 @@ abstract class Models(ne: Int, nr: Int, batch: Int, k: Int, sk: SparkSession) {
val seed = new Random(System.currentTimeMillis())
- def tuple(aux: IntegerTriples) = {
+ def tuple(aux: IntegerTriples): IntegerTriples = {
if (seed.nextBoolean()) {
IntegerTriples(seed.nextInt(Ne) + 1, aux.Predicate, aux.Object)
} else {
@@ -46,20 +44,20 @@ abstract class Models(ne: Int, nr: Int, batch: Int, k: Int, sk: SparkSession) {
}
}
- def negative(data: Dataset[IntegerTriples]) = {
+ def negative(data: Dataset[IntegerTriples]): Dataset[IntegerTriples] = {
data.map(i => tuple(i))
}
- def subset(data: Dataset[IntegerTriples]) = {
+ def subset(data: Dataset[IntegerTriples]): Dataset[IntegerTriples] = {
data.sample(false, 2 * (batch.toDouble / data.count().toDouble)).limit(batch)
}
- def L1(vec: Tensor[Float]) = {
+ def L1(vec: Tensor[Float]): Float = {
vec.abs().sum()
}
- def L2(vec: Tensor[Float]) = {
+ def L2(vec: Tensor[Float]): Float = {
vec.pow(2).sqrt().sum()
}
-}
\ No newline at end of file
+}
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/TransE.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/TransE.scala
index f5fb0db..43a4205 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/TransE.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/TransE.scala
@@ -1,5 +1,14 @@
package net.sansa_stack.ml.spark.kge.linkprediction.models
+import scala.math._
+
+import com.intel.analytics.bigdl.optim.Adam
+import com.intel.analytics.bigdl.tensor.Tensor
+import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric.NumericFloat
+import net.sansa_stack.rdf.spark.kge.triples.{ IntegerTriples, StringTriples }
+import org.apache.spark.sql._
+
+
/**
* TransE embedding model
* ----------------------
@@ -9,19 +18,8 @@ package net.sansa_stack.ml.spark.kge.linkprediction.models
*
* Created by lpfgarcia on 14/11/2017.
*/
-
-import scala.math._
-
-import org.apache.spark.sql._
-
-import com.intel.analytics.bigdl.optim.Adam
-import com.intel.analytics.bigdl.tensor.Tensor
-import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric.NumericFloat
-
-import net.sansa_stack.rdf.spark.kge.triples.{StringTriples,IntegerTriples}
-
class TransE(train: Dataset[IntegerTriples], ne: Int, nr: Int, batch: Int, k: Int, margin: Float, L: String, sk: SparkSession)
- extends Models(ne: Int, nr: Int, batch: Int, k: Int, sk: SparkSession) {
+ extends Models(ne: Int, nr: Int, batch: Int, k: Int, sk: SparkSession) {
val epochs = 1000
val rate = 0.01f
@@ -30,12 +28,12 @@ class TransE(train: Dataset[IntegerTriples], ne: Int, nr: Int, batch: Int, k: In
val myL = L match {
case "L2" => L2 _
- case _ => L1 _
+ case _ => L1 _
}
import sk.implicits._
- def dist(data: Dataset[IntegerTriples]) = {
+ def dist(data: Dataset[IntegerTriples]): Float = {
val aux = data.collect().map { i =>
e(i.Subject) + r(i.Predicate) - e(i.Object)
@@ -44,11 +42,11 @@ class TransE(train: Dataset[IntegerTriples], ne: Int, nr: Int, batch: Int, k: In
myL(aux)
}
- def dist(row: IntegerTriples) = {
+ def dist(row: IntegerTriples): Tensor[Float] = {
e(row.Subject) + r(row.Predicate) - e(row.Object)
}
- def run() = {
+ def run(): Unit = {
for (i <- 1 to epochs) {
@@ -70,5 +68,4 @@ class TransE(train: Dataset[IntegerTriples], ne: Int, nr: Int, batch: Int, k: In
}
}
-
-}
\ No newline at end of file
+}
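
The idea behind `dist`: a triple (s, p, o) is scored by the L1 (or L2) norm of e(s) + r(p) - e(o), where lower means more plausible. A plain-Scala illustration on Array[Float] embeddings (the class itself works on BigDL tensors):

// Illustration only: score one triple from per-id embedding lookups.
def l1(v: Array[Float]): Float = v.map(x => math.abs(x)).sum

def score(e: Int => Array[Float], r: Int => Array[Float], s: Int, p: Int, o: Int): Float = {
  val diff = e(s).zip(r(p)).map { case (es, rp) => es + rp }
    .zip(e(o)).map { case (x, eo) => x - eo }
  l1(diff)
}
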
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/prediction/Predict.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/prediction/Predict.scala
index 29cee10..e1c8227 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/prediction/Predict.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/prediction/Predict.scala
@@ -1,29 +1,27 @@
package net.sansa_stack.ml.spark.kge.linkprediction.prediction
+import net.sansa_stack.rdf.spark.kge.triples.{ IntegerTriples, StringTriples }
+import org.apache.spark.sql._
+
/**
* Predict Abstract Class
* ----------------------
*
* Created by lpfgarcia on 14/11/2017.
*/
-
-import org.apache.spark.sql._
-
-import net.sansa_stack.rdf.spark.kge.triples.{StringTriples,IntegerTriples}
-
abstract class Evaluate(test: Dataset[IntegerTriples]) {
- def left(row: IntegerTriples, i: Int) = {
+ def left(row: IntegerTriples, i: Int): IntegerTriples = {
IntegerTriples(i, row.Predicate, row.Object)
}
- def right(row: IntegerTriples, i: Int) = {
+ def right(row: IntegerTriples, i: Int): IntegerTriples = {
IntegerTriples(row.Subject, row.Predicate, i)
}
def rank(row: IntegerTriples, spo: String): Integer
- def ranking() = {
+ def ranking(): (Seq[Integer], Seq[Integer]) = {
var l, r = Seq[Integer]()
@@ -35,7 +33,7 @@ abstract class Evaluate(test: Dataset[IntegerTriples]) {
(l, r)
}
- def rawHits10() = {
+ def rawHits10(): (Seq[Boolean], Seq[Boolean]) = {
var l, r = Seq[Boolean]()
@@ -46,5 +44,4 @@ abstract class Evaluate(test: Dataset[IntegerTriples]) {
(l, r)
}
-
-}
\ No newline at end of file
+}
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/prediction/PredictTransE.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/prediction/PredictTransE.scala
index 263c19d..a7c60dc 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/prediction/PredictTransE.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/prediction/PredictTransE.scala
@@ -1,28 +1,26 @@
package net.sansa_stack.ml.spark.kge.linkprediction.prediction
+import org.apache.spark.sql._
+
+import net.sansa_stack.ml.spark.kge.linkprediction.models.TransE
+import net.sansa_stack.rdf.spark.kge.triples.{ IntegerTriples, StringTriples }
+
/**
* Predict TransE Class
* --------------------
*
* Created by lpfgarcia on 14/11/2017.
*/
-
-import org.apache.spark.sql._
-
-import net.sansa_stack.ml.spark.kge.linkprediction.models.TransE
-
-import net.sansa_stack.rdf.spark.kge.triples.{StringTriples,IntegerTriples}
-
class PredictTransE(model: TransE, test: Dataset[IntegerTriples]) extends Evaluate(test: Dataset[IntegerTriples]) {
- def rank(row: IntegerTriples, spo: String) = {
+ def rank(row: IntegerTriples, spo: String): Integer = {
var x = Seq[Float]()
val y = model.myL(model.dist(row))
val cor = spo match {
case "l" => left _
- case _ => right _
+ case _ => right _
}
x = y +: x
@@ -33,4 +31,4 @@ class PredictTransE(model: TransE, test: Dataset[IntegerTriples]) extends Evalua
x.sorted.indexOf(y)
}
-}
\ No newline at end of file
+}
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/run/TransERun.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/run/TransERun.scala
index 2b238a4..45e34b3 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/run/TransERun.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/run/TransERun.scala
@@ -4,18 +4,14 @@ package net.sansa_stack.ml.spark.kge.linkprediction.run
* Created by lpfgarcia on 14/11/2017.
*/
+import org.apache.log4j.{ Level, Logger }
import org.apache.spark.sql._
-import org.apache.log4j.Logger
-import org.apache.log4j.Level
-
-import net.sansa_stack.rdf.spark.kge.convertor.ByIndex
-import net.sansa_stack.rdf.spark.kge.triples._
-
-import net.sansa_stack.ml.spark.kge.linkprediction.crossvalidation.Holdout
-import net.sansa_stack.ml.spark.kge.linkprediction.crossvalidation.{kFold,Bootstrapping,Holdout}
+import net.sansa_stack.ml.spark.kge.linkprediction.crossvalidation.{ kFold, Bootstrapping, Holdout }
import net.sansa_stack.ml.spark.kge.linkprediction.models.TransE
import net.sansa_stack.ml.spark.kge.linkprediction.prediction.PredictTransE
+import net.sansa_stack.rdf.spark.kge.convertor.ByIndex
+import net.sansa_stack.rdf.spark.kge.triples._
object TransERun {
@@ -25,7 +21,7 @@ object TransERun {
val spark = SparkSession.builder.master("local")
.appName("kge").getOrCreate
- def main(args: Array[String]) = {
+ def main(args: Array[String]): Unit = {
val table = new Triples("/home/lpfgarcia/Desktop/SANSA-ML/data/train.txt", "\t", false, false, spark)
@@ -37,18 +33,15 @@ object TransERun {
val (train, test) = new Holdout(data.triples, 0.6f).crossValidation()
-
println("Trinamento:")
println(train.show())
println("Teste:")
println(test.show())
- //var model = new TransE(train, data.e.length, data.r.length, 100, 20, 1, "L1", spark)
- //model.run()
-
- //val predict = new PredictTransE(model, test).ranking()
- //println(predict)
+ // var model = new TransE(train, data.e.length, data.r.length, 100, 20, 1, "L1", spark)
+ // model.run()
+ // val predict = new PredictTransE(model, test).ranking()
+ // println(predict)
}
-
-}
\ No newline at end of file
+}
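
Put together, the runner's intended pipeline (the model and prediction steps are left commented out above) reads as follows; `data` is the ByIndex-converted triple set, whose construction is not shown in this hunk:

val (train, test) = new Holdout(data.triples, 0.6f).crossValidation()
val model = new TransE(train, data.e.length, data.r.length, 100, 20, 1, "L1", spark)
model.run()
val predictions = new PredictTransE(model, test).ranking()
println(predictions)
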
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/run/TriplesRun.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/run/TriplesRun.scala
index 763c443..73cc080 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/run/TriplesRun.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/run/TriplesRun.scala
@@ -1,16 +1,13 @@
package net.sansa_stack.ml.spark.kge.linkprediction.run
-import scala.util.Random
-
-import net.sansa_stack.rdf.spark.kge.triples._
import net.sansa_stack.rdf.spark.kge.convertor.ByIndex
-import org.apache.spark.sql._
+import net.sansa_stack.rdf.spark.kge.triples._
import org.apache.log4j.{ Level, Logger }
-import org.springframework.util.StopWatch
+import org.apache.spark.sql._
-object runTesting extends App {
+object TriplesRun extends App {
- def printType[T](x: T): Unit = { println(x.getClass.toString()) }
+ def printType[T](x: T): Unit = { println(x.getClass.toString) }
Logger.getLogger("org").setLevel(Level.OFF)
Logger.getLogger("akka").setLevel(Level.OFF)
@@ -26,27 +23,26 @@ object runTesting extends App {
println("<<< STARTING >>>")
- var watch: StopWatch = new StopWatch()
+ var startTime = System.currentTimeMillis()
- watch.start()
+ startTime = System.currentTimeMillis()
val trp = new Triples("/home/hamed/workspace/TransE/DataSets/FB15k/freebase_mtr100_mte100-train.txt", "\t", false, false, spark)
- watch.stop()
- println("Readin triples done in " + watch.getTotalTimeSeconds + " seconds")
+ println("Reading triples done in " + (System.currentTimeMillis() - startTime) + " seconds")
- watch.start()
+ startTime = System.currentTimeMillis()
var num: Long = trp.triples.count()
- watch.stop()
- println("\n\n No triples = " + num.toString() + " - Done in " + watch.getTotalTimeSeconds + " seconds.")
- watch.start()
+ println("\n\n No triples = " + num.toString + " - Done in " + (System.currentTimeMillis() - startTime) + " seconds.")
+
+ startTime = System.currentTimeMillis()
num = trp.getEntities().length
- watch.stop()
- println("\n\n No Entities = " + num.toString() + " - Done in " + watch.getTotalTimeSeconds + " seconds.")
- watch.start()
+ println("\n\n No Entities = " + num.toString + " - Done in " + (System.currentTimeMillis() - startTime) + " seconds.")
+
+ startTime = System.currentTimeMillis()
num = trp.getRelations().length
- watch.stop()
- println("\n\n No Predicates = " + num.toString() + " - Done in " + watch.getTotalTimeSeconds + " seconds.")
+
+ println("\n\n No Predicates = " + num.toString + " - Done in " + (System.currentTimeMillis() - startTime) + " seconds.")
// trp.getAllDistinctEntities().take(10).foreach(println)
// println("\n \n No entities = ",trp.getAllDistinctEntities().count() )
// println("\n \n No predicates = ",trp.getAllDistinctPredicates().count() )
@@ -79,10 +75,10 @@ object runTesting extends App {
sample1.show()
- //val r3 = conv.getTriplesByIndex(sample1)
- //r3.printSchema()
- //r3.show
+ // val r3 = conv.getTriplesByIndex(sample1)
+ // r3.printSchema()
+ // r3.show
- //val r4 = conv.getTriplesByString(r3)
- //println("<<< DONE >>>")
+ // val r4 = conv.getTriplesByString(r3)
+ // println("<<< DONE >>>")
}
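
Since the StopWatch calls were replaced by manual System.currentTimeMillis() bookkeeping, a small helper (a suggestion, not part of the patch) would keep the timing logic in one place:

def timed[T](label: String)(block: => T): T = {
  val start = System.currentTimeMillis()
  val result = block
  println(s"$label done in ${(System.currentTimeMillis() - start) / 1000.0} seconds")
  result
}

// e.g. val numTriples = timed("Reading and counting triples") { trp.triples.count() }
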
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/AbstractRDFGraph.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/AbstractRDFGraph.scala
index f5a54b5..a608b50 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/AbstractRDFGraph.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/AbstractRDFGraph.scala
@@ -3,6 +3,7 @@ package net.sansa_stack.ml.spark.mining.amieSpark
import org.apache.jena.graph.Triple
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{ DataFrame, SparkSession }
+
import net.sansa_stack.ml.spark.mining.amieSpark._
/**
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/DfLoader.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/DfLoader.scala
index 1e1674f..888b54c 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/DfLoader.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/DfLoader.scala
@@ -1,7 +1,5 @@
package net.sansa_stack.ml.spark.mining.amieSpark
-import net.sansa_stack.ml.spark.mining.amieSpark._
-
import org.apache.spark.SparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.types._
@@ -18,9 +16,9 @@ object DfLoader {
val startTime = System.currentTimeMillis()
import sqlContext.implicits._
- /* var y = StructType(StructField("sub", StringType,false)::
+ /* var y = StructType(StructField("sub", StringType,false)::
StructField("rel", StringType, false)::
- StructField("ob", StringType, false):: Nil)*/
+ StructField("ob", StringType, false):: Nil) */
val triples =
sc.textFile(path, minPartitions)
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/EmptyRDFGraphDataFrame.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/EmptyRDFGraphDataFrame.scala
index 44421de..dd51b98 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/EmptyRDFGraphDataFrame.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/EmptyRDFGraphDataFrame.scala
@@ -1,7 +1,7 @@
package net.sansa_stack.ml.spark.mining.amieSpark
-import org.apache.spark.sql.types.{ StringType, StructField, StructType }
import org.apache.spark.sql.{ DataFrame, Row, SQLContext }
+import org.apache.spark.sql.types.{ StringType, StructField, StructType }
/**
* @author Lorenz Buehmann
@@ -26,4 +26,4 @@ object EmptyRDFGraphDataFrame {
triplesDataFrame
}
-}
\ No newline at end of file
+}
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/KBObject.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/KBObject.scala
index b7d8088..442a589 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/KBObject.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/KBObject.scala
@@ -1,25 +1,23 @@
package net.sansa_stack.ml.spark.mining.amieSpark
-import org.apache.spark.SparkContext
-import org.apache.spark.sql.{ DataFrame, SQLContext }
+import java.io.File
import scala.collection.mutable.{ ArrayBuffer, Map }
-//import net.sansa_stack.ml.spark.dissect.inference.utils._
-
-import java.io.File
-
-import net.sansa_stack.ml.spark.mining.amieSpark.Rules.RuleContainer
+import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.{ DataFrame, SQLContext }
import org.apache.spark.sql.functions.udf
+import net.sansa_stack.ml.spark.mining.amieSpark.Rules.RuleContainer
+
object KBObject {
case class Atom(rdf: RDFTriple)
class KB() extends Serializable {
var kbSrc: String = ""
- var kbGraph: RDFGraph = null
- var dfTable: DataFrame = null
+ var kbGraph: RDFGraph = _
+ var dfTable: DataFrame = _
var dfMap: Map[String, DataFrame] = Map()
@@ -62,7 +60,7 @@ object KBObject {
}
str = str.replace(" ", "_").replace("?", "_")
- return str
+ str
}
def calcName(whole: ArrayBuffer[RDFTriple]): String = {
@@ -83,11 +81,11 @@ object KBObject {
} else {
countMap += (w._3 -> 1)
}
- if (!(numberMap.contains(w._1))) {
+ if (!numberMap.contains(w._1)) {
numberMap += (w._1 -> counter)
counter += 1
}
- if (!(numberMap.contains(w._3))) {
+ if (!numberMap.contains(w._3)) {
numberMap += (w._3 -> counter)
counter += 1
}
@@ -113,28 +111,28 @@ object KBObject {
out += a + "_" + wh._2 + "_" + b + "_"
}
out = out.stripSuffix("_")
- return out
+ out
}
def getRngSize(rel: String): Double = {
- return this.predicate2object2subject.get(rel).get.size
+ this.predicate2object2subject.get(rel).get.size
}
def setKbSrc(x: String) {
this.kbSrc = x
}
- def getKbSrc(): String = {
+ def getKbSrc: String = {
- return this.kbSrc
+ this.kbSrc
}
- def getKbGraph(): RDFGraph = {
- return this.kbGraph
+ def getKbGraph: RDFGraph = {
+ this.kbGraph
}
- //TODO: think about Graph representation
+ // TODO: think about Graph representation
def setKbGraph(x: RDFGraph) {
this.kbGraph = x
val graph = x.triples.collect
@@ -163,7 +161,7 @@ object KBObject {
}
- return out
+ out
}
/**
@@ -178,7 +176,7 @@ object KBObject {
val subject = tp.subject
val relation = tp.predicate
val o = tp.`object`
- //filling the to to to maps
+ // filling the to to to maps
if (!(add(subject, relation, o, this.subject2predicate2object))) {
add(relation, o, subject, this.predicate2object2subject)
add(o, subject, relation, this.object2subject2predicate)
@@ -187,7 +185,7 @@ object KBObject {
add(subject, o, relation, this.subject2object2predicate)
}
- //filling the sizes
+ // filling the sizes
if (this.subjectSize.get(subject).isEmpty) {
this.subjectSize += (subject -> 1)
} else {
@@ -212,7 +210,7 @@ object KBObject {
this.objectSize += (o -> obSize)
}
- //filling the overlaps
+ // filling the overlaps
if (this.subject2subjectOverlap.get(relation).isEmpty) {
subject2subjectOverlap += (relation -> Map())
@@ -242,18 +240,18 @@ object KBObject {
return 0
}
- return this.relationSize.get(rel).get
+ this.relationSize.get(rel).get
}
- /*TO DO
+ /* TODO
* Functionality
* bulidOverlapTable
* */
def relationsSize(): Int = {
- return this.relationSize.size
+ this.relationSize.size
}
/**
@@ -264,7 +262,7 @@ object KBObject {
var x = this.subjectSize.size
var y = this.objectSize.size
- return (x + y)
+ (x + y)
}
/**
@@ -283,24 +281,24 @@ object KBObject {
val objects2 = predicate2object2subject.get(r2).get.keys.toSet
if (!r1.equals(r2)) {
- var ssoverlap: Int = computeOverlap(subjects1, subjects2);
- subject2subjectOverlap.get(r1).get.put(r2, ssoverlap);
- subject2subjectOverlap.get(r2).get.put(r1, ssoverlap);
+ var ssoverlap: Int = computeOverlap(subjects1, subjects2)
+ subject2subjectOverlap.get(r1).get.put(r2, ssoverlap)
+ subject2subjectOverlap.get(r2).get.put(r1, ssoverlap)
} else {
- subject2subjectOverlap.get(r1).get.put(r1, subjects2.size);
+ subject2subjectOverlap.get(r1).get.put(r1, subjects2.size)
}
- var soverlap1: Int = computeOverlap(subjects1, objects2);
- subject2objectOverlap.get(r1).get.put(r2, soverlap1);
- var soverlap2: Int = computeOverlap(subjects2, objects1);
- subject2objectOverlap.get(r2).get.put(r1, soverlap2);
+ var soverlap1: Int = computeOverlap(subjects1, objects2)
+ subject2objectOverlap.get(r1).get.put(r2, soverlap1)
+ var soverlap2: Int = computeOverlap(subjects2, objects1)
+ subject2objectOverlap.get(r2).get.put(r1, soverlap2)
if (!r1.equals(r2)) {
- var oooverlap: Int = computeOverlap(objects1, objects2);
- object2objectOverlap.get(r1).get.put(r2, oooverlap);
- object2objectOverlap.get(r2).get.put(r1, oooverlap);
+ var oooverlap: Int = computeOverlap(objects1, objects2)
+ object2objectOverlap.get(r1).get.put(r2, oooverlap)
+ object2objectOverlap.get(r2).get.put(r1, oooverlap)
} else {
- object2objectOverlap.get(r1).get.put(r1, objects2.size);
+ object2objectOverlap.get(r1).get.put(r1, objects2.size)
}
}
}
@@ -316,11 +314,12 @@ object KBObject {
def computeOverlap(s1: Set[String], s2: Set[String]): Int = {
var overlap: Int = 0
for (r <- s1) {
- if (s2.contains(r))
+ if (s2.contains(r)) {
overlap += 1
+ }
}
- return overlap
+ overlap
}
// ---------------------------------------------------------------------------
@@ -334,13 +333,13 @@ object KBObject {
*
*/
def functionality(relation: String): Double = {
- /*if (relation.equals(EQUALSbs)) {
- return 1.0;*/
+ /* if (relation.equals(EQUALSbs)) {
+ return 1.0; */
if (this.predicate2subject2object.get(relation).isEmpty) { return 0.0 }
var a: Double = this.predicate2subject2object.get(relation).get.size
var b: Double = this.relationSize.get(relation).get
- return (a / b)
+ (a / b)
}
@@ -351,12 +350,12 @@ object KBObject {
*
*/
def inverseFunctionality(relation: String): Double = {
- /*if (relation.equals(EQUALSbs)) {
- return 1.0;
- } */
+ /* if (relation.equals(EQUALSbs)) {
+ return 1.0
+ } */
var a: Double = this.predicate2object2subject.get(relation).get.size
var b: Double = this.relationSize.get(relation).get
- return (a / b)
+ (a / b)
}
@@ -368,7 +367,7 @@ object KBObject {
* @author AMIE+ Team
*/
def isFunctional(relation: String): Boolean = {
- return functionality(relation) >= inverseFunctionality(relation);
+ functionality(relation) >= inverseFunctionality(relation)
}
/**
@@ -381,10 +380,11 @@ object KBObject {
*
*/
def functionality(relation: String, inversed: Boolean): Double = {
- if (inversed)
- return inverseFunctionality(relation);
- else
- return functionality(relation);
+ if (inversed) {
+ inverseFunctionality(relation)
+ } else {
+ functionality(relation)
+ }
}
/**
@@ -396,10 +396,11 @@ object KBObject {
*
*/
def inverseFunctionality(relation: String, inversed: Boolean): Double = {
- if (inversed)
- return functionality(relation);
- else
- return inverseFunctionality(relation);
+ if (inversed) {
+ functionality(relation)
+ } else {
+ inverseFunctionality(relation)
+ }
}
/**
@@ -408,28 +409,29 @@ object KBObject {
* length of maplist is the number of instantiations of a rule
*
* @param triplesCard rule as an ArrayBuffer of RDFTriples, triplesCard(0)
- * is the head of the rule
+ * is the head of the rule
* @param sc spark context
*
*/
- //----------------------------------------------------------------
+ // ----------------------------------------------------------------
// Statistics
- //----------------------------------------------------------------
+ // ----------------------------------------------------------------
def overlap(relation1: String, relation2: String, overlap: Int): Double = {
overlap match {
- case SUBJECT2SUBJECT => if ((!(subject2subjectOverlap.get(relation1).isEmpty)) && (!(subject2subjectOverlap.get(relation1).get.get(relation2).isEmpty))) { return subject2subjectOverlap.get(relation1).get.get(relation2).get }
- else return 0.0
+ case SUBJECT2SUBJECT =>
+ if (subject2subjectOverlap.get(relation1).isDefined && (!(subject2subjectOverlap.get(relation1).get.get(relation2).isEmpty))) {
+ subject2subjectOverlap.get(relation1).get.get(relation2).get
+ } else 0.0
case SUBJECT2OBJECT =>
-
- if ((!(subject2objectOverlap.get(relation1).isEmpty)) && (!(subject2objectOverlap.get(relation1).get.get(relation2).isEmpty))) { return subject2objectOverlap.get(relation1).get.get(relation2).get }
- else return 0.0
+ if ((!(subject2objectOverlap.get(relation1).isEmpty)) && (!(subject2objectOverlap.get(relation1).get.get(relation2).isEmpty))) {
+ subject2objectOverlap.get(relation1).get.get(relation2).get
+ } else 0.0
case OBJECT2OBJECT =>
-
- if ((!(object2objectOverlap.get(relation1).isEmpty)) && (!(object2objectOverlap.get(relation1).get.get(relation2).isEmpty))) { return object2objectOverlap.get(relation1).get.get(relation2).get }
- else return 0.0
-
+ if ((!(object2objectOverlap.get(relation1).isEmpty)) && (!(object2objectOverlap.get(relation1).get.get(relation2).isEmpty))) {
+ object2objectOverlap.get(relation1).get.get(relation2).get
+ } else 0.0
}
}
@@ -445,16 +447,16 @@ object KBObject {
def relationColumnSize(rel: String, elem: String): Int = {
elem match {
case "subject" =>
- return predicate2subject2object.get(rel).get.size
+ predicate2subject2object.get(rel).get.size
case "object" =>
- return predicate2object2subject.get(rel).get.size
+ predicate2object2subject.get(rel).get.size
}
}
- //TODO: better than cardinality
+ // TODO: better than cardinality
def bindingExists(triplesCard: ArrayBuffer[RDFTriple]): Boolean = {
val k = this.kbGraph
@@ -478,7 +480,7 @@ object KBObject {
var minSize = this.relationSize.get(triplesCard(0).predicate).get
var index = 0
- for (i <- 1 to triplesCard.length - 1) {
+ for (i <- 1 until triplesCard.length) {
if (this.relationSize.get(triplesCard(i).predicate).get < minSize) {
minSize = this.relationSize.get(triplesCard(i).predicate).get
min = triplesCard(i)
@@ -500,7 +502,7 @@ object KBObject {
x = k.find(None, Some(min.predicate), None).collect
}
- //x.foreach(println)
+ // x.foreach(println)
triplesCard.remove(index)
for (i <- x) {
@@ -530,16 +532,16 @@ object KBObject {
if (test) {
if ((a.startsWith("?")) && ((j._1 == a) && (!(atestLeft)))) {
- temp += new RDFTriple(i._1, j._2, j._3)
+ temp += RDFTriple(i._1, j._2, j._3)
} else if ((a.startsWith("?")) && ((j._3 == a) && (!(atestRight)))) {
- temp += new RDFTriple(j._1, j._2, i._1)
+ temp += RDFTriple(j._1, j._2, i._1)
} else if ((b.startsWith("?")) && ((j._3 == b) && (!(btestRight)))) {
- temp += new RDFTriple(j._1, j._2, i._3)
+ temp += RDFTriple(j._1, j._2, i._3)
} else if ((b.startsWith("?")) && ((j._1 == b) && (!(btestLeft)))) {
- temp += new RDFTriple(i._3, j._2, j._3)
+ temp += RDFTriple(i._3, j._2, j._3)
} else if ((b.startsWith("?")) && (((j._3 == b) && (btestRight)) || ((j._1 == b) && (btestLeft)))) {
exploreFurther = false
} else if ((a.startsWith("?")) && (((j._1 == a) && (atestLeft)) || ((j._3 == a) && (atestRight)))) {
@@ -564,12 +566,12 @@ object KBObject {
}
- return false
+ false
}
- def varCount(tpAr: ArrayBuffer[RDFTriple]): ArrayBuffer[Tuple2[String, String]] = {
+ def varCount(tpAr: ArrayBuffer[RDFTriple]): ArrayBuffer[(String, String)] = {
- var out2: ArrayBuffer[Tuple2[String, String]] = new ArrayBuffer
+ var out2: ArrayBuffer[(String, String)] = new ArrayBuffer
for (i <- tpAr) {
if (!(out2.contains(Tuple2(i.subject, i.predicate)))) {
@@ -582,9 +584,9 @@ object KBObject {
}
- return out2
+ out2
}
- def countProjectionQueriesDF(posit: Int, id: Int, operator: String, minHC: Double, tpAr: ArrayBuffer[RDFTriple], RXY: ArrayBuffer[Tuple2[String, String]], sc: SparkContext, sqlContext: SQLContext): DataFrame =
+ def countProjectionQueriesDF(posit: Int, id: Int, operator: String, minHC: Double, tpAr: ArrayBuffer[RDFTriple], RXY: ArrayBuffer[(String, String)], sc: SparkContext, sqlContext: SQLContext): DataFrame =
{
val threshold = minHC * this.relationSize.get(tpAr(0).predicate).get
@@ -643,13 +645,13 @@ object KBObject {
}
- return whole
+ whole
}
def cardinalityQueries(id: Int, tpArDF: DataFrame, wholeAr: ArrayBuffer[RDFTriple], sc: SparkContext, sqlContext: SQLContext): DataFrame = {
val DF = this.dfTable
- var tpMap: Map[String, ArrayBuffer[Tuple2[Int, String]]] = Map()
+ var tpMap: Map[String, ArrayBuffer[(Int, String)]] = Map()
DF.registerTempTable("table")
tpArDF.registerTempTable("tpArTable")
@@ -659,10 +661,10 @@ object KBObject {
var v = sqlContext.sql("SELECT * FROM tpArTable JOIN newColumn")
var varAr: ArrayBuffer[String] = new ArrayBuffer
- var checkMap: Map[Int, Tuple2[String, String]] = Map()
+ var checkMap: Map[Int, (String, String)] = Map()
var checkSQLSELECT = "SELECT "
- for (i <- 0 to wholeAr.length - 1) {
+ for (i <- wholeAr.indices) {
var a = wholeAr(i).subject
var b = wholeAr(i)._3
@@ -693,7 +695,7 @@ object KBObject {
var cloneTpAr = wholeAr.clone()
- var removedMap: Map[String, ArrayBuffer[Tuple2[Int, String]]] = Map()
+ var removedMap: Map[String, ArrayBuffer[(Int, String)]] = Map()
varAr = varAr.distinct
var checkSQLWHERE = "WHERE "
@@ -732,7 +734,7 @@ object KBObject {
}
checkSQLWHERE = checkSQLWHERE.stripSuffix(" AND ")
- var seq: Seq[String] = Seq((wholeAr.last.toString() + " " + id.toString()))
+ var seq: Seq[String] = Seq((wholeAr.last.toString() + " " + id.toString))
import sqlContext.implicits._
var key: DataFrame = seq.toDF("key")
@@ -745,7 +747,7 @@ object KBObject {
key.registerTempTable("keyTable")
var out = sqlContext.sql(checkSQLSELECT + ", keyTable.key FROM lastTable JOIN keyTable")
- return out
+ out
}
@@ -754,18 +756,19 @@ object KBObject {
*/
def cardinality(tpAr: ArrayBuffer[RDFTriple], sc: SparkContext, sqlContext: SQLContext): DataFrame = {
+ println(s"computing cardinality for ${tpAr.mkString(",")} ...")
var name = calcName(tpAr)
if (dfMap.contains(name)) {
- return dfMap.get(name).get
+ dfMap.get(name).get
} else {
val DF = this.dfTable
- var tpMap: Map[String, ArrayBuffer[Tuple2[Int, String]]] = Map()
+ var tpMap: Map[String, ArrayBuffer[(Int, String)]] = Map()
DF.registerTempTable("table")
var v = sqlContext.sql("SELECT rdf AS tp0 FROM table WHERE rdf.predicate = '" + tpAr(0).predicate + "'")
- for (k <- 1 to tpAr.length - 1) {
+ for (k <- 1 until tpAr.length) {
var w = sqlContext.sql("SELECT rdf AS tp" + k + " FROM table WHERE rdf.predicate = '" + tpAr(k).predicate + "'")
w.registerTempTable("newColumn")
@@ -773,7 +776,7 @@ object KBObject {
tempO.registerTempTable("previous")
var sqlString = ""
- for (re <- 0 to k - 1) {
+ for (re <- 0 until k) {
sqlString += "previous.tp" + re + ", "
}
@@ -782,10 +785,10 @@ object KBObject {
}
var varAr: ArrayBuffer[String] = new ArrayBuffer
- var checkMap: Map[Int, Tuple2[String, String]] = Map()
+ var checkMap: Map[Int, (String, String)] = Map()
var checkSQLSELECT = "SELECT "
- for (i <- 0 to tpAr.length - 1) {
+ for (i <- tpAr.indices) {
var a = tpAr(i).subject
var b = tpAr(i)._3
@@ -816,7 +819,7 @@ object KBObject {
var cloneTpAr = tpAr.clone()
- var removedMap: Map[String, ArrayBuffer[Tuple2[Int, String]]] = Map()
+ var removedMap: Map[String, ArrayBuffer[(Int, String)]] = Map()
varAr = varAr.distinct
var checkSQLWHERE = "WHERE "
@@ -856,8 +859,9 @@ object KBObject {
checkSQLWHERE = checkSQLWHERE.stripSuffix(" AND ")
v.registerTempTable("t")
+ println(checkSQLSELECT + " FROM t " + checkSQLWHERE)
var out = sqlContext.sql(checkSQLSELECT + " FROM t " + checkSQLWHERE)
- return out
+ out
}
}
@@ -869,7 +873,7 @@ object KBObject {
var go = false
var outCount: Double = 0.0
var tpsString = calcName(tpAr)
- for (i <- 1 to tpAr.length - 1) {
+ for (i <- 1 until tpAr.length) {
if ((tpAr(i)._1 == "?a") || (tpAr(i)._3 == "?a")) {
go = true
}
@@ -879,7 +883,7 @@ object KBObject {
return outCount
}
- if (go) {
+ if ( go ) {
var card = dfMap.get(tpsString).get
@@ -905,33 +909,23 @@ object KBObject {
h.registerTempTable("subjects")
out = sqlContext.sql("SELECT twoLengthT.tp0 FROM twoLengthT JOIN subjects ON twoLengthT.tp0." + abString + "=subjects.sub")
- /*
- if ((tpAr(0).predicate == "directed")&&(tpAr(1).predicate== "produced")&&(tpAr(1).subject== "?a")&&(tpAr(1)._3== "?b")){
- h.show(800, false)
-
- var fjgf = sqlContext.sql("SELECT ")
- }
-
-
- */
-
}
outCount = out.count()
}
- return outCount
+ outCount
}
def negatveExampleBuilder(subjects: DataFrame, wholeAr: ArrayBuffer[RDFTriple], sc: SparkContext, sqlContext: SQLContext): DataFrame = {
val DF = this.dfTable
- var tpMap: Map[String, ArrayBuffer[Tuple2[Int, String]]] = Map()
+ var tpMap: Map[String, ArrayBuffer[(Int, String)]] = Map()
DF.registerTempTable("table")
var wholeTPARBackup = wholeAr.clone()
wholeAr.remove(0)
var complete = sqlContext.sql("SELECT rdf AS tp" + 0 + " FROM table WHERE rdf.predicate = '" + (wholeAr(0)).predicate + "'")
- for (i <- 1 to wholeAr.length - 1) {
+ for (i <- 1 until wholeAr.length) {
var w = sqlContext.sql("SELECT rdf AS tp" + i + " FROM table WHERE rdf.predicate = '" + (wholeAr(i)).predicate + "'")
w.registerTempTable("newColumn")
@@ -940,12 +934,12 @@ object KBObject {
}
var varAr: ArrayBuffer[String] = new ArrayBuffer
- var checkMap: Map[Int, Tuple2[String, String]] = Map()
+ var checkMap: Map[Int, (String, String)] = Map()
var checkSQLSELECT = "SELECT "
var abString = ("", "")
- for (i <- 0 to wholeAr.length - 1) {
+ for (i <- wholeAr.indices) {
var a = wholeAr(i).subject
var b = wholeAr(i)._3
@@ -984,7 +978,7 @@ object KBObject {
var cloneTpAr = wholeAr.clone()
- var removedMap: Map[String, ArrayBuffer[Tuple2[Int, String]]] = Map()
+ var removedMap: Map[String, ArrayBuffer[(Int, String)]] = Map()
varAr = varAr.distinct
var checkSQLWHERE = "WHERE "
@@ -1032,11 +1026,11 @@ object KBObject {
var out = sqlContext.sql(checkSQLSELECT + " FROM lastTable JOIN keyTable ON lastTable." + abString._2 + "." + abString._1 + "=keyTable.sub")
- return out
+ out
}
- //TODO: solve with DataFrames
+ // TODO: solve with DataFrames
def cardPlusnegativeExamplesLength(triplesCard: ArrayBuffer[RDFTriple], sc: SparkContext): Double = {
val k = this.kbGraph
@@ -1058,7 +1052,7 @@ object KBObject {
}
- /**initializing maplist with head of the rule*/
+ /** initializing maplist with head of the rule */
for (ii <- arbuf(0).collect()) {
mapList += Map(triplesCard(0).subject -> ii._1, triplesCard(0).`object` -> ii._3)
@@ -1066,9 +1060,9 @@ object KBObject {
var temp = mapList.clone()
- for (tripleCount <- 1 to triplesCard.length - 1) {
+ for (tripleCount <- 1 until triplesCard.length) {
- val rdd1 = sc.parallelize(mapList.toSeq)
+ val rdd1 = sc.parallelize(mapList)
val rdd2 = arbuf(tripleCount)
val comb = rdd1.cartesian(rdd2) // cartesian() to get every possible combination
@@ -1084,17 +1078,17 @@ object KBObject {
for (i <- combinations) {
var ltrip = i._2
- var elem1 = ltrip._1 //subject from combination
+ var elem1 = ltrip._1 // subject from combination
var elem2 = ltrip._3
var trip1 = triplesCard(tripleCount)._1 // subject from Rule
var trip2 = triplesCard(tripleCount)._3
- /**checking map for placeholder for the subject*/
+ /** checking map for placeholder for the subject */
if (!(i._1.contains(trip1))) {
i._1 += (trip1 -> elem1)
}
- /**checking map for placeholder for the object*/
+ /** checking map for placeholder for the object */
if (!(i._1.contains(trip2))) {
i._1 += (trip2 -> elem2)
}
@@ -1106,9 +1100,9 @@ object KBObject {
}
}
- var rightOnes = sc.parallelize(mapList.toSeq).map(y => y.get(triplesCard(0).subject).get).distinct.collect
+ var rightOnes = sc.parallelize(mapList).map(y => y.get(triplesCard(0).subject).get).distinct.collect
- var as = sc.parallelize(temp.toSeq).map {
+ var as = sc.parallelize(temp).map {
x =>
(x.get(triplesCard(0).subject).get, 1)
@@ -1116,19 +1110,17 @@ object KBObject {
var out: Double = 0.0
for (i <- as) {
- if (rightOnes.contains(i._1))
+ if (rightOnes.contains(i._1)) {
out += (i._2 - 1)
-
+ }
}
-
- return ((mapList.length) + out)
-
+ ((mapList.length) + out)
}
def addDanglingAtom(c: Int, id: Int, minHC: Double, rule: RuleContainer, sc: SparkContext, sqlContext: SQLContext): DataFrame =
{
val tpAr = rule.getRule()
- var RXY: ArrayBuffer[Tuple2[String, String]] = new ArrayBuffer
+ var RXY: ArrayBuffer[(String, String)] = new ArrayBuffer
val notC = rule.notClosed()
@@ -1148,13 +1140,13 @@ object KBObject {
var x = this.countProjectionQueriesDF(c, id, "OD", minHC, tpAr, RXY, sc, sqlContext)
- return x
+ x
}
def addClosingAtom(c: Int, id: Int, minHC: Double, rule: RuleContainer, sc: SparkContext, sqlContext: SQLContext): DataFrame =
{
val tpAr = rule.getRule()
- var RXY: ArrayBuffer[Tuple2[String, String]] = new ArrayBuffer
+ var RXY: ArrayBuffer[(String, String)] = new ArrayBuffer
val notC = rule.notClosed()
@@ -1190,9 +1182,8 @@ object KBObject {
}
var x = this.countProjectionQueriesDF(c, id, "OC", minHC, tpAr, RXY, sc, sqlContext)
- return x
+ x
}
}
-
-}
\ No newline at end of file
+}
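
For reference, functionality(r) divides the number of distinct subjects of r by the number of facts of r, and inverseFunctionality(r) does the same with objects. A standalone sketch over an in-memory fact list (illustration only, not the KB implementation):

val facts = Seq(("a", "livesIn", "Bonn"), ("b", "livesIn", "Bonn"), ("a", "livesIn", "Berlin"))

def functionality(rel: String, triples: Seq[(String, String, String)]): Double = {
  val ofRel = triples.filter(_._2 == rel)
  ofRel.map(_._1).distinct.size.toDouble / ofRel.size
}

println(functionality("livesIn", facts)) // 2 distinct subjects / 3 facts = 0.666...
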
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/MineRules.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/MineRules.scala
index 5f15515..193a760 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/MineRules.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/MineRules.scala
@@ -3,23 +3,23 @@ package net.sansa_stack.ml.spark.mining.amieSpark
import java.io.File
import java.net.URI
-import net.sansa_stack.ml.spark.mining.amieSpark.KBObject.KB
-import net.sansa_stack.ml.spark.mining.amieSpark.Rules.RuleContainer
-import org.apache.spark.SparkContext
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{ DataFrame, SQLContext, SparkSession, _ }
-
import scala.collection.mutable.{ ArrayBuffer, Map }
import scala.util.Try
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.fs.Path
+import org.apache.spark.SparkContext
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.{ DataFrame, SparkSession, SQLContext, _ }
import net.sansa_stack.ml.spark.mining.amieSpark.DfLoader.Atom
+import net.sansa_stack.ml.spark.mining.amieSpark.KBObject.KB
+import net.sansa_stack.ml.spark.mining.amieSpark.Rules.RuleContainer
+
object MineRules {
/**
- * Algorithm that mines the Rules.
+ * Algorithm that mines the Rules.
*
* @param kb object knowledge base that was created in main
* @param minHC threshold on head coverage
@@ -55,11 +55,11 @@ object MineRules {
} else {
countMap += (w._3 -> 1)
}
- if (!(numberMap.contains(w._1))) {
+ if (!numberMap.contains(w._1)) {
numberMap += (w._1 -> counter)
counter += 1
}
- if (!(numberMap.contains(w._3))) {
+ if (!numberMap.contains(w._3)) {
numberMap += (w._3 -> counter)
counter += 1
}
@@ -85,15 +85,16 @@ object MineRules {
out += a + "_" + wh._2 + "_" + b + "_"
}
out = out.stripSuffix("_")
- return out
+ out
}
def ruleMining(sc: SparkContext, sqlContext: SQLContext): ArrayBuffer[RuleContainer] = {
- var predicates = kb.getKbGraph().triples.map { x => x.predicate
+ var predicates = kb.getKbGraph.triples.map { x => x.predicate
}.distinct
var z = predicates.collect()
+ println(s"#predicates:$z.length")
/**
* q is a queue with one atom rules
@@ -119,7 +120,7 @@ object MineRules {
var out: ArrayBuffer[RuleContainer] = new ArrayBuffer
var dublicate: ArrayBuffer[String] = ArrayBuffer("")
- for (i <- 0 to this.maxLen - 1) {
+ for (i <- 0 until this.maxLen) {
if ((i > 0) && (dataFrameRuleParts != null)) {
var temp = q.clone
@@ -147,14 +148,14 @@ object MineRules {
var dubCheck = fstTp
- for (i <- 1 to newTpArr.length - 1) {
+ for (i <- 1 until newTpArr.length) {
var temp = newTpArr(i).toString
dubCheck += sortedNewTpArr(i).toString
if (temp == fstTp) {
counter += 1
}
}
- if ((counter < newTpArr.length) && (!(dublicate.contains(dubCheck)))) {
+ if ((counter < newTpArr.length) && (!dublicate.contains(dubCheck))) {
dublicate += dubCheck
newRuleC.setRule(minConf, n1._2, parent, newTpArr, sortedNewTpArr, kb, sc, sqlContext)
q += newRuleC
@@ -162,12 +163,12 @@ object MineRules {
}
- } else if ((i > 0) && ((dataFrameRuleParts == null) || (dataFrameRuleParts.isEmpty()))) {
+ } else if ((i > 0) && ((dataFrameRuleParts == null) || dataFrameRuleParts.isEmpty())) {
q = new ArrayBuffer
}
- if ((!q.isEmpty)) {
- for (j <- 0 to q.length - 1) {
+ if (q.nonEmpty) {
+ for (j <- q.indices) {
val r: RuleContainer = q(j)
@@ -180,11 +181,11 @@ object MineRules {
if (acceptedForOutput(outMap, r, minConf, kb, sc, sqlContext)) {
out += r
- if (!(outMap.contains(tp(0).predicate))) {
+ if (!outMap.contains(tp(0).predicate)) {
outMap += (tp(0).predicate -> ArrayBuffer((tp, r)))
} else {
var temp: ArrayBuffer[(ArrayBuffer[RDFTriple], RuleContainer)] = outMap.remove(tp(0).predicate).get
- temp += new Tuple2(tp, r)
+ temp += Tuple2(tp, r)
outMap += (tp(0).predicate -> temp)
}
@@ -195,7 +196,7 @@ object MineRules {
if (r.getRule().length < maxLen) {
dataFrameRuleParts = refine(i, j, r, dataFrameRuleParts, sc, sqlContext)
- //TODO: Dublicate check
+ // TODO: Duplicate check
}
@@ -204,7 +205,7 @@ object MineRules {
}
- return out
+ out
}
/**
@@ -219,14 +220,14 @@ object MineRules {
var out: DataFrame = null
var OUT: RDD[(RDFTriple, Int, Int)] = dataFrameRuleParts
- //var count2:RDD[(String, Int)] = null
+ // var count2:RDD[(String, Int)] = null
var path = new File("test_table/")
var temp = 0
val tpAr = r.getRule()
var stringSELECT = ""
- for (tp <- 0 to tpAr.length - 1) {
+ for (tp <- tpAr.indices) {
stringSELECT += "tp" + tp + ", "
@@ -239,7 +240,7 @@ object MineRules {
var a = kb.addDanglingAtom(c, id, minHC, r, sc, sqlContext)
z = Try(a.first())
- if ((!(z.isFailure)) && (z.isSuccess)) {
+ if ((!z.isFailure) && z.isSuccess) {
out = a
@@ -251,7 +252,7 @@ object MineRules {
var t = Try(b.first)
- if ((!(t.isFailure)) && (t.isSuccess) && (temp == 0)) {
+ if ((!t.isFailure) && t.isSuccess && (temp == 0)) {
if (out == null) {
out = b
@@ -265,12 +266,12 @@ object MineRules {
var count: RDD[(String, Int)] = null
var o: RDD[(RDFTriple, Int, Int)] = null
- if (((!(t.isFailure)) && (t.isSuccess)) || ((z != null) && (!(z.isFailure)) && (z.isSuccess))) {
- count = out.rdd.map(x => (x(r.getRule().length + 1).toString(), 1)).reduceByKey(_ + _)
+ if (((!t.isFailure) && t.isSuccess) || ((z != null) && (!z.isFailure) && z.isSuccess)) {
+ count = out.rdd.map(x => (x(r.getRule().length + 1).toString, 1)).reduceByKey(_ + _)
o = count.map(q => (q._1.split("\\s+"), q._2)).map { token =>
Tuple3(RDFTriple(token._1(0), token._1(1), token._1(2)), token._2, token._1(3).toInt)
- }.filter(n1 => (n1._2 >= (kb.getRngSize(n1._1.predicate) * minHC)))
+ }.filter(n1 => n1._2 >= (kb.getRngSize(n1._1.predicate) * minHC))
if (OUT == null) {
OUT = o
@@ -280,7 +281,7 @@ object MineRules {
}
- return OUT
+ OUT
}
@@ -294,14 +295,14 @@ object MineRules {
*/
def acceptedForOutput(outMap: Map[String, ArrayBuffer[(ArrayBuffer[RDFTriple], RuleContainer)]], r: RuleContainer, minConf: Double, k: KB, sc: SparkContext, sqlContext: SQLContext): Boolean = {
- //if ((!(r.closed())) || (r.getPcaConfidence(k, sc, sqlContext) < minConf)) {
- if ((!(r.closed())) || (r.getPcaConfidence() < minConf)) {
+ // if ((!(r.closed())) || (r.getPcaConfidence(k, sc, sqlContext) < minConf)) {
+ if ((!r.closed()) || (r.getPcaConfidence() < minConf)) {
return false
}
var parents: ArrayBuffer[RuleContainer] = r.parentsOfRule(outMap, sc)
- if (r.getRule.length > 2) {
+ if (r.getRule().length > 2) {
for (rp <- parents) {
if (r.getPcaConfidence() <= rp.getPcaConfidence()) {
return false
@@ -310,14 +311,14 @@ object MineRules {
}
}
- return true
+ true
}
def sort(tp: ArrayBuffer[RDFTriple]): ArrayBuffer[RDFTriple] = {
var out = ArrayBuffer(tp(0))
- var temp = new ArrayBuffer[Tuple2[String, RDFTriple]]
+ var temp = new ArrayBuffer[(String, RDFTriple)]
- for (i <- 1 to tp.length - 1) {
+ for (i <- 1 until tp.length) {
var tempString: String = tp(i).predicate + tp(i).subject + tp(i).`object`
temp += Tuple2(tempString, tp(i))
@@ -327,63 +328,8 @@ object MineRules {
out += t._2
}
- return out
+ out
}
}
-
- def main(args: Array[String]) = {
- val know = new KB()
-
- val sparkSession = SparkSession.builder
-
- .master("local[*]")
- .appName("AMIESpark example")
-
- .getOrCreate()
-
- if (args.length < 2) {
- System.err.println(
- "Usage: Triple reader