diff --git a/.gitignore b/.gitignore index 417122f..7bface9 100644 --- a/.gitignore +++ b/.gitignore @@ -19,4 +19,6 @@ project/plugins/project/ # IntelliJ IDEA specific /.idea -sansa-ml-parent_2.11.iml +*.iml + +scalastyle-output.xml diff --git a/.travis.yml b/.travis.yml index 2e15138..682ef63 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,10 @@ -language: java +language: scala sudo: false cache: directories: - - $HOME/.m2 \ No newline at end of file + - $HOME/.m2 +scala: + - 2.11.11 +script: + - mvn scalastyle:check + - mvn test \ No newline at end of file diff --git a/pom.xml b/pom.xml index e240109..4a4c58b 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ 4.0.0 net.sansa-stack sansa-ml-parent_2.11 - 0.4.0 + 0.5.0 pom ML API - Parent RDF/OWL Machine Learning Library for Big Data @@ -20,8 +20,8 @@ - GNU GENERAL PUBLIC LICENSE, Version 3 - http://www.gnu.org/licenses/gpl-3.0.txt + Apache License 2.0 + http://www.apache.org/licenses/LICENSE-2.0.html repo @@ -65,10 +65,11 @@ UTF-8 2.11.11 2.11 - 2.3.1 - 1.5.0 - 3.7.0 - 0.4.0 + 2.4.0 + 1.7.0 + 3.9.0 + 0.5.0 + 0.4.1 ${project.basedir}/scalastyle-config.xml @@ -85,14 +86,14 @@ net.sansa-stack sansa-rdf-spark_${scala.binary.version} - ${sansa.version} + ${sansa.rdf.version} net.sansa-stack sansa-owl-spark_${scala.binary.version} - ${sansa.version} + ${sansa.owl.version} @@ -157,13 +158,7 @@ org.scalatest scalatest_${scala.binary.version} - 3.0.3 - test - - - com.holdenkarau - spark-testing-base_${scala.binary.version} - 2.3.0_0.9.0 + 2.2.6 test @@ -173,6 +168,7 @@ scala-logging_${scala.binary.version} 3.5.0 + com.github.scopt @@ -180,11 +176,25 @@ 3.5.0 - - com.google.guava - guava - 19.0 + com.holdenkarau + spark-testing-base_${scala.binary.version} + 2.3.0_0.9.0 + test + + + + org.glassfish.jersey + jersey-bom + 2.26-b03 + pom + import + + + + org.apache.commons + commons-compress + 1.18 @@ -193,11 +203,44 @@ + + + org.apache.maven.plugins + maven-source-plugin + 3.0.1 + + + attach-sources + verify + + jar-no-fork + + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + 2.10.4 + + false + + + + attach-javadocs + + jar + + + + + net.alchim31.maven scala-maven-plugin - 3.2.1 + 3.3.1 @@ -209,13 +252,18 @@ -dependencyfile ${project.build.directory}/.scala_dependencies + -Xmax-classfile-name + 128 + + -Xss2048K + ${scala.version} - incremental + @@ -229,6 +277,48 @@ + + com.amashchenko.maven.plugin + gitflow-maven-plugin + 1.8.0 + + + v + + + + + + org.apache.maven.plugins + maven-gpg-plugin + 1.6 + + + sign-artifacts + verify + + sign + + + AKSW + ${gpg.keyname} + + + + + + + org.sonatype.plugins + nexus-staging-maven-plugin + 1.6.8 + true + + ossrh + https://oss.sonatype.org/ + true + + + org.apache.maven.plugins @@ -263,7 +353,7 @@ 1.0.0 false - false + true true false ${project.basedir}/src/main/scala @@ -287,65 +377,49 @@ - + + + root-dir + + + ${project.basedir}/../../scalastyle-config.xml + + + + ${project.basedir}/../scalastyle-config.xml + + release - - - ossrh - https://oss.sonatype.org/service/local/staging/deploy/maven2/ - - + + + performRelease + true + + + - net.alchim31.maven - scala-maven-plugin - 3.2.2 - - - - compile - testCompile - - - ${scala.version} - incremental - true - - -unchecked - -deprecation - -feature - -dependencyfile - ${project.build.directory}/.scala_dependencies - - - - - - attach-javadocs - - doc-jar - - - - + org.apache.maven.plugins + maven-gpg-plugin - org.apache.maven.plugins - maven-source-plugin - - - verify - attach-sources - - jar - - - + org.sonatype.plugins + nexus-staging-maven-plugin 
+ + + + + doclint-java8-disable + + [1.8,) + + + org.apache.maven.plugins maven-javadoc-plugin @@ -355,55 +429,18 @@ jar - - - - - - org.apache.maven.plugins - maven-gpg-plugin - 1.6 - - - sign-artifacts - verify - - sign - - AKSW - ${gpg.keyname} + false - - - - org.sonatype.plugins - nexus-staging-maven-plugin - 1.6.7 - true - ossrh - https://oss.sonatype.org/ - true + -Xdoclint:none - - - root-dir - - - ${project.basedir}/../../scalastyle-config.xml - - - - ${project.basedir}/../scalastyle-config.xml - - diff --git a/sansa-ml-common/pom.xml b/sansa-ml-common/pom.xml index 1d4ead0..57bae30 100644 --- a/sansa-ml-common/pom.xml +++ b/sansa-ml-common/pom.xml @@ -5,7 +5,7 @@ sansa-ml-parent_2.11 net.sansa-stack - 0.4.0 + 0.5.0 sansa-ml-common_2.11 ML API - Common @@ -18,6 +18,20 @@ scala-library + + + net.sf.extjwnl + extjwnl + 1.9.4 + + + + + net.sf.extjwnl + extjwnl-data-wn31-map + 1.0 + + junit diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/nlp/wordnet/WordNet.scala b/sansa-ml-common/src/main/scala/net/sansa_stack/ml/common/nlp/wordnet/WordNet.scala similarity index 81% rename from sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/nlp/wordnet/WordNet.scala rename to sansa-ml-common/src/main/scala/net/sansa_stack/ml/common/nlp/wordnet/WordNet.scala index 93a4a44..546ff00 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/nlp/wordnet/WordNet.scala +++ b/sansa-ml-common/src/main/scala/net/sansa_stack/ml/common/nlp/wordnet/WordNet.scala @@ -10,14 +10,15 @@ * and ws4j * and nltk project */ -package net.sansa_stack.ml.spark.nlp.wordnet +package net.sansa_stack.ml.common.nlp.wordnet import java.io.Serializable -import scala.collection.JavaConversions._ + +import net.sf.extjwnl.data.{PointerType, PointerUtils, Word} +import net.sf.extjwnl.dictionary.Dictionary +import scala.collection.JavaConverters._ import scala.collection.breakOut import scala.collection.mutable.ArrayBuffer -import net.sf.extjwnl.dictionary.Dictionary -import net.sf.extjwnl.data.{PointerType, PointerUtils, Word} /** * WordNet singleton to initialize WordNet dataset @@ -33,8 +34,11 @@ object WordNet { */ class WordNet extends Serializable { + var maxDepth = 0 + /** * Returns an instance of the WordNet dictionary used in the package + * * @return */ def getDict: Dictionary = WordNet.dict @@ -46,10 +50,9 @@ class WordNet extends Serializable { * @return : List[Synset] */ def getSynsets(lemma: String): List[Synset] = - net.sf.extjwnl.data.POS.getAllPOS + net.sf.extjwnl.data.POS.getAllPOS.asScala .flatMap(pos => getSynsets(lemma, pos))(breakOut) - /** * Returns a Synset given a String * Returns empty list if the lemma did not exist in the WordNet @@ -63,7 +66,8 @@ class WordNet extends Serializable { val indexWord = WordNet.dict.getIndexWord(pos, lemma) var result = List.empty[Synset] if (indexWord != null) { - result = List(indexWord.getSenses()(sid)) + val result_scala = indexWord.getSenses().asScala + result = List(result_scala(sid)) } result } @@ -79,7 +83,7 @@ class WordNet extends Serializable { def getSynsets(lemma: String, pos: POS): List[Synset] = { val iword = WordNet.dict.getIndexWord(pos, lemma) if (iword == null) List.empty[Synset] - else iword.getSenses.toList + else iword.getSenses.asScala.toList } /** @@ -89,7 +93,7 @@ class WordNet extends Serializable { * @return : List[String] */ def lemmaNames(synset: Synset): List[String] = - synset.getWords.map(_.getLemma)(breakOut) + synset.getWords.asScala.map(_.getLemma)(breakOut) /** * Input is a synset @@ -172,7 +176,7 @@ class 
WordNet extends Serializable { * @return : List[Synset] */ def relatedSynsets(synset: Synset, ptr: PointerType): List[Synset] = - synset.getPointers(ptr).map(ptr => ptr.getTarget.asInstanceOf[Synset])(breakOut) + synset.getPointers(ptr).asScala.map(ptr => ptr.getTarget.asInstanceOf[Synset])(breakOut) /** * Returns list of all hypernyms of a synset @@ -180,12 +184,12 @@ class WordNet extends Serializable { * @param synset :Synset * @return : List[Synset] */ - def allHypernyms(synset: Synset): List[List[Synset]] = + def getAllHypernyms(synset: Synset): List[List[Synset]] = PointerUtils .getHypernymTree(synset) .toList - .map(ptnl => ptnl - .map(ptn => ptn.getSynset) + .asScala.map(ptnl => ptnl + .asScala.map(ptn => ptn.getSynset) .toList)(breakOut) /** @@ -195,7 +199,7 @@ class WordNet extends Serializable { * @return : List[Synset] */ def rootHypernyms(synset: Synset): List[Synset] = - allHypernyms(synset) + getAllHypernyms(synset) .map(hp => hp.reverse.head).distinct /** @@ -206,8 +210,8 @@ class WordNet extends Serializable { * @return : List[Synset] */ def lowestCommonHypernym(synset1: Synset, synset2: Synset): List[Synset] = { - val paths1 = allHypernyms(synset1) - val paths2 = allHypernyms(synset2) + val paths1 = getAllHypernyms(synset1) + val paths2 = getAllHypernyms(synset2) lch(paths1, paths2) } @@ -219,7 +223,7 @@ class WordNet extends Serializable { * @return : Integer */ def shortestHypernymPathLength(synset1: Synset, hypernym: Synset): Int = { - val paths1 = allHypernyms(synset1) + val paths1 = getAllHypernyms(synset1) val path = ArrayBuffer[(Synset, Int)]() val matchedPath = paths1.zipWithIndex.filter { case (s, i) => s.contains(hypernym) } @@ -249,15 +253,30 @@ class WordNet extends Serializable { } /** - * Returns the depth of a synset - * Since there can be several paths to root, the minimum lenth is considered + * Returns the length of the shortest hypernym path from this + * synset to the root + * Since there can be several paths to root, the minimum length is considered * * @param synset : Synset * @return : Integer */ - def depth(synset: Synset): Int = { - val lens = allHypernyms(synset) - if (lens.isEmpty) -1 else lens.map(_.size).min - 1 + def minDepth(synset: Synset): Int = { + val lists = getAllHypernyms(synset) + if (lists.isEmpty) -1 else lists.map(_.size).min - 1 + } + + + + /** + * Returns the length of the longest hypernym path from this + * synset to the root + * Since there can be several paths to root, the minimum length is considered + * @param synset : Synset + * @return : Integer + */ + def maxDepth(synset: Synset): Int = { + val lists = getAllHypernyms(synset) + if (lists.isEmpty) -1 else lists.map(_.size).max - 1 } /** @@ -278,6 +297,6 @@ class WordNet extends Serializable { */ def relatedLemmas(word: Word, ptr: PointerType): List[Word] = word.getPointers(ptr) - .map(ptr => ptr.getTarget.asInstanceOf[Word])(breakOut) + .asScala.map(ptr => ptr.getTarget.asInstanceOf[Word])(breakOut) -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/nlp/wordnet/WordNetSimilarity.scala b/sansa-ml-common/src/main/scala/net/sansa_stack/ml/common/nlp/wordnet/WordNetSimilarity.scala similarity index 67% rename from sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/nlp/wordnet/WordNetSimilarity.scala rename to sansa-ml-common/src/main/scala/net/sansa_stack/ml/common/nlp/wordnet/WordNetSimilarity.scala index 047061c..86cc701 100644 --- 
a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/nlp/wordnet/WordNetSimilarity.scala +++ b/sansa-ml-common/src/main/scala/net/sansa_stack/ml/common/nlp/wordnet/WordNetSimilarity.scala @@ -6,29 +6,28 @@ * Inspired from: * WordNet::Similarity of Ted Peterson * and ws4j - * and ntlk project + * and nltk project */ -package net.sansa_stack.ml.spark.nlp.wordnet - +package net.sansa_stack.ml.common.nlp.wordnet object WordNetSimilarity extends WordNet { /** - * Wu & Palmer (1994) method of measuring semantic relatedness based on node counting. - * given two synsets, synset1 and synset2 returns the similarity score - * - * @param synset1 :Synset - * @param synset2 :Synset - * @return score :Double - */ + * Wu & Palmer (1994) method of measuring semantic relatedness based on node counting. + * given two synsets, synset1 and synset2 returns the similarity score + * + * @param synset1 :Synset + * @param synset2 :Synset + * @return score :Double + */ def wupSimilarity(synset1: Synset, synset2: Synset): Double = { val min = 0.0 if (synset1 == null || synset2 == null) throw new IllegalArgumentException("arg 1 or 2 was null...") val lcs = lowestCommonHypernym(synset1, synset2) if (lcs.isEmpty) return min - val depth = this.depth(lcs.head) + val depth = this.maxDepth(lcs.head) val depth1 = shortestHypernymPathLength(synset1, lcs.head) + depth val depth2 = shortestHypernymPathLength(synset2, lcs.head) + depth var score = 0.0 @@ -37,13 +36,13 @@ object WordNetSimilarity extends WordNet { } /** - * Returns the distance similarity of two synsets using the shortest path linking the two synsets (if - * one exists) - * - * @param synset1 : Synset - * @param synset2 : Synset - * @return : Double - */ + * Returns the distance similarity of two synsets using the shortest path linking the two synsets (if + * one exists) + * + * @param synset1 : Synset + * @param synset2 : Synset + * @return : Double + */ def shortestPathSim(synset1: Synset, synset2: Synset): Double = { if (synset1 == null || synset2 == null) throw new IllegalArgumentException("arg 1 or 2 was null...") @@ -56,5 +55,4 @@ object WordNetSimilarity extends WordNet { else score = 1.toDouble / distance score } - - } +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/nlp/wordnet/package.scala b/sansa-ml-common/src/main/scala/net/sansa_stack/ml/common/nlp/wordnet/package.scala similarity index 89% rename from sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/nlp/wordnet/package.scala rename to sansa-ml-common/src/main/scala/net/sansa_stack/ml/common/nlp/wordnet/package.scala index a1440d1..a567761 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/nlp/wordnet/package.scala +++ b/sansa-ml-common/src/main/scala/net/sansa_stack/ml/common/nlp/wordnet/package.scala @@ -1,6 +1,7 @@ -package net.sansa_stack.ml.spark.nlp +package net.sansa_stack.ml.common.nlp import java.io.Serializable + import net.sf.extjwnl.data.POS package object wordnet extends Serializable { @@ -13,4 +14,3 @@ package object wordnet extends Serializable { val Adjective = POS.ADJECTIVE val Adjverb = POS.ADVERB } - diff --git a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/nlp/wordnet/DistanceWordNetSimilarityMeasureTests.scala b/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/DistanceWordNetSimilarityMeasureTests.scala similarity index 80% rename from sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/nlp/wordnet/DistanceWordNetSimilarityMeasureTests.scala rename to 
sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/DistanceWordNetSimilarityMeasureTests.scala index a2f4058..80abc36 100644 --- a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/nlp/wordnet/DistanceWordNetSimilarityMeasureTests.scala +++ b/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/DistanceWordNetSimilarityMeasureTests.scala @@ -1,10 +1,9 @@ -package net.sansa_stack.ml.spark.nlp.wordnet +package net.sansa_stack.ml.common.nlp.wordnet -import com.holdenkarau.spark.testing.DataFrameSuiteBase -import org.scalatest.FunSuite import net.sf.extjwnl.data._ +import org.scalatest.FunSuite -class DistanceWordNetSimilarityMeasureTests extends FunSuite with DataFrameSuiteBase { +class DistanceWordNetSimilarityMeasureTests extends FunSuite { test("shortest path similarity between dog and cat synset should result in value 0.3") { try { diff --git a/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/TestAllHypernims.scala b/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/TestAllHypernims.scala new file mode 100644 index 0000000..1531ced --- /dev/null +++ b/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/TestAllHypernims.scala @@ -0,0 +1,27 @@ +package net.sansa_stack.ml.common.nlp.wordnet + +import net.sf.extjwnl.data._ +import org.scalatest.FunSuite + +class TestAllHypernims extends FunSuite { + + test("Tests getting all hypernyms of the the first synset in the word cat") { + try { + val wn = new WordNet + val dict = wn.getDict + // getting a synset by a word and index + + val cat = wn.getSynset("cat", POS.NOUN, 1).head + + val getAllHypers = wn.getAllHypernyms(cat) + + assert(getAllHypers != null) + } + catch { + case e: ExceptionInInitializerError => println("The WordNet dictionary is not installed, please check the readme for instructions to enable it.") + } + + } +} + + diff --git a/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/TestGetMaxDepth.scala b/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/TestGetMaxDepth.scala new file mode 100644 index 0000000..6e67049 --- /dev/null +++ b/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/TestGetMaxDepth.scala @@ -0,0 +1,25 @@ +package net.sansa_stack.ml.common.nlp.wordnet + +import net.sf.extjwnl.data._ +import org.scalatest.FunSuite + +class TestGetMaxDepth extends FunSuite { + + test("Test the function that gets the maximum depth of dataset graph ") { + + try { + val wn = new WordNet + val dict = wn.getDict + + val thing1 = wn.getSynset("thing", POS.NOUN, 1).head + val dog = wn.getSynset("dog", POS.NOUN, 1).head + + + val dogD = wn.maxDepth(dog) + val dogD2 = wn.minDepth(dog) + assert(dogD != 0) + } catch { + case e: ExceptionInInitializerError => println("The WordNet dictionary is not installed, please check the readme for instructions to enable it.") + } + } +} diff --git a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/nlp/wordnet/TestGetSynsets.scala b/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/TestGetSynsets.scala similarity index 73% rename from sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/nlp/wordnet/TestGetSynsets.scala rename to sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/TestGetSynsets.scala index c5d5eb5..607d585 100644 --- a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/nlp/wordnet/TestGetSynsets.scala +++ 
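For orientation, a minimal usage sketch of the WordNet helpers exercised by the new tests above. This is illustrative only (the object name is made up, not part of the patch) and assumes the extjwnl and extjwnl-data-wn31-map artifacts declared in sansa-ml-common/pom.xml are on the classpath so the dictionary can be initialised:

import net.sansa_stack.ml.common.nlp.wordnet.{WordNet, WordNetSimilarity}
import net.sf.extjwnl.data.POS

object WordNetUsageSketch {
  def main(args: Array[String]): Unit = {
    val wn = new WordNet

    // first noun synset for each lemma, looked up the same way as in the tests above
    val dog = wn.getSynset("dog", POS.NOUN, 1).head
    val cat = wn.getSynset("cat", POS.NOUN, 1).head

    // hypernym utilities renamed/added in this patch
    val paths = wn.getAllHypernyms(dog)   // all hypernym paths up to the root
    val minD  = wn.minDepth(dog)          // length of the shortest hypernym path
    val maxD  = wn.maxDepth(dog)          // length of the longest hypernym path

    // similarity measures defined on top of the hypernym structure
    val wup     = WordNetSimilarity.wupSimilarity(dog, cat)
    val pathSim = WordNetSimilarity.shortestPathSim(dog, cat)

    println(s"paths=${paths.size} minDepth=$minD maxDepth=$maxD wup=$wup path=$pathSim")
  }
}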
b/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/TestGetSynsets.scala @@ -1,10 +1,9 @@ -package net.sansa_stack.ml.spark.nlp.wordnet +package net.sansa_stack.ml.common.nlp.wordnet -import com.holdenkarau.spark.testing.DataFrameSuiteBase -import org.scalatest.FunSuite import net.sf.extjwnl.data._ +import org.scalatest.FunSuite -class TestGetSynsets extends FunSuite with DataFrameSuiteBase { +class TestGetSynsets extends FunSuite { test("If The WordNet dictionary is correctly installed synsets must not be null ") { @@ -13,10 +12,9 @@ class TestGetSynsets extends FunSuite with DataFrameSuiteBase { val dict = wn.getDict // getting a synset by a word and index - val thing1 = wn.getSynset("thing", POS.NOUN, 1) + val thing1 = wn.getSynset("thing", POS.NOUN, 1).head // getting a list of synsets by a word - val thing2 = wn.getSynsets("thing", POS.NOUN).head assert(thing1 != null) diff --git a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/nlp/wordnet/WUPWordNetSimilarityMeasuresTests.scala b/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/WUPWordNetSimilarityMeasuresTests.scala similarity index 70% rename from sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/nlp/wordnet/WUPWordNetSimilarityMeasuresTests.scala rename to sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/WUPWordNetSimilarityMeasuresTests.scala index fa7913d..92a63e2 100644 --- a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/nlp/wordnet/WUPWordNetSimilarityMeasuresTests.scala +++ b/sansa-ml-common/src/test/scala/net/sansa_stack/ml/common/nlp/wordnet/WUPWordNetSimilarityMeasuresTests.scala @@ -1,13 +1,12 @@ -package net.sansa_stack.ml.spark.nlp.wordnet +package net.sansa_stack.ml.common.nlp.wordnet -import com.holdenkarau.spark.testing.DataFrameSuiteBase -import org.scalatest.FunSuite -//import net.didion.jwnl.data._ import net.sf.extjwnl.data._ +import org.scalatest.FunSuite +// import net.didion.jwnl.data._ -class WUPWordNetSimilarityMeasuresTests extends FunSuite with DataFrameSuiteBase { +class WUPWordNetSimilarityMeasuresTests extends FunSuite { - test("wwup similarity between dog and cat synset should result in value 0.3") { + test(" WUP similarity between dog and cat synset should result in value 0.3") { try { val wn = new WordNet val dict = wn.getDict diff --git a/sansa-ml-flink/pom.xml b/sansa-ml-flink/pom.xml index 9a4c33e..5b245b4 100644 --- a/sansa-ml-flink/pom.xml +++ b/sansa-ml-flink/pom.xml @@ -5,7 +5,7 @@ sansa-ml-parent_2.11 net.sansa-stack - 0.4.0 + 0.5.0 sansa-ml-flink_2.11 ML API - Apache Flink diff --git a/sansa-ml-spark/pom.xml b/sansa-ml-spark/pom.xml index 662236f..55cade1 100644 --- a/sansa-ml-spark/pom.xml +++ b/sansa-ml-spark/pom.xml @@ -5,7 +5,7 @@ sansa-ml-parent_2.11 net.sansa-stack - 0.4.0 + 0.5.0 sansa-ml-spark_2.11 ML API - Apache Spark @@ -13,6 +13,7 @@ 2.8.3 + 1.1.3 @@ -29,7 +30,6 @@ net.sansa-stack sansa-rdf-spark_${scala.binary.version} - net.sansa-stack @@ -70,9 +70,9 @@ - net.jpountz.lz4 - lz4 - 1.3.0 + net.jpountz.lz4 + lz4 + 1.3.0 @@ -105,37 +105,59 @@ pom ${jena.version} + + - com.github.scopt - scopt_${scala.binary.version} - 3.5.0 + com.intel.analytics.bigdl + bigdl-SPARK_2.2 + 0.3.0 - - net.sf.extjwnl - extjwnl - 1.9.4 + org.json + json + + + + + com.github.haifengl + smile-core + 1.5.0 + + + com.github.haifengl + smile-netlib + 1.5.0 + + + org.json4s + json4s-native_${scala.binary.version} + 3.6.2 + - + - net.sf.extjwnl - extjwnl-data-wn31-map - 1.0 + org.datasyslab + geospark + ${geospark.version} + 
provided - + - com.intel.analytics.bigdl - bigdl-SPARK_2.2 - 0.3.0 + com.vividsolutions + jts + 1.13 + + - org.json - json + org.datasyslab + geospark-sql_2.3 + ${geospark.version} @@ -167,12 +189,6 @@ com.github.scopt scopt_${scala.binary.version} - - - org.springframework - spring - 2.5.6.SEC03 - @@ -186,6 +202,19 @@ scala-maven-plugin + + org.apache.maven.plugins + maven-shade-plugin + 3.0.0 + + + package + + shade + + + + maven-compiler-plugin @@ -431,6 +460,13 @@ true + + + dbscan-on-spark-repo + Repo for DBSCAN on Spark + http://dl.bintray.com/irvingc/maven + + maven.aksw.internal AKSW Release Repository diff --git a/sansa-ml-spark/src/main/resources/application.properties b/sansa-ml-spark/src/main/resources/application.properties new file mode 100644 index 0000000..eed2c90 --- /dev/null +++ b/sansa-ml-spark/src/main/resources/application.properties @@ -0,0 +1,54 @@ +# spark configuration +sansa.spark.master=local[*] +sansa.spark.serializer=org.apache.spark.serializer.KryoSerializer +sansa.spark.executor.memory=15g +sansa.spark.driver.memory=15g +sansa.spark.driver.maxResultSize=15g +sansa.spark.app.name=SANSA_Clustering + +# clusterig profile +sansa.clustering.profile=results/profile.txt + +# pic clustering configuration +sansa.clustering.pic.result=results/pic_clusters.json +sansa.clustering.pic.matrix=results/pic_matrix.json +sansa.clustering.pic.number_clusters=10 +sansa.clustering.pic.iterations=5 + +# ont hot km clustering configuration +sansa.clustering.km.onehot.result=results/oneHot_kmeans_clusters.json +sansa.clustering.km.onehot.matrix=results/oneHotMatrix.json +sansa.clustering.km.onehot.number_clusters=10 +sansa.clustering.km.onehot.iterations=5 + +# mds km clustering configuration +sansa.clustering.km.mds.result=results/mds_kmeans_clusters.json +sansa.clustering.km.mds.matrix=results/mds_coordinates.json +sansa.clustering.km.mds.dimension=2 +sansa.clustering.km.mds.number_clusters=10 +sansa.clustering.km.mds.iterations=5 + +# word2vec km clustering configuration +sansa.clustering.km.word2vec.result=results/word2vec_kmeans_clusters.json +sansa.clustering.km.word2vec.matrix=results/word2Vec.json +sansa.clustering.km.word2vec.number_clusters=10 +sansa.clustering.km.word2vec.iterations=5 + +# dataset configuration +#sansa.data.input=data/merged_tomtom_yelp/ +#sansa.data.input=data/tomtom_pois_austria_v0.3.nt +sansa.data.input=src/main/resources/Cluster/input.nt +sansa.data.termValueUri=http://example.org/def#termValue +sansa.data.termPrefix=http://example.org/id/term/ +sansa.data.typePOI=http://example.org/def#POI +sansa.data.coordinatesPredicate=http://www.opengis.net/ont/geosparql#asWKT +sansa.data.categoryPOI=http://example.org/def#category +sansa.data.poiPrefix=http://example.org/id/poi/ + + +# sansa and yelp file merge +sansa.merge.input=src/main/resources/Cluster/input.nt +#yelp.sansa.merged_file=data/tomtom_yelp.nt +yelp.data.input=src/main/resources/Cluster/categories.nt +yelp.data.categoryPOI=http://example.org/hasYelpCategory +yelp.data.rating=http://example.org/hasRating diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/ClassMembership.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/ClassMembership.scala index 48c57f9..caf62cf 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/ClassMembership.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/ClassMembership.scala @@ -1,25 +1,20 @@ package net.sansa_stack.ml.spark.classification import 
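The clustering configuration file added above is consumed at runtime, but the loader itself is not part of this diff; the following is only one plausible way to read the sansa.spark.* and sansa.data.* keys (the object name and the exact wiring are assumptions, not the project's own code):

import java.util.Properties
import org.apache.spark.sql.SparkSession

object ClusteringConfigSketch {
  def main(args: Array[String]): Unit = {
    // load src/main/resources/application.properties from the classpath
    val props = new Properties()
    val in = getClass.getResourceAsStream("/application.properties")
    require(in != null, "application.properties not found on the classpath")
    try props.load(in) finally in.close()

    // wire the sansa.spark.* keys into a SparkSession
    val spark = SparkSession.builder()
      .master(props.getProperty("sansa.spark.master", "local[*]"))
      .appName(props.getProperty("sansa.spark.app.name", "SANSA_Clustering"))
      .config("spark.serializer", props.getProperty("sansa.spark.serializer"))
      .getOrCreate()

    println("clustering input: " + props.getProperty("sansa.data.input"))
    spark.stop()
  }
}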
java.io.PrintStream -import java.util.ArrayList -import java.util.HashSet -import java.util.Set -import scala.util.Random -import collection.JavaConverters._ +import java.util.{ ArrayList, HashSet, Set } + import scala.collection +import scala.util.Random -import org.semanticweb.owlapi.model.OWLClassExpression -import org.semanticweb.owlapi.model.OWLIndividual -import org.semanticweb.owlapi.model.OWLNamedIndividual +import collection.JavaConverters._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SparkSession +import org.semanticweb.owlapi.model.{ OWLClassExpression, OWLIndividual, OWLNamedIndividual } import net.sansa_stack.ml.spark.classification -import net.sansa_stack.ml.spark.classification.TDTInducer.TDTInducer -import net.sansa_stack.ml.spark.classification.KB.KB import net.sansa_stack.ml.spark.classification.ConceptsGenerator._ - -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.SparkSession -import org.apache.spark.{ SparkConf, SparkContext } +import net.sansa_stack.ml.spark.classification.KB.KB +import net.sansa_stack.ml.spark.classification.TDTInducer.TDTInducer object ClassMembership { @@ -51,11 +46,11 @@ object ClassMembership { println() println(nFolds + "-fold BOOTSTRAP Experiment on ontology: ") - //val classifierClass: Class[_] = ClassLoader.getSystemClassLoader.loadClass(className) + // val classifierClass: Class[_] = ClassLoader.getSystemClassLoader.loadClass(className) val nOfConcepts: Int = if (testConcepts != null) testConcepts.size else 1 - //var Generator: Random = new Random() - //val ntestExs: Array[Int] = Array.ofDim[Int](nFolds) + // var Generator: Random = new Random() + // val ntestExs: Array[Int] = Array.ofDim[Int](nFolds) // main loop on the folds for (f <- 0 until nFolds) { @@ -71,10 +66,10 @@ object ClassMembership { testRDD.foreach(println(_)) val classifier: TDTInducer = new TDTInducer(k, nOfConcepts, spark) - //val classifier: TDTInducer = new TDTInducer(k, kb.Concepts.count().toInt, spark) - /*val cl: TDTInducer = (classifierClass.getConstructor(classOf[KB], classOf[Int])) - .newInstance(kb, nOfConcepts).asInstanceOf[TDTInducer]*/ - //ntestExs(f) = testRDD.count.toInt + // val classifier: TDTInducer = new TDTInducer(k, kb.Concepts.count().toInt, spark) + /* val cl: TDTInducer = (classifierClass.getConstructor(classOf[KB], classOf[Int])) + .newInstance(kb, nOfConcepts).asInstanceOf[TDTInducer] */ + // ntestExs(f) = testRDD.count.toInt // training phase: using all examples but only those in the f-th partition println("\nTraining is starting...") @@ -85,44 +80,6 @@ object ClassMembership { val labels: Array[Array[Int]] = classifier.test(f, testRDD, testConcepts) } // for loop - } //bootstrap function - } //class + } // bootstrap function + } // class } - - - -// for (i<- 0 until allExamples.count.toInt) -// trainRDD.add(allExamples.takeSample(true, 1)(0)) - -// val trainingExsSet: Set[Integer] = new HashSet[Integer]() -// var trainRDD = spark.sparkContext.parallelize(trainingExsSet.asScala.toSeq) -// -// val testingExsSet: Set[Integer] = new HashSet[Integer]() -// var testRDD = spark.sparkContext.parallelize(testingExsSet.asScala.toSeq) -// -// var rand1 = new ArrayList[Integer] -// for (r <- 0 until allExamples.count.toInt) -// rand1.add(Generator.nextInt(allExamples.count.toInt)) -// -// var newRDD = spark.sparkContext.parallelize(rand1.asScala) -// trainRDD.union(newRDD) -// //trainingExsSet.add(Generator.nextInt(allExamples.count.toInt)) -// -// var r = 0 to allExamples.count.toInt -// var rand2 = 
spark.sparkContext.parallelize(r) -// -// if (!trainRDD.collect().contains(rand2)) -// testRDD.union(rand2.asInstanceOf[RDD[Integer]]) - - /*for (r <- 0 until allExamples.count.toInt){ - if (!trainRDD.collect().contains(r)) - testRDD.union(r) - }*/ - - - /*var trainingExs: Array[Integer] = Array.ofDim[Integer](0) - var testExs: Array[Integer] = Array.ofDim[Integer](0) - - trainingExs = trainingExsSet.toArray(trainingExs) - testExs = testingExsSet.toArray(testExs)*/ - \ No newline at end of file diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/ConceptsGenerator.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/ConceptsGenerator.scala index 1acdc06..b3d36ea 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/ConceptsGenerator.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/ConceptsGenerator.scala @@ -1,25 +1,23 @@ package net.sansa_stack.ml.spark.classification import java.util.HashSet + import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession -import org.apache.spark.{SparkConf, SparkContext} import org.semanticweb.HermiT.Reasoner -import org.semanticweb.owlapi.model.OWLClassExpression -import org.semanticweb.owlapi.model.OWLDataFactory -import org.semanticweb.owlapi.model.OWLIndividual -import org.semanticweb.owlapi.model.OWLNamedIndividual +import org.semanticweb.owlapi.model.{ OWLClassExpression, OWLDataFactory, OWLIndividual, OWLNamedIndividual } + import net.sansa_stack.ml.spark.classification.KB.KB -object ConceptsGenerator{ +object ConceptsGenerator { class ConceptsGenerator(protected var kb: KB) { protected var reasoner: Reasoner = kb.getReasoner protected var dataFactory: OWLDataFactory = kb.getDataFactory protected var allExamples: RDD[OWLIndividual] = kb.getIndividuals - + def generateQueryConcepts(numConceptsToGenerate: Int, sc: SparkSession): Array[OWLClassExpression] = { - + println("\nConcepts Generation\n-----------\n") val queryConcept: Array[OWLClassExpression] = Array.ofDim[OWLClassExpression](numConceptsToGenerate) val minOfSubConcepts: Int = 2 @@ -29,7 +27,7 @@ object ConceptsGenerator{ var j: Int = 0 var nextConcept: OWLClassExpression = null var complPartialConcept: OWLClassExpression = null - var nEx : Int = allExamples.count().toInt + var nEx: Int = allExamples.count().toInt // cycle to build numConceptsToGenerate new query concepts i = 0 while (i < numConceptsToGenerate) { @@ -37,56 +35,50 @@ object ConceptsGenerator{ numOfSubConcepts = minOfSubConcepts + KB.generator.nextInt(maxOfSubConcepts - minOfSubConcepts) var numPosInst: Int = 0 var numNegInst: Int = 0 - + // build a single new query OWLClassExpression adding conjuncts or disjuncts do { - - //take the first subConcept for builiding the query OWLClassExpression + + // take the first subConcept for builiding the query OWLClassExpression partialConcept = kb.getRandomConcept - //println("partial concept" + partialConcept) + // println("partial concept" + partialConcept) j = 1 - + while (j < numOfSubConcepts) { val newConcepts: HashSet[OWLClassExpression] = new HashSet[OWLClassExpression]() newConcepts.add(partialConcept) - + nextConcept = kb.getRandomConcept newConcepts.add(nextConcept) - + partialConcept = - if (KB.generator.nextInt(4) == 0) + if (KB.generator.nextInt(4) == 0) { dataFactory.getOWLObjectIntersectionOf(newConcepts) - else dataFactory.getOWLObjectUnionOf(newConcepts) - j+=1 + } else dataFactory.getOWLObjectUnionOf(newConcepts) + j += 1 } // for j - + 
println() complPartialConcept = dataFactory.getOWLObjectComplementOf(partialConcept) - //println("\n", complPartialConcept) + // println("\n", complPartialConcept) numPosInst = reasoner.getInstances(partialConcept, false).entities().count().toInt numNegInst = reasoner.getInstances(complPartialConcept, false).entities().count().toInt - + println(partialConcept) - println ("\n pos: " + numPosInst + ", neg: " + numNegInst + ", und: " + (nEx - numNegInst - numPosInst)) + println("\n pos: " + numPosInst + ", neg: " + numNegInst + ", und: " + (nEx - numNegInst - numPosInst)) println() - } while ((numPosInst < 20) || (numNegInst >3)) - // ((numPosInst < 10) || (numNegInst > 10)) - // (numPosInst * numNegInst == 0) - //add the newly built OWLClassExpression to the list of all required query concepts + } while ((numPosInst < 20) || (numNegInst > 3)) + // ((numPosInst < 10) || (numNegInst > 10)) + // (numPosInst * numNegInst == 0) + // add the newly built OWLClassExpression to the list of all required query concepts queryConcept(i) = partialConcept - println("Query " + (i+1) + " found\n\n") - i+=1 + println("Query " + (i + 1) + " found\n\n") + i += 1 } - - queryConcept + + queryConcept } - + } } - - - /*println("pos:%d (%3.1f)\t\t neg:%d (%3.1f)\t\t und:%d (%3.1f)\n " + numPosInst + numPosInst * 100.0 / nExs, - numNegInst, numNegInst * 100.0 / nExs, - (nExs - numNegInst - numPosInst), - (nExs - numNegInst - numPosInst) * 100.0 / nExs)*/ diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/DLTree.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/DLTree.scala index 47b6338..1144421 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/DLTree.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/DLTree.scala @@ -1,15 +1,14 @@ package net.sansa_stack.ml.spark.classification -import java.util.ArrayList -import java.util.List +import java.util.{ArrayList, List} import collection.JavaConverters._ - -import net.sansa_stack.ml.spark.classification._ -import org.semanticweb.owlapi.model.OWLClassExpression -import org.apache.spark.{ SparkConf, SparkContext } import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession +import org.semanticweb.owlapi.model.OWLClassExpression + +import net.sansa_stack.ml.spark.classification._ + /* * Class for basic functions of DL trees @@ -99,8 +98,7 @@ class DLTree { * function to get the number of nodes */ - /* - def getNodi(sc: SparkSession): Double = { + /* def getNodi(sc: SparkSession): Double = { // visit in to make the count val lista: ArrayList[DLNode] = new ArrayList[DLNode]() @@ -145,10 +143,6 @@ class DLTree { } num } - - - - def getComplexityMeasure(sc: SparkSession) : Double = getNodi(sc)*/ + def getComplexityMeasure(sc: SparkSession) : Double = getNodi(sc) */ } - diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/KB.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/KB.scala index 4cdd1d9..c2ee415 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/KB.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/KB.scala @@ -5,29 +5,29 @@ import java.net.URI import java.util.{ ArrayList, List, Random } import java.util.stream.{ Collectors, IntStream, Stream } -import scala.collection.JavaConversions._ -import collection.JavaConverters._ import scala.collection.{ Iterator, Map } +import scala.collection.JavaConverters._ import 
scala.collection.immutable.{ HashMap, Set } +import collection.JavaConverters._ +import net.sansa_stack.owl.spark.rdd.OWLAxiomsRDD +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SparkSession +import org.semanticweb.HermiT.{ Configuration, Reasoner, ReasonerFactory } import org.semanticweb.owlapi.apibinding.OWLManager import org.semanticweb.owlapi.model._ -import org.semanticweb.owlapi.util.SimpleIRIMapper import org.semanticweb.owlapi.reasoner.{ OWLReasoner, OWLReasonerFactory } import org.semanticweb.owlapi.reasoner.structural.StructuralReasonerFactory +import org.semanticweb.owlapi.util.SimpleIRIMapper -import org.semanticweb.HermiT.{ Configuration, Reasoner, ReasonerFactory } -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.SparkSession -import net.sansa_stack.owl.spark.rdd.OWLAxiomsRDD object KB { val d: Double = 0.3 var generator: Random = new Random(2) - /* - * The class to define the Knowledgebase elements - */ + /** + * The class to define the Knowledgebase elements + */ class KB(var UrlOwlFile: String, rdd: OWLAxiomsRDD, sparkSession: SparkSession) { @@ -82,7 +82,7 @@ object KB { val Concepts2: RDD[OWLClass] = rdd.flatMap { case axiom: HasClassesInSignature => axiom.classesInSignature().iterator().asScala - case _ => null + case _ => null }.filter(_ != null).distinct() Concepts = Concepts2 @@ -96,7 +96,7 @@ object KB { val Roles2: RDD[OWLObjectProperty] = rdd.map { case axiom: HasProperty[OWLObjectProperty] => axiom.getProperty - case _ => null + case _ => null }.filter(_ != null).distinct() Roles = Roles2 @@ -110,7 +110,7 @@ object KB { val Properties2: RDD[OWLDataProperty] = rdd.flatMap { case axiom: HasDataPropertiesInSignature => axiom.dataPropertiesInSignature().iterator().asScala - case _ => null + case _ => null }.filter(_ != null).distinct() Properties = Properties2 @@ -124,7 +124,7 @@ object KB { val Examples2: RDD[OWLNamedIndividual] = rdd.flatMap { case axiom: HasIndividualsInSignature => axiom.individualsInSignature().collect(Collectors.toSet()).asScala - case _ => null + case _ => null }.filter(_ != null).distinct() Examples = Examples2.asInstanceOf[RDD[OWLIndividual]] @@ -163,10 +163,12 @@ object KB { p = p + 1 } else { if (!flag) { - if (r.isEntailed(getDataFactory.getOWLClassAssertionAxiom(negTestConcepts(c), ind))) + if (r.isEntailed(getDataFactory.getOWLClassAssertionAxiom(negTestConcepts(c), ind))) { classifications(c)(e) = -1 - } else + } + } else { classifications(c)(e) = -1 + } n = n + 1 } @@ -231,7 +233,7 @@ object KB { def getReasoner(): Reasoner = hermit - //def getURL(): String = urlOwlFile + // def getURL(): String = urlOwlFile def getRandomProperty(numQueryProperty: Int): Array[Int] = { @@ -277,11 +279,12 @@ object KB { val role: OWLObjectProperty = Roles.takeSample(true, 1)(0) newConcept = - if (KB.generator.nextDouble() < 0.5) + if (KB.generator.nextDouble() < 0.5) { dataFactory.getOWLObjectAllValuesFrom(role, newConceptBase) - else dataFactory.getOWLObjectSomeValuesFrom(role, newConceptBase) - } else + } else dataFactory.getOWLObjectSomeValuesFrom(role, newConceptBase) + } else { newConcept = dataFactory.getOWLObjectComplementOf(newConceptBase) + } } } } while (!reasoner.isSatisfiable(newConcept)) @@ -301,12 +304,13 @@ object KB { val role: OWLObjectProperty = Roles.takeSample(true, 1)(0) newConcept = - if (KB.generator.nextDouble() < d) + if (KB.generator.nextDouble() < d) { dataFactory.getOWLObjectAllValuesFrom(role, newConceptBase) - else dataFactory.getOWLObjectSomeValuesFrom(role, newConceptBase) + } else 
dataFactory.getOWLObjectSomeValuesFrom(role, newConceptBase) } - } else + } else { newConcept = dataFactory.getOWLObjectComplementOf(newConcept) + } } while (!reasoner.isSatisfiable(newConcept)) diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/PerformanceMetrics.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/PerformanceMetrics.scala index 9226568..dd9387f 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/PerformanceMetrics.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/PerformanceMetrics.scala @@ -1,5 +1,4 @@ package net.sansa_stack.ml.spark.classification object PerformanceMetrics { - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/RefinementOperator.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/RefinementOperator.scala index ea1b1bd..2407956 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/RefinementOperator.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/RefinementOperator.scala @@ -6,11 +6,12 @@ import java.util.stream.{ Collectors, Stream } import scala.collection.JavaConverters._ import scala.util.Random +import org.apache.spark.rdd.RDD import org.semanticweb.owlapi.model._ import org.semanticweb.owlapi.search.EntitySearcher + import net.sansa_stack.ml.spark.classification._ import net.sansa_stack.ml.spark.classification.KB.KB -import org.apache.spark.rdd.RDD object RefinementOperator { val d: Double = 0.5 @@ -25,9 +26,9 @@ class RefinementOperator(var kb: KB) { private var Properties: RDD[OWLDataProperty] = kb.getDataProperties private var dataFactory: OWLDataFactory = kb.getDataFactory - /* - * Function to generate subsumed random concepts - */ + /** + * Function to generate subsumed random concepts + */ def getSubsumedRandomConcept(currentConcept: OWLClassExpression): OWLClassExpression = { val generator: Random = new Random() diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TDTClassifiers.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TDTClassifiers.scala index 136e1af..9d5722f 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TDTClassifiers.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TDTClassifiers.scala @@ -1,28 +1,24 @@ package net.sansa_stack.ml.spark.classification -import java.util.ArrayList -import java.util.HashSet -import java.util.Iterator -import java.util.List -import collection.JavaConverters._ +import java.util.{ ArrayList, HashSet, Iterator, List } + import scala.util.control.Breaks._ +import collection.JavaConverters._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SparkSession import org.semanticweb.owlapi.model.OWLClassExpression import org.semanticweb.owlapi.model.OWLDataFactory +import org.semanticweb.owlapi.model.OWLEquivalentClassesAxiom import org.semanticweb.owlapi.model.OWLIndividual -import org.semanticweb.owlapi.model.OWLObjectProperty import org.semanticweb.owlapi.model.OWLObjectAllValuesFrom -import org.semanticweb.owlapi.model.OWLObjectSomeValuesFrom import org.semanticweb.owlapi.model.OWLObjectIntersectionOf -import org.semanticweb.owlapi.model.OWLEquivalentClassesAxiom -//import org.semanticweb.owlapi.model.IRI +import org.semanticweb.owlapi.model.OWLObjectProperty +import 
org.semanticweb.owlapi.model.OWLObjectSomeValuesFrom -import net.sansa_stack.ml.spark.classification -import net.sansa_stack.ml.spark.classification.KB.KB import net.sansa_stack.ml.spark.classification._ +import net.sansa_stack.ml.spark.classification.KB.KB -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.SparkSession /* * Terminological Decision Tree Classifier @@ -30,7 +26,8 @@ import org.apache.spark.sql.SparkSession object TDTClassifiers { - /* L for the left branch and R for the right branch + /** + * L for the left branch and R for the right branch * P, N, U for postive, negative and unlabeled respectively */ @@ -40,7 +37,7 @@ object TDTClassifiers { val PR: Int = 3 val NR: Int = 4 val UR: Int = 5 - + class TDTClassifiers(var k: KB, var sc: SparkSession) { /** @@ -57,12 +54,13 @@ object TDTClassifiers { * @return */ - def induceDLTree(father: OWLClassExpression, - posExs: RDD[String], negExs: RDD[String], undExs: RDD[String], - nRefs: Int, prPos: Double, prNeg: Double): DLTree = { + def induceDLTree( + father: OWLClassExpression, + posExs: RDD[String], negExs: RDD[String], undExs: RDD[String], + nRefs: Int, prPos: Double, prNeg: Double): DLTree = { val THRESHOLD: Double = 0.05 - val tree: DLTree = new DLTree() + val tree: DLTree = new DLTree() if (posExs.count.toInt == 0 && negExs.count.toInt == 0) // There is no examples if (prPos >= prNeg) { // prior majority of positives @@ -80,12 +78,10 @@ object TDTClassifiers { val total = numPos + numNeg var perPos: Double = 0 var perNeg: Double = 0 - if (total !=0){ + if (total != 0) { perPos = numPos / total perNeg = numNeg / total - } - else - return tree + } else return tree println("\nnew per Pos: " + perPos) println("new per Neg: " + perNeg) @@ -94,14 +90,13 @@ object TDTClassifiers { tree.setRoot(k.getDataFactory().getOWLThing) // set positive leaf println("-----\nPostive leaf (prior2)") return tree - } - else if (perPos == 0 && perNeg > THRESHOLD) { // no positive + } else if (perPos == 0 && perNeg > THRESHOLD) { // no positive tree.setRoot(k.getDataFactory().getOWLNothing); // set negative leaf println("-----\nNegative leaf (prior2)\n"); return tree } - // else (a non-leaf node) ... 
+ // else (a non-leaf node) // generate set of concepts val Con: RDD[OWLClassExpression] = generateRefs(k, father, nRefs, posExs, negExs) @@ -109,56 +104,53 @@ object TDTClassifiers { // select best partitioning node concept val bestConcept: OWLClassExpression = selectBestConcept(k, Con, posExs, negExs, undExs, prPos, prNeg) - - if (bestConcept != null){ - - val sNode = split(k, bestConcept, posExs, negExs, undExs) - - // set the root concept - tree.setRoot(bestConcept.getNNF) - - // sNode._1._1 = PosEL, sNode._2._1 = NegEL, sNode._3._1 = undEL - // sNode._1._2 = PosER, sNode._2._2 = NegER, sNode._3._2 = undER - - - // build subtrees - + + if (bestConcept != null) { + + val sNode = split(k, bestConcept, posExs, negExs, undExs) + + // set the root concept + tree.setRoot(bestConcept.getNNF) + + // sNode._1._1 = PosEL, sNode._2._1 = NegEL, sNode._3._1 = undEL + // sNode._1._2 = PosER, sNode._2._2 = NegER, sNode._3._2 = undER + + // build subtrees + println("\nStart Positive tree \n----------") tree.setPosTree(induceDLTree(bestConcept, sNode._1._1, sNode._2._1, sNode._3._1, nRefs, prPos, prNeg)) - + println("\nStart Negative tree \n----------") tree.setNegTree(induceDLTree(bestConcept.getComplementNNF, sNode._1._2, sNode._2._2, sNode._3._2, nRefs, prPos, prNeg)) - return tree - } - else - return null + } else return null } - + /** * recursive down through the tree model * @param ind * @param tree * @return */ - def classify(ind: OWLIndividual, tree: DLTree): Int = { + def classify(ind: OWLIndividual, tree: DLTree): Int = { val rootClass: OWLClassExpression = tree.getRoot println("\nrootClass " + rootClass) - + val negRootClass: OWLClassExpression = k.getDataFactory.getOWLObjectComplementOf(rootClass) println("negRootClass " + negRootClass) - + if (rootClass.equals(k.getDataFactory.getOWLThing)) return +1 if (rootClass.equals(k.getDataFactory.getOWLNothing)) return -1 var r1: Int = 0 var r2: Int = 0 - if (k.getReasoner.isEntailed(k.getDataFactory.getOWLClassAssertionAxiom(rootClass, ind))) + if (k.getReasoner.isEntailed(k.getDataFactory.getOWLClassAssertionAxiom(rootClass, ind))) { r1 = classify(ind, tree.getPosSubTree) - else if (k.getReasoner.isEntailed(k.getDataFactory.getOWLClassAssertionAxiom(negRootClass, ind))) + } else if (k.getReasoner.isEntailed(k.getDataFactory.getOWLClassAssertionAxiom(negRootClass, ind))) { r2 = classify(ind, tree.getNegSubTree) + } var cP: Int = 0 var cn: Int = 0 @@ -169,7 +161,7 @@ object TDTClassifiers { if (missingVForTDT) { cP += classify(ind, tree.getPosSubTree) cn -= classify(ind, tree.getNegSubTree) - + if (cP > (-1 * cn)) return +1 else if (cP < (-1 * cn)) return -1 else return 0 @@ -178,380 +170,367 @@ object TDTClassifiers { else if ((r1 != 0)) r1 else r2 } - - - /** - * @param know - * @param concept - * @param dim - * @param posExs - * @param negExs - * @return - */ - private def generateRefs(know: KB, concept: OWLClassExpression, dim: Int, posExs: RDD[String], - negExs: RDD[String]): RDD[OWLClassExpression] = { - - println("\nGenerating node concepts: \n ") - var rConcepts: Array[OWLClassExpression] = Array.ofDim[OWLClassExpression](dim) - var newConcept: OWLClassExpression = null - var refinement: OWLClassExpression = null - var emptyIntersection: Boolean = false - - //val conceptExp = concept.nestedClassExpressions.iterator().asScala.toArray - val C = concept.asConjunctSet() - val ConceptExp = concept.asConjunctSet().iterator().asScala.toSeq - //println("\nconcept set " + C ) - - for (c <- 0 until dim) { - - do { - emptyIntersection = false //true - 
val Concepts: HashSet[OWLClassExpression] = new HashSet[OWLClassExpression]() - - if (concept.equals(know.getDataFactory().getOWLThing)) - refinement = new RefinementOperator(know).getRandomConcept(know) - else - refinement = new RefinementOperator(know).getSubsumedRandomConcept(concept) - - /* val con: OWLEquivalentClassesAxiom = know.dataFactory.getOWLEquivalentClassesAxiom(concept) + + /** + * @param know + * @param concept + * @param dim + * @param posExs + * @param negExs + * @return + */ + private def generateRefs(know: KB, concept: OWLClassExpression, dim: Int, posExs: RDD[String], + negExs: RDD[String]): RDD[OWLClassExpression] = { + + println("\nGenerating node concepts: \n ") + var rConcepts: Array[OWLClassExpression] = Array.ofDim[OWLClassExpression](dim) + var newConcept: OWLClassExpression = null + var refinement: OWLClassExpression = null + var emptyIntersection: Boolean = false + + // val conceptExp = concept.nestedClassExpressions.iterator().asScala.toArray + val C = concept.asConjunctSet() + val ConceptExp = concept.asConjunctSet().iterator().asScala.toSeq + // println("\nconcept set " + C ) + + for (c <- 0 until dim) { + + do { + emptyIntersection = false // true + val Concepts: HashSet[OWLClassExpression] = new HashSet[OWLClassExpression]() + + if (concept.equals(know.getDataFactory().getOWLThing)) { + refinement = new RefinementOperator(know).getRandomConcept(know) + } else { + refinement = new RefinementOperator(know).getSubsumedRandomConcept(concept) + } + + /* val con: OWLEquivalentClassesAxiom = know.dataFactory.getOWLEquivalentClassesAxiom(concept) val conExp: Array[OWLClassExpression] = con.classExpressions.iterator().asScala.toArray println("Concept Expressions = " ) - conExp.foreach(println(_))*/ - - val refInstance: Boolean = refinement.isInstanceOf[OWLObjectAllValuesFrom] - breakable{ - - for (i <- ConceptExp) - { - if (i.isInstanceOf[OWLObjectSomeValuesFrom]){ - val y: OWLObjectSomeValuesFrom = i.asInstanceOf[OWLObjectSomeValuesFrom] - val conprop: OWLObjectProperty = y.getProperty.getNamedProperty - val confiller : OWLClassExpression = y.getFiller - /*println("============================") + conExp.foreach(println(_)) */ + + val refInstance: Boolean = refinement.isInstanceOf[OWLObjectAllValuesFrom] + breakable { + + for (i <- ConceptExp) { + if (i.isInstanceOf[OWLObjectSomeValuesFrom]) { + val y: OWLObjectSomeValuesFrom = i.asInstanceOf[OWLObjectSomeValuesFrom] + val conprop: OWLObjectProperty = y.getProperty.getNamedProperty + val confiller: OWLClassExpression = y.getFiller + /* println("============================") println("concept property = " + conprop) - println("concept filler = " + confiller)*/ - - if (refInstance){ - val x : OWLObjectAllValuesFrom = refinement.asInstanceOf[OWLObjectAllValuesFrom] - val rprop: OWLObjectProperty = x.getProperty.getNamedProperty - val rfiller: OWLClassExpression = x.getFiller - // println("refienment property = " + rprop) - //println("refienment filler = " + rfiller) - if (conprop == rprop) break - + println("concept filler = " + confiller) */ + + if (refInstance) { + val x: OWLObjectAllValuesFrom = refinement.asInstanceOf[OWLObjectAllValuesFrom] + val rprop: OWLObjectProperty = x.getProperty.getNamedProperty + val rfiller: OWLClassExpression = x.getFiller + // println("refienment property = " + rprop) + // println("refienment filler = " + rfiller) + if (conprop == rprop) break + + } + } + } + if ((!(ConceptExp.contains(refinement)))) { + Concepts.add(concept) + Concepts.add(refinement) + newConcept = 
know.getDataFactory.getOWLObjectIntersectionOf(Concepts) + if (newConcept != null) { + emptyIntersection = !know.getReasoner.isSatisfiable(newConcept) } } } - if ((!(ConceptExp.contains(refinement)))) - { - Concepts.add(concept) - Concepts.add(refinement) - newConcept = know.getDataFactory.getOWLObjectIntersectionOf(Concepts) - if (newConcept != null) - emptyIntersection = !know.getReasoner.isSatisfiable(newConcept) - } - } - - - } while (emptyIntersection ) - - rConcepts(c) = - if (newConcept != null) newConcept - else concept - + + } while (emptyIntersection) + + rConcepts(c) = + if (newConcept != null) newConcept + else concept + + } + var Refs: RDD[OWLClassExpression] = sc.sparkContext.parallelize(rConcepts) + var nRef = Refs.distinct().count.toInt + println("\nNo. of generated concepts: " + nRef) + Refs.distinct() } - var Refs: RDD[OWLClassExpression] = sc.sparkContext.parallelize(rConcepts) - var nRef = Refs.distinct().count.toInt - println("\nNo. of generated concepts: " + nRef) - Refs.distinct() - } - - //val iterator: Iterator[OWLIndividual] = know.getReasoner().getInstances(newConcept, false).entities().iterator().asInstanceOf[Iterator[OWLIndividual]] - //val nextInd : OWLIndividual = iterator.next() - - /** - * Selecting the best in a list (RDD) of refinements - * @param know - * @param concepts - * @param posExs - * @param negExs - * @param undExs - * @param prPos - * @param prNeg - * @return - */ - def selectBestConcept(know: KB, - concepts: RDD[OWLClassExpression], - posExs: RDD[String], - negExs: RDD[String], - undExs: RDD[String], - prPos: Double, prNeg: Double): OWLClassExpression = { + // val iterator: Iterator[OWLIndividual] = know.getReasoner().getInstances(newConcept, false).entities().iterator().asInstanceOf[Iterator[OWLIndividual]] + // val nextInd : OWLIndividual = iterator.next() - var bestConceptIndex: Int = 0 + /** + * Selecting the best in a list (RDD) of refinements + * @param know + * @param concepts + * @param posExs + * @param negExs + * @param undExs + * @param prPos + * @param prNeg + * @return + */ - println("\nThe First concept is: " + concepts.first()) - var counts: Array[Int] = getSplitCounts(know, concepts.first(), posExs, negExs, undExs) + def selectBestConcept( + know: KB, + concepts: RDD[OWLClassExpression], + posExs: RDD[String], + negExs: RDD[String], + undExs: RDD[String], + prPos: Double, prNeg: Double): OWLClassExpression = { + + var bestConceptIndex: Int = 0 - println("\nPL:" +counts(0) +",\t NL:" + counts(1) + ",\t UL:" + counts(2) + ",\tPR:" + counts(3) + + println("\nThe First concept is: " + concepts.first()) + var counts: Array[Int] = getSplitCounts(know, concepts.first(), posExs, negExs, undExs) + + println("\nPL:" + counts(0) + ",\t NL:" + counts(1) + ",\t UL:" + counts(2) + ",\tPR:" + counts(3) + ",\tNR:" + counts(4) + ",\tUR:" + counts(5)) - //var bestGain: Double = gain(counts, prPos, prNeg) - var bestGain: Double = gain(counts) - println("\nCurrent gain: "+ bestGain) + // var bestGain: Double = gain(counts, prPos, prNeg) + var bestGain: Double = gain(counts) + println("\nCurrent gain: " + bestGain) - for (c <- 1 until concepts.count.toInt) { + for (c <- 1 until concepts.count.toInt) { - var nConcept = concepts.take(concepts.count.toInt).apply(c) - println("\nConcept " + (c+1) +" is: " + nConcept) + var nConcept = concepts.take(concepts.count.toInt).apply(c) + println("\nConcept " + (c + 1) + " is: " + nConcept) - counts = getSplitCounts(know, nConcept, posExs, negExs, undExs) - println("\nPL:" +counts(0) +",\t NL:" + counts(1) + 
",\t UL:" + counts(2) + ",\tPR:" + counts(3) + - ",\tNR:" + counts(4) + ",\tUR:" + counts(5)) + counts = getSplitCounts(know, nConcept, posExs, negExs, undExs) + println("\nPL:" + counts(0) + ",\t NL:" + counts(1) + ",\t UL:" + counts(2) + ",\tPR:" + counts(3) + + ",\tNR:" + counts(4) + ",\tUR:" + counts(5)) - //var thisGain: Double = gain(counts, prPos, prNeg) - var thisGain: Double = gain(counts) - println("\nCurrent gain: " + thisGain) + // var thisGain: Double = gain(counts, prPos, prNeg) + var thisGain: Double = gain(counts) + println("\nCurrent gain: " + thisGain) - if (thisGain > bestGain) { - bestConceptIndex = c - bestGain = thisGain + if (thisGain > bestGain) { + bestConceptIndex = c + bestGain = thisGain + } + } + + val nCpt = concepts.take(concepts.count.toInt).apply(bestConceptIndex) + + if (bestGain == 0.0) { + null + // val parts = nCpt.nestedClassExpressions.iterator().asScala.toList + // val ref = parts.last + // val x = parts.filterNot(elem => elem == ref) + // println("refienment removed: ") + // x.foreach(println(_)) + // var y: ArrayList[OWLClassExpression] = new ArrayList() + // var i = 0 + // while (i< x.size) + // { + // val z = x.get(i) + // y.add(z) + // i = i+1 + // } + // + // nCpt + } else { + println("\n --------\nBest gain: " + bestGain + " \t Split index: " + bestConceptIndex) + println("\nPL:" + counts(0) + ",\t NL:" + counts(1) + ",\t UL:" + counts(2) + ",\tPR:" + counts(3) + + ",\tNR:" + counts(4) + ",\tUR:" + counts(5)) + + println("\n Best concept is: " + nCpt) + nCpt } } - - val nCpt = concepts.take(concepts.count.toInt).apply(bestConceptIndex) - - if (bestGain == 0.0) { - null -// val parts = nCpt.nestedClassExpressions.iterator().asScala.toList -// val ref = parts.last -// val x = parts.filterNot(elem => elem == ref) -// println("refienment removed: ") -// x.foreach(println(_)) -// var y: ArrayList[OWLClassExpression] = new ArrayList() -// var i = 0 -// while (i< x.size) -// { -// val z = x.get(i) -// y.add(z) -// i = i+1 -// } -// -// nCpt - } - else { - println("\n --------\nBest gain: " + bestGain + " \t Split index: " + bestConceptIndex) - println("\nPL:" +counts(0) +",\t NL:" + counts(1) + ",\t UL:" + counts(2) + ",\tPR:" + counts(3) + - ",\tNR:" + counts(4) + ",\tUR:" + counts(5)) - - println("\n Best concept is: " + nCpt) - nCpt - } - } - /** - * @param counts - * @return The calculated Gain - */ + /** + * @param counts + * @return The calculated Gain + */ - /* - * Function to calculate the gain - */ - - private def gain(counts: Array[Int]): Double = { - - var gain: Double = 0.0 - val totalL: Double = counts(PL) + counts(NL) + 0.001 - val totalR: Double = counts(PR) + counts(NR) + 0.001 - val total: Double = totalL + totalR - val pPL: Double = counts(PL) / totalL - val pPR: Double = counts(PR) / totalR - val pNL: Double = counts(NL) / totalL - val pNR: Double = counts(NR) / totalR - - if (Math.abs(pPL + pPR) != 0 && Math.abs(pNL + pNR) != 0 ) - { - gain = (totalL / total) * (totalR / total) * + /** + * Function to calculate the gain + */ + private def gain(counts: Array[Int]): Double = { + + var gain: Double = 0.0 + val totalL: Double = counts(PL) + counts(NL) + 0.001 + val totalR: Double = counts(PR) + counts(NR) + 0.001 + val total: Double = totalL + totalR + val pPL: Double = counts(PL) / totalL + val pPR: Double = counts(PR) / totalR + val pNL: Double = counts(NL) / totalL + val pNR: Double = counts(NR) / totalR + + if (Math.abs(pPL + pPR) != 0 && Math.abs(pNL + pNR) != 0) { + gain = (totalL / total) * (totalR / total) * 
Math.pow(Math.abs(pPL - pPR) / Math.abs(pPL + pPR) + Math.abs(pNL - pNR) / Math.abs(pNL + pNR), 2) + } + gain + } - - gain - - } - - - /** - * @param counts - * @param prPos - * @param prNeg - * @return The calculated Gain - */ - /* - * Function to calculate the gain based on gini index - */ + /** + * @param counts + * @param prPos + * @param prNeg + * @return The calculated Gain + */ + + /** + * Function to calculate the gain based on gini index + */ + + /* def gain(counts: Array[Int], prPos: Double, prNeg: Double): Double = { - /* def gain(counts: Array[Int], prPos: Double, prNeg: Double): Double = { - val Trsize: Double = counts(0) + counts(1) val Flsize: Double = counts(3) + counts(4) val Usize: Double = counts(2) + counts(5)// + counts(6) + counts(7) - + val size: Double = Trsize + Flsize + Usize - - val startImpurity : Double = gini(counts(0) + counts(3), counts(1) + counts(4), prPos, prNeg) - + val startImpurity : Double = gini(counts(0) + counts(3), counts(1) + counts(4), prPos, prNeg) val TrImpurity = gini(counts(0), counts(1), prPos, prNeg) val FlImpurity = gini(counts(3), counts(4), prPos, prNeg) val UImpurity = gini(counts(2) , counts(5), prPos, prNeg) //counts(2)+ counts(6), counts(5) + counts(7) - + val Gainval = startImpurity - (Trsize/size)*TrImpurity - (Flsize/size)*FlImpurity - -(Usize/size)*UImpurity - + Gainval } - + def gini(nPos: Double, nNeg: Double, prPos: Double, prNeg: Double): Double = { - + val estimatProp : Int = 3 val total: Double = nPos + nNeg - + val p1 : Double = (nPos*estimatProp*prPos)/(total+estimatProp) val p2: Double = (nNeg*estimatProp*prNeg)/(total+estimatProp) - + val ginival = 1.0-p1*p1-p2*p2 ginival - }*/ - - - - /** - * @param know - * @param concept - * @param posExs - * @param negExs - * @param undExs - * @return - */ + } */ - private def getSplitCounts(know: KB, - concept: OWLClassExpression, - posExs: RDD[String], - negExs: RDD[String], - undExs: RDD[String]): Array[Int] = { - - val counts: Array[Int] = Array.ofDim[Int](6) - - val Pos = splitGroup(know, concept, posExs) - val Neg = splitGroup(know, concept, negExs) - val Und = splitGroup(know, concept, undExs) - - counts(PL) = Pos._1.count.toInt - counts(NL) = Neg._1.count.toInt - counts(UL) = Und._1.count.toInt - counts(PR) = Pos._2.count.toInt - counts(NR) = Neg._2.count.toInt - counts(UR) = Und._2.count.toInt - - counts - } + /** + * @param know + * @param concept + * @param posExs + * @param negExs + * @param undExs + * @return + */ - /** - * @param know - * @param concept - * @param nodeExamples - * @param leftExs - * @param rightExs - */ - private def splitGroup(know: KB, - concept: OWLClassExpression, - nodeExamples: RDD[String]): (RDD[String], RDD[String]) = { + private def getSplitCounts( + know: KB, + concept: OWLClassExpression, + posExs: RDD[String], + negExs: RDD[String], + undExs: RDD[String]): Array[Int] = { + + val counts: Array[Int] = Array.ofDim[Int](6) - /*println("\nNode examples: \n ----------") - nodeExamples.take(nodeExamples.count.toInt).foreach(println(_))*/ + val Pos = splitGroup(know, concept, posExs) + val Neg = splitGroup(know, concept, negExs) + val Und = splitGroup(know, concept, undExs) - val negConcept: OWLClassExpression = know.getDataFactory.getOWLObjectComplementOf(concept) - - var Left = new ArrayList[String]() - var Right = new ArrayList[String]() + counts(PL) = Pos._1.count.toInt + counts(NL) = Neg._1.count.toInt + counts(UL) = Und._1.count.toInt + counts(PR) = Pos._2.count.toInt + counts(NR) = Neg._2.count.toInt + counts(UR) = Und._2.count.toInt - for 
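The purity-gain criterion above weights the left/right partitions by their size and squares the normalised difference in positive and negative proportions between the two sides. A minimal standalone sketch of the same formula, assuming the PL, NL, UL, PR, NR, UR index order shown in the println statements above; the example counts in main are made up for illustration:

object GainSketch {
  // Same split criterion as gain(counts) in TDTClassifiers (sketch, not the original code).
  def gain(counts: Array[Int]): Double = {
    val totalL = counts(0) + counts(1) + 0.001 // positives + negatives covered by the concept
    val totalR = counts(3) + counts(4) + 0.001 // positives + negatives covered by its complement
    val total = totalL + totalR
    val (pPL, pNL) = (counts(0) / totalL, counts(1) / totalL)
    val (pPR, pNR) = (counts(3) / totalR, counts(4) / totalR)
    if (math.abs(pPL + pPR) == 0 || math.abs(pNL + pNR) == 0) 0.0
    else (totalL / total) * (totalR / total) *
      math.pow(math.abs(pPL - pPR) / math.abs(pPL + pPR) +
               math.abs(pNL - pNR) / math.abs(pNL + pNR), 2)
  }

  def main(args: Array[String]): Unit = {
    println(gain(Array(4, 0, 0, 0, 3, 0))) // ~0.98: the refinement cleanly separates positives from negatives
    println(gain(Array(2, 2, 0, 2, 1, 0))) // ~0.03: the refinement leaves the examples mixed
  }
}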
(e <- 0 until nodeExamples.count.toInt) { + counts + } + + /** + * @param know + * @param concept + * @param nodeExamples + * @param leftExs + * @param rightExs + */ + private def splitGroup( + know: KB, + concept: OWLClassExpression, + nodeExamples: RDD[String]): (RDD[String], RDD[String]) = { + + /* println("\nNode examples: \n ----------") + nodeExamples.take(nodeExamples.count.toInt).foreach(println(_)) */ - val nodeEx = nodeExamples.take(e + 1).apply(e) - val nodeInd = know.getDataFactory().getOWLNamedIndividual(nodeEx).asInstanceOf[OWLIndividual] + val negConcept: OWLClassExpression = know.getDataFactory.getOWLObjectComplementOf(concept) - if (know.getReasoner().isEntailed(know.getDataFactory.getOWLClassAssertionAxiom(concept, nodeInd))) { + var Left = new ArrayList[String]() + var Right = new ArrayList[String]() + + for (e <- 0 until nodeExamples.count.toInt) { + + val nodeEx = nodeExamples.take(e + 1).apply(e) + val nodeInd = know.getDataFactory().getOWLNamedIndividual(nodeEx).asInstanceOf[OWLIndividual] + + if (know.getReasoner().isEntailed(know.getDataFactory.getOWLClassAssertionAxiom(concept, nodeInd))) { Left.add(nodeEx) - - } else if (know.getReasoner().isEntailed(know.getDataFactory.getOWLClassAssertionAxiom(negConcept, nodeInd))) { + + } else if (know.getReasoner().isEntailed(know.getDataFactory.getOWLClassAssertionAxiom(negConcept, nodeInd))) { Right.add(nodeEx) - - } else { + + } else { Left.add(nodeEx) Right.add(nodeEx) + } } - } - val leftRDD = sc.sparkContext.parallelize(Left.asScala) - val rightRDD = sc.sparkContext.parallelize(Right.asScala) + val leftRDD = sc.sparkContext.parallelize(Left.asScala) + val rightRDD = sc.sparkContext.parallelize(Right.asScala) - /*println("\nleft ex: ") + /* println("\nleft ex: ") leftRDD.take(20).foreach(println(_)) println("\nright ex: ") - rightRDD.take(20).foreach(println(_))*/ - - (leftRDD, rightRDD) - - - //val propName: RDD[String] = know.getIndividuals().map( ind => ind.asOWLNamedIndividual().getIRI.getShortForm) - // println("\n nodeEx = " + nodeEx ) - //val Filtered = know.getIndividuals().filter(_ == nodeInd) + rightRDD.take(20).foreach(println(_)) */ + + (leftRDD, rightRDD) + + // val propName: RDD[String] = know.getIndividuals().map( ind => ind.asOWLNamedIndividual().getIRI.getShortForm) + // println("\n nodeEx = " + nodeEx ) + // val Filtered = know.getIndividuals().filter(_ == nodeInd) // println("\n filtered = " ) // Filtered.take(10).foreach(println(_)) - //val exIndex = ex.lookup(e) + // val exIndex = ex.lookup(e) // println("the element: ") - //exInd.take(1).foreach(println(_)) - //val ind = know.getDataFactory().getOWLNamedIndividual(IRI.create(nodeEx)).asInstanceOf[OWLIndividual] - //println("newexample " + ind ) + // exInd.take(1).foreach(println(_)) + // val ind = know.getDataFactory().getOWLNamedIndividual(IRI.create(nodeEx)).asInstanceOf[OWLIndividual] + // println("newexample " + ind ) - //val x = know.getIndividuals().take(nodeExamples.count.toInt).apply(e) - //val x = know.getIndividuals().filter( _ == neew).first() + // val x = know.getIndividuals().take(nodeExamples.count.toInt).apply(e) + // val x = know.getIndividuals().filter( _ == neew).first() - //x.take(20).foreach(println(_)) + // x.take(20).foreach(println(_)) - //val r =know.getReasoner().isEntailed(know.getDataFactory.getOWLClassAssertionAxiom(concept, ind)) - //println("\n r = " + r) + // val r =know.getReasoner().isEntailed(know.getDataFactory.getOWLClassAssertionAxiom(concept, ind)) + // println("\n r = " + r) // val l 
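splitGroup above routes each example three ways: to the left partition when the concept is entailed for it, to the right when its complement is entailed, and to both partitions when the reasoner can decide neither (the uncertain case). A minimal sketch of that routing rule with the two entailment checks abstracted as predicates; the names are illustrative and not part of the original API:

// Sketch of the three-way routing in splitGroup: concept entailed -> left only,
// complement entailed -> right only, undecided -> both sides.
def threeWaySplit[A](examples: Seq[A],
                     inConcept: A => Boolean,
                     inComplement: A => Boolean): (Seq[A], Seq[A]) = {
  val routed = examples.map { e =>
    if (inConcept(e)) (Seq(e), Seq.empty[A])
    else if (inComplement(e)) (Seq.empty[A], Seq(e))
    else (Seq(e), Seq(e))
  }
  (routed.flatMap(_._1), routed.flatMap(_._2))
}

// threeWaySplit(Seq("a", "b", "c"), Set("a"), Set("b")) == (Seq("a", "c"), Seq("b", "c"))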
=know.getReasoner().isEntailed(know.getDataFactory.getOWLClassAssertionAxiom(negConcept, ind)) - //println("\n l = " + l) - } - - /** - * @param know - * @param concept - * @param posExs - * @param negExs - * @param undExs - */ - - private def split(know: KB, - concept: OWLClassExpression, - posExs: RDD[String], negExs: RDD[String], undExs: RDD[String]): - ((RDD[String], RDD[String]), (RDD[String], RDD[String]), (RDD[String], RDD[String])) = { + // println("\n l = " + l) + } - val Pos = splitGroup(know, concept, posExs) - val Neg = splitGroup(know, concept, negExs) - val Und = splitGroup(know, concept, undExs) - - (Pos, Neg, Und) - } + /** + * @param know + * @param concept + * @param posExs + * @param negExs + * @param undExs + */ - }//class + private def split( + know: KB, + concept: OWLClassExpression, + posExs: RDD[String], negExs: RDD[String], undExs: RDD[String]): ((RDD[String], RDD[String]), (RDD[String], RDD[String]), (RDD[String], RDD[String])) = { - + val Pos = splitGroup(know, concept, posExs) + val Neg = splitGroup(know, concept, negExs) + val Und = splitGroup(know, concept, undExs) + (Pos, Neg, Und) + } + } // class /** * Selecting the best in a list (RDD) of refinements using Entropy calculations @@ -567,7 +546,7 @@ object TDTClassifiers { * @return */ - /* def selectBestConceptEntropy(know: KB, concepts: RDD[OWLClassExpression], + /* def selectBestConceptEntropy(know: KB, concepts: RDD[OWLClassExpression], posExs: RDD[String], negExs: RDD[String], undExs: RDD[String], @@ -607,13 +586,13 @@ object TDTClassifiers { val nCpt = n.lookup(bestConceptIndex).asInstanceOf[OWLClassExpression] println("\n %s\n\n", nCpt) nCpt - }*/ + } */ - /* + /** * Function to calculate the Entropy value */ - /* def Entropy(counts: Array[Int], prPos: Double, prNeg: Double, sizPos: Int, sizNeg: Int): Double = { + /* def Entropy(counts: Array[Int], prPos: Double, prNeg: Double, sizPos: Int, sizNeg: Int): Double = { val nP = counts(0) + counts(1) val nN = counts(3) + counts(4) val nU = counts(2) + counts(5) + counts(6) + counts(7) @@ -654,6 +633,6 @@ object TDTClassifiers { - (2 - p1 - p2) * (p1 * Math.log(p1) - p2 * Math.log(p2))) EntropyValue - }*/ + } */ -}//object +}// object diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TDTInducer.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TDTInducer.scala index e3cec25..a1873be 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TDTInducer.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TDTInducer.scala @@ -1,21 +1,15 @@ package net.sansa_stack.ml.spark.classification import java.io.PrintStream -import java.util.ArrayList -import java.util.List -import java.util.Arrays -import java.util.HashSet -import collection.JavaConverters._ -import scala.collection +import java.util.{ ArrayList, Arrays, HashSet, List } -import org.semanticweb.owlapi.model.OWLClassExpression -import org.semanticweb.owlapi.model.OWLIndividual -import org.semanticweb.owlapi.model.OWLNamedIndividual -import org.semanticweb.HermiT.Reasoner +import scala.collection +import collection.JavaConverters._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession -import org.apache.spark.{SparkConf, SparkContext} +import org.semanticweb.HermiT.Reasoner +import org.semanticweb.owlapi.model.{ OWLClassExpression, OWLIndividual, OWLNamedIndividual } import net.sansa_stack.ml.spark.classification._ import net.sansa_stack.ml.spark.classification.KB.KB @@ 
-26,151 +20,147 @@ import net.sansa_stack.ml.spark.classification.TDTClassifiers.TDTClassifiers */ object TDTInducer { - var stream: PrintStream = _ - -class TDTInducer(var kb: KB, var nConcepts: Int, var sc: SparkSession) { + var stream: PrintStream = _ -//for each query concept induce an ensemble - var trees: Array[DLTree] = new Array[DLTree](nConcepts) + class TDTInducer(var kb: KB, var nConcepts: Int, var sc: SparkSession) { - var cl: TDTClassifiers = new TDTClassifiers(kb, sc) + // for each query concept induce an ensemble + var trees: Array[DLTree] = new Array[DLTree](nConcepts) - - - /* - * Function for training the algorithm - */ - def training(results: Array[Array[Int]], trainingExs: RDD[OWLIndividual], - testConcepts: Array[OWLClassExpression], - negTestConcepts: Array[OWLClassExpression]): Unit = { - - val op: RefinementOperator = new RefinementOperator(kb) - val reasoner: Reasoner = kb.getReasoner - val allExamples: RDD[OWLIndividual] = kb.getIndividuals - - //val trainingExsSet: HashSet[Integer] = new HashSet[Integer](Arrays.asList(trainingExs: _*)) - - val length: Int = if (testConcepts != null) testConcepts.size else 1 - - for (c <- 0 until length) { - - println("\n--- Query Concept # " + (c+1)) - - // These instances should be divided into negative instances, positive and uncertain - // split._1 = posExs, split._2 = negExs, split._3 = undExs - val split = splitting(trainingExs, results, c) - - var prPos: Double = split._1.count.toDouble / (trainingExs.count.toInt) - var prNeg: Double = split._2.count.toDouble / (trainingExs.count.toInt) - println("Training set composition: " + split._1.count() + " - " + split._2.count() + " - " + split._3.count()) - - val Sum: Double = prPos + prNeg - if (Sum == 0) { - prPos = 0.5 - prNeg = 0.5 - } else { - prPos = prPos / Sum - prNeg = prNeg / Sum - } - println("\nNew learning problem prepared "+ (c+1)) - println("Learning a tree ") - trees(c) = cl.induceDLTree(kb.getDataFactory.getOWLThing, split._1, split._2, split._3, 50, prPos, prNeg) + var cl: TDTClassifiers = new TDTClassifiers(kb, sc) + + /** + * Function for training the algorithm + */ + def training(results: Array[Array[Int]], trainingExs: RDD[OWLIndividual], + testConcepts: Array[OWLClassExpression], + negTestConcepts: Array[OWLClassExpression]): Unit = { + + val op: RefinementOperator = new RefinementOperator(kb) + val reasoner: Reasoner = kb.getReasoner + val allExamples: RDD[OWLIndividual] = kb.getIndividuals + // val trainingExsSet: HashSet[Integer] = new HashSet[Integer](Arrays.asList(trainingExs: _*)) + + val length: Int = if (testConcepts != null) testConcepts.size else 1 + + for (c <- 0 until length) { + + println("\n--- Query Concept # " + (c + 1)) + + // These instances should be divided into negative instances, positive and uncertain + // split._1 = posExs, split._2 = negExs, split._3 = undExs + val split = splitting(trainingExs, results, c) + + var prPos: Double = split._1.count.toDouble / (trainingExs.count.toInt) + var prNeg: Double = split._2.count.toDouble / (trainingExs.count.toInt) + println("Training set composition: " + split._1.count() + " - " + split._2.count() + " - " + split._3.count()) + + val Sum: Double = prPos + prNeg + if (Sum == 0) { + prPos = 0.5 + prNeg = 0.5 + } else { + prPos = prPos / Sum + prNeg = prNeg / Sum + } + println("\nNew learning problem prepared " + (c + 1)) + println("Learning a tree ") + trees(c) = cl.induceDLTree(kb.getDataFactory.getOWLThing, split._1, split._2, split._3, 50, prPos, prNeg) + + } } - } - - /* + + /* * Function for 
testing the algorithm */ - def test (f: Int, testExs: RDD[OWLIndividual], testConcepts: Array[OWLClassExpression]): Array[Array[Int]] = { - - // classifier answers for each example and for each concept - val labels: Array[Array[Int]] = Array.ofDim[Int](testExs.count.toInt, nConcepts) - - for (t <- 0 until testExs.count.toInt) { - val indTestEx = testExs.take(t+1).apply(t) - println("\n\nFold #" + (f+1)) - println(" ---\n Classifying Example " + (t+1) + "/" + testExs.count.toInt + " [" + indTestEx + "] ") - - //labels(t) = Array.ofDim[Int](nConcepts) - - - for (i <- 0 until nConcepts - 1) { - labels(t)(i) = cl.classify(indTestEx, trees(i)) + def test(f: Int, testExs: RDD[OWLIndividual], testConcepts: Array[OWLClassExpression]): Array[Array[Int]] = { + + // classifier answers for each example and for each concept + val labels: Array[Array[Int]] = Array.ofDim[Int](testExs.count.toInt, nConcepts) + + for (t <- 0 until testExs.count.toInt) { + val indTestEx = testExs.take(t + 1).apply(t) + println("\n\nFold #" + (f + 1)) + println(" ---\n Classifying Example " + (t + 1) + "/" + testExs.count.toInt + " [" + indTestEx + "] ") + + // labels(t) = Array.ofDim[Int](nConcepts) + + for (i <- 0 until nConcepts - 1) { + labels(t)(i) = cl.classify(indTestEx, trees(i)) + } } + labels } - labels - } - /* + /* * Function for splitting the training examples into positive, negative and undefined examples */ - - def splitting(trainingExs: RDD[OWLIndividual], classifications: Array[Array[Int]], c: Int): (RDD[String],RDD[String],RDD[String]) = { - - var BINARYCLASSIFICATION : Boolean = false -// var classRDD = sc.sparkContext.parallelize(classifications,2) -// var pos = classRDD.filter(_ == +1) - - var pos = new ArrayList[String]() - var neg = new ArrayList[String]() - var und = new ArrayList[String]() - var TExs = trainingExs.zipWithIndex() - - for (i <-0 until trainingExs.count.toInt){ - - val trainValue = trainingExs.take(i+1).apply(i) - //var trainIndex = TExs.lookup(trainValue) - //println("\nvalue : " + trainValue) - val trainIndex = trainingExs.take(trainingExs.count.toInt).indexOf(trainValue) - // println("index : " + trainIndex) - -/* var p = trainingExs.filter{ exs => + + def splitting(trainingExs: RDD[OWLIndividual], classifications: Array[Array[Int]], c: Int): (RDD[String], RDD[String], RDD[String]) = { + + var BINARYCLASSIFICATION: Boolean = false + // var classRDD = sc.sparkContext.parallelize(classifications,2) + // var pos = classRDD.filter(_ == +1) + + var pos = new ArrayList[String]() + var neg = new ArrayList[String]() + var und = new ArrayList[String]() + var TExs = trainingExs.zipWithIndex() + + for (i <- 0 until trainingExs.count.toInt) { + + val trainValue = trainingExs.take(i + 1).apply(i) + // var trainIndex = TExs.lookup(trainValue) + // println("\nvalue : " + trainValue) + val trainIndex = trainingExs.take(trainingExs.count.toInt).indexOf(trainValue) + // println("index : " + trainIndex) + + /* var p = trainingExs.filter{ exs => val v = exs.toString() - - }*/ - - if (trainIndex != -1){ - val value = trainValue.toString() - if (classifications(c)(trainIndex) == +1) + + } */ + + if (trainIndex != -1) { + val value = trainValue.toString() + if (classifications(c)(trainIndex) == +1) { pos.add(value) - else if (!BINARYCLASSIFICATION) { - if (classifications(c)(trainIndex) == -1) + } else if (!BINARYCLASSIFICATION) { + if (classifications(c)(trainIndex) == -1) { + neg.add(value) + } else { + und.add(value) + } + } else { neg.add(value) - else - und.add(value) + } + } } - else - neg.add(value) 
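The splitting step above assigns each training individual to the positive, negative, or uncertain set according to the classification matrix: +1 means positive, -1 negative, anything else uncertain, and when binary classification is forced every non-positive label collapses to negative. A compact, illustrative restatement of that labelling rule (hypothetical helper, not part of the original code):

// Labelling rule used by splitting(...): sketch only.
def label(classification: Int, binary: Boolean): String =
  classification match {
    case 1                 => "positive"
    case -1 if !binary     => "negative"
    case _ if !binary      => "uncertain"
    case _                 => "negative" // binary mode: everything non-positive is negative
  }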
- } - } - var posExs = sc.sparkContext.parallelize(pos.asScala) - var negExs = sc.sparkContext.parallelize(neg.asScala) - var undExs = sc.sparkContext.parallelize(und.asScala) - - (posExs, negExs, undExs) - } -// val TList : List[Integer]= new ArrayList[Integer] -// var T = sc.sparkContext.parallelize(TList.asScala) -// -// var TExs = trainingExs.zipWithIndex() -// for (e <- 0 until trainingExs.count.toInt) { -// -// var index = TExs.lookup(e) -// T.union(index) - //val Train = sc.sparkContext.parallelize(T.asScala) - - /*if (classifications(c)(TExs.lookup(e)) == +1) posExs.union(T) + var posExs = sc.sparkContext.parallelize(pos.asScala) + var negExs = sc.sparkContext.parallelize(neg.asScala) + var undExs = sc.sparkContext.parallelize(und.asScala) + + (posExs, negExs, undExs) + } + // val TList : List[Integer]= new ArrayList[Integer] + // var T = sc.sparkContext.parallelize(TList.asScala) + // + // var TExs = trainingExs.zipWithIndex() + // for (e <- 0 until trainingExs.count.toInt) { + // + // var index = TExs.lookup(e) + // T.union(index) + // val Train = sc.sparkContext.parallelize(T.asScala) + + /* if (classifications(c)(TExs.lookup(e)) == +1) posExs.union(T) else if (!BINARYCLASSIFICATION) { if (classifications(c)(TExs.lookup(e)) == -1) negExs.union(T) else undExs.union(T) - } else negExs.union(T)*/ - //} + } else negExs.union(T) */ + // } - - - /* def getComplexityValues(sc: SparkSession): Array[Double] = { + /* def getComplexityValues(sc: SparkSession): Array[Double] = { // a measure to express the model complexity (e.g. the number of nodes in a tree) val complexityValue: Array[Double] = Array.ofDim[Double](trees.length) @@ -179,7 +169,6 @@ class TDTInducer(var kb: KB, var nConcepts: Int, var sc: SparkSession) { complexityValue(i) = current } complexityValue - }*/ - + } */ } -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TermDecisionTrees.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TermDecisionTrees.scala index 71f23f7..2a6dbcb 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TermDecisionTrees.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/TermDecisionTrees.scala @@ -1,30 +1,30 @@ -package net.sansa_stack.ml.spark.classification +package net.sansa_stack.ml.spark.classification import java.util.ArrayList -import scala.reflect.runtime.universe._ + import scala.collection.JavaConverters._ +import scala.reflect.runtime.universe._ + +import org.apache.log4j.{ Level, Logger } +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SparkSession import org.semanticweb.owlapi.model.OWLClassExpression import org.semanticweb.owlapi.model.OWLIndividual +import scopt.OptionParser -import net.sansa_stack.ml.spark.classification.KB.KB import net.sansa_stack.ml.spark.classification.ClassMembership.ClassMembership +import net.sansa_stack.ml.spark.classification.KB.KB import net.sansa_stack.ml.spark.classification.TDTClassifiers.TDTClassifiers - import net.sansa_stack.owl.spark.rdd.FunctionalSyntaxOWLAxiomsRDDBuilder import net.sansa_stack.owl.spark.rdd.OWLAxiomsRDD -import scopt.OptionParser -import org.apache.log4j.{Level, Logger} -import org.apache.spark.sql.SparkSession -import org.apache.spark.rdd.RDD - object TermDecisionTrees { - /* - * The main file to call Terminological Decision Trees for Classification - */ + /** + * The main file to call Terminological Decision Trees for Classification + */ - def main(args: 
Array[String]) = { + def main(args: Array[String]): Unit = { val input = "src/main/resources/Classification/trains.owl" @@ -38,61 +38,59 @@ object TermDecisionTrees { .config("spark.kryo.registrator", "net.sansa_stack.ml.spark.classification.Registrator") .appName("Termnological Decision Tree") .getOrCreate() - - //Call owl axion builder to read the classes and object properties and print - - val rdd : OWLAxiomsRDD = FunctionalSyntaxOWLAxiomsRDDBuilder.build(sparkSession, input) - + + // Call owl axion builder to read the classes and object properties and print + + val rdd: OWLAxiomsRDD = FunctionalSyntaxOWLAxiomsRDDBuilder.build(sparkSession, input) + val kb: KB = new KB(input, rdd, sparkSession) var ClassM = new ClassMembership(kb, sparkSession) val ClassName = TDTInducer.toString() ClassM.bootstrap(10, ClassName, sparkSession) - //val c : TDTInducer = new TDTInducer(kb, kb.Concepts.count().toInt, sparkSession) - -// var PosExamples = sparkSession.sparkContext.parallelize(Array("http://example.com/foo#east1", -// "http://example.com/foo#east2", -// "http://example.com/foo#east3", -// "http://example.com/foo#east4", -// "http://example.com/foo#east5")) -// -// var NegExamples = sparkSession.sparkContext.parallelize(Array("http://example.com/foo#west6", -// "http://example.com/foo#west7", -// "http://example.com/foo#west8", -// "http://example.com/foo#west9", -// "http://example.com/foo#west10")) -// -// var UndExamples = sparkSession.sparkContext.parallelize(new ArrayList[String]().asScala) -// -// val numPos: Double = PosExamples.count -// val numNeg: Double = NegExamples.count -// val perPos: Double = numPos / (numPos + numNeg) -// val perNeg: Double = numNeg / (numPos + numNeg) -// -// println("\nLearning problem: \n --------------------\n") -// println("No. of Positive examples: " + PosExamples.count) -// println("No. of Negative examples: " + NegExamples.count) -// println("No. 
of Undefined examples: " + UndExamples.count) -// println("\nper Pos: " + perPos) -// println("per Neg: " + perNeg) -// -// val nGeneratedRef: Int = 50 -// -// val c : TDTClassifiers = new TDTClassifiers (kb, sparkSession) -// val tree : DLTree = c.induceDLTree(kb.getDataFactory.getOWLThing, PosExamples, NegExamples, UndExamples, nGeneratedRef, perPos, perNeg) -// -// val Root: OWLClassExpression = tree.getRoot() -// println("\nRoot of the tree is: " + Root) - - /*val possubtree = tree.getPosSubTree().toString() - println("possubtree: " + possubtree)*/ - - //val ind = kb.getDataFactory().getOWLNamedIndividual("http://example.com/foo#east2") - //val classification : Int = c.classify(ind, tree) - //println("\nclassification of east2 is " + classification) - - sparkSession.stop + // val c : TDTInducer = new TDTInducer(kb, kb.Concepts.count().toInt, sparkSession) - } + // var PosExamples = sparkSession.sparkContext.parallelize(Array("http://example.com/foo#east1", + // "http://example.com/foo#east2", + // "http://example.com/foo#east3", + // "http://example.com/foo#east4", + // "http://example.com/foo#east5")) + // + // var NegExamples = sparkSession.sparkContext.parallelize(Array("http://example.com/foo#west6", + // "http://example.com/foo#west7", + // "http://example.com/foo#west8", + // "http://example.com/foo#west9", + // "http://example.com/foo#west10")) + // + // var UndExamples = sparkSession.sparkContext.parallelize(new ArrayList[String]().asScala) + // + // val numPos: Double = PosExamples.count + // val numNeg: Double = NegExamples.count + // val perPos: Double = numPos / (numPos + numNeg) + // val perNeg: Double = numNeg / (numPos + numNeg) + // + // println("\nLearning problem: \n --------------------\n") + // println("No. of Positive examples: " + PosExamples.count) + // println("No. of Negative examples: " + NegExamples.count) + // println("No. 
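The commented-out example above derives the class priors the same way TDTInducer.training does: per-class proportions over the training set, renormalised to sum to one, with a uniform 0.5/0.5 fallback when no labelled examples are present. A small sketch of that prior computation (assumed helper, for illustration only):

// Prior computation as in training(): proportions, renormalised, 0.5/0.5 fallback.
def classPriors(numPos: Long, numNeg: Long, total: Long): (Double, Double) = {
  if (total == 0) (0.5, 0.5)
  else {
    val (prPos, prNeg) = (numPos.toDouble / total, numNeg.toDouble / total)
    val sum = prPos + prNeg
    if (sum == 0) (0.5, 0.5) else (prPos / sum, prNeg / sum)
  }
}

// classPriors(8, 2, 10) == (0.8, 0.2); classPriors(0, 0, 10) == (0.5, 0.5)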
of Undefined examples: " + UndExamples.count) + // println("\nper Pos: " + perPos) + // println("per Neg: " + perNeg) + // + // val nGeneratedRef: Int = 50 + // + // val c : TDTClassifiers = new TDTClassifiers (kb, sparkSession) + // val tree : DLTree = c.induceDLTree(kb.getDataFactory.getOWLThing, PosExamples, NegExamples, UndExamples, nGeneratedRef, perPos, perNeg) + // + // val Root: OWLClassExpression = tree.getRoot() + // println("\nRoot of the tree is: " + Root) + + /* val possubtree = tree.getPosSubTree().toString() + println("possubtree: " + possubtree) */ + + // val ind = kb.getDataFactory().getOWLNamedIndividual("http://example.com/foo#east2") + // val classification : Int = c.classify(ind, tree) + // println("\nclassification of east2 is " + classification) -} \ No newline at end of file + sparkSession.stop + } +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/Utils.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/Utils.scala index bd0f03e..6d9a41a 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/Utils.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/classification/Utils.scala @@ -1,12 +1,13 @@ package net.sansa_stack.ml.spark.classification -import org.apache.spark.serializer.{ KryoRegistrator => SparkKryoRegistrator } import com.esotericsoftware.kryo.Kryo +import org.apache.spark.serializer.{ KryoRegistrator => SparkKryoRegistrator } import org.semanticweb.owlapi.model.OWLClass -import net.sansa_stack.ml.spark.classification.KB.KB import org.semanticweb.owlapi.reasoner.structural.StructuralReasoner -/* +import net.sansa_stack.ml.spark.classification.KB.KB + +/** * Class for serialization by the Kryo serializer. */ class Registrator extends SparkKryoRegistrator { @@ -17,4 +18,4 @@ class Registrator extends SparkKryoRegistrator { kryo.register(classOf[StructuralReasoner]) kryo.register(classOf[net.sansa_stack.ml.spark.classification.KB.KB]) } -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/BorderFlow.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/BorderFlow.scala similarity index 89% rename from sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/BorderFlow.scala rename to sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/BorderFlow.scala index 4434e6f..d010777 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/BorderFlow.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/BorderFlow.scala @@ -1,38 +1,32 @@ -package net.sansa_stack.ml.spark.clustering +import java.io._ +import java.io.{ ByteArrayInputStream, FileNotFoundException, FileReader, IOException, StringWriter } +import java.lang.{ Long => JLong } -import org.apache.spark.rdd.RDD -import org.apache.spark.graphx.{ Graph, EdgeDirection } import scala.math.BigDecimal -import org.apache.spark.sql.SparkSession import scala.reflect.runtime.universe._ -import scopt.OptionParser -import org.apache.log4j.{ Level, Logger } -import org.apache.spark.mllib.util.MLUtils -import java.io.{ FileReader, FileNotFoundException, IOException } -import org.apache.spark.mllib.linalg.Vectors -import java.lang.{ Long => JLong } -import java.lang.{ Long => JLong } -import breeze.linalg.{ squaredDistance, DenseVector, Vector } -import org.apache.spark.sql.SparkSession import scala.util.control.Breaks._ + +import breeze.linalg.{ 
squaredDistance, DenseVector, Vector } import org.apache.jena.datatypes.{ RDFDatatype, TypeMapper } -import org.apache.jena.graph.{ Node => JenaNode, Triple => JenaTriple, _ } -import org.apache.jena.riot.writer.NTriplesWriter +import org.apache.jena.graph.{ Node => JenaNode, Node_ANY, Node_Blank, Node_Literal, Node_URI, Triple => JenaTriple, _ } import org.apache.jena.riot.{ Lang, RDFDataMgr } -import org.apache.jena.graph.{ Node_ANY, Node_Blank, Node_Literal, Node_URI, Node => JenaNode, Triple => JenaTriple } +import org.apache.jena.riot.writer.NTriplesWriter +import org.apache.jena.util._ import org.apache.jena.vocabulary.RDF -import java.io.ByteArrayInputStream -import org.apache.spark.rdd.PairRDDFunctions +import org.apache.log4j.{ Level, Logger } import org.apache.spark.SparkContext._ import org.apache.spark.graphx._ -import org.apache.jena.util._ -import java.io.StringWriter -import java.io._ -import org.apache.spark.graphx.Graph +import org.apache.spark.graphx.{ EdgeDirection, Graph } +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.util.MLUtils +import org.apache.spark.rdd.PairRDDFunctions +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SparkSession +import scopt.OptionParser object BorderFlow { - def apply(spark: SparkSession, graph: Graph[String, String], output: String, outputevlsoft: String, outputevlhard: String) = { + def apply(spark: SparkSession, graph: Graph[String, String], output: String, outputevlsoft: String, outputevlhard: String): Unit = { /** * undirected graph : orient =0 @@ -51,9 +45,9 @@ object BorderFlow { graphXinBorderFlow(orient, selectYourSimilarity) } - /* - * Computes different similarities function for a given graph @graph. - */ + /** + * Computes different similarities function for a given graph @graph. + */ def graphXinBorderFlow(e: Int, f: Int): List[List[Long]] = { val edge = graph.edges.collect() @@ -197,7 +191,7 @@ object BorderFlow { f3 } - //computing f(X,V) for Heuristics BorderFlow + // computing f(X,V) for Heuristics BorderFlow def fOmega(x: List[Long], v: Long): Double = { var numberFlow = 0 @@ -223,9 +217,7 @@ object BorderFlow { var jaccardBV = 0.0 if (b.size == 0) return 0.0 for (i <- 0 until b.length) yield { - - jaccardBV = jaccardBV.+(findingSimilarity(b(i), v).abs) - + jaccardBV = jaccardBV. + (findingSimilarity(b(i), v).abs) } var jaccardVXV = 0.0 @@ -233,38 +225,13 @@ object BorderFlow { for (i <- 0 until VX.length) yield { if (VX(i) != v) { - jaccardVXV = jaccardVXV.+(findingSimilarity(VX(i), v).abs) + jaccardVXV = jaccardVXV. + (findingSimilarity(VX(i), v).abs) } } (jaccardVXV / jaccardBV) - /* - * without similarity - val nv = neighborSort.lookup(v).distinct.head.toSet - val nvX = nv.intersect(X.toSet) - val nvx = nvX.toList.diff(x).size - - - for(k <- 0 until x.length) yield{ - if(x.length>0){ - - val xk = x(k) - val bX = neighborSort.lookup(xk).distinct.head.toSet - val bxX = bX.intersect(X.toSet) - - if(bxX.toList.diff(x).size > 0 && bxX.toList.diff(x).contains(v)) { - numberFlow = numberFlow + 1 - } - - } - - } - - ( 1/(numberFlow.toDouble/ nvx.toDouble)) - * - */ } @@ -325,7 +292,7 @@ object BorderFlow { for (i <- 0 until b.length) yield { for (j <- 0 until x.length) yield { if (b(i) != x(j)) { - jaccardX = jaccardX.+(findingSimilarity(b(i), x(j)).abs) + jaccardX = jaccardX. 
+ (findingSimilarity(b(i), x(j)).abs) } } @@ -334,7 +301,7 @@ object BorderFlow { for (i <- 0 until b.length) yield { for (j <- 0 until n.length) yield { - jaccardN = jaccardN.+(findingSimilarity(b(i), n(j)).abs) + jaccardN = jaccardN. + (findingSimilarity(b(i), n(j)).abs) } } @@ -367,7 +334,7 @@ object BorderFlow { for (i <- 0 until n.length) yield { if (n(i) != u) { - jaccardNU = jaccardNU.+(findingSimilarity(u, n(i)).abs) + jaccardNU = jaccardNU. + (findingSimilarity(u, n(i)).abs) } @@ -377,15 +344,14 @@ object BorderFlow { val nu = neighborSort.lookup(u).distinct.head.toSet val nuX = nu.intersect(X.toSet).toList ( (nuX.intersect(listOfN(x))).size.toDouble) - */ jaccardNU } - /* - * Use Heuristics method for producing clusters. - */ + /** + * Use Heuristics method for producing clusters. + */ def heuristicsCluster(a: List[Long]): List[Long] = { var nj = 0.0 @@ -436,9 +402,9 @@ object BorderFlow { } - /* - * Use Non-Heuristics(normal) method for producing clusters. - */ + /** + * Use Non-Heuristics(normal) method for producing clusters. + */ def nonHeuristicsCluster(a: List[Long], d: List[Long]): List[Long] = { var nj: List[Long] = List() @@ -529,18 +495,18 @@ object BorderFlow { } - /* - * Input for heuristics heuristicsCluster(element) . - * Input for nonHeuristics nonHeuristicsCluster(element,List()) . - */ + /** + * Input for heuristics heuristicsCluster(element) . + * Input for nonHeuristics nonHeuristicsCluster(element,List()) . + */ def makeClusters(a: Long): List[Long] = { var clusters: List[Long] = List() clusters = nonHeuristicsCluster(List(a), List()) - // if(b == 1){ - // clusters = heuristicsCluster(List(a))} + // if(b == 1) { + // clusters = heuristicsCluster(List(a)) } (clusters) @@ -558,9 +524,9 @@ object BorderFlow { bigList = bigList.map(_.distinct) - /* - * Sillouhette Evaluation soft - */ + /** + * Sillouhette Evaluation soft + */ def avgAsoft(c: List[Long], d: Long): Double = { var sumA = 0.0 @@ -585,6 +551,7 @@ object BorderFlow { sumB / sizeC } + def SIsoft(a: Double, b: Double): Double = { var s = 0.0 if (a > b) { @@ -632,9 +599,9 @@ object BorderFlow { val evaluateSoft = AiBiSoft(bigList, X) - /* - * Apply Hardening - */ + /** + * Apply Hardening + */ def subset(c: List[List[Long]]): List[List[Long]] = { var C = c @@ -698,7 +665,7 @@ object BorderFlow { for (i <- 0 until c.length) yield { if (c(i) != v) { - omega = omega.+(findingSimilarity(v, c(i)).abs) + omega = omega. 
+ (findingSimilarity(v, c(i)).abs) } @@ -708,7 +675,6 @@ object BorderFlow { val nu = neighborSort.lookup(u).distinct.head.toSet val nuX = nu.intersect(X.toSet).toList ( (nuX.intersect(listOfN(x))).size.toDouble) - */ omega @@ -741,6 +707,7 @@ object BorderFlow { } C } + def nul(c: List[List[Long]]): List[List[Long]] = { var C = c var newCluster: List[List[Long]] = List() @@ -755,9 +722,9 @@ object BorderFlow { bigList = reassignment(bigList, X) bigList = nul(bigList) - /* - * Sillouhette Evaluation Hard - */ + /** + * Sillouhette Evaluation Hard + */ def avgA(c: List[Long], d: Long): Double = { var sumA = 0.0 @@ -782,6 +749,7 @@ object BorderFlow { sumB / sizeC } + def SI(a: Double, b: Double): Double = { var s = 0.0 if (a > b) { @@ -838,14 +806,14 @@ object BorderFlow { val evaluateStringRDDS = spark.sparkContext.parallelize(evaluateStringS) evaluateStringRDDS.saveAsTextFile(outputevlsoft) - //println(s"averagesoft: $avsoft\n") + // println(s"averagesoft: $avsoft\n") bigList } - /* - * convert to RDF - */ + /** + * convert to RDF + */ def makerdf(a: List[Long]): List[String] = { var listuri: List[String] = List() @@ -857,13 +825,12 @@ object BorderFlow { } listuri - } val rdf = clusterRdd.map(x => makerdf(x)) val rdfRDD = spark.sparkContext.parallelize(rdf) rdfRDD.saveAsTextFile(output) - } } + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/DBSCAN.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/DBSCAN.scala new file mode 100644 index 0000000..3c0dbfd --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/DBSCAN.scala @@ -0,0 +1,260 @@ +package net.sansa_stack.ml.spark.clustering.algorithms + +/* +* DBSCAN Distributed Edition in Spark & Scala. +* +* Authors: Panagiotis Kalampokis, Dr. 
Dimitris Skoutas +* */ + +import com.vividsolutions.jts.geom.{Coordinate, Envelope, GeometryFactory, Point} +import org.apache.spark.api.java.JavaRDD +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.rdd.RDD +import org.apache.spark.sql._ +import org.apache.spark.storage.StorageLevel._ +import org.datasyslab.geospark.enums.GridType +import org.datasyslab.geospark.spatialPartitioning.SpatialPartitioner +import org.datasyslab.geospark.spatialRDD.PointRDD +import scala.collection.mutable.{ArrayBuffer, HashMap} + +import net.sansa_stack.ml.spark.clustering.datatypes.DbPOI +import net.sansa_stack.ml.spark.clustering.datatypes.POI +import net.sansa_stack.ml.spark.clustering.utils.DBCLusterer + + +class DBSCAN() extends Serializable { + + private var clusterRDD: RDD[DbPOI] = null + private var mergingClusterNameVecBD: Broadcast[Vector[Set[String]]] = null + private var boundaryPoisToKeepHMBD : Broadcast[HashMap[String, String]] = null + private var spatialPartitionerBD : Broadcast[SpatialPartitioner] = null + + private def areIntersectingSets(set1: Set[String], set2: Set[String]): Boolean = { + set1.exists(s1 => set2.exists(s2 => s1 == s2) ) + } + + private def insertSetIntoVec(vec: Vector[Set[String]], xSet: Set[String]): Vector[Set[String]] = { + + var tmpVec = Vector[Set[String]]() + var unionSet = Set[String]() ++ xSet + + for(set_i <- vec) { + if (areIntersectingSets(set_i, unionSet)) + { + unionSet = unionSet ++ set_i + } + else + { + tmpVec = tmpVec :+ set_i + } + } + unionSet +: tmpVec + } + + protected def getExpandedEnvelopeFromPoint(p: Point, epsilon: Double): Envelope = { + val env = p.getEnvelopeInternal + env.expandBy(epsilon) + + env + } + /* + * Performs DBSCAN and Returns the clusters. + * */ + def dbclusters(pointRDD_0: RDD[Point], eps: Double, minPts: Int, spark: SparkSession) : RDD[(String, Array[(String, DbPOI)])] = { + + val pointRDD_1 = new JavaRDD[Point](pointRDD_0) + val pointRDD = new PointRDD(pointRDD_1) + + pointRDD.analyze() + + // Perform Spatial Partitioning with QuadTree + pointRDD.spatialPartitioning(GridType.QUADTREE, 16) + + // val boundaryEnvelopes = pointRDD.getPartitioner.getGrids + // writeBoundaryEnvsToFile(pointRDD, outputFile + "_Envelopes_only.txt", geometryFactory) + this.spatialPartitionerBD = spark.sparkContext.broadcast(pointRDD.getPartitioner) + + // RDD[partitionID, dbpoi] + val flatMappedRDD = pointRDD.spatialPartitionedRDD + .rdd + .mapPartitions{ + pointIter => { + val geometryFactory = new GeometryFactory() + pointIter.flatMap{ + point => { + // Get expanded by eps Envelope From Point. + val pointEnv = getExpandedEnvelopeFromPoint(point, eps) + + // Given a Geometry, it Returns a List of Partitions it overlaps. + val pIDListTuple = this.spatialPartitionerBD.value.placeObject(geometryFactory.toGeometry(pointEnv)) + + // ArrayBuffer[PIDs] + val arrBuff = ArrayBuffer[Int]() + + while (pIDListTuple.hasNext) { + val (pID, envP) = pIDListTuple.next() + arrBuff.append(pID.intValue()) + } + + // Is Boundary Point? 
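// Note: a point whose eps-expanded envelope overlaps more than one grid cell collects
// several partition ids in arrBuff; it is then emitted once per overlapping partition and
// flagged as a boundary point. Local DBSCAN runs independently inside each partition, and
// these duplicated boundary copies are what later allow clusters that straddle a partition
// border to be stitched together via mergingClusterNameVec / boundaryPoisToKeepHM.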
+ val isBoundaryP = (arrBuff.size > 1) + arrBuff.map{ + pID => { + val poi = DbPOI(point.getUserData.asInstanceOf[String], point.getX, point.getY) + if (isBoundaryP) { + poi.isBoundary = true + } + + (pID, poi) + } + } + } + } + } + } + // RDD[(pID, ArrayBuffer[DBPOI])] + val partitionRDD = flatMappedRDD.aggregateByKey(ArrayBuffer[DbPOI]())( + // SeqOp + (zArrBuffDBPoi, poi) => zArrBuffDBPoi += poi, + + // CombOp + (zArrBuffDBPoi1, zArrBuffDBPoi2) => zArrBuffDBPoi1 ++= zArrBuffDBPoi2 + ) + + + // RDD[dbpoi] + this.clusterRDD = partitionRDD.flatMap{ + case (pID, poiArrBuff) => + // New DBSCAN CLusterer For Each Partition-Envelope + val dbclusterer = DBCLusterer(eps, minPts) + + // Perform DBSCAN in each partition and return a List of Clusters: ArrayBuffer[ArrayBuffer[DBPOI]] + val clusters = dbclusterer.clusterPois(poiArrBuff) + + var i = 0 + for (cluster <- clusters) { + for (poi <- cluster) { + poi.clusterName = pID + "p" + i + } + + i = i + 1 + } + clusters.flatten + } + .persist(MEMORY_AND_DISK) + + + // Take all Boundary Pois. + // RDD[poiID, dbpoi] + val boundaryPoiRDD = this.clusterRDD.filter(_.isBoundary).map(poi => (poi.poiId, poi) ) + + + // RDD[poiID, (List[pID&cID], isDense?)] Set[pID&cID], isDensePoi? + val bPoiRDD = boundaryPoiRDD.aggregateByKey( (Set[String](), false) )( + // SeqOp + (zTuple, poi) => (zTuple._1 + poi.clusterName, zTuple._2 | poi.isDense ), + + // CombOp + (tuple1, tuple2) => (tuple1._1 ++ tuple2._1, tuple1._2 | tuple2._2) + ) + + + // Vector[Set[pID&cIID]], HashMap[poiID ,pID&cID] Vector[Set[pID&cIID]] , HashMap[poiID , pID&cID] + val (mergingClusterNameVec, boundaryPoisToKeepHM) = bPoiRDD.aggregate( ( Vector[Set[String]](), HashMap[String, String]() ))( + // SeqOp + (zTuple, xTuple) => { + + val (vec, zHashMap) = zTuple + + // [poiID, (Set[pID&cID], isDense?)] + val (poiID, (pIDcIDSet, isDense)) = xTuple + + if(isDense) { + (insertSetIntoVec(vec, pIDcIDSet), zHashMap) + } + else { + (vec, zHashMap += ((poiID, pIDcIDSet.head)) ) + } + }, + + // CombOp + (zTuple1, zTuple2) => { + val (vec1, hashMap1) = zTuple1 + val (vec2, hashMap2) = zTuple2 + val vec3 = vec2.foldLeft(vec1)((zVec, xSet) => insertSetIntoVec(zVec, xSet)) + + (vec3, hashMap1 ++= hashMap2) + } + ) + + + // Broadcast commonNames and PoisToKeep + this.mergingClusterNameVecBD = spark.sparkContext.broadcast(mergingClusterNameVec) + this.boundaryPoisToKeepHMBD = spark.sparkContext.broadcast(boundaryPoisToKeepHM) + val preFinalClusterRDD = this.clusterRDD.mapPartitions{ + poiIter => { + + val commonNameMap = this.mergingClusterNameVecBD.value.flatMap{ + nameSet => { + val commonName = nameSet.toSeq.sortWith(_ < _).mkString("c") + nameSet.map(_ -> commonName) + } + }.toMap + + poiIter.flatMap{ + poi => { + var poiIDcIDName = poi.clusterName + commonNameMap.get(poi.clusterName) match { + case Some(commonName) => poiIDcIDName = commonName + case None => () + } + var keepPoi = true + this.boundaryPoisToKeepHMBD.value.get(poi.poiId) match { + case Some(pIDcIDwhoKeepsPoi) => + if (poi.clusterName != pIDcIDwhoKeepsPoi) { + keepPoi = false + } + case None => () + } + + poi.clusterName = poiIDcIDName + if(keepPoi) { + Seq((poi.clusterName, poi)) + } + else { + Seq() + } + } + } + } + } + + + // RDD[clusterName, HashMap[poiID, poi]] + val dbclusterRDD = preFinalClusterRDD.aggregateByKey(HashMap[String, DbPOI]())( + // SeqOp + (zPoiHM, poi) => zPoiHM += ((poi.poiId, poi)), + + // CombOp + (hm1, hm2) => hm1 ++= hm2 + ) + // RDD[(String, Array[POI])] + // dbclusterRDD.foreach(println) + val k = 
dbclusterRDD.mapValues(_.toArray) + k + } + /* + * This method should be called after + * finishing using this class(e.g: writing results, or printing stats). + * */ + def clear(): Unit = { + this.clusterRDD.unpersist(true) + this.boundaryPoisToKeepHMBD.destroy() + this.mergingClusterNameVecBD.destroy() + this.spatialPartitionerBD.destroy() + } + +} + + + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/Distances.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/Distances.scala new file mode 100644 index 0000000..234fda8 --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/Distances.scala @@ -0,0 +1,17 @@ +package net.sansa_stack.ml.spark.clustering.algorithms + +class Distances { + + /** + * Jaccard Similarity Coefficient between two sets of categories corresponding to two pois + * + * @param x set of categories + * @param y set of categories + */ + def jaccardSimilarity(x: Set[String], y: Set[String]): Double = { + val union_l = x.union(y).toList.length.toDouble + val intersect_l = x.intersect(y).toList.length.doubleValue() + intersect_l / (union_l) + } +} + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/Encoder.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/Encoder.scala new file mode 100644 index 0000000..2624b8f --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/Encoder.scala @@ -0,0 +1,121 @@ +package net.sansa_stack.ml.spark.clustering.algorithms + +import org.apache.spark.ml.feature.{ VectorAssembler, Word2Vec } +import org.apache.spark.rdd._ +import org.apache.spark.sql._ +import org.apache.spark.sql.types._ + + +class Encoder { + + /** + * One hot encoding categorical data + * + * @param poiCategories, category ids with corresponding category values + * @param spark + * @return one hot encoded DataFrame for each poi + */ + def oneHotEncoding(poiCategories: RDD[(Long, Set[String])], spark: SparkSession): (DataFrame, Array[Array[Int]]) = { + // create a set to contain all categories + var set = scala.collection.mutable.Set[String]() + // put all categories to set + poiCategories.collect().foreach(x => x._2.foreach(y => set += y)) + // create columns base on the length of set + val numPOIS = poiCategories.count().toInt // Array.ofDim only accept Int + val categoryArray = set.toArray + val oneHotMatrix = Array.ofDim[Int](numPOIS, categoryArray.length + 1) // one column keep poi id + // initialize distance matrix, collect first needed + var i = 0 + poiCategories.collect().foreach(x => + { + oneHotMatrix(i)(0) = x._1.toInt + for (j <- 1 until categoryArray.length + 1) { + oneHotMatrix(i)(j) = 0 + } + x._2.foreach(y => + { // encode corresponding category value to 1 + oneHotMatrix(i)(categoryArray.indexOf(y) + 1) = 1 + }) + i += 1 + }) + // vector keep all StructField + val fields = Array.ofDim[StructField](categoryArray.length + 1) + val featureColumns = Array.ofDim[String](categoryArray.length + 1) + // keep other columns with integer type + for (i <- 0 until categoryArray.length + 1) { + fields(i) = StructField(i.toString, IntegerType, true) + featureColumns(i) = i.toString + } + val schema = new StructType(fields) + val oneHotEncodedRDD = spark.sparkContext.parallelize(oneHotMatrix).map(x => Row.fromSeq(x.toList)) + val oneHotEncodedDF = spark.createDataFrame(oneHotEncodedRDD, schema) + // set up 'features' column + val assemblerFeatures = 
new VectorAssembler().setInputCols(featureColumns.slice(1, featureColumns.length)).setOutputCol("features") + val transformedDf = assemblerFeatures.transform(oneHotEncodedDF) + (transformedDf, oneHotMatrix) + } + + /** + * word2Vec encoding + * + * @param poiCategories category ids with corresponding category values + * @param spark + * @return word2Vec encoded categories for each poi in DataFrame + */ + def wordVectorEncoder(poiCategories: RDD[(Long, Set[String])], spark: SparkSession): (DataFrame, RDD[(Int, Array[Double])]) = { + val word2vec = new Word2Vec().setInputCol("inputCol").setMinCount(1) + val schema = StructType(StructField("inputCol", ArrayType(StringType, true), true) :: Nil) + val df = spark.createDataFrame(poiCategories.map(f => Row(f._2.map(x => x.toString).toArray)), schema) + val wordVectorsRDD = word2vec.fit(df).getVectors.select("word", "vector").rdd + val vectors = wordVectorsRDD.map(f => (f.getString(0), f.getAs[org.apache.spark.ml.linalg.DenseVector](1))) + val categoryVectors = vectors.collectAsMap() + val poiCategoryVectors = poiCategories.map(f => (f._1, f._2.map(x => categoryVectors.get(x).head.toArray))) + val poiVector = poiCategoryVectors.map(f => (f._1, f._2.size, f._2.toArray.toList.transpose.map(_.sum).toArray)) + val leng = poiVector.take(1)(0)._2 + val poiAvgVector = poiVector.map(x => (x._1.toInt, x._3.map(y => y / x._2))) + val fields = Array.ofDim[StructField](leng + 1) + val featureColumns = Array.ofDim[String](leng + 1) + // keep other columns with integer type + fields(0) = StructField("id", IntegerType, true) + featureColumns(0) = "id" + for (i <- 1 until leng + 1) { + fields(i) = StructField(i.toString, DoubleType, true) + featureColumns(i) = i.toString + } + val schema2 = new StructType(fields) + val poiAvgVectorDF = spark.createDataFrame(poiAvgVector.map(x => Row.fromSeq(x._1 +: x._2)), schema2) + val assemblerFeatures = new VectorAssembler().setInputCols(featureColumns.slice(1, featureColumns.length)).setOutputCol("features") + val transformedDf = assemblerFeatures.transform(poiAvgVectorDF) + (transformedDf, poiAvgVector) + } + + /** + * multiple dimensional encoding + * + * @param distancePairs distance between pair of pois + * @param numPOIS number of pois + * @param dimension mapped coordinate dimension + * @param spark + * @return encoded coordinates for each poi in DataFrame + */ + def mdsEncoding(distancePairs: RDD[(Long, Long, Double)], numPOIS: Int, dimension: Int, spark: SparkSession): (DataFrame, Array[(Long, Array[Double])]) = { + val poi2Coordinates = new MultiDS().multiDimensionScaling(distancePairs, numPOIS, dimension) + val poi2Coordinates2 = poi2Coordinates.map(x => x._1.toInt :: x._2.toList) + // create schema + val fields = Array.ofDim[StructField](dimension + 1) + val featureColumns = Array.ofDim[String](dimension + 1) + fields(0) = StructField("id", IntegerType, true) + featureColumns(0) = "id" + for (i <- 1 until dimension + 1) { + fields(i) = StructField(i.toString, DoubleType, true) + featureColumns(i) = i.toString + } + val schema = new StructType(fields) + val coordinatesRDD = spark.sparkContext.parallelize(poi2Coordinates2.toSeq).map(x => Row.fromSeq(x)) + val coordinatesDF = spark.createDataFrame(coordinatesRDD, schema) + val assembler = new VectorAssembler().setInputCols(featureColumns.slice(1, featureColumns.length)).setOutputCol("features") + val featureData = assembler.transform(coordinatesDF) + (featureData, poi2Coordinates) + } +} + diff --git 
a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/FirstHardeninginBorderFlow.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/FirstHardeninginBorderFlow.scala similarity index 84% rename from sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/FirstHardeninginBorderFlow.scala rename to sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/FirstHardeninginBorderFlow.scala index f22f1f9..eee7c7b 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/FirstHardeninginBorderFlow.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/FirstHardeninginBorderFlow.scala @@ -1,31 +1,34 @@ -package net.sansa_stack.ml.spark.clustering +package net.sansa_stack.ml.spark.clustering.algorithms + +import java.io._ +import java.io.{ ByteArrayInputStream, FileNotFoundException, FileReader, IOException, StringWriter } +import java.lang.{ Long => JLong } +import java.net.URI -import org.apache.spark.rdd.RDD -import org.apache.spark.graphx.{ Graph, EdgeDirection } import scala.math.BigDecimal -import org.apache.spark.sql.SparkSession import scala.reflect.runtime.universe._ -import scopt.OptionParser +import scala.util.control.Breaks._ + +import breeze.linalg.{ squaredDistance, DenseVector, Vector } +import org.apache.jena.graph.Node import org.apache.log4j.{ Level, Logger } -import org.apache.spark.mllib.util.MLUtils -import java.io.{ FileReader, FileNotFoundException, IOException } +import org.apache.spark.graphx._ +import org.apache.spark.graphx.{ EdgeDirection, Graph } import org.apache.spark.mllib.linalg.Vectors -import java.lang.{ Long => JLong } -import breeze.linalg.{ squaredDistance, DenseVector, Vector } -import scala.util.control.Breaks._ -import java.io.ByteArrayInputStream +import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.PairRDDFunctions -import java.io.StringWriter -import java.io._ -import java.net.URI -import org.apache.spark.graphx._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SparkSession + + + object FirstHardeninginBorderFlow { - def apply(spark: SparkSession, graph: Graph[String, String], output: String, outputeval: String) = { + def apply(spark: SparkSession, graph: Graph[Node, Node], output: String, outputeval: String): Unit = { /** - * + * * Jaccard similarity measure : selectYourSimilarity = 0 * Batet similarity measure : selectYourSimilarity = 1 * Rodríguez and Egenhofer similarity measure : selectYourSimilarity = 2 @@ -39,9 +42,9 @@ object FirstHardeninginBorderFlow { graphXinBorderFlow(selectYourSimilarity) } - /* - * Computes different similarities function for a given graph @graph. - */ + /** + * Computes different similarities function for a given graph @graph. 
+ */ def graphXinBorderFlow(f: Int): List[List[Long]] = { val edge = graph.edges @@ -55,7 +58,8 @@ object FirstHardeninginBorderFlow { val x = f._1 x }) - + println("hard") + sort.foreach(println) var X = sort.collect() neighborSort.unpersist() @@ -63,9 +67,9 @@ object FirstHardeninginBorderFlow { val neighborcollect = neighbor.collect() val verticescollect = graph.vertices.collect() - /* - * finding neighbors for node a - */ + /** + * finding neighbors for node a + */ def findneighbors(a: VertexId): Array[VertexId] = { var b: Array[VertexId] = Array() @@ -80,15 +84,15 @@ object FirstHardeninginBorderFlow { b } - /* - * Computing logarithm based 2 - */ + /** + * Computing logarithm based 2 + */ val LOG2 = math.log(2) val log2 = { x: Double => math.log(x) / LOG2 } - /* - * Difference between two set of vertices, used in different similarity measures - */ + /** + * Difference between two set of vertices, used in different similarity measures + */ def difference(a: Array[VertexId], b: Array[VertexId]): Double = { if (a.length == 0) { return 0.0 } @@ -97,9 +101,9 @@ object FirstHardeninginBorderFlow { differ.size.toDouble } - /* - * Intersection of two set of vertices, used in different similarity measures - */ + /** + * Intersection of two set of vertices, used in different similarity measures + */ def intersection(a: Array[VertexId], b: Array[VertexId]): Double = { if ((a.length == 0) || (b.length == 0)) { return 0.0 } val rst = a.intersect(b) @@ -107,9 +111,9 @@ object FirstHardeninginBorderFlow { rst.size.toDouble } - /* - * Union of two set of vertices, used in different similarity measures - */ + /** + * Union of two set of vertices, used in different similarity measures + */ def union(a: Array[VertexId], b: Array[VertexId]): Double = { val rst = a.union(b) @@ -117,17 +121,17 @@ object FirstHardeninginBorderFlow { rst.size.toDouble } - /* - * similarity measures - */ + /** + * similarity measures + */ def selectSimilarity(a: Array[VertexId], b: Array[VertexId], c: Int): Double = { var s = 0.0 if (c == 0) { - /* - * Jaccard similarity measure - */ + /** + * Jaccard similarity measure + */ val sim = intersection(a, b) / union(a, b).toDouble if (sim == 0.0) { s = (1 / vertex) } @@ -137,9 +141,9 @@ object FirstHardeninginBorderFlow { if (c == 1) { - /* - * Rodríguez and Egenhofer similarity measure - */ + /** + * Rodríguez and Egenhofer similarity measure + */ var g = 0.8 @@ -149,9 +153,9 @@ object FirstHardeninginBorderFlow { } if (c == 2) { - /* - * The Ratio model similarity - */ + /** + * The Ratio model similarity + */ var alph = 0.5 var beth = 0.5 @@ -162,15 +166,14 @@ object FirstHardeninginBorderFlow { } if (c == 3) { - /* - * Batet similarity measure - */ + /** + * Batet similarity measure + */ val cal = 1 + ((difference(a, b) + difference(b, a)) / (difference(a, b) + difference(b, a) + intersection(a, b))).abs val sim = log2(cal.toDouble) if (sim == 0.0) { s = (1 / vertex) } else { s = sim } - } s } @@ -222,7 +225,7 @@ object FirstHardeninginBorderFlow { val sortsim = sumsimilarity(X) - //println(s"sortsim: $sortsim\n") + // println(s"sortsim: $sortsim\n") var node = sortsim.map(f => { f._1 @@ -232,7 +235,7 @@ object FirstHardeninginBorderFlow { neighbor.unpersist() - //computing F(X) for BorderFlow + // computing F(X) for BorderFlow def fX(x: List[Long]): Double = { var jaccardX = 0.0 @@ -276,13 +279,13 @@ object FirstHardeninginBorderFlow { b.map(bi => { x.map(xj => { - if (bi.!=(xj)) { jaccardX = jaccardX.+(findingSimilarity(bi, xj).abs) } + if (bi.!=(xj)) { jaccardX = jaccardX. 
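selectSimilarity above switches between several neighbour-set measures (Jaccard, Rodríguez and Egenhofer, the Ratio model, Batet), each falling back to a small non-zero score (1 / vertex in the code) when two vertices share no neighbours. A minimal standalone sketch of the Jaccard case (c == 0); the helper name and fallback parameter are illustrative:

// Neighbour-set Jaccard similarity: |N(a) ∩ N(b)| / |N(a) ∪ N(b)|, with a small
// fallback score for fully dissimilar pairs (sketch of the c == 0 branch above).
def jaccardNeighbours(a: Set[Long], b: Set[Long], fallback: Double): Double = {
  val union = a.union(b)
  val sim = if (union.isEmpty) 0.0 else a.intersect(b).size.toDouble / union.size
  if (sim == 0.0) fallback else sim
}

// jaccardNeighbours(Set(1L, 2L, 3L), Set(2L, 3L, 4L), 0.01) == 0.5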
+ (findingSimilarity(bi, xj).abs) } }) }) b.map(bi => { n.map(nj => { - jaccardN = jaccardN.+(findingSimilarity(bi, nj).abs) + jaccardN = jaccardN. + (findingSimilarity(bi, nj).abs) }) }) @@ -309,16 +312,16 @@ object FirstHardeninginBorderFlow { val n = listOfN(x) var jaccardNU = 0.0 n.map(ni => { - if (ni.!=(u)) { jaccardNU = jaccardNU.+(findingSimilarity(u, ni).abs) } + if (ni.!=(u)) { jaccardNU = jaccardNU. + (findingSimilarity(u, ni).abs) } }) jaccardNU } - /* - * Use Non-Heuristics(normal) method for producing clusters. - */ + /** + * Use Non-Heuristics(normal) method for producing clusters. + */ def nonHeuristicsCluster(a: List[Long], d: List[Long]): List[Long] = { var nj: List[Long] = List() @@ -399,22 +402,20 @@ object FirstHardeninginBorderFlow { } - /* - * - * Input for nonHeuristics nonHeuristicsCluster(element,List()) . - */ + /** + * Input for nonHeuristics nonHeuristicsCluster(element,List()) . + */ def makerdf(a: List[Long]): List[String] = { var listuri: List[String] = List() val b: List[VertexId] = a for (i <- 0 until b.length) { verticescollect.map(v => { - if (b(i) == v._1) listuri = listuri.::(v._2) + if (b(i) == v._1) listuri = listuri.::(v._2.toString()) }) } listuri - } def makeClusters(a: Long): List[Long] = { @@ -453,13 +454,13 @@ object FirstHardeninginBorderFlow { } while (node.size > 0) neighborSort.unpersist() - //println(s"RDF Cluster assignments: $rdfcluster\n") + // println(s"RDF Cluster assignments: $rdfcluster\n") val rdfRDD = spark.sparkContext.parallelize(rdfcluster) rdfRDD.saveAsTextFile(output) - /* - * Sillouhette Evaluation - */ + /** + * Sillouhette Evaluation + */ def avgA(c: List[Long], d: Long): Double = { var sumA = 0.0 @@ -530,7 +531,7 @@ object FirstHardeninginBorderFlow { val evaluate = AiBi(bigList, nnode) val av = evaluate.sum / evaluate.size - //println(s"average: $av\n") + // println(s"average: $av\n") val evaluateString: List[String] = List(av.toString()) val evaluateStringRDD = spark.sparkContext.parallelize(evaluateString) @@ -540,8 +541,6 @@ object FirstHardeninginBorderFlow { } val rdf = clusterRdd() - //println(s"RDF Cluster assignments: $rdf\n") - + // println(s"RDF Cluster assignments: $rdf\n") } - } diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/Kmeans.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/Kmeans.scala new file mode 100644 index 0000000..a90f876 --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/Kmeans.scala @@ -0,0 +1,28 @@ +package net.sansa_stack.ml.spark.clustering.algorithms + +import org.apache.spark.ml.clustering.KMeans +import org.apache.spark.sql._ +import org.apache.spark.sql.SparkSession + +class Kmeans { + + /** + * K-means clustering based on given Dataframe + * + * @param numClusters + * @param df + * @param spark + * @return cluster id and corresponding pois in cluster + */ + def kmClustering(numClusters: Int, maxIter: Int, df: DataFrame, spark: SparkSession): Map[Int, Array[Long]] = { + val km = new KMeans().setK(numClusters).setMaxIter(maxIter).setSeed(1L).setFeaturesCol("features").setPredictionCol("prediction") + val model = km.fit(df) + val transformedDataFrame = model.transform(df) + import spark.implicits._ + // get (cluster_id, poi_id) + val clusterIdPoi = transformedDataFrame.map(f => (f.getInt(f.size - 1), f.getInt(0).toLong)).rdd.groupByKey() + val clustersMDSKM = clusterIdPoi.map(x => (x._1, x._2.toArray)).collectAsMap().toMap + clustersMDSKM + } +} + diff --git 
a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/MultiDS.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/MultiDS.scala new file mode 100644 index 0000000..a9cc699 --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/MultiDS.scala @@ -0,0 +1,50 @@ +package net.sansa_stack.ml.spark.clustering.algorithms + +import org.apache.spark.rdd._ +import smile.mds.MDS + +class MultiDS { + + /** + * Multi-dimensional scaling + * Generate n dimensional coordinates based on input similarity matrix + * + * @param distancePairs distance between pair of poi + * @param numPOIS number of poi + * @param dimension dimension of generated coordinates + * @return poi id and coordinates in given dimension + */ + def multiDimensionScaling(distancePairs: RDD[(Long, Long, Double)], numPOIS: Int, dimension: Int): Array[(Long, Array[Double])] = { + // vector keep recorded poi + var vector = Array.ofDim[Long](numPOIS) + // positive symmetric distance matrix + var distanceMatrix = Array.ofDim[Double](numPOIS, numPOIS) + // initialize distance matrix + for (i <- 0 until numPOIS) { + vector(i) = 0 + for (j <- 0 until numPOIS) { + distanceMatrix(i)(j) = 0.0 + } + } + var i = 0 + distancePairs.collect().foreach(x => { + if (!vector.contains(x._1)) { // if there is no record for this poi + vector(i) = x._1 + i += 1 + } + if (!vector.contains(x._2)) { // if there is no record for this poi + vector(i) = x._2 + i += 1 + } + val i1 = vector.indexOf(x._1) // get the index as x-y axis for matrix + val i2 = vector.indexOf(x._2) // get the index as x-y axis for matrix + distanceMatrix(i1)(i2) = x._3 + distanceMatrix(i2)(i1) = x._3 + }) + // create coordinates + val mds = new MDS(distanceMatrix, dimension, true) + mds.getCoordinates.zip(vector).map(x => (x._2, x._1)) + } +} + + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/PIC.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/PIC.scala new file mode 100644 index 0000000..3688318 --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/PIC.scala @@ -0,0 +1,34 @@ +package net.sansa_stack.ml.spark.clustering.algorithms + +import org.apache.spark.graphx.Edge +import org.apache.spark.graphx.Graph +import org.apache.spark.mllib.clustering.PowerIterationClustering +import org.apache.spark.rdd._ +import org.apache.spark.sql._ + + +class PIC { + + /* + * Power Iteration clustering algorithm from Spark standard library + * */ + def picSparkML(pairwisePOISimilarity: RDD[(Long, Long, Double)], numCentroids: Int, numIterations: Int, sparkSession: SparkSession): Map[Int, Array[Long]] = { + val model = new PowerIterationClustering().setK(numCentroids).setMaxIterations(numIterations).setInitializationMode("degree").run(pairwisePOISimilarity) + val clusters = model.assignments.collect().groupBy(_.cluster).mapValues(_.map(_.id)) + clusters + } +/* + * Power Iteration using implementation from SANSA + * */ + def picSANSA(pairwisePOISimilarity: RDD[(Long, Long, Double)], numCentroids: Int, numIterations: Int, sparkSession: SparkSession) { + val verticeS = pairwisePOISimilarity.map(f => f._1) + val verticeD = pairwisePOISimilarity.map(f => f._2) + val indexedMap = verticeS.union(verticeD).distinct().zipWithIndex() + val vertices = indexedMap.map(f => (f._2, f._1)) + val edges = pairwisePOISimilarity.map(f => Edge(f._1, f._2, f._3)) // from similarity to int + val similarityGraph = 
Graph(vertices, edges) + // val model = new RDFGraphPICClustering(sparkSession, similarityGraph, numCentroids, numIterations) + } +} + + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/RDFByModularityClustering.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/RDFByModularityClustering.scala similarity index 88% rename from sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/RDFByModularityClustering.scala rename to sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/RDFByModularityClustering.scala index 55e508c..47a498e 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/RDFByModularityClustering.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/RDFByModularityClustering.scala @@ -1,19 +1,21 @@ -package net.sansa_stack.ml.spark.clustering +package net.sansa_stack.ml.spark.clustering.algorithms + +import java.io.StringWriter + +import scala.util.control.Breaks._ import org.apache.log4j.{ Level, Logger } +import org.apache.spark.{ SparkConf, SparkContext } import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD -import org.apache.spark.{ SparkConf, SparkContext } -import scala.util.control.Breaks._ -import java.io.StringWriter /** * Created by hpetzka on 09.11.2016. */ object RDFByModularityClustering { - def apply(sc: SparkContext, numIterations: Int, graphFile: String, outputFile: String) = { + def apply(sc: SparkContext, numIterations: Int, graphFile: String, outputFile: String): Unit = { // DEFAULT INPUT // val (numIterations, graphFile) = (100 , "C:/Users/hpetzka/IdeaProjects/Clustering_in_Spark/Graphs/testRDF.txt") @@ -39,7 +41,7 @@ object RDFByModularityClustering { If weight edges come in, one probably needa a map as follows val adjacencyMatrix: Array[Array[Int]] = Array.ofDim[Int](numVertices, numVertices) var adjacencies: Map[(String, String), Int] = Map[(String, String),Int]() - for (x <- edgesRDD.collect()){ + for (x <- edgesRDD.collect()) { // TODO add the weights here if they exist if(x(0) < x(1)) adjacencies += ( (x(0),x(1)) -> 1 ) else adjacencies += ( (x(1),x(0)) -> 1 ) @@ -117,11 +119,12 @@ object RDFByModularityClustering { } - def iterationStepClusteringRDFByModularity(numEdges: Long, - edgesBC: Broadcast[Array[(String, String)]], - vertexDegreesBC: Broadcast[Map[String, Int]], - clusterMapRDD: RDD[List[String]], - sc: SparkContext): (RDD[List[String]], Boolean) = { + def iterationStepClusteringRDFByModularity( + numEdges: Long, + edgesBC: Broadcast[Array[(String, String)]], + vertexDegreesBC: Broadcast[Map[String, Int]], + clusterMapRDD: RDD[List[String]], + sc: SparkContext): (RDD[List[String]], Boolean) = { // Start iteration // The following RDD contains distinct pairs of clusters for which there is an edge between them @@ -172,11 +175,12 @@ object RDFByModularityClustering { // The function that computes delta Q for the merge of two clusters - def deltaQ(numEdges: Long, - vertexDegreesBC: Broadcast[Map[String, Int]], - edgesBC: Broadcast[Array[(String, String)]], - clusterI: List[String], - clusterJ: List[String]): Double = { + def deltaQ( + numEdges: Long, + vertexDegreesBC: Broadcast[Map[String, Int]], + edgesBC: Broadcast[Array[(String, String)]], + clusterI: List[String], + clusterJ: List[String]): Double = { val clusterPairs: List[(String, String)] = clusterI.flatMap(x => clusterJ.map(y => (x, y))) @@ -189,12 +193,9 @@ object RDFByModularityClustering { 1.0 / numEdges * 
summand.fold(0.0)((a: Double, b: Double) => a - b) } - def WriteToFile[T](rdd: RDD[T], file: String, coalesce: (Boolean, Int) = (false, 0)) = + def WriteToFile[T](rdd: RDD[T], file: String, coalesce: (Boolean, Int) = (false, 0)): Unit = coalesce._1 match { - case true => rdd.coalesce(coalesce._2).saveAsTextFile(file) + case true => rdd.coalesce(coalesce._2).saveAsTextFile(file) case false => rdd.saveAsTextFile(file) } - } - - diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/RDFGraphPowerIterationClustering.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/RDFGraphPowerIterationClustering.scala similarity index 87% rename from sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/RDFGraphPowerIterationClustering.scala rename to sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/RDFGraphPowerIterationClustering.scala index 15a3234..e49c871 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/RDFGraphPowerIterationClustering.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/RDFGraphPowerIterationClustering.scala @@ -1,46 +1,36 @@ -package net.sansa_stack.ml.spark.clustering +package net.sansa_stack.ml.spark.clustering.algorithms -import scala.reflect.runtime.universe._ -import scopt.OptionParser -import org.apache.log4j.{ Level, Logger } -import org.apache.spark.mllib.util.MLUtils -import java.io.{ FileReader, FileNotFoundException, IOException } -import org.apache.spark.mllib.linalg.Vectors -import java.lang.{ Long => JLong } +import java.io._ +import java.io.{ ByteArrayInputStream, FileNotFoundException, FileReader, IOException, StringWriter } import java.lang.{ Long => JLong } +import java.net.URI + +import scala.collection.mutable +import scala.math.BigDecimal +import scala.reflect.runtime.universe._ + import breeze.linalg.{ squaredDistance, DenseVector, Vector } -import org.apache.spark.sql.SparkSession -import org.apache.spark.graphx.GraphLoader +import org.apache.commons.math3.util.MathUtils import org.apache.jena.datatypes.{ RDFDatatype, TypeMapper } -import org.apache.jena.graph.{ Node => JenaNode, Triple => JenaTriple, _ } -import org.apache.jena.riot.writer.NTriplesWriter +import org.apache.jena.graph.{ Node => JenaNode, Node_ANY, Node_Blank, Node_Literal, Node_URI, Triple => JenaTriple, _ } import org.apache.jena.riot.{ Lang, RDFDataMgr } -import org.apache.jena.graph.{ Node_ANY, Node_Blank, Node_Literal, Node_URI, Node => JenaNode, Triple => JenaTriple } +import org.apache.jena.riot.writer.NTriplesWriter import org.apache.jena.vocabulary.RDF -import java.io.ByteArrayInputStream -import org.apache.spark.rdd.PairRDDFunctions +import org.apache.log4j.{ Level, Logger } import org.apache.spark.SparkContext._ import org.apache.spark.graphx._ +import org.apache.spark.graphx.{ EdgeDirection, Graph, GraphLoader } +import org.apache.spark.mllib.clustering.{ PowerIterationClustering, PowerIterationClusteringModel } +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.util.MLUtils +import org.apache.spark.rdd.PairRDDFunctions import org.apache.spark.rdd.RDD -import java.io.StringWriter -import java.io._ -import org.apache.spark.rdd.RDD -import org.apache.spark.mllib.clustering.{ PowerIterationClusteringModel, PowerIterationClustering } -import org.apache.spark.graphx.{ Graph, EdgeDirection } -import scala.math.BigDecimal -import org.apache.commons.math3.util.MathUtils import 
org.apache.spark.sql.SparkSession -import org.apache.spark.graphx._ -import java.net.URI import org.apache.spark.storage.StorageLevel -import org.apache.spark.graphx._ -import scala.collection.mutable object RDFGraphPowerIterationClustering { - def apply(spark: SparkSession, graph: Graph[String, String], output: String, k: Int = 2, maxIterations: Int = 5) = { - - + def apply(spark: SparkSession, graph: Graph[String, String], output: String, k: Int = 2, maxIterations: Int = 5): RDD[(Int, String)] = { def clusterRdd(): RDD[(Int, String)] = { SimilaritesInPIC() @@ -48,16 +38,16 @@ object RDFGraphPowerIterationClustering { def SimilaritesInPIC(): RDD[(Int, String)] = { - /* - * Collect all the edges of the graph - */ + /** + * Collect all the edges of the graph + */ val edge = graph.edges val nodes = graph.vertices - /* - * Collect distinct vertices of the graph - * - */ + /** + * Collect distinct vertices of the graph + * + */ val node = nodes.map(e => (e._1)) @@ -94,9 +84,9 @@ object RDFGraphPowerIterationClustering { def model = pic.run(weightedGraph) - /* - * Cluster the graph data into two classes using PowerIterationClustering - */ + /** + * Cluster the graph data into two classes using PowerIterationClustering + */ def run() = model val modelAssignments = model.assignments diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/SilviaClustering.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/SilviaClustering.scala similarity index 91% rename from sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/SilviaClustering.scala rename to sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/SilviaClustering.scala index d370a29..5aa3313 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/SilviaClustering.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/algorithms/SilviaClustering.scala @@ -1,36 +1,31 @@ -package net.sansa_stack.ml.spark.clustering +package net.sansa_stack.ml.spark.clustering.algorithms + +import java.io._ +import java.io.{ ByteArrayInputStream, FileNotFoundException, FileReader, IOException, StringWriter } +import java.lang.{ Long => JLong } +import java.net.URI -import org.apache.spark.rdd.RDD -import org.apache.spark.graphx.{ Graph, EdgeDirection } import scala.math.BigDecimal -import org.apache.spark.sql.SparkSession import scala.reflect.runtime.universe._ -import scopt.OptionParser -import org.apache.log4j.{ Level, Logger } -import org.apache.spark.mllib.util.MLUtils -import java.io.{ FileReader, FileNotFoundException, IOException } -import org.apache.spark.mllib.linalg.Vectors -import java.lang.{ Long => JLong } -import java.lang.{ Long => JLong } -import breeze.linalg.{ squaredDistance, DenseVector, Vector } -import org.apache.spark.sql.SparkSession -import org.apache.spark.graphx.GraphLoader import scala.util.control.Breaks._ + +import breeze.linalg.{ squaredDistance, DenseVector, Vector } +import net.sansa_stack.rdf.spark.model.graph._ +import org.apache.jena.graph.{ Node, Triple } import org.apache.jena.riot.{ Lang, RDFDataMgr } -import java.io.ByteArrayInputStream -import org.apache.spark.rdd.PairRDDFunctions +import org.apache.log4j.{ Level, Logger } import org.apache.spark.SparkContext._ import org.apache.spark.graphx._ -import java.io.StringWriter -import java.io._ -import org.apache.jena.graph.{ Node, Triple } -import org.apache.jena.riot.Lang -import net.sansa_stack.rdf.spark.model.graph._ -import java.net.URI 
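Both the new PIC helper class and RDFGraphPowerIterationClustering above delegate the actual clustering to Spark MLlib's PowerIterationClustering over an RDD of (srcId, dstId, similarity) triples. Below is a condensed, self-contained sketch of that call, assuming a local SparkSession and toy similarity values; the object name and all numbers are illustrative, and only the MLlib API usage mirrors the code in this patch.

import org.apache.spark.mllib.clustering.PowerIterationClustering
import org.apache.spark.sql.SparkSession

object PICSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("pic-sketch").getOrCreate()
    // (srcId, dstId, similarity): symmetric, non-negative weights
    val similarities = spark.sparkContext.parallelize(Seq(
      (0L, 1L, 0.9), (1L, 2L, 0.8), (2L, 3L, 0.1), (3L, 4L, 0.7), (4L, 5L, 0.9)))
    val model = new PowerIterationClustering()
      .setK(2)
      .setMaxIterations(10)
      .setInitializationMode("degree")
      .run(similarities)
    // group point ids by assigned cluster id, as picSparkML does
    val clusters = model.assignments.collect().groupBy(_.cluster).mapValues(_.map(_.id))
    clusters.foreach { case (c, ids) => println(s"cluster $c: ${ids.mkString(", ")}") }
    spark.stop()
  }
}

The "degree" initialization matches the setting used in picSparkML; "random" is the other initialization mode MLlib documents.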
+import org.apache.spark.graphx.{ EdgeDirection, Graph, GraphLoader } +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.util.MLUtils +import org.apache.spark.rdd.PairRDDFunctions +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SparkSession object SilviaClustering { - def apply(spark: SparkSession, graph: Graph[String, String], output: String, outputeval: String) = { + def apply(spark: SparkSession, graph: Graph[String, String], output: String, outputeval: String): Unit = { Logger.getRootLogger.setLevel(Level.WARN) @@ -40,7 +35,7 @@ object SilviaClustering { * * Jaccard similarity measure : selectYourSimilarity = 0 * Batet similarity measure : selectYourSimilarity = 1 - * Rodríguez and Egenhofer similarity measure : selectYourSimilarity = 2 + * Rodriguez and Egenhofer similarity measure : selectYourSimilarity = 2 * The Contrast model similarity : selectYourSimilarity = 3 * The Ratio model similarity : selectYourSimilarity = 4 */ @@ -48,12 +43,13 @@ object SilviaClustering { val selectYourSimilarity = 0 def clusterRdd(): RDD[List[String]] = { + val a = graph.triplets graphXinBorderFlow(graph, orient, selectYourSimilarity) } - /* - * Computes different similarities function for a given graph @graph. - */ + /** + * Computes different similarities function for a given graph @graph. + */ def graphXinBorderFlow(graph: Graph[String, String], e: Int, f: Int): RDD[List[String]] = { val edge = graph.edges.collect() @@ -77,9 +73,9 @@ object SilviaClustering { val LOG2 = math.log(2) val log2 = { x: Double => math.log(x) / LOG2 } - /* - * Difference between two set of vertices, used in different similarity measures - */ + /** + * Difference between two set of vertices, used in different similarity measures + */ def difference(a: Long, b: Long): Double = { val ansec = neighbor.lookup(a).distinct.head.toSet val ansec1 = neighbor.lookup(b).distinct.head.toSet @@ -90,9 +86,9 @@ object SilviaClustering { differ.size.toDouble } - /* - * Intersection of two set of vertices, used in different similarity measures - */ + /** + * Intersection of two set of vertices, used in different similarity measures + */ def intersection(a: Long, b: Long): Double = { val inters = neighbor.lookup(a).distinct.head.toList val inters1 = neighbor.lookup(b).distinct.head.toList @@ -106,9 +102,9 @@ object SilviaClustering { rst.size.toDouble } - /* - * Union of two set of vertices, used in different similarity measures - */ + /** + * Union of two set of vertices, used in different similarity measures + */ def union(a: Long, b: Long): Double = { val uni = neighbor.lookup(a).distinct.head.toList val uni1 = neighbor.lookup(b).distinct.head.toList @@ -124,9 +120,9 @@ object SilviaClustering { var s = 0.0 if (c == 0) { - /* - * Jaccard similarity measure - */ + /** + * Jaccard similarity measure + */ val sim = intersection(a, b) / union(a, b).toDouble @@ -136,9 +132,9 @@ object SilviaClustering { if (c == 1) { - /* - * Rodríguez and Egenhofer similarity measure - */ + /** + * Rodríguez and Egenhofer similarity measure + */ var g = 0.8 @@ -148,9 +144,10 @@ object SilviaClustering { } if (c == 2) { - /* - * The Ratio model similarity - */ + + /** + * The Ratio model similarity + */ var alph = 0.5 var beth = 0.5 @@ -161,9 +158,9 @@ object SilviaClustering { } if (c == 3) { - /* - * Batet similarity measure - */ + /** + * Batet similarity measure + */ val cal = 1 + ((difference(a, b) + difference(b, a)) / (difference(a, b) + difference(b, a) + intersection(a, b))).abs val sim = log2(cal.toDouble) @@ 
-518,11 +515,8 @@ object SilviaClustering { result } - val cRdd = clusterRdd() - - cRdd.saveAsTextFile(output) - + val zipwithindex = cRdd.zipWithIndex().map(f => (f._2, f._1)) + zipwithindex.saveAsTextFile(output) } - } diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/AppConfig.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/AppConfig.scala new file mode 100644 index 0000000..b5a11c5 --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/AppConfig.scala @@ -0,0 +1,30 @@ +package net.sansa_stack.ml.spark.clustering.datatypes + +case class Spark(master: String, + spark_serializer: String, + spark_executor_memory: String, + spark_driver_memory: String, + spark_driver_maxResultSize: String, + app_name: String) + +case class Clustering(profile: String, + pic: String, + oneHotKM: String, + mdsKM: String, + word2VecKM: String, + picDistanceMatrix: String, + mdsCoordinates: String, + oneHotMatrix: String, + word2Vec: String) + +case class Datasets(input: String, + termValueUri: String, + termPrefix: String, + typePOI: String, + coordinatesPredicate: String, + categoryPOI: String, + poiPrefix: String) + +case class AppConfig(dataset: Datasets, clustering: Clustering, spark: Spark) + + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Categories.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Categories.scala new file mode 100644 index 0000000..bf693e1 --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Categories.scala @@ -0,0 +1,7 @@ +package net.sansa_stack.ml.spark.clustering.datatypes + +/** + * @param categories a set of category values + */ +case class Categories(categories: scala.collection.mutable.Set[String]) + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Cluster.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Cluster.scala new file mode 100644 index 0000000..d342e65 --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Cluster.scala @@ -0,0 +1,10 @@ +package net.sansa_stack.ml.spark.clustering.datatypes + +/** + * a cluster + * + * @param cluster_id id of cluster + * @param poi_in_cluster an array of pois in cluster + */ +case class Cluster(cluster_id: Int, poi_in_cluster: Array[Poi]) + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Clusters.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Clusters.scala new file mode 100644 index 0000000..3646c9b --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Clusters.scala @@ -0,0 +1,9 @@ +package net.sansa_stack.ml.spark.clustering.datatypes + +/** + * @param numOfClusters number of clusters + * @param clusterSizes size of each cluster + * @param clusters a list of cluster + */ +case class Clusters(numOfClusters: Int, clusterSizes: Array[Int], clusters: List[Cluster]) + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/CoordinatePOI.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/CoordinatePOI.scala new file mode 100644 index 0000000..b722a73 --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/CoordinatePOI.scala @@ -0,0 +1,10 @@ +package 
net.sansa_stack.ml.spark.clustering.datatypes + +/** + * a coordinate + * + * @param longitude + * @param latitude + */ +case class CoordinatePOI(longitude: Double, latitude: Double) + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/DbPOI.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/DbPOI.scala new file mode 100644 index 0000000..67aa807 --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/DbPOI.scala @@ -0,0 +1,14 @@ +package net.sansa_stack.ml.spark.clustering.datatypes + +import net.sansa_stack.ml.spark.clustering.datatypes.DbStatusEnum._ + +case class DbPOI(val poiId: String, + val lon: Double, + val lat: Double) { + + var dbstatus = UNDEFINED + var isDense = false + var isBoundary = false + var clusterName = "" +} + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/DbStatusEnum.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/DbStatusEnum.scala new file mode 100644 index 0000000..cceca32 --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/DbStatusEnum.scala @@ -0,0 +1,7 @@ +package net.sansa_stack.ml.spark.clustering.datatypes + +object DbStatusEnum extends Enumeration { + + type DBSTATUS = Value + val UNDEFINED, NOISE, PARTOFCLUSTER = Value +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Distance.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Distance.scala new file mode 100644 index 0000000..efa7596 --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Distance.scala @@ -0,0 +1,9 @@ +package net.sansa_stack.ml.spark.clustering.datatypes + +/** + * @param poi1 + * @param poi2 + * @param distance distance between poi1 and poi2 + */ +case class Distance(poi1: Long, poi2: Long, distance: Double) + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/DistanceMatrix.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/DistanceMatrix.scala new file mode 100644 index 0000000..27ff94d --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/DistanceMatrix.scala @@ -0,0 +1,4 @@ +package net.sansa_stack.ml.spark.clustering.datatypes + +case class DistanceMatrix(distances: List[Distance]) + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/MdsCoordinate.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/MdsCoordinate.scala new file mode 100644 index 0000000..87aadfa --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/MdsCoordinate.scala @@ -0,0 +1,4 @@ +package net.sansa_stack.ml.spark.clustering.datatypes + +case class MdsCoordinate (poiID: Long, coordinate: Array[Double]) + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/MdsCoordinates.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/MdsCoordinates.scala new file mode 100644 index 0000000..5318cc2 --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/MdsCoordinates.scala @@ -0,0 +1,4 @@ +package net.sansa_stack.ml.spark.clustering.datatypes + +case class MdsCoordinates(coordinates: Array[MdsCoordinate]) + diff --git 
a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/POI.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/POI.scala new file mode 100644 index 0000000..0c3bba4 --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/POI.scala @@ -0,0 +1,14 @@ +package net.sansa_stack.ml.spark.clustering.datatypes + +import com.vividsolutions.jts.geom.{Coordinate, GeometryFactory} + +class POI( + id: String, + name: String, + val x : Double, + val y : Double, + keywords: List[String], + score: Double, + geometryFactory: GeometryFactory + ) extends SpatialObject(id, name, keywords, score, geometryFactory.createPoint(new Coordinate(x, y))) + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Poi.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Poi.scala new file mode 100644 index 0000000..2e15ac9 --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/Poi.scala @@ -0,0 +1,11 @@ +package net.sansa_stack.ml.spark.clustering.datatypes + +/** + * Poi object representing a point of interest + * + * @param poi_id, id of poi + * @param coordinate, coordinate of poi + * @param categories, categories of poi + */ +case class Poi(poi_id: Long, coordinate: CoordinatePOI, categories: Categories, review: Double) + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/SpatialObject.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/SpatialObject.scala new file mode 100644 index 0000000..2388fb1 --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/datatypes/SpatialObject.scala @@ -0,0 +1,23 @@ +package net.sansa_stack.ml.spark.clustering.datatypes + +import com.vividsolutions.jts.geom.Geometry +import scala.collection.mutable.HashMap + +class SpatialObject( + var id: String, + var name: String, + var keywords: List[String], + var score: Double, + var geometry: Geometry + ) extends Ordered[SpatialObject]{ + + var attributes = HashMap[Object, Object]() + + // @Override + override def compare(o: SpatialObject ): Int = { + if (this.score > o.score) -1 + else if (this.score == o.score) 0 + else 1 + } +} + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/Common.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/Common.scala new file mode 100644 index 0000000..0044bda --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/Common.scala @@ -0,0 +1,71 @@ +package net.sansa_stack.ml.spark.clustering.utils + +import java.io.PrintWriter + +import org.apache.jena.graph.{ NodeFactory, Triple} +import org.apache.spark.SparkContext +import org.apache.spark.rdd.RDD +import org.json4s.DefaultFormats +import org.json4s.jackson.Serialization + +import net.sansa_stack.ml.spark.clustering.datatypes.{Cluster, Clusters, Poi} + +object Common { + val prefixID = "http://example.org/id/poi/" + val prefixCategory = "http://example.org/hasCategory" + val prefixCoordinate = "http://example.org/id/hasCoordinate/" + + + /** + * create a pair RDD and join with another pair RDD + * + * @param sparkContext + * @param ids an array with poi id + * @param pairs + * @return an array of poi + */ + def join(sparkContext: SparkContext, ids: Array[Long], pairs: RDD[(Long, Poi)]): Array[Poi] = { + val idsPair = sparkContext.parallelize(ids).map(x => (x, x)) + 
idsPair.join(pairs).map(x => x._2._2).collect() + } + + /** + * serialize clustering results to file + * + * @param sparkContext + * @param clusters clustering results + * @param pois pois object + * @return + */ + def writeClusteringResult(sparkContext: SparkContext, clusters: Map[Int, Array[Long]], pois: RDD[Poi], fileWriter: PrintWriter): Unit = { + val assignments = clusters.toList.sortBy { case (k, v) => v.length } + val poisKeyPair = pois.keyBy(f => f.poi_id).persist() + val clustersPois = Clusters(assignments.size, assignments.map(_._2.length).toArray, assignments.map(f => Cluster(f._1, join(sparkContext, f._2, poisKeyPair)))) + implicit val formats = DefaultFormats + Serialization.writePretty(clustersPois, fileWriter) + } + /** + * serialize clustering results to .nt file + */ + def seralizeToNT(sparkContext: SparkContext, clusters: Map[Int, Array[Long]], pois: RDD[Poi]): Unit = { + val assignments = clusters.toList.sortBy { case (k, v) => v.length } + val poisKeyPair = pois.keyBy(f => f.poi_id).persist() + val newAssignment = assignments.map(f => (f._1, sparkContext.parallelize(f._2).map(x => (x, x)).join(poisKeyPair).map(x => ( x._2._2.poi_id, x._2._2.categories, x._2._2.coordinate)).collect())) + val newAssignmentRDD = sparkContext.parallelize(newAssignment) + println(newAssignmentRDD.count()) + val newAssignmentRDDTriple = newAssignmentRDD.map(cluster => (cluster._1, cluster._2.flatMap(poi => + {List(new Triple(NodeFactory.createURI(prefixID + poi._1.toString), + NodeFactory.createURI(prefixCategory), + NodeFactory.createLiteral(poi._2.categories.mkString(","))), + new Triple(NodeFactory.createURI(prefixID + poi._1.toString), + NodeFactory.createURI(prefixCoordinate), + NodeFactory.createLiteral((poi._3.latitude, poi._3.longitude).toString())) + )} + ).toList) + ) + newAssignmentRDDTriple.saveAsTextFile("results/triples") + } + +} + + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/DBCLusterer.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/DBCLusterer.scala new file mode 100644 index 0000000..1609d60 --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/DBCLusterer.scala @@ -0,0 +1,78 @@ +package net.sansa_stack.ml.spark.clustering.utils + +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer + +import net.sansa_stack.ml.spark.clustering.datatypes.DbPOI +import net.sansa_stack.ml.spark.clustering.datatypes.DbStatusEnum._ + +case class DBCLusterer(val eps: Double, val minPts: Int) { + + def clusterPois(poiArrBuff: ArrayBuffer[DbPOI]): ArrayBuffer[ArrayBuffer[DbPOI]] = { + + val clusterArrBuff = ArrayBuffer[ArrayBuffer[DbPOI]]() + val grid = Grid(poiArrBuff, eps) + + for{ + dbpoi <- poiArrBuff + + if(dbpoi.dbstatus == UNDEFINED) + }{ + + val neighbourArrBuff = grid.getNeighbours(dbpoi) + + if(neighbourArrBuff.size < minPts) + { + dbpoi.dbstatus = NOISE + } + else + { + clusterArrBuff.append(findCluster(dbpoi, neighbourArrBuff, grid)) + } + } + + clusterArrBuff + } + + + def findCluster(dbpoi: DbPOI, neighbourArrBuff: ArrayBuffer[DbPOI], grid: Grid): ArrayBuffer[DbPOI] = { + + dbpoi.dbstatus = PARTOFCLUSTER + dbpoi.isDense = true + + val cluster = ArrayBuffer[DbPOI]() + cluster.append(dbpoi) + + val neighbourQueue = mutable.Queue[DbPOI]() ++ neighbourArrBuff + + while(neighbourQueue.nonEmpty) { + val poi = neighbourQueue.dequeue() + poi.dbstatus match { + case UNDEFINED => + poi.dbstatus = PARTOFCLUSTER + val poi_i_neighbours = grid.getNeighbours(poi) 
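For reference, the grid-based DBSCAN variant added here (DBCLusterer together with the Grid helper further down in this patch) can be exercised on its own. A minimal sketch follows; the object name, the sample coordinates, and the eps/minPts values are invented for illustration, while DbPOI and DBCLusterer come from the files in this diff.

import scala.collection.mutable.ArrayBuffer

import net.sansa_stack.ml.spark.clustering.datatypes.DbPOI
import net.sansa_stack.ml.spark.clustering.utils.DBCLusterer

object DBClusterSketch {
  def main(args: Array[String]): Unit = {
    // two tight groups of three points each plus one far-away outlier (made-up lon/lat values)
    val pois = ArrayBuffer(
      DbPOI("a", 16.370, 48.200), DbPOI("b", 16.371, 48.201), DbPOI("c", 16.372, 48.199),
      DbPOI("d", 16.500, 48.300), DbPOI("e", 16.501, 48.301), DbPOI("f", 16.502, 48.299),
      DbPOI("g", 17.000, 47.000))
    // eps is in raw coordinate units here; a core point needs at least minPts neighbours
    val clusters = DBCLusterer(eps = 0.01, minPts = 2).clusterPois(pois)
    clusters.zipWithIndex.foreach { case (members, i) =>
      println(s"cluster $i: ${members.map(_.poiId).mkString(", ")}")
    }
    // the outlier is never appended to a cluster and keeps dbstatus == NOISE
  }
}

Noise handling follows the usual DBSCAN rule: a point first marked NOISE can still be absorbed into a later cluster as a non-dense border point, which is exactly the NOISE branch of findCluster.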
+ if(poi_i_neighbours.size >= minPts) + { + poi.isDense = true + neighbourQueue ++= poi_i_neighbours + } + else + { + poi.isDense = false + } + cluster.append(poi) + case NOISE => + poi.dbstatus = PARTOFCLUSTER + poi.isDense = false + cluster.append(poi) + case _ => () + } + } + + cluster + } + +} + + + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/DataFiltering.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/DataFiltering.scala new file mode 100644 index 0000000..742a90a --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/DataFiltering.scala @@ -0,0 +1,63 @@ +package net.sansa_stack.ml.spark.clustering.utils + +import org.apache.jena.graph.Triple +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SparkSession +import scala.collection.mutable.ArrayBuffer + +import net.sansa_stack.ml.spark.clustering.datatypes.AppConfig +import net.sansa_stack.rdf.spark.io.NTripleReader + +class DataFiltering(val spark: SparkSession, val conf: AppConfig) extends Serializable { + + val dataRDD: RDD[Triple] = NTripleReader.load(spark, conf.dataset.input).persist() + + /** + * Generate triples with related to poi in poiArray, method name not JavaBean format because of side effect with Unit return result + * @param poiArray id of pois in Vienna + * @param dataRDD RDD containing triples + * @param spark SparkSession + * @return + */ + def get_triples(poiArray: Array[Long], dataRDD: RDD[Triple], spark: SparkSession) : (RDD[Triple], RDD[Triple]) = { + // create an array of subjects related with each poi + val subjects = ArrayBuffer[String]() + for (i <- 0 until poiArray.length - 1) { + subjects ++= createSubjects(poiArray(i)) + } + // RDD[Triple] => RDD[(subject, Triple)] + val dataRDDPair = dataRDD.map(f => (f.getSubject.getURI, f)).persist() + // create RDD[(subject, subject)] from Array[subjects] + val subjectsRDD = spark.sparkContext.parallelize(subjects.toSet.toList).map(f => (f, f)).persist() + // get RDD[Triples] with subject in Array[subjects] + val viennaTriples = subjectsRDD.join(dataRDDPair).map(f => f._2._2).persist() + // find filtered Triples with prediction category, and get their object => RDD[Object] + val viennaCatgoriesObjects = viennaTriples.filter(f => f.getPredicate.getURI.equals("http://example.org/def#category")).map(f => f.getObject.getURI).distinct().persist() + // RDD[Object] => RDD[(Object, Object)] + val viennaPoiCategoriesRDD = viennaCatgoriesObjects.map(f => (f, f)).persist() + // RDD[(Object, Object)] => RDD[Triples], where Object is Subject in Triples + val viennaCategoryTriples = viennaPoiCategoriesRDD.join(dataRDDPair).map(f => f._2._2) + // RDD[Triples] => RDD[(Key, Triple)], where key=subject+predicate+object, because there are some duplicated triples in the tomtom data + val temp = viennaCategoryTriples.map(f => (f.getSubject.getURI + f.getPredicate.getURI + f.getObject.toString(), f)).persist() + // remove duplicated triples + val categoryTriples = temp.reduceByKey((v1, v2) => v1).map(f => f._2).persist() + (viennaTriples, categoryTriples) + } + + /** + * @param poiID id of a poi + * @return an array of subject in RDF triples with related to this poi + */ + def createSubjects(poiID: Long): ArrayBuffer[String] = { + val subjects = ArrayBuffer[String]() + val id = "http://example.org/id/poi/".concat(poiID.toString) + subjects.+=(id) + subjects.+=(id.concat("/address")) + subjects.+=(id.concat("/phone")) + subjects.+=(id.concat("/geometry")) + 
subjects.+=(id.concat("/name")) + subjects.+=(id.concat("/accuracy_info")) + subjects.+=(id.concat("/brandname")) + subjects + } +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/DataProcessing.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/DataProcessing.scala new file mode 100644 index 0000000..89c892f --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/DataProcessing.scala @@ -0,0 +1,178 @@ +package net.sansa_stack.ml.spark.clustering.utils + +import java.io.{File, FilenameFilter} + +import com.typesafe.config.Config +import org.apache.jena.graph.Triple +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SparkSession + +import net.sansa_stack.ml.spark.clustering.datatypes.{Categories, CoordinatePOI, Poi} +import net.sansa_stack.rdf.spark.io.NTripleReader + + + +/** + * load TomTom dataset + * @param spark SparkSession + * @param conf Configuration + */ +class DataProcessing(val spark: SparkSession, val conf: Config) extends Serializable { + + // val dataRDD: RDD[Triple] = NTripleReader.load(spark, conf.getString("sansa.data.input")).persist() + val dataRDD: RDD[Triple] = loadNTriple(conf.getString("sansa.data.input")) + + // var poiCoordinates: RDD[(Long, Coordinate)] = this.getPOICoordinates(16.192851, 16.593533, 48.104194, 48.316388).sample(withReplacement = false, fraction = 0.01, seed = 0) + var poiCoordinates: RDD[(Long, CoordinatePOI)] = this.getPOICoordinates + var poiFlatCategoryId: RDD[(Long, Long)] = this.getPOIFlatCategoryId + var poiCategoryId: RDD[(Long, Set[Long])] = this.getCategoryId(poiCoordinates, poiFlatCategoryId).persist() + var poiCategoryValueSet: RDD[(Long, Categories)] = this.getCategoryValues // (category_id, Categories) + var poiCategories: RDD[(Long, Categories)] = this.getPOICategories(poiCoordinates, poiFlatCategoryId, poiCategoryValueSet) // (poi_id, Categories) + val poiYelpCategories: RDD[(Long, (Categories, Double))] = this.getYelpCategories(dataRDD).sample(withReplacement = false, fraction = 0.1, seed = 0) + var pois: RDD[Poi] = { if (!poiYelpCategories.isEmpty()) { + // val poiAllCategories: RDD[(Long, Categories, Double)] = poiCategories.join(poiYelpCategories).map(x => (x._1, (Categories(x._2._1.categories++x._2._2._1.categories), x._2._2._2)) + val poiAllCategories: RDD[(Long, (Categories, Double))] = poiYelpCategories.join(poiCategories).map(x => (x._1, (Categories(x._2._1._1.categories++x._2._2.categories), x._2._1._2))) + poiCoordinates.join(poiAllCategories).map(x => Poi(x._1, x._2._1, x._2._2._1, x._2._2._2)).persist() + } else { + println("--------pois--------------") + poiCoordinates.join(poiCategories).map(x => Poi(x._1, x._2._1, x._2._2, 0.0)).persist() + }} + + def loadNTriple(tripleFilePath: String): RDD[Triple] = { + val tripleFile = new File(tripleFilePath) + if(tripleFile.isDirectory) { + val files = tripleFile.listFiles(new FilenameFilter() { + def accept(tripleFile: File, name: String): Boolean = { + !(name.toString.contains("SUCCESS") || name.toLowerCase.endsWith(".crc")) + } + }) + var i = 0 + var triple_0 = NTripleReader.load(spark, files(0).getAbsolutePath) + for(file <- files) { + if (i!=0) { + triple_0 = triple_0.union(NTripleReader.load(spark, file.getAbsolutePath)) + } + i+=1 + } + triple_0 + } + else { + NTripleReader.load(spark, tripleFile.getAbsolutePath) + } + } + + + /** + * @param poiCoordinates super set of poi with coordinates + * @param lo_min min longitude + * @param lo_max max longitude + * 
@param la_min min latitude + * @param la_max max latitude + * @return pois within certain coordinates + */ + def filterCoordinates(poiCoordinates: RDD[(Long, CoordinatePOI)], lo_min: Double, lo_max: Double, la_min: Double, la_max: Double): RDD[(Long, CoordinatePOI)] = { + poiCoordinates.filter(x => (x._2.longitude >= lo_min && x._2.longitude <= lo_max) + && (x._2.latitude >= la_min && x._2.latitude <= la_max)) + } + + /** + * get coordinate for all poi + */ + def getPOICoordinates: RDD[(Long, CoordinatePOI)] = { + // get the coordinates of pois + val pattern = "POINT(.+ .+)".r + val poiCoordinatesString = dataRDD.filter(x => x.getPredicate.toString().equalsIgnoreCase(conf.getString("sansa.data.coordinatesPredicate"))) + .map(x => (x.getSubject.toString().replace(conf.getString("sansa.data.poiPrefix"), "").replace("/geometry", "").toLong, + pattern.findFirstIn(x.getObject.toString()).head.replace("POINT", "") + .replace("^^http://www.opengis.net/ont/geosparql#wktLiteral", "").replaceAll("^\"|\"$", ""))) + // transform to Coordinate object + poiCoordinatesString.mapValues(x => { + val coordinates = x.replace("(", "").replace(")", "").split(" ") + CoordinatePOI(coordinates(0).toDouble, coordinates(1).toDouble) + }) + } + + /** + * load data filter on geo-coordinates + * @param lo_min min longitude + * @param lo_max max longitude + * @param la_min min latitude + * @param la_max max latitude + */ + def getPOICoordinates(lo_min: Double, lo_max: Double, la_min: Double, la_max: Double): RDD[(Long, CoordinatePOI)] = { + this.filterCoordinates(poiCoordinates = this.getPOICoordinates, lo_min = lo_min, lo_max = lo_max, la_min = la_min, la_max = la_max) + } + + /** + * + * @return (poi, category_id) + */ + def getPOIFlatCategoryId: RDD[(Long, Long)] = { + val poiFlatCategories = dataRDD.filter(x => x.getPredicate.toString().equalsIgnoreCase(conf.getString("sansa.data.categoryPOI"))) + poiFlatCategories.map(x => ( + x.getSubject.toString().replace(conf.getString("sansa.data.poiPrefix"), "").toLong, + x.getObject.toString().replace(conf.getString("sansa.data.termPrefix"), "").toLong) + ) + } + + /** + * get (poi_unique, Categories) + * @param poiCoordinates (poi_unique, Coordinate) + * @param poiFlatCategoryId (poi, category_id) + * @param poiCategoryValueSet (category_id, Categories) + * @return (poi, Categories) + */ + def getPOICategories(poiCoordinates: RDD[(Long, CoordinatePOI)], poiFlatCategoryId: RDD[(Long, Long)], poiCategoryValueSet: RDD[(Long, Categories)]): RDD[(Long, Categories)] = { + // from (poi, category_id) map-> (category_id, poi) join-> (category_id, (poi, Categories)) map-> (poi, Categories) groupByKey-> (poi_unique, Iterable(Categories)) + val poiCategorySets = poiFlatCategoryId.map(f => (f._2, f._1)).join(poiCategoryValueSet).map(f => (f._2._1, f._2._2)).groupByKey() + // from (poi_unique, Iterable(Categories)) join-> (poi_unique, (Coordinate, Iterable(Categories))) map-> (poi_unique, Categories) + poiCoordinates.join(poiCategorySets).map(x => (x._1, Categories(collection.mutable.Set(x._2._2.flatMap(_.categories).toList: _*)))) + } + + /** + * get (category_id, Categories) + * @return RDD with category values for category id + */ + def getCategoryValues: RDD[(Long, Categories)] = { + // get category id(s) + val categoryTriples = dataRDD.filter(x => x.getPredicate.toString().equalsIgnoreCase(conf.getString("sansa.data.termValueUri"))) + // get category id and it's corresponding values + val categoriesIdValues = categoryTriples.map(x => ( + 
x.getSubject.toString().replace(conf.getString("sansa.data.termPrefix"), "").toLong, + x.getObject.toString().replaceAll("\"", ""))) + // group by id and put all values of category to a set + categoriesIdValues.groupByKey().map(x => (x._1, Categories(scala.collection.mutable.Set(x._2.toList: _*)))) + } + + /** + * get (poi_unique, poi_category_id_set) + * @param poiCoordinates (poi_unique, Coordinate) + * @param poiFlatCategoryId (poi, category_id) + */ + def getCategoryId(poiCoordinates: RDD[(Long, CoordinatePOI)], poiFlatCategoryId: RDD[(Long, Long)]): RDD[(Long, Set[Long])] = { + poiCoordinates.join(poiFlatCategoryId.groupByKey()) + .map(x => (x._1, x._2._2.toSet)) + } + + + def getYelpCategories(mergedRDD: RDD[Triple]): RDD[(Long, (Categories, Double))] = { + val yelpPOICategory = mergedRDD.filter(triple => triple.getPredicate.toString.equalsIgnoreCase(conf.getString("yelp.data.categoryPOI"))) + println(conf.getString("yelp.data.rating")) + val yelpPOIRating = mergedRDD.filter(triple => triple.getPredicate.toString.contains(conf.getString("yelp.data.rating"))) + println("category") + println(yelpPOICategory.count()) + println("rating") + println(yelpPOIRating.count()) + val yelpPOICategoryMapped = yelpPOICategory.map(triple => ( + triple.getSubject.toString().replace(conf.getString("sansa.data.poiPrefix"), "").toLong, + triple.getObject.toString() + )) + val yelpPOIRatingMapped = yelpPOIRating.map(triple => ( + triple.getSubject.toString().replace(conf.getString("sansa.data.poiPrefix"), "").toLong, + triple.getObject.getLiteralValue.toString.toDouble + )) + yelpPOICategoryMapped.groupByKey().join(yelpPOIRatingMapped).map(x => (x._1, (Categories(scala.collection.mutable.Set(x._2._1.toList: _*)), x._2._2))) + } +} + + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/Grid.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/Grid.scala new file mode 100644 index 0000000..74d5a9c --- /dev/null +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/clustering/utils/Grid.scala @@ -0,0 +1,55 @@ +package net.sansa_stack.ml.spark.clustering.utils + +import scala.collection.mutable.ArrayBuffer +import scala.collection.mutable.HashMap + +import net.sansa_stack.ml.spark.clustering.datatypes.DbPOI + +case class Grid(val poiArrBuf: ArrayBuffer[DbPOI], val eps: Double) { + + val startX = poiArrBuf.head.lon + val startY = poiArrBuf.head.lat + val gridCell = HashMap[(Int, Int), ArrayBuffer[DbPOI]]() + + init() + + private def init(): Unit = { + var i = 0 + var j = 0 + for(dbpoi <- poiArrBuf) { + i = math.floor( (dbpoi.lon - startX) / eps).toInt + j = math.floor( (dbpoi.lat - startY) / eps).toInt + + gridCell.get((i, j)) match { + case Some(cellArrBuff) => cellArrBuff.append(dbpoi) + case None => gridCell += ( ((i, j), ArrayBuffer(dbpoi)) ) + } + } + } + + + def getNeighbours(dbpoi: DbPOI): ArrayBuffer[DbPOI] = { + + val neighbourArrBuff = ArrayBuffer[DbPOI]() + + val celli = math.floor( (dbpoi.lon - startX) / eps).toInt + val cellj = math.floor( (dbpoi.lat - startY) / eps).toInt + for{ + i <- (celli - 1) to (celli + 1) + j <- (cellj - 1) to (cellj + 1) + }{ + gridCell.get((i, j)) match { + case Some(cellArrBuff) => neighbourArrBuff ++= cellArrBuff + case None => () + } + } + + neighbourArrBuff.filter{ + p => (math.abs(p.lon - dbpoi.lon) <= eps) && (math.abs(p.lat - dbpoi.lat) <= eps) && p.poiId != dbpoi.poiId + } + + } + +} + + diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastGraphKernel.scala 
b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastGraphKernel.scala index a49c2b3..a7ef9b4 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastGraphKernel.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastGraphKernel.scala @@ -1,18 +1,18 @@ package net.sansa_stack.ml.spark.kernel +import org.apache.jena.graph.Triple import org.apache.spark.ml.feature.{ CountVectorizer, CountVectorizerModel, StringIndexer } import org.apache.spark.mllib.linalg.SparseVector import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD -import org.apache.spark.sql.functions._ import org.apache.spark.sql.{ DataFrame, SparkSession } -import org.apache.jena.graph.Triple +import org.apache.spark.sql.functions._ class RDFFastGraphKernel( @transient val sparkSession: SparkSession, - val tripleRDD: RDD[Triple], - val predicateToPredict: String) extends Serializable { + val tripleRDD: RDD[Triple], + val predicateToPredict: String) extends Serializable { import sparkSession.implicits._ @@ -84,8 +84,8 @@ class RDFFastGraphKernel( object RDFFastGraphKernel { def apply( - sparkSession: SparkSession, - tripleRDD: RDD[Triple], + sparkSession: SparkSession, + tripleRDD: RDD[Triple], predicateToPredict: String): RDFFastGraphKernel = { new RDFFastGraphKernel(sparkSession, tripleRDD, predicateToPredict) diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernel.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernel.scala index 1ea7080..a3f64fa 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernel.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernel.scala @@ -1,13 +1,14 @@ package net.sansa_stack.ml.spark.kernel -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{ DataFrame, SparkSession } -import org.apache.spark.sql.functions._ +import org.apache.jena.graph.Triple import org.apache.spark.ml.feature.{ CountVectorizer, CountVectorizerModel } import org.apache.spark.mllib.linalg.SparseVector -import org.apache.spark.mllib.util.MLUtils import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.jena.graph.Triple +import org.apache.spark.mllib.util.MLUtils +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{ DataFrame, SparkSession } +import org.apache.spark.sql.functions._ + object Uri2Index { /* @@ -81,9 +82,9 @@ object Uri2Index { class RDFFastTreeGraphKernel( @transient val sparkSession: SparkSession, - val tripleRDD: RDD[Triple], - val instanceDF: DataFrame, - val maxDepth: Int) extends Serializable { + val tripleRDD: RDD[Triple], + val instanceDF: DataFrame, + val maxDepth: Int) extends Serializable { /* * Construct Triples DataFrame and Instances DataFrame * Also, Get/Set Index for each URI and Literal @@ -168,9 +169,9 @@ object RDFFastTreeGraphKernel { def apply( sparkSession: SparkSession, - tripleRDD: RDD[Triple], - instanceDF: DataFrame, - maxDepth: Int): RDFFastTreeGraphKernel = { + tripleRDD: RDD[Triple], + instanceDF: DataFrame, + maxDepth: Int): RDFFastTreeGraphKernel = { new RDFFastTreeGraphKernel(sparkSession, tripleRDD, instanceDF, maxDepth) } diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernelApp.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernelApp.scala index b3ef264..6ecc25d 
100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernelApp.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernelApp.scala @@ -46,7 +46,7 @@ object RDFFastTreeGraphKernelApp { } def experimentAffiliationPrediction(sparkSession: SparkSession, depth: Int, iteration: Int): Unit = { - //val input = "src/main/resources/kernel/aifb-fixed_complete4.nt" + // val input = "src/main/resources/kernel/aifb-fixed_complete4.nt" val input = "src/main/resources/kernel/aifb-fixed_no_schema4.nt" val t0 = System.nanoTime @@ -137,7 +137,7 @@ object RDFFastTreeGraphKernelApp { tripleRDD.filter(_.getPredicate.getURI == "http://data.bgs.ac.uk/ref/Lexicon/hasTheme") .foreach(f => Uri2Index.setInstanceAndLabel(f.getSubject.toString, f.getObject.toString)) - val filteredTripleRDD=tripleRDD + val filteredTripleRDD = tripleRDD .filter(_.getPredicate.getURI != "http://data.bgs.ac.uk/ref/Lexicon/hasTheme") val instanceDF = Uri2Index.getInstanceLabelsDF(sparkSession) diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernelUtil.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernelUtil.scala index f4bb7f0..a8f2ec7 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernelUtil.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernelUtil.scala @@ -2,31 +2,31 @@ package net.sansa_stack.ml.spark.kernel import org.apache.jena.graph import org.apache.spark.ml.feature.StringIndexer -import org.apache.spark.mllib.classification.{LogisticRegressionModel, LogisticRegressionWithLBFGS} +import org.apache.spark.mllib.classification.{ LogisticRegressionModel, LogisticRegressionWithLBFGS } import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{ DataFrame, SparkSession } import org.apache.spark.sql.functions._ -import org.apache.spark.sql.{DataFrame, SparkSession} - object RDFFastTreeGraphKernelUtil { - def triplesToDF(sparkSession: SparkSession, - triples: RDD[graph.Triple], - subjectColName:String = "subject", - predicateColName:String = "predicate", - objectColName:String ="object" - ): DataFrame = { + def triplesToDF( + sparkSession: SparkSession, + triples: RDD[graph.Triple], + subjectColName: String = "subject", + predicateColName: String = "predicate", + objectColName: String = "object"): DataFrame = { import sparkSession.implicits._ - triples.map(f => (f.getSubject.toString,f.getPredicate.toString,f.getObject.toString)) + triples.map(f => (f.getSubject.toString, f.getPredicate.toString, f.getObject.toString)) .toDF(subjectColName, predicateColName, objectColName) } - def getInstanceAndLabelDF( filteredTripleDF: DataFrame, - subjectColName:String = "subject", - objectColName:String ="object" ): DataFrame = { + def getInstanceAndLabelDF( + filteredTripleDF: DataFrame, + subjectColName: String = "subject", + objectColName: String = "object"): DataFrame = { /* root |-- instance: string (nullable = true) @@ -47,7 +47,7 @@ object RDFFastTreeGraphKernelUtil { indexedDF } - def predictLogisticRegressionMLLIB(data: RDD[LabeledPoint], numClasses : Int = 2, maxIteration: Int = 5): Unit = { + def predictLogisticRegressionMLLIB(data: RDD[LabeledPoint], numClasses: Int = 2, maxIteration: Int = 5): Unit = { val t0 = System.nanoTime data.cache() @@ -61,7 +61,7 @@ object 
RDFFastTreeGraphKernelUtil { val validation = splits(1) val model = new LogisticRegressionWithLBFGS().setNumClasses(numClasses).run(training) - val predictions = validation.map{ point => + val predictions = validation.map { point => val prediction = model.predict(point.features) (point.label, prediction) } @@ -73,15 +73,14 @@ object RDFFastTreeGraphKernelUtil { var sumOfAccuracy = 0.0 - for ( seed <- 1 to maxIteration ) { + for (seed <- 1 to maxIteration) { val (model, accuracy) = trainAndValidate(data, seed) -// println(accuracy) + // println(accuracy) sumOfAccuracy += accuracy } val t2 = System.nanoTime - // score the model on test data. println("Average Accuracy: " + sumOfAccuracy / maxIteration) diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernel_v2.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernel_v2.scala index 5f9aa81..6c24a7f 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernel_v2.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kernel/RDFFastTreeGraphKernel_v2.scala @@ -1,18 +1,18 @@ package net.sansa_stack.ml.spark.kernel -import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel, StringIndexer} +import org.apache.spark.ml.feature.{ CountVectorizer, CountVectorizerModel, StringIndexer } import org.apache.spark.mllib.linalg.SparseVector import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{ DataFrame, SparkSession } import org.apache.spark.sql.functions._ -import org.apache.spark.sql.{DataFrame, SparkSession} -class RDFFastTreeGraphKernel_v2 (@transient val sparkSession: SparkSession, - val tripleDF: DataFrame, - val instanceDF: DataFrame, - val maxDepth: Int - ) extends Serializable { +class RDFFastTreeGraphKernel_v2( + @transient val sparkSession: SparkSession, + val tripleDF: DataFrame, + val instanceDF: DataFrame, + val maxDepth: Int) extends Serializable { def computeFeatures(): DataFrame = { /* @@ -46,7 +46,6 @@ class RDFFastTreeGraphKernel_v2 (@transient val sparkSession: SparkSession, intermediateDF.createOrReplaceTempView("df") } - // Indexing on path val indexer = new StringIndexer() .setInputCol("path") @@ -59,12 +58,10 @@ class RDFFastTreeGraphKernel_v2 (@transient val sparkSession: SparkSession, .agg(collect_list("pathIndex") as "paths") .toDF("instance", "label", "paths") - // CountVectorize the aggregated paths val cvModel: CountVectorizerModel = new CountVectorizer().setInputCol("paths").setOutputCol("features").fit(aggDF) val dataML = cvModel.transform(aggDF) - dataML } @@ -97,11 +94,11 @@ class RDFFastTreeGraphKernel_v2 (@transient val sparkSession: SparkSession, object RDFFastTreeGraphKernel_v2 { - def apply(sparkSession: SparkSession, - tripleDF: DataFrame, - instanceDF: DataFrame, - maxDepth: Int - ): RDFFastTreeGraphKernel_v2 = { + def apply( + sparkSession: SparkSession, + tripleDF: DataFrame, + instanceDF: DataFrame, + maxDepth: Int): RDFFastTreeGraphKernel_v2 = { new RDFFastTreeGraphKernel_v2(sparkSession, tripleDF, instanceDF, maxDepth) } diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/Bootstrapping.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/Bootstrapping.scala index 02bbfa9..7350c53 100644 --- 
a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/Bootstrapping.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/Bootstrapping.scala @@ -1,5 +1,8 @@ package net.sansa_stack.ml.spark.kge.linkprediction.crossvalidation +import net.sansa_stack.rdf.spark.kge.triples.IntegerTriples +import org.apache.spark.sql._ + /** * Bootstrapping * ------------- @@ -8,18 +11,12 @@ package net.sansa_stack.ml.spark.kge.linkprediction.crossvalidation * * Created by lpfgarcia */ - -import org.apache.spark.sql._ - -import net.sansa_stack.rdf.spark.kge.triples.IntegerTriples - class Bootstrapping(data: Dataset[IntegerTriples]) - extends CrossValidation[Dataset[IntegerTriples]] { + extends CrossValidation[Dataset[IntegerTriples]] { - def crossValidation() = { + def crossValidation(): (Dataset[IntegerTriples], Dataset[IntegerTriples]) = { val train = data.sample(true, 1) val test = data.except(train) (train, test) } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/CrossValidation.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/CrossValidation.scala index 791d0b1..b55c36a 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/CrossValidation.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/CrossValidation.scala @@ -13,4 +13,4 @@ trait CrossValidation[T] { def crossValidation: (T, T) -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/Holdout.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/Holdout.scala index f25bc84..1cbf42d 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/Holdout.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/Holdout.scala @@ -1,5 +1,8 @@ package net.sansa_stack.ml.spark.kge.linkprediction.crossvalidation +import net.sansa_stack.rdf.spark.kge.triples.IntegerTriples +import org.apache.spark.sql._ + /** * Hould Out * --------- @@ -8,22 +11,17 @@ package net.sansa_stack.ml.spark.kge.linkprediction.crossvalidation * * Created by lpfgarcia */ - -import org.apache.spark.sql._ - -import net.sansa_stack.rdf.spark.kge.triples.IntegerTriples - case class rateException(info: String) extends Exception class Holdout(data: Dataset[IntegerTriples], rate: Float) extends CrossValidation[Dataset[IntegerTriples]] { - if (rate < 0 || rate >= 1) + if (rate < 0 || rate >= 1) { throw new rateException("Rate value should be higher than 0 and lower than 1") + } - def crossValidation() = { + def crossValidation(): (Dataset[IntegerTriples], Dataset[IntegerTriples]) = { val train = data.sample(false, rate) val test = data.except(train) (train, test) } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/kFold.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/kFold.scala index 97021e2..eed57a2 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/kFold.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/crossvalidation/kFold.scala @@ -1,5 +1,8 
@@ package net.sansa_stack.ml.spark.kge.linkprediction.crossvalidation +import net.sansa_stack.rdf.spark.kge.triples.IntegerTriples +import org.apache.spark.sql._ + /** * k-fold Cross Validation * ----------------------- @@ -9,26 +12,23 @@ package net.sansa_stack.ml.spark.kge.linkprediction.crossvalidation * Created by lpfgarcia */ -import org.apache.spark.sql._ - -import net.sansa_stack.rdf.spark.kge.triples.IntegerTriples - case class kException(info: String) extends Exception case class withIndex(Subject: Int, Predicate: Int, Object: Int, k: Int) class kFold(data: Dataset[IntegerTriples], k: Int, sk: SparkSession) - extends CrossValidation[Seq[Dataset[IntegerTriples]]] { + extends CrossValidation[Seq[Dataset[IntegerTriples]]] { import sk.implicits._ - if (k > 1 && k <= 10) + if (k <= 1 || k > 10) { throw new kException("The k value should be higher than 1 and lower or equal to 10") + } val id = (1 to data.count().toInt / k).flatMap(List.fill(k)(_)) val fold = sk.sparkContext.parallelize(id, data.rdd.getNumPartitions) - def crossValidation() = { + def crossValidation(): (IndexedSeq[Dataset[IntegerTriples]], IndexedSeq[Dataset[IntegerTriples]]) = { val df = sk.createDataFrame(data.rdd.zip(fold).map { r => withIndex(r._1.Subject, r._1.Predicate, r._1.Object, r._2) @@ -45,4 +45,4 @@ class kFold(data: Dataset[IntegerTriples], k: Int, sk: SparkSession) (train, test) } -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/evaluate/Evaluate.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/evaluate/Evaluate.scala index 1fda916..0d092e8 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/evaluate/Evaluate.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/evaluate/Evaluate.scala @@ -9,9 +9,8 @@ package net.sansa_stack.ml.spark.kge.linkprediction.evaluate object Evaluate { - def meanRank(left: Array[Float], right: Array[Float]) { + def meanRank(left: Array[Float], right: Array[Float]): (Float, Float) = { (left.sum / left.length, right.sum / right.length) } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/DistMult.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/DistMult.scala index 51fcfe5..da3f6ac 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/DistMult.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/DistMult.scala @@ -1,5 +1,11 @@ package net.sansa_stack.ml.spark.kge.linkprediction.models +import com.intel.analytics.bigdl.optim.Adam +import com.intel.analytics.bigdl.tensor.Tensor +import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric.NumericFloat +import net.sansa_stack.rdf.spark.kge.triples.{ IntegerTriples, StringTriples } +import org.apache.spark.sql._ + /** * DistMult: diagonal bilinear model * --------------------------------- @@ -9,24 +15,15 @@ package net.sansa_stack.ml.spark.kge.linkprediction.models * * Created by lpfgarcia on 20/11/2017.
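The three splitters in this package all implement the small CrossValidation trait, so choosing a protocol is a matter of instantiating one of them. A usage sketch, assuming a Dataset[IntegerTriples] named triples has already been loaded (the 0.8 rate and k = 5 are arbitrary values, not defaults):

import net.sansa_stack.ml.spark.kge.linkprediction.crossvalidation.{ Bootstrapping, Holdout, kFold }
import net.sansa_stack.rdf.spark.kge.triples.IntegerTriples
import org.apache.spark.sql.{ Dataset, SparkSession }

def splitExamples(spark: SparkSession, triples: Dataset[IntegerTriples]): Unit = {
  // Hold out 20% of the triples for testing.
  val (train, test) = new Holdout(triples, 0.8f).crossValidation()

  // Sample with replacement; everything not drawn becomes the test set.
  val (bootTrain, bootTest) = new Bootstrapping(triples).crossValidation()

  // Five folds; each position in the two sequences is one train/test pair.
  val (folds, heldOut) = new kFold(triples, 5, spark).crossValidation()

  println(s"holdout: train=${train.count()}, test=${test.count()}, folds=${folds.size}")
}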
*/ - -import org.apache.spark.sql._ - -import com.intel.analytics.bigdl.optim.Adam -import com.intel.analytics.bigdl.tensor.Tensor -import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric.NumericFloat - -import net.sansa_stack.rdf.spark.kge.triples.{StringTriples,IntegerTriples} - class DistMult(train: Dataset[IntegerTriples], ne: Int, nr: Int, batch: Int, k: Int, sk: SparkSession) - extends Models(ne: Int, nr: Int, batch: Int, k: Int, sk: SparkSession) { + extends Models(ne: Int, nr: Int, batch: Int, k: Int, sk: SparkSession) { val epochs = 100 val rate = 0.01f var opt = new Adam(learningRate = rate) - def dist(data: Dataset[IntegerTriples]) = { + def dist(data: Dataset[IntegerTriples]): Float = { val aux = data.collect().map { i => e(i.Subject) * r(i.Predicate) * e(i.Object) }.reduce((a, b) => a + b) @@ -34,7 +31,7 @@ class DistMult(train: Dataset[IntegerTriples], ne: Int, nr: Int, batch: Int, k: L2(aux) } - def run() = { + def run(): Unit = { for (i <- 1 to epochs) { @@ -53,5 +50,4 @@ class DistMult(train: Dataset[IntegerTriples], ne: Int, nr: Int, batch: Int, k: } } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/Models.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/Models.scala index 7720f30..1bfb3ad 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/Models.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/Models.scala @@ -1,23 +1,21 @@ package net.sansa_stack.ml.spark.kge.linkprediction.models -/** - * Model Abstract Class - * -------------------- - * - * Created by lpfgarcia on 14/11/2017. - */ - import scala.math._ import scala.util._ -import org.apache.spark.sql._ - import com.intel.analytics.bigdl.nn.Power import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric.NumericFloat +import net.sansa_stack.rdf.spark.kge.triples.{ IntegerTriples, StringTriples } +import org.apache.spark.sql._ -import net.sansa_stack.rdf.spark.kge.triples.{StringTriples,IntegerTriples} +/** + * Model Abstract Class + * -------------------- + * + * Created by lpfgarcia on 14/11/2017. 
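The dist method above is the DistMult scoring function: a triple (s, p, o) is scored by the component-wise product of the subject embedding, the diagonal relation vector and the object embedding, summed over the embedding dimension. A plain-Scala sketch of that score on Array[Float] embeddings, kept independent of the BigDL Tensor API used above:

// DistMult score: sum_i e_s(i) * w_p(i) * e_o(i); a larger value means a more plausible triple.
def distMultScore(eS: Array[Float], wP: Array[Float], eO: Array[Float]): Float = {
  require(eS.length == wP.length && wP.length == eO.length, "embeddings must share the dimension k")
  var score = 0.0f
  for (i <- eS.indices) score += eS(i) * wP(i) * eO(i)
  score
}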
+ */ abstract class Models(ne: Int, nr: Int, batch: Int, k: Int, sk: SparkSession) { val Ne = ne @@ -26,11 +24,11 @@ abstract class Models(ne: Int, nr: Int, batch: Int, k: Int, sk: SparkSession) { var e = initialize(ne) var r = normalize(initialize(nr)) - def initialize(size: Int) = { + def initialize(size: Int): Tensor[Float] = { Tensor(size, k).rand(-6 / sqrt(k), 6 / sqrt(k)) } - def normalize(data: Tensor[Float]) = { + def normalize(data: Tensor[Float]): Tensor[Float] = { data / data.abs().sum() } @@ -38,7 +36,7 @@ abstract class Models(ne: Int, nr: Int, batch: Int, k: Int, sk: SparkSession) { val seed = new Random(System.currentTimeMillis()) - def tuple(aux: IntegerTriples) = { + def tuple(aux: IntegerTriples): IntegerTriples = { if (seed.nextBoolean()) { IntegerTriples(seed.nextInt(Ne) + 1, aux.Predicate, aux.Object) } else { @@ -46,20 +44,20 @@ abstract class Models(ne: Int, nr: Int, batch: Int, k: Int, sk: SparkSession) { } } - def negative(data: Dataset[IntegerTriples]) = { + def negative(data: Dataset[IntegerTriples]): Dataset[IntegerTriples] = { data.map(i => tuple(i)) } - def subset(data: Dataset[IntegerTriples]) = { + def subset(data: Dataset[IntegerTriples]): Dataset[IntegerTriples] = { data.sample(false, 2 * (batch.toDouble / data.count().toDouble)).limit(batch) } - def L1(vec: Tensor[Float]) = { + def L1(vec: Tensor[Float]): Float = { vec.abs().sum() } - def L2(vec: Tensor[Float]) = { + def L2(vec: Tensor[Float]): Float = { vec.pow(2).sqrt().sum() } -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/TransE.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/TransE.scala index f5fb0db..43a4205 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/TransE.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/models/TransE.scala @@ -1,5 +1,14 @@ package net.sansa_stack.ml.spark.kge.linkprediction.models +import scala.math._ + +import com.intel.analytics.bigdl.optim.Adam +import com.intel.analytics.bigdl.tensor.Tensor +import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric.NumericFloat +import net.sansa_stack.rdf.spark.kge.triples.{ IntegerTriples, StringTriples } +import org.apache.spark.sql._ + + /** * TransE embedding model * ---------------------- @@ -9,19 +18,8 @@ package net.sansa_stack.ml.spark.kge.linkprediction.models * * Created by lpfgarcia on 14/11/2017. 
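The Models base class above supplies what every embedding model here reuses: uniform initialisation in [-6/sqrt(k), 6/sqrt(k)] and negative sampling that corrupts either the subject or the object of a training triple (the tuple and negative methods). A stand-alone sketch of the corruption step in plain Scala, working on raw (s, p, o) entity and relation ids rather than the IntegerTriples case class:

import scala.util.Random

// Replace one side of an (s, p, o) triple with a random entity id in 1..numEntities,
// yielding a negative example for margin-based training.
def corrupt(s: Int, p: Int, o: Int, numEntities: Int, rnd: Random): (Int, Int, Int) =
  if (rnd.nextBoolean()) (rnd.nextInt(numEntities) + 1, p, o)
  else (s, p, rnd.nextInt(numEntities) + 1)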
*/ - -import scala.math._ - -import org.apache.spark.sql._ - -import com.intel.analytics.bigdl.optim.Adam -import com.intel.analytics.bigdl.tensor.Tensor -import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric.NumericFloat - -import net.sansa_stack.rdf.spark.kge.triples.{StringTriples,IntegerTriples} - class TransE(train: Dataset[IntegerTriples], ne: Int, nr: Int, batch: Int, k: Int, margin: Float, L: String, sk: SparkSession) - extends Models(ne: Int, nr: Int, batch: Int, k: Int, sk: SparkSession) { + extends Models(ne: Int, nr: Int, batch: Int, k: Int, sk: SparkSession) { val epochs = 1000 val rate = 0.01f @@ -30,12 +28,12 @@ class TransE(train: Dataset[IntegerTriples], ne: Int, nr: Int, batch: Int, k: In val myL = L match { case "L2" => L2 _ - case _ => L1 _ + case _ => L1 _ } import sk.implicits._ - def dist(data: Dataset[IntegerTriples]) = { + def dist(data: Dataset[IntegerTriples]): Float = { val aux = data.collect().map { i => e(i.Subject) + r(i.Predicate) - e(i.Object) @@ -44,11 +42,11 @@ class TransE(train: Dataset[IntegerTriples], ne: Int, nr: Int, batch: Int, k: In myL(aux) } - def dist(row: IntegerTriples) = { + def dist(row: IntegerTriples): Tensor[Float] = { e(row.Subject) + r(row.Predicate) - e(row.Object) } - def run() = { + def run(): Unit = { for (i <- 1 to epochs) { @@ -70,5 +68,4 @@ class TransE(train: Dataset[IntegerTriples], ne: Int, nr: Int, batch: Int, k: In } } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/prediction/Predict.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/prediction/Predict.scala index 29cee10..e1c8227 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/prediction/Predict.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/prediction/Predict.scala @@ -1,29 +1,27 @@ package net.sansa_stack.ml.spark.kge.linkprediction.prediction +import net.sansa_stack.rdf.spark.kge.triples.{ IntegerTriples, StringTriples } +import org.apache.spark.sql._ + /** * Predict Abstract Class * ---------------------- * * Created by lpfgarcia on 14/11/2017. 
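TransE, as wired up above, scores a triple by the distance between e(s) + r(p) and e(o), using either the L1 or the L2 norm, and training pushes each positive triple to score at least margin better than its corrupted counterpart. The score and the standard margin-based ranking loss, as a plain-Scala sketch:

// ||e_s + r_p - e_o||_1, the TransE dissimilarity; smaller means more plausible.
def transEDistL1(eS: Array[Float], rP: Array[Float], eO: Array[Float]): Float = {
  var d = 0.0f
  for (i <- eS.indices) d += math.abs(eS(i) + rP(i) - eO(i))
  d
}

// Loss for one (positive, negative) pair: max(0, margin + d(pos) - d(neg)).
def marginLoss(posDist: Float, negDist: Float, margin: Float): Float =
  math.max(0.0f, margin + posDist - negDist)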
*/ - -import org.apache.spark.sql._ - -import net.sansa_stack.rdf.spark.kge.triples.{StringTriples,IntegerTriples} - abstract class Evaluate(test: Dataset[IntegerTriples]) { - def left(row: IntegerTriples, i: Int) = { + def left(row: IntegerTriples, i: Int): IntegerTriples = { IntegerTriples(i, row.Predicate, row.Object) } - def right(row: IntegerTriples, i: Int) = { + def right(row: IntegerTriples, i: Int): IntegerTriples = { IntegerTriples(row.Subject, row.Predicate, i) } def rank(row: IntegerTriples, spo: String): Integer - def ranking() = { + def ranking(): (Seq[Integer], Seq[Integer]) = { var l, r = Seq[Integer]() @@ -35,7 +33,7 @@ abstract class Evaluate(test: Dataset[IntegerTriples]) { (l, r) } - def rawHits10() = { + def rawHits10(): (Seq[Boolean], Seq[Boolean]) = { var l, r = Seq[Boolean]() @@ -46,5 +44,4 @@ abstract class Evaluate(test: Dataset[IntegerTriples]) { (l, r) } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/prediction/PredictTransE.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/prediction/PredictTransE.scala index 263c19d..a7c60dc 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/prediction/PredictTransE.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/prediction/PredictTransE.scala @@ -1,28 +1,26 @@ package net.sansa_stack.ml.spark.kge.linkprediction.prediction +import org.apache.spark.sql._ + +import net.sansa_stack.ml.spark.kge.linkprediction.models.TransE +import net.sansa_stack.rdf.spark.kge.triples.{ IntegerTriples, StringTriples } + /** * Predict TransE Class * -------------------- * * Created by lpfgarcia on 14/11/2017. */ - -import org.apache.spark.sql._ - -import net.sansa_stack.ml.spark.kge.linkprediction.models.TransE - -import net.sansa_stack.rdf.spark.kge.triples.{StringTriples,IntegerTriples} - class PredictTransE(model: TransE, test: Dataset[IntegerTriples]) extends Evaluate(test: Dataset[IntegerTriples]) { - def rank(row: IntegerTriples, spo: String) = { + def rank(row: IntegerTriples, spo: String): Integer = { var x = Seq[Float]() val y = model.myL(model.dist(row)) val cor = spo match { case "l" => left _ - case _ => right _ + case _ => right _ } x = y +: x @@ -33,4 +31,4 @@ class PredictTransE(model: TransE, test: Dataset[IntegerTriples]) extends Evalua x.sorted.indexOf(y) } -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/run/TransERun.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/run/TransERun.scala index 2b238a4..45e34b3 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/run/TransERun.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/run/TransERun.scala @@ -4,18 +4,14 @@ package net.sansa_stack.ml.spark.kge.linkprediction.run * Created by lpfgarcia on 14/11/2017. 
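Once rank assigns every test triple its position among the corrupted candidates (the sorted indexOf in PredictTransE above), the usual link-prediction summaries are the mean rank and Hits@10. A small sketch over a collected sequence of ranks, assuming 0-based positions as produced by indexOf:

// Mean rank over the test set; lower is better.
def meanRank(ranks: Seq[Int]): Double =
  if (ranks.isEmpty) 0.0 else ranks.map(_.toDouble).sum / ranks.size

// Fraction of test triples ranked among the top ten candidates.
def hitsAt10(ranks: Seq[Int]): Double =
  if (ranks.isEmpty) 0.0 else ranks.count(_ < 10).toDouble / ranks.size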
*/ +import org.apache.log4j.{ Level, Logger } import org.apache.spark.sql._ -import org.apache.log4j.Logger -import org.apache.log4j.Level - -import net.sansa_stack.rdf.spark.kge.convertor.ByIndex -import net.sansa_stack.rdf.spark.kge.triples._ - -import net.sansa_stack.ml.spark.kge.linkprediction.crossvalidation.Holdout -import net.sansa_stack.ml.spark.kge.linkprediction.crossvalidation.{kFold,Bootstrapping,Holdout} +import net.sansa_stack.ml.spark.kge.linkprediction.crossvalidation.{ kFold, Bootstrapping, Holdout } import net.sansa_stack.ml.spark.kge.linkprediction.models.TransE import net.sansa_stack.ml.spark.kge.linkprediction.prediction.PredictTransE +import net.sansa_stack.rdf.spark.kge.convertor.ByIndex +import net.sansa_stack.rdf.spark.kge.triples._ object TransERun { @@ -25,7 +21,7 @@ object TransERun { val spark = SparkSession.builder.master("local") .appName("kge").getOrCreate - def main(args: Array[String]) = { + def main(args: Array[String]): Unit = { val table = new Triples("/home/lpfgarcia/Desktop/SANSA-ML/data/train.txt", "\t", false, false, spark) @@ -37,18 +33,15 @@ object TransERun { val (train, test) = new Holdout(data.triples, 0.6f).crossValidation() - println("Trinamento:") println(train.show()) println("Teste:") println(test.show()) - //var model = new TransE(train, data.e.length, data.r.length, 100, 20, 1, "L1", spark) - //model.run() - - //val predict = new PredictTransE(model, test).ranking() - //println(predict) + // var model = new TransE(train, data.e.length, data.r.length, 100, 20, 1, "L1", spark) + // model.run() + // val predict = new PredictTransE(model, test).ranking() + // println(predict) } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/run/TriplesRun.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/run/TriplesRun.scala index 763c443..73cc080 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/run/TriplesRun.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/kge/linkprediction/run/TriplesRun.scala @@ -1,16 +1,13 @@ package net.sansa_stack.ml.spark.kge.linkprediction.run -import scala.util.Random - -import net.sansa_stack.rdf.spark.kge.triples._ import net.sansa_stack.rdf.spark.kge.convertor.ByIndex -import org.apache.spark.sql._ +import net.sansa_stack.rdf.spark.kge.triples._ import org.apache.log4j.{ Level, Logger } -import org.springframework.util.StopWatch +import org.apache.spark.sql._ -object runTesting extends App { +object TriplesRun extends App { - def printType[T](x: T): Unit = { println(x.getClass.toString()) } + def printType[T](x: T): Unit = { println(x.getClass.toString) } Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) @@ -26,27 +23,26 @@ object runTesting extends App { println("<<< STARTING >>>") - var watch: StopWatch = new StopWatch() + var startTime = System.currentTimeMillis() - watch.start() + startTime = System.currentTimeMillis() val trp = new Triples("/home/hamed/workspace/TransE/DataSets/FB15k/freebase_mtr100_mte100-train.txt", "\t", false, false, spark) - watch.stop() - println("Readin triples done in " + watch.getTotalTimeSeconds + " seconds") + println("Reading triples done in " + (System.currentTimeMillis() - startTime) + " seconds") - watch.start() + startTime = System.currentTimeMillis() var num: Long = trp.triples.count() - watch.stop() - println("\n\n No triples = " + num.toString() + " - Done in " + watch.getTotalTimeSeconds 
+ " seconds.") - watch.start() + println("\n\n No triples = " + num.toString + " - Done in " + (System.currentTimeMillis() - startTime) + " seconds.") + + startTime = System.currentTimeMillis() num = trp.getEntities().length - watch.stop() - println("\n\n No Entities = " + num.toString() + " - Done in " + watch.getTotalTimeSeconds + " seconds.") - watch.start() + println("\n\n No Entities = " + num.toString + " - Done in " + (System.currentTimeMillis() - startTime) + " seconds.") + + startTime = System.currentTimeMillis() num = trp.getRelations().length - watch.stop() - println("\n\n No Predicates = " + num.toString() + " - Done in " + watch.getTotalTimeSeconds + " seconds.") + + println("\n\n No Predicates = " + num.toString + " - Done in " + (System.currentTimeMillis() - startTime) + " seconds.") // trp.getAllDistinctEntities().take(10).foreach(println) // println("\n \n No entities = ",trp.getAllDistinctEntities().count() ) // println("\n \n No predicates = ",trp.getAllDistinctPredicates().count() ) @@ -79,10 +75,10 @@ object runTesting extends App { sample1.show() - //val r3 = conv.getTriplesByIndex(sample1) - //r3.printSchema() - //r3.show + // val r3 = conv.getTriplesByIndex(sample1) + // r3.printSchema() + // r3.show - //val r4 = conv.getTriplesByString(r3) - //println("<<< DONE >>>") + // val r4 = conv.getTriplesByString(r3) + // println("<<< DONE >>>") } diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/AbstractRDFGraph.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/AbstractRDFGraph.scala index f5a54b5..a608b50 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/AbstractRDFGraph.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/AbstractRDFGraph.scala @@ -3,6 +3,7 @@ package net.sansa_stack.ml.spark.mining.amieSpark import org.apache.jena.graph.Triple import org.apache.spark.rdd.RDD import org.apache.spark.sql.{ DataFrame, SparkSession } + import net.sansa_stack.ml.spark.mining.amieSpark._ /** diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/DfLoader.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/DfLoader.scala index 1e1674f..888b54c 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/DfLoader.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/DfLoader.scala @@ -1,7 +1,5 @@ package net.sansa_stack.ml.spark.mining.amieSpark -import net.sansa_stack.ml.spark.mining.amieSpark._ - import org.apache.spark.SparkContext import org.apache.spark.sql._ import org.apache.spark.sql.types._ @@ -18,9 +16,9 @@ object DfLoader { val startTime = System.currentTimeMillis() import sqlContext.implicits._ - /* var y = StructType(StructField("sub", StringType,false):: + /* var y = StructType(StructField("sub", StringType,false):: StructField("rel", StringType, false):: - StructField("ob", StringType, false):: Nil)*/ + StructField("ob", StringType, false):: Nil) */ val triples = sc.textFile(path, minPartitions) diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/EmptyRDFGraphDataFrame.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/EmptyRDFGraphDataFrame.scala index 44421de..dd51b98 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/EmptyRDFGraphDataFrame.scala +++ 
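The StopWatch replacement above times each phase with System.currentTimeMillis; since the difference is in milliseconds, it has to be divided by 1000 if it is reported as seconds. A small reusable helper in the same spirit (the name timed is illustrative, not part of the module):

// Run a block, print the elapsed wall-clock time in seconds, and return the block's result.
def timed[T](label: String)(block: => T): T = {
  val start = System.currentTimeMillis()
  val result = block
  println(s"$label done in ${(System.currentTimeMillis() - start) / 1000.0} seconds")
  result
}

// e.g. val numTriples = timed("Counting triples") { trp.triples.count() }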
b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/EmptyRDFGraphDataFrame.scala @@ -1,7 +1,7 @@ package net.sansa_stack.ml.spark.mining.amieSpark -import org.apache.spark.sql.types.{ StringType, StructField, StructType } import org.apache.spark.sql.{ DataFrame, Row, SQLContext } +import org.apache.spark.sql.types.{ StringType, StructField, StructType } /** * @author Lorenz Buehmann @@ -26,4 +26,4 @@ object EmptyRDFGraphDataFrame { triplesDataFrame } -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/KBObject.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/KBObject.scala index b7d8088..442a589 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/KBObject.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/KBObject.scala @@ -1,25 +1,23 @@ package net.sansa_stack.ml.spark.mining.amieSpark -import org.apache.spark.SparkContext -import org.apache.spark.sql.{ DataFrame, SQLContext } +import java.io.File import scala.collection.mutable.{ ArrayBuffer, Map } -//import net.sansa_stack.ml.spark.dissect.inference.utils._ - -import java.io.File - -import net.sansa_stack.ml.spark.mining.amieSpark.Rules.RuleContainer +import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{ DataFrame, SQLContext } import org.apache.spark.sql.functions.udf +import net.sansa_stack.ml.spark.mining.amieSpark.Rules.RuleContainer + object KBObject { case class Atom(rdf: RDFTriple) class KB() extends Serializable { var kbSrc: String = "" - var kbGraph: RDFGraph = null - var dfTable: DataFrame = null + var kbGraph: RDFGraph = _ + var dfTable: DataFrame = _ var dfMap: Map[String, DataFrame] = Map() @@ -62,7 +60,7 @@ object KBObject { } str = str.replace(" ", "_").replace("?", "_") - return str + str } def calcName(whole: ArrayBuffer[RDFTriple]): String = { @@ -83,11 +81,11 @@ object KBObject { } else { countMap += (w._3 -> 1) } - if (!(numberMap.contains(w._1))) { + if (!numberMap.contains(w._1)) { numberMap += (w._1 -> counter) counter += 1 } - if (!(numberMap.contains(w._3))) { + if (!numberMap.contains(w._3)) { numberMap += (w._3 -> counter) counter += 1 } @@ -113,28 +111,28 @@ object KBObject { out += a + "_" + wh._2 + "_" + b + "_" } out = out.stripSuffix("_") - return out + out } def getRngSize(rel: String): Double = { - return this.predicate2object2subject.get(rel).get.size + this.predicate2object2subject.get(rel).get.size } def setKbSrc(x: String) { this.kbSrc = x } - def getKbSrc(): String = { + def getKbSrc: String = { - return this.kbSrc + this.kbSrc } - def getKbGraph(): RDFGraph = { - return this.kbGraph + def getKbGraph: RDFGraph = { + this.kbGraph } - //TODO: think about Graph representation + // TODO: think about Graph representation def setKbGraph(x: RDFGraph) { this.kbGraph = x val graph = x.triples.collect @@ -163,7 +161,7 @@ object KBObject { } - return out + out } /** @@ -178,7 +176,7 @@ object KBObject { val subject = tp.subject val relation = tp.predicate val o = tp.`object` - //filling the to to to maps + // filling the to to to maps if (!(add(subject, relation, o, this.subject2predicate2object))) { add(relation, o, subject, this.predicate2object2subject) add(o, subject, relation, this.object2subject2predicate) @@ -187,7 +185,7 @@ object KBObject { add(subject, o, relation, this.subject2object2predicate) } - //filling the sizes + // filling the sizes if 
(this.subjectSize.get(subject).isEmpty) { this.subjectSize += (subject -> 1) } else { @@ -212,7 +210,7 @@ object KBObject { this.objectSize += (o -> obSize) } - //filling the overlaps + // filling the overlaps if (this.subject2subjectOverlap.get(relation).isEmpty) { subject2subjectOverlap += (relation -> Map()) @@ -242,18 +240,18 @@ object KBObject { return 0 } - return this.relationSize.get(rel).get + this.relationSize.get(rel).get } - /*TO DO + /* TODO * Functionality * bulidOverlapTable * */ def relationsSize(): Int = { - return this.relationSize.size + this.relationSize.size } /** @@ -264,7 +262,7 @@ object KBObject { var x = this.subjectSize.size var y = this.objectSize.size - return (x + y) + (x + y) } /** @@ -283,24 +281,24 @@ object KBObject { val objects2 = predicate2object2subject.get(r2).get.keys.toSet if (!r1.equals(r2)) { - var ssoverlap: Int = computeOverlap(subjects1, subjects2); - subject2subjectOverlap.get(r1).get.put(r2, ssoverlap); - subject2subjectOverlap.get(r2).get.put(r1, ssoverlap); + var ssoverlap: Int = computeOverlap(subjects1, subjects2) + subject2subjectOverlap.get(r1).get.put(r2, ssoverlap) + subject2subjectOverlap.get(r2).get.put(r1, ssoverlap) } else { - subject2subjectOverlap.get(r1).get.put(r1, subjects2.size); + subject2subjectOverlap.get(r1).get.put(r1, subjects2.size) } - var soverlap1: Int = computeOverlap(subjects1, objects2); - subject2objectOverlap.get(r1).get.put(r2, soverlap1); - var soverlap2: Int = computeOverlap(subjects2, objects1); - subject2objectOverlap.get(r2).get.put(r1, soverlap2); + var soverlap1: Int = computeOverlap(subjects1, objects2) + subject2objectOverlap.get(r1).get.put(r2, soverlap1) + var soverlap2: Int = computeOverlap(subjects2, objects1) + subject2objectOverlap.get(r2).get.put(r1, soverlap2) if (!r1.equals(r2)) { - var oooverlap: Int = computeOverlap(objects1, objects2); - object2objectOverlap.get(r1).get.put(r2, oooverlap); - object2objectOverlap.get(r2).get.put(r1, oooverlap); + var oooverlap: Int = computeOverlap(objects1, objects2) + object2objectOverlap.get(r1).get.put(r2, oooverlap) + object2objectOverlap.get(r2).get.put(r1, oooverlap) } else { - object2objectOverlap.get(r1).get.put(r1, objects2.size); + object2objectOverlap.get(r1).get.put(r1, objects2.size) } } } @@ -316,11 +314,12 @@ object KBObject { def computeOverlap(s1: Set[String], s2: Set[String]): Int = { var overlap: Int = 0 for (r <- s1) { - if (s2.contains(r)) + if (s2.contains(r)) { overlap += 1 + } } - return overlap + overlap } // --------------------------------------------------------------------------- @@ -334,13 +333,13 @@ object KBObject { * */ def functionality(relation: String): Double = { - /*if (relation.equals(EQUALSbs)) { - return 1.0;*/ + /* if (relation.equals(EQUALSbs)) { + return 1.0; */ if (this.predicate2subject2object.get(relation).isEmpty) { return 0.0 } var a: Double = this.predicate2subject2object.get(relation).get.size var b: Double = this.relationSize.get(relation).get - return (a / b) + (a / b) } @@ -351,12 +350,12 @@ object KBObject { * */ def inverseFunctionality(relation: String): Double = { - /*if (relation.equals(EQUALSbs)) { - return 1.0; - } */ + /* if (relation.equals(EQUALSbs)) { + return 1.0 + } */ var a: Double = this.predicate2object2subject.get(relation).get.size var b: Double = this.relationSize.get(relation).get - return (a / b) + (a / b) } @@ -368,7 +367,7 @@ object KBObject { * @author AMIE+ Team */ def isFunctional(relation: String): Boolean = { - return functionality(relation) >= 
inverseFunctionality(relation); + functionality(relation) >= inverseFunctionality(relation) } /** @@ -381,10 +380,11 @@ object KBObject { * */ def functionality(relation: String, inversed: Boolean): Double = { - if (inversed) - return inverseFunctionality(relation); - else - return functionality(relation); + if (inversed) { + inverseFunctionality(relation) + } else { + functionality(relation) + } } /** @@ -396,10 +396,11 @@ object KBObject { * */ def inverseFunctionality(relation: String, inversed: Boolean): Double = { - if (inversed) - return functionality(relation); - else - return inverseFunctionality(relation); + if (inversed) { + functionality(relation) + } else { + inverseFunctionality(relation) + } } /** @@ -408,28 +409,29 @@ object KBObject { * length of maplist is the number of instantiations of a rule * * @param triplesCard rule as an ArrayBuffer of RDFTriples, triplesCard(0) - * is the head of the rule + * is the head of the rule * @param sc spark context * */ - //---------------------------------------------------------------- + // ---------------------------------------------------------------- // Statistics - //---------------------------------------------------------------- + // ---------------------------------------------------------------- def overlap(relation1: String, relation2: String, overlap: Int): Double = { overlap match { - case SUBJECT2SUBJECT => if ((!(subject2subjectOverlap.get(relation1).isEmpty)) && (!(subject2subjectOverlap.get(relation1).get.get(relation2).isEmpty))) { return subject2subjectOverlap.get(relation1).get.get(relation2).get } - else return 0.0 + case SUBJECT2SUBJECT => + if (subject2subjectOverlap.get(relation1).isDefined && (!(subject2subjectOverlap.get(relation1).get.get(relation2).isEmpty))) { + subject2subjectOverlap.get(relation1).get.get(relation2).get + } else 0.0 case SUBJECT2OBJECT => - - if ((!(subject2objectOverlap.get(relation1).isEmpty)) && (!(subject2objectOverlap.get(relation1).get.get(relation2).isEmpty))) { return subject2objectOverlap.get(relation1).get.get(relation2).get } - else return 0.0 + if ((!(subject2objectOverlap.get(relation1).isEmpty)) && (!(subject2objectOverlap.get(relation1).get.get(relation2).isEmpty))) { + subject2objectOverlap.get(relation1).get.get(relation2).get + } else 0.0 case OBJECT2OBJECT => - - if ((!(object2objectOverlap.get(relation1).isEmpty)) && (!(object2objectOverlap.get(relation1).get.get(relation2).isEmpty))) { return object2objectOverlap.get(relation1).get.get(relation2).get } - else return 0.0 - + if ((!(object2objectOverlap.get(relation1).isEmpty)) && (!(object2objectOverlap.get(relation1).get.get(relation2).isEmpty))) { + object2objectOverlap.get(relation1).get.get(relation2).get + } else 0.0 } } @@ -445,16 +447,16 @@ object KBObject { def relationColumnSize(rel: String, elem: String): Int = { elem match { case "subject" => - return predicate2subject2object.get(rel).get.size + predicate2subject2object.get(rel).get.size case "object" => - return predicate2object2subject.get(rel).get.size + predicate2object2subject.get(rel).get.size } } - //TODO: better than cardinality + // TODO: better than cardinality def bindingExists(triplesCard: ArrayBuffer[RDFTriple]): Boolean = { val k = this.kbGraph @@ -478,7 +480,7 @@ object KBObject { var minSize = this.relationSize.get(triplesCard(0).predicate).get var index = 0 - for (i <- 1 to triplesCard.length - 1) { + for (i <- 1 until triplesCard.length) { if (this.relationSize.get(triplesCard(i).predicate).get < minSize) { minSize = 
this.relationSize.get(triplesCard(i).predicate).get min = triplesCard(i) @@ -500,7 +502,7 @@ object KBObject { x = k.find(None, Some(min.predicate), None).collect } - //x.foreach(println) + // x.foreach(println) triplesCard.remove(index) for (i <- x) { @@ -530,16 +532,16 @@ object KBObject { if (test) { if ((a.startsWith("?")) && ((j._1 == a) && (!(atestLeft)))) { - temp += new RDFTriple(i._1, j._2, j._3) + temp += RDFTriple(i._1, j._2, j._3) } else if ((a.startsWith("?")) && ((j._3 == a) && (!(atestRight)))) { - temp += new RDFTriple(j._1, j._2, i._1) + temp += RDFTriple(j._1, j._2, i._1) } else if ((b.startsWith("?")) && ((j._3 == b) && (!(btestRight)))) { - temp += new RDFTriple(j._1, j._2, i._3) + temp += RDFTriple(j._1, j._2, i._3) } else if ((b.startsWith("?")) && ((j._1 == b) && (!(btestLeft)))) { - temp += new RDFTriple(i._3, j._2, j._3) + temp += RDFTriple(i._3, j._2, j._3) } else if ((b.startsWith("?")) && (((j._3 == b) && (btestRight)) || ((j._1 == b) && (btestLeft)))) { exploreFurther = false } else if ((a.startsWith("?")) && (((j._1 == a) && (atestLeft)) || ((j._3 == a) && (atestRight)))) { @@ -564,12 +566,12 @@ object KBObject { } - return false + false } - def varCount(tpAr: ArrayBuffer[RDFTriple]): ArrayBuffer[Tuple2[String, String]] = { + def varCount(tpAr: ArrayBuffer[RDFTriple]): ArrayBuffer[(String, String)] = { - var out2: ArrayBuffer[Tuple2[String, String]] = new ArrayBuffer + var out2: ArrayBuffer[(String, String)] = new ArrayBuffer for (i <- tpAr) { if (!(out2.contains(Tuple2(i.subject, i.predicate)))) { @@ -582,9 +584,9 @@ object KBObject { } - return out2 + out2 } - def countProjectionQueriesDF(posit: Int, id: Int, operator: String, minHC: Double, tpAr: ArrayBuffer[RDFTriple], RXY: ArrayBuffer[Tuple2[String, String]], sc: SparkContext, sqlContext: SQLContext): DataFrame = + def countProjectionQueriesDF(posit: Int, id: Int, operator: String, minHC: Double, tpAr: ArrayBuffer[RDFTriple], RXY: ArrayBuffer[(String, String)], sc: SparkContext, sqlContext: SQLContext): DataFrame = { val threshold = minHC * this.relationSize.get(tpAr(0).predicate).get @@ -643,13 +645,13 @@ object KBObject { } - return whole + whole } def cardinalityQueries(id: Int, tpArDF: DataFrame, wholeAr: ArrayBuffer[RDFTriple], sc: SparkContext, sqlContext: SQLContext): DataFrame = { val DF = this.dfTable - var tpMap: Map[String, ArrayBuffer[Tuple2[Int, String]]] = Map() + var tpMap: Map[String, ArrayBuffer[(Int, String)]] = Map() DF.registerTempTable("table") tpArDF.registerTempTable("tpArTable") @@ -659,10 +661,10 @@ object KBObject { var v = sqlContext.sql("SELECT * FROM tpArTable JOIN newColumn") var varAr: ArrayBuffer[String] = new ArrayBuffer - var checkMap: Map[Int, Tuple2[String, String]] = Map() + var checkMap: Map[Int, (String, String)] = Map() var checkSQLSELECT = "SELECT " - for (i <- 0 to wholeAr.length - 1) { + for (i <- wholeAr.indices) { var a = wholeAr(i).subject var b = wholeAr(i)._3 @@ -693,7 +695,7 @@ object KBObject { var cloneTpAr = wholeAr.clone() - var removedMap: Map[String, ArrayBuffer[Tuple2[Int, String]]] = Map() + var removedMap: Map[String, ArrayBuffer[(Int, String)]] = Map() varAr = varAr.distinct var checkSQLWHERE = "WHERE " @@ -732,7 +734,7 @@ object KBObject { } checkSQLWHERE = checkSQLWHERE.stripSuffix(" AND ") - var seq: Seq[String] = Seq((wholeAr.last.toString() + " " + id.toString())) + var seq: Seq[String] = Seq((wholeAr.last.toString() + " " + id.toString)) import sqlContext.implicits._ var key: DataFrame = seq.toDF("key") @@ -745,7 +747,7 @@ object KBObject { 
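For reference, the functionality and inverseFunctionality methods reworked earlier in this file compute the AMIE measures: the fraction of a relation's facts that have a distinct subject, respectively a distinct object. A stand-alone sketch over an in-memory list of (subject, predicate, object) facts:

// functionality(r)        = #distinct subjects of r / #facts of r
// inverseFunctionality(r) = #distinct objects of r  / #facts of r
def functionality(facts: Seq[(String, String, String)], relation: String): Double = {
  val ofRel = facts.filter(_._2 == relation)
  if (ofRel.isEmpty) 0.0 else ofRel.map(_._1).distinct.size.toDouble / ofRel.size
}

def inverseFunctionality(facts: Seq[(String, String, String)], relation: String): Double = {
  val ofRel = facts.filter(_._2 == relation)
  if (ofRel.isEmpty) 0.0 else ofRel.map(_._3).distinct.size.toDouble / ofRel.size
}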
key.registerTempTable("keyTable") var out = sqlContext.sql(checkSQLSELECT + ", keyTable.key FROM lastTable JOIN keyTable") - return out + out } @@ -754,18 +756,19 @@ object KBObject { */ def cardinality(tpAr: ArrayBuffer[RDFTriple], sc: SparkContext, sqlContext: SQLContext): DataFrame = { + println(s"computing cardinality for ${tpAr.mkString(",")} ...") var name = calcName(tpAr) if (dfMap.contains(name)) { - return dfMap.get(name).get + dfMap.get(name).get } else { val DF = this.dfTable - var tpMap: Map[String, ArrayBuffer[Tuple2[Int, String]]] = Map() + var tpMap: Map[String, ArrayBuffer[(Int, String)]] = Map() DF.registerTempTable("table") var v = sqlContext.sql("SELECT rdf AS tp0 FROM table WHERE rdf.predicate = '" + tpAr(0).predicate + "'") - for (k <- 1 to tpAr.length - 1) { + for (k <- 1 until tpAr.length) { var w = sqlContext.sql("SELECT rdf AS tp" + k + " FROM table WHERE rdf.predicate = '" + tpAr(k).predicate + "'") w.registerTempTable("newColumn") @@ -773,7 +776,7 @@ object KBObject { tempO.registerTempTable("previous") var sqlString = "" - for (re <- 0 to k - 1) { + for (re <- 0 until k) { sqlString += "previous.tp" + re + ", " } @@ -782,10 +785,10 @@ object KBObject { } var varAr: ArrayBuffer[String] = new ArrayBuffer - var checkMap: Map[Int, Tuple2[String, String]] = Map() + var checkMap: Map[Int, (String, String)] = Map() var checkSQLSELECT = "SELECT " - for (i <- 0 to tpAr.length - 1) { + for (i <- tpAr.indices) { var a = tpAr(i).subject var b = tpAr(i)._3 @@ -816,7 +819,7 @@ object KBObject { var cloneTpAr = tpAr.clone() - var removedMap: Map[String, ArrayBuffer[Tuple2[Int, String]]] = Map() + var removedMap: Map[String, ArrayBuffer[(Int, String)]] = Map() varAr = varAr.distinct var checkSQLWHERE = "WHERE " @@ -856,8 +859,9 @@ object KBObject { checkSQLWHERE = checkSQLWHERE.stripSuffix(" AND ") v.registerTempTable("t") + println(checkSQLSELECT + " FROM t " + checkSQLWHERE) var out = sqlContext.sql(checkSQLSELECT + " FROM t " + checkSQLWHERE) - return out + out } } @@ -869,7 +873,7 @@ object KBObject { var go = false var outCount: Double = 0.0 var tpsString = calcName(tpAr) - for (i <- 1 to tpAr.length - 1) { + for (i <- 1 until tpAr.length) { if ((tpAr(i)._1 == "?a") || (tpAr(i)._3 == "?a")) { go = true } @@ -879,7 +883,7 @@ object KBObject { return outCount } - if (go) { + if ( go ) { var card = dfMap.get(tpsString).get @@ -905,33 +909,23 @@ object KBObject { h.registerTempTable("subjects") out = sqlContext.sql("SELECT twoLengthT.tp0 FROM twoLengthT JOIN subjects ON twoLengthT.tp0." 
+ abString + "=subjects.sub") - /* - if ((tpAr(0).predicate == "directed")&&(tpAr(1).predicate== "produced")&&(tpAr(1).subject== "?a")&&(tpAr(1)._3== "?b")){ - h.show(800, false) - - var fjgf = sqlContext.sql("SELECT ") - } - - - */ - } outCount = out.count() } - return outCount + outCount } def negatveExampleBuilder(subjects: DataFrame, wholeAr: ArrayBuffer[RDFTriple], sc: SparkContext, sqlContext: SQLContext): DataFrame = { val DF = this.dfTable - var tpMap: Map[String, ArrayBuffer[Tuple2[Int, String]]] = Map() + var tpMap: Map[String, ArrayBuffer[(Int, String)]] = Map() DF.registerTempTable("table") var wholeTPARBackup = wholeAr.clone() wholeAr.remove(0) var complete = sqlContext.sql("SELECT rdf AS tp" + 0 + " FROM table WHERE rdf.predicate = '" + (wholeAr(0)).predicate + "'") - for (i <- 1 to wholeAr.length - 1) { + for (i <- 1 until wholeAr.length) { var w = sqlContext.sql("SELECT rdf AS tp" + i + " FROM table WHERE rdf.predicate = '" + (wholeAr(i)).predicate + "'") w.registerTempTable("newColumn") @@ -940,12 +934,12 @@ object KBObject { } var varAr: ArrayBuffer[String] = new ArrayBuffer - var checkMap: Map[Int, Tuple2[String, String]] = Map() + var checkMap: Map[Int, (String, String)] = Map() var checkSQLSELECT = "SELECT " var abString = ("", "") - for (i <- 0 to wholeAr.length - 1) { + for (i <- wholeAr.indices) { var a = wholeAr(i).subject var b = wholeAr(i)._3 @@ -984,7 +978,7 @@ object KBObject { var cloneTpAr = wholeAr.clone() - var removedMap: Map[String, ArrayBuffer[Tuple2[Int, String]]] = Map() + var removedMap: Map[String, ArrayBuffer[(Int, String)]] = Map() varAr = varAr.distinct var checkSQLWHERE = "WHERE " @@ -1032,11 +1026,11 @@ object KBObject { var out = sqlContext.sql(checkSQLSELECT + " FROM lastTable JOIN keyTable ON lastTable." + abString._2 + "." 
+ abString._1 + "=keyTable.sub") - return out + out } - //TODO: solve with DataFrames + // TODO: solve with DataFrames def cardPlusnegativeExamplesLength(triplesCard: ArrayBuffer[RDFTriple], sc: SparkContext): Double = { val k = this.kbGraph @@ -1058,7 +1052,7 @@ object KBObject { } - /**initializing maplist with head of the rule*/ + /** initializing maplist with head of the rule */ for (ii <- arbuf(0).collect()) { mapList += Map(triplesCard(0).subject -> ii._1, triplesCard(0).`object` -> ii._3) @@ -1066,9 +1060,9 @@ object KBObject { var temp = mapList.clone() - for (tripleCount <- 1 to triplesCard.length - 1) { + for (tripleCount <- 1 until triplesCard.length) { - val rdd1 = sc.parallelize(mapList.toSeq) + val rdd1 = sc.parallelize(mapList) val rdd2 = arbuf(tripleCount) val comb = rdd1.cartesian(rdd2) // cartesian() to get every possible combination @@ -1084,17 +1078,17 @@ object KBObject { for (i <- combinations) { var ltrip = i._2 - var elem1 = ltrip._1 //subject from combination + var elem1 = ltrip._1 // subject from combination var elem2 = ltrip._3 var trip1 = triplesCard(tripleCount)._1 // subject from Rule var trip2 = triplesCard(tripleCount)._3 - /**checking map for placeholder for the subject*/ + /** checking map for placeholder for the subject */ if (!(i._1.contains(trip1))) { i._1 += (trip1 -> elem1) } - /**checking map for placeholder for the object*/ + /** checking map for placeholder for the object */ if (!(i._1.contains(trip2))) { i._1 += (trip2 -> elem2) } @@ -1106,9 +1100,9 @@ object KBObject { } } - var rightOnes = sc.parallelize(mapList.toSeq).map(y => y.get(triplesCard(0).subject).get).distinct.collect + var rightOnes = sc.parallelize(mapList).map(y => y.get(triplesCard(0).subject).get).distinct.collect - var as = sc.parallelize(temp.toSeq).map { + var as = sc.parallelize(temp).map { x => (x.get(triplesCard(0).subject).get, 1) @@ -1116,19 +1110,17 @@ object KBObject { var out: Double = 0.0 for (i <- as) { - if (rightOnes.contains(i._1)) + if (rightOnes.contains(i._1)) { out += (i._2 - 1) - + } } - - return ((mapList.length) + out) - + ((mapList.length) + out) } def addDanglingAtom(c: Int, id: Int, minHC: Double, rule: RuleContainer, sc: SparkContext, sqlContext: SQLContext): DataFrame = { val tpAr = rule.getRule() - var RXY: ArrayBuffer[Tuple2[String, String]] = new ArrayBuffer + var RXY: ArrayBuffer[(String, String)] = new ArrayBuffer val notC = rule.notClosed() @@ -1148,13 +1140,13 @@ object KBObject { var x = this.countProjectionQueriesDF(c, id, "OD", minHC, tpAr, RXY, sc, sqlContext) - return x + x } def addClosingAtom(c: Int, id: Int, minHC: Double, rule: RuleContainer, sc: SparkContext, sqlContext: SQLContext): DataFrame = { val tpAr = rule.getRule() - var RXY: ArrayBuffer[Tuple2[String, String]] = new ArrayBuffer + var RXY: ArrayBuffer[(String, String)] = new ArrayBuffer val notC = rule.notClosed() @@ -1190,9 +1182,8 @@ object KBObject { } var x = this.countProjectionQueriesDF(c, id, "OC", minHC, tpAr, RXY, sc, sqlContext) - return x + x } } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/MineRules.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/MineRules.scala index 5f15515..193a760 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/MineRules.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/MineRules.scala @@ -3,23 +3,23 @@ package net.sansa_stack.ml.spark.mining.amieSpark import 
java.io.File import java.net.URI -import net.sansa_stack.ml.spark.mining.amieSpark.KBObject.KB -import net.sansa_stack.ml.spark.mining.amieSpark.Rules.RuleContainer -import org.apache.spark.SparkContext -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{ DataFrame, SQLContext, SparkSession, _ } - import scala.collection.mutable.{ ArrayBuffer, Map } import scala.util.Try import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.fs.Path +import org.apache.spark.SparkContext +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{ DataFrame, SparkSession, SQLContext, _ } import net.sansa_stack.ml.spark.mining.amieSpark.DfLoader.Atom +import net.sansa_stack.ml.spark.mining.amieSpark.KBObject.KB +import net.sansa_stack.ml.spark.mining.amieSpark.Rules.RuleContainer + object MineRules { /** - * Algorithm that mines the Rules. + * Algorithm that mines the Rules. * * @param kb object knowledge base that was created in main * @param minHC threshold on head coverage @@ -55,11 +55,11 @@ object MineRules { } else { countMap += (w._3 -> 1) } - if (!(numberMap.contains(w._1))) { + if (!numberMap.contains(w._1)) { numberMap += (w._1 -> counter) counter += 1 } - if (!(numberMap.contains(w._3))) { + if (!numberMap.contains(w._3)) { numberMap += (w._3 -> counter) counter += 1 } @@ -85,15 +85,16 @@ object MineRules { out += a + "_" + wh._2 + "_" + b + "_" } out = out.stripSuffix("_") - return out + out } def ruleMining(sc: SparkContext, sqlContext: SQLContext): ArrayBuffer[RuleContainer] = { - var predicates = kb.getKbGraph().triples.map { x => x.predicate + var predicates = kb.getKbGraph.triples.map { x => x.predicate }.distinct var z = predicates.collect() + println(s"#predicates:$z.length") /** * q is a queue with one atom rules @@ -119,7 +120,7 @@ object MineRules { var out: ArrayBuffer[RuleContainer] = new ArrayBuffer var dublicate: ArrayBuffer[String] = ArrayBuffer("") - for (i <- 0 to this.maxLen - 1) { + for (i <- 0 until this.maxLen) { if ((i > 0) && (dataFrameRuleParts != null)) { var temp = q.clone @@ -147,14 +148,14 @@ object MineRules { var dubCheck = fstTp - for (i <- 1 to newTpArr.length - 1) { + for (i <- 1 until newTpArr.length) { var temp = newTpArr(i).toString dubCheck += sortedNewTpArr(i).toString if (temp == fstTp) { counter += 1 } } - if ((counter < newTpArr.length) && (!(dublicate.contains(dubCheck)))) { + if ((counter < newTpArr.length) && (!dublicate.contains(dubCheck))) { dublicate += dubCheck newRuleC.setRule(minConf, n1._2, parent, newTpArr, sortedNewTpArr, kb, sc, sqlContext) q += newRuleC @@ -162,12 +163,12 @@ object MineRules { } - } else if ((i > 0) && ((dataFrameRuleParts == null) || (dataFrameRuleParts.isEmpty()))) { + } else if ((i > 0) && ((dataFrameRuleParts == null) || dataFrameRuleParts.isEmpty())) { q = new ArrayBuffer } - if ((!q.isEmpty)) { - for (j <- 0 to q.length - 1) { + if (q.nonEmpty) { + for (j <- q.indices) { val r: RuleContainer = q(j) @@ -180,11 +181,11 @@ object MineRules { if (acceptedForOutput(outMap, r, minConf, kb, sc, sqlContext)) { out += r - if (!(outMap.contains(tp(0).predicate))) { + if (!outMap.contains(tp(0).predicate)) { outMap += (tp(0).predicate -> ArrayBuffer((tp, r))) } else { var temp: ArrayBuffer[(ArrayBuffer[RDFTriple], RuleContainer)] = outMap.remove(tp(0).predicate).get - temp += new Tuple2(tp, r) + temp += Tuple2(tp, r) outMap += (tp(0).predicate -> temp) } @@ -195,7 +196,7 @@ object MineRules { if (r.getRule().length < maxLen) { dataFrameRuleParts = refine(i, j, r, dataFrameRuleParts, sc, sqlContext) - //TODO: 
Dublicate check + // TODO: Dublicate check } @@ -204,7 +205,7 @@ object MineRules { } - return out + out } /** @@ -219,14 +220,14 @@ object MineRules { var out: DataFrame = null var OUT: RDD[(RDFTriple, Int, Int)] = dataFrameRuleParts - //var count2:RDD[(String, Int)] = null + // var count2:RDD[(String, Int)] = null var path = new File("test_table/") var temp = 0 val tpAr = r.getRule() var stringSELECT = "" - for (tp <- 0 to tpAr.length - 1) { + for (tp <- tpAr.indices) { stringSELECT += "tp" + tp + ", " @@ -239,7 +240,7 @@ object MineRules { var a = kb.addDanglingAtom(c, id, minHC, r, sc, sqlContext) z = Try(a.first()) - if ((!(z.isFailure)) && (z.isSuccess)) { + if ((!z.isFailure) && z.isSuccess) { out = a @@ -251,7 +252,7 @@ object MineRules { var t = Try(b.first) - if ((!(t.isFailure)) && (t.isSuccess) && (temp == 0)) { + if ((!t.isFailure) && t.isSuccess && (temp == 0)) { if (out == null) { out = b @@ -265,12 +266,12 @@ object MineRules { var count: RDD[(String, Int)] = null var o: RDD[(RDFTriple, Int, Int)] = null - if (((!(t.isFailure)) && (t.isSuccess)) || ((z != null) && (!(z.isFailure)) && (z.isSuccess))) { - count = out.rdd.map(x => (x(r.getRule().length + 1).toString(), 1)).reduceByKey(_ + _) + if (((!t.isFailure) && t.isSuccess) || ((z != null) && (!z.isFailure) && z.isSuccess)) { + count = out.rdd.map(x => (x(r.getRule().length + 1).toString, 1)).reduceByKey(_ + _) o = count.map(q => (q._1.split("\\s+"), q._2)).map { token => Tuple3(RDFTriple(token._1(0), token._1(1), token._1(2)), token._2, token._1(3).toInt) - }.filter(n1 => (n1._2 >= (kb.getRngSize(n1._1.predicate) * minHC))) + }.filter(n1 => n1._2 >= (kb.getRngSize(n1._1.predicate) * minHC)) if (OUT == null) { OUT = o @@ -280,7 +281,7 @@ object MineRules { } - return OUT + OUT } @@ -294,14 +295,14 @@ object MineRules { */ def acceptedForOutput(outMap: Map[String, ArrayBuffer[(ArrayBuffer[RDFTriple], RuleContainer)]], r: RuleContainer, minConf: Double, k: KB, sc: SparkContext, sqlContext: SQLContext): Boolean = { - //if ((!(r.closed())) || (r.getPcaConfidence(k, sc, sqlContext) < minConf)) { - if ((!(r.closed())) || (r.getPcaConfidence() < minConf)) { + // if ((!(r.closed())) || (r.getPcaConfidence(k, sc, sqlContext) < minConf)) { + if ((!r.closed()) || (r.getPcaConfidence() < minConf)) { return false } var parents: ArrayBuffer[RuleContainer] = r.parentsOfRule(outMap, sc) - if (r.getRule.length > 2) { + if (r.getRule().length > 2) { for (rp <- parents) { if (r.getPcaConfidence() <= rp.getPcaConfidence()) { return false @@ -310,14 +311,14 @@ object MineRules { } } - return true + true } def sort(tp: ArrayBuffer[RDFTriple]): ArrayBuffer[RDFTriple] = { var out = ArrayBuffer(tp(0)) - var temp = new ArrayBuffer[Tuple2[String, RDFTriple]] + var temp = new ArrayBuffer[(String, RDFTriple)] - for (i <- 1 to tp.length - 1) { + for (i <- 1 until tp.length) { var tempString: String = tp(i).predicate + tp(i).subject + tp(i).`object` temp += Tuple2(tempString, tp(i)) @@ -327,63 +328,8 @@ object MineRules { out += t._2 } - return out + out } } - - def main(args: Array[String]) = { - val know = new KB() - - val sparkSession = SparkSession.builder - - .master("local[*]") - .appName("AMIESpark example") - - .getOrCreate() - - if (args.length < 2) { - System.err.println( - "Usage: Triple reader ") - System.exit(1) - } - - val input = args(0) - val outputPath: String = args(1) - val hdfsPath: String = outputPath + "/" - - val sc = sparkSession.sparkContext - val sqlContext = new org.apache.spark.sql.SQLContext(sc) - - 
know.sethdfsPath(hdfsPath) - know.setKbSrc(input) - - know.setKbGraph(RDFGraphLoader.loadFromFile(know.getKbSrc(), sc, 2)) - know.setDFTable(DfLoader.loadFromFileDF(know.getKbSrc, sc, sqlContext, 2)) - - val algo = new Algorithm(know, 0.01, 3, 0.1, hdfsPath) - - var output = algo.ruleMining(sc, sqlContext) - - var outString = output.map { x => - var rdfTrp = x.getRule() - var temp = "" - for (i <- 0 to rdfTrp.length - 1) { - if (i == 0) { - temp = rdfTrp(i) + " <= " - } else { - temp += rdfTrp(i) + " \u2227 " - } - } - temp = temp.stripSuffix(" \u2227 ") - temp - }.toSeq - var rddOut = sc.parallelize(outString) - - rddOut.saveAsTextFile(outputPath + "/testOut") - - sc.stop - - } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFGraph.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFGraph.scala index beaf08c..c0f8936 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFGraph.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFGraph.scala @@ -1,8 +1,9 @@ package net.sansa_stack.ml.spark.mining.amieSpark import org.apache.spark.rdd.RDD -import org.apache.spark.sql.types.{ StringType, StructField, StructType } import org.apache.spark.sql.{ DataFrame, Row, SQLContext } +import org.apache.spark.sql.types.{ StringType, StructField, StructType } + import net.sansa_stack.ml.spark.mining.amieSpark._ /** @@ -40,7 +41,7 @@ case class RDFGraph(triples: RDD[RDFTriple]) { /** * Persist the triples RDD with the default storage level (`MEMORY_ONLY`). */ - def cache() = { + def cache(): RDFGraph = { triples.cache() this } @@ -49,7 +50,7 @@ case class RDFGraph(triples: RDD[RDFTriple]) { * Return the number of triples. * @return the number of triples */ - def size() = { + def size(): Long = { triples.count() } diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFGraphDataFrame.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFGraphDataFrame.scala index a773da2..aa9f6d0 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFGraphDataFrame.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFGraphDataFrame.scala @@ -1,8 +1,11 @@ package net.sansa_stack.ml.spark.mining.amieSpark import org.apache.jena.graph.Triple +import org.apache.spark.rdd.RDD import org.apache.spark.sql.{ DataFrame, SparkSession } + import net.sansa_stack.ml.spark.mining.amieSpark._ + /** * A data structure that comprises a set of triples. 
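With the main method dropped from MineRules above, the mining pipeline is driven by the caller. A hedged sketch of that flow, following the removed driver: the thresholds correspond to minHC (head coverage, support divided by the size of the head relation), maxLen and minConf (PCA confidence); the input and output paths are placeholders.

import org.apache.spark.sql.SparkSession
import net.sansa_stack.ml.spark.mining.amieSpark.{ DfLoader, RDFGraphLoader }
import net.sansa_stack.ml.spark.mining.amieSpark.KBObject.KB
import net.sansa_stack.ml.spark.mining.amieSpark.MineRules.Algorithm

val spark = SparkSession.builder.master("local[*]").appName("AMIE rule mining").getOrCreate()
val sc = spark.sparkContext
val sqlContext = new org.apache.spark.sql.SQLContext(sc)

val know = new KB()
know.sethdfsPath("output/")    // placeholder output directory
know.setKbSrc("data/input.nt") // placeholder input file
know.setKbGraph(RDFGraphLoader.loadFromFile(know.getKbSrc, sc, 2))
know.setDFTable(DfLoader.loadFromFileDF(know.getKbSrc, sc, sqlContext, 2))

// minHC = 0.01, maxLen = 3, minConf = 0.1, the values used by the removed driver
val rules = new Algorithm(know, 0.01, 3, 0.1, "output/").ruleMining(sc, sqlContext)
rules.map { r =>
  val tps = r.getRule()
  tps.head + " <= " + tps.tail.mkString(" \u2227 ")
}.foreach(println)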
* @@ -65,7 +68,7 @@ class RDFGraphDataFrame(triples: DataFrame) extends AbstractRDFGraph[DataFrame, this } - def distinct() = { + def distinct(): RDFGraphDataFrame = { new RDFGraphDataFrame(triples.distinct()) } @@ -74,11 +77,11 @@ class RDFGraphDataFrame(triples: DataFrame) extends AbstractRDFGraph[DataFrame, * * @return the number of triples */ - def size() = { + def size(): Long = { triples.count() } def toDataFrame(sparkSession: SparkSession): DataFrame = triples - def toRDD() = triples.rdd.map(row => RDFTriple(row.getString(0), row.getString(1), row.getString(2))) + def toRDD(): RDD[RDFTriple] = triples.rdd.map(row => RDFTriple(row.getString(0), row.getString(1), row.getString(2))) } diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFGraphLoader.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFGraphLoader.scala index 7dc355f..81278c1 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFGraphLoader.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFGraphLoader.scala @@ -4,6 +4,7 @@ import org.apache.spark.SparkContext import org.apache.spark.sql._ import org.apache.spark.sql.types._ import org.slf4j.LoggerFactory + import net.sansa_stack.ml.spark.mining.amieSpark._ /** @@ -14,7 +15,7 @@ import net.sansa_stack.ml.spark.mining.amieSpark._ */ object RDFGraphLoader { - //private val logger = com.typesafe.scalalogging.slf4j.Logger(LoggerFactory.getLogger(this.getClass.getName)) + // private val logger = com.typesafe.scalalogging.slf4j.Logger(LoggerFactory.getLogger(this.getClass.getName)) private val logger = LoggerFactory.getLogger(this.getClass.getName) def loadFromFile(path: String, sc: SparkContext, minPartitions: Int = 2): RDFGraph = { diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFGraphNative.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFGraphNative.scala index 7bcf965..f979825 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFGraphNative.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFGraphNative.scala @@ -2,45 +2,44 @@ package net.sansa_stack.ml.spark.mining.amieSpark import org.apache.jena.graph.Triple import org.apache.spark.rdd.RDD -import org.apache.spark.sql.types.{StringType, StructField, StructType} -import org.apache.spark.sql.{DataFrame, Row, SparkSession} +import org.apache.spark.sql.{ DataFrame, Row, SparkSession } +import org.apache.spark.sql.types.{ StringType, StructField, StructType } + import net.sansa_stack.ml.spark.mining.amieSpark._ /** - * A data structure that comprises a set of triples. - * - * @author Lorenz Buehmann - * - */ -class RDFGraphNative(val triples: RDD[RDFTriple]) extends AbstractRDFGraph[RDD[RDFTriple], RDFGraphNative](triples){ + * A data structure that comprises a set of triples. + * + * @author Lorenz Buehmann + * + */ +class RDFGraphNative(val triples: RDD[RDFTriple]) extends AbstractRDFGraph[RDD[RDFTriple], RDFGraphNative](triples) { /** - * Returns an RDD of triples that match with the given input. 
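The Option-based find above is a small triple-pattern matcher: None acts as a wildcard, Some(value) as an exact filter on that position. A usage sketch, assuming an RDD[RDFTriple] is already at hand (the predicate IRI is only an example):

import org.apache.spark.rdd.RDD
import net.sansa_stack.ml.spark.mining.amieSpark.{ RDFGraphNative, RDFTriple }

// All triples with the given predicate, regardless of subject and object.
def findByPredicate(triples: RDD[RDFTriple], predicate: String): RDD[RDFTriple] =
  new RDFGraphNative(triples).find(None, Some(predicate), None)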
- * - * @param s the subject - * @param p the predicate - * @param o the object - * @return RDD of triples - */ - def find (s: Option[String] = None, p: Option[String] = None, o: Option[String] = None): RDD[RDFTriple]= { - triples.filter(t => - (s == None || t.subject == s.get) && - (p == None || t.predicate == p.get) && - (o == None || t.`object` == o.get) - ) + * Returns an RDD of triples that match with the given input. + * + * @param s the subject + * @param p the predicate + * @param o the object + * @return RDD of triples + */ + def find(s: Option[String] = None, p: Option[String] = None, o: Option[String] = None): RDD[RDFTriple] = { + triples.filter(t => + (s == None || t.subject == s.get) && + (p == None || t.predicate == p.get) && + (o == None || t.`object` == o.get)) } /** - * Returns an RDD of triples that match with the given input. - * - * @return RDD of triples - */ + * Returns an RDD of triples that match with the given input. + * + * @return RDD of triples + */ def find(triple: Triple): RDD[RDFTriple] = { find( if (triple.getSubject.isVariable) None else Option(triple.getSubject.toString), if (triple.getPredicate.isVariable) None else Option(triple.getPredicate.toString), - if (triple.getObject.isVariable) None else Option(triple.getObject.toString) - ) + if (triple.getObject.isVariable) None else Option(triple.getObject.toString)) } def union(graph: RDFGraphNative): RDFGraphNative = { @@ -52,20 +51,20 @@ class RDFGraphNative(val triples: RDD[RDFTriple]) extends AbstractRDFGraph[RDD[R this } - def distinct() = { + def distinct(): RDFGraphNative = { new RDFGraphNative(triples.distinct()) } /** - * Return the number of triples. - * - * @return the number of triples - */ - def size() = { + * Return the number of triples. + * + * @return the number of triples + */ + def size(): Long = { triples.count() } - def toRDD() = triples + def toRDD(): RDD[RDFTriple] = triples def toDataFrame(sparkSession: SparkSession): DataFrame = { // convert RDD to DataFrame diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFTriple.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFTriple.scala index e70cc27..c5753a4 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFTriple.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/RDFTriple.scala @@ -1,6 +1,5 @@ - - package net.sansa_stack.ml.spark.mining.amieSpark + /** * An RDF triple. 
* @@ -11,9 +10,9 @@ case class RDFTriple(subject: String, predicate: String, `object`: String) exten override def _2: String = predicate override def _3: String = `object` - def s = subject - def p = predicate - def o = `object` + def s: String = subject + def p: String = predicate + def o: String = `object` - override def toString = subject + " " + predicate + " " + `object` + override def toString: String = subject + " " + predicate + " " + `object` } diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/Rules.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/Rules.scala index 9ee008f..c2d0b28 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/Rules.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/Rules.scala @@ -1,19 +1,15 @@ package net.sansa_stack.ml.spark.mining.amieSpark -import net.sansa_stack.ml.spark.mining.amieSpark._ - -import org.apache.spark.SparkConf -import org.apache.spark.SparkContext +import scala.collection.mutable.{ ArrayBuffer, Map } -import org.apache.spark.sql.DataFrame +import KBObject.KB +import org.apache.spark.{ SparkConf, SparkContext } +import org.apache.spark.rdd.RDD import org.apache.spark.sql._ +import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types._ -import org.apache.spark.rdd.RDD - -import scala.collection.mutable.ArrayBuffer -import KBObject.KB -import scala.collection.mutable.Map +import net.sansa_stack.ml.spark.mining.amieSpark._ object Rules { @@ -49,7 +45,7 @@ object Rules { return this.sortedRule } - /**initializes rule, support, bodySize and sizeHead*/ + /** initializes rule, support, bodySize and sizeHead */ def initRule(x: ArrayBuffer[RDFTriple], k: KB, sc: SparkContext, sqlContext: SQLContext) { this.rule = x @@ -78,7 +74,7 @@ object Rules { this.rule = tp } - /**returns ArrayBuffer with every triplePattern of the body as a RDFTriple*/ + /** returns ArrayBuffer with every triplePattern of the body as a RDFTriple */ def hc(): Double = { if (this.bodySize < 1) { @@ -134,15 +130,17 @@ object Rules { */ def calcSupport(k: KB, sc: SparkContext, sqlContext: SQLContext) { + println(s"computing support for rule $rule ...") if (this.rule.length > 1) { val mapList = k.cardinality(this.rule, sc, sqlContext) this.support = mapList.count() + println(s"#support($rule):$support") } } - /**returns the length of the body*/ + /** returns the length of the body */ def bodyLength(): Int = { var x = this.rule.length - 1 @@ -168,7 +166,7 @@ object Rules { maptp += (x._1 -> counter) } - /**checking map for placeholder for the object*/ + /** checking map for placeholder for the object */ if (!(maptp.contains(x._3))) { maptp += (x._3 -> 1) } else { @@ -262,29 +260,30 @@ object Rules { } def parentsOfRule(outMap: Map[String, ArrayBuffer[(ArrayBuffer[RDFTriple], RuleContainer)]], sc: SparkContext): ArrayBuffer[RuleContainer] = { - // TODO: create new rules with body in alphabetical order + // TODO: create new rules with body in alphabetical order var parents = ArrayBuffer(this.parent) val r = this.sortedRule.clone - if (outMap.get(r(0).predicate) == None) { return parents } - var rel = outMap.get(r(0).predicate).get + if (outMap.get(r(0).predicate) == None) { + return parents + } + var rel = outMap.get(r(0).predicate).get var tp: ArrayBuffer[RDFTriple] = new ArrayBuffer - var filtered = rel.filter(x => (x._1.length == r.length - 1)) - /* - for (f <- filtered){ + /* + for (f <- filtered) { var bool = true - for (ff <- f._1){ - 
if (!(r.contains(ff))){ + for (ff <- f._1) { + if (!(r.contains(ff))) { bool = false } } - if (bool){ + if (bool) { parents += f._2 } - }*/ + } */ for (l <- 1 to r.length - 1) { if (!(filtered.isEmpty)) { @@ -319,7 +318,7 @@ object Rules { maptp.put(x._1, (maptp.get(x._1).get + 1)).get } - /**checking map for placeholder for the object*/ + /** checking map for placeholder for the object */ if (!(maptp.contains(x._3))) { maptp += (x._3 -> 1) @@ -359,7 +358,7 @@ object Rules { maptp.put(x._1, (maptp.get(x._1).get + 1)).get } - /**checking map for placeholder for the object*/ + /** checking map for placeholder for the object */ if (!(maptp.contains(x._3))) { varArBuff += x._3 @@ -398,13 +397,11 @@ object Rules { var out2: ArrayBuffer[(ArrayBuffer[RDFTriple], RuleContainer)] = new ArrayBuffer var out: (ArrayBuffer[RuleContainer], ArrayBuffer[(ArrayBuffer[RDFTriple], RuleContainer)]) = (out1, out2) - if (triples.length <= 1) { - return out - } - var triplesCardcombis: ArrayBuffer[ArrayBuffer[RDFTriple]] = new ArrayBuffer + if (triples.length <= 1) return out - //var rdd =sc.parallelize(arbuf.toSeq) - //out ++= rdd.filter(x => (sameRule(triples, x._1))).map(y => y._2).collect + var triplesCardcombis: ArrayBuffer[ArrayBuffer[RDFTriple]] = new ArrayBuffer + // var rdd =sc.parallelize(arbuf.toSeq) + // out ++= rdd.filter(x => (sameRule(triples, x._1))).map(y => y._2).collect for (x <- arbuf) { if (sameRule(triples, x._1)) { @@ -415,13 +412,13 @@ object Rules { /* var rdd = sc.parallelize(arbuf.toSeq) var pq = rdd.map{ x=> - if (sameRule(triples, x._1)){ - ("out1", x._2) - + if (sameRule(triples, x._1)) { + ("out1", x._2) + } else ("out2",x) }.groupByKey() - * + * */ out = (out1, out2) @@ -492,8 +489,6 @@ object Rules { return out } - //end - + // end } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/SQLSchema.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/SQLSchema.scala index 9a108d9..11808e9 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/SQLSchema.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/SQLSchema.scala @@ -1,6 +1,5 @@ - - package net.sansa_stack.ml.spark.mining.amieSpark + /** * The SQL schema used for RDF triples in a Dataframe. 
* @@ -8,12 +7,12 @@ package net.sansa_stack.ml.spark.mining.amieSpark */ object SQLSchema { - def triplesTable = "TRIPLES" + def triplesTable: String = "TRIPLES" - def subjectCol = "subject" + def subjectCol: String = "subject" - def predicateCol = "predicate" + def predicateCol: String = "predicate" - def objectCol = "object" + def objectCol: String = "object" } diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/amieExample.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/amieExample.scala deleted file mode 100644 index 4f91dad..0000000 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/mining/amieSpark/amieExample.scala +++ /dev/null @@ -1,87 +0,0 @@ -package net.sansa_stack.ml.spark.mining.amieSpark - -import org.apache.spark.SparkContext -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, SQLContext, SparkSession, _} -import net.sansa_stack.ml.spark.mining.amieSpark.KBObject.KB -import net.sansa_stack.ml.spark.mining.amieSpark.MineRules.Algorithm - -import org.apache.hadoop.fs.FileSystem -import org.apache.hadoop.fs.Path -import java.net.URI - - -import java.io.File - -object amieExample { - - def main(args: Array[String]) = { - - - - - - val know = new KB() - - val sparkSession = SparkSession.builder - - .master("spark://172.18.160.16:3077") - .appName("SPARK Reasoning") - .config("spark.sql.warehouse.dir", "file:///data/home/MohamedMami/spark-2.1.0-bin-hadoop2.7/bin/spark-warehouse") - - - .getOrCreate() - - - val hdfsPath:String = args(0) - - val outputPath =hdfsPath - val inputFile = hdfsPath + args(1) - - - - - val sc = sparkSession.sparkContext - - val sqlContext = new org.apache.spark.sql.SQLContext(sc) - - - know.sethdfsPath(hdfsPath) - know.setKbSrc(inputFile) - - know.setKbGraph(RDFGraphLoader.loadFromFile(know.getKbSrc(), sc, 2)) - know.setDFTable(DfLoader.loadFromFileDF(know.getKbSrc, sc, sqlContext, 2) ) - - - - - - val algo = new Algorithm (know, 0.01, 3, 0.1, hdfsPath) - - - var erg = algo.ruleMining(sc, sqlContext) - var outString = erg.map { x => - var rdfTrp = x.getRule() - var temp = "" - for (i <- 0 to rdfTrp.length - 1) { - if (i == 0) { - temp = rdfTrp(i) + " <= " - } else { - temp += rdfTrp(i) + " \u2227 " - } - } - temp = temp.stripSuffix(" \u2227 ") - temp - }.toSeq - - outString.foreach(println) - var rddOut = sc.parallelize(outString).repartition(1) - - rddOut.saveAsTextFile(outputPath + "testOut") - - sc.stop - - -} - -} \ No newline at end of file diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/AnomalWithDataframeCrossJoin.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/AnomalWithDataframeCrossJoin.scala index 12fab07..75c6a2f 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/AnomalWithDataframeCrossJoin.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/AnomalWithDataframeCrossJoin.scala @@ -1,23 +1,18 @@ package net.sansa_stack.ml.spark.outliers.anomalydetection -import org.apache.jena.graph.Node -import org.apache.spark.rdd.RDD -import org.apache.spark.RangePartitioner -import org.apache.jena.graph.Triple -import org.apache.spark.sql.SparkSession -import org.apache.spark.HashPartitioner import scala.collection.mutable import scala.collection.mutable.HashSet -import org.apache.jena.graph.NodeFactory + +import org.apache.commons.math3.stat.descriptive._ +import org.apache.jena.graph.{ Node, 
NodeFactory, Triple } +import org.apache.spark.{ HashPartitioner, RangePartitioner } +import org.apache.spark.ml.feature.{ MinHashLSH, _ } +import org.apache.spark.ml.linalg._ +import org.apache.spark.rdd.{ RDD, _ } import org.apache.spark.sql._ +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.functions.{ col, udf } import org.apache.spark.sql.types._ -import org.apache.spark.rdd._ -import org.apache.spark.ml.feature.MinHashLSH -import org.apache.spark.sql.functions.udf -import org.apache.spark.ml.feature._ -import org.apache.spark.ml.linalg._ -import org.apache.spark.sql.functions.col -import org.apache.commons.math3.stat.descriptive._ import org.apache.spark.storage.StorageLevel /* Dataframe CrossJoin works well for smaller datasets(for e.g. 3.6GB) @@ -30,34 +25,34 @@ import org.apache.spark.storage.StorageLevel */ class AnomalWithDataframeCrossJoin(nTriplesRDD: RDD[Triple], objList: List[String], - triplesType: List[String], JSimThreshold: Double, - listSuperType: List[String], sparkSession: SparkSession, hypernym: String, numPartition: Int) extends Serializable { + triplesType: List[String], JSimThreshold: Double, + listSuperType: List[String], sparkSession: SparkSession, hypernym: String, numPartition: Int) extends Serializable { def run(): RDD[(Set[(String, String, Object)])] = { - // get all the triples whose objects are literal - //these literals also contains xsd:date as well as xsd:langstring + // get all the triples whose objects are literal + // these literals also contains xsd:date as well as xsd:langstring val getObjectLiteral = getObjectList() - //remove the literal which has ^^xsd:date or xsd:langstring(only considering numerical) + // remove the literal which has ^^xsd:date or xsd:langstring(only considering numerical) val removedLangString = getObjectLiteral.filter(f => searchedge(f.getObject.toString(), objList)) - //the predicate wikipageId,wikiPageRevisionID are not important for outliers + // the predicate wikipageId,wikiPageRevisionID are not important for outliers val removewiki = removedLangString.filter(f => (!f.getPredicate.toString().contains("wikiPageID")) && (!f.getPredicate.toString().contains("wikiPageRevisionID"))) - //checking object has only numerical data only + // checking object has only numerical data only val triplesWithNumericLiteral = triplesWithNumericLit(removewiki) - - //Pair rdd with key as subject and calue as triple with numerical literal - val mapSubWithTriples = propClustering(triplesWithNumericLiteral) //.persist - //get triples of hypernym + // Pair rdd with key as subject and calue as triple with numerical literal + val mapSubWithTriples = propClustering(triplesWithNumericLiteral) // .persist + + // get triples of hypernym val getHypernymTriples = getHyp() - //filter Dbpedia's rdf type and join with hyernym + // filter Dbpedia's rdf type and join with hyernym val rdfTypeDBwiki = rdfType(getHypernymTriples) - //joining those subjects only who has rdf:ytpe/hypernym and numerical literal + // joining those subjects only who has rdf:ytpe/hypernym and numerical literal val rdfTypeWithSubject = mapSubWithTriples.join(rdfTypeDBwiki) val mapSubjectwithType = rdfTypeWithSubject.map(f => (f._1, f._2._2)) @@ -68,13 +63,13 @@ class AnomalWithDataframeCrossJoin(nTriplesRDD: RDD[Triple], objList: List[Strin } - //filter triples with hypernm + // filter triples with hypernm def getHyp(): RDD[Triple] = nTriplesRDD.filter(f => f.getPredicate.toString().equals(hypernym)) - //filtering triples with literal at object position + // 
filtering triples with literal at object position def getObjectList(): RDD[Triple] = nTriplesRDD.filter(f => f.getObject.isLiteral()) - //filtering only numeric literals + // filtering only numeric literals def triplesWithNumericLit(objLit: RDD[Triple]): RDD[Triple] = objLit.filter(f => isNumeric(f.getObject.toString())) def isNumeric(x: String): Boolean = @@ -83,21 +78,17 @@ class AnomalWithDataframeCrossJoin(nTriplesRDD: RDD[Triple], objList: List[Strin val c = x.indexOf('^') val subject = x.substring(1, c - 1) - if (isAllDigits(subject)) - true - else - false - } else - false + if (isAllDigits(subject)) true + else false + } else false } def isAllDigits(x: String): Boolean = { var found = false for (ch <- x) { - if (ch.isDigit || ch == '.') + if (ch.isDigit || ch == '.') { found = true - else if (ch.isLetter) { - + } else if (ch.isLetter) { found = false } } @@ -110,20 +101,19 @@ class AnomalWithDataframeCrossJoin(nTriplesRDD: RDD[Triple], objList: List[Strin val c = x.indexOf('^') val subject = x.substring(c + 2) y.contains(subject) - } else - false + } else false } def rdfType(getHypernym: RDD[Triple]): RDD[(String, HashSet[String])] = { - //filter triples with predicate as rdf:type + // filter triples with predicate as rdf:type val triplesWithRDFType = nTriplesRDD.filter(_.getPredicate.toString() == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type") val triplesWithDBpedia = triplesWithRDFType.filter(f => searchType(f.getObject.toString(), triplesType)) val subWithType1 = triplesWithDBpedia.map(f => // ... - (getLocalName1(f.getSubject), (getLocalName1(f.getObject)))) //.reduceByKey(_ ++ _) //.partitionBy(new HashPartitioner(8)).persist() + (getLocalName1(f.getSubject), (getLocalName1(f.getObject)))) // .reduceByKey(_ ++ _) //.partitionBy(new HashPartitioner(8)).persist() val initialSet1 = mutable.HashSet.empty[String] val addToSet1 = (s: mutable.HashSet[String], v: String) => s += v @@ -131,7 +121,7 @@ class AnomalWithDataframeCrossJoin(nTriplesRDD: RDD[Triple], objList: List[Strin val uniqueByKey1 = subWithType1.aggregateByKey(initialSet1)(addToSet1, mergePartitionSets1) val hyper1 = getHypernym.map(f => - (getLocalName1(f.getSubject), (getLocalName1(f.getObject) + ("hypernym")))) //.partitionBy(new HashPartitioner(8)).persist + (getLocalName1(f.getSubject), (getLocalName1(f.getObject) + ("hypernym")))) // .partitionBy(new HashPartitioner(8)).persist val initialSet = mutable.HashSet.empty[String] val addToSet = (s: mutable.HashSet[String], v: String) => s += v @@ -157,31 +147,31 @@ class AnomalWithDataframeCrossJoin(nTriplesRDD: RDD[Triple], objList: List[Strin def searchType(x: String, y: List[String]): Boolean = { if (y.exists(x.contains)) { true - } else - false + } else false } - def jSimilarity(TriplesWithNumericLiteral: RDD[Triple], - rdfTypeDBwiki: RDD[(String, HashSet[String])], mapSubWithTriples: RDD[(String, mutable.Set[(String, String, Object)])]): RDD[(Set[(String, String, Object)])] = { + def jSimilarity( + TriplesWithNumericLiteral: RDD[Triple], + rdfTypeDBwiki: RDD[(String, HashSet[String])], mapSubWithTriples: RDD[(String, mutable.Set[(String, String, Object)])]): RDD[(Set[(String, String, Object)])] = { nTriplesRDD.unpersist() import sparkSession.implicits._ - //KV pair with subject as key and rdf type/hypernym as value + // KV pair with subject as key and rdf type/hypernym as value val hashtoseq = rdfTypeDBwiki.map(f => (f._1, f._2.toSeq)) val part = new RangePartitioner(30, hashtoseq) val partitioned = hashtoseq.partitionBy(part).persist() - //converting the rdd 
to dataframe + // converting the rdd to dataframe val dfA = partitioned.toDF("id1", "value1") val dfB = partitioned.toDF("id2", "value2") - //crossJoin of the rdd + // crossJoin of the rdd val joindfA = dfA.crossJoin(dfB) - //registering Jaccard similarity function in udf + // registering Jaccard similarity function in udf val myUDF = udf(sim _) - //papplying jaccard similarity function to each row + // papplying jaccard similarity function to each row val newDF = joindfA.withColumn("Jsim", myUDF(joindfA("value1"), joindfA("value2"))).select("id1", "id2", "Jsim").filter($"Jsim" > 0.6) - //converting df to rdd - val x1 = newDF.rdd + // converting df to rdd + val x1 = newDF.rdd .map(row => { val id = row.getString(0) val value = row.getString(1) @@ -193,13 +183,13 @@ class AnomalWithDataframeCrossJoin(nTriplesRDD: RDD[Triple], objList: List[Strin val mergePartitionSets3 = (p1: mutable.Set[String], p2: mutable.Set[String]) => p1 ++= p2 val uniqueByKey3 = x1.aggregateByKey(initialSet3)(addToSet3, mergePartitionSets3) - //create cohort of subjects + // create cohort of subjects val SubKV = uniqueByKey3.map(f => ((f._1, (f._2 += (f._1)).toSet))) val partitioner = new HashPartitioner(80) val mapSubWithTriplesPart = mapSubWithTriples.partitionBy(partitioner).persist(StorageLevel.MEMORY_AND_DISK) // --heap size error on local mode when not unpersisted with persist - //join cohort of subjects with KV value of mapSubWithTriples - val ys = SubKV.partitionBy(partitioner).persist(StorageLevel.MEMORY_AND_DISK) + // join cohort of subjects with KV value of mapSubWithTriples + val ys = SubKV.partitionBy(partitioner).persist(StorageLevel.MEMORY_AND_DISK) val g = ys.join(mapSubWithTriples) val clusterOfSubjects = g.map({ @@ -229,8 +219,7 @@ class AnomalWithDataframeCrossJoin(nTriplesRDD: RDD[Triple], objList: List[Strin def isContains(a: List[Node], b: List[Node]): Boolean = { if (a.forall(b.contains) || b.forall(a.contains)) { true - } else - false + } else false } def removeSupType(a: RDD[((String, HashSet[String]), (String, HashSet[String]))]): RDD[((String, HashSet[String]), (String, HashSet[String]))] = { @@ -251,13 +240,12 @@ class AnomalWithDataframeCrossJoin(nTriplesRDD: RDD[Triple], objList: List[Strin }) clusterOfProp - } else - a + } else a } def propClustering(triplesWithNumericLiteral: RDD[Triple]): RDD[(String, mutable.Set[(String, String, Object)])] = { val subMap = triplesWithNumericLiteral.map(f => (getLocalName1(f.getSubject), - (getLocalName1(f.getSubject), getLocalName1(f.getPredicate), getNumber(f.getObject.toString())))) //.partitionBy(new HashPartitioner(8)) //make a function instead of using + (getLocalName1(f.getSubject), getLocalName1(f.getPredicate), getNumber(f.getObject.toString())))) // .partitionBy(new HashPartitioner(8)) //make a function instead of using val initialSet = mutable.Set.empty[(String, String, Object)] val addToSet = (s: mutable.Set[(String, String, Object)], v: (String, String, Object)) => s += v @@ -281,9 +269,9 @@ class AnomalWithDataframeCrossJoin(nTriplesRDD: RDD[Triple], objList: List[Strin var dbtype2: Seq[String] = null val hyper1 = seq1.filter(p => p.contains("hypernym")) val hyper2 = seq2.filter(p => p.contains("hypernym")) - //case of usa and India + // case of usa and India - //USA= hypernym/states and India :- hypernym//Country + // USA= hypernym/states and India :- hypernym//Country if (hyper1 == hyper2 && !hyper1.isEmpty && !hyper2.isEmpty) { jSimilarity = 1.0 @@ -291,12 +279,14 @@ class AnomalWithDataframeCrossJoin(nTriplesRDD: RDD[Triple], 
objList: List[Strin } else { if (seq1.contains("hypernym")) { dbtype1 = seq1.dropRight(1) - } else + } else { dbtype1 = seq1 + } if (seq2.contains("hypernym")) { dbtype2 = seq2.dropRight(1) - } else + } else { dbtype2 = seq2 + } val intersect_cnt = dbtype1.toSet.intersect(dbtype2.toSet).size @@ -308,7 +298,7 @@ class AnomalWithDataframeCrossJoin(nTriplesRDD: RDD[Triple], objList: List[Strin } def iqr2(cluster: Seq[(String, String, Object)], anomalyListLimit: Int): Seq[(String, String, Object)] = { - //create sample data + // create sample data val listofData = cluster.map(b => (b._3.toString()).toDouble).toArray @@ -318,13 +308,13 @@ class AnomalWithDataframeCrossJoin(nTriplesRDD: RDD[Triple], objList: List[Strin genericArrayOps(c).foreach(v => arrMean.addValue(v)) // Get first and third quartiles and then calc IQR val Q1 = arrMean.getPercentile(25) - //println("Q1="+Q1) + // println("Q1="+Q1) val Q3 = arrMean.getPercentile(75) - //println("Q3="+Q3) + // println("Q3="+Q3) val IQR = Q3 - Q1 - //println("IQR="+IQR) + // println("IQR="+IQR) val lowerRange = Q1 - 1.5 * IQR - //println("lowerRange="+lowerRange) + // println("lowerRange="+lowerRange) val upperRange = Q3 + 1.5 * IQR // println("upperRange="+upperRange) val yse = c.filter(p => (p < lowerRange || p > upperRange)) @@ -335,15 +325,13 @@ class AnomalWithDataframeCrossJoin(nTriplesRDD: RDD[Triple], objList: List[Strin } def search(a: Double, b: Array[Double]): Boolean = { - if (b.contains(a)) - true - else - false - + if (b.contains(a)) true + else false } } object AnomalWithDataframeCrossJoin { def apply(nTriplesRDD: RDD[Triple], objList: List[String], triplesType: List[String], - JSimThreshold: Double, listSuperType: List[String], sparkSession: SparkSession, hypernym: String, numPartition: Int) = new AnomalWithDataframeCrossJoin(nTriplesRDD, objList, triplesType, + JSimThreshold: Double, listSuperType: List[String], sparkSession: SparkSession, + hypernym: String, numPartition: Int): AnomalWithDataframeCrossJoin = new AnomalWithDataframeCrossJoin(nTriplesRDD, objList, triplesType, JSimThreshold, listSuperType, sparkSession, hypernym, numPartition) } diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/AnomalyDetectionWithCountVetcorizerModel.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/AnomalyDetectionWithCountVetcorizerModel.scala index 56e98da..89507a8 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/AnomalyDetectionWithCountVetcorizerModel.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/AnomalyDetectionWithCountVetcorizerModel.scala @@ -1,22 +1,18 @@ package net.sansa_stack.ml.spark.outliers.anomalydetection -import org.apache.jena.graph.Node -import org.apache.spark.rdd.RDD -import org.apache.jena.graph.Triple -import org.apache.spark.sql.SparkSession -import org.apache.spark.HashPartitioner import scala.collection.mutable import scala.collection.mutable.HashSet -import org.apache.jena.graph.NodeFactory -import org.apache.spark.sql._ -import org.apache.spark.sql.types._ -import org.apache.spark.rdd._ -import org.apache.spark.ml.feature.MinHashLSH -import org.apache.spark.sql.functions.udf -import org.apache.spark.ml.feature._ -import org.apache.spark.ml.linalg._ -import org.apache.spark.sql.functions.col + import org.apache.commons.math3.stat.descriptive._ +import org.apache.jena.graph.{ Node, NodeFactory, Triple } +import org.apache.spark.{ HashPartitioner, 
RangePartitioner } +import org.apache.spark.ml.feature.{ MinHashLSH, _ } +import org.apache.spark.ml.linalg._ +import org.apache.spark.rdd.{ RDD, _ } +import org.apache.spark.sql.{ SparkSession, _ } +import org.apache.spark.sql.functions.{ col, udf } +import org.apache.spark.sql.types._ +import org.apache.spark.storage.StorageLevel /* * @@ -27,39 +23,39 @@ import org.apache.commons.math3.stat.descriptive._ */ class AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList: List[String], - triplesType: List[String], JSimThreshold: Double, - listSuperType: List[String], sparkSession: SparkSession, hypernym: String, numPartition: Int) extends Serializable { + triplesType: List[String], JSimThreshold: Double, + listSuperType: List[String], sparkSession: SparkSession, hypernym: String, numPartition: Int) extends Serializable { def run(): RDD[(Set[(String, String, Object)])] = { - // get all the triples whose objects are literal - //these literals also contains xsd:date as well as xsd:langstring + // get all the triples whose objects are literal + // these literals also contains xsd:date as well as xsd:langstring val getObjectLiteral = getObjectList() - //remove the literal which has ^^xsd:date or xsd:langstring(only considering numerical) + // remove the literal which has ^^xsd:date or xsd:langstring(only considering numerical) val removedLangString = getObjectLiteral.filter(f => searchedge(f.getObject.toString(), objList)) val removewiki = removedLangString.filter(f => (!f.getPredicate.toString().contains("wikiPageID")) && (!f.getPredicate.toString().contains("wikiPageRevisionID"))) - //checking still object has only numerical data only + // checking still object has only numerical data only val triplesWithNumericLiteral = triplesWithNumericLit(removewiki) - val mapSubWithTriples = propClustering(triplesWithNumericLiteral) //.persist + val mapSubWithTriples = propClustering(triplesWithNumericLiteral) // .persist - //get triples of hypernym + // get triples of hypernym val getHypernymTriples = getHyp() - - //filter rdf type having object value dbpedia and join with hyernym + + // filter rdf type having object value dbpedia and join with hyernym // val rdfTypeDBwiki = rdfType(getHypernym) //.partitionBy(new HashPartitioner(2)).persist() val rdfTypeDBwiki = rdfType(getHypernymTriples) - - //joining those subjects only who has rdf:ytpe and numerical literal + + // joining those subjects only who has rdf:ytpe and numerical literal val rdfTypeWithSubject = mapSubWithTriples.join(rdfTypeDBwiki) - + val mapSubjectwithType = rdfTypeWithSubject.map(f => (f._1, f._2._2)) - + // val propwithSub = propwithsubject(triplesWithNumericLiteral) - //cluster subjects on the basis of rdf type + // cluster subjects on the basis of rdf type val jacardSimilarity = jSimilarity(triplesWithNumericLiteral, mapSubjectwithType, mapSubWithTriples) jacardSimilarity @@ -79,20 +75,17 @@ class AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList val c = x.indexOf('^') val subject = x.substring(1, c - 1) - if (isAllDigits(subject)) - true - else - false - } else - false + if (isAllDigits(subject)) true + else false + } else false } def isAllDigits(x: String): Boolean = { var found = false for (ch <- x) { - if (ch.isDigit || ch == '.') + if (ch.isDigit || ch == '.') { found = true - else if (ch.isLetter) { + } else if (ch.isLetter) { found = false } @@ -106,20 +99,19 @@ class AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList val c = x.indexOf('^') val subject = 
x.substring(c + 2) y.contains(subject) - } else - false + } else false } def rdfType(getHypernym: RDD[Triple]): RDD[(String, HashSet[String])] = { - //filter triples with predicate as rdf:type + // filter triples with predicate as rdf:type val triplesWithRDFType = nTriplesRDD.filter(_.getPredicate.toString() == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type") val triplesWithDBpedia = triplesWithRDFType.filter(f => searchType(f.getObject.toString(), triplesType)) val subWithType1 = triplesWithDBpedia.map(f => // ... - (getLocalName1(f.getSubject), (getLocalName1(f.getObject)))) //.reduceByKey(_ ++ _) //.partitionBy(new HashPartitioner(8)).persist() + (getLocalName1(f.getSubject), (getLocalName1(f.getObject)))) // .reduceByKey(_ ++ _) //.partitionBy(new HashPartitioner(8)).persist() val initialSet1 = mutable.HashSet.empty[String] val addToSet1 = (s: mutable.HashSet[String], v: String) => s += v @@ -127,7 +119,7 @@ class AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList val uniqueByKey1 = subWithType1.aggregateByKey(initialSet1)(addToSet1, mergePartitionSets1) val hyper1 = getHypernym.map(f => - (getLocalName1(f.getSubject), (getLocalName1(f.getObject) + ("hypernym")))) //.partitionBy(new HashPartitioner(8)).persist + (getLocalName1(f.getSubject), (getLocalName1(f.getObject) + ("hypernym")))) // .partitionBy(new HashPartitioner(8)).persist val initialSet = mutable.HashSet.empty[String] val addToSet = (s: mutable.HashSet[String], v: String) => s += v @@ -153,11 +145,11 @@ class AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList def searchType(x: String, y: List[String]): Boolean = { if (y.exists(x.contains)) { true - } else - false + } else false } - def jSimilarity(TriplesWithNumericLiteral: RDD[Triple], - rdfTypeDBwiki: RDD[(String, HashSet[String])], mapSubWithTriples: RDD[(String, mutable.Set[(String, String, Object)])]): RDD[(Set[(String, String, Object)])] = { + def jSimilarity( + TriplesWithNumericLiteral: RDD[Triple], + rdfTypeDBwiki: RDD[(String, HashSet[String])], mapSubWithTriples: RDD[(String, mutable.Set[(String, String, Object)])]): RDD[(Set[(String, String, Object)])] = { nTriplesRDD.unpersist() import sparkSession.implicits._ @@ -172,10 +164,10 @@ class AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList .setMinDF(1) .fit(dfA) - val kt = cvModel.transform(dfA) //.filter(isNoneZeroVector(col("features"))) - + val kt = cvModel.transform(dfA) // .filter(isNoneZeroVector(col("features"))) + val mh = new MinHashLSH() - .setNumHashTables(3) //tested with 100 on out4.nt file ..result in /home/rajjat/Desktop/recent_dataset/output_removed_boolean_udf.txt + .setNumHashTables(3) // tested with 100 on out4.nt file ..result in /home/rajjat/Desktop/recent_dataset/output_removed_boolean_udf.txt .setInputCol("features") .setOutputCol("hashes") @@ -183,13 +175,13 @@ class AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList // val model1 = mh.fit(featurizedData) val dffilter = model.approxSimilarityJoin(kt, kt, 0.45) - val opiu = dffilter.filter($"datasetA.id".isNotNull).filter($"datasetB.id".isNotNull) .filter(($"datasetA.id" =!= $"datasetB.id")) - .select(col("datasetA.id").alias("id1"), + .select( + col("datasetA.id").alias("id1"), col("datasetB.id").alias("id2")) - val x1 = opiu.rdd //maimum time taken by this rdd + val x1 = opiu.rdd // maimum time taken by this rdd .map(row => { val id = row.getString(0) val value = row.getString(1) @@ -200,16 +192,16 @@ class 
AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList val addToSet3 = (s: mutable.Set[String], v: String) => s += v val mergePartitionSets3 = (p1: mutable.Set[String], p2: mutable.Set[String]) => p1 ++= p2 val uniqueByKey3 = x1.aggregateByKey(initialSet3)(addToSet3, mergePartitionSets3) - + x1.unpersist() - + val k = uniqueByKey3.map(f => ((f._2 += (f._1)).toSet)).map(a => (a, a)) .aggregateByKey(Set[String]())((x, y) => y, (x, y) => x) .keys.distinct() val abc = k.repartition(50).persist() val simSubjectCart = abc.cartesian(abc).filter(f => f._1.intersect(f._2).size > 0) - + partitionedy.unpersist() // joined.unpersist() val subsetMembers = simSubjectCart.filter { case (set1, set2) => (set2.subsetOf(set1)) && (set1 -- set2).nonEmpty } @@ -217,8 +209,8 @@ class AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList val superset1 = abc.subtract(sdf) val ys = superset1.flatMap(f => (f.map(g => (g, f)))) - - val g=ys.join(mapSubWithTriples) + + val g = ys.join(mapSubWithTriples) val clusterOfSubjects = g.map({ case (s, (iter, iter1)) => ((iter).toSet, iter1) @@ -240,15 +232,14 @@ class AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList val clusterOfProp = propDistinct.map({ case (a, (iter1)) => (iter1.filter(f => f._2.equals(a))) }) - + clusterOfProp } def isContains(a: List[Node], b: List[Node]): Boolean = { if (a.forall(b.contains) || b.forall(a.contains)) { true - } else - false + } else false } def removeSupType(a: RDD[((String, HashSet[String]), (String, HashSet[String]))]): RDD[((String, HashSet[String]), (String, HashSet[String]))] = { @@ -269,13 +260,12 @@ class AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList }) clusterOfProp - } else - a + } else a } def propClustering(triplesWithNumericLiteral: RDD[Triple]): RDD[(String, mutable.Set[(String, String, Object)])] = { val subMap = triplesWithNumericLiteral.map(f => (getLocalName1(f.getSubject), - (getLocalName1(f.getSubject), getLocalName1(f.getPredicate), getNumber(f.getObject.toString())))) //.partitionBy(new HashPartitioner(8)) //make a function instead of using + (getLocalName1(f.getSubject), getLocalName1(f.getPredicate), getNumber(f.getObject.toString())))) // .partitionBy(new HashPartitioner(8)) //make a function instead of using val initialSet = mutable.Set.empty[(String, String, Object)] val addToSet = (s: mutable.Set[(String, String, Object)], v: (String, String, Object)) => s += v @@ -299,9 +289,9 @@ class AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList var dbtype2: HashSet[String] = null val hyper1 = seq1.filter(p => p.contains("hypernym")) val hyper2 = seq2.filter(p => p.contains("hypernym")) - //case of usa and India + // case of usa and India - //USA= hypernym/states and India :- hypernym//Country + // USA= hypernym/states and India :- hypernym//Country if (hyper1 == hyper2 && !hyper1.isEmpty && !hyper2.isEmpty) { jSimilarity = 1.0 @@ -309,12 +299,14 @@ class AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList } else { if (seq1.contains("hypernym")) { dbtype1 = seq1.dropRight(1) - } else + } else { dbtype1 = seq1 + } if (seq2.contains("hypernym")) { dbtype2 = seq2.dropRight(1) - } else + } else { dbtype2 = seq2 + } val intersect_cnt = dbtype1.toSet.intersect(dbtype2.toSet).size @@ -327,7 +319,7 @@ class AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList def iqr1(cluster: Seq[(String, String, Object)], anomalyListLimit: Int): Dataset[Row] = { - //create sample 
data + // create sample data var result: Dataset[Row] = null // var _partitionData: RDD[String] = _ @@ -337,7 +329,7 @@ class AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList val listofData = cluster.map(b => (b._3.toString()).toDouble).toList val k = sparkSession.sparkContext.makeRDD(listofData) - //create sample data + // create sample data // println("sampleData=" + listofData) val c = listofData.sorted @@ -351,9 +343,10 @@ class AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList val dfWithoutSchema = sparkSession.createDataFrame(KVcluster).toDF("id", "outliers") // calculate quantiles and IQR - val quantiles = df.stat.approxQuantile("value", + val quantiles = df.stat.approxQuantile( + "value", Array(0.25, 0.75), 0.0) - //quantiles.foreach(println) + // quantiles.foreach(println) val Q1 = quantiles(0) @@ -383,52 +376,48 @@ class AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD: RDD[Triple], objList result } result - // + // // // result.show() // result.where(result.col("outliers").isNotNull) } def iqr2(cluster: Seq[(String, String, Object)], anomalyListLimit: Int): Seq[(String, String, Object)] = { - //create sample data + // create sample data val listofData = cluster.map(b => (b._3.toString()).toDouble).toArray - - val c = listofData.sorted - + val arrMean = new DescriptiveStatistics() genericArrayOps(c).foreach(v => arrMean.addValue(v)) // Get first and third quartiles and then calc IQR val Q1 = arrMean.getPercentile(25) - //println("Q1="+Q1) + // println("Q1="+Q1) val Q3 = arrMean.getPercentile(75) - //println("Q3="+Q3) + // println("Q3="+Q3) val IQR = Q3 - Q1 - //println("IQR="+IQR) + // println("IQR="+IQR) val lowerRange = Q1 - 1.5 * IQR - //println("lowerRange="+lowerRange) + // println("lowerRange="+lowerRange) val upperRange = Q3 + 1.5 * IQR // println("upperRange="+upperRange) val yse = c.filter(p => (p < lowerRange || p > upperRange)) - val xde = cluster.filter(f => search(f._3.toString().toDouble, yse)) - + xde } def search(a: Double, b: Array[Double]): Boolean = { - if (b.contains(a)) - true - else - false + if (b.contains(a)) true + else false } } object AnomalyDetectionWithCountVetcorizerModel { def apply(nTriplesRDD: RDD[Triple], objList: List[String], triplesType: List[String], - JSimThreshold: Double, listSuperType: List[String], sparkSession: SparkSession, hypernym: String, numPartition: Int) = new AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD, objList, triplesType, + JSimThreshold: Double, listSuperType: List[String], sparkSession: SparkSession, + hypernym: String, numPartition: Int): AnomalyDetectionWithCountVetcorizerModel = new AnomalyDetectionWithCountVetcorizerModel(nTriplesRDD, objList, triplesType, JSimThreshold, listSuperType, sparkSession, hypernym, numPartition) } diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/AnomalyWithHashingTF.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/AnomalyWithHashingTF.scala index dc0db93..99a9305 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/AnomalyWithHashingTF.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/AnomalyWithHashingTF.scala @@ -1,24 +1,19 @@ package net.sansa_stack.ml.spark.outliers.anomalydetection -import org.apache.jena.graph.Node -import org.apache.spark.rdd.RDD -import org.apache.spark.RangePartitioner -import org.apache.jena.graph.Triple -import 
org.apache.spark.sql.SparkSession -import org.apache.spark.HashPartitioner import scala.collection.mutable import scala.collection.mutable.HashSet -import org.apache.jena.graph.NodeFactory -import org.apache.spark.sql._ -import org.apache.spark.sql.types._ -import org.apache.spark.rdd._ -import org.apache.spark.ml.feature.MinHashLSH -import org.apache.spark.sql.functions.udf -import org.apache.spark.ml.feature._ -import org.apache.spark.ml.linalg._ -import org.apache.spark.sql.functions.col + import org.apache.commons.math3.stat.descriptive._ +import org.apache.jena.graph.{ Node, NodeFactory, Triple } +import org.apache.spark.{ HashPartitioner, RangePartitioner } +import org.apache.spark.ml.feature.{ MinHashLSH, _ } +import org.apache.spark.ml.linalg._ +import org.apache.spark.rdd.{ RDD, _ } +import org.apache.spark.sql.{ SparkSession, _ } +import org.apache.spark.sql.functions.{ col, udf } +import org.apache.spark.sql.types._ import org.apache.spark.storage.StorageLevel + /* * * AnomalyDetection - Anomaly detection of numerical data @@ -32,34 +27,34 @@ class AnomalyWithHashingTF(nTriplesRDD: RDD[Triple], objList: List[String], listSuperType: List[String], sparkSession: SparkSession, hypernym: String, numPartition: Int) extends Serializable { def run(): RDD[(Set[(String, String, Object)])] = { - // get all the triples whose objects are literal - //these literals also contains xsd:date as well as xsd:langstring + // get all the triples whose objects are literal + // these literals also contains xsd:date as well as xsd:langstring val getObjectLiteral = getObjectList() - //remove the literal which has ^^xsd:date or xsd:langstring(only considering numerical) + // remove the literal which has ^^xsd:date or xsd:langstring(only considering numerical) val removedLangString = getObjectLiteral.filter(f => searchedge(f.getObject.toString(), objList)) val removewiki = removedLangString.filter(f => (!f.getPredicate.toString().contains("wikiPageID")) && (!f.getPredicate.toString().contains("wikiPageRevisionID"))) - //checking still object has only numerical data only + // checking still object has only numerical data only val triplesWithNumericLiteral = triplesWithNumericLit(removewiki) - val mapSubWithTriples = propClustering(triplesWithNumericLiteral) //.partitionBy(new HashPartitioner(40)).persist() + val mapSubWithTriples = propClustering(triplesWithNumericLiteral) // .partitionBy(new HashPartitioner(40)).persist() - //get triples of hypernym + // get triples of hypernym val getHypernymTriples = getHyp() - //filter rdf type having object value dbpedia and join with hyernym + // filter rdf type having object value dbpedia and join with hyernym val rdfTypeDBwiki = rdfType(getHypernymTriples) - //joining those subjects only who has rdf:ytpe and numerical literal + // joining those subjects only who has rdf:ytpe and numerical literal val rdfTypeWithSubject = mapSubWithTriples.join(rdfTypeDBwiki) val mapSubjectwithType = rdfTypeWithSubject.map(f => (f._1, f._2._2)) val propwithSub = propwithsubject(triplesWithNumericLiteral) - //cluster subjects on the basis of rdf type + // cluster subjects on the basis of rdf type val jacardSimilarity = jSimilarity(triplesWithNumericLiteral, propwithSub, mapSubjectwithType, mapSubWithTriples) jacardSimilarity @@ -79,25 +74,20 @@ class AnomalyWithHashingTF(nTriplesRDD: RDD[Triple], objList: List[String], val c = x.indexOf('^') val subject = x.substring(1, c - 1) - if (isAllDigits(subject)) - true - else - false - } else - false + if (isAllDigits(subject)) true + else 
false + } else false } def isAllDigits(x: String): Boolean = { var found = false for (ch <- x) { - if (ch.isDigit || ch == '.') + if (ch.isDigit || ch == '.') { found = true - else if (ch.isLetter) { - + } else if (ch.isLetter) { found = false } } - found } @@ -106,20 +96,19 @@ class AnomalyWithHashingTF(nTriplesRDD: RDD[Triple], objList: List[String], val c = x.indexOf('^') val subject = x.substring(c + 2) y.contains(subject) - } else - false + } else false } def rdfType(getHypernym: RDD[Triple]): RDD[(String, HashSet[String])] = { - //filter triples with predicate as rdf:type + // filter triples with predicate as rdf:type val triplesWithRDFType = nTriplesRDD.filter(_.getPredicate.toString() == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type") val triplesWithDBpedia = triplesWithRDFType.filter(f => searchType(f.getObject.toString(), triplesType)) val subWithType1 = triplesWithDBpedia.map(f => // ... - (getLocalName1(f.getSubject), (getLocalName1(f.getObject)))) //.reduceByKey(_ ++ _) //.partitionBy(new HashPartitioner(8)).persist() + (getLocalName1(f.getSubject), (getLocalName1(f.getObject)))) // .reduceByKey(_ ++ _) //.partitionBy(new HashPartitioner(8)).persist() val initialSet1 = mutable.HashSet.empty[String] val addToSet1 = (s: mutable.HashSet[String], v: String) => s += v @@ -127,7 +116,7 @@ class AnomalyWithHashingTF(nTriplesRDD: RDD[Triple], objList: List[String], val uniqueByKey1 = subWithType1.aggregateByKey(initialSet1)(addToSet1, mergePartitionSets1) val hyper1 = getHypernym.map(f => - (getLocalName1(f.getSubject), (getLocalName1(f.getObject) + ("hypernym")))) //.partitionBy(new HashPartitioner(8)).persist + (getLocalName1(f.getSubject), (getLocalName1(f.getObject) + ("hypernym")))) // .partitionBy(new HashPartitioner(8)).persist val initialSet = mutable.HashSet.empty[String] val addToSet = (s: mutable.HashSet[String], v: String) => s += v @@ -153,8 +142,7 @@ class AnomalyWithHashingTF(nTriplesRDD: RDD[Triple], objList: List[String], def searchType(x: String, y: List[String]): Boolean = { if (y.exists(x.contains)) { true - } else - false + } else false } def jSimilarity(TriplesWithNumericLiteral: RDD[Triple], xse: RDD[(String, String)], rdfTypeDBwiki: RDD[(String, HashSet[String])], mapSubWithTriples: RDD[(String, mutable.Set[(String, String, Object)])]): RDD[(Set[(String, String, Object)])] = { @@ -171,24 +159,24 @@ class AnomalyWithHashingTF(nTriplesRDD: RDD[Triple], objList: List[String], val hashingTF = new HashingTF() .setInputCol("values").setOutputCol("features").setNumFeatures(1048576) - val featurizedData = hashingTF.transform(dropDup) - + val featurizedData = hashingTF.transform(dropDup) + val mh = new MinHashLSH() - .setNumHashTables(3) + .setNumHashTables(3) .setInputCol("features") .setOutputCol("hashes") val model = mh.fit(featurizedData) val dffilter = model.approxSimilarityJoin(featurizedData, featurizedData, 0.45) - println("dffilter") - + val opiu = dffilter.filter($"datasetA.id".isNotNull).filter($"datasetB.id".isNotNull) .filter(($"datasetA.id" =!= $"datasetB.id")) - .select(col("datasetA.id").alias("id1"), - col("datasetB.id").alias("id2")) //heap space error due to persist + .select( + col("datasetA.id").alias("id1"), + col("datasetB.id").alias("id2")) // heap space error due to persist - val x1 = opiu.repartition(400).persist(StorageLevel.MEMORY_AND_DISK) + val x1 = opiu.repartition(400).persist(StorageLevel.MEMORY_AND_DISK) val x1Map = x1.rdd.map(row => { val id = row.getString(0) val value = row.getString(1) @@ -200,14 +188,12 @@ class 
AnomalyWithHashingTF(nTriplesRDD: RDD[Triple], objList: List[String], val mergePartitionSets3 = (p1: mutable.Set[String], p2: mutable.Set[String]) => p1 ++= p2 val uniqueByKey3 = x1Map.aggregateByKey(initialSet3)(addToSet3, mergePartitionSets3) - - val k = uniqueByKey3.map(f => ((f._1, (f._2 += (f._1)).toSet))) - + val partitioner = new HashPartitioner(500) - val mapSubWithTriplesPart = mapSubWithTriples.partitionBy(partitioner).persist(StorageLevel.MEMORY_AND_DISK) + val mapSubWithTriplesPart = mapSubWithTriples.partitionBy(partitioner).persist(StorageLevel.MEMORY_AND_DISK) - val ys = k.partitionBy(partitioner).persist(StorageLevel.MEMORY_AND_DISK) + val ys = k.partitionBy(partitioner).persist(StorageLevel.MEMORY_AND_DISK) val joinSimSubTriples2 = ys.join(mapSubWithTriplesPart) val clusterOfSubjects = joinSimSubTriples2.map({ @@ -233,21 +219,20 @@ class AnomalyWithHashingTF(nTriplesRDD: RDD[Triple], objList: List[String], mapSubWithTriplesPart.unpersist() ys.unpersist() - + clusterOfProp } def isContains(a: List[Node], b: List[Node]): Boolean = { if (a.forall(b.contains) || b.forall(a.contains)) { true - } else - false + } else false } def propClustering(triplesWithNumericLiteral: RDD[Triple]): RDD[(String, mutable.Set[(String, String, Object)])] = { val subMap = triplesWithNumericLiteral.map(f => (getLocalName1(f.getSubject), - (getLocalName1(f.getSubject), getLocalName1(f.getPredicate), getNumber(f.getObject.toString())))) //.partitionBy(new HashPartitioner(8)) //make a function instead of using + (getLocalName1(f.getSubject), getLocalName1(f.getPredicate), getNumber(f.getObject.toString())))) // .partitionBy(new HashPartitioner(8)) //make a function instead of using val initialSet = mutable.Set.empty[(String, String, Object)] val addToSet = (s: mutable.Set[(String, String, Object)], v: (String, String, Object)) => s += v @@ -266,7 +251,7 @@ class AnomalyWithHashingTF(nTriplesRDD: RDD[Triple], objList: List[String], } def iqr2(cluster: Seq[(String, String, Object)], anomalyListLimit: Int): Seq[(String, String, Object)] = { - //create sample data + // create sample data val listofData = cluster.map(b => (b._3.toString()).toDouble).toArray @@ -274,36 +259,33 @@ class AnomalyWithHashingTF(nTriplesRDD: RDD[Triple], objList: List[String], val arrMean = new DescriptiveStatistics() genericArrayOps(c).foreach(v => arrMean.addValue(v)) - + val Q1 = arrMean.getPercentile(25) val Q3 = arrMean.getPercentile(75) - + val IQR = Q3 - Q1 - + val lowerRange = Q1 - 1.5 * IQR - + val upperRange = Q3 + 1.5 * IQR - + val yse = c.filter(p => (p < lowerRange || p > upperRange)) val xde = cluster.filter(f => search(f._3.toString().toDouble, yse)) xde - + } def search(a: Double, b: Array[Double]): Boolean = { - if (b.contains(a)) - true - else - false - + if (b.contains(a)) true + else false } } object AnomalyWithHashingTF { def apply(nTriplesRDD: RDD[Triple], objList: List[String], triplesType: List[String], - JSimThreshold: Double, listSuperType: List[String], sparkSession: SparkSession, hypernym: String, numPartition: Int) = new AnomalyWithHashingTF(nTriplesRDD, objList, triplesType, + JSimThreshold: Double, listSuperType: List[String], sparkSession: SparkSession, hypernym: String, numPartition: Int): AnomalyWithHashingTF = new AnomalyWithHashingTF(nTriplesRDD, objList, triplesType, JSimThreshold, listSuperType, sparkSession, hypernym, numPartition) } diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/readme.md 
b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/README.md similarity index 100% rename from sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/readme.md rename to sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/README.md diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/CharactersFeatures.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/CharactersFeatures.scala index e2b6641..056f6cd 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/CharactersFeatures.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/CharactersFeatures.scala @@ -1,7 +1,8 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection +import java.util.regex.{ Matcher, Pattern } + import org.apache.spark.ml.linalg.{ Vector, Vectors } -import java.util.regex.{ Pattern, Matcher } class CharactersFeatures extends Serializable { @@ -10,7 +11,6 @@ class CharactersFeatures extends Serializable { val rounded: Double = Math.round(va * 10000).toDouble / 10000 rounded - } def Vector_Characters_Feature(StrValue: String): Array[Double] = { @@ -19,148 +19,145 @@ class CharactersFeatures extends Serializable { var RatioValues = new Array[Double](25) // Index is Important here val characterFeature_OBJ = new CharactersFeatures() - //1.Double result Value for uppercase Ration + // 1.Double result Value for uppercase Ration val uppercase = characterFeature_OBJ.UppercaseRation_Character(StrValue) if (!uppercase.isNaN()) { RatioValues(0) = RoundDouble(uppercase) } - //2.Double result Value for lowerCase Ratio + // 2.Double result Value for lowerCase Ratio val lowerCase = characterFeature_OBJ.LowercaseRation_Character(StrValue) if (!lowerCase.isNaN()) { RatioValues(1) = RoundDouble(lowerCase) } - //3.Double result Value for Alphanumeric Ratio + // 3.Double result Value for Alphanumeric Ratio val Alphanumeric = characterFeature_OBJ.AlphanumericRation_Character(StrValue) if (!Alphanumeric.isNaN()) { RatioValues(2) = RoundDouble(Alphanumeric) } - //4.Double result Value for ASCII Ratio + // 4.Double result Value for ASCII Ratio val ASCII = characterFeature_OBJ.ASCIIRation_Character(StrValue) if (!ASCII.isNaN()) { RatioValues(3) = RoundDouble(ASCII) } - //5.Double result Value for Bracket Ratio + // 5.Double result Value for Bracket Ratio val Bracket = characterFeature_OBJ.BracketRation_Character(StrValue) if (!Bracket.isNaN()) { RatioValues(4) = RoundDouble(Bracket) } - //6.Double result Value for Digits Ratio + // 6.Double result Value for Digits Ratio val Digits = characterFeature_OBJ.DigitsRation_Character(StrValue) if (!Digits.isNaN()) { RatioValues(5) = RoundDouble(Digits) } - //7.Double result Value for Latin Ratio + // 7.Double result Value for Latin Ratio val Latin = characterFeature_OBJ.Latin_Character(StrValue) if (!Latin.isNaN()) { RatioValues(6) = RoundDouble(Latin) } - //8.Double result Value for WhiteSpace Ratio + // 8.Double result Value for WhiteSpace Ratio val WhiteSpace = characterFeature_OBJ.WhiteSpace_Character(StrValue) if (!WhiteSpace.isNaN()) { RatioValues(7) = RoundDouble(WhiteSpace) } - //9.Double result Value for punc Ratio + // 9.Double result Value for punc Ratio val punc = characterFeature_OBJ.Punct_Character(StrValue) if (!punc.isNaN()) { RatioValues(8) = RoundDouble(punc) } - //10. 
Integer to Double result Value for LongCharacterSequence (1 integer) + // 10. Integer to Double result Value for LongCharacterSequence (1 integer) val LongCharacterSequence = characterFeature_OBJ.Longcharactersequence_Character(StrValue) if (!LongCharacterSequence.isNaN()) { RatioValues(9) = LongCharacterSequence } - //11.Double result Value for ArabicCharacter + // 11.Double result Value for ArabicCharacter val ArabicCharacter = characterFeature_OBJ.ArabicRation_Character(StrValue) if (!ArabicCharacter.isNaN()) { RatioValues(10) = RoundDouble(ArabicCharacter) } - //12.Double result Value for Bengali + // 12.Double result Value for Bengali val Bengali = characterFeature_OBJ.BengaliRation_Character(StrValue) if (!Bengali.isNaN()) { RatioValues(11) = RoundDouble(Bengali) } - //13.Double result Value for Brahmi + // 13.Double result Value for Brahmi val Brahmi = characterFeature_OBJ.BrahmiRation_Character(StrValue) if (!Brahmi.isNaN()) { RatioValues(12) = RoundDouble(Brahmi) - } - //14.Double result Value for Cyrillic + // 14.Double result Value for Cyrillic val Cyrillic = characterFeature_OBJ.CyrillicRation_Character(StrValue) if (!Cyrillic.isNaN()) { RatioValues(13) = RoundDouble(Cyrillic) - } - //15.Double result Value for Han + // 15.Double result Value for Han val Han = characterFeature_OBJ.HanRatio_Character(StrValue) if (!Han.isNaN()) { RatioValues(14) = RoundDouble(Han) - } - //16.Double result Value for Malysia + // 16.Double result Value for Malysia val Malysia = characterFeature_OBJ.MalaysRatio_Character(StrValue) if (!Malysia.isNaN()) { RatioValues(15) = RoundDouble(Malysia) } - //17.Double result Value for Tami + // 17.Double result Value for Tami val Tami = characterFeature_OBJ.TamilRatio_Character(StrValue) if (!Tami.isNaN()) { RatioValues(16) = RoundDouble(Tami) } - //18.Double result Value for Telugu + // 18.Double result Value for Telugu val Telugu = characterFeature_OBJ.TeluguRatio_Character(StrValue) if (!Telugu.isNaN()) { RatioValues(17) = RoundDouble(Telugu) } - //19.Double result Value for Symbol + // 19.Double result Value for Symbol val Symbol = characterFeature_OBJ.Symbol_Character(StrValue) if (!Symbol.isNaN()) { RatioValues(18) = RoundDouble(Symbol) } - //20. Double Alphabets Ration: + // 20. Double Alphabets Ration: val Alphabets = characterFeature_OBJ.AlphaBetsRation_Character(StrValue) if (!Alphabets.isNaN()) { RatioValues(19) = RoundDouble(Alphabets) } - //21. Double AVisible character Ratio: + // 21. Double AVisible character Ratio: val Visible = characterFeature_OBJ.VisibleRation_Character(StrValue) if (!Visible.isNaN()) { RatioValues(20) = RoundDouble(Visible) } - //22. Double Printable character Ratio: + // 22. Double Printable character Ratio: val Printable = characterFeature_OBJ.PrintableRation_Character(StrValue) if (!Printable.isNaN()) { RatioValues(21) = RoundDouble(Printable) } - //23.Double Blank character Ratio: + // 23.Double Blank character Ratio: val Blank = characterFeature_OBJ.BlankRation_Character(StrValue) if (!Blank.isNaN()) { RatioValues(22) = RoundDouble(Blank) } - //24.Double A control character: + // 24.Double A control character: val Control = characterFeature_OBJ.ControlRation_Character(StrValue) if (!Control.isNaN()) { RatioValues(23) = RoundDouble(Control) } - - //25. Double A hexadecimal digit : + // 25. 
Double A hexadecimal digit : val hexadecimal = characterFeature_OBJ.HexaRation_Character(StrValue) if (!hexadecimal.isNaN()) { RatioValues(24) = RoundDouble(hexadecimal) } + // val FacilityOBJ = new FacilitiesClass() // val vector_Values = FacilityOBJ.ToVector(RatioValues) @@ -176,7 +173,8 @@ class CharactersFeatures extends Serializable { } charRatio } - //1.Uppercase Ratio: + + // 1.Uppercase Ratio: def UppercaseRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{javaUpperCase}") val result: Double = characterRatio(str, pattern) @@ -187,51 +185,51 @@ class CharactersFeatures extends Serializable { val result: Double = characterRatio(str, pattern) result } - //3.Alphanumeric + // 3.Alphanumeric def AlphanumericRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{Alnum}") val result: Double = characterRatio(str, pattern) result } - //4.ASCII + // 4.ASCII def ASCIIRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{ASCII}") val result: Double = characterRatio(str, pattern) result } - //5.Bracket + // 5.Bracket def BracketRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\(|\\)|\\}|\\{|\\[|\\]") val result: Double = characterRatio(str, pattern) result } - //6.Digits + // 6.Digits def DigitsRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\d") val result: Double = characterRatio(str, pattern) result } - //7.Latin + // 7.Latin def Latin_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{IsLatin}") val result: Double = characterRatio(str, pattern) result } - //8.WhiteSpace + // 8.WhiteSpace def WhiteSpace_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\s") val result: Double = characterRatio(str, pattern) result } - //9.Punct + // 9.Punct def Punct_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{Punct}") val result: Double = characterRatio(str, pattern) result } - //10.Long character sequence: + // 10.Long character sequence: def Longcharactersequence_Character(str: String): Double = { var text: String = str var maxlength: Integer = null @@ -265,96 +263,96 @@ class CharactersFeatures extends Serializable { } - //11.ARabic Ratio: + // 11.ARabic Ratio: def ArabicRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{IsArabic}") val result: Double = characterRatio(str, pattern) result } - //12. Bengali Ratio + // 12. 
Bengali Ratio def BengaliRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{IsBengali}") val result: Double = characterRatio(str, pattern) result } - //13.Brahmi Ratio + // 13.Brahmi Ratio def BrahmiRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{IsBrahmi}") val result: Double = characterRatio(str, pattern) result } - //14.Cyrillic Ratio + // 14.Cyrillic Ratio def CyrillicRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{IsCyrillic}") val result: Double = characterRatio(str, pattern) result } - //15.HanRatio + // 15.HanRatio def HanRatio_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{IsHan}") val result: Double = characterRatio(str, pattern) result } - //16.Malaysian Ratio: + // 16.Malaysian Ratio: def MalaysRatio_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{IsMalayalam}") val result: Double = characterRatio(str, pattern) result } - //17.Tamil Ratio: + // 17.Tamil Ratio: def TamilRatio_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{IsTamil}") val result: Double = characterRatio(str, pattern) result } - //18.Telugu Ration: + // 18.Telugu Ration: def TeluguRatio_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{IsTelugu}") val result: Double = characterRatio(str, pattern) result } - //19.Symbols Ratio : + // 19.Symbols Ratio : def Symbol_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("[#$%&@+-_+*/]*") val result: Double = characterRatio(str, pattern) result } - //20.Alphabets Ratio : + // 20.Alphabets Ratio : def AlphaBetsRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{Alpha}") val result: Double = characterRatio(str, pattern) result } - //21.A visible character Ratio: + // 21.A visible character Ratio: def VisibleRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{Graph}") val result: Double = characterRatio(str, pattern) result } - //22.A printable character + // 22.A printable character def PrintableRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{Print}") val result: Double = characterRatio(str, pattern) result } - //23.A Black(it is different from White space) character Ratio + // 23.A Black(it is different from White space) character Ratio def BlankRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{Blank}") val result: Double = characterRatio(str, pattern) result } - //24.Control character Ratio + // 24.Control character Ratio def ControlRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{Cntrl}") val result: Double = characterRatio(str, pattern) result } - //25.HexaDecimal character Ratio + // 25.HexaDecimal character Ratio def HexaRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{XDigit}") val result: Double = characterRatio(str, pattern) @@ -362,4 +360,4 @@ class CharactersFeatures extends Serializable { } // Character features: ------ End calculation the Ratio for character: -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/Classifiers.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/Classifiers.scala index 3549c50..2aa2ea3 100644 --- 
a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/Classifiers.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/Classifiers.scala @@ -1,33 +1,27 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection -import org.apache.spark.{ SparkContext, RangePartitioner } -import org.apache.spark.rdd.RDD -import org.apache.spark.sql._ -import org.apache.spark.sql.types.{ DoubleType, StringType, IntegerType, StructField, StructType } -import org.apache.spark.ml.linalg.{ Vector, Vectors } -import org.apache.spark.ml.classification.{ GBTClassificationModel, GBTClassifier } -import org.apache.spark.ml.classification.DecisionTreeClassificationModel -import org.apache.spark.ml.classification.DecisionTreeClassifier -import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator -import org.apache.spark.ml.classification.LogisticRegression -import org.apache.spark.ml.classification.MultilayerPerceptronClassifier +import java.io.{ File, IOException } +import java.text.SimpleDateFormat +import java.util.{ Calendar, Date } + import scala.collection.mutable -import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics -import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator -import org.apache.spark.ml.feature.{ IndexToString, StringIndexer, VectorIndexer } -import org.apache.spark.ml.classification.{ RandomForestClassificationModel, RandomForestClassifier } + +import org.apache.commons.io.FileUtils +import org.apache.spark.{ RangePartitioner, SparkContext } import org.apache.spark.ml.Pipeline -import org.apache.commons.io.FileUtils; -import java.io.File; -import java.io.IOException; -import java.util.Calendar -import java.text.SimpleDateFormat -import java.util.Date -import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD} +import org.apache.spark.ml.classification.{ DecisionTreeClassificationModel, DecisionTreeClassifier, GBTClassificationModel, GBTClassifier, LogisticRegression, MultilayerPerceptronClassifier, RandomForestClassificationModel, RandomForestClassifier } +import org.apache.spark.ml.evaluation.{ BinaryClassificationEvaluator, MulticlassClassificationEvaluator } +import org.apache.spark.ml.feature.{ IndexToString, StringIndexer, VectorIndexer } +import org.apache.spark.ml.linalg.{ Vector, Vectors } +import org.apache.spark.mllib.classification.{ SVMModel, SVMWithSGD } +import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics +import org.apache.spark.rdd.RDD +import org.apache.spark.sql._ +import org.apache.spark.sql.types.{ DoubleType, IntegerType, StringType, StructField, StructType } class Classifiers extends Serializable { - //1.ok ----- + // 1.ok ----- def RandomForestClassifer(DF_Training: DataFrame, DF_Testing: DataFrame, sc: SparkContext): String = { val sqlContext = new org.apache.spark.sql.SQLContext(sc) @@ -48,7 +42,8 @@ class Classifiers extends Serializable { // val Array(DF_Testing) = DF_Testing//.randomSplit(Array(0.100)) // Train a RandomForest model. - val rf = new RandomForestClassifier().setImpurity("gini").setMaxDepth(3).setNumTrees(20).setFeatureSubsetStrategy("auto").setSeed(5043).setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures") //.setNumTrees(20) + val rf = new RandomForestClassifier().setImpurity("gini").setMaxDepth(3).setNumTrees(20).setFeatureSubsetStrategy("auto").setSeed(5043) + .setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures") // .setNumTrees(20) // Convert indexed labels back to original labels. 
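For readers following RandomForestClassifer, the fragment below restates the pipeline shape it builds (StringIndexer, VectorIndexer, RandomForestClassifier, IndexToString) as a self-contained sketch. `training` and `testing` are hypothetical DataFrames with "label" and "features" columns rather than the project's DF_Training/DF_Testing, and the VectorIndexer settings are assumptions since that part of the method is outside this hunk.

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.feature.{ IndexToString, StringIndexer, VectorIndexer }
import org.apache.spark.sql.DataFrame

// Minimal sketch of the pipeline shape used above.
def randomForestSketch(training: DataFrame, testing: DataFrame): DataFrame = {
  val labelIndexer = new StringIndexer()
    .setInputCol("label").setOutputCol("indexedLabel").fit(training)
  val featureIndexer = new VectorIndexer()
    .setInputCol("features").setOutputCol("indexedFeatures").setMaxCategories(4).fit(training)
  val rf = new RandomForestClassifier()
    .setImpurity("gini").setMaxDepth(3).setNumTrees(20)
    .setFeatureSubsetStrategy("auto").setSeed(5043)
    .setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures")
  val labelConverter = new IndexToString()
    .setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)
  val pipeline = new Pipeline().setStages(Array(labelIndexer, featureIndexer, rf, labelConverter))
  pipeline.fit(training).transform(testing)
}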
val labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels) @@ -66,7 +61,7 @@ class Classifiers extends Serializable { val finlaPrediction = predictions.select("Rid", "features", "FinalROLLBACK_REVERTED", "predictedLabel") predictions.show() - //Case1 : BinaryClassificationEvaluator:OK ------------------------------------------------------ + // Case1 : BinaryClassificationEvaluator:OK ------------------------------------------------------ val binaryClassificationEvaluator = new BinaryClassificationEvaluator().setLabelCol("indexedLabel").setRawPredictionCol("rawPrediction") var results1 = 0.0 def printlnMetricCAse1(metricName: String): Double = { @@ -79,7 +74,7 @@ class Classifiers extends Serializable { val PR = printlnMetricCAse1("areaUnderPR") // Case 2: MulticlassClassificationEvaluator:OK ----------------------------------------------------- - //Select (prediction, true label) and compute test error. + // Select (prediction, true label) and compute test error. val MulticlassClassificationEvaluator = new MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction") var results2 = 0.0 @@ -93,10 +88,10 @@ class Classifiers extends Serializable { val Recall = printlnMetricCase2("weightedRecall") val finalResult = "ROC=" + ROC.toString() + "|" + "PR=" + PR.toString() + "|" + "accuracy=" + accuracy.toString() + "|" + "Precision=" + Precision.toString() + "|" + "Recall=" + Recall.toString() - finalResult + finalResult } - //2.ok------ + // 2.ok------ def DecisionTreeClassifier(DF_Training: DataFrame, DF_Testing: DataFrame, sc: SparkContext): String = { val sqlContext = new org.apache.spark.sql.SQLContext(sc) @@ -135,40 +130,38 @@ class Classifiers extends Serializable { val predictions = modelxx.transform(TestingData) // Select example rows to display. - //val finlaPrediction = predictions.select("Rid", "features", "FinalROLLBACK_REVERTED", "predictedLabel") + // val finlaPrediction = predictions.select("Rid", "features", "FinalROLLBACK_REVERTED", "predictedLabel") - //Case1 : BinaryClassificationEvaluator:---------------------------------------------------------- + // Case1 : BinaryClassificationEvaluator:---------------------------------------------------------- val binaryClassificationEvaluator = new BinaryClassificationEvaluator().setLabelCol("indexedLabel").setRawPredictionCol("rawPrediction") - - var result1=0.0 + + var result1 = 0.0 def printlnMetricCAse1(metricName: String): Double = { - result1 =binaryClassificationEvaluator.setMetricName(metricName).evaluate(predictions) - println(metricName + " = " +result1 ) - + result1 = binaryClassificationEvaluator.setMetricName(metricName).evaluate(predictions) + println(metricName + " = " + result1) + result1 } val ROC = printlnMetricCAse1("areaUnderROC") val PR = printlnMetricCAse1("areaUnderPR") - //Case 2: MulticlassClassificationEvaluator:----------------------------------------------------- - //Select (prediction, true label) and compute test error. + // Case 2: MulticlassClassificationEvaluator:----------------------------------------------------- + // Select (prediction, true label) and compute test error. 
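The two evaluation passes composed above (areaUnderROC/areaUnderPR from BinaryClassificationEvaluator, accuracy/weightedPrecision/weightedRecall from MulticlassClassificationEvaluator) can be condensed into one helper; `predictions` is a hypothetical DataFrame produced by a fitted pipeline, with "indexedLabel", "rawPrediction" and "prediction" columns.

import org.apache.spark.ml.evaluation.{ BinaryClassificationEvaluator, MulticlassClassificationEvaluator }
import org.apache.spark.sql.DataFrame

// Sketch of the metric string assembled by the classifier methods above.
def evaluateSketch(predictions: DataFrame): String = {
  val binary = new BinaryClassificationEvaluator()
    .setLabelCol("indexedLabel").setRawPredictionCol("rawPrediction")
  val multi = new MulticlassClassificationEvaluator()
    .setLabelCol("indexedLabel").setPredictionCol("prediction")

  val roc       = binary.setMetricName("areaUnderROC").evaluate(predictions)
  val pr        = binary.setMetricName("areaUnderPR").evaluate(predictions)
  val accuracy  = multi.setMetricName("accuracy").evaluate(predictions)
  val precision = multi.setMetricName("weightedPrecision").evaluate(predictions)
  val recall    = multi.setMetricName("weightedRecall").evaluate(predictions)

  s"ROC=$roc|PR=$pr|accuracy=$accuracy|Precision=$precision|Recall=$recall"
}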
val MulticlassClassificationEvaluator = new MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction") - var result2=0.0 + var result2 = 0.0 def printlnMetricCase2(metricName: String): Double = { - result2=MulticlassClassificationEvaluator.setMetricName(metricName).evaluate(predictions) + result2 = MulticlassClassificationEvaluator.setMetricName(metricName).evaluate(predictions) println(metricName + " = " + result2) result2 } - val accuracy = printlnMetricCase2("accuracy") + val accuracy = printlnMetricCase2("accuracy") val Precision = printlnMetricCase2("weightedPrecision") val Recall = printlnMetricCase2("weightedRecall") - - val finalResult = "ROC=" + ROC.toString() + "|" + "PR=" + PR.toString() + "|" + "accuracy=" + accuracy.toString() + "|" + "Precision=" + Precision.toString() + "|" + "Recall=" + Recall.toString() + val finalResult = "ROC=" + ROC.toString() + "|" + "PR=" + PR.toString() + "|" + "accuracy=" + accuracy.toString() + "|" + "Precision=" + Precision.toString() + "|" + "Recall=" + Recall.toString() finalResult - - + } // 3.Ok -------- @@ -210,7 +203,7 @@ class Classifiers extends Serializable { predictions.show() - //Case1 : BinaryClassificationEvaluator:---------------------------------------------------------- + // Case1 : BinaryClassificationEvaluator:---------------------------------------------------------- val binaryClassificationEvaluator = new BinaryClassificationEvaluator().setLabelCol("indexedLabel").setRawPredictionCol("rawPrediction") var results1 = 0.0 def printlnMetricCase1(metricName: String): Double = { @@ -222,13 +215,13 @@ class Classifiers extends Serializable { val ROC = printlnMetricCase1("areaUnderROC") val PR = printlnMetricCase1("areaUnderPR") - //Case 2: MulticlassClassificationEvaluator:----------------------------------------------------- - //Select (prediction, true label) and compute test error. + // Case 2: MulticlassClassificationEvaluator:----------------------------------------------------- + // Select (prediction, true label) and compute test error. val MulticlassClassificationEvaluator = new MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction") - var result2=0.0 + var result2 = 0.0 def printlnMetricCase2(metricName: String): Double = { - - result2=MulticlassClassificationEvaluator.setMetricName(metricName).evaluate(predictions) + + result2 = MulticlassClassificationEvaluator.setMetricName(metricName).evaluate(predictions) println(metricName + " = " + result2) result2 } @@ -236,13 +229,12 @@ class Classifiers extends Serializable { val Precision = printlnMetricCase2("weightedPrecision") val Recall = printlnMetricCase2("weightedRecall") - - val finalResult = "ROC=" + ROC.toString() + "|" + "PR=" + PR.toString() + "|" + "accuracy=" + accuracy.toString() + "|" + "Precision=" + Precision.toString() + "|" + "Recall=" + Recall.toString() + val finalResult = "ROC=" + ROC.toString() + "|" + "PR=" + PR.toString() + "|" + "accuracy=" + accuracy.toString() + "|" + "Precision=" + Precision.toString() + "|" + "Recall=" + Recall.toString() + + finalResult - finalResult - } - //4. OK----- + // 4. OK----- def GradientBoostedTree(DF_Training: DataFrame, DF_Testing: DataFrame, sc: SparkContext): String = { val sqlContext = new org.apache.spark.sql.SQLContext(sc) @@ -265,7 +257,7 @@ class Classifiers extends Serializable { // val Array(trainingData, testData) = Data.randomSplit(Array(0.7, 0.3)) // Train a DecisionTree model. 
- val gbt = new GBTClassifier().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures") //.setMaxIter(10) + val gbt = new GBTClassifier().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures") // .setMaxIter(10) // Convert indexed labels back to original labels. val labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels) @@ -281,7 +273,7 @@ class Classifiers extends Serializable { // Select example rows to display. - //Case1 : BinaryClassificationEvaluator:---------------------------------------------------------- + // Case1 : BinaryClassificationEvaluator:---------------------------------------------------------- var predictionsRDD = predictions.select("prediction", "FinalROLLBACK_REVERTED").rdd var predictionAndLabels = predictionsRDD.map { row => (row.get(0).asInstanceOf[Double], row.get(1).asInstanceOf[Double]) } @@ -290,32 +282,31 @@ class Classifiers extends Serializable { println("Area under ROC = " + metrics.areaUnderROC()) println("Area under PR = " + metrics.areaUnderPR()) - val ROC =metrics.areaUnderROC() - val PR= metrics.areaUnderPR() - - - //Case 2: MulticlassClassificationEvaluator:----------------------------------------------------- - //Select (prediction, true label) and compute test error. + val ROC = metrics.areaUnderROC() + val PR = metrics.areaUnderPR() + + // Case 2: MulticlassClassificationEvaluator:----------------------------------------------------- + // Select (prediction, true label) and compute test error. val MulticlassClassificationEvaluator = new MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction") - var result2=0.0 + var result2 = 0.0 def printlnMetric(metricName: String): Double = { - - result2= MulticlassClassificationEvaluator.setMetricName(metricName).evaluate(predictions) - println(metricName + " = " +result2) + + result2 = MulticlassClassificationEvaluator.setMetricName(metricName).evaluate(predictions) + println(metricName + " = " + result2) result2 } val accuracy = printlnMetric("accuracy") val Precision = printlnMetric("weightedPrecision") val Recall = printlnMetric("weightedRecall") - + val finalResult = "ROC=" + ROC.toString() + "|" + "PR=" + PR.toString() + "|" + "accuracy=" + accuracy.toString() + "|" + "Precision=" + Precision.toString() + "|" + "Recall=" + Recall.toString() - finalResult + finalResult } - //5.Ok------------ + // 5.Ok------------ def MultilayerPerceptronClassifier(DF_Training: DataFrame, DF_Testing: DataFrame, sc: SparkContext): String = { val sqlContext = new org.apache.spark.sql.SQLContext(sc) @@ -352,7 +343,7 @@ class Classifiers extends Serializable { // predictions.show() - //Case1 : BinaryClassificationEvaluator:---------------------------------------------------------- + // Case1 : BinaryClassificationEvaluator:---------------------------------------------------------- var predictionsDF = predictions.select("prediction", "label") var predictionsRDD = predictions.select("prediction", "label").rdd var predictionAndLabels = predictionsRDD.map { row => (row.get(0).asInstanceOf[Double], row.get(1).asInstanceOf[Double]) } @@ -361,13 +352,10 @@ class Classifiers extends Serializable { println("Area under ROC = " + metrics.areaUnderROC()) println("Area under PR = " + metrics.areaUnderPR()) - - val ROC =metrics.areaUnderROC() - val PR= metrics.areaUnderPR() - - - - //Case 2: MulticlassClassificationEvaluator:----------------------------------------------------- + val ROC = metrics.areaUnderROC() + val PR = 
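GradientBoostedTree switches to mllib's BinaryClassificationMetrics over an RDD of (prediction, label) pairs instead of the DataFrame evaluator; a compact sketch of that path follows, with assumed column names ("label" standing in for FinalROLLBACK_REVERTED).

import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.sql.DataFrame

// Sketch: pull (prediction, label) pairs out of the predictions DataFrame
// and compute the two area metrics with mllib.
def areaMetricsSketch(predictions: DataFrame): (Double, Double) = {
  val scoreAndLabels = predictions.select("prediction", "label").rdd
    .map(row => (row.getDouble(0), row.getDouble(1)))
  val metrics = new BinaryClassificationMetrics(scoreAndLabels)
  (metrics.areaUnderROC(), metrics.areaUnderPR())
}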
metrics.areaUnderPR() + + // Case 2: MulticlassClassificationEvaluator:----------------------------------------------------- val accuracyevaluator = new MulticlassClassificationEvaluator().setMetricName("accuracy") val weightedPrecisionevaluator = new MulticlassClassificationEvaluator().setMetricName("weightedPrecision") val weightedRecallevaluator = new MulticlassClassificationEvaluator().setMetricName("weightedRecall") @@ -375,22 +363,14 @@ class Classifiers extends Serializable { println("Accuracy = " + accuracyevaluator.evaluate(predictionsDF)) println("weightedPrecision = " + weightedPrecisionevaluator.evaluate(predictionsDF)) println("weightedRecall = " + weightedRecallevaluator.evaluate(predictionsDF)) - - + val accuracy = accuracyevaluator.evaluate(predictionsDF) val Precision = weightedPrecisionevaluator.evaluate(predictionsDF) val Recall = weightedRecallevaluator.evaluate(predictionsDF) - - - val finalResult = "ROC=" + ROC.toString() + "|" + "PR=" + PR.toString() + "|" + "accuracy=" + accuracy.toString() + "|" + "Precision=" + Precision.toString() + "|" + "Recall=" + Recall.toString() - finalResult - - + val finalResult = "ROC=" + ROC.toString() + "|" + "PR=" + PR.toString() + "|" + "accuracy=" + accuracy.toString() + "|" + "Precision=" + Precision.toString() + "|" + "Recall=" + Recall.toString() + finalResult } - - - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/CommentProcessor.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/CommentProcessor.scala index 0f0ecc3..834cd7f 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/CommentProcessor.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/CommentProcessor.scala @@ -1,6 +1,7 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection import java.util.regex.{ Matcher, Pattern } + import org.slf4j.{ Logger, LoggerFactory } class CommentProcessor extends Serializable { @@ -97,7 +98,7 @@ class CommentProcessor extends Serializable { actions } - //Ok: helper for Revision Features: extract Action- subaction from comment: + // Ok: helper for Revision Features: extract Action- subaction from comment: def Extract_ActionsOfNormalComment(comment: String): String = { var result: Boolean = false @@ -108,7 +109,7 @@ class CommentProcessor extends Serializable { var Param = "" var parameters: Array[String] = Array.ofDim[String](0) var asterisk_Start = 0 // == /* - var asterisk_End = 0 //== */ + var asterisk_End = 0 // == */ var colon = 0 if (comment != null) { val check_asterisk_Start = comment.contains("/*") @@ -182,7 +183,7 @@ class CommentProcessor extends Serializable { var suffixComment = "" var asterisk_Start = 0 // == /* - var asterisk_End = 0 //== */ + var asterisk_End = 0 // == */ var colon = 0 if (comment != null) { @@ -246,7 +247,7 @@ class CommentProcessor extends Serializable { var Param = "" var parameters: Array[String] = Array.ofDim[String](0) var asterisk_Start = 0 // == /* - var asterisk_End = 0 //== */ + var asterisk_End = 0 // == */ var colon = 0 if (comment != null) { val check_asterisk_Start = comment.contains("/*") @@ -324,7 +325,7 @@ class CommentProcessor extends Serializable { var Param = "" var parameters: Array[String] = Array.ofDim[String](0) var asterisk_Start = 0 // == /* - var asterisk_End = 0 //== */ + var asterisk_End = 0 // == */ var colon = 0 if (comment != null) { val check_asterisk_Start = 
comment.contains("/*") @@ -403,7 +404,7 @@ class CommentProcessor extends Serializable { var Param = "" var parameters: Array[String] = Array.ofDim[String](0) var asterisk_Start = 0 // == /* - var asterisk_End = 0 //== */ + var asterisk_End = 0 // == */ var colon = 0 if (comment != null) { val check_asterisk_Start = comment.contains("/*") @@ -569,7 +570,7 @@ class CommentProcessor extends Serializable { } else { - //do not thing + // do not thing } @@ -584,7 +585,7 @@ class CommentProcessor extends Serializable { } - //"Thecommentis" + result_Str + "&&&" + "Ac1:" + Action1 + "&&&" + "Ac2 :" + Action2 + "&&&" + "SF:" + suffixComment + // "Thecommentis" + result_Str + "&&&" + "Ac1:" + Action1 + "&&&" + "Ac2 :" + Action2 + "&&&" + "SF:" + suffixComment def isRollback(comment: String): Boolean = { var result: Boolean = false if (comment != null) { @@ -594,8 +595,8 @@ class CommentProcessor extends Serializable { logger.debug("Robust but not precise rollback match (result = " + result + ") : " + tmp) } } - //result = tmp.startsWith("Reverted"); - //result = tmp.startsWith("Reverted"); + // result = tmp.startsWith("Reverted"); + // result = tmp.startsWith("Reverted"); result } @@ -613,8 +614,8 @@ class CommentProcessor extends Serializable { } } } - //result = (tmp.startsWith("Undid") || tmp.startsWith("Undo")) ; - //result = (tmp.startsWith("Undid") || tmp.startsWith("Undo")) ; + // result = (tmp.startsWith("Undid") || tmp.startsWith("Undo")) ; + // result = (tmp.startsWith("Undid") || tmp.startsWith("Undo")) ; result } @@ -632,8 +633,8 @@ class CommentProcessor extends Serializable { } } } - //result = (tmp.startsWith("Restored") || tmp.startsWith("Restore")); - //result = (tmp.startsWith("Restored") || tmp.startsWith("Restore")); + // result = (tmp.startsWith("Restored") || tmp.startsWith("Restore")); + // result = (tmp.startsWith("Restored") || tmp.startsWith("Restore")); result } @@ -693,7 +694,7 @@ class CommentProcessor extends Serializable { } def getUndoneRevisionId(comment: String): Long = { - var result: Long = 0l + var result: Long = 0L val matcher: Matcher = ROBUST_UNDO_PATTERN.matcher(comment) if (matcher.matches()) { val str: String = matcher.group(2) @@ -705,7 +706,7 @@ class CommentProcessor extends Serializable { } def getRestoredRevisionId(comment: String): Long = { - var result: Long = 0l + var result: Long = 0L val matcher: Matcher = ROBUST_RESTORE_PATTERN.matcher(comment) if (matcher.matches()) { val str: String = matcher.group(1) @@ -869,4 +870,4 @@ class CommentProcessor extends Serializable { def getItemValue(): String = itemValue -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/FacilitiesClass.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/FacilitiesClass.scala index a0902aa..333ec38 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/FacilitiesClass.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/FacilitiesClass.scala @@ -1,9 +1,9 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection +import org.apache.spark.ml.linalg.{ Vector, Vectors } import org.apache.spark.rdd.RDD import org.apache.spark.sql._ -import org.apache.spark.sql.types.{ DoubleType, StringType, IntegerType, StructField, StructType } -import org.apache.spark.ml.linalg.{ Vector, Vectors } +import org.apache.spark.sql.types.{ DoubleType, IntegerType, StringType, StructField, StructType 
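getUndoneRevisionId and getRestoredRevisionId share one idiom: match the whole comment, parse a numeric capture group, and fall back to 0L. The pattern below is a simplified stand-in for illustration, not the class's ROBUST_UNDO_PATTERN.

import java.util.regex.Pattern

// Simplified stand-in pattern; the real one is defined elsewhere in the class.
val undoSketchPattern = Pattern.compile(".*Undid revision (\\d+).*")

// Sketch of the group-extraction idiom used above.
def undoneRevisionIdSketch(comment: String): Long = {
  val m = undoSketchPattern.matcher(comment)
  if (m.matches()) {
    try m.group(1).toLong catch { case _: NumberFormatException => 0L }
  } else 0L
}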
} class FacilitiesClass extends Serializable { @@ -18,68 +18,68 @@ class FacilitiesClass extends Serializable { namesList } - //ok --- Used for DF Triples + // ok --- Used for DF Triples def RDD_TO_DFR_RDFXML(rdd: RDD[String], sqlContext: org.apache.spark.sql.SQLContext): DataFrame = { - //Create an Encoded Schema in a String Format: + // Create an Encoded Schema in a String Format: val schemaString = "Subject Predicate Object" - //Generate schema: - val schema = StructType(schemaString.split(" ").map(fieldName ⇒ StructField(fieldName, StringType, true))) - //Apply Transformation for Reading Data from Text File - val rowRDD = rdd.map(_.split(" ")).map(e ⇒ Row(e(0), e(1), e(2))) - //Apply RowRDD in Row Data based on Schema: + // Generate schema: + val schema = StructType(schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, true))) + // Apply Transformation for Reading Data from Text File + val rowRDD = rdd.map(_.split(" ")).map(e => Row(e(0), e(1), e(2))) + // Apply RowRDD in Row Data based on Schema: val RDFTRIPLE = sqlContext.createDataFrame(rowRDD, schema) - //Store DataFrame Data into Table + // Store DataFrame Data into Table RDFTRIPLE.registerTempTable("SPO") - //Select Query on DataFrame + // Select Query on DataFrame val dfr = sqlContext.sql("SELECT * FROM SPO") dfr.show() dfr } - //ok --- Used for DF Triples + // ok --- Used for DF Triples def RDD_TO_DFR_TRIX(rdd: RDD[String], sqlContext: org.apache.spark.sql.SQLContext): DataFrame = { - //Create an Encoded Schema in a String Format: + // Create an Encoded Schema in a String Format: val schemaString = "Subject Predicate Object" - //Generate schema: - val schema = StructType(schemaString.split(" ").map(fieldName ⇒ StructField(fieldName, StringType, true))) - //Apply Transformation for Reading Data from Text File - val rowRDD = rdd.map(_.split("><")).map(e ⇒ Row(e(0), e(1), e(2))) - //Apply RowRDD in Row Data based on Schema: + // Generate schema: + val schema = StructType(schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, true))) + // Apply Transformation for Reading Data from Text File + val rowRDD = rdd.map(_.split("><")).map(e => Row(e(0), e(1), e(2))) + // Apply RowRDD in Row Data based on Schema: val RDFTRIPLE = sqlContext.createDataFrame(rowRDD, schema) - //Store DataFrame Data into Table + // Store DataFrame Data into Table RDFTRIPLE.registerTempTable("SPO") - //Select Query on DataFrame + // Select Query on DataFrame val dfr = sqlContext.sql("SELECT * FROM SPO") dfr.show() dfr } - //ok --- Used for DF Triples + // ok --- Used for DF Triples def RDD_TO_DFR_JTriple(rdd: RDD[String], sqlContext: org.apache.spark.sql.SQLContext): DataFrame = { - //Create an Encoded Schema in a String Format: + // Create an Encoded Schema in a String Format: val schemaString = "Subject Predicate Object" - //Generate schema: - val schema = StructType(schemaString.split(" ").map(fieldName ⇒ StructField(fieldName, StringType, true))) - //Apply Transformation for Reading Data from Text File - val rowRDD = rdd.map(_.split(",")).map(e ⇒ Row(e(0), e(1), e(2))) - //Apply RowRDD in Row Data based on Schema: + // Generate schema: + val schema = StructType(schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, true))) + // Apply Transformation for Reading Data from Text File + val rowRDD = rdd.map(_.split(",")).map(e => Row(e(0), e(1), e(2))) + // Apply RowRDD in Row Data based on Schema: val RDFTRIPLE = sqlContext.createDataFrame(rowRDD, schema) - //Store DataFrame Data into Table + // 
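RDD_TO_DFR_RDFXML, RDD_TO_DFR_TRIX and RDD_TO_DFR_JTriple differ only in the separator used to split each line into Subject/Predicate/Object; a single parameterised sketch of that recipe, without the temp-table/SQL round trip:

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{ DataFrame, Row, SQLContext }
import org.apache.spark.sql.types.{ StringType, StructField, StructType }

// Sketch: build the three-column triple DataFrame for any of the separators
// used above (" ", "><" or ",").
def triplesToDF(rdd: RDD[String], sep: String, sqlContext: SQLContext): DataFrame = {
  val schema = StructType("Subject Predicate Object".split(" ")
    .map(name => StructField(name, StringType, nullable = true)))
  val rows = rdd.map(_.split(sep)).map(e => Row(e(0), e(1), e(2)))
  sqlContext.createDataFrame(rows, schema)
}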
Store DataFrame Data into Table RDFTRIPLE.registerTempTable("SPO") - //Select Query on DataFrame + // Select Query on DataFrame val dfr = sqlContext.sql("SELECT * FROM SPO") dfr.show() dfr } + def RoundDouble(va: Double): Double = { val rounded: Double = Math.round(va * 10000).toDouble / 10000 rounded - } def stringToInt(str: String): Integer = { @@ -139,7 +139,5 @@ class FacilitiesClass extends Serializable { } tem.trim() - } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ItemFeatures.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ItemFeatures.scala index b4fc8c1..2992634 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ItemFeatures.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ItemFeatures.scala @@ -1,10 +1,10 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection -import java.util.regex.{ Pattern, Matcher } +import java.util.regex.{ Matcher, Pattern } class ItemFeatures extends Serializable { - //1. + // 1. def Get_NumberOfLabels(str: String): Double = { // from Label Tag @@ -15,11 +15,9 @@ class ItemFeatures extends Serializable { while (matcher.find()) { count += 1; count - 1 } count - - count } - //2. + // 2. def Get_NumberOfDescription(str: String): Double = { // from description tag @@ -30,11 +28,9 @@ class ItemFeatures extends Serializable { while (matcher.find()) { count += 1; count - 1 } count - - count } - //3. + // 3. def Get_NumberOfAliases(str: String): Double = { // from Aliases Tag @@ -45,11 +41,9 @@ class ItemFeatures extends Serializable { while (matcher.find()) { count += 1; count - 1 } count - - count } - //4. + // 4. def Get_NumberOfClaim(str: String): Double = { // from claim tag @@ -60,10 +54,8 @@ class ItemFeatures extends Serializable { while (matcher.find()) { count += 1; count - 1 } count - - count } - //5. + // 5. def Get_NumberOfSiteLinks(str: String): Double = { // from Sitelink tag @@ -74,10 +66,8 @@ class ItemFeatures extends Serializable { while (matcher.find()) { count += 1; count - 1 } count - - count } - //6. + // 6. def Get_NumberOfstatements(str: String): Double = { // from claims tag @@ -88,10 +78,8 @@ class ItemFeatures extends Serializable { while (matcher.find()) { count += 1; count - 1 } count - - count } - //7. + // 7. def Get_NumberOfReferences(str: String): Double = { @@ -107,7 +95,7 @@ class ItemFeatures extends Serializable { count } - //8. + // 8. def Get_NumberOfQualifier(str: String): Double = { // from claims tag @@ -118,10 +106,8 @@ class ItemFeatures extends Serializable { while (matcher.find()) { count += 1; count - 1 } count - - count } - //9. + // 9. def Get_NumberOfQualifier_Order(str: String): Double = { // from claims tag val input: String = str @@ -131,12 +117,9 @@ class ItemFeatures extends Serializable { while (matcher.find()) { count += 1; count - 1 } count - - count } - //10. + // 10. 
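Every Get_NumberOf* method in ItemFeatures counts occurrences of one JSON key ("labels", "descriptions", "aliases", ...) in the revision text; the generic counter below captures that idiom, with the key passed in as a parameter for illustration.

import java.util.regex.Pattern

// Sketch of the counting idiom shared by the Get_NumberOf* methods:
// count occurrences of a JSON key such as "labels": in the revision text.
def countKeySketch(json: String, key: String): Double = {
  val pattern = Pattern.compile("\"" + key + "\"" + ":")
  val matcher = pattern.matcher(json)
  var count = 0
  while (matcher.find()) count += 1
  count.toDouble
}

// e.g. countKeySketch(revisionJson, "labels") plays the role of Get_NumberOfLabels.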
def Get_NumberOfBadges(str: String): Double = { - // from Sitelink tag val input: String = str val pattern: Pattern = Pattern.compile(""""badges"""" + ":") @@ -145,8 +128,6 @@ class ItemFeatures extends Serializable { while (matcher.find()) { count += 1; count - 1 } count - - count } -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/Main.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/Main.scala index 02f0bdd..5fa21d8 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/Main.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/Main.scala @@ -1,8 +1,7 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection -import org.apache.spark.{ SparkConf, SparkContext } +import org.apache.spark.{ RangePartitioner, SparkConf, SparkContext } import org.apache.spark.sql._ -import org.apache.spark.{ SparkContext, RangePartitioner } object Main { @@ -19,7 +18,7 @@ object Main { if (num == "1") { Start.Start_RDF_Parser_Appraoch(sc) - } // Distributed Standard Parser and Vandalism Detection : + } // Distributed Standard Parser and Vandalism Detection: else if (num == "2") { val Training_Data = Start.Training_Start_StandardXMLParser_VD(sc) @@ -27,22 +26,21 @@ object Main { val OBJClassifiers = new Classifiers() - //1.Random Forest Classifer: + // 1.Random Forest Classifer: val RandomForestClassifer_Values = OBJClassifiers.RandomForestClassifer(Training_Data, Testing_Data, sc) - //2.DecisionTreeClassifier + // 2.DecisionTreeClassifier val DecisionTreeClassifier_values = OBJClassifiers.DecisionTreeClassifier(Training_Data, Testing_Data, sc) // 3.LogisticRegrision val LogisticRegrision_values = OBJClassifiers.LogisticRegrision(Training_Data, Testing_Data, sc) - //4.GradientBoostedTree + // 4.GradientBoostedTree val GradientBoostedTree_values = OBJClassifiers.GradientBoostedTree(Training_Data, Testing_Data, sc) - //5.MultilayerPerceptronClassifier + // 5.MultilayerPerceptronClassifier val MultilayerPerceptronClassifier_values = OBJClassifiers.MultilayerPerceptronClassifier(Training_Data, Testing_Data, sc) - println(RandomForestClassifer_Values) println(DecisionTreeClassifier_values) println(LogisticRegrision_values) @@ -52,4 +50,4 @@ object Main { } } -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseJTriple.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseJTriple.scala index 122e297..395e53b 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseJTriple.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseJTriple.scala @@ -1,24 +1,25 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection -import org.apache.spark.SparkContext + +import java.io.ByteArrayInputStream +import java.util.ArrayList +import java.util.regex.Pattern + import org.apache.hadoop.mapred.JobConf -import org.apache.spark.rdd.RDD import org.apache.jena.graph.Triple import org.apache.jena.rdf.model.ModelFactory -import java.util.ArrayList -import java.util.regex.Pattern -import java.io.ByteArrayInputStream - -class ParseJTriple extends Serializable{ - - - def Start_JTriple_Parser(jobConf_Record: JobConf, sc: SparkContext): RDD[String] = { +import org.apache.spark.SparkContext +import 
org.apache.spark.rdd.RDD + +class ParseJTriple extends Serializable { + + def Start_JTriple_Parser(jobConf_Record: JobConf, sc: SparkContext): RDD[String] = { jobConf_Record.set("stream.recordreader.class", "org.apache.hadoop.streaming.StreamXmlRecordReader") jobConf_Record.set("stream.recordreader.begin", """"s":""") // start Tag jobConf_Record.set("stream.recordreader.end", "}") // End Tag org.apache.hadoop.mapred.FileInputFormat.addInputPaths(jobConf_Record, "hdfs://localhost:9000/mydata/xxx.json") // input path from Hadoop - //------------JTriple Record + // ------------JTriple Record // read data and save in RDD as block- JTriple Record val JTriple_Dataset_Record = sc.hadoopRDD(jobConf_Record, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) // println("HelloRecords" + " " + JTriple_Dataset_Record.count) @@ -29,14 +30,10 @@ class ParseJTriple extends Serializable{ val RevisioninOneString = JTriple_Dataset_Record_AsstringBlock.map(line => New_abendRevision(line)).distinct().cache() RevisioninOneString } - def New_abendRevision(str: String): String = { + def New_abendRevision(str: String): String = { val s1 = str.replaceAll("[\r\n]+", " "); - val s2 = s1.replaceAll("[.\\s]","").trim() - + val s2 = s1.replaceAll("[.\\s]", "").trim() s2 } - - - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseNormalXML.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseNormalXML.scala index 5b70361..cea1e38 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseNormalXML.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseNormalXML.scala @@ -1,19 +1,20 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection -import org.apache.spark.SparkContext -import org.apache.hadoop.mapred.JobConf -import org.apache.spark.rdd.RDD import java.math.BigInteger +import java.net.InetAddress import java.util.ArrayList -import org.apache.commons.lang3.ArrayUtils import java.util.regex.{ Matcher, Pattern } -import java.net.InetAddress + +import org.apache.commons.lang3.ArrayUtils +import org.apache.hadoop.mapred.JobConf +import org.apache.spark.SparkContext +import org.apache.spark.rdd.RDD class ParseNormalXML extends Serializable { def Training_DB_NormalXML_Parser_Input1(sc: SparkContext): RDD[String] = { - //Streaming records:==================================================================Input Files + // Streaming records:==================================================================Input Files val jobConf = new JobConf() jobConf.set("stream.recordreader.class", "org.apache.hadoop.streaming.StreamXmlRecordReader") jobConf.set("stream.recordreader.begin", "") // start Tag @@ -21,10 +22,10 @@ class ParseNormalXML extends Serializable { org.apache.hadoop.mapred.FileInputFormat.addInputPaths(jobConf, "hdfs://localhost:9000/mydata/sample.xml") // input path from Hadoop // read data and save in RDD as block - val wikiData = sc.hadoopRDD(jobConf, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) //.distinct() + val wikiData = sc.hadoopRDD(jobConf, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) // .distinct() println(wikiData.count) val RevisionTagewikidata = 
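Start_JTriple_Parser and the ParseNormalXML readers all rely on Hadoop's StreamXmlRecordReader to cut the input into blocks between a begin and an end marker; a parameterised sketch of that setup follows, with the HDFS path left as a placeholder argument rather than the hard-coded paths used above.

import org.apache.hadoop.mapred.JobConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

// Sketch: read record blocks delimited by `begin`/`end` tags and return them as strings.
def streamRecords(sc: SparkContext, begin: String, end: String, path: String): RDD[String] = {
  val jobConf = new JobConf()
  jobConf.set("stream.recordreader.class", "org.apache.hadoop.streaming.StreamXmlRecordReader")
  jobConf.set("stream.recordreader.begin", begin)
  jobConf.set("stream.recordreader.end", end)
  org.apache.hadoop.mapred.FileInputFormat.addInputPaths(jobConf, path)
  sc.hadoopRDD(jobConf,
    classOf[org.apache.hadoop.streaming.StreamInputFormat],
    classOf[org.apache.hadoop.io.Text],
    classOf[org.apache.hadoop.io.Text])
    .map { case (key, _) => key.toString }
}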
wikiData.map { case (x, y) => (x.toString()) } - //println(RevisionTagewikidata.count) + // println(RevisionTagewikidata.count) // ABend the revision in one line string val RevisioninOneString = RevisionTagewikidata.map(line => New_abendRevision(line)).cache() @@ -37,7 +38,7 @@ class ParseNormalXML extends Serializable { } def Training_DB_NormalXML_Parser_Input2(sc: SparkContext): RDD[String] = { - //Streaming records:==================================================================Input Files + // Streaming records:==================================================================Input Files val jobConf = new JobConf() jobConf.set("stream.recordreader.class", "org.apache.hadoop.streaming.StreamXmlRecordReader") jobConf.set("stream.recordreader.begin", "") // start Tag @@ -45,10 +46,10 @@ class ParseNormalXML extends Serializable { org.apache.hadoop.mapred.FileInputFormat.addInputPaths(jobConf, "hdfs://localhost:9000/mydata/2.xml") // input path from Hadoop // read data and save in RDD as block - val wikiData = sc.hadoopRDD(jobConf, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) //.distinct() + val wikiData = sc.hadoopRDD(jobConf, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) // .distinct() println(wikiData.count) val RevisionTagewikidata = wikiData.map { case (x, y) => (x.toString()) } - //println(RevisionTagewikidata.count) + // println(RevisionTagewikidata.count) // ABend the revision in one line string val RevisioninOneString = RevisionTagewikidata.map(line => New_abendRevision(line)).cache() @@ -61,7 +62,7 @@ class ParseNormalXML extends Serializable { } def Training_DB_NormalXML_Parser_Input3(sc: SparkContext): RDD[String] = { - //Streaming records:==================================================================Input Files + // Streaming records:==================================================================Input Files val jobConf = new JobConf() jobConf.set("stream.recordreader.class", "org.apache.hadoop.streaming.StreamXmlRecordReader") jobConf.set("stream.recordreader.begin", "") // start Tag @@ -69,10 +70,10 @@ class ParseNormalXML extends Serializable { org.apache.hadoop.mapred.FileInputFormat.addInputPaths(jobConf, "hdfs://localhost:9000/mydata/3.xml") // input path from Hadoop // read data and save in RDD as block - val wikiData = sc.hadoopRDD(jobConf, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) //.distinct() + val wikiData = sc.hadoopRDD(jobConf, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) // .distinct() println(wikiData.count) val RevisionTagewikidata = wikiData.map { case (x, y) => (x.toString()) } - //println(RevisionTagewikidata.count) + // println(RevisionTagewikidata.count) // ABend the revision in one line string val RevisioninOneString = RevisionTagewikidata.map(line => New_abendRevision(line)).cache() // println("TotalCount" + " " + RevisioninOneString.count) @@ -83,9 +84,9 @@ class ParseNormalXML extends Serializable { } - def Testing_DB_NormalXML_Parser(sc: SparkContext): RDD[String] = { + def Testing_DB_NormalXML_Parser(sc: SparkContext): RDD[String] = { - //Streaming records:==================================================================Input Files + // Streaming 
records:==================================================================Input Files val jobConf = new JobConf() jobConf.set("stream.recordreader.class", "org.apache.hadoop.streaming.StreamXmlRecordReader") jobConf.set("stream.recordreader.begin", "") // start Tag @@ -93,10 +94,10 @@ class ParseNormalXML extends Serializable { org.apache.hadoop.mapred.FileInputFormat.addInputPaths(jobConf, "hdfs://localhost:9000/mydata/3.xml") // input path from Hadoop // read data and save in RDD as block - val wikiData = sc.hadoopRDD(jobConf, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) //.distinct() + val wikiData = sc.hadoopRDD(jobConf, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) // .distinct() println(wikiData.count) val RevisionTagewikidata = wikiData.map { case (x, y) => (x.toString()) } - //println(RevisionTagewikidata.count) + // println(RevisionTagewikidata.count) // ABend the revision in one line string val RevisioninOneString = RevisionTagewikidata.map(line => New_abendRevision(line)).cache() // println("TotalCount" + " " + RevisioninOneString.count) @@ -107,11 +108,6 @@ class ParseNormalXML extends Serializable { } - - - - - // make the revision as one string def New_abendRevision(str: String): String = { @@ -125,13 +121,13 @@ class ParseNormalXML extends Serializable { // Ok: used on the Top def New_Build_Revision_map(obj: String): String = { var Store_Record_String = "" - //Json Revision : + // Json Revision : val JsonStr = Get_Json_Revision(obj) val Standered_JsonStr = Standared_Get_Json_Revision(obj) // for full string Jason with all formating for parsing by spark val Json_Standered = Standered_JsonStr.get(0).toString() // for full string Jason with all formating for parsing by spark val Json = JsonStr.get(0).toString() - //0.Id Revision + // 0.Id Revision val IdRevision = Get_ID_Revision(obj) if (IdRevision != "") { val ID = IdRevision.toString().trim() @@ -141,7 +137,7 @@ class ParseNormalXML extends Serializable { // else { // Store_Record_String = "0" // } - //1. Item Title : + // 1. Item Title : val ItemTitle: ArrayList[String] = Get_Item_Title_FromJson(Json) if (ItemTitle.size() > 0) { val groupItemTilte = ItemTitle.get(0).toString() @@ -164,8 +160,8 @@ class ParseNormalXML extends Serializable { } } - //=============Start:======= extract information from the json string - //2.Comments : + // =============Start:======= extract information from the json string + // 2.Comments : val commentarray = Get_Comment(obj) val comment = commentarray.get(0) if (comment.nonEmpty) { @@ -174,7 +170,7 @@ class ParseNormalXML extends Serializable { Store_Record_String = Store_Record_String.trim() + "NNLL" + "NA" } - //3.Parent ID : + // 3.Parent ID : val ParentIDStr = Get_ParentID(obj) if (ParentIDStr.nonEmpty) { @@ -185,7 +181,7 @@ class ParseNormalXML extends Serializable { Store_Record_String = Store_Record_String + "NNLL" + "0" } - //4.Timestamp: + // 4.Timestamp: val TimeStamparray = Get_TIMEStamp(obj) val TimeSta = TimeStamparray.get(0) if (TimeSta.nonEmpty) { @@ -194,41 +190,41 @@ class ParseNormalXML extends Serializable { Store_Record_String = Store_Record_String + "NNLL" + "NA" } - //5. Contributor Data( IP ): + // 5. 
Contributor Data( IP ): val Contributstr = Get_Contributor_IP(obj) - //val ContributorSta = Contributorarray.get(0) + // val ContributorSta = Contributorarray.get(0) if (Contributstr != "0") { Store_Record_String = Store_Record_String + "NNLL" + Contributstr.trim() } else { Store_Record_String = Store_Record_String + "NNLL" + "0" } - //6. Contributor ID : + // 6. Contributor ID : val Contributor_IDStr = Get_Contributor_ID(obj) - //val Contributor_IDSta = Contributor_IDarray.get(0) + // val Contributor_IDSta = Contributor_IDarray.get(0) if (Contributor_IDStr != "0") { Store_Record_String = Store_Record_String + "NNLL" + Contributor_IDStr.trim() } else { Store_Record_String = Store_Record_String + "NNLL" + "0" } - //7. Contributor Name : + // 7. Contributor Name : val Contributor_NameStr = Get_Contributor_Name(obj) - //val Contributor_IDSta = Contributor_IDarray.get(0) + // val Contributor_IDSta = Contributor_IDarray.get(0) if (Contributor_NameStr != "NA") { Store_Record_String = Store_Record_String + "NNLL" + Contributor_NameStr.trim() } else { Store_Record_String = Store_Record_String + "NNLL" + "NA" } - //8. Full Json Tag for Parsing: + // 8. Full Json Tag for Parsing: if (Json_Standered.nonEmpty) { Store_Record_String = Store_Record_String + "NNLL" + Json_Standered.trim() } else { Store_Record_String = Store_Record_String + "NNLL" + "NA" } - //9. Model : + // 9. Model : val modelstr = Get_Model(obj) if (modelstr.nonEmpty) { @@ -236,14 +232,14 @@ class ParseNormalXML extends Serializable { } else { Store_Record_String = Store_Record_String + "NNLL" + "NA" } - //10.Format: + // 10.Format: val Formatstr = Get_Format(obj) if (Formatstr.nonEmpty) { Store_Record_String = Store_Record_String + "NNLL" + Formatstr.trim() } else { Store_Record_String = Store_Record_String + "NNLL" + "NA" } - //11.SHA1 : + // 11.SHA1 : val SHAstr = Get_SHA1(obj) if (SHAstr.nonEmpty) { Store_Record_String = Store_Record_String + "NNLL" + SHAstr.trim() @@ -290,8 +286,8 @@ class ParseNormalXML extends Serializable { } - //********************** - // if (str.contains("")){ + // ********************** + // if (str.contains("")) { // // val inputID: CharSequence = str // val pattStr_id: String = "[0-9]+" @@ -306,7 +302,7 @@ class ParseNormalXML extends Serializable { // } // } // - // else if (str.contains("")){ + // else if (str.contains("")) { // // val inputID: CharSequence = str // val pattStr_id: String = "[0-9]+" @@ -327,7 +323,7 @@ class ParseNormalXML extends Serializable { tem } - //Extract TimeStampe value from Tag: + // Extract TimeStampe value from Tag: def Get_TIMEStamp(str: String): ArrayList[String] = { val TimeStamp: ArrayList[String] = new ArrayList[String]() @@ -382,7 +378,7 @@ class ParseNormalXML extends Serializable { } - //extract Item Title from Json string + // extract Item Title from Json string def Get_Item_Title_FromJson(str: String): ArrayList[String] = { val Item_Title_FromJason: ArrayList[String] = new ArrayList[String]() @@ -634,5 +630,4 @@ class ParseNormalXML extends Serializable { } temp } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseRDFXML.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseRDFXML.scala index 3f83897..2add40c 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseRDFXML.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseRDFXML.scala @@ -1,13 
+1,14 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection -import org.apache.spark.SparkContext +import java.io.ByteArrayInputStream +import java.util.ArrayList +import java.util.regex.Pattern + import org.apache.hadoop.mapred.JobConf -import org.apache.spark.rdd.RDD import org.apache.jena.graph.Triple import org.apache.jena.rdf.model.ModelFactory -import java.util.ArrayList -import java.util.regex.Pattern -import java.io.ByteArrayInputStream +import org.apache.spark.SparkContext +import org.apache.spark.rdd.RDD class ParseRDFXML extends Serializable { @@ -24,7 +25,7 @@ class ParseRDFXML extends Serializable { org.apache.hadoop.mapred.FileInputFormat.addInputPaths(jobConf_Record, "hdfs://localhost:9000/mydata/Germany.rdf") // input path from Hadoop org.apache.hadoop.mapred.FileInputFormat.addInputPaths(jobConf_Prefixes, "hdfs://localhost:9000/mydata/Germany.rdf") // input path from Hadoop - //------------ RDF XML Record + // ------------ RDF XML Record // read data and save in RDD as block- RDFXML Record val RDFXML_Dataset_Record = sc.hadoopRDD(jobConf_Record, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) // println("HelloRecords" + " " + RDFXML_Dataset_Record.count) @@ -34,14 +35,14 @@ class ParseRDFXML extends Serializable { println("HelloRecords" + " " + RDFXML_Dataset_Record_AsstringBlock.count) // RDFXML_Dataset_Record_AsstringBlock.foreach(println) - //-------------RDF XML Prefixes + // -------------RDF XML Prefixes // read data and save in RDD as block- RDFXML Prefixes val RDFXML_Dataset_Prefixes = sc.hadoopRDD(jobConf_Prefixes, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) println("HelloPrefixes" + " " + RDFXML_Dataset_Prefixes.count) // RDFXML_Dataset_Prefixes.foreach(println) // Convert the block- RDFXML Prefixes to String DataType var RDFXML_Dataset_AsstringPrefixes_WithoutDist = RDFXML_Dataset_Prefixes.map { case (x, y) => (x.toString()) } - val RDFXML_Dataset_AsstringPrefixes=RDFXML_Dataset_AsstringPrefixes_WithoutDist.distinct() + val RDFXML_Dataset_AsstringPrefixes = RDFXML_Dataset_AsstringPrefixes_WithoutDist.distinct() println("HelloPrefixes" + " " + RDFXML_Dataset_AsstringPrefixes.count) // RDFXML_Dataset_AsstringPrefixes.foreach(println) val pref = RDFXML_Dataset_AsstringPrefixes.reduce((a, b) => a + "\n" + b) @@ -88,5 +89,4 @@ class ParseRDFXML extends Serializable { val str = Arraylistval.get(0).toString() str } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseTRIX.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseTRIX.scala index 3bd8364..f3a4201 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseTRIX.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseTRIX.scala @@ -1,12 +1,14 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection -import org.apache.spark.SparkContext + +import java.io.ByteArrayInputStream +import java.util.ArrayList +import java.util.regex.Pattern + import org.apache.hadoop.mapred.JobConf -import org.apache.spark.rdd.RDD import org.apache.jena.graph.Triple import org.apache.jena.rdf.model.ModelFactory -import java.util.ArrayList -import java.util.regex.Pattern -import java.io.ByteArrayInputStream +import org.apache.spark.SparkContext 
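start_RDFXML_Parser collects the file's prefix declarations separately and reduces them into one header before parsing each record with Jena (per the imports above); the exact parsing helper is outside this hunk, so the sketch below only illustrates the Jena step for an already assembled, well-formed RDF/XML string.

import java.io.ByteArrayInputStream
import org.apache.jena.rdf.model.ModelFactory
import scala.collection.JavaConverters._

// Illustrative only: parse a complete RDF/XML document string into triples with Jena.
def parseRdfXmlSketch(rdfXml: String): Seq[String] = {
  val model = ModelFactory.createDefaultModel()
  model.read(new ByteArrayInputStream(rdfXml.getBytes("UTF-8")), null)
  model.listStatements().asScala.map(_.asTriple().toString).toSeq
}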
+import org.apache.spark.rdd.RDD class ParseTRIX extends Serializable { @@ -18,7 +20,7 @@ class ParseTRIX extends Serializable { org.apache.hadoop.mapred.FileInputFormat.addInputPaths(jobConf_Record, "hdfs://localhost:9000/mydata/xx.trix") // input path from Hadoop - //------------TRIX Record + // ------------TRIX Record // read data and save in RDD as block- TRIX Record val TRIX_Dataset_Record = sc.hadoopRDD(jobConf_Record, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) // println("HelloRecords" + " " + TRIX_Dataset_Record.count) @@ -43,11 +45,9 @@ class ParseTRIX extends Serializable { s4 } - // This function for TRIX case. def arrayListTOstring(Arraylistval: ArrayList[Triple]): String = { val str = Arraylistval.get(0).toString() str } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/RevisionFeatures.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/RevisionFeatures.scala index 7dc3c19..ccbd2b4 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/RevisionFeatures.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/RevisionFeatures.scala @@ -1,4 +1,5 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection + import java.util.regex.{ Matcher, Pattern } class RevisionFeatures extends Serializable { @@ -53,7 +54,7 @@ class RevisionFeatures extends Serializable { } - // if (result_isNonLatin==true){ // is matched + // if (result_isNonLatin==true) { // is matched // // Final_Result=false // @@ -123,4 +124,4 @@ class RevisionFeatures extends Serializable { } -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/SentencesFeatures.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/SentencesFeatures.scala index 5490ec1..62c0432 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/SentencesFeatures.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/SentencesFeatures.scala @@ -13,7 +13,7 @@ class SentencesFeatures extends Serializable { } - //1.comment tail Lenght Action subaction param+ tail + // 1.comment tail Lenght Action subaction param+ tail def CommentTailLenght(Full_Comment_Str: String): Integer = { val parsedCommment_OBJ = new CommentProcessor() val commentTail_Str = parsedCommment_OBJ.Extract_CommentTail(Full_Comment_Str) @@ -23,9 +23,9 @@ class SentencesFeatures extends Serializable { } // similarity between the comment ( suffix of the comment = Tail ) where the comment is normal comment /* .........*/ or /* ......... // e.g This comment includes wb...sitelink - //1-we have to be sure the comment is normal comment take the form /* ........./* - //2-Next step: we check the Action part if it includes a sitelink word or not. - //3-we compare the suffix in this case to site link with pay attention to the same language. + // 1-we have to be sure the comment is normal comment take the form /* ........./* + // 2-Next step: we check the Action part if it includes a sitelink word or not. + // 3-we compare the suffix in this case to site link with pay attention to the same language. // we check the type of Normal comment if it contains Aliases . 
def extract_CommentAliases_LanguageType(Full_Comment_Str: String): String = { @@ -185,5 +185,4 @@ class SentencesFeatures extends Serializable { langeType.trim() } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/StatementFeatures.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/StatementFeatures.scala index 33b1b5a..31d1158 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/StatementFeatures.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/StatementFeatures.scala @@ -42,5 +42,4 @@ class StatementFeatures extends Serializable { } result } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/VandalismDetection.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/VandalismDetection.scala index 065adb1..5937817 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/VandalismDetection.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/VandalismDetection.scala @@ -1,31 +1,28 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection -import org.apache.spark.{ SparkContext, RangePartitioner } -import org.apache.spark.sql._ -import org.apache.spark.sql.expressions.Window -import org.apache.hadoop.mapred.JobConf import java.util.Scanner -import org.json.JSONObject + import org.apache.commons.lang3.StringUtils -import org.apache.spark.sql.functions.{ concat, lit } -import org.apache.spark.ml.feature.{ Word2Vec, Word2VecModel } +import org.apache.hadoop.mapred.JobConf +import org.apache.spark.{ RangePartitioner, SparkContext } import org.apache.spark.ml.Pipeline -import org.apache.spark.ml.feature.VectorAssembler +import org.apache.spark.ml.feature.{ VectorAssembler, Word2Vec, Word2VecModel } import org.apache.spark.ml.linalg.Vector +import org.apache.spark.sql._ +import org.apache.spark.sql.expressions.Window +import org.apache.spark.sql.functions.{ concat, lit } +import org.json.JSONObject class VandalismDetection extends Serializable { - - - // Function 1 : Distributed RDF Parser Approach def Start_RDF_Parser_Appraoch(sc: SparkContext): Unit = { - + val sqlContext = new org.apache.spark.sql.SQLContext(sc) import sqlContext.implicits._ import org.apache.spark.sql.functions._ // for UDF import org.apache.spark.sql.types._ - + println("*********************************************************************") println("Distributed RDF Parser Model") println("Please Enter 1 for JTriple and 2 for TRIX process and 3 for RDFXML:") @@ -41,12 +38,11 @@ class VandalismDetection extends Serializable { val DRF_Builder_JTripleOBJ = new FacilitiesClass() val RDD_JTriple = JTriple_Parser_OBJ.Start_JTriple_Parser(jobConf, sc) RDD_JTriple.foreach(println) - //----------------------------DF for RDF TRIX ------------------------------------------ + // ----------------------------DF for RDF TRIX ------------------------------------------ // Create SQLContext Object: val sqlContext = new org.apache.spark.sql.SQLContext(sc) val DFR_JTriple = DRF_Builder_JTripleOBJ.RDD_TO_DFR_JTriple(RDD_JTriple, sqlContext) DFR_JTriple.show() - } else if (num == "2") { @@ -57,12 +53,11 @@ class VandalismDetection extends Serializable { val DRF_Builder_RDFTRIX_OBJ = new FacilitiesClass() val RDD_TRIX = 
TRIX_Parser_OBJ.Start_TriX_Parser(jobConf, sc) RDD_TRIX.foreach(println) - //----------------------------DF for RDF TRIX ------------------------------------------ + // ----------------------------DF for RDF TRIX ------------------------------------------ // Create SQLContext Object: val sqlContext = new org.apache.spark.sql.SQLContext(sc) val DFR_TRIX = DRF_Builder_RDFTRIX_OBJ.RDD_TO_DFR_TRIX(RDD_TRIX, sqlContext) DFR_TRIX.show() - } else if (num == "3") { println("RDF XML .........!!!!!!") @@ -76,1917 +71,1272 @@ class VandalismDetection extends Serializable { val RDD_RDFXML = RDFXML_Parser_OBJ.start_RDFXML_Parser(jobConf_Record, jobConf_Prefixes, sc) RDD_RDFXML.foreach(println) - //----------------------------DF for RDF XML ------------------------------------------ + // ----------------------------DF for RDF XML ------------------------------------------ // Create SQLContext Object: val sqlContext = new org.apache.spark.sql.SQLContext(sc) val DFR_RDF_XML = DRF_Builder_RDFXML_OBJ.RDD_TO_DFR_RDFXML(RDD_RDFXML, sqlContext) DFR_RDF_XML.show() } - - sc.stop() + + sc.stop() } - - //*********************************************************************************************************************************************** - // Function 2:Training XML and Vandalism Detection + + // ********************************************************************************* + // Function 2:Training XML and Vandalism Detection def Training_Start_StandardXMLParser_VD(sc: SparkContext): DataFrame = { val sqlContext = new org.apache.spark.sql.SQLContext(sc) import sqlContext.implicits._ import org.apache.spark.sql.functions._ // for UDF import org.apache.spark.sql.types._ - // Streaming records: - val jobConf = new JobConf() - val NormalXML_Parser_OBJ = new ParseNormalXML() - val RDD_OBJ = new ParseNormalXML() - - val Training_RDD_All_Record1 = RDD_OBJ.Training_DB_NormalXML_Parser_Input1(sc) - val Training_RDD_All_Record2 = RDD_OBJ.Training_DB_NormalXML_Parser_Input2(sc) - val Training_RDD_All_Record3 = RDD_OBJ.Training_DB_NormalXML_Parser_Input3(sc) - //RDD_All_Record1.foreach(println) - //RDD_All_Record2.foreach(println) - // RDD_All_Record3.foreach(println) - - val Training_RDD_All_Record = Training_RDD_All_Record1.union(Training_RDD_All_Record2).union(Training_RDD_All_Record3).distinct().cache() - - //println(RDD_All_Record.count()) - println(Training_RDD_All_Record.count()) - - // ======= Json part : - //Json RDD : Each record has its Revision iD: - val JsonRDD = Training_RDD_All_Record.map(_.split("NNLL")).map(v => replacing_with_Quoto(v(0), v(8))).cache() - //JsonRDD.foreach(println) - //println(JsonRDD.count()) - - // Data set - val Ds_Json = sqlContext.jsonRDD(JsonRDD).select("key", "id", "labels", "descriptions", "aliases", "claims", "sitelinks").cache() - //Ds_Json.show() - // println(Ds_Json.count()) - - // ======= Tags part : // Contributor IP here is in Decimal format not IP format and It is converted in ParseNormalXml stage - val TagsRDD = Training_RDD_All_Record.map(_.split("NNLL")).map(x => (x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9), x(10), x(11))).cache() - val DF_Tags = TagsRDD.toDF("Rid", "Itemid", "comment", "pid", "time", "contributorIP", "contributorID", "contributorName", "JsonText", "model", "format", "sha").cache() - // DF_Tags.show() - // println(DF_Tags.count()) - - //======== Join Json part with Tag Part:============================ - //Joining to have full data - val DF_First_DF_Result_Join_Tags_and_Json = DF_Tags.as("T1").join(Ds_Json.as("T2"), $"T1.Rid" === 
$"T2.key", "leftouter").select("Rid", "itemid", "comment", "pid", "time", "contributorIP", "contributorID", "contributorName", "JsonText", "labels", "descriptions", "aliases", "claims", "sitelinks", "model", "format", "sha") //.orderBy("Rid", "Itemid") - DF_First_DF_Result_Join_Tags_and_Json.registerTempTable("Data1") - val dfr_DATA_JsonTages1 = sqlContext.sql("select * from Data1 order by itemid ,Rid ").cache() - - val colNames = Seq("Rid2", "itemid2", "comment2", "pid2", "time2", "contributorIP2", "contributorID2", "contributorName2", "JsonText2", "labels2", "descriptions2", "aliases2", "claims2", "sitelinks2", "model2", "format2", "sha2") - val DF_Second = DF_First_DF_Result_Join_Tags_and_Json.toDF(colNames: _*) //.distinct() - DF_Second.registerTempTable("Data2") - - //===================================================================Parent // Previous Revision============================================================================================================== - //val DF_Joined = result1.as("df1").join(result2.as("df2"), col("itemid") === col("itemid2") && col("index1") === col("index2") + 1, "leftouter").select("Rid", "itemid", "comment", "pid", "time", "contributorIP", "contributorID", "contributorName", "JsonText", "labels", "descriptions", "aliases", "claims", "sitelinks", "model", "format", "sha", "Rid2", "itemid2", "comment2", "pid2", "time2", "contributorIP2", "contributorID2", "contributorName2", "JsonText2", "labels2", "descriptions2", "aliases2", "claims2", "sitelinks2", "model2", "format2", "sha2") - //.select("itemid", "Rid","pid","time","itemid2","Rid2","pid2","time2") - - //Joining based on Parent Id to get the previous cases: ParentID - val DF_Joined = DF_First_DF_Result_Join_Tags_and_Json.as("df1").join(DF_Second.as("df2"), $"df1.pid" === $"df2.Rid2", "leftouter").distinct() - - val RDD_After_JoinDF = DF_Joined.rdd.distinct() - val x = RDD_After_JoinDF.map(row => (row(0).toString().toInt, row)).cache() - val part = new RangePartitioner(4, x) - val partitioned = x.partitionBy(part).persist() // persist is important for this case and obligatory. 
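The block above pairs every revision with its parent revision by renaming the joined DataFrame's columns with a "2" suffix (DF_Second) and left-outer joining on pid === Rid2. A minimal sketch of that self-join pattern, reduced to three columns of toy data and written against the Spark 2.x SparkSession API rather than the SQLContext used here:

// Toy reproduction of the "current revision + parent revision" pairing above.
import org.apache.spark.sql.SparkSession

object ParentRevisionJoinSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("parent-join").getOrCreate()
    import spark.implicits._

    // (Rid, pid, comment): pid points to the parent revision id, 0 = no parent
    val revisions = Seq(
      (101L, 0L,   "create item"),
      (102L, 101L, "add label"),
      (103L, 102L, "add sitelink")
    ).toDF("Rid", "pid", "comment")

    // rename the right-hand side so both sides of the self-join stay distinguishable,
    // just as DF_Second does with the "...2" column names
    val parents = revisions.toDF("Rid2", "pid2", "comment2")

    // the left outer join keeps first revisions whose pid matches no Rid2
    val withParent = revisions.as("cur")
      .join(parents.as("par"), $"cur.pid" === $"par.Rid2", "leftouter")

    withParent.select("Rid", "comment", "Rid2", "comment2").show()
    spark.stop()
  }
}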
- //partitioned.foreach(println) - // - // //=====================================================All Features Based on Categories of Features Data Type :================================================================================== - // - val Result_all_Features = partitioned.map { case (x, y) => (x.toString() + "," + All_Features(y).toString()) } // we convert the Pair RDD to String one LineRDD to be able to make DF based on "," - //Result_all_Features.foreach(println) - // println("nayef" + Result_all_Features.count()) - - // Conver the RDD of All Features to DataFrame: - - val schema = StructType( - - //0 - StructField("Rid", IntegerType, false) :: - - // Character Features : - /* 1*/ StructField("C1uppercaseratio", DoubleType, false) :: /*2 */ StructField("C2lowercaseratio", DoubleType, false) :: /*3*/ StructField("C3alphanumericratio", DoubleType, false) :: - /*4*/ StructField("C4asciiratio", DoubleType, false) :: /*5*/ StructField("C5bracketratio", DoubleType, false) :: /*6*/ StructField("C6digitalratio", DoubleType, false) :: - /*7*/ StructField("C7latinratio", DoubleType, false) :: /*8*/ StructField("C8whitespaceratio", DoubleType, false) :: /* 9*/ StructField("C9puncratio", DoubleType, false) :: - /*10*/ StructField("C10longcharacterseq", DoubleType, false) :: /*11*/ StructField("C11arabicratio", DoubleType, false) :: /*12*/ StructField("C12bengaliratio", DoubleType, false) :: - /*13 */ StructField("C13brahmiratio", DoubleType, false) :: /*14*/ StructField("C14cyrilinratio", DoubleType, false) :: /*15*/ StructField("C15hanratio", DoubleType, false) :: - /*16*/ StructField("c16malysiaratio", DoubleType, false) :: /*17*/ StructField("C17tamiratio", DoubleType, false) :: /*18*/ StructField("C18telugratio", DoubleType, false) :: - /*19 */ StructField("C19symbolratio", DoubleType, false) :: /*20 */ StructField("C20alpharatio", DoubleType, false) :: /*21*/ StructField("C21visibleratio", DoubleType, false) :: - /*22*/ StructField("C22printableratio", DoubleType, false) :: /*23*/ StructField("C23blankratio", DoubleType, false) :: /*24 */ StructField("C24controlratio", DoubleType, false) :: - /* 25 */ StructField("C25hexaratio", DoubleType, false) :: - - //word Features: - /*26*/ StructField("W1languagewordratio", DoubleType, false) :: /*27 Boolean */ StructField("W2Iscontainlanguageword", DoubleType, false) :: /*28*/ StructField("W3lowercaseratio", DoubleType, false) :: - /*29 Integer */ StructField("W4longestword", IntegerType, false) :: /*30 Boolean */ StructField("W5IscontainURL", DoubleType, false) :: /*31*/ StructField("W6badwordratio", DoubleType, false) :: - /*32*/ StructField("W7uppercaseratio", DoubleType, false) :: /*33*/ StructField("W8banwordratio", DoubleType, false) :: /*34 Boolean */ StructField("W9FemalFirstName", DoubleType, false) :: - /*35 Boolean */ StructField("W10MaleFirstName", DoubleType, false) :: /*36 Boolean */ StructField("W11IscontainBadword", DoubleType, false) :: /*37 Boolean*/ StructField("W12IsContainBanword", DoubleType, false) :: - /*38 integer */ StructField("W13NumberSharewords", DoubleType, false) :: /*39 Integer */ StructField("W14NumberSharewordswithoutStopwords", DoubleType, false) :: - /*40*/ StructField("W15PortionQid", DoubleType, false) :: /*41*/ StructField("W16PortionLnags", DoubleType, false) :: /*42*/ StructField("W17PortionLinks", DoubleType, false) :: - - // - // // Sentences Features: - /*43*/ StructField("S1CommentTailLength", DoubleType, false) :: /*44*/ StructField("S2SimikaritySitelinkandLabel", DoubleType, false) :: /*45*/ 
StructField("S3SimilarityLabelandSitelink", DoubleType, false) :: /*46*/ StructField("S4SimilarityCommentComment", DoubleType, false) :: - // - // // Statements Features : - /*47*/ StructField("SS1Property", StringType, false) :: /*48*/ StructField("SS2DataValue", StringType, false) :: /*49*/ StructField("SS3ItemValue", StringType, false) :: - // - // - // //User Features : - /*50 Boolean*/ StructField("U1IsPrivileged", DoubleType, false) :: /*51 Boolean*/ StructField("U2IsBotUser", DoubleType, false) :: /*52 Boolean*/ StructField("U3IsBotuserWithFlaguser", DoubleType, false) :: - /*53 Boolean*/ StructField("U4IsProperty", DoubleType, false) :: /*54 Boolean*/ StructField("U5IsTranslator", DoubleType, false) :: /*55 Boolean*/ StructField("U6IsRegister", DoubleType, false) :: - /*56*/ StructField("U7IPValue", DoubleType, false) :: /*57*/ StructField("U8UserID", IntegerType, false) :: /*58*/ StructField("U9HasBirthDate", DoubleType, false) :: /*59*/ StructField("U10HasDeathDate", DoubleType, false) :: - - //Items Features : - - /*60*/ StructField("I1NumberLabels", DoubleType, false) :: /*61*/ StructField("I2NumberDescription", DoubleType, false) :: /*62*/ StructField("I3NumberAliases", DoubleType, false) :: /*63*/ StructField("I4NumberClaims", DoubleType, false) :: - /*64*/ StructField("I5NumberSitelinks", DoubleType, false) :: /*65*/ StructField("I6NumberStatement", DoubleType, false) :: /*66*/ StructField("I7NumberReferences", DoubleType, false) :: /*67*/ StructField("I8NumberQualifier", DoubleType, false) :: - /*68*/ StructField("I9NumberQualifierOrder", DoubleType, false) :: /*69*/ StructField("I10NumberBadges", DoubleType, false) :: /*70*/ StructField("I11ItemTitle", StringType, false) :: - - // Revision Features: - /*71*/ StructField("R1languageRevision", StringType, false) :: /*72*/ StructField("R2RevisionLanguageLocal", StringType, false) :: /*73*/ StructField("R3IslatainLanguage", DoubleType, false) :: - /*74*/ StructField("R4JsonLength", DoubleType, false) :: /*75*/ StructField("R5RevisionAction", StringType, false) :: /*76*/ StructField("R6PrevReviAction", StringType, false) :: - /*77*/ StructField("R7RevisionAccountChange", DoubleType, false) :: /*78*/ StructField("R8ParRevision", StringType, false) :: /*79*/ StructField("R9RevisionTime", StringType, false) :: - /*80*/ StructField("R10RevisionSize", DoubleType, false) :: /*81*/ StructField("R11ContentType", StringType, false) :: /*82*/ StructField("R12BytesIncrease", DoubleType, false) :: - /*83*/ StructField("R13TimeSinceLastRevi", DoubleType, false) :: /*84*/ StructField("R14CommentLength", DoubleType, false) :: /*85*/ StructField("R15RevisionSubaction", StringType, false) :: - /*86*/ StructField("R16PrevReviSubaction", StringType, false) :: - - Nil) - - val rowRDD = Result_all_Features.map(line => line.split(",")).map(e ⇒ Row(e(0).toInt // character feature column - , e(1).toDouble, e(2).toDouble, e(3).toDouble, e(4).toDouble, e(5).toDouble, e(6).toDouble, e(7).toDouble, e(8).toDouble, e(9).toDouble, RoundDouble(e(10).toDouble), - e(11).toDouble, e(12).toDouble, e(13).toDouble, e(14).toDouble, e(15).toDouble, e(16).toDouble, e(17).toDouble, e(18).toDouble, e(19).toDouble, e(20).toDouble, e(21).toDouble, e(22).toDouble, e(23).toDouble, e(24).toDouble, e(25).toDouble //Word Feature column - , e(26).toDouble, e(27).toDouble, e(28).toDouble, e(29).toDouble.toInt, e(30).toDouble, e(31).toDouble, e(32).toDouble, e(33).toDouble, e(34).toDouble, e(35).toDouble, e(36).toDouble, e(37).toDouble, RoundDouble(e(38).toDouble), 
RoundDouble(e(39).toDouble), e(40).toDouble, e(41).toDouble, e(42).toDouble // Sentences Features column: - , RoundDouble(e(43).toDouble), e(44).toDouble, e(45).toDouble, e(46).toDouble //Statement Features Column: - , e(47), e(48), e(49) // User Features Column: - , e(50).toDouble, e(51).toDouble, e(52).toDouble, e(53).toDouble, e(54).toDouble, e(55).toDouble, e(56).toDouble, e(57).toDouble.toInt, e(58).toDouble, e(59).toDouble //Item Features column: - , e(60).toDouble, e(61).toDouble, e(62).toDouble, e(63).toDouble, e(64).toDouble, e(65).toDouble, e(66).toDouble, e(67).toDouble, e(68).toDouble, e(69).toDouble, "Q" + e(70).toDouble.toInt.toString() //Revision Features Column: - , e(71), e(72), e(73).toDouble, e(74).toDouble, e(75), e(76), e(77).toDouble, e(78), e(79), e(80).toDouble, e(81), e(82).toDouble, e(83).toDouble, e(84).toDouble, e(85), e(86))) - - //a.User Frequency: - //number of revisions a user has contributed - //val resu= DF_Tags.groupBy("contributorID").agg(count("Rid")) - DF_Tags.registerTempTable("TagesTable") - val ContributorFreq_for_Each_Revision_DF = sqlContext.sql("select contributorID as CIDUSER1, count(Rid) as NumberofRevisionsUserContributed from TagesTable where contributorID !='0' group by contributorID ") //.drop("CIDUSER1") - //ContributorFreq_for_Each_Revision_DF.show() - - //b.Cumulated : Number of a unique Item a user has contributed. - val CumulatedNumberof_uniqueItemsForUser_DF = sqlContext.sql("select contributorID as CIDUSER2, COUNT(DISTINCT itemid) as NumberofUniqueItemsUseredit from TagesTable where contributorID !='0' group by contributorID") //.drop("CIDUSER2") - //CumulatedNumberof_uniqueItemsForUser_DF.show() - - //1.Item Frequency: - // number of revisions an Item has - val ItemFrequ_DF = sqlContext.sql("select itemid, count(Rid) as NumberRevisionItemHas from TagesTable group by itemid") - // ItemFrequ_DF.show() - - //2. Cumulate number of unique users have edited the Item : Did not consider the users IP. Contributor is an IP or Name. we consider name - val CumulatedNumberof_UniqueUserForItem_DF = sqlContext.sql("select itemid, COUNT(DISTINCT contributorID) as NumberUniqUserEditItem from TagesTable where contributorID !='0' group by itemid") - //CumulatedNumberof_UniqueUserForItem_DF.show() - - //3. 
freq each Item : - val Fre_Item_DF = sqlContext.sql("select itemid, COUNT(itemid) as FreqItem from TagesTable group by itemid") - // Fre_Item_DF.show() - - //***************************************************************************************************************************************** - // This is Main DataFrame: - val BeforeJoin_All_Features = sqlContext.createDataFrame(rowRDD, schema) - //BeforeJoin_All_Features.show() - - //********************************** User feature Join - - // Join1 for add The first User Feature : number of revisions a user has contributed - val AfterJoinUser1_All_Features = BeforeJoin_All_Features.as("T1").join(ContributorFreq_for_Each_Revision_DF.as("T2"), $"T1.U8UserID" === $"T2.CIDUSER1", "leftouter").drop("CIDUSER1") - //AfterJoinUser1_All_Features.show() - - // Join2 for add The second User Feature - val AfterJoinUser2_All_Features = AfterJoinUser1_All_Features.as("T1").join(CumulatedNumberof_uniqueItemsForUser_DF.as("T2"), $"T1.U8UserID" === $"T2.CIDUSER2", "leftouter").drop("CIDUSER2") - //AfterJoinUser2_All_Features.show() - - //********************************** Item Feature Join - // Join3 for add The First Item Feature :number of revisions an Item has - val AfterJoinItem3_All_Features = AfterJoinUser2_All_Features.as("T1").join(ItemFrequ_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") - // AfterJoinItem3_All_Features.show() - - // Join4 for add The Second Item Feature - val AfterJoinItem4_All_Features = AfterJoinItem3_All_Features.as("T1").join(CumulatedNumberof_UniqueUserForItem_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") - // AfterJoinItem4_All_Features.show() - - // Join5 for add The Third Item Feature - val AfterJoinItem5_All_Features = AfterJoinItem4_All_Features.as("T1").join(Fre_Item_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") - //2 AfterJoinItem5_All_Features.show() - - //******************************** - - //*Geografical information Feature from Meta File - //REVISION_ID|REVISION_SESSION_ID|USER_COUNTRY_CODE|USER_CONTINENT_CODE|USER_TIME_ZONE|USER_REGION_CODE|USER_CITY_NAME|USER_COUNTY_NAME|REVISION_TAGS - val df_GeoInf = sqlContext.read - .format("com.databricks.spark.csv") - .option("header", "true") // Use first line of all files as header - .option("inferSchema", "true") // Automatically infer data types - .load("hdfs://localhost:9000/mydata/Meta.csv").select("REVISION_ID", "REVISION_SESSION_ID", "USER_COUNTRY_CODE", "USER_CONTINENT_CODE", "USER_TIME_ZONE", "USER_REGION_CODE", "USER_CITY_NAME", "USER_COUNTY_NAME", "REVISION_TAGS") - // df_GeoInf.show() - - val df_Truth = sqlContext.read - .format("com.databricks.spark.csv") - .option("header", "true") // Use first line of all files as header - .option("inferSchema", "true") // Automatically infer data types - .load("hdfs://localhost:9000/mydata/truth.csv").select("REVISION_ID", "ROLLBACK_REVERTED", "UNDO_RESTORE_REVERTED") - // df_GeoInf.show() - - val AfterJoinGeoInfo_All_Features = AfterJoinItem5_All_Features.as("T1").join(df_GeoInf.as("T2"), $"T1.Rid" === $"T2.REVISION_ID", "leftouter").drop("REVISION_ID").cache() - // AfterJoinGeoInfo_All_Features.show() - - val Final_All_Features = AfterJoinGeoInfo_All_Features.as("T1").join(df_Truth.as("T2"), $"T1.Rid" === $"T2.REVISION_ID", "leftouter").drop("REVISION_ID").cache() - //Final_All_Features.show() - - // Pre- process Data 
============================================================================================================================================================ - - // For String Column, We fill the Null values by "NA": - - var Fill_Missing_Final_All_Features = Final_All_Features.na.fill("NA", Seq("USER_COUNTRY_CODE", "USER_CONTINENT_CODE", "USER_TIME_ZONE", "USER_REGION_CODE", "USER_CITY_NAME", "USER_COUNTY_NAME", "REVISION_TAGS")).cache() - - // For Integer Frequency Column, We fill the Null values by 0: - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.na.fill(0, Seq("FreqItem", "NumberUniqUserEditItem", "NumberRevisionItemHas", "NumberofUniqueItemsUseredit", "NumberofRevisionsUserContributed", "REVISION_SESSION_ID")).cache() - //Fill_Missing_Final_All_Features.show() - - val BoolToDoubleUDF = udf { (BoolAsString: String) => if (BoolAsString == "T") 1.0 else 0.0 } - val IntegerToDouble = udf { (IntegerRevisionSessionID: Integer) => IntegerRevisionSessionID.toDouble } - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalROLLBACK_REVERTED", BoolToDoubleUDF(col("ROLLBACK_REVERTED"))) - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalUNDO_RESTORE_REVERTED", BoolToDoubleUDF(col("UNDO_RESTORE_REVERTED"))) - - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalREVISION_SESSION_ID", IntegerToDouble(col("REVISION_SESSION_ID"))) - - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberofRevisionsUserContributed", IntegerToDouble(col("NumberofRevisionsUserContributed"))) - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberofUniqueItemsUseredit", IntegerToDouble(col("NumberofUniqueItemsUseredit"))) - - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberRevisionItemHas", IntegerToDouble(col("NumberRevisionItemHas"))) - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberUniqUserEditItem", IntegerToDouble(col("NumberUniqUserEditItem"))) - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalFreqItem", IntegerToDouble(col("FreqItem"))) - - //===========================================================================Caharacter Features : Double , Integer Features ==================================================================================== - //Double Ratio: For Ratio Double column, Fill -1 value by Median:Character Features + Ratio of Word Features : - var Samples = Fill_Missing_Final_All_Features.sample(false, 0.001).cache() //.where($"S2SimikaritySitelinkandLabel">0.0 || $"S3SimilarityLabelandSitelink">0.0 || $"S4SimilarityCommentComment">0.0) - Samples.registerTempTable("df") - - val Query = "select " + - "percentile_approx(C1uppercaseratio, 0.5) as meadian1" + "," + "percentile_approx(C2lowercaseratio, 0.5) as median2" + " ," + - "percentile_approx(C3alphanumericratio, 0.5) as median3" + "," + "percentile_approx(C4asciiratio, 0.5) as median4" + "," + - "percentile_approx(C5bracketratio, 0.5) as median5" + "," + "percentile_approx(C6digitalratio, 0.5) as median6" + "," + - "percentile_approx(C7latinratio, 0.5) as median7" + "," + "percentile_approx(C8whitespaceratio, 0.5) as median8" + "," + - "percentile_approx(C9puncratio, 0.5) as median9" + "," + "percentile_approx(C11arabicratio, 0.5) as median11" + "," + - "percentile_approx(C12bengaliratio, 0.5) as median12" + "," + 
"percentile_approx(C13brahmiratio, 0.5) as median13" + "," + - "percentile_approx(C14cyrilinratio, 0.5) as median14" + "," + "percentile_approx(C15hanratio, 0.5) as median15" + "," + - "percentile_approx(c16malysiaratio, 0.5) as median16" + "," + - "percentile_approx(C17tamiratio, 0.5) as median17" + "," + "percentile_approx(C18telugratio, 0.5) as median18" + "," + - "percentile_approx(C19symbolratio, 0.5) as median19" + "," + "percentile_approx(C20alpharatio, 0.5) as median20" + "," + - "percentile_approx(C21visibleratio, 0.5) as median21" + "," + "percentile_approx(C22printableratio, 0.5) as median22" + "," + - "percentile_approx(C23blankratio, 0.5) as median23" + "," + "percentile_approx(C24controlratio, 0.5) as median24" + "," + - "percentile_approx(C25hexaratio, 0.5) as median25" ++ "," + "percentile_approx(W1languagewordratio, 0.5) as median26" + "," + - "percentile_approx(W3lowercaseratio, 0.5) as median27" + "," + "percentile_approx(W6badwordratio, 0.5) as median28" + "," + - "percentile_approx(W7uppercaseratio, 0.5) as median27" + "," + "percentile_approx(W8banwordratio, 0.5) as median27" + " from df" - - val medianValues = sqlContext.sql(Query).rdd - val Median = medianValues.first() - - // Median : - // Character Ratio Features: UDF - val lkpUDF1 = udf { (i: Double) => if (i == 0) Median(0).toString().toDouble else i } - val lkpUDF2 = udf { (i: Double) => if (i == 0) Median(1).toString().toDouble else i } - val lkpUDF3 = udf { (i: Double) => if (i == 0) Median(2).toString().toDouble else i } - val lkpUDF4 = udf { (i: Double) => if (i == 0) Median(3).toString().toDouble else i } - val lkpUDF5 = udf { (i: Double) => if (i == 0) Median(4).toString().toDouble else i } - val lkpUDF6 = udf { (i: Double) => if (i == 0) Median(5).toString().toDouble else i } - val lkpUDF7 = udf { (i: Double) => if (i == 0) Median(6).toString().toDouble else i } - val lkpUDF8 = udf { (i: Double) => if (i == 0) Median(7).toString().toDouble else i } - val lkpUDF9 = udf { (i: Double) => if (i == 0) Median(8).toString().toDouble else i } - - val lkpUDF11 = udf { (i: Double) => if (i == 0) Median(9).toString().toDouble else i } - val lkpUDF12 = udf { (i: Double) => if (i == 0) Median(10).toString().toDouble else i } - val lkpUDF13 = udf { (i: Double) => if (i == 0) Median(11).toString().toDouble else i } - val lkpUDF14 = udf { (i: Double) => if (i == 0) Median(12).toString().toDouble else i } - val lkpUDF15 = udf { (i: Double) => if (i == 0) Median(13).toString().toDouble else i } - val lkpUDF16 = udf { (i: Double) => if (i == 0) Median(14).toString().toDouble else i } - val lkpUDF17 = udf { (i: Double) => if (i == 0) Median(15).toString().toDouble else i } - val lkpUDF18 = udf { (i: Double) => if (i == 0) Median(16).toString().toDouble else i } - val lkpUDF19 = udf { (i: Double) => if (i == 0) Median(17).toString().toDouble else i } - val lkpUDF20 = udf { (i: Double) => if (i == 0) Median(18).toString().toDouble else i } - val lkpUDF21 = udf { (i: Double) => if (i == 0) Median(19).toString().toDouble else i } - val lkpUDF22 = udf { (i: Double) => if (i == 0) Median(20).toString().toDouble else i } - val lkpUDF23 = udf { (i: Double) => if (i == 0) Median(21).toString().toDouble else i } - val lkpUDF24 = udf { (i: Double) => if (i == 0) Median(22).toString().toDouble else i } - val lkpUDF25 = udf { (i: Double) => if (i == 0) Median(23).toString().toDouble else i } - - val df1 = Fill_Missing_Final_All_Features.withColumn("FinalC1uppercaseratio", lkpUDF1(col("C1uppercaseratio"))) 
//.drop("C1uppercaseratio").cache() - val df2 = df1.withColumn("FinalC2lowercaseratio", lkpUDF2(col("C2lowercaseratio"))) //.drop("C2lowercaseratio").cache() - //df1.unpersist() - val df3 = df2.withColumn("FinalC3alphanumericratio", lkpUDF3(col("C3alphanumericratio"))) //.drop("C3alphanumericratio").cache() - //df2.unpersist() - val df4 = df3.withColumn("FinalC4asciiratio", lkpUDF4(col("C4asciiratio"))) //.drop("C4asciiratio").cache() - //df3.unpersist() - val df5 = df4.withColumn("FinalC5bracketratio", lkpUDF5(col("C5bracketratio"))) //.drop("C5bracketratio").cache() - //df4.unpersist() - val df6 = df5.withColumn("FinalC6digitalratio", lkpUDF6(col("C6digitalratio"))) //.drop("C6digitalratio").cache() - //df5.unpersist() - val df7 = df6.withColumn("FinalC7latinratio", lkpUDF7(col("C7latinratio"))) //.drop("C7latinratio").cache() - //df6.unpersist() - val df8 = df7.withColumn("FinalC8whitespaceratio", lkpUDF8(col("C8whitespaceratio"))) //.drop("C8whitespaceratio").cache() - //df7.unpersist() - val df9 = df8.withColumn("FinalC9puncratio", lkpUDF9(col("C9puncratio"))) //.drop("C9puncratio").cache() - - // Mean : - // character integer values : - val Mean_C10longcharacterseq = Samples.agg(mean("C10longcharacterseq")).head() - val C10_Mean = Mean_C10longcharacterseq.getDouble(0) - val lkpUDFC10 = udf { (i: Double) => if (i == 0) C10_Mean else i } - val df10 = df9.withColumn("FinalC10longcharacterseq", lkpUDFC10(col("C10longcharacterseq"))) - - //Median - val df11 = df10.withColumn("FinalC11arabicratio", lkpUDF11(col("C11arabicratio"))) //.drop("C11arabicratio").cache() - // df9.unpersist() - val df12 = df11.withColumn("FinalC12bengaliratio", lkpUDF12(col("C12bengaliratio"))) //.drop("C12bengaliratio").cache() - //df11.unpersist() - val df13 = df12.withColumn("FinalC13brahmiratio", lkpUDF13(col("C13brahmiratio"))) //.drop("C13brahmiratio").cache() - // df12.unpersist() - val df14 = df13.withColumn("FinalC14cyrilinratio", lkpUDF14(col("C14cyrilinratio"))) //.drop("C14cyrilinratio").cache() - // df13.unpersist() - val df15 = df14.withColumn("FinalC15hanratio", lkpUDF15(col("C15hanratio"))) //.drop("C15hanratio").cache() - // df14.unpersist() - val df16 = df15.withColumn("Finalc16malysiaratio", lkpUDF16(col("c16malysiaratio"))) //.drop("c16malysiaratio").cache() - //df15.unpersist() - val df17 = df16.withColumn("FinalC17tamiratio", lkpUDF17(col("C17tamiratio"))) //.drop("C17tamiratio").cache() - //df16.unpersist() - val df18 = df17.withColumn("FinalC18telugratio", lkpUDF18(col("C18telugratio"))) //.drop("C18telugratio").cache() - //df17.unpersist() - val df19 = df18.withColumn("FinalC19symbolratio", lkpUDF19(col("C19symbolratio"))) //.drop("C19symbolratio").cache() - //df18.unpersist() - val df20 = df19.withColumn("FinalC20alpharatio", lkpUDF20(col("C20alpharatio"))) //.drop("C20alpharatio").cache() - // df19.unpersist() - val df21 = df20.withColumn("FinalC21visibleratio", lkpUDF21(col("C21visibleratio"))) //.drop("C21visibleratio").cache() - // df20.unpersist() - val df22 = df21.withColumn("FinalC22printableratio", lkpUDF22(col("C22printableratio"))) //.drop("C22printableratio").cache() - //df21.unpersist() - val df23 = df22.withColumn("FinalC23blankratio", lkpUDF23(col("C23blankratio"))) //.drop("C23blankratio").cache() - // df22.unpersist() - val df24 = df23.withColumn("FinalC24controlratio", lkpUDF24(col("C24controlratio"))) //.drop("C24controlratio").cache() - //df23.unpersist() - val df25 = df24.withColumn("FinalC25hexaratio", lkpUDF25(col("C25hexaratio"))) //.drop("C25hexaratio").cache() - - 
//************************************************End Character Features **************************************************************************************** - - //************************************************Start Word Features **************************************************************************************** - - // Word Ratio Features : UDF - val lkpUDFW1 = udf { (i: Double) => if (i == 0) Median(24).toString().toDouble else i } - val lkpUDFW3 = udf { (i: Double) => if (i == 0) Median(25).toString().toDouble else i } - val lkpUDFW6 = udf { (i: Double) => if (i == 0) Median(26).toString().toDouble else i } - val lkpUDFW7 = udf { (i: Double) => if (i == 0) Median(27).toString().toDouble else i } - val lkpUDFW8 = udf { (i: Double) => if (i == 0) Median(28).toString().toDouble else i } - - //1. - val df26 = df25.withColumn("FinalW1languagewordratio", lkpUDFW1(col("W1languagewordratio"))) //.drop("W1languagewordratio").cache() - - //2.Boolean(Double) IsContainLanguageWord - - //3. - val df27 = df26.withColumn("FinalW3lowercaseratio", lkpUDFW3(col("W3lowercaseratio"))) //.drop("W3lowercaseratio").cache() - // df26.unpersist() - - //4. Integer " Mean: - val Mean_W4longestword = Samples.agg(mean("W4longestword")).head() - val W4_Mean = Mean_W4longestword.getDouble(0) - val lkpUDFW4 = udf { (i: Double) => if (i == 0) W4_Mean else i } - val df28 = df27.withColumn("FinalW4longestword", lkpUDFW4(col("W4longestword"))) - - //5. Boolean (Double ) W5IscontainURL - //6. - val df29 = df28.withColumn("FinalW6badwordratio", lkpUDFW6(col("W6badwordratio"))) //.drop("W6badwordratio").cache() - - //7. - val df30 = df29.withColumn("FinalW7uppercaseratio", lkpUDFW7(col("W7uppercaseratio"))) //.drop("W7uppercaseratio").cache() - - //8. - val df31 = df30.withColumn("FinalW8banwordratio", lkpUDFW8(col("W8banwordratio"))) //.drop("W8banwordratio").cache() - - //9.FemalFirst Boolean(Double) - //10.Male First Boolean(Double) - //11.ContainBadWord Boolean(Double) - //12ContainBanWord Boolean(Double) - - //13. Integer(Double): - val Mean_W13W13NumberSharewords = Samples.agg(mean("W13NumberSharewords")).head() - val W13_Mean = Mean_W13W13NumberSharewords.getDouble(0) - val lkpUDFW13 = udf { (i: Double) => if (i == 0) W13_Mean else i } - val df32 = df31.withColumn("FinalW13NumberSharewords", lkpUDFW13(col("W13NumberSharewords"))) - - //14. Integer (Double): - val Mean_W14NumberSharewordswithoutStopwords = Samples.agg(mean("W14NumberSharewordswithoutStopwords")).head() - val W14_Mean = Mean_W14NumberSharewordswithoutStopwords.getDouble(0) - val lkpUDFW14 = udf { (i: Double) => if (i == 0) W14_Mean else i } - val df33 = df32.withColumn("FinalW14NumberSharewordswithoutStopwords", lkpUDFW14(col("W14NumberSharewordswithoutStopwords"))) - - // 15. Double (Not ratio): - val Mean_W15PortionQid = Samples.agg(mean("W15PortionQid")).head() - val W15_Mean = Mean_W15PortionQid.getDouble(0) - val lkpUDFW15 = udf { (i: Double) => if (i == 0) W15_Mean else i } - val df34 = df33.withColumn("FinalW15PortionQid", lkpUDFW15(col("W15PortionQid"))) - - //16. 
Double(Not Ratio): - val Mean_W16PortionLnags = Samples.agg(mean("W16PortionLnags")).head() - val W16_Mean = Mean_W16PortionLnags.getDouble(0) - val lkpUDFW16 = udf { (i: Double) => if (i == 0) W16_Mean else i } - val df35 = df34.withColumn("FinalW16PortionLnags", lkpUDFW16(col("W16PortionLnags"))) - - //17.Double(Not ratio): - val Mean_W17PortionLinks = Samples.agg(mean("W17PortionLinks")).head() - val W17_Mean = Mean_W17PortionLinks.getDouble(0) - val lkpUDFW17 = udf { (i: Double) => if (i == 0) W17_Mean else i } - val df36 = df35.withColumn("FinalW17PortionLinks", lkpUDFW17(col("W17PortionLinks"))) - - //************************************************End Word Features **************************************************************************************** - - //************************************************Start Sentences Features **************************************************************************************** - // 1. Integer(Double) - val Mean_S1CommentTailLength = Samples.agg(mean("S1CommentTailLength")).head() - val S1_Mean = RoundDouble(Mean_S1CommentTailLength.getDouble(0)) - val lkpUDFS1 = udf { (i: Double) => if (i == 0) S1_Mean else i } - val df37 = df36.withColumn("FinalS1CommentTailLength", lkpUDFS1(col("S1CommentTailLength"))) - - //2. Double but Not ratio values : - val Mean_S2SimikaritySitelinkandLabel = Samples.agg(mean("S2SimikaritySitelinkandLabel")).head() - val S2_Mean = RoundDouble(Mean_S2SimikaritySitelinkandLabel.getDouble(0)) - val lkpUDFS2 = udf { (i: Double) => if (i == 0) S2_Mean else i } - val df39 = df37.withColumn("FinalS2SimikaritySitelinkandLabel", lkpUDFS2(col("S2SimikaritySitelinkandLabel"))) - - //3. Double but Not ratio values : - val Mean_S3SimilarityLabelandSitelink = Samples.agg(mean("S3SimilarityLabelandSitelink")).head() - val S3_Mean = RoundDouble(Mean_S3SimilarityLabelandSitelink.getDouble(0)) - val lkpUDFS3 = udf { (i: Double) => if (i == 0.0) S3_Mean else i } - val df40 = df39.withColumn("FinalS3SimilarityLabelandSitelink", lkpUDFS3(col("S3SimilarityLabelandSitelink"))) - - //4. Double but Not ratio values : - val Mean_S4SimilarityCommentComment = Samples.agg(mean("S4SimilarityCommentComment")).head() - val S4_Mean = RoundDouble(Mean_S4SimilarityCommentComment.getDouble(0)) - val lkpUDFS4 = udf { (i: Double) => if (i == 0.0) S4_Mean else i } - val df41 = df40.withColumn("FinalS4SimilarityCommentComment", lkpUDFS4(col("S4SimilarityCommentComment"))) - - //df41.show() - //************************************************End Sentences Features **************************************************************************************** - //*********************************************** Start Statement Features **************************************************************************************** - //1. String - //2. String - //3. String - //************************************************End Statement Features **************************************************************************************** - //*********************************************** Start User Features **************************************************************************************** - - //1.Boolean(Double) - //2.Boolean(Double) - //3.Boolean(Double) - //4.Boolean(Double) - //5.Boolean(Double) - //6.Boolean(Double) - //7. (Double) IP No need to fill Missing Data - //8. 
(Double) ID No need to fill Missing Data - //9.Boolean(Double) - //10.Boolean(Double) - - //*********************************************** End User Features **************************************************************************************** - //*********************************************** Start Item Features **************************************************************************************** - //1. Integer (Double) No need to fill missing values - //2. Integer (Double) No need to fill missing values - //3. Integer (Double) No need to fill missing values - //4. Integer (Double) No need to fill missing values - //5. Integer (Double) No need to fill missing values - //6. Integer (Double) No need to fill missing values - //7. Integer (Double) No need to fill missing values - //8. Integer (Double) No need to fill missing values - //9. Integer (Double) No need to fill missing values - //10. Integer (Double) No need to fill missing values - //11. String - //*********************************************** End Item Features **************************************************************************************** - //*********************************************** Start Revision Features **************************************************************************************** - //1.String - //2.String - //3.Boolean (Double) - //4.Integer(Double) - //5.String - //6.String - //7. Boolean(Double) - //8. String - //9.String - //10. Integer (Double) - //11.String - //12. integer(Double) - //13. Long(Double) - //14. integer (Double) - //15.String - //16.String - //*********************************************** End Revision Features **************************************************************************************** - //*********************************************** Meta Data , Truth Data and Frequnces **************************************************************************************** - //Meta - // 1.Revision Session :Integer (Converted to Double) - //2. 
User Country Code - //3.User Continent Code - //4.User Time Size - //5.User Region Code - //6.User-city Name - //7.User Country Name - //8.RevisionTags - + // Streaming records: + val jobConf = new JobConf() + val NormalXML_Parser_OBJ = new ParseNormalXML() + val RDD_OBJ = new ParseNormalXML() + + val Training_RDD_All_Record1 = RDD_OBJ.Training_DB_NormalXML_Parser_Input1(sc) + val Training_RDD_All_Record2 = RDD_OBJ.Training_DB_NormalXML_Parser_Input2(sc) + val Training_RDD_All_Record3 = RDD_OBJ.Training_DB_NormalXML_Parser_Input3(sc) + // RDD_All_Record1.foreach(println) + // RDD_All_Record2.foreach(println) + // RDD_All_Record3.foreach(println) + + val Training_RDD_All_Record = Training_RDD_All_Record1.union(Training_RDD_All_Record2).union(Training_RDD_All_Record3).distinct().cache() + + // println(RDD_All_Record.count()) + println(Training_RDD_All_Record.count()) + + // ======= Json part : + // Json RDD : Each record has its Revision iD: + val JsonRDD = Training_RDD_All_Record.map(_.split("NNLL")).map(v => replacing_with_Quoto(v(0), v(8))).cache() + // JsonRDD.foreach(println) + // println(JsonRDD.count()) + + // Data set + val Ds_Json = sqlContext.jsonRDD(JsonRDD).select("key", "id", "labels", "descriptions", "aliases", "claims", "sitelinks").cache() + // Ds_Json.show() + // println(Ds_Json.count()) + + // ======= Tags part : // Contributor IP here is in Decimal format not IP format and It is converted in ParseNormalXml stage + val TagsRDD = Training_RDD_All_Record.map(_.split("NNLL")).map(x => (x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9), x(10), x(11))).cache() + val DF_Tags = TagsRDD.toDF("Rid", "Itemid", "comment", "pid", "time", "contributorIP", + "contributorID", "contributorName", "JsonText", "model", "format", "sha").cache() + // DF_Tags.show() + // println(DF_Tags.count()) + + // ======== Join Json part with Tag Part:============================ + // Joining to have full data + val DF_First_DF_Result_Join_Tags_and_Json = DF_Tags.as("T1").join(Ds_Json.as("T2"), $"T1.Rid" === $"T2.key", "leftouter") + .select("Rid", "itemid", "comment", "pid", "time", "contributorIP", + "contributorID", "contributorName", "JsonText", "labels", "descriptions", + "aliases", "claims", "sitelinks", "model", "format", "sha") // .orderBy("Rid", "Itemid") + DF_First_DF_Result_Join_Tags_and_Json.registerTempTable("Data1") + val dfr_DATA_JsonTages1 = sqlContext.sql("select * from Data1 order by itemid ,Rid ").cache() + + val colNames = Seq("Rid2", "itemid2", "comment2", "pid2", "time2", "contributorIP2", + "contributorID2", "contributorName2", "JsonText2", "labels2", "descriptions2", + "aliases2", "claims2", "sitelinks2", "model2", "format2", "sha2") + val DF_Second = DF_First_DF_Result_Join_Tags_and_Json.toDF(colNames: _*) // .distinct() + DF_Second.registerTempTable("Data2") + + // ===================================================================Parent // Previous Revision============================================================================================================== + + // Joining based on Parent Id to get the previous cases: ParentID + val DF_Joined = DF_First_DF_Result_Join_Tags_and_Json.as("df1").join(DF_Second.as("df2"), $"df1.pid" === $"df2.Rid2", "leftouter").distinct() + + val RDD_After_JoinDF = DF_Joined.rdd.distinct() + val x = RDD_After_JoinDF.map(row => (row(0).toString().toInt, row)).cache() + val part = new RangePartitioner(4, x) + val partitioned = x.partitionBy(part).persist() // persist is important for this case and obligatory. 
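The keyed pairing and RangePartitioner step above can be reduced to the sketch below (toy key/value pairs and a local master, not the production configuration); it also spells out why the persist() on the line above matters: the partitioner samples the RDD to choose key ranges, so without caching the upstream lineage would be recomputed for the sampling pass and again for the shuffle.

// Toy version of keying rows by revision id and range-partitioning them.
import org.apache.spark.{RangePartitioner, SparkConf, SparkContext}

object RangePartitionSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("range-partition"))

    // key each record by its revision id, as done with row(0).toString.toInt above
    val byRid = sc.parallelize(Seq(104 -> "rev-104", 101 -> "rev-101", 103 -> "rev-103", 102 -> "rev-102")).cache()

    // RangePartitioner samples the RDD to pick key ranges; the RDD is evaluated
    // here and again by partitionBy, so caching it avoids recomputation
    val partitioner = new RangePartitioner(2, byRid)
    val partitioned = byRid.partitionBy(partitioner).persist()

    // keys now land in contiguous ranges per partition
    partitioned.mapPartitionsWithIndex { (idx, it) => it.map(kv => s"partition $idx -> $kv") }
      .collect().foreach(println)

    sc.stop()
  }
}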
+ // partitioned.foreach(println) + // + // //=====================================================All Features Based on Categories of Features Data Type :================================================================================== + // + val Result_all_Features = partitioned.map { case (x, y) => (x.toString() + "," + All_Features(y).toString()) } // we convert the Pair RDD to String one LineRDD to be able to make DF based on "," + // Result_all_Features.foreach(println) + // println("nayef" + Result_all_Features.count()) + + // Conver the RDD of All Features to DataFrame: + + val schema = StructType( + + // 0 + StructField("Rid", IntegerType, false) :: + + // Character Features : + /* 1 */ StructField("C1uppercaseratio", DoubleType, false) :: /* 2 */ StructField("C2lowercaseratio", DoubleType, false) :: /* 3 */ StructField("C3alphanumericratio", DoubleType, false) :: + /* 4 */ StructField("C4asciiratio", DoubleType, false) :: /* 5 */ StructField("C5bracketratio", DoubleType, false) :: /* 6 */ StructField("C6digitalratio", DoubleType, false) :: + /* 7 */ StructField("C7latinratio", DoubleType, false) :: /* 8 */ StructField("C8whitespaceratio", DoubleType, false) :: /* 9 */ StructField("C9puncratio", DoubleType, false) :: + /* 10 */ StructField("C10longcharacterseq", DoubleType, false) :: /* 11 */ StructField("C11arabicratio", DoubleType, false) :: /* 12 */ StructField("C12bengaliratio", DoubleType, false) :: + /* 13 */ StructField("C13brahmiratio", DoubleType, false) :: /* 14 */ StructField("C14cyrilinratio", DoubleType, false) :: /* 15 */ StructField("C15hanratio", DoubleType, false) :: + /* 16 */ StructField("c16malysiaratio", DoubleType, false) :: /* 17 */ StructField("C17tamiratio", DoubleType, false) :: /* 18 */ StructField("C18telugratio", DoubleType, false) :: + /* 19 */ StructField("C19symbolratio", DoubleType, false) :: /* 20 */ StructField("C20alpharatio", DoubleType, false) :: /* 21 */ StructField("C21visibleratio", DoubleType, false) :: + /* 22 */ StructField("C22printableratio", DoubleType, false) :: /* 23 */ StructField("C23blankratio", DoubleType, false) :: /* 24 */ StructField("C24controlratio", DoubleType, false) :: + /* 25 */ StructField("C25hexaratio", DoubleType, false) :: + + // word Features: + /* 26 */ StructField("W1languagewordratio", DoubleType, false) :: /* 27 Boolean */ StructField("W2Iscontainlanguageword", DoubleType, false) :: /* 28 */ StructField("W3lowercaseratio", DoubleType, false) :: + /* 29 Integer */ StructField("W4longestword", IntegerType, false) :: /* 30 Boolean */ StructField("W5IscontainURL", DoubleType, false) :: /* 31 */ StructField("W6badwordratio", DoubleType, false) :: + /* 32 */ StructField("W7uppercaseratio", DoubleType, false) :: /* 33 */ StructField("W8banwordratio", DoubleType, false) :: /* 34 Boolean */ StructField("W9FemalFirstName", DoubleType, false) :: + /* 35 Boolean */ StructField("W10MaleFirstName", DoubleType, false) :: /* 36 Boolean */ StructField("W11IscontainBadword", DoubleType, false) :: + /* 37 Boolean */ StructField("W12IsContainBanword", DoubleType, false) :: + /* 38 integer */ StructField("W13NumberSharewords", DoubleType, false) :: /* 39 Integer */ StructField("W14NumberSharewordswithoutStopwords", DoubleType, false) :: + /* 40 */ StructField("W15PortionQid", DoubleType, false) :: /* 41 */ StructField("W16PortionLnags", DoubleType, false) :: /* 42 */ StructField("W17PortionLinks", DoubleType, false) :: + + // + // // Sentences Features: + /* 43 */ StructField("S1CommentTailLength", DoubleType, false) :: /* 44 */ 
StructField("S2SimikaritySitelinkandLabel", DoubleType, false) :: + /* 45 */ StructField("S3SimilarityLabelandSitelink", DoubleType, false) :: /* 46 */ StructField("S4SimilarityCommentComment", DoubleType, false) :: + // + // // Statements Features : + /* 47 */ StructField("SS1Property", StringType, false) :: /* 48 */ StructField("SS2DataValue", StringType, false) :: /* 49 */ StructField("SS3ItemValue", StringType, false) :: + // + // + // // User Features : + /* 50 Boolean */ StructField("U1IsPrivileged", DoubleType, false) :: /* 51 Boolean */ StructField("U2IsBotUser", DoubleType, false) :: /* 52 Boolean */ StructField("U3IsBotuserWithFlaguser", DoubleType, false) :: + /* 53 Boolean */ StructField("U4IsProperty", DoubleType, false) :: /* 54 Boolean */ StructField("U5IsTranslator", DoubleType, false) :: /* 55 Boolean */ StructField("U6IsRegister", DoubleType, false) :: + /* 56 */ StructField("U7IPValue", DoubleType, false) :: /* 57 */ StructField("U8UserID", IntegerType, false) :: /* 58 */ StructField("U9HasBirthDate", DoubleType, false) :: + /* 59 */ StructField("U10HasDeathDate", DoubleType, false) :: + + // Items Features : + + /* 60 */ StructField("I1NumberLabels", DoubleType, false) :: /* 61 */ StructField("I2NumberDescription", DoubleType, false) :: /* 62 */ StructField("I3NumberAliases", DoubleType, false) :: + /* 63 */ StructField("I4NumberClaims", DoubleType, false) :: + /* 64 */ StructField("I5NumberSitelinks", DoubleType, false) :: /* 65 */ StructField("I6NumberStatement", DoubleType, false) :: /* 66 */ StructField("I7NumberReferences", DoubleType, false) :: + /* 67 */ StructField("I8NumberQualifier", DoubleType, false) :: + /* 68 */ StructField("I9NumberQualifierOrder", DoubleType, false) :: /* 69 */ StructField("I10NumberBadges", DoubleType, false) :: /* 70 */ StructField("I11ItemTitle", StringType, false) :: + + // Revision Features: + /* 71 */ StructField("R1languageRevision", StringType, false) :: /* 72 */ StructField("R2RevisionLanguageLocal", StringType, false) :: /* 73 */ StructField("R3IslatainLanguage", DoubleType, false) :: + /* 74 */ StructField("R4JsonLength", DoubleType, false) :: /* 75 */ StructField("R5RevisionAction", StringType, false) :: /* 76 */ StructField("R6PrevReviAction", StringType, false) :: + /* 77 */ StructField("R7RevisionAccountChange", DoubleType, false) :: /* 78 */ StructField("R8ParRevision", StringType, false) :: /* 79 */ StructField("R9RevisionTime", StringType, false) :: + /* 80 */ StructField("R10RevisionSize", DoubleType, false) :: /* 81 */ StructField("R11ContentType", StringType, false) :: /* 82 */ StructField("R12BytesIncrease", DoubleType, false) :: + /* 83 */ StructField("R13TimeSinceLastRevi", DoubleType, false) :: /* 84 */ StructField("R14CommentLength", DoubleType, false) :: /* 85 */ StructField("R15RevisionSubaction", StringType, false) :: + /* 86 */ StructField("R16PrevReviSubaction", StringType, false) :: + + Nil) + + val rowRDD = Result_all_Features.map(line => line.split(",")).map(e ⇒ Row(e(0).toInt // character feature column + , e(1).toDouble, e(2).toDouble, e(3).toDouble, e(4).toDouble, e(5).toDouble, e(6).toDouble, e(7).toDouble, e(8).toDouble, e(9).toDouble, RoundDouble(e(10).toDouble), e(11).toDouble, e(12).toDouble // + , e(13).toDouble, e(14).toDouble, e(15).toDouble, e(16).toDouble, e(17).toDouble, e(18).toDouble, e(19).toDouble, e(20).toDouble, e(21).toDouble, e(22).toDouble // + , e(23).toDouble, e(24).toDouble, e(25).toDouble // Word Feature column + , e(26).toDouble, e(27).toDouble, e(28).toDouble, 
e(29).toDouble.toInt, e(30).toDouble, e(31).toDouble, e(32).toDouble, e(33).toDouble, e(34).toDouble, e(35).toDouble, e(36).toDouble, e(37).toDouble // + , RoundDouble(e(38).toDouble), RoundDouble(e(39).toDouble), e(40).toDouble, e(41).toDouble, e(42).toDouble // Sentences Features column: + , RoundDouble(e(43).toDouble), e(44).toDouble, e(45).toDouble, e(46).toDouble // Statement Features Column: + , e(47), e(48), e(49) // User Features Column: + , e(50).toDouble, e(51).toDouble, e(52).toDouble, e(53).toDouble, e(54).toDouble, e(55).toDouble, e(56).toDouble, e(57).toDouble.toInt, e(58).toDouble, e(59).toDouble // Item Features column: + , e(60).toDouble, e(61).toDouble, e(62).toDouble, e(63).toDouble, e(64).toDouble, e(65).toDouble, e(66).toDouble, e(67).toDouble, e(68).toDouble // + , e(69).toDouble, "Q" + e(70).toDouble.toInt.toString() // Revision Features Column: + , e(71), e(72), e(73).toDouble, e(74).toDouble, e(75), e(76), e(77).toDouble, e(78), e(79), e(80).toDouble, e(81), e(82).toDouble, e(83).toDouble, e(84).toDouble, e(85), e(86))) + + // a.User Frequency: + // number of revisions a user has contributed + // val resu= DF_Tags.groupBy("contributorID").agg(count("Rid")) + DF_Tags.registerTempTable("TagesTable") + val ContributorFreq_for_Each_Revision_DF = sqlContext + .sql("select contributorID as CIDUSER1, count(Rid) as NumberofRevisionsUserContributed from TagesTable where contributorID !='0' group by contributorID ") // .drop("CIDUSER1") + // ContributorFreq_for_Each_Revision_DF.show() + + // b.Cumulated : Number of a unique Item a user has contributed. + val CumulatedNumberof_uniqueItemsForUser_DF = sqlContext + .sql("select contributorID as CIDUSER2, COUNT(DISTINCT itemid) as NumberofUniqueItemsUseredit from TagesTable where contributorID !='0' group by contributorID") // .drop("CIDUSER2") + // CumulatedNumberof_uniqueItemsForUser_DF.show() + + // 1.Item Frequency: + // number of revisions an Item has + val ItemFrequ_DF = sqlContext.sql("select itemid, count(Rid) as NumberRevisionItemHas from TagesTable group by itemid") + // ItemFrequ_DF.show() + + // 2. Cumulate number of unique users have edited the Item : Did not consider the users IP. Contributor is an IP or Name. we consider name + val CumulatedNumberof_UniqueUserForItem_DF = sqlContext.sql("select itemid, COUNT(DISTINCT contributorID) as NumberUniqUserEditItem from TagesTable where contributorID !='0' group by itemid") + // CumulatedNumberof_UniqueUserForItem_DF.show() + + // 3. 
freq each Item : + val Fre_Item_DF = sqlContext.sql("select itemid, COUNT(itemid) as FreqItem from TagesTable group by itemid") + // Fre_Item_DF.show() + + // ***************************************************************************************************************************************** + // This is Main DataFrame: + val BeforeJoin_All_Features = sqlContext.createDataFrame(rowRDD, schema) + // BeforeJoin_All_Features.show() + + // ********************************** User feature Join + + // Join1 for add The first User Feature : number of revisions a user has contributed + val AfterJoinUser1_All_Features = BeforeJoin_All_Features.as("T1").join(ContributorFreq_for_Each_Revision_DF.as("T2"), $"T1.U8UserID" === $"T2.CIDUSER1", "leftouter").drop("CIDUSER1") + // AfterJoinUser1_All_Features.show() + + // Join2 for add The second User Feature + val AfterJoinUser2_All_Features = AfterJoinUser1_All_Features.as("T1").join(CumulatedNumberof_uniqueItemsForUser_DF.as("T2"), $"T1.U8UserID" === $"T2.CIDUSER2", "leftouter").drop("CIDUSER2") + // AfterJoinUser2_All_Features.show() + + // ********************************** Item Feature Join + // Join3 for add The First Item Feature :number of revisions an Item has + val AfterJoinItem3_All_Features = AfterJoinUser2_All_Features.as("T1").join(ItemFrequ_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") + // AfterJoinItem3_All_Features.show() + + // Join4 for add The Second Item Feature + val AfterJoinItem4_All_Features = AfterJoinItem3_All_Features.as("T1").join(CumulatedNumberof_UniqueUserForItem_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") + // AfterJoinItem4_All_Features.show() + + // Join5 for add The Third Item Feature + val AfterJoinItem5_All_Features = AfterJoinItem4_All_Features.as("T1").join(Fre_Item_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") + // 2 AfterJoinItem5_All_Features.show() + + // ******************************** + + // *Geografical information Feature from Meta File + // REVISION_ID|REVISION_SESSION_ID|USER_COUNTRY_CODE|USER_CONTINENT_CODE|USER_TIME_ZONE|USER_REGION_CODE|USER_CITY_NAME|USER_COUNTY_NAME|REVISION_TAGS + val df_GeoInf = sqlContext.read + .format("com.databricks.spark.csv") + .option("header", "true") // Use first line of all files as header + .option("inferSchema", "true") // Automatically infer data types + .load("hdfs://localhost:9000/mydata/Meta.csv").select("REVISION_ID", "REVISION_SESSION_ID", "USER_COUNTRY_CODE", "USER_CONTINENT_CODE", "USER_TIME_ZONE", + "USER_REGION_CODE", "USER_CITY_NAME", "USER_COUNTY_NAME", "REVISION_TAGS") + // df_GeoInf.show() + + val df_Truth = sqlContext.read + .format("com.databricks.spark.csv") + .option("header", "true") // Use first line of all files as header + .option("inferSchema", "true") // Automatically infer data types + .load("hdfs://localhost:9000/mydata/truth.csv").select("REVISION_ID", "ROLLBACK_REVERTED", "UNDO_RESTORE_REVERTED") + // df_GeoInf.show() + + val AfterJoinGeoInfo_All_Features = AfterJoinItem5_All_Features.as("T1").join(df_GeoInf.as("T2"), $"T1.Rid" === $"T2.REVISION_ID", "leftouter").drop("REVISION_ID").cache() + // AfterJoinGeoInfo_All_Features.show() + + val Final_All_Features = AfterJoinGeoInfo_All_Features.as("T1").join(df_Truth.as("T2"), $"T1.Rid" === $"T2.REVISION_ID", "leftouter").drop("REVISION_ID").cache() + // Final_All_Features.show() + + // Pre- process Data 
============================================================================================================================================================ + + // For String Column, We fill the Null values by "NA": + + var Fill_Missing_Final_All_Features = Final_All_Features.na.fill("NA", Seq("USER_COUNTRY_CODE", "USER_CONTINENT_CODE", "USER_TIME_ZONE", + "USER_REGION_CODE", "USER_CITY_NAME", "USER_COUNTY_NAME", "REVISION_TAGS")).cache() + + // For Integer Frequency Column, We fill the Null values by 0: + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.na.fill(0, Seq("FreqItem", "NumberUniqUserEditItem", "NumberRevisionItemHas", "NumberofUniqueItemsUseredit", + "NumberofRevisionsUserContributed", "REVISION_SESSION_ID")).cache() + // Fill_Missing_Final_All_Features.show() + + val BoolToDoubleUDF = udf { (BoolAsString: String) => if (BoolAsString == "T") 1.0 else 0.0 } + val IntegerToDouble = udf { (IntegerRevisionSessionID: Integer) => IntegerRevisionSessionID.toDouble } + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalROLLBACK_REVERTED", BoolToDoubleUDF(col("ROLLBACK_REVERTED"))) + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalUNDO_RESTORE_REVERTED", BoolToDoubleUDF(col("UNDO_RESTORE_REVERTED"))) + + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalREVISION_SESSION_ID", IntegerToDouble(col("REVISION_SESSION_ID"))) + + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberofRevisionsUserContributed", IntegerToDouble(col("NumberofRevisionsUserContributed"))) + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberofUniqueItemsUseredit", IntegerToDouble(col("NumberofUniqueItemsUseredit"))) + + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberRevisionItemHas", IntegerToDouble(col("NumberRevisionItemHas"))) + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberUniqUserEditItem", IntegerToDouble(col("NumberUniqUserEditItem"))) + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalFreqItem", IntegerToDouble(col("FreqItem"))) + + // ===========================================================================Caharacter Features : Double , Integer Features ======================================================== + // Double Ratio: For Ratio Double column, Fill -1 value by Median:Character Features + Ratio of Word Features : + var Samples = Fill_Missing_Final_All_Features.sample(false, 0.001).cache() // .where($"S2SimikaritySitelinkandLabel">0.0 || $"S3SimilarityLabelandSitelink">0.0 || $"S4SimilarityCommentComment">0.0) + Samples.registerTempTable("df") + + val Query = "select " + + "percentile_approx(C1uppercaseratio, 0.5) as meadian1" + "," + "percentile_approx(C2lowercaseratio, 0.5) as median2" + " ," + + "percentile_approx(C3alphanumericratio, 0.5) as median3" + "," + "percentile_approx(C4asciiratio, 0.5) as median4" + "," + + "percentile_approx(C5bracketratio, 0.5) as median5" + "," + "percentile_approx(C6digitalratio, 0.5) as median6" + "," + + "percentile_approx(C7latinratio, 0.5) as median7" + "," + "percentile_approx(C8whitespaceratio, 0.5) as median8" + "," + + "percentile_approx(C9puncratio, 0.5) as median9" + "," + "percentile_approx(C11arabicratio, 0.5) as median11" + "," + + "percentile_approx(C12bengaliratio, 0.5) as median12" + "," + "percentile_approx(C13brahmiratio, 0.5) as median13" 
+ "," + + "percentile_approx(C14cyrilinratio, 0.5) as median14" + "," + "percentile_approx(C15hanratio, 0.5) as median15" + "," + + "percentile_approx(c16malysiaratio, 0.5) as median16" + "," + + "percentile_approx(C17tamiratio, 0.5) as median17" + "," + "percentile_approx(C18telugratio, 0.5) as median18" + "," + + "percentile_approx(C19symbolratio, 0.5) as median19" + "," + "percentile_approx(C20alpharatio, 0.5) as median20" + "," + + "percentile_approx(C21visibleratio, 0.5) as median21" + "," + "percentile_approx(C22printableratio, 0.5) as median22" + "," + + "percentile_approx(C23blankratio, 0.5) as median23" + "," + "percentile_approx(C24controlratio, 0.5) as median24" + "," + + "percentile_approx(C25hexaratio, 0.5) as median25" ++ "," + "percentile_approx(W1languagewordratio, 0.5) as median26" + "," + + "percentile_approx(W3lowercaseratio, 0.5) as median27" + "," + "percentile_approx(W6badwordratio, 0.5) as median28" + "," + + "percentile_approx(W7uppercaseratio, 0.5) as median27" + "," + "percentile_approx(W8banwordratio, 0.5) as median27" + " from df" + + val medianValues = sqlContext.sql(Query).rdd + val Median = medianValues.first() + + // Median : + // Character Ratio Features: UDF + val lkpUDF1 = udf { (i: Double) => if (i == 0) Median(0).toString().toDouble else i } + val lkpUDF2 = udf { (i: Double) => if (i == 0) Median(1).toString().toDouble else i } + val lkpUDF3 = udf { (i: Double) => if (i == 0) Median(2).toString().toDouble else i } + val lkpUDF4 = udf { (i: Double) => if (i == 0) Median(3).toString().toDouble else i } + val lkpUDF5 = udf { (i: Double) => if (i == 0) Median(4).toString().toDouble else i } + val lkpUDF6 = udf { (i: Double) => if (i == 0) Median(5).toString().toDouble else i } + val lkpUDF7 = udf { (i: Double) => if (i == 0) Median(6).toString().toDouble else i } + val lkpUDF8 = udf { (i: Double) => if (i == 0) Median(7).toString().toDouble else i } + val lkpUDF9 = udf { (i: Double) => if (i == 0) Median(8).toString().toDouble else i } + + val lkpUDF11 = udf { (i: Double) => if (i == 0) Median(9).toString().toDouble else i } + val lkpUDF12 = udf { (i: Double) => if (i == 0) Median(10).toString().toDouble else i } + val lkpUDF13 = udf { (i: Double) => if (i == 0) Median(11).toString().toDouble else i } + val lkpUDF14 = udf { (i: Double) => if (i == 0) Median(12).toString().toDouble else i } + val lkpUDF15 = udf { (i: Double) => if (i == 0) Median(13).toString().toDouble else i } + val lkpUDF16 = udf { (i: Double) => if (i == 0) Median(14).toString().toDouble else i } + val lkpUDF17 = udf { (i: Double) => if (i == 0) Median(15).toString().toDouble else i } + val lkpUDF18 = udf { (i: Double) => if (i == 0) Median(16).toString().toDouble else i } + val lkpUDF19 = udf { (i: Double) => if (i == 0) Median(17).toString().toDouble else i } + val lkpUDF20 = udf { (i: Double) => if (i == 0) Median(18).toString().toDouble else i } + val lkpUDF21 = udf { (i: Double) => if (i == 0) Median(19).toString().toDouble else i } + val lkpUDF22 = udf { (i: Double) => if (i == 0) Median(20).toString().toDouble else i } + val lkpUDF23 = udf { (i: Double) => if (i == 0) Median(21).toString().toDouble else i } + val lkpUDF24 = udf { (i: Double) => if (i == 0) Median(22).toString().toDouble else i } + val lkpUDF25 = udf { (i: Double) => if (i == 0) Median(23).toString().toDouble else i } + + val df1 = Fill_Missing_Final_All_Features.withColumn("FinalC1uppercaseratio", lkpUDF1(col("C1uppercaseratio"))) // .drop("C1uppercaseratio").cache() + val df2 = 
df1.withColumn("FinalC2lowercaseratio", lkpUDF2(col("C2lowercaseratio"))) // .drop("C2lowercaseratio").cache() + // df1.unpersist() + val df3 = df2.withColumn("FinalC3alphanumericratio", lkpUDF3(col("C3alphanumericratio"))) // .drop("C3alphanumericratio").cache() + // df2.unpersist() + val df4 = df3.withColumn("FinalC4asciiratio", lkpUDF4(col("C4asciiratio"))) // .drop("C4asciiratio").cache() + // df3.unpersist() + val df5 = df4.withColumn("FinalC5bracketratio", lkpUDF5(col("C5bracketratio"))) // .drop("C5bracketratio").cache() + // df4.unpersist() + val df6 = df5.withColumn("FinalC6digitalratio", lkpUDF6(col("C6digitalratio"))) // .drop("C6digitalratio").cache() + // df5.unpersist() + val df7 = df6.withColumn("FinalC7latinratio", lkpUDF7(col("C7latinratio"))) // .drop("C7latinratio").cache() + // df6.unpersist() + val df8 = df7.withColumn("FinalC8whitespaceratio", lkpUDF8(col("C8whitespaceratio"))) // .drop("C8whitespaceratio").cache() + // df7.unpersist() + val df9 = df8.withColumn("FinalC9puncratio", lkpUDF9(col("C9puncratio"))) // .drop("C9puncratio").cache() + + // Mean : + // character integer values : + val Mean_C10longcharacterseq = Samples.agg(mean("C10longcharacterseq")).head() + val C10_Mean = Mean_C10longcharacterseq.getDouble(0) + val lkpUDFC10 = udf { (i: Double) => if (i == 0) C10_Mean else i } + val df10 = df9.withColumn("FinalC10longcharacterseq", lkpUDFC10(col("C10longcharacterseq"))) + + // Median + val df11 = df10.withColumn("FinalC11arabicratio", lkpUDF11(col("C11arabicratio"))) // .drop("C11arabicratio").cache() + // df9.unpersist() + val df12 = df11.withColumn("FinalC12bengaliratio", lkpUDF12(col("C12bengaliratio"))) // .drop("C12bengaliratio").cache() + // df11.unpersist() + val df13 = df12.withColumn("FinalC13brahmiratio", lkpUDF13(col("C13brahmiratio"))) // .drop("C13brahmiratio").cache() + // df12.unpersist() + val df14 = df13.withColumn("FinalC14cyrilinratio", lkpUDF14(col("C14cyrilinratio"))) // .drop("C14cyrilinratio").cache() + // df13.unpersist() + val df15 = df14.withColumn("FinalC15hanratio", lkpUDF15(col("C15hanratio"))) // .drop("C15hanratio").cache() + // df14.unpersist() + val df16 = df15.withColumn("Finalc16malysiaratio", lkpUDF16(col("c16malysiaratio"))) // .drop("c16malysiaratio").cache() + // df15.unpersist() + val df17 = df16.withColumn("FinalC17tamiratio", lkpUDF17(col("C17tamiratio"))) // .drop("C17tamiratio").cache() + // df16.unpersist() + val df18 = df17.withColumn("FinalC18telugratio", lkpUDF18(col("C18telugratio"))) // .drop("C18telugratio").cache() + // df17.unpersist() + val df19 = df18.withColumn("FinalC19symbolratio", lkpUDF19(col("C19symbolratio"))) // .drop("C19symbolratio").cache() + // df18.unpersist() + val df20 = df19.withColumn("FinalC20alpharatio", lkpUDF20(col("C20alpharatio"))) // .drop("C20alpharatio").cache() + // df19.unpersist() + val df21 = df20.withColumn("FinalC21visibleratio", lkpUDF21(col("C21visibleratio"))) // .drop("C21visibleratio").cache() + // df20.unpersist() + val df22 = df21.withColumn("FinalC22printableratio", lkpUDF22(col("C22printableratio"))) // .drop("C22printableratio").cache() + // df21.unpersist() + val df23 = df22.withColumn("FinalC23blankratio", lkpUDF23(col("C23blankratio"))) // .drop("C23blankratio").cache() + // df22.unpersist() + val df24 = df23.withColumn("FinalC24controlratio", lkpUDF24(col("C24controlratio"))) // .drop("C24controlratio").cache() + // df23.unpersist() + val df25 = df24.withColumn("FinalC25hexaratio", lkpUDF25(col("C25hexaratio"))) // .drop("C25hexaratio").cache() + + // 
************************************************End Character Features **************************************************************************************** + + // ************************************************Start Word Features **************************************************************************************** + + // Word Ratio Features : UDF + val lkpUDFW1 = udf { (i: Double) => if (i == 0) Median(24).toString().toDouble else i } + val lkpUDFW3 = udf { (i: Double) => if (i == 0) Median(25).toString().toDouble else i } + val lkpUDFW6 = udf { (i: Double) => if (i == 0) Median(26).toString().toDouble else i } + val lkpUDFW7 = udf { (i: Double) => if (i == 0) Median(27).toString().toDouble else i } + val lkpUDFW8 = udf { (i: Double) => if (i == 0) Median(28).toString().toDouble else i } + + // 1. + val df26 = df25.withColumn("FinalW1languagewordratio", lkpUDFW1(col("W1languagewordratio"))) // .drop("W1languagewordratio").cache() + + // 2.Boolean(Double) IsContainLanguageWord + + // 3. + val df27 = df26.withColumn("FinalW3lowercaseratio", lkpUDFW3(col("W3lowercaseratio"))) // .drop("W3lowercaseratio").cache() + // df26.unpersist() + + // 4. Integer " Mean: + val Mean_W4longestword = Samples.agg(mean("W4longestword")).head() + val W4_Mean = Mean_W4longestword.getDouble(0) + val lkpUDFW4 = udf { (i: Double) => if (i == 0) W4_Mean else i } + val df28 = df27.withColumn("FinalW4longestword", lkpUDFW4(col("W4longestword"))) + + // 5. Boolean (Double ) W5IscontainURL + // 6. + val df29 = df28.withColumn("FinalW6badwordratio", lkpUDFW6(col("W6badwordratio"))) // .drop("W6badwordratio").cache() + + // 7. + val df30 = df29.withColumn("FinalW7uppercaseratio", lkpUDFW7(col("W7uppercaseratio"))) // .drop("W7uppercaseratio").cache() + + // 8. + val df31 = df30.withColumn("FinalW8banwordratio", lkpUDFW8(col("W8banwordratio"))) // .drop("W8banwordratio").cache() + + // 9.FemalFirst Boolean(Double) + // 10.Male First Boolean(Double) + // 11.ContainBadWord Boolean(Double) + // 12ContainBanWord Boolean(Double) + + // 13. Integer(Double): + val Mean_W13W13NumberSharewords = Samples.agg(mean("W13NumberSharewords")).head() + val W13_Mean = Mean_W13W13NumberSharewords.getDouble(0) + val lkpUDFW13 = udf { (i: Double) => if (i == 0) W13_Mean else i } + val df32 = df31.withColumn("FinalW13NumberSharewords", lkpUDFW13(col("W13NumberSharewords"))) + + // 14. Integer (Double): + val Mean_W14NumberSharewordswithoutStopwords = Samples.agg(mean("W14NumberSharewordswithoutStopwords")).head() + val W14_Mean = Mean_W14NumberSharewordswithoutStopwords.getDouble(0) + val lkpUDFW14 = udf { (i: Double) => if (i == 0) W14_Mean else i } + val df33 = df32.withColumn("FinalW14NumberSharewordswithoutStopwords", lkpUDFW14(col("W14NumberSharewordswithoutStopwords"))) + + // 15. Double (Not ratio): + val Mean_W15PortionQid = Samples.agg(mean("W15PortionQid")).head() + val W15_Mean = Mean_W15PortionQid.getDouble(0) + val lkpUDFW15 = udf { (i: Double) => if (i == 0) W15_Mean else i } + val df34 = df33.withColumn("FinalW15PortionQid", lkpUDFW15(col("W15PortionQid"))) + + // 16. 
Double(Not Ratio): + val Mean_W16PortionLnags = Samples.agg(mean("W16PortionLnags")).head() + val W16_Mean = Mean_W16PortionLnags.getDouble(0) + val lkpUDFW16 = udf { (i: Double) => if (i == 0) W16_Mean else i } + val df35 = df34.withColumn("FinalW16PortionLnags", lkpUDFW16(col("W16PortionLnags"))) + + // 17.Double(Not ratio): + val Mean_W17PortionLinks = Samples.agg(mean("W17PortionLinks")).head() + val W17_Mean = Mean_W17PortionLinks.getDouble(0) + val lkpUDFW17 = udf { (i: Double) => if (i == 0) W17_Mean else i } + val df36 = df35.withColumn("FinalW17PortionLinks", lkpUDFW17(col("W17PortionLinks"))) + + // ************************************************End Word Features **************************************************************************************** + + // ************************************************Start Sentences Features **************************************************************************************** + // 1. Integer(Double) + val Mean_S1CommentTailLength = Samples.agg(mean("S1CommentTailLength")).head() + val S1_Mean = RoundDouble(Mean_S1CommentTailLength.getDouble(0)) + val lkpUDFS1 = udf { (i: Double) => if (i == 0) S1_Mean else i } + val df37 = df36.withColumn("FinalS1CommentTailLength", lkpUDFS1(col("S1CommentTailLength"))) + + // 2. Double but Not ratio values : + val Mean_S2SimikaritySitelinkandLabel = Samples.agg(mean("S2SimikaritySitelinkandLabel")).head() + val S2_Mean = RoundDouble(Mean_S2SimikaritySitelinkandLabel.getDouble(0)) + val lkpUDFS2 = udf { (i: Double) => if (i == 0) S2_Mean else i } + val df39 = df37.withColumn("FinalS2SimikaritySitelinkandLabel", lkpUDFS2(col("S2SimikaritySitelinkandLabel"))) + + // 3. Double but Not ratio values : + val Mean_S3SimilarityLabelandSitelink = Samples.agg(mean("S3SimilarityLabelandSitelink")).head() + val S3_Mean = RoundDouble(Mean_S3SimilarityLabelandSitelink.getDouble(0)) + val lkpUDFS3 = udf { (i: Double) => if (i == 0.0) S3_Mean else i } + val df40 = df39.withColumn("FinalS3SimilarityLabelandSitelink", lkpUDFS3(col("S3SimilarityLabelandSitelink"))) + + // 4. Double but Not ratio values : + val Mean_S4SimilarityCommentComment = Samples.agg(mean("S4SimilarityCommentComment")).head() + val S4_Mean = RoundDouble(Mean_S4SimilarityCommentComment.getDouble(0)) + val lkpUDFS4 = udf { (i: Double) => if (i == 0.0) S4_Mean else i } + val df41 = df40.withColumn("FinalS4SimilarityCommentComment", lkpUDFS4(col("S4SimilarityCommentComment"))) + + // df41.show() + // ************************************************End Sentences Features **************************************************************************************** + // *********************************************** Start Statement Features **************************************************************************************** + // 1. String + // 2. String + // 3. String + // ************************************************End Statement Features **************************************************************************************** + // *********************************************** Start User Features **************************************************************************************** + + // 1.Boolean(Double) + // 2.Boolean(Double) + // 3.Boolean(Double) + // 4.Boolean(Double) + // 5.Boolean(Double) + // 6.Boolean(Double) + // 7. (Double) IP No need to fill Missing Data + // 8. 
(Double) ID No need to fill Missing Data + // 9.Boolean(Double) + // 10.Boolean(Double) + + // *********************************************** End User Features **************************************************************************************** + // *********************************************** Start Item Features **************************************************************************************** + // 1. Integer (Double) No need to fill missing values + // 2. Integer (Double) No need to fill missing values + // 3. Integer (Double) No need to fill missing values + // 4. Integer (Double) No need to fill missing values + // 5. Integer (Double) No need to fill missing values + // 6. Integer (Double) No need to fill missing values + // 7. Integer (Double) No need to fill missing values + // 8. Integer (Double) No need to fill missing values + // 9. Integer (Double) No need to fill missing values + // 10. Integer (Double) No need to fill missing values + // 11. String + // *********************************************** End Item Features **************************************************************************************** + // *********************************************** Start Revision Features **************************************************************************************** + // 1.String + // 2.String + // 3.Boolean (Double) + // 4.Integer(Double) + // 5.String + // 6.String + // 7. Boolean(Double) + // 8. String + // 9.String + // 10. Integer (Double) + // 11.String + // 12. integer(Double) + // 13. Long(Double) + // 14. integer (Double) + // 15.String + // 16.String + // *********************************************** End Revision Features **************************************************************************************** + // *********************************************** Meta Data , Truth Data and Frequnces **************************************************************************************** + // Meta + // 1.Revision Session :Integer (Converted to Double) + // 2. 
User Country Code + // 3.User Continent Code + // 4.User Time Size + // 5.User Region Code + // 6.User-city Name + // 7.User Country Name + // 8.RevisionTags + + // Truth: + // 1.Undo + + // Freq : + + // 1.5 features + + // Roll Boolean :Boolean (Double) + // Undo :Boolean (Double) + + // *********************************************** End Revision Features **************************************************************************************** + + // ===========================================================================String Features==================================================================================== + + val df42 = df41.withColumn( + // statement String features: + "StringFeatures", concat($"SS1Property", lit(";"), $"SS2DataValue", lit(";"), $"SS3ItemValue", lit(";"), $"I11ItemTitle", + // Revision String Features: + lit(";"), $"R1languageRevision", + lit(";"), $"R2RevisionLanguageLocal", + lit(";"), $"R5RevisionAction", + lit(";"), $"R6PrevReviAction", + lit(";"), $"R8ParRevision", + lit(";"), $"R9RevisionTime", + lit(";"), $"R11ContentType", + lit(";"), $"R15RevisionSubaction", + lit(";"), $"R16PrevReviSubaction", + + lit(";"), $"USER_COUNTRY_CODE", + lit(";"), $"USER_CONTINENT_CODE", + lit(";"), $"USER_TIME_ZONE", + lit(";"), $"USER_REGION_CODE", + lit(";"), $"USER_CITY_NAME", + lit(";"), $"USER_COUNTY_NAME", + lit(";"), $"REVISION_TAGS")) + + val toArray = udf((record: String) => record.split(";").map(_.toString())) + val test1 = df42.withColumn("StringFeatures", toArray(col("StringFeatures"))) + // test1.show() + // test1.printSchema() + + val word2Vec = new Word2Vec().setInputCol("StringFeatures").setOutputCol("result").setVectorSize(20).setMinCount(0) + val model = word2Vec.fit(test1) + val result = model.transform(test1) // .rdd + + // result.show() + + val Todense = udf((b: Vector) => b.toDense) + val test_new2 = result.withColumn("result", Todense(col("result"))) + + val assembler = new VectorAssembler().setInputCols(Array( + "result", + + // character + "FinalC1uppercaseratio", "FinalC2lowercaseratio", "FinalC3alphanumericratio", "FinalC4asciiratio", "FinalC5bracketratio", "FinalC6digitalratio", + "FinalC7latinratio", "FinalC8whitespaceratio", "FinalC9puncratio", "FinalC10longcharacterseq", "FinalC11arabicratio", "FinalC12bengaliratio", + "FinalC13brahmiratio", "FinalC14cyrilinratio", "FinalC15hanratio", "Finalc16malysiaratio", "FinalC17tamiratio", "FinalC18telugratio", + "FinalC19symbolratio", "FinalC20alpharatio", "FinalC21visibleratio", "FinalC22printableratio", "FinalC23blankratio", "FinalC24controlratio", "FinalC25hexaratio", + + // Words + "FinalW1languagewordratio", "W2Iscontainlanguageword", "FinalW3lowercaseratio", "FinalW4longestword", "W5IscontainURL", "FinalW6badwordratio", + "FinalW7uppercaseratio", "FinalW8banwordratio", "W9FemalFirstName", "W10MaleFirstName", "W11IscontainBadword", "W12IsContainBanword", + "FinalW13NumberSharewords", "FinalW14NumberSharewordswithoutStopwords", "FinalW15PortionQid", "FinalW16PortionLnags", "FinalW17PortionLinks", + + // Sentences : + "FinalS1CommentTailLength", "FinalS2SimikaritySitelinkandLabel", "FinalS3SimilarityLabelandSitelink", "FinalS4SimilarityCommentComment", + + // User : + "U1IsPrivileged", "U2IsBotUser", "U3IsBotuserWithFlaguser", "U4IsProperty", "U5IsTranslator", "U6IsRegister", "U7IPValue", "U8UserID", + "U9HasBirthDate", "U10HasDeathDate", + + // Item: + + "I1NumberLabels", "I2NumberDescription", "I3NumberAliases", "I4NumberClaims", "I5NumberSitelinks", "I6NumberStatement", + 
"I7NumberReferences", "I8NumberQualifier", "I9NumberQualifierOrder", "I10NumberBadges", + + // Revision: + "R3IslatainLanguage", "R4JsonLength", "R7RevisionAccountChange", "R10RevisionSize", "R12BytesIncrease", + "R13TimeSinceLastRevi", "R14CommentLength", + + // Meta , truth , Freq + // meta : + "FinalREVISION_SESSION_ID", // Truth: - //1.Undo - - // Freq : - - //1.5 features - - // Roll Boolean :Boolean (Double) - // Undo :Boolean (Double) - - //*********************************************** End Revision Features **************************************************************************************** - - //===========================================================================String Features==================================================================================== - - val df42 = df41.withColumn( - //statement String features: - "StringFeatures", concat($"SS1Property", lit(";"), $"SS2DataValue", lit(";"), $"SS3ItemValue", lit(";"), $"I11ItemTitle", - //Revision String Features: - lit(";"), $"R1languageRevision", - lit(";"), $"R2RevisionLanguageLocal", - lit(";"), $"R5RevisionAction", - lit(";"), $"R6PrevReviAction", - lit(";"), $"R8ParRevision", - lit(";"), $"R9RevisionTime", - lit(";"), $"R11ContentType", - lit(";"), $"R15RevisionSubaction", - lit(";"), $"R16PrevReviSubaction", - - lit(";"), $"USER_COUNTRY_CODE", - lit(";"), $"USER_CONTINENT_CODE", - lit(";"), $"USER_TIME_ZONE", - lit(";"), $"USER_REGION_CODE", - lit(";"), $"USER_CITY_NAME", - lit(";"), $"USER_COUNTY_NAME", - lit(";"), $"REVISION_TAGS")) - - val toArray = udf((record: String) => record.split(";").map(_.toString())) - val test1 = df42.withColumn("StringFeatures", toArray(col("StringFeatures"))) - // test1.show() - // test1.printSchema() - - val word2Vec = new Word2Vec().setInputCol("StringFeatures").setOutputCol("result").setVectorSize(20).setMinCount(0) - val model = word2Vec.fit(test1) - val result = model.transform(test1) //.rdd - - // result.show() - - val Todense = udf((b: Vector) => b.toDense) - val test_new2 = result.withColumn("result", Todense(col("result"))) - - val assembler = new VectorAssembler().setInputCols(Array( - "result", - - // character - "FinalC1uppercaseratio", "FinalC2lowercaseratio", "FinalC3alphanumericratio", "FinalC4asciiratio", "FinalC5bracketratio", "FinalC6digitalratio", - "FinalC7latinratio", "FinalC8whitespaceratio", "FinalC9puncratio", "FinalC10longcharacterseq", "FinalC11arabicratio", "FinalC12bengaliratio", - "FinalC13brahmiratio", "FinalC14cyrilinratio", "FinalC15hanratio", "Finalc16malysiaratio", "FinalC17tamiratio", "FinalC18telugratio", - "FinalC19symbolratio", "FinalC20alpharatio", "FinalC21visibleratio", "FinalC22printableratio", "FinalC23blankratio", "FinalC24controlratio", "FinalC25hexaratio", - - // Words - "FinalW1languagewordratio", "W2Iscontainlanguageword", "FinalW3lowercaseratio", "FinalW4longestword", "W5IscontainURL", "FinalW6badwordratio", - "FinalW7uppercaseratio", "FinalW8banwordratio", "W9FemalFirstName", "W10MaleFirstName", "W11IscontainBadword", "W12IsContainBanword", - "FinalW13NumberSharewords", "FinalW14NumberSharewordswithoutStopwords", "FinalW15PortionQid", "FinalW16PortionLnags", "FinalW17PortionLinks", - - //Sentences : - "FinalS1CommentTailLength", "FinalS2SimikaritySitelinkandLabel", "FinalS3SimilarityLabelandSitelink", "FinalS4SimilarityCommentComment", - - // User : - "U1IsPrivileged", "U2IsBotUser", "U3IsBotuserWithFlaguser", "U4IsProperty", "U5IsTranslator", "U6IsRegister", "U7IPValue", "U8UserID", - "U9HasBirthDate", "U10HasDeathDate", 
+ "FinalUNDO_RESTORE_REVERTED", - //Item: + // Freq: + "FinalNumberofRevisionsUserContributed", + "FinalNumberofUniqueItemsUseredit", "FinalNumberRevisionItemHas", "FinalNumberUniqUserEditItem", "FinalFreqItem")).setOutputCol("features") + val Training_Data = assembler.transform(test_new2) - "I1NumberLabels", "I2NumberDescription", "I3NumberAliases", "I4NumberClaims", "I5NumberSitelinks", "I6NumberStatement", - "I7NumberReferences", "I8NumberQualifier", "I9NumberQualifierOrder", "I10NumberBadges", - - //Revision: - "R3IslatainLanguage", "R4JsonLength", "R7RevisionAccountChange", "R10RevisionSize", "R12BytesIncrease", - "R13TimeSinceLastRevi", "R14CommentLength", - - // Meta , truth , Freq - // meta : - "FinalREVISION_SESSION_ID", - // Truth: - "FinalUNDO_RESTORE_REVERTED", - - //Freq: - "FinalNumberofRevisionsUserContributed", - "FinalNumberofUniqueItemsUseredit", "FinalNumberRevisionItemHas", "FinalNumberUniqUserEditItem", "FinalFreqItem")).setOutputCol("features") - val Training_Data = assembler.transform(test_new2) - - // Prepare the data for classification: + // Prepare the data for classification: // NewData.registerTempTable("DB") // val Training_Data = sqlContext.sql("select Rid, features, FinalROLLBACK_REVERTED from DB") - //val Data = sqlContext.sql("select Rid, features, FinalROLLBACK_REVERTED as label from DB") // for logistic regrision + // val Data = sqlContext.sql("select Rid, features, FinalROLLBACK_REVERTED as label from DB") // for logistic regrision - //Data.show() + // Data.show() - //val TestClassifiers = new Classifiers() -// - // TestClassifiers.RandomForestClassifer(Data, sqlContext) -// // TestClassifiers.DecisionTreeClassifier(Data, sqlContext) -// // TestClassifiers.LogisticRegrision(Data, sqlContext) -// // TestClassifiers.GradientBoostedTree(Data, sqlContext) -// // TestClassifiers.MultilayerPerceptronClassifier(Data, sqlContext) + // val TestClassifiers = new Classifiers() + // + // TestClassifiers.RandomForestClassifer(Data, sqlContext) + // // TestClassifiers.DecisionTreeClassifier(Data, sqlContext) + // // TestClassifiers.LogisticRegrision(Data, sqlContext) + // // TestClassifiers.GradientBoostedTree(Data, sqlContext) + // // TestClassifiers.MultilayerPerceptronClassifier(Data, sqlContext) Training_Data - - + } - - //*********************************************************************************************************************************************** - // Function 3:Testing XML and Vandalism Detection + // *********************************************************************************************************************************************** + // Function 3:Testing XML and Vandalism Detection def Testing_Start_StandardXMLParser_VD(sc: SparkContext): DataFrame = { val sqlContext = new org.apache.spark.sql.SQLContext(sc) import sqlContext.implicits._ import org.apache.spark.sql.functions._ // for UDF import org.apache.spark.sql.types._ - // Streaming records: - val jobConf = new JobConf() - val NormalXML_Parser_OBJ = new ParseNormalXML() - val RDD_OBJ = new ParseNormalXML() - - val Testing_RDD_All_Record = RDD_OBJ.Testing_DB_NormalXML_Parser(sc).cache() - - - // ======= Json part : - //Json RDD : Each record has its Revision iD: - val JsonRDD = Testing_RDD_All_Record.map(_.split("NNLL")).map(v => replacing_with_Quoto(v(0), v(8))).cache() - //JsonRDD.foreach(println) - //println(JsonRDD.count()) - - // Data set - val Ds_Json = sqlContext.jsonRDD(JsonRDD).select("key", "id", "labels", "descriptions", "aliases", "claims", "sitelinks").cache() - 
//Ds_Json.show() - // println(Ds_Json.count()) - - // ======= Tags part : // Contributor IP here is in Decimal format not IP format and It is converted in ParseNormalXml stage - val TagsRDD = Testing_RDD_All_Record.map(_.split("NNLL")).map(x => (x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9), x(10), x(11))).cache() - val DF_Tags = TagsRDD.toDF("Rid", "Itemid", "comment", "pid", "time", "contributorIP", "contributorID", "contributorName", "JsonText", "model", "format", "sha").cache() - // DF_Tags.show() - // println(DF_Tags.count()) - - //======== Join Json part with Tag Part:============================ - //Joining to have full data - val DF_First_DF_Result_Join_Tags_and_Json = DF_Tags.as("T1").join(Ds_Json.as("T2"), $"T1.Rid" === $"T2.key", "leftouter").select("Rid", "itemid", "comment", "pid", "time", "contributorIP", "contributorID", "contributorName", "JsonText", "labels", "descriptions", "aliases", "claims", "sitelinks", "model", "format", "sha") //.orderBy("Rid", "Itemid") - DF_First_DF_Result_Join_Tags_and_Json.registerTempTable("Data1") - val dfr_DATA_JsonTages1 = sqlContext.sql("select * from Data1 order by itemid ,Rid ").cache() - - val colNames = Seq("Rid2", "itemid2", "comment2", "pid2", "time2", "contributorIP2", "contributorID2", "contributorName2", "JsonText2", "labels2", "descriptions2", "aliases2", "claims2", "sitelinks2", "model2", "format2", "sha2") - val DF_Second = DF_First_DF_Result_Join_Tags_and_Json.toDF(colNames: _*) //.distinct() - DF_Second.registerTempTable("Data2") - - //===================================================================Parent // Previous Revision============================================================================================================== - //val DF_Joined = result1.as("df1").join(result2.as("df2"), col("itemid") === col("itemid2") && col("index1") === col("index2") + 1, "leftouter").select("Rid", "itemid", "comment", "pid", "time", "contributorIP", "contributorID", "contributorName", "JsonText", "labels", "descriptions", "aliases", "claims", "sitelinks", "model", "format", "sha", "Rid2", "itemid2", "comment2", "pid2", "time2", "contributorIP2", "contributorID2", "contributorName2", "JsonText2", "labels2", "descriptions2", "aliases2", "claims2", "sitelinks2", "model2", "format2", "sha2") - //.select("itemid", "Rid","pid","time","itemid2","Rid2","pid2","time2") - - //Joining based on Parent Id to get the previous cases: ParentID - val DF_Joined = DF_First_DF_Result_Join_Tags_and_Json.as("df1").join(DF_Second.as("df2"), $"df1.pid" === $"df2.Rid2", "leftouter").distinct() - - val RDD_After_JoinDF = DF_Joined.rdd.distinct() - val x = RDD_After_JoinDF.map(row => (row(0).toString().toInt, row)).cache() - val part = new RangePartitioner(4, x) - val partitioned = x.partitionBy(part).persist() // persist is important for this case and obligatory. 
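(Sketch, not part of the patch.) Both the removed lines above and their reformatted counterparts key the joined revision rows by revision id and push them through a RangePartitioner before feature extraction, so that each partition holds a contiguous range of ids and the shuffled layout is kept in memory for the passes that follow. A small sketch of that keying-and-partitioning step on a toy pair RDD; the object and value names are illustrative only.

import org.apache.spark.{SparkConf, SparkContext, RangePartitioner}
import org.apache.spark.storage.StorageLevel

object RangePartitionSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("RangePartitionSketch"))

    // Toy stand-in for the joined revision rows: (revision id, payload).
    val rows = sc.parallelize(Seq((42, "rev-42"), (7, "rev-7"), (100, "rev-100"), (3, "rev-3")))

    // Spread the keys over 4 range partitions; persisting keeps the shuffled layout around
    // for the repeated feature-extraction passes that follow.
    val part = new RangePartitioner(4, rows)
    val partitioned = rows.partitionBy(part).persist(StorageLevel.MEMORY_ONLY)

    // Rows per partition, just to show the effect of the range partitioning.
    println(partitioned.glom().map(_.length).collect().mkString(","))
    sc.stop()
  }
}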
- //partitioned.foreach(println) - // - // //=====================================================All Features Based on Categories of Features Data Type :================================================================================== - // - val Result_all_Features = partitioned.map { case (x, y) => (x.toString() + "," + All_Features(y).toString()) } // we convert the Pair RDD to String one LineRDD to be able to make DF based on "," - //Result_all_Features.foreach(println) - // println("nayef" + Result_all_Features.count()) - - // Conver the RDD of All Features to DataFrame: - - val schema = StructType( - - //0 - StructField("Rid", IntegerType, false) :: - - // Character Features : - /* 1*/ StructField("C1uppercaseratio", DoubleType, false) :: /*2 */ StructField("C2lowercaseratio", DoubleType, false) :: /*3*/ StructField("C3alphanumericratio", DoubleType, false) :: - /*4*/ StructField("C4asciiratio", DoubleType, false) :: /*5*/ StructField("C5bracketratio", DoubleType, false) :: /*6*/ StructField("C6digitalratio", DoubleType, false) :: - /*7*/ StructField("C7latinratio", DoubleType, false) :: /*8*/ StructField("C8whitespaceratio", DoubleType, false) :: /* 9*/ StructField("C9puncratio", DoubleType, false) :: - /*10*/ StructField("C10longcharacterseq", DoubleType, false) :: /*11*/ StructField("C11arabicratio", DoubleType, false) :: /*12*/ StructField("C12bengaliratio", DoubleType, false) :: - /*13 */ StructField("C13brahmiratio", DoubleType, false) :: /*14*/ StructField("C14cyrilinratio", DoubleType, false) :: /*15*/ StructField("C15hanratio", DoubleType, false) :: - /*16*/ StructField("c16malysiaratio", DoubleType, false) :: /*17*/ StructField("C17tamiratio", DoubleType, false) :: /*18*/ StructField("C18telugratio", DoubleType, false) :: - /*19 */ StructField("C19symbolratio", DoubleType, false) :: /*20 */ StructField("C20alpharatio", DoubleType, false) :: /*21*/ StructField("C21visibleratio", DoubleType, false) :: - /*22*/ StructField("C22printableratio", DoubleType, false) :: /*23*/ StructField("C23blankratio", DoubleType, false) :: /*24 */ StructField("C24controlratio", DoubleType, false) :: - /* 25 */ StructField("C25hexaratio", DoubleType, false) :: - - //word Features: - /*26*/ StructField("W1languagewordratio", DoubleType, false) :: /*27 Boolean */ StructField("W2Iscontainlanguageword", DoubleType, false) :: /*28*/ StructField("W3lowercaseratio", DoubleType, false) :: - /*29 Integer */ StructField("W4longestword", IntegerType, false) :: /*30 Boolean */ StructField("W5IscontainURL", DoubleType, false) :: /*31*/ StructField("W6badwordratio", DoubleType, false) :: - /*32*/ StructField("W7uppercaseratio", DoubleType, false) :: /*33*/ StructField("W8banwordratio", DoubleType, false) :: /*34 Boolean */ StructField("W9FemalFirstName", DoubleType, false) :: - /*35 Boolean */ StructField("W10MaleFirstName", DoubleType, false) :: /*36 Boolean */ StructField("W11IscontainBadword", DoubleType, false) :: /*37 Boolean*/ StructField("W12IsContainBanword", DoubleType, false) :: - /*38 integer */ StructField("W13NumberSharewords", DoubleType, false) :: /*39 Integer */ StructField("W14NumberSharewordswithoutStopwords", DoubleType, false) :: - /*40*/ StructField("W15PortionQid", DoubleType, false) :: /*41*/ StructField("W16PortionLnags", DoubleType, false) :: /*42*/ StructField("W17PortionLinks", DoubleType, false) :: - - // - // // Sentences Features: - /*43*/ StructField("S1CommentTailLength", DoubleType, false) :: /*44*/ StructField("S2SimikaritySitelinkandLabel", DoubleType, false) :: /*45*/ 
StructField("S3SimilarityLabelandSitelink", DoubleType, false) :: /*46*/ StructField("S4SimilarityCommentComment", DoubleType, false) :: - // - // // Statements Features : - /*47*/ StructField("SS1Property", StringType, false) :: /*48*/ StructField("SS2DataValue", StringType, false) :: /*49*/ StructField("SS3ItemValue", StringType, false) :: - // - // - // //User Features : - /*50 Boolean*/ StructField("U1IsPrivileged", DoubleType, false) :: /*51 Boolean*/ StructField("U2IsBotUser", DoubleType, false) :: /*52 Boolean*/ StructField("U3IsBotuserWithFlaguser", DoubleType, false) :: - /*53 Boolean*/ StructField("U4IsProperty", DoubleType, false) :: /*54 Boolean*/ StructField("U5IsTranslator", DoubleType, false) :: /*55 Boolean*/ StructField("U6IsRegister", DoubleType, false) :: - /*56*/ StructField("U7IPValue", DoubleType, false) :: /*57*/ StructField("U8UserID", IntegerType, false) :: /*58*/ StructField("U9HasBirthDate", DoubleType, false) :: /*59*/ StructField("U10HasDeathDate", DoubleType, false) :: - - //Items Features : - - /*60*/ StructField("I1NumberLabels", DoubleType, false) :: /*61*/ StructField("I2NumberDescription", DoubleType, false) :: /*62*/ StructField("I3NumberAliases", DoubleType, false) :: /*63*/ StructField("I4NumberClaims", DoubleType, false) :: - /*64*/ StructField("I5NumberSitelinks", DoubleType, false) :: /*65*/ StructField("I6NumberStatement", DoubleType, false) :: /*66*/ StructField("I7NumberReferences", DoubleType, false) :: /*67*/ StructField("I8NumberQualifier", DoubleType, false) :: - /*68*/ StructField("I9NumberQualifierOrder", DoubleType, false) :: /*69*/ StructField("I10NumberBadges", DoubleType, false) :: /*70*/ StructField("I11ItemTitle", StringType, false) :: - - // Revision Features: - /*71*/ StructField("R1languageRevision", StringType, false) :: /*72*/ StructField("R2RevisionLanguageLocal", StringType, false) :: /*73*/ StructField("R3IslatainLanguage", DoubleType, false) :: - /*74*/ StructField("R4JsonLength", DoubleType, false) :: /*75*/ StructField("R5RevisionAction", StringType, false) :: /*76*/ StructField("R6PrevReviAction", StringType, false) :: - /*77*/ StructField("R7RevisionAccountChange", DoubleType, false) :: /*78*/ StructField("R8ParRevision", StringType, false) :: /*79*/ StructField("R9RevisionTime", StringType, false) :: - /*80*/ StructField("R10RevisionSize", DoubleType, false) :: /*81*/ StructField("R11ContentType", StringType, false) :: /*82*/ StructField("R12BytesIncrease", DoubleType, false) :: - /*83*/ StructField("R13TimeSinceLastRevi", DoubleType, false) :: /*84*/ StructField("R14CommentLength", DoubleType, false) :: /*85*/ StructField("R15RevisionSubaction", StringType, false) :: - /*86*/ StructField("R16PrevReviSubaction", StringType, false) :: - - Nil) - - val rowRDD = Result_all_Features.map(line => line.split(",")).map(e ⇒ Row(e(0).toInt // character feature column - , e(1).toDouble, e(2).toDouble, e(3).toDouble, e(4).toDouble, e(5).toDouble, e(6).toDouble, e(7).toDouble, e(8).toDouble, e(9).toDouble, RoundDouble(e(10).toDouble), - e(11).toDouble, e(12).toDouble, e(13).toDouble, e(14).toDouble, e(15).toDouble, e(16).toDouble, e(17).toDouble, e(18).toDouble, e(19).toDouble, e(20).toDouble, e(21).toDouble, e(22).toDouble, e(23).toDouble, e(24).toDouble, e(25).toDouble //Word Feature column - , e(26).toDouble, e(27).toDouble, e(28).toDouble, e(29).toDouble.toInt, e(30).toDouble, e(31).toDouble, e(32).toDouble, e(33).toDouble, e(34).toDouble, e(35).toDouble, e(36).toDouble, e(37).toDouble, RoundDouble(e(38).toDouble), 
RoundDouble(e(39).toDouble), e(40).toDouble, e(41).toDouble, e(42).toDouble // Sentences Features column: - , RoundDouble(e(43).toDouble), e(44).toDouble, e(45).toDouble, e(46).toDouble //Statement Features Column: - , e(47), e(48), e(49) // User Features Column: - , e(50).toDouble, e(51).toDouble, e(52).toDouble, e(53).toDouble, e(54).toDouble, e(55).toDouble, e(56).toDouble, e(57).toDouble.toInt, e(58).toDouble, e(59).toDouble //Item Features column: - , e(60).toDouble, e(61).toDouble, e(62).toDouble, e(63).toDouble, e(64).toDouble, e(65).toDouble, e(66).toDouble, e(67).toDouble, e(68).toDouble, e(69).toDouble, "Q" + e(70).toDouble.toInt.toString() //Revision Features Column: - , e(71), e(72), e(73).toDouble, e(74).toDouble, e(75), e(76), e(77).toDouble, e(78), e(79), e(80).toDouble, e(81), e(82).toDouble, e(83).toDouble, e(84).toDouble, e(85), e(86))) - - //a.User Frequency: - //number of revisions a user has contributed - //val resu= DF_Tags.groupBy("contributorID").agg(count("Rid")) - DF_Tags.registerTempTable("TagesTable") - val ContributorFreq_for_Each_Revision_DF = sqlContext.sql("select contributorID as CIDUSER1, count(Rid) as NumberofRevisionsUserContributed from TagesTable where contributorID !='0' group by contributorID ") //.drop("CIDUSER1") - //ContributorFreq_for_Each_Revision_DF.show() - - //b.Cumulated : Number of a unique Item a user has contributed. - val CumulatedNumberof_uniqueItemsForUser_DF = sqlContext.sql("select contributorID as CIDUSER2, COUNT(DISTINCT itemid) as NumberofUniqueItemsUseredit from TagesTable where contributorID !='0' group by contributorID") //.drop("CIDUSER2") - //CumulatedNumberof_uniqueItemsForUser_DF.show() - - //1.Item Frequency: - // number of revisions an Item has - val ItemFrequ_DF = sqlContext.sql("select itemid, count(Rid) as NumberRevisionItemHas from TagesTable group by itemid") - // ItemFrequ_DF.show() - - //2. Cumulate number of unique users have edited the Item : Did not consider the users IP. Contributor is an IP or Name. we consider name - val CumulatedNumberof_UniqueUserForItem_DF = sqlContext.sql("select itemid, COUNT(DISTINCT contributorID) as NumberUniqUserEditItem from TagesTable where contributorID !='0' group by itemid") - //CumulatedNumberof_UniqueUserForItem_DF.show() - - //3. 
freq each Item : - val Fre_Item_DF = sqlContext.sql("select itemid, COUNT(itemid) as FreqItem from TagesTable group by itemid") - // Fre_Item_DF.show() - - //***************************************************************************************************************************************** - // This is Main DataFrame: - val BeforeJoin_All_Features = sqlContext.createDataFrame(rowRDD, schema) - //BeforeJoin_All_Features.show() - - //********************************** User feature Join - - // Join1 for add The first User Feature : number of revisions a user has contributed - val AfterJoinUser1_All_Features = BeforeJoin_All_Features.as("T1").join(ContributorFreq_for_Each_Revision_DF.as("T2"), $"T1.U8UserID" === $"T2.CIDUSER1", "leftouter").drop("CIDUSER1") - //AfterJoinUser1_All_Features.show() - - // Join2 for add The second User Feature - val AfterJoinUser2_All_Features = AfterJoinUser1_All_Features.as("T1").join(CumulatedNumberof_uniqueItemsForUser_DF.as("T2"), $"T1.U8UserID" === $"T2.CIDUSER2", "leftouter").drop("CIDUSER2") - //AfterJoinUser2_All_Features.show() - - //********************************** Item Feature Join - // Join3 for add The First Item Feature :number of revisions an Item has - val AfterJoinItem3_All_Features = AfterJoinUser2_All_Features.as("T1").join(ItemFrequ_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") - // AfterJoinItem3_All_Features.show() - - // Join4 for add The Second Item Feature - val AfterJoinItem4_All_Features = AfterJoinItem3_All_Features.as("T1").join(CumulatedNumberof_UniqueUserForItem_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") - // AfterJoinItem4_All_Features.show() - - // Join5 for add The Third Item Feature - val AfterJoinItem5_All_Features = AfterJoinItem4_All_Features.as("T1").join(Fre_Item_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") - //2 AfterJoinItem5_All_Features.show() - - //******************************** - - //*Geografical information Feature from Meta File - //REVISION_ID|REVISION_SESSION_ID|USER_COUNTRY_CODE|USER_CONTINENT_CODE|USER_TIME_ZONE|USER_REGION_CODE|USER_CITY_NAME|USER_COUNTY_NAME|REVISION_TAGS - val df_GeoInf = sqlContext.read - .format("com.databricks.spark.csv") - .option("header", "true") // Use first line of all files as header - .option("inferSchema", "true") // Automatically infer data types - .load("hdfs://localhost:9000/mydata/Meta.csv").select("REVISION_ID", "REVISION_SESSION_ID", "USER_COUNTRY_CODE", "USER_CONTINENT_CODE", "USER_TIME_ZONE", "USER_REGION_CODE", "USER_CITY_NAME", "USER_COUNTY_NAME", "REVISION_TAGS") - // df_GeoInf.show() - - val df_Truth = sqlContext.read - .format("com.databricks.spark.csv") - .option("header", "true") // Use first line of all files as header - .option("inferSchema", "true") // Automatically infer data types - .load("hdfs://localhost:9000/mydata/truth.csv").select("REVISION_ID", "ROLLBACK_REVERTED", "UNDO_RESTORE_REVERTED") - // df_GeoInf.show() - - val AfterJoinGeoInfo_All_Features = AfterJoinItem5_All_Features.as("T1").join(df_GeoInf.as("T2"), $"T1.Rid" === $"T2.REVISION_ID", "leftouter").drop("REVISION_ID").cache() - // AfterJoinGeoInfo_All_Features.show() - - val Final_All_Features = AfterJoinGeoInfo_All_Features.as("T1").join(df_Truth.as("T2"), $"T1.Rid" === $"T2.REVISION_ID", "leftouter").drop("REVISION_ID").cache() - //Final_All_Features.show() - - // Pre- process Data 
============================================================================================================================================================ - - // For String Column, We fill the Null values by "NA": - - var Fill_Missing_Final_All_Features = Final_All_Features.na.fill("NA", Seq("USER_COUNTRY_CODE", "USER_CONTINENT_CODE", "USER_TIME_ZONE", "USER_REGION_CODE", "USER_CITY_NAME", "USER_COUNTY_NAME", "REVISION_TAGS")).cache() - - // For Integer Frequency Column, We fill the Null values by 0: - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.na.fill(0, Seq("FreqItem", "NumberUniqUserEditItem", "NumberRevisionItemHas", "NumberofUniqueItemsUseredit", "NumberofRevisionsUserContributed", "REVISION_SESSION_ID")).cache() - //Fill_Missing_Final_All_Features.show() - - val BoolToDoubleUDF = udf { (BoolAsString: String) => if (BoolAsString == "T") 1.0 else 0.0 } - val IntegerToDouble = udf { (IntegerRevisionSessionID: Integer) => IntegerRevisionSessionID.toDouble } - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalROLLBACK_REVERTED", BoolToDoubleUDF(col("ROLLBACK_REVERTED"))) - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalUNDO_RESTORE_REVERTED", BoolToDoubleUDF(col("UNDO_RESTORE_REVERTED"))) - - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalREVISION_SESSION_ID", IntegerToDouble(col("REVISION_SESSION_ID"))) - - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberofRevisionsUserContributed", IntegerToDouble(col("NumberofRevisionsUserContributed"))) - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberofUniqueItemsUseredit", IntegerToDouble(col("NumberofUniqueItemsUseredit"))) - - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberRevisionItemHas", IntegerToDouble(col("NumberRevisionItemHas"))) - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberUniqUserEditItem", IntegerToDouble(col("NumberUniqUserEditItem"))) - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalFreqItem", IntegerToDouble(col("FreqItem"))) - - //===========================================================================Caharacter Features : Double , Integer Features ==================================================================================== - //Double Ratio: For Ratio Double column, Fill -1 value by Median:Character Features + Ratio of Word Features : - var Samples = Fill_Missing_Final_All_Features.sample(false, 0.001).cache() //.where($"S2SimikaritySitelinkandLabel">0.0 || $"S3SimilarityLabelandSitelink">0.0 || $"S4SimilarityCommentComment">0.0) - Samples.registerTempTable("df") - - val Query = "select " + - "percentile_approx(C1uppercaseratio, 0.5) as meadian1" + "," + "percentile_approx(C2lowercaseratio, 0.5) as median2" + " ," + - "percentile_approx(C3alphanumericratio, 0.5) as median3" + "," + "percentile_approx(C4asciiratio, 0.5) as median4" + "," + - "percentile_approx(C5bracketratio, 0.5) as median5" + "," + "percentile_approx(C6digitalratio, 0.5) as median6" + "," + - "percentile_approx(C7latinratio, 0.5) as median7" + "," + "percentile_approx(C8whitespaceratio, 0.5) as median8" + "," + - "percentile_approx(C9puncratio, 0.5) as median9" + "," + "percentile_approx(C11arabicratio, 0.5) as median11" + "," + - "percentile_approx(C12bengaliratio, 0.5) as median12" + "," + 
"percentile_approx(C13brahmiratio, 0.5) as median13" + "," + - "percentile_approx(C14cyrilinratio, 0.5) as median14" + "," + "percentile_approx(C15hanratio, 0.5) as median15" + "," + - "percentile_approx(c16malysiaratio, 0.5) as median16" + "," + - "percentile_approx(C17tamiratio, 0.5) as median17" + "," + "percentile_approx(C18telugratio, 0.5) as median18" + "," + - "percentile_approx(C19symbolratio, 0.5) as median19" + "," + "percentile_approx(C20alpharatio, 0.5) as median20" + "," + - "percentile_approx(C21visibleratio, 0.5) as median21" + "," + "percentile_approx(C22printableratio, 0.5) as median22" + "," + - "percentile_approx(C23blankratio, 0.5) as median23" + "," + "percentile_approx(C24controlratio, 0.5) as median24" + "," + - "percentile_approx(C25hexaratio, 0.5) as median25" ++ "," + "percentile_approx(W1languagewordratio, 0.5) as median26" + "," + - "percentile_approx(W3lowercaseratio, 0.5) as median27" + "," + "percentile_approx(W6badwordratio, 0.5) as median28" + "," + - "percentile_approx(W7uppercaseratio, 0.5) as median27" + "," + "percentile_approx(W8banwordratio, 0.5) as median27" + " from df" - - val medianValues = sqlContext.sql(Query).rdd - val Median = medianValues.first() - - // Median : - // Character Ratio Features: UDF - val lkpUDF1 = udf { (i: Double) => if (i == 0) Median(0).toString().toDouble else i } - val lkpUDF2 = udf { (i: Double) => if (i == 0) Median(1).toString().toDouble else i } - val lkpUDF3 = udf { (i: Double) => if (i == 0) Median(2).toString().toDouble else i } - val lkpUDF4 = udf { (i: Double) => if (i == 0) Median(3).toString().toDouble else i } - val lkpUDF5 = udf { (i: Double) => if (i == 0) Median(4).toString().toDouble else i } - val lkpUDF6 = udf { (i: Double) => if (i == 0) Median(5).toString().toDouble else i } - val lkpUDF7 = udf { (i: Double) => if (i == 0) Median(6).toString().toDouble else i } - val lkpUDF8 = udf { (i: Double) => if (i == 0) Median(7).toString().toDouble else i } - val lkpUDF9 = udf { (i: Double) => if (i == 0) Median(8).toString().toDouble else i } - - val lkpUDF11 = udf { (i: Double) => if (i == 0) Median(9).toString().toDouble else i } - val lkpUDF12 = udf { (i: Double) => if (i == 0) Median(10).toString().toDouble else i } - val lkpUDF13 = udf { (i: Double) => if (i == 0) Median(11).toString().toDouble else i } - val lkpUDF14 = udf { (i: Double) => if (i == 0) Median(12).toString().toDouble else i } - val lkpUDF15 = udf { (i: Double) => if (i == 0) Median(13).toString().toDouble else i } - val lkpUDF16 = udf { (i: Double) => if (i == 0) Median(14).toString().toDouble else i } - val lkpUDF17 = udf { (i: Double) => if (i == 0) Median(15).toString().toDouble else i } - val lkpUDF18 = udf { (i: Double) => if (i == 0) Median(16).toString().toDouble else i } - val lkpUDF19 = udf { (i: Double) => if (i == 0) Median(17).toString().toDouble else i } - val lkpUDF20 = udf { (i: Double) => if (i == 0) Median(18).toString().toDouble else i } - val lkpUDF21 = udf { (i: Double) => if (i == 0) Median(19).toString().toDouble else i } - val lkpUDF22 = udf { (i: Double) => if (i == 0) Median(20).toString().toDouble else i } - val lkpUDF23 = udf { (i: Double) => if (i == 0) Median(21).toString().toDouble else i } - val lkpUDF24 = udf { (i: Double) => if (i == 0) Median(22).toString().toDouble else i } - val lkpUDF25 = udf { (i: Double) => if (i == 0) Median(23).toString().toDouble else i } - - val df1 = Fill_Missing_Final_All_Features.withColumn("FinalC1uppercaseratio", lkpUDF1(col("C1uppercaseratio"))) 
//.drop("C1uppercaseratio").cache() - val df2 = df1.withColumn("FinalC2lowercaseratio", lkpUDF2(col("C2lowercaseratio"))) //.drop("C2lowercaseratio").cache() - //df1.unpersist() - val df3 = df2.withColumn("FinalC3alphanumericratio", lkpUDF3(col("C3alphanumericratio"))) //.drop("C3alphanumericratio").cache() - //df2.unpersist() - val df4 = df3.withColumn("FinalC4asciiratio", lkpUDF4(col("C4asciiratio"))) //.drop("C4asciiratio").cache() - //df3.unpersist() - val df5 = df4.withColumn("FinalC5bracketratio", lkpUDF5(col("C5bracketratio"))) //.drop("C5bracketratio").cache() - //df4.unpersist() - val df6 = df5.withColumn("FinalC6digitalratio", lkpUDF6(col("C6digitalratio"))) //.drop("C6digitalratio").cache() - //df5.unpersist() - val df7 = df6.withColumn("FinalC7latinratio", lkpUDF7(col("C7latinratio"))) //.drop("C7latinratio").cache() - //df6.unpersist() - val df8 = df7.withColumn("FinalC8whitespaceratio", lkpUDF8(col("C8whitespaceratio"))) //.drop("C8whitespaceratio").cache() - //df7.unpersist() - val df9 = df8.withColumn("FinalC9puncratio", lkpUDF9(col("C9puncratio"))) //.drop("C9puncratio").cache() - - // Mean : - // character integer values : - val Mean_C10longcharacterseq = Samples.agg(mean("C10longcharacterseq")).head() - val C10_Mean = Mean_C10longcharacterseq.getDouble(0) - val lkpUDFC10 = udf { (i: Double) => if (i == 0) C10_Mean else i } - val df10 = df9.withColumn("FinalC10longcharacterseq", lkpUDFC10(col("C10longcharacterseq"))) - - //Median - val df11 = df10.withColumn("FinalC11arabicratio", lkpUDF11(col("C11arabicratio"))) //.drop("C11arabicratio").cache() - // df9.unpersist() - val df12 = df11.withColumn("FinalC12bengaliratio", lkpUDF12(col("C12bengaliratio"))) //.drop("C12bengaliratio").cache() - //df11.unpersist() - val df13 = df12.withColumn("FinalC13brahmiratio", lkpUDF13(col("C13brahmiratio"))) //.drop("C13brahmiratio").cache() - // df12.unpersist() - val df14 = df13.withColumn("FinalC14cyrilinratio", lkpUDF14(col("C14cyrilinratio"))) //.drop("C14cyrilinratio").cache() - // df13.unpersist() - val df15 = df14.withColumn("FinalC15hanratio", lkpUDF15(col("C15hanratio"))) //.drop("C15hanratio").cache() - // df14.unpersist() - val df16 = df15.withColumn("Finalc16malysiaratio", lkpUDF16(col("c16malysiaratio"))) //.drop("c16malysiaratio").cache() - //df15.unpersist() - val df17 = df16.withColumn("FinalC17tamiratio", lkpUDF17(col("C17tamiratio"))) //.drop("C17tamiratio").cache() - //df16.unpersist() - val df18 = df17.withColumn("FinalC18telugratio", lkpUDF18(col("C18telugratio"))) //.drop("C18telugratio").cache() - //df17.unpersist() - val df19 = df18.withColumn("FinalC19symbolratio", lkpUDF19(col("C19symbolratio"))) //.drop("C19symbolratio").cache() - //df18.unpersist() - val df20 = df19.withColumn("FinalC20alpharatio", lkpUDF20(col("C20alpharatio"))) //.drop("C20alpharatio").cache() - // df19.unpersist() - val df21 = df20.withColumn("FinalC21visibleratio", lkpUDF21(col("C21visibleratio"))) //.drop("C21visibleratio").cache() - // df20.unpersist() - val df22 = df21.withColumn("FinalC22printableratio", lkpUDF22(col("C22printableratio"))) //.drop("C22printableratio").cache() - //df21.unpersist() - val df23 = df22.withColumn("FinalC23blankratio", lkpUDF23(col("C23blankratio"))) //.drop("C23blankratio").cache() - // df22.unpersist() - val df24 = df23.withColumn("FinalC24controlratio", lkpUDF24(col("C24controlratio"))) //.drop("C24controlratio").cache() - //df23.unpersist() - val df25 = df24.withColumn("FinalC25hexaratio", lkpUDF25(col("C25hexaratio"))) //.drop("C25hexaratio").cache() - - 
//************************************************End Character Features **************************************************************************************** - - //************************************************Start Word Features **************************************************************************************** - - // Word Ratio Features : UDF - val lkpUDFW1 = udf { (i: Double) => if (i == 0) Median(24).toString().toDouble else i } - val lkpUDFW3 = udf { (i: Double) => if (i == 0) Median(25).toString().toDouble else i } - val lkpUDFW6 = udf { (i: Double) => if (i == 0) Median(26).toString().toDouble else i } - val lkpUDFW7 = udf { (i: Double) => if (i == 0) Median(27).toString().toDouble else i } - val lkpUDFW8 = udf { (i: Double) => if (i == 0) Median(28).toString().toDouble else i } - - //1. - val df26 = df25.withColumn("FinalW1languagewordratio", lkpUDFW1(col("W1languagewordratio"))) //.drop("W1languagewordratio").cache() - - //2.Boolean(Double) IsContainLanguageWord - - //3. - val df27 = df26.withColumn("FinalW3lowercaseratio", lkpUDFW3(col("W3lowercaseratio"))) //.drop("W3lowercaseratio").cache() - // df26.unpersist() - - //4. Integer " Mean: - val Mean_W4longestword = Samples.agg(mean("W4longestword")).head() - val W4_Mean = Mean_W4longestword.getDouble(0) - val lkpUDFW4 = udf { (i: Double) => if (i == 0) W4_Mean else i } - val df28 = df27.withColumn("FinalW4longestword", lkpUDFW4(col("W4longestword"))) - - //5. Boolean (Double ) W5IscontainURL - //6. - val df29 = df28.withColumn("FinalW6badwordratio", lkpUDFW6(col("W6badwordratio"))) //.drop("W6badwordratio").cache() - - //7. - val df30 = df29.withColumn("FinalW7uppercaseratio", lkpUDFW7(col("W7uppercaseratio"))) //.drop("W7uppercaseratio").cache() - - //8. - val df31 = df30.withColumn("FinalW8banwordratio", lkpUDFW8(col("W8banwordratio"))) //.drop("W8banwordratio").cache() - - //9.FemalFirst Boolean(Double) - //10.Male First Boolean(Double) - //11.ContainBadWord Boolean(Double) - //12ContainBanWord Boolean(Double) - - //13. Integer(Double): - val Mean_W13W13NumberSharewords = Samples.agg(mean("W13NumberSharewords")).head() - val W13_Mean = Mean_W13W13NumberSharewords.getDouble(0) - val lkpUDFW13 = udf { (i: Double) => if (i == 0) W13_Mean else i } - val df32 = df31.withColumn("FinalW13NumberSharewords", lkpUDFW13(col("W13NumberSharewords"))) - - //14. Integer (Double): - val Mean_W14NumberSharewordswithoutStopwords = Samples.agg(mean("W14NumberSharewordswithoutStopwords")).head() - val W14_Mean = Mean_W14NumberSharewordswithoutStopwords.getDouble(0) - val lkpUDFW14 = udf { (i: Double) => if (i == 0) W14_Mean else i } - val df33 = df32.withColumn("FinalW14NumberSharewordswithoutStopwords", lkpUDFW14(col("W14NumberSharewordswithoutStopwords"))) - - // 15. Double (Not ratio): - val Mean_W15PortionQid = Samples.agg(mean("W15PortionQid")).head() - val W15_Mean = Mean_W15PortionQid.getDouble(0) - val lkpUDFW15 = udf { (i: Double) => if (i == 0) W15_Mean else i } - val df34 = df33.withColumn("FinalW15PortionQid", lkpUDFW15(col("W15PortionQid"))) - - //16. 
Double(Not Ratio): - val Mean_W16PortionLnags = Samples.agg(mean("W16PortionLnags")).head() - val W16_Mean = Mean_W16PortionLnags.getDouble(0) - val lkpUDFW16 = udf { (i: Double) => if (i == 0) W16_Mean else i } - val df35 = df34.withColumn("FinalW16PortionLnags", lkpUDFW16(col("W16PortionLnags"))) - - //17.Double(Not ratio): - val Mean_W17PortionLinks = Samples.agg(mean("W17PortionLinks")).head() - val W17_Mean = Mean_W17PortionLinks.getDouble(0) - val lkpUDFW17 = udf { (i: Double) => if (i == 0) W17_Mean else i } - val df36 = df35.withColumn("FinalW17PortionLinks", lkpUDFW17(col("W17PortionLinks"))) - - //************************************************End Word Features **************************************************************************************** - - //************************************************Start Sentences Features **************************************************************************************** - // 1. Integer(Double) - val Mean_S1CommentTailLength = Samples.agg(mean("S1CommentTailLength")).head() - val S1_Mean = RoundDouble(Mean_S1CommentTailLength.getDouble(0)) - val lkpUDFS1 = udf { (i: Double) => if (i == 0) S1_Mean else i } - val df37 = df36.withColumn("FinalS1CommentTailLength", lkpUDFS1(col("S1CommentTailLength"))) - - //2. Double but Not ratio values : - val Mean_S2SimikaritySitelinkandLabel = Samples.agg(mean("S2SimikaritySitelinkandLabel")).head() - val S2_Mean = RoundDouble(Mean_S2SimikaritySitelinkandLabel.getDouble(0)) - val lkpUDFS2 = udf { (i: Double) => if (i == 0) S2_Mean else i } - val df39 = df37.withColumn("FinalS2SimikaritySitelinkandLabel", lkpUDFS2(col("S2SimikaritySitelinkandLabel"))) - - //3. Double but Not ratio values : - val Mean_S3SimilarityLabelandSitelink = Samples.agg(mean("S3SimilarityLabelandSitelink")).head() - val S3_Mean = RoundDouble(Mean_S3SimilarityLabelandSitelink.getDouble(0)) - val lkpUDFS3 = udf { (i: Double) => if (i == 0.0) S3_Mean else i } - val df40 = df39.withColumn("FinalS3SimilarityLabelandSitelink", lkpUDFS3(col("S3SimilarityLabelandSitelink"))) - - //4. Double but Not ratio values : - val Mean_S4SimilarityCommentComment = Samples.agg(mean("S4SimilarityCommentComment")).head() - val S4_Mean = RoundDouble(Mean_S4SimilarityCommentComment.getDouble(0)) - val lkpUDFS4 = udf { (i: Double) => if (i == 0.0) S4_Mean else i } - val df41 = df40.withColumn("FinalS4SimilarityCommentComment", lkpUDFS4(col("S4SimilarityCommentComment"))) - - //df41.show() - //************************************************End Sentences Features **************************************************************************************** - //*********************************************** Start Statement Features **************************************************************************************** - //1. String - //2. String - //3. String - //************************************************End Statement Features **************************************************************************************** - //*********************************************** Start User Features **************************************************************************************** - - //1.Boolean(Double) - //2.Boolean(Double) - //3.Boolean(Double) - //4.Boolean(Double) - //5.Boolean(Double) - //6.Boolean(Double) - //7. (Double) IP No need to fill Missing Data - //8. 
(Double) ID No need to fill Missing Data - //9.Boolean(Double) - //10.Boolean(Double) - - //*********************************************** End User Features **************************************************************************************** - //*********************************************** Start Item Features **************************************************************************************** - //1. Integer (Double) No need to fill missing values - //2. Integer (Double) No need to fill missing values - //3. Integer (Double) No need to fill missing values - //4. Integer (Double) No need to fill missing values - //5. Integer (Double) No need to fill missing values - //6. Integer (Double) No need to fill missing values - //7. Integer (Double) No need to fill missing values - //8. Integer (Double) No need to fill missing values - //9. Integer (Double) No need to fill missing values - //10. Integer (Double) No need to fill missing values - //11. String - //*********************************************** End Item Features **************************************************************************************** - //*********************************************** Start Revision Features **************************************************************************************** - //1.String - //2.String - //3.Boolean (Double) - //4.Integer(Double) - //5.String - //6.String - //7. Boolean(Double) - //8. String - //9.String - //10. Integer (Double) - //11.String - //12. integer(Double) - //13. Long(Double) - //14. integer (Double) - //15.String - //16.String - //*********************************************** End Revision Features **************************************************************************************** - //*********************************************** Meta Data , Truth Data and Frequnces **************************************************************************************** - //Meta - // 1.Revision Session :Integer (Converted to Double) - //2. 
User Country Code - //3.User Continent Code - //4.User Time Size - //5.User Region Code - //6.User-city Name - //7.User Country Name - //8.RevisionTags - + // Streaming records: + val jobConf = new JobConf() + val NormalXML_Parser_OBJ = new ParseNormalXML() + val RDD_OBJ = new ParseNormalXML() + + val Testing_RDD_All_Record = RDD_OBJ.Testing_DB_NormalXML_Parser(sc).cache() + + // ======= Json part : + // Json RDD : Each record has its Revision iD: + val JsonRDD = Testing_RDD_All_Record.map(_.split("NNLL")).map(v => replacing_with_Quoto(v(0), v(8))).cache() + // JsonRDD.foreach(println) + // println(JsonRDD.count()) + + // Data set + val Ds_Json = sqlContext.jsonRDD(JsonRDD).select("key", "id", "labels", "descriptions", "aliases", "claims", "sitelinks").cache() + // Ds_Json.show() + // println(Ds_Json.count()) + + // ======= Tags part : // Contributor IP here is in Decimal format not IP format and It is converted in ParseNormalXml stage + val TagsRDD = Testing_RDD_All_Record.map(_.split("NNLL")).map(x => (x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9), x(10), x(11))).cache() + val DF_Tags = TagsRDD.toDF("Rid", "Itemid", "comment", "pid", "time", "contributorIP", + "contributorID", "contributorName", "JsonText", "model", "format", "sha").cache() + // DF_Tags.show() + // println(DF_Tags.count()) + + // ======== Join Json part with Tag Part:============================ + // Joining to have full data + val DF_First_DF_Result_Join_Tags_and_Json = DF_Tags.as("T1").join(Ds_Json.as("T2"), $"T1.Rid" === $"T2.key", "leftouter").select("Rid", "itemid", "comment", "pid", "time", + "contributorIP", "contributorID", "contributorName", "JsonText", "labels", "descriptions", + "aliases", "claims", "sitelinks", "model", "format", "sha") // .orderBy("Rid", "Itemid") + DF_First_DF_Result_Join_Tags_and_Json.registerTempTable("Data1") + val dfr_DATA_JsonTages1 = sqlContext.sql("select * from Data1 order by itemid ,Rid ").cache() + + val colNames = Seq("Rid2", "itemid2", "comment2", "pid2", "time2", "contributorIP2", "contributorID2", "contributorName2", "JsonText2", "labels2", "descriptions2", + "aliases2", "claims2", "sitelinks2", "model2", "format2", "sha2") + val DF_Second = DF_First_DF_Result_Join_Tags_and_Json.toDF(colNames: _*) // .distinct() + DF_Second.registerTempTable("Data2") + + // ===================================================================Parent // Previous Revision============================================================================================================== + + // Joining based on Parent Id to get the previous cases: ParentID + val DF_Joined = DF_First_DF_Result_Join_Tags_and_Json.as("df1").join(DF_Second.as("df2"), $"df1.pid" === $"df2.Rid2", "leftouter").distinct() + + val RDD_After_JoinDF = DF_Joined.rdd.distinct() + val x = RDD_After_JoinDF.map(row => (row(0).toString().toInt, row)).cache() + val part = new RangePartitioner(4, x) + val partitioned = x.partitionBy(part).persist() // persist is important for this case and obligatory. 
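// Editor's sketch (not part of this patch): the range-partitioning step above, reduced to a
// self-contained toy example. Hypothetical (revisionId, payload) pairs stand in for the joined
// revision Rows; RangePartitioner (org.apache.spark, already in scope for the code above) samples
// the keyed RDD to choose partition bounds, partitionBy shuffles once, and persist() keeps the
// per-partition feature extraction below from recomputing the upstream join on every action.
val keyedToy = sc.parallelize(Seq((42, "revA"), (7, "revB"), (1300, "revC"), (5, "revD")))
val toyPartitioner = new RangePartitioner(4, keyedToy) // 4 partitions, bounds chosen by sampling the keys
val toyPartitioned = keyedToy.partitionBy(toyPartitioner).persist()
toyPartitioned.foreachPartition(it => println(it.mkString(", "))) // inspect how keys land in sorted ranges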
+ // partitioned.foreach(println) + // + // //=====================================================All Features Based on Categories of Features Data Type :================================================================================== + // + val Result_all_Features = partitioned.map { case (x, y) => (x.toString() + "," + All_Features(y).toString()) } // we convert the Pair RDD to String one LineRDD to be able to make DF based on "," + // Result_all_Features.foreach(println) + // println("nayef" + Result_all_Features.count()) + + // Conver the RDD of All Features to DataFrame: + + val schema = StructType( + + // 0 + StructField("Rid", IntegerType, false) :: + + // Character Features : + /* 1 */ StructField("C1uppercaseratio", DoubleType, false) :: /* 2 */ StructField("C2lowercaseratio", DoubleType, false) :: /* 3 */ StructField("C3alphanumericratio", DoubleType, false) :: + /* 4 */ StructField("C4asciiratio", DoubleType, false) :: /* 5 */ StructField("C5bracketratio", DoubleType, false) :: /* 6 */ StructField("C6digitalratio", DoubleType, false) :: + /* 7 */ StructField("C7latinratio", DoubleType, false) :: /* 8 */ StructField("C8whitespaceratio", DoubleType, false) :: /* 9 */ StructField("C9puncratio", DoubleType, false) :: + /* 10 */ StructField("C10longcharacterseq", DoubleType, false) :: /* 11 */ StructField("C11arabicratio", DoubleType, false) :: /* 12 */ StructField("C12bengaliratio", DoubleType, false) :: + /* 13 */ StructField("C13brahmiratio", DoubleType, false) :: /* 14 */ StructField("C14cyrilinratio", DoubleType, false) :: /* 15 */ StructField("C15hanratio", DoubleType, false) :: + /* 16 */ StructField("c16malysiaratio", DoubleType, false) :: /* 17 */ StructField("C17tamiratio", DoubleType, false) :: /* 18 */ StructField("C18telugratio", DoubleType, false) :: + /* 19 */ StructField("C19symbolratio", DoubleType, false) :: /* 20 */ StructField("C20alpharatio", DoubleType, false) :: /* 21 */ StructField("C21visibleratio", DoubleType, false) :: + /* 22 */ StructField("C22printableratio", DoubleType, false) :: /* 23 */ StructField("C23blankratio", DoubleType, false) :: /* 24 */ StructField("C24controlratio", DoubleType, false) :: + /* 25 */ StructField("C25hexaratio", DoubleType, false) :: + + // word Features: + /* 26 */ StructField("W1languagewordratio", DoubleType, false) :: /* 27 Boolean */ StructField("W2Iscontainlanguageword", DoubleType, false) :: /* 28 */ StructField("W3lowercaseratio", DoubleType, false) :: + /* 29 Integer */ StructField("W4longestword", IntegerType, false) :: /* 30 Boolean */ StructField("W5IscontainURL", DoubleType, false) :: /* 31 */ StructField("W6badwordratio", DoubleType, false) :: + /* 32 */ StructField("W7uppercaseratio", DoubleType, false) :: /* 33 */ StructField("W8banwordratio", DoubleType, false) :: /* 34 Boolean */ StructField("W9FemalFirstName", DoubleType, false) :: + /* 35 Boolean */ StructField("W10MaleFirstName", DoubleType, false) :: /* 36 Boolean */ StructField("W11IscontainBadword", DoubleType, false) :: + /* 37 Boolean */ StructField("W12IsContainBanword", DoubleType, false) :: + /* 38 integer */ StructField("W13NumberSharewords", DoubleType, false) :: /* 39 Integer */ StructField("W14NumberSharewordswithoutStopwords", DoubleType, false) :: + /* 40 */ StructField("W15PortionQid", DoubleType, false) :: /* 41 */ StructField("W16PortionLnags", DoubleType, false) :: /* 42 */ StructField("W17PortionLinks", DoubleType, false) :: + + // + // // Sentences Features: + /* 43 */ StructField("S1CommentTailLength", DoubleType, false) :: /* 44 */ 
StructField("S2SimikaritySitelinkandLabel", DoubleType, false) :: + /* 45 */ StructField("S3SimilarityLabelandSitelink", DoubleType, false) :: + /* 46 */ StructField("S4SimilarityCommentComment", DoubleType, false) :: + // + // // Statements Features : + /* 47 */ StructField("SS1Property", StringType, false) :: /* 48 */ StructField("SS2DataValue", StringType, false) :: /* 49 */ StructField("SS3ItemValue", StringType, false) :: + // + // + // //User Features : + /* 50 Boolean */ StructField("U1IsPrivileged", DoubleType, false) :: /* 51 Boolean */ StructField("U2IsBotUser", DoubleType, false) :: /* 52 Boolean */ StructField("U3IsBotuserWithFlaguser", DoubleType, false) :: + /* 53 Boolean */ StructField("U4IsProperty", DoubleType, false) :: /* 54 Boolean */ StructField("U5IsTranslator", DoubleType, false) :: /* 55 Boolean */ StructField("U6IsRegister", DoubleType, false) :: + /* 56 */ StructField("U7IPValue", DoubleType, false) :: /* 57 */ StructField("U8UserID", IntegerType, false) :: /* 58 */ StructField("U9HasBirthDate", DoubleType, false) :: + /* 59 */ StructField("U10HasDeathDate", DoubleType, false) :: + + // Items Features : + + /* 60 */ StructField("I1NumberLabels", DoubleType, false) :: /* 61 */ StructField("I2NumberDescription", DoubleType, false) :: /* 62 */ StructField("I3NumberAliases", DoubleType, false) :: + /* 63 */ StructField("I4NumberClaims", DoubleType, false) :: + /* 64 */ StructField("I5NumberSitelinks", DoubleType, false) :: /* 65 */ StructField("I6NumberStatement", DoubleType, false) :: /* 66 */ StructField("I7NumberReferences", DoubleType, false) :: + /* 67 */ StructField("I8NumberQualifier", DoubleType, false) :: + /* 68 */ StructField("I9NumberQualifierOrder", DoubleType, false) :: /* 69 */ StructField("I10NumberBadges", DoubleType, false) :: /* 70 */ StructField("I11ItemTitle", StringType, false) :: + + // Revision Features: + /* 71 */ StructField("R1languageRevision", StringType, false) :: /* 72 */ StructField("R2RevisionLanguageLocal", StringType, false) :: /* 73 */ StructField("R3IslatainLanguage", DoubleType, false) :: + /* 74 */ StructField("R4JsonLength", DoubleType, false) :: /* 75 */ StructField("R5RevisionAction", StringType, false) :: /* 76 */ StructField("R6PrevReviAction", StringType, false) :: + /* 77 */ StructField("R7RevisionAccountChange", DoubleType, false) :: /* 78 */ StructField("R8ParRevision", StringType, false) :: /* 79 */ StructField("R9RevisionTime", StringType, false) :: + /* 80 */ StructField("R10RevisionSize", DoubleType, false) :: /* 81 */ StructField("R11ContentType", StringType, false) :: /* 82 */ StructField("R12BytesIncrease", DoubleType, false) :: + /* 83 */ StructField("R13TimeSinceLastRevi", DoubleType, false) :: /* 84 */ StructField("R14CommentLength", DoubleType, false) :: /* 85 */ StructField("R15RevisionSubaction", StringType, false) :: + /* 86 */ StructField("R16PrevReviSubaction", StringType, false) :: + + Nil) + + val rowRDD = Result_all_Features.map(line => line.split(",")).map(e ⇒ Row(e(0).toInt // character feature column + , e(1).toDouble, e(2).toDouble, e(3).toDouble, e(4).toDouble, e(5).toDouble, e(6).toDouble, e(7).toDouble, e(8).toDouble, e(9).toDouble, RoundDouble(e(10).toDouble), e(11).toDouble, e(12).toDouble, e(13).toDouble // + , e(14).toDouble, e(15).toDouble, e(16).toDouble, e(17).toDouble, e(18).toDouble, e(19).toDouble, e(20).toDouble, e(21).toDouble, e(22).toDouble, e(23).toDouble, e(24).toDouble, e(25).toDouble // Word Feature column + , e(26).toDouble, e(27).toDouble, e(28).toDouble, 
e(29).toDouble.toInt, e(30).toDouble, e(31).toDouble, e(32).toDouble, e(33).toDouble, e(34).toDouble, e(35).toDouble, e(36).toDouble, e(37).toDouble // + , RoundDouble(e(38).toDouble), RoundDouble(e(39).toDouble), e(40).toDouble, e(41).toDouble, e(42).toDouble // Sentences Features column: + , RoundDouble(e(43).toDouble), e(44).toDouble, e(45).toDouble, e(46).toDouble // Statement Features Column: + , e(47), e(48), e(49) // User Features Column: + , e(50).toDouble, e(51).toDouble, e(52).toDouble, e(53).toDouble, e(54).toDouble, e(55).toDouble, e(56).toDouble, e(57).toDouble.toInt, e(58).toDouble, e(59).toDouble // Item Features column: + , e(60).toDouble, e(61).toDouble, e(62).toDouble, e(63).toDouble, e(64).toDouble, e(65).toDouble, e(66).toDouble, e(67).toDouble // + , e(68).toDouble, e(69).toDouble, "Q" + e(70).toDouble.toInt.toString() // Revision Features Column: + , e(71), e(72), e(73).toDouble, e(74).toDouble, e(75), e(76), e(77).toDouble, e(78), e(79), e(80).toDouble, e(81), e(82).toDouble, e(83).toDouble, e(84).toDouble, e(85), e(86))) + + // a.User Frequency: + // number of revisions a user has contributed + // val resu= DF_Tags.groupBy("contributorID").agg(count("Rid")) + DF_Tags.registerTempTable("TagesTable") + val ContributorFreq_for_Each_Revision_DF = sqlContext + .sql("select contributorID as CIDUSER1, count(Rid) as NumberofRevisionsUserContributed from TagesTable where contributorID !='0' group by contributorID ") // .drop("CIDUSER1") + // ContributorFreq_for_Each_Revision_DF.show() + + // b.Cumulated : Number of a unique Item a user has contributed. + val CumulatedNumberof_uniqueItemsForUser_DF = sqlContext + .sql("select contributorID as CIDUSER2, COUNT(DISTINCT itemid) as NumberofUniqueItemsUseredit from TagesTable where contributorID !='0' group by contributorID") // .drop("CIDUSER2") + // CumulatedNumberof_uniqueItemsForUser_DF.show() + + // 1.Item Frequency: + // number of revisions an Item has + val ItemFrequ_DF = sqlContext + .sql("select itemid, count(Rid) as NumberRevisionItemHas from TagesTable group by itemid") + // ItemFrequ_DF.show() + + // 2. Cumulate number of unique users have edited the Item : Did not consider the users IP. Contributor is an IP or Name. we consider name + val CumulatedNumberof_UniqueUserForItem_DF = sqlContext.sql("select itemid, COUNT(DISTINCT contributorID) as NumberUniqUserEditItem from TagesTable where contributorID !='0' group by itemid") + // CumulatedNumberof_UniqueUserForItem_DF.show() + + // 3. 
freq each Item : + val Fre_Item_DF = sqlContext.sql("select itemid, COUNT(itemid) as FreqItem from TagesTable group by itemid") + // Fre_Item_DF.show() + + // ***************************************************************************************************************************************** + // This is Main DataFrame: + val BeforeJoin_All_Features = sqlContext.createDataFrame(rowRDD, schema) + // BeforeJoin_All_Features.show() + + // ********************************** User feature Join + + // Join1 for add The first User Feature : number of revisions a user has contributed + val AfterJoinUser1_All_Features = BeforeJoin_All_Features.as("T1").join(ContributorFreq_for_Each_Revision_DF.as("T2"), $"T1.U8UserID" === $"T2.CIDUSER1", "leftouter").drop("CIDUSER1") + // AfterJoinUser1_All_Features.show() + + // Join2 for add The second User Feature + val AfterJoinUser2_All_Features = AfterJoinUser1_All_Features.as("T1").join(CumulatedNumberof_uniqueItemsForUser_DF.as("T2"), $"T1.U8UserID" === $"T2.CIDUSER2", "leftouter").drop("CIDUSER2") + // AfterJoinUser2_All_Features.show() + + // ********************************** Item Feature Join + // Join3 for add The First Item Feature :number of revisions an Item has + val AfterJoinItem3_All_Features = AfterJoinUser2_All_Features.as("T1").join(ItemFrequ_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") + // AfterJoinItem3_All_Features.show() + + // Join4 for add The Second Item Feature + val AfterJoinItem4_All_Features = AfterJoinItem3_All_Features.as("T1").join(CumulatedNumberof_UniqueUserForItem_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") + // AfterJoinItem4_All_Features.show() + + // Join5 for add The Third Item Feature + val AfterJoinItem5_All_Features = AfterJoinItem4_All_Features.as("T1").join(Fre_Item_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") + // 2 AfterJoinItem5_All_Features.show() + + // ******************************** + + // *Geografical information Feature from Meta File + // REVISION_ID|REVISION_SESSION_ID|USER_COUNTRY_CODE|USER_CONTINENT_CODE|USER_TIME_ZONE|USER_REGION_CODE|USER_CITY_NAME|USER_COUNTY_NAME|REVISION_TAGS + val df_GeoInf = sqlContext.read + .format("com.databricks.spark.csv") + .option("header", "true") // Use first line of all files as header + .option("inferSchema", "true") // Automatically infer data types + .load("hdfs://localhost:9000/mydata/Meta.csv").select("REVISION_ID", "REVISION_SESSION_ID", "USER_COUNTRY_CODE", "USER_CONTINENT_CODE", "USER_TIME_ZONE", + "USER_REGION_CODE", "USER_CITY_NAME", "USER_COUNTY_NAME", "REVISION_TAGS") + // df_GeoInf.show() + + val df_Truth = sqlContext.read + .format("com.databricks.spark.csv") + .option("header", "true") // Use first line of all files as header + .option("inferSchema", "true") // Automatically infer data types + .load("hdfs://localhost:9000/mydata/truth.csv").select("REVISION_ID", "ROLLBACK_REVERTED", "UNDO_RESTORE_REVERTED") + // df_GeoInf.show() + + val AfterJoinGeoInfo_All_Features = AfterJoinItem5_All_Features.as("T1").join(df_GeoInf.as("T2"), $"T1.Rid" === $"T2.REVISION_ID", "leftouter").drop("REVISION_ID").cache() + // AfterJoinGeoInfo_All_Features.show() + + val Final_All_Features = AfterJoinGeoInfo_All_Features.as("T1").join(df_Truth.as("T2"), $"T1.Rid" === $"T2.REVISION_ID", "leftouter").drop("REVISION_ID").cache() + // Final_All_Features.show() + + // Pre- process Data 
============================================================================================================================================================ + + // For String Column, We fill the Null values by "NA": + + var Fill_Missing_Final_All_Features = Final_All_Features.na.fill("NA", Seq("USER_COUNTRY_CODE", "USER_CONTINENT_CODE", + "USER_TIME_ZONE", "USER_REGION_CODE", "USER_CITY_NAME", "USER_COUNTY_NAME", "REVISION_TAGS")).cache() + + // For Integer Frequency Column, We fill the Null values by 0: + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.na.fill(0, Seq("FreqItem", "NumberUniqUserEditItem", "NumberRevisionItemHas", + "NumberofUniqueItemsUseredit", "NumberofRevisionsUserContributed", "REVISION_SESSION_ID")).cache() + // Fill_Missing_Final_All_Features.show() + + val BoolToDoubleUDF = udf { (BoolAsString: String) => if (BoolAsString == "T") 1.0 else 0.0 } + val IntegerToDouble = udf { (IntegerRevisionSessionID: Integer) => IntegerRevisionSessionID.toDouble } + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalROLLBACK_REVERTED", BoolToDoubleUDF(col("ROLLBACK_REVERTED"))) + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalUNDO_RESTORE_REVERTED", BoolToDoubleUDF(col("UNDO_RESTORE_REVERTED"))) + + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalREVISION_SESSION_ID", IntegerToDouble(col("REVISION_SESSION_ID"))) + + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberofRevisionsUserContributed", IntegerToDouble(col("NumberofRevisionsUserContributed"))) + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberofUniqueItemsUseredit", IntegerToDouble(col("NumberofUniqueItemsUseredit"))) + + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberRevisionItemHas", IntegerToDouble(col("NumberRevisionItemHas"))) + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberUniqUserEditItem", IntegerToDouble(col("NumberUniqUserEditItem"))) + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalFreqItem", IntegerToDouble(col("FreqItem"))) + + // ===========================================================================Caharacter Features : Double , Integer Features ==================================================================================== + // Double Ratio: For Ratio Double column, Fill -1 value by Median:Character Features + Ratio of Word Features : + var Samples = Fill_Missing_Final_All_Features.sample(false, 0.001).cache() // .where($"S2SimikaritySitelinkandLabel">0.0 || $"S3SimilarityLabelandSitelink">0.0 || $"S4SimilarityCommentComment">0.0) + Samples.registerTempTable("df") + + val Query = "select " + + "percentile_approx(C1uppercaseratio, 0.5) as meadian1" + "," + "percentile_approx(C2lowercaseratio, 0.5) as median2" + " ," + + "percentile_approx(C3alphanumericratio, 0.5) as median3" + "," + "percentile_approx(C4asciiratio, 0.5) as median4" + "," + + "percentile_approx(C5bracketratio, 0.5) as median5" + "," + "percentile_approx(C6digitalratio, 0.5) as median6" + "," + + "percentile_approx(C7latinratio, 0.5) as median7" + "," + "percentile_approx(C8whitespaceratio, 0.5) as median8" + "," + + "percentile_approx(C9puncratio, 0.5) as median9" + "," + "percentile_approx(C11arabicratio, 0.5) as median11" + "," + + "percentile_approx(C12bengaliratio, 0.5) as median12" + "," + 
"percentile_approx(C13brahmiratio, 0.5) as median13" + "," + + "percentile_approx(C14cyrilinratio, 0.5) as median14" + "," + "percentile_approx(C15hanratio, 0.5) as median15" + "," + + "percentile_approx(c16malysiaratio, 0.5) as median16" + "," + + "percentile_approx(C17tamiratio, 0.5) as median17" + "," + "percentile_approx(C18telugratio, 0.5) as median18" + "," + + "percentile_approx(C19symbolratio, 0.5) as median19" + "," + "percentile_approx(C20alpharatio, 0.5) as median20" + "," + + "percentile_approx(C21visibleratio, 0.5) as median21" + "," + "percentile_approx(C22printableratio, 0.5) as median22" + "," + + "percentile_approx(C23blankratio, 0.5) as median23" + "," + "percentile_approx(C24controlratio, 0.5) as median24" + "," + + "percentile_approx(C25hexaratio, 0.5) as median25" ++ "," + "percentile_approx(W1languagewordratio, 0.5) as median26" + "," + + "percentile_approx(W3lowercaseratio, 0.5) as median27" + "," + "percentile_approx(W6badwordratio, 0.5) as median28" + "," + + "percentile_approx(W7uppercaseratio, 0.5) as median27" + "," + "percentile_approx(W8banwordratio, 0.5) as median27" + " from df" + + val medianValues = sqlContext.sql(Query).rdd + val Median = medianValues.first() + + // Median : + // Character Ratio Features: UDF + val lkpUDF1 = udf { (i: Double) => if (i == 0) Median(0).toString().toDouble else i } + val lkpUDF2 = udf { (i: Double) => if (i == 0) Median(1).toString().toDouble else i } + val lkpUDF3 = udf { (i: Double) => if (i == 0) Median(2).toString().toDouble else i } + val lkpUDF4 = udf { (i: Double) => if (i == 0) Median(3).toString().toDouble else i } + val lkpUDF5 = udf { (i: Double) => if (i == 0) Median(4).toString().toDouble else i } + val lkpUDF6 = udf { (i: Double) => if (i == 0) Median(5).toString().toDouble else i } + val lkpUDF7 = udf { (i: Double) => if (i == 0) Median(6).toString().toDouble else i } + val lkpUDF8 = udf { (i: Double) => if (i == 0) Median(7).toString().toDouble else i } + val lkpUDF9 = udf { (i: Double) => if (i == 0) Median(8).toString().toDouble else i } + + val lkpUDF11 = udf { (i: Double) => if (i == 0) Median(9).toString().toDouble else i } + val lkpUDF12 = udf { (i: Double) => if (i == 0) Median(10).toString().toDouble else i } + val lkpUDF13 = udf { (i: Double) => if (i == 0) Median(11).toString().toDouble else i } + val lkpUDF14 = udf { (i: Double) => if (i == 0) Median(12).toString().toDouble else i } + val lkpUDF15 = udf { (i: Double) => if (i == 0) Median(13).toString().toDouble else i } + val lkpUDF16 = udf { (i: Double) => if (i == 0) Median(14).toString().toDouble else i } + val lkpUDF17 = udf { (i: Double) => if (i == 0) Median(15).toString().toDouble else i } + val lkpUDF18 = udf { (i: Double) => if (i == 0) Median(16).toString().toDouble else i } + val lkpUDF19 = udf { (i: Double) => if (i == 0) Median(17).toString().toDouble else i } + val lkpUDF20 = udf { (i: Double) => if (i == 0) Median(18).toString().toDouble else i } + val lkpUDF21 = udf { (i: Double) => if (i == 0) Median(19).toString().toDouble else i } + val lkpUDF22 = udf { (i: Double) => if (i == 0) Median(20).toString().toDouble else i } + val lkpUDF23 = udf { (i: Double) => if (i == 0) Median(21).toString().toDouble else i } + val lkpUDF24 = udf { (i: Double) => if (i == 0) Median(22).toString().toDouble else i } + val lkpUDF25 = udf { (i: Double) => if (i == 0) Median(23).toString().toDouble else i } + + val df1 = Fill_Missing_Final_All_Features.withColumn("FinalC1uppercaseratio", lkpUDF1(col("C1uppercaseratio"))) // 
.drop("C1uppercaseratio").cache() + val df2 = df1.withColumn("FinalC2lowercaseratio", lkpUDF2(col("C2lowercaseratio"))) // .drop("C2lowercaseratio").cache() + // df1.unpersist() + val df3 = df2.withColumn("FinalC3alphanumericratio", lkpUDF3(col("C3alphanumericratio"))) // .drop("C3alphanumericratio").cache() + // df2.unpersist() + val df4 = df3.withColumn("FinalC4asciiratio", lkpUDF4(col("C4asciiratio"))) // .drop("C4asciiratio").cache() + // df3.unpersist() + val df5 = df4.withColumn("FinalC5bracketratio", lkpUDF5(col("C5bracketratio"))) // .drop("C5bracketratio").cache() + // df4.unpersist() + val df6 = df5.withColumn("FinalC6digitalratio", lkpUDF6(col("C6digitalratio"))) // .drop("C6digitalratio").cache() + // df5.unpersist() + val df7 = df6.withColumn("FinalC7latinratio", lkpUDF7(col("C7latinratio"))) // .drop("C7latinratio").cache() + // df6.unpersist() + val df8 = df7.withColumn("FinalC8whitespaceratio", lkpUDF8(col("C8whitespaceratio"))) // .drop("C8whitespaceratio").cache() + // df7.unpersist() + val df9 = df8.withColumn("FinalC9puncratio", lkpUDF9(col("C9puncratio"))) // .drop("C9puncratio").cache() + + // Mean : + // character integer values : + val Mean_C10longcharacterseq = Samples.agg(mean("C10longcharacterseq")).head() + val C10_Mean = Mean_C10longcharacterseq.getDouble(0) + val lkpUDFC10 = udf { (i: Double) => if (i == 0) C10_Mean else i } + val df10 = df9.withColumn("FinalC10longcharacterseq", lkpUDFC10(col("C10longcharacterseq"))) + + // Median + val df11 = df10.withColumn("FinalC11arabicratio", lkpUDF11(col("C11arabicratio"))) // .drop("C11arabicratio").cache() + // df9.unpersist() + val df12 = df11.withColumn("FinalC12bengaliratio", lkpUDF12(col("C12bengaliratio"))) // .drop("C12bengaliratio").cache() + // df11.unpersist() + val df13 = df12.withColumn("FinalC13brahmiratio", lkpUDF13(col("C13brahmiratio"))) // .drop("C13brahmiratio").cache() + // df12.unpersist() + val df14 = df13.withColumn("FinalC14cyrilinratio", lkpUDF14(col("C14cyrilinratio"))) // .drop("C14cyrilinratio").cache() + // df13.unpersist() + val df15 = df14.withColumn("FinalC15hanratio", lkpUDF15(col("C15hanratio"))) // .drop("C15hanratio").cache() + // df14.unpersist() + val df16 = df15.withColumn("Finalc16malysiaratio", lkpUDF16(col("c16malysiaratio"))) // .drop("c16malysiaratio").cache() + // df15.unpersist() + val df17 = df16.withColumn("FinalC17tamiratio", lkpUDF17(col("C17tamiratio"))) // .drop("C17tamiratio").cache() + // df16.unpersist() + val df18 = df17.withColumn("FinalC18telugratio", lkpUDF18(col("C18telugratio"))) // .drop("C18telugratio").cache() + // df17.unpersist() + val df19 = df18.withColumn("FinalC19symbolratio", lkpUDF19(col("C19symbolratio"))) // .drop("C19symbolratio").cache() + // df18.unpersist() + val df20 = df19.withColumn("FinalC20alpharatio", lkpUDF20(col("C20alpharatio"))) // .drop("C20alpharatio").cache() + // df19.unpersist() + val df21 = df20.withColumn("FinalC21visibleratio", lkpUDF21(col("C21visibleratio"))) // .drop("C21visibleratio").cache() + // df20.unpersist() + val df22 = df21.withColumn("FinalC22printableratio", lkpUDF22(col("C22printableratio"))) // .drop("C22printableratio").cache() + // df21.unpersist() + val df23 = df22.withColumn("FinalC23blankratio", lkpUDF23(col("C23blankratio"))) // .drop("C23blankratio").cache() + // df22.unpersist() + val df24 = df23.withColumn("FinalC24controlratio", lkpUDF24(col("C24controlratio"))) // .drop("C24controlratio").cache() + // df23.unpersist() + val df25 = df24.withColumn("FinalC25hexaratio", lkpUDF25(col("C25hexaratio"))) 
// .drop("C25hexaratio").cache() + + // ************************************************End Character Features **************************************************************************************** + + // ************************************************Start Word Features **************************************************************************************** + + // Word Ratio Features : UDF + val lkpUDFW1 = udf { (i: Double) => if (i == 0) Median(24).toString().toDouble else i } + val lkpUDFW3 = udf { (i: Double) => if (i == 0) Median(25).toString().toDouble else i } + val lkpUDFW6 = udf { (i: Double) => if (i == 0) Median(26).toString().toDouble else i } + val lkpUDFW7 = udf { (i: Double) => if (i == 0) Median(27).toString().toDouble else i } + val lkpUDFW8 = udf { (i: Double) => if (i == 0) Median(28).toString().toDouble else i } + + // 1. + val df26 = df25.withColumn("FinalW1languagewordratio", lkpUDFW1(col("W1languagewordratio"))) // .drop("W1languagewordratio").cache() + + // 2.Boolean(Double) IsContainLanguageWord + + // 3. + val df27 = df26.withColumn("FinalW3lowercaseratio", lkpUDFW3(col("W3lowercaseratio"))) // .drop("W3lowercaseratio").cache() + // df26.unpersist() + + // 4. Integer " Mean: + val Mean_W4longestword = Samples.agg(mean("W4longestword")).head() + val W4_Mean = Mean_W4longestword.getDouble(0) + val lkpUDFW4 = udf { (i: Double) => if (i == 0) W4_Mean else i } + val df28 = df27.withColumn("FinalW4longestword", lkpUDFW4(col("W4longestword"))) + + // 5. Boolean (Double ) W5IscontainURL + // 6. + val df29 = df28.withColumn("FinalW6badwordratio", lkpUDFW6(col("W6badwordratio"))) // .drop("W6badwordratio").cache() + + // 7. + val df30 = df29.withColumn("FinalW7uppercaseratio", lkpUDFW7(col("W7uppercaseratio"))) // .drop("W7uppercaseratio").cache() + + // 8. + val df31 = df30.withColumn("FinalW8banwordratio", lkpUDFW8(col("W8banwordratio"))) // .drop("W8banwordratio").cache() + + // 9.FemalFirst Boolean(Double) + // 10.Male First Boolean(Double) + // 11.ContainBadWord Boolean(Double) + // 12ContainBanWord Boolean(Double) + + // 13. Integer(Double): + val Mean_W13W13NumberSharewords = Samples.agg(mean("W13NumberSharewords")).head() + val W13_Mean = Mean_W13W13NumberSharewords.getDouble(0) + val lkpUDFW13 = udf { (i: Double) => if (i == 0) W13_Mean else i } + val df32 = df31.withColumn("FinalW13NumberSharewords", lkpUDFW13(col("W13NumberSharewords"))) + + // 14. Integer (Double): + val Mean_W14NumberSharewordswithoutStopwords = Samples.agg(mean("W14NumberSharewordswithoutStopwords")).head() + val W14_Mean = Mean_W14NumberSharewordswithoutStopwords.getDouble(0) + val lkpUDFW14 = udf { (i: Double) => if (i == 0) W14_Mean else i } + val df33 = df32.withColumn("FinalW14NumberSharewordswithoutStopwords", lkpUDFW14(col("W14NumberSharewordswithoutStopwords"))) + + // 15. Double (Not ratio): + val Mean_W15PortionQid = Samples.agg(mean("W15PortionQid")).head() + val W15_Mean = Mean_W15PortionQid.getDouble(0) + val lkpUDFW15 = udf { (i: Double) => if (i == 0) W15_Mean else i } + val df34 = df33.withColumn("FinalW15PortionQid", lkpUDFW15(col("W15PortionQid"))) + + // 16. 
Double(Not Ratio): + val Mean_W16PortionLnags = Samples.agg(mean("W16PortionLnags")).head() + val W16_Mean = Mean_W16PortionLnags.getDouble(0) + val lkpUDFW16 = udf { (i: Double) => if (i == 0) W16_Mean else i } + val df35 = df34.withColumn("FinalW16PortionLnags", lkpUDFW16(col("W16PortionLnags"))) + + // 17.Double(Not ratio): + val Mean_W17PortionLinks = Samples.agg(mean("W17PortionLinks")).head() + val W17_Mean = Mean_W17PortionLinks.getDouble(0) + val lkpUDFW17 = udf { (i: Double) => if (i == 0) W17_Mean else i } + val df36 = df35.withColumn("FinalW17PortionLinks", lkpUDFW17(col("W17PortionLinks"))) + + // ************************************************End Word Features **************************************************************************************** + + // ************************************************Start Sentences Features **************************************************************************************** + // 1. Integer(Double) + val Mean_S1CommentTailLength = Samples.agg(mean("S1CommentTailLength")).head() + val S1_Mean = RoundDouble(Mean_S1CommentTailLength.getDouble(0)) + val lkpUDFS1 = udf { (i: Double) => if (i == 0) S1_Mean else i } + val df37 = df36.withColumn("FinalS1CommentTailLength", lkpUDFS1(col("S1CommentTailLength"))) + + // 2. Double but Not ratio values : + val Mean_S2SimikaritySitelinkandLabel = Samples.agg(mean("S2SimikaritySitelinkandLabel")).head() + val S2_Mean = RoundDouble(Mean_S2SimikaritySitelinkandLabel.getDouble(0)) + val lkpUDFS2 = udf { (i: Double) => if (i == 0) S2_Mean else i } + val df39 = df37.withColumn("FinalS2SimikaritySitelinkandLabel", lkpUDFS2(col("S2SimikaritySitelinkandLabel"))) + + // 3. Double but Not ratio values : + val Mean_S3SimilarityLabelandSitelink = Samples.agg(mean("S3SimilarityLabelandSitelink")).head() + val S3_Mean = RoundDouble(Mean_S3SimilarityLabelandSitelink.getDouble(0)) + val lkpUDFS3 = udf { (i: Double) => if (i == 0.0) S3_Mean else i } + val df40 = df39.withColumn("FinalS3SimilarityLabelandSitelink", lkpUDFS3(col("S3SimilarityLabelandSitelink"))) + + // 4. Double but Not ratio values : + val Mean_S4SimilarityCommentComment = Samples.agg(mean("S4SimilarityCommentComment")).head() + val S4_Mean = RoundDouble(Mean_S4SimilarityCommentComment.getDouble(0)) + val lkpUDFS4 = udf { (i: Double) => if (i == 0.0) S4_Mean else i } + val df41 = df40.withColumn("FinalS4SimilarityCommentComment", lkpUDFS4(col("S4SimilarityCommentComment"))) + + // df41.show() + // ************************************************End Sentences Features **************************************************************************************** + // *********************************************** Start Statement Features **************************************************************************************** + // 1. String + // 2. String + // 3. String + // ************************************************End Statement Features **************************************************************************************** + // *********************************************** Start User Features **************************************************************************************** + + // 1.Boolean(Double) + // 2.Boolean(Double) + // 3.Boolean(Double) + // 4.Boolean(Double) + // 5.Boolean(Double) + // 6.Boolean(Double) + // 7. (Double) IP No need to fill Missing Data + // 8. 
(Double) ID No need to fill Missing Data + // 9.Boolean(Double) + // 10.Boolean(Double) + + // *********************************************** End User Features **************************************************************************************** + // *********************************************** Start Item Features **************************************************************************************** + // 1. Integer (Double) No need to fill missing values + // 2. Integer (Double) No need to fill missing values + // 3. Integer (Double) No need to fill missing values + // 4. Integer (Double) No need to fill missing values + // 5. Integer (Double) No need to fill missing values + // 6. Integer (Double) No need to fill missing values + // 7. Integer (Double) No need to fill missing values + // 8. Integer (Double) No need to fill missing values + // 9. Integer (Double) No need to fill missing values + // 10. Integer (Double) No need to fill missing values + // 11. String + // *********************************************** End Item Features **************************************************************************************** + // *********************************************** Start Revision Features **************************************************************************************** + // 1.String + // 2.String + // 3.Boolean (Double) + // 4.Integer(Double) + // 5.String + // 6.String + // 7. Boolean(Double) + // 8. String + // 9.String + // 10. Integer (Double) + // 11.String + // 12. integer(Double) + // 13. Long(Double) + // 14. integer (Double) + // 15.String + // 16.String + // *********************************************** End Revision Features **************************************************************************************** + // *********************************************** Meta Data , Truth Data and Frequnces **************************************************************************************** + // Meta + // 1.Revision Session :Integer (Converted to Double) + // 2. 
User Country Code + // 3.User Continent Code + // 4.User Time Size + // 5.User Region Code + // 6.User-city Name + // 7.User Country Name + // 8.RevisionTags + + // Truth: + // 1.Undo + + // Freq : + + // 1.5 features + + // Roll Boolean :Boolean (Double) + // Undo :Boolean (Double) + + // *********************************************** End Revision Features **************************************************************************************** + + // ===========================================================================String Features==================================================================================== + + val df42 = df41.withColumn( + // statement String features: + "StringFeatures", concat($"SS1Property", lit(";"), $"SS2DataValue", lit(";"), $"SS3ItemValue", lit(";"), $"I11ItemTitle", + // Revision String Features: + lit(";"), $"R1languageRevision", + lit(";"), $"R2RevisionLanguageLocal", + lit(";"), $"R5RevisionAction", + lit(";"), $"R6PrevReviAction", + lit(";"), $"R8ParRevision", + lit(";"), $"R9RevisionTime", + lit(";"), $"R11ContentType", + lit(";"), $"R15RevisionSubaction", + lit(";"), $"R16PrevReviSubaction", + + lit(";"), $"USER_COUNTRY_CODE", + lit(";"), $"USER_CONTINENT_CODE", + lit(";"), $"USER_TIME_ZONE", + lit(";"), $"USER_REGION_CODE", + lit(";"), $"USER_CITY_NAME", + lit(";"), $"USER_COUNTY_NAME", + lit(";"), $"REVISION_TAGS")) + + val toArray = udf((record: String) => record.split(";").map(_.toString())) + val test1 = df42.withColumn("StringFeatures", toArray(col("StringFeatures"))) + // test1.show() + // test1.printSchema() + + val word2Vec = new Word2Vec().setInputCol("StringFeatures").setOutputCol("result").setVectorSize(20).setMinCount(0) + val model = word2Vec.fit(test1) + val result = model.transform(test1) // .rdd + + // result.show() + + val Todense = udf((b: Vector) => b.toDense) + val test_new2 = result.withColumn("result", Todense(col("result"))) + + val assembler = new VectorAssembler().setInputCols(Array( + "result", + + // character + "FinalC1uppercaseratio", "FinalC2lowercaseratio", "FinalC3alphanumericratio", "FinalC4asciiratio", "FinalC5bracketratio", "FinalC6digitalratio", + "FinalC7latinratio", "FinalC8whitespaceratio", "FinalC9puncratio", "FinalC10longcharacterseq", "FinalC11arabicratio", "FinalC12bengaliratio", + "FinalC13brahmiratio", "FinalC14cyrilinratio", "FinalC15hanratio", "Finalc16malysiaratio", "FinalC17tamiratio", "FinalC18telugratio", + "FinalC19symbolratio", "FinalC20alpharatio", "FinalC21visibleratio", "FinalC22printableratio", "FinalC23blankratio", "FinalC24controlratio", "FinalC25hexaratio", + + // Words + "FinalW1languagewordratio", "W2Iscontainlanguageword", "FinalW3lowercaseratio", "FinalW4longestword", "W5IscontainURL", "FinalW6badwordratio", + "FinalW7uppercaseratio", "FinalW8banwordratio", "W9FemalFirstName", "W10MaleFirstName", "W11IscontainBadword", "W12IsContainBanword", + "FinalW13NumberSharewords", "FinalW14NumberSharewordswithoutStopwords", "FinalW15PortionQid", "FinalW16PortionLnags", "FinalW17PortionLinks", + + // Sentences : + "FinalS1CommentTailLength", "FinalS2SimikaritySitelinkandLabel", "FinalS3SimilarityLabelandSitelink", "FinalS4SimilarityCommentComment", + + // User : + "U1IsPrivileged", "U2IsBotUser", "U3IsBotuserWithFlaguser", "U4IsProperty", "U5IsTranslator", "U6IsRegister", "U7IPValue", "U8UserID", + "U9HasBirthDate", "U10HasDeathDate", + + // Item: + + "I1NumberLabels", "I2NumberDescription", "I3NumberAliases", "I4NumberClaims", "I5NumberSitelinks", "I6NumberStatement", + 
"I7NumberReferences", "I8NumberQualifier", "I9NumberQualifierOrder", "I10NumberBadges", + + // Revision: + "R3IslatainLanguage", "R4JsonLength", "R7RevisionAccountChange", "R10RevisionSize", "R12BytesIncrease", + "R13TimeSinceLastRevi", "R14CommentLength", + + // Meta , truth , Freq + // meta : + "FinalREVISION_SESSION_ID", // Truth: - //1.Undo - - // Freq : - - //1.5 features - - // Roll Boolean :Boolean (Double) - // Undo :Boolean (Double) - - //*********************************************** End Revision Features **************************************************************************************** - - //===========================================================================String Features==================================================================================== - - val df42 = df41.withColumn( - //statement String features: - "StringFeatures", concat($"SS1Property", lit(";"), $"SS2DataValue", lit(";"), $"SS3ItemValue", lit(";"), $"I11ItemTitle", - //Revision String Features: - lit(";"), $"R1languageRevision", - lit(";"), $"R2RevisionLanguageLocal", - lit(";"), $"R5RevisionAction", - lit(";"), $"R6PrevReviAction", - lit(";"), $"R8ParRevision", - lit(";"), $"R9RevisionTime", - lit(";"), $"R11ContentType", - lit(";"), $"R15RevisionSubaction", - lit(";"), $"R16PrevReviSubaction", - - lit(";"), $"USER_COUNTRY_CODE", - lit(";"), $"USER_CONTINENT_CODE", - lit(";"), $"USER_TIME_ZONE", - lit(";"), $"USER_REGION_CODE", - lit(";"), $"USER_CITY_NAME", - lit(";"), $"USER_COUNTY_NAME", - lit(";"), $"REVISION_TAGS")) + "FinalUNDO_RESTORE_REVERTED", - val toArray = udf((record: String) => record.split(";").map(_.toString())) - val test1 = df42.withColumn("StringFeatures", toArray(col("StringFeatures"))) - // test1.show() - // test1.printSchema() + // Freq: + "FinalNumberofRevisionsUserContributed", + "FinalNumberofUniqueItemsUseredit", "FinalNumberRevisionItemHas", "FinalNumberUniqUserEditItem", "FinalFreqItem")).setOutputCol("features") + val Testing_Data = assembler.transform(test_new2) - val word2Vec = new Word2Vec().setInputCol("StringFeatures").setOutputCol("result").setVectorSize(20).setMinCount(0) - val model = word2Vec.fit(test1) - val result = model.transform(test1) //.rdd - - // result.show() - - val Todense = udf((b: Vector) => b.toDense) - val test_new2 = result.withColumn("result", Todense(col("result"))) - - val assembler = new VectorAssembler().setInputCols(Array( - "result", - - // character - "FinalC1uppercaseratio", "FinalC2lowercaseratio", "FinalC3alphanumericratio", "FinalC4asciiratio", "FinalC5bracketratio", "FinalC6digitalratio", - "FinalC7latinratio", "FinalC8whitespaceratio", "FinalC9puncratio", "FinalC10longcharacterseq", "FinalC11arabicratio", "FinalC12bengaliratio", - "FinalC13brahmiratio", "FinalC14cyrilinratio", "FinalC15hanratio", "Finalc16malysiaratio", "FinalC17tamiratio", "FinalC18telugratio", - "FinalC19symbolratio", "FinalC20alpharatio", "FinalC21visibleratio", "FinalC22printableratio", "FinalC23blankratio", "FinalC24controlratio", "FinalC25hexaratio", - - // Words - "FinalW1languagewordratio", "W2Iscontainlanguageword", "FinalW3lowercaseratio", "FinalW4longestword", "W5IscontainURL", "FinalW6badwordratio", - "FinalW7uppercaseratio", "FinalW8banwordratio", "W9FemalFirstName", "W10MaleFirstName", "W11IscontainBadword", "W12IsContainBanword", - "FinalW13NumberSharewords", "FinalW14NumberSharewordswithoutStopwords", "FinalW15PortionQid", "FinalW16PortionLnags", "FinalW17PortionLinks", - - //Sentences : - "FinalS1CommentTailLength", 
"FinalS2SimikaritySitelinkandLabel", "FinalS3SimilarityLabelandSitelink", "FinalS4SimilarityCommentComment", - - // User : - "U1IsPrivileged", "U2IsBotUser", "U3IsBotuserWithFlaguser", "U4IsProperty", "U5IsTranslator", "U6IsRegister", "U7IPValue", "U8UserID", - "U9HasBirthDate", "U10HasDeathDate", - - //Item: - - "I1NumberLabels", "I2NumberDescription", "I3NumberAliases", "I4NumberClaims", "I5NumberSitelinks", "I6NumberStatement", - "I7NumberReferences", "I8NumberQualifier", "I9NumberQualifierOrder", "I10NumberBadges", - - //Revision: - "R3IslatainLanguage", "R4JsonLength", "R7RevisionAccountChange", "R10RevisionSize", "R12BytesIncrease", - "R13TimeSinceLastRevi", "R14CommentLength", - - // Meta , truth , Freq - // meta : - "FinalREVISION_SESSION_ID", - // Truth: - "FinalUNDO_RESTORE_REVERTED", - - //Freq: - "FinalNumberofRevisionsUserContributed", - "FinalNumberofUniqueItemsUseredit", "FinalNumberRevisionItemHas", "FinalNumberUniqUserEditItem", "FinalFreqItem")).setOutputCol("features") - val Testing_Data = assembler.transform(test_new2) - - // Prepare the data for classification: + // Prepare the data for classification: // NewData.registerTempTable("DB") // val Training_Data = sqlContext.sql("select Rid, features, FinalROLLBACK_REVERTED from DB") - //val Data = sqlContext.sql("select Rid, features, FinalROLLBACK_REVERTED as label from DB") // for logistic regrision + // val Data = sqlContext.sql("select Rid, features, FinalROLLBACK_REVERTED as label from DB") // for logistic regrision - //Data.show() + // Data.show() // val TestClassifiers = new Classifiers() -// - // TestClassifiers.RandomForestClassifer(Testing_Data, sqlContext) -// // TestClassifiers.DecisionTreeClassifier(Data, sqlContext) -// // TestClassifiers.LogisticRegrision(Data, sqlContext) -// // TestClassifiers.GradientBoostedTree(Data, sqlContext) -// // TestClassifiers.MultilayerPerceptronClassifier(Data, sqlContext) + // + // TestClassifiers.RandomForestClassifer(Testing_Data, sqlContext) + // // TestClassifiers.DecisionTreeClassifier(Data, sqlContext) + // // TestClassifiers.LogisticRegrision(Data, sqlContext) + // // TestClassifiers.GradientBoostedTree(Data, sqlContext) + // // TestClassifiers.MultilayerPerceptronClassifier(Data, sqlContext) Testing_Data - - - } - - - - - def Triger(sc: SparkContext): Unit = { - -// val sqlContext = new org.apache.spark.sql.SQLContext(sc) -// import sqlContext.implicits._ -// import org.apache.spark.sql.functions._ // for UDF -// import org.apache.spark.sql.types._ -// -// //******************************************************************************************************************************* -// println("Please Enter 0 for JTriple and 1 for TRIX process and 2 for RDFXML process and 3 for NormalXML:") -// val num = scala.io.StdIn.readLine() -// -// if (num == "0") { -// println("JTriple.........!!!!!!") -// // Streaming records:RDFJtriple file : -// val jobConf = new JobConf() -// -// val JTriple_Parser_OBJ = new ParseJTriple() -// val DRF_Builder_JTripleOBJ = new FacilitiesClass() -// val RDD_JTriple = JTriple_Parser_OBJ.Start_JTriple_Parser(jobConf, sc) -// RDD_JTriple.foreach(println) -// //----------------------------DF for RDF TRIX ------------------------------------------ -// // Create SQLContext Object: -// val sqlContext = new org.apache.spark.sql.SQLContext(sc) -// val DFR_JTriple = DRF_Builder_JTripleOBJ.RDD_TO_DFR_JTriple(RDD_JTriple, sqlContext) -// DFR_JTriple.show() -// -// } - -// if (num == "1") { -// -// println("TRIX.........!!!!!!") -// // Streaming 
records:RDFTRIX file : -// val jobConf = new JobConf() -// -// val TRIX_Parser_OBJ = new ParseTRIX() -// val DRF_Builder_RDFTRIX_OBJ = new FacilitiesClass() -// -// val RDD_TRIX = TRIX_Parser_OBJ.Start_TriX_Parser(jobConf, sc) -// RDD_TRIX.foreach(println) -// -// //----------------------------DF for RDF TRIX ------------------------------------------ -// // Create SQLContext Object: -// val sqlContext = new org.apache.spark.sql.SQLContext(sc) -// val DFR_TRIX = DRF_Builder_RDFTRIX_OBJ.RDD_TO_DFR_TRIX(RDD_TRIX, sqlContext) -// DFR_TRIX.show() -// -// } //RDF XML file :********************************************************************************************************* -// else if (num == "2") { -// println("RDF XML .........!!!!!!") -// // Streaming records:RDFXML file : -// val jobConf_Record = new JobConf() -// val jobConf_Prefixes = new JobConf() -// -// val RDFXML_Parser_OBJ = new ParseRDFXML() -// val DRF_Builder_RDFXML_OBJ = new FacilitiesClass() -// -// val RDD_RDFXML = RDFXML_Parser_OBJ.start_RDFXML_Parser(jobConf_Record, jobConf_Prefixes, sc) -// RDD_RDFXML.foreach(println) -// -// //----------------------------DF for RDF XML ------------------------------------------ -// // Create SQLContext Object: -// val sqlContext = new org.apache.spark.sql.SQLContext(sc) -// val DFR_RDF_XML = DRF_Builder_RDFXML_OBJ.RDD_TO_DFR_RDFXML(RDD_RDFXML, sqlContext) -// DFR_RDF_XML.show() -// // -// // NOrmal XML Example WikiData: *************************************************************************************************** -// } else if (num == "3") { - // Streaming records: -// val jobConf = new JobConf() -// val NormalXML_Parser_OBJ = new ParseNormalXML() -// val RDD_OBJ = new ParseNormalXML() -// val RDD_All_Record1 = RDD_OBJ.Training_DB_NormalXML_Parser_Input1(sc) -// val RDD_All_Record2 = RDD_OBJ.Training_DB_NormalXML_Parser_Input2(sc) -// val RDD_All_Record3 = RDD_OBJ.Training_DB_NormalXML_Parser_Input3(sc) -// //RDD_All_Record1.foreach(println) -// //RDD_All_Record2.foreach(println) -// // RDD_All_Record3.foreach(println) -// -// val RDD_All_Record = RDD_All_Record1.union(RDD_All_Record2).union(RDD_All_Record3).distinct().cache() -// -// //println(RDD_All_Record.count()) -// // println(RDD_All_Record.count()) -// -// // ======= Json part : -// //Json RDD : Each record has its Revision iD: -// val JsonRDD = RDD_All_Record.map(_.split("NNLL")).map(v => replacing_with_Quoto(v(0), v(8))).cache() -// //JsonRDD.foreach(println) -// //println(JsonRDD.count()) -// -// // Data set -// val Ds_Json = sqlContext.jsonRDD(JsonRDD).select("key", "id", "labels", "descriptions", "aliases", "claims", "sitelinks").cache() -// //Ds_Json.show() -// // println(Ds_Json.count()) -// -// // ======= Tags part : // Contributor IP here is in Decimal format not IP format and It is converted in ParseNormalXml stage -// val TagsRDD = RDD_All_Record.map(_.split("NNLL")).map(x => (x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9), x(10), x(11))).cache() -// val DF_Tags = TagsRDD.toDF("Rid", "Itemid", "comment", "pid", "time", "contributorIP", "contributorID", "contributorName", "JsonText", "model", "format", "sha").cache() -// // DF_Tags.show() -// // println(DF_Tags.count()) -// -// //======== Join Json part with Tag Part:============================ -// //Joining to have full data -// val DF_First_DF_Result_Join_Tags_and_Json = DF_Tags.as("T1").join(Ds_Json.as("T2"), $"T1.Rid" === $"T2.key", "leftouter").select("Rid", "itemid", "comment", "pid", "time", "contributorIP", "contributorID", "contributorName", 
"JsonText", "labels", "descriptions", "aliases", "claims", "sitelinks", "model", "format", "sha") //.orderBy("Rid", "Itemid") -// DF_First_DF_Result_Join_Tags_and_Json.registerTempTable("Data1") -// val dfr_DATA_JsonTages1 = sqlContext.sql("select * from Data1 order by itemid ,Rid ").cache() -// -// val colNames = Seq("Rid2", "itemid2", "comment2", "pid2", "time2", "contributorIP2", "contributorID2", "contributorName2", "JsonText2", "labels2", "descriptions2", "aliases2", "claims2", "sitelinks2", "model2", "format2", "sha2") -// val DF_Second = DF_First_DF_Result_Join_Tags_and_Json.toDF(colNames: _*) //.distinct() -// DF_Second.registerTempTable("Data2") -// -// //===================================================================Parent // Previous Revision============================================================================================================== -// //val DF_Joined = result1.as("df1").join(result2.as("df2"), col("itemid") === col("itemid2") && col("index1") === col("index2") + 1, "leftouter").select("Rid", "itemid", "comment", "pid", "time", "contributorIP", "contributorID", "contributorName", "JsonText", "labels", "descriptions", "aliases", "claims", "sitelinks", "model", "format", "sha", "Rid2", "itemid2", "comment2", "pid2", "time2", "contributorIP2", "contributorID2", "contributorName2", "JsonText2", "labels2", "descriptions2", "aliases2", "claims2", "sitelinks2", "model2", "format2", "sha2") -// //.select("itemid", "Rid","pid","time","itemid2","Rid2","pid2","time2") -// -// //Joining based on Parent Id to get the previous cases: ParentID -// val DF_Joined = DF_First_DF_Result_Join_Tags_and_Json.as("df1").join(DF_Second.as("df2"), $"df1.pid" === $"df2.Rid2", "leftouter").distinct() -// -// val RDD_After_JoinDF = DF_Joined.rdd.distinct() -// val x = RDD_After_JoinDF.map(row => (row(0).toString().toInt, row)).cache() -// val part = new RangePartitioner(4, x) -// val partitioned = x.partitionBy(part).persist() // persist is important for this case and obligatory. 
-// //partitioned.foreach(println) -// // -// // //=====================================================All Features Based on Categories of Features Data Type :================================================================================== -// // -// val Result_all_Features = partitioned.map { case (x, y) => (x.toString() + "," + All_Features(y).toString()) } // we convert the Pair RDD to String one LineRDD to be able to make DF based on "," -// //Result_all_Features.foreach(println) -// // println("nayef" + Result_all_Features.count()) -// -// // Conver the RDD of All Features to DataFrame: -// -// val schema = StructType( -// -// //0 -// StructField("Rid", IntegerType, false) :: -// -// // Character Features : -// /* 1*/ StructField("C1uppercaseratio", DoubleType, false) :: /*2 */ StructField("C2lowercaseratio", DoubleType, false) :: /*3*/ StructField("C3alphanumericratio", DoubleType, false) :: -// /*4*/ StructField("C4asciiratio", DoubleType, false) :: /*5*/ StructField("C5bracketratio", DoubleType, false) :: /*6*/ StructField("C6digitalratio", DoubleType, false) :: -// /*7*/ StructField("C7latinratio", DoubleType, false) :: /*8*/ StructField("C8whitespaceratio", DoubleType, false) :: /* 9*/ StructField("C9puncratio", DoubleType, false) :: -// /*10*/ StructField("C10longcharacterseq", DoubleType, false) :: /*11*/ StructField("C11arabicratio", DoubleType, false) :: /*12*/ StructField("C12bengaliratio", DoubleType, false) :: -// /*13 */ StructField("C13brahmiratio", DoubleType, false) :: /*14*/ StructField("C14cyrilinratio", DoubleType, false) :: /*15*/ StructField("C15hanratio", DoubleType, false) :: -// /*16*/ StructField("c16malysiaratio", DoubleType, false) :: /*17*/ StructField("C17tamiratio", DoubleType, false) :: /*18*/ StructField("C18telugratio", DoubleType, false) :: -// /*19 */ StructField("C19symbolratio", DoubleType, false) :: /*20 */ StructField("C20alpharatio", DoubleType, false) :: /*21*/ StructField("C21visibleratio", DoubleType, false) :: -// /*22*/ StructField("C22printableratio", DoubleType, false) :: /*23*/ StructField("C23blankratio", DoubleType, false) :: /*24 */ StructField("C24controlratio", DoubleType, false) :: -// /* 25 */ StructField("C25hexaratio", DoubleType, false) :: -// -// //word Features: -// /*26*/ StructField("W1languagewordratio", DoubleType, false) :: /*27 Boolean */ StructField("W2Iscontainlanguageword", DoubleType, false) :: /*28*/ StructField("W3lowercaseratio", DoubleType, false) :: -// /*29 Integer */ StructField("W4longestword", IntegerType, false) :: /*30 Boolean */ StructField("W5IscontainURL", DoubleType, false) :: /*31*/ StructField("W6badwordratio", DoubleType, false) :: -// /*32*/ StructField("W7uppercaseratio", DoubleType, false) :: /*33*/ StructField("W8banwordratio", DoubleType, false) :: /*34 Boolean */ StructField("W9FemalFirstName", DoubleType, false) :: -// /*35 Boolean */ StructField("W10MaleFirstName", DoubleType, false) :: /*36 Boolean */ StructField("W11IscontainBadword", DoubleType, false) :: /*37 Boolean*/ StructField("W12IsContainBanword", DoubleType, false) :: -// /*38 integer */ StructField("W13NumberSharewords", DoubleType, false) :: /*39 Integer */ StructField("W14NumberSharewordswithoutStopwords", DoubleType, false) :: -// /*40*/ StructField("W15PortionQid", DoubleType, false) :: /*41*/ StructField("W16PortionLnags", DoubleType, false) :: /*42*/ StructField("W17PortionLinks", DoubleType, false) :: -// -// // -// // // Sentences Features: -// /*43*/ StructField("S1CommentTailLength", DoubleType, false) :: /*44*/ 
StructField("S2SimikaritySitelinkandLabel", DoubleType, false) :: /*45*/ StructField("S3SimilarityLabelandSitelink", DoubleType, false) :: /*46*/ StructField("S4SimilarityCommentComment", DoubleType, false) :: -// // -// // // Statements Features : -// /*47*/ StructField("SS1Property", StringType, false) :: /*48*/ StructField("SS2DataValue", StringType, false) :: /*49*/ StructField("SS3ItemValue", StringType, false) :: -// // -// // -// // //User Features : -// /*50 Boolean*/ StructField("U1IsPrivileged", DoubleType, false) :: /*51 Boolean*/ StructField("U2IsBotUser", DoubleType, false) :: /*52 Boolean*/ StructField("U3IsBotuserWithFlaguser", DoubleType, false) :: -// /*53 Boolean*/ StructField("U4IsProperty", DoubleType, false) :: /*54 Boolean*/ StructField("U5IsTranslator", DoubleType, false) :: /*55 Boolean*/ StructField("U6IsRegister", DoubleType, false) :: -// /*56*/ StructField("U7IPValue", DoubleType, false) :: /*57*/ StructField("U8UserID", IntegerType, false) :: /*58*/ StructField("U9HasBirthDate", DoubleType, false) :: /*59*/ StructField("U10HasDeathDate", DoubleType, false) :: -// -// //Items Features : -// -// /*60*/ StructField("I1NumberLabels", DoubleType, false) :: /*61*/ StructField("I2NumberDescription", DoubleType, false) :: /*62*/ StructField("I3NumberAliases", DoubleType, false) :: /*63*/ StructField("I4NumberClaims", DoubleType, false) :: -// /*64*/ StructField("I5NumberSitelinks", DoubleType, false) :: /*65*/ StructField("I6NumberStatement", DoubleType, false) :: /*66*/ StructField("I7NumberReferences", DoubleType, false) :: /*67*/ StructField("I8NumberQualifier", DoubleType, false) :: -// /*68*/ StructField("I9NumberQualifierOrder", DoubleType, false) :: /*69*/ StructField("I10NumberBadges", DoubleType, false) :: /*70*/ StructField("I11ItemTitle", StringType, false) :: -// -// // Revision Features: -// /*71*/ StructField("R1languageRevision", StringType, false) :: /*72*/ StructField("R2RevisionLanguageLocal", StringType, false) :: /*73*/ StructField("R3IslatainLanguage", DoubleType, false) :: -// /*74*/ StructField("R4JsonLength", DoubleType, false) :: /*75*/ StructField("R5RevisionAction", StringType, false) :: /*76*/ StructField("R6PrevReviAction", StringType, false) :: -// /*77*/ StructField("R7RevisionAccountChange", DoubleType, false) :: /*78*/ StructField("R8ParRevision", StringType, false) :: /*79*/ StructField("R9RevisionTime", StringType, false) :: -// /*80*/ StructField("R10RevisionSize", DoubleType, false) :: /*81*/ StructField("R11ContentType", StringType, false) :: /*82*/ StructField("R12BytesIncrease", DoubleType, false) :: -// /*83*/ StructField("R13TimeSinceLastRevi", DoubleType, false) :: /*84*/ StructField("R14CommentLength", DoubleType, false) :: /*85*/ StructField("R15RevisionSubaction", StringType, false) :: -// /*86*/ StructField("R16PrevReviSubaction", StringType, false) :: -// -// Nil) -// -// val rowRDD = Result_all_Features.map(line => line.split(",")).map(e ⇒ Row(e(0).toInt // character feature column -// , e(1).toDouble, e(2).toDouble, e(3).toDouble, e(4).toDouble, e(5).toDouble, e(6).toDouble, e(7).toDouble, e(8).toDouble, e(9).toDouble, RoundDouble(e(10).toDouble), -// e(11).toDouble, e(12).toDouble, e(13).toDouble, e(14).toDouble, e(15).toDouble, e(16).toDouble, e(17).toDouble, e(18).toDouble, e(19).toDouble, e(20).toDouble, e(21).toDouble, e(22).toDouble, e(23).toDouble, e(24).toDouble, e(25).toDouble //Word Feature column -// , e(26).toDouble, e(27).toDouble, e(28).toDouble, e(29).toDouble.toInt, e(30).toDouble, e(31).toDouble, 
e(32).toDouble, e(33).toDouble, e(34).toDouble, e(35).toDouble, e(36).toDouble, e(37).toDouble, RoundDouble(e(38).toDouble), RoundDouble(e(39).toDouble), e(40).toDouble, e(41).toDouble, e(42).toDouble // Sentences Features column: -// , RoundDouble(e(43).toDouble), e(44).toDouble, e(45).toDouble, e(46).toDouble //Statement Features Column: -// , e(47), e(48), e(49) // User Features Column: -// , e(50).toDouble, e(51).toDouble, e(52).toDouble, e(53).toDouble, e(54).toDouble, e(55).toDouble, e(56).toDouble, e(57).toDouble.toInt, e(58).toDouble, e(59).toDouble //Item Features column: -// , e(60).toDouble, e(61).toDouble, e(62).toDouble, e(63).toDouble, e(64).toDouble, e(65).toDouble, e(66).toDouble, e(67).toDouble, e(68).toDouble, e(69).toDouble, "Q" + e(70).toDouble.toInt.toString() //Revision Features Column: -// , e(71), e(72), e(73).toDouble, e(74).toDouble, e(75), e(76), e(77).toDouble, e(78), e(79), e(80).toDouble, e(81), e(82).toDouble, e(83).toDouble, e(84).toDouble, e(85), e(86))) -// -// //a.User Frequency: -// //number of revisions a user has contributed -// //val resu= DF_Tags.groupBy("contributorID").agg(count("Rid")) -// DF_Tags.registerTempTable("TagesTable") -// val ContributorFreq_for_Each_Revision_DF = sqlContext.sql("select contributorID as CIDUSER1, count(Rid) as NumberofRevisionsUserContributed from TagesTable where contributorID !='0' group by contributorID ") //.drop("CIDUSER1") -// //ContributorFreq_for_Each_Revision_DF.show() -// -// //b.Cumulated : Number of a unique Item a user has contributed. -// val CumulatedNumberof_uniqueItemsForUser_DF = sqlContext.sql("select contributorID as CIDUSER2, COUNT(DISTINCT itemid) as NumberofUniqueItemsUseredit from TagesTable where contributorID !='0' group by contributorID") //.drop("CIDUSER2") -// //CumulatedNumberof_uniqueItemsForUser_DF.show() -// -// //1.Item Frequency: -// // number of revisions an Item has -// val ItemFrequ_DF = sqlContext.sql("select itemid, count(Rid) as NumberRevisionItemHas from TagesTable group by itemid") -// // ItemFrequ_DF.show() -// -// //2. Cumulate number of unique users have edited the Item : Did not consider the users IP. Contributor is an IP or Name. we consider name -// val CumulatedNumberof_UniqueUserForItem_DF = sqlContext.sql("select itemid, COUNT(DISTINCT contributorID) as NumberUniqUserEditItem from TagesTable where contributorID !='0' group by itemid") -// //CumulatedNumberof_UniqueUserForItem_DF.show() -// -// //3. 
freq each Item : -// val Fre_Item_DF = sqlContext.sql("select itemid, COUNT(itemid) as FreqItem from TagesTable group by itemid") -// // Fre_Item_DF.show() -// -// //***************************************************************************************************************************************** -// // This is Main DataFrame: -// val BeforeJoin_All_Features = sqlContext.createDataFrame(rowRDD, schema) -// //BeforeJoin_All_Features.show() -// -// //********************************** User feature Join -// -// // Join1 for add The first User Feature : number of revisions a user has contributed -// val AfterJoinUser1_All_Features = BeforeJoin_All_Features.as("T1").join(ContributorFreq_for_Each_Revision_DF.as("T2"), $"T1.U8UserID" === $"T2.CIDUSER1", "leftouter").drop("CIDUSER1") -// //AfterJoinUser1_All_Features.show() -// -// // Join2 for add The second User Feature -// val AfterJoinUser2_All_Features = AfterJoinUser1_All_Features.as("T1").join(CumulatedNumberof_uniqueItemsForUser_DF.as("T2"), $"T1.U8UserID" === $"T2.CIDUSER2", "leftouter").drop("CIDUSER2") -// //AfterJoinUser2_All_Features.show() -// -// //********************************** Item Feature Join -// // Join3 for add The First Item Feature :number of revisions an Item has -// val AfterJoinItem3_All_Features = AfterJoinUser2_All_Features.as("T1").join(ItemFrequ_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") -// // AfterJoinItem3_All_Features.show() -// -// // Join4 for add The Second Item Feature -// val AfterJoinItem4_All_Features = AfterJoinItem3_All_Features.as("T1").join(CumulatedNumberof_UniqueUserForItem_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") -// // AfterJoinItem4_All_Features.show() -// -// // Join5 for add The Third Item Feature -// val AfterJoinItem5_All_Features = AfterJoinItem4_All_Features.as("T1").join(Fre_Item_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") -// //2 AfterJoinItem5_All_Features.show() -// -// //******************************** -// -// //*Geografical information Feature from Meta File -// //REVISION_ID|REVISION_SESSION_ID|USER_COUNTRY_CODE|USER_CONTINENT_CODE|USER_TIME_ZONE|USER_REGION_CODE|USER_CITY_NAME|USER_COUNTY_NAME|REVISION_TAGS -// val df_GeoInf = sqlContext.read -// .format("com.databricks.spark.csv") -// .option("header", "true") // Use first line of all files as header -// .option("inferSchema", "true") // Automatically infer data types -// .load("hdfs://localhost:9000/mydata/Meta.csv").select("REVISION_ID", "REVISION_SESSION_ID", "USER_COUNTRY_CODE", "USER_CONTINENT_CODE", "USER_TIME_ZONE", "USER_REGION_CODE", "USER_CITY_NAME", "USER_COUNTY_NAME", "REVISION_TAGS") -// // df_GeoInf.show() -// -// val df_Truth = sqlContext.read -// .format("com.databricks.spark.csv") -// .option("header", "true") // Use first line of all files as header -// .option("inferSchema", "true") // Automatically infer data types -// .load("hdfs://localhost:9000/mydata/truth.csv").select("REVISION_ID", "ROLLBACK_REVERTED", "UNDO_RESTORE_REVERTED") -// // df_GeoInf.show() -// -// val AfterJoinGeoInfo_All_Features = AfterJoinItem5_All_Features.as("T1").join(df_GeoInf.as("T2"), $"T1.Rid" === $"T2.REVISION_ID", "leftouter").drop("REVISION_ID").cache() -// // AfterJoinGeoInfo_All_Features.show() -// -// val Final_All_Features = AfterJoinGeoInfo_All_Features.as("T1").join(df_Truth.as("T2"), $"T1.Rid" === $"T2.REVISION_ID", "leftouter").drop("REVISION_ID").cache() -// //Final_All_Features.show() -// -// // Pre- process 
Data ============================================================================================================================================================ -// -// // For String Column, We fill the Null values by "NA": -// -// var Fill_Missing_Final_All_Features = Final_All_Features.na.fill("NA", Seq("USER_COUNTRY_CODE", "USER_CONTINENT_CODE", "USER_TIME_ZONE", "USER_REGION_CODE", "USER_CITY_NAME", "USER_COUNTY_NAME", "REVISION_TAGS")).cache() -// -// // For Integer Frequency Column, We fill the Null values by 0: -// Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.na.fill(0, Seq("FreqItem", "NumberUniqUserEditItem", "NumberRevisionItemHas", "NumberofUniqueItemsUseredit", "NumberofRevisionsUserContributed", "REVISION_SESSION_ID")).cache() -// //Fill_Missing_Final_All_Features.show() -// -// val BoolToDoubleUDF = udf { (BoolAsString: String) => if (BoolAsString == "T") 1.0 else 0.0 } -// val IntegerToDouble = udf { (IntegerRevisionSessionID: Integer) => IntegerRevisionSessionID.toDouble } -// Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalROLLBACK_REVERTED", BoolToDoubleUDF(col("ROLLBACK_REVERTED"))) -// Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalUNDO_RESTORE_REVERTED", BoolToDoubleUDF(col("UNDO_RESTORE_REVERTED"))) -// -// Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalREVISION_SESSION_ID", IntegerToDouble(col("REVISION_SESSION_ID"))) -// -// Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberofRevisionsUserContributed", IntegerToDouble(col("NumberofRevisionsUserContributed"))) -// Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberofUniqueItemsUseredit", IntegerToDouble(col("NumberofUniqueItemsUseredit"))) -// -// Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberRevisionItemHas", IntegerToDouble(col("NumberRevisionItemHas"))) -// Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberUniqUserEditItem", IntegerToDouble(col("NumberUniqUserEditItem"))) -// Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalFreqItem", IntegerToDouble(col("FreqItem"))) -// -// //===========================================================================Caharacter Features : Double , Integer Features ==================================================================================== -// //Double Ratio: For Ratio Double column, Fill -1 value by Median:Character Features + Ratio of Word Features : -// var Samples = Fill_Missing_Final_All_Features.sample(false, 0.001).cache() //.where($"S2SimikaritySitelinkandLabel">0.0 || $"S3SimilarityLabelandSitelink">0.0 || $"S4SimilarityCommentComment">0.0) -// Samples.registerTempTable("df") -// -// val Query = "select " + -// "percentile_approx(C1uppercaseratio, 0.5) as meadian1" + "," + "percentile_approx(C2lowercaseratio, 0.5) as median2" + " ," + -// "percentile_approx(C3alphanumericratio, 0.5) as median3" + "," + "percentile_approx(C4asciiratio, 0.5) as median4" + "," + -// "percentile_approx(C5bracketratio, 0.5) as median5" + "," + "percentile_approx(C6digitalratio, 0.5) as median6" + "," + -// "percentile_approx(C7latinratio, 0.5) as median7" + "," + "percentile_approx(C8whitespaceratio, 0.5) as median8" + "," + -// "percentile_approx(C9puncratio, 0.5) as median9" + "," + "percentile_approx(C11arabicratio, 0.5) as median11" + "," + -// 
"percentile_approx(C12bengaliratio, 0.5) as median12" + "," + "percentile_approx(C13brahmiratio, 0.5) as median13" + "," + -// "percentile_approx(C14cyrilinratio, 0.5) as median14" + "," + "percentile_approx(C15hanratio, 0.5) as median15" + "," + -// "percentile_approx(c16malysiaratio, 0.5) as median16" + "," + -// "percentile_approx(C17tamiratio, 0.5) as median17" + "," + "percentile_approx(C18telugratio, 0.5) as median18" + "," + -// "percentile_approx(C19symbolratio, 0.5) as median19" + "," + "percentile_approx(C20alpharatio, 0.5) as median20" + "," + -// "percentile_approx(C21visibleratio, 0.5) as median21" + "," + "percentile_approx(C22printableratio, 0.5) as median22" + "," + -// "percentile_approx(C23blankratio, 0.5) as median23" + "," + "percentile_approx(C24controlratio, 0.5) as median24" + "," + -// "percentile_approx(C25hexaratio, 0.5) as median25" ++ "," + "percentile_approx(W1languagewordratio, 0.5) as median26" + "," + -// "percentile_approx(W3lowercaseratio, 0.5) as median27" + "," + "percentile_approx(W6badwordratio, 0.5) as median28" + "," + -// "percentile_approx(W7uppercaseratio, 0.5) as median27" + "," + "percentile_approx(W8banwordratio, 0.5) as median27" + " from df" -// -// val medianValues = sqlContext.sql(Query).rdd -// val Median = medianValues.first() -// -// // Median : -// // Character Ratio Features: UDF -// val lkpUDF1 = udf { (i: Double) => if (i == 0) Median(0).toString().toDouble else i } -// val lkpUDF2 = udf { (i: Double) => if (i == 0) Median(1).toString().toDouble else i } -// val lkpUDF3 = udf { (i: Double) => if (i == 0) Median(2).toString().toDouble else i } -// val lkpUDF4 = udf { (i: Double) => if (i == 0) Median(3).toString().toDouble else i } -// val lkpUDF5 = udf { (i: Double) => if (i == 0) Median(4).toString().toDouble else i } -// val lkpUDF6 = udf { (i: Double) => if (i == 0) Median(5).toString().toDouble else i } -// val lkpUDF7 = udf { (i: Double) => if (i == 0) Median(6).toString().toDouble else i } -// val lkpUDF8 = udf { (i: Double) => if (i == 0) Median(7).toString().toDouble else i } -// val lkpUDF9 = udf { (i: Double) => if (i == 0) Median(8).toString().toDouble else i } -// -// val lkpUDF11 = udf { (i: Double) => if (i == 0) Median(9).toString().toDouble else i } -// val lkpUDF12 = udf { (i: Double) => if (i == 0) Median(10).toString().toDouble else i } -// val lkpUDF13 = udf { (i: Double) => if (i == 0) Median(11).toString().toDouble else i } -// val lkpUDF14 = udf { (i: Double) => if (i == 0) Median(12).toString().toDouble else i } -// val lkpUDF15 = udf { (i: Double) => if (i == 0) Median(13).toString().toDouble else i } -// val lkpUDF16 = udf { (i: Double) => if (i == 0) Median(14).toString().toDouble else i } -// val lkpUDF17 = udf { (i: Double) => if (i == 0) Median(15).toString().toDouble else i } -// val lkpUDF18 = udf { (i: Double) => if (i == 0) Median(16).toString().toDouble else i } -// val lkpUDF19 = udf { (i: Double) => if (i == 0) Median(17).toString().toDouble else i } -// val lkpUDF20 = udf { (i: Double) => if (i == 0) Median(18).toString().toDouble else i } -// val lkpUDF21 = udf { (i: Double) => if (i == 0) Median(19).toString().toDouble else i } -// val lkpUDF22 = udf { (i: Double) => if (i == 0) Median(20).toString().toDouble else i } -// val lkpUDF23 = udf { (i: Double) => if (i == 0) Median(21).toString().toDouble else i } -// val lkpUDF24 = udf { (i: Double) => if (i == 0) Median(22).toString().toDouble else i } -// val lkpUDF25 = udf { (i: Double) => if (i == 0) Median(23).toString().toDouble else i } -// 
-// val df1 = Fill_Missing_Final_All_Features.withColumn("FinalC1uppercaseratio", lkpUDF1(col("C1uppercaseratio"))) //.drop("C1uppercaseratio").cache() -// val df2 = df1.withColumn("FinalC2lowercaseratio", lkpUDF2(col("C2lowercaseratio"))) //.drop("C2lowercaseratio").cache() -// //df1.unpersist() -// val df3 = df2.withColumn("FinalC3alphanumericratio", lkpUDF3(col("C3alphanumericratio"))) //.drop("C3alphanumericratio").cache() -// //df2.unpersist() -// val df4 = df3.withColumn("FinalC4asciiratio", lkpUDF4(col("C4asciiratio"))) //.drop("C4asciiratio").cache() -// //df3.unpersist() -// val df5 = df4.withColumn("FinalC5bracketratio", lkpUDF5(col("C5bracketratio"))) //.drop("C5bracketratio").cache() -// //df4.unpersist() -// val df6 = df5.withColumn("FinalC6digitalratio", lkpUDF6(col("C6digitalratio"))) //.drop("C6digitalratio").cache() -// //df5.unpersist() -// val df7 = df6.withColumn("FinalC7latinratio", lkpUDF7(col("C7latinratio"))) //.drop("C7latinratio").cache() -// //df6.unpersist() -// val df8 = df7.withColumn("FinalC8whitespaceratio", lkpUDF8(col("C8whitespaceratio"))) //.drop("C8whitespaceratio").cache() -// //df7.unpersist() -// val df9 = df8.withColumn("FinalC9puncratio", lkpUDF9(col("C9puncratio"))) //.drop("C9puncratio").cache() -// -// // Mean : -// // character integer values : -// val Mean_C10longcharacterseq = Samples.agg(mean("C10longcharacterseq")).head() -// val C10_Mean = Mean_C10longcharacterseq.getDouble(0) -// val lkpUDFC10 = udf { (i: Double) => if (i == 0) C10_Mean else i } -// val df10 = df9.withColumn("FinalC10longcharacterseq", lkpUDFC10(col("C10longcharacterseq"))) -// -// //Median -// val df11 = df10.withColumn("FinalC11arabicratio", lkpUDF11(col("C11arabicratio"))) //.drop("C11arabicratio").cache() -// // df9.unpersist() -// val df12 = df11.withColumn("FinalC12bengaliratio", lkpUDF12(col("C12bengaliratio"))) //.drop("C12bengaliratio").cache() -// //df11.unpersist() -// val df13 = df12.withColumn("FinalC13brahmiratio", lkpUDF13(col("C13brahmiratio"))) //.drop("C13brahmiratio").cache() -// // df12.unpersist() -// val df14 = df13.withColumn("FinalC14cyrilinratio", lkpUDF14(col("C14cyrilinratio"))) //.drop("C14cyrilinratio").cache() -// // df13.unpersist() -// val df15 = df14.withColumn("FinalC15hanratio", lkpUDF15(col("C15hanratio"))) //.drop("C15hanratio").cache() -// // df14.unpersist() -// val df16 = df15.withColumn("Finalc16malysiaratio", lkpUDF16(col("c16malysiaratio"))) //.drop("c16malysiaratio").cache() -// //df15.unpersist() -// val df17 = df16.withColumn("FinalC17tamiratio", lkpUDF17(col("C17tamiratio"))) //.drop("C17tamiratio").cache() -// //df16.unpersist() -// val df18 = df17.withColumn("FinalC18telugratio", lkpUDF18(col("C18telugratio"))) //.drop("C18telugratio").cache() -// //df17.unpersist() -// val df19 = df18.withColumn("FinalC19symbolratio", lkpUDF19(col("C19symbolratio"))) //.drop("C19symbolratio").cache() -// //df18.unpersist() -// val df20 = df19.withColumn("FinalC20alpharatio", lkpUDF20(col("C20alpharatio"))) //.drop("C20alpharatio").cache() -// // df19.unpersist() -// val df21 = df20.withColumn("FinalC21visibleratio", lkpUDF21(col("C21visibleratio"))) //.drop("C21visibleratio").cache() -// // df20.unpersist() -// val df22 = df21.withColumn("FinalC22printableratio", lkpUDF22(col("C22printableratio"))) //.drop("C22printableratio").cache() -// //df21.unpersist() -// val df23 = df22.withColumn("FinalC23blankratio", lkpUDF23(col("C23blankratio"))) //.drop("C23blankratio").cache() -// // df22.unpersist() -// val df24 = 
df23.withColumn("FinalC24controlratio", lkpUDF24(col("C24controlratio"))) //.drop("C24controlratio").cache() -// //df23.unpersist() -// val df25 = df24.withColumn("FinalC25hexaratio", lkpUDF25(col("C25hexaratio"))) //.drop("C25hexaratio").cache() -// -// //************************************************End Character Features **************************************************************************************** -// -// //************************************************Start Word Features **************************************************************************************** -// -// // Word Ratio Features : UDF -// val lkpUDFW1 = udf { (i: Double) => if (i == 0) Median(24).toString().toDouble else i } -// val lkpUDFW3 = udf { (i: Double) => if (i == 0) Median(25).toString().toDouble else i } -// val lkpUDFW6 = udf { (i: Double) => if (i == 0) Median(26).toString().toDouble else i } -// val lkpUDFW7 = udf { (i: Double) => if (i == 0) Median(27).toString().toDouble else i } -// val lkpUDFW8 = udf { (i: Double) => if (i == 0) Median(28).toString().toDouble else i } -// -// //1. -// val df26 = df25.withColumn("FinalW1languagewordratio", lkpUDFW1(col("W1languagewordratio"))) //.drop("W1languagewordratio").cache() -// -// //2.Boolean(Double) IsContainLanguageWord -// -// //3. -// val df27 = df26.withColumn("FinalW3lowercaseratio", lkpUDFW3(col("W3lowercaseratio"))) //.drop("W3lowercaseratio").cache() -// // df26.unpersist() -// -// //4. Integer " Mean: -// val Mean_W4longestword = Samples.agg(mean("W4longestword")).head() -// val W4_Mean = Mean_W4longestword.getDouble(0) -// val lkpUDFW4 = udf { (i: Double) => if (i == 0) W4_Mean else i } -// val df28 = df27.withColumn("FinalW4longestword", lkpUDFW4(col("W4longestword"))) -// -// //5. Boolean (Double ) W5IscontainURL -// //6. -// val df29 = df28.withColumn("FinalW6badwordratio", lkpUDFW6(col("W6badwordratio"))) //.drop("W6badwordratio").cache() -// -// //7. -// val df30 = df29.withColumn("FinalW7uppercaseratio", lkpUDFW7(col("W7uppercaseratio"))) //.drop("W7uppercaseratio").cache() -// -// //8. -// val df31 = df30.withColumn("FinalW8banwordratio", lkpUDFW8(col("W8banwordratio"))) //.drop("W8banwordratio").cache() -// -// //9.FemalFirst Boolean(Double) -// //10.Male First Boolean(Double) -// //11.ContainBadWord Boolean(Double) -// //12ContainBanWord Boolean(Double) -// -// //13. Integer(Double): -// val Mean_W13W13NumberSharewords = Samples.agg(mean("W13NumberSharewords")).head() -// val W13_Mean = Mean_W13W13NumberSharewords.getDouble(0) -// val lkpUDFW13 = udf { (i: Double) => if (i == 0) W13_Mean else i } -// val df32 = df31.withColumn("FinalW13NumberSharewords", lkpUDFW13(col("W13NumberSharewords"))) -// -// //14. Integer (Double): -// val Mean_W14NumberSharewordswithoutStopwords = Samples.agg(mean("W14NumberSharewordswithoutStopwords")).head() -// val W14_Mean = Mean_W14NumberSharewordswithoutStopwords.getDouble(0) -// val lkpUDFW14 = udf { (i: Double) => if (i == 0) W14_Mean else i } -// val df33 = df32.withColumn("FinalW14NumberSharewordswithoutStopwords", lkpUDFW14(col("W14NumberSharewordswithoutStopwords"))) -// -// // 15. Double (Not ratio): -// val Mean_W15PortionQid = Samples.agg(mean("W15PortionQid")).head() -// val W15_Mean = Mean_W15PortionQid.getDouble(0) -// val lkpUDFW15 = udf { (i: Double) => if (i == 0) W15_Mean else i } -// val df34 = df33.withColumn("FinalW15PortionQid", lkpUDFW15(col("W15PortionQid"))) -// -// //16. 
Double(Not Ratio): -// val Mean_W16PortionLnags = Samples.agg(mean("W16PortionLnags")).head() -// val W16_Mean = Mean_W16PortionLnags.getDouble(0) -// val lkpUDFW16 = udf { (i: Double) => if (i == 0) W16_Mean else i } -// val df35 = df34.withColumn("FinalW16PortionLnags", lkpUDFW16(col("W16PortionLnags"))) -// -// //17.Double(Not ratio): -// val Mean_W17PortionLinks = Samples.agg(mean("W17PortionLinks")).head() -// val W17_Mean = Mean_W17PortionLinks.getDouble(0) -// val lkpUDFW17 = udf { (i: Double) => if (i == 0) W17_Mean else i } -// val df36 = df35.withColumn("FinalW17PortionLinks", lkpUDFW17(col("W17PortionLinks"))) -// -// //************************************************End Word Features **************************************************************************************** -// -// //************************************************Start Sentences Features **************************************************************************************** -// // 1. Integer(Double) -// val Mean_S1CommentTailLength = Samples.agg(mean("S1CommentTailLength")).head() -// val S1_Mean = RoundDouble(Mean_S1CommentTailLength.getDouble(0)) -// val lkpUDFS1 = udf { (i: Double) => if (i == 0) S1_Mean else i } -// val df37 = df36.withColumn("FinalS1CommentTailLength", lkpUDFS1(col("S1CommentTailLength"))) -// -// //2. Double but Not ratio values : -// val Mean_S2SimikaritySitelinkandLabel = Samples.agg(mean("S2SimikaritySitelinkandLabel")).head() -// val S2_Mean = RoundDouble(Mean_S2SimikaritySitelinkandLabel.getDouble(0)) -// val lkpUDFS2 = udf { (i: Double) => if (i == 0) S2_Mean else i } -// val df39 = df37.withColumn("FinalS2SimikaritySitelinkandLabel", lkpUDFS2(col("S2SimikaritySitelinkandLabel"))) -// -// //3. Double but Not ratio values : -// val Mean_S3SimilarityLabelandSitelink = Samples.agg(mean("S3SimilarityLabelandSitelink")).head() -// val S3_Mean = RoundDouble(Mean_S3SimilarityLabelandSitelink.getDouble(0)) -// val lkpUDFS3 = udf { (i: Double) => if (i == 0.0) S3_Mean else i } -// val df40 = df39.withColumn("FinalS3SimilarityLabelandSitelink", lkpUDFS3(col("S3SimilarityLabelandSitelink"))) -// -// //4. Double but Not ratio values : -// val Mean_S4SimilarityCommentComment = Samples.agg(mean("S4SimilarityCommentComment")).head() -// val S4_Mean = RoundDouble(Mean_S4SimilarityCommentComment.getDouble(0)) -// val lkpUDFS4 = udf { (i: Double) => if (i == 0.0) S4_Mean else i } -// val df41 = df40.withColumn("FinalS4SimilarityCommentComment", lkpUDFS4(col("S4SimilarityCommentComment"))) -// -// //df41.show() -// //************************************************End Sentences Features **************************************************************************************** -// //*********************************************** Start Statement Features **************************************************************************************** -// //1. String -// //2. String -// //3. String -// //************************************************End Statement Features **************************************************************************************** -// //*********************************************** Start User Features **************************************************************************************** -// -// //1.Boolean(Double) -// //2.Boolean(Double) -// //3.Boolean(Double) -// //4.Boolean(Double) -// //5.Boolean(Double) -// //6.Boolean(Double) -// //7. (Double) IP No need to fill Missing Data -// //8. 
(Double) ID No need to fill Missing Data -// //9.Boolean(Double) -// //10.Boolean(Double) -// -// //*********************************************** End User Features **************************************************************************************** -// //*********************************************** Start Item Features **************************************************************************************** -// //1. Integer (Double) No need to fill missing values -// //2. Integer (Double) No need to fill missing values -// //3. Integer (Double) No need to fill missing values -// //4. Integer (Double) No need to fill missing values -// //5. Integer (Double) No need to fill missing values -// //6. Integer (Double) No need to fill missing values -// //7. Integer (Double) No need to fill missing values -// //8. Integer (Double) No need to fill missing values -// //9. Integer (Double) No need to fill missing values -// //10. Integer (Double) No need to fill missing values -// //11. String -// //*********************************************** End Item Features **************************************************************************************** -// //*********************************************** Start Revision Features **************************************************************************************** -// //1.String -// //2.String -// //3.Boolean (Double) -// //4.Integer(Double) -// //5.String -// //6.String -// //7. Boolean(Double) -// //8. String -// //9.String -// //10. Integer (Double) -// //11.String -// //12. integer(Double) -// //13. Long(Double) -// //14. integer (Double) -// //15.String -// //16.String -// //*********************************************** End Revision Features **************************************************************************************** -// //*********************************************** Meta Data , Truth Data and Frequnces **************************************************************************************** -// //Meta -// // 1.Revision Session :Integer (Converted to Double) -// //2. 
User Country Code -// //3.User Continent Code -// //4.User Time Size -// //5.User Region Code -// //6.User-city Name -// //7.User Country Name -// //8.RevisionTags -// -// // Truth: -// //1.Undo -// -// // Freq : -// -// //1.5 features -// -// // Roll Boolean :Boolean (Double) -// // Undo :Boolean (Double) -// -// //*********************************************** End Revision Features **************************************************************************************** -// -// //===========================================================================String Features==================================================================================== -// -// val df42 = df41.withColumn( -// //statement String features: -// "StringFeatures", concat($"SS1Property", lit(";"), $"SS2DataValue", lit(";"), $"SS3ItemValue", lit(";"), $"I11ItemTitle", -// //Revision String Features: -// lit(";"), $"R1languageRevision", -// lit(";"), $"R2RevisionLanguageLocal", -// lit(";"), $"R5RevisionAction", -// lit(";"), $"R6PrevReviAction", -// lit(";"), $"R8ParRevision", -// lit(";"), $"R9RevisionTime", -// lit(";"), $"R11ContentType", -// lit(";"), $"R15RevisionSubaction", -// lit(";"), $"R16PrevReviSubaction", -// -// lit(";"), $"USER_COUNTRY_CODE", -// lit(";"), $"USER_CONTINENT_CODE", -// lit(";"), $"USER_TIME_ZONE", -// lit(";"), $"USER_REGION_CODE", -// lit(";"), $"USER_CITY_NAME", -// lit(";"), $"USER_COUNTY_NAME", -// lit(";"), $"REVISION_TAGS")) -// -// val toArray = udf((record: String) => record.split(";").map(_.toString())) -// val test1 = df42.withColumn("StringFeatures", toArray(col("StringFeatures"))) -// // test1.show() -// // test1.printSchema() -// -// val word2Vec = new Word2Vec().setInputCol("StringFeatures").setOutputCol("result").setVectorSize(20).setMinCount(0) -// val model = word2Vec.fit(test1) -// val result = model.transform(test1) //.rdd -// -// // result.show() -// -// val Todense = udf((b: Vector) => b.toDense) -// val test_new2 = result.withColumn("result", Todense(col("result"))) -// -// val assembler = new VectorAssembler().setInputCols(Array( -// "result", -// -// // character -// "FinalC1uppercaseratio", "FinalC2lowercaseratio", "FinalC3alphanumericratio", "FinalC4asciiratio", "FinalC5bracketratio", "FinalC6digitalratio", -// "FinalC7latinratio", "FinalC8whitespaceratio", "FinalC9puncratio", "FinalC10longcharacterseq", "FinalC11arabicratio", "FinalC12bengaliratio", -// "FinalC13brahmiratio", "FinalC14cyrilinratio", "FinalC15hanratio", "Finalc16malysiaratio", "FinalC17tamiratio", "FinalC18telugratio", -// "FinalC19symbolratio", "FinalC20alpharatio", "FinalC21visibleratio", "FinalC22printableratio", "FinalC23blankratio", "FinalC24controlratio", "FinalC25hexaratio", -// -// // Words -// "FinalW1languagewordratio", "W2Iscontainlanguageword", "FinalW3lowercaseratio", "FinalW4longestword", "W5IscontainURL", "FinalW6badwordratio", -// "FinalW7uppercaseratio", "FinalW8banwordratio", "W9FemalFirstName", "W10MaleFirstName", "W11IscontainBadword", "W12IsContainBanword", -// "FinalW13NumberSharewords", "FinalW14NumberSharewordswithoutStopwords", "FinalW15PortionQid", "FinalW16PortionLnags", "FinalW17PortionLinks", -// -// //Sentences : -// "FinalS1CommentTailLength", "FinalS2SimikaritySitelinkandLabel", "FinalS3SimilarityLabelandSitelink", "FinalS4SimilarityCommentComment", -// -// // User : -// "U1IsPrivileged", "U2IsBotUser", "U3IsBotuserWithFlaguser", "U4IsProperty", "U5IsTranslator", "U6IsRegister", "U7IPValue", "U8UserID", -// "U9HasBirthDate", "U10HasDeathDate", -// -// //Item: 
-// -// "I1NumberLabels", "I2NumberDescription", "I3NumberAliases", "I4NumberClaims", "I5NumberSitelinks", "I6NumberStatement", -// "I7NumberReferences", "I8NumberQualifier", "I9NumberQualifierOrder", "I10NumberBadges", -// -// //Revision: -// "R3IslatainLanguage", "R4JsonLength", "R7RevisionAccountChange", "R10RevisionSize", "R12BytesIncrease", -// "R13TimeSinceLastRevi", "R14CommentLength", -// -// // Meta , truth , Freq -// // meta : -// "FinalREVISION_SESSION_ID", -// // Truth: -// "FinalUNDO_RESTORE_REVERTED", -// -// //Freq: -// "FinalNumberofRevisionsUserContributed", -// "FinalNumberofUniqueItemsUseredit", "FinalNumberRevisionItemHas", "FinalNumberUniqUserEditItem", "FinalFreqItem")).setOutputCol("features") -// val NewData = assembler.transform(test_new2) -// -// // Prepare the data for classification: -// NewData.registerTempTable("DB") -// val Data = sqlContext.sql("select Rid, features, FinalROLLBACK_REVERTED from DB") -// // val Data = sqlContext.sql("select Rid, features, FinalROLLBACK_REVERTED as label from DB") // for logistic regrision -// -// //Data.show() -// -// val TestClassifiers = new Classifiers() -// -// // TestClassifiers.RandomForestClassifer(Data, sqlContext) -// // TestClassifiers.DecisionTreeClassifier(Data, sqlContext) -// // TestClassifiers.LogisticRegrision(Data, sqlContext) -// // TestClassifiers.GradientBoostedTree(Data, sqlContext) -// // TestClassifiers.MultilayerPerceptronClassifier(Data, sqlContext) -// -// } + } - //=========================================================================================================================================== - //=================================================Functions Part============================================================================= + + // =========================================================================================================================================== + // =================================================Functions Part============================================================================= def Ration(va: Double, median: Double): Double = { @@ -2007,11 +1357,11 @@ class VandalismDetection extends Serializable { def All_Features(row: Row): String = { var temp = "" - //all characters + // all characters val character_Str_String = Character_Features(row) temp = character_Str_String - //all Words + // all Words val Words_Str_String = Words_Features(row) temp = temp + "," + Words_Str_String @@ -2023,15 +1373,15 @@ class VandalismDetection extends Serializable { val Statement_Str_String = Statement_Features(row) temp = temp + "," + Statement_Str_String - //User Features - there are 3 Joins in last stage when we have Data Frame + // User Features - there are 3 Joins in last stage when we have Data Frame val User_Str_String = User_Features_Normal(row) temp = temp + "," + User_Str_String - //Item Features - there are 3 Joins in last stage when we have Data Frame + // Item Features - there are 3 Joins in last stage when we have Data Frame val Item_Str_String = Item_Features(row) temp = temp + "," + Item_Str_String - //Revision Features + // Revision Features val Revision_Str_String = Revision_Features(row) temp = temp + "," + Revision_Str_String @@ -2043,13 +1393,13 @@ class VandalismDetection extends Serializable { def Character_Features(row: Row): String = { var str_results = "" - //1. Row from partitioned Pair RDD: + // 1. Row from partitioned Pair RDD: var new_Back_Row = Row() - //2. Revision ID current operation: + // 2. 
Revision ID current operation: var RevisionID = row(0) - //3. row(2) = represent the Comment: + // 3. row(2) = represent the Comment: var CommentRecord_AsString = row(2).toString() - //4. extract comment tail from the Normal comment-Depending on the paperes, we apply character feature extraction on comment Tail + // 4. extract comment tail from the Normal comment-Depending on the paperes, we apply character feature extraction on comment Tail val CommentObj = new CommentProcessor() val Temp_commentTail = CommentObj.Extract_CommentTail(CommentRecord_AsString) @@ -2060,8 +1410,8 @@ class VandalismDetection extends Serializable { val FacilityOBJ = new FacilitiesClass() var Str_vector_Values = FacilityOBJ.ArrayToString(vectorElements) str_results = Str_vector_Values - //CharacterFeatures = Vector_AsArrayElements - //new_Back_Row = Row(vectorElements) + // CharacterFeatures = Vector_AsArrayElements + // new_Back_Row = Row(vectorElements) } else { @@ -2095,11 +1445,11 @@ class VandalismDetection extends Serializable { val FacilityOBJ = new FacilitiesClass() var Str_vector_Values = FacilityOBJ.ArrayToString(RatioValues) str_results = Str_vector_Values - //new_Back_Row = Row(vector_Values) + // new_Back_Row = Row(vector_Values) } // CharacterFeatures - //new_Back_Row + // new_Back_Row str_results.trim() } @@ -2107,13 +1457,13 @@ class VandalismDetection extends Serializable { def Words_Features(row: Row): String = { var str_results = "" - //Row from partitioned Pair RDD: + // Row from partitioned Pair RDD: var new_Back_Row = Row() - //Revision ID current operation: + // Revision ID current operation: var RevisionID = row(0) - //row(2) = represent the Comment: + // row(2) = represent the Comment: var CommentRecord_AsString = row(2).toString() - //Extract comment tail from the Normal comment-Depending on the paperes, we apply character feature extraction on comment Tail + // Extract comment tail from the Normal comment-Depending on the paperes, we apply character feature extraction on comment Tail val CommentObj = new CommentProcessor() val Temp_commentTail = CommentObj.Extract_CommentTail(CommentRecord_AsString) var tempQids = 0.0 @@ -2146,9 +1496,9 @@ class VandalismDetection extends Serializable { temLinks = porportion_links } else { - var porortion_Qids = tempQids //=0.0 - var porportion_Lang = temlangs //=0.0 - var porportion_links = temLinks //=0.0 + var porortion_Qids = tempQids // =0.0 + var porportion_Lang = temlangs // =0.0 + var porportion_links = temLinks // =0.0 } @@ -2164,11 +1514,11 @@ class VandalismDetection extends Serializable { var Prev_commentTail = CommentObj.Extract_CommentTail(prevComment.toString()) if (Prev_commentTail != "") { - //11.Feature Current_Previous_CommentTial_NumberSharingWords: + // 11.Feature Current_Previous_CommentTial_NumberSharingWords: val NumberSharingWords = WordsOBJ.Current_Previous_CommentTial_NumberSharingWords(Temp_commentTail, Prev_commentTail) ArrayElements(12) = NumberSharingWords.toDouble - //12.Feature Current_Previous_CommentTial_NumberSharingWords without Stopword: + // 12.Feature Current_Previous_CommentTial_NumberSharingWords without Stopword: val NumberSharingWordsWithoutStopwords = WordsOBJ.Current_Previous_CommentTial_NumberSharingWords_WithoutStopWords(Temp_commentTail, Prev_commentTail) ArrayElements(13) = NumberSharingWordsWithoutStopwords.toDouble @@ -2218,8 +1568,8 @@ class VandalismDetection extends Serializable { str_results = Str_vector_Values } - //new_Back_Row - //Word_Features + // new_Back_Row + // Word_Features str_results } 
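Aside for readers of this patch: Character_Features and Words_Features above each fill a fixed-size Array[Double] (booleans encoded as 1.0/0.0, integers cast to Double) and serialize it to a single comma-separated string via FacilitiesClass.ArrayToString, which the removed rowRDD mapping later split back into typed columns. A tiny sketch of that round trip (featuresToCsv and the toy values are illustrative stand-ins, not the project's API):

object FeatureStringSketch {
  // Join a fixed-size feature array into one CSV field per revision
  def featuresToCsv(values: Array[Double]): String = values.mkString(",")

  def main(args: Array[String]): Unit = {
    val features = new Array[Double](3)
    features(0) = 0.75 // e.g. a lowercase-word ratio
    features(1) = 1.0  // e.g. "contains language word" encoded as a Double
    features(2) = 12.0 // e.g. longest-word length cast to Double

    val line = featuresToCsv(features)
    println(line) // 0.75,1.0,12.0

    // Downstream the string is split and re-typed, as the removed rowRDD mapping did
    val parsed = line.split(",").map(_.toDouble)
    parsed.foreach(println)
  }
}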
@@ -2227,16 +1577,16 @@ class VandalismDetection extends Serializable { def Sentences_Features(row: Row): String = { var str_results = "" - //This will be used to save values in vector + // This will be used to save values in vector var DoubleValues = new Array[Double](4) - //1. Row from partitioned Pair RDD: + // 1. Row from partitioned Pair RDD: var new_Back_Row = Row() - //2. Revision ID current operation: + // 2. Revision ID current operation: var RevisionID = row(0) - //3. row(2) = represent the Full Comment: + // 3. row(2) = represent the Full Comment: var CommentRecord_AsString = row(2).toString() - //4. extract comment tail from the Normal comment-Depending on the paperes, we apply character feature extraction on comment Tail + // 4. extract comment tail from the Normal comment-Depending on the paperes, we apply character feature extraction on comment Tail val CommentObj = new CommentProcessor() val Temp_commentTail = CommentObj.Extract_CommentTail(CommentRecord_AsString) @@ -2249,14 +1599,14 @@ class VandalismDetection extends Serializable { DoubleValues(0) = comment_Tail_Length // Feature 2 similarity between comment contain Sitelink and label : - //Check the language in comment that contain sitelinkword: -------------------- + // Check the language in comment that contain sitelinkword: -------------------- val Sitelink_inCommentObj = new SentencesFeatures() if (CommentRecord_AsString.contains("sitelink")) { // start 1 loop - //1. First step : get the language from comment + // 1. First step : get the language from comment val languagesitelink_from_Comment = Sitelink_inCommentObj.extract_CommentSiteLink_LanguageType(CommentRecord_AsString).trim() - //2. second step: get the Label tage from json table : + // 2. second step: get the Label tage from json table : if (row(9).toString() != "[]") { // start 2 loop // if (row(8).toString() != "") { val jsonStr = "\"\"\"" + row(9).toString() + "\"\"\"" // row(9) is the label record @@ -2271,7 +1621,7 @@ class VandalismDetection extends Serializable { DoubleValues(1) = 0.0 } - } // endd 2 loop + } // endd 2 loop else { DoubleValues(1) = 0.0 @@ -2285,12 +1635,12 @@ class VandalismDetection extends Serializable { } // Feature 3 similarity between comment contain label word and sitelink - //Check the language in comment that contain Label word:----------------------- + // Check the language in comment that contain Label word:----------------------- val Label_inCommentObj = new SentencesFeatures() if (CommentRecord_AsString.contains("label")) { - //1. First step : get the language from comment + // 1. First step : get the language from comment val languageLabel_from_Comment = Label_inCommentObj.extract_CommentLabel_LanguageType(CommentRecord_AsString).trim() - //2. second step: get the site link tage from json table : + // 2. second step: get the site link tage from json table : if (row(13).toString() != "[]") { // start 2 loop val jsonStr = "\"\"\"" + row(13).toString() + "\"\"\"" // row(13) is the sitelink record val jsonObj: JSONObject = new JSONObject(row(13).toString()) @@ -2351,7 +1701,7 @@ class VandalismDetection extends Serializable { } - //new_Back_Row + // new_Back_Row str_results } @@ -2359,7 +1709,7 @@ class VandalismDetection extends Serializable { // statement Features : def Statement_Features(row: Row): String = { var full_Str_Result = "" - //1. row(2) = represent the Comment: + // 1. 
row(2) = represent the Comment: var fullcomment = row(2).toString() val StatementOBJ = new StatementFeatures() @@ -2400,9 +1750,9 @@ class VandalismDetection extends Serializable { var str_results = "" var DoubleValues = new Array[Double](10) // you should change the index when add more element feature - //Row from partitioned Pair RDD: + // Row from partitioned Pair RDD: var new_Back_Row = Row() - //row(7) = represent the Contributor name: + // row(7) = represent the Contributor name: var full_comment = row(2).toString() var contributor_Name = row(7).toString() var contributor_ID = row(6).toString() @@ -2411,7 +1761,7 @@ class VandalismDetection extends Serializable { val useFeatureOBJ = new UserFeatures() - //1. Is privileged : There are 5 cases : if one of these cases is true that mean it is privileged else it is not privileged user + // 1. Is privileged : There are 5 cases : if one of these cases is true that mean it is privileged else it is not privileged user var flag_case1 = useFeatureOBJ.CheckName_isGlobalSysopUser(contributor_Name) var flag_case2 = useFeatureOBJ.CheckName_isGlobalRollBackerUser(contributor_Name) var flag_case3 = useFeatureOBJ.CheckName_isGlobalStewarUser(contributor_Name) @@ -2427,7 +1777,7 @@ class VandalismDetection extends Serializable { DoubleValues(0) = 0.0 } - //2. is BotUser : There are 3 cases : + // 2. is BotUser : There are 3 cases : var flag_case1_1 = useFeatureOBJ.CheckName_isLocalBotUser(contributor_Name) var flag_case2_2 = useFeatureOBJ.CheckName_isGlobalbotUser(contributor_Name) var flag_case3_3 = useFeatureOBJ.CheckName_isExtensionBotUser(contributor_Name) @@ -2441,7 +1791,7 @@ class VandalismDetection extends Serializable { DoubleValues(1) = 0.0 } - //3. is Bot User without BotflagUser : There is 1 case : + // 3. is Bot User without BotflagUser : There is 1 case : var flag_BUWBF = useFeatureOBJ.CheckName_isBotUserWithoutBotFlagUser(contributor_Name) if (flag_BUWBF == true) { @@ -2452,7 +1802,7 @@ class VandalismDetection extends Serializable { } - //4. is Property creator : + // 4. is Property creator : var flagCreator = useFeatureOBJ.CheckName_isPropertyCreator(contributor_Name) if (flagCreator == true) { @@ -2463,7 +1813,7 @@ class VandalismDetection extends Serializable { } - //5. is translator : + // 5. is translator : var flagTranslator = useFeatureOBJ.CheckName_isTranslator(contributor_Name) if (flagTranslator == true) { DoubleValues(4) = 1.0 @@ -2471,7 +1821,7 @@ class VandalismDetection extends Serializable { DoubleValues(4) = 0.0 } - //6. is register user: + // 6. is register user: var flagRegistered = useFeatureOBJ.IsRegisteroUser(contributor_Name) if (flagRegistered == true) { DoubleValues(5) = 1.0 @@ -2490,13 +1840,13 @@ class VandalismDetection extends Serializable { } - //7. IP as a long value + // 7. IP as a long value if (contributor_IP != "0") { DoubleValues(6) = contributor_IP.toDouble } else { DoubleValues(6) = 0.0 } - //8. ID + // 8. ID if (contributor_ID != "0") { DoubleValues(7) = contributor_ID.toDouble @@ -2504,7 +1854,7 @@ class VandalismDetection extends Serializable { DoubleValues(7) = 0.0 } - //9- 10 BitrthDate - DeatDate: + // 9- 10 BitrthDate - DeatDate: var DateObj = new UserFeatures() var BirthDate = DateObj.IsBirthDate(full_comment) @@ -2540,11 +1890,11 @@ class VandalismDetection extends Serializable { var str_results = "" var DoubleValues = new Array[Double](11) - //Row from partitioned Pair RDD: + // Row from partitioned Pair RDD: var new_Back_Row = Row() var ItemOBJ = new ItemFeatures() - //1. 
Feature depending on Label: + // 1. Feature depending on Label: var NumberOfLabel = 0.0 var Label_String = row(9).toString() if (Label_String != "[]") { @@ -2554,7 +1904,7 @@ class VandalismDetection extends Serializable { NumberOfLabel = 0.0 DoubleValues(0) = NumberOfLabel } - //2. Feature depending on Description: + // 2. Feature depending on Description: var Description_String = row(10).toString() var NumberOfDescription = 0.0 if (Description_String != "[]") { @@ -2566,7 +1916,7 @@ class VandalismDetection extends Serializable { DoubleValues(1) = NumberOfDescription } - //3. Feature depending on Aliases: + // 3. Feature depending on Aliases: var Aliases_String = row(11).toString() var NumberOfAliases = 0.0 if (Aliases_String != "[]") { @@ -2578,7 +1928,7 @@ class VandalismDetection extends Serializable { DoubleValues(2) = NumberOfAliases } - //4. Feature depending on Claims : + // 4. Feature depending on Claims : var Claims_String = row(12).toString() var NumberOfClaims = 0.0 if (Claims_String != "[]") { @@ -2590,7 +1940,7 @@ class VandalismDetection extends Serializable { DoubleValues(3) = NumberOfClaims } - //5. Feature depending on SiteLink + // 5. Feature depending on SiteLink var SiteLink_String = row(13).toString() var NumberOfSitelink = 0.0 if (SiteLink_String != "[]") { @@ -2603,7 +1953,7 @@ class VandalismDetection extends Serializable { } - //6. Feature depending on Claims - statements : + // 6. Feature depending on Claims - statements : var statement_String = row(12).toString() // from claim var NumberOfstatement = 0.0 if (statement_String != "[]") { @@ -2616,7 +1966,7 @@ class VandalismDetection extends Serializable { } - //7. Feature depending on Claims - References : + // 7. Feature depending on Claims - References : var References_String = row(12).toString() // from claim var NumberOfReferences = 0.0 if (References_String != "[]") { @@ -2628,7 +1978,7 @@ class VandalismDetection extends Serializable { DoubleValues(6) = NumberOfReferences } - //8. Feature depending on claim + // 8. Feature depending on claim var Qualifier_String = row(12).toString() // from claim var NumberOfQualifier = 0.0 if (Qualifier_String != "[]") { @@ -2641,7 +1991,7 @@ class VandalismDetection extends Serializable { } - //9. Features depending on claim + // 9. Features depending on claim var Qualifier_String_order = row(12).toString() // from claim var NumberOfQualifier_order = 0.0 if (Qualifier_String_order != "[]") { @@ -2654,7 +2004,7 @@ class VandalismDetection extends Serializable { } - //10. Feature depending on Site link + // 10. Feature depending on Site link var BadgesString = row(13).toString() // from claim var NumberOfBadges = 0.0 if (BadgesString != "[]") { @@ -2667,7 +2017,7 @@ class VandalismDetection extends Serializable { } - //11. Item Title (instead of Item ID) + // 11. Item Title (instead of Item ID) var Item_Id_Title = row(1).toString().replace("Q", "") var Item = Item_Id_Title.trim().toDouble DoubleValues(10) = Item @@ -2688,17 +2038,17 @@ class VandalismDetection extends Serializable { def Revision_Features(row: Row): String = { - //var DoubleValues = new Array[Double](6) + // var DoubleValues = new Array[Double](6) var full_Str_Result = "" - //1. Row from partitioned Pair RDD: + // 1. Row from partitioned Pair RDD: var new_Back_Row = Row() - //2. Revision ID current operation: + // 2. Revision ID current operation: var RevisionID = row(0) - //3. row(2) = represent the Comment: + // 3. 
row(2) = represent the Comment: var fullcomment = row(2).toString() // DoubleValues(0) = length - //1. Revision Language :--------------------------------------------------------------------------------- + // 1. Revision Language :--------------------------------------------------------------------------------- var comment_for_Language = row(2).toString() val CommentLanguageOBJ = new RevisionFeatures() @@ -2709,7 +2059,7 @@ class VandalismDetection extends Serializable { full_Str_Result = "NA".trim() } - //2. Revision Language local:---------------------------------------------------------------------------- + // 2. Revision Language local:---------------------------------------------------------------------------- if (language != "NA") { if (language.contains("-")) { // E.g.Revision ID = 10850 sample1 var LocalLangArray: Array[String] = language.split("-", 2) @@ -2724,7 +2074,7 @@ class VandalismDetection extends Serializable { full_Str_Result = full_Str_Result + "," + "NA" } - //3. Is it Latin Language or Not:------------------------------------------------------------------------- + // 3. Is it Latin Language or Not:------------------------------------------------------------------------- val revisionFeatureOBJ = new RevisionFeatures() val flagLatin = revisionFeatureOBJ.Check_ContainLanguageLatin_NonLatin(language) @@ -2737,26 +2087,26 @@ class VandalismDetection extends Serializable { full_Str_Result = full_Str_Result + "," + "0.0" } - //4. Json Length : be care full to RDD where the json before parsed-------------------------------------- + // 4. Json Length : be care full to RDD where the json before parsed-------------------------------------- // var Jason_Text = row(8).toString() - //replacing_with_Quoto for cleaning the Json tag from extr tags such as ... + // replacing_with_Quoto for cleaning the Json tag from extr tags such as ... var Jason_Text = replacing_with_Quoto(row(0).toString(), row(8).toString()) var Json_Length = Jason_Text.length() full_Str_Result = full_Str_Result + "," + Json_Length.toString() - //5. Revision Action -:----------------------------------------------------------------------- + // 5. Revision Action -:----------------------------------------------------------------------- val CommentProcessOBJ1 = new CommentProcessor() val actions1 = CommentProcessOBJ1.Extract_Actions_FromComments(fullcomment) var ActionsArray1: Array[String] = actions1.split("_", 2) var action1 = ActionsArray1(0).toString() - //var SubAction = ActionsArray(1) + // var SubAction = ActionsArray(1) full_Str_Result = full_Str_Result + "," + action1.trim() - //full_Str_Result = full_Str_Result + "," + SubAction.trim() + // full_Str_Result = full_Str_Result + "," + SubAction.trim() - //6. Revision Prev-Action :------------------------------------------------------------------------------- + // 6. 
Revision Prev-Action :------------------------------------------------------------------------------- if (row(19) != null) { var Prev_fullcomment1 = row(19).toString() val Prev_CommentProcessOBJ1 = new CommentProcessor() @@ -2765,7 +2115,7 @@ class VandalismDetection extends Serializable { var Prev_action1 = ActionsArray1(0).trim() // var Prev_SubAction = ActionsArray(1).trim() full_Str_Result = full_Str_Result + "," + Prev_action1.trim() - //full_Str_Result = full_Str_Result + "," + Prev_SubAction.trim() + // full_Str_Result = full_Str_Result + "," + Prev_SubAction.trim() // println(row(16).toString()) } else { @@ -2798,13 +2148,14 @@ class VandalismDetection extends Serializable { var RevisionParent = row(3).toString() full_Str_Result = full_Str_Result + "," + RevisionParent.toString().trim() - //9. Revision Time Stamp------------------------------------------------------------------------------------------------ + // 9. Revision Time Stamp------------------------------------------------------------------------------------------------ var RevisionTimeZone = row(4).toString() full_Str_Result = full_Str_Result + "," + RevisionTimeZone - //10. Revision Size:------------------------------------------------------------------------------------------------ + // 10. Revision Size:------------------------------------------------------------------------------------------------ - var RevisionBody = row(0).toString() + row(2).toString() + row(3).toString() + row(4).toString() + row(8).toString() + row(14).toString() + row(15).toString() + row(16).toString() + var RevisionBody = row(0).toString() + row(2).toString() + row(3).toString() + row(4).toString() + + row(8).toString() + row(14).toString() + row(15).toString() + row(16).toString() if (row(5).toString() != "0") { RevisionBody = RevisionBody + row(5).toString() @@ -2816,7 +2167,7 @@ class VandalismDetection extends Serializable { } - //11. ContentType: take Action1 as input : -------------------------------------------------------------- + // 11. 
ContentType: take Action1 as input : -------------------------------------------------------------- val CommentProcessOBJ_New = new CommentProcessor() val actions_New = CommentProcessOBJ_New.Extract_Actions_FromComments(fullcomment) @@ -2833,7 +2184,8 @@ class VandalismDetection extends Serializable { var PreviRevision = "" // For Current Revision - CurrentRevision = row(0).toString() + row(2).toString() + row(3).toString() + row(4).toString() + row(8).toString() + row(14).toString() + row(15).toString() + row(16).toString() + CurrentRevision = row(0).toString() + row(2).toString() + row(3).toString() + row(4).toString() + + row(8).toString() + row(14).toString() + row(15).toString() + row(16).toString() if (row(5).toString() != "0") { CurrentRevision = CurrentRevision.trim() + row(5).toString() } else { @@ -2843,10 +2195,12 @@ class VandalismDetection extends Serializable { // For Previous Revision : if (row(17) != null && row(19) != null && row(20) != null && row(21) != null && row(25) != null && row(31) != null && row(32) != null && row(33) != null) { if (row(22) != null && row(22).toString() != "0") { - var PreviRevision = row(17).toString() + row(19).toString() + row(20).toString() + row(21).toString() + row(25).toString() + row(31).toString() + row(32).toString() + row(33).toString() + row(22).toString() + var PreviRevision = row(17).toString() + row(19).toString() + row(20).toString() + row(21).toString() + + row(25).toString() + row(31).toString() + row(32).toString() + row(33).toString() + row(22).toString() } else if (row(23) != null && row(24) != null) { - var PreviRevision = row(17).toString() + row(19).toString() + row(20).toString() + row(21).toString() + row(25).toString() + row(31).toString() + row(32).toString() + row(33).toString() + row(23).toString() + row(24).toString() + var PreviRevision = row(17).toString() + row(19).toString() + row(20).toString() + row(21).toString() + + row(25).toString() + row(31).toString() + row(32).toString() + row(33).toString() + row(23).toString() + row(24).toString() } else { PreviRevision = null @@ -2868,7 +2222,7 @@ class VandalismDetection extends Serializable { } - //13. Time since last Revision: ---------------------------------------------------------------------- + // 13. Time since last Revision: ---------------------------------------------------------------------- if (row(21) != null) { @@ -2886,11 +2240,11 @@ class VandalismDetection extends Serializable { } - //14. Comment Length:--------------------------------------- + // 14. Comment Length:--------------------------------------- var lengthcomment = fullcomment.length().toString() full_Str_Result = full_Str_Result + "," + lengthcomment - //15. Revision SubAction: + // 15. 
Revision SubAction: val CommentProcessOBJ2 = new CommentProcessor() val actions2 = CommentProcessOBJ2.Extract_Actions_FromComments(fullcomment) @@ -2898,7 +2252,7 @@ class VandalismDetection extends Serializable { var SubAction2 = ActionsArray2(1) full_Str_Result = full_Str_Result + "," + SubAction2.trim() - //16.Prev_revision SubAction: + // 16.Prev_revision SubAction: if (row(19) != null) { var Prev_fullcomment2 = row(19).toString() val Prev_CommentProcessOBJ2 = new CommentProcessor() @@ -2921,7 +2275,7 @@ class VandalismDetection extends Serializable { } - //======================== + // ======================== def RoundDouble(va: Double): Double = { @@ -2984,4 +2338,4 @@ class VandalismDetection extends Serializable { } -}// endl class ------- +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/WordsFeatures.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/WordsFeatures.scala index 1cf0ee1..bc3ca45 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/WordsFeatures.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/WordsFeatures.scala @@ -1,7 +1,8 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection -import java.util.regex.{ Pattern, Matcher } -import java.util.{ List, Arrays, ArrayList } +import java.util.{ ArrayList, Arrays, List } +import java.util.regex.{ Matcher, Pattern } + import org.apache.commons.lang3.StringUtils class WordsFeatures extends Serializable { @@ -15,53 +16,53 @@ class WordsFeatures extends Serializable { def Vector_Words_Feature(StrValue: String): Array[Double] = { var RatioValues = new Array[Double](17) val WordsFeature_OBJ = new WordsFeatures() - //1. Double for LanguageWord Ratio - ok + // 1. Double for LanguageWord Ratio - ok val LanguageWord = LanguageWordRatio_Character(StrValue) if (!LanguageWord.isNaN()) { RatioValues(0) = RoundDouble(LanguageWord) } - //2. Boolean --> Double for Contain language word - ok (1 Boolean) + // 2. Boolean --> Double for Contain language word - ok (1 Boolean) val IsContainLanguageWord = ContainLanguageWord(StrValue) if (IsContainLanguageWord == true) { RatioValues(1) = 1.0 } else if (IsContainLanguageWord == false) { RatioValues(1) = 0.0 } - //3.Double for LowerCaseWord Ratio - ok + // 3.Double for LowerCaseWord Ratio - ok val LowerCaseWord = LowercaseWordRation(StrValue) if (!LowerCaseWord.isNaN()) { RatioValues(2) = RoundDouble(LowerCaseWord) } - //4.Integer --> to Double for LongestWord - ok (1 Integer) + // 4.Integer --> to Double for LongestWord - ok (1 Integer) val LongWord = LongestWord(StrValue) if (LongWord != null) { val castedValue = LongWord.toDouble RatioValues(3) = castedValue } - //5.Boolean --> Double for word Contain URL -ok(2 boolean) + // 5.Boolean --> Double for word Contain URL -ok(2 boolean) val IsWordContainURL = ContainURLWord(StrValue) if (IsWordContainURL == true) { RatioValues(4) = 1.0 } else if (IsWordContainURL == false) { RatioValues(4) = 0.0 } - //6.Double for Bad Word Ratio - ok + // 6.Double for Bad Word Ratio - ok val BadWord = BadWordRation(StrValue) if (!BadWord.isNaN()) { RatioValues(5) = RoundDouble(BadWord) } - //7. Double for UppercaseWord Ratio -ok + // 7. 
Double for UppercaseWord Ratio -ok val UpperCaseWord = UppercaseWordRation(StrValue) if (!UpperCaseWord.isNaN()) { RatioValues(6) = RoundDouble(UpperCaseWord) } - //8.Double for Ban Word Ratio - ok + // 8.Double for Ban Word Ratio - ok val BanWord = BanWordRation(StrValue) if (!BanWord.isNaN()) { RatioValues(7) = RoundDouble(BanWord) } - //9.Boolean Femal FirstName (3 Boolean ) + // 9.Boolean Femal FirstName (3 Boolean ) val IsFemalFirstName = FemaleName_word(StrValue) if (IsFemalFirstName == true) { @@ -70,7 +71,7 @@ class WordsFeatures extends Serializable { RatioValues(8) = 0.0 } - //10. Boolean Male FirstName (4 Boolean) + // 10. Boolean Male FirstName (4 Boolean) val IsMaleFirstName = MaleName_word(StrValue) if (IsMaleFirstName == true) { RatioValues(9) = 1.0 @@ -78,7 +79,7 @@ class WordsFeatures extends Serializable { RatioValues(9) = 0.0 } - //11. Boolean containBadWord_word (5 Boolean ) + // 11. Boolean containBadWord_word (5 Boolean ) val IsContainBad_Word = containBadWord_word(StrValue) if (IsContainBad_Word == true) { @@ -87,7 +88,7 @@ class WordsFeatures extends Serializable { RatioValues(10) = 0.0 } - //12. Boolean containBanWord_word (6 Boolean) + // 12. Boolean containBanWord_word (6 Boolean) val IsContainBan_Word = BanBuilderWordlist_word(StrValue) if (IsContainBan_Word == true) { @@ -124,16 +125,69 @@ class WordsFeatures extends Serializable { } - //1.Language Words Ratio : - val regex_LanguageWordRatio: String = "(a(frikaa?ns|lbanian?|lemanha|ng(lais|ol)|ra?b(e?|[ei]c|ian?|isc?h)|rmenian?|ssamese|azeri|z[e\\u0259]rba(ijani?|ycan(ca)?|yjan)|\\u043d\\u0433\\u043b\\u0438\\u0439\\u0441\\u043a\\u0438\\u0439)|b(ahasa( (indonesia|jawa|malaysia|melayu))?|angla|as(k|qu)e|[aeo]ng[ao]?li|elarusian?|okm\\u00e5l|osanski|ra[sz]il(ian?)?|ritish( kannada)?|ulgarian?)|c(ebuano|hina|hinese( simplified)?|zech|roat([eo]|ian?)|atal[a\\u00e0]n?|\\u0440\\u043f\\u0441\\u043a\\u0438|antonese)|[c\\u010d](esky|e[s\\u0161]tina)\r\n|d(an(isc?h|sk)|e?uts?ch)|e(esti|ll[hi]nika|ng(els|le(ski|za)|lisc?h)|spa(g?[n\\u00f1]h?i?ol|nisc?h)|speranto|stonian|usk[ae]ra)|f(ilipino|innish|ran[c\\u00e7](ais|e|ez[ao])|ren[cs]h|arsi|rancese)|g(al(ego|ician)|uja?rati|ree(ce|k)|eorgian|erman[ay]?|ilaki)|h(ayeren|ebrew|indi|rvatski|ungar(y|ian))|i(celandic|ndian?|ndonesian?|ngl[e\\u00ea]se?|ngilizce|tali(ano?|en(isch)?))|ja(pan(ese)?|vanese)|k(a(nn?ada|zakh)|hmer|o(rean?|sova)|urd[i\\u00ee])|l(at(in[ao]?|vi(an?|e[s\\u0161]u))|ietuvi[u\\u0173]|ithuanian?)|m(a[ck]edon(ian?|ski)|agyar|alay(alam?|sian?)?|altese|andarin|arathi|elayu|ontenegro|ongol(ian?)|yanmar)|n(e(d|th)erlands?|epali|orw(ay|egian)|orsk( bokm[a\\u00e5]l)?|ynorsk)|o(landese|dia)|p(ashto|ersi?an?|ol(n?isc?h|ski)|or?tugu?[e\\u00ea]se?(( d[eo])? 
brasil(eiro)?| ?\\(brasil\\))?|unjabi)|r(om[a\\u00e2i]ni?[a\\u0103]n?|um(ano|\\u00e4nisch)|ussi([ao]n?|sch))|s(anskrit|erbian|imple english|inha?la|lov(ak(ian?)?|en\\u0161?[c\\u010d]ina|en(e|ij?an?)|uomi)|erbisch|pagnolo?|panisc?h|rbeska|rpski|venska|c?wedisc?h|hqip)|t(a(galog|mil)|elugu|hai(land)?|i[e\\u1ebf]ng vi[e\\u1ec7]t|[u\\u00fc]rk([c\\u00e7]e|isc?h|i\\u015f|ey))|u(rdu|zbek)|v(alencia(no?)?|ietnamese)|welsh|(\\u0430\\u043d\\u0433\\u043b\\u0438\\u0438\\u0441|[k\\u043a]\\u0430\\u043b\\u043c\\u044b\\u043a\\u0441|[k\\u043a]\\u0430\\u0437\\u0430\\u0445\\u0441|\\u043d\\u0435\\u043c\\u0435\\u0446|[p\\u0440]\\u0443\\u0441\\u0441|[y\\u0443]\\u0437\\u0431\\u0435\\u043a\\u0441)\\u043a\\u0438\\u0439( \\u044f\\u0437\\u044b\\u043a)??|\\u05e2\\u05d1\\u05e8\\u05d9\\u05ea|[k\\u043a\\u049b](\\u0430\\u0437\\u0430[\\u043a\\u049b]\\u0448\\u0430|\\u044b\\u0440\\u0433\\u044b\\u0437\\u0447\\u0430|\\u0438\\u0440\\u0438\\u043b\\u043b)|\\u0443\\u043a\\u0440\\u0430\\u0457\\u043d\\u0441\\u044c\\u043a(\\u0430|\\u043e\\u044e)|\\u0431(\\u0435\\u043b\\u0430\\u0440\\u0443\\u0441\\u043a\\u0430\\u044f|\\u044a\\u043b\\u0433\\u0430\\u0440\\u0441\\u043a\\u0438( \\u0435\\u0437\\u0438\\u043a)?)|\\u03b5\\u03bb\\u03bb[\\u03b7\\u03b9]\\u03bd\\u03b9\\u03ba(\\u03ac|\\u03b1)|\\u10e5\\u10d0\\u10e0\\u10d7\\u10e3\\u10da\\u10d8|\\u0939\\u093f\\u0928\\u094d\\u0926\\u0940|\\u0e44\\u0e17\\u0e22|[m\\u043c]\\u043e\\u043d\\u0433\\u043e\\u043b(\\u0438\\u0430)?|([c\\u0441]\\u0440\\u043f|[m\\u043c]\\u0430\\u043a\\u0435\\u0434\\u043e\\u043d)\\u0441\\u043a\\u0438|\\u0627\\u0644\\u0639\\u0631\\u0628\\u064a\\u0629|\\u65e5\\u672c\\u8a9e|\\ud55c\\uad6d(\\ub9d0|\\uc5b4)|\\u200c\\u0939\\u093f\\u0928\\u0926\\u093c\\u093f|\\u09ac\\u09be\\u0982\\u09b2\\u09be|\\u0a2a\\u0a70\\u0a1c\\u0a3e\\u0a2c\\u0a40|\\u092e\\u0930\\u093e\\u0920\\u0940|\\u0c95\\u0ca8\\u0ccd\\u0ca8\\u0ca1|\\u0627\\u064f\\u0631\\u062f\\u064f\\u0648|\\u0ba4\\u0bae\\u0bbf\\u0bb4\\u0bcd|\\u0c24\\u0c46\\u0c32\\u0c41\\u0c17\\u0c41|\\u0a97\\u0ac1\\u0a9c\\u0ab0\\u0abe\\u0aa4\\u0ac0|\\u0641\\u0627\\u0631\\u0633\\u06cc|\\u067e\\u0627\\u0631\\u0633\\u06cc|\\u0d2e\\u0d32\\u0d2f\\u0d3e\\u0d33\\u0d02|\\u067e\\u069a\\u062a\\u0648|\\u1019\\u103c\\u1014\\u103a\\u1019\\u102c\\u1018\\u102c\\u101e\\u102c|\\u4e2d\\u6587(\\u7b80\\u4f53|\\u7e41\\u9ad4)?|\\u4e2d\\u6587\\uff08(\\u7b80\\u4f53?|\\u7e41\\u9ad4)\\uff09|\\u7b80\\u4f53|\\u7e41\\u9ad4)"; + // 1.Language Words Ratio : + val regex_LanguageWordRatio: String = "(a(frikaa?ns|lbanian?|lemanha|ng(lais|ol)|ra?b(e?|[ei]c|ian?|isc?h)" + + "|rmenian?|ssamese|azeri|z[e\\u0259]rba(ijani?|ycan(ca)?|yjan)|\\u043d\\u0433\\u043b\\u0438\\u0439\\u0441\\u043a\\u0438\\u0439)" + + "|b(ahasa( (indonesia|jawa|malaysia|melayu))?|angla|as(k|qu)e|[aeo]ng[ao]?li|elarusian?|okm\\u00e5l|osanski" + + "|ra[sz]il(ian?)?|ritish( kannada)?|ulgarian?)|c(ebuano|hina|hinese( simplified)?|zech|roat([eo]|ian?)" + + "|atal[a\\u00e0]n?|\\u0440\\u043f\\u0441\\u043a\\u0438|antonese)|[c\\u010d](esky|e[s\\u0161]tina)\r\n" + + "|d(an(isc?h|sk)|e?uts?ch)|e(esti|ll[hi]nika|ng(els|le(ski|za)|lisc?h)|spa(g?[n\\u00f1]h?i?ol|nisc?h)" + + "|speranto|stonian|usk[ae]ra)|f(ilipino|innish|ran[c\\u00e7](ais|e|ez[ao])|ren[cs]h|arsi|rancese)" + + "|g(al(ego|ician)|uja?rati|ree(ce|k)|eorgian|erman[ay]?|ilaki)|h(ayeren|ebrew|indi|rvatski|ungar(y|ian))" + + "|i(celandic|ndian?|ndonesian?|ngl[e\\u00ea]se?|ngilizce|tali(ano?|en(isch)?))|ja(pan(ese)?|vanese)" + + "|k(a(nn?ada|zakh)|hmer|o(rean?|sova)|urd[i\\u00ee])|l(at(in[ao]?|vi(an?|e[s\\u0161]u))|ietuvi[u\\u0173]" + + 
"|ithuanian?)|m(a[ck]edon(ian?|ski)|agyar|alay(alam?|sian?)?|altese|andarin|arathi|elayu|ontenegro|ongol(ian?)" + + "|yanmar)|n(e(d|th)erlands?|epali|orw(ay|egian)|orsk( bokm[a\\u00e5]l)?|ynorsk)|o(landese|dia)|p(ashto" + + "|ersi?an?|ol(n?isc?h|ski)|or?tugu?[e\\u00ea]se?(( d[eo])? brasil(eiro)?| ?\\(brasil\\))?|unjabi)|r(om[a\\u00e2i]ni?[a\\u0103]n?|um(ano" + + "|\\u00e4nisch)|ussi([ao]n?|sch))|s(anskrit|erbian|imple english|inha?la|lov(ak(ian?)?|en\\u0161?[c\\u010d]ina|en(e|ij?an?)|uomi)" + + "|erbisch|pagnolo?|panisc?h|rbeska|rpski|venska|c?wedisc?h|hqip)|t(a(galog|mil)|elugu|hai(land)?|i[e\\u1ebf]ngv i[e\\u1ec7]t" + + "|[u\\u00fc]rk([c\\u00e7]e|isc?h|i\\u015f|ey))|u(rdu|zbek)|v(alencia(no?)?|ietnamese)|welsh" + + "|(\\u0430\\u043d\\u0433\\u043b\\u0438\\u0438\\u0441|[k\\u043a]\\u0430\\u043b\\u043c\\u044b\\u043a\\u0441" + + "|[k\\u043a]\\u0430\\u0437\\u0430\\u0445\\u0441|\\u043d\\u0435\\u043c\\u0435\\u0446|[p\\u0440]\\u0443\\u0441\\u0441" + + "|[y\\u0443]\\u0437\\u0431\\u0435\\u043a\\u0441)\\u043a\\u0438\\u0439( \\u044f\\u0437\\u044b\\u043a)??|\\u05e2\\u05d1\\u05e8\\u05d9\\u05ea" + + "|[k\\u043a\\u049b](\\u0430\\u0437\\u0430[\\u043a\\u049b]\\u0448\\u0430|\\u044b\\u0440\\u0433\\u044b\\u0437\\u0447\\u0430|\\u0438\\u0440\\u0438\\u043b\\u043b)" + + "|\\u0443\\u043a\\u0440\\u0430\\u0457\\u043d\\u0441\\u044c\\u043a(\\u0430|\\u043e\\u044e)|\\u0431(\\u0435\\u043b\\u0430\\u0440\\u0443\\u0441\\u043a\\u0430\\u044f" + + "|\\u044a\\u043b\\u0433\\u0430\\u0440\\u0441\\u043a\\u0438( \\u0435\\u0437\\u0438\\u043a)?)|\\u03b5\\u03bb\\u03bb[\\u03b7\\u03b9]\\u03bd\\u03b9\\u03ba(\\u03ac|\\u03b1)" + + "|\\u10e5\\u10d0\\u10e0\\u10d7\\u10e3\\u10da\\u10d8|\\u0939\\u093f\\u0928\\u094d\\u0926\\u0940|\\u0e44\\u0e17\\u0e22|[m\\u043c]\\u043e\\u043d\\u0433\\u043e\\u043b(\\u0438\\u0430)?" + + "|([c\\u0441]\\u0440\\u043f|[m\\u043c]\\u0430\\u043a\\u0435\\u0434\\u043e\\u043d)\\u0441\\u043a\\u0438|\\u0627\\u0644\\u0639\\u0631\\u0628\\u064a\\u0629|\\u65e5\\u672c\\u8a9e" + + "|\\ud55c\\uad6d(\\ub9d0|\\uc5b4)|\\u200c\\u0939\\u093f\\u0928\\u0926\\u093c\\u093f|\\u09ac\\u09be\\u0982\\u09b2\\u09be|\\u0a2a\\u0a70\\u0a1c\\u0a3e\\u0a2c\\u0a40" + + "|\\u092e\\u0930\\u093e\\u0920\\u0940|\\u0c95\\u0ca8\\u0ccd\\u0ca8\\u0ca1|\\u0627\\u064f\\u0631\\u062f\\u064f\\u0648|\\u0ba4\\u0bae\\u0bbf\\u0bb4\\u0bcd" + + "|\\u0c24\\u0c46\\u0c32\\u0c41\\u0c17\\u0c41|\\u0a97\\u0ac1\\u0a9c\\u0ab0\\u0abe\\u0aa4\\u0ac0|\\u0641\\u0627\\u0631\\u0633\\u06cc|\\u067e\\u0627\\u0631\\u0633\\u06cc" + + "|\\u0d2e\\u0d32\\u0d2f\\u0d3e\\u0d33\\u0d02|\\u067e\\u069a\\u062a\\u0648|\\u1019\\u103c\\u1014\\u103a\\u1019\\u102c\\u1018\\u102c\\u101e\\u102c|\\u4e2d\\u6587(\\u7b80\\u4f53" + + "|\\u7e41\\u9ad4)?|\\u4e2d\\u6587\\uff08(\\u7b80\\u4f53?|\\u7e41\\u9ad4)\\uff09|\\u7b80\\u4f53|\\u7e41\\u9ad4)" + val pattern_LanguageWordRatio: Pattern = Pattern.compile(regex_LanguageWordRatio); def LanguageWordRatio_Character(str: String): Double = { val result: Double = WordRatio(str, pattern_LanguageWordRatio) result } - //2. 
Contain language word : - val regex_ContainLanguageWord: String = "(^|\\n)([ei]n )??(a(frikaa?ns|lbanian?|lemanha|ng(lais|ol)|ra?b(e?|[ei]c|ian?|isc?h)|rmenian?|ssamese|azeri|z[e\\u0259]rba(ijani?|ycan(ca)?|yjan)|\\u043d\\u0433\\u043b\\u0438\\u0439\\u0441\\u043a\\u0438\\u0439)|b(ahasa( (indonesia|jawa|malaysia|melayu))?|angla|as(k|qu)e|[aeo]ng[ao]?li|elarusian?|okm\\u00e5l|osanski|ra[sz]il(ian?)?|ritish( kannada)?|ulgarian?)|c(ebuano|hina|hinese( simplified)?|zech|roat([eo]|ian?)|atal[a\\u00e0]n?|\\u0440\\u043f\\u0441\\u043a\\u0438|antonese)|[c\\u010d](esky|e[s\\u0161]tina)\r\n|d(an(isc?h|sk)|e?uts?ch)|e(esti|ll[hi]nika|ng(els|le(ski|za)|lisc?h)|spa(g?[n\\u00f1]h?i?ol|nisc?h)|speranto|stonian|usk[ae]ra)|f(ilipino|innish|ran[c\\u00e7](ais|e|ez[ao])|ren[cs]h|arsi|rancese)|g(al(ego|ician)|uja?rati|ree(ce|k)|eorgian|erman[ay]?|ilaki)|h(ayeren|ebrew|indi|rvatski|ungar(y|ian))|i(celandic|ndian?|ndonesian?|ngl[e\\u00ea]se?|ngilizce|tali(ano?|en(isch)?))|ja(pan(ese)?|vanese)|k(a(nn?ada|zakh)|hmer|o(rean?|sova)|urd[i\\u00ee])|l(at(in[ao]?|vi(an?|e[s\\u0161]u))|ietuvi[u\\u0173]|ithuanian?)|m(a[ck]edon(ian?|ski)|agyar|alay(alam?|sian?)?|altese|andarin|arathi|elayu|ontenegro|ongol(ian?)|yanmar)|n(e(d|th)erlands?|epali|orw(ay|egian)|orsk( bokm[a\\u00e5]l)?|ynorsk)|o(landese|dia)|p(ashto|ersi?an?|ol(n?isc?h|ski)|or?tugu?[e\\u00ea]se?(( d[eo])? brasil(eiro)?| ?\\(brasil\\))?|unjabi)|r(om[a\\u00e2i]ni?[a\\u0103]n?|um(ano|\\u00e4nisch)|ussi([ao]n?|sch))|s(anskrit|erbian|imple english|inha?la|lov(ak(ian?)?|en\\u0161?[c\\u010d]ina|en(e|ij?an?)|uomi)|erbisch|pagnolo?|panisc?h|rbeska|rpski|venska|c?wedisc?h|hqip)|t(a(galog|mil)|elugu|hai(land)?|i[e\\u1ebf]ng vi[e\\u1ec7]t|[u\\u00fc]rk([c\\u00e7]e|isc?h|i\\u015f|ey))|u(rdu|zbek)|v(alencia(no?)?|ietnamese)|welsh|(\\u0430\\u043d\\u0433\\u043b\\u0438\\u0438\\u0441|[k\\u043a]\\u0430\\u043b\\u043c\\u044b\\u043a\\u0441|[k\\u043a]\\u0430\\u0437\\u0430\\u0445\\u0441|\\u043d\\u0435\\u043c\\u0435\\u0446|[p\\u0440]\\u0443\\u0441\\u0441|[y\\u0443]\\u0437\\u0431\\u0435\\u043a\\u0441)\\u043a\\u0438\\u0439( \\u044f\\u0437\\u044b\\u043a)??|\\u05e2\\u05d1\\u05e8\\u05d9\\u05ea|[k\\u043a\\u049b](\\u0430\\u0437\\u0430[\\u043a\\u049b]\\u0448\\u0430|\\u044b\\u0440\\u0433\\u044b\\u0437\\u0447\\u0430|\\u0438\\u0440\\u0438\\u043b\\u043b)|\\u0443\\u043a\\u0440\\u0430\\u0457\\u043d\\u0441\\u044c\\u043a(\\u0430|\\u043e\\u044e)|\\u0431(\\u0435\\u043b\\u0430\\u0440\\u0443\\u0441\\u043a\\u0430\\u044f|\\u044a\\u043b\\u0433\\u0430\\u0440\\u0441\\u043a\\u0438( 
\\u0435\\u0437\\u0438\\u043a)?)|\\u03b5\\u03bb\\u03bb[\\u03b7\\u03b9]\\u03bd\\u03b9\\u03ba(\\u03ac|\\u03b1)|\\u10e5\\u10d0\\u10e0\\u10d7\\u10e3\\u10da\\u10d8|\\u0939\\u093f\\u0928\\u094d\\u0926\\u0940|\\u0e44\\u0e17\\u0e22|[m\\u043c]\\u043e\\u043d\\u0433\\u043e\\u043b(\\u0438\\u0430)?|([c\\u0441]\\u0440\\u043f|[m\\u043c]\\u0430\\u043a\\u0435\\u0434\\u043e\\u043d)\\u0441\\u043a\\u0438|\\u0627\\u0644\\u0639\\u0631\\u0628\\u064a\\u0629|\\u65e5\\u672c\\u8a9e|\\ud55c\\uad6d(\\ub9d0|\\uc5b4)|\\u200c\\u0939\\u093f\\u0928\\u0926\\u093c\\u093f|\\u09ac\\u09be\\u0982\\u09b2\\u09be|\\u0a2a\\u0a70\\u0a1c\\u0a3e\\u0a2c\\u0a40|\\u092e\\u0930\\u093e\\u0920\\u0940|\\u0c95\\u0ca8\\u0ccd\\u0ca8\\u0ca1|\\u0627\\u064f\\u0631\\u062f\\u064f\\u0648|\\u0ba4\\u0bae\\u0bbf\\u0bb4\\u0bcd|\\u0c24\\u0c46\\u0c32\\u0c41\\u0c17\\u0c41|\\u0a97\\u0ac1\\u0a9c\\u0ab0\\u0abe\\u0aa4\\u0ac0|\\u0641\\u0627\\u0631\\u0633\\u06cc|\\u067e\\u0627\\u0631\\u0633\\u06cc|\\u0d2e\\u0d32\\u0d2f\\u0d3e\\u0d33\\u0d02|\\u067e\\u069a\\u062a\\u0648|\\u1019\\u103c\\u1014\\u103a\\u1019\\u102c\\u1018\\u102c\\u101e\\u102c|\\u4e2d\\u6587(\\u7b80\\u4f53|\\u7e41\\u9ad4)?|\\u4e2d\\u6587\\uff08(\\u7b80\\u4f53?|\\u7e41\\u9ad4)\\uff09|\\u7b80\\u4f53|\\u7e41\\u9ad4)( language)??($|\\n)"; + // 2. Contain language word : + val regex_ContainLanguageWord: String = "(^|\\n)([ei]n )??(a(frikaa?ns|lbanian?|lemanha|ng(lais|ol)|ra?b(e?|[ei]c|ian?|isc?h)" + + "|rmenian?|ssamese|azeri|z[e\\u0259]rba(ijani?|ycan(ca)?|yjan)|\\u043d\\u0433\\u043b\\u0438\\u0439\\u0441\\u043a\\u0438\\u0439)" + + "|b(ahasa( (indonesia|jawa|malaysia|melayu))?|angla|as(k|qu)e|[aeo]ng[ao]?li|elarusian?|okm\\u00e5l|osanski|ra[sz]il(ian?)?" + + "|ritish( kannada)?|ulgarian?)|c(ebuano|hina|hinese( simplified)?|zech|roat([eo]|ian?)|atal[a\\u00e0]n?|\\u0440\\u043f\\u0441\\u043a\\u0438|antonese)" + + "|[c\\u010d](esky|e[s\\u0161]tina)\r\n|d(an(isc?h|sk)|e?uts?ch)|e(esti|ll[hi]nika|ng(els|le(ski|za)|lisc?h)|spa(g?[n\\u00f1]h?i?ol|nisc?h)" + + "|speranto|stonian|usk[ae]ra)|f(ilipino|innish|ran[c\\u00e7](ais|e|ez[ao])|ren[cs]h|arsi|rancese)|g(al(ego|ician)|uja?rati|ree(ce|k)" + + "|eorgian|erman[ay]?|ilaki)|h(ayeren|ebrew|indi|rvatski|ungar(y|ian))|i(celandic|ndian?|ndonesian?|ngl[e\\u00ea]se?|ngilizce|tali(ano?|en(isch)?))" + + "|ja(pan(ese)?|vanese)|k(a(nn?ada|zakh)|hmer|o(rean?|sova)|urd[i\\u00ee])|l(at(in[ao]?|vi(an?|e[s\\u0161]u))|ietuvi[u\\u0173]|ithuanian?)|m(a[ck]edon(ian?" + + "|ski)|agyar|alay(alam?|sian?)?|altese|andarin|arathi|elayu|ontenegro|ongol(ian?)|yanmar)|n(e(d|th)erlands?|epali|orw(ay|egian)|orsk( bokm[a\\u00e5]l)?|ynorsk)|o(landese|dia)" + + "|p(ashto|ersi?an?|ol(n?isc?h|ski)|or?tugu?[e\\u00ea]se?(( d[eo])? 
brasil(eiro)?| ?\\(brasil\\))?|unjabi)|r(om[a\\u00e2i]ni?[a\\u0103]n?|um(ano|\\u00e4nisch)" + + "|ussi([ao]n?|sch))|s(anskrit|erbian|imple english|inha?la|lov(ak(ian?)?|en\\u0161?[c\\u010d]ina|en(e|ij?an?)|uomi)|erbisch|pagnolo?|panisc?h|rbeska|rpski|venska|c?wedisc?h|hqip)" + + "|t(a(galog|mil)|elugu|hai(land)?|i[e\\u1ebf]ng vi[e\\u1ec7]t|[u\\u00fc]rk([c\\u00e7]e|isc?h|i\\u015f|ey))|u(rdu|zbek)|v(alencia(no?)?|ietnamese)" + + "|welsh|(\\u0430\\u043d\\u0433\\u043b\\u0438\\u0438\\u0441|[k\\u043a]\\u0430\\u043b\\u043c\\u044b\\u043a\\u0441|[k\\u043a]\\u0430\\u0437\\u0430\\u0445\\u0441|\\u043d\\u0435\\u043c\\u0435\\u0446" + + "|[p\\u0440]\\u0443\\u0441\\u0441|[y\\u0443]\\u0437\\u0431\\u0435\\u043a\\u0441)\\u043a\\u0438\\u0439( \\u044f\\u0437\\u044b\\u043a)??|\\u05e2\\u05d1\\u05e8\\u05d9\\u05ea" + + "|[k\\u043a\\u049b](\\u0430\\u0437\\u0430[\\u043a\\u049b]\\u0448\\u0430|\\u044b\\u0440\\u0433\\u044b\\u0437\\u0447\\u0430|\\u0438\\u0440\\u0438\\u043b\\u043b)" + + "|\\u0443\\u043a\\u0440\\u0430\\u0457\\u043d\\u0441\\u044c\\u043a(\\u0430|\\u043e\\u044e)|\\u0431(\\u0435\\u043b\\u0430\\u0440\\u0443\\u0441\\u043a\\u0430\\u044f" + + "|\\u044a\\u043b\\u0433\\u0430\\u0440\\u0441\\u043a\\u0438( \\u0435\\u0437\\u0438\\u043a)?)|\\u03b5\\u03bb\\u03bb[\\u03b7\\u03b9]\\u03bd\\u03b9\\u03ba(\\u03ac|\\u03b1)" + + "|\\u10e5\\u10d0\\u10e0\\u10d7\\u10e3\\u10da\\u10d8|\\u0939\\u093f\\u0928\\u094d\\u0926\\u0940|\\u0e44\\u0e17\\u0e22|[m\\u043c]\\u043e\\u043d\\u0433\\u043e\\u043b(\\u0438\\u0430)?|([c\\u0441]\\u0440\\u043f" + + "|[m\\u043c]\\u0430\\u043a\\u0435\\u0434\\u043e\\u043d)\\u0441\\u043a\\u0438|\\u0627\\u0644\\u0639\\u0631\\u0628\\u064a\\u0629|\\u65e5\\u672c\\u8a9e|\\ud55c\\uad6d(\\ub9d0" + + "|\\uc5b4)|\\u200c\\u0939\\u093f\\u0928\\u0926\\u093c\\u093f|\\u09ac\\u09be\\u0982\\u09b2\\u09be|\\u0a2a\\u0a70\\u0a1c\\u0a3e\\u0a2c\\u0a40|\\u092e\\u0930\\u093e\\u0920\\u0940" + + "|\\u0c95\\u0ca8\\u0ccd\\u0ca8\\u0ca1|\\u0627\\u064f\\u0631\\u062f\\u064f\\u0648|\\u0ba4\\u0bae\\u0bbf\\u0bb4\\u0bcd|\\u0c24\\u0c46\\u0c32\\u0c41\\u0c17\\u0c41" + + "|\\u0a97\\u0ac1\\u0a9c\\u0ab0\\u0abe\\u0aa4\\u0ac0|\\u0641\\u0627\\u0631\\u0633\\u06cc|\\u067e\\u0627\\u0631\\u0633\\u06cc|\\u0d2e\\u0d32\\u0d2f\\u0d3e\\u0d33\\u0d02" + + "|\\u067e\\u069a\\u062a\\u0648|\\u1019\\u103c\\u1014\\u103a\\u1019\\u102c\\u1018\\u102c\\u101e\\u102c|\\u4e2d\\u6587(\\u7b80\\u4f53|\\u7e41\\u9ad4)?|\\u4e2d\\u6587\\uff08(\\u7b80\\u4f53?" + + "|\\u7e41\\u9ad4)\\uff09|\\u7b80\\u4f53|\\u7e41\\u9ad4)( language)??($|\\n)" + val pattern_ContainLanguageWord: Pattern = Pattern.compile(regex_ContainLanguageWord); val matcher_ContainLanguageWord: Matcher = pattern_ContainLanguageWord.matcher(""); def ContainLanguageWord(str: String): Boolean = { @@ -149,20 +203,20 @@ class WordsFeatures extends Serializable { result } - //3. Upper case word Ratio: + // 3. Upper case word Ratio: def UppercaseWordRation(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{Lu}.*") val result: Double = WordRatio(str, pattern) result } - //4. Lower case word Ratio: + // 4. 
Lower case word Ratio: def LowercaseWordRation(str: String): Double = { val pattern: Pattern = Pattern.compile("[\\p{L}&&[^\\p{Lu}]].*") val result: Double = WordRatio(str, pattern) result } - //5.word Contain URL : + // 5.word Contain URL : val pattern_WordContainURL: Pattern = Pattern.compile("\\b(https?:\\/\\/|www\\.)\\S{10}.*", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE | Pattern.DOTALL | Pattern.CANON_EQ) val matcher_WordContainURL: Matcher = pattern_WordContainURL.matcher(""); @@ -179,7 +233,7 @@ class WordsFeatures extends Serializable { result } - //6. Longest Word + // 6. Longest Word val pattern_longestWord: Pattern = Pattern.compile("\\p{IsAlphabetic}+", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE | Pattern.DOTALL | Pattern.CANON_EQ); val matcher_longestWord: Matcher = pattern_WordContainURL.matcher(""); @@ -203,7 +257,7 @@ class WordsFeatures extends Serializable { max } - //7. Bad Word : It is Ok + // 7. Bad Word : It is Ok val luisVonAhnWordlist: Array[String] = Array("abbo", "abo", "abortion", "abuse", "addict", "addicts", "adult", "africa", @@ -465,7 +519,7 @@ class WordsFeatures extends Serializable { } - //8. Contain Bad Word:It is ok + // 8. Contain Bad Word:It is ok val tokens_containbadword: List[String] = new ArrayList[String](Arrays.asList(luisVonAhnWordlist: _*)) val patternString_containBadword: String = ".*\\b(" + StringUtils.join(tokens_containbadword, "|") + ")\\b.*" val pattern_containBadword: Pattern = Pattern.compile(patternString, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE | Pattern.DOTALL | Pattern.CANON_EQ) @@ -481,7 +535,7 @@ class WordsFeatures extends Serializable { results } - //9.Ban Builder Word:It is OK + // 9.Ban Builder Word:It is OK val BanBuilderWordlist: Array[String] = Array("$#!+", "$1ut", "$h1t", "$hit", "$lut", "'ho", "'hobag", "a$$", "anal", "anus", "ass", "assmunch", "b1tch", "ballsack", "bastard", "beaner", @@ -629,7 +683,7 @@ class WordsFeatures extends Serializable { results } - //10 Ban word Ratio: + // 10 Ban word Ratio: val tokens_ban: List[String] = new ArrayList[String](Arrays.asList(BanBuilderWordlist: _*)) val patternString_ban: String = StringUtils.join(tokens_ban, "|") val pattern_banWord: Pattern = Pattern.compile(patternString_ban) @@ -645,8 +699,34 @@ class WordsFeatures extends Serializable { } - //11.Contain language word:It is ok - val regex_containLanguageWord: String = ".*(a(frikaa?ns|lbanian?|lemanha|ng(lais|ol)|ra?b(e?|[ei]c|ian?|isc?h)|rmenian?|ssamese|azeri|z[e\\u0259]rba(ijani?|ycan(ca)?|yjan)|\\u043d\\u0433\\u043b\\u0438\\u0439\\u0441\\u043a\\u0438\\u0439)|b(ahasa( (indonesia|jawa|malaysia|melayu))?|angla|as(k|qu)e|[aeo]ng[ao]?li|elarusian?|okm\\u00e5l|osanski|ra[sz]il(ian?)?|ritish( kannada)?|ulgarian?)|c(ebuano|hina|hinese( 
simplified)?|zech|roat([eo]|ian?)|atal[a\\u00e0]n?|\\u0440\\u043f\\u0441\\u043a\\u0438|antonese)|[c\\u010d](esky|e[s\\u0161]tina)\r\n|d(an(isc?h|sk)|e?uts?ch)|e(esti|ll[hi]nika|ng(els|le(ski|za)|lisc?h)|spa(g?[n\\u00f1]h?i?ol|nisc?h)|speranto|stonian|usk[ae]ra)|f(ilipino|innish|ran[c\\u00e7](ais|e|ez[ao])|ren[cs]h|arsi|rancese)|g(al(ego|ician)|uja?rati|ree(ce|k)|eorgian|erman[ay]?|ilaki)|h(ayeren|ebrew|indi|rvatski|ungar(y|ian))|i(celandic|ndian?|ndonesian?|ngl[e\\u00ea]se?|ngilizce|tali(ano?|en(isch)?))|ja(pan(ese)?|vanese)|k(a(nn?ada|zakh)|hmer|o(rean?|sova)|urd[i\\u00ee])|l(at(in[ao]?|vi(an?|e[s\\u0161]u))|ietuvi[u\\u0173]|ithuanian?)|m(a[ck]edon(ian?|ski)|agyar|alay(alam?|sian?)?|altese|andarin|arathi|elayu|ontenegro|ongol(ian?)|yanmar)|n(e(d|th)erlands?|epali|orw(ay|egian)|orsk( bokm[a\\u00e5]l)?|ynorsk)|o(landese|dia)|p(ashto|ersi?an?|ol(n?isc?h|ski)|or?tugu?[e\\u00ea]se?(( d[eo])? brasil(eiro)?| ?\\(brasil\\))?|unjabi)|r(om[a\\u00e2i]ni?[a\\u0103]n?|um(ano|\\u00e4nisch)|ussi([ao]n?|sch))|s(anskrit|erbian|imple english|inha?la|lov(ak(ian?)?|en\\u0161?[c\\u010d]ina|en(e|ij?an?)|uomi)|erbisch|pagnolo?|panisc?h|rbeska|rpski|venska|c?wedisc?h|hqip)|t(a(galog|mil)|elugu|hai(land)?|i[e\\u1ebf]ng vi[e\\u1ec7]t|[u\\u00fc]rk([c\\u00e7]e|isc?h|i\\u015f|ey))|u(rdu|zbek)|v(alencia(no?)?|ietnamese)|welsh|(\\u0430\\u043d\\u0433\\u043b\\u0438\\u0438\\u0441|[k\\u043a]\\u0430\\u043b\\u043c\\u044b\\u043a\\u0441|[k\\u043a]\\u0430\\u0437\\u0430\\u0445\\u0441|\\u043d\\u0435\\u043c\\u0435\\u0446|[p\\u0440]\\u0443\\u0441\\u0441|[y\\u0443]\\u0437\\u0431\\u0435\\u043a\\u0441)\\u043a\\u0438\\u0439( \\u044f\\u0437\\u044b\\u043a)??|\\u05e2\\u05d1\\u05e8\\u05d9\\u05ea|[k\\u043a\\u049b](\\u0430\\u0437\\u0430[\\u043a\\u049b]\\u0448\\u0430|\\u044b\\u0440\\u0433\\u044b\\u0437\\u0447\\u0430|\\u0438\\u0440\\u0438\\u043b\\u043b)|\\u0443\\u043a\\u0440\\u0430\\u0457\\u043d\\u0441\\u044c\\u043a(\\u0430|\\u043e\\u044e)|\\u0431(\\u0435\\u043b\\u0430\\u0440\\u0443\\u0441\\u043a\\u0430\\u044f|\\u044a\\u043b\\u0433\\u0430\\u0440\\u0441\\u043a\\u0438( \\u0435\\u0437\\u0438\\u043a)?)|\\u03b5\\u03bb\\u03bb[\\u03b7\\u03b9]\\u03bd\\u03b9\\u03ba(\\u03ac|\\u03b1)|\\u10e5\\u10d0\\u10e0\\u10d7\\u10e3\\u10da\\u10d8|\\u0939\\u093f\\u0928\\u094d\\u0926\\u0940|\\u0e44\\u0e17\\u0e22|[m\\u043c]\\u043e\\u043d\\u0433\\u043e\\u043b(\\u0438\\u0430)?|([c\\u0441]\\u0440\\u043f|[m\\u043c]\\u0430\\u043a\\u0435\\u0434\\u043e\\u043d)\\u0441\\u043a\\u0438|\\u0627\\u0644\\u0639\\u0631\\u0628\\u064a\\u0629|\\u65e5\\u672c\\u8a9e|\\ud55c\\uad6d(\\ub9d0|\\uc5b4)|\\u200c\\u0939\\u093f\\u0928\\u0926\\u093c\\u093f|\\u09ac\\u09be\\u0982\\u09b2\\u09be|\\u0a2a\\u0a70\\u0a1c\\u0a3e\\u0a2c\\u0a40|\\u092e\\u0930\\u093e\\u0920\\u0940|\\u0c95\\u0ca8\\u0ccd\\u0ca8\\u0ca1|\\u0627\\u064f\\u0631\\u062f\\u064f\\u0648|\\u0ba4\\u0bae\\u0bbf\\u0bb4\\u0bcd|\\u0c24\\u0c46\\u0c32\\u0c41\\u0c17\\u0c41|\\u0a97\\u0ac1\\u0a9c\\u0ab0\\u0abe\\u0aa4\\u0ac0|\\u0641\\u0627\\u0631\\u0633\\u06cc|\\u067e\\u0627\\u0631\\u0633\\u06cc|\\u0d2e\\u0d32\\u0d2f\\u0d3e\\u0d33\\u0d02|\\u067e\\u069a\\u062a\\u0648|\\u1019\\u103c\\u1014\\u103a\\u1019\\u102c\\u1018\\u102c\\u101e\\u102c|\\u4e2d\\u6587(\\u7b80\\u4f53|\\u7e41\\u9ad4)?|\\u4e2d\\u6587\\uff08(\\u7b80\\u4f53?|\\u7e41\\u9ad4)\\uff09|\\u7b80\\u4f53|\\u7e41\\u9ad4).*"; + // 11.Contain language word:It is ok + val regex_containLanguageWord: String = ".*(a(frikaa?ns|lbanian?|lemanha|ng(lais|ol)|ra?b(e?|[ei]c|ian?|isc?h)|rmenian?" 
+ + "|ssamese|azeri|z[e\\u0259]rba(ijani?|ycan(ca)?|yjan)|\\u043d\\u0433\\u043b\\u0438\\u0439\\u0441\\u043a\\u0438\\u0439)" + + "|b(ahasa( (indonesia|jawa|malaysia|melayu))?|angla|as(k|qu)e|[aeo]ng[ao]?li|elarusian?|okm\\u00e5l|osanski|ra[sz]il(ian?)?" + + "|ritish( kannada)?|ulgarian?)|c(ebuano|hina|hinese( simplified)?|zech|roat([eo]|ian?)|atal[a\\u00e0]n?|\\u0440\\u043f\\u0441\\u043a\\u0438|antonese)" + + "|[c\\u010d](esky|e[s\\u0161]tina)\r\n|d(an(isc?h|sk)|e?uts?ch)|e(esti|ll[hi]nika|ng(els|le(ski|za)|lisc?h)|spa(g?[n\\u00f1]h?i?ol|nisc?h)|speranto|stonian|usk[ae]ra)" + + "|f(ilipino|innish|ran[c\\u00e7](ais|e|ez[ao])|ren[cs]h|arsi|rancese)|g(al(ego|ician)|uja?rati|ree(ce|k)|eorgian|erman[ay]?|ilaki)|h(ayeren|ebrew|indi|rvatski|ungar(y|ian))" + + "|i(celandic|ndian?|ndonesian?|ngl[e\\u00ea]se?|ngilizce|tali(ano?|en(isch)?))|ja(pan(ese)?|vanese)|k(a(nn?ada|zakh)|hmer|o(rean?|sova)|urd[i\\u00ee])" + + "|l(at(in[ao]?|vi(an?|e[s\\u0161]u))|ietuvi[u\\u0173]|ithuanian?)|m(a[ck]edon(ian?|ski)|agyar|alay(alam?|sian?)?|altese|andarin|arathi|elayu|ontenegro" + + "|ongol(ian?)|yanmar)|n(e(d|th)erlands?|epali|orw(ay|egian)|orsk( bokm[a\\u00e5]l)?|ynorsk)|o(landese|dia)|p(ashto|ersi?an?|ol(n?isc?h|ski)|or?tugu?[e\\u00ea]se?(( d[eo])? brasil(eiro)?" + + "| ?\\(brasil\\))?|unjabi)|r(om[a\\u00e2i]ni?[a\\u0103]n?|um(ano|\\u00e4nisch)|ussi([ao]n?|sch))|s(anskrit|erbian|imple english|inha?la|lov(ak(ian?)?|en\\u0161?[c\\u010d]ina|en(e|ij?an?)|uomi)|erbisch|pagnolo?" + + "|panisc?h|rbeska|rpski|venska|c?wedisc?h|hqip)|t(a(galog|mil)|elugu|hai(land)?|i[e\\u1ebf]ng vi[e\\u1ec7]t|[u\\u00fc]rk([c\\u00e7]e|isc?h|i\\u015f|ey))|u(rdu|zbek)|v(alencia(no?)?|ietnamese)|welsh" + + "|(\\u0430\\u043d\\u0433\\u043b\\u0438\\u0438\\u0441|[k\\u043a]\\u0430\\u043b\\u043c\\u044b\\u043a\\u0441|[k\\u043a]\\u0430\\u0437\\u0430\\u0445\\u0441|\\u043d\\u0435\\u043c\\u0435\\u0446" + + "|[p\\u0440]\\u0443\\u0441\\u0441|[y\\u0443]\\u0437\\u0431\\u0435\\u043a\\u0441)\\u043a\\u0438\\u0439( \\u044f\\u0437\\u044b\\u043a)??|\\u05e2\\u05d1\\u05e8\\u05d9\\u05ea" + + "|[k\\u043a\\u049b](\\u0430\\u0437\\u0430[\\u043a\\u049b]\\u0448\\u0430|\\u044b\\u0440\\u0433\\u044b\\u0437\\u0447\\u0430|\\u0438\\u0440\\u0438\\u043b\\u043b)" + + "|\\u0443\\u043a\\u0440\\u0430\\u0457\\u043d\\u0441\\u044c\\u043a(\\u0430|\\u043e\\u044e)|\\u0431(\\u0435\\u043b\\u0430\\u0440\\u0443\\u0441\\u043a\\u0430\\u044f" + + "|\\u044a\\u043b\\u0433\\u0430\\u0440\\u0441\\u043a\\u0438( \\u0435\\u0437\\u0438\\u043a)?)|\\u03b5\\u03bb\\u03bb[\\u03b7\\u03b9]\\u03bd\\u03b9\\u03ba(\\u03ac|\\u03b1)" + + "|\\u10e5\\u10d0\\u10e0\\u10d7\\u10e3\\u10da\\u10d8" + + "|\\u0939\\u093f\\u0928\\u094d\\u0926\\u0940|\\u0e44\\u0e17\\u0e22|[m\\u043c]\\u043e\\u043d\\u0433\\u043e\\u043b(\\u0438\\u0430)?|([c\\u0441]\\u0440\\u043f" + + "|[m\\u043c]\\u0430\\u043a\\u0435\\u0434\\u043e\\u043d)\\u0441\\u043a\\u0438" + + "|\\u0627\\u0644\\u0639\\u0631\\u0628\\u064a\\u0629|\\u65e5\\u672c\\u8a9e|\\ud55c\\uad6d(\\ub9d0|\\uc5b4)" + + "|\\u200c\\u0939\\u093f\\u0928\\u0926\\u093c\\u093f|\\u09ac\\u09be\\u0982\\u09b2\\u09be|\\u0a2a\\u0a70\\u0a1c\\u0a3e\\u0a2c\\u0a40" + + "|\\u092e\\u0930\\u093e\\u0920\\u0940|\\u0c95\\u0ca8\\u0ccd\\u0ca8\\u0ca1|\\u0627\\u064f\\u0631\\u062f\\u064f\\u0648" + + "|\\u0ba4\\u0bae\\u0bbf\\u0bb4\\u0bcd|\\u0c24\\u0c46\\u0c32\\u0c41\\u0c17\\u0c41|\\u0a97\\u0ac1\\u0a9c\\u0ab0\\u0abe\\u0aa4\\u0ac0" + + "|\\u0641\\u0627\\u0631\\u0633\\u06cc|\\u067e\\u0627\\u0631\\u0633\\u06cc|\\u0d2e\\u0d32\\u0d2f\\u0d3e\\u0d33\\u0d02" + + 
"|\\u067e\\u069a\\u062a\\u0648|\\u1019\\u103c\\u1014\\u103a\\u1019\\u102c\\u1018\\u102c\\u101e\\u102c|\\u4e2d\\u6587(\\u7b80\\u4f53" + + "|\\u7e41\\u9ad4)?|\\u4e2d\\u6587\\uff08(\\u7b80\\u4f53?|\\u7e41\\u9ad4)\\uff09|\\u7b80\\u4f53|\\u7e41\\u9ad4).*" + val pattern_forContainLanguageWord: Pattern = Pattern.compile(regex_containLanguageWord); val matcher_containLanguageWord: Matcher = pattern_forContainLanguageWord.matcher(""); def containLanguageBadWord_word(str: String): Boolean = { @@ -660,7 +740,7 @@ class WordsFeatures extends Serializable { results } - //12. Male Names: It is ok + // 12. Male Names: It is ok val MaleNames: Array[String] = Array("AARON", "ADAM", "ADRIAN", "ALAN", "ALBERT", "ALBERTO", "ALEX", "ALEXANDER", "ALFRED", "ALFREDO", "ALLAN", "ALLEN", "ALVIN", "ANDRE", "ANDREW", "ANDY", @@ -725,7 +805,7 @@ class WordsFeatures extends Serializable { } - //13. Female Names: It is ok + // 13. Female Names: It is ok val FemaleNames: Array[String] = Array("AGNES", "ALICE", "ALICIA", "ALLISON", "ALMA", "AMANDA", "AMBER", "AMY", "ANA", "ANDREA", "ANGELA", "ANITA", "ANN", "ANNA", "ANNE", "ANNETTE", @@ -934,10 +1014,8 @@ class WordsFeatures extends Serializable { } } - results } - def GetNumberofLinks(str: String): Double = { val input: String = str @@ -948,7 +1026,44 @@ class WordsFeatures extends Serializable { count } - val RegexStr = "(a(frikaa?ns|lbanian?|lemanha|ng(lais|ol)|ra?b(e?|" + "[ei]c|ian?|isc?h)|rmenian?|ssamese|azeri|z[eə]rba" + "(ijani?|ycan(ca)?|yjan)|нглийский)|b(ahasa( (indonesia|" + "jawa|malaysia|melayu))?|angla|as(k|qu)e|[aeo]ng[ao]?li|" + "elarusian?|okmål|osanski|ra[sz]il(ian?)?|ritish( " + "kannada)?|ulgarian?)|c(ebuano|hina|hinese( simplified)?" + "|zech|roat([eo]|ian?)|atal[aà]n?|рпски|antonese)|[cč]" + "(esky|e[sš]tina)|d(an(isc?h|sk)|e?uts?ch)|e(esti|ll[hi]" + "nika|ng(els|le(ski|za)|lisc?h)|spa(g?[nñ]h?i?ol|nisc?h)" + "|speranto|stonian|usk[ae]ra)|f(ilipino|innish|ran[cç]" + "(ais|e|ez[ao])|ren[cs]h|arsi|rancese)|g(al(ego|ician)|" + "uja?rati|ree(ce|k)|eorgian|erman[ay]?|ilaki)|h(ayeren|" + "ebrew|indi|rvatski|ungar(y|ian))|i(celandic|ndian?|" + "ndonesian?|ngl[eê]se?|ngilizce|tali(ano?|en(isch)?))|" + "ja(pan(ese)?|vanese)|k(a(nn?ada|zakh)|hmer|o(rean?|" + "sova)|urd[iî])|l(at(in[ao]?|vi(an?|e[sš]u))|ietuvi[uų]" + "|ithuanian?)|m(a[ck]edon(ian?|ski)|agyar|alay(alam?|" + "sian?)?|altese|andarin|arathi|elayu|ontenegro|ongol" + "(ian?)|yanmar)|n(e(d|th)erlands?|epali|orw(ay|egian)|" + "orsk( bokm[aå]l)?|ynorsk)|o(landese|dia)|p(ashto|" + "ersi?an?|ol(n?isc?h|ski)|or?tugu?[eê]se?(( d[eo])?" + "brasil(eiro)?| ?\\(brasil\\))?|unjabi)|r(om[aâi]ni?[aă]n?" 
+ "|um(ano|änisch)|ussi([ao]n?|sch))|s(anskrit|erbian|" + "imple english|inha?la|lov(ak(ian?)?|enš?[cč]ina|" + "en(e|ij?an?)|uomi)|erbisch|pagnolo?|panisc?h|rbeska|" + "rpski|venska|c?wedisc?h|hqip)|t(a(galog|mil)|elugu|" + "hai(land)?|i[eế]ng vi[eệ]t|[uü]rk([cç]e|isc?h|iş|ey))|" + "u(rdu|zbek)|v(alencia(no?)?|ietnamese)|welsh|(англиис|" + "[kк]алмыкс|[kк]азахс|немец|[pр]усс|[yу]збекс|" + "татарс)кий( язык)??|עברית|[kкқ](аза[кқ]ша|ыргызча|" + "ирилл)|українськ(а|ою)|б(еларуская|" + "ългарски( език)?)|ελλ[ηι]" + "νικ(ά|α)|ქართული|हिन्दी|ไทย|[mм]онгол(иа)?|([cс]рп|" + "[mм]акедон)ски|العربية|日本語|한국(말|어)|‌हिनद़ि| " + " বাংলা|ਪੰਜਾਬੀ|मराठी|ಕನ್ನಡ|اُردُو|தமிழ்|తెలుగు|ગુજરાતી|" + "فارسی|پارسی|മലയാളം|پښتو|မြန်မာဘာသာ|中文(简体|繁體)?|" + "中文((简体?|繁體))|简体|繁體)"; + val RegexStr = "(a(frikaa?ns|lbanian?|lemanha|ng(lais|ol)|ra?b(e?|" + + "[ei]c|ian?|isc?h)|rmenian?|ssamese|azeri|z[eə]rba" + + "(ijani?|ycan(ca)?|yjan)|нглийский)|b(ahasa( (indonesia|" + + "jawa|malaysia|melayu))?|angla|as(k|qu)e|[aeo]ng[ao]?li|" + + "elarusian?|okmål|osanski|ra[sz]il(ian?)?|ritish( " + + "kannada)?|ulgarian?)|c(ebuano|hina|hinese( simplified)?" + + "|zech|roat([eo]|ian?)|atal[aà]n?|рпски|antonese)|[cč]" + + "(esky|e[sš]tina)|d(an(isc?h|sk)|e?uts?ch)|e(esti|ll[hi]" + + "nika|ng(els|le(ski|za)|lisc?h)|spa(g?[nñ]h?i?ol|nisc?h)" + + "|speranto|stonian|usk[ae]ra)|f(ilipino|innish|ran[cç]" + + "(ais|e|ez[ao])|ren[cs]h|arsi|rancese)|g(al(ego|ician)|" + + "uja?rati|ree(ce|k)|eorgian|erman[ay]?|ilaki)|h(ayeren|" + + "ebrew|indi|rvatski|ungar(y|ian))|i(celandic|ndian?|" + + "ndonesian?|ngl[eê]se?|ngilizce|tali(ano?|en(isch)?))|" + + "ja(pan(ese)?|vanese)|k(a(nn?ada|zakh)|hmer|o(rean?|" + + "sova)|urd[iî])|l(at(in[ao]?|vi(an?|e[sš]u))|ietuvi[uų]" + + "|ithuanian?)|m(a[ck]edon(ian?|ski)|agyar|alay(alam?|" + + "sian?)?|altese|andarin|arathi|elayu|ontenegro|ongol" + + "(ian?)|yanmar)|n(e(d|th)erlands?|epali|orw(ay|egian)|" + + "orsk( bokm[aå]l)?|ynorsk)|o(landese|dia)|p(ashto|" + + "ersi?an?|ol(n?isc?h|ski)|or?tugu?[eê]se?(( d[eo])?" + + "brasil(eiro)?| ?\\(brasil\\))?|unjabi)|r(om[aâi]ni?[aă]n?" 
+ + "|um(ano|änisch)|ussi([ao]n?|sch))|s(anskrit|erbian|" + + "imple english|inha?la|lov(ak(ian?)?|enš?[cč]ina|" + + "en(e|ij?an?)|uomi)|erbisch|pagnolo?|panisc?h|rbeska|" + + "rpski|venska|c?wedisc?h|hqip)|t(a(galog|mil)|elugu|" + + "hai(land)?|i[eế]ng vi[eệ]t|[uü]rk([cç]e|isc?h|iş|ey))|" + + "u(rdu|zbek)|v(alencia(no?)?|ietnamese)|welsh|(англиис|" + + "[kк]алмыкс|[kк]азахс|немец|[pр]усс|[yу]збекс|" + + "татарс)кий( язык)??|עברית|[kкқ](аза[кқ]ша|ыргызча|" + + "ирилл)|українськ(а|ою)|б(еларуская|" + + "ългарски( език)?)|ελλ[ηι]" + + "νικ(ά|α)|ქართული|हिन्दी|ไทย|[mм]онгол(иа)?|([cс]рп|" + + "[mм]акедон)ски|العربية|日本語|한국(말|어)|‌हिनद़ि| " + + " বাংলা|ਪੰਜਾਬੀ|मराठी|ಕನ್ನಡ|اُردُو|தமிழ்|తెలుగు|ગુજરાતી|" + + "فارسی|پارسی|മലയാളം|پښتو|မြန်မာဘာသာ|中文(简体|繁體)?|" + + "中文((简体?|繁體))|简体|繁體)" + def GetNumberofLanguageword(str: String): Double = { val input: String = str val patternRegex: Pattern = Pattern.compile(RegexStr) @@ -971,5 +1086,4 @@ class WordsFeatures extends Serializable { result.toFloat } // Words features: ------ End calculation the Ratio for Words: - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/DistancesTest.scala b/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/DistancesTest.scala new file mode 100644 index 0000000..a755476 --- /dev/null +++ b/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/DistancesTest.scala @@ -0,0 +1,13 @@ +package net.sansa_stack.ml.spark.clustering.algorithms + +import org.scalatest.FunSuite + +class DistancesTest extends FunSuite { + val testData1 = Set("a", "b") + val testData2 = Set("b", "c", "d") + + test("Distances.jaccardSimilarity") { + assert(new Distances().jaccardSimilarity(testData1, testData2) === 0.25) + } +} + diff --git a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/EncoderTest.scala b/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/EncoderTest.scala new file mode 100644 index 0000000..8d90fc5 --- /dev/null +++ b/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/EncoderTest.scala @@ -0,0 +1,32 @@ +package net.sansa_stack.ml.spark.clustering.algorithms + +import com.holdenkarau.spark.testing.DataFrameSuiteBase +import org.apache.spark.ml.linalg.DenseVector +import org.apache.spark.rdd.RDD +import org.scalatest.FunSuite + +class EncoderTest extends FunSuite with DataFrameSuiteBase { + test("Encoder.mdsEncoding") { + val mdsTestData = List((1.toLong, 2.toLong, 0.5), (1.toLong, 3.toLong, 1.0), (2.toLong, 3.toLong, 1.0)) + val mdsTestDataRDD: RDD[(Long, Long, Double)] = spark.sparkContext.parallelize(mdsTestData) + /* val oneHotTestData = List((1.toLong, Set("a", "b")), (2.toLong, Set("b", "c")), (3.toLong, Set("b", "d"))) + val oneHotTestDataRDD: RDD[(Long, Set[String])] = spark.sparkContext.parallelize(oneHotTestData) +*/ + val (mdsEncodedDF, mdsEncoded) = new Encoder().mdsEncoding(mdsTestDataRDD, 3, 2, spark) + assert(mdsEncodedDF.head().getAs[DenseVector](mdsEncodedDF.head().length - 1).size === 2) // (x, y) coordinate + } + + /* test("Encoder.oneHotEncoding") { + val (oneHotEncodedDF, oneHotEncoded) = new Encoder().oneHotEncoding(oneHotTestDataRDD, spark) + assert(oneHotEncodedDF.head().getAs[DenseVector](oneHotEncodedDF.head().length-1).size === 4) // encoded vector + } + + test("Encoder.wordVectorEncoder") { + val (word2VecEncodedDF, word2VecEncoded) = new Encoder().wordVectorEncoder(oneHotTestDataRDD, spark) + 
assert(word2VecEncodedDF.head().getAs[DenseVector](word2VecEncodedDF.head().length-1).size >= 1) // vector size for poi should be larger than equal to 1 + } +*/ +} + + + diff --git a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/KmeansTest.scala b/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/KmeansTest.scala new file mode 100644 index 0000000..a94ad7b --- /dev/null +++ b/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/KmeansTest.scala @@ -0,0 +1,17 @@ +package net.sansa_stack.ml.spark.clustering.algorithms + +import com.holdenkarau.spark.testing.DataFrameSuiteBase +import org.apache.spark.rdd.RDD +import org.scalatest.FunSuite + +class KmeansTest extends FunSuite with DataFrameSuiteBase { + test("Kmeans.mdsEncoding") { + val mdsTestData = spark.sparkContext.parallelize(List((1.toLong, 2.toLong, 0.5), (1.toLong, 3.toLong, 1.0), (2.toLong, 3.toLong, 1.0))) + mdsTestData.foreach(println) + // val mdsTestDataRDD: RDD[(Long, Long, Double)] = sc.parallelize(mdsTestData) + val (mdsEncodedDF, mdsEncoded) = new Encoder().mdsEncoding(mdsTestData, 3, 2, spark) + val km_result = new Kmeans().kmClustering(2, 2, mdsEncodedDF, spark) + assert(km_result.size === 2) // 2 clusters + } +} + diff --git a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/MultiDSTest.scala b/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/MultiDSTest.scala new file mode 100644 index 0000000..1ecca91 --- /dev/null +++ b/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/MultiDSTest.scala @@ -0,0 +1,14 @@ +package net.sansa_stack.ml.spark.clustering.algorithms + +import com.holdenkarau.spark.testing.DataFrameSuiteBase +import org.apache.spark.rdd.RDD +import org.scalatest.FunSuite + +class MultiDSTest extends FunSuite with DataFrameSuiteBase { + test("multiDS.multiDimensionScaling") { + val testData = List((1.toLong, 2.toLong, 0.5), (1.toLong, 3.toLong, 1.0), (2.toLong, 3.toLong, 1.0)) + val testDataRDD: RDD[(Long, Long, Double)] = spark.sparkContext.parallelize(testData) + val coordinates = new MultiDS().multiDimensionScaling(testDataRDD, 3, 2) + assert(coordinates.length === 3 && coordinates.head._2.length === 2) + } +} diff --git a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/PICTest.scala b/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/PICTest.scala new file mode 100644 index 0000000..dec944c --- /dev/null +++ b/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/PICTest.scala @@ -0,0 +1,15 @@ +package net.sansa_stack.ml.spark.clustering.algorithms + +import com.holdenkarau.spark.testing.DataFrameSuiteBase +import org.apache.spark.rdd.RDD +import org.scalatest.FunSuite + +class PICTest extends FunSuite with DataFrameSuiteBase { + test("PIC.picSparkML") { + val testData = List((1.toLong, 2.toLong, 1.0), (1.toLong, 3.toLong, 0.0), (2.toLong, 3.toLong, 0.0)) + val testDataRDD: RDD[(Long, Long, Double)] = spark.sparkContext.parallelize(testData) + val clusters = new PIC().picSparkML(testDataRDD, 2, 1, sparkSession = spark) + assert(clusters.size === 2) + } +} + diff --git a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/RdfPicTest.scala b/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/RdfPicTest.scala similarity index 92% rename from sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/RdfPicTest.scala rename to 
sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/RdfPicTest.scala index 9b1b1be..3c2ffdf 100644 --- a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/RdfPicTest.scala +++ b/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/clustering/algorithms/RdfPicTest.scala @@ -1,4 +1,4 @@ -package net.sansa_stack.ml.spark.clustering +package net.sansa_stack.ml.spark.clustering.algorithms import com.holdenkarau.spark.testing.DataFrameSuiteBase import net.sansa_stack.rdf.spark.io._ diff --git a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/kernel/RDFFastGraphKernelTests.scala b/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/kernel/RDFFastGraphKernelTests.scala index 0025d09..994c517 100644 --- a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/kernel/RDFFastGraphKernelTests.scala +++ b/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/kernel/RDFFastGraphKernelTests.scala @@ -31,5 +31,4 @@ class RDFFastGraphKernelTests extends FunSuite with DataFrameSuiteBase { assert(true) } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/anomalydetection/AnomalyDetectionTests.scala b/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/AnomalyDetectionTests.scala similarity index 97% rename from sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/anomalydetection/AnomalyDetectionTests.scala rename to sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/AnomalyDetectionTests.scala index b5696d0..475d555 100644 --- a/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/anomalydetection/AnomalyDetectionTests.scala +++ b/sansa-ml-spark/src/test/scala/net/sansa_stack/ml/spark/outliers/anomalydetection/AnomalyDetectionTests.scala @@ -1,11 +1,11 @@ -package net.sansa_stack.ml.spark.anomalydetection +package net.sansa_stack.ml.spark.outliers.anomalydetection import com.holdenkarau.spark.testing.DataFrameSuiteBase -import net.sansa_stack.ml.spark.outliers.anomalydetection._ import org.apache.jena.riot.Lang import org.apache.spark.rdd.RDD import org.scalatest.FunSuite +import net.sansa_stack.ml.spark.outliers.anomalydetection._ class AnomalyDetectionTests extends FunSuite with DataFrameSuiteBase { import net.sansa_stack.rdf.spark.io._ @@ -34,9 +34,6 @@ class AnomalyDetectionTests extends FunSuite with DataFrameSuiteBase { val hypernym = "http://purl.org/linguistics/gold/hypernym" test("performing anomaly detection using HashingTF method should result in size 36") { - - - val triples = spark.rdf(Lang.NTRIPLES)(path) triples.repartition(125).persist @@ -52,7 +49,6 @@ class AnomalyDetectionTests extends FunSuite with DataFrameSuiteBase { test("performing anomaly detection using CountVetcorizerModel method should result in size 15") { - val triples = spark.rdf(Lang.NTRIPLES)(path) triples.repartition(125).persist @@ -80,3 +76,5 @@ class AnomalyDetectionTests extends FunSuite with DataFrameSuiteBase { } } + + diff --git a/scalastyle-config.xml b/scalastyle-config.xml index 6728712..d1cf125 100644 --- a/scalastyle-config.xml +++ b/scalastyle-config.xml @@ -116,7 +116,7 @@ This file is divided into 3 sections: - +
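The word-level features reworked in WordsFeatures.scala follow one encoding convention throughout: each ratio check is rounded into a Double slot of a fixed-size array, each boolean check is stored as 1.0 or 0.0, and a NaN result leaves the slot at its 0.0 default. A minimal sketch of that convention, assuming a toy two-slot vector and a placeholder bad-word list (WordFeatureSketch, uppercaseWordRatio and badWords are illustrative names, not part of the patch; the patched Vector_Words_Feature fills 17 slots from the regex-based checks shown above):

object WordFeatureSketch {
  // Placeholder list; the patch uses the much larger luisVonAhnWordlist / BanBuilderWordlist arrays.
  private val badWords = Set("badword1", "badword2")

  // Ratio of words that start with an upper-case letter; NaN for empty input so callers can skip the slot.
  def uppercaseWordRatio(text: String): Double = {
    val words = text.split("\\s+").filter(_.nonEmpty)
    if (words.isEmpty) Double.NaN else words.count(_.head.isUpper).toDouble / words.length
  }

  def features(text: String): Array[Double] = {
    val vec = new Array[Double](2)        // the patched code uses 17 slots
    val upper = uppercaseWordRatio(text)
    if (!upper.isNaN) {                   // NaN keeps the 0.0 default, as in the patched code
      vec(0) = BigDecimal(upper).setScale(3, BigDecimal.RoundingMode.HALF_UP).toDouble
    }
    val hasBadWord = text.split("\\s+").exists(w => badWords.contains(w.toLowerCase))
    vec(1) = if (hasBadWord) 1.0 else 0.0 // boolean feature encoded as 1.0 / 0.0
    vec
  }
}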
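The new DistancesTest pins the expected behaviour with a worked example: for A = {a, b} and B = {b, c, d}, the Jaccard similarity is |A ∩ B| / |A ∪ B| = 1 / 4 = 0.25. A standalone sketch of that computation (the method under test is the project's Distances.jaccardSimilarity; the jaccard helper below only illustrates the formula, including an assumed 0.0 result for two empty sets):

def jaccard[A](x: Set[A], y: Set[A]): Double =
  if (x.isEmpty && y.isEmpty) 0.0
  else x.intersect(y).size.toDouble / x.union(y).size

jaccard(Set("a", "b"), Set("b", "c", "d"))  // one shared element out of four in the union = 0.25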
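PICTest drives the new PIC wrapper with (srcId, dstId, similarity) triples and expects two clusters back. For orientation, a minimal sketch of the plain Spark MLlib call, under the assumption that the wrapper's picSparkML delegates to PowerIterationClustering; the PicSketch object and its sample graph (two tight pairs joined by one weak edge) are illustrative and not taken from the patch:

import org.apache.spark.mllib.clustering.PowerIterationClustering
import org.apache.spark.sql.SparkSession

object PicSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("pic-sketch").getOrCreate()

    // (srcId, dstId, similarity) triples describing an undirected affinity graph.
    val similarities = spark.sparkContext.parallelize(Seq(
      (0L, 1L, 1.0), (2L, 3L, 1.0), (1L, 2L, 0.1)))

    val model = new PowerIterationClustering()
      .setK(2)                 // two clusters, as the test asserts
      .setMaxIterations(10)
      .run(similarities)

    model.assignments.collect().foreach(a => println(s"vertex ${a.id} -> cluster ${a.cluster}"))
    spark.stop()
  }
}

The new test suites get their SparkSession from spark-testing-base's DataFrameSuiteBase instead of building one as above, which keeps each suite down to the test data and the assertion.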