diff --git a/spark/ClickBenchRunner.scala b/spark/ClickBenchRunner.scala new file mode 100644 index 000000000..2ff15a983 --- /dev/null +++ b/spark/ClickBenchRunner.scala @@ -0,0 +1,144 @@ +import org.apache.spark.sql.{SparkSession, SQLContext} +import java.sql.Statement +import java.io.{File, FileWriter} +import scala.io.Source +import scala.io.Codec +import org.apache.spark.sql.types._ + +val spark = SparkSession.builder.enableHiveSupport().getOrCreate() +spark.sql("DROP TABLE IF EXISTS hits") +val logFile = "log.txt" + +val file = new File(logFile) +val writer = new FileWriter(file) + + +val schema = StructType( + List( + StructField("WatchID", LongType, nullable = false), + StructField("JavaEnable", ShortType, nullable = false), + StructField("Title", StringType, nullable = false), + StructField("GoodEvent", ShortType, nullable = false), + StructField("EventTime", TimestampType, nullable = false), + StructField("EventDate", DateType, nullable = false), + StructField("CounterID", IntegerType, nullable = false), + StructField("ClientIP", IntegerType, nullable = false), + StructField("RegionID", IntegerType, nullable = false), + StructField("UserID", LongType, nullable = false), + StructField("CounterClass", ShortType, nullable = false), + StructField("OS", ShortType, nullable = false), + StructField("UserAgent", ShortType, nullable = false), + StructField("URL", StringType, nullable = false), + StructField("Referer", StringType, nullable = false), + StructField("IsRefresh", ShortType, nullable = false), + StructField("RefererCategoryID", ShortType, nullable = false), + StructField("RefererRegionID", IntegerType, nullable = false), + StructField("URLCategoryID", ShortType, nullable = false), + StructField("URLRegionID", IntegerType, nullable = false), + StructField("ResolutionWidth", ShortType, nullable = false), + StructField("ResolutionHeight", ShortType, nullable = false), + StructField("ResolutionDepth", ShortType, nullable = false), + StructField("FlashMajor", ShortType, nullable = false), + StructField("FlashMinor", ShortType, nullable = false), + StructField("FlashMinor2", StringType, nullable = false), + StructField("NetMajor", ShortType, nullable = false), + StructField("NetMinor", ShortType, nullable = false), + StructField("UserAgentMajor", ShortType, nullable = false), + StructField("UserAgentMinor", StringType, nullable = false), + StructField("CookieEnable", ShortType, nullable = false), + StructField("JavascriptEnable", ShortType, nullable = false), + StructField("IsMobile", ShortType, nullable = false), + StructField("MobilePhone", ShortType, nullable = false), + StructField("MobilePhoneModel", StringType, nullable = false), + StructField("Params", StringType, nullable = false), + StructField("IPNetworkID", IntegerType, nullable = false), + StructField("TraficSourceID", ShortType, nullable = false), + StructField("SearchEngineID", ShortType, nullable = false), + StructField("SearchPhrase", StringType, nullable = false), + StructField("AdvEngineID", ShortType, nullable = false), + StructField("IsArtifical", ShortType, nullable = false), + StructField("WindowClientWidth", ShortType, nullable = false), + StructField("WindowClientHeight", ShortType, nullable = false), + StructField("ClientTimeZone", ShortType, nullable = false), + StructField("ClientEventTime", TimestampType, nullable = false), + StructField("SilverlightVersion1", ShortType, nullable = false), + StructField("SilverlightVersion2", ShortType, nullable = false), + StructField("SilverlightVersion3", IntegerType, nullable = false), + StructField("SilverlightVersion4", ShortType, nullable = false), + StructField("PageCharset", StringType, nullable = false), + StructField("CodeVersion", IntegerType, nullable = false), + StructField("IsLink", ShortType, nullable = false), + StructField("IsDownload", ShortType, nullable = false), + StructField("IsNotBounce", ShortType, nullable = false), + StructField("FUniqID", LongType, nullable = false), + StructField("OriginalURL", StringType, nullable = false), + StructField("HID", IntegerType, nullable = false), + StructField("IsOldCounter", ShortType, nullable = false), + StructField("IsEvent", ShortType, nullable = false), + StructField("IsParameter", ShortType, nullable = false), + StructField("DontCountHits", ShortType, nullable = false), + StructField("WithHash", ShortType, nullable = false), + StructField("HitColor", StringType, nullable = false), + StructField("LocalEventTime", TimestampType, nullable = false), + StructField("Age", ShortType, nullable = false), + StructField("Sex", ShortType, nullable = false), + StructField("Income", ShortType, nullable = false), + StructField("Interests", ShortType, nullable = false), + StructField("Robotness", ShortType, nullable = false), + StructField("RemoteIP", IntegerType, nullable = false), + StructField("WindowName", IntegerType, nullable = false), + StructField("OpenerName", IntegerType, nullable = false), + StructField("HistoryLength", ShortType, nullable = false), + StructField("BrowserLanguage", StringType, nullable = false), + StructField("BrowserCountry", StringType, nullable = false), + StructField("SocialNetwork", StringType, nullable = false), + StructField("SocialAction", StringType, nullable = false), + StructField("HTTPError", ShortType, nullable = false), + StructField("SendTiming", IntegerType, nullable = false), + StructField("DNSTiming", IntegerType, nullable = false), + StructField("ConnectTiming", IntegerType, nullable = false), + StructField("ResponseStartTiming", IntegerType, nullable = false), + StructField("ResponseEndTiming", IntegerType, nullable = false), + StructField("FetchTiming", IntegerType, nullable = false), + StructField("SocialSourceNetworkID", ShortType, nullable = false), + StructField("SocialSourcePage", StringType, nullable = false), + StructField("ParamPrice", LongType, nullable = false), + StructField("ParamOrderID", StringType, nullable = false), + StructField("ParamCurrency", StringType, nullable = false), + StructField("ParamCurrencyID", ShortType, nullable = false), + StructField("OpenstatServiceName", StringType, nullable = false), + StructField("OpenstatCampaignID", StringType, nullable = false), + StructField("OpenstatAdID", StringType, nullable = false), + StructField("OpenstatSourceID", StringType, nullable = false), + StructField("UTMSource", StringType, nullable = false), + StructField("UTMMedium", StringType, nullable = false), + StructField("UTMCampaign", StringType, nullable = false), + StructField("UTMContent", StringType, nullable = false), + StructField("UTMTerm", StringType, nullable = false), + StructField("FromTag", StringType, nullable = false), + StructField("HasGCLID", ShortType, nullable = false), + StructField("RefererHash", LongType, nullable = false), + StructField("URLHash", LongType, nullable = false), + StructField("CLID", IntegerType, nullable = false)) +) + +val startTable = System.nanoTime() +val df = spark.read.schema(schema).option("header", "false").option("sep", "\t").csv("/hits.tsv") +df.createOrReplaceTempView("hits") +val endTable = System.nanoTime() +val timeElapsedTable = (endTable - startTable) / 1000000 +writer.write(s"Creating table time: $timeElapsedTable ms") + +val queries = Source.fromFile("queries.sql").getLines().toList +var itr: Int = 0 + queries.foreach(query => { + val start = System.nanoTime() + val result = spark.sql(query) + val end = System.nanoTime() + val timeElapsed = (end - start) / 1000000 + writer.write(s"Query $itr | Time: $timeElapsed ms") + itr += 1 + }) + +spark.stop() +System.exit(0) diff --git a/spark/benchmark.sh b/spark/benchmark.sh new file mode 100644 index 000000000..c59202f51 --- /dev/null +++ b/spark/benchmark.sh @@ -0,0 +1,78 @@ +#!/bin/bash + +sudo apt-get update +sudo apt-get -y install openjdk-8-jdk-headless + +### If there is no HDFS and Spark on your system: + +export HADOOP_DIR="/usr/local/hadoop" +export SPARK_DIR="/usr/local/spark" +export JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64" +export HADOOP_VERSION="3.3.6" +export SPARK_VERSION="3.5.0" + + +# export hadoop_file=hadoop-$HADOOP_VERSION.tar.gz +# export spark_file=spark-$SPARK_VERSION-bin-hadoop3.tgz +# wget --continue https://downloads.apache.org/hadoop/common/hadoop-$HADOOP_VERSION/$hadoop_file +# wget --continue https://downloads.apache.org/spark/spark-$SPARK_VERSION/$spark_file +# sudo tar -xzf $hadoop_file $spark_file +# sudo rm -f $spark_file $hadoop_file +# sudo rm -rf $HADOOP_DIR $SPARK_DIR +# sudo mv spark-$SPARK_VERSION spark +# sudo mv spark $SPARK_DIR +# sudo mv hadoop-$HADOOP_VERSION hadoop +# sudo mv hadoop $HADOOP_DIR + +# echo "export HADOOP_HOME=$HADOOP_DIR" >> ~/.bashrc +# echo "export SPARK_HOME=$SPARK_DIR" >> ~/.bashrc +# echo "export PATH=$SPARK_HOME/bin:$SPARK_HOME/sbin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$PATH" >> ~/.bashrc +# source ~/.bashrc + +### To configure HDFS: + +#cd $HADOOP_HOME/etc/hadoop +#cp core-site.xml core-site.xml.bak +#cp hdfs-site.xml hdfs-site.xml.bak +# echo " +# +# fs.defaultFS +# hdfs://localhost:9000 +# +# " > core-site.xml +# echo " +# +# dfs.replication +# 1 +# +# " > hdfs-site.xml +# echo "export JAVA_HOME=$JAVA_HOME" >> hadoop-env.sh + +### To configure Spark: + +# cd $SPARK_HOME/conf +# cp spark-env.sh.template spark-env.sh +# echo "export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop" >> spark-env.sh +# echo "export SPARK_MASTER_HOST=localhost" >> spark-env.sh + +### To run passless connection on localhost: + +# ssh-keygen -t rsa +# cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys +# chmod og-wx ~/.ssh/authorized_keys + +### To run Spark and HDFS: + +# cd $SPARK_HOME/sbin +# ./start-master.sh +# ./start-slave.sh spark://localhost:7077 +# cd $HADOOP_HOME/sbin +# hdfs namenode -format +# ./start-dfs.sh + +wget --continue 'https://datasets.clickhouse.com/hits_compatible/hits.tsv.gz' +gzip -d hits.tsv.gz +chmod 777 ~ hits.tsv +hdfs dfs -put hits.tsv / + +$SPARK_HOME/bin/spark-shell --master local -i ClickBenchRunner.scala diff --git a/spark/log.txt b/spark/log.txt new file mode 100644 index 000000000..b334aa948 --- /dev/null +++ b/spark/log.txt @@ -0,0 +1,44 @@ +Creating table time: 236 ms +Query 0 | Time: 18325 ms +Query 1 | Time: 414 ms +Query 2 | Time: 378 ms +Query 3 | Time: 240 ms +Query 4 | Time: 2384 ms +Query 5 | Time: 1577 ms +Query 6 | Time: 246 ms +Query 7 | Time: 988 ms +Query 8 | Time: 2020 ms +Query 9 | Time: 2689 ms +Query 10 | Time: 3097 ms +Query 11 | Time: 2011 ms +Query 12 | Time: 950 ms +Query 13 | Time: 1424 ms +Query 14 | Time: 715 ms +Query 15 | Time: 832 ms +Query 16 | Time: 849 ms +Query 17 | Time: 773 ms +Query 18 | Time: 1053 ms +Query 19 | Time: 143 ms +Query 20 | Time: 149 ms +Query 21 | Time: 854 ms +Query 22 | Time: 1877 ms +Query 23 | Time: 554 ms +Query 24 | Time: 216 ms +Query 25 | Time: 154 ms +Query 26 | Time: 161 ms +Query 27 | Time: 1060 ms +Query 28 | Time: 992 ms +Query 29 | Time: 642 ms +Query 30 | Time: 1096 ms +Query 31 | Time: 981 ms +Query 32 | Time: 1099 ms +Query 33 | Time: 1055 ms +Query 34 | Time: 822 ms +Query 35 | Time: 692 ms +Query 36 | Time: 929 ms +Query 37 | Time: 855 ms +Query 38 | Time: 790 ms +Query 39 | Time: 800 ms +Query 40 | Time: 704 ms +Query 41 | Time: 814 ms +Query 42 | Time: 923 ms diff --git a/spark/queries.sql b/spark/queries.sql new file mode 100644 index 000000000..e00090f26 --- /dev/null +++ b/spark/queries.sql @@ -0,0 +1,43 @@ +SELECT COUNT(*) FROM hits; +SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0; +SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits; +SELECT AVG(UserID) FROM hits; +SELECT COUNT(DISTINCT UserID) FROM hits; +SELECT COUNT(DISTINCT SearchPhrase) FROM hits; +SELECT MIN(EventDate), MAX(EventDate) FROM hits; +SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; +SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10; +SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10; +SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; +SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10; +SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID FROM hits WHERE UserID = 435090932899640449; +SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%'; +SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; +SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits; +SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10; +SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10; +SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; +SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10; +SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10; +SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10; +SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10; +SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10;