Add examples of analytics pipelines using music data
1 parent f4d5ef3 · commit c70a41a
Showing 12 changed files with 124,424 additions and 0 deletions.
@@ -0,0 +1,71 @@
CREATE KEYSPACE IF NOT EXISTS spark_demo
WITH replication = {
  'class': 'SimpleStrategy',
  'replication_factor': 1
};

USE spark_demo;

CREATE TABLE IF NOT EXISTS performers (
  name TEXT,
  country TEXT,
  gender TEXT,
  type TEXT,
  born TEXT,
  died TEXT,
  styles LIST<TEXT>,
  PRIMARY KEY (name)
);

CREATE TABLE IF NOT EXISTS albums (
  performer TEXT,
  title TEXT,
  year INT,
  country TEXT,
  quality TEXT,
  status TEXT,
  PRIMARY KEY (title)
);

CREATE TABLE IF NOT EXISTS performers_by_style (
  style TEXT,
  performer TEXT,
  PRIMARY KEY (style, performer)
);

CREATE TABLE IF NOT EXISTS performers_distribution_by_style (
  type TEXT,
  style TEXT,
  count INT,
  PRIMARY KEY (type, style)
);

CREATE TABLE IF NOT EXISTS top10_styles (
  type TEXT,
  count INT,
  style TEXT,
  PRIMARY KEY (type, count)
) WITH CLUSTERING ORDER BY (count DESC);

CREATE TABLE IF NOT EXISTS albums_by_decade_and_country (
  country TEXT,
  decade TEXT,
  album_count INT,
  PRIMARY KEY (decade, country)
);

CREATE TABLE IF NOT EXISTS albums_by_decade_and_country_sql (
  country TEXT,
  decade TEXT,
  album_count INT,
  PRIMARY KEY (decade, country)
);

TRUNCATE performers;
TRUNCATE albums;

COPY performers (name, country, gender, type, born, died, styles)
FROM 'performers.csv';

COPY albums (performer, title, year, country, quality, status)
FROM 'albums.csv';
@@ -0,0 +1,26 @@
package analytics.music

import com.datastax.spark.connector.cql.CassandraConnector
import org.apache.spark.{SparkConf, SparkContext}
import analytics.music.Schema._
import analytics.music.Constants._

trait BaseExample {

  def buildSparkContext(exerciseName: String): SparkContext = {

    val conf = new SparkConf(true)
      .setAppName(exerciseName)
      .setMaster(LOCAL_MODE)
      .set(CASSANDRA_HOST_NAME_PARAM, CASSANDRA_IP)

    // Truncate the example's target table before each run so results stay idempotent
    if (Constants.TABLES.contains(exerciseName)) {
      CassandraConnector(conf).withSessionDo {
        session => session.execute(s"TRUNCATE ${KEYSPACE}.${Constants.TABLES(exerciseName)}")
      }
    }

    new SparkContext(conf)
  }
}
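The Schema object imported by these examples is not rendered in this commit view. A minimal sketch of what it presumably contains, inferred from the CQL schema above and the constant names referenced by the code (the keyspace and table-name values are assumptions):

package analytics.music

object Schema {
  // Assumed values, matching the CQL schema in this commit
  val KEYSPACE = "spark_demo"
  val PERFORMERS = "performers"
  val ALBUMS = "albums"
  val PERFORMERS_BY_STYLE = "performers_by_style"
  val PERFORMERS_DISTRIBUTION_BY_STYLE = "performers_distribution_by_style"
  val TOP_10_STYLES = "top10_styles"
  val ALBUMS_BY_DECADE_AND_COUNTRY = "albums_by_decade_and_country"
  val ALBUMS_BY_DECADE_AND_COUNTRY_SQL = "albums_by_decade_and_country_sql"
}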
@@ -0,0 +1,23 @@
package analytics.music

import analytics.music.Schema._

object Constants {

  val CASSANDRA_HOST_NAME_PARAM = "spark.cassandra.connection.host"
  val CASSANDRA_IP = "localhost"
  val LOCAL_MODE = "local"

  val EXAMPLE_1 = "example1"
  val EXAMPLE_2 = "example2"
  val EXAMPLE_3 = "example3"
  val EXAMPLE_4 = "example4"
  val EXAMPLE_5 = "example5"
  val INTERACTIVE_ANALYTICS_WITH_DSE = "interactive_analytics_with_DSE"

  val TABLES = Map(
    EXAMPLE_1 -> PERFORMERS_BY_STYLE,
    EXAMPLE_2 -> PERFORMERS_DISTRIBUTION_BY_STYLE,
    EXAMPLE_3 -> TOP_10_STYLES,
    EXAMPLE_4 -> ALBUMS_BY_DECADE_AND_COUNTRY,
    EXAMPLE_5 -> ALBUMS_BY_DECADE_AND_COUNTRY_SQL)
}
@@ -0,0 +1,42 @@
package analytics.music

import com.datastax.spark.connector._
import com.datastax.spark.connector.rdd.CassandraRDD
import org.apache.spark.rdd.RDD
import analytics.music.Constants._
import analytics.music.Schema._

object Example1 extends BaseExample {

  def main(args: Array[String]) {

    val sc = buildSparkContext(EXAMPLE_1)

    /*
     * performers table structure
     *
     * CREATE TABLE IF NOT EXISTS performers (
     *   name TEXT,
     *   country TEXT,
     *   gender TEXT,
     *   type TEXT,
     *   born TEXT,
     *   died TEXT,
     *   styles LIST<TEXT>,
     *   PRIMARY KEY (name)
     * );
     */
    val rdd: CassandraRDD[CassandraRow] = sc.cassandraTable(KEYSPACE, PERFORMERS)

    // Explode each performer's list of styles into (style, performer) pairs
    val performerAndStyles: RDD[(String, String)] =
      rdd
        .map[(String, List[String])](row => (row.getString("name"), row.getList[String]("styles").toList))
        .flatMap[(String, String)] { case (performer, styles) => styles.map(style => (style, performer)) }

    // Save data back to Cassandra
    performerAndStyles.saveToCassandra(KEYSPACE, PERFORMERS_BY_STYLE, SomeColumns("style", "performer"))

    sc.stop()
  }
}
@@ -0,0 +1,59 @@
package analytics.music

import com.datastax.spark.connector._
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD
import analytics.music.Constants._
import analytics.music.Schema._

object Example2 extends BaseExample {

  def main(args: Array[String]) {

    val sc = buildSparkContext(EXAMPLE_2)

    /*
     * Read columns "type" and "styles" from the 'performers' table
     * as an RDD of (String, List[String]) tuples,
     * using the .as((_: String, _: List[String])) type conversion function.
     * Normalize the performer type and filter out 'Unknown' types
     */
    val rdd: RDD[(String, List[String])] = sc.cassandraTable(KEYSPACE, PERFORMERS)
      .select("type", "styles")
      .as((_: String, _: List[String]))
      .map { case (performer_type, styles) => (PERFORMERS_TYPES.getOrElse(performer_type, "Unknown"), styles) }
      .filter { case (performer_type, _) => performer_type != "Unknown" }

    /*
     * Transform the previous tuple RDD into a key/value RDD (PairRDD) of type
     * ((String, String), Int). The (String, String) pair is the key (performer type, style).
     * The Int value is set to 1 for each element of the RDD
     */
    val pairs: RDD[((String, String), Int)] = rdd.flatMap {
      case (performer_type, styles) => styles.map(style => ((performer_type, style), 1))
    }

    /*
     * Reduce the previous tuples of ((performer type, style), 1) by
     * adding up all the 1's into a ((performer type, style), count)
     */
    val reduced: RDD[((String, String), Int)] = pairs.reduceByKey { case (left, right) => left + right }

    /*
     * Flatten the ((performer type, style), count) into
     * (performer type, style, count)
     */
    val aggregated: RDD[(String, String, Int)] = reduced.map {
      case ((performer_type, style), count) => (performer_type, style, count)
    }

    // Save data back to the performers_distribution_by_style table
    aggregated.saveToCassandra(KEYSPACE, PERFORMERS_DISTRIBUTION_BY_STYLE, SomeColumns("type", "style", "count"))

    sc.stop()
  }

  val PERFORMERS_TYPES = Map("Person" -> "artist", "Group" -> "group")
}
@@ -0,0 +1,78 @@
package analytics.music

import com.datastax.spark.connector._
import com.datastax.spark.connector.rdd.CassandraRDD
import org.apache.spark.rdd.RDD
import analytics.music.Constants._
import analytics.music.Schema._

object Example3 extends BaseExample {

  def main(args: Array[String]) {

    val sc = buildSparkContext(EXAMPLE_3)

    /*
     * Read data from the 'performers_distribution_by_style' table
     */
    val artists: CassandraRDD[(String, String, Int)] = sc
      .cassandraTable(KEYSPACE, PERFORMERS_DISTRIBUTION_BY_STYLE)
      .select("type", "style", "count")
      .as((_: String, _: String, _: Int))

    // Drop the 'Unknown' style and sort by descending count
    val sortedPerformers: RDD[(String, String, Int)] = artists
      .filter { case (_, style, _) => style != "Unknown" }
      .sortBy[Int](tuple => tuple._3, false, 1)

    // Put the sorted RDD in cache for re-use
    sortedPerformers.cache()

    // Extract styles for groups
    val groupsStyles: RDD[(String, String, Int)] = sortedPerformers
      .filter { case (performer_type, _, _) => performer_type == "group" }

    // Extract styles for artists
    val artistStyles: RDD[(String, String, Int)] = sortedPerformers
      .filter { case (performer_type, _, _) => performer_type == "artist" }

    // Cache the groupsStyles
    groupsStyles.cache()

    // Cache the artistStyles
    artistStyles.cache()

    // Count the total number of groups having styles that are not in the top 10
    val otherStylesCountForGroup: Int = groupsStyles
      .collect()                            // Fetch the whole RDD back to the driver program
      .drop(10)                             // Drop the first 10 top styles
      .map { case (_, _, count) => count }  // Extract the count
      .sum                                  // Sum up the counts

    // Count the total number of artists having styles that are not in the top 10
    val otherStylesCountForArtist: Int = artistStyles
      .collect()                            // Fetch the whole RDD back to the driver program
      .drop(10)                             // Drop the first 10 top styles
      .map { case (_, _, count) => count }  // Extract the count
      .sum                                  // Sum up the counts

    // Take the top 10 styles for groups, with a count for all other styles
    val top10Groups = groupsStyles.take(10) :+ ("group", "Others", otherStylesCountForGroup)

    // Take the top 10 styles for artists, with a count for all other styles
    val top10Artists = artistStyles.take(10) :+ ("artist", "Others", otherStylesCountForArtist)

    /*
     * Remark: by calling take(n), all the data are shipped back to the driver program;
     * the output of take(n) is no longer an RDD but a simple Scala collection
     */

    // Merge both lists and save back to Cassandra
    sc.parallelize(top10Artists.toList ::: top10Groups.toList)
      .saveToCassandra(KEYSPACE, TOP_10_STYLES, SomeColumns("type", "style", "count"))

    sc.stop()
  }
}