-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpyspark-emr2.py
33 lines (23 loc) · 1.08 KB
/
pyspark-emr2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType
def spark_session():
    """Return the SparkSession used by this job.

    The session is requested through the builder with the application
    name ``'EmrApp2'`` and obtained via ``getOrCreate()``.
    """
    builder = SparkSession.builder.appName('EmrApp2')
    return builder.getOrCreate()
def main(output_path='/data'):
    """Build a small demo DataFrame, inspect it, and write a filtered copy as JSON.

    The job creates a four-row DataFrame (Id, Name, Age, Status, Skills),
    prints its contents and metadata, keeps the rows whose ``Age`` is below
    25, and writes those rows in JSON format, overwriting existing output.

    Args:
        output_path: Destination handed to the DataFrame writer's ``save``.
            Defaults to ``'/data'``.
    """
    spark = spark_session()
    # Guarantee the session is stopped even when a step below raises, so a
    # failed run does not leave the Spark application holding resources.
    try:
        data = [
            (1, 'Naman', 25, 'Graduated', ['Hadoop', 'Kafka', 'Spark']),
            (2, 'Tejal', 27, 'Post-Graduated', ['PhotoShop', 'Illustrator']),
            (3, 'Nirjara', 20, 'Student', ['SQL', 'Bio-Tech']),
            (4, 'Siddharth', 16, 'Student', ['Gaming', 'CS']),
        ]
        schema = StructType([
            StructField('Id', IntegerType()),
            StructField('Name', StringType()),
            StructField('Age', IntegerType()),
            StructField('Status', StringType()),
            StructField('Skills', ArrayType(StringType())),
        ])
        df = spark.createDataFrame(data, schema)
        df.show()
        df.printSchema()

        # Outside an interactive shell a bare expression's value is thrown
        # away, so print the metadata explicitly to make it visible in the
        # job output.
        print(df.count())
        print(df.columns)
        print(df.schema)
        print(df.dtypes)

        df1 = df.filter(col('Age') < 25)
        df1.show()
        df1.write.format('json').mode('overwrite').save(path=output_path)
    finally:
        spark.stop()
# Run the job only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()