-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsparkWriteDF.py
32 lines (24 loc) · 907 Bytes
/
sparkWriteDF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
from pyspark.sql import SparkSession
from lib.logger import Log4J
if __name__ == '__main__':
    # Script entry point: read the flight CSV files, repartition the data,
    # and write it back out as CSV, logging the partition count before and
    # after the repartition.
    spark = (
        SparkSession.builder
        .master("local[3]")
        .appName("SchemaApp")
        .getOrCreate()
    )

    try:
        logger = Log4J(spark)

        # No reader options are set, so Spark's CSV defaults apply.
        # NOTE(review): with defaults a header row, if the input files have
        # one, is read as an ordinary data row -- confirm against the data.
        flightTimeCsvDF = (
            spark.read
            .format("csv")
            .load("data/flight*.csv")
        )

        partitions_before = flightTimeCsvDF.rdd.getNumPartitions()
        logger.info(f"Num of Partitions before: {partitions_before}")

        # Redistribute the rows across 10 partitions before writing.
        partitionedDF = flightTimeCsvDF.repartition(10)

        # "overwrite" replaces whatever already exists at the output path.
        (
            partitionedDF.write
            .format("csv")
            .mode("overwrite")
            .option("path", "dataOutput/csv/")
            .save()
        )

        partitions_after = partitionedDF.rdd.getNumPartitions()
        logger.info(f"Num of Partitions after: {partitions_after}")
    finally:
        # Always release the session, even when the read or write fails.
        spark.stop()