-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_test9.sh
executable file
·27 lines (23 loc) · 1.46 KB
/
run_test9.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#!/bin/bash
#
# run_test9.sh - test run of ArchiveProcessor.py over 13 WARC files on YARN.
#
# Steps:
#   1. Upload the local seed file example-20200623-crawler0.warc.gz to HDFS
#      13 times, under the names example-20200623-crawler{1..13}.warc.gz.
#   2. Bundle the local *.py files into pyfiles.zip.
#   3. Submit ArchiveProcessor.py with spark-submit over every HDFS file
#      matching example-20200623-crawler*.warc.gz, writing to HBase.
#   4. Remove the matching WARC files from HDFS again (also on failure).
#
# Requires: hdfs, zip and spark-submit on PATH; spark_properties.yarn.conf and
# ArchiveProcessor.py in the current directory.
#
# Exit status: non-zero as soon as any step fails; otherwise that of
# spark-submit.

set -euo pipefail

readonly WARC_PREFIX="example-20200623-crawler"
readonly WARC_SUFFIX=".warc.gz"
readonly LOCAL_WARC="./${WARC_PREFIX}0${WARC_SUFFIX}"
# The glob is passed quoted on purpose: it must be expanded by HDFS / the
# Spark job, not by the local shell.
readonly HDFS_WARC_GLOB="${WARC_PREFIX}*${WARC_SUFFIX}"
readonly NUM_COPIES=13

#######################################
# Remove the uploaded test inputs from HDFS.
# NOTE: the glob matches every HDFS file with this prefix/suffix in the
# target directory, not only the copies created by this run.
# Globals:   HDFS_WARC_GLOB (read)
# Outputs:   warning on stderr if the removal fails
#######################################
cleanup() {
  hdfs dfs -rm "${HDFS_WARC_GLOB}" \
    || printf 'warning: could not remove %s from HDFS\n' "${HDFS_WARC_GLOB}" >&2
}

if [[ ! -f "${LOCAL_WARC}" ]]; then
  printf 'error: local seed file %s not found\n' "${LOCAL_WARC}" >&2
  exit 1
fi

# Registered before the first upload so that partially uploaded inputs are
# removed even when a later step aborts the script.
trap cleanup EXIT

# -f overwrites leftovers from an earlier interrupted run instead of failing.
for ((i = 1; i <= NUM_COPIES; i++)); do
  hdfs dfs -put -f "${LOCAL_WARC}" "${WARC_PREFIX}${i}${WARC_SUFFIX}"
done

# Bundle the Python sources. Nothing in this script passes pyfiles.zip to
# spark-submit; presumably spark_properties.yarn.conf references it - confirm.
# shellcheck disable=SC2035  # file names kept exactly as in the original call
zip pyfiles.zip *.py

export PYSPARK_PYTHON="/opt/anaconda3/bin/python"

spark-submit \
  --properties-file spark_properties.yarn.conf \
  ArchiveProcessor.py \
  --input_warcs "${HDFS_WARC_GLOB}" \
  --algseq "HTML:HTMLTextExtractor,TopicIdentifier,SentimentAnalyzer,WebPageTypeIdentifier" \
  --output_hbase