-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathfetch_datasets.sh
72 lines (61 loc) · 1.91 KB
/
fetch_datasets.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/env bash
# -*- coding: utf-8 -*-
# __Author__ = 'Tannon Kew'
# __Email__ = '[email protected]'
# __Date__ = '2023-03-03'
# Abort on the first failing command.
set -e

# Absolute path of the directory containing this script. Every expansion is
# quoted so checkouts living under a path with spaces work, and `&&` makes a
# failed `cd` abort the script instead of silently reporting the caller's
# current directory as the script directory.
SCRIPTS_DIR=$(cd -- "$(dirname -- "$0")" && pwd)

# Parent of the scripts directory, and the directory that receives all datasets.
BASE="$SCRIPTS_DIR/.."
DATA_DIR="$BASE/resources/data"

# Run the remainder of the script from the base directory.
cd -- "$BASE"
# Fetch the 'asset' dataset by cloning facebookresearch/asset into
# "$DATA_DIR/asset", unless that directory already exists.
asset_dir="$DATA_DIR/asset"
if [[ ! -d "$asset_dir" ]]; then
  # Quote the target so a data path containing whitespace is passed to git
  # as a single argument.
  git clone https://github.com/facebookresearch/asset.git "$asset_dir"
else
  echo ""
  echo "Dataset dir 'asset' already exists. Skipping..."
  echo ""
fi
# Fetch the 'turkcorpus' dataset by cloning cocoxu/simplification into
# "$DATA_DIR/turkcorpus", unless that directory already exists.
turkcorpus_dir="$DATA_DIR/turkcorpus"
if [[ ! -d "$turkcorpus_dir" ]]; then
  # Quote the target so a data path containing whitespace is passed to git
  # as a single argument.
  git clone https://github.com/cocoxu/simplification.git "$turkcorpus_dir"
else
  echo ""
  echo "Dataset dir 'turkcorpus' already exists. Skipping..."
  echo ""
fi
# Fetch the 'hsplit' dataset by cloning eliorsulem/HSplit-corpus into
# "$DATA_DIR/hsplit", unless that directory already exists.
hsplit_dir="$DATA_DIR/hsplit"
if [[ ! -d "$hsplit_dir" ]]; then
  # Quote the target so a data path containing whitespace is passed to git
  # as a single argument.
  git clone https://github.com/eliorsulem/HSplit-corpus.git "$hsplit_dir"
else
  echo ""
  echo "Dataset dir 'hsplit' already exists. Skipping..."
  echo ""
fi
# Fetch the 'onestopenglish' dataset by cloning
# nishkalavallabhi/OneStopEnglishCorpus into "$DATA_DIR/onestopenglish",
# unless that directory already exists.
onestopenglish_dir="$DATA_DIR/onestopenglish"
if [[ ! -d "$onestopenglish_dir" ]]; then
  # Quote the target so a data path containing whitespace is passed to git
  # as a single argument.
  git clone https://github.com/nishkalavallabhi/OneStopEnglishCorpus.git "$onestopenglish_dir"
else
  echo ""
  echo "Dataset dir 'onestopenglish' already exists. Skipping..."
  echo ""
fi
# Fetch the 'plainenglishlegal' dataset by cloning
# lauramanor/legal_summarization into "$DATA_DIR/plainenglishlegal",
# unless that directory already exists.
plainenglishlegal_dir="$DATA_DIR/plainenglishlegal"
if [[ ! -d "$plainenglishlegal_dir" ]]; then
  # Quote the target so a data path containing whitespace is passed to git
  # as a single argument.
  git clone https://github.com/lauramanor/legal_summarization "$plainenglishlegal_dir"
else
  echo ""
  echo "Dataset dir 'plainenglishlegal' already exists. Skipping..."
  echo ""
fi
# Fetch the 'contractbm' dataset: download the split-and-rephrase tarball and
# unpack it into "$DATA_DIR/contractbm", unless that directory already exists.
contractdata_dir="$DATA_DIR/contractbm"
if [[ ! -d "$contractdata_dir" ]]; then
  contractdata_url="https://dax-cdn.cdn.appdomain.cloud/dax-split-and-rephrase/1.0.0/split-and-rephrase-data.tar.gz"

  # Download and unpack into a staging directory and only rename it to the
  # final name once everything succeeded. Creating the final directory up
  # front would make a failed or interrupted download look like "already
  # exists" on every later run, so the dataset would never be fetched.
  contractdata_tmp="${contractdata_dir}.partial"
  rm -rf -- "${contractdata_tmp:?}" # drop leftovers from an earlier failed run
  mkdir -p -- "$contractdata_tmp"

  contractdata_status=0
  {
    wget -P "$contractdata_tmp/" "$contractdata_url" \
      && tar -xvf "$contractdata_tmp/split-and-rephrase-data.tar.gz" -C "$contractdata_tmp/"
  } || contractdata_status=$?

  if [[ "$contractdata_status" -ne 0 ]]; then
    # Clean up and stop with the failing command's status.
    rm -rf -- "${contractdata_tmp:?}"
    echo "Failed to fetch dataset 'contractbm' (exit status $contractdata_status)." >&2
    exit "$contractdata_status"
  fi

  # The tarball is kept next to the extracted files, as before.
  mv -- "$contractdata_tmp" "$contractdata_dir"
else
  echo ""
  echo "Dataset dir 'contractbm' already exists. Skipping..."
  echo ""
fi
# newsela