This repository has been archived by the owner on Jul 24, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
99_utils-w2v_process_data.sh
executable file
·65 lines (58 loc) · 2.1 KB
/
99_utils-w2v_process_data.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/env bash
#
# Train word2vec embeddings with the binary at ./tmp/word2vec.svn/word2vec.
#
# Usage:
#   DATA_NAME=<label> W2V_SIZE=<n> ... ./99_utils-w2v_process_data.sh <train-file>
#
# <train-file> ($1) is forwarded to word2vec as -train.
#
# Required environment (the script exits with status 1 if any is empty):
#   DATA_NAME       label embedded in the output file name
#   W2V_SIZE        forwarded as -size       (original author's note: 500)
#   W2V_WINDOW      forwarded as -window     (original author's note: 10)
#   W2V_SAMPLE      forwarded as -sample     (original author's note: 10)
#   W2V_HS          forwarded as -hs         (original author's note: 1)
#   W2V_NEGATIVE    forwarded as -negative   (original author's note: 5)
#   W2V_ITER        forwarded as -iter       (original author's note: 5)
#   W2V_MINCOUNT    forwarded as -min-count
#   W2V_ALPHA       forwarded as -alpha
#   W2V_CBOW        forwarded as -cbow
#
# Optional environment:
#   W2V_THREADS     forwarded as -threads (default: 4)
#   W2V_DEBUG       forwarded as -debug   (default: 2)
#   DATA_CLEAN_VER  prefix of the output file name (default: 1)
#   W2V_READ_VOCAB  when non-empty, the extra -save-vocab run is skipped and
#                   only the -read-vocab run is executed
#
set -e

DATA_OUT_DIR="./word2vec"
W2V_THREADS=${W2V_THREADS:-4}
W2V_DEBUG=${W2V_DEBUG:-2}

# DATA_NAME is checked on its own because its message has historically been
# worded without the colon used for the W2V_* settings.
if [[ -z "${DATA_NAME:-}" ]]; then
  echo "missing \$DATA_NAME" >&2
  exit 1
fi

# Every hyperparameter must be supplied by the caller; stop at the first one
# that is unset or empty. ${!name} reads the variable whose name is in $name.
for required_var in \
  W2V_SIZE W2V_WINDOW W2V_SAMPLE W2V_HS W2V_NEGATIVE \
  W2V_ITER W2V_MINCOUNT W2V_ALPHA W2V_CBOW; do
  if [[ -z "${!required_var:-}" ]]; then
    echo "missing: \$${required_var}" >&2
    exit 1
  fi
done
unset -v required_var

DATA_CLEAN_VER=${DATA_CLEAN_VER:-1}
# The output base name encodes the data label and every hyperparameter, so
# runs with different settings never write to the same file.
DATA_OUT_FN="${DATA_CLEAN_VER}_${DATA_NAME}-size${W2V_SIZE}-window${W2V_WINDOW}-sample${W2V_SAMPLE}-hs${W2V_HS}-neg${W2V_NEGATIVE}-iter${W2V_ITER}-mincnt${W2V_MINCOUNT}-alpha${W2V_ALPHA}-cbow${W2V_CBOW}"
echo "output file name: ${DATA_OUT_FN}"

# Refuse to overwrite vectors produced by an earlier run with the same settings.
if [[ -f "${DATA_OUT_DIR}/${DATA_OUT_FN}.vec" ]]; then
  echo "output already exists: ${DATA_OUT_DIR}/${DATA_OUT_FN}.vec" >&2
  exit 1
fi

# Create the directory the outputs are actually written to, rather than a
# hard-coded copy of its name.
mkdir -p -- "${DATA_OUT_DIR}"
# The training corpus path comes from the first positional argument; fail
# early instead of handing word2vec an empty -train value.
if [[ -z "${1:-}" ]]; then
  echo "missing: training file argument (\$1)" >&2
  exit 1
fi

w2v_bin="./tmp/word2vec.svn/word2vec"
w2v_vocab_file="${DATA_OUT_DIR}/${DATA_OUT_FN}.voc"

# Arguments shared by both invocations, kept in one array so the two runs
# cannot drift apart and every value is passed as exactly one word.
w2v_args=(
  -train "$1"
  -output "${DATA_OUT_DIR}/${DATA_OUT_FN}.vec"
  -size "${W2V_SIZE}"
  -window "${W2V_WINDOW}"
  -sample "${W2V_SAMPLE}"
  -hs "${W2V_HS}"
  -negative "${W2V_NEGATIVE}"
  -iter "${W2V_ITER}"
  -min-count "${W2V_MINCOUNT}"
  -alpha "${W2V_ALPHA}"
  -cbow "${W2V_CBOW}"
  -threads "${W2V_THREADS}"
  -debug "${W2V_DEBUG}"
)

# Unless the caller sets W2V_READ_VOCAB, run once with -save-vocab so the
# vocabulary file exists for the run below.
# NOTE(review): this pass also receives -output, so it presumably trains a
# full model that the next run overwrites - confirm against word2vec's
# handling of -save-vocab before dropping -output here.
if [[ -z "${W2V_READ_VOCAB:-}" ]]; then
  "${w2v_bin}" "${w2v_args[@]}" -save-vocab "${w2v_vocab_file}"
fi

# Final run: reads the vocabulary file and writes the .vec output.
"${w2v_bin}" "${w2v_args[@]}" -read-vocab "${w2v_vocab_file}"