-
Notifications
You must be signed in to change notification settings - Fork 2
/
global_options.py
106 lines (100 loc) · 3.67 KB
/
global_options.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
"""Global options for analysis
"""
import os
from pathlib import Path
from typing import Dict, List, Optional, Set
# Hardware options
N_CORES: int = 2 # max number of CPU cores to use
RAM_CORENLP: str = "8G" # max RAM allocated for parsing using CoreNLP; increase to speed up parsing
PARSE_CHUNK_SIZE: int = 100 # number of lines in the input file to process uing CoreNLP at once. Increase on workstations with larger RAM (e.g. to 1000 if RAM is 64G)
# Directory locations
os.environ[
"CORENLP_HOME"
] = "/Users/mai/stanford-corenlp-full-2018-10-05/" # location of the CoreNLP models; use / to seperate folders
DATA_FOLDER: str = "data/"
MODEL_FOLDER: str = "models/" # will be created if does not exist
OUTPUT_FOLDER: str = "outputs/" # will be created if does not exist; !!! WARNING: existing files will be removed !!!
# Parsing and analysis options
STOPWORDS: Set[str] = set(
Path("resources", "StopWords_Generic.txt").read_text().lower().split()
) # Set of stopwords from https://sraf.nd.edu/textual-analysis/resources/#StopWords
PHRASE_THRESHOLD: int = 10 # threshold of the phraser module (smaller -> more phrases)
PHRASE_MIN_COUNT: int = 10 # min number of times a bigram needs to appear in the corpus to be considered as a phrase
W2V_DIM: int = 300 # dimension of word2vec vectors
W2V_WINDOW: int = 5 # window size in word2vec
W2V_ITER: int = 20 # number of iterations in word2vec
N_WORDS_DIM: int = 500 # max number of words in each dimension of the dictionary
DICT_RESTRICT_VOCAB = None # change to a fraction number (e.g. 0.2) to restrict the dictionary vocab in the top 20% of most frequent vocab
# Inputs for constructing the expanded dictionary
DIMS: List[str] = ["integrity", "teamwork", "innovation", "respect", "quality"]
SEED_WORDS: Dict[str, List[str]] = {
"integrity": [
"integrity",
"ethic",
"ethical",
"accountable",
"accountability",
"trust",
"honesty",
"honest",
"honestly",
"fairness",
"responsibility",
"responsible",
"transparency",
"transparent",
],
"teamwork": [
"teamwork",
"collaboration",
"collaborate",
"collaborative",
"cooperation",
"cooperate",
"cooperative",
],
"innovation": [
"innovation",
"innovate",
"innovative",
"creativity",
"creative",
"create",
"passion",
"passionate",
"efficiency",
"efficient",
"excellence",
"pride",
],
"respect": [
"respectful",
"talent",
"talented",
"employee",
"dignity",
"empowerment",
"empower",
],
"quality": [
"quality",
"customer",
"customer_commitment",
"dedication",
"dedicated",
"dedicate",
"customer_expectation",
],
}
# Create directories if not exist
Path(DATA_FOLDER, "processed", "parsed").mkdir(parents=True, exist_ok=True)
Path(DATA_FOLDER, "processed", "unigram").mkdir(parents=True, exist_ok=True)
Path(DATA_FOLDER, "processed", "bigram").mkdir(parents=True, exist_ok=True)
Path(DATA_FOLDER, "processed", "trigram").mkdir(parents=True, exist_ok=True)
Path(MODEL_FOLDER, "phrases").mkdir(parents=True, exist_ok=True)
Path(MODEL_FOLDER, "phrases").mkdir(parents=True, exist_ok=True)
Path(MODEL_FOLDER, "w2v").mkdir(parents=True, exist_ok=True)
Path(OUTPUT_FOLDER, "dict").mkdir(parents=True, exist_ok=True)
Path(OUTPUT_FOLDER, "scores").mkdir(parents=True, exist_ok=True)
Path(OUTPUT_FOLDER, "scores", "temp").mkdir(parents=True, exist_ok=True)
Path(OUTPUT_FOLDER, "scores", "word_contributions").mkdir(parents=True, exist_ok=True)