-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbasic_classify.py
45 lines (35 loc) · 1.41 KB
/
basic_classify.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import competition_utilities as cu
import basic_features as features
from sklearn.ensemble import RandomForestClassifier
# File names used by main():
#   train_file       - loaded with cu.get_dataframe to fit the model, and
#                      passed to cu.get_priors for the "old" priors.
#   full_train_file  - passed to cu.get_priors for the "new" priors.
#   test_file        - loaded with cu.get_dataframe to produce predictions.
#   submission_file  - output path handed to cu.write_submission.
train_file = "train-A.csv"
full_train_file = "train.csv"
test_file = "public_leaderboard.csv"
submission_file = "basic_classify_v2.csv"

# Feature names handed to features.extract_features for both the training
# and the test data, so the two feature matrices are built the same way.
feature_names = [
    "BodyLength",
    "NumTags",
    "OwnerUndeletedAnswerCountAtPostTime",
    "ReputationAtPostCreation",
    "TitleLength",
    "TitleWordCount",
    "UserAge",
]
def main():
    """Train a random forest on ``train_file`` and write a submission file.

    Steps, in order:

    1. Load ``train_file`` and extract the columns named in ``feature_names``.
    2. Fit a 60-tree ``RandomForestClassifier`` against the ``"OpenStatus"``
       column of the training data.
    3. Load ``test_file``, extract the same features and predict class
       probabilities.
    4. Adjust the probabilities with ``cu.cap_and_update_priors`` using the
       priors of ``train_file`` (old) and ``full_train_file`` (new).
    5. Write the adjusted probabilities to ``submission_file``.

    Takes no arguments and returns ``None``; all inputs come from the
    module-level configuration constants.
    """
    print("Reading the data")
    train_data = cu.get_dataframe(train_file)

    print("Extracting features")
    train_features = features.extract_features(feature_names, train_data)

    print("Training the model")
    # ``compute_importances`` is deliberately not passed: the keyword was
    # deprecated and then removed from scikit-learn, so supplying it raises
    # TypeError on current releases. Nothing in this script reads the
    # importances, so omitting it does not change the predictions.
    rf = RandomForestClassifier(n_estimators=60, verbose=2, n_jobs=-1)
    rf.fit(train_features, train_data["OpenStatus"])

    print("Reading test file and making predictions")
    test_data = cu.get_dataframe(test_file)
    test_features = features.extract_features(feature_names, test_data)
    probs = rf.predict_proba(test_features)

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    # NOTE(review): presumably this re-weights the predictions from the class
    # balance of train_file to that of full_train_file, with 0.001 acting as
    # a cap on the probabilities - confirm against competition_utilities.
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()