-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSeparate_Train_Test.py
49 lines (37 loc) · 1.24 KB
/
Separate_Train_Test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/python
#########################
#Author: Jessica Freeze
#Last Edited 3/06/19
#The purpose of this script is to split data into train and test sets.
#If you desire random splits every run of training, use an sklearn or other package.
#This will make permanent files you can refer back to and analyze.
########################
import random
def seper():
#Collect user input for full dataset and percentage to be used for training
TotalFile=input("What is the name of the file you would like split into a train and test set?")
split=input("What integer percentage of data would you like to be used for training?")
#Make train and test filenames
TrainName=str(TotalFile)+"_train.csv"
TestName=str(TotalFile)+"_test.csv"
#Open the full dataset file
f = open(TotalFile,"r")
#Open new files to store train and test sets
g=open(TrainName,"w+")
h=open(TestName,"w+")
#Make the integer for a random ratio split
percent=int(100/split)
#For each datapoint in the dataset, determine if it will be added to train or test.
for line in f:
if random.randint(0,percent) == 1:
#Write to train
g.write(line)
else:
#Write to test
h.write(line)
#Close files
f.close()
g.close()
h.close()
if __name__ == "__main__":
seper()