-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsampler.py
143 lines (110 loc) · 4.82 KB
/
sampler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import pandas
import numpy
import random
# Read the .csv file and parse it into a DataFrame
df = pandas.read_csv('Peru_2019_AudioMoth_Data_Full.csv')
# Get an array of the AudioMoth device names found in the .csv file
AudioMothName = df.drop_duplicates(subset=['AudioMothCode']).loc[:,'AudioMothCode'].values
# Devices that were known to have issues
invalid_devices = ['AM-8', 'AM-19', 'AM-21', 'AM-28']
# Find the devices that are not invalid in the array and add them to a temp array
temp_list = []
for device in AudioMothName:
if device not in list(invalid_devices):
temp_list.append(device)
# Set the temp array to the original array
AudioMothName = temp_list
# Remove the files from devices known to be invalid
for bad_device in invalid_devices:
df.drop(df[df['AudioMothCode'] == bad_device].index, inplace=True)
# Remove the files with an invalid length
target_size = 46000000
df.drop(df[df['FileSize'] < target_size].index, inplace=True)
# All the possible hours that can be shown
all_hours = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]
# Store the current position on the DataFrame.
hour_index = 0
# Create a DataFrame with the same columns to store the outputs
output = pandas.DataFrame(columns=df.columns)
# Repeat the random sampling for each device
for AudioMoth in AudioMothName:
# Create a dictionary to store the indices of the all the recordings for every hour
hours = {}
# For all the files in this device, add it to the dictionary under the corresponding hour
while df['AudioMothCode'].iloc[hour_index] == AudioMoth:
# Record the current hour
temp = str(df['StartDateTime'].iloc[hour_index])
# If the value isnt the date string then skip it (or end the loop)
if len(temp) < 13:
if hour_index == len(df) - 1:
break
hour_index += 1
else:
curr_hour = int(temp[11:13])
# If the hour isn't in the dictionary yet, add it
if curr_hour not in hours:
hours[curr_hour] = []
# If the hour is in the dictionary, add the index of the current row to that hour's list
if curr_hour in hours:
hours[curr_hour].append(hour_index)
# If the hour_index is at the end of the DataFrame, stop iterating
if hour_index == len(df) - 1:
break
# Go to the next row
hour_index += 1
# Find the number of clips needed from each hour
if len(hours) < 24 and len(hours) != 0:
num_per_hour = int(24 / len(hours))
else:
num_per_hour = int(len(hours) / 24)
if num_per_hour != 0:
num_per_hour = int( 1 / num_per_hour)
# Find the remaining number of clips needed
extras = abs(len(hours) % 24)
if num_per_hour == 24:
extras = 0
# Make a list of dictionarys holding the row data to add to the output file
rows_list = []
# For each hour, randomly pick the `num_per_hour` amount of
# samples to use and add them to row_list
for select_hour in hours:
for i in range(num_per_hour):
# Pick a random index in the hour's list
random.seed()
curr_list = hours[select_hour]
rand_num = random.randint(0, len(curr_list)-1)
# Add the data of the sample at this index to the dictionary, and
# add the dictionary to the row_list.
rowToAdd = {}
for column in df.columns:
rowToAdd[column] = df.iloc[curr_list[rand_num]][str(column)]
rows_list.append(rowToAdd)
# Remove the used sample from the list
# curr_list.remove(curr_list[rand_num]);
# If there are extra samples needed, randomly pick that many hours and
# take a sample from each.
for j in range(extras):
# Pick a random hour
random.seed()
rand_choice = random.randint(0, len(hours)-1)
# Get the list of that hour
temp = list(hours.keys())
key = temp[rand_choice]
curr_list = hours[key]
# Pick a random index in the hour's list
random.seed()
rand_num = random.randint(0, len(curr_list)-1)
# Add the data of the sample at this index to the dictionary, and
# add the dictionary to the row_list.
rowToAdd = {}
for column in df.columns:
rowToAdd[column] = df.iloc[curr_list[rand_num]][str(column)]
rows_list.append(rowToAdd)
# Remove the used sample from the list
#curr_list.remove(curr_list[rand_num]);
# Create a temporary DataFrame from the rows_list, and then add it to the output DataFrame
temp = pandas.DataFrame(rows_list)
output = output.append(temp, ignore_index=True)
# Convert the DataFrame to a .csv file
output.to_csv('stratified_random_sample.csv', index=False)
print('Sample Completed!')