#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 20 16:52:22 2016
@author: john
"""
import numpy as np
import matplotlib.pyplot as plt
from astropy.io import fits
from galpy.orbit import Orbit
from galpy.potential import MWPotential2014
from galpy.potential import KeplerPotential
from galpy.actionAngle import estimateDeltaStaeckel
from galpy.actionAngle import actionAngleStaeckel
from galpy.util import bovy_conversion
from astropy import units
import pickle as cp
from astropy.io import ascii
# let's say we have some files with the 10 densest clusters and load them up;
# they will have a structure like so:
# pickled_file_1 = [[IDs_1], [IDs_2], [IDs_3], ..., [IDs_10]]
# pickled_file_2 = [[IDs_1], [IDs_2], [IDs_3], ..., [IDs_10]]
# etc.
files = ["apo_fe_km512_all"]  # one file for now; extend as files = [pickled_file_1, pickled_file_2, etc.]
selection_criteria_dict = {}  # maps each file name to a description of its
# selection criteria, for use on graphs later
all_data = []
for file in files:
    with open("data/" + file + ".pkl", "rb") as fd:
        data = cp.load(fd)
    all_data.append(data)
# now all_data = [data_1, data_2, etc.] where each data = [[IDs_1], [IDs_2], [IDs_3], ..., [IDs_10]]
def load_data():
    # this function grabs any data we may need; the file has been built and saved
    # leave this part alone unless you know what you're doing
    print("loading up chem, spatial and distances")
    try:
        fd = open("data/APO_3sources.pkl", "rb")
        print("fast load from saved data...")
        x = cp.load(fd)
        fd.close()
    except IOError:
        print("slow load from 3 data tables...")
        # load up chemical data from Sarah's subset
        fe_data_source = open('data/stars-clusters.dat')
        c = fe_data_source.readlines()
        fe_data_source.close()
        abundances = ascii.read(c)
        labels_Fe = ["APOGEE_ID", "FE_H",
                     "AL_FE", "CA_FE", "C_FE", "K_FE", "MG_FE", "MN_FE", "NA_FE",
                     "NI_FE", "N_FE", "O_FE", "SI_FE", "S_FE", "TI_FE", "V_FE"]
        chem_data = np.vstack([abundances.field(label) for label in labels_Fe]).T
        # grab the distances for everything from Sarah's distance file
        hdudist = fits.open('data/allStar+-v603.150209.fits')
        distances = hdudist[1].data.field('DISO').T[1].T
        ap_id = hdudist[1].data.field('APOGEE_ID')
        dist_table = np.vstack([ap_id, distances]).T
        # grab spatial data from Hogg file
        hdulist = fits.open('data/results-unregularized-matched.fits')
        meta_source = hdulist[1].data
        # could include this data at some point
        # labels_H = ["FE_H",
        #             "AL_H", "CA_H", "C_H", "K_H", "MG_H", "MN_H", "NA_H",
        #             "NI_H", "N_H", "O_H", "SI_H", "S_H", "TI_H", "V_H"]
        spatial_labels = ["RA", "DEC", "PMRA", "PMDEC", "VHELIO_AVG",
                          "TEFF_ASPCAP", "LOGG_ASPCAP", "FIELD", "APOGEE_ID"]
        spatial_data = np.vstack([meta_source.field(label) for label in spatial_labels]).T
        print("saving the file for later use")
        fd = open("data/APO_3sources.pkl", "wb")
        # now we have three data tables
        x = [chem_data, spatial_data, dist_table]
        cp.dump(x, fd)
        fd.close()
    return x
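# Note: load_data() caches the merged tables in data/APO_3sources.pkl, so the
# slow three-table merge only runs on the first call (or after the cache file
# is deleted); every later call takes the fast path above.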
def retrieve_data(IDs_list):
    data_triplet = load_data()
    chem_data = data_triplet[0].copy()
    spatial_data = data_triplet[1].copy()
    dist_table = data_triplet[2].copy()
    # cut the data above down to a [chem, spatial, dist] triplet for just these IDs
    # initialize each mask as all False
    chem_mask = (chem_data.T[0] == 'placeholder')
    spatial_mask = (spatial_data.T[-1] == 'placeholder')  # the ID is the last column in spatial
    dist_mask = (dist_table.T[0] == 'placeholder')
    for ID in IDs_list:
        chem_mask |= (chem_data.T[0] == ID)
        spatial_mask |= (spatial_data.T[-1] == ID)
        dist_mask |= (dist_table.T[0] == ID)
    chem_data = chem_data[chem_mask]
    spatial_data = spatial_data[spatial_mask]
    dist_table = dist_table[dist_mask]  # can turn into a dict if we want
    # the triplet we return is what we mean by "cluster":
    # it contains all the details of the given cluster
    return [chem_data, spatial_data, dist_table]
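# A minimal usage sketch (the ID below is a made-up APOGEE-style identifier,
# purely illustrative):
# cluster = retrieve_data(["2M00000000+0000000"])
# cluster[0] -> chemical rows, cluster[1] -> spatial rows, cluster[2] -> distance rows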
def cspace_information(cluster):
    # calculates the average distance from the cluster centroid in chemical space
    centre = np.zeros(len(cluster[0][0]) - 1)
    # calculate the centre first
    for i in cluster[0]:
        centre += i[1:].astype(float)
    centre = centre / len(cluster[0])  # divide by the number of stars to get the centroid
    distances = []
    for i in cluster[0]:
        distance = (np.absolute(centre - i[1:].astype(float))).mean()
        distances.append(distance)
    avg = sum(distances) / len(distances)
    return avg
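# In symbols: with abundance vectors x_i and centroid c = (1/N) sum_i x_i, the
# value returned above is (1/N) sum_i mean_d |c_d - x_{i,d}|, i.e. the mean over
# stars of the mean absolute deviation across the chemical dimensions.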
def emp_density(cluster):
    # print("computing density...")
    data = np.delete(cluster[0], 0, 1)
    data = data.astype(float)
    N, D = data.shape
    subdata = data - (np.mean(data, axis=0))[None, :]
    variance = np.sum(subdata[:, :, None] * subdata[:, None, :], axis=0) / (N - 1.)
    s, logdets = np.linalg.slogdet(variance)
    density = N * np.exp(-0.5 * logdets)
    return density, N
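# The density above works out to N / sqrt(det(Sigma)), where Sigma is the
# sample covariance of the abundance vectors (the ID column is dropped first):
# the star count divided by the volume of the covariance ellipsoid, so tight,
# well-populated clusters score highest. slogdet is used rather than det to
# stay numerically stable when the determinant is tiny.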
def cluster_dynamics(cluster):
    orbits = []
    dist_dict = dict(cluster[2])
    for i in cluster[1]:
        ra = float(i[0])
        dec = float(i[1])
        dist = float(dist_dict[i[-1]])
        pmRA = float(i[2])
        pmDec = float(i[3])
        Vlos = float(i[4])
        # only attach stars where a distance and both proper motions are available
        if dist != 0.0 and pmRA != 0.0 and pmDec != 0.0:
            orbits.append(Orbit(vxvv=[ra, dec, dist, pmRA, pmDec, Vlos],
                                radec=True, ro=8., vo=220.,
                                solarmotion="schoenrich"))
    ts = np.linspace(0, 100, 10000)
    mwp = MWPotential2014
    print(len(orbits), "orbits with full phase-space information")
    # action angles!
    # the following gets confusing because we speak of "means" twice:
    # we want the mean J over each individual orbit, but to measure the spread
    # in the cluster we compare to the cluster mean of each J
    ro = 8
    vo = 220
    mean_js = []
    for o in orbits:
        try:
            o.integrate(ts, mwp, method='odeint')
            # z is a length, so it is scaled by ro, just like R
            delta_o = estimateDeltaStaeckel(MWPotential2014, o.R() / ro, o.z() / ro)
            aAS = actionAngleStaeckel(pot=MWPotential2014, delta=delta_o)
            js_o = aAS(o.R() / ro, o.vR() / vo, o.vT() / vo, o.z() / ro, o.vz() / vo)
            # the mean isn't really needed unless js is actually a vector;
            # for a single phase-space point it is not
            mean_js.append([js_o[0].mean(), js_o[1].mean(), js_o[2].mean()])
        except Exception:
            # the orbit is unbound, which would skew the statistics; skip it
            print("found unbound, discarded")
    mean_js = np.array(mean_js)
    print(mean_js.shape)
    # so for this cluster we have a (cluster length) x 3 array
    cluster_mean_js = np.array([mean_js.T[0].mean(), mean_js.T[1].mean(), mean_js.T[2].mean()])
    # average fractional difference from the cluster mean
    diffs = np.absolute(mean_js - cluster_mean_js) / cluster_mean_js
    dynamics_summary = np.array([diffs.T[0].mean(), diffs.T[1].mean(), diffs.T[2].mean()])
    return dynamics_summary
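# dynamics_summary holds the mean fractional spread |J_i - <J>| / <J> for each
# of the three actions (J_R, L_z, J_z) across the cluster members; values near
# zero mean the members move on very similar orbits.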
def full_analysis(clusters):
    cl = retrieve_data(clusters[0])
    chem_spread = cspace_information(cl)
    action_spread = cluster_dynamics(cl)
    dens = emp_density(cl)
    for cluster in clusters[1:]:
        cl = retrieve_data(cluster)
        chem_spread = np.hstack((chem_spread, cspace_information(cl)))
        action_spread = np.vstack((action_spread, cluster_dynamics(cl)))
        dens = np.vstack((dens, emp_density(cl)))
    return chem_spread, action_spread, dens
def density_check(clusters):
    cl = retrieve_data(clusters[0])
    dens = emp_density(cl)
    for cluster in clusters[1:]:
        cl = retrieve_data(cluster)
        dens = np.vstack((dens, emp_density(cl)))
    plt.semilogy(dens.T[1], dens.T[0], '.')
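# A minimal driver sketch, assuming the pickled ID lists above loaded cleanly;
# the axis labels and the final show() are illustrative additions:
if __name__ == "__main__":
    for data in all_data:
        chem_spread, action_spread, dens = full_analysis(data)
        print("chemical spread per cluster:", chem_spread)
        print("action spread per cluster (J_R, L_z, J_z):", action_spread)
        density_check(data)
    plt.xlabel("cluster size N")
    plt.ylabel("empirical density")
    plt.show()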