#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 21 11:30:00 2016
@author: john
"""
import numpy as np
import sklearn.cluster as cl
from astropy.io import fits
import matplotlib.pyplot as plt
import pickle as cp  # not used below; presumably kept for dumping results later
#load the Cannon abundance-label data
data_labels = ["FE_H",
"AL_H", "CA_H", "C_H", "K_H", "MG_H", "MN_H", "NA_H",
"NI_H", "N_H", "O_H", "SI_H", "S_H", "TI_H", "V_H"]
hdulist = fits.open('data/results-unregularized-matched.fits')
cols = hdulist[1].columns
tbdata = hdulist[1].data
#maybe also add a cut to reject non-disk stars
mask = ((tbdata.field("TEFF_ASPCAP") > 3500.) *
(tbdata.field("TEFF_ASPCAP") < 5500.) *
(tbdata.field("LOGG_ASPCAP") > 0.) *
(tbdata.field("LOGG_ASPCAP") < 3.9)) # MKN recommendation
metadata_check_labels = ["RA", "DEC", "TEFF_ASPCAP", "LOGG_ASPCAP"]
metadata_check = np.vstack([tbdata.field(label) for label in metadata_check_labels]).T
mask *= np.all(np.isfinite(metadata_check), axis=1)
data = np.vstack([tbdata.field(label) for label in data_labels]).T
mask *= np.all(np.isfinite(data), axis=1)
data = data[mask]
print("data is shape: ",data.shape)
#convert to the iron basis: [X/Fe] = [X/H] - [Fe/H]
fe_data = np.vstack([data.T[i] - data.T[0] for i in range(len(data_labels))]).T
fe_data.T[0] = data.T[0]  # keep column 0 as [Fe/H] itself
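# e.g. a star with [Mg/H] = -0.1 and [Fe/H] = -0.3 now carries [Mg/Fe] = +0.2 in its MG_H column (illustrative numbers only)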
print("fe basis data is shape: ",fe_data.shape)
#pull out the metadata columns we actually need
metadata_labels = ["RA", "DEC", "PMRA", "PMDEC", "GLON", "GLAT", "VHELIO_AVG", "TEFF_ASPCAP", "LOGG_ASPCAP", "FIELD", "APOGEE_ID"]
metadata = np.vstack([tbdata.field(label) for label in metadata_labels]).T
metadata = metadata[mask]
print("metadata is shape: ",metadata.shape)
#now we want some kmeans up in here
#K = 256
#print("running kmeans, k = 256 on whole set...")
#km = cl.KMeans(n_clusters = K, random_state=42, n_init=32) #the algorithm
#clusters = km.fit_predict(data) #the important returned value: one cluster label per star
#centers = km.cluster_centers_.copy() #needed for variance
#
##compute the density so we can retrieve the densest
#print("kmeans is done, we now compute density")
##hogg style below (not sure how reliable this is):
#N, D = data.shape
#sizes = np.zeros(K).astype(int)
#logdets = np.zeros(K)
#fehs = np.zeros(K)
#for k in range(K):
#    I = (clusters == k)
#    sizes[k] = np.sum(I)
#    subdata = data[I,:] - (np.mean(data[I], axis=0))[None, :]
#    if sizes[k] > (D + 1):
#        variance = np.sum(subdata[:,:,None] * subdata[:,None,:], axis=0) / (sizes[k] - 1.)
#        s, logdets[k] = np.linalg.slogdet(variance)
#        assert s > 0
#    else:
#        logdets[k] = -np.inf
#    fehs[k] = np.mean(data[I, 0])
#densities = sizes * np.exp(-0.5 * logdets)
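## densities[k] is effectively N_k / sqrt(det C_k): cluster membership divided by the
## volume of the cluster's covariance ellipsoid, so large, tight clusters score highest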
#
#
##plot cluster size against cluster density
#plt.loglog(sizes, densities, '.')
#plt.xlabel("Cluster Size")
#plt.ylabel("Cluster Density")
#the resulting size-density relation looks quite similar to Hogg et al.
#remember: no scaling has been applied to the label space
#remember: the k-means above runs on the raw [X/H] data, not the iron-basis fe_data
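#a minimal sketch (commented out, like the block above) of how the same k-means could be
#run on the iron-basis data with per-label scaling; StandardScaler, K = 256 and the pickle
#filename below are assumptions for illustration, not choices settled in this script
#from sklearn.preprocessing import StandardScaler
#scaled_fe = StandardScaler().fit_transform(fe_data)  # zero mean, unit variance per label
#km_fe = cl.KMeans(n_clusters=256, random_state=42, n_init=32)
#fe_clusters = km_fe.fit_predict(scaled_fe)  # one cluster index per star
#with open("kmeans_fe_clusters.pkl", "wb") as f:  # hypothetical output file
#    cp.dump({"clusters": fe_clusters, "centers": km_fe.cluster_centers_}, f)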