diff --git a/admin/ctmixtures-export-data.py b/admin/ctmixtures-export-data.py deleted file mode 100755 index ab8c8e7..0000000 --- a/admin/ctmixtures-export-data.py +++ /dev/null @@ -1,167 +0,0 @@ -#!/usr/bin/env python - -# Copyright (c) 2013. Mark E. Madsen -# -# This work is licensed under the terms of the Apache Software License, Version 2.0. See the file LICENSE for details. - -import ming -import csv -import os -import logging as log -import tempfile -import argparse -import madsenlab.axelrod.utils as utils -import madsenlab.axelrod.data as data - -# Prototype: -# mongoexport --db f-test_samples_postclassification --collection pergeneration_stats_postclassification --csv --out pgstats.csv --fieldFile fieldlist - -mongoexport = "mongoexport " - - - -## setup - -def setup(): - global args, config, simconfig - parser = argparse.ArgumentParser() - parser.add_argument("--experiment", help="provide name for experiment, to be used as prefix for database collections") - parser.add_argument("--debug", help="turn on debugging output") - parser.add_argument("--dbhost", help="database hostname, defaults to localhost", default="localhost") - parser.add_argument("--dbport", help="database port, defaults to 27017", default="27017") - parser.add_argument("--configuration", help="Path to configuration file") - parser.add_argument("--stats", choices=['pop', 'sampled', 'tasampled'], required=True) - parser.add_argument("--filename", help="path to file for export", required=True) - - args = parser.parse_args() - - simconfig = utils.MixtureConfiguration(args.configuration) - - if args.stats == 'pop': - simconfig = utils.AxelrodConfiguration(args.configuration) - elif args.stats == 'sampled': - simconfig = utils.AxelrodExtensibleConfiguration(args.configuration) - elif args.stats == 'tasampled': - simconfig = utils.TreeStructuredConfiguration(args.configuration) - else: - log.error("This shouldn't happen - args.model = %s", args.model) - - if args.debug == 1: - 
log.basicConfig(level=log.DEBUG, format='%(asctime)s %(levelname)s: %(message)s') - else: - log.basicConfig(level=log.INFO, format='%(asctime)s %(levelname)s: %(message)s') - - - #### main program #### - log.info("EXPORT DATA TO CSV - Experiment: %s", args.experiment) - data.set_experiment_name(args.experiment) - data.set_database_hostname(args.dbhost) - data.set_database_port(args.dbport) - config = data.getMingConfiguration(data.modules) - ming.configure(**config) - - - -def export_collection_to_csv(database, collection_name, fieldlist): - - outputFileName = "data_" - outputFileName += collection_name - outputFileName += ".csv" - - fieldFile = tempfile.NamedTemporaryFile(mode="w+t",suffix=".txt",dir="/tmp",delete=False) - fieldFileName = fieldFile.name - log.debug("Saving field list to %s", fieldFileName) - - for field in fieldlist: - fieldFile.write(field) - fieldFile.write('\n') - - fieldFile.flush() - - args = [] - args.append(mongoexport) - args.append("--db") - args.append(database) - args.append("--collection") - args.append(collection_name) - args.append("--csv") - args.append("--fieldFile") - args.append(fieldFileName) - args.append("--out") - args.append(outputFileName) - - log.debug("args: %s", args) - retcode = os.system(" ".join(args)) - log.debug("return code: %s", retcode) - - - - -if __name__ == "__main__": - setup() - - - fieldnames = data.axelrod_run_treestructured.columns_to_export_for_analysis() - orig_fields = fieldnames[:] - fieldnames.extend(["cultureid", "culture_count", "mean_radii", "sd_radii", - "orbit_number", "autgroupsize", "remaining_density", - "mean_degree", "sd_degree", - "mean_orbit_multiplicity", "sd_orbit_multiplicity", - "max_orbit_multiplicity","order", "msg_lambda", "msg_beta", "mem_beta"]) - ofile = open(args.filename, "wb") - writer = csv.DictWriter(ofile, fieldnames=fieldnames, quotechar='"', quoting=csv.QUOTE_ALL) - - headers = dict((n,n) for n in fieldnames) - writer.writerow(headers) - - if args.finalized == True: - 
cursor = data.AxelrodStatsTreestructured.m.find(dict(run_finalized=1),dict(timeout=False)) - else: - cursor = data.AxelrodStatsTreestructured.m.find(dict(),dict(timeout=False)) - - - - for sample in cursor: - row = dict() - for field in sorted(orig_fields): - row[field] = sample[field] - - # now pull apart the trait graph list - producing a row for each element of the trait graph list - tg_stats = sample['trait_graph_stats'] - for tg in tg_stats: - #log.info("tg: %s", tg) - row['cultureid'] = tg['cultureid'] - row['culture_count'] = tg['culture_count'] - row['mean_radii'] = tg['mean_radii'] - row['sd_radii'] = tg['sd_radii'] - row['mean_degree'] = tg['mean_degree'] - row['sd_degree'] = tg['sd_degree'] - row['orbit_number'] = tg['orbit_number'] - row['autgroupsize'] = tg['autgroupsize'] - row['remaining_density'] = tg['remaining_density'] - row['mean_orbit_multiplicity'] = tg['mean_orbit_multiplicity'] - row['sd_orbit_multiplicity'] = tg['sd_orbit_multiplicity'] - row['max_orbit_multiplicity'] = tg['max_orbit_multiplicity'] - row['order'] = tg['order'] - row['msg_lambda'] = tg['msg_lambda'] - row['msg_beta'] = tg['msg_beta'] - row['mem_beta'] = tg['mem_beta'] - - - #log.info("row: %s", row) - writer.writerow(row) - - ofile.close() - - - - - - - - - - - - - diff --git a/analytics/ctmixtures-export-data.py b/analytics/ctmixtures-export-data.py new file mode 100755 index 0000000..4c1d4c4 --- /dev/null +++ b/analytics/ctmixtures-export-data.py @@ -0,0 +1,231 @@ +#!/usr/bin/env python + +# Copyright (c) 2013. Mark E. Madsen +# +# This work is licensed under the terms of the Apache Software License, Version 2.0. See the file LICENSE for details. 
+ +import ming +import csv +import logging as log +import argparse +import ctmixtures.data as data + + +############################################################################ +def setup(): + global args, config, simconfig + parser = argparse.ArgumentParser() + parser.add_argument("--experiment", help="provide name for experiment, to be used as prefix for database collections") + parser.add_argument("--debug", help="turn on debugging output") + parser.add_argument("--dbhost", help="database hostname, defaults to localhost", default="localhost") + parser.add_argument("--dbport", help="database port, defaults to 27017", default="27017") + parser.add_argument("--configuration", help="Path to configuration file") + parser.add_argument("--filename", help="path and base filename for exports (DO NOT include *.csv extension)", required=True) + + args = parser.parse_args() + + if args.debug == 1: + log.basicConfig(level=log.DEBUG, format='%(asctime)s %(levelname)s: %(message)s') + else: + log.basicConfig(level=log.INFO, format='%(asctime)s %(levelname)s: %(message)s') + + #### main program #### + log.info("EXPORT DATA TO CSV - Experiment: %s", args.experiment) + data.set_experiment_name(args.experiment) + data.set_database_hostname(args.dbhost) + data.set_database_port(args.dbport) + config = data.getMingConfiguration(data.modules) + ming.configure(**config) + + + +############################################################################ +def export_simulation_record(): + # ## Export a simulation record file, with all params and classes used, random + ### seed, whatever is needed to replicate the simulations + full_filename = '' + full_filename += args.filename + full_filename += "-simulation-data.csv" + sim_fields = data.mixture_model_stats.sim_record_columns_to_export() + ofile = open(full_filename, "wb") + writer = csv.DictWriter(ofile, fieldnames=sim_fields, quotechar='"', quoting=csv.QUOTE_ALL) + headers = dict((n, n) for n in sim_fields) + 
writer.writerow(headers) + cursor = data.MixtureModelStats.m.find(dict(), dict(timeout=False)) + for sample in cursor: + row = dict() + for field in sim_fields: + row[field] = sample[field] + + # correct kandler_interval from timesteps to generations + row['kandler_interval'] = int(row['kandler_interval']) / int(row['population_size']) + + #log.info("sim data row: %s", row) + writer.writerow(row) + ofile.close() + + +############################################################################ +# # whole population statistics +# slatkin_exact = Field([float]) +# shannon_entropy = Field([float]) +# iqv_diversity = Field([float]) +# num_trait_configurations = Field(int) +# trait_configuration_counts = Field([]) +# configuration_slatkin = Field(float) +# unlabeled_frequencies = Field([]) +# unlabeled_counts = Field([]) +# pop_richness = Field([int]) + +def export_population_stats(): + # ## Export a full population census statistics file ### + full_filename = '' + full_filename += args.filename + full_filename += "-population-data.csv" + pop_fields = data.mixture_model_stats.pop_columns_to_export() + + # adjust the fields for the new summary statistics + pop_fields.append('slatkin_locus_max') + pop_fields.append('slatkin_locus_min') + pop_fields.append('entropy_locus_max') + pop_fields.append('entropy_locus_min') + pop_fields.append('iqv_locus_max') + pop_fields.append('iqv_locus_min') + pop_fields.append('richness_locus_max') + pop_fields.append('richness_locus_min') + pop_fields.append('kandler_locus_max') + pop_fields.append('kandler_locus_min') + + ofile = open(full_filename, "wb") + writer = csv.DictWriter(ofile, fieldnames=pop_fields, quotechar='"', quoting=csv.QUOTE_ALL) + headers = dict((n, n) for n in pop_fields) + writer.writerow(headers) + + cursor = data.MixtureModelStats.m.find(dict(), dict(timeout=False)) + for sample in cursor: + row = dict() + row['simulation_run_id'] = sample['simulation_run_id'] + row['model_class_label'] = sample['model_class_label'] 
+ row['num_trait_configurations'] = sample['num_trait_configurations'] + row['configuration_slatkin'] = sample['configuration_slatkin'] + + # slatkin exact + slatkin_values = sample['slatkin_exact'] + row['slatkin_locus_max'] = max(slatkin_values) + row['slatkin_locus_min'] = min(slatkin_values) + + # shannon entropy + entropy_list = sample['shannon_entropy'] + row['entropy_locus_max'] = max(entropy_list) + row['entropy_locus_min'] = min(entropy_list) + + # IQV + iqv_list = sample['iqv_diversity'] + row['iqv_locus_max'] = max(iqv_list) + row['iqv_locus_min'] = min(iqv_list) + + # Per-locus richness + richness_list = sample['pop_richness'] + row['richness_locus_max'] = max(richness_list) + row['richness_locus_min'] = min(richness_list) + + # Kandler remaining per locus + kandler_list = sample['kandler_remaining_count'] + row['kandler_locus_max'] = max(kandler_list) + row['kandler_locus_min'] = min(kandler_list) + + #log.info("sim data row: %s", row) + writer.writerow(row) + ofile.close() + +############################################################################ +# # results by sample size +# unlabeled_freq_ssize = Field(schema.Anything) +# unlabeled_counts_ssize = Field(schema.Anything) +# unlabeled_config_counts_ssize = Field(schema.Anything) +# num_configurations_ssize = Field(schema.Anything) +# config_slatkin_ssize = Field(schema.Anything) +# entropy_ssize = Field(schema.Anything) +# iqv_ssize = Field(schema.Anything) +# richness_ssize = Field(schema.Anything) +# slatkin_ssize = Field(schema.Anything) +# kandler_remaining_count = Field([int]) + +def export_sampled_stats(): + ## export a file with sampled statistics + full_filename = '' + full_filename += args.filename + full_filename += "-sampled-data.csv" + + sim_fields = data.mixture_model_stats.ssize_columns_to_export() + + ofile = open(full_filename, "wb") + writer = csv.DictWriter(ofile, fieldnames=sim_fields, quotechar='"', quoting=csv.QUOTE_ALL) + headers = dict((n, n) for n in sim_fields) + 
writer.writerow(headers) + cursor = data.MixtureModelStats.m.find(dict(), dict(timeout=False)) + + for sample in cursor: + pass + + ofile.close() + +############################################################################ +# # results for TA intervals over all sample sizes +# unlabeled_freq_ta_ssize = Field(schema.Anything) +# richness_ta_ssize = Field(schema.Anything) +# slatkin_ta_ssize = Field(schema.Anything) +# entropy_ta_ssize = Field(schema.Anything) +# iqv_ta_ssize = Field(schema.Anything) +# unlabeled_config_counts_ta_ssize = Field(schema.Anything) +# num_configurations_ta_ssize = Field(schema.Anything) +# config_slatkin_ta_ssize = Field(schema.Anything) +# config_entropy_ta_ssize = Field(schema.Anything) +# config_iqv_ta_ssize = Field(schema.Anything) +# kandler_remaining_tassize = Field(schema.Anything) + +def export_ta_sampled_stats(): + ## export a file with sampled statistics + full_filename = '' + full_filename += args.filename + full_filename += "-tasampled-data.csv" + + sim_fields = data.mixture_model_stats.tassize_columns_to_export() + + ofile = open(full_filename, "wb") + writer = csv.DictWriter(ofile, fieldnames=sim_fields, quotechar='"', quoting=csv.QUOTE_ALL) + headers = dict((n, n) for n in sim_fields) + writer.writerow(headers) + cursor = data.MixtureModelStats.m.find(dict(), dict(timeout=False)) + + for sample in cursor: + pass + + ofile.close() + + +############################################################################ + +if __name__ == "__main__": + setup() + export_simulation_record() + export_population_stats() + export_sampled_stats() + export_ta_sampled_stats() + + + + + + + + + + + + + + + + + diff --git a/ctmixtures/data/mixture_model_stats.py b/ctmixtures/data/mixture_model_stats.py index 582dc6c..c245feb 100644 --- a/ctmixtures/data/mixture_model_stats.py +++ b/ctmixtures/data/mixture_model_stats.py @@ -98,19 +98,23 @@ def store_stats_mixture_model(config, timestep, num_configs, return True -def 
columns_to_export_for_analysis(): - """ - :return: - """ + +def sim_record_columns_to_export(): cols = [ "simulation_run_id", "model_class_label", "network_class", "interaction_rule_classes", + "innovation_class", + "pop_class", + "network_class", + "trait_class", "random_seed", - "sample_time", "num_features", + "init_traits_per_feature", + "script_filename", + "full_command_line", "population_size", "innovation_rate", "conformism_strength", @@ -119,35 +123,40 @@ def columns_to_export_for_analysis(): ] return cols -def tassize_columns_to_export_for_analysis(): + + +# the following methods have defined all columns to export in the past, +# but as feature transforms and other summary statistics have grown +# in importance, all that is now done in analytics/ctmixtures-export-data.py +# What remains are important stub columns or invariant columns. + +def pop_columns_to_export(): + """ + + :return: + """ cols = [ "simulation_run_id", - "network_class", - "interaction_rule_classes", - "sample_time", - "num_features", - "population_size", - "innovation_rate", - "conformism_strength", - "anticonformism_strength", - "kandler_interval" + "model_class_label", + "num_trait_configurations", + "configuration_slatkin", ] + return cols + -def ssize_columns_to_export_for_analysis(): +def ssize_columns_to_export(): cols = [ "simulation_run_id", - "network_class", - "interaction_rule_classes", - "sample_time", - "num_features", - "population_size", - "innovation_rate", - "conformism_strength", - "anticonformism_strength", - "kandler_interval" + "model_class_label", ] + return cols - +def tassize_columns_to_export(): + cols = [ + "simulation_run_id", + "model_class_label", + ] + return cols