# output/postScripts/dataFormat.py

# import libraries
import os
import sys
import toml
import numpy as np
import pandas as pd
from pathlib import Path

# -----------------------------------------------------------------------------
# experimental design from command line inputs
# -----------------------------------------------------------------------------

# expected call:
#   python dataFormat.py <workingDirectory> <folderName> <nseed>
# e.g.
#   python dataFormat.py /path/to/loslrRegulation myExperimentFolder 5
args = sys.argv

# stop with a usage message instead of an IndexError when an input is missing
if len(args) < 4:
    raise SystemExit(
        "usage: python dataFormat.py <workingDirectory> <folderName> <nseed>"
    )

# [1]: path to working directory; every relative path used below is resolved
# against it
os.chdir(args[1])

# [2]: folder name of experiment
folderName = args[2]

# [3]: number of seeds
nseed = int(args[3])

# load configuration file from folder; TOML documents are UTF-8 by
# specification, so do not rely on the platform default encoding
with open(
    "output/data/" + folderName + "/config.toml", "r", encoding="utf-8"
) as f:
    config = toml.load(f)

# -----------------------------------------------------------------------------
# set variables from config file
# -----------------------------------------------------------------------------

# number of decision variables and objectives
optParams = config["optimizationParameters"]
numDV, numObj = optParams["numDV"], optParams["numObj"]

# # forecast lead-time and skill
# leadtime = config["experimentalDesign"]["forecastLeadTime"]
# skill = config["experimentalDesign"]["forecastSkill"]

# names of the performance indicators and of the decision variables
pis = config["performanceIndicators"]["piName"]
dvConfig = config["decisionVariables"]
dvs = dvConfig["dvName"]

# normalization flag; normRange is only defined when the flag equals the
# string "True"
# NOTE(review): a TOML boolean (`normalized = true`) loads as Python True and
# would not match this string comparison - confirm the config stores a string
norm = dvConfig["normalized"]
if norm == "True":
    normRange = dvConfig["normalizedRange"]

# -----------------------------------------------------------------------------
# create directories for formatted data
# -----------------------------------------------------------------------------

# create the folders that receive the formatted data:
#   clean/                  clean decision variable and objective files
#   moeaFramework/          parent directory for the MOEAFramework files
#   moeaFramework/objs/     inputs
#   moeaFramework/metrics/  outputs
#   postAnalysis/           outputs
baseDir = "output/data/" + folderName
for subfolder in (
    "/clean/",
    "/moeaFramework/",
    "/moeaFramework/objs/",
    "/moeaFramework/metrics/",
    "/postAnalysis/",
):
    newpath = baseDir + subfolder
    # exist_ok=True makes reruns a no-op and avoids the gap between checking
    # for the folder and creating it
    os.makedirs(newpath, exist_ok=True)

# -----------------------------------------------------------------------------
# functions
# -----------------------------------------------------------------------------

# import custom functions from utils.py script
# sys.path.insert(1, os.getcwd() + "/functions")
# "." is resolved against the current working directory when the import runs,
# so the `functions` folder has to be reachable from the working directory
# that was set with os.chdir
sys.path.append(".")
from functions.utils import minmaxNorm


# find non dominated solutions across all pareto fronts generated by all seeds
# adapted from: https://stackoverflow.com/questions/32791911/fast-calculation-of-pareto-front-in-python
def is_pareto_efficient(costs, return_mask=True):
    """Find the pareto-efficient (non-dominated) rows of a table of costs.

    Every cost is treated as "lower is better". While a row is being
    examined, any other row that is not strictly lower in at least one cost
    is discarded. A row that exactly duplicates an already retained row is
    therefore discarded as well, so only the first occurrence survives.

    Args:
        costs: (n_points, n_costs) array or array-like (e.g. nested lists or
            a DataFrame); it is converted with ``np.asarray`` and is not
            modified.
        return_mask: True to return a boolean mask, False to return indices.

    Returns:
        If ``return_mask`` is True, an (n_points, ) boolean array that is True
        for the pareto-efficient rows. Otherwise an (n_efficient_points, )
        integer array holding the indices of those rows.
    """
    # positional row indexing below needs an ndarray, not a list or DataFrame
    costs = np.asarray(costs)

    n_points = costs.shape[0]
    is_efficient = np.arange(n_points)

    # next index in the is_efficient array to search for
    next_point_index = 0
    while next_point_index < len(costs):
        # keep rows that beat the current row in at least one cost
        nondominated_point_mask = np.any(costs < costs[next_point_index], axis=1)
        # the current row never beats itself, so keep it explicitly
        nondominated_point_mask[next_point_index] = True
        # remove dominated points
        is_efficient = is_efficient[nondominated_point_mask]
        costs = costs[nondominated_point_mask]
        # rows removed ahead of the current one shift its position, so the
        # next row to examine is counted from the surviving rows
        next_point_index = np.sum(nondominated_point_mask[:next_point_index]) + 1

    if return_mask:
        is_efficient_mask = np.zeros(n_points, dtype=bool)
        is_efficient_mask[is_efficient] = True
        return is_efficient_mask
    return is_efficient


# -----------------------------------------------------------------------------
# format raw borg output files for 1) MOEA Framework and 2) Clean version
# -----------------------------------------------------------------------------

# objectives of every seed (used to build the MOEAFramework reference set)
pf = []
# seed id, decision variables and objectives of every seed
dvpf = []

for s in range(1, nseed + 1):
    # files to read
    pfFile = "output/data/" + folderName + "/raw/pareto_front_S" + str(s) + ".txt"
    rtFile = "output/data/" + folderName + "/raw/runtime_S" + str(s) + ".txt"

    # first, check to see if complete run
    # if yes, load discovered pareto front for the seed of interest
    if Path(pfFile).is_file():
        pfSeed = pd.read_csv(
            pfFile,
            sep=" ",
            header=None,
            skiprows=2,
        )

    # if not, take last pareto front from runtime file
    else:
        # walk the runtime file backwards and collect the solution lines that
        # follow the last "//" line
        data = []
        with open(rtFile) as f:
            for line in reversed(f.readlines()):
                if line.startswith("//"):
                    break
                # "#" lines and blank lines hold no solution and cannot be
                # parsed as floats
                if line.startswith("#") or not line.strip():
                    continue
                data.append(line)

        # convert to data frame and drop the leading column, which is not a
        # decision variable or objective (the runtime files label it "NFE")
        pfSeed = pd.DataFrame([[float(x) for x in e.split()] for e in data])
        pfSeed = pfSeed.iloc[:, 1:]

        # one space-separated text line per solution (NaN padding left out)
        rows = [
            " ".join(row.dropna().astype(str)) for _, row in pfSeed.iterrows()
        ]

        # add in beginning lines
        line1 = "# Borg Optimization Results"
        line2 = (
            "# First "
            + str(numDV)
            + " are the decision variables, last "
            + str(numObj)
            + " are the objective values"
        )

        # save as plain text so the comma in the header stays unescaped (a csv
        # writer with QUOTE_NONE would emit "\,"); the two header lines match
        # the skiprows=2 used when the file is read back
        with open(pfFile, "w") as f:
            f.write("\n".join([line1, line2] + rows) + "\n")

    # drop columns with NAs (usually just a weird output that sometimes happens from borg)
    pfSeed = pfSeed.dropna(axis=1, how="all")
    pfSeed = pfSeed.dropna(axis=0, how="any")

    # set column names
    pfSeed.columns = dvs + pis

    # if decision variables are [0, 1] normalized for Borg, backnormalize
    if norm == "True":
        lowerBounds = config["decisionVariables"]["lowerBounds"]
        upperBounds = config["decisionVariables"]["upperBounds"]
        for d, tmpDV in enumerate(dvs):
            tmpValues = list(pfSeed.loc[:, tmpDV])
            dvRange = [lowerBounds[d], upperBounds[d]]

            trueVals = [
                minmaxNorm(x, dvRange, normRange, method="backtransform")
                for x in tmpValues
            ]

            pfSeed.loc[:, tmpDV] = trueVals

    # save clean pareto front to file for reference later
    cleanPF = "output/data/" + folderName + "/clean/pareto_front_S" + str(s) + ".txt"
    pfSeed.to_csv(
        cleanPF,
        sep=",",
        header=True,
        index=False,
    )

    # tag the rows with their seed only after the clean file is written, so
    # the clean file holds decision variables and objectives only
    pfSeed.insert(0, "Seed", s)
    dvpf.append(pfSeed)

    # extract just objectives and save in text file with # at the end for use in the MOEAFramework
    moeaFile = (
        "output/data/"
        + folderName
        + "/moeaFramework/objs/pareto_front_S"
        + str(s)
        + ".txt"
    )
    objSeed = pfSeed[pis]
    objSeed.to_csv(
        moeaFile,
        sep=" ",
        header=False,
        index=False,
    )

    with open(moeaFile, "a") as f:
        f.write("#")

    pf.append(objSeed)

# -----------------------------------------------------------------------------
# update or create reference set for MOEAFramework
# -----------------------------------------------------------------------------

# stack the objective tables of all seeds into one data frame with a fresh
# 0..n-1 index
objsDF = pd.concat(pf, ignore_index=True)
objs = objsDF.to_numpy(dtype="float")

# reference set: rows that are non-dominated across all seeds
refsetInd = pd.Series(is_pareto_efficient(objs, return_mask=True))
refsetUpdated = objsDF.loc[refsetInd].reset_index(drop=True)

# write the reference set without header or index, space separated
moeaFile = "output/data/" + folderName + "/moeaFramework/objRefset.txt"
refsetUpdated.to_csv(moeaFile, sep=" ", header=False, index=False)

# close the file with a "#" marker, as done for the per-seed objective files
with open(moeaFile, "a") as f:
    f.write("#")

# -----------------------------------------------------------------------------
# save nondominated policies with decision variables
# -----------------------------------------------------------------------------

# stack the full per-seed tables (seed, decision variables, objectives) into
# one data frame with a fresh 0..n-1 index
pols = pd.concat(dvpf, ignore_index=True)
objs = pols[pis].to_numpy(dtype="float")
# objs = pols.iloc[:, -numObj:].to_numpy(dtype="float")

# keep the policies whose objectives are non-dominated across all seeds
refsetInd = pd.Series(is_pareto_efficient(objs, return_mask=True))
refsetUpdated = pols.loc[refsetInd].reset_index(drop=True)

# number the retained policies from 0 in a leading "ID" column
refsetUpdated.insert(0, "ID", range(len(refsetUpdated)))

# save updated reference set as a tab separated table with header
policyFile = "output/data/" + folderName + "/NonDominatedPolicies.txt"
refsetUpdated.to_csv(policyFile, sep="\t", header=True, index=False)

# -----------------------------------------------------------------------------
# extract objectives from runtime files
# -----------------------------------------------------------------------------

for s in range(1, nseed + 1):
    # split the raw runtime file of this seed into runtime-dynamics lines
    # (prefixed with "//") and solution lines; "#" lines are skipped
    dynLines = []
    rtLines = []

    rtFile = "output/data/" + folderName + "/raw/runtime_S" + str(s) + ".txt"

    with open(rtFile) as f:
        for line in f:
            if line.startswith("#"):
                continue
            target = dynLines if line.startswith("//") else rtLines
            target.append(line.replace("\n", ""))

    # save clean runtime dynamics to file for reference later
    dynFile = "output/data/" + folderName + "/clean/dynamics_S" + str(s) + ".txt"
    dynSeed = pd.DataFrame(dynLines)
    dynSeed.to_csv(dynFile, sep=",", header=False, index=False)

    # parse the solution lines: first value is the NFE, followed by the
    # decision variables and the objectives
    rtSeed = pd.DataFrame([[float(x) for x in e.split()] for e in rtLines])
    rtSeed.columns = ["NFE"] + dvs + pis
    # rtSeed.columns = ["NFE"] + list(range(0, numDV + numObj))

    # save clean output to file for reference later
    rtFile = "output/data/" + folderName + "/clean/runtime_S" + str(s) + ".txt"
    rtSeed.to_csv(rtFile, sep=",", header=True, index=False)

    # save objectives only to evaluate in MOEAFramework: one section per NFE
    # value, each terminated by a "#" line
    filename = (
        "output/data/"
        + folderName
        + "/moeaFramework/objs/runtime_S"
        + str(s)
        + ".txt"
    )
    freq = rtSeed["NFE"].unique()

    for fe, nfe in enumerate(freq):
        # filter by NFE of interest and extract objectives
        tmp = rtSeed.loc[rtSeed["NFE"] == nfe, pis]

        # the first section overwrites any existing file, later ones append
        tmp.to_csv(
            filename,
            sep=" ",
            index=False,
            header=False,
            mode="w" if fe == 0 else "a",
        )

        with open(filename, "a") as f:
            f.write("#\n")