PDF_ReportApplyJob.py

"""
File: PDF_ReportApplyJob.py
Authors: Ryan J. Urbanowicz, Richard Zhang, Wilson Zhang
Institution: University of Pensylvania, Philadelphia PA
Creation Date: 6/1/2021
License: GPL 3.0
Description: Phase 10 of AutoMLPipe-BC - This 'Job' script is called by PDF_ReportApplyMain.py which generates a formatted PDF summary report of key
pipeline results (applying trained models to hold out replication data). It is run once.
"""
#Import required packages  ---------------------------------------------------------------------------------------------------------------------------
import pandas as pd
from fpdf import FPDF
from datetime import datetime
import glob
import os
import re
import sys

def job(experiment_path,rep_data_path,data_path):

    time = str(datetime.now())
    train_name = data_path.split('/')[-1].split('.')[0]
    experiment_name = experiment_path.split('/')[-1]
    #Function to Convert Dataset lists into Usable Strings to Write to the PDF
    #Find folders inside directory
    ds = []
    for datasetFilename in glob.glob(rep_data_path+'/*'):
        datasetFilename = str(datasetFilename).replace('\\','/')
        apply_name = datasetFilename.split('/')[-1].split('.')[0] #Save unique dataset names so that analysis is run only once if there is both a .txt and .csv version of dataset with same name.
        ds.append(apply_name)
    ds = sorted(ds)
    print(ds)

    ars_df = pd.read_csv(experiment_path+ '/'+'metadata.csv')
    ars_dic = []
    for i in range(len(ars_df)):
       if i >= 0:
          ars_dic.append(ars_df.iloc[i, 0]+': ')
          ars_dic.append(ars_df.iloc[i, 1])
          ars_dic.append('\n')
       else:
          pass

    #Analysis Settings, Global Analysis Settings, ML Modeling Algorithms
    analy_report = FPDF('P', 'mm', 'A4')
    analy_report.set_margins(left=10, top=5, right=10, )
    analy_report.add_page(orientation='P')
    top = analy_report.y

    #ML Pipeline Analysis Report-------------------------------------------------------------------------------------------------------
    print("Starting Report")
    ls1 = ars_dic[0:59] # Class - filter poor [0:55] 59
    ls2 = ars_dic[59:98]  #ML modeling algorithms (NaiveB - ExSTraCS) [56:95] 60
    ls3 = ars_dic[98:114] #primary metric - hypersweep timeout [94:111] 97
    ls4 = ars_dic[114:129]  #LCS parameters (do LCS sweep - LCS hypersweep timeout) [110:125]
    analy_report.set_font('Times', 'B', 12)
    analy_report.cell(w=180, h=8, txt='AutoMLPipe-BC Apply Summary Report: '+time, ln=2, border=1, align='L')
    analy_report.y += 3
    analy_report.set_font(family='times', size=9)
    analy_report.multi_cell(w = 90,h = 4,txt='Pipeline Settings:'+'\n'+'\n'+listToString(ls1)+' '+listToString(ls3), border=1, align='L')
    analy_report.x += 90
    analy_report.y = analy_report.y - 104 #96
    analy_report.multi_cell(w = 90,h = 4,txt='ML Modeling Algorithms:'+'\n'+'\n'+listToString(ls2), border=1, align='L')
    analy_report.x += 90
    analy_report.y += 4
    analy_report.multi_cell(w = 90,h = 4,txt='LCS Settings (ExSTraCS,eLCS,XCS):'+'\n'+listToString(ls4), border=1, align='L')
    analy_report.y +=15 #10

    analy_report.cell(w = 180, h = 4, txt='Target Training Dataset: '+train_name, border=1, align='L')
    analy_report.y +=8
    analy_report.x = 10

    listDatasets = ''
    i = 1
    for each in ds:
        listDatasets = listDatasets+('D'+str(i)+' = '+str(each)+'\n')
        i += 1
    analy_report.multi_cell(w = 180, h = 4, txt='Applied Datasets: '+'\n'+listDatasets, border=1, align='L')
    footer(analy_report)

    #Data and Model Prediction Summary--------------------------------------------------------------------------------------
    print("Publishing Model Prediction Summary")
    for n in range(len(ds)):
        #Create PDF and Set Options
        analy_report.set_margins(left=1, top=1, right=1, )
        analy_report.add_page()
        analy_report.set_font('Times', 'B', 12)
        analy_report.cell(w=0, h = 8, txt="Dataset and Model Prediction Summary:  D"+str(n+1)+" = "+ds[n], border=1, align="L", ln=2)
        analy_report.set_font(family='times', size=9)

        #Exploratory Analysis ----------------------------
        analy_report.image(experiment_path+'/'+train_name+'/applymodel/'+ds[n]+'/exploratory/ClassCountsBarPlot.png', 5, 10, 70,48) #10, 30, 82)

        analy_report.x = 125
        analy_report.y = 55
        try:
            analy_report.image(experiment_path+'/'+train_name+'/applymodel/'+ds[n]+'/exploratory/FeatureCorrelations.png', 85, 10, 115) #88, 30, 120, 60)
        except:
            analy_report.cell(40, 4, 'No Feature Correlation Plot', 1, align="L")
            pass

        data_summary = pd.read_csv(experiment_path+'/'+train_name+'/applymodel/'+ds[n]+"/exploratory/DataCounts.csv")
        info_ls = []
        for i in range(len(data_summary)):
            info_ls.append(data_summary.iloc[i,0]+': ')
            info_ls.append(str(data_summary.iloc[i,1]))
            info_ls.append('\n')

        analy_report.x = 5
        analy_report.y = 58
        analy_report.multi_cell(w=60, h=4, txt='Variable:  Count'+'\n'+listToString(info_ls), border=1, align='L')

        #Report Best Algorithms by metric
        summary_performance = pd.read_csv(experiment_path+'/'+train_name+'/applymodel/'+ds[n]+"/model_evaluation/Summary_performance_mean.csv")
        summary_performance['ROC_AUC'] = summary_performance['ROC_AUC'].astype(float)
        highest_ROC = summary_performance['ROC_AUC'].max()
        algorithm = summary_performance[summary_performance['ROC_AUC'] == highest_ROC].index.values
        best_alg_ROC =  summary_performance.iloc[algorithm, 0]

        summary_performance['Balanced Accuracy'] = summary_performance['Balanced Accuracy'].astype(float)
        highest_BA = summary_performance['Balanced Accuracy'].max()
        algorithm = summary_performance[summary_performance['Balanced Accuracy'] == highest_BA].index.values
        best_alg_BA =  summary_performance.iloc[algorithm, 0]

        summary_performance['F1_Score'] = summary_performance['F1_Score'].astype(float)
        highest_F1 = summary_performance['F1_Score'].max()
        algorithm = summary_performance[summary_performance['F1_Score'] == highest_F1].index.values
        best_alg_F1 =  summary_performance.iloc[algorithm, 0]

        summary_performance['PRC_AUC'] = summary_performance['PRC_AUC'].astype(float)
        highest_PRC = summary_performance['PRC_AUC'].max()
        algorithm = summary_performance[summary_performance['PRC_AUC'] == highest_PRC].index.values
        best_alg_PRC = summary_performance.iloc[algorithm, 0]

        summary_performance['PRC_APS'] = summary_performance['PRC_APS'].astype(float)
        highest_APS = summary_performance['PRC_APS'].max()
        algorithm = summary_performance[summary_performance['PRC_APS'] == highest_APS].index.values
        best_alg_APS = summary_performance.iloc[algorithm, 0]

        analy_report.x = 5
        analy_report.y = 87
        analy_report.multi_cell(w=70, h=4, txt="Best (ROC_AUC): "+ str(best_alg_ROC.values)+' = '+ str("{:.3f}".format(highest_ROC))+
                    '\n'+"Best (Balanced Acc.): "+ str(best_alg_BA.values)+' = '+ str("{:.3f}".format(highest_BA))+
                    '\n'+"Best (F1 Score): "+ str(best_alg_F1.values)+' = '+ str("{:.3f}".format(highest_F1))+
                    '\n'+"Best (PRC_AUC): "+ str(best_alg_PRC.values)+' = '+ str("{:.3f}".format(highest_PRC))+
                    '\n'+"Best (PRC_APS): "+ str(best_alg_APS.values)+' = '+ str("{:.3f}".format(highest_APS)), border=1, align='L')

        #ROC-------------------------------
        analy_report.x = 5
        analy_report.y = 112
        analy_report.cell(10, 4, 'ROC', 1, align="L")
        analy_report.image(experiment_path+'/'+train_name+'/applymodel/'+ds[n]+'/model_evaluation/Summary_ROC.png', 4, 118, 120)
        analy_report.image(experiment_path+'/'+train_name+'/applymodel/'+ds[n]+'/model_evaluation/metricBoxplots/Compare_ROC_AUC.png', 124, 118, 82,85)

        #PRC-------------------------------
        analy_report.x = 5
        analy_report.y = 200
        analy_report.cell(10, 4, 'PRC', 1, align="L")
        analy_report.image(experiment_path+'/'+train_name+'/applymodel/'+ds[n]+'/model_evaluation/Summary_PRC.png', 4, 206, 133) #wider to account for more text
        analy_report.image(experiment_path+'/'+train_name+'/applymodel/'+ds[n]+'/model_evaluation/metricBoxplots/Compare_PRC_AUC.png', 138, 205, 68,80)

        footer(analy_report)

    #Average Model Prediction Statistics--------------------------------------------------------------------------------------
    print("Publishing Average Model Prediction Statistics")
    for n in range(len(ds)):
        #Create PDF and Set Options
        analy_report.set_margins(left=1, top=1, right=1, )
        analy_report.add_page()
        analy_report.set_font('Times', 'B', 12)
        analy_report.cell(w=0, h = 8, txt="Average Model Prediction Statistics:  D"+str(n+1)+" = "+ds[n], border=1, align="L", ln=2)
        analy_report.set_font(family='times', size=7)

        stats_ds = pd.read_csv(experiment_path+'/'+train_name+'/applymodel/'+ds[n]+'/model_evaluation/Summary_performance_mean.csv',sep=',',index_col=0)
        stats_ds = stats_ds.round(4)

        #Format
        stats_ds.reset_index(inplace=True)
        stats_ds = stats_ds.columns.to_frame().T.append(stats_ds, ignore_index=True)
        stats_ds.columns = range(len(stats_ds.columns))
        epw = 208 #Amount of Space (width) Avaliable
        th = analy_report.font_size
        col_width = epw/float(10) #maximum column width

        #Print next 3 datasets
        table1 = stats_ds.iloc[: , :10]
        table1 = table1.to_numpy()
        for row in table1:
            for datum in row:
                analy_report.cell(col_width, th, str(datum), border=1)
            analy_report.ln(th) #critical
        analy_report.y += 5

        table1 = stats_ds.iloc[: , 10:18]
        met = stats_ds.iloc[:,0]
        table1 = pd.concat([met, table1], axis=1)
        table1 = table1.to_numpy()
        for row in table1:
            for datum in row:
                analy_report.cell(col_width, th, str(datum), border=1)
            analy_report.ln(th) #critical
        analy_report.y += 5

        footer(analy_report)

    #Output The PDF Object
    try:
        fileName = str(experiment_name)+'_ML_Pipeline_Apply_Report.pdf'
        analy_report.output(experiment_path+'/'+train_name+'/applymodel/'+ds[n]+'/'+fileName)
        print('PDF Generation Complete')
    except:
        print('Pdf Output Failed')


def listToString(s):
    str1 = " "
    return (str1.join(s))


#Create Footer
def footer(self):
    self.set_auto_page_break(auto=False, margin=3)
    self.set_y(285)
    self.set_font('Times', 'I', 7)
    self.cell(0, 7,'Generated with the URBS-Lab AutoMLPipe-BC: (https://github.com/UrbsLab/AutoMLPipe-BC)', 0, 0, 'C')
    self.set_font(family='times', size=9)

#Find N greatest ingegers within a list
def ngi(list1, N):
    final_list = []
    for i in range(0, N):
        max1 = 0
        for j in range(len(list1)):
            if list1[j] > max1:
                max1 = list1[j];
        list1.remove(max1);
        final_list.append(max1)

if __name__ == '__main__':
    job(sys.argv[1],sys.argv[2],sys.argv[3])