Cleveland #111

Open
wants to merge 32 commits into base: legacy-april-17
32 commits
699d113
Read in ARPA data locally, close #70
Sep 6, 2022
c627f29
Add nerd box
Sep 6, 2022
2a1ddfe
Style nerdbox
Sep 6, 2022
ad1e1a5
Upload police employment data exploration notebook
Sep 28, 2022
6966dd7
work in progress on payroll data cleaning notebook
anastasiajourno Oct 5, 2022
5abd71c
working script and clean output data
anastasiajourno Oct 6, 2022
d26230a
one more source data and minor fixes in the output processing
anastasiajourno Oct 6, 2022
14848fd
adding initial scripts to prepare the data
anastasiajourno Oct 17, 2022
9056209
adding source allocation files
anastasiajourno Oct 25, 2022
4d688a4
filter and export cuyahoga projects to vet
anastasiajourno Oct 25, 2022
96716c8
adding list of cleveland places
anastasiajourno Oct 25, 2022
73692e9
clean and compile allocation data for greater cleveland
anastasiajourno Oct 25, 2022
9dafbbc
cleaning additional data
anastasiajourno Oct 25, 2022
b9f7295
adding violence as a keyword
anastasiajourno Oct 25, 2022
6c2e86b
fixing source files
anastasiajourno Oct 26, 2022
90a25d7
adding additional data as xlsx
anastasiajourno Oct 26, 2022
06d7fcc
clean additional spendings and spit out clean csv
anastasiajourno Oct 27, 2022
5882633
delete the file from the folder
anastasiajourno Oct 27, 2022
63d55b5
combine spendings
anastasiajourno Oct 27, 2022
760fd81
change additional to indirect and initial to direct
anastasiajourno Oct 28, 2022
fba789c
add scripts and files cleaning xlsx with indirect allocations
anastasiajourno Oct 28, 2022
edd6ba6
add scripts and files for cleaning one csv file with indirect allocat…
anastasiajourno Oct 28, 2022
d33a0a1
fixes naming and location output file indirect cj
anastasiajourno Oct 28, 2022
b387301
deleting body cams
anastasiajourno Oct 28, 2022
2763ea2
updated data following Rachel's reporting
anastasiajourno Oct 31, 2022
d1c16ad
cleaning the folder
anastasiajourno Oct 31, 2022
78e9809
adding readme to explain the sourcing and the pipeline
anastasiajourno Oct 31, 2022
94127a4
cleaning cities allocations in a separate script
anastasiajourno Oct 31, 2022
daa1943
just some more file & naming cleaning
anastasiajourno Oct 31, 2022
8ec79cf
Adding data review comments and additional analysis
Nov 10, 2022
00496b1
Fix column names
Nov 10, 2022
1a9174d
Add cj/non-cj categorized data.
eads Nov 14, 2022
1 change: 0 additions & 1 deletion .gitignore
@@ -8,6 +8,5 @@ asset_manifest.json
.DS_Store
.credentials.json
secrets/
analysis/source_data
.Rhistory
analysis/output_data/CLEAN/Auburn.xls
6 changes: 6 additions & 0 deletions .idea/vcs.xml


5 changes: 4 additions & 1 deletion .idea/workspace.xml


8 changes: 3 additions & 5 deletions Makefile
@@ -29,10 +29,6 @@ analysis/output_data/group_by_category.csv: analysis/source_data/April-2022-Quar
@echo "Export grouped data for graphics"
$(PYENV) python analysis/group_by_category.py $< $@

analysis/output_data/output.csv: analysis/source_data/input.csv ## Run R analysis on the downloaded data, including saving output
@echo "Running R analysis"
Rscript analysis/analysis.R


##@ Source files
analysis/source_data/April-2022-Quarterly-and-Annual-Reporting-Data-through-March-31-2022.xlsx: ## Download April twenty-two ARPA Data
@@ -51,7 +47,9 @@ analysis/output_data/arpa_wtfs.json: ## Pull hand-curated WTF examples from Airt
-o $@

src/assets/data/arpa_wtfs.json: analysis/output_data/arpa_wtfs.json ## move wtf data from source folder to graphics data folder
cp -R $< $@
$(PYENV) python analysis/filter_by_wtf.py $< $@



##@ Upload/sync

9 changes: 7 additions & 2 deletions analysis/Untitled.ipynb
@@ -213,7 +213,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3.8.9 64-bit",
"language": "python",
"name": "python3"
},
@@ -227,7 +227,12 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
"version": "3.8.9"
},
"vscode": {
"interpreter": {
"hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
}
}
},
"nbformat": 4,
33 changes: 33 additions & 0 deletions analysis/analysis_cleveland/clean_cities_cuyahoga.py
@@ -0,0 +1,33 @@
import pandas as pd
#FILTERING and CLEANING df_cities for Ohio

def read_csv(source_filename):
    df_cities = pd.read_csv(source_filename)

    return df_cities

def clean_data(df_cities):
    ## filtering through to just Ohio cities
    df_cities = df_cities.loc[df_cities['State'] == "Ohio"]
    ## rename column names: City => recipient, Allocation => allocation
    df_cities = df_cities[['City', 'Allocation']].rename(columns = {"City":"recipient", "Allocation":"allocation"})
    ## indicate that the data comes from the Treasury Department
    df_cities["source"] = "treasury"
    ## clean up the allocations column (strip dollar signs, commas and trailing cents, then cast to int)
    df_cities['allocation'] = df_cities['allocation'].str.strip('$')
    df_cities['allocation'] = df_cities['allocation'].str.replace(',', '', regex=False)
    df_cities['allocation'] = df_cities['allocation'].str.replace('.00', '', regex=False)  # literal ".00", not a regex
    df_cities['allocation'] = df_cities['allocation'].astype(int)

    return df_cities

def export_data(df_cities, output_filename):
    df_cities.to_csv(output_filename, index = False)

if __name__ == "__main__":
    source_filename = "analysis/source_data/allocation/fiscalrecoveryfunds-metrocitiesfunding1-CSV.csv"
    output_filename = "analysis/source_data/allocation/cuyahoga_cities_allocation.csv"

    df_cities = read_csv(source_filename)
    df_cities_clean = clean_data(df_cities)
    export_data(df_cities_clean, output_filename)
14 changes: 14 additions & 0 deletions analysis/analysis_cleveland/clean_counties_cuyahoga.py
@@ -0,0 +1,14 @@
import pandas as pd

## Pulling out allocations for Cuyahoga County
df_counties = pd.read_csv("analysis/source_data/allocation/fiscalrecoveryfunds_countyfunding_2021.05.10-1a.csv", engine='python',encoding='latin1')
df_counties = df_counties.loc[df_counties['State'] == "Ohio"]
df_counties = df_counties.loc[df_counties['County'] == "Cuyahoga County"]
df_counties = df_counties[['County', 'Allocation']].rename(columns = {"County":"recipient", "Allocation":"allocation"})
df_counties["source"] = "treasury"
df_counties['allocation'] = df_counties['allocation'].str.strip('$')
df_counties['allocation'] = df_counties['allocation'].str.replace(',', '', regex=False)
df_counties['allocation'] = df_counties['allocation'].str.replace('.00', '', regex=False)  # literal ".00", not a regex
df_counties['allocation'] = df_counties['allocation'].astype(int)

df_counties.to_csv("analysis/source_data/allocation/cuyahoga_county_allocation.csv", index = False)
7 changes: 7 additions & 0 deletions analysis/analysis_cleveland/clean_indirect_multiplesheets.py
@@ -0,0 +1,7 @@
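## reads every sheet of the hand-compiled indirect-allocations workbook,
## stacks them into one table, and writes the result out as a single csv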
import pandas as pd

df = pd.concat(pd.read_excel('analysis/source_data/cleveland/arpa_cle_indirect.xlsx', sheet_name=None), ignore_index=True)

df['Amount'] = df['Amount'].astype(float)

df.to_csv("analysis/source_data/cleveland/cle_indirect_multiple.csv", index = False)
8 changes: 8 additions & 0 deletions analysis/analysis_cleveland/clean_indirect_one_sheet.py
@@ -0,0 +1,8 @@
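## cleans the single-sheet csv of indirect allocations:
## casts the Amount column to float and writes out a *_clean copy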
import pandas as pd

df = pd.read_csv("analysis/source_data/cleveland/arpa_cle_indirect.csv")

#df['Amount'] = df['Amount'].str.replace(',','')
df['Amount'] = df['Amount'].astype(float)

df.to_csv("analysis/source_data/cleveland/arpa_cle_indirect_clean.csv", index = False)
21 changes: 21 additions & 0 deletions analysis/analysis_cleveland/clean_neu_allocations_cuyahoga.py
@@ -0,0 +1,21 @@
## parses NEU allocations from the state's pdf file
## and exports them into a spreadsheet
import tabula
import pandas as pd

df = tabula.read_pdf("analysis/source_data/allocation/ARPA_Non-Entitlement_Allocations_Distributions_2022-05-13.pdf", pages=[5,6])[0]

tabula.convert_into("analysis/source_data/allocation/ARPA_Non-Entitlement_Allocations_Distributions_2022-05-13.pdf", "analysis/source_data/allocation/NEU_allocations_toclean.csv", output_format="csv", pages=[5,6])

df = pd.read_csv("analysis/source_data/allocation/NEU_allocations_toclean.csv")
df = df.loc[df['County'] == "Cuyahoga"]

df = df[['Entity Name', 'Total Allocation\r(4)']].rename(columns = {"Entity Name":"recipient", "Total Allocation\r(4)":"allocation"})

df["source"] = "ohio-gov"

df['allocation'] = df['allocation'].str.replace(r'^[^\d]*', '', regex=True)
df['allocation'] = df['allocation'].str.replace(',','')
df['allocation'] = df['allocation'].astype(int)

df.to_csv("analysis/source_data/allocation/NEU_allocations.csv", index = False)
44 changes: 44 additions & 0 deletions analysis/analysis_cleveland/combine_spendings_cle.py
@@ -0,0 +1,44 @@
import pandas as pd

#CLEANING THE DIRECT CJ SPENDING DATA
df_dir = pd.read_csv("analysis/source_data/cleveland/cuyahoga_arpa_projects_vetted.csv")

df_dir = df_dir[df_dir['Vet'].str.contains("cj")==True]

df_dir = df_dir[["Recipient Name", "Total Cumulative Obligations", "Project Description", \
"Focus" ]].rename(columns = {
"Recipient Name": "recipient",
"Project Description": "Project description",
"Total Cumulative Obligations": "Amount",
"Focus": "Focus" })

df_dir["source"] = "direct"

df_dir['Amount'] = df_dir['Amount'].fillna(0)

df_dir['Amount'] = df_dir['Amount'].astype(float)

#CLEANING THE INDIRECT CJ SPENDING DATA

df_indir = pd.read_csv("analysis/source_data/cleveland/cle_indirect_multiple.csv")
df_indir = df_indir[["recipient", "Amount", "Project description", "source", "Focus"]]

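## normalize the source labels: keep "direct" and "indirect_county" as-is,
## treat everything else as "indirect_state"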
def indirect_county(row):
    if row["source"] == "direct":
        source = "direct"
    elif row["source"] == "indirect_county":
        source = "indirect_county"
    else:
        source = "indirect_state"

    return source

df_indir["source"] = df_indir.apply(indirect_county, axis=1)

df_indir['Amount'] = df_indir['Amount'].astype(float)

#CONCAT two files that have the same structure

df_all = pd.concat([df_dir, df_indir])

df_all.to_csv("analysis/output_data/cle_cj_all_spending.csv", index=False)
26 changes: 26 additions & 0 deletions analysis/analysis_cleveland/compile_allocations_cuyahoga.py
@@ -0,0 +1,26 @@
## combines all three allocation files (city, county, and NEU (small cities)),
## filters that combined list against a hand-compiled list of all cities/villages in Cuyahoga County (compiled by Rachel),
## and exports this data for further analysis

import pandas as pd

#JOINING with df of villages and small cities in Cuyahoga
df_neu = pd.read_csv("analysis/source_data/allocation/NEU_allocations.csv")
df_cities = pd.read_csv("analysis/source_data/allocation/cuyahoga_cities_allocation.csv")
df_all = pd.concat([df_cities,df_neu])

#JOINING with df of counties for Cuyahoga County
df_county = pd.read_csv("analysis/source_data/allocation/cuyahoga_county_allocation.csv")
df_all = pd.concat([df_all,df_county])

#MAKING a list for recipients in Cuyahoga county
places = pd.read_csv("analysis/source_data/cleveland/cle_arpa.csv")
places['recipient'] = places['cle_arpa'].str.split(",").str[0]
places_list = list(places["recipient"])

#FILTERING the data on all allocations for the list of places
df_all = df_all[df_all['recipient'].isin(places_list)]

#JOINING with the places in Cleveland
df_allocations = places.merge(df_all, on = "recipient", how = "left")
df_allocations.to_csv("analysis/output_data/allocations_cuyahoga.csv", index = False)
17 changes: 17 additions & 0 deletions analysis/analysis_cleveland/filter_cuyahoga.py
@@ -0,0 +1,17 @@
## reads in the data we processed earlier that extracts CJ-related projects by keyword,
## filters to Ohio,
## then matches the recipient name against our hand-compiled names for Cuyahoga County cities/towns
## for more vetting in this Google Sheet:
## https://docs.google.com/spreadsheets/d/10A3xUicxV0yoEtRXz85z_nqDwzy_TbMqh_XD9qLNTok/edit#gid=912893235
import pandas as pd

df = pd.read_csv("analysis/output_data/q1_cj_related_projects_to_vet.csv")

df1 = pd.read_csv("analysis/source_data/cle_arpa.csv")
cle = list(df1["cle_arpa"])

df = df.loc[df["State/Territory"] == "Ohio"]
df = df[df['Recipient Name'].isin(cle)]

df.to_csv("analysis/output_data/cuyahoga_arpa_projects_to_vet.csv", index=False)
66 changes: 66 additions & 0 deletions analysis/analysis_cleveland/readme.md
@@ -0,0 +1,66 @@
Data processing

TKTK

Data Sources

1. Allocations

Allocations come from several sources:

Allocation for Metropolitan Cities (CSV) and Allocation for Counties (CSV)
can be found on the Treasury website:
https://home.treasury.gov/policy-issues/coronavirus/assistance-for-state-local-and-tribal-governments/state-and-local-fiscal-recovery-funds
These get cleaned in clean_counties_cuyahoga.py
and clean_cities_cuyahoga.py.

Allocations for smaller places are from this pdf:
https://grants.ohio.gov/Documents/Funding_Opportunities/ARPA/ARPA_Non-Entitlement_Allocations_Distributions_2022-05-13.pdf
This gets cleaned in clean_neu_allocations_cuyahoga.py.

These three allocations get combined in compile_allocations_cuyahoga.py into analysis/output_data/allocations_cuyahoga.csv


2. Criminal justice related spending

Direct spending comes from the Treasury and is processed by our code that flags criminal justice-related spending in project-level-analysis.py.
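project-level-analysis.py itself is not part of this change; as a rough, illustrative sketch of what that keyword flagging looks like (the keyword list below is hypothetical, the real one lives in that script):

    import pandas as pd

    # illustrative keywords -- the real list lives in project-level-analysis.py
    keywords = ["police", "violence", "jail", "court"]

    df = pd.read_excel("analysis/source_data/April-2022-Quarterly-and-Annual-Reporting-Data-through-March-31-2022.xlsx")
    matches = df[df["Project Description"].str.contains("|".join(keywords), case=False, na=False)]
    matches.to_csv("analysis/output_data/q1_cj_related_projects_to_vet.csv", index=False)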

In filter_cuyahoga.py, we filter this data to recipients in the Greater Cleveland area (Cuyahoga County, Cleveland, and smaller places). The list of places comes from here:

The filtered data gets exported as output_data/cuyahoga_arpa_projects_to_vet.csv.

It goes into the Google Sheet where it gets manually vetted (Vet column) and categorized (Focus column).

This data lives here:
https://docs.google.com/spreadsheets/d/10A3xUicxV0yoEtRXz85z_nqDwzy_TbMqh_XD9qLNTok/edit#gid=912893235

It gets exported as a csv from Google Sheets and goes back into the repo as
source_data/cleveland/cuyahoga_arpa_projects_vetted.csv.
It also gets loaded into Observable for some of the calculations.

3. Indirect allocations from the state

They were hand-entered by Rachel Disell here:
https://docs.google.com/spreadsheets/d/1VaZU-rJdtvhGJMLtLYfqqiAcBRrswdM_N1iCRubEXF8/edit#gid=125400899

Tabs: Ohio_CC_courts and Ohio_CC_Violent crime reduction.

Ana copied them over to a clean table here:
https://docs.google.com/spreadsheets/d/1nq0AwnOEi61XXPcbFUaI_aHfBl4lHWKphSqp3f9B0Jc/edit#gid=1493443383

Then it was exported as an xlsx file, cleaned in clean_indirect_multiplesheets.py, and saved as cle_indirect_multiple.csv.

Finally, direct and indirect criminal justice-related spending are combined in combine_spendings_cle.py and exported as cle_cj_all_spending.csv.

This goes to Observable.








66 changes: 66 additions & 0 deletions analysis/analysis_independence.py
@@ -0,0 +1,66 @@
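## reads the annual government payroll spreadsheets in analysis/source_data/payroll/,
## adds average full-time pay per row (the *12 step assumes the payroll column is a monthly figure),
## and exports everything as one file: analysis/output_data/payroll_data.csv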
import pandas as pd
import glob

def get_year(filename):
    year = filename.split("payroll/")[1]
    year = year.split("emp")[0]
    year = int("20"+year)
    return year

def test_get_year():
    assert get_year("analysis/source_data/payroll/17emp.xlsx") == 2017

#READ THE FILES
def read_files(datasets):
    dfs = []
    for i, data in enumerate(datasets):
        df = pd.read_excel(data)
        #GET YEAR FROM THE FILENAME
        df['year'] = get_year(data)
        dfs.append(df)
    return dfs

#ADD AVERAGE PAY
def add_average_pay(dfs):
    for df in dfs:
        df["average_fulltime_pay"] = df["Full-time Payroll"]/df["Full-time Employees"]
        df["average_annual_pay"] = df["average_fulltime_pay"]*12
    return dfs

def test_add_pay():
    dfs = [pd.read_excel("analysis/source_data/payroll/21emp.xlsx")]
    dfs = add_average_pay(dfs)
    print(round(dfs[0]['average_fulltime_pay'].iloc[0]))
    assert round(dfs[0]['average_fulltime_pay'].iloc[0]) == 5220

def main():
    print("reading in files")
    datasets = glob.glob("analysis/source_data/payroll/*.xlsx")
    dfs = read_files(datasets)
    dfs = add_average_pay(dfs)

    #MERGE DATAFRAMES
    print("merging")
    df_all = pd.concat(dfs)

    #SLICE THE COLUMNS
    print("slicing")
    df_all = df_all[["State", \
        "Name of Government", "Population/Enrollment/Function Code", \
        "Government Function", "Full-time Employees", \
        "Full-time Payroll", "average_annual_pay", "year"]]\
        .rename(columns = {
            "Name of Government": "Govt_Name",
            "Population/Enrollment/Function Code": "pop",
            "Government Function": "function",
            "Full-time Employees": "FTE",
            "Full-time Payroll": "FTE_Pay",
            "average_annual_pay": "AVG_Pay"})

    #EXPORT AS 1 FILE
    print("saving")
    df_all.to_csv("analysis/output_data/payroll_data.csv", index=False)

if __name__ == "__main__":
    main()