-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstreamlit_app.py
112 lines (85 loc) · 4.25 KB
/
streamlit_app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
"""
# 💡 Bulk correlations
Quickly uncover potential relationships in a CSV dataset by getting an overview of correlation coefficients between several pairs of metrics.
See the code and how to interpret the charts [on the Github repo](https://github.com/searchgame/bulk-correlations). Made by [Victor Gras](https://victorgras.com).
---
## Usage:
1) Upload a CSV file from the left sidebar.
- Except for the first column, all columns must contain numeric values.
- Make sure that your CSV file is using comma as separator (not semicolon).
- Make sure that the column names of your CSV file don't contain special characters such as parentheses or quotes (white spaces, hyphens and underscores are okay).
The format expected is:
```
item,metric_1,metric_2,metric_3,etc
```
For example, for a GA4 export:
```
Address,Word Count,GA4 Sessions,GA4 Views,GA4 Engaged sessions,GA4 Bounce rate,Performance Score,First Contentful Paint Time in ms,Speed Index Time in ms,Largest Contentful Paint Time in ms,Time to Interactive in ms,Total Blocking Time in ms,Cumulative Layout Shift score,Image Count
https://www.example.com/article-1/,692,13,13,7,0.3,60,2294,3864.021,6255,6716,537,0.013,5
https://www.example.com/article-2/,114,5,5,1,0,61,2264,3316.94,8505,6850.5,418.5,0.059,6
https://www.example.com/article-3/,1780,20,21,11,0.27,52,2704,3881.909,9030,7943,712,0.001,8
...
```
2) The correlation matrix automatically appears below under "Results:"
3) Once the correlation matrix is displayed, a dropdown list containing all the metrics pairs will appear. The list is sorted from most correlating (positively or negatively) to least correlating. When selecting a metrics pair, a scatter plot will appear below showing the actual distribution of the data points.
---
"""
def _load_metrics(uploaded_file):
    """Read the uploaded CSV and return its metric columns as numeric data.

    The file is read without a header row so that the first row can be
    promoted to column names verbatim. The first column (the item
    identifier) is then dropped and every remaining cell is converted to a
    number; cells that cannot be parsed become NaN.
    """
    raw = pd.read_csv(uploaded_file, header=None)
    raw.columns = raw.iloc[0].values
    metrics = raw.iloc[1:, 1:]
    return metrics.apply(pd.to_numeric, errors='coerce')


def _strength_key(entry):
    """Sort key: largest absolute coefficient first, NaN coefficients last."""
    coefficient = entry[2]
    if np.isnan(coefficient):
        return (1, 0.0)
    return (0, -abs(coefficient))


def _rank_metric_pairs(corr_matrix):
    """Return ``(i, j, coefficient)`` for every column pair with ``i < j``.

    ``i`` and ``j`` are positional column indices into ``corr_matrix``. The
    result is ordered from the strongest correlation (positive or negative)
    to the weakest; pairs whose coefficient is NaN are placed at the end so
    they cannot disturb the ordering of the defined ones.
    """
    n_metrics = corr_matrix.shape[0]
    pairs = [
        (i, j, corr_matrix.iloc[i, j])
        for i in range(n_metrics)
        for j in range(i + 1, n_metrics)
    ]
    return sorted(pairs, key=_strength_key)


# Set a title
st.title('Results:')

# Set a sidebar for user input
st.sidebar.title("Settings")

# Upload the CSV file
file_upload = st.sidebar.file_uploader("Upload CSV", type=['csv'])

# Check if a file is uploaded
if file_upload is not None:
    # Load the CSV file; report unreadable files instead of showing a traceback
    try:
        data = _load_metrics(file_upload)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        st.error(f"Could not read the CSV file: {exc}")
        st.stop()

    # A correlation needs at least two metrics and two observations
    if data.shape[1] < 2 or data.shape[0] < 2:
        st.warning(
            "At least two metric columns and two data rows are required "
            "to compute correlations."
        )
        st.stop()

    # Compute correlation matrix
    corr = data.corr()

    # Generate a mask for the upper triangle (diagonal included)
    mask = np.triu(np.ones_like(corr, dtype=bool))

    # Draw the heatmap with the mask. The title is set on the axes object
    # rather than through pyplot's global "current axes" state.
    heatmap_fig, heatmap_ax = plt.subplots(figsize=(11, 9))
    sns.heatmap(corr, mask=mask, annot=True, fmt=".2f", cmap='coolwarm', vmin=-1, vmax=1, ax=heatmap_ax)
    heatmap_ax.set_title('Correlation of your data')

    # Show the plot in Streamlit, then release the figure so that script
    # reruns do not accumulate open figures.
    st.pyplot(heatmap_fig)
    plt.close(heatmap_fig)

    # Rank every metric pair by the absolute value of its correlation
    sorted_pairs = _rank_metric_pairs(corr)

    # Create the list of formatted strings shown in the dropdown
    dropdown_options = [
        f"{data.columns[i]} vs {data.columns[j]} (corr: {coefficient:.2f})"
        for i, j, coefficient in sorted_pairs
    ]

    # The dropdown selects a position in sorted_pairs and only *displays* the
    # label, so the chosen pair never has to be parsed back out of the text
    # (which would break on column names containing " vs " or " (").
    selected_index = st.selectbox(
        "Select a metric pair",
        list(range(len(dropdown_options))),
        format_func=lambda position: dropdown_options[position],
    )
    first, second, _ = sorted_pairs[selected_index]
    x_name = data.columns[first]
    y_name = data.columns[second]

    # Keep only rows where both metrics are present; columns are picked by
    # position so duplicate column names still resolve to a single column.
    complete = data.iloc[:, [first, second]].dropna()

    if len(complete) < 2:
        st.warning("Not enough numeric data points to plot this metric pair.")
    else:
        # Create a scatter plot with regression line for the selected metric pair
        scatter_fig, scatter_ax = plt.subplots(figsize=(10, 8))
        sns.regplot(
            x=complete.iloc[:, 0],
            y=complete.iloc[:, 1],
            ax=scatter_ax,
            scatter_kws={'alpha': 0.3, 's': 10},
            color='red',
            line_kws={'linewidth': 0.5},
            ci=None,
        )
        scatter_ax.set_xlabel(x_name)
        scatter_ax.set_ylabel(y_name)
        scatter_ax.set_title(f"Scatter plot with regression line: {x_name} vs {y_name}")
        st.pyplot(scatter_fig)
        plt.close(scatter_fig)
else:
    st.sidebar.text('Please upload a CSV file.')