Skip to content

Commit

Permalink
Merge pull request #77 from MET-OM/dev_ex
Browse files Browse the repository at this point in the history
Add some methods for regression lines in plot_scatter()
  • Loading branch information
KonstantinChri authored Dec 20, 2024
2 parents cd9b6a7 + adaaac7 commit 884278c
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 9 deletions.
Binary file modified docs/files/scatter_plot.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ Scatter Plot
var1_units='m/s',
var2_units='m/s',
title='Scatter',
regression_line=True,
regression_line='effective-variance',
qqplot=False,
density=True,
output_file='scatter_plot.png')
Expand Down
53 changes: 45 additions & 8 deletions metocean_stats/plots/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from matplotlib.dates import MonthLocator, DateFormatter
import calendar
from math import floor,ceil
Expand Down Expand Up @@ -109,8 +110,26 @@ def plot_pdf_all(data, var, bins=70, output_file='pdf_all.png'): #pdf_all(data,
return fig


def plot_scatter(df,var1,var2,var1_units='m', var2_units='m',title=' ',regression_line=True,qqplot=True,density=True,output_file='scatter_plot.png'):
import scipy.stats as stats
def plot_scatter(df,var1,var2,var1_units='m', var2_units='m',title=' ',regression_line='effective-variance',qqplot=True,density=True,output_file='scatter_plot.png'):
"""
Plots a scatter plot with optional density, regression line, and QQ plot.
Calculates and displays statistical metrics on the plot.
Parameters:
df (DataFrame): Pandas DataFrame containing the data.
var1 (str): Column name for the x-axis variable.
var2 (str): Column name for the y-axis variable.
var1_units (str): Units for the x-axis variable. Default is 'm'.
var2_units (str): Units for the y-axis variable. Default is 'm'.
title (str): Title of the plot. Default is an empty string.
regression_line (str or bool): Type of regression line ('least-squares', 'mean-slope', 'effective-variance',None). Default is effective-variance.
qqplot (bool): Whether to include QQ plot. Default is True.
density (bool): Whether to include density plot. Default is True.
output_file (str): Filename for saving the plot. Default is 'scatter_plot.png'.
Returns:
fig: The matplotlib figure object.
"""

x=df[var1].values
y=df[var2].values
Expand All @@ -135,12 +154,31 @@ def plot_scatter(df,var1,var2,var1_units='m', var2_units='m',title=' ',regressio
qn_y = np.nanpercentile(y, percs)
ax.scatter(qn_x,qn_y,marker='.',s=80,c='b')

if regression_line:
m,b,r,p,se1=stats.linregress(x,y)
cm0="$"+('y=%2.2fx+%2.2f'%(m,b))+"$";
plt.plot(x, m*x + b, 'k--', label=cm0)
if regression_line == 'least-squares':
slope=stats.linregress(x,y).slope
intercept=stats.linregress(x,y).intercept
elif regression_line == 'mean-slope':
slope = np.mean(y)/np.mean(x)
intercept = 0
elif regression_line == 'effective-variance':
slope = linfitef(x,y)[0]
intercept = linfitef(x,y)[1]
else:
slope = None
intercept = None

if slope is not None and intercept is not None:
if intercept > 0:
cm0 = f"$y = {slope:.2f}x + {intercept:.2f}$"
elif intercept < 0:
cm0 = f"$y = {slope:.2f}x - {abs(intercept):.2f}$"
else:
cm0 = f"$y = {slope:.2f}x$"

plt.plot(x, slope * x + intercept, 'k--', label=cm0)
plt.legend(loc='best')



rmse = np.sqrt(((y - x) ** 2).mean())
bias = np.mean(y-x)
mae = np.mean(np.abs(y-x))
Expand All @@ -158,7 +196,6 @@ def plot_scatter(df,var1,var2,var1_units='m', var2_units='m',title=' ',regressio
plt.title("$"+(title +', N=%1.0f'%(np.count_nonzero(~np.isnan(x))))+"$",fontsize=15)
plt.grid()
plt.savefig(output_file)

return fig

def _percentile_str_to_pd_format(percentiles):
Expand Down
55 changes: 55 additions & 0 deletions metocean_stats/stats/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -590,3 +590,58 @@ def nb_hours_below_threshold(df,var,thr_arr):
del i,j
del years,years_unique,delta_t
return nbhr_arr

def linfitef(x, y, stdx: float=1.0, stdy: float=1.0) -> tuple[float, float]:
"""
Perform a linear fit considering uncertainties in both variables.
Parameters:
x (array-like): Independent variable data points.
y (array-like): Dependent variable data points.
stdx (float): Standard deviation of the error of x.
stdy (float): Standard deviation of the error of y.
Default: stdx=stdy=1.0
NB! Only relative values matter, i.e. stdx=1, stdy=2 gives same results as stdx=2, stdy=4.
NB! For stdx=0, the method is identical to a normal least squares fit.
Returns:
tuple: Coefficients of the linear fit (slope, intercept).
References:
- Orear, J (1982). Least squares when both variables have uncertainties, J. Am Phys 50(10).
"""
# Convert inputs to numpy arrays
x = np.asarray(x)
y = np.asarray(y)

# Validate input lengths
if x.shape != y.shape:
raise ValueError("Input arrays must have the same shape.")

# Calculate means of x and y
x0 = np.mean(x)
y0 = np.mean(y)

# Calculate variances
sx2 = stdx**2
sy2 = stdy**2

# Calculate sum of squares
Sx2 = np.sum((x - x0)**2)
Sy2 = np.sum((y - y0)**2)
Sxy = np.sum((x - x0) * (y - y0))

# Check for special cases where uncertainties or covariance might be zero
if sx2 == 0 or Sxy == 0:
slope = Sxy / Sx2
else:
term = (sy2 * Sx2)**2 - 2 * Sx2 * sy2 * sx2 * Sy2 + (sx2 * Sy2)**2 + 4 * Sxy**2 * sx2 * sy2
slope = (sx2 * Sy2 - sy2 * Sx2 + np.sqrt(term)) / (2 * Sxy * sx2)

# Calculate intercept
intercept = y0 - slope * x0

return slope, intercept

0 comments on commit 884278c

Please sign in to comment.