import math
import numpy as np
import matplotlib.pyplot as plt
import warnings
from typing import Union, Tuple, List, Callable, Optional
from numpy.typing import ArrayLike
from .._helper import DEFAULT_FIGSIZE, apply_inward_ticks
[docs]
def hist(data: ArrayLike, data_err: ArrayLike, scale: int = 0,
bins: Union[str, int] = "auto", label: str = "",
unit: str = "", verbose: bool = True, figsize: Tuple[float, float] = DEFAULT_FIGSIZE) -> Tuple[float, float, float, float, float]:
"""
Plots the histogram of a dataset and assess its Gaussianity using statistical indicators and a Shapiro-Wilk test.
This function visualizes the empirical distribution of a dataset `data`, optionally accounting for individual measurement
uncertainties `data_err`. It overlays a Gaussian curve parameterized by the estimated mean and standard deviation, and
performs a normality test, reporting skewness, kurtosis, and the p-value from the Shapiro-Wilk test.
Parameters
----------
data : array-like
Numerical data representing the variable of interest.
data_err : scalar or array-like or None
Array of uncertainties associated with each element of `data`. If `None`, uncertainties are not included in the
computation of the effective standard deviation.
scale : int, optional
Scaling exponent for `data` and `data_err` (default is `0`). For example, `scale = -2` rescales the inputs by 1e2.
bins : int or str, optional
Number of bins or binning strategy passed to `matplotlib.pyplot.hist`. Defaults to `"auto"`.
label : str, optional
Label for the x-axis, typically the name of the variable.
unit : str, optional
Unit of measurement for the x-axis variable (e.g., "cm"). If provided, it will be displayed in the axis label and summary output.
verbose : bool, optional
If `True`, prints a formatted table of the histogram parameters. Defaults to `True`.
figsize : tuple of float, optional
Figure size passed to `matplotlib`. Default is `(6.4 * 1.2, 4.8 * 1.2)`.
Returns
-------
mean : float
Arithmetic mean of the scaled data.
sigma : float
Effective standard deviation of the distribution, accounting for both the empirical spread and uncertainties (if provided).
skewness : float
Skewness of the distribution.
kurtosis : float
Kurtosis of the distribution.
p_value : float
p-value from the Shapiro-Wilk test for normality.
Notes
-----
- The effective standard deviation is computed as `np.sqrt(data.std()**2 + np.sum(data_err**2)/len(data_err))` if `data_err` is provided.
- The function rescales both `data` and `data_err` by `10**scale` for display purposes, but all statistics are computed on the scaled data.
- The normal distribution is refferd to as `N(mu, sigma**2)`.
Example
-------
>>> import numpy as np
>>> from labtoolbox.stats import hist
>>> x = np.random.normal(loc=5, scale=0.2, size=100)
>>> x_err = np.full_like(x, 0.05)
>>> hist(x, x_err, scale=-2, label="Length", unit="cm")
"""
from scipy.stats import norm, shapiro
data = np.asarray(data)
if (not (np.issubdtype(data.dtype, np.floating) or np.issubdtype(data.dtype, np.integer))) or not np.all(np.isreal(data)):
raise TypeError("'data' must contain only real numbers (int or float).")
if not np.all(np.isfinite(data)):
raise ValueError("'data' contains non-finite values (NaN or inf).")
if not isinstance(scale, (int, float)):
raise TypeError("'scale' must be a real number (int or float).")
if not isinstance(bins, (int, float, str)):
raise TypeError("'bins' must be a real number (int or float) or a string ('auto').")
if isinstance(bins, str) and bins != "auto":
raise ValueError("'bins' must be 'auto'.")
if not isinstance(label, str):
raise TypeError("'label' must be a string.")
if not isinstance(unit, str):
raise TypeError("'unit' must be a string.")
if not isinstance(figsize, (list, tuple)) or len(figsize) != 2 or not all(isinstance(u, (int, float)) and np.isfinite(u) for u in figsize):
raise TypeError("'figsize' must be a list or tuple of exactly two finite real numbers.")
data = data / 10**scale
if data_err is not None:
if np.isscalar(data_err):
if not isinstance(data_err, (int, float)):
raise TypeError("'data_err' must be a real number (int or float).")
data_err = np.repeat(data_err, len(data))
else:
if (not (np.issubdtype(data_err.dtype, np.floating) or np.issubdtype(data_err.dtype, np.integer))) or not np.all(np.isreal(data_err)):
raise TypeError("'data_err' must contain only real numbers (int or float).")
if not np.all(np.isfinite(data)):
raise ValueError("'data_err' contains non-finite values (NaN or inf).")
data_err = data_err / 10**scale
sigma = np.sqrt(data.std()**2 + np.sum(data_err**2)/len(data))
else:
sigma = data.std()
mean = data.mean()
# Calcola l'esponente di sigma
exponent = int(math.floor(math.log10(abs(sigma))))
factor = 10**(exponent - 1)
rounded_sigma = (round(sigma / factor) * factor)
# Arrotonda la media
rounded_mean = round(mean, -exponent + 1)
# Converte in stringa mantenendo zeri finali
fmt = f".{-exponent + 1}f" if exponent < 1 else "f"
# ----------------------------
# Calcola l'esponente di var
exponent1 = int(math.floor(math.log10(abs(sigma**2))))
factor = 10**(exponent1 - 1)
rounded_var = (round(sigma**2 / factor) * factor)
# Arrotonda la media
rounded_mean1 = round(mean, -exponent1 + 1)
# Converte in stringa mantenendo zeri finali
fmt1 = f".{-exponent1 + 1}f" if exponent1 < 1 else "f"
# ----------------------------
# Prepara l'unità di misura, se presente
ux_str = f" [$\\mathrm{{{unit}}}$]" if unit else ""
label1 = f"$\\mathcal{{N}}({rounded_mean1:.{max(0, -exponent1 + 1)}f}, {rounded_var:.{max(0, -exponent1 + 1)}f})$"
label2 = label+ux_str
plt.figure(figsize=figsize)
# histogram of the data
_, bin_edges, _ = plt.hist(data,bins=bins,color="blue",edgecolor='blue', histtype = "step", zorder=2, label='Data distribution')
plt.ylabel('Counts')
lnspc = np.linspace(data.min() - 2 * sigma, data.max() + 2 * sigma, 500)
bin_widths = np.diff(bin_edges)
mean_bin_width = np.mean(bin_widths)
f_gauss = data.size * mean_bin_width * norm.pdf(lnspc, mean, sigma)
plt.plot(lnspc, f_gauss, linewidth=1, color='r',linestyle='--', label = label1, zorder=1)
plt.xlabel(label2)
plt.xlim(mean - 3 * sigma, mean + 3 * sigma)
plt.legend(frameon = False)
apply_inward_ticks(plt.gca())
plt.show()
skewness = np.sum((data - mean)**3) / (len(data) * sigma**3)
kurtosis = np.sum((data - mean)**4) / (len(data) * sigma**4) - 3
_, p_value = shapiro(data)
if 0.10 <= p_value <= 1:
pval_str = f"{p_value*100:.0f}%"
elif 0.005 < p_value < 0.10:
pval_str = f"{p_value * 100:.2f}%"
elif 0.0005 < p_value <= 0.005:
pval_str = f"{p_value * 100:.3f}%"
else:
pval_str = f"< 0.05%"
if verbose:
# Prepara l'unità di misura, se presente
ux_str = f" {unit}" if unit else ""
# Determina la larghezza massima delle etichette e dei valori
entries = [
("Mean value", f"{rounded_mean:.{max(0, -exponent + 1)}f}{ux_str}"),
("Std dev", f"{rounded_sigma:.{max(0, -exponent + 1)}f}{ux_str}"),
("Skewness", f"{skewness:.2f}"),
("Kurtosis", f"{kurtosis:.2f}"),
("p-value", pval_str)
]
label_width = max(len(label) for label, _ in entries) + 2
value_width = max(len(str(value)) for _, value in entries) + 2
# Costruisci il separatore dinamico
title = " Normality Analysis "
total_width = label_width + value_width + 3
separator = "=" * total_width
title_line = title.center(total_width, "*")
# Crea il blocco di testo
stamp = f"{separator}\n{title_line}\n{separator}\n"
for label, value in entries:
stamp += f"{label.ljust(label_width)}: {value.rjust(value_width)}\n"
stamp += f"{separator}\n"
# Aggiungi il risultato finale
if p_value >= 0.05:
result = "The data are consistent with a normal distribution."
else:
result = "The data deviate significantly from a normal distribution."
stamp += f"\n{result}\n"
# Stampa
print(stamp)
return mean, sigma, skewness, kurtosis, p_value
[docs]
def residuals(data: ArrayLike, expected_data: ArrayLike, data_err: ArrayLike, scale: int = 0,
unit: str = "", bins: Union[str, int] = "auto", confidence: int = 2,
norm: bool = False, verbose: bool = True, figsize: Tuple[float, float] = DEFAULT_FIGSIZE) -> Tuple[float, float, float, float, float, float]:
"""
Analyzes and visualizes the residuals of the quantity of interes, including histogram, Gaussianity test, and autocorrelation test (Durbin-Watson statistic).
Parameters
----------
data : array-like
Measured data points.
expected_data : array-like
Expected values to compare with `data` (e.g., from a model, theoretical prediction, or fit).
data_err : array-like
Uncertainties associated with each data point in `data`.
scale : int, optional
Scaling exponent applied to all quantities (e.g. `scale = -2` scales meters to centimeters). Default is `0`.
unit : str, optional
Unit of measurement of the data (e.g., `"cm"` or `"s"`). Used for labeling axes. Default is an empty string.
bins : int or str, optional
Number of bins or binning strategy passed to `matplotlib.pyplot.hist`. Default is `"auto"`.
confidence : float, optional
Confidence factor for visualizing bounds (e.g., `confidence = 2` draws ±2σ bounds). Default is `2`.
norm : bool, optionale
If `True`, residuals will be normalized. Default is `False`.
verbose : bool, optional
If `True`, prints a formatted table of the returns (mean value, standard deviation, skewness, kurtosis, p-value and Durbin–Watson statistic). Default is `True`.
figsize : tuple of float, optional
Figure size passed to `matplotlib`. Default is `(6.4 * 1.2, 4.8 * 1.2)`.
Returns
-------
mean : float
Mean value of the residuals, after applying the specified scale.
sigma : float
Estimated standard deviation of the residuals, weighted by `data_err`.
skewness : float
Skewness (third standardized moment) of the residual distribution.
kurtosis : float
Excess kurtosis (fourth standardized moment minus 3) of the residual distribution.
p_value : float
p-value from the Shapiro–Wilk normality test.
dw : float
Durbin–Watson statistic for testing autocorrelation in the residuals.
Notes
-----
- The residuals are computed as `resid = data - expected_data`, and scaled by `10**scale`.
- The standard deviation is computed as`np.sqrt(resid.std()**2 + np.sum(data_err**2)/len(data_err))`.
- The normal distribution is refferd to as `N(mu, sigma**2)`.
- The Shapiro–Wilk test is used to test for normality of the residuals:
- If `p_value >= 0.05`, residuals are considered consistent with a normal distribution.
- The Durbin–Watson statistic is used to detect first-order autocorrelation:
- Values ≈ 2 suggest no autocorrelation.
- Values < 1.5 suggest positive autocorrelation.
- Values > 2.5 suggest negative autocorrelation.
"""
from matplotlib.ticker import MaxNLocator
from scipy.stats import norm, shapiro
try:
from statsmodels.stats.stattools import durbin_watson
except ImportError:
raise ImportError(
"The 'statsmodels' package is not installed. "
"Please install it by running 'pip install statsmodels'."
)
data = np.asarray(data)
expected_data = np.asarray(expected_data)
data_err = np.asarray(data_err)
if (not (np.issubdtype(data.dtype, np.floating) or np.issubdtype(data.dtype, np.integer))) or not np.all(np.isreal(data)):
raise TypeError("'data' must contain only real numbers (int or float).")
if not np.all(np.isfinite(data)):
raise ValueError("'data' contains non-finite values (NaN or inf).")
if (not (np.issubdtype(data_err.dtype, np.floating) or np.issubdtype(data_err.dtype, np.integer))) or not np.all(np.isreal(data_err)):
raise TypeError("'data_err' must contain only real numbers (int or float).")
if not np.all(np.isfinite(data_err)):
raise ValueError("'data_err' contains non-finite values (NaN or inf).")
if (not (np.issubdtype(expected_data.dtype, np.floating) or np.issubdtype(expected_data.dtype, np.integer))) or not np.all(np.isreal(expected_data)):
raise TypeError("'expected_data' must contain only real numbers (int or float).")
if not np.all(np.isfinite(expected_data)):
raise ValueError("'expected_data' contains non-finite values (NaN or inf).")
if not isinstance(scale, (int, float)):
raise TypeError("'scale' must be a real number (int or float).")
if not isinstance(confidence, (int, float)):
raise TypeError("'confidence' must be a real number (int or float).")
if not isinstance(bins, (int, float, str)):
raise TypeError("'bins' must be a real number (int or float) or a string ('auto').")
if isinstance(bins, str) and bins != "auto":
raise ValueError("'bins' must be 'auto'.")
if not isinstance(unit, str):
raise TypeError("'unit' must be a string.")
if not isinstance(figsize, (list, tuple)) or len(figsize) != 2 or not all(isinstance(u, (int, float)) and np.isfinite(u) for u in figsize):
raise TypeError("'figsize' must be a list or tuple of exactly two finite real numbers.")
if not (len(data) == len(expected_data) == len(data_err)):
raise ValueError("'data', 'expected_data' and 'data_err' must have the same length.")
x_data = np.linspace(1, len(data), len(data))
data = data / 10**scale
expected_data = expected_data / 10**scale
data_err = data_err / 10**scale
resid = data - expected_data
if norm == True:
resid = resid / data_err
data_err /= data_err
mean = resid.mean()
sigma = np.sqrt(resid.std()**2 + np.sum(data_err**2)/len(data_err))
# Calcola l'esponente di sigma
exponent = int(math.floor(math.log10(abs(sigma))))
factor = 10**(exponent - 1)
rounded_sigma = (round(sigma / factor) * factor)
# Arrotonda la media
rounded_mean = round(mean, -exponent + 1)
# Converte in stringa mantenendo zeri finali
fmt = f".{-exponent + 1}f" if exponent < 1 else "f"
# ----------------------------
# Calcola l'esponente di sigma
exponent1 = int(math.floor(math.log10(abs(sigma**2))))
factor = 10**(exponent1 - 1)
rounded_var = (round(sigma**2 / factor) * factor)
# Arrotonda la media
rounded_mean1 = round(mean, -exponent1 + 1)
# Converte in stringa mantenendo zeri finali
fmt1 = f".{-exponent1 + 1}f" if exponent1 < 1 else "f"
# ----------------------------
if norm == True:
bar1 = np.repeat(1, len(x_data))
bar2 = resid / data_err
dash = np.repeat(confidence, len(x_data))
else:
bar1 = data_err
bar2 = resid
dash = confidence * data_err
# The following code is adapted from the VoigtFit library,
# originally developed by Jens-Kristian Krogager under the MIT License.
# https://github.com/jkrogager/VoigtFit
fig = plt.figure(figsize=figsize)
gs = fig.add_gridspec(2, hspace=0, height_ratios=[0.1, 0.9])
axs = gs.subplots()
# Aggiungi linee di riferimento
axs[0].axhline(0., ls='--', color='0.7', lw=0.8)
axs[0].errorbar(x_data, bar2, bar1, ls='', color='gray', lw=1.)
axs[0].plot(x_data, bar2, color='k', drawstyle='steps-mid', lw=1.15)
axs[0].plot(x_data, dash, ls='dotted', color='crimson', lw=1.6)
axs[0].plot(x_data, -dash, ls='dotted', color='crimson', lw=1.6)
axs[0].set_ylim(-np.nanmean(3 * dash / 2), np.nanmean(3 * dash / 2))
# Configurazioni estetiche per il pannello dei residui
axs[0].tick_params(labelbottom=False)
axs[0].set_yticklabels('')
axs[0].set_xlim(x_data.min(), x_data.max())
if norm == False:
# Prepara l'unità di misura, se presente
uy_str = f" [$\\mathrm{{{unit}}}$]" if unit else ""
label1 = f"Residuals value{uy_str}"
else:
label1 = "Normalized residuals value"
label = f"$\\mathcal{{N}}({rounded_mean1:.{max(0, -exponent1 + 1)}f}, {rounded_var:.{max(0, -exponent1 + 1)}f})$"
# label1 = f"$\\text{{Residual}} = y_\\text{{data}} - y_\\text{{expected}}${uy_str}"
# histogram of the data
_, bin_edges, _ = axs[1].hist(resid, color="blue",edgecolor='blue', histtype = "step", bins=bins, label ="Residuals distribution", zorder=2)
axs[1].set_ylabel('Counts')
lnspc = np.linspace(resid.min() - 2 * sigma, resid.max() + 2 * sigma, 500)
bin_widths = np.diff(bin_edges)
mean_bin_width = np.mean(bin_widths)
f_gauss = data.size * mean_bin_width * norm.pdf(lnspc, mean, sigma)
axs[1].plot(lnspc, f_gauss, linewidth=1, color='r',linestyle='--', label = label, zorder=1)
axs[1].set_xlabel(label1)
axs[1].yaxis.set_major_locator(MaxNLocator(integer=True))
axs[1].set_xlim(mean - 3 * sigma, mean + 3 * sigma)
axs[1].legend(frameon = False)
apply_inward_ticks(axs[1])
plt.show()
skewness = np.sum((resid - mean)**3) / (len(resid) * sigma**3)
kurtosis = np.sum((resid - mean)**4) / (len(resid) * sigma**4) - 3
_, p_value = shapiro(resid)
dw = durbin_watson(resid)
if verbose:
if 0.10 <= p_value <= 1:
pval_str = f"{p_value*100:.0f}%"
elif 0.005 < p_value < 0.10:
pval_str = f"{p_value * 100:.2f}%"
elif 0.0005 < p_value <= 0.005:
pval_str = f"{p_value * 100:.3f}%"
else:
pval_str = f"< 0.05%"
# Prepara l'unità di misura, se presente
uy_str = f" {unit}" if unit else ""
# Prepara le voci da stampare
entries = [
("Mean value", f"{rounded_mean:.{max(0, -exponent + 1)}f}{uy_str}"),
("Standard deviation", f"{rounded_sigma:.{max(0, -exponent + 1)}f}{uy_str}"),
("Skewness", f"{skewness:.2f}"),
("Kurtosis", f"{kurtosis:.2f}"),
("p-value", pval_str),
("Durbin-Watson", f"{dw:.3f}")
]
# Calcola le larghezze dinamiche
label_width = max(len(label) for label, _ in entries) + 2
value_width = max(len(str(value)) for _, value in entries) + 2
total_width = label_width + value_width + 3
# Costruisci il separatore dinamico e il titolo centrato
title = " Residuals Analysis "
separator = "=" * total_width
title_line = title.center(total_width, "*")
# Costruisci il blocco di testo
stamp = f"{separator}\n{title_line}\n{separator}\n"
for label, value in entries:
stamp += f"{label.ljust(label_width)}: {value.rjust(value_width)}\n"
stamp += f"{separator}\n"
# Aggiungi le interpretazioni finali
if p_value >= 0.05:
result_norm = "Residuals are consistent with a normal distribution."
else:
result_norm = "Residuals deviate significantly from a normal distribution."
if dw < 1.5:
result_dw = "Residuals show evidence of positive autocorrelation."
elif dw > 2.5:
result_dw = "Residuals show evidence of negative autocorrelation."
else:
result_dw = "Residuals do not show significant autocorrelation."
# Aggiungi le interpretazioni al testo finale
stamp += f"\n{result_norm}\n{result_dw}\n"
# Stampa
print(stamp)
return mean, sigma, skewness, kurtosis, p_value, dw
# def samples(n, distribution='normal', **params):
# """
# Generates synthetic data from common probability distributions.
# Parameters
# ----------
# n : int
# Number of data points to generate.
# distribution : {'normal', 'uniform', 'exponential', 'poisson', 'binomial', 'gamma', 'beta', 'lognormal', 'weibull', 'chi2', 't'}, optional
# Type of distribution to sample from. Default is 'normal'.
# **params : dict
# Distribution-specific parameters:
# - normal: mu (mean), sigma (stddev)
# - uniform: low, high
# - exponential: scale (1/lambda)
# - poisson: lam (expected rate)
# - binomial: n (number of trials), p (success probability)
# - gamma: shape, scale
# - beta: alpha, beta
# - lognormal: mean, sigma
# - weibull: shape
# - chi2: df (degrees of freedom)
# - t: df (degrees of freedom)
# Returns
# -------
# data : ndarray
# Array of length `n` with samples drawn from the specified distribution.
# Examples
# --------
# >>> samples(1000, 'normal', mu=0, sigma=1)
# array([...])
# >>> samples(500, 'uniform', low=0, high=10)
# array([...])
# >>> samples(200, 'poisson', lam=3)
# array([...])
# """
# raise NotImplementedError("The function 'samples' has been removed in favor of scipy.stats routines.")
# mu = params.get('mu')
# sigma = params.get('sigma')
# if mu is None or sigma is None:
# raise ValueError("For 'normal' distribution, provide 'mu' and 'sigma'.")
# return rng.normal(loc=mu, scale=sigma, size=n)
# elif dist == 'uniform':
# low = params.get('low')
# high = params.get('high')
# if low is None or high is None:
# raise ValueError("For 'uniform' distribution, provide 'low' and 'high'.")
# return rng.uniform(low=low, high=high, size=n)
# elif dist == 'exponential':
# scale = params.get('scale')
# if scale is None:
# raise ValueError("For 'exponential' distribution, provide 'scale'.")
# return rng.exponential(scale=scale, size=n)
# elif dist == 'poisson':
# lam = params.get('lam')
# if lam is None:
# raise ValueError("For 'poisson' distribution, provide 'lam'.")
# return rng.poisson(lam=lam, size=n)
# elif dist == 'binomial':
# trials = params.get('n')
# p = params.get('p')
# if trials is None or p is None:
# raise ValueError("For 'binomial' distribution, provide 'n' (trials) and 'p' (probability).")
# return rng.binomial(n=trials, p=p, size=n)
# elif dist == 'gamma':
# shape = params.get('shape')
# scale = params.get('scale', 1)
# if shape is None:
# raise ValueError("For 'gamma' distribution, provide 'shape'.")
# return rng.gamma(shape=shape, scale=scale, size=n)
# elif dist == 'beta':
# alpha = params.get('alpha')
# beta = params.get('beta')
# if alpha is None or beta is None:
# raise ValueError("For 'beta' distribution, provide 'alpha' and 'beta'.")
# return rng.beta(alpha=alpha, beta=beta, size=n)
# elif dist == 'lognormal':
# mean = params.get('mean')
# sigma = params.get('sigma')
# if mean is None or sigma is None:
# raise ValueError("For 'lognormal' distribution, provide 'mean' and 'sigma'.")
# return rng.lognormal(mean=mean, sigma=sigma, size=n)
# elif dist == 'weibull':
# shape = params.get('shape')
# if shape is None:
# raise ValueError("For 'weibull' distribution, provide 'shape'.")
# return rng.weibull(a=shape, size=n)
# elif dist == 'chi2':
# df = params.get('df')
# if df is None:
# raise ValueError("For 'chi2' distribution, provide 'df'.")
# return rng.chisquare(df=df, size=n)
# elif dist == 't':
# df = params.get('df')
# if df is None:
# raise ValueError("For 't' distribution, provide 'df'.")
# return rng.standard_t(df=df, size=n)
# else:
# raise ValueError(f"Unsupported distribution '{distribution}'. "
# f"Choose from 'normal', 'uniform', 'exponential', 'poisson', 'binomial', "
# f"'gamma', 'beta', 'lognormal', 'weibull', 'chi2', 't'.")
def remove_outliers(data, data_err=None, expected=None, method="zscore", threshold=3.0):
"""
Removes outliers from a data array according to the specified method.
Parameters
----------
data : array-like
Observed data.
data_err : array-like, optional
Uncertainties on the data. Necessary if comparing with `'expected'`.
expected : array-like, optional
Expected values for the data. If provided, the `'zscore'` method is automatically used.
method : str, optional
Method to use (`"zscore"`, `"mad"`, or `"iqr"`). Default: `"zscore"`.
threshold : float, optional
Threshold value to identify outliers. Default: `3.0`.
Returns
----------
data_clean : ndarray
Data without outliers.
"""
data = np.asarray(data)
# Caso 1: confronto con expected → forza 'zscore'
if expected is not None:
if data_err is None:
raise ValueError("If you provide 'expected', you must also provide 'data_err'.")
expected = np.asarray(expected)
data_err = np.asarray(data_err)
if len(data) != len(expected) or len(data) != len(data_err):
raise ValueError("'data', 'expected', and 'data_err' must have the same length.")
# Metodo unico valido
z_scores = np.abs((data - expected) / data_err)
mask = z_scores < threshold
else:
# Caso 2: solo dati osservati → puoi scegliere il metodo
if method == "zscore":
mean = np.mean(data)
std = np.std(data)
z_scores = np.abs((data - mean) / std)
mask = z_scores < threshold
elif method == "mad":
median = np.median(data)
mad = np.median(np.abs(data - median))
modified_z_scores = 0.6745 * (data - median) / mad
mask = np.abs(modified_z_scores) < threshold
elif method == "iqr":
q1 = np.percentile(data, 25)
q3 = np.percentile(data, 75)
iqr = q3 - q1
lower_bound = q1 - threshold * iqr
upper_bound = q3 + threshold * iqr
mask = (data >= lower_bound) & (data <= upper_bound)
else:
raise ValueError("Unrecognized method. Use 'zscore', 'mad', or 'iqr'.")
return data[mask]
[docs]
def posterior(x: ArrayLike, y: ArrayLike, y_err: ArrayLike, f: Callable[[float], float], p0: Union[List, ArrayLike],
burn: int = 1000, steps: int = 5000, thin: int = 10, maxfev: int = 5000, verbose: bool = True,
names: Optional[List[str]] = None, prior_bounds: Optional[List[tuple]] = None,
plot_dataset: bool = False,
plot_density: bool = True, color: str = 'k', figsize: Tuple[float, float] = DEFAULT_FIGSIZE, **kwargs) -> Tuple[ArrayLike, ArrayLike]:
"""
Performs a Bayesian parameter estimation using MCMC for a user-defined model function.
This function fits a given model `f` to the experimental data `(x, y)` with associated uncertainties `y_err`
by first performing a frequentist optimization (`curve_fit`) to obtain initial estimates, and then
running a Markov Chain Monte Carlo (MCMC) sampling using the `emcee` package to derive the full
posterior distribution of the model parameters. It returns the flattened MCMC sample chain representing the posterior samples,
the maximum likelihood estimate (MLE) of the parameters, and it visualizes the posterior distribution via a corner plot.
Parameters
----------
x : array-like,
Independent variable data points.
y : array-like,
Dependent variable data points to be fitted.
y_err : array-like
Uncertainties of the dependent data `y`. Used for computing chi-squared
and log-likelihood in the MCMC sampling. Can be `None`.
f : callable
The model function to fit. Must be of the form `f(x, *params)`, where `x` is the independent variable
and `params` are the free parameters of the model.
p0 : list or array-like
Initial guess for the M free parameters of the model. Used both for `curve_fit` and to initialize
the MCMC walkers. Can be `None`.
burn : int, optional
Number of burn-in steps to discard from the beginning of each walker chain before flattening.
These initial steps are typically biased by the starting conditions. Default is `1000`.
steps : int, optional
Total number of MCMC steps for each walker. Default is `5000`.
thin : int, optional
Subsampling factor to reduce autocorrelation between samples. Only every `thin`-th sample is retained. Default is 10.
maxfev : int, optional
Maximum number of function evaluations for the `curve_fit` routine. If exceeded, the fit will fail. Default is 5000.
verbose : bool, optional
If `True`, prints a formatted table of ... Default is `True`.
names : list of str, optional
Parameter names to be used in the `corner` plot and output. If `None`, defaults to ['p0', 'p1', ..., 'pN'].
prior_bounds : list of tuple, optional
Prior bounds on the parameters, as a list of `(min, max)` tuples for each parameter.
If `None`, assumes uninformative priors that only reject non-positive values.
plot_dataset : bool, optional
Draw the individual data points. Default is `False`.
plot_density : bool, optional
Draw the density colormap. Default is `False`.
color : str, optional
A `matplotlib` style color for all histograms. Default is `k`
figsize : tuple of float, optional
Figure size passed to the corner plot. Default is `(6.4 * 1.2, 4.8 * 1.2)`.
**kwargs
Any remaining keyword arguments are passed to `corner.corner`.
Returns
-------
mle_params : numpy.ndarray, shape (M,)
The parameter vector corresponding to the sample with the highest log-probability (maximum
likelihood estimate) in the posterior chain
flat_samples : numpy.ndarray, shape (n_samples, M)
Flattened MCMC sample chain (after burn-in and thinning). Each row is a sample of the M parameters.
This chain can be used for uncertainty analysis, plotting posteriors, or further statistical inference.
Notes
-----
- This implementation assumes a **uniform prior** over all parameters, constrained to be strictly positive.
Parameters less than or equal to zero are automatically rejected (log-prob = -inf).
- The `params` object returned is for reference only; it does not contain MCMC results.
To use posterior values in further analysis, refer to `flat_samples`.
- The performance and correctness of the posterior heavily depend on the choice of priors, model structure,
and convergence of the sampler. Always check convergence diagnostics in real analyses.
Example
--------
>>> import numpy as np
>>> from labtoolbox.stats import posterior
>>> def model(x, a, b):
... return a * x + b
>>> x = np.linspace(0, 10, 50)
>>> y = model(x, 2.5, 1.0) + np.random.normal(0, 0.5, size=x.size)
>>> y_err = 0.5 * np.ones_like(y)
>>> posterior(x, y, y_err, model, [1, 1])
"""
missing = []
try:
import emcee
except ImportError:
missing.append("emcee")
try:
import corner
except ImportError:
missing.append("corner")
if missing:
raise ImportError(
f"The following packages are missing: {', '.join(missing)}. "
"Please install them using pip."
)
if not callable(f):
raise TypeError("'f' must be a callable function.")
x = np.asarray(x)
y = np.asarray(y)
y_err = np.asarray(y_err)
if not (len(x) == len(y) == len(y_err)):
raise ValueError("'x', 'y' and 'y_err' must have the same length.")
if (not (np.issubdtype(x.dtype, np.floating) or np.issubdtype(x.dtype, np.integer))) or not np.all(np.isreal(x)):
raise TypeError("'x' must contain only real numbers (int or float).")
if not np.all(np.isfinite(x)):
raise ValueError("'x' contains non-finite values (NaN or inf).")
if (not (np.issubdtype(y.dtype, np.floating) or np.issubdtype(y.dtype, np.integer))) or not np.all(np.isreal(y)):
raise TypeError("'y' must contain only real numbers (int or float).")
if not np.all(np.isfinite(y)):
raise ValueError("'y' contains non-finite values (NaN or inf).")
if (not (np.issubdtype(y_err.dtype, np.floating) or np.issubdtype(y_err.dtype, np.integer))) or not np.all(np.isreal(y_err)):
raise TypeError("'y_err' must contain only real numbers (int or float).")
if not np.all(np.isfinite(y_err)):
raise ValueError("'y_err' contains non-finite values (NaN or inf).")
if not isinstance(burn, (int)):
raise TypeError("'burn' must be an integer.")
if burn <= 0:
raise ValueError("'burn' must be equal or greater than 1.")
if not isinstance(steps, (int)):
raise TypeError("'steps' must be an integer.")
if steps <= 0:
raise ValueError("'steps' must be equal or greater than 1.")
if not isinstance(thin, (int)):
raise TypeError("'thin' must be an integer.")
if thin <= 0:
raise ValueError("'thin' must be equal or greater than 1.")
if not isinstance(maxfev, (int)):
raise TypeError("'maxfev' must be an integer.")
if maxfev <= 0:
raise ValueError("'maxfev' must be equal or greater than 1.")
if not isinstance(figsize, (list, tuple)) or len(figsize) != 2 or not all(isinstance(u, (int, float)) and np.isfinite(u) for u in figsize):
raise TypeError("'figsize' must be a list or tuple of exactly two finite real numbers.")
p0 = np.array(p0)
if (not (np.issubdtype(p0.dtype, np.floating) or np.issubdtype(p0.dtype, np.integer))) or not np.all(np.isreal(p0)):
raise TypeError("'p0' must contain only real numbers (int or float).")
if not np.all(np.isfinite(p0)):
raise ValueError("'p0' contains non-finite values (NaN or inf).")
ndim = len(p0)
nwalkers = 2 * ndim
if names is None:
names = [f"p{i}" for i in range(ndim)]
if prior_bounds is None:
# Wide uninformative priors
prior_bounds = [(-np.inf, np.inf)] * ndim
from scipy.optimize import curve_fit
# Fit iniziale per inizializzare bene i walkers
popt, _ = curve_fit(f, x, y, p0=p0, sigma=y_err, absolute_sigma=True, maxfev=maxfev)
# Log-likelihood
def log_likelihood(theta):
model = f(x, *theta)
return -0.5 * np.sum(((y - model) / y_err) ** 2)
# Log-prior uniforme (con bound)
def log_prior(theta):
for i, (p, (low, high)) in enumerate(zip(theta, prior_bounds)):
if not (low < p < high):
return -np.inf
return 0.0
# Log-posterior
def log_posterior(theta):
lp = log_prior(theta)
if not np.isfinite(lp):
return -np.inf
return lp + log_likelihood(theta)
# Inizializzazione walker intorno a popt
p0_walkers = popt + 1e-4 * np.random.randn(nwalkers, ndim)
# Sampling
sampler = emcee.EnsembleSampler(nwalkers, ndim, log_posterior)
sampler.run_mcmc(p0_walkers, steps, progress=True)
# Estrai i samples
flat_samples = sampler.get_chain(discard=burn, thin=thin, flat=True)
# ho tolto truths = popt
# Calcola posteriori con arrotondamenti consistenti
posterior_results = []
log_prob = sampler.get_log_prob(discard=burn, thin=thin, flat=True)
mle_index = np.argmax(log_prob)
mle_params = flat_samples[mle_index]
for i in range(len(names)):
median = np.median(flat_samples[:, i])
lower = np.percentile(flat_samples[:, i], 16)
upper = np.percentile(flat_samples[:, i], 84)
plus = upper - median
minus = median - lower
exponent1 = int(math.floor(math.log10(abs(plus)))) if plus != 0 else 0
exponent2 = int(math.floor(math.log10(abs(minus)))) if minus != 0 else 0
common_exponent = min(exponent1, exponent2)
factor = 10 ** (common_exponent - 1)
rounded_plus = round(plus / factor) * factor
rounded_minus = round(minus / factor) * factor
decimal_places = max(-common_exponent + 1, 0)
rounded_median = round(median, decimal_places)
rounded_mle = round(mle_params[i], decimal_places)
param_name = f"Parameter {i+1}"
posterior_results.append((param_name, rounded_median, rounded_plus, rounded_minus, rounded_mle, decimal_places))
if verbose:
# Determina le larghezze massime per ciascuna colonna
col_headers = ["Parameter", "Median", "+1σ", "-1σ", "MLE"]
columns = [ [r[0] for r in posterior_results], # Parametri
[f"{r[1]:.{r[5]}f}" for r in posterior_results], # Mediana
[f"{r[2]:.{r[5]}f}" for r in posterior_results], # +1σ
[f"{r[3]:.{r[5]}f}" for r in posterior_results], # -1σ
[f"{r[4]:.{r[5]}f}" for r in posterior_results] ] # MLE
# Calcola le larghezze dinamiche (massimo tra intestazione e valori)
col_widths = []
for header, col_data in zip(col_headers, columns):
max_width = max(len(header), max(len(str(val)) for val in col_data))
col_widths.append(max_width)
# Calcola larghezza totale della tabella
total_width = sum(col_widths) + 3 * len(col_widths) + 1 # +1 per '|', +3 per spazi e separatori
# Costruisci il titolo centrato
title = "Summary Report"
padding = (total_width - len(title)) // 2
header_line = "=" * total_width
title_line = " " * padding + title + " " * padding
if len(title_line) < total_width:
title_line += " "
# Stampa la tabella
print(header_line)
print(title_line)
print(header_line)
# Intestazione
header = "|"
for header_text, width in zip(col_headers, col_widths):
header += f" {header_text.center(width)} |"
print(header)
print("-" * total_width)
# Righe con allineamento centrato
for row in posterior_results:
param, median, plus, minus, mle, dec = row
row_str = "|"
row_str += f" {param.center(col_widths[0])} |"
row_str += f" {f'{median:.{dec}f}'.center(col_widths[1])} |"
row_str += f" {f'{plus:.{dec}f}'.center(col_widths[2])} |"
row_str += f" {f'{minus:.{dec}f}'.center(col_widths[3])} |"
row_str += f" {f'{mle:.{dec}f}'.center(col_widths[4])} |"
print(row_str)
print(header_line)
# Plot corner
# Rimuovi eventuali duplicati nel kwargs
kwargs.setdefault("plot_density", plot_density)
kwargs.setdefault("plot_datapoints", plot_dataset)
# corner.corner(flat_samples, labels=names, no_fill_contours=True, color="royalblue", hist_kwargs={"color": "royalblue", "alpha": 0.8, "linewidth": 1.5},
# contour_kwargs={"colors": ["darkblue"], "levels": [0.864, 0.675,]}, **kwargs)
fig = corner.corner(flat_samples,
labels=names,
fill_contours=True, # riempie le regioni dei contorni con colori sfumati
#no_fill_contours=True,
color = color,
hist_kwargs={"linewidth": 0.8},
#contourf_kwargs={"cmap": "viridis", "colors": None},
**kwargs)
# fig.set_size_inches(forward=True)
# Ora scorro tutti gli assi (subplots) e modifico la linewidth delle linee dei contorni
# Per ogni asse (subplot) cerco se ci sono contorni 2D (QuadContourSet)
for ax in fig.axes:
# ax.collections contiene le collezioni di artisti, i contorni sono lì dentro
for collection in ax.collections:
# Modifico la linewidth della collezione
collection.set_linewidth(0.8)
# Modifica linewidth per gli istogrammi 1D (linee)
for line in ax.lines:
line.set_linewidth(0.8)
apply_inward_ticks(fig)
plt.show()
return mle_params, flat_samples
[docs]
def propagate(func: Callable[[Union[float, ArrayLike]], Union[float, ArrayLike]], x_val: List[ArrayLike], x_err: List[ArrayLike],
params: Optional[List[ArrayLike]] = None, method: str = 'Monte_Carlo', MC_sample_size: int = 10000) -> Tuple[ArrayLike, ArrayLike, Tuple[ArrayLike, ArrayLike]]:
"""
Propagates uncertainty from input arrays to a generic function using the `uncertainty_class` library.
Parameters
----------
func : callable
The base function, in the form `f(x, a)` where:
- `x` is a vector of variables.
- `a` is a vector of parameters (optional).
x_val : list of numpy.array
List containing the input variable arrays `[x1, x2, ..., xn]`. Each entry can be:
- a scalar,
- a numpy array of values (same length across all xi).
x_err : list or numpy.array
List of uncertainties for each variable (scalars or arrays),
or a full covariance matrix.
params : list or numpy.array, optional
List or array of constant parameters `[a1, a2, ..., am]`.
method : str, optional
The uncertainty propagation method ('Delta' or 'Monte_Carlo').
MC_sample_size : int, optional
Sample size for the Monte Carlo method.
Returns
--------
f_values : numpy.ndarray
Values of the function calculated at each point `j`.
f_err : numpy.ndarray
Propagated uncertainties on the output function for each point `j`.
confidence_bands : tuple of numpy.ndarray
Lower and upper confidence bands for each point `j`.
"""
from .._helper import uncert_prop
# the `uncertainty_class` package, available at https://github.com/yiorgoskost/Uncertainty-Propagation/tree/master,
# provides functionality for uncertainty propagation in calculations.
# --- Controllo func ---
if not callable(func):
raise TypeError("'func' must be a callable function.")
# --- Controllo x_val ---
if not isinstance(x_val, list):
raise TypeError("'x_val' must be a list of numpy.array or real numbers (int or float).")
if len(x_val) == 0:
raise ValueError("'x_val' cannot be empty.")
array_lengths = []
for i, xi in enumerate(x_val):
if np.isscalar(xi):
if not isinstance(xi, (int, float)):
raise TypeError(f"'x_val[{i}]' scalar must be a real number (int or float).")
elif isinstance(xi, np.ndarray):
if not (np.issubdtype(xi.dtype, np.floating) or np.issubdtype(xi.dtype, np.integer)):
raise TypeError(f"'x_val[{i}]' must contain real numbers (int or float).")
if not np.all(np.isfinite(xi)):
raise ValueError(f"'x_val[{i}]' contains non-finite values (NaN or inf)..")
array_lengths.append(len(xi))
else:
raise TypeError(f"'x_val[{i}]' must be a scalar or numpy.ndarray.")
if array_lengths:
if len(set(array_lengths)) != 1:
raise ValueError("All 'x_val[i]' arrays must all have the same length.")
# --- Controllo x_err ---
if isinstance(x_err, list):
if len(x_err) != len(x_val):
raise ValueError("'x_err' list must have the same length as 'x_val'.")
for i, err_i in enumerate(x_err):
if not (np.isscalar(err_i) or isinstance(err_i, np.ndarray)):
raise TypeError(f"'x_err[{i}]' must be a scalar or numpy.ndarray.")
if np.isscalar(err_i):
if not isinstance(err_i, (int, float)):
raise TypeError(f"'x_err[{i}]' scalar must be a real number (int or float).")
if isinstance(err_i, np.ndarray):
if not (np.issubdtype(err_i.dtype, np.floating) or np.issubdtype(err_i.dtype, np.integer)):
raise TypeError(f"'x_err[{i}]' must contain real numbers (int or float).")
if not np.all(np.isfinite(err_i)):
raise ValueError(f"'x_err[{i}]' contains non-finite values (NaN or inf).")
if array_lengths and len(err_i) != array_lengths[0]:
raise ValueError(f"'x_err[{i}]' must have the same length as 'x_val' arrays.")
elif isinstance(x_err, np.ndarray):
if x_err.ndim != 2 or x_err.shape[0] != x_err.shape[1]:
raise ValueError("'x_err' numpy.ndarray covariance matrix must be 2D square.")
expected_size = len(x_val)
if x_err.shape[0] != expected_size:
raise ValueError(f"'x_err' covariance matrix must be of size {expected_size}x{expected_size}.")
if not (np.issubdtype(x_err.dtype, np.floating) or np.issubdtype(x_err.dtype, np.integer)):
raise TypeError("'x_err' covariance matrix must contain real numbers (int or float).")
if not np.all(np.isfinite(x_err)):
raise ValueError("'x_err' covariance matrix contains non-finite values (NaN or inf).")
else:
raise TypeError("'x_err' must be a list or numpy.ndarray.")
# --- Controllo params ---
if params is not None:
if not (isinstance(params, list) or isinstance(params, np.ndarray)):
raise TypeError("'params' must be a list or numpy.ndarray.")
else:
# Se vuoi, puoi aggiungere controlli sul contenuto di params qui
pass
# --- Controllo MC_sample_size ---
if not isinstance(MC_sample_size, int):
raise TypeError("'MC_sample_size' must be an integer.")
if MC_sample_size <= 0:
raise ValueError("'MC_sample_size' must be equal or greater than 1.")
# Normalizza x_val in lista di array
x_val = [np.atleast_1d(xi) for xi in x_val]
# Determina la lunghezza degli array
lengths = [len(xi) for xi in x_val]
unique_lengths = set(lengths)
if len(unique_lengths) == 1:
n_points = lengths[0]
else:
raise ValueError("All input arrays (or scalars) must have the same length.")
# Inizializza gli array di output
f_values = np.zeros(n_points)
f_err = np.zeros(n_points)
confidence_bands_lower = np.zeros(n_points)
confidence_bands_upper = np.zeros(n_points)
# Prepara la funzione wrapper che accetta un vettore di variabili
def wrapped_func(x_vector):
if params is not None:
return func(*x_vector, *params)
else:
return func(*x_vector)
# Per ogni punto j, calcola f[j] e la sua incertezza
for j in range(n_points):
# Estrai i valori per il punto j
x_point = np.array([x[j] for x in x_val])
# Prepara la matrice di covarianza
if isinstance(x_err, list):
# Se uncertainties è una lista di incertezze per ogni variabile
if all(isinstance(u, (int, float)) for u in x_err):
# Se sono scalari, crea una matrice diagonale
cov_matrix = np.diag([u**2 for u in x_err])
else:
# Se sono array, prendi il valore per il punto j
cov_matrix = np.diag([u[j]**2 for u in x_err])
else:
# Assume che uncertainties sia già una matrice di covarianza
cov_matrix = x_err
# Crea l'oggetto uncert_prop
uncertainty_propagator = uncert_prop(
func = wrapped_func,
x = x_point,
cov_matrix = cov_matrix,
method = method,
MC_sample_size = MC_sample_size
)
# Calcola il valore della funzione
f_values[j] = wrapped_func(x_point)
# Calcola l'incertezza propagata
f_err[j] = uncertainty_propagator.SEM()
# Calcola le bande di confidenza
lcb, ucb = uncertainty_propagator.confband()
confidence_bands_lower[j] = lcb
confidence_bands_upper[j] = ucb
return f_values, f_err, (confidence_bands_lower, confidence_bands_upper)
[docs]
def bayes_factor(x: ArrayLike, y: ArrayLike, y_err: ArrayLike, f1: Callable[[Union[float, ArrayLike]], Union[float, ArrayLike]],
p0_1: Union[List, ArrayLike], f2: Callable[[Union[float, ArrayLike]], Union[float, ArrayLike]],
p0_2: Union[List, ArrayLike], burn: int = 1000, steps: int = 5000,
thin: int = 10, maxfev: int = 5000, prior_bounds1: Optional[List[tuple]] = None,
prior_bounds2: Optional[List[tuple]] = None, verbose: bool = True) -> Tuple[float, float, float]:
"""
Estimate the Bayes factor between two models using the Bayesian Information Criterion (BIC).
Parameters
----------
x : array-like
Independent variable values of the dataset.
y : array-like
Dependent variable (observed data).
y_err : array-like
Uncertainties (standard deviations) on the observed data.
f1 : callable
First model function to compare.
p0_1 : list or array-like
Initial guess for the parameters of the first model.
f2 : callable
Second model function to compare.
p0_2 : list or array-like
Initial guess for the parameters of the second model.
burn : int, optional
Number of initial MCMC steps to discard. Default is `1000`.
steps : int, optional
Total number of MCMC steps per walker. Default is `5000`.
thin : int, optional
Thinning factor applied when flattening the MCMC chains. Default is `10`.
maxfev : int, optional
Maximum number of function evaluations for the curve fitting. Default is `5000`.
prior_bounds1 : list of tuples, optional
Bounds for the uniform priors of the first model. Each element must be a `(min, max)` tuple.
If `None`, unbounded uniform priors are assumed.
prior_bounds2 : list of tuples, optional
Bounds for the uniform priors of the second model.
verbose : bool, optional
If `True`, prints a formatted table of the returns. Default is `True`.
Returns
-------
lnB12 : float
Estimated natural logarithm of the Bayes factor.
A positive value favors model M1; a negative value favors model M1.
BIC1 : float
Bayesian Information Criterion for the first model.
BIC2 : float
Bayesian Information Criterion for the second model.
Notes
-----
Interpretation of ln(B12):
- ln B12 > 5 : Strong evidence for model 1
- ln B12 ∈ [2.5, 5) : Moderate evidence for model 1
- ln B12 ∈ [1, 2.5) : Weak evidence for model 1
- ln B12 ∈ [-1, 1) : Inconclusive
- ln B12 ∈ [-2.5, -1) : Weak evidence for model 2
- ln B12 ∈ [-5, -2.5) : Moderate evidence for model 2
- ln B12 < -5 : Strong evidence for model 2
The approximation assumes that the prior volume is not too informative and that the maximum a posteriori estimate is close to the maximum likelihood.
"""
from .._helper import format_smart
from scipy.optimize import curve_fit
try:
import emcee
except ImportError:
raise ImportError(
"The 'emcee' package is not installed."
"Please install it by running 'pip install emcee'."
)
# --- Controllo x ---
if not hasattr(x, '__iter__'):
raise TypeError("'x' must be array-like.")
x_arr = np.asarray(x)
if not (np.issubdtype(x_arr.dtype, np.floating) or np.issubdtype(x_arr.dtype, np.integer)):
raise TypeError("'x' must contain real numbers (int or float).")
if not np.all(np.isfinite(x_arr)):
raise ValueError("'x' contains non-finite values (NaN or inf).")
# --- Controllo y ---
if not hasattr(y, '__iter__'):
raise TypeError("'y' must be array-like.")
y_arr = np.asarray(y)
if not (np.issubdtype(y_arr.dtype, np.floating) or np.issubdtype(y_arr.dtype, np.integer)):
raise TypeError("'y' must contain only real numbers (int or float).")
if not np.all(np.isfinite(y_arr)):
raise ValueError("'y' contains non-finite values (NaN or inf).")
# --- Controllo y_err ---
if not hasattr(y_err, '__iter__'):
raise TypeError("'y_err' must be array-like.")
y_err_arr = np.asarray(y_err)
if not (np.issubdtype(y_err_arr.dtype, np.floating) or np.issubdtype(y_err_arr.dtype, np.integer)):
raise TypeError("'y_err' must contain only real numbers (int or float).")
if not np.all(np.isfinite(y_err_arr)):
raise ValueError("'y_err' contains non-finite values (NaN or inf).")
# --- Controllo lunghezze coerenti ---
if not (len(x_arr) == len(y_arr) == len(y_err_arr)):
raise ValueError("'x', 'y' and 'y_err' must have the same length.")
# --- Controllo f1 ---
if not callable(f1):
raise TypeError("'f1' must be callable.")
# --- Controllo p0_1 ---
p0_1 = np.array(p0_1)
if (not (np.issubdtype(p0_1.dtype, np.floating) or np.issubdtype(p0_1.dtype, np.integer))) or not np.all(np.isreal(p0_1)):
raise TypeError("'p0_1' must contain only real numbers (int or float).")
if not np.all(np.isfinite(p0_1)):
raise ValueError("'p0_1' contains non-finite values (NaN or inf).")
# --- Controllo f2 ---
if not callable(f2):
raise TypeError("'f2' must be callable.")
# --- Controllo p0_2 ---
p0_2 = np.array(p0_2)
if (not (np.issubdtype(p0_2.dtype, np.floating) or np.issubdtype(p0_2.dtype, np.integer))) or not np.all(np.isreal(p0_2)):
raise TypeError("'p0_2' must contain only real numbers (int or float).")
if not np.all(np.isfinite(p0_2)):
raise ValueError("'p0_2' contains non-finite values (NaN or inf).")
# --- Controllo burn ---
if not (isinstance(burn, int) and burn >= 0):
raise ValueError("'burn' must be a non-negative integer.")
# --- Controllo steps ---
if not (isinstance(steps, int) and steps > 0):
raise ValueError("'steps' must be a positive integer.")
# --- Controllo thin ---
if not (isinstance(thin, int) and thin > 0):
raise ValueError("'thin' must be a positive integer.")
# --- Controllo maxfev ---
if not (isinstance(maxfev, int) and maxfev > 0):
raise ValueError("'maxfev' must be a positive integer.")
# --- Controllo prior_bounds1 ---
if prior_bounds1 is not None:
if not isinstance(prior_bounds1, list):
raise TypeError("'prior_bounds1' must be a list of (min, max) tuples or None.")
for i, bound in enumerate(prior_bounds1):
if not (isinstance(bound, tuple) and len(bound) == 2):
raise TypeError(f"'prior_bounds1[{i}]' must be a tuple of length 2.")
if not all(isinstance(v, (int, float)) for v in bound):
raise TypeError(f"Elements of 'prior_bounds1[{i}]' must be a real numbers (int or float).")
# --- Controllo prior_bounds2 ---
if prior_bounds2 is not None:
if not isinstance(prior_bounds2, list):
raise TypeError("'prior_bounds2' must be a list of (min, max) tuples or None.")
for i, bound in enumerate(prior_bounds2):
if not (isinstance(bound, tuple) and len(bound) == 2):
raise TypeError(f"'prior_bounds2[{i}]' must be a tuple of length 2.")
if not all(isinstance(v, (int, float)) for v in bound):
raise TypeError(f"Elements of 'prior_bounds2[{i}]' must be a real numbers (int or float).")
if len(x) <= 10 * len(p0_1) or len(x) <= 10 * len(p0_2):
warnings.warn("The BIC approximation is only valid for sample size much larger than the number of parameters in the model. Results may be inaccurate.", Warning)
if not (len(x) == len(y) == len(y_err)):
raise ValueError("'x', 'y' and 'y_err' must have the same length.")
def fit_model(f, p0, prior_bounds):
ndim = len(p0)
nwalkers = 2 * ndim
popt, _ = curve_fit(f, x, y, p0=p0, sigma=y_err, absolute_sigma=True, maxfev=maxfev)
def log_likelihood(theta):
model = f(x, *theta)
return -0.5 * np.sum(((y - model) / y_err) ** 2)
def log_prior(theta):
if prior_bounds is None:
return 0.0
for p, (low, high) in zip(theta, prior_bounds):
if not (low < p < high):
return -np.inf
return 0.0
def log_posterior(theta):
lp = log_prior(theta)
if not np.isfinite(lp):
return -np.inf
return lp + log_likelihood(theta)
# Sampling
p0_walkers = popt + 1e-4 * np.random.randn(nwalkers, ndim)
sampler = emcee.EnsembleSampler(nwalkers, ndim, log_posterior)
sampler.run_mcmc(p0_walkers, steps, progress=True)
flat_log_prob = sampler.get_log_prob(discard=burn, thin=thin, flat=True)
max_log_like = np.max(flat_log_prob)
bic = ndim * np.log(len(x)) - 2 * max_log_like
return bic, max_log_like
BIC1, logL1 = fit_model(f1, p0_1, prior_bounds1)
BIC2, logL2 = fit_model(f2, p0_2, prior_bounds2)
lnB12 = -0.5 * (BIC1 - BIC2)
if verbose:
label_width = 10
if lnB12 >= 5:
result = "Strong evidence for model 1"
elif 2.5 <= lnB12 < 5:
result = "Moderate evidence for model 1"
elif 1 <= lnB12 < 2.5:
result = "Weak evidence for model 1"
elif -1 <= lnB12 < 1:
result = 'Inconclusive'
elif -2.5 <= lnB12 < -1:
result = 'Weak evidence for model 2'
elif -5 <= lnB12 < -2.5:
result = 'Moderate evidence for model 2'
else:
result = 'Strong evidence for model 2'
# Prepariamo le intestazioni e i dati
header = f"{'Model':<{label_width}} | {'BIC':>12} | {'logL':>12}"
model1_line = f"{'Model 1':<{label_width}} | {format_smart(BIC1, equalsign=False):>12} | {format_smart(logL1, equalsign=False):>12}"
model2_line = f"{'Model 2':<{label_width}} | {format_smart(BIC2, equalsign=False):>12} | {format_smart(logL2, equalsign=False):>12}"
divider = "-" * len(header)
# Stampa con layout elegante e dinamico
print()
print("=" * len(header))
print(header)
print(divider)
print(model1_line)
print(model2_line)
print(divider)
print(f"{'Result':<{label_width}} | {'log(BF)':>12} | {format_smart(lnB12, equalsign=False):>12}")
print("=" * len(header))
# Evidenzia il risultato con un messaggio elegante
result_box = f"*** {result} ***"
# Calcola la lunghezza dinamica per centrare il messaggio
box_width = max(len(header), len(result_box))
print(result_box.center(box_width))
print("=" * box_width)
return lnB12, BIC1, BIC2
[docs]
def mean(x: Union[float, ArrayLike], kind: str = 'arith') -> float:
"""
Compute the mean of an input scalar or numpy array, with the specified type of mean.
Parameters
----------
x : float or array-like
Input data. Can be a scalar or a numpy array. If a scalar is provided, it is returned as the mean.
kind : str or float, optional
Type of mean to compute. Supported options are:
- 'max' : maximum value of x
- 'min' : minimum value of x
- 'arith' : arithmetic mean (default)
- 'geom' : geometric mean
- 'harmonic' : harmonic mean
- 'rms' : root mean square (quadratic mean)
- 'cubic' : cubic mean (third-order)
- 'agm' : arithmetic-geometric mean
- float (number) : generalized mean of order p
Returns
-------
result : float
The computed mean of `x`, according to the specified type.
Examples
--------
>>> from labtoolbox.stats import mean
>>> mean([1, 2, 3, 4], 'arith')
2.5
>>> mean([1, 2, 3, 4], 'rms')
2.7386127875258306
>>> mean([1, 2, 3, 4], 2)
2.7386127875258306
"""
x = np.asarray(x)
# Handle scalar case
if np.isscalar(x):
if not isinstance(x, (int, float)):
raise TypeError("'x' must be a real number (int or float).")
return x
else:
if (not (np.issubdtype(x.dtype, np.floating) or np.issubdtype(x.dtype, np.integer))) or not np.all(np.isreal(x)):
raise TypeError("'x' must contain only real numbers (int or float).")
if not np.all(np.isfinite(x)):
raise ValueError("'x' contains non-finite values (NaN or inf).")
if kind == 'arith':
return np.mean(x)
elif kind == 'geom':
if np.any(x <= 0):
raise ValueError("Geometric mean requires all elements to be positive.")
return np.exp(np.mean(np.log(x)))
elif kind == 'harmonic':
if np.any(x == 0):
raise ValueError("Harmonic mean requires non-zero elements.")
return len(x) / np.sum(1.0 / x)
elif kind == 'max':
return np.max(x)
elif kind == 'min':
return np.min(x)
elif kind == 'rms':
return np.sqrt(np.mean(x**2))
elif kind == 'cubic':
return np.cbrt(np.mean(x**3))
elif kind == 'agm':
if np.any(x <= 0):
raise ValueError("Arithmetic-Geometric Mean requires all elements to be positive.")
a = np.mean(x)
g = np.exp(np.mean(np.log(x)))
while not np.isclose(a, g, rtol=1e-10):
a_new = 0.5 * (a + g)
g = np.sqrt(a * g)
a = a_new
return a
elif isinstance(kind, (float, int)):
p = kind
if p == 0:
if np.any(x <= 0):
raise ValueError("Generalized mean of order 0 (geometric) requires positive values.")
return np.exp(np.mean(np.log(x)))
return (np.mean(x**p))**(1/p)
else:
raise ValueError(f"Unsupported mean type '{kind}'.")
[docs]
def lin_fit(x: ArrayLike, y: ArrayLike, y_err: ArrayLike, x_err: Optional[ArrayLike] = None, fitmodel: str = "wls",
xlabel: str = "x [ux]", ylabel: str = "y [uy]", xlim: Optional[Tuple[float, float]] = None, ylim: Optional[Tuple[float, float]] = None,
showlegend: bool = True, legendloc: Optional[str] = None, log: Optional[str] = None,
xscale: int = 0, yscale: int = 0, mscale: int = 0, cscale: int = 0, m_units: str = "", c_units: str = "",
confidence: int = 2, confidencerange: bool = True, residuals: bool = True, norm: bool = True,
verbose: bool = False, summary: bool = True, figsize: Tuple[float, float] = DEFAULT_FIGSIZE) -> Tuple[float, float, float, float, float, float]:
"""
Performs a linear fit (Weighted Least Squares or Ordinary Least Squares) and displays experimental data along with the regression line and uncertainty band.
Parameters
----------
x : array-like
Values of the independent variable.
y : array-like
Values of the dependent variable.
y_err : array-like
Uncertainties associated with y values.
x_err : array-like, optional
Uncertainties associated with x values.
fitmodel : str, optional
Fitting model, either "wls" or "ols". Default is "wls".
xlabel : str, optional
Label for the x-axis, including units in square brackets (e.g., "x [m]").
ylabel : str, optional
Label for the y-axis, including units in square brackets (e.g., "y [s]").
showlegend : bool, optional
If `True`, displays a legend with the values of m and c on the plot.
legendloc : str, optional
Location of the legend on the plot ('upper right', 'lower left', 'upper center', etc.). Default is `None`.
log : str, optional
If set to `'x'` or `'y'`, the corresponding axis is plotted on a logarithmic scale; if `'xy'`, both axes. Default is `None`.
xscale : int, optional
Scaling factor for the x-axis (e.g., `xscale = -2` corresponds to 1e-2, to convert meters to centimeters).
yscale : int, optional
Scaling factor for the y-axis.
xlim : tuple, optional
Limits for the x-axis, in the form (xmin, xmax). The values should
already be scaled with respect to `xscale`. If `None` or an empty tuple,
the default limits will be automatically determined from the data.
ylim : tuple, optional
Limits for the y-axis, in the form (ymin, ymax). The values should
already be scaled with respect to `yscale`. If `None` or an empty tuple,
the default limits will be automatically determined from the data.
mscale : int, optional
Scaling factor for the slope `m`.
cscale : int, optional
Scaling factor for the intercept `c`.
m_units : str, optional
Unit of measurement for `m`. Default is `""`.
c_units : str, optional
Unit of measurement for `c`. Default is `""`.
confidence : int, optional
Residual confidence interval to display, i.e., `[-confidence, +confidence]`.
confidencerange : bool, optional
If `True`, shows the 1σ uncertainty band around the fit line.
residuals : bool, optional
If `True`, adds an upper panel showing fit residuals.
norm : bool, optional
If `True`, residuals in the upper panel will be normalized.
verbose : bool, optional
If `True`, prints the output of `wls_fit` (or `ols_fit`) to the screen. Default is `False`.
summary : bool, optional
If `True`, prints a formatted summary report of the fit, including
reduced chi-square, p-value, and percentage of residuals within ±2σ.
figsize : tuple of float, optional
Figure size passed to `matplotlib`. Default is `(6.4 * 1.2, 4.8 * 1.2)`.
Returns
----------
m : float
Slope of the regression line.
c : float
Intercept of the regression line.
sigma_m : float
Uncertainty on the slope.
sigma_c : float
Uncertainty on the intercept.
chi2_red : float
Reduced chi-square value (χ²/dof).
p_value : float
Fit p-value.
Notes
----------
- The values of `xscale` and `yscale` affect only the axis scaling in the plot and have no impact on the fitting computation itself. All model parameters are estimated using the original input data as provided.
- LaTeX formatting is already embedded in the strings used to display the units of `m` and `c`. You do not need to use "$...$".
- If `c_scale = 0` (recommended when using `c_units`), then `c_units` will represent the suffix corresponding to 10^yscale (+ `y_units`).
- If `m_scale = 0` (recommended when using `m_units`), then `m_units` will represent the suffix corresponding to 10^(yscale - xscale) [+ `y_units/x_units`].
"""
from scipy.stats import chi2
import math as math
from .._helper import my_cov, my_mean, my_var, my_line, y_estrapolato
try:
import statsmodels.api as sm
except ImportError:
raise ImportError(
"The 'statsmodels' package is not installed. "
"Please install it by running 'pip install statsmodels'."
)
x = np.asarray(x)
y = np.asarray(y)
y_err = np.asarray(y_err)
if x_err is not None:
x_err = np.asarray(x_err)
if not (len(x) == len(y) == len(y_err) == len(x_err)):
raise ValueError("'x', 'y', 'x_err' and 'y_err' must have the same length.")
if (not (np.issubdtype(x_err.dtype, np.floating) or np.issubdtype(x_err.dtype, np.integer))) or not np.all(np.isreal(x_err)):
raise TypeError("'y_err' must contain only real numbers (int or float).")
if not np.all(np.isfinite(x_err)):
raise ValueError("'x_err' contains non-finite values (NaN or inf).")
else:
if not (len(x) == len(y) == len(y_err)):
raise ValueError("'x', 'y' and 'y_err' must have the same length.")
if (not (np.issubdtype(x.dtype, np.floating) or np.issubdtype(x.dtype, np.integer))) or not np.all(np.isreal(x)):
raise TypeError("'x' must contain only real numbers (int or float).")
if (not (np.issubdtype(y.dtype, np.floating) or np.issubdtype(y.dtype, np.integer))) or not np.all(np.isreal(y)):
raise TypeError("'y' must contain only real numbers (int or float).")
if (not (np.issubdtype(y_err.dtype, np.floating) or np.issubdtype(y_err.dtype, np.integer))) or not np.all(np.isreal(y_err)):
raise TypeError("'y_err' must contain only real numbers (int or float).")
if not np.all(np.isfinite(x)):
raise ValueError("'x' contains non-finite values (NaN or inf).")
if not np.all(np.isfinite(y)):
raise ValueError("'y' contains non-finite values (NaN or inf).")
if not np.all(np.isfinite(y_err)):
raise ValueError("'y_err' contains non-finite values (NaN or inf).")
if not isinstance(xscale, (int, float)):
raise TypeError("'xscale' must be a real number (int or float).")
if not isinstance(yscale, (int, float)):
raise TypeError("'yscale' must be a real number (int or float).")
if xlim is not None:
if not isinstance(xlim, list):
raise TypeError("'xlim' must be a list (either empty or containing two real numbers).")
if len(xlim) != 0:
if len(xlim) != 2:
raise TypeError("'xlim' must be empty or a list of exactly two real numbers.")
if not all(isinstance(u, (int, float)) and np.isfinite(u) for u in xlim):
raise TypeError("Both elements in 'xlim' must be finite real numbers (int or float).")
if ylim is not None:
if not isinstance(ylim, list):
raise TypeError("'ylim' must be a list (either empty or containing two real numbers).")
if len(ylim) != 0:
if len(ylim) != 2:
raise TypeError("'ylim' must be empty or a list of exactly two real numbers.")
if not all(isinstance(u, (int, float)) and np.isfinite(u) for u in ylim):
raise TypeError("Both elements in 'ylim' must be finite real numbers (int or float).")
if not isinstance(xlabel, (str)):
raise TypeError("'xlabel' must be a string.")
if not isinstance(ylabel, (str)):
raise TypeError("'ylabel' must be a string.")
if log is not None:
if log not in ("x", "y", "xy"):
raise ValueError("The value of 'log' must be 'x', 'y' or 'xy'.")
if not isinstance(figsize, (list, tuple)) or len(figsize) != 2 or not all(isinstance(u, (int, float)) and np.isfinite(u) for u in figsize):
raise TypeError("'figsize' must be a list or tuple of exactly two finite real numbers.")
xscale = 10**xscale
yscale = 10**yscale
# Aggiunta dell'intercetta (colonna di 1s per il termine costante)
X = sm.add_constant(x) # Aggiunge una colonna di 1s per il termine costante
# Calcolo dei pesi come inverso delle varianze
weights = 1 / y_err**2
# Modello di regressione pesata
if fitmodel == "wls":
model = sm.WLS(y, X, weights=weights) # Weighted Least Squares (OLS con pesi)
elif fitmodel == "ols":
model = sm.OLS(y, X)
else:
raise ValueError('Invalid model. Only "wls" or "ols" allowed.')
results = model.fit()
if verbose:
print(results.summary())
print("\n")
# Parametri stimati
m = float(results.params[1])
c = float(results.params[0])
# Errori standard dei parametri stimati
sigma_m = float(results.bse[1]) # Incertezza sul coefficiente angolare (m)
sigma_c = float(results.bse[0]) # Incertezza sull'intercetta (c)
chi2_value = np.sum(((y - (m * x + c)) / y_err) ** 2)
# Gradi di libertà (DOF)
dof = len(x) - 2
# Chi-quadrato ridotto
chi2_red = chi2_value / dof
# p-value
p_value = chi2.sf(chi2_value, dof)
m2 = my_cov(x, y, weights) / my_var(x, weights)
var_m2 = 1 / ( my_var(x, weights) * np.sum(weights) )
c2 = my_mean(y, weights) - my_mean(x, weights) * m
var_c2 = my_mean(x*x, weights) / ( my_var(x, weights) * np.sum(weights) )
sigma_m2 = var_m2 ** 0.5
sigma_c2 = var_c2 ** 0.5
cov_mc = - my_mean(x, weights) / ( my_var(x, weights) * np.sum(weights) )
# ------------------------
# Applica lo scaling
mean_scaled = m / (10**mscale)
sigma_scaled = sigma_m / (10**mscale)
exponent = int(math.floor(math.log10(abs(sigma_scaled))))
factor = 10**(exponent - 1)
rounded_sigma = round(sigma_scaled / factor) * factor
# 2. Arrotonda mean allo stesso ordine di grandezza di sigma
rounded_mean = round(mean_scaled, -exponent + 1)
# 3. Converte in stringa mantenendo zeri finali
fmt = f".{-exponent + 1}f" if exponent < 1 else "f"
mean_str = f"{rounded_mean:.{max(0, -exponent + 1)}f}"
sigma_str = f"{rounded_sigma:.{max(0, -exponent + 1)}f}"
# Costruzione stringa LaTeX ottimizzata
unit_str = f" \\, \\mathrm{{{m_units}}}" if m_units else ""
scale_str = f" \\times 10^{{{mscale}}}" if mscale != 0 else ""
if unit_str or scale_str:
result1 = f"$m = ({mean_str} \\pm {sigma_str}){scale_str}{unit_str}$"
else:
result1 = f"$m = {mean_str} \\pm {sigma_str}$"
# ------------------------
# Applica lo scaling
mean_scaled = c / (10**cscale)
sigma_scaled = sigma_c / (10**cscale)
exponent = int(math.floor(math.log10(abs(sigma_scaled))))
factor = 10**(exponent - 1)
rounded_sigma = round(sigma_scaled / factor) * factor
# 2. Arrotonda mean allo stesso ordine di grandezza di sigma
rounded_mean = round(mean_scaled, -exponent + 1)
# 3. Converte in stringa mantenendo zeri finali
fmt = f".{-exponent + 1}f" if exponent < 1 else "f"
mean_str = f"{rounded_mean:.{max(0, -exponent + 1)}f}"
sigma_str = f"{rounded_sigma:.{max(0, -exponent + 1)}f}"
# Costruzione stringa LaTeX ottimizzata
unit_str = f" \\, \\mathrm{{{c_units}}}" if c_units else ""
scale_str = f" \\times 10^{{{cscale}}}" if cscale != 0 else ""
if unit_str or scale_str:
result2 = f"$c = ({mean_str} \\pm {sigma_str}){scale_str}{unit_str}$"
else:
result2 = f"$c = {mean_str} \\pm {sigma_str}$"
# ------------------------
# Calcolo dei residui normalizzati
resid = y - (m * x + c)
resid_norm = resid / y_err
k = np.sum((-1 <= resid_norm) & (resid_norm <= 1))
n = k / len(resid_norm)
labels = ["Reduced χ² (χ²/dof)", "p-value", "Residuals within ±2σ"]
values = []
# χ²/dof dinamico
if chi2_red > 100:
values.append("> 100")
elif 10 <= chi2_red <= 100:
values.append(f"{chi2_red:.0f}")
elif 0.01 < chi2_red < 10:
values.append(f"{chi2_red:.2f}")
else:
values.append("< 0.01")
if p_value >= 0.10:
values.append(f"{p_value * 100:.0f}%")
elif 0.005 < p_value < 0.10:
values.append(f"{p_value * 100:.2f}%")
elif 0.0005 < p_value <= 0.005:
values.append(f"{p_value * 100:.3f}%")
else:
values.append("< 0.05%")
# Residui dinamici
if n >= 0.10:
values.append(f"{n*100:.0f}%")
elif 0.005 < n < 0.10:
values.append(f"{n*100:.2f}%")
else:
values.append("Virtually no one")
if summary:
max_label_len = max(len(label) for label in labels)
max_value_len = max(len(value) for value in values)
total_width = max_label_len + max_value_len + 5
# Costruzione righe
report_lines = []
title = "Summary Report"
title_line = title.center(total_width)
separator = "=" * total_width
report_lines.append(separator)
report_lines.append(title_line)
report_lines.append(separator)
for label, value in zip(labels, values):
line = f"{label:<{max_label_len}} : {value:>{max_value_len}}"
report_lines.append(line)
report_lines.append(separator)
report = "\n".join(report_lines)
print(report)
xmin_plot = x.min() - .12 * (x.max() - x.min())
xmax_plot = x.max() + .12 * (x.max() - x.min())
x1 = np.linspace(xmin_plot, xmax_plot, 500)
y1 = my_line(x1, m, c) / yscale
y1_plus_1sigma = y1 + y_estrapolato(x1, m2, c2, sigma_m2, sigma_c2, cov_mc)[1] / yscale
y1_minus_1sigma = y1 - y_estrapolato(x1, m2, c2, sigma_m2, sigma_c2, cov_mc)[1] / yscale
y = y / yscale
x = x / xscale
x1 = x1 / xscale
y_err = y_err / yscale
if x_err is not None:
x_err = x_err / xscale
if showlegend:
label = (
"Best fit\n"
+ result1 + "\n"
+ result2
)
else :
label = "Best fit"
if norm == True:
bar1 = np.repeat(1, len(x))
bar2 = resid_norm
dash = np.repeat(confidence, len(x1))
else :
bar1 = y_err
bar2 = resid / yscale
dash = confidence * y_err
fig = plt.figure(figsize=figsize)
# The following code is adapted from the VoigtFit library,
# originally developed by Jens-Kristian Krogager under the MIT License.
# https://github.com/jkrogager/VoigtFit
# dashed, lw = 1.
# steps-mid, lw = 1.
if residuals:
gs = fig.add_gridspec(2, hspace=0, height_ratios=[0.1, 0.9])
axs = gs.subplots(sharex=True)
#axs = gs.subplots()
# Aggiungi linee di riferimento
axs[0].axhline(0., ls='--', color='0.7', lw=0.8)
axs[0].errorbar(x, bar2, bar1, ls='', color='gray', lw=1.15)
axs[0].plot(x, bar2, color='k', drawstyle='steps-mid', lw=1.15)
if norm == True:
axs[0].plot(x1, dash, ls='dotted', color='crimson', lw=1.6)
axs[0].plot(x1, -dash, ls='dotted', color='crimson', lw=1.6)
else:
axs[0].plot(x, dash, ls='dotted', color='crimson', lw=1.6)
axs[0].plot(x, -dash, ls='dotted', color='crimson', lw=1.6)
axs[0].set_ylim(-np.nanmean(3 * dash / 2), np.nanmean(3 * dash / 2))
# Configurazioni estetiche per il pannello dei residui
axs[0].tick_params(labelbottom=False)
axs[0].set_yticklabels('')
#axs[0].set_xlim(x.min(), x.max())
else:
gs = fig.add_gridspec(2, hspace=0, height_ratios=[0, 1])
axs = gs.subplots(sharex=True)
#axs = gs.subplots()
axs[0].remove() # Rimuovi axs[0], axs[1] rimane valido
axs[1].plot(x1, y1, color="blue", ls="-", linewidth=0.8, label = label)
if confidencerange == True:
axs[1].fill_between(x1, y1_plus_1sigma, y1_minus_1sigma,
where=(y1_plus_1sigma > y1_minus_1sigma), color='blue', alpha=0.3, edgecolor='none', label="Confidence interval")
if x_err is None:
axs[1].errorbar(x, y, yerr=y_err, ls='', marker='.',
color="black", label='Experimental data', capsize=2)
else:
axs[1].errorbar(x, y, yerr=y_err, xerr=x_err, ls='', marker='.',
color="black", label='Experimental data', capsize=2)
axs[1].set_xlabel(xlabel)
axs[1].set_ylabel(ylabel)
if xlim and len(xlim) == 2:
if residuals:
axs[0].set_xlim(xlim)
axs[1].set_xlim(xlim)
else:
if residuals:
axs[0].set_xlim(xmin_plot/xscale, xmax_plot/xscale)
axs[1].set_xlim(xmin_plot/xscale, xmax_plot/xscale)
if ylim and len(ylim) == 2:
axs[1].set_ylim(ylim)
if legendloc == None:
axs[1].legend(frameon = False)
else:
axs[1].legend(loc = legendloc, frameon = False)
if log == "x":
axs[1].set_xscale("log")
if residuals:
axs[0].set_xscale("log")
elif log == "y":
axs[1].set_yscale("log")
elif log == "xy":
axs[1].set_xscale("log")
axs[1].set_yscale("log")
if residuals:
axs[0].set_xscale("log")
apply_inward_ticks(axs[1])
plt.show()
return m, c, sigma_m, sigma_c, chi2_red, p_value
[docs]
def model_fit(x: ArrayLike, y: ArrayLike, f: Callable[[Union[float, ArrayLike]], Union[float, ArrayLike]],
x_err: Optional[ArrayLike] = None, y_err: Optional[ArrayLike] = None, p0: Optional[Union[List[float], ArrayLike]] = None,
xlabel: str = "x [ux]", ylabel: str = "y [uy]", xlim: Optional[Tuple[float, float]] = None, ylim: Optional[Tuple[float, float]] = None,
showlegend: bool = True, legendloc: Optional[str] = None, bounds: Optional[Tuple[ArrayLike, ArrayLike]] = None,
confidencerange: bool = True, log: Optional[str] = None, maxfev: int = 5000,
xscale: int = 0, yscale: int = 0, confidence: int = 2, residuals: bool = True, norm: bool = True,
verbose: bool = True, print_parameters: bool = True, figsize: Tuple[float, float] = DEFAULT_FIGSIZE) -> Tuple[ArrayLike, ArrayLike, float, float]:
"""
General-purpose fit of multi-parameter functions.
Parameters
----------
x : array-like
Measured values of the independent variable.
y : array-like
Measured values of the dependent variable.
f : function
Function of one variable (first argument of `f`) with `N` free parameters.
x_err : array-like, optional
Uncertainties associated with the independent variable. Default is `None`.
y_err : array-like, optional
Uncertainties associated with the dependent variable. Default is `None`.
p0 : list or array-like of floats, optional
Initial guess for the model parameters, in the form `[a, ..., z]`. Default is `None`.
xlabel : str, optional
Label (and units) for the independent variable.
ylabel : str, optional
Label (and units) for the dependent variable.
xlim : tuple, optional
Limits for the x-axis, in the form `(xmin, xmax)`. The values should
already be scaled with respect to `xscale`. If `None` or an empty tuple,
the default limits will be automatically determined from the data.
ylim : tuple, optional
Limits for the y-axis, in the form `(ymin, ymax)`. The values should
already be scaled with respect to `yscale`. If `None` or an empty tuple,
the default limits will be automatically determined from the data.
showlegend : bool, optional
If `True`, displays a legend with the reduced chi-square and p-value in the plot.
legendloc : str, optional
Location of the legend in the plot ('upper right', 'lower left', 'upper center', etc.). Default is `None`.
bounds : 2-tuple of array-like, optional
Tuple `([lower_bounds], [upper_bounds])` specifying bounds for the parameters. Default is `None`.
confidencerange : bool, optional
If `True`, displays the 1σ uncertainty band around the best-fit curve.
log : str, optional
If set to `'x'` or `'y'`, the corresponding axis is plotted on a logarithmic scale; if `'xy'`, both axes. Default is `None`.
maxfev : int, optional
Maximum number of iterations allowed by `curve_fit`.
xscale : int, optional
Scaling factor for the x-axis (e.g., `xscale = -2` corresponds to 1e-2, to convert meters to centimeters).
yscale : int, optional
Scaling factor for the y-axis.
confidence : int, optional
Residual confidence interval to display, i.e., `[-confidence, +confidence]`.
residuals : bool, optional
If `True`, adds an upper panel showing fit residuals.
norm : bool, optional
If `True`, residuals in the upper panel will be normalized.
verbose : bool, optional
If `True`, prints a formatted summary report of the fit, including
reduced chi-square, p-value, and percentage of residuals within ±2σ.
print_parameters : bool, optional
If `True`, prints the best-fit parameters along with their standard uncertainties.
figsize : tuple of float, optional
Figure size passed to `matplotlib`. Default is `(6.4 * 1.2, 4.8 * 1.2)`.
Returns
----------
popt : array-like
Array of optimal parameters estimated from the fit.
perr : array-like
Uncertainties on the optimal parameters (only if `y_err` is provided).
chi2_red : float
Reduced chi-square value (χ²/dof).
p_value : float
Fit p-value.
Notes
----------
The values of `xscale` and `yscale` affect only the axis scaling in the plot and have no impact on the fitting computation itself.
All model parameters are estimated using the original input data as provided.
"""
from scipy.optimize import curve_fit
import math as math
x = np.asarray(x)
y = np.asarray(y)
if y_err is not None:
y_err = np.asarray(y_err)
if (not (np.issubdtype(y_err.dtype, np.floating) or np.issubdtype(y_err.dtype, np.integer))) or not np.all(np.isreal(y_err)):
raise TypeError("'y_err' must contain only real numbers (int or float).")
if not np.all(np.isfinite(y_err)):
raise ValueError("'y_err' contains non-finite values (NaN or inf).")
if x_err is not None:
x_err = np.asarray(x_err)
if (not (np.issubdtype(x_err.dtype, np.floating) or np.issubdtype(x_err.dtype, np.integer))) or not np.all(np.isreal(x_err)):
raise TypeError("'x_err' must contain only real numbers (int or float).")
if not np.all(np.isfinite(x_err)):
raise ValueError("'x_err' contains non-finite values (NaN or inf).")
if x_err is not None and y_err is not None:
if not (len(x) == len(y) == len(y_err) == len(x_err)):
raise ValueError("'x', 'y', 'x_err' and 'y_err' must have the same length.")
elif x_err is not None and y_err is None:
if not (len(x) == len(y) == len(x_err)):
raise ValueError("'x', 'y' and 'x_err' must have the same length.")
elif x_err is None and y_err is not None:
if not (len(x) == len(y) == len(y_err)):
raise ValueError("'x', 'y' and 'y_err' must have the same length.")
else:
if not (len(x) == len(y)):
raise ValueError("'x' and 'y' must have the same length.")
if (not (np.issubdtype(x.dtype, np.floating) or np.issubdtype(x.dtype, np.integer))) or not np.all(np.isreal(x)):
raise TypeError("'x' must contain only real numbers (int or float).")
if (not (np.issubdtype(y.dtype, np.floating) or np.issubdtype(y.dtype, np.integer))) or not np.all(np.isreal(y)):
raise TypeError("'y' must contain only real numbers (int or float).")
if not np.all(np.isfinite(x)):
raise ValueError("'x' contains non-finite values (NaN or inf).")
if not np.all(np.isfinite(y)):
raise ValueError("'y' contains non-finite values (NaN or inf).")
if not isinstance(xscale, (int, float)):
raise TypeError("'xscale' must be a real number (int or float).")
if not isinstance(yscale, (int, float)):
raise TypeError("'yscale' must be a real number (int or float).")
if xlim is not None:
if not isinstance(xlim, list):
raise TypeError("'xlim' must be a list (either empty or containing two real numbers).")
if len(xlim) != 0:
if len(xlim) != 2:
raise TypeError("'xlim' must be empty or a list of exactly two real numbers.")
if not all(isinstance(u, (int, float)) and np.isfinite(u) for u in xlim):
raise TypeError("Both elements in 'xlim' must be finite real numbers (int or float).")
if ylim is not None:
if not isinstance(ylim, list):
raise TypeError("'ylim' must be a list (either empty or containing two real numbers).")
if len(ylim) != 0:
if len(ylim) != 2:
raise TypeError("'ylim' must be empty or a list of exactly two real numbers.")
if not all(isinstance(u, (int, float)) and np.isfinite(u) for u in ylim):
raise TypeError("Both elements in 'ylim' must be finite real numbers (int or float).")
if not isinstance(xlabel, (str)):
raise TypeError("'xlabel' must be a string.")
if not isinstance(ylabel, (str)):
raise TypeError("'ylabel' must be a string.")
p0 = np.array(p0)
if (not (np.issubdtype(p0.dtype, np.floating) or np.issubdtype(p0.dtype, np.integer))) or not np.all(np.isreal(p0)):
raise TypeError("'p0' must contain only real numbers (int or float).")
if not np.all(np.isfinite(p0)):
raise ValueError("'p0' contains non-finite values (NaN or inf).")
if log is not None:
if log not in ("x", "y", "xy"):
raise ValueError("The value of 'log' must be 'x', 'y' or 'xy'.")
if not isinstance(figsize, (list, tuple)) or len(figsize) != 2 or not all(isinstance(u, (int, float)) and np.isfinite(u) for u in figsize):
raise TypeError("'figsize' must be a list or tuple of exactly two finite real numbers.")
xscale = 10**xscale
yscale = 10**yscale
if bounds is None and p0 is not None:
bounds = (np.repeat(-np.inf, len(p0)), np.repeat(np.inf, len(p0)))
elif bounds is None and p0 is None:
bounds = (-np.inf, np.inf)
popt, pcov = curve_fit(
f,
x,
y,
p0=p0,
sigma=y_err,
absolute_sigma=True,
maxfev=maxfev,
bounds=bounds
)
perr = np.sqrt(np.diag(pcov))
# Calcolo del chi-quadrato
y_fit = f(x, *popt)
if print_parameters:
if y_err is not None:
# Stampa dei parametri con incertezze
for i in range(len(popt)):
# Calcola l'esponente di sigma
exponent = int(math.floor(math.log10(abs(perr[i]))))
factor = 10**(exponent - 1)
rounded_sigma = (round(perr[i] / factor) * factor)
# Arrotonda la media
rounded_mean = round(popt[i], -exponent + 1)
# Converte in stringa mantenendo zeri finali
fmt = f".{-exponent + 1}f" if exponent < 1 else "f"
if popt[i] != 0:
nu = perr[i] / popt[i]
print(
f"Parameter {i + 1} = ({rounded_mean:.{max(0, -exponent + 1)}f} ± {rounded_sigma:.{max(0, -exponent + 1)}f}) [{np.abs(nu) * 100:.2f}%]"
)
else:
print(f"Parameter {i + 1} = ({rounded_mean:.{max(0, -exponent + 1)}f} ± {rounded_sigma:.{max(0, -exponent + 1)}f})")
else:
print(f"Parameter {i + 1} = {popt[i]:.2g}")
print()
# k = np.sum((-1 <= resid_norm) & (resid_norm <= 1))
if y_err is not None:
from scipy.stats import chi2
resid = y - y_fit
resid_norm = resid / y_err
chi2_value = np.sum((resid_norm) ** 2)
dof = len(x) - len(popt)
chi2_red = chi2_value / dof
p_value = chi2.sf(chi2_value, dof)
n = np.sum((-1 <= resid_norm) & (resid_norm <= 1)) / len(resid_norm)
labels = ["Reduced χ² (χ²/dof)", "p-value", "Residuals within ±2σ"]
values = []
if chi2_red > 100:
values.append("> 100")
_chi2_str = f"> 100"
elif 10 <= chi2_red <= 100:
values.append(f"{chi2_red:.0f}")
_chi2_str = f"= {chi2_red:.0f}"
elif 0.01 < chi2_red < 10:
values.append(f"{chi2_red:.2f}")
_chi2_str = f"= {chi2_red:.2f}"
else:
values.append("< 0.01")
_chi2_str = f"< 0.01"
if p_value >= 0.10:
values.append(f"{p_value * 100:.0f}%")
pval_str = f"p-value = {p_value*100:.0f}%"
elif 0.005 < p_value < 0.10:
values.append(f"{p_value * 100:.2f}%")
pval_str = f"p-value = {p_value * 100:.2f}%"
elif 0.0005 < p_value <= 0.005:
values.append(f"{p_value * 100:.3f}%")
pval_str = f"p-value = {p_value * 100:.3f}%"
else:
values.append("< 0.05%")
pval_str = f"p-value < 0.05%"
if n >= 0.10:
values.append(f"{n*100:.0f}%")
elif 0.005 < n < 0.10:
values.append(f"{n*100:.2f}%")
else:
values.append("Virtually no one")
if verbose:
max_label_len = max(len(label) for label in labels)
max_value_len = max(len(value) for value in values)
total_width = max_label_len + max_value_len + 5
# Costruzione righe
report_lines = []
title = "Summary Report"
title_line = title.center(total_width)
separator = "=" * total_width
report_lines.append(separator)
report_lines.append(title_line)
report_lines.append(separator)
for label, value in zip(labels, values):
line = f"{label:<{max_label_len}} : {value:>{max_value_len}}"
report_lines.append(line)
report_lines.append(separator)
# Unione e stampa
report = "\n".join(report_lines)
print(report)
xmin_plot = x.min() - .12 * (x.max() - x.min())
xmax_plot = x.max() + .12 * (x.max() - x.min())
x1 = np.linspace(xmin_plot, xmax_plot, 500)
y_fit_cont = f(x1, *popt)
if y_err is not None:
if norm is True:
bar1 = np.repeat(1, len(x))
bar2 = resid_norm
dash = np.repeat(confidence, len(x1))
else :
bar1 = y_err
bar2 = resid / yscale
dash = confidence * y_err
# Ripeti ciascun parametro per len(x1) volte
parametri_ripetuti = [np.repeat(p, len(x1)) for p in popt]
errori_ripetuti = [np.repeat(e, len(x1)) for e in perr]
# Costruisci lista dei valori e delle incertezze
lista = [x1] + parametri_ripetuti
lista_err = [np.repeat(0, len(x1))] + errori_ripetuti
from .stats import propagate
# the `uncertainty_class` package, available at https://github.com/yiorgoskost/Uncertainty-Propagation/tree/master,
# provides functionality for uncertainty propagation in calculations (labtoolbox.stats.propagate).
_, _ , confid = propagate(f, lista, lista_err)
y1_plus_1sigma = confid[1] / yscale
y1_minus_1sigma = confid[0] / yscale
y_err = y_err / yscale
x1 = x1 / xscale
xmax_plot = xmax_plot / xscale
xmin_plot = xmin_plot / xscale
x = x / xscale
y = y / yscale
y_fit_cont = y_fit_cont / yscale
y_fit = y_fit / yscale
if x_err is not None:
x_err = x_err / xscale
fig = plt.figure(figsize=figsize)
# The following code is adapted from the VoigtFit library,
# originally developed by Jens-Kristian Krogager under the MIT License.
# https://github.com/jkrogager/VoigtFit
if residuals is True and y_err is not None:
gs = fig.add_gridspec(2, hspace=0, height_ratios=[0.1, 0.9])
axs = gs.subplots(sharex=True)
axs[0].axhline(0., ls='--', color='0.7', lw=0.8)
axs[0].errorbar(x, bar2, bar1, ls='', color='gray', lw=1.15)
axs[0].plot(x, bar2, color='k', drawstyle='steps-mid', lw=1.15)
if norm == True:
axs[0].plot(x1, dash, ls='dotted', color='crimson', lw=1.6)
axs[0].plot(x1, -dash, ls='dotted', color='crimson', lw=1.6)
else:
axs[0].plot(x, dash, ls='dotted', color='crimson', lw=1.6)
axs[0].plot(x, -dash, ls='dotted', color='crimson', lw=1.6)
axs[0].set_ylim(-np.nanmean(3 * dash / 2), np.nanmean(3 * dash / 2))
axs[0].tick_params(labelbottom=False)
axs[0].set_yticklabels('')
else:
gs = fig.add_gridspec(2, hspace=0, height_ratios=[0, 1])
axs = gs.subplots(sharex=True)
axs[0].remove()
if showlegend and y_err is not None:
label = f"Best fit\n$\\chi^2/\\text{{dof}}{_chi2_str}$\n{pval_str}"
else :
label = "Best fit"
axs[1].plot(x1, y_fit_cont, color="blue", ls="-", linewidth=0.8, label = label)
if confidencerange is True and y_err is not None:
axs[1].fill_between(x1, y1_plus_1sigma, y1_minus_1sigma,
where=(y1_plus_1sigma > y1_minus_1sigma), color='blue', alpha=0.3, edgecolor='none', label="Confidence interval")
if y_err is not None:
if x_err is None:
axs[1].errorbar(x, y, yerr=y_err, ls='', marker='.',
color="black", label='Experimental data', capsize=2)
else:
axs[1].errorbar(x, y, yerr=y_err, xerr=x_err, ls='', marker='.',
color="black", label='Experimental data', capsize=2)
else:
axs[1].plot(x, y, ls='', marker='.',
color="black", label='Experimental data', capsize=2)
axs[1].set_xlabel(xlabel)
axs[1].set_ylabel(ylabel)
if xlim and len(xlim) == 2:
if residuals:
axs[0].set_xlim(xlim)
axs[1].set_xlim(xlim)
else:
if residuals:
axs[0].set_xlim(xmin_plot, xmax_plot)
axs[1].set_xlim(xmin_plot, xmax_plot)
if ylim and len(ylim) == 2:
axs[1].set_ylim(ylim)
if legendloc is None:
axs[1].legend(frameon = False)
else:
axs[1].legend(loc = legendloc, frameon = False)
if log == "x":
axs[1].set_xscale("log")
if residuals:
axs[0].set_xscale("log")
elif log == "y":
axs[1].set_yscale("log")
elif log == "xy":
axs[1].set_xscale("log")
axs[1].set_yscale("log")
if residuals:
axs[0].set_xscale("log")
apply_inward_ticks(axs[1])
plt.show()
if y_err is not None:
return popt, perr, chi2_red, p_value
else:
return popt, chi2_red, p_value