Source code for pythonsi.feature_selection.lasso

from sklearn.linear_model import Lasso
import numpy as np
import numpy.typing as npt
from pythonsi.node import Data
from typing import Tuple
from pythonsi.util import solve_linear_inequalities, intersect


[docs] class LassoFeatureSelection: r"""LASSO feature selection with selective inference support. This class implements LASSO feature selection with the capability to perform selective inference on the selected features. The LASSO optimization problem is: .. math:: \hat{\boldsymbol{\beta}} = \mathop{\arg \min}_{\boldsymbol{\beta}} \quad \frac{1}{2} \|\mathbf{y} - \mathbf{x}\boldsymbol{\beta}\|_2^2 + \lambda \|\boldsymbol{\beta}\|_1 where :math:`\lambda` is the regularization parameter that controls sparsity. Parameters ---------- lambda_ : float, optional Regularization parameter for LASSO, default 10 Attributes ---------- x_node : Data or None Input feature matrix node y_node : Data or None Input response vector node lambda_ : float Regularization parameter active_set_node : Data Output node containing selected feature indices self.interval : list or None Feasible interval from last inference call self.active_set_data : array-like or None Active set from last inference call """ def __init__(self, lambda_: float = 10): # Input for Lasso regression self.x_node = None self.y_node = None self.lambda_ = lambda_ # Output for Lasso regression self.active_set_node = Data(self) self.interval = None self.active_set_data = None def __call__(self) -> npt.NDArray[np.floating]: r"""Execute LASSO feature selection on stored data. Returns ------- active_set : array-like, shape (k,) Indices of selected features """ x = self.x_node() y = self.y_node() active_set, _, _ = self.forward(x=x, y=y) self.active_set_node.update(active_set) return active_set
[docs] def run(self, x: Data, y: Data) -> Data: r"""Configure LASSO with input data and return active set node. Parameters ---------- x : array-like, shape (n, p) Feature matrix y : array-like, shape (n, 1) Response vector Returns ------- active_set_node : Data Node containing selected feature indices """ self.x_node = x self.y_node = y return self.active_set_node
[docs] def forward( self, x: npt.NDArray[np.floating], y: npt.NDArray[np.floating] ) -> Tuple[ npt.NDArray[np.floating], npt.NDArray[np.floating], npt.NDArray[np.floating] ]: r"""Solve LASSO optimization and extract active set information. Solves the LASSO problem and returns the active set (selected features), inactive set, and signs of active coefficients. Parameters ---------- x : array-like, shape (n, p) Feature matrix y : array-like, shape (n, 1) Response vector Returns ------- active_set : array-like, shape (k,) Indices of selected features inactive_set : array-like, shape (p-k,) Indices of unselected features sign_active : array-like, shape (k, 1) Signs of coefficients for active features """ num_of_dimension = x.shape[1] lasso = Lasso( alpha=self.lambda_ / x.shape[0], fit_intercept=False, tol=1e-10, max_iter=100000000, ) lasso.fit(x, y) coefficients = lasso.coef_.reshape(num_of_dimension, 1) active_set = np.nonzero(coefficients)[0] inactive_set = np.setdiff1d(np.arange(num_of_dimension), active_set) sign_active = np.sign(coefficients[active_set]).reshape(-1, 1) # # Uncomment this to checkKKT for Lasso # self.checkKKT_Lasso(x, y, coefficients, self.lambda_) return active_set, inactive_set, sign_active
[docs] def inference(self, z: float) -> Tuple[list, npt.NDArray[np.floating]]: r"""Find feasible interval of the Lasso Feature Selection for the parametrized data at z. Parameters ---------- z : float Inference parameter value Returns ------- final_interval : list Feasible interval [lower, upper] for z """ if self.interval is not None and self.interval[0] <= z <= self.interval[1]: self.active_set_node.parametrize(data=self.active_set_data) return self.interval x, _, _, interval_x = self.x_node.inference(z) y, a, b, interval_y = self.y_node.inference(z) active_set, inactive_set, sign_active = self.forward(x, y) inactive_set = np.setdiff1d(np.arange(x.shape[1]), active_set) self.active_set_node.parametrize(data=active_set) # x_a: x with active features x_a = x[:, active_set] # x_i: x with inactive features x_i = x[:, inactive_set] x_a_plus = np.linalg.inv(x_a.T.dot(x_a)).dot(x_a.T) x_aT_plus = x_a.dot(np.linalg.inv(x_a.T.dot(x_a))) temp = x_i.T.dot(x_aT_plus).dot(sign_active) # A + Bz <= 0 (elemen-wise) A0 = self.lambda_ * sign_active * np.linalg.inv(x_a.T.dot(x_a)).dot( sign_active ) - sign_active * x_a_plus.dot(a) B0 = -1 * sign_active * x_a_plus.dot(b) temperal_variable = x_i.T.dot(np.identity(x.shape[0]) - x_a.dot(x_a_plus)) A10 = -( np.ones((temp.shape[0], 1)) - temp - (temperal_variable.dot(a)) / self.lambda_ ) B10 = (temperal_variable.dot(b)) / self.lambda_ A11 = -( np.ones((temp.shape[0], 1)) + temp + (temperal_variable.dot(a)) / self.lambda_ ) B11 = -(temperal_variable.dot(b)) / self.lambda_ solve_linear_inequalities(A0, B0) solve_linear_inequalities(A10, B10) solve_linear_inequalities(A11, B11) A = np.vstack((A0, A10, A11)) B = np.vstack((B0, B10, B11)) final_interval = intersect(interval_x, interval_y) final_interval = intersect(final_interval, solve_linear_inequalities(A, B)) self.active_set_node.parametrize(data=active_set) self.interval = final_interval self.active_set_data = active_set return final_interval
[docs] def checkKKT_Lasso(self, x, Y, beta_hat, Lambda, tol=1e-10): r"""Validate KKT conditions for LASSO solution. Checks that the computed LASSO solution satisfies the Karush-Kuhn-Tucker optimality conditions: - **Active features**: :math:`\mathbf{x}_j^T(\mathbf{y} - \mathbf{x}\hat{\boldsymbol{\beta}}) = \lambda \cdot \text{sign}(\hat{\beta}_j)` for :math:`\hat{\beta}_j \neq 0` - **Inactive features**: :math:`|\mathbf{x}_j^T(\mathbf{y} - \mathbf{x}\hat{\boldsymbol{\beta}})| \leq \lambda` for :math:`\hat{\beta}_j = 0` Parameters ---------- x : array-like, shape (n, d) Design matrix Y : array-like, shape (n, 1) Response vector beta_hat : array-like, shape (d, 1) Estimated LASSO coefficients Lambda : float Regularization parameter tol : float, optional Numerical tolerance for checks, default 1e-10 Raises ------ AssertionError If any KKT condition is violated Notes ----- This is a helper function for debugging and validation. Prints detailed information about each constraint. """ # Residuals print(x.shape, Y.shape, beta_hat.shape) r = Y - x @ beta_hat # (n,1) # Gradient = x^T (Y - xβ) grad = x.T @ r # (d,1) print("--------------- KKT Conditions for Lasso ---------------") n_active_ok, n_inactive_ok, n_viol = 0, 0, 0 for j in range(beta_hat.shape[0]): if abs(beta_hat[j]) > tol: # Active set cond = np.isclose( grad[j, 0], Lambda * np.sign(beta_hat[j, 0]), atol=tol ) if cond: print( f"[Active] j={j:2d}, β={beta_hat[j, 0]:.4f}, grad={grad[j, 0]:.4f} ✅ OK" ) n_active_ok += 1 else: print( f"[Active] j={j:2d}, β={beta_hat[j, 0]:.4f}, grad={grad[j, 0]:.4f} ❌ VIOLATION" ) n_viol += 1 assert cond, f"KKT violation at active index {j}" else: # Inactive set cond = -Lambda - tol <= grad[j, 0] <= Lambda + tol if cond: print( f"[Inactive] j={j:2d}, β={beta_hat[j, 0]:.4f}, grad={grad[j, 0]:.4f} ✅ OK" ) n_inactive_ok += 1 else: print( f"[Inactive] j={j:2d}, β={beta_hat[j, 0]:.4f}, grad={grad[j, 0]:.4f} ❌ VIOLATION" ) n_viol += 1 assert cond, f"KKT violation at inactive index {j}" print("---------------------------------------------------------") print( f"Summary: {n_active_ok} active OK, {n_inactive_ok} inactive OK, {n_viol} violations" ) print("---------------------------------------------------------") assert n_viol == 0, f"{n_viol} KKT conditions violated!" print("✅ All KKT conditions satisfied.")