Source code for pythonsi.test_statistics.tl_hdr

import numpy as np
import numpy.typing as npt
from typing import Tuple, List
from scipy.linalg import block_diag



[docs]
class TLHDRTestStatistic:
    r"""
    Test statistic for selection inference in high-dimensional regression
    after transfer learning with multiple source domains.

    This class computes test statistics for testing individual features
    after feature selection via a transfer learning procedure, implementing
    the post-selection inference framework for high-dimensional regression.

    The test statistic is designed for testing:

    .. math::
        H_0: \beta_j = 0 \quad \text{vs} \quad H_1: \beta_j \neq 0,

    where :math:`\beta_j` is the coefficient of feature :math:`j` in the
    target domain after transfer learning and feature selection.

    Parameters
    ----------
    XS_list : array-like, shape (K, nS, p)
        A 3D numpy array containing **source domain design matrices**.
        - ``K``: number of source domains
        - ``nS``: sample size per source domain
        - ``p``: number of features (shared across domains)

        The array is structured such that ``XS_list[k]`` corresponds to
        the design matrix of the :math:`k`-th source domain, with shape
        ``(nS, p)``.

    YS_list : array-like, shape (K * nS, 1)
        A 2D numpy array containing the **source domain response vectors**
        stacked vertically across all ``K`` source domains.
        - The first ``nS`` rows correspond to the first source domain,
          the next ``nS`` to the second, and so on.

    X0 : array-like, shape (nT, p)
        Target domain design matrix.
        - ``nT``: number of samples in the target domain
        - ``p``: number of features (same as in source domains)

    Y0 : array-like, shape (nT, 1)
        Target domain response vector.

    Attributes
    ----------
    XS_list_node : Data
        Node containing the collection of source domain design matrices.
    YS_list_node : Data
        Node containing the stacked source domain response vector.
    X0_node : Data
        Node containing the target domain design matrix.
    Y0_node : Data
        Node containing the target domain response vector.

    Notes
    -----
    The test statistic accounts for the transfer learning step by focusing
    the inference on the target domain while leveraging information from
    multiple source domains. This allows for valid inference on features selected after the transfer
    learning process.
    """

    def __init__(
        self,
        XS_list: npt.NDArray[np.floating],
        YS_list: npt.NDArray[np.floating],
        X0: npt.NDArray[np.floating],
        Y0: npt.NDArray[np.floating],
    ):
        self.XS_list_node = XS_list
        self.YS_list_node = YS_list
        self.X0_node = X0
        self.Y0_node = Y0

    def __call__(
        self,
        active_set: npt.NDArray[np.int64],
        feature_id: int,
        Sigmas: List[npt.NDArray[np.floating]],
    ) -> Tuple[
        list, npt.NDArray[np.floating], npt.NDArray[np.floating], float, float, list
    ]:
        r"""Compute test statistic for a selected feature after transfer learning.

        Computes the test statistic and parametrization for testing
        whether a specific feature in the active set has a non-zero
        coefficient in the target domain after feature selection via transfer learning.

        The test statistic focuses on the target domain:

        .. math::
            \tau_j = \bm{\eta}_j^\top
            \begin{pmatrix} \bm{Y}^S \\ \bm{Y}^{(0)} \end{pmatrix},

        where:

        - :math:`\bm{Y}^S` is the stacked response vector from all
        source domains,
        - :math:`\bm{Y}^{(0)}` is the target domain response vector,
        - :math:`\bm{\eta}_j` is the direction vector associated with testing the
        coefficient of feature :math:`j` in the target domain active set.

        The direction vector is constructed as:

        .. math::
            \bm{\eta}_j = \begin{pmatrix}
            \bm{0}_{K n_S} \\
            X^{(0)}_\mathcal{M} \left({X^{(0)}_\mathcal{M}}^\top X^{(0)}_\mathcal{M}\right)^{-1} \bm{e}_j
            \end{pmatrix},

        where:
        - :math:`K` is the number of source domains,
        - :math:`n_S` is the number of samples per source domain,
        - :math:`X^{(0)}_\mathcal{M}` is the target domain design matrix restricted to the active set,
        - :math:`\bm{e}_j` is the canonical basis vector selecting feature :math:`j` within the active set.

        Parameters
        ----------
        active_set : array-like, shape (k,)
            Indices of features in the active set
        feature_id : int
            Index of the feature to test (within active_set)
        Sigmas : list of array-like
            List containing covariance matrices for both source and target
            domains:
            - ``Sigmas[0]``: list of source covariance matrices,
            - ``Sigmas[1]``: target domain covariance matrix.

        Returns
        -------
        test_statistic_direction : array-like, shape (K * nS + nT, 1)
            The direction vector :math:`\bm{\eta}_j` for the test statistic.
        a : array-like, shape (K * nS + nT, 1)
            Parametrization intercept for combined data
        b : array-like, shape (K * nS + nT, 1)
            Parametrization coefficient for combined data
        test_statistic : float
            Observed value of the test statistic
        variance : float
            Variance of the test statistic under null hypothesis

        deviation : float
            Standard deviation of the test statistic
        """
        XS_list = self.XS_list_node()
        YS_list = self.YS_list_node()
        X0 = self.X0_node()
        Y0 = self.Y0_node()

        K = len(XS_list)
        nS = XS_list[0].shape[0]
        n_sources = K * nS
        nT = Y0.shape[0]
        N = nS * K + nT
        Y = np.concatenate((YS_list, Y0))

        SigmaS_list = Sigmas[0]
        Sigma0 = Sigmas[1]
        Sigma = block_diag(*SigmaS_list, Sigma0)

        X0_active = X0[:, active_set]
        ej = np.zeros((len(active_set), 1))
        ej[feature_id, 0] = 1

        inv = np.linalg.pinv(X0_active.T @ X0_active)
        X0_active_inv = X0_active @ inv

        test_statistic_direction = np.vstack(
            (
                np.zeros((n_sources, 1)),
                X0_active_inv @ ej,
            )
        )
        e1 = test_statistic_direction.T @ Sigma @ test_statistic_direction
        b = (Sigma @ test_statistic_direction) / e1

        e2 = np.eye(N) - b @ test_statistic_direction.T
        a = e2 @ Y

        test_statistic = (test_statistic_direction.T @ Y)[0, 0]
        variance = (test_statistic_direction.T @ Sigma @ test_statistic_direction)[0, 0]
        deviation = np.sqrt(variance)

        self.XS_list_node.parametrize(data=XS_list)
        self.YS_list_node.parametrize(a=a[:n_sources, :], b=b[:n_sources, :])
        self.X0_node.parametrize(data=X0)
        self.Y0_node.parametrize(a=a[n_sources:, :], b=b[n_sources:, :])

        return (
            test_statistic_direction,
            a,
            b,
            test_statistic,
            variance,
            deviation,
            [-np.inf, np.inf],
        )