Source code for pipeline.code.dintegrate.utils

import math
import anndata
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy import stats


def dataframe_mean(dataframe: pd.DataFrame, region_: list) -> pd.DataFrame:
    """
    Calculate mean expression per region and filter for specified regions.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Input expression matrix with samples as row indices and genes as columns.

    region_ : list of str
        List of brain region names (index values) to retain in the final output.

    Returns
    -------
    pd.DataFrame
        A DataFrame with mean expression values for the specified regions.
        Rows are brain regions, columns are genes.
    """

    return (
        dataframe
        .groupby(dataframe.index)
        .mean()
        .dropna(axis=1)
        .T[region_]
        .T
        .reset_index()
        .set_index('index', drop=True)
    )


[docs]def stable_gene_filter(AHBA_mean: pd.DataFrame,sn_ex: pd.DataFrame, region_: list, common_genes: list) -> pd.DataFrame: """ Identifies genes with consistent spatial patterns between AHBA and snRNA-seq data. Parameters ---------- AHBA_mean : pd.DataFrame Regional mean expression matrix from AHBA data. (regions x genes) sn_ex : pd.DataFrame Processed snRNA-seq expression (smooth_cells x genes) region_ : list of str List of brain region names to include in the analysis. common_genes : list of str List of gene names present in both AHBA and snRNA-seq datasets. Returns ------- pd.DataFrame DataFrame with genes and their cross-modality spatial correlation scores """ AHBA_mean = AHBA_mean[common_genes] sn_ex = sn_ex[common_genes] AHBA_mean = AHBA_mean.T[region_].T sn_mean = dataframe_mean(sn_ex, region_) correlations = [] for gene in tqdm(common_genes, desc="Processing genes..."): r, _ = stats.spearmanr( AHBA_mean[gene].values, sn_mean[gene].values ) if not math.isnan(r): correlations.append(r) else: common_genes = [i for i in common_genes if i!=gene] return pd.DataFrame({'correlation': correlations},index=common_genes)
[docs]def calculate_thresholds(ahba_mean: pd.DataFrame,sn_data: pd.DataFrame,align_csv: pd.DataFrame,percentile: float = 0.1) -> dict: """ Calculate regional correlation thresholds between AHBA and single-nucleus data. Parameters ---------- ahba_mean : pd.DataFrame AHBA mean expression matrix (regions x genes), where each row corresponds to a brain region and each column to a gene. sn_data : pd.DataFrame Single-nucleus expression data (smooth_cells x genes), with cell-level expression profiles aligned to brain regions. align_csv : pd.DataFrame Alignment table containing two columns: 'brain_region' for AHBA region names and 'sn_region' for corresponding snRNA-seq region labels. percentile : float, optional Top percentile used to define the correlation threshold (default is 0.1, i.e., top 10%). Returns ------- dict A dictionary mapping each AHBA region (str) to a tuple (correlation, p-value), representing the correlation threshold at the given percentile. """ threshold_dict = {} for _, row in tqdm(align_csv.iterrows(), total=len(align_csv), desc="Processing regions..."): allen_region = row['brain_region'] sc_regions = row['sn_region'].split('\\') # Skip if region not in AHBA data if allen_region not in ahba_mean.index: continue ahba_expr = ahba_mean.loc[allen_region].values sn_expr = sn_data[sn_data.index.isin(sc_regions)].values correlations = [ stats.pearsonr(ahba_expr, cell_expr) for cell_expr in sn_expr ] sorted_corrs = sorted(correlations, key=lambda x: x[0], reverse=True) try: threshold_idx = int(len(sorted_corrs) * percentile) threshold_dict[allen_region] = sorted_corrs[threshold_idx] except IndexError: continue return threshold_dict