Source code for pipeline.code.dpreprocess.ahba_

import warnings
# NOTE: called without a category or module filter, so every warning raised
# after this point is suppressed for the whole process, not only warnings
# triggered by the imports below.
warnings.filterwarnings('ignore')
import abagen
import pandas as pd
from nilearn import image

import os
import itertools
import anndata
import scanpy as sc

# Directory holding the AHBA microarray data; handed to abagen as `data_dir`.
# NOTE(review): this is a relative path, so it presumably resolves against the
# current working directory rather than this file's location - confirm callers
# run from the expected directory.
micro_path='../../datasets/microarray' # AHBA microarray

def _relabel_regions(expr_data, atlas_dict):
    """Return a copy of `expr_data` whose row index holds anatomical names."""
    labels = expr_data.index
    # A MultiIndex is reduced to its first level, which is treated as the atlas
    # label. NOTE(review): presumably this is the sample-level output produced
    # when region_agg is None - confirm against the abagen version in use.
    if isinstance(labels, pd.MultiIndex):
        labels = labels.get_level_values(0)

    # Labels absent from the atlas table map to NaN and are reported as 'other'.
    names = labels.map(atlas_dict).fillna('other')
    return expr_data.set_index(names.values)


def extract_AHBA_data(atlas_path: str, atlas_info_path: str, lr_mirror='bidirectional', gene_norm='srs', sample_norm='srs', return_report=True,
                      return_counts=True, return_donors=True, norm_matched=False, ibf_threshold=0,
                      region_agg=None) -> "dict | pd.DataFrame | tuple":
    """
    Extracts Allen Human Brain Atlas (AHBA) gene expression data based on a given brain atlas.
    For more information, please ref https://abagen.readthedocs.io/en/stable/generated/abagen.get_expression_data.html

    The atlas is passed to ``abagen.get_expression_data`` and the row index of
    every returned expression DataFrame is replaced by the anatomical names
    listed in the atlas information table. Labels missing from that table
    (and label 0, unless the table defines it) are named ``'other'``.

    Parameters
    ----------
    atlas_path : str
        Path to the human brain atlas NIfTI file.
    atlas_info_path : str
        Path to a CSV file containing atlas region information with columns:
        'Anatomical Name' and 'Atlas Index'.
    lr_mirror : {'left', 'right', 'bidirectional'}, optional
        How to mirror samples across hemispheres. Default is 'bidirectional'.
    gene_norm : str, optional
        Method by which to normalize microarray expression values for each donor. Default is 'srs'.
    sample_norm : str, optional
        Method by which to normalize microarray expression values for each sample. Default is 'srs'.
    return_report : bool, optional
        Whether to also return a string containing longform text describing the processing
        procedures used to generate the expression data. Default is True.
    return_counts : bool, optional
        Whether to also return a dataframe containing information on how many samples were
        assigned to each parcel in atlas for each donor. Default is True.
    return_donors : bool, optional
        Whether to return donor-level expression arrays instead of aggregating expression
        across donors. Default is True.
    norm_matched : bool, optional
        Whether to perform gene normalization (gene_norm) across only those samples matched
        to regions in atlas instead of all available samples. Default is False.
    ibf_threshold : float, optional
        Intensity-based filtering threshold. Default is 0.
    region_agg : str or None, optional
        Mechanism by which to reduce sample-level expression data into region-level expression.

    Returns
    -------
    expression : dict[str, pandas.DataFrame] or pandas.DataFrame
        If `return_donors=True`, a dictionary where keys are donor IDs and values are
        gene expression DataFrames (regions x genes) indexed by anatomical name.
        If `return_donors=False`, a single aggregated DataFrame indexed the same way.
    counts, report
        Only when `return_counts` and/or `return_report` are True: whatever additional
        objects abagen returned, passed through unchanged and in abagen's order. In that
        case the function returns a tuple ``(expression, *extras)``; otherwise it returns
        `expression` alone.

    Examples
    --------
    >>> expr_data, counts, report = extract_AHBA_data(
    ...     '/path/to/atlas.nii.gz',
    ...     '/path/to/atlas_info.csv'
    ... )
    >>> expr_data = extract_AHBA_data(
    ...     '/path/to/atlas.nii.gz',
    ...     '/path/to/atlas_info.csv',
    ...     return_counts=False,
    ...     return_report=False
    ... )
    """

    # Load atlas and region information
    atlas = image.load_img(atlas_path)
    atlas_info = pd.read_csv(atlas_info_path)

    atlas_dict = (
        atlas_info
        .set_index('Atlas Index')['Anatomical Name']
        .to_dict()
    )
    # Label 0 is named 'other' unless the atlas table names it explicitly.
    atlas_dict.setdefault(0, 'other')

    # Get expression data from abagen
    result = abagen.get_expression_data(
        atlas, data_dir=micro_path, lr_mirror=lr_mirror, gene_norm=gene_norm,
        sample_norm=sample_norm, return_report=return_report, return_counts=return_counts,
        return_donors=return_donors, norm_matched=norm_matched, ibf_threshold=ibf_threshold,
        region_agg=region_agg
    )

    # When counts and/or the report are requested abagen returns a tuple whose
    # first element is the expression data; calling .items() on that tuple was
    # the original failure with the default arguments.
    if isinstance(result, tuple):
        expression, *extras = result
    else:
        expression, extras = result, []

    if isinstance(expression, dict):
        # Donor-level output: relabel each donor's DataFrame.
        processed_data = {
            donor_id: _relabel_regions(expr_data, atlas_dict)
            for donor_id, expr_data in expression.items()
        }
    else:
        # Aggregated output (return_donors=False): a single DataFrame.
        processed_data = _relabel_regions(expression, atlas_dict)

    if extras:
        return (processed_data, *extras)
    return processed_data