Source code for pipeline.code.dpreprocess.ahba_

import warnings
warnings.filterwarnings('ignore')
import abagen
import pandas as pd
from nilearn import image

import os
import itertools
import anndata
import scanpy as sc

# Relative path to the AHBA microarray dataset directory; handed to abagen
# as its ``data_dir`` argument.
micro_path = "../../datasets/microarray"

def _relabel_regions(expr_data, atlas_dict):
    """Return `expr_data` re-indexed by anatomical name instead of atlas index."""
    index_values = expr_data.index
    if isinstance(index_values, pd.MultiIndex):
        # Only the first level is looked up in the atlas table; any further
        # levels are dropped by the re-indexing below.
        index_values = index_values.get_level_values(0)
    # Indices missing from the atlas table fall back to the 'other' label.
    region_labels = index_values.map(atlas_dict).fillna('other')
    return expr_data.set_index(region_labels.values)


def extract_AHBA_data(
    atlas_path: str,
    atlas_info_path: str,
    lr_mirror='bidirectional',
    gene_norm='srs',
    sample_norm='srs',
    return_report=True,
    return_counts=True,
    return_donors=True,
    norm_matched=False,
    ibf_threshold=0,
    region_agg=None,
) -> "dict[str, pd.DataFrame] | pd.DataFrame":
    """
    Extract Allen Human Brain Atlas (AHBA) expression data for a brain atlas.

    The atlas image is passed to ``abagen.get_expression_data`` together with
    the module-level ``micro_path`` as ``data_dir``. The index of every
    returned expression table is then replaced by the anatomical name looked
    up in the atlas information CSV. All keyword options are forwarded to
    abagen unchanged; for their exact semantics see
    https://abagen.readthedocs.io/en/stable/generated/abagen.get_expression_data.html

    Parameters
    ----------
    atlas_path : str
        Path to the human brain atlas NIfTI file.
    atlas_info_path : str
        Path to a CSV file containing atlas region information with columns
        'Anatomical Name' and 'Atlas Index'.
    lr_mirror : str, optional
        How abagen mirrors samples across hemispheres. Default is
        'bidirectional'.
        NOTE(review): accepted values depend on the installed abagen version;
        confirm against its documentation.
    gene_norm : str, optional
        Method by which to normalize microarray expression values for each
        donor. Default is 'srs'.
    sample_norm : str, optional
        Method by which to normalize microarray expression values for each
        sample. Default is 'srs'.
    return_report : bool, optional
        Forwarded to abagen (request a text report of the processing steps).
        The report is NOT returned by this function. Default is True.
    return_counts : bool, optional
        Forwarded to abagen (request per-donor sample counts per parcel).
        The counts are NOT returned by this function. Default is True.
    return_donors : bool, optional
        Whether to return donor-level expression tables instead of a single
        table aggregated across donors. Default is True.
    norm_matched : bool, optional
        Whether to perform gene normalization across only those samples
        matched to regions in the atlas instead of all available samples.
        Default is False.
    ibf_threshold : float, optional
        Intensity-based filtering threshold. Default is 0.
    region_agg : str or None, optional
        Mechanism by which to reduce sample-level expression data into
        region-level expression. Default is None.

    Returns
    -------
    dict[str, pandas.DataFrame] or pandas.DataFrame
        If abagen yields a mapping of donor ID to expression table (the
        ``return_donors=True`` case), a dictionary with the same keys whose
        values are the relabelled tables. If abagen yields a single
        DataFrame, that table relabelled.

    Examples
    --------
    >>> expr_data = extract_AHBA_data(
    ...     '/path/to/atlas.nii.gz',
    ...     '/path/to/atlas_info.csv'
    ... )
    """
    # Load atlas and region information
    atlas = image.load_img(atlas_path)
    atlas_info = pd.read_csv(atlas_info_path)
    atlas_dict = (
        atlas_info
        .set_index('Atlas Index')['Anatomical Name']
        .to_dict()
    )
    # Label 0 is named 'other' unless the CSV already names it.
    atlas_dict.setdefault(0, 'other')

    # Get expression data from abagen
    expr_raw = abagen.get_expression_data(
        atlas,
        data_dir=micro_path,
        lr_mirror=lr_mirror,
        gene_norm=gene_norm,
        sample_norm=sample_norm,
        return_report=return_report,
        return_counts=return_counts,
        return_donors=return_donors,
        norm_matched=norm_matched,
        ibf_threshold=ibf_threshold,
        region_agg=region_agg,
    )

    # Per the abagen documentation, requesting counts and/or the report makes
    # the call return a tuple whose first element is the expression data.
    # Calling .items() on that tuple would fail, so unpack it first.
    expression = expr_raw[0] if isinstance(expr_raw, tuple) else expr_raw

    # Single table (no per-donor mapping): relabel it directly.
    if isinstance(expression, pd.DataFrame):
        return _relabel_regions(expression, atlas_dict)

    # Per-donor mapping: relabel each donor's table.
    return {
        donor_id: _relabel_regions(expr_data, atlas_dict)
        for donor_id, expr_data in expression.items()
    }