Transcriptional Data Integration

This notebook shows the progress of integrating single-nucleus transcriptomic data with AHBA data.

Introduction

AHBA data

  • The Allen Human Brain Atlas (AHBA) provides comprehensive, high-resolution transcriptional profiles across the adult human brain. The data was downloaded from the Allen Institute’s API and preprocessed using the abagen package.

Human single-nucleus data

  • We used the large-scale single-nucleus dataset for the human brain provided by the Linnarsson laboratory.

  • This dataset includes single-nucleus transcriptomic data collected from multiple brain regions of four adult human donors.

  • We retained the data from donors (H19.30.001 and H19.30.002) for further analysis and conducted preprocessing with scanpy.

Mouse spatial transcriptomic data

  • Whole-brain spatial transcriptomic (ST) sequencing data from three adult male mice from Ortiz et al.’s study.

[1]:
import warnings
warnings.filterwarnings('ignore')
import matplotlib
from matplotlib import pyplot as plt
from matplotlib.font_manager import FontProperties
matplotlib.rcParams['axes.spines.right'] = False
matplotlib.rcParams['axes.spines.top'] = False
font_path ='../../tutorials/whitney-medium.otf' # path in TransBrain
custom_font = FontProperties(fname=font_path)

import seaborn as sns

import math
import anndata
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy import stats
[2]:
import sys
sys.path.append('../code/dintegrate')
[3]:
from utils import stable_gene_filter,calculate_thresholds

Selection of Cross-Modality Stable Genes

In this step, we filtered the shared stable gene space between the ​​AHBA, human single-nucleus data and ​mouse spatial transcriptomic data.

[5]:
st_genes = list(pd.read_csv('./files/st_genes.csv')['genes'].values)
sn_genes = list(pd.read_csv('./files/sn_genes.csv')['genes'].values)
ahba_genes = list(pd.read_csv('./files/ahba_genes.csv')['genes'].values)
[6]:
common_genes = list(set(st_genes) & set(sn_genes) & set(ahba_genes))

common gene number

[7]:
len(set(common_genes))
[7]:
12732
[8]:
def draw_distribution(corr_, offset=0.008, ymax=0.95):

    median_thre = np.median(corr_)
    fig, ax = plt.subplots(1, 1, figsize=(10, 4), gridspec_kw={'width_ratios': [2]}, dpi=600)

    # KDE plot
    # sns.set_style("ticks")

    plt.rcParams['axes.edgecolor'] = 'black'
    plt.rcParams['axes.linewidth'] = 2

    ax.grid(color='gray', linestyle='--', linewidth=1.5, alpha=0.5,zorder=-100)
    kde = sns.kdeplot(corr_, label='Gene across region', color='gray', alpha=0.5, linewidth=3, ax=ax)
    kde.set_zorder(1)

    x, y = kde.lines[0].get_data()
    idx = np.searchsorted(x, median_thre)

    plt.fill_between(x[:idx+1], y[:idx+1], color='#cdedf6', alpha=1, zorder=1)
    plt.fill_between(x[idx:], y[idx:], color='skyblue', alpha=1, zorder=1)
    plt.axvline(x=median_thre+offset, color='gray', linestyle='--', ymin=0, ymax=ymax, linewidth=2, zorder=2)

    ax.spines['left'].set_linewidth(2)
    ax.spines['bottom'].set_linewidth(2)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.xaxis.labelpad = 20
    ax.yaxis.labelpad = 20
    ax.set_xlim(-1,1)
    ax.set_ylim(0,1.4)
    ax.set_xticks([-0.75, 0.00, 0.75])
    ax.set_yticks([0.2, 0.6, 1.0, 1.4])
    ax.tick_params(axis='both', which='both', labelsize=20, width=2, length=5, color='black')

    plt.xlabel("Value", fontsize=18, fontproperties=custom_font)
    plt.ylabel("Density", fontsize=18, fontproperties=custom_font)

    plt.gca().set_axis_on()
    plt.legend().set_visible(False)
    plt.tight_layout()

    return fig

Selection of cortical stable genes

[9]:
h19_common_regions = pd.read_csv('./files/h19_common_regions.csv')['sn_roi'].values
[11]:
ex_sn_cortex_mean = pd.read_csv('../datasets/processeddata/human_ahba/ex_d_sample_sn_atlas_cortex_mean.csv',index_col=0)
[12]:
ex_sn_cortex_mean.index = ['Human '+region.split(' - ')[-1] for region in ex_sn_cortex_mean.index.values]

Load the processed human single-nucleus data

[13]:
H1901_data = pd.read_feather('../datasets/processeddata/human_sn/Human_sn_data_H19001_split_scaled.feather')
H1901_data.set_index('index',inplace=True,drop=True)
H1902_data = pd.read_feather('../datasets/processeddata/human_sn/Human_sn_data_H19002_split_scaled.feather')
H1902_data.set_index('index',inplace=True,drop=True)
[14]:
cortex_filter_H1901 = H1901_data.index.str.contains('Cerebral cortex')
cortex_filter_H1902 = H1902_data.index.str.contains('Cerebral cortex')
H1901_data_cortex = H1901_data[cortex_filter_H1901]
H1902_data_cortex = H1902_data[cortex_filter_H1902]
[15]:
H1901_data_cortex.index = ['Human '+region.split(' - ')[-1] for region in H1901_data_cortex.index.values]
H1902_data_cortex.index = ['Human '+region.split(' - ')[-1] for region in H1902_data_cortex.index.values]
[16]:
common_regions_cortex = [i for i in set(H1901_data_cortex.index.values) if i in ex_sn_cortex_mean.index.values]

Use stable_gene_filter function to filter genes with consistent spatial patterns between AHBA and single-nucleus data.

[17]:
H1901_cortical_stable_gene_scores = stable_gene_filter(ex_sn_cortex_mean, H1901_data_cortex, common_regions_cortex, common_genes)
H1902_cortical_stable_gene_scores = stable_gene_filter(ex_sn_cortex_mean, H1902_data_cortex, common_regions_cortex, common_genes)
Processing genes...: 100%|██████████| 12732/12732 [00:04<00:00, 2860.39it/s]
Processing genes...: 100%|██████████| 12732/12732 [00:04<00:00, 2638.27it/s]
[18]:
H1901_thre_cortex = np.median(H1901_cortical_stable_gene_scores.values)
H1902_thre_cortex = np.median(H1902_cortical_stable_gene_scores.values)

Visualize the spatial similarity distribution

[65]:
fig = draw_distribution(H1901_cortical_stable_gene_scores.values,ymax=0.90)
../../_images/pipeline_run_pipeline_integration_23_0.png
[66]:
fig.savefig('./fig/H1901_cortical_stable_gene_scores.png',dpi=400)
[67]:
fig = draw_distribution(H1902_cortical_stable_gene_scores.values,offset=0.008,ymax=0.95)
../../_images/pipeline_run_pipeline_integration_25_0.png
[68]:
fig.savefig('./fig/H1902_cortical_stable_gene_scores.png',dpi=400)
[26]:
H1901_stable_genes_cortex = H1901_cortical_stable_gene_scores[H1901_cortical_stable_gene_scores['correlation'] >= H1901_thre_cortex]
H1902_stable_genes_cortex = H1902_cortical_stable_gene_scores[H1902_cortical_stable_gene_scores['correlation'] >= H1902_thre_cortex]
[27]:
common_stable_genes_cortex = pd.DataFrame([i for i in H1901_stable_genes_cortex.index.values if i in H1902_stable_genes_cortex.index.values],columns=['genes'])
[29]:
common_stable_genes_cortex.to_csv('./files/common_stable_genes_cortex.csv')

Selection of subcortical stable genes

[69]:
ex_sn_subcortex_mean = pd.read_csv('../datasets/processeddata/human_ahba/ex_d_sample_sn_atlas_subcortex_mean.csv',index_col=0)
[70]:
ex_sn_subcortex_mean.index = ['Human '+region.split(' - ')[-1] for region in ex_sn_subcortex_mean.index.values]
[71]:
H1901_data_subcortex = H1901_data[~cortex_filter_H1901]
H1902_data_subcortex = H1902_data[~cortex_filter_H1902]
[72]:
H1901_data_subcortex.index = ['Human '+region.split(' - ')[-1] for region in H1901_data_subcortex.index.values]
H1902_data_subcortex.index = ['Human '+region.split(' - ')[-1] for region in H1902_data_subcortex.index.values]
[73]:
common_regions_subcortex = [i for i in set(H1901_data_subcortex.index.values) if i in ex_sn_subcortex_mean.index.values]
[74]:
H1901_subcortical_stable_gene_scores = stable_gene_filter(ex_sn_subcortex_mean, H1901_data_subcortex, common_regions_subcortex,common_genes)
H1902_subcortical_stable_gene_scores = stable_gene_filter(ex_sn_subcortex_mean, H1902_data_subcortex, common_regions_subcortex,common_genes)
Processing genes...: 100%|██████████| 12732/12732 [00:04<00:00, 2796.89it/s]
Processing genes...: 100%|██████████| 12732/12732 [00:04<00:00, 2858.51it/s]
[75]:
H1901_thre_subcortex = np.median(H1901_subcortical_stable_gene_scores.values)
H1902_thre_subcortex = np.median(H1902_subcortical_stable_gene_scores.values)
[83]:
fig = draw_distribution(H1901_subcortical_stable_gene_scores.values,offset=0.008,ymax=0.985)
../../_images/pipeline_run_pipeline_integration_38_0.png
[84]:
fig.savefig('./fig/H1901_subcortical_stable_gene_scores.png',dpi=400)
[86]:
fig = draw_distribution(H1902_subcortical_stable_gene_scores.values,offset=0.008,ymax=0.94)
../../_images/pipeline_run_pipeline_integration_40_0.png
[87]:
fig.savefig('./fig/H1902_subcortical_stable_gene_scores.png',dpi=400)
[39]:
H1901_stable_genes_subcortex = H1901_subcortical_stable_gene_scores[H1901_subcortical_stable_gene_scores['correlation'] >= H1901_thre_subcortex]
H1902_stable_genes_subcortex = H1902_subcortical_stable_gene_scores[H1902_subcortical_stable_gene_scores['correlation'] >= H1902_thre_subcortex]
[40]:
common_stable_genes_subcortex = pd.DataFrame([i for i in H1901_stable_genes_subcortex.index.values if i in H1902_stable_genes_subcortex.index.values],columns=['genes'])
[42]:
common_stable_genes_subcortex.to_csv('./files/common_stable_genes_subcortex.csv')

Selection of all stable genes

[88]:
H1901_data_all = pd.read_feather('../datasets/processeddata/human_sn/Human_sn_data_H19001_all_scaled.feather')
H1901_data_all.set_index('index',inplace=True,drop=True)
H1902_data_all = pd.read_feather('../datasets/processeddata/human_sn/Human_sn_data_H19002_all_scaled.feather')
H1902_data_all.set_index('index',inplace=True,drop=True)
[89]:
ex_sn_mean = pd.read_csv('../datasets/processeddata/human_ahba/ex_d_sample_sn_atlas_mean.csv',index_col=0)
[90]:
ex_sn_mean.index = ['Human '+region.split(' - ')[-1] for region in ex_sn_mean.index.values]
[91]:
H1901_data_all.index = ['Human '+region.split(' - ')[-1] for region in H1901_data_all.index.values]
H1902_data_all.index = ['Human '+region.split(' - ')[-1] for region in H1902_data_all.index.values]
[92]:
common_regions = [i for i in set(H1901_data_all.index.values) if i in ex_sn_mean.index.values]
[93]:
H1901_all_stable_gene_scores = stable_gene_filter(ex_sn_mean, H1901_data_all, common_regions, common_genes)
H1902_all_stable_gene_scores = stable_gene_filter(ex_sn_mean, H1902_data_all, common_regions, common_genes)
Processing genes...: 100%|██████████| 12732/12732 [00:04<00:00, 2795.38it/s]
Processing genes...: 100%|██████████| 12732/12732 [00:05<00:00, 2533.27it/s]
[94]:
H1901_thre_all = np.median(H1901_all_stable_gene_scores.values)
H1902_thre_all = np.median(H1902_all_stable_gene_scores.values)
[96]:
fig = draw_distribution(H1901_all_stable_gene_scores.values,offset=0.004,ymax=0.90)
../../_images/pipeline_run_pipeline_integration_53_0.png
[97]:
fig.savefig('./fig/H1901_all_stable_gene_scores.png',dpi=400)
[100]:
fig = draw_distribution(H1902_all_stable_gene_scores.values,offset=0.003,ymax=0.835)
../../_images/pipeline_run_pipeline_integration_55_0.png
[101]:
fig.savefig('./fig/H1902_all_stable_gene_scores.png',dpi=400)
[53]:
H1901_stable_genes_all = H1901_all_stable_gene_scores[H1901_all_stable_gene_scores['correlation'] >= H1901_thre_all]
H1902_stable_genes_all = H1902_all_stable_gene_scores[H1902_all_stable_gene_scores['correlation'] >= H1902_thre_all]
[54]:
common_stable_genes_all = pd.DataFrame([i for i in H1901_stable_genes_all.index.values if i in H1902_stable_genes_all.index.values],columns=['genes'])
[56]:
common_stable_genes_all.to_csv('./files/common_stable_genes_all.csv')
[14]:
align_csv = pd.read_csv('./files/sn_ahba_atlas_align.csv',index_col=0)

Thresholds of cortex

[58]:
ex_brain_cortex_mean = pd.read_csv('../datasets/processeddata/human_ahba/ex_d_sample_brain_atlas_cortex_mean.csv',index_col=0)
[59]:
ex_brain_cortex_mean = ex_brain_cortex_mean[common_stable_genes_cortex['genes'].values]
[28]:
cortex_filter_H1901 = H1901_data.index.str.contains('Cerebral cortex')
cortex_filter_H1902 = H1902_data.index.str.contains('Cerebral cortex')
H1901_data_cortex = H1901_data[cortex_filter_H1901]
H1902_data_cortex = H1902_data[cortex_filter_H1902]
[29]:
H19_data_cortex = pd.concat([H1901_data_cortex,H1902_data_cortex])
[33]:
H19_data_cortex = H19_data_cortex[common_stable_genes_cortex['genes'].values]
[36]:
H19_data_cortex.reset_index(inplace=True)
[37]:
H19_data_cortex.to_feather('../datasets/processeddata/human_sn/Human_sn_data_H19_cortex_scaled_common_gene.feather')
[76]:
threshold_dict_cortex = calculate_thresholds(ex_brain_cortex_mean,H19_data_cortex,align_csv)
Processing regions...: 100%|██████████| 127/127 [35:19<00:00, 16.69s/it]
[78]:
threshold_df_cortex = pd.DataFrame.from_dict(threshold_dict_cortex, orient='index', columns=['correlation', 'p_value'])
[79]:
threshold_df_cortex.to_csv('./files/threshold_df_cortex.csv')
[20]:
def plot_thre_distribution(thre_df,bins = np.arange(-0.2, 0.64, 0.02)):

    data_groups = pd.cut(thre_df['correlation'],bins=bins,include_lowest=True)
    grouped_counts = data_groups.value_counts(normalize=True)

    fig, ax = plt.subplots(1,1,figsize=(15,6), gridspec_kw={'width_ratios': [2]},dpi=400)
    sns.barplot(x=grouped_counts.index, y=grouped_counts.values,label='align similarity',alpha=0.5,color='skyblue')
    plt.xticks(rotation=90)
    ax.tick_params(labelsize=15)
    ax.spines['left'].set_linewidth(1)
    ax.spines['bottom'].set_linewidth(1)
    ax.xaxis.labelpad = 20
    ax.yaxis.labelpad = 20
    plt.xlabel("Value",fontsize=18,fontproperties=custom_font)
    plt.ylabel("Frequency",fontsize=18,fontproperties=custom_font)
    ax.legend(prop=FontProperties(fname=font_path,size=15))
    plt.show()
[81]:
plot_thre_distribution(threshold_df_cortex)
../../_images/pipeline_run_pipeline_integration_74_0.png

Thresholds of subcortex

[83]:
ex_brain_subcortex_mean = pd.read_csv('../datasets/processeddata/human_ahba/ex_d_sample_brain_atlas_subcortex_mean.csv',index_col=0)
[84]:
ex_brain_subcortex_mean = ex_brain_subcortex_mean[common_stable_genes_subcortex['genes'].values]
[38]:
H1901_data_subcortex = H1901_data[~cortex_filter_H1901]
H1902_data_subcortex = H1902_data[~cortex_filter_H1902]
[39]:
H19_data_subcortex = pd.concat([H1901_data_subcortex,H1902_data_subcortex])
[40]:
H19_data_subcortex = H19_data_subcortex[common_stable_genes_subcortex['genes'].values]
[43]:
H19_data_subcortex.reset_index(inplace=True)
[44]:
H19_data_subcortex.to_feather('../datasets/processeddata/human_sn/Human_sn_data_H19_subcortex_scaled_common_gene.feather')
[88]:
threshold_dict_subcortex = calculate_thresholds(ex_brain_subcortex_mean,H19_data_subcortex,align_csv)
Processing regions...: 100%|██████████| 127/127 [15:25<00:00,  7.29s/it]
[90]:
threshold_df_subcortex = pd.DataFrame.from_dict(threshold_dict_subcortex, orient='index', columns=['correlation', 'p_value'])
[91]:
threshold_df_subcortex.to_csv('./files/threshold_df_subcortex.csv')
[92]:
plot_thre_distribution(threshold_df_subcortex,bins = np.arange(-0.2, 0.84, 0.08))
../../_images/pipeline_run_pipeline_integration_86_0.png

Thresholds of all

[6]:
ex_brain_mean = pd.read_csv('../datasets/processeddata/human_ahba/ex_d_sample_brain_atlas_mean.csv',index_col=0)
[8]:
common_stable_genes_all = pd.read_csv('./files/common_stable_genes_all.csv',index_col=0)
[9]:
ex_brain_mean = ex_brain_mean[common_stable_genes_all['genes'].values]
[10]:
H1901_data_all = pd.read_feather('../datasets/processeddata/human_sn/Human_sn_data_H19001_all_scaled.feather')
H1901_data_all.set_index('index',inplace=True,drop=True)
H1902_data_all = pd.read_feather('../datasets/processeddata/human_sn/Human_sn_data_H19002_all_scaled.feather')
H1902_data_all.set_index('index',inplace=True,drop=True)
[11]:
H19_data_all = pd.concat([H1901_data_all,H1902_data_all])
[12]:
H19_data_all = H19_data_all[common_stable_genes_all['genes'].values]
[15]:
threshold_dict_all = calculate_thresholds(ex_brain_mean,H19_data_all,align_csv)
Processing regions...: 100%|██████████| 127/127 [1:04:50<00:00, 30.64s/it]
[17]:
threshold_df_all = pd.DataFrame.from_dict(threshold_dict_all, orient='index', columns=['correlation', 'p_value'])
[18]:
threshold_df_all.to_csv('./files/threshold_df_all.csv')
[21]:
plot_thre_distribution(threshold_df_all,bins = np.arange(-0.2, 0.84, 0.02))
../../_images/pipeline_run_pipeline_integration_97_0.png
[23]:
H19_data_all.reset_index(inplace=True)
[25]:
H19_data_all.to_feather('../datasets/processeddata/human_sn/Human_sn_data_H19_all_scaled_common_gene.feather')
[11]:
# To run the integration pipeline, use the following command:
# !python ../code/dintegrate/main.py --pool True --pool_type cortex --mean_path ../datasets/processeddata/human_ahba/ex_d_sample_brain_atlas_cortex_mean.csv --sample_path ../datasets/processeddata/human_ahba/ex_d_sample_brain_atlas_cortex.csv --h19_path ../datasets/processeddata/human_sn/Human_sn_data_H19_cortex_scaled_common_gene.feather --alignment_file ./files/sn_ahba_atlas_align_cortex.csv --gene_file ./files/common_stable_genes_cortex.csv --thre_file ./files/threshold_df_cortex.csv --pool_s_path ../datasets/integrated_dataset/cortical_pool  --integrated_s_path ../datasets/integrated_dataset/cortical_integrated
[12]:
# !python ../code/dintegrate/main.py --pool True --pool_type subcortex --mean_path ../datasets/processeddata/human_ahba/ex_d_sample_brain_atlas_subcortex_mean.csv --sample_path ../datasets/processeddata/human_ahba/ex_d_sample_brain_atlas_subcortex.csv --h19_path ../datasets/processeddata/human_sn/Human_sn_data_H19_subcortex_scaled_common_gene.feather --alignment_file ./files/sn_ahba_atlas_align_subcortex.csv --gene_file ./files/common_stable_genes_subcortex.csv --thre_file ./files/threshold_df_subcortex.csv --pool_s_path ../datasets/integrated_dataset/subcortical_pool  --integrated_s_path ../datasets/integrated_dataset/subcortical_integrated
[13]:
# !python ../code/dintegrate/main.py --pool True --pool_type all --mean_path ../datasets/processeddata/human_ahba/ex_d_sample_brain_atlas_mean.csv --sample_path ../datasets/processeddata/human_ahba/ex_d_sample_brain_atlas.csv --h19_path ../datasets/processeddata/human_sn/Human_sn_data_H19_all_scaled_common_gene.feather --alignment_file ./files/sn_ahba_atlas_align.csv --gene_file ./files/common_stable_genes_all.csv --thre_file ./files/threshold_df_all.csv --pool_s_path ../datasets/integrated_dataset/all_pool  --integrated_s_path ../datasets/integrated_dataset/all_integrated