CellOracle - TSS annotation¶
This notebook is used to annotate the peaks into promoters, enhancers, and other genomic regions.
import¶
[1]:
import scanpy as sc
import numpy as np
import pandas as pd
import os
[2]:
from celloracle import motif_analysis as ma
import celloracle as co
co.__version__
co.check_python_requirements()
[2]:
| package_name | installed_version | required_version | requirement_satisfied | |
|---|---|---|---|---|
| 0 | numpy | 1.26.4 | auto | True |
| 1 | scipy | 1.13.1 | auto | True |
| 2 | cython | 3.0.11 | auto | True |
| 3 | numba | 0.60.0 | 0.50.1 | True |
| 4 | matplotlib | 3.6.3 | auto | True |
| 5 | seaborn | 0.13.2 | auto | True |
| 6 | scikit-learn | 1.6.0 | auto | True |
| 7 | h5py | 3.12.1 | 3.1.0 | True |
| 8 | pandas | 1.5.3 | 1.0.3 | True |
| 9 | velocyto | 0.17.17 | 0.17 | True |
| 10 | umap-learn | 0.5.7 | auto | True |
| 11 | pyarrow | 18.1.0 | 0.17 | True |
| 12 | tqdm | 4.67.1 | 4.45 | True |
| 13 | igraph | 0.11.8 | 0.10.1 | True |
| 14 | louvain | 0.8.2 | auto | True |
| 15 | jupyter | 1.1.1 | auto | True |
| 16 | anndata | 0.10.9 | 0.7.5 | True |
| 17 | scanpy | 1.10.3 | 1.6 | True |
| 18 | joblib | 1.4.2 | auto | True |
| 19 | goatools | 1.4.12 | auto | True |
| 20 | genomepy | 0.16.1 | 0.8.4 | True |
| 21 | gimmemotifs | 0.17.0 | 0.14.4 | True |
load peak and get the cicero results¶
[4]:
result_dir = '../../data/iPSC_example/base_GRN'
adata = sc.read(('../../data/iPSC_example/ATAC_data/adata_atac_raw.h5ad'))
[5]:
peaks = np.array(adata.var_names)
peaks
[5]:
array(['chr1_819722_820222', 'chr1_827288_827788', 'chr1_838176_838676',
..., 'chrX_155962954_155963454', 'chrX_155966781_155967281',
'chrX_155971350_155971850'], dtype=object)
[6]:
# Load Cicero coaccessibility scores.
cicero_connections = pd.read_csv(os.path.join(result_dir,"cicero_connections_stream.csv"), index_col=0)
cicero_connections.head()
[6]:
| Peak1 | Peak2 | coaccess | |
|---|---|---|---|
| 1 | chr10_100000290_100000790 | chr10_99779229_99779729 | -0.001031 |
| 2 | chr10_100000290_100000790 | chr10_99779759_99780259 | 0.001412 |
| 3 | chr10_100000290_100000790 | chr10_99785581_99786081 | 0.000000 |
| 4 | chr10_100000290_100000790 | chr10_99788840_99789340 | 0.000000 |
| 5 | chr10_100000290_100000790 | chr10_99845752_99846252 | 0.000482 |
[7]:
cicero_connections.shape
[7]:
(22818160, 3)
annotate tss¶
[8]:
ma.SUPPORTED_REF_GENOME
[8]:
| species | ref_genome | provider | |
|---|---|---|---|
| 0 | Human | hg38 | UCSC |
| 1 | Human | hg19 | UCSC |
| 2 | Mouse | mm39 | UCSC |
| 3 | Mouse | mm10 | UCSC |
| 4 | Mouse | mm9 | UCSC |
| 5 | S.cerevisiae | sacCer2 | UCSC |
| 6 | S.cerevisiae | sacCer3 | UCSC |
| 7 | Zebrafish | danRer7 | UCSC |
| 8 | Zebrafish | danRer10 | UCSC |
| 9 | Zebrafish | danRer11 | UCSC |
| 10 | Xenopus tropicalis | xenTro2 | UCSC |
| 11 | Xenopus tropicalis | xenTro3 | UCSC |
| 12 | Xenopus laevis | Xenopus_laevis_v10.1 | NCBI |
| 13 | Rat | rn4 | UCSC |
| 14 | Rat | rn5 | UCSC |
| 15 | Rat | rn6 | UCSC |
| 16 | Drosophila | dm3 | UCSC |
| 17 | Drosophila | dm6 | UCSC |
| 18 | C.elegans | ce6 | UCSC |
| 19 | C.elegans | ce10 | UCSC |
| 20 | Arabidopsis | TAIR10 | Ensembl |
| 21 | Chicken | galGal4 | UCSC |
| 22 | Chicken | galGal5 | UCSC |
| 23 | Chicken | galGal6 | UCSC |
| 24 | Guinea_Pig | Cavpor3.0 | Ensembl |
| 25 | Pig | Sscrofa11.1 | Ensembl |
| 26 | Axolotl | AmexG_v6.0-DD | Axolotl-omics.org |
[9]:
##!! Please make sure to specify the correct reference genome here
tss_annotated = ma.get_tss_info(peak_str_list=peaks, ref_genome="hg38") # Note that the ref_genome should be changed according to your data
# Check results
tss_annotated.tail()
que bed peaks: 246132
tss peaks in que: 33757
[9]:
| chr | start | end | gene_short_name | strand | |
|---|---|---|---|---|---|
| 33752 | chr5 | 149550467 | 149550967 | CSNK1A1 | - |
| 33753 | chr5 | 149551161 | 149551661 | CSNK1A1 | - |
| 33754 | chr20 | 10673802 | 10674302 | JAG1 | - |
| 33755 | chr20 | 10674620 | 10675120 | JAG1 | - |
| 33756 | chr9 | 122228522 | 122229022 | LHX6 | - |
interate tss and cicero result¶
[10]:
integrated = ma.integrate_tss_peak_with_cicero(tss_peak=tss_annotated,
cicero_connections=cicero_connections)
print(integrated.shape)
integrated.head()
(938050, 3)
[10]:
| peak_id | gene_short_name | coaccess | |
|---|---|---|---|
| 0 | chr10_100000290_100000790 | BLOC1S2 | 0.000119 |
| 1 | chr10_100000290_100000790 | CWF19L1 | 0.001710 |
| 2 | chr10_100000290_100000790 | DNMBP | 0.003578 |
| 3 | chr10_100000290_100000790 | ERLIN1 | 0.002079 |
| 4 | chr10_100000290_100000790 | OLMALINC | 0.000491 |
[11]:
peak = integrated[integrated.coaccess >= 0.8]
peak = peak[["peak_id", "gene_short_name"]].reset_index(drop=True)
print(peak.shape)
peak.head()
(31844, 2)
[11]:
| peak_id | gene_short_name | |
|---|---|---|
| 0 | chr10_100009740_100010240 | DNMBP |
| 1 | chr10_100185817_100186317 | ERLIN1 |
| 2 | chr10_100186339_100186839 | ERLIN1 |
| 3 | chr10_100229388_100229888 | CHUK |
| 4 | chr10_100267454_100267954 | CWF19L1 |
save data¶
[12]:
peak.to_csv(os.path.join(result_dir,"processed_peak.csv"))