CellOracle - TSS annotation

This notebook is used to annotate the peaks into promoters, enhancers, and other genomic regions.

import

[1]:
import scanpy as sc
import numpy as np
import pandas as pd
import os
[2]:
from celloracle import motif_analysis as ma
import celloracle as co
co.__version__
co.check_python_requirements()
[2]:
package_name installed_version required_version requirement_satisfied
0 numpy 1.26.4 auto True
1 scipy 1.13.1 auto True
2 cython 3.0.11 auto True
3 numba 0.60.0 0.50.1 True
4 matplotlib 3.6.3 auto True
5 seaborn 0.13.2 auto True
6 scikit-learn 1.6.0 auto True
7 h5py 3.12.1 3.1.0 True
8 pandas 1.5.3 1.0.3 True
9 velocyto 0.17.17 0.17 True
10 umap-learn 0.5.7 auto True
11 pyarrow 18.1.0 0.17 True
12 tqdm 4.67.1 4.45 True
13 igraph 0.11.8 0.10.1 True
14 louvain 0.8.2 auto True
15 jupyter 1.1.1 auto True
16 anndata 0.10.9 0.7.5 True
17 scanpy 1.10.3 1.6 True
18 joblib 1.4.2 auto True
19 goatools 1.4.12 auto True
20 genomepy 0.16.1 0.8.4 True
21 gimmemotifs 0.17.0 0.14.4 True

load peak and get the cicero results

[4]:
result_dir = '../../data/iPSC_example/base_GRN'
adata = sc.read(('../../data/iPSC_example/ATAC_data/adata_atac_raw.h5ad'))
[5]:
peaks = np.array(adata.var_names)
peaks
[5]:
array(['chr1_819722_820222', 'chr1_827288_827788', 'chr1_838176_838676',
       ..., 'chrX_155962954_155963454', 'chrX_155966781_155967281',
       'chrX_155971350_155971850'], dtype=object)
[6]:
# Load Cicero coaccessibility scores.
cicero_connections = pd.read_csv(os.path.join(result_dir,"cicero_connections_stream.csv"), index_col=0)
cicero_connections.head()
[6]:
Peak1 Peak2 coaccess
1 chr10_100000290_100000790 chr10_99779229_99779729 -0.001031
2 chr10_100000290_100000790 chr10_99779759_99780259 0.001412
3 chr10_100000290_100000790 chr10_99785581_99786081 0.000000
4 chr10_100000290_100000790 chr10_99788840_99789340 0.000000
5 chr10_100000290_100000790 chr10_99845752_99846252 0.000482
[7]:
cicero_connections.shape
[7]:
(22818160, 3)

annotate tss

[8]:
ma.SUPPORTED_REF_GENOME
[8]:
species ref_genome provider
0 Human hg38 UCSC
1 Human hg19 UCSC
2 Mouse mm39 UCSC
3 Mouse mm10 UCSC
4 Mouse mm9 UCSC
5 S.cerevisiae sacCer2 UCSC
6 S.cerevisiae sacCer3 UCSC
7 Zebrafish danRer7 UCSC
8 Zebrafish danRer10 UCSC
9 Zebrafish danRer11 UCSC
10 Xenopus tropicalis xenTro2 UCSC
11 Xenopus tropicalis xenTro3 UCSC
12 Xenopus laevis Xenopus_laevis_v10.1 NCBI
13 Rat rn4 UCSC
14 Rat rn5 UCSC
15 Rat rn6 UCSC
16 Drosophila dm3 UCSC
17 Drosophila dm6 UCSC
18 C.elegans ce6 UCSC
19 C.elegans ce10 UCSC
20 Arabidopsis TAIR10 Ensembl
21 Chicken galGal4 UCSC
22 Chicken galGal5 UCSC
23 Chicken galGal6 UCSC
24 Guinea_Pig Cavpor3.0 Ensembl
25 Pig Sscrofa11.1 Ensembl
26 Axolotl AmexG_v6.0-DD Axolotl-omics.org
[9]:
##!! Please make sure to specify the correct reference genome here
tss_annotated = ma.get_tss_info(peak_str_list=peaks, ref_genome="hg38") # Note that the ref_genome should be changed according to your data

# Check results
tss_annotated.tail()

que bed peaks: 246132
tss peaks in que: 33757
[9]:
chr start end gene_short_name strand
33752 chr5 149550467 149550967 CSNK1A1 -
33753 chr5 149551161 149551661 CSNK1A1 -
33754 chr20 10673802 10674302 JAG1 -
33755 chr20 10674620 10675120 JAG1 -
33756 chr9 122228522 122229022 LHX6 -

interate tss and cicero result

[10]:
integrated = ma.integrate_tss_peak_with_cicero(tss_peak=tss_annotated,
                                               cicero_connections=cicero_connections)
print(integrated.shape)
integrated.head()
(938050, 3)
[10]:
peak_id gene_short_name coaccess
0 chr10_100000290_100000790 BLOC1S2 0.000119
1 chr10_100000290_100000790 CWF19L1 0.001710
2 chr10_100000290_100000790 DNMBP 0.003578
3 chr10_100000290_100000790 ERLIN1 0.002079
4 chr10_100000290_100000790 OLMALINC 0.000491
[11]:
peak = integrated[integrated.coaccess >= 0.8]
peak = peak[["peak_id", "gene_short_name"]].reset_index(drop=True)

print(peak.shape)
peak.head()
(31844, 2)
[11]:
peak_id gene_short_name
0 chr10_100009740_100010240 DNMBP
1 chr10_100185817_100186317 ERLIN1
2 chr10_100186339_100186839 ERLIN1
3 chr10_100229388_100229888 CHUK
4 chr10_100267454_100267954 CWF19L1

save data

[12]:
peak.to_csv(os.path.join(result_dir,"processed_peak.csv"))