CellOracle - TSS annotation¶

This notebook is used to annotate the peaks into promoters, enhancers, and other genomic regions.

import¶

[1]:

import scanpy as sc
import numpy as np
import pandas as pd
import os

[2]:

from celloracle import motif_analysis as ma
import celloracle as co
co.__version__
co.check_python_requirements()

[2]:

	package_name	installed_version	required_version	requirement_satisfied
0	numpy	1.26.4	auto	True
1	scipy	1.13.1	auto	True
2	cython	3.0.11	auto	True
3	numba	0.60.0	0.50.1	True
4	matplotlib	3.6.3	auto	True
5	seaborn	0.13.2	auto	True
6	scikit-learn	1.6.0	auto	True
7	h5py	3.12.1	3.1.0	True
8	pandas	1.5.3	1.0.3	True
9	velocyto	0.17.17	0.17	True
10	umap-learn	0.5.7	auto	True
11	pyarrow	18.1.0	0.17	True
12	tqdm	4.67.1	4.45	True
13	igraph	0.11.8	0.10.1	True
14	louvain	0.8.2	auto	True
15	jupyter	1.1.1	auto	True
16	anndata	0.10.9	0.7.5	True
17	scanpy	1.10.3	1.6	True
18	joblib	1.4.2	auto	True
19	goatools	1.4.12	auto	True
20	genomepy	0.16.1	0.8.4	True
21	gimmemotifs	0.17.0	0.14.4	True

load peak and get the cicero results¶

[4]:

result_dir = '../../data/iPSC_example/base_GRN'
adata = sc.read(('../../data/iPSC_example/ATAC_data/adata_atac_raw.h5ad'))

[5]:

peaks = np.array(adata.var_names)
peaks

[5]:

array(['chr1_819722_820222', 'chr1_827288_827788', 'chr1_838176_838676',
       ..., 'chrX_155962954_155963454', 'chrX_155966781_155967281',
       'chrX_155971350_155971850'], dtype=object)

[6]:

# Load Cicero coaccessibility scores.
cicero_connections = pd.read_csv(os.path.join(result_dir,"cicero_connections_stream.csv"), index_col=0)
cicero_connections.head()

[6]:

	Peak1	Peak2	coaccess
1	chr10_100000290_100000790	chr10_99779229_99779729	-0.001031
2	chr10_100000290_100000790	chr10_99779759_99780259	0.001412
3	chr10_100000290_100000790	chr10_99785581_99786081	0.000000
4	chr10_100000290_100000790	chr10_99788840_99789340	0.000000
5	chr10_100000290_100000790	chr10_99845752_99846252	0.000482

[7]:

cicero_connections.shape

[7]:

(22818160, 3)

annotate tss¶

[8]:

ma.SUPPORTED_REF_GENOME

[8]:

	species	ref_genome	provider
0	Human	hg38	UCSC
1	Human	hg19	UCSC
2	Mouse	mm39	UCSC
3	Mouse	mm10	UCSC
4	Mouse	mm9	UCSC
5	S.cerevisiae	sacCer2	UCSC
6	S.cerevisiae	sacCer3	UCSC
7	Zebrafish	danRer7	UCSC
8	Zebrafish	danRer10	UCSC
9	Zebrafish	danRer11	UCSC
10	Xenopus tropicalis	xenTro2	UCSC
11	Xenopus tropicalis	xenTro3	UCSC
12	Xenopus laevis	Xenopus_laevis_v10.1	NCBI
13	Rat	rn4	UCSC
14	Rat	rn5	UCSC
15	Rat	rn6	UCSC
16	Drosophila	dm3	UCSC
17	Drosophila	dm6	UCSC
18	C.elegans	ce6	UCSC
19	C.elegans	ce10	UCSC
20	Arabidopsis	TAIR10	Ensembl
21	Chicken	galGal4	UCSC
22	Chicken	galGal5	UCSC
23	Chicken	galGal6	UCSC
24	Guinea_Pig	Cavpor3.0	Ensembl
25	Pig	Sscrofa11.1	Ensembl
26	Axolotl	AmexG_v6.0-DD	Axolotl-omics.org

[9]:

##!! Please make sure to specify the correct reference genome here
tss_annotated = ma.get_tss_info(peak_str_list=peaks, ref_genome="hg38") # Note that the ref_genome should be changed according to your data

# Check results
tss_annotated.tail()

que bed peaks: 246132
tss peaks in que: 33757

[9]:

	chr	start	end	gene_short_name	strand
33752	chr5	149550467	149550967	CSNK1A1	-
33753	chr5	149551161	149551661	CSNK1A1	-
33754	chr20	10673802	10674302	JAG1	-
33755	chr20	10674620	10675120	JAG1	-
33756	chr9	122228522	122229022	LHX6	-

interate tss and cicero result¶

[10]:

integrated = ma.integrate_tss_peak_with_cicero(tss_peak=tss_annotated,
                                               cicero_connections=cicero_connections)
print(integrated.shape)
integrated.head()

(938050, 3)

[10]:

	peak_id	gene_short_name	coaccess
0	chr10_100000290_100000790	BLOC1S2	0.000119
1	chr10_100000290_100000790	CWF19L1	0.001710
2	chr10_100000290_100000790	DNMBP	0.003578
3	chr10_100000290_100000790	ERLIN1	0.002079
4	chr10_100000290_100000790	OLMALINC	0.000491

[11]:

peak = integrated[integrated.coaccess >= 0.8]
peak = peak[["peak_id", "gene_short_name"]].reset_index(drop=True)

print(peak.shape)
peak.head()

(31844, 2)

[11]:

	peak_id	gene_short_name
0	chr10_100009740_100010240	DNMBP
1	chr10_100185817_100186317	ERLIN1
2	chr10_100186339_100186839	ERLIN1
3	chr10_100229388_100229888	CHUK
4	chr10_100267454_100267954	CWF19L1

save data¶

[12]:

peak.to_csv(os.path.join(result_dir,"processed_peak.csv"))