{
"cells": [
{
"cell_type": "markdown",
"id": "62701c42",
"metadata": {},
"source": [
"# CellOracle - TSS annotation"
]
},
{
"cell_type": "markdown",
"id": "45764f0d",
"metadata": {},
"source": [
"This notebook is used to annotate the peaks into promoters, enhancers, and other genomic regions."
]
},
{
"cell_type": "markdown",
"id": "c9a57a8c",
"metadata": {},
"source": [
"## import"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "bf8efe44-160a-4bc9-b410-5ad363da7ee9",
"metadata": {},
"outputs": [],
"source": [
"import scanpy as sc\n",
"import numpy as np\n",
"import pandas as pd\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "7f615025-4f01-4408-b9bd-e7ffe5ce42c5",
"metadata": {
"scrolled": true,
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" package_name | \n",
" installed_version | \n",
" required_version | \n",
" requirement_satisfied | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" numpy | \n",
" 1.26.4 | \n",
" auto | \n",
" True | \n",
"
\n",
" \n",
" | 1 | \n",
" scipy | \n",
" 1.13.1 | \n",
" auto | \n",
" True | \n",
"
\n",
" \n",
" | 2 | \n",
" cython | \n",
" 3.0.11 | \n",
" auto | \n",
" True | \n",
"
\n",
" \n",
" | 3 | \n",
" numba | \n",
" 0.60.0 | \n",
" 0.50.1 | \n",
" True | \n",
"
\n",
" \n",
" | 4 | \n",
" matplotlib | \n",
" 3.6.3 | \n",
" auto | \n",
" True | \n",
"
\n",
" \n",
" | 5 | \n",
" seaborn | \n",
" 0.13.2 | \n",
" auto | \n",
" True | \n",
"
\n",
" \n",
" | 6 | \n",
" scikit-learn | \n",
" 1.6.0 | \n",
" auto | \n",
" True | \n",
"
\n",
" \n",
" | 7 | \n",
" h5py | \n",
" 3.12.1 | \n",
" 3.1.0 | \n",
" True | \n",
"
\n",
" \n",
" | 8 | \n",
" pandas | \n",
" 1.5.3 | \n",
" 1.0.3 | \n",
" True | \n",
"
\n",
" \n",
" | 9 | \n",
" velocyto | \n",
" 0.17.17 | \n",
" 0.17 | \n",
" True | \n",
"
\n",
" \n",
" | 10 | \n",
" umap-learn | \n",
" 0.5.7 | \n",
" auto | \n",
" True | \n",
"
\n",
" \n",
" | 11 | \n",
" pyarrow | \n",
" 18.1.0 | \n",
" 0.17 | \n",
" True | \n",
"
\n",
" \n",
" | 12 | \n",
" tqdm | \n",
" 4.67.1 | \n",
" 4.45 | \n",
" True | \n",
"
\n",
" \n",
" | 13 | \n",
" igraph | \n",
" 0.11.8 | \n",
" 0.10.1 | \n",
" True | \n",
"
\n",
" \n",
" | 14 | \n",
" louvain | \n",
" 0.8.2 | \n",
" auto | \n",
" True | \n",
"
\n",
" \n",
" | 15 | \n",
" jupyter | \n",
" 1.1.1 | \n",
" auto | \n",
" True | \n",
"
\n",
" \n",
" | 16 | \n",
" anndata | \n",
" 0.10.9 | \n",
" 0.7.5 | \n",
" True | \n",
"
\n",
" \n",
" | 17 | \n",
" scanpy | \n",
" 1.10.3 | \n",
" 1.6 | \n",
" True | \n",
"
\n",
" \n",
" | 18 | \n",
" joblib | \n",
" 1.4.2 | \n",
" auto | \n",
" True | \n",
"
\n",
" \n",
" | 19 | \n",
" goatools | \n",
" 1.4.12 | \n",
" auto | \n",
" True | \n",
"
\n",
" \n",
" | 20 | \n",
" genomepy | \n",
" 0.16.1 | \n",
" 0.8.4 | \n",
" True | \n",
"
\n",
" \n",
" | 21 | \n",
" gimmemotifs | \n",
" 0.17.0 | \n",
" 0.14.4 | \n",
" True | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" package_name installed_version required_version requirement_satisfied\n",
"0 numpy 1.26.4 auto True\n",
"1 scipy 1.13.1 auto True\n",
"2 cython 3.0.11 auto True\n",
"3 numba 0.60.0 0.50.1 True\n",
"4 matplotlib 3.6.3 auto True\n",
"5 seaborn 0.13.2 auto True\n",
"6 scikit-learn 1.6.0 auto True\n",
"7 h5py 3.12.1 3.1.0 True\n",
"8 pandas 1.5.3 1.0.3 True\n",
"9 velocyto 0.17.17 0.17 True\n",
"10 umap-learn 0.5.7 auto True\n",
"11 pyarrow 18.1.0 0.17 True\n",
"12 tqdm 4.67.1 4.45 True\n",
"13 igraph 0.11.8 0.10.1 True\n",
"14 louvain 0.8.2 auto True\n",
"15 jupyter 1.1.1 auto True\n",
"16 anndata 0.10.9 0.7.5 True\n",
"17 scanpy 1.10.3 1.6 True\n",
"18 joblib 1.4.2 auto True\n",
"19 goatools 1.4.12 auto True\n",
"20 genomepy 0.16.1 0.8.4 True\n",
"21 gimmemotifs 0.17.0 0.14.4 True"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from celloracle import motif_analysis as ma\n",
"import celloracle as co\n",
"co.__version__\n",
"co.check_python_requirements()"
]
},
{
"cell_type": "markdown",
"id": "c202a00f-6203-4c5a-bb33-1354c9adfdc3",
"metadata": {},
"source": [
"## load peak and get the cicero results"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "e7afc618-5b85-4e03-998d-bbb1d54b4e70",
"metadata": {},
"outputs": [],
"source": [
"result_dir = '../../data/iPSC_example/base_GRN'\n",
"adata = sc.read(('../../data/iPSC_example/ATAC_data/adata_atac_raw.h5ad'))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "4b906756-d341-4537-b827-486fa3146b31",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['chr1_819722_820222', 'chr1_827288_827788', 'chr1_838176_838676',\n",
" ..., 'chrX_155962954_155963454', 'chrX_155966781_155967281',\n",
" 'chrX_155971350_155971850'], dtype=object)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"peaks = np.array(adata.var_names)\n",
"peaks"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b8e33aa7-87b6-44ed-9b83-42398f33bc38",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Peak1 | \n",
" Peak2 | \n",
" coaccess | \n",
"
\n",
" \n",
" \n",
" \n",
" | 1 | \n",
" chr10_100000290_100000790 | \n",
" chr10_99779229_99779729 | \n",
" -0.001031 | \n",
"
\n",
" \n",
" | 2 | \n",
" chr10_100000290_100000790 | \n",
" chr10_99779759_99780259 | \n",
" 0.001412 | \n",
"
\n",
" \n",
" | 3 | \n",
" chr10_100000290_100000790 | \n",
" chr10_99785581_99786081 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" | 4 | \n",
" chr10_100000290_100000790 | \n",
" chr10_99788840_99789340 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" | 5 | \n",
" chr10_100000290_100000790 | \n",
" chr10_99845752_99846252 | \n",
" 0.000482 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Peak1 Peak2 coaccess\n",
"1 chr10_100000290_100000790 chr10_99779229_99779729 -0.001031\n",
"2 chr10_100000290_100000790 chr10_99779759_99780259 0.001412\n",
"3 chr10_100000290_100000790 chr10_99785581_99786081 0.000000\n",
"4 chr10_100000290_100000790 chr10_99788840_99789340 0.000000\n",
"5 chr10_100000290_100000790 chr10_99845752_99846252 0.000482"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load Cicero coaccessibility scores.\n",
"cicero_connections = pd.read_csv(os.path.join(result_dir,\"cicero_connections_stream.csv\"), index_col=0)\n",
"cicero_connections.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "e632ee51-f39b-4d0f-a05b-c055c0ffde25",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(22818160, 3)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cicero_connections.shape"
]
},
{
"cell_type": "markdown",
"id": "4511a1c5-a62a-465d-8353-acccc42b083b",
"metadata": {},
"source": [
"## annotate tss"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "044e9949-f23e-4ff1-8acf-7d9bbf4a6d84",
"metadata": {
"scrolled": true,
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" species | \n",
" ref_genome | \n",
" provider | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Human | \n",
" hg38 | \n",
" UCSC | \n",
"
\n",
" \n",
" | 1 | \n",
" Human | \n",
" hg19 | \n",
" UCSC | \n",
"
\n",
" \n",
" | 2 | \n",
" Mouse | \n",
" mm39 | \n",
" UCSC | \n",
"
\n",
" \n",
" | 3 | \n",
" Mouse | \n",
" mm10 | \n",
" UCSC | \n",
"
\n",
" \n",
" | 4 | \n",
" Mouse | \n",
" mm9 | \n",
" UCSC | \n",
"
\n",
" \n",
" | 5 | \n",
" S.cerevisiae | \n",
" sacCer2 | \n",
" UCSC | \n",
"
\n",
" \n",
" | 6 | \n",
" S.cerevisiae | \n",
" sacCer3 | \n",
" UCSC | \n",
"
\n",
" \n",
" | 7 | \n",
" Zebrafish | \n",
" danRer7 | \n",
" UCSC | \n",
"
\n",
" \n",
" | 8 | \n",
" Zebrafish | \n",
" danRer10 | \n",
" UCSC | \n",
"
\n",
" \n",
" | 9 | \n",
" Zebrafish | \n",
" danRer11 | \n",
" UCSC | \n",
"
\n",
" \n",
" | 10 | \n",
" Xenopus tropicalis | \n",
" xenTro2 | \n",
" UCSC | \n",
"
\n",
" \n",
" | 11 | \n",
" Xenopus tropicalis | \n",
" xenTro3 | \n",
" UCSC | \n",
"
\n",
" \n",
" | 12 | \n",
" Xenopus laevis | \n",
" Xenopus_laevis_v10.1 | \n",
" NCBI | \n",
"
\n",
" \n",
" | 13 | \n",
" Rat | \n",
" rn4 | \n",
" UCSC | \n",
"
\n",
" \n",
" | 14 | \n",
" Rat | \n",
" rn5 | \n",
" UCSC | \n",
"
\n",
" \n",
" | 15 | \n",
" Rat | \n",
" rn6 | \n",
" UCSC | \n",
"
\n",
" \n",
" | 16 | \n",
" Drosophila | \n",
" dm3 | \n",
" UCSC | \n",
"
\n",
" \n",
" | 17 | \n",
" Drosophila | \n",
" dm6 | \n",
" UCSC | \n",
"
\n",
" \n",
" | 18 | \n",
" C.elegans | \n",
" ce6 | \n",
" UCSC | \n",
"
\n",
" \n",
" | 19 | \n",
" C.elegans | \n",
" ce10 | \n",
" UCSC | \n",
"
\n",
" \n",
" | 20 | \n",
" Arabidopsis | \n",
" TAIR10 | \n",
" Ensembl | \n",
"
\n",
" \n",
" | 21 | \n",
" Chicken | \n",
" galGal4 | \n",
" UCSC | \n",
"
\n",
" \n",
" | 22 | \n",
" Chicken | \n",
" galGal5 | \n",
" UCSC | \n",
"
\n",
" \n",
" | 23 | \n",
" Chicken | \n",
" galGal6 | \n",
" UCSC | \n",
"
\n",
" \n",
" | 24 | \n",
" Guinea_Pig | \n",
" Cavpor3.0 | \n",
" Ensembl | \n",
"
\n",
" \n",
" | 25 | \n",
" Pig | \n",
" Sscrofa11.1 | \n",
" Ensembl | \n",
"
\n",
" \n",
" | 26 | \n",
" Axolotl | \n",
" AmexG_v6.0-DD | \n",
" Axolotl-omics.org | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" species ref_genome provider\n",
"0 Human hg38 UCSC\n",
"1 Human hg19 UCSC\n",
"2 Mouse mm39 UCSC\n",
"3 Mouse mm10 UCSC\n",
"4 Mouse mm9 UCSC\n",
"5 S.cerevisiae sacCer2 UCSC\n",
"6 S.cerevisiae sacCer3 UCSC\n",
"7 Zebrafish danRer7 UCSC\n",
"8 Zebrafish danRer10 UCSC\n",
"9 Zebrafish danRer11 UCSC\n",
"10 Xenopus tropicalis xenTro2 UCSC\n",
"11 Xenopus tropicalis xenTro3 UCSC\n",
"12 Xenopus laevis Xenopus_laevis_v10.1 NCBI\n",
"13 Rat rn4 UCSC\n",
"14 Rat rn5 UCSC\n",
"15 Rat rn6 UCSC\n",
"16 Drosophila dm3 UCSC\n",
"17 Drosophila dm6 UCSC\n",
"18 C.elegans ce6 UCSC\n",
"19 C.elegans ce10 UCSC\n",
"20 Arabidopsis TAIR10 Ensembl\n",
"21 Chicken galGal4 UCSC\n",
"22 Chicken galGal5 UCSC\n",
"23 Chicken galGal6 UCSC\n",
"24 Guinea_Pig Cavpor3.0 Ensembl\n",
"25 Pig Sscrofa11.1 Ensembl\n",
"26 Axolotl AmexG_v6.0-DD Axolotl-omics.org"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ma.SUPPORTED_REF_GENOME"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "cb203001-8b57-482c-a2ff-5dc0a34d2527",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"que bed peaks: 246132\n",
"tss peaks in que: 33757\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" chr | \n",
" start | \n",
" end | \n",
" gene_short_name | \n",
" strand | \n",
"
\n",
" \n",
" \n",
" \n",
" | 33752 | \n",
" chr5 | \n",
" 149550467 | \n",
" 149550967 | \n",
" CSNK1A1 | \n",
" - | \n",
"
\n",
" \n",
" | 33753 | \n",
" chr5 | \n",
" 149551161 | \n",
" 149551661 | \n",
" CSNK1A1 | \n",
" - | \n",
"
\n",
" \n",
" | 33754 | \n",
" chr20 | \n",
" 10673802 | \n",
" 10674302 | \n",
" JAG1 | \n",
" - | \n",
"
\n",
" \n",
" | 33755 | \n",
" chr20 | \n",
" 10674620 | \n",
" 10675120 | \n",
" JAG1 | \n",
" - | \n",
"
\n",
" \n",
" | 33756 | \n",
" chr9 | \n",
" 122228522 | \n",
" 122229022 | \n",
" LHX6 | \n",
" - | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" chr start end gene_short_name strand\n",
"33752 chr5 149550467 149550967 CSNK1A1 -\n",
"33753 chr5 149551161 149551661 CSNK1A1 -\n",
"33754 chr20 10673802 10674302 JAG1 -\n",
"33755 chr20 10674620 10675120 JAG1 -\n",
"33756 chr9 122228522 122229022 LHX6 -"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"##!! Please make sure to specify the correct reference genome here\n",
"tss_annotated = ma.get_tss_info(peak_str_list=peaks, ref_genome=\"hg38\") # Note that the ref_genome should be changed according to your data\n",
"\n",
"# Check results\n",
"tss_annotated.tail()\n"
]
},
{
"cell_type": "markdown",
"id": "63dfea16-d8ef-441b-adc4-4e9fae10fd5e",
"metadata": {},
"source": [
"## interate tss and cicero result"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "2daa61c0-68bd-4ab6-b01b-65f8c5dbdfe1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(938050, 3)\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" peak_id | \n",
" gene_short_name | \n",
" coaccess | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" chr10_100000290_100000790 | \n",
" BLOC1S2 | \n",
" 0.000119 | \n",
"
\n",
" \n",
" | 1 | \n",
" chr10_100000290_100000790 | \n",
" CWF19L1 | \n",
" 0.001710 | \n",
"
\n",
" \n",
" | 2 | \n",
" chr10_100000290_100000790 | \n",
" DNMBP | \n",
" 0.003578 | \n",
"
\n",
" \n",
" | 3 | \n",
" chr10_100000290_100000790 | \n",
" ERLIN1 | \n",
" 0.002079 | \n",
"
\n",
" \n",
" | 4 | \n",
" chr10_100000290_100000790 | \n",
" OLMALINC | \n",
" 0.000491 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" peak_id gene_short_name coaccess\n",
"0 chr10_100000290_100000790 BLOC1S2 0.000119\n",
"1 chr10_100000290_100000790 CWF19L1 0.001710\n",
"2 chr10_100000290_100000790 DNMBP 0.003578\n",
"3 chr10_100000290_100000790 ERLIN1 0.002079\n",
"4 chr10_100000290_100000790 OLMALINC 0.000491"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"integrated = ma.integrate_tss_peak_with_cicero(tss_peak=tss_annotated,\n",
" cicero_connections=cicero_connections)\n",
"print(integrated.shape)\n",
"integrated.head()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "e4002a74-e5b1-4c39-a43c-b665492a4c93",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(31844, 2)\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" peak_id | \n",
" gene_short_name | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" chr10_100009740_100010240 | \n",
" DNMBP | \n",
"
\n",
" \n",
" | 1 | \n",
" chr10_100185817_100186317 | \n",
" ERLIN1 | \n",
"
\n",
" \n",
" | 2 | \n",
" chr10_100186339_100186839 | \n",
" ERLIN1 | \n",
"
\n",
" \n",
" | 3 | \n",
" chr10_100229388_100229888 | \n",
" CHUK | \n",
"
\n",
" \n",
" | 4 | \n",
" chr10_100267454_100267954 | \n",
" CWF19L1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" peak_id gene_short_name\n",
"0 chr10_100009740_100010240 DNMBP\n",
"1 chr10_100185817_100186317 ERLIN1\n",
"2 chr10_100186339_100186839 ERLIN1\n",
"3 chr10_100229388_100229888 CHUK\n",
"4 chr10_100267454_100267954 CWF19L1"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"peak = integrated[integrated.coaccess >= 0.8]\n",
"peak = peak[[\"peak_id\", \"gene_short_name\"]].reset_index(drop=True)\n",
"\n",
"print(peak.shape)\n",
"peak.head()"
]
},
{
"cell_type": "markdown",
"id": "05b1aaa3-c0d9-456c-9f4f-7f330cc41c8a",
"metadata": {},
"source": [
"## save data"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "902b103c-420e-44ae-842b-e419769edb2c",
"metadata": {},
"outputs": [],
"source": [
"peak.to_csv(os.path.join(result_dir,\"processed_peak.csv\"))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "TFcomb",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.21"
}
},
"nbformat": 4,
"nbformat_minor": 5
}