{ "cells": [ { "cell_type": "markdown", "id": "62701c42", "metadata": {}, "source": [ "# CellOracle - TSS annotation" ] }, { "cell_type": "markdown", "id": "45764f0d", "metadata": {}, "source": [ "This notebook is used to annotate the peaks into promoters, enhancers, and other genomic regions." ] }, { "cell_type": "markdown", "id": "c9a57a8c", "metadata": {}, "source": [ "## import" ] }, { "cell_type": "code", "execution_count": 1, "id": "bf8efe44-160a-4bc9-b410-5ad363da7ee9", "metadata": {}, "outputs": [], "source": [ "import scanpy as sc\n", "import numpy as np\n", "import pandas as pd\n", "import os" ] }, { "cell_type": "code", "execution_count": 2, "id": "7f615025-4f01-4408-b9bd-e7ffe5ce42c5", "metadata": { "scrolled": true, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
package_nameinstalled_versionrequired_versionrequirement_satisfied
0numpy1.26.4autoTrue
1scipy1.13.1autoTrue
2cython3.0.11autoTrue
3numba0.60.00.50.1True
4matplotlib3.6.3autoTrue
5seaborn0.13.2autoTrue
6scikit-learn1.6.0autoTrue
7h5py3.12.13.1.0True
8pandas1.5.31.0.3True
9velocyto0.17.170.17True
10umap-learn0.5.7autoTrue
11pyarrow18.1.00.17True
12tqdm4.67.14.45True
13igraph0.11.80.10.1True
14louvain0.8.2autoTrue
15jupyter1.1.1autoTrue
16anndata0.10.90.7.5True
17scanpy1.10.31.6True
18joblib1.4.2autoTrue
19goatools1.4.12autoTrue
20genomepy0.16.10.8.4True
21gimmemotifs0.17.00.14.4True
\n", "
" ], "text/plain": [ " package_name installed_version required_version requirement_satisfied\n", "0 numpy 1.26.4 auto True\n", "1 scipy 1.13.1 auto True\n", "2 cython 3.0.11 auto True\n", "3 numba 0.60.0 0.50.1 True\n", "4 matplotlib 3.6.3 auto True\n", "5 seaborn 0.13.2 auto True\n", "6 scikit-learn 1.6.0 auto True\n", "7 h5py 3.12.1 3.1.0 True\n", "8 pandas 1.5.3 1.0.3 True\n", "9 velocyto 0.17.17 0.17 True\n", "10 umap-learn 0.5.7 auto True\n", "11 pyarrow 18.1.0 0.17 True\n", "12 tqdm 4.67.1 4.45 True\n", "13 igraph 0.11.8 0.10.1 True\n", "14 louvain 0.8.2 auto True\n", "15 jupyter 1.1.1 auto True\n", "16 anndata 0.10.9 0.7.5 True\n", "17 scanpy 1.10.3 1.6 True\n", "18 joblib 1.4.2 auto True\n", "19 goatools 1.4.12 auto True\n", "20 genomepy 0.16.1 0.8.4 True\n", "21 gimmemotifs 0.17.0 0.14.4 True" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from celloracle import motif_analysis as ma\n", "import celloracle as co\n", "co.__version__\n", "co.check_python_requirements()" ] }, { "cell_type": "markdown", "id": "c202a00f-6203-4c5a-bb33-1354c9adfdc3", "metadata": {}, "source": [ "## load peak and get the cicero results" ] }, { "cell_type": "code", "execution_count": 4, "id": "e7afc618-5b85-4e03-998d-bbb1d54b4e70", "metadata": {}, "outputs": [], "source": [ "result_dir = '../../data/iPSC_example/base_GRN'\n", "adata = sc.read(('../../data/iPSC_example/ATAC_data/adata_atac_raw.h5ad'))" ] }, { "cell_type": "code", "execution_count": 5, "id": "4b906756-d341-4537-b827-486fa3146b31", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['chr1_819722_820222', 'chr1_827288_827788', 'chr1_838176_838676',\n", " ..., 'chrX_155962954_155963454', 'chrX_155966781_155967281',\n", " 'chrX_155971350_155971850'], dtype=object)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "peaks = np.array(adata.var_names)\n", "peaks" ] }, { "cell_type": "code", "execution_count": 6, "id": "b8e33aa7-87b6-44ed-9b83-42398f33bc38", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Peak1Peak2coaccess
1chr10_100000290_100000790chr10_99779229_99779729-0.001031
2chr10_100000290_100000790chr10_99779759_997802590.001412
3chr10_100000290_100000790chr10_99785581_997860810.000000
4chr10_100000290_100000790chr10_99788840_997893400.000000
5chr10_100000290_100000790chr10_99845752_998462520.000482
\n", "
" ], "text/plain": [ " Peak1 Peak2 coaccess\n", "1 chr10_100000290_100000790 chr10_99779229_99779729 -0.001031\n", "2 chr10_100000290_100000790 chr10_99779759_99780259 0.001412\n", "3 chr10_100000290_100000790 chr10_99785581_99786081 0.000000\n", "4 chr10_100000290_100000790 chr10_99788840_99789340 0.000000\n", "5 chr10_100000290_100000790 chr10_99845752_99846252 0.000482" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load Cicero coaccessibility scores.\n", "cicero_connections = pd.read_csv(os.path.join(result_dir,\"cicero_connections_stream.csv\"), index_col=0)\n", "cicero_connections.head()" ] }, { "cell_type": "code", "execution_count": 7, "id": "e632ee51-f39b-4d0f-a05b-c055c0ffde25", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(22818160, 3)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cicero_connections.shape" ] }, { "cell_type": "markdown", "id": "4511a1c5-a62a-465d-8353-acccc42b083b", "metadata": {}, "source": [ "## annotate tss" ] }, { "cell_type": "code", "execution_count": 8, "id": "044e9949-f23e-4ff1-8acf-7d9bbf4a6d84", "metadata": { "scrolled": true, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
speciesref_genomeprovider
0Humanhg38UCSC
1Humanhg19UCSC
2Mousemm39UCSC
3Mousemm10UCSC
4Mousemm9UCSC
5S.cerevisiaesacCer2UCSC
6S.cerevisiaesacCer3UCSC
7ZebrafishdanRer7UCSC
8ZebrafishdanRer10UCSC
9ZebrafishdanRer11UCSC
10Xenopus tropicalisxenTro2UCSC
11Xenopus tropicalisxenTro3UCSC
12Xenopus laevisXenopus_laevis_v10.1NCBI
13Ratrn4UCSC
14Ratrn5UCSC
15Ratrn6UCSC
16Drosophiladm3UCSC
17Drosophiladm6UCSC
18C.elegansce6UCSC
19C.elegansce10UCSC
20ArabidopsisTAIR10Ensembl
21ChickengalGal4UCSC
22ChickengalGal5UCSC
23ChickengalGal6UCSC
24Guinea_PigCavpor3.0Ensembl
25PigSscrofa11.1Ensembl
26AxolotlAmexG_v6.0-DDAxolotl-omics.org
\n", "
" ], "text/plain": [ " species ref_genome provider\n", "0 Human hg38 UCSC\n", "1 Human hg19 UCSC\n", "2 Mouse mm39 UCSC\n", "3 Mouse mm10 UCSC\n", "4 Mouse mm9 UCSC\n", "5 S.cerevisiae sacCer2 UCSC\n", "6 S.cerevisiae sacCer3 UCSC\n", "7 Zebrafish danRer7 UCSC\n", "8 Zebrafish danRer10 UCSC\n", "9 Zebrafish danRer11 UCSC\n", "10 Xenopus tropicalis xenTro2 UCSC\n", "11 Xenopus tropicalis xenTro3 UCSC\n", "12 Xenopus laevis Xenopus_laevis_v10.1 NCBI\n", "13 Rat rn4 UCSC\n", "14 Rat rn5 UCSC\n", "15 Rat rn6 UCSC\n", "16 Drosophila dm3 UCSC\n", "17 Drosophila dm6 UCSC\n", "18 C.elegans ce6 UCSC\n", "19 C.elegans ce10 UCSC\n", "20 Arabidopsis TAIR10 Ensembl\n", "21 Chicken galGal4 UCSC\n", "22 Chicken galGal5 UCSC\n", "23 Chicken galGal6 UCSC\n", "24 Guinea_Pig Cavpor3.0 Ensembl\n", "25 Pig Sscrofa11.1 Ensembl\n", "26 Axolotl AmexG_v6.0-DD Axolotl-omics.org" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ma.SUPPORTED_REF_GENOME" ] }, { "cell_type": "code", "execution_count": 9, "id": "cb203001-8b57-482c-a2ff-5dc0a34d2527", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "que bed peaks: 246132\n", "tss peaks in que: 33757\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
chrstartendgene_short_namestrand
33752chr5149550467149550967CSNK1A1-
33753chr5149551161149551661CSNK1A1-
33754chr201067380210674302JAG1-
33755chr201067462010675120JAG1-
33756chr9122228522122229022LHX6-
\n", "
" ], "text/plain": [ " chr start end gene_short_name strand\n", "33752 chr5 149550467 149550967 CSNK1A1 -\n", "33753 chr5 149551161 149551661 CSNK1A1 -\n", "33754 chr20 10673802 10674302 JAG1 -\n", "33755 chr20 10674620 10675120 JAG1 -\n", "33756 chr9 122228522 122229022 LHX6 -" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##!! Please make sure to specify the correct reference genome here\n", "tss_annotated = ma.get_tss_info(peak_str_list=peaks, ref_genome=\"hg38\") # Note that the ref_genome should be changed according to your data\n", "\n", "# Check results\n", "tss_annotated.tail()\n" ] }, { "cell_type": "markdown", "id": "63dfea16-d8ef-441b-adc4-4e9fae10fd5e", "metadata": {}, "source": [ "## interate tss and cicero result" ] }, { "cell_type": "code", "execution_count": 10, "id": "2daa61c0-68bd-4ab6-b01b-65f8c5dbdfe1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(938050, 3)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
peak_idgene_short_namecoaccess
0chr10_100000290_100000790BLOC1S20.000119
1chr10_100000290_100000790CWF19L10.001710
2chr10_100000290_100000790DNMBP0.003578
3chr10_100000290_100000790ERLIN10.002079
4chr10_100000290_100000790OLMALINC0.000491
\n", "
" ], "text/plain": [ " peak_id gene_short_name coaccess\n", "0 chr10_100000290_100000790 BLOC1S2 0.000119\n", "1 chr10_100000290_100000790 CWF19L1 0.001710\n", "2 chr10_100000290_100000790 DNMBP 0.003578\n", "3 chr10_100000290_100000790 ERLIN1 0.002079\n", "4 chr10_100000290_100000790 OLMALINC 0.000491" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "integrated = ma.integrate_tss_peak_with_cicero(tss_peak=tss_annotated,\n", " cicero_connections=cicero_connections)\n", "print(integrated.shape)\n", "integrated.head()" ] }, { "cell_type": "code", "execution_count": 11, "id": "e4002a74-e5b1-4c39-a43c-b665492a4c93", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(31844, 2)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
peak_idgene_short_name
0chr10_100009740_100010240DNMBP
1chr10_100185817_100186317ERLIN1
2chr10_100186339_100186839ERLIN1
3chr10_100229388_100229888CHUK
4chr10_100267454_100267954CWF19L1
\n", "
" ], "text/plain": [ " peak_id gene_short_name\n", "0 chr10_100009740_100010240 DNMBP\n", "1 chr10_100185817_100186317 ERLIN1\n", "2 chr10_100186339_100186839 ERLIN1\n", "3 chr10_100229388_100229888 CHUK\n", "4 chr10_100267454_100267954 CWF19L1" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "peak = integrated[integrated.coaccess >= 0.8]\n", "peak = peak[[\"peak_id\", \"gene_short_name\"]].reset_index(drop=True)\n", "\n", "print(peak.shape)\n", "peak.head()" ] }, { "cell_type": "markdown", "id": "05b1aaa3-c0d9-456c-9f4f-7f330cc41c8a", "metadata": {}, "source": [ "## save data" ] }, { "cell_type": "code", "execution_count": 12, "id": "902b103c-420e-44ae-842b-e419769edb2c", "metadata": {}, "outputs": [], "source": [ "peak.to_csv(os.path.join(result_dir,\"processed_peak.csv\"))" ] } ], "metadata": { "kernelspec": { "display_name": "TFcomb", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.21" } }, "nbformat": 4, "nbformat_minor": 5 }