{ "cells": [ { "cell_type": "markdown", "id": "e6c1a04b", "metadata": {}, "source": [ "# CellOracle - TF binding motif scan" ] }, { "cell_type": "markdown", "id": "12add21a", "metadata": {}, "source": [ "This notebook is used to scan every peak with TF motif, and construct the base GRN of TFs to target genes" ] }, { "cell_type": "markdown", "id": "5c440993", "metadata": {}, "source": [ "## import" ] }, { "cell_type": "code", "execution_count": 1, "id": "00a6da56-4f53-4b62-aae8-daa085d8706d", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import os" ] }, { "cell_type": "code", "execution_count": 2, "id": "0f40d3f9-c4d0-4ef5-8b13-86066202d5ca", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'0.12.0'" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import celloracle as co\n", "from celloracle import motif_analysis as ma\n", "from celloracle.utility import save_as_pickled_object\n", "co.__version__" ] }, { "cell_type": "markdown", "id": "7963e0de-bc54-491b-b081-227e5ca7a34d", "metadata": {}, "source": [ "## reference genome preparation" ] }, { "cell_type": "code", "execution_count": 3, "id": "444192ca-9b6c-459a-8d67-56a4d6cad538", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "hg38 installation: True\n" ] } ], "source": [ "# PLEASE make sure reference genome is correct.\n", "ref_genome = \"hg38\"\n", "\n", "genome_installation = ma.is_genome_installed(ref_genome=ref_genome)\n", "print(ref_genome, \"installation: \", genome_installation)\n" ] }, { "cell_type": "markdown", "id": "a02110e2", "metadata": {}, "source": [ "If the ref_genome is not installed, run codes below." ] }, { "cell_type": "code", "execution_count": 1, "id": "3ccf8db1-a0d5-47f4-8da4-f1dbb20603fd", "metadata": {}, "outputs": [], "source": [ "# import genomepy\n", "# genomepy.install_genome(name=\"hg38\", provider=\"UCSC\")" ] }, { "cell_type": "markdown", "id": "677777e2-2594-4ca0-ad10-cf603d1526ce", "metadata": {}, "source": [ "## load data" ] }, { "cell_type": "code", "execution_count": 4, "id": "b8196e67-ec0d-48b6-b717-09efdede2b07", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(31844, 2)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
peak_idgene_short_name
0chr10_100009740_100010240DNMBP
1chr10_100185817_100186317ERLIN1
2chr10_100186339_100186839ERLIN1
3chr10_100229388_100229888CHUK
4chr10_100267454_100267954CWF19L1
\n", "
" ], "text/plain": [ " peak_id gene_short_name\n", "0 chr10_100009740_100010240 DNMBP\n", "1 chr10_100185817_100186317 ERLIN1\n", "2 chr10_100186339_100186839 ERLIN1\n", "3 chr10_100229388_100229888 CHUK\n", "4 chr10_100267454_100267954 CWF19L1" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load annotated peak data.\n", "result_dir = '../../data/iPSC_example/base_GRN'\n", "peaks = pd.read_csv(os.path.join(result_dir,\"processed_peak.csv\"), index_col=0)\n", "print(peaks.shape)\n", "peaks.head()" ] }, { "cell_type": "markdown", "id": "6e247d05", "metadata": {}, "source": [ "## Check peak format" ] }, { "cell_type": "code", "execution_count": 5, "id": "197cb71b-5abe-4c74-83bf-dfe1d65b206f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Peaks before filtering: 31844\n", "Peaks with invalid chr_name: 0\n", "Peaks with invalid length: 0\n", "Peaks after filtering: 31844\n" ] } ], "source": [ "def decompose_chrstr(peak_str):\n", " \"\"\"\n", " Args:\n", " peak_str (str): peak_str. e.g. 'chr1_3094484_3095479'\n", "\n", " Returns:\n", " tuple: chromosome name, start position, end position\n", " \"\"\"\n", "\n", " *chr_, start, end = peak_str.split(\"_\")\n", " chr_ = \"_\".join(chr_)\n", " return chr_, start, end\n", "\n", "from genomepy import Genome\n", "\n", "def check_peak_format(peaks_df, ref_genome):\n", " \"\"\"\n", " Check peak format.\n", " (1) Check chromosome name.\n", " (2) Check peak size (length) and remove sort DNA sequences (<5bp)\n", "\n", " \"\"\"\n", "\n", " df = peaks_df.copy()\n", "\n", " n_peaks_before = df.shape[0]\n", "\n", " # Decompose peaks and make df\n", " decomposed = [decompose_chrstr(peak_str) for peak_str in df[\"peak_id\"]]\n", " df_decomposed = pd.DataFrame(np.array(decomposed), index=peaks_df.index)\n", " df_decomposed.columns = [\"chr\", \"start\", \"end\"]\n", " df_decomposed[\"start\"] = df_decomposed[\"start\"].astype(int)\n", " df_decomposed[\"end\"] = df_decomposed[\"end\"].astype(int)\n", "\n", " # Load genome data\n", " genome_data = Genome(ref_genome)\n", " all_chr_list = list(genome_data.keys())\n", "\n", "\n", " # DNA length check\n", " lengths = np.abs(df_decomposed[\"end\"] - df_decomposed[\"start\"])\n", "\n", "\n", " # Filter peaks with invalid chromosome name\n", " n_threshold = 30\n", " df = df[(lengths >= n_threshold) & df_decomposed.chr.isin(all_chr_list)]\n", "\n", " # DNA length check\n", " lengths = np.abs(df_decomposed[\"end\"] - df_decomposed[\"start\"])\n", "\n", " # Data counting\n", " n_invalid_length = len(lengths[lengths < n_threshold])\n", " n_peaks_invalid_chr = n_peaks_before - df_decomposed.chr.isin(all_chr_list).sum()\n", " n_peaks_after = df.shape[0]\n", "\n", "\n", " #\n", " print(\"Peaks before filtering: \", n_peaks_before)\n", " print(\"Peaks with invalid chr_name: \", n_peaks_invalid_chr)\n", " print(\"Peaks with invalid length: \", n_invalid_length)\n", " print(\"Peaks after filtering: \", n_peaks_after)\n", "\n", " return df\n", "\n", "\n", "peaks = check_peak_format(peaks, ref_genome)" ] }, { "cell_type": "markdown", "id": "2589629f-7b84-4eac-8542-bf923eec9136", "metadata": {}, "source": [ "## tf binding motifs search" ] }, { "cell_type": "code", "execution_count": 6, "id": "b5b25efa-e226-4022-ab77-e615c6b740a4", "metadata": {}, "outputs": [], "source": [ "# Instantiate TFinfo object\n", "tfi = ma.TFinfo(peak_data_frame=peaks,\n", " ref_genome=ref_genome)\n" ] }, { "cell_type": "code", "execution_count": 7, "id": "74110af1-91b1-4d2d-825b-f5d6cd17dca9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "29947" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(tfi.all_peaks)" ] }, { "cell_type": "code", "execution_count": 8, "id": "1b6a5cbd-e0ce-4904-89fa-627d226bea20", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "No motif data entered. Loading default motifs for your species ...\n", " Default motif for vertebrate: gimme.vertebrate.v5.0. \n", " For more information, please see https://gimmemotifs.readthedocs.io/en/master/overview.html \n", "\n", "Initiating scanner... \n", "\n", "Calculating FPR-based threshold. This step may take substantial time when you load a new ref-genome. It will be done quicker on the second time. \n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025-01-02 15:09:46,498 - INFO - determining FPR-based threshold\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Motif scan started .. It may take long time.\n", "\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "5711ea373a2b4646a91117fe9b6dce9c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "scanning: 0%| | 0/29947 [00:00\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
seqnamemotif_idfactors_directfactors_indirectscoreposstrand
0chr10_100009740_100010240GM.5.0.Mixed.0001EGR1, SRF8.55575456-1
1chr10_100009740_100010240GM.5.0.Mixed.0001EGR1, SRF8.297692315-1
2chr10_100009740_100010240GM.5.0.Mixed.0001EGR1, SRF7.898092423-1
3chr10_100009740_100010240GM.5.0.Mixed.0001EGR1, SRF7.659488424-1
4chr10_100009740_100010240GM.5.0.Mixed.0001EGR1, SRF7.55501152-1
\n", "" ], "text/plain": [ " seqname motif_id factors_direct \\\n", "0 chr10_100009740_100010240 GM.5.0.Mixed.0001 \n", "1 chr10_100009740_100010240 GM.5.0.Mixed.0001 \n", "2 chr10_100009740_100010240 GM.5.0.Mixed.0001 \n", "3 chr10_100009740_100010240 GM.5.0.Mixed.0001 \n", "4 chr10_100009740_100010240 GM.5.0.Mixed.0001 \n", "\n", " factors_indirect score pos strand \n", "0 EGR1, SRF 8.555754 56 -1 \n", "1 EGR1, SRF 8.297692 315 -1 \n", "2 EGR1, SRF 7.898092 423 -1 \n", "3 EGR1, SRF 7.659488 424 -1 \n", "4 EGR1, SRF 7.555011 52 -1 " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Check motif scan results\n", "tfi.scanned_df.head()" ] }, { "cell_type": "markdown", "id": "0d78066a-21e4-4407-9b4d-7bd7ef439735", "metadata": {}, "source": [ "## filter and save" ] }, { "cell_type": "code", "execution_count": 10, "id": "36e79d11-275e-43ba-b749-d8799280ffed", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Filtering finished: 11988274 -> 2230755\n", "1. Converting scanned results into one-hot encoded dataframe.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "826b5564f14a4a358305fcf7f4e26bb9", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/29947 [00:00\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
peak_idgene_short_name9430076C15RIKAC002126.6AC012531.1AC168977.1AC226150.2AFPAHRAHRR...ZNF8ZNF816ZSCAN10ZSCAN16ZSCAN26ZSCAN31ZSCAN4ZSCAN4-PS2ZSCAN4CZSCAN4F
0chr10_100009740_100010240DNMBP00010000...0000000000
1chr10_100185817_100186317ERLIN100010000...0000000000
2chr10_100186339_100186839ERLIN101110000...0000001111
3chr10_100229388_100229888CHUK00010000...0000000000
4chr10_100267454_100267954CWF19L110010000...0000000000
\n", "

5 rows × 1092 columns

\n", "" ], "text/plain": [ " peak_id gene_short_name 9430076C15RIK AC002126.6 \\\n", "0 chr10_100009740_100010240 DNMBP 0 0 \n", "1 chr10_100185817_100186317 ERLIN1 0 0 \n", "2 chr10_100186339_100186839 ERLIN1 0 1 \n", "3 chr10_100229388_100229888 CHUK 0 0 \n", "4 chr10_100267454_100267954 CWF19L1 1 0 \n", "\n", " AC012531.1 AC168977.1 AC226150.2 AFP AHR AHRR ... ZNF8 ZNF816 \\\n", "0 0 1 0 0 0 0 ... 0 0 \n", "1 0 1 0 0 0 0 ... 0 0 \n", "2 1 1 0 0 0 0 ... 0 0 \n", "3 0 1 0 0 0 0 ... 0 0 \n", "4 0 1 0 0 0 0 ... 0 0 \n", "\n", " ZSCAN10 ZSCAN16 ZSCAN26 ZSCAN31 ZSCAN4 ZSCAN4-PS2 ZSCAN4C ZSCAN4F \n", "0 0 0 0 0 0 0 0 0 \n", "1 0 0 0 0 0 0 0 0 \n", "2 0 0 0 0 1 1 1 1 \n", "3 0 0 0 0 0 0 0 0 \n", "4 0 0 0 0 0 0 0 0 \n", "\n", "[5 rows x 1092 columns]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = tfi.to_dataframe()\n", "print(df.shape)\n", "df.head()\n" ] }, { "cell_type": "code", "execution_count": 12, "id": "ef6d7c01-4776-4142-8cfd-941b6cbf30d2", "metadata": {}, "outputs": [], "source": [ "# Save result as a dataframe\n", "df = tfi.to_dataframe()\n", "df.to_parquet(os.path.join(result_dir,\"base_GRN.parquet\"))" ] }, { "cell_type": "code", "execution_count": null, "id": "327f5539", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "TFcomb", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.21" } }, "nbformat": 4, "nbformat_minor": 5 }