{
"cells": [
{
"cell_type": "markdown",
"id": "e6c1a04b",
"metadata": {},
"source": [
"# CellOracle - TF binding motif scan"
]
},
{
"cell_type": "markdown",
"id": "12add21a",
"metadata": {},
"source": [
"This notebook is used to scan every peak with TF motif, and construct the base GRN of TFs to target genes"
]
},
{
"cell_type": "markdown",
"id": "5c440993",
"metadata": {},
"source": [
"## import"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "00a6da56-4f53-4b62-aae8-daa085d8706d",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "0f40d3f9-c4d0-4ef5-8b13-86066202d5ca",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'0.12.0'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import celloracle as co\n",
"from celloracle import motif_analysis as ma\n",
"from celloracle.utility import save_as_pickled_object\n",
"co.__version__"
]
},
{
"cell_type": "markdown",
"id": "7963e0de-bc54-491b-b081-227e5ca7a34d",
"metadata": {},
"source": [
"## reference genome preparation"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "444192ca-9b6c-459a-8d67-56a4d6cad538",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"hg38 installation: True\n"
]
}
],
"source": [
"# PLEASE make sure reference genome is correct.\n",
"ref_genome = \"hg38\"\n",
"\n",
"genome_installation = ma.is_genome_installed(ref_genome=ref_genome)\n",
"print(ref_genome, \"installation: \", genome_installation)\n"
]
},
{
"cell_type": "markdown",
"id": "a02110e2",
"metadata": {},
"source": [
"If the ref_genome is not installed, run codes below."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "3ccf8db1-a0d5-47f4-8da4-f1dbb20603fd",
"metadata": {},
"outputs": [],
"source": [
"# import genomepy\n",
"# genomepy.install_genome(name=\"hg38\", provider=\"UCSC\")"
]
},
{
"cell_type": "markdown",
"id": "677777e2-2594-4ca0-ad10-cf603d1526ce",
"metadata": {},
"source": [
"## load data"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "b8196e67-ec0d-48b6-b717-09efdede2b07",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(31844, 2)\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" peak_id | \n",
" gene_short_name | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" chr10_100009740_100010240 | \n",
" DNMBP | \n",
"
\n",
" \n",
" | 1 | \n",
" chr10_100185817_100186317 | \n",
" ERLIN1 | \n",
"
\n",
" \n",
" | 2 | \n",
" chr10_100186339_100186839 | \n",
" ERLIN1 | \n",
"
\n",
" \n",
" | 3 | \n",
" chr10_100229388_100229888 | \n",
" CHUK | \n",
"
\n",
" \n",
" | 4 | \n",
" chr10_100267454_100267954 | \n",
" CWF19L1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" peak_id gene_short_name\n",
"0 chr10_100009740_100010240 DNMBP\n",
"1 chr10_100185817_100186317 ERLIN1\n",
"2 chr10_100186339_100186839 ERLIN1\n",
"3 chr10_100229388_100229888 CHUK\n",
"4 chr10_100267454_100267954 CWF19L1"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load annotated peak data.\n",
"result_dir = '../../data/iPSC_example/base_GRN'\n",
"peaks = pd.read_csv(os.path.join(result_dir,\"processed_peak.csv\"), index_col=0)\n",
"print(peaks.shape)\n",
"peaks.head()"
]
},
{
"cell_type": "markdown",
"id": "6e247d05",
"metadata": {},
"source": [
"## Check peak format"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "197cb71b-5abe-4c74-83bf-dfe1d65b206f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Peaks before filtering: 31844\n",
"Peaks with invalid chr_name: 0\n",
"Peaks with invalid length: 0\n",
"Peaks after filtering: 31844\n"
]
}
],
"source": [
"def decompose_chrstr(peak_str):\n",
" \"\"\"\n",
" Args:\n",
" peak_str (str): peak_str. e.g. 'chr1_3094484_3095479'\n",
"\n",
" Returns:\n",
" tuple: chromosome name, start position, end position\n",
" \"\"\"\n",
"\n",
" *chr_, start, end = peak_str.split(\"_\")\n",
" chr_ = \"_\".join(chr_)\n",
" return chr_, start, end\n",
"\n",
"from genomepy import Genome\n",
"\n",
"def check_peak_format(peaks_df, ref_genome):\n",
" \"\"\"\n",
" Check peak format.\n",
" (1) Check chromosome name.\n",
" (2) Check peak size (length) and remove sort DNA sequences (<5bp)\n",
"\n",
" \"\"\"\n",
"\n",
" df = peaks_df.copy()\n",
"\n",
" n_peaks_before = df.shape[0]\n",
"\n",
" # Decompose peaks and make df\n",
" decomposed = [decompose_chrstr(peak_str) for peak_str in df[\"peak_id\"]]\n",
" df_decomposed = pd.DataFrame(np.array(decomposed), index=peaks_df.index)\n",
" df_decomposed.columns = [\"chr\", \"start\", \"end\"]\n",
" df_decomposed[\"start\"] = df_decomposed[\"start\"].astype(int)\n",
" df_decomposed[\"end\"] = df_decomposed[\"end\"].astype(int)\n",
"\n",
" # Load genome data\n",
" genome_data = Genome(ref_genome)\n",
" all_chr_list = list(genome_data.keys())\n",
"\n",
"\n",
" # DNA length check\n",
" lengths = np.abs(df_decomposed[\"end\"] - df_decomposed[\"start\"])\n",
"\n",
"\n",
" # Filter peaks with invalid chromosome name\n",
" n_threshold = 30\n",
" df = df[(lengths >= n_threshold) & df_decomposed.chr.isin(all_chr_list)]\n",
"\n",
" # DNA length check\n",
" lengths = np.abs(df_decomposed[\"end\"] - df_decomposed[\"start\"])\n",
"\n",
" # Data counting\n",
" n_invalid_length = len(lengths[lengths < n_threshold])\n",
" n_peaks_invalid_chr = n_peaks_before - df_decomposed.chr.isin(all_chr_list).sum()\n",
" n_peaks_after = df.shape[0]\n",
"\n",
"\n",
" #\n",
" print(\"Peaks before filtering: \", n_peaks_before)\n",
" print(\"Peaks with invalid chr_name: \", n_peaks_invalid_chr)\n",
" print(\"Peaks with invalid length: \", n_invalid_length)\n",
" print(\"Peaks after filtering: \", n_peaks_after)\n",
"\n",
" return df\n",
"\n",
"\n",
"peaks = check_peak_format(peaks, ref_genome)"
]
},
{
"cell_type": "markdown",
"id": "2589629f-7b84-4eac-8542-bf923eec9136",
"metadata": {},
"source": [
"## tf binding motifs search"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b5b25efa-e226-4022-ab77-e615c6b740a4",
"metadata": {},
"outputs": [],
"source": [
"# Instantiate TFinfo object\n",
"tfi = ma.TFinfo(peak_data_frame=peaks,\n",
" ref_genome=ref_genome)\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "74110af1-91b1-4d2d-825b-f5d6cd17dca9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"29947"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(tfi.all_peaks)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "1b6a5cbd-e0ce-4904-89fa-627d226bea20",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"No motif data entered. Loading default motifs for your species ...\n",
" Default motif for vertebrate: gimme.vertebrate.v5.0. \n",
" For more information, please see https://gimmemotifs.readthedocs.io/en/master/overview.html \n",
"\n",
"Initiating scanner... \n",
"\n",
"Calculating FPR-based threshold. This step may take substantial time when you load a new ref-genome. It will be done quicker on the second time. \n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2025-01-02 15:09:46,498 - INFO - determining FPR-based threshold\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Motif scan started .. It may take long time.\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5711ea373a2b4646a91117fe9b6dce9c",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"scanning: 0%| | 0/29947 [00:00, ? sequences/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 15min 53s, sys: 1min 37s, total: 17min 30s\n",
"Wall time: 18min 9s\n"
]
}
],
"source": [
"%%time\n",
"# Scan motifs. !!CAUTION!! This step may take several hours if you have many peaks!\n",
"tfi.scan(fpr=0.02,\n",
" motifs=None, # If you enter None, default motifs will be loaded.\n",
" verbose=True)\n",
"\n",
"# Save tfinfo object\n",
"tfi.to_hdf5(file_path=os.path.join(result_dir,\"motif_scan.celloracle.tfinfo\"))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "a4b6b934-232c-4412-8c34-e39df1908705",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" seqname | \n",
" motif_id | \n",
" factors_direct | \n",
" factors_indirect | \n",
" score | \n",
" pos | \n",
" strand | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" chr10_100009740_100010240 | \n",
" GM.5.0.Mixed.0001 | \n",
" | \n",
" EGR1, SRF | \n",
" 8.555754 | \n",
" 56 | \n",
" -1 | \n",
"
\n",
" \n",
" | 1 | \n",
" chr10_100009740_100010240 | \n",
" GM.5.0.Mixed.0001 | \n",
" | \n",
" EGR1, SRF | \n",
" 8.297692 | \n",
" 315 | \n",
" -1 | \n",
"
\n",
" \n",
" | 2 | \n",
" chr10_100009740_100010240 | \n",
" GM.5.0.Mixed.0001 | \n",
" | \n",
" EGR1, SRF | \n",
" 7.898092 | \n",
" 423 | \n",
" -1 | \n",
"
\n",
" \n",
" | 3 | \n",
" chr10_100009740_100010240 | \n",
" GM.5.0.Mixed.0001 | \n",
" | \n",
" EGR1, SRF | \n",
" 7.659488 | \n",
" 424 | \n",
" -1 | \n",
"
\n",
" \n",
" | 4 | \n",
" chr10_100009740_100010240 | \n",
" GM.5.0.Mixed.0001 | \n",
" | \n",
" EGR1, SRF | \n",
" 7.555011 | \n",
" 52 | \n",
" -1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" seqname motif_id factors_direct \\\n",
"0 chr10_100009740_100010240 GM.5.0.Mixed.0001 \n",
"1 chr10_100009740_100010240 GM.5.0.Mixed.0001 \n",
"2 chr10_100009740_100010240 GM.5.0.Mixed.0001 \n",
"3 chr10_100009740_100010240 GM.5.0.Mixed.0001 \n",
"4 chr10_100009740_100010240 GM.5.0.Mixed.0001 \n",
"\n",
" factors_indirect score pos strand \n",
"0 EGR1, SRF 8.555754 56 -1 \n",
"1 EGR1, SRF 8.297692 315 -1 \n",
"2 EGR1, SRF 7.898092 423 -1 \n",
"3 EGR1, SRF 7.659488 424 -1 \n",
"4 EGR1, SRF 7.555011 52 -1 "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Check motif scan results\n",
"tfi.scanned_df.head()"
]
},
{
"cell_type": "markdown",
"id": "0d78066a-21e4-4407-9b4d-7bd7ef439735",
"metadata": {},
"source": [
"## filter and save"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "36e79d11-275e-43ba-b749-d8799280ffed",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Filtering finished: 11988274 -> 2230755\n",
"1. Converting scanned results into one-hot encoded dataframe.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "826b5564f14a4a358305fcf7f4e26bb9",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/29947 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"2. Converting results into dictionaries.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "4df2b22f109d4e738437a08fd398ad37",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/18404 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "97c40f381d4d4d36a298f1338647e085",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/1090 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Reset filtering\n",
"tfi.reset_filtering()\n",
"\n",
"# Do filtering\n",
"tfi.filter_motifs_by_score(threshold=10)\n",
"\n",
"# Format post-filtering results.\n",
"tfi.make_TFinfo_dataframe_and_dictionary(verbose=True)\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "89ff02a3-9300-4c5e-8f19-f5a80a9ee317",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(31844, 1092)\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" peak_id | \n",
" gene_short_name | \n",
" 9430076C15RIK | \n",
" AC002126.6 | \n",
" AC012531.1 | \n",
" AC168977.1 | \n",
" AC226150.2 | \n",
" AFP | \n",
" AHR | \n",
" AHRR | \n",
" ... | \n",
" ZNF8 | \n",
" ZNF816 | \n",
" ZSCAN10 | \n",
" ZSCAN16 | \n",
" ZSCAN26 | \n",
" ZSCAN31 | \n",
" ZSCAN4 | \n",
" ZSCAN4-PS2 | \n",
" ZSCAN4C | \n",
" ZSCAN4F | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" chr10_100009740_100010240 | \n",
" DNMBP | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" chr10_100185817_100186317 | \n",
" ERLIN1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" chr10_100186339_100186839 | \n",
" ERLIN1 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" | 3 | \n",
" chr10_100229388_100229888 | \n",
" CHUK | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 4 | \n",
" chr10_100267454_100267954 | \n",
" CWF19L1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 1092 columns
\n",
"
"
],
"text/plain": [
" peak_id gene_short_name 9430076C15RIK AC002126.6 \\\n",
"0 chr10_100009740_100010240 DNMBP 0 0 \n",
"1 chr10_100185817_100186317 ERLIN1 0 0 \n",
"2 chr10_100186339_100186839 ERLIN1 0 1 \n",
"3 chr10_100229388_100229888 CHUK 0 0 \n",
"4 chr10_100267454_100267954 CWF19L1 1 0 \n",
"\n",
" AC012531.1 AC168977.1 AC226150.2 AFP AHR AHRR ... ZNF8 ZNF816 \\\n",
"0 0 1 0 0 0 0 ... 0 0 \n",
"1 0 1 0 0 0 0 ... 0 0 \n",
"2 1 1 0 0 0 0 ... 0 0 \n",
"3 0 1 0 0 0 0 ... 0 0 \n",
"4 0 1 0 0 0 0 ... 0 0 \n",
"\n",
" ZSCAN10 ZSCAN16 ZSCAN26 ZSCAN31 ZSCAN4 ZSCAN4-PS2 ZSCAN4C ZSCAN4F \n",
"0 0 0 0 0 0 0 0 0 \n",
"1 0 0 0 0 0 0 0 0 \n",
"2 0 0 0 0 1 1 1 1 \n",
"3 0 0 0 0 0 0 0 0 \n",
"4 0 0 0 0 0 0 0 0 \n",
"\n",
"[5 rows x 1092 columns]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = tfi.to_dataframe()\n",
"print(df.shape)\n",
"df.head()\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "ef6d7c01-4776-4142-8cfd-941b6cbf30d2",
"metadata": {},
"outputs": [],
"source": [
"# Save result as a dataframe\n",
"df = tfi.to_dataframe()\n",
"df.to_parquet(os.path.join(result_dir,\"base_GRN.parquet\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "327f5539",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "TFcomb",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.21"
}
},
"nbformat": 4,
"nbformat_minor": 5
}