CellOracle - TF binding motif scan¶
This notebook is used to scan every peak with TF motif, and construct the base GRN of TFs to target genes
import¶
[1]:
import pandas as pd
import numpy as np
import os
[2]:
import celloracle as co
from celloracle import motif_analysis as ma
from celloracle.utility import save_as_pickled_object
co.__version__
[2]:
'0.12.0'
reference genome preparation¶
[3]:
# PLEASE make sure reference genome is correct.
ref_genome = "hg38"
genome_installation = ma.is_genome_installed(ref_genome=ref_genome)
print(ref_genome, "installation: ", genome_installation)
hg38 installation: True
If the ref_genome is not installed, run codes below.
[1]:
# import genomepy
# genomepy.install_genome(name="hg38", provider="UCSC")
load data¶
[4]:
# Load annotated peak data.
result_dir = '../../data/iPSC_example/base_GRN'
peaks = pd.read_csv(os.path.join(result_dir,"processed_peak.csv"), index_col=0)
print(peaks.shape)
peaks.head()
(31844, 2)
[4]:
| peak_id | gene_short_name | |
|---|---|---|
| 0 | chr10_100009740_100010240 | DNMBP |
| 1 | chr10_100185817_100186317 | ERLIN1 |
| 2 | chr10_100186339_100186839 | ERLIN1 |
| 3 | chr10_100229388_100229888 | CHUK |
| 4 | chr10_100267454_100267954 | CWF19L1 |
Check peak format¶
[5]:
def decompose_chrstr(peak_str):
"""
Args:
peak_str (str): peak_str. e.g. 'chr1_3094484_3095479'
Returns:
tuple: chromosome name, start position, end position
"""
*chr_, start, end = peak_str.split("_")
chr_ = "_".join(chr_)
return chr_, start, end
from genomepy import Genome
def check_peak_format(peaks_df, ref_genome):
"""
Check peak format.
(1) Check chromosome name.
(2) Check peak size (length) and remove sort DNA sequences (<5bp)
"""
df = peaks_df.copy()
n_peaks_before = df.shape[0]
# Decompose peaks and make df
decomposed = [decompose_chrstr(peak_str) for peak_str in df["peak_id"]]
df_decomposed = pd.DataFrame(np.array(decomposed), index=peaks_df.index)
df_decomposed.columns = ["chr", "start", "end"]
df_decomposed["start"] = df_decomposed["start"].astype(int)
df_decomposed["end"] = df_decomposed["end"].astype(int)
# Load genome data
genome_data = Genome(ref_genome)
all_chr_list = list(genome_data.keys())
# DNA length check
lengths = np.abs(df_decomposed["end"] - df_decomposed["start"])
# Filter peaks with invalid chromosome name
n_threshold = 30
df = df[(lengths >= n_threshold) & df_decomposed.chr.isin(all_chr_list)]
# DNA length check
lengths = np.abs(df_decomposed["end"] - df_decomposed["start"])
# Data counting
n_invalid_length = len(lengths[lengths < n_threshold])
n_peaks_invalid_chr = n_peaks_before - df_decomposed.chr.isin(all_chr_list).sum()
n_peaks_after = df.shape[0]
#
print("Peaks before filtering: ", n_peaks_before)
print("Peaks with invalid chr_name: ", n_peaks_invalid_chr)
print("Peaks with invalid length: ", n_invalid_length)
print("Peaks after filtering: ", n_peaks_after)
return df
peaks = check_peak_format(peaks, ref_genome)
Peaks before filtering: 31844
Peaks with invalid chr_name: 0
Peaks with invalid length: 0
Peaks after filtering: 31844
tf binding motifs search¶
[6]:
# Instantiate TFinfo object
tfi = ma.TFinfo(peak_data_frame=peaks,
ref_genome=ref_genome)
[7]:
len(tfi.all_peaks)
[7]:
29947
[8]:
%%time
# Scan motifs. !!CAUTION!! This step may take several hours if you have many peaks!
tfi.scan(fpr=0.02,
motifs=None, # If you enter None, default motifs will be loaded.
verbose=True)
# Save tfinfo object
tfi.to_hdf5(file_path=os.path.join(result_dir,"motif_scan.celloracle.tfinfo"))
No motif data entered. Loading default motifs for your species ...
Default motif for vertebrate: gimme.vertebrate.v5.0.
For more information, please see https://gimmemotifs.readthedocs.io/en/master/overview.html
Initiating scanner...
Calculating FPR-based threshold. This step may take substantial time when you load a new ref-genome. It will be done quicker on the second time.
2025-01-02 15:09:46,498 - INFO - determining FPR-based threshold
Motif scan started .. It may take long time.
CPU times: user 15min 53s, sys: 1min 37s, total: 17min 30s
Wall time: 18min 9s
[9]:
# Check motif scan results
tfi.scanned_df.head()
[9]:
| seqname | motif_id | factors_direct | factors_indirect | score | pos | strand | |
|---|---|---|---|---|---|---|---|
| 0 | chr10_100009740_100010240 | GM.5.0.Mixed.0001 | EGR1, SRF | 8.555754 | 56 | -1 | |
| 1 | chr10_100009740_100010240 | GM.5.0.Mixed.0001 | EGR1, SRF | 8.297692 | 315 | -1 | |
| 2 | chr10_100009740_100010240 | GM.5.0.Mixed.0001 | EGR1, SRF | 7.898092 | 423 | -1 | |
| 3 | chr10_100009740_100010240 | GM.5.0.Mixed.0001 | EGR1, SRF | 7.659488 | 424 | -1 | |
| 4 | chr10_100009740_100010240 | GM.5.0.Mixed.0001 | EGR1, SRF | 7.555011 | 52 | -1 |
filter and save¶
[10]:
# Reset filtering
tfi.reset_filtering()
# Do filtering
tfi.filter_motifs_by_score(threshold=10)
# Format post-filtering results.
tfi.make_TFinfo_dataframe_and_dictionary(verbose=True)
Filtering finished: 11988274 -> 2230755
1. Converting scanned results into one-hot encoded dataframe.
2. Converting results into dictionaries.
[11]:
df = tfi.to_dataframe()
print(df.shape)
df.head()
(31844, 1092)
[11]:
| peak_id | gene_short_name | 9430076C15RIK | AC002126.6 | AC012531.1 | AC168977.1 | AC226150.2 | AFP | AHR | AHRR | ... | ZNF8 | ZNF816 | ZSCAN10 | ZSCAN16 | ZSCAN26 | ZSCAN31 | ZSCAN4 | ZSCAN4-PS2 | ZSCAN4C | ZSCAN4F | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | chr10_100009740_100010240 | DNMBP | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | chr10_100185817_100186317 | ERLIN1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | chr10_100186339_100186839 | ERLIN1 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 |
| 3 | chr10_100229388_100229888 | CHUK | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | chr10_100267454_100267954 | CWF19L1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 1092 columns
[12]:
# Save result as a dataframe
df = tfi.to_dataframe()
df.to_parquet(os.path.join(result_dir,"base_GRN.parquet"))
[ ]: