CellOracle - TF binding motif scan

This notebook is used to scan every peak with TF motif, and construct the base GRN of TFs to target genes

import

[1]:
import pandas as pd
import numpy as np
import os
[2]:
import celloracle as co
from celloracle import motif_analysis as ma
from celloracle.utility import save_as_pickled_object
co.__version__
[2]:
'0.12.0'

reference genome preparation

[3]:
# PLEASE make sure reference genome is correct.
ref_genome = "hg38"

genome_installation = ma.is_genome_installed(ref_genome=ref_genome)
print(ref_genome, "installation: ", genome_installation)

hg38 installation:  True

If the ref_genome is not installed, run codes below.

[1]:
# import genomepy
# genomepy.install_genome(name="hg38", provider="UCSC")

load data

[4]:
# Load annotated peak data.
result_dir = '../../data/iPSC_example/base_GRN'
peaks = pd.read_csv(os.path.join(result_dir,"processed_peak.csv"), index_col=0)
print(peaks.shape)
peaks.head()
(31844, 2)
[4]:
peak_id gene_short_name
0 chr10_100009740_100010240 DNMBP
1 chr10_100185817_100186317 ERLIN1
2 chr10_100186339_100186839 ERLIN1
3 chr10_100229388_100229888 CHUK
4 chr10_100267454_100267954 CWF19L1

Check peak format

[5]:
def decompose_chrstr(peak_str):
    """
    Args:
        peak_str (str): peak_str. e.g. 'chr1_3094484_3095479'

    Returns:
        tuple: chromosome name, start position, end position
    """

    *chr_, start, end = peak_str.split("_")
    chr_ = "_".join(chr_)
    return chr_, start, end

from genomepy import Genome

def check_peak_format(peaks_df, ref_genome):
    """
    Check peak format.
     (1) Check chromosome name.
     (2) Check peak size (length) and remove sort DNA sequences (<5bp)

    """

    df = peaks_df.copy()

    n_peaks_before = df.shape[0]

    # Decompose peaks and make df
    decomposed = [decompose_chrstr(peak_str) for peak_str in df["peak_id"]]
    df_decomposed = pd.DataFrame(np.array(decomposed), index=peaks_df.index)
    df_decomposed.columns = ["chr", "start", "end"]
    df_decomposed["start"] = df_decomposed["start"].astype(int)
    df_decomposed["end"] = df_decomposed["end"].astype(int)

    # Load genome data
    genome_data = Genome(ref_genome)
    all_chr_list = list(genome_data.keys())


    # DNA length check
    lengths = np.abs(df_decomposed["end"] - df_decomposed["start"])


    # Filter peaks with invalid chromosome name
    n_threshold = 30
    df = df[(lengths >= n_threshold) & df_decomposed.chr.isin(all_chr_list)]

    # DNA length check
    lengths = np.abs(df_decomposed["end"] - df_decomposed["start"])

    # Data counting
    n_invalid_length = len(lengths[lengths < n_threshold])
    n_peaks_invalid_chr = n_peaks_before - df_decomposed.chr.isin(all_chr_list).sum()
    n_peaks_after = df.shape[0]


    #
    print("Peaks before filtering: ", n_peaks_before)
    print("Peaks with invalid chr_name: ", n_peaks_invalid_chr)
    print("Peaks with invalid length: ", n_invalid_length)
    print("Peaks after filtering: ", n_peaks_after)

    return df


peaks = check_peak_format(peaks, ref_genome)
Peaks before filtering:  31844
Peaks with invalid chr_name:  0
Peaks with invalid length:  0
Peaks after filtering:  31844

filter and save

[10]:
# Reset filtering
tfi.reset_filtering()

# Do filtering
tfi.filter_motifs_by_score(threshold=10)

# Format post-filtering results.
tfi.make_TFinfo_dataframe_and_dictionary(verbose=True)



Filtering finished: 11988274 -> 2230755
1. Converting scanned results into one-hot encoded dataframe.
2. Converting results into dictionaries.
[11]:
df = tfi.to_dataframe()
print(df.shape)
df.head()

(31844, 1092)
[11]:
peak_id gene_short_name 9430076C15RIK AC002126.6 AC012531.1 AC168977.1 AC226150.2 AFP AHR AHRR ... ZNF8 ZNF816 ZSCAN10 ZSCAN16 ZSCAN26 ZSCAN31 ZSCAN4 ZSCAN4-PS2 ZSCAN4C ZSCAN4F
0 chr10_100009740_100010240 DNMBP 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 chr10_100185817_100186317 ERLIN1 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 chr10_100186339_100186839 ERLIN1 0 1 1 1 0 0 0 0 ... 0 0 0 0 0 0 1 1 1 1
3 chr10_100229388_100229888 CHUK 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 chr10_100267454_100267954 CWF19L1 1 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 1092 columns

[12]:
# Save result as a dataframe
df = tfi.to_dataframe()
df.to_parquet(os.path.join(result_dir,"base_GRN.parquet"))
[ ]: