Overview
CEABiGR lncRNA-mRNA pearson code
This notebook quantifies co-expression relationships between lncRNAs and mRNAs using Pearson correlation with the corrected mRNA count matrix created using the CDS fasta from NCBI to retrieve gene-level counts.
Pairwise Pearson correlations were calculated between lncRNA and mRNA expression profiles across samples. Statistical significance was assessed by converting correlation coefficients to t-statistics and corresponding p-values, followed by Benjamini–Hochberg correction to control the false discovery rate (FDR). To prioritize biologically meaningful relationships, only strong correlations (|r| ≥ 0.8) with FDR ≤ 0.05 were retained. This combined threshold ensured that retained pairs reflected both statistically robust and high-magnitude co-expression patterns.
I filtered for correlations that are both strong (|r| ≥ 0.8) and statistically robust after multiple testing correction (FDR ≤ 0.05), ensuring high-confidence, biologically meaningful co-expression relationships.
Load and prepare data
library(tidyverse)
library(tibble)
data_dir <- "~/github/oyster-lnc/data/15-pearsons"
lnc_file <- file.path(data_dir, "lnc_counts.tsv")
mrna_file <- "~/github/oyster-lnc/output/11-mRNA-counts-DESeq2/kallisto_gene_counts.tsv"
lnc_raw <- readr::read_tsv(lnc_file, show_col_types = FALSE)
colnames(lnc_raw)[1] <- "gene_id"
lnc_mat <- lnc_raw %>%
column_to_rownames("gene_id") %>%
as.matrix()
mrna_raw <- readr::read_tsv(mrna_file, show_col_types = FALSE)
mrna_mat <- mrna_raw %>%
column_to_rownames("gene_id") %>%
as.matrix()
lnc_mat <- apply(lnc_mat, 2, as.numeric)
rownames(lnc_mat) <- lnc_raw$gene_id
mrna_mat <- apply(mrna_mat, 2, as.numeric)
rownames(mrna_mat) <- mrna_raw$gene_id
Align samples
common_samples <- intersect(colnames(lnc_mat), colnames(mrna_mat))
lnc_mat <- lnc_mat[, common_samples, drop = FALSE]
mrna_mat <- mrna_mat[, common_samples, drop = FALSE]
n_samples <- length(common_samples)
n_samples
Filter low-expression features
filter_rows <- function(mat, min_count = 10, min_samples = 2) {
keep <- rowSums(mat >= min_count) >= min_samples
mat[keep, , drop = FALSE]
}
lnc_mat_f <- filter_rows(lnc_mat)
mrna_mat_f <- filter_rows(mrna_mat)
Log-transform
lnc_log <- log2(lnc_mat_f + 1)
mrna_log <- log2(mrna_mat_f + 1)
Pearson correlation
cor_mat <- cor(
t(lnc_log),
t(mrna_log),
method = "pearson",
use = "pairwise.complete.obs"
)
P-values and FDR
r_vals <- cor_mat
r_vals[r_vals > 0.999999] <- 0.999999
r_vals[r_vals < -0.999999] <- -0.999999
t_vals <- r_vals * sqrt((n_samples - 2) / (1 - r_vals^2))
p_vals <- 2 * pt(-abs(t_vals), df = n_samples - 2)
coexp_df <- as.data.frame(as.table(cor_mat)) %>%
as_tibble() %>%
rename(lnc_id = Var1, mrna_id = Var2, r = n)
p_df <- as.data.frame(as.table(p_vals)) %>%
as_tibble() %>%
rename(lnc_id = Var1, mrna_id = Var2, p_value = n)
coexp_df <- coexp_df %>%
left_join(p_df, by = c("lnc_id", "mrna_id")) %>%
mutate(FDR = p.adjust(p_value, method = "BH"))
Significant pairs
sig_pairs <- coexp_df %>%
filter(abs(r) >= 0.8, FDR <= 0.05) %>%
arrange(FDR, desc(abs(r)))
head(sig_pairs)
Summary statistics
summary_stats <- tibble(
metric = c(
"Total pairs tested",
"Significant pairs",
"Unique lncRNAs",
"Unique mRNAs"
),
value = c(
nrow(coexp_df),
nrow(sig_pairs),
n_distinct(sig_pairs$lnc_id),
n_distinct(sig_pairs$mrna_id)
)
)
summary_stats
| Metric | Value |
|---|---|
| Total pairs tested | 150487011 |
| Significant pairs | 22233754 |
| Unique lncRNAs in significant pairs | 3847 |
| Unique mRNAs in significant pairs | 25355 |
Directionality
direction_summary <- sig_pairs %>%
mutate(direction = if_else(r > 0, "positive", "negative")) %>%
count(direction) %>%
mutate(prop = n / sum(n))
direction_summary
| Direction | n_pairs | prop |
|---|---|---|
| negative | 2967679 | 0.1334763 |
| positive | 19266075 | 0.8665237 |
Hub analysis
lnc_hubs <- sig_pairs %>%
count(lnc_id, name = "n_correlated_mRNAs") %>%
arrange(desc(n_correlated_mRNAs))
lnc_hubs %>% slice_head(n = 20)
mrna_hubs <- sig_pairs %>%
count(mrna_id, name = "n_correlated_lncRNAs") %>%
arrange(desc(n_correlated_lncRNAs))
mrna_hubs %>% slice_head(n = 20)
| lnc_id | n_correlated_mRNAs |
|---|---|
| rna-XR_002635402.1 | 13553 |
| rna-XR_002633775.1 | 13549 |
| rna-XR_002633774.1 | 13549 |
| rna-XR_002633772.1 | 13522 |
| rna-XR_002633773.1 | 13522 |
| rna-XR_002638587.1 | 13521 |
| rna-XR_002638586.1 | 13513 |
| rna-XR_002634071.1 | 13473 |
| rna-XR_002637475.1 | 13472 |
| rna-XR_002637939.1 | 13471 |
| mrna_id | n_correlated_lncRNAs |
|---|---|
| LOC111137995 | 2495 |
| LOC111106094 | 2491 |
| LOC111105683 | 2477 |
| LOC111122657 | 2471 |
| LOC111132339 | 2471 |
| LOC111104812 | 2468 |
| LOC111131182 | 2467 |
| LOC111124166 | 2466 |
| LOC111137787 | 2465 |
| LOC111138469 | 2465 |
Notes
- Correlations are computed across shared samples
- Filtering removes low-expression noise
- Output supports downstream network and functional analyses