Overview

CEABiGR lncRNA-mRNA pearson code

This notebook quantifies co-expression relationships between lncRNAs and mRNAs using Pearson correlation with the corrected mRNA count matrix created using the CDS fasta from NCBI to retrieve gene-level counts.

Pairwise Pearson correlations were calculated between lncRNA and mRNA expression profiles across samples. Statistical significance was assessed by converting correlation coefficients to t-statistics and corresponding p-values, followed by Benjamini–Hochberg correction to control the false discovery rate (FDR). To prioritize biologically meaningful relationships, only strong correlations (|r| ≥ 0.8) with FDR ≤ 0.05 were retained. This combined threshold ensured that retained pairs reflected both statistically robust and high-magnitude co-expression patterns.

I filtered for correlations that are both strong (|r| ≥ 0.8) and statistically robust after multiple testing correction (FDR ≤ 0.05), ensuring high-confidence, biologically meaningful co-expression relationships.


Load and prepare data

library(tidyverse)
library(tibble)

data_dir <- "~/github/oyster-lnc/data/15-pearsons"
lnc_file  <- file.path(data_dir, "lnc_counts.tsv")
mrna_file <- "~/github/oyster-lnc/output/11-mRNA-counts-DESeq2/kallisto_gene_counts.tsv"

lnc_raw <- readr::read_tsv(lnc_file, show_col_types = FALSE)
colnames(lnc_raw)[1] <- "gene_id"

lnc_mat <- lnc_raw %>%
  column_to_rownames("gene_id") %>%
  as.matrix()

mrna_raw <- readr::read_tsv(mrna_file, show_col_types = FALSE)

mrna_mat <- mrna_raw %>%
  column_to_rownames("gene_id") %>%
  as.matrix()

lnc_mat <- apply(lnc_mat, 2, as.numeric)
rownames(lnc_mat) <- lnc_raw$gene_id

mrna_mat <- apply(mrna_mat, 2, as.numeric)
rownames(mrna_mat) <- mrna_raw$gene_id

Align samples

common_samples <- intersect(colnames(lnc_mat), colnames(mrna_mat))

lnc_mat  <- lnc_mat[, common_samples, drop = FALSE]
mrna_mat <- mrna_mat[, common_samples, drop = FALSE]

n_samples <- length(common_samples)
n_samples

Filter low-expression features

filter_rows <- function(mat, min_count = 10, min_samples = 2) {
  keep <- rowSums(mat >= min_count) >= min_samples
  mat[keep, , drop = FALSE]
}

lnc_mat_f  <- filter_rows(lnc_mat)
mrna_mat_f <- filter_rows(mrna_mat)

Log-transform

lnc_log  <- log2(lnc_mat_f + 1)
mrna_log <- log2(mrna_mat_f + 1)

Pearson correlation

cor_mat <- cor(
  t(lnc_log),
  t(mrna_log),
  method = "pearson",
  use = "pairwise.complete.obs"
)

P-values and FDR

r_vals <- cor_mat
r_vals[r_vals >  0.999999] <-  0.999999
r_vals[r_vals < -0.999999] <- -0.999999

t_vals <- r_vals * sqrt((n_samples - 2) / (1 - r_vals^2))
p_vals <- 2 * pt(-abs(t_vals), df = n_samples - 2)

coexp_df <- as.data.frame(as.table(cor_mat)) %>%
  as_tibble() %>%
  rename(lnc_id = Var1, mrna_id = Var2, r = n)

p_df <- as.data.frame(as.table(p_vals)) %>%
  as_tibble() %>%
  rename(lnc_id = Var1, mrna_id = Var2, p_value = n)

coexp_df <- coexp_df %>%
  left_join(p_df, by = c("lnc_id", "mrna_id")) %>%
  mutate(FDR = p.adjust(p_value, method = "BH"))

Significant pairs

sig_pairs <- coexp_df %>%
  filter(abs(r) >= 0.8, FDR <= 0.05) %>%
  arrange(FDR, desc(abs(r)))

head(sig_pairs)

Summary statistics

summary_stats <- tibble(
  metric = c(
    "Total pairs tested",
    "Significant pairs",
    "Unique lncRNAs",
    "Unique mRNAs"
  ),
  value = c(
    nrow(coexp_df),
    nrow(sig_pairs),
    n_distinct(sig_pairs$lnc_id),
    n_distinct(sig_pairs$mrna_id)
  )
)

summary_stats
Metric Value
Total pairs tested 150487011
Significant pairs 22233754
Unique lncRNAs in significant pairs 3847
Unique mRNAs in significant pairs 25355

Directionality

direction_summary <- sig_pairs %>%
  mutate(direction = if_else(r > 0, "positive", "negative")) %>%
  count(direction) %>%
  mutate(prop = n / sum(n))

direction_summary
Direction n_pairs prop
negative 2967679 0.1334763
positive 19266075 0.8665237

Hub analysis

lnc_hubs <- sig_pairs %>%
  count(lnc_id, name = "n_correlated_mRNAs") %>%
  arrange(desc(n_correlated_mRNAs))

lnc_hubs %>% slice_head(n = 20)

mrna_hubs <- sig_pairs %>%
  count(mrna_id, name = "n_correlated_lncRNAs") %>%
  arrange(desc(n_correlated_lncRNAs))

mrna_hubs %>% slice_head(n = 20)
lnc_id n_correlated_mRNAs
rna-XR_002635402.1 13553
rna-XR_002633775.1 13549
rna-XR_002633774.1 13549
rna-XR_002633772.1 13522
rna-XR_002633773.1 13522
rna-XR_002638587.1 13521
rna-XR_002638586.1 13513
rna-XR_002634071.1 13473
rna-XR_002637475.1 13472
rna-XR_002637939.1 13471
mrna_id n_correlated_lncRNAs
LOC111137995 2495
LOC111106094 2491
LOC111105683 2477
LOC111122657 2471
LOC111132339 2471
LOC111104812 2468
LOC111131182 2467
LOC111124166 2466
LOC111137787 2465
LOC111138469 2465

Notes

  • Correlations are computed across shared samples
  • Filtering removes low-expression noise
  • Output supports downstream network and functional analyses