Trying to clear up discrepancies in visualization of reciprocal blasts for A. pulchra, P. evermanni, and P. tuahiniensis lncRNAs.

BLASTs

Using merged FASTA file which includes transcripts from all three species here for queries.

Make BLAST databases for each species

A. pulchra

/home/shared/ncbi-blast-2.11.0+/bin/makeblastdb \
-in ~/github/deep-dive/DEF-cross-species/data/08-comparative-BLASTs/Apul_lncRNA.fasta \
-dbtype nucl \
-out ~/github/deep-dive/DEF-cross-species/output/08-comparative-BLASTs/blasts/Apul-db/Apul_lncRNA

P. evermanni

/home/shared/ncbi-blast-2.11.0+/bin/makeblastdb \
-in ~/github/deep-dive/DEF-cross-species/data/08-comparative-BLASTs/Peve_lncRNA.fasta \
-dbtype nucl \
-out ~/github/deep-dive/DEF-cross-species/output/08-comparative-BLASTs/blasts/Peve-db/Peve_lncRNA

P. tuahiniensis

/home/shared/ncbi-blast-2.11.0+/bin/makeblastdb \
-in ~/github/deep-dive/DEF-cross-species/data/08-comparative-BLASTs/Pmea_lncRNA.fasta \
-dbtype nucl \
-out ~/github/deep-dive/DEF-cross-species/output/08-comparative-BLASTs/blasts/Pmea-db/Pmea_lncRNA

BLAST merged FASTA against each species database

Apul to all

/home/shared/ncbi-blast-2.11.0+/bin/blastn \
-task blastn \
-query ~/github/deep-dive/DEF-cross-species/data/08-comparative-BLASTs/merged_lncRNAs.fasta \
-db ~/github/deep-dive/DEF-cross-species/output/08-comparative-BLASTs/blasts/Apul-db/Apul_lncRNA \
-out ~/github/deep-dive/DEF-cross-species/output/08-comparative-BLASTs/Apul.tab \
-evalue 1E-40 \
-num_threads 40 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6

wc -l ~/github/deep-dive/DEF-cross-species/output/08-comparative-BLASTs/Apul.tab

Peve to all

/home/shared/ncbi-blast-2.11.0+/bin/blastn \
-task blastn \
-query ~/github/deep-dive/DEF-cross-species/data/08-comparative-BLASTs/merged_lncRNAs.fasta \
-db ~/github/deep-dive/DEF-cross-species/output/08-comparative-BLASTs/blasts/Peve-db/Peve_lncRNA \
-out ~/github/deep-dive/DEF-cross-species/output/08-comparative-BLASTs/Peve.tab \
-evalue 1E-40 \
-num_threads 40 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6

wc -l ~/github/deep-dive/DEF-cross-species/output/08-comparative-BLASTs/Peve.tab

Pmea (P. tuahiniensis) to all

/home/shared/ncbi-blast-2.11.0+/bin/blastn \
-task blastn \
-query ~/github/deep-dive/DEF-cross-species/data/08-comparative-BLASTs/merged_lncRNAs.fasta \
-db ~/github/deep-dive/DEF-cross-species/output/08-comparative-BLASTs/blasts/Pmea-db/Pmea_lncRNA \
-out ~/github/deep-dive/DEF-cross-species/output/08-comparative-BLASTs/Pmea.tab \
-evalue 1E-40 \
-num_threads 40 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6

wc -l ~/github/deep-dive/DEF-cross-species/output/08-comparative-BLASTs/Apul.tab

Join BLAST results

Taken from Steven’s code here

perl -e '$count=0; $len=0; while(<>) {s/\r?\n//; s/\t/ /g; if (s/^>//) { if ($. != 1) {print "\n"} s/ |$/\t/; $count++; $_ .= "\t";} else {s/ //g; $len += length($_)} print $_;} print "\n"; warn "\nConverted $count FASTA records in $. lines to tabular format\nTotal sequence length: $len\n\n";' \
~/github/deep-dive/DEF-cross-species/data/08-comparative-BLASTs/merged_lncRNAs.fasta > ~/github/deep-dive/DEF-cross-species/output/08-comparative-BLASTs/merged_lncRNAs.tab

Import

query <- read.table("~/github/deep-dive/DEF-cross-species/output/08-comparative-BLASTs/merged_lncRNAs.tab", sep = '\t', header = FALSE, row.names=NULL)

apul <- read.table("~/github/deep-dive/DEF-cross-species/output/08-comparative-BLASTs/Apul.tab", sep = '\t', header = FALSE, row.names=NULL)

peve <- read.table("~/github/deep-dive/DEF-cross-species/output/08-comparative-BLASTs/Peve.tab", sep = '\t', header = FALSE, row.names=NULL)

pmea <- read.table("~/github/deep-dive/DEF-cross-species/output/08-comparative-BLASTs/Pmea.tab", sep = '\t', header = FALSE, row.names=NULL)

Join step

comp <- left_join(query, apul, by = "V1") %>%
  left_join(peve, by = "V1") %>%
  left_join(pmea, by = "V1") %>%
  select(V1, apul_hit = V2.y, apul_evalue = V11.x, peve_hit = V2.x.x, peve_evalue = V11.y, pmea_hit = V2.y.y, pmea_evalue = V11) %>%
   rowwise() %>%
  mutate(Hits = sum(!is.na(c_across(c(apul_hit, peve_hit, pmea_hit)))))

Additional step to establish overlap categories (not from Steven’s 09-homology code)

comp <- comp %>%
  mutate(Category = case_when(
    !is.na(apul_hit) & is.na(peve_hit) & is.na(pmea_hit) ~ "Only Apul",
    is.na(apul_hit) & !is.na(peve_hit) & is.na(pmea_hit) ~ "Only Peve",
    is.na(apul_hit) & is.na(peve_hit) & !is.na(pmea_hit) ~ "Only Pmea",
    !is.na(apul_hit) & !is.na(peve_hit) & is.na(pmea_hit) ~ "Apul & Peve",
    !is.na(apul_hit) & is.na(peve_hit) & !is.na(pmea_hit) ~ "Apul & Pmea",
    is.na(apul_hit) & !is.na(peve_hit) & !is.na(pmea_hit) ~ "Peve & Pmea",
    !is.na(apul_hit) & !is.na(peve_hit) & !is.na(pmea_hit) ~ "Apul & Peve & Pmea",
    TRUE ~ "Unknown"
  ))

Plot the venn diagram

# Load necessary packages
library(dplyr)
library(ggvenn)

# Extract sets of sequence IDs based on categories
apul_sequences <- comp %>% filter(Category %in% c("Only Apul", "Apul & Peve", "Apul & Pmea", "Apul & Peve & Pmea")) %>% pull(V1)
peve_sequences <- comp %>% filter(Category %in% c("Only Peve", "Apul & Peve", "Peve & Pmea", "Apul & Peve & Pmea")) %>% pull(V1)
pmea_sequences <- comp %>% filter(Category %in% c("Only Pmea", "Apul & Pmea", "Peve & Pmea", "Apul & Peve & Pmea")) %>% pull(V1)

# Create a named list for ggvenn
venn_data <- list(
  Apul = apul_sequences,
  Peve = peve_sequences,
  Pmea = pmea_sequences
)

# Plot the Venn diagram
ggvenn(venn_data)

# Plot the Venn diagram with custom colors
ggvenn(venn_data, fill_color = c("#408EC6", "#1E2761", "#7A2048"))

Final output:

Venn Diagram of lncRNA Overlap