Overview

In this workflow, I generated a new mRNA count matrix using kallisto based on CDS sequences from Crassostrea virginica. This approach quantifies coding-region expression and summarizes to gene-level counts. View full code here: New count matrix construction for mRNAs section


Step 1: Build kallisto index

/home/shared/kallisto/kallisto \
index -i /home/shared/8TB_HDD_02/zbengt/github/oyster-lnc/output/11-mRNA-counts-DESeq2/kallisto-index/GCF_002022765.2_C_virginica-3.0_cds_from_genomic.index \
~/github/oyster-lnc/data/11-mRNA-counts-DESeq2/cds_from_genomic.fna

Step 2: Quantify reads

for R1 in ~/github/oyster-lnc/data/01-lncRNA-kallisto/*_R1.fastp-trim.20bp-5prime.20220224.fq.gz
do
  sample=$(basename "${R1}" | sed 's/_R1\.fastp-trim\.20bp-5prime\.20220224\.fq\.gz//')
  R2=~/github/oyster-lnc/data/01-lncRNA-kallisto/${sample}_R2.fastp-trim.20bp-5prime.20220224.fq.gz

  /home/shared/kallisto/kallisto quant \
    -i /home/shared/8TB_HDD_02/zbengt/github/oyster-lnc/output/11-mRNA-counts-DESeq2/kallisto-index/GCF_002022765.2_C_virginica-3.0_cds_from_genomic.index \
    -o /home/shared/8TB_HDD_02/zbengt/github/oyster-lnc/output/11-mRNA-counts-DESeq2/kallisto-quant/${sample} \
    -t 8 \
    "${R1}" "${R2}"
done

Step 3: Build tx2gene mapping

cds <- readDNAStringSet("~/github/oyster-lnc/data/11-mRNA-counts-DESeq2/cds_from_genomic.fna")

tx2gene <- tibble(header = names(cds)) %>%
  mutate(
    target_id = str_extract(header, "^[^ ]+"),
    gene_id   = str_extract(header, "(?<=\\[gene=)[^\\]]+")
  ) %>%
  select(target_id, gene_id)

write_tsv(tx2gene, "tx2gene.tsv")

Step 4: Summarize to gene-level counts and additional summary data outputs

tx2gene <- read_tsv(tx2gene_file, show_col_types = FALSE) %>%
  filter(!is.na(gene_id), !is.na(target_id))

txi <- tximport(
  files,
  type = "kallisto",
  tx2gene = tx2gene,
)

gene_counts <- as.data.frame(txi$counts) %>%
  rownames_to_column("gene_id")

gene_abundance <- as.data.frame(txi$abundance) %>%
  rownames_to_column("gene_id")

gene_length <- as.data.frame(txi$length) %>%
  rownames_to_column("gene_id")

write_tsv(gene_counts, file.path(out_dir, "kallisto_gene_counts.tsv"))
write_tsv(gene_abundance, file.path(out_dir, "kallisto_gene_tpm.tsv"))
write_tsv(gene_length, file.path(out_dir, "kallisto_gene_length.tsv"))

Outputs


Notes

  • CDS-based quantification captures coding-region expression
  • tximport aggregates CDS → gene
  • Output ready for DESeq2 and co-expression analyses