diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 66e8334..d447632 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -1,8 +1,8 @@
-name: Tests
+name: CI
on:
push:
branches:
- - master
+ - main
paths-ignore:
- 'docs/**'
- '*.md'
@@ -14,7 +14,7 @@ on:
- '*.rst'
jobs:
tests:
- name: Lint with R and Python ${{ matrix.python }}
+ name: Test with Python ${{ matrix.python }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
@@ -30,6 +30,11 @@ jobs:
- uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python }}
+ - name: Install apt packages
+ run: |
+ sudo apt-get update \
+ && sudo apt-get install -y samtools texlive-latex-extra texlive-latex-recommended \
+ && sudo apt-get remove -y fonts-urw-base35 libgs9 libgs9-common libjbig2dec0 poppler-data
- name: Cache conda
uses: actions/cache@v4
env:
@@ -59,3 +64,6 @@ jobs:
- name: Lint R script
run: |
Rscript -e 'library(lintr); options(lintr.error_on_lint=TRUE); lint_dir(".", linters=linters_with_tags("correctness"))'
+ - name: Test script outputs
+ run: |
+ cd test && make test
diff --git a/Makefile b/Makefile
index 30372da..c0d4e4a 100644
--- a/Makefile
+++ b/Makefile
@@ -2,31 +2,43 @@
# Nextflow workflow output directory
wf_out_dir := workflow-outputs/output
+snapshot_dir := test/build-snapshot
-all: laava laava_dev
+# Form Bio workflow deployment
+formbio_org := form-bio-solutions
+formbio_project := aav-qc-workshop
+# Avoid uploading the local test dir; it exceeds the upload size limit
+tmp_stash_dir := /tmp/laava-deploy-test
-.PHONY: clean laava laava_dev sc ss diffcheck-sc diffcheck-ss
+all: laava laava_dev sc ss min folder
+
+.PHONY: clean laava laava_dev sc ss min folder diffcheck-sc diffcheck-ss formbio
clean:
- rm -fv .nextflow.log*
- rm -fv test/build/*
- rm -rf workflow-outputs/*
+ rm -f .nextflow.log*
+ rm -fr .nextflow/*
+ rm -fr workflow-outputs/*
+
laava laava_dev: %: %.dockerfile laava.conda_env.yml
docker build -t ghcr.io/formbio/$@:latest -f $< .
-sc: params-local-sc-no-ff.json
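+# Static pattern rule: e.g. `make sc` runs Nextflow with params-local-sc.json;
+# likewise for ss, min, and folder.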
+sc ss min folder: %: params-local-%.json
nextflow run -profile local main.nf -params-file $<
-ss: params-local-ss-with-ff.json
- nextflow run -profile local main.nf -params-file $<
-min: params-local-no-file-sc.json
- nextflow run -profile local main.nf -params-file $<
+diffcheck-sc: $(wf_out_dir)/sc.subsample005.per_read.tsv
+ diff $(snapshot_dir)/sc.per_read.tsv $< && echo "OK"
+
+diffcheck-ss: $(wf_out_dir)/ss.subsample005.per_read.tsv $(wf_out_dir)/ss.subsample005.flipflop.tsv
+ #diff $(snapshot_dir)/ss.per_read.tsv $< && echo "OK"
+ diff $(snapshot_dir)/ss.flipflop.tsv $(lastword $^) && echo "OK"
-diffcheck-sc: $(wf_out_dir)/sc.subsample005.bam.per_read.tsv
- diff test/build-snapshot/sc.per_read.tsv $< && echo "OK"
-diffcheck-ss: $(wf_out_dir)/ss.subsample005.bam.per_read.tsv $(wf_out_dir)/ss.subsample005.bam.flipflop.tsv
- diff test/build-snapshot/ss.per_read.tsv $< && echo "OK"
- diff test/build-snapshot/ss.flipflop.tsv $(lastword $^) && echo "OK"
+formbio: clean
+ mv test/ "$(tmp_stash_dir)"
+ formbio workflow upload \
+ --org "$(formbio_org)" --project "$(formbio_project)" \
+ --env prod --visibility PROJECT \
+ --version dev --repo . --workflow laava
+ mv "$(tmp_stash_dir)" test
diff --git a/README.md b/README.md
index 82678c2..a5fb62f 100644
--- a/README.md
+++ b/README.md
@@ -64,8 +64,8 @@ There are several ways to satisfy the script dependencies locally.
### Option 1: Development docker image (`laava_dev` container image)
The `laava_dev.dockerfile` in this repo installs the scripts' dependencies, but not the
-scripts themselves, into a Docker container image that you can then use to run the
-local copies of the scripts.
+scripts themselves, into a Docker container image that you can then use to run the local
+copies of the scripts.
To build the container image with the name `laava_dev` (you can use another name if you prefer):
@@ -136,17 +136,39 @@ R packages:
## Testing
+### Automated local tests
+
The `test/` subdirectory in this repo contains small example input files and a Makefile
to run the scripts to reanalyze them and produce example HTML and PDF reports.
-Once you've completed installation (above), activate your conda environment or Docker container and change to the test directory:
+Once you've completed installation (above), activate your conda environment or Docker
+container and change to the test directory:
```
cd test
```
-To generate the HTML and PDF reports from the test dataset included in the repo (this takes about 1-2 minutes):
+To generate the HTML and PDF reports from the test dataset included in the repo, use any
+of these commands:
-```
-make
-```
+* `make sc` -- run the example self-complementary AAV (scAAV) sample. This takes about 1-2 minutes.
+* `make ss` -- run the example single-stranded AAV (ssAAV) sample. This takes about 2-3 minutes, including an additional flip/flop analysis step.
+* `make all` -- run both example AAV samples.
+* `make test` -- run both samples and check the results quantitatively.
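+
+For example, from the `test/` directory (this is also what the CI workflow runs):
+
+```
+make test
+```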
+
+
+### Example Nextflow jobs
+
+The top level of this repo includes several JSON files with Nextflow parameter
+configurations (`params-*.json`). They use the same inputs as the automated test suite
+(above), and require the `laava` Docker image plus a local installation of `nextflow`
+(which you can install any way you like, e.g. conda or brew).
+
+You can run them directly with Nextflow as usual, or use the Makefile at the top level
+of the repo to launch them:
+
+* `make sc` or `make ss` -- run the example self-complementary AAV (scAAV) or
+ single-stranded AAV (ssAAV) sample, as above.
+* `make min` -- run the scAAV sample with the minimum number of required parameters,
+ exercising the default behavior including guessing the construct vector type (sc/ss).
+* `make folder` -- run both samples via folder input.
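+
+For example, `make sc` at the top level is equivalent to invoking Nextflow directly:
+
+```
+nextflow run -profile local main.nf -params-file params-local-sc.json
+```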
diff --git a/laava.conda_env.yml b/laava.conda_env.yml
index e06d3ce..522246a 100644
--- a/laava.conda_env.yml
+++ b/laava.conda_env.yml
@@ -7,8 +7,10 @@ dependencies:
- python>=3.7.6
- r-base>=3.6.0
- biopython
+ - pandas
- parasail-python>=1.3.4
- pysam
+ - pytest
- r-flextable
- r-lintr
- r-rmarkdown
diff --git a/laava.dockerfile b/laava.dockerfile
index 0c6ab6f..ba29a35 100644
--- a/laava.dockerfile
+++ b/laava.dockerfile
@@ -1,5 +1,5 @@
# Interactive environment with scripts and extra dependencies
-FROM --platform=linux/amd64 continuumio/miniconda3:24.4.0-0
+FROM --platform=linux/amd64 continuumio/miniconda3:24.7.1-0
LABEL org.opencontainers.image.source https://github.com/formbio/AAV
RUN apt-get update \
@@ -21,15 +21,15 @@ RUN rm -rf /var/lib/apt/lists/*
# Install directly into 'base' conda environment
COPY laava.conda_env.yml ./conda_env.yml
-RUN conda install -y -n base python=3.10
RUN conda env update -v -n base -f conda_env.yml
# Executable scripts
RUN mkdir -p /opt/laava
RUN chmod 777 /opt/laava/
COPY src/* /opt/laava/
-RUN chmod +x /opt/laava/*.py /opt/laava/*.R
+RUN chmod +x /opt/laava/*.py /opt/laava/*.R /opt/laava/*.sh
ENV PATH "/opt/laava:$PATH"
+ENV PYTHONPATH "/opt/laava:$PYTHONPATH"
WORKDIR /data/
diff --git a/laava_dev.conda_env.yml b/laava_dev.conda_env.yml
index acaafe7..2ebc82b 100644
--- a/laava_dev.conda_env.yml
+++ b/laava_dev.conda_env.yml
@@ -9,6 +9,7 @@ dependencies:
- biopython
- graphviz
- nextflow
+ - pandas
- parasail-python>=1.3.4
- pysam
- pytest
@@ -17,3 +18,4 @@ dependencies:
- r-rmarkdown
- r-tidyverse
- ruff
+ - shellcheck
diff --git a/laava_dev.dockerfile b/laava_dev.dockerfile
index 0dfe6a7..c6d3db4 100644
--- a/laava_dev.dockerfile
+++ b/laava_dev.dockerfile
@@ -1,5 +1,5 @@
# Development environment for running the scripts, no scripts, extra dependencies
-FROM --platform=linux/amd64 continuumio/miniconda3:24.4.0-0
+FROM --platform=linux/amd64 continuumio/miniconda3:24.7.1-0
# Set the container's timezone to match this local machine
RUN ln -snf /usr/share/zoneinfo/$CONTAINER_TIMEZONE /etc/localtime && echo $CONTAINER_TIMEZONE > /etc/timezone
@@ -27,7 +27,6 @@ RUN apt-get update \
# Install directly into 'base' conda environment
COPY laava_dev.conda_env.yml ./conda_env.yml
-RUN conda install -y -n base python=3.10
RUN conda env update -v -n base -f conda_env.yml
WORKDIR /data/
diff --git a/main.nf b/main.nf
index 4252dc8..5454351 100644
--- a/main.nf
+++ b/main.nf
@@ -1,67 +1,54 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl=2
-include { map_reads; make_report } from './modules/local/laava'
+include { match_metadata_to_files; map_reads; make_report } from './modules/local/laava'
NO_FILE = file("$projectDir/bin/NO_FILE")
NO_FILE2 = file("$projectDir/bin/NO_FILE2")
// Unpack the input sample(s) and metadata
-def prepareInput(
+def prepare_input(
seq_reads_file, seq_reads_folder, sample_unique_id, sample_display_name,
sample_in_metadata
) {
- // Known file extensions
def SAMPLE_FILE_GLOB = "*.{bam,fastq,fastq.gz,fq,fq.gz}"
def EXTENSION_REGEX = /\.(bam|fastq|fastq\.gz|fq|fq\.gz)$/
if (seq_reads_folder) {
// Multi-sample mode
- def sampleFiles = file("${params.seq_reads_folder}/${SAMPLE_FILE_GLOB}")
-
- if (params.sample_in_metadata) {
- // Metadata provided - use it to match files
- def csvData = channel
- .fromPath(params.sample_in_metadata)
- .splitCsv(header: true, sep='\t', strip=true)
- .map { row -> [row.sample_unique_id, row.sample_display_name] }
- .toList()
- .val
- def matchedSamples = csvData.collect { sampleId, sampleName ->
- def matchingFiles = sampleFiles.findAll { it.name.contains(sampleId) }
- if (matchingFiles.size() != 1) {
- error "Error: sample_unique_id '${sampleId}' matches ${matchingFiles.size()} files. Values must be unique."
- }
- [sampleId, sampleName, matchingFiles[0]]
- }
- // Check if all files were matched
- def unmatchedFiles = sampleFiles - matchedSamples.collect { it[2] }
- if (unmatchedFiles) {
- error "Error: The following files were not matched to any sample_unique_id: ${unmatchedFiles.join(', ')}"
- }
- return channel.fromList(matchedSamples)
+ if (sample_in_metadata) {
+ // TSV provided - load it in a separate process
+ return match_metadata_to_files(file(sample_in_metadata), file(seq_reads_folder))
+ .splitCsv(sep: '\t')
+ .map { row -> [row[0], row[1], file("${seq_reads_folder}/" + row[2])] }
} else {
- // No metadata provided - generate sampleId and sampleName from filenames
- return channel.fromList(sampleFiles.collect { sampleFile ->
- def stem = sampleFile.baseName.replaceFirst(EXTENSION_REGEX, '')
- [stem, stem, sampleFile]
+ // No TSV provided - generate sample_id and sample_name from filenames
+ def found_files = file("${seq_reads_folder}/${SAMPLE_FILE_GLOB}")
+ return channel.fromList(found_files.collect { seqfile ->
+ def stem = seqfile.baseName.replaceFirst(EXTENSION_REGEX, '')
+ [stem, stem, seqfile]
})
+
}
- } else if (params.seq_reads_file) {
+ } else if (seq_reads_file) {
// Single-sample mode
- def sampleFile = file(params.seq_reads_file)
- if (!sampleFile.name.matches(/.*${EXTENSION_REGEX}/)) {
- error "Error: The provided sample file '${sampleFile.name}' does not have a supported extension (${SAMPLE_FILE_GLOB})"
+ def seq_file = file(seq_reads_file)
+ if (!seq_file.exists()) {
+ error "Error: The provided sample file '${seq_reads_file}' does not exist."
+ }
+ if (!seq_file.name.matches(/.*${EXTENSION_REGEX}/)) {
+ error "Error: The provided sample file '${seq_file.name}' does not have a supported extension (bam, fastq, fastq.gz, fq, fq.gz)"
}
- def stem = sampleFile.baseName.replaceFirst(EXTENSION_REGEX, '')
- def sampleId = params.sample_unique_id ?: stem
- def sampleName = params.sample_display_name ?: params.sample_unique_id ?: stem
- return channel.of([sampleId, sampleName, sampleFile])
+ def stem = seq_file.baseName.replaceFirst(EXTENSION_REGEX, '')
+ def sample_id = sample_unique_id ?: stem
+ def sample_name = sample_display_name ?: sample_unique_id ?: stem
+ return channel.of([sample_id, sample_name, seq_file])
} else {
- error "Invalid input parameters. Provide either a sample folder path or a single sample file."
+        error "Invalid input parameters. Provide either a sample folder (optionally with a metadata TSV), or a single sample file."
}
}
@@ -91,7 +78,7 @@ workflow laava {
main:
// Get a tuple of (ID, name, file) each given sample file and metadata
- sample_channel = prepareInput(
+ sample_channel = prepare_input(
seq_reads_file, seq_reads_folder, sample_unique_id, sample_display_name,
sample_in_metadata
)
@@ -120,20 +107,15 @@ workflow laava {
emit:
mapped_sam = map_reads.out.mapped_sam
mapped_bam = map_reads.out.mapped_bam
+ metadata_out_tsv = make_report.out.metadata_tsv
+ alignments_tsv = make_report.out.alignments_tsv
per_read_tsv = make_report.out.per_read_tsv
- summary_tsv = make_report.out.summary_tsv
- nonmatch_stat_tsvgz = make_report.out.nonmatch_stat_tsvgz
+ nonmatch_tsv = make_report.out.nonmatch_tsv
tagged_bam = make_report.out.tagged_bam
subtype_bams = make_report.out.subtype_bams
subtype_bais = make_report.out.subtype_bais
- flipflop_assignments_tsv = make_report.out.flipflop_assignments_tsv
flipflop_bams = make_report.out.flipflop_bams
- alignments_tsv = make_report.out.alignments_tsv
- readsummary_tsv = make_report.out.readsummary_tsv
- sequence_error_tsv = make_report.out.sequence_error_tsv
flipflop_tsv = make_report.out.flipflop_tsv
- rdata = make_report.out.rdata
- metadata_out_tsv = make_report.out.metadata_tsv
}
diff --git a/modules/local/laava.nf b/modules/local/laava.nf
index d226476..93ba52c 100644
--- a/modules/local/laava.nf
+++ b/modules/local/laava.nf
@@ -1,3 +1,19 @@
+process match_metadata_to_files {
+ input:
+ path sample_in_metadata
+ path sample_folder
+
+ output:
+ path("metadata_with_paths.tsv")
+
+ script:
+ """
+ match_metadata_to_files.py ${sample_in_metadata} ${sample_folder} \\
+ > metadata_with_paths.tsv
+ """
+}
+
+
process map_reads() {
publishDir "$params.output", mode: "copy"
@@ -60,23 +76,17 @@ process make_report() {
path(flipflop_fa)
output:
- // summarize alignment
- path("${sample_id}.per_read.tsv"), emit: per_read_tsv
- path("${sample_id}.summary.tsv"), emit: summary_tsv
- path("${sample_id}.nonmatch_stat.tsv.gz"), emit: nonmatch_stat_tsvgz
+ // summary tables
+ path("${sample_id}.metadata.tsv"), emit: metadata_tsv
+ path("${sample_id}.alignments.tsv.gz"), emit: alignments_tsv
+ path("${sample_id}.per_read.tsv.gz"), emit: per_read_tsv
+ path("${sample_id}.nonmatch.tsv.gz"), emit: nonmatch_tsv
+ path("${sample_id}.flipflop.tsv.gz"), emit: flipflop_tsv, optional: true
+ // intermediate data
path("${sample_id}.tagged.bam"), emit: tagged_bam
path("${sample_id}.*.tagged.sorted.bam"), emit: subtype_bams
path("${sample_id}.*.tagged.sorted.bam.bai"), emit: subtype_bais
- // flip-flop
- path("${sample_id}.flipflop_assignments.tsv"), emit: flipflop_assignments_tsv, optional: true
- path("${sample_id}.*-flipflop.bam"), emit: flipflop_bams, optional: true
- // intermediate data
- path("${sample_id}.metadata.tsv"), emit: metadata_tsv
- path("${sample_id}.alignments.tsv"), emit: alignments_tsv
- path("${sample_id}.readsummary.tsv"), emit: readsummary_tsv
- path("${sample_id}.sequence-error.tsv"), emit: sequence_error_tsv
- path("${sample_id}.flipflop.tsv"), emit: flipflop_tsv, optional: true
- path("${sample_id}.Rdata"), emit: rdata, optional: true
+ path("${sample_id}.flipflop-*.bam"), emit: flipflop_bams, optional: true
// report
path("${sample_id}_AAV_report.html"), emit: aav_report_html
path("${sample_id}_AAV_report.pdf"), emit: aav_report_pdf
diff --git a/nextflow.config b/nextflow.config
index 7b8b916..61a52bb 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -89,7 +89,6 @@ profiles {
process.executor = 'local'
process.queue = 10
workDir = 'workflow-outputs/work'
- process.container = "${params.container_repo}/laava:latest"
trace {
enabled = true
diff --git a/params-local-folder.json b/params-local-folder.json
new file mode 100644
index 0000000..f87b88a
--- /dev/null
+++ b/params-local-folder.json
@@ -0,0 +1,18 @@
+{
+ "seq_reads_folder": "test/samples/",
+ "sample_in_metadata": "test/samples/in-metadata.tsv",
+ "vector_type": "unspecified",
+ "vector_bed": "test/samples/sc.annotation.bed",
+ "vector_fa": "test/samples/sc.construct.fasta",
+ "packaging_fa": "test/fasta/packaging.fa",
+ "host_fa": "test/fasta/hg38.chr19trunc-chrM.fa",
+ "itr_label_1": "ITR-L",
+ "itr_label_2": "ITR-R",
+ "repcap_name": "pRep2Cap9",
+ "helper_name": "pHelper",
+ "lambda_name": "Lambda",
+ "target_gap_threshold": 200,
+ "max_allowed_outside_vector": 10,
+ "max_allowed_missing_flanking": 70,
+ "container_version": ":latest"
+}
diff --git a/params-local-no-file-sc.json b/params-local-min.json
similarity index 88%
rename from params-local-no-file-sc.json
rename to params-local-min.json
index efa4052..0f39879 100644
--- a/params-local-no-file-sc.json
+++ b/params-local-min.json
@@ -1,6 +1,5 @@
{
"seq_reads_file": "test/samples/sc.subsample005.bam",
- "vector_type": "scaav",
"vector_bed": "test/samples/sc.annotation.bed",
"vector_fa": "test/samples/sc.construct.fasta",
"container_version": ":latest"
diff --git a/params-local-sc-no-ff.json b/params-local-sc.json
similarity index 95%
rename from params-local-sc-no-ff.json
rename to params-local-sc.json
index ba6dc76..2c3b390 100644
--- a/params-local-sc-no-ff.json
+++ b/params-local-sc.json
@@ -1,6 +1,6 @@
{
"seq_reads_file": "test/samples/sc.subsample005.bam",
- "vector_type": "scaav",
+ "vector_type": "sc",
"vector_bed": "test/samples/sc.annotation.bed",
"vector_fa": "test/samples/sc.construct.fasta",
"packaging_fa": "test/fasta/packaging.fa",
diff --git a/params-local-ss-with-ff.json b/params-local-ss.json
similarity index 95%
rename from params-local-ss-with-ff.json
rename to params-local-ss.json
index d78de89..0ddce15 100644
--- a/params-local-ss-with-ff.json
+++ b/params-local-ss.json
@@ -1,6 +1,6 @@
{
"seq_reads_file": "test/samples/ss.subsample005.bam",
- "vector_type": "ssaav",
+ "vector_type": "ss",
"vector_bed": "test/samples/ss.annotation.bed",
"vector_fa": "test/samples/ss.construct.fasta",
"packaging_fa": "test/fasta/packaging.fa",
diff --git a/pyproject.toml b/pyproject.toml
index a4921b9..1dccde4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,16 +8,23 @@ target-version = "py39"
select = ["ALL"]
ignore = [
"ANN001",
+ "ANN101",
"ANN201",
"COM812",
+ "D101",
+ "D102",
"D103",
"D211",
"D213",
"E501",
"EM102",
+ "INP001",
"ISC001",
+ "PD901",
"PLR0913",
+ "PTH110",
"S101",
"T201",
+ "TD",
"TRY003",
]
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/calculate_rdata.R b/src/calculate_rdata.R
deleted file mode 100755
index 8480933..0000000
--- a/src/calculate_rdata.R
+++ /dev/null
@@ -1,186 +0,0 @@
-#!/usr/bin/env Rscript
-library(tidyverse)
-
-# Hush a benign message about "grouped output"
-options(dplyr.summarise.inform = FALSE)
-
-
-working_dir = getwd()
-message(paste("Working directory:", working_dir))
-
-args = commandArgs(trailingOnly = TRUE)
-
-r_params = list()
-r_params$input_prefix = args[1]
-r_params$annot_filename = args[2] # ex: annotation.txt
-r_params$sample_id = args[3]
-r_params$vector_type = args[4]
-r_params$flipflop_summary = ''
-if (length(args) > 4) {
- r_params$flipflop_summary = args[5]
-}
-message("Parameters:")
-print(r_params)
-
-
-# Sort order
-valid_types <- c('ssAAV', 'scAAV', 'host', 'repcap', 'helper', 'lambda', 'unmapped', 'chimeric')
-valid_subtypes <- c('full', 'full-gap', 'left-partial', 'right-partial', 'wtITR-partial', 'mITR-partial', 'partial', 'backbone', 'vector+backbone')
-
-
-# ----------------------------------------------------------
-## Read the annotation file to find the vector target region
-# ----------------------------------------------------------
-# e.g.
-# NAME=myHost;TYPE=host;
-# NAME=myVector;TYPE=vector;REGION=1795-6553;
-# NAME=mRepCap;TYPE=repcap;REGION=1895-5987;
-
-TARGET_REGION_START <- 0
-TARGET_REGION_END <- 0
-TARGET_REGION_START_REPCAP <- 0
-TARGET_REGION_END_REPCAP <- 0
-
-annot <- read.table(r_params$annot_filename)
-for (i in 1:dim(annot)[1]) {
- if (unlist(strsplit(annot[i, ], ';'))[2] == 'TYPE=vector') {
- p <- unlist(strsplit(annot[i, ], ';'))[3];
- s_e <- as.integer(unlist(strsplit(unlist(strsplit(p, '='))[2], '-')));
- TARGET_REGION_START <- s_e[1];
- TARGET_REGION_END <- s_e[2];
- } else if (unlist(strsplit(annot[i, ], ';'))[2] == 'TYPE=repcap') {
- p <- unlist(strsplit(annot[i, ], ';'))[3];
- s_e <- as.integer(unlist(strsplit(unlist(strsplit(p, '='))[2], '-')));
- TARGET_REGION_START_REPCAP <- s_e[1];
- TARGET_REGION_END_REPCAP <- s_e[2];
- }
-}
-
-
-x.all.summary <- read_tsv(paste0(r_params$input_prefix, '.summary.tsv'), show_col_types = FALSE) %>%
- mutate(map_start = map_start0, map_end = map_end1) %>%
- mutate(SampleID = r_params$sample_id, .before = read_id)
-write_tsv(x.all.summary, paste0(r_params$input_prefix, ".alignments.tsv"))
-
-x.all.err <- read_tsv(paste0(r_params$input_prefix, '.nonmatch_stat.tsv.gz'), show_col_types = FALSE) %>%
- mutate(SampleID = r_params$sample_id, .before = read_id)
-x.all.read <- read_tsv(paste0(r_params$input_prefix, '.per_read.tsv'), show_col_types = FALSE) %>%
- mutate(SampleID = r_params$sample_id, .before = read_id)
-
-x.all.err[x.all.err$type == 'D', "type"] <- 'deletion'
-x.all.err[x.all.err$type == 'I', "type"] <- 'insertion'
-x.all.err[x.all.err$type == 'X', "type"] <- 'mismatch'
-x.all.err[x.all.err$type == 'N', "type"] <- 'gaps'
-
-
-# ----------------------------------------------------------
-# produce stats for vector only (ssAAV or scAAV)
-# ----------------------------------------------------------
-
-x.read.vector <- filter(x.all.read, assigned_type %in% c('scAAV', 'ssAAV'))
-x.err.vector <- filter(x.all.err, read_id %in% x.read.vector$read_id)
-x.summary.vector <- filter(x.all.summary, read_id %in% x.read.vector$read_id)
-
-
-total_num_reads <- dim(x.read.vector)[1]
-
-total_err <- dim(x.err.vector)[1]
-x.err.vector$pos0_div <- (x.err.vector$pos0 %/% 10 * 10)
-df.err.vector <- x.err.vector %>%
- group_by(pos0_div, type) %>%
- summarise(count = n())
-
-x.err.vector$type_len_cat <- "1-10"
-x.err.vector[x.err.vector$type_len > 10, "type_len_cat"] <- "11-100"
-x.err.vector[x.err.vector$type_len > 100, "type_len_cat"] <- "100-500"
-x.err.vector[x.err.vector$type_len > 500, "type_len_cat"] <- ">500"
-x.err.vector$type_len_cat <- ordered(x.err.vector$type_len_cat, levels = c('1-10', '11-100', '100-500', '>500'))
-write_tsv(x.err.vector, paste0(r_params$input_prefix, ".sequence-error.tsv"))
-
-x.read.vector$subtype <- x.read.vector$assigned_subtype
-# Call "other" all complex and multi-part AAV alignments
-x.read.vector[!x.read.vector$subtype %in% valid_subtypes, "subtype"] <- 'other'
-# Call "snapback" any scAAV with an ITR-containing partial alignment but no full
-# alignment
-if (r_params$vector_type == "ssaav") {
- x.read.vector[((x.read.vector$assigned_type == "scAAV") &
- (x.read.vector$assigned_subtype == "left-partial")
- ), "subtype"] <- "left-snapback"
- x.read.vector[((x.read.vector$assigned_type == "scAAV") &
- (x.read.vector$assigned_subtype == "right-partial")
- ), "subtype"] <- "right-snapback"
-}
-
-total_read_count.vector <- sum(x.read.vector$effective_count)
-df.read.vector1 <- x.read.vector %>%
- group_by(assigned_type) %>%
- summarise(e_count = sum(effective_count)) %>%
- mutate(freq = round(e_count * 100 / total_read_count.vector, 2))
-df.read.vector1 <- df.read.vector1[order(-df.read.vector1$freq), ]
-
-
-df.read.vector2 <- x.read.vector %>%
- group_by(assigned_type, assigned_subtype) %>%
- summarise(e_count = sum(effective_count)) %>%
- mutate(freq = round(e_count * 100 / total_read_count.vector, 2))
-df.read.vector2 <- df.read.vector2[order(-df.read.vector2$freq), ]
-
-
-x.all.read[is.na(x.all.read$assigned_type), "assigned_type"] <- 'unmapped'
-x.all.read[grep("|", as.character(x.all.read$assigned_type), fixed = T), "assigned_type"] <- 'chimeric'
-x.all.read[!(x.all.read$assigned_type %in% valid_types), "assigned_type"] <- 'other'
-x.all.read[!(x.all.read$assigned_subtype %in% valid_subtypes), "assigned_subtype"] <- 'other'
-write_tsv(x.all.read, paste0(r_params$input_prefix, ".readsummary.tsv"))
-
-
-total_read_count.all <- sum(x.all.read$effective_count) #dim(x.all.read)[1]
-df.read1 <- x.all.read %>%
- group_by(assigned_type) %>%
- summarise(e_count = sum(effective_count)) %>%
- mutate(freq = round(e_count * 100 / total_read_count.all, 2))
-df.read1 <- df.read1[order(-df.read1$freq), ]
-
-
-# ----------------------------------------------------------
-# Stats and plot for flip/flop analysis (if available)
-# ----------------------------------------------------------
-
-if (file.exists(r_params$flipflop_summary)) {
- df.read.ssaav <- dplyr::filter(df.read.vector2, assigned_type == 'ssAAV') %>%
- filter(
- assigned_subtype == 'full' |
- assigned_subtype == 'right-partial' |
- assigned_subtype == 'left-partial'
- ) %>%
- select(e_count) %>%
- as.data.frame()
- total_ssaav <- sum(df.read.ssaav$e_count)
-
- data.flipflop <- read.table(r_params$flipflop_summary,
- sep = '\t',
- header = T
- )
- df.flipflop <- data.flipflop %>%
- group_by(type, subtype, leftITR, rightITR) %>%
- summarise(count = n())
- scff <- filter(df.flipflop, type == 'scAAV')
- ssff <- filter(df.flipflop, type == 'ssAAV')
-
- # Double single-stranded counts if appropriate
- numssff <- sum(ssff$count)
- if (is.numeric(numssff) & is.numeric(total_ssaav)) {
- if (total_ssaav > 0 & numssff > 0 & numssff * 2 == total_ssaav) {
- ssff <- ssff %>% mutate(count = count * 2)
- } else {
- ssff <- ssff %>% mutate(count = count)
- }
- }
- # Write TSV of flip flop configurations
- fftbl <- bind_rows(scff, ssff)
- write_tsv(fftbl, paste0(r_params$input_prefix, ".flipflop.tsv"))
-}
-
-
-## For downstream consumers
-
-save.image(file = paste0(r_params$input_prefix, ".Rdata"))
diff --git a/src/create_report.R b/src/create_report.R
index 08a2f06..5fb76d5 100755
--- a/src/create_report.R
+++ b/src/create_report.R
@@ -6,24 +6,25 @@ message(paste("Working directory:", working_dir))
args = commandArgs(trailingOnly = TRUE)
-rdata_path = args[1]
+input_params = list(
+ path_prefix = args[1],
+ vector_type = args[2],
+ annotation_txt = args[3]
+)
+message("Parameters:")
+print(input_params)
+# Find the report template relative to this script's filesystem location
rmd_dir = dirname(sub("--file=", "", commandArgs(trailingOnly = FALSE)[4]))
rmd_path = paste0(rmd_dir, "/report.Rmd")
message(paste("Report template location:", rmd_path, sep = " "))
-input_prefix = fs::path_ext_remove(rdata_path)
-out_path = paste0(input_prefix, "_AAV_report") # Adds .html and .pdf automatically
+out_path = paste0(input_params$path_prefix, "_AAV_report") # Adds .html and .pdf automatically
out_dir = dirname(out_path)
out_filename = basename(out_path)
message(paste("Output location:", out_path, sep = " "))
-input_params = list(
- rdata_path = rdata_path
-)
-message("Parameters:")
-print(input_params)
-
rmarkdown::render(
rmd_path,
output_format = "all",
diff --git a/src/get_flipflop_config.py b/src/get_flipflop_config.py
index a56a257..686e01a 100755
--- a/src/get_flipflop_config.py
+++ b/src/get_flipflop_config.py
@@ -1,17 +1,19 @@
#!/usr/bin/env python3
"""Get ITR flip flop configurations.
-Must have already run `summarize_AAV_alignment.py` to get a .tagged.BAM file!
+Must have already run `summarize_alignment.py` to get a .tagged.BAM file!
"""
+from __future__ import annotations
+
import csv
+import gzip
from typing import NamedTuple
import parasail
import pysam
from Bio import SeqIO
-
SW_SCORE_MATRIX = parasail.matrix_create("ACGT", 2, -5)
SEQ_AAV2 = dict(
@@ -83,7 +85,7 @@ def identify_flip_flop(r, ff_seq):
):
raise RuntimeError(
"Input BAM records must have a `AX` tag assigned by first running "
- "summarize_AAV_alignment.py. Abort!"
+ "summarize_alignment.py. Abort!"
)
config_left, config_right = "unclassified", "unclassified"
@@ -127,12 +129,12 @@ def identify_flip_flop(r, ff_seq):
def load_per_read_info(fname):
"""Load per-read info, keyed by read IDs, from a CSV file."""
- with open(fname) as in_csv:
- read_info = {r["read_id"]: r for r in csv.DictReader(in_csv, delimiter="\t")}
+ with gzip.open(fname, "rt") as in_tsv:
+ read_info = {r["read_id"]: r for r in csv.DictReader(in_tsv, delimiter="\t")}
return read_info
-def main(per_read_csv, tagged_bam, output_prefix, flipflop_fasta):
+def main(per_read_tsv, tagged_bam, output_prefix, flipflop_fasta):
"""Entry point."""
OUT_FIELDS = ["name", "type", "subtype", "start", "end", "leftITR", "rightITR"]
@@ -141,24 +143,24 @@ def main(per_read_csv, tagged_bam, output_prefix, flipflop_fasta):
else:
flipflop_seqs = FlipFlopSeqSet.from_fasta(flipflop_fasta)
- read_info = load_per_read_info(per_read_csv)
+ read_info = load_per_read_info(per_read_tsv)
- with open(output_prefix + ".flipflop_assignments.tsv", "w") as fout:
+ with gzip.open(output_prefix + ".flipflop.tsv.gz", "wt") as fout:
out_tsv = csv.writer(fout, delimiter="\t")
out_tsv.writerow(OUT_FIELDS)
reader = pysam.AlignmentFile(open(tagged_bam), "rb", check_sq=False)
out_bam_full = pysam.AlignmentFile(
- open(output_prefix + ".vector-full-flipflop.bam", "w"),
+ open(output_prefix + ".flipflop-full.bam", "w"),
"wb",
header=reader.header,
)
out_bam_leftp = pysam.AlignmentFile(
- open(output_prefix + ".vector-leftpartial-flipflop.bam", "w"),
+ open(output_prefix + ".flipflop-left-partial.bam", "w"),
"wb",
header=reader.header,
)
out_bam_rightp = pysam.AlignmentFile(
- open(output_prefix + ".vector-rightpartial-flipflop.bam", "w"),
+ open(output_prefix + ".flipflop-right-partial.bam", "w"),
"wb",
header=reader.header,
)
@@ -187,7 +189,7 @@ def main(per_read_csv, tagged_bam, output_prefix, flipflop_fasta):
[
r.qname,
a_type,
- t["AX"],
+ t["AX"][len("vector-") :],
str(r.reference_start),
str(r.reference_end),
c_l,
@@ -210,7 +212,7 @@ def main(per_read_csv, tagged_bam, output_prefix, flipflop_fasta):
parser = ArgumentParser()
parser.add_argument("sorted_tagged_bam", help="Sorted tagged BAM file")
- parser.add_argument("per_read_csv", help="Per read CSV file")
+ parser.add_argument("per_read_tsv", help="Per read TSV file")
parser.add_argument("-o", "--output-prefix", help="Output prefix", required=True)
parser.add_argument(
"--flipflop-fasta",
@@ -221,7 +223,7 @@ def main(per_read_csv, tagged_bam, output_prefix, flipflop_fasta):
args = parser.parse_args()
main(
- args.per_read_csv,
+ args.per_read_tsv,
args.sorted_tagged_bam,
args.output_prefix,
args.flipflop_fasta,
diff --git a/src/get_reference_names.py b/src/get_reference_names.py
index be57f01..9ac948b 100755
--- a/src/get_reference_names.py
+++ b/src/get_reference_names.py
@@ -1,11 +1,14 @@
#!/usr/bin/env python3
-"""Map reference sequence IDs to "source type name" categories for reporting.
+"""Map reference sequence IDs to "reference label" categories for reporting.
-Output is a 2-column TSV with no header:
+Output is a 2-column TSV with a header row:
1. FASTA sequence ID
-2. Source type name, one of: "vector", "repcap", "helper", "host", "lambda"
+2. Reference label, one of: "vector", "repcap", "helper", "host", "lambda", or the original
+ sequence ID if it's not one of those.
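+
+Example output, including the header row (sequence IDs are illustrative):
+
+    Name        Label
+    myVector    vector
+    pRep2Cap9   repcap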
"""
+from __future__ import annotations
+
import argparse
import logging
from pathlib import Path
@@ -26,7 +29,7 @@
def _main(args):
- out_rows = []
+ out_rows = [("Name", "Label")]
# Vector sequence -- should be just 1 in the FASTA; label it "vector"
vector_rec = SeqIO.read(args.vector, "fasta")
@@ -35,8 +38,8 @@ def _main(args):
# Packaging sequences -- multi-FASTA; apply renaming
if args.packaging:
packaging_name_map = {
- seq_id: source_type
- for seq_id, source_type in [
+ seq_id: reference_label
+ for seq_id, reference_label in [
(args.repcap_name, "repcap"),
(args.helper_name, "helper"),
(args.lambda_name, "lambda"),
diff --git a/src/guess_vector_type_length.py b/src/guess_vector_type_length.py
new file mode 100755
index 0000000..33190dd
--- /dev/null
+++ b/src/guess_vector_type_length.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+"""Infer single stranded (ssAAV) vs. self-complementary (scAAV) vector type.
+
+Approach:
+- scAAV packages best when the expression cassette (ITR to ITR) is 2.3-2.4 kb long.
+- ssAAV constructs are prone to unresolved dimers when the length is much less than
+ 4.7kb; the drug product is generally poor for lengths below 4.0kb and stuffer
+ sequences are necessary to rescue the construct.
+- Therefore, we read the length of the vector's annotated ITR-to-ITR region to determine
+ the length of the expression cassette. If this length is below a threshold, we report
+ scAAV, otherwise ssAAV.
+- If the heuristic guesses incorrectly, the mistake will be prominent in the report:
+ most vector reads will be marked as "other-vector" instead of "scAAV" or "ssAAV".
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+
+from summarize_alignment import load_annotation_file
+
+SC_MAX_THRESHOLD = 2300
+
+
+def length_from_annotation(fname: str) -> int:
+ """Read annotation.txt to calculate the expression cassette length."""
+ cassette_len = -1
+ for anno in load_annotation_file(fname).values():
+ if anno["label"] == "vector":
+ ref_coords = anno["region"]
+ if ref_coords is not None:
+ start1, end = ref_coords
+ cassette_len = end - start1 + 1
+ break
+ return cassette_len
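+
+# Illustrative example: for an annotation line
+#   NAME=myVector;TYPE=vector;REGION=1795-6553;
+# the cassette length is 6553 - 1795 + 1 = 4759, which exceeds the default
+# 2300 bp threshold, so the script reports "ss".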
+
+
+if __name__ == "__main__":
+ AP = argparse.ArgumentParser(description=__doc__)
+ AP.add_argument("ann_file", help="Vector annotation (.txt)")
+ AP.add_argument(
+ "-t",
+ "--sc-max-threshold",
+ default=SC_MAX_THRESHOLD,
+ type=int,
+ help="""Maximum length of the expression cassette, including ITRs, in the
+ given vector annotation to report as scAAV. [Default: %(default)d]""",
+ )
+ args = AP.parse_args()
+ cassette_len = length_from_annotation(args.ann_file)
+ print("Expression cassette length is", cassette_len, file=sys.stderr)
+ if cassette_len <= args.sc_max_threshold:
+ print("sc")
+ else:
+ print("ss")
diff --git a/src/guess_vector_type_mapping.py b/src/guess_vector_type_mapping.py
new file mode 100755
index 0000000..485bc6b
--- /dev/null
+++ b/src/guess_vector_type_mapping.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+"""Infer single stranded (ssAAV) vs. self-complementary (scAAV) vector type.
+
+Approach:
+- scAAV reads aligned to the vector contig almost always have two alignments (forward
+ and reverse-complement), with rare single alignments coming from contaminants and the
+ plasmid backbone.
+- ssAAV reads aligned to the vector more often have single alignments; reads with two
+ alignments most often come from "snapback" vector genomes.
+- Therefore, we calculate the fraction of reads aligned to the vector contig that have
+ exactly one alignment. If this fraction is below a threshold, we report scAAV,
+ otherwise ssAAV.
+"""
+
+from __future__ import annotations
+
+import argparse
+import collections
+import sys
+
+import pysam
+
+SC_MAX_THRESHOLD = 0.1
+
+
+def count_alignments(sam_fname, vector_name):
+    """Count alignments per read; return the fraction of singly-aligned reads."""
+
+ def filter_records():
+ reader = pysam.AlignmentFile(sam_fname, "r", check_sq=False)
+ for rec in reader:
+ if not rec.is_mapped:
+ continue
+ if vector_name and rec.reference_name != vector_name:
+ continue
+ _movie_id, read_num, _suffix = rec.qname.split("/", 2)
+ yield read_num
+
+ n_aln_per_read = collections.Counter(filter_records())
+ histo_n_aln = collections.Counter(n_aln_per_read.values())
+ print(histo_n_aln, file=sys.stderr)
+ n_single = histo_n_aln[1]
+ total = sum(histo_n_aln.values())
+ frac_single = n_single / total
+ print(f"{n_single} / {total} = {frac_single} singly-aligned reads", file=sys.stderr)
+ return frac_single
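+
+# Illustrative example: if 50 of 1000 vector-aligned reads have exactly one
+# alignment, frac_single = 0.05, at or below the default threshold of 0.1,
+# so the script reports "sc".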
+
+
+if __name__ == "__main__":
+ AP = argparse.ArgumentParser(description=__doc__)
+ AP.add_argument("sam_file", help="Read alignment in SAM format.")
+ AP.add_argument("-v", "--vector-name", help="Vector reference sequence name.")
+ AP.add_argument(
+ "-t",
+ "--sc-max-threshold",
+ default=SC_MAX_THRESHOLD,
+ type=float,
+ help="""Maximum fraction of singly-aligned reads for this sample to be reported
+ as self-complementary AAV. [Default: %(default)f]""",
+ )
+ args = AP.parse_args()
+ frac = count_alignments(args.sam_file, args.vector_name)
+ if frac <= args.sc_max_threshold:
+ print("sc")
+ else:
+ print("ss")
diff --git a/bin/make_report.sh b/src/make_report.sh
similarity index 70%
rename from bin/make_report.sh
rename to src/make_report.sh
index 2c35b1a..b200fbb 100755
--- a/bin/make_report.sh
+++ b/src/make_report.sh
@@ -1,5 +1,5 @@
#!/bin/bash -ex
-sample_name=$1
+sample_id=$1
vector_type=$2
target_gap_threshold=$3
max_allowed_outside_vector=$4
@@ -20,16 +20,23 @@ else
echo "Reads $mapped_reads_sam appear to be in SAM format"
fi
+if [ "$vector_type" == "unspecified" ]; then
+ vector_type=$(guess_vector_type_length.py "$annotation_txt")
+ echo "Inferred vector_type: $vector_type"
+fi
+
echo
-echo "Starting summarize_AAV_alignment"
-summarize_AAV_alignment.py \
- "$mapped_reads_sam" "$annotation_txt" "$sample_name" \
+echo "Starting summarize_alignment"
+summarize_alignment.py \
+ "$mapped_reads_sam" "$annotation_txt" "$sample_id" \
+ --sample-id="$sample_id" \
+ --vector-type="$vector_type" \
--target-gap-threshold=$target_gap_threshold \
--max-allowed-outside-vector=$max_allowed_outside_vector \
--max-allowed-missing-flanking=$max_allowed_missing_flanking \
--cpus $(nproc)
-echo "Finished summarize_AAV_alignment"
+echo "Finished summarize_alignment"
ls -Alh
if [[ -n "$flipflop_name" || -n "$flipflop_fa" ]]; then
@@ -47,25 +54,17 @@ if [[ -n "$flipflop_name" || -n "$flipflop_fa" ]]; then
echo
echo "Starting get_flipflop_config"
get_flipflop_config.py \
- "${sample_name}.tagged.bam" "${sample_name}.per_read.tsv" \
+ "${sample_id}.tagged.bam" "${sample_id}.per_read.tsv.gz" \
$ff_opt \
- -o "$sample_name"
+ -o "$sample_id"
echo "Finished get_flipflop_config"
ls -Alh
- flipflop_assignments="${sample_name}.flipflop_assignments.tsv"
else
echo "Skipping flip/flop analysis"
- flipflop_assignments=""
fi
-echo
-echo "Starting calculate_rdata"
-calculate_rdata.R "./${sample_name}" "$annotation_txt" "$sample_name" "$vector_type" "$flipflop_assignments"
-echo "Finished calculate_rdata"
-ls -Alh
-
echo
echo "Starting create_report"
-create_report.R "./${sample_name}.Rdata"
+create_report.R "./${sample_id}" "$vector_type" "$annotation_txt"
echo "Finished create_report"
ls -Alh
diff --git a/bin/map_reads.sh b/src/map_reads.sh
similarity index 72%
rename from bin/map_reads.sh
rename to src/map_reads.sh
index b8c33d3..0dc1047 100755
--- a/bin/map_reads.sh
+++ b/src/map_reads.sh
@@ -36,13 +36,19 @@ fi
threads=$(nproc)
minimap2 --eqx -a --secondary=no -t $threads all_refs.fa "$reads_fn" > tmp.mapped.sam
# Sort the mapped reads by name
-samtools sort -@ $threads -n -O SAM -o "$sample_name.sort_by_name.sam" tmp.mapped.sam
+name_sam="$sample_name.sort_by_name.sam"
+samtools sort -@ $threads -n -O SAM -o "$name_sam" tmp.mapped.sam
# Make a position-sorted BAM output file for other downstream consumers
-out_bam="$sample_name.sort_by_pos.bam"
+pos_bam="$sample_name.sort_by_pos.bam"
# Drop unmapped reads
-samtools view -@ $threads --fast -F 4 -o tmp.sorted.bam tmp.mapped.sam
-samtools sort -@ $threads -o "$out_bam" tmp.sorted.bam
-samtools index "$out_bam"
+samtools view -@ $threads --fast -o tmp.sorted.bam tmp.mapped.sam
+samtools sort -@ $threads -o "$pos_bam" tmp.sorted.bam
+samtools index "$pos_bam"
+# Logging
ls -Alh
+echo
+samtools flagstat "$name_sam"
+echo
+samtools idxstats "$pos_bam"
diff --git a/src/match_metadata_to_files.py b/src/match_metadata_to_files.py
new file mode 100755
index 0000000..7a979c8
--- /dev/null
+++ b/src/match_metadata_to_files.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+"""Read sample IDs and names from a TSV and match to filenamess in the given folder.
+
+Print matched sample ID, name, and file path to stdout, separated by tabs.
+
+Only emit file paths matching a given sample ID; a file in the folder that has a
+supported extension (.bam, .fastq[.gz], .fq[.gz]) but does not match any sample ID in
+the input TSV is skipped.
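+
+Example metadata TSV (tab-separated; values are hypothetical):
+
+    sample_unique_id    sample_display_name
+    sc.subsample005     scAAV example
+
+Given a folder containing sc.subsample005.bam, the emitted line would be:
+
+    sc.subsample005     scAAV example       sc.subsample005.bam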
+"""
+
+import argparse
+import csv
+import sys
+from pathlib import Path
+
+AP = argparse.ArgumentParser(description=__doc__)
+AP.add_argument("sample_in_metadata", type=Path)
+AP.add_argument("sample_folder", type=Path)
+args = AP.parse_args()
+
+
+VALID_EXTENSIONS = (".bam", ".fastq", ".fastq.gz", ".fq", ".fq.gz")
+available_files = [
+ f for f in Path(args.sample_folder).iterdir() if f.name.endswith(VALID_EXTENSIONS)
+]
+print("Available files:", available_files, file=sys.stderr)
+
+with Path(args.sample_in_metadata).open() as infile:
+ reader = csv.DictReader(infile, dialect="excel-tab")
+ line_no = -1
+ for line_no, row in enumerate(reader): # noqa: B007
+ sample_id = row.get("sample_unique_id")
+ sample_name = row.get("sample_display_name")
+ if not sample_id or not sample_name:
+ sys.exit(
+ "Error: The given metadata TSV file is missing required columns "
+ "'sample_unique_id' or 'sample_display_name'."
+ )
+
+ matching_files = [f for f in available_files if sample_id in f.name]
+ if len(matching_files) != 1:
+ sys.exit(
+ f"Error: sample_unique_id '{sample_id}' matches {len(matching_files)} "
+ "files. Expected exactly one match."
+ )
+
+ seq_reads_file = matching_files[0]
+ print(sample_id, sample_name, seq_reads_file.name, sep="\t")
+
+ if line_no == -1:
+ sys.exit("Error: The TSV file is empty or contains no valid data.")
+ print(f"Loaded {line_no + 1} sample paths and identifiers", file=sys.stderr)
diff --git a/src/prepare_annotation.py b/src/prepare_annotation.py
index 131ccec..117d772 100755
--- a/src/prepare_annotation.py
+++ b/src/prepare_annotation.py
@@ -23,13 +23,15 @@
- NAME is a sequence name/ID used in the reference sequence files that reads will be
mapped to.
- - TYPE is the source type of the reference sequence -- one of 'vector', 'helper',
- 'repcap', 'host', or 'lambda' (not used here).
+ - TYPE is the source "type" or label of the reference sequence -- one of 'vector', 'helper',
+ 'repcap', 'host', or 'lambda'.
- There must be exactly one line where TYPE is 'vector'.
- The 'vector' line also has a REGION field with the 1-based start and end positions.
In AAV, this is the payload region including the ITRs.
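+
+Example annotation.txt output:
+
+    NAME=myHost;TYPE=host;
+    NAME=myVector;TYPE=vector;REGION=1795-6553;
+    NAME=mRepCap;TYPE=repcap;REGION=1895-5987;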
"""
+from __future__ import annotations
+
import argparse
import logging
import sys
@@ -40,13 +42,13 @@
class AnnRow(NamedTuple):
seq_name: str
- source_type: str
+ ref_label: str
start1: int
end: int
-# From summarize_AAV_alignment.py
-ANNOT_TYPES = {"vector", "repcap", "helper", "lambda", "host"}
+# From summarize_alignment.py
+KNOWN_ANNOT_LABELS = {"vector", "repcap", "helper", "lambda", "host"}
def read_annotation_bed(fname: str, itr_labels: list[str]):
@@ -143,56 +145,58 @@ def read_annotation_bed(fname: str, itr_labels: list[str]):
def read_reference_names(fname: str):
- """Read a 2-column TSV of reference sequence names and source types."""
+ """Read a 2-column TSV of reference sequence names and labels."""
with Path(fname).open() as infile:
- for line in infile:
- seq_name, source_type = line.split()
- if source_type in ANNOT_TYPES:
- yield AnnRow(seq_name, source_type, None, None)
- else:
+ # Skip header
+ lines = iter(infile)
+        next(lines)
+ for line in lines:
+ seq_name, ref_label = line.split()
+ if ref_label not in KNOWN_ANNOT_LABELS:
logging.info(
- "Nonstandard reference source type %s; "
- "the standards are: vector, repcap, helper, host",
- source_type,
+ "Nonstandard reference label %s; "
+ "the known labels are: vector, repcap, helper, host",
+ ref_label,
)
+ yield AnnRow(seq_name, ref_label, None, None)
def write_annotation_txt(out_fname: str, bed_rows: dict, other_rows: Iterable):
"""Write PacBio-style annotations to `out_fname`.
- Take the vector annotations and non-'vector' sequence names and source types, format
+ Take the vector annotations and non-'vector' sequence names and labels, format
it for annotation.txt, and append it to the same output file.
Skip any duplicate 'vector' sequence label appearing in the other references, and
- catch if a sequence name is reused across multiple source types.
+ catch if a sequence name is reused across multiple labels.
"""
vector_row = bed_rows["vector"]
repcap_row = bed_rows["repcap"]
with Path(out_fname).open("w+") as outf:
outf.write("NAME={};TYPE={};REGION={}-{};\n".format(*vector_row))
- seen_seq_names_and_sources = {vector_row.seq_name: vector_row.source_type}
+ seen_seq_names_and_labels = {vector_row.seq_name: vector_row.ref_label}
if repcap_row:
outf.write("NAME={};TYPE={};REGION={}-{};\n".format(*repcap_row))
- seen_seq_names_and_sources[repcap_row.seq_name] = repcap_row.source_type
+ seen_seq_names_and_labels[repcap_row.seq_name] = repcap_row.ref_label
for orow in other_rows:
- if orow.source_type == "vector":
+ if orow.ref_label == "vector":
if orow.seq_name != vector_row.seq_name:
raise RuntimeError(
- "Source type 'vector' listed in additional "
+ "Reference label 'vector' listed for additional "
f"reference names, but sequence name {orow.seq_name} does not "
f"match the previously given {vector_row.seq_name}"
)
continue
- if orow.seq_name in seen_seq_names_and_sources:
- prev_type = seen_seq_names_and_sources[orow.seq_name]
- if orow.source_type != prev_type:
+ if orow.seq_name in seen_seq_names_and_labels:
+ prev_type = seen_seq_names_and_labels[orow.seq_name]
+ if orow.ref_label != prev_type:
raise RuntimeError(
f"Sequence name {orow.seq_name} listed with "
- f"different source types: first {prev_type}, then "
- f"{orow.source_type}"
+ f"different labels: first {prev_type}, then "
+ f"{orow.ref_label}"
)
continue
- outf.write("NAME={seq_name};TYPE={source_type};\n".format(**orow._asdict()))
+ outf.write("NAME={seq_name};TYPE={ref_label};\n".format(**orow._asdict()))
if __name__ == "__main__":
@@ -203,7 +207,7 @@ def write_annotation_txt(out_fname: str, bed_rows: dict, other_rows: Iterable):
)
AP.add_argument(
"reference_names",
- help="Reference sequence names and their sources, in 2 columns.",
+ help="Reference sequence names and their labels, in 2 columns.",
)
AP.add_argument("itr_labels", nargs="*", help="ITR label(s) in annotation BED")
AP.add_argument("-o", "--output", help="Output filename (*.txt).")
diff --git a/src/report.Rmd b/src/report.Rmd
index 6e8008b..b3ee9bb 100644
--- a/src/report.Rmd
+++ b/src/report.Rmd
@@ -6,7 +6,11 @@ output:
pdf_document:
toc: true
params:
- rdata_path:
+ path_prefix:
+ value: ""
+ vector_type:
+ value: ""
+ annotation_txt:
value: ""
---
@@ -16,6 +20,9 @@ library(flextable)
# Avoid scientific notation
options(scipen = 999)
+# Hush a benign message about "grouped output"
+options(dplyr.summarise.inform = FALSE)
+
# Configure Rmarkdown/knitr/pandoc processing
knitr::opts_chunk$set(echo = FALSE,
fig.align = "center",
@@ -30,87 +37,206 @@ set_flextable_defaults(
table.layout = "autofit",
theme_fun = "theme_vanilla"
)
-
-# Load precomputed dataframes etc.
-load(params$rdata_path)
+# Load sample metadata
+meta <- read_tsv(paste0(params$path_prefix, '.metadata.tsv'), show_col_types = FALSE)
```
---
title: "AAV Sequence Analysis"
-subtitle: 'Sample: `r r_params$sample_id`'
+subtitle: 'Sample name: `r meta$sample_display_name`'
date: 'Date: `r format(Sys.Date())`'
---
+# Analysis context
+
+| | Value |
+|:---------------------:|----------------------------|
+| Sample unique ID | `r meta$sample_unique_id` |
+| Sequencing run ID | `r meta$sequencing_run_id` |
+| Construct vector type | `r params$vector_type`AAV |
+
`\newpage`{=latex}
-# Read type classification
+
+# Read-based AAV vector type classification
## Definitions
+Table: Reference label definitions.
+
+Reference Label |Definition
+:-----------------:|----------------------------------------------------
+vector |Read originates from the vector plasmid.
+repcap |Read originates from the RepCap plasmid. The Rep gene encodes four proteins (Rep78, Rep68, Rep52, and Rep40), which are required for viral genome replication and packaging, while Cap expression gives rise to the viral capsid proteins (VP; VP1/VP2/VP3), which form the outer capsid shell that protects the viral genome, as well as being actively involved in cell binding and internalization.
+helper |Read originates from the helper plasmid. In addition to Rep and Cap, AAV requires a helper plasmid containing genes from adenovirus. These genes (E4, E2a and VA) mediate AAV replication.
+host |Read originates from the host genome that is given (e.g. hg38, CHM13).
+chimeric-vector |One part of the read aligns to the vector plasmid, while another part of the same read aligns to a different reference sequence.
+chimeric-nonvector |The read consists of fragments that align to two or more different reference sequences, neither of which is the vector plasmid.
+
+
+Table: Assigned type definitions.
+
Assigned Type|Definition
-------------:|----------------------------------------------------
-scAAV |Self-complementary AAV where one half of the payload region is a reverse complement of the other, resulting in an intra-molecular double-stranded DNA template. A sequencing read is inferred as scAAV if it has both a primary and supplementary alignment to the vector genome.
-ssAAV |AAV where the resulting DNA template is expected to be single-stranded, as opposed to self-complementary. A sequencing read is inferred as ssAAV if it has a single alignment to the AAV genome and no complementary secondary alignment.
-other |Read consists of a fragment mapping to the vector but with unexpected polarities (e.g. +,-,- or +,+,+) and cannot be well-defined at the moment. This means that the algorithm was not able to distinguish the read either as ssAAV or scAAV by the definitions above, usually due to multiple supplementary alignments on the vector region or other unusual alignment features.
-host |Read originates from the host genome that is given (e.g. hg38, CHM13).
-repcap |Read originates from the repcap plasmid. The Rep gene encodes four proteins (Rep78, Rep68, Rep52, and Rep40), which are required for viral genome replication and packaging, while Cap expression gives rise to the viral capsid proteins (VP; VP1/VP2/VP3), which form the outer capsid shell that protects the viral genome, as well as being actively involved in cell binding and internalization.
-helper |Read originates from the helper plasmid. In addition to Rep and Cap, AAV requires a helper plasmid containing genes from adenovirus. These genes (E4, E2a and VA) mediate AAV replication.
-chimeric |Read consists of fragments that map to one or more "genomes" (e.g. vector and host; helper and repcap).
+:-----------:|----------------------------------------------------
+ssAAV |Single-stranded AAV vector genome where the resulting DNA template is expected to be single-stranded, as opposed to self-complementary. A sequencing read is inferred as ssAAV if it has a single alignment to the vector plasmid’s ITR-to-ITR payload region and no complementary supplemental alignment.
+scAAV |Self-complementary AAV vector genome where one half of the payload region is a reverse complement of the other, resulting in an intramolecular double-stranded DNA template. A sequencing read is inferred as scAAV if it aligns to the payload region in both forward (+) and reverse (-) read directions.
+backbone |Read aligns to the vector plasmid sequence but fully outside of the annotated ITR-to-ITR region, indicating that the sequence fragment originated solely from the plasmid backbone.
+other-vector |Read consists of a fragment mapping to the vector but with characteristics other than those listed above.
-*Note:* Even though ssAAV distinguishes one ITR as the wildtype (wtITR) and the other as the mutated ITR (mITR), we will still refer to them as "left ITR" and "right ITR". For example, "left-partial" would be equivalent to "mITR-partial" in the case where the mITR is the left ITR based on the given genomic coordinates.
`&#10;`{=html}

`&#10;`{=html}

+```{r loadanno, message=FALSE, warning=FALSE}
+# -------------------------
+# annotation.txt -> TARGET_REGION_{START,END}(_REPCAP)
+# -------------------------
+# Read the annotation file to find the vector target region
+# ----------------------------------------------------------
+# e.g.
+# NAME=myHost;TYPE=host;
+# NAME=myVector;TYPE=vector;REGION=1795-6553;
+# NAME=mRepCap;TYPE=repcap;REGION=1895-5987;
+
+TARGET_REGION_START <- 0
+TARGET_REGION_END <- 0
+TARGET_REGION_START_REPCAP <- 0
+TARGET_REGION_END_REPCAP <- 0
+
+annot <- read.table(params$annotation_txt)
+for (i in 1:dim(annot)[1]) {
+ if (unlist(strsplit(annot[i, ], ';'))[2] == 'TYPE=vector') {
+ p <- unlist(strsplit(annot[i, ], ';'))[3];
+ s_e <- as.integer(unlist(strsplit(unlist(strsplit(p, '='))[2], '-')));
+ TARGET_REGION_START <- s_e[1];
+ TARGET_REGION_END <- s_e[2];
+ } else if (unlist(strsplit(annot[i, ], ';'))[2] == 'TYPE=repcap') {
+ p <- unlist(strsplit(annot[i, ], ';'))[3];
+ s_e <- as.integer(unlist(strsplit(unlist(strsplit(p, '='))[2], '-')));
+ TARGET_REGION_START_REPCAP <- s_e[1];
+ TARGET_REGION_END_REPCAP <- s_e[2];
+ }
+}
+```
+
+```{r calculate, message=FALSE, warning=FALSE}
+# -------------------------------------
+# alignments.tsv.gz
+# -------------------------------------
+
+x.all.summary <- read_tsv(paste0(params$path_prefix, '.alignments.tsv.gz'), show_col_types = FALSE)
+
+
+# ---------------------------------------
+# per_read.tsv.gz
+# ---------------------------------------
+
+x.all.read <- read_tsv(paste0(params$path_prefix, '.per_read.tsv.gz'), show_col_types = FALSE)
+
+# Set the display order for categoricals
+valid_types <- c('ssAAV', 'scAAV', 'other-vector', 'backbone')
+valid_subtypes <- c(
+ 'full',
+ 'full-gap',
+ 'read-through',
+ 'partial',
+ 'left-partial',
+ 'right-partial',
+ 'itr-partial',
+ 'left-snapback',
+ 'right-snapback',
+ 'snapback',
+ 'unresolved-dimer',
+ 'tandem',
+ 'complex',
+ 'unclassified',
+ 'backbone')
+
+ref_counts <- x.all.read %>%
+ filter(reference_label != "(unmapped)") %>%
+ group_by(reference_label) %>%
+ summarise(e_count = sum(effective_count)) %>%
+ arrange(desc(e_count))
+ref_levels = c(ref_counts$reference_label, "(unmapped)")
+
+x.all.read <- x.all.read %>%
+ mutate(
+ reference_label = factor(reference_label, levels = ref_levels),
+ assigned_type = factor(assigned_type, levels = valid_types),
+ assigned_subtype = factor(assigned_subtype, levels = valid_subtypes))
+
+total_read_count_all <- sum(x.all.read$effective_count)
+
+# Filter to vector-only and sort types/subtypes
+x.read.vector <- x.all.read %>% filter(reference_label == "vector")
+
+total_read_count_vector <- sum(x.read.vector$effective_count)
+
+x.summary.primary <- filter(x.all.summary, is_mapped == "Y", is_supp == "N")
+x.joined.vector <- left_join(x.read.vector, x.summary.primary, by = "read_id", multiple = "first")
+```
+
# Assigned types by read alignment characteristics
-```{r sopt1, message=FALSE, warning=FALSE}
+```{r typetable, message=FALSE, warning=FALSE}
+df.read1 <- x.all.read %>%
+ group_by(reference_label, assigned_type) %>%
+ summarise(e_count = sum(effective_count)) %>%
+ mutate(freq = round(e_count * 100 / total_read_count_all, 2)) %>%
+ arrange(reference_label, desc(freq))
+
flextable(df.read1) %>%
- set_header_labels(values = c("Assigned Type", "Count", "Frequency (%)"))
+ set_header_labels(values = c("Mapped Reference", "Assigned Type", "Count", "Frequency (%)"))
```
-```{r barpercentreads, message=FALSE, warning=FALSE}
-countreads <- df.read1 %>% dplyr::filter(assigned_type != 'Lambda')
+```{r typeplots, message=FALSE, warning=FALSE}
+countreads <- df.read1 %>% dplyr::filter(reference_label != "lambda")
# Bar chart
-ggplot(countreads, aes(x = assigned_type, y = freq, fill = assigned_type)) +
- geom_bar(stat = 'identity') +
- xlab("Sequence") +
+ggplot(countreads, aes(x = reference_label, y = freq, fill = assigned_type)) +
+ geom_bar(stat = "identity") +
+ scale_x_discrete("Sequence", guide = guide_axis(angle = 30)) +
scale_y_continuous("Reads (%)",
- limits = c(0, NA),
- expand = expansion(mult = c(0, 0.05)))
+ limits = c(0, 100),
+ expand = c(0, 0))
# Pie chart
-ggplot(countreads, aes(x = "", y = e_count, fill = assigned_type)) +
+ggplot(countreads, aes(x = "", y = e_count, fill = reference_label)) +
geom_bar(stat = "identity", width = 1) +
coord_polar("y", start = 0) +
theme_void()
```
-## Single-stranded vs self-complementary frequency
+## Frequency of vector assigned types
-```{r sopt2, message=FALSE, warning=FALSE}
-sopt2 <- df.read.vector1 %>%
- mutate(totfreq = round(100 * e_count / total_read_count.all, 2))
+```{r vectypes, message=FALSE, warning=FALSE}
+df.read.vector1 <- x.read.vector %>%
+ group_by(assigned_type) %>%
+ summarise(e_count = sum(effective_count)) %>%
+ mutate(freq = round(e_count * 100 / total_read_count_vector, 2)) %>%
+ arrange(assigned_type, desc(freq))
-flextable(sopt2) %>%
+vectypes <- df.read.vector1 %>%
+ mutate(totfreq = round(100 * e_count / total_read_count_all, 2))
+
+flextable(vectypes) %>%
set_header_labels(values = c(
"Assigned Type",
"Count",
@@ -119,42 +245,78 @@ flextable(sopt2) %>%
))
```
-# Distribution of read lengths by assigned AAV types
+# Distribution of vector read lengths by assigned type
```{r atypeviolin, message=FALSE, warning=FALSE}
-allowed_subtypes <- c('full', 'full-gap', 'vector+backbone')
-
p2.atype_violin <- ggplot(
- filter(x.read.vector, assigned_subtype %in% allowed_subtypes),
- aes(
- x = paste(assigned_type, assigned_subtype, sep = '-'),
- y = read_len
- )
+ x.joined.vector %>%
+ mutate(assigned_type = factor(assigned_type, levels = valid_types)),
+ aes(x = assigned_type, y = read_len)
) +
geom_violin() +
- labs(title = "Distribution of read lengths by assigned AAV type",
- x = "Assigned AAV type") +
+ labs(title = "Distribution of vector read lengths by assigned type",
+ x = "Assigned type") +
scale_y_continuous("Read length",
limits = c(0, NA),
- expand = expansion(mult = c(0, 0.05))) +
- theme(axis.text.x = element_text(angle = -45, hjust = 0))
+ expand = expansion(mult = c(0, 0.05)))
p2.atype_violin
```
-# Assigned AAV read types detailed analysis
-## Assigned AAV types (top 20)
+# Assigned subtypes detailed analysis
+
+## Definitions
+
+```{asis, echo = params$vector_type == "ss"}
+Table: Assigned subtype definitions for single-stranded input vector.
+
+Assigned Type|Assigned Subtype|Definition
+:-----------:|:--------------:|------------------------------------------------
+ssAAV |full |Read aligns to a fragment of the vector originating from the left (upstream) ITR and ending at the right (downstream) ITR of the vector.
+ssAAV |full-gap |Read aligns to the vector ITR-to-ITR region, as with “full”, but with a significant number of gaps in the alignment between the ITRs.
+ssAAV |read-through |Read aligns to a fragment including the vector as well as plasmid backbone sequence. May imply read-through beyond the right ITR, or reverse packaging if the alignment is to only the left ITR and backbone.
+ssAAV |partial |Read aligns to a fragment of the vector originating from within the ITR sequences.
+ssAAV |left-partial |Read aligns to a fragment of the vector originating from the left (upstream) ITR of the vector while not covering the right ITR.
+ssAAV |right-partial |Read aligns to a fragment of the vector originating from the right (downstream) ITR of the vector while not covering the left ITR.
+ssAAV |left-snapback |Read consists of a double-stranded, sub-genomic fragment including only the left ITR and aligned symmetrically to the (+) and (-) strands.
+ssAAV |right-snapback |Read consists of a double-stranded, sub-genomic fragment including only the right ITR and aligned symmetrically to the (+) and (-) strands.
+other-vector |snapback |Read aligns to a double-stranded fragment in both (+) and (-) strands, but does not include either ITR.
+other-vector |unresolved-dimer|Read aligns to a double-stranded fragment covering the full ITR-to-ITR region in both (+) and (-) strands; in an ssAAV context this is a dimer, twice the size of a full ssAAV vector genome.
+other-vector |tandem |Read has two or more overlapping alignments on the same strand, but none on the reverse strand, indicating tandem duplication of the same region.
+other-vector |complex |Read aligns to a double-stranded fragment with asymmetrical and/or multiple alignments on the (+) and (-) strands.
+other-vector |unclassified |Read alignment does not match any of the above orientations.
+```
+
+```{asis, echo = params$vector_type == "sc"}
+Table: Assigned subtype definitions for self-complementary input vector.
+
+Assigned Type|Assigned Subtype|Definition
+:-----------:|:--------------:|------------------------------------------------
+scAAV |full |Read aligns to a fragment of the vector originating from the left (upstream) ITR and ending at the right (downstream) ITR of the vector.
+scAAV |full-gap |Read aligns to the vector ITR-to-ITR region, as with “full”, but with a significant number of gaps in the alignment between the ITRs.
+scAAV |read-through |Read aligns to a fragment including the vector as well as plasmid backbone sequence. May imply read-through beyond the right ITR, or reverse packaging if the alignment is to only the left ITR and backbone.
+scAAV |partial |Read aligns to a fragment of the vector originating from within the ITR sequences.
+scAAV |itr-partial |Read aligns only to a single-stranded fragment of the vector originating from the left ITR while not covering the right ITR.
+scAAV |snapback |Read consists of a double-stranded, sub-genomic fragment including only the left ITR and read alignments in both (+) and (-) strands.
+other-vector |tandem |Read has two or more overlapping alignments on the same strand, but none on the reverse strand, indicating tandem duplication of the same region.
+other-vector |complex |Read aligns to a double-stranded fragment with asymmetrical and/or multiple alignments on the (+) and (-) strands.
+other-vector |unclassified |Read alignment does not match any of the above orientations.
+```
+
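+Every subtype observed in the data should appear in the tables above. Because
+the setup chunk recodes `assigned_subtype` as a factor with the levels in
+`valid_subtypes`, any unrecognized value would have become `NA`; a quick
+interactive check (a sketch, not evaluated in the report):
+
+```{r subtypecheck, eval=FALSE}
+# FALSE expected: no subtype fell outside the declared vocabulary
+anyNA(x.read.vector$assigned_subtype)
+```
+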
+## Assigned types and subtypes
```{r sopt3, message=FALSE, warning=FALSE}
+df.read.vector2 <- x.read.vector %>%
+ group_by(assigned_type, assigned_subtype) %>%
+ summarise(e_count = sum(effective_count)) %>%
+ mutate(freq = round(e_count * 100 / total_read_count_vector, 2)) %>%
+ arrange(assigned_type, desc(freq))
+
sopt3 <- df.read.vector2 %>%
- mutate(totfreq = round(100 * e_count / total_read_count.all, 2)) %>%
- arrange(
- factor(assigned_type, levels = valid_types),
- factor(assigned_subtype, levels = valid_subtypes)
- )
+ mutate(totfreq = round(100 * e_count / total_read_count_all, 2))
-flextable(sopt3[1:min(nrow(sopt3), 20), ]) %>%
+flextable(sopt3) %>%
set_header_labels(
values = c(
"Assigned Type",
@@ -166,108 +328,22 @@ flextable(sopt3[1:min(nrow(sopt3), 20), ]) %>%
)
```
-## Definitions
-
-Assigned Subtype|Definition
----------------:|----------------------------------------------------------------
-full |Read alignment includes the entire ITR-to-ITR target vector sequence.
-left-partial |Read aligns to a fragment of the vector originating from the left (upstream) ITR of the vector while not covering the right ITR.
-right-partial |Read aligns to a fragment of the vector originating from the right (downstream) ITR of the vector while not covering the left ITR.
-partial |Read aligns to a fragment of the vector originating from within the ITR sequences.
-vector+backbone |Read aligns to a fragment including the vector as well as plasmid backbone sequence. May imply read-through beyond the right ITR, or reverse packaging if the alignment is to only the left ITR and backbone.
-backbone |Read aligns to a fragment originating solely from the plasmid backbone sequence.
-snapback |Read consists of a double-stranded, sub-genomic fragment including only one ITR and read alignments in both (+) and (-) polarities. (ssAAV only)
-
-
-# Flip/flop considerations
-
-Term|Definition
----:|----------------
-Flip/Flop|One ITR is formed by two palindromic arms, called B–B' and C–C', embedded in a larger one, A–A'. The order of these palindromic sequences defines the flip or flop orientation of the ITR. ([Read more](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5655423/))
-
-
-```{r flipflopplot, message=FALSE, warning=FALSE, results='asis'}
-if (exists("df.flipflop")) {
- # Generate flip/flop summary plot
- ffplot <- df.flipflop %>%
- mutate(
- class = if_else(
- leftITR == 'unclassified' |
- rightITR == 'unclassified' |
- leftITR == 'unknown' | rightITR == 'unknown',
- "unclassifed",
- paste(leftITR, rightITR, sep = '-')
- )
- ) %>%
- group_by(type, subtype, class) %>%
- summarize(classct = sum(count)) %>%
- filter(subtype == 'vector-full')
- ggplot(ffplot, aes(x = type, y = classct, fill = class)) +
- geom_bar(stat = 'identity') +
- labs(x = 'Assigned type') +
- scale_y_continuous("Number of reads",
- limits = c(0, NA),
- expand = expansion(mult = c(0, 0.05)))
-
-} else {
- cat("No flip/flop analysis results available to display.")
-}
-```
-
-```{r flipflopsc, message=FALSE, warning=FALSE, results='asis'}
-if (exists("scff")) {
- cat("## Flip/flop configurations, scAAV only\n\n")
- if (nrow(scff) > 1) {
- flextable(scff) %>%
- set_header_labels(values = c("type", "subtype", "leftITR", "rightITR", "count"))
- } else {
- cat("\nNo scAAV flip/flop analysis results available to display.")
- }
-}
-```
-
-```{r flipflopss, message=FALSE, warning=FALSE, results='asis'}
-if (exists("ssff")) {
- cat("## Flip/flop configurations, ssAAV only\n\n")
- if (nrow(ssff) > 1) {
- flextable(ssff) %>%
- set_header_labels(values = c("type", "subtype", "leftITR", "rightITR", "count"))
- } else {
- cat("No ssAAV flip/flop analysis results available to display.")
- }
-}
-```
-
-# Distribution of read length by subtype
+## Distribution of read length by subtype
```{r lenhisto2_subtype, message=FALSE, warning=FALSE}
-# scAAV length histogram
-p1.scAAV_len_hist <- ggplot(filter(x.read.vector, assigned_type == 'scAAV'),
- aes(x = read_len, color = subtype)) +
+# Single plot covering all vector subtypes
+p1_subtypes_len_hist <- ggplot(x.joined.vector,
+                               aes(x = read_len, color = assigned_subtype)) +
geom_freqpoly() +
- labs(title = "Distribution of scAAV read length by subtype") +
+ labs(title = "Distribution of read length by subtype") +
scale_x_continuous("Read length (bp)",
limits = c(0, NA),
expand = expansion(mult = c(0, 0.05))) +
scale_y_continuous("Count",
limits = c(0, NA),
expand = expansion(mult = c(0, 0.05)))
-
-# ssAAV length histogram
-p1.ssAAV_len_hist <- ggplot(filter(x.read.vector, assigned_type == 'ssAAV'),
- aes(x = read_len, color = subtype)) +
- geom_freqpoly() +
- labs(title = "Distribution of ssAAV read length by subtype") +
- scale_x_continuous("Read length (bp)",
- limits = c(0, NA),
- expand = expansion(mult = c(0, 0.05))) +
- scale_y_continuous("Count",
- limits = c(0, NA),
- expand = expansion(mult = c(0, 0.05)))
-
-p1.scAAV_len_hist
-p1.ssAAV_len_hist
+p1_subtypes_len_hist
```
@@ -276,16 +352,16 @@ p1.ssAAV_len_hist
## Gene therapy construct
```{r lenhisto2_construct, message=FALSE, warning=FALSE}
-p1.map_starts <- ggplot(x.summary.vector, aes(map_start0 + 1, fill = map_subtype)) +
+p1.map_starts <- ggplot(x.joined.vector, aes(map_start0 + 1, fill = assigned_subtype)) +
geom_histogram(aes(y = after_stat(count) / sum(after_stat(count))),
binwidth = 150,
boundary = 0,
closed = "left") +
geom_vline(xintercept = TARGET_REGION_START,
- color = 'red',
+ color = "black",
lty = 2) +
geom_vline(xintercept = TARGET_REGION_END,
- color = 'red',
+ color = "black",
lty = 2) +
scale_x_continuous("Mapped reference start position",
limits = c(0, NA),
@@ -295,34 +371,34 @@ p1.map_starts <- ggplot(x.summary.vector, aes(map_start0 + 1, fill = map_subtype
expand = c(0, 0)) +
labs(title = "Distribution of mapped read starts on construct")
-p1.map_ends <- ggplot(x.summary.vector, aes(map_end1, fill = map_subtype)) +
+p1.map_ends <- ggplot(x.joined.vector, aes(map_end1, fill = assigned_subtype)) +
geom_histogram(aes(y = after_stat(count) / sum(after_stat(count))),
binwidth = 150,
boundary = 0,
closed = "left") +
geom_vline(xintercept = TARGET_REGION_START,
- color = 'red',
+ color = "black",
lty = 2) +
geom_vline(xintercept = TARGET_REGION_END,
- color = 'red',
+ color = "black",
lty = 2) +
- scale_x_continuous("mapped reference end position",
+ scale_x_continuous("Mapped reference end position",
limits = c(0, NA),
expand = expansion(mult = c(0, 0.05))) +
- scale_y_continuous("fraction of reads",
+ scale_y_continuous("Fraction of reads",
limits = c(0, 1),
expand = c(0, 0)) +
labs(title = "Distribution of mapped read ends on construct")
-p1.map_len <- ggplot(x.summary.vector, aes(map_len, fill = map_subtype)) +
+p1.map_len <- ggplot(x.joined.vector, aes(map_len, fill = assigned_subtype)) +
geom_histogram(aes(y = after_stat(count) / sum(after_stat(count))),
binwidth = 150,
boundary = 0,
closed = "left") +
- scale_x_continuous("mapped reference length",
+ scale_x_continuous("Mapped reference length",
limits = c(0, NA),
expand = expansion(mult = c(0, 0.05))) +
- scale_y_continuous("fraction of reads",
+ scale_y_continuous("Fraction of reads",
limits = c(0, 1),
expand = c(0, 0)) +
labs(title = "Distribution of mapped spanning region sizes on construct")
@@ -335,7 +411,7 @@ p1.map_len
```{r lenhisto2_rc, message=FALSE, warning=FALSE, results='asis'}
-x.read.repcap <- filter(x.all.read, assigned_type == 'repcap')
+x.read.repcap <- filter(x.all.read, reference_label == 'repcap')
# Only render this subsection if enough reads mapped to repcap
if (dim(x.read.repcap)[1] > 10) {
@@ -408,6 +484,30 @@ if (exists("x.summary.repcap")) {
# Distribution of non-matches by reference position
+```{r varcalc, message=FALSE, warning=FALSE}
+# --------------------------------------------------
+# nonmatch.tsv.gz
+# --------------------------------------------------
+
+x.all.err <- read_tsv(paste0(params$path_prefix, '.nonmatch.tsv.gz'), show_col_types = FALSE)
+
+# Filter for ss/scAAV vector only
+x.err.vector <- filter(x.all.err, read_id %in% x.read.vector$read_id)
+
+# Spell out nonmatch types -- from CIGAR codes to words
+x.err.vector[x.err.vector$type == 'D', "type"] <- 'deletion'
+x.err.vector[x.err.vector$type == 'I', "type"] <- 'insertion'
+x.err.vector[x.err.vector$type == 'X', "type"] <- 'mismatch'
+x.err.vector[x.err.vector$type == 'N', "type"] <- 'gaps'
+
+# Round positions down to 10s for plotting
+x.err.vector$pos0_div <- (x.err.vector$pos0 %/% 10 * 10)
+
+df.err.vector <- x.err.vector %>%
+ group_by(pos0_div, type) %>%
+ summarise(count = n())
+```
+
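+The binning above rounds each 0-based position down to the left edge of its
+10 bp bin using integer division. A minimal illustration (not evaluated in the
+report):
+
+```{r binsketch, eval=FALSE}
+pos0 <- c(3, 17, 109)
+pos0 %/% 10 * 10  # 0, 10, 100
+```
+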
```{r varplots, message=FALSE, warning=FALSE}
p1.err_sub <- ggplot(filter(df.err.vector, type == 'mismatch'),
aes(x = pos0_div, y = count)) +
@@ -448,6 +548,65 @@ p1.err_ins
```
+# Flip/flop configurations
+
+Term|Definition
+:--:|----------------
+Flip/Flop|One ITR is formed by two palindromic arms, called B–B' and C–C', embedded in a larger one, A–A'. The order of these palindromic sequences defines the flip or flop orientation of the ITR. ([Read more](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5655423/))
+
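+With two ITRs and two possible orientations each, a fully classified read falls
+into one of four flip/flop combinations; reads where either ITR cannot be
+called are grouped as "unclassified" below. A minimal sketch of the label space
+(not evaluated in the report):
+
+```{r ffclasses, eval=FALSE}
+expand.grid(leftITR = c("flip", "flop"), rightITR = c("flip", "flop")) |>
+  transform(class = paste(leftITR, rightITR, sep = "-"))
+# class: flip-flip, flop-flip, flip-flop, flop-flop
+```
+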
+```{r ffcalc, message=FALSE, warning=FALSE}
+# --------------------------------------------------
+# flipflop.tsv.gz (if available)
+# --------------------------------------------------
+
+flipflop_tsv <- paste0(params$path_prefix, '.flipflop.tsv.gz')
+if (file.exists(flipflop_tsv)) {
+ data.flipflop <- read_tsv(flipflop_tsv, show_col_types = FALSE)
+ df.flipflop <- data.flipflop %>%
+ group_by(type, subtype, leftITR, rightITR) %>%
+ summarise(count = n())
+}
+```
+
+```{r ffplot, message=FALSE, warning=FALSE, results='asis'}
+if (exists("df.flipflop")) {
+ # Generate flip/flop summary plot
+ ffplot <- df.flipflop %>%
+ mutate(
+ class = if_else(
+ leftITR == 'unclassified' | rightITR == 'unclassified',
+ "unclassified",
+ paste(leftITR, rightITR, sep = '-')
+ )
+ ) %>%
+ mutate(
+    unclass = if_else(class == "unclassified", "unclassified", "classified")
+ ) %>%
+ group_by(type, subtype, class, unclass) %>%
+ summarize(classct = sum(count)) %>%
+ filter(subtype == 'full')
+ ggplot(ffplot, aes(x = class, y = classct, fill = unclass)) +
+ geom_bar(stat = 'identity', show.legend = FALSE) +
+    labs(x = 'Flip/flop configuration') +
+ scale_y_continuous("Number of reads",
+ limits = c(0, NA),
+ expand = expansion(mult = c(0, 0.05)))
+
+} else {
+ cat("No flip/flop analysis results available to display.")
+}
+```
+
+```{r fftable, message=FALSE, warning=FALSE, results='asis'}
+if (exists("df.flipflop") && (nrow(df.flipflop) > 1)) {
+ flextable(df.flipflop) %>%
+ set_header_labels(values = c("type", "subtype", "leftITR", "rightITR", "count"))
+}
+```
+
+
+`\break`{=latex}
+
# Methods
This report was generated by an automated analysis of long-read sequencing data from
diff --git a/src/summarize_AAV_alignment.py b/src/summarize_alignment.py
similarity index 57%
rename from src/summarize_AAV_alignment.py
rename to src/summarize_alignment.py
index 1be7276..ef442bc 100755
--- a/src/summarize_AAV_alignment.py
+++ b/src/summarize_alignment.py
@@ -1,18 +1,19 @@
#!/usr/bin/env python3
"""Summarize the AAV alignment."""
+from __future__ import annotations
+
import gzip
import itertools
import logging
import os
import re
-import shutil
import subprocess
import sys
from csv import DictReader, DictWriter
from multiprocessing import Process
-# import pdb
+# import pdb
import pysam
CIGAR_DICT = {
@@ -28,7 +29,7 @@
9: "B",
}
-annot_rex = re.compile(r"NAME=(\S+);TYPE=([a-zA-Z]+);(REGION=\d+\-\d+){0,1}")
+annot_rex = re.compile(r"NAME=([^;\s]+);TYPE=([^;\s]+);(REGION=\d+\-\d+){0,1}")
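+# For example, a line like "NAME=myVector;TYPE=vector;REGION=1795-6553;" yields
+# groups ("myVector", "vector", "REGION=1795-6553"); the REGION group is
+# optional and is None when absent.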
ccs_rex = re.compile(r"\S+\/\d+\/ccs(\/fwd|\/rev)?")
ANNOT_TYPE_PRIORITIES = {"vector": 1, "repcap": 2, "helper": 3, "lambda": 4, "host": 5}
@@ -47,26 +48,34 @@ def subset_sam_by_readname_list(
exclude_subtype=False,
exclude_type=False,
):
- qname_list = {} # qname --> (a_type, a_subtype)
- for r in DictReader(open(per_read_tsv), delimiter="\t"):
- # pdb.set_trace()
- if (
- wanted_types is None
- or (not exclude_type and r["assigned_type"] in wanted_types)
- or (exclude_type and r["assigned_type"] not in wanted_types)
- ) and (
- wanted_subtypes is None
- or (not exclude_subtype and (r["assigned_subtype"] in wanted_subtypes))
- or (exclude_subtype and (r["assigned_subtype"] not in wanted_subtypes))
- ):
- qname_list[r["read_id"]] = (r["assigned_type"], r["assigned_subtype"])
+ qname_lookup = {} # qname --> (a_type, a_subtype)
+ with gzip.open(per_read_tsv, "rt") as per_read_f:
+ for row in DictReader(per_read_f, delimiter="\t"):
+ # pdb.set_trace()
+ if (
+ wanted_types is None
+ or (not exclude_type and row["assigned_type"] in wanted_types)
+ or (exclude_type and row["assigned_type"] not in wanted_types)
+ ) and (
+ wanted_subtypes is None
+ or (
+ not exclude_subtype and (row["assigned_subtype"] in wanted_subtypes)
+ )
+ or (
+ exclude_subtype and (row["assigned_subtype"] not in wanted_subtypes)
+ )
+ ):
+ qname_lookup[row["read_id"]] = (
+ row["assigned_type"],
+ row["assigned_subtype"],
+ )
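+    # Filter semantics: with exclude_type=False, a read passes only if its
+    # assigned_type is in wanted_types; with exclude_type=True, only if it is
+    # not. The subtype flags work the same way, both conditions must hold, and
+    # passing None for wanted_types/wanted_subtypes disables that condition.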
cur_count = 0
reader = pysam.AlignmentFile(in_bam, "rb", check_sq=False)
writer = pysam.AlignmentFile(out_bam, "wb", header=reader.header)
- for r in reader:
- if r.qname in qname_list:
- d = add_assigned_types_to_record(r, *qname_list[r.qname])
+ for rec in reader:
+ if rec.qname in qname_lookup:
+ d = add_assigned_types_to_record(rec, *qname_lookup[rec.qname])
writer.write(pysam.AlignedSegment.from_dict(d, reader.header))
cur_count += 1
if max_count is not None and cur_count >= max_count:
@@ -123,19 +132,20 @@ def iter_cigar_w_aligned_pair(rec, writer):
return total_err, total_len
-def read_annotation_file(annot_filename):
- """Read the annotation.txt file into a dictionary.
+def load_annotation_file(annot_filename):
+ """Parse the annotation.txt file into a dictionary.
Example:
- NAME=chr1;TYPE=host;
- NAME=chr2;TYPE=host;
- NAME=myVector;TYPE=vector;REGION=1795-6553;
- NAME=myCapRep;TYPE=repcap;REGION=1895-5987;
- NAME=myHelper;TYPE=helper;
+ NAME=chr1;TYPE=host;
+ NAME=chr2;TYPE=host;
+ NAME=myVector;TYPE=vector;REGION=1795-6553;
+ NAME=myCapRep;TYPE=repcap;REGION=1895-5987;
+ NAME=myHelper;TYPE=helper;
:param annot_filename: Annotation file following the format indicated above. Only "vector" is required. Others optional.
:return:
+
"""
result = {}
for line in open(annot_filename):
@@ -144,23 +154,27 @@ def read_annotation_file(annot_filename):
if m is None:
raise RuntimeError(
f"{stuff} is not a valid annotation line! Should follow format "
- "`NAME=xxxx;TYPE=xxxx;REGION=xxxx;`. Abort!"
+ "`NAME=xxxx;TYPE=xxxx;REGION=xxxx;` or `NAME=xxxx;TYPE=xxxx;`. Abort!"
)
seq_name = m.group(1)
- ref_type = m.group(2)
+ ref_label = m.group(2)
coord_region = (
None
if m.group(3) is None
else tuple(map(int, m.group(3).split("=")[1].split("-")))
)
- if ref_type in result:
- raise RuntimeError(f"Annotation file has multiple {ref_type} types. Abort!")
- if ref_type not in ANNOT_TYPE_PRIORITIES:
+    if seq_name in result:
raise RuntimeError(
- f"{ref_type} is not a valid type (host, repcap, vector, helper). Abort!"
+            f"Annotation file has duplicate entries for {seq_name}. Abort!"
+ )
+ if ref_label not in ANNOT_TYPE_PRIORITIES:
+ logging.info(
+ "Nonstandard reference label %s; the known labels are: %s",
+ ref_label,
+ ", ".join(ANNOT_TYPE_PRIORITIES.keys()),
)
- result[seq_name] = {"type": ref_type, "region": coord_region}
+ result[seq_name] = {"label": ref_label, "region": coord_region}
return result
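+
+# Example usage (with the annotation file shown in the docstring):
+#   annot = load_annotation_file("annotation.txt")
+#   annot["myVector"]  ->  {"label": "vector", "region": (1795, 6553)}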
@@ -225,7 +239,7 @@ def is_on_target(r, target_start, target_end):
return "vector+backbone"
-def assign_read_type(r, annotation):
+def assign_alignment_type(r, annotation):
"""Determine the read alignment type and subtype.
:param read_dict: dict of {'supp', 'primary'}
@@ -237,14 +251,15 @@ def assign_read_type(r, annotation):