Restructure output TSVs by etal · Pull Request #53 · formbio/laava · GitHub

Restructure output TSVs #53


Merged
merged 43 commits (Oct 15, 2024)

Commits (43)
79d8237
workflow.json: Auto-populate input metadata CSV from folder contents
etal Aug 9, 2024
86b10a7
Fix handling of seq_folder_input
etal Aug 13, 2024
c6e914a
test: Check diff against *.readsummary.tsv and *.alignments.tsv
etal Aug 13, 2024
ee882c0
calculate_rdata: group code by output TSVs
etal Aug 13, 2024
e767fc0
Remove summary.tsv output and write alignments.tsv from Python
etal Aug 16, 2024
1c2006f
test: also check TSV line counts with 'make test'
etal Aug 16, 2024
cabce62
Implement new read classification scheme; remove readsummary.tsv
etal Aug 20, 2024
e64ed5b
Rename summarize_AAV_alignment.py -> summarize_alignment.py
etal Aug 21, 2024
7cb1b79
Remove sequence-error.tsv and some unused R code
etal Aug 21, 2024
f8c56dd
Add script guess_vector_type.py
etal Aug 22, 2024
60d19d4
Makefile: don't clean test/build, that's another Makefile's job
etal Aug 22, 2024
e8835ce
Makefile: add deployment command "make formbio"
etal Aug 23, 2024
aa1a8bb
test/Makefile: fix
etal Aug 23, 2024
4e7f179
report: fix type/subtype tables and plots
etal Aug 23, 2024
de9af1d
nextflow.config: Respect the given container_version locally
etal Aug 27, 2024
344f9a1
Set effective count to 2 for single-stranded nonvector reads, too
etal Aug 27, 2024
039f753
summary.tsv: Rename column "read_ref_label" to "reference_label"
etal Aug 28, 2024
f369155
Pass through unlabeled reference names to the report
etal Aug 29, 2024
9bba348
Report improvements w/ joined alignments-per_read dataframe
etal Sep 3, 2024
05a7918
summarize_alignment: emit subset BAMs based on given vector_type
etal Sep 3, 2024
9062673
Add "strand" column to alignments.tsv (closes #32)
etal Sep 5, 2024
a6c6b05
Compress alignments.tsv and per_read.tsv with gzip (closes #56)
etal Sep 5, 2024
0e8f1cf
Classify left/right-snapback under ssAAV
etal Sep 6, 2024
aa09d4c
report: update definitions, titles and section order
etal Sep 8, 2024
07b1c20
Don't limit workflow execution to a single cloud VM
etal Sep 9, 2024
d072b9e
Use vector_type choice strings "ss", "sc" -- it's cleaner
etal Sep 10, 2024
766a7d4
Add a header row to reference_names.tsv
etal Sep 16, 2024
bf6a92c
Makefile: also execute local test jobs with "make all"
etal Sep 9, 2024
5a0e29d
test: Add pytest script to verify command outputs (#35)
etal Sep 12, 2024
a23edde
test: Remove snapshotted TSVs (no longer used)
etal Sep 16, 2024
104e2d7
conda: add pandas, pytest to laava packages
etal Sep 17, 2024
001e66b
docker: Update miniconda3 base image version
etal Sep 17, 2024
165b457
CI: run the local test suite on script outputs (closes #35)
etal Sep 17, 2024
fbf906c
Rename flipflop output files; make assigned_subtype consistent
etal Sep 17, 2024
7163a5c
Remove .Rdata intermediate file (closes #29)
etal Sep 18, 2024
938f11e
Update workflow .md documentation
etal Sep 17, 2024
281c670
report: Improve plot aesthetics
etal Sep 19, 2024
8f2be91
Enable "new" chimeric logic considering all supp alignments
etal Sep 19, 2024
cd4fae8
If vector_type "unspecified", guess within the pipeline (closes #44)
etal Sep 19, 2024
07fcb8e
Move bin/*.sh to src/* to ensure PATH consistency
etal Sep 20, 2024
16c2b24
Add guess_vector_type_length.py, another heuristic for sc/ss
etal Oct 14, 2024
c764a2c
README: explain local CI tests vs. Nextflow examples
etal Oct 14, 2024
5512fcf
report: Show sample metadata on page 1 (#12, #59)
etal Oct 14, 2024
14 changes: 11 additions & 3 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
name: Tests
name: CI
on:
push:
branches:
- master
- main
paths-ignore:
- 'docs/**'
- '*.md'
@@ -14,7 +14,7 @@ on:
- '*.rst'
jobs:
tests:
name: Lint with R and Python ${{ matrix.python }}
name: Test with Python ${{ matrix.python }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
@@ -30,6 +30,11 @@ jobs:
- uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python }}
- name: Install apt packages
run: |
sudo apt-get update \
&& sudo apt-get install -y samtools texlive-latex-extra texlive-latex-recommended \
&& sudo apt-get remove -y fonts-urw-base35 libgs9 libgs9-common libjbig2dec0 poppler-data
- name: Cache conda
uses: actions/cache@v4
env:
@@ -59,3 +64,6 @@ jobs:
- name: Lint R script
run: |
Rscript -e 'library(lintr); options(lintr.error_on_lint=TRUE); lint_dir(".", linters=linters_with_tags("correctness"))'
- name: Test script outputs
run: |
cd test && make test
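The new CI step runs the repo's local test suite via `cd test && make test`; per the commit messages, that suite includes pytest checks on script outputs (e.g. TSV line counts). The actual tests aren't shown in this diff — the following is only a sketch of the kind of helper such a check might use, with a hypothetical function name, not code from the PR:

```python
import gzip

def count_tsv_rows(path: str) -> int:
    """Count data rows in a TSV file, transparently handling gzip.

    The header row is excluded, so an empty-but-headered file reports 0.
    """
    opener = gzip.open if path.endswith(".gz") else open
    with opener(path, "rt") as handle:
        n_lines = sum(1 for _ in handle)
    return max(n_lines - 1, 0)  # subtract the header row

```

A pytest test could then assert, for example, that `per_read.tsv.gz` contains one row per input read.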
42 changes: 27 additions & 15 deletions Makefile
@@ -2,31 +2,43 @@

# Nextflow workflow output directory
wf_out_dir := workflow-outputs/output
snapshot_dir := test/build-snapshot

all: laava laava_dev
# Form Bio workflow deployment
formbio_org := form-bio-solutions
formbio_project := aav-qc-workshop
# Avoid uploading the local test dir; it's > the upload size limit
tmp_stash_dir := /tmp/laava-deploy-test

.PHONY: clean laava laava_dev sc ss diffcheck-sc diffcheck-ss
all: laava laava_dev sc ss min folder

.PHONY: clean laava laava_dev sc ss min folder diffcheck-sc diffcheck-ss formbio
clean:
rm -fv .nextflow.log*
rm -fv test/build/*
rm -rf workflow-outputs/*
rm -f .nextflow.log*
rm -fr .nextflow/*
rm -fr workflow-outputs/*


laava laava_dev: %: %.dockerfile laava.conda_env.yml
docker build -t ghcr.io/formbio/$@:latest -f $< .


sc: params-local-sc-no-ff.json
sc ss min folder: %: params-local-%.json
nextflow run -profile local main.nf -params-file $<

ss: params-local-ss-with-ff.json
nextflow run -profile local main.nf -params-file $<

min: params-local-no-file-sc.json
nextflow run -profile local main.nf -params-file $<
diffcheck-sc: $(wf_out_dir)/sc.subsample005.per_read.tsv
diff $(snapshot_dir)/sc.per_read.tsv $< && echo "OK"

diffcheck-ss: $(wf_out_dir)/ss.subsample005.per_read.tsv $(wf_out_dir)/ss.subsample005.flipflop.tsv
#diff $(snapshot_dir)/ss.per_read.tsv $< && echo "OK"
diff $(snapshot_dir)/ss.flipflop.tsv $(lastword $^) && echo "OK"

diffcheck-sc: $(wf_out_dir)/sc.subsample005.bam.per_read.tsv
diff test/build-snapshot/sc.per_read.tsv $< && echo "OK"

diffcheck-ss: $(wf_out_dir)/ss.subsample005.bam.per_read.tsv $(wf_out_dir)/ss.subsample005.bam.flipflop.tsv
diff test/build-snapshot/ss.per_read.tsv $< && echo "OK"
diff test/build-snapshot/ss.flipflop.tsv $(lastword $^) && echo "OK"
formbio: clean
mv test/ "$(tmp_stash_dir)"
formbio workflow upload \
--org "$(formbio_org)" --project "$(formbio_project)" \
--env prod --visibility PROJECT \
--version dev --repo . --workflow laava
mv "$(tmp_stash_dir)" test
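The `diffcheck-*` targets above compare pipeline outputs against snapshotted TSVs with plain `diff`, echoing "OK" on a match. A minimal Python equivalent of that check (function name and error reporting are ours, not the repo's):

```python
import sys

def diffcheck(snapshot_path: str, output_path: str) -> bool:
    """Return True if the two TSV files match line for line, like `diff && echo OK`."""
    with open(snapshot_path) as snap, open(output_path) as out:
        snap_lines = snap.readlines()
        out_lines = out.readlines()
    if snap_lines == out_lines:
        print("OK")
        return True
    # Report the first differing line, loosely mimicking diff's output
    for i, (a, b) in enumerate(zip(snap_lines, out_lines), start=1):
        if a != b:
            print(f"line {i}: expected {a!r}, got {b!r}", file=sys.stderr)
            break
    return False

```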
36 changes: 29 additions & 7 deletions README.md
@@ -64,8 +64,8 @@ There are several ways to satisfy the script dependencies locally.
### Option 1: Development docker image (`laava_dev` container image)

The `laava_dev.dockerfile` in this repo installs the scripts' dependencies, but not the
scripts themselves, into a Docker container image that you can then use to run the
local copies of the scripts.
scripts themselves, into a Docker container image that you can then use to run the local
copies of the scripts.

To build the container image with the name `laava_dev` (you can use another name if you prefer):

@@ -136,17 +136,39 @@ R packages:

## Testing

### Automated local tests

The `test/` subdirectory in this repo contains small example input files and a Makefile
to run the scripts to reanalyze them and produce example HTML and PDF reports.

Once you've completed installation (above), activate your conda environment or Docker container and change to the test directory:
Once you've completed installation (above), activate your conda environment or Docker
container and change to the test directory:

```
cd test
```

To generate the HTML and PDF reports from the test dataset included in the repo (this takes about 1-2 minutes):
To generate the HTML and PDF reports from the test dataset included in the repo, use any
of these commands:

```
make
```
* `make sc` -- run the example self-complementary AAV (scAAV) sample. This takes about 1-2 minutes.
* `make ss` -- run the example single-stranded AAV (ssAAV) sample. This takes about 2-3 minutes, including an additional flip/flop analysis step.
* `make all` -- run both example AAV samples.
* `make test` -- run both samples and check the results quantitatively.


### Example Nextflow jobs

The top level of this repo includes several JSON files with Nextflow parameter
configurations (`params-*.json`). They use the same inputs as the automated test suite
(above), plus the `laava` Docker image and a local installation of `nextflow` (which you
can install any way you like, e.g. conda or brew).

You can run them directly with Nextflow as usual, or use the Makefile at the top level
of the repo to launch them:

* `make sc` or `make ss` -- run the example self-complementary AAV (scAAV) or
single-stranded AAV (ssAAV) sample, as above.
* `make min` -- run the scAAV sample with the minimum number of required parameters,
exercising the default behavior including guessing the construct vector type (sc/ss).
* `make folder` -- run both samples via folder input.
2 changes: 2 additions & 0 deletions laava.conda_env.yml
@@ -7,8 +7,10 @@ dependencies:
- python>=3.7.6
- r-base>=3.6.0
- biopython
- pandas
- parasail-python>=1.3.4
- pysam
- pytest
- r-flextable
- r-lintr
- r-rmarkdown
6 changes: 3 additions & 3 deletions laava.dockerfile
@@ -1,5 +1,5 @@
# Interactive environment with scripts and extra dependencies
FROM --platform=linux/amd64 continuumio/miniconda3:24.4.0-0
FROM --platform=linux/amd64 continuumio/miniconda3:24.7.1-0
LABEL org.opencontainers.image.source https://github.com/formbio/AAV

RUN apt-get update \
@@ -21,15 +21,15 @@ RUN rm -rf /var/lib/apt/lists/*

# Install directly into 'base' conda environment
COPY laava.conda_env.yml ./conda_env.yml
RUN conda install -y -n base python=3.10
RUN conda env update -v -n base -f conda_env.yml

# Executable scripts
RUN mkdir -p /opt/laava
RUN chmod 777 /opt/laava/
COPY src/* /opt/laava/
RUN chmod +x /opt/laava/*.py /opt/laava/*.R
RUN chmod +x /opt/laava/*.py /opt/laava/*.R /opt/laava/*.sh
ENV PATH "/opt/laava:$PATH"
ENV PYTHONPATH "/opt/laava:$PYTHONPATH"

WORKDIR /data/

2 changes: 2 additions & 0 deletions laava_dev.conda_env.yml
@@ -9,6 +9,7 @@ dependencies:
- biopython
- graphviz
- nextflow
- pandas
- parasail-python>=1.3.4
- pysam
- pytest
@@ -17,3 +18,4 @@ dependencies:
- r-rmarkdown
- r-tidyverse
- ruff
- shellcheck
3 changes: 1 addition & 2 deletions laava_dev.dockerfile
@@ -1,5 +1,5 @@
# Development environment for running the scripts, no scripts, extra dependencies
FROM --platform=linux/amd64 continuumio/miniconda3:24.4.0-0
FROM --platform=linux/amd64 continuumio/miniconda3:24.7.1-0

# Set the container's timezone to match this local machine
RUN ln -snf /usr/share/zoneinfo/$CONTAINER_TIMEZONE /etc/localtime && echo $CONTAINER_TIMEZONE > /etc/timezone
@@ -27,7 +27,6 @@ RUN apt-get update \

# Install directly into 'base' conda environment
COPY laava_dev.conda_env.yml ./conda_env.yml
RUN conda install -y -n base python=3.10
RUN conda env update -v -n base -f conda_env.yml

WORKDIR /data/
78 changes: 30 additions & 48 deletions main.nf
@@ -1,67 +1,54 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl=2

include { map_reads; make_report } from './modules/local/laava'
include { match_metadata_to_files; map_reads; make_report } from './modules/local/laava'

NO_FILE = file("$projectDir/bin/NO_FILE")
NO_FILE2 = file("$projectDir/bin/NO_FILE2")

// Unpack the input sample(s) and metadata
def prepareInput(
def prepare_input(
seq_reads_file, seq_reads_folder, sample_unique_id, sample_display_name,
sample_in_metadata
) {
// Known file extensions
def SAMPLE_FILE_GLOB = "*.{bam,fastq,fastq.gz,fq,fq.gz}"
def EXTENSION_REGEX = /\.(bam|fastq|fastq\.gz|fq|fq\.gz)$/

if (seq_reads_folder) {
// Multi-sample mode
def sampleFiles = file("${params.seq_reads_folder}/${SAMPLE_FILE_GLOB}")

if (params.sample_in_metadata) {
// Metadata provided - use it to match files
def csvData = channel
.fromPath(params.sample_in_metadata)
.splitCsv(header: true, sep='\t', strip=true)
.map { row -> [row.sample_unique_id, row.sample_display_name] }
.toList()
.val
def matchedSamples = csvData.collect { sampleId, sampleName ->
def matchingFiles = sampleFiles.findAll { it.name.contains(sampleId) }
if (matchingFiles.size() != 1) {
error "Error: sample_unique_id '${sampleId}' matches ${matchingFiles.size()} files. Values must be unique."
}
[sampleId, sampleName, matchingFiles[0]]
}
// Check if all files were matched
def unmatchedFiles = sampleFiles - matchedSamples.collect { it[2] }
if (unmatchedFiles) {
error "Error: The following files were not matched to any sample_unique_id: ${unmatchedFiles.join(', ')}"
}
return channel.fromList(matchedSamples)
if (sample_in_metadata) {
// TSV provided - load it in a separate process
return match_metadata_to_files(file(sample_in_metadata), file(seq_reads_folder))
.splitCsv(sep: '\t')
.map { row -> [row[0], row[1], file("${seq_reads_folder}/" + row[2])] }

} else {
// No metadata provided - generate sampleId and sampleName from filenames
return channel.fromList(sampleFiles.collect { sampleFile ->
def stem = sampleFile.baseName.replaceFirst(EXTENSION_REGEX, '')
[stem, stem, sampleFile]
// No TSV provided - generate sample_id and sample_name from filenames
//def found_files = file("${sample_folder}/*.{bam,fastq,fastq.gz,fq,fq.gz}")
def found_files = file("${seq_reads_folder}/${SAMPLE_FILE_GLOB}")
return channel.fromList(found_files.collect { seqfile ->
def stem = seqfile.baseName.replaceFirst(EXTENSION_REGEX, '')
[stem, stem, seqfile]
})

}
} else if (params.seq_reads_file) {
} else if (seq_reads_file) {
// Single-sample mode
def sampleFile = file(params.seq_reads_file)
if (!sampleFile.name.matches(/.*${EXTENSION_REGEX}/)) {
error "Error: The provided sample file '${sampleFile.name}' does not have a supported extension (${SAMPLE_FILE_GLOB})"
def seq_file = file(seq_reads_file)
if (!seq_file.exists()) {
error "Error: The provided sample file '${seq_reads_file}' does not exist."
}
if (!seq_file.name.matches(/.*${EXTENSION_REGEX}/)) {
error "Error: The provided sample file '${seq_file.name}' does not have a supported extension (bam, fastq, fastq.gz, fq, fq.gz)"
}

def stem = sampleFile.baseName.replaceFirst(EXTENSION_REGEX, '')
def sampleId = params.sample_unique_id ?: stem
def sampleName = params.sample_display_name ?: params.sample_unique_id ?: stem
return channel.of([sampleId, sampleName, sampleFile])
def stem = seq_file.baseName.replaceFirst(EXTENSION_REGEX, '')
def sample_id = sample_unique_id ?: stem
def sample_name = sample_display_name ?: sample_unique_id ?: stem
return channel.of([sample_id, sample_name, seq_file])

} else {
error "Invalid input parameters. Provide either a sample folder path or a single sample file."
error "Invalid input parameters. Provide either a sample folder, a TSV file with sample folder, or a single sample file."
}
}

@@ -91,7 +78,7 @@ workflow laava {

main:
// Get a tuple of (ID, name, file) each given sample file and metadata
sample_channel = prepareInput(
sample_channel = prepare_input(
seq_reads_file, seq_reads_folder, sample_unique_id, sample_display_name,
sample_in_metadata
)
@@ -120,20 +107,15 @@
emit:
mapped_sam = map_reads.out.mapped_sam
mapped_bam = map_reads.out.mapped_bam
metadata_out_tsv = make_report.out.metadata_tsv
alignments_tsv = make_report.out.alignments_tsv
per_read_tsv = make_report.out.per_read_tsv
summary_tsv = make_report.out.summary_tsv
nonmatch_stat_tsvgz = make_report.out.nonmatch_stat_tsvgz
nonmatch_tsv = make_report.out.nonmatch_tsv
tagged_bam = make_report.out.tagged_bam
subtype_bams = make_report.out.subtype_bams
subtype_bais = make_report.out.subtype_bais
flipflop_assignments_tsv = make_report.out.flipflop_assignments_tsv
flipflop_bams = make_report.out.flipflop_bams
alignments_tsv = make_report.out.alignments_tsv
readsummary_tsv = make_report.out.readsummary_tsv
sequence_error_tsv = make_report.out.sequence_error_tsv
flipflop_tsv = make_report.out.flipflop_tsv
rdata = make_report.out.rdata
metadata_out_tsv = make_report.out.metadata_tsv
}


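In the fallback branch of `prepare_input` above, both the sample ID and display name are derived from the file's base name by stripping the recognized read-file extensions. A rough Python equivalent of that stem logic (the pattern mirrors `EXTENSION_REGEX` from the diff; the function name is ours):

```python
import re

# Mirrors the Nextflow EXTENSION_REGEX: recognized sequencing-read extensions
EXTENSION_REGEX = re.compile(r"\.(bam|fastq|fastq\.gz|fq|fq\.gz)$")

def sample_stem(filename: str) -> str:
    """Strip a recognized read-file extension to derive the sample ID/name."""
    return EXTENSION_REGEX.sub("", filename)

```

Note that double extensions like `.fastq.gz` are handled by listing them explicitly in the alternation, rather than stripping one suffix at a time.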
38 changes: 24 additions & 14 deletions modules/local/laava.nf
@@ -1,3 +1,19 @@
process match_metadata_to_files {
input:
path sample_in_metadata
path sample_folder

output:
path("metadata_with_paths.tsv")

script:
"""
match_metadata_to_files.py ${sample_in_metadata} ${sample_folder} \\
> metadata_with_paths.tsv
"""
}


process map_reads() {
publishDir "$params.output", mode: "copy"

@@ -60,23 +76,17 @@ process make_report() {
path(flipflop_fa)

output:
// summarize alignment
path("${sample_id}.per_read.tsv"), emit: per_read_tsv
path("${sample_id}.summary.tsv"), emit: summary_tsv
path("${sample_id}.nonmatch_stat.tsv.gz"), emit: nonmatch_stat_tsvgz
// summary tables
path("${sample_id}.metadata.tsv"), emit: metadata_tsv
path("${sample_id}.alignments.tsv.gz"), emit: alignments_tsv
path("${sample_id}.per_read.tsv.gz"), emit: per_read_tsv
path("${sample_id}.nonmatch.tsv.gz"), emit: nonmatch_tsv
path("${sample_id}.flipflop.tsv.gz"), emit: flipflop_tsv, optional: true
// intermediate data
path("${sample_id}.tagged.bam"), emit: tagged_bam
path("${sample_id}.*.tagged.sorted.bam"), emit: subtype_bams
path("${sample_id}.*.tagged.sorted.bam.bai"), emit: subtype_bais
// flip-flop
path("${sample_id}.flipflop_assignments.tsv"), emit: flipflop_assignments_tsv, optional: true
path("${sample_id}.*-flipflop.bam"), emit: flipflop_bams, optional: true
// intermediate data
path("${sample_id}.metadata.tsv"), emit: metadata_tsv
path("${sample_id}.alignments.tsv"), emit: alignments_tsv
path("${sample_id}.readsummary.tsv"), emit: readsummary_tsv
path("${sample_id}.sequence-error.tsv"), emit: sequence_error_tsv
path("${sample_id}.flipflop.tsv"), emit: flipflop_tsv, optional: true
path("${sample_id}.Rdata"), emit: rdata, optional: true
path("${sample_id}.flipflop-*.bam"), emit: flipflop_bams, optional: true
// report
path("${sample_id}_AAV_report.html"), emit: aav_report_html
path("${sample_id}_AAV_report.pdf"), emit: aav_report_pdf
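The new `match_metadata_to_files` process delegates to `match_metadata_to_files.py`, whose source isn't included in this diff. Based on the inline Groovy logic it replaces in `main.nf` (one file per `sample_unique_id`, no files left unmatched), a plausible sketch of its behavior — an assumption, not the actual script:

```python
import csv
import os

def match_metadata_to_files(metadata_tsv: str, sample_folder: str):
    """Pair each sample_unique_id with exactly one read file in the folder.

    Returns rows of (sample_unique_id, sample_display_name, filename).
    Raises ValueError if an ID matches zero or multiple files, or if any
    read file in the folder is left unmatched.
    """
    files = sorted(
        f for f in os.listdir(sample_folder)
        if f.endswith((".bam", ".fastq", ".fastq.gz", ".fq", ".fq.gz"))
    )
    rows = []
    with open(metadata_tsv) as handle:
        for rec in csv.DictReader(handle, delimiter="\t"):
            sample_id = rec["sample_unique_id"].strip()
            hits = [f for f in files if sample_id in f]
            if len(hits) != 1:
                raise ValueError(
                    f"sample_unique_id {sample_id!r} matches {len(hits)} files; "
                    "values must be unique"
                )
            rows.append((sample_id, rec["sample_display_name"].strip(), hits[0]))
    unmatched = set(files) - {row[2] for row in rows}
    if unmatched:
        raise ValueError(
            f"files not matched to any sample_unique_id: {sorted(unmatched)}"
        )
    return rows

```

The real script presumably writes these rows to `metadata_with_paths.tsv`, which the process then parses with `splitCsv` as shown above.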