nf-core · JoseEspinosa · Nov 15, 2024 · Nov 29, 2024 · Dec 3, 2024 · Dec 3, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -46,9 +46,34 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Parameters
 
-| Old parameter | New parameter         |
-| ------------- | --------------------- |
-|               | `--pdb_obsolete_path` |
+| Old parameter                | New parameter                  |
+| ---------------------------- | ------------------------------ |
+|                              | `--pdb_obsolete_path`          |
+| `--small_bfd_link`           | `--alphafold2_small_bfd_link`  |
+| `--mgnify_link`              | `--alphafold2_mgnify_link`     |
+| `--pdb_mmcif_link`           | `--alphafold2_pdb_mmcif_link`  |
+| `--uniref30_alphafold2_link` | `--alphafold2_uniref30_link`   |
+| `--uniref90_link`            | `--alphafold2_uniref90_link`   |
+| `--pdb_seqres_link`          | `--alphafold2_pdb_seqres_link` |
+| `--small_bfd_path`           | `--alphafold2_small_bfd_path`  |
+| `--mgnify_path_alphafold2`   | `--alphafold2_mgnify_path`     |
+| `--pdb_mmcif_path`           | `--alphafold2_pdb_mmcif_path`  |
+| `--uniref30_alphafold2_path` | `--alphafold2_uniref30_path`   |
+| `--uniref90_path`            | `--alphafold2_uniref90_path`   |
+| `--pdb_seqres_path`          | `--alphafold2_pdb_seqres_path` |
+| `--uniprot_path`             | `--alphafold2_uniprot_path`    |
+|                              | `--alphafold3_small_bfd_link`  |
+|                              | `--alphafold3_mgnify_link`     |
+|                              | `--alphafold3_uniref90_link`   |
+|                              | `--alphafold3_pdb_seqres_link` |
+|                              | `--uniprot_link`               |
+|                              | `--alphafold3_small_bfd_path`  |
+|                              | `--alphafold3_params_path`     |
+|                              | `--alphafold3_mgnify_path`     |
+|                              | `--alphafold3_pdb_mmcif_path`  |
+|                              | `--alphafold3_uniref90_path`   |
+|                              | `--alphafold3_pdb_seqres_path` |
+|                              | `--alphafold3_uniprot_path`    |
 
 > **NB:** Parameter has been **updated** if both old and new parameter information is present.
 > **NB:** Parameter has been **added** if just the new parameter information is present.

diff --git a/README.md b/README.md
@@ -35,17 +35,19 @@ On release, automated continuous integration tests run the pipeline on a full-si
 
    ii. [AlphaFold2 split](https://github.com/luisas/alphafold_split) - AlphaFold2 MSA computation and model inference in separate processes
 
-   iii. [ColabFold](https://github.com/sokrypton/ColabFold) - MMseqs2 API server followed by ColabFold
+   iii. [AlphaFold3](https://github.com/google-deepmind/alphafold3) - Regular AlphaFold3 (MSA computation and model inference in the same process)
 
-   iv. [ColabFold](https://github.com/sokrypton/ColabFold) - MMseqs2 local search followed by ColabFold
+   iv. [ColabFold](https://github.com/sokrypton/ColabFold) - MMseqs2 API server followed by ColabFold
 
-   v. [ESMFold](https://github.com/facebookresearch/esm) - Regular ESM
+   v. [ColabFold](https://github.com/sokrypton/ColabFold) - MMseqs2 local search followed by ColabFold
 
-   vi. [RoseTTAFold-All-Atom](https://github.com/baker-laboratory/RoseTTAFold-All-Atom/) - Regular RFAA
+   vi. [ESMFold](https://github.com/facebookresearch/esm) - Regular ESM
 
-   vii. [HelixFold3](https://github.com/PaddlePaddle/PaddleHelix/tree/dev/apps/protein_folding/helixfold3) - Regular HF3
+   vii. [RoseTTAFold-All-Atom](https://github.com/baker-laboratory/RoseTTAFold-All-Atom/) - Regular RFAA
 
-   viii. [Boltz](https://github.com/jwohlwend/boltz/) - Regular Boltz-1
+   viii. [HelixFold3](https://github.com/PaddlePaddle/PaddleHelix/tree/dev/apps/protein_folding/helixfold3) - Regular HF3
+
+   ix. [Boltz](https://github.com/jwohlwend/boltz/) - Regular Boltz-1
 
 ## Usage
 
@@ -92,6 +94,21 @@ The pipeline takes care of downloading the databases and parameters required by
       -profile <docker/singularity/podman/shifter/charliecloud/conda/institute>
   ```
 
+- The AlphaFold3 mode can be run using the command below:
+
+  ```console
+  nextflow run nf-core/proteinfold \
+      --input samplesheet.csv \
+      --outdir <OUTDIR> \
+      --mode alphafold3 \
+      --alphafold3_db <null (default) | DB_PATH> \
+      --use_gpu <true/false> \
+      -profile <docker/singularity/podman/shifter/charliecloud/conda/institute>
+  ```
+
+  > [!WARNING]
+  > The AlphaFold3 weights are not provided by this pipeline. Users must obtain the weights directly from DeepMind according to their [terms of use](https://github.com/google-deepmind/alphafold3/blob/main/WEIGHTS_TERMS_OF_USE.md) and [prohibited use policy](https://github.com/google-deepmind/alphafold3/blob/main/WEIGHTS_PROHIBITED_USE_POLICY.md). Please ensure you comply with all terms and conditions before using AlphaFold3. For more information about AlphaFold3 usage and requirements, please refer to the [official AlphaFold3 repository](https://github.com/google-deepmind/alphafold3).
+
 - Below, the command to run colabfold_local mode:
 
   ```console

diff --git a/bin/extract_metrics.py b/bin/extract_metrics.py
@@ -115,7 +115,7 @@ def read_json(id, json_files):
             if json_file.endswith("_data.json"): #AF3 output with MSA info
                 # Can't just used format_msa_rows since there's FASTA headers in the json content
                 unpaired_MSAs = data['sequences'][0]['protein']['unpairedMsa']
-                msa_lines = [line for line in unpaired_MSAs.split("\n") if not line.startswith(">") and line.strip()]
+                msa_lines = [''.join(c for c in line if not c.islower()) for line in unpaired_MSAs.split("\n") if line.strip() and not line.startswith(">")]
                 msa_rows = [[str(AA_to_int.get(residue, 20)) for residue in line] for line in msa_lines]
                 write_tsv(f"{id}_msa.tsv", msa_rows)
             #AF3 output with PAE info, or HF3 PAE data. TODO: Need to make sure the workflow points to [protein]/[protein]_rank1/all_results.json

diff --git a/bin/fasta_to_alphafold3_json.py b/bin/fasta_to_alphafold3_json.py
@@ -0,0 +1,191 @@
+#!/usr/bin/env python3
+
+import sys
+import argparse
+import json
+import string
+
+def parse_args(args=None):
+    """
+    Build the command line interface and parse the supplied arguments.
+
+    Positional arguments:
+        FILE_IN: Path to the input fasta file
+        ID: Identifier used for the output file name and the json id tag
+
+    Optional arguments:
+        -ms/--model_seed: One or more integer AlphaFold3 model seeds (default: [11])
+
+    Args:
+        args (list, optional): Argument strings to parse; when None, argparse
+            reads them from sys.argv.
+
+    Returns:
+        argparse.Namespace: Namespace with FILE_IN, ID and MODEL_SEED attributes
+    """
+    cli = argparse.ArgumentParser(
+        description="Convert fasta files to Alphafold3 json format.",
+        epilog="Example usage: python fasta_to_alphafold3_json.py <FILE_IN> <ID>",
+    )
+
+    ## REQUIRED PARAMETERS
+    cli.add_argument("FILE_IN", help="Input fasta file.")
+    cli.add_argument("ID", help="ID for file name and for json id tag.")
+
+    ## OPTIONAL PARAMETERS
+    seed_options = {
+        "type": int,
+        "nargs": "+",
+        "dest": "MODEL_SEED",
+        "default": [11],
+        "help": "Alphafold 3 model seed.",
+    }
+    cli.add_argument("-ms", "--model_seed", **seed_options)
+
+    return cli.parse_args(args)
+
+def sanitised_name(id):
+    """
+    Turn an arbitrary identifier into a string that is safe to use as a file name.
+
+    The logic mirrors the helper in the AlphaFold3 source code so that names stay
+    consistent with the ones AlphaFold3 produces:
+    https://github.com/google-deepmind/alphafold3/blob/7fdf96161d61a6e18048e5c62bf7e1d711992943/src/alphafold3/common/folding_input.py#L1166-L1170
+
+    Steps: lowercase the ID, replace each space with an underscore, then drop every
+    character that is not an ASCII lowercase letter, a digit, or one of "_", "-", ".".
+
+    Args:
+        id (str): Input identifier
+
+    Returns:
+        str: Sanitized identifier (empty if none of its characters is allowed)
+    """
+    permitted = frozenset(string.ascii_lowercase + string.digits + '_-.')
+    normalised = id.lower().replace(' ', '_')
+    kept = [char for char in normalised if char in permitted]
+    return ''.join(kept)
+
+def fasta_to_alphafold3_json(file_in, id):
+    """
+    Read a single-sequence FASTA file and return its sequence for AlphaFold3.
+
+    The header line only marks where the record starts; its text is discarded and
+    the supplied `id` is used instead. Blank lines are ignored and the remaining
+    lines of the record are concatenated into one sequence string.
+
+    Args:
+        file_in (str): Path to input FASTA file
+        id (str): Identifier for the sequence
+
+    Returns:
+        dict: {"id": id, "sequence": <concatenated sequence lines>}
+
+    Raises:
+        RuntimeError: If the file contains more than one sequence, has sequence
+            data before the first header, or contains no FASTA header at all
+    """
+    chunks = []
+    seen_header = False
+
+    # "utf-8-sig" transparently drops a leading byte order mark if present
+    with open(file_in, "r", encoding="utf-8-sig") as fin:
+        for raw_line in fin:
+            line = raw_line.strip()
+            if not line:
+                # Blank lines carry no sequence data (and may precede the header)
+                continue
+            if line.startswith(">"):
+                # A second header means a multi-sequence file; reject it instead of
+                # silently dropping the first record
+                if seen_header:
+                    raise RuntimeError("Multifasta files are not allowed")
+                seen_header = True
+            elif not seen_header:
+                raise RuntimeError(f"Sequence data found before a fasta header in {file_in}")
+            else:
+                chunks.append(line)
+
+    if not seen_header:
+        raise RuntimeError(f"No fasta record found in {file_in}")
+
+    return {"id": id, "sequence": "".join(chunks)}
+
+def create_json_dict(sequence, model_seed):
+    """
+    Wrap a sequence record into the AlphaFold3 input structure.
+
+    The returned dictionary holds a single entry keyed by the sequence ID. The value
+    of that entry is the AlphaFold3 job description:
+    {
+        "name": "sequence_id",
+        "sequences": [
+            {
+                "protein": {
+                    "id": "A",
+                    "sequence": "protein_sequence"
+                }
+            }
+        ],
+        "modelSeeds": [seed_values],
+        "dialect": "alphafold3",
+        "version": 1
+    }
+
+    Args:
+        sequence (dict): Dictionary with the keys "id" and "sequence"
+        model_seed (list): List of model seeds to use
+
+    Returns:
+        dict: {sequence ID: JSON-compatible job description in AlphaFold3 format}
+    """
+    seq_id = sequence["id"]
+
+    # The single protein chain is always labelled "A"
+    protein_entry = {"protein": {"id": "A", "sequence": sequence["sequence"]}}
+
+    # Keys are inserted in the order in which they appear in the written JSON
+    job = {}
+    job["name"] = f"{seq_id}"
+    job["sequences"] = [protein_entry]
+    job["modelSeeds"] = model_seed
+    job["dialect"] = "alphafold3"
+    job["version"] = 1
+
+    return {seq_id: job}
+
+def main(args=None):
+    """
+    Convert a FASTA file into an AlphaFold3 JSON input file.
+
+    The script:
+    1. Parses command line arguments
+    2. Strips a trailing ".json" from the ID and sanitizes it for filename use
+    3. Reads and processes the FASTA file
+    4. Creates the JSON structure
+    5. Writes the output to a JSON file
+
+    The output filename will be the sanitized ID with .json extension.
+
+    Args:
+        args (list, optional): Command line arguments; None lets argparse read sys.argv.
+    """
+    args = parse_args(args)
+
+    # Drop a trailing ".json" so that the extension is not doubled in the output name
+    raw_id = args.ID[:-5] if args.ID.endswith(".json") else args.ID
+    reformatted_id = sanitised_name(raw_id)
+    out_json = f"{reformatted_id}.json"
+
+    sequence = fasta_to_alphafold3_json(args.FILE_IN, reformatted_id)
+    json_dict = create_json_dict(sequence, args.MODEL_SEED)
+
+    print("json file " + out_json)
+    # create_json_dict keys its result by ID; only the inner job description is written
+    with open(out_json, "w", encoding="utf-8") as fout:
+        json.dump(json_dict[reformatted_id], fout, indent=4)
+
+# Run the conversion only when the file is executed as a script. main() has no
+# return statement, so sys.exit() receives None.
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/bin/generate_report.py b/bin/generate_report.py
@@ -318,6 +318,7 @@ def pdb_to_lddt(struct_files, generate_tsv):
 model_name = {
     "esmfold": "ESMFold",
     "alphafold2": "AlphaFold2",
+    "alphafold3": "Alphafold3",
     "colabfold": "ColabFold",
     "rosettafold_all_atom": "Rosettafold_All_Atom",
     "helixfold3": "HelixFold3",

diff --git a/bin/mmcif_to_pdb.py b/bin/mmcif_to_pdb.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+
+###############################################################################
+###############################################################################
+## Created on December 16th 2024: convert mmCIF files to PDB format
+###############################################################################
+###############################################################################
+
+import argparse
+import sys
+from Bio import PDB
+
+def parse_args(args=None):
+    """
+    Parse the command line arguments of the mmCIF to PDB converter.
+
+    Args:
+        args (list, optional): Argument strings; None makes argparse read sys.argv.
+
+    Returns:
+        argparse.Namespace: MMCIF_IN (input file) and PDB_OUT (output file, "" if not given)
+    """
+    cli = argparse.ArgumentParser(
+        description="Convert mmcif files to pdb format.",
+        epilog="""Example usage: python mmcif_to_pdb.py <MMCIF_IN>""",
+    )
+    cli.add_argument("MMCIF_IN", help="Input mmcif file.")
+    cli.add_argument("-po", "--pdb_out", dest="PDB_OUT", type=str, default="", help="Output pdb file.")
+    return cli.parse_args(args)
+
+
+def mmcif_to_pdb(mmcif_file, pdb_file):
+    """
+    Read a structure from an mmCIF file and save it in PDB format.
+
+    Args:
+        mmcif_file: Path of the mmCIF file to read
+        pdb_file: Path of the PDB file to write
+
+    Returns:
+        The `pdb_file` value that was passed in
+    """
+    # Parse the mmCIF file
+    structure = PDB.MMCIFParser(QUIET=True).get_structure("structure", mmcif_file)
+
+    # Write to PDB format
+    writer = PDB.PDBIO()
+    writer.set_structure(structure)
+    writer.save(pdb_file)
+
+    return pdb_file
+
+
+############################################
+############################################
+## MAIN FUNCTION
+############################################
+############################################
+
+def main(args=None):
+    """
+    Convert the mmCIF file given on the command line to PDB format.
+
+    When no output name is supplied with -po/--pdb_out, the output path is the input
+    path with its file extension replaced by ".pdb".
+
+    Args:
+        args (list, optional): Command line arguments; None lets argparse read sys.argv.
+    """
+    import os
+
+    args = parse_args(args)
+
+    # Name output PDB file name
+    pdb_file = args.PDB_OUT
+    if not pdb_file:
+        # splitext only strips an extension of the final path component, so dots in
+        # directory names (e.g. "./run.1/model" or "../model") are left untouched
+        pdb_file = os.path.splitext(args.MMCIF_IN)[0] + ".pdb"
+
+    pdb_file = mmcif_to_pdb(args.MMCIF_IN, pdb_file)
+    print(f"Converted {args.MMCIF_IN} to {pdb_file}")
+
+
+# Script entry point: run the conversion only when the file is executed directly
+if __name__ == "__main__":
+    main()