diff --git a/.dev/clear.sh b/.dev/clear.sh
old mode 100644
new mode 100755
diff --git a/.dev/commit-prepare.sh b/.dev/commit-prepare.sh
old mode 100644
new mode 100755
index de91099..064a30d
--- a/.dev/commit-prepare.sh
+++ b/.dev/commit-prepare.sh
@@ -1,8 +1,11 @@
path=$(cd `dirname $0`; pwd)
cd $path
+# cpp & python format/lint tools
+sudo apt-get update
sudo apt-get install clang-format -y
pip install pre-commit
pip install yapf
pip install cpplint
-pre-commit install -c ./.dev/.pre-commit-config.yaml
+pre-commit install -c ./.dev/.pre-commit-config.yaml # only lint for python
+# pre-commit install -c ./.dev/.pre-commit-config-cpp.yaml # both python + cpp
diff --git a/.dev/init_dev.sh b/.dev/init_dev.sh
new file mode 100755
index 0000000..a203948
--- /dev/null
+++ b/.dev/init_dev.sh
@@ -0,0 +1,4 @@
+export ENABLE_FFPA_ALL_STAGES=0
+export ENABLE_FFPA_ALL_HEADDIM=0
+export ENABLE_FFPA_AMPERE=0
+export ENABLE_FFPA_HOPPER=0
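For reference, a minimal sketch of how this new init_dev.sh is meant to be used (an assumption based on env.py further down in this diff, which reads these ENABLE_FFPA_* variables at build time): sourcing it before a build turns off the Ampere/Hopper kernels and the extra stage/headdim variants, leaving only the default Ada set.

```bash
# Hypothetical dev workflow (not part of this diff): export the flags, then build.
# With all four flags at 0, env.py keeps only the default kernel set
# (Ada, stages 1~2, headdim 256~1024 step 64), which keeps dev builds fast.
source .dev/init_dev.sh
python3 setup.py bdist_wheel
```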
diff --git a/.dev/install.sh b/.dev/install.sh
old mode 100644
new mode 100755
index 4bfdb10..385ba3c
--- a/.dev/install.sh
+++ b/.dev/install.sh
@@ -1,4 +1,5 @@
rm -rf $(find . -name __pycache__)
-python3 setup.py bdist_wheel && cd dist # build pyffpa from sources
-python3 -m pip install pyffpa-*-linux_x86_64.whl # pip uninstall pyffpa -y
-cd .. && rm -rf build *.egg-info
+python3 setup.py bdist_wheel && cd dist # build cuffpa-py from sources
+python3 -m pip install cuffpa_py-*-linux_x86_64.whl # pip uninstall cuffpa-py -y
+cd .. && rm -rf build *.egg-info
+rm -rf $(find . -name __pycache__)
diff --git a/.dev/uninstall.sh b/.dev/uninstall.sh
old mode 100644
new mode 100755
index 30b033f..0b4f541
--- a/.dev/uninstall.sh
+++ b/.dev/uninstall.sh
@@ -1 +1 @@
-python3 -m pip uninstall pyffpa -y
+python3 -m pip uninstall cuffpa-py -y
diff --git a/README.md b/README.md
index 1651c28..ec6932a 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@
-π€ [WIP] **FFPA**: Yet antother **Faster Flash Prefill Attention** with **O(1) SRAM complexity** & **O(d/4) or O(1) register complexity** for large headdim (D > 256), almost **>1.5x** π faster than SDPA EA with or without MMA Accumulation F32 on many devices, such as NVIDIA L20, 4090, 3080 Laptop (Experimental π~). The FFPA kernels are modified from my repo π[CUDA-Learn-Notes](https://github.com/DefTruth/CUDA-Learn-Notes/tree/main/kernels/flash-attn) .
+[WIP] **FFPA**: Yet another **Faster Flash Prefill Attention** with **O(1) SRAM complexity** & **O(d/4) or O(1) register complexity** for large headdim (D > 256), **~1.5x** faster than SDPA EA with or without MMA Accumulation F32 on many devices, such as NVIDIA L20, 4090, 3080 Laptop (Experimental). The FFPA kernels are modified from my repo [CUDA-Learn-Notes](https://github.com/DefTruth/CUDA-Learn-Notes/tree/main/kernels/flash-attn).
-NOTE: This project is still in its early development stages and currently provides a few experimental kernels and benchmarks for reference. More benchmarks data and features (FFPA **L2/L3** & more devices) will be added over time as the project continues to develop.
+NOTE: This project is still in its early development stage and currently provides a few experimental kernels and benchmarks for reference. More features will be added in the future. Welcome to star this repo to support me ~
## ©️Citations
```BibTeX
-@misc{faster-prefill-attention@2025,
- title={FFPA: Yet another Faster Flash Prefill Attention with O(1) SRAM complexity for large headdim.},
- url={https://github.com/DefTruth/faster-prefill-attention},
- note={Open-source software available at https://github.com/DefTruth/faster-prefill-attention},
+@misc{cuffpa-py@2025,
+ title={FFPA: Yet another Faster Flash Prefill Attention for large headdim.},
+ url={https://github.com/DefTruth/cuffpa-py},
+ note={Open-source software available at https://github.com/DefTruth/cuffpa-py},
author={DefTruth etc},
year={2025}
}
@@ -80,19 +80,19 @@ By leveraging this approach, we can achieve better performance for large headdim
-The FFPA implemented in this repo can be install as a python library, namely, `pyffpa` library (optional).
+The FFPA implemented in this repo can be installed as a Python library, namely, the `cuffpa-py` library (optional).
```bash
-# clone, then, run .dev/install.sh directly or run commands as belows
-git clone https://github.com/DefTruth/faster-prefill-attention.git
-python3 setup.py bdist_wheel && rm -rf *.egg-info # build 'pyffpa' from sources
-cd dist && python3 -m pip install pyffpa-*-linux_x86_64.whl # pip uninstall pyffpa -y
+git clone https://github.com/DefTruth/cuffpa-py.git
+# after cloning, either run bash .dev/install.sh directly or run the commands below:
+python3 setup.py bdist_wheel && rm -rf *.egg-info # build 'cuffpa-py' from sources
+cd dist && python3 -m pip install cuffpa_py-*-linux_x86_64.whl # pip uninstall cuffpa-py -y
```
## FFPA L1 (Level 1): Benchmark
-L1: level 1, O(Brx16)~O(1) SRAM complexity, O(d/4) register complexity, the same GPU HBM memory complexity as FlashAttention. B=1, H=48, N=8192, **D=320-1024(FA2 not supported π)**. (Notes, `*`=MMA Acc F32, `^`=MMA Acc F16, Softmax Acc dtype is always be F32, T=TFLOPS, πBenchmark)
+L1: level 1, O(2xBrx16)≈O(1) SRAM complexity, O(d/4) register complexity, the same GPU HBM memory complexity as FlashAttention. B=1, H=48, N=8192, **D=320-1024 (FA2 not supported)**. (Notes: `*`=MMA Acc F32, `^`=MMA Acc F16, Softmax Acc dtype is always F32, T=TFLOPS)
- NVIDIA RTX 3080 Laptop (`*`=MMA Acc F32, `^`=MMA Acc F16, `T`=TFLOPS)
@@ -144,7 +144,7 @@ export TORCH_CUDA_ARCH_LIST=Ada # for Ada only
export TORCH_CUDA_ARCH_LIST=Ampere # for Ampere only
cd tests && python3 test.py --B 1 --H 48 --N 8192 --show-all --D 320
```
-- π case: B=1, H=48, N=8192, D=320(FA2 not supported), Device=NVIDIA RTX 4090.
+- case: B=1, H=48, N=8192, D=320 (`FA2 not supported`), Device=NVIDIA RTX 4090.
```bash
python3 tests/test.py --B 1 --H 48 --N 8192 --show-all --D 320
-------------------------------------------------------------------------------------------------
diff --git a/env.py b/env.py
index 2ca1668..920f83b 100644
--- a/env.py
+++ b/env.py
@@ -8,24 +8,24 @@ class ENV(object):
PROJECT_DIR = os.path.dirname(os.path.abspath(__file__))
# Enable all multi stages kernels or not (1~4), default False (1~2).
- ENBALE_FFPA_ALL_STAGES = bool(int(os.environ.get("ENBALE_FFPA_ALL_STAGES", 0)))
+ ENABLE_FFPA_ALL_STAGES = bool(int(os.environ.get("ENABLE_FFPA_ALL_STAGES", 0)))
# Enable all headdims for FFPA kernels or not, default False.
# True, headdim will range from 32 to 1024 with step = 32, range(32, 1024, 32)
# False, headdim will range from 256 to 1024 with step = 64, range(256, 1024, 64)
- ENBALE_FFPA_ALL_HEADDIM = bool(int(os.environ.get("ENBALE_FFPA_ALL_HEADDIM", 0)))
+ ENABLE_FFPA_ALL_HEADDIM = bool(int(os.environ.get("ENABLE_FFPA_ALL_HEADDIM", 0)))
# Enable build FFPA kernels for Ada devices (sm89, L20, 4090, etc),
# default True.
- ENBALE_FFPA_ADA = bool(int(os.environ.get("ENBALE_FFPA_ADA", 1)))
+ ENABLE_FFPA_ADA = bool(int(os.environ.get("ENABLE_FFPA_ADA", 1)))
# Enable build FFPA kernels for Ampere devices (sm80, A30, A100, etc),
# default True.
- ENBALE_FFPA_AMPERE = bool(int(os.environ.get("ENBALE_FFPA_HOPPER", 1)))
+ ENABLE_FFPA_AMPERE = bool(int(os.environ.get("ENABLE_FFPA_AMPERE", 1)))
# Enable build FFPA kernels for Hopper devices (sm90, H100, H20, etc),
# default False.
- ENBALE_FFPA_HOPPER = bool(int(os.environ.get("ENBALE_FFPA_HOPPER", 0)))
+ ENABLE_FFPA_HOPPER = bool(int(os.environ.get("ENABLE_FFPA_HOPPER", 0)))
@classmethod
def project_dir(cls):
@@ -33,29 +33,29 @@ def project_dir(cls):
@classmethod
def enable_hopper(cls):
- return cls.ENBALE_FFPA_HOPPER
+ return cls.ENABLE_FFPA_HOPPER
@classmethod
def enable_ampere(cls):
- return cls.ENBALE_FFPA_AMPERE
+ return cls.ENABLE_FFPA_AMPERE
@classmethod
def enable_ada(cls):
- return cls.ENBALE_FFPA_ADA
+ return cls.ENABLE_FFPA_ADA
@classmethod
def enable_all_mutistages(cls):
- return cls.ENBALE_FFPA_ALL_STAGES
+ return cls.ENABLE_FFPA_ALL_STAGES
@classmethod
def enable_all_headdim(cls):
- return cls.ENBALE_FFPA_ALL_HEADDIM
+ return cls.ENABLE_FFPA_ALL_HEADDIM
@classmethod
def env_cuda_cflags(cls):
extra_env_cflags = []
if cls.enable_all_mutistages():
- extra_env_cflags.append("-DENBALE_FFPA_ALL_STAGES")
+ extra_env_cflags.append("-DENABLE_FFPA_ALL_STAGES")
if cls.enable_all_headdim():
- extra_env_cflags.append("-DENBALE_FFPA_ALL_HEADDIM")
+ extra_env_cflags.append("-DENABLE_FFPA_ALL_HEADDIM")
return extra_env_cflags
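A quick sketch of how the renamed flags are consumed (based only on env.py above; the build entry point is assumed to be setup.py, as elsewhere in this diff): setting a flag to 1 before building flips the corresponding default, and env_cuda_cflags() turns the stage/headdim flags into -D defines for the CUDA build.

```bash
# Sketch only: enable the optional kernel variants via the corrected env var names.
export ENABLE_FFPA_ALL_STAGES=1    # adds -DENABLE_FFPA_ALL_STAGES (stages 1~4)
export ENABLE_FFPA_ALL_HEADDIM=1   # adds -DENABLE_FFPA_ALL_HEADDIM (headdim 32~1024, step 32)
export ENABLE_FFPA_HOPPER=1        # also build sm90 kernels (default off)
python3 setup.py bdist_wheel
```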
diff --git a/setup.py b/setup.py
index ac21fde..eab3412 100644
--- a/setup.py
+++ b/setup.py
@@ -69,8 +69,8 @@ def get_cuda_bare_metal_version(cuda_dir):
return raw_output, bare_metal_version
-# package name managed by pip, which can be remove by `pip uninstall pyffpa -y`
-PACKAGE_NAME = "pyffpa"
+# package name managed by pip, which can be removed by `pip uninstall cuffpa-py -y`
+PACKAGE_NAME = "cuffpa-py"
ext_modules = []
generator_flag = []
@@ -133,7 +133,9 @@ def fetch_requirements():
"tests",
"bench",
"tmp",
- "pyffpa.egg-info",
+ "cuffpa_py.egg-info",
+ "__pycache__",
+ "third_party",
)
),
description="FFPA: Yet another Faster Flash Prefill Attention for large headdim, ~1.5x faster than SDPA EA.",
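Putting the touched scripts together, one plausible end-to-end dev loop after this change (the ordering is just a suggestion; each script behaves as shown in the hunks above):

```bash
bash .dev/commit-prepare.sh   # clang-format + yapf/cpplint + pre-commit hooks (python-only lint)
source .dev/init_dev.sh       # keep the dev build minimal (extra stages/headdims, Ampere/Hopper off)
bash .dev/install.sh          # build the cuffpa-py wheel from source and pip install it
bash .dev/uninstall.sh        # pip uninstall cuffpa-py -y when done
```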