diff --git a/.dev/clear.sh b/.dev/clear.sh
old mode 100644
new mode 100755
diff --git a/.dev/commit-prepare.sh b/.dev/commit-prepare.sh
old mode 100644
new mode 100755
index de91099..064a30d
--- a/.dev/commit-prepare.sh
+++ b/.dev/commit-prepare.sh
@@ -1,8 +1,11 @@
 path=$(cd `dirname $0`; pwd)
 cd $path
+# cpp & python format lint
+sudo apt-get update
 sudo apt-get install clang-format -y
 pip install pre-commit
 pip install yapf
 pip install cpplint
-pre-commit install -c ./.dev/.pre-commit-config.yaml
+pre-commit install -c ./.dev/.pre-commit-config.yaml # only lint for python
+# pre-commit install -c ./.dev/.pre-commit-config-cpp.yaml # both python + cpp
diff --git a/.dev/init_dev.sh b/.dev/init_dev.sh
new file mode 100755
index 0000000..a203948
--- /dev/null
+++ b/.dev/init_dev.sh
@@ -0,0 +1,4 @@
+export ENABLE_FFPA_ALL_STAGES=0
+export ENABLE_FFPA_ALL_HEADDIM=0
+export ENABLE_FFPA_AMPERE=0
+export ENABLE_FFPA_HOPPER=0
diff --git a/.dev/install.sh b/.dev/install.sh
old mode 100644
new mode 100755
index 4bfdb10..385ba3c
--- a/.dev/install.sh
+++ b/.dev/install.sh
@@ -1,4 +1,5 @@
 rm -rf $(find . -name __pycache__)
-python3 setup.py bdist_wheel && cd dist # build pyffpa from sources
-python3 -m pip install pyffpa-*-linux_x86_64.whl # pip uninstall pyffpa -y
-cd .. && rm -rf build *.egg-info
+python3 setup.py bdist_wheel && cd dist # build cuffpa-py from sources
+python3 -m pip install cuffpa_py-*-linux_x86_64.whl # pip uninstall cuffpa-py -y
+cd .. && rm -rf build *.egg-info
+rm -rf $(find . -name __pycache__)
diff --git a/.dev/uninstall.sh b/.dev/uninstall.sh
old mode 100644
new mode 100755
index 30b033f..0b4f541
--- a/.dev/uninstall.sh
+++ b/.dev/uninstall.sh
@@ -1 +1 @@
-python3 -m pip uninstall pyffpa -y
+python3 -m pip uninstall cuffpa-py -y
diff --git a/README.md b/README.md
index 1651c28..ec6932a
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@
-🤖 [WIP] **FFPA**: Yet antother **Faster Flash Prefill Attention** with **O(1) SRAM complexity** & **O(d/4) or O(1) register complexity** for large headdim (D > 256), almost **>1.5x** 🎉 faster than SDPA EA with or without MMA Accumulation F32 on many devices, such as NVIDIA L20, 4090, 3080 Laptop (Experimental 👀~). The FFPA kernels are modified from my repo 📖[CUDA-Learn-Notes](https://github.com/DefTruth/CUDA-Learn-Notes/tree/main/kernels/flash-attn) ![](https://img.shields.io/github/stars/DefTruth/CUDA-Learn-Notes.svg?style=social).
+🤖 [WIP] **FFPA**: Yet another **Faster Flash Prefill Attention** with **O(1) SRAM complexity** & **O(d/4) or O(1) register complexity** for large headdim (D > 256), almost **>1.5x** 🎉 faster than SDPA EA with or without MMA Accumulation F32 on many devices, such as NVIDIA L20, 4090, 3080 Laptop (Experimental 👀~). The FFPA kernels are modified from my repo 📖[CUDA-Learn-Notes](https://github.com/DefTruth/CUDA-Learn-Notes/tree/main/kernels/flash-attn) ![](https://img.shields.io/github/stars/DefTruth/CUDA-Learn-Notes.svg?style=social).
-NOTE: This project is still in its early development stages and currently provides a few experimental kernels and benchmarks for reference. More benchmarks data and features (FFPA **L2/L3** & more devices) will be added over time as the project continues to develop.
+NOTE: This project is still in its early dev stages and now provides a few experimental kernels and benchmarks for reference. More features will be added in the future.
 Welcome to 🌟👆🏻star this repo to support me ~ 🎉🎉
 ## ©️Citations🎉🎉
 ```BibTeX
-@misc{faster-prefill-attention@2025,
-  title={FFPA: Yet another Faster Flash Prefill Attention with O(1) SRAM complexity for large headdim.},
-  url={https://github.com/DefTruth/faster-prefill-attention},
-  note={Open-source software available at https://github.com/DefTruth/faster-prefill-attention},
+@misc{cuffpa-py@2025,
+  title={FFPA: Yet another Faster Flash Prefill Attention for large headdim.},
+  url={https://github.com/DefTruth/cuffpa-py},
+  note={Open-source software available at https://github.com/DefTruth/cuffpa-py},
   author={DefTruth etc},
   year={2025}
 }
@@ -80,19 +80,19 @@ By leveraging this approach, we can achieve better performance for large headdim
-The FFPA implemented in this repo can be install as a python library, namely, `pyffpa` library (optional).
+The FFPA implemented in this repo can be installed as a python library, namely, the `cuffpa-py` library (optional).
 ```bash
-# clone, then, run .dev/install.sh directly or run commands as belows
-git clone https://github.com/DefTruth/faster-prefill-attention.git
-python3 setup.py bdist_wheel && rm -rf *.egg-info # build 'pyffpa' from sources
-cd dist && python3 -m pip install pyffpa-*-linux_x86_64.whl # pip uninstall pyffpa -y
+git clone https://github.com/DefTruth/cuffpa-py.git
+# clone, then run bash .dev/install.sh directly or run the commands below:
+python3 setup.py bdist_wheel && rm -rf *.egg-info # build 'cuffpa-py' from sources
+cd dist && python3 -m pip install cuffpa_py-*-linux_x86_64.whl # pip uninstall cuffpa-py -y
 ```
 ## 📖 FFPA L1 (Level 1): Benchmark 🎉🎉
-L1: level 1, O(Brx16)~O(1) SRAM complexity, O(d/4) register complexity, the same GPU HBM memory complexity as FlashAttention. B=1, H=48, N=8192, **D=320-1024(FA2 not supported 👀)**. (Notes, `*`=MMA Acc F32, `^`=MMA Acc F16, Softmax Acc dtype is always be F32, T=TFLOPS, 👇Benchmark)
+L1: level 1, O(2xBrx16)≈O(1) SRAM complexity, O(d/4) register complexity, the same GPU HBM memory complexity as FlashAttention. B=1, H=48, N=8192, **D=320-1024(FA2 not supported 👀)**. (Notes: `*`=MMA Acc F32, `^`=MMA Acc F16, Softmax Acc dtype is always F32, T=TFLOPS, 👇Benchmark)
 - 📚 NVIDIA RTX 3080 Laptop (`*`=MMA Acc F32, `^`=MMA Acc F16, `T`=TFLOPS)
@@ -144,7 +144,7 @@ export TORCH_CUDA_ARCH_LIST=Ada # for Ada only
 export TORCH_CUDA_ARCH_LIST=Ampere # for Ampere only
 cd tests && python3 test.py --B 1 --H 48 --N 8192 --show-all --D 320
 ```
-- 📚 case: B=1, H=48, N=8192, D=320(FA2 not supported), Device=NVIDIA RTX 4090.
+- 📚 case: B=1, H=48, N=8192, D=320(`FA2 not supported`), Device=NVIDIA RTX 4090.
 ```bash
 python3 tests/test.py --B 1 --H 48 --N 8192 --show-all --D 320
 -------------------------------------------------------------------------------------------------
diff --git a/env.py b/env.py
index 2ca1668..920f83b
--- a/env.py
+++ b/env.py
@@ -8,24 +8,24 @@ class ENV(object):
     PROJECT_DIR = os.path.dirname(os.path.abspath(__file__))
     # Enable all multi stages kernels or not (1~4), default False (1~2).
-    ENBALE_FFPA_ALL_STAGES = bool(int(os.environ.get("ENBALE_FFPA_ALL_STAGES", 0)))
+    ENABLE_FFPA_ALL_STAGES = bool(int(os.environ.get("ENABLE_FFPA_ALL_STAGES", 0)))
     # Enable all headdims for FFPA kernels or not, default False.
     # True, headdim will range from 32 to 1024 with step = 32, range(32, 1024, 32)
     # False, headdim will range from 256 to 1024 with step = 64, range(256, 1024, 64)
-    ENBALE_FFPA_ALL_HEADDIM = bool(int(os.environ.get("ENBALE_FFPA_ALL_HEADDIM", 0)))
+    ENABLE_FFPA_ALL_HEADDIM = bool(int(os.environ.get("ENABLE_FFPA_ALL_HEADDIM", 0)))
     # Enable build FFPA kernels for Ada devices (sm89, L2O, 4090, etc),
     # default True.
-    ENBALE_FFPA_ADA = bool(int(os.environ.get("ENBALE_FFPA_ADA", 1)))
+    ENABLE_FFPA_ADA = bool(int(os.environ.get("ENABLE_FFPA_ADA", 1)))
     # Enable build FFPA kernels for Ampere devices (sm80, A30, A100, etc),
     # default True.
-    ENBALE_FFPA_AMPERE = bool(int(os.environ.get("ENBALE_FFPA_HOPPER", 1)))
+    ENABLE_FFPA_AMPERE = bool(int(os.environ.get("ENABLE_FFPA_AMPERE", 1)))
     # Enable build FFPA kernels for Hopper devices (sm90, H100, H20, etc),
     # default False.
-    ENBALE_FFPA_HOPPER = bool(int(os.environ.get("ENBALE_FFPA_HOPPER", 0)))
+    ENABLE_FFPA_HOPPER = bool(int(os.environ.get("ENABLE_FFPA_HOPPER", 0)))
     @classmethod
     def project_dir(cls):
@@ -33,29 +33,29 @@ def project_dir(cls):
     @classmethod
     def enable_hopper(cls):
-        return cls.ENBALE_FFPA_HOPPER
+        return cls.ENABLE_FFPA_HOPPER
     @classmethod
     def enable_ampere(cls):
-        return cls.ENBALE_FFPA_AMPERE
+        return cls.ENABLE_FFPA_AMPERE
     @classmethod
     def enable_ada(cls):
-        return cls.ENBALE_FFPA_ADA
+        return cls.ENABLE_FFPA_ADA
     @classmethod
     def enable_all_mutistages(cls):
-        return cls.ENBALE_FFPA_ALL_STAGES
+        return cls.ENABLE_FFPA_ALL_STAGES
     @classmethod
     def enable_all_headdim(cls):
-        return cls.ENBALE_FFPA_ALL_HEADDIM
+        return cls.ENABLE_FFPA_ALL_HEADDIM
     @classmethod
     def env_cuda_cflags(cls):
         extra_env_cflags = []
         if cls.enable_all_mutistages():
-            extra_env_cflags.append("-DENBALE_FFPA_ALL_STAGES")
+            extra_env_cflags.append("-DENABLE_FFPA_ALL_STAGES")
         if cls.enable_all_headdim():
-            extra_env_cflags.append("-DENBALE_FFPA_ALL_HEADDIM")
+            extra_env_cflags.append("-DENABLE_FFPA_ALL_HEADDIM")
         return extra_env_cflags
diff --git a/setup.py b/setup.py
index ac21fde..eab3412
--- a/setup.py
+++ b/setup.py
@@ -69,8 +69,8 @@ def get_cuda_bare_metal_version(cuda_dir):
     return raw_output, bare_metal_version
-# package name managed by pip, which can be remove by `pip uninstall pyffpa -y`
-PACKAGE_NAME = "pyffpa"
+# package name managed by pip, which can be removed by `pip uninstall cuffpa-py -y`
+PACKAGE_NAME = "cuffpa-py"
 ext_modules = []
 generator_flag = []
@@ -133,7 +133,9 @@ def fetch_requirements():
            "tests",
            "bench",
            "tmp",
-           "pyffpa.egg-info",
+           "cuffpa_py.egg-info",
+           "__pycache__",
+           "third_party",
        )
    ),
    description="FFPA: Yet another Faster Flash Prefill Attention for large headdim, ~1.5x faster than SDPA EA.",
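
Note (hedged sketch, not part of the patch): after this diff, the renamed `ENABLE_FFPA_*` switches in `.dev/init_dev.sh` and `env.py` are what control the build. The snippet below only illustrates the expected behavior of the corrected flags, assuming it is run from the project root where `env.py` lives; the flag values shown are arbitrary examples.

```python
# Illustration only: ENV snapshots os.environ at import time, so the
# ENABLE_* variables must be set before `env` is imported (or exported in
# the shell, e.g. via .dev/init_dev.sh, before running bash .dev/install.sh).
import os

os.environ["ENABLE_FFPA_ALL_STAGES"] = "1"   # build multi-stage kernels (1~4)
os.environ["ENABLE_FFPA_ALL_HEADDIM"] = "0"  # keep headdim range(256, 1024, 64)

from env import ENV

assert ENV.enable_all_mutistages() is True   # method name spelled as in the repo
assert ENV.enable_all_headdim() is False
assert ENV.env_cuda_cflags() == ["-DENABLE_FFPA_ALL_STAGES"]
```

Because the flags are read once at import time, exporting them before invoking `setup.py bdist_wheel` (as `.dev/install.sh` does) is what actually determines which kernels get compiled.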