Quick implementation of nGPT, which learns entirely on the hypersphere, from NvidiaAI. The open question is whether there is any loss of expressivity they swept under the rug, but I'll take it in good faith.
% nGPT preprint. Typed as misc because no proceedings title (booktitle) is recorded for it here.
% NOTE(review): the eprint identifier was added during review; confirm it against the arXiv listing.
@misc{Loshchilov2024nGPTNT,
title = {{nGPT}: Normalized Transformer with Representation Learning on the Hypersphere},
author = {Loshchilov, Ilya and Hsieh, Cheng-Ping and Sun, Simeng and Ginsburg, Boris},
year = {2024},
eprint = {2410.01131},
archiveprefix = {arXiv},
url = {https://api.semanticscholar.org/CorpusID:273026160}
}
% arXiv preprint: the identifier lives in eprint/archiveprefix instead of journal/volume.
@misc{Luo2017CosineNU,
title = {Cosine Normalization: Using Cosine Similarity Instead of Dot Product in Neural Networks},
author = {Luo, Chunjie and Zhan, Jianfeng and Wang, Lei and Yang, Qiang},
year = {2017},
eprint = {1702.05870},
archiveprefix = {arXiv},
url = {https://api.semanticscholar.org/CorpusID:1505432}
}