Sparrenberg, Lorenz; Deußer, Tobias; Berger, Armin; Sifa, Rafet: Small and Fast LLMs on Commodity Hardware: Post-Training Quantization in llama.cpp. New York, NY: IEEE, Institute of Electrical and Electronics Engineers, 2025.
Online-Ausgabe in bonndoc: https://hdl.handle.net/20.500.11811/13751
@proceedings{handle:20.500.11811/13751,
author = {Sparrenberg, Lorenz and Deußer, Tobias and Berger, Armin and Sifa, Rafet},
title = {Small and Fast LLMs on Commodity Hardware: Post-Training Quantization in llama.cpp},
publisher = {IEEE, Institute of Electrical and Electronics Engineers},
year = 2025,
month = nov,
note = {Large Language Models (LLMs) have demonstrated remarkable capabilities but their significant computational and memory demands hinder widespread deployment, especially on resource-constrained devices. Quantization, the process of reducing the numerical precision of model parameters, has emerged as a critical technique for compressing LLMs and accelerating inference. This paper provides an overview of LLM quantization, with a particular focus on the Post-Training Quantization (PTQ) methods implemented within the popular llama.cpp framework and its GGUF file format. We begin by covering quantization fundamentals, including the distinction between PTQ and Quantization-Aware Training (QAT). We then describe the specific PTQ schemes employed by llama.cpp, including legacy methods, advanced K-quants, and recent IQ-quants, along with their underlying mathematical principles. The paper also discusses the impact of these techniques on model fidelity, hardware requirements, inference speed, and traces the adoption of GGUF as a de facto standard in the open-source community. This work serves as a practical guide and comprehensive reference for researchers aiming to deploy LLMs on resource-constrained hardware. By systematically documenting and comparing the PTQ methods within llama.cpp, we provide the necessary insights to navigate the trade-offs between model fidelity, inference speed, and memory footprint. This enables informed decision-making for real-world applications, from local CPU-based inference to efficient edge deployment.},
url = {https://hdl.handle.net/20.500.11811/13751}
}