Sparrenberg, Lorenz; Schneider, Tobias; Deußer, Tobias; Berger, Armin; Sifa, Rafet: Towards Uncertainty-Aware Low-Bit Quantized LLMs for On-Device Inference. New York, NY: IEEE, Institute of Electrical and Electronics Engineers, 2026.
Online-Ausgabe in bonndoc: https://hdl.handle.net/20.500.11811/13993
@comment{bonndoc repository export, cleaned: author names normalised to "Last, First"
  (the export's per-name double braces made BibTeX treat each full name as a corporate
  surname, breaking sorting and abbreviation); abstract moved out of `note` so standard
  styles do not print it in the bibliography; `LLMs` brace-protected against sentence
  casing; publisher city added from the citation header.}
@proceedings{handle:20.500.11811/13993,
  author        = {Sparrenberg, Lorenz and Schneider, Tobias and Deu{\ss}er, Tobias and Berger, Armin and Sifa, Rafet},
  title         = {Towards Uncertainty-Aware Low-Bit Quantized {LLMs} for On-Device Inference},
  publisher     = {IEEE, Institute of Electrical and Electronics Engineers},
  address       = {New York, NY},
  year          = {2026},
  month         = mar,
  abstract      = {Quantizing large language models (LLMs) significantly reduces memory usage and computational requirements, enabling efficient on-device inference. However, aggressive quantization can degrade model performance and exacerbate prediction uncertainty. To address this critical issue, we propose a logits-based calibration strategy where the model is restricted to generating a single token from a limited set of predefined decision tokens. By applying a temperature-scaled softmax directly on the logits corresponding to these tokens, we obtain calibrated and interpretable probability distributions, explicitly circumventing stochastic methods such as top-k sampling by directly leveraging deterministic logit values, revealing subtle behavioral shifts caused by quantization. Using Qwen-2.5 models ranging from 7\,B to 72\,B parameters at various quantization levels (2, 4, 6 and 8-bit), we evaluate our method across four recently released benchmarks encompassing regression (README++, CompLex-ZH, GIRAI) and classification (DarkBench) tasks. Thus, minimizing the risk of data leakage into pre-training data. Results indicate moderate quantization (4-bit) as optimal, particularly when combined with minimal few-shot prompting, enabling quantized LLMs to closely match or surpass proprietary models such as GPT-4o and GPT-4.1 in certain tasks. Our open-source toolkit facilitates straightforward deployment of reliable, uncertainty-aware quantized LLMs for privacy-preserving, on-device inference, making them suitable for sensitive settings such as human-subject economic experiments and survey analysis.},
  url           = {https://hdl.handle.net/20.500.11811/13993},
  internal-note = {NOTE(review): entry has author fields but @proceedings conventionally takes editor; this looks like a single paper, so @inproceedings (with booktitle) or @misc may be the right type -- confirm against the publisher record}
}
author = {{Lorenz Sparrenberg} and {Tobias Schneider} and {Tobias Deußer} and {Armin Berger} and {Rafet Sifa}},
title = {Towards Uncertainty-Aware Low-Bit Quantized LLMs for On-Device Inference},
publisher = {IEEE, Institute of Electrical and Electronics Engineers},
year = 2026,
month = mar,
note = {Quantizing large language models (LLMs) significantly reduces memory usage and computational requirements, enabling efficient on-device inference. However, aggressive quantization can degrade model performance and exacerbate prediction uncertainty. To address this critical issue, we propose a logits-based calibration strategy where the model is restricted to generating a single token from a limited set of predefined decision tokens. By applying a temperature-scaled softmax directly on the logits corresponding to these tokens, we obtain calibrated and interpretable probability distributions, explicitly circumventing stochastic methods such as top-k sampling by directly leveraging deterministic logit values, revealing subtle behavioral shifts caused by quantization. Using Qwen-2.5 models ranging from 7\,B to 72\,B parameters at various quantization levels (2, 4, 6 and 8-bit), we evaluate our method across four recently released benchmarks encompassing regression (README++, CompLex-ZH, GIRAI) and classification (DarkBench) tasks. Thus, minimizing the risk of data leakage into pre-training data. Results indicate moderate quantization (4-bit) as optimal, particularly when combined with minimal few-shot prompting, enabling quantized LLMs to closely match or surpass proprietary models such as GPT-4o and GPT-4.1 in certain tasks. Our open-source toolkit facilitates straightforward deployment of reliable, uncertainty-aware quantized LLMs for privacy-preserving, on-device inference, making them suitable for sensitive settings such as human-subject economic experiments and survey analysis.},
url = {https://hdl.handle.net/20.500.11811/13993}
}





