Roos, Jonas; Wilhelm, Theresa Isabelle; Martin, Ron; Kaczmarczyk, Robert: From Language Models to Medical Diagnoses: Assessing the Potential of GPT-4 and GPT-3.5-Turbo in Digital Health. In: AI. 2024, vol. 5, iss. 4, 2680-2692.
Online-Ausgabe in bonndoc: https://hdl.handle.net/20.500.11811/13331
@comment{Journal article record. The citation key is the bonndoc handle of the online edition. The abstract is stored in the abstract field so that standard styles do not print it in the bibliography.}
@article{handle:20.500.11811/13331,
  author    = {Roos, Jonas and Wilhelm, Theresa Isabelle and Martin, Ron and Kaczmarczyk, Robert},
  title     = {From Language Models to Medical Diagnoses: Assessing the Potential of {GPT-4} and {GPT-3.5-Turbo} in Digital Health},
  journal   = {AI},
  publisher = {MDPI},
  year      = {2024},
  month     = dec,
  volume    = {5},
  number    = {4},
  pages     = {2680--2692},
  abstract  = {Background: Large language models (LLMs) like GPT-3.5-Turbo and GPT-4 show potential to transform medical diagnostics through their linguistic and analytical capabilities. This study evaluates their diagnostic proficiency using English and German medical examination datasets. Methods: We analyzed 452 English and 637 German medical examination questions using GPT models. Performance metrics included broad and exact accuracy rates for primary and three-model generated guesses, with an analysis of performance against varying question difficulties based on student accuracy rates. Results: GPT-4 demonstrated superior performance, achieving up to 95.4\% accuracy when considering approximate similarity in English datasets. While GPT-3.5-Turbo showed better results in English, GPT-4 maintained consistent performance across both languages. Question difficulty was correlated with diagnostic accuracy, particularly in German datasets. Conclusions: The study demonstrates GPT-4's significant diagnostic capabilities and cross-linguistic flexibility, suggesting potential for clinical applications. However, further validation and ethical consideration are necessary before widespread implementation.},
  url       = {https://hdl.handle.net/20.500.11811/13331},
}
@comment{Orphaned duplicate of the field list of entry handle:20.500.11811/13331 (export residue without an entry header); kept verbatim for reference only.
author = {{Jonas Roos} and {Theresa Isabelle Wilhelm} and {Ron Martin} and {Robert Kaczmarczyk}},
title = {From Language Models to Medical Diagnoses: Assessing the Potential of GPT-4 and GPT-3.5-Turbo in Digital Health},
publisher = {MDPI},
year = 2024,
month = dec,
journal = {AI},
volume = 2024, vol. 5,
number = iss. 4,
pages = 2680--2692,
note = {Background: Large language models (LLMs) like GPT-3.5-Turbo and GPT-4 show potential to transform medical diagnostics through their linguistic and analytical capabilities. This study evaluates their diagnostic proficiency using English and German medical examination datasets. Methods: We analyzed 452 English and 637 German medical examination questions using GPT models. Performance metrics included broad and exact accuracy rates for primary and three-model generated guesses, with an analysis of performance against varying question difficulties based on student accuracy rates. Results: GPT-4 demonstrated superior performance, achieving up to 95.4% accuracy when considering approximate similarity in English datasets. While GPT-3.5-Turbo showed better results in English, GPT-4 maintained consistent performance across both languages. Question difficulty was correlated with diagnostic accuracy, particularly in German datasets. Conclusions: The study demonstrates GPT-4's significant diagnostic capabilities and cross-linguistic flexibility, suggesting potential for clinical applications. However, further validation and ethical consideration are necessary before widespread implementation.},
url = {https://hdl.handle.net/20.500.11811/13331}
}