Deußer, Tobias; Ramien, Gregor; Weber, Nico; Meidinger, Maximilian; Hahnbück, Max; Bauckhage, Christian; Sifa, Rafet: Leveraging Synthetically Generated Data for Real Estate Document Classification. New York, NY: IEEE, Institute of Electrical and Electronics Engineers, 2025.
Online-Ausgabe in bonndoc: https://doi.org/10.48565/bonndoc-809
@comment{
  Repository record for handle 20.500.11811/13972 (DOI 10.48565/bonndoc-809).
  The citation key is kept as exported so existing citations still resolve.
  The abstract is stored in the abstract field, not in note, so that standard
  styles do not print it in the bibliography.
  NOTE(review): the proceedings entry type does not print the author field
  under the standard styles. If this record is a single conference paper,
  inproceedings with a booktitle is the appropriate type; the booktitle is
  not available in this record, so the type is left unchanged. TODO confirm.
}
@proceedings{handle:20.500.11811/13972,
  author    = {Deußer, Tobias and Ramien, Gregor and Weber, Nico and Meidinger, Maximilian and Hahnbück, Max and Bauckhage, Christian and Sifa, Rafet},
  title     = {Leveraging Synthetically Generated Data for Real Estate Document Classification},
  publisher = {IEEE, Institute of Electrical and Electronics Engineers},
  address   = {New York, NY},
  year      = 2025,
  month     = dec,
  doi       = {10.48565/bonndoc-809},
  url       = {https://hdl.handle.net/20.500.11811/13972},
  abstract  = {Document classification in regulated domains like law, finance, or real estate is hindered by the scarcity of labeled data and strict privacy constraints. This paper presents a pipeline for synthetically generating training data for document classifiers using a combination of domain-specific templates, large language models, and data augmentation techniques. Focusing on two key document types relevant to real estate workflows, Child Support Certificate and Refurbishment Roadmap, we construct realistic multi-page documents and generate negative classes using LLM-generated distractors. We train a BERT-based classifier on this synthetic dataset and evaluate it on real-world OCR-extracted documents, achieving strong performance despite the absence of real documents in training. Our findings highlight the feasibility of using synthetic data to overcome annotation bottlenecks and pave the way for broader applications in privacy-sensitive industries.},
}





