@inproceedings{meier-etal-2025-trojanstego,
  title     = {{TrojanStego}: Your Language Model Can Secretly Be A Steganographic Privacy Leaking Agent},
  author    = {Meier, Dominik and Wahle, Jan Philip and R{\"o}ttger, Paul and Ruas, Terry and Gipp, Bela},
  editor    = {Christodoulopoulos, Christos and Chakraborty, Tanmoy and Rose, Carolyn and Peng, Violet},
  booktitle = {Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.emnlp-main.1386/},
  doi       = {10.18653/v1/2025.emnlp-main.1386},
  pages     = {27232--27249},
  isbn      = {979-8-89176-332-6},
  abstract  = {As large language models (LLMs) become integrated into sensitive workflows, concerns grow over their potential to leak confidential information ({``}secrets{''}). We propose TrojanStego, a novel threat model in which an adversary fine-tunes an LLM to embed sensitive context information into natural-looking outputs via linguistic steganography, without requiring explicit control over inference inputs. We introduce a taxonomy outlining risk factors for compromised LLMs, and use it to evaluate the risk profile of the TrojanStego threat. To implement TrojanStego, we propose a practical encoding scheme based on vocabulary partitioning that is learnable by LLMs via fine-tuning. Experimental results show that compromised models reliably transmit 32-bit secrets with 87{\%} accuracy on held-out prompts, reaching over 97{\%} accuracy using majority voting across three generations. Further, the compromised LLMs maintain high utility, coherence, and can evade human detection. Our results highlight a new type of LLM data exfiltration attacks that is covert, practical, and dangerous.},
}