% Conference paper (IDA 2023, Springer LNCS). Cleaned-up portal export:
% names are in "Last, First" form, the DOI is stored bare, and the acronym
% in the title is brace-protected against style recasing.
@inproceedings{91a9131c7d2840de8db4793b9c6035bb,
  author    = {Pelicon, Andra{\v z} and Montariol, Syrielle and Kralj Novak, Petra},
  title     = {Don't Start Your Data Labeling from Scratch: {OpSaLa} - Optimized Data Sampling Before Labeling},
  abstract  = {Many text classification tasks face a severe class imbalance problem that limits the ability to train high-performance models. This is partly due to the small number of instances in the minority class, so that the minority class patterns are not well-represented. A common approach in such cases is to resort to data augmentation techniques; however, these have shown mixed results on text data. Our proposed solution is to Optimize the data Sampling prior to Labeling (OpSaLa) to obtain overrepresented minority class(es) in the training dataset. We evaluate our approach on three real-world hate speech datasets and compare it to four commonly used approaches: training on the ``natural'' class distribution, a class weighting approach, and two oversampling approaches: minority oversampling and backtranslation. Our results confirm that the OpSaLa approach yields better models while the labeling budget stays the same.},
  editor    = {Cr{\'e}milleux, Bruno and Hess, Sibylle and Nijssen, Siegfried},
  booktitle = {Advances in Intelligent Data Analysis XXI - 21st International Symposium on Intelligent Data Analysis, IDA 2023, Proceedings},
  series    = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)},
  publisher = {Springer Science and Business Media Deutschland GmbH},
  pages     = {353--365},
  year      = {2023},
  month     = apr,
  day       = {1},
  doi       = {10.1007/978-3-031-30047-9_28},
  isbn      = {9783031300462},
  language  = {English},
  note      = {Publisher Copyright: {\textcopyright} 2023, The Author(s), under exclusive license to Springer Nature Switzerland AG.; 21st International Symposium on Intelligent Data Analysis, IDA 2023 ; Conference date: 12-04-2023 Through 14-04-2023},
}