% Workshop paper from the ClinicalNLP 2024 proceedings (repository export, cleaned up).
% Only Spanish and NER are brace-protected in the title so that sentence-casing styles keep them.
% The duplicated series field (identical to booktitle) was dropped to avoid printing the venue twice.
% month is taken from the conference date recorded in the note field.
% NOTE(review): the editor entry for Abacha keeps the export's split (given name: Asma Ben);
% confirm whether the surname should instead be the compound Ben Abacha.
@inproceedings{20444c705caf4c179b753c72d2b47c15,
  author    = {Aracena, Claudio and Miranda, Luis and Vakili, Thomas and Villena, Fabi{\'a}n and Quiroga, Tamara and N{\'u}{\~n}ez-Torres, Fredy and Rocco, Victor and Dunstan, Jocelyn},
  title     = {A Privacy-Preserving Corpus for Occupational Health in {Spanish}: Evaluation for {NER} and Classification Tasks},
  booktitle = {{ClinicalNLP} 2024 - 6th Workshop on Clinical Natural Language Processing, Proceedings of the Workshop},
  editor    = {Naumann, Tristan and Abacha, Asma Ben and Bethard, Steven and Roberts, Kirk and Bitterman, Danielle},
  publisher = {Association for Computational Linguistics (ACL)},
  pages     = {111--121},
  year      = {2024},
  month     = jun,
  language  = {English},
  note      = {Publisher Copyright: {\textcopyright} 2024 Association for Computational Linguistics. 6th Workshop on Clinical Natural Language Processing, ClinicalNLP 2024, held at NAACL 2024; Conference date: 21-06-2024},
  abstract  = {Annotated corpora are essential to reliable natural language processing. While they are expensive to create, they are essential for building and evaluating systems. This study introduces a new corpus of 2,869 medical and admission reports collected by an occupational insurance and health provider. The corpus has been carefully annotated for personally identifiable information (PII) and is shared, masking this information. Two annotators adhered to annotation guidelines during the annotation process, and a referee later resolved annotation conflicts in a consolidation process to build a gold standard subcorpus. The inter-annotator agreement values, measured in F1, range between 0.86 and 0.93 depending on the selected subcorpus. The value of the corpus is demonstrated by evaluating its use for NER of PII and a classification task. The evaluations find that fine-tuned models and GPT-3.5 reach F1 of 0.911 and 0.720 in NER of PII, respectively. In the case of the insurance coverage classification task, using the original or de-identified corpus results in similar performance. The annotated data are released in de-identified form.},
}