% OpenFashionCLIP conference paper (ICIAP 2023, LNCS volume 14233).
% The citation key looks like an opaque exporter-generated id; it is kept
% unchanged so that existing \cite commands continue to resolve.
% Editor given names are stored as initials because only initials were
% available in the exported record -- TODO expand to full names once confirmed.
% NOTE(review): address is the publisher city inferred from the Springer
% 978-3-031 ISBN range -- confirm against the volume front matter.
@inproceedings{6e147666551c4b869298dc35e01ffba6,
  author    = {Cartella, Giuseppe and Baldrati, Alberto and Morelli, Davide and Cornia, Marcella and Bertini, Marco and Cucchiara, Rita},
  title     = {{OpenFashionCLIP}: Vision-and-Language Contrastive Learning with Open-Source Fashion Data},
  booktitle = {Image Analysis and Processing -- {ICIAP} 2023, Part {I}},
  editor    = {Foresti, G. L. and Fusiello, A. and Hancock, E.},
  series    = {Lecture Notes in Computer Science},
  volume    = {14233},
  pages     = {245--256},
  publisher = {Springer Nature},
  address   = {Cham},
  year      = {2023},
  doi       = {10.1007/978-3-031-43148-7_21},
  isbn      = {978-3-031-43147-0},
  language  = {English},
  keywords  = {Fashion Domain, Open-Source Datasets, Vision-and-Language Pre-Training},
  abstract  = {The inexorable growth of online shopping and e-commerce demands scalable and robust machine learning-based solutions to accommodate customer requirements. In the context of automatic tagging classification and multimodal retrieval, prior works either defined a low generalizable supervised learning approach or more reusable CLIP-based techniques while, however, training on closed source data. In this work, we propose OpenFashionCLIP, a vision-and-language contrastive learning method that only adopts open-source fashion data stemming from diverse domains, and characterized by varying degrees of specificity. Our approach is extensively validated across several tasks and benchmarks, and experimental results highlight a significant out-of-domain generalization capability and consistent improvements over state-of-the-art methods both in terms of accuracy and recall. Source code and trained models are publicly available at: https://github.com/aimagelab/open-fashion-clip.},
  note      = {22nd International Conference on Image Analysis and Processing ({ICIAP}); conference dates: 11--15 September 2023},
}