Leveraging Model Soups to Classify Intangible Cultural Heritage Images from the Mekong Delta
Abstract
The classification of Intangible Cultural Heritage (ICH) images in the Mekong Delta poses unique challenges due to limited annotated data, high visual similarity among classes, and domain heterogeneity. In such low-resource settings, conventional deep learning models often suffer from high variance or overfit to spurious correlations, leading to poor generalization. To address these limitations, we propose a robust framework that integrates the hybrid CoAtNet architecture with Model Soups, a lightweight weight-space ensembling technique that averages checkpoints from a single training trajectory — without increasing inference cost. CoAtNet captures both local and global patterns through stage-wise fusion of convolution and self-attention. We apply two ensembling strategies — greedy and uniform soup — to selectively combine diverse checkpoints into a final model. Beyond performance improvements, we analyze the ensembling effect through the lens of bias–variance decomposition. Our findings show that Model Soups reduces variance by stabilizing predictions across diverse model snapshots, while introducing minimal additional bias. Furthermore, using cross-entropy-based distance metrics and Multidimensional Scaling (MDS), we show that Model Soups selects geometrically diverse checkpoints, unlike Soft Voting, which blends redundant models centered in output space. Evaluated on the ICH-17 dataset (7,406 images across 17 classes), our approach achieves state-of-the-art results with 72.36% top-1 accuracy and 69.28% macro F1-score, outperforming strong baselines including ResNet-50, DenseNet-121, and ViT. These results underscore that diversity-aware checkpoint averaging provides a principled and efficient way to reduce variance and enhance generalization in culturally rich, data-scarce classification tasks.
References
@misc{wortsman2022modelsoupsaveragingweights,
  title         = {Model Soups: Averaging Weights of Multiple Fine-Tuned Models Improves Accuracy without Increasing Inference Time},
  author        = {Mitchell Wortsman and Gabriel Ilharco and Samir Yitzhak Gadre and Rebecca Roelofs and Raphael Gontijo-Lopes and Ari S. Morcos and Hongseok Namkoong and Ali Farhadi and Yair Carmon and Simon Kornblith and Ludwig Schmidt},
  year          = {2022},
  eprint        = {2203.05482},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2203.05482},
}
@misc{dai2021coatnetmarryingconvolutionattention,
  title         = {{CoAtNet}: Marrying Convolution and Attention for All Data Sizes},
  author        = {Zihang Dai and Hanxiao Liu and Quoc V. Le and Mingxing Tan},
  year          = {2021},
  eprint        = {2106.04803},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2106.04803},
}
@misc{northcutt2022confidentlearningestimatinguncertainty,
  title         = {Confident Learning: Estimating Uncertainty in Dataset Labels},
  author        = {Curtis G. Northcutt and Lu Jiang and Isaac L. Chuang},
  year          = {2022},
  eprint        = {1911.00068},
  archivePrefix = {arXiv},
  primaryClass  = {stat.ML},
  url           = {https://arxiv.org/abs/1911.00068},
}
@misc{lee2018cleannettransferlearningscalable,
  title         = {{CleanNet}: Transfer Learning for Scalable Image Classifier Training with Label Noise},
  author        = {Kuang-Huei Lee and Xiaodong He and Lei Zhang and Linjun Yang},
  year          = {2018},
  eprint        = {1711.07131},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/1711.07131},
}
@incollection{do2021visual,
  title     = {Visual Classification of Intangible Cultural Heritage Images in the {Mekong Delta}},
  author    = {Do, Thanh-Nghi and Pham, The-Phi and Nguyen, Huu-Hoa and Pham, Nguyen-Khang},
  booktitle = {Data Analytics for Cultural Heritage: Current Trends and Concepts},
  pages     = {71--89},
  year      = {2021},
  publisher = {Springer}
}
@inproceedings{tran2024fusing,
  title        = {Fusing Models for Classifying Intangible Cultural Heritage Images in the {Mekong Delta}},
  author       = {Tran, Minh-Tan and Pham, The-Phi and Thai-Nghe, Nguyen and Do, Thanh-Nghi},
  booktitle    = {International Conference on Intelligent Systems and Data Science},
  pages        = {202--212},
  year         = {2024},
  organization = {Springer}
}
@InProceedings{10.1007/978-3-030-38364-0_17,
author="Do, Thanh-Nghi
and Pham, The-Phi
and Pham, Nguyen-Khang
and Nguyen, Huu-Hoa
and Tabia, Karim
and Benferhat, Salem",
editor="Le Thi, Hoai An
and Le, Hoai Minh
and Pham Dinh, Tao
and Nguyen, Ngoc Thanh",
title="Stacking of {SVMs} for Classifying Intangible Cultural Heritage Images",
booktitle="Advanced Computational Methods for Knowledge Engineering",
year="2020",
publisher="Springer International Publishing",
address="Cham",
pages="186--196",
doi="10.1007/978-3-030-38364-0_17",
abstract="Our investigation aims at classifying images of the intangible cultural heritage (ICH) in the Mekong Delta, Vietnam. We collect an images dataset of 17 ICH categories and manually annotate them. The comparative study of the ICH image classification is done by the support vector machines (SVM) and many popular vision approaches including the handcrafted features such as the scale-invariant feature transform (SIFT) and the bag-of-words (BoW) model, the histogram of oriented gradients (HOG), the GIST and the automated deep learning of invariant features like VGG19, ResNet50, Inception v3, Xception. The numerical test results on 17 ICH dataset show that SVM models learned from Inception v3 and Xception features give good accuracy of 61.54{\%} and 62.89{\%} respectively. We propose to stack SVM models using different visual features to improve the classification result performed by any single one. Triplets (SVM-Xception, SVM-Inception-v3, SVM-VGG19), (SVM-Xception, SVM-Inception-v3, SVM-SIFT-BoW) achieve 65.32{\%} of the classification correctness.",
isbn="978-3-030-38364-0"
}
@misc{dosovitskiy2021imageworth16x16words,
  title         = {An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale},
  author        = {Alexey Dosovitskiy and Lucas Beyer and Alexander Kolesnikov and Dirk Weissenborn and Xiaohua Zhai and Thomas Unterthiner and Mostafa Dehghani and Matthias Minderer and Georg Heigold and Sylvain Gelly and Jakob Uszkoreit and Neil Houlsby},
  year          = {2021},
  eprint        = {2010.11929},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2010.11929},
}
@misc{he2015deepresiduallearningimage,
  title         = {Deep Residual Learning for Image Recognition},
  author        = {Kaiming He and Xiangyu Zhang and Shaoqing Ren and Jian Sun},
  year          = {2015},
  eprint        = {1512.03385},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/1512.03385},
}
@misc{huang2018denselyconnectedconvolutionalnetworks,
  title         = {Densely Connected Convolutional Networks},
  author        = {Gao Huang and Zhuang Liu and Laurens van der Maaten and Kilian Q. Weinberger},
  year          = {2018},
  eprint        = {1608.06993},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/1608.06993},
}
@misc{sandler2019mobilenetv2invertedresidualslinear,
  title         = {{MobileNetV2}: Inverted Residuals and Linear Bottlenecks},
  author        = {Mark Sandler and Andrew Howard and Menglong Zhu and Andrey Zhmoginov and Liang-Chieh Chen},
  year          = {2019},
  eprint        = {1801.04381},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/1801.04381},
}
@misc{vaswani2023attentionneed,
  title         = {Attention Is All You Need},
  author        = {Ashish Vaswani and Noam Shazeer and Niki Parmar and Jakob Uszkoreit and Llion Jones and Aidan N. Gomez and Lukasz Kaiser and Illia Polosukhin},
  year          = {2017},
  eprint        = {1706.03762},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/1706.03762},
}
@misc{zhang2017mixup,
  title         = {{mixup}: Beyond Empirical Risk Minimization},
  author        = {Zhang, Hongyi and Cisse, Moustapha and Dauphin, Yann N. and Lopez-Paz, David},
  year          = {2017},
  eprint        = {1710.09412},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/1710.09412},
}
@inproceedings{yun2019cutmix,
  title     = {{CutMix}: Regularization Strategy to Train Strong Classifiers with Localizable Features},
  author    = {Yun, Sangdoo and Han, Dongyoon and Oh, Seong Joon and Chun, Sanghyuk and Choe, Junsuk and Yoo, Youngjoon},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages     = {6023--6032},
  year      = {2019}
}
@misc{loshchilov2017decoupled,
  title         = {Decoupled Weight Decay Regularization},
  author        = {Loshchilov, Ilya and Hutter, Frank},
  year          = {2017},
  eprint        = {1711.05101},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/1711.05101},
}
@misc{hu2019squeezeandexcitationnetworks,
  title         = {Squeeze-and-Excitation Networks},
  author        = {Jie Hu and Li Shen and Samuel Albanie and Gang Sun and Enhua Wu},
  year          = {2019},
  eprint        = {1709.01507},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/1709.01507},
}
@inproceedings{demaine2021multidimensional,
  title        = {Multidimensional Scaling: Approximation and Complexity},
  author       = {Demaine, Erik and Hesterberg, Adam and Koehler, Frederic and Lynch, Jayson and Urschel, John},
  booktitle    = {International Conference on Machine Learning},
  pages        = {2568--2578},
  year         = {2021},
  organization = {PMLR}
}
@misc{cai2022theoreticalfoundationstsnevisualizing,
  title         = {Theoretical Foundations of {t-SNE} for Visualizing High-Dimensional Clustered Data},
  author        = {T. Tony Cai and Rong Ma},
  year          = {2022},
  eprint        = {2105.07536},
  archivePrefix = {arXiv},
  primaryClass  = {stat.ML},
  url           = {https://arxiv.org/abs/2105.07536},
}
