{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T08:25:05Z","timestamp":1774599905461,"version":"3.50.1"},"reference-count":42,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018,6]]},"DOI":"10.1109\/cvpr.2018.00583","type":"proceedings-article","created":{"date-parts":[[2018,12,18]],"date-time":"2018-12-18T01:49:37Z","timestamp":1545097777000},"page":"5561-5570","source":"Crossref","is-referenced-by-count":270,"title":["Convolutional Image Captioning"],"prefix":"10.1109","author":[{"given":"Jyoti","family":"Aneja","sequence":"first","affiliation":[]},{"given":"Aditya","family":"Deshpande","sequence":"additional","affiliation":[]},{"given":"Alexander G.","family":"Schwing","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","first-page":"2048","volume":"37","author":"xu","year":"2015","journal-title":"Show attend and tell Neural image caption generation with visual attention In Proceedings of the 32Nd International Conference on International Conference on Machine Learning"},{"key":"ref38","author":"wang","year":"2017","journal-title":"Diverse and Accurate Image Description Using a Variational Auto-Encoder with an Additive Gaussian Encoding Space In Proc NIPS"},{"key":"ref33","first-page":"1017","article-title":"Generating text with recurrent neural networks","author":"sutskever","year":"2011","journal-title":"Proceedings of the 28th International Conference on Machine Learning (ICML-11) ICML &#x2018;11"},{"key":"ref32","first-page":"207","volume":"2","author":"socher","year":"2014","journal-title":"Grounded compositional semantics for finding and describing images with sentences Transactions of the Association for Computational Linguistics"},{"key":"ref31","first-page":"4","volume":"abs 1409 1556","author":"simonyan","year":"2014","journal-title":"Very Deep Convolutional Networks for Large-scale Image Recognition"},{"key":"ref30","author":"shih","year":"2016","journal-title":"Where to look Focus regions for visual question answering In Computer Vision and Pattern Recognition"},{"key":"ref37","first-page":"652","volume":"39","author":"vinyals","year":"2017","journal-title":"Show and tell Lessons learned from the 2015 mscoco image captioning challenge IEEE Trans Pattern Anal Mach Intell"},{"key":"ref36","first-page":"4566","author":"vedantam","year":"2015","journal-title":"CIDEr - consensus-based image description evaluation"},{"key":"ref35","volume":"abs 1706 3762","author":"vaswani","year":"2017","journal-title":"Attention is all you need"},{"key":"ref34","author":"van den oord","year":"2016","journal-title":"Conditional image generation with pixelcnn decoders In NIPS"},{"key":"ref10","doi-asserted-by":"crossref","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"hochreiter","year":"1997","journal-title":"Long Short-term Memory Neural Computation"},{"key":"ref40","first-page":"444","author":"yang","year":"2011","journal-title":"Corpus- guided sentence generation of natural images In Proceedings of the Conference on Empirical Methods in Natural Language Processing"},{"key":"ref11","volume":"47","author":"hodosh","year":"2013","journal-title":"Framing image description as a ranking task Data models and evaluation metrics J Artif Int Res"},{"key":"ref12","author":"hu","year":"2017","journal-title":"MaskRNN Instance Level Video Object Segmentation In Proc NIPS"},{"key":"ref13","author":"jain","year":"0","journal-title":"Two can play this Game Visual Dialog with Discriminative Question Generation and Answering In Proc CVPR 2018"},{"key":"ref14","author":"jain","year":"2017","journal-title":"Creativity Generating diverse questions using variational autoencoders In Computer Vision and Pattern Recognition"},{"key":"ref15","first-page":"2407","article-title":"Learning cross-modality similarity for multinomial data","author":"jia","year":"2011","journal-title":"Proceedings of the 2011 International Conference on Computer Vision ICCV'11"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"ref17","first-page":"5","author":"lin","year":"2004","journal-title":"Rouge A package for automatic evaluation of summaries"},{"key":"ref18","first-page":"740","author":"lin","year":"2014","journal-title":"Microsoft coco Common objects in context"},{"key":"ref19","first-page":"7","volume":"abs 1412 6632","author":"mao","year":"2014","journal-title":"Deep captioning with multimodal recurrent neural networks (m-rnn)"},{"key":"ref28","author":"schwartz","year":"2017","journal-title":"High-Order Attention Models for Visual Question Answering In Proc NIPS"},{"key":"ref4","volume":"abs 1409 473","author":"bahdanau","year":"2014","journal-title":"Neural machine translation by jointly learning to align and translate CoRR"},{"key":"ref27","first-page":"211","volume":"115","author":"russakovsky","year":"2015","journal-title":"Imagenet large scale visual recognition challenge International Journal of Computer Vision"},{"key":"ref3","author":"antol","year":"2015","journal-title":"VQA Visual Question Answering In International Conference on Computer Vision (ICCV)"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1179"},{"key":"ref29","doi-asserted-by":"crossref","first-page":"640","DOI":"10.1109\/TPAMI.2016.2572683","volume":"39","author":"shelhamer","year":"2017","journal-title":"Fully convolutional networks for semantic segmentation IEEE Trans Pattern Anal Mach Intell"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298856"},{"key":"ref8","author":"denkowski","year":"2014","journal-title":"Meteor universal Language specific translation evaluation for any target language In Proceedings of the EACL 2014 Workshop on Statistical Machine Translation"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.121"},{"key":"ref2","author":"anderson","year":"2017","journal-title":"Bottom-up and top-down attention for image captioning and visual question answering"},{"key":"ref9","volume":"abs 1705 3122","author":"gehring","year":"2017","journal-title":"Convolutional sequence to sequence learning"},{"key":"ref1","author":"anderson","year":"2016","journal-title":"Spice Semantic propositional image caption evaluation"},{"key":"ref20","author":"mostafazadeh","year":"2016","journal-title":"Generating natural questions about an image In ACL (1) The Association for Computer Linguistics"},{"key":"ref22","first-page":"1143","author":"ordonez","year":"2011","journal-title":"Im2text Describing images using 1 million captioned photographs In Proceedings of the 24th International Conference on Neural Information Processing Systems NIPS'11"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298959"},{"key":"ref42","author":"yeh","year":"2017","journal-title":"Interpretable and Globally Optimal Prediction for Textual Grounding using Image Concepts In Proc NIPS"},{"key":"ref24","first-page":"iii-1310","volume":"28","author":"pascanu","year":"2013","journal-title":"On the difficulty of training recurrent neural networks In Proceedings of the 30th International Conference on International Conference on Machine Learning"},{"key":"ref41","first-page":"4904","author":"yao","year":"2017","journal-title":"Boosting image captioning with attributes In IEEE International Conference on Computer Vision ICCV 2017"},{"key":"ref23","first-page":"311","author":"papineni","year":"2002","journal-title":"Bleu A method for automatic evaluation of machine translation In Proceedings of the 40th Annual Meeting on Association for Computational Linguistics ACL &#x2018;02"},{"key":"ref26","first-page":"91","volume":"1","author":"ren","year":"2015","journal-title":"Faster r-cnn Towards real-time object detection with region proposal networks In Proceedings of the 28th International Conference on Neural Information Processing Systems"},{"key":"ref25","volume":"abs 1506 2640","author":"redmon","year":"2015","journal-title":"You only look once Unified real-time object detection"}],"event":{"name":"2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","location":"Salt Lake City, UT","start":{"date-parts":[[2018,6,18]]},"end":{"date-parts":[[2018,6,23]]}},"container-title":["2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/8576498\/8578098\/08578681.pdf?arnumber=8578681","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,1,27]],"date-time":"2022-01-27T16:04:56Z","timestamp":1643299496000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8578681\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,6]]},"references-count":42,"URL":"https:\/\/doi.org\/10.1109\/cvpr.2018.00583","relation":{},"subject":[],"published":{"date-parts":[[2018,6]]}}}