{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:43:50Z","timestamp":1777657430138,"version":"3.51.4"},"reference-count":43,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018,6]]},"DOI":"10.1109\/cvpr.2018.00911","type":"proceedings-article","created":{"date-parts":[[2018,12,18]],"date-time":"2018-12-18T01:49:37Z","timestamp":1545097777000},"page":"8739-8748","source":"Crossref","is-referenced-by-count":423,"title":["End-to-End Dense Video Captioning with Masked Transformer"],"prefix":"10.1109","author":[{"given":"Luowei","family":"Zhou","sequence":"first","affiliation":[]},{"given":"Yingbo","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Jason J.","family":"Corso","sequence":"additional","affiliation":[]},{"given":"Richard","family":"Socher","sequence":"additional","affiliation":[]},{"given":"Caiming","family":"Xiong","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","first-page":"53","article-title":"Grounded language learning from video described with sentences","author":"yu","year":"2013","journal-title":"ACL (1)"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1613\/jair.4556"},{"key":"ref33","author":"xiong","year":"2016","journal-title":"CUHK & ETHZ & SIAT submission to ActivityNet challenge 2016"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"ref31","author":"venugopalan","year":"2014","journal-title":"Translating videos to natural language using deep recurrent neural networks"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.515"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.503"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.524"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.512"},{"key":"ref34","first-page":"2048","article-title":"Show, attend and tell: Neural image caption generation with visual attention","author":"xu","year":"2015","journal-title":"ICML"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.127"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.496"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.392"},{"key":"ref12","author":"ghanem","year":"2017","journal-title":"Activitynet challenge 2017 summary"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.169"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2005.06.042"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref16","author":"heilbron","year":"0","journal-title":"Scc Semantic context cascade for efficient action detection"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref18","first-page":"448","article-title":"Batch normalization: Accelerating deep network training by reducing internal covariate shift","author":"ioffe","year":"2015","journal-title":"ICML"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"ref28","author":"simonyan","year":"2014","journal-title":"Very Deep Convolutional Networks for Large-scale Image Recognition"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.675"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.119"},{"key":"ref3","author":"bahdanau","year":"2014","journal-title":"Neural machine translation by jointly learning to align and translate"},{"key":"ref6","article-title":"A thousand frames in just a few words: Lingual description of videos through latent topics and sparse object stitching","author":"das","year":"2013","journal-title":"Proceedings of IEEE Conference on Computer Vision and Pattern Recognition"},{"key":"ref29","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"NIPS"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"ref8","first-page":"768","article-title":"Daps: Deep action proposals for action understanding","author":"escorcia","year":"2016","journal-title":"ECCV"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"ref2","author":"ba","year":"2016","journal-title":"Layer normalization"},{"key":"ref9","first-page":"1019","article-title":"A theoretically grounded application of dropout in recurrent neural networks","author":"gal","year":"2016","journal-title":"NIPS"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.495"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.105"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.111"},{"key":"ref21","author":"lin","year":"2017","journal-title":"A structured self-attentive sentence embedding"},{"key":"ref42","article-title":"Towards automatic learning of procedures from web instructional videos","author":"zhou","year":"2018","journal-title":"AAAI"},{"key":"ref24","author":"paulus","year":"2017","journal-title":"A deep reinforced model for abstractive summarization"},{"key":"ref41","first-page":"766","article-title":"Video summarization with long short-term memory","author":"zhang","year":"2016","journal-title":"ECCV"},{"key":"ref23","first-page":"1310","article-title":"On the difficulty of training recurrent neural networks","author":"pascanu","year":"2013","journal-title":"ICML"},{"key":"ref26","author":"rennie","year":"2016","journal-title":"Self-critical sequence training for image captioning"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1145\/3126686.3126717"},{"key":"ref25","first-page":"91","article-title":"Faster r-cnn: Towards real-time object detection with region proposal networks","author":"ren","year":"2015","journal-title":"NIPS"}],"event":{"name":"2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","location":"Salt Lake City, UT","start":{"date-parts":[[2018,6,18]]},"end":{"date-parts":[[2018,6,23]]}},"container-title":["2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/8576498\/8578098\/08579009.pdf?arnumber=8579009","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,1,27]],"date-time":"2022-01-27T13:36:00Z","timestamp":1643290560000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8579009\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,6]]},"references-count":43,"URL":"https:\/\/doi.org\/10.1109\/cvpr.2018.00911","relation":{},"subject":[],"published":{"date-parts":[[2018,6]]}}}