{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T00:08:05Z","timestamp":1773965285050,"version":"3.50.1"},"reference-count":27,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2027,1,7]],"date-time":"2027-01-07T00:00:00Z","timestamp":1799280000000},"content-version":"am","delay-in-days":190,"URL":"http:\/\/www.elsevier.com\/open-access\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["IIS-2311676"],"award-info":[{"award-number":["IIS-2311676"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["BCS-2240349"],"award-info":[{"award-number":["BCS-2240349"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["RI-2106930"],"award-info":[{"award-number":["RI-2106930"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Computer Speech &amp; Language"],"published-print":{"date-parts":[[2026,7]]},"DOI":"10.1016\/j.csl.2025.101928","type":"journal-article","created":{"date-parts":[[2025,12,31]],"date-time":"2025-12-31T19:13:21Z","timestamp":1767208401000},"page":"101928","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Speech acoustics to rt-MRI articulatory dynamics inversion with video diffusion model"],"prefix":"10.1016","volume":"99","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1387-5418","authenticated-orcid":false,"given":"Xuan","family":"Shi","sequence":"first","affiliation":[]},{"given":"Tiantian","family":"Feng","sequence":"additional","affiliation":[]},{"given":"Jay","family":"Park","sequence":"additional","affiliation":[]},{"given":"Christina","family":"Hagedorn","sequence":"additional","affiliation":[]},{"given":"Louis","family":"Goldstein","sequence":"additional","affiliation":[]},{"given":"Shrikanth","family":"Narayanan","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.csl.2025.101928_b1","series-title":"reedblaylock\/VocalTract-ROI-Toolbox","author":"Blaylock","year":"2021"},{"issue":"2","key":"10.1016\/j.csl.2025.101928_b2","doi-asserted-by":"crossref","first-page":"201","DOI":"10.1017\/S0952675700001019","article-title":"Articulatory gestures as phonological units","volume":"6","author":"Browman","year":"1989","journal-title":"Phonology"},{"key":"10.1016\/j.csl.2025.101928_b3","series-title":"ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"12061","article-title":"Self-supervised models of speech infer universal articulatory kinematics","author":"Cho","year":"2024"},{"key":"10.1016\/j.csl.2025.101928_b4","series-title":"21st Annual Conference of the International Speech Communication Association","first-page":"3720","article-title":"Speaker dependent acoustic-to-articulatory inversion using real-time MRI of the vocal tract","author":"Csap\u00f3","year":"2020"},{"issue":"4","key":"10.1016\/j.csl.2025.101928_b5","doi-asserted-by":"crossref","first-page":"2162","DOI":"10.1121\/1.3455847","article-title":"A generalized smoothness criterion for acoustic-to-articulatory inversion","volume":"128","author":"Ghosh","year":"2010","journal-title":"J. Acoust. Soc. Am."},{"issue":"4","key":"10.1016\/j.csl.2025.101928_b6","doi-asserted-by":"crossref","first-page":"EL251","DOI":"10.1121\/1.3634122","article-title":"Automatic speech recognition using articulatory features from subject-independent acoustic-to-articulatory inversion","volume":"130","author":"Ghosh","year":"2011","journal-title":"J. Acoust. Soc. Am. Express Lett."},{"issue":"2","key":"10.1016\/j.csl.2025.101928_b7","doi-asserted-by":"crossref","first-page":"EL258","DOI":"10.1121\/1.4813590","article-title":"On smoothing articulatory trajectories obtained from Gaussian mixture model based acoustic-to-articulatory inversion","volume":"134","author":"Ghosh","year":"2013","journal-title":"J. Acoust. Soc. Am. Express Lett."},{"key":"10.1016\/j.csl.2025.101928_b8","series-title":"Interspeech","first-page":"1572","article-title":"Data-driven analysis of realtime vocal tract MRI using correlated image regions","author":"Lammert","year":"2010"},{"key":"10.1016\/j.csl.2025.101928_b9","doi-asserted-by":"crossref","first-page":"196","DOI":"10.1016\/j.csl.2015.05.003","article-title":"Speaker verification based on the fusion of speech acoustics and inverted articulatory signals","volume":"36","author":"Li","year":"2016","journal-title":"Comput. Speech Lang."},{"issue":"1","key":"10.1016\/j.csl.2025.101928_b10","doi-asserted-by":"crossref","first-page":"187","DOI":"10.1038\/s41597-021-00976-x","article-title":"A multispeaker dataset of raw and reconstructed speech production real-time MRI video and 3D volumetric images","volume":"8","author":"Lim","year":"2021","journal-title":"Sci. Data"},{"key":"10.1016\/j.csl.2025.101928_b11","series-title":"Latte: Latent diffusion transformer for video generation","author":"Ma","year":"2024"},{"key":"10.1016\/j.csl.2025.101928_b12","series-title":"25th Annual Conference of the International Speech Communication Association","article-title":"Highly intelligible speaker-independent articulatory synthesis","author":"McGhee","year":"2024"},{"key":"10.1016\/j.csl.2025.101928_b13","series-title":"Speech2rtMRI: Speech-guided diffusion model for real-time MRI video of the vocal tract during speech","author":"Nguyen","year":"2024"},{"key":"10.1016\/j.csl.2025.101928_b14","series-title":"International Conference on Machine Learning","first-page":"8162","article-title":"Improved denoising diffusion probabilistic models","author":"Nichol","year":"2021"},{"issue":"4","key":"10.1016\/j.csl.2025.101928_b15","doi-asserted-by":"crossref","first-page":"EL290","DOI":"10.1121\/1.5057367","article-title":"ACT: An automatic centroid tracking tool for analyzing vocal tract actions in real-time magnetic resonance imaging speech production data","volume":"144","author":"Oh","year":"2018","journal-title":"J. Acoust. Soc. Am."},{"key":"10.1016\/j.csl.2025.101928_b16","series-title":"25th Annual Conference of the International Speech Communication Association","article-title":"Preprocessing for acoustic-to-articulatory inversion using real-time MRI movies of Japanese speech","author":"Oura","year":"2024"},{"key":"10.1016\/j.csl.2025.101928_b17","series-title":"Open-sora-plan","author":"PKU-Yuan Lab","year":"2024"},{"key":"10.1016\/j.csl.2025.101928_b18","series-title":"European Conference on Computer Vision","first-page":"250","article-title":"Film: Frame interpolation for large motion","author":"Reda","year":"2022"},{"key":"10.1016\/j.csl.2025.101928_b19","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B., 2022. High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 10684\u201310695.","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"10.1016\/j.csl.2025.101928_b20","series-title":"Direct articulatory observation reveals phoneme recognition performance characteristics of a self-supervised speech model","author":"Shi","year":"2024"},{"key":"10.1016\/j.csl.2025.101928_b21","series-title":"Silero VAD: pre-trained enterprise-grade voice activity detector (VAD), number detector and language classifier","author":"The Silero-Team","year":"2024"},{"key":"10.1016\/j.csl.2025.101928_b22","series-title":"ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"1","article-title":"Real-time mri video synthesis from time aligned phonemes with sequence-to-sequence networks","author":"Udupa","year":"2023"},{"key":"10.1016\/j.csl.2025.101928_b23","series-title":"Deep Generative Models for Highly Structured Data","article-title":"FVD: a new metric for video generation","author":"Unterthiner","year":"2019"},{"issue":"4","key":"10.1016\/j.csl.2025.101928_b24","doi-asserted-by":"crossref","first-page":"600","DOI":"10.1109\/TIP.2003.819861","article-title":"Image quality assessment: from error visibility to structural similarity","volume":"13","author":"Wang","year":"2004","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.csl.2025.101928_b25","series-title":"ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"2170","article-title":"An audio-textual diffusion model for converting speech signals into ultrasound tongue imaging data","author":"Yang","year":"2024"},{"key":"10.1016\/j.csl.2025.101928_b26","unstructured":"Yu, L., Lezama, J., Gundavarapu, N.B., Versari, L., Sohn, K., Minnen, D., Cheng, Y., Gupta, A., Gu, X., Hauptmann, A.G., Gong, B., Yang, M., Essa, I., Ross, D.A., Jiang, L., 2024. Language Model Beats Diffusion - Tokenizer is key to visual generation. In: The Twelfth International Conference on Learning Representations. ICLR."},{"key":"10.1016\/j.csl.2025.101928_b27","doi-asserted-by":"crossref","DOI":"10.1109\/TASLP.2024.3379877","article-title":"Speechlm: Enhanced speech pre-training with unpaired textual data","author":"Zhang","year":"2024","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."}],"container-title":["Computer Speech &amp; Language"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0885230825001536?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0885230825001536?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,3,19]],"date-time":"2026-03-19T21:55:15Z","timestamp":1773957315000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0885230825001536"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,7]]},"references-count":27,"alternative-id":["S0885230825001536"],"URL":"https:\/\/doi.org\/10.1016\/j.csl.2025.101928","relation":{},"ISSN":["0885-2308"],"issn-type":[{"value":"0885-2308","type":"print"}],"subject":[],"published":{"date-parts":[[2026,7]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Speech acoustics to rt-MRI articulatory dynamics inversion with video diffusion model","name":"articletitle","label":"Article Title"},{"value":"Computer Speech & Language","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.csl.2025.101928","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"101928"}}