{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,3]],"date-time":"2025-05-03T14:44:58Z","timestamp":1746283498807,"version":"3.28.0"},"reference-count":29,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018,5]]},"DOI":"10.1109\/ipdps.2018.00086","type":"proceedings-article","created":{"date-parts":[[2018,8,6]],"date-time":"2018-08-06T22:42:01Z","timestamp":1533595321000},"page":"763-773","source":"Crossref","is-referenced-by-count":13,"title":["Taming the \"Monster\": Overcoming Program Optimization Challenges on SW26010 Through Precise Performance Modeling"],"prefix":"10.1109","author":[{"given":"Shizhen","family":"Xu","sequence":"first","affiliation":[]},{"given":"Yuanchao","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Wei","family":"Xue","sequence":"additional","affiliation":[]},{"given":"Xipeng","family":"Shen","sequence":"additional","affiliation":[]},{"given":"Fang","family":"Zheng","sequence":"additional","affiliation":[]},{"given":"Xiaomeng","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Guangwen","family":"Yang","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1142\/9789812701831_0012"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1145\/1375581.1375595"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2012.7"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1145\/1772954.1772965"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2009.5161054"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2011.86"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1145\/3126908.3126909"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2001.903249"},{"key":"ref19","first-page":"2","article-title":"Exploring instruction-fetch bandwidth requirement in wide-issue superscalar processors","author":"michaud","year":"1999","journal-title":"Parallel Architectures and Compilation Techniques 1999 Proceedings 1999 International Conference on"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2014.20"},{"key":"ref4","first-page":"83","article-title":"Refactoring and optimizing the community atmosphere model (cam) on the sunway taihulight supercomputer","author":"fu","year":"2016","journal-title":"Proceedings of the International Conference for High Performance Computing Networking Storage and Analysis"},{"key":"ref27","doi-asserted-by":"crossref","first-page":"101","DOI":"10.1007\/978-3-642-28652-0_6","article-title":"Analytical bounds for optimal tile size selection","author":"shirako","year":"2012","journal-title":"International Conference on Compiler Construction"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2017.9"},{"key":"ref6","doi-asserted-by":"crossref","first-page":"11","DOI":"10.1145\/2370036.2145819","article-title":"A performance analysis framework for identifying potential benefits in gpgpu applications","volume":"47","author":"sim","year":"2012","journal-title":"ACM SIGPLAN Notices"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1145\/3192366.3192397"},{"key":"ref5","first-page":"6:1","article-title":"10m-core scalable fully-implicit solver for nonhydrostatic atmospheric dynamics","author":"yang","year":"2016","journal-title":"Proceedings of the International Conference for High Performance Computing Networking Storage and Analysis ser SC '16"},{"key":"ref8","first-page":"1","article-title":"Performance upper bound analysis and optimization of sgemm on fermi and kepler gpus","author":"lai","year":"2013","journal-title":"Code Generation and Optimization (CGO) 2013 IEEE\/ACM International Symposium on"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/1555754.1555775"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1016\/S0129-6264(00)00021-4"},{"journal-title":"NVIDIA CUDA Compute Unified Device Architecture Programming Guide","article-title":"NVIDIA Corporation","year":"2016","key":"ref9"},{"key":"ref1","article-title":"Report on the sunway taihulight system","volume":"20","author":"dongarra","year":"2016","journal-title":"PDF) www netlib org Retrieved June"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/1028176.1006729"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/1534909.1534910"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/2019608.2019609"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/1498765.1498785"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1145\/2678277"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2006.6"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46079-6_24"}],"event":{"name":"2018 IEEE International Parallel and Distributed Processing Symposium (IPDPS)","start":{"date-parts":[[2018,5,21]]},"location":"Vancouver, BC","end":{"date-parts":[[2018,5,25]]}},"container-title":["2018 IEEE International Parallel and Distributed Processing Symposium (IPDPS)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/8424926\/8425144\/08425230.pdf?arnumber=8425230","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,1,26]],"date-time":"2022-01-26T14:50:02Z","timestamp":1643208602000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8425230\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,5]]},"references-count":29,"URL":"https:\/\/doi.org\/10.1109\/ipdps.2018.00086","relation":{},"subject":[],"published":{"date-parts":[[2018,5]]}}}