{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2023,8,30]],"date-time":"2023-08-30T00:34:18Z","timestamp":1693355658017},"reference-count":38,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"9","license":[{"start":{"date-parts":[[2014,9,1]],"date-time":"2014-09-01T00:00:00Z","timestamp":1409529600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Parallel Distrib. Syst."],"published-print":{"date-parts":[[2014,9]]},"DOI":"10.1109\/tpds.2013.133","type":"journal-article","created":{"date-parts":[[2013,10,28]],"date-time":"2013-10-28T20:44:11Z","timestamp":1382993051000},"page":"2342-2352","source":"Crossref","is-referenced-by-count":4,"title":["Improving GPU Memory Performance with Artificial Barrier Synchronization"],"prefix":"10.1109","volume":"25","author":[{"given":"Shih-Hsiang","family":"Lo","sequence":"first","affiliation":[]},{"given":"Che-Rung","family":"Lee","sequence":"additional","affiliation":[]},{"given":"Quey-Liang","family":"Kao","sequence":"additional","affiliation":[]},{"given":"I-Hsin","family":"Chung","sequence":"additional","affiliation":[]},{"given":"Yeh-Ching","family":"Chung","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref38","year":"2013","journal-title":"MUMmerGPU"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/JSSC.2010.2085991"},{"key":"ref32","year":"2012","journal-title":"Cuda Toolkit 4 
2"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-95424-5"},{"key":"ref30","year":"2012","journal-title":"GPGPU-Sim"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1186\/1471-2105-8-474"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/1362622.1362684"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1145\/339647.339668"},{"key":"ref34","author":"alcantara","year":"2011","journal-title":"GPU Computing Gems Jade Edition"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/71.388040"},{"key":"ref11","first-page":"1530","article-title":"Fast Collective Operations Using Shared and Remote Memory Access Protocols on Clusters","author":"vinod","year":"2003","journal-title":"Proc Int Symp Parallel Distrib Process"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1145\/1088149.1088183"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2010.5452013"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/DASIP.2010.5706268"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2008.5214359"},{"key":"ref16","first-page":"1","article-title":"Inter-Block GPU Communication via Fast Barrier Synchronization","author":"xiao","year":"2010","journal-title":"Proc IEEE Int'l Symp Parallel and Distributed Processing (IPDPS)"},{"key":"ref17","first-page":"3801","article-title":"To GPU Synchronize or Not GPU Synchronize?","author":"feng","year":"2010","journal-title":"Proc IEEE Int'l Symp Circuits and Systems (ISCAS)"},{"key":"ref18","author":"alcantara","year":"2011","journal-title":"Efficient Hash Tables on the GPU"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2006.49"},{"key":"ref4","author":"sengupta","year":"2008","journal-title":"Efficient Parallel Scan Algorithms for GPUs"},{"key":"ref28","first-page":"851","author":"harris","year":"2007","journal-title":"GPU Gems 
3"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/1736020.1736058"},{"key":"ref3","first-page":"57","article-title":"On Dynamic Load Balancing on Graphics Processors","author":"cederman","year":"2008","journal-title":"Proc 23rd ACM SIGGRAPH\/EUROGRAPHICS Symp Graphics Hardware"},{"key":"ref6","year":"2011","journal-title":"Cuda programming guide 4 2"},{"key":"ref5","doi-asserted-by":"crossref","first-page":"75","DOI":"10.1016\/B978-0-12-384988-5.00006-1","author":"burtscher","year":"2011","journal-title":"GPU Computing Gems Emerald Edition"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2009.4919648"},{"key":"ref8","first-page":"18","author":"sartori","year":"2010","journal-title":"Low-Overhead High-Speed Multi-Core Barrier Synchronization"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1016\/j.parco.2009.05.002"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1145\/70082.68187"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1145\/1787275.1787289"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1145\/1297027.1297055"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/PACT.2007.4336197"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2011.5762717"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/1736020.1736036"},{"key":"ref24","author":"lee","year":"2010","journal-title":"DRAM-aware Last-level Cache Writeback Reducing Write-caused Interference in Memory Systems"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1145\/1854273.1854350"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2006.24"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2007.21"}],"container-title":["IEEE Transactions on Parallel and Distributed 
Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/71\/6873370\/06515115.pdf?arnumber=6515115","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,1,12]],"date-time":"2022-01-12T16:32:31Z","timestamp":1642005151000},"score":1,"resource":{"primary":{"URL":"http:\/\/ieeexplore.ieee.org\/document\/6515115\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2014,9]]},"references-count":38,"journal-issue":{"issue":"9"},"URL":"https:\/\/doi.org\/10.1109\/tpds.2013.133","relation":{},"ISSN":["1045-9219"],"issn-type":[{"value":"1045-9219","type":"print"}],"subject":[],"published":{"date-parts":[[2014,9]]}}}