{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,21]],"date-time":"2026-02-21T19:46:43Z","timestamp":1771703203744,"version":"3.50.1"},"reference-count":38,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2012,6]]},"DOI":"10.1109\/icdcs.2012.56","type":"proceedings-article","created":{"date-parts":[[2012,8,8]],"date-time":"2012-08-08T17:11:05Z","timestamp":1344445865000},"page":"615-626","source":"Crossref","is-referenced-by-count":89,"title":["Combining Partial Redundancy and Checkpointing for HPC"],"prefix":"10.1109","author":[{"given":"James","family":"Elliott","sequence":"first","affiliation":[]},{"given":"Kishor","family":"Kharbas","sequence":"additional","affiliation":[]},{"given":"David","family":"Fiala","sequence":"additional","affiliation":[]},{"given":"Frank","family":"Mueller","sequence":"additional","affiliation":[]},{"given":"Kurt","family":"Ferreira","sequence":"additional","affiliation":[]},{"given":"Christian","family":"Engelmann","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"19","doi-asserted-by":"publisher","DOI":"10.1109\/TC.1984.1676475"},{"key":"35","doi-asserted-by":"crossref","first-page":"853","DOI":"10.1016\/S0743-7315(03)00104-7","article-title":"Communication characteristics of large-scale scientific applications for contemporary cluster architectures","volume":"63","author":"vetter","year":"2003","journal-title":"Journal of Parallel Distributed Computing"},{"key":"17","article-title":"A power-aware run-time system for high-performance computing","author":"hsu","year":"2005","journal-title":"Supercomputing"},{"key":"36","doi-asserted-by":"publisher","DOI":"10.1109\/HPDC.1993.263838"},{"key":"18","doi-asserted-by":"publisher","DOI":"10.1002\/(SICI)1097-024X(199709)27:9<1103::AID-SPE130>3.0.CO;2-2"},{"key":"33","doi-asserted-by":"publisher","DOI":"10.1109\/HPDC.1995.518709"},{"key":"15","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2005.76"},{"key":"34","doi-asserted-by":"publisher","DOI":"10.1109\/12.609281"},{"key":"16","doi-asserted-by":"publisher","DOI":"10.1016\/0167-8191(96)00024-5"},{"key":"13","article-title":"Evaluating the viability of process replication reliability for exascale systems","author":"ferreira","year":"2011","journal-title":"Proceedings of the ACM\/IEEE International Conference on High Performance Computing Networking Storage and Analysis SC'11"},{"key":"14","article-title":"Increasing fault resiliency in a message-passing environment","author":"ferreira","year":"2009","journal-title":"TR SAND2009-6753"},{"key":"37","doi-asserted-by":"publisher","DOI":"10.1002\/9780470117880"},{"key":"11","doi-asserted-by":"publisher","DOI":"10.2316\/P.2011.719-031"},{"key":"38","doi-asserted-by":"publisher","DOI":"10.1145\/361147.361115"},{"key":"12","doi-asserted-by":"publisher","DOI":"10.1177\/1094342005056137"},{"key":"21","first-page":"124","article-title":"Volpexmpi: An mpi library for execution of. parallel applications on volatile nodes","author":"leblanc","year":"2009","journal-title":"Proc 7th Eur PVM\/MPI Users Group Meeting"},{"key":"20","author":"hursey","year":"2010","journal-title":"Coordinated Checkpoint\/Restart Process Fault Tolerance for MPI Applications on HPC Systems"},{"key":"22","doi-asserted-by":"publisher","DOI":"10.1007\/11846802_41"},{"key":"23","article-title":"Compiler-assisted full checkpointing","author":"jim li","year":"1994","journal-title":"Software Practice and Experience"},{"key":"24","first-page":"154","article-title":"Supporting checkpointing and process migration outside the unix kernel","author":"litzkow","year":"1999","journal-title":"Mobility"},{"key":"25","article-title":"A reliability-aware approach for an optimal checkpoint\/restart model in hpc environments","author":"yudan liu","year":"2007","journal-title":"Cluster Computing"},{"key":"26","article-title":"Software failures and the road to a petaflop machine","author":"philp","year":"2005","journal-title":"HPCRI 1st Workshop on High Performance Computing Reliability Issues"},{"key":"27","article-title":"Failure trends in a large disk drive population","author":"pinheiro","year":"2007","journal-title":"USENIX Conference on File and Storage Technologies"},{"key":"28","doi-asserted-by":"publisher","DOI":"10.1109\/FTCS.1998.689454"},{"key":"29","doi-asserted-by":"publisher","DOI":"10.1002\/(SICI)1097-024X(199902)29:2<125::AID-SPE224>3.0.CO;2-7"},{"key":"3","article-title":"Checkpointing strategies for parallel jobs","author":"bougeret","year":"2011","journal-title":"Supercomputing"},{"key":"2","doi-asserted-by":"publisher","DOI":"10.1145\/1006209.1006248"},{"key":"10","doi-asserted-by":"publisher","DOI":"10.1145\/42282.42283"},{"key":"1","article-title":"Design and analysis of reliable and fault-tolerant computer systems","year":"2007","journal-title":"Imperial College Press"},{"key":"30","doi-asserted-by":"publisher","DOI":"10.1109\/69.842260"},{"key":"7","article-title":"ADTSC nuclear weapons highlights: Facilitating highthroughput ASC calculations","author":"daly","year":"2007","journal-title":"Technical Report LALP-07-041 Los Alamos National Laboratory"},{"key":"6","doi-asserted-by":"publisher","DOI":"10.1016\/j.future.2004.11.016"},{"key":"32","doi-asserted-by":"publisher","DOI":"10.1145\/1555349.1555372"},{"key":"5","doi-asserted-by":"publisher","DOI":"10.1145\/1134760.1134771"},{"key":"31","doi-asserted-by":"publisher","DOI":"10.1088\/1742-6596\/78\/1\/012022"},{"key":"4","doi-asserted-by":"publisher","DOI":"10.1145\/214451.214456"},{"key":"9","author":"duell","year":"2004","journal-title":"The Design and Implementation of Berkeley Labs' Linux Checkpoint\/restart"},{"key":"8","first-page":"19","article-title":"Application MTTFE vs. platform MTTF: A fresh perspective on system reliability and application throughput for computations at scale","author":"daly","year":"2008","journal-title":"Proceedings of the Workshop on Resiliency in High Performance Computing (Resilience) 2008"}],"event":{"name":"2012 IEEE 32nd International Conference on Distributed Computing Systems (ICDCS)","location":"Macau, China","start":{"date-parts":[[2012,6,18]]},"end":{"date-parts":[[2012,6,21]]}},"container-title":["2012 IEEE 32nd International Conference on Distributed Computing Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx5\/6257102\/6257972\/06258034.pdf?arnumber=6258034","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2017,6,20]],"date-time":"2017-06-20T22:37:17Z","timestamp":1497998237000},"score":1,"resource":{"primary":{"URL":"http:\/\/ieeexplore.ieee.org\/document\/6258034\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2012,6]]},"references-count":38,"URL":"https:\/\/doi.org\/10.1109\/icdcs.2012.56","relation":{},"subject":[],"published":{"date-parts":[[2012,6]]}}}