{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,15]],"date-time":"2026-04-15T13:55:55Z","timestamp":1776261355673,"version":"3.50.1"},"reference-count":82,"publisher":"Elsevier","isbn-type":[{"value":"9780128231234","type":"print"}],"license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021]]},"DOI":"10.1016\/bs.adcom.2020.11.003","type":"book-chapter","created":{"date-parts":[[2021,1,7]],"date-time":"2021-01-07T17:39:45Z","timestamp":1610041185000},"page":"167-215","source":"Crossref","is-referenced-by-count":49,"title":["Deep learning with GPUs"],"prefix":"10.1016","member":"78","reference":[{"key":"10.1016\/bs.adcom.2020.11.003_bb0010","series-title":"Advances in Neural Information Processing Systems","first-page":"8026","article-title":"PyTorch: an imperative style, high-performance deep learning library","author":"Paszke","year":"2019"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0015","series-title":"Proceedings of the 22nd ACM International Conference on Multimedia","first-page":"675","article-title":"Caffe: convolutional architecture for fast feature embedding","author":"Jia","year":"2014"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0020","series-title":"12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16)","first-page":"265","article-title":"Tensorflow: a system for large-scale machine learning","author":"Abadi","year":"2016"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0025","author":"NVIDIA"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0030","author":"NVIDIA"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0035","author":"NVIDIA"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0040","author":"NVIDIA"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0045","author":"NVIDIA"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0050","author":"NVIDIA"},{"issue":"4","key":"10.1016\/bs.adcom.2020.11.003_bb0055","doi-asserted-by":"crossref","first-page":"2923","DOI":"10.1109\/COMST.2018.2844341","article-title":"Deep learning for IoT big data and streaming analytics: a survey","volume":"20","author":"Mohammadi","year":"2018","journal-title":"IEEE Commun. Surv. Tutorials"},{"issue":"1","key":"10.1016\/bs.adcom.2020.11.003_bb0060","doi-asserted-by":"crossref","first-page":"96","DOI":"10.1109\/MNET.2018.1700202","article-title":"Learning IoT in edge: deep learning for the internet of things with edge computing","volume":"32","author":"Li","year":"2018","journal-title":"IEEE Netw."},{"key":"10.1016\/bs.adcom.2020.11.003_bb0065","series-title":"Pacific Symposium on Biocomputing","first-page":"219","article-title":"A deep learning approach for cancer detection and relevant gene identification","author":"Danaee","year":"2017"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0070","doi-asserted-by":"crossref","first-page":"119","DOI":"10.1016\/j.knosys.2017.10.017","article-title":"Deep learning for freezing of gait detection in Parkinson's disease patients in their homes using a waist-worn inertial measurement unit","volume":"139","author":"Camps","year":"2018","journal-title":"Knowl.-Based Syst."},{"issue":"1","key":"10.1016\/bs.adcom.2020.11.003_bb0075","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1038\/s41598-019-44004-w","article-title":"Intelligent ICU for autonomous patient monitoring using pervasive sensing and deep learning","volume":"9","author":"Davoudi","year":"2019","journal-title":"Sci. Rep."},{"issue":"4\u20135","key":"10.1016\/bs.adcom.2020.11.003_bb0080","doi-asserted-by":"crossref","first-page":"421","DOI":"10.1177\/0278364917710318","article-title":"Learning hand-eye coordination for robotic grasping with deep learning and large-scale data collection","volume":"37","author":"Levine","year":"2018","journal-title":"Int. J. Robot. Res."},{"key":"10.1016\/bs.adcom.2020.11.003_bb0085","series-title":"NVIDIA Launches the World's First Graphics Processing Unit: GeForce 256","author":"NVIDIA","year":"1999"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0090","series-title":"NVIDIA Tesla V100 GPU Architecture: The World's Most Advanced Data Center GPU","author":"NVIDIA","year":"2017"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0095","author":"NVIDIA"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0100","series-title":"NVIDIA CUDA Toolkit Documentation v11.0.171","author":"NVIDIA","year":"2020"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0105","series-title":"NVIDIA A100 Tensor Core GPU Architecture: Unprecedented Acceleration at Every Scale","author":"NVIDIA","year":"2020"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0110","doi-asserted-by":"crossref","first-page":"39","DOI":"10.1109\/MM.2008.31","article-title":"NVIDIA tesla: a unified graphics and computing architecture","volume":"28","author":"Lindholm","year":"2008","journal-title":"IEEE Micro"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0115","series-title":"Proceedings of the 40th Annual International Symposium on Computer Architecture","first-page":"332","article-title":"Orchestrated scheduling and prefetching for GPGPUs","author":"Jog","year":"2013"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0120","series-title":"Proceedings of the 44th Annual IEEE\/ACM International Symposium on Microarchitecture","first-page":"308","article-title":"Improving GPU performance via large warps and two-level warp scheduling","author":"Narasiman","year":"2011"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0125","series-title":"Proceedings of the 42th Annual International Symposium on Computer Architecture","article-title":"Warped-compression: enabling power efficient GPUs through register compression","author":"Lee","year":"2015"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0130","series-title":"Proceedings of the 51th Annual IEEE\/ACM International Symposium on Microarchitecture","article-title":"FineReg: fine-grained register file management for augmenting GPU throughput","author":"Oh","year":"2018"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0135","series-title":"Proceedings of the 43th Annual International Symposium on Computer Architecture","article-title":"Virtual thread: maximizing thread-level parallelism beyond GPU scheduling limit","author":"Yoon","year":"2016"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0140","series-title":"46th Annual International Symposium on Computer Architecture","article-title":"Linebacker: preserving victim cache lines in idle register files of GPUs","author":"Oh","year":"2019"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0145","series-title":"NVIDIA's Next Generation CUDA Compute Architecture: Fermi","author":"NVIDIA","year":"2009"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0150","series-title":"NVIDIA's Next Generation CUDA Compute Architecture: Kepler GK110","author":"NVIDIA","year":"2012"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0155","series-title":"NVIDIA GeForce GTX 980: Featuring Maxwell, The Most Advanced GPU Ever Made","author":"NVIDIA","year":"2014"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0160","series-title":"NVIDIA Tesla P100: The Most Advanced Datacenter Accelerator Ever Built","author":"NVIDIA","year":"2016"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0165","series-title":"NVIDIA Turing GPU Architecture: Graphics Reinvented","author":"NVIDIA","year":"2018"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0170","author":"NVIDIA"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0175","series-title":"NVIDIA Deep Learning SDK Documentation","author":"NVIDIA","year":"2020"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0180","series-title":"NVIDIA Deep Learning TensorRT Documentation","author":"NVIDIA","year":"2020"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0185","author":"NVIDIA"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0190","author":"NVIDIA"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0195","author":"NVIDIA"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0200","author":"NVIDIA"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0205","series-title":"NVIDIA Deep Learning DALI Documentation","author":"NVIDIA","year":"2020"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0210","series-title":"NVIDIA Deep Learning NCCL Documentation","author":"NVIDIA","year":"2020"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0215","author":"NVIDIA"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0220","series-title":"cuDNN: Efficient Primitives for Deep Learning","author":"Chetlur","year":"2014"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0225","series-title":"2017 ACM\/IEEE 44th Annual International Symposium on Computer Architecture","article-title":"Scalpel: customizing DNN pruning to the underlying hardware parallelism","author":"Yu","year":"2017"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0230","series-title":"2017 50th Annual IEEE\/ACM International Symposium on Microarchitecture","article-title":"DeftNN: addressing bottlenecks for DNN execution on GPUs via synapse vector elimination and near-compute data fission","author":"Hill","year":"2017"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0235","doi-asserted-by":"crossref","unstructured":"M. Zhu, T. Zhang, Z. Gu, Y. Xie, Sparse tensor core: algorithm and hardware co-design for vector-wise sparse neural networks on modern GPUs, in: Proceedings of the 52nd Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO), ACM, pp. 359\u2013371.","DOI":"10.1145\/3352460.3358269"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0240","series-title":"Proceedings of the Fourteenth EuroSys Conference 2019","article-title":"GRNN: low-latency and scalable RNN inference on GPUs","author":"Holmes","year":"2019"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0245","series-title":"2015 IEEE 17th International Conference on High Performance Computing and Communications, 2015 IEEE 7th International Symposium on Cyberspace Safety and Security, and 2015 IEEE 12th International Conference on Embedded Software and Systems","article-title":"Fast convolution operations on many-core architectures","author":"Li","year":"2015"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0250","series-title":"Proceedings of the Eleventh European Conference on Computer Systems","article-title":"GeePS: scalable deep learning on distributed GPUs with a GPU-specialized parameter server","author":"Cui","year":"2016"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0255","series-title":"Proceedings of the Eleventh IEEE\/ACM\/IFIP International Conference on Hardware\/Software Codesign and System Synthesis","article-title":"Zero and data reuse-aware fast convolution for deep neural networks on GPU","author":"Park","year":"2016"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0260","series-title":"Proceedings of the 2020 53rd Annual IEEE\/ACM International Symposium on Microarchitecture","article-title":"Duplo: lifting redundant memory accesses of deep neural networks for GPU tensor cores","author":"Kim","year":"2020"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0265","series-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","article-title":"Optimizing memory efficiency for deep convolutional neural networks on GPUs","author":"Li","year":"2016"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0270","series-title":"Proceedings of 2018 Design, Automation & Test in Europe Conference & Exhibition","article-title":"moDNN: memory optimal DNN training on GPUs","author":"Chen","year":"2018"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0275","series-title":"Proceedings of the 54th Annual Design Automation Conference 2017","article-title":"Optimizing memory efficiency for convolution kernels on kepler GPUs","author":"Chen","year":"2017"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0280","series-title":"Proceedings of the 2018 ACM\/SPEC International Conference on Performance Engineering","article-title":"Characterizing the microarchitectural implications of a convolutional neural network (CNN) execution on GPUs","author":"Dong","year":"2018"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0285","series-title":"Proceedings of the High Performance Computing Symposium","article-title":"Fast convolution kernels on pascal GPU with high memory efficiency","author":"Chang","year":"2018"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0290","series-title":"Proceedings of the 49th Annual IEEE\/ACM International Symposium on Microarchitecture","article-title":"vDNN: virtualized deep neural networks for scalable, memory-efficient neural network design","author":"Rhu","year":"2016"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0295","series-title":"Proceedings of the ACM\/IEEE 47th Annual International Symposium on Computer Architecture","article-title":"Echo: compiler-based GPU memory footprint reduction for LSTM RNN training","author":"Zheng","year":"2020"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0300","series-title":"Proceedings of the Twenty-Fourth International Conference on Architectural Support for Programming Languages and Operating Systems","article-title":"Split-CNN: splitting window-based operations in convolutional neural networks for memory system optimization","author":"Jin","year":"2019"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0305","series-title":"Proceedings of the Twenty-Fifth International Conference on Architectural Support for Programming Languages and Operating Systems","article-title":"Swap, Advisor: pushing deep learning beyond the GPU memory limit via smart swapping","author":"Huang","year":"2020"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0310","series-title":"ACM SIGPLAN Notices","article-title":"Superneurons: dynamic GPU memory management for training deep neural networks","author":"Wang","year":"2018"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0315","series-title":"Proceedings of 2018 IEEE 25th International Conference on High Performance Computing","article-title":"OC-DNN: exploiting advanced unified memory capabilities in CUDA 9 and Volta GPUs for out-of-core DNN training","author":"Awan","year":"2018"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0320","series-title":"ACM SIGPLAN Notices","article-title":"S-Caffe: co-designing MPI runtimes and caffe for scalable deep learning on modern GPU clusters","author":"Awan","year":"2017"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0325","series-title":"Proceedings of 2016 IEEE Conference on Computer Vision and Pattern Recognition","article-title":"FireCaffe: near-linear acceleration of deep neural network training on compute clusters","author":"Iandola","year":"2016"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0330","series-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","article-title":"Scaling deep learning on GPU and knights landing clusters","author":"Yang","year":"2017"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0335","series-title":"Proceedings of the 2017 ACM\/SIGDA International Symposium on Field-Programmable Gate Arrays","article-title":"ESE: efficient speech recognition engine with sparse ISTM on FPGA","author":"Han","year":"2017"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0340","series-title":"Proceedings of the 43th Annual International Symposium on Computer Architecture","article-title":"EIE: efficient inference engine on compressed deep neural network","author":"Han","year":"2016"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0345","series-title":"4th International Conference on Learning Representations","article-title":"Deep compression: compressing deep neural networks with pruning, trained quantization and Huffman coding","author":"Han","year":"2016"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0350","series-title":"Proceedings of the 21st Annual Symposium on Parallelism in Algorithms and Architectures","article-title":"Parallel sparse matrix-vector and matrix-transpose-vector multiplication using compressed sparse blocks","author":"Bulu\u00e7","year":"2009"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0355","author":"Krizhevsky"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0360","series-title":"Advances in Neural Information Processing Systems","first-page":"25:1106","article-title":"ImageNet classification with deep convolutional neural networks","author":"Krizhevsky","year":"2012"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0365","first-page":"818","article-title":"Visualizing and understanding convolutional networks","author":"Zeiler","year":"2014"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0370","series-title":"NVIDIA CUDA 6.5 SDK Samples","author":"NVIDIA","year":"2014"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0375","series-title":"Elsevier Inc.","article-title":"Programming Massively Parallel Processors","author":"Kirk","year":"2017"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0380","series-title":"Deep Learning with PyTorch","author":"Stevens","year":"2020"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0385","doi-asserted-by":"crossref","DOI":"10.1016\/j.sysarc.2019.101635","article-title":"A survey of techniques for optimizing deep learning on GPUs","volume":"99","author":"Mittal","year":"2019","journal-title":"J. Syst. Archit."},{"key":"10.1016\/bs.adcom.2020.11.003_bb0390","series-title":"IEEE International Parallel and Distributed Processing Symposium, IEEE","first-page":"200","article-title":"Dynamic memory management for GPU-based training of deep neural networks","author":"Shriram","year":"2019"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0395","series-title":"General-Purpose Graphics Processor Architectures","author":"Aamodt","year":"2018"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0400","series-title":"Sparse GPU kernels for deep learning, arXiv","first-page":"10901","author":"Gale","year":"2006"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0405","doi-asserted-by":"crossref","first-page":"2295","DOI":"10.1109\/JPROC.2017.2761740","article-title":"Efficient processing of deep neural networks: a tutorial and survey","volume":"105","author":"Sze","year":"2017","journal-title":"IEEE"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0410","series-title":"Proceedings of the 24th Symposium on Principles and Practice of Parallel Programming","article-title":"A coordinated tiling and batching framework for efficient GEMM on GPUs","author":"Li","year":"2019"},{"key":"10.1016\/bs.adcom.2020.11.003_bb0415","series-title":"Proceedings of the 25th International Conference on Architectural Support for Programming Languages and Operating Systems","article-title":"Capuchin: tensor-based GPU memory management for deep learning","author":"Peng","year":"2020"}],"container-title":["Advances in Computers","Hardware Accelerator Systems for Artificial Intelligence and Machine Learning"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0065245820300905?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0065245820300905?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,9,24]],"date-time":"2025-09-24T19:42:18Z","timestamp":1758742938000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0065245820300905"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"ISBN":["9780128231234"],"references-count":82,"URL":"https:\/\/doi.org\/10.1016\/bs.adcom.2020.11.003","relation":{},"ISSN":["0065-2458"],"issn-type":[{"value":"0065-2458","type":"print"}],"subject":[],"published":{"date-parts":[[2021]]}}}