% Luley and Qiu, conference paper on CUDA Hyper-Q utilization (2016).
% Citation key is the exporter-generated identifier; kept so existing \cite commands still resolve.
% month/day record the date given by the export (18 Jul 2016); the conference dates are in the note field.
% NOTE(review): the DOI and the note field point to the Workshops volume (IPDPSW 2016),
% while booktitle names IPDPS 2016 -- booktitle is kept as exported; confirm against the publisher record.
@inproceedings{3c2f084424f64160a0c41aeabddc997a,
  author    = {Luley, Ryan S. and Qiu, Qinru},
  title     = {Effective utilization of {CUDA} {Hyper-Q} for improved power and performance efficiency},
  booktitle = {Proceedings - 2016 IEEE 30th International Parallel and Distributed Processing Symposium, IPDPS 2016},
  pages     = {1160--1169},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  year      = 2016,
  month     = jul,
  day       = {18},
  doi       = {10.1109/IPDPSW.2016.154},
  language  = {English (US)},
  keywords  = {Concurrency, GPU performance, GPU utilization, Hyper-Q, Power efficiency, Resource sharing},
  abstract  = {High utilization of hardware resources is the key for designing performance and power optimized GPU applications. The efficiency of applications and kernels, which do not fully utilize the GPU resources, can be improved through concurrent execution with independent kernels and/or applications. Hyper-Q enables multiple CPU threads or processes to launch work on a single GPU simultaneously for increased GPU utilization. However, without careful design, false serialization may occur due to the contention for shared hardware resources such as direct memory access (DMA) engines. In this paper, we reveal the impact of such contention on performance and assess a method for overcoming the limitation with minimal algorithmic overhead. We demonstrate a method to achieve up to 31.8\% improvement in performance and 10.4\% reduction in energy on average for a finite set of application tasks when maximizing GPU execution concurrency.},
  note      = {Publisher Copyright: {\textcopyright} 2016 IEEE.; 30th IEEE International Parallel and Distributed Processing Symposium Workshops, IPDPSW 2016 ; Conference date: 23-05-2016 Through 27-05-2016},
}