@comment{Keywords (tags) and Publication List}
@comment{Wu, Bo; Zhao, Zhijia; Zhang, Eddy Zheng; Jiang, Yunlian; Shen, Xipeng Proceedings of the 18th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming (PPoPP 2013), Association for Computing Machinery, Shenzhen, China, 2013, ISBN: 9781450319225. Abstract | Links | BibTeX | Tags: Computational complexity, Data transformation, GPGPU, Memory coalescing, Runtime optimizations, Thread-data remapping Zhang, Eddy Z; Jiang, Yunlian; Guo, Ziyu; Shen, Xipeng Proceedings of the 24th ACM International Conference on Supercomputing, ICS '10 Association for Computing Machinery, Tsukuba, Ibaraki, Japan, 2010, ISBN: 9781450300186. Abstract | Links | BibTeX | Tags: Cpu-gpu pipelining, Data transformation, GPGPU, Thread divergence, Thread-data remapping}
@comment{2013}
@inproceedings{wu2013complexity,
  title     = {Complexity Analysis and Algorithm Design for Reorganizing Data to Minimize Non-Coalesced Memory Accesses on {GPU}},
  author    = {Wu, Bo and Zhao, Zhijia and Zhang, Eddy Zheng and Jiang, Yunlian and Shen, Xipeng},
  url       = {https://doi.org/10.1145/2442516.2442523},
  doi       = {10.1145/2442516.2442523},
  isbn      = {9781450319225},
  year      = {2013},
  booktitle = {Proceedings of the 18th {ACM} {SIGPLAN} Symposium on Principles and Practice of Parallel Programming ({PPoPP} 2013)},
  pages     = {57--68},
  publisher = {Association for Computing Machinery},
  address   = {Shenzhen, China},
  abstract  = {The performance of Graphic Processing Units (GPU) is sensitive to irregular memory references. Some recent work shows the promise of data reorganization for eliminating non-coalesced memory accesses that are caused by irregular references. However, all previous studies have employed simple, heuristic methods to determine the new data layouts to create. As a result, they either do not provide any performance guarantee or are effective to only some limited scenarios. This paper contributes a fundamental study to the problem. It systematically analyzes the inherent complexity of the problem in various settings, and for the first time, proves that the problem is NP-complete. It then points out the limitations of existing techniques and reveals that in practice, the essence for designing an appropriate data reorganization algorithm can be reduced to a tradeoff among space, time, and complexity. Based on that insight, it develops two new data reorganization algorithms to overcome the limitations of previous methods. Experiments show that an assembly composed of the new algorithms and a previous algorithm can circumvent the inherent complexity in finding optimal data layouts, making it feasible to minimize non-coalesced memory accesses for a variety of irregular applications and settings that are beyond the reach of existing techniques.},
  keywords  = {Computational complexity, Data transformation, GPGPU, Memory coalescing, Runtime optimizations, Thread-data remapping},
  pubstate  = {published},
  tppubtype = {conference}
}
@comment{2010}
@inproceedings{zhang2010streamlining,
  title     = {Streamlining {GPU} Applications on the Fly: Thread Divergence Elimination through Runtime Thread-Data Remapping},
  author    = {Zhang, Eddy Z. and Jiang, Yunlian and Guo, Ziyu and Shen, Xipeng},
  url       = {https://doi.org/10.1145/1810085.1810104},
  doi       = {10.1145/1810085.1810104},
  isbn      = {9781450300186},
  year      = {2010},
  booktitle = {Proceedings of the 24th {ACM} International Conference on Supercomputing},
  pages     = {115--126},
  publisher = {Association for Computing Machinery},
  address   = {Tsukuba, Ibaraki, Japan},
  series    = {ICS '10},
  abstract  = {Because of their tremendous computing power and remarkable cost efficiency, GPUs (graphic processing unit) have quickly emerged as a kind of influential platform for high performance computing. However, as GPUs are designed for massive data-parallel computing, their performance is subject to the presence of condition statements in a GPU application. On a conditional branch where threads diverge in which path to take, the threads taking different paths have to run serially. Such divergences often cause serious performance degradations, impairing the adoption of GPU for many applications that contain non-trivial branches or certain types of loops. This paper presents a systematic investigation in the employment of runtime thread-data remapping for solving that problem. It introduces an abstract form of GPU applications, based on which, it describes the use of reference redirection and data layout transformation for remapping data and threads to minimize thread divergences. It discusses the major challenges for practical deployment of the remapping techniques, most notably, the conflict between the large remapping overhead and the need for the remapping to happen on the fly because of the dependence of thread divergences on runtime values. It offers a solution to the challenge by proposing a CPU-GPU pipelining scheme and a label-assign-move (LAM) algorithm to virtually hide all the remapping overhead. At the end, it reports significant performance improvement produced by the remapping for a set of GPU applications, demonstrating the potential of the techniques for streamlining GPU applications on the fly.},
  keywords  = {Cpu-gpu pipelining, Data transformation, GPGPU, Thread divergence, Thread-data remapping},
  pubstate  = {published},
  tppubtype = {conference}
}