Publications – Blueprint Research Lab

Keywords (tags) and Publication List

2013

Wu, Bo; Zhao, Zhijia; Zhang, Eddy Zheng; Jiang, Yunlian; Shen, Xipeng

Complexity Analysis and Algorithm Design for Reorganizing Data to Minimize Non-Coalesced Memory Accesses on GPU Conference

Proceedings of the 18th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming (PPoPP 2013), Association for Computing Machinery, Shenzhen, China, 2013, ISBN: 9781450319225.

Abstract | Links | BibTeX | Tags: Computational complexity, Data transformation, GPGPU, Memory coalescing, Runtime optimizations, Thread-data remapping

% Wu et al., PPoPP 2013. Page range uses the BibTeX double hyphen, and the
% acronym GPU in the title is brace-protected so styles cannot lowercase it.
@inproceedings{10.1145/2442516.2442523,
  title      = {Complexity Analysis and Algorithm Design for Reorganizing Data to Minimize Non-Coalesced Memory Accesses on {GPU}},
  author     = {Bo Wu and Zhijia Zhao and Eddy Zheng Zhang and Yunlian Jiang and Xipeng Shen},
  url        = {https://doi.org/10.1145/2442516.2442523},
  doi        = {10.1145/2442516.2442523},
  isbn       = {9781450319225},
  year       = {2013},
  date       = {2013-01-01},
  booktitle  = {Proceedings of the 18th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming (PPoPP 2013)},
  pages      = {57--68},
  publisher  = {Association for Computing Machinery},
  address    = {Shenzhen, China},
  abstract   = {The performance of Graphic Processing Units (GPU) is sensitive to irregular memory references. Some recent work shows the promise of data reorganization for eliminating non-coalesced memory accesses that are caused by irregular references. However, all previous studies have employed simple, heuristic methods to determine the new data layouts to create. As a result, they either do not provide any performance guarantee or are effective to only some limited scenarios. This paper contributes a fundamental study to the problem. It systematically analyzes the inherent complexity of the problem in various settings, and for the first time, proves that the problem is NP-complete. It then points out the limitations of existing techniques and reveals that in practice, the essence for designing an appropriate data reorganization algorithm can be reduced to a tradeoff among space, time, and complexity. Based on that insight, it develops two new data reorganization algorithms to overcome the limitations of previous methods. Experiments show that an assembly composed of the new algorithms and a previous algorithm can circumvent the inherent complexity in finding optimal data layouts, making it feasible to minimize non-coalesced memory accesses for a variety of irregular applications and settings that are beyond the reach of existing techniques.},
  keywords   = {Computational complexity, Data transformation, GPGPU, Memory coalescing, Runtime optimizations, Thread-data remapping},
  pubstate   = {published},
  tppubtype  = {conference}
}

2011

Zhang, Eddy Z; Jiang, Yunlian; Guo, Ziyu; Tian, Kai; Shen, Xipeng

On-the-Fly Elimination of Dynamic Irregularities for GPU Computing Conference

Proceedings of the Sixteenth International Conference on Architectural Support for Programming Languages and Operating Systems, ASPLOS XVI Association for Computing Machinery, Newport Beach, California, USA, 2011, ISBN: 9781450302661.

Abstract | Links | BibTeX | Tags: Cpu-gpu pipelining, Data transformation, GPGPU, Memory coalescing, Thread data remapping, Thread divergence

% Zhang et al., ASPLOS XVI (2011). Page range uses the BibTeX double hyphen, and
% the acronym GPU in the title is brace-protected so styles cannot lowercase it.
@inproceedings{10.1145/1950365.1950408,
  title      = {On-the-Fly Elimination of Dynamic Irregularities for {GPU} Computing},
  author     = {Eddy Z Zhang and Yunlian Jiang and Ziyu Guo and Kai Tian and Xipeng Shen},
  url        = {https://doi.org/10.1145/1950365.1950408},
  doi        = {10.1145/1950365.1950408},
  isbn       = {9781450302661},
  year       = {2011},
  date       = {2011-01-01},
  booktitle  = {Proceedings of the Sixteenth International Conference on Architectural Support for Programming Languages and Operating Systems},
  pages      = {369--380},
  publisher  = {Association for Computing Machinery},
  address    = {Newport Beach, California, USA},
  series     = {ASPLOS XVI},
  abstract   = {The power-efficient massively parallel Graphics Processing Units (GPUs) have become increasingly influential for general-purpose computing over the past few years. However, their efficiency is sensitive to dynamic irregular memory references and control flows in an application. Experiments have shown great performance gains when these irregularities are removed. But it remains an open question how to achieve those gains through software approaches on modern GPUs. This paper presents a systematic exploration to tackle dynamic irregularities in both control flows and memory references. It reveals some properties of dynamic irregularities in both control flows and memory references, their interactions, and their relations with program data and threads. It describes several heuristics-based algorithms and runtime adaptation techniques for effectively removing dynamic irregularities through data reordering and job swapping. It presents a framework, G-Streamline, as a unified software solution to dynamic irregularities in GPU computing. G-Streamline has several distinctive properties. It is a pure software solution and works on the fly, requiring no hardware extensions or offline profiling. It treats both types of irregularities at the same time in a holistic fashion, maximizing the whole-program performance by resolving conflicts among optimizations. Its optimization overhead is largely transparent to GPU kernel executions, jeopardizing no basic efficiency of the GPU application. Finally, it is robust to the presence of various complexities in GPU applications. Experiments show that G-Streamline is effective in reducing dynamic irregularities in GPU computing, producing speedups between 1.07 and 2.5 for a variety of applications.},
  keywords   = {Cpu-gpu pipelining, Data transformation, GPGPU, Memory coalescing, Thread data remapping, Thread divergence},
  pubstate   = {published},
  tppubtype  = {conference}
}