Keywords (tags) and Publication List
2019
@conference{hayes2019decoding,
title = {Decoding CUDA Binary},
author = {Ari B Hayes and Fei Hua and Jin Huang and Yanhao Chen and Eddy Z Zhang},
isbn = {9781728114361},
year = {2019},
date = {2019-01-01},
booktitle = {Proceedings of the 2019 IEEE/ACM International Symposium on Code Generation and Optimization (CGO 2019)},
pages = {229–241},
publisher = {IEEE Press},
address = {Washington, DC, USA},
abstract = {NVIDIA’s software does not offer translation of assembly code to binary for their GPUs, since the specifications are closed-source. This work fills that gap. We develop a systematic method of decoding the Instruction Set Architectures (ISAs) of NVIDIA’s GPUs, and of generating assemblers for different generations of GPUs. Our framework enables cross-architecture binary analysis and transformation. Making the ISA accessible in this manner opens up a world of opportunities for developers and researchers, enabling numerous optimizations and explorations that are unachievable at the source-code level. Our infrastructure has already been adopted in, and benefited, important applications including performance tuning, binary instrumentation, resource allocation, and memory protection.},
keywords = {Code generation, Code translation and transformation, CUDA, GPU, Instruction set architecture (ISA)},
pubstate = {published},
tppubtype = {conference}
}
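
The decoding methodology above lends itself to a differential-encoding sketch: mutate one operand of an instruction at a time, XOR the resulting machine encodings, and record which bit positions change. The Python below illustrates the idea with made-up 64-bit encodings; a real pipeline would obtain actual encodings by driving NVIDIA's tools (e.g., cuobjdump/nvdisasm) over generated instruction variants. Nothing here is the paper's own code.

def changed_bits(a: int, b: int) -> list:
    # bit positions at which two instruction encodings differ
    x = a ^ b
    return [i for i in range(64) if (x >> i) & 1]

def infer_operand_field(encodings):
    # union of bits that vary when only one operand is mutated:
    # an estimate of that operand's bit field in the encoding
    bits = set()
    base = encodings[0]
    for enc in encodings[1:]:
        bits.update(changed_bits(base, enc))
    return sorted(bits)

# hypothetical encodings of "IADD R0, R1, Rn" for n = 2, 3, 4
variants = [0x5C10000000270001, 0x5C10000000370001, 0x5C10000000470001]
print(infer_operand_field(variants))  # -> [20, 21, 22]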
2018
@conference{chen2018throttling,
title = {Locality-Aware Software Throttling for Sparse Matrix Operation on GPUs},
author = {Yanhao Chen and Ari B Hayes and Chi Zhang and Timothy Salmon and Eddy Z Zhang},
url = {https://www.usenix.org/conference/atc18/presentation/chen-yanhao},
isbn = {978-1-939133-01-4},
year = {2018},
date = {2018-01-01},
booktitle = {2018 USENIX Annual Technical Conference (USENIX ATC 18)},
pages = {413–426},
publisher = {USENIX Association},
address = {Boston, MA},
keywords = {GPU, Program locality, Sparse matrix, Spmv},
pubstate = {published},
tppubtype = {conference}
}
2017
@conference{li2017edgepartition,
title = {A Simple Yet Effective Balanced Edge Partition Model for Parallel Computing},
author = {Lingda Li and Robel Geda and Ari B Hayes and Yanhao Chen and Pranav Chaudhari and Eddy Z Zhang and Mario Szegedy},
url = {https://doi.org/10.1145/3078505.3078520},
doi = {10.1145/3078505.3078520},
isbn = {9781450350327},
year = {2017},
date = {2017-01-01},
booktitle = {Proceedings of the 2017 ACM SIGMETRICS / International Conference on Measurement and Modeling of Computer Systems (Sigmetrics 2017)},
pages = {6},
publisher = {Association for Computing Machinery},
address = {Urbana-Champaign, Illinois, USA},
series = {SIGMETRICS ’17 Abstracts},
abstract = {Graph edge partition models have recently become an appealing alternative to graph vertex partition models for distributed computing, due both to their flexibility in balancing loads and to their performance in reducing communication cost. In this paper, we propose a simple yet effective graph edge partitioning algorithm. In practice, our algorithm provides good partition quality while maintaining low partition overhead. It also outperforms similar state-of-the-art edge partition approaches, especially for power-law graphs. In theory, previous work showed that an approximation guarantee of $O(d_{\max}\sqrt{\log n \log k})$ applies to graphs with $m = \Omega(k^2)$ edges (where $n$ is the number of vertices and $k$ is the number of partitions). We further rigorously prove that this approximation guarantee holds for all graphs. We also demonstrate the applicability of the proposed edge partition algorithm in real parallel computing systems. We draw our example from GPU program locality enhancement and demonstrate that the graph edge partition model applies not only to distributed computing with many computer nodes, but also to parallel computing in a single computer node with a many-core processor.},
keywords = {Data sharing, Edge-partition, GPU, Graph model, Program locality},
pubstate = {published},
tppubtype = {conference}
}
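
For intuition about what a balanced edge partitioner does, the sketch below assigns each edge greedily to the partition that already holds replicas of its endpoints, under a hard balance cap. The goals are low vertex replication (a proxy for communication) and balanced edge counts. This is a generic greedy heuristic for illustration only, not necessarily the algorithm proposed in the paper.

from collections import defaultdict

def greedy_edge_partition(edges, k):
    # vertex -> partitions already holding a replica of that vertex
    replicas = defaultdict(set)
    load = [0] * k
    cap = (len(edges) + k - 1) // k   # hard balance constraint
    assign = {}
    for (u, v) in edges:
        candidates = [p for p in range(k) if load[p] < cap]
        # prefer partitions that avoid creating new vertex replicas
        best = max(candidates,
                   key=lambda p: ((p in replicas[u]) + (p in replicas[v]), -load[p]))
        assign[(u, v)] = best
        replicas[u].add(best)
        replicas[v].add(best)
        load[best] += 1
    # replication factor: average number of partitions per vertex
    rf = sum(len(s) for s in replicas.values()) / len(replicas)
    return assign, load, rf

edges = [(0, 1), (0, 2), (1, 2), (2, 3), (3, 4), (0, 4)]
assign, load, rf = greedy_edge_partition(edges, k=2)
print(load, rf)  # -> [3, 3] 1.4 (balanced loads, low replication)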
@article{li2017edgepartition-per,
title = {A Simple Yet Effective Balanced Edge Partition Model for Parallel Computing},
author = {Lingda Li and Robel Geda and Ari B Hayes and Yanhao Chen and Pranav Chaudhari and Eddy Z Zhang and Mario Szegedy},
url = {https://doi.org/10.1145/3143314.3078520},
doi = {10.1145/3143314.3078520},
issn = {0163-5999},
year = {2017},
date = {2017-01-01},
journal = {SIGMETRICS Perform. Eval. Rev.},
volume = {45},
number = {1},
pages = {6},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
abstract = {Graph edge partition models have recently become an appealing alternative to graph vertex partition models for distributed computing, due both to their flexibility in balancing loads and to their performance in reducing communication cost. In this paper, we propose a simple yet effective graph edge partitioning algorithm. In practice, our algorithm provides good partition quality while maintaining low partition overhead. It also outperforms similar state-of-the-art edge partition approaches, especially for power-law graphs. In theory, previous work showed that an approximation guarantee of $O(d_{\max}\sqrt{\log n \log k})$ applies to graphs with $m = \Omega(k^2)$ edges (where $n$ is the number of vertices and $k$ is the number of partitions). We further rigorously prove that this approximation guarantee holds for all graphs. We also demonstrate the applicability of the proposed edge partition algorithm in real parallel computing systems. We draw our example from GPU program locality enhancement and demonstrate that the graph edge partition model applies not only to distributed computing with many computer nodes, but also to parallel computing in a single computer node with a many-core processor.},
keywords = {Data sharing, Edge-partition, GPU, Graph model, Program locality},
pubstate = {published},
tppubtype = {article}
}
2014
@conference{egielski2014atomics,
title = {Massive Atomics for Massive Parallelism on GPUs},
author = {Ian J Egielski and Jesse Huang and Eddy Z Zhang},
url = {https://doi.org/10.1145/2602988.2602993},
doi = {10.1145/2602988.2602993},
isbn = {9781450329217},
year = {2014},
date = {2014-01-01},
booktitle = {Proceedings of the 2014 International Symposium on Memory Management (ISMM 2014)},
pages = {93–103},
publisher = {Association for Computing Machinery},
address = {Edinburgh, United Kingdom},
abstract = {One important type of parallelism exploited in many applications is reduction-type parallelism. In these applications, the order of the read-modify-write updates to one shared data object can be arbitrary, as long as each read-modify-write update is performed atomically. The typical way to parallelize these types of applications is to first let every individual thread perform local computation and save the results in thread-private data objects, and then merge the results from all worker threads in the reduction stage. All applications that fit into the map-reduce framework belong to this category. Additionally, machine learning, data mining, numerical analysis, and scientific simulation applications may also benefit from reduction-type parallelism. However, the parallelization scheme that uses thread-private data objects may not be viable in massively parallel GPU applications: because the number of concurrent threads is extremely large (at least tens of thousands), thread-private data object creation may lead to memory space explosion problems. In this paper, we propose a novel approach to shared data object management for reduction-type parallelism on GPUs. Our approach exploits fine-grained parallelism while maintaining good programmability. It is based on intrinsic hardware atomic instructions. Atomic operations may appear to be expensive, since they cause thread serialization when multiple threads atomically update the same memory object at the same time. However, we discovered that, with appropriate atomic-collision reduction techniques, the atomic implementation can outperform the non-atomic implementation, even for benchmarks known to have high-performance non-atomic GPU implementations. At the same time, the use of atomics can greatly reduce coding complexity, as neither thread-private object management nor explicit thread communication (for the shared data objects protected by atomic operations) is necessary.},
keywords = {Atomics, Concurrency, GPU, Parallelism},
pubstate = {published},
tppubtype = {conference}
}
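
One collision-reduction idea in this line of work can be sketched as replication: a heavily contended counter or histogram bin is duplicated into R copies, thread t updates copy t mod R, and the copies are summed at the end. The plain-Python illustration below is a hedged stand-in for GPU code, not the paper's implementation; on a GPU each increment would be an atomicAdd, and R is a tunable assumption.

import numpy as np

def histogram_replicated(data, nbins, R=8):
    # R replicas per bin; the replica is chosen by (virtual) thread id,
    # spreading atomic traffic across distinct memory locations
    bins = np.zeros((R, nbins), dtype=np.int64)
    for tid, x in enumerate(data):   # one "thread" per element
        bins[tid % R, x] += 1        # would be atomicAdd on a GPU
    return bins.sum(axis=0)          # final reduction over replicas

data = np.random.randint(0, 16, size=10_000)
assert np.array_equal(histogram_replicated(data, 16),
                      np.bincount(data, minlength=16))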
@conference{hayes2014unified,
title = {Unified On-Chip Memory Allocation for SIMT Architecture},
author = {Ari B Hayes and Eddy Z Zhang},
url = {https://doi.org/10.1145/2597652.2597685},
doi = {10.1145/2597652.2597685},
isbn = {9781450326421},
year = {2014},
date = {2014-01-01},
booktitle = {Proceedings of the 28th ACM International Conference on Supercomputing (ICS 2014)},
pages = {293–302},
publisher = {Association for Computing Machinery},
address = {Munich, Germany},
abstract = {The popularity of the general-purpose Graphics Processing Unit (GPU) is largely attributed to the tremendous concurrency enabled by its underlying architecture: the single instruction multiple thread (SIMT) architecture. It keeps the context of a significant number of threads in registers to enable fast “context switches” when the processor is stalled due to execution dependences, memory requests, and so on. The SIMT architecture has a large register file evenly partitioned among all concurrent threads. Per-thread register usage determines the number of concurrent threads, which strongly affects whole-program performance. Existing register allocation techniques, extensively studied over the past several decades, are oblivious to the register contention caused by the concurrent execution of many threads. They are prone to making optimization decisions that benefit a single thread but degrade whole-application performance. Is it possible for compilers to make register allocation decisions that maximize whole GPU application performance? We tackle this important question from two different aspects in this paper. We first propose a unified on-chip memory allocation framework that uses scratch-pad memory to help (1) alleviate single-thread register pressure and (2) increase whole-application throughput. Second, we propose a characterization model for the SIMT execution model in order to achieve a desired on-chip memory partition given the register pressure of a program. Overall, we discovered that it is possible to automatically determine, at compile time, an on-chip memory resource allocation that maximizes concurrency while ensuring good single-thread performance. We evaluated our techniques on a representative set of GPU benchmarks with non-trivial register pressure, and achieve up to 1.70× speedup over the baseline of the traditional register allocation scheme that maximizes single-thread performance.},
keywords = {Compiler optimization, Concurrency, GPU, Register allocation, Shared memory allocation},
pubstate = {published},
tppubtype = {conference}
}
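
The occupancy trade-off this abstract describes can be made concrete with a toy model: the number of co-resident thread blocks is bounded by both the register file and the scratch-pad (shared) memory, so spilling even one register per thread into shared memory can cross an allocation boundary and raise concurrency. The hardware limits below are illustrative assumptions (roughly Kepler-class), not figures from the paper.

def resident_blocks(regs_per_thread, smem_per_block, threads_per_block=256,
                    reg_file=65536, smem_total=49152, max_blocks=16):
    # blocks admitted by the register file and by shared memory, respectively
    by_regs = (reg_file // (regs_per_thread * threads_per_block)
               if regs_per_thread else max_blocks)
    by_smem = smem_total // smem_per_block if smem_per_block else max_blocks
    return min(by_regs, by_smem, max_blocks)

# 33 registers/thread, no shared memory: register file caps us at 7 blocks.
print(resident_blocks(33, 0))            # -> 7
# Spill 1 register/thread (4 bytes each) into shared memory: 8 blocks.
print(resident_blocks(32, 1 * 4 * 256))  # -> 8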
2013
@article{shen2013gadapt,
title = {An Infrastructure for Tackling Input-Sensitivity of GPU Program Optimizations},
author = {Xipeng Shen and Yixun Liu and Eddy Z Zhang and Poornima Bhamidipati},
url = {https://doi.org/10.1007/s10766-012-0236-3},
doi = {10.1007/s10766-012-0236-3},
issn = {0885-7458},
year = {2013},
date = {2013-01-01},
journal = {Int. J. Parallel Program.},
volume = {41},
number = {6},
pages = {855–869},
publisher = {Kluwer Academic Publishers},
address = {USA},
abstract = {Graphics processing units (GPUs) have been increasingly adopted to enhance computing throughput. However, developing a high-quality GPU application is challenging, due to the large optimization space and the complex, unpredictable effects of optimizations on GPU program performance. Many recent efforts have employed empirical search-based auto-tuners to tackle the problem, but few have concentrated on the influence of program inputs on the optimizations. In this paper, based on a set of CUDA and OpenCL kernels, we report evidence of the importance for auto-tuners to adapt to program input changes, and present a framework, G-ADAPT+, that addresses this influence by constructing cross-input predictive models for automatically predicting the (near-)optimal configurations for an arbitrary input to a GPU program. G-ADAPT+ is based on source-to-source compilers, specifically Cetus and ROSE. It supports the optimization of both CUDA and OpenCL programs.},
keywords = {Cross-input adaptation, CUDA, Empirical search, G-ADAPT, GPU, Program optimizations},
pubstate = {published},
tppubtype = {article}
}
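
The cross-input adaptation idea can be sketched as a two-phase scheme: offline, empirical search records the best configuration for each training input; online, a lightweight model predicts a configuration for an unseen input. The nearest-neighbor-on-log-size model and the block-size configurations below are illustrative assumptions, not G-ADAPT+'s actual predictive models.

import math

# (input_size, best_block_size) pairs found by offline empirical search
training = [(1_000, 64), (50_000, 128), (500_000, 256), (5_000_000, 256)]

def predict_block_size(input_size: int) -> int:
    # nearest training input in log scale, a common choice for size features
    return min(training,
               key=lambda t: abs(math.log(t[0]) - math.log(input_size)))[1]

print(predict_block_size(120_000))    # -> 128
print(predict_block_size(2_000_000))  # -> 256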