@comment{Keywords (tags) and Publication List}
@comment{Hua, Fei ; Chen, Yanhao ; Jin, Yuwei ; Zhang, Chi ; Hayes, Ari ; Zhang, Youtao ; Zhang, Eddy Z AutoBraid: A Framework for Enabling Efficient Surface Code Communication in Quantum Computing Conference 54th Annual IEEE/ACM International Symposium on Microarchitecture (MICRO 2021), Association for Computing Machinery, 2021, ISBN: 9781450385572. Abstract | Links | BibTeX | Tags: Compiler optimization, Quantum Computing, Quantum Error Correction Hayes, Ari B; Zhang, Eddy Z Unified On-Chip Memory Allocation for SIMT Architecture Conference Proceedings of the 28th ACM International Conference on Supercomputing (ICS 2014), Association for Computing Machinery, Munich, Germany, 2014, ISBN: 9781450326421. Abstract | Links | BibTeX | Tags: Compiler optimization, Concurrency, GPU, Register allocation, Shared memory allocation}
2021
title = {AutoBraid: A Framework for Enabling Efficient Surface Code Communication in Quantum Computing},
author = {Hua, Fei and Chen, Yanhao and Jin, Yuwei and Zhang, Chi and Hayes, Ari and Zhang, Youtao and Zhang, Eddy Z.
},
url = {https://doi.org/10.1145/3466752.3480072},
doi = {10.1145/3466752.3480072},
isbn = {9781450385572},
year = {2021},
date = {2021-10-21},
booktitle = {54th Annual IEEE/ACM International Symposium on Microarchitecture (MICRO 2021)},
pages = {925–936},
publisher = {Association for Computing Machinery},
abstract = {Quantum computers can solve problems that are intractable using the most powerful classical computer. However, qubits are fickle and error prone. It is necessary to actively correct errors in the execution of a quantum circuit. Quantum error correction (QEC) codes are developed to enable fault-tolerant quantum computing. With QEC, one logical circuit is converted into an encoded circuit. Most studies on quantum circuit compilation focus on NISQ devices which have 10-100 qubits and are not fault-tolerant. In this paper, we focus on the compilation for fault-tolerant quantum hardware. In particular, we focus on optimizing communication parallelism for the surface code based QEC. The execution of surface code circuits involves non-trivial geometric manipulation of a large lattice of entangled physical qubits. A two-qubit gate in surface code is implemented as a virtual “pipe” in space-time called a braiding path. The braiding paths should be carefully routed to avoid congestion. Communication between qubits is considered the major bottleneck as it involves scheduling and searching for simultaneous paths between qubits. We provide a framework for efficiently scheduling braiding paths. We discover that for quantum programs with a local parallelism pattern, our framework guarantees an optimal solution, while the previous greedy-heuristic-based solution cannot. Moreover, we propose an extension to the local parallelism analysis framework to address the communication bottleneck. Our framework achieves orders of magnitude improvement after addressing the communication bottleneck.},
keywords = {Compiler optimization, Quantum Computing, Quantum Error Correction},
pubstate = {published},
tppubtype = {conference}
}
2014
title = {Unified On-Chip Memory Allocation for SIMT Architecture},
author = {Ari B Hayes and Eddy Z Zhang},
url = {https://doi.org/10.1145/2597652.2597685},
doi = {10.1145/2597652.2597685},
isbn = {9781450326421},
year = {2014},
date = {2014-01-01},
booktitle = {Proceedings of the 28th ACM International Conference on Supercomputing (ICS 2014)},
pages = {293–302},
publisher = {Association for Computing Machinery},
address = {Munich, Germany},
abstract = {The popularity of general purpose Graphic Processing Unit (GPU) is largely attributed to the tremendous concurrency enabled by its underlying architecture — single instruction multiple thread (SIMT) architecture. It keeps the context of a significant number of threads in registers to enable fast “context switches” when the processor is stalled due to execution dependence, memory requests and etc. The SIMT architecture has a large register file evenly partitioned among all concurrent threads. Per-thread register usage determines the number of concurrent threads, which strongly affects the whole program performance. Existing register allocation techniques, extensively studied in the past several decades, are oblivious to the register contention due to the concurrent execution of many threads. They are prone to making optimization decisions that benefit single thread but degrade the whole application performance.Is it possible for compilers to make register allocation decisions that can maximize the whole GPU application performance? We tackle this important question from two different aspects in this paper. We first propose an unified on-chip memory allocation framework that uses scratch-pad memory to help: (1) alleviate single-thread register pressure; (2) increase whole application throughput. Secondly, we propose a characterization model for the SIMT execution model in order to achieve a desired on-chip memory partition given the register pressure of a program. Overall, we discovered that it is possible to automatically determine an on-chip memory resource allocation that maximizes concurrency while ensuring good single-thread performance at compile-time. We evaluated our techniques on a representative set of GPU benchmarks with non-trivial register pressure. We are able to achieve up to 1.70 times speedup over the baseline of the traditional register allocation scheme that maximizes single thread performance.},
keywords = {Compiler optimization, Concurrency, GPU, Register allocation, Shared memory allocation},
pubstate = {published},
tppubtype = {conference}
}