@article{choudhary2020comprehensive,
  title={A comprehensive survey on model compression and acceleration},
  author={Choudhary, Tejalal and Mishra, Vipul and Goswami, Anurag and Sarangapani, Jagannathan},
  journal={Artificial Intelligence Review},
  volume={53},
  number={7},
  pages={5113--5155},
  year={2020},
  publisher={Springer}
}

@article{10.1145/3065386,
author = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E.},
title = {ImageNet Classification with Deep Convolutional Neural Networks},
year = {2017},
issue_date = {June 2017},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {60},
number = {6},
issn = {0001-0782},
url = {https://doi.org/10.1145/3065386},
doi = {10.1145/3065386},
abstract = {We trained a large, deep convolutional neural network to classify the 1.2 million high-resolution images in the ImageNet LSVRC-2010 contest into the 1000 different classes. On the test data, we achieved top-1 and top-5 error rates of 37.5% and 17.0%, respectively, which is considerably better than the previous state-of-the-art. The neural network, which has 60 million parameters and 650,000 neurons, consists of five convolutional layers, some of which are followed by max-pooling layers, and three fully connected layers with a final 1000-way softmax. To make training faster, we used non-saturating neurons and a very efficient GPU implementation of the convolution operation. To reduce overfitting in the fully connected layers we employed a recently developed regularization method called "dropout" that proved to be very effective. We also entered a variant of this model in the ILSVRC-2012 competition and achieved a winning top-5 test error rate of 15.3%, compared to 26.2% achieved by the second-best entry.},
journal = {Commun. ACM},
month = {may},
pages = {84--90},
numpages = {7}
}

@inproceedings{NIPS1988_1c9ac015,
 author = {Hanson, Stephen and Pratt, Lorien},
 booktitle = {Advances in Neural Information Processing Systems},
 editor = {D. Touretzky},
 pages = {},
 publisher = {Morgan-Kaufmann},
 title = {Comparing Biases for Minimal Network Construction with Back-Propagation},
 url = {https://proceedings.neurips.cc/paper/1988/file/1c9ac0159c94d8d0cbedc973445af2da-Paper.pdf},
 volume = {1},
 year = {1988}
}

@ARTICLE{9082126,
  author={Xiang, Yachen and Huang, Peng and Han, Runze and Li, Chu and Wang, Kunliang and Liu, Xiaoyan and Kang, Jinfeng},
  journal={IEEE Transactions on Electron Devices}, 
  title={Efficient and Robust Spike-Driven Deep Convolutional Neural Networks Based on NOR Flash Computing Array}, 
  year={2020},
  volume={67},
  number={6},
  pages={2329-2335},
  doi={10.1109/TED.2020.2987439}
}

@InProceedings{Cheng_2015_ICCV,
author = {Cheng, Yu and Yu, Felix X. and Feris, Rogerio S. and Kumar, Sanjiv and Choudhary, Alok and Chang, Shih-Fu},
title = {An Exploration of Parameter Redundancy in Deep Networks With Circulant Projections},
booktitle = {Proceedings of the IEEE International Conference on Computer Vision (ICCV)},
month = {December},
year = {2015}
}

@ARTICLE{726791,
  author={LeCun, Y. and Bottou, L. and Bengio, Y. and Haffner, P.},
  journal={Proceedings of the IEEE}, 
  title={Gradient-based learning applied to document recognition}, 
  year={1998},
  volume={86},
  number={11},
  pages={2278-2324},
  doi={10.1109/5.726791}
}

@article{10.1145/3007787.3001177,
author = {Chen, Yu-Hsin and Emer, Joel and Sze, Vivienne},
title = {Eyeriss: A Spatial Architecture for Energy-Efficient Dataflow for Convolutional Neural Networks},
year = {2016},
issue_date = {June 2016},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {44},
number = {3},
issn = {0163-5964},
url = {https://doi.org/10.1145/3007787.3001177},
doi = {10.1145/3007787.3001177},
abstract = {Deep convolutional neural networks (CNNs) are widely used in modern AI systems for their superior accuracy but at the cost of high computational complexity. The complexity comes from the need to simultaneously process hundreds of filters and channels in the high-dimensional convolutions, which involve a significant amount of data movement. Although highly-parallel compute paradigms, such as SIMD/SIMT, effectively address the computation requirement to achieve high throughput, energy consumption still remains high as data movement can be more expensive than computation. Accordingly, finding a dataflow that supports parallel processing with minimal data movement cost is crucial to achieving energy-efficient CNN processing without compromising accuracy. In this paper, we present a novel dataflow, called row-stationary (RS), that minimizes data movement energy consumption on a spatial architecture. This is realized by exploiting local data reuse of filter weights and feature map pixels, i.e., activations, in the high-dimensional convolutions, and minimizing data movement of partial sum accumulations. Unlike dataflows used in existing designs, which only reduce certain types of data movement, the proposed RS dataflow can adapt to different CNN shape configurations and reduces all types of data movement through maximally utilizing the processing engine (PE) local storage, direct inter-PE communication and spatial parallelism. To evaluate the energy efficiency of the different dataflows, we propose an analysis framework that compares energy cost under the same hardware area and processing parallelism constraints. Experiments using the CNN configurations of AlexNet show that the proposed RS dataflow is more energy efficient than existing dataflows in both convolutional (1.4\texttimes{} to 2.5\texttimes{}) and fully-connected layers (at least 1.3\texttimes{} for batch size larger than 16). The RS dataflow has also been demonstrated on a fabricated chip, which verifies our energy analysis.},
journal = {SIGARCH Comput. Archit. News},
month = {jun},
pages = {367--379},
numpages = {13}
}

@inproceedings{carvalho2002gap,
  title={The gap between processor and memory speeds},
  author={Carvalho, Carlos},
  booktitle={Proc. of IEEE International Conference on Control and Automation},
  year={2002}
}

@article{DBLP:journals/corr/SzeCESZ16,
  author    = {Vivienne Sze and
               Yu{-}Hsin Chen and
               Joel S. Emer and
               Amr Suleiman and
               Zhengdong Zhang},
  title     = {Hardware for Machine Learning: Challenges and Opportunities},
  journal   = {CoRR},
  volume    = {abs/1612.07625},
  year      = {2016},
  url       = {http://arxiv.org/abs/1612.07625},
  eprinttype = {arXiv},
  eprint    = {1612.07625},
  timestamp = {Wed, 11 Dec 2019 16:23:12 +0100},
  biburl    = {https://dblp.org/rec/journals/corr/SzeCESZ16.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/SuleimanZS16,
  author    = {Amr Suleiman and
               Zhengdong Zhang and
               Vivienne Sze},
  title     = {A 58.6mW Real-Time Programmable Object Detector with Multi-Scale Multi-Object
               Support Using Deformable Parts Model on 1920x1080 Video at 30fps},
  journal   = {CoRR},
  volume    = {abs/1607.08635},
  year      = {2016},
  url       = {http://arxiv.org/abs/1607.08635},
  eprinttype = {arXiv},
  eprint    = {1607.08635},
  timestamp = {Wed, 11 Dec 2019 16:23:12 +0100},
  biburl    = {https://dblp.org/rec/journals/corr/SuleimanZS16.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{10.5555/3045118.3045361,
author = {Chen, Wenlin and Wilson, James T. and Tyree, Stephen and Weinberger, Kilian Q. and Chen, Yixin},
title = {Compressing Neural Networks with the Hashing Trick},
year = {2015},
publisher = {JMLR.org},
abstract = {As deep nets are increasingly used in applications suited for mobile devices, a fundamental dilemma becomes apparent: the trend in deep learning is to grow models to absorb ever-increasing data set sizes; however mobile devices are designed with very little memory and cannot store such large models. We present a novel network architecture, HashedNets, that exploits inherent redundancy in neural networks to achieve drastic reductions in model sizes. HashedNets uses a low-cost hash function to randomly group connection weights into hash buckets, and all connections within the same hash bucket share a single parameter value. These parameters are tuned to adjust to the HashedNets weight sharing architecture with standard backprop during training. Our hashing procedure introduces no additional memory overhead, and we demonstrate on several benchmark data sets that HashedNets shrink the storage requirements of neural networks substantially while mostly preserving generalization performance.},
booktitle = {Proceedings of the 32nd International Conference on International Conference on Machine Learning - Volume 37},
pages = {2285--2294},
numpages = {10},
location = {Lille, France},
series = {ICML'15}
}

@inproceedings{10.1145/2847263.2847265,
author = {Qiu, Jiantao and Wang, Jie and Yao, Song and Guo, Kaiyuan and Li, Boxun and Zhou, Erjin and Yu, Jincheng and Tang, Tianqi and Xu, Ningyi and Song, Sen and Wang, Yu and Yang, Huazhong},
title = {Going Deeper with Embedded FPGA Platform for Convolutional Neural Network},
year = {2016},
isbn = {9781450338561},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/2847263.2847265},
doi = {10.1145/2847263.2847265},
abstract = {In recent years, convolutional neural network (CNN) based methods have achieved great success in a large number of applications and have been among the most powerful and widely used techniques in computer vision. However, CNN-based methods are computational-intensive and resource-consuming, and thus are hard to be integrated into embedded systems such as smart phones, smart glasses, and robots. FPGA is one of the most promising platforms for accelerating CNN, but the limited bandwidth and on-chip memory size limit the performance of FPGA accelerator for CNN. In this paper, we go deeper with the embedded FPGA platform on accelerating CNNs and propose a CNN accelerator design on embedded FPGA for Image-Net large-scale image classification. We first present an in-depth analysis of state-of-the-art CNN models and show that Convolutional layers are computational-centric and Fully-Connected layers are memory-centric. Then the dynamic-precision data quantization method and a convolver design that is efficient for all layer types in CNN are proposed to improve the bandwidth and resource utilization. Results show that only 0.4% accuracy loss is introduced by our data quantization flow for the very deep VGG16 model when 8/4-bit quantization is used. A data arrangement method is proposed to further ensure a high utilization of the external memory bandwidth. Finally, a state-of-the-art CNN, VGG16-SVD, is implemented on an embedded FPGA platform as a case study. VGG16-SVD is the largest and most accurate network that has been implemented on FPGA end-to-end so far. The system on Xilinx Zynq ZC706 board achieves a frame rate at 4.45 fps with the top-5 accuracy of 86.66% using 16-bit quantization. The average performance of convolutional layers and the full CNN is 187.8 GOP/s and 137.0 GOP/s under 150MHz working frequency, which outperform previous approaches significantly.},
booktitle = {Proceedings of the 2016 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays},
pages = {26--35},
numpages = {10},
keywords = {embedded fpga, dynamic-precision data quantization, bandwidth utilization, convolutional neural network (cnn)},
location = {Monterey, California, USA},
series = {FPGA '16}
}

@INPROCEEDINGS{7011421,
  author={Chen, Yunji and Luo, Tao and Liu, Shaoli and Zhang, Shijin and He, Liqiang and Wang, Jia and Li, Ling and Chen, Tianshi and Xu, Zhiwei and Sun, Ninghui and Temam, Olivier},
  booktitle={2014 47th Annual IEEE/ACM International Symposium on Microarchitecture}, 
  title={DaDianNao: A Machine-Learning Supercomputer}, 
  year={2014},
  volume={},
  number={},
  pages={609-622},
  doi={10.1109/MICRO.2014.58}
}

@article{MUTLU201928,
title = {Processing data where it makes sense: Enabling in-memory computation},
journal = {Microprocessors and Microsystems},
volume = {67},
pages = {28-41},
year = {2019},
issn = {0141-9331},
doi = {10.1016/j.micpro.2019.01.009},
url = {https://www.sciencedirect.com/science/article/pii/S0141933118302291},
author = {Onur Mutlu and Saugata Ghose and Juan Gómez-Luna and Rachata Ausavarungnirun},
keywords = {Data movement, Main memory, Processing-in-memory, 3D-Stacked memory, Near-data processing},
abstract = {Today’s systems are overwhelmingly designed to move data to computation. This design choice goes directly against at least three key trends in systems that cause performance, scalability and energy bottlenecks: (1) data access from memory is already a key bottleneck as applications become more data-intensive and memory bandwidth and energy do not scale well, (2) energy consumption is a key constraint in especially mobile and server systems, (3) data movement is very expensive in terms of bandwidth, energy and latency, much more so than computation. These trends are especially severely-felt in the data-intensive server and energy-constrained mobile systems of today. At the same time, conventional memory technology is facing many scaling challenges in terms of reliability, energy, and performance. As a result, memory system architects are open to organizing memory in different ways and making it more intelligent, at the expense of higher cost. The emergence of 3D-stacked memory plus logic as well as the adoption of error correcting codes inside DRAM chips, and the necessity for designing new solutions to serious reliability and security issues, such as the RowHammer phenomenon, are an evidence of this trend. In this work, we discuss some recent research that aims to practically enable computation close to data. After motivating trends in applications as well as technology, we discuss at least two promising directions for processing-in-memory (PIM): (1) performing massively-parallel bulk operations in memory by exploiting the analog operational properties of DRAM, with low-cost changes, (2) exploiting the logic layer in 3D-stacked memory technology to accelerate important data-intensive applications. In both approaches, we describe and tackle relevant cross-layer research, design, and adoption challenges in devices, architecture, systems, and programming models. Our focus is on the development of in-memory processing designs that can be adopted in real computing platforms at low cost.}
}

@ARTICLE{1274006,
  author={Lu, Shih-Lien},
  journal={Computer}, 
  title={Speeding up processing with approximation circuits}, 
  year={2004},
  volume={37},
  number={3},
  pages={67-73},
  doi={10.1109/MC.2004.1274006}
}

@article{10.1126/science.1254642,
author = {Paul A. Merolla and John V. Arthur and Rodrigo Alvarez-Icaza and Andrew S. Cassidy and Jun Sawada and Filipp Akopyan and Bryan L. Jackson and Nabil Imam and Chen Guo and Yutaka Nakamura and Bernard Brezzo and Ivan Vo and Steven K. Esser and Rathinakumar Appuswamy and Brian Taba and Arnon Amir and Myron D. Flickner and William P. Risk and Rajit Manohar and Dharmendra S. Modha},
title = {A million spiking-neuron integrated circuit with a scalable communication network and interface},
journal = {Science},
volume = {345},
number = {6197},
pages = {668-673},
year = {2014},
doi = {10.1126/science.1254642},
URL = {https://www.science.org/doi/abs/10.1126/science.1254642},
eprint = {https://www.science.org/doi/pdf/10.1126/science.1254642},
abstract = {Computers are nowhere near as versatile as our own brains. Merolla et al. applied our present knowledge of the structure and function of the brain to design a new computer chip that uses the same wiring rules and architecture. The flexible, scalable chip operated efficiently in real time, while using very little power. Science, this issue p. 668 A large-scale computer chip mimics many features of a real brain. Inspired by the brain’s structure, we have developed an efficient, scalable, and flexible non–von Neumann architecture that leverages contemporary silicon technology. To demonstrate, we built a 5.4-billion-transistor chip with 4096 neurosynaptic cores interconnected via an intrachip network that integrates 1 million programmable spiking neurons and 256 million configurable synapses. Chips can be tiled in two dimensions via an interchip communication interface, seamlessly scaling the architecture to a cortexlike sheet of arbitrary size. The architecture is well suited to many applications that use complex neural networks in real time, for example, multiobject detection and classification. With 400-pixel-by-240-pixel video input at 30 frames per second, the chip consumes 63 milliwatts.}
}

@inproceedings{10.1109/ISCA.2016.30,
author = {Han, Song and Liu, Xingyu and Mao, Huizi and Pu, Jing and Pedram, Ardavan and Horowitz, Mark A. and Dally, William J.},
title = {EIE: Efficient Inference Engine on Compressed Deep Neural Network},
year = {2016},
isbn = {9781467389471},
publisher = {IEEE Press},
url = {https://doi.org/10.1109/ISCA.2016.30},
doi = {10.1109/ISCA.2016.30},
abstract = {State-of-the-art deep neural networks (DNNs) have hundreds of millions of connections and are both computationally and memory intensive, making them difficult to deploy on embedded systems with limited hardware resources and power budgets. While custom hardware helps the computation, fetching weights from DRAM is two orders of magnitude more expensive than ALU operations, and dominates the required power. Previously proposed 'Deep Compression' makes it possible to fit large DNNs (AlexNet and VGGNet) fully in on-chip SRAM. This compression is achieved by pruning the redundant connections and having multiple connections share the same weight. We propose an energy efficient inference engine (EIE) that performs inference on this compressed network model and accelerates the resulting sparse matrix-vector multiplication with weight sharing. Going from DRAM to SRAM gives EIE 120\texttimes{} energy saving; Exploiting sparsity saves 10\texttimes{}; Weight sharing gives 8\texttimes{}; Skipping zero activations from ReLU saves another 3\texttimes{}. Evaluated on nine DNN benchmarks, EIE is 189\texttimes{} and 13\texttimes{} faster when compared to CPU and GPU implementations of the same DNN without compression. EIE has a processing power of 102 GOPS working directly on a compressed network, corresponding to 3 TOPS on an uncompressed network, and processes FC layers of AlexNet at 1.88\texttimes{}10\textsuperscript{4} frames/sec with a power dissipation of only 600mW. It is 24,000\texttimes{} and 3,400\texttimes{} more energy efficient than a CPU and GPU respectively. Compared with DaDianNao, EIE has 2.9\texttimes{}, 19\texttimes{} and 3\texttimes{} better throughput, energy efficiency and area efficiency.},
booktitle = {Proceedings of the 43rd International Symposium on Computer Architecture},
pages = {243--254},
numpages = {12},
keywords = {hardware acceleration, ASIC, algorithm-hardware co-design, model compression, deep learning},
location = {Seoul, Republic of Korea},
series = {ISCA '16}
}

@article{Han2015DeepCC,
  title={Deep Compression: Compressing Deep Neural Network with Pruning, Trained Quantization and Huffman Coding},
  author={Song Han and Huizi Mao and William J. Dally},
  journal={arXiv preprint arXiv:1510.00149},
  year={2015}
}

@ARTICLE{9253578,
  author={Dai, Xiaoliang and Yin, Hongxu and Jha, Niraj K.},
  journal={IEEE Transactions on Emerging Topics in Computing}, 
  title={Incremental Learning Using a Grow-and-Prune Paradigm With Efficient Neural Networks}, 
  year={2022},
  volume={10},
  number={2},
  pages={752-762},
  doi={10.1109/TETC.2020.3037052}
}

@inproceedings{NIPS2015_ae0eb3ee,
 author = {Han, Song and Pool, Jeff and Tran, John and Dally, William},
 booktitle = {Advances in Neural Information Processing Systems},
 editor = {C. Cortes and N. Lawrence and D. Lee and M. Sugiyama and R. Garnett},
 pages = {},
 publisher = {Curran Associates, Inc.},
 title = {Learning both Weights and Connections for Efficient Neural Network},
 url = {https://proceedings.neurips.cc/paper/2015/file/ae0eb3eed39d2bcef4622b2499a05fe6-Paper.pdf},
 volume = {28},
 year = {2015}
}

@misc{das2015neuraltalk,
  title={NeuralTalk on Embedded System and GPU-accelerated RNN},
  author={Das, Subhasis and Han, Song},
  year={2015}
}

@inproceedings{NIPS1989_6c9882bb,
 author = {LeCun, Yann and Denker, John and Solla, Sara},
 booktitle = {Advances in Neural Information Processing Systems},
 editor = {D. Touretzky},
 pages = {},
 publisher = {Morgan-Kaufmann},
 title = {Optimal Brain Damage},
 url = {https://proceedings.neurips.cc/paper/1989/file/6c9882bbac1c7093bd25041881277658-Paper.pdf},
 volume = {2},
 year = {1989}
}

@ARTICLE{8704878,
author = {X. Dai and H. Yin and N. K. Jha},
journal = {IEEE Transactions on Computers},
title = {NeST: A Neural Network Synthesis Tool Based on a Grow-and-Prune Paradigm},
year = {2019},
volume = {68},
number = {10},
issn = {1557-9956},
pages = {1487-1497},
keywords = {neurons;computer architecture;training;biological neural networks;tools;manganese;correlation},
doi = {10.1109/TC.2019.2914438},
publisher = {IEEE Computer Society},
address = {Los Alamitos, CA, USA},
month = {oct}
}

@inproceedings{NIPS1992_303ed4c6,
 author = {Hassibi, Babak and Stork, David},
 booktitle = {Advances in Neural Information Processing Systems},
 editor = {S. Hanson and J. Cowan and C. Giles},
 pages = {},
 publisher = {Morgan-Kaufmann},
 title = {Second order derivatives for network pruning: Optimal Brain Surgeon},
 url = {https://proceedings.neurips.cc/paper/1992/file/303ed4c69846ab36c2904d3ba8573050-Paper.pdf},
 volume = {5},
 year = {1992}
}