From 8706f04e2d90c4425f0380a701358dda61f40bb8 Mon Sep 17 00:00:00 2001
From: Leonard Kugis <leonard@kug.is>
Date: Mon, 23 Jan 2023 23:50:19 +0100
Subject: Finished evaluation, finished future work

---
 Paper/paper.tex      | 139 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 Paper/references.bib |  68 +++++++++++++++++++++++++
 2 files changed, 205 insertions(+), 2 deletions(-)

diff --git a/Paper/paper.tex b/Paper/paper.tex
index 32e1bf3..b5f2269 100644
--- a/Paper/paper.tex
+++ b/Paper/paper.tex
@@ -46,6 +46,8 @@ This paper gives an overview over different compression methods for \emph{Deep N
 (section~\ref{sec:compression}), after discussing the metrices used to measure
 inference engines (section~\ref{sec:metrices})
 and shows how they are applied in an actual hardware architecture: the \emph{Efficient Inference Engine} (\emph{EIE}) (section~\ref{sec:implementation}).
+After that, it is evaluated and compared to other hardware accelerators (section~\ref{sec:eval}).
+Finally, some further optimization methods for the EIE are presented in section~\ref{sec:future}.
 
 \subsection{Deep Neural Networks}
 
@@ -386,7 +388,7 @@ but are underrepresented by occurrence using this method. This would lead to hig
 because there are less centroids used. Because of this, linear initialization in the value domain has been established as the
 best initialization method \cite{Han2015DeepCC}.
 
-\subsection{Huffman encoding}
+\subsection{Huffman encoding}\label{sec:huffman}
 
 Another compression method that can be applied to DNNs is the Huffman encoding.
 
@@ -423,7 +425,7 @@ Huffman encoding archieves $35$x - $49$x compression rate \cite{Han2015DeepCC}.
 Another remarkable advantage of this compression method is that it is lossless and has therefore
 no impact on the accuracy of the DNN.
 
-\subsection{HashNets}
+\subsection{HashNets}\label{sec:hashnets}
 
 A relatively recent compression/optimization technique for weights of DNNs are HashNets \cite{10.5555/3045118.3045361}.
 Using HashNets, no actual values need to be stored in the weight matrix (not even index values),
@@ -586,6 +588,139 @@ To know the iteration boundaries, the column pointers are stored seperately. In
 the first column has pointer $0$ (because it is the first entry in total). The second entry has pointer $3$,
 because this PE has $3$ non-zero values assigned to it in the first column.
 
+\section{Evaluation and comparison}\label{sec:eval}
+
+\begin{figure*}[t]
+    \centering
+    \includegraphics[width=\textwidth]{resources/eval_speed_png} \\
+    \vspace{0.5cm}
+    \includegraphics[width=\textwidth, keepaspectratio]{resources/eval_energy_png}
+    \caption{Speedup and energy efficiency comparison \cite{10.1109/ISCA.2016.30}}
+    \label{eval_speed_energy}
+\end{figure*}
+
+Fig.~\ref{eval_speed_energy} displays the speed and energy comparison with standard
+hardware components \emph{CPU}, \emph{GPU} and \emph{mGPU}. For benchmarking, different layers of different DNN models are used.
+\emph{Alex-6}, \emph{Alex-7}, \emph{Alex-8} are layers of the AlexNet.
+\emph{VGG-6}, \emph{VGG-7}, \emph{VGG-8} are layers of the VGG-Net (VGG: Visual Geometry Group).
+\emph{NT-We}, \emph{NT-Wd}, \emph{NT-LSTM} are layers of the NeuralTalk net.
+Speedup and energy efficiency are measured on the three platforms for each of those layers with and
+without compression. The baseline is the inference using CPU on the uncompressed model.
+
+\subsection{Methodology}
+
+\subsubsection{Hardware platforms}
+
+For the CPU an \emph{Intel Core i7 5930k} is used.
+As GPU a \emph{NVIDIA GeForce GTX Titan X} is used.
+As mGPU a \emph{NVIDIA Tegra K1} is used.
+All of them come with their own power reporting tools, used to measure the energy consumption
+and speed.
+
+\subsubsection{Speed}
+
+The speed is measured with the following formula:
+
+\begin{align}
+    \text{speed} = \frac{\text{workload}}{\text{peak throughput}}, [\text{Frames}/\text{s}]
+\end{align}
+
+Batch sizes of 1 are chosen, because the EIE is targeting real-time applications
+with low latency. In these environments, low batch sizes are the most common.
+
+\subsubsection{Energy efficiency}
+
+The energy efficiency is measured with the following formula:
+
+\begin{align}
+    \text{eff} = \frac{\text{workload}}{\text{average power consumption} \cdot \text{duration}}, [\text{Frames}/\text{J}]
+\end{align}
+
+\subsection{Results}
+
+The EIE has a speedup factor of $189$x, $13$x, $307$x compared to CPU, GPU and mGPU on the compressed models.
+Theoretically, when compared with uncompressed inference on standard architectures, the compression rate must
+be factored in: a compressed inference speed of $103$ GOP/s corresponds to an uncompressed inference speed of $3$ TOP/s.
+However, in practice compression only yields a speedup of $3$x on standard platforms.
+This shows the impact of the dedicated hardware architecture to handle compressed models.
+
+The EIE is $24000$x, $3400$x, $2700$x more energy efficient compared to CPU, GPU and mGPU on the compressed models.
+Remarkably, compression yields little to no benefit for the standard hardware architectures, while it does
+so on a large scale when moving to the EIE. The main reasons for this energy efficiency benefit are the change in memory technology
+from DRAM to SRAM, reduction of memory accesses through compression and the storage of weights in
+compressed sparse column representation.
+
+\subsection{Comparison with other hardware accelerators}
+
+\begin{figure*}[t]
+    \centering
+    \includegraphics[width=\textwidth]{resources/accelerators_table}
+    \caption{EIE compared with different DNN hardware accelerators \cite{10.1109/ISCA.2016.30}}
+    \label{accelerators}
+\end{figure*}
+
+Fig.~\ref{accelerators} shows a comparison between multiple hardware accelerators for inference of
+DNNs, namely A-Eye \cite{10.1145/2847263.2847265}, DaDianNao \cite{7011421} and TrueNorth \cite{10.1126/science.1254642} (alongside general-purpose platforms).
+
+\subsubsection{A-Eye}
+
+A-Eye is a hardware accelerator targeting computational-centric parts of DNNs, namely the convolutional layers,
+which make up more than 90\% of computational cost \cite{9082126}. It does not address the memory access
+problems of the fully connected layers considered here. It also stores the main portion of the weights
+on external DDR3-DRAM and uses SRAM just as an internal buffer. Though it uses efficient pooling to maximize
+the benefit of burst read-outs, it is not as efficient as it would be if it were fully implemented in SRAM technology.
+Additionally, it is implemented on FPGA (Xilinx Zynq XC7Z045), therefore it lacks energy efficiency compared
+to ASICs like the EIE.
+
+It has an overall performance of $136.97$ GOP/s ($33$ Frames/s) and an energy efficiency of $14.22$ GOP/s/W ($3.43$ Frames/J) \cite{10.1145/2847263.2847265}.
+
+\subsubsection{DaDianNao}
+
+DaDianNao is a hardware accelerator which focuses on both computational cost and memory accesses.
+For this purpose, it has embedded (on-chip) DRAM for parameter storage. Doing so, it achieves
+$450.65$x speedup compared to GPU. However, unlike the EIE, it is incapable of handling compressed
+DNNs and its main memory is still based on DRAM technology, while SRAM would be much faster.
+A benefit of this accelerator is its scalability. It consists of multiple nodes of the same type,
+and has been implemented in systems of up to 64 nodes, while this can be extended even further.
+
+With this 64-chip system it has a throughput of $147938$ Frames/s and an energy efficiency of $9263$ Frames/J.
+While the speed is better on a large scale due to its scalability, it has a bad energy efficiency.
+
+\subsubsection{TrueNorth}
+
+The TrueNorth supercomputer is a non-von Neumann system with transistor-based programmable neurons.
+This way, it overcomes the memory bottleneck, and technically the speed is comparable to SRAM accesses.
+For a standard VGA video at $30$ FPS, the chip consumes only $63$mW, which gives it a high energy efficiency
+of $10839$ Frames/J, compared to other hardware accelerators. However, the EIE is even better in energy efficiency by a
+factor of $13$--$18$, depending on process size and number of PEs.
+Also, due to its specialized architecture with programmable neurons, it has a bad area efficiency of only $4.63$ Frames/s/$\text{mm}^2$,
+which is only $\sim 0.23$\% of EIE. The throughput is relatively small compared to EIE, because it is also
+unable to handle compressed DNNs.
+
+\section{Future work}\label{sec:future}
+
+Different compression algorithms have been presented in section~\ref{sec:compression}.
+Not all of them are used for the EIE implementation. Some of them are orthogonal to the used compression methods,
+so they can be implemented and applied to the DNN without interference with the EIE.
+The different pruning strategies are an example of that. Other compression methods need an adjustment of the hardware architecture to different extents.
+
+For Huffman encoding (section~\ref{sec:huffman}) the different bit widths of the weights need to be handled
+by the hardware to fully exploit the possible compression ratio. Also, Huffman tree lookups can be optimized
+to reduce the number of memory accesses. All in all this is a promising optimization with
+a lossless compression factor of up to $49$x.
+
+Another promising method are HashNets (section~\ref{sec:hashnets}). They omit the index lookup from the matrix entirely,
+and just compute lookup indices from hash functions.
+These hash functions need to be implemented in hardware to be efficient considering energy and speed,
+but it is technically possible \cite{10.5555/3045118.3045361}. A benefit of this method is the adjustable compression factor of up to
+$64$x, depending on the accuracy constraints (this compression method is not lossless). This way, the architecture,
+or at least the usage of it, can be adjusted to the user's needs.
+
+Further optimization methods are technology based. For computation, MAC operations can be outsourced to the memory,
+performing in-memory computation \cite{MUTLU201928}. This would make a large portion of data transfers obsolete, which increases throughput and
+energy efficiency. Also, the existing ALU-implementation can be replaced by approximating circuits \cite{1274006},
+at the cost of a less accurate system, but with another increase in speed and energy efficiency.
+
 \bibliographystyle{IEEEtran}
 \bibliography{Paper/references}
 
diff --git a/Paper/references.bib b/Paper/references.bib
index b5ea474..e1596b3 100644
--- a/Paper/references.bib
+++ b/Paper/references.bib
@@ -143,6 +143,74 @@ location = {Lille, France},
 series = {ICML'15}
 }
 
+@inproceedings{10.1145/2847263.2847265,
+author = {Qiu, Jiantao and Wang, Jie and Yao, Song and Guo, Kaiyuan and Li, Boxun and Zhou, Erjin and Yu, Jincheng and Tang, Tianqi and Xu, Ningyi and Song, Sen and Wang, Yu and Yang, Huazhong},
+title = {Going Deeper with Embedded FPGA Platform for Convolutional Neural Network},
+year = {2016},
+isbn = {9781450338561},
+publisher = {Association for Computing Machinery},
+address = {New York, NY, USA},
+url = {https://doi.org/10.1145/2847263.2847265},
+doi = {10.1145/2847263.2847265},
+abstract = {In recent years, convolutional neural network (CNN) based methods have achieved great success in a large number of applications and have been among the most powerful and widely used techniques in computer vision. However, CNN-based methods are computational-intensive and resource-consuming, and thus are hard to be integrated into embedded systems such as smart phones, smart glasses, and robots. FPGA is one of the most promising platforms for accelerating CNN, but the limited bandwidth and on-chip memory size limit the performance of FPGA accelerator for CNN. In this paper, we go deeper with the embedded FPGA platform on accelerating CNNs and propose a CNN accelerator design on embedded FPGA for Image-Net large-scale image classification. We first present an in-depth analysis of state-of-the-art CNN models and show that Convolutional layers are computational-centric and Fully-Connected layers are memory-centric. Then the dynamic-precision data quantization method and a convolver design that is efficient for all layer types in CNN are proposed to improve the bandwidth and resource utilization. Results show that only 0.4\% accuracy loss is introduced by our data quantization flow for the very deep VGG16 model when 8/4-bit quantization is used. A data arrangement method is proposed to further ensure a high utilization of the external memory bandwidth. Finally, a state-of-the-art CNN, VGG16-SVD, is implemented on an embedded FPGA platform as a case study. VGG16-SVD is the largest and most accurate network that has been implemented on FPGA end-to-end so far. The system on Xilinx Zynq ZC706 board achieves a frame rate at 4.45 fps with the top-5 accuracy of 86.66\% using 16-bit quantization. The average performance of convolutional layers and the full CNN is 187.8 GOP/s and 137.0 GOP/s under 150MHz working frequency, which outperform previous approaches significantly.},
+booktitle = {Proceedings of the 2016 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays},
+pages = {26--35},
+numpages = {10},
+keywords = {embedded fpga, dynamic-precision data quantization, bandwidth utilization, convolutional neural network (cnn)},
+location = {Monterey, California, USA},
+series = {FPGA '16}
+}
+
+@INPROCEEDINGS{7011421,
+  author={Chen, Yunji and Luo, Tao and Liu, Shaoli and Zhang, Shijin and He, Liqiang and Wang, Jia and Li, Ling and Chen, Tianshi and Xu, Zhiwei and Sun, Ninghui and Temam, Olivier},
+  booktitle={2014 47th Annual IEEE/ACM International Symposium on Microarchitecture}, 
+  title={DaDianNao: A Machine-Learning Supercomputer}, 
+  year={2014},
+  volume={},
+  number={},
+  pages={609-622},
+  doi={10.1109/MICRO.2014.58}
+}
+
+@article{MUTLU201928,
+title = {Processing data where it makes sense: Enabling in-memory computation},
+journal = {Microprocessors and Microsystems},
+volume = {67},
+pages = {28-41},
+year = {2019},
+issn = {0141-9331},
+doi = {10.1016/j.micpro.2019.01.009},
+url = {https://www.sciencedirect.com/science/article/pii/S0141933118302291},
+author = {Onur Mutlu and Saugata Ghose and Juan G{\'o}mez-Luna and Rachata Ausavarungnirun},
+keywords = {Data movement, Main memory, Processing-in-memory, 3D-Stacked memory, Near-data processing},
+abstract = {Today’s systems are overwhelmingly designed to move data to computation. This design choice goes directly against at least three key trends in systems that cause performance, scalability and energy bottlenecks: (1) data access from memory is already a key bottleneck as applications become more data-intensive and memory bandwidth and energy do not scale well, (2) energy consumption is a key constraint in especially mobile and server systems, (3) data movement is very expensive in terms of bandwidth, energy and latency, much more so than computation. These trends are especially severely-felt in the data-intensive server and energy-constrained mobile systems of today. At the same time, conventional memory technology is facing many scaling challenges in terms of reliability, energy, and performance. As a result, memory system architects are open to organizing memory in different ways and making it more intelligent, at the expense of higher cost. The emergence of 3D-stacked memory plus logic as well as the adoption of error correcting codes inside DRAM chips, and the necessity for designing new solutions to serious reliability and security issues, such as the RowHammer phenomenon, are an evidence of this trend. In this work, we discuss some recent research that aims to practically enable computation close to data. After motivating trends in applications as well as technology, we discuss at least two promising directions for processing-in-memory (PIM): (1) performing massively-parallel bulk operations in memory by exploiting the analog operational properties of DRAM, with low-cost changes, (2) exploiting the logic layer in 3D-stacked memory technology to accelerate important data-intensive applications. In both approaches, we describe and tackle relevant cross-layer research, design, and adoption challenges in devices, architecture, systems, and programming models. 
Our focus is on the development of in-memory processing designs that can be adopted in real computing platforms at low cost.}
+}
+
+@ARTICLE{1274006,
+  author={Shih-Lien Lu},
+  journal={Computer}, 
+  title={Speeding up processing with approximation circuits}, 
+  year={2004},
+  volume={37},
+  number={3},
+  pages={67-73},
+  doi={10.1109/MC.2004.1274006}
+}
+
+@article{10.1126/science.1254642,
+author = {Paul A. Merolla  and John V. Arthur  and Rodrigo Alvarez-Icaza  and Andrew S. Cassidy  and Jun Sawada  and Filipp Akopyan  and Bryan L. Jackson  and Nabil Imam  and Chen Guo  and Yutaka Nakamura  and Bernard Brezzo  and Ivan Vo  and Steven K. Esser  and Rathinakumar Appuswamy  and Brian Taba  and Arnon Amir  and Myron D. Flickner  and William P. Risk  and Rajit Manohar  and Dharmendra S. Modha },
+title = {A million spiking-neuron integrated circuit with a scalable communication network and interface},
+journal = {Science},
+volume = {345},
+number = {6197},
+pages = {668-673},
+year = {2014},
+doi = {10.1126/science.1254642},
+URL = {https://www.science.org/doi/abs/10.1126/science.1254642},
+eprint = {https://www.science.org/doi/pdf/10.1126/science.1254642},
+abstract = {Computers are nowhere near as versatile as our own brains. Merolla et al. applied our present knowledge of the structure and function of the brain to design a new computer chip that uses the same wiring rules and architecture. The flexible, scalable chip operated efficiently in real time, while using very little power. Science, this issue p. 668 A large-scale computer chip mimics many features of a real brain. Inspired by the brain’s structure, we have developed an efficient, scalable, and flexible non–von Neumann architecture that leverages contemporary silicon technology. To demonstrate, we built a 5.4-billion-transistor chip with 4096 neurosynaptic cores interconnected via an intrachip network that integrates 1 million programmable spiking neurons and 256 million configurable synapses. Chips can be tiled in two dimensions via an interchip communication interface, seamlessly scaling the architecture to a cortexlike sheet of arbitrary size. The architecture is well suited to many applications that use complex neural networks in real time, for example, multiobject detection and classification. With 400-pixel-by-240-pixel video input at 30 frames per second, the chip consumes 63 milliwatts.}
+}
+
 @inproceedings{10.1109/ISCA.2016.30,
 author = {Han, Song and Liu, Xingyu and Mao, Huizi and Pu, Jing and Pedram, Ardavan and Horowitz, Mark A. and Dally, William J.},
 title = {EIE: Efficient Inference Engine on Compressed Deep Neural Network},
-- 
cgit v1.2.1