\documentclass[conference]{IEEEtran}
%\IEEEoverridecommandlockouts
% The preceding line is only needed to identify funding in the first footnote. If that is unneeded, please comment it out.
\usepackage{cite}
\usepackage{amsmath,amssymb,amsfonts}
\usepackage{algorithmic}
\usepackage{graphicx}
\usepackage{textcomp}
\usepackage{xcolor}
\def\BibTeX{{\rm B\kern-.05em{\sc i\kern-.025em b}\kern-.08em
    T\kern-.1667em\lower.7ex\hbox{E}\kern-.125emX}}

\begin{document}

\nocite{*}

\title{EIE: Efficient Inference Engine on Compressed Deep Neural Networks}
\author{\IEEEauthorblockN{Leonard Kugis}
\IEEEauthorblockA{\textit{Abteilung Entwurf Integrierter Systeme} \\
\textit{Technische Universität Braunschweig}\\
Braunschweig, Germany \\
l.kugis@tu-bs.de}
}

\maketitle

\begin{abstract}
With an increased use of deep neural networks instead of conventional algorithms for classification tasks,
there is demand for implementation in various environments with different constraints.
Especially high demand for inference in the embedded real time domain has emerged, with its hard
constraints on latency, energy use and storage.
To satisfy these constraints, dedicated hardware architectures need to be implemented to maximize
the efficiency, such as the \emph{Efficient Inference Engine} (\emph{EIE}).
It utilizes various compression techniques to compress existing deep neural network models
and implements a specialized hardware architecture to handle the compressed models
as efficient as possible.
\end{abstract}

\begin{IEEEkeywords}
deep neural networks, compression, inference, weights, sparsity
\end{IEEEkeywords}

\section{Introduction}

This paper gives an overview over different compression methods for \emph{Deep Neural Networks}
(section~\ref{sec:compression}), after discussing the metrices used to measure
inference engines (section~\ref{sec:metrices})
and shows how they are applied in an actual hardware architecture: the \emph{Efficient Inference Engine} (\emph{EIE}) (section~\ref{sec:implementation}).
After that, it is evaluated and compared to other hardware accelerators (section~\ref{sec:eval}).
Finally, some further optimization methods for the EIE are presented in section~\ref{sec:future}.

\subsection{Deep Neural Networks}

\emph{Deep Neural Networks} (\emph{DNNs}) consist of different types of layers, namely
\emph{Convolutional Layers} and \emph{Fully Connected Layers}.
While for common DNNs (e.g. AlexNet \cite{10.1145/3065386}) convolutional layers make
up more than 90\% of computational cost \cite{9082126}, fully connected layers take
up over 90\% of the storage \cite{Cheng_2015_ICCV}.

\begin{figure}[htbp]
    \centerline{\includegraphics[width=\linewidth, keepaspectratio]{resources/fcn}}
    \caption{Fully connected layer}
    \label{fcn}
\end{figure}

Fig.~\ref{fcn} shows a fully connected layer in a DNN.
Each fully connected layer consists of an input vector $a$, an output vector $b$ and
a weight matrix $W$.
Essentially, the weight matrix gets multiplied by the input vector,
so that each combination of the entries $W_{ij}$ get multiplied by the component $a_j$,
on which an activation function $f$ is applied, forming a component in the output vector $b_i$.

\begin{align}\label{mac}
    b_i = f(\sum\limits_{j=0}^{n} W_{ij} a_j)
\end{align}

Equ.~\ref{mac} shows this principle formulated. The process of multiplying the weights
with the components of the input vector is referred to as \emph{Multiply-Accumulate-Operation} (\emph{MAC}).
This is the essential operation in DNNs and therefore in focus for optimization.

Input and output vectors are considered as dynamic data, while
weights and eventual parameters of the activation function are considered as static data.

\section{Performance metrices}\label{sec:metrices}

When it comes to optimization of inference architectures for DNNs, there are four major categories of
metrices to optimize on:

\begin{itemize}
    \item Throughput
    \item Latency
    \item Model size
    \item Energy use
\end{itemize}

\emph{Throughput}. Throughput measures how much data can be processed by the network in a fixed time.

\emph{Latency}. Latency measures how long it takes for a fixed amount of data to be processed by the network from start to finish.

\emph{Model size}. Model size indicates the amount of storage required to store the model.

\emph{Energy use}. Energy use measures how much energy is needed to process a fixed amount of data.

\subsection{Memory}\label{sec:memory}

Since the 1980s there is a more and more evolving gap between memory latency and processing speed.
Recent CPUs are magnitudes faster than their memory counterparts \cite{carvalho2002gap}.
This is problematic, because in general the CPU has to wait during each memory access for the data
to arrive in order to process it further. While this waiting time can be optimized with
efficient pipelining, the principal still remains.
The memory accesses for the weights of the DNN make up the biggest part of energy consumption of the
hardware accelerator, and correlate also with latency and throughput \cite{DBLP:journals/corr/SzeCESZ16}.

\begin{figure}[htbp]
    \centerline{\includegraphics[width=\linewidth, keepaspectratio]{resources/memory_latency}}
    \caption{Memory hierarchy and energy cost of hierarchy levels \cite{DBLP:journals/corr/SzeCESZ16}}
    \label{memory_latency}
\end{figure}

Fig.~\ref{memory_latency} shows the energy consumption of each layer of a typical hardware accelerator
consisting of an external \emph{Dynamic Random Access Memory} (\emph{DRAM}) to store the model,
internal buffers (typically \emph{Static Random Access Memory} (\emph{SRAM})) and processing engines (\emph{PEs}).
Memory accesses to DRAM are 200x more costly than register accesses, and 33x more costly than accesses to
the SRAM-buffer. This results in the demand of storing most of the model in SRAM technology.
Because such on-chip SRAM capacity is limited, models must be compressed in order to fit onto the SRAM.

\subsection{Existing DNNs}

To put the model sizes of DNNs into perspective, Table~\ref{tab_dnn} gives an overview of
the number of parameters and storage sizes in 32-bit floating point representation.

\begin{table}[htbp]
\caption{Model sizes of different DNN models}
\begin{center}
\begin{tabular}{|c|c|c|}
\hline
\textbf{DNN} & \textbf{Number of} & \textbf{Size in} \\
& \textbf{Parameters} & \textbf{32-bit float} \\
\hline
AlexNet \cite{choudhary2020comprehensive} & 61 M & 240 MB \\
\hline
VGG-16 \cite{choudhary2020comprehensive} & 138 M & 512 MB \\
\hline
LeNet-5 (caffe) \cite{NIPS2015_ae0eb3ee} & 431 K & 1.724 MB \\
\hline
NeuralTalk (zoo) \cite{das2015neuraltalk} & 90 M & 360 MB \\
\hline
\end{tabular}
\label{tab_dnn}
\end{center}
\end{table}

These are common models to perform benchmarks on hardware accelerators
and compression algorithms. They will be used for evaluation between different
platforms later.

\section{Compression}\label{sec:compression}

Different compression algorithms exist for DNN models. Not only storage size benefits
from model compression, most methods also optimize the amount of MACs (Equ.~\ref{mac}) that need to be
performed. It does so by creating a sparse weight matrix from the original weight matrix.
Multiplication and accumulation with $0$-valued weights can be skipped and therefore it is desired
to have as many $0$s as weights as possible.

\subsection{Basis projection}

Basis projection utilizes the fact that row- and column-vectors of the weight matrices can be
represented by the linear combination of its basis vectors.
In common training algorithms, weights (the weight matrices) are trained directly and use the canonical basis vectors.
But this might not be the optimal basis vectors, resulting in the most amount of zeros.

\begin{figure}[htbp]
    \centerline{\includegraphics[width=\linewidth, keepaspectratio]{resources/basis_projection_png}}
    \caption{Basis projection and resulting weight distribution \cite{DBLP:journals/corr/SuleimanZS16}}
    \label{basis_projection}
\end{figure}

Fig.~\ref{basis_projection} depicts the method of basis projection and the resulting distributions of
weight values. It can be shown that the column vectors can be represented by projected vectors $\alpha$
and a-priory known basis vectors $S_d$. As the basis projection is a reversible operation, the reverse
can later be applied to receive the actual results.

Using this method, experimental results show, that the number of zero-valued weights can be increased from
$7$\% to $56$\% ($8.0$x increase) \cite{DBLP:journals/corr/SuleimanZS16}.
Further experiments have shown that, when used along with other compression methods such as pruning and
weight quantization, $3.6$x compression factor can be archieved. This results in $10$x less MAC operations and
$5$x reduction in power consumption \cite{DBLP:journals/corr/SuleimanZS16}.

\subsection{Pruning}

The key idea behind pruning is the removal of unimportant weights from the weight matrices, which have little or
no influence on the output accuracy. This influence is also referred to as \emph{sensitivity}.
For pruning, a common procedure has established \cite{Han2015DeepCC}:

\begin{enumerate}
    \item Train weights
    \item Prune network
    \item Retrain network
    \item Go to 2. or abort if criterion is met
\end{enumerate}

Under the assumption that there is an already trained network, weights (connections), neurons and convolutional filters
get removed from the network based on their output sensitivity. However, convolutional filters are not in this scope,
since they are not part of the fully connected layers of a DNN. Therefore, the main focus
is on weight/connection based pruning.

Pruned weights are simply set to value $0$ and not considered in the retraining process.
This way, they can be handled efficiently by the hardware architecture, by just skipping the
MAC-operation for these weights. Neurons are pruned by just deleting the corresponding
row/column of the weight matrix.

For weight/connection pruning, there are different methods on how to remove the weights and what the
sensitivity is measured on. They can be classified in three different main classes:

\begin{itemize}
    \item Magnitude threshold based pruning \cite{9253578}
    \item Optimal Brain Damage \cite{NIPS1989_6c9882bb} / Optimal Brain Surgeon \cite{NIPS1992_303ed4c6}
    \item Biased weight decay \cite{NIPS1988_1c9ac015}
\end{itemize}

\subsubsection{Magnitude threshold based pruning}

Magnitude threshold based pruning methods remove weights solely based on their magnitude.
The following simple rule is applied for the weight removal:

\begin{align}\label{mtbp}
    &\text{Remove weights $W_{ij}$ for which} \nonumber \\
    &\left| W_{ij} \right| < \vartheta \\
    &\vartheta := \text{Threshold} \nonumber
\end{align}

Weights $W_{ij}$ with magnitude below the threshold $\vartheta$ are removed (set to $0$).
This assumes that values with low magnitude have a low impact on the accuracy
of the network and/or the objective function.
This is generally not always the case and therefore not the optimal solution.
However, experiments have shown model size decreases of $33$\% for LeNet and up
to $33$\% for DeepSpeech2 \cite{9253578}.

This approach is often chosen in environments with focus on training speed, because
it does not require the calculation of e.g. gradients.
Therefore, it archives a training cost reduction of $64$\% for LeNet and up to $71$\%
for DeepSpeech2 \cite{9253578}.

\subsubsection{Optimal Brain Damage}

The \emph{Optimal Brain Damage} (\emph{OBD}) method removes weights based on their influence on the
objective function. The objective function can be any function formulating the error to be
minimized with training and measures the accuracy of the DNN.

Formally, the sensitivity is derived from the diagonals of the Hessian of the weight matrix:

\begin{align}\label{hessian}
    & h_{kk} = \sum\limits_{(i,j) \in V_k} \frac{{\partial}^2 E}{\partial w_{ij}^2} \\
    & V_k := \text{Set of index pairs for matrix $W_k$} \nonumber
\end{align}

After that the \emph{saliencies} are calculated as a measure of sensitivity:

\begin{align}\label{saliencies}
    & s_k = \frac{h_{kk} u_k^2}{2} \\
    & u_k := \text{Element $k$ of parameter vector} \nonumber
\end{align}

For the pruning, all weight matrices undergo the following algorithm \cite{NIPS1989_6c9882bb}:

\begin{enumerate}
    \item Choose network architecture
    \item Train network
    \item Compute Hessian diagonals $h_{kk}$ for each parameter
    \item Compute saliencies $s_k = \frac{h_{kk} u_k^2}{2}$ for each parameter
    \item Sort parameters by saliency and remove low-saliency parameters
    \item Go to step 2 or abort when accuracy threshold is reached
\end{enumerate}

Experiments show that for LeNet a significant decrease in the \emph{Minimum-Mean-Square-Error} (\emph{MSE}) function
can be obtained compared to magnitude based methods with the same level of pruning, especially for high pruning amounts.
For $500$ remaining parameters, the MSE for OBD is $3$ magnitudes smaller than for magnitude based pruning.
However, this results in a generally high error rate and is therefore undesired. Optimal solutions
are obtained for $30$\% network pruning rate using this method.

\subsubsection{Optimal Brain Surgeon}

\emph{Optimal Brain Surgeon} (\emph{OBS}) works similar to OBD. Compared to OBD it does not neccessarily delete
the weights, but changes the strength of those weights.

It does so by computing the inverse of the Hessian of the weight matrix \cite{NIPS1992_303ed4c6}:

\begin{align}\label{saliencies}
    & \textbf{H}_{m+1}^{-1} = \textbf{H}_{m}^{-1} - \frac{\textbf{H}_{m}^{-1} \textbf{X}_{m+1} \textbf{X}_{m+1}^T \textbf{H}_{m}^{-1}}{P + \textbf{X}_{m+1}^T \textbf{H}_{m}^{-1} \textbf{X}_{m+1}} \\
    & \textbf{H}^{-1} = \frac{1}{P} \sum\limits_{k=1}^{P} H_k^{-1}
\end{align}

For the pruning, all weight matrices undergo the following algorithm \cite{NIPS1992_303ed4c6}:

\begin{enumerate}
    \item Train network.
    \item Compute $\textbf{H}^{-1}$.
    \item Find $q$ with smallest saliency $L_q = \frac{w_q^2}{2 \textbf{H}_{qq}^{-1}}$. If this gradient increase is much smaller than $E$,
        delete $w_q$ and go to step 4. Otherwise go to step 5.
    \item Use the $q$ from step 3 and update all weights. Go to step 2.
    \item No more weights can be deleted without a large increase in $E$. Go to step 1 or abort if lower threshold in accuracy is reached.
\end{enumerate}

Experiments show that this method can be used to reduce the number of weights by up to $90$\% without significant
impact on the accuracy, without retraining \cite{NIPS1992_303ed4c6}.

\subsubsection{Biased weight decay}

The \emph{Biased weight decay} method works during training-time of the model.
It bases on the backpropagation algorithm and adds a penalty term to the weight update formula:

\begin{align}
    & w_{n+1} = \alpha (- \frac{\partial E}{\partial w_n}) + \beta w_n \label{bwd1} \\
    & w_{n+1} = \alpha (- \frac{\partial E}{\partial w_{ij}} - 2 w_n) + w_n \label{bwd2} \\
    & \alpha < \frac{1}{2} := \text{Learning rate} \nonumber
\end{align}

Equ.~\ref{bwd1} shows the update term of the backpropagation time considering an equal weight decay
for all values. Equ.~\ref{bwd2} shows the update term considering lower values to decay faster toward
zero while keeping high-valued weights the same.

\subsection{Weight quantization}

Storing the weight matrices as 32-bit floating point representation is not the most optimal
way of storage. While it preserves the precision the network was originally trained with,
it can be reduced significantly without high loss of accuracy.

\begin{figure}[htbp]
    \centerline{\includegraphics[width=\linewidth, keepaspectratio]{resources/clustering}}
    \caption{Weight quantization using clustering \cite{Han2015DeepCC}}
    \label{clustering}
\end{figure}

A key concept for quantization of weights is \emph{clustering}. Fig.~\ref{clustering} illustrates
this process. Using this method, similar valued weights are grouped into clusters.
The weight matrix then only stores the cluster index instead of the actual 32-bit floating point value.
Each cluster has its own \emph{centroid} for which the index value in the weight matrix
gets substituted. The centroids are the mean of all weight-values belonging to this
respective cluster. Additionally, the centroids can be fine-tuned by applying clustering to the gradient
of the weight matrix aswell, and update the centroid value with the gradient-means.

This way, only a lookup-array with the cluster centroids and the respective indices in the weight matrix
need to be stored. In the example in Fig.~\ref{clustering} a $4 \times 4$ matrix of 32-bit floating point values
($512$ bit in total) can be reduced to a $4 \times 4$ matrix of 2-bit indices and a 4-value array of 32-bit floating point values
containing the centroids ($4 \cdot 4 \cdot 2 + 4 \cdot 32 = 160$ bit in total).

Experiments have shown that using this quantization method only, compression factors of $37$x, $31$x and
$33$x can be archieved on AlexNet, VGG-16 and LeNet-5, while the loss in accuracy was
within tolerance \cite{Han2015DeepCC}.

Technically, this compression method is not lossless, as it generalizes similar weights to their centroids.
So it comes down to the selection of the right centroids and which values to group into. If this is done efficiently,
the impact on accuracy is not measurable. For that, the k-means-algorithm is used. It minimizes
the \emph{Within-Cluster Sum of Squares} (\emph{WCSS}):

\begin{align}
    \text{argmin}_C \sum\limits_{i=1}^{k} \sum\limits_{\omega \in c_i} | \omega - c_i |^2
\end{align}

The k-means algorithm performs the following steps:

\begin{enumerate}
    \item Initialize $k$ cluster centroids
    \item For all weights:
    \begin{enumerate}
        \item Assign weight to the cluster with nearest centroid
        \item Recalculate cluster centroids
    \end{enumerate}
\end{enumerate}

A crucial degree of freedom for this is the initialization of the centroids. It determines
how fast the algorithm converges and to which values. There are different initialization methods.

\begin{figure}[htbp]
    \centerline{\includegraphics[width=\linewidth, keepaspectratio]{resources/centroid_initialization}}
    \caption{Different centroid initialization methods \cite{Han2015DeepCC}}
    \label{init}
\end{figure}

Fig.~\ref{init} depicts the different possible methods for centroid initialization.
Weight value probability density function (PDF) and cumulative distribution function (CDF) are plotted.
Note the peaks for low magnitude weights around 0.
Centroids can be initialized randomly, linear, or by linear sampling of the CDF values.
Generally, sampling the CDF values linearly and taking the corresponding weight values as centroids
minimizes the total error in weight values compared to their centroids. While this makes sense in most cases,
it does not in the case of DNN-weights. Here, weights with high value have a higher impact on the output,
but are underrepresented by occurrence using this method. This would lead to high value-centroid differences for those values,
because there are less centroids used. Because of this, linear initialization in the value domain has been established as the
best initialization method \cite{Han2015DeepCC}.

\subsection{Huffman encoding}\label{sec:huffman}

Another compression method that can be applied to DNNs is the Huffman encoding.

\begin{figure}[htbp]
    \centerline{\includegraphics[width=\linewidth, keepaspectratio]{resources/huffman}}
    \caption{Huffman encoding example}
    \label{huffman}
\end{figure}

Fig.~\ref{huffman} shows an encoding example. As a general principle, weights are encoded
with variable length in bits. Weights with many occurrences are encoded with less bits than
weights with less occurrences. The code book is stored as a binary tree, with edges encoding
$0$s and $1$s. The algorithm is the following:

\begin{enumerate}
    \item Count all weights and sort them by number of occurrence
    \item Take the two weights with lowest number of occurrence and make them leafs of the tree
    \item Connect the two nodes of the current layer together to a parent node
    \item Take the weight with the next lowest occurrence and make it a leaf on the current layer
    \item Go to step 3 or abort if no weight values are left
\end{enumerate}

Weights are then encoded by the edges leading to their respective leafs. E.g. in Fig.~\ref{huffman}
the weight value $0$ is encoded as bit $0$, however weight value $2$ is encoded as bits $101$.
In this particular example, a compression rate of $74.05$\% is archieved.

This compression method is especially efficient for models, in which one weight value is dominant.
When combined with other compression methods (e.g. pruning), which result in sparse matrices,
this is exactly the case with 0-values.

Experiments have shown that when used together with pruning and weight quantization,
Huffman encoding archieves $35$x - $49$x compression rate \cite{Han2015DeepCC}.

Another remarkable advantage of this compression method is that it is lossless and has therefore
no impact on the accuracy of the DNN.

\subsection{HashNets}\label{sec:hashnets}

A relatively recent compression/optimization technique for weights of DNNs are HashNets \cite{10.5555/3045118.3045361}.
Using HashNets, no actual values need to be stored in the weight matrix (not even index values),
so the memory access for accessing these values can be completely omitted.
Instead, the values are depicted via a \emph{one-way hash function} $h$.

\begin{figure}[htbp]
    \centerline{\includegraphics[width=\linewidth, keepaspectratio]{resources/hashnets}}
    \caption{HashNets encoding (based on \cite{10.5555/3045118.3045361})}
    \label{hashnets}
\end{figure}

Fig.~\ref{hashnets} shows this principle. The real weight values are stored in an indexed
array. The hash function $h(i,j) := (i,j) \mapsto x$ takes the weight matrix indices $(i,j)$
and computes the index $x$ for the lookup array, so that the lookup value of the array corresponds
to the actual value of the \emph{virtual weight matrix} $\textbf{V}$:

\begin{align}
    w^{\ell}_{h^{\ell}(i, j)} = \textbf{V}^{\ell}_{ij}
\end{align}

This way, the weight matrix only exists virtually. Only the dimensions of the weight matrix and
the weight lookup array need to be stored, and there is no additional index lookup
neccessary for the weight matrix. This comes to the cost of computing the hash function,
but as already discussed in section~\ref{sec:memory} computation is not the limiting factor
when compared to memory accesses.

The compression factor for this method can be variably chosen, depending on accuracy requirements.
The degree of freedom here is the quantization method (how much different values to quantisize to),
thus, the resulting weight array. Also, different hash functions can be used.

\cite{10.5555/3045118.3045361} discusses effects of compression factors of up to $64$x, and uses
\emph{xxHash} as hash function.

\subsection{Sparse matrix storage}

When it comes to compression of weight matrices, the storage of the resulting sparse matrix needs
to be considered aswell. The \emph{Compressed sparse column} (\emph{CSC}) / \emph{Compressed sparse row} (\emph{CSR})
format has been established for this purpose.

With this method, a row/column $W_j$ of weight values is encoded as vectors $v$ and $z$.
$v$ contains the non-zero weight values of the column. $z$ is of equal length than $v$ and denotes
the amount of $0$s before the corresponding entry in $v$.
E.g. column $[0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3]$ gets encoded as
$v = [1, 2, 0, 3]$ and $z = [2, 0, 15, 2]$. An additional padding value of $0$ has been added after
$15$ concurrent $0$s, because the entries are assumed to be of 4-bit integer representation, and there are more than
$15$ concurrent $0$s after the value $2$.

For this example, the method needs storage for $8$ values instead of $23$, which equals a compression rate of
$34.78$\%. This method is also used by the EIE and optimized by its architecture.

\section{EIE implementation}\label{sec:implementation}

\begin{figure*}[t]
    \centering
    \includegraphics[width=\textwidth]{resources/eie_hw}
    \caption{Hardware architecture of the EIE \cite{10.1109/ISCA.2016.30}}
    \label{eie}
\end{figure*}

In general, the EIE is an approach to optimize the per-activation formula of DNNs (Equ.~\ref{mac}):

\begin{align}
    & b_i = f(\sum\limits_{j=0}^{n-1} W_{ij} a_j) \overset{!}{=} \text{ReLU}(\sum\limits_{j \in X_i \cap Y} S[I_{ij}] a_j) \\
    & X_i := \text{Set of columns, skipping $W_{ij} = 0$} \nonumber \\
    & Y := \text{Set of indices in $a$ for which $a_j \neq 0$} \nonumber \\
    & I_{ij} := \text{4-bit index} \nonumber \\
    & S := \text{Shared lookup table} \nonumber
\end{align}

As activation function $f$ the \emph{Rectified Linear Unit} function (\emph{ReLU}) is fixed.
It is defined as $0$ for negative values while keeping positive values unchanged.
The indices $I_{ij}$ decompressed from CSC-format are used to lookup the actual weights from a shared weight lookup table $S$.
Values of $0$ as weights or inputs are skipped for multiplication and accumulation,
as they do not need to be computed.

Fig.~\ref{eie} shows a block diagram of the hardware architecture of the EIE.
The left side (a) is the \emph{leading non-zero detection unit} (\emph{LNZD}), which detects $0$-values in the input
vector and skips them, while only forwarding non-zero values. The right side (b) is the actual \emph{processing element} (\emph{PE}), which
receives the non-zero input values.

For parallelism, there are generally multiple PEs working independently. $4$ PEs are grouped together and share one
LNZD-unit.

\subsection{Leading non-zero detection unit}

Input activations are distributed to each processing element (PE) in a hierarchical manner.
To utilize the sparsity of the input vector, a leading non-zero detection logic is employed to identify the first non-zero result.
A group of four PEs perform local leading non-zero detection on their respective inputs and the results are sent
to a LNZD node.

\subsection{Activation queue}

The non-zero elements of the input activation vector $a_j$, along with their corresponding indexes $j$ are broadcasted
to an activation queue in each PE. However, if any PE has a full queue, the broadcast is disabled. Each PE processes
the activation at the front of its queue at any given time. The activation queue enables each PE to accumulate
some values and indices to balance out any load imbalances that may occur due to variations in the number of non-zeros
in a particular column $j$ among the PEs.

\subsection{Pointer Read Unit}

To efficiently access the $v$ and $z$ arrays for a given column $j$ using single-ported SRAM arrays,
the index $j$ at the head of the activation queue is used to look up the start and end pointers ($p_j$ and $p_{j+1}$)
for that column. These are seperately stored along with $v$ and $z$ to mark the iteration boundaries.
To ensure that both pointers can be read in a single cycle, the 16-bit pointers are stored in
two separate SRAM banks and the least significant bit (LSB) of the address is used to select the appropriate bank.
As a result, $p_j$ and $p_{j+1}$ will always be located in different banks.

\subsection{Sparse Matrix Read Unit}

The Sparse Matrix Read Unit reads out the actual non-zero weight values from the SRAM,
for the part of the column $W_j$ associated with the respective PE. It uses the pointers
$p_j$ and $p_{j+1}$ fetched before as reading boundaries.

As the SRAM is of 64-bit word length and an entry in $v$ and $z$ is encoded in 4-bit each,
$8$ complete values can be read out in one cycle. This $(v, z)$ pair is then forwarded
to the arithmetic unit.

\subsection{Arithmetic Unit}

The arithmetic unit receives the $(v,z)$ value pair. It calculates the absolute index
$x$ in the output vector $b$ from the relative index $z$ and calculates the multiplication
$b_x = b_x + v \cdot a_j$ for which the output $b_x$ is the accumulation register.

The values are stored in an encoded and quantisized form, so for calculation they are expanded
to 16-bit floating point representation first.

\subsection{Activation Read/Write Unit}

The Activation Read/Write Unit handles the values of the output vector. The target values are buffered
in two register files. For feed-forward layers in NNs the output values of one layer
are the input files of the next layer, so they exchange their roles after each layer to
save memory accesses to the SRAM.

Access to SRAM is only occurring if the buffer is full. Then the entire buffer batch is written
to the SRAM.

\subsection{Weight value distribution}

An important factor is how to distribute the individual values of the weight matrix
to the individual PEs.

\begin{figure}[htbp]
    \centerline{\includegraphics[width=\linewidth, keepaspectratio]{resources/eie_matrix}}
    \centerline{\includegraphics[width=\linewidth, keepaspectratio]{resources/eie_layout}}
    \caption{Weight matrix segmentation and memory layout \cite{10.1109/ISCA.2016.30}}
    \label{eie_segmentation}
\end{figure}

Fig.~\ref{eie_segmentation} illustrates how the values are distributed to 4 PEs, and the corresponding
memory layout for PE0. Each row belongs to one PE and are distributed sequentially,
repeating after 4 rows. Each PE calculates one element $b_x$ of the output vector $b$ with one entry
of the input vector $a_j$ at a time. The individual PEs perform the MACs for each assigned input entry,
skipping $0$-valued entries using the non-zero detection units.

On the bottom of Fig.~\ref{eie_segmentation} the memory layout for PE0 is depicted. Each PE stores
the non-zero values assigned to it, together with the relative indices $z$ encoding the number
of zeros before each element in $v$, but only for the respective PE.
To know the iteration boundaries, the column pointers are stored seperately. In the example above,
the first column has pointer $0$ (because it is the first entry in total). The second entry has pointer $3$,
because this PE has $3$ non-zero values assigned to it in the first column.

\section{Evaluation and comparison}\label{sec:eval}

\begin{figure*}[t]
    \centering
    \includegraphics[width=\textwidth]{resources/eval_speed_png} \\
    \vspace{0.5cm}
    \includegraphics[width=\textwidth, keepaspectratio]{resources/eval_energy_png}
    \caption{Speedup and energy efficiency comparison \cite{10.1109/ISCA.2016.30}}
    \label{eval_speed_energy}
\end{figure*}

Fig.~\ref{eval_speed_energy} displays the speed and energy comparison with standard
hardware components \emph{CPU}, \emph{GPU} and \emph{mGPU}. For benchmarking, different layers of different DNN models are used.
\emph{Alex-6}, \emph{Alex-7}, \emph{Alex-8} are layers of the AlexNet.
\emph{VGG-6}, \emph{VGG-7}, \emph{VGG-8} are layers of the VGG-Net (VGG: Visual Geometry Group).
\emph{NT-We}, \emph{NT-Wd}, \emph{NT-LSTM} are layers of the NeuralTalk net.
Speedup and energy efficiency is measured on the three platforms for each of those layers with and
without compression. The baseline is the inference using CPU on the uncompressed model.

\subsection{Methodology}

\subsubsection{Hardware platforms}

For the CPU an \emph{Intel Core i7 5930k} is used.
As GPU a \emph{NVIDIA GeForce GTX Titan X} is used.
As mGPU a \emph{NVIDIA Tegra K1} is used.
All of them come with their own power reporting tools, used to measure the energy consumption
and speed.

\subsubsection{Speed}

The speed is measured with the following formula:

\begin{align}
    \text{speed} = \frac{\text{workload}}{\text{peak throughput}}, [\text{Frames}/\text{s}]
\end{align}

Batch sizes of 1 are chosen, because the EIE is targeting real-time applications
with low latency. In these environments, low batch sizes are the most common.

\subsubsection{Energy efficiency}

The energy efficiency is measured with the following formula:

\begin{align}
    \text{eff} = \frac{\text{average power consumption} \cdot \text{duration}}{\text{workload}}, [\text{Frames}/\text{J}]
\end{align}

\subsection{Results}

The EIE has a speedup factor of $189$x, $13$x, $307$x compared to CPU, GPU and mGPU on the compressed models.
Theoretically, when compared with uncompressed inference on standard architecture, the compression rate must
be factorized: compressed inference speed of $103$ GOP/s correspond to uncompressed inference speed of $3$ TOP/s.
However, in practice compression only yields a speedup of $3$x after compression for standard platforms.
This shows the impact of the dedicated hardware architecture to handle compressed models.

The EIE is $24000$x, $3400$x, $2700$x more energy efficient compared to CPU, GPU and mGPU on the compressed models.
Remarkably, compression yields little to no benefit for the standard hardware architectures, while it does
on a large scale stepping to EIE. The main reasons for this energy efficiency benefit are the change in memory technology
from DRAM to SRAM, reduction of memory accesses through compression and the storage of weights in
compressed sparse column representation.

\subsection{Comparison with other hardware accelerators}

\begin{figure*}[t]
    \centering
    \includegraphics[width=\textwidth]{resources/accelerators_table}
    \caption{EIE compared with different DNN hardware accelerators \cite{10.1109/ISCA.2016.30}}
    \label{accelerators}
\end{figure*}

Fig.~\ref{accelerators} shows a comparison between multiple hardware accelerators for inference of
DNNs, namely A-Eye \cite{10.1145/2847263.2847265}, DaDianNao \cite{7011421} and TrueNorth \cite{10.1126/science.1254642} (amongst general purpose platforms).

\subsubsection{A-Eye}

A-Eye is a hardware accelerator targeting computational-centric parts of DNNs, namely the convolutional layers,
which make up more than 90\% of computational cost \cite{9082126}. It does not approach the problems
considered here with memory accesses of the fully connected layers. It also stores the main portion of the weights
on external DDR3-DRAM and uses SRAM just as internal buffer. Though it uses efficient pooling to maximize
the benefit of burst read-outs, it is not as efficient as it would be if its fully implemented in SRAM technology.
Additionally, it is implemented on FPGA (Xilinx Zynq XC7Z045), therefore it lacks energy efficiency compared
to ASICs like the EIE.

It has an overall performance of $136.97$ GOP/s ($33$ Frames/s) and an energy efficiency of $14.22$ GOP/s/W ($3.43$ Frames/J) \cite{10.1145/2847263.2847265}.

\subsubsection{DaDianNao}

DaDianNao is a hardware accelerator which focuses on both, computational cost and memory accesses.
For this purpose, it has embedded (on-chip) DRAM for parameter storage. Doing so, it archieves
$450.65$x speedup compared to GPU. However, unlike the EIE, it is incapable of handling compressed
DNNs and its main memory is still based on DRAM technology, while SRAM would be much faster.
Benefits of this accelerator is the scalability. It consists of multiple nodes of the same type,
and has been implemented in systems of up to 64 nodes, while this can be extended even further.

With this 64-chip system it has a throughput of $147938$ Frames/s and an energy efficiency of $9263$ Frames/J.
While the speed is better on a large scale due to its scalability, it has a bad energy efficiency.

\subsubsection{TrueNorth}

The TrueNorth supercomputer is a non-von Neumann system with transistor-based programmable neurons.
This way, it overcomes the memory bottleneck, and technically the speed is comparable to SRAM accesses.
For a standard VGA video at $30$ FPS, the chip consumes only $63$mW, which gives it a high energy efficiency
of $10839$ Frames/J, compared to other hardware accelerators. However, the EIE is even better in energy efficiency by a
factor of $13-18$, depending on process size and number of PEs.
Also, due to its specialized architecture with programmable neurons, it has a bad area efficiency of only $4.63$ Frames/s/$\text{mm}^2$,
which is only $\sim 0.23$\% of EIE. The throughput is relatively small compared to EIE, because it is also
unable to handle compressed DNNs.

\section{Future work}\label{sec:future}

Different compression algorithms have been presented in section~\ref{sec:compression}.
Not all of them are used for the EIE implementation. Some of them are orthogonal to the used compression methods,
so they can be implemented and applied to the DNN without interference with the EIE.
The different pruning strategies are an example for that. Other compression methods need an adjustment of the hardware architecture to different extends.

For Huffman encoding (section~\ref{sec:huffman}) the different bit widths of the weights need to be handled
by the hardware to fully exploit the possible compression ratio. Also, huffman tree lookups can be optimized
to reduce the number of memory accesses. All in all this is a promising optimization with
a lossless compression factor of up to $49$x.

Another promising method are HashNets (section~\ref{sec:hashnets}). They omit the index lookup from the matrix entirely,
and just compute lookup indices from hash functions.
These hash functions need to be implemented in hardware to be efficient considering energy and speed,
but it is technically possible \cite{10.5555/3045118.3045361}. A benefit of this method is the adjustable compression factor of up to
$64$x, depending on the accuracy constraints (this compression method is not lossless). This way, the architecture,
or at least the usage of it, can be adjusted to the users needs.

Further optimization methods are technology based. For computation, MAC operations can be outsourced to the memory,
performing in-memory computation \cite{MUTLU201928}. This would make a large portion of data transfers obsolete, which increases throughput and
energy efficiency. Also, the existing ALU-implementation can be replaced by approximating circuits \cite{1274006},
to the cost of a less accurate system, but another increase in speed and energy efficiency.

\bibliographystyle{IEEEtran}
\bibliography{Paper/references}

\end{document}