Neural Networks: From Basics to Application
Author
Kim Woo Hyun
License
Creative Commons CC BY 4.0
Abstract
Explaining neural networks through painting style transfer
\documentclass{beamer}
% For more themes, color themes and font themes, see:
% http://deic.uab.es/~iblanes/beamer_gallery/index_by_theme.html
%
\mode<presentation>
{
\usetheme{Madrid} % or try default, Darmstadt, Warsaw, ...
\usecolortheme{default} % or try albatross, beaver, crane, ...
\usefonttheme{serif} % or try default, structurebold, ...
\setbeamertemplate{navigation symbols}{}
\setbeamertemplate{caption}[numbered]
}
\usepackage[english]{babel}
\usepackage[utf8]{inputenc}
\usepackage{chemfig}
\usepackage[version=3]{mhchem}
\usepackage{wrapfig}
% On Overleaf, these lines give you sharper preview images.
% You might want to comment them out before you export, though.
\usepackage{pgfpages}
\pgfpagesuselayout{resize to}[%
physical paper width=8in, physical paper height=6in]
% Here's where the presentation starts, with the info for the title slide
\title[Seminar]{Neural Networks: From Basics to Application}
\subtitle{(painting style transfer)}
\author{Kim Woo Hyun}
\date{\today}
\begin{document}
\begin{frame}
\titlepage
\end{frame}
% These three lines create an automatically generated table of contents.
\begin{frame}{Outline}
\tableofcontents
\end{frame}
\section{Neural Network}
\subsection{First Generation (ANN, Perceptron)}
\begin{frame}{First Generation}
\begin{block}{Artificial Neural Network : ANN}
In 1943, \textbf{\textit{Warren S. McCulloch}} and \textbf{\textit{Walter Pitts}} proposed the first mathematical model of a neuron.
\end{block}
\centering
\includegraphics[scale=0.5]{ANN.PNG}
\begin{itemize}
\item Mimics the structure of biological neurons by connecting switch-like units.
\end{itemize}
\end{frame}
\begin{frame}{First Generation}
\begin{block}{Perceptron}
In 1958, \textbf{\textit{Frank Rosenblatt}} proposed the perceptron, a linear classifier.
\end{block}
\includegraphics[scale=0.2]{1_neuron.png}
\includegraphics[scale=0.2]{1_neuron_model.jpeg}
\begin{itemize}
\item At the time, it raised expectations that computers could learn tasks humans do well.
\item Its basic structure has remained unchanged to this day.
\item Uses the sigmoid as an \textbf{activation function}
(maps the output into $[0,1]$); see the sketch on the next slide.
\end{itemize}
\end{frame}
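\begin{frame}[fragile]{First Generation}
A minimal sketch of a single sigmoid neuron as described above, assuming NumPy; the weights, bias and input are made-up values.
\begin{verbatim}
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

# made-up example: 3 inputs, 3 weights, 1 bias
x = np.array([0.5, -1.0, 2.0])
w = np.array([0.4, 0.7, -0.2])
b = 0.1

out = sigmoid(np.dot(w, x) + b)   # output lies in (0, 1)
print(out)
\end{verbatim}
\end{frame}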
\begin{frame}{First Generation}
\begin{block}{Problem}
In 1969, \textbf{\textit{Marvin Minsky}} and \textbf{\textit{Seymour Papert}} proved the limitations of the perceptron.
\end{block}
\includegraphics[scale = 0.3]{1_minsky_book.jpg}
\includegraphics[scale = 0.6]{1_xor_unsolve.PNG}
It cannot even solve the XOR problem.
\end{frame}
\subsection{Second Generation (MLP, Back-propagation)}
\begin{frame}{Second Generation}
\begin{block}{Multi-Layer Perceptron : MLP}
Make the network deeper by adding \textbf{hidden layers} of perceptrons.
\end{block}
\includegraphics[scale = 0.3]{1_xor_solve.PNG}
\includegraphics[scale = 0.2]{1_MLP}
\begin{itemize}
\item Solves non-linear problems by combining multiple linear classifiers.
\item \textbf{Too many parameters!!}
\item Needs an algorithm to tune the parameters.
\end{itemize}
\end{frame}
\begin{frame}{Second Generation}
\begin{block}{Back-propagation}
A feedback algorithm that adjusts the weights of the neural network.
\end{block}
\centering
\includegraphics[scale = 0.5]{1_BackP.png}
\begin{itemize}
\item $i$ : input layer
\item $h$ : hidden layer
\item $o$ : output layer
\item $w_{ij}$ : weight of the connection from neuron $i$ to neuron $j$.
\end{itemize}
\end{frame}
\begin{frame}{Second Generation}
\includegraphics[scale = 0.4]{1_BackP.png}
\includegraphics[scale = 0.3]{1_sigmoid.png}
\begin{itemize}
\item $out$ : output value of a neuron.
\item $in$ : weighted sum of the outputs of the connected neurons ($in = \sum w*out$).
\item $t$ : target value (chosen by you).
\item \textbf{Sigmoid} activation function, e.g. $out_{h3} = \sigma(in_{h3}) = \frac{1}{1+e^{-in_{h3}}}$
\end{itemize}
\end{frame}
\begin{frame}{Second Generation}
Error as a sum of squares (squared Euclidean distance):
\[E = \frac{1}{2}(t_5-out_{o5})^2 + \frac{1}{2}(t_6-out_{o6})^2\]
We want to see how much each weight influences $E$ $\Rightarrow$ calculate $\frac{\partial E}{\partial w_{ij}}$.
Example) Calculate $\frac{\partial E}{\partial w_{35}}$ with the \textbf{chain rule}:
\[\frac{\partial E}{\partial w_{35}} = \frac{\partial E}{\partial out_{o5}}*\frac{\partial out_{o5}}{\partial in_{o5}}*\frac{\partial in_{o5}}{\partial w_{35}}\]
\centering
\includegraphics[scale = 0.4]{1_BackP.png}
\end{frame}
\begin{frame}{Second Generation}
First, \[\frac{\partial E}{\partial out_{o5}} = \frac{\partial }{\partial out_{o5}}\left [ \frac{1}{2}(t_5-out_{o5})^2 + \frac{1}{2}(t_6-out_{o6})^2 \right ] = out_{o5}-t_5\]
Second, \[\frac{\partial out_{o5}}{\partial in_{o5}}= \frac{\partial \sigma(in_{o5}) }{\partial in_{o5}}\]
\end{frame}
\begin{frame}{Second Generation}
The sigmoid function $\sigma(x)$, with slope parameter $a$ (we use $a=1$), is
\[\sigma(x) = \frac{1}{1+e^{-ax}}\]
Its derivative is
\begin{align*}
\sigma'(x) &= \frac{ae^{-ax}}{(1+e^{-ax})^2} \\
&= a\frac{1}{(1+e^{-ax})}\frac{e^{-ax}}{(1+e^{-ax})} \\
&= a\frac{1}{(1+e^{-ax})}\left( 1- \frac{1}{(1+e^{-ax})} \right ) \\
&= a\sigma(x)(1-\sigma(x))
\end{align*}
\end{frame}
\begin{frame}{Second Generation}
First, \[\frac{\partial E}{\partial out_{o5}} = \frac{\partial }{\partial out_{o5}}\left [ \frac{1}{2}(t_5-out_{o5})^2 + \frac{1}{2}(t_6-out_{o6})^2 \right ] = out_{o5}-t_5\]
Second, \[\frac{\partial out_{o5}}{\partial in_{o5}}= \frac{\partial \sigma(in_{o5}) }{\partial in_{o5}} = \sigma(in_{o5})(1-\sigma(in_{o5})) = out_{o5}(1-out_{o5})\]
\end{frame}
\begin{frame}{Second Generation}
First, \[\frac{\partial E}{\partial out_{o5}} = \frac{\partial }{\partial out_{o5}}\left [ \frac{1}{2}(t_5-out_{o5})^2 + \frac{1}{2}(t_6-out_{o6})^2 \right ] = out_{o5}-t_5\]
Second, \[\frac{\partial out_{o5}}{\partial in_{o5}}= \frac{\partial \sigma(in_{o5}) }{\partial in_{o5}} = \sigma(in_{o5})(1-\sigma(in_{o5})) = out_{o5}(1-out_{o5})\]
Third,
\[\frac{\partial in_{o5}}{\partial w_{35}} = \frac{\partial (out_{h3}*w_{35})}{\partial w_{35}} = out_{h3}\]
Finally,
\[\frac{\partial E}{\partial w_{35}} = (out_{o5}-t_5)(1-out_{o5})out_{o5}out_{h3}\]
\begin{block}{}
Conveniently, every quantity in this expression is already computed during the forward pass, so the remaining work is simple arithmetic (a numerical sketch follows on the next slide).
\end{block}
\end{frame}
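\begin{frame}[fragile]{Second Generation}
A minimal numerical sketch of the chain rule for $\frac{\partial E}{\partial w_{35}}$ in plain Python; the forward-pass values below are made up.
\begin{verbatim}
# made-up forward-pass values
out_h3, out_o5, t5 = 0.6, 0.75, 1.0

dE_dout  = out_o5 - t5              # dE / d out_o5
dout_din = out_o5 * (1 - out_o5)    # sigmoid derivative (a = 1)
din_dw   = out_h3                   # d in_o5 / d w_35

dE_dw35 = dE_dout * dout_din * din_dw
print(dE_dw35)                      # -0.028125
\end{verbatim}
\end{frame}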
\begin{frame}{Second Generation}
Then, how do we update the weights?
\[w := w - r\frac{\partial E}{\partial w} \text{, where $r$ is a constant called the learning rate.}\]
So, the updated $w_{35}$ is
\[w_{35} := w_{35} - r(out_{o5}-t_5)(1-out_{o5})out_{o5}out_{h3}\]
This method is called \textbf{gradient descent} (a worked example with made-up numbers follows on the next slide).
\centering
\includegraphics[scale = 0.16]{1_Gradient_descent.png}
\end{frame}
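\begin{frame}{Second Generation}
A worked example with the same made-up numbers as the earlier sketch: take $out_{o5}=0.75$, $t_5=1$, $out_{h3}=0.6$, $w_{35}=0.4$ and $r=0.5$. Then
\[\frac{\partial E}{\partial w_{35}} = (0.75-1)(1-0.75)(0.75)(0.6) = -0.028125\]
\[w_{35} := 0.4 - 0.5\cdot(-0.028125) = 0.4140625 \approx 0.414\]
The weight moves in the direction that decreases $E$.
\end{frame}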
\begin{frame}{Second Generation}
\begin{block}{Gradient descent}
Simply put: move in the direction orthogonal to the contour line.
\end{block}
\textit{Why the orthogonal direction?}
At a minimum point of $f(x,y)$,
\[\nabla f(x,y) = \left( \frac{\partial f}{\partial x}, \frac{\partial f}{\partial y} \right ) = 0\]
Assume the direction along the contour line is $(a,b)$. Then, using a \textbf{Taylor series}, linearize $f$ around the point $(x_1,y_1)$:
\[f(x_1+a,y_1+b) \simeq f(x_1,y_1) + \frac{\partial f}{\partial x}a + \frac{\partial f}{\partial y}b + \dots\]
The condition on $(a,b)$ for staying on the contour line (no first-order change in $f$) is
\[\frac{\partial f}{\partial x}a + \frac{\partial f}{\partial y}b = 0 \]
\end{frame}
\begin{frame}{Second Generation}
Let $a = \frac{\partial f}{\partial y}$ and $b = -\frac{\partial f}{\partial x}$. Then
\[\frac{\partial f}{\partial x}a + \frac{\partial f}{\partial y}b = \frac{\partial f}{\partial x}\frac{\partial f}{\partial y} + \frac{\partial f}{\partial y}\left(-\frac{\partial f}{\partial x}\right) = 0\]
In addition, the inner product of the gradient and $(a,b)$ is
\[(\nabla f(x,y))\cdot (a,b) = \left (\frac{\partial f}{\partial x} ,\frac{\partial f}{\partial y} \right )\cdot \left ( \frac{\partial f}{\partial y} ,-\frac{\partial f}{\partial x} \right ) = 0\]
\begin{block}{}
This means the vector orthogonal to the contour line is the gradient itself. If we follow the (negative) gradient until it becomes 0, we can find a minimum point.
\end{block}
*Caution: the point found can be a saddle point rather than a minimum; we will not discuss that case here.
\end{frame}
\begin{frame}{Second Generation}
Problems
\begin{itemize}
\item Gradient descent performs poorly on non-convex functions, and the sigmoid is non-convex:
\[\sigma''(x) = a^{2}\sigma(x)(1-\sigma(x))(1-2\sigma(x))\]
\[a^{2}\sigma(x)(1-\sigma(x)) \geq 0 \text{ but } -1 \leq 1-2\sigma(x) \leq 1\]
\item The cost of back-propagation is high.
\item Vanishing Gradient Problem.
\end{itemize}
\end{frame}
\begin{frame}{Second Generation}
\begin{block}{Cost of back-propagation.}
The cost is high for the shallow (early) layers.
\end{block}
For example,
\[\frac{\partial E}{\partial w_{13}} = \frac{\partial E}{\partial out_{h3}}*\frac{\partial out_{h3}}{\partial in_{h3}}*\frac{\partial in_{h3}}{\partial w_{13}}\]
\[\vdots\]
\[= \left [(out_{o5}-t_5)\{out_{o5}(1-out_{o5})\}w_{35} + (out_{o6}-t_6)\{out_{o6}(1-out_{o6})\}w_{36}\right ]\]
\[*(1-out_{h3})*out_{h3}*out_{i1}\]
Of course, since it is a chain-rule algorithm, this is easier than it looks. But what happens when the network is very large?
\end{frame}
\begin{frame}{Second Generation}
\begin{block}{Vanishing Gradient Problem}
Because of the sigmoid function, the gradient shrinks toward 0 as back-propagation is repeated through the layers.
\end{block}
\centering
\includegraphics[scale = 0.5]{1_sigmoid.png}
\end{frame}
\subsection{Third Generation (ReLU)}
\begin{frame}{Third Generation}
\centering
\includegraphics[scale = 0.25]{1_relu.png}
\begin{block}{Rectified Linear Unit : ReLU}
\begin{itemize}
\item Convex: well suited to gradient descent.
\item The cost of back-propagation decreases (since $f'(x)$ is always 1 or 0).
\item Safe from the vanishing gradient problem.
\end{itemize}
\end{block}
All of these problems stemmed from a poor choice of activation function (a small code sketch follows on the next slide).
\end{frame}
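\begin{frame}[fragile]{Third Generation}
A minimal sketch contrasting the ReLU and sigmoid derivatives, assuming NumPy; the sample inputs are made up.
\begin{verbatim}
import numpy as np

def relu(x):       return np.maximum(0.0, x)
def relu_grad(x):  return (x > 0).astype(float)   # always 1 or 0

def sigmoid(x):    return 1.0 / (1.0 + np.exp(-x))
def sigmoid_grad(x):
    s = sigmoid(x)
    return s * (1.0 - s)                           # at most 0.25

x = np.array([-3.0, 0.5, 4.0])
print(relu_grad(x))     # [0. 1. 1.]
print(sigmoid_grad(x))  # values below 0.25; products of them vanish quickly
\end{verbatim}
\end{frame}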
\begin{frame}{Third Generation}
\centering
\includegraphics[scale = 0.5]{1_Bad_Act.PNG}
Notice the gap between tanh and ReLU.
\end{frame}
\section{Convolutional Neural Network}
\begin{frame}
Section 2. Convolutional Neural Network
\begin{itemize}
\item Convolution layer
\item ReLU layer
\item Pooling layer
\item Fully Connected layer
\end{itemize}
\end{frame}
\subsection{Convolution layer}
\begin{frame}{Convolution layer}
\begin{block}{2D Convolution}
Nothing fundamentally different from 1D convolution.
\end{block}
\begin{columns}
\begin{column}{0.4\textwidth}
\includegraphics[scale = 0.5]{2_simple_conv.JPG}
\end{column}
\begin{column}{0.6\textwidth}
\begin{itemize}
\item Input size = 7x7x1
\item Filter size = 3x3
\item Number of filters = 1
\item Stride = 1
\end{itemize}
\end{column}
\end{columns}
\end{frame}
\begin{frame}{Convolution layer}
\begin{block}{What does the filter do?}
Assume the weights are already trained.
\end{block}
\centering
\includegraphics[scale = 0.5]{2_line_filter.png}
Curve detection filter and its visualization.
\end{frame}
\begin{frame}{Filter}
\centering
\includegraphics[scale = 0.5]{2_Mice.png}
\includegraphics[scale = 0.5]{2_mice_hip.png}
\begin{block}{}
If part of the original image has a shape similar to the filter, the multiply-and-sum result is a large number.
\end{block}
\end{frame}
\begin{frame}{Filter}
\centering
\includegraphics[scale = 0.5]{2_mice_conv.png}
\begin{block}{}
In contrast, if not, the result is a small number.
\end{block}
\begin{block}{}
A trained filter can \textbf{give a score} for whether a feature exists or not!
\end{block}
\end{frame}
\begin{frame}{Filter}
\centering
\includegraphics[scale = 0.5]{2_ActMap.png}
The scores are collected together and form an activation map through convolution.
\end{frame}
\begin{frame}{Padding}
\centering
\includegraphics[scale = 0.4]{3_padding.png}
\begin{block}{}
\begin{itemize}
\item Attach zeros around the layer \ \ (zero-padding).
\item Prevents the size from decreasing during convolution.
\item Captures features at the edges in more detail.
\end{itemize}
\end{block}
\end{frame}
\begin{frame}{Convolution layer}
\begin{block}{Convolution}
W = width, H = height, D = depth, P = padding, S = stride,
F = filter width/height, N = number of filters.
\end{block}
\begin{columns}[onlytextwidth]
\begin{column}{0.5\textwidth}
\centering
\includegraphics[scale=0.3]{2_conv.JPG}% Place your graphic here
\end{column}
\begin{column}{0.5\textwidth}
(5+2)x(5+2)x3 padded input (a 5x5x3 input with $P=1$)
Two 3x3x3 filters
$\Rightarrow$ output of size 3x3x2
\begin{itemize}
\item $W_{2} = \frac{W-F+2P}{S}+1 = \frac{5-3+2*1}{2}+1 = 3$
\item $H_{2} = \frac{H-F+2P}{S}+1 = \frac{5-3+2*1}{2}+1 = 3$
\item $D_{2} = N = 2$ \ \ \ (depth equals the number of filters)
\end{itemize}
\end{column}
\end{columns}
\end{frame}
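\begin{frame}[fragile]{Convolution layer}
A minimal sketch of the output-size formula in plain Python; the numbers reproduce the example on the previous slide.
\begin{verbatim}
def conv_output_size(W, F, P, S):
    # (W - F + 2P) / S + 1, which must come out as an integer
    return (W - F + 2 * P) // S + 1

W, F, P, S, N = 5, 3, 1, 2, 2        # 5x5x3 input, two 3x3x3 filters
W2 = conv_output_size(W, F, P, S)    # 3
H2 = conv_output_size(W, F, P, S)    # 3
D2 = N                               # depth = number of filters
print(W2, H2, D2)                    # 3 3 2
\end{verbatim}
\end{frame}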
\subsection{ReLU layer}
\begin{frame}{ReLU layer}
\begin{columns}[onlytextwidth]
\begin{column}{0.5\textwidth}
\centering
\includegraphics[scale = 0.35]{1_relu.png}
\end{column}
\begin{column}{0.5\textwidth}
\begin{itemize}
\item Outputs zero or the input itself.
\item Gives non-linearity and a threshold.
\item No parameters; no size change.
\end{itemize}
\end{column}
\end{columns}
\end{frame}
\begin{frame}{ReLU layer}
\begin{block}{Why do we need to add non-linearity?}
An experimental result is given below.
\end{block}
\centering
\includegraphics[scale=0.35]{2_why_NonL}
From an ImageNet classification test.
\end{frame}
\subsection{Pooling layer}
\begin{frame}{Pooling layer}
\includegraphics[scale = 0.25]{2_maxpool.jpeg}
\includegraphics[scale = 0.25]{2_dawnsampling.jpeg}
\begin{itemize}
\item Usually \textbf{max-pooling} is used (when higher values matter more).
\item No depth change.
\item \textbf{\textit{Reduces complexity (down-sampling)!}} Only $\frac{1}{4}$ of the values remain (a 75\% reduction).
\item Not necessary (but recommended).
\end{itemize}
\[W_{2} = \frac{W-F}{S}+1 = \frac{224-2}{2}+1 = 112\]
\end{frame}
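\begin{frame}[fragile]{Pooling layer}
A minimal sketch of 2x2 max-pooling with stride 2, assuming NumPy; the 4x4 input is a made-up example.
\begin{verbatim}
import numpy as np

x = np.array([[1., 1., 2., 4.],
              [5., 6., 7., 8.],
              [3., 2., 1., 0.],
              [1., 2., 3., 4.]])

# split into non-overlapping 2x2 blocks and take the max of each block
pooled = x.reshape(2, 2, 2, 2).max(axis=(1, 3))
print(pooled)   # [[6. 8.]
                #  [3. 4.]]
\end{verbatim}
\end{frame}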
\subsection{Fully Connected layer}
\begin{frame}{Fully Connected layer}
\centering
\includegraphics[scale = 0.6]{2_Fully.png}
\begin{itemize}
\item Flattens the 2D feature maps into a 1D vector.
\item Used to compare with the target.
\item There is more than one way to construct it.
\end{itemize}
\end{frame}
\section{Painting Style Transfer}
\begin{frame}
Section 3. Painting Style Transfer
\begin{itemize}
\item VGGnet
\item Algorithm and Loss function
\item Result
\end{itemize}
\end{frame}
\subsection{VGGnet}
\begin{frame}{VGGnet}
\centering
\includegraphics[scale = 1.8]{3_VGG_19.png}
\begin{block}{}
\begin{itemize}
\item $F_{conv} = 3 \ (3*3*D), S_{conv} = 1, Padding = 1$
\item $F_{Pool} = 2 \ (2*2*D), S_{pool} = 2$
\end{itemize}
\end{block}
\[\frac{W-F_{conv}+2P}{S_{conv}}+1 = \frac{224-3+2*1}{1}+1 = 224\]
\[\frac{W-F_{Pool}}{S_{pool}} + 1 = \frac{224-2}{2} + 1 = 112\]
\end{frame}
\begin{frame}{Painting style transfer}
\centering
\includegraphics[scale = 0.125]{3_Structure.jpg}
\begin{block}{}
\begin{itemize}
\item The weights must already be trained.
\item $a = $ style image, $p = $ content image
\item $x = $ generated image.
\end{itemize}
\end{block}
\end{frame}
\subsection{Algorithm and Loss function}
\begin{frame}{Painting style transfer}
\begin{block}{}
\begin{itemize}
\item $N_l = $ Number of feature maps of $l$th layer
\item $M_l = $ Size of feature map of $l$th layer
\item $F^l \in \mathcal{R}^{N_l*M_l}$
\item $F^{l}_{ij}$ is the activation of the $i^{th}$ filter at position $j$ in layer $l$
\item $P^{l}_{ij}$ is the same as $F^{l}_{ij}$, but computed from the content image (conv4\_2).
\end{itemize}
\end{block}
\[\mathcal{L}_{\text{content}}(\vec{p},\vec{x}, l)=\frac{1}{2}\sum_{i, j}(F_{ij}^{l}-P_{ij}^{l})^{2}.\]
\begin{block}{}
So this loss function minimizes the distance between the feature values at the same positions in the content image and the generated image.
\end{block}
\end{frame}
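\begin{frame}[fragile]{Painting style transfer}
A minimal sketch of the content loss, assuming NumPy; the feature maps $F^l$ and $P^l$ are random stand-ins.
\begin{verbatim}
import numpy as np

N_l, M_l = 4, 6                  # made-up layer sizes
F = np.random.rand(N_l, M_l)     # features of the generated image x
P = np.random.rand(N_l, M_l)     # features of the content image p

L_content = 0.5 * np.sum((F - P) ** 2)
print(L_content)
\end{verbatim}
\end{frame}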
\begin{frame}
\begin{block}{}
\begin{itemize}
\item $G^l \in \mathcal{R}^{N_l*N_l}$
\item $G^{l}_{ij}$ is the inner product between the vectorized feature maps $i$ and $j$ in layer $l$ (the Gram matrix of the style layer)
\item \[G_{ij}^{l}= \sum_{k}F_{ik}^{l}F_{jk}^{l}\]
\item $A^{l}_{ij}$ is the same as $G^{l}_{ij}$, but computed from the style image.
\end{itemize}
\end{block}
\[E_{l}= \frac{1}{4N_{l}^{2}M_{l}^{2}}\sum_{i, j}(G_{ij}^{l}-A_{ij}^{l})^{2}\]
\[\mathcal{L}_{\text{style}}(\vec{a},\vec{x})=\sum_{l=0}^{L}w_{l}E_{l}\]
\begin{block}{}
The authors' idea is that the style information is hidden in these correlations between feature maps, although the intuition is not obvious.
\end{block}
\end{frame}
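\begin{frame}[fragile]
A minimal sketch of the Gram matrix and the per-layer style loss $E_l$, assuming NumPy; the feature maps are random stand-ins.
\begin{verbatim}
import numpy as np

N_l, M_l = 4, 6
F = np.random.rand(N_l, M_l)          # features of the generated image x
F_style = np.random.rand(N_l, M_l)    # features of the style image a

G = F @ F.T                           # Gram matrix of the generated image
A = F_style @ F_style.T               # Gram matrix of the style image

E_l = np.sum((G - A) ** 2) / (4 * N_l**2 * M_l**2)
print(E_l)
\end{verbatim}
\end{frame}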
\begin{frame}{Painting style transfer}
The derivatives of the loss functions are
\[\frac{\partial \mathcal{L}_{\text{content}}}{\partial F_{ij}^{l}}=\begin{cases} (F^{l}-P^{l})_{ij} & \text{if}\ F_{ij}^{l} > 0\\ 0 & \text{if}\ F_{ij}^{l} < 0, \end{cases}\]
\[\frac{\partial E_{l}}{\partial F_{ij}^{l}}=\begin{cases} \frac{1}{N_{l}^{2}M_{l}^{2}}((F^{l})^{\mathrm{T}}(G^{l}-A^{l}))_{ji}& \text{if}\ F_{ij}^{l} > 0\\ 0& \text{if}\ F_{ij}^{l} < 0. \end{cases}\]
And the total loss is
\[\mathcal{L}_{\text{total}}(\vec{p},\vec{a},\vec{x})=\alpha \mathcal{L}_{\text{content}}(\vec{p},\vec{x})+\beta \mathcal{L}_{\text{style}}(\vec{a},\vec{x})\]
\begin{itemize}
\item $\alpha$ and $\beta$ are weighting factors for the content and style losses.
\end{itemize}
\end{frame}
\begin{frame}
\centering
\includegraphics[scale = 0.125]{3_Structure.jpg}
\[\vec{x} := \vec{x} - \lambda \frac{\partial \mathcal{L}_{total}}{\partial \vec{x}}\]
\begin{itemize}
\item $\lambda$ is the learning rate.
\item Initially, $\vec{x}$ is a white-noise image.
\item \textbf{We are not learning the weights; we are learning $\vec{x}$!}
\end{itemize}
\end{frame}
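\begin{frame}[fragile]
A minimal sketch of the update on the generated image $\vec{x}$, assuming NumPy; \texttt{grad\_total\_loss} is a hypothetical placeholder for the gradient obtained by back-propagating $\mathcal{L}_{total}$ through the fixed network.
\begin{verbatim}
import numpy as np

x = np.random.rand(224, 224, 3)        # start from a white-noise image

def grad_total_loss(x):
    # hypothetical placeholder: in practice this gradient comes from
    # back-propagating L_total through the (already trained) VGG net
    return np.zeros_like(x)

lr = 0.1
for step in range(1000):
    x = x - lr * grad_total_loss(x)    # update the image, not the weights
\end{verbatim}
\end{frame}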
\subsection{Result}
\begin{frame}{Result}
\centering
\begin{columns}[onlytextwidth]
\begin{column}{0.5\textwidth}
\centering
\includegraphics[scale = 0.16]{starry_night.jpg}
\centering
\[+\]
\centering
\includegraphics[scale = 0.16]{in4.JPG}
\end{column}
\begin{column}{0.5\textwidth}
\includegraphics[scale = 0.18]{2800.png}
\end{column}
\end{columns}
\end{frame}
\begin{frame}{Bonus}
\begin{block}{}
Thank you!
\end{block}
\begin{columns}[onlytextwidth]
\begin{column}{0.5\textwidth}
\centering
\includegraphics[scale = 0.2]{in3.jpg}
\end{column}
\begin{column}{0.5\textwidth}
\includegraphics[scale = 0.2]{2100.png}
\end{column}
\end{columns}
\end{frame}
\end{document}