
이번에는 RNN을 MNIST로 돌려보는 것을 코딩해본다.


Experimental setups

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
import os
import matplotlib.pyplot as plt
import numpy as np

# Run on CUDA when PyTorch can see a GPU, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Report at least one device even when device_count() is 0 or 1.
num_gpu = max(1, torch.cuda.device_count())
print("Let's use", num_gpu, "GPUs!")
print('our device', device)

Let's use 1 GPUs!
our device cuda


RNN 모델 설계

class RNN(nn.Module):
    """Sequence classifier with a selectable LSTM or GRU backbone.

    Both recurrent stacks are built with ``batch_first=True``, so ``forward``
    expects input shaped (batch, sequence, input_size). The hidden state of
    the last time step is fed to a linear layer that produces one score per
    class.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the recurrent hidden state.
        num_layers: Number of stacked recurrent layers.
        num_classes: Number of output scores.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, self.hidden_size, self.num_layers, batch_first=True)
        self.gru = nn.GRU(input_size, self.hidden_size, self.num_layers, batch_first=True)
        self.fc = nn.Linear(self.hidden_size, num_classes)

    def forward(self, x, rnn):
        """Classify a batch of sequences.

        Args:
            x: Input tensor, (batch, sequence, input_size).
            rnn: ``'lstm'`` or ``'gru'`` -- which recurrent stack to run.

        Returns:
            Tensor of shape (batch, num_classes) with unnormalised scores.

        Raises:
            ValueError: If ``rnn`` is neither ``'lstm'`` nor ``'gru'``.
        """
        # Zero initial state created on the input's own device, so the module
        # does not depend on a module-level ``device`` variable.
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size,
                         device=x.device, dtype=x.dtype)
        if rnn == 'lstm':
            c0 = torch.zeros_like(h0)
            out, _ = self.lstm(x, (h0, c0))
        elif rnn == 'gru':
            out, _ = self.gru(x, h0)
        else:
            raise ValueError("rnn must be 'lstm' or 'gru', got {!r}".format(rnn))
        # Only the output of the final time step is used for classification.
        out = self.fc(out[:, -1, :])
        return out

model call시 원하는 rnn 계열 (lstm, gru) 선택 가능하도록 구현함


Parameter & Model shape & Hyper-parameter check 

# Every 28x28 image is read as a sequence of 28 rows with 28 features each.
sequence_length = 28
input_size = 28
# Network size.
hidden_size = 128
num_layers = 2
num_classes = 10
# Optimisation settings.
batch_size = 100
num_epochs = 10
learning_rate = 1e-2

# Build the network and move its parameters to the selected device.
model = RNN(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    num_classes=num_classes,
).to(device)
# Model shape: print the size of every weight tensor.
# NOTE(review): the loop body was missing in the source; it is reconstructed
# here so that it reproduces the listing that follows, which contains only
# tensors with two or more dimensions (1-D bias vectors are skipped).
for p in model.parameters():
    if p.dim() > 1:
        print(p.size())
torch.Size([512, 28])
torch.Size([512, 128])
torch.Size([512, 128])
torch.Size([512, 128])
torch.Size([384, 28])
torch.Size([384, 128])
torch.Size([384, 128])
torch.Size([384, 128])
torch.Size([10, 128])
def count_parameters(model):
    """Return the number of scalar values in the trainable parameters of ``model``."""
    total = 0
    for tensor in model.parameters():
        # Frozen tensors (requires_grad == False) are not counted.
        if tensor.requires_grad:
            total += tensor.numel()
    return total

# Total number of trainable parameter values in the model.
# NOTE: despite the wording of the printed label, this is a parameter count
# (count_parameters sums numel() over requires_grad tensors), not a count of
# hyper-parameters.
model_hp = count_parameters(model)
print('model"s hyper parameters', model_hp)
# model"s hyper parameters 374026


Dataset load, train, test loader 선언

# Training split: downloaded into ./data on first use, converted to tensors,
# and served in shuffled mini-batches of `batch_size` images.
train_loader = torch.utils.data.DataLoader(
    dataset=datasets.MNIST('data', train=True, download=True, transform=transforms.ToTensor()),
    batch_size=batch_size,
    shuffle=True,
)
print(len(train_loader)) # 600
# Test split: fixed order, 1000 images per batch.
test_loader = torch.utils.data.DataLoader(
    dataset=datasets.MNIST('data', train=False, transform=transforms.ToTensor()),
    batch_size=1000,
)
print(len(test_loader)) # 10
Loss, optimizer 선언

이전에는 F.nll_loss로 하였는데, 이번에는 CrossEntropy loss를 선언하여 진행함 (nn.CrossEntropyLoss는 내부적으로 LogSoftmax와 NLLLoss를 함께 적용하므로, 모델은 softmax를 거치지 않은 logit을 그대로 출력하면 됨)

# Cross-entropy over the raw class scores returned by the model.
criterion = nn.CrossEntropyLoss()
# Adam over every model parameter with the learning rate configured above.
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)


Training 진행

# Running totals over all steps, plus per-step histories used for plotting.
total_loss = 0
total_acc = 0
train_loss = []
train_accuracy = []
i = 0  # global step counter (keeps counting across epochs)
model.train()
for epoch in range(num_epochs):
    for data, target in train_loader:
        # Each image is fed as a sequence of rows: (batch, sequence_length, input_size).
        data = data.reshape(-1, sequence_length, input_size).to(device)
        target = target.to(device)
        # Gradients accumulate by default, so clear them before every step.
        optimizer.zero_grad()
        output = model(data, 'lstm')
        loss = criterion(output, target)
        loss.backward()    # calc gradients
        optimizer.step()   # update weights
        # Store plain Python numbers so the autograd graph is not kept alive.
        loss_value = loss.item()
        prediction = output.max(1)[1]   # index of the highest score = predicted class
        # Divide by the real batch size; the last batch of an epoch may be smaller.
        accuracy = prediction.eq(target).sum().item() / target.size(0) * 100
        total_loss += loss_value
        total_acc += accuracy
        train_loss.append(loss_value)
        train_accuracy.append(accuracy)
        if i % 10 == 0:
            print('Epoch: {}\t Train Step: {}\tLoss: {:.3f}\tAccuracy: {:.3f}'.format(epoch+1, i, loss_value, accuracy))
        i += 1
    print('Epoch: {} finished'.format(epoch+1))
Epoch: 10 finished

RNN은 lstm으로 진행하였음



Plotting 결과

# Training loss per step, on its own figure so it does not share axes with
# the accuracy curve (the two series have very different scales).
plt.figure()
plt.plot(np.arange(len(train_loss)), train_loss)
plt.xlabel('step')
plt.ylabel('training loss')

# Training accuracy (percent) per step.
plt.figure()
plt.plot(np.arange(len(train_accuracy)), train_accuracy)
plt.xlabel('step')
plt.ylabel('training accuracy (%)')
plt.show()


step에 따른 training loss 변화도
step에 따른 training accuracy 변화도


Evaluation 결과

# Switch to inference behaviour (affects layers such as dropout) before testing.
model.eval()
correct = 0
with torch.no_grad():
    for data, target in test_loader:
        data = data.reshape(-1, sequence_length, input_size).to(device)
        target = target.to(device)
        output = model(data, 'lstm')

        prediction = output.max(1)[1]   # predicted class index per sample
        # Accumulate as a Python int rather than a device tensor.
        correct += prediction.eq(target).sum().item()

print('\nTest set: Accuracy: {:.2f}%'.format(100. * correct / len(test_loader.dataset)))
# Test set: Accuracy: 97.63%


사실 RNN은 MNIST data를 돌리기에 최적의 모델이 아님

왜냐하면 RNN의 개념이 sequential 한 data에 적합하기 때문임


그럼에도 불구하고 결과값은 MLP의 성능을 넘어섰음


이번에는 이전 post와 같은 MNIST dataset을 활용하여, CNN으로 성능 뽑는것을 진행해본다.


CNN은 Fully-connected layer와 달리 입력을 flatten 하지 않고 2차원 구조를 그대로 사용하며, 같은 convolution filter의 weight를 모든 위치에서 공유하기 때문에 convolution layer 자체는 parameter가 비교적 적게 들고, 연산이 빠른 장점이 있으며, receptive field를 통해 local feature를 뽑는 것에 강인한 특징이 있음


Library importing 및 device 설정

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
import os
import matplotlib.pyplot as plt
import numpy as np

# Select CUDA when it is available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Never report fewer than one device.
num_gpu = max(1, torch.cuda.device_count())
print("Let's use", num_gpu, "GPUs!")

print('our device', device)
Let's use 1 GPUs!
our device cuda

2-layer CNN 네트워크 설계 (add here 부분에 batchnormalization, 더 깊게 쌓는 것들을 연습해보세요)

class CNN(nn.Module):
    """Two-convolution MNIST classifier that returns log-probabilities.

    Args:
        num_class: Number of output classes.
        drop_prob: Probability used to build ``self.dropout``. The dropout
            module is created but not applied in ``forward`` yet; wiring it
            in is left as part of the exercise marked below.
    """

    def __init__(self, num_class, drop_prob):
        super(CNN, self).__init__()
        # Input is 1x28x28. A 5x5 kernel with padding=2 keeps the spatial size.
        self.conv1 = nn.Conv2d(1, 32, 5, padding=2)  # in_channels, out_channels, kernel_size
        # After 2x2 max pooling the feature map is 14x14.
        self.conv2 = nn.Conv2d(32, 64, 5, padding=2)
        # After the second 2x2 max pooling the feature map is 7x7.
        # TODO(exercise): add more convolution layers and batch normalization here.
        self.dropout = nn.Dropout(p=drop_prob)

        self.fc1 = nn.Linear(64*7*7, 1024)
        self.reduce_layer = nn.Linear(1024, num_class)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Return per-class log-probabilities for a batch shaped (B, 1, 28, 28)."""
        x = F.max_pool2d(F.relu(self.conv1(x)), 2)  # -> (B, 32, 14, 14)
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)  # -> (B, 64, 7, 7)
        # TODO(exercise): run any extra layers added in __init__ and apply self.dropout.
        x = x.view(-1, 64*7*7)   # flatten each sample so it can be fed to nn.Linear
        x = F.relu(self.fc1(x))
        output = self.reduce_layer(x)
        return self.log_softmax(output)


Model loading 및 parameter, shape 체크

# 10 output classes, dropout probability 0.3. The model is moved to the same
# device as the batches; the training and evaluation loops call .to(device)
# on every input, so a CPU-resident model would fail on a GPU machine.
model = CNN(10, 0.3).to(device)
  (conv1): Conv2d(1, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
  (conv2): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
  (dropout): Dropout(p=0.3, inplace=False)
  (fc1): Linear(in_features=3136, out_features=1024, bias=True)
  (reduce_layer): Linear(in_features=1024, out_features=10, bias=True)
  (log_softmax): LogSoftmax(dim=1)
# Model shape: print the size of every weight tensor.
# NOTE(review): the loop body was missing in the source; it is reconstructed
# here so that it reproduces the listing that follows, which contains only
# tensors with two or more dimensions (1-D bias vectors are skipped).
for p in model.parameters():
    if p.dim() > 1:
        print(p.size())
torch.Size([32, 1, 5, 5])
torch.Size([64, 32, 5, 5])
torch.Size([1024, 3136])
torch.Size([10, 1024])
def count_parameters(model):
    """Count the scalar entries of every parameter of ``model`` that requires gradients."""
    trainable_sizes = [weight.numel() for weight in model.parameters() if weight.requires_grad]
    return sum(trainable_sizes)

# Total number of trainable parameter values in the model.
# NOTE: despite the wording of the printed label, this is a parameter count
# (count_parameters sums numel() over requires_grad tensors), not a count of
# hyper-parameters.
model_hp = count_parameters(model)
print('model"s hyper parameters', model_hp)
# model"s hyper parameters 3274634

Data setup 및 train, test loader 설정

batch_size = 64
# Training split: downloaded into ./data on first use, shuffled mini-batches.
train_loader = torch.utils.data.DataLoader(
    dataset=datasets.MNIST('data', train=True, download=True, transform=transforms.ToTensor()),
    batch_size=batch_size,
    shuffle=True,
)
print(len(train_loader)) # 938 batches (64 * 938 = 60032 >= 60000, so the last batch is partial)
# Test split: fixed order, 1000 images per batch.
test_loader = torch.utils.data.DataLoader(
    dataset=datasets.MNIST('data', train=False, transform=transforms.ToTensor()),
    batch_size=1000,
)
print(len(test_loader)) # 10, (10 * 1000 = 10000)
Adam optimizer, learning rate 1e-4로 설정

# Adam over all model parameters with a learning rate of 1e-4.
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.0001)


Model training, epoch은 10으로 설정

epochs = 10 ### change
# Running totals over all steps, plus per-step histories used for plotting.
total_loss = 0
total_acc = 0
train_loss = []
train_accuracy = []
i = 0  # global step counter (keeps counting across epochs)
model.train()
for epoch in range(epochs):
    for data, target in train_loader:
        # Plain tensors are enough; the deprecated Variable wrapper is not needed.
        data = data.to(device)
        target = target.to(device)
        # Gradients accumulate by default, so clear them before every step.
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)   # the model already returns log-probabilities
        loss.backward()    # calc gradients
        optimizer.step()   # update weights
        # Store plain Python numbers so the autograd graph is not kept alive.
        loss_value = loss.item()
        prediction = output.max(1)[1]   # index of the highest score = predicted class
        # Divide by the real batch size; the last batch of an epoch is smaller.
        accuracy = prediction.eq(target).sum().item() / target.size(0) * 100
        total_loss += loss_value
        total_acc += accuracy
        train_loss.append(loss_value)
        train_accuracy.append(accuracy)
        if i % 10 == 0:
            print('Epoch: {}\t Train Step: {}\tLoss: {:.3f}\tAccuracy: {:.3f}'.format(epoch+1, i, loss_value, accuracy))
        i += 1
    print('Epoch: {} finished'.format(epoch+1))
Epoch: 10 finished

Training accuracy에서 MLP 보다 높은 것을 알 수 있음


Plotting 결과

# Training loss per step, on its own figure so it does not share axes with
# the accuracy curve (the two series have very different scales).
plt.figure()
plt.plot(np.arange(len(train_loss)), train_loss)
plt.xlabel('step')
plt.ylabel('training loss')

# Training accuracy (percent) per step.
plt.figure()
plt.plot(np.arange(len(train_accuracy)), train_accuracy)
plt.xlabel('step')
plt.ylabel('training accuracy (%)')
plt.show()

step에 따른 training loss 변화도
step에 따른 training accuracy 변화도


Evaluation 결과

# Switch to inference behaviour (affects layers such as dropout) before testing.
model.eval()
correct = 0
with torch.no_grad():
    for data, target in test_loader:
        data = data.to(device)
        target = target.to(device)
        output = model(data)
        prediction = output.max(1)[1]   # predicted class index per sample
        # Accumulate as a Python int rather than a device tensor.
        correct += prediction.eq(target).sum().item()

print('\nTest set: Accuracy: {:.2f}%'.format(100. * correct / len(test_loader.dataset)))
# Test set: Accuracy: 99.11%

마찬가지로 MLP보다 CNN의 성능이 더 높음을 알 수 있음



이번 시간에는 여러 개의 Fully-connected layer를 쌓는 것을 코딩해본다.


먼저 아래의 코드를 통해 library 를 importing 함

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
import os
import matplotlib.pyplot as plt
import numpy as np


또한 `print(torch.__version__)` 코드를 통해 현재 설치된 PyTorch 버전을 확인함



이후 연산할 장치에 대해 선언해야 함

아래와 같이 GPU(cuda)를 사용할 수 있으면 device를 GPU로, 그렇지 않으면 CPU로 설정함

# Use the GPU when CUDA is available; otherwise compute on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# At least one device is always reported.
num_gpu = max(1, torch.cuda.device_count())
print("Let's use", num_gpu, "GPUs!") # 1
print('device', device) # cuda

이후 간단한 MLP (Multi-Layer Perceptron) 모델을 구현함


입력은 MNIST dataset을 사용함


MNIST dataset의 각각의 요소는 (28, 28) 의 shape을 갖고 있기 때문에,

MLP를 통과하기 위해서는 각 요소를 flatten 시켜야 함


즉 reshape 을 사용하여 (28, 28) --> (1, 784)로 변경해야 함


class MnistMLP(nn.Module):
    """Fully-connected MNIST classifier: 784 -> 512 -> 256 -> 10 -> num_class.

    Every hidden linear layer is followed by ReLU and dropout with probability
    ``drop_prob``. The forward pass returns log-probabilities (LogSoftmax over
    dim 1).
    """

    def __init__(self, num_class, drop_prob):
        super().__init__()
        # A 28x28 image is flattened to 784 values before the first layer.
        self.dropout = nn.Dropout(p=drop_prob)
        self.linear1 = nn.Linear(784, 512)
        self.linear2 = nn.Linear(512, 256)
        self.linear3 = nn.Linear(256, 10)

        # Final projection from the 10-unit hidden layer to the class scores.
        self.reduce_layer = nn.Linear(10, num_class)
        self.logsoftmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        # Cast to float and flatten every sample into a 784-long vector.
        hidden = x.float().view(-1, 784)
        # Linear -> ReLU -> dropout, three times in a row.
        for layer in (self.linear1, self.linear2, self.linear3):
            hidden = self.dropout(F.relu(layer(hidden)))
        scores = self.reduce_layer(hidden)
        return self.logsoftmax(scores)


이후 아래의 코드처럼 모델을 선언하고 gpu에 올림

# 10 output classes, dropout probability 0.3. The model is moved to the same
# device as the batches; the training and evaluation loops call .to(device)
# on every input, so a CPU-resident model would fail on a GPU machine.
model = MnistMLP(10, 0.3).to(device)

  (dropout): Dropout(p=0.3, inplace=False)
  (linear1): Linear(in_features=784, out_features=512, bias=True)
  (linear2): Linear(in_features=512, out_features=256, bias=True)
  (linear3): Linear(in_features=256, out_features=10, bias=True)
  (reduce_layer): Linear(in_features=10, out_features=10, bias=True)
  (logsoftmax): LogSoftmax(dim=1)

MNIST의 class 개수는 10개 이므로, 첫 번째 인자에 10을 넣었고, dropout은 30%확률로 진행


작성한 모델의 매 layer 마다의 shape은 아래를 통해서 확인할 수 있고,

# Model shape: print the size of every weight tensor.
# NOTE(review): the loop body was missing in the source; it is reconstructed
# here so that it reproduces the listing that follows, which contains only
# tensors with two or more dimensions (1-D bias vectors are skipped).
for p in model.parameters():
    if p.dim() > 1:
        print(p.size())
torch.Size([512, 784])
torch.Size([256, 512])
torch.Size([10, 256])
torch.Size([10, 10])


모델의 학습 가능한 parameter 총 개수는 아래를 통해 확인 가능함

def count_parameters(model):
    """Return how many scalar values ``model`` would update during training."""
    # Only tensors with requires_grad set contribute to the total.
    trainable = filter(lambda tensor: tensor.requires_grad, model.parameters())
    return sum(tensor.numel() for tensor in trainable)

# Total number of trainable parameter values in the model.
# NOTE: despite the wording of the printed label, this is a parameter count
# (count_parameters sums numel() over requires_grad tensors), not a count of
# hyper-parameters.
model_hp = count_parameters(model)
print('model"s hyper parameters', model_hp)
model"s hyper parameters 535928


이제 모델 선언은 끝났고, data를 loading 해야 함

아래의 코드를 통해 MNIST dataset을 다운받고, train 및 test로 분할함

batch_size = 128
# Training split: downloaded into ./data on first use, shuffled mini-batches.
train_loader = torch.utils.data.DataLoader(datasets.MNIST('data', train=True, download=True, transform=transforms.ToTensor()),batch_size=batch_size, shuffle=True)
# With batch_size = 128 the loader yields ceil(60000 / 128) = 469 batches;
# the last one holds the remaining 96 images.
print(len(train_loader)) # 469
# Test split: fixed order, 1000 images per batch.
test_loader = torch.utils.data.DataLoader(datasets.MNIST('data', train=False, transform=transforms.ToTensor()),batch_size=1000)
print(len(test_loader)) # 10, 10 * 1000 = 10000
Optimizer 선언은 아래와 같이 진행하며, 가장 많이 사용되는 Adam을 learning rate 1e-4로 사용

# Adam over all model parameters with a learning rate of 1e-4.
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.0001)


Training을 진행함 (epoch은 10까지만 진행)

epochs = 10 ### change
# Running totals over all steps, plus per-step histories used for plotting.
total_loss = 0
total_acc = 0
train_loss = []
train_accuracy = []
i = 0  # global step counter (keeps counting across epochs)
model.train()  # make sure dropout is active while training
for epoch in range(epochs):
    for data, target in train_loader:
        # Plain tensors are enough; the deprecated Variable wrapper is not needed.
        data = data.to(device)
        target = target.to(device)
        # Gradients accumulate by default, so clear them before every step.
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)   # the model already returns log-probabilities
        loss.backward()    # calc gradients
        optimizer.step()   # update weights
        # Store plain Python numbers so the autograd graph is not kept alive.
        loss_value = loss.item()
        prediction = output.max(1)[1]   # index of the highest score = predicted class
        # Divide by the real batch size; the last batch of an epoch is smaller.
        accuracy = prediction.eq(target).sum().item() / target.size(0) * 100
        total_loss += loss_value
        total_acc += accuracy
        train_loss.append(loss_value)
        train_accuracy.append(accuracy)
        if i % 10 == 0:
            print('Epoch: {}\t Train Step: {}\tLoss: {:.3f}\tAccuracy: {:.3f}'.format(epoch+1, i, loss_value, accuracy))
        i += 1
    print('Epoch: {} finished'.format(epoch+1))
Epoch: 10 finished

Training에 대한 loss를 시각화 하기 위해 matplotlib 사용

# Training loss per step, on its own figure so it does not share axes with
# the accuracy curve (the two series have very different scales).
plt.figure()
plt.plot(np.arange(len(train_loss)), train_loss)
plt.xlabel('step')
plt.ylabel('training loss')

# Training accuracy (percent) per step.
plt.figure()
plt.plot(np.arange(len(train_accuracy)), train_accuracy)
plt.xlabel('step')
plt.ylabel('training accuracy (%)')
plt.show()

training step에 따른 loss 변화도
training step에 따른 accuracy 변화도

모델의 실제 성능 평가를 하기 위해 training에 쓰이지 않은 test data로 아래와 같이 평가 진행

# Turn dropout off for testing; without eval() the model keeps randomly
# zeroing activations and the measured accuracy is lower than the real one.
model.eval()
correct = 0
with torch.no_grad():
    for data, target in test_loader:
        data = data.to(device)
        target = target.to(device)
        output = model(data)
        prediction = output.max(1)[1]   # predicted class index per sample
        # Accumulate as a Python int rather than a device tensor.
        correct += prediction.eq(target).sum().item()

print('\nTest set: Accuracy: {:.2f}%'.format(100. * correct / len(test_loader.dataset)))
#Test set: Accuracy: 96.04%

간단한 MLP 3-layer 만으로도 96%의 성능을 얻었음


