import torch
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision import transforms
from torch import nn, optim
from torch.nn import functional as F

EPOCH = 1000            # number of passes over the training set
BATCH_SIZE = 128
LR = 0.001              # Adam learning rate
DOWNLOAD_MNIST = False  # set to True on the first run to fetch the dataset

train_data = datasets.MNIST(
    root='./mnist',
    train=True,
    transform=transforms.Compose([
        transforms.ToTensor(),                      # 0-255 -> 0.0-1.0
        transforms.Normalize((0.1307,), (0.3081,))  # MNIST mean/std
    ]),
    download=DOWNLOAD_MNIST
)
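
# Added sketch (an assumption, not part of the original script): the Normalize
# constants above are the conventional MNIST mean/std, which can be re-derived
# directly from the raw uint8 pixels like so.
_pix = train_data.data.float() / 255.0
print('pixel mean/std: %.4f / %.4f' % (_pix.mean().item(), _pix.std().item()))  # ~0.1307 / ~0.3081
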
# plot one example
print(train_data.data.size())     # torch.Size([60000, 28, 28])
print(train_data.targets.size())  # torch.Size([60000])
plt.imshow(train_data.data[0].numpy(), cmap='gray')
plt.title('%i' % train_data.targets[0])
plt.show()

train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE,
                          shuffle=True, num_workers=2)


test_data = datasets.MNIST(
    root='./mnist',
    train=False,
    transform=transforms.Compose([
        transforms.ToTensor(),                      # 0-255 -> 0.0-1.0
        transforms.Normalize((0.1307,), (0.3081,))  # MNIST mean/std
    ]),
    download=DOWNLOAD_MNIST
)
test_loader = DataLoader(dataset=test_data, batch_size=BATCH_SIZE,
                         shuffle=True, num_workers=2)

x, label = next(iter(test_loader))  # pull a single batch out of the loader
print('x:', x.shape, 'label:', label.shape)



class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Sequential(         # input shape (1, 28, 28)
            nn.Conv2d(
                in_channels=1,              # input channels (grayscale)
                out_channels=10,            # n_filters
                kernel_size=5,              # filter size
                stride=1,                   # filter movement/step
                padding=2,                  # keeps width/height: padding = (kernel_size - 1) / 2 when stride = 1
            ),                              # output shape (10, 28, 28)
            nn.ReLU(),                      # activation
            nn.MaxPool2d(kernel_size=2),    # take max over each 2x2 area, output shape (10, 14, 14)
        )
        self.conv2 = nn.Sequential(         # input shape (10, 14, 14)
            nn.Conv2d(10, 20, 5, 1, 2),     # output shape (20, 14, 14)
            nn.ReLU(),                      # activation
            nn.MaxPool2d(2),                # output shape (20, 7, 7)
        )
        self.out1 = nn.Linear(20 * 7 * 7, 512)  # fully connected hidden layer
        self.out2 = nn.Linear(512, 10)          # output layer, 10 classes

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = x.view(x.size(0), -1)           # flatten the output of conv2 to (batch_size, 20 * 7 * 7)
        output = self.out1(x)
        output = F.relu(output)
        output = self.out2(output)
        return output                       # raw logits; CrossEntropyLoss applies log-softmax itself
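
# Added sanity check (a sketch, not in the original script): run a dummy
# 28x28 input through the conv stack to confirm the 20 * 7 * 7 feature size
# assumed by self.out1.
_probe = CNN()
_feat = _probe.conv2(_probe.conv1(torch.zeros(1, 1, 28, 28)))
print('conv feature shape:', _feat.shape)  # expected: torch.Size([1, 20, 7, 7])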

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cnn = CNN().to(DEVICE)
print(cnn)  # net architecture
optimizer = optim.Adam(cnn.parameters(), lr=LR)  # optimize all cnn parameters
criterion = nn.CrossEntropyLoss().to(DEVICE)     # targets are class indices, not one-hot
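# Note (added for clarity): CrossEntropyLoss expects raw logits and applies
# log_softmax internally, so forward() must not end with a softmax. For
# illustration only, an equivalent formulation would be:
#     loss = F.nll_loss(F.log_softmax(logits, dim=1), target)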

def train(model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        output = model(data)
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if (batch_idx + 1) % 30 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, (batch_idx + 1) * len(data), len(train_loader.dataset),
                100. * (batch_idx + 1) / len(train_loader), loss.item()))


def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            pred = output.argmax(dim=1)
            correct += torch.eq(pred, target).float().sum().item()
            test_loss += criterion(output, target).item() * target.size(0)  # sum per-sample losses

            # Alternative from the official MNIST example (only valid if the
            # model returned log-probabilities instead of raw logits):
            # test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum losses over the batch
            # pred = output.max(1, keepdim=True)[1]                            # index of the max probability
            # correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))



# training and testing
for epoch in range(EPOCH):
    train(cnn, DEVICE, train_loader, optimizer, epoch)
    test(cnn, DEVICE, test_loader)

# Eventually reaches ~99% test accuracy.
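
# Added follow-up sketch (not in the original script): single-image inference
# with the trained network, reusing the test batch (x, label) grabbed above.
cnn.eval()
with torch.no_grad():
    _pred = cnn(x.to(DEVICE)).argmax(dim=1)
print('predicted:', _pred[0].item(), 'ground truth:', label[0].item())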