Source: https://github.com/zalandoresearch/fashion-mnist
Fashion-MNIST is a dataset of Zalando's article images, consisting of a training set of 60,000 examples and a test set of 10,000 examples. Each example is a 28x28 grayscale image associated with a label from 10 classes. Zalando intends Fashion-MNIST to serve as a direct drop-in replacement for the original MNIST dataset for benchmarking machine learning algorithms: it shares the same image size and the same structure of training and test splits.
Each image is 28 pixels in height and 28 pixels in width, 784 pixels in total. Each pixel has a single value indicating its lightness or darkness, with higher numbers meaning darker; this value is an integer between 0 and 255. The training and test data sets have 785 columns. The first column holds the class label (see the class_names list below) and represents the article of clothing; the remaining 784 columns contain the pixel values of the associated image in row-major order, so pixel x sits at row ⌊(x−1)/28⌋ and column (x−1) mod 28 of the image.
import pandas as pd
# load the Fashion-MNIST CSV: one row per image, label + 784 pixel columns
data = pd.read_csv('fashion-mnist.csv')
data
| | label | pixel1 | pixel2 | pixel3 | pixel4 | pixel5 | pixel6 | pixel7 | pixel8 | pixel9 | ... | pixel775 | pixel776 | pixel777 | pixel778 | pixel779 | pixel780 | pixel781 | pixel782 | pixel783 | pixel784 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | 8 | ... | 103 | 87 | 56 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 34 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 14 | 53 | 99 | ... | 0 | 0 | 0 | 0 | 63 | 53 | 31 | 0 | 0 | 0 |
3 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 137 | 126 | 140 | 0 | 133 | 224 | 222 | 56 | 0 | 0 |
4 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
9995 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 32 | 23 | 14 | 20 | 0 | 0 | 1 | 0 | 0 | 0 |
9996 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 2 | 52 | 23 | 28 | 0 | 0 | 0 |
9997 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 175 | 172 | 172 | 182 | 199 | 222 | 42 | 0 | 1 | 0 |
9998 | 8 | 0 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
9999 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 140 | 119 | ... | 111 | 95 | 75 | 44 | 1 | 0 | 0 | 0 | 0 | 0 |
10000 rows × 785 columns
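To make the 785-column layout concrete, here is how a single row decodes into a label and a 28x28 image (a minimal sketch; the reshape uses row-major order, matching the plots below):
row = data.iloc[0]
label = int(row.iloc[0])                          # first column: the class label
image = row.iloc[1:].to_numpy().reshape(28, 28)   # remaining 784 columns: pixel values
print(label, image.shape)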
from sklearn.model_selection import train_test_split
# split labels (first column) from pixel features, then hold out a third
# of the rows as a test set
X, y = data.iloc[:, 1:].to_numpy(), data.iloc[:, 0].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
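As a quick sanity check that both splits keep all ten classes represented (a small sketch; nothing below depends on it):
import numpy as np
print(np.bincount(y_train))  # per-class counts for labels 0-9 in the training split
print(np.bincount(y_test))   # same for the test split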
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 10))
for i in range(25):
    plt.subplot(5, 5, i + 1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(X_train[i].reshape(28, 28), cmap="binary")
    plt.xlabel(class_names[y_train[i]])
plt.show()
# perform PCA, keeping enough components to retain 95% of the variance
from sklearn.decomposition import PCA
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X_train)
Most of the variance lies along the first two principal components, which together account for roughly 47% of it:
pca.explained_variance_ratio_[:10]
array([0.28701888, 0.18049693, 0.05945456, 0.04980945, 0.03838398,
0.03475959, 0.02353651, 0.01864746, 0.0139102 , 0.01324683])
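Passing a float between 0 and 1 as n_components tells scikit-learn to keep the smallest number of components whose cumulative explained variance reaches that fraction; the chosen count is exposed as n_components_. A quick check (the printed count should match the 179 used below):
import numpy as np
print(pca.n_components_)                             # components kept for 95% variance
print(np.cumsum(pca.explained_variance_ratio_)[-1])  # total variance retained, >= 0.95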
# reconstruct the images from the compressed representation; 179 is the
# number of components PCA kept above to retain 95% of the variance
pca = PCA(n_components=179)
X_reduced = pca.fit_transform(X_train)
X_recovered = pca.inverse_transform(X_reduced)
import matplotlib as mpl
import matplotlib.pyplot as plt
print('non-compressed image')
plt.imshow(X_train[100].reshape(28, 28), cmap="binary")
plt.axis("off")
plt.show()
print('95% variance compressed image')
plt.imshow(X_recovered[100].reshape(28, 28), cmap="binary")
plt.axis("off")
plt.show()
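To put a number on what the compression discards, we can compare the recovered images with the originals; a minimal sketch using the mean squared reconstruction error over the training set:
import numpy as np
reconstruction_mse = np.mean((X_train - X_recovered) ** 2)
print(reconstruction_mse)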
# helper function for plotting
from sklearn.preprocessing import MinMaxScaler
from matplotlib.offsetbox import AnnotationBbox, OffsetImage
import numpy as np
def plot_digits(X, y, min_distance=0.05, images=None, figsize=(13, 10)):
    # Scale the input features so that they range from 0 to 1
    X_normalized = MinMaxScaler().fit_transform(X)
    # Keep the list of coordinates of the digits plotted so far; pretend one
    # is already plotted far away at the start, to avoid `if` statements in
    # the loop below
    neighbors = np.array([[10., 10.]])
    plt.figure(figsize=figsize)
    cmap = mpl.cm.get_cmap("prism")
    digits = np.unique(y)
    for digit in digits:
        plt.scatter(X_normalized[y == digit, 0], X_normalized[y == digit, 1],
                    c=[cmap(digit / 9)])
    plt.axis("off")
    ax = plt.gcf().gca()  # get current axes in current figure
    for index, image_coord in enumerate(X_normalized):
        closest_distance = np.linalg.norm(neighbors - image_coord, axis=1).min()
        if closest_distance > min_distance:
            neighbors = np.r_[neighbors, [image_coord]]
            if images is None:
                plt.text(image_coord[0], image_coord[1], str(int(y[index])),
                         color=cmap(y[index] / 9),
                         fontdict={"weight": "bold", "size": 16})
            else:
                image = images[index].reshape(28, 28)
                imagebox = AnnotationBbox(OffsetImage(image, cmap="binary"),
                                          image_coord)
                ax.add_artist(imagebox)
X_pca_reduced = PCA(n_components=2, random_state=42).fit_transform(X_train)
plot_digits(X_pca_reduced, y_train)
plt.show()
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# LDA is supervised: it uses the class labels and allows at most
# n_classes - 1 = 9 components
X_lda_reduced = LinearDiscriminantAnalysis(n_components=2).fit_transform(X_train, y_train)
plot_digits(X_lda_reduced, y_train, figsize=(12,12))
plt.show()
from sklearn.pipeline import Pipeline
from sklearn.manifold import LocallyLinearEmbedding
# reduce to 95% variance with PCA first to speed up the manifold step
pca_lle = Pipeline([
    ("pca", PCA(n_components=0.95, random_state=42)),
    ("lle", LocallyLinearEmbedding(n_components=2, random_state=42)),
])
X_pca_lle_reduced = pca_lle.fit_transform(X_train)
plot_digits(X_pca_lle_reduced, y_train)
plt.show()
from sklearn.manifold import MDS
pca_mds = Pipeline([
    ("pca", PCA(n_components=0.95, random_state=42)),
    ("mds", MDS(n_components=2, random_state=42)),
])
# MDS builds a full pairwise-distance matrix (quadratic in the sample count),
# so only the first 1,000 training samples are embedded here
X_pca_mds_reduced = pca_mds.fit_transform(X_train[:1000])
plot_digits(X_pca_mds_reduced, y_train[:1000])
plt.show()
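MDS works by minimizing a stress function, and the fitted estimator reports its final value, which gives a rough numeric handle for comparing runs (a small sketch, assuming the pipeline above has been fitted):
print(pca_mds.named_steps["mds"].stress_)  # lower stress means distances are better preserved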
from sklearn.manifold import Isomap
pca_isomap = Pipeline([
    ("pca", PCA(n_components=0.95, random_state=42)),
    ("isomap", Isomap(n_components=2)),
])
X_pca_isomap_reduced = pca_isomap.fit_transform(X_train)
plot_digits(X_pca_isomap_reduced, y_train)
plt.show()
from sklearn.manifold import TSNE
pca_tsne = Pipeline([
    ("pca", PCA(n_components=0.95, random_state=42)),
    ("tsne", TSNE(n_components=2, random_state=42)),
])
X_pca_tsne_reduced = pca_tsne.fit_transform(X_train)
plot_digits(X_pca_tsne_reduced, y_train)
plt.show()
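t-SNE exposes the final Kullback-Leibler divergence it converged to, which is a rough way to compare the embeddings produced by the kernel-PCA variants below (a small sketch, assuming the pipeline above has been fitted):
print(pca_tsne.named_steps["tsne"].kl_divergence_)  # lower generally indicates a better embedding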
from sklearn.decomposition import KernelPCA
# note: KernelPCA already projects to 2-D here, so t-SNE re-embeds an
# already two-dimensional projection rather than the 95%-variance data
k_pca_tsne = Pipeline([
    ("kpca", KernelPCA(n_components=2, kernel='rbf', random_state=42)),
    ("tsne", TSNE(n_components=2, random_state=42)),
])
X_pca_tsne_reduced = k_pca_tsne.fit_transform(X_train)
plot_digits(X_pca_tsne_reduced, y_train)
plt.show()
k_pca_tsne = Pipeline([
    ("kpca", KernelPCA(n_components=2, kernel='poly', random_state=42)),
    ("tsne", TSNE(n_components=2, random_state=42)),
])
X_pca_tsne_reduced = k_pca_tsne.fit_transform(X_train)
plot_digits(X_pca_tsne_reduced, y_train)
plt.show()
k_pca_tsne = Pipeline([
    ("kpca", KernelPCA(n_components=2, kernel='sigmoid', random_state=42)),
    ("tsne", TSNE(n_components=2, random_state=42)),
])
X_pca_tsne_reduced = k_pca_tsne.fit_transform(X_train)
plot_digits(X_pca_tsne_reduced, y_train)
plt.show()
from sklearn.linear_model import LogisticRegression
# fit a linear classifier on the most recently computed 2-D embedding
# (at this point, the sigmoid-kernel variant of X_pca_tsne_reduced)
lr = LogisticRegression()
lr.fit(X_pca_tsne_reduced, y_train)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
pca_tsne = Pipeline([
    ("pca", PCA(n_components=0.95, random_state=42)),
    ("tsne", TSNE(n_components=2, random_state=42)),
])
# note: t-SNE has no transform method, so the pipeline is re-fitted from
# scratch on the test set
X_pca_tsne_reduced_test = pca_tsne.fit_transform(X_test)
from sklearn.metrics import accuracy_score
accuracy_score(y_test, lr.predict(X_pca_tsne_reduced_test))
0.0048484848484848485
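This is essentially chance-level accuracy (even below the 10% a random guess over ten balanced classes would give), and the cause is the evaluation itself: t-SNE is non-parametric and cannot map unseen data into an existing embedding, so fitting it separately on the test set produces a new 2-D space whose axes bear no relation to the one the classifier was trained in (and here the training embedding was the sigmoid-kernel variant, while the test embedding came from the plain PCA pipeline).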
lr = LogisticRegression(max_iter=100000)
# train on the 2-D PCA projection of the training set (X_pca_reduced from above)
lr.fit(X_pca_reduced, y_train)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
# note: a fresh PCA is fitted on the test set here; its components can differ
# (even flip sign) from the training projection
X_pca_reduced_test = PCA(n_components=2, random_state=42).fit_transform(X_test)
accuracy_score(y_test, lr.predict(X_pca_reduced_test))
0.5130303030303031
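The cleaner protocol is to fit the projection on the training set only and reuse the same mapping on the test set via transform, which PCA (unlike t-SNE) supports. A minimal sketch of that evaluation, using the same estimators as above:
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
pca2 = PCA(n_components=2, random_state=42)
X_train_2d = pca2.fit_transform(X_train)
X_test_2d = pca2.transform(X_test)   # reuse the training projection, no refit
lr = LogisticRegression(max_iter=100000)
lr.fit(X_train_2d, y_train)
print(accuracy_score(y_test, lr.predict(X_test_2d)))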