
Import libraries required for file operations

import os
import pickle
from glob import glob

# import basic numerical libraries
import numpy as np
import pandas as pd

# import keras libraries for image recognition
from keras.applications import VGG16
from keras.applications.vgg16 import preprocess_input
from keras.preprocessing import image as kimage

Data preparation

# Prerequisite: download and unzip the shirts folder into the working directory.


def shirt_id_from_path(path):
    """Return the shirt id encoded in an image path.

    The id is the file name without its directory and without its final
    extension, e.g. ``'shirts/1234.jpg'`` -> ``'1234'``.
    """
    # os.path copes with the platform's path separator, and splitext strips
    # only the last extension, so ids that contain dots stay intact and the
    # image path can be rebuilt later as 'shirts/' + id + '.jpg'.
    return os.path.splitext(os.path.basename(path))[0]


shirts_dict = dict()
for shirt in glob('shirts/*.jpg'):  # load all shirts
    # VGG16 is fed 224 x 224 pixel images
    img = kimage.load_img(shirt, target_size=(224, 224))
    # add a leading batch axis, then apply the VGG16 input preprocessing
    img = preprocess_input(np.expand_dims(kimage.img_to_array(img), axis=0))
    shirts_dict[shirt_id_from_path(shirt)] = img  # map shirt id -> preprocessed image
Number of shirts = 2908


Model training (feature extraction with a pre-trained VGG16; no weights are trained here)

# Pre-trained VGG16 without its classifier head; it is only used to turn each
# image into a feature vector via predict(), no training happens here.
model = VGG16(include_top=False, weights='imagenet')

# Derive the row count from the loaded data so this cell does not depend on a
# variable defined elsewhere.
no_of_shirts = len(shirts_dict)
feature_size = 25088  # length of one flattened model output (7 x 7 x 512 for a 224 x 224 input)

shirts_matrix = np.zeros([no_of_shirts, feature_size])  # one feature row per shirt
# Rows follow the iteration order of shirts_dict; the id <-> row mappings
# built from the same dict rely on that order.
for i, img in enumerate(shirts_dict.values()):
    shirts_matrix[i, :] = model.predict(img).ravel()  # flatten the model output
Downloading data from
58892288/58889256 [==============================] - 0s 0us/step
Model: "vgg16"
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None, None, 3)]   0         
block1_conv1 (Conv2D)        (None, None, None, 64)    1792      
block1_conv2 (Conv2D)        (None, None, None, 64)    36928     
block1_pool (MaxPooling2D)   (None, None, None, 64)    0         
block2_conv1 (Conv2D)        (None, None, None, 128)   73856     
block2_conv2 (Conv2D)        (None, None, None, 128)   147584    
block2_pool (MaxPooling2D)   (None, None, None, 128)   0         
block3_conv1 (Conv2D)        (None, None, None, 256)   295168    
block3_conv2 (Conv2D)        (None, None, None, 256)   590080    
block3_conv3 (Conv2D)        (None, None, None, 256)   590080    
block3_pool (MaxPooling2D)   (None, None, None, 256)   0         
block4_conv1 (Conv2D)        (None, None, None, 512)   1180160   
block4_conv2 (Conv2D)        (None, None, None, 512)   2359808   
block4_conv3 (Conv2D)        (None, None, None, 512)   2359808   
block4_pool (MaxPooling2D)   (None, None, None, 512)   0         
block5_conv1 (Conv2D)        (None, None, None, 512)   2359808   
block5_conv2 (Conv2D)        (None, None, None, 512)   2359808   
block5_conv3 (Conv2D)        (None, None, None, 512)   2359808   
block5_pool (MaxPooling2D)   (None, None, None, 512)   0         
Total params: 14,714,688
Trainable params: 14,714,688
Non-trainable params: 0


Inference pipeline

# Two-way lookup between a shirt id and its position in shirts_dict's
# iteration order (the same order used to fill the rows of shirts_matrix).
# Only the keys are needed, so the stored images are not touched and the
# builtin `id` is no longer shadowed.
matrix_id_to_shirt_id = dict(enumerate(shirts_dict))
shirt_id_to_matrix_id = {
    shirt_id: matrix_id
    for matrix_id, shirt_id in matrix_id_to_shirt_id.items()
}


Finding top 10 similar shirts

Display the sample shirt

from IPython.display import Image
import glob
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline

# Show the shirts listed in closest_shirts in a grid with 5 columns.
# closest_shirts holds shirt ids; it is computed by the similarity lookup in
# the persistence cell of this file and must exist before this cell runs.
images = []

for shirt in closest_shirts:
    shirt = 'shirts/' + shirt + '.jpg'
    for img_path in glob.glob(shirt):
        # NOTE(review): this loop body was missing in the source; reading the
        # file with the already-imported mpimg is the presumed intent.
        images.append(mpimg.imread(img_path))

columns = 5
# Floor division keeps the row count an integer; plt.subplot rejects the
# float that `/` produces under Python 3.
rows = len(images) // columns + 1
for i, image in enumerate(images):
    plt.subplot(rows, columns, i + 1)
    # NOTE(review): the draw call was missing in the source; without it the
    # subplots stay empty.
    plt.imshow(image)

Model persistence

# sklearn.externals.joblib was deprecated and later removed from scikit-learn;
# prefer the standalone joblib package and fall back to the vendored copy only
# on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# NOTE(review): `similarity` and `target_id` are not defined in the visible
# part of this file. Presumably `similarity` is a square shirt-by-shirt
# similarity matrix and `target_id` a row index into it -- confirm.
joblib.dump(similarity, 'similarity.pkl')
joblib.dump(shirt_id_to_matrix_id, 'shirt_id_to_matrix_id.pkl')
joblib.dump(matrix_id_to_shirt_id, 'matrix_id_to_shirt_id.pkl')

# joblib files are pickle-based: only load files from a trusted source.
loaded_model = joblib.load('similarity.pkl')

# Sort the target's row in descending order of score and keep the best 10.
# NOTE(review): if the row contains the target's similarity with itself, the
# target shirt will be part of the result -- confirm whether that is wanted.
top_k = 10
closest_ids = np.argsort(loaded_model[target_id, :])[::-1][:top_k]
closest_shirts = [matrix_id_to_shirt_id[matrix_id] for matrix_id in closest_ids]
