Vector Search
Faiss
import numpy as np
import faiss
from vector_engine.utils import vector_search, id2details
# Step 1: Convert the embeddings to a float32 matrix (Faiss only accepts float32)
embeddings = np.array(embeddings).astype("float32")
# Step 2: Instantiate the index
index = faiss.IndexFlatL2(embeddings.shape[1])
# Step 3: Pass the index to IndexIDMap
index = faiss.IndexIDMap(index)
# Step 4: Add vectors and their IDs
index.add_with_ids(embeddings, df.id.values)
print(f"Number of vectors in the Faiss index: {index.ntotal}")
# Retrieve the 10 nearest neighbours of one of the indexed vectors (a quick sanity check)
D, I = index.search(np.array([embeddings[5415]]), k=10)
print(f'L2 distance: {D.flatten().tolist()}\n\nMAG paper IDs: {I.flatten().tolist()}')
# Wrap all steps in the vector_search function. It takes four arguments:
# a query, the sentence-level transformer, the Faiss index, and the number of requested results.
D, I = vector_search([user_query], model, index, num_results=10)
print(f'L2 distance: {D.flatten().tolist()}\n\nMAG paper IDs: {I.flatten().tolist()}')
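For reference, vector_search can be as small as the sketch below; the canonical implementation ships in vector_engine.utils, so treat this body as an illustrative assumption.
# Assumed minimal implementation of vector_search
def vector_search(query, model, index, num_results=10):
    """Encode the query with the sentence transformer and search the Faiss index."""
    vector = model.encode(list(query))                        # (1, d) matrix
    D, I = index.search(np.array(vector).astype("float32"), k=num_results)
    return D, I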
# Serialise the index and store it as a pickle
import pickle
with open(f"{project_dir}/models/faiss_index.pickle", "wb") as h:
    pickle.dump(faiss.serialize_index(index), h)
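To restore the index later, reverse the pair: pickle.load the bytes and rebuild the index with faiss.deserialize_index.
# Reload the serialised index from disk
with open(f"{project_dir}/models/faiss_index.pickle", "rb") as h:
    index = faiss.deserialize_index(pickle.load(h))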
Elasticsearch
# download and extract Elasticsearch 7.11.1
!wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.11.1-linux-x86_64.tar.gz
!tar -xzvf elasticsearch-7.11.1-linux-x86_64.tar.gz
!chown -R daemon:daemon elasticsearch-7.11.1
# start the Elasticsearch server as a background subprocess
# (Elasticsearch refuses to run as root, so switch to the daemon user)
import os
from subprocess import Popen, PIPE, STDOUT
es_subprocess = Popen(['elasticsearch-7.11.1/bin/elasticsearch'],
                      stdout=PIPE, stderr=STDOUT,
                      preexec_fn=lambda: os.setuid(1))  # uid 1 = daemon
# give the server a minute or two to boot, then confirm it responds
!curl -X GET "localhost:9200/"
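Instead of waiting blindly, a small polling loop can block until the server answers; a sketch, assuming the default port 9200.
import time, requests
# poll for up to ~2 minutes until Elasticsearch responds
for _ in range(60):
    try:
        if requests.get("http://localhost:9200/").ok:
            print("Elasticsearch is up")
            break
    except requests.exceptions.ConnectionError:
        time.sleep(2)  # not up yet, retry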
# install the Elasticsearch Python client (pinned to 7.x to match the server)
!pip install -q "elasticsearch<8"
# check that the client can reach the server running in the background
from elasticsearch import Elasticsearch, helpers
es_client = Elasticsearch(['localhost'])
es_client.info()
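For vector search, Elasticsearch 7.x needs a dense_vector mapping and a script_score query on top of this setup. A sketch: the index name "papers", the field names, and the 768-dim size are assumptions.
# create an index with a dense_vector field (names and dims are assumptions)
es_client.indices.create(index="papers", body={
    "mappings": {"properties": {
        "title": {"type": "text"},
        "vector": {"type": "dense_vector", "dims": 768}
    }}})
# encode the query with the same sentence transformer used for indexing
query_vector = model.encode([user_query])[0].tolist()
# cosine-similarity search via script_score
response = es_client.search(index="papers", body={
    "size": 10,
    "query": {"script_score": {
        "query": {"match_all": {}},
        "script": {
            "source": "cosineSimilarity(params.query_vector, 'vector') + 1.0",
            "params": {"query_vector": query_vector}
        }}}})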
Annoy
!pip install -q annoy
import glob
import os
import pickle
import numpy as np
from tqdm import tqdm
from annoy import AnnoyIndex
# Dictionaries mapping Annoy item indices back to file names, vectors, and product ids
file_index_to_file_name = {}
file_index_to_file_vector = {}
file_index_to_product_id = {}
# Configure Annoy parameters
dims = 256                  # dimensionality of the image feature vectors
n_nearest_neighbors = 20
trees = 10000               # more trees -> better recall, slower build
# Collect the files that store the image feature vectors
allfiles = glob.glob('/content/img_vectors/*.npz')
t = AnnoyIndex(dims, metric='angular')   # angular = cosine-style distance
for findex, fname in tqdm(enumerate(allfiles)):
    file_vector = np.loadtxt(fname)      # vectors are stored as plain text
    file_name = os.path.basename(fname).split('.')[0]
    file_index_to_file_name[findex] = file_name
    file_index_to_file_vector[findex] = file_vector
    try:
        file_index_to_product_id[findex] = match_id(file_name)
    except IndexError:
        pass
    t.add_item(findex, file_vector)
t.build(trees)
t.save('t.ann')
# Persist the index and the lookup dictionaries to Drive
file_path = '/content/drive/MyDrive/ImgSim/'
t.save(file_path + 'indexer.ann')
pickle.dump(file_index_to_file_name, open(file_path + "file_index_to_file_name.p", "wb"))
pickle.dump(file_index_to_file_vector, open(file_path + "file_index_to_file_vector.p", "wb"))
pickle.dump(file_index_to_product_id, open(file_path + "file_index_to_product_id.p", "wb"))
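To query later without rebuilding, the index and the lookup dictionaries can be loaded back; a minimal sketch under the same paths.
# load the persisted index (Annoy mmaps the file, so this is fast)
u = AnnoyIndex(dims, metric='angular')
u.load(file_path + 'indexer.ann')
file_index_to_file_name = pickle.load(open(file_path + "file_index_to_file_name.p", "rb"))
# neighbours of an already-indexed item, e.g. item 0
print([file_index_to_file_name[i] for i in u.get_nns_by_item(0, n_nearest_neighbors)])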
# Index every image path by file name for quick lookup during display
from pathlib import Path
path_dict = {}
for path in Path('/content/Fashion_data/categories').rglob('*.jpg'):
    path_dict[path.name] = path
# Retrieve the topK nearest neighbours of a query image vector (test_vec)
nns = t.get_nns_by_vector(test_vec, n=topK)
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

plt.figure(figsize=(20, 10))
for i in range(topK):
    x = file_index_to_file_name[nns[i]]
    x = path_dict[x + '.jpg']
    y = file_index_to_product_id[nns[i]]
    title = '\n'.join([str(j) for j in list(styles.loc[y].values[-5:])])  # styles: product metadata DataFrame
    plt.subplot(1, topK, i + 1)
    plt.title(title)
    plt.imshow(mpimg.imread(x))
    plt.axis('off')
plt.tight_layout()
Milvus + Redis
# build Milvus 1.1 from source
!git clone -b 1.1 https://github.com/milvus-io/milvus.git
%cd /content/milvus/core
!./ubuntu_build_deps.sh
!./build.sh -t Release
# !./build.sh -t Release -g
%cd /content/milvus/core/milvus
!echo $LD_LIBRARY_PATH
import os
os.environ['LD_LIBRARY_PATH'] += ":/content/milvus/core/milvus/lib"
!echo $LD_LIBRARY_PATH
# start the Milvus server in the background
%cd scripts
!nohup ./start_server.sh &
!cat nohup.out
from milvus import Milvus, IndexType, MetricType, Status
import redis
# connect to the Milvus server and to Redis (used as a key-value store for film metadata)
milv = Milvus(host='127.0.0.1', port=19530)
r = redis.StrictRedis(host="127.0.0.1", port=6379)
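A quick connectivity check for both services; this assumes a local Redis server has been started separately (e.g. redis-server --daemonize yes).
print(milv.server_status())   # (Status, 'OK') when Milvus is up
print(r.ping())               # True when Redis answers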
COLLECTION_NAME = 'demo_films'
PARTITION_NAME = 'Movie'
# Drop the collection for a clean-slate run
milv.drop_collection(COLLECTION_NAME)
param = {
    'collection_name': COLLECTION_NAME,
    'dimension': 32,            # the film embeddings are 32-dimensional
    'index_file_size': 2048,
    'metric_type': MetricType.L2
}
milv.create_collection(param)
# insert the film embeddings together with their ids
status = milv.insert(collection_name=COLLECTION_NAME, records=embeddings, ids=ids)
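Milvus stores only vectors and ids, so the film metadata lives in Redis. A sketch: the Redis key scheme and the titles list are assumptions, and the IVF_FLAT index is optional (Milvus falls back to brute-force search without it).
# optional: build an IVF_FLAT index over the inserted vectors
milv.create_index(COLLECTION_NAME, IndexType.IVF_FLAT, {'nlist': 2048})
# keep id -> title in Redis (key scheme and `titles` are assumptions)
for film_id, title in zip(ids, titles):
    r.set(f"{COLLECTION_NAME}:{film_id}", title)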
import numpy as np
from paddle_serving_app.local_predict import LocalPredictor

class RecallServerServicer(object):
    def __init__(self):
        self.uv_client = LocalPredictor()
        self.uv_client.load_model_config("movie_recommender/user_vector_model/serving_server_dir")

    def hash2(self, a):
        # hash raw feature strings into a fixed vocabulary of 1,000,000 buckets
        return hash(a) % 1000000

    def get_user_vector(self):
        # build a single-user feed (userid, gender, age, occupation) with LoD info
        dic = {"userid": [], "gender": [], "age": [], "occupation": []}
        lod = [0]
        dic["userid"].append(self.hash2('0'))
        dic["gender"].append(self.hash2('M'))
        dic["age"].append(self.hash2('23'))
        dic["occupation"].append(self.hash2('6'))
        lod.append(1)
        dic["userid.lod"] = lod
        dic["gender.lod"] = lod
        dic["age.lod"] = lod
        dic["occupation.lod"] = lod
        for key in dic:
            dic[key] = np.array(dic[key]).astype(np.int64).reshape(len(dic[key]), 1)
        # run the user-vector model locally and return the recalled embedding
        fetch_map = self.uv_client.predict(feed=dic, fetch=["save_infer_model/scale_0.tmp_1"], batch=True)
        return fetch_map["save_infer_model/scale_0.tmp_1"].tolist()[0]

recall = RecallServerServicer()
user_vector = recall.get_user_vector()
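Finally, the recalled user vector drives the similarity search, and Redis resolves the returned ids back to film metadata; a sketch reusing the assumed key scheme from above and assuming the user vector matches the collection's 32 dimensions.
# top-10 films whose embeddings are closest to the user vector
status, results = milv.search(collection_name=COLLECTION_NAME, top_k=10,
                              query_records=[user_vector],
                              params={'nprobe': 16})
for hit in results[0]:
    print(hit.id, hit.distance, r.get(f"{COLLECTION_NAME}:{hit.id}"))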