向量数据库:faiss的IndexPQ中PQ的图解+实现质心表的融合

2024-01-07 20:51:43

IndexPQ

  • 一个 IndexPQ 的简单示例:
import numpy as np
import faiss

# Build a reproducible toy dataset: 10000 random float32 vectors of dimension 64.
np.random.seed(42)
data = np.random.random((10000, 64)).astype('float32')

# PQ parameters.
m = 8      # number of sub-spaces
nbits = 8  # number of bits used to encode each sub-space

# Create the IndexPQ, train it on the data, then add the same data to it.
index = faiss.IndexPQ(data.shape[1], m, nbits)
index.train(data)
index.add(data)

# A single random query; k is how many nearest neighbours to retrieve.
query_vector = np.random.random((1, 64)).astype('float32')
k = 5

# Run the search.
distances, indices = index.search(query_vector, k)

# Print the query followed by the neighbour ids and their distances.
print("Query Vector:", query_vector, sep="\n")
print("\nIndices of Nearest Neighbors:", indices, sep="\n")
print("\nDistances to Nearest Neighbors:", distances, sep="\n")

在这里插入图片描述

参数解释
M:number of subquantizers,输入向量被分为的片段(子向量)的个数
dsub:dimensionality of each subvector,每个子聚类表的长度(即每个子向量的维度,dsub = d / M)
ksub = $2^{n}$(n 即 nbits):number of centroids for each subquantizer,每个子聚类表的宽度(即每个子量化器的质心个数)

在这里插入图片描述

可通过faiss.vector_to_array(index.pq.centroids)查看质心表中的具体数值

  • index.train()之前:
    在这里插入图片描述
  • index.train()之后:
    在这里插入图片描述

实现“偷梁换柱”

import numpy as np
import faiss

def _show_results(query, neighbor_ids, neighbor_dists):
    """Print the query vector, then the neighbour ids and their distances."""
    print("Query Vector:")
    print(query)
    print("\nIndices of Nearest Neighbors:")
    print(neighbor_ids)
    print("\nDistances to Nearest Neighbors:")
    print(neighbor_dists)


# Two independent random datasets drawn from the same seeded generator.
np.random.seed(42)
data = np.random.random((10000, 64)).astype('float32')
data2 = np.random.random((10000, 64)).astype('float32')

# PQ parameters: m sub-spaces, nbits bits per sub-space.
m, nbits = 8, 8

# First index: trained on `data`.
index = faiss.IndexPQ(data.shape[1], m, nbits)
index.train(data)

# Second index: trained on `data2`, then its centroid table is replaced
# by the one learned by the first index.
index2 = faiss.IndexPQ(data.shape[1], m, nbits)
index2.train(data2)
index2.pq.centroids = index.pq.centroids

# Both indexes store the same vectors.
for pq_index in (index, index2):
    pq_index.add(data)

# One random query; k is how many nearest neighbours to retrieve.
query_vector = np.random.random((1, 64)).astype('float32')
k = 5

# Search the first index and print what it returns.
distances, indices = index.search(query_vector, k)
_show_results(query_vector, indices, distances)

# Search the second index and print what it returns.
distances, indices = index2.search(query_vector, k)
_show_results(query_vector, indices, distances)
# Query Vector:
# [[0.18171448 0.34181556 0.6398858  0.292473   0.44219118 0.63791186
#   0.19401862 0.17734843 0.26126006 0.38929975 0.02442818 0.72467136
#   0.9121011  0.0601452  0.42044804 0.56506294 0.9892394  0.2520515
#   0.12554157 0.3569948  0.7176223  0.6282157  0.53028387 0.19011611
#   0.8374111  0.91366297 0.6300717  0.21906242 0.34832168 0.6042122
#   0.55216706 0.15355448 0.47739747 0.07588766 0.45951515 0.46728414
#   0.8784772  0.2502514  0.8283812  0.77515835 0.7159397  0.6975115
#   0.24739715 0.89320683 0.07678613 0.7589492  0.29475844 0.8860514
#   0.8515612  0.9372315  0.5690415  0.02019571 0.78275704 0.02964665
#   0.36082503 0.22074123 0.4638003  0.3445418  0.8347299  0.3678306
#   0.00145097 0.44658396 0.02120558 0.74333763]]

# Indices of Nearest Neighbors:
# [[1356 3975 2011 5711 4734]]

# Distances to Nearest Neighbors:
# [[5.3155017 5.561659  5.6874743 5.7380037 5.762418 ]]
# Query Vector:
# [[0.18171448 0.34181556 0.6398858  0.292473   0.44219118 0.63791186
#   0.19401862 0.17734843 0.26126006 0.38929975 0.02442818 0.72467136
#   0.9121011  0.0601452  0.42044804 0.56506294 0.9892394  0.2520515
#   0.12554157 0.3569948  0.7176223  0.6282157  0.53028387 0.19011611
#   0.8374111  0.91366297 0.6300717  0.21906242 0.34832168 0.6042122
#   0.55216706 0.15355448 0.47739747 0.07588766 0.45951515 0.46728414
#   0.8784772  0.2502514  0.8283812  0.77515835 0.7159397  0.6975115
#   0.24739715 0.89320683 0.07678613 0.7589492  0.29475844 0.8860514
#   0.8515612  0.9372315  0.5690415  0.02019571 0.78275704 0.02964665
#   0.36082503 0.22074123 0.4638003  0.3445418  0.8347299  0.3678306
#   0.00145097 0.44658396 0.02120558 0.74333763]]

# Indices of Nearest Neighbors:
# [[1356 3975 2011 5711 4734]]

# Distances to Nearest Neighbors:
# [[5.3155017 5.561659  5.6874743 5.7380037 5.762418 ]]

另一种融合

import numpy as np
import faiss

# PQ parameters: m sub-spaces, nbits bits per sub-space.
m, nbits = 8, 8

# Two independent random datasets drawn from the same seeded generator.
np.random.seed(42)
data = np.random.random((10000, 64)).astype('float32')
data2 = np.random.random((10000, 64)).astype('float32')

# Two ordinary indexes, each trained on its own dataset.
index = faiss.IndexPQ(data.shape[1], m, nbits)
index.train(data)

index2 = faiss.IndexPQ(data.shape[1], m, nbits)
index2.train(data2)

# index3 receives the fusion of the codebooks of index and index2.
# One extra bit per code doubles the number of centroids per
# sub-quantizer, which is exactly the room needed to hold both codebooks.
# train() is still called so that the index is marked as trained; the
# centroids it learns are overwritten just below.
index3 = faiss.IndexPQ(data.shape[1], m, nbits + 1)
index3.train(data2)

# faiss stores the centroid table flat, laid out as (M, ksub, dsub):
# all centroids of sub-quantizer 0, then all of sub-quantizer 1, and so on.
# Joining the two *flat* tables end to end would therefore hand
# sub-quantizer 0 of index3 the centroids of sub-spaces 0 and 1 of `index`
# (and fill sub-quantizers 4..7 only from index2). The codebooks have to be
# merged per sub-quantizer instead, i.e. along the ksub axis.
pq, pq2, pq3 = index.pq, index2.pq, index3.pq
codebook = faiss.vector_to_array(pq.centroids).reshape(pq.M, pq.ksub, pq.dsub)
codebook2 = faiss.vector_to_array(pq2.centroids).reshape(pq2.M, pq2.ksub, pq2.dsub)
merged = np.concatenate((codebook, codebook2), axis=1)

expected_shape = (pq3.M, pq3.ksub, pq3.dsub)
if merged.shape != expected_shape:
    raise ValueError(
        f"merged codebook has shape {merged.shape}, "
        f"but index3 expects {expected_shape}"
    )

faiss.copy_array_to_vector(np.ascontiguousarray(merged).ravel(), pq3.centroids)


# Add the same vectors to all three indexes.
index.add(data)
index2.add(data)
index3.add(data)

# One random query; k is how many nearest neighbours to retrieve.
query_vector = np.random.random((1, 64)).astype('float32')
k = 5

# Search the first index and print what it returns.
distances, indices = index.search(query_vector, k)
print("\nIndices of Nearest Neighbors:")
print(indices)
print("\nDistances to Nearest Neighbors:")
print(distances)

# Search the second index and print what it returns.
distances, indices = index2.search(query_vector, k)
print("\nIndices of Nearest Neighbors:")
print(indices)
print("\nDistances to Nearest Neighbors:")
print(distances)


# Search the fused index and print what it returns.
distances, indices = index3.search(query_vector, k)
print("\nIndices of Nearest Neighbors:")
print(indices)
print("\nDistances to Nearest Neighbors:")
print(distances)


# Recorded output for index:
# Indices of Nearest Neighbors:
# [[1356 3975 2011 5711 4734]]

# Distances to Nearest Neighbors:
# [[5.3155017 5.561659  5.6874743 5.7380037 5.762418 ]]

# Recorded output for index2:
# Indices of Nearest Neighbors:
# [[7929 4107  961 2473 4802]]

# Distances to Nearest Neighbors:
# [[5.2938    5.525796  5.57064   5.7225237 5.799486 ]]

# Recorded output for index3.
# NOTE: these numbers were captured when the two flat centroid tables were
# simply joined end to end; with the per-sub-quantizer merge above the
# values for index3 will differ. Re-run the script to refresh them.
# Indices of Nearest Neighbors:
# [[7929  961 8924 6034 7534]]

# Distances to Nearest Neighbors:
# [[5.272976 5.293335 5.605359 5.696639 5.707428]]
  • 我用以下代码输出了查询结果对应的向量。可见三个索引的输出结果只有部分相同,这是因为 index3 的质心表被修改(扩充)了,而距离计算正是依赖于质心表。
# Print every returned neighbour: its rank, id, distance and stored vector.
print("\nNearest Neighbors:")
for rank, (neighbor_index, neighbor_distance) in enumerate(
        zip(indices[0, :k], distances[0, :k]), start=1):
    neighbor_vector = data[neighbor_index]
    print(f"Neighbor {rank}: Index {neighbor_index}, Distance {neighbor_distance}, Vector {neighbor_vector}")
Nearest Neighbors:
Neighbor 1: Index 1356, Distance 5.315501689910889, Vector [0.01101539 0.6567009  0.7633245  0.11660998 0.33732712 0.8499721
 0.68720007 0.30464375 0.7422429  0.88726753 0.30932006 0.6842837
 0.09341944 0.0586829  0.58625734 0.49242404 0.8100883  0.7802833
 0.2866956  0.5122624  0.7557766  0.27095273 0.36196133 0.05986348
 0.13048859 0.6102204  0.49675122 0.16859066 0.0072812  0.16903314
 0.7496399  0.09368231 0.40244937 0.23878902 0.54939663 0.51155233
 0.98295355 0.7728801  0.9383296  0.5779583  0.73778135 0.8950766
 0.041071   0.91545016 0.21177031 0.7050161  0.7733409  0.109326
 0.9530999  0.92655915 0.65455276 0.15532914 0.5660506  0.34414485
 0.9307643  0.40665573 0.69374937 0.6370151  0.2710153  0.53549683
 0.40998015 0.37462777 0.86400545 0.13975835]
Neighbor 2: Index 3975, Distance 5.56165885925293, Vector [0.6852252  0.79311645 0.3148995  0.42644194 0.43068996 0.21183491
 0.05787511 0.9602238  0.29530123 0.68910587 0.15870273 0.708609
 0.86639625 0.4510904  0.95853996 0.23694353 0.9699781  0.77007866
 0.48550996 0.40872052 0.46613166 0.24974766 0.01244073 0.43974018
 0.6752544  0.85017306 0.81168395 0.89650345 0.00525839 0.26145405
 0.16250415 0.26849723 0.01632813 0.28710592 0.73261696 0.00488606
 0.64295805 0.55107576 0.56322    0.731344   0.98232174 0.511173
 0.18898515 0.914521   0.59773636 0.7063284  0.73153925 0.97906655
 0.7590872  0.4468203  0.8288643  0.39922148 0.6796608  0.2297831
 0.6257001  0.5006799  0.8744495  0.14236866 0.12442626 0.14521043
 0.08433475 0.96692973 0.13060258 0.35526052]

Nearest Neighbors:
Neighbor 1: Index 7929, Distance 5.293799877166748, Vector [0.96233946 0.5737502  0.59273595 0.23098944 0.5369705  0.63797593
 0.42823425 0.24575251 0.8893288  0.54502964 0.8060116  0.65886575
 0.78253627 0.36670887 0.02456753 0.9354817  0.50337505 0.10899781
 0.2375323  0.617193   0.43202353 0.2877622  0.23769969 0.46321324
 0.54506296 0.92509645 0.6306161  0.29780295 0.4218431  0.03696149
 0.3116852  0.390165   0.9549252  0.3775373  0.5620233  0.9112755
 0.1394593  0.2466888  0.9241558  0.86005247 0.7937772  0.9627047
 0.09679138 0.8644842  0.071664   0.19625679 0.01667842 0.68986166
 0.71011275 0.7705593  0.67370415 0.07858868 0.4308906  0.09075476
 0.03766147 0.18467574 0.2782387  0.37127924 0.98378307 0.48489136
 0.22696696 0.07038712 0.22267212 0.10312359]
Neighbor 2: Index 4107, Distance 5.525795936584473, Vector [0.04460111 0.5836406  0.27762762 0.75389206 0.52659243 0.88937527
 0.5552024  0.43461925 0.12575674 0.29606643 0.19991362 0.86584586
 0.53224045 0.20149525 0.34396216 0.05069733 0.5733588  0.06891397
 0.55476147 0.6457947  0.6288594  0.30873945 0.02107575 0.02294
 0.05592747 0.21791738 0.37937504 0.93809557 0.72561693 0.70872927
 0.89278466 0.8034361  0.78736126 0.15266728 0.6486509  0.34981716
 0.91982204 0.00775846 0.585377   0.775304   0.5465568  0.80789727
 0.9480229  0.705922   0.7635816  0.4436006  0.7039021  0.7166679
 0.6503457  0.8582911  0.3602512  0.37543017 0.9880262  0.28702474
 0.14523816 0.4190667  0.86942685 0.15948081 0.83756304 0.5973361
 0.0859841  0.40533915 0.47337615 0.48650718]

Nearest Neighbors:
Neighbor 1: Index 7929, Distance 5.272975921630859, Vector [0.96233946 0.5737502  0.59273595 0.23098944 0.5369705  0.63797593
 0.42823425 0.24575251 0.8893288  0.54502964 0.8060116  0.65886575
 0.78253627 0.36670887 0.02456753 0.9354817  0.50337505 0.10899781
 0.2375323  0.617193   0.43202353 0.2877622  0.23769969 0.46321324
 0.54506296 0.92509645 0.6306161  0.29780295 0.4218431  0.03696149
 0.3116852  0.390165   0.9549252  0.3775373  0.5620233  0.9112755
 0.1394593  0.2466888  0.9241558  0.86005247 0.7937772  0.9627047
 0.09679138 0.8644842  0.071664   0.19625679 0.01667842 0.68986166
 0.71011275 0.7705593  0.67370415 0.07858868 0.4308906  0.09075476
 0.03766147 0.18467574 0.2782387  0.37127924 0.98378307 0.48489136
 0.22696696 0.07038712 0.22267212 0.10312359]
Neighbor 2: Index 961, Distance 5.2933349609375, Vector [0.9621167  0.2617852  0.48362496 0.88779247 0.4134914  0.52861816
 0.16878773 0.2850794  0.5061142  0.36490148 0.0382557  0.40082905
 0.81510574 0.11605944 0.01873139 0.11870275 0.6868702  0.79464465
 0.04872655 0.8875509  0.62732536 0.5181314  0.2535919  0.37170032
 0.94697326 0.9115464  0.62546456 0.57891124 0.21054466 0.95327854
 0.7553917  0.3822597  0.81583154 0.21187466 0.21322866 0.7909612
 0.559308   0.5558353  0.5736708  0.12580682 0.34955907 0.57307965
 0.24758843 0.50400496 0.55703527 0.9428139  0.2457758  0.43935728
 0.98151124 0.18678987 0.78001946 0.17715496 0.8500466  0.48797393
 0.9721615  0.17007497 0.68792635 0.69527924 0.7188754  0.10096876
 0.288561   0.33801684 0.3242876  0.6750207 ]

文章来源:https://blog.csdn.net/ResumeProject/article/details/135407317
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。