# Create a dedicated conda environment (Python 3.9) for this tutorial and
# switch to it.
conda create --name tutorial4 python=3.9
conda activate tutorial4

# Install PyTorch and its companion packages with the CUDA 12.1 build,
# searching the pytorch channel first and then the nvidia channel.
conda install \
  --channel pytorch \
  --channel nvidia \
  pytorch torchvision torchaudio pytorch-cuda=12.1

# Install the remaining Python packages. Everything except peft is pinned to
# an exact version.
pip install \
  peft \
  numpy==1.26.4 \
  matplotlib==3.8.4 \
  ipykernel==6.29.5 \
  transformers==4.42.4

# Obtain the BAAI/bge-m3 model as ./models--BAAI--bge-m3.
#
# If the shared copy below exists, copy it directly; otherwise download the
# model yourself through the hf-mirror.com endpoint. The two paths are
# alternatives, so only one of them is executed.
shared_model_dir=/lustre/public/tutorial/models/models--BAAI--bge-m3

if [ -d "${shared_model_dir}" ]; then
  # Shared copy is available: no network access is needed.
  cp -r "${shared_model_dir}/" ./
else
  # No shared copy: point the Hugging Face tooling at the mirror and download.
  export HF_ENDPOINT=https://hf-mirror.com
  huggingface-cli download --resume-download BAAI/bge-m3 --local-dir models--BAAI--bge-m3
fi

# Install FlagEmbedding, upgrading it if a version is already installed.
# It provides the BGEM3FlagModel class imported by the Python examples.
pip install --upgrade FlagEmbedding

from FlagEmbedding import BGEM3FlagModel

# Dense retrieval example: embed two lists of sentences and compare them
# with a matrix product of their dense vectors.

# Fill in the model path
# VAR_PLACEHOLDER
model = BGEM3FlagModel(
    'models--BAAI--bge-m3',
    use_fp16=True,
)

# Sentences to score.
# NOTE: "Defination" is spelled as in the original example; it is model
# input, so it is left unchanged.
sentences_1 = [
    "What is BGE M3?",
    "Defination of BM25",
]
sentences_2 = [
    ("BGE M3 is an embedding model supporting dense retrieval,"
     " lexical matching and multi-vector interaction."),
    ("BM25 is a bag-of-words retrieval function that ranks a "
     "set of documents based on the query terms "
     "appearing in each document"),
]

# Compute the embeddings. The result of encode() is indexed by 'dense_vecs'
# to obtain the dense vectors; only the first call overrides batch_size and
# max_length.
encoded_1 = model.encode(sentences_1, batch_size=12, max_length=8192)
embeddings_1 = encoded_1['dense_vecs']
encoded_2 = model.encode(sentences_2)
embeddings_2 = encoded_2['dense_vecs']

# Compute the similarity: one row per entry of sentences_1, one column per
# entry of sentences_2.
similarity = embeddings_1 @ embeddings_2.T
print(similarity)
# The result should be:
# [[0.6265, 0.3477], [0.3499, 0.678 ]]

from FlagEmbedding import BGEM3FlagModel

# Sparse retrieval example: score sentences by lexical matching of their
# per-token weights.

# Fill in the model path
# VAR_PLACEHOLDER
model = BGEM3FlagModel('models--BAAI--bge-m3', use_fp16=True)

# Sentences to score.
# NOTE: "Defination" is spelled as in the original example; it is model
# input, so it is left unchanged.
sentences_1 = [
    "What is BGE M3?",
    "Defination of BM25",
]
sentences_2 = [
    ("BGE M3 is an embedding model supporting dense retrieval,"
     " lexical matching and multi-vector interaction."),
    ("BM25 is a bag-of-words retrieval function that ranks a "
     "set of documents based on the query terms "
     "appearing in each document"),
]

# Compute similarity via lexical matching.
# Both lists are encoded with the same options: dense and sparse outputs
# requested, ColBERT vectors not requested.
encode_options = dict(
    return_dense=True,
    return_sparse=True,
    return_colbert_vecs=False,
)
output_1 = model.encode(sentences_1, **encode_options)
output_2 = model.encode(sentences_2, **encode_options)

weights_1 = output_1['lexical_weights']
weights_2 = output_2['lexical_weights']

# First sentence of sentences_1 against first sentence of sentences_2.
lexical_scores = model.compute_lexical_matching_score(weights_1[0], weights_2[0])
print(lexical_scores)
# 0.19554901123046875

# First sentence of sentences_1 against the second sentence of sentences_1.
score_within_list_1 = model.compute_lexical_matching_score(
    weights_1[0], weights_1[1])
print(score_within_list_1)
# 0.0

# Inspect the weight of each token:
print(model.convert_id_to_token(weights_1))
# [{'What': 0.08356, 'is': 0.0814, 'B': 0.1296, 'GE': 0.252,
# 'M': 0.1702, '3': 0.2695, '?': 0.04092}, 
#  {'De': 0.05005, 'fin': 0.1368, 'ation': 0.04498, 'of': 0.0633,
# 'BM': 0.2515, '25': 0.3335}]

from FlagEmbedding import BGEM3FlagModel

# Multi-vector retrieval example: score sentences with ColBERT vectors.

# Fill in the model path
# VAR_PLACEHOLDER
model = BGEM3FlagModel('models--BAAI--bge-m3', use_fp16=True)

# Sentences to score.
# NOTE: "Defination" is spelled as in the original example; it is model
# input, so it is left unchanged.
sentences_1 = [
    "What is BGE M3?",
    "Defination of BM25",
]
sentences_2 = [
    ("BGE M3 is an embedding model supporting dense retrieval,"
     " lexical matching and multi-vector interaction."),
    ("BM25 is a bag-of-words retrieval function that ranks a "
     "set of documents based on the query terms "
     "appearing in each document"),
]

# Compute similarity via ColBERT.
# Both lists are encoded with dense, sparse and ColBERT outputs requested.
encode_options = dict(
    return_dense=True,
    return_sparse=True,
    return_colbert_vecs=True,
)
output_1 = model.encode(sentences_1, **encode_options)
output_2 = model.encode(sentences_2, **encode_options)

# Score the first sentence of sentences_1 against each of the two sentences
# in sentences_2, in order.
for passage_index in (0, 1):
    print(model.colbert_score(
        output_1['colbert_vecs'][0],
        output_2['colbert_vecs'][passage_index]))
# 0.7797
# 0.4620

from FlagEmbedding import BGEM3FlagModel

# Weighted similarity example: combine the dense, sparse and ColBERT scores
# for every pairing of the two sentence lists.

# Fill in the model path
# VAR_PLACEHOLDER
model = BGEM3FlagModel('models--BAAI--bge-m3', use_fp16=True)

# Sentences to score.
# NOTE: "Defination" is spelled as in the original example; it is model
# input, so it is left unchanged.
sentences_1 = [
    "What is BGE M3?",
    "Defination of BM25",
]
sentences_2 = [
    ("BGE M3 is an embedding model supporting dense retrieval,"
     " lexical matching and multi-vector interaction."),
    ("BM25 is a bag-of-words retrieval function that ranks a "
     "set of documents based on the query terms "
     "appearing in each document"),
]

# Build every [sentence_1, sentence_2] pair, iterating sentences_1 in the
# outer loop and sentences_2 in the inner loop.
sentence_pairs = []
for first in sentences_1:
    for second in sentences_2:
        sentence_pairs.append([first, second])

# Compute the hybrid similarity:
# w[0]*dense_score + w[1]*sparse_score + w[2]*colbert_score
hybrid_scores = model.compute_score(
    sentence_pairs,
    max_passage_length=128,
    weights_for_different_modes=[0.4, 0.2, 0.4],
)
print(hybrid_scores)

Tutorial 4: BGE Embedding

1. 环境安装与应用创建

2. 下载模型

3. 模型使用

3.1 稠密检索

3.2 稀疏检索

3.3 多向量检索

3.4 加权语义相似度