Skip to content
0

文章发布较早,内容可能过时,阅读注意甄别。

1 CLIP模型的學習

github鏈接

1、openai/CLIP: CLIP (Contrastive Language-Image Pretraining), Predict the most relevant text snippet given an image

2、mlfoundations/open_clip: An open source implementation of CLIP.

1.1 代码实现

python
# Zero-shot image classification with OpenAI CLIP:
# embed one image and several candidate captions, then pick the caption
# whose embedding is most similar to the image's.
import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"

# Candidate captions to score against the image.
labels = ["a diagram", "a dog", "a cat"]

# Load the pretrained checkpoint together with its matching
# image-preprocessing pipeline.
model, preprocess = clip.load(r"/path/of/clip/pretrained/model", device=device)

# Preprocess to a (1, 3, 224, 224) batch; tokenize captions to (3, 77).
image = preprocess(Image.open(r"path/of/image")).unsqueeze(0).to(device)
text = clip.tokenize(labels).to(device)

with torch.no_grad():
    # Per-modality embeddings, each projected into the shared 512-d space.
    image_features = model.encode_image(image)  # (1, 512)
    text_features = model.encode_text(text)     # (3, 512)

    # The forward pass returns similarity logits in both directions:
    # logits_per_image is (1, 3); logits_per_text is its transpose, (3, 1).
    logits_per_image, logits_per_text = model(image, text)

    # Softmax over captions turns the logits into label probabilities.
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()
    print("Label probs:", probs)  # e.g. [[0.0028 0.9959 0.0013]]

    # Report the best-matching caption.
    _, pred = torch.max(logits_per_image, dim=1)
    print(f"This is {labels[pred[0]]}")  # e.g. "This is a dog"

2 Bert 預訓練模型

2.1 代碼實現

==説明:這裏加載的是預訓練的Bio_ClinicalBERT模型==

python
# Sentence embedding with a pretrained clinical BERT checkpoint
# (Bio_ClinicalBERT) loaded from a local directory via Hugging Face
# transformers.
from transformers import AutoTokenizer, AutoModel

# Tokenizer and encoder weights come from the same local checkpoint.
checkpoint_dir = '../BioClinical_BERT'
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = AutoModel.from_pretrained(checkpoint_dir)

# Example clinical-style input sentence.
sentence = "age is 67, gender is male"

# Tokenize to PyTorch tensors (adds the special [CLS]/[SEP] tokens and
# builds token-type ids plus an attention mask).
inputs = tokenizer(sentence, return_tensors='pt')
print("encoded_input", inputs)

# Run the encoder.
outputs = model(**inputs)
# (1, 768): pooled sentence-level representation.
print("pooler_output", outputs.pooler_output.shape)
# (1, seq_len, 768): per-token hidden states from the last layer.
print("last_hidden_state", outputs.last_hidden_state.shape)

打印結果輸出:

encoded_input {'input_ids': tensor([[ 101, 1425, 1110, 5486,  117, 5772, 1110, 2581,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}
pooler_output torch.Size([1, 768])
last_hidden_state torch.Size([1, 9, 768])
最近更新