main.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@File    : main.py
@Time    : 2024/12/31 13:45:25
@Author  : Tony Teng
@Version : 1.0
@Contact : [email protected]
@Desc    : Compare image-audio similarity with CLAP + CLIP embeddings and with ImageBind.
"""
import os

import numpy as np
from PIL import Image


def read_datasets():
    """Pair each image in datasets/image with the audio file in datasets/audio that
    shares its base name (assumes every audio file has a matching image)."""
    image_folder = "datasets/image"
    audio_folder = "datasets/audio"
    images = os.listdir(image_folder)
    audios = os.listdir(audio_folder)
    datasets = {}
    for image in images:
        datasets[image.split(".")[0]] = {
            "image": os.path.join(image_folder, image),
        }
    for audio in audios:
        datasets[audio.split(".")[0]]["audio"] = os.path.join(audio_folder, audio)
    return datasets
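
# Expected on-disk layout, inferred from read_datasets() rather than documented in the repo
# (file names and extensions below are illustrative):
#
#   datasets/
#     image/  bird.jpg, cat.jpg, ...
#     audio/  bird.wav, cat.wav, ...
#
# Each image/audio pair must share a base name so both land in the same dict entry.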


def load_models():
    # Heavy dependencies are imported lazily so test_imagebind() can run without them.
    import laion_clap
    import open_clip

    clap_model = laion_clap.CLAP_Module(enable_fusion=True, device="cuda")
    clap_model.load_ckpt()
    clip_model, _, preprocess = open_clip.create_model_and_transforms(
        "ViT-B-32", pretrained="laion2b_s34b_b79k"
    )
    clip_model.eval()
    tokenizer = open_clip.get_tokenizer("ViT-B-32")
    return clap_model, clip_model, tokenizer, preprocess
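
# NOTE (assumption): load_ckpt() with no arguments is expected to fetch LAION-CLAP's default
# pretrained checkpoint, and open_clip downloads the laion2b_s34b_b79k weights on first use,
# so the first run of test_clap_clip() needs network access.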


def load_image_bind():
    import sys

    sys.path.append("./ImageBind/")
    from ImageBind.imagebind.models import imagebind_model

    model = imagebind_model.imagebind_huge(pretrained=True)
    model.eval()
    model.to("cuda")
    return model


def cosine_distance(a, b):
    # 1 - cosine similarity: 0 for identical directions, up to 2 for opposite ones.
    return 1 - np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


def l2_distance(a, b):
    distance = np.linalg.norm(a - b)
    return distance
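
# Quick sanity check for the distance helpers (hypothetical values, not part of the pipeline):
#   cosine_distance(np.array([1.0, 0.0]), np.array([1.0, 0.0]))  -> 0.0
#   l2_distance(np.array([1.0, 0.0]), np.array([0.0, 1.0]))      -> ~1.414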


def test_clap_clip():
    datasets = read_datasets()
    clap_model, clip_model, _, preprocess = load_models()
    # Embed the "bird" audio clip with CLAP, then measure its distance to every image's
    # CLIP embedding.
    audio_embed = clap_model.get_audio_embedding_from_filelist(
        [datasets["bird"]["audio"]]
    )[0]
    for name, dataset in datasets.items():
        image = preprocess(Image.open(dataset["image"])).unsqueeze(0)
        features = clip_model.encode_image(image)
        features /= features.norm(dim=-1, keepdim=True)
        image_embedding = features[0].detach().numpy()
        distance = cosine_distance(image_embedding, audio_embed)
        print(f"{name}: {distance}")


def test_imagebind():
    datasets = read_datasets()
    model = load_image_bind()
    import torch
    from ImageBind.imagebind.data import (
        load_and_transform_audio_data,
        load_and_transform_vision_data,
    )
    from ImageBind.imagebind.models.imagebind_model import ModalityType

    keys = list(datasets)
    images = []
    audios = []
    for _, dataset in datasets.items():
        images.append(dataset["image"])
        audios.append(dataset["audio"])
    with torch.no_grad():
        embeddings = model(
            {
                ModalityType.VISION: load_and_transform_vision_data(images, "cuda"),
                ModalityType.AUDIO: load_and_transform_audio_data(audios, "cuda"),
            }
        )
    # Softmax over the vision x audio similarity matrix: row i says how strongly
    # image i matches each audio clip.
    results = torch.softmax(
        embeddings[ModalityType.VISION] @ embeddings[ModalityType.AUDIO].T, dim=-1
    )
    np.set_printoptions(precision=3)
    for i, key in enumerate(keys):
        for idx, result in enumerate(results[i]):
            print(f"{key} - {keys[idx]}: {result.cpu().detach().numpy()}")


def main():
    # Runs only the ImageBind comparison; swap in test_clap_clip() to run the CLAP + CLIP path.
    test_imagebind()


if __name__ == "__main__":
    main()