Replace oss URLs with http #7

Open · wants to merge 1 commit into main
67 changes: 44 additions & 23 deletions ofasys/adaptor/image_vqgan.py
@@ -22,12 +22,15 @@ def make_vqgan_code_bucket_position(bucket_size, num_relative_distance):
coords_w = torch.arange(bucket_size)
coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
- relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
- relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
+ relative_coords = coords_flatten[:, :, None] - \
+ coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
+ relative_coords = relative_coords.permute(
+ 1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
relative_coords[:, :, 0] += bucket_size - 1 # shift to start from 0
relative_coords[:, :, 1] += bucket_size - 1
relative_coords[:, :, 0] *= 2 * bucket_size - 1
- relative_position_index = torch.zeros(size=(bucket_size * bucket_size + 1,) * 2, dtype=relative_coords.dtype)
+ relative_position_index = torch.zeros(
+ size=(bucket_size * bucket_size + 1,) * 2, dtype=relative_coords.dtype)
relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
relative_position_index[0, 0:] = num_relative_distance - 3
relative_position_index[0:, 0] = num_relative_distance - 2
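The hunk above only re-wraps the relative-position bucketing code; its behaviour is unchanged. For orientation, a minimal sketch of what the helper produces, assuming it is importable from ofasys.adaptor.image_vqgan and using bucket_size=42 purely as an example value:

```python
from ofasys.adaptor.image_vqgan import make_vqgan_code_bucket_position  # assumed import path

bucket_size = 42                               # example value, not necessarily the configured default
num_rel_dis = (2 * bucket_size - 1) ** 2 + 3   # same formula as code_num_rel_dis in __init__ below
rp_bucket = make_vqgan_code_bucket_position(bucket_size, num_rel_dis)

print(rp_bucket.shape)       # torch.Size([1765, 1765]) == (bucket_size**2 + 1,) * 2
print(int(rp_bucket.max()))  # always < num_rel_dis, so it can index the relative-position Embedding
```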
@@ -47,15 +50,17 @@ class ImageVqganAdaptorConfig(BaseAdaptorConfig):
)
vqgan_factor: int = field(default=8, metadata={"help": "vqgan factor"})
vqgan_model_path: str = field(
default="oss://ofasys/tasks/image_gen/vqgan/last.ckpt",
default="http://ofasys.oss-cn-zhangjiakou.aliyuncs.com/tasks/image_gen/vqgan/last.ckpt",
metadata={"help": "path of vqgan model"},
)
vqgan_config_path: str = field(
default="oss://ofasys/tasks/image_gen/vqgan/model.yaml",
default="http://ofasys.oss-cn-zhangjiakou.aliyuncs.com/tasks/image_gen/vqgan/model.yaml",
metadata={"help": "path of vqgan config"},
)
- use_encode: bool = field(default=True, metadata={"help": "where to use tokenizer.encode in map"})
- code_entry_prefix: str = field(default='code', metadata={"help": "prefix of code entry in the global_dict"})
+ use_encode: bool = field(default=True, metadata={
+ "help": "where to use tokenizer.encode in map"})
+ code_entry_prefix: str = field(default='code', metadata={
+ "help": "prefix of code entry in the global_dict"})


@register_config("ofasys.adaptor", "image_vqgan", ImageVqganAdaptorConfig)
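The two defaults above now point at plain-http endpoints rather than oss:// paths, so the files can be fetched with any ordinary HTTP client. If the public bucket is slow or unreachable, they can be mirrored locally and vqgan_model_path / vqgan_config_path pointed at the copies; a minimal sketch using only the standard library (local filenames are arbitrary):

```python
import urllib.request

BASE = "http://ofasys.oss-cn-zhangjiakou.aliyuncs.com/tasks/image_gen/vqgan"
for name in ("last.ckpt", "model.yaml"):
    # download each file into the current working directory
    urllib.request.urlretrieve(f"{BASE}/{name}", name)
```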
@@ -72,21 +77,29 @@ def __init__(

self.window_size = cfg.code_image_size // cfg.vqgan_factor

- self.embed_code_positions = Embedding(cfg.code_bucket_size**2 + 1, cfg.embed_dim)
+ self.embed_code_positions = Embedding(
+ cfg.code_bucket_size**2 + 1, cfg.embed_dim)

- code_num_rel_dis = (2 * cfg.code_bucket_size - 1) * (2 * cfg.code_bucket_size - 1) + 3
- code_rp_bucket = make_vqgan_code_bucket_position(cfg.code_bucket_size, code_num_rel_dis)
+ code_num_rel_dis = (2 * cfg.code_bucket_size - 1) * \
+ (2 * cfg.code_bucket_size - 1) + 3
+ code_rp_bucket = make_vqgan_code_bucket_position(
+ cfg.code_bucket_size, code_num_rel_dis)
code_position_idx = (
- torch.arange(self.window_size).unsqueeze(0).expand(self.window_size, self.window_size)
- + torch.arange(self.window_size).unsqueeze(1) * cfg.code_bucket_size
+ torch.arange(self.window_size).unsqueeze(
+ 0).expand(self.window_size, self.window_size)
+ + torch.arange(self.window_size).unsqueeze(1) *
+ cfg.code_bucket_size
+ 1
)
- code_position_idx = torch.cat([torch.tensor([0]), code_position_idx.view(-1)])
- code_position_idx = torch.cat([code_position_idx, torch.tensor([1024] * 768)])
+ code_position_idx = torch.cat(
+ [torch.tensor([0]), code_position_idx.view(-1)])
+ code_position_idx = torch.cat(
+ [code_position_idx, torch.tensor([1024] * 768)])

num_rel_pos_tables = 1 if self.cfg.share_attn_bias else self.num_layers
self.code_rel_pos_table_list = nn.ModuleList(
- [Embedding(code_num_rel_dis, cfg.num_attention_heads, zero_init=True) for _ in range(num_rel_pos_tables)]
+ [Embedding(code_num_rel_dis, cfg.num_attention_heads, zero_init=True)
+ for _ in range(num_rel_pos_tables)]
)
self.tokenizer = VQGANTokenizer(
vqgan_config_path=cfg.vqgan_config_path,
@@ -102,8 +115,10 @@ def __init__(

def get_rel_pos_bias(self, batch_size, seq_length, idx, **kwargs):
code_position_idx = self.code_position_idx[:seq_length]
- rp_bucket = self.code_rp_bucket[code_position_idx][:, code_position_idx]
- values = F.embedding(rp_bucket, self.code_rel_pos_table_list[idx].weight)
+ rp_bucket = self.code_rp_bucket[code_position_idx][:,
+ code_position_idx]
+ values = F.embedding(
+ rp_bucket, self.code_rel_pos_table_list[idx].weight)
return values

def update_sample(self, sample: Dict):
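get_rel_pos_bias above is a pure table lookup: rp_bucket holds a bucket id for every (query, key) position pair, and F.embedding maps each id to one bias value per attention head. A toy standalone version with made-up sizes:

```python
import torch
import torch.nn.functional as F

num_buckets, num_heads, seq_len = 10, 4, 5
table = torch.nn.Embedding(num_buckets, num_heads)          # stands in for code_rel_pos_table_list[idx]
rp_bucket = torch.randint(num_buckets, (seq_len, seq_len))  # bucket id per (query, key) pair
values = F.embedding(rp_bucket, table.weight)
print(values.shape)  # torch.Size([5, 5, 4]): one bias per head for every position pair
```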
@@ -122,13 +137,18 @@ def update_sample(self, sample: Dict):
for i, slot in enumerate(sample['net_input']['slots']):
if self.check_adaptor_slot(slot):
image_tensor = slot.value
- codes = self.tokenizer.encode(image_tensor.float()) + self.code_index_start
+ codes = self.tokenizer.encode(
+ image_tensor.float()) + self.code_index_start
batch_size = codes.size()[0]
- codes = torch.cat([codes.new_ones((batch_size, 1)) * 0, codes], dim=-1)
- codes = torch.cat([codes, codes.new_ones((batch_size, 1)) * 2], dim=-1)
- sample['net_input']['slots'][i].value = codes[:, :-1].contiguous()
+ codes = torch.cat(
+ [codes.new_ones((batch_size, 1)) * 0, codes], dim=-1)
+ codes = torch.cat(
+ [codes, codes.new_ones((batch_size, 1)) * 2], dim=-1)
+ sample['net_input']['slots'][i].value = codes[:,
+ :-1].contiguous()
sample['target'] = codes[:, 1:].contiguous()
- sample['ntokens'] = sample['target'].ne(1).long().sum().item()
+ sample['ntokens'] = sample['target'].ne(
+ 1).long().sum().item()

return sample

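The re-wrapped lines above implement the usual teacher-forcing shift: the code sequence is framed with what are presumably the BOS (0) and EOS (2) ids, the model input drops the last position and the target drops the first. A toy illustration with made-up code values (the pad id is assumed to be 1, which is why ntokens counts non-1 targets):

```python
import torch

codes = torch.tensor([[7, 8, 9]])                                # fake VQGAN codes for one image
codes = torch.cat([codes.new_ones((1, 1)) * 0, codes], dim=-1)   # prepend BOS -> [[0, 7, 8, 9]]
codes = torch.cat([codes, codes.new_ones((1, 1)) * 2], dim=-1)   # append EOS  -> [[0, 7, 8, 9, 2]]
inputs = codes[:, :-1].contiguous()                              # [[0, 7, 8, 9]]
target = codes[:, 1:].contiguous()                               # [[7, 8, 9, 2]]
ntokens = target.ne(1).long().sum().item()                       # 4: nothing is padding in this toy batch
```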
@@ -172,7 +192,8 @@ def upgrade_state_dict_named(self, state_dict, name):
num_posids_to_add = len(self.state_dict()["embed_code_positions.weight"]) - len(
state_dict[prefix + "embed_code_positions.weight"]
)
embed_dim = state_dict[prefix + "embed_code_positions.weight"].size(1)
embed_dim = state_dict[prefix +
"embed_code_positions.weight"].size(1)
new_pos_embed_to_add = torch.zeros(num_posids_to_add, embed_dim)
nn.init.normal_(new_pos_embed_to_add, mean=0, std=embed_dim**-0.5)
new_pos_embed_to_add = new_pos_embed_to_add.to(
2 changes: 1 addition & 1 deletion requirements.txt
@@ -31,7 +31,7 @@ rapidfuzz
pillow
opencv-python
timm>=0.5.0
- http://ofasys.oss-cn-zhangjiakou.aliyuncs.com/dependency/clip-1.0-py3-none-any.whl
+ git+https://github.com/openai/clip.git
diffusers
matplotlib

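The prebuilt wheel hosted on the OSS bucket is replaced by a VCS requirement that installs CLIP straight from the upstream repository. A quick sanity check that the installed package exposes the usual OpenAI CLIP API (the model name is only an example, and clip.load downloads weights on first use):

```python
import torch
import clip

model, preprocess = clip.load("ViT-B/32", device="cpu")
tokens = clip.tokenize(["a diagram", "a photo of a dog"])
with torch.no_grad():
    text_features = model.encode_text(tokens)
print(text_features.shape)  # torch.Size([2, 512]) for ViT-B/32
```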