From c308dbc6f223d3b5ffae865464fb881472eca579 Mon Sep 17 00:00:00 2001
From: Lengyue
Date: Wed, 24 May 2023 12:20:17 -0400
Subject: [PATCH] update dinov2 layerscale init values

---
 timm/models/vision_transformer.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/timm/models/vision_transformer.py b/timm/models/vision_transformer.py
index 0104e00701..b4f15cb8ba 100644
--- a/timm/models/vision_transformer.py
+++ b/timm/models/vision_transformer.py
@@ -1982,7 +1982,7 @@ def vit_small_patch14_dinov2(pretrained=False, **kwargs) -> VisionTransformer:
     """ ViT-S/14 for DINOv2
     """
     model_args = dict(
-        patch_size=14, embed_dim=384, depth=12, num_heads=6, init_values=1.0, img_size=518,
+        patch_size=14, embed_dim=384, depth=12, num_heads=6, init_values=1e-5, img_size=518,
     )
     model = _create_vision_transformer(
         'vit_small_patch14_dinov2', pretrained=pretrained, **dict(model_args, **kwargs))
@@ -1994,7 +1994,7 @@ def vit_base_patch14_dinov2(pretrained=False, **kwargs) -> VisionTransformer:
     """ ViT-B/14 for DINOv2
     """
     model_args = dict(
-        patch_size=14, embed_dim=768, depth=12, num_heads=12, init_values=1.0, img_size=518,
+        patch_size=14, embed_dim=768, depth=12, num_heads=12, init_values=1e-5, img_size=518,
     )
     model = _create_vision_transformer(
         'vit_base_patch14_dinov2', pretrained=pretrained, **dict(model_args, **kwargs))
@@ -2006,7 +2006,7 @@ def vit_large_patch14_dinov2(pretrained=False, **kwargs) -> VisionTransformer:
     """ ViT-L/14 for DINOv2
     """
     model_args = dict(
-        patch_size=14, embed_dim=1024, depth=24, num_heads=16, init_values=1.0, img_size=518,
+        patch_size=14, embed_dim=1024, depth=24, num_heads=16, init_values=1e-5, img_size=518,
     )
     model = _create_vision_transformer(
         'vit_large_patch14_dinov2', pretrained=pretrained, **dict(model_args, **kwargs))
@@ -2024,7 +2024,7 @@ def vit_giant_patch14_dinov2(pretrained=False, **kwargs) -> VisionTransformer:

     # With SwiGLUPacked, we need to set hidden_features = 2 * 4096 = 8192
     model_args = dict(
-        patch_size=14, embed_dim=1536, depth=40, num_heads=24, init_values=1.0,
+        patch_size=14, embed_dim=1536, depth=40, num_heads=24, init_values=1e-5,
         mlp_ratio=2.66667 * 2, mlp_layer=SwiGLUPacked, img_size=518, act_layer=nn.SiLU
     )
     model = _create_vision_transformer(
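
Note: init_values seeds the per-channel LayerScale gamma that scales each
residual branch. The DINOv2 reference code initializes LayerScale at 1e-5,
so every block starts close to an identity mapping; 1.0 would leave the
branches undamped at init. Loaded pretrained weights overwrite gamma, so
the default mainly matters when training from scratch. Below is a minimal
sketch of the mechanism this parameter feeds, modeled on timm's LayerScale
module (a sketch for illustration, not a verbatim copy of timm):

    import torch
    import torch.nn as nn

    class LayerScale(nn.Module):
        """Learnable per-channel scaling of a residual branch."""
        def __init__(self, dim, init_values=1e-5, inplace=False):
            super().__init__()
            self.inplace = inplace
            # gamma starts at init_values; with 1e-5 the residual branch
            # contributes almost nothing at init, so training starts from
            # a near-identity block and LayerScale learns how much to add
            self.gamma = nn.Parameter(init_values * torch.ones(dim))

        def forward(self, x):
            return x.mul_(self.gamma) if self.inplace else x * self.gamma

A quick way to see the effect of this patch (assuming timm's ViT block
exposes the first LayerScale as ls1; attribute names may differ by version):

    import timm
    m = timm.create_model('vit_small_patch14_dinov2', pretrained=False)
    print(m.blocks[0].ls1.gamma[:3])  # expect values near 1e-5, not 1.0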