Skip to content

Commit

Permalink
Merge pull request #203 from helicalAI/evo-2-updates
Browse files Browse the repository at this point in the history
Evo 2 more comprehensive notebook and 40B configs
  • Loading branch information
maxiallard authored Feb 26, 2025
2 parents d888020 + ffcc330 commit 6fb608d
Show file tree
Hide file tree
Showing 3 changed files with 1,478 additions and 377 deletions.
1,723 changes: 1,362 additions & 361 deletions examples/notebooks/Evo-2.ipynb

Large diffs are not rendered by default.

16 changes: 2 additions & 14 deletions helical/models/evo_2/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,13 @@

**Model Name:** Evo 2

**Model Versions:** 1B, 7B and 40B (Currently only the 1B and 7B models are available and the 40B coming soon)
**Model Versions:** 1B, 7B and 40B

**Model Description:** Evo 2 is a next-generation genomic model that integrates DNA, RNA, and protein data across all domains of life. It leverages the StripedHyena 2 architecture, combining convolutional, linear attention, and state-space models to efficiently process long sequences and capture complex biological patterns. Evo 2 is trained on a vast dataset encompassing trillions of nucleotides from eukaryotic and prokaryotic genomes, enabling broad cross-species applications and insights into human diseases, agriculture, and environmental science.

## Model Developers

**Arc Institute**

**Stanford University**

**NVIDIA**

**Liquid AI**

**University of California, Berkeley**

**Goodfire**

**Columbia University**
Arc Institute; Stanford University; NVIDIA; Liquid AI; University of California, Berkeley; Goodfire; Columbia University

**Contact Information:**

Expand Down
116 changes: 114 additions & 2 deletions helical/models/evo_2/evo_2_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,14 +94,12 @@ def __init__(
"hcm_layer_idxs": [1, 5, 8, 12, 15, 19, 22, 26, 29],
"hcs_layer_idxs": [0, 4, 7, 11, 14, 18, 21, 25, 28],
"attn_layer_idxs": [3, 10, 17, 24, 31],

"hcm_filter_length": 128,
"hcl_filter_groups": 4096,
"hcm_filter_groups": 256,
"hcs_filter_groups": 256,
"hcs_filter_length": 7,
"num_layers": 32,

"short_filter_length": 3,
"num_attention_heads": 32,
"short_filter_bias": False,
Expand Down Expand Up @@ -205,6 +203,120 @@ def __init__(
"mlp_activation": "gelu",
"print_activations": False,
},
"evo2-40b-base": {
"model_name": "evo2_40b_base",
"model_hf_name": "arcinstitute/evo2_40b_base",
"default_embedding_layer": "blocks.49.mlp.l3",
"vocab_size": 512,
"hidden_size": 8192,
"num_filters": 8192,
"hcl_layer_idxs": [2, 6, 9, 13, 16, 20, 23, 27, 30, 34, 38, 41, 45, 48],
"hcm_layer_idxs": [1, 5, 8, 12, 15, 19, 22, 26, 29, 33, 37, 40, 44, 47],
"hcs_layer_idxs": [0, 4, 7, 11, 14, 18, 21, 25, 28, 32, 36, 39, 43, 46],
"attn_layer_idxs": [3, 10, 17, 24, 31, 35, 42, 49],
"hcm_filter_length": 128,
"hcl_filter_groups": 8192,
"hcm_filter_groups": 512,
"hcs_filter_groups": 512,
"hcs_filter_length": 7,
"num_layers": 50,
"short_filter_length": 3,
"num_attention_heads": 64,
"short_filter_bias": False,
"mlp_init_method": "torch.nn.init.zeros_",
"mlp_output_init_method": "torch.nn.init.zeros_",
"eps": 0.000001,
"state_size": 16,
"rotary_emb_base": 1000000,
"make_vocab_size_divisible_by": 8,
"inner_size_multiple_of": 128,
"inner_mlp_size": 21888,
"log_intermediate_values": False,
"proj_groups": 1,
"hyena_filter_groups": 1,
"column_split_hyena": False,
"column_split": True,
"interleave": True,
"evo2_style_activations": True,
"use_fp8_input_projections": True,
"model_parallel_size": 1,
"pipe_parallel_size": 1,
"tie_embeddings": True,
"mha_out_proj_bias": True,
"hyena_out_proj_bias": True,
"hyena_flip_x1x2": False,
"qkv_proj_bias": False,
"max_seqlen": 8192,
"max_batch_size": 1,
"final_norm": True,
"use_flash_attn": True,
"use_flash_rmsnorm": False,
"use_flash_depthwise": False,
"use_flashfft": False,
"use_laughing_hyena": False,
"inference_mode": True,
"prefill_style": "fft",
"mlp_activation": "gelu",
"print_activations": False,
},
"evo2-40b": {
"model_name": "evo2_40b",
"model_hf_name": "arcinstitute/evo2_40b",
"default_embedding_layer": "blocks.49.mlp.l3",
"vocab_size": 512,
"hidden_size": 8192,
"num_filters": 8192,
"hcl_layer_idxs": [2, 6, 9, 13, 16, 20, 23, 27, 30, 34, 38, 41, 45, 48],
"hcm_layer_idxs": [1, 5, 8, 12, 15, 19, 22, 26, 29, 33, 37, 40, 44, 47],
"hcs_layer_idxs": [0, 4, 7, 11, 14, 18, 21, 25, 28, 32, 36, 39, 43, 46],
"attn_layer_idxs": [3, 10, 17, 24, 31, 35, 42, 49],
"hcm_filter_length": 128,
"hcl_filter_groups": 8192,
"hcm_filter_groups": 512,
"hcs_filter_groups": 512,
"hcs_filter_length": 7,
"num_layers": 50,
"short_filter_length": 3,
"num_attention_heads": 64,
"short_filter_bias": False,
"mlp_init_method": "torch.nn.init.zeros_",
"mlp_output_init_method": "torch.nn.init.zeros_",
"eps": 0.000001,
"state_size": 16,
"rotary_emb_base": 100000000000,
"rotary_emb_scaling_factor": 128,
"use_interpolated_rotary_pos_emb": True,
"make_vocab_size_divisible_by": 8,
                "inner_size_multiple_of": 128,  # force GLU inner_size to be a multiple of this value
"inner_mlp_size": 22528,
"log_intermediate_values": False,
"proj_groups": 1,
"hyena_filter_groups": 1,
"column_split_hyena": False,
"column_split": True,
"interleave": True,
"evo2_style_activations": True,
"use_fp8_input_projections": True,
"model_parallel_size": 1,
"pipe_parallel_size": 1,
"tie_embeddings": True,
"mha_out_proj_bias": True,
"hyena_out_proj_bias": True,
"hyena_flip_x1x2": False,
"qkv_proj_bias": False,
"max_seqlen": 1048576,
"max_batch_size": 1,
"final_norm": True,
"use_flash_attn": True,
"use_flash_rmsnorm": False,
"use_flash_depthwise": False,
"use_flashfft": False,
"use_laughing_hyena": False,
"inference_mode": True,
"prefill_style": "fft",
"mlp_activation": "gelu",
"print_activations": False,
},
}
if model_name not in self.model_map:
raise ValueError(
Expand Down

0 comments on commit 6fb608d

Please sign in to comment.