Bert fix1 #87

Draft
Wants to merge 80 commits into base branch: inference

Commits (80)
2436152
push changes to print dot graph
Apr 17, 2023
7030ae0
padded input to 512, added isExact to slice_tensor
Apr 21, 2023
280d29a
Add multiprecision support to replicate
lockshaw May 7, 2023
8e84401
Add explicit template instantiations for replicate kernels
lockshaw May 7, 2023
a2ddb91
Fix incorrect instantiations
lockshaw May 8, 2023
52cc8e8
Add nop init_task for replicate
lockshaw May 8, 2023
9eb530a
Fix replicate init_task registration
lockshaw May 8, 2023
d7a219c
Hopefully print hip errors
lockshaw May 10, 2023
f323a6e
Instantiate extra hip replicate kernels
lockshaw May 10, 2023
9a75f24
fix
jiazhihao May 10, 2023
fe61cbe
Merge branch 'BertMLM_fixes' of https://github.com/flexflow/FlexFlow …
jiazhihao May 10, 2023
c9277f3
debug changs
jiazhihao May 10, 2023
fe66561
Add slice_tensor fix
May 12, 2023
9e04302
Merge branch 'BertMLM_fixes' of github.com:flexflow/FlexFlow into Ber…
May 12, 2023
1177748
Add logging for metrics
lockshaw May 12, 2023
5b7cace
Add the cuda metrics hack to hip kernel as well
lockshaw May 12, 2023
e798a91
Add parallel dim pretty printing
lockshaw May 12, 2023
90541cf
[Embedding] bug fix
jiazhihao May 12, 2023
63fcde6
Merge branch 'BertMLM_fixes' of https://github.com/flexflow/FlexFlow …
jiazhihao May 12, 2023
7862143
Add replica dim to pretty print
lockshaw May 12, 2023
9663d96
Merge remote-tracking branch 'refs/remotes/origin/BertMLM_fixes' into…
lockshaw May 12, 2023
ef43c36
Fix replicate issue with python hack
tnoyola May 12, 2023
dd8090e
Use local json submodule
lockshaw May 20, 2023
0dc6187
ofi conduit-related fixes
May 20, 2023
0950ac7
Add mpi flags for hip
lockshaw May 22, 2023
4b06040
fix fusion bug
jiazhihao May 24, 2023
6796b1c
Merge branch 'BertMLM_fixes' of https://github.com/flexflow/FlexFlow …
jiazhihao May 24, 2023
99e9f95
increase the max number of regions in a ZeroInitMeta from 64 to 128
jiazhihao May 24, 2023
282c44a
support mixed precision
jiazhihao May 24, 2023
992dcb9
undo changes to Fused::Transpose
jiazhihao May 24, 2023
f528774
undo changes to config.linux
jiazhihao May 24, 2023
a68150d
try to fix layernorm
jiazhihao Jun 2, 2023
2bf9afc
fix typo
jiazhihao Jun 2, 2023
f6f7a32
Add possible layernorm fix
lockshaw Jun 3, 2023
5e03b0a
Fix additional layernorm bug due to get_piece_size return size in bytes
lockshaw Jun 3, 2023
53fb8bd
Bugfixes
tnoyola Jun 3, 2023
449a14c
Actually check elementwise_affine
lockshaw Jun 3, 2023
c737be6
Revert "Actually check elementwise_affine"
tnoyola Jun 5, 2023
a98e09d
Change optimizer to adam with correct hyperparams
lockshaw Jun 6, 2023
66b805e
Merge remote-tracking branch 'refs/remotes/origin/BertMLM_fixes' into…
tnoyola Jun 6, 2023
4bec811
fix training bert model.
xinhaoc Jul 4, 2023
2d28c15
revert changes
xinhaoc Jul 4, 2023
2025d56
fix bert training issue. (#832)
xinhaoc Jul 5, 2023
5f793c1
Improve machine_view hash
lockshaw Jul 25, 2023
2c09397
Fix bugs in improved hashing
lockshaw Jul 25, 2023
862e9d7
fix weight dimension in layernorm
xinhaoc Jul 25, 2023
d29bf1d
Merge branch 'BertMLM_fixes' of https://github.com/flexflow/FlexFlow …
xinhaoc Jul 25, 2023
88ad5fa
Merge remote-tracking branch 'origin/master' into BertMLM_fixes
lockshaw Aug 8, 2023
2eee875
fix `preregister_task_variant` issue, linting
goliaro Aug 11, 2023
b9d1332
try to run graph_optimize on each node
jiazhihao Aug 14, 2023
b5b0815
remove unnecessary file
jiazhihao Aug 14, 2023
94e35d9
fix hip build
xinhaoc Aug 15, 2023
ac185e3
Merge branch 'BertMLM_fixes' of https://github.com/flexflow/FlexFlow …
xinhaoc Aug 15, 2023
ded175c
bypass simulator creation when only_data_parallel is specified
jiazhihao Aug 18, 2023
1f7e8b7
add nccl prints
jiazhihao Aug 18, 2023
3fb70f6
.
jiazhihao Aug 21, 2023
d652b62
rccl
xinhaoc Aug 29, 2023
b39528b
fix fuse
xinhaoc Oct 6, 2023
0cf3c8e
fix hip
xinhaoc Oct 6, 2023
17a1c4e
more fix to hip
xinhaoc Oct 6, 2023
f65044d
customized kernel for broadcasting add.
xinhaoc Nov 3, 2023
bcab56a
dropout
xinhaoc Dec 20, 2023
fa1fffc
optimizer
xinhaoc Dec 20, 2023
40d830c
opt
xinhaoc Dec 21, 2023
e825526
fix
xinhaoc Dec 21, 2023
d2bdb15
fix
xinhaoc Jan 12, 2024
3b9e1c6
.
xinhaoc Jan 12, 2024
9f8bb9e
fix
xinhaoc Jan 12, 2024
fb91122
remove print
xinhaoc Jan 12, 2024
c162d4c
fix hip
xinhaoc Jan 19, 2024
ea79317
fix multinodes
xinhaoc Jan 19, 2024
58d84ed
fix
xinhaoc Jan 19, 2024
01c9d4c
fix
xinhaoc Jan 25, 2024
a31f8e9
fix
xinhaoc Jan 26, 2024
9141c46
tp
xinhaoc Feb 2, 2024
8185289
timer
xinhaoc Feb 15, 2024
d958805
rmv
xinhaoc Feb 16, 2024
38dfd87
fix tp
xinhaoc Feb 22, 2024
355d4b4
try a fix
xinhaoc Feb 29, 2024
8488ba0
fix hip
xinhaoc Sep 13, 2024
17 changes: 10 additions & 7 deletions CMakeLists.txt
@@ -158,10 +158,6 @@ endif()
# option for nccl
option(FF_USE_NCCL "Run FlexFlow with NCCL" OFF)

if (FF_GPU_BACKEND STREQUAL "hip_rocm" AND FF_USE_NCCL STREQUAL "ON")
message(FATAL_ERROR "NCCL: ON for FF_GPU_BACKEND: hip_rocm. hip_rocm backend must have NCCL disabled.")
endif()

# option for avx2
option(FF_USE_AVX2 "Run FlexFlow with AVX2" OFF)

@@ -224,7 +220,9 @@ endif()

# NCCL
if(FF_USE_NCCL)
include(nccl)
if(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "cuda")
include(nccl)
endif()
list(APPEND FF_CC_FLAGS
-DFF_USE_NCCL)
list(APPEND FF_NVCC_FLAGS
@@ -369,11 +367,13 @@ elseif(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "hip_rocm")
elseif(FF_GPU_BACKEND STREQUAL "hip_rocm")
find_package(hipblas REQUIRED)
find_package(miopen REQUIRED)
if(FF_USE_NCCL)
find_package(rccl REQUIRED)
endif()
# find_package(rocrand REQUIRED)
find_library(HIP_RAND_LIBRARY hiprand REQUIRED)

add_compile_definitions(FF_USE_HIP_ROCM)

# The hip cmake config module defines three targets,
# hip::amdhip64, hip::host, and hip::device.
#
@@ -387,12 +387,15 @@ elseif(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "hip_rocm")
# Docs (outdated):
# https://rocmdocs.amd.com/en/latest/Installation_Guide/Using-CMake-with-AMD-ROCm.html
target_link_libraries(flexflow hip::device roc::hipblas MIOpen ${HIP_RAND_LIBRARY})
if(FF_USE_NCCL)
target_link_libraries(flexflow rccl)
endif()
endif()
else()
message(FATAL_ERROR "Unsupported FF_GPU_BACKEND for cmake: ${FF_GPU_BACKEND}")
endif()

if(FF_USE_NCCL)
if(FF_USE_NCCL AND (FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "cuda"))
add_dependencies(flexflow ${NCCL_NAME})
endif()

5 changes: 1 addition & 4 deletions cmake/json.cmake
@@ -1,4 +1 @@
include(FetchContent)

FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.10.5/json.tar.xz)
FetchContent_MakeAvailable(json)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/deps/json)
4 changes: 3 additions & 1 deletion config/config.inc
@@ -84,6 +84,8 @@ if [ "$FF_LEGION_NETWORKS" = "gasnet" ]; then
elif [ "$FF_GASNET_CONDUIT" = "ucx" ]; then
SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=ucx"
SET_LEGION_NETWORKS+=" -DFF_UCX_URL=$FF_UCX_URL"
elif [ "$FF_GASNET_CONDUIT" = "ofi" ]; then
SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=ofi"
fi
elif [ "$FF_LEGION_NETWORKS" = "ucx" ]; then
SET_LEGION_NETWORKS+=" -DFF_LEGION_NETWORKS=ucx"
@@ -182,7 +184,7 @@ if [ -n "$FF_GPU_BACKEND" ]; then
chmod +x "$(pwd)/nvidia_hipcc"
SET_CXX="-DCMAKE_CXX_COMPILER=$(pwd)/nvidia_hipcc -DCMAKE_CXX_LINKER=$(pwd)/nvidia_hipcc"
else
SET_CXX="-DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_CXX_LINKER=/opt/rocm/bin/hipcc"
SET_CXX="-DCMAKE_CXX_COMPILER=$ROCM_PATH/bin/hipcc -DCMAKE_CXX_LINKER=$ROCM_PATH/bin/hipcc -DHIP_PATH=$ROCM_PATH/hip -DCMAKE_CXX_FLAGS='-I${MPICH_DIR}/include' -DCMAKE_EXE_LINKER_FLAGS='-L${MPICH_DIR}/lib -lmpi' -DCMAKE_SHARED_LINKER_FLAGS='-L${MPICH_DIR}/lib -lmpi'"
fi
fi
fi
6 changes: 2 additions & 4 deletions config/config.linux
@@ -38,7 +38,7 @@ FF_USE_PYTHON=${FF_USE_PYTHON:-ON}
FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS:-}

# select GASNET conduit
FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ibv}
FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ofi}

# set UCX URL
FF_UCX_URL=${FF_UCX_URL:-""}
@@ -70,11 +70,9 @@ FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda}
if [[ "${FF_GPU_BACKEND}" != @(cuda|hip_cuda|hip_rocm|intel) ]]; then
echo "Error, value of FF_GPU_BACKEND (${FF_GPU_BACKEND}) is invalid."
exit 1
elif [[ "$FF_GPU_BACKEND" == "cuda" || "$FF_GPU_BACKEND" = "hip_cuda" ]]; then
elif [[ "$FF_GPU_BACKEND" == "cuda" || "$FF_GPU_BACKEND" = "hip_cuda" || "$FF_GPU_BACKEND" == "hip_rocm" ]]; then
# enable NCCL
FF_USE_NCCL=${FF_USE_NCCL:-ON}
else
FF_USE_NCCL=OFF
fi

function get_build_configs() {
118 changes: 74 additions & 44 deletions examples/python/pytorch/mt5/mt5_ff.py
@@ -3,16 +3,17 @@
import sys

import numpy as np
import torch
from flexflow.core import *
from flexflow.torch.model import PyTorchModel
from transformers import MT5ForConditionalGeneration, T5Tokenizer

#from transformers import MT5ForConditionalGeneration, T5Tokenizer
from transformers import BertForMaskedLM, BertTokenizer
sys.path.append("./examples/python/pytorch/mt5")
from mt5_torch import DataPreparer, get_dataloaders, set_seed

BASE_DIR = "examples/python/pytorch/mt5"
DATA_DIR = os.path.join(BASE_DIR, "data")
NUMPY_DIR = os.path.join(DATA_DIR, "numpy")
NUMPY_DIR = os.path.join(DATA_DIR, "numpy_candle")


def data_to_numpy() -> None:
@@ -28,15 +29,17 @@ def data_to_numpy() -> None:
"""
model_params = {
"SEED": 42,
"MODEL": "google/mt5-small",
#"MODEL": "google/mt5-small",
"MODEL": "bert-base-uncased",
"TRAIN_BATCH_SIZE": None, # use the full dataset as one batch
"EVAL_BATCH_SIZE": None, # use the full dataset as one batch
"TRAIN_EPOCHS": 1, # unused
"MAX_SOURCE_TEXT_LENGTH": 48,
"MAX_TARGET_TEXT_LENGTH": 48,
}
set_seed(model_params)
tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])
#tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])
tokenizer = BertTokenizer.from_pretrained(model_params["MODEL"])
print("Getting dataloaders...")
train_loader, eval_loader = get_dataloaders(tokenizer, model_params)
assert len(train_loader) == 1
@@ -61,8 +64,8 @@ def preprocess_train() -> None:
y_shape = y.shape
assert len(y.shape) == 2, \
"`y` should have shape (num examples, sequence length)"
y_ids = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.long)
lm_labels = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.long)
y_ids = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.int32)
lm_labels = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.int32)
y_ids[:, :] = y[:, :-1]
lm_labels[:, :] = y[:, 1:]

@@ -81,36 +84,54 @@ def preprocess_train() -> None:
def top_level_task():
ffconfig = FFConfig()
ffmodel = FFModel(ffconfig)
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")

#model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")
#model = BertModel.from_pretrained("bert-base-uncased")
# Load train data as numpy arrays
print("Loading data...")
ids = np.load(os.path.join(NUMPY_DIR, "train_source_ids.npy"))
mask = np.load(os.path.join(NUMPY_DIR, "train_source_mask.npy"))
y_ids = np.load(os.path.join(NUMPY_DIR, "train_y_ids.npy"))
lm_labels = np.load(os.path.join(NUMPY_DIR, "train_lm_labels.npy"))
ids = np.load(os.path.join(NUMPY_DIR, "train_input_ids.npy")).astype('int32')
ids = np.pad(ids, ((0,0), (0,17)), 'constant')
#ids = np.random.randint(0, 5, (1000, 512))
#print('ids_shape', ids.shape)
#print('ids', ids)
mask = np.load(os.path.join(NUMPY_DIR, "train_attention_mask.npy")).astype('int32')
mask = np.pad(mask, ((0,0), (0,17)), 'constant')
#mask = np.random.randint(0, 2, (1000, 512))
#y_ids = np.load(os.path.join(NUMPY_DIR, "train_y_ids.npy"))
lm_labels = np.load(os.path.join(NUMPY_DIR, "train_labels.npy")).astype('int32')
lm_labels = np.pad(lm_labels, ((0,0), (0,17)), 'constant')
#lm_labels = np.random.randint(-1, 5, (1000, 512))
position_id = torch.arange(ids.shape[1], dtype=torch.int32).expand((1, -1)).numpy()
token_type_ids = torch.zeros(ids.shape[1], dtype=torch.int32).expand((1, -1)).numpy()


batch_size = ffconfig.batch_size
input_ids_shape = (batch_size, ids.shape[1])
attention_mask_shape = (batch_size, mask.shape[1])
decoder_input_ids_shape = (batch_size, y_ids.shape[1])
#decoder_input_ids_shape = (batch_size, y_ids.shape[1])
input_tensors = [
ffmodel.create_tensor(input_ids_shape, DataType.DT_INT64), # input_ids
ffmodel.create_tensor(attention_mask_shape, DataType.DT_INT64), # attention_mask
ffmodel.create_tensor(decoder_input_ids_shape, DataType.DT_INT64), # decoder_input_ids
ffmodel.create_tensor(input_ids_shape, DataType.DT_INT32), # input_ids
ffmodel.create_tensor(attention_mask_shape, DataType.DT_INT32), # attention_mask
#ffmodel.create_tensor(decoder_input_ids_shape, DataType.DT_INT64), # decoder_input_ids
]
encoder_seq_length = ids.shape[1]
decoder_seq_length = y_ids.shape[1]
seq_length = (encoder_seq_length, decoder_seq_length)
input_names = ["input_ids", "attention_mask", "decoder_input_ids"]
#decoder_seq_length = y_ids.shape[1]
#seq_length = (encoder_seq_length, decoder_seq_length)
seq_length = encoder_seq_length
#input_names = ["input_ids", "attention_mask", "decoder_input_ids"]
input_names = ["input_ids", "attention_mask"]

print("Tracing the model...")
print(batch_size)
hf_model = PyTorchModel(
model, is_hf_model=True, input_names=input_names,
batch_size=batch_size, seq_length=seq_length,
)
output_tensors = hf_model.torch_to_ff(ffmodel, input_tensors, verbose=True)
ffoptimizer = SGDOptimizer(ffmodel, lr=0.01)
#from flexflow.torch.model import file_to_ff
#file_to_ff("mt5.ff", ffmodel, input_tensors)
ffoptimizer = AdamOptimizer(ffmodel, alpha=1e-4, beta1=0.9, beta2=0.98, weight_decay=0.0, epsilon=2e-8)
# ffoptimizer = SGDOptimizer(ffmodel, lr=0.01)

print("Compiling the model...")
ffmodel.compile(
@@ -121,13 +142,21 @@ def top_level_task():
MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY,
],
)

# load weights here
ffmodel.load_bert_pretrained(checkpoint=model)

print("Creating data loaders...")
print('id_dtype', ids.dtype)
print('mask_dtype', mask.dtype)
print('labels_dtype', lm_labels.dtype)
input_ids_dl = ffmodel.create_data_loader(input_tensors[0], ids)
attention_mask_dl = ffmodel.create_data_loader(input_tensors[1], mask)
decoder_input_ids_dl = ffmodel.create_data_loader(input_tensors[2], y_ids)
#decoder_input_ids_dl = ffmodel.create_data_loader(input_tensors[2], y_ids)
# NOTE: We cast down the label tensor data to 32-bit to accommodate the
# label tensor's required dtype
token_type_ids_dl = ffmodel.create_data_loader(input_tensors[2], token_type_ids)
position_id_dl = ffmodel.create_data_loader(input_tensors[3], position_id)
labels_dl = ffmodel.create_data_loader(
ffmodel.label_tensor, lm_labels.astype("int32")
)
@@ -138,31 +167,32 @@ def top_level_task():
print("Training...")
epochs = ffconfig.epochs
ffmodel.fit(
x=[input_ids_dl, attention_mask_dl, decoder_input_ids_dl],
#x=[input_ids_dl, attention_mask_dl, decoder_input_ids_dl],
x=[input_ids_dl, attention_mask_dl, position_id_dl, token_type_ids_dl],
y=labels_dl, batch_size=batch_size, epochs=epochs,
)


if __name__ == "__main__":
# Generate the .tsv files if needed
if not os.path.exists(os.path.join(DATA_DIR, "train.tsv")) or \
not os.path.exists(os.path.join(DATA_DIR, "eval.tsv")):
DataPreparer.data_to_tsv()
# Convert the .tsv files to .npy if needed
if not os.path.exists(NUMPY_DIR):
os.mkdir(NUMPY_DIR)
prefixes = ["train_", "eval_"]
suffixes = ["source_ids.npy", "source_mask.npy", "target_ids.npy"]
npy_filenames = [
pre + suf for pre, suf in itertools.product(prefixes, suffixes)
]
if any(
not os.path.exists(os.path.join(NUMPY_DIR, filename))
for filename in npy_filenames
):
data_to_numpy()
# Preprocess the training data if needed
if not os.path.exists(os.path.join(NUMPY_DIR, "train_y_ids.npy")) or \
not os.path.exists(os.path.join(NUMPY_DIR, "train_lm_labels.npy")):
preprocess_train()
## Generate the .tsv files if needed
#if not os.path.exists(os.path.join(DATA_DIR, "train.tsv")) or \
# not os.path.exists(os.path.join(DATA_DIR, "eval.tsv")):
# DataPreparer.data_to_tsv()
## Convert the .tsv files to .npy if needed
#if not os.path.exists(NUMPY_DIR):
# os.mkdir(NUMPY_DIR)
#prefixes = ["train_", "eval_"]
#suffixes = ["source_ids.npy", "source_mask.npy", "target_ids.npy"]
#npy_filenames = [
# pre + suf for pre, suf in itertools.product(prefixes, suffixes)
#]
#if any(
# not os.path.exists(os.path.join(NUMPY_DIR, filename))
# for filename in npy_filenames
#):
# data_to_numpy()
## Preprocess the training data if needed
#if not os.path.exists(os.path.join(NUMPY_DIR, "train_y_ids.npy")) or \
# not os.path.exists(os.path.join(NUMPY_DIR, "train_lm_labels.npy")):
# preprocess_train()
top_level_task()
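
For reference, a minimal standalone sketch of the input-preparation pattern used in top_level_task() above. The file names and the pre-pad sequence length are assumptions for illustration; the padding simply brings the BERT sequences up to 512 tokens, matching the np.pad calls and int32 casts in the diff.

import os
import numpy as np
import torch

def prepare_bert_inputs(numpy_dir, pad_to=512):
    # Hypothetical .npy files of shape (num_examples, seq_len); cast to int32
    # to match the DT_INT32 FlexFlow input tensors created above.
    ids = np.load(os.path.join(numpy_dir, "train_input_ids.npy")).astype("int32")
    mask = np.load(os.path.join(numpy_dir, "train_attention_mask.npy")).astype("int32")
    pad = pad_to - ids.shape[1]
    ids = np.pad(ids, ((0, 0), (0, pad)), "constant")
    mask = np.pad(mask, ((0, 0), (0, pad)), "constant")
    # Position ids run 0..seq_len-1; token type ids are all zeros (single segment).
    position_id = torch.arange(ids.shape[1], dtype=torch.int32).expand((1, -1)).numpy()
    token_type_ids = torch.zeros(ids.shape[1], dtype=torch.int32).expand((1, -1)).numpy()
    return ids, mask, position_id, token_type_ids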
4 changes: 2 additions & 2 deletions examples/python/pytorch/mt5/mt5_torch.py
@@ -7,7 +7,7 @@
import os

import numpy as np
import pandas as pd
#import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import MT5ForConditionalGeneration, T5Tokenizer
@@ -311,5 +311,5 @@ def TorchMT5Trainer(
"MAX_TARGET_TEXT_LENGTH": 48,
"LEARNING_RATE": 1e-4,
}
device = torch.device(0)
device = torch.device('cpu')
TorchMT5Trainer(model_params, device)
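
The hard-coded torch.device('cpu') above avoids requiring a GPU for the reference PyTorch run. A more flexible variant (an assumption on our part, not part of this PR) would fall back to CPU only when CUDA is unavailable:

import torch
# Prefer GPU 0 when available, otherwise fall back to CPU (hypothetical variant):
device = torch.device(0) if torch.cuda.is_available() else torch.device("cpu")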
31 changes: 29 additions & 2 deletions gdb/pretty_print.py
@@ -61,7 +61,11 @@ def to_string(self):
size = dim['size']
degree = dim['degree']
parallel_idx = dim['parallel_idx']
toks.append(f'{i}=[s={size} d={degree} pi={parallel_idx}]')
if dim['is_replica_dim']:
is_replica = 'r=t'
else:
is_replica = 'r=f'
toks.append(f'{i}=[s={size} d={degree} pi={parallel_idx} {is_replica}]')
return f'TensorShape<{" ".join(toks)}>'

class ParallelTensorBasePrinter:
@@ -77,9 +81,31 @@ def to_string(self):
size = dim['size']
degree = dim['degree']
parallel_idx = dim['parallel_idx']
toks.append(f'{i}=[s={size} d={degree} pi={parallel_idx}]')
tok = f'{i}=[s={size} d={degree} pi={parallel_idx} '
if dim['is_replica_dim']:
tok += 'r=t'
else:
tok += 'r=f'
tok += ']'
toks.append(tok)
return f'ParallelTensorBase<{" ".join(toks)}>'

class ParallelDimPrinter:
def __init__(self, val):
self.val = val

def to_string(self):
size = self.val['size']
degree = self.val['degree']
parallel_idx = self.val['parallel_idx']
tok = f's={size} d={degree} pi={parallel_idx} '
if self.val['is_replica_dim']:
tok += 'r=t'
else:
tok += 'r=f'
return f'ParallelDim<{tok}>'


def build_pretty_printer():
pp = gdb.printing.RegexpCollectionPrettyPrinter(
"flexflow")
@@ -89,6 +115,7 @@ def build_pretty_printer():
pp.add_printer('Domain', '^Legion::Domain$', DomainPrinter)
pp.add_printer('ParallelTensorShape', '^FlexFlow::ParallelTensorShape$', TensorShapePrinter)
pp.add_printer('ParallelTensorBase', '^FlexFlow::ParallelTensorBase$', ParallelTensorBasePrinter)
pp.add_printer('ParallelDim', '^FlexFlow::ParallelDim$', ParallelDimPrinter)
return pp

gdb.printing.register_pretty_printer(
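
As a rough illustration (not part of the diff), this is the string format the new ParallelDim printer produces; a plain dict stands in for the gdb.Value so the snippet can run outside GDB:

def format_parallel_dim(dim):
    # Mirrors ParallelDimPrinter.to_string(): size, degree, parallel index,
    # and whether the dimension is a replica dimension.
    is_replica = 'r=t' if dim['is_replica_dim'] else 'r=f'
    return f"ParallelDim<s={dim['size']} d={dim['degree']} pi={dim['parallel_idx']} {is_replica}>"

print(format_parallel_dim(
    {'size': 1024, 'degree': 4, 'parallel_idx': 0, 'is_replica_dim': False}))
# ParallelDim<s=1024 d=4 pi=0 r=f>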
7 changes: 6 additions & 1 deletion include/flexflow/config.h
@@ -28,8 +28,10 @@
#error "Unknown device"
#endif
#include "tl/optional.hpp"
#ifdef FF_USE_NCCL
#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
#include <nccl.h>
#else
#include <rccl.h>
#endif

namespace FlexFlow {
@@ -122,6 +124,7 @@ class FFConfig {
size_t workSpaceSize;
Legion::Context lg_ctx;
Legion::Runtime *lg_hlr;
Legion::IndexSpaceT<1> all_gpu_task_is;
Legion::FieldSpace field_space;
bool syntheticInput, profiling, perform_fusion;
size_t simulator_work_space_size;
@@ -135,6 +138,8 @@ class FFConfig {
bool enable_parameter_parallel;
bool enable_attribute_parallel;
bool enable_inplace_optimizations;
int data_parallelism_degree;
int tensor_parallelism_degree;
// Control Tensor Op Math Conversion
bool allow_tensor_op_math_conversion;
std::string dataset_path;