Dylan8527 · Dylan8527 · Jun 12, 2023
diff --git a/applications/DeepSpeed-Chat/infer.sh b/applications/DeepSpeed-Chat/infer.sh
@@ -0,0 +1,2 @@
+# Start the chatbot on the step-1 fine-tuned bloomz-560m checkpoint.
+# Both paths are relative, so run this from applications/DeepSpeed-Chat.
+MODEL_PATH=training/step1_supervised_finetuning/output/bloomz-560m.phoenix_v1_test4
+python inference/chatbot.py --path "$MODEL_PATH"
diff --git a/applications/DeepSpeed-Chat/inference/chatbot.py b/applications/DeepSpeed-Chat/inference/chatbot.py
@@ -10,7 +10,7 @@
 import os
 import json
 from transformers import pipeline, set_seed
-from transformers import AutoConfig, OPTForCausalLM, AutoTokenizer
+from transformers import AutoConfig, OPTForCausalLM, AutoTokenizer, BloomForCausalLM
 
 
 def parse_args():
@@ -43,9 +43,17 @@ def get_generator(path):
     tokenizer.pad_token = tokenizer.eos_token
 
     model_config = AutoConfig.from_pretrained(path)
-    model = OPTForCausalLM.from_pretrained(path,
-                                           from_tf=bool(".ckpt" in path),
-                                           config=model_config).half()
+    if 'bloom' in path:
+        model = BloomForCausalLM.from_pretrained(path,
+                                            from_tf=bool(".ckpt" in path),
+                                            config=model_config).half()
+    else:
+        model = OPTForCausalLM.from_pretrained(path,
+                                            from_tf=bool(".ckpt" in path),
+                                            config=model_config).half()
+    # model = OPTForCausalLM.from_pretrained(path,
+    #                                        from_tf=bool(".ckpt" in path),
+    #                                        config=model_config).half()
 
     model.config.end_token_id = tokenizer.eos_token_id
     model.config.pad_token_id = model.config.eos_token_id

diff --git a/...ions/DeepSpeed-Chat/training/step1_supervised_finetuning/evaluation_scripts/run_prompt.sh b/...ions/DeepSpeed-Chat/training/step1_supervised_finetuning/evaluation_scripts/run_prompt.sh
@@ -7,5 +7,5 @@
 # You can provide two models to compare the performance of the baseline and the finetuned model
 export CUDA_VISIBLE_DEVICES=0
 python prompt_eval.py \
-    --model_name_or_path_baseline XXX \
-    --model_name_or_path_finetune XXX
+    --model_name_or_path_baseline bigscience/bloomz-560m \
+    --model_name_or_path_finetune ./output/bloomz-560m.phoenix_v1_test4/
diff --git a/...epSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_gpu/run_560m.sh b/...epSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_gpu/run_560m.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# Step-1 supervised fine-tuning of bigscience/bloomz-560m on the
+# custom/phoenix_v1 dataset with a single GPU.
+# Usage: run_560m.sh [OUTPUT_DIR] [ZERO_STAGE]
+# main.py is referenced relatively: run from the directory that contains it.
+# Note that usually LoRA needs to use larger learning rate
+# Defaults apply when an argument is missing or empty.
+OUTPUT=${1:-./output/bloomz-560m.phoenix_v1_test5}
+ZERO_STAGE=${2:-2}
+mkdir -p "$OUTPUT"
+
+# Expansions are quoted so an output directory containing spaces still works.
+deepspeed --num_gpus 1 main.py \
+   --data_path custom/phoenix_v1 \
+   --model_name_or_path bigscience/bloomz-560m \
+   --data_split 2,4,4 --per_device_train_batch_size 2 --per_device_eval_batch_size 2  --lr_scheduler_type cosine --num_warmup_steps 0 \
+   --gradient_accumulation_steps 1 --zero_stage "$ZERO_STAGE" --local_rank 0 \
+   --max_seq_len 512 --learning_rate 9.65e-6 --weight_decay 0. --num_train_epochs 1 \
+   --deepspeed --output_dir "$OUTPUT" 2>&1 | tee "$OUTPUT/training.log"
+
diff --git a/applications/DeepSpeed-Chat/training/utils/data/data_utils.py b/applications/DeepSpeed-Chat/training/utils/data/data_utils.py
@@ -64,6 +64,9 @@ def get_raw_dataset(dataset_name, output_path, seed, local_rank):
     elif "lmqg/qag_jaquad" in dataset_name:
         return raw_datasets.LmqgQagjaquadDataset(output_path, seed, local_rank,
                                                  dataset_name)
+    elif "custom/phoenix_v1" in dataset_name:
+        return raw_datasets.CustomPhoenixv1Dataset(output_path, seed, local_rank,
+                                                 dataset_name)
     else:
         raise RuntimeError(
             f"We do not have configs for dataset {dataset_name}, but you can add it by yourself in raw_datasets.py."

diff --git a/applications/DeepSpeed-Chat/training/utils/data/raw_datasets.py b/applications/DeepSpeed-Chat/training/utils/data/raw_datasets.py
@@ -6,6 +6,7 @@
 from torch.utils.data import Subset
 import re
 
+# Names of datasets that are read from local files by their own dataset
+# class; the base dataset __init__ skips load_dataset() for these names.
+CustomDatasets = ['custom/phoenix_v1']
 
 # The template prompt dataset class that all new dataset porting needs to
 # follow in order to have a unified API and unified data format.
@@ -15,7 +16,8 @@ def __init__(self, output_path, seed, local_rank, dataset_name):
         self.output_path = output_path
         self.seed = seed
         self.local_rank = local_rank
-        self.raw_datasets = load_dataset(dataset_name)
+        if dataset_name not in CustomDatasets:
+            self.raw_datasets = load_dataset(dataset_name)
 
     def get_train_data(self):
         return
@@ -710,3 +712,82 @@ def get_prompt_and_rejected(self, sample):
             f"Warning: dataset {self.dataset_name} does not include rejected response."
         )
         return None
+
+
+
+# LLMZoo dataset
+class CustomPhoenixv1Dataset(PromptRawDataset):
+    """Phoenix SFT v1 conversations, read from a local JSON file.
+
+    Only samples made of exactly two turns (human, then gpt) are usable.
+    Every other sample is kept with ``from_human``/``from_gpt`` set to
+    ``None``, and the prompt/response accessors return ``None`` for it.
+    The dataset has no rejected responses.
+    """
+
+    # Location used when neither the constructor arguments nor the
+    # PHOENIX_V1_DATA_DIR environment variable provide one.
+    DEFAULT_DATA_DIR = '/home/vrlab/AI for cryo/phoenix-sft-data-v1/'
+    DEFAULT_DATA_FILE = 'data.json'
+
+    def __init__(self, output_path, seed, local_rank, dataset_name,
+                 data_dir=None, data_file=None):
+        """Load the local JSON file and normalise every sample.
+
+        Args:
+            output_path: Forwarded to the base class and later used when
+                computing the train/eval split indices.
+            seed: Seed forwarded to the split-index helper.
+            local_rank: Rank forwarded to the split-index helper.
+            dataset_name: Name requested by the caller. It is accepted for
+                interface compatibility; the canonical name is used instead.
+            data_dir: Optional directory holding the data file. Falls back
+                to ``$PHOENIX_V1_DATA_DIR`` and then ``DEFAULT_DATA_DIR``.
+            data_file: Optional file name inside ``data_dir``. Falls back
+                to ``DEFAULT_DATA_FILE``.
+        """
+        import os
+
+        # Pass the canonical name so the base class recognises it in
+        # CustomDatasets and does not try to fetch it with load_dataset(),
+        # even when the caller's name only contains "custom/phoenix_v1".
+        super().__init__(output_path, seed, local_rank, "custom/phoenix_v1")
+        self.dataset_name = "custom/phoenix_v1"
+        self.dataset_name_clean = "custom_phoenix_v1"
+
+        data_dir = (data_dir or os.environ.get("PHOENIX_V1_DATA_DIR")
+                    or self.DEFAULT_DATA_DIR)
+        data_file = data_file or self.DEFAULT_DATA_FILE
+
+        raw_data = load_dataset(path=data_dir, data_files=data_file)
+        self.raw_datasets = raw_data.map(self.process_data)
+
+    def process_data(self, raw_data):
+        """Flatten one raw sample into ``id``/``from_human``/``from_gpt``.
+
+        Returns:
+            A dict with the sample id and both utterances, or with both
+            utterances set to ``None`` when the sample does not have
+            exactly two turns.
+
+        Raises:
+            ValueError: If a two-turn sample is not (human, gpt) in order.
+        """
+        custom_data = {
+            'id': raw_data['id'],
+            'from_human': None,
+            'from_gpt': None,
+        }
+
+        conversations = raw_data['conversations']
+        if len(conversations) != 2:
+            # Not a single human/gpt exchange: leave both fields as None.
+            return custom_data
+
+        human_turn = conversations[0]
+        gpt_turn = conversations[1]
+        # Explicit check instead of ``assert`` so it survives ``python -O``
+        # and runs before any value is copied.
+        if human_turn['from'] != 'human' or gpt_turn['from'] != 'gpt':
+            raise ValueError(
+                f"Sample {raw_data['id']} of {self.dataset_name} is not a "
+                f"human/gpt exchange: got '{human_turn['from']}' then "
+                f"'{gpt_turn['from']}'.")
+
+        custom_data['from_human'] = human_turn['value']
+        custom_data['from_gpt'] = gpt_turn['value']
+        return custom_data
+
+    def _get_split(self, split_index):
+        """Return part ``split_index`` of the "9,1" train/eval split."""
+        from .data_utils import get_raw_dataset_split_index
+        dataset = self.raw_datasets["train"]
+        index = get_raw_dataset_split_index(self.local_rank, self.output_path,
+                                            self.dataset_name_clean,
+                                            self.seed, "train_eval", "9,1",
+                                            split_index, len(dataset))
+        return Subset(dataset, index)
+
+    def get_train_data(self):
+        """Return the first part (index 0) of the "9,1" split."""
+        return self._get_split(0)
+
+    def get_eval_data(self):
+        """Return the second part (index 1) of the "9,1" split."""
+        return self._get_split(1)
+
+    def get_prompt(self, sample):
+        """Return the formatted human prompt, or None if it is missing."""
+        if sample['from_human'] is not None:
+            return " Human: " + sample['from_human'] + " Assistant:"
+        return None
+
+    def get_chosen(self, sample):
+        """Return the gpt response with a leading space, or None."""
+        if sample['from_gpt'] is not None:
+            return " " + sample['from_gpt']
+        return None
+
+    def get_rejected(self, sample):
+        """Warn and return None: there are no rejected responses."""
+        print(
+            f"Warning: dataset {self.dataset_name} does not include rejected response."
+        )
+        return None
+
+    def get_prompt_and_chosen(self, sample):
+        """Return prompt and response concatenated, or None if either is missing."""
+        prompt = self.get_prompt(sample)
+        chosen = self.get_chosen(sample)
+        if prompt is None or chosen is None:
+            return None
+        return prompt + chosen
+
+    def get_prompt_and_rejected(self, sample):
+        """Warn and return None: there are no rejected responses."""
+        print(
+            f"Warning: dataset {self.dataset_name} does not include rejected response."
+        )
+        return None
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		python inference/chatbot.py \
		--path training/step1_supervised_finetuning/output/bloomz-560m.phoenix_v1_test4