diff --git a/applications/DeepSpeed-Chat/infer.sh b/applications/DeepSpeed-Chat/infer.sh
new file mode 100644
index 000000000..c0eb8efb2
--- /dev/null
+++ b/applications/DeepSpeed-Chat/infer.sh
@@ -0,0 +1,2 @@
+python inference/chatbot.py \
+    --path training/step1_supervised_finetuning/output/bloomz-560m.phoenix_v1_test4
\ No newline at end of file
diff --git a/applications/DeepSpeed-Chat/inference/chatbot.py b/applications/DeepSpeed-Chat/inference/chatbot.py
index 38b900d7d..310434b88 100644
--- a/applications/DeepSpeed-Chat/inference/chatbot.py
+++ b/applications/DeepSpeed-Chat/inference/chatbot.py
@@ -10,7 +10,7 @@ import os
 import json
 
 from transformers import pipeline, set_seed
-from transformers import AutoConfig, OPTForCausalLM, AutoTokenizer
+from transformers import AutoConfig, OPTForCausalLM, AutoTokenizer, BloomForCausalLM
 
 
 def parse_args():
@@ -43,9 +43,14 @@ def get_generator(path):
     tokenizer.pad_token = tokenizer.eos_token
 
     model_config = AutoConfig.from_pretrained(path)
-    model = OPTForCausalLM.from_pretrained(path,
-                                           from_tf=bool(".ckpt" in path),
-                                           config=model_config).half()
+    if 'bloom' in path:
+        model = BloomForCausalLM.from_pretrained(path,
+                                                 from_tf=bool(".ckpt" in path),
+                                                 config=model_config).half()
+    else:
+        model = OPTForCausalLM.from_pretrained(path,
+                                               from_tf=bool(".ckpt" in path),
+                                               config=model_config).half()
 
     model.config.end_token_id = tokenizer.eos_token_id
     model.config.pad_token_id = model.config.eos_token_id
diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/evaluation_scripts/run_prompt.sh b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/evaluation_scripts/run_prompt.sh
index 2828e32a4..5f2a159a0 100644
--- a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/evaluation_scripts/run_prompt.sh
+++ b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/evaluation_scripts/run_prompt.sh
@@ -7,5 +7,5 @@
 # You can provide two models to compare the performance of the baseline and the finetuned model
 export CUDA_VISIBLE_DEVICES=0
 python prompt_eval.py \
-    --model_name_or_path_baseline XXX \
-    --model_name_or_path_finetune XXX
+    --model_name_or_path_baseline bigscience/bloomz-560m \
+    --model_name_or_path_finetune ./output/bloomz-560m.phoenix_v1_test4/
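Note on the chatbot.py hunk above: checkpoint paths containing "bloom" are now loaded with BloomForCausalLM, with OPTForCausalLM kept as the fallback. Below is a minimal, standalone sketch of that loading path, assuming the public bigscience/bloomz-560m checkpoint and a single GPU; the prompt string is only illustrative.

    # Sketch: load a BLOOM checkpoint the way get_generator() now does.
    from transformers import AutoConfig, AutoTokenizer, BloomForCausalLM, pipeline

    path = "bigscience/bloomz-560m"  # or a fine-tuned output dir containing "bloom"
    tokenizer = AutoTokenizer.from_pretrained(path)
    tokenizer.pad_token = tokenizer.eos_token

    model_config = AutoConfig.from_pretrained(path)
    model = BloomForCausalLM.from_pretrained(path, config=model_config).half()
    model.config.end_token_id = tokenizer.eos_token_id
    model.config.pad_token_id = model.config.eos_token_id

    generator = pipeline("text-generation",
                         model=model,
                         tokenizer=tokenizer,
                         device=0)
    print(generator("Human: Please explain ZeRO stage 2. Assistant:",
                    max_new_tokens=64)[0]["generated_text"])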
diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_gpu/run_560m.sh b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_gpu/run_560m.sh
new file mode 100644
index 000000000..186e8f64e
--- /dev/null
+++ b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_gpu/run_560m.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# Note that usually LoRA needs to use larger learning rate
+OUTPUT=$1
+ZERO_STAGE=$2
+if [ "$OUTPUT" == "" ]; then
+    OUTPUT=./output/bloomz-560m.phoenix_v1_test5
+fi
+if [ "$ZERO_STAGE" == "" ]; then
+    ZERO_STAGE=2
+fi
+mkdir -p $OUTPUT
+
+deepspeed --num_gpus 1 main.py \
+   --data_path custom/phoenix_v1 \
+   --model_name_or_path bigscience/bloomz-560m \
+   --data_split 2,4,4 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --lr_scheduler_type cosine --num_warmup_steps 0 \
+   --gradient_accumulation_steps 1 --zero_stage $ZERO_STAGE --local_rank 0 \
+   --max_seq_len 512 --learning_rate 9.65e-6 --weight_decay 0. --num_train_epochs 1 \
+   --deepspeed --output_dir $OUTPUT 2>&1 | tee $OUTPUT/training.log
diff --git a/applications/DeepSpeed-Chat/training/utils/data/data_utils.py b/applications/DeepSpeed-Chat/training/utils/data/data_utils.py
index a6e4a601a..71bc44558 100644
--- a/applications/DeepSpeed-Chat/training/utils/data/data_utils.py
+++ b/applications/DeepSpeed-Chat/training/utils/data/data_utils.py
@@ -64,6 +64,9 @@ def get_raw_dataset(dataset_name, output_path, seed, local_rank):
     elif "lmqg/qag_jaquad" in dataset_name:
         return raw_datasets.LmqgQagjaquadDataset(output_path, seed, local_rank,
                                                  dataset_name)
+    elif "custom/phoenix_v1" in dataset_name:
+        return raw_datasets.CustomPhoenixv1Dataset(output_path, seed, local_rank,
+                                                   dataset_name)
     else:
         raise RuntimeError(
             f"We do not have configs for dataset {dataset_name}, but you can add it by yourself in raw_datasets.py."
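With the new elif branch in get_raw_dataset, the --data_path custom/phoenix_v1 value passed by run_560m.sh resolves to the CustomPhoenixv1Dataset added in raw_datasets.py below. A small sketch of that dispatch, assuming it is run from training/step1_supervised_finetuning (so the utils package resolves) and that the local data.json referenced by the class exists; output_path and seed are illustrative values.

    # Sketch: resolve the custom dataset name the same way main.py does.
    from utils.data.data_utils import get_raw_dataset

    raw_dataset = get_raw_dataset("custom/phoenix_v1",
                                  output_path="./output/data_files",  # illustrative
                                  seed=1234,
                                  local_rank=0)
    train_data = raw_dataset.get_train_data()
    sample = train_data[0]
    print(raw_dataset.get_prompt(sample))
    print(raw_dataset.get_prompt_and_chosen(sample))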
diff --git a/applications/DeepSpeed-Chat/training/utils/data/raw_datasets.py b/applications/DeepSpeed-Chat/training/utils/data/raw_datasets.py
index 23666e234..66ec63756 100644
--- a/applications/DeepSpeed-Chat/training/utils/data/raw_datasets.py
+++ b/applications/DeepSpeed-Chat/training/utils/data/raw_datasets.py
@@ -6,6 +6,7 @@
 from torch.utils.data import Subset
 import re
 
+CustomDatasets = ['custom/phoenix_v1']
 
 # The template prompt dataset class that all new dataset porting needs to
 # follow in order to have a unified API and unified data format.
@@ -15,7 +16,8 @@ def __init__(self, output_path, seed, local_rank, dataset_name):
         self.output_path = output_path
         self.seed = seed
         self.local_rank = local_rank
-        self.raw_datasets = load_dataset(dataset_name)
+        if dataset_name not in CustomDatasets:
+            self.raw_datasets = load_dataset(dataset_name)
 
     def get_train_data(self):
         return
@@ -710,3 +712,79 @@ def get_prompt_and_rejected(self, sample):
             f"Warning: dataset {self.dataset_name} does not include rejected response."
         )
         return None
+
+
+# LLMZoo Phoenix SFT dataset, loaded from a local JSON export
+class CustomPhoenixv1Dataset(PromptRawDataset):
+
+    def __init__(self, output_path, seed, local_rank, dataset_name):
+        super().__init__(output_path, seed, local_rank, dataset_name)
+        self.dataset_name = "custom/phoenix_v1"
+        self.dataset_name_clean = "custom_phoenix_v1"
+        raw_data = load_dataset(
+            path='/home/vrlab/AI for cryo/phoenix-sft-data-v1/',
+            data_files='data.json')
+        self.raw_datasets = raw_data.map(self.process_data)
+
+    def process_data(self, raw_data):
+        custom_data = {}
+        custom_data['id'] = raw_data['id']
+        if len(raw_data['conversations']) == 2:
+            # Only use records with exactly one human turn and one gpt reply.
+            assert raw_data['conversations'][0]['from'] == 'human' and \
+                raw_data['conversations'][1]['from'] == 'gpt'
+            custom_data['from_human'] = raw_data['conversations'][0]['value']
+            custom_data['from_gpt'] = raw_data['conversations'][1]['value']
+        else:
+            # Multi-turn or malformed records are kept but marked unusable.
+            custom_data['from_human'] = None
+            custom_data['from_gpt'] = None
+        return custom_data
+
+    def get_train_data(self):
+        from .data_utils import get_raw_dataset_split_index
+        dataset = self.raw_datasets["train"]
+        index = get_raw_dataset_split_index(self.local_rank, self.output_path,
+                                            self.dataset_name_clean,
+                                            self.seed, "train_eval", "9,1", 0,
+                                            len(dataset))
+        dataset = Subset(dataset, index)
+        return dataset
+
+    def get_eval_data(self):
+        from .data_utils import get_raw_dataset_split_index
+        dataset = self.raw_datasets["train"]
+        index = get_raw_dataset_split_index(self.local_rank, self.output_path,
+                                            self.dataset_name_clean,
+                                            self.seed, "train_eval", "9,1", 1,
+                                            len(dataset))
+        dataset = Subset(dataset, index)
+        return dataset
+
+    def get_prompt(self, sample):
+        if sample['from_human'] is not None:
+            return " Human: " + sample['from_human'] + " Assistant:"
+        return None
+
+    def get_chosen(self, sample):
+        if sample['from_gpt'] is not None:
+            return " " + sample['from_gpt']
+        return None
+
+    def get_rejected(self, sample):
+        print(
+            f"Warning: dataset {self.dataset_name} does not include rejected response."
+        )
+        return None
+
+    def get_prompt_and_chosen(self, sample):
+        if sample['from_human'] is not None and sample['from_gpt'] is not None:
+            return " Human: " + sample['from_human'] + \
+                " Assistant: " + sample['from_gpt']
+        return None
+
+    def get_prompt_and_rejected(self, sample):
+        print(
+            f"Warning: dataset {self.dataset_name} does not include rejected response."
+        )
+        return None
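For reference, a hedged sketch of what process_data and the prompt helpers produce for a single record in the phoenix-sft-data-v1 conversation format; the record content is invented for illustration, and __init__ is bypassed so the hard-coded local data path is not required.

    # Illustrative record in the {"id", "conversations": [...]} format used above.
    record = {
        "id": "example_0",
        "conversations": [
            {"from": "human", "value": "What is supervised fine-tuning?"},
            {"from": "gpt", "value": "It adapts a pretrained model to instruction data."},
        ],
    }

    # Bypass __init__ so load_dataset() on the local path is not triggered.
    ds = CustomPhoenixv1Dataset.__new__(CustomPhoenixv1Dataset)
    sample = ds.process_data(record)
    # sample == {"id": "example_0",
    #            "from_human": "What is supervised fine-tuning?",
    #            "from_gpt": "It adapts a pretrained model to instruction data."}

    print(ds.get_prompt(sample))
    # " Human: What is supervised fine-tuning? Assistant:"
    print(ds.get_prompt_and_chosen(sample))
    # " Human: What is supervised fine-tuning? Assistant: It adapts a pretrained model to instruction data."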