Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions applications/DeepSpeed-Chat/infer.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Run inference/chatbot.py against the model directory given by --path
# (here: an output directory under training/step1_supervised_finetuning).
python inference/chatbot.py \
--path training/step1_supervised_finetuning/output/bloomz-560m.phoenix_v1_test4
16 changes: 12 additions & 4 deletions applications/DeepSpeed-Chat/inference/chatbot.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import os
import json
from transformers import pipeline, set_seed
from transformers import AutoConfig, OPTForCausalLM, AutoTokenizer
from transformers import AutoConfig, OPTForCausalLM, AutoTokenizer, BloomForCausalLM


def parse_args():
Expand Down Expand Up @@ -43,9 +43,17 @@ def get_generator(path):
tokenizer.pad_token = tokenizer.eos_token

model_config = AutoConfig.from_pretrained(path)
model = OPTForCausalLM.from_pretrained(path,
from_tf=bool(".ckpt" in path),
config=model_config).half()
if 'bloom' in path:
model = BloomForCausalLM.from_pretrained(path,
from_tf=bool(".ckpt" in path),
config=model_config).half()
else:
model = OPTForCausalLM.from_pretrained(path,
from_tf=bool(".ckpt" in path),
config=model_config).half()
# model = OPTForCausalLM.from_pretrained(path,
# from_tf=bool(".ckpt" in path),
# config=model_config).half()

model.config.end_token_id = tokenizer.eos_token_id
model.config.pad_token_id = model.config.eos_token_id
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@
# You can provide two models to compare the performance of the baseline and the finetuned model
export CUDA_VISIBLE_DEVICES=0
python prompt_eval.py \
--model_name_or_path_baseline XXX \
--model_name_or_path_finetune XXX
--model_name_or_path_baseline bigscience/bloomz-560m \
--model_name_or_path_finetune ./output/bloomz-560m.phoenix_v1_test4/
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/bash
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

# Note: LoRA usually needs a larger learning rate. No LoRA options are passed
# to main.py below, so the learning rate here is for full finetuning.
#
# Usage: <this script> [OUTPUT_DIR] [ZERO_STAGE]
# Both arguments are optional; a missing or empty value falls back to the
# default given in the expansions below.
OUTPUT=${1:-./output/bloomz-560m.phoenix_v1_test5}
ZERO_STAGE=${2:-2}

# Quote every expansion of OUTPUT so directories containing spaces work.
mkdir -p "$OUTPUT"

# stderr is merged into stdout and the combined stream is copied to
# $OUTPUT/training.log by tee while still being shown on the terminal.
deepspeed --num_gpus 1 main.py \
    --data_path custom/phoenix_v1 \
    --model_name_or_path bigscience/bloomz-560m \
    --data_split 2,4,4 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --lr_scheduler_type cosine --num_warmup_steps 0 \
    --gradient_accumulation_steps 1 --zero_stage "$ZERO_STAGE" --local_rank 0 \
    --max_seq_len 512 --learning_rate 9.65e-6 --weight_decay 0. --num_train_epochs 1 \
    --deepspeed --output_dir "$OUTPUT" 2>&1 | tee "$OUTPUT/training.log"

3 changes: 3 additions & 0 deletions applications/DeepSpeed-Chat/training/utils/data/data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@ def get_raw_dataset(dataset_name, output_path, seed, local_rank):
elif "lmqg/qag_jaquad" in dataset_name:
return raw_datasets.LmqgQagjaquadDataset(output_path, seed, local_rank,
dataset_name)
elif "custom/phoenix_v1" in dataset_name:
return raw_datasets.CustomPhoenixv1Dataset(output_path, seed, local_rank,
dataset_name)
else:
raise RuntimeError(
f"We do not have configs for dataset {dataset_name}, but you can add it by yourself in raw_datasets.py."
Expand Down
83 changes: 82 additions & 1 deletion applications/DeepSpeed-Chat/training/utils/data/raw_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from torch.utils.data import Subset
import re

CustomDatasets = ['custom/phoenix_v1']

# The template prompt dataset class that all new dataset porting needs to
# follow in order to have a unified API and unified data format.
Expand All @@ -15,7 +16,8 @@ def __init__(self, output_path, seed, local_rank, dataset_name):
self.output_path = output_path
self.seed = seed
self.local_rank = local_rank
self.raw_datasets = load_dataset(dataset_name)
if dataset_name not in CustomDatasets:
self.raw_datasets = load_dataset(dataset_name)

def get_train_data(self):
return
Expand Down Expand Up @@ -710,3 +712,82 @@ def get_prompt_and_rejected(self, sample):
f"Warning: dataset {self.dataset_name} does not include rejected response."
)
return None



# LLMZoo dataset
class CustomPhoenixv1Dataset(PromptRawDataset):
    """Prompt dataset built from a local phoenix-sft-data-v1 JSON file.

    Each raw record is expected to carry an ``id`` and a ``conversations``
    list whose items are dicts with ``from`` and ``value`` keys.  Only
    records that consist of exactly one ``human`` turn followed by one
    ``gpt`` turn are usable; every other record is kept but has its
    ``from_human`` / ``from_gpt`` fields set to ``None`` so that the
    accessor methods return ``None`` for it.

    The dataset has no rejected responses, so the ``*_rejected`` accessors
    always return ``None`` after printing a warning.
    """

    # Location that used to be hard-coded in ``__init__``; kept as the
    # default so existing setups behave exactly as before.
    DEFAULT_DATA_DIR = '/home/vrlab/AI for cryo/phoenix-sft-data-v1/'
    DATA_FILE = 'data.json'
    # When this environment variable is set to a non-empty value it replaces
    # DEFAULT_DATA_DIR, so the dataset can be used on other machines.
    DATA_DIR_ENV_VAR = 'PHOENIX_V1_DATA_DIR'

    def __init__(self, output_path, seed, local_rank, dataset_name):
        """Load the local JSON file and normalise every record.

        Args:
            output_path: forwarded to the base class and later passed to
                ``get_raw_dataset_split_index``.
            seed: forwarded to the base class and used for the split index.
            local_rank: forwarded to the base class and used for the split
                index.
            dataset_name: name requested by the caller.  It is not used to
                locate the data; the canonical name is used instead.
        """
        canonical_name = "custom/phoenix_v1"
        # The base class skips its own ``load_dataset(dataset_name)`` call
        # only for names listed in ``CustomDatasets``.  The dispatcher
        # matches this dataset by substring, so pass the canonical name to
        # guarantee the skip even if the caller's spelling differs.
        super().__init__(output_path, seed, local_rank, canonical_name)
        self.dataset_name = canonical_name
        self.dataset_name_clean = "custom_phoenix_v1"

        # Local import: only this block needs ``os`` and the file's import
        # section is left untouched.
        import os
        data_dir = os.environ.get(self.DATA_DIR_ENV_VAR) or self.DEFAULT_DATA_DIR
        raw_data = load_dataset(path=data_dir, data_files=self.DATA_FILE)
        self.raw_datasets = raw_data.map(self.process_data)

    def process_data(self, raw_data):
        """Convert one raw record into ``id`` / ``from_human`` / ``from_gpt``.

        Args:
            raw_data: a single record with ``id`` and ``conversations``.

        Returns:
            A dict with keys ``id``, ``from_human`` and ``from_gpt``.  The
            two text fields are ``None`` unless the record is exactly a
            ``human`` turn followed by a ``gpt`` turn.
        """
        custom_data = {
            'id': raw_data['id'],
            'from_human': None,
            'from_gpt': None,
        }

        conversations = raw_data['conversations']
        # Only use the data with both human and gpt response.  The role
        # check is a real condition rather than an ``assert`` so that it is
        # not stripped under ``python -O`` and a single malformed record
        # does not abort the whole ``map`` call.
        if len(conversations) == 2:
            human_turn, gpt_turn = conversations
            if human_turn['from'] == 'human' and gpt_turn['from'] == 'gpt':
                custom_data['from_human'] = human_turn['value']
                custom_data['from_gpt'] = gpt_turn['value']

        return custom_data

    def _get_split(self, split_index):
        """Return the ``split_index``-th part of the "9,1" train/eval split."""
        # Imported lazily, as in the original methods (presumably to avoid
        # a circular import between data_utils and this module).
        from .data_utils import get_raw_dataset_split_index
        dataset = self.raw_datasets["train"]
        index = get_raw_dataset_split_index(self.local_rank, self.output_path,
                                            self.dataset_name_clean,
                                            self.seed, "train_eval", "9,1",
                                            split_index, len(dataset))
        return Subset(dataset, index)

    def get_train_data(self):
        """Return the training subset (part 0 of the "9,1" split)."""
        return self._get_split(0)

    def get_eval_data(self):
        """Return the evaluation subset (part 1 of the "9,1" split)."""
        return self._get_split(1)

    def get_prompt(self, sample):
        """Return the formatted prompt, or ``None`` for unusable samples."""
        if sample['from_human'] is not None:
            return " Human: " + sample['from_human'] + " Assistant:"
        return None

    def get_chosen(self, sample):
        """Return the response text with a leading space, or ``None``."""
        if sample['from_gpt'] is not None:
            return " " + sample['from_gpt']
        return None

    def get_rejected(self, sample):
        """Always ``None``: this dataset has no rejected responses."""
        print(
            f"Warning: dataset {self.dataset_name} does not include rejected response."
        )
        return None

    def get_prompt_and_chosen(self, sample):
        """Return prompt and chosen response joined, or ``None``.

        The result is ``None`` when either part is missing; otherwise it is
        exactly ``get_prompt(sample) + get_chosen(sample)``.
        """
        prompt = self.get_prompt(sample)
        chosen = self.get_chosen(sample)
        if prompt is None or chosen is None:
            return None
        return prompt + chosen

    def get_prompt_and_rejected(self, sample):
        """Always ``None``: this dataset has no rejected responses."""
        print(
            f"Warning: dataset {self.dataset_name} does not include rejected response."
        )
        return None