
Commit a0a80fc

Author: Shaden Smith
Pipeline parallelism example (deepspeedai#50)
1 parent 38f0952 commit a0a80fc

File tree

pipeline_parallelism/alexnet.py
pipeline_parallelism/ds_config.json
pipeline_parallelism/run.sh
pipeline_parallelism/train.py

4 files changed: +226 -0 lines

pipeline_parallelism/alexnet.py

Lines changed: 47 additions & 0 deletions
#
# Implementation of AlexNet for illustrative purposes. The train.py driver
# can import AlexNet from here or directly from torchvision.
#
# Taken from torchvision.models.alexnet:
# https://pytorch.org/docs/1.6.0/_modules/torchvision/models/alexnet.html#alexnet


import torch
import torch.nn as nn


class AlexNet(nn.Module):
    def __init__(self, num_classes=1000):
        super(AlexNet, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
        self.classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x
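
Since this class mirrors torchvision's AlexNet, it can be sanity-checked on its own before any distributed setup. A minimal sketch (the 2x3x224x224 input is an illustrative choice matching the 224x224 CenterCrop used by train.py):

# Minimal smoke test for alexnet.py; needs only torch.
import torch
from alexnet import AlexNet

model = AlexNet(num_classes=10)
x = torch.randn(2, 3, 224, 224)  # batch of two 224x224 RGB images
print(model(x).shape)            # expected: torch.Size([2, 10])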

pipeline_parallelism/ds_config.json

Lines changed: 19 additions & 0 deletions

{
    "train_batch_size" : 256,
    "train_micro_batch_size_per_gpu" : 8,

    "optimizer": {
        "type": "Adam",
        "params": {
            "lr": 0.001,
            "betas": [
                0.9,
                0.999
            ],
            "eps": 1e-8
        }
    },

    "steps_per_print" : 10,
    "wall_clock_breakdown" : false
}
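
A note on the batch sizing: DeepSpeed requires train_batch_size = train_micro_batch_size_per_gpu x gradient_accumulation_steps x (number of data-parallel replicas), and infers the omitted gradient_accumulation_steps from the other two values. A small sketch of the arithmetic; the topology is an assumption matching run.sh on a two-GPU job:

# Sketch: how gradient accumulation falls out of ds_config.json.
# Assumed topology: 2 GPUs split into 2 pipeline stages, so
# world_size // pipeline_parallel_size = 1 data-parallel replica.
train_batch_size = 256        # from ds_config.json
micro_batch_per_gpu = 8       # from ds_config.json
data_parallel_replicas = 2 // 2

grad_accum_steps = train_batch_size // (micro_batch_per_gpu * data_parallel_replicas)
print(grad_accum_steps)  # 32 micro-batches accumulated per optimizer step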

pipeline_parallelism/run.sh

Lines changed: 3 additions & 0 deletions
#!/bin/bash

deepspeed train.py --deepspeed_config=ds_config.json -p 2 --steps=200
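
For context: the deepspeed launcher starts one worker process per visible GPU and passes each one the --local_rank argument that get_args() in train.py expects. --deepspeed_config points the engine at ds_config.json above, and -p 2 requests two pipeline stages.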

pipeline_parallelism/train.py

Lines changed: 157 additions & 0 deletions
#!/usr/bin/env python3

import argparse

import torch
import torch.distributed as dist

import torchvision
import torchvision.transforms as transforms
from torchvision.models import AlexNet
from torchvision.models import vgg19

import deepspeed
from deepspeed.pipe import PipelineModule
from deepspeed.utils import RepeatingLoader


def cifar_trainset(local_rank, dl_path='/tmp/cifar10-data'):
    transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # Ensure only one rank downloads.
    # Note: if the download path is not on a shared filesystem, remove the semaphore
    # and switch to args.local_rank
    dist.barrier()
    if local_rank != 0:
        dist.barrier()
    trainset = torchvision.datasets.CIFAR10(root=dl_path,
                                            train=True,
                                            download=True,
                                            transform=transform)
    if local_rank == 0:
        dist.barrier()
    return trainset


def get_args():
    parser = argparse.ArgumentParser(description='CIFAR')
    parser.add_argument('--local_rank',
                        type=int,
                        default=-1,
                        help='local rank passed from distributed launcher')
    parser.add_argument('-s',
                        '--steps',
                        type=int,
                        default=100,
                        help='quit after this many steps')
    parser.add_argument('-p',
                        '--pipeline-parallel-size',
                        type=int,
                        default=2,
                        help='pipeline parallelism')
    parser.add_argument('--backend',
                        type=str,
                        default='nccl',
                        help='distributed backend')
    parser.add_argument('--seed', type=int, default=1138, help='PRNG seed')
    parser = deepspeed.add_config_arguments(parser)
    args = parser.parse_args()
    return args


def train_base(args):
    torch.manual_seed(args.seed)

    # VGG also works :-)
    #net = vgg19(num_classes=10)
    net = AlexNet(num_classes=10)

    trainset = cifar_trainset(args.local_rank)

    engine, _, dataloader, __ = deepspeed.initialize(
        args=args,
        model=net,
        model_parameters=[p for p in net.parameters() if p.requires_grad],
        training_data=trainset)

    dataloader = RepeatingLoader(dataloader)
    data_iter = iter(dataloader)

    rank = dist.get_rank()
    gas = engine.gradient_accumulation_steps()

    criterion = torch.nn.CrossEntropyLoss()

    total_steps = args.steps * gas
    step = 0
    for micro_step in range(total_steps):
        batch = next(data_iter)
        inputs = batch[0].to(engine.device)
        labels = batch[1].to(engine.device)

        outputs = engine(inputs)
        loss = criterion(outputs, labels)
        engine.backward(loss)
        engine.step()

        if micro_step % gas == 0:
            step += 1
            if rank == 0 and (step % 10 == 0):
                print(f'step: {step:3d} / {args.steps:3d} loss: {loss}')


def join_layers(vision_model):
    layers = [
        *vision_model.features,
        vision_model.avgpool,
        lambda x: torch.flatten(x, 1),
        *vision_model.classifier,
    ]
    return layers


def train_pipe(args, part='parameters'):
    torch.manual_seed(args.seed)
    deepspeed.runtime.utils.set_random_seed(args.seed)

    #
    # Build the model
    #

    # VGG also works :-)
    #net = vgg19(num_classes=10)
    net = AlexNet(num_classes=10)
    net = PipelineModule(layers=join_layers(net),
                         loss_fn=torch.nn.CrossEntropyLoss(),
                         num_stages=args.pipeline_parallel_size,
                         partition_method=part,
                         activation_checkpoint_interval=0)

    trainset = cifar_trainset(args.local_rank)

    engine, _, _, _ = deepspeed.initialize(
        args=args,
        model=net,
        model_parameters=[p for p in net.parameters() if p.requires_grad],
        training_data=trainset)

    for step in range(args.steps):
        loss = engine.train_batch()


if __name__ == '__main__':
    args = get_args()

    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(backend=args.backend)

    if args.pipeline_parallel_size == 0:
        train_base(args)
    else:
        train_pipe(args)
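
The pivotal step above is join_layers(): PipelineModule consumes a flat, ordered sequence of callables and cuts it into num_stages contiguous partitions (with partition_method='parameters', the cut points are chosen to balance trainable parameter counts across stages). The flattened view can be inspected without any distributed setup; a sketch, with join_layers copied inline from train.py so that only alexnet.py needs to be importable:

# Sketch: inspect the flat layer list that PipelineModule will partition.
# No distributed setup is needed for this inspection, though PipelineModule
# itself can only be constructed inside a distributed run.
import torch
from alexnet import AlexNet

def join_layers(vision_model):
    # Copied from train.py: flatten the nested model into one ordered list.
    return [
        *vision_model.features,
        vision_model.avgpool,
        lambda x: torch.flatten(x, 1),
        *vision_model.classifier,
    ]

layers = join_layers(AlexNet(num_classes=10))
print(len(layers))  # 22 callables: 13 feature layers, avgpool, flatten, 7 classifier layers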
