Object detection is a computer vision task that identifies objects in an image and determines their exact locations. It combines classification and localization to detect multiple objects simultaneously within a scene. YOLO (You Only Look Once) is a real-time detection algorithm that processes the entire image in a single pass, making it much faster than traditional multi-stage methods.
Key components of YOLO:
- Darknet-53 Backbone: 53-layer CNN for useful feature extraction.
- Detection Heads: Three prediction layers for multi-scale detection.
- Anchor Boxes: Predefined box shapes used to detect objects of different sizes.
Step By Step Implementation
Here we implement a complete YOLOv3 pipeline in TensorFlow from building the model and loading weights to running inference and visualizing final object detections.
Step 1: Import Required Libraries
- Imports numpy for numerical operations, cv2 for image processing.
- Import matplotlib for visualizing images, graphs and model outputs.
- Imports TensorFlow and Keras layers to build deep learning models like CNNs or YOLO architectures.
- Includes regularizers and loss functions for training and optimizing neural networks.
import numpy as np
import cv2, os
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import (
Add, Concatenate, Conv2D,
Input, Lambda, LeakyReLU,
UpSampling2D, ZeroPadding2D
)
from tensorflow.keras.regularizers import l2
from tensorflow.keras.losses import (
binary_crossentropy,
sparse_categorical_crossentropy
)
Step 2: Define YOLO Layer List, Anchors, Masks and Class Names
- YOLOV3_LAYER_LIST contains all submodel layers for loading pretrained weights.
- yolo_anchors are the bounding box priors normalized by 416.
- yolo_anchor_masks map anchors to the output scales.
- class_names stores all COCO dataset class labels.
# Names of the sub-models that load_darknet_weights() walks, in this order,
# when it reads the pretrained weights file sequentially.
YOLOV3_LAYER_LIST = [
'yolo_darknet', 'yolo_conv_0', 'yolo_output_0',
'yolo_conv_1', 'yolo_output_1', 'yolo_conv_2', 'yolo_output_2'
]
# Nine anchor-box priors as (width, height) pairs, divided by 416 so that
# they are expressed as fractions of the input image size.
yolo_anchors = np.array([
(10, 13), (16, 30), (33, 23),
(30, 61), (62, 45), (59, 119),
(116, 90), (156, 198), (373, 326)
], np.float32) / 416
# Row i holds the indices into yolo_anchors that are used by detection
# output i (YoloV3 selects them with anchors[masks[i]]).
yolo_anchor_masks = np.array([[6, 7, 8], [3, 4, 5], [0, 1, 2]])
# 80 class labels; predict_and_draw() indexes this list with the predicted
# class id to build the caption of each box.
class_names = [
'person', 'bicycle', 'car', 'motorbike', 'aeroplane', 'bus', 'train', 'truck', 'boat',
'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench',
'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'sofa',
'pottedplant', 'bed', 'diningtable', 'toilet', 'tvmonitor', 'laptop', 'mouse',
'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
'hair drier', 'toothbrush'
]
Step 3: Load Darknet Pretrained Weights
- load_darknet_weights reads official YOLOv3 .weights file.
- Handles convolutional layers with or without batch normalization.
- Loads weights into the TensorFlow Keras model.
def load_darknet_weights(model, weights_file):
    """Copy pretrained Darknet weights from a binary file into ``model``.

    The file is consumed strictly sequentially: a five-value int32 header,
    then, for every convolution found in the sub-models listed in
    ``YOLOV3_LAYER_LIST`` (in that order), either its bias vector or the
    four batch-norm vectors, followed by the convolution kernel.

    Args:
        model: Keras model exposing the sub-models named in
            ``YOLOV3_LAYER_LIST`` through ``get_layer``.
        weights_file: Path to the Darknet ``.weights`` file.

    Raises:
        AssertionError: If unread bytes remain after every layer was loaded,
            i.e. the file does not match the model architecture.
    """
    # The context manager guarantees the handle is released even when a read,
    # reshape or set_weights call fails part-way through the file.
    with open(weights_file, 'rb') as wf:
        # Header values are not used afterwards; reading them advances the
        # file position to the first layer's data.
        major, minor, revision, seen, _ = np.fromfile(wf, dtype=np.int32, count=5)

        for layer_name in YOLOV3_LAYER_LIST:
            sub_model = model.get_layer(layer_name)
            for i, layer in enumerate(sub_model.layers):
                # Only convolutions own a record in the file; batch-norm
                # layers are handled together with the conv preceding them.
                if not layer.name.startswith('conv2d'):
                    continue

                batch_norm = None
                if i + 1 < len(sub_model.layers) and sub_model.layers[i + 1].name.startswith('batch_norm'):
                    batch_norm = sub_model.layers[i + 1]

                filters = layer.filters
                size = layer.kernel_size[0]
                in_dim = layer.input.shape[-1]

                if batch_norm is None:
                    conv_bias = np.fromfile(wf, dtype=np.float32, count=filters)
                else:
                    bn_weights = np.fromfile(wf, dtype=np.float32, count=4 * filters)
                    # Swap the first two rows of the (4, filters) table.
                    # NOTE(review): presumably Darknet stores
                    # (beta, gamma, mean, var) while Keras expects
                    # (gamma, beta, mean, var) -- confirm.
                    bn_weights = bn_weights.reshape((4, filters))[[1, 0, 2, 3]]

                # Kernel is stored as (out, in, h, w); Keras wants (h, w, in, out).
                conv_shape = (filters, in_dim, size, size)
                conv_weights = np.fromfile(wf, dtype=np.float32, count=np.prod(conv_shape))
                conv_weights = conv_weights.reshape(conv_shape).transpose([2, 3, 1, 0])

                if batch_norm is None:
                    layer.set_weights([conv_weights, conv_bias])
                else:
                    layer.set_weights([conv_weights])
                    batch_norm.set_weights(bn_weights)

        # Any leftover bytes mean the file and the model disagree.
        assert len(wf.read()) == 0, 'failed to read all data'
Step 4: Utility Functions
- transform_images resizes and normalizes image for model input.
- broadcast_iou computes IoU between predicted boxes and ground truth boxes for loss calculation.
def transform_images(x, size):
    """Resize ``x`` to ``size`` x ``size`` and divide the pixel values by 255.

    Args:
        x: Image tensor accepted by ``tf.image.resize``.
        size: Target height and width (the output is square).

    Returns:
        The resized tensor scaled by 1/255.
    """
    resized = tf.image.resize(x, (size, size))
    return resized / 255.0
def broadcast_iou(box_1, box_2):
    """Compute the IoU of every box in ``box_1`` against every box in ``box_2``.

    Boxes are in corner format on their last axis: entries 0 and 1 are the
    minimum corner, entries 2 and 3 the maximum corner.

    Args:
        box_1: Tensor of boxes; a new axis is inserted before its last axis.
        box_2: Tensor of boxes; a new leading axis is inserted.

    Returns:
        Tensor of IoU values with the broadcast shape of the two expanded
        inputs, minus the coordinate axis.
    """
    # Insert singleton axes so the two sets pair up under broadcasting.
    box_1 = tf.expand_dims(box_1, -2)
    box_2 = tf.expand_dims(box_2, 0)
    common_shape = tf.broadcast_dynamic_shape(tf.shape(box_1), tf.shape(box_2))
    first = tf.broadcast_to(box_1, common_shape)
    second = tf.broadcast_to(box_2, common_shape)

    def _area(box):
        # Extent along axis 0/2 times extent along axis 1/3.
        return (box[..., 2] - box[..., 0]) * (box[..., 3] - box[..., 1])

    # Overlap extents are clamped at zero for disjoint boxes.
    overlap_w = tf.maximum(
        tf.minimum(first[..., 2], second[..., 2]) - tf.maximum(first[..., 0], second[..., 0]),
        0,
    )
    overlap_h = tf.maximum(
        tf.minimum(first[..., 3], second[..., 3]) - tf.maximum(first[..., 1], second[..., 1]),
        0,
    )
    intersection = overlap_w * overlap_h

    first_area = _area(first)
    second_area = _area(second)
    return intersection / (first_area + second_area - intersection)
Step 5: Custom BatchNormalization Layer
Handles training flags correctly during inference or fine-tuning.
class BatchNormalization(tf.keras.layers.BatchNormalization):
    """Batch normalization that only runs in training mode when trainable.

    The effective training flag passed to the parent layer is
    ``training AND self.trainable``, so a layer with ``trainable=False``
    always runs in inference mode.
    """

    def call(self, x, training=False):
        """Apply batch normalization with the combined training flag.

        Args:
            x: Input tensor.
            training: Python bool, tensor, or ``None`` (treated as ``False``).

        Returns:
            The output of the parent ``call``.
        """
        requested = False if training is None else training
        if isinstance(requested, bool):
            # Plain Python booleans are combined without creating a TF op.
            effective = requested and self.trainable
        else:
            effective = tf.logical_and(requested, self.trainable)
        return super().call(x, training=effective)
Step 6: Darknet Convolution, Residual and Blocks
- DarknetConv applies convolution with optional batch normalization and LeakyReLU using special padding for stride 2.
- DarknetResidual creates a residual shortcut by applying two DarknetConv layers and adding the input back.
- DarknetBlock performs downsampling with a convolution and then stacks multiple residual units.
def DarknetConv(x, filters, size, strides=1, batch_norm=True):
    """Apply a convolution, optionally followed by batch norm and LeakyReLU.

    Args:
        x: Input tensor.
        filters: Number of output channels.
        size: Kernel size.
        strides: Convolution stride; any value other than 1 switches to
            explicit one-sided zero padding plus a 'valid' convolution.
        batch_norm: When true, the convolution has no bias and is followed
            by ``BatchNormalization`` and ``LeakyReLU(alpha=0.1)``.

    Returns:
        The resulting tensor.
    """
    if strides != 1:
        # Pad one row at the top and one column on the left only.
        x = ZeroPadding2D(((1, 0), (1, 0)))(x)
    padding = 'same' if strides == 1 else 'valid'

    conv = Conv2D(
        filters=filters,
        kernel_size=size,
        strides=strides,
        padding=padding,
        use_bias=not batch_norm,
        kernel_regularizer=l2(0.0005),
    )
    x = conv(x)

    if not batch_norm:
        return x

    # NOTE(review): the source lost its indentation; the activation is taken
    # to belong to the batch_norm branch, leaving batch_norm=False
    # convolutions linear -- confirm.
    x = BatchNormalization()(x)
    return LeakyReLU(alpha=0.1)(x)
def DarknetResidual(x, filters):
    """Residual unit: a 1x1 then a 3x3 DarknetConv, added back onto the input.

    Args:
        x: Input tensor.
        filters: Channel count of the 3x3 convolution; the 1x1 convolution
            uses ``filters // 2``.

    Returns:
        ``x`` plus the output of the two convolutions.
    """
    shortcut = x
    squeezed = DarknetConv(shortcut, filters // 2, 1)
    expanded = DarknetConv(squeezed, filters, 3)
    return Add()([shortcut, expanded])
def DarknetBlock(x, filters, blocks):
    """Stride-2 3x3 DarknetConv followed by ``blocks`` residual units.

    Args:
        x: Input tensor.
        filters: Channel count used by the convolution and every residual unit.
        blocks: Number of ``DarknetResidual`` units to stack.

    Returns:
        The output tensor of the last residual unit.
    """
    out = DarknetConv(x, filters, 3, strides=2)
    for _unit in range(blocks):
        out = DarknetResidual(out, filters)
    return out
Step 7: Darknet Backbone
- Builds full Darknet53 backbone.
- Returns intermediate outputs for skip connections.
def Darknet(name=None):
    """Build the Darknet-53 backbone as a Keras sub-model.

    This step previously repeated the YoloConv/YoloOutput definitions of the
    next step, leaving ``Darknet`` (called by ``YoloV3``) undefined.

    Args:
        name: Name of the resulting sub-model; ``load_darknet_weights``
            looks the backbone up under 'yolo_darknet'.

    Returns:
        A Model mapping an image tensor to three feature maps
        ``(x_36, x_61, x)``: the outputs of the 256-, 512- and 1024-filter
        stages, used as skip connections and as the deepest feature map.
    """
    x = inputs = Input([None, None, 3])
    x = DarknetConv(x, 32, 3)
    x = DarknetBlock(x, 64, 1)
    x = DarknetBlock(x, 128, 2)
    x = x_36 = DarknetBlock(x, 256, 8)    # skip connection for the finest scale
    x = x_61 = DarknetBlock(x, 512, 8)    # skip connection for the middle scale
    x = DarknetBlock(x, 1024, 4)
    return Model(inputs, (x_36, x_61, x), name=name)
Step 8: YOLO Heads
- YoloConv applies convolutional layers with optional skip connection.
- YoloOutput produces final predictions reshaped for each anchor.
def YoloConv(x_in, filters, name=None):
    """Five-convolution neck applied before a detection output.

    Args:
        x_in: Either a single feature tensor, or a tuple ``(x, x_skip)``.
            For a tuple, ``x`` goes through a 1x1 DarknetConv, is upsampled
            by 2 and concatenated with ``x_skip`` first.
        filters: Base channel count; the stack alternates 1x1 convolutions
            with ``filters`` channels and 3x3 ones with ``filters * 2``.
        name: Name of the wrapping sub-model.

    Returns:
        The sub-model applied to ``x_in``.
    """
    if not isinstance(x_in, tuple):
        inputs = Input(x_in.shape[1:])
        x = inputs
    else:
        deep_in = Input(x_in[0].shape[1:])
        skip_in = Input(x_in[1].shape[1:])
        inputs = (deep_in, skip_in)
        x = DarknetConv(deep_in, filters, 1)
        x = UpSampling2D(2)(x)
        x = Concatenate()([x, skip_in])

    # (channels, kernel size) of the five stacked convolutions, in order.
    stack = (
        (filters, 1),
        (filters * 2, 3),
        (filters, 1),
        (filters * 2, 3),
        (filters, 1),
    )
    for channels, kernel in stack:
        x = DarknetConv(x, channels, kernel)
    return Model(inputs, x, name=name)(x_in)
def YoloOutput(x_in, filters, anchors, classes, name=None):
    """Detection head producing one prediction vector per anchor and cell.

    Args:
        x_in: Input feature tensor.
        filters: Base channel count; the 3x3 convolution uses ``filters * 2``.
        anchors: Number of anchors predicted at this scale.
        classes: Number of classes.
        name: Name of the wrapping sub-model.

    Returns:
        The sub-model applied to ``x_in``; its output is reshaped to
        ``(batch, height, width, anchors, classes + 5)``.
    """
    values_per_anchor = classes + 5

    inputs = Input(x_in.shape[1:])
    x = DarknetConv(inputs, filters * 2, 3)
    x = DarknetConv(x, anchors * values_per_anchor, 1, batch_norm=False)

    def _split_per_anchor(t):
        # Unfold the channel axis into (anchors, values_per_anchor).
        dims = tf.shape(t)
        return tf.reshape(t, (-1, dims[1], dims[2], anchors, values_per_anchor))

    x = Lambda(_split_per_anchor)(x)
    return Model(inputs, x, name=name)(x_in)
Step 9: YOLO Box Processing and NMS
- yolo_boxes converts raw predictions to bounding boxes in [y1, x1, y2, x2].
- yolo_nms applies Non-Maximum Suppression to remove duplicate boxes.
def yolo_boxes(pred, anchors, classes):
    """Decode one raw detection output into boxes and probabilities.

    Args:
        pred: Raw output whose last axis splits into
            ``(xy: 2, wh: 2, objectness: 1, class scores: classes)``.
            The grid size is read from axis 1 only, so a square grid is
            assumed.
        anchors: Anchor sizes multiplied onto ``exp(wh)``.
        classes: Number of classes.

    Returns:
        Tuple ``(bbox, objectness, class_probs, pred_box)`` where ``bbox``
        is ordered ``[y1, x1, y2, x2]`` and ``pred_box`` concatenates the
        sigmoid xy offsets with the raw wh values.
    """
    grid_size = tf.shape(pred)[1]
    raw_xy, raw_wh, raw_obj, raw_cls = tf.split(pred, (2, 2, 1, classes), axis=-1)

    xy_offset = tf.sigmoid(raw_xy)
    objectness = tf.sigmoid(raw_obj)
    class_probs = tf.sigmoid(raw_cls)
    pred_box = tf.concat((xy_offset, raw_wh), axis=-1)

    # Per-cell (x, y) indices, with an extra axis to broadcast over anchors.
    cells = tf.range(grid_size)
    grid = tf.stack(tf.meshgrid(cells, cells), axis=-1)
    grid = tf.expand_dims(grid, axis=2)

    centre = (xy_offset + tf.cast(grid, tf.float32)) / tf.cast(grid_size, tf.float32)
    extent = tf.exp(raw_wh) * anchors
    corner_min = centre - extent / 2
    corner_max = centre + extent / 2

    x1, y1 = corner_min[..., 0:1], corner_min[..., 1:2]
    x2, y2 = corner_max[..., 0:1], corner_max[..., 1:2]
    bbox = tf.concat([y1, x1, y2, x2], axis=-1)
    return bbox, objectness, class_probs, pred_box
def yolo_nms(outputs, anchors, masks, classes,
             max_boxes=100, iou_threshold=0.5, score_threshold=0.5):
    """Merge the per-scale predictions and run non-maximum suppression.

    Args:
        outputs: Iterable of ``(bbox, objectness, class_probs)`` triples,
            one per detection scale.
        anchors: Unused; accepted so existing callers keep working.
        masks: Unused; accepted so existing callers keep working.
        classes: Unused; accepted so existing callers keep working.
        max_boxes: Maximum detections kept per class and in total.
        iou_threshold: Overlap above which a lower-scoring box is suppressed.
        score_threshold: Minimum ``objectness * class probability`` to keep.

    Returns:
        ``(boxes, scores, classes, valid_detections)`` as returned by
        ``tf.image.combined_non_max_suppression``.
    """
    def _flatten(t):
        # Collapse every axis between batch and the last into one.
        dims = tf.shape(t)
        return tf.reshape(t, (dims[0], -1, dims[-1]))

    box_parts, conf_parts, prob_parts = [], [], []
    for o in outputs:
        box_parts.append(_flatten(o[0]))
        conf_parts.append(_flatten(o[1]))
        prob_parts.append(_flatten(o[2]))

    bbox = tf.concat(box_parts, axis=1)
    confidence = tf.concat(conf_parts, axis=1)
    class_probs = tf.concat(prob_parts, axis=1)

    # Already (batch, boxes, classes) after broadcasting, so no reshape needed.
    scores = confidence * class_probs

    # Distinct local names avoid shadowing the ``classes`` parameter.
    nms_boxes, nms_scores, nms_classes, valid_detections = tf.image.combined_non_max_suppression(
        # A size-1 third axis means one box shared by all classes.
        boxes=tf.reshape(bbox, (tf.shape(bbox)[0], -1, 1, 4)),
        scores=scores,
        max_output_size_per_class=max_boxes,
        max_total_size=max_boxes,
        iou_threshold=iou_threshold,
        score_threshold=score_threshold,
    )
    return nms_boxes, nms_scores, nms_classes, valid_detections
Step 10: Build Full YOLOv3 Model
- Builds the full YOLOv3 network by passing the input through the Darknet backbone and collecting three feature maps.
- Applies YoloConv and YoloOutput on each feature scale to generate multi-scale predictions.
- In training mode, returns the three raw detection outputs for loss computation.
- In inference mode, converts outputs to bounding boxes and applies NMS to return final detections.
def YoloV3(size=None, channels=3, anchors=yolo_anchors, masks=yolo_anchor_masks, classes=80, training=False):
    """Assemble the full YOLOv3 model.

    Args:
        size: Height and width of the square input (``None`` for dynamic).
        channels: Number of input channels.
        anchors: Anchor table indexed by ``masks``.
        masks: One row of anchor indices per detection output.
        classes: Number of classes.
        training: When true, the model returns the three raw outputs;
            otherwise they are decoded and passed through NMS.

    Returns:
        A Keras ``Model`` named 'yolov3'.
    """
    inputs = Input([size, size, channels])
    x_36, x_61, x = Darknet(name='yolo_darknet')(inputs)

    # Scale 0: deepest feature map only.
    x = YoloConv(x, 512, name='yolo_conv_0')
    output_0 = YoloOutput(x, 512, len(masks[0]), classes, name='yolo_output_0')

    # Scale 1: upsampled features merged with the x_61 skip connection.
    x = YoloConv((x, x_61), 256, name='yolo_conv_1')
    output_1 = YoloOutput(x, 256, len(masks[1]), classes, name='yolo_output_1')

    # Scale 2: upsampled features merged with the x_36 skip connection.
    x = YoloConv((x, x_36), 128, name='yolo_conv_2')
    output_2 = YoloOutput(x, 128, len(masks[2]), classes, name='yolo_output_2')

    raw_outputs = (output_0, output_1, output_2)
    if training:
        return Model(inputs, raw_outputs, name='yolov3')

    boxes_0 = Lambda(lambda t: yolo_boxes(t, anchors[masks[0]], classes))(output_0)
    boxes_1 = Lambda(lambda t: yolo_boxes(t, anchors[masks[1]], classes))(output_1)
    boxes_2 = Lambda(lambda t: yolo_boxes(t, anchors[masks[2]], classes))(output_2)

    # Only (bbox, objectness, class_probs) of each scale go into NMS.
    nms_inputs = (boxes_0[:3], boxes_1[:3], boxes_2[:3])
    outputs = Lambda(lambda t: yolo_nms(t, anchors, masks, classes))(nms_inputs)
    return Model(inputs, outputs, name='yolov3')
Step 11: Instantiate Model and Load Weights
- Creates a YOLOv3 model instance with input size 416×416 and 80 classes (COCO dataset).
- Downloads pretrained Darknet weights from the official YOLO website.
- Converts Darknet .weights into Keras-compatible format using load_darknet_weights().
- Calling summary() prints the full YOLOv3 layer architecture.
# Build the inference-mode model (training defaults to False) for 416x416
# inputs and 80 classes, then print its layer summary.
yolo = YoloV3(size=416, classes=80)
yolo.summary()
# NOTE: the leading `!` is a notebook shell escape, not Python syntax; when
# running outside a notebook, download the file to this path by other means.
!wget https://data.pjreddie.com/files/yolov3.weights -O /content/yolov3.weights
# Copy the downloaded Darknet weights into the Keras model in place.
load_darknet_weights(yolo, '/content/yolov3.weights')
Output:

Step 12: Predict and Draw Bounding Boxes
- Loads and preprocesses an input image: resizes it to 416×416 and normalizes the pixel values.
- Runs YOLOv3 model to obtain bounding boxes, objectness scores and class predictions.
- Converts coordinates from normalized values to actual pixel positions and draws rectangles.
- Displays the final annotated image using Matplotlib when visualize is True.
def predict_and_draw(image_file, model, class_names, input_size=416, visualize=True, figsize=(12, 12)):
    """Run detection on an image file and draw the resulting boxes.

    Args:
        image_file: Path of the image to process.
        model: Model whose ``predict`` returns
            ``(boxes, scores, classes, nums)``, with boxes ordered
            ``[y1, x1, y2, x2]`` in normalized coordinates.
        class_names: Labels indexed by predicted class id.
        input_size: Side length the image is resized to for the model.
        visualize: When true, show the annotated image with Matplotlib.
        figsize: Matplotlib figure size used when ``visualize`` is true.

    Returns:
        The ``(boxes, scores, classes, nums)`` tuple from ``model.predict``.
    """
    # Read through a context manager so the file handle is always released.
    with open(image_file, 'rb') as image_handle:
        encoded = image_handle.read()
    img_raw = tf.image.decode_image(encoded, channels=3)
    img = tf.expand_dims(img_raw, 0)
    img = transform_images(img, input_size)
    boxes, scores, classes, nums = model.predict(img)

    # Boxes are drawn on the original-resolution image, not the resized one.
    img_bgr = cv2.imread(image_file)
    img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
    ih, iw = img_rgb.shape[:2]

    num = int(nums[0])
    for i in range(num):
        y1, x1, y2, x2 = boxes[0][i]
        # Scale normalized coordinates to pixels, then clamp to the image.
        x1, x2 = int(x1 * iw), int(x2 * iw)
        y1, y2 = int(y1 * ih), int(y2 * ih)
        x1, x2 = max(0, x1), min(iw - 1, x2)
        y1, y2 = max(0, y1), min(ih - 1, y2)
        cv2.rectangle(img_rgb, (x1, y1), (x2, y2), (255, 0, 0), 2)

        cls = int(classes[0][i])
        score = scores[0][i]
        label = f"{class_names[cls]}: {score:.2f}"
        # Keep the text baseline inside the image when the box touches the
        # top edge; otherwise the label would be drawn off-canvas.
        label_y = max(y1 - 10, 12)
        cv2.putText(img_rgb, label, (x1, label_y),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)

    if visualize:
        plt.figure(figsize=figsize)
        plt.imshow(img_rgb)
        plt.axis('off')
        plt.show()
    return boxes, scores, classes, nums
Step 13: Run Final Prediction on a New Image
- Here we perform object detection on a new input image using the YOLOv3 model that has already been loaded and configured.
- The function predict_and_draw() handles preprocessing, prediction and drawing bounding boxes.
image_file = '/content/img.png'
# predict_and_draw() requires the model and the label list as its second and
# third arguments; calling it with the path alone raises a TypeError.
predict_and_draw(image_file, yolo, class_names)
Output:
You can download full code from here