| 
 | 1 | +"""  | 
 | 2 | +Yolo V1 by tensorflow  | 
 | 3 | +"""  | 
 | 4 | + | 
 | 5 | +import numpy as np  | 
 | 6 | +import tensorflow as tf  | 
 | 7 | +import cv2  | 
 | 8 | + | 
 | 9 | + | 
 | 10 | +def leak_relu(x, alpha=0.1):  | 
 | 11 | +    return tf.maximum(alpha * x, x)  | 
 | 12 | + | 
 | 13 | +class Yolo(object):  | 
 | 14 | +    def __init__(self, weights_file, verbose=True):  | 
 | 15 | +        self.verbose = verbose  | 
 | 16 | +        # detection params  | 
 | 17 | +        self.S = 7  # cell size  | 
 | 18 | +        self.B = 2  # boxes_per_cell  | 
 | 19 | +        self.classes = ["aeroplane", "bicycle", "bird", "boat", "bottle",  | 
 | 20 | +                        "bus", "car", "cat", "chair", "cow", "diningtable",  | 
 | 21 | +                        "dog", "horse", "motorbike", "person", "pottedplant",  | 
 | 22 | +                        "sheep", "sofa", "train","tvmonitor"]  | 
 | 23 | +        self.C = len(self.classes) # number of classes  | 
 | 24 | +        # offset for box center (top left point of each cell)  | 
 | 25 | +        self.x_offset = np.transpose(np.reshape(np.array([np.arange(self.S)]*self.S*self.B),  | 
 | 26 | +                                              [self.B, self.S, self.S]), [1, 2, 0])  | 
 | 27 | +        self.y_offset = np.transpose(self.x_offset, [1, 0, 2])  | 
 | 28 | + | 
 | 29 | +        self.threshold = 0.2  # confidence scores threhold  | 
 | 30 | +        self.iou_threshold = 0.4  | 
 | 31 | +        #  the maximum number of boxes to be selected by non max suppression  | 
 | 32 | +        self.max_output_size = 10  | 
 | 33 | + | 
 | 34 | +        self.sess = tf.Session()  | 
 | 35 | +        self._build_net()  | 
 | 36 | +        self._build_detector()  | 
 | 37 | +        self._load_weights(weights_file)  | 
 | 38 | + | 
 | 39 | +    def _build_net(self):  | 
 | 40 | +        """build the network"""  | 
 | 41 | +        if self.verbose:  | 
 | 42 | +            print("Start to build the network ...")  | 
 | 43 | +        self.images = tf.placeholder(tf.float32, [None, 448, 448, 3])  | 
 | 44 | +        net = self._conv_layer(self.images, 1, 64, 7, 2)  | 
 | 45 | +        net = self._maxpool_layer(net, 1, 2, 2)  | 
 | 46 | +        net = self._conv_layer(net, 2, 192, 3, 1)  | 
 | 47 | +        net = self._maxpool_layer(net, 2, 2, 2)  | 
 | 48 | +        net = self._conv_layer(net, 3, 128, 1, 1)  | 
 | 49 | +        net = self._conv_layer(net, 4, 256, 3, 1)  | 
 | 50 | +        net = self._conv_layer(net, 5, 256, 1, 1)  | 
 | 51 | +        net = self._conv_layer(net, 6, 512, 3, 1)  | 
 | 52 | +        net = self._maxpool_layer(net, 6, 2, 2)  | 
 | 53 | +        net = self._conv_layer(net, 7, 256, 1, 1)  | 
 | 54 | +        net = self._conv_layer(net, 8, 512, 3, 1)  | 
 | 55 | +        net = self._conv_layer(net, 9, 256, 1, 1)  | 
 | 56 | +        net = self._conv_layer(net, 10, 512, 3, 1)  | 
 | 57 | +        net = self._conv_layer(net, 11, 256, 1, 1)  | 
 | 58 | +        net = self._conv_layer(net, 12, 512, 3, 1)  | 
 | 59 | +        net = self._conv_layer(net, 13, 256, 1, 1)  | 
 | 60 | +        net = self._conv_layer(net, 14, 512, 3, 1)  | 
 | 61 | +        net = self._conv_layer(net, 15, 512, 1, 1)  | 
 | 62 | +        net = self._conv_layer(net, 16, 1024, 3, 1)  | 
 | 63 | +        net = self._maxpool_layer(net, 16, 2, 2)  | 
 | 64 | +        net = self._conv_layer(net, 17, 512, 1, 1)  | 
 | 65 | +        net = self._conv_layer(net, 18, 1024, 3, 1)  | 
 | 66 | +        net = self._conv_layer(net, 19, 512, 1, 1)  | 
 | 67 | +        net = self._conv_layer(net, 20, 1024, 3, 1)  | 
 | 68 | +        net = self._conv_layer(net, 21, 1024, 3, 1)  | 
 | 69 | +        net = self._conv_layer(net, 22, 1024, 3, 2)  | 
 | 70 | +        net = self._conv_layer(net, 23, 1024, 3, 1)  | 
 | 71 | +        net = self._conv_layer(net, 24, 1024, 3, 1)  | 
 | 72 | +        net = self._flatten(net)  | 
 | 73 | +        net = self._fc_layer(net, 25, 512, activation=leak_relu)  | 
 | 74 | +        net = self._fc_layer(net, 26, 4096, activation=leak_relu)  | 
 | 75 | +        net = self._fc_layer(net, 27, self.S*self.S*(self.C+5*self.B))  | 
 | 76 | +        self.predicts = net  | 
 | 77 | + | 
 | 78 | +    def _build_detector(self):  | 
 | 79 | +        """Interpret the net output and get the predicted boxes"""  | 
 | 80 | +        # the width and height of orignal image  | 
 | 81 | +        self.width = tf.placeholder(tf.float32, name="img_w")  | 
 | 82 | +        self.height = tf.placeholder(tf.float32, name="img_h")  | 
 | 83 | +        # get class prob, confidence, boxes from net output  | 
 | 84 | +        idx1 = self.S * self.S * self.C  | 
 | 85 | +        idx2 = idx1 + self.S * self.S * self.B  | 
 | 86 | +        # class prediction  | 
 | 87 | +        class_probs = tf.reshape(self.predicts[0, :idx1], [self.S, self.S, self.C])  | 
 | 88 | +        # confidence  | 
 | 89 | +        confs = tf.reshape(self.predicts[0, idx1:idx2], [self.S, self.S, self.B])  | 
 | 90 | +        # boxes -> (x, y, w, h)  | 
 | 91 | +        boxes = tf.reshape(self.predicts[0, idx2:], [self.S, self.S, self.B, 4])  | 
 | 92 | + | 
 | 93 | +        # convert the x, y to the coordinates relative to the top left point of the image  | 
 | 94 | +        # the predictions of w, h are the square root  | 
 | 95 | +        # multiply the width and height of image  | 
 | 96 | +        boxes = tf.stack([(boxes[:, :, :, 0] + tf.constant(self.x_offset, dtype=tf.float32)) / self.S * self.width,  | 
 | 97 | +                          (boxes[:, :, :, 1] + tf.constant(self.y_offset, dtype=tf.float32)) / self.S * self.height,  | 
 | 98 | +                          tf.square(boxes[:, :, :, 2]) * self.width,  | 
 | 99 | +                          tf.square(boxes[:, :, :, 3]) * self.height], axis=3)  | 
 | 100 | + | 
 | 101 | +        # class-specific confidence scores [S, S, B, C]  | 
 | 102 | +        scores = tf.expand_dims(confs, -1) * tf.expand_dims(class_probs, 2)  | 
 | 103 | + | 
 | 104 | +        scores = tf.reshape(scores, [-1, self.C])  # [S*S*B, C]  | 
 | 105 | +        boxes = tf.reshape(boxes, [-1, 4])  # [S*S*B, 4]  | 
 | 106 | + | 
 | 107 | +        # find each box class, only select the max score  | 
 | 108 | +        box_classes = tf.argmax(scores, axis=1)  | 
 | 109 | +        box_class_scores = tf.reduce_max(scores, axis=1)  | 
 | 110 | + | 
 | 111 | +        # filter the boxes by the score threshold  | 
 | 112 | +        filter_mask = box_class_scores >= self.threshold  | 
 | 113 | +        scores = tf.boolean_mask(box_class_scores, filter_mask)  | 
 | 114 | +        boxes = tf.boolean_mask(boxes, filter_mask)  | 
 | 115 | +        box_classes = tf.boolean_mask(box_classes, filter_mask)  | 
 | 116 | + | 
 | 117 | +        # non max suppression (do not distinguish different classes)  | 
 | 118 | +        # ref: https://tensorflow.google.cn/api_docs/python/tf/image/non_max_suppression  | 
 | 119 | +        # box (x, y, w, h) -> box (x1, y1, x2, y2)  | 
 | 120 | +        _boxes = tf.stack([boxes[:, 0] - 0.5 * boxes[:, 2], boxes[:, 1] - 0.5 * boxes[:, 3],  | 
 | 121 | +                           boxes[:, 0] + 0.5 * boxes[:, 2], boxes[:, 1] + 0.5 * boxes[:, 3]], axis=1)  | 
 | 122 | +        nms_indices = tf.image.non_max_suppression(_boxes, scores,  | 
 | 123 | +                                                   self.max_output_size, self.iou_threshold)  | 
 | 124 | +        self.scores = tf.gather(scores, nms_indices)  | 
 | 125 | +        self.boxes = tf.gather(boxes, nms_indices)  | 
 | 126 | +        self.box_classes = tf.gather(box_classes, nms_indices)  | 
 | 127 | + | 
 | 128 | +    def _conv_layer(self, x, id, num_filters, filter_size, stride):  | 
 | 129 | +        """Conv layer"""  | 
 | 130 | +        in_channels = x.get_shape().as_list()[-1]  | 
 | 131 | +        weight = tf.Variable(tf.truncated_normal([filter_size, filter_size,  | 
 | 132 | +                                                  in_channels, num_filters], stddev=0.1))  | 
 | 133 | +        bias = tf.Variable(tf.zeros([num_filters,]))  | 
 | 134 | +        # padding, note: not using padding="VALID"  | 
 | 135 | +        pad_size = filter_size // 2  | 
 | 136 | +        pad_mat = np.array([[0, 0], [pad_size, pad_size], [pad_size, pad_size], [0, 0]])  | 
 | 137 | +        x_pad = tf.pad(x, pad_mat)  | 
 | 138 | +        conv = tf.nn.conv2d(x_pad, weight, strides=[1, stride, stride, 1], padding="VALID")  | 
 | 139 | +        output = leak_relu(tf.nn.bias_add(conv, bias))  | 
 | 140 | +        if self.verbose:  | 
 | 141 | +            print("    Layer %d: type=Conv, num_filter=%d, filter_size=%d, stride=%d, output_shape=%s" \  | 
 | 142 | +                  % (id, num_filters, filter_size, stride, str(output.get_shape())))  | 
 | 143 | +        return output  | 
 | 144 | + | 
 | 145 | +    def _fc_layer(self, x, id, num_out, activation=None):  | 
 | 146 | +        """fully connected layer"""  | 
 | 147 | +        num_in = x.get_shape().as_list()[-1]  | 
 | 148 | +        weight = tf.Variable(tf.truncated_normal([num_in, num_out], stddev=0.1))  | 
 | 149 | +        bias = tf.Variable(tf.zeros([num_out,]))  | 
 | 150 | +        output = tf.nn.xw_plus_b(x, weight, bias)  | 
 | 151 | +        if activation:  | 
 | 152 | +            output = activation(output)  | 
 | 153 | +        if self.verbose:  | 
 | 154 | +            print("    Layer %d: type=Fc, num_out=%d, output_shape=%s" \  | 
 | 155 | +                  % (id, num_out, str(output.get_shape())))  | 
 | 156 | +        return output  | 
 | 157 | + | 
 | 158 | +    def _maxpool_layer(self, x, id, pool_size, stride):  | 
 | 159 | +        output = tf.nn.max_pool(x, [1, pool_size, pool_size, 1],  | 
 | 160 | +                                strides=[1, stride, stride, 1], padding="SAME")  | 
 | 161 | +        if self.verbose:  | 
 | 162 | +            print("    Layer %d: type=MaxPool, pool_size=%d, stride=%d, output_shape=%s" \  | 
 | 163 | +                  % (id, pool_size, stride, str(output.get_shape())))  | 
 | 164 | +        return output  | 
 | 165 | + | 
 | 166 | +    def _flatten(self, x):  | 
 | 167 | +        """flatten the x"""  | 
 | 168 | +        tran_x = tf.transpose(x, [0, 3, 1, 2])  # channle first mode  | 
 | 169 | +        nums = np.product(x.get_shape().as_list()[1:])  | 
 | 170 | +        return tf.reshape(tran_x, [-1, nums])  | 
 | 171 | + | 
 | 172 | +    def _load_weights(self, weights_file):  | 
 | 173 | +        """Load weights from file"""  | 
 | 174 | +        if self.verbose:  | 
 | 175 | +            print("Start to load weights from file:%s" % (weights_file))  | 
 | 176 | +        saver = tf.train.Saver()  | 
 | 177 | +        saver.restore(self.sess, weights_file)  | 
 | 178 | + | 
 | 179 | +    def detect_from_file(self, image_file, deteted_boxes_file="boxes.txt",  | 
 | 180 | +                     detected_image_file="detected_image.jpg"):  | 
 | 181 | +        """Do detection given a image file"""  | 
 | 182 | +        # read image  | 
 | 183 | +        image = cv2.imread(image_file)  | 
 | 184 | +        img_h, img_w, _ = image.shape  | 
 | 185 | +        scores, boxes, box_classes = self._detect_from_image(image)  | 
 | 186 | +        predict_boxes = []  | 
 | 187 | +        for i in range(len(scores)):  | 
 | 188 | +            predict_boxes.append((self.classes[box_classes[i]], boxes[i, 0],  | 
 | 189 | +                                boxes[i, 1], boxes[i, 2], boxes[i, 3], scores[i]))  | 
 | 190 | +        self.show_results(image, predict_boxes, deteted_boxes_file, detected_image_file)  | 
 | 191 | + | 
 | 192 | +    def _detect_from_image(self, image):  | 
 | 193 | +        """Do detection given a cv image"""  | 
 | 194 | +        img_h, img_w, _ = image.shape  | 
 | 195 | +        img_resized = cv2.resize(image, (448, 448))  | 
 | 196 | +        img_RGB = cv2.cvtColor(img_resized, cv2.COLOR_BGR2RGB)  | 
 | 197 | +        img_resized_np = np.asarray(img_RGB)  | 
 | 198 | +        _images = np.zeros((1, 448, 448, 3), dtype=np.float32)  | 
 | 199 | +        _images[0] = (img_resized_np / 255.0) * 2.0 - 1.0  | 
 | 200 | +        scores, boxes, box_classes = self.sess.run([self.scores, self.boxes, self.box_classes],  | 
 | 201 | +                    feed_dict={self.images: _images, self.width: img_w, self.height: img_h})  | 
 | 202 | +        return scores, boxes, box_classes  | 
 | 203 | + | 
 | 204 | +    def show_results(self, image, results, imshow=True, deteted_boxes_file=None,  | 
 | 205 | +                     detected_image_file=None):  | 
 | 206 | +        """Show the detection boxes"""  | 
 | 207 | +        img_cp = image.copy()  | 
 | 208 | +        if deteted_boxes_file:  | 
 | 209 | +            f = open(deteted_boxes_file, "w")  | 
 | 210 | +        #  draw boxes  | 
 | 211 | +        for i in range(len(results)):  | 
 | 212 | +            x = int(results[i][1])  | 
 | 213 | +            y = int(results[i][2])  | 
 | 214 | +            w = int(results[i][3]) // 2  | 
 | 215 | +            h = int(results[i][4]) // 2  | 
 | 216 | +            if self.verbose:  | 
 | 217 | +                print("   class: %s, [x, y, w, h]=[%d, %d, %d, %d], confidence=%f" % (results[i][0],  | 
 | 218 | +                            x, y, w, h, results[i][-1]))  | 
 | 219 | + | 
 | 220 | +                cv2.rectangle(img_cp, (x - w, y - h), (x + w, y + h), (0, 255, 0), 2)  | 
 | 221 | +                cv2.rectangle(img_cp, (x - w, y - h - 20), (x + w, y - h), (125, 125, 125), -1)  | 
 | 222 | +                cv2.putText(img_cp, results[i][0] + ' : %.2f' % results[i][5], (x - w + 5, y - h - 7),  | 
 | 223 | +                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1)  | 
 | 224 | +            if deteted_boxes_file:  | 
 | 225 | +                f.write(results[i][0] + ',' + str(x) + ',' + str(y) + ',' +  | 
 | 226 | +                        str(w) + ',' + str(h)+',' + str(results[i][5]) + '\n')  | 
 | 227 | +        if imshow:  | 
 | 228 | +            cv2.imshow('YOLO_small detection', img_cp)  | 
 | 229 | +            cv2.waitKey(1)  | 
 | 230 | +        if detected_image_file:  | 
 | 231 | +            cv2.imwrite(detected_image_file, img_cp)  | 
 | 232 | +        if deteted_boxes_file:  | 
 | 233 | +            f.close()  | 
 | 234 | + | 
 | 235 | +if __name__ == "__main__":  | 
 | 236 | +    yolo_net = Yolo("./weights/YOLO_small.ckpt")  | 
 | 237 | +    yolo_net.detect_from_file("./test/car.jpg")  | 
0 commit comments