diff --git a/CNNs/MobileNet.py b/CNNs/MobileNet.py new file mode 100644 index 0000000..612a71b --- /dev/null +++ b/CNNs/MobileNet.py @@ -0,0 +1,169 @@ +""" +2017/11/24 ref:https://github.com/Zehaos/MobileNet/blob/master/nets/mobilenet.py +""" + +import tensorflow as tf +from tensorflow.python.training import moving_averages + +UPDATE_OPS_COLLECTION = "_update_ops_" + +# create variable +def create_variable(name, shape, initializer, + dtype=tf.float32, trainable=True): + return tf.get_variable(name, shape=shape, dtype=dtype, + initializer=initializer, trainable=trainable) + +# batchnorm layer +def bacthnorm(inputs, scope, epsilon=1e-05, momentum=0.99, is_training=True): + inputs_shape = inputs.get_shape().as_list() + params_shape = inputs_shape[-1:] + axis = list(range(len(inputs_shape) - 1)) + + with tf.variable_scope(scope): + beta = create_variable("beta", params_shape, + initializer=tf.zeros_initializer()) + gamma = create_variable("gamma", params_shape, + initializer=tf.ones_initializer()) + # for inference + moving_mean = create_variable("moving_mean", params_shape, + initializer=tf.zeros_initializer(), trainable=False) + moving_variance = create_variable("moving_variance", params_shape, + initializer=tf.ones_initializer(), trainable=False) + if is_training: + mean, variance = tf.nn.moments(inputs, axes=axis) + update_move_mean = moving_averages.assign_moving_average(moving_mean, + mean, decay=momentum) + update_move_variance = moving_averages.assign_moving_average(moving_variance, + variance, decay=momentum) + tf.add_to_collection(UPDATE_OPS_COLLECTION, update_move_mean) + tf.add_to_collection(UPDATE_OPS_COLLECTION, update_move_variance) + else: + mean, variance = moving_mean, moving_variance + return tf.nn.batch_normalization(inputs, mean, variance, beta, gamma, epsilon) + +# depthwise conv2d layer +def depthwise_conv2d(inputs, scope, filter_size=3, channel_multiplier=1, strides=1): + inputs_shape = inputs.get_shape().as_list() + in_channels = inputs_shape[-1] + with tf.variable_scope(scope): + filter = create_variable("filter", shape=[filter_size, filter_size, + in_channels, channel_multiplier], + initializer=tf.truncated_normal_initializer(stddev=0.01)) + + return tf.nn.depthwise_conv2d(inputs, filter, strides=[1, strides, strides, 1], + padding="SAME", rate=[1, 1]) + +# conv2d layer +def conv2d(inputs, scope, num_filters, filter_size=1, strides=1): + inputs_shape = inputs.get_shape().as_list() + in_channels = inputs_shape[-1] + with tf.variable_scope(scope): + filter = create_variable("filter", shape=[filter_size, filter_size, + in_channels, num_filters], + initializer=tf.truncated_normal_initializer(stddev=0.01)) + return tf.nn.conv2d(inputs, filter, strides=[1, strides, strides, 1], + padding="SAME") + +# avg pool layer +def avg_pool(inputs, pool_size, scope): + with tf.variable_scope(scope): + return tf.nn.avg_pool(inputs, [1, pool_size, pool_size, 1], + strides=[1, pool_size, pool_size, 1], padding="VALID") + +# fully connected layer +def fc(inputs, n_out, scope, use_bias=True): + inputs_shape = inputs.get_shape().as_list() + n_in = inputs_shape[-1] + with tf.variable_scope(scope): + weight = create_variable("weight", shape=[n_in, n_out], + initializer=tf.random_normal_initializer(stddev=0.01)) + if use_bias: + bias = create_variable("bias", shape=[n_out,], + initializer=tf.zeros_initializer()) + return tf.nn.xw_plus_b(inputs, weight, bias) + return tf.matmul(inputs, weight) + + +class MobileNet(object): + def __init__(self, inputs, num_classes=1000, is_training=True, + 
width_multiplier=1, scope="MobileNet"): + """ + The implement of MobileNet(ref:https://arxiv.org/abs/1704.04861) + :param inputs: 4-D Tensor of [batch_size, height, width, channels] + :param num_classes: number of classes + :param is_training: Boolean, whether or not the model is training + :param width_multiplier: float, controls the size of model + :param scope: Optional scope for variables + """ + self.inputs = inputs + self.num_classes = num_classes + self.is_training = is_training + self.width_multiplier = width_multiplier + + # construct model + with tf.variable_scope(scope): + # conv1 + net = conv2d(inputs, "conv_1", round(32 * width_multiplier), filter_size=3, + strides=2) # ->[N, 112, 112, 32] + net = tf.nn.relu(bacthnorm(net, "conv_1/bn", is_training=self.is_training)) + net = self._depthwise_separable_conv2d(net, 64, self.width_multiplier, + "ds_conv_2") # ->[N, 112, 112, 64] + net = self._depthwise_separable_conv2d(net, 128, self.width_multiplier, + "ds_conv_3", downsample=True) # ->[N, 56, 56, 128] + net = self._depthwise_separable_conv2d(net, 128, self.width_multiplier, + "ds_conv_4") # ->[N, 56, 56, 128] + net = self._depthwise_separable_conv2d(net, 256, self.width_multiplier, + "ds_conv_5", downsample=True) # ->[N, 28, 28, 256] + net = self._depthwise_separable_conv2d(net, 256, self.width_multiplier, + "ds_conv_6") # ->[N, 28, 28, 256] + net = self._depthwise_separable_conv2d(net, 512, self.width_multiplier, + "ds_conv_7", downsample=True) # ->[N, 14, 14, 512] + net = self._depthwise_separable_conv2d(net, 512, self.width_multiplier, + "ds_conv_8") # ->[N, 14, 14, 512] + net = self._depthwise_separable_conv2d(net, 512, self.width_multiplier, + "ds_conv_9") # ->[N, 14, 14, 512] + net = self._depthwise_separable_conv2d(net, 512, self.width_multiplier, + "ds_conv_10") # ->[N, 14, 14, 512] + net = self._depthwise_separable_conv2d(net, 512, self.width_multiplier, + "ds_conv_11") # ->[N, 14, 14, 512] + net = self._depthwise_separable_conv2d(net, 512, self.width_multiplier, + "ds_conv_12") # ->[N, 14, 14, 512] + net = self._depthwise_separable_conv2d(net, 1024, self.width_multiplier, + "ds_conv_13", downsample=True) # ->[N, 7, 7, 1024] + net = self._depthwise_separable_conv2d(net, 1024, self.width_multiplier, + "ds_conv_14") # ->[N, 7, 7, 1024] + net = avg_pool(net, 7, "avg_pool_15") + net = tf.squeeze(net, [1, 2], name="SpatialSqueeze") + self.logits = fc(net, self.num_classes, "fc_16") + self.predictions = tf.nn.softmax(self.logits) + + def _depthwise_separable_conv2d(self, inputs, num_filters, width_multiplier, + scope, downsample=False): + """depthwise separable convolution 2D function""" + num_filters = round(num_filters * width_multiplier) + strides = 2 if downsample else 1 + + with tf.variable_scope(scope): + # depthwise conv2d + dw_conv = depthwise_conv2d(inputs, "depthwise_conv", strides=strides) + # batchnorm + bn = bacthnorm(dw_conv, "dw_bn", is_training=self.is_training) + # relu + relu = tf.nn.relu(bn) + # pointwise conv2d (1x1) + pw_conv = conv2d(relu, "pointwise_conv", num_filters) + # bn + bn = bacthnorm(pw_conv, "pw_bn", is_training=self.is_training) + return tf.nn.relu(bn) + +if __name__ == "__main__": + # test data + inputs = tf.random_normal(shape=[4, 224, 224, 3]) + mobileNet = MobileNet(inputs) + writer = tf.summary.FileWriter("./logs", graph=tf.get_default_graph()) + init = tf.global_variables_initializer() + with tf.Session() as sess: + sess.run(init) + pred = sess.run(mobileNet.predictions) + print(pred.shape) + diff --git a/CNNs/ResNet50.py 
b/CNNs/ResNet50.py new file mode 100644 index 0000000..5f7d5d5 --- /dev/null +++ b/CNNs/ResNet50.py @@ -0,0 +1,143 @@ +""" +ResNet50 +2017/12/06 +""" + +import tensorflow as tf +from tensorflow.python.training import moving_averages + +fc_initializer = tf.contrib.layers.xavier_initializer +conv2d_initializer = tf.contrib.layers.xavier_initializer_conv2d + +# create weight variable +def create_var(name, shape, initializer, trainable=True): + return tf.get_variable(name, shape=shape, dtype=tf.float32, + initializer=initializer, trainable=trainable) + +# conv2d layer +def conv2d(x, num_outputs, kernel_size, stride=1, scope="conv2d"): + num_inputs = x.get_shape()[-1] + with tf.variable_scope(scope): + kernel = create_var("kernel", [kernel_size, kernel_size, + num_inputs, num_outputs], + conv2d_initializer()) + return tf.nn.conv2d(x, kernel, strides=[1, stride, stride, 1], + padding="SAME") + +# fully connected layer +def fc(x, num_outputs, scope="fc"): + num_inputs = x.get_shape()[-1] + with tf.variable_scope(scope): + weight = create_var("weight", [num_inputs, num_outputs], + fc_initializer()) + bias = create_var("bias", [num_outputs,], + tf.zeros_initializer()) + return tf.nn.xw_plus_b(x, weight, bias) + + +# batch norm layer +def batch_norm(x, decay=0.999, epsilon=1e-03, is_training=True, + scope="scope"): + x_shape = x.get_shape() + num_inputs = x_shape[-1] + reduce_dims = list(range(len(x_shape) - 1)) + with tf.variable_scope(scope): + beta = create_var("beta", [num_inputs,], + initializer=tf.zeros_initializer()) + gamma = create_var("gamma", [num_inputs,], + initializer=tf.ones_initializer()) + # for inference + moving_mean = create_var("moving_mean", [num_inputs,], + initializer=tf.zeros_initializer(), + trainable=False) + moving_variance = create_var("moving_variance", [num_inputs], + initializer=tf.ones_initializer(), + trainable=False) + if is_training: + mean, variance = tf.nn.moments(x, axes=reduce_dims) + update_move_mean = moving_averages.assign_moving_average(moving_mean, + mean, decay=decay) + update_move_variance = moving_averages.assign_moving_average(moving_variance, + variance, decay=decay) + tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_move_mean) + tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_move_variance) + else: + mean, variance = moving_mean, moving_variance + return tf.nn.batch_normalization(x, mean, variance, beta, gamma, epsilon) + + +# avg pool layer +def avg_pool(x, pool_size, scope): + with tf.variable_scope(scope): + return tf.nn.avg_pool(x, [1, pool_size, pool_size, 1], + strides=[1, pool_size, pool_size, 1], padding="VALID") + +# max pool layer +def max_pool(x, pool_size, stride, scope): + with tf.variable_scope(scope): + return tf.nn.max_pool(x, [1, pool_size, pool_size, 1], + [1, stride, stride, 1], padding="SAME") + +class ResNet50(object): + def __init__(self, inputs, num_classes=1000, is_training=True, + scope="resnet50"): + self.inputs =inputs + self.is_training = is_training + self.num_classes = num_classes + + with tf.variable_scope(scope): + # construct the model + net = conv2d(inputs, 64, 7, 2, scope="conv1") # -> [batch, 112, 112, 64] + net = tf.nn.relu(batch_norm(net, is_training=self.is_training, scope="bn1")) + net = max_pool(net, 3, 2, scope="maxpool1") # -> [batch, 56, 56, 64] + net = self._block(net, 256, 3, init_stride=1, is_training=self.is_training, + scope="block2") # -> [batch, 56, 56, 256] + net = self._block(net, 512, 4, is_training=self.is_training, scope="block3") + # -> [batch, 28, 28, 512] + net = self._block(net, 
1024, 6, is_training=self.is_training, scope="block4") + # -> [batch, 14, 14, 1024] + net = self._block(net, 2048, 3, is_training=self.is_training, scope="block5") + # -> [batch, 7, 7, 2048] + net = avg_pool(net, 7, scope="avgpool5") # -> [batch, 1, 1, 2048] + net = tf.squeeze(net, [1, 2], name="SpatialSqueeze") # -> [batch, 2048] + self.logits = fc(net, self.num_classes, "fc6") # -> [batch, num_classes] + self.predictions = tf.nn.softmax(self.logits) + + + def _block(self, x, n_out, n, init_stride=2, is_training=True, scope="block"): + with tf.variable_scope(scope): + h_out = n_out // 4 + out = self._bottleneck(x, h_out, n_out, stride=init_stride, + is_training=is_training, scope="bottlencek1") + for i in range(1, n): + out = self._bottleneck(out, h_out, n_out, is_training=is_training, + scope=("bottlencek%s" % (i + 1))) + return out + + def _bottleneck(self, x, h_out, n_out, stride=None, is_training=True, scope="bottleneck"): + """ A residual bottleneck unit""" + n_in = x.get_shape()[-1] + if stride is None: + stride = 1 if n_in == n_out else 2 + + with tf.variable_scope(scope): + h = conv2d(x, h_out, 1, stride=stride, scope="conv_1") + h = batch_norm(h, is_training=is_training, scope="bn_1") + h = tf.nn.relu(h) + h = conv2d(h, h_out, 3, stride=1, scope="conv_2") + h = batch_norm(h, is_training=is_training, scope="bn_2") + h = tf.nn.relu(h) + h = conv2d(h, n_out, 1, stride=1, scope="conv_3") + h = batch_norm(h, is_training=is_training, scope="bn_3") + + if n_in != n_out: + shortcut = conv2d(x, n_out, 1, stride=stride, scope="conv_4") + shortcut = batch_norm(shortcut, is_training=is_training, scope="bn_4") + else: + shortcut = x + return tf.nn.relu(shortcut + h) + +if __name__ == "__main__": + x = tf.random_normal([32, 224, 224, 3]) + resnet50 = ResNet50(x) + print(resnet50.logits) \ No newline at end of file diff --git a/CNNs/ShuffleNet.py b/CNNs/ShuffleNet.py new file mode 100644 index 0000000..50c4fc8 --- /dev/null +++ b/CNNs/ShuffleNet.py @@ -0,0 +1,122 @@ +""" +implement a shuffleNet by pytorch +""" +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import Variable + +dtype = torch.FloatTensor + +def shuffle_channels(x, groups): + """shuffle channels of a 4-D Tensor""" + batch_size, channels, height, width = x.size() + assert channels % groups == 0 + channels_per_group = channels // groups + # split into groups + x = x.view(batch_size, groups, channels_per_group, + height, width) + # transpose 1, 2 axis + x = x.transpose(1, 2).contiguous() + # reshape into orignal + x = x.view(batch_size, channels, height, width) + return x + +class ShuffleNetUnitA(nn.Module): + """ShuffleNet unit for stride=1""" + def __init__(self, in_channels, out_channels, groups=3): + super(ShuffleNetUnitA, self).__init__() + assert in_channels == out_channels + assert out_channels % 4 == 0 + bottleneck_channels = out_channels // 4 + self.groups = groups + self.group_conv1 = nn.Conv2d(in_channels, bottleneck_channels, + 1, groups=groups, stride=1) + self.bn2 = nn.BatchNorm2d(bottleneck_channels) + self.depthwise_conv3 = nn.Conv2d(bottleneck_channels, + bottleneck_channels, + 3, padding=1, stride=1, + groups=bottleneck_channels) + self.bn4 = nn.BatchNorm2d(bottleneck_channels) + self.group_conv5 = nn.Conv2d(bottleneck_channels, out_channels, + 1, stride=1, groups=groups) + self.bn6 = nn.BatchNorm2d(out_channels) + + def forward(self, x): + out = self.group_conv1(x) + out = F.relu(self.bn2(out)) + out = shuffle_channels(out, groups=self.groups) + out = 
self.depthwise_conv3(out) + out = self.bn4(out) + out = self.group_conv5(out) + out = self.bn6(out) + out = F.relu(x + out) + return out + +class ShuffleNetUnitB(nn.Module): + """ShuffleNet unit for stride=2""" + def __init__(self, in_channels, out_channels, groups=3): + super(ShuffleNetUnitB, self).__init__() + out_channels -= in_channels + assert out_channels % 4 == 0 + bottleneck_channels = out_channels // 4 + self.groups = groups + self.group_conv1 = nn.Conv2d(in_channels, bottleneck_channels, + 1, groups=groups, stride=1) + self.bn2 = nn.BatchNorm2d(bottleneck_channels) + self.depthwise_conv3 = nn.Conv2d(bottleneck_channels, + bottleneck_channels, + 3, padding=1, stride=2, + groups=bottleneck_channels) + self.bn4 = nn.BatchNorm2d(bottleneck_channels) + self.group_conv5 = nn.Conv2d(bottleneck_channels, out_channels, + 1, stride=1, groups=groups) + self.bn6 = nn.BatchNorm2d(out_channels) + + def forward(self, x): + out = self.group_conv1(x) + out = F.relu(self.bn2(out)) + out = shuffle_channels(out, groups=self.groups) + out = self.depthwise_conv3(out) + out = self.bn4(out) + out = self.group_conv5(out) + out = self.bn6(out) + x = F.avg_pool2d(x, 3, stride=2, padding=1) + out = F.relu(torch.cat([x, out], dim=1)) + return out + +class ShuffleNet(nn.Module): + """ShuffleNet for groups=3""" + def __init__(self, groups=3, in_channels=3, num_classes=1000): + super(ShuffleNet, self).__init__() + + self.conv1 = nn.Conv2d(in_channels, 24, 3, stride=2, padding=1) + stage2_seq = [ShuffleNetUnitB(24, 240, groups=3)] + \ + [ShuffleNetUnitA(240, 240, groups=3) for i in range(3)] + self.stage2 = nn.Sequential(*stage2_seq) + stage3_seq = [ShuffleNetUnitB(240, 480, groups=3)] + \ + [ShuffleNetUnitA(480, 480, groups=3) for i in range(7)] + self.stage3 = nn.Sequential(*stage3_seq) + stage4_seq = [ShuffleNetUnitB(480, 960, groups=3)] + \ + [ShuffleNetUnitA(960, 960, groups=3) for i in range(3)] + self.stage4 = nn.Sequential(*stage4_seq) + self.fc = nn.Linear(960, num_classes) + + def forward(self, x): + net = self.conv1(x) + net = F.max_pool2d(net, 3, stride=2, padding=1) + net = self.stage2(net) + net = self.stage3(net) + net = self.stage4(net) + net = F.avg_pool2d(net, 7) + net = net.view(net.size(0), -1) + net = self.fc(net) + logits = F.softmax(net) + return logits + +if __name__ == "__main__": + x = Variable(torch.randn([32, 3, 224, 224]).type(dtype), + requires_grad=False) + shuffleNet = ShuffleNet() + out = shuffleNet(x) + print(out.size()) diff --git a/CNNs/SqueezeNet.py b/CNNs/SqueezeNet.py new file mode 100644 index 0000000..47a5b09 --- /dev/null +++ b/CNNs/SqueezeNet.py @@ -0,0 +1,74 @@ +""" +2017/12/02 +""" + +import tensorflow as tf +import numpy as np + + +class SqueezeNet(object): + def __init__(self, inputs, nb_classes=1000, is_training=True): + # conv1 + net = tf.layers.conv2d(inputs, 96, [7, 7], strides=[2, 2], + padding="SAME", activation=tf.nn.relu, + name="conv1") + # maxpool1 + net = tf.layers.max_pooling2d(net, [3, 3], strides=[2, 2], + name="maxpool1") + # fire2 + net = self._fire(net, 16, 64, "fire2") + # fire3 + net = self._fire(net, 16, 64, "fire3") + # fire4 + net = self._fire(net, 32, 128, "fire4") + # maxpool4 + net = tf.layers.max_pooling2d(net, [3, 3], strides=[2, 2], + name="maxpool4") + # fire5 + net = self._fire(net, 32, 128, "fire5") + # fire6 + net = self._fire(net, 48, 192, "fire6") + # fire7 + net = self._fire(net, 48, 192, "fire7") + # fire8 + net = self._fire(net, 64, 256, "fire8") + # maxpool8 + net = tf.layers.max_pooling2d(net, [3, 3], strides=[2, 2], + 
name="maxpool8") + # fire9 + net = self._fire(net, 64, 256, "fire9") + # dropout + net = tf.layers.dropout(net, 0.5, training=is_training) + # conv10 + net = tf.layers.conv2d(net, 1000, [1, 1], strides=[1, 1], + padding="SAME", activation=tf.nn.relu, + name="conv10") + # avgpool10 + net = tf.layers.average_pooling2d(net, [13, 13], strides=[1, 1], + name="avgpool10") + # squeeze the axis + net = tf.squeeze(net, axis=[1, 2]) + + self.logits = net + self.prediction = tf.nn.softmax(net) + + + def _fire(self, inputs, squeeze_depth, expand_depth, scope): + with tf.variable_scope(scope): + squeeze = tf.layers.conv2d(inputs, squeeze_depth, [1, 1], + strides=[1, 1], padding="SAME", + activation=tf.nn.relu, name="squeeze") + # squeeze + expand_1x1 = tf.layers.conv2d(squeeze, expand_depth, [1, 1], + strides=[1, 1], padding="SAME", + activation=tf.nn.relu, name="expand_1x1") + expand_3x3 = tf.layers.conv2d(squeeze, expand_depth, [3, 3], + strides=[1, 1], padding="SAME", + activation=tf.nn.relu, name="expand_3x3") + return tf.concat([expand_1x1, expand_3x3], axis=3) + + +if __name__ == "__main__": + inputs = tf.random_normal([32, 224, 224, 3]) + net = SqueezeNet(inputs) + print(net.prediction) diff --git a/CNNs/densenet.py b/CNNs/densenet.py new file mode 100644 index 0000000..8f70424 --- /dev/null +++ b/CNNs/densenet.py @@ -0,0 +1,228 @@ +""" +DenseNet, original: https://github.com/pytorch/vision/blob/master/torchvision/models/densenet.py +""" +import re +from collections import OrderedDict + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.model_zoo as model_zoo +import torchvision.transforms as transforms + +from PIL import Image +import numpy as np + +model_urls = { + 'densenet121': 'https://download.pytorch.org/models/densenet121-a639ec97.pth', + 'densenet169': 'https://download.pytorch.org/models/densenet169-b2777c0a.pth', + 'densenet201': 'https://download.pytorch.org/models/densenet201-c1103571.pth', + 'densenet161': 'https://download.pytorch.org/models/densenet161-8d451a50.pth', +} + + +class _DenseLayer(nn.Sequential): + """Basic unit of DenseBlock (using bottleneck layer) """ + def __init__(self, num_input_features, growth_rate, bn_size, drop_rate): + super(_DenseLayer, self).__init__() + self.add_module("norm1", nn.BatchNorm2d(num_input_features)) + self.add_module("relu1", nn.ReLU(inplace=True)) + self.add_module("conv1", nn.Conv2d(num_input_features, bn_size*growth_rate, + kernel_size=1, stride=1, bias=False)) + self.add_module("norm2", nn.BatchNorm2d(bn_size*growth_rate)) + self.add_module("relu2", nn.ReLU(inplace=True)) + self.add_module("conv2", nn.Conv2d(bn_size*growth_rate, growth_rate, + kernel_size=3, stride=1, padding=1, bias=False)) + self.drop_rate = drop_rate + + def forward(self, x): + new_features = super(_DenseLayer, self).forward(x) + if self.drop_rate > 0: + new_features = F.dropout(new_features, p=self.drop_rate, training=self.training) + return torch.cat([x, new_features], 1) + +class _DenseBlock(nn.Sequential): + """DenseBlock""" + def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate): + super(_DenseBlock, self).__init__() + for i in range(num_layers): + layer = _DenseLayer(num_input_features+i*growth_rate, growth_rate, bn_size, + drop_rate) + self.add_module("denselayer%d" % (i+1,), layer) + + +class _Transition(nn.Sequential): + """Transition layer between two adjacent DenseBlock""" + def __init__(self, num_input_feature, num_output_features): + 
super(_Transition, self).__init__() + self.add_module("norm", nn.BatchNorm2d(num_input_feature)) + self.add_module("relu", nn.ReLU(inplace=True)) + self.add_module("conv", nn.Conv2d(num_input_feature, num_output_features, + kernel_size=1, stride=1, bias=False)) + self.add_module("pool", nn.AvgPool2d(2, stride=2)) + + +class DenseNet(nn.Module): + "DenseNet-BC model" + def __init__(self, growth_rate=32, block_config=(6, 12, 24, 16), num_init_features=64, + bn_size=4, compression_rate=0.5, drop_rate=0, num_classes=1000): + """ + :param growth_rate: (int) number of filters used in DenseLayer, `k` in the paper + :param block_config: (list of 4 ints) number of layers in each DenseBlock + :param num_init_features: (int) number of filters in the first Conv2d + :param bn_size: (int) the factor using in the bottleneck layer + :param compression_rate: (float) the compression rate used in Transition Layer + :param drop_rate: (float) the drop rate after each DenseLayer + :param num_classes: (int) number of classes for classification + """ + super(DenseNet, self).__init__() + # first Conv2d + self.features = nn.Sequential(OrderedDict([ + ("conv0", nn.Conv2d(3, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)), + ("norm0", nn.BatchNorm2d(num_init_features)), + ("relu0", nn.ReLU(inplace=True)), + ("pool0", nn.MaxPool2d(3, stride=2, padding=1)) + ])) + + # DenseBlock + num_features = num_init_features + for i, num_layers in enumerate(block_config): + block = _DenseBlock(num_layers, num_features, bn_size, growth_rate, drop_rate) + self.features.add_module("denseblock%d" % (i + 1), block) + num_features += num_layers*growth_rate + if i != len(block_config) - 1: + transition = _Transition(num_features, int(num_features*compression_rate)) + self.features.add_module("transition%d" % (i + 1), transition) + num_features = int(num_features * compression_rate) + + # final bn+ReLU + self.features.add_module("norm5", nn.BatchNorm2d(num_features)) + self.features.add_module("relu5", nn.ReLU(inplace=True)) + + # classification layer + self.classifier = nn.Linear(num_features, num_classes) + + # params initialization + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1) + elif isinstance(m, nn.Linear): + nn.init.constant_(m.bias, 0) + + def forward(self, x): + features = self.features(x) + out = F.avg_pool2d(features, 7, stride=1).view(features.size(0), -1) + out = self.classifier(out) + return out + +class DenseNet_MNIST(nn.Module): + """DenseNet for MNIST dataset""" + def __init__(self, growth_rate=12, block_config=(6, 6, 6), num_init_features=16, + bn_size=4, compression_rate=0.5, drop_rate=0, num_classes=10): + """ + :param growth_rate: (int) number of filters used in DenseLayer, `k` in the paper + :param block_config: (list of 2 ints) number of layers in each DenseBlock + :param num_init_features: (int) number of filters in the first Conv2d + :param bn_size: (int) the factor using in the bottleneck layer + :param compression_rate: (float) the compression rate used in Transition Layer + :param drop_rate: (float) the drop rate after each DenseLayer + :param num_classes: (int) number of classes for classification + """ + super(DenseNet_MNIST, self).__init__() + # first Conv2d + self.features = nn.Sequential(OrderedDict([ + ("conv0", nn.Conv2d(1, num_init_features, kernel_size=3, stride=1, padding=1, bias=False)), + ("norm0", nn.BatchNorm2d(num_init_features)), 
+ ("relu0", nn.ReLU(inplace=True)), + ])) + + # DenseBlock + num_features = num_init_features + for i, num_layers in enumerate(block_config): + block = _DenseBlock(num_layers, num_features, bn_size, growth_rate, drop_rate) + self.features.add_module("denseblock%d" % (i + 1), block) + num_features += num_layers * growth_rate + if i != len(block_config) - 1: + transition = _Transition(num_features, int(num_features * compression_rate)) + self.features.add_module("transition%d" % (i + 1), transition) + num_features = int(num_features * compression_rate) + + # final bn+ReLU + self.features.add_module("norm5", nn.BatchNorm2d(num_features)) + self.features.add_module("relu5", nn.ReLU(inplace=True)) + + # classification layer + self.classifier = nn.Linear(num_features, num_classes) + + # params initialization + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1) + elif isinstance(m, nn.Linear): + nn.init.constant_(m.bias, 0) + + def forward(self, x): + features = self.features(x) + out = F.avg_pool2d(features, 7, stride=1).view(features.size(0), -1) + out = self.classifier(out) + return out + + +def densenet121(pretrained=False, **kwargs): + """DenseNet121""" + model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 24, 16), + **kwargs) + + if pretrained: + # '.'s are no longer allowed in module names, but pervious _DenseLayer + # has keys 'norm.1', 'relu.1', 'conv.1', 'norm.2', 'relu.2', 'conv.2'. + # They are also in the checkpoints in model_urls. This pattern is used + # to find such keys. + pattern = re.compile( + r'^(.*denselayer\d+\.(?:norm|relu|conv))\.((?:[12])\.(?:weight|bias|running_mean|running_var))$') + state_dict = model_zoo.load_url(http://github.com/model_urls['densenet121']) + for key in list(state_dict.keys()): + res = pattern.match(key) + if res: + new_key = res.group(1) + res.group(2) + state_dict[new_key] = state_dict[key] + del state_dict[key] + model.load_state_dict(state_dict) + return model + +if __name__ == "__main__": + densenet = densenet121(pretrained=True) + densenet.eval() + + img = Image.open("./images/cat.jpg") + + trans_ops = transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + ]) + + images = trans_ops(img).view(-1, 3, 224, 224) + print(images) + outputs = densenet(images) + + _, predictions = outputs.topk(5, dim=1) + + labels = list(map(lambda s: s.strip(), open("./data/imagenet/synset_words.txt").readlines())) + for idx in predictions.numpy()[0]: + print("Predicted labels:", labels[idx]) + + + + + + + diff --git a/CNNs/mobilenet_v2.py b/CNNs/mobilenet_v2.py new file mode 100644 index 0000000..23d6c0f --- /dev/null +++ b/CNNs/mobilenet_v2.py @@ -0,0 +1,349 @@ +""" +2018-11-24 +""" + +from collections import namedtuple +import copy + +import tensorflow as tf + +slim = tf.contrib.slim + +def _make_divisible(v, divisor, min_value=None): + """make `v` is divided exactly by `divisor`, but keep the min_value""" + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. 
+ if new_v < 0.9 * v: + new_v += divisor + return new_v + + +@slim.add_arg_scope +def _depth_multiplier_func(params, + multiplier, + divisible_by=8, + min_depth=8): + """get the new channles""" + if 'num_outputs' not in params: + return + d = params['num_outputs'] + params['num_outputs'] = _make_divisible(d * multiplier, divisible_by, + min_depth) + +def _fixed_padding(inputs, kernel_size, rate=1): + """Pads the input along the spatial dimensions independently of input size. + Pads the input such that if it was used in a convolution with 'VALID' padding, + the output would have the same dimensions as if the unpadded input was used + in a convolution with 'SAME' padding. + Args: + inputs: A tensor of size [batch, height_in, width_in, channels]. + kernel_size: The kernel to be used in the conv2d or max_pool2d operation. + rate: An integer, rate for atrous convolution. + Returns: + output: A tensor of size [batch, height_out, width_out, channels] with the + input, either intact (if kernel_size == 1) or padded (if kernel_size > 1). + """ + kernel_size_effective = [kernel_size[0] + (kernel_size[0] - 1) * (rate - 1), + kernel_size[0] + (kernel_size[0] - 1) * (rate - 1)] + pad_total = [kernel_size_effective[0] - 1, kernel_size_effective[1] - 1] + pad_beg = [pad_total[0] // 2, pad_total[1] // 2] + pad_end = [pad_total[0] - pad_beg[0], pad_total[1] - pad_beg[1]] + padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg[0], pad_end[0]], + [pad_beg[1], pad_end[1]], [0, 0]]) + return padded_inputs + + +@slim.add_arg_scope +def expanded_conv(x, + num_outputs, + expansion=6, + stride=1, + rate=1, + normalizer_fn=slim.batch_norm, + project_activation_fn=tf.identity, + padding="SAME", + scope=None): + """The expand conv op in MobileNetv2 + 1x1 conv -> depthwise 3x3 conv -> 1x1 linear conv + """ + with tf.variable_scope(scope, default_name="expanded_conv") as s, \ + tf.name_scope(s.original_name_scope): + prev_depth = x.get_shape().as_list()[3] + # the filters of expanded conv + inner_size = prev_depth * expansion + net = x + # only inner_size > prev_depth, use expanded conv + if inner_size > prev_depth: + net = slim.conv2d(net, inner_size, 1, normalizer_fn=normalizer_fn, + scope="expand") + # depthwise conv + net = slim.separable_conv2d(net, num_outputs=None, kernel_size=3, + depth_multiplier=1, stride=stride, + rate=rate, normalizer_fn=normalizer_fn, + padding=padding, scope="depthwise") + # projection + net = slim.conv2d(net, num_outputs, 1, normalizer_fn=normalizer_fn, + activation_fn=project_activation_fn, scope="project") + + # residual connection + if stride == 1 and net.get_shape().as_list()[-1] == prev_depth: + net += x + + return net + +def global_pool(x, pool_op=tf.nn.avg_pool): + """Applies avg pool to produce 1x1 output. + NOTE: This function is funcitonally equivalenet to reduce_mean, but it has + baked in average pool which has better support across hardware. + Args: + input_tensor: input tensor + pool_op: pooling op (avg pool is default) + Returns: + a tensor batch_size x 1 x 1 x depth. + """ + shape = x.get_shape().as_list() + if shape[1] is None or shape[2] is None: + kernel_size = tf.convert_to_tensor( + [1, tf.shape(x)[1], tf.shape(x)[2], 1]) + else: + kernel_size = [1, shape[1], shape[2], 1] + output = pool_op(x, ksize=kernel_size, strides=[1, 1, 1, 1], padding='VALID') + # Recover output shape, for unknown shape. 
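+  # When height/width are dynamic the pooled tensor loses its static shape, so this
+  # re-annotates the graph with the known rank-4 [batch, 1, 1, depth] pattern
+  # (batch and depth stay unknown) for the benefit of later 1x1 convs and squeezes.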
+ output.set_shape([None, 1, 1, None]) + return output + + +_Op = namedtuple("Op", ['op', 'params', 'multiplier_func']) + +def op(op_func, **params): + return _Op(op=op_func, params=params, + multiplier_func=_depth_multiplier_func) + + +CONV_DEF = [op(slim.conv2d, num_outputs=32, stride=2, kernel_size=3), + op(expanded_conv, num_outputs=16, expansion=1), + op(expanded_conv, num_outputs=24, stride=2), + op(expanded_conv, num_outputs=24, stride=1), + op(expanded_conv, num_outputs=32, stride=2), + op(expanded_conv, num_outputs=32, stride=1), + op(expanded_conv, num_outputs=32, stride=1), + op(expanded_conv, num_outputs=64, stride=2), + op(expanded_conv, num_outputs=64, stride=1), + op(expanded_conv, num_outputs=64, stride=1), + op(expanded_conv, num_outputs=64, stride=1), + op(expanded_conv, num_outputs=96, stride=1), + op(expanded_conv, num_outputs=96, stride=1), + op(expanded_conv, num_outputs=96, stride=1), + op(expanded_conv, num_outputs=160, stride=2), + op(expanded_conv, num_outputs=160, stride=1), + op(expanded_conv, num_outputs=160, stride=1), + op(expanded_conv, num_outputs=320, stride=1), + op(slim.conv2d, num_outputs=1280, stride=1, kernel_size=1), + ] + + +def mobilenet_arg_scope(is_training=True, + weight_decay=0.00004, + stddev=0.09, + dropout_keep_prob=0.8, + bn_decay=0.997): + """Defines Mobilenet default arg scope. + Usage: + with tf.contrib.slim.arg_scope(mobilenet.training_scope()): + logits, endpoints = mobilenet_v2.mobilenet(input_tensor) + # the network created will be trainble with dropout/batch norm + # initialized appropriately. + Args: + is_training: if set to False this will ensure that all customizations are + set to non-training mode. This might be helpful for code that is reused + across both training/evaluation, but most of the time training_scope with + value False is not needed. If this is set to None, the parameters is not + added to the batch_norm arg_scope. + weight_decay: The weight decay to use for regularizing the model. + stddev: Standard deviation for initialization, if negative uses xavier. + dropout_keep_prob: dropout keep probability (not set if equals to None). + bn_decay: decay for the batch norm moving averages (not set if equals to + None). + Returns: + An argument scope to use via arg_scope. + """ + # Note: do not introduce parameters that would change the inference + # model here (for example whether to use bias), modify conv_def instead. + batch_norm_params = { + 'center': True, + 'scale': True, + 'decay': bn_decay, + 'is_training': is_training + } + if stddev < 0: + weight_intitializer = slim.initializers.xavier_initializer() + else: + weight_intitializer = tf.truncated_normal_initializer(stddev=stddev) + + # Set weight_decay for weights in Conv and FC layers. 
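+  # The nested arg_scopes below layer the defaults: all conv/fc ops get the initializer,
+  # batch norm and ReLU6; conv ops drop their biases (batch norm supplies the shift);
+  # only the regular conv2d weights get L2 decay, while the depthwise (separable_conv2d)
+  # weights are explicitly left unregularized.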
+ with slim.arg_scope( + [slim.conv2d, slim.fully_connected, slim.separable_conv2d], + weights_initializer=weight_intitializer, + normalizer_fn=slim.batch_norm, + activation_fn=tf.nn.relu6), \ + slim.arg_scope([slim.batch_norm], **batch_norm_params), \ + slim.arg_scope([slim.dropout], is_training=is_training, + keep_prob=dropout_keep_prob), \ + slim.arg_scope([slim.conv2d, slim.separable_conv2d], + biases_initializer=None, + padding="SAME"), \ + slim.arg_scope([slim.conv2d], + weights_regularizer=slim.l2_regularizer(weight_decay)), \ + slim.arg_scope([slim.separable_conv2d], weights_regularizer=None) as s: + return s + + +def mobilenetv2(x, + num_classes=1001, + depth_multiplier=1.0, + scope='MobilenetV2', + finegrain_classification_mode=False, + min_depth=8, + divisible_by=8, + output_stride=None, + ): + """Mobilenet v2 + Args: + x: The input tensor + num_classes: number of classes + depth_multiplier: The multiplier applied to scale number of + channels in each layer. Note: this is called depth multiplier in the + paper but the name is kept for consistency with slim's model builder. + scope: Scope of the operator + finegrain_classification_mode: When set to True, the model + will keep the last layer large even for small multipliers. + The paper suggests that it improves performance for ImageNet-type of problems. + min_depth: If provided, will ensure that all layers will have that + many channels after application of depth multiplier. + divisible_by: If provided will ensure that all layers # channels + will be divisible by this number. + """ + conv_defs = CONV_DEF + + # keep the last conv layer very larger channel + if finegrain_classification_mode: + conv_defs = copy.deepcopy(conv_defs) + if depth_multiplier < 1: + conv_defs[-1].params['num_outputs'] /= depth_multiplier + + depth_args = {} + # NB: do not set depth_args unless they are provided to avoid overriding + # whatever default depth_multiplier might have thanks to arg_scope. + if min_depth is not None: + depth_args['min_depth'] = min_depth + if divisible_by is not None: + depth_args['divisible_by'] = divisible_by + + with slim.arg_scope([_depth_multiplier_func], **depth_args): + with tf.variable_scope(scope, default_name='Mobilenet'): + # The current_stride variable keeps track of the output stride of the + # activations, i.e., the running product of convolution strides up to the + # current network layer. This allows us to invoke atrous convolution + # whenever applying the next convolution would result in the activations + # having output stride larger than the target output_stride. + current_stride = 1 + + # The atrous convolution rate parameter. + rate = 1 + + net = x + # Insert default parameters before the base scope which includes + # any custom overrides set in mobilenet. + end_points = {} + scopes = {} + for i, opdef in enumerate(conv_defs): + params = dict(opdef.params) + opdef.multiplier_func(params, depth_multiplier) + stride = params.get('stride', 1) + if output_stride is not None and current_stride == output_stride: + # If we have reached the target output_stride, then we need to employ + # atrous convolution with stride=1 and multiply the atrous rate by the + # current unit's stride for use in subsequent layers. + layer_stride = 1 + layer_rate = rate + rate *= stride + else: + layer_stride = stride + layer_rate = 1 + current_stride *= stride + # Update params. + params['stride'] = layer_stride + # Only insert rate to params if rate > 1. 
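+        # e.g. with output_stride=16: strides 2,2,2,2 are applied normally until
+        # current_stride reaches 16, then the stride-2 block that would reach 32 runs
+        # with layer_stride=1 and later depthwise convs use rate=2 (atrous) instead,
+        # keeping the same receptive field at the higher resolution.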
+ if layer_rate > 1: + params['rate'] = layer_rate + + try: + net = opdef.op(net, **params) + except Exception: + raise ValueError('Failed to create op %i: %r params: %r' % (i, opdef, params)) + + with tf.variable_scope('Logits'): + net = global_pool(net) + end_points['global_pool'] = net + if not num_classes: + return net, end_points + net = slim.dropout(net, scope='Dropout') + # 1 x 1 x num_classes + # Note: legacy scope name. + logits = slim.conv2d( + net, + num_classes, [1, 1], + activation_fn=None, + normalizer_fn=None, + biases_initializer=tf.zeros_initializer(), + scope='Conv2d_1c_1x1') + + logits = tf.squeeze(logits, [1, 2]) + + return logits + + +if __name__ == "__main__": + import cv2 + import numpy as np + + inputs = tf.placeholder(tf.uint8, [None, None, 3]) + images = tf.expand_dims(inputs, 0) + images = tf.cast(images, tf.float32) / 128. - 1 + images.set_shape((None, None, None, 3)) + images = tf.image.resize_images(images, (224, 224)) + + with slim.arg_scope(mobilenet_arg_scope(is_training=False)): + logits = mobilenetv2(images) + + # Restore using exponential moving average since it produces (1.5-2%) higher + # accuracy + ema = tf.train.ExponentialMovingAverage(0.999) + vars = ema.variables_to_restore() + + saver = tf.train.Saver(vars) + + print(len(tf.global_variables())) + for var in tf.global_variables(): + print(var) + checkpoint_path = r"C:\Users\xiaoh\Desktop\temp\mobilenet_v2_1.0_224\mobilenet_v2_1.0_224.ckpt" + image_file = "C:/Users/xiaoh/Desktop/temp/pandas.jpg" + with tf.Session() as sess: + saver.restore(sess, checkpoint_path) + + img = cv2.imread(image_file) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + + print(np.argmax(sess.run(logits, feed_dict={inputs: img})[0])) + + + + + + + + + + diff --git a/CNNs/shufflenet_v2.py b/CNNs/shufflenet_v2.py new file mode 100644 index 0000000..dbb25c9 --- /dev/null +++ b/CNNs/shufflenet_v2.py @@ -0,0 +1,243 @@ +""" +The implement of shufflenet_v2 by Keras +""" + +import tensorflow as tf +from tensorflow.keras.layers import Conv2D, DepthwiseConv2D +from tensorflow.keras.layers import MaxPool2D, GlobalAveragePooling2D, Dense +from tensorflow.keras.layers import BatchNormalization, Activation + + +def channle_shuffle(inputs, group): + """Shuffle the channel + Args: + inputs: 4D Tensor + group: int, number of groups + Returns: + Shuffled 4D Tensor + """ + in_shape = inputs.get_shape().as_list() + h, w, in_channel = in_shape[1:] + assert in_channel % group == 0 + l = tf.reshape(inputs, [-1, h, w, in_channel // group, group]) + l = tf.transpose(l, [0, 1, 2, 4, 3]) + l = tf.reshape(l, [-1, h, w, in_channel]) + + return l + +class Conv2D_BN_ReLU(tf.keras.Model): + """Conv2D -> BN -> ReLU""" + def __init__(self, channel, kernel_size=1, stride=1): + super(Conv2D_BN_ReLU, self).__init__() + + self.conv = Conv2D(channel, kernel_size, strides=stride, + padding="SAME", use_bias=False) + self.bn = BatchNormalization(axis=-1, momentum=0.9, epsilon=1e-5) + self.relu = Activation("relu") + + def call(self, inputs, training=True): + x = self.conv(inputs) + x = self.bn(x, training=training) + x = self.relu(x) + return x + +class DepthwiseConv2D_BN(tf.keras.Model): + """DepthwiseConv2D -> BN""" + def __init__(self, kernel_size=3, stride=1): + super(DepthwiseConv2D_BN, self).__init__() + + self.dconv = DepthwiseConv2D(kernel_size, strides=stride, + depth_multiplier=1, + padding="SAME", use_bias=False) + self.bn = BatchNormalization(axis=-1, momentum=0.9, epsilon=1e-5) + + def call(self, inputs, training=True): + x = self.dconv(inputs) + x = 
self.bn(x, training=training) + return x + + +class ShufflenetUnit1(tf.keras.Model): + def __init__(self, out_channel): + """The unit of shufflenetv2 for stride=1 + Args: + out_channel: int, number of channels + """ + super(ShufflenetUnit1, self).__init__() + + assert out_channel % 2 == 0 + self.out_channel = out_channel + + self.conv1_bn_relu = Conv2D_BN_ReLU(out_channel // 2, 1, 1) + self.dconv_bn = DepthwiseConv2D_BN(3, 1) + self.conv2_bn_relu = Conv2D_BN_ReLU(out_channel // 2, 1, 1) + + def call(self, inputs, training=False): + # split the channel + shortcut, x = tf.split(inputs, 2, axis=3) + + x = self.conv1_bn_relu(x, training=training) + x = self.dconv_bn(x, training=training) + x = self.conv2_bn_relu(x, training=training) + + x = tf.concat([shortcut, x], axis=3) + x = channle_shuffle(x, 2) + return x + +class ShufflenetUnit2(tf.keras.Model): + """The unit of shufflenetv2 for stride=2""" + def __init__(self, in_channel, out_channel): + super(ShufflenetUnit2, self).__init__() + + assert out_channel % 2 == 0 + self.in_channel = in_channel + self.out_channel = out_channel + + self.conv1_bn_relu = Conv2D_BN_ReLU(out_channel // 2, 1, 1) + self.dconv_bn = DepthwiseConv2D_BN(3, 2) + self.conv2_bn_relu = Conv2D_BN_ReLU(out_channel - in_channel, 1, 1) + + # for shortcut + self.shortcut_dconv_bn = DepthwiseConv2D_BN(3, 2) + self.shortcut_conv_bn_relu = Conv2D_BN_ReLU(in_channel, 1, 1) + + def call(self, inputs, training=False): + shortcut, x = inputs, inputs + + x = self.conv1_bn_relu(x, training=training) + x = self.dconv_bn(x, training=training) + x = self.conv2_bn_relu(x, training=training) + + shortcut = self.shortcut_dconv_bn(shortcut, training=training) + shortcut = self.shortcut_conv_bn_relu(shortcut, training=training) + + x = tf.concat([shortcut, x], axis=3) + x = channle_shuffle(x, 2) + return x + +class ShufflenetStage(tf.keras.Model): + """The stage of shufflenet""" + def __init__(self, in_channel, out_channel, num_blocks): + super(ShufflenetStage, self).__init__() + + self.in_channel = in_channel + self.out_channel = out_channel + + self.ops = [] + for i in range(num_blocks): + if i == 0: + op = ShufflenetUnit2(in_channel, out_channel) + else: + op = ShufflenetUnit1(out_channel) + self.ops.append(op) + + def call(self, inputs, training=False): + x = inputs + for op in self.ops: + x = op(x, training=training) + return x + + +class ShuffleNetv2(tf.keras.Model): + """Shufflenetv2""" + def __init__(self, num_classes, first_channel=24, channels_per_stage=(116, 232, 464)): + super(ShuffleNetv2, self).__init__() + + self.num_classes = num_classes + + self.conv1_bn_relu = Conv2D_BN_ReLU(first_channel, 3, 2) + self.pool1 = MaxPool2D(3, strides=2, padding="SAME") + self.stage2 = ShufflenetStage(first_channel, channels_per_stage[0], 4) + self.stage3 = ShufflenetStage(channels_per_stage[0], channels_per_stage[1], 8) + self.stage4 = ShufflenetStage(channels_per_stage[1], channels_per_stage[2], 4) + self.conv5_bn_relu = Conv2D_BN_ReLU(1024, 1, 1) + self.gap = GlobalAveragePooling2D() + self.linear = Dense(num_classes) + + def call(self, inputs, training=False): + x = self.conv1_bn_relu(inputs, training=training) + x = self.pool1(x) + x = self.stage2(x, training=training) + x = self.stage3(x, training=training) + x = self.stage4(x, training=training) + x = self.conv5_bn_relu(x, training=training) + x = self.gap(x) + x = self.linear(x) + return x + + +if __name__ =="__main__": + """ + inputs = tf.placeholder(tf.float32, [None, 224, 224, 3]) + + model = ShuffleNetv2(1000) + outputs = 
model(inputs) + + print(model.summary()) + + with tf.Session() as sess: + pass + + + vars = [] + for v in tf.global_variables(): + + vars.append((v.name, v)) + print(v.name) + print(len(vars)) + + + import numpy as np + + path = "C:/models/ShuffleNetV2-1x.npz" + weights = np.load(path) + np_vars = [] + for k in weights: + k_ = k.replace("beta", "gbeta") + k_ = k_.replace("/dconv", "/conv10_dconv") + k_ = k_.replace("shortcut_dconv", "shortcut_a_dconv") + k_ = k_.replace("conv5", "su_conv5") + k_ = k_.replace("linear", "t_linear") + np_vars.append((k_, weights[k])) + np_vars.sort(key=lambda x: x[0]) + + for k, _ in np_vars: + print(k) + + saver = tf.train.Saver(tf.global_variables()) + with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + + assign_ops = [] + for id in range(len(vars)): + print(vars[id][0], np_vars[id][0]) + assign_ops.append(tf.assign(vars[id][1], np_vars[id][1])) + + sess.run(assign_ops) + saver.save(sess, "./models/shufflene_v2_1.0.ckpt") + + model.save("./models/shufflenet_v2_1.0.hdf5") + + """ + + import numpy as np + from tensorflow.keras.preprocessing import image + from tensorflow.keras.applications.densenet import preprocess_input, decode_predictions + + img_path = './images/cat.jpg' + img = image.load_img(img_path, target_size=(224, 224)) + x = image.img_to_array(img) + x = np.expand_dims(x, axis=0) + x = preprocess_input(x) + + inputs = tf.placeholder(tf.float32, [None, 224, 224, 3]) + model = ShuffleNetv2(1000) + outputs = model(inputs, training=False) + outputs = tf.nn.softmax(outputs) + + saver = tf.train.Saver() + with tf.Session() as sess: + saver.restore(sess, "./models/shufflene_v2_1.0.ckpt") + preds = sess.run(outputs, feed_dict={inputs: x}) + print(decode_predictions(preds, top=3)[0]) + diff --git a/ObjectDetections/SSD/SSD_demo.py b/ObjectDetections/SSD/SSD_demo.py new file mode 100644 index 0000000..bfc42a4 --- /dev/null +++ b/ObjectDetections/SSD/SSD_demo.py @@ -0,0 +1,33 @@ +""" +SSD demo +""" + +import cv2 +import numpy as np +import tensorflow as tf +import matplotlib.image as mpimg + +from ssd_300_vgg import SSD +from utils import preprocess_image, process_bboxes +from visualization import plt_bboxes + + +ssd_net = SSD() +classes, scores, bboxes = ssd_net.detections() +images = ssd_net.images() + +sess = tf.Session() +# Restore SSD model. 
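+# Restoring overwrites the freshly initialized variables with the pretrained
+# SSD-300 (VGG backbone) weights; the checkpoint is expected under ./ssd_checkpoints/.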
+ckpt_filename = './ssd_checkpoints/ssd_vgg_300_weights.ckpt' +sess.run(tf.global_variables_initializer()) +saver = tf.train.Saver() +saver.restore(sess, ckpt_filename) + +img = cv2.imread('./demo/dog.jpg') +img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) +img_prepocessed = preprocess_image(img) +rclasses, rscores, rbboxes = sess.run([classes, scores, bboxes], + feed_dict={images: img_prepocessed}) +rclasses, rscores, rbboxes = process_bboxes(rclasses, rscores, rbboxes) + +plt_bboxes(img, rclasses, rscores, rbboxes) diff --git a/ObjectDetections/SSD/demo/README.md b/ObjectDetections/SSD/demo/README.md new file mode 100644 index 0000000..c200906 --- /dev/null +++ b/ObjectDetections/SSD/demo/README.md @@ -0,0 +1 @@ +222 diff --git a/ObjectDetections/SSD/demo/car2.jpg b/ObjectDetections/SSD/demo/car2.jpg new file mode 100644 index 0000000..8c24a69 Binary files /dev/null and b/ObjectDetections/SSD/demo/car2.jpg differ diff --git a/ObjectDetections/SSD/demo/dog.jpg b/ObjectDetections/SSD/demo/dog.jpg new file mode 100644 index 0000000..665a81c Binary files /dev/null and b/ObjectDetections/SSD/demo/dog.jpg differ diff --git a/ObjectDetections/SSD/demo/eagle.jpg b/ObjectDetections/SSD/demo/eagle.jpg new file mode 100644 index 0000000..8b75095 Binary files /dev/null and b/ObjectDetections/SSD/demo/eagle.jpg differ diff --git a/ObjectDetections/SSD/demo/horses.jpg b/ObjectDetections/SSD/demo/horses.jpg new file mode 100644 index 0000000..3a761f4 Binary files /dev/null and b/ObjectDetections/SSD/demo/horses.jpg differ diff --git a/ObjectDetections/SSD/demo/person.jpg b/ObjectDetections/SSD/demo/person.jpg new file mode 100644 index 0000000..61d377f Binary files /dev/null and b/ObjectDetections/SSD/demo/person.jpg differ diff --git a/ObjectDetections/SSD/demo/street.jpg b/ObjectDetections/SSD/demo/street.jpg new file mode 100644 index 0000000..6750d37 Binary files /dev/null and b/ObjectDetections/SSD/demo/street.jpg differ diff --git a/ObjectDetections/SSD/ssd_300_vgg.py b/ObjectDetections/SSD/ssd_300_vgg.py new file mode 100644 index 0000000..42bef4d --- /dev/null +++ b/ObjectDetections/SSD/ssd_300_vgg.py @@ -0,0 +1,233 @@ +""" +SSD net (vgg_based) 300x300 +""" +from collections import namedtuple + +import numpy as np +import tensorflow as tf + +from ssd_layers import conv2d, max_pool2d, l2norm, dropout, \ + pad2d, ssd_multibox_layer +from ssd_anchors import ssd_anchors_all_layers + +# SSD parameters +SSDParams = namedtuple('SSDParameters', ['img_shape', # the input image size: 300x300 + 'num_classes', # number of classes: 20+1 + 'no_annotation_label', + 'feat_layers', # list of names of layer for detection + 'feat_shapes', # list of feature map sizes of layer for detection + 'anchor_size_bounds', # the down and upper bounds of anchor sizes + 'anchor_sizes', # list of anchor sizes of layer for detection + 'anchor_ratios', # list of rations used in layer for detection + 'anchor_steps', # list of cell size (pixel size) of layer for detection + 'anchor_offset', # the center point offset + 'normalizations', # list of normalizations of layer for detection + 'prior_scaling' # + ]) +class SSD(object): + """SSD net 300""" + def __init__(self, is_training=True): + self.is_training = is_training + self.threshold = 0.5 # class score threshold + self.ssd_params = SSDParams(img_shape=(300, 300), + num_classes=21, + no_annotation_label=21, + feat_layers=["block4", "block7", "block8", "block9", "block10", "block11"], + feat_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)], + anchor_size_bounds=[0.15, 
0.90], # diff from the original paper + anchor_sizes=[(21., 45.), + (45., 99.), + (99., 153.), + (153., 207.), + (207., 261.), + (261., 315.)], + anchor_ratios=[[2, .5], + [2, .5, 3, 1. / 3], + [2, .5, 3, 1. / 3], + [2, .5, 3, 1. / 3], + [2, .5], + [2, .5]], + anchor_steps=[8, 16, 32, 64, 100, 300], + anchor_offset=0.5, + normalizations=[20, -1, -1, -1, -1, -1], + prior_scaling=[0.1, 0.1, 0.2, 0.2] + ) + + predictions, logits, locations = self._built_net() + #self._update_feat_shapes_from_net() + classes, scores, bboxes = self._bboxes_select(predictions, locations) + self._classes = classes + self._scores = scores + self._bboxes = bboxes + + def _built_net(self): + """Construct the SSD net""" + self.end_points = {} # record the detection layers output + self._images = tf.placeholder(tf.float32, shape=[None, self.ssd_params.img_shape[0], + self.ssd_params.img_shape[1], 3]) + with tf.variable_scope("ssd_300_vgg"): + # original vgg layers + # block 1 + net = conv2d(self._images, 64, 3, scope="conv1_1") + net = conv2d(net, 64, 3, scope="conv1_2") + self.end_points["block1"] = net + net = max_pool2d(net, 2, scope="pool1") + # block 2 + net = conv2d(net, 128, 3, scope="conv2_1") + net = conv2d(net, 128, 3, scope="conv2_2") + self.end_points["block2"] = net + net = max_pool2d(net, 2, scope="pool2") + # block 3 + net = conv2d(net, 256, 3, scope="conv3_1") + net = conv2d(net, 256, 3, scope="conv3_2") + net = conv2d(net, 256, 3, scope="conv3_3") + self.end_points["block3"] = net + net = max_pool2d(net, 2, scope="pool3") + # block 4 + net = conv2d(net, 512, 3, scope="conv4_1") + net = conv2d(net, 512, 3, scope="conv4_2") + net = conv2d(net, 512, 3, scope="conv4_3") + self.end_points["block4"] = net + net = max_pool2d(net, 2, scope="pool4") + # block 5 + net = conv2d(net, 512, 3, scope="conv5_1") + net = conv2d(net, 512, 3, scope="conv5_2") + net = conv2d(net, 512, 3, scope="conv5_3") + self.end_points["block5"] = net + print(net) + net = max_pool2d(net, 3, stride=1, scope="pool5") + print(net) + + # additional SSD layers + # block 6: use dilate conv + net = conv2d(net, 1024, 3, dilation_rate=6, scope="conv6") + self.end_points["block6"] = net + #net = dropout(net, is_training=self.is_training) + # block 7 + net = conv2d(net, 1024, 1, scope="conv7") + self.end_points["block7"] = net + # block 8 + net = conv2d(net, 256, 1, scope="conv8_1x1") + net = conv2d(pad2d(net, 1), 512, 3, stride=2, scope="conv8_3x3", + padding="valid") + self.end_points["block8"] = net + # block 9 + net = conv2d(net, 128, 1, scope="conv9_1x1") + net = conv2d(pad2d(net, 1), 256, 3, stride=2, scope="conv9_3x3", + padding="valid") + self.end_points["block9"] = net + # block 10 + net = conv2d(net, 128, 1, scope="conv10_1x1") + net = conv2d(net, 256, 3, scope="conv10_3x3", padding="valid") + self.end_points["block10"] = net + # block 11 + net = conv2d(net, 128, 1, scope="conv11_1x1") + net = conv2d(net, 256, 3, scope="conv11_3x3", padding="valid") + self.end_points["block11"] = net + + # class and location predictions + predictions = [] + logits = [] + locations = [] + for i, layer in enumerate(self.ssd_params.feat_layers): + cls, loc = ssd_multibox_layer(self.end_points[layer], self.ssd_params.num_classes, + self.ssd_params.anchor_sizes[i], + self.ssd_params.anchor_ratios[i], + self.ssd_params.normalizations[i], scope=layer+"_box") + predictions.append(tf.nn.softmax(cls)) + logits.append(cls) + locations.append(loc) + return predictions, logits, locations + + def _update_feat_shapes_from_net(self, predictions): + """ Obtain the 
feature shapes from the prediction layers""" + new_feat_shapes = [] + for l in predictions: + new_feat_shapes.append(l.get_shape().as_list()[1:]) + self.ssd_params._replace(feat_shapes=new_feat_shapes) + + def anchors(self): + """Get sSD anchors""" + return ssd_anchors_all_layers(self.ssd_params.img_shape, + self.ssd_params.feat_shapes, + self.ssd_params.anchor_sizes, + self.ssd_params.anchor_ratios, + self.ssd_params.anchor_steps, + self.ssd_params.anchor_offset, + np.float32) + + def _bboxes_decode_layer(self, feat_locations, anchor_bboxes, prior_scaling): + """ + Decode the feat location of one layer + params: + feat_locations: 5D Tensor, [batch_size, size, size, n_anchors, 4] + anchor_bboxes: list of Tensors(y, x, w, h) + shape: [size,size,1], [size, size,1], [n_anchors], [n_anchors] + prior_scaling: list of 4 floats + """ + yref, xref, href, wref = anchor_bboxes + print(yref) + # Compute center, height and width + cx = feat_locations[:, :, :, :, 0] * wref * prior_scaling[0] + xref + cy = feat_locations[:, :, :, :, 1] * href * prior_scaling[1] + yref + w = wref * tf.exp(feat_locations[:, :, :, :, 2] * prior_scaling[2]) + h = href * tf.exp(feat_locations[:, :, :, :, 3] * prior_scaling[3]) + # compute boxes coordinates (ymin, xmin, ymax,,xmax) + bboxes = tf.stack([cy - h / 2., cx - w / 2., + cy + h / 2., cx + w / 2.], axis=-1) + # shape [batch_size, size, size, n_anchors, 4] + return bboxes + + def _bboxes_select_layer(self, feat_predictions, feat_locations, anchor_bboxes, + prior_scaling): + """Select boxes from the feat layer, only for bacth_size=1""" + n_bboxes = np.product(feat_predictions.get_shape().as_list()[1:-1]) + # decode the location + bboxes = self._bboxes_decode_layer(feat_locations, anchor_bboxes, prior_scaling) + bboxes = tf.reshape(bboxes, [n_bboxes, 4]) + predictions = tf.reshape(feat_predictions, [n_bboxes, self.ssd_params.num_classes]) + # remove the background predictions + sub_predictions = predictions[:, 1:] + # choose the max score class + classes = tf.argmax(sub_predictions, axis=1) + 1 # class labels + scores = tf.reduce_max(sub_predictions, axis=1) # max_class scores + # Boxes selection: use threshold + filter_mask = scores > self.threshold + classes = tf.boolean_mask(classes, filter_mask) + scores = tf.boolean_mask(scores, filter_mask) + bboxes = tf.boolean_mask(bboxes, filter_mask) + return classes, scores, bboxes + + def _bboxes_select(self, predictions, locations): + """Select all bboxes predictions, only for bacth_size=1""" + anchor_bboxes_list = self.anchors() + classes_list = [] + scores_list = [] + bboxes_list = [] + # select bboxes for each feat layer + for n in range(len(predictions)): + anchor_bboxes = list(map(tf.convert_to_tensor, anchor_bboxes_list[n])) + classes, scores, bboxes = self._bboxes_select_layer(predictions[n], + locations[n], anchor_bboxes, self.ssd_params.prior_scaling) + classes_list.append(classes) + scores_list.append(scores) + bboxes_list.append(bboxes) + # combine all feat layers + classes = tf.concat(classes_list, axis=0) + scores = tf.concat(scores_list, axis=0) + bboxes = tf.concat(bboxes_list, axis=0) + return classes, scores, bboxes + + def images(self): + return self._images + + def detections(self): + return self._classes, self._scores, self._bboxes + + +if __name__ == "__main__": + ssd = SSD() + sess = tf.Session() + saver_ = tf.train.Saver() + saver_.restore(sess, "../SSD-Tensorflow-master/ssd_checkpoints/ssd_vgg_300_weights.ckpt") + diff --git a/ObjectDetections/SSD/ssd_anchors.py b/ObjectDetections/SSD/ssd_anchors.py 
new file mode 100644 index 0000000..a121f1f --- /dev/null +++ b/ObjectDetections/SSD/ssd_anchors.py @@ -0,0 +1,107 @@ +""" +SSD anchors +""" +import math + +import numpy as np + +def ssd_size_bounds_to_values(size_bounds, + n_feat_layers, + img_shape=(300, 300)): + """Compute the reference sizes of the anchor boxes from relative bounds. + The absolute values are measured in pixels, based on the network + default size (300 pixels). + + This function follows the computation performed in the original + implementation of SSD in Caffe. + + Return: + list of list containing the absolute sizes at each scale. For each scale, + the ratios only apply to the first value. + """ + assert img_shape[0] == img_shape[1] + + img_size = img_shape[0] + min_ratio = int(size_bounds[0] * 100) + max_ratio = int(size_bounds[1] * 100) + step = int(math.floor((max_ratio - min_ratio) / (n_feat_layers - 2))) + # Start with the following smallest sizes. + sizes = [[img_size * size_bounds[0] / 2, img_size * size_bounds[0]]] + for ratio in range(min_ratio, max_ratio + 1, step): + sizes.append((img_size * ratio / 100., + img_size * (ratio + step) / 100.)) + return sizes + +def ssd_anchor_one_layer(img_shape, + feat_shape, + sizes, + ratios, + step, + offset=0.5, + dtype=np.float32): + """Computer SSD default anchor boxes for one feature layer. + + Determine the relative position grid of the centers, and the relative + width and height. + + Arguments: + feat_shape: Feature shape, used for computing relative position grids; + size: Absolute reference sizes; + ratios: Ratios to use on these features; + img_shape: Image shape, used for computing height, width relatively to the + former; + offset: Grid offset. + + Return: + y, x, h, w: Relative x and y grids, and height and width. + """ + # Compute the position grid: simple way. + # y, x = np.mgrid[0:feat_shape[0], 0:feat_shape[1]] + # y = (y.astype(dtype) + offset) / feat_shape[0] + # x = (x.astype(dtype) + offset) / feat_shape[1] + # Weird SSD-Caffe computation using steps values... + y, x = np.mgrid[0:feat_shape[0], 0:feat_shape[1]] + y = (y.astype(dtype) + offset) * step / img_shape[0] + x = (x.astype(dtype) + offset) * step / img_shape[1] + + # Expand dims to support easy broadcasting. + y = np.expand_dims(y, axis=-1) # [size, size, 1] + x = np.expand_dims(x, axis=-1) # [size, size, 1] + + # Compute relative height and width. + # Tries to follow the original implementation of SSD for the order. + num_anchors = len(sizes) + len(ratios) + h = np.zeros((num_anchors, ), dtype=dtype) # [n_anchors] + w = np.zeros((num_anchors, ), dtype=dtype) # [n_anchors] + # Add first anchor boxes with ratio=1. + h[0] = sizes[0] / img_shape[0] + w[0] = sizes[0] / img_shape[1] + di = 1 + if len(sizes) > 1: + h[1] = math.sqrt(sizes[0] * sizes[1]) / img_shape[0] + w[1] = math.sqrt(sizes[0] * sizes[1]) / img_shape[1] + di += 1 + for i, r in enumerate(ratios): + h[i+di] = sizes[0] / img_shape[0] / math.sqrt(r) + w[i+di] = sizes[0] / img_shape[1] * math.sqrt(r) + return y, x, h, w + + +def ssd_anchors_all_layers(img_shape, + layers_shape, + anchor_sizes, + anchor_ratios, + anchor_steps, + offset=0.5, + dtype=np.float32): + """Compute anchor boxes for all feature layers. 
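+
+    Return:
+      list of (y, x, h, w) tuples, one per feature layer, as produced
+      by ssd_anchor_one_layer above.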
+ """ + layers_anchors = [] + for i, s in enumerate(layers_shape): + anchor_bboxes = ssd_anchor_one_layer(img_shape, s, + anchor_sizes[i], + anchor_ratios[i], + anchor_steps[i], + offset=offset, dtype=dtype) + layers_anchors.append(anchor_bboxes) + return layers_anchors \ No newline at end of file diff --git a/ObjectDetections/SSD/ssd_layers.py b/ObjectDetections/SSD/ssd_layers.py new file mode 100644 index 0000000..071ffaa --- /dev/null +++ b/ObjectDetections/SSD/ssd_layers.py @@ -0,0 +1,61 @@ +""" +Layers for SSD +""" + +import tensorflow as tf + +# Conv2d: for stride = 1 +def conv2d(x, filters, kernel_size, stride=1, padding="same", + dilation_rate=1, activation=tf.nn.relu, scope="conv2d"): + kernel_sizes = [kernel_size] * 2 + strides = [stride] * 2 + dilation_rate = [dilation_rate] * 2 + return tf.layers.conv2d(x, filters, kernel_sizes, strides=strides, + dilation_rate=dilation_rate, padding=padding, + name=scope, activation=activation) + +# max pool2d: default pool_size = stride +def max_pool2d(x, pool_size, stride=None, scope="max_pool2d"): + pool_sizes = [pool_size] * 2 + strides = [pool_size] * 2 if stride is None else [stride] * 2 + return tf.layers.max_pooling2d(x, pool_sizes, strides, name=scope, padding="same") + +# pad2d: for conv2d with stride > 1 +def pad2d(x, pad): + return tf.pad(x, paddings=[[0, 0], [pad, pad], [pad, pad], [0, 0]]) + +# dropout +def dropout(x, rate=0.5, is_training=True): + return tf.layers.dropout(x, rate=rate, training=is_training) + +# l2norm (not bacth norm, spatial normalization) +def l2norm(x, scale, trainable=True, scope="L2Normalization"): + n_channels = x.get_shape().as_list()[-1] + l2_norm = tf.nn.l2_normalize(x, [3], epsilon=1e-12) + with tf.variable_scope(scope): + gamma = tf.get_variable("gamma", shape=[n_channels, ], dtype=tf.float32, + initializer=tf.constant_initializer(scale), + trainable=trainable) + return l2_norm * gamma + + +# multibox layer: get class and location predicitions from detection layer +def ssd_multibox_layer(x, num_classes, sizes, ratios, normalization=-1, scope="multibox"): + pre_shape = x.get_shape().as_list()[1:-1] + pre_shape = [-1] + pre_shape + with tf.variable_scope(scope): + # l2 norm + if normalization > 0: + x = l2norm(x, normalization) + print(x) + # numbers of anchors + n_anchors = len(sizes) + len(ratios) + # location predictions + loc_pred = conv2d(x, n_anchors*4, 3, activation=None, scope="conv_loc") + loc_pred = tf.reshape(loc_pred, pre_shape + [n_anchors, 4]) + # class prediction + cls_pred = conv2d(x, n_anchors*num_classes, 3, activation=None, scope="conv_cls") + cls_pred = tf.reshape(cls_pred, pre_shape + [n_anchors, num_classes]) + return cls_pred, loc_pred + + diff --git a/ObjectDetections/SSD/utils.py b/ObjectDetections/SSD/utils.py new file mode 100644 index 0000000..af73360 --- /dev/null +++ b/ObjectDetections/SSD/utils.py @@ -0,0 +1,130 @@ +""" +Help functions for SSD +""" + +import cv2 +import numpy as np + + +############## preprocess image ################## +# whiten the image +def whiten_image(image, means=(123., 117., 104.)): + """Subtracts the given means from each image channel""" + if image.ndim != 3: + raise ValueError('Input must be of size [height, width, C>0]') + num_channels = image.shape[-1] + if len(means) != num_channels: + raise ValueError('len(means) must match the number of channels') + + mean = np.array(means, dtype=image.dtype) + image = image - mean + return image + +def resize_image(image, size=(300, 300)): + return cv2.resize(image, size) + +def preprocess_image(image): + 
"""Preprocess a image to inference""" + image_cp = np.copy(image).astype(np.float32) + # whiten the image + image_whitened = whiten_image(image_cp) + # resize the image + image_resized = resize_image(image_whitened) + # expand the batch_size dim + image_expanded = np.expand_dims(image_resized, axis=0) + return image_expanded + +############## process bboxes ################## +def bboxes_clip(bbox_ref, bboxes): + """Clip bounding boxes with respect to reference bbox. + """ + bboxes = np.copy(bboxes) + bboxes = np.transpose(bboxes) + bbox_ref = np.transpose(bbox_ref) + bboxes[0] = np.maximum(bboxes[0], bbox_ref[0]) + bboxes[1] = np.maximum(bboxes[1], bbox_ref[1]) + bboxes[2] = np.minimum(bboxes[2], bbox_ref[2]) + bboxes[3] = np.minimum(bboxes[3], bbox_ref[3]) + bboxes = np.transpose(bboxes) + return bboxes + +def bboxes_sort(classes, scores, bboxes, top_k=400): + """Sort bounding boxes by decreasing order and keep only the top_k + """ + # if priority_inside: + # inside = (bboxes[:, 0] > margin) & (bboxes[:, 1] > margin) & \ + # (bboxes[:, 2] < 1-margin) & (bboxes[:, 3] < 1-margin) + # idxes = np.argsort(-scores) + # inside = inside[idxes] + # idxes = np.concatenate([idxes[inside], idxes[~inside]]) + idxes = np.argsort(-scores) + classes = classes[idxes][:top_k] + scores = scores[idxes][:top_k] + bboxes = bboxes[idxes][:top_k] + return classes, scores, bboxes + +def bboxes_iou(bboxes1, bboxes2): + """Computing iou between bboxes1 and bboxes2. + Note: bboxes1 and bboxes2 can be multi-dimensional, but should broacastable. + """ + bboxes1 = np.transpose(bboxes1) + bboxes2 = np.transpose(bboxes2) + # Intersection bbox and volume. + int_ymin = np.maximum(bboxes1[0], bboxes2[0]) + int_xmin = np.maximum(bboxes1[1], bboxes2[1]) + int_ymax = np.minimum(bboxes1[2], bboxes2[2]) + int_xmax = np.minimum(bboxes1[3], bboxes2[3]) + + int_h = np.maximum(int_ymax - int_ymin, 0.) + int_w = np.maximum(int_xmax - int_xmin, 0.) + int_vol = int_h * int_w + # Union volume. + vol1 = (bboxes1[2] - bboxes1[0]) * (bboxes1[3] - bboxes1[1]) + vol2 = (bboxes2[2] - bboxes2[0]) * (bboxes2[3] - bboxes2[1]) + iou = int_vol / (vol1 + vol2 - int_vol) + return iou + +def bboxes_nms(classes, scores, bboxes, nms_threshold=0.5): + """Apply non-maximum selection to bounding boxes. + """ + keep_bboxes = np.ones(scores.shape, dtype=np.bool) + for i in range(scores.size-1): + if keep_bboxes[i]: + # Computer overlap with bboxes which are following. + overlap = bboxes_iou(bboxes[i], bboxes[(i+1):]) + # Overlap threshold for keeping + checking part of the same class + keep_overlap = np.logical_or(overlap < nms_threshold, classes[(i+1):] != classes[i]) + keep_bboxes[(i+1):] = np.logical_and(keep_bboxes[(i+1):], keep_overlap) + + idxes = np.where(keep_bboxes) + return classes[idxes], scores[idxes], bboxes[idxes] + +def bboxes_resize(bbox_ref, bboxes): + """Resize bounding boxes based on a reference bounding box, + assuming that the latter is [0, 0, 1, 1] after transform. + """ + bboxes = np.copy(bboxes) + # Translate. + bboxes[:, 0] -= bbox_ref[0] + bboxes[:, 1] -= bbox_ref[1] + bboxes[:, 2] -= bbox_ref[0] + bboxes[:, 3] -= bbox_ref[1] + # Resize. 
+ resize = [bbox_ref[2] - bbox_ref[0], bbox_ref[3] - bbox_ref[1]] + bboxes[:, 0] /= resize[0] + bboxes[:, 1] /= resize[1] + bboxes[:, 2] /= resize[0] + bboxes[:, 3] /= resize[1] + return bboxes + +def process_bboxes(rclasses, rscores, rbboxes, rbbox_img = (0.0, 0.0, 1.0, 1.0), + top_k=400, nms_threshold=0.5): + """Process the bboxes including sort and nms""" + rbboxes = bboxes_clip(rbbox_img, rbboxes) + rclasses, rscores, rbboxes = bboxes_sort(rclasses, rscores, rbboxes, top_k) + rclasses, rscores, rbboxes = bboxes_nms(rclasses, rscores, rbboxes, nms_threshold) + rbboxes = bboxes_resize(rbbox_img, rbboxes) + return rclasses, rscores, rbboxes + + + diff --git a/ObjectDetections/SSD/var_name.txt b/ObjectDetections/SSD/var_name.txt new file mode 100644 index 0000000..fe2c4a3 --- /dev/null +++ b/ObjectDetections/SSD/var_name.txt @@ -0,0 +1,71 @@ +ssd_300_vgg/conv1_1/kernel +ssd_300_vgg/conv1_1/bias +ssd_300_vgg/conv1_2/kernel +ssd_300_vgg/conv1_2/bias +ssd_300_vgg/conv2_1/kernel +ssd_300_vgg/conv2_1/bias +ssd_300_vgg/conv2_2/kernel +ssd_300_vgg/conv2_2/bias +ssd_300_vgg/conv3_1/kernel +ssd_300_vgg/conv3_1/bias +ssd_300_vgg/conv3_2/kernel +ssd_300_vgg/conv3_2/bias +ssd_300_vgg/conv3_3/kernel +ssd_300_vgg/conv3_3/bias +ssd_300_vgg/conv4_1/kernel +ssd_300_vgg/conv4_1/bias +ssd_300_vgg/conv4_2/kernel +ssd_300_vgg/conv4_2/bias +ssd_300_vgg/conv4_3/kernel +ssd_300_vgg/conv4_3/bias +ssd_300_vgg/conv5_1/kernel +ssd_300_vgg/conv5_1/bias +ssd_300_vgg/conv5_2/kernel +ssd_300_vgg/conv5_2/bias +ssd_300_vgg/conv5_3/kernel +ssd_300_vgg/conv5_3/bias +ssd_300_vgg/conv6/kernel +ssd_300_vgg/conv6/bias +ssd_300_vgg/conv7/kernel +ssd_300_vgg/conv7/bias +ssd_300_vgg/conv8_1x1/kernel +ssd_300_vgg/conv8_1x1/bias +ssd_300_vgg/conv8_3x3/kernel +ssd_300_vgg/conv8_3x3/bias +ssd_300_vgg/conv9_1x1/kernel +ssd_300_vgg/conv9_1x1/bias +ssd_300_vgg/conv9_3x3/kernel +ssd_300_vgg/conv9_3x3/bias +ssd_300_vgg/conv10_1x1/kernel +ssd_300_vgg/conv10_1x1/bias +ssd_300_vgg/conv10_3x3/kernel +ssd_300_vgg/conv10_3x3/bias +ssd_300_vgg/conv11_1x1/kernel +ssd_300_vgg/conv11_1x1/bias +ssd_300_vgg/conv11_3x3/kernel +ssd_300_vgg/conv11_3x3/bias +ssd_300_vgg/block4_box/L2Normalization/gamma +ssd_300_vgg/block4_box/conv_loc/kernel +ssd_300_vgg/block4_box/conv_loc/bias +ssd_300_vgg/block4_box/conv_cls/kernel +ssd_300_vgg/block4_box/conv_cls/bias +ssd_300_vgg/block7_box/conv_loc/kernel +ssd_300_vgg/block7_box/conv_loc/bias +ssd_300_vgg/block7_box/conv_cls/kernel +ssd_300_vgg/block7_box/conv_cls/bias +ssd_300_vgg/block8_box/conv_loc/kernel +ssd_300_vgg/block8_box/conv_loc/bias +ssd_300_vgg/block8_box/conv_cls/kernel +ssd_300_vgg/block8_box/conv_cls/bias +ssd_300_vgg/block9_box/conv_loc/kernel +ssd_300_vgg/block9_box/conv_loc/bias +ssd_300_vgg/block9_box/conv_cls/kernel +ssd_300_vgg/block9_box/conv_cls/bias +ssd_300_vgg/block10_box/conv_loc/kernel +ssd_300_vgg/block10_box/conv_loc/bias +ssd_300_vgg/block10_box/conv_cls/kernel +ssd_300_vgg/block10_box/conv_cls/bias +ssd_300_vgg/block11_box/conv_loc/kernel +ssd_300_vgg/block11_box/conv_loc/bias +ssd_300_vgg/block11_box/conv_cls/kernel +ssd_300_vgg/block11_box/conv_cls/bias diff --git a/ObjectDetections/SSD/visualization.py b/ObjectDetections/SSD/visualization.py new file mode 100644 index 0000000..d6ace07 --- /dev/null +++ b/ObjectDetections/SSD/visualization.py @@ -0,0 +1,119 @@ +# Copyright 2017 Paul Balanca. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +import cv2 +import random + +import matplotlib.pyplot as plt +import matplotlib.image as mpimg +import matplotlib.cm as mpcm + + +# class names +CLASSES = ["aeroplane", "bicycle", "bird", "boat", "bottle", + "bus", "car", "cat", "chair", "cow", "diningtable", + "dog", "horse", "motorbike", "person", "pottedplant", + "sheep", "sofa", "train","tvmonitor"] +# =========================================================================== # +# Some colormaps. +# =========================================================================== # +def colors_subselect(colors, num_classes=21): + dt = len(colors) // num_classes + sub_colors = [] + for i in range(num_classes): + color = colors[i*dt] + if isinstance(color[0], float): + sub_colors.append([int(c * 255) for c in color]) + else: + sub_colors.append([c for c in color]) + return sub_colors + +colors_plasma = colors_subselect(mpcm.plasma.colors, num_classes=21) +colors_tableau = [(255, 255, 255), (31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120), + (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150), + (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148), + (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199), + (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)] + + +# =========================================================================== # +# OpenCV drawing. +# =========================================================================== # +def draw_lines(img, lines, color=[255, 0, 0], thickness=2): + """Draw a collection of lines on an image. + """ + for line in lines: + for x1, y1, x2, y2 in line: + cv2.line(img, (x1, y1), (x2, y2), color, thickness) + + +def draw_rectangle(img, p1, p2, color=[255, 0, 0], thickness=2): + cv2.rectangle(img, p1[::-1], p2[::-1], color, thickness) + + +def draw_bbox(img, bbox, shape, label, color=[255, 0, 0], thickness=2): + p1 = (int(bbox[0] * shape[0]), int(bbox[1] * shape[1])) + p2 = (int(bbox[2] * shape[0]), int(bbox[3] * shape[1])) + cv2.rectangle(img, p1[::-1], p2[::-1], color, thickness) + p1 = (p1[0]+15, p1[1]) + cv2.putText(img, str(label), p1[::-1], cv2.FONT_HERSHEY_DUPLEX, 0.5, color, 1) + + +def bboxes_draw_on_img(img, classes, scores, bboxes, colors, thickness=2): + shape = img.shape + for i in range(bboxes.shape[0]): + bbox = bboxes[i] + color = colors[classes[i]] + # Draw bounding box... + p1 = (int(bbox[0] * shape[0]), int(bbox[1] * shape[1])) + p2 = (int(bbox[2] * shape[0]), int(bbox[3] * shape[1])) + cv2.rectangle(img, p1[::-1], p2[::-1], color, thickness) + # Draw text... + s = '%s/%.3f' % (classes[i], scores[i]) + p1 = (p1[0]-5, p1[1]) + cv2.putText(img, s, p1[::-1], cv2.FONT_HERSHEY_DUPLEX, 0.4, color, 1) + + +# =========================================================================== # +# Matplotlib show... +# =========================================================================== # +def plt_bboxes(img, classes, scores, bboxes, figsize=(10,10), linewidth=1.5, show_class_name=True): + """Visualize bounding boxes. 
Largely inspired by SSD-MXNET! + """ + fig = plt.figure(figsize=figsize) + plt.imshow(img) + height = img.shape[0] + width = img.shape[1] + colors = dict() + for i in range(classes.shape[0]): + cls_id = int(classes[i]) + if cls_id >= 0: + score = scores[i] + if cls_id not in colors: + colors[cls_id] = (random.random(), random.random(), random.random()) + ymin = int(bboxes[i, 0] * height) + xmin = int(bboxes[i, 1] * width) + ymax = int(bboxes[i, 2] * height) + xmax = int(bboxes[i, 3] * width) + rect = plt.Rectangle((xmin, ymin), xmax - xmin, + ymax - ymin, fill=False, + edgecolor=colors[cls_id], + linewidth=linewidth) + plt.gca().add_patch(rect) + class_name = CLASSES[cls_id-1] if show_class_name else str(cls_id) + plt.gca().text(xmin, ymin - 2, + '{:s} | {:.3f}'.format(class_name, score), + bbox=dict(facecolor=colors[cls_id], alpha=0.5), + fontsize=12, color='white') + plt.show() diff --git a/ObjectDetections/yolo/test_images/car.jpg b/ObjectDetections/yolo/test_images/car.jpg new file mode 100644 index 0000000..8c24a69 Binary files /dev/null and b/ObjectDetections/yolo/test_images/car.jpg differ diff --git a/ObjectDetections/yolo/test_images/cat.jpg b/ObjectDetections/yolo/test_images/cat.jpg new file mode 100644 index 0000000..f4f2e76 Binary files /dev/null and b/ObjectDetections/yolo/test_images/cat.jpg differ diff --git a/ObjectDetections/yolo/test_images/person.jpg b/ObjectDetections/yolo/test_images/person.jpg new file mode 100644 index 0000000..61d377f Binary files /dev/null and b/ObjectDetections/yolo/test_images/person.jpg differ diff --git a/ObjectDetections/yolo/yolo.py b/ObjectDetections/yolo/yolo.py new file mode 100644 index 0000000..5fa6240 --- /dev/null +++ b/ObjectDetections/yolo/yolo.py @@ -0,0 +1,259 @@ +""" +Yolo V1 by tensorflow +""" + +import numpy as np +import tensorflow as tf +import cv2 + + +def leak_relu(x, alpha=0.1): + return tf.maximum(alpha * x, x) + +class Yolo(object): + def __init__(self, weights_file): + self.verbose = True + # detection params + self.S = 7 # cell size + self.B = 2 # boxes_per_cell + self.classes = ["aeroplane", "bicycle", "bird", "boat", "bottle", + "bus", "car", "cat", "chair", "cow", "diningtable", + "dog", "horse", "motorbike", "person", "pottedplant", + "sheep", "sofa", "train","tvmonitor"] + self.C = len(self.classes) # number of classes + # offset for box center (top left point of each cell) + self.x_offset = np.transpose(np.reshape(np.array([np.arange(self.S)]*self.S*self.B), + [self.B, self.S, self.S]), [1, 2, 0]) + self.y_offset = np.transpose(self.x_offset, [1, 0, 2]) + + self.threshold = 0.2 # confidence scores threshold + self.iou_threshold = 0.5 + + self.sess = tf.Session() + self._build_net() + self._load_weights(weights_file) + + def _build_net(self): + """build the network""" + if self.verbose: + print("Start to build the network ...") + self.images = tf.placeholder(tf.float32, [None, 448, 448, 3]) + net = self._conv_layer(self.images, 1, 64, 7, 2) + net = self._maxpool_layer(net, 1, 2, 2) + net = self._conv_layer(net, 2, 192, 3, 1) + net = self._maxpool_layer(net, 2, 2, 2) + net = self._conv_layer(net, 3, 128, 1, 1) + net = self._conv_layer(net, 4, 256, 3, 1) + net = self._conv_layer(net, 5, 256, 1, 1) + net = self._conv_layer(net, 6, 512, 3, 1) + net = self._maxpool_layer(net, 6, 2, 2) + net = self._conv_layer(net, 7, 256, 1, 1) + net = self._conv_layer(net, 8, 512, 3, 1) + net = self._conv_layer(net, 9, 256, 1, 1) + net = self._conv_layer(net, 10, 512, 3, 1) + net = self._conv_layer(net, 11, 256, 1, 1) + net = 
self._conv_layer(net, 12, 512, 3, 1) + net = self._conv_layer(net, 13, 256, 1, 1) + net = self._conv_layer(net, 14, 512, 3, 1) + net = self._conv_layer(net, 15, 512, 1, 1) + net = self._conv_layer(net, 16, 1024, 3, 1) + net = self._maxpool_layer(net, 16, 2, 2) + net = self._conv_layer(net, 17, 512, 1, 1) + net = self._conv_layer(net, 18, 1024, 3, 1) + net = self._conv_layer(net, 19, 512, 1, 1) + net = self._conv_layer(net, 20, 1024, 3, 1) + net = self._conv_layer(net, 21, 1024, 3, 1) + net = self._conv_layer(net, 22, 1024, 3, 2) + net = self._conv_layer(net, 23, 1024, 3, 1) + net = self._conv_layer(net, 24, 1024, 3, 1) + net = self._flatten(net) + net = self._fc_layer(net, 25, 512, activation=leak_relu) + net = self._fc_layer(net, 26, 4096, activation=leak_relu) + net = self._fc_layer(net, 27, self.S*self.S*(self.C+5*self.B)) + self.predicts = net + + def _conv_layer(self, x, id, num_filters, filter_size, stride): + """Conv layer""" + in_channels = x.get_shape().as_list()[-1] + weight = tf.Variable(tf.truncated_normal([filter_size, filter_size, + in_channels, num_filters], stddev=0.1)) + bias = tf.Variable(tf.zeros([num_filters,])) + # padding, note: not using padding="SAME" + pad_size = filter_size // 2 + pad_mat = np.array([[0, 0], [pad_size, pad_size], [pad_size, pad_size], [0, 0]]) + x_pad = tf.pad(x, pad_mat) + conv = tf.nn.conv2d(x_pad, weight, strides=[1, stride, stride, 1], padding="VALID") + output = leak_relu(tf.nn.bias_add(conv, bias)) + if self.verbose: + print(" Layer %d: type=Conv, num_filter=%d, filter_size=%d, stride=%d, output_shape=%s" \ + % (id, num_filters, filter_size, stride, str(output.get_shape()))) + return output + + def _fc_layer(self, x, id, num_out, activation=None): + """fully connected layer""" + num_in = x.get_shape().as_list()[-1] + weight = tf.Variable(tf.truncated_normal([num_in, num_out], stddev=0.1)) + bias = tf.Variable(tf.zeros([num_out,])) + output = tf.nn.xw_plus_b(x, weight, bias) + if activation: + output = activation(output) + if self.verbose: + print(" Layer %d: type=Fc, num_out=%d, output_shape=%s" \ + % (id, num_out, str(output.get_shape()))) + return output + + def _maxpool_layer(self, x, id, pool_size, stride): + output = tf.nn.max_pool(x, [1, pool_size, pool_size, 1], + strides=[1, stride, stride, 1], padding="SAME") + if self.verbose: + print(" Layer %d: type=MaxPool, pool_size=%d, stride=%d, output_shape=%s" \ + % (id, pool_size, stride, str(output.get_shape()))) + return output + + def _flatten(self, x): + """flatten the x""" + tran_x = tf.transpose(x, [0, 3, 1, 2]) # channle first mode + nums = np.product(x.get_shape().as_list()[1:]) + return tf.reshape(tran_x, [-1, nums]) + + def _load_weights(self, weights_file): + """Load weights from file""" + if self.verbose: + print("Start to load weights from file:%s" % (weights_file)) + saver = tf.train.Saver() + saver.restore(self.sess, weights_file) + + def detect_from_file(self, image_file, imshow=True, deteted_boxes_file="boxes.txt", + detected_image_file="detected_image.jpg"): + """Do detection given a image file""" + # read image + image = cv2.imread(image_file) + img_h, img_w, _ = image.shape + predicts = self._detect_from_image(image) + predict_boxes = self._interpret_predicts(predicts, img_h, img_w) + self.show_results(image, predict_boxes, imshow, deteted_boxes_file, detected_image_file) + + def _detect_from_image(self, image): + """Do detection given a cv image""" + img_resized = cv2.resize(image, (448, 448)) + img_RGB = cv2.cvtColor(img_resized, cv2.COLOR_BGR2RGB) + img_resized_np = 
np.asarray(img_RGB) + _images = np.zeros((1, 448, 448, 3), dtype=np.float32) + _images[0] = (img_resized_np / 255.0) * 2.0 - 1.0 + predicts = self.sess.run(self.predicts, feed_dict={self.images: _images})[0] + return predicts + + def _interpret_predicts(self, predicts, img_h, img_w): + """Interpret the predicts and get the detetction boxes""" + idx1 = self.S*self.S*self.C + idx2 = idx1 + self.S*self.S*self.B + # class prediction + class_probs = np.reshape(predicts[:idx1], [self.S, self.S, self.C]) + # confidence + confs = np.reshape(predicts[idx1:idx2], [self.S, self.S, self.B]) + # boxes -> (x, y, w, h) + boxes = np.reshape(predicts[idx2:], [self.S, self.S, self.B, 4]) + + # convert the x, y to the coordinates relative to the top left point of the image + boxes[:, :, :, 0] += self.x_offset + boxes[:, :, :, 1] += self.y_offset + boxes[:, :, :, :2] /= self.S + + # the predictions of w, h are the square root + boxes[:, :, :, 2:] = np.square(boxes[:, :, :, 2:]) + + # multiply the width and height of image + boxes[:, :, :, 0] *= img_w + boxes[:, :, :, 1] *= img_h + boxes[:, :, :, 2] *= img_w + boxes[:, :, :, 3] *= img_h + + # class-specific confidence scores [S, S, B, C] + scores = np.expand_dims(confs, -1) * np.expand_dims(class_probs, 2) + + scores = np.reshape(scores, [-1, self.C]) # [S*S*B, C] + boxes = np.reshape(boxes, [-1, 4]) # [S*S*B, 4] + + # filter the boxes when score < threhold + scores[scores < self.threshold] = 0.0 + + # non max suppression + self._non_max_suppression(scores, boxes) + + # report the boxes + predict_boxes = [] # (class, x, y, w, h, scores) + max_idxs = np.argmax(scores, axis=1) + for i in range(len(scores)): + max_idx = max_idxs[i] + if scores[i, max_idx] > 0.0: + predict_boxes.append((self.classes[max_idx], boxes[i, 0], boxes[i, 1], + boxes[i, 2], boxes[i, 3], scores[i, max_idx])) + return predict_boxes + + def _non_max_suppression(self, scores, boxes): + """Non max suppression""" + # for each class + for c in range(self.C): + sorted_idxs = np.argsort(scores[:, c]) + last = len(sorted_idxs) - 1 + while last > 0: + if scores[sorted_idxs[last], c] < 1e-6: + break + for i in range(last): + if scores[sorted_idxs[i], c] < 1e-6: + continue + if self._iou(boxes[sorted_idxs[i]], boxes[sorted_idxs[last]]) > self.iou_threshold: + scores[sorted_idxs[i], c] = 0.0 + last -= 1 + + def _iou(self, box1, box2): + """Compute the iou of two boxes""" + + inter_w = np.minimum(box1[0]+0.5*box1[2], box2[0]+0.5*box2[2]) - \ + np.maximum(box1[0]-0.5*box2[2], box2[0]-0.5*box2[2]) + inter_h = np.minimum(box1[1]+0.5*box1[3], box2[1]+0.5*box2[3]) - \ + np.maximum(box1[1]-0.5*box2[3], box2[1]-0.5*box2[3]) + if inter_h < 0 or inter_w < 0: + inter = 0 + else: + inter = inter_w * inter_h + union = box1[2]*box1[3] + box2[2]*box2[3] - inter + return inter / union + + def show_results(self, image, results, imshow=True, deteted_boxes_file=None, + detected_image_file=None): + """Show the detection boxes""" + img_cp = image.copy() + if deteted_boxes_file: + f = open(deteted_boxes_file, "w") + # draw boxes + for i in range(len(results)): + x = int(results[i][1]) + y = int(results[i][2]) + w = int(results[i][3]) // 2 + h = int(results[i][4]) // 2 + if self.verbose: + print(" class: %s, [x, y, w, h]=[%d, %d, %d, %d], confidence=%f" % (results[i][0], + x, y, w, h, results[i][-1])) + + cv2.rectangle(img_cp, (x - w, y - h), (x + w, y + h), (0, 255, 0), 2) + cv2.rectangle(img_cp, (x - w, y - h - 20), (x + w, y - h), (125, 125, 125), -1) + cv2.putText(img_cp, results[i][0] + ' : %.2f' % results[i][5], (x - w 
+ 5, y - h - 7), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1) + if deteted_boxes_file: + f.write(results[i][0] + ',' + str(x) + ',' + str(y) + ',' + + str(w) + ',' + str(h)+',' + str(results[i][5]) + '\n') + if imshow: + cv2.imshow('YOLO_small detection', img_cp) + cv2.waitKey(1) + if detected_image_file: + cv2.imwrite(detected_image_file, img_cp) + if deteted_boxes_file: + f.close() + +if __name__ == "__main__": + yolo_net = Yolo("./weights/YOLO_small.ckpt") + yolo_net.detect_from_file("./test/car.jpg") + + + diff --git a/ObjectDetections/yolo/yolo_tf.py b/ObjectDetections/yolo/yolo_tf.py new file mode 100644 index 0000000..24b32cb --- /dev/null +++ b/ObjectDetections/yolo/yolo_tf.py @@ -0,0 +1,237 @@ +""" +Yolo V1 by tensorflow +""" + +import numpy as np +import tensorflow as tf +import cv2 + + +def leak_relu(x, alpha=0.1): + return tf.maximum(alpha * x, x) + +class Yolo(object): + def __init__(self, weights_file, verbose=True): + self.verbose = verbose + # detection params + self.S = 7 # cell size + self.B = 2 # boxes_per_cell + self.classes = ["aeroplane", "bicycle", "bird", "boat", "bottle", + "bus", "car", "cat", "chair", "cow", "diningtable", + "dog", "horse", "motorbike", "person", "pottedplant", + "sheep", "sofa", "train","tvmonitor"] + self.C = len(self.classes) # number of classes + # offset for box center (top left point of each cell) + self.x_offset = np.transpose(np.reshape(np.array([np.arange(self.S)]*self.S*self.B), + [self.B, self.S, self.S]), [1, 2, 0]) + self.y_offset = np.transpose(self.x_offset, [1, 0, 2]) + + self.threshold = 0.2 # confidence scores threhold + self.iou_threshold = 0.4 + # the maximum number of boxes to be selected by non max suppression + self.max_output_size = 10 + + self.sess = tf.Session() + self._build_net() + self._build_detector() + self._load_weights(weights_file) + + def _build_net(self): + """build the network""" + if self.verbose: + print("Start to build the network ...") + self.images = tf.placeholder(tf.float32, [None, 448, 448, 3]) + net = self._conv_layer(self.images, 1, 64, 7, 2) + net = self._maxpool_layer(net, 1, 2, 2) + net = self._conv_layer(net, 2, 192, 3, 1) + net = self._maxpool_layer(net, 2, 2, 2) + net = self._conv_layer(net, 3, 128, 1, 1) + net = self._conv_layer(net, 4, 256, 3, 1) + net = self._conv_layer(net, 5, 256, 1, 1) + net = self._conv_layer(net, 6, 512, 3, 1) + net = self._maxpool_layer(net, 6, 2, 2) + net = self._conv_layer(net, 7, 256, 1, 1) + net = self._conv_layer(net, 8, 512, 3, 1) + net = self._conv_layer(net, 9, 256, 1, 1) + net = self._conv_layer(net, 10, 512, 3, 1) + net = self._conv_layer(net, 11, 256, 1, 1) + net = self._conv_layer(net, 12, 512, 3, 1) + net = self._conv_layer(net, 13, 256, 1, 1) + net = self._conv_layer(net, 14, 512, 3, 1) + net = self._conv_layer(net, 15, 512, 1, 1) + net = self._conv_layer(net, 16, 1024, 3, 1) + net = self._maxpool_layer(net, 16, 2, 2) + net = self._conv_layer(net, 17, 512, 1, 1) + net = self._conv_layer(net, 18, 1024, 3, 1) + net = self._conv_layer(net, 19, 512, 1, 1) + net = self._conv_layer(net, 20, 1024, 3, 1) + net = self._conv_layer(net, 21, 1024, 3, 1) + net = self._conv_layer(net, 22, 1024, 3, 2) + net = self._conv_layer(net, 23, 1024, 3, 1) + net = self._conv_layer(net, 24, 1024, 3, 1) + net = self._flatten(net) + net = self._fc_layer(net, 25, 512, activation=leak_relu) + net = self._fc_layer(net, 26, 4096, activation=leak_relu) + net = self._fc_layer(net, 27, self.S*self.S*(self.C+5*self.B)) + self.predicts = net + + def _build_detector(self): + """Interpret 
the net output and get the predicted boxes""" + # the width and height of orignal image + self.width = tf.placeholder(tf.float32, name="img_w") + self.height = tf.placeholder(tf.float32, name="img_h") + # get class prob, confidence, boxes from net output + idx1 = self.S * self.S * self.C + idx2 = idx1 + self.S * self.S * self.B + # class prediction + class_probs = tf.reshape(self.predicts[0, :idx1], [self.S, self.S, self.C]) + # confidence + confs = tf.reshape(self.predicts[0, idx1:idx2], [self.S, self.S, self.B]) + # boxes -> (x, y, w, h) + boxes = tf.reshape(self.predicts[0, idx2:], [self.S, self.S, self.B, 4]) + + # convert the x, y to the coordinates relative to the top left point of the image + # the predictions of w, h are the square root + # multiply the width and height of image + boxes = tf.stack([(boxes[:, :, :, 0] + tf.constant(self.x_offset, dtype=tf.float32)) / self.S * self.width, + (boxes[:, :, :, 1] + tf.constant(self.y_offset, dtype=tf.float32)) / self.S * self.height, + tf.square(boxes[:, :, :, 2]) * self.width, + tf.square(boxes[:, :, :, 3]) * self.height], axis=3) + + # class-specific confidence scores [S, S, B, C] + scores = tf.expand_dims(confs, -1) * tf.expand_dims(class_probs, 2) + + scores = tf.reshape(scores, [-1, self.C]) # [S*S*B, C] + boxes = tf.reshape(boxes, [-1, 4]) # [S*S*B, 4] + + # find each box class, only select the max score + box_classes = tf.argmax(scores, axis=1) + box_class_scores = tf.reduce_max(scores, axis=1) + + # filter the boxes by the score threshold + filter_mask = box_class_scores >= self.threshold + scores = tf.boolean_mask(box_class_scores, filter_mask) + boxes = tf.boolean_mask(boxes, filter_mask) + box_classes = tf.boolean_mask(box_classes, filter_mask) + + # non max suppression (do not distinguish different classes) + # ref: https://tensorflow.google.cn/api_docs/python/tf/image/non_max_suppression + # box (x, y, w, h) -> box (x1, y1, x2, y2) + _boxes = tf.stack([boxes[:, 0] - 0.5 * boxes[:, 2], boxes[:, 1] - 0.5 * boxes[:, 3], + boxes[:, 0] + 0.5 * boxes[:, 2], boxes[:, 1] + 0.5 * boxes[:, 3]], axis=1) + nms_indices = tf.image.non_max_suppression(_boxes, scores, + self.max_output_size, self.iou_threshold) + self.scores = tf.gather(scores, nms_indices) + self.boxes = tf.gather(boxes, nms_indices) + self.box_classes = tf.gather(box_classes, nms_indices) + + def _conv_layer(self, x, id, num_filters, filter_size, stride): + """Conv layer""" + in_channels = x.get_shape().as_list()[-1] + weight = tf.Variable(tf.truncated_normal([filter_size, filter_size, + in_channels, num_filters], stddev=0.1)) + bias = tf.Variable(tf.zeros([num_filters,])) + # padding, note: not using padding="SAME" + pad_size = filter_size // 2 + pad_mat = np.array([[0, 0], [pad_size, pad_size], [pad_size, pad_size], [0, 0]]) + x_pad = tf.pad(x, pad_mat) + conv = tf.nn.conv2d(x_pad, weight, strides=[1, stride, stride, 1], padding="VALID") + output = leak_relu(tf.nn.bias_add(conv, bias)) + if self.verbose: + print(" Layer %d: type=Conv, num_filter=%d, filter_size=%d, stride=%d, output_shape=%s" \ + % (id, num_filters, filter_size, stride, str(output.get_shape()))) + return output + + def _fc_layer(self, x, id, num_out, activation=None): + """fully connected layer""" + num_in = x.get_shape().as_list()[-1] + weight = tf.Variable(tf.truncated_normal([num_in, num_out], stddev=0.1)) + bias = tf.Variable(tf.zeros([num_out,])) + output = tf.nn.xw_plus_b(x, weight, bias) + if activation: + output = activation(output) + if self.verbose: + print(" Layer %d: type=Fc, num_out=%d, 
output_shape=%s" \ + % (id, num_out, str(output.get_shape()))) + return output + + def _maxpool_layer(self, x, id, pool_size, stride): + output = tf.nn.max_pool(x, [1, pool_size, pool_size, 1], + strides=[1, stride, stride, 1], padding="SAME") + if self.verbose: + print(" Layer %d: type=MaxPool, pool_size=%d, stride=%d, output_shape=%s" \ + % (id, pool_size, stride, str(output.get_shape()))) + return output + + def _flatten(self, x): + """flatten the x""" + tran_x = tf.transpose(x, [0, 3, 1, 2]) # channle first mode + nums = np.product(x.get_shape().as_list()[1:]) + return tf.reshape(tran_x, [-1, nums]) + + def _load_weights(self, weights_file): + """Load weights from file""" + if self.verbose: + print("Start to load weights from file:%s" % (weights_file)) + saver = tf.train.Saver() + saver.restore(self.sess, weights_file) + + def detect_from_file(self, image_file, imshow=True, deteted_boxes_file="boxes.txt", + detected_image_file="detected_image.jpg"): + """Do detection given a image file""" + # read image + image = cv2.imread(image_file) + img_h, img_w, _ = image.shape + scores, boxes, box_classes = self._detect_from_image(image) + predict_boxes = [] + for i in range(len(scores)): + predict_boxes.append((self.classes[box_classes[i]], boxes[i, 0], + boxes[i, 1], boxes[i, 2], boxes[i, 3], scores[i])) + self.show_results(image, predict_boxes, imshow, deteted_boxes_file, detected_image_file) + + def _detect_from_image(self, image): + """Do detection given a cv image""" + img_h, img_w, _ = image.shape + img_resized = cv2.resize(image, (448, 448)) + img_RGB = cv2.cvtColor(img_resized, cv2.COLOR_BGR2RGB) + img_resized_np = np.asarray(img_RGB) + _images = np.zeros((1, 448, 448, 3), dtype=np.float32) + _images[0] = (img_resized_np / 255.0) * 2.0 - 1.0 + scores, boxes, box_classes = self.sess.run([self.scores, self.boxes, self.box_classes], + feed_dict={self.images: _images, self.width: img_w, self.height: img_h}) + return scores, boxes, box_classes + + def show_results(self, image, results, imshow=True, deteted_boxes_file=None, + detected_image_file=None): + """Show the detection boxes""" + img_cp = image.copy() + if deteted_boxes_file: + f = open(deteted_boxes_file, "w") + # draw boxes + for i in range(len(results)): + x = int(results[i][1]) + y = int(results[i][2]) + w = int(results[i][3]) // 2 + h = int(results[i][4]) // 2 + if self.verbose: + print(" class: %s, [x, y, w, h]=[%d, %d, %d, %d], confidence=%f" % (results[i][0], + x, y, w, h, results[i][-1])) + + cv2.rectangle(img_cp, (x - w, y - h), (x + w, y + h), (0, 255, 0), 2) + cv2.rectangle(img_cp, (x - w, y - h - 20), (x + w, y - h), (125, 125, 125), -1) + cv2.putText(img_cp, results[i][0] + ' : %.2f' % results[i][5], (x - w + 5, y - h - 7), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1) + if deteted_boxes_file: + f.write(results[i][0] + ',' + str(x) + ',' + str(y) + ',' + + str(w) + ',' + str(h)+',' + str(results[i][5]) + '\n') + if imshow: + cv2.imshow('YOLO_small detection', img_cp) + cv2.waitKey(1) + if detected_image_file: + cv2.imwrite(detected_image_file, img_cp) + if deteted_boxes_file: + f.close() + +if __name__ == "__main__": + yolo_net = Yolo("./weights/YOLO_small.ckpt") + yolo_net.detect_from_file("./test/car.jpg") diff --git a/ObjectDetections/yolo2/config.py b/ObjectDetections/yolo2/config.py new file mode 100644 index 0000000..ad7fa91 --- /dev/null +++ b/ObjectDetections/yolo2/config.py @@ -0,0 +1,25 @@ +""" +Yolov2 anchors and coco classes +""" + +""" +anchors = [[0.738768, 0.874946], + [2.42204, 2.65704], + [4.30971, 
7.04493], + [10.246, 4.59428], + [12.6868, 11.8741]] +""" +anchors = [[0.57273, 0.677385], + [1.87446, 2.06253], + [3.33843, 5.47434], + [7.88282, 3.52778], + [9.77052, 9.16828]] + +def read_coco_labels(): + f = open("./data/coco_classes.txt") + class_names = [] + for l in f.readlines(): + class_names.append(l[:-1]) + return class_names + +class_names = read_coco_labels() \ No newline at end of file diff --git a/ObjectDetections/yolo2/data/coco_classes.txt b/ObjectDetections/yolo2/data/coco_classes.txt new file mode 100644 index 0000000..ca76c80 --- /dev/null +++ b/ObjectDetections/yolo2/data/coco_classes.txt @@ -0,0 +1,80 @@ +person +bicycle +car +motorbike +aeroplane +bus +train +truck +boat +traffic light +fire hydrant +stop sign +parking meter +bench +bird +cat +dog +horse +sheep +cow +elephant +bear +zebra +giraffe +backpack +umbrella +handbag +tie +suitcase +frisbee +skis +snowboard +sports ball +kite +baseball bat +baseball glove +skateboard +surfboard +tennis racket +bottle +wine glass +cup +fork +knife +spoon +bowl +banana +apple +sandwich +orange +broccoli +carrot +hot dog +pizza +donut +cake +chair +sofa +pottedplant +bed +diningtable +toilet +tvmonitor +laptop +mouse +remote +keyboard +cell phone +microwave +oven +toaster +sink +refrigerator +book +clock +vase +scissors +teddy bear +hair drier +toothbrush diff --git a/ObjectDetections/yolo2/demo.py b/ObjectDetections/yolo2/demo.py new file mode 100644 index 0000000..4a7183c --- /dev/null +++ b/ObjectDetections/yolo2/demo.py @@ -0,0 +1,50 @@ +""" +Demo for yolov2 +""" + +import numpy as np +import tensorflow as tf +import cv2 +from PIL import Image + +from model import darknet +from detect_ops import decode +from utils import preprocess_image, postprocess, draw_detection +from config import anchors, class_names + + +input_size = (416, 416) +image_file = "./images/car.jpg" +image = cv2.imread(image_file) +image_shape = image.shape[:2] +image_cp = preprocess_image(image, input_size) +""" +image = Image.open(image_file) +image_cp = image.resize(input_size, Image.BICUBIC) +image_cp = np.array(image_cp, dtype=np.float32)/255.0 +image_cp = np.expand_dims(image_cp, 0) +#print(image_cp) +""" + + +images = tf.placeholder(tf.float32, [1, input_size[0], input_size[1], 3]) +detection_feat = darknet(images) +feat_sizes = input_size[0] // 32, input_size[1] // 32 +detection_results = decode(detection_feat, feat_sizes, len(class_names), anchors) + +checkpoint_path = "./checkpoint_dir/yolo2_coco.ckpt" +saver = tf.train.Saver() +with tf.Session() as sess: + saver.restore(sess, checkpoint_path) + bboxes, obj_probs, class_probs = sess.run(detection_results, feed_dict={images: image_cp}) + +bboxes, scores, class_inds = postprocess(bboxes, obj_probs, class_probs, + image_shape=image_shape) +img_detection = draw_detection(image, bboxes, scores, class_inds, class_names) +cv2.imwrite("detection.jpg", img_detection) +cv2.imshow("detection results", img_detection) + +cv2.waitKey(0) + + + diff --git a/ObjectDetections/yolo2/detect_ops.py b/ObjectDetections/yolo2/detect_ops.py new file mode 100644 index 0000000..6060ece --- /dev/null +++ b/ObjectDetections/yolo2/detect_ops.py @@ -0,0 +1,39 @@ +""" +Detection ops for Yolov2 +""" + +import tensorflow as tf +import numpy as np + + +def decode(detection_feat, feat_sizes=(13, 13), num_classes=80, + anchors=None): + """decode from the detection feature""" + H, W = feat_sizes + num_anchors = len(anchors) + detetion_results = tf.reshape(detection_feat, [-1, H * W, num_anchors, + num_classes + 5]) + + bbox_xy = 
tf.nn.sigmoid(detetion_results[:, :, :, 0:2]) + bbox_wh = tf.exp(detetion_results[:, :, :, 2:4]) + obj_probs = tf.nn.sigmoid(detetion_results[:, :, :, 4]) + class_probs = tf.nn.softmax(detetion_results[:, :, :, 5:]) + + anchors = tf.constant(anchors, dtype=tf.float32) + + height_ind = tf.range(H, dtype=tf.float32) + width_ind = tf.range(W, dtype=tf.float32) + x_offset, y_offset = tf.meshgrid(height_ind, width_ind) + x_offset = tf.reshape(x_offset, [1, -1, 1]) + y_offset = tf.reshape(y_offset, [1, -1, 1]) + + # decode + bbox_x = (bbox_xy[:, :, :, 0] + x_offset) / W + bbox_y = (bbox_xy[:, :, :, 1] + y_offset) / H + bbox_w = bbox_wh[:, :, :, 0] * anchors[:, 0] / W * 0.5 + bbox_h = bbox_wh[:, :, :, 1] * anchors[:, 1] / H * 0.5 + + bboxes = tf.stack([bbox_x - bbox_w, bbox_y - bbox_h, + bbox_x + bbox_w, bbox_y + bbox_h], axis=3) + + return bboxes, obj_probs, class_probs diff --git a/ObjectDetections/yolo2/loss.py b/ObjectDetections/yolo2/loss.py new file mode 100644 index 0000000..931359f --- /dev/null +++ b/ObjectDetections/yolo2/loss.py @@ -0,0 +1,86 @@ +""" +Loss function for YOLOv2 +""" + +import numpy as np +import tensorflow as tf + +def compute_loss(predictions, targets, anchors, scales, num_classes=20, feat_sizes=(13, 13)): + """ + Compute the loss of Yolov2 for training + """ + H, W = feat_sizes + C = num_classes + B = len(anchors) + anchors = tf.constant(anchors, dtype=tf.float32) + anchors = tf.reshape(anchors, [1, 1, B, 2]) + + sprob, sconf, snoob, scoor = scales # the scales for different parts + + _coords = targets["coords"] # ground truth [-1, H*W, B, 4] + _probs = targets["probs"] # class probability [-1, H*W, B, C] one hot + _confs = targets["confs"] # 1 for object, 0 for background, [-1, H*W, B] + + # decode the net output + predictions = tf.reshape(predictions, [-1, H, W, B, (5 + C)]) + coords = predictions[:, :, :, :, 0:4] # t_x, t_y, t_w, t_h + coords = tf.reshape(coords, [-1, H*W, B, 4]) + coords_xy = tf.nn.sigmoid(coords[:, :, :, 0:2]) # (0, 1) relative cell top left + coords_wh = tf.sqrt(tf.exp(coords[:, :, :, 2:4]) * anchors / + np.reshape([W, H], [1, 1, 1, 2])) # sqrt of w, h (0, 1) + coords = tf.concat([coords_xy, coords_wh], axis=3) # [batch_size, H*W, B, 4] + + confs = tf.nn.sigmoid(predictions[:, :, :, :, 4]) # object confidence + confs = tf.reshape(confs, [-1, H*W, B, 1]) + + probs = tf.nn.softmax(predictions[:, :, :, :, 5:]) # class probability + probs = tf.reshape(probs, [-1, H*W, B, C]) + + preds = tf.concat([coords, confs, probs], axis=3) # [-1, H*W, B, (4+1+C)] + + # match ground truths with anchors (predictions in fact) + # assign ground truths to the predictions with the best IOU (select 1 among 5 anchors) + wh = tf.pow(coords[:, :, :, 2:4], 2) * np.reshape([W, H], [1, 1, 1, 2]) + areas = wh[:, :, :, 0] * wh[:, :, :, 1] + centers = coords[:, :, :, 0:2] + up_left, down_right = centers - (wh * 0.5), centers + (wh * 0.5) + + # the ground truth + _wh = tf.pow(_coords[:, :, :, 2:4], 2) * np.reshape([W, H], [1, 1, 1, 2]) + _areas = _wh[:, :, :, 0] * _wh[:, :, :, 1] + _centers = _coords[:, :, :, 0:2] + _up_left, _down_right = _centers - (_wh * 0.5), _centers + (_wh * 0.5) + + # compute IOU + inter_upleft = tf.maximum(up_left, _up_left) + inter_downright = tf.minimum(down_right, _down_right) + inter_wh = tf.maximum(inter_downright - inter_upleft, 0.0) + intersects = inter_wh[:, :, :, 0] * inter_wh[:, :, :, 1] + ious = tf.truediv(intersects, areas + _areas - intersects) + + best_iou_mask = tf.equal(ious, tf.reduce_max(ious, axis=2, keep_dims=True)) + best_iou_mask = 
tf.cast(best_iou_mask, tf.float32) + mask = best_iou_mask * _confs # [-1, H*W, B] + mask = tf.expand_dims(mask, -1) # [-1, H*W, B, 1] + + # compute weight terms + confs_w = snoob * (1 - mask) + sconf * mask + coords_w = scoor * mask + probs_w = sprob * mask + weights = tf.concat([coords_w, confs_w, probs_w], axis=3) + + truths = tf.concat([_coords, tf.expand_dims(_confs, -1), _probs], 3) + + loss = tf.pow(preds - truths, 2) * weights + loss = tf.reduce_sum(loss, axis=[1, 2, 3]) + loss = 0.5 * tf.reduce_mean(loss) + return loss + + + + + + + + + diff --git a/ObjectDetections/yolo2/model.png b/ObjectDetections/yolo2/model.png new file mode 100644 index 0000000..07ab142 Binary files /dev/null and b/ObjectDetections/yolo2/model.png differ diff --git a/ObjectDetections/yolo2/model.py b/ObjectDetections/yolo2/model.py new file mode 100644 index 0000000..697dd37 --- /dev/null +++ b/ObjectDetections/yolo2/model.py @@ -0,0 +1,89 @@ +""" +YOLOv2 implemented by Tensorflow, only for predicting +""" +import os + +import numpy as np +import tensorflow as tf + + + +######## basic layers ####### + +def leaky_relu(x): + return tf.nn.leaky_relu(x, alpha=0.1, name="leaky_relu") + +# Conv2d +def conv2d(x, filters, size, pad=0, stride=1, batch_normalize=1, + activation=leaky_relu, use_bias=False, name="conv2d"): + if pad > 0: + x = tf.pad(x, [[0, 0], [pad, pad], [pad, pad], [0, 0]]) + out = tf.layers.conv2d(x, filters, size, strides=stride, padding="VALID", + activation=None, use_bias=use_bias, name=name) + if batch_normalize == 1: + out = tf.layers.batch_normalization(out, axis=-1, momentum=0.9, + training=False, name=name+"_bn") + if activation: + out = activation(out) + return out + +# maxpool2d +def maxpool(x, size=2, stride=2, name="maxpool"): + return tf.layers.max_pooling2d(x, size, stride) + +# reorg layer +def reorg(x, stride): + return tf.extract_image_patches(x, [1, stride, stride, 1], + [1, stride, stride, 1], [1,1,1,1], padding="VALID") + + +def darknet(images, n_last_channels=425): + """Darknet19 for YOLOv2""" + net = conv2d(images, 32, 3, 1, name="conv1") + net = maxpool(net, name="pool1") + net = conv2d(net, 64, 3, 1, name="conv2") + net = maxpool(net, name="pool2") + net = conv2d(net, 128, 3, 1, name="conv3_1") + net = conv2d(net, 64, 1, name="conv3_2") + net = conv2d(net, 128, 3, 1, name="conv3_3") + net = maxpool(net, name="pool3") + net = conv2d(net, 256, 3, 1, name="conv4_1") + net = conv2d(net, 128, 1, name="conv4_2") + net = conv2d(net, 256, 3, 1, name="conv4_3") + net = maxpool(net, name="pool4") + net = conv2d(net, 512, 3, 1, name="conv5_1") + net = conv2d(net, 256, 1, name="conv5_2") + net = conv2d(net, 512, 3, 1, name="conv5_3") + net = conv2d(net, 256, 1, name="conv5_4") + net = conv2d(net, 512, 3, 1, name="conv5_5") + shortcut = net + net = maxpool(net, name="pool5") + net = conv2d(net, 1024, 3, 1, name="conv6_1") + net = conv2d(net, 512, 1, name="conv6_2") + net = conv2d(net, 1024, 3, 1, name="conv6_3") + net = conv2d(net, 512, 1, name="conv6_4") + net = conv2d(net, 1024, 3, 1, name="conv6_5") + # --------- + net = conv2d(net, 1024, 3, 1, name="conv7_1") + net = conv2d(net, 1024, 3, 1, name="conv7_2") + # shortcut + shortcut = conv2d(shortcut, 64, 1, name="conv_shortcut") + shortcut = reorg(shortcut, 2) + net = tf.concat([shortcut, net], axis=-1) + net = conv2d(net, 1024, 3, 1, name="conv8") + # detection layer + net = conv2d(net, n_last_channels, 1, batch_normalize=0, + activation=None, use_bias=True, name="conv_dec") + return net + + + +if __name__ == "__main__": + x = 
tf.random_normal([1, 416, 416, 3]) + model = darknet(x) + + saver = tf.train.Saver() + with tf.Session() as sess: + saver.restore(sess, "./checkpoint_dir/yolo2_coco.ckpt") + print(sess.run(model).shape) + diff --git a/ObjectDetections/yolo2/utils.py b/ObjectDetections/yolo2/utils.py new file mode 100644 index 0000000..5821a3b --- /dev/null +++ b/ObjectDetections/yolo2/utils.py @@ -0,0 +1,163 @@ +""" +Help functions for YOLOv2 +""" +import random +import colorsys + +import cv2 +import numpy as np + + + +############## preprocess image ################## + + +def preprocess_image(image, image_size=(416, 416)): + """Preprocess a image to inference""" + image_cp = np.copy(image).astype(np.float32) + # resize the image + image_rgb = cv2.cvtColor(image_cp, cv2.COLOR_BGR2RGB) + image_resized = cv2.resize(image_rgb, image_size) + # normalize + image_normalized = image_resized.astype(np.float32) / 255.0 + # expand the batch_size dim + image_expanded = np.expand_dims(image_normalized, axis=0) + return image_expanded + +def postprocess(bboxes, obj_probs, class_probs, image_shape=(416, 416), + threshold=0.5): + """post process the detection results""" + bboxes = np.reshape(bboxes, [-1, 4]) + bboxes[:, 0::2] *= float(image_shape[1]) + bboxes[:, 1::2] *= float(image_shape[0]) + bboxes = bboxes.astype(np.int32) + + # clip the bboxs + bbox_ref = [0, 0, image_shape[1] - 1, image_shape[0] - 1] + bboxes = bboxes_clip(bbox_ref, bboxes) + + obj_probs = np.reshape(obj_probs, [-1]) + class_probs = np.reshape(class_probs, [len(obj_probs), -1]) + class_inds = np.argmax(class_probs, axis=1) + class_probs = class_probs[np.arange(len(obj_probs)), class_inds] + scores = obj_probs * class_probs + + # filter bboxes with scores > threshold + keep_inds = scores > threshold + bboxes = bboxes[keep_inds] + scores = scores[keep_inds] + class_inds = class_inds[keep_inds] + + # sort top K + class_inds, scores, bboxes = bboxes_sort(class_inds, scores, bboxes) + # nms + class_inds, scores, bboxes = bboxes_nms(class_inds, scores, bboxes) + + return bboxes, scores, class_inds + +def draw_detection(im, bboxes, scores, cls_inds, labels, thr=0.3): + # for display + ############################ + # Generate colors for drawing bounding boxes. + hsv_tuples = [(x / float(len(labels)), 1., 1.) + for x in range(len(labels))] + colors = list(map(lambda x: colorsys.hsv_to_rgb(*x), hsv_tuples)) + colors = list( + map(lambda x: (int(x[0] * 255), int(x[1] * 255), int(x[2] * 255)), + colors)) + random.seed(10101) # Fixed seed for consistent colors across runs. + random.shuffle(colors) # Shuffle colors to decorrelate adjacent classes. + random.seed(None) # Reset seed to default. + # draw image + imgcv = np.copy(im) + h, w, _ = imgcv.shape + for i, box in enumerate(bboxes): + if scores[i] < thr: + continue + cls_indx = cls_inds[i] + + thick = int((h + w) / 300) + cv2.rectangle(imgcv, + (box[0], box[1]), (box[2], box[3]), + colors[cls_indx], thick) + mess = '%s: %.3f' % (labels[cls_indx], scores[i]) + if box[1] < 20: + text_loc = (box[0] + 2, box[1] + 15) + else: + text_loc = (box[0], box[1] - 10) + cv2.putText(imgcv, mess, text_loc, + cv2.FONT_HERSHEY_SIMPLEX, 1e-3 * h, colors[cls_indx], thick // 3) + + return imgcv + + +############## process bboxes ################## +def bboxes_clip(bbox_ref, bboxes): + """Clip bounding boxes with respect to reference bbox. 
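+    Boxes are (ymin, xmin, ymax, xmax); each coordinate is clamped elementwise
+    to the corresponding bound of bbox_ref.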
+ """ + bboxes = np.copy(bboxes) + bboxes = np.transpose(bboxes) + bbox_ref = np.transpose(bbox_ref) + bboxes[0] = np.maximum(bboxes[0], bbox_ref[0]) + bboxes[1] = np.maximum(bboxes[1], bbox_ref[1]) + bboxes[2] = np.minimum(bboxes[2], bbox_ref[2]) + bboxes[3] = np.minimum(bboxes[3], bbox_ref[3]) + bboxes = np.transpose(bboxes) + return bboxes + +def bboxes_sort(classes, scores, bboxes, top_k=400): + """Sort bounding boxes by decreasing order and keep only the top_k + """ + # if priority_inside: + # inside = (bboxes[:, 0] > margin) & (bboxes[:, 1] > margin) & \ + # (bboxes[:, 2] < 1-margin) & (bboxes[:, 3] < 1-margin) + # idxes = np.argsort(-scores) + # inside = inside[idxes] + # idxes = np.concatenate([idxes[inside], idxes[~inside]]) + idxes = np.argsort(-scores) + classes = classes[idxes][:top_k] + scores = scores[idxes][:top_k] + bboxes = bboxes[idxes][:top_k] + return classes, scores, bboxes + +def bboxes_iou(bboxes1, bboxes2): + """Computing iou between bboxes1 and bboxes2. + Note: bboxes1 and bboxes2 can be multi-dimensional, but should broacastable. + """ + bboxes1 = np.transpose(bboxes1) + bboxes2 = np.transpose(bboxes2) + # Intersection bbox and volume. + int_ymin = np.maximum(bboxes1[0], bboxes2[0]) + int_xmin = np.maximum(bboxes1[1], bboxes2[1]) + int_ymax = np.minimum(bboxes1[2], bboxes2[2]) + int_xmax = np.minimum(bboxes1[3], bboxes2[3]) + + int_h = np.maximum(int_ymax - int_ymin, 0.) + int_w = np.maximum(int_xmax - int_xmin, 0.) + int_vol = int_h * int_w + # Union volume. + vol1 = (bboxes1[2] - bboxes1[0]) * (bboxes1[3] - bboxes1[1]) + vol2 = (bboxes2[2] - bboxes2[0]) * (bboxes2[3] - bboxes2[1]) + iou = int_vol / (vol1 + vol2 - int_vol) + return iou + +def bboxes_nms(classes, scores, bboxes, nms_threshold=0.5): + """Apply non-maximum selection to bounding boxes. + """ + keep_bboxes = np.ones(scores.shape, dtype=np.bool) + for i in range(scores.size-1): + if keep_bboxes[i]: + # Computer overlap with bboxes which are following. + overlap = bboxes_iou(bboxes[i], bboxes[(i+1):]) + # Overlap threshold for keeping + checking part of the same class + keep_overlap = np.logical_or(overlap < nms_threshold, classes[(i+1):] != classes[i]) + keep_bboxes[(i+1):] = np.logical_and(keep_bboxes[(i+1):], keep_overlap) + + idxes = np.where(keep_bboxes) + return classes[idxes], scores[idxes], bboxes[idxes] + + + + + + diff --git a/README.md b/README.md index 51227b4..b1ebefe 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,8 @@ The deeplearning algorithms are carefully implemented by [tensorflow](https://www.tensorflow.org/). ### Environment - Python 3.5 -- tensorflow 0.12 +- tensorflow 1.4 +- pytorch 0.2.0 ### The deeplearning algorithms includes (now): - Logistic Regression [logisticRegression.py](https://github.com/xiaohu2015/DeepLearning_tutorials/blob/master/models/logisticRegression.py) @@ -15,6 +16,19 @@ The deeplearning algorithms are carefully implemented by [tensorflow](https://ww Note: the project aims at imitating the well-implemented algorithms in [Deep Learning Tutorials](http://www.deeplearning.net/tutorial/) (coded by [Theano](http://deeplearning.net/software/theano/index.html)). 
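For reference, the SSD pieces added above (the network class, ssd_anchors.py, ssd_layers.py, utils.py and visualization.py) combine at inference time roughly as follows. This is a minimal sketch: the import module names and the test image path are assumptions, while the checkpoint path is the one used in the SSD file's __main__ block.

import cv2
import tensorflow as tf

from ssd import SSD                    # module names assumed
from utils import preprocess_image, process_bboxes
from visualization import plt_bboxes

ssd_net = SSD()                        # builds the graph and the box-selection ops
classes, scores, bboxes = ssd_net.detections()

sess = tf.Session()
tf.train.Saver().restore(sess, "../SSD-Tensorflow-master/ssd_checkpoints/ssd_vgg_300_weights.ckpt")

img = cv2.imread("car.jpg")            # test image path is an assumption
img_prep = preprocess_image(img)       # whiten, resize to 300x300, add batch dim
rclasses, rscores, rbboxes = sess.run([classes, scores, bboxes],
                                      feed_dict={ssd_net.images(): img_prep})
# clip to the image, keep the top-k boxes, apply NMS, then rescale
rclasses, rscores, rbboxes = process_bboxes(rclasses, rscores, rbboxes)
plt_bboxes(cv2.cvtColor(img, cv2.COLOR_BGR2RGB), rclasses, rscores, rbboxes)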
+### CNN Models +- MobileNet [[self](https://github.com/xiaohu2015/DeepLearning_tutorials/blob/master/CNNs/MobileNet.py) [paper](https://arxiv.org/abs/1704.04861) [ref](https://github.com/Zehaos/MobileNet/blob/master/nets/mobilenet.py)] +- MobileNetv2 [[self](https://github.com/xiaohu2015/DeepLearning_tutorials/blob/master/CNNs/mobilenet_v2.py) [paper](https://arxiv.org/pdf/1801.04381.pdf) [ref](https://github.com/tensorflow/models/tree/master/research/slim/nets/mobilenet)] +- SqueezeNet [[self](https://github.com/xiaohu2015/DeepLearning_tutorials/blob/master/CNNs/SqueezeNet.py) [paper](https://arxiv.org/abs/1602.07360)] +- ResNet [[self](https://github.com/xiaohu2015/DeepLearning_tutorials/blob/master/CNNs/ResNet50.py) [caffe ref](https://github.com/KaimingHe/deep-residual-networks) [paper1](https://arxiv.org/abs/1512.03385) [paper2](https://arxiv.org/abs/1603.05027)] +- ShuffleNet [[self](https://github.com/xiaohu2015/DeepLearning_tutorials/blob/master/CNNs/ShuffleNet.py) by pytorch [paper](http://cn.arxiv.org/pdf/1707.01083v2)] +- ShuffleNetv2 [[self](https://github.com/xiaohu2015/DeepLearning_tutorials/blob/master/CNNs/shufflenet_v2.py) [ref](https://github.com/tensorpack/tensorpack/blob/master/examples/ImageNetModels/shufflenet.py) [paper](https://arxiv.org/abs/1807.11164)] +- DenseNet [[self](https://github.com/xiaohu2015/DeepLearning_tutorials/blob/master/CNNs/densenet.py) [pytorch_ref](https://github.com/pytorch/vision/blob/master/torchvision/models/densenet.py) [paper](https://arxiv.org/abs/1608.06993)] + +### Object detection +- YOLOv1 [[self](https://github.com/xiaohu2015/DeepLearning_tutorials/blob/master/ObjectDetections/yolo/yolo_tf.py) [paper](https://arxiv.org/abs/1506.02640) [ref](https://github.com/gliese581gg/YOLO_tensorflow)] +- SSD [[self](https://github.com/xiaohu2015/DeepLearning_tutorials/blob/master/ObjectDetections/SSD/SSD_demo.py) [paper](https://arxiv.org/pdf/1611.10012.pdf) [slides](http://www.cs.unc.edu/~wliu/papers/ssd_eccv2016_slide.pdf) [cafe](https://github.com/weiliu89/caffe/tree/ssd) [TF](https://arxiv.org/abs/1512.02325) [pytorch](https://github.com/amdegroot/ssd.pytorch) ] +- YOLOv2 [[self](https://github.com/xiaohu2015/DeepLearning_tutorials/tree/master/ObjectDetections/yolo2) [paper](https://arxiv.org/abs/1612.08242) [ref](https://github.com/yhcc/yolo2)] ### Practical examples You can find more practical examples with tensorflow here: @@ -54,3 +68,6 @@ You can find more practical examples with tensorflow here: #### Don't hesitate to star this project if it is helpful! ### If you benefit from the tutorial, please make a small donation by WeChat sweep. 
 ![weichat](https://github.com/xiaohu2015/DeepLearning_tutorials/blob/master/results/weichat.jpg)
+## WeChat ID: xiaoxiaohu1994
+## Welcome to follow the WeChat official account: 机器学习算法全栈工程师 (Jeemy110)
+![WeChat official account](https://github.com/xiaohu2015/DeepLearning_tutorials/blob/master/results/654362565405877642.jpg)
diff --git a/examples/cnn_setence_classification/text_cnn.py b/examples/cnn_setence_classification/text_cnn.py
index f186faf..3518a5c 100644
--- a/examples/cnn_setence_classification/text_cnn.py
+++ b/examples/cnn_setence_classification/text_cnn.py
@@ -81,7 +81,7 @@ def __init__(self, seq_len, vocab_size, embedding_size, filter_sizes, num_filter
             pooled_outputs.append(pool_output)  # [None, 1, 1, num_filters]
         # Combine all pooled features
         num_filters_total = num_filters * len(filter_sizes)
-        self.h_pool = tf.concat(3, pooled_outputs)  # [None, 1, 1, num_filters_total]
+        self.h_pool = tf.concat(pooled_outputs, 3)  # [None, 1, 1, num_filters_total]
         self.h_pool_flat = tf.reshape(self.h_pool, shape=[-1, num_filters_total])  # [None, num_filters_total]
 
         # The dropout layer
@@ -100,7 +100,7 @@ def __init__(self, seq_len, vocab_size, embedding_size, filter_sizes, num_filter
         # The loss
         with tf.name_scope("loss"):
-            losses = tf.nn.softmax_cross_entropy_with_logits(self.scores, self.y)
+            losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.y)
             self.loss = tf.reduce_mean(losses) + L2_loss * l2_reg_lambda
 
         # Accuracy
diff --git a/examples/cnn_setence_classification/train_cnn.py b/examples/cnn_setence_classification/train_cnn.py
index aefa2b5..bc58dc0 100644
--- a/examples/cnn_setence_classification/train_cnn.py
+++ b/examples/cnn_setence_classification/train_cnn.py
@@ -1,74 +1,74 @@
-"""
-Test the TextRNN class
-2016/12/22
-"""
-import os
-import sys
-import numpy as np
-import tensorflow as tf
-from sklearn.model_selection import train_test_split
-from tensorflow.contrib import learn
-
-from data_helpers import load_data_and_labels, batch_iter
-from text_cnn import TextCNN
-
-
-# Load original data
-path = sys.path[0]
-pos_filename = path + "/data/rt-polarity.pos"
-neg_filename = path + "/data/rt-polarity.neg"
-
-X_data, y_data = load_data_and_labels(pos_filename, neg_filename)
-max_document_length = max([len(sen.split(" ")) for sen in X_data])
-print("Max_document_length:,", max_document_length)
-# Create the vacabulary
-vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
-# The idx data
-x = np.array(list(vocab_processor.fit_transform(X_data)), dtype=np.float32)
-y = np.array(y_data, dtype=np.int32)
-vocabulary_size = len(vocab_processor.vocabulary_)
-print("The size of vocabulary:", vocabulary_size)
-# Split the data
-X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=1111)
-print("X_train shape {0}, y_train shape {1}".format(X_train.shape, y_train.shape))
-print("X_test shape {0}, y_test shape {1}".format(X_test.shape, y_test.shape))
-
-# The parameters of RNN
-seq_len = X_train.shape[1]
-vocab_size = vocabulary_size
-embedding_size = 128
-filter_sizes = [2, 3, 4]
-num_filters = 128
-num_classes = y_train.shape[1]
-l2_reg_lambda = 0.0
-
-# Construct RNN model
-text_rnn_model = TextCNN(seq_len=seq_len, vocab_size=vocab_size, embedding_size=embedding_size, filter_sizes=
-                         filter_sizes, num_filters=num_filters, num_classes=num_classes)
-loss = text_rnn_model.loss
-train_op = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)
-accuracy = text_rnn_model.accuracy
-# The parameters for training
-batch_size = 64
-training_epochs = 10
-dispaly_every = 1
-dropout_keep_prob = 0.5
-
-batch_num = int(X_train.shape[0]/batch_size)
-
-sess = tf.Session()
-sess.run(tf.global_variables_initializer())
-print("Starting training...")
-for epoch in range(training_epochs):
-    avg_cost = 0
-    for batch in range(batch_num):
-        _, cost = sess.run([train_op, loss], feed_dict={text_rnn_model.x: X_train[batch*batch_size:(batch+1)*batch_size],
-                                                        text_rnn_model.y: y_train[batch*batch_size:(batch+1)*batch_size],
-                                                        text_rnn_model.dropout_keep_prob:dropout_keep_prob})
-        avg_cost += cost
-    if epoch % dispaly_every == 0:
-        cost, acc = sess.run([loss, accuracy], feed_dict={text_rnn_model.x: X_test,
-                                                          text_rnn_model.y: y_test,
-                                                          text_rnn_model.dropout_keep_prob: 1.0})
-        print("\nEpoch {0} : loss {1}, accuracy {2}".format(epoch, cost, acc))
-
+"""
+Test the TextCNN class
+2016/12/22
+"""
+import os
+import sys
+import numpy as np
+import tensorflow as tf
+from sklearn.model_selection import train_test_split
+from tensorflow.contrib import learn
+
+from data_helpers import load_data_and_labels, batch_iter
+from text_cnn import TextCNN
+import pudb;pu.db
+
+# Load original data
+path = sys.path[0]
+pos_filename = path + "/data/rt-polarity.pos"
+neg_filename = path + "/data/rt-polarity.neg"
+
+X_data, y_data = load_data_and_labels(pos_filename, neg_filename)
+max_document_length = max([len(sen.split(" ")) for sen in X_data])
+print("Max_document_length:", max_document_length)
+# Create the vocabulary
+vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
+# The idx data
+x = np.array(list(vocab_processor.fit_transform(X_data)), dtype=np.float32)
+y = np.array(y_data, dtype=np.int32)
+vocabulary_size = len(vocab_processor.vocabulary_)
+print("The size of vocabulary:", vocabulary_size)
+# Split the data
+X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=1111)
+print("X_train shape {0}, y_train shape {1}".format(X_train.shape, y_train.shape))
+print("X_test shape {0}, y_test shape {1}".format(X_test.shape, y_test.shape))
+
+# The parameters of the CNN
+seq_len = X_train.shape[1]
+vocab_size = vocabulary_size
+embedding_size = 128
+filter_sizes = [2, 3, 4]
+num_filters = 128
+num_classes = y_train.shape[1]
+l2_reg_lambda = 0.0
+
+# Construct the CNN model
+text_rnn_model = TextCNN(seq_len=seq_len, vocab_size=vocab_size, embedding_size=embedding_size, filter_sizes=
+                         filter_sizes, num_filters=num_filters, num_classes=num_classes)
+loss = text_rnn_model.loss
+train_op = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)
+accuracy = text_rnn_model.accuracy
+# The parameters for training
+batch_size = 64
+training_epochs = 10
+display_every = 1
+dropout_keep_prob = 0.5
+
+batch_num = int(X_train.shape[0]/batch_size)
+
+sess = tf.Session()
+sess.run(tf.global_variables_initializer())
+print("Starting training...")
+for epoch in range(training_epochs):
+    avg_cost = 0
+    for batch in range(batch_num):
+        _, cost = sess.run([train_op, loss], feed_dict={text_rnn_model.x: X_train[batch*batch_size:(batch+1)*batch_size],
+                                                        text_rnn_model.y: y_train[batch*batch_size:(batch+1)*batch_size],
+                                                        text_rnn_model.dropout_keep_prob: dropout_keep_prob})
+        avg_cost += cost
+    if epoch % display_every == 0:
+        cost, acc = sess.run([loss, accuracy], feed_dict={text_rnn_model.x: X_test,
+                                                          text_rnn_model.y: y_test,
+                                                          text_rnn_model.dropout_keep_prob: 1.0})
+        print("\nEpoch {0} : loss {1}, accuracy {2}".format(epoch, cost, acc))
+
diff --git a/results/654362565405877642.jpg b/results/654362565405877642.jpg
new file mode 100644
index 0000000..5fbcede
Binary files /dev/null and b/results/654362565405877642.jpg differ
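For reference, a minimal sketch of the TensorFlow 1.x call signatures that the text_cnn.py hunks above migrate to, using dummy tensors: tf.concat now takes the list of values before the axis, and softmax_cross_entropy_with_logits must be called with the logits/labels keyword arguments.

import tensorflow as tf

# tf.concat in TF >= 1.0: values first, axis second (TF 0.x used tf.concat(axis, values)).
pooled_outputs = [tf.ones([8, 1, 1, 128]) for _ in range(3)]
h_pool = tf.concat(pooled_outputs, 3)   # shape: [8, 1, 1, 384]

# softmax_cross_entropy_with_logits in TF >= 1.0 requires keyword arguments.
logits = tf.zeros([8, 2])
labels = tf.one_hot(tf.zeros([8], dtype=tf.int32), depth=2)
losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels)
loss = tf.reduce_mean(losses)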