diff --git a/captioning/__pycache__/__init__.cpython-38.pyc b/captioning/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 00000000..cf48a2d5
Binary files /dev/null and b/captioning/__pycache__/__init__.cpython-38.pyc differ
diff --git a/captioning/data/__pycache__/__init__.cpython-38.pyc b/captioning/data/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 00000000..e5c5dbe2
Binary files /dev/null and b/captioning/data/__pycache__/__init__.cpython-38.pyc differ
diff --git a/captioning/data/__pycache__/dataloader.cpython-38.pyc b/captioning/data/__pycache__/dataloader.cpython-38.pyc
new file mode 100644
index 00000000..dc50ffe8
Binary files /dev/null and b/captioning/data/__pycache__/dataloader.cpython-38.pyc differ
diff --git a/captioning/data/dataloader.py b/captioning/data/dataloader.py
index 7f2ed030..e1558b82 100644
--- a/captioning/data/dataloader.py
+++ b/captioning/data/dataloader.py
@@ -107,6 +107,7 @@ def __init__(self, opt):
         # load the json file which contains additional information about the dataset
         print('DataLoader loading json file: ', opt.input_json)
         self.info = json.load(open(self.opt.input_json))
+        self.images = self.info['images']
         if 'ix_to_word' in self.info:
             self.ix_to_word = self.info['ix_to_word']
             self.vocab_size = len(self.ix_to_word)
@@ -136,13 +137,12 @@ def __init__(self, opt):
         self.att_loader = HybridLoader(self.opt.input_att_dir, '.npz', in_memory=self.data_in_memory)
         self.box_loader = HybridLoader(self.opt.input_box_dir, '.npy', in_memory=self.data_in_memory)
 
-        self.num_images = len(self.info['images']) # self.label_start_ix.shape[0]
+        self.num_images = len(self.images) # self.label_start_ix.shape[0]
         print('read %d image features' %(self.num_images))
 
         # separate out indexes for each of the provided splits
         self.split_ix = {'train': [], 'val': [], 'test': []}
-        for ix in range(len(self.info['images'])):
-            img = self.info['images'][ix]
+        for ix, img in enumerate(self.images):
             if not 'split' in img:
                 self.split_ix['train'].append(ix)
                 self.split_ix['val'].append(ix)
@@ -156,14 +156,58 @@ def __init__(self, opt):
             elif opt.train_only == 0: # restval
                 self.split_ix['train'].append(ix)
 
+        if opt.data_augmentation:
+            self.seq_per_img_da = opt.seq_per_img_da
+
+            # load the json file which contains additional information about the dataset
+            print('DataLoader da loading json file: ', opt.input_json_da)
+            self.info_da = json.load(open(self.opt.input_json_da))
+            self.images_da = self.info_da['images']
+            # if 'ix_to_word' in self.info_da:
+            #     self.ix_to_word_da = self.info_da['ix_to_word']
+            #     self.vocab_size_da = len(self.ix_to_word_da)
+            #     print('da vocab size is ', self.vocab_size_da)
+
+            # open the hdf5 file
+            print('DataLoader da loading h5 file: ', opt.input_fc_dir_da, opt.input_att_dir_da, opt.input_label_h5_da)
+
+            if self.opt.input_label_h5_da != 'none':
+                self.h5_label_file_da = h5py.File(self.opt.input_label_h5_da, 'r', driver='core')
+                # load in the sequence data
+                seq_size = self.h5_label_file_da['labels'].shape
+                self.label_da = self.h5_label_file_da['labels'][:]
+                self.seq_length_da = seq_size[1]
+                print('max sequence length in da data is', self.seq_length_da)
+                # load the pointers in full to RAM (should be small enough)
+                self.label_start_ix_da = self.h5_label_file_da['label_start_ix'][:]
+                self.label_end_ix_da = self.h5_label_file_da['label_end_ix'][:]
+            else:
+                self.seq_length_da = 1
+
+            self.fc_loader_da = HybridLoader(self.opt.input_fc_dir_da, '.npy', in_memory=self.data_in_memory)
+            self.att_loader_da = HybridLoader(self.opt.input_att_dir_da, '.npz', in_memory=self.data_in_memory)
+
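+            # note: augmented images are indexed after the real ones, so a global
+            # index ix >= self.num_images refers to self.images_da[ix - self.num_images]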
+            self.num_images_da = len(self.images_da) # self.label_start_ix.shape[0]
+            print('read %d da image features' %(self.num_images_da))
+
+            for ix, img in enumerate(self.images_da):
+                # data augmentation only for training
+                if img['split'] == 'train' or img['split'] == 'restval':
+                    self.split_ix['train'].append(ix + self.num_images)
+
         print('assigned %d images to split train' %len(self.split_ix['train']))
         print('assigned %d images to split val' %len(self.split_ix['val']))
         print('assigned %d images to split test' %len(self.split_ix['test']))
 
+
     def get_captions(self, ix, seq_per_img):
         # fetch the sequence labels
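+        # note: for augmented indices (ix >= self.num_images) the branches below read
+        # from label_da; slicing with self.seq_length assumes the da labels were built
+        # with the same max caption length as the original ones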
-        ix1 = self.label_start_ix[ix] - 1 #label_start_ix starts from 1
-        ix2 = self.label_end_ix[ix] - 1
+        if self.opt.data_augmentation and ix >= self.num_images:
+            ix1 = self.label_start_ix_da[ix - self.num_images] - 1 #label_start_ix starts from 1
+            ix2 = self.label_end_ix_da[ix - self.num_images] - 1
+        else:
+            ix1 = self.label_start_ix[ix] - 1 #label_start_ix starts from 1
+            ix2 = self.label_end_ix[ix] - 1
         ncap = ix2 - ix1 + 1 # number of captions available for this image
         assert ncap > 0, 'an image does not have any label. this can be handled but right now isn\'t'
 
@@ -172,10 +216,16 @@ def get_captions(self, ix, seq_per_img):
             seq = np.zeros([seq_per_img, self.seq_length], dtype = 'int')
             for q in range(seq_per_img):
                 ixl = random.randint(ix1,ix2)
-                seq[q, :] = self.label[ixl, :self.seq_length]
+                if self.opt.data_augmentation and ix >= self.num_images:
+                    seq[q, :] = self.label_da[ixl, :self.seq_length]
+                else:
+                    seq[q, :] = self.label[ixl, :self.seq_length]
         else:
             ixl = random.randint(ix1, ix2 - seq_per_img + 1)
-            seq = self.label[ixl: ixl + seq_per_img, :self.seq_length]
+            if self.opt.data_augmentation and ix >= self.num_images:
+                seq = self.label_da[ixl: ixl + seq_per_img, :self.seq_length]
+            else:
+                seq = self.label[ixl: ixl + seq_per_img, :self.seq_length]
 
         return seq
 
@@ -210,15 +260,22 @@ def collate_func(self, batch, split):
 
             # Used for reward evaluation
             if hasattr(self, 'h5_label_file'):
                 # if there is ground truth
-                gts.append(self.label[self.label_start_ix[ix] - 1: self.label_end_ix[ix]])
+                if self.opt.data_augmentation and ix >= self.num_images:
+                    gts.append(self.label_da[self.label_start_ix_da[ix - self.num_images] - 1: self.label_end_ix_da[ix - self.num_images]])
+                else:
+                    gts.append(self.label[self.label_start_ix[ix] - 1: self.label_end_ix[ix]])
             else:
                 gts.append([])
 
             # record associated info as well
             info_dict = {}
             info_dict['ix'] = ix
-            info_dict['id'] = self.info['images'][ix]['id']
-            info_dict['file_path'] = self.info['images'][ix].get('file_path', '')
+            if self.opt.data_augmentation and ix >= self.num_images:
+                info_dict['id'] = self.images_da[ix - self.num_images]['id']
+                # info_dict['file_path'] = self.images_da[ix - self.num_images].get('file_path', '')
+            else:
+                info_dict['id'] = self.images[ix]['id']
+                info_dict['file_path'] = self.info['images'][ix].get('file_path', '')
             infos.append(info_dict)
 
         # #sort by att_feat length
@@ -264,16 +321,19 @@ def __getitem__(self, index):
         """
         ix, it_pos_now, wrapped = index #self.split_ix[index]
         if self.use_att:
-            att_feat = self.att_loader.get(str(self.info['images'][ix]['id']))
+            if self.opt.data_augmentation and ix >= self.num_images:
+                att_feat = self.att_loader_da.get(str(self.images_da[ix - self.num_images]['id']))
+            else:
+                att_feat = self.att_loader.get(str(self.images[ix]['id']))
             # Reshape to K x C
             att_feat = att_feat.reshape(-1, att_feat.shape[-1])
             if self.norm_att_feat:
                 att_feat = att_feat / np.linalg.norm(att_feat, 2, 1, keepdims=True)
             if self.use_box:
-                box_feat = self.box_loader.get(str(self.info['images'][ix]['id']))
+                box_feat = self.box_loader.get(str(self.images[ix]['id']))
                 # devided by image width and height
                 x1,y1,x2,y2 = np.hsplit(box_feat, 4)
-                h,w = self.info['images'][ix]['height'], self.info['images'][ix]['width']
+                h,w = self.images[ix]['height'], self.images[ix]['width']
                 box_feat = np.hstack((x1/w, y1/h, x2/w, y2/h, (x2-x1)*(y2-y1)/(w*h))) # question? x2-x1+1??
                 if self.norm_box_feat:
                     box_feat = box_feat / np.linalg.norm(box_feat, 2, 1, keepdims=True)
@@ -283,11 +343,14 @@ def __getitem__(self, index):
         else:
             att_feat = np.zeros((0,0), dtype='float32')
         if self.use_fc:
-            try:
-                fc_feat = self.fc_loader.get(str(self.info['images'][ix]['id']))
-            except:
+            # try:
+            if self.opt.data_augmentation and ix >= self.num_images:
+                fc_feat = self.fc_loader_da.get(str(self.images_da[ix - self.num_images]['id']))
+            else:
+                fc_feat = self.fc_loader.get(str(self.images[ix]['id']))
+            # except:
                 # Use average of attention when there is no fc provided (For bottomup feature)
-                fc_feat = att_feat.mean(0)
+                # fc_feat = att_feat.mean(0)
         else:
             fc_feat = np.zeros((0), dtype='float32')
         if hasattr(self, 'h5_label_file'):
@@ -299,7 +362,10 @@ def __getitem__(self, index):
                 ix, it_pos_now, wrapped)
 
     def __len__(self):
-        return len(self.info['images'])
+        if self.opt.data_augmentation:
+            return len(self.images) + len(self.images_da)
+        else:
+            return len(self.images)
 
 class DataLoader:
     def __init__(self, opt):
diff --git a/captioning/models/__pycache__/AoAModel.cpython-38.pyc b/captioning/models/__pycache__/AoAModel.cpython-38.pyc
new file mode 100644
index 00000000..7c09cb8b
Binary files /dev/null and b/captioning/models/__pycache__/AoAModel.cpython-38.pyc differ
diff --git a/captioning/models/__pycache__/AttModel.cpython-38.pyc b/captioning/models/__pycache__/AttModel.cpython-38.pyc
new file mode 100644
index 00000000..2395612c
Binary files /dev/null and b/captioning/models/__pycache__/AttModel.cpython-38.pyc differ
diff --git a/captioning/models/__pycache__/BertCapModel.cpython-38.pyc b/captioning/models/__pycache__/BertCapModel.cpython-38.pyc
new file mode 100644
index 00000000..4dd920f7
Binary files /dev/null and b/captioning/models/__pycache__/BertCapModel.cpython-38.pyc differ
diff --git a/captioning/models/__pycache__/CaptionModel.cpython-38.pyc b/captioning/models/__pycache__/CaptionModel.cpython-38.pyc
new file mode 100644
index 00000000..dfbc99f3
Binary files /dev/null and b/captioning/models/__pycache__/CaptionModel.cpython-38.pyc differ
diff --git a/captioning/models/__pycache__/FCModel.cpython-38.pyc b/captioning/models/__pycache__/FCModel.cpython-38.pyc
new file mode 100644
index 00000000..c4214186
Binary files /dev/null and b/captioning/models/__pycache__/FCModel.cpython-38.pyc differ
diff --git a/captioning/models/__pycache__/M2Transformer.cpython-38.pyc b/captioning/models/__pycache__/M2Transformer.cpython-38.pyc
new file mode 100644
index 00000000..24df91b5
Binary files /dev/null and b/captioning/models/__pycache__/M2Transformer.cpython-38.pyc differ
diff --git a/captioning/models/__pycache__/ShowTellModel.cpython-38.pyc b/captioning/models/__pycache__/ShowTellModel.cpython-38.pyc
new file mode 100644
index 00000000..561a02bc
Binary files /dev/null and b/captioning/models/__pycache__/ShowTellModel.cpython-38.pyc differ
diff --git a/captioning/models/__pycache__/TransformerModel.cpython-38.pyc b/captioning/models/__pycache__/TransformerModel.cpython-38.pyc
new file mode 100644
index 00000000..441cc364
Binary files /dev/null and b/captioning/models/__pycache__/TransformerModel.cpython-38.pyc differ
diff --git a/captioning/models/__pycache__/__init__.cpython-38.pyc b/captioning/models/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 00000000..cdaf1888
Binary files /dev/null and b/captioning/models/__pycache__/__init__.cpython-38.pyc differ
diff --git a/captioning/models/__pycache__/cachedTransformer.cpython-38.pyc b/captioning/models/__pycache__/cachedTransformer.cpython-38.pyc
new file mode 100644
index 00000000..25b04332
Binary files /dev/null and b/captioning/models/__pycache__/cachedTransformer.cpython-38.pyc differ
diff --git a/captioning/models/__pycache__/utils.cpython-38.pyc b/captioning/models/__pycache__/utils.cpython-38.pyc
new file mode 100644
index 00000000..e3b72448
Binary files /dev/null and b/captioning/models/__pycache__/utils.cpython-38.pyc differ
diff --git a/captioning/modules/__pycache__/loss_wrapper.cpython-38.pyc b/captioning/modules/__pycache__/loss_wrapper.cpython-38.pyc
new file mode 100644
index 00000000..98df0a3e
Binary files /dev/null and b/captioning/modules/__pycache__/loss_wrapper.cpython-38.pyc differ
diff --git a/captioning/modules/__pycache__/losses.cpython-38.pyc b/captioning/modules/__pycache__/losses.cpython-38.pyc
new file mode 100644
index 00000000..260eb6f6
Binary files /dev/null and b/captioning/modules/__pycache__/losses.cpython-38.pyc differ
diff --git a/captioning/utils/__pycache__/__init__.cpython-38.pyc b/captioning/utils/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 00000000..390d0779
Binary files /dev/null and b/captioning/utils/__pycache__/__init__.cpython-38.pyc differ
diff --git a/captioning/utils/__pycache__/config.cpython-38.pyc b/captioning/utils/__pycache__/config.cpython-38.pyc
new file mode 100644
index 00000000..d5690ae1
Binary files /dev/null and b/captioning/utils/__pycache__/config.cpython-38.pyc differ
diff --git a/captioning/utils/__pycache__/eval_utils.cpython-38.pyc b/captioning/utils/__pycache__/eval_utils.cpython-38.pyc
new file mode 100644
index 00000000..43c8e924
Binary files /dev/null and b/captioning/utils/__pycache__/eval_utils.cpython-38.pyc differ
diff --git a/captioning/utils/__pycache__/misc.cpython-38.pyc b/captioning/utils/__pycache__/misc.cpython-38.pyc
new file mode 100644
index 00000000..90eed6bc
Binary files /dev/null and b/captioning/utils/__pycache__/misc.cpython-38.pyc differ
diff --git a/captioning/utils/__pycache__/opts.cpython-38.pyc b/captioning/utils/__pycache__/opts.cpython-38.pyc
new file mode 100644
index 00000000..ddb2ab62
Binary files /dev/null and b/captioning/utils/__pycache__/opts.cpython-38.pyc differ
diff --git a/captioning/utils/__pycache__/resnet.cpython-38.pyc b/captioning/utils/__pycache__/resnet.cpython-38.pyc
new file mode 100644
index 00000000..2ed6fcca
Binary files /dev/null and b/captioning/utils/__pycache__/resnet.cpython-38.pyc differ
diff --git a/captioning/utils/__pycache__/resnet_utils.cpython-38.pyc b/captioning/utils/__pycache__/resnet_utils.cpython-38.pyc
new file mode 100644
index 00000000..cd133081
Binary files /dev/null and b/captioning/utils/__pycache__/resnet_utils.cpython-38.pyc differ
diff --git a/captioning/utils/__pycache__/rewards.cpython-38.pyc b/captioning/utils/__pycache__/rewards.cpython-38.pyc
new file mode 100644
index 00000000..0f12b33d
Binary files /dev/null and b/captioning/utils/__pycache__/rewards.cpython-38.pyc differ
diff --git a/captioning/utils/eval_utils.py b/captioning/utils/eval_utils.py
index c4bc7f44..78abffc8 100644
--- a/captioning/utils/eval_utils.py
+++ b/captioning/utils/eval_utils.py
@@ -17,12 +17,9 @@ from . import misc as utils
 
 # load coco-caption if available
-try:
-    sys.path.append("coco-caption")
-    from pycocotools.coco import COCO
-    from pycocoevalcap.eval import COCOEvalCap
-except:
-    print('Warning: coco-caption not available')
+from pycocotools.coco import COCO
+from pycocoevalcap.eval import COCOEvalCap
+
 
 bad_endings = ['a','an','the','in','for','at','of','with','before','after','on','upon','near','to','is','are','am']
 bad_endings += ['the']
diff --git a/captioning/utils/opts.py b/captioning/utils/opts.py
index 282bd72c..f260a8d2 100644
--- a/captioning/utils/opts.py
+++ b/captioning/utils/opts.py
@@ -38,6 +38,20 @@ def parse_opt():
     parser.add_argument('--cached_tokens', type=str, default='coco-train-idxs',
                     help='Cached token file for calculating cider score during self critical training.')
 
+    # Data Augmentation
+    parser.add_argument('--data_augmentation', type=bool, default=False,
+                    help='use generated data as augmentation if True')
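+    # note: argparse's type=bool turns any non-empty string (including 'False') into True,
+    # so this flag is effectively set through the yaml config rather than on the command line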
+    parser.add_argument('--input_json_da', type=str, default='data/coco.json',
+                    help='path to the json file containing additional info and vocab')
+    parser.add_argument('--input_fc_dir_da', type=str, default='data/cocotalk_fc',
+                    help='path to the directory containing the preprocessed fc feats')
+    parser.add_argument('--input_att_dir_da', type=str, default='data/cocotalk_att',
+                    help='path to the directory containing the preprocessed att feats')
+    parser.add_argument('--input_label_h5_da', type=str, default='data/coco_label.h5',
+                    help='path to the h5file containing the preprocessed dataset')
+    parser.add_argument('--seq_per_img_da', type=int, default=5,
+                    help='number of captions to sample for each image during training. Done for efficiency since CNN forward pass is expensive. E.g. coco has 5 sents/image')
+
     # Model settings
     parser.add_argument('--caption_model', type=str, default="show_tell",
                     help='show_tell, show_attend_tell, all_img, fc, att2in, att2in2, att2all2, adaatt, adaattmo, updown, stackatt, denseatt, transformer')
diff --git a/configs/fc.yml b/configs/fc.yml
index 979b69ee..0359e186 100644
--- a/configs/fc.yml
+++ b/configs/fc.yml
@@ -1,16 +1,24 @@
 caption_model: newfc
-input_json: data/cocotalk.json
-input_fc_dir: data/cocotalk_fc
-input_att_dir: data/cocotalk_att
-input_label_h5: data/cocotalk_label.h5
+input_json: /data/share/image-caption/cocotalk.json
+input_fc_dir: /data/share/image-caption/cocotalk_fc
+input_att_dir: /data/share/image-caption/cocotalk_att
+input_label_h5: /data/share/image-caption/cocotalk_label.h5
 learning_rate: 0.0005
 learning_rate_decay_start: 0
 scheduled_sampling_start: 0
-# checkpoint_path: $ckpt_path
-# $start_from
+checkpoint_path: /data/private/mxy/exp/image-caption/da/coco_lostgan_sub
+# start_from: /data/share/image-caption/fc_nsc
 language_eval: 1
-save_checkpoint_every: 3000
+save_checkpoint_every: 1000
 val_images_use: 5000
-batch_size: 10
-max_epochs: 30
\ No newline at end of file
+batch_size: 100
+max_epochs: 30
+
+# configs for da
+data_augmentation: True
+input_json_da: /data/share/image-caption/cocotalk_da_lostgan_sub.json
+input_fc_dir_da: /data/share/image-caption/cocotalk_da_lostgan_sub_fc
+input_att_dir_da: /data/share/image-caption/cocotalk_da_lostgan_sub_att
+input_label_h5_da: /data/share/image-caption/cocotalk_da_lostgan_label.h5
+seq_per_img_da: 1
\ No newline at end of file
diff --git a/prepro.sh b/prepro.sh
new file mode 100644
index 00000000..752c508f
--- /dev/null
+++ b/prepro.sh
@@ -0,0 +1,49 @@
+set -ex
+
+DATA_ROOT=/data/share/image-caption
+MODEL=/data/private/mxy/data/image-caption
+
+# original
+JSON=$DATA_ROOT/dataset_flickr30k.json
+OUT_DIR=$DATA_ROOT/f30ktalk
+IMG_ROOT=/data/share/UNITER/origin_imgs/flickr30k/flickr30k-images
+
+# da
+# JSON_DA=$DATA_ROOT/dataset_coco.json
+JSON_DA=$DATA_ROOT/dataset_coco_lostgan_sub.json
+# JSON_DA=/data/share/data/coco2017/annotations/dataset_coco_lostgan.json # sub
+OUT_DIR_DA=$DATA_ROOT/cocotalk_da_lostgan_sub
+# IMG_ROOT_DA=/data/private/mxy/code/T2I_CL/DM-GAN+CL/output/coco_DMGAN_2021_08_15_14_19_42/Model/netG_epoch_120/f30k
+# IMG_ROOT_DA=/data/share/Seg-Backtranslation/data/gen_imgs/train2017_2
+# IMG_ROOT_DA=/data/share/Seg-Backtranslation/data/gen_imgs_lostgan_train
+IMG_ROOT_DA=/data/share/Seg-Backtranslation/data/gen_imgs_lostgan_train_sub
+LBL_JSON_DA=$DATA_ROOT/cocotalk_da_lostgan_sub.json
+LBL_H5_DA=$DATA_ROOT/cocotalk_da_lostgan_sub
+
+# pre labels for da
+python scripts/prepro_labels_da.py \
+    --input_json $JSON_DA \
+    --output_json $LBL_JSON_DA \
+    --output_h5 $LBL_H5_DA
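+# (note) prepro_labels_da.py writes ${LBL_H5_DA}_label.h5 plus $LBL_JSON_DA, with the
+# captions re-encoded using the original cocotalk vocabulary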
+
+# pre ngram for da
+# python scripts/prepro_ngrams.py \
+#     --input_json $JSON_DA \
+#     --dict_json $LBL_JSON_DA \
+#     --output_pkl $DATA_ROOT/f30k_da-train \
+#     --split train
+
+
+# pre feats for da
+# python scripts/prepro_feats_da.py \
+#     --input_json $JSON_DA \
+#     --output_dir $OUT_DIR_DA \
+#     --images_root $IMG_ROOT_DA \
+#     --model_root $MODEL
+
+# pre feats
+# python scripts/prepro_feats.py \
+#     --input_json $JSON \
+#     --output_dir $OUT_DIR \
+#     --images_root $IMG_ROOT \
+#     --model_root $MODEL
\ No newline at end of file
diff --git a/scripts/prepro_feats.py b/scripts/prepro_feats.py
index a59ccb43..d39e2314 100644
--- a/scripts/prepro_feats.py
+++ b/scripts/prepro_feats.py
@@ -34,7 +34,8 @@
         #trn.ToTensor(),
         trn.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
 ])
-
+import sys
+sys.path.append('/data/private/mxy/code/ImageCaptioning.pytorch')
 from captioning.utils.resnet_utils import myResnet
 import captioning.utils.resnet as resnet
diff --git a/scripts/prepro_feats_da.py b/scripts/prepro_feats_da.py
new file mode 100644
index 00000000..9ed572f9
--- /dev/null
+++ b/scripts/prepro_feats_da.py
@@ -0,0 +1,136 @@
+"""
+Preprocess a raw json dataset into features files for use in data_loader.py
+
+Input: json file that has the form
+[{ file_path: 'path/img.jpg', captions: ['a caption', ...] }, ...]
+example element in this list would look like
+{'captions': [u'A man with a red helmet on a small moped on a dirt road. ', u'Man riding a motor bike on a dirt road on the countryside.', u'A man riding on the back of a motorcycle.', u'A dirt path with a young person on a motor bike rests to the foreground of a verdant area with a bridge and a background of cloud-wreathed mountains. ', u'A man in a red shirt and a red hat is on a motorcycle on a hill side.'], 'file_path': u'val2014/COCO_val2014_000000391895.jpg', 'id': 391895}
+
+This script reads this json and extracts ResNet fc/att features for each image
+
+Output: two folders of features
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import json
+import argparse
+from random import shuffle, seed
+import string
+# non-standard dependencies:
+import h5py
+from six.moves import cPickle
+import numpy as np
+import torch
+import torchvision.models as models
+import skimage.io
+
+from torchvision import transforms as trn
+preprocess = trn.Compose([
+    #trn.ToTensor(),
+    trn.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+])
+import sys
+sys.path.append('/data/private/mxy/code/ImageCaptioning.pytorch')
+from captioning.utils.resnet_utils import myResnet
+import captioning.utils.resnet as resnet
+
+
+def main(params):
+    net = getattr(resnet, params['model'])()
+    net.load_state_dict(torch.load(os.path.join(params['model_root'],params['model']+'.pth')))
+    my_resnet = myResnet(net)
+    my_resnet.cuda()
+    my_resnet.eval()
+
+    imgs = json.load(open(params['input_json'], 'r'))
+    imgs = imgs['images']
+    N = len(imgs)
+
+    seed(123) # make reproducible
+
+    dir_fc = params['output_dir']+'_fc'
+    dir_att = params['output_dir']+'_att'
+    if not os.path.isdir(dir_fc):
+        os.mkdir(dir_fc)
+    if not os.path.isdir(dir_att):
+        os.mkdir(dir_att)
+
+    total_cnt = 0
+    for i,img in enumerate(imgs):
+        # flickr
+        # if i != img['imgid']:
+        #     print(f"Error: id {img['imgid']} != index {i}")
+        #     continue
+        # for j in range(len(img['sentences'])):
+        #     img_name = f"{params['images_root']}/{i}/0_s_{j}_g2.png"
+        #     if not os.path.isfile(img_name):
+        #         print(f'{img_name} not exist!')
+        #         break
+        #     # load the image
+        #     I = skimage.io.imread(img_name)
+        #     # handle grayscale input images
+        #     if len(I.shape) == 2:
+        #         I = I[:,:,np.newaxis]
+        #         I = np.concatenate((I,I,I), axis=2)
+
+        #     I = I.astype('float32')/255.0
+        #     I = torch.from_numpy(I.transpose([2,0,1])).cuda()
+        #     I = preprocess(I)
+        #     with torch.no_grad():
+        #         tmp_fc, tmp_att = my_resnet(I, params['att_size'])
+        #     # write to pkl
+        #     np.save(f"{dir_fc}/{i}_{j}", tmp_fc.data.cpu().float().numpy())
+        #     np.savez_compressed(f"{dir_att}/{i}_{j}", feat=tmp_att.data.cpu().float().numpy())
+        #     total_cnt += 1
+
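+        # (note) the generated images are expected under images_root, named by their
+        # zero-padded COCO id; features are saved under that same id, which is the key
+        # the dataloader later uses for lookup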
+        # coco
+        img_name = f"{params['images_root']}/{str(img['cocoid']).zfill(12)}.jpg"
+        if not os.path.isfile(img_name):
+            print(f'{img_name} does not exist!')
+            continue
+        # load the image
+        I = skimage.io.imread(img_name)
+        # handle grayscale input images
+        if len(I.shape) == 2:
+            I = I[:,:,np.newaxis]
+            I = np.concatenate((I,I,I), axis=2)
+
+        I = I.astype('float32')/255.0
+        I = torch.from_numpy(I.transpose([2,0,1])).cuda()
+        I = preprocess(I)
+        with torch.no_grad():
+            tmp_fc, tmp_att = my_resnet(I, params['att_size'])
+        # write to pkl
+        np.save(f"{dir_fc}/{img['cocoid']}", tmp_fc.data.cpu().float().numpy())
+        np.savez_compressed(f"{dir_att}/{img['cocoid']}", feat=tmp_att.data.cpu().float().numpy())
+        total_cnt += 1
+
+        if i % 1000 == 0:
+            print('processing %d/%d (%.2f%% done) total images: %d' % (i, N, i*100.0/N, total_cnt))
+    print('wrote ', params['output_dir'])
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+
+    # input json
+    parser.add_argument('--input_json', required=True, help='input json file to process into hdf5')
+    parser.add_argument('--output_dir', default='data', help='output directory prefix for the fc/att feature folders')
+
+    # options
+    parser.add_argument('--batch_size', default=14, type=int, help='batch size (unused by this script)')
+    parser.add_argument('--images_root', default='', help='root location in which images are stored, to be prepended to file_path in input json')
+    parser.add_argument('--att_size', default=14, type=int, help='14x14 or 7x7')
+    parser.add_argument('--model', default='resnet101', type=str, help='resnet101, resnet152')
+    parser.add_argument('--model_root', default='./data/imagenet_weights', type=str, help='model root')
+
+    args = parser.parse_args()
+    params = vars(args) # convert to ordinary dict
+    print('parsed input parameters:')
+    print(json.dumps(params, indent = 2))
+    main(params)
diff --git a/scripts/prepro_labels_da.py b/scripts/prepro_labels_da.py
new file mode 100644
index 00000000..4ea92c20
--- /dev/null
+++ b/scripts/prepro_labels_da.py
@@ -0,0 +1,164 @@
+"""
+Preprocess a raw json dataset into hdf5/json files for use in data_loader.py
+
+Input: json file that has the form
+[{ file_path: 'path/img.jpg', captions: ['a caption', ...] }, ...]
+example element in this list would look like
+{'captions': [u'A man with a red helmet on a small moped on a dirt road. ', u'Man riding a motor bike on a dirt road on the countryside.', u'A man riding on the back of a motorcycle.', u'A dirt path with a young person on a motor bike rests to the foreground of a verdant area with a bridge and a background of cloud-wreathed mountains. ', u'A man in a red shirt and a red hat is on a motorcycle on a hill side.'], 'file_path': u'val2014/COCO_val2014_000000391895.jpg', 'id': 391895}
+
+This script reads this json, does some basic preprocessing on the captions
+(e.g. lowercase, etc.), creates a special UNK token, and encodes everything to arrays
+
+Output: a json file and an hdf5 file
+The hdf5 file contains several fields:
+/labels is (M,max_length) uint32 array of encoded labels, zero padded
+/label_start_ix and /label_end_ix are (N,) uint32 arrays of pointers to the
+  first and last indices (in range 1..M) of labels for each image
+/label_length stores the length of the sequence for each of the M sequences
+
+The json file has a dict that contains:
+- an 'ix_to_word' field storing the vocab in form {ix:'word'}, where ix is 1-indexed
+- an 'images' field that is a list holding auxiliary information for each image,
+  such as in particular the 'split' it was assigned to.
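+
+For illustration: if the first image has 5 captions and the second has 3, then
+label_start_ix = [1, 6] and label_end_ix = [5, 8] (pointers are 1-indexed and inclusive).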
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import json +import argparse +from random import shuffle, seed +import string +# non-standard dependencies: +import h5py +import numpy as np +import torch +import torchvision.models as models +import skimage.io +from PIL import Image + + +def encode_captions(imgs, params, wtoi): + """ + encode all captions into one large array, which will be 1-indexed. + also produces label_start_ix and label_end_ix which store 1-indexed + and inclusive (Lua-style) pointers to the first and last caption for + each image in the dataset. + """ + + max_length = params['max_length'] + N = len(imgs) + M = sum(len(img['final_captions']) for img in imgs) # total number of captions + + label_arrays = [] + label_start_ix = np.zeros(N, dtype='uint32') # note: these will be one-indexed + label_end_ix = np.zeros(N, dtype='uint32') + label_length = np.zeros(M, dtype='uint32') + caption_counter = 0 + counter = 1 + for i,img in enumerate(imgs): + n = len(img['final_captions']) + assert n > 0, 'error: some image has no captions' + + Li = np.zeros((n, max_length), dtype='uint32') + for j,s in enumerate(img['final_captions']): + label_length[caption_counter] = min(max_length, len(s)) # record the length of this sequence + caption_counter += 1 + for k,w in enumerate(s): + if k < max_length: + Li[j,k] = wtoi[w] + + # note: word indices are 1-indexed, and captions are padded with zeros + label_arrays.append(Li) + label_start_ix[i] = counter + label_end_ix[i] = counter + n - 1 + + counter += n + + L = np.concatenate(label_arrays, axis=0) # put all the labels together + assert L.shape[0] == M, 'lengths don\'t match? that\'s weird' + assert np.all(label_length > 0), 'error: some caption had no words?' 
+    # create the vocab
+    # vocab = build_vocab(imgs, params)
+    with open('/data/share/image-caption/cocotalk.json') as f:
+        talk = json.load(f)
+    itow = talk['ix_to_word']
+    vocab = list(itow.values())
+    itow = {i+1:w for i,w in enumerate(vocab)} # a 1-indexed vocab translation table
+    wtoi = {w:i+1 for i,w in enumerate(vocab)} # inverse table
+
+    for img in imgs:
+        img['final_captions'] = []
+        for sent in img['sentences']:
+            txt = sent['tokens']
+            caption = [w if w in vocab else 'UNK' for w in txt]
+            img['final_captions'].append(caption)
+
+    # encode captions in large arrays, ready to ship to hdf5 file
+    L, label_start_ix, label_end_ix, label_length = encode_captions(imgs, params, wtoi)
+
+    # create output h5 file
+    N = len(imgs)
+    f_lb = h5py.File(params['output_h5']+'_label.h5', "w")
+    f_lb.create_dataset("labels", dtype='uint32', data=L)
+    f_lb.create_dataset("label_start_ix", dtype='uint32', data=label_start_ix)
+    f_lb.create_dataset("label_end_ix", dtype='uint32', data=label_end_ix)
+    f_lb.create_dataset("label_length", dtype='uint32', data=label_length)
+    f_lb.close()
+
+    # create output json file
+    out = {}
+    out['ix_to_word'] = itow # encode the (1-indexed) vocab
+    out['images'] = []
+    for i,img in enumerate(imgs):
+
+        jimg = {}
+        jimg['split'] = img['split']
+        if 'filename' in img: jimg['file_path'] = os.path.join(img.get('filepath', ''), img['filename']) # copy it over, might need
+        if 'cocoid' in img:
+            jimg['id'] = img['cocoid'] # copy over & maintain an id, if present (e.g. coco ids, useful)
+        elif 'id' in img:
+            jimg['id'] = img['id']
+
+        if params['images_root'] != '':
+            with Image.open(os.path.join(params['images_root'], img['filepath'], img['filename'])) as _img:
+                jimg['width'], jimg['height'] = _img.size
+
+        out['images'].append(jimg)
+
+    json.dump(out, open(params['output_json'], 'w'))
+    print('wrote ', params['output_json'])
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+
+    # input json
+    parser.add_argument('--input_json', required=True, help='input json file to process into hdf5')
+    parser.add_argument('--output_json', default='data.json', help='output json file')
+    parser.add_argument('--output_h5', default='data', help='output h5 file')
+    parser.add_argument('--images_root', default='', help='root location in which images are stored, to be prepended to file_path in input json')
+
+    # options
+    parser.add_argument('--max_length', default=16, type=int, help='max length of a caption, in number of words. captions longer than this get clipped.')
+    parser.add_argument('--word_count_threshold', default=5, type=int, help='only words that occur more than this number of times will be put in vocab')
+
+    args = parser.parse_args()
+    params = vars(args) # convert to ordinary dict
+    print('parsed input parameters:')
+    print(json.dumps(params, indent = 2))
+    main(params)
diff --git a/scripts/prepro_ngrams.py b/scripts/prepro_ngrams.py
index f7cdce47..d1328193 100644
--- a/scripts/prepro_ngrams.py
+++ b/scripts/prepro_ngrams.py
@@ -6,6 +6,8 @@
 import json
 import argparse
 from six.moves import cPickle
+import sys
+sys.path.append('/data/private/mxy/code/ImageCaptioning.pytorch')
 
 import captioning.utils.misc as utils
 from collections import defaultdict
diff --git a/scripts/write_coco_da.py b/scripts/write_coco_da.py
new file mode 100644
index 00000000..4eeea08c
--- /dev/null
+++ b/scripts/write_coco_da.py
@@ -0,0 +1,85 @@
+import os
+import copy
+import json
+
+def write_json_fs(root, ratio):
+    '''
+    write json for few-shot training
+    '''
+    with open(f'{root}/dataset_coco.json', 'r') as fin:
+        dataset = json.load(fin)
+        images = dataset['images']
+
+    num_imgs = int(len(images) * ratio)
+
+    dataset_da = {
+        'dataset': f'coco_da_{ratio}',
+        'images': [],
+    }
+
+    i = 0
+    for img in images:
+        if img['split'] == 'train' or img['split'] == 'restval':
+            dataset_da['images'].append(img)
+            i += 1
+            if i == num_imgs:
+                break
+
+    with open(f'{root}/dataset_coco_fs.json', 'w') as fout:
+        json.dump(dataset_da, fout)
+
+def write_json_lostgan(root, img_root):
+    '''
+    write json for LostGAN, which only generates about 70k images
+    '''
+    with open(f'/data/share/data/coco2017/annotations/dataset_coco_lostgan.json', 'r') as fin:
+        dataset = json.load(fin)
+        images = dataset['images']
+
+    dataset_da = {
+        'dataset': f'coco_da_lostgan',
+        'images': [],
+    }
+
+    for img in images:
+        img_name = f"{img_root}/{str(img['cocoid']).zfill(12)}.jpg"
+        if os.path.isfile(img_name):
+            new_img = copy.deepcopy(img)
+            new_img['sentences'] = [img['sentences']]
+            dataset_da['images'].append(new_img)
+
+    print(f"wrote {len(dataset_da['images'])} images")
+
+    with open(f'{root}/dataset_coco_lostgan_sub.json', 'w') as fout:
+        json.dump(dataset_da, fout)
+
+def write_json(root, ratio):
+    '''
+    write json for da
+    param: ratio    # of synthetic data / # of real data,
+                    typically 1:1, 2:1 or 5:1
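+                    e.g. ratio=3 keeps every real entry and appends two extra entries
+                    per image, with cocoids suffixed _1 and _2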
+    '''
+    with open(f'{root}/dataset_coco.json', 'r') as fin:
+        dataset = json.load(fin)
+        images = dataset['images']
+
+    dataset_da = {
+        'dataset': f'coco_da_{ratio}',
+        'images': copy.deepcopy(images),
+    }
+
+    for i in range(1, ratio):
+        for img in images:
+            new_img = copy.deepcopy(img)
+            new_img['cocoid'] = f"{img['cocoid']}_{i}"
+            dataset_da['images'].append(new_img)
+
+    with open(f'{root}/dataset_coco_da_{ratio}.json', 'w') as fout:
+        json.dump(dataset_da, fout)
+
+if __name__ == '__main__':
+    root = '/data/share/image-caption'
+    img_root = '/data/share/Seg-Backtranslation/data/gen_imgs_lostgan_train_sub'
+    # ratio = 0.1
+    # write_json_fs(root, ratio)
+    write_json_lostgan(root, img_root)
\ No newline at end of file
diff --git a/scripts/write_f30k_da.py b/scripts/write_f30k_da.py
new file mode 100644
index 00000000..7e664744
--- /dev/null
+++ b/scripts/write_f30k_da.py
@@ -0,0 +1,36 @@
+import json
+
+def write_json(root, img_num):
+    '''
+    split the original json file for da
+    param: img_num    # of original images used for generation;
+                      # of generated images is expected to be 5*img_num, since 5 captions were given per image
+    '''
+    with open(f'{root}/dataset_flickr30k.json', 'r') as fin:
+        dataset = json.load(fin)
+        images = dataset['images'][:img_num]
+
+    dataset_da = {
+        'dataset': 'flickr30k_da',
+        'images': []
+    }
+    for img in images:
+        for i,s in enumerate(img['sentences']):
+            new_img = {
+                'sentids': [img['sentids'][i]],
+                'imgid': img['imgid'],
+                'sentences': [s],
+                'split': img['split'],
+                'id': f"{img['imgid']}_{i}"
+            }
+            dataset_da['images'].append(new_img)
+
+    with open(f'{root}/dataset_flickr30k_da.json', 'w') as fout:
+        json.dump(dataset_da, fout)
+
+if __name__ == '__main__':
+    # root = '/data/private/mxy/code/T2I_CL/DM-GAN+CL/output/coco_DMGAN_2021_08_15_14_19_42/Model/netG_epoch_120/f30k'
+    # out_path = '/data/private/mxy/data'
+    root = '/data/share/image-caption'
+    img_num = 22684
+    write_json(root, img_num)
\ No newline at end of file
diff --git a/tools/train.py b/tools/train.py
index 4d015a81..49e1944d 100644
--- a/tools/train.py
+++ b/tools/train.py
@@ -14,7 +14,9 @@ from six.moves import cPickle
 import traceback
 from collections import defaultdict
 
-
+# for import error
+import sys
+sys.path.append('/data/private/mxy/code/ImageCaptioning.pytorch')
 import captioning.utils.opts as opts
 import captioning.models as models
 from captioning.data.dataloader import *