From 9c1dd523a733dd0749abfeda2052a9e8837892f4 Mon Sep 17 00:00:00 2001
From: "Ruotian(RT) Luo" <rluo@ttic.edu>
Date: Mon, 11 Dec 2017 11:09:50 -0600
Subject: [PATCH 1/4] Update README.md

Add a reference to cheat some citations.
---
 README.md | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index f4032334..d235166c 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
-# Neuraltalk2-pytorch
+# ImageCaptioning.pytorch
 
-Changes compared to neuraltalk2.
+This is an image captioning codebase in PyTorch. If you are familiar with neuraltalk2, here are the differences compared to neuraltalk2.
 - Instead of using random split, we use [karpathy's train-val-test split](http://cs.stanford.edu/people/karpathy/deepimagesent/caption_datasets.zip).
 - Instead of including the convnet in the model, we use preprocessed features. (finetuneable cnn version is in the branch **with_finetune**)
 - Use resnet instead of vgg; the feature extraction method is the same as in self-critical: run cnn on original image and adaptively average pool the last conv layer feature to fixed size .
@@ -97,6 +97,22 @@ The defualt split to evaluate is test. The default inference method is greedy de
 
 **Live demo**. Not supported now. Welcome pull request.
 
+## Reference
+If you find this implementation helpful, please consider citing this repo:
+
+```
+@misc{Luo2017,
+author = {Ruotian Luo},
+title = {An Image Captioning codebase in PyTorch},
+year = {2017},
+publisher = {GitHub},
+journal = {GitHub repository},
+howpublished = {\url{https://github.com/ruotianluo/ImageCaptioning.pytorch}},
+}
+```
+
+Of course, please also cite the original papers of the models you are using (you can find references in the model files).
+
 ## Acknowledgements
 
-Thanks the original [neuraltalk2](https://github.com/karpathy/neuraltalk2) and awesome PyTorch team.
\ No newline at end of file
+Thanks to the original [neuraltalk2](https://github.com/karpathy/neuraltalk2) and the awesome PyTorch team.

From 6b8a7404bd4ebd83a1f980112f0bf1f8099816e0 Mon Sep 17 00:00:00 2001
From: Reyhane Askari <ReyhaneAskari@users.noreply.github.com>
Date: Thu, 8 Mar 2018 15:10:14 -0500
Subject: [PATCH 2/4] saving to and loading from single h5 (#38)

* saving to and loading from single hdf5

* fixed the indexing

* flake8

* fixed parallel

* changed function name

* updated pre_process_script

* fix for type

* code cleanup

* script added to convert old numpy files into h5 data
---
 dataloader.py                 | 151 ++++++++++++++++++++--------------
 scripts/convert_old.py        |  58 +++++++++++++
 scripts/pre_process_script.sh |  15 ++++
 scripts/prepro_feats.py       |  57 +++++++------
 4 files changed, 192 insertions(+), 89 deletions(-)
 create mode 100644 scripts/convert_old.py
 create mode 100644 scripts/pre_process_script.sh

diff --git a/dataloader.py b/dataloader.py
index f1175356..6bfd2804 100644
--- a/dataloader.py
+++ b/dataloader.py
@@ -8,22 +8,17 @@
 import numpy as np
 import random
 
-import torch
 import torch.utils.data as data
 
 import multiprocessing
 
-def get_npy_data(ix, fc_file, att_file, use_att):
-    if use_att == True:
-        return (np.load(fc_file), np.load(att_file)['feat'], ix)
-    else:
-        return (np.load(fc_file), np.zeros((1,1,1)), ix)
 
 class DataLoader(data.Dataset):
 
     def reset_iterator(self, split):
         del self._prefetch_process[split]
-        self._prefetch_process[split] = BlobFetcher(split, self, split=='train')
+        self._prefetch_process[split] = BlobFetcher(split,
+                                                    self, split == 'train')
         self.iterators[split] = 0
 
     def get_vocab_size(self):
@@ -35,22 +30,40 @@ def get_vocab(self):
     def get_seq_length(self):
         return self.seq_length
 
+    def read_files(self):
+        self.feats_fc = h5py.File(os.path.join(
+            self.opt.input_fc_dir, 'feats_fc.h5'), 'r')
+        self.feats_att = h5py.File(os.path.join(
+            self.opt.input_att_dir, 'feats_att.h5'), 'r')
+
+    def get_data(self, ix):
+        self.read_files()
+        index = str(self.info['images'][ix]['id'])
+        if self.use_att:
+            return (np.array(self.feats_fc[index]).astype('float32'),
+                    np.array(self.feats_att[index]).astype('float32'), ix)
+        else:
+            return (np.array(self.feats_fc[index]).astype('float32'),
+                    np.zeros((1, 1, 1)).astype('float32'), ix)
+
     def __init__(self, opt):
         self.opt = opt
         self.batch_size = self.opt.batch_size
         self.seq_per_img = opt.seq_per_img
         self.use_att = getattr(opt, 'use_att', True)
 
-        # load the json file which contains additional information about the dataset
+        # load json file which contains additional information about dataset
         print('DataLoader loading json file: ', opt.input_json)
         self.info = json.load(open(self.opt.input_json))
         self.ix_to_word = self.info['ix_to_word']
         self.vocab_size = len(self.ix_to_word)
         print('vocab size is ', self.vocab_size)
-        
+
         # open the hdf5 file
-        print('DataLoader loading h5 file: ', opt.input_fc_dir, opt.input_att_dir, opt.input_label_h5)
-        self.h5_label_file = h5py.File(self.opt.input_label_h5, 'r', driver='core')
+        print('DataLoader loading h5 file: ', opt.input_fc_dir,
+              opt.input_att_dir, opt.input_label_h5)
+        self.h5_label_file = h5py.File(self.opt.input_label_h5, 'r',
+                                       driver='core')
 
         self.input_fc_dir = self.opt.input_fc_dir
         self.input_att_dir = self.opt.input_att_dir
@@ -64,7 +77,7 @@ def __init__(self, opt):
         self.label_end_ix = self.h5_label_file['label_end_ix'][:]
 
         self.num_images = self.label_start_ix.shape[0]
-        print('read %d image features' %(self.num_images))
+        print('read %d image features' % (self.num_images))
 
         # separate out indexes for each of the provided splits
         self.split_ix = {'train': [], 'val': [], 'test': []}
@@ -76,23 +89,27 @@ def __init__(self, opt):
                 self.split_ix['val'].append(ix)
             elif img['split'] == 'test':
                 self.split_ix['test'].append(ix)
-            elif opt.train_only == 0: # restval
+            elif opt.train_only == 0:  # restval
                 self.split_ix['train'].append(ix)
 
-        print('assigned %d images to split train' %len(self.split_ix['train']))
-        print('assigned %d images to split val' %len(self.split_ix['val']))
-        print('assigned %d images to split test' %len(self.split_ix['test']))
+        print('assigned %d images to split train' % len(self.split_ix['train']))
+        print('assigned %d images to split val' % len(self.split_ix['val']))
+        print('assigned %d images to split test' % len(self.split_ix['test']))
 
         self.iterators = {'train': 0, 'val': 0, 'test': 0}
-        
-        self._prefetch_process = {} # The three prefetch process
+
+        self._prefetch_process = {}  # The three prefetch process
         for split in self.iterators.keys():
-            self._prefetch_process[split] = BlobFetcher(split, self, split=='train')
+            self._prefetch_process[split] = BlobFetcher(split,
+                                                        self,
+                                                        split == 'train')
             # Terminate the child process when the parent exists
+
         def cleanup():
             print('Terminating BlobFetcher')
             for split in self.iterators.keys():
                 del self._prefetch_process[split]
+
         import atexit
         atexit.register(cleanup)
 
@@ -100,10 +117,12 @@ def get_batch(self, split, batch_size=None, seq_per_img=None):
         batch_size = batch_size or self.batch_size
         seq_per_img = seq_per_img or self.seq_per_img
 
-        fc_batch = [] # np.ndarray((batch_size * seq_per_img, self.opt.fc_feat_size), dtype = 'float32')
-        att_batch = [] # np.ndarray((batch_size * seq_per_img, 14, 14, self.opt.att_feat_size), dtype = 'float32')
-        label_batch = np.zeros([batch_size * seq_per_img, self.seq_length + 2], dtype = 'int')
-        mask_batch = np.zeros([batch_size * seq_per_img, self.seq_length + 2], dtype = 'float32')
+        fc_batch = []
+        att_batch = []
+        label_batch = np.zeros(
+            [batch_size * seq_per_img, self.seq_length + 2], dtype='int')
+        mask_batch = np.zeros(
+            [batch_size * seq_per_img, self.seq_length + 2], dtype='float32')
 
         wrapped = False
 
@@ -111,8 +130,6 @@ def get_batch(self, split, batch_size=None, seq_per_img=None):
         gts = []
 
         for i in range(batch_size):
-            import time
-            t_start = time.time()
             # fetch image
             tmp_fc, tmp_att,\
                 ix, tmp_wrapped = self._prefetch_process[split].get()
@@ -120,76 +137,79 @@ def get_batch(self, split, batch_size=None, seq_per_img=None):
             att_batch += [tmp_att] * seq_per_img
 
             # fetch the sequence labels
-            ix1 = self.label_start_ix[ix] - 1 #label_start_ix starts from 1
+            ix1 = self.label_start_ix[ix] - 1  # label_start_ix starts from 1
             ix2 = self.label_end_ix[ix] - 1
-            ncap = ix2 - ix1 + 1 # number of captions available for this image
-            assert ncap > 0, 'an image does not have any label. this can be handled but right now isn\'t'
+            ncap = ix2 - ix1 + 1  # number of captions available for this image
+            assert ncap > 0, 'an image does not have any label.'
 
             if ncap < seq_per_img:
                 # we need to subsample (with replacement)
-                seq = np.zeros([seq_per_img, self.seq_length], dtype = 'int')
+                seq = np.zeros([seq_per_img, self.seq_length], dtype='int')
                 for q in range(seq_per_img):
-                    ixl = random.randint(ix1,ix2)
-                    seq[q, :] = self.h5_label_file['labels'][ixl, :self.seq_length]
+                    ixl = random.randint(ix1, ix2)
+                    seq[q, :] = self.h5_label_file['labels'][ixl,
+                                                             :self.seq_length]
             else:
                 ixl = random.randint(ix1, ix2 - seq_per_img + 1)
-                seq = self.h5_label_file['labels'][ixl: ixl + seq_per_img, :self.seq_length]
-            
-            label_batch[i * seq_per_img : (i + 1) * seq_per_img, 1 : self.seq_length + 1] = seq
+                seq = self.h5_label_file['labels'][ixl: ixl + seq_per_img,
+                                                   :self.seq_length]
+
+            label_batch[i * seq_per_img: (i + 1) * seq_per_img,
+                        1: self.seq_length + 1] = seq
 
             if tmp_wrapped:
                 wrapped = True
 
             # Used for reward evaluation
-            gts.append(self.h5_label_file['labels'][self.label_start_ix[ix] - 1: self.label_end_ix[ix]])
-        
+            gts.append(
+                self.h5_label_file['labels'][self.label_start_ix[ix] - 1:
+                                             self.label_end_ix[ix]])
+
             # record associated info as well
             info_dict = {}
             info_dict['ix'] = ix
             info_dict['id'] = self.info['images'][ix]['id']
             info_dict['file_path'] = self.info['images'][ix]['file_path']
             infos.append(info_dict)
-            #print(i, time.time() - t_start)
 
         # generate mask
-        t_start = time.time()
-        nonzeros = np.array(list(map(lambda x: (x != 0).sum()+2, label_batch)))
+        nonzeros = np.array(list(map(lambda x: (x != 0).sum() + 2, label_batch)))
         for ix, row in enumerate(mask_batch):
             row[:nonzeros[ix]] = 1
-        #print('mask', time.time() - t_start)
 
         data = {}
         data['fc_feats'] = np.stack(fc_batch)
         data['att_feats'] = np.stack(att_batch)
         data['labels'] = label_batch
         data['gts'] = gts
-        data['masks'] = mask_batch 
-        data['bounds'] = {'it_pos_now': self.iterators[split], 'it_max': len(self.split_ix[split]), 'wrapped': wrapped}
+        data['masks'] = mask_batch
+        data['bounds'] = {'it_pos_now': self.iterators[split],
+                          'it_max': len(self.split_ix[split]),
+                          'wrapped': wrapped}
         data['infos'] = infos
 
         return data
 
-    # It's not coherent to make DataLoader a subclass of Dataset, but essentially, we only need to implement the following to functions,
-    # so that the torch.utils.data.DataLoader can load the data according the index.
-    # However, it's minimum change to switch to pytorch data loading.
+    # It's not coherent to make DataLoader a subclass of Dataset,
+    # but essentially, we only need to implement the following two functions
+    # so that torch.utils.data.DataLoader can load the data according to
+    # the index. However, it's minimum change to switch to pytorch data loading
     def __getitem__(self, index):
         """This function returns a tuple that is further passed to collate_fn
         """
-        ix = index #self.split_ix[index]
-        return get_npy_data(ix, \
-                os.path.join(self.input_fc_dir, str(self.info['images'][ix]['id']) + '.npy'),
-                os.path.join(self.input_att_dir, str(self.info['images'][ix]['id']) + '.npz'),
-                self.use_att
-                )
+        ix = index  # self.split_ix[index]
+        return self.get_data(ix)
 
     def __len__(self):
         return len(self.info['images'])
 
+
 class BlobFetcher():
     """Experimental class for prefetching blobs in a separate process."""
     def __init__(self, split, dataloader, if_shuffle=False):
         """
-        db is a list of tuples containing: imcrop_name, caption, bbox_feat of gt box, imname
+        db is a list of tuples containing: imcrop_name,
+        caption, bbox_feat of gt box, imname
         """
         self.split = split
         self.dataloader = dataloader
@@ -199,17 +219,21 @@ def __init__(self, split, dataloader, if_shuffle=False):
     def reset(self):
         """
         Two cases:
-        1. not hasattr(self, 'split_loader'): Resume from previous training. Create the dataset given the saved split_ix and iterator
-        2. wrapped: a new epoch, the split_ix and iterator have been updated in the get_minibatch_inds already.
+        1. not hasattr(self, 'split_loader'): Resume from previous training.
+        Create the dataset given the saved split_ix and iterator
+        2. wrapped: a new epoch, the split_ix and iterator have been updated in
+         the get_minibatch_inds already.
         """
         # batch_size is 0, the merge is done in DataLoader class
-        self.split_loader = iter(data.DataLoader(dataset=self.dataloader,
-                                            batch_size=1,
-                                            sampler=self.dataloader.split_ix[self.split][self.dataloader.iterators[self.split]:],
-                                            shuffle=False,
-                                            pin_memory=True,
-                                            num_workers=multiprocessing.cpu_count(),
-                                            collate_fn=lambda x: x[0]))
+        sampler = self.dataloader.split_ix[self.split][self.dataloader.iterators[self.split]:]
+        self.split_loader = iter(
+            data.DataLoader(dataset=self.dataloader,
+                            batch_size=1,
+                            sampler=sampler,
+                            shuffle=False,
+                            pin_memory=True,
+                            num_workers=multiprocessing.cpu_count(),
+                            collate_fn=lambda x: x[0]))
 
     def _get_next_minibatch_inds(self):
         max_index = len(self.dataloader.split_ix[self.split])
@@ -227,7 +251,7 @@ def _get_next_minibatch_inds(self):
         self.dataloader.iterators[self.split] = ri_next
 
         return ix, wrapped
-    
+
     def get(self):
         if not hasattr(self, 'split_loader'):
             self.reset()
@@ -236,7 +260,6 @@ def get(self):
         tmp = self.split_loader.next()
         if wrapped:
             self.reset()
-
         assert tmp[2] == ix, "ix not equal"
 
-        return tmp + [wrapped]
\ No newline at end of file
+        return tmp + [wrapped]
diff --git a/scripts/convert_old.py b/scripts/convert_old.py
new file mode 100644
index 00000000..ce5de678
--- /dev/null
+++ b/scripts/convert_old.py
@@ -0,0 +1,58 @@
+import argparse
+import h5py
+import os
+import numpy as np
+import json
+
+
+def main(params):
+    if not os.path.isdir(params['fc_output_dir']):
+        os.mkdir(params['fc_output_dir'])
+    if not os.path.isdir(params['att_output_dir']):
+        os.mkdir(params['att_output_dir'])
+
+    imgs = json.load(open(params['input_json'], 'r'))
+    imgs = imgs['images']
+    N = len(imgs)
+
+    with h5py.File(os.path.join(params['fc_output_dir'], 'feats_fc.h5')) as file_fc,\
+            h5py.File(os.path.join(params['att_output_dir'], 'feats_att.h5')) as file_att:
+        for i, img in enumerate(imgs):
+            d_set_fc = file_fc.create_dataset(
+                str(img['cocoid']), (2048,), dtype="float")
+            d_set_att = file_att.create_dataset(
+                str(img['cocoid']),
+                (params['att_size'], params['att_size'], 2048), dtype="float")
+
+            npy_fc_path = os.path.join(
+                params['fc_input_dir'][:-1],
+                str(img['cocoid']) + '.npy')
+            npy_att_path = os.path.join(
+                params['att_input_dir'][:-1],
+                str(img['cocoid']) + '.npz')
+
+            d_set_fc[...] = np.load(npy_fc_path)
+            d_set_att[...] = np.load(npy_att_path)['feat']
+            if i % 1000 == 0:
+                print('processing %d/%d (%.2f%% done)' % (i, N, i * 100.0 / N))
+        file_fc.close()
+        file_att.close()
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('--input_json', required=True, help='input json file to process into hdf5')
+    parser.add_argument('--fc_output_dir', default='data', help='output directory for fc')
+    parser.add_argument('--att_output_dir', default='data', help='output directory for att')
+    parser.add_argument('--fc_input_dir', default='data', help='input directory for numpy fc files')
+    parser.add_argument('--att_input_dir', default='data', help='input directory for numpy att files')
+    parser.add_argument('--att_size', default=14, type=int, help='14x14 or 7x7')
+
+    args = parser.parse_args()
+    params = vars(args)  # convert to ordinary dict
+    print('parsed input parameters:')
+    print(json.dumps(params, indent=2))
+
+    main(params)
diff --git a/scripts/pre_process_script.sh b/scripts/pre_process_script.sh
new file mode 100644
index 00000000..6452ab08
--- /dev/null
+++ b/scripts/pre_process_script.sh
@@ -0,0 +1,15 @@
+mkdir data
+cd data
+wget https://download.pytorch.org/models/resnet101-5d3b4d8f.pth -O resnet101.pth
+wget http://images.cocodataset.org/zips/train2014.zip
+wget http://images.cocodataset.org/zips/val2014.zip
+unzip train2014.zip
+unzip val2014.zip
+wget http://cs.stanford.edu/people/karpathy/deepimagesent/caption_datasets.zip
+unzip caption_datasets.zip
+python ./prepro_labels.py --input_json ./dataset_coco.json --output_json ./cocotalk.json --output_h5 ./cocotalk
+python ./prepro_feats.py --input_json ./dataset_coco.json --output_dir ./cocotalk --images_root ./ --model_root ./
+# in case of corrunption due to bad file closure:
+# h5clear --status data/cocotalk_att/feats_att.h5 
+# h5clear --status data/coco_preprocessed/data/cocotalk_att/feats_att.h5
+
diff --git a/scripts/prepro_feats.py b/scripts/prepro_feats.py
index 6489e49f..4641cd53 100644
--- a/scripts/prepro_feats.py
+++ b/scripts/prepro_feats.py
@@ -30,14 +30,11 @@
 import os
 import json
 import argparse
-from random import shuffle, seed
-import string
-# non-standard dependencies:
 import h5py
-from six.moves import cPickle
+from random import shuffle, seed
+
 import numpy as np
 import torch
-import torchvision.models as models
 from torch.autograd import Variable
 import skimage.io
 
@@ -50,6 +47,7 @@
 from misc.resnet_utils import myResnet
 import misc.resnet as resnet
 
+
 def main(params):
   net = getattr(resnet, params['model'])()
   net.load_state_dict(torch.load(os.path.join(params['model_root'],params['model']+'.pth')))
@@ -70,25 +68,34 @@ def main(params):
   if not os.path.isdir(dir_att):
     os.mkdir(dir_att)
 
-  for i,img in enumerate(imgs):
-    # load the image
-    I = skimage.io.imread(os.path.join(params['images_root'], img['filepath'], img['filename']))
-    # handle grayscale input images
-    if len(I.shape) == 2:
-      I = I[:,:,np.newaxis]
-      I = np.concatenate((I,I,I), axis=2)
-
-    I = I.astype('float32')/255.0
-    I = torch.from_numpy(I.transpose([2,0,1])).cuda()
-    I = Variable(preprocess(I), volatile=True)
-    tmp_fc, tmp_att = my_resnet(I, params['att_size'])
-    # write to pkl
-    np.save(os.path.join(dir_fc, str(img['cocoid'])), tmp_fc.data.cpu().float().numpy())
-    np.savez_compressed(os.path.join(dir_att, str(img['cocoid'])), feat=tmp_att.data.cpu().float().numpy())
-
-    if i % 1000 == 0:
-      print('processing %d/%d (%.2f%% done)' % (i, N, i*100.0/N))
-  print('wrote ', params['output_dir'])
+  with h5py.File(os.path.join(dir_fc, 'feats_fc.h5')) as file_fc,\
+       h5py.File(os.path.join(dir_att, 'feats_att.h5')) as file_att:
+    for i, img in enumerate(imgs):
+      # load the image
+      I = skimage.io.imread(os.path.join(params['images_root'], img['filepath'], img['filename']))
+      # handle grayscale input images
+      if len(I.shape) == 2:
+        I = I[:,:,np.newaxis]
+        I = np.concatenate((I,I,I), axis=2)
+
+      I = I.astype('float32')/255.0
+      I = torch.from_numpy(I.transpose([2,0,1])).cuda()
+      I = Variable(preprocess(I), volatile=True)
+      tmp_fc, tmp_att = my_resnet(I, params['att_size'])
+      # write to hdf5
+
+      d_set_fc = file_fc.create_dataset(str(img['cocoid']), 
+        (2048,), dtype="float")
+      d_set_att = file_att.create_dataset(str(img['cocoid']), 
+        (params['att_size'], params['att_size'], 2048), dtype="float")
+
+      d_set_fc[...] = tmp_fc.data.cpu().float().numpy()
+      d_set_att[...] = tmp_att.data.cpu().float().numpy()
+      if i % 1000 == 0:
+        print('processing %d/%d (%.2f%% done)' % (i, N, i*100.0 / N))
+    file_fc.close()
+    file_att.close()
+
 
 if __name__ == "__main__":
 
@@ -96,7 +103,7 @@ def main(params):
 
   # input json
   parser.add_argument('--input_json', required=True, help='input json file to process into hdf5')
-  parser.add_argument('--output_dir', default='data', help='output h5 file')
+  parser.add_argument('--output_dir', default='data', help='output directory')
 
   # options
   parser.add_argument('--images_root', default='', help='root location in which images are stored, to be prepended to file_path in input json')

From 99de7af6e71e3c4b579a664b84ce187ea9a532e8 Mon Sep 17 00:00:00 2001
From: Ruotian Luo <rluo@ttic.edu>
Date: Thu, 8 Mar 2018 14:27:36 -0600
Subject: [PATCH 3/4] Change convert_old.py(no need to specify the feature size
 in advance, we can directly use the loaded array size to create dataset);
 remove pre_process_script.sh

---
 scripts/convert_old.py        | 19 ++++++++-----------
 scripts/pre_process_script.sh | 15 ---------------
 2 files changed, 8 insertions(+), 26 deletions(-)
 delete mode 100644 scripts/pre_process_script.sh

diff --git a/scripts/convert_old.py b/scripts/convert_old.py
index ce5de678..5744f1c1 100644
--- a/scripts/convert_old.py
+++ b/scripts/convert_old.py
@@ -18,21 +18,19 @@ def main(params):
     with h5py.File(os.path.join(params['fc_output_dir'], 'feats_fc.h5')) as file_fc,\
             h5py.File(os.path.join(params['att_output_dir'], 'feats_att.h5')) as file_att:
         for i, img in enumerate(imgs):
-            d_set_fc = file_fc.create_dataset(
-                str(img['cocoid']), (2048,), dtype="float")
-            d_set_att = file_att.create_dataset(
-                str(img['cocoid']),
-                (params['att_size'], params['att_size'], 2048), dtype="float")
-
             npy_fc_path = os.path.join(
-                params['fc_input_dir'][:-1],
+                params['fc_input_dir'],
                 str(img['cocoid']) + '.npy')
             npy_att_path = os.path.join(
-                params['att_input_dir'][:-1],
+                params['att_input_dir'],
                 str(img['cocoid']) + '.npz')
 
-            d_set_fc[...] = np.load(npy_fc_path)
-            d_set_att[...] = np.load(npy_att_path)['feat']
+            d_set_fc = file_fc.create_dataset(
+                str(img['cocoid']), data=np.load(npy_fc_path))
+            d_set_att = file_att.create_dataset(
+                str(img['cocoid']),
+                data=np.load(npy_att_path)['feat'])
+
             if i % 1000 == 0:
                 print('processing %d/%d (%.2f%% done)' % (i, N, i * 100.0 / N))
         file_fc.close()
@@ -48,7 +46,6 @@ def main(params):
     parser.add_argument('--att_output_dir', default='data', help='output directory for att')
     parser.add_argument('--fc_input_dir', default='data', help='input directory for numpy fc files')
     parser.add_argument('--att_input_dir', default='data', help='input directory for numpy att files')
-    parser.add_argument('--att_size', default=14, type=int, help='14x14 or 7x7')
 
     args = parser.parse_args()
     params = vars(args)  # convert to ordinary dict
diff --git a/scripts/pre_process_script.sh b/scripts/pre_process_script.sh
deleted file mode 100644
index 6452ab08..00000000
--- a/scripts/pre_process_script.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-mkdir data
-cd data
-wget https://download.pytorch.org/models/resnet101-5d3b4d8f.pth -O resnet101.pth
-wget http://images.cocodataset.org/zips/train2014.zip
-wget http://images.cocodataset.org/zips/val2014.zip
-unzip train2014.zip
-unzip val2014.zip
-wget http://cs.stanford.edu/people/karpathy/deepimagesent/caption_datasets.zip
-unzip caption_datasets.zip
-python ./prepro_labels.py --input_json ./dataset_coco.json --output_json ./cocotalk.json --output_h5 ./cocotalk
-python ./prepro_feats.py --input_json ./dataset_coco.json --output_dir ./cocotalk --images_root ./ --model_root ./
-# in case of corrunption due to bad file closure:
-# h5clear --status data/cocotalk_att/feats_att.h5 
-# h5clear --status data/coco_preprocessed/data/cocotalk_att/feats_att.h5
-

From 622b6a5ffe9ee599911306b464dfa1ed2a19fa37 Mon Sep 17 00:00:00 2001
From: Ruotian Luo <rluo@ttic.edu>
Date: Thu, 8 Mar 2018 14:27:54 -0600
Subject: [PATCH 4/4] Update Readme for legacy conversion.

---
 README.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/README.md b/README.md
index d235166c..c71197f3 100644
--- a/README.md
+++ b/README.md
@@ -31,6 +31,7 @@ Once we have these, we can now invoke the `prepro_*.py` script, which will read
 ```bash
 $ python scripts/prepro_labels.py --input_json data/dataset_coco.json --output_json data/cocotalk.json --output_h5 data/cocotalk
 $ python scripts/prepro_feats.py --input_json data/dataset_coco.json --output_dir data/cocotalk --images_root $IMAGE_ROOT
+
 ```
 
 `prepro_labels.py` will map all words that occur <= 5 times to a special `UNK` token, and create a vocabulary for all the remaining words. The image information and vocabulary are dumped into `data/cocotalk.json` and discretized caption data are dumped into `data/cocotalk_label.h5`.
@@ -39,6 +40,12 @@ $ python scripts/prepro_feats.py --input_json data/dataset_coco.json --output_di
 
 (Check the prepro scripts for more options, like other resnet models or other attention sizes.)
 
+**Legacy:** previously we extracted features into separate npy/npz files for each image, but these are slower to load on some NFS setups and cumbersome to copy around. We now save all the features in h5 files. If you want to convert existing npy/npz files to h5 files, you can run
+
+```bash
+$ python scripts/convert_old.py --input_json data/dataset_coco.json --fc_input_dir data/cocotalk_fc/ --att_input_dir data/cocotalk_att/ --fc_output_dir data/cocotalk_fc --att_output_dir data/cocotalk_att/
+```
+
 **Warning**: the prepro script will fail with the default MSCOCO data because one of their images is corrupted. See [this issue](https://github.com/karpathy/neuraltalk2/issues/4) for the fix, it involves manually replacing one image in the dataset.
 
 ### Start training