Interesting statistics:
- Total captions: 40445
- Average caption length: 11.78 words
- Max caption length: 38 words
- Min caption length: 1 word
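For the curious, here is a minimal sketch of how these numbers can be computed, assuming the captions have already been collected into a list of strings called captions:

# Minimal sketch, assuming `captions` is a list of caption strings
lengths = [len(caption.split()) for caption in captions]
print(f"Total captions: {len(captions)}")
print(f"Average caption length: {sum(lengths) / len(lengths):.2f} words")
print(f"Max caption length: {max(lengths)} words")
print(f"Min caption length: {min(lengths)} words")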
Tokenizing Captions
This section prepares the textual data by converting image captions into sequences of integers. It also splits the dataset into training and validation sets, with 85% of the images used for training.
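As a quick toy illustration (separate from the actual pipeline below), Keras's Tokenizer assigns each word an integer id and maps sentences onto those ids:

from tensorflow.keras.preprocessing.text import Tokenizer

# Toy example only -- the real pipeline below fits on all 40445 captions
toy = Tokenizer()
toy.fit_on_texts(["a dog runs", "a dog sleeps"])
print(toy.word_index)                          # {'a': 1, 'dog': 2, 'runs': 3, 'sleeps': 4}
print(toy.texts_to_sequences(["a dog runs"]))  # [[1, 2, 3]]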
# Tokenize captions: map each word to an integer id
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(captions)
# +1 because index 0 is reserved for padding
vocab_size = len(tokenizer.word_index) + 1
# The longest caption (in words) sets the padded sequence length
max_length = max(len(caption.split()) for caption in captions)
# Train-validation split on unique image filenames (85% / 15%).
# Splitting on images rather than rows keeps all captions for a
# given image on the same side of the split, avoiding leakage.
images = data['image'].unique().tolist()
nimages = len(images)
split_index = round(0.85 * nimages)
train_images = images[:split_index]
val_images = images[split_index:]

train = data[data['image'].isin(train_images)]
test = data[data['image'].isin(val_images)]
train.reset_index(inplace=True, drop=True)
test.reset_index(inplace=True, drop=True)
# Example: the second caption rendered as its integer sequence
tokenizer.texts_to_sequences([captions[1]])[0]

Image Feature Extraction
This section uses a pretrained DenseNet201 model to extract image features, taking the output of the penultimate (global average pooling) layer, a 1920-dimensional vector per image. The features are cached in a dictionary keyed by filename for reuse during model training. Each image is resized to 224×224, scaled to the [0, 1] range, and given a batch dimension so it matches the model's expected input.
# Use DenseNet201 (ImageNet weights) for image feature extraction
import os
import numpy as np
from tqdm import tqdm
from tensorflow.keras.applications import DenseNet201
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import load_img, img_to_array

model = DenseNet201()
# Drop the classifier head: use the global-average-pool output as the feature
fe = Model(inputs=model.input, outputs=model.layers[-2].output)
img_size = 224

features = {}
for image in tqdm(data['image'].unique().tolist()):
    img = load_img(os.path.join(image_path, image), target_size=(img_size, img_size))
    img = img_to_array(img)
    img = img / 255.
    img = np.expand_dims(img, axis=0)  # add batch dimension: (1, 224, 224, 3)
    feature = fe.predict(img, verbose=0)
    features[image] = feature

Custom Data Generator
This custom data generator class enables efficient model training by yielding batches of data that pair cached image features with tokenized caption sequences. It handles batch creation and per-epoch shuffling, and expands every caption into (partial sequence → next word) training pairs in the format the model expects. The snippet just below previews that expansion for a single caption.
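Here is a minimal sketch of the expansion the generator performs per caption (the caption and its integer ids are made up; real ids depend on the fitted tokenizer):

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

seq = tokenizer.texts_to_sequences(["a dog runs fast"])[0]  # e.g. [1, 17, 52, 203]
for i in range(1, len(seq)):
    in_seq = pad_sequences([seq[:i]], maxlen=max_length)[0]        # left-padded prefix
    out_seq = to_categorical([seq[i]], num_classes=vocab_size)[0]  # one-hot next word
# A caption of n words yields n - 1 training pairs:
# [1] -> 17, [1, 17] -> 52, [1, 17, 52] -> 203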
import numpy as np
from tensorflow.keras.utils import Sequence, to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

class CustomDataGenerator(Sequence):
    def __init__(self, df, X_col, y_col, batch_size, directory, tokenizer,
                 vocab_size, max_length, features, shuffle=True):
        self.df = df.copy()
        self.X_col = X_col          # column holding image filenames
        self.y_col = y_col          # column holding captions
        self.directory = directory
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.features = features    # precomputed image features
        self.shuffle = shuffle
        self.n = len(self.df)

    def on_epoch_end(self):
        # Reshuffle the rows between epochs so batches differ
        if self.shuffle:
            self.df = self.df.sample(frac=1).reset_index(drop=True)

    def __len__(self):
        # Number of full batches per epoch
        return self.n // self.batch_size

    def __getitem__(self, index):
        batch = self.df.iloc[index * self.batch_size:(index + 1) * self.batch_size, :]
        X1, X2, y = self.__get_data(batch)
        return (X1, X2), y

    def __get_data(self, batch):
        X1, X2, y = list(), list(), list()
        images = batch[self.X_col].tolist()
        for image in images:
            feature = self.features[image][0]
            captions = batch.loc[batch[self.X_col] == image, self.y_col].tolist()
            for caption in captions:
                seq = self.tokenizer.texts_to_sequences([caption])[0]
                # Expand the caption into (prefix -> next word) training pairs
                for i in range(1, len(seq)):
                    in_seq, out_seq = seq[:i], seq[i]
                    in_seq = pad_sequences([in_seq], maxlen=self.max_length)[0]
                    out_seq = to_categorical([out_seq], num_classes=self.vocab_size)[0]
                    X1.append(feature)
                    X2.append(in_seq)
                    y.append(out_seq)
        X1, X2, y = np.array(X1), np.array(X2), np.array(y)
        return X1, X2, y

Next week we will look at the model creation and the image and text feature layers!
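Until then, here is a hedged sketch of how the generator might be instantiated; the batch size of 64 and the caption column name 'caption' are assumptions about the DataFrame, so adjust them to your own data:

# Sketch only: batch_size and the 'caption' column name are assumptions
train_generator = CustomDataGenerator(df=train, X_col='image', y_col='caption',
                                      batch_size=64, directory=image_path,
                                      tokenizer=tokenizer, vocab_size=vocab_size,
                                      max_length=max_length, features=features)
val_generator = CustomDataGenerator(df=test, X_col='image', y_col='caption',
                                    batch_size=64, directory=image_path,
                                    tokenizer=tokenizer, vocab_size=vocab_size,
                                    max_length=max_length, features=features)

(X1, X2), y = train_generator[0]
print(X1.shape, X2.shape, y.shape)  # (N, 1920), (N, max_length), (N, vocab_size)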