Date created: 2016/01/11
Last modified: 2020/05/02
Description: Transferring the style of a reference image to a target image using gradient descent.
Style transfer consists in generating an image with the same "content" as a base image, but with the "style" of a different picture (typically artistic). This is achieved through the optimization of a loss function that has 3 components: "style loss", "content loss", and "total variation loss".
Reference: A Neural Algorithm of Artistic Style
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.applications import vgg19

# Download the content ("base") image and the style reference image.
base_image_path = keras.utils.get_file(
    fname="paris.jpg", origin="https://i.imgur.com/F28w3Ac.jpg"
)
style_reference_image_path = keras.utils.get_file(
    fname="starry_night.jpg", origin="https://i.imgur.com/9ooB60I.jpg"
)

# Filename prefix for the generated images written during optimization.
result_prefix = "paris_generated"

# Weights of the different loss components
content_weight = 2.5e-8
style_weight = 1e-6
total_variation_weight = 1e-6

# Dimensions of the generated picture: the height is fixed at 400 rows and
# the width is scaled to keep the base image's aspect ratio.
img_nrows = 400
width, height = keras.preprocessing.image.load_img(base_image_path).size
img_ncols = int(width * img_nrows / height)
Downloading data from https://i.imgur.com/F28w3Ac.jpg 106496/102437 [===============================] - 0s 0us/step 114688/102437 [=================================] - 0s 0us/step Downloading data from https://i.imgur.com/9ooB60I.jpg 942080/935806 [==============================] - 0s 0us/step 950272/935806 [==============================] - 0s 0us/step
from IPython.display import Image, display

# Show the base (content) image first, then the style reference image.
for _preview_path in (base_image_path, style_reference_image_path):
    display(Image(_preview_path))
def preprocess_image(image_path):
    """Open, resize and format a picture into a tensor suitable for VGG19.

    The image is loaded at ``(img_nrows, img_ncols)``, converted to an array,
    given a leading batch axis, and passed through ``vgg19.preprocess_input``.

    Args:
        image_path: Path of the image file to load.

    Returns:
        A ``tf.Tensor`` holding the preprocessed image with a batch axis of 1.
    """
    img = keras.preprocessing.image.load_img(
        image_path, target_size=(img_nrows, img_ncols)
    )
    img = keras.preprocessing.image.img_to_array(img)
    img = np.expand_dims(img, axis=0)
    img = vgg19.preprocess_input(img)
    return tf.convert_to_tensor(img)


def deprocess_image(x):
    """Convert a preprocessed image array back into a valid RGB uint8 image.

    Undoes the zero-centering and channel ordering applied during
    preprocessing, then clips to the displayable range.

    Args:
        x: NumPy array with ``img_nrows * img_ncols * 3`` elements. It is
            left unmodified.

    Returns:
        A ``uint8`` array of shape ``(img_nrows, img_ncols, 3)`` in RGB order.
    """
    # reshape() usually returns a view, so work on a copy: the in-place
    # additions below must not leak into the caller's array.
    x = x.reshape((img_nrows, img_ncols, 3)).copy()
    # Remove zero-center by mean pixel
    x[:, :, 0] += 103.939
    x[:, :, 1] += 116.779
    x[:, :, 2] += 123.68
    # 'BGR'->'RGB'
    x = x[:, :, ::-1]
    x = np.clip(x, 0, 255).astype("uint8")
    return x
First, we need to define 4 utility functions:
`gram_matrix` (used to compute the style loss)
The `style_loss` function, which keeps the generated image close to the local textures of the style reference image
The `content_loss` function, which keeps the high-level representation of the generated image close to that of the base image
The `total_variation_loss` function, a regularization loss which keeps the generated image locally coherent
def gram_matrix(x):
    """Return the gram matrix of an image tensor (feature-wise outer product).

    Args:
        x: Rank-3 tensor. Its last axis is moved to the front and treated as
            the feature axis; the two remaining axes are flattened.

    Returns:
        A square tensor with one row and one column per feature.
    """
    x = tf.transpose(x, (2, 0, 1))
    # After the transpose, axis 0 is the feature axis. tf.reshape needs that
    # scalar dimension, not the whole shape vector.
    features = tf.reshape(x, (tf.shape(x)[0], -1))
    gram = tf.matmul(features, tf.transpose(features))
    return gram


def style_loss(style, combination):
    """Return the style loss between two feature maps.

    The "style loss" is designed to maintain the style of the reference image
    in the generated image. It is based on the gram matrices (which capture
    style) of feature maps from the style reference image and from the
    generated image.

    Args:
        style: Feature map of the style reference image.
        combination: Feature map of the generated image.

    Returns:
        Scalar tensor: the summed squared difference of the two gram
        matrices, divided by ``4 * channels**2 * size**2``.
    """
    S = gram_matrix(style)
    C = gram_matrix(combination)
    # The normalization uses the image's channel count and pixel count,
    # not the dimensions of the feature maps passed in.
    channels = 3
    size = img_nrows * img_ncols
    return tf.reduce_sum(tf.square(S - C)) / (4.0 * (channels**2) * (size**2))


def content_loss(base, combination):
    """Return the content loss: the summed squared feature difference.

    An auxiliary loss function designed to maintain the "content" of the
    base image in the generated image.

    Args:
        base: Feature map of the base image.
        combination: Feature map of the generated image.

    Returns:
        Scalar tensor.
    """
    return tf.reduce_sum(tf.square(combination - base))


def total_variation_loss(x):
    """Return the total variation loss of a batch of images.

    Designed to keep the generated image locally coherent by penalizing
    differences between neighbouring pixels.

    Args:
        x: Rank-4 image tensor; axes 1 and 2 are sliced up to
            ``img_nrows`` and ``img_ncols`` respectively.

    Returns:
        Scalar tensor: the sum of the combined squared neighbour differences
        raised to the power 1.25.
    """
    # Squared difference between each pixel and its neighbour along axis 1.
    a = tf.square(
        x[:, : img_nrows - 1, : img_ncols - 1, :] - x[:, 1:, : img_ncols - 1, :]
    )
    # Squared difference between each pixel and its neighbour along axis 2.
    b = tf.square(
        x[:, : img_nrows - 1, : img_ncols - 1, :] - x[:, : img_nrows - 1, 1:, :]
    )
    return tf.reduce_sum(tf.pow(a + b, 1.25))
Next, let's create a feature extraction model that retrieves the intermediate activations of VGG19 (as a dict, by name).
# VGG19 with pre-trained ImageNet weights and without its classifier head.
model = vgg19.VGG19(weights="imagenet", include_top=False)

# Symbolic output of every layer, keyed by the layer's (unique) name.
outputs_dict = {layer.name: layer.output for layer in model.layers}

# A model that, for a given input, returns the activation values of every
# VGG19 layer as a dict keyed by layer name.
feature_extractor = keras.Model(inputs=model.inputs, outputs=outputs_dict)
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg19/vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5 80142336/80134624 [==============================] - 1s 0us/step 80150528/80134624 [==============================] - 1s 0us/step
Finally, here's the code that computes the style transfer loss.
# Layers whose activations feed the style loss.
style_layer_names = [
    "block1_conv1",
    "block2_conv1",
    "block3_conv1",
    "block4_conv1",
    "block5_conv1",
]
# Layer whose activations feed the content loss.
content_layer_name = "block5_conv2"


def compute_loss(combination_image, base_image, style_reference_image):
    """Return the total style transfer loss for the generated image.

    The three images are concatenated into a single batch and run through
    ``feature_extractor``. The result is the weighted sum of the content
    loss, the style loss averaged over ``style_layer_names``, and the total
    variation loss of the generated image.

    Args:
        combination_image: The image being generated.
        base_image: The image providing the content.
        style_reference_image: The image providing the style.

    Returns:
        Scalar tensor holding the combined loss.
    """
    # Batch order fixes the indices used below:
    # 0 = base image, 1 = style reference, 2 = generated image.
    batch = tf.concat(
        [base_image, style_reference_image, combination_image], axis=0
    )
    activations = feature_extractor(batch)

    # Content term: base image vs. generated image at the content layer.
    content_maps = activations[content_layer_name]
    total = tf.zeros(shape=()) + content_weight * content_loss(
        content_maps[0, :, :, :], content_maps[2, :, :, :]
    )

    # Style term: every style layer contributes with the same weight.
    per_layer_weight = style_weight / len(style_layer_names)
    for name in style_layer_names:
        style_maps = activations[name]
        layer_term = style_loss(style_maps[1, :, :, :], style_maps[2, :, :, :])
        total = total + per_layer_weight * layer_term

    # Regularization term computed on the generated image itself.
    total = total + total_variation_weight * total_variation_loss(combination_image)
    return total
We wrap the loss and gradient computation in a `tf.function` decorator to compile it, and thus make it fast.
@tf.function
def compute_loss_and_grads(combination_image, base_image, style_reference_image):
    """Return the loss and its gradient with respect to the generated image.

    Args:
        combination_image: The image being generated; gradients are taken
            with respect to it.
        base_image: The image providing the content.
        style_reference_image: The image providing the style.

    Returns:
        A ``(loss, grads)`` tuple.
    """
    with tf.GradientTape() as recorder:
        loss_value = compute_loss(
            combination_image, base_image, style_reference_image
        )
    gradient = recorder.gradient(loss_value, combination_image)
    return loss_value, gradient
Repeatedly run vanilla gradient descent steps to minimize the loss, and save the resulting image every 100 iterations.
We decay the learning rate by a factor of 0.96 every 100 steps.
# SGD whose learning rate starts at 100 and decays exponentially.
lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=100.0, decay_steps=100, decay_rate=0.96
)
optimizer = keras.optimizers.SGD(lr_schedule)

base_image = preprocess_image(base_image_path)
style_reference_image = preprocess_image(style_reference_image_path)
# The generated image starts out as a copy of the base image and is the only
# variable being optimized.
combination_image = tf.Variable(preprocess_image(base_image_path))

iterations = 4000
for i in range(1, iterations + 1):
    loss, grads = compute_loss_and_grads(
        combination_image, base_image, style_reference_image
    )
    optimizer.apply_gradients([(grads, combination_image)])

    # Only report progress and write a snapshot every 100th iteration.
    if i % 100 != 0:
        continue
    print("Iteration %d: loss=%.2f" % (i, loss))
    img = deprocess_image(combination_image.numpy())
    fname = result_prefix + "_at_iteration_%d.png" % i
    keras.preprocessing.image.save_img(fname, img)
Iteration 100: loss=11021.84 Iteration 200: loss=8516.83 Iteration 300: loss=7572.59 Iteration 400: loss=7062.75 Iteration 500: loss=6734.12 Iteration 600: loss=6498.55 Iteration 700: loss=6319.12 Iteration 800: loss=6176.68 Iteration 900: loss=6060.16 Iteration 1000: loss=5962.53 Iteration 1100: loss=5879.59 Iteration 1200: loss=5808.41 Iteration 1300: loss=5746.57 Iteration 1400: loss=5692.11 Iteration 1500: loss=5643.77 Iteration 1600: loss=5600.53 Iteration 1700: loss=5561.75 Iteration 1800: loss=5526.84 Iteration 1900: loss=5495.23 Iteration 2000: loss=5466.59 Iteration 2100: loss=5440.56 Iteration 2200: loss=5416.80 Iteration 2300: loss=5395.01 Iteration 2400: loss=5375.02 Iteration 2500: loss=5356.57 Iteration 2600: loss=5339.50 Iteration 2700: loss=5323.70 Iteration 2800: loss=5309.09 Iteration 2900: loss=5295.48 Iteration 3000: loss=5282.80 Iteration 3100: loss=5270.98 Iteration 3200: loss=5259.91 Iteration 3300: loss=5249.54 Iteration 3400: loss=5239.84 Iteration 3500: loss=5230.77 Iteration 3600: loss=5222.23 Iteration 3700: loss=5214.22 Iteration 3800: loss=5206.67 Iteration 3900: loss=5199.56 Iteration 4000: loss=5192.88
After 4000 iterations, you get the following result:
# Show the snapshot written at the last iteration of the optimization loop.
final_image_path = result_prefix + "_at_iteration_4000.png"
display(Image(final_image_path))
Example available on HuggingFace.

| Trained Model | Demo |
| --- | --- |