# MultiExitViT
# Introduction
This code was developed and executed using Jupyter notebooks.
The following instructions assume an Ubuntu 20.04 operating system with superuser access, Nvidia GPUs with drivers already installed, and CUDA version 10.1, 11.0 or 11.2.
# Setting Up the Environment
1. [Install Docker](https://docs.docker.com/engine/install/ubuntu/)
2. [Install Nvidia Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#setting-up-nvidia-container-toolkit)
3. `sudo docker run --gpus all -it -p 8888:8888 tensorflow/tensorflow:2.3.2-gpu-jupyter` (also tested with the `tensorflow/tensorflow:2.4.1-gpu-jupyter` image)
4. Copy the URL provided in the Docker logs (including the token).
5. <kbd>CTRL</kbd>+<kbd>P</kbd> then <kbd>CTRL</kbd>+<kbd>Q</kbd> to detach from the container without terminating the execution.
6. Install SciPy inside the container: `sudo docker exec -it [container_name] bash` (you can find the container name in the output of `sudo docker ps`), then `pip install scipy==1.5` (use <kbd>CTRL</kbd>+<kbd>D</kbd> to exit the container shell when done).
7. Paste the copied URL in your browser to open Jupyter (if you are running the docker container on a remote server, you need to replace the IP address with that of the server).
8. Upload all of the `.ipynb` files in this repository.
# Running the Experiments
Note: in each of the notebooks, you can modify `SELECTED_GPUS` to specify which GPUs to use. If you only have a single GPU available, set `SELECTED_GPUS = [0]` (a short example follows the list below). Distributed training may not be supported in some notebooks.
1. Run the `train_cifar10_backbone`, `train_cifar100_backbone`, `train_fashion_mnist_backbone` and `train_disco_backbone` notebooks to train the backbones.
2. Run the `precompute_cifar_features`, `precompute_disco_features` and `precompute_fashion_mnist_features` notebooks to precompute the intermediate representations of the backbones.
3. Run the `ee` and `cw` notebooks to run the end-to-end and classifier-wise experiments, respectively. You can change the `dataset`, `head_type`, `version` and other parameters given to the `train` function.
4. Run the `calculate_flops` notebook to calculate the FLOPs, the `calculate_maes` notebook to calculate the MAEs for the DISCO dataset cases, and the `plots` notebook to draw the plots.
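For example, a minimal sketch of the single-GPU configuration and of a classifier-wise training call; the exact `train` signature is defined in the `ee` and `cw` notebooks, and the parameter values below are illustrative assumptions:

``` python
# In the first cell of a notebook, select only the first GPU.
SELECTED_GPUS = [0]

# Illustrative call inside the `cw` notebook; see the notebook itself for the
# actual `train` definition and the supported values of each parameter.
train(dataset='cifar10', head_type='mlp', version='v1')
```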
%% Cell type:code id: tags:
``` python
SELECTED_GPUS = [7]
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ','.join([str(gpu_number) for gpu_number in SELECTED_GPUS])
import tensorflow as tf
tf.get_logger().setLevel('INFO')
assert len(tf.config.list_physical_devices('GPU')) > 0
GPUS = tf.config.experimental.list_physical_devices('GPU')
for gpu in GPUS:
    tf.config.experimental.set_memory_growth(gpu, True)
DISTRIBUTED_STRATEGY = tf.distribute.MirroredStrategy(
    cross_device_ops=tf.distribute.NcclAllReduce(),
    devices=['/gpu:%d' % index for index in range(len(SELECTED_GPUS))]
)
NUM_GPUS = DISTRIBUTED_STRATEGY.num_replicas_in_sync
print('Number of devices: {}'.format(NUM_GPUS))
import math
import numpy as np
import pickle
import sys
from skimage import transform
from tensorflow.python.framework.convert_to_constants import convert_variables_to_constants_v2_as_graph
from vit_keras import vit
from vit_keras.layers import ClassToken, AddPositionEmbs, MultiHeadSelfAttention, TransformerBlock
IMAGE_SIZE = 384
PATCH_SIZE = 16
HIDDEN_DIM = 768
MLP_DIM = 3072
CHANNELS_MLP_DIM = 3072
TOKENS_MLP_DIM = 384
```
%% Cell type:code id: tags:
``` python
def get_flops(model):
    """
    from https://github.com/tensorflow/tensorflow/issues/32809#issuecomment-768977280
    """
    concrete = tf.function(lambda inputs: model(inputs))
    concrete_func = concrete.get_concrete_function(
        [tf.TensorSpec([1, *inputs.shape[1:]]) for inputs in model.inputs])
    frozen_func, graph_def = convert_variables_to_constants_v2_as_graph(concrete_func)
    with tf.Graph().as_default() as graph:
        tf.graph_util.import_graph_def(graph_def, name='')
        run_meta = tf.compat.v1.RunMetadata()
        opts = tf.compat.v1.profiler.ProfileOptionBuilder.float_operation()
        flops = tf.compat.v1.profiler.profile(graph=graph, run_meta=run_meta, cmd="op", options=opts)
        return flops.total_float_ops
```
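%% Cell type:markdown id: tags:
A quick sanity check (not part of the original experiments): calling `get_flops` on a toy Keras model to illustrate its input and output; the layer sizes below are arbitrary assumptions. The actual measurements pass the exit-branch models built by `get_model` further below.
%% Cell type:code id: tags:
``` python
# Toy model purely for illustration; the real measurements use get_flops(get_model(...)).
toy_model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(8,)),
    tf.keras.layers.Dense(4),
])
print('GFLOPs: %f' % (get_flops(toy_model) / 10 ** 9))
```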
%% Cell type:code id: tags:
``` python
# from https://github.com/leondgarse/Keras_mlp/blob/main/res_mlp.py
def channel_affine(inputs, use_bias=True, weight_init_value=1, name=''):
    ww_init = tf.keras.initializers.Constant(weight_init_value) if weight_init_value != 1 else 'ones'
    nn = tf.keras.backend.expand_dims(inputs, 1)
    nn = tf.keras.layers.DepthwiseConv2D(1, depthwise_initializer=ww_init, use_bias=use_bias, name=name + 'affine')(nn)
    return tf.keras.backend.squeeze(nn, 1)


def mlp_block(inputs, mlp_dim, activation='gelu', name=''):
    affine_inputs = channel_affine(inputs, use_bias=True, name=name + '1_')
    nn = tf.keras.layers.Permute((2, 1), name=name + 'permute_1')(affine_inputs)
    nn = tf.keras.layers.Dense(nn.shape[-1], name=name + 'dense_1')(nn)
    nn = tf.keras.layers.Permute((2, 1), name=name + 'permute_2')(nn)
    nn = channel_affine(nn, use_bias=False, name=name + '1_gamma_')
    skip_conn = tf.keras.layers.Add(name=name + 'add_1')([nn, affine_inputs])
    affine_skip = channel_affine(skip_conn, use_bias=True, name=name + '2_')
    nn = tf.keras.layers.Dense(mlp_dim, name=name + 'dense_2_1')(affine_skip)
    nn = tf.keras.layers.Activation(activation, name=name + 'gelu')(nn)
    nn = tf.keras.layers.Dense(inputs.shape[-1], name=name + 'dense_2_2')(nn)
    nn = channel_affine(nn, use_bias=False, name=name + '2_gamma_')
    nn = tf.keras.layers.Add(name=name + 'add_2')([nn, affine_skip])
    return nn
```
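%% Cell type:markdown id: tags:
A minimal shape check (not part of the original experiments) applying `mlp_block` to a toy token sequence; the token count, channel count and `mlp_dim` below are arbitrary assumptions, not the values used for the ViT backbone.
%% Cell type:code id: tags:
``` python
# ResMLP-style block on a toy (tokens, channels) input; the output shape should
# match the input shape, since the block ends with a residual addition.
toy_inputs = tf.keras.layers.Input(shape=(10, 32))
toy_outputs = mlp_block(toy_inputs, mlp_dim=64, name='demo_')
print(toy_outputs.shape)  # expected: (None, 10, 32)
```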
%% Cell type:code id: tags:
``` python
# from https://github.com/Benjamin-Etheredge/mlp-mixer-keras/blob/main/mlp_mixer_keras/mlp_mixer.py
class MlpBlock(tf.keras.layers.Layer):
    def __init__(self, dim, hidden_dim, activation=None, **kwargs):
        super(MlpBlock, self).__init__(**kwargs)
        if activation is None:
            activation = tf.keras.activations.gelu
        self.dim = dim
        self.dense1 = tf.keras.layers.Dense(hidden_dim)
        self.activation = tf.keras.layers.Activation(activation)
        self.dense2 = tf.keras.layers.Dense(dim)

    def call(self, inputs):
        x = inputs
        x = self.dense1(x)
        x = self.activation(x)
        x = self.dense2(x)
        return x

    def compute_output_shape(self, input_signature):
        return (input_signature[0], self.dim)


class MixerBlock(tf.keras.layers.Layer):
    def __init__(
        self,
        num_patches,
        channel_dim,
        token_mixer_hidden_dim,
        channel_mixer_hidden_dim=None,
        activation=None,
        **kwargs
    ):
        super(MixerBlock, self).__init__(**kwargs)
        if activation is None:
            activation = tf.keras.activations.gelu
        if channel_mixer_hidden_dim is None:
            channel_mixer_hidden_dim = token_mixer_hidden_dim
        self.norm1 = tf.keras.layers.LayerNormalization(axis=1)
        self.permute1 = tf.keras.layers.Permute((2, 1))
        self.token_mixer = MlpBlock(num_patches, token_mixer_hidden_dim, name='token_mixer')
        self.permute2 = tf.keras.layers.Permute((2, 1))
        self.norm2 = tf.keras.layers.LayerNormalization(axis=1)
        self.channel_mixer = MlpBlock(channel_dim, channel_mixer_hidden_dim, name='channel_mixer')
        self.skip_connection1 = tf.keras.layers.Add()
        self.skip_connection2 = tf.keras.layers.Add()

    def call(self, inputs):
        x = inputs
        skip_x = x
        x = self.norm1(x)
        x = self.permute1(x)
        x = self.token_mixer(x)
        x = self.permute2(x)
        x = self.skip_connection1([x, skip_x])
        skip_x = x
        x = self.norm2(x)
        x = self.channel_mixer(x)
        x = self.skip_connection2([x, skip_x])
        return x

    def compute_output_shape(self, input_shape):
        return input_shape
```
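%% Cell type:markdown id: tags:
Similarly, a minimal shape check (not part of the original experiments) for `MixerBlock` on a toy token sequence; the dimensions below are arbitrary assumptions rather than the ViT values used in the experiments.
%% Cell type:code id: tags:
``` python
# MLP-Mixer block on a toy (num_patches, channels) input; the residual structure
# means the output shape should equal the input shape.
toy_inputs = tf.keras.layers.Input(shape=(5, 16))
toy_outputs = MixerBlock(num_patches=5, channel_dim=16, token_mixer_hidden_dim=8)(toy_inputs)
print(toy_outputs.shape)  # expected: (None, 5, 16)
```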
%% Cell type:code id: tags:
``` python
def get_branch_id(branch_number):
    if branch_number == 1:
        return 'transformer_block'
    else:
        return 'transformer_block_%d' % (branch_number - 1)


def get_model(dataset_name, branch_type, branch_number):
    if dataset_name == 'disco':
        model_file_name = 'vit_cc_backbone_v2.h5'
        output_units = 1
        output_activation = None
    elif dataset_name == 'fashion_mnist':
        model_file_name = 'vit_fashion_mnist_v1.h5'
        output_units = 10
        output_activation = 'softmax'
    elif dataset_name == 'cifar10':
        model_file_name = 'vit_cifar10_v1.h5'
        output_units = 10
        output_activation = 'softmax'
    else:
        model_file_name = 'vit_cifar100_v1.h5'
        output_units = 100
        output_activation = 'softmax'
    backbone_model = tf.keras.models.load_model(model_file_name, custom_objects={
        'ClassToken': ClassToken,
        'AddPositionEmbs': AddPositionEmbs,
        'MultiHeadSelfAttention': MultiHeadSelfAttention,
        'TransformerBlock': TransformerBlock,
    })
    # freeze
    for layer in backbone_model.layers:
        layer.trainable = False
    if branch_type == 'mlp':
        y, _ = backbone_model.get_layer(get_branch_id(branch_number)).output
        y = tf.keras.layers.LayerNormalization(
            epsilon=1e-6, name="Transformer/encoder_norm"
        )(y)
        y = tf.keras.layers.Lambda(lambda v: v[:, 0], name="ExtractToken")(y)
    elif branch_type == 'vit':
        y, _ = backbone_model.get_layer(get_branch_id(branch_number)).output
        y, _ = TransformerBlock(
            num_heads=12,
            mlp_dim=3072,
            dropout=0.1,
            name=f"Transformer/encoderblock_x",
        )(y)
        y = tf.keras.layers.LayerNormalization(
            epsilon=1e-6, name="Transformer/encoder_norm"
        )(y)
        y = tf.keras.layers.Lambda(lambda v: v[:, 0], name="ExtractToken")(y)
    elif branch_type.startswith('cnn_'):
        y0, _ = backbone_model.get_layer(get_branch_id(branch_number)).output
        channels = HIDDEN_DIM
        width = height = IMAGE_SIZE // PATCH_SIZE
        y1 = tf.keras.layers.Lambda(lambda v: v[:, 1:], name='RemoveToken')(y0)
        y1 = tf.keras.layers.Reshape((width, height, channels), name='cnn_reshape')(y1)
        y2 = tf.keras.layers.Lambda(lambda v: v[:, 0], name='ExtractToken')(y0)
        y2 = tf.keras.layers.RepeatVector(width * height)(y2)
        y2 = tf.keras.layers.Reshape((width, height, channels), name='cls_reshape')(y2)
        if branch_type == 'cnn_ignore':
            y = y1
        elif branch_type == 'cnn_add':
            y = tf.keras.layers.Add()([y1, y2])
        elif branch_type == 'cnn_project':
            y = tf.keras.layers.Concatenate()([y1, y2])
        y = tf.keras.layers.Conv2D(
            filters=16,
            kernel_size=(3, 3),
            activation='elu',
            padding='same'
        )(y)
        y = tf.keras.layers.MaxPool2D(pool_size=(2, 2))(y)
        y = tf.keras.layers.Flatten()(y)
    elif branch_type == 'resmlp':
        y, _ = backbone_model.get_layer(get_branch_id(branch_number)).output
        y = mlp_block(y, mlp_dim=MLP_DIM, name='mlp_mixer')
        y = tf.keras.layers.GlobalAveragePooling1D()(y)
    elif branch_type == 'mlp_mixer':
        y, _ = backbone_model.get_layer(get_branch_id(branch_number)).output
        num_patches = (IMAGE_SIZE // PATCH_SIZE) ** 2 + 1
        y = MixerBlock(
            num_patches=num_patches,
            channel_dim=HIDDEN_DIM,
            token_mixer_hidden_dim=TOKENS_MLP_DIM,
            channel_mixer_hidden_dim=CHANNELS_MLP_DIM
        )(y)
        y = tf.keras.layers.GlobalAveragePooling1D()(y)
    else:
        raise Exception('Unknown branch type: %s' % branch_type)
    # MLP head
    initializer = tf.keras.initializers.he_normal()
    regularizer = tf.keras.regularizers.l2()
    y = tf.keras.layers.Dense(
        units=256,
        activation='elu',
        kernel_initializer=initializer,
        kernel_regularizer=regularizer
    )(y)
    y = tf.keras.layers.Dropout(0.5)(y)
    y = tf.keras.layers.Dense(
        units=256,
        activation='elu',
        kernel_initializer=initializer,
        kernel_regularizer=regularizer
    )(y)
    y = tf.keras.layers.Dropout(0.5)(y)
    y = tf.keras.layers.Dense(
        units=output_units,
        activation=output_activation,
        kernel_initializer=initializer,
        kernel_regularizer=regularizer
    )(y)
    model = tf.keras.models.Model(
        inputs=backbone_model.get_layer(index=0).input,
        outputs=y
    )
    return model
```
%% Cell type:code id: tags:
``` python
branch_types = [
    'mlp',
    'vit',
    'cnn_ignore',
    'cnn_add',
    'cnn_project',
    'resmlp',
    'mlp_mixer',
]
dataset_names = [
    'cifar10',
    'cifar100',
    'disco',
    'fashion_mnist',
]
for dataset_name in dataset_names:
    for branch_type in branch_types:
        flops = []
        for branch_number in range(1, 12):
            tf.keras.backend.clear_session()
            flops.append(get_flops(get_model(dataset_name, branch_type, branch_number)) / 10 ** 9)
        print('###', dataset_name, branch_type)
        print(flops)
```
%% Cell type:code id: tags:
``` python
SELECTED_GPUS = [7]
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ','.join([str(gpu_number) for gpu_number in SELECTED_GPUS])
import tensorflow as tf
tf.get_logger().setLevel('INFO')
assert len(tf.config.list_physical_devices('GPU')) > 0
GPUS = tf.config.experimental.list_physical_devices('GPU')
for gpu in GPUS:
    tf.config.experimental.set_memory_growth(gpu, True)
DISTRIBUTED_STRATEGY = tf.distribute.MirroredStrategy(
    cross_device_ops=tf.distribute.NcclAllReduce(),
    devices=['/gpu:%d' % index for index in range(len(SELECTED_GPUS))]
)
NUM_GPUS = DISTRIBUTED_STRATEGY.num_replicas_in_sync
print('Number of devices: {}'.format(NUM_GPUS))
import math
import numpy as np
import pickle
import random
import string
import sys
from skimage import transform
from tensorflow.python.framework.convert_to_constants import convert_variables_to_constants_v2_as_graph
from vit_keras import vit
from vit_keras.layers import ClassToken, AddPositionEmbs, MultiHeadSelfAttention, TransformerBlock
VIDEO_PATCHES = (2, 3)
```
%% Cell type:code id: tags:
``` python
# from https://github.com/Benjamin-Etheredge/mlp-mixer-keras/blob/main/mlp_mixer_keras/mlp_mixer.py
class MlpBlock(tf.keras.layers.Layer):
    def __init__(self, dim, hidden_dim, activation=None, **kwargs):
        super(MlpBlock, self).__init__(**kwargs)
        if activation is None:
            activation = tf.keras.activations.gelu
        self.dim = dim
        self.hidden_dim = hidden_dim
        self.activation = activation
        self.dense1 = tf.keras.layers.Dense(hidden_dim)
        self.activation = tf.keras.layers.Activation(activation)
        self.dense2 = tf.keras.layers.Dense(dim)

    def call(self, inputs):
        x = inputs
        x = self.dense1(x)
        x = self.activation(x)
        x = self.dense2(x)
        return x

    def compute_output_shape(self, input_signature):
        return (input_signature[0], self.dim)

    def get_config(self):
        config = super(MlpBlock, self).get_config().copy()
        config.update({
            'dim': self.dim,
            'hidden_dim': self.hidden_dim,
            'activation': self.activation,
        })
        return config


class MixerBlock(tf.keras.layers.Layer):
    def __init__(
        self,
        num_patches,
        channel_dim,
        token_mixer_hidden_dim,
        channel_mixer_hidden_dim=None,
        activation=None,
        **kwargs
    ):
        super(MixerBlock, self).__init__(**kwargs)
        if activation is None:
            activation = tf.keras.activations.gelu
        if channel_mixer_hidden_dim is None:
            channel_mixer_hidden_dim = token_mixer_hidden_dim
        self.num_patches = num_patches
        self.channel_dim = channel_dim
        self.token_mixer_hidden_dim = token_mixer_hidden_dim
        self.channel_mixer_hidden_dim = channel_mixer_hidden_dim
        self.activation = activation
        self.norm1 = tf.keras.layers.LayerNormalization(axis=1)
        self.permute1 = tf.keras.layers.Permute((2, 1))
        self.token_mixer = MlpBlock(num_patches, token_mixer_hidden_dim, name='token_mixer')
        self.permute2 = tf.keras.layers.Permute((2, 1))
        self.norm2 = tf.keras.layers.LayerNormalization(axis=1)
        self.channel_mixer = MlpBlock(channel_dim, channel_mixer_hidden_dim, name='channel_mixer')
        self.skip_connection1 = tf.keras.layers.Add()
        self.skip_connection2 = tf.keras.layers.Add()

    def get_config(self):
        config = super(MixerBlock, self).get_config().copy()
        config.update({
            'num_patches': self.num_patches,
            'channel_dim': self.channel_dim,
            'token_mixer_hidden_dim': self.token_mixer_hidden_dim,
            'channel_mixer_hidden_dim': self.channel_mixer_hidden_dim,
            'activation': self.activation,
        })
        return config

    def call(self, inputs):
        x = inputs
        skip_x = x
        x = self.norm1(x)
        x = self.permute1(x)
        x = self.token_mixer(x)
        x = self.permute2(x)
        x = self.skip_connection1([x, skip_x])
        skip_x = x
        x = self.norm2(x)
        x = self.channel_mixer(x)
        x = self.skip_connection2([x, skip_x])
        return x

    def compute_output_shape(self, input_shape):
        return input_shape
```
%% Cell type:code id: tags:
``` python
def get_branch_id(branch_number):
    if branch_number == 1:
        return 'transformer_block'
    else:
        return 'transformer_block_%d' % (branch_number - 1)


def get_model(branch_type, branch_number, version):
    backbone_model = tf.keras.models.load_model('vit_cc_backbone_v2.h5', custom_objects={
        'ClassToken': ClassToken,
        'AddPositionEmbs': AddPositionEmbs,
        'MultiHeadSelfAttention': MultiHeadSelfAttention,
        'TransformerBlock': TransformerBlock,
    })
    y, _ = backbone_model.get_layer(get_branch_id(branch_number)).output
    backend_model = tf.keras.models.Model(
        inputs=backbone_model.get_layer(index=0).input,
        outputs=y
    )
    backend_model._name = 'backend_model'
    frontend_model = tf.keras.models.load_model(
        'vit_disco_cw_%d_%s_head_precomputed_%s.h5' % (branch_number, branch_type, version),
        custom_objects={
            'ClassToken': ClassToken,
            'AddPositionEmbs': AddPositionEmbs,
            'MultiHeadSelfAttention': MultiHeadSelfAttention,
            'TransformerBlock': TransformerBlock,
            'MlpBlock': MlpBlock,
            'MixerBlock': MixerBlock,
        }
    )
    frontend_model._name = 'frontend_model'
    model = tf.keras.Sequential([
        backend_model,
        frontend_model
    ])
    return model
```
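%% Cell type:markdown id: tags:
A minimal sketch (not part of the original notebooks) of assembling one of the classifier-wise DISCO models from a trained head file; the branch number and especially the `version` string below are illustrative assumptions and must match a head you have actually trained and saved.
%% Cell type:code id: tags:
``` python
# Expects e.g. vit_disco_cw_6_mlp_head_precomputed_v1.h5 next to vit_cc_backbone_v2.h5
# (the branch number and the 'v1' version string here are assumptions).
cw_model = get_model(branch_type='mlp', branch_number=6, version='v1')
cw_model.summary()
```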
%% Cell type:code id: tags:
``` python
DISCO_PATH = 'disco'
CACHE_DIR = os.path.join(DISCO_PATH, 'vit_cache')


def horizontal_flip(image):
    return np.flip(image, axis=1)


class CCSequence(tf.keras.utils.Sequence):
    def __init__(self, split, batch_size):
        self.split = split
        self.split_len = sum([
            1 if file_name.startswith(self.split) else 0 for file_name in os.listdir(CACHE_DIR)
        ])
        self.batch_size = batch_size
        self.random_permutation = np.random.permutation(self.split_len)

    def __len__(self):
        return math.ceil(self.split_len / self.batch_size)

    def on_epoch_end(self):
        self.random_permutation = np.random.permutation(self.split_len)

    def __getitem__(self, index):
        spectrograms = []
        images = []
        density_maps = []
        if self.split == 'test':
            index_generator = range(
                index * self.batch_size,
                min((index + 1) * self.batch_size, self.split_len)
            )
        else:
            index_generator = self.random_permutation[index * self.batch_size:(index + 1) * self.batch_size]
        for random_index in index_generator:
            all_path = os.path.join(
                CACHE_DIR,
                '%s_%d.pkl' % (self.split, random_index)
            )
            with open(all_path, 'rb') as all_file:
                data = pickle.load(all_file)
            if self.split == 'train' and random.random() < 0.5:  # flip augmentation
                images.append(horizontal_flip(data['image']))
            else:
                images.append(data['image'])
            density_maps.append(np.sum(data['density_map']))