Commit 48cc3c3f authored by Arian Bakhtiarnia

Add Thumos feature extraction

parent ff884e6e
@@ -11,4 +11,4 @@ This code was developed and executed using Jupyter notebooks. The following instructions
5. <kbd>CTRL</kbd>+<kbd>P</kbd> then <kbd>CTRL</kbd>+<kbd>Q</kbd> to detach from the container without terminating the execution.
6. Paste the copied URL in your browser to open Jupyter (if you are running the docker container on a remote server, you need to replace the IP address with that of the server).
7. `git clone https://github.com/LukasHedegaard/continual-transformers`
- 8. Move `continual_transformer_audio_classification.ipynb` inside the `notebooks` directory of the above repository and run it.
+ 8. Move the `.ipynb` files inside the `notebooks` directory of the above repository and run them.
%% Cell type:code id: tags:
``` python
# Pin TensorFlow to the selected GPU(s) and enable memory growth.
SELECTED_GPUS = [7]
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(str(gpu_number) for gpu_number in SELECTED_GPUS)

import tensorflow as tf
tf.get_logger().setLevel('INFO')
assert len(tf.config.list_physical_devices('GPU')) > 0
GPUS = tf.config.experimental.list_physical_devices('GPU')
for gpu in GPUS:
    tf.config.experimental.set_memory_growth(gpu, True)

# Run from the repository root so relative paths resolve.
ROOT_DIR = '..'
os.chdir(ROOT_DIR)

import librosa
import matplotlib.pyplot as plt
import numpy as np
import pickle
import sys
import wave
from preprocess_sound import preprocess_sound
from subprocess import check_output

VIDEO_DIR = os.path.join('thumos14', 'data')
FEATURES_DIR = os.path.join('thumos14', 'featuresv2')

def ensure_dir(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)

ensure_dir(FEATURES_DIR)

TMP_WAV_PATH = 'tmp.wav'
FPS = 5                   # feature vectors extracted per second of audio
AUDIO_SIZE = (96, 64, 1)  # VGGish log-mel spectrogram input shape
```
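%% Cell type:markdown id: tags:

`preprocess_sound` comes from the Keras VGGish port and is assumed here to return log-mel examples of shape `(num_examples, 96, 64)`; `np.moveaxis` then turns the single example produced by a one-second clip into `(96, 64, 1)`, matching `AUDIO_SIZE`. A minimal sketch of that assumed shape convention:

%% Cell type:code id: tags:

``` python
# Minimal sketch of the assumed preprocess_sound shape convention.
sr = 16000                             # assumed sampling rate
clip = np.zeros(sr, dtype=np.float32)  # one second of silence
spec = preprocess_sound(clip, sr)      # assumed shape: (1, 96, 64)
spec = np.moveaxis(spec, 0, -1)        # -> (96, 64, 1) == AUDIO_SIZE
print(spec.shape)
```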
%% Cell type:code id: tags:
``` python
def VGGish(load_weights=True, weights='audioset',
           input_tensor=None, input_shape=AUDIO_SIZE,
           out_dim=128, include_top=True, pooling='avg'):
    """VGGish audio CNN, optionally initialized with AudioSet weights."""
    if weights not in {'audioset', None}:
        raise ValueError('The `weights` argument should be either '
                         '`None` (random initialization) or `audioset` '
                         '(pre-training on AudioSet).')
    if input_tensor is None:
        aud_input = tf.keras.layers.Input(shape=input_shape, name='input_1')
    else:
        aud_input = input_tensor

    # Convolutional trunk.
    x = tf.keras.layers.Conv2D(64, (3, 3), strides=(1, 1), activation='relu', padding='same', name='conv1')(aud_input)
    x = tf.keras.layers.MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool1')(x)
    x = tf.keras.layers.Conv2D(128, (3, 3), strides=(1, 1), activation='relu', padding='same', name='conv2')(x)
    x = tf.keras.layers.MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool2')(x)
    x = tf.keras.layers.Conv2D(256, (3, 3), strides=(1, 1), activation='relu', padding='same', name='conv3/conv3_1')(x)
    x = tf.keras.layers.Conv2D(256, (3, 3), strides=(1, 1), activation='relu', padding='same', name='conv3/conv3_2')(x)
    x = tf.keras.layers.MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool3')(x)
    x = tf.keras.layers.Conv2D(512, (3, 3), strides=(1, 1), activation='relu', padding='same', name='conv4/conv4_1')(x)
    x = tf.keras.layers.Conv2D(512, (3, 3), strides=(1, 1), activation='relu', padding='same', name='conv4/conv4_2')(x)
    x = tf.keras.layers.MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool4')(x)

    if include_top:
        # Fully connected head ending in the out_dim-dimensional embedding.
        x = tf.keras.layers.Flatten(name='flatten_')(x)
        x = tf.keras.layers.Dense(4096, activation='relu', name='vggish_fc1/fc1_1')(x)
        x = tf.keras.layers.Dense(4096, activation='relu', name='vggish_fc1/fc1_2')(x)
        x = tf.keras.layers.Dense(out_dim, activation='relu', name='vggish_fc2')(x)
    else:
        if pooling == 'avg':
            x = tf.keras.layers.GlobalAveragePooling2D()(x)
        elif pooling == 'max':
            x = tf.keras.layers.GlobalMaxPooling2D()(x)

    model = tf.keras.models.Model(aud_input, x, name='VGGish')
    if load_weights:
        if weights == 'audioset':
            if include_top:
                model.load_weights('vggish_audioset_weights.h5')
            else:
                model.load_weights('vggish_audioset_weights_without_fc2.h5')
        else:
            print('failed to load weights')
    return model
```
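%% Cell type:markdown id: tags:

As a quick sanity check (a sketch; it deliberately skips the `.h5` weight files), the full network should end in the 128-dimensional `vggish_fc2` embedding:

%% Cell type:code id: tags:

``` python
# Sketch: shape-only check that skips loading the AudioSet weights.
sanity_model = VGGish(load_weights=False)
print(sanity_model.output_shape)  # expected: (None, 128)
```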
%% Cell type:code id: tags:
``` python
def extract_features():
    # VGGish with the classification head; features are taken from the
    # penultimate layer (vggish_fc1/fc1_2, 4096-d).
    backbone_model = VGGish(
        include_top=True,
        load_weights=True,
        input_shape=AUDIO_SIZE
    )
    model = tf.keras.models.Model(
        inputs=backbone_model.get_layer(index=0).input,
        outputs=backbone_model.get_layer(index=-2).output
    )
    for sub_dir in ['test', 'val']:
        print(sub_dir)
        video_sub_dir = os.path.join(VIDEO_DIR, sub_dir)
        total_files = len(
            [name for name in os.listdir(video_sub_dir) if os.path.isfile(os.path.join(video_sub_dir, name))]
        )
        features_sub_dir = os.path.join(FEATURES_DIR, sub_dir)
        ensure_dir(features_sub_dir)
        for i, file_name in enumerate(os.listdir(video_sub_dir)):
            sys.stdout.write('\r[%d/%d]' % (i + 1, total_files))
            sys.stdout.flush()
            if file_name.endswith('.mp4'):
                feature_file_path = os.path.join(features_sub_dir, file_name.replace('.mp4', '.pkl'))
                if not os.path.exists(feature_file_path):
                    # Extract the audio track into a temporary WAV file.
                    check_output([
                        'ffmpeg',
                        '-y',
                        '-i',
                        os.path.join(video_sub_dir, file_name),
                        TMP_WAV_PATH,
                    ])
                    with wave.open(TMP_WAV_PATH, 'rb') as wave_file:
                        sampling_rate = wave_file.getframerate()
                    waveform, _ = librosa.load(TMP_WAV_PATH, sr=sampling_rate)
                    features = []
                    # Slide a 1-second window over the waveform with a 1/FPS-second hop.
                    for start_index in range(0, waveform.shape[0], sampling_rate // FPS):
                        sub_waveform = waveform[start_index:start_index + sampling_rate]
                        if sub_waveform.shape[0] < sampling_rate:
                            # Zero-pad the last window to a full second.
                            zero_padding = np.zeros(sampling_rate - sub_waveform.shape[0])
                            sub_waveform = np.concatenate([sub_waveform, zero_padding])
                        spectrogram = preprocess_sound(sub_waveform, sampling_rate)
                        spectrogram = np.moveaxis(spectrogram, 0, -1)
                        output = model(np.expand_dims(spectrogram, axis=0))
                        features.append(output[0])
                    features = np.array(features)
                    with open(feature_file_path, 'wb') as feature_file:
                        pickle.dump(features, feature_file)
        print()  # newline after the progress indicator
```
%% Cell type:code id: tags:
``` python
extract_features()
```
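%% Cell type:markdown id: tags:

Each saved `.pkl` holds one 4096-dimensional penultimate-layer (`vggish_fc1/fc1_2`) vector per 1/`FPS`-second step. A minimal sketch of reading one back (the file name below is a placeholder):

%% Cell type:code id: tags:

``` python
# Sketch: load one feature file; 'some_video.pkl' is an illustrative name.
example_path = os.path.join(FEATURES_DIR, 'val', 'some_video.pkl')
with open(example_path, 'rb') as feature_file:
    example_features = pickle.load(feature_file)
print(example_features.shape)  # (num_steps, 4096)
```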