Commit c9632b97 authored by Jonathan Juhl's avatar Jonathan Juhl

now with gui

parents 54423fbc 9876d443
import numpy as np
from scipy import interpolate
from scipy.signal import argrelextrema
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
import warnings
from typing import Tuple, Optional, Iterable
class KneeLocator(object):
def __init__(
self,
x: Iterable[float],
y: Iterable[float],
S: float = 1.0,
curve: str = "concave",
direction: str = "increasing",
interp_method: str = "interp1d",
online: bool = False,
):
"""
Once instantiated, this class attempts to find the point of maximum
curvature on a line. The knee is accessible via the `.knee` attribute.
:param x: x values.
:param y: y values.
:param S: Sensitivity, original paper suggests default of 1.0
:param curve: If 'concave', algorithm will detect knees. If 'convex', it
will detect elbows.
:param direction: one of {"increasing", "decreasing"}
:param interp_method: one of {"interp1d", "polynomial"}
:param online: Will correct old knee points if True, will return first knee if False
"""
# Step 0: Raw Input
self.x = np.array(x)
self.y = np.array(y)
self.curve = curve
self.direction = direction
self.N = len(self.x)
self.S = S
self.all_knees = set()
self.all_norm_knees = set()
self.all_knees_y = []
self.all_norm_knees_y = []
self.online = online
# Step 1: fit a smooth line
if interp_method == "interp1d":
uspline = interpolate.interp1d(self.x, self.y)
self.Ds_y = uspline(self.x)
elif interp_method == "polynomial":
pn_model = PolynomialFeatures(7)
xpn = pn_model.fit_transform(self.x.reshape(-1, 1))
regr_model = LinearRegression()
regr_model.fit(xpn, self.y)
self.Ds_y = regr_model.predict(
pn_model.fit_transform(self.x.reshape(-1, 1))
)
else:
warnings.warn(
"{} is an invalid interp_method parameter, use either 'interp1d' or 'polynomial'".format(
interp_method
)
)
return
# Step 2: normalize values
self.x_normalized = self.__normalize(self.x)
self.y_normalized = self.__normalize(self.Ds_y)
# Step 3: Calculate the Difference curve
self.x_normalized, self.y_normalized = self.transform_xy(
self.x_normalized, self.y_normalized, self.direction, self.curve
)
# normalized difference curve
self.y_difference = self.y_normalized - self.x_normalized
self.x_difference = self.x_normalized.copy()
# Step 4: Identify local maxima/minima
# local maxima
self.maxima_indices = argrelextrema(self.y_difference, np.greater_equal)[0]
self.x_difference_maxima = self.x_difference[self.maxima_indices]
self.y_difference_maxima = self.y_difference[self.maxima_indices]
# local minima
self.minima_indices = argrelextrema(self.y_difference, np.less_equal)[0]
self.x_difference_minima = self.x_difference[self.minima_indices]
self.y_difference_minima = self.y_difference[self.minima_indices]
# Step 5: Calculate thresholds
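# Each local maximum of the difference curve gets a threshold that lies
# S * mean(diff(x_normalized)) below its height; if the difference curve drops
# below that threshold before the next local maximum, that maximum is reported
# as a knee (following the Kneedle algorithm).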
self.Tmx = self.y_difference_maxima - (
self.S * np.abs(np.diff(self.x_normalized).mean())
)
# Step 6: find knee
self.knee, self.norm_knee = self.find_knee()
# Step 7: If we have a knee, extract data about it
self.knee_y = self.norm_knee_y = None
if self.knee:
self.knee_y = self.y[self.x == self.knee][0]
self.norm_knee_y = self.y_normalized[self.x_normalized == self.norm_knee][0]
@staticmethod
def __normalize(a: Iterable[float]) -> Iterable[float]:
"""normalize an array
:param a: The array to normalize
"""
return (a - min(a)) / (max(a) - min(a))
@staticmethod
def transform_xy(
x: Iterable[float], y: Iterable[float], direction: str, curve: str
) -> Tuple[Iterable[float], Iterable[float]]:
"""transform x and y to concave, increasing based on given direction and curve"""
# convert elbows to knees
if curve == "convex":
x = x.max() - x
y = y.max() - y
# flip decreasing functions to increasing
if direction == "decreasing":
y = np.flip(y, axis=0)
if curve == "convex":
x = np.flip(x, axis=0)
y = np.flip(y, axis=0)
return x, y
def find_knee(self,):
"""This function finds and sets the knee value and the normalized knee value. """
if not self.maxima_indices.size:
warnings.warn(
"No local maxima found in the difference curve\n"
"The line is probably not polynomial, try plotting\n"
"the difference curve with plt.plot(knee.x_difference, knee.y_difference)\n"
"Also check that you aren't mistakenly setting the curve argument",
RuntimeWarning,
)
return None, None
# placeholder for which threshold region i is located in.
maxima_threshold_index = 0
minima_threshold_index = 0
# traverse the difference curve
for i, x in enumerate(self.x_difference):
# skip points on the curve before the first local maximum
if i < self.maxima_indices[0]:
continue
j = i + 1
# reached the end of the curve
if x == 1.0:
break
# if we're at a local max, increment the maxima threshold index and continue
if (self.maxima_indices == i).any():
threshold = self.Tmx[maxima_threshold_index]
threshold_index = i
maxima_threshold_index += 1
# values in difference curve are at or after a local minimum
if (self.minima_indices == i).any():
threshold = 0.0
minima_threshold_index += 1
if self.y_difference[j] < threshold:
if self.curve == "convex":
if self.direction == "decreasing":
knee = self.x[threshold_index]
norm_knee = self.x_normalized[threshold_index]
else:
knee = self.x[-(threshold_index + 1)]
norm_knee = self.x_normalized[-(threshold_index + 1)]
elif self.curve == "concave":
if self.direction == "decreasing":
knee = self.x[-(threshold_index + 1)]
norm_knee = self.x_normalized[-(threshold_index + 1)]
else:
knee = self.x[threshold_index]
norm_knee = self.x_normalized[threshold_index]
# add the y value at the knee
y_at_knee = self.y[self.x == knee][0]
y_norm_at_knee = self.y_normalized[self.x_normalized == norm_knee][0]
if knee not in self.all_knees:
self.all_knees_y.append(y_at_knee)
self.all_norm_knees_y.append(y_norm_at_knee)
# now add the knee
self.all_knees.add(knee)
self.all_norm_knees.add(norm_knee)
# if detecting in offline mode, return the first knee found
if self.online is False:
return knee, norm_knee
if self.all_knees == set():
warnings.warn("No knee/elbow found")
return None, None
return knee, norm_knee
def plot_knee_normalized(self, figsize: Optional[Tuple[int, int]] = None):
"""Plot the normalized curve, the difference curve (x_difference, y_normalized) and the knee, if it exists.
:param figsize: Optional[Tuple[int, int]
The figure size of the plot. Example (12, 8)
:return: NoReturn
"""
import matplotlib.pyplot as plt
if figsize is None:
figsize = (6, 6)
plt.figure(figsize=figsize)
plt.title("Normalized Knee Point")
plt.plot(self.x_normalized, self.y_normalized, "b", label="normalized curve")
plt.plot(self.x_difference, self.y_difference, "r", label="difference curve")
plt.xticks(
np.arange(self.x_normalized.min(), self.x_normalized.max() + 0.1, 0.1)
)
plt.yticks(
np.arange(self.y_difference.min(), self.y_normalized.max() + 0.1, 0.1)
)
plt.vlines(
self.norm_knee,
plt.ylim()[0],
plt.ylim()[1],
linestyles="--",
label="knee/elbow",
)
plt.legend(loc="best")
def plot_knee(self, figsize: Optional[Tuple[int, int]] = None):
"""
Plot the curve and the knee, if it exists
:param figsize: Optional[Tuple[int, int]]
The figure size of the plot. Example (12, 8)
:return: NoReturn
"""
import matplotlib.pyplot as plt
if figsize is None:
figsize = (6, 6)
plt.figure(figsize=figsize)
plt.title("Knee Point")
plt.plot(self.x, self.y, "b", label="data")
plt.vlines(
self.knee, plt.ylim()[0], plt.ylim()[1], linestyles="--", label="knee/elbow"
)
plt.legend(loc="best")
# Niceties for users working with elbows rather than knees
@property
def elbow(self):
return self.knee
@property
def norm_elbow(self):
return self.norm_knee
@property
def elbow_y(self):
return self.knee_y
@property
def norm_elbow_y(self):
return self.norm_knee_y
@property
def all_elbows(self):
return self.all_knees
@property
def all_norm_elbows(self):
return self.all_norm_knees
@property
def all_elbows_y(self):
return self.all_knees_y
@property
def all_norm_elbows_y(self):
return self.all_norm_knees_y
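# A minimal usage sketch (illustrative values only, not part of this repository):
# locate the knee of a concave, increasing curve such as y = log(x).
if __name__ == "__main__":
    demo_x = np.arange(1, 21)
    demo_y = np.log(demo_x)
    kl = KneeLocator(demo_x, demo_y, S=1.0, curve="concave", direction="increasing")
    print("knee at x =", kl.knee, "with y =", kl.knee_y)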
@@ -32,35 +32,59 @@ _`ln -s ../sortem/sortem bin/.`_
The algorithm works in the following way:
1) The algorithm computes transformation-invariant keypoints represented as a binary vector of [-1,1], meaning the feature vector contains the same information for all projections of the same molecule (https://ieeexplore.ieee.org/abstract/document/9169844).
1) The algorithm encodes an image into a feature vector.
2) The vector expresses key characteristics of the protein; each pixel of the computed 16 x 16 image is weighted by a value in [0, max]. Four areas of the protein are extracted and used for training the neural network, improving the classification.
2) The generator utilizes the feature vector to recreate the 3D model, reproject it onto the 2D plane, and apply a known CTF and trainable noise.
3) Each protein component is represented as a binarized vector, which is concatenated with the other partial and full image vectors, improving the overall accuracy (https://arxiv.org/pdf/1902.09941.pdf).
3) The discriminator performs a feature-wise loss to check how realistic the reproduced model is compared to the data. We then cluster the feature vectors (a rough sketch of this feature-wise comparison is given below).
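As an illustration of the feature-wise comparison mentioned in step 3 (a minimal sketch only; the helper name and the assumption that the discriminator exposes a list of intermediate feature maps are not taken from this repository):

import tensorflow as tf

def featurewise_loss(real_features, fake_features):
    # real_features / fake_features: lists of intermediate discriminator
    # activations for the real image and the reconstructed image
    loss = tf.constant(0.0)
    for f_real, f_fake in zip(real_features, fake_features):
        loss += tf.reduce_mean(tf.square(f_real - f_fake))
    return loss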
Running SortEM
parser.add_argument('--gpu_id',type=int, default= 0, help='GPU ID. The ID of the GPU to execute the operations on. ')
parser.add_argument('--num_cpus',type=int,default = 8,help='The maximum allowed cpus to use for preprocessing data (image resize and normalization)')
parser.add_argument('--star', type=str, nargs='+',
help='list of path to the star files, wild cards are accepted. The star file must refer to the .mrc files')
parser.add_argument('--batch_size', type=int,default=[100,75,50,20,10], nargs='+',
help='deep learning model training batch size for each image scale')
--num_gpus how many GPUs to use (only tested on a single GPU; can run on multiple GPUs)
--gpu_list list of strings naming specific GPUs to use if not using a slurm queue; write: gpu:0 gpu:1 gpu:2 (does not work for multi-GPU yet)
--num_cpus integer, how many CPUs to use to preprocess the images obtained from the mrc files.
--float16 write True to use half precision; works well on the Volta series and higher and increases training speed up to 2.5 times.
--star list of star files, can contain wildcards
--ab The batch size to train with on a single GPU.
--o The output directory (defaults to ./results)
--mp The max particles to use per training epoch.
--epochs The number of epochs, such that the total number of training images is epochs*mp
--tr Use a pretrained model; this skips steps 1 and 2 and the optimization procedure in step 3, so everything is just predicted. This can predict image data within 10 min for a huge dataset.
--log If the star file contains classes, you can track the training against the actual human classification from Relion / cryoSPARC (to test whether it is worth it)
--num_classes How many parts of the protein to refine when you want to compare pretraining (step 1) with the number of classes in the star file.
- Finalize multi gpu support
- Finalize transfer learning support
Example of a typical run: the star file is required, and the --ab argument is the batch size. If the batch size is too big, reduce it to avoid running out of GPU memory.
parser.add_argument('--o', type=str,default='./results',
help='output directory')
parser.add_argument('--f16', dest='f16',action='store_true',
help='Apply Tensor core acceleration to training and inference, requires compute capability of 7.0 (Volta) or higher.')
parser.add_argument('--save_model', type=int,default=5,help='validation interval where models at full size are printed out.')
parser.add_argument('--lr_g',type=float,default=[10**(-5),0.5*10**(-5),10**(-6),0.5*10**(-6),10**(-7),0.5*10**(-7)], nargs='+',help='The staircase learning rates of the generator')
parser.add_argument('--lr_d',type=float,default=[10**(-4),0.5*10**(-4),10**(-5),0.5*10**(-5),10**(-6),0.5*10**(-6)], nargs='+',help='The staircase learning rates of the discriminator')
parser.add_argument('--lr_e',type=float,default=[10**(-4),0.5*10**(-4),10**(-5),0.5*10**(-5),10**(-6),0.5*10**(-6)], nargs='+',help='The staircase learning rates of the encoder')
parser.add_argument('--ctf', dest='ctf',action='store_true',default=False,help='Use CTF parameters for model.')
parser.add_argument('--noise', dest='noise',action='store_true',default=False ,help='Use the noise generator to generate and scale the noise')
parser.add_argument('--steps',type=int,default=[10000,10000,10000,10000,10000], nargs='+',help='how many epochs (runs through the dataset) before termination')
parser.add_argument('--l_reg',type=float,default=0.01,help='the lambda regularization of the diversity score loss if the noise generator is active')
parser.add_argument('--frames',type=int,default=4,help='number of models to generate from each cluster')
parser.add_argument('--umap_p_size',type=int,default=100,help='The number of feature vectors to use for training Umap')
parser.add_argument('--umap_t_size',type=int,default=100,help='The number of feature vectors to use for intermediate evaluation of clusters in the umap algorithm')
parser.add_argument('--neighbours',type=int,default=30,help='number of neighbours in the graph creation algorithm')
parser.add_argument('--t_res',type=int,default=None,choices=[32,64,128,256,512],help='The maximum resolution to train the model on')
parser.add_argument('--minimum_size',type=int,default=500,help='the minimum size before it is considered an actual cluster; anything smaller is considered noise')
Example of a typical run: the star file is required; use --ctf and --noise to run on real data.
python3 main.py --star /u/misser11/Sortinator/p28/*.star --ab 64
python3 main.py --star /u/misser11/Sortinator/p28/*.star --ctf --noise
@@ -22,7 +22,8 @@ class GAN_NERF():
else:
arl = np.argmin(np.abs(l-self.args['size']))
self.args['resize'] = l[arl]
predict_b_size = args['batch_size'][args]
predict_b_size = args['batch_size'][arl]
tmp = l[arl]
self.predict_steps = int(np.ceil(args['number_particles']/(predict_b_size)))
self.predict_b_size = predict_b_size
self.dic = dic
@@ -93,11 +94,15 @@ class GAN_NERF():
next(gen_index)
for i in range(int(current_step),int(tmp[index+1])): # continues where we left off
print("data step %i of %i " %(i,int(c_sum[-1])))
data = next(gen_image)
alpha = next(gen_index)
self.trainer.single_device_train_step(data,alpha,index)
output = self.trainer.single_device_train_step(data,alpha,index)
print("data step %i of %i, generator loss: %f, dicriminator loss: %f, encoder loss %f. " %(i,int(c_sum[-1]),
float(output['gen_loss'].numpy()),
float(output['dis_loss'].numpy()),
float(output['enc_loss'].numpy())))
self.trainer.vd['steps'].assign_add(1)
if (int(i) % int(self.args['save_model'])) == 0:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.colors import LogNorm
from scipy.stats.kde import gaussian_kde
a = np.random.multivariate_normal([0,0], [[ 1, 0], [0, 0.01]], check_valid='warn', tol=1e-8,size=[10000])
b = np.random.multivariate_normal([0,0], [[ 1, 0], [0, 0.01]], check_valid='warn', tol=1e-8,size=[10000])
c = np.random.multivariate_normal([0,0], [[ 1, 0], [0, 0.01]], check_valid='warn', tol=1e-8,size=[10000])
s = np.concatenate([a,b,c],axis=0)
x = s[:,0]
y = s[:,1]
H, xedges, yedges =np.histogram2d(x, y, bins=100)
plt.pcolormesh(xedges, yedges, H.T, alpha=0.9)  # transpose: histogram2d returns H indexed (x, y), pcolormesh expects (y, x)
import matplotlib
matplotlib.use('Agg')
import tensorflow as tf
from tensorflow.keras.layers import Conv2D, UpSampling2D,Activation,GlobalAveragePooling2D,PReLU,Conv2DTranspose,BatchNormalization,Flatten
import matplotlib.pyplot as plt
plt.switch_backend('agg')
from mrc_loader_sortem import mrc_loader
import numpy as np
from os.path import join
from os.path import isfile
from super_clas_sortem import super_class
from models import Unet
class Denoise(super_class):
def __init__(self,parameter_file_path,not_projected_star_files,projected_star_files,bytes_pr_record,validate=None,val_bytes=None):
super_class.__init__(self ,parameter_file_path)
self.not_projected_star_files = not_projected_star_files
self.validate = validate
self.val_bytes = val_bytes
self.projected_star_files = projected_star_files
self.paths = [self.not_projected_star_files,self.projected_star_files]
self.bytes_pr_record = bytes_pr_record
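# Total optimizer steps: each step consumes batch_size * num_gpus particles,
# giving max_particles / (batch_size * num_gpus) steps per epoch, times epochs.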
num_steps = (self.max_particles/(self.batch_size*self.num_gpus))*self.epochs
self.steps = num_steps
self.opt = self.optimizer(self.steps)
self.unet = Unet()
self.train()
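# Note on mask() below (an interpretation of the code, stated as an assumption):
# it selects one pixel per column of the 64 x 64 image via a random permutation,
# replaces each selected pixel with another pixel value gathered from the same
# image, and returns the perturbed image together with the selection mask.
# Combined with loss(), which is evaluated only on the selected pixels, this
# amounts to a blind-spot (Noise2Void-style) self-supervised denoising scheme.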
@tf.function
def mask(self,image):
image = tf.squeeze(image)
x = tf.range(64)
s = tf.random.shuffle(x)
_,y = tf.meshgrid(x,x)
rs = tf.reduce_sum(tf.cast(tf.equal(y,tf.roll(s,int(64/2),axis=0)),self.precision)*image,axis=2)
selected_pixels = tf.cast(tf.equal(y,s),self.precision)
not_selected_pixels = tf.cast(tf.not_equal(y,s),self.precision)
m_image = tf.transpose(tf.transpose(tf.stack([selected_pixels]*self.batch_size,axis=0),perm=[1,0,2])*rs,perm=[1,0,2])
return m_image+not_selected_pixels*image,selected_pixels
@tf.function
def loss(self,image,estimate,mask):
a = Flatten()(image)
b = Flatten()(estimate)
c = tf.reshape(mask,[-1])
mean = tf.reduce_mean(((a-b)*c)**2)
return mean
def plotlib(self,image,raw_image):
s = np.concatenate(np.split(image,image.shape[0]),axis=1)
t = np.concatenate(np.split(raw_image,raw_image.shape[0]),axis=1)
plt.imshow(np.squeeze(np.concatenate([s,t],axis=2)),cmap='gray')
plt.savefig(join(self.results,'image_signal.png'),)
@tf.function
def predict_net_low(self,raw_data):
stage3_img = self.unet(raw_data)
return stage3_img
@tf.function
def train_net_L(self,raw_data_image):
raw_data_image = tf.cast(raw_data_image,self.precision)
swaped_pixels,mask = self.mask(raw_data_image)
#plt.imshow(swaped_pixels[0])
#plt.savefig('test.png');exit()
swaped_pixels = tf.expand_dims(swaped_pixels,axis=-1)
with tf.GradientTape() as tape:
estimate= self.unet(swaped_pixels)
loss= self.loss(raw_data_image,estimate,mask)
variables = self.unet.trainable_weights
self.apply_grad(loss,variables,tape)
return loss
def train(self):
strategy,distribute = self.generator('contrastive',self.validate,self.val_bytes,self.batch_size)
dis = iter(distribute)
if self.validate is not None:
strategy_validate,distribute_val = self.generator('predict',self.validate,self.val_bytes,self.predict_batch_size)
dis_val = iter(distribute_val)
pred_data = next(dis_val)
if self.verbose:
pred_data,y = pred_data
if not isfile(join(self.models,'unet.index')):
ite = 1
while True:
raw_data_image,perm = next(dis)
loss = strategy.run(self.train_net_L,args=(perm,))
if ite % self.validate_interval == 0 and self.validate:
validation_images = strategy_validate.run(self.predict_net_low,args=(pred_data,))
self.plotlib(validation_images,pred_data)
self.unet.save_weights(join(self.models,'unet'))
print("step:%i of %i" %(ite,self.steps),loss.numpy())
ite +=1
if ite > self.steps:  # stop once the scheduled number of training steps is reached
break
@@ -14,7 +14,7 @@ def main():
parser.add_argument('--star', type=str, nargs='+',
help='list of path to the star files, wild cards are accepted. The star file must refer to the .mrc files')
parser.add_argument('--batch_size', type=int,default=[100,75,50,20,10], nargs='+',
parser.add_argument('--batch_size', type=int,default=[50,25,15,5,2], nargs='+',
help='deep learning model training batch size for each image scale')
parser.add_argument('--o', type=str,default='./results',
@@ -23,13 +23,13 @@ def main():
parser.add_argument('--f16', dest='f16',action='store_true',
help='Apply Tensor core acceleration to training and inference, requires compute capability of 7.0 (Volta) or higher.')
parser.add_argument('--save_model', type=int,default=500,help='validation interval where models at full size are printed out.')
parser.add_argument('--save_model', type=int,default=200,help='validation interval where models at full size are printed out.')
parser.add_argument('--lr_g',type=float,default=[10**(-5),0.5*10**(-5),10**(-6),0.5*10**(-6),10**(-7),0.5*10**(-7)], nargs='+',help='The staircase learning rates of the generator')
parser.add_argument('--lr_g',type=float,default=[10**(-5),0.5*10**(-5),10**(-6),0.5*10**(-6),10**(-7)], nargs='+',help='The staircase learning rates of the generator')
parser.add_argument('--lr_d',type=float,default=[10**(-4),0.5*10**(-4),10**(-5),0.5*10**(-5),10**(-6),0.5*10**(-6)], nargs='+',help='The staircase learning rates of the discriminator')
parser.add_argument('--lr_d',type=float,default=[10**(-4),0.5*10**(-4),10**(-5),0.5*10**(-5),10**(-6)], nargs='+',help='The staircase learning rates of the discriminator')
parser.add_argument('--lr_e',type=float,default=[10**(-4),0.5*10**(-4),10**(-5),0.5*10**(-5),10**(-6),0.5*10**(-6)], nargs='+',help='The staircase learning rates of the encoder')
parser.add_argument('--lr_e',type=float,default=[10**(-4),0.5*10**(-4),10**(-5),0.5*10**(-5),10**(-6)], nargs='+',help='The staircase learning rates of the encoder')
parser.add_argument('--ctf', dest='ctf',action='store_true',default=False,help='Use CTF parameters for model.')
import numpy as np
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score
nmi = normalized_mutual_info_score
ari = adjusted_rand_score
def cos_grad(grad1, grad2):
grad1_list = []
grad2_list = []
for i in range(len(grad1)):
grad1_list.append(grad1[i][0].flatten())
grad2_list.append(grad2[i][0].flatten())
grad1_vector = np.concatenate(grad1_list)
grad2_vector = np.concatenate(grad2_list)
return np.matmul(grad1_vector, grad2_vector) / ((np.linalg.norm(grad1_vector)) * (np.linalg.norm(grad2_vector)))
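# Usage sketch for cos_grad (illustrative data only): each argument is a list of
# (gradient_array, variable) pairs, e.g. as returned by TF1's
# optimizer.compute_gradients. A value near 1 means the two gradients point in
# nearly the same direction:
#   g1 = [(np.ones((2, 2)), None), (np.ones(3), None)]
#   g2 = [(np.ones((2, 2)), None), (-np.ones(3), None)]
#   cos_grad(g1, g2)  # cosine similarity in [-1, 1]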
def acc(y_true, y_pred):