Commit c9632b97 by Jonathan Juhl

### now with gui

parents 54423fbc 9876d443
This diff is collapsed.
import numpy as np
from scipy import interpolate
from scipy.signal import argrelextrema
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
import warnings
from typing import Tuple, Optional, Iterable


class KneeLocator(object):
    """Locate the point of maximum curvature ("knee"/"elbow") of a curve.

    The curve is smoothed, normalized to the unit square, transformed to a
    concave/increasing shape, and the knee is read off the difference curve
    ``y_difference = y_normalized - x_normalized`` (Kneedle-style approach).
    Once instantiated, the knee is accessible via the ``.knee`` attribute
    (or ``.elbow`` for the elbow-flavored aliases at the bottom).
    """

    def __init__(
        self,
        x: Iterable[float],
        y: Iterable[float],
        S: float = 1.0,
        curve: str = "concave",
        direction: str = "increasing",
        interp_method: str = "interp1d",
        online: bool = False,
    ):
        """
        Once instantiated, this class attempts to find the point of maximum
        curvature on a line. The knee is accessible via the `.knee` attribute.

        :param x: x values.
        :param y: y values.
        :param S: Sensitivity, original paper suggests default of 1.0
        :param curve: If 'concave', algorithm will detect knees. If 'convex', it
            will detect elbows.
        :param direction: one of {"increasing", "decreasing"}
        :param interp_method: one of {"interp1d", "polynomial"}
        :param online: Will correct old knee points if True, will return first
            knee if False
        """
        # Step 0: Raw Input
        self.x = np.array(x)
        self.y = np.array(y)
        self.curve = curve
        self.direction = direction
        self.N = len(self.x)
        self.S = S
        # Running collections of every knee found while traversing the curve
        # (only more than one entry in online mode).
        self.all_knees = set()
        self.all_norm_knees = set()
        self.all_knees_y = []
        self.all_norm_knees_y = []
        self.online = online

        # Step 1: fit a smooth line
        if interp_method == "interp1d":
            # interp1d evaluated at the original x simply yields y; Ds_y is the
            # smoothed signal the rest of the algorithm works on.
            uspline = interpolate.interp1d(self.x, self.y)
            self.Ds_y = uspline(self.x)
        elif interp_method == "polynomial":
            # Degree-7 polynomial least-squares fit.
            pn_model = PolynomialFeatures(7)
            xpn = pn_model.fit_transform(self.x.reshape(-1, 1))
            regr_model = LinearRegression()
            regr_model.fit(xpn, self.y)
            self.Ds_y = regr_model.predict(
                pn_model.fit_transform(self.x.reshape(-1, 1))
            )
        else:
            # Unknown method: warn and bail out of __init__ (instance is left
            # partially initialized; `.knee` will not exist).
            warnings.warn(
                "{} is an invalid interp_method parameter, use either 'interp1d' or 'polynomial'".format(
                    interp_method
                )
            )
            return

        # Step 2: normalize values into the unit square
        self.x_normalized = self.__normalize(self.x)
        self.y_normalized = self.__normalize(self.Ds_y)

        # Step 3: Calculate the Difference curve
        # First map every curve/direction combination onto concave/increasing.
        self.x_normalized, self.y_normalized = self.transform_xy(
            self.x_normalized, self.y_normalized, self.direction, self.curve
        )
        # normalized difference curve
        self.y_difference = self.y_normalized - self.x_normalized
        self.x_difference = self.x_normalized.copy()

        # Step 4: Identify local maxima/minima of the difference curve
        # local maxima (>= comparison keeps plateau points)
        self.maxima_indices = argrelextrema(self.y_difference, np.greater_equal)[0]
        self.x_difference_maxima = self.x_difference[self.maxima_indices]
        self.y_difference_maxima = self.y_difference[self.maxima_indices]

        # local minima
        self.minima_indices = argrelextrema(self.y_difference, np.less_equal)[0]
        self.x_difference_minima = self.x_difference[self.minima_indices]
        self.y_difference_minima = self.y_difference[self.minima_indices]

        # Step 5: Calculate thresholds — each local maximum gets a threshold
        # S * mean spacing below it (sensitivity parameter from the paper).
        self.Tmx = self.y_difference_maxima - (
            self.S * np.abs(np.diff(self.x_normalized).mean())
        )

        # Step 6: find knee
        self.knee, self.norm_knee = self.find_knee()

        # Step 7: If we have a knee, extract data about it
        self.knee_y = self.norm_knee_y = None
        # NOTE(review): a legitimate knee at x == 0 is falsy and would be
        # skipped by this truthiness test — confirm `is not None` was intended.
        if self.knee:
            self.knee_y = self.y[self.x == self.knee][0]
            self.norm_knee_y = self.y_normalized[self.x_normalized == self.norm_knee][0]

    @staticmethod
    def __normalize(a: Iterable[float]) -> Iterable[float]:
        """Min-max normalize an array to the range [0, 1].

        :param a: The array to normalize
        """
        return (a - min(a)) / (max(a) - min(a))

    @staticmethod
    def transform_xy(
        x: Iterable[float], y: Iterable[float], direction: str, curve: str
    ) -> Tuple[Iterable[float], Iterable[float]]:
        """Transform x and y to concave, increasing based on the given
        direction and curve, so the knee-finding logic only has to handle
        one canonical shape."""
        # convert elbows to knees
        if curve == "convex":
            x = x.max() - x
            y = y.max() - y
        # flip decreasing functions to increasing
        if direction == "decreasing":
            y = np.flip(y, axis=0)

        if curve == "convex":
            x = np.flip(x, axis=0)
            y = np.flip(y, axis=0)

        return x, y

    def find_knee(self,):
        """Find and return ``(knee, norm_knee)``.

        Traverses the difference curve; a knee is declared at the most recent
        local maximum once the curve drops below that maximum's threshold
        before reaching the next maximum. Returns ``(None, None)`` when no
        local maxima exist or no drop below threshold is observed.
        """
        if not self.maxima_indices.size:
            warnings.warn(
                "No local maxima found in the difference curve\n"
                "The line is probably not polynomial, try plotting\n"
                "the difference curve with plt.plot(knee.x_difference, knee.y_difference)\n"
                "Also check that you aren't mistakenly setting the curve argument",
                RuntimeWarning,
            )
            return None, None

        # placeholder for which threshold region i is located in.
        maxima_threshold_index = 0
        minima_threshold_index = 0
        # traverse the difference curve
        # NOTE(review): `threshold` / `threshold_index` are only assigned once
        # a maximum has been passed; the `i < maxima_indices[0]` guard below is
        # what protects the comparison from an UnboundLocalError.
        for i, x in enumerate(self.x_difference):
            # skip points on the curve before the first local maximum
            if i < self.maxima_indices[0]:
                continue

            j = i + 1

            # reached the end of the curve (x is normalized to [0, 1])
            if x == 1.0:
                break

            # if we're at a local max, pick up its threshold and remember where
            # it is — this is the knee candidate.
            if (self.maxima_indices == i).any():
                threshold = self.Tmx[maxima_threshold_index]
                threshold_index = i
                maxima_threshold_index += 1

            # values in difference curve are at or after a local minimum:
            # reset the threshold so a knee is not declared here.
            if (self.minima_indices == i).any():
                threshold = 0.0
                minima_threshold_index += 1

            # curve dropped below the candidate's threshold -> knee found.
            if self.y_difference[j] < threshold:
                # Map the index back through the transform_xy flips to the
                # original data's coordinates.
                if self.curve == "convex":
                    if self.direction == "decreasing":
                        knee = self.x[threshold_index]
                        norm_knee = self.x_normalized[threshold_index]
                    else:
                        knee = self.x[-(threshold_index + 1)]
                        norm_knee = self.x_normalized[-(threshold_index + 1)]

                elif self.curve == "concave":
                    if self.direction == "decreasing":
                        knee = self.x[-(threshold_index + 1)]
                        norm_knee = self.x_normalized[-(threshold_index + 1)]
                    else:
                        knee = self.x[threshold_index]
                        norm_knee = self.x_normalized[threshold_index]

                # add the y value at the knee
                y_at_knee = self.y[self.x == knee][0]
                y_norm_at_knee = self.y_normalized[self.x_normalized == norm_knee][0]
                if knee not in self.all_knees:
                    self.all_knees_y.append(y_at_knee)
                    self.all_norm_knees_y.append(y_norm_at_knee)

                # now add the knee
                self.all_knees.add(knee)
                self.all_norm_knees.add(norm_knee)

                # if detecting in offline mode, return the first knee found
                if self.online is False:
                    return knee, norm_knee

        if self.all_knees == set():
            warnings.warn("No knee/elbow found")
            return None, None

        return knee, norm_knee

    def plot_knee_normalized(self, figsize: Optional[Tuple[int, int]] = None):
        """Plot the normalized curve, the difference curve and the knee, if
        it exists.

        :param figsize: Optional[Tuple[int, int]]
            The figure size of the plot. Example (12, 8)
        :return: NoReturn
        """
        # Imported lazily so matplotlib is only required when plotting.
        import matplotlib.pyplot as plt

        if figsize is None:
            figsize = (6, 6)

        plt.figure(figsize=figsize)
        plt.title("Normalized Knee Point")
        plt.plot(self.x_normalized, self.y_normalized, "b", label="normalized curve")
        plt.plot(self.x_difference, self.y_difference, "r", label="difference curve")
        plt.xticks(
            np.arange(self.x_normalized.min(), self.x_normalized.max() + 0.1, 0.1)
        )
        plt.yticks(
            np.arange(self.y_difference.min(), self.y_normalized.max() + 0.1, 0.1)
        )
        plt.vlines(
            self.norm_knee,
            plt.ylim()[0],
            plt.ylim()[1],
            linestyles="--",
            label="knee/elbow",
        )
        plt.legend(loc="best")

    def plot_knee(self, figsize: Optional[Tuple[int, int]] = None):
        """Plot the raw curve and the knee, if it exists.

        :param figsize: Optional[Tuple[int, int]]
            The figure size of the plot. Example (12, 8)
        :return: NoReturn
        """
        import matplotlib.pyplot as plt

        if figsize is None:
            figsize = (6, 6)

        plt.figure(figsize=figsize)
        plt.title("Knee Point")
        plt.plot(self.x, self.y, "b", label="data")
        plt.vlines(
            self.knee, plt.ylim()[0], plt.ylim()[1], linestyles="--", label="knee/elbow"
        )
        plt.legend(loc="best")

    # Niceties for users working with elbows rather than knees
    @property
    def elbow(self):
        return self.knee

    @property
    def norm_elbow(self):
        return self.norm_knee

    @property
    def elbow_y(self):
        return self.knee_y

    @property
    def norm_elbow_y(self):
        return self.norm_knee_y

    @property
    def all_elbows(self):
        return self.all_knees

    @property
    def all_norm_elbows(self):
        return self.all_norm_knees

    @property
    def all_elbows_y(self):
        return self.all_knees_y

    @property
    def all_norm_elbows_y(self):
        return self.all_norm_knees_y
 ... ... @@ -32,35 +32,59 @@ _`ln -s ../sortem/sortem bin/.`_ The algorithm works the following way: 1) The algorithm computes transformation invariant keypoints represented as a binary vector of [-1,1] , meaning the feature vector contains the same information for all projections from the same molecule(https://ieeexplore.ieee.org/abstract/document/9169844). 1) The algorithm encodes an image into a feature vector. 2) The vector expresses key characteristics of the protein, each pixel of the computed 16 x 16 image is weighted by a value between [0,max]. 4 areas of the protein are extracted and used for training of the neural network, improving the classification. () 2) the generator utilizes the feature vector to recreate the 3D model and reproject it onto the 2D plane and apply a known CTF and a trainable Noise. 3) Each protein component is represented as a binarized vector which is concatenated with the other part , partial and full image vectors, improving the overall accuracy(https://arxiv.org/pdf/1902.09941.pdf). 3) The discriminator performs a featurewise loss to check how real the reproduced model is compared to the data. We then cluster the feature vectors. Running SortEM parser.add_argument('--gpu_id',type=int, default= 0, help='GPU ID. The ID of the GPU to execute the operations on. ') parser.add_argument('--num_cpus',type=int,default = 8,help='The maximum allowed cpus to use for preprocessing data (image resize and normalization') parser.add_argument('--star', type=str, nargs='+', help='list of path to the star files, wild cards are accepted. 
The star file must refer to the .mrc files') parser.add_argument('--batch_size', type=int,default=[100,75,50,20,10], nargs='+', help='deep learning model training batch size for each image scale') --num_gpus how many gpus to use( only tested on single gpu, can run on multi gpu) --gpu_list list of strings of specific gpus to use if not using slurm queue ,write: gpu:0 gpu:1 gpu:2 (does not work for multi gpu yet) --num_cpus integer, how many CPUs to use to preprocess the images obtained from the mrc files. --float16 , write True to use half precision, works well on volta series and higher, increases training speed up to 2.5 times. --star list of star files, can contain wild cards --ab The batch size to train with on a single gpu. --o The output directory (defaults ./results) --mp The max particles to use per training epoch. --epochs The number of epochs, such that the total number of training images are epochs*mp --tr Use pretrained model, this will skip step 1 and 2, and the optimization procedure in step 3 so everything is just predicted. This can predict image data within 10 min for a huge dataset. --log If the star file contains classes you can track the training with actual human classification, from Relion / cryosparc (to test to see if it's worth it) --num_classes How many parts of the protein to refine you want when you want to compare pretraining (step 1) with the number of classes in star file. - Finalize multi gpu support - Finalize transfer learning support Example of typical run , the star file is required, the --ab argument is the batch size. 
If the batch size is too big parser.add_argument('--o', type=str,default='./results', help='output directory') parser.add_argument('--f16', dest='f16',action='store_true', help='Apply Tensor core acceleration to training and inference, requires compute capability of 10.0 or higher.') parser.add_argument('--save_model', type=int,default=5,help='validation interval where models at full size are printed out.') parser.add_argument('--lr_g',type=float,default=[10**(-5),0.5*10**(-5),10**(-6),0.5*10**(-6),10**(-7),0.5*10**(-7)], nargs='+',help='The staircase learning rates of the generator') parser.add_argument('--lr_d',type=float,default=[10**(-4),0.5*10**(-4),10**(-5),0.5*10**(-5),10**(-6),0.5*10**(-6)], nargs='+',help='The staircase learning rates of the discriminator') parser.add_argument('--lr_e',type=float,default=[10**(-4),0.5*10**(-4),10**(-5),0.5*10**(-5),10**(-6),0.5*10**(-6)], nargs='+',help='The staircase learning rates of the encoder') parser.add_argument('--ctf', dest='ctf',action='store_true',default=False,help='Use CTF parameters for model.') parser.add_argument('--noise', dest='noise',action='store_true',default=False ,help='Use the noise generator to generate and scale the noise') parser.add_argument('--steps',type=int,default=[10000,10000,10000,10000,10000], nargs='+',help='how many epochs( runs through the dataset) before termination') parser.add_argument('--l_reg',type=float,default=0.01,help='the lambda regulization of the diversity score loss if the noise generator is active') parser.add_argument('--frames',type=int,default=4,help='number of models to generate from each cluster') parser.add_argument('--umap_p_size',type=int,default=100,help='The number of feature vectors to use for training Umap') parser.add_argument('--umap_t_size',type=int,default=100,help='The number of feature vectors to use for intermediate evaluation of clusters in the umap algorithm') parser.add_argument('--neighbours',type=int,default=30,help='number of neighbours in the graph 
creation algorithm') parser.add_argument('--t_res',type=int,default=None,choices=[32,64,128,256,512],help='The maximum resolution to train the model on') parser.add_argument('--minimum_size',type=int,default=500,help='the minimum size before its considered an actual cluster, anything else less is considered noise')- Example of typical run , the star file is required, and use --ctf and --noise to run on real data. python3 main.py --star /u/misser11/Sortinator/p28/*.star --ab 64 python3 main.py --star /u/misser11/Sortinator/p28/*.star --ctf --noise
 ... ... @@ -22,7 +22,8 @@ class GAN_NERF(): else: arl = np.argmin(np.abs(l-self.args['size'])) self.args['resize'] = l[arl] predict_b_size = args['batch_size'][args] predict_b_size = args['batch_size'][arl] tmp = l[arl] self.predict_steps = int(np.ceil(args['number_particles']/(predict_b_size))) self.predict_b_size = predict_b_size self.dic = dic ... ... @@ -93,11 +94,15 @@ class GAN_NERF(): next(gen_index) for i in range(int(current_step),int(tmp[index+1])): # continues where we left off print("data step %i of %i " %(i,int(c_sum[-1]))) data = next(gen_image) alpha = next(gen_index) self.trainer.single_device_train_step(data,alpha,index) output = self.trainer.single_device_train_step(data,alpha,index) print("data step %i of %i, generator loss: %f, dicriminator loss: %f, encoder loss %f. " %(i,int(c_sum[-1]), float(output['gen_loss'].numpy()), float(output['dis_loss'].numpy()), float(output['enc_loss'].numpy()))) self.trainer.vd['steps'].assign_add(1) if (int(i) % int(self.args['save_model'])) == 0: ... ...
 import numpy as np import matplotlib.pyplot as plt import matplotlib.cm as cm from matplotlib.colors import LogNorm from scipy.stats.kde import gaussian_kde a = np.random.multivariate_normal([0,0], [[ 1, 0], [0, 0.01]], check_valid='warn', tol=1e-8,size=[10000]) b = np.random.multivariate_normal([0,0], [[ 1, 0], [0, 0.01]], check_valid='warn', tol=1e-8,size=[10000]) c = np.random.multivariate_normal([0,0], [[ 1, 0], [0, 0.01]], check_valid='warn', tol=1e-8,size=[10000]) s = np.concatenate([a,b,c],axis=0) x = s[:,0] y = s[:,1] H, xedges, yedges =np.histogram2d(x, y, bins=100) plt.pcolormesh(xedges, yedges , H ,alpha=0.9)
 import matplotlib matplotlib.use('Agg') import tensorflow as tf from tensorflow.keras.layers import Conv2D, UpSampling2D,Activation,GlobalAveragePooling2D,PReLU,Conv2DTranspose,BatchNormalization,Flatten import matplotlib.pyplot as plt plt.switch_backend('agg') from mrc_loader_sortem import mrc_loader import numpy as np from os.path import join from os.path import isfile from super_clas_sortem import super_class from models import Unet class Denoise(super_class): def __init__(self,parameter_file_path,not_projected_star_files,projected_star_files,bytes_pr_record,validate=None,val_bytes=None): super_class.__init__(self ,parameter_file_path) self.not_projected_star_files = not_projected_star_files self.validate = validate self.val_bytes = val_bytes self.projected_star_files = projected_star_files self.paths = [self.not_projected_star_files,self.projected_star_files] self.bytes_pr_record = bytes_pr_record num_steps = (self.max_particles/(self.batch_size*self.num_gpus))*self.epochs self.steps = num_steps self.opt = self.optimizer(self.steps) self.unet = Unet() self.train() @tf.function def mask(self,image): image = tf.squeeze(image) x = tf.range(64) s = tf.random.shuffle(x) _,y = tf.meshgrid(x,x) rs = tf.reduce_sum(tf.cast(tf.equal(y,tf.roll(s,int(64/2),axis=0)),self.precision)*image,axis=2) selected_pixels = tf.cast(tf.equal(y,s),self.precision) not_selected_pixels = tf.cast(tf.not_equal(y,s),self.precision) m_image = tf.transpose(tf.transpose(tf.stack([selected_pixels]*self.batch_size,axis=0),perm=[1,0,2])*rs,perm=[1,0,2]) return m_image+not_selected_pixels*image,selected_pixels @tf.function def loss(self,image,estimate,mask): a = Flatten()(image) b = Flatten()(estimate) c = tf.reshape(mask,[-1]) mean = tf.reduce_mean(((a-b)*c)**2) return mean def plotlib(self,image,raw_image): s = np.concatenate(np.split(image,image.shape[0]),axis=1) t = np.concatenate(np.split(raw_image,raw_image.shape[0]),axis=1) plt.imshow(np.squeeze(np.concatenate([s,t],axis=2)),cmap='gray') 
plt.savefig(join(self.results,'image_signal.png'),) @tf.function def predict_net_low(self,raw_data): stage3_img = self.unet(raw_data) return stage3_img @tf.function def train_net_L(self,raw_data_image): raw_data_image = tf.cast(raw_data_image,self.precision) swaped_pixels,mask = self.mask(raw_data_image) #plt.imshow(swaped_pixels[0]) #plt.savefig('test.png');exit() swaped_pixels = tf.expand_dims(swaped_pixels,axis=-1) with tf.GradientTape() as tape: estimate= self.unet(swaped_pixels) loss= self.loss(raw_data_image,estimate,mask) variables = self.unet.trainable_weights self.apply_grad(loss,variables,tape) return loss def train(self): strategy,distribute = self.generator('contrastive',self.validate,self.val_bytes,self.batch_size) dis = iter(distribute) if self.validate != None: strategy_validate,distribute_val = self.generator('predict',self.validate,self.val_bytes,self.predict_batch_size) dis_val = iter(distribute_val) pred_data = next(dis_val) if self.verbose: pred_data,y = pred_data if not isfile(join(self.models,'unet.index')): ite = 1 while True: raw_data_image,perm = next(dis) loss = strategy.run(self.train_net_L,args=(perm,)) if ite % self.validate_interval == 0 and self.validate: validation_images = strategy_validate.run(self.predict_net_low,args=(pred_data,)) self.plotlib(validation_images,pred_data) self.unet.save_weights(join(self.models,'unet')) print("step:%i of %i" %(ite,self.steps),loss.numpy()) ite +=1 if self.steps > ite: break \ No newline at end of file
 ... ... @@ -14,7 +14,7 @@ def main(): parser.add_argument('--star', type=str, nargs='+', help='list of path to the star files, wild cards are accepted. The star file must refer to the .mrc files') parser.add_argument('--batch_size', type=int,default=[100,75,50,20,10], nargs='+', parser.add_argument('--batch_size', type=int,default=[50,25,15,5,2], nargs='+', help='deep learning model training batch size for each image scale') parser.add_argument('--o', type=str,default='./results', ... ... @@ -23,13 +23,13 @@ def main(): parser.add_argument('--f16', dest='f16',action='store_true', help='Apply Tensor core acceleration to training and inference, requires compute capability of 10.0 or higher.') parser.add_argument('--save_model', type=int,default=500,help='validation interval where models at full size are printed out.') parser.add_argument('--save_model', type=int,default=200,help='validation interval where models at full size are printed out.') parser.add_argument('--lr_g',type=float,default=[10**(-5),0.5*10**(-5),10**(-6),0.5*10**(-6),10**(-7),0.5*10**(-7)], nargs='+',help='The staircase learning rates of the generator') parser.add_argument('--lr_g',type=float,default=[10**(-5),0.5*10**(-5),10**(-6),0.5*10**(-6),10**(-7)], nargs='+',help='The staircase learning rates of the generator') parser.add_argument('--lr_d',type=float,default=[10**(-4),0.5*10**(-4),10**(-5),0.5*10**(-5),10**(-6),0.5*10**(-6)], nargs='+',help='The staircase learning rates of the discriminator') parser.add_argument('--lr_d',type=float,default=[10**(-4),0.5*10**(-4),10**(-5),0.5*10**(-5),10**(-6)], nargs='+',help='The staircase learning rates of the discriminator') parser.add_argument('--lr_e',type=float,default=[10**(-4),0.5*10**(-4),10**(-5),0.5*10**(-5),10**(-6),0.5*10**(-6)], nargs='+',help='The staircase learning rates of the encoder') parser.add_argument('--lr_e',type=float,default=[10**(-4),0.5*10**(-4),10**(-5),0.5*10**(-5),10**(-6)], nargs='+',help='The staircase learning rates of the 
encoder') parser.add_argument('--ctf', dest='ctf',action='store_true',default=False,help='Use CTF parameters for model.') ... ...
 import numpy as np nmi = normalized_mutual_info_score ari = adjusted_rand_score def cos_grad(grad1, grad2): grad1_list = [] grad2_list = [] for i in range(len(grad1)): grad1_list.append(grad1[i][0].flatten()) grad2_list.append(grad2[i][0].flatten()) grad1_vector = np.concatenate(grad1_list) grad2_vector = np.concatenate(grad2_list) return np.matmul(grad1_vector, grad2_vector) / ((np.linalg.norm(grad1_vector)) * (np.linalg.norm(grad2_vector))) def acc(y_true, y_pred):