Commit cee88668 authored by Jonathan Juhl
Browse files

final model

parent dfde4461
This diff is collapsed.
......@@ -38,21 +38,27 @@ class control_flow:
print("no star files in directory. You must point to atleast one star file to run Sortinator.")
exit()
star_files = [join(dir_path,join(star,i)) for i in star_files]
self.args['star'] = [join(dir_path,join(star,i)) for i in star_files]
else:
self.args['star'] = [join(dir_path,i) for i in self.args['star']]
s_file = []
for i in self.args['star']:
if i[0] != '/':
i = '/'+i
if isabs(i):
s_file.append(i)
else:
s_file.append(join(dir_path,i))
self.args['star'] = s_file
if not all(isfile(i) for i in required):
depth,mrc_paths = self.get_star_file_parameters()
elif self.args['ctf'] and not all(isfile(i) for i in ctf):
depth,mrc_paths = self.get_star_file_parameters()
else:
depth = np.load(join(self.args['tmp'],'depth.npy'))
mrc_paths = np.load(join(self.args['tmp'],'names.npy'))
depth = np.load(join(self.args['tmp'],'depth.npy'))
mrc_paths = np.load(join(self.args['tmp'],'names.npy'))
length,bytes_pr_record = self.get_parameters(mrc_paths[0])
......@@ -60,15 +66,15 @@ class control_flow:
self.args['size'] = length
self.args['number_particles'] = depth
self.args['bpr'] = bytes_pr_record
if args['ctf']:
self.args['kvolts'] = tf.constant(np.load(join(self.args['tmp'],'electron_volts.npy')),tf.float32)
self.args['sphe_ab'] = tf.constant(np.load(join(self.args['tmp'],'spherical_abberation.npy')),tf.float32)
self.args['amp_contrast'] = tf.constant(np.load(join(self.args['tmp'],'amplitude_contrast.npy')),tf.float32)
GAN_NERF(self.args)
final_labels = np.load(join(self.args['results'],'labels.npy'))
self.write_star_file(star_files,final_labels)
exit()
# final_labels = np.load(join(self.args['results'],'labels.npy'))
# self.write_star_file(star_files,final_labels)
# print(self.args );exit()
def get_star_file_parameters(self):
......@@ -141,6 +147,7 @@ class control_flow:
for row in reader:
s = row.split()
if self.args['ctf']:
if len(take_that) == len(s) and counter ==2:
......@@ -149,20 +156,23 @@ class control_flow:
V = float(s[voltage])
electron_volts = (1.23*10**3)/np.sqrt(V*(V*10**(-7)*1.96+1))
counter = 0
s = row.split()
if len(header)== len(row.split()):
f+=1
current_name = row.split()[name].split('@')[1]
if len(names) != 0:
if names[-1] != current_name:
if '@' in row:
f+=1
current_name = row.split()[name].split('@')[1]
if len(names) != 0:
if names[-1] != current_name:
names.append(join(c,current_name))
else:
names.append(join(c,current_name))
else:
names.append(join(c,current_name))
if self.args['verbose']:
labels_list.append(int(s[class_num]))
if self.args['verbose']:
labels_list.append(int(s[class_num]))
if counter == 1:
......@@ -175,9 +185,10 @@ class control_flow:
ctf_params.append([float(s[phase_shift]),float(s[defocusU]),float(s[defocusV]),float(s[defocusAngle])])
current_id = row.split()[name].split('@')[0]
np.save(join(self.args['tmp'],'depth.npy'),f)
np.save(join(self.args['tmp'],'names.npy'),names)
np.save(join(self.args['tmp'],'names.npy'),np.unique(names))
if self.args['ctf']:
np.save(join(self.args['tmp'],'electron_volts.npy'),V)
np.save(join(self.args['tmp'],'spherical_abberation.npy'),abberation_d)
......
import tensorflow as tf
import numpy as np
import umap
from trainer_sortem import Trainer
from mrc_loader_sortem import mrc_loader
......@@ -7,22 +8,24 @@ class GAN_NERF():
def __init__(self,args):
self.args = args
self.predict_steps = int(np.ceil(args['number_particles']/(args['num_gpus']*args['p_batch_size'])))
steps = 0
stages_list = []
l = [32,64,128,256]
c = [args['s_1'],args['s_2'],args['s_3'],args['s_4']]
for en,i in enumerate(l):
if i <= args['size']:
steps+=c[en]
stages_list.append(en*np.ones(c[en]))
else:
stages_list.append(np.concatenate([en*np.ones(c[en]),4*np.ones(self.args['top_off'])],axis=0))
break
self.stages = np.concatenate(stages_list)
dic = {32:1,64:2,128:3,256:4,512:5}
self.predict_steps = int(np.ceil(args['number_particles']/(args['num_gpus']*args['batch_size'])))
l = np.asarray([32,64,128,256,512])
self.args['resize'] = l[np.argmin(np.abs(l-self.args['size']))]
l_list = []
alpha_list = []
im_shape_list = []
for index,steps in enumerate(args['steps'][:dic[self.args['resize']]]):
l_list += [index]*steps
alpha_list += [np.linspace(0.0,1.0,steps)]
im_shape_list += [l[index]]*steps
self.max_steps = steps
self.alphas = tf.constant(tf.concat(alpha_list,axis=0))
self.index = tf.constant(np.asarray(l_list))
self.im_shape = tf.constant(np.asarray(im_shape_list))
self.steps = self.alphas.shape[0]
gpu_list = []
if args['gpu_list'] is None:
args['gpu_list'] = []
......@@ -44,68 +47,99 @@ class GAN_NERF():
gpu_list.append('GPU:0')
else:
gpu_list.append('GPU:%i' %args['gpu_list'][0])
generator = mrc_loader(args).generate(self.alphas,self.index,self.im_shape)
generator = mrc_loader(args).generate()
predict_generator = mrc_loader(args).pred_generate()
predict_generator = mrc_loader(args).pred_generate()
if args['num_gpus'] > 1:
strategy = tf.distribute.MirroredStrategy(devices= gpu_list )
self.generator = iter(strategy.experimental_distribute_dataset( generator))
self.generator_pred = iter(strategy.experimental_distribute_dataset( predict_generator ))
self.generator = strategy.experimental_distribute_dataset( generator)
self.generator_pred = strategy.experimental_distribute_dataset( predict_generator )
else:
strategy = tf.distribute.OneDeviceStrategy(device=gpu_list[0])
self.generator = iter(strategy.experimental_distribute_dataset( generator ))
self.generator_pred = iter(strategy.experimental_distribute_dataset( predict_generator ))
self.generator = strategy.experimental_distribute_dataset( generator )
self.generator_pred = strategy.experimental_distribute_dataset( predict_generator )
args['strategy'] = strategy
args['max_steps'] = self.max_steps
self.trainer = Trainer(args)
#self.train()
self.train()
self.predict()
def train(self):
    """Run the training loop, resuming from the trainer's recorded step.

    Reads ``self.trainer.step_variable`` to find how many steps were already
    trained, discards that many batches from ``self.generator`` and then
    trains for the remaining ``self.steps - start`` steps.

    Each batch is unpacked into the ``params`` dict handed to the trainer:
    ``image``, ``ctf`` (``None`` unless ``self.args['ctf']`` is set),
    ``alpha``, ``index`` and ``shape``.

    The loop counter restarts at 0 on every call, so the ``record`` and
    ``save_model`` intervals are counted from the point of resumption.
    """
    print('Begin training: ', '-' * 60)

    # step_variable is cast once so it can be used with range() in both loops
    # (presumably a TensorFlow variable - confirm against Trainer).
    start = int(self.trainer.step_variable)

    # The dataset is stored un-iterated, so an explicit iterator is needed
    # before next() can be called on it.
    batches = iter(self.generator)

    # Consume the batches that earlier runs already used, so a resumed run
    # does not train on the same data again.
    for _ in range(start):
        next(batches)

    single_device = self.args['num_gpus'] == 1
    use_ctf = self.args['ctf']

    for i in range(self.steps - start):
        print("data step %i" % i)
        data = next(batches)

        # With CTF enabled the first element is an (image, ctf) pair.
        if use_ctf:
            image, ctf = data[0][0], data[0][1]
        else:
            image, ctf = data[0], None
        params = {'image': image,
                  'ctf': ctf,
                  'alpha': data[1],
                  'index': data[2],
                  'shape': data[3],
                  }

        if single_device:
            self.trainer.single_device_train_step(params)
        else:
            self.trainer.distributed_training_step(params)

        if (i % self.args['record']) == 0:
            # NOTE(review): the *_model_maker attributes are not assigned in
            # the part of __init__ visible here - confirm they exist.
            if single_device:
                self.single_device_model_maker.model_maker()
            else:
                self.multi_device_model_maker.model_maker()
            # NOTE(review): assumed to belong to the `record` interval rather
            # than to every step - confirm against the committed indentation.
            self.trainer.write_summaries()

        if (i % self.args['save_model']) == 0:
            self.trainer.save_checkpoint()
            self.trainer.save_best_model()
def over_cluster(self):
    """Reload the best saved model and run the trainer's over-clustering pass.

    Calls ``load_best_model`` and then ``sparse_water_sheed_algorithm`` on
    ``self.trainer``. Nothing is returned.
    """
    self.trainer.load_best_model()
    # Must go through the instance attribute: no bare name `trainer` is bound
    # inside this method, so calling it unqualified raises NameError.
    self.trainer.sparse_water_sheed_algorithm()
def predict(self):
    """Embed every particle, cluster the embedding and save the class labels.

    Steps:
      1. restore the best model through ``self.trainer.load_best_model()``;
      2. run ``self.predict_steps`` prediction batches and stack the outputs
         into one 2-D feature matrix (one row per particle);
      3. fit UMAP on the first ``self.args['umap_size']`` rows and project
         the remaining rows with that same fitted model;
      4. cluster the embedding with HDBSCAN and save the first
         ``self.args['number_particles']`` labels to
         ``<results>/class_labels.npy``.

    Nothing is returned; the labels file is the only output.
    """
    self.trainer.load_best_model()

    # NOTE(review): a previously saved results/over_cluster.npy is not read
    # here; the value loaded from it was overwritten before any use.

    single_device = self.args['num_gpus'] == 1
    # The dataset is stored un-iterated, so wrap it before calling next().
    batches = iter(self.generator_pred)

    outputs = []
    for _ in range(self.predict_steps):
        image = next(batches)
        if single_device:
            current = self.trainer.single_device_prediction_step(image)
        else:
            # NOTE(review): assumes dis_predict_step returns a single tensor
            # (already gathered across replicas) - confirm in Trainer.
            current = self.trainer.dis_predict_step(image)
        outputs.append(current.numpy())

    # Stack along the batch axis and keep a (particles, features) layout;
    # assumes every batch output has the batch dimension first - TODO confirm.
    features = np.concatenate(outputs, axis=0)
    features = features.reshape(features.shape[0], -1)

    # NOTE(review): 'umap_size' must exist in args; the CLI shown alongside
    # defines --umap_t_size / --umap_p_size - confirm which one maps here.
    split = self.args['umap_size']

    reducer = umap.UMAP(n_neighbors=self.args['neighbours'])
    embedded = [reducer.fit_transform(features[:split])]
    if features.shape[0] > split:
        # Project the rest with the fitted model so both parts share one
        # embedding space; refitting would produce unrelated coordinates.
        embedded.append(reducer.transform(features[split:]))
    information = np.concatenate(embedded, axis=0)

    # NOTE(review): `hdbscan` has to be imported at module level; no such
    # import is visible in this part of the file.
    clusterer = hdbscan.HDBSCAN(min_cluster_size=self.args['minimum_size'],
                                min_samples=10)
    labels = clusterer.fit_predict(information)

    # The last batch may be padded past the real particle count, so trim.
    np.save(join(self.args['results'], 'class_labels.npy'),
            labels[:self.args['number_particles']])
\ No newline at end of file
......@@ -14,106 +14,51 @@ def main():
parser.add_argument('--gpu_list',type=int, nargs='+',default = None,
help='List of GPU devises, if None it will run on gpus sequentially from GPU:0 and up.')
parser.add_argument('--num_cpus',type=int,default = 8,help='The maximum allowed cpus to use for preprocessing data and Kmeans clustering')
parser.add_argument('--star', type=str, nargs='+',
help='list of path to the star files, wild cards are accepted. The star file must refer to the .mrc files')
parser.add_argument('--batch_size', type=int,default=100,
help='deep learning model training batch')
parser.add_argument('--p_batch_size', type=int,default=200,
help='deep learning model training batch')
parser.add_argument('--o', type=str,default='./results',
help='output directory')
parser.add_argument('--f16', dest='f16',action='store_true',
help='Apply Tensor core acceleration to training and inference, requires compute capability of 10.0 or higher.')
parser.add_argument('--mp', type=int,default=100,
help='max amount of steps to train pr. size')
parser.add_argument('--vi', type=int,default=100,help='validation interval where statistics are printed out.')
parser.add_argument('--movie_int', type=int,default=500,help='validation interval where models at full size are printed out.')
parser.add_argument('--record', type=int,default=1000,help='validation interval where models size is printed out.')
parser.add_argument('--save_model', type=int,default=100,help='validation interval where models at full size are printed out.')
parser.add_argument('--verbose',dest='verbose', action='store_true',help='se the performance of the model by including original class labels')
parser.add_argument('--num_parts',type=int,default=4,help='Number of gaussian components to use. (This is the maximum number)')
parser.add_argument('--lr_b_g',type=float,default=10**(-4),help='The start learning rate of the generator')
parser.add_argument('--lr_g',type=float,default=10**(-4),help='The start learning rate of the generator')
parser.add_argument('--lr_b_d',type=float,default=10**(-4),help='The start learning rate of the descriminator')
parser.add_argument('--lr_d',type=float,default=10**(-4),help='The start learning rate of the descriminator')
parser.add_argument('--lr_e_g',type=float,default=10**(-4),help='The end learning rate of the generator')
parser.add_argument('--lr_e_d',type=float,default=10**(-4),help='The end learning rate of the descriminator')
parser.add_argument('--lr_e',type=float,default=10**(-5),help='The start learning rate of the encoder')
parser.add_argument('--ctf', dest='ctf',action='store_true',default=False,help='Use CTF parameters for model.')
parser.add_argument('--noise', dest='noise',action='store_true',help='Use the noise generator for model. Set true or false boolean.')
parser.add_argument('--s_1_steps',type=int,default=10,help='how many steps to generate the 32 x 32 model')
parser.add_argument('--s_2_steps',type=int,default=2,help='how many steps to generate the 64 x 64 model')
parser.add_argument('--s_3_steps',type=int,default=5,help='how many steps to generate the 128 x 128 model')
parser.add_argument('--s_4_steps',type=int,default=6,help='how many steps to generate the 256 x 256 model')
parser.add_argument('--top_off',type=int,default=6,help='steps to finish training at the speficied resolution')
parser.add_argument('--noise', dest='noise',action='store_false',default=False ,help='Use the noise generator to generate and scale the noise')
parser.add_argument('--l_reg',type=float,default=0.01,help='the lambda regulization of the diversity score loss')
parser.add_argument('--feature_size',type=int,default=128,help='the input feature size')
parser.add_argument('--over_cluster', dest='over_cluster',action='store_true',default=False,help='Use CTF parameters for model.')
parser.add_argument('--dstep',type=int,default=5,help='How many frames over each axis the protein is made in the UMAP reduction')
parser.add_argument('--seg_mode', dest='seg_mode',action='store_true',default = False,help='decomposition of the image to its individuel parts. Can be used on datasets like the ribosome')
parser.add_argument('--noise_bg', dest='noise_bg',action='store_true',default = False,help='To use a noise background estimator to mask instead of the noise generator')
parser.add_argument('--no_gen', dest='no_gen',action='store_true',default = False,help='Using a 3D volumetric model instead of the generator')
parser.add_argument('--TD_mode', dest='TD_mode',action='store_true',default = False,help='If you wish to switch to 2D classification instead')
parser.add_argument('--Only_VAE', dest='Only_VAE',action='store_true',default = False,help='To only use VAE to perform the classification')
parser.add_argument('--steps',type=int,default=[10000,100000,100000,10000,100000], nargs='+',help='how many epochs( runs through the dataset) before termination')
parser.add_argument('--imsize',type=int,default=128,choices=[128,256,512],help='the training image size. It can be 128 x 128, 256 x 256, and 512 x 512')
parser.add_argument('--l_reg',type=float,default=0.01,help='the lambda regulization of the diversity score loss if the noise generator is active')
parser.add_argument('--m_batch_size',type=int,default=25,help='the batch size to make the 3D model')
parser.add_argument('--frames',type=int,default=36,help='number of movie frames')
parser.add_argument('--frames',type=int,default=8,help='number of models to generate from each cluster')
parser.add_argument('--use_eulers',dest='use_eulers',action='store_true',help='if to use the standard euler rotation matrix instead')
parser.add_argument('--umap_p_size',type=int,default=100000,help='The UMAP size to train the umap model. It is trained on the CPU in parallel')
parser.add_argument('--no_angle', dest='no_angle',action='store_true',default = False,help='Do not use any angles to do the classifcation')
args = parser.parse_args()
if isinstance(args.Only_VAE,bool):
print("perform 2D classification is a bool", isinstance(args.Only_VAE,bool))
else:
assert print("The 2D classification is not a bool")
if isinstance(args.TD_mode,bool):
print("perform 2D classification is a bool" , isinstance(args.TD_mode,bool))
else:
assert print("The 2D classification is not a bool")
if isinstance(args.no_gen,bool):
print("background is instance of: bool", isinstance(args.no_gen,bool))
else:
assert print("The no generation is not a bool")
if isinstance(args.noise_bg,bool):
print("background is instance of: bool", isinstance(args.noise_bg,bool))
else:
assert print("The segmentation is not a bool")
if isinstance(args.seg_mode,bool):
print("segmentation is instance of: bool", isinstance(args.seg_mode,bool))
else:
assert print("The segmentation is not a bool")
parser.add_argument('--umap_t_size',type=int,default=10000,help='The UMAP size')
if isinstance(args.dstep,int) and args.dstep > 0:
print("dstep is instance of: int", isinstance(args.dstep,int), "and is: %i" %args.dstep)
else:
assert print("the dstep is not an integer or is less than 0")
parser.add_argument('--neighbours',type=int,default=30,help='number of neighbours in the graph creation algorithm')
if isinstance(args.movie_int,int) and args.movie_int > 0:
print("movie int is instance of: int", isinstance(args.movie_int,int), "and is: %i" %args.movie_int)
else:
assert print("the dstep is not an integer or is less than 0")
parser.add_argument('--minimum_size',type=int,default=500,help='the minimum size before its considered an actual cluster, anything else less is considered noise and will be discarded')
args = parser.parse_args()
if isinstance(args.num_gpus,int) and args.num_gpus > 0:
......@@ -122,7 +67,7 @@ def main():
assert print("the number of gpus is not an integer or is less than 0")
if isinstance(args.num_cpus,int) and args.num_cpus > 0:
print("num gpus is instance of: int", isinstance(args.num_cpus,int), "and is: %i" %args.num_cpus)
print("num cpus is instance of: int", isinstance(args.num_cpus,int), "and is: %i" %args.num_cpus)
else:
assert print("the number of cpus is not an integer or is less than 0")
......@@ -142,8 +87,9 @@ def main():
print("training batch is int and larger than 0",args.batch_size)
else:
assert print("training batch is not int and less than 0 ")
if isinstance(args.p_batch_size,int) and args.p_batch_size > 0:
print("prediction batch is int and larger than 0",args.p_batch_size)
if isinstance(args.m_batch_size,int) and args.m_batch_size > 0:
print("making model is int and larger than 0",args.m_batch_size)
else:
assert print("prediction batch is not int or less than 0 ")
if isinstance(args.o,str):
......@@ -157,99 +103,58 @@ def main():
assert print("half precision is not a bool")
if isinstance(args.mp,int) and args.mp > 0:
print("max steps",args.mp)
if isinstance(args.steps,list) and len(args.steps)==5:
print("The number of epochs is a list and is "+ str(args.steps))
else:
assert print("max steps is not an integer")
assert print("epochs is not an integer")
if isinstance(args.vi,int) and args.vi > 0:
print("validation interval is an integer",args.vi)
else:
assert print("validation interval is not an integer and larger than zero")
if isinstance(args.verbose,bool):
print("record NMI losses is a bool",args.verbose)
else:
assert print("record NMI losses is not a bool.")
if isinstance(args.num_parts,int) and args.num_parts > 0:
print("num clusters is a integer and is larger than 0",args.num_parts)
else:
assert print("record NMI losses is not a bool.")
if isinstance(args.lr_b_g,float) and args.lr_b_g > 0.0:
if isinstance(args.lr_g,float) and args.lr_g > 0.0:
print("the learning rate for generator is a float and larger than 0.0")
else:
assert print("the learning rate for generator in beggining is a float and larger than 0.0")
assert print("the learning rate for generator is not a float and larger than 0.0")
if isinstance(args.lr_b_d,float) and args.lr_b_d > 0.0:
print("the learning rate for discriminator in beginning is a float and larger than 0.0")
else:
assert print("the learning rate for generator in beginning is a float and larger than 0.0")
if isinstance(args.lr_e_d,float) and args.lr_e_d > 0.0:
print("the learning rate for discriminator in ending is a float and larger than 0.0")
if isinstance(args.lr_d,float) and args.lr_d > 0.0:
print("the learning rate for discriminator in beginning is not a float and larger than 0.0")
else:
assert print("the learning rate for generator in ending is a float and larger than 0.0")
if isinstance(args.lr_e_g,float) and args.lr_e_g > 0.0:
print("the learning rate for discriminator in ending is a float and larger than 0.0")
else:
assert print("the learning rate for generator in ending is a float and larger than 0.0")
assert print("the learning rate for discriminator in beginning is a float and larger than 0.0")
if isinstance(args.ctf,bool):
print("the ctf is a bool")
else:
assert print("the ctf is not a bool")
if isinstance(args.s_1_steps,int) and isinstance(args.s_2_steps,int) and isinstance(args.s_3_steps,int) and isinstance(args.s_4_steps,int) and isinstance(args.top_off,int):
print("the training steps is a integer")
if isinstance(args.frames,int) and args.frames > 0:
print("the number of frames is an int and is larger than zero")
else:
assert print("the training steps is not a integer")
assert print("it is not a integer and is not larger than 0")
if isinstance(args.minimum_size,int) and args.minimum_size > 0:
print("the mninimum size of the cluster is an integer and is larger than 0")
else:
assert print("the minimum size of the cluster is not an integer or is not larger than 0")
if isinstance(args.feature_size,int):
print("the training steps is a float")
if isinstance(args.neighbours,int) and args.neighbours > 0:
print("the nearest neighbour size is an integer and is larger than zero")
else:
assert print("the training steps is not a float")
assert print("the nearest neighbour size is not an integer or is not larger than zero")
if isinstance(args.over_cluster,bool):
print("the training steps is a float")
else:
assert print("the training steps is not a float")
if isinstance(args.umap_t_size,int) and args.umap_t_size > 0:
print("umap_size is an integer and is larger than zero")
if isinstance(args.no_angle,bool):
print("the no angle is a bool")
else:
assert print("the no angle is not a bool")
assert print(" umap_size is not an integer")
if isinstance(args.frames,int) and args.frames > 0:
print("the number of frames is an int and is larger than zero")
else:
assert print("it is not a integer and is not larger than 0")
if isinstance(args.m_batch_size,int) and args.m_batch_size > 0:
print("the batch size is an int and is larger than zero")
else:
assert print("it is not a integer and is not larger than 0")
if isinstance(args.umap_p_size,int) and args.umap_p_size > 0:
print("umap_size is an integer and is larger than zero")
if isinstance(args.use_eulers,bool):
print("use euler angles is a bool")
else:
assert print("use euler angles is not a bool")
assert print(" umap_size is not an integer")
if not isdir(args.o):
......@@ -262,28 +167,18 @@ def main():
mkdir(join(args.o,'results'))
if not isdir(join(args.o,'best_model')):
mkdir(join(args.o,'best_model'))
model_list = []
for i in range(args.num_parts):
if not isdir(join(join(args.o,'results'),'model_%i' %i)):
mkdir(join(join(args.o,'results'),'model_%i' %i))
model_list.append(join(join(args.o,'results'),'model_%i' %i))
args_dic = {'num_gpus': args.num_gpus,
'num_cpus': args.num_cpus,
'star': args.star,
'gpu_list': args.gpu_list,
'batch_size': args.batch_size,
'p_batch_size': args.p_batch_size,
'o': args.o,
'f16': args.f16,
'mp': args.mp,
'vi':args.vi,
'verbose':args.verbose,
'num_parts': args.num_parts,
'lr_b_g': args.lr_b_g,
'lr_b_d': args.lr_b_d,
'lr_e_g': args.lr_e_g,
'lr_e_d': args.lr_e_d,
'record':args.record,
'lr_g': args.lr_g,
'lr_d': args.lr_d,
'lr_e': args.lr_e,
'ctf': args.ctf,
'noise': args.noise,