Commit 2c376be1 authored by Jonathan Juhl

corrections

parent 9ec83b9f
@@ -59,24 +59,26 @@ class control_flow:
         depth = np.load(join(self.args['tmp'],'depth.npy'))
         mrc_paths = np.load(join(self.args['tmp'],'names.npy'))
-        length,bytes_pr_record = self.get_parameters(mrc_paths[0])
+        length,bytes_pr_record,floattype = self.get_parameters(mrc_paths[0])
+        val_type = {2:tf.float32,12:tf.float16}
         self.args['mrc_paths'] = mrc_paths
         self.args['size'] = length
         self.args['number_particles'] = depth
         self.args['bpr'] = bytes_pr_record
+        self.args['type'] = val_type[floattype]
         if args['ctf']:
             self.args['kvolts'] = tf.constant(np.load(join(self.args['tmp'],'electron_volts.npy')),tf.float32)
             self.args['sphe_ab'] = tf.constant(np.load(join(self.args['tmp'],'spherical_abberation.npy')),tf.float32)
             self.args['amp_contrast'] = tf.constant(np.load(join(self.args['tmp'],'amplitude_contrast.npy')),tf.float32)
+        if not isfile(join(self.args['results'],'final_labels.npy')):
+            GAN_NERF(self.args)
+        final_labels = np.load(join(self.args['results'],'final_labels.npy'))
+        self.write_star_file(self.args['star'],final_labels)
-        GAN_NERF(self.args)
-        exit()
-        # final_labels = np.load(join(self.args['results'],'labels.npy'))
-        # self.write_star_file(star_files,final_labels)
-        # print(self.args );exit()

     def get_star_file_parameters(self):
         counter = []
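The `val_type` table introduced above dispatches on the MODE word of the MRC header: mode 2 is 32-bit float and mode 12 is IEEE 754 half precision in the MRC2014 format, which matches the two dtypes the loader supports. A minimal sketch of the same lookup with an explicit guard for unsupported integer modes (the helper name is hypothetical):

    import tensorflow as tf

    MRC_MODE_TO_TF = {2: tf.float32, 12: tf.float16}  # MRC2014 float modes

    def tf_dtype_for_mode(mode):
        # Hypothetical guard: integer modes such as 0, 1 and 6 are not handled
        # by this pipeline, so fail loudly instead of raising a bare KeyError.
        try:
            return MRC_MODE_TO_TF[int(mode)]
        except KeyError:
            raise ValueError('unsupported MRC mode %r; expected 2 or 12' % mode)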
@@ -136,14 +138,6 @@ class control_flow:
         phase_shift = header.index('_rlnPhaseShift')
         ctf_params = []
-        if self.args['verbose']:
-            try:
-                class_num = header.index('_rlnClassNumber')
-            except:
-                self.args['verbose'] = False
-                print("the --verbose true cannot be run as _rlnClassNumber is missing ")
         for row in reader:
@@ -170,8 +164,7 @@ class control_flow:
                     names.append(join(c,current_name))
                 else:
                     names.append(join(c,current_name))
-                if self.args['verbose']:
-                    labels_list.append(int(s[class_num]))
@@ -194,8 +187,7 @@ class control_flow:
         np.save(join(self.args['tmp'],'spherical_abberation.npy'),abberation_d)
         np.save(join(self.args['tmp'],'amplitude_contrast.npy'),amp_contrast_d)
         np.save(join(self.args['tmp'],'ctf_params.npy'),np.asarray(ctf_params))
-        if self.args['verbose']:
-            np.save(join(self.args['tmp'],'labels.npy'),np.asarray(labels_list))
         return f,np.unique(names)
@@ -217,27 +209,29 @@ class control_flow:
             if len(row.split()) == len(header):
                 names.append(row.split())
+        labels = labels[np.greater(labels,-1)]
         for index,i in enumerate(np.unique(labels)):
-            f = open(join( self.args['star'],'cluster_%s.star' %index), 'w')
-            with f:
-                f.write('\n')
-                f.write('data_images\n')
-                f.write('\n')
-                f.write('loop_\n ')
-                for z in header:
-                    f.write(z)
-                f.write('')
-                for lab,row in zip(labels.tolist(),names):
-                    if lab == i:
-                        f.write(' '.join(row)+'\n')
-                f.close()
+            if i > -1:
+                f = open(join(join(self.args['results'],'model_%i' %index),'cluster.star'), 'w')
+                with f:
+                    f.write('\n')
+                    f.write('data_images\n')
+                    f.write('\n')
+                    f.write('loop_\n ')
+                    for z in header:
+                        f.write(z)
+                    f.write('')
+                    for lab,row in zip(labels.tolist(),names):
+                        if lab == i:
+                            f.write(' '.join(row)+'\n')
+                    f.close()
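The rewritten loop writes one `cluster.star` per retained class under `results/model_<index>` instead of `cluster_<index>.star` next to the input star files. A compact sketch of the same output, assuming the conventional STAR layout of one column label per line inside the `loop_` block (the diff writes the labels back-to-back, which many STAR parsers will not accept):

    from os.path import join

    def write_cluster_star(out_dir, header, labels, rows, keep_label):
        # Hypothetical helper mirroring the loop above: keep only the particle
        # rows whose class label equals keep_label.
        with open(join(out_dir, 'cluster.star'), 'w') as f:
            f.write('\ndata_images\n\nloop_\n')
            for col in header:
                f.write(col + '\n')          # one _rln label per line
            for label, row in zip(labels, rows):
                if label == keep_label:
                    f.write(' '.join(row) + '\n')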
     def read(self,filename,header = 1024):
         with open(filename,'rb') as f:
@@ -246,14 +240,15 @@ class control_flow:
             NX = np.fromstring(binary_header[0:4],np.int32)
             NY = np.fromstring(binary_header[4:8],np.int32)
             NZ = np.fromstring(binary_header[8:12],np.int32)
+            floattype = np.fromstring(binary_header[12:16],np.int32)[0]
             recordsize = int((file_size-header)/(NZ[0]))
-            return NX[0],NY[0],NZ[0],recordsize
+            return NX[0],NY[0],NZ[0],recordsize,floattype

     def get_parameters(self,paths):
-        width,length,depth,record_size_new = self.read(paths)
-        return length,record_size_new
+        width,length,depth,record_size_new,floattype = self.read(paths)
+        return length,record_size_new,floattype
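`read` now also returns the MODE word, which lives in bytes 12-16 of the 1024-byte MRC header, right after NX, NY and NZ. A self-contained sketch of the same parse using `np.frombuffer` (`np.fromstring` is deprecated for binary data), assuming a little-endian MRC2014 header:

    import numpy as np

    def read_mrc_header(path, header_bytes=1024):
        # NX, NY, NZ and MODE are the first four little-endian int32 words.
        with open(path, 'rb') as f:
            head = f.read(header_bytes)
        nx, ny, nz, mode = np.frombuffer(head[:16], dtype='<i4')
        return int(nx), int(ny), int(nz), int(mode)  # mode 2 = float32, 12 = float16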
 import tensorflow as tf
 import numpy as np
 import umap
-from os.path import join
+from os.path import join,isfile, isdir
 from trainer_sortem import Trainer
 from mrc_loader_sortem import mrc_loader
+from utils_sortem import save_model_to_mrcfile,pred_umap
+from os import mkdir

 class GAN_NERF():
     def __init__(self,args):
         self.args = args
-        dic = {32:1,64:2,128:3,256:4,512:5}
-        self.predict_steps = int(np.ceil(args['number_particles']/(args['num_gpus']*args['batch_size'])))
-        self.dic = dic
+        dic = {32:0,64:1,128:2,256:3,512:4}
         l = np.asarray([32,64,128,256,512])
-        self.args['resize'] = l[np.argmin(np.abs(l-self.args['size']))]
-        l_list = []
-        alpha_list = []
-        im_shape_list = []
-        for index,steps in enumerate(args['steps'][:dic[self.args['resize']]]):
-            l_list += [index]*steps
-            alpha_list += [np.linspace(0.0,1.0,steps)]
-            im_shape_list += [l[index]]*steps
-        self.alphas = tf.constant(tf.concat(alpha_list,axis=0))
-        self.index = tf.constant(np.asarray(l_list))
-        self.im_shape = tf.constant(np.asarray(im_shape_list))
-        self.steps = self.alphas.shape[0]
-        gpu_list = []
-        if args['gpu_list'] is None:
-            args['gpu_list'] = []
-        if args['num_gpus'] > 1:
-            if len(args['gpu_list']) == args['num_gpus'] or len(args['gpu_list']) == 0:
-                if len(args['gpu_list']) == 0:
-                    for i in range(args['num_gpus']):
-                        gpu_list.append('GPU:%i' %i)
-                else:
-                    for i in args['gpu_list']:
-                        gpu_list.append('GPU:%i' %i)
-            else:
-                print("the number of gpus specified is not equal to the number of gpu names. Either leave gpu names parameter unmodified or correct it.")
-                exit()
-        else:
-            if len(args['gpu_list']) == 0:
-                gpu_list.append('GPU:0')
-            else:
-                gpu_list.append('GPU:%i' %args['gpu_list'][0])
-        generator = mrc_loader(args).generate(self.alphas,self.index,self.im_shape)
-        predict_generator = mrc_loader(args).pred_generate()
-        output_generator = mrc_loader(args).pred_generate()
-        if args['num_gpus'] > 1:
-            strategy = tf.distribute.MirroredStrategy(devices= gpu_list )
-            self.generator = strategy.experimental_distribute_dataset( generator)
-            self.generator_pred = strategy.experimental_distribute_dataset( predict_generator )
-            self.output_generator = strategy.experimental_distribute_dataset( output_generator )
-        else:
-            strategy = tf.distribute.OneDeviceStrategy(device=gpu_list[0])
-            self.generator = strategy.experimental_distribute_dataset( generator )
-            self.generator_pred = strategy.experimental_distribute_dataset( predict_generator )
-            self.output_generator = strategy.experimental_distribute_dataset( output_generator )
+        if args['t_res'] != None:
+            predict_b_size = args['batch_size'][dic[args['t_res']]]
+            self.args['resize'] = args['t_res']
+            arl = dic[args['t_res']]
+        else:
+            arl = np.argmin(np.abs(l-self.args['size']))
+            self.args['resize'] = l[arl]
+            predict_b_size = args['batch_size'][arl]
+        tmp = l[arl]
+        self.predict_steps = int(np.ceil(args['number_particles']/(predict_b_size)))
+        self.predict_b_size = predict_b_size
+        self.dic = dic
+        self.arl = arl
+        self.tmp = tmp
+        self.generator = []
+        self.generator_index = []
+        predict_generator = mrc_loader(args,self.args['resize'],predict_b_size).pred_generate()
+        strategy = tf.distribute.OneDeviceStrategy(device='GPU:%i' %args['gpu_id'])
+        for batch,i in zip(args['batch_size'],list(range(arl+1))):
+            generator,generator_index = mrc_loader(args,l[i],batch).generate(args['steps'][i])
+            self.generator.append(strategy.experimental_distribute_dataset( generator ))
+            self.generator_index.append(strategy.experimental_distribute_dataset(generator_index))
+        self.generator_pred = strategy.experimental_distribute_dataset( predict_generator )
         args['strategy'] = strategy
         self.trainer = Trainer(args)
         self.train()
         self.predict()
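With the multi-GPU branches gone, every dataset is pinned to a single device through `tf.distribute.OneDeviceStrategy`, and steps run via `strategy.run`. A minimal, runnable illustration of that pattern (CPU device chosen so the sketch runs anywhere):

    import tensorflow as tf

    strategy = tf.distribute.OneDeviceStrategy(device='CPU:0')  # 'GPU:0' in the real run
    ds = tf.data.Dataset.from_tensor_slices(tf.range(8)).batch(4)
    dist_ds = strategy.experimental_distribute_dataset(ds)

    @tf.function
    def step(batch):
        return tf.reduce_sum(batch)

    for batch in dist_ds:
        print(strategy.run(step, args=(batch,)))  # one replica, one result per batch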
+    def initialize_variables(self):
+        with self.args['strategy'].scope():
+            if self.args['noise']:
+                self.trainer.Noise['N%i' %self.dic[self.args['resize']]](tf.random.normal(shape=[1,4,4,4]),tf.ones(shape=[1,1,1]))
+            self.trainer.Encoder['E%i' %self.dic[self.args['resize']]](tf.ones(shape=[1,self.args['resize'],self.args['resize'],1]),tf.ones(shape=[1,1,1]))
+            self.trainer.Discriminator['D%i' %self.dic[self.args['resize']]](tf.ones(shape=[1,self.args['resize'],self.args['resize'],1]),tf.ones(shape=[1,1,1]))
+            x = tf.linspace(-0.5,0.5,int(self.args['resize']/2)+1)
+            y = tf.linspace(-0.5,0.5,self.args['resize'])
+            X,Y = tf.meshgrid(y,x)
+            stack = tf.cast(tf.expand_dims(tf.stack([tf.reshape(X,[-1]),tf.reshape(Y,[-1]),tf.zeros_like(tf.reshape(X,[-1]))],axis=-1),axis=1),tf.float32)
+            self.trainer.vd['Generator'](self.args['resize'],tf.random.normal([1,10]),stack)
+            self.trainer.restore_checkpoint()
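`initialize_variables` pushes dummy tensors through each sub-network inside the strategy scope so that all weights exist before `restore_checkpoint` runs; Keras-style modules create their variables lazily on the first call. The same build-before-restore pattern in miniature (toy model and checkpoint directory are assumptions):

    import tensorflow as tf

    model = tf.keras.Sequential([tf.keras.layers.Dense(4)])
    model(tf.zeros([1, 8]))                          # first call creates the variables
    ckpt = tf.train.Checkpoint(model=model)
    ckpt.restore(tf.train.latest_checkpoint('./ckpts')).expect_partial()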
     def train(self):
         print('Begin training: ', '-' * 60)
-        current_step = self.trainer.step_variable
-        gen = iter(self.generator)
+        self.initialize_variables()
+        current_step = self.trainer.vd['steps']
+        c_sum = np.cumsum(self.args['steps'])[:(self.arl+1)]
+        print("The training will start at step %i of %i" %(int(current_step.numpy()),int(c_sum[-1])))
         pred = iter(self.generator_pred)
-        for i in range(int(current_step)):
-            # this starts the data recording at where it left off
-            # this is to prevent when continuation of training the model does not use the same data
-            data = next(gen)
-        for i in range(self.steps -current_step): # continues where we left off
-            print("data step %i" %i )
-            data = next(gen)
-            if self.args['ctf']:
-                params = {'image':tf.expand_dims(tf.squeeze(data[0][0]),axis=-1),
-                          'ctf':data[0][1],
-                          'alpha':data[1],
-                          'index':data[2],
-                          }
-            else:
-                params = {'image':tf.expand_dims(tf.squeeze(data[0]),axis=-1),
-                          'ctf': None,
-                          'alpha':data[1],
-                          'index':data[2],
-                          }
-            if self.args['num_gpus'] == 1:
-                self.trainer.single_device_train_step(params)
-            else:
-                self.trainer.distributed_training_step(params)
-            if (i % self.args['record']) == 0:
-                if self.args['num_gpus'] == 1:
-                    features = []
-                    current_shape = params['image'].numpy().shape[1]
-                    for kk in range(int(np.ceil(self.args['umap_t_size']/self.args['batch_size']))):
-                        data = next(pred)
-                        features.append(self.args['strategy'].run(self.trainer.get_features,
-                            args=(data,params['alpha'],self.trainer.Encoder[int(params['index'])],current_shape)))
-                    self.trainer.write_summaries(features)
-                else:
-                    data = next(pred)
-                    features = []
-                    for kk in range(int(np.ceil(self.args['umap_t_size']/(self.num_gpus*self.args['batch_size'])))):
-                        features.append(self.args['strategy'].run(self.trainer.get_features,
-                            args=(data,params['alpha'],self.trainer.Encoder[int(params['index'])],current_shape)).reduce())
-                    self.trainer.write_summaries(features)
-            if (i % self.args['save_model']) == 0:
-                self.trainer.save_checkpoint()
+        tmp = np.concatenate([[0.0],c_sum],axis=0)
+        for index,current in enumerate(c_sum.tolist()):
+            batch_size = self.args['batch_size'][index]
+            if current-current_step >= 0:
+                gen_image = iter(self.generator[index])
+                gen_index = iter(self.generator_index[index])
+                for i in range(int(current_step)-int(tmp[index])):
+                    next(gen_index)
+                for i in range(int(current_step),int(tmp[index+1])): # continues where we left off
+                    print("data step %i of %i" %(i,int(c_sum[-1])))
+                    data = next(gen_image)
+                    alpha = next(gen_index)
+                    self.trainer.single_device_train_step(data,alpha,index)
+                    self.trainer.vd['steps'].assign_add(1)
+                    if (int(i) % int(self.args['save_model'])) == 0:
+                        self.trainer.save_checkpoint()
+                    if (int(i) % int(self.args['save_model'])) == 0:
+                        features = []
+                        for kk in range(int(np.ceil(self.args['umap_t_size']/batch_size))):
+                            data = next(gen_image)
+                            print("computing prediction vectors: %i of %i" %(kk,int(np.ceil(self.args['umap_t_size']/batch_size))))
+                            features.append(self.args['strategy'].run(self.trainer.get_features,
+                                args=(data['image'],alpha,self.trainer.Encoder['E%i' %int(index)])))
+                        self.trainer.write_summaries(features)
+                        tf.keras.backend.clear_session()
+        self.trainer.save_checkpoint()
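The resume logic hinges on two arrays: `c_sum` holds the cumulative end step of every resolution stage, and `tmp` prepends a zero so that `current_step - tmp[index]` is how far into stage `index` a restarted run already is; the skip loop fast-forwards the alpha iterator by exactly that amount. The mapping in isolation, assuming the new default of steps=[5,5,5,5,5]:

    import numpy as np

    steps = [5, 5, 5, 5, 5]
    c_sum = np.cumsum(steps)                 # stage end steps: [ 5 10 15 20 25]
    tmp = np.concatenate([[0], c_sum])       # stage start steps: [ 0  5 10 15 20 25]

    def locate(global_step):
        # Which stage a global step counter falls in, and the local offset in it.
        stage = int(np.searchsorted(c_sum, global_step, side='right'))
        return stage, global_step - int(tmp[stage])

    print(locate(12))   # (2, 2): third stage, two steps already consumed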
     def predict(self):
-        pred = iter(self.output_generator)
+        step_checker = 0
+        pred = iter(self.generator_pred)
         output_vectors = []
         if not isfile(join(self.args['results'],'final_featur_vectors.npy')):
-            if self.args['num_gpus'] > 1:
-                for kk in range(int(np.ceil(self.args['depth']/self.args['num_gpus']*self.args['batch_size']))):
-                    data = next(pred)
-                    output_vectors.append(self.args['strategy'].run(self.trainer.get_features,args=(data,params['alpha'],self.trainer.Encoder[self.dic[self.args['resize']]],current_shape)).reduce())
-            else:
-                for kk in range(int(np.ceil(self.args['depth']/self.args['batch_size']))):
-                    data = next(pred)
-                    output_vectors.append(self.args['strategy'].run(self.trainer.get_features,args=(data,params['alpha'],self.trainer.Encoder[self.dic[self.args['resize']]],current_shape)))
-            np.save(join(self.args['results'],'final_featur_vectors.npy'))
-        labels,umap_output,collect_centers = pred_umap(args,feature_vector)
+            for kk in range(int(np.ceil(self.args['number_particles']/self.predict_b_size))):
+                data = next(pred)
+                print("predicting feature vector step %i of %i" %(step_checker,int(np.ceil(self.args['number_particles']/self.predict_b_size))))
+                output_vectors.append(self.args['strategy'].run(self.trainer.get_features,
+                    args=(data,tf.constant([1.0]),self.trainer.Encoder['E%i' %self.dic[self.args['resize']]])).numpy())
+                step_checker += 1
+            np.save(join(self.args['results'],'final_featur_vectors.npy'),np.concatenate(output_vectors,axis=0))
+        feature_vector = np.load(join(self.args['results'],'final_featur_vectors.npy'))
+        print("calculating clusters")
         if not isfile(join(self.args['results'],'final_labels.npy')):
-            np.save(join(self.args['results'],'final_labels.npy'))
-            np.save(join(self.args['results'],'final_umap_output.npy'))
-            np.save(join(self.args['results'],'final_collect_centers.npy'))
+            labels,umap_output,collect_centers = pred_umap(self.args,feature_vector)
+            np.save(join(self.args['results'],'final_labels.npy'),labels)
+            np.save(join(self.args['results'],'final_umap_output.npy'),umap_output)
+            np.save(join(self.args['results'],'final_collect_centers.npy'),collect_centers)
+        labels = np.load(join(self.args['results'],'final_labels.npy'))[:self.args['number_particles']]
+        centers = np.load(join(self.args['results'],'final_collect_centers.npy'))
+        x = tf.linspace(-0.5,0.5,int(self.args['size']/2)+1)
+        y = tf.linspace(-0.5,0.5,self.args['size'])
+        z = tf.linspace(-0.5,0.5,self.args['size'])
+        X,Y,Z = tf.meshgrid(z,y,x)
-        for i in range(self.predict_steps):
-            image = next(self.generator_pred )
-            if self.args['num_gpus'] == 1:
-                current = self.trainer.single_device_prediction_step(image)
-            else:
-                current = self.trainer.dis_predict_step(image)
-            clusters.append(current.numpy())
-        training_feat = np.asarray(clusters).flatten()[:self.args['umap_size'],:]
-        predict_feat = np.asarray(clusters).flatten()[self.args['umap_size']:,:]
-        training_set = umap.UMAP(neighbours = self.args['neighbours'])
-        TD = training_set.fit_predict(training_feat)
-        TD_after = training_set.fit_predict(predict_feat)
-        information = np.concatenate([TD,TD_after],axis=0)
-        labels = hdbscan.HDBSCAN(min_cluster_size=self.args['minimum_size'], min_samples=10).fet_predict(information)
-        np.save(join(self.args['results'],'class_labels.npy'),labels[:self.args['number_particles']])
+        for k in np.unique(labels):
+            for index,i in enumerate(np.split(centers,int(centers.shape[0]/self.args['frames']),axis=0)):
+                for index_0,m in enumerate(np.split(i,i.shape[0],axis=0)):
+                    if not isdir(join(self.args['results'],'model_%i' %index)):
+                        mkdir(join(self.args['results'],'model_%i' %index))
+                    singular_model = []
+                    if not isfile(join(join(self.args['results'],'model_%i' %index),'model_frame_%i.mrcs' %index_0)):
+                        for x,y,z in zip(np.split(X,X.shape[0],axis=0),np.split(Y,Y.shape[0],axis=0),np.split(Z,Z.shape[0],axis=0)):
+                            stack = tf.stack([tf.reshape(x,[-1]),tf.reshape(y,[-1]),tf.reshape(z,[-1])],axis=-1)
+                            stack = tf.reshape(stack,[-1,1,3])
+                            sliced = self.trainer.generate(self.args['size'],stack,m)
+                            singular_model.append(sliced)
+                        full_model = np.fft.irfftn(np.concatenate(singular_model,axis=0)).astype(np.float32)
+                        save_model_to_mrcfile(full_model,join(join(self.args['results'],'model_%i' %index),'model_frame_%i.mrcs' %index_0))
\ No newline at end of file
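The reconstruction loop asks the generator for one z-plane of the half spectrum at a time (the x axis has size/2+1 samples because the density is real-valued), stacks the planes, and inverts with `np.fft.irfftn` to obtain the real-space volume that `save_model_to_mrcfile` writes out. A toy round trip under the same convention:

    import numpy as np

    size = 8
    vol = np.random.rand(size, size, size).astype(np.float32)
    spec = np.fft.rfftn(vol)                     # shape (8, 8, 5): last axis is size//2+1
    planes = [spec[i] for i in range(size)]      # per-plane evaluation, as in the loop
    back = np.fft.irfftn(np.stack(planes, axis=0), s=vol.shape)
    print(np.allclose(vol, back, atol=1e-5))     # True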
@@ -9,16 +9,12 @@ from os.path import isdir,join
 def main():
     parser = argparse.ArgumentParser(description='Run sortinator.')
-    parser.add_argument('--num_gpus',type=int,default = 1,
-                        help='Number of GPUs to use.')
-    parser.add_argument('--gpu_list',type=int, nargs='+',default = None,
-                        help='List of GPU devices; if None it will run on GPUs sequentially from GPU:0 and up.')
+    parser.add_argument('--gpu_id',type=int, default= 0, help='GPU ID')
     parser.add_argument('--num_cpus',type=int,default = 8,help='The maximum allowed cpus to use for preprocessing data and Kmeans clustering')
     parser.add_argument('--star', type=str, nargs='+',
                         help='list of paths to the star files; wildcards are accepted. The star files must refer to the .mrc files')
-    parser.add_argument('--batch_size', type=int,default=100,
-                        help='deep learning model training batch size')
+    parser.add_argument('--batch_size', type=int,default=[25,15,5,2,1], nargs='+',
+                        help='deep learning model training batch size (one value per resolution stage)')
     parser.add_argument('--o', type=str,default='./results',
@@ -27,136 +23,38 @@ def main():
     parser.add_argument('--f16', dest='f16',action='store_true',
                         help='Apply Tensor core acceleration to training and inference, requires compute capability of 10.0 or higher.')
-    parser.add_argument('--record', type=int,default=1000,help='validation interval where models size is printed out.')
-    parser.add_argument('--save_model', type=int,default=100,help='validation interval where models at full size are printed out.')
+    parser.add_argument('--save_model', type=int,default=5,help='validation interval where models at full size are printed out.')
-    parser.add_argument('--lr_g',type=float,default=10**(-4),help='The start learning rate of the generator')
-    parser.add_argument('--lr_d',type=float,default=10**(-4),help='The start learning rate of the discriminator')
-    parser.add_argument('--lr_e',type=float,default=10**(-5),help='The start learning rate of the encoder')
+    parser.add_argument('--lr_g',type=float,default=[10**(-5),0.5*10**(-5),10**(-6),0.5*10**(-6),10**(-7),0.5*10**(-7)], nargs='+',help='The start learning rate of the generator')
+    parser.add_argument('--lr_d',type=float,default=[10**(-4),0.5*10**(-4),10**(-5),0.5*10**(-5),10**(-6),0.5*10**(-6)], nargs='+',help='The start learning rate of the discriminator')
+    parser.add_argument('--lr_e',type=float,default=[10**(-4),0.5*10**(-4),10**(-5),0.5*10**(-5),10**(-6),0.5*10**(-6)], nargs='+',help='The start learning rate of the encoder')
     parser.add_argument('--ctf', dest='ctf',action='store_true',default=False,help='Use CTF parameters for model.')
     parser.add_argument('--noise', dest='noise',action='store_true',default=False ,help='Use the noise generator to generate and scale the noise')
-    parser.add_argument('--steps',type=int,default=[10000,100000,100000,10000,100000], nargs='+',help='how many epochs (runs through the dataset) before termination')
+    parser.add_argument('--steps',type=int,default=[5,5,5,5,5], nargs='+',help='how many epochs (runs through the dataset) before termination')
-    parser.add_argument('--imsize',type=int,default=128,choices=[128,256,512],help='the training image size. It can be 128 x 128, 256 x 256, and 512 x 512')
     parser.add_argument('--l_reg',type=float,default=0.01,help='the lambda regularization of the diversity score loss if the noise generator is active')
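Several flags are now per-stage lists that must line up with the resolution ladder [32, 64, 128, 256, 512]: `--batch_size` and `--steps` carry five values, while the learning-rate defaults carry six. A small sanity check one could add to main() (hypothetical helper; the six-versus-five discrepancy in the defaults is worth confirming against the trainer):

    RESOLUTIONS = [32, 64, 128, 256, 512]

    def check_stage_lists(batch_size, steps):
        # Hypothetical validation: one entry per progressive-growing stage.
        for name, values in [('--batch_size', batch_size), ('--steps', steps)]:
            if len(values) != len(RESOLUTIONS):
                raise ValueError('%s needs %d values, one per resolution in %s'
                                 % (name, len(RESOLUTIONS), RESOLUTIONS))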