Commit 2c376be1 authored by Jonathan Juhl

corrections

parent 9ec83b9f
@@ -59,24 +59,26 @@ class control_flow:
depth = np.load(join(self.args['tmp'],'depth.npy'))
mrc_paths = np.load(join(self.args['tmp'],'names.npy'))
-length,bytes_pr_record = self.get_parameters(mrc_paths[0])
+length,bytes_pr_record,floattype = self.get_parameters(mrc_paths[0])
val_type = {2:tf.float32,12:tf.float16}
self.args['mrc_paths'] = mrc_paths
self.args['size'] = length
self.args['number_particles'] = depth
self.args['bpr'] = bytes_pr_record
self.args['type'] = val_type[floattype]
if args['ctf']:
self.args['kvolts'] = tf.constant(np.load(join(self.args['tmp'],'electron_volts.npy')),tf.float32)
self.args['sphe_ab'] = tf.constant(np.load(join(self.args['tmp'],'spherical_abberation.npy')),tf.float32)
self.args['amp_contrast'] = tf.constant(np.load(join(self.args['tmp'],'amplitude_contrast.npy')),tf.float32)
if not isfile(join(self.args['results'],'final_labels.npy')):
GAN_NERF(self.args)
exit()
# final_labels = np.load(join(self.args['results'],'labels.npy'))
# self.write_star_file(star_files,final_labels)
# print(self.args );exit()
final_labels = np.load(join(self.args['results'],'final_labels.npy'))
self.write_star_file(self.args['star'],final_labels)
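For context on the new val_type lookup: the fourth 32-bit word of an MRC header is the mode, and under the MRC2014 convention mode 2 stores 32-bit floats while mode 12 (a later extension) stores IEEE half floats. A minimal sketch of the same mapping in plain numpy; dtype_from_mode is a hypothetical helper, not from the repo:

    import numpy as np

    # MRC2014 mode -> numpy dtype, the subset used by val_type above.
    MRC_MODE_TO_DTYPE = {2: np.float32, 12: np.float16}

    def dtype_from_mode(mode):
        # Raises a clear error for modes the pipeline does not handle.
        try:
            return np.dtype(MRC_MODE_TO_DTYPE[mode])
        except KeyError:
            raise ValueError("unsupported MRC mode: %r" % mode)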
def get_star_file_parameters(self):
counter = []
@@ -136,14 +138,6 @@ class control_flow:
phase_shift = header.index('_rlnPhaseShift')
ctf_params = []
if self.args['verbose']:
try:
class_num = header.index('_rlnClassNumber')
except ValueError:
self.args['verbose'] = False
print("--verbose cannot be used because _rlnClassNumber is missing from the star file header")
for row in reader:
@@ -170,8 +164,7 @@ class control_flow:
names.append(join(c,current_name))
else:
names.append(join(c,current_name))
if self.args['verbose']:
labels_list.append(int(s[class_num]))
@@ -194,8 +187,7 @@ class control_flow:
np.save(join(self.args['tmp'],'spherical_abberation.npy'),abberation_d)
np.save(join(self.args['tmp'],'amplitude_contrast.npy'),amp_contrast_d)
np.save(join(self.args['tmp'],'ctf_params.npy'),np.asarray(ctf_params))
if self.args['verbose']:
np.save(join(self.args['tmp'],'labels.npy'),np.asarray(labels_list))
return f,np.unique(names)
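The header.index('_rln...') calls above rely on RELION's star convention: each loop_ column is declared once as a '_rlnLabel #N' line, so a label's position in the header list is its column index in every data row. A small sketch under that assumption; column_of and star_lines are illustrative, not from the repo:

    def column_of(header_lines, label):
        # header_lines: the '_rln...' declarations of one loop_ block, in order.
        header = [line.split()[0] for line in header_lines]
        try:
            return header.index(label)
        except ValueError:
            return None  # mirrors the _rlnClassNumber fallback above

    star_lines = ['_rlnImageName #1', '_rlnPhaseShift #2', '_rlnClassNumber #3']
    assert column_of(star_lines, '_rlnClassNumber') == 2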
@@ -217,8 +209,10 @@ class control_flow:
if len(row.split()) == len(header):
names.append(row.split())
labels = labels[np.greater(labels,-1)]
for index,i in enumerate(np.unique(labels)):
-f = open(join( self.args['star'],'cluster_%s.star' %index), 'w')
+if i > -1:
+f = open(join(join(self.args['results'],'model_%i' %index),'cluster.star'), 'w')
with f:
@@ -246,14 +240,15 @@ class control_flow:
NX = np.fromstring(binary_header[0:4],np.int32)
NY = np.fromstring(binary_header[4:8],np.int32)
NZ = np.fromstring(binary_header[8:12],np.int32)
floattype = np.fromstring(binary_header[12:16],np.int32)[0]
recordsize = int((file_size-header)/(NZ[0]))
-return NX[0],NY[0],NZ[0],recordsize
+return NX[0],NY[0],NZ[0],recordsize,floattype
def get_parameters(self,paths):
-width,length,depth,record_size_new = self.read(paths)
-return length,record_size_new
+width,length,depth,record_size_new,floattype = self.read(paths)
+return length,record_size_new,floattype
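read() takes NX, NY, NZ and the new floattype from the first 16 bytes of the MRC header, then derives the per-section record size from the file size. A self-contained sketch of that header parse, using np.frombuffer in place of the deprecated np.fromstring; parse_mrc_counts is a hypothetical helper:

    import numpy as np

    def parse_mrc_counts(path):
        # MRC2014: the first four little-endian int32 words are NX, NY, NZ, MODE.
        with open(path, 'rb') as fh:
            nx, ny, nz, mode = np.frombuffer(fh.read(16), dtype='<i4')
        return int(nx), int(ny), int(nz), int(mode)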
import tensorflow as tf
import numpy as np
import umap
-from os.path import join
+from os.path import join, isfile, isdir
from trainer_sortem import Trainer
from mrc_loader_sortem import mrc_loader
from utils_sortem import save_model_to_mrcfile,pred_umap
from os import mkdir
class GAN_NERF():
def __init__(self,args):
self.args = args
-dic = {32:1,64:2,128:3,256:4,512:5}
-self.predict_steps = int(np.ceil(args['number_particles']/(args['num_gpus']*args['batch_size'])))
-self.dic = dic
+dic = {32:0,64:1,128:2,256:3,512:4}
l = np.asarray([32,64,128,256,512])
self.args['resize'] = l[np.argmin(np.abs(l-self.args['size']))]
l_list = []
alpha_list = []
im_shape_list = []
for index,steps in enumerate(args['steps'][:dic[self.args['resize']]]):
l_list += [index]*steps
alpha_list += [np.linspace(0.0,1.0,steps)]
im_shape_list += [l[index]]*steps
self.alphas = tf.constant(tf.concat(alpha_list,axis=0))
self.index = tf.constant(np.asarray(l_list))
self.im_shape = tf.constant(np.asarray(im_shape_list))
self.steps = self.alphas.shape[0]
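The loop above flattens the per-resolution step counts into one global schedule: during stage k the image size is held at l[k] while alpha ramps linearly from 0 to 1, the usual progressive-growing fade-in. A toy reconstruction with two stages of three steps each (sizes and counts are illustrative):

    import numpy as np

    sizes = np.asarray([32, 64])
    steps = [3, 3]
    stage_idx, alpha_parts, im_sizes = [], [], []
    for k, n in enumerate(steps):
        stage_idx += [k] * n
        alpha_parts.append(np.linspace(0.0, 1.0, n))  # fade-in weight per step
        im_sizes += [int(sizes[k])] * n
    alphas = np.concatenate(alpha_parts)
    # stage_idx == [0, 0, 0, 1, 1, 1]
    # alphas    == [0.0, 0.5, 1.0, 0.0, 0.5, 1.0]
    # im_sizes  == [32, 32, 32, 64, 64, 64]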
gpu_list = []
if args['gpu_list'] is None:
args['gpu_list'] = []
if args['num_gpus'] > 1:
if len(args['gpu_list']) == args['num_gpus'] or len(args['gpu_list']) == 0:
if len(args['gpu_list']) == 0:
for i in range(args['num_gpus']):
gpu_list.append('GPU:%i' %i)
else:
for i in args['gpu_list']:
gpu_list.append('GPU:%i' %i)
if args['t_res'] is not None:
predict_b_size = args['batch_size'][dic[args['t_res']]]
self.args['resize'] = args['t_res']
arl = dic[args['t_res']]
tmp = l[arl]
else:
print("the number of gpus specified is not equal to the number of gpu names. Either leave gpu names parameter unmodified or correct it.")
exit()
else:
if len(args['gpu_list']) == 0:
gpu_list.append('GPU:0')
else:
gpu_list.append('GPU:%i' %args['gpu_list'][0])
arl = np.argmin(np.abs(l-self.args['size']))
self.args['resize'] = l[arl]
predict_b_size = args['batch_size'][arl]
self.predict_steps = int(np.ceil(args['number_particles']/(predict_b_size)))
self.predict_b_size = predict_b_size
self.dic = dic
generator = mrc_loader(args).generate(self.alphas,self.index,self.im_shape)
self.arl = arl
self.tmp = tmp
predict_generator = mrc_loader(args).pred_generate()
output_generator = mrc_loader(args).pred_generate()
if args['num_gpus'] > 1:
strategy = tf.distribute.MirroredStrategy(devices= gpu_list )
self.generator = strategy.experimental_distribute_dataset( generator)
self.generator_pred = strategy.experimental_distribute_dataset( predict_generator )
self.output_generator = strategy.experimental_distribute_dataset( output_generator )
else:
self.generator = []
self.generator_index = []
strategy = tf.distribute.OneDeviceStrategy(device=gpu_list[0])
self.generator = strategy.experimental_distribute_dataset( generator )
predict_generator = mrc_loader(args,self.args['resize'],predict_b_size).pred_generate()
strategy = tf.distribute.OneDeviceStrategy(device='GPU:%i' %args['gpu_id'])
for batch,i in zip(args['batch_size'],list(range(arl+1))):
generator,generator_index = mrc_loader(args,l[i],batch).generate(args['steps'][i])
self.generator.append(strategy.experimental_distribute_dataset( generator ))
self.generator_index.append(strategy.experimental_distribute_dataset(generator_index))
self.generator_pred = strategy.experimental_distribute_dataset( predict_generator )
self.output_generator = strategy.experimental_distribute_dataset( output_generator )
args['strategy'] = strategy
self.trainer = Trainer(args)
self.train()
self.predict()
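Both branches above end with a tf.distribute strategy whose experimental_distribute_dataset wraps the loaders, so the train and predict loops stay device-agnostic. A minimal sketch of the selection logic, assuming the same 'GPU:i' naming; make_strategy is illustrative, not from the repo:

    import tensorflow as tf

    def make_strategy(num_gpus, gpu_list=None):
        # Replicate across devices when several GPUs are requested,
        # otherwise pin all work to a single device.
        if not gpu_list:
            gpu_list = ['GPU:%i' % i for i in range(num_gpus)]
        if num_gpus > 1:
            return tf.distribute.MirroredStrategy(devices=gpu_list)
        return tf.distribute.OneDeviceStrategy(device=gpu_list[0])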
def initialize_variables(self):
with self.args['strategy'].scope():
if self.args['noise']:
self.trainer.Noise['N%i' %self.dic[self.args['resize']]](tf.random.normal(shape=[1,4,4,4]),tf.ones(shape=[1,1,1]))
self.trainer.Encoder['E%i' %self.dic[self.args['resize']]](tf.ones(shape=[1,self.args['resize'],self.args['resize'],1]),tf.ones(shape=[1,1,1]))
self.trainer.Discriminator['D%i' %self.dic[self.args['resize']]](tf.ones(shape=[1,self.args['resize'],self.args['resize'],1]),tf.ones(shape=[1,1,1]))
x = tf.linspace(-0.5,0.5,int(self.args['resize']/2)+1)
y = tf.linspace(-0.5,0.5,self.args['resize'])
X,Y =tf.meshgrid(y,x)
stack = tf.cast(tf.expand_dims(tf.stack([tf.reshape(X,[-1]),tf.reshape(Y,[-1]),tf.zeros_like(tf.reshape(X,[-1]))],axis=-1),axis=1),tf.float32)
self.trainer.vd['Generator'](self.args['resize'],tf.random.normal([1,10]),stack)
self.trainer.restore_checkpoint()
def train(self):
print('Begin training: ', '-' * 60)
-current_step = self.trainer.step_variable
gen = iter(self.generator)
self.initialize_variables()
+current_step = self.trainer.vd['steps']
c_sum = np.cumsum( self.args['steps'])[:(self.arl+1)]
print("The training til star at step %i of step %i" %(int(current_step.numpy()),int(c_sum[-1])))
pred = iter(self.generator_pred)
for i in range(int(current_step)):
# fast-forward the data pipeline to where training left off,
# so that a resumed run does not reuse the batches it has already seen
data = next(gen)
for i in range(self.steps -current_step): # continues where we left off
print("data step %i" %i )
data = next(gen)
if self.args['ctf']:
params = {'image':tf.expand_dims(tf.squeeze(data[0][0]),axis=-1),
'ctf':data[0][1],
'alpha':data[1],
'index':data[2],
}
else:
params = {'image':tf.expand_dims(tf.squeeze(data[0]),axis=-1),
'ctf': None,
'alpha':data[1],
'index':data[2],
}
if self.args['num_gpus'] == 1:
self.trainer.single_device_train_step(params)
else:
self.trainer.distributed_training_step(params)
if (i % self.args['record']) == 0:
if self.args['num_gpus'] == 1:
features = []
current_shape = params['image'].numpy().shape[1]
for kk in range(int(np.ceil(self.args['umap_t_size']/self.args['batch_size']))):
data = next(pred)
tmp = np.concatenate([[0.0],c_sum],axis=0)
for index,current in enumerate(c_sum.tolist()):
features.append(self.args['strategy'].run(self.trainer.get_features,
args=(data,params['alpha'],self.trainer.Encoder[int(params['index'])],current_shape)))
self.trainer.write_summaries(features)
else:
data = next(pred)
features = []
for kk in range(int(np.ceil(self.args['umap_t_size']/(self.args['num_gpus']*self.args['batch_size'])))):
batch_size = self.args['batch_size'][index]
features.append(self.args['strategy'].run(self.trainer.get_features,
args=(data,params['alpha'],self.trainer.Encoder[int(params['index'])],current_shape)).reduce())
self.trainer.write_summaries(features)
if current-current_step >= 0:
gen_image = iter(self.generator[index])
gen_index = iter(self.generator_index[index])
for i in range(int(current_step)-int(tmp[index])):
next(gen_index)
if (i % self.args['save_model']) == 0:
for i in range(int(current_step),int(tmp[index+1])): # continues where we left off
print("data step %i of %i " %(i,int(c_sum[-1])))
data = next(gen_image)
alpha = next(gen_index)
self.trainer.single_device_train_step(data,alpha,index)
self.trainer.vd['steps'].assign_add(1)
if (int(i) % int(self.args['save_model'])) == 0:
self.trainer.save_checkpoint()
if (int(i) % int(self.args['save_model'])) == 0:
features = []
for kk in range(int(np.ceil(self.args['umap_t_size']/batch_size))):
data = next(gen_image)
print("computing prediction vectors: %i of %i" %(kk,int(np.ceil(self.args['umap_t_size']/batch_size))))
features.append(self.args['strategy'].run(self.trainer.get_features,
args=(data['image'],alpha,self.trainer.Encoder['E%i'%int(index)])))
self.trainer.write_summaries(features)
tf.keras.backend.clear_session()
self.trainer.save_checkpoint()
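The two loops at the top of train() implement resumption: the first burns current_step batches from the iterator without training, so a restarted run continues on data it has not yet seen; the second trains the remaining steps. The idea in isolation, with a toy dataset standing in for the loader:

    import tensorflow as tf

    dataset = tf.data.Dataset.range(10)   # stand-in for the mrc_loader pipeline
    current_step = 4                      # e.g. restored from a checkpoint
    it = iter(dataset)
    for _ in range(current_step):         # fast-forward past already-seen batches
        next(it)
    remaining = [int(x) for x in it]      # training would consume these
    assert remaining == [4, 5, 6, 7, 8, 9]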
def predict(self):
-pred = iter(self.output_generator)
step_checker = 0
+pred = iter(self.generator_pred)
output_vectors = []
if not isfile(join(self.args['results'],'final_featur_vectors.npy')):
if self.args['num_gpus'] > 1:
for kk in range(int(np.ceil(self.args['depth']/(self.args['num_gpus']*self.args['batch_size'])))):
data = next(pred)
output_vectors.append(self.args['strategy'].run(self.trainer.get_features,args=(data,params['alpha'],self.trainer.Encoder[self.dic[self.args['resize']]],current_shape)).reduce())
else:
-for kk in range(int(np.ceil(self.args['depth']/self.args['batch_size']))):
+for kk in range(int(np.ceil(self.args['number_particles']/self.predict_b_size))):
data = next(pred)
-output_vectors.append(self.args['strategy'].run(self.trainer.get_features,args=(data,params['alpha'],self.trainer.Encoder[self.dic[self.args['resize']]],current_shape)))
print("predicting feature vector step %i of %i" %(step_checker,int(np.ceil(self.args['number_particles']/self.predict_b_size))))
+output_vectors.append(self.args['strategy'].run(self.trainer.get_features,
+args=(data,tf.constant([1.0]),self.trainer.Encoder['E%i'%self.dic[self.args['resize']]])).numpy())
step_checker += 1
np.save(join(self.args['results'],'final_featur_vectors.npy'),np.concatenate(output_vectors,axis=0))
feature_vector = np.load(join(self.args['results'],'final_featur_vectors.npy'))
print("calculating clusters")
-np.save(join(self.args['results'],'final_featur_vectors.npy'))
-labels,umap_output,collect_centers = pred_umap(args,feature_vector)
-if not isfile(join(self.args['results'],'final_labels.npy')):
-np.save(join(self.args['results'],'final_labels.npy'))
-np.save(join(self.args['results'],'final_umap_output.npy'))
-np.save(join(self.args['results'],'final_collect_centers.npy'))
+labels,umap_output,collect_centers = pred_umap(self.args,feature_vector)
+np.save(join(self.args['results'],'final_labels.npy'),labels)
+np.save(join(self.args['results'],'final_umap_output.npy'),umap_output)
+np.save(join(self.args['results'],'final_collect_centers.npy'),collect_centers)
labels = np.load(join(self.args['results'],'final_labels.npy'))[:self.args['number_particles']]
centers = np.load(join(self.args['results'],'final_collect_centers.npy'))
x = tf.linspace(-0.5,0.5,int(self.args['size']/2)+1)
y = tf.linspace(-0.5,0.5,self.args['size'])
z = tf.linspace(-0.5,0.5,self.args['size'])
X,Y,Z = tf.meshgrid(z,y,x)
for k in np.unique(labels):
for index,i in enumerate(np.split(centers,int(centers.shape[0]/self.args['frames']),axis=0)):
for index_0,m in enumerate(np.split(i,i.shape[0],axis=0)):
if not isdir(join(self.args['results'],'model_%i' %index)):
mkdir(join(self.args['results'],'model_%i' %index))
singular_model = []
if not isfile(join(join(self.args['results'],'model_%i' %index),'model_frame_%i.mrcs' %index_0)):
for x,y,z in zip(np.split(X,X.shape[0],axis=0),np.split(Y,Y.shape[0],axis=0),np.split(Z,Z.shape[0],axis=0)):
stack = tf.stack([tf.reshape(x,[-1]),tf.reshape(y,[-1]),tf.reshape(z,[-1])],axis=-1)
stack = tf.reshape(stack,[-1,1,3])
sliced = self.trainer.generate(self.args['size'],stack,m)
singular_model.append(sliced)
full_model = np.fft.irfftn(np.concatenate(singular_model,axis=0)).astype(np.float32)
save_model_to_mrcfile(full_model,join(join(self.args['results'],'model_%i' %index),'model_frame_%i.mrcs' %index_0))
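The half-width x axis (size/2 + 1 samples) means the generator is evaluated on rFFT-layout coordinates; the slabs are concatenated and np.fft.irfftn brings the volume back to real space. A shape-level check of that convention:

    import numpy as np

    size = 32
    vol = np.random.rand(size, size, size).astype(np.float32)
    spec = np.fft.rfftn(vol)
    # rfftn keeps only half of the last axis, matching the size/2 + 1 linspace.
    assert spec.shape == (size, size, size // 2 + 1)
    recon = np.fft.irfftn(spec, s=vol.shape)
    assert np.allclose(recon, vol, atol=1e-5)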
-for i in range(self.predict_steps):
-image = next(self.generator_pred )
-if self.args['num_gpus'] == 1:
-current = self.trainer.single_device_prediction_step(image)
-else:
-current = self.trainer.dis_predict_step(image)
-clusters.append(current.numpy())
-training_feat = np.asarray(clusters).flatten()[:self.args['umap_size'],:]
-predict_feat = np.asarray(clusters).flatten()[self.args['umap_size']:,:]
-training_set = umap.UMAP(neighbours = self.args['neighbours'])
-TD = training_set.fit_predict(training_feat)
-TD_after = training_set.fit_predict(predict_feat)
-information = np.concatenate([TD,TD_after],axis=0)
-labels = hdbscan.HDBSCAN(min_cluster_size=self.args['minimum_size'], min_samples=10).fet_predict(information)
-np.save(join(self.args['results'],'class_labels.npy'),labels[:self.args['number_particles']])
\ No newline at end of file
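For context on the clustering path (pred_umap above, and the removed inline version): feature vectors are reduced with UMAP and labelled with HDBSCAN, whose label -1 marks noise, which is why write_star_file filters labels > -1. A hedged sketch using the public umap-learn and hdbscan APIs, with sizes chosen only for illustration:

    import numpy as np
    import umap
    import hdbscan

    features = np.random.rand(500, 64)            # stand-in feature vectors
    embedded = umap.UMAP(n_neighbors=30).fit_transform(features)
    labels = hdbscan.HDBSCAN(min_cluster_size=25,
                             min_samples=10).fit_predict(embedded)
    kept = labels[labels > -1]                    # drop HDBSCAN's noise label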
@@ -9,16 +9,12 @@ from os.path import isdir,join
def main():
parser = argparse.ArgumentParser(description='Run sortinator.')
parser.add_argument('--num_gpus',type=int,default = 1,
help='Number of GPUs to use.')
parser.add_argument('--gpu_list',type=int, nargs='+',default = None,
help='List of GPU devices; if None, GPUs are used sequentially from GPU:0 and up.')
parser.add_argument('--gpu_id',type=int, default= 0, help='GPU ID')
parser.add_argument('--num_cpus',type=int,default = 8,help='The maximum number of CPUs to use for preprocessing data and K-means clustering')
parser.add_argument('--star', type=str, nargs='+',
help='list of paths to the star files; wildcards are accepted. The star files must refer to the .mrc files')
-parser.add_argument('--batch_size', type=int,default=100,
+parser.add_argument('--batch_size', type=int,default=[25,15,5,2,1], nargs='+',
help='training batch size (one value per resolution stage)')
parser.add_argument('--o', type=str,default='./results',
@@ -27,136 +23,38 @@ def main():
parser.add_argument('--f16', dest='f16',action='store_true',
help='Apply Tensor Core acceleration to training and inference; requires compute capability 7.0 or higher.')
parser.add_argument('--record', type=int,default=1000,help='interval (in steps) at which validation feature summaries are recorded.')
-parser.add_argument('--save_model', type=int,default=100,help='validation interval where models at full size are printed out.')
+parser.add_argument('--save_model', type=int,default=5,help='interval (in steps) at which checkpoints and full-size models are written.')
-parser.add_argument('--lr_g',type=float,default=10**(-4),help='The start learning rate of the generator')
+parser.add_argument('--lr_g',type=float,default=[10**(-5),0.5*10**(-5),10**(-6),0.5*10**(-6),10**(-7),0.5*10**(-7)], nargs='+',help='The starting learning rates of the generator (one per resolution stage)')
-parser.add_argument('--lr_d',type=float,default=10**(-4),help='The start learning rate of the descriminator')
+parser.add_argument('--lr_d',type=float,default=[10**(-4),0.5*10**(-4),10**(-5),0.5*10**(-5),10**(-6),0.5*10**(-6)], nargs='+',help='The starting learning rates of the discriminator (one per resolution stage)')
-parser.add_argument('--lr_e',type=float,default=10**(-5),help='The start learning rate of the encoder')
+parser.add_argument('--lr_e',type=float,default=[10**(-4),0.5*10**(-4),10**(-5),0.5*10**(-5),10**(-6),0.5*10**(-6)], nargs='+',help='The starting learning rates of the encoder (one per resolution stage)')
parser.add_argument('--ctf', dest='ctf',action='store_true',default=False,help='Use CTF parameters for model.')
parser.add_argument('--noise', dest='noise',action='store_true',default=False ,help='Use the noise generator to generate and scale the noise')
-parser.add_argument('--steps',type=int,default=[10000,100000,100000,10000,100000], nargs='+',help='how many epochs( runs through the dataset) before termination')
-parser.add_argument('--imsize',type=int,default=128,choices=[128,256,512],help='the training image size. It can be 128 x 128, 256 x 256, and 512 x 512')
+parser.add_argument('--steps',type=int,default=[5,5,5,5,5], nargs='+',help='number of training steps per resolution stage before termination')
parser.add_argument('--l_reg',type=float,default=0.01,help='the lambda regularization of the diversity score loss if the noise generator is active')
parser.add_argument('--m_batch_size',type=int,default=25,help='the batch size to make the 3D model')
-parser.add_argument('--frames',type=int,default=8,help='number of models to generate from each cluster')
+parser.add_argument('--frames',type=int,default=4,help='number of models to generate from each cluster')
-parser.add_argument('--umap_p_size',type=int,default=100000,help='The UMAP size to train the umap model. It is trained on the CPU in parallel')
+parser.add_argument('--umap_p_size',type=int,default=100,help='number of samples used to fit the UMAP model; fitting runs on the CPU in parallel')
-parser.add_argument('--umap_t_size',type=int,default=10000,help='The UMAP size')
+parser.add_argument('--umap_t_size',type=int,default=100,help='number of samples embedded with UMAP when recording training summaries')
parser.add_argument('--neighbours',type=int,default=30,help='number of neighbours in the graph creation algorithm')
parser.add_argument('--t_res',type=int,default=None,choices=[32,64,128,256,512],help='fixed training resolution; if set, it overrides the resolution chosen automatically from the image size')
parser.add_argument('--minimum_size',type=int,default=500,help='the minimum size before a group is considered an actual cluster; anything smaller is treated as noise and discarded')
args = parser.parse_args()
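For reference, a plausible invocation of the parser with the new list-valued flags; parse_args accepts an explicit argv, and the paths here are illustrative, not from the repo:

    args = parser.parse_args([
        '--star', 'data/particles.star',
        '--num_gpus', '2', '--gpu_list', '0', '1',
        '--batch_size', '25', '15', '5', '2', '1',
        '--steps', '5', '5', '5', '5', '5',
        '--ctf',
    ])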
-if isinstance(args.num_gpus,int) and args.num_gpus > 0:
-print("num gpus is instance of: int", isinstance(args.num_gpus,int), "and is: %i" %args.num_gpus)
-else:
-assert print("the number of gpus is not an integer or is less than 0")
-if isinstance(args.num_cpus,int) and args.num_cpus > 0:
-print("num cpus is instance of: int", isinstance(args.num_cpus,int), "and is: %i" %args.num_cpus)
-else:
-assert print("the number of cpus is not an integer or is less than 0")
-if isinstance(args.star,list) and all(isinstance(x, str) for x in args.star):
-print("star file is a list of strings",args.star)
-else:
-assert print("star file is not a string ")
-if (isinstance(args.gpu_list,list) and all(isinstance(x, int) for x in args.gpu_list)) or args.gpu_list == None:
-print("gpu list file is a list of integers",args.gpu_list)
-else:
-assert print("gpu list file is not a list of integers")
-if isinstance(args.batch_size,int) and args.batch_size > 0:
-print("training batch is int and larger than 0",args.batch_size)
-else:
-assert print("training batch is not int and less than 0 ")
-if isinstance(args.m_batch_size,int) and args.m_batch_size > 0:
-print("making model is int and larger than 0",args.m_batch_size)
-else:
-assert print("prediction batch is not int or less than 0 ")
-if isinstance(args.o,str):
-print("output directory is a string",args.o)
-else:
-assert print("output directory is not a string")
-if isinstance(args.f16,bool):
-print("half precision is a bool",args.f16)
-else:
-assert print("half precision is not a bool")
-if isinstance(args.steps,list) and len(args.steps)==5:
-print("The number of epochs is a list and is "+ str(args.steps))
-else:
-assert print("epochs is not an integer")
-if isinstance(args.lr_g,float) and args.lr_g > 0.0:
-print("the learning rate for generator is a float and larger than 0.0")
-else:
-assert print("the learning rate for generator is not a float and larger than 0.0")
-if isinstance(args.lr_d,float) and args.lr_d > 0.0:
-print("the learning rate for discriminator in beginning is not a float and larger than 0.0")
-else:
-assert print("the learning rate for discriminator in beginning is a float and larger than 0.0")
-if isinstance(args.ctf,bool):
-print("the ctf is a bool")
-else:
-assert print("the ctf is not a bool")
-if isinstance(args.frames,int) and args.frames > 0:
-print("the number of frames is an int and is larger than zero")
-else:
-assert print("it is not a integer and is not larger than 0")
-if isinstance(args.minimum_size,int) and args.minimum_size > 0:
-print("the mninimum size of the cluster is an integer and is larger than 0")
-else:
-assert print("the minimum size of the cluster is not an integer or is not larger than 0")
-if isinstance(args.neighbours,int) and args.neighbours > 0:
-print("the nearest neighbour size is an integer and is larger than zero")
-else:
-assert print("the nearest neighbour size is not an integer or is not larger than zero")
-if isinstance(args.umap_t_size,int) and args.umap_t_size > 0:
-print("umap_size is an integer and is larger than zero")
-else:
-assert print(" umap_size is not an integer")
-if isinstance(args.umap_p_size,int) and args.umap_p_size > 0:
-print("umap_size is an integer and is larger than zero")
-else:
-assert print(" umap_size is not an integer")
if not isdir(args.o):
mkdir(args.o)
if not isdir(join(args.o,'tmp')):
@@ -165,17 +63,15 @@ def main():
mkdir(join(args.o,'model'))
if not isdir(join(args.o,'results')):
mkdir(join(args.o,'results'))
if not isdir(join(args.o,'best_model')):
mkdir(join(args.o,'best_model'))
-args_dic = {'num_gpus': args.num_gpus,
+args_dic = {'gpu_id': args.gpu_id,
'num_cpus': args.num_cpus,