CG simulation of a protein using anisotropic precomputed potentials

"""
Generate and simulate a coarse-grained potential of a protein.
This is done in a number of steps.

In the first step, generate an orientation file.
This file is used to determine if the orientation is identical to a previous orientation.
For example, when the spherical polar angle is zero, the orientation is the same for all azimuthal
angles.
Orientation files may be included with TabulateTwoRigidBody3D::input_orientation_file.
For each orientation, a value of -1 identifies a unique orientation and a value of zero or more
identifies an earlier orientation that is identical.
These orientation files only need to be generated once per num_orientations_per_pi value.

After this step, the next script after_1_contact is called in post_process.
"""

import copy
import os.path
import argparse
import subprocess
import pandas as pd
from pyfeasst import fstio

def parse():
    """ Parse arguments for this and following scripts """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--feasst_install', type=str, default='../../../build/',
                        help='FEASST install directory (e.g., the path to build)')
    parser.add_argument('--num_orientations_per_pi', type=int, default=2,
                        help='maximum number of orientations per 180 degrees')
    parser.add_argument('--run_type', '-r', type=int, default=0,
                        help='0: run, 1: submit to queue, 2: post-process')
    parser.add_argument('--hours_terminate', type=float, default=14*24,
                        help='hours until termination')
    parser.add_argument('--num_nodes', type=int, default=1, help='Number of nodes in queue')
    parser.add_argument('--procs_per_node', type=int, default=16, help='Number processors in a node')
    parser.add_argument('--scratch', type=str, default=None,
                        help='Optionally write scheduled job to scratch/logname/jobid.')
    parser.add_argument('--queue_flags', type=str, default="",
                        help='extra flags for queue (e.g., for slurm, "-p queue")')
    parser.add_argument('--node', type=int, default=0, help='node ID')
    parser.add_argument('--queue_id', type=int, default=-1, help='If != -1, read args from file')
    parser.add_argument('--queue_task', type=int, default=0, help='If > 0, restart from checkpoint')

    # additional in contact step (or more)
    parser.add_argument('--pH', type=float, default=6, help='pH')
    parser.add_argument('--domain1', type=str, default='4lyt', help='fstprt file')
    parser.add_argument('--domain2', type=str, default='4lyt', help='fstprt file')
    parser.add_argument('--contact_xyz_file', type=str, default='',
                        help='If not empty, print contact configuration for each orientation')
    parser.add_argument('--contact_xyz_index', type=int, default=-1,
                        help='If not -1, print contact configuration for one orientation')

    # additional in energy step (or more)
    parser.add_argument('--num_z', type=int, default=7, help='num of distances per orientation')
    parser.add_argument('--gamma', type=float, default=-4, help='stretching exponent for table')
    parser.add_argument('--temperature', type=float, default=298.15, help='temperature in Kelvin')
    parser.add_argument('--ionic_strength', type=float, default=0.15, help='formulation ionic strength of NaCl in Molar units')
    parser.add_argument('--smoothing_distance', type=float, default=2, help='distance from cutoff to smooth to zero')

    # additional in b2 step
    parser.add_argument('--table_file', type=str, default='energy.txt', help='file describe cg table potential.')
    parser.add_argument('--cubic_side_length', type=float, default=1e4, help='cubic side length')
    parser.add_argument('--molecular_weight', type=float, default=14315.03534, help='molecular weight of protein in g/mol')
    parser.add_argument('--reference_sigma', type=float, default=30, help='size of hard sphere on COM of rigid domain')
    parser.add_argument('--ignore_energy', type=str, default='false', help='true if interaction is excluded volume only.')
    parser.add_argument('--ignore_intra_energy', type=str, default='false', help='true if intra interaction is excluded volume only.')
    parser.add_argument('--num_beta_taylor', type=int, default=10, help='number of Tayler series derivatives')
    parser.add_argument('--show_plot', type=int, default=0, help='show extrapolation plot if != 0')
    parser.add_argument('--trials_per', type=int, default=int(1e5), help='number of trials per iteration')
    parser.add_argument('--equilibration', type=int, default=int(2e1), help='number of iterations in equilibration')
    parser.add_argument('--production', type=int, default=int(2e1), help='number of iterations in production')
    parser.add_argument('--seed', type=int, default=-1,
                        help='Random number generator seed. If -1, assign random seed to each sim.')
    parser.add_argument('--fstprt', type=str, default='/feasst/plugin/aniso/particle/aniso_tabular.fstprt', help='fstprt file')
    return parser

def write_feasst_script(params, script_file):
    """ Write fst script for a single simulation with keys of params {} enclosed. """
    with open(script_file, 'w', encoding='utf-8') as myfile:
        myfile.write("""
MonteCarlo
Configuration cubic_side_length 2e2 particle_type0 /feasst/particle/spce.fstprt particle_type1 /feasst/particle/spce.fstprt \
  add_particles_of_type0 1 add_particles_of_type1 1 \
  group0 fixed fixed_particle_type 0 group1 mobile mobile_particle_type 1
Potential Model ModelEmpty
TabulateTwoRigidBody3D num_orientations_per_pi {num_orientations_per_pi} output_orientation_file {prefix}{num_orientations_per_pi}.txt

MonteCarlo
Configuration cubic_side_length 2e2 particle_type0 /feasst/particle/spce.fstprt particle_type1 /feasst/particle/propane.fstprt \
  add_particles_of_type0 1 add_particles_of_type1 1 \
  group0 fixed fixed_particle_type 0 group1 mobile mobile_particle_type 1
Potential Model ModelEmpty
TabulateTwoRigidBody3D num_orientations_per_pi {num_orientations_per_pi} output_orientation_file {prefix}{num_orientations_per_pi}_ij.txt
""".format(**params))

def post_process(params):
    """ Check the final file length and then launch the next step. """
    nk = params['num_orientations_per_pi']
    for ij in [True, False]:
        extra = ''
        expected = (2*nk+1)**2 * (nk+1)**3
        if ij:
            expected = (2*nk+1)**3 * (nk+1)**2
            extra = '_ij'
        #print('expected number of orientations:', expected)
        df = pd.read_csv('''{prefix}{num_orientations_per_pi}'''.format(**params)+extra+'.txt', skiprows=1, sep=r'\s+')
        assert expected == len(df.columns)
    print('launching after_1_1_contact.py')
    subprocess.check_call('python after_1_1_contact.py '+fstio.dict_to_argparse(params['original_args']),
                          shell=True, executable='/bin/bash')

if __name__ == '__main__':
    prsr = parse()
    args, unknown_args = prsr.parse_known_args()
    assert len(unknown_args) == 0, 'An unknown argument was included: '+str(unknown_args)
    prms = vars(args)
    prms['original_args'] = copy.deepcopy(prms)
    prms['script'] = __file__
    prms['prefix'] = 'orientations'
    prms['sim_id_file'] = prms['prefix'] + '_sim_ids.txt'
    prms['minutes'] = int(prms['hours_terminate']*60) # minutes allocated on queue
    prms['hours_terminate'] = 0.99*prms['hours_terminate'] - 0.0333 # terminate before queue
    prms['procs_per_sim'] = prms['procs_per_node']
    prms['num_nodes'] = 1
    prms['num_sims'] = prms['num_nodes']
    orient_file_prefix = '''{prefix}{num_orientations_per_pi}'''.format(**prms)
    if os.path.isfile(orient_file_prefix+'.txt') and \
       os.path.isfile(orient_file_prefix+'_ij.txt'):
        print('using existing:', orient_file_prefix)
        post_process(params=prms)
    else:
        fstio.run_simulations(params=prms,
                              sim_node_dependent_params=None,
                              write_feasst_script=write_feasst_script,
                              post_process=post_process,
                              queue_function=fstio.slurm_single_node,
                              args=args)
"""
Generate a contact table of a protein, by default described by the provided 4lyt.pdb file.
Each atom is modeled as a hard sphere and the contact distance is computed over a number of
orientations.
"""

import copy
import sys
import os.path
import subprocess
from pyfeasst import fstio
from pyfeasst import coarse_grain_pdb
from launch_1_cg_protein import parse

def write_feasst_script(params, script_file):
    """ Write fst script for a single simulation with keys of params {} enclosed. """
    with open(script_file, 'w', encoding='utf-8') as myfile:
        myfile.write("""
MonteCarlo
Configuration cubic_side_length 2e2 particle_type0 {domain1}.fstprt particle_type1 {domain2}.fstprt \
  add_particles_of_type0 1 add_particles_of_type1 1 cutoff 0 set_cutoff_min_to_sigma true {vis_extra}
Potential Model HardSphere VisitModel VisitModelCell min_length max_sigma energy_cutoff 1e100
TabulateTwoRigidBody3D proc {sim} num_proc {num_sims} input_orientation_file {orientation_file} num_z -1 output_table_file {prefix}{sim}.txt {contact_xyz_file} contact_xyz_index {contact_xyz_index}
""".format(**params))

def post_process(params):
    """ Combining table files, checking length and then launching the next step """
    subprocess.check_call(['sleep', '5']) # without this command, combine doesn't read all tables?
    if not os.path.isfile('''{prefix}.txt'''.format(**params)):
        fstio.combine_tables_two_rigid_body(prefix=params['prefix'], suffix='.txt',
                                            num_procs=params['num_sims'])
        if params['num_orientations_per_pi'] == 6 and params['pH'] == 6 and \
           params['domain1'] == '4lyt' and  params['domain2'] == params['domain1']:
            with open("""{prefix}.txt""".format(**params), 'r', encoding="utf-8") as file1:
                lines = file1.readlines()
            #print(len(lines))
            assert len(lines) == 681
            assert lines[0] == 'site_types 1 0\n'
            assert lines[-1] == '-1 160\n'
    print('launching after_1_2_energy.py')
    subprocess.check_call('python after_1_2_energy.py '+fstio.dict_to_argparse(params['original_args']), shell=True, executable='/bin/bash')

if __name__ == '__main__':
    parser = parse()
    args, unknown_args = parser.parse_known_args()
    assert len(unknown_args) == 0, 'An unknown argument was included: '+str(unknown_args)
    prms = vars(args)
    prms['original_args'] = copy.deepcopy(prms)
    prms['prefix'] = 'contact'
    if os.path.isfile('''{prefix}.txt'''.format(**prms)):
        print('using existing:', '''{prefix}.txt'''.format(**prms))
        post_process(prms)
        sys.exit()
    prms['script'] = __file__
    prms['sim_id_file'] = prms['prefix'] + '_sim_ids.txt'
    prms['minutes'] = int(prms['hours_terminate']*60) # minutes allocated on queue
    prms['hours_terminate'] = 0.99*prms['hours_terminate'] - 0.0333 # terminate before queue
    prms['orientation_file'] = 'orientations' + str(prms['num_orientations_per_pi'])
    if prms['domain1'] != prms['domain2']:
        prms['orientation_file'] += '_ij'
    prms['orientation_file'] += '.txt'
    prms['procs_per_sim'] = 1
    prms['num_sims'] = prms['num_nodes']*prms['procs_per_node']
    if prms['contact_xyz_file'] != '':
        prms['contact_xyz_file'] = 'contact_xyz_file ' + prms['contact_xyz_file']
        prms['vis_extra'] = 'group0 fixed fixed_particle_type 0 group1 mobile mobile_particle_type 1'
    else:
        prms['vis_extra'] = ''

    # convert pdb to pqr to fstprt
    if prms['run_type'] != 2:
        for domain in [prms['domain1'], prms['domain2']]:
            # example of how pdb is converted to pqr
            #subprocess.check_call('pdb2pqr30 --titration-state-method=propka --with-ph='+str(prms['pH'])+' --ff=PARSE '+domain+'.pdb '+domain+'.pqr --drop-water > '+domain+'.log 2>&1', shell=True, executable='/bin/bash')
            coarse_grain_pdb.write_fstprt(domain)

    fstio.run_simulations(params=prms,
                          sim_node_dependent_params=None,
                          #sim_node_dependent_params=sim_node_dependent_params,
                          write_feasst_script=write_feasst_script,
                          post_process=post_process,
                          queue_function=fstio.slurm_single_node,
                          args=args)
"""
Generate an energy table of a protein, by default described by the provided 4lyt.pdb file.
Each atom is modeled as a hard sphere with an LJ and screened charge interation computed over a number of orientations.
"""

import copy
import sys
import os.path
import subprocess
import numpy as np
from pyfeasst import fstio
from pyfeasst import physical_constants
from launch_1_cg_protein import parse

def write_feasst_script(params, script_file):
    """ Write fst script for a single simulation with keys of params {} enclosed. """
    with open(script_file, 'w', encoding='utf-8') as myfile:
        myfile.write("""
MonteCarlo
Configuration cubic_side_length {initial_box} particle_type0 {domain1}.fstprt particle_type1 {domain2}.fstprt \
  add_particles_of_type0 1 add_particles_of_type1 1 cutoff {cutoff} \
  group0 fixed fixed_particle_type 0 group1 mobile mobile_particle_type 1
Potential Model ModelTwoBodyFactory \
  model0 HardSphere \
  model1 LennardJones \
  model2 DebyeHuckel kappa {kappa} dielectric {dielectric_water} smoothing_distance {smoothing_distance} \
  VisitModel VisitModelCell min_length max_cutoff energy_cutoff 1e100
TabulateTwoRigidBody3D proc {sim} num_proc {num_sims} input_orientation_file {orientation_file} num_z {num_z} smoothing_distance {smoothing_distance} input_table_file {contact_file} output_table_file {prefix}{sim}.txt gamma {gamma}
""".format(**params))

def post_process(params):
    """ Combine tables, check their length and launch the next step """
    subprocess.check_call(['sleep', '15']) # without this command, combine doesn't read all tables?
    if not os.path.isfile('''{prefix}.txt'''.format(**params)):
        fstio.combine_tables_two_rigid_body(prefix=params['prefix'], suffix='.txt',
                                            num_procs=params['num_sims'])
        if params['num_orientations_per_pi'] == 6 and params['domain1'] == '4lyt' and \
           params['domain2'] == params['domain1']:
            with open("""{prefix}.txt""".format(**params), 'r', encoding="utf-8") as file1:
                lines = file1.readlines()
            #print(len(lines))
            assert len(lines) == 681
            assert lines[0] == 'site_types 1 0\n'
            assert lines[6] == '3.762260e+01 -4.069026e+00 -7.593451e-04\n'
            assert lines[-1] == '-1 160\n'
    print('launching after_1_3_b2.py')
    subprocess.check_call('python after_1_3_b2.py '+fstio.dict_to_argparse(params['original_args']),
                          shell=True, executable='/bin/bash')

if __name__ == '__main__':
    parser = parse()
    args, unknown_args = parser.parse_known_args()
    assert len(unknown_args) == 0, 'An unknown argument was included: '+str(unknown_args)
    prms = vars(args)
    prms['original_args'] = copy.deepcopy(prms)
    prms['script'] = __file__
    prms['prefix'] = 'energy'
    if os.path.isfile('''{prefix}.txt'''.format(**prms)):
        print('using existing:', '''{prefix}.txt'''.format(**prms))
        post_process(prms)
        sys.exit()
    prms['sim_id_file'] = prms['prefix'] + '_sim_ids.txt'
    prms['minutes'] = int(prms['hours_terminate']*60) # minutes allocated on queue
    prms['hours_terminate'] = 0.99*prms['hours_terminate'] - 0.0333 # terminate before queue
    prms['orientation_file'] = 'orientations' + str(prms['num_orientations_per_pi'])
    if prms['domain1'] != prms['domain2']:
        prms['orientation_file'] += '_ij'
    prms['orientation_file'] += '.txt'
    prms['contact_file'] = 'contact.txt'
    prms['procs_per_sim'] = 1
    prms['num_sims'] = prms['num_nodes']*prms['procs_per_node']
    temp_cel = prms['temperature'] - 273.15
    prms['dielectric_water'] = 87.74 - 0.40008*temp_cel + 9.398e-4*temp_cel**2 - 1.4e-6*temp_cel**3
    eps_0 = physical_constants.VacuumElectricPermittivity().value()
    elem_q = physical_constants.ElementaryCharge().value()
    na = physical_constants.AvogadroConstant().value()
    kb = physical_constants.BoltzmannConstant().value()
    prms['kappa'] = np.sqrt(2*(elem_q**2)*prms['ionic_strength']*(1e3)*na/(prms['dielectric_water']*eps_0*kb*prms['temperature']*1e20))
    prms['cutoff'] = 5/prms['kappa']
    prms['initial_box'] = 4*prms['cutoff'] # initial box adjusted by TabulateTwoRigidBody3D

    fstio.run_simulations(params=prms,
                          sim_node_dependent_params=None,
                          #sim_node_dependent_params=sim_node_dependent_params,
                          write_feasst_script=write_feasst_script,
                          post_process=post_process,
                          queue_function=fstio.slurm_single_node,
                          args=args)
"""
Compute the second virial coefficient using coarse-grained models.
"""

import subprocess
import numpy as np
import pandas as pd
from pyfeasst import accumulator
from pyfeasst import fstio
from pyfeasst import physical_constants
from launch_1_cg_protein import parse

def write_feasst_script(params, script_file):
    """ Write fst script for a single simulation with keys of params {} enclosed. """
    with open(script_file, 'w', encoding='utf-8') as myfile:
        myfile.write("""
MonteCarlo
RandomMT19937 seed {seed}
Configuration cubic_side_length {cubic_side_length} particle_type0 {fstprt} add_particles_of_type0 2 group0 mobile mobile_particle_index 1
Potential Model TwoBodyTable VisitModelInner VisitModelInnerTable table_file {table_file} ignore_energy {ignore_energy}
RefPotential Model HardSphere sigma 0 cutoff 0 sigma0 {reference_sigma} cutoff0 {reference_sigma}
ThermoParams beta {beta}
MayerSampling num_beta_taylor {num_beta_taylor} num_trials_per_iteration {trials_per} num_iterations_to_complete {equilibration}
TrialTranslate new_only true reference_index 0 tunable_param 35 group mobile
TrialRotate new_only true reference_index 0 tunable_param 40

# tune trial parameters
CriteriaWriter trials_per_write {trials_per} output_file {prefix}_{sim}_b2_eq.txt
Log trials_per_write {trials_per} output_file {prefix}_{sim}_eq.txt
#Movie trials_per_write {trials_per} output_file {prefix}_{sim}_eq.xyz
#CPUTime trials_per_write {trials_per} output_file {prefix}_{sim}_cpu.txt append true
Tune
#Run num_trials {equilibration}
Run until_criteria_complete true
RemoveAnalyze name CriteriaWriter
RemoveAnalyze name Log
#RemoveAnalyze name Movie
RemoveModify name Tune

# production
CriteriaWriter trials_per_write {trials_per} output_file {prefix}_{sim}_b2.txt
Log trials_per_write {trials_per} output_file {prefix}_{sim}.txt
#Movie trials_per_write {trials_per} output_file {prefix}_{sim}.xyz
MayerSampling num_beta_taylor {num_beta_taylor} num_trials_per_iteration {trials_per} num_iterations_to_complete {production}
Run until_criteria_complete true
""".format(**params))

def post_process(params):
    """ Compute statistics and output B2 value in appropriate units. """
    subprocess.check_call(['sleep', '5']) # without this command, combine doesn't read all tables?
    b2acc = accumulator.Accumulator()
    ref = (2*np.pi/3)*params['reference_sigma']**3
    mw = params['molecular_weight']/1e3 # kDa
    ref *= 1e-26*physical_constants.AvogadroConstant().value()/mw/mw # A3 to 10^4 molml/g2
    for p in range(params['procs_per_node']):
        with open(params['prefix']+'_'+str(p)+"_b2.txt", 'r', encoding="utf-8") as file1:
#        with open(params['prefix']+'_'+str(p)+"_b2_eq.txt", 'r') as file1:
            lines = file1.readlines()
        if len(lines) != 0:
            #print('p', p, 'lines[0]', lines[0])
            exec('iprm=' + lines[0], globals())
            #print(iprm)
            #print('ref(A^3)', ref)
            b2_molmlg2=iprm['second_virial_ratio']*ref
            #print('b2(A^3)', b2)
            #print(b2_molmlg2, '10^-4 mol*ml*g^-2')
            b2acc.add(b2_molmlg2)
            if p == 0:
                df = pd.DataFrame(data={p: iprm['beta_taylor']})
            else:
                df[p] = iprm['beta_taylor']
    print('b2 (mol*ml/g2)', b2acc.mean(), b2acc.stdev()/np.sqrt(params['procs_per_node']))
    if params['molecular_weight'] == 14315.03534:
        assert np.abs(b2acc.mean() - 1.72) < 0.1

if __name__ == '__main__':
    parser = parse()
    args, unknown_args = parser.parse_known_args()
    assert len(unknown_args) == 0, 'An unknown argument was included: '+str(unknown_args)
    prms = vars(args)
    prms['prefix'] = 'b2'
    prms['sim_id_file'] = prms['prefix'] + '_sim_ids.txt'
    prms['script'] = __file__
    prms['minutes'] = int(prms['hours_terminate']*60) # minutes allocated on queue
    prms['hours_terminate'] = 0.99*prms['hours_terminate'] - 0.0333 # terminate before queue
    prms['procs_per_sim'] = 1
    prms['num_sims'] = prms['num_nodes']*prms['procs_per_node']
    prms['beta'] = 1./(prms['temperature']*physical_constants.MolarGasConstant().value()/1e3) # mol/kJ

    fstio.run_simulations(params=prms,
                          sim_node_dependent_params=None,
                          #sim_node_dependent_params=sim_node_dependent_params,
                          write_feasst_script=write_feasst_script,
                          post_process=post_process,
                          queue_function=fstio.slurm_single_node,
                          args=args)