Statistiques
| Branche: | Tag: | Révision :

dockonsurf / modules / formats.py @ 1d8c374e

Historique | Voir | Annoter | Télécharger (10,51 ko)

1
"""Module for the interconversion between different kinds of atomic data.
2

3
functions:
4
confs_to_mol_list: Converts the conformers inside a rdkit.Mol object to a list
5
    of separate rdkit.Mol objects.
6
rdkit_mol_to_ase_atoms: Converts a rdkit.Mol object into ase.Atoms object.
7
add_special_atoms: Allows ase to use custom elements with symbols not in the
8
    periodic table.
9
adapt_format: Converts the coordinate files into a required library object type.
10
read_coords_cp2k: Reads the coordinates from a CP2K restart file and returns an
11
    ase.Atoms object.
12
collect_coords: Directs the reading of coordinates on a set of subdirectories.
13
read_energy_cp2k: Reads the CP2K out file and returns its final energy.
14
collect_energies: Directs the reading of energies on a set of subdirectories.
15
"""
16

    
17
import logging
18

    
19
import rdkit.Chem.AllChem as Chem
20

    
21
logger = logging.getLogger('DockOnSurf')
22

    
23

    
24
def confs_to_mol_list(mol: Chem.rdchem.Mol, idx_lst=None):
25
    """Converts the conformers inside a rdkit mol object to a list of
26
    separate mol objects.
27

28
    @param mol: rdkit mol object containing at least one conformer.
29
    @param idx_lst: list of conformer indices to be considered. If not passed,
30
        all conformers are considered.
31
    @return: list of separate mol objects.
32
    """
33
    if idx_lst is None:
34
        idx_lst = list(range(mol.GetNumConformers()))
35
    return [Chem.MolFromMolBlock(
36
        Chem.MolToMolBlock(mol, confId=int(idx)).replace("3D", ""),
37
        removeHs=False) for idx in idx_lst]
38

    
39

    
40
def rdkit_mol_to_ase_atoms(mol: Chem.rdchem.Mol):
41
    """Converts a rdkit mol object into ase Atoms object.
42
    @param mol: rdkit mol object containing only one conformer.
43
    @return ase.Atoms: ase Atoms object with the same coordinates.
44
    """
45
    from ase import Atoms
46
    if mol.GetNumConformers() > 1:
47
        logger.warning('A mol object with multiple conformers is parsed, '
48
                       'converting to Atoms only the first conformer.')
49
    symbols = [atm.GetSymbol() for atm in mol.GetAtoms()]
50
    positions = mol.GetConformer(0).GetPositions()
51
    return Atoms(symbols=symbols, positions=positions)
52

    
53

    
54
def add_special_atoms(symbol_pairs):
55
    """Allows ase to use custom elements with symbols not in the periodic table.
56

57
    This function adds new chemical elements to be used by ase. Every new custom
58
    element must have a traditional (present in the periodic table) partner
59
    from which to obtain all its properties.
60
    @param symbol_pairs: List of tuples containing pairs of chemical symbols.
61
        Every tuple contains a pair of chemical symbols, the first label must be
62
        the label of the custom element and the second one the symbol of the
63
        reference one (traditional present on the periodic table).
64
    @return:
65
    """  # TODO Enable special atoms for rdkit
66
    import numpy as np
67
    from ase import data
68
    for i, pair in enumerate(symbol_pairs):
69
        data.chemical_symbols += [pair[0]]
70
        z_orig = data.atomic_numbers[pair[1]]
71
        orig_iupac_mass = data.atomic_masses_iupac2016[z_orig]
72
        orig_com_mass = data.atomic_masses_common[z_orig]
73
        data.atomic_numbers[pair[0]] = max(data.atomic_numbers.values()) + 1
74
        data.atomic_names += [pair[0]]
75
        data.atomic_masses_iupac2016 = np.append(data.atomic_masses_iupac2016,
76
                                                 orig_iupac_mass)
77
        data.atomic_masses = data.atomic_masses_iupac2016
78
        data.atomic_masses_common = np.append(data.atomic_masses_common,
79
                                              orig_com_mass)
80
        data.covalent_radii = np.append(data.covalent_radii,
81
                                        data.covalent_radii[z_orig])
82
        data.reference_states += [data.reference_states[z_orig]]
83
        # TODO Add vdw_radii, gsmm and aml (smaller length)
84

    
85

    
86
def adapt_format(requirement, coord_file, spec_atms=tuple()):
87
    """Converts the coordinate files into a required library object type.
88

89
    Depending on the library required to use and the file type, it converts the
90
    coordinate file into a library-workable object.
91
    @param requirement: str, the library for which the conversion should be
92
    made. Accepted values: 'ase', 'rdkit'.
93
    @param coord_file: str, path to the coordinates file aiming to convert.
94
    Accepted file tyoes: 'xyz', 'mol'.
95
    @param spec_atms: List of tuples containing pairs of new/traditional
96
        chemical symbols.
97
    @return: an object the required library can work with.
98
    """
99
    import ase.io
100
    from ase.io.formats import filetype
101

    
102
    from modules.utilities import try_command
103

    
104
    req_vals = ['rdkit', 'ase']
105
    lib_err = f"The conversion to the '{requirement}' library object type" \
106
              f" has not yet been implemented"
107
    conv_info = f"Converted {coord_file} to {requirement} object type"
108

    
109
    fil_type_err = f'The {filetype(coord_file)} file format is not supported'
110

    
111
    if requirement not in req_vals:
112
        logger.error(lib_err)
113
        raise NotImplementedError(lib_err)
114

    
115
    if requirement == 'rdkit':
116
        from modules.xyz2mol import xyz2mol
117
        if filetype(coord_file) == 'xyz':  # TODO Include detection of charges.
118
            ase_atms = ase.io.read(coord_file)
119
            atomic_nums = ase_atms.get_atomic_numbers().tolist()
120
            xyz_coordinates = ase_atms.positions.tolist()
121
            rd_mol_obj = xyz2mol(atomic_nums, xyz_coordinates)
122
            logger.debug(conv_info)
123
            return Chem.AddHs(rd_mol_obj)
124
        elif filetype(coord_file) == 'mol':
125
            logger.debug(conv_info)
126
            return Chem.AddHs(Chem.MolFromMolFile(coord_file, removeHs=False))
127
        else:
128
            ase_atms = try_command(ase.io.read,
129
                                   [(ase.io.formats.UnknownFileTypeError,
130
                                     fil_type_err)],
131
                                   coord_file)
132
            atomic_nums = ase_atms.get_atomic_numbers().tolist()
133
            xyz_coordinates = ase_atms.positions.tolist()
134
            return xyz2mol(atomic_nums, xyz_coordinates)
135

    
136
    if requirement == 'ase':
137
        add_special_atoms(spec_atms)
138
        if filetype(coord_file) == 'xyz':
139
            logger.debug(conv_info)
140
            return ase.io.read(coord_file)
141
        elif filetype(coord_file) == 'mol':
142
            logger.debug(conv_info)
143
            rd_mol = Chem.AddHs(Chem.MolFromMolFile(coord_file, removeHs=False))
144
            return rdkit_mol_to_ase_atoms(rd_mol)
145
        else:
146
            return try_command(ase.io.read,
147
                               [(ase.io.formats.UnknownFileTypeError,
148
                                 fil_type_err)],
149
                               coord_file)
150

    
151

    
152
def read_coords_cp2k(file, spec_atoms=tuple()):
153
    """Reads the coordinates from a CP2K restart file and returns an ase.Atoms
154
     object.
155

156
    @param file: The file to read containing the coordinates.
157
    @param spec_atoms: List of tuples containing the pairs of chemical symbols.
158
    @return: ase.Atoms object of the coordinates in the file.
159
    """
160
    import numpy as np
161
    from ase import Atoms
162
    from pycp2k import CP2K
163

    
164
    cp2k = CP2K()
165
    cp2k.parse(file)
166
    force_eval = cp2k.CP2K_INPUT.FORCE_EVAL_list[0]
167
    raw_coords = force_eval.SUBSYS.COORD.Default_keyword
168
    symbols = [atom.split()[0] for atom in raw_coords]
169
    positions = np.array([[float(coord) for coord in atom.split()[1:]]
170
                          for atom in raw_coords])
171
    if len(spec_atoms) > 0:
172
        add_special_atoms(spec_atoms)
173
    return Atoms(symbols=symbols, positions=positions)
174

    
175

    
176
def read_coords_vasp(file, spec_atoms=tuple()):
177
    """Reads the coordinates from VASP OUTCAR and returns an ase.Atoms object.
178

179
    @param file: The file to read containing the coordinates.
180
    @param spec_atoms: List of tuples containing the pairs of chemical symbols.
181
    @return: ase.Atoms object of the coordinates in the file.
182
    """
183
    import os
184
    import ase.io
185
    if not os.path.isfile(file):
186
        err_msg = f"File {file} not found."
187
        logger.error(err_msg)
188
        raise FileNotFoundError(err_msg)
189
    if len(spec_atoms) > 0:
190
        add_special_atoms(spec_atoms)
191
    return ase.io.read(file)
192

    
193

    
194
def read_energy_cp2k(file):
195
    """Reads the CP2K output file and returns its final energy.
196

197
    @param file: The file from which the energy should be read.
198
    @return: The last energy on the out file in eV.
199
    """
200
    out_fh = open(file, 'r')
201
    energy = None
202
    for line in out_fh:
203
        if "ENERGY| Total FORCE_EVAL ( QS ) energy (a.u.):" in line:
204
            energy = float(line.strip().split(':')[1]) * 27.2113845  # Ha to eV
205
    out_fh.close()
206
    return energy
207

    
208

    
209
def collect_confs(dir_list, code, run_type, spec_atms=tuple()):
210
    """Reads the coordinates and energies of a list of finished calculations.
211

212
    Given a dockonsurf directory hierarchy: project/run_type/conf_X
213
    (run_type = ['isolated', 'screening' or 'refinement']) it reads the
214
    coordinates of each conf_X, it assigns its total energy from the calculation
215
    and assigns the conf_X label to track its origin. Finally it returns the
216
    ase.Atoms object.
217

218
    @param dir_list: List of directories where to read the coords from.
219
    @param code: the code that produced the calculation results files.
220
    @param run_type: the type of calculation (and also the name of the folder)
221
        containing the calculation subdirectories.
222
    @param spec_atms: List of tuples containing pairs of new/traditional
223
        chemical symbols.
224
    @return: list of ase.Atoms objects.
225
    """
226
    from glob import glob
227
    import os
228
    from modules.utilities import is_binary
229
    atoms_list = []
230
    for conf_dir in dir_list:
231
        conf_path = f"{run_type}/{conf_dir}/"
232
        if code == 'cp2k':
233
            ase_atms = read_coords_cp2k(glob(f"{conf_path}/*-1.restart")[0],
234
                                        spec_atms)
235
            # Assign energy
236
            for fil in os.listdir(conf_path):
237
                if is_binary(conf_path+fil):
238
                    continue
239
                conf_energy = read_energy_cp2k(conf_path+fil)
240
                if conf_energy is not None:
241
                    ase_atms.info["energy"] = conf_energy
242
                    break
243
            ase_atms.info[run_type[:3]] = conf_dir
244
            atoms_list.append(ase_atms)
245
        elif code == 'vasp':
246
            ase_atms = read_coords_vasp(f"{conf_path}/OUTCAR", spec_atms)
247
            ase_atms.info["energy"] = ase_atms.get_total_energy() * 27.2113845
248
            ase_atms.info[run_type[:3]] = conf_dir
249
            atoms_list.append(ase_atms)
250
        else:
251
            err_msg = f"Collect coordinates not implemented for '{code}'."
252
            logger.error(err_msg)
253
            raise NotImplementedError(err_msg)
254
    return atoms_list
255