Statistiques
| Branche: | Tag: | Révision :

dockonsurf / modules / formats.py @ cdc1edbe

Historique | Voir | Annoter | Télécharger (11,32 ko)

1
"""Module for the interconversion between different kinds of atomic data.
2

3
functions:
4
confs_to_mol_list: Converts the conformers inside a rdkit.Mol object to a list
5
    of separate rdkit.Mol objects.
6
rdkit_mol_to_ase_atoms: Converts a rdkit.Mol object into ase.Atoms object.
7
add_special_atoms: Allows ase to use custom elements with symbols not in the
8
    periodic table.
9
adapt_format: Converts the coordinate files into a required library object type.
10
read_coords_cp2k: Reads the coordinates from a CP2K restart file and returns an
11
    ase.Atoms object.
12
collect_coords: Directs the reading of coordinates on a set of subdirectories.
13
read_energy_cp2k: Reads the CP2K out file and returns its final energy.
14
collect_energies: Directs the reading of energies on a set of subdirectories.
15
"""
16

    
17
import logging
18

    
19
import rdkit.Chem.AllChem as Chem
20

    
21
logger = logging.getLogger('DockOnSurf')
22

    
23

    
24
def confs_to_mol_list(mol: Chem.rdchem.Mol, idx_lst=None):
25
    """Converts the conformers inside a rdkit mol object to a list of
26
    separate mol objects.
27

28
    @param mol: rdkit mol object containing at least one conformer.
29
    @param idx_lst: list of conformer indices to be considered. If not passed,
30
        all conformers are considered.
31
    @return: list of separate mol objects.
32
    """
33
    if idx_lst is None:
34
        idx_lst = list(range(mol.GetNumConformers()))
35
    return [Chem.MolFromMolBlock(
36
        Chem.MolToMolBlock(mol, confId=int(idx)).replace("3D", ""),
37
        removeHs=False) for idx in idx_lst]
38

    
39

    
40
def rdkit_mol_to_ase_atoms(mol: Chem.rdchem.Mol):
41
    """Converts a rdkit mol object into ase Atoms object.
42
    @param mol: rdkit mol object containing only one conformer.
43
    @return ase.Atoms: ase Atoms object with the same coordinates.
44
    """
45
    from ase import Atoms
46
    if mol.GetNumConformers() > 1:
47
        logger.warning('A mol object with multiple conformers is parsed, '
48
                       'converting to Atoms only the first conformer.')
49
    symbols = [atm.GetSymbol() for atm in mol.GetAtoms()]
50
    positions = mol.GetConformer(0).GetPositions()
51
    return Atoms(symbols=symbols, positions=positions)
52

    
53

    
54
def add_special_atoms(symbol_pairs):
55
    """Allows ase to use custom elements with symbols not in the periodic table.
56

57
    This function adds new chemical elements to be used by ase. Every new custom
58
    element must have a traditional (present in the periodic table) partner
59
    from which to obtain all its properties.
60
    @param symbol_pairs: List of tuples containing pairs of chemical symbols.
61
        Every tuple contains a pair of chemical symbols, the first label must be
62
        the label of the custom element and the second one the symbol of the
63
        reference one (traditional present on the periodic table).
64
    @return:
65
    """  # TODO Enable special atoms for rdkit
66
    import numpy as np
67
    from ase import data
68
    for i, pair in enumerate(symbol_pairs):
69
        data.chemical_symbols += [pair[0]]
70
        z_orig = data.atomic_numbers[pair[1]]
71
        orig_iupac_mass = data.atomic_masses_iupac2016[z_orig]
72
        orig_com_mass = data.atomic_masses_common[z_orig]
73
        data.atomic_numbers[pair[0]] = max(data.atomic_numbers.values()) + 1
74
        data.atomic_names += [pair[0]]
75
        data.atomic_masses_iupac2016 = np.append(data.atomic_masses_iupac2016,
76
                                                 orig_iupac_mass)
77
        data.atomic_masses = data.atomic_masses_iupac2016
78
        data.atomic_masses_common = np.append(data.atomic_masses_common,
79
                                              orig_com_mass)
80
        data.covalent_radii = np.append(data.covalent_radii,
81
                                        data.covalent_radii[z_orig])
82
        data.reference_states += [data.reference_states[z_orig]]
83
        # TODO Add vdw_radii, gsmm and aml (smaller length)
84

    
85

    
86
def adapt_format(requirement, coord_file, spec_atms=tuple()):
87
    """Converts the coordinate files into a required library object type.
88

89
    Depending on the library required to use and the file type, it converts the
90
    coordinate file into a library-workable object.
91
    @param requirement: str, the library for which the conversion should be
92
    made. Accepted values: 'ase', 'rdkit'.
93
    @param coord_file: str, path to the coordinates file aiming to convert.
94
    Accepted file tyoes: 'xyz', 'mol'.
95
    @param spec_atms: List of tuples containing pairs of new/traditional
96
        chemical symbols.
97
    @return: an object the required library can work with.
98
    """
99
    import ase.io
100
    from ase.io.formats import filetype
101

    
102
    from modules.utilities import try_command
103

    
104
    req_vals = ['rdkit', 'ase']
105
    lib_err = f"The conversion to the '{requirement}' library object type" \
106
              f" has not yet been implemented"
107
    conv_info = f"Converted {coord_file} to {requirement} object type"
108

    
109
    fil_type_err = f'The {filetype(coord_file)} file format is not supported'
110

    
111
    if requirement not in req_vals:
112
        logger.error(lib_err)
113
        raise NotImplementedError(lib_err)
114

    
115
    if requirement == 'rdkit':
116
        from modules.xyz2mol import xyz2mol
117
        if filetype(coord_file) == 'xyz':  # TODO Include detection of charges.
118
            ase_atms = ase.io.read(coord_file)
119
            atomic_nums = ase_atms.get_atomic_numbers().tolist()
120
            xyz_coordinates = ase_atms.positions.tolist()
121
            rd_mol_obj = xyz2mol(atomic_nums, xyz_coordinates)
122
            logger.debug(conv_info)
123
            return Chem.AddHs(rd_mol_obj)
124
        elif filetype(coord_file) == 'mol':
125
            logger.debug(conv_info)
126
            return Chem.AddHs(Chem.MolFromMolFile(coord_file, removeHs=False))
127
        else:
128
            ase_atms = try_command(ase.io.read,
129
                                   [(ase.io.formats.UnknownFileTypeError,
130
                                     fil_type_err)],
131
                                   coord_file)
132
            atomic_nums = ase_atms.get_atomic_numbers().tolist()
133
            xyz_coordinates = ase_atms.positions.tolist()
134
            return xyz2mol(atomic_nums, xyz_coordinates)
135

    
136
    if requirement == 'ase':
137
        add_special_atoms(spec_atms)
138
        if filetype(coord_file) == 'xyz':
139
            logger.debug(conv_info)
140
            return ase.io.read(coord_file)
141
        elif filetype(coord_file) == 'mol':
142
            logger.debug(conv_info)
143
            rd_mol = Chem.AddHs(Chem.MolFromMolFile(coord_file, removeHs=False))
144
            return rdkit_mol_to_ase_atoms(rd_mol)
145
        else:
146
            return try_command(ase.io.read,
147
                               [(ase.io.formats.UnknownFileTypeError,
148
                                 fil_type_err)],
149
                               coord_file)
150

    
151

    
152
def read_coords_cp2k(file, spec_atoms=tuple()):
153
    """Reads the coordinates from a CP2K restart file and returns an ase.Atoms
154
     object.
155

156
    @param file: The file to read containing the coordinates.
157
    @param spec_atoms: List of tuples containing the pairs of chemical symbols.
158
    @return: ase.Atoms object of the coordinates in the file.
159
    """
160
    import numpy as np
161
    from ase import Atoms
162
    from pycp2k import CP2K
163

    
164
    cp2k = CP2K()
165
    cp2k.parse(file)
166
    force_eval = cp2k.CP2K_INPUT.FORCE_EVAL_list[0]
167
    raw_coords = force_eval.SUBSYS.COORD.Default_keyword
168
    symbols = [atom.split()[0] for atom in raw_coords]
169
    positions = np.array([atom.split()[1:] for atom in raw_coords])
170
    if len(spec_atoms) > 0:
171
        add_special_atoms(spec_atoms)
172
    return Atoms(symbols=symbols, positions=positions)
173

    
174

    
175
def read_coords_vasp(file, spec_atoms=tuple()):
176
    """Reads the coordinates from VASP OUTCAR and returns an ase.Atoms object.
177

178
    @param file: The file to read containing the coordinates.
179
    @param spec_atoms: List of tuples containing the pairs of chemical symbols.
180
    @return: ase.Atoms object of the coordinates in the file.
181
    """
182
    import os
183
    import ase.io
184
    if not os.path.isfile(file):
185
        err_msg = f"File {file} not found."
186
        logger.error(err_msg)
187
        raise FileNotFoundError(err_msg)
188
    if len(spec_atoms) > 0:
189
        add_special_atoms(spec_atoms)
190
    return ase.io.read(file)
191

    
192

    
193
def collect_coords(conf_list, code, run_type, spec_atms=tuple()):
194
    """Directs the reading of coordinates on a set of subdirectories.
195

196
    Given a dockonsurf directory hierarchy: project/run_type/conf_X
197
    (run_type = ['isolated', 'screening' or 'refinement']) with finished
198
    calculations produced by a given code, stored in every conf_X subdirectory,
199
    it collects the coordinates of the specified conf_X subdirectories in a
200
    single run_type by calling the adequate function (depending on the code) and
201
    returns a list of ase.Atoms objects.
202

203
    @param conf_list: List of directories where to read the coords from.
204
    @param code: the code that produced the calculation results files.
205
    @param run_type: the type of calculation (and also the name of the folder)
206
                     containing the calculation subdirectories.
207
    @param spec_atms: List of tuples containing pairs of new/traditional
208
        chemical symbols.
209
    @return: list of ase.Atoms objects.
210
    """
211
    from glob import glob
212
    atoms_list = []
213
    for conf in conf_list:
214
        if code == 'cp2k':
215
            atoms_list.append(read_coords_cp2k(glob(f"{run_type}/{conf}/*-1"
216
                                                    f".restart")[0], spec_atms))
217
        elif code == 'vasp':
218
            atoms_list.append(read_coords_vasp(f"{run_type}/{conf}/OUTCAR",
219
                                               spec_atms))
220
        else:
221
            err_msg = f"Collect coordinates not implemented for '{code}'."
222
            logger.error(err_msg)
223
            raise NotImplementedError(err_msg)
224
    return atoms_list
225

    
226

    
227
def read_energy_cp2k(file):
228
    """Reads the CP2K out file and returns its final energy.
229

230
    @param file: The file from which the energy should be read.
231
    @return: The last energy on the out file.
232
    """
233
    out_fh = open(file, 'r')
234
    energy = None
235
    for line in out_fh:
236
        if "ENERGY| Total FORCE_EVAL ( QS ) energy (a.u.):" in line:
237
            energy = float(line.strip().split(':')[1]) * 27.2113845  # Ha to eV
238
    out_fh.close()
239
    return energy
240

    
241

    
242
def collect_energies(conf_list, code, run_type):
243
    """Directs the reading of energies on a set of subdirectories.
244

245
    Given a dockonsurf directory hierarchy: project/run_type/conf_X
246
    (run_type = ['isolated', 'screening' or 'refinement']) with finished
247
    calculations produced by a given code, stored in every conf_X subdirectory,
248
    it collects the energies of the specified conf_X subdirectories in a
249
    single run_type by calling the adequate function (depending on the code) and
250
    returns a list of energies.
251

252
    @param conf_list: List of directories where to read the energy.
253
    @param code: The code that produced the calculation output files.
254
    @param run_type: The type of calculation (and also the name of the folder)
255
                     containing the calculation subdirectories.
256
    @return: List of energies
257
    """
258
    from glob import glob
259
    import numpy as np
260

    
261
    energies = []
262
    for conf in conf_list:
263
        if code == 'cp2k':
264
            energies.append(read_energy_cp2k(
265
                glob(f"{run_type}/{conf}/*.out")[0]))
266
        else:
267
            err_msg = f"Collect energies not implemented for '{code}'."
268
            logger.error(err_msg)
269
            raise NotImplementedError(err_msg)
270

    
271
    if len(energies) == 0:
272
        err = f"No results found on {run_type}"
273
        logger.error(err)
274
        raise FileNotFoundError(err)
275

    
276
    return np.array(energies)