Statistiques
| Branche: | Tag: | Révision :

dockonsurf / modules / formats.py @ 748a6036

Historique | Voir | Annoter | Télécharger (10,26 ko)

1
"""Module for the interconversion between different kinds of atomic data.
2

3
functions:
4
confs_to_mol_list: Converts the conformers inside a rdkit.Mol object to a list
5
    of separate rdkit.Mol objects.
6
rdkit_mol_to_ase_atoms: Converts a rdkit.Mol object into ase.Atoms object.
7
add_special_atoms: Allows ase to use custom elements with symbols not in the
8
    periodic table.
9
adapt_format: Converts the coordinate files into a required library object type.
10
read_coords_cp2k: Reads the coordinates from a CP2K restart file and returns an
11
    ase.Atoms object.
12
collect_coords: Directs the reading of coordinates on a set of subdirectories.
13
read_energy_cp2k: Reads the CP2K out file and returns its final energy.
14
collect_energies: Directs the reading of energies on a set of subdirectories.
15
"""
16

    
17
import logging
18

    
19
import rdkit.Chem.AllChem as Chem
20

    
21
logger = logging.getLogger('DockOnSurf')
22

    
23

    
24
def confs_to_mol_list(mol: Chem.rdchem.Mol, idx_lst=None):
25
    """Converts the conformers inside a rdkit mol object to a list of
26
    separate mol objects.
27

28
    @param mol: rdkit mol object containing at least one conformer.
29
    @param idx_lst: list of conformer indices to be considered. If not passed,
30
        all conformers are considered.
31
    @return: list of separate mol objects.
32
    """
33
    if idx_lst is None:
34
        idx_lst = list(range(mol.GetNumConformers()))
35
    return [Chem.MolFromMolBlock(
36
        Chem.MolToMolBlock(mol, confId=int(idx)).replace("3D", ""),
37
        removeHs=False) for idx in idx_lst]
38

    
39

    
40
def rdkit_mol_to_ase_atoms(mol: Chem.rdchem.Mol):
41
    """Converts a rdkit mol object into ase Atoms object.
42
    @param mol: rdkit mol object containing only one conformer.
43
    @return ase.Atoms: ase Atoms object with the same coordinates.
44
    """
45
    from ase import Atoms
46
    if mol.GetNumConformers() > 1:
47
        logger.warning('A mol object with multiple conformers is parsed, '
48
                       'converting to Atoms only the first conformer.')
49
    symbols = [atm.GetSymbol() for atm in mol.GetAtoms()]
50
    positions = mol.GetConformer(0).GetPositions()
51
    return Atoms(symbols=symbols, positions=positions)
52

    
53

    
54
def add_special_atoms(symbol_pairs):
55
    """Allows ase to use custom elements with symbols not in the periodic table.
56

57
    This function adds new chemical elements to be used by ase. Every new custom
58
    element must have a traditional (present in the periodic table) partner
59
    from which to obtain all its properties.
60
    @param symbol_pairs: List of tuples containing pairs of chemical symbols.
61
        Every tuple contains a pair of chemical symbols, the first label must be
62
        the label of the custom element and the second one the symbol of the
63
        reference one (traditional present on the periodic table).
64
    @return:
65
    """  # TODO Enable special atoms for rdkit
66
    import numpy as np
67
    from ase import data
68
    for i, pair in enumerate(symbol_pairs):
69
        data.chemical_symbols += [pair[0]]
70
        z_orig = data.atomic_numbers[pair[1]]
71
        orig_iupac_mass = data.atomic_masses_iupac2016[z_orig]
72
        orig_com_mass = data.atomic_masses_common[z_orig]
73
        data.atomic_numbers[pair[0]] = max(data.atomic_numbers.values()) + 1
74
        data.atomic_names += [pair[0]]
75
        data.atomic_masses_iupac2016 = np.append(data.atomic_masses_iupac2016,
76
                                                 orig_iupac_mass)
77
        data.atomic_masses = data.atomic_masses_iupac2016
78
        data.atomic_masses_common = np.append(data.atomic_masses_common,
79
                                              orig_com_mass)
80
        data.covalent_radii = np.append(data.covalent_radii,
81
                                        data.covalent_radii[z_orig])
82
        data.reference_states += [data.reference_states[z_orig]]
83
        # TODO Add vdw_radii, gsmm and aml (smaller length)
84

    
85

    
86
def adapt_format(requirement, coord_file, spec_atms=tuple()):
87
    """Converts the coordinate files into a required library object type.
88

89
    Depending on the library required to use and the file type, it converts the
90
    coordinate file into a library-workable object.
91
    @param requirement: str, the library for which the conversion should be
92
    made. Accepted values: 'ase', 'rdkit'.
93
    @param coord_file: str, path to the coordinates file aiming to convert.
94
    Accepted file tyoes: 'xyz', 'mol'.
95
    @param spec_atms: List of tuples containing pairs of new/traditional
96
        chemical symbols.
97
    @return: an object the required library can work with.
98
    """
99
    import ase.io
100
    from ase.io.formats import filetype
101

    
102
    from modules.utilities import try_command
103

    
104
    req_vals = ['rdkit', 'ase']
105
    lib_err = f"The conversion to the '{requirement}' library object type" \
106
              f" has not yet been implemented"
107
    conv_info = f"Converted {coord_file} to {requirement} object type"
108

    
109
    fil_type_err = f'The {filetype(coord_file)} file format is not supported'
110

    
111
    if requirement not in req_vals:
112
        logger.error(lib_err)
113
        raise NotImplementedError(lib_err)
114

    
115
    if requirement == 'rdkit':
116
        from modules.xyz2mol import xyz2mol
117
        if filetype(coord_file) == 'xyz':  # TODO Include detection of charges.
118
            ase_atms = ase.io.read(coord_file)
119
            atomic_nums = ase_atms.get_atomic_numbers().tolist()
120
            xyz_coordinates = ase_atms.positions.tolist()
121
            rd_mol_obj = xyz2mol(atomic_nums, xyz_coordinates)
122
            logger.debug(conv_info)
123
            return Chem.AddHs(rd_mol_obj)
124
        elif filetype(coord_file) == 'mol':
125
            logger.debug(conv_info)
126
            return Chem.AddHs(Chem.MolFromMolFile(coord_file, removeHs=False))
127
        else:
128
            ase_atms = try_command(ase.io.read,
129
                                   [(ase.io.formats.UnknownFileTypeError,
130
                                     fil_type_err)],
131
                                   coord_file)
132
            atomic_nums = ase_atms.get_atomic_numbers().tolist()
133
            xyz_coordinates = ase_atms.positions.tolist()
134
            return xyz2mol(atomic_nums, xyz_coordinates)
135

    
136
    if requirement == 'ase':
137
        add_special_atoms(spec_atms)
138
        if filetype(coord_file) == 'xyz':
139
            logger.debug(conv_info)
140
            return ase.io.read(coord_file)
141
        elif filetype(coord_file) == 'mol':
142
            logger.debug(conv_info)
143
            rd_mol = Chem.AddHs(Chem.MolFromMolFile(coord_file, removeHs=False))
144
            return rdkit_mol_to_ase_atoms(rd_mol)
145
        else:
146
            return try_command(ase.io.read,
147
                               [(ase.io.formats.UnknownFileTypeError,
148
                                 fil_type_err)],
149
                               coord_file)
150

    
151

    
152
def read_coords_cp2k(file, spec_atoms=tuple()):
153
    """Reads the coordinates from a CP2K restart file and returns an ase.Atoms
154
     object.
155

156
    @param file: The file to read containing the coordinates.
157
    @param spec_atoms: List of tuples containing the pairs of chemical symbols.
158
    @return: ase.Atoms object of the coordinates in the file.
159
    """
160
    import numpy as np
161
    from ase import Atoms
162
    from pycp2k import CP2K
163

    
164
    cp2k = CP2K()
165
    cp2k.parse(file)
166
    force_eval = cp2k.CP2K_INPUT.FORCE_EVAL_list[0]
167
    raw_coords = force_eval.SUBSYS.COORD.Default_keyword
168
    symbols = [atom.split()[0] for atom in raw_coords]
169
    positions = np.array([atom.split()[1:] for atom in raw_coords])
170
    if len(spec_atoms) > 0:
171
        add_special_atoms(spec_atoms)
172
    return Atoms(symbols=symbols, positions=positions)
173

    
174

    
175
def collect_coords(conf_list, code, run_type, spec_atms=tuple()):
176
    """Directs the reading of coordinates on a set of subdirectories.
177

178
    Given a dockonsurf directory hierarchy: project/run_type/conf_X
179
    (run_type = ['isolated', 'screening' or 'refinement']) with finished
180
    calculations produced by a given code, stored in every conf_X subdirectory,
181
    it collects the coordinates of the specified conf_X subdirectories in a
182
    single run_type by calling the adequate function (depending on the code) and
183
    returns a list of ase.Atoms objects.
184

185
    @param conf_list: List of directories where to read the coords from.
186
    @param code: the code that produced the calculation results files.
187
    @param run_type: the type of calculation (and also the name of the folder)
188
                     containing the calculation subdirectories.
189
    @param spec_atms: List of tuples containing pairs of new/traditional
190
        chemical symbols.
191
    @return: list of ase.Atoms objects.
192
    """
193
    from glob import glob
194
    atoms_list = []
195
    for conf in conf_list:
196
        if code == 'cp2k':
197
            atoms_list.append(read_coords_cp2k(glob(f"{run_type}/{conf}/*-1"
198
                                                    f".restart")[0], spec_atms))
199
        # elif code == 'vasp'
200
    return atoms_list
201

    
202

    
203
def read_energy_cp2k(file):
204
    """Reads the CP2K out file and returns its final energy.
205

206
    @param file: The file from which the energy should be read.
207
    @return: The last energy on the out file.
208
    """
209
    out_fh = open(file, 'r')
210
    energy = None
211
    for line in out_fh:
212
        if "ENERGY| Total FORCE_EVAL ( QS ) energy (a.u.):" in line:
213
            energy = float(line.strip().split(':')[1]) * 27.2113845  # Ha to eV
214
    out_fh.close()
215
    return energy
216

    
217

    
218
def collect_energies(conf_list, code, run_type):
219
    """Directs the reading of energies on a set of subdirectories.
220

221
    Given a dockonsurf directory hierarchy: project/run_type/conf_X
222
    (run_type = ['isolated', 'screening' or 'refinement']) with finished
223
    calculations produced by a given code, stored in every conf_X subdirectory,
224
    it collects the energies of the specified conf_X subdirectories in a
225
    single run_type by calling the adequate function (depending on the code) and
226
    returns a list of energies.
227

228
    @param conf_list: List of directories where to read the energy.
229
    @param code: The code that produced the calculation output files.
230
    @param run_type: The type of calculation (and also the name of the folder)
231
                     containing the calculation subdirectories.
232
    @return: List of energies
233
    """
234
    from glob import glob
235
    import numpy as np
236

    
237
    energies = []
238
    for conf in conf_list:
239
        if code == 'cp2k':
240
            energies.append(read_energy_cp2k(
241
                glob(f"{run_type}/{conf}/*.out")[0]))
242

    
243
    if len(energies) == 0:
244
        err = f"No results found on {run_type}"
245
        logger.error(err)
246
        raise FileNotFoundError(err)
247

    
248
    return np.array(energies)