Statistiques
| Branche: | Tag: | Révision :

dockonsurf / modules / formats.py @ 4e82c425

Historique | Voir | Annoter | Télécharger (10,57 ko)

1
"""Module for the conversion between different kinds of atomic data.
2

3
functions:
4
confs_to_mol_list: Converts the conformers inside a rdkit.Mol object to a list
5
    of separate rdkit.Mol objects.
6
rdkit_mol_to_ase_atoms: Converts a rdkit.Mol object into ase.Atoms object.
7
add_special_atoms: Allows ase to use custom elements with symbols not in the
8
    periodic table.
9
adapt_format: Converts the coordinate files into a required library object type.
10
read_coords_cp2k: Reads the coordinates from a CP2K restart file and returns an
11
    ase.Atoms object.
12
read_coords_vasp: Reads the coordinates from VASP OUTCAR file and returns an
13
    ase.Atoms object.
14
read_energy_cp2k: Reads the CP2K out file and returns its final energy.
15
collect_confs: Reads the coordinates and energies of a list of finished
16
    calculations.
17
"""
18

    
19
import logging
20

    
21
import rdkit.Chem.AllChem as Chem
22

    
23
logger = logging.getLogger('DockOnSurf')
24

    
25

    
26
def confs_to_mol_list(mol: Chem.rdchem.Mol, idx_lst=None):
27
    """Converts the conformers inside a rdkit mol object to a list of
28
    separate mol objects.
29

30
    @param mol: rdkit mol object containing at least one conformer.
31
    @param idx_lst: list of conformer indices to be considered. If not passed,
32
        all conformers are considered.
33
    @return: list of separate mol objects.
34
    """
35
    if idx_lst is None:
36
        idx_lst = list(range(mol.GetNumConformers()))
37
    return [Chem.MolFromMolBlock(
38
        Chem.MolToMolBlock(mol, confId=int(idx)).replace("3D", ""),
39
        removeHs=False) for idx in idx_lst]
40

    
41

    
42
def rdkit_mol_to_ase_atoms(mol: Chem.rdchem.Mol):
43
    """Converts a rdkit mol object into ase Atoms object.
44
    @param mol: rdkit mol object containing only one conformer.
45
    @return ase.Atoms: ase Atoms object with the same coordinates.
46
    """
47
    from ase import Atoms
48
    if mol.GetNumConformers() > 1:
49
        logger.warning('A mol object with multiple conformers is parsed, '
50
                       'converting to Atoms only the first conformer.')
51
    symbols = [atm.GetSymbol() for atm in mol.GetAtoms()]
52
    positions = mol.GetConformer(0).GetPositions()
53
    return Atoms(symbols=symbols, positions=positions)
54

    
55

    
56
def add_special_atoms(symbol_pairs):
57
    """Allows ase to use custom elements with symbols not in the periodic table.
58

59
    This function adds new chemical elements to be used by ase. Every new custom
60
    element must have a traditional (present in the periodic table) partner
61
    from which to obtain all its properties.
62
    @param symbol_pairs: List of tuples containing pairs of chemical symbols.
63
        Every tuple contains a pair of chemical symbols, the first label must be
64
        the label of the custom element and the second one the symbol of the
65
        reference one (traditional present on the periodic table).
66
    @return:
67
    """  # TODO Enable special atoms for rdkit
68
    import numpy as np
69
    from ase import data
70
    for i, pair in enumerate(symbol_pairs):
71
        data.chemical_symbols += [pair[0]]
72
        z_orig = data.atomic_numbers[pair[1]]
73
        orig_iupac_mass = data.atomic_masses_iupac2016[z_orig]
74
        orig_com_mass = data.atomic_masses_common[z_orig]
75
        data.atomic_numbers[pair[0]] = max(data.atomic_numbers.values()) + 1
76
        data.atomic_names += [pair[0]]
77
        data.atomic_masses_iupac2016 = np.append(data.atomic_masses_iupac2016,
78
                                                 orig_iupac_mass)
79
        data.atomic_masses = data.atomic_masses_iupac2016
80
        data.atomic_masses_common = np.append(data.atomic_masses_common,
81
                                              orig_com_mass)
82
        data.covalent_radii = np.append(data.covalent_radii,
83
                                        data.covalent_radii[z_orig])
84
        data.reference_states += [data.reference_states[z_orig]]
85
        # TODO Add vdw_radii, gsmm and aml (smaller length)
86

    
87

    
88
def adapt_format(requirement, coord_file, spec_atms=tuple()):
89
    """Converts the coordinate files into a required library object type.
90

91
    Depending on the library required to use and the file type, it converts the
92
    coordinate file into a library-workable object.
93
    @param requirement: str, the library for which the conversion should be
94
        made. Accepted values: 'ase', 'rdkit'.
95
    @param coord_file: str, path to the coordinates file aiming to convert.
96
        Accepted file formats: all file formats readable by ase.
97
    @param spec_atms: List of tuples containing pairs of new/traditional
98
        chemical symbols.
99
    @return: an object the required library can work with.
100
    """
101
    import ase.io
102
    from ase.io.formats import filetype
103

    
104
    from modules.utilities import try_command
105

    
106
    req_vals = ['rdkit', 'ase']
107
    lib_err = f"The conversion to the '{requirement}' library object type" \
108
              f" has not yet been implemented"
109
    conv_info = f"Converted {coord_file} to {requirement} object type"
110

    
111
    fil_type_err = f'The {filetype(coord_file)} file format is not supported'
112

    
113
    if requirement not in req_vals:
114
        logger.error(lib_err)
115
        raise NotImplementedError(lib_err)
116

    
117
    if requirement == 'rdkit':
118
        from modules.xyz2mol import xyz2mol
119
        if filetype(coord_file) == 'xyz':  # TODO Include detection of charges.
120
            ase_atms = ase.io.read(coord_file)
121
            atomic_nums = ase_atms.get_atomic_numbers().tolist()
122
            xyz_coordinates = ase_atms.positions.tolist()
123
            rd_mol_obj = xyz2mol(atomic_nums, xyz_coordinates)
124
            logger.debug(conv_info)
125
            return Chem.AddHs(rd_mol_obj)
126
        elif filetype(coord_file) == 'mol':
127
            logger.debug(conv_info)
128
            return Chem.AddHs(Chem.MolFromMolFile(coord_file, removeHs=False))
129
        else:
130
            ase_atms = try_command(ase.io.read,
131
                                   [(ase.io.formats.UnknownFileTypeError,
132
                                     fil_type_err)],
133
                                   coord_file)
134
            atomic_nums = ase_atms.get_atomic_numbers().tolist()
135
            xyz_coordinates = ase_atms.positions.tolist()
136
            return xyz2mol(atomic_nums, xyz_coordinates)
137

    
138
    if requirement == 'ase':
139
        add_special_atoms(spec_atms)
140
        if filetype(coord_file) == 'xyz':
141
            logger.debug(conv_info)
142
            return ase.io.read(coord_file)
143
        elif filetype(coord_file) == 'mol':
144
            logger.debug(conv_info)
145
            rd_mol = Chem.AddHs(Chem.MolFromMolFile(coord_file, removeHs=False))
146
            return rdkit_mol_to_ase_atoms(rd_mol)
147
        else:
148
            return try_command(ase.io.read,
149
                               [(ase.io.formats.UnknownFileTypeError,
150
                                 fil_type_err)],
151
                               coord_file)
152

    
153

    
154
def read_coords_cp2k(file, spec_atoms=tuple()):
155
    """Reads the coordinates from a CP2K restart file and returns an ase.Atoms
156
     object.
157

158
    @param file: The file to read containing the coordinates.
159
    @param spec_atoms: List of tuples containing the pairs of chemical symbols.
160
    @return: ase.Atoms object of the coordinates in the file.
161
    """
162
    import numpy as np
163
    from ase import Atoms
164
    from pycp2k import CP2K
165

    
166
    cp2k = CP2K()
167
    cp2k.parse(file)
168
    force_eval = cp2k.CP2K_INPUT.FORCE_EVAL_list[0]
169
    raw_coords = force_eval.SUBSYS.COORD.Default_keyword
170
    symbols = [atom.split()[0] for atom in raw_coords]
171
    positions = np.array([[float(coord) for coord in atom.split()[1:]]
172
                          for atom in raw_coords])
173
    if len(spec_atoms) > 0:
174
        add_special_atoms(spec_atoms)
175
    return Atoms(symbols=symbols, positions=positions)
176

    
177

    
178
def read_coords_vasp(file, spec_atoms=tuple()):
179
    """Reads the coordinates from VASP OUTCAR and returns an ase.Atoms object.
180

181
    @param file: The file to read containing the coordinates.
182
    @param spec_atoms: List of tuples containing the pairs of chemical symbols.
183
    @return: ase.Atoms object of the coordinates in the file.
184
    """
185
    import os
186
    import ase.io
187
    if not os.path.isfile(file):
188
        err_msg = f"File {file} not found."
189
        logger.error(err_msg)
190
        raise FileNotFoundError(err_msg)
191
    if len(spec_atoms) > 0:
192
        add_special_atoms(spec_atoms)
193
    return ase.io.read(file)
194

    
195

    
196
def read_energy_cp2k(file):
197
    """Reads the CP2K output file and returns its final energy.
198

199
    @param file: The file from which the energy should be read.
200
    @return: The last energy on the out file in eV.
201
    """
202
    out_fh = open(file, 'r')
203
    energy = None
204
    for line in out_fh:
205
        if "ENERGY| Total FORCE_EVAL ( QS ) energy (a.u.):" in line:
206
            energy = float(line.strip().split(':')[1]) * 27.2113845  # Ha to eV
207
    out_fh.close()
208
    return energy
209

    
210

    
211
def collect_confs(dir_list, code, run_type, spec_atms=tuple()):
212
    """Reads the coordinates and energies of a list of finished calculations.
213

214
    Given a dockonsurf directory hierarchy: project/run_type/conf_X
215
    (run_type = ['isolated', 'screening' or 'refinement']) it reads the
216
    coordinates of each conf_X, it assigns its total energy from the calculation
217
    and assigns the conf_X label to track its origin. Finally it returns the
218
    ase.Atoms object.
219

220
    @param dir_list: List of directories where to read the coords from.
221
    @param code: the code that produced the calculation results files.
222
    @param run_type: the type of calculation (and also the name of the folder)
223
        containing the calculation subdirectories.
224
    @param spec_atms: List of tuples containing pairs of new/traditional
225
        chemical symbols.
226
    @return: list of ase.Atoms objects.
227
    """
228
    from glob import glob
229
    import os
230
    from modules.utilities import is_binary
231
    atoms_list = []
232
    for conf_dir in dir_list:
233
        conf_path = f"{run_type}/{conf_dir}/"
234
        if code == 'cp2k':
235
            ase_atms = read_coords_cp2k(glob(f"{conf_path}/*-1.restart")[0],
236
                                        spec_atms)
237
            # Assign energy
238
            for fil in os.listdir(conf_path):
239
                if is_binary(conf_path + fil):
240
                    continue
241
                conf_energy = read_energy_cp2k(conf_path + fil)
242
                if conf_energy is not None:
243
                    ase_atms.info["energy"] = conf_energy
244
                    break
245
            ase_atms.info[run_type[:3]] = conf_dir
246
            atoms_list.append(ase_atms)
247
        elif code == 'vasp':
248
            ase_atms = read_coords_vasp(f"{conf_path}/OUTCAR", spec_atms)
249
            ase_atms.info["energy"] = ase_atms.get_total_energy() * 27.2113845
250
            ase_atms.info[run_type[:3]] = conf_dir
251
            atoms_list.append(ase_atms)
252
        else:
253
            err_msg = f"Collect coordinates not implemented for '{code}'."
254
            logger.error(err_msg)
255
            raise NotImplementedError(err_msg)
256
    return atoms_list