dockonsurf / modules / formats.py @ cf980c86
Historique | Voir | Annoter | Télécharger (10,57 ko)
1 |
"""Module for the conversion between different kinds of atomic data.
|
---|---|
2 |
|
3 |
functions:
|
4 |
confs_to_mol_list: Converts the conformers inside a rdkit.Mol object to a list
|
5 |
of separate rdkit.Mol objects.
|
6 |
rdkit_mol_to_ase_atoms: Converts a rdkit.Mol object into ase.Atoms object.
|
7 |
add_special_atoms: Allows ase to use custom elements with symbols not in the
|
8 |
periodic table.
|
9 |
adapt_format: Converts the coordinate files into a required library object type.
|
10 |
read_coords_cp2k: Reads the coordinates from a CP2K restart file and returns an
|
11 |
ase.Atoms object.
|
12 |
read_coords_vasp: Reads the coordinates from VASP OUTCAR file and returns an
|
13 |
ase.Atoms object.
|
14 |
read_energy_cp2k: Reads the CP2K out file and returns its final energy.
|
15 |
collect_confs: Reads the coordinates and energies of a list of finished
|
16 |
calculations.
|
17 |
"""
|
18 |
|
19 |
import logging |
20 |
|
21 |
import rdkit.Chem.AllChem as Chem |
22 |
|
23 |
logger = logging.getLogger('DockOnSurf')
|
24 |
|
25 |
|
26 |
def confs_to_mol_list(mol: Chem.rdchem.Mol, idx_lst=None): |
27 |
"""Converts the conformers inside a rdkit mol object to a list of
|
28 |
separate mol objects.
|
29 |
|
30 |
@param mol: rdkit mol object containing at least one conformer.
|
31 |
@param idx_lst: list of conformer indices to be considered. If not passed,
|
32 |
all conformers are considered.
|
33 |
@return: list of separate mol objects.
|
34 |
"""
|
35 |
if idx_lst is None: |
36 |
idx_lst = list(range(mol.GetNumConformers())) |
37 |
return [Chem.MolFromMolBlock(
|
38 |
Chem.MolToMolBlock(mol, confId=int(idx)).replace("3D", ""), |
39 |
removeHs=False) for idx in idx_lst] |
40 |
|
41 |
|
42 |
def rdkit_mol_to_ase_atoms(mol: Chem.rdchem.Mol): |
43 |
"""Converts a rdkit mol object into ase Atoms object.
|
44 |
@param mol: rdkit mol object containing only one conformer.
|
45 |
@return ase.Atoms: ase Atoms object with the same coordinates.
|
46 |
"""
|
47 |
from ase import Atoms |
48 |
if mol.GetNumConformers() > 1: |
49 |
logger.warning('A mol object with multiple conformers is parsed, '
|
50 |
'converting to Atoms only the first conformer.')
|
51 |
symbols = [atm.GetSymbol() for atm in mol.GetAtoms()] |
52 |
positions = mol.GetConformer(0).GetPositions()
|
53 |
return Atoms(symbols=symbols, positions=positions)
|
54 |
|
55 |
|
56 |
def add_special_atoms(symbol_pairs): |
57 |
"""Allows ase to use custom elements with symbols not in the periodic table.
|
58 |
|
59 |
This function adds new chemical elements to be used by ase. Every new custom
|
60 |
element must have a traditional (present in the periodic table) partner
|
61 |
from which to obtain all its properties.
|
62 |
@param symbol_pairs: List of tuples containing pairs of chemical symbols.
|
63 |
Every tuple contains a pair of chemical symbols, the first label must be
|
64 |
the label of the custom element and the second one the symbol of the
|
65 |
reference one (traditional present on the periodic table).
|
66 |
@return:
|
67 |
""" # TODO Enable special atoms for rdkit |
68 |
import numpy as np |
69 |
from ase import data |
70 |
for i, pair in enumerate(symbol_pairs): |
71 |
data.chemical_symbols += [pair[0]]
|
72 |
z_orig = data.atomic_numbers[pair[1]]
|
73 |
orig_iupac_mass = data.atomic_masses_iupac2016[z_orig] |
74 |
orig_com_mass = data.atomic_masses_common[z_orig] |
75 |
data.atomic_numbers[pair[0]] = max(data.atomic_numbers.values()) + 1 |
76 |
data.atomic_names += [pair[0]]
|
77 |
data.atomic_masses_iupac2016 = np.append(data.atomic_masses_iupac2016, |
78 |
orig_iupac_mass) |
79 |
data.atomic_masses = data.atomic_masses_iupac2016 |
80 |
data.atomic_masses_common = np.append(data.atomic_masses_common, |
81 |
orig_com_mass) |
82 |
data.covalent_radii = np.append(data.covalent_radii, |
83 |
data.covalent_radii[z_orig]) |
84 |
data.reference_states += [data.reference_states[z_orig]] |
85 |
# TODO Add vdw_radii, gsmm and aml (smaller length)
|
86 |
|
87 |
|
88 |
def adapt_format(requirement, coord_file, spec_atms=tuple()): |
89 |
"""Converts the coordinate files into a required library object type.
|
90 |
|
91 |
Depending on the library required to use and the file type, it converts the
|
92 |
coordinate file into a library-workable object.
|
93 |
@param requirement: str, the library for which the conversion should be
|
94 |
made. Accepted values: 'ase', 'rdkit'.
|
95 |
@param coord_file: str, path to the coordinates file aiming to convert.
|
96 |
Accepted file formats: all file formats readable by ase.
|
97 |
@param spec_atms: List of tuples containing pairs of new/traditional
|
98 |
chemical symbols.
|
99 |
@return: an object the required library can work with.
|
100 |
"""
|
101 |
import ase.io |
102 |
from ase.io.formats import filetype |
103 |
|
104 |
from modules.utilities import try_command |
105 |
|
106 |
req_vals = ['rdkit', 'ase'] |
107 |
lib_err = f"The conversion to the '{requirement}' library object type" \
|
108 |
f" has not yet been implemented"
|
109 |
conv_info = f"Converted {coord_file} to {requirement} object type"
|
110 |
|
111 |
fil_type_err = f'The {filetype(coord_file)} file format is not supported'
|
112 |
|
113 |
if requirement not in req_vals: |
114 |
logger.error(lib_err) |
115 |
raise NotImplementedError(lib_err) |
116 |
|
117 |
if requirement == 'rdkit': |
118 |
from modules.xyz2mol import xyz2mol |
119 |
if filetype(coord_file) == 'xyz': # TODO Include detection of charges. |
120 |
ase_atms = ase.io.read(coord_file) |
121 |
atomic_nums = ase_atms.get_atomic_numbers().tolist() |
122 |
xyz_coordinates = ase_atms.positions.tolist() |
123 |
rd_mol_obj = xyz2mol(atomic_nums, xyz_coordinates) |
124 |
logger.debug(conv_info) |
125 |
return Chem.AddHs(rd_mol_obj)
|
126 |
elif filetype(coord_file) == 'mol': |
127 |
logger.debug(conv_info) |
128 |
return Chem.AddHs(Chem.MolFromMolFile(coord_file, removeHs=False)) |
129 |
else:
|
130 |
ase_atms = try_command(ase.io.read, |
131 |
[(ase.io.formats.UnknownFileTypeError, |
132 |
fil_type_err)], |
133 |
coord_file) |
134 |
atomic_nums = ase_atms.get_atomic_numbers().tolist() |
135 |
xyz_coordinates = ase_atms.positions.tolist() |
136 |
return xyz2mol(atomic_nums, xyz_coordinates)
|
137 |
|
138 |
if requirement == 'ase': |
139 |
add_special_atoms(spec_atms) |
140 |
if filetype(coord_file) == 'xyz': |
141 |
logger.debug(conv_info) |
142 |
return ase.io.read(coord_file)
|
143 |
elif filetype(coord_file) == 'mol': |
144 |
logger.debug(conv_info) |
145 |
rd_mol = Chem.AddHs(Chem.MolFromMolFile(coord_file, removeHs=False))
|
146 |
return rdkit_mol_to_ase_atoms(rd_mol)
|
147 |
else:
|
148 |
return try_command(ase.io.read,
|
149 |
[(ase.io.formats.UnknownFileTypeError, |
150 |
fil_type_err)], |
151 |
coord_file) |
152 |
|
153 |
|
154 |
def read_coords_cp2k(file, spec_atoms=tuple()): |
155 |
"""Reads the coordinates from a CP2K restart file and returns an ase.Atoms
|
156 |
object.
|
157 |
|
158 |
@param file: The file to read containing the coordinates.
|
159 |
@param spec_atoms: List of tuples containing the pairs of chemical symbols.
|
160 |
@return: ase.Atoms object of the coordinates in the file.
|
161 |
"""
|
162 |
import numpy as np |
163 |
from ase import Atoms |
164 |
from pycp2k import CP2K |
165 |
|
166 |
cp2k = CP2K() |
167 |
cp2k.parse(file)
|
168 |
force_eval = cp2k.CP2K_INPUT.FORCE_EVAL_list[0]
|
169 |
raw_coords = force_eval.SUBSYS.COORD.Default_keyword |
170 |
symbols = [atom.split()[0] for atom in raw_coords] |
171 |
positions = np.array([[float(coord) for coord in atom.split()[1:]] |
172 |
for atom in raw_coords]) |
173 |
if len(spec_atoms) > 0: |
174 |
add_special_atoms(spec_atoms) |
175 |
return Atoms(symbols=symbols, positions=positions)
|
176 |
|
177 |
|
178 |
def read_coords_vasp(file, spec_atoms=tuple()): |
179 |
"""Reads the coordinates from VASP OUTCAR and returns an ase.Atoms object.
|
180 |
|
181 |
@param file: The file to read containing the coordinates.
|
182 |
@param spec_atoms: List of tuples containing the pairs of chemical symbols.
|
183 |
@return: ase.Atoms object of the coordinates in the file.
|
184 |
"""
|
185 |
import os |
186 |
import ase.io |
187 |
if not os.path.isfile(file): |
188 |
err_msg = f"File {file} not found."
|
189 |
logger.error(err_msg) |
190 |
raise FileNotFoundError(err_msg)
|
191 |
if len(spec_atoms) > 0: |
192 |
add_special_atoms(spec_atoms) |
193 |
return ase.io.read(file) |
194 |
|
195 |
|
196 |
def read_energy_cp2k(file): |
197 |
"""Reads the CP2K output file and returns its final energy.
|
198 |
|
199 |
@param file: The file from which the energy should be read.
|
200 |
@return: The last energy on the out file in eV.
|
201 |
"""
|
202 |
out_fh = open(file, 'r') |
203 |
energy = None
|
204 |
for line in out_fh: |
205 |
if "ENERGY| Total FORCE_EVAL ( QS ) energy (a.u.):" in line: |
206 |
energy = float(line.strip().split(':')[1]) * 27.2113845 # Ha to eV |
207 |
out_fh.close() |
208 |
return energy
|
209 |
|
210 |
|
211 |
def collect_confs(dir_list, code, run_type, spec_atms=tuple()): |
212 |
"""Reads the coordinates and energies of a list of finished calculations.
|
213 |
|
214 |
Given a dockonsurf directory hierarchy: project/run_type/conf_X
|
215 |
(run_type = ['isolated', 'screening' or 'refinement']) it reads the
|
216 |
coordinates of each conf_X, it assigns its total energy from the calculation
|
217 |
and assigns the conf_X label to track its origin. Finally it returns the
|
218 |
ase.Atoms object.
|
219 |
|
220 |
@param dir_list: List of directories where to read the coords from.
|
221 |
@param code: the code that produced the calculation results files.
|
222 |
@param run_type: the type of calculation (and also the name of the folder)
|
223 |
containing the calculation subdirectories.
|
224 |
@param spec_atms: List of tuples containing pairs of new/traditional
|
225 |
chemical symbols.
|
226 |
@return: list of ase.Atoms objects.
|
227 |
"""
|
228 |
from glob import glob |
229 |
import os |
230 |
from modules.utilities import is_binary |
231 |
atoms_list = [] |
232 |
for conf_dir in dir_list: |
233 |
conf_path = f"{run_type}/{conf_dir}/"
|
234 |
if code == 'cp2k': |
235 |
ase_atms = read_coords_cp2k(glob(f"{conf_path}/*-1.restart")[0], |
236 |
spec_atms) |
237 |
# Assign energy
|
238 |
for fil in os.listdir(conf_path): |
239 |
if is_binary(conf_path + fil):
|
240 |
continue
|
241 |
conf_energy = read_energy_cp2k(conf_path + fil) |
242 |
if conf_energy is not None: |
243 |
ase_atms.info["energy"] = conf_energy
|
244 |
break
|
245 |
ase_atms.info[run_type[:3]] = conf_dir
|
246 |
atoms_list.append(ase_atms) |
247 |
elif code == 'vasp': |
248 |
ase_atms = read_coords_vasp(f"{conf_path}/OUTCAR", spec_atms)
|
249 |
ase_atms.info["energy"] = ase_atms.get_total_energy() * 27.2113845 |
250 |
ase_atms.info[run_type[:3]] = conf_dir
|
251 |
atoms_list.append(ase_atms) |
252 |
else:
|
253 |
err_msg = f"Collect coordinates not implemented for '{code}'."
|
254 |
logger.error(err_msg) |
255 |
raise NotImplementedError(err_msg) |
256 |
return atoms_list
|