dockonsurf / modules / clustering.py @ 78fcb188
Historique | Voir | Annoter | Télécharger (8,59 ko)
1 |
"""Functions to cluster structures.
|
---|---|
2 |
|
3 |
functions:
|
4 |
get_labels_affty: Clusters data in affinity matrix form by assigning labels to
|
5 |
data points.
|
6 |
get_labels_vector: Clusters data in vectorial form by assigning labels to
|
7 |
data points.
|
8 |
get_clusters: Groups data-points belonging to the same cluster into arrays of
|
9 |
indices.
|
10 |
get_exemplars_affty: Computes the exemplars for every cluster and returns a list
|
11 |
of indices.
|
12 |
plot_clusters: Plots the clustered data casting a color to every cluster.
|
13 |
clustering: Directs the clustering process by calling the relevant functions.
|
14 |
"""
|
15 |
import logging |
16 |
|
17 |
import hdbscan |
18 |
import numpy as np |
19 |
|
20 |
logger = logging.getLogger('DockOnSurf')
|
21 |
|
22 |
|
23 |
def get_rmsd(mol_list: list, remove_Hs="c"): |
24 |
"""Computes the rmsd matrix of the conformers in a rdkit mol object.
|
25 |
|
26 |
@param mol_list: list of rdkit mol objects containing the conformers.
|
27 |
@param remove_Hs: bool or str,
|
28 |
@return rmsd_matrix: Matrix containing the rmsd values of every pair of
|
29 |
conformers.
|
30 |
|
31 |
The RMSD values of every pair of conformers is computed, stored in matrix
|
32 |
form and returned back. The calculation of rmsd values can take into
|
33 |
account all hydrogens, none, or only the ones not linked to carbon atoms.
|
34 |
"""
|
35 |
import rdkit.Chem.AllChem as Chem |
36 |
|
37 |
if len(mol_list) < 2: |
38 |
err = "The provided molecule has less than 2 conformers"
|
39 |
logger.error(err) |
40 |
raise ValueError(err) |
41 |
|
42 |
if not remove_Hs: |
43 |
pass
|
44 |
elif remove_Hs or remove_Hs.lower() == "all": |
45 |
mol_list = [Chem.RemoveHs(mol) for mol in mol_list] |
46 |
elif remove_Hs.lower() == "c": |
47 |
from isolated import remove_C_linked_Hs |
48 |
mol_list = [remove_C_linked_Hs(mol) for mol in mol_list] |
49 |
else:
|
50 |
err = "remove_Hs value does not have an acceptable value"
|
51 |
logger.error(err) |
52 |
raise ValueError(err) |
53 |
|
54 |
num_confs = len(mol_list)
|
55 |
conf_ids = list(range(num_confs)) |
56 |
rmsd_mtx = np.zeros((num_confs, num_confs)) |
57 |
for id1 in conf_ids: # TODO reduce RMSD precision |
58 |
for id2 in conf_ids[id1 + 1:]: |
59 |
rmsd = Chem.GetBestRMS(mol_list[id1], mol_list[id2]) |
60 |
rmsd_mtx[id1][id2] = rmsd |
61 |
rmsd_mtx[id2][id1] = rmsd |
62 |
|
63 |
return rmsd_mtx
|
64 |
|
65 |
|
66 |
def get_labels_affty(affty_mtx, kind="rmsd"): |
67 |
"""Clusters data in affinity matrix form by assigning labels to data points.
|
68 |
|
69 |
@param affty_mtx: Data to be clustered, it must be an affinity matrix.
|
70 |
(Eg. Euclidean distances between points, RMSD Matrix, etc.).
|
71 |
Shape: [n_points, n_points]
|
72 |
@param kind: Which kind of data the affinity matrix contains.
|
73 |
@return: list of cluster labels. Every data point is assigned a number
|
74 |
corresponding to the cluster it belongs to.
|
75 |
"""
|
76 |
if np.average(affty_mtx) < 1e-3 and kind == "rmsd": |
77 |
sing_clust = True
|
78 |
min_size = int(len(affty_mtx) / 2) |
79 |
else:
|
80 |
sing_clust = False
|
81 |
min_size = 20
|
82 |
hdbs = hdbscan.HDBSCAN(metric="precomputed",
|
83 |
min_samples=5,
|
84 |
min_cluster_size=min_size, |
85 |
allow_single_cluster=sing_clust) |
86 |
return hdbs.fit_predict(affty_mtx)
|
87 |
|
88 |
|
89 |
def get_labels_vector(): |
90 |
"""Clusters data in vectorial form by assigning labels to data points.
|
91 |
|
92 |
@return: list of cluster labels. Every data point is assigned a number
|
93 |
corresponding to the cluster it belongs to.
|
94 |
"""
|
95 |
return []
|
96 |
|
97 |
|
98 |
def get_clusters(labels): |
99 |
"""Groups data-points belonging to the same cluster into arrays of indices.
|
100 |
|
101 |
@param labels: list of cluster labels (numbers) corresponding to the cluster
|
102 |
it belongs to.
|
103 |
@return: tuple of arrays. Every array contains the indices (relative to the
|
104 |
labels list) of the data points belonging to the same cluster.
|
105 |
"""
|
106 |
n_clusters = max(labels) + 1 |
107 |
return tuple(np.where(labels == clust_num)[0] |
108 |
for clust_num in range(n_clusters)) |
109 |
|
110 |
|
111 |
def get_exemplars_affty(affty_mtx, clusters): |
112 |
"""Computes the exemplars for every cluster and returns a list of indices.
|
113 |
|
114 |
@param affty_mtx: Data structured in form of affinity matrix. eg. Euclidean
|
115 |
distances between points, RMSD Matrix, etc.) shape: [n_points, n_points].
|
116 |
@param clusters: tuple of arrays. Every array contains the indices (relative
|
117 |
to the affinity matrix) of the data points belonging to the same cluster.
|
118 |
@return: list of indices (relative to the affinity matrix) exemplars for
|
119 |
every cluster.
|
120 |
"""
|
121 |
from sklearn.cluster import AffinityPropagation |
122 |
clust_affty_mtcs = tuple(affty_mtx[np.ix_(clust, clust)]
|
123 |
for clust in clusters) |
124 |
exemplars = [] |
125 |
for i, mtx in enumerate(clust_affty_mtcs): |
126 |
pref = -1e6 * np.max(np.abs(mtx))
|
127 |
af = AffinityPropagation(affinity='precomputed', preference=pref,
|
128 |
damping=0.95, max_iter=2000, |
129 |
random_state=None).fit(mtx)
|
130 |
exemplars.append(clusters[i][af.cluster_centers_indices_[0]])
|
131 |
return exemplars
|
132 |
|
133 |
|
134 |
def plot_clusters(labels, x, y, exemplars=None, save=True): |
135 |
"""Plots the clustered data casting a color to every cluster.
|
136 |
|
137 |
@param labels: list of cluster labels (numbers) corresponding to the cluster
|
138 |
it belongs to.
|
139 |
@param x: list of data of the x axis.
|
140 |
@param y: list of data of the y axis.
|
141 |
@param exemplars: list of data point indices (relative to the labels list)
|
142 |
considered as cluster exemplars.
|
143 |
@param save: bool, Whether to save the generated plot into a file or not.
|
144 |
(in the latter case the plot is shown in a new window)
|
145 |
"""
|
146 |
import matplotlib.pyplot as plt |
147 |
from matplotlib import cm, colors |
148 |
|
149 |
n_clusters = max(labels) + 1 |
150 |
rb = cm.get_cmap('gist_rainbow', max(n_clusters, 1)) |
151 |
rb.set_under() |
152 |
plt.figure(figsize=(10, 8)) |
153 |
for i in range(len(labels)): |
154 |
plt.plot(x[i], y[i], c=rb(labels[i]), marker='.')
|
155 |
if len(exemplars) > 0 and i == exemplars[labels[i]]: |
156 |
plt.plot(x[i], y[i], c=rb(labels[i]), marker="x",
|
157 |
markersize=15,
|
158 |
label=f"Exemplar cluster {labels[i]}")
|
159 |
plt.title(f'Found {n_clusters} Clusters.')
|
160 |
plt.xlabel("Energy")
|
161 |
plt.ylabel("MOI")
|
162 |
plt.legend() |
163 |
|
164 |
bounds = list(range(max(n_clusters, 1))) |
165 |
norm = colors.Normalize(vmin=min(labels), vmax=max(labels)) |
166 |
plt.colorbar(cm.ScalarMappable(norm=norm, cmap=rb), ticks=bounds) |
167 |
if save:
|
168 |
from modules.utilities import check_bak |
169 |
check_bak('clusters.png')
|
170 |
plt.savefig('clusters.png')
|
171 |
plt.close("all")
|
172 |
else:
|
173 |
plt.show() |
174 |
|
175 |
|
176 |
def clustering(data, plot=False, x=None, y=None): |
177 |
"""Directs the clustering process by calling the relevant functions.
|
178 |
|
179 |
@param data: The data to be clustered. It must be stored in vector form
|
180 |
[n_features, n_samples] or in affinity matrix form [n_samples, n_samples],
|
181 |
symmetric and 0 in the main diagonal. (Eg. Euclidean distances between
|
182 |
points, RMSD Matrix, etc.).
|
183 |
@param plot: bool, Whether to plot the clustered data.
|
184 |
@param x: Necessary only if plot is turned on. X values to plot the data.
|
185 |
@param y: Necessary only if plot is turned on. Y values to plot the data.
|
186 |
@return: list of exemplars, list of indices (relative to data)
|
187 |
exemplars for every cluster.
|
188 |
"""
|
189 |
from collections.abc import Iterable |
190 |
|
191 |
data_err = "Data must be stored in vector form [n_features, n_samples] or" \
|
192 |
"in affinity matrix form [n_samples, n_samples]: symmetric " \
|
193 |
"and 0 in the main diagonal. Eg. RMSD matrix"
|
194 |
debug_err = "On debug mode x and y should be provided"
|
195 |
|
196 |
if plot and not (isinstance(x, Iterable) and isinstance(y, Iterable)): |
197 |
logger.error(debug_err) |
198 |
raise ValueError(debug_err) |
199 |
if not isinstance(data, np.ndarray): |
200 |
data = np.array(data) |
201 |
if len(data.shape) != 2: |
202 |
logger.error(data_err) |
203 |
raise ValueError(data_err) |
204 |
|
205 |
if data.shape[0] == data.shape[1] \ |
206 |
and (np.tril(data).T == np.triu(data)).all():
|
207 |
logger.info("Clustering using affinity matrix.")
|
208 |
labels = get_labels_affty(data) |
209 |
if max(labels) == -1: |
210 |
logger.warning('Clustering of conformers did not converge. Try '
|
211 |
"setting a smaller 'min_samples' parameter.")
|
212 |
exemplars = list(range(data.shape[0])) |
213 |
else:
|
214 |
clusters = get_clusters(labels) |
215 |
exemplars = get_exemplars_affty(data, clusters) |
216 |
if plot:
|
217 |
plot_clusters(labels, x, y, exemplars, save=True)
|
218 |
logger.info(f'Conformers are grouped in {len(exemplars)} clusters.')
|
219 |
return exemplars
|
220 |
else:
|
221 |
not_impl_err = 'Clustering not yet implemented for vectorized data'
|
222 |
logger.error(not_impl_err) |
223 |
raise NotImplementedError(not_impl_err) |