dockonsurf / modules / clustering.py @ 0db30d07
Historique | Voir | Annoter | Télécharger (6,81 ko)
1 |
"""Functions to cluster structures.
|
---|---|
2 |
|
3 |
functions:
|
4 |
get_labels_affty: Clusters data in affinity matrix form by assigning labels to
|
5 |
data points.
|
6 |
get_labels_vector: Clusters data in vectorial form by assigning labels to
|
7 |
data points.
|
8 |
get_clusters: Groups data-points belonging to the same cluster into arrays of
|
9 |
indices.
|
10 |
get_exemplars_affty: Computes the exemplars for every cluster and returns a list
|
11 |
of indices.
|
12 |
plot_clusters: Plots the clustered data casting a color to every cluster.
|
13 |
clustering: Directs the clustering process by calling the relevant functions.
|
14 |
"""
|
15 |
import logging |
16 |
|
17 |
import hdbscan |
18 |
import numpy as np |
19 |
|
20 |
logger = logging.getLogger('DockOnSurf')
|
21 |
|
22 |
|
23 |
def get_labels_affty(affty_mtx, kind="rmsd"):
    """Labels every data point of an affinity matrix using HDBSCAN.

    @param affty_mtx: Data to be clustered, given as a square affinity matrix
        of shape [n_points, n_points] (Eg. Euclidean distances between points,
        RMSD matrix, etc.). It is handed to HDBSCAN as a precomputed metric.
    @param kind: Which kind of data the affinity matrix contains. Only the
        value "rmsd" can trigger the single-cluster settings described below.
    @return: the cluster labels produced by HDBSCAN.fit_predict, one number
        per data point identifying the cluster it belongs to.
    """
    # When an RMSD matrix averages below 1e-3 the clustering is allowed to
    # return one single cluster whose minimum size is half the number of
    # points; in every other case a fixed minimum cluster size of 20 is used.
    near_identical = bool(np.average(affty_mtx) < 1e-3 and kind == "rmsd")
    min_size = len(affty_mtx) // 2 if near_identical else 20

    clusterer = hdbscan.HDBSCAN(
        metric="precomputed",
        min_samples=5,
        min_cluster_size=min_size,
        allow_single_cluster=near_identical,
    )
    return clusterer.fit_predict(affty_mtx)
|
44 |
|
45 |
|
46 |
def get_labels_vector():
    """Placeholder for the clustering of data given in vectorial form.

    The function currently takes no arguments and performs no clustering.

    @return: list of cluster labels; at present always an empty list.
    """
    labels = []
    return labels
|
53 |
|
54 |
|
55 |
def get_clusters(labels):
    """Groups data-points belonging to the same cluster into arrays of indices.

    @param labels: list or array of cluster labels (numbers); the i-th entry
        is the label of the cluster the i-th data point belongs to. Negative
        labels are not collected into any cluster.
    @return: tuple of arrays. The k-th array contains the indices (relative to
        the labels list) of the data points labelled k, for k going from 0 to
        the largest label. An empty tuple is returned when labels is empty or
        when no label is >= 0.
    """
    # Coerce to an array so that the element-wise comparison below also works
    # for plain Python lists (list == int would be a single scalar False).
    labels = np.asarray(labels)
    if labels.size == 0:
        return ()
    n_clusters = int(labels.max()) + 1
    return tuple(np.where(labels == clust_num)[0]
                 for clust_num in range(n_clusters))
66 |
|
67 |
|
68 |
def get_exemplars_affty(affty_mtx, clusters):
    """Picks one exemplar per cluster by running Affinity Propagation on it.

    @param affty_mtx: Data structured in form of affinity matrix (eg. Euclidean
        distances between points, RMSD matrix, etc.), shape:
        [n_points, n_points].
    @param clusters: tuple of arrays. Every array contains the indices
        (relative to the affinity matrix) of the data points belonging to the
        same cluster.
    @return: list with, for every cluster, the index (relative to the affinity
        matrix) of its exemplar.
    """
    from sklearn.cluster import AffinityPropagation

    # Restrict the affinity matrix to the members of each cluster.
    sub_matrices = [affty_mtx[np.ix_(members, members)]
                    for members in clusters]

    chosen = []
    for members, sub_mtx in zip(clusters, sub_matrices):
        # Preference is set to a huge negative value relative to the largest
        # entry of the sub-matrix (presumably to favour a single exemplar per
        # cluster -- TODO confirm).
        preference = -1e6 * np.max(np.abs(sub_mtx))
        propagation = AffinityPropagation(affinity='precomputed',
                                          preference=preference,
                                          damping=0.95,
                                          max_iter=2000)
        propagation.fit(sub_mtx)
        # Only the first center found is kept; it is an index local to the
        # sub-matrix, so it is mapped back to the full affinity matrix.
        # NOTE(review): indexing [0] assumes at least one center was found.
        local_idx = propagation.cluster_centers_indices_[0]
        chosen.append(members[local_idx])
    return chosen
|
90 |
|
91 |
|
92 |
def plot_clusters(labels, x, y, exemplars=None, save=True):
    """Plots the clustered data casting a color to every cluster.

    @param labels: list of cluster labels (numbers) corresponding to the cluster
        every data point belongs to.
    @param x: list of data of the x axis.
    @param y: list of data of the y axis.
    @param exemplars: list of data point indices (relative to the labels list)
        considered as cluster exemplars; the k-th entry is the exemplar of
        cluster k. If None (default) no exemplar is highlighted.
    @param save: bool, Whether to save the generated plot into the file
        'clusters.png' or not (in the latter case the plot is shown in a new
        window).
    """
    import matplotlib.pyplot as plt
    from matplotlib import cm, colors

    n_clusters = max(labels) + 1
    n_colors = max(n_clusters, 1)
    try:
        rb = cm.get_cmap('gist_rainbow', n_colors)
    except AttributeError:
        # matplotlib.cm.get_cmap is not available in recent Matplotlib
        # releases; the colormap registry offers the same functionality.
        import matplotlib
        rb = matplotlib.colormaps['gist_rainbow'].resampled(n_colors)
    # Presumably so that label -1 is drawn with the colormap's "under" color
    # -- TODO confirm.
    rb.set_under()
    plt.figure(figsize=(10, 8))

    n_exemplars = 0 if exemplars is None else len(exemplars)
    has_legend_entries = False
    for i, label in enumerate(labels):
        plt.plot(x[i], y[i], c=rb(label), marker='.')
        # Only labels that index a real cluster can own an exemplar; this also
        # keeps negative labels from wrapping around to exemplars[-1] and
        # avoids subscripting exemplars when it was not provided.
        if 0 <= label < n_exemplars and i == exemplars[label]:
            plt.plot(x[i], y[i], c=rb(label), marker="x",
                     markersize=15,
                     label=f"Exemplar cluster {label}")
            has_legend_entries = True
    plt.title(f'Found {n_clusters} Clusters.')
    plt.xlabel("Energy")
    plt.ylabel("MOI")
    if has_legend_entries:
        # Skip the legend when nothing is labelled, so an empty legend is not
        # requested.
        plt.legend()

    bounds = list(range(n_colors))
    norm = colors.Normalize(vmin=min(labels), vmax=max(labels))
    plt.colorbar(cm.ScalarMappable(norm=norm, cmap=rb), ticks=bounds)
    if save:
        plt.savefig('clusters.png')
        plt.close("all")
    else:
        plt.show()
130 |
|
131 |
|
132 |
def clustering(data, debug=False, x=None, y=None):
    """Directs the clustering process by calling the relevant functions.

    @param data: The data to be clustered. It must be stored in vector form
        [n_features, n_samples] or in affinity matrix form
        [n_samples, n_samples], symmetric and 0 in the main diagonal. (Eg.
        Euclidean distances between points, RMSD Matrix, etc.). Anything that
        is not a numpy array is converted into one.
    @param debug: bool, Whether to report debug information and plot the
        clustered data.
    @param x: Necessary only if debug is turned on. X values to plot the data.
    @param y: Necessary only if debug is turned on. Y values to plot the data.
    @return: list of exemplars, list of indices (relative to data)
        exemplars for every cluster. For data that is not a square symmetric
        matrix no clustering is carried out and None is returned.
    @raise ValueError: if debug is on and x or y is not an iterable, or if
        data is not two-dimensional.
    """
    from collections.abc import Iterable

    data_err = ("Data must be stored in vector form [n_features, n_samples] "
                "or in affinity matrix form [n_samples, n_samples]: symmetric "
                "and 0 in the main diagonal. Eg. RMSD matrix")
    debug_err = "On debug mode x and y should be provided"

    if debug and not (isinstance(x, Iterable) and isinstance(y, Iterable)):
        logger.error(debug_err)
        raise ValueError(debug_err)
    if not isinstance(data, np.ndarray):
        data = np.array(data)
    if len(data.shape) != 2:
        logger.error(data_err)
        raise ValueError(data_err)

    # A square matrix whose lower triangle mirrors its upper triangle is
    # treated as an affinity matrix.
    is_affty_mtx = (data.shape[0] == data.shape[1]
                    and (np.tril(data).T == np.triu(data)).all())
    if not is_affty_mtx:
        # Clustering of data in vector form is not available: make the
        # absence of a result explicit instead of silently falling through.
        logger.warning("Clustering of data in vector form is not implemented. "
                       "No exemplars were computed.")
        return None

    logger.info("Clustering using affinity matrix")
    labels = get_labels_affty(data)
    if max(labels) == -1:
        # Every point got label -1, so no cluster was found; the steps below
        # then work on an empty set of clusters.
        logger.error('Clustering of conformers did not converge. Try '
                     "setting a smaller 'min_samples' parameter")
    clusters = get_clusters(labels)
    exemplars = get_exemplars_affty(data, clusters)
    if debug:
        plot_clusters(labels, x, y, exemplars, save=True)
    logger.info(f'Conformers are grouped in {len(exemplars)} clusters.')
    return exemplars
|