dockonsurf / modules / clustering.py @ 0db30d07
Historique | Voir | Annoter | Télécharger (6,81 ko)
1 | 75325cc3 | Carles | """Functions to cluster structures.
|
---|---|---|---|
2 | 75325cc3 | Carles |
|
3 | 75325cc3 | Carles | functions:
|
4 | 75325cc3 | Carles | get_labels_affty: Clusters data in affinity matrix form by assigning labels to
|
5 | 75325cc3 | Carles | data points.
|
6 | 75325cc3 | Carles | get_labels_vector: Clusters data in vectorial form by assigning labels to
|
7 | 75325cc3 | Carles | data points.
|
8 | 75325cc3 | Carles | get_clusters: Groups data-points belonging to the same cluster into arrays of
|
9 | 75325cc3 | Carles | indices.
|
10 | 75325cc3 | Carles | get_exemplars_affty: Computes the exemplars for every cluster and returns a list
|
11 | 75325cc3 | Carles | of indices.
|
12 | 75325cc3 | Carles | plot_clusters: Plots the clustered data casting a color to every cluster.
|
13 | 75325cc3 | Carles | clustering: Directs the clustering process by calling the relevant functions.
|
14 | 75325cc3 | Carles | """
|
15 | 75325cc3 | Carles | import logging |
16 | 75325cc3 | Carles | |
17 | 75325cc3 | Carles | import hdbscan |
18 | 75325cc3 | Carles | import numpy as np |
19 | 75325cc3 | Carles | |
20 | 75325cc3 | Carles | logger = logging.getLogger('DockOnSurf')
|
21 | 75325cc3 | Carles | |
22 | 75325cc3 | Carles | |
23 | 75325cc3 | Carles | def get_labels_affty(affty_mtx, kind="rmsd"): |
24 | 75325cc3 | Carles | """Clusters data in affinity matrix form by assigning labels to data points.
|
25 | 75325cc3 | Carles |
|
26 | 75325cc3 | Carles | @param affty_mtx: Data to be clustered, it must be an affinity matrix.
|
27 | 75325cc3 | Carles | (Eg. Euclidean distances between points, RMSD Matrix, etc.).
|
28 | 75325cc3 | Carles | Shape: [n_points, n_points]
|
29 | 75325cc3 | Carles | @param kind: Which kind of data the affinity matrix contains.
|
30 | 75325cc3 | Carles | @return: list of cluster labels. Every data point is assigned a number
|
31 | 75325cc3 | Carles | corresponding to the cluster it belongs to.
|
32 | 75325cc3 | Carles | """
|
33 | 75325cc3 | Carles | if np.average(affty_mtx) < 1e-3 and kind == "rmsd": |
34 | 75325cc3 | Carles | sing_clust = True
|
35 | 75325cc3 | Carles | min_size = int(len(affty_mtx) / 2) |
36 | 75325cc3 | Carles | else:
|
37 | 75325cc3 | Carles | sing_clust = False
|
38 | 75325cc3 | Carles | min_size = 20
|
39 | 75325cc3 | Carles | hdbs = hdbscan.HDBSCAN(metric="precomputed",
|
40 | 75325cc3 | Carles | min_samples=5,
|
41 | 75325cc3 | Carles | min_cluster_size=min_size, |
42 | 75325cc3 | Carles | allow_single_cluster=sing_clust) |
43 | 75325cc3 | Carles | return hdbs.fit_predict(affty_mtx)
|
44 | 75325cc3 | Carles | |
45 | 75325cc3 | Carles | |
46 | 75325cc3 | Carles | def get_labels_vector(): |
47 | 75325cc3 | Carles | """Clusters data in vectorial form by assigning labels to data points.
|
48 | 75325cc3 | Carles |
|
49 | 75325cc3 | Carles | @return: list of cluster labels. Every data point is assigned a number
|
50 | 75325cc3 | Carles | corresponding to the cluster it belongs to.
|
51 | 75325cc3 | Carles | """
|
52 | 75325cc3 | Carles | return []
|
53 | 75325cc3 | Carles | |
54 | 75325cc3 | Carles | |
55 | 75325cc3 | Carles | def get_clusters(labels): |
56 | 75325cc3 | Carles | """Groups data-points belonging to the same cluster into arrays of indices.
|
57 | 75325cc3 | Carles |
|
58 | 75325cc3 | Carles | @param labels: list of cluster labels (numbers) corresponding to the cluster
|
59 | 75325cc3 | Carles | it belongs to.
|
60 | 75325cc3 | Carles | @return: tuple of arrays. Every array contains the indices (relative to the
|
61 | 75325cc3 | Carles | labels list) of the data points belonging to the same cluster.
|
62 | 75325cc3 | Carles | """
|
63 | 75325cc3 | Carles | n_clusters = max(labels) + 1 |
64 | 75325cc3 | Carles | return tuple(np.where(labels == clust_num)[0] |
65 | 75325cc3 | Carles | for clust_num in range(n_clusters)) |
66 | 75325cc3 | Carles | |
67 | 75325cc3 | Carles | |
68 | 75325cc3 | Carles | def get_exemplars_affty(affty_mtx, clusters): |
69 | 75325cc3 | Carles | """Computes the exemplars for every cluster and returns a list of indices.
|
70 | 75325cc3 | Carles |
|
71 | 75325cc3 | Carles | @param affty_mtx: Data structured in form of affinity matrix. eg. Euclidean
|
72 | 75325cc3 | Carles | distances between points, RMSD Matrix, etc.) shape: [n_points, n_points].
|
73 | 75325cc3 | Carles | @param clusters: tuple of arrays. Every array contains the indices (relative
|
74 | 75325cc3 | Carles | to the affinity matrix) of the data points belonging to the same cluster.
|
75 | 75325cc3 | Carles | @return: list of indices (relative to the affinity matrix) exemplars for
|
76 | 75325cc3 | Carles | every cluster.
|
77 | 75325cc3 | Carles | """
|
78 | 75325cc3 | Carles | from sklearn.cluster import AffinityPropagation |
79 | 75325cc3 | Carles | clust_affty_mtcs = tuple(affty_mtx[np.ix_(clust, clust)]
|
80 | 75325cc3 | Carles | for clust in clusters) |
81 | 75325cc3 | Carles | exemplars = [] |
82 | 75325cc3 | Carles | for i, mtx in enumerate(clust_affty_mtcs): |
83 | 75325cc3 | Carles | pref = -1e6 * np.max(np.abs(mtx))
|
84 | 75325cc3 | Carles | af = AffinityPropagation(affinity='precomputed',
|
85 | 75325cc3 | Carles | preference=pref, |
86 | 75325cc3 | Carles | damping=0.95,
|
87 | 75325cc3 | Carles | max_iter=2000).fit(mtx)
|
88 | 75325cc3 | Carles | exemplars.append(clusters[i][af.cluster_centers_indices_[0]])
|
89 | 75325cc3 | Carles | return exemplars
|
90 | 75325cc3 | Carles | |
91 | 75325cc3 | Carles | |
92 | 75325cc3 | Carles | def plot_clusters(labels, x, y, exemplars=None, save=True): |
93 | 75325cc3 | Carles | """Plots the clustered data casting a color to every cluster.
|
94 | 75325cc3 | Carles |
|
95 | 75325cc3 | Carles | @param labels: list of cluster labels (numbers) corresponding to the cluster
|
96 | 75325cc3 | Carles | it belongs to.
|
97 | 75325cc3 | Carles | @param x: list of data of the x axis.
|
98 | 75325cc3 | Carles | @param y: list of data of the y axis.
|
99 | 75325cc3 | Carles | @param exemplars: list of data point indices (relative to the labels list)
|
100 | 75325cc3 | Carles | considered as cluster exemplars.
|
101 | 75325cc3 | Carles | @param save: bool, Whether to save the generated plot into a file or not.
|
102 | 75325cc3 | Carles | (in the latter case the plot is shown in a new window)
|
103 | 75325cc3 | Carles | """
|
104 | 75325cc3 | Carles | import matplotlib.pyplot as plt |
105 | 75325cc3 | Carles | from matplotlib import cm, colors |
106 | 75325cc3 | Carles | |
107 | 75325cc3 | Carles | n_clusters = max(labels) + 1 |
108 | 75325cc3 | Carles | rb = cm.get_cmap('gist_rainbow', max(n_clusters, 1)) |
109 | 75325cc3 | Carles | rb.set_under() |
110 | 75325cc3 | Carles | plt.figure(figsize=(10, 8)) |
111 | 75325cc3 | Carles | for i in range(len(labels)): |
112 | 75325cc3 | Carles | plt.plot(x[i], y[i], c=rb(labels[i]), marker='.')
|
113 | 75325cc3 | Carles | if i == exemplars[labels[i]]:
|
114 | 75325cc3 | Carles | plt.plot(x[i], y[i], c=rb(labels[i]), marker="x",
|
115 | 75325cc3 | Carles | markersize=15,
|
116 | 75325cc3 | Carles | label=f"Exemplar cluster {labels[i]}")
|
117 | 75325cc3 | Carles | plt.title(f'Found {n_clusters} Clusters.')
|
118 | 75325cc3 | Carles | plt.xlabel("Energy")
|
119 | 75325cc3 | Carles | plt.ylabel("MOI")
|
120 | 75325cc3 | Carles | plt.legend() |
121 | 75325cc3 | Carles | |
122 | 75325cc3 | Carles | bounds = list(range(max(n_clusters, 1))) |
123 | 75325cc3 | Carles | norm = colors.Normalize(vmin=min(labels), vmax=max(labels)) |
124 | 75325cc3 | Carles | plt.colorbar(cm.ScalarMappable(norm=norm, cmap=rb), ticks=bounds) |
125 | 75325cc3 | Carles | if save:
|
126 | 75325cc3 | Carles | plt.savefig(f'clusters.png')
|
127 | 75325cc3 | Carles | plt.close("all")
|
128 | 75325cc3 | Carles | else:
|
129 | 75325cc3 | Carles | plt.show() |
130 | 75325cc3 | Carles | |
131 | 75325cc3 | Carles | |
132 | 75325cc3 | Carles | def clustering(data, debug=False, x=None, y=None): |
133 | 75325cc3 | Carles | """Directs the clustering process by calling the relevant functions.
|
134 | 75325cc3 | Carles |
|
135 | 75325cc3 | Carles | @param data: The data to be clustered. It must be stored in vector form
|
136 | 75325cc3 | Carles | [n_features, n_samples] or in affinity matrix form [n_samples, n_samples],
|
137 | 75325cc3 | Carles | symmetric and 0 in the main diagonal. (Eg. Euclidean distances between
|
138 | 75325cc3 | Carles | points, RMSD Matrix, etc.).
|
139 | 75325cc3 | Carles | @param debug: bool, Whether to report debug information and plot the
|
140 | 75325cc3 | Carles | clustered data.
|
141 | 75325cc3 | Carles | @param x: Necessary only if debug is turned on. X values to plot the data.
|
142 | 75325cc3 | Carles | @param y: Necessary only if debug is turned on. X values to plot the data.
|
143 | 75325cc3 | Carles | @return: list of exemplars, list of indices (relative to data)
|
144 | 75325cc3 | Carles | exemplars for every cluster.
|
145 | 75325cc3 | Carles | """
|
146 | 75325cc3 | Carles | from collections.abc import Iterable |
147 | 75325cc3 | Carles | |
148 | 75325cc3 | Carles | data_err = "Data must be stored in vector form [n_features, n_samples] or" \
|
149 | 75325cc3 | Carles | "in affinity matrix form [n_samples, n_samples]: symmetric " \
|
150 | 75325cc3 | Carles | "and 0 in the main diagonal. Eg. RMSD matrix"
|
151 | 75325cc3 | Carles | debug_err = "On debug mode x and y should be provided"
|
152 | 75325cc3 | Carles | |
153 | 75325cc3 | Carles | if debug and not (isinstance(x, Iterable) and isinstance(y, Iterable)): |
154 | 75325cc3 | Carles | logger.error(debug_err) |
155 | 75325cc3 | Carles | raise ValueError(debug_err) |
156 | 75325cc3 | Carles | if not isinstance(data, np.ndarray): |
157 | 75325cc3 | Carles | data = np.array(data) |
158 | 75325cc3 | Carles | if len(data.shape) != 2: |
159 | 75325cc3 | Carles | logger.error(data_err) |
160 | 75325cc3 | Carles | raise ValueError(data_err) |
161 | 75325cc3 | Carles | |
162 | 75325cc3 | Carles | if data.shape[0] == data.shape[1] \ |
163 | 75325cc3 | Carles | and (np.tril(data).T == np.triu(data)).all():
|
164 | 75325cc3 | Carles | logger.info("Clustering using affinity matrix")
|
165 | 75325cc3 | Carles | labels = get_labels_affty(data) |
166 | db7349af | Carles | if max(labels) == -1: |
167 | db7349af | Carles | logger.error('Clustering of conformers did not converge. Try '
|
168 | db7349af | Carles | "setting a smaller 'min_samples' parameter")
|
169 | 75325cc3 | Carles | clusters = get_clusters(labels) |
170 | 75325cc3 | Carles | exemplars = get_exemplars_affty(data, clusters) |
171 | 75325cc3 | Carles | if debug:
|
172 | 75325cc3 | Carles | plot_clusters(labels, x, y, exemplars, save=True)
|
173 | 680944e4 | Carles | logger.info(f'Conformers are grouped in {len(exemplars)} clusters.')
|
174 | 75325cc3 | Carles | return exemplars
|
175 | 75325cc3 | Carles | else:
|
176 | 75325cc3 | Carles | pass |