Statistiques
| Branche: | Tag: | Révision :

dockonsurf / modules / clustering.py @ 0db30d07

Historique | Voir | Annoter | Télécharger (6,81 ko)

1 75325cc3 Carles
"""Functions to cluster structures.
2 75325cc3 Carles

3 75325cc3 Carles
functions:
4 75325cc3 Carles
get_labels_affty: Clusters data in affinity matrix form by assigning labels to
5 75325cc3 Carles
data points.
6 75325cc3 Carles
get_labels_vector: Clusters data in vectorial form by assigning labels to
7 75325cc3 Carles
data points.
8 75325cc3 Carles
get_clusters: Groups data-points belonging to the same cluster into arrays of
9 75325cc3 Carles
indices.
10 75325cc3 Carles
get_exemplars_affty: Computes the exemplars for every cluster and returns a list
11 75325cc3 Carles
of indices.
12 75325cc3 Carles
plot_clusters: Plots the clustered data casting a color to every cluster.
13 75325cc3 Carles
clustering: Directs the clustering process by calling the relevant functions.
14 75325cc3 Carles
"""
15 75325cc3 Carles
import logging
16 75325cc3 Carles
17 75325cc3 Carles
import hdbscan
18 75325cc3 Carles
import numpy as np
19 75325cc3 Carles
20 75325cc3 Carles
logger = logging.getLogger('DockOnSurf')
21 75325cc3 Carles
22 75325cc3 Carles
23 75325cc3 Carles
def get_labels_affty(affty_mtx, kind="rmsd"):
24 75325cc3 Carles
    """Clusters data in affinity matrix form by assigning labels to data points.
25 75325cc3 Carles

26 75325cc3 Carles
    @param affty_mtx: Data to be clustered, it must be an affinity matrix.
27 75325cc3 Carles
    (Eg. Euclidean distances between points, RMSD Matrix, etc.).
28 75325cc3 Carles
    Shape: [n_points, n_points]
29 75325cc3 Carles
    @param kind: Which kind of data the affinity matrix contains.
30 75325cc3 Carles
    @return: list of cluster labels. Every data point is assigned a number
31 75325cc3 Carles
    corresponding to the cluster it belongs to.
32 75325cc3 Carles
    """
33 75325cc3 Carles
    if np.average(affty_mtx) < 1e-3 and kind == "rmsd":
34 75325cc3 Carles
        sing_clust = True
35 75325cc3 Carles
        min_size = int(len(affty_mtx) / 2)
36 75325cc3 Carles
    else:
37 75325cc3 Carles
        sing_clust = False
38 75325cc3 Carles
        min_size = 20
39 75325cc3 Carles
    hdbs = hdbscan.HDBSCAN(metric="precomputed",
40 75325cc3 Carles
                           min_samples=5,
41 75325cc3 Carles
                           min_cluster_size=min_size,
42 75325cc3 Carles
                           allow_single_cluster=sing_clust)
43 75325cc3 Carles
    return hdbs.fit_predict(affty_mtx)
44 75325cc3 Carles
45 75325cc3 Carles
46 75325cc3 Carles
def get_labels_vector():
47 75325cc3 Carles
    """Clusters data in vectorial form by assigning labels to data points.
48 75325cc3 Carles

49 75325cc3 Carles
    @return: list of cluster labels. Every data point is assigned a number
50 75325cc3 Carles
    corresponding to the cluster it belongs to.
51 75325cc3 Carles
    """
52 75325cc3 Carles
    return []
53 75325cc3 Carles
54 75325cc3 Carles
55 75325cc3 Carles
def get_clusters(labels):
56 75325cc3 Carles
    """Groups data-points belonging to the same cluster into arrays of indices.
57 75325cc3 Carles

58 75325cc3 Carles
    @param labels: list of cluster labels (numbers) corresponding to the cluster
59 75325cc3 Carles
    it belongs to.
60 75325cc3 Carles
    @return: tuple of arrays. Every array contains the indices (relative to the
61 75325cc3 Carles
    labels list) of the data points belonging to the same cluster.
62 75325cc3 Carles
    """
63 75325cc3 Carles
    n_clusters = max(labels) + 1
64 75325cc3 Carles
    return tuple(np.where(labels == clust_num)[0]
65 75325cc3 Carles
                 for clust_num in range(n_clusters))
66 75325cc3 Carles
67 75325cc3 Carles
68 75325cc3 Carles
def get_exemplars_affty(affty_mtx, clusters):
69 75325cc3 Carles
    """Computes the exemplars for every cluster and returns a list of indices.
70 75325cc3 Carles

71 75325cc3 Carles
    @param affty_mtx: Data structured in form of affinity matrix. eg. Euclidean
72 75325cc3 Carles
    distances between points, RMSD Matrix, etc.) shape: [n_points, n_points].
73 75325cc3 Carles
    @param clusters: tuple of arrays. Every array contains the indices (relative
74 75325cc3 Carles
    to the affinity matrix) of the data points belonging to the same cluster.
75 75325cc3 Carles
    @return: list of indices (relative to the affinity matrix) exemplars for
76 75325cc3 Carles
    every cluster.
77 75325cc3 Carles
    """
78 75325cc3 Carles
    from sklearn.cluster import AffinityPropagation
79 75325cc3 Carles
    clust_affty_mtcs = tuple(affty_mtx[np.ix_(clust, clust)]
80 75325cc3 Carles
                             for clust in clusters)
81 75325cc3 Carles
    exemplars = []
82 75325cc3 Carles
    for i, mtx in enumerate(clust_affty_mtcs):
83 75325cc3 Carles
        pref = -1e6 * np.max(np.abs(mtx))
84 75325cc3 Carles
        af = AffinityPropagation(affinity='precomputed',
85 75325cc3 Carles
                                 preference=pref,
86 75325cc3 Carles
                                 damping=0.95,
87 75325cc3 Carles
                                 max_iter=2000).fit(mtx)
88 75325cc3 Carles
        exemplars.append(clusters[i][af.cluster_centers_indices_[0]])
89 75325cc3 Carles
    return exemplars
90 75325cc3 Carles
91 75325cc3 Carles
92 75325cc3 Carles
def plot_clusters(labels, x, y, exemplars=None, save=True):
93 75325cc3 Carles
    """Plots the clustered data casting a color to every cluster.
94 75325cc3 Carles

95 75325cc3 Carles
    @param labels: list of cluster labels (numbers) corresponding to the cluster
96 75325cc3 Carles
    it belongs to.
97 75325cc3 Carles
    @param x: list of data of the x axis.
98 75325cc3 Carles
    @param y: list of data of the y axis.
99 75325cc3 Carles
    @param exemplars: list of data point indices (relative to the labels list)
100 75325cc3 Carles
    considered as cluster exemplars.
101 75325cc3 Carles
    @param save: bool, Whether to save the generated plot into a file or not.
102 75325cc3 Carles
    (in the latter case the plot is shown in a new window)
103 75325cc3 Carles
    """
104 75325cc3 Carles
    import matplotlib.pyplot as plt
105 75325cc3 Carles
    from matplotlib import cm, colors
106 75325cc3 Carles
107 75325cc3 Carles
    n_clusters = max(labels) + 1
108 75325cc3 Carles
    rb = cm.get_cmap('gist_rainbow', max(n_clusters, 1))
109 75325cc3 Carles
    rb.set_under()
110 75325cc3 Carles
    plt.figure(figsize=(10, 8))
111 75325cc3 Carles
    for i in range(len(labels)):
112 75325cc3 Carles
        plt.plot(x[i], y[i], c=rb(labels[i]), marker='.')
113 75325cc3 Carles
        if i == exemplars[labels[i]]:
114 75325cc3 Carles
            plt.plot(x[i], y[i], c=rb(labels[i]), marker="x",
115 75325cc3 Carles
                     markersize=15,
116 75325cc3 Carles
                     label=f"Exemplar cluster {labels[i]}")
117 75325cc3 Carles
    plt.title(f'Found {n_clusters} Clusters.')
118 75325cc3 Carles
    plt.xlabel("Energy")
119 75325cc3 Carles
    plt.ylabel("MOI")
120 75325cc3 Carles
    plt.legend()
121 75325cc3 Carles
122 75325cc3 Carles
    bounds = list(range(max(n_clusters, 1)))
123 75325cc3 Carles
    norm = colors.Normalize(vmin=min(labels), vmax=max(labels))
124 75325cc3 Carles
    plt.colorbar(cm.ScalarMappable(norm=norm, cmap=rb), ticks=bounds)
125 75325cc3 Carles
    if save:
126 75325cc3 Carles
        plt.savefig(f'clusters.png')
127 75325cc3 Carles
        plt.close("all")
128 75325cc3 Carles
    else:
129 75325cc3 Carles
        plt.show()
130 75325cc3 Carles
131 75325cc3 Carles
132 75325cc3 Carles
def clustering(data, debug=False, x=None, y=None):
133 75325cc3 Carles
    """Directs the clustering process by calling the relevant functions.
134 75325cc3 Carles

135 75325cc3 Carles
    @param data: The data to be clustered. It must be stored in vector form
136 75325cc3 Carles
    [n_features, n_samples] or in affinity matrix form [n_samples, n_samples],
137 75325cc3 Carles
    symmetric and 0 in the main diagonal. (Eg. Euclidean distances between
138 75325cc3 Carles
    points, RMSD Matrix, etc.).
139 75325cc3 Carles
    @param debug: bool, Whether to report debug information and plot the
140 75325cc3 Carles
    clustered data.
141 75325cc3 Carles
    @param x: Necessary only if debug is turned on. X values to plot the data.
142 75325cc3 Carles
    @param y: Necessary only if debug is turned on. X values to plot the data.
143 75325cc3 Carles
    @return: list of exemplars, list of indices (relative to data)
144 75325cc3 Carles
    exemplars for every cluster.
145 75325cc3 Carles
    """
146 75325cc3 Carles
    from collections.abc import Iterable
147 75325cc3 Carles
148 75325cc3 Carles
    data_err = "Data must be stored in vector form [n_features, n_samples] or" \
149 75325cc3 Carles
               "in affinity matrix form [n_samples, n_samples]: symmetric " \
150 75325cc3 Carles
               "and 0 in the main diagonal. Eg. RMSD matrix"
151 75325cc3 Carles
    debug_err = "On debug mode x and y should be provided"
152 75325cc3 Carles
153 75325cc3 Carles
    if debug and not (isinstance(x, Iterable) and isinstance(y, Iterable)):
154 75325cc3 Carles
        logger.error(debug_err)
155 75325cc3 Carles
        raise ValueError(debug_err)
156 75325cc3 Carles
    if not isinstance(data, np.ndarray):
157 75325cc3 Carles
        data = np.array(data)
158 75325cc3 Carles
    if len(data.shape) != 2:
159 75325cc3 Carles
        logger.error(data_err)
160 75325cc3 Carles
        raise ValueError(data_err)
161 75325cc3 Carles
162 75325cc3 Carles
    if data.shape[0] == data.shape[1] \
163 75325cc3 Carles
            and (np.tril(data).T == np.triu(data)).all():
164 75325cc3 Carles
        logger.info("Clustering using affinity matrix")
165 75325cc3 Carles
        labels = get_labels_affty(data)
166 db7349af Carles
        if max(labels) == -1:
167 db7349af Carles
            logger.error('Clustering of conformers did not converge. Try '
168 db7349af Carles
                         "setting a smaller 'min_samples' parameter")
169 75325cc3 Carles
        clusters = get_clusters(labels)
170 75325cc3 Carles
        exemplars = get_exemplars_affty(data, clusters)
171 75325cc3 Carles
        if debug:
172 75325cc3 Carles
            plot_clusters(labels, x, y, exemplars, save=True)
173 680944e4 Carles
        logger.info(f'Conformers are grouped in {len(exemplars)} clusters.')
174 75325cc3 Carles
        return exemplars
175 75325cc3 Carles
    else:
176 75325cc3 Carles
        pass