Statistiques
| Branche: | Tag: | Révision :

dockonsurf / modules / clustering.py @ 0db30d07

Historique | Voir | Annoter | Télécharger (6,81 ko)

"""Functions to cluster structures.

functions:
get_labels_affty: Clusters data in affinity matrix form by assigning labels to
data points.
get_labels_vector: Clusters data in vectorial form by assigning labels to
data points.
get_clusters: Groups data-points belonging to the same cluster into arrays of
indices.
get_exemplars_affty: Computes the exemplars for every cluster and returns a list
of indices.
plot_clusters: Plots the clustered data assigning a color to every cluster.
clustering: Directs the clustering process by calling the relevant functions.
"""
15
import logging
16

    
17
import hdbscan
18
import numpy as np
19

    
20
# Module-level logger; every message of this module is emitted under the
# 'DockOnSurf' logger name.
logger = logging.getLogger('DockOnSurf')
21

    
22

    
23
def get_labels_affty(affty_mtx, kind="rmsd"):
    """Clusters data in affinity matrix form by assigning labels to data points.

    The clustering is performed with HDBSCAN on the precomputed matrix. When
    the matrix is of kind "rmsd" and its average value is below 1e-3, a single
    cluster is allowed and the minimum cluster size is half the number of
    points. In any other case a single cluster is not allowed and the minimum
    cluster size is 20.

    @param affty_mtx: Data to be clustered, it must be an affinity matrix.
    (Eg. Euclidean distances between points, RMSD Matrix, etc.).
    Shape: [n_points, n_points]
    @param kind: Which kind of data the affinity matrix contains.
    @return: cluster labels as returned by HDBSCAN.fit_predict. Every data
    point is assigned a number corresponding to the cluster it belongs to.
    """
    if kind == "rmsd" and np.average(affty_mtx) < 1e-3:
        sing_clust = True
        # HDBSCAN refuses a min_cluster_size smaller than 2, which the plain
        # half-size yields for matrices with fewer than 4 points.
        min_size = max(len(affty_mtx) // 2, 2)
    else:
        sing_clust = False
        min_size = 20
    hdbs = hdbscan.HDBSCAN(metric="precomputed",
                           min_samples=5,
                           min_cluster_size=min_size,
                           allow_single_cluster=sing_clust)
    return hdbs.fit_predict(affty_mtx)
44

    
45

    
46
def get_labels_vector():
    """Clusters data in vectorial form by assigning labels to data points.

    Placeholder: no clustering is carried out and the returned list of labels
    is always empty.

    @return: list of cluster labels. Every data point is assigned a number
    corresponding to the cluster it belongs to.
    """
    labels = list()
    return labels
53

    
54

    
55
def get_clusters(labels):
    """Groups data-points belonging to the same cluster into arrays of indices.

    Negative labels are not grouped into any cluster: only the labels from 0
    up to the largest label present are considered.

    @param labels: list or array of cluster labels (numbers) corresponding to
    the cluster every data point belongs to.
    @return: tuple of arrays. Every array contains the indices (relative to the
    labels list) of the data points belonging to the same cluster. The tuple is
    empty if labels is empty or contains no label >= 0.
    """
    # A plain list compared against a number yields a single bool instead of
    # an element-wise mask, so the labels are converted to an array first.
    labels = np.asarray(labels)
    if labels.size == 0:
        return ()
    n_clusters = int(labels.max()) + 1
    return tuple(np.where(labels == clust_num)[0]
                 for clust_num in range(n_clusters))
66

    
67

    
68
def get_exemplars_affty(affty_mtx, clusters):
    """Computes the exemplars for every cluster and returns a list of indices.

    For every cluster the corresponding sub-matrix is extracted from the
    affinity matrix and fed to scikit-learn's AffinityPropagation with a
    strongly negative preference; the first exemplar found is kept.

    @param affty_mtx: Data structured in form of affinity matrix. eg. Euclidean
    distances between points, RMSD Matrix, etc.) shape: [n_points, n_points].
    @param clusters: tuple of arrays. Every array contains the indices (relative
    to the affinity matrix) of the data points belonging to the same cluster.
    @return: list of indices (relative to the affinity matrix) exemplars for
    every cluster.
    """
    if len(clusters) == 0:
        return []

    from sklearn.cluster import AffinityPropagation

    exemplars = []
    for clust in clusters:
        # Affinity sub-matrix restricted to the members of this cluster.
        mtx = affty_mtx[np.ix_(clust, clust)]
        pref = -1e6 * np.max(np.abs(mtx))
        # NOTE(review): scikit-learn documents the 'precomputed' input as a
        # similarity matrix (larger value = more alike) while the matrix is
        # passed here as-is; confirm this is intended for distance-like data
        # such as RMSD.
        af = AffinityPropagation(affinity='precomputed',
                                 preference=pref,
                                 damping=0.95,
                                 max_iter=2000).fit(mtx)
        centers = af.cluster_centers_indices_
        if len(centers) > 0:
            local_idx = centers[0]
        else:
            # No exemplar was produced (e.g. the propagation did not
            # converge): fall back to the member whose summed affinity values
            # to the rest of the cluster is the smallest instead of failing
            # with an IndexError.
            logger.warning('Affinity propagation found no exemplar for a '
                           'cluster; using the member with the smallest '
                           'summed affinity instead.')
            local_idx = int(np.argmin(np.sum(mtx, axis=1)))
        exemplars.append(clust[local_idx])
    return exemplars
90

    
91

    
92
def plot_clusters(labels, x, y, exemplars=None, save=True):
    """Plots the clustered data assigning a color to every cluster.

    Every data point is drawn as a dot colored by its label; the exemplar of
    every cluster, if provided, is additionally drawn as a large cross.

    @param labels: list of cluster labels (numbers) corresponding to the cluster
    it belongs to.
    @param x: list of data of the x axis.
    @param y: list of data of the y axis.
    @param exemplars: list of data point indices (relative to the labels list)
    considered as cluster exemplars. If None, no exemplar is highlighted.
    @param save: bool, Whether to save the generated plot into a file
    ('clusters.png') or not. (in the latter case the plot is shown in a new
    window)
    """
    import matplotlib.pyplot as plt
    from matplotlib import cm, colors

    n_clusters = max(labels) + 1
    # pyplot.get_cmap is used because recent matplotlib releases no longer
    # provide cm.get_cmap.
    rb = plt.get_cmap('gist_rainbow', max(n_clusters, 1))
    rb.set_under()
    plt.figure(figsize=(10, 8))
    has_exemplar = False
    for i, label in enumerate(labels):
        color = rb(label)
        plt.plot(x[i], y[i], c=color, marker='.')
        # Negative labels do not belong to any cluster and therefore have no
        # exemplar to compare against.
        if exemplars is not None and label >= 0 and i == exemplars[label]:
            plt.plot(x[i], y[i], c=color, marker="x",
                     markersize=15,
                     label=f"Exemplar cluster {label}")
            has_exemplar = True
    plt.title(f'Found {n_clusters} Clusters.')
    plt.xlabel("Energy")
    plt.ylabel("MOI")
    if has_exemplar:
        # Only the exemplar markers carry a legend label.
        plt.legend()

    bounds = list(range(max(n_clusters, 1)))
    norm = colors.Normalize(vmin=min(labels), vmax=max(labels))
    # The mappable is not attached to any axes, so the axes to take the space
    # for the colorbar from must be given explicitly.
    plt.colorbar(cm.ScalarMappable(norm=norm, cmap=rb), ticks=bounds,
                 ax=plt.gca())
    if save:
        plt.savefig('clusters.png')
        plt.close("all")
    else:
        plt.show()
130

    
131

    
132
def clustering(data, debug=False, x=None, y=None):
    """Directs the clustering process by calling the relevant functions.

    Only data in affinity matrix form (square and symmetric) is clustered.
    For any other two-dimensional data nothing is done and None is returned.

    @param data: The data to be clustered. It must be stored in vector form
    [n_features, n_samples] or in affinity matrix form [n_samples, n_samples],
    symmetric and 0 in the main diagonal. (Eg. Euclidean distances between
    points, RMSD Matrix, etc.).
    @param debug: bool, Whether to report debug information and plot the
    clustered data.
    @param x: Necessary only if debug is turned on. X values to plot the data.
    @param y: Necessary only if debug is turned on. Y values to plot the data.
    @return: list of exemplars, list of indices (relative to data)
    exemplars for every cluster. None if data is not in affinity matrix form.
    @raise ValueError: if debug is on and x or y are not iterables, or if data
    is not two-dimensional.
    """
    from collections.abc import Iterable

    data_err = "Data must be stored in vector form [n_features, n_samples] or " \
               "in affinity matrix form [n_samples, n_samples]: symmetric " \
               "and 0 in the main diagonal. Eg. RMSD matrix"
    debug_err = "On debug mode x and y should be provided"

    if debug and not (isinstance(x, Iterable) and isinstance(y, Iterable)):
        logger.error(debug_err)
        raise ValueError(debug_err)
    if not isinstance(data, np.ndarray):
        data = np.array(data)
    if len(data.shape) != 2:
        logger.error(data_err)
        raise ValueError(data_err)

    # An affinity matrix is recognised by being square and symmetric.
    is_affty_mtx = (data.shape[0] == data.shape[1]
                    and (np.tril(data).T == np.triu(data)).all())
    if not is_affty_mtx:
        # Clustering of data in vector form is not implemented: nothing is
        # computed for it.
        return None

    logger.info("Clustering using affinity matrix")
    labels = get_labels_affty(data)
    if max(labels) == -1:
        # Every point carries the label -1: no cluster was found.
        logger.error('Clustering of conformers did not converge. Try '
                     "setting a smaller 'min_samples' parameter")
    clusters = get_clusters(labels)
    exemplars = get_exemplars_affty(data, clusters)
    if debug:
        plot_clusters(labels, x, y, exemplars, save=True)
    logger.info(f'Conformers are grouped in {len(exemplars)} clusters.')
    return exemplars