Statistiques
| Branche: | Tag: | Révision :

dockonsurf / modules / clustering.py @ 8279a51d

Historique | Voir | Annoter | Télécharger (8,74 ko)

1 75325cc3 Carles
"""Functions to cluster structures.
2 75325cc3 Carles

3 75325cc3 Carles
functions:
4 75325cc3 Carles
get_labels_affty: Clusters data in affinity matrix form by assigning labels to
5 75325cc3 Carles
data points.
6 75325cc3 Carles
get_labels_vector: Clusters data in vectorial form by assigning labels to
7 75325cc3 Carles
data points.
8 75325cc3 Carles
get_clusters: Groups data-points belonging to the same cluster into arrays of
9 75325cc3 Carles
indices.
10 75325cc3 Carles
get_exemplars_affty: Computes the exemplars for every cluster and returns a list
11 75325cc3 Carles
of indices.
12 75325cc3 Carles
plot_clusters: Plots the clustered data casting a color to every cluster.
13 75325cc3 Carles
clustering: Directs the clustering process by calling the relevant functions.
14 75325cc3 Carles
"""
15 75325cc3 Carles
import logging
16 f6431316 Carles Marti
import warnings
17 75325cc3 Carles
18 75325cc3 Carles
import hdbscan
19 75325cc3 Carles
import numpy as np
20 75325cc3 Carles
21 75325cc3 Carles
logger = logging.getLogger('DockOnSurf')
22 75325cc3 Carles
23 75325cc3 Carles
24 fff3aab1 Carles
def get_rmsd(mol_list: list, remove_Hs="c"):
25 fff3aab1 Carles
    """Computes the rmsd matrix of the conformers in a rdkit mol object.
26 fff3aab1 Carles

27 fff3aab1 Carles
    @param mol_list: list of rdkit mol objects containing the conformers.
28 fff3aab1 Carles
    @param remove_Hs: bool or str,
29 fff3aab1 Carles
    @return rmsd_matrix: Matrix containing the rmsd values of every pair of
30 fff3aab1 Carles
    conformers.
31 fff3aab1 Carles

32 fff3aab1 Carles
    The RMSD values of every pair of conformers is computed, stored in matrix
33 fff3aab1 Carles
    form and returned back. The calculation of rmsd values can take into
34 fff3aab1 Carles
    account all hydrogens, none, or only the ones not linked to carbon atoms.
35 fff3aab1 Carles
    """
36 fff3aab1 Carles
    import rdkit.Chem.AllChem as Chem
37 fff3aab1 Carles
38 fff3aab1 Carles
    if len(mol_list) < 2:
39 fff3aab1 Carles
        err = "The provided molecule has less than 2 conformers"
40 fff3aab1 Carles
        logger.error(err)
41 fff3aab1 Carles
        raise ValueError(err)
42 fff3aab1 Carles
43 fff3aab1 Carles
    if not remove_Hs:
44 fff3aab1 Carles
        pass
45 fff3aab1 Carles
    elif remove_Hs or remove_Hs.lower() == "all":
46 fff3aab1 Carles
        mol_list = [Chem.RemoveHs(mol) for mol in mol_list]
47 fff3aab1 Carles
    elif remove_Hs.lower() == "c":
48 fff3aab1 Carles
        from isolated import remove_C_linked_Hs
49 fff3aab1 Carles
        mol_list = [remove_C_linked_Hs(mol) for mol in mol_list]
50 fff3aab1 Carles
    else:
51 fff3aab1 Carles
        err = "remove_Hs value does not have an acceptable value"
52 fff3aab1 Carles
        logger.error(err)
53 fff3aab1 Carles
        raise ValueError(err)
54 fff3aab1 Carles
55 fff3aab1 Carles
    num_confs = len(mol_list)
56 fff3aab1 Carles
    conf_ids = list(range(num_confs))
57 fff3aab1 Carles
    rmsd_mtx = np.zeros((num_confs, num_confs))
58 f8c4eafe Carles
    for id1 in conf_ids:  # TODO reduce RMSD precision
59 fff3aab1 Carles
        for id2 in conf_ids[id1 + 1:]:
60 fff3aab1 Carles
            rmsd = Chem.GetBestRMS(mol_list[id1], mol_list[id2])
61 fff3aab1 Carles
            rmsd_mtx[id1][id2] = rmsd
62 fff3aab1 Carles
            rmsd_mtx[id2][id1] = rmsd
63 fff3aab1 Carles
64 fff3aab1 Carles
    return rmsd_mtx
65 fff3aab1 Carles
66 fff3aab1 Carles
67 75325cc3 Carles
def get_labels_affty(affty_mtx, kind="rmsd"):
68 75325cc3 Carles
    """Clusters data in affinity matrix form by assigning labels to data points.
69 75325cc3 Carles

70 75325cc3 Carles
    @param affty_mtx: Data to be clustered, it must be an affinity matrix.
71 75325cc3 Carles
    (Eg. Euclidean distances between points, RMSD Matrix, etc.).
72 75325cc3 Carles
    Shape: [n_points, n_points]
73 75325cc3 Carles
    @param kind: Which kind of data the affinity matrix contains.
74 75325cc3 Carles
    @return: list of cluster labels. Every data point is assigned a number
75 75325cc3 Carles
    corresponding to the cluster it belongs to.
76 75325cc3 Carles
    """
77 75325cc3 Carles
    if np.average(affty_mtx) < 1e-3 and kind == "rmsd":
78 75325cc3 Carles
        sing_clust = True
79 75325cc3 Carles
        min_size = int(len(affty_mtx) / 2)
80 75325cc3 Carles
    else:
81 75325cc3 Carles
        sing_clust = False
82 75325cc3 Carles
        min_size = 20
83 75325cc3 Carles
    hdbs = hdbscan.HDBSCAN(metric="precomputed",
84 75325cc3 Carles
                           min_samples=5,
85 75325cc3 Carles
                           min_cluster_size=min_size,
86 75325cc3 Carles
                           allow_single_cluster=sing_clust)
87 75325cc3 Carles
    return hdbs.fit_predict(affty_mtx)
88 75325cc3 Carles
89 75325cc3 Carles
90 75325cc3 Carles
def get_labels_vector():
91 75325cc3 Carles
    """Clusters data in vectorial form by assigning labels to data points.
92 75325cc3 Carles

93 75325cc3 Carles
    @return: list of cluster labels. Every data point is assigned a number
94 75325cc3 Carles
    corresponding to the cluster it belongs to.
95 75325cc3 Carles
    """
96 75325cc3 Carles
    return []
97 75325cc3 Carles
98 75325cc3 Carles
99 75325cc3 Carles
def get_clusters(labels):
100 75325cc3 Carles
    """Groups data-points belonging to the same cluster into arrays of indices.
101 75325cc3 Carles

102 75325cc3 Carles
    @param labels: list of cluster labels (numbers) corresponding to the cluster
103 75325cc3 Carles
    it belongs to.
104 75325cc3 Carles
    @return: tuple of arrays. Every array contains the indices (relative to the
105 75325cc3 Carles
    labels list) of the data points belonging to the same cluster.
106 75325cc3 Carles
    """
107 75325cc3 Carles
    n_clusters = max(labels) + 1
108 75325cc3 Carles
    return tuple(np.where(labels == clust_num)[0]
109 75325cc3 Carles
                 for clust_num in range(n_clusters))
110 75325cc3 Carles
111 75325cc3 Carles
112 75325cc3 Carles
def get_exemplars_affty(affty_mtx, clusters):
113 75325cc3 Carles
    """Computes the exemplars for every cluster and returns a list of indices.
114 75325cc3 Carles

115 75325cc3 Carles
    @param affty_mtx: Data structured in form of affinity matrix. eg. Euclidean
116 75325cc3 Carles
    distances between points, RMSD Matrix, etc.) shape: [n_points, n_points].
117 75325cc3 Carles
    @param clusters: tuple of arrays. Every array contains the indices (relative
118 75325cc3 Carles
    to the affinity matrix) of the data points belonging to the same cluster.
119 75325cc3 Carles
    @return: list of indices (relative to the affinity matrix) exemplars for
120 75325cc3 Carles
    every cluster.
121 75325cc3 Carles
    """
122 75325cc3 Carles
    from sklearn.cluster import AffinityPropagation
123 75325cc3 Carles
    clust_affty_mtcs = tuple(affty_mtx[np.ix_(clust, clust)]
124 75325cc3 Carles
                             for clust in clusters)
125 75325cc3 Carles
    exemplars = []
126 75325cc3 Carles
    for i, mtx in enumerate(clust_affty_mtcs):
127 75325cc3 Carles
        pref = -1e6 * np.max(np.abs(mtx))
128 8e31f746 Carles Marti
        warnings.filterwarnings("error")
129 f6431316 Carles Marti
        try:
130 f6431316 Carles Marti
            af = AffinityPropagation(affinity='precomputed', preference=pref,
131 f6431316 Carles Marti
                                     damping=0.95, max_iter=2000,
132 f6431316 Carles Marti
                                     random_state=None).fit(mtx)
133 f6431316 Carles Marti
        except UserWarning as w:
134 f6431316 Carles Marti
            logger.warning(str(w))
135 75325cc3 Carles
        exemplars.append(clusters[i][af.cluster_centers_indices_[0]])
136 75325cc3 Carles
    return exemplars
137 75325cc3 Carles
138 75325cc3 Carles
139 75325cc3 Carles
def plot_clusters(labels, x, y, exemplars=None, save=True):
140 75325cc3 Carles
    """Plots the clustered data casting a color to every cluster.
141 75325cc3 Carles

142 75325cc3 Carles
    @param labels: list of cluster labels (numbers) corresponding to the cluster
143 75325cc3 Carles
    it belongs to.
144 75325cc3 Carles
    @param x: list of data of the x axis.
145 75325cc3 Carles
    @param y: list of data of the y axis.
146 75325cc3 Carles
    @param exemplars: list of data point indices (relative to the labels list)
147 75325cc3 Carles
    considered as cluster exemplars.
148 75325cc3 Carles
    @param save: bool, Whether to save the generated plot into a file or not.
149 75325cc3 Carles
    (in the latter case the plot is shown in a new window)
150 75325cc3 Carles
    """
151 75325cc3 Carles
    import matplotlib.pyplot as plt
152 75325cc3 Carles
    from matplotlib import cm, colors
153 75325cc3 Carles
154 75325cc3 Carles
    n_clusters = max(labels) + 1
155 75325cc3 Carles
    rb = cm.get_cmap('gist_rainbow', max(n_clusters, 1))
156 75325cc3 Carles
    rb.set_under()
157 75325cc3 Carles
    plt.figure(figsize=(10, 8))
158 75325cc3 Carles
    for i in range(len(labels)):
159 75325cc3 Carles
        plt.plot(x[i], y[i], c=rb(labels[i]), marker='.')
160 f8c4eafe Carles
        if len(exemplars) > 0 and i == exemplars[labels[i]]:
161 75325cc3 Carles
            plt.plot(x[i], y[i], c=rb(labels[i]), marker="x",
162 75325cc3 Carles
                     markersize=15,
163 75325cc3 Carles
                     label=f"Exemplar cluster {labels[i]}")
164 75325cc3 Carles
    plt.title(f'Found {n_clusters} Clusters.')
165 75325cc3 Carles
    plt.xlabel("Energy")
166 75325cc3 Carles
    plt.ylabel("MOI")
167 75325cc3 Carles
    plt.legend()
168 75325cc3 Carles
169 75325cc3 Carles
    bounds = list(range(max(n_clusters, 1)))
170 75325cc3 Carles
    norm = colors.Normalize(vmin=min(labels), vmax=max(labels))
171 75325cc3 Carles
    plt.colorbar(cm.ScalarMappable(norm=norm, cmap=rb), ticks=bounds)
172 75325cc3 Carles
    if save:
173 05464650 Carles Marti
        from modules.utilities import check_bak
174 05464650 Carles Marti
        check_bak('clusters.png')
175 05464650 Carles Marti
        plt.savefig('clusters.png')
176 75325cc3 Carles
        plt.close("all")
177 75325cc3 Carles
    else:
178 75325cc3 Carles
        plt.show()
179 75325cc3 Carles
180 75325cc3 Carles
181 05464650 Carles Marti
def clustering(data, plot=False, x=None, y=None):
182 75325cc3 Carles
    """Directs the clustering process by calling the relevant functions.
183 75325cc3 Carles

184 75325cc3 Carles
    @param data: The data to be clustered. It must be stored in vector form
185 75325cc3 Carles
    [n_features, n_samples] or in affinity matrix form [n_samples, n_samples],
186 75325cc3 Carles
    symmetric and 0 in the main diagonal. (Eg. Euclidean distances between
187 75325cc3 Carles
    points, RMSD Matrix, etc.).
188 05464650 Carles Marti
    @param plot: bool, Whether to plot the clustered data.
189 05464650 Carles Marti
    @param x: Necessary only if plot is turned on. X values to plot the data.
190 05464650 Carles Marti
    @param y: Necessary only if plot is turned on. Y values to plot the data.
191 75325cc3 Carles
    @return: list of exemplars, list of indices (relative to data)
192 75325cc3 Carles
    exemplars for every cluster.
193 75325cc3 Carles
    """
194 75325cc3 Carles
    from collections.abc import Iterable
195 75325cc3 Carles
196 75325cc3 Carles
    data_err = "Data must be stored in vector form [n_features, n_samples] or" \
197 75325cc3 Carles
               "in affinity matrix form [n_samples, n_samples]: symmetric " \
198 75325cc3 Carles
               "and 0 in the main diagonal. Eg. RMSD matrix"
199 75325cc3 Carles
    debug_err = "On debug mode x and y should be provided"
200 75325cc3 Carles
201 05464650 Carles Marti
    if plot and not (isinstance(x, Iterable) and isinstance(y, Iterable)):
202 75325cc3 Carles
        logger.error(debug_err)
203 75325cc3 Carles
        raise ValueError(debug_err)
204 75325cc3 Carles
    if not isinstance(data, np.ndarray):
205 75325cc3 Carles
        data = np.array(data)
206 75325cc3 Carles
    if len(data.shape) != 2:
207 75325cc3 Carles
        logger.error(data_err)
208 75325cc3 Carles
        raise ValueError(data_err)
209 75325cc3 Carles
210 75325cc3 Carles
    if data.shape[0] == data.shape[1] \
211 75325cc3 Carles
            and (np.tril(data).T == np.triu(data)).all():
212 695dcff8 Carles Marti
        logger.info("Clustering using affinity matrix.")
213 75325cc3 Carles
        labels = get_labels_affty(data)
214 db7349af Carles
        if max(labels) == -1:
215 d0939bb2 Carles Marti
            logger.warning('Clustering of conformers did not converge. Try '
216 695dcff8 Carles Marti
                           "setting a smaller 'min_samples' parameter.")
217 d0939bb2 Carles Marti
            exemplars = list(range(data.shape[0]))
218 d0939bb2 Carles Marti
        else:
219 d0939bb2 Carles Marti
            clusters = get_clusters(labels)
220 d0939bb2 Carles Marti
            exemplars = get_exemplars_affty(data, clusters)
221 05464650 Carles Marti
        if plot:
222 75325cc3 Carles
            plot_clusters(labels, x, y, exemplars, save=True)
223 680944e4 Carles
        logger.info(f'Conformers are grouped in {len(exemplars)} clusters.')
224 75325cc3 Carles
        return exemplars
225 75325cc3 Carles
    else:
226 f8c4eafe Carles
        not_impl_err = 'Clustering not yet implemented for vectorized data'
227 f8c4eafe Carles
        logger.error(not_impl_err)
228 f8c4eafe Carles
        raise NotImplementedError(not_impl_err)