Statistiques
| Branche: | Tag: | Révision :

dockonsurf / modules / clustering.py @ 4e82c425

Historique | Voir | Annoter | Télécharger (9,37 ko)

1 75325cc3 Carles
"""Functions to cluster structures.
2 75325cc3 Carles

3 75325cc3 Carles
functions:
4 4e82c425 Carles Martí
get_rmsd: Computes the rmsd matrix of the conformers in a list of rdkit mol
5 4e82c425 Carles Martí
    objects.
6 75325cc3 Carles
get_labels_affty: Clusters data in affinity matrix form by assigning labels to
7 4e82c425 Carles Martí
    data points.
8 75325cc3 Carles
get_labels_vector: Clusters data in vectorial form by assigning labels to
9 4e82c425 Carles Martí
    data points.
10 75325cc3 Carles
get_clusters: Groups data-points belonging to the same cluster into arrays of
11 4e82c425 Carles Martí
    indices.
12 75325cc3 Carles
get_exemplars_affty: Computes the exemplars for every cluster and returns a list
13 4e82c425 Carles Martí
    of indices.
14 75325cc3 Carles
plot_clusters: Plots the clustered data casting a color to every cluster.
15 75325cc3 Carles
clustering: Directs the clustering process by calling the relevant functions.
16 75325cc3 Carles
"""
17 75325cc3 Carles
import logging
18 75325cc3 Carles
19 75325cc3 Carles
import hdbscan
20 75325cc3 Carles
import numpy as np
21 75325cc3 Carles
22 75325cc3 Carles
logger = logging.getLogger('DockOnSurf')
23 75325cc3 Carles
24 75325cc3 Carles
25 fff3aab1 Carles
def get_rmsd(mol_list: list, remove_Hs="c"):
26 4e82c425 Carles Martí
    """Computes the rmsd matrix of the conformers in a list of rdkit mol objects
27 fff3aab1 Carles

28 fff3aab1 Carles
    @param mol_list: list of rdkit mol objects containing the conformers.
29 fff3aab1 Carles
    @param remove_Hs: bool or str,
30 fff3aab1 Carles
    @return rmsd_matrix: Matrix containing the rmsd values of every pair of
31 fff3aab1 Carles
    conformers.
32 fff3aab1 Carles

33 fff3aab1 Carles
    The RMSD values of every pair of conformers is computed, stored in matrix
34 fff3aab1 Carles
    form and returned back. The calculation of rmsd values can take into
35 fff3aab1 Carles
    account all hydrogens, none, or only the ones not linked to carbon atoms.
36 fff3aab1 Carles
    """
37 fff3aab1 Carles
    import rdkit.Chem.AllChem as Chem
38 fff3aab1 Carles
39 fff3aab1 Carles
    if len(mol_list) < 2:
40 fff3aab1 Carles
        err = "The provided molecule has less than 2 conformers"
41 fff3aab1 Carles
        logger.error(err)
42 fff3aab1 Carles
        raise ValueError(err)
43 fff3aab1 Carles
44 fff3aab1 Carles
    if not remove_Hs:
45 fff3aab1 Carles
        pass
46 fff3aab1 Carles
    elif remove_Hs or remove_Hs.lower() == "all":
47 fff3aab1 Carles
        mol_list = [Chem.RemoveHs(mol) for mol in mol_list]
48 fff3aab1 Carles
    elif remove_Hs.lower() == "c":
49 c492296f Carles Martí
        from modules.isolated import remove_C_linked_Hs
50 fff3aab1 Carles
        mol_list = [remove_C_linked_Hs(mol) for mol in mol_list]
51 fff3aab1 Carles
    else:
52 fff3aab1 Carles
        err = "remove_Hs value does not have an acceptable value"
53 fff3aab1 Carles
        logger.error(err)
54 fff3aab1 Carles
        raise ValueError(err)
55 fff3aab1 Carles
56 fff3aab1 Carles
    num_confs = len(mol_list)
57 fff3aab1 Carles
    conf_ids = list(range(num_confs))
58 fff3aab1 Carles
    rmsd_mtx = np.zeros((num_confs, num_confs))
59 f8c4eafe Carles
    for id1 in conf_ids:  # TODO reduce RMSD precision
60 fff3aab1 Carles
        for id2 in conf_ids[id1 + 1:]:
61 fff3aab1 Carles
            rmsd = Chem.GetBestRMS(mol_list[id1], mol_list[id2])
62 fff3aab1 Carles
            rmsd_mtx[id1][id2] = rmsd
63 fff3aab1 Carles
            rmsd_mtx[id2][id1] = rmsd
64 fff3aab1 Carles
65 fff3aab1 Carles
    return rmsd_mtx
66 fff3aab1 Carles
67 fff3aab1 Carles
68 75325cc3 Carles
def get_labels_affty(affty_mtx, kind="rmsd"):
69 75325cc3 Carles
    """Clusters data in affinity matrix form by assigning labels to data points.
70 75325cc3 Carles

71 75325cc3 Carles
    @param affty_mtx: Data to be clustered, it must be an affinity matrix.
72 75325cc3 Carles
    (Eg. Euclidean distances between points, RMSD Matrix, etc.).
73 75325cc3 Carles
    Shape: [n_points, n_points]
74 75325cc3 Carles
    @param kind: Which kind of data the affinity matrix contains.
75 75325cc3 Carles
    @return: list of cluster labels. Every data point is assigned a number
76 75325cc3 Carles
    corresponding to the cluster it belongs to.
77 75325cc3 Carles
    """
78 75325cc3 Carles
    if np.average(affty_mtx) < 1e-3 and kind == "rmsd":
79 75325cc3 Carles
        sing_clust = True
80 75325cc3 Carles
        min_size = int(len(affty_mtx) / 2)
81 75325cc3 Carles
    else:
82 75325cc3 Carles
        sing_clust = False
83 75325cc3 Carles
        min_size = 20
84 75325cc3 Carles
    hdbs = hdbscan.HDBSCAN(metric="precomputed",
85 75325cc3 Carles
                           min_samples=5,
86 75325cc3 Carles
                           min_cluster_size=min_size,
87 75325cc3 Carles
                           allow_single_cluster=sing_clust)
88 75325cc3 Carles
    return hdbs.fit_predict(affty_mtx)
89 75325cc3 Carles
90 75325cc3 Carles
91 75325cc3 Carles
def get_labels_vector():
92 75325cc3 Carles
    """Clusters data in vectorial form by assigning labels to data points.
93 75325cc3 Carles

94 75325cc3 Carles
    @return: list of cluster labels. Every data point is assigned a number
95 75325cc3 Carles
    corresponding to the cluster it belongs to.
96 75325cc3 Carles
    """
97 4e82c425 Carles Martí
    # TODO Implement it.
98 75325cc3 Carles
    return []
99 75325cc3 Carles
100 75325cc3 Carles
101 75325cc3 Carles
def get_clusters(labels):
102 75325cc3 Carles
    """Groups data-points belonging to the same cluster into arrays of indices.
103 75325cc3 Carles

104 75325cc3 Carles
    @param labels: list of cluster labels (numbers) corresponding to the cluster
105 75325cc3 Carles
    it belongs to.
106 75325cc3 Carles
    @return: tuple of arrays. Every array contains the indices (relative to the
107 75325cc3 Carles
    labels list) of the data points belonging to the same cluster.
108 75325cc3 Carles
    """
109 75325cc3 Carles
    n_clusters = max(labels) + 1
110 75325cc3 Carles
    return tuple(np.where(labels == clust_num)[0]
111 75325cc3 Carles
                 for clust_num in range(n_clusters))
112 75325cc3 Carles
113 75325cc3 Carles
114 75325cc3 Carles
def get_exemplars_affty(affty_mtx, clusters):
115 75325cc3 Carles
    """Computes the exemplars for every cluster and returns a list of indices.
116 75325cc3 Carles

117 75325cc3 Carles
    @param affty_mtx: Data structured in form of affinity matrix. eg. Euclidean
118 75325cc3 Carles
    distances between points, RMSD Matrix, etc.) shape: [n_points, n_points].
119 75325cc3 Carles
    @param clusters: tuple of arrays. Every array contains the indices (relative
120 75325cc3 Carles
    to the affinity matrix) of the data points belonging to the same cluster.
121 4e82c425 Carles Martí
    @return: list of indices (relative to the affinity matrix) of the exemplars
122 4e82c425 Carles Martí
    for every cluster.
123 4e82c425 Carles Martí

124 4e82c425 Carles Martí
    This function finds the exemplars of already clusterized data. It does
125 4e82c425 Carles Martí
    that by (i) building a rmsd matrix for each existing cluster with the values
126 4e82c425 Carles Martí
    of the total RMSD matrix (ii) carrying out an actual clustering for each
127 4e82c425 Carles Martí
    cluster-specific matrix using a set of parameters (large negative value of
128 4e82c425 Carles Martí
    preference) such that it always finds only one cluster and (iii) it then
129 4e82c425 Carles Martí
    calculates the exemplar for the matrix.
130 75325cc3 Carles
    """
131 75325cc3 Carles
    from sklearn.cluster import AffinityPropagation
132 4e82c425 Carles Martí
    # Splits Total RMSD matrix into cluster-specific RMSD matrices.
133 75325cc3 Carles
    clust_affty_mtcs = tuple(affty_mtx[np.ix_(clust, clust)]
134 75325cc3 Carles
                             for clust in clusters)
135 75325cc3 Carles
    exemplars = []
136 4e82c425 Carles Martí
    # Carries out the forced-to-converge-to-1 clustering for each already
137 4e82c425 Carles Martí
    # existing cluster rmsd matrix and calculates the exemplar.
138 75325cc3 Carles
    for i, mtx in enumerate(clust_affty_mtcs):
139 75325cc3 Carles
        pref = -1e6 * np.max(np.abs(mtx))
140 78fcb188 Carles Martí
        af = AffinityPropagation(affinity='precomputed', preference=pref,
141 78fcb188 Carles Martí
                                 damping=0.95, max_iter=2000,
142 78fcb188 Carles Martí
                                 random_state=None).fit(mtx)
143 75325cc3 Carles
        exemplars.append(clusters[i][af.cluster_centers_indices_[0]])
144 75325cc3 Carles
    return exemplars
145 75325cc3 Carles
146 75325cc3 Carles
147 75325cc3 Carles
def plot_clusters(labels, x, y, exemplars=None, save=True):
148 75325cc3 Carles
    """Plots the clustered data casting a color to every cluster.
149 75325cc3 Carles

150 75325cc3 Carles
    @param labels: list of cluster labels (numbers) corresponding to the cluster
151 75325cc3 Carles
    it belongs to.
152 75325cc3 Carles
    @param x: list of data of the x axis.
153 75325cc3 Carles
    @param y: list of data of the y axis.
154 75325cc3 Carles
    @param exemplars: list of data point indices (relative to the labels list)
155 75325cc3 Carles
    considered as cluster exemplars.
156 75325cc3 Carles
    @param save: bool, Whether to save the generated plot into a file or not.
157 75325cc3 Carles
    (in the latter case the plot is shown in a new window)
158 75325cc3 Carles
    """
159 75325cc3 Carles
    import matplotlib.pyplot as plt
160 75325cc3 Carles
    from matplotlib import cm, colors
161 75325cc3 Carles
162 75325cc3 Carles
    n_clusters = max(labels) + 1
163 75325cc3 Carles
    rb = cm.get_cmap('gist_rainbow', max(n_clusters, 1))
164 75325cc3 Carles
    rb.set_under()
165 75325cc3 Carles
    plt.figure(figsize=(10, 8))
166 75325cc3 Carles
    for i in range(len(labels)):
167 75325cc3 Carles
        plt.plot(x[i], y[i], c=rb(labels[i]), marker='.')
168 f8c4eafe Carles
        if len(exemplars) > 0 and i == exemplars[labels[i]]:
169 75325cc3 Carles
            plt.plot(x[i], y[i], c=rb(labels[i]), marker="x",
170 75325cc3 Carles
                     markersize=15,
171 75325cc3 Carles
                     label=f"Exemplar cluster {labels[i]}")
172 75325cc3 Carles
    plt.title(f'Found {n_clusters} Clusters.')
173 75325cc3 Carles
    plt.xlabel("Energy")
174 75325cc3 Carles
    plt.ylabel("MOI")
175 75325cc3 Carles
    plt.legend()
176 75325cc3 Carles
177 75325cc3 Carles
    bounds = list(range(max(n_clusters, 1)))
178 75325cc3 Carles
    norm = colors.Normalize(vmin=min(labels), vmax=max(labels))
179 75325cc3 Carles
    plt.colorbar(cm.ScalarMappable(norm=norm, cmap=rb), ticks=bounds)
180 75325cc3 Carles
    if save:
181 05464650 Carles Marti
        from modules.utilities import check_bak
182 05464650 Carles Marti
        check_bak('clusters.png')
183 05464650 Carles Marti
        plt.savefig('clusters.png')
184 75325cc3 Carles
        plt.close("all")
185 75325cc3 Carles
    else:
186 75325cc3 Carles
        plt.show()
187 75325cc3 Carles
188 75325cc3 Carles
189 05464650 Carles Marti
def clustering(data, plot=False, x=None, y=None):
190 75325cc3 Carles
    """Directs the clustering process by calling the relevant functions.
191 75325cc3 Carles

192 75325cc3 Carles
    @param data: The data to be clustered. It must be stored in vector form
193 75325cc3 Carles
    [n_features, n_samples] or in affinity matrix form [n_samples, n_samples],
194 75325cc3 Carles
    symmetric and 0 in the main diagonal. (Eg. Euclidean distances between
195 75325cc3 Carles
    points, RMSD Matrix, etc.).
196 05464650 Carles Marti
    @param plot: bool, Whether to plot the clustered data.
197 05464650 Carles Marti
    @param x: Necessary only if plot is turned on. X values to plot the data.
198 05464650 Carles Marti
    @param y: Necessary only if plot is turned on. Y values to plot the data.
199 75325cc3 Carles
    @return: list of exemplars, list of indices (relative to data)
200 75325cc3 Carles
    exemplars for every cluster.
201 75325cc3 Carles
    """
202 75325cc3 Carles
    from collections.abc import Iterable
203 75325cc3 Carles
204 75325cc3 Carles
    data_err = "Data must be stored in vector form [n_features, n_samples] or" \
205 75325cc3 Carles
               "in affinity matrix form [n_samples, n_samples]: symmetric " \
206 75325cc3 Carles
               "and 0 in the main diagonal. Eg. RMSD matrix"
207 75325cc3 Carles
    debug_err = "On debug mode x and y should be provided"
208 75325cc3 Carles
209 05464650 Carles Marti
    if plot and not (isinstance(x, Iterable) and isinstance(y, Iterable)):
210 75325cc3 Carles
        logger.error(debug_err)
211 75325cc3 Carles
        raise ValueError(debug_err)
212 75325cc3 Carles
    if not isinstance(data, np.ndarray):
213 75325cc3 Carles
        data = np.array(data)
214 75325cc3 Carles
    if len(data.shape) != 2:
215 75325cc3 Carles
        logger.error(data_err)
216 75325cc3 Carles
        raise ValueError(data_err)
217 75325cc3 Carles
218 75325cc3 Carles
    if data.shape[0] == data.shape[1] \
219 75325cc3 Carles
            and (np.tril(data).T == np.triu(data)).all():
220 695dcff8 Carles Marti
        logger.info("Clustering using affinity matrix.")
221 75325cc3 Carles
        labels = get_labels_affty(data)
222 db7349af Carles
        if max(labels) == -1:
223 d0939bb2 Carles Marti
            logger.warning('Clustering of conformers did not converge. Try '
224 695dcff8 Carles Marti
                           "setting a smaller 'min_samples' parameter.")
225 d0939bb2 Carles Marti
            exemplars = list(range(data.shape[0]))
226 d0939bb2 Carles Marti
        else:
227 d0939bb2 Carles Marti
            clusters = get_clusters(labels)
228 d0939bb2 Carles Marti
            exemplars = get_exemplars_affty(data, clusters)
229 05464650 Carles Marti
        if plot:
230 75325cc3 Carles
            plot_clusters(labels, x, y, exemplars, save=True)
231 680944e4 Carles
        logger.info(f'Conformers are grouped in {len(exemplars)} clusters.')
232 75325cc3 Carles
        return exemplars
233 75325cc3 Carles
    else:
234 f8c4eafe Carles
        not_impl_err = 'Clustering not yet implemented for vectorized data'
235 f8c4eafe Carles
        logger.error(not_impl_err)
236 f8c4eafe Carles
        raise NotImplementedError(not_impl_err)