Source code for ConservedWaterSearch.hydrogen_orientation

from __future__ import annotations

from typing import TYPE_CHECKING

try:
    import matplotlib.pyplot as plt
except ImportError:
    plt = None

if TYPE_CHECKING:
    try:
        from matplotlib.axes import Axes
        from matplotlib.figure import Figure
    except ImportError:
        Axes, Figure = None, None

import numpy as np
from sklearn.cluster import OPTICS, KMeans


[docs] def hydrogen_orientation_analysis( orientations: np.ndarray, pct_size_buffer: float = 0.85, kmeans_ang_cutoff: float = 120, kmeans_inertia_cutoff: float = 0.4, FCW_angdiff_cutoff: float = 5, FCW_angstd_cutoff: float = 17, min_samp_data_size_pct: float = 0.15, nonFCW_angdiff_cutoff: float = 15, HCW_angstd_cutoff: float = 17, WCW_angstd_cutoff: float = 20, weakly_explained: float = 0.7, xiFCW: tuple[float] | list[float] = (0.03,), xiHCW: tuple[float] | list[float] = (0.05, 0.01), xiWCW: tuple[float] | list[float] = (0.05, 0.001), njobs: int = 1, verbose: int = 0, debugH: int = 0, plotreach: bool = False, which: tuple[str] | list[str] = ("FCW", "HCW", "WCW"), normalize_orientations: bool = True, ) -> list: """Determines if the water cluster is conserved and of what type. High level function that does hydrogen orientation analysis. Checks if the water cluster belongs into one of the following groups by analyzing hydrogen orientations: - FCW (Fully Conserved Water): hydrogens are strongly oriented in two directions with angle of 104.5 - HCW (Half Conserved Water): one set (cluster) of hydrogens is oriented in certain directions and other are spread into different orientations with angle of 104.5 - WCW (Weakly Conserved Water): several orientation combinations exist with satisfying water angles See :cite:`conservedwatersearch2022` for more information on water classification :ref:`conservedwaters_theory_background_methods`. If orientations don't satisfy the criteria for any of the waters, an empty list is returned. Args: orientations (np.ndarray): array of hydrogen orientations in space pct_size_buffer (float, optional): Minimum allowed size of the hydrogen orientation cluster. Defaults to 0.85. kmeans_ang_cutoff (float, optional): Maximum value of angle (in deg) allowed for FCW in kmeans clustering to be considered correct water angle. Defaults to 120. kmeans_inertia_cutoff (float, optional): upper limit allowed on kmeans inertia (measure of spread of data in a cluster). Defaults to 0.4. FCW_angdiff_cutoff (float, optional): Maximum value of angle (in deg) allowed for FCW in OPTICS clustering to be considered correct water angle. Defaults to 5. FCW_angstd_cutoff (float, optional): Maximal standard deviation of angle distribution of orientations of two hydrogens allowed for water to be considered FCW. Defaults to 17. min_samp_data_size_pct (float, optional): Minimum samples to choose for OPTICS clustering as percentage of number of water molecules considered for HCW and WCW. Defaults to 0.15. nonFCW_angdiff_cutoff (float, optional): Maximum standard deviation of angle allowed for HCW and WCW to be considered correct water angle. Defaults to 15. HCW_angstd_cutoff (float, optional): Maximum standard deviation cutoff for WCW angles to be considered correct water angles. Defaults to 17. WCW_angstd_cutoff (float, optional): Maximum standard deviation cutoff for WCW angles to be considered correct water angles. Defaults to 20. weakly_explained (float, optional): percentage of explained hydrogen orientations for water to be considered WCW. Defaults to 0.7. xiFCW (tuple, optional): Xi value for OPTICS clustering for FCW. Don't touch this unless you know what you are doing. Defaults to (0.03). xiHCW (tuple, optional): Xi value for OPTICS clustering for HCW. Don't touch this unless you know what you are doing. Defaults to (0.05, 0.01). xiWCW (tuple, optional): Xi value for OPTICS clustering for WCW. Don't touch this unless you know what you are doing. Defaults to (0.05, 0.001). njobs (int, optional): how many cpu cores to use for clustering. Defaults to 1. verbose (int, optional): verbosity of output. Defaults to 0. debugH (int, optional): debug level for orientations. Defaults to 0. plotreach (bool, optional): weather to plot the reachability plot for OPTICS when debuging. Defaults to False. which (tuple, optional): tuple of strings denoting which water types to search for. Allowed is any combination of FCW (fully conserved waters), HCW (half conserved waters) and WCW (weakly conserved waters). Defaults to ["FCW", "HCW", "WCW"]. normalize_orientations (bool, optional): weather to normalize the orientation vectors to unit distance. Defaults to True. Returns: list: returns a list containing two orientations of hydrogens and water classification string ("FCW", "HCW", "WCW"), if not conserved returns an empty list """ orientations = np.array(orientations) # check length of orientations - it has to be bigger then 1 and even orishape = orientations.shape if len(orishape) != 2: value_error_string = "Orientations have to be a 2D array" raise ValueError(value_error_string) Hnum = orishape[0] dimnum = orishape[1] if dimnum != 3: value_error_string = "Orientations must be vectors of dimension 3" raise ValueError(value_error_string) if Hnum < 2 or Hnum % 2 != 0: value_error_string = ( "Number of orientations must be even!" " Each water molecule has 2 hydrogen atoms!" ) raise ValueError(value_error_string) if normalize_orientations: orientations = orientations / np.linalg.norm( orientations, axis=1, keepdims=True ) # For Debug np.set_printoptions(precision=4) # Check if water is conserved if "FCW" in which: for xi in xiFCW: conserved = find_fully_conserved_orientations( orientations, pct_size_buffer=pct_size_buffer, kmeans_ang_cutoff=kmeans_ang_cutoff, kmeans_inertia_cutoff=kmeans_inertia_cutoff, angdiff_cutoff=FCW_angdiff_cutoff, angstd_cutoff=FCW_angstd_cutoff, xi=xi, njobs=njobs, verbose=verbose, debugH=debugH, plotreach=plotreach, ) if len(conserved) > 0: return conserved if "HCW" in which: for xi in xiHCW: half_conserved = find_half_conserved_orientations( orientations, pct_size_buffer=pct_size_buffer, min_samp_data_size_pct=min_samp_data_size_pct, angdiff_cutoff=nonFCW_angdiff_cutoff, angstd_cutoff=HCW_angstd_cutoff, xi=xi, njobs=njobs, verbose=verbose, debugH=debugH, plotreach=plotreach, ) if len(half_conserved) > 0: return half_conserved if "WCW" in which: for xi in xiWCW: semi_conserved = find_weakly_conserved_orientations( orientations, pct_size_buffer=pct_size_buffer, min_samp_data_size_pct=min_samp_data_size_pct, pct_explained=weakly_explained, angdiff_cutoff=nonFCW_angdiff_cutoff, angstd_cutoff=WCW_angstd_cutoff, xi=xi, njobs=njobs, verbose=verbose, debugH=debugH, plotreach=plotreach, ) if len(semi_conserved): return semi_conserved return []
# I want this to return an array with hydrogen orientations combinations
[docs] def find_fully_conserved_orientations( orientations: np.ndarray, pct_size_buffer: float = 0.85, kmeans_ang_cutoff: float = 120, kmeans_inertia_cutoff: float = 0.4, angdiff_cutoff: float = 5, angstd_cutoff: float = 17.0, xi: float = 0.03, njobs: int = 1, verbose: int = 0, debugH: int = 0, plotreach: bool = False, ) -> list: """Check if orientations belong to FCW. Checks if given oxygen cluster can be considered as a fully conserved water based on hydrogen orientations. Fully conserved water is one which has well defined hydrogen orientations in two distinctive groups (ie strongly hydrogen bonded for both hydrogens). To check if water is conserved, one first checks if k means clustering of hydrogen orientations gives two distinctive clusters with low inertia and required angle between the clusters. Afterwards more rigorous check is carried out with OPTICS clustering where again the spread of orientations and angle is considered. Args: orientations (np.ndarray): array of hydrogen orientations in space pct_size_buffer (float, optional): Minimum allowed size of the hydrogen orientation cluster. Defaults to 0.85. kmeans_ang_cutoff (float, optional): Maximum value of angle (in deg) allowed for FCW in kmeans clustering to be considered correct water angle. Defaults to 120. kmeans_inertia_cutoff (float, optional): upper limit allowed on kmeans inertia (measure of spread of data in a cluster). Defaults to 0.4. angdiff_cutoff (float, optional): Maximum value of angle (in deg) allowed for FCW in OPTICS clustering to be considered correct water angle. Defaults to 5. angstd_cutoff (float, optional): Maximal standard deviation of angle distribution of orientations of two hydrogens allowed for water to be considered FCW. Defaults to 17. xi (float, optional): Xi value for OPTICS clustering for FCW. Don't touch this unless you know what you are doing. Defaults to 0.03. njobs (int, optional): how many cpu cores to use for clustering. Defaults to 1. verbose (int, optional): verbosity of output. Defaults to 0. debugH (int, optional): debug level for orientations. Defaults to 0. plotreach (bool, optional): weather to plot the reachability plot for OPTICS when debugging. Defaults to False. Returns: list: returns a list containing two orientations of hydrogens and water classification string "FCW", if not FCW returns empty list """ # number of elements in oxygen cluster neioc = int(len(orientations) / 2) # for debugging counts = np.asarray([0]) values = np.asarray([0]) angs12 = 0 angs21 = 0 biggest = 0 secondbiggest = 0 labels = 0 cc = 0 ss = "" # conserved hydrogens fully_conserved = [] # Kmeans clustering prepareation - for conserved water analysis only kmeans = KMeans(n_clusters=2, n_init=10) # fit kmeans clustering kmeans.fit(orientations) if len(kmeans.cluster_centers_) == 2: Kcv1 = kmeans.cluster_centers_[0] Kcv2 = kmeans.cluster_centers_[1] # Calculate angles between centers of clusters that # represent hydrogen orientation ang = ( 360.0 / (2.0 * np.pi) * np.arccos( np.clip( np.dot( Kcv1 / np.linalg.norm(Kcv1), Kcv2 / np.linalg.norm(Kcv2), ), -1.0, 1.0, ) ) ) # check if angle between cluster centres is about right and # check if spread (variance) of orientations is sufficiently low if ( ang < kmeans_ang_cutoff and kmeans.inertia_ / len(orientations) < kmeans_inertia_cutoff ): # Perform OPTICS clustering on hydrogen orientations; minsamp is # same for all water types, however here we force min_cluster size # to be given. msp = int(neioc * pct_size_buffer) msp = msp if msp >= 2 else 2 cc = OPTICS(min_samples=msp, xi=xi, n_jobs=njobs) cc.fit(orientations) labels = cc.labels_ # Calculate number of elements in each cluster (values, counts) = np.unique(labels[labels != -1], return_counts=True) # if number of optics clusters for hydrogen clustering is >0 if len(np.sort(np.unique(labels[labels != -1]))) == 2: # find two biggest clusters biggest = values[np.argsort(counts)[-1]] secondbiggest = values[np.argsort(counts)[-2]] # check if size of two biggest clusters are between # 0.85 and 1.15*sizeofOxygenCluster if ( len(orientations[labels == biggest]) <= neioc * (2 - pct_size_buffer) and len(orientations[labels == secondbiggest]) <= neioc * (2 - pct_size_buffer) and len(orientations[labels == biggest]) > neioc * pct_size_buffer and len(orientations[labels == secondbiggest]) > neioc * pct_size_buffer ): # calculate the average orientation of two biggest clusters avang1 = np.mean(orientations[labels == biggest], axis=0) avang2 = np.mean(orientations[labels == secondbiggest], axis=0) # calculate average angle between average orientation of # one cluster with elements(orientations) of the other cluster angs12 = ( np.arccos( np.dot(orientations[labels == secondbiggest], avang1.T) ) * 360.0 / (2.0 * np.pi) ) angs21 = ( np.arccos(np.dot(orientations[labels == biggest], avang2.T)) * 360.0 / (2.0 * np.pi) ) # check if average angles are low and that their # std. deviation is low if ( np.abs(np.mean(angs21) - np.mean(angs12)) < angdiff_cutoff and np.std(angs12) < angstd_cutoff and np.std(angs21) < angstd_cutoff ): # conserved if verbose > 0: print("conserved found") fully_conserved.append( __return_normalized_orientation_pair( orientations, labels, biggest, secondbiggest, "FCW", ) ) # Debug needs neoic,orientations,kmeans,ang,k,counts,biggest, # s econdbiggest,angs12,angs21,cc kmeans if verbose > 0 or debugH > 0: sk = ( f"Conserved - kmeans analysis:\n" f"angle - mean: {ang:.2f}(calced)<{kmeans_ang_cutoff:.2f};" f" inertia per H: {kmeans.inertia_ / len(orientations):.2f}" f"(calced)<{kmeans_inertia_cutoff:.2f}\n" ) if debugH == 2 or (debugH == 1 and len(fully_conserved) > 0): __plot3Dorients(111, kmeans.labels_, orientations, sk) # OPTICS if debugH > 0 or verbose > 0: ss = ( f"Fully Conserved - OPTICS analysis;depends on pct size" f" buffer:{pct_size_buffer:.2f}\n" f"cluster sizes: {counts}; clust labels:{values};biggest: {biggest}" f"; secondbiggest: {secondbiggest}\n" f"clusters size: {neioc * pct_size_buffer:.2f}<" f"{len(orientations[labels == biggest])}," f"{len(orientations[labels == secondbiggest])}" f"(calced)<{neioc * (2 - pct_size_buffer):.2f}\n" f"angle mean: {np.abs(np.mean(angs21) - np.mean(angs12)):.2f}(calced)" f"<{angdiff_cutoff:.2f}); 12 std : {np.std(angs12):.2f};21 std: " f"{np.std(angs21):.2f} (both <{angstd_cutoff:.2f})\n" ) # Printing if verbose == 2 or (verbose == 1 and len(fully_conserved) > 0): print(sk + ss) # Debug plots __hydrogen_orient_plots( labels, orientations, cc, ss, ( f"Reachability of optics plot\n" f"minsamples={int(neioc * pct_size_buffer)}; xi={xi}" ), len(fully_conserved) > 0, debugH, plotreach, ) # End of function return fully_conserved
[docs] def find_half_conserved_orientations( orientations: np.ndarray, pct_size_buffer: float = 0.85, min_samp_data_size_pct: float = 0.35, angdiff_cutoff: float = 15, angstd_cutoff: float = 17.0, xi: float = 0.01, njobs: int = 1, verbose: int = 0, debugH: int = 0, plotreach: bool = False, ) -> list: """Checks if given orientations belong to HCW. Checks if given oxygen cluster can be considered as a half conserved water based on hydrogen orientations. Half conserved water is one which has one well defined hydrogen orientation (ie one strongly hydrogen bonded hydrogen). To check if water is half conserved, one calculates OPTICS clustering of hydrogen orientations. One then loops over clusters in an attempt to find a hydrogen orientation cluster which is the size of oxygen cluster and weather the angle between that cluster with all other orientations is of right angle and if spread of orientations is sufficiently low. Args: orientations (np.ndarray): array of hydrogen orientations in space pct_size_buffer (float, optional): Minimum allowed size of the hydrogen orientation cluster. Defaults to 0.85. min_samp_data_size_pct (float, optional): Minimum samples to choose for OPTICS clustering as percentage of number of water molecules considered for HCW and WCW. Defaults to 0.15. angdiff_cutoff (float, optional): Maximum standard deviation of angle allowed for HCW to be considered correct water angle. Defaults to 15. angstd_cutoff (float, optional): Maximum standard deviation cutoff for WCW angles to be considered correct water angles. Defaults to 17. xi (float, optional): Xi value for OPTICS clustering for HCW. Don't touch this unless you know what you are doing. Defaults to 0.01. njobs (int, optional): how many cpu cores to use for clustering. Defaults to 1. verbose (int, optional): verbosity of output. Defaults to 0. debugH (int, optional): debug level for orientations. Defaults to 0. plotreach (bool, optional): weather to plot the reachability plot for OPTICS when debugging. Defaults to False. Returns: list: returns a list containing two orientations of hydrogens and water classification string "HCW", if not HCW returns an empty list """ # number of elements in oxygen cluster neioc = int(len(orientations) / 2) half_conserved = [] sk = "" ss = "" # Optics clustering for hydrogen orientations - minsamp is same for all water types msp = int(neioc * min_samp_data_size_pct) msp = msp if msp >= 2 else 2 cc = OPTICS( min_samples=msp, xi=xi, n_jobs=njobs, ) cc.fit(orientations) # lebels without -1 labels = cc.labels_[cc.labels_ != -1] (values, counts) = np.unique(labels, return_counts=True) ind = np.argmax(counts[counts != -1]) bestcand = values[ind] labels = cc.labels_ # if there is more then 1 cluster if len(np.unique(labels)) > 1: N_elems = len(orientations[labels == bestcand]) # Debug if verbose > 0: sk += f"Cluster {bestcand} has {neioc * pct_size_buffer:.2f}<" sk += f"{N_elems}<{neioc * (2 - pct_size_buffer):.2f} elements\n" # check if number of elements in hydrogen orientation cluster is # between 0.80 and 1.20 elements of Oxygen cluster if ( N_elems < neioc * (2 - pct_size_buffer) and N_elems > neioc * pct_size_buffer ): # calculating average angle between average orientation of selected # cluster and all other orientations avang1 = np.mean(orientations[labels == bestcand], axis=0) angs1all = ( np.arccos(np.dot(orientations[labels != bestcand], avang1.T)) * 360.0 / (2.0 * np.pi) ) # Debug if verbose > 0 or debugH > 0: sk += "result angles: mean= " sk += f"{np.abs(np.mean(angs1all) - 104.5):.2f}(Calced)<" sk += f"{angdiff_cutoff:.2f}; std dev={np.std(angs1all):.2f}" sk += f"(Calced)<{angstd_cutoff:.2f}\n" # check if angle and its std deviation is satisfactory if ( np.abs(np.mean(angs1all) - 104.5) < angdiff_cutoff and np.std(angs1all) < angstd_cutoff ): for j in np.unique(labels[labels != bestcand]): # -1 has to be inclueded in case main cluster is ok and -1 # is everywhere, to produce a HCW still if verbose > 0: print("half conserved found", bestcand, j) half_conserved.append( __return_normalized_orientation_pair( orientations, labels, bestcand, j, "HCW" ) ) # Debug if verbose > 0 or debugH > 0: ss = ( f"Half Conserved - OPTICS analysis;xi={xi}" f"best: {bestcand}; N_hyd_cls (w/o -1): " f"{len(np.unique(labels[labels != -1]))};\n N elem in biggest:" f"{neioc * (2 - pct_size_buffer):.2f}>" f"{len(orientations[labels == bestcand])}>" f"{neioc * pct_size_buffer:.2f}\n" ) # Printing if verbose == 2 or (verbose == 1 and len(half_conserved) > 0): print(ss + sk) # Debug plots __hydrogen_orient_plots( labels, orientations, cc, ss + sk, ( f"Reachability of optics plot\n minsamples=" f"{int(neioc * min_samp_data_size_pct)}; xi={xi}" ), len(half_conserved) > 0, debugH, plotreach, ) # End of function return half_conserved
[docs] def find_weakly_conserved_orientations( orientations: np.ndarray, pct_size_buffer: float = 0.85, lower_bound_pct_buffer: float = 0.35, min_samp_data_size_pct: float = 0.15, pct_explained: float = 0.7, angdiff_cutoff: float = 15, angstd_cutoff: float = 20.0, xi: float = 0.01, njobs: int = 1, verbose: int = 0, debugH: int = 0, plotreach: bool = False, ) -> list: """Checks if given orientations belong to WCW. Checks if given oxygen cluster can be considered as a weakly conserved water based on hydrogen orientations. weakly conserved water is one which has no well defined hydrogen orientation (ie no strongly hydrogen bonded hydrogen) but still has distinct hydrogen orientational clusters. To check if water is weakly conserved, one calculates OPTICS clustering of hydrogen orientations. One then loops over clusters in an atempt to find a pair of hydrogen orientation clusters which is of the same size and weather the angle between the two clusters is of right angle and if spread of orientations is sufficiently low. Aditionally triplets are checked as well. Here we do the same check but we are looking at cluster one vs two other clusters combined. Args: orientations (np.ndarray): array of hydrogen orientations in space pct_size_buffer (float, optional): Minimum allowed size of the hydrogen orientation cluster. Defaults to 0.85. lower_bound_pct_buffer (float, optional): Minimum allowed size of the hydrogen orientation cluster. Defaults to 0.35. min_samp_data_size_pct (float, optional): Minimum samples to choose for OPTICS clustering as percentage of number of water molecules considered for HCW and WCW. Defaults to 0.15. pct_explained (float, optional): percentage of explained hydrogen orientations for water to be considered WCW. Defaults to 0.7. angdiff_cutoff (float, optional): Maximum standard deviation of angle allowed for WCW to be considered correct water angle. Defaults to 15. angstd_cutoff (float, optional): Maximum standard deviation cutoff for WCW angles to be considered correct water angles. Defaults to 20. xi (float, optional): Xi value for OPTICS clustering for WCW. Don't touch this unless you know what you are doing. Defaults to 0.01. njobs (int, optional): how many cpu cores to use for clustering. Defaults to 1. verbose (int, optional): verbosity of output. Defaults to 0. debugH (int, optional): debug level for orientations. Defaults to 0. plotreach (bool, optional): weather to plot the reachability plot for OPTICS when debuging. Defaults to False. Returns: list: returns a list containing two orientations of hydrogens and water classification string "WCW", if not WCW returns an empty list """ # number of elements in oxygen cluster neioc = int(len(orientations) / 2) # Optics clustering for hydrogen orientations - minsamp is same for all water types msp = int(neioc * min_samp_data_size_pct) msp = msp if msp >= 2 else 2 cc = OPTICS( min_samples=msp, xi=xi, n_jobs=njobs, ) cc.fit(orientations) weakly_conserved = [] # lebels without -1 labels = cc.labels_ (_, counts) = np.unique(labels, return_counts=True) weakly = [] sk = "" ss = "" # array of already used clusters used = [] # list of labels lbls = np.unique(labels) # if there is more then 1 cluster if len(np.unique(labels)) > 1: # Chekc over pairs of clusters # loop over OPTICS clusters for ii in lbls[lbls != -1]: # check if in used if ii in used: continue N_elems = len(orientations[labels == ii]) # Debug if verbose > 0 or debugH > 0: sk += f"Cluster {ii} has {neioc * lower_bound_pct_buffer:.2f}<" sk += f"{N_elems}<{neioc * (2 - pct_size_buffer):.2f} elements\n" # check if size of hydorgen orientation cluster is between 1.20 # and lower_bound_pct_buffer times number of elements in oxygen cluster if ( N_elems < neioc * (2 - pct_size_buffer) and N_elems > neioc * lower_bound_pct_buffer ): avang1 = np.mean(orientations[labels == ii], axis=0) # loop over clusters but not ii lblsni = lbls[lbls != ii] for jj in lblsni: # check if already used if ii in used: break if jj in used: continue # calculate average angle between average orientation of ii # and all orientations in cluster jj angs1j = ( np.arccos(np.dot(orientations[labels == jj], avang1.T)) * 360.0 / (2.0 * np.pi) ) N_elems_jj = len(orientations[labels == jj]) # Debug if verbose > 0 or debugH > 0: maxcomp = np.max([N_elems, N_elems_jj]) calcedcomp = (1 - pct_size_buffer) * maxcomp sk += f"cluster combo:{ii} & {jj}size:{N_elems}," sk += f"{neioc * lower_bound_pct_buffer:.2f}<{N_elems_jj}<" sk += f"{neioc * (2 - pct_size_buffer):.2f}\n" sk += f"size comparison {np.abs(N_elems - N_elems_jj)}" sk += f"(calced) < {calcedcomp} \n" sk += f"ang diff={np.abs(np.mean(angs1j) - 104.5):.2f}" sk += f"(calced)<{angdiff_cutoff:.2f},std dev:" sk += f"{angstd_cutoff:.2f}>{np.std(angs1j):.2f}(calced)\n" # check if size of new cluster and check if size of clusters # is about equal if ( N_elems_jj > neioc * (2 - pct_size_buffer) or N_elems_jj < neioc * lower_bound_pct_buffer or np.abs(N_elems - N_elems_jj) > (1 - pct_size_buffer) * np.max([N_elems, N_elems_jj]) ): continue # check if angles and std devs are good if ( np.abs(np.mean(angs1j) - 104.5) < angdiff_cutoff and np.std(angs1j) < angstd_cutoff and len(orientations[labels == jj]) > neioc * lower_bound_pct_buffer and len(orientations[labels == jj]) < neioc * (2 - pct_size_buffer) ): used.append(ii) used.append(jj) weakly.append([ii, jj]) break # check for triplets if cluster ii has same size as clusters jj+kk # and satisfactory angles # loop over OPTICS clusters for ii in lbls[lbls != -1]: # check if in used if ii in used: continue N_elems = len(orientations[labels == ii]) # average orientation of ii avang1 = np.mean(orientations[labels == ii], axis=0) # loop over clusters but not ii lblsni = lbls[lbls != ii] for jj in lblsni: # check if already used if ii in used: break if jj in used: continue N_elems_jj = len(orientations[labels == jj]) avangj = np.mean(orientations[labels == jj], axis=0) # loop over clusters but not ii or jj lblsnij = lblsni[lblsni != jj] for kk in lblsnij: if ii in used: break if jj in used: break if kk in used: continue N_elems_kk = len(orientations[labels == kk]) avangk = np.mean(orientations[labels == kk], axis=0) # calculate average angle between average orientation of ii # and all orientations in cluster jj and kk angs1jk = ( np.arccos( np.dot( orientations[(labels == jj) | (labels == kk)], avang1.T, ) ) * 360.0 / (2.0 * np.pi) ) angs1j = ( np.arccos( np.dot( orientations[(labels == jj)], avang1.T, ) ) * 360.0 / (2.0 * np.pi) ) angs1k = ( np.arccos( np.dot( orientations[(labels == kk)], avang1.T, ) ) * 360.0 / (2.0 * np.pi) ) angsjk = ( np.arccos( np.dot( orientations[(labels == kk)], avangj.T, ) ) * 360.0 / (2.0 * np.pi) ) angskj = ( np.arccos( np.dot( orientations[(labels == jj)], avangk.T, ) ) * 360.0 / (2.0 * np.pi) ) # Debug if verbose > 0 or debugH > 0: maxprint = np.max([N_elems, N_elems_jj + N_elems_kk]) sizeingprint = (1 - pct_size_buffer) * maxprint sk += f"cluster combo:{ii} & {jj} & {kk} size:" sk += f"{N_elems},{N_elems_jj}, {N_elems_kk} \n" sk += "size check " sk += f"{np.abs(N_elems - (N_elems_jj + N_elems_kk))}" sk += f"(calced)< {sizeingprint}\n" sk += f"ang diff={np.abs(np.mean(angs1jk) - 104.5):.2f}" sk += f"(calced)<{angdiff_cutoff:.2f},std dev:" sk += f"{angstd_cutoff:.2f}>{np.std(angs1jk):.2f}" sk += "(calced)\n" sk += f"ij {ii},{jj}, {np.abs(np.mean(angs1j) - 104.5)}," sk += f"<,angdiff_cutoff,and std {np.std(angs1j)}<" sk += f"{angstd_cutoff} \n" sk += f"ik {ii},{kk}, {np.abs(np.mean(angs1k) - 104.5)}," sk += f"<,angdiff_cutoff,and std {np.std(angs1k)}<" sk += f"{angstd_cutoff} \n" sk += f"kj {kk},{jj}, {np.abs(np.mean(angskj) - 104.5)}," sk += f"<,angdiff_cutoff,and std {np.std(angskj)}<" sk += f"{angstd_cutoff} \n" sk += f"jk {jj},{kk}, {np.abs(np.mean(angsjk) - 104.5)}," sk += f"<,angdiff_cutoff,and std {np.std(angsjk)}<" sk += f"{angstd_cutoff} \n" # check if size of clusters is about equal ii==jj+kk # with angle combination if ( np.abs(N_elems - (N_elems_jj + N_elems_kk)) < (1 - pct_size_buffer) * np.max([N_elems, N_elems_jj + N_elems_kk]) and ( np.abs(np.mean(angs1jk) - 104.5) < angdiff_cutoff and np.std(angs1jk) < angstd_cutoff ) ) or ( ( np.abs(np.mean(angs1j) - 104.5) < angdiff_cutoff and np.std(angs1j) < angstd_cutoff ) and ( np.abs(np.mean(angs1k) - 104.5) < angdiff_cutoff and np.std(angs1k) < angstd_cutoff ) and ( np.abs(np.mean(angsjk) - 104.5) < angdiff_cutoff and np.std(angsjk) < angstd_cutoff ) and ( np.abs(np.mean(angskj) - 104.5) < angdiff_cutoff and np.std(angskj) < angstd_cutoff ) ): weakly.append([ii, jj]) weakly.append([ii, kk]) used.append(ii) used.append(jj) used.append(kk) break # check if used ones account for pct_explained percentage of orientations total = 0 for i in used: total += len(orientations[labels == i]) if verbose > 1: print("total length=", total, ">", 2 * neioc * pct_explained) for ws in weakly: print("weakly combo", ws[0], ws[1]) if total > 2 * neioc * pct_explained: if verbose > 0: print("weakly found") for ws in weakly: weakly_conserved.append( __return_normalized_orientation_pair( orientations, labels, ws[0], ws[1], "WCW" ) ) # or if angle beteween a larger set of clusters is water angle for all of # them - this means that this is circular triplet or multiplet else: used = [] weakly = [] # loop over OPTICS clusters for ii in lbls: this = [] # check if in used if ii in used: continue # average orientation of ii avangi = np.mean(orientations[labels == ii], axis=0) # loop over clusters but not ii lblsni = lbls[lbls != ii] for jj in lblsni: # check if already used if jj in used: continue avangj = np.mean(orientations[labels == jj], axis=0) # loop over clusters but not ii or jj lblsnij = lblsni[lblsni != jj] angsij = ( np.arccos( np.dot( orientations[(labels == jj)], avangi.T, ) ) * 360.0 / (2.0 * np.pi) ) angsji = ( np.arccos( np.dot( orientations[(labels == ii)], avangj.T, ) ) * 360.0 / (2.0 * np.pi) ) if verbose > 0 or debugH > 0: sk += f"ij,{ii},{jj}, angs and std ij: " sk += f"{np.abs(np.mean(angsij) - 104.5)}, {np.std(angsij)};" sk += f"ji: {np.abs(np.mean(angsji) - 104.5)}," sk += f" {np.std(angsji)} \n" if ( np.abs(np.mean(angsij) - 104.5) < angdiff_cutoff and np.std(angsij) < angstd_cutoff ) and ( np.abs(np.mean(angsji) - 104.5) < angdiff_cutoff and np.std(angsji) < angstd_cutoff ): go = True for kk in this: avangk = np.mean(orientations[labels == kk], axis=0) angsjk = ( np.arccos( np.dot( orientations[(labels == kk)], avangj.T, ) ) * 360.0 / (2.0 * np.pi) ) angskj = ( np.arccos( np.dot( orientations[(labels == jj)], avangk.T, ) ) * 360.0 / (2.0 * np.pi) ) if verbose > 0 or debugH > 0: sk += f"jk,{jj},{kk}, angs and std jk:" sk += f"{np.abs(np.mean(angsjk) - 104.5)}, " sk += f"{np.std(angsjk)}; " sk += f"kj: {np.abs(np.mean(angskj) - 104.5)}, " sk += f"{np.std(angskj)} \n" if ( np.abs(np.mean(angskj) - 104.5) > angdiff_cutoff and np.std(angskj) > angstd_cutoff ) and ( np.abs(np.mean(angsjk) - 104.5) > angdiff_cutoff and np.std(angsjk) > angstd_cutoff ): go = False if go: weakly.append([ii, jj]) used.append(ii) used.append(jj) total = 0 for i in used: total += len(orientations[labels == i]) if verbose > 2: print("total length=", total, ">", 2 * neioc * pct_explained) for ws in weakly: print("weakly combo", ws[0], ws[1]) if total > 2 * neioc * pct_explained: if verbose > 0: print("weakly found") for ws in weakly: weakly_conserved.append( __return_normalized_orientation_pair( orientations, labels, ws[0], ws[1], "WCW" ) ) # Debug if verbose > 0 or debugH > 0: ss = ( f"weakly Conserved - OPTICS analysis;xi={xi}\n" f"Number of hydrogen clusters : {len(np.unique(labels))};\n" f"number of elements : {counts}; range needed for best cluster:" f"(depends on numbpct) {neioc * (2 - pct_size_buffer):.2f}," f"{neioc * lower_bound_pct_buffer:.2f}\n" ) # Debug Printing if verbose == 2 or (verbose == 1 and len(weakly_conserved) > 0): print(ss + sk) # Debug plots __hydrogen_orient_plots( labels, orientations, cc, ss, ( f"Reachability of optics plot\n" f"minsamples={int(neioc * min_samp_data_size_pct)}; xi={xi}" ), len(weakly_conserved) > 0, debugH, plotreach, ) return weakly_conserved
def __hydrogen_orient_plots( labels, orientations, cc, ss, rtit, conserved, debugH, plotreach: bool ) -> None: """Collection of plots for hydrogen orientation. For debuging purposes. Not ment for direct usage. """ if debugH == 2 or (debugH == 1 and conserved): if plotreach: fig = __plot3Dorients(111, labels, orientations, ss) else: fig = __plot3Dorients(121, labels, orientations, ss) if plotreach: __plotreachability(122, orientations, cc, fig=fig, tit=rtit) def __plot3Dorients(subplot, labels: int, orientations: np.ndarray, tip: str) -> Figure: """Function for plotting 3D orientations. For debuging only. """ if plt is None: msg = "install matplotlib" raise Exception(msg) fig = plt.figure() if isinstance(labels, int): return fig ax = fig.add_subplot(subplot, projection="3d") ax.set_title(tip) for j in np.unique(labels): jaba = orientations[labels == j] ax.scatter( jaba[:, 0], jaba[:, 1], jaba[:, 2], label=f"{j} ({len(labels[labels == j])})", ) if j > -1: ax.quiver( 0, 0, 0, np.mean(jaba[:, 0]), np.mean(jaba[:, 1]), np.mean(jaba[:, 2]), color="gray", arrow_length_ratio=0.0, linewidths=5, ) ax.scatter(0, 0, 0, c="crimson", s=1000) ax.legend() ax.grid(False) ax.axis("off") ax.set_aspect("equal") ax.autoscale(tight=True) ax.dist = 6 return fig def __plotreachability( subplot, orientations, cc, fig: Figure | None = None, tit=None ) -> Figure: """Function for plotting reachability. For debuging purposes only. """ if plt is None: msg = "install matplotlib" raise Exception(msg) if fig is None: fig: Figure = plt.figure() if not isinstance(cc, OPTICS): return fig lblls = cc.labels_[cc.ordering_] labels = cc.labels_ reachability = cc.reachability_[cc.ordering_] ax2 = fig.add_subplot(subplot) fig.gca().set_prop_cycle(None) space = np.arange(len(orientations)) ax2.plot(space, reachability) if tit is not None: ax2.set_title(tit) for clst in np.unique(lblls): if clst == -1: label = np.mean(np.ma.masked_invalid(cc.reachability_[labels == clst])) ax2.plot( space[lblls == clst], reachability[lblls == clst], label=f"{clst} ({len(space[lblls == clst])}), avg reach={label}", color="blue", ) else: label = np.mean(np.ma.masked_invalid(cc.reachability_[labels == clst])) ax2.plot( space[lblls == clst], reachability[lblls == clst], label=f"{clst} ({len(space[lblls == clst])}), avg reach={label}", ) ax2.legend() return fig def __return_normalized_orientation_pair( orientations: np.ndarray, labels: np.ndarray, i: int, j: int, typel: str, ) -> list[np.ndarray | str]: """Helper function for normalizing orientations. Not ment for general usage. """ v1 = np.mean(orientations[labels == i], axis=0) v2 = np.mean(orientations[labels == j], axis=0) return [ v1 / np.linalg.norm(v1) * np.linalg.norm(orientations[0]), v2 / np.linalg.norm(v2) * np.linalg.norm(orientations[0]), typel, ]