from __future__ import annotations
from typing import TYPE_CHECKING
try:
import matplotlib.pyplot as plt
except ImportError:
plt = None
if TYPE_CHECKING:
try:
from matplotlib.axes import Axes
from matplotlib.figure import Figure
except ImportError:
Axes, Figure = None, None
import numpy as np
from sklearn.cluster import OPTICS, KMeans
[docs]
def hydrogen_orientation_analysis(
orientations: np.ndarray,
pct_size_buffer: float = 0.85,
kmeans_ang_cutoff: float = 120,
kmeans_inertia_cutoff: float = 0.4,
FCW_angdiff_cutoff: float = 5,
FCW_angstd_cutoff: float = 17,
min_samp_data_size_pct: float = 0.15,
nonFCW_angdiff_cutoff: float = 15,
HCW_angstd_cutoff: float = 17,
WCW_angstd_cutoff: float = 20,
weakly_explained: float = 0.7,
xiFCW: tuple[float] | list[float] = (0.03,),
xiHCW: tuple[float] | list[float] = (0.05, 0.01),
xiWCW: tuple[float] | list[float] = (0.05, 0.001),
njobs: int = 1,
verbose: int = 0,
debugH: int = 0,
plotreach: bool = False,
which: tuple[str] | list[str] = ("FCW", "HCW", "WCW"),
normalize_orientations: bool = True,
) -> list:
"""Determines if the water cluster is conserved and of what type.
High level function that does hydrogen orientation analysis. Checks
if the water cluster belongs into one of the following groups by
analyzing hydrogen orientations:
- FCW (Fully Conserved Water): hydrogens are strongly oriented in
two directions with angle of 104.5
- HCW (Half Conserved Water): one set (cluster) of hydrogens is
oriented in certain directions and other are spread into different
orientations with angle of 104.5
- WCW (Weakly Conserved Water): several orientation combinations
exist with satisfying water angles
See :cite:`conservedwatersearch2022` for more information on water
classification :ref:`conservedwaters_theory_background_methods`.
If orientations don't satisfy the criteria for any of the waters, an
empty list is returned.
Args:
orientations (np.ndarray): array of hydrogen
orientations in space
pct_size_buffer (float, optional): Minimum allowed size of the
hydrogen orientation cluster. Defaults to 0.85.
kmeans_ang_cutoff (float, optional): Maximum value of angle (in
deg) allowed for FCW in kmeans clustering to be considered
correct water angle. Defaults to 120.
kmeans_inertia_cutoff (float, optional): upper limit allowed on
kmeans inertia (measure of spread of data in a cluster).
Defaults to 0.4.
FCW_angdiff_cutoff (float, optional): Maximum value of angle (in
deg) allowed for FCW in OPTICS clustering to be
considered correct water angle. Defaults to 5.
FCW_angstd_cutoff (float, optional): Maximal standard deviation
of angle distribution of orientations of two hydrogens
allowed for water to be considered FCW. Defaults to 17.
min_samp_data_size_pct (float, optional): Minimum samples to
choose for OPTICS clustering as percentage of
number of water molecules considered for HCW and WCW.
Defaults to 0.15.
nonFCW_angdiff_cutoff (float, optional): Maximum standard
deviation of angle allowed for HCW and WCW to be considered
correct water angle. Defaults to 15.
HCW_angstd_cutoff (float, optional): Maximum standard deviation
cutoff for WCW angles to be considered correct water angles.
Defaults to 17.
WCW_angstd_cutoff (float, optional): Maximum standard deviation
cutoff for WCW angles to be considered correct water angles.
Defaults to 20.
weakly_explained (float, optional): percentage of explained
hydrogen orientations for water to be considered WCW.
Defaults to 0.7.
xiFCW (tuple, optional): Xi value for OPTICS clustering for FCW. Don't
touch this unless you know what you are doing. Defaults to (0.03).
xiHCW (tuple, optional): Xi value for OPTICS clustering for HCW. Don't
touch this unless you know what you are doing.
Defaults to (0.05, 0.01).
xiWCW (tuple, optional): Xi value for OPTICS clustering for WCW. Don't
touch this unless you know what you are doing.
Defaults to (0.05, 0.001).
njobs (int, optional): how many cpu cores to use for clustering.
Defaults to 1.
verbose (int, optional): verbosity of output. Defaults to 0.
debugH (int, optional): debug level for orientations. Defaults to 0.
plotreach (bool, optional): weather to plot the reachability
plot for OPTICS when debuging. Defaults to False.
which (tuple, optional): tuple of strings denoting which water types to
search for. Allowed is any combination of FCW (fully
conserved waters), HCW (half conserved waters) and WCW
(weakly conserved waters). Defaults to ["FCW", "HCW",
"WCW"].
normalize_orientations (bool, optional): weather to normalize
the orientation vectors to unit distance. Defaults to True.
Returns:
list: returns a list containing two orientations of hydrogens
and water classification string ("FCW", "HCW", "WCW"), if
not conserved returns an empty list
"""
orientations = np.array(orientations)
# check length of orientations - it has to be bigger then 1 and even
orishape = orientations.shape
if len(orishape) != 2:
value_error_string = "Orientations have to be a 2D array"
raise ValueError(value_error_string)
Hnum = orishape[0]
dimnum = orishape[1]
if dimnum != 3:
value_error_string = "Orientations must be vectors of dimension 3"
raise ValueError(value_error_string)
if Hnum < 2 or Hnum % 2 != 0:
value_error_string = (
"Number of orientations must be even!"
" Each water molecule has 2 hydrogen atoms!"
)
raise ValueError(value_error_string)
if normalize_orientations:
orientations = orientations / np.linalg.norm(
orientations, axis=1, keepdims=True
)
# For Debug
np.set_printoptions(precision=4)
# Check if water is conserved
if "FCW" in which:
for xi in xiFCW:
conserved = find_fully_conserved_orientations(
orientations,
pct_size_buffer=pct_size_buffer,
kmeans_ang_cutoff=kmeans_ang_cutoff,
kmeans_inertia_cutoff=kmeans_inertia_cutoff,
angdiff_cutoff=FCW_angdiff_cutoff,
angstd_cutoff=FCW_angstd_cutoff,
xi=xi,
njobs=njobs,
verbose=verbose,
debugH=debugH,
plotreach=plotreach,
)
if len(conserved) > 0:
return conserved
if "HCW" in which:
for xi in xiHCW:
half_conserved = find_half_conserved_orientations(
orientations,
pct_size_buffer=pct_size_buffer,
min_samp_data_size_pct=min_samp_data_size_pct,
angdiff_cutoff=nonFCW_angdiff_cutoff,
angstd_cutoff=HCW_angstd_cutoff,
xi=xi,
njobs=njobs,
verbose=verbose,
debugH=debugH,
plotreach=plotreach,
)
if len(half_conserved) > 0:
return half_conserved
if "WCW" in which:
for xi in xiWCW:
semi_conserved = find_weakly_conserved_orientations(
orientations,
pct_size_buffer=pct_size_buffer,
min_samp_data_size_pct=min_samp_data_size_pct,
pct_explained=weakly_explained,
angdiff_cutoff=nonFCW_angdiff_cutoff,
angstd_cutoff=WCW_angstd_cutoff,
xi=xi,
njobs=njobs,
verbose=verbose,
debugH=debugH,
plotreach=plotreach,
)
if len(semi_conserved):
return semi_conserved
return []
# I want this to return an array with hydrogen orientations combinations
[docs]
def find_fully_conserved_orientations(
orientations: np.ndarray,
pct_size_buffer: float = 0.85,
kmeans_ang_cutoff: float = 120,
kmeans_inertia_cutoff: float = 0.4,
angdiff_cutoff: float = 5,
angstd_cutoff: float = 17.0,
xi: float = 0.03,
njobs: int = 1,
verbose: int = 0,
debugH: int = 0,
plotreach: bool = False,
) -> list:
"""Check if orientations belong to FCW.
Checks if given oxygen cluster can be considered as a fully
conserved water based on hydrogen orientations. Fully conserved
water is one which has well defined hydrogen orientations in two
distinctive groups (ie strongly hydrogen bonded for both hydrogens).
To check if water is conserved, one first checks if k means
clustering of hydrogen orientations gives two distinctive clusters
with low inertia and required angle between the clusters. Afterwards
more rigorous check is carried out with OPTICS clustering where
again the spread of orientations and angle is considered.
Args:
orientations (np.ndarray): array of hydrogen
orientations in space
pct_size_buffer (float, optional): Minimum allowed size of the
hydrogen orientation cluster. Defaults to 0.85.
kmeans_ang_cutoff (float, optional): Maximum value of angle (in
deg) allowed for FCW in kmeans clustering to be considered
correct water angle. Defaults to 120.
kmeans_inertia_cutoff (float, optional): upper limit allowed on
kmeans inertia (measure of spread of data in a cluster).
Defaults to 0.4.
angdiff_cutoff (float, optional): Maximum value of angle (in
deg) allowed for FCW in OPTICS clustering to be
considered correct water angle. Defaults to 5.
angstd_cutoff (float, optional): Maximal standard deviation
of angle distribution of orientations of two hydrogens
allowed for water to be considered FCW. Defaults to 17.
xi (float, optional): Xi value for OPTICS clustering for FCW. Don't
touch this unless you know what you are doing.
Defaults to 0.03.
njobs (int, optional): how many cpu cores to use for clustering.
Defaults to 1.
verbose (int, optional): verbosity of output. Defaults to 0.
debugH (int, optional): debug level for orientations. Defaults to 0.
plotreach (bool, optional): weather to plot the reachability
plot for OPTICS when debugging. Defaults to False.
Returns:
list: returns a list containing two orientations of hydrogens
and water classification string "FCW", if not FCW returns
empty list
"""
# number of elements in oxygen cluster
neioc = int(len(orientations) / 2)
# for debugging
counts = np.asarray([0])
values = np.asarray([0])
angs12 = 0
angs21 = 0
biggest = 0
secondbiggest = 0
labels = 0
cc = 0
ss = ""
# conserved hydrogens
fully_conserved = []
# Kmeans clustering prepareation - for conserved water analysis only
kmeans = KMeans(n_clusters=2, n_init=10)
# fit kmeans clustering
kmeans.fit(orientations)
if len(kmeans.cluster_centers_) == 2:
Kcv1 = kmeans.cluster_centers_[0]
Kcv2 = kmeans.cluster_centers_[1]
# Calculate angles between centers of clusters that
# represent hydrogen orientation
ang = (
360.0
/ (2.0 * np.pi)
* np.arccos(
np.clip(
np.dot(
Kcv1 / np.linalg.norm(Kcv1),
Kcv2 / np.linalg.norm(Kcv2),
),
-1.0,
1.0,
)
)
)
# check if angle between cluster centres is about right and
# check if spread (variance) of orientations is sufficiently low
if (
ang < kmeans_ang_cutoff
and kmeans.inertia_ / len(orientations) < kmeans_inertia_cutoff
):
# Perform OPTICS clustering on hydrogen orientations; minsamp is
# same for all water types, however here we force min_cluster size
# to be given.
msp = int(neioc * pct_size_buffer)
msp = msp if msp >= 2 else 2
cc = OPTICS(min_samples=msp, xi=xi, n_jobs=njobs)
cc.fit(orientations)
labels = cc.labels_
# Calculate number of elements in each cluster
(values, counts) = np.unique(labels[labels != -1], return_counts=True)
# if number of optics clusters for hydrogen clustering is >0
if len(np.sort(np.unique(labels[labels != -1]))) == 2:
# find two biggest clusters
biggest = values[np.argsort(counts)[-1]]
secondbiggest = values[np.argsort(counts)[-2]]
# check if size of two biggest clusters are between
# 0.85 and 1.15*sizeofOxygenCluster
if (
len(orientations[labels == biggest])
<= neioc * (2 - pct_size_buffer)
and len(orientations[labels == secondbiggest])
<= neioc * (2 - pct_size_buffer)
and len(orientations[labels == biggest]) > neioc * pct_size_buffer
and len(orientations[labels == secondbiggest])
> neioc * pct_size_buffer
):
# calculate the average orientation of two biggest clusters
avang1 = np.mean(orientations[labels == biggest], axis=0)
avang2 = np.mean(orientations[labels == secondbiggest], axis=0)
# calculate average angle between average orientation of
# one cluster with elements(orientations) of the other cluster
angs12 = (
np.arccos(
np.dot(orientations[labels == secondbiggest], avang1.T)
)
* 360.0
/ (2.0 * np.pi)
)
angs21 = (
np.arccos(np.dot(orientations[labels == biggest], avang2.T))
* 360.0
/ (2.0 * np.pi)
)
# check if average angles are low and that their
# std. deviation is low
if (
np.abs(np.mean(angs21) - np.mean(angs12)) < angdiff_cutoff
and np.std(angs12) < angstd_cutoff
and np.std(angs21) < angstd_cutoff
):
# conserved
if verbose > 0:
print("conserved found")
fully_conserved.append(
__return_normalized_orientation_pair(
orientations,
labels,
biggest,
secondbiggest,
"FCW",
)
)
# Debug needs neoic,orientations,kmeans,ang,k,counts,biggest,
# s econdbiggest,angs12,angs21,cc kmeans
if verbose > 0 or debugH > 0:
sk = (
f"Conserved - kmeans analysis:\n"
f"angle - mean: {ang:.2f}(calced)<{kmeans_ang_cutoff:.2f};"
f" inertia per H: {kmeans.inertia_ / len(orientations):.2f}"
f"(calced)<{kmeans_inertia_cutoff:.2f}\n"
)
if debugH == 2 or (debugH == 1 and len(fully_conserved) > 0):
__plot3Dorients(111, kmeans.labels_, orientations, sk)
# OPTICS
if debugH > 0 or verbose > 0:
ss = (
f"Fully Conserved - OPTICS analysis;depends on pct size"
f" buffer:{pct_size_buffer:.2f}\n"
f"cluster sizes: {counts}; clust labels:{values};biggest: {biggest}"
f"; secondbiggest: {secondbiggest}\n"
f"clusters size: {neioc * pct_size_buffer:.2f}<"
f"{len(orientations[labels == biggest])},"
f"{len(orientations[labels == secondbiggest])}"
f"(calced)<{neioc * (2 - pct_size_buffer):.2f}\n"
f"angle mean: {np.abs(np.mean(angs21) - np.mean(angs12)):.2f}(calced)"
f"<{angdiff_cutoff:.2f}); 12 std : {np.std(angs12):.2f};21 std: "
f"{np.std(angs21):.2f} (both <{angstd_cutoff:.2f})\n"
)
# Printing
if verbose == 2 or (verbose == 1 and len(fully_conserved) > 0):
print(sk + ss)
# Debug plots
__hydrogen_orient_plots(
labels,
orientations,
cc,
ss,
(
f"Reachability of optics plot\n"
f"minsamples={int(neioc * pct_size_buffer)}; xi={xi}"
),
len(fully_conserved) > 0,
debugH,
plotreach,
)
# End of function
return fully_conserved
[docs]
def find_half_conserved_orientations(
orientations: np.ndarray,
pct_size_buffer: float = 0.85,
min_samp_data_size_pct: float = 0.35,
angdiff_cutoff: float = 15,
angstd_cutoff: float = 17.0,
xi: float = 0.01,
njobs: int = 1,
verbose: int = 0,
debugH: int = 0,
plotreach: bool = False,
) -> list:
"""Checks if given orientations belong to HCW.
Checks if given oxygen cluster can be considered as a half conserved
water based on hydrogen orientations. Half conserved water is one
which has one well defined hydrogen orientation (ie one strongly
hydrogen bonded hydrogen). To check if water is half conserved, one
calculates OPTICS clustering of hydrogen orientations. One then
loops over clusters in an attempt to find a hydrogen orientation
cluster which is the size of oxygen cluster and weather the angle
between that cluster with all other orientations is of right angle
and if spread of orientations is sufficiently low.
Args:
orientations (np.ndarray): array of hydrogen
orientations in space
pct_size_buffer (float, optional): Minimum allowed size of the
hydrogen orientation cluster. Defaults to 0.85.
min_samp_data_size_pct (float, optional): Minimum samples to
choose for OPTICS clustering as percentage of
number of water molecules considered for HCW and WCW.
Defaults to 0.15.
angdiff_cutoff (float, optional): Maximum standard
deviation of angle allowed for HCW to be considered
correct water angle. Defaults to 15.
angstd_cutoff (float, optional): Maximum standard deviation
cutoff for WCW angles to be considered correct water angles.
Defaults to 17.
xi (float, optional): Xi value for OPTICS clustering for HCW. Don't
touch this unless you know what you are doing.
Defaults to 0.01.
njobs (int, optional): how many cpu cores to use for clustering.
Defaults to 1.
verbose (int, optional): verbosity of output. Defaults to 0.
debugH (int, optional): debug level for orientations. Defaults to 0.
plotreach (bool, optional): weather to plot the reachability
plot for OPTICS when debugging. Defaults to False.
Returns:
list: returns a list containing two orientations of hydrogens
and water classification string "HCW", if not HCW returns
an empty list
"""
# number of elements in oxygen cluster
neioc = int(len(orientations) / 2)
half_conserved = []
sk = ""
ss = ""
# Optics clustering for hydrogen orientations - minsamp is same for all water types
msp = int(neioc * min_samp_data_size_pct)
msp = msp if msp >= 2 else 2
cc = OPTICS(
min_samples=msp,
xi=xi,
n_jobs=njobs,
)
cc.fit(orientations)
# lebels without -1
labels = cc.labels_[cc.labels_ != -1]
(values, counts) = np.unique(labels, return_counts=True)
ind = np.argmax(counts[counts != -1])
bestcand = values[ind]
labels = cc.labels_
# if there is more then 1 cluster
if len(np.unique(labels)) > 1:
N_elems = len(orientations[labels == bestcand])
# Debug
if verbose > 0:
sk += f"Cluster {bestcand} has {neioc * pct_size_buffer:.2f}<"
sk += f"{N_elems}<{neioc * (2 - pct_size_buffer):.2f} elements\n"
# check if number of elements in hydrogen orientation cluster is
# between 0.80 and 1.20 elements of Oxygen cluster
if (
N_elems < neioc * (2 - pct_size_buffer)
and N_elems > neioc * pct_size_buffer
):
# calculating average angle between average orientation of selected
# cluster and all other orientations
avang1 = np.mean(orientations[labels == bestcand], axis=0)
angs1all = (
np.arccos(np.dot(orientations[labels != bestcand], avang1.T))
* 360.0
/ (2.0 * np.pi)
)
# Debug
if verbose > 0 or debugH > 0:
sk += "result angles: mean= "
sk += f"{np.abs(np.mean(angs1all) - 104.5):.2f}(Calced)<"
sk += f"{angdiff_cutoff:.2f}; std dev={np.std(angs1all):.2f}"
sk += f"(Calced)<{angstd_cutoff:.2f}\n"
# check if angle and its std deviation is satisfactory
if (
np.abs(np.mean(angs1all) - 104.5) < angdiff_cutoff
and np.std(angs1all) < angstd_cutoff
):
for j in np.unique(labels[labels != bestcand]):
# -1 has to be inclueded in case main cluster is ok and -1
# is everywhere, to produce a HCW still
if verbose > 0:
print("half conserved found", bestcand, j)
half_conserved.append(
__return_normalized_orientation_pair(
orientations, labels, bestcand, j, "HCW"
)
)
# Debug
if verbose > 0 or debugH > 0:
ss = (
f"Half Conserved - OPTICS analysis;xi={xi}"
f"best: {bestcand}; N_hyd_cls (w/o -1): "
f"{len(np.unique(labels[labels != -1]))};\n N elem in biggest:"
f"{neioc * (2 - pct_size_buffer):.2f}>"
f"{len(orientations[labels == bestcand])}>"
f"{neioc * pct_size_buffer:.2f}\n"
)
# Printing
if verbose == 2 or (verbose == 1 and len(half_conserved) > 0):
print(ss + sk)
# Debug plots
__hydrogen_orient_plots(
labels,
orientations,
cc,
ss + sk,
(
f"Reachability of optics plot\n minsamples="
f"{int(neioc * min_samp_data_size_pct)}; xi={xi}"
),
len(half_conserved) > 0,
debugH,
plotreach,
)
# End of function
return half_conserved
[docs]
def find_weakly_conserved_orientations(
orientations: np.ndarray,
pct_size_buffer: float = 0.85,
lower_bound_pct_buffer: float = 0.35,
min_samp_data_size_pct: float = 0.15,
pct_explained: float = 0.7,
angdiff_cutoff: float = 15,
angstd_cutoff: float = 20.0,
xi: float = 0.01,
njobs: int = 1,
verbose: int = 0,
debugH: int = 0,
plotreach: bool = False,
) -> list:
"""Checks if given orientations belong to WCW.
Checks if given oxygen cluster can be considered as a weakly
conserved water based on hydrogen orientations. weakly conserved
water is one which has no well defined hydrogen orientation (ie no
strongly hydrogen bonded hydrogen) but still has distinct hydrogen
orientational clusters. To check if water is weakly conserved, one
calculates OPTICS clustering of hydrogen orientations. One then
loops over clusters in an atempt to find a pair of hydrogen
orientation clusters which is of the same size and weather the angle
between the two clusters is of right angle and if spread of
orientations is sufficiently low. Aditionally triplets are checked
as well. Here we do the same check but we are looking at cluster one
vs two other clusters combined.
Args:
orientations (np.ndarray): array of hydrogen
orientations in space
pct_size_buffer (float, optional): Minimum allowed size of the
hydrogen orientation cluster. Defaults to 0.85.
lower_bound_pct_buffer (float, optional): Minimum allowed size of the
hydrogen orientation cluster. Defaults to 0.35.
min_samp_data_size_pct (float, optional): Minimum samples to
choose for OPTICS clustering as percentage of
number of water molecules considered for HCW and WCW.
Defaults to 0.15.
pct_explained (float, optional): percentage of explained
hydrogen orientations for water to be considered WCW.
Defaults to 0.7.
angdiff_cutoff (float, optional): Maximum standard
deviation of angle allowed for WCW to be considered
correct water angle. Defaults to 15.
angstd_cutoff (float, optional): Maximum standard deviation
cutoff for WCW angles to be considered correct water angles.
Defaults to 20.
xi (float, optional): Xi value for OPTICS clustering for WCW. Don't
touch this unless you know what you are doing.
Defaults to 0.01.
njobs (int, optional): how many cpu cores to use for clustering.
Defaults to 1.
verbose (int, optional): verbosity of output. Defaults to 0.
debugH (int, optional): debug level for orientations. Defaults to 0.
plotreach (bool, optional): weather to plot the reachability
plot for OPTICS when debuging. Defaults to False.
Returns:
list: returns a list containing two orientations of hydrogens
and water classification string "WCW", if not WCW returns
an empty list
"""
# number of elements in oxygen cluster
neioc = int(len(orientations) / 2)
# Optics clustering for hydrogen orientations - minsamp is same for all water types
msp = int(neioc * min_samp_data_size_pct)
msp = msp if msp >= 2 else 2
cc = OPTICS(
min_samples=msp,
xi=xi,
n_jobs=njobs,
)
cc.fit(orientations)
weakly_conserved = []
# lebels without -1
labels = cc.labels_
(_, counts) = np.unique(labels, return_counts=True)
weakly = []
sk = ""
ss = ""
# array of already used clusters
used = []
# list of labels
lbls = np.unique(labels)
# if there is more then 1 cluster
if len(np.unique(labels)) > 1:
# Chekc over pairs of clusters
# loop over OPTICS clusters
for ii in lbls[lbls != -1]:
# check if in used
if ii in used:
continue
N_elems = len(orientations[labels == ii])
# Debug
if verbose > 0 or debugH > 0:
sk += f"Cluster {ii} has {neioc * lower_bound_pct_buffer:.2f}<"
sk += f"{N_elems}<{neioc * (2 - pct_size_buffer):.2f} elements\n"
# check if size of hydorgen orientation cluster is between 1.20
# and lower_bound_pct_buffer times number of elements in oxygen cluster
if (
N_elems < neioc * (2 - pct_size_buffer)
and N_elems > neioc * lower_bound_pct_buffer
):
avang1 = np.mean(orientations[labels == ii], axis=0)
# loop over clusters but not ii
lblsni = lbls[lbls != ii]
for jj in lblsni:
# check if already used
if ii in used:
break
if jj in used:
continue
# calculate average angle between average orientation of ii
# and all orientations in cluster jj
angs1j = (
np.arccos(np.dot(orientations[labels == jj], avang1.T))
* 360.0
/ (2.0 * np.pi)
)
N_elems_jj = len(orientations[labels == jj])
# Debug
if verbose > 0 or debugH > 0:
maxcomp = np.max([N_elems, N_elems_jj])
calcedcomp = (1 - pct_size_buffer) * maxcomp
sk += f"cluster combo:{ii} & {jj}size:{N_elems},"
sk += f"{neioc * lower_bound_pct_buffer:.2f}<{N_elems_jj}<"
sk += f"{neioc * (2 - pct_size_buffer):.2f}\n"
sk += f"size comparison {np.abs(N_elems - N_elems_jj)}"
sk += f"(calced) < {calcedcomp} \n"
sk += f"ang diff={np.abs(np.mean(angs1j) - 104.5):.2f}"
sk += f"(calced)<{angdiff_cutoff:.2f},std dev:"
sk += f"{angstd_cutoff:.2f}>{np.std(angs1j):.2f}(calced)\n"
# check if size of new cluster and check if size of clusters
# is about equal
if (
N_elems_jj > neioc * (2 - pct_size_buffer)
or N_elems_jj < neioc * lower_bound_pct_buffer
or np.abs(N_elems - N_elems_jj)
> (1 - pct_size_buffer) * np.max([N_elems, N_elems_jj])
):
continue
# check if angles and std devs are good
if (
np.abs(np.mean(angs1j) - 104.5) < angdiff_cutoff
and np.std(angs1j) < angstd_cutoff
and len(orientations[labels == jj])
> neioc * lower_bound_pct_buffer
and len(orientations[labels == jj])
< neioc * (2 - pct_size_buffer)
):
used.append(ii)
used.append(jj)
weakly.append([ii, jj])
break
# check for triplets if cluster ii has same size as clusters jj+kk
# and satisfactory angles
# loop over OPTICS clusters
for ii in lbls[lbls != -1]:
# check if in used
if ii in used:
continue
N_elems = len(orientations[labels == ii])
# average orientation of ii
avang1 = np.mean(orientations[labels == ii], axis=0)
# loop over clusters but not ii
lblsni = lbls[lbls != ii]
for jj in lblsni:
# check if already used
if ii in used:
break
if jj in used:
continue
N_elems_jj = len(orientations[labels == jj])
avangj = np.mean(orientations[labels == jj], axis=0)
# loop over clusters but not ii or jj
lblsnij = lblsni[lblsni != jj]
for kk in lblsnij:
if ii in used:
break
if jj in used:
break
if kk in used:
continue
N_elems_kk = len(orientations[labels == kk])
avangk = np.mean(orientations[labels == kk], axis=0)
# calculate average angle between average orientation of ii
# and all orientations in cluster jj and kk
angs1jk = (
np.arccos(
np.dot(
orientations[(labels == jj) | (labels == kk)],
avang1.T,
)
)
* 360.0
/ (2.0 * np.pi)
)
angs1j = (
np.arccos(
np.dot(
orientations[(labels == jj)],
avang1.T,
)
)
* 360.0
/ (2.0 * np.pi)
)
angs1k = (
np.arccos(
np.dot(
orientations[(labels == kk)],
avang1.T,
)
)
* 360.0
/ (2.0 * np.pi)
)
angsjk = (
np.arccos(
np.dot(
orientations[(labels == kk)],
avangj.T,
)
)
* 360.0
/ (2.0 * np.pi)
)
angskj = (
np.arccos(
np.dot(
orientations[(labels == jj)],
avangk.T,
)
)
* 360.0
/ (2.0 * np.pi)
)
# Debug
if verbose > 0 or debugH > 0:
maxprint = np.max([N_elems, N_elems_jj + N_elems_kk])
sizeingprint = (1 - pct_size_buffer) * maxprint
sk += f"cluster combo:{ii} & {jj} & {kk} size:"
sk += f"{N_elems},{N_elems_jj}, {N_elems_kk} \n"
sk += "size check "
sk += f"{np.abs(N_elems - (N_elems_jj + N_elems_kk))}"
sk += f"(calced)< {sizeingprint}\n"
sk += f"ang diff={np.abs(np.mean(angs1jk) - 104.5):.2f}"
sk += f"(calced)<{angdiff_cutoff:.2f},std dev:"
sk += f"{angstd_cutoff:.2f}>{np.std(angs1jk):.2f}"
sk += "(calced)\n"
sk += f"ij {ii},{jj}, {np.abs(np.mean(angs1j) - 104.5)},"
sk += f"<,angdiff_cutoff,and std {np.std(angs1j)}<"
sk += f"{angstd_cutoff} \n"
sk += f"ik {ii},{kk}, {np.abs(np.mean(angs1k) - 104.5)},"
sk += f"<,angdiff_cutoff,and std {np.std(angs1k)}<"
sk += f"{angstd_cutoff} \n"
sk += f"kj {kk},{jj}, {np.abs(np.mean(angskj) - 104.5)},"
sk += f"<,angdiff_cutoff,and std {np.std(angskj)}<"
sk += f"{angstd_cutoff} \n"
sk += f"jk {jj},{kk}, {np.abs(np.mean(angsjk) - 104.5)},"
sk += f"<,angdiff_cutoff,and std {np.std(angsjk)}<"
sk += f"{angstd_cutoff} \n"
# check if size of clusters is about equal ii==jj+kk
# with angle combination
if (
np.abs(N_elems - (N_elems_jj + N_elems_kk))
< (1 - pct_size_buffer)
* np.max([N_elems, N_elems_jj + N_elems_kk])
and (
np.abs(np.mean(angs1jk) - 104.5) < angdiff_cutoff
and np.std(angs1jk) < angstd_cutoff
)
) or (
(
np.abs(np.mean(angs1j) - 104.5) < angdiff_cutoff
and np.std(angs1j) < angstd_cutoff
)
and (
np.abs(np.mean(angs1k) - 104.5) < angdiff_cutoff
and np.std(angs1k) < angstd_cutoff
)
and (
np.abs(np.mean(angsjk) - 104.5) < angdiff_cutoff
and np.std(angsjk) < angstd_cutoff
)
and (
np.abs(np.mean(angskj) - 104.5) < angdiff_cutoff
and np.std(angskj) < angstd_cutoff
)
):
weakly.append([ii, jj])
weakly.append([ii, kk])
used.append(ii)
used.append(jj)
used.append(kk)
break
# check if used ones account for pct_explained percentage of orientations
total = 0
for i in used:
total += len(orientations[labels == i])
if verbose > 1:
print("total length=", total, ">", 2 * neioc * pct_explained)
for ws in weakly:
print("weakly combo", ws[0], ws[1])
if total > 2 * neioc * pct_explained:
if verbose > 0:
print("weakly found")
for ws in weakly:
weakly_conserved.append(
__return_normalized_orientation_pair(
orientations, labels, ws[0], ws[1], "WCW"
)
)
# or if angle beteween a larger set of clusters is water angle for all of
# them - this means that this is circular triplet or multiplet
else:
used = []
weakly = []
# loop over OPTICS clusters
for ii in lbls:
this = []
# check if in used
if ii in used:
continue
# average orientation of ii
avangi = np.mean(orientations[labels == ii], axis=0)
# loop over clusters but not ii
lblsni = lbls[lbls != ii]
for jj in lblsni:
# check if already used
if jj in used:
continue
avangj = np.mean(orientations[labels == jj], axis=0)
# loop over clusters but not ii or jj
lblsnij = lblsni[lblsni != jj]
angsij = (
np.arccos(
np.dot(
orientations[(labels == jj)],
avangi.T,
)
)
* 360.0
/ (2.0 * np.pi)
)
angsji = (
np.arccos(
np.dot(
orientations[(labels == ii)],
avangj.T,
)
)
* 360.0
/ (2.0 * np.pi)
)
if verbose > 0 or debugH > 0:
sk += f"ij,{ii},{jj}, angs and std ij: "
sk += f"{np.abs(np.mean(angsij) - 104.5)}, {np.std(angsij)};"
sk += f"ji: {np.abs(np.mean(angsji) - 104.5)},"
sk += f" {np.std(angsji)} \n"
if (
np.abs(np.mean(angsij) - 104.5) < angdiff_cutoff
and np.std(angsij) < angstd_cutoff
) and (
np.abs(np.mean(angsji) - 104.5) < angdiff_cutoff
and np.std(angsji) < angstd_cutoff
):
go = True
for kk in this:
avangk = np.mean(orientations[labels == kk], axis=0)
angsjk = (
np.arccos(
np.dot(
orientations[(labels == kk)],
avangj.T,
)
)
* 360.0
/ (2.0 * np.pi)
)
angskj = (
np.arccos(
np.dot(
orientations[(labels == jj)],
avangk.T,
)
)
* 360.0
/ (2.0 * np.pi)
)
if verbose > 0 or debugH > 0:
sk += f"jk,{jj},{kk}, angs and std jk:"
sk += f"{np.abs(np.mean(angsjk) - 104.5)}, "
sk += f"{np.std(angsjk)}; "
sk += f"kj: {np.abs(np.mean(angskj) - 104.5)}, "
sk += f"{np.std(angskj)} \n"
if (
np.abs(np.mean(angskj) - 104.5) > angdiff_cutoff
and np.std(angskj) > angstd_cutoff
) and (
np.abs(np.mean(angsjk) - 104.5) > angdiff_cutoff
and np.std(angsjk) > angstd_cutoff
):
go = False
if go:
weakly.append([ii, jj])
used.append(ii)
used.append(jj)
total = 0
for i in used:
total += len(orientations[labels == i])
if verbose > 2:
print("total length=", total, ">", 2 * neioc * pct_explained)
for ws in weakly:
print("weakly combo", ws[0], ws[1])
if total > 2 * neioc * pct_explained:
if verbose > 0:
print("weakly found")
for ws in weakly:
weakly_conserved.append(
__return_normalized_orientation_pair(
orientations, labels, ws[0], ws[1], "WCW"
)
)
# Debug
if verbose > 0 or debugH > 0:
ss = (
f"weakly Conserved - OPTICS analysis;xi={xi}\n"
f"Number of hydrogen clusters : {len(np.unique(labels))};\n"
f"number of elements : {counts}; range needed for best cluster:"
f"(depends on numbpct) {neioc * (2 - pct_size_buffer):.2f},"
f"{neioc * lower_bound_pct_buffer:.2f}\n"
)
# Debug Printing
if verbose == 2 or (verbose == 1 and len(weakly_conserved) > 0):
print(ss + sk)
# Debug plots
__hydrogen_orient_plots(
labels,
orientations,
cc,
ss,
(
f"Reachability of optics plot\n"
f"minsamples={int(neioc * min_samp_data_size_pct)}; xi={xi}"
),
len(weakly_conserved) > 0,
debugH,
plotreach,
)
return weakly_conserved
def __hydrogen_orient_plots(
labels, orientations, cc, ss, rtit, conserved, debugH, plotreach: bool
) -> None:
"""Collection of plots for hydrogen orientation.
For debuging purposes. Not ment for direct usage.
"""
if debugH == 2 or (debugH == 1 and conserved):
if plotreach:
fig = __plot3Dorients(111, labels, orientations, ss)
else:
fig = __plot3Dorients(121, labels, orientations, ss)
if plotreach:
__plotreachability(122, orientations, cc, fig=fig, tit=rtit)
def __plot3Dorients(subplot, labels: int, orientations: np.ndarray, tip: str) -> Figure:
"""Function for plotting 3D orientations.
For debuging only.
"""
if plt is None:
msg = "install matplotlib"
raise Exception(msg)
fig = plt.figure()
if isinstance(labels, int):
return fig
ax = fig.add_subplot(subplot, projection="3d")
ax.set_title(tip)
for j in np.unique(labels):
jaba = orientations[labels == j]
ax.scatter(
jaba[:, 0],
jaba[:, 1],
jaba[:, 2],
label=f"{j} ({len(labels[labels == j])})",
)
if j > -1:
ax.quiver(
0,
0,
0,
np.mean(jaba[:, 0]),
np.mean(jaba[:, 1]),
np.mean(jaba[:, 2]),
color="gray",
arrow_length_ratio=0.0,
linewidths=5,
)
ax.scatter(0, 0, 0, c="crimson", s=1000)
ax.legend()
ax.grid(False)
ax.axis("off")
ax.set_aspect("equal")
ax.autoscale(tight=True)
ax.dist = 6
return fig
def __plotreachability(
subplot, orientations, cc, fig: Figure | None = None, tit=None
) -> Figure:
"""Function for plotting reachability.
For debuging purposes only.
"""
if plt is None:
msg = "install matplotlib"
raise Exception(msg)
if fig is None:
fig: Figure = plt.figure()
if not isinstance(cc, OPTICS):
return fig
lblls = cc.labels_[cc.ordering_]
labels = cc.labels_
reachability = cc.reachability_[cc.ordering_]
ax2 = fig.add_subplot(subplot)
fig.gca().set_prop_cycle(None)
space = np.arange(len(orientations))
ax2.plot(space, reachability)
if tit is not None:
ax2.set_title(tit)
for clst in np.unique(lblls):
if clst == -1:
label = np.mean(np.ma.masked_invalid(cc.reachability_[labels == clst]))
ax2.plot(
space[lblls == clst],
reachability[lblls == clst],
label=f"{clst} ({len(space[lblls == clst])}), avg reach={label}",
color="blue",
)
else:
label = np.mean(np.ma.masked_invalid(cc.reachability_[labels == clst]))
ax2.plot(
space[lblls == clst],
reachability[lblls == clst],
label=f"{clst} ({len(space[lblls == clst])}), avg reach={label}",
)
ax2.legend()
return fig
def __return_normalized_orientation_pair(
orientations: np.ndarray,
labels: np.ndarray,
i: int,
j: int,
typel: str,
) -> list[np.ndarray | str]:
"""Helper function for normalizing orientations.
Not ment for general usage.
"""
v1 = np.mean(orientations[labels == i], axis=0)
v2 = np.mean(orientations[labels == j], axis=0)
return [
v1 / np.linalg.norm(v1) * np.linalg.norm(orientations[0]),
v2 / np.linalg.norm(v2) * np.linalg.norm(orientations[0]),
typel,
]