Table of contents¶
Apply Deeptime SVD/TICA to alanine, compare eigenvalues with PyEMMA, and tune the projection process.
Step 1¶
In [ ]:
#!/usr/bin/env python3
import argparse
import os
from pathlib import Path
import mdtraj as md
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from deeptime.clustering import KMeans
from deeptime.markov.msm import MaximumLikelihoodMSM
COURSE_DIR = Path(os.environ.get("COURSE_DIR", str(Path.home() / "Concepcion26"))).expanduser()
def compute_features(topology: str, trajectories: list[str]) -> np.ndarray:
traj = md.load(trajectories, top=topology)
atom_indices = traj.top.select("backbone")
pairs = [(atom_indices[i], atom_indices[i + 1]) for i in range(len(atom_indices) - 1)]
distances = md.compute_distances(traj, pairs)
return distances
def main() -> None:
parser = argparse.ArgumentParser(description="deeptime MSM on alanine dipeptide")
parser.add_argument("-t", "--traj", nargs="+", default=None, help="Trajectory files")
parser.add_argument("-p", "--top", default=None, help="Topology file")
parser.add_argument("-k", "--k", type=int, default=50, help="Number of clusters")
parser.add_argument("--lag", type=int, default=10, help="Lag time")
parser.add_argument("--pca", type=int, default=2, help="PCA components")
args = parser.parse_args()
data_dir = COURSE_DIR / "data"
traj_files = args.traj or [str(data_dir / "alanine-dipeptide.dcd") ]
topology = args.top or str(data_dir / "alanine-dipeptide.pdb")
out_dir = COURSE_DIR / "results" / "07-deeptime" / "simple"
out_dir.mkdir(parents=True, exist_ok=True)
try:
features = compute_features(topology, traj_files)
except ValueError as exc:
alt_traj = data_dir / "alanine-dipeptide-multi.pdb"
if alt_traj.exists() and args.traj is None and args.top is None:
print("Topology/trajectory mismatch; using multi-model PDB:", alt_traj)
traj_files = [str(alt_traj)]
topology = str(alt_traj)
features = compute_features(topology, traj_files)
else:
raise
pca = PCA(n_components=args.pca)
pca_coords = pca.fit_transform(features)
plt.figure()
plt.scatter(pca_coords[:, 0], pca_coords[:, 1], s=5, alpha=0.5)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("Alanine PCA")
plt.savefig(out_dir / "pca.png", dpi=150)
kmeans = KMeans(n_clusters=args.k, max_iter=50)
dtrajs = kmeans.fit_transform(pca_coords)
msm = MaximumLikelihoodMSM(lagtime=args.lag).fit(dtrajs).fetch_model()
timescales = msm.timescales()[:5]
print("Timescales:", timescales)
(out_dir / "timescales.txt").write_text("\\n".join(str(t) for t in timescales))
if __name__ == "__main__":
main()